summaryrefslogtreecommitdiffstats
path: root/src/rgw
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-21 11:54:28 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-21 11:54:28 +0000
commite6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree64f88b554b444a49f656b6c656111a145cbbaa28 /src/rgw
parentInitial commit. (diff)
downloadceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz
ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip
Adding upstream version 18.2.2.upstream/18.2.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/rgw')
-rw-r--r--src/rgw/CMakeLists.txt579
-rw-r--r--src/rgw/MAINTAINERS.md28
-rw-r--r--src/rgw/driver/daos/README.md47
-rw-r--r--src/rgw/driver/dbstore/CMakeLists.txt71
-rw-r--r--src/rgw/driver/dbstore/README.md53
-rw-r--r--src/rgw/driver/dbstore/common/connection_pool.h147
-rw-r--r--src/rgw/driver/dbstore/common/dbstore.cc2252
-rw-r--r--src/rgw/driver/dbstore/common/dbstore.h2016
-rw-r--r--src/rgw/driver/dbstore/common/dbstore_log.h15
-rw-r--r--src/rgw/driver/dbstore/config/sqlite.cc2070
-rw-r--r--src/rgw/driver/dbstore/config/sqlite.h172
-rw-r--r--src/rgw/driver/dbstore/config/sqlite_schema.h299
-rw-r--r--src/rgw/driver/dbstore/config/store.cc38
-rw-r--r--src/rgw/driver/dbstore/config/store.h27
-rw-r--r--src/rgw/driver/dbstore/dbstore_main.cc199
-rw-r--r--src/rgw/driver/dbstore/dbstore_mgr.cc140
-rw-r--r--src/rgw/driver/dbstore/dbstore_mgr.h56
-rw-r--r--src/rgw/driver/dbstore/sqlite/CMakeLists.txt16
-rw-r--r--src/rgw/driver/dbstore/sqlite/connection.cc34
-rw-r--r--src/rgw/driver/dbstore/sqlite/connection.h64
-rw-r--r--src/rgw/driver/dbstore/sqlite/error.cc37
-rw-r--r--src/rgw/driver/dbstore/sqlite/error.h81
-rw-r--r--src/rgw/driver/dbstore/sqlite/sqliteDB.cc2996
-rw-r--r--src/rgw/driver/dbstore/sqlite/sqliteDB.h551
-rw-r--r--src/rgw/driver/dbstore/sqlite/statement.cc196
-rw-r--r--src/rgw/driver/dbstore/sqlite/statement.h83
-rw-r--r--src/rgw/driver/dbstore/tests/CMakeLists.txt17
-rw-r--r--src/rgw/driver/dbstore/tests/dbstore_mgr_tests.cc157
-rw-r--r--src/rgw/driver/dbstore/tests/dbstore_tests.cc1417
-rw-r--r--src/rgw/driver/immutable_config/store.cc422
-rw-r--r--src/rgw/driver/immutable_config/store.h180
-rw-r--r--src/rgw/driver/json_config/store.cc177
-rw-r--r--src/rgw/driver/json_config/store.h27
-rw-r--r--src/rgw/driver/rados/cls_fifo_legacy.cc2539
-rw-r--r--src/rgw/driver/rados/cls_fifo_legacy.h334
-rw-r--r--src/rgw/driver/rados/config/impl.cc129
-rw-r--r--src/rgw/driver/rados/config/impl.h139
-rw-r--r--src/rgw/driver/rados/config/period.cc230
-rw-r--r--src/rgw/driver/rados/config/period_config.cc55
-rw-r--r--src/rgw/driver/rados/config/realm.cc364
-rw-r--r--src/rgw/driver/rados/config/store.cc52
-rw-r--r--src/rgw/driver/rados/config/store.h182
-rw-r--r--src/rgw/driver/rados/config/zone.cc312
-rw-r--r--src/rgw/driver/rados/config/zonegroup.cc315
-rw-r--r--src/rgw/driver/rados/rgw_bucket.cc3316
-rw-r--r--src/rgw/driver/rados/rgw_bucket.h766
-rw-r--r--src/rgw/driver/rados/rgw_bucket_sync.cc1018
-rw-r--r--src/rgw/driver/rados/rgw_bucket_sync.h416
-rw-r--r--src/rgw/driver/rados/rgw_cr_rados.cc1165
-rw-r--r--src/rgw/driver/rados/rgw_cr_rados.h1647
-rw-r--r--src/rgw/driver/rados/rgw_cr_tools.cc292
-rw-r--r--src/rgw/driver/rados/rgw_cr_tools.h85
-rw-r--r--src/rgw/driver/rados/rgw_d3n_datacache.cc369
-rw-r--r--src/rgw/driver/rados/rgw_d3n_datacache.h259
-rw-r--r--src/rgw/driver/rados/rgw_data_sync.cc6762
-rw-r--r--src/rgw/driver/rados/rgw_data_sync.h868
-rw-r--r--src/rgw/driver/rados/rgw_datalog.cc1090
-rw-r--r--src/rgw/driver/rados/rgw_datalog.h394
-rw-r--r--src/rgw/driver/rados/rgw_datalog_notify.cc76
-rw-r--r--src/rgw/driver/rados/rgw_datalog_notify.h31
-rw-r--r--src/rgw/driver/rados/rgw_etag_verifier.cc191
-rw-r--r--src/rgw/driver/rados/rgw_etag_verifier.h90
-rw-r--r--src/rgw/driver/rados/rgw_gc.cc811
-rw-r--r--src/rgw/driver/rados/rgw_gc.h82
-rw-r--r--src/rgw/driver/rados/rgw_gc_log.cc55
-rw-r--r--src/rgw/driver/rados/rgw_lc_tier.cc1310
-rw-r--r--src/rgw/driver/rados/rgw_lc_tier.h51
-rw-r--r--src/rgw/driver/rados/rgw_log_backing.cc708
-rw-r--r--src/rgw/driver/rados/rgw_log_backing.h394
-rw-r--r--src/rgw/driver/rados/rgw_metadata.cc233
-rw-r--r--src/rgw/driver/rados/rgw_metadata.h298
-rw-r--r--src/rgw/driver/rados/rgw_notify.cc1023
-rw-r--r--src/rgw/driver/rados/rgw_notify.h121
-rw-r--r--src/rgw/driver/rados/rgw_obj_manifest.cc409
-rw-r--r--src/rgw/driver/rados/rgw_obj_manifest.h622
-rw-r--r--src/rgw/driver/rados/rgw_object_expirer_core.cc442
-rw-r--r--src/rgw/driver/rados/rgw_object_expirer_core.h146
-rw-r--r--src/rgw/driver/rados/rgw_otp.cc211
-rw-r--r--src/rgw/driver/rados/rgw_otp.h110
-rw-r--r--src/rgw/driver/rados/rgw_period.cc324
-rw-r--r--src/rgw/driver/rados/rgw_pubsub_push.cc460
-rw-r--r--src/rgw/driver/rados/rgw_pubsub_push.h47
-rw-r--r--src/rgw/driver/rados/rgw_putobj_processor.cc761
-rw-r--r--src/rgw/driver/rados/rgw_putobj_processor.h282
-rw-r--r--src/rgw/driver/rados/rgw_rados.cc10076
-rw-r--r--src/rgw/driver/rados/rgw_rados.h1661
-rw-r--r--src/rgw/driver/rados/rgw_reshard.cc1419
-rw-r--r--src/rgw/driver/rados/rgw_reshard.h274
-rw-r--r--src/rgw/driver/rados/rgw_rest_bucket.cc413
-rw-r--r--src/rgw/driver/rados/rgw_rest_bucket.h36
-rw-r--r--src/rgw/driver/rados/rgw_rest_log.cc1268
-rw-r--r--src/rgw/driver/rados/rgw_rest_log.h337
-rw-r--r--src/rgw/driver/rados/rgw_rest_pubsub.h38
-rw-r--r--src/rgw/driver/rados/rgw_rest_realm.cc376
-rw-r--r--src/rgw/driver/rados/rgw_rest_realm.h16
-rw-r--r--src/rgw/driver/rados/rgw_rest_user.cc1137
-rw-r--r--src/rgw/driver/rados/rgw_rest_user.h36
-rw-r--r--src/rgw/driver/rados/rgw_sal_rados.cc3846
-rw-r--r--src/rgw/driver/rados/rgw_sal_rados.h978
-rw-r--r--src/rgw/driver/rados/rgw_service.cc476
-rw-r--r--src/rgw/driver/rados/rgw_service.h215
-rw-r--r--src/rgw/driver/rados/rgw_sync.cc2568
-rw-r--r--src/rgw/driver/rados/rgw_sync.h547
-rw-r--r--src/rgw/driver/rados/rgw_sync_counters.cc28
-rw-r--r--src/rgw/driver/rados/rgw_sync_counters.h25
-rw-r--r--src/rgw/driver/rados/rgw_sync_error_repo.cc205
-rw-r--r--src/rgw/driver/rados/rgw_sync_error_repo.h59
-rw-r--r--src/rgw/driver/rados/rgw_sync_module.cc87
-rw-r--r--src/rgw/driver/rados/rgw_sync_module.h203
-rw-r--r--src/rgw/driver/rados/rgw_sync_module_aws.cc1823
-rw-r--r--src/rgw/driver/rados/rgw_sync_module_aws.h108
-rw-r--r--src/rgw/driver/rados/rgw_sync_module_es.cc962
-rw-r--r--src/rgw/driver/rados/rgw_sync_module_es.h59
-rw-r--r--src/rgw/driver/rados/rgw_sync_module_es_rest.cc428
-rw-r--r--src/rgw/driver/rados/rgw_sync_module_es_rest.h18
-rw-r--r--src/rgw/driver/rados/rgw_sync_module_log.cc76
-rw-r--r--src/rgw/driver/rados/rgw_sync_module_log.h15
-rw-r--r--src/rgw/driver/rados/rgw_sync_trace.cc290
-rw-r--r--src/rgw/driver/rados/rgw_sync_trace.h141
-rw-r--r--src/rgw/driver/rados/rgw_tools.cc437
-rw-r--r--src/rgw/driver/rados/rgw_tools.h276
-rw-r--r--src/rgw/driver/rados/rgw_trim_bilog.cc1445
-rw-r--r--src/rgw/driver/rados/rgw_trim_bilog.h121
-rw-r--r--src/rgw/driver/rados/rgw_trim_datalog.cc252
-rw-r--r--src/rgw/driver/rados/rgw_trim_datalog.h28
-rw-r--r--src/rgw/driver/rados/rgw_trim_mdlog.cc795
-rw-r--r--src/rgw/driver/rados/rgw_trim_mdlog.h25
-rw-r--r--src/rgw/driver/rados/rgw_user.cc2776
-rw-r--r--src/rgw/driver/rados/rgw_user.h885
-rw-r--r--src/rgw/driver/rados/rgw_zone.cc1288
-rw-r--r--src/rgw/driver/rados/rgw_zone.h943
-rw-r--r--src/rgw/jwt-cpp/base.h168
-rw-r--r--src/rgw/jwt-cpp/jwt.h1615
-rw-r--r--src/rgw/librgw.cc89
-rw-r--r--src/rgw/picojson/picojson.h1177
-rwxr-xr-xsrc/rgw/rgw-gap-list456
-rwxr-xr-xsrc/rgw/rgw-gap-list-comparator119
-rwxr-xr-xsrc/rgw/rgw-orphan-list278
-rwxr-xr-xsrc/rgw/rgw-restore-bucket-index250
-rw-r--r--src/rgw/rgw_acl.cc442
-rw-r--r--src/rgw/rgw_acl.h414
-rw-r--r--src/rgw/rgw_acl_s3.cc643
-rw-r--r--src/rgw/rgw_acl_s3.h115
-rw-r--r--src/rgw/rgw_acl_swift.cc438
-rw-r--r--src/rgw/rgw_acl_swift.h58
-rw-r--r--src/rgw/rgw_acl_types.h213
-rw-r--r--src/rgw/rgw_admin.cc10799
-rw-r--r--src/rgw/rgw_aio.cc138
-rw-r--r--src/rgw/rgw_aio.h104
-rw-r--r--src/rgw/rgw_aio_throttle.cc202
-rw-r--r--src/rgw/rgw_aio_throttle.h133
-rw-r--r--src/rgw/rgw_amqp.cc1051
-rw-r--r--src/rgw/rgw_amqp.h82
-rw-r--r--src/rgw/rgw_appmain.cc605
-rw-r--r--src/rgw/rgw_arn.cc387
-rw-r--r--src/rgw/rgw_arn.h121
-rw-r--r--src/rgw/rgw_asio_client.cc192
-rw-r--r--src/rgw/rgw_asio_client.h62
-rw-r--r--src/rgw/rgw_asio_frontend.cc1199
-rw-r--r--src/rgw/rgw_asio_frontend.h25
-rw-r--r--src/rgw/rgw_asio_frontend_timer.h66
-rw-r--r--src/rgw/rgw_auth.cc934
-rw-r--r--src/rgw/rgw_auth.h791
-rw-r--r--src/rgw/rgw_auth_filters.h302
-rw-r--r--src/rgw/rgw_auth_keystone.cc767
-rw-r--r--src/rgw/rgw_auth_keystone.h202
-rw-r--r--src/rgw/rgw_auth_registry.h97
-rw-r--r--src/rgw/rgw_auth_s3.cc1355
-rw-r--r--src/rgw/rgw_auth_s3.h649
-rw-r--r--src/rgw/rgw_b64.h84
-rw-r--r--src/rgw/rgw_basic_types.cc180
-rw-r--r--src/rgw/rgw_basic_types.h291
-rw-r--r--src/rgw/rgw_bucket.cc186
-rw-r--r--src/rgw/rgw_bucket.h36
-rw-r--r--src/rgw/rgw_bucket_encryption.cc49
-rw-r--r--src/rgw/rgw_bucket_encryption.h142
-rw-r--r--src/rgw/rgw_bucket_layout.cc380
-rw-r--r--src/rgw/rgw_bucket_layout.h282
-rw-r--r--src/rgw/rgw_bucket_sync_cache.h116
-rw-r--r--src/rgw/rgw_bucket_types.h233
-rw-r--r--src/rgw/rgw_cache.cc419
-rw-r--r--src/rgw/rgw_cache.h222
-rw-r--r--src/rgw/rgw_client_io.cc34
-rw-r--r--src/rgw/rgw_client_io.h435
-rw-r--r--src/rgw/rgw_client_io_filters.h454
-rw-r--r--src/rgw/rgw_common.cc3075
-rw-r--r--src/rgw/rgw_common.h1842
-rw-r--r--src/rgw/rgw_compression.cc236
-rw-r--r--src/rgw/rgw_compression.h62
-rw-r--r--src/rgw/rgw_compression_types.h76
-rw-r--r--src/rgw/rgw_coroutine.cc1130
-rw-r--r--src/rgw/rgw_coroutine.h722
-rw-r--r--src/rgw/rgw_cors.cc193
-rw-r--r--src/rgw/rgw_cors.h146
-rw-r--r--src/rgw/rgw_cors_s3.cc246
-rw-r--r--src/rgw/rgw_cors_s3.h58
-rw-r--r--src/rgw/rgw_cors_swift.h83
-rw-r--r--src/rgw/rgw_cr_rest.cc351
-rw-r--r--src/rgw/rgw_cr_rest.h590
-rw-r--r--src/rgw/rgw_crypt.cc1537
-rw-r--r--src/rgw/rgw_crypt.h174
-rw-r--r--src/rgw/rgw_crypt_sanitize.cc88
-rw-r--r--src/rgw/rgw_crypt_sanitize.h68
-rw-r--r--src/rgw/rgw_d3n_cacherequest.h145
-rw-r--r--src/rgw/rgw_dencoder.cc41
-rw-r--r--src/rgw/rgw_dmclock.h52
-rw-r--r--src/rgw/rgw_dmclock_async_scheduler.cc183
-rw-r--r--src/rgw/rgw_dmclock_async_scheduler.h217
-rw-r--r--src/rgw/rgw_dmclock_scheduler.h86
-rw-r--r--src/rgw/rgw_dmclock_scheduler_ctx.cc178
-rw-r--r--src/rgw/rgw_dmclock_scheduler_ctx.h119
-rw-r--r--src/rgw/rgw_dmclock_sync_scheduler.cc117
-rw-r--r--src/rgw/rgw_dmclock_sync_scheduler.h77
-rw-r--r--src/rgw/rgw_env.cc158
-rw-r--r--src/rgw/rgw_es_main.cc76
-rw-r--r--src/rgw/rgw_es_query.cc696
-rw-r--r--src/rgw/rgw_es_query.h164
-rw-r--r--src/rgw/rgw_file.cc2787
-rw-r--r--src/rgw/rgw_file.h2857
-rw-r--r--src/rgw/rgw_flight.cc724
-rw-r--r--src/rgw/rgw_flight.h221
-rw-r--r--src/rgw/rgw_flight_frontend.cc246
-rw-r--r--src/rgw/rgw_flight_frontend.h86
-rw-r--r--src/rgw/rgw_formats.cc381
-rw-r--r--src/rgw/rgw_formats.h134
-rw-r--r--src/rgw/rgw_frontend.cc105
-rw-r--r--src/rgw/rgw_frontend.h211
-rw-r--r--src/rgw/rgw_gc_log.h28
-rw-r--r--src/rgw/rgw_http_client.cc1223
-rw-r--r--src/rgw/rgw_http_client.h348
-rw-r--r--src/rgw/rgw_http_client_curl.cc112
-rw-r--r--src/rgw/rgw_http_client_curl.h29
-rw-r--r--src/rgw/rgw_http_client_types.h69
-rw-r--r--src/rgw/rgw_http_errors.h44
-rw-r--r--src/rgw/rgw_iam_policy.cc1663
-rw-r--r--src/rgw/rgw_iam_policy.h579
-rw-r--r--src/rgw/rgw_iam_policy_keywords.gperf136
-rw-r--r--src/rgw/rgw_iam_policy_keywords.h139
-rw-r--r--src/rgw/rgw_jsonparser.cc133
-rw-r--r--src/rgw/rgw_kafka.cc742
-rw-r--r--src/rgw/rgw_kafka.h66
-rw-r--r--src/rgw/rgw_keystone.cc684
-rw-r--r--src/rgw/rgw_keystone.h333
-rw-r--r--src/rgw/rgw_kmip_client.cc82
-rw-r--r--src/rgw/rgw_kmip_client.h65
-rw-r--r--src/rgw/rgw_kmip_client_impl.cc728
-rw-r--r--src/rgw/rgw_kmip_client_impl.h27
-rw-r--r--src/rgw/rgw_kms.cc1279
-rw-r--r--src/rgw/rgw_kms.h64
-rw-r--r--src/rgw/rgw_lc.cc2869
-rw-r--r--src/rgw/rgw_lc.h640
-rw-r--r--src/rgw/rgw_lc_s3.cc353
-rw-r--r--src/rgw/rgw_lc_s3.h100
-rw-r--r--src/rgw/rgw_ldap.cc130
-rw-r--r--src/rgw/rgw_ldap.h138
-rw-r--r--src/rgw/rgw_lib.cc610
-rw-r--r--src/rgw/rgw_lib.h209
-rw-r--r--src/rgw/rgw_lib_frontend.h113
-rw-r--r--src/rgw/rgw_loadgen.cc131
-rw-r--r--src/rgw/rgw_loadgen.h72
-rw-r--r--src/rgw/rgw_loadgen_process.cc147
-rw-r--r--src/rgw/rgw_log.cc722
-rw-r--r--src/rgw/rgw_log.h289
-rw-r--r--src/rgw/rgw_lua.cc214
-rw-r--r--src/rgw/rgw_lua.h67
-rw-r--r--src/rgw/rgw_lua_background.cc181
-rw-r--r--src/rgw/rgw_lua_background.h230
-rw-r--r--src/rgw/rgw_lua_data_filter.cc143
-rw-r--r--src/rgw/rgw_lua_data_filter.h52
-rw-r--r--src/rgw/rgw_lua_request.cc906
-rw-r--r--src/rgw/rgw_lua_request.h26
-rw-r--r--src/rgw/rgw_lua_utils.cc77
-rw-r--r--src/rgw/rgw_lua_utils.h315
-rw-r--r--src/rgw/rgw_lua_version.h11
-rw-r--r--src/rgw/rgw_main.cc188
-rw-r--r--src/rgw/rgw_main.h134
-rw-r--r--src/rgw/rgw_mdlog.h185
-rw-r--r--src/rgw/rgw_mdlog_types.h35
-rw-r--r--src/rgw/rgw_meta_sync_status.h121
-rw-r--r--src/rgw/rgw_metadata.cc683
-rw-r--r--src/rgw/rgw_multi.cc103
-rw-r--r--src/rgw/rgw_multi.h62
-rw-r--r--src/rgw/rgw_multi_del.cc73
-rw-r--r--src/rgw/rgw_multi_del.h62
-rw-r--r--src/rgw/rgw_multiparser.cc47
-rw-r--r--src/rgw/rgw_multipart_meta_filter.cc32
-rw-r--r--src/rgw/rgw_notify_event_type.cc119
-rw-r--r--src/rgw/rgw_notify_event_type.h49
-rw-r--r--src/rgw/rgw_obj_manifest.cc260
-rw-r--r--src/rgw/rgw_obj_types.h622
-rw-r--r--src/rgw/rgw_object_expirer.cc106
-rw-r--r--src/rgw/rgw_object_lock.cc100
-rw-r--r--src/rgw/rgw_object_lock.h222
-rw-r--r--src/rgw/rgw_oidc_provider.cc182
-rw-r--r--src/rgw/rgw_oidc_provider.h121
-rw-r--r--src/rgw/rgw_op.cc8958
-rw-r--r--src/rgw/rgw_op.h2672
-rw-r--r--src/rgw/rgw_op_type.h133
-rw-r--r--src/rgw/rgw_opa.cc97
-rw-r--r--src/rgw/rgw_opa.h11
-rw-r--r--src/rgw/rgw_orphan.cc1598
-rw-r--r--src/rgw/rgw_orphan.h304
-rw-r--r--src/rgw/rgw_os_lib.cc63
-rw-r--r--src/rgw/rgw_os_lib.h9
-rw-r--r--src/rgw/rgw_perf_counters.cc78
-rw-r--r--src/rgw/rgw_perf_counters.h60
-rw-r--r--src/rgw/rgw_period.cc350
-rw-r--r--src/rgw/rgw_period_history.cc353
-rw-r--r--src/rgw/rgw_period_history.h114
-rw-r--r--src/rgw/rgw_period_puller.cc123
-rw-r--r--src/rgw/rgw_period_puller.h24
-rw-r--r--src/rgw/rgw_period_pusher.cc316
-rw-r--r--src/rgw/rgw_period_pusher.h54
-rw-r--r--src/rgw/rgw_placement_types.h118
-rw-r--r--src/rgw/rgw_policy_s3.cc305
-rw-r--r--src/rgw/rgw_policy_s3.h57
-rw-r--r--src/rgw/rgw_polparser.cc105
-rw-r--r--src/rgw/rgw_pool_types.h157
-rw-r--r--src/rgw/rgw_process.cc472
-rw-r--r--src/rgw/rgw_process.h159
-rw-r--r--src/rgw/rgw_process_env.h50
-rw-r--r--src/rgw/rgw_public_access.cc33
-rw-r--r--src/rgw/rgw_public_access.h67
-rw-r--r--src/rgw/rgw_pubsub.cc736
-rw-r--r--src/rgw/rgw_pubsub.h629
-rw-r--r--src/rgw/rgw_putobj.cc99
-rw-r--r--src/rgw/rgw_putobj.h73
-rw-r--r--src/rgw/rgw_quota.cc1049
-rw-r--r--src/rgw/rgw_quota.h49
-rw-r--r--src/rgw/rgw_quota_types.h87
-rw-r--r--src/rgw/rgw_ratelimit.h292
-rw-r--r--src/rgw/rgw_realm.cc265
-rw-r--r--src/rgw/rgw_realm_reloader.cc188
-rw-r--r--src/rgw/rgw_realm_reloader.h64
-rw-r--r--src/rgw/rgw_realm_watcher.cc148
-rw-r--r--src/rgw/rgw_realm_watcher.h66
-rw-r--r--src/rgw/rgw_request.h40
-rw-r--r--src/rgw/rgw_resolve.cc45
-rw-r--r--src/rgw/rgw_resolve.h24
-rw-r--r--src/rgw/rgw_rest.cc2335
-rw-r--r--src/rgw/rgw_rest.h819
-rw-r--r--src/rgw/rgw_rest_admin.h12
-rw-r--r--src/rgw/rgw_rest_client.cc1124
-rw-r--r--src/rgw/rgw_rest_client.h257
-rw-r--r--src/rgw/rgw_rest_config.cc57
-rw-r--r--src/rgw/rgw_rest_config.h64
-rw-r--r--src/rgw/rgw_rest_conn.cc526
-rw-r--r--src/rgw/rgw_rest_conn.h557
-rw-r--r--src/rgw/rgw_rest_iam.cc90
-rw-r--r--src/rgw/rgw_rest_iam.h48
-rw-r--r--src/rgw/rgw_rest_info.cc49
-rw-r--r--src/rgw/rgw_rest_info.h33
-rw-r--r--src/rgw/rgw_rest_metadata.cc321
-rw-r--r--src/rgw/rgw_rest_metadata.h107
-rw-r--r--src/rgw/rgw_rest_oidc_provider.cc233
-rw-r--r--src/rgw/rgw_rest_oidc_provider.h71
-rw-r--r--src/rgw/rgw_rest_pubsub.cc954
-rw-r--r--src/rgw/rgw_rest_ratelimit.cc349
-rw-r--r--src/rgw/rgw_rest_ratelimit.h34
-rw-r--r--src/rgw/rgw_rest_role.cc1022
-rw-r--r--src/rgw/rgw_rest_role.h181
-rw-r--r--src/rgw/rgw_rest_s3.cc6477
-rw-r--r--src/rgw/rgw_rest_s3.h1215
-rw-r--r--src/rgw/rgw_rest_s3website.h100
-rw-r--r--src/rgw/rgw_rest_sts.cc819
-rw-r--r--src/rgw/rgw_rest_sts.h235
-rw-r--r--src/rgw/rgw_rest_swift.cc3114
-rw-r--r--src/rgw/rgw_rest_swift.h685
-rw-r--r--src/rgw/rgw_rest_usage.cc121
-rw-r--r--src/rgw/rgw_rest_usage.h34
-rw-r--r--src/rgw/rgw_rest_user_policy.cc413
-rw-r--r--src/rgw/rgw_rest_user_policy.h73
-rw-r--r--src/rgw/rgw_role.cc444
-rw-r--r--src/rgw/rgw_role.h209
-rw-r--r--src/rgw/rgw_s3select.cc1001
-rw-r--r--src/rgw/rgw_s3select.h10
-rw-r--r--src/rgw/rgw_s3select_private.h258
-rw-r--r--src/rgw/rgw_sal.cc402
-rw-r--r--src/rgw/rgw_sal.h1644
-rw-r--r--src/rgw/rgw_sal_config.h301
-rw-r--r--src/rgw/rgw_sal_daos.cc2473
-rw-r--r--src/rgw/rgw_sal_daos.h1054
-rw-r--r--src/rgw/rgw_sal_dbstore.cc2045
-rw-r--r--src/rgw/rgw_sal_dbstore.h921
-rw-r--r--src/rgw/rgw_sal_filter.cc1370
-rw-r--r--src/rgw/rgw_sal_filter.h921
-rw-r--r--src/rgw/rgw_sal_fwd.h41
-rw-r--r--src/rgw/rgw_sal_motr.cc4024
-rw-r--r--src/rgw/rgw_sal_motr.h1204
-rw-r--r--src/rgw/rgw_sal_store.h419
-rw-r--r--src/rgw/rgw_signal.cc91
-rw-r--r--src/rgw/rgw_signal.h31
-rw-r--r--src/rgw/rgw_string.cc45
-rw-r--r--src/rgw/rgw_string.h235
-rw-r--r--src/rgw/rgw_sts.cc469
-rw-r--r--src/rgw/rgw_sts.h251
-rw-r--r--src/rgw/rgw_swift_auth.cc775
-rw-r--r--src/rgw/rgw_swift_auth.h354
-rw-r--r--src/rgw/rgw_sync.cc24
-rw-r--r--src/rgw/rgw_sync_checkpoint.cc273
-rw-r--r--src/rgw/rgw_sync_checkpoint.h35
-rw-r--r--src/rgw/rgw_sync_policy.cc787
-rw-r--r--src/rgw/rgw_sync_policy.h682
-rw-r--r--src/rgw/rgw_tag.cc67
-rw-r--r--src/rgw/rgw_tag.h49
-rw-r--r--src/rgw/rgw_tag_s3.cc66
-rw-r--r--src/rgw/rgw_tag_s3.h49
-rw-r--r--src/rgw/rgw_tar.h153
-rw-r--r--src/rgw/rgw_token.cc144
-rw-r--r--src/rgw/rgw_token.h170
-rw-r--r--src/rgw/rgw_tools.cc124
-rw-r--r--src/rgw/rgw_torrent.cc261
-rw-r--r--src/rgw/rgw_torrent.h139
-rw-r--r--src/rgw/rgw_tracer.cc13
-rw-r--r--src/rgw/rgw_tracer.h34
-rw-r--r--src/rgw/rgw_url.cc49
-rw-r--r--src/rgw/rgw_url.h12
-rw-r--r--src/rgw/rgw_usage.cc171
-rw-r--r--src/rgw/rgw_usage.h30
-rw-r--r--src/rgw/rgw_user.cc127
-rw-r--r--src/rgw/rgw_user_types.h158
-rw-r--r--src/rgw/rgw_web_idp.h26
-rw-r--r--src/rgw/rgw_website.cc341
-rw-r--r--src/rgw/rgw_website.h243
-rw-r--r--src/rgw/rgw_worker.h91
-rw-r--r--src/rgw/rgw_xml.cc502
-rw-r--r--src/rgw/rgw_xml.h371
-rw-r--r--src/rgw/rgw_xml_enc.cc25
-rw-r--r--src/rgw/rgw_zone.cc1371
-rw-r--r--src/rgw/rgw_zone_features.h47
-rw-r--r--src/rgw/rgw_zone_types.h625
-rwxr-xr-xsrc/rgw/rgwam.py240
-rw-r--r--src/rgw/services/svc_bi.h44
-rw-r--r--src/rgw/services/svc_bi_rados.cc509
-rw-r--r--src/rgw/services/svc_bi_rados.h166
-rw-r--r--src/rgw/services/svc_bilog_rados.cc220
-rw-r--r--src/rgw/services/svc_bilog_rados.h60
-rw-r--r--src/rgw/services/svc_bucket.cc25
-rw-r--r--src/rgw/services/svc_bucket.h111
-rw-r--r--src/rgw/services/svc_bucket_sobj.cc644
-rw-r--r--src/rgw/services/svc_bucket_sobj.h180
-rw-r--r--src/rgw/services/svc_bucket_sync.h55
-rw-r--r--src/rgw/services/svc_bucket_sync_sobj.cc903
-rw-r--r--src/rgw/services/svc_bucket_sync_sobj.h123
-rw-r--r--src/rgw/services/svc_bucket_types.h38
-rw-r--r--src/rgw/services/svc_cls.cc478
-rw-r--r--src/rgw/services/svc_cls.h166
-rw-r--r--src/rgw/services/svc_config_key.h31
-rw-r--r--src/rgw/services/svc_config_key_rados.cc50
-rw-r--r--src/rgw/services/svc_config_key_rados.h54
-rw-r--r--src/rgw/services/svc_finisher.cc58
-rw-r--r--src/rgw/services/svc_finisher.h44
-rw-r--r--src/rgw/services/svc_mdlog.cc549
-rw-r--r--src/rgw/services/svc_mdlog.h118
-rw-r--r--src/rgw/services/svc_meta.cc46
-rw-r--r--src/rgw/services/svc_meta.h48
-rw-r--r--src/rgw/services/svc_meta_be.cc193
-rw-r--r--src/rgw/services/svc_meta_be.h294
-rw-r--r--src/rgw/services/svc_meta_be_otp.cc73
-rw-r--r--src/rgw/services/svc_meta_be_otp.h89
-rw-r--r--src/rgw/services/svc_meta_be_params.h25
-rw-r--r--src/rgw/services/svc_meta_be_sobj.cc246
-rw-r--r--src/rgw/services/svc_meta_be_sobj.h194
-rw-r--r--src/rgw/services/svc_meta_be_types.h26
-rw-r--r--src/rgw/services/svc_notify.cc515
-rw-r--r--src/rgw/services/svc_notify.h106
-rw-r--r--src/rgw/services/svc_otp.cc186
-rw-r--r--src/rgw/services/svc_otp.h95
-rw-r--r--src/rgw/services/svc_otp_types.h29
-rw-r--r--src/rgw/services/svc_quota.cc18
-rw-r--r--src/rgw/services/svc_quota.h22
-rw-r--r--src/rgw/services/svc_rados.cc445
-rw-r--r--src/rgw/services/svc_rados.h252
-rw-r--r--src/rgw/services/svc_role_rados.cc82
-rw-r--r--src/rgw/services/svc_role_rados.h50
-rw-r--r--src/rgw/services/svc_sync_modules.cc44
-rw-r--r--src/rgw/services/svc_sync_modules.h34
-rw-r--r--src/rgw/services/svc_sys_obj.cc183
-rw-r--r--src/rgw/services/svc_sys_obj.h270
-rw-r--r--src/rgw/services/svc_sys_obj_cache.cc670
-rw-r--r--src/rgw/services/svc_sys_obj_cache.h222
-rw-r--r--src/rgw/services/svc_sys_obj_core.cc666
-rw-r--r--src/rgw/services/svc_sys_obj_core.h145
-rw-r--r--src/rgw/services/svc_sys_obj_core_types.h34
-rw-r--r--src/rgw/services/svc_sys_obj_types.h15
-rw-r--r--src/rgw/services/svc_tier_rados.cc36
-rw-r--r--src/rgw/services/svc_tier_rados.h154
-rw-r--r--src/rgw/services/svc_user.cc11
-rw-r--r--src/rgw/services/svc_user.h127
-rw-r--r--src/rgw/services/svc_user_rados.cc968
-rw-r--r--src/rgw/services/svc_user_rados.h211
-rw-r--r--src/rgw/services/svc_zone.cc1100
-rw-r--r--src/rgw/services/svc_zone.h165
-rw-r--r--src/rgw/services/svc_zone_utils.cc64
-rw-r--r--src/rgw/services/svc_zone_utils.h38
495 files changed, 242939 insertions, 0 deletions
diff --git a/src/rgw/CMakeLists.txt b/src/rgw/CMakeLists.txt
new file mode 100644
index 000000000..b010f303a
--- /dev/null
+++ b/src/rgw/CMakeLists.txt
@@ -0,0 +1,579 @@
+find_program(GPERF gperf)
+if(NOT GPERF)
+ message(FATAL_ERROR "Can't find gperf")
+endif()
+
+if(WITH_RADOSGW_BACKTRACE_LOGGING)
+ add_definitions(-D_BACKTRACE_LOGGING)
+endif(WITH_RADOSGW_BACKTRACE_LOGGING)
+
+if(WITH_RADOSGW_SELECT_PARQUET)
+ set(ARROW_LIBRARIES Arrow::Arrow Arrow::Parquet)
+ add_definitions(-D_ARROW_EXIST)
+ message("-- arrow is installed, radosgw/s3select-op is able to process parquet objects")
+endif(WITH_RADOSGW_SELECT_PARQUET)
+
+if(WITH_RADOSGW_ARROW_FLIGHT)
+ set(ARROW_FLIGHT_LIBRARIES Arrow::Arrow Arrow::Parquet Arrow::Flight utf8proc::utf8proc) # order is important
+ add_definitions(-D_ARROW_EXIST)
+ message("-- arrow flight is installed")
+endif(WITH_RADOSGW_ARROW_FLIGHT)
+
+function(gperf_generate input output)
+ add_custom_command(
+ OUTPUT ${output}
+ COMMAND ${GPERF} ${input} | sed "s/register //g" > ${output}
+ DEPENDS ${input}
+ COMMENT "Generate ${output}"
+ )
+endfunction()
+
+find_package(ICU 52.0 COMPONENTS uc REQUIRED)
+
+set(librgw_common_srcs
+ services/svc_finisher.cc
+ services/svc_bi_rados.cc
+ services/svc_bilog_rados.cc
+ services/svc_bucket.cc
+ services/svc_bucket_sobj.cc
+ services/svc_bucket_sync_sobj.cc
+ services/svc_cls.cc
+ services/svc_config_key_rados.cc
+ services/svc_mdlog.cc
+ services/svc_meta.cc
+ services/svc_meta_be.cc
+ services/svc_meta_be_otp.cc
+ services/svc_meta_be_sobj.cc
+ services/svc_notify.cc
+ services/svc_otp.cc
+ services/svc_quota.cc
+ services/svc_sync_modules.cc
+ services/svc_rados.cc
+ services/svc_role_rados.cc
+ services/svc_sys_obj.cc
+ services/svc_sys_obj_cache.cc
+ services/svc_sys_obj_core.cc
+ services/svc_tier_rados.cc
+ services/svc_user.cc
+ services/svc_user_rados.cc
+ services/svc_zone.cc
+ services/svc_zone_utils.cc
+ rgw_acl.cc
+ rgw_acl_s3.cc
+ rgw_acl_swift.cc
+ rgw_aio.cc
+ rgw_aio_throttle.cc
+ rgw_auth.cc
+ rgw_auth_s3.cc
+ rgw_arn.cc
+ rgw_basic_types.cc
+ rgw_bucket.cc
+ rgw_bucket_layout.cc
+ rgw_cache.cc
+ rgw_common.cc
+ rgw_compression.cc
+ rgw_cors.cc
+ rgw_cors_s3.cc
+ rgw_env.cc
+ rgw_es_query.cc
+ rgw_formats.cc
+ rgw_http_client.cc
+ rgw_keystone.cc
+ rgw_ldap.cc
+ rgw_lc.cc
+ rgw_lc_s3.cc
+ rgw_metadata.cc
+ rgw_multi.cc
+ rgw_multi_del.cc
+ rgw_multipart_meta_filter.cc
+ rgw_obj_manifest.cc
+ rgw_period.cc
+ rgw_realm.cc
+ rgw_sync.cc
+ rgw_sync_policy.cc
+ rgw_notify_event_type.cc
+ rgw_period_history.cc
+ rgw_period_puller.cc
+ rgw_pubsub.cc
+ rgw_coroutine.cc
+ rgw_cr_rest.cc
+ rgw_op.cc
+ rgw_policy_s3.cc
+ rgw_public_access.cc
+ rgw_putobj.cc
+ rgw_quota.cc
+ rgw_resolve.cc
+ rgw_rest.cc
+ rgw_rest_client.cc
+ rgw_rest_config.cc
+ rgw_rest_conn.cc
+ rgw_rest_metadata.cc
+ rgw_rest_ratelimit.cc
+ rgw_rest_role.cc
+ rgw_rest_s3.cc
+ rgw_rest_pubsub.cc
+ rgw_s3select.cc
+ rgw_role.cc
+ rgw_sal.cc
+ rgw_sal_filter.cc
+ rgw_string.cc
+ rgw_tag.cc
+ rgw_tag_s3.cc
+ rgw_tools.cc
+ rgw_user.cc
+ rgw_website.cc
+ rgw_xml.cc
+ rgw_torrent.cc
+ rgw_crypt.cc
+ rgw_crypt_sanitize.cc
+ rgw_iam_policy.cc
+ rgw_rest_user_policy.cc
+ rgw_zone.cc
+ rgw_sts.cc
+ rgw_rest_sts.cc
+ rgw_perf_counters.cc
+ rgw_rest_oidc_provider.cc
+ rgw_rest_iam.cc
+ rgw_object_lock.cc
+ rgw_kms.cc
+ rgw_kmip_client.cc
+ rgw_url.cc
+ rgw_oidc_provider.cc
+ rgw_log.cc
+ rgw_lua_request.cc
+ rgw_lua_utils.cc
+ rgw_lua.cc
+ rgw_lua_data_filter.cc
+ rgw_bucket_encryption.cc
+ rgw_tracer.cc
+ rgw_lua_background.cc
+ driver/rados/cls_fifo_legacy.cc
+ driver/rados/rgw_bucket.cc
+ driver/rados/rgw_bucket_sync.cc
+ driver/rados/rgw_cr_rados.cc
+ driver/rados/rgw_cr_tools.cc
+ driver/rados/rgw_d3n_datacache.cc
+ driver/rados/rgw_datalog.cc
+ driver/rados/rgw_datalog_notify.cc
+ driver/rados/rgw_data_sync.cc
+ driver/rados/rgw_etag_verifier.cc
+ driver/rados/rgw_gc.cc
+ driver/rados/rgw_gc_log.cc
+ driver/rados/rgw_lc_tier.cc
+ driver/rados/rgw_log_backing.cc
+ driver/rados/rgw_metadata.cc
+ driver/rados/rgw_notify.cc
+ driver/rados/rgw_obj_manifest.cc
+ driver/rados/rgw_object_expirer_core.cc
+ driver/rados/rgw_otp.cc
+ driver/rados/rgw_period.cc
+ driver/rados/rgw_pubsub_push.cc
+ driver/rados/rgw_putobj_processor.cc
+ driver/rados/rgw_rados.cc
+ driver/rados/rgw_reshard.cc
+ driver/rados/rgw_rest_bucket.cc
+ driver/rados/rgw_rest_log.cc
+ driver/rados/rgw_rest_realm.cc
+ driver/rados/rgw_rest_user.cc
+ driver/rados/rgw_sal_rados.cc
+ driver/rados/rgw_service.cc
+ driver/rados/rgw_sync.cc
+ driver/rados/rgw_sync_counters.cc
+ driver/rados/rgw_sync_error_repo.cc
+ driver/rados/rgw_sync_module.cc
+ driver/rados/rgw_sync_module_aws.cc
+ driver/rados/rgw_sync_module_es.cc
+ driver/rados/rgw_sync_module_es_rest.cc
+ driver/rados/rgw_sync_module_log.cc
+ driver/rados/rgw_sync_trace.cc
+ driver/rados/rgw_tools.cc
+ driver/rados/rgw_trim_bilog.cc
+ driver/rados/rgw_trim_datalog.cc
+ driver/rados/rgw_trim_mdlog.cc
+ driver/rados/rgw_user.cc
+ driver/rados/rgw_zone.cc)
+
+list(APPEND librgw_common_srcs
+ driver/immutable_config/store.cc
+ driver/json_config/store.cc
+ driver/rados/config/impl.cc
+ driver/rados/config/period.cc
+ driver/rados/config/period_config.cc
+ driver/rados/config/realm.cc
+ driver/rados/config/store.cc
+ driver/rados/config/zone.cc
+ driver/rados/config/zonegroup.cc)
+
+if(WITH_RADOSGW_AMQP_ENDPOINT)
+ list(APPEND librgw_common_srcs rgw_amqp.cc)
+endif()
+if(WITH_RADOSGW_KAFKA_ENDPOINT)
+ list(APPEND librgw_common_srcs rgw_kafka.cc)
+endif()
+if(WITH_RADOSGW_DBSTORE)
+ add_subdirectory(driver/dbstore)
+ list(APPEND librgw_common_srcs rgw_sal_dbstore.cc)
+endif()
+if(WITH_RADOSGW_MOTR)
+ list(APPEND librgw_common_srcs rgw_sal_motr.cc)
+endif()
+if(WITH_RADOSGW_DAOS)
+ list(APPEND librgw_common_srcs rgw_sal_daos.cc)
+endif()
+if(WITH_JAEGER)
+ list(APPEND librgw_common_srcs rgw_tracer.cc)
+endif()
+if(WITH_RADOSGW_ARROW_FLIGHT)
+ # NOTE: eventually don't want this in common but just in radosgw daemon
+ # list(APPEND radosgw_srcs rgw_flight.cc rgw_flight_frontend.cc)
+ list(APPEND librgw_common_srcs rgw_flight.cc rgw_flight_frontend.cc)
+endif(WITH_RADOSGW_ARROW_FLIGHT)
+
+
+add_library(rgw_common STATIC ${librgw_common_srcs})
+
+include(CheckCXXCompilerFlag)
+check_cxx_compiler_flag("-Wimplicit-const-int-float-conversion"
+ COMPILER_SUPPORTS_WARN_IMPLICIT_CONST_INT_FLOAT_CONVERSION)
+if(COMPILER_SUPPORTS_WARN_IMPLICIT_CONST_INT_FLOAT_CONVERSION)
+ target_compile_definitions(common-objs PRIVATE
+ HAVE_WARN_IMPLICIT_CONST_INT_FLOAT_CONVERSION)
+endif()
+
+target_link_libraries(rgw_common
+ PRIVATE
+ global
+ cls_2pc_queue_client
+ cls_cmpomap_client
+ cls_lock_client
+ cls_log_client
+ cls_otp_client
+ cls_refcount_client
+ cls_rgw_client
+ cls_rgw_gc_client
+ cls_timeindex_client
+ cls_user_client
+ cls_version_client
+ librados
+ rt
+ ICU::uc
+ OATH::OATH
+ dmclock::dmclock
+ ${CURL_LIBRARIES}
+ ${EXPAT_LIBRARIES}
+ ${ARROW_LIBRARIES}
+ ${ARROW_FLIGHT_LIBRARIES}
+ ${ALLOC_LIBS}
+ PUBLIC
+ ${LUA_LIBRARIES}
+ RapidJSON::RapidJSON
+ spawn
+ fmt::fmt)
+target_include_directories(rgw_common
+ PUBLIC "${CMAKE_SOURCE_DIR}/src/rgw/services"
+ PUBLIC "${CMAKE_SOURCE_DIR}/src/rgw/driver/rados"
+ PUBLIC "${CMAKE_SOURCE_DIR}/src/rgw"
+ PUBLIC "${LUA_INCLUDE_DIR}")
+if(WITH_RADOSGW_KAFKA_ENDPOINT)
+ # used by rgw_kafka.cc
+ target_link_libraries(rgw_common
+ PRIVATE
+ RDKafka::RDKafka)
+endif()
+if(WITH_RADOSGW_AMQP_ENDPOINT)
+ # used by rgw_amqp.cc
+ target_link_libraries(rgw_common
+ PRIVATE
+ RabbitMQ::RabbitMQ
+ OpenSSL::SSL)
+endif()
+if(WITH_OPENLDAP)
+ target_link_libraries(rgw_common
+ PRIVATE
+ OpenLDAP::OpenLDAP)
+endif()
+if(WITH_RADOSGW_LUA_PACKAGES)
+ target_link_libraries(rgw_common
+ PRIVATE Boost::filesystem StdFilesystem::filesystem)
+endif()
+
+if(WITH_LTTNG)
+ # rgw/rgw_op.cc includes "tracing/rgw_op.h"
+ # rgw/rgw_rados.cc includes "tracing/rgw_rados.h"
+ add_dependencies(rgw_common rgw_op-tp rgw_rados-tp)
+endif()
+
+if(WITH_JAEGER)
+ add_dependencies(rgw_common jaeger_base)
+ target_link_libraries(rgw_common PUBLIC jaeger_base)
+endif()
+
+if(WITH_RADOSGW_DBSTORE)
+ target_link_libraries(rgw_common PRIVATE global dbstore)
+endif()
+
+if(WITH_RADOSGW_MOTR)
+ find_package(motr REQUIRED)
+ target_link_libraries(rgw_common PRIVATE motr::motr)
+endif()
+
+if(WITH_RADOSGW_DAOS)
+ find_package(DAOS REQUIRED)
+ set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DDEBUG ")
+ target_link_libraries(rgw_common PRIVATE daos dfs ds3 uuid duns)
+ target_include_directories(rgw_common PRIVATE ${PC_DAOS_INCLUDEDIR} )
+ link_directories( ${PC_DAOS_LIBRARY_DIRS} )
+endif()
+
+set(rgw_a_srcs
+ rgw_appmain.cc
+ rgw_asio_client.cc
+ rgw_asio_frontend.cc
+ rgw_auth_keystone.cc
+ rgw_client_io.cc
+ rgw_file.cc
+ rgw_frontend.cc
+ rgw_http_client_curl.cc
+ rgw_kmip_client_impl.cc
+ rgw_lib.cc
+ rgw_loadgen.cc
+ rgw_loadgen_process.cc
+ rgw_log.cc
+ rgw_lua_request.cc
+ rgw_opa.cc
+ rgw_os_lib.cc
+ rgw_period_pusher.cc
+ rgw_process.cc
+ rgw_realm_reloader.cc
+ rgw_realm_watcher.cc
+ rgw_rest_config.cc
+ rgw_rest_info.cc
+ rgw_rest_metadata.cc
+ rgw_rest_ratelimit.cc
+ rgw_rest_sts.cc
+ rgw_rest_swift.cc
+ rgw_rest_usage.cc
+ rgw_signal.cc
+ rgw_swift_auth.cc
+ rgw_usage.cc
+ rgw_sts.cc
+ driver/rados/rgw_rest_bucket.cc
+ driver/rados/rgw_rest_log.cc
+ driver/rados/rgw_rest_realm.cc)
+
+gperf_generate(${CMAKE_SOURCE_DIR}/src/rgw/rgw_iam_policy_keywords.gperf
+ rgw_iam_policy_keywords.frag.cc)
+set_source_files_properties(rgw_iam_policy.cc PROPERTIES
+ OBJECT_DEPENDS ${CMAKE_BINARY_DIR}/src/rgw/rgw_iam_policy_keywords.frag.cc
+ COMPILE_FLAGS -I${CMAKE_BINARY_DIR}/src/rgw)
+
+
+add_library(rgw_a STATIC
+ ${rgw_a_srcs})
+
+target_compile_definitions(rgw_a PUBLIC "-DCLS_CLIENT_HIDE_IOCTX")
+
+target_include_directories(rgw_a
+ PUBLIC "${CMAKE_SOURCE_DIR}/src/dmclock/support/src"
+ PUBLIC "${CMAKE_SOURCE_DIR}/src/rgw"
+ PUBLIC "${CMAKE_SOURCE_DIR}/src/rgw/driver/rados"
+ PRIVATE "${CMAKE_SOURCE_DIR}/src/libkmip")
+
+if(WITH_RADOSGW_AMQP_ENDPOINT)
+ find_package(RabbitMQ REQUIRED)
+endif()
+if(WITH_RADOSGW_KAFKA_ENDPOINT)
+ find_package(RDKafka 0.9.2 REQUIRED)
+endif()
+
+target_link_libraries(rgw_a
+ PRIVATE
+ common_utf8 global
+ ${CRYPTO_LIBS}
+ ${ARROW_LIBRARIES}
+ ${ARROW_FLIGHT_LIBRARIES}
+ OATH::OATH
+ PUBLIC
+ rgw_common
+ spawn)
+
+if(WITH_CURL_OPENSSL)
+ # used by rgw_http_client_curl.cc
+ target_link_libraries(rgw_a PRIVATE OpenSSL::Crypto)
+endif()
+
+set(rgw_libs rgw_a)
+
+set(rgw_schedulers_srcs
+ rgw_dmclock_scheduler_ctx.cc
+ rgw_dmclock_sync_scheduler.cc
+ rgw_dmclock_async_scheduler.cc)
+
+add_library(rgw_schedulers STATIC ${rgw_schedulers_srcs})
+target_link_libraries(rgw_schedulers
+ PUBLIC dmclock::dmclock spawn)
+
+set(radosgw_srcs
+ rgw_main.cc)
+
+add_executable(radosgw ${radosgw_srcs})
+
+if(WITH_RADOSGW_ARROW_FLIGHT)
+ # target_compile_definitions(radosgw PUBLIC WITH_ARROW_FLIGHT)
+ target_compile_definitions(rgw_common PUBLIC WITH_ARROW_FLIGHT)
+ target_include_directories(rgw_common
+ PUBLIC "${CMAKE_SOURCE_DIR}/src/arrow/cpp/src")
+ # target_include_directories(radosgw PUBLIC Arrow::Arrow)
+endif(WITH_RADOSGW_ARROW_FLIGHT)
+
+target_compile_definitions(radosgw PUBLIC "-DCLS_CLIENT_HIDE_IOCTX")
+target_include_directories(radosgw
+ PUBLIC "${CMAKE_SOURCE_DIR}/src/dmclock/support/src"
+ PRIVATE "${CMAKE_SOURCE_DIR}/src/libkmip"
+ PUBLIC "${CMAKE_SOURCE_DIR}/src/rgw"
+ PRIVATE "${LUA_INCLUDE_DIR}")
+
+target_include_directories(radosgw SYSTEM PUBLIC "../rapidjson/include")
+
+target_link_libraries(radosgw PRIVATE ${rgw_libs} rgw_schedulers kmip)
+if(WITH_RADOSGW_BEAST_OPENSSL)
+ # used by rgw_asio_frontend.cc
+ target_link_libraries(radosgw PRIVATE OpenSSL::SSL)
+endif()
+install(TARGETS radosgw DESTINATION bin)
+
+set(radosgw_admin_srcs
+ rgw_admin.cc
+ rgw_sync_checkpoint.cc
+ rgw_orphan.cc)
+
+# this is unsatisfying and hopefully temporary; ARROW should not be
+# part of radosgw_admin
+if(WITH_RADOSGW_ARROW_FLIGHT)
+ list(APPEND radosgw_admin_srcs rgw_flight.cc)
+endif(WITH_RADOSGW_ARROW_FLIGHT)
+
+add_executable(radosgw-admin ${radosgw_admin_srcs})
+target_link_libraries(radosgw-admin ${rgw_libs} librados
+ cls_rgw_client cls_otp_client cls_lock_client cls_refcount_client
+ cls_log_client cls_timeindex_client
+ cls_version_client cls_user_client
+ global ${LIB_RESOLV}
+ OATH::OATH
+ ${CURL_LIBRARIES} ${EXPAT_LIBRARIES} ${BLKID_LIBRARIES})
+
+# this is unsatisfying and hopefully temporary; ARROW should not be
+# part of radosgw_admin
+if(WITH_RADOSGW_ARROW_FLIGHT)
+ target_link_libraries(radosgw-admin ${ARROW_LIBRARIES} ${ARROW_FLIGHT_LIBRARIES})
+endif(WITH_RADOSGW_ARROW_FLIGHT)
+
+install(TARGETS radosgw-admin DESTINATION bin)
+
+set(radosgw_es_srcs
+ rgw_es_main.cc)
+add_executable(radosgw-es ${radosgw_es_srcs})
+target_link_libraries(radosgw-es ${rgw_libs} librados
+ cls_rgw_client cls_otp_client cls_lock_client cls_refcount_client
+ cls_log_client cls_timeindex_client
+ cls_version_client cls_user_client
+ global ${LIB_RESOLV}
+ ${CURL_LIBRARIES} ${EXPAT_LIBRARIES} ${BLKID_LIBRARIES})
+install(TARGETS radosgw-es DESTINATION bin)
+
+set(radosgw_token_srcs
+ rgw_token.cc)
+add_executable(radosgw-token ${radosgw_token_srcs})
+target_link_libraries(radosgw-token librados
+ global ${ALLOC_LIBS})
+install(TARGETS radosgw-token DESTINATION bin)
+
+set(radosgw_object_expirer_srcs
+ rgw_object_expirer.cc)
+add_executable(radosgw-object-expirer ${radosgw_object_expirer_srcs})
+target_link_libraries(radosgw-object-expirer ${rgw_libs} librados
+ cls_rgw_client cls_otp_client cls_lock_client cls_refcount_client
+ cls_log_client cls_timeindex_client
+ cls_version_client cls_user_client
+ global ${LIB_RESOLV}
+ ${CURL_LIBRARIES} ${EXPAT_LIBRARIES})
+install(TARGETS radosgw-object-expirer DESTINATION bin)
+
+set(radosgw_polparser_srcs
+ rgw_polparser.cc)
+add_executable(rgw-policy-check ${radosgw_polparser_srcs})
+target_link_libraries(rgw-policy-check ${rgw_libs})
+install(TARGETS rgw-policy-check DESTINATION bin)
+
+set(librgw_srcs
+ librgw.cc)
+add_library(rgw SHARED ${librgw_srcs})
+
+target_compile_definitions(rgw PUBLIC "-DCLS_CLIENT_HIDE_IOCTX")
+target_include_directories(rgw
+ PUBLIC "${CMAKE_SOURCE_DIR}/src/dmclock/support/src"
+ PRIVATE "${CMAKE_SOURCE_DIR}/src/libkmip"
+ PUBLIC "${CMAKE_SOURCE_DIR}/src/rgw"
+ PRIVATE "${LUA_INCLUDE_DIR}")
+
+target_include_directories(rgw SYSTEM PUBLIC "../rapidjson/include")
+
+target_link_libraries(rgw
+ PRIVATE
+ ${rgw_libs}
+ rgw_schedulers
+ kmip
+ librados
+ cls_rgw_client
+ cls_otp_client
+ cls_lock_client
+ cls_refcount_client
+ cls_log_client
+ cls_timeindex_client
+ cls_version_client
+ cls_user_client
+ ${LIB_RESOLV}
+ ${CURL_LIBRARIES}
+ ${EXPAT_LIBRARIES}
+ PUBLIC
+ RapidJSON::RapidJSON
+ dmclock::dmclock)
+
+if(WITH_RADOSGW_AMQP_ENDPOINT)
+ target_link_libraries(rgw PRIVATE RabbitMQ::RabbitMQ)
+ target_link_libraries(rgw PRIVATE OpenSSL::SSL)
+endif()
+
+if(WITH_RADOSGW_KAFKA_ENDPOINT)
+ target_link_libraries(rgw PRIVATE RDKafka::RDKafka)
+endif()
+
+set_target_properties(rgw PROPERTIES OUTPUT_NAME rgw VERSION 2.0.0
+ SOVERSION 2)
+install(TARGETS rgw DESTINATION ${CMAKE_INSTALL_LIBDIR})
+
+if(WITH_TESTS)
+ add_executable(ceph_rgw_jsonparser
+ rgw_jsonparser.cc)
+ target_link_libraries(ceph_rgw_jsonparser
+ ${rgw_libs}
+ global)
+
+ add_executable(ceph_rgw_multiparser
+ rgw_multiparser.cc)
+ target_link_libraries(ceph_rgw_multiparser
+ ${rgw_libs}
+ global)
+
+ install(TARGETS
+ ceph_rgw_jsonparser
+ ceph_rgw_multiparser
+ DESTINATION bin)
+endif(WITH_TESTS)
+
+install(PROGRAMS
+ rgw-gap-list
+ rgw-gap-list-comparator
+ rgw-orphan-list
+ rgw-restore-bucket-index
+ DESTINATION bin)
diff --git a/src/rgw/MAINTAINERS.md b/src/rgw/MAINTAINERS.md
new file mode 100644
index 000000000..4636a636e
--- /dev/null
+++ b/src/rgw/MAINTAINERS.md
@@ -0,0 +1,28 @@
+# RGW Maintainers
+
+Maintainers are the default assignee for related tracker issues and pull requests.
+
+| Component | Name |
+|---------------------------------|---------------------------------|
+| auth, STS | Pritha Srivastava |
+| bucket index, resharding | J. Eric Ivancich |
+| bucket notifications | Yuval Lifshitz |
+| data caching | Mark Kogan |
+| garbage collection | Pritha Srivastava |
+| http frontends | Casey Bodley |
+| lifecycle | Matt Benjamin |
+| lua scripting | Yuval Lifshitz |
+| multisite | Casey Bodley |
+| object i/o | Casey Bodley |
+| rgw orchestration, admin APIs | Ali Maredia |
+| radosgw-admin | Daniel Gryniewicz |
+| rest ops | Daniel Gryniewicz |
+| rgw-nfs | Matt Benjamin |
+| performance | Mark Kogan |
+| s3 select | Gal Salomon |
+| storage abstraction layer | Daniel Gryniewicz |
+
+# Looking for maintainer
+
+* security (crypto, SSE, CVEs)
+* swift api
diff --git a/src/rgw/driver/daos/README.md b/src/rgw/driver/daos/README.md
new file mode 100644
index 000000000..de6d215a0
--- /dev/null
+++ b/src/rgw/driver/daos/README.md
@@ -0,0 +1,47 @@
+# DAOS
+
+Standalone RADOS Gateway (RGW) on [DAOS](http://daos.io/) (Experimental)
+
+## CMake Option
+
+Add below cmake option
+
+```bash
+ -DWITH_RADOSGW_DAOS=ON
+```
+
+## Build
+
+```bash
+ cd build
+ ninja [vstart]
+```
+
+## Running Test cluster
+
+Edit ceph.conf to add below option
+
+```conf
+ [client]
+ rgw backend store = daos
+```
+
+Restart vstart cluster or just RGW server
+
+```bash
+ [..] RGW=1 ../src/vstart.sh -d
+```
+
+The above configuration brings up an RGW server on DAOS.
+
+## Creating a test user
+
+ To create a `testid` user to be used for s3 operations, use the following command:
+
+ ```bash
+ akey='0555b35654ad1656d804'
+ skey='h7GhxuBLTrlhVUyxSPUKUV8r/2EI4ngqJxD7iBdBYLhwluN30JaT3Q=='
+ radosgw-admin user create --uid testid \
+   --access-key $akey --secret $skey \
+   --display-name 'M. Tester' --email tester@ceph.com --no-mon-config
+ ```
diff --git a/src/rgw/driver/dbstore/CMakeLists.txt b/src/rgw/driver/dbstore/CMakeLists.txt
new file mode 100644
index 000000000..a3aca7a64
--- /dev/null
+++ b/src/rgw/driver/dbstore/CMakeLists.txt
@@ -0,0 +1,71 @@
# Build scaffolding for the standalone DBStore backend: a backend-agnostic
# common library (dbstore_lib), an optional SQLite implementation, the
# manager library (dbstore) and a small test executable (dbstore-bin).
# NOTE(review): revisit the minimum CMake version requirement below.
cmake_minimum_required(VERSION 3.14.0)
project(dbstore)

option(USE_SQLITE "Enable SQLITE DB" ON)

set (CMAKE_INCLUDE_DIR ${CMAKE_INCLUDE_DIR} "${CMAKE_CURRENT_SOURCE_DIR}/common")

# backend-agnostic sources plus the config store
set(dbstore_srcs
  common/dbstore_log.h
  common/dbstore.h
  common/dbstore.cc
  config/store.cc)
IF(USE_SQLITE)
  list(APPEND dbstore_srcs
      config/sqlite.cc
      sqlite/connection.cc
      sqlite/error.cc
      sqlite/statement.cc)
endif()

set(dbstore_mgr_srcs
  dbstore_mgr.h
  dbstore_mgr.cc
  )

add_library(dbstore_lib ${dbstore_srcs})
target_include_directories(dbstore_lib
  PUBLIC "${CMAKE_SOURCE_DIR}/src/rgw"
  PUBLIC "${CMAKE_SOURCE_DIR}/src/rgw/store/rados"
  PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}")
set(link_targets spawn)
if(WITH_JAEGER)
  list(APPEND link_targets jaeger_base)
endif()
list(APPEND link_targets rgw_common)
target_link_libraries(dbstore_lib PUBLIC ${link_targets})

set (CMAKE_LINK_LIBRARIES ${CMAKE_LINK_LIBRARIES} dbstore_lib)

IF(USE_SQLITE)
  add_subdirectory(sqlite)
  set(CMAKE_INCLUDE_DIR ${CMAKE_INCLUDE_DIR} "${CMAKE_CURRENT_SOURCE_DIR}/sqlite")
  add_compile_definitions(SQLITE_ENABLED=1)
  set (CMAKE_LINK_LIBRARIES ${CMAKE_LINK_LIBRARIES} rgw_common)
  set (CMAKE_LINK_LIBRARIES ${CMAKE_LINK_LIBRARIES} sqlite_db)
  add_dependencies(sqlite_db dbstore_lib)
ENDIF()

# add pthread library
set (CMAKE_LINK_LIBRARIES ${CMAKE_LINK_LIBRARIES} pthread)

# NOTE(review): the find_package(gtest) result is never consulted; the branch
# below is gated on WITH_TESTS only, so the warning fires when WITH_TESTS is
# OFF rather than when gtest is missing — confirm this is intended.
find_package(gtest QUIET)
if(WITH_TESTS)
  add_subdirectory(tests)
else()
  message(WARNING "Gtest not enabled")
endif()

include_directories(${CMAKE_INCLUDE_DIR})
add_library(dbstore STATIC ${dbstore_mgr_srcs})
target_link_libraries(dbstore ${CMAKE_LINK_LIBRARIES})

# testing purpose
set(dbstore_main_srcs
  dbstore_main.cc)

set (CMAKE_LINK_LIBRARIES ${CMAKE_LINK_LIBRARIES} dbstore)
add_executable(dbstore-bin ${dbstore_main_srcs})
add_dependencies(dbstore-bin dbstore)
target_link_libraries(dbstore-bin ${CMAKE_LINK_LIBRARIES})
diff --git a/src/rgw/driver/dbstore/README.md b/src/rgw/driver/dbstore/README.md
new file mode 100644
index 000000000..0867bc2cc
--- /dev/null
+++ b/src/rgw/driver/dbstore/README.md
@@ -0,0 +1,53 @@
+# DBStore
+Standalone Rados Gateway (RGW) on DBStore (Experimental)
+
+
+## CMake Option
+Add below cmake option (enabled by default)
+
+ -DWITH_RADOSGW_DBSTORE=ON
+
+
+## Build
+
+ cd build
+ ninja [vstart]
+
+
+## Running Test cluster
+Edit ceph.conf to add below option
+
+ [client]
+ rgw backend store = dbstore
+
+Start vstart cluster
+
+ [..] RGW=1 ../src/vstart.sh -o rgw_backend_store=dbstore -n -d
+
+The above vstart command brings up the RGW server on dbstore and creates a few default users (e.g., testid) to be used for s3 operations.
+
+`radosgw-admin` can be used to create and remove other users.
+
+
+By default, dbstore creates .db file *'/var/lib/ceph/radosgw/dbstore-default_ns.db'* to store the data. This can be configured using below options in ceph.conf
+
+ [client]
+ dbstore db dir = <path for the directory for storing the db backend store data>
+ dbstore db name prefix = <prefix to the file names created by db backend store>
+
+
+## DBStore Unit Tests
+To execute DBStore unit test cases (using Gtest framework), from build directory
+
+ ninja unittest_dbstore_tests
+ ./bin/unittest_dbstore_tests [logfile] [loglevel]
+ (default logfile: rgw_dbstore_tests.log, loglevel: 20)
+ ninja unittest_dbstore_mgr_tests
+ ./bin/unittest_dbstore_mgr_tests
+
+To execute Sample test file
+
+ ninja src/rgw/driver/dbstore/install
+ ./bin/dbstore-bin [logfile] [loglevel]
+ (default logfile: rgw_dbstore_bin.log, loglevel: 20)
+
diff --git a/src/rgw/driver/dbstore/common/connection_pool.h b/src/rgw/driver/dbstore/common/connection_pool.h
new file mode 100644
index 000000000..07f3c81c3
--- /dev/null
+++ b/src/rgw/driver/dbstore/common/connection_pool.h
@@ -0,0 +1,147 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <concepts>
+#include <condition_variable>
+#include <memory>
+#include <mutex>
+#include <boost/circular_buffer.hpp>
+#include "common/dout.h"
+
+namespace rgw::dbstore {
+
+template <typename Connection>
+class ConnectionHandle;
+
+/// A thread-safe base class that manages a fixed-size pool of generic database
+/// connections and supports the reclamation of ConnectionHandles. This class
+/// is the subset of ConnectionPool which doesn't depend on the Factory type.
+template <typename Connection>
+class ConnectionPoolBase {
+ public:
+ ConnectionPoolBase(std::size_t max_connections)
+ : connections(max_connections)
+ {}
+ private:
+ friend class ConnectionHandle<Connection>;
+
+ // TODO: the caller may detect a connection error that prevents the connection
+ // from being reused. allow them to indicate these errors here
+ void put(std::unique_ptr<Connection> connection)
+ {
+ auto lock = std::scoped_lock{mutex};
+ connections.push_back(std::move(connection));
+
+ if (connections.size() == 1) { // was empty
+ cond.notify_one();
+ }
+ }
+ protected:
+ std::mutex mutex;
+ std::condition_variable cond;
+ boost::circular_buffer<std::unique_ptr<Connection>> connections;
+};
+
/// Handle to a database connection borrowed from the pool. Automatically
/// returns the connection to its pool on the handle's destruction.
/// Moving transfers the borrowed connection; a moved-from handle is empty.
/// NOTE: the defaulted move constructor leaves the moved-from handle's 'pool'
/// pointer set, but that is harmless because 'conn' is null and 'pool' is
/// only dereferenced when 'conn' is non-null.
template <typename Connection>
class ConnectionHandle {
  ConnectionPoolBase<Connection>* pool = nullptr;  // pool to return 'conn' to
  std::unique_ptr<Connection> conn;                // null for an empty handle
 public:
  ConnectionHandle() noexcept = default;
  ConnectionHandle(ConnectionPoolBase<Connection>* pool,
                   std::unique_ptr<Connection> conn) noexcept
      : pool(pool), conn(std::move(conn)) {}

  // Give the connection (if any) back to the pool.
  ~ConnectionHandle() {
    if (conn) {
      pool->put(std::move(conn));
    }
  }

  ConnectionHandle(ConnectionHandle&&) = default;
  // Release any currently-held connection back to its pool before taking
  // over the source handle's connection and pool pointer.
  ConnectionHandle& operator=(ConnectionHandle&& o) noexcept {
    if (conn) {
      pool->put(std::move(conn));
    }
    conn = std::move(o.conn);
    pool = o.pool;
    return *this;
  }

  explicit operator bool() const noexcept { return static_cast<bool>(conn); }
  Connection& operator*() const noexcept { return *conn; }
  Connection* operator->() const noexcept { return conn.get(); }
  Connection* get() const noexcept { return conn.get(); }
};
+
+
// factory_of concept requires the function signature:
//   F(const DoutPrefixProvider*) -> std::unique_ptr<T>
// i.e. F is a move-constructible callable that creates connections on demand.
template <typename F, typename T>
concept factory_of = requires (F factory, const DoutPrefixProvider* dpp) {
  { factory(dpp) } -> std::same_as<std::unique_ptr<T>>;
  requires std::move_constructible<F>;
};
+
+
/// Generic database connection pool that enforces a limit on open connections.
template <typename Connection, factory_of<Connection> Factory>
class ConnectionPool : public ConnectionPoolBase<Connection> {
 public:
  ConnectionPool(Factory factory, std::size_t max_connections)
    : ConnectionPoolBase<Connection>(max_connections),
      factory(std::move(factory))
  {}

  /// Borrow a connection from the pool. If all existing connections are in use,
  /// use the connection factory to create another one. If we've reached the
  /// limit on open connections, wait on a condition variable for the next one
  /// returned to the pool.
  auto get(const DoutPrefixProvider* dpp)
    -> ConnectionHandle<Connection>
  {
    auto lock = std::unique_lock{this->mutex};
    std::unique_ptr<Connection> conn;

    if (!this->connections.empty()) {
      // take an existing connection
      conn = std::move(this->connections.front());
      this->connections.pop_front();
    } else if (total < this->connections.capacity()) {
      // add another connection to the pool
      // NOTE(review): factory() runs with the pool mutex held, so a slow
      // connection setup blocks concurrent get()/put() — confirm acceptable.
      conn = factory(dpp);
      ++total;
    } else {
      // wait for the next put()
      // TODO: support optional_yield
      ldpp_dout(dpp, 4) << "ConnectionPool waiting on a connection" << dendl;
      this->cond.wait(lock, [&] { return !this->connections.empty(); });
      ldpp_dout(dpp, 4) << "ConnectionPool done waiting" << dendl;
      conn = std::move(this->connections.front());
      this->connections.pop_front();
    }

    return {this, std::move(conn)};
  }
 private:
  Factory factory;        // creates new connections, up to buffer capacity
  std::size_t total = 0;  // connections created so far; never decremented
};
+
+} // namespace rgw::dbstore
diff --git a/src/rgw/driver/dbstore/common/dbstore.cc b/src/rgw/driver/dbstore/common/dbstore.cc
new file mode 100644
index 000000000..dc5a90c31
--- /dev/null
+++ b/src/rgw/driver/dbstore/common/dbstore.cc
@@ -0,0 +1,2252 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "dbstore.h"
+
+using namespace std;
+
+namespace rgw { namespace store {
+
// Bucket name -> per-bucket ObjectOp table, shared by all DB instances.
map<string, class ObjectOp*> DB::objectmap = {};

// Return a copy of the bucket->ObjectOp map.
// NOTE(review): copies the whole map and reads it without taking 'mtx';
// fine for debugging/inspection, but confirm callers do not race with
// objectmapInsert()/objectmapDelete().
map<string, class ObjectOp*> DB::getObjectMap() {
  return DB::objectmap;
}
+
// Open the backend database and prepare the common DB operations.
//
// @param logfile  if non-empty, redirect the ceph log to this file
// @param loglevel if > 0, override the rgw subsystem's log level
// @return 0 on success, negative on failure (no CephContext, open or
//         prepare error)
int DB::Initialize(string logfile, int loglevel)
{
  int ret = -1;
  const DoutPrefixProvider *dpp = get_def_dpp();

  // A CephContext is required for config and logging below.
  if (!cct) {
    cout << "Failed to Initialize. No ceph Context \n";
    return -1;
  }

  if (loglevel > 0) {
    cct->_conf->subsys.set_log_level(ceph_subsys_rgw, loglevel);
  }
  if (!logfile.empty()) {
    cct->_log->set_log_file(logfile);
    cct->_log->reopen_log_file();
  }


  // openDB() is backend-specific (e.g. sqlite); returns null on failure.
  db = openDB(dpp);

  if (!db) {
    ldpp_dout(dpp, 0) <<"Failed to open database " << dendl;
    return ret;
  }

  // Prepare the common DB ops; on failure close the handle so the object
  // is left in the "not initialized" state.
  ret = InitializeDBOps(dpp);

  if (ret) {
    ldpp_dout(dpp, 0) <<"InitializeDBOps failed " << dendl;
    closeDB(dpp);
    db = NULL;
    return ret;
  }

  ldpp_dout(dpp, 0) << "DB successfully initialized - name:" \
    << db_name << "" << dendl;

  return ret;
}
+
+int DB::createGC(const DoutPrefixProvider *dpp) {
+ int ret = 0;
+ /* create gc thread */
+
+ gc_worker = std::make_unique<DB::GC>(dpp, this);
+ gc_worker->create("db_gc");
+
+ return ret;
+}
+
+int DB::stopGC() {
+ if (gc_worker) {
+ gc_worker->signal_stop();
+ gc_worker->join();
+ }
+ return 0;
+}
+
+int DB::Destroy(const DoutPrefixProvider *dpp)
+{
+ if (!db)
+ return 0;
+
+ stopGC();
+
+ closeDB(dpp);
+
+
+ ldpp_dout(dpp, 20)<<"DB successfully destroyed - name:" \
+ <<db_name << dendl;
+
+ return 0;
+}
+
+
+std::shared_ptr<class DBOp> DB::getDBOp(const DoutPrefixProvider *dpp, std::string_view Op,
+ const DBOpParams *params)
+{
+ if (!Op.compare("InsertUser"))
+ return dbops.InsertUser;
+ if (!Op.compare("RemoveUser"))
+ return dbops.RemoveUser;
+ if (!Op.compare("GetUser"))
+ return dbops.GetUser;
+ if (!Op.compare("InsertBucket"))
+ return dbops.InsertBucket;
+ if (!Op.compare("UpdateBucket"))
+ return dbops.UpdateBucket;
+ if (!Op.compare("RemoveBucket"))
+ return dbops.RemoveBucket;
+ if (!Op.compare("GetBucket"))
+ return dbops.GetBucket;
+ if (!Op.compare("ListUserBuckets"))
+ return dbops.ListUserBuckets;
+ if (!Op.compare("InsertLCEntry"))
+ return dbops.InsertLCEntry;
+ if (!Op.compare("RemoveLCEntry"))
+ return dbops.RemoveLCEntry;
+ if (!Op.compare("GetLCEntry"))
+ return dbops.GetLCEntry;
+ if (!Op.compare("ListLCEntries"))
+ return dbops.ListLCEntries;
+ if (!Op.compare("InsertLCHead"))
+ return dbops.InsertLCHead;
+ if (!Op.compare("RemoveLCHead"))
+ return dbops.RemoveLCHead;
+ if (!Op.compare("GetLCHead"))
+ return dbops.GetLCHead;
+
+ /* Object Operations */
+ map<string, class ObjectOp*>::iterator iter;
+ class ObjectOp* Ob;
+
+ {
+ const std::lock_guard<std::mutex> lk(mtx);
+ iter = DB::objectmap.find(params->op.bucket.info.bucket.name);
+ }
+
+ if (iter == DB::objectmap.end()) {
+ ldpp_dout(dpp, 30)<<"No objectmap found for bucket: " \
+ <<params->op.bucket.info.bucket.name << dendl;
+ /* not found */
+ return nullptr;
+ }
+
+ Ob = iter->second;
+
+ if (!Op.compare("PutObject"))
+ return Ob->PutObject;
+ if (!Op.compare("DeleteObject"))
+ return Ob->DeleteObject;
+ if (!Op.compare("GetObject"))
+ return Ob->GetObject;
+ if (!Op.compare("UpdateObject"))
+ return Ob->UpdateObject;
+ if (!Op.compare("ListBucketObjects"))
+ return Ob->ListBucketObjects;
+ if (!Op.compare("ListVersionedObjects"))
+ return Ob->ListVersionedObjects;
+ if (!Op.compare("PutObjectData"))
+ return Ob->PutObjectData;
+ if (!Op.compare("UpdateObjectData"))
+ return Ob->UpdateObjectData;
+ if (!Op.compare("GetObjectData"))
+ return Ob->GetObjectData;
+ if (!Op.compare("DeleteObjectData"))
+ return Ob->DeleteObjectData;
+ if (!Op.compare("DeleteStaleObjectData"))
+ return Ob->DeleteStaleObjectData;
+
+ return nullptr;
+}
+
+int DB::objectmapInsert(const DoutPrefixProvider *dpp, string bucket, class ObjectOp* ptr)
+{
+ map<string, class ObjectOp*>::iterator iter;
+ class ObjectOp *Ob;
+
+ const std::lock_guard<std::mutex> lk(mtx);
+ iter = DB::objectmap.find(bucket);
+
+ if (iter != DB::objectmap.end()) {
+ // entry already exists
+ // return success or replace it or
+ // return error ?
+ //
+ // return success for now & delete the newly allocated ptr
+ ldpp_dout(dpp, 30)<<"Objectmap entry already exists for bucket("\
+ <<bucket<<"). Not inserted " << dendl;
+ delete ptr;
+ return 0;
+ }
+
+ Ob = (class ObjectOp*) ptr;
+ Ob->InitializeObjectOps(getDBname(), dpp);
+
+ DB::objectmap.insert(pair<string, class ObjectOp*>(bucket, Ob));
+
+ return 0;
+}
+
+int DB::objectmapDelete(const DoutPrefixProvider *dpp, string bucket)
+{
+ map<string, class ObjectOp*>::iterator iter;
+
+ const std::lock_guard<std::mutex> lk(mtx);
+ iter = DB::objectmap.find(bucket);
+
+ if (iter == DB::objectmap.end()) {
+ // entry doesn't exist
+ // return success or return error ?
+ // return success for now
+ ldpp_dout(dpp, 20)<<"Objectmap entry for bucket("<<bucket<<") "
+ <<"doesnt exist to delete " << dendl;
+ return 0;
+ }
+
+ DB::objectmap.erase(iter);
+
+ return 0;
+}
+
+int DB::InitializeParams(const DoutPrefixProvider *dpp, DBOpParams *params)
+{
+ int ret = -1;
+
+ if (!params)
+ goto out;
+
+ params->cct = cct;
+
+ //reset params here
+ params->user_table = user_table;
+ params->bucket_table = bucket_table;
+ params->quota_table = quota_table;
+ params->lc_entry_table = lc_entry_table;
+ params->lc_head_table = lc_head_table;
+
+ ret = 0;
+out:
+ return ret;
+}
+
+int DB::ProcessOp(const DoutPrefixProvider *dpp, std::string_view Op, DBOpParams *params) {
+ int ret = -1;
+ shared_ptr<class DBOp> db_op;
+
+ db_op = getDBOp(dpp, Op, params);
+
+ if (!db_op) {
+ ldpp_dout(dpp, 0)<<"No db_op found for Op("<<Op<<")" << dendl;
+ return ret;
+ }
+ ret = db_op->Execute(dpp, params);
+
+ if (ret) {
+ ldpp_dout(dpp, 0)<<"In Process op Execute failed for fop(" << Op << ")" << dendl;
+ } else {
+ ldpp_dout(dpp, 20)<<"Successfully processed fop(" << Op << ")" << dendl;
+ }
+
+ return ret;
+}
+
// Fetch a single user row matched on one indexed column.
//
// @param query_str      column to match: "username" (display name), "email",
//                       "access_key" or "user_id"
// @param query_str_val  value to match against that column (for "user_id"
//                       the id is taken from uinfo.user_id instead)
// @param uinfo          in: supplies user_id for "user_id" queries;
//                       out: overwritten with the fetched user info
// @param pattrs         optional out: the user's stored attrs
// @param pobjv_tracker  optional out: read_version of the stored user row
// @return 0 on success, -1 on invalid arguments or unknown query_str,
//         -ENOENT when no matching user exists
int DB::get_user(const DoutPrefixProvider *dpp,
    const std::string& query_str, const std::string& query_str_val,
    RGWUserInfo& uinfo, map<string, bufferlist> *pattrs,
    RGWObjVersionTracker *pobjv_tracker) {
  int ret = 0;

  if (query_str.empty() || query_str_val.empty()) {
    ldpp_dout(dpp, 0)<<"In GetUser - Invalid query(" << query_str <<"), query_str_val(" << query_str_val <<")" << dendl;
    return -1;
  }

  DBOpParams params = {};
  InitializeParams(dpp, &params);

  params.op.query_str = query_str;

  // validate query_str with UserTable entries names
  if (query_str == "username") {
    params.op.user.uinfo.display_name = query_str_val;
  } else if (query_str == "email") {
    params.op.user.uinfo.user_email = query_str_val;
  } else if (query_str == "access_key") {
    // marshal the key id as the sole entry of the access_keys map
    RGWAccessKey k(query_str_val, "");
    map<string, RGWAccessKey> keys;
    keys[query_str_val] = k;
    params.op.user.uinfo.access_keys = keys;
  } else if (query_str == "user_id") {
    params.op.user.uinfo.user_id = uinfo.user_id;
  } else {
    ldpp_dout(dpp, 0)<<"In GetUser Invalid query string :" <<query_str.c_str()<<") " << dendl;
    return -1;
  }

  ret = ProcessOp(dpp, "GetUser", &params);

  if (ret)
    goto out;

  /* Verify if its a valid user */
  // an empty access_keys/user_id in the result means no row matched
  if (params.op.user.uinfo.access_keys.empty() ||
      params.op.user.uinfo.user_id.id.empty()) {
    ldpp_dout(dpp, 0)<<"In GetUser - No user with query(" <<query_str.c_str()<<"), user_id(" << uinfo.user_id <<") found" << dendl;
    return -ENOENT;
  }

  uinfo = params.op.user.uinfo;

  if (pattrs) {
    *pattrs = params.op.user.user_attrs;
  }

  if (pobjv_tracker) {
    pobjv_tracker->read_version = params.op.user.user_version;
  }

out:
  return ret;
}
+
// Insert or update a user row, with optimistic-concurrency checking.
//
// @param uinfo      the user info to store
// @param exclusive  if true and the user already exists, do nothing
// @param pattrs     optional: attrs to store with the user
// @param pobjv      optional in/out: caller's version tracker; the stored
//                   version must match its read_version or -ECANCELED is
//                   returned; updated with the new version on success
// @param pold_info  optional out: the pre-existing user info, if any
// @return 0 on success, -ECANCELED on version mismatch, other negative on
//         DB errors
int DB::store_user(const DoutPrefixProvider *dpp,
    RGWUserInfo& uinfo, bool exclusive, map<string, bufferlist> *pattrs,
    RGWObjVersionTracker *pobjv, RGWUserInfo* pold_info)
{
  DBOpParams params = {};
  InitializeParams(dpp, &params);
  int ret = 0;

  /* Check if the user already exists and return the old info, caller will have a use for it */
  RGWUserInfo orig_info;
  RGWObjVersionTracker objv_tracker = {};
  obj_version& obj_ver = objv_tracker.read_version;

  orig_info.user_id = uinfo.user_id;
  ret = get_user(dpp, string("user_id"), uinfo.user_id.id, orig_info, nullptr, &objv_tracker);

  if (!ret && obj_ver.ver) {
    /* already exists. */

    if (pold_info) {
      *pold_info = orig_info;
    }

    if (pobjv && (pobjv->read_version.ver != obj_ver.ver)) {
      /* Object version mismatch.. return ECANCELED */
      ret = -ECANCELED;
      ldpp_dout(dpp, 0)<<"User Read version mismatch err:(" <<ret<<") " << dendl;
      return ret;
    }

    if (exclusive) {
      // exclusive insert requested but the user exists: succeed without writing
      return ret;
    }
    // bump the version for the update
    obj_ver.ver++;
  } else {
    // new user: start versioning at 1
    obj_ver.ver = 1;
    obj_ver.tag = "UserTAG";
  }

  params.op.user.user_version = obj_ver;
  params.op.user.uinfo = uinfo;

  if (pattrs) {
    params.op.user.user_attrs = *pattrs;
  }

  // "InsertUser" performs an upsert keyed on user_id
  ret = ProcessOp(dpp, "InsertUser", &params);

  if (ret) {
    ldpp_dout(dpp, 0)<<"store_user failed with err:(" <<ret<<") " << dendl;
    goto out;
  }
  ldpp_dout(dpp, 20)<<"User creation successful - userid:(" <<uinfo.user_id<<") " << dendl;

  // report the stored version back to the caller
  if (pobjv) {
    pobjv->read_version = obj_ver;
    pobjv->write_version = obj_ver;
  }

out:
  return ret;
}
+
+int DB::remove_user(const DoutPrefixProvider *dpp,
+ RGWUserInfo& uinfo, RGWObjVersionTracker *pobjv)
+{
+ DBOpParams params = {};
+ InitializeParams(dpp, &params);
+ int ret = 0;
+
+ RGWUserInfo orig_info;
+ RGWObjVersionTracker objv_tracker = {};
+
+ orig_info.user_id = uinfo.user_id;
+ ret = get_user(dpp, string("user_id"), uinfo.user_id.id, orig_info, nullptr, &objv_tracker);
+
+ if (ret) {
+ return ret;
+ }
+
+ if (!ret && objv_tracker.read_version.ver) {
+ /* already exists. */
+
+ if (pobjv && (pobjv->read_version.ver != objv_tracker.read_version.ver)) {
+ /* Object version mismatch.. return ECANCELED */
+ ret = -ECANCELED;
+ ldpp_dout(dpp, 0)<<"User Read version mismatch err:(" <<ret<<") " << dendl;
+ return ret;
+ }
+ }
+
+ params.op.user.uinfo.user_id = uinfo.user_id;
+
+ ret = ProcessOp(dpp, "RemoveUser", &params);
+
+ if (ret) {
+ ldpp_dout(dpp, 0)<<"remove_user failed with err:(" <<ret<<") " << dendl;
+ goto out;
+ }
+
+out:
+ return ret;
+}
+
+int DB::get_bucket_info(const DoutPrefixProvider *dpp, const std::string& query_str,
+ const std::string& query_str_val,
+ RGWBucketInfo& info,
+ rgw::sal::Attrs* pattrs, ceph::real_time* pmtime,
+ obj_version* pbucket_version) {
+ int ret = 0;
+
+ if (query_str.empty()) {
+ // not checking for query_str_val as the query can be to fetch
+ // entries with null values
+ return -1;
+ }
+
+ DBOpParams params = {};
+ DBOpParams params2 = {};
+ InitializeParams(dpp, &params);
+
+ if (query_str == "name") {
+ params.op.bucket.info.bucket.name = info.bucket.name;
+ } else {
+ ldpp_dout(dpp, 0)<<"In GetBucket Invalid query string :" <<query_str.c_str()<<") " << dendl;
+ return -1;
+ }
+
+ ret = ProcessOp(dpp, "GetBucket", &params);
+
+ if (ret) {
+ ldpp_dout(dpp, 0)<<"In GetBucket failed err:(" <<ret<<") " << dendl;
+ goto out;
+ }
+
+ if (!ret && params.op.bucket.info.bucket.marker.empty()) {
+ return -ENOENT;
+ }
+ info = params.op.bucket.info;
+
+ if (pattrs) {
+ *pattrs = params.op.bucket.bucket_attrs;
+ }
+
+ if (pmtime) {
+ *pmtime = params.op.bucket.mtime;
+ }
+ if (pbucket_version) {
+ *pbucket_version = params.op.bucket.bucket_version;
+ }
+
+out:
+ return ret;
+}
+
// Create a bucket row. If the bucket already exists and 'exclusive' is set,
// the existing info is returned instead of overwriting.
//
// @param owner           becomes the bucket owner
// @param bucket          in/out: name in; marker/bucket_id assigned here
// @param pobjv           optional: explicit write version; otherwise a new
//                        one is generated
// @param creation_time   zero means "now"
// @param pmaster_bucket, pmaster_num_shards, pep_objv, y: accepted for
//                        interface compatibility; not consulted below
// @return 0 on success, negative on DB error
int DB::create_bucket(const DoutPrefixProvider *dpp,
    const RGWUserInfo& owner, rgw_bucket& bucket,
    const string& zonegroup_id,
    const rgw_placement_rule& placement_rule,
    const string& swift_ver_location,
    const RGWQuotaInfo * pquota_info,
    map<std::string, bufferlist>& attrs,
    RGWBucketInfo& info,
    obj_version *pobjv,
    obj_version *pep_objv,
    real_time creation_time,
    rgw_bucket *pmaster_bucket,
    uint32_t *pmaster_num_shards,
    optional_yield y,
    bool exclusive)
{
  /*
   * XXX: Simple creation for now.
   *
   * Referring to RGWRados::create_bucket(),
   * Check if bucket already exists, select_bucket_placement,
   * is explicit put/remove instance info needed? - should not be ideally
   */

  DBOpParams params = {};
  InitializeParams(dpp, &params);
  int ret = 0;

  /* Check if the bucket already exists and return the old info, caller will have a use for it */
  RGWBucketInfo orig_info;
  orig_info.bucket.name = bucket.name;
  ret = get_bucket_info(dpp, string("name"), "", orig_info, nullptr, nullptr, nullptr);

  if (!ret && !orig_info.owner.id.empty() && exclusive) {
    /* already exists. Return the old info */

    info = std::move(orig_info);
    return ret;
  }

  RGWObjVersionTracker& objv_tracker = info.objv_tracker;

  objv_tracker.read_version.clear();

  // caller-supplied write version wins; otherwise generate a fresh one
  if (pobjv) {
    objv_tracker.write_version = *pobjv;
  } else {
    objv_tracker.generate_new_write_ver(cct);
  }
  params.op.bucket.bucket_version = objv_tracker.write_version;
  objv_tracker.read_version = params.op.bucket.bucket_version;

  // synthesize a unique marker/bucket_id from the db name and a counter
  uint64_t bid = next_bucket_id();
  string s = getDBname() + "." + std::to_string(bid);
  bucket.marker = bucket.bucket_id = s;

  info.bucket = bucket;
  info.owner = owner.user_id;
  info.zonegroup = zonegroup_id;
  info.placement_rule = placement_rule;
  info.swift_ver_location = swift_ver_location;
  info.swift_versioning = (!swift_ver_location.empty());

  info.requester_pays = false;
  if (real_clock::is_zero(creation_time)) {
    info.creation_time = ceph::real_clock::now();
  } else {
    info.creation_time = creation_time;
  }
  if (pquota_info) {
    info.quota = *pquota_info;
  }

  params.op.bucket.info = info;
  params.op.bucket.bucket_attrs = attrs;
  params.op.bucket.mtime = ceph::real_time();
  params.op.user.uinfo.user_id.id = owner.user_id.id;

  ret = ProcessOp(dpp, "InsertBucket", &params);

  if (ret) {
    ldpp_dout(dpp, 0)<<"create_bucket failed with err:(" <<ret<<") " << dendl;
    goto out;
  }

out:
  return ret;
}
+
+int DB::remove_bucket(const DoutPrefixProvider *dpp, const RGWBucketInfo info) {
+ int ret = 0;
+
+ DBOpParams params = {};
+ InitializeParams(dpp, &params);
+
+ params.op.bucket.info.bucket.name = info.bucket.name;
+
+ ret = ProcessOp(dpp, "RemoveBucket", &params);
+
+ if (ret) {
+ ldpp_dout(dpp, 0)<<"In RemoveBucket failed err:(" <<ret<<") " << dendl;
+ goto out;
+ }
+
+out:
+ return ret;
+}
+
+int DB::list_buckets(const DoutPrefixProvider *dpp, const std::string& query_str,
+ rgw_user& user,
+ const string& marker,
+ const string& end_marker,
+ uint64_t max,
+ bool need_stats,
+ RGWUserBuckets *buckets,
+ bool *is_truncated)
+{
+ int ret = 0;
+
+ DBOpParams params = {};
+ InitializeParams(dpp, &params);
+
+ params.op.user.uinfo.user_id = user;
+ params.op.bucket.min_marker = marker;
+ params.op.bucket.max_marker = end_marker;
+ params.op.list_max_count = max;
+ params.op.query_str = query_str;
+
+ ret = ProcessOp(dpp, "ListUserBuckets", &params);
+
+ if (ret) {
+ ldpp_dout(dpp, 0)<<"In ListUserBuckets failed err:(" <<ret<<") " << dendl;
+ goto out;
+ }
+
+ /* need_stats: stats are already part of entries... In case they are maintained in
+ * separate table , maybe use "Inner Join" with stats table for the query.
+ */
+ if (params.op.bucket.list_entries.size() == max)
+ *is_truncated = true;
+
+ for (auto& entry : params.op.bucket.list_entries) {
+ if (!end_marker.empty() &&
+ end_marker.compare(entry.bucket.marker) <= 0) {
+ *is_truncated = false;
+ break;
+ }
+ buckets->add(std::move(entry));
+ }
+
+ if (query_str == "all") {
+ // userID/OwnerID may have changed. Update it.
+ user.id = params.op.bucket.info.owner.id;
+ }
+
+out:
+ return ret;
+}
+
+/**
+ * Update an existing bucket record.
+ * query_str selects the update: "attrs" (bucket xattrs, requires pattrs),
+ * "owner" (chown; creation_time is refreshed too) or "info" (full info).
+ * With 'exclusive' set and the bucket already owned, the existing info is
+ * returned in 'info' without modifying anything. When pobjv is supplied,
+ * its read_version must match the stored bucket version or -ECANCELED is
+ * returned; on success both read/write versions are bumped.
+ */
+int DB::update_bucket(const DoutPrefixProvider *dpp, const std::string& query_str,
+                      RGWBucketInfo& info,
+                      bool exclusive,
+                      const rgw_user* powner_id,
+                      map<std::string, bufferlist>* pattrs,
+                      ceph::real_time* pmtime,
+                      RGWObjVersionTracker* pobjv)
+{
+  int ret = 0;
+  DBOpParams params = {};
+  obj_version bucket_version;
+  RGWBucketInfo orig_info;
+
+  /* Check if the bucket already exists and return the old info, caller will have a use for it */
+  orig_info.bucket.name = info.bucket.name;
+  params.op.bucket.info.bucket.name = info.bucket.name;
+  ret = get_bucket_info(dpp, string("name"), "", orig_info, nullptr, nullptr,
+                        &bucket_version);
+
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"Failed to read bucket info err:(" <<ret<<") " << dendl;
+    goto out;
+  }
+
+  if (!orig_info.owner.id.empty() && exclusive) {
+    /* already exists. Return the old info */
+
+    info = std::move(orig_info);
+    return ret;
+  }
+
+  /* Verify if the objv read_ver matches current bucket version */
+  if (pobjv) {
+    if (pobjv->read_version.ver != bucket_version.ver) {
+      ldpp_dout(dpp, 0)<<"Read version mismatch err:(" <<ret<<") " << dendl;
+      ret = -ECANCELED;
+      goto out;
+    }
+  } else {
+    pobjv = &info.objv_tracker;
+  }
+
+  InitializeParams(dpp, &params);
+
+  params.op.bucket.info.bucket.name = info.bucket.name;
+
+  if (powner_id) {
+    params.op.user.uinfo.user_id.id = powner_id->id;
+  } else {
+    params.op.user.uinfo.user_id.id = orig_info.owner.id;
+  }
+
+  /* Update version & mtime */
+  params.op.bucket.bucket_version.ver = ++(bucket_version.ver);
+
+  if (pmtime) {
+    params.op.bucket.mtime = *pmtime; /* fix: removed stray ';;' */
+  } else {
+    params.op.bucket.mtime = ceph::real_time();
+  }
+
+  if (query_str == "attrs") {
+    /* Fix: guard against a null attrs pointer; it was previously
+     * dereferenced unconditionally. */
+    if (!pattrs) {
+      ret = -EINVAL;
+      ldpp_dout(dpp, 0)<<"In UpdateBucket Invalid query_str : " << query_str << dendl;
+      goto out;
+    }
+    params.op.query_str = "attrs";
+    params.op.bucket.bucket_attrs = *pattrs;
+  } else if (query_str == "owner") {
+    /* Update only owner i.e, chown.
+     * Update creation_time too */
+    params.op.query_str = "owner";
+    params.op.bucket.info.creation_time = params.op.bucket.mtime;
+  } else if (query_str == "info") {
+    params.op.query_str = "info";
+    params.op.bucket.info = info;
+  } else {
+    ret = -1;
+    ldpp_dout(dpp, 0)<<"In UpdateBucket Invalid query_str : " << query_str << dendl;
+    goto out;
+  }
+
+  ret = ProcessOp(dpp, "UpdateBucket", &params);
+
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"In UpdateBucket failed err:(" <<ret<<") " << dendl;
+    goto out;
+  }
+
+  /* Reflect the committed version back to the caller's tracker. */
+  if (pobjv) {
+    pobjv->read_version = params.op.bucket.bucket_version;
+    pobjv->write_version = params.op.bucket.bucket_version;
+  }
+
+out:
+  return ret;
+}
+
+/**
+ * Get ordered listing of the objects in a bucket.
+ *
+ * max_p: maximum number of results to return
+ * bucket: bucket to list contents of
+ * prefix: only return results that match this prefix
+ * delim: delimiter used to roll entries up into common prefixes.
+ *    Results whose name contains the delimiter (past the prefix) are
+ *    skipped individually; the matching portion of their name, up to and
+ *    including the delimiter, is inserted in common_prefixes with a
+ *    "true" mark.
+ * marker: if filled in, begin the listing with this object.
+ * end_marker: if filled in, end the listing with this object.
+ * result: the objects are put in here.
+ * common_prefixes: if delim is filled in, any matching prefixes are
+ * placed here.
+ * is_truncated: if number of objects in the bucket is bigger than
+ * max, then truncated.
+ */
+// Ordered bucket listing; see the contract described in the comment block
+// above. Entries come back from the DB sorted by key, so version handling
+// relies on consecutive rows sharing the same key name.
+int DB::Bucket::List::list_objects(const DoutPrefixProvider *dpp, int64_t max,
+                                   vector<rgw_bucket_dir_entry> *result,
+                                   map<string, bool> *common_prefixes, bool *is_truncated)
+{
+  int ret = 0;
+  DB *store = target->get_store();
+  int64_t count = 0;
+  // Name of the previous row; used to spot older versions of the same key.
+  std::string prev_obj;
+
+  DBOpParams db_params = {};
+  store->InitializeParams(dpp, &db_params);
+
+  db_params.op.bucket.info = target->get_bucket_info();
+  /* XXX: Handle whole marker? key -> name, instance, ns? */
+  db_params.op.obj.min_marker = params.marker.name;
+  db_params.op.obj.max_marker = params.end_marker.name;
+  // '%' is the SQL LIKE wildcard: match everything under the prefix.
+  db_params.op.obj.prefix = params.prefix + "%";
+  db_params.op.list_max_count = max + 1; /* +1 for next_marker */
+
+  ret = store->ProcessOp(dpp, "ListBucketObjects", &db_params);
+
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"In ListBucketObjects failed err:(" <<ret<<") " << dendl;
+    goto out;
+  }
+
+  for (auto& entry : db_params.op.obj.list_entries) {
+
+    if (!params.list_versions) {
+      // Plain listing: only the newest (first) row per key is returned.
+      if (entry.flags & rgw_bucket_dir_entry::FLAG_DELETE_MARKER) {
+        prev_obj = entry.key.name;
+        // skip all non-current entries and delete_marker
+        continue;
+      }
+      if (entry.key.name == prev_obj) {
+        // non current versions..skip the entry
+        continue;
+      }
+      entry.flags |= rgw_bucket_dir_entry::FLAG_CURRENT;
+    } else {
+      // Versioned listing: first row of a key is current, the rest are
+      // explicitly flagged as older versions.
+      if (entry.key.name != prev_obj) {
+        // current version
+        entry.flags |= rgw_bucket_dir_entry::FLAG_CURRENT;
+      } else {
+        entry.flags &= ~(rgw_bucket_dir_entry::FLAG_CURRENT);
+        entry.flags |= rgw_bucket_dir_entry::FLAG_VER;
+      }
+    }
+
+    prev_obj = entry.key.name;
+
+    // We fetched max+1 rows; the extra one becomes next_marker.
+    if (count >= max) {
+      *is_truncated = true;
+      next_marker.name = entry.key.name;
+      next_marker.instance = entry.key.instance;
+      break;
+    }
+
+    if (!params.delim.empty()) {
+      const std::string& objname = entry.key.name;
+      // find() returns npos when absent; the int conversion makes that
+      // negative, so delim_pos >= 0 means "delimiter found after prefix".
+      const int delim_pos = objname.find(params.delim, params.prefix.size());
+      if (delim_pos >= 0) {
+        /* extract key -with trailing delimiter- for CommonPrefix */
+        const std::string& prefix_key =
+          objname.substr(0, delim_pos + params.delim.length());
+
+        // Each distinct common prefix counts once toward 'max'.
+        if (common_prefixes &&
+            common_prefixes->find(prefix_key) == common_prefixes->end()) {
+          next_marker = prefix_key;
+          (*common_prefixes)[prefix_key] = true;
+          count++;
+        }
+        continue;
+      }
+    }
+
+    if (!params.end_marker.name.empty() &&
+        params.end_marker.name.compare(entry.key.name) <= 0) {
+      // should not include end_marker
+      *is_truncated = false;
+      break;
+    }
+    count++;
+    result->push_back(std::move(entry));
+  }
+out:
+  return ret;
+}
+
+// Populate a DBOpParams struct from this raw (head/tail chunk) object's
+// identity fields. Returns 0 on success, -1 when params is null.
+int DB::raw_obj::InitializeParamsfromRawObj(const DoutPrefixProvider *dpp,
+                                            DBOpParams* params) {
+  if (!params)
+    return -1;
+
+  // Object identity: bucket, key (name/instance/ns) and internal obj_id.
+  params->op.bucket.info.bucket.name = bucket_name;
+  params->op.obj.state.obj.key.name = obj_name;
+  params->op.obj.state.obj.key.instance = obj_instance;
+  params->op.obj.state.obj.key.ns = obj_ns;
+  params->op.obj.obj_id = obj_id;
+
+  // "0.0" is the sentinel part string used for non-multipart objects.
+  params->op.obj.is_multipart = (multipart_part_str != "0.0");
+
+  params->op.obj_data.multipart_part_str = multipart_part_str;
+  params->op.obj_data.part_num = part_num;
+
+  return 0;
+}
+
+// Copy this Object's identity (bucket name, key, obj_id) into params.
+// Returns 0 on success, -1 when params is null.
+int DB::Object::InitializeParamsfromObject(const DoutPrefixProvider *dpp,
+                                           DBOpParams* params) {
+  if (!params)
+    return -1;
+
+  params->op.bucket.info.bucket.name = bucket_info.bucket.name;
+  params->op.obj.state.obj = obj;
+  params->op.obj.obj_id = obj_id;
+
+  return 0;
+}
+
+// Fetch this object's row into params; initializes params first if the
+// caller passed a fresh (empty-key) struct. Returns -ENOENT when the row
+// is missing.
+int DB::Object::get_object_impl(const DoutPrefixProvider *dpp, DBOpParams& params) {
+  if (params.op.obj.state.obj.key.name.empty()) {
+    /* Initialize */
+    store->InitializeParams(dpp, &params);
+    InitializeParamsfromObject(dpp, &params);
+  }
+
+  int ret = store->ProcessOp(dpp, "GetObject", &params);
+
+  /* pick one field check if object exists */
+  if (!ret && !params.op.obj.state.exists) {
+    ldpp_dout(dpp, 0)<<"Object(bucket:" << bucket_info.bucket.name << ", Object:"<< obj.key.name << ") doesn't exist" << dendl;
+    ret = -ENOENT;
+  }
+
+  return ret;
+}
+
+// Set (insert or overwrite) a single omap key/value on this object and
+// persist the change, bumping the object mtime.
+int DB::Object::obj_omap_set_val_by_key(const DoutPrefixProvider *dpp,
+                                        const std::string& key, bufferlist& val,
+                                        bool must_exist) {
+  DBOpParams params = {};
+
+  // Load the current row (fails with -ENOENT if the object is absent).
+  int ret = get_object_impl(dpp, params);
+  if (ret) {
+    ldpp_dout(dpp, 0) <<"get_object_impl failed err:(" <<ret<<")" << dendl;
+    return ret;
+  }
+
+  // Merge the new entry into the fetched omap and write it back.
+  params.op.obj.omap[key] = val;
+  params.op.query_str = "omap";
+  params.op.obj.state.mtime = real_clock::now();
+
+  ret = store->ProcessOp(dpp, "UpdateObject", &params);
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"In UpdateObject failed err:(" <<ret<<") " << dendl;
+  }
+
+  return ret;
+}
+
+/**
+ * Fetch the omap values for the given keys into *vals.
+ * A key that is absent from the object's omap yields an empty bufferlist
+ * (matching the previous operator[] behaviour). Returns -1 when vals is
+ * null, otherwise the result of the object fetch.
+ */
+int DB::Object::obj_omap_get_vals_by_keys(const DoutPrefixProvider *dpp,
+                                          const std::string& oid,
+                                          const std::set<std::string>& keys,
+                                          std::map<std::string, bufferlist>* vals)
+{
+  int ret = 0;
+  DBOpParams params = {};
+
+  if (!vals)
+    return -1;
+
+  ret = get_object_impl(dpp, params);
+
+  if (ret) {
+    ldpp_dout(dpp, 0) <<"get_object_impl failed err:(" <<ret<<")" << dendl;
+    return ret;
+  }
+
+  /* Look the keys up in place instead of copying the whole omap and
+   * default-inserting into the copy via operator[]. */
+  const auto& omap = params.op.obj.omap;
+  for (const auto& k : keys) {
+    auto it = omap.find(k);
+    (*vals)[k] = (it != omap.end()) ? it->second : bufferlist();
+  }
+
+  return ret;
+}
+
+// Append one multipart upload part record to this object's part list and
+// persist it, bumping the object mtime.
+int DB::Object::add_mp_part(const DoutPrefixProvider *dpp,
+                            RGWUploadPartInfo info) {
+  DBOpParams params = {};
+
+  // Load the current row (fails with -ENOENT if the object is absent).
+  int ret = get_object_impl(dpp, params);
+  if (ret) {
+    ldpp_dout(dpp, 0) <<"get_object_impl failed err:(" <<ret<<")" << dendl;
+    return ret;
+  }
+
+  // Append the part and write the updated list back.
+  params.op.obj.mp_parts.push_back(info);
+  params.op.query_str = "mp";
+  params.op.obj.state.mtime = real_clock::now();
+
+  ret = store->ProcessOp(dpp, "UpdateObject", &params);
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"In UpdateObject failed err:(" <<ret<<") " << dendl;
+  }
+
+  return ret;
+}
+
+/**
+ * Return this object's multipart part list in 'info'.
+ * The list is stored inline with the object row, so a single fetch
+ * suffices. (Removed an unused local omap map.)
+ */
+int DB::Object::get_mp_parts_list(const DoutPrefixProvider *dpp,
+                                  std::list<RGWUploadPartInfo>& info)
+{
+  int ret = 0;
+  DBOpParams params = {};
+
+  ret = get_object_impl(dpp, params);
+
+  if (ret) {
+    ldpp_dout(dpp, 0) <<"get_object_impl failed err:(" <<ret<<")" << dendl;
+    return ret;
+  }
+
+  info = params.op.obj.mp_parts;
+
+  return ret;
+}
+
+/* Taken from rgw_rados.cc */
+// Generate a random instance (version) id and set it on target_key.
+void DB::gen_rand_obj_instance_name(rgw_obj_key *target_key)
+{
+#define OBJ_INSTANCE_LEN 32
+  // +1 leaves room for the trailing NUL terminator.
+  char buf[OBJ_INSTANCE_LEN + 1];
+
+  gen_rand_alphanumeric_no_underscore(cct, buf, OBJ_INSTANCE_LEN); /* don't want it to get url escaped,
+                                                                      no underscore for instance name due to the way we encode the raw keys */
+
+  target_key->set_instance(buf);
+}
+
+/**
+ * Copy this object's entire omap into *m.
+ * Returns -1 when m is null, -ENOENT when the object is absent, 0 on
+ * success. (Removed an unused local omap map.)
+ */
+int DB::Object::obj_omap_get_all(const DoutPrefixProvider *dpp,
+                                 std::map<std::string, bufferlist> *m)
+{
+  int ret = 0;
+  DBOpParams params = {};
+
+  if (!m)
+    return -1;
+
+  ret = get_object_impl(dpp, params);
+
+  if (ret) {
+    ldpp_dout(dpp, 0) <<"get_object_impl failed err:(" <<ret<<")" << dendl;
+    return ret;
+  }
+
+  /* The omap is fetched in full along with the object row. */
+  (*m) = params.op.obj.omap;
+
+  return ret;
+}
+
+/**
+ * Return up to max_count omap entries with keys >= marker (the marker key
+ * itself is included, as before) in *m. *pmore reports whether entries
+ * were left out.
+ */
+int DB::Object::obj_omap_get_vals(const DoutPrefixProvider *dpp,
+                                  const std::string& marker,
+                                  uint64_t max_count,
+                                  std::map<std::string, bufferlist> *m, bool* pmore)
+{
+  int ret = 0;
+  DBOpParams params = {};
+  uint64_t count = 0;
+
+  if (!m)
+    return -1;
+
+  ret = get_object_impl(dpp, params);
+
+  if (ret) {
+    ldpp_dout(dpp, 0) <<"get_object_impl failed err:(" <<ret<<")" << dendl;
+    return ret;
+  }
+
+  /* Fix: *pmore was previously only ever set to true; initialize it so
+   * callers always see a defined value. */
+  if (pmore) {
+    *pmore = false;
+  }
+
+  /* Iterate the fetched omap in place (no copy); std::map keeps keys
+   * ordered, so entries below the marker come first. */
+  for (const auto& [key, val] : params.op.obj.omap) {
+    if (key < marker)
+      continue;
+
+    if ((++count) > max_count) {
+      if (pmore) {
+        *pmore = true;
+      }
+      break;
+    }
+
+    (*m)[key] = val;
+  }
+
+  return ret;
+}
+
+/**
+ * Replace/remove attributes on this object: entries in 'rmattrs' are
+ * erased, then entries in 'setattrs' are applied on top of the current
+ * attr set, and the result is persisted.
+ */
+int DB::Object::set_attrs(const DoutPrefixProvider *dpp,
+                          map<string, bufferlist>& setattrs,
+                          map<string, bufferlist>* rmattrs)
+{
+  int ret = 0;
+
+  DBOpParams params = {};
+  rgw::sal::Attrs *attrs;
+  map<string, bufferlist>::iterator iter;
+  RGWObjState* state;
+
+  store->InitializeParams(dpp, &params);
+  InitializeParamsfromObject(dpp, &params);
+  ret = get_state(dpp, &state, true);
+
+  /* Fix: bail out on any failure. 'state' is left unset when get_state
+   * fails (e.g. -ENOENT), so the previous '(ret && !state->exists)'
+   * check dereferenced an uninitialized pointer. */
+  if (ret) {
+    ldpp_dout(dpp, 0) <<"get_state failed err:(" <<ret<<")" << dendl;
+    goto out;
+  }
+
+  /* For now lets keep it simple..rmattrs & setattrs ..
+   * XXX: Check rgw_rados::set_attrs
+   */
+  params.op.obj.state = *state;
+  attrs = &params.op.obj.state.attrset;
+  if (rmattrs) {
+    for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
+      (*attrs).erase(iter->first);
+    }
+  }
+  for (iter = setattrs.begin(); iter != setattrs.end(); ++iter) {
+    (*attrs)[iter->first] = iter->second;
+  }
+
+  params.op.query_str = "attrs";
+  /* As per https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingMetadata.html,
+   * the only way for users to modify object metadata is to make a copy of the object and
+   * set the metadata.
+   * Hence do not update mtime for any other attr changes */
+
+  ret = store->ProcessOp(dpp, "UpdateObject", &params);
+
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"In UpdateObject failed err:(" <<ret<<") " << dendl;
+    goto out;
+  }
+
+out:
+  return ret;
+}
+
+// Transition this object to a new placement rule (storage class).
+// In dbstore the head and tail share one storage class, so the same rule
+// is applied to both placements; the RGW_ATTR_STORAGE_CLASS xattr is set
+// when the rule names a storage class.
+int DB::Object::transition(const DoutPrefixProvider *dpp,
+                           const rgw_placement_rule& rule,
+                           const real_time& mtime,
+                           uint64_t olh_epoch)
+{
+  DBOpParams params = {};
+
+  store->InitializeParams(dpp, &params);
+  InitializeParamsfromObject(dpp, &params);
+
+  int ret = store->ProcessOp(dpp, "GetObject", &params);
+  if (ret) {
+    ldpp_dout(dpp, 0) <<"In GetObject failed err:(" <<ret<<")" << dendl;
+    return ret;
+  }
+
+  /* pick one field check if object exists */
+  if (!params.op.obj.state.exists) {
+    ldpp_dout(dpp, 0)<<"Object(bucket:" << bucket_info.bucket.name << ", Object:"<< obj.key.name << ") doesn't exist" << dendl;
+    return -1;
+  }
+
+  params.op.query_str = "meta";
+  params.op.obj.state.mtime = real_clock::now();
+  params.op.obj.storage_class = rule.storage_class;
+  if (!rule.storage_class.empty()) {
+    bufferlist bl;
+    bl.append(rule.storage_class);
+    params.op.obj.state.attrset[RGW_ATTR_STORAGE_CLASS] = bl;
+  }
+  params.op.obj.versioned_epoch = olh_epoch; // XXX: not sure if needed
+
+  /* Unlike Rados, in dbstore for now, both head and tail objects
+   * refer to same storage class
+   */
+  params.op.obj.head_placement_rule = rule;
+  params.op.obj.tail_placement.placement_rule = rule;
+
+  ret = store->ProcessOp(dpp, "UpdateObject", &params);
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"In UpdateObject failed err:(" <<ret<<") " << dendl;
+  }
+
+  return ret;
+}
+
+/**
+ * Read up to 'len' bytes of this raw object's chunk starting at 'ofs',
+ * appending them to 'bl'. Returns the number of bytes copied, 0 when ofs
+ * lies at/after the end of the stored data, or a negative error code.
+ */
+int DB::raw_obj::read(const DoutPrefixProvider *dpp, int64_t ofs,
+                      uint64_t len, bufferlist& bl)
+{
+  int ret = 0;
+  DBOpParams params = {};
+
+  db->InitializeParams(dpp, &params);
+  InitializeParamsfromRawObj(dpp, &params);
+
+  ret = db->ProcessOp(dpp, "GetObjectData", &params);
+
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"In GetObjectData failed err:(" <<ret<<")" << dendl;
+    return ret;
+  }
+
+  /* Verify if its valid obj */
+  if (!params.op.obj_data.size) {
+    ret = -ENOENT;
+    ldpp_dout(dpp, 0)<<"In GetObjectData failed err:(" <<ret<<")" << dendl;
+    return ret;
+  }
+
+  bufferlist& read_bl = params.op.obj_data.data;
+
+  /* Fix: guard against ofs pointing past the stored chunk; the unsigned
+   * subtraction below would otherwise wrap around and request a huge
+   * copy. */
+  if (ofs < 0 || (uint64_t)ofs >= read_bl.length()) {
+    return 0;
+  }
+
+  unsigned copy_len;
+  copy_len = std::min((uint64_t)read_bl.length() - ofs, len);
+  read_bl.begin(ofs).copy(copy_len, bl);
+  return bl.length();
+}
+
+/**
+ * Write up to 'len' bytes taken from 'bl' (starting at write_ofs) into
+ * this raw object's chunk at object offset 'ofs'. Returns the number of
+ * bytes written or a negative error code.
+ */
+int DB::raw_obj::write(const DoutPrefixProvider *dpp, int64_t ofs, int64_t write_ofs,
+                       uint64_t len, bufferlist& bl)
+{
+  int ret = 0;
+  DBOpParams params = {};
+
+  db->InitializeParams(dpp, &params);
+  InitializeParamsfromRawObj(dpp, &params);
+
+  /* Fix: reject a source offset past the end of 'bl'; the unsigned
+   * subtraction below would otherwise wrap around. */
+  if (write_ofs < 0 || (uint64_t)write_ofs > bl.length()) {
+    return -EINVAL;
+  }
+
+  /* XXX: Check for chunk_size ?? */
+  params.op.obj_data.offset = ofs;
+  unsigned write_len = std::min((uint64_t)bl.length() - write_ofs, len);
+  bl.begin(write_ofs).copy(write_len, params.op.obj_data.data);
+  params.op.obj_data.size = params.op.obj_data.data.length();
+  params.op.obj.state.mtime = real_clock::now();
+
+  ret = db->ProcessOp(dpp, "PutObjectData", &params);
+
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"In PutObjectData failed err:(" <<ret<<")" << dendl;
+    return ret;
+  }
+
+  return write_len;
+}
+
+// List all stored versions of this object (up to MAX_VERSIONED_OBJECTS),
+// filling 'list_entries' on success.
+int DB::Object::list_versioned_objects(const DoutPrefixProvider *dpp,
+                                       std::list<rgw_bucket_dir_entry>& list_entries) {
+  store = get_store();
+  DBOpParams db_params = {};
+
+  store->InitializeParams(dpp, &db_params);
+  InitializeParamsfromObject(dpp, &db_params);
+
+  // Cap how many versions a single query returns.
+  db_params.op.list_max_count = MAX_VERSIONED_OBJECTS;
+
+  int ret = store->ProcessOp(dpp, "ListVersionedObjects", &db_params);
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"In ListVersionedObjects failed err:(" <<ret<<") " << dendl;
+    return ret;
+  }
+
+  list_entries = db_params.op.obj.list_entries;
+  return ret;
+}
+
+// Resolve the current state of an object. With an explicit version-id the
+// row is fetched directly; otherwise the version list is consulted and the
+// most recent (front) entry is read, failing with -ENOENT if it is a
+// delete marker or no versions exist. On success *state points at this
+// Object's cached obj_state, filled from the fetched row.
+int DB::Object::get_obj_state(const DoutPrefixProvider *dpp,
+                              const RGWBucketInfo& bucket_info, const rgw_obj& obj,
+                              bool follow_olh, RGWObjState** state)
+{
+  int ret = 0;
+
+  DBOpParams params = {};
+  RGWObjState* s;
+
+  if (!obj.key.instance.empty()) {
+    /* Versionid provided. Fetch the object */
+    ret = get_object_impl(dpp, params);
+
+    // -ENOENT is tolerated: the (non-existent) state is still copied out below.
+    if (ret && ret != -ENOENT) {
+      ldpp_dout(dpp, 0) <<"get_object_impl failed err:(" <<ret<<")" << dendl;
+      goto out;
+    }
+  } else {
+    /* Instance is empty. May or may not be versioned object.
+     * List all the versions and read the most recent entry */
+    // NOTE(review): the return code of list_versioned_objects is only
+    // checked indirectly via list_entries being empty — confirm a listing
+    // failure cannot leave stale entries here.
+    ret = list_versioned_objects(dpp, params.op.obj.list_entries);
+
+    if (params.op.obj.list_entries.size() != 0) {
+      /* Ensure its not a delete marker */
+      auto& ent = params.op.obj.list_entries.front();
+      if (ent.flags & rgw_bucket_dir_entry::FLAG_DELETE_MARKER) {
+        ret = -ENOENT;
+        goto out;
+      }
+      // Re-initialize params, then target the key of the newest version.
+      store->InitializeParams(dpp, &params);
+      InitializeParamsfromObject(dpp, &params);
+      params.op.obj.state.obj.key = ent.key;
+
+      ret = get_object_impl(dpp, params);
+
+      if (ret) {
+        ldpp_dout(dpp, 0) <<"get_object_impl of versioned object failed err:(" <<ret<<")" << dendl;
+        goto out;
+      }
+    } else {
+      ret = -ENOENT;
+      return ret;
+    }
+  }
+
+  s = &params.op.obj.state;
+  /* XXX: For now use state->shadow_obj to store ObjectID string */
+  s->shadow_obj = params.op.obj.obj_id;
+
+  // Copy the fetched state into the member cache and hand that out.
+  *state = &obj_state;
+  **state = *s;
+
+out:
+  return ret;
+
+}
+
+// Convenience wrapper: fetch this object's state using the Object's own
+// bucket_info and key. See get_obj_state() for semantics.
+int DB::Object::get_state(const DoutPrefixProvider *dpp, RGWObjState** pstate, bool follow_olh)
+{
+  return get_obj_state(dpp, bucket_info, obj, follow_olh, pstate);
+}
+
+// Fetch a single xattr ('name') from the source object into 'dest'.
+// Returns -ENOENT if the object is absent, -ENODATA if the attr is missing.
+int DB::Object::Read::get_attr(const DoutPrefixProvider *dpp, const char *name, bufferlist& dest)
+{
+  RGWObjState* astate;
+  int ret = source->get_state(dpp, &astate, true);
+  if (ret < 0) {
+    return ret;
+  }
+  if (!astate->exists) {
+    return -ENOENT;
+  }
+  return astate->get_attr(name, dest) ? 0 : -ENODATA;
+}
+
+// Prepare a read: load object state, export requested metadata (target
+// obj, attrs, size, mtime) through params, and evaluate the conditional
+// request headers (If-Match / If-None-Match) against the stored ETag.
+int DB::Object::Read::prepare(const DoutPrefixProvider *dpp)
+{
+  DB *store = source->get_store();
+  CephContext *cct = store->ctx();
+
+  bufferlist etag;
+
+  map<string, bufferlist>::iterator iter;
+
+  RGWObjState* astate;
+
+  int r = source->get_state(dpp, &astate, true);
+  if (r < 0)
+    return r;
+
+  if (!astate->exists) {
+    return -ENOENT;
+  }
+
+  state.obj = astate->obj;
+  // shadow_obj carries the internal ObjectID string (see get_obj_state).
+  source->obj_id = astate->shadow_obj;
+
+  if (params.target_obj) {
+    *params.target_obj = state.obj;
+  }
+  if (params.attrs) {
+    *params.attrs = astate->attrset;
+    // Only walk the attrs for logging when debug level 20 is enabled.
+    if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
+      for (iter = params.attrs->begin(); iter != params.attrs->end(); ++iter) {
+        ldpp_dout(dpp, 20) << "Read xattr rgw_rados: " << iter->first << dendl;
+      }
+    }
+  }
+
+  if (conds.if_match || conds.if_nomatch) {
+    r = get_attr(dpp, RGW_ATTR_ETAG, etag);
+    if (r < 0)
+      return r;
+
+    if (conds.if_match) {
+      string if_match_str = rgw_string_unquote(conds.if_match);
+      ldpp_dout(dpp, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-Match: " << if_match_str << dendl;
+      // Compare only the first etag.length() chars of the client value.
+      if (if_match_str.compare(0, etag.length(), etag.c_str(), etag.length()) != 0) {
+        return -ERR_PRECONDITION_FAILED;
+      }
+    }
+
+    if (conds.if_nomatch) {
+      string if_nomatch_str = rgw_string_unquote(conds.if_nomatch);
+      ldpp_dout(dpp, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-NoMatch: " << if_nomatch_str << dendl;
+      if (if_nomatch_str.compare(0, etag.length(), etag.c_str(), etag.length()) == 0) {
+        return -ERR_NOT_MODIFIED;
+      }
+    }
+  }
+
+  // Export size/mtime only when the caller asked for them.
+  if (params.obj_size)
+    *params.obj_size = astate->size;
+  if (params.lastmod)
+    *params.lastmod = astate->mtime;
+
+  return 0;
+}
+
+// Normalize an HTTP-style byte range against the object size.
+// A negative ofs means "the last |ofs| bytes"; a negative end means
+// "through the last byte". Returns -ERANGE when ofs is beyond the object.
+int DB::Object::Read::range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end)
+{
+  if (ofs < 0) {
+    // Suffix range: count back from the end, clamping at zero.
+    ofs += obj_size;
+    if (ofs < 0) {
+      ofs = 0;
+    }
+    end = obj_size - 1;
+  } else if (end < 0) {
+    // Open-ended range: read through the last byte.
+    end = obj_size - 1;
+  }
+
+  if (obj_size > 0) {
+    if (ofs >= static_cast<int64_t>(obj_size)) {
+      return -ERANGE;
+    }
+    if (end >= static_cast<int64_t>(obj_size)) {
+      end = obj_size - 1;
+    }
+  }
+  return 0;
+}
+
+// Read one chunk (at most max_chunk_size bytes) of the object starting at
+// 'ofs'. Short reads are satisfied from the inline head data; anything
+// beyond that is fetched from the tail-object row whose part number is
+// derived from the offset. Returns the number of bytes placed in 'bl'.
+int DB::Object::Read::read(int64_t ofs, int64_t end, bufferlist& bl, const DoutPrefixProvider *dpp)
+{
+  DB *store = source->get_store();
+
+  uint64_t read_ofs = ofs;
+  uint64_t len, read_len;
+
+  bufferlist read_bl;
+  uint64_t max_chunk_size = store->get_max_chunk_size();
+
+  RGWObjState* astate;
+  int r = source->get_state(dpp, &astate, true);
+  if (r < 0)
+    return r;
+
+  if (!astate->exists) {
+    return -ENOENT;
+  }
+
+  // Clamp 'end' to the last byte of the object.
+  if (astate->size == 0) {
+    end = 0;
+  } else if (end >= (int64_t)astate->size) {
+    end = astate->size - 1;
+  }
+
+  if (end < 0)
+    len = 0;
+  else
+    len = end - ofs + 1;
+
+
+  // A single call never returns more than one chunk.
+  if (len > max_chunk_size) {
+    len = max_chunk_size;
+  }
+
+  int head_data_size = astate->data.length();
+  bool reading_from_head = (ofs < head_data_size);
+
+  if (reading_from_head) {
+    if (astate) { // && astate->prefetch_data)?
+      // Whole request satisfiable from the head data already in memory.
+      if (!ofs && astate->data.length() >= len) {
+        bl = astate->data;
+        return bl.length();
+      }
+
+      // Partial head read starting at a non-zero offset.
+      if (ofs < astate->data.length()) {
+        unsigned copy_len = std::min((uint64_t)head_data_size - ofs, len);
+        astate->data.begin(ofs).copy(copy_len, bl);
+        return bl.length();
+      }
+    }
+  }
+
+  /* tail object */
+  // Tail data is stored in fixed-size parts; map the offset to a part.
+  int part_num = (ofs / max_chunk_size);
+  /* XXX: Handle multipart_str */
+  raw_obj read_obj(store, source->get_bucket_info().bucket.name, astate->obj.key.name,
+                   astate->obj.key.instance, astate->obj.key.ns, source->obj_id, "0.0", part_num);
+
+  read_len = len;
+
+  ldpp_dout(dpp, 20) << "dbstore->read obj-ofs=" << ofs << " read_ofs=" << read_ofs << " read_len=" << read_len << dendl;
+
+  // read from non head object
+  r = read_obj.read(dpp, read_ofs, read_len, bl);
+
+  if (r < 0) {
+    return r;
+  }
+
+  return bl.length();
+}
+
+// Trampoline passed to iterate_obj(): unpacks the db_get_obj_data cookie
+// and forwards to the store's member callback.
+static int _get_obj_iterate_cb(const DoutPrefixProvider *dpp,
+                               const DB::raw_obj& read_obj, off_t obj_ofs,
+                               off_t len, bool is_head_obj,
+                               RGWObjState* astate, void *arg)
+{
+  auto* d = static_cast<struct db_get_obj_data*>(arg);
+  return d->store->get_obj_iterate_cb(dpp, read_obj, obj_ofs, len,
+                                      is_head_obj, astate, arg);
+}
+
+// Iteration callback: obtain the data for one object chunk (head data is
+// taken from astate, tail data is read from the raw object row) and feed
+// it to the client callback in pieces of at most 'len' bytes. Returns the
+// total number of bytes delivered, or a negative error.
+int DB::get_obj_iterate_cb(const DoutPrefixProvider *dpp,
+                           const raw_obj& read_obj, off_t obj_ofs,
+                           off_t len, bool is_head_obj,
+                           RGWObjState* astate, void *arg)
+{
+  struct db_get_obj_data* d = static_cast<struct db_get_obj_data*>(arg);
+  bufferlist bl;
+  int r = 0;
+
+  if (is_head_obj) {
+    // Head chunk: data is already inline in the object state.
+    bl = astate->data;
+  } else {
+    // read from non head object
+    raw_obj robj = read_obj;
+    /* read entire data. So pass offset as '0' & len as '-1' */
+    r = robj.read(dpp, 0, -1, bl);
+
+    if (r <= 0) {
+      return r;
+    }
+  }
+
+  unsigned read_ofs = 0, read_len = 0;
+  // NOTE(review): if 'len' is 0 while bl is non-empty, chunk_len stays 0
+  // and this loop would not advance — confirm callers always pass len > 0.
+  while (read_ofs < bl.length()) {
+    unsigned chunk_len = std::min((uint64_t)bl.length() - read_ofs, (uint64_t)len);
+    r = d->client_cb->handle_data(bl, read_ofs, chunk_len);
+    if (r < 0)
+      return r;
+    read_ofs += chunk_len;
+    read_len += chunk_len;
+    ldpp_dout(dpp, 20) << "dbstore->get_obj_iterate_cb obj-ofs=" << obj_ofs << " len=" << len << " chunk_len = " << chunk_len << " read_len = " << read_len << dendl;
+  }
+
+
+  // Track overall progress for the caller.
+  d->offset += read_len;
+
+  return read_len;
+}
+
+// Stream the byte range [ofs, end] of the object to 'cb', one chunk of at
+// most max_chunk_size at a time. Returns 0 on success.
+int DB::Object::Read::iterate(const DoutPrefixProvider *dpp, int64_t ofs, int64_t end, RGWGetDataCB *cb)
+{
+  DB *store = source->get_store();
+
+  // Cookie handed through iterate_obj to the per-chunk callback.
+  db_get_obj_data data(store, cb, ofs);
+
+  int ret = source->iterate_obj(dpp, source->get_bucket_info(), state.obj,
+                                ofs, end, store->get_max_chunk_size(),
+                                _get_obj_iterate_cb, &data);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "iterate_obj() failed with " << ret << dendl;
+    return ret;
+  }
+
+  return 0;
+}
+
+// Walk the object's data in chunks of max_chunk_size, invoking 'cb' once
+// per chunk with a raw_obj describing where that chunk lives (head vs.
+// tail part). The callback's return value is the number of bytes it
+// consumed; iteration advances by that amount.
+int DB::Object::iterate_obj(const DoutPrefixProvider *dpp,
+                            const RGWBucketInfo& bucket_info, const rgw_obj& obj,
+                            off_t ofs, off_t end, uint64_t max_chunk_size,
+                            iterate_obj_cb cb, void *arg)
+{
+  DB *store = get_store();
+  uint64_t len;
+  RGWObjState* astate;
+
+  int r = get_state(dpp, &astate, true);
+  if (r < 0) {
+    return r;
+  }
+
+  if (!astate->exists) {
+    return -ENOENT;
+  }
+
+  if (end < 0)
+    len = 0;
+  else
+    len = end - ofs + 1;
+
+  /* XXX: Will it really help to store all parts info in astate like manifest in Rados? */
+  int part_num = 0;
+  int head_data_size = astate->data.length();
+
+  // Stop at whichever comes first: the requested end or the object size.
+  while (ofs <= end && (uint64_t)ofs < astate->size) {
+    // Part number is derived from the absolute offset.
+    part_num = (ofs / max_chunk_size);
+    uint64_t read_len = std::min(len, max_chunk_size);
+
+    /* XXX: Handle multipart_str */
+    raw_obj read_obj(store, get_bucket_info().bucket.name, astate->obj.key.name,
+                     astate->obj.key.instance, astate->obj.key.ns, obj_id, "0.0", part_num);
+    bool reading_from_head = (ofs < head_data_size);
+
+    r = cb(dpp, read_obj, ofs, read_len, reading_from_head, astate, arg);
+    // r == 0 also ends iteration (callback consumed nothing).
+    if (r <= 0) {
+      return r;
+    }
+    /* r refers to chunk_len (no. of bytes) handled in cb */
+    len -= r;
+    ofs += r;
+  }
+
+  return 0;
+}
+
+// Prepare a write: snapshot the target key into obj_state and make sure
+// the target has an internal obj_id (the version instance when one is
+// set, otherwise a freshly generated random id).
+int DB::Object::Write::prepare(const DoutPrefixProvider* dpp)
+{
+  DB *store = target->get_store();
+
+  /* XXX: handle assume_noent */
+
+  obj_state.obj = target->obj;
+
+  if (target->obj_id.empty()) {
+    if (!target->obj.key.instance.empty() && (target->obj.key.instance != "null")) {
+      /* versioned object. Set obj_id same as versionID/instance */
+      target->obj_id = target->obj.key.instance;
+    } else {
+      // generate obj_id
+      char buf[33];
+      gen_rand_alphanumeric(store->ctx(), buf, sizeof(buf) - 1);
+      target->obj_id = buf;
+    }
+  }
+
+  return 0;
+}
+
+/* writes tail objects */
+int DB::Object::Write::write_data(const DoutPrefixProvider* dpp,
+ bufferlist& data, uint64_t ofs) {
+ DB *store = target->get_store();
+ /* tail objects */
+ /* XXX: Split into parts each of max_chunk_size. But later make tail
+ * object chunk size limit to sqlite blob limit */
+ int part_num = 0;
+
+ uint64_t max_chunk_size = store->get_max_chunk_size();
+
+ /* tail_obj ofs should be greater than max_head_size */
+ if (mp_part_str == "0.0") { // ensure not multipart meta object
+ if (ofs < store->get_max_head_size()) {
+ return -1;
+ }
+ }
+
+ uint64_t end = data.length();
+ uint64_t write_ofs = 0;
+ /* as we are writing max_chunk_size at a time in sal_dbstore DBAtomicWriter::process(),
+ * maybe this while loop is not needed
+ */
+ while (write_ofs < end) {
+ part_num = (ofs / max_chunk_size);
+ uint64_t len = std::min(end, max_chunk_size);
+
+ /* XXX: Handle multipart_str */
+ raw_obj write_obj(store, target->get_bucket_info().bucket.name, obj_state.obj.key.name,
+ obj_state.obj.key.instance, obj_state.obj.key.ns, target->obj_id, mp_part_str, part_num);
+
+
+ ldpp_dout(dpp, 20) << "dbstore->write obj-ofs=" << ofs << " write_len=" << len << dendl;
+
+ // write into non head object
+ int r = write_obj.write(dpp, ofs, write_ofs, len, data);
+ if (r < 0) {
+ return r;
+ }
+ /* r refers to chunk_len (no. of bytes) handled in raw_obj::write */
+ len -= r;
+ ofs += r;
+ write_ofs += r;
+ }
+
+ return 0;
+}
+
+/* Write metadata & head object data */
+// Build the object row (attrs, etag, sizes, retention, versioning flags,
+// optional inline head data) and persist it via "PutObject".
+int DB::Object::Write::_do_write_meta(const DoutPrefixProvider *dpp,
+                                      uint64_t size, uint64_t accounted_size,
+                                      map<string, bufferlist>& attrs,
+                                      bool assume_noent, bool modify_tail)
+{
+  DB *store = target->get_store();
+
+  RGWObjState* state = &obj_state;
+  map<string, bufferlist> *attrset;
+  DBOpParams params = {};
+  int ret = 0;
+  string etag;
+  string content_type;
+  bufferlist acl_bl;
+  string storage_class;
+
+  map<string, bufferlist>::iterator iter;
+
+  store->InitializeParams(dpp, &params);
+  target->InitializeParamsfromObject(dpp, &params);
+
+  // Start from the freshly initialized (empty) state in params.
+  obj_state = params.op.obj.state;
+
+  if (real_clock::is_zero(meta.set_mtime)) {
+    meta.set_mtime = real_clock::now();
+  }
+
+  attrset = &state->attrset;
+  // Bucket-level object lock: apply the default retention rule unless the
+  // request already carries an explicit retention attr.
+  if (target->bucket_info.obj_lock_enabled() && target->bucket_info.obj_lock.has_rule()) {
+    // && meta.flags == PUT_OBJ_CREATE) {
+    auto iter = attrs.find(RGW_ATTR_OBJECT_RETENTION);
+    if (iter == attrs.end()) {
+      real_time lock_until_date = target->bucket_info.obj_lock.get_lock_until_date(meta.set_mtime);
+      string mode = target->bucket_info.obj_lock.get_mode();
+      RGWObjectRetention obj_retention(mode, lock_until_date);
+      bufferlist bl;
+      obj_retention.encode(bl);
+      (*attrset)[RGW_ATTR_OBJECT_RETENTION] = bl;
+    }
+  }
+
+  state->mtime = meta.set_mtime;
+
+  if (meta.data) {
+    /* if we want to overwrite the data, we also want to overwrite the
+       xattrs, so just remove the object */
+    params.op.obj.head_data = *meta.data;
+  }
+
+  if (meta.rmattrs) {
+    // Drop explicitly removed attrs first.
+    for (iter = meta.rmattrs->begin(); iter != meta.rmattrs->end(); ++iter) {
+      const string& name = iter->first;
+      (*attrset).erase(name.c_str());
+    }
+  }
+
+  if (meta.manifest) {
+    storage_class = meta.manifest->get_tail_placement().placement_rule.storage_class;
+
+    /* remove existing manifest attr */
+    iter = attrs.find(RGW_ATTR_MANIFEST);
+    if (iter != attrs.end())
+      attrs.erase(iter);
+
+    bufferlist bl;
+    encode(*meta.manifest, bl);
+    (*attrset)[RGW_ATTR_MANIFEST] = bl;
+  }
+
+  // Merge the caller's attrs, capturing the ones the row stores directly.
+  for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
+    const string& name = iter->first;
+    bufferlist& bl = iter->second;
+
+    if (!bl.length())
+      continue;
+
+    (*attrset)[name.c_str()] = bl;
+
+    if (name.compare(RGW_ATTR_ETAG) == 0) {
+      etag = rgw_bl_str(bl);
+      params.op.obj.etag = etag;
+    } else if (name.compare(RGW_ATTR_CONTENT_TYPE) == 0) {
+      content_type = rgw_bl_str(bl);
+    } else if (name.compare(RGW_ATTR_ACL) == 0) {
+      acl_bl = bl;
+    }
+  }
+
+  if (!storage_class.empty()) {
+    bufferlist bl;
+    bl.append(storage_class);
+    (*attrset)[RGW_ATTR_STORAGE_CLASS] = bl;
+  }
+
+  params.op.obj.state = *state ;
+  params.op.obj.state.exists = true;
+  params.op.obj.state.size = size;
+  params.op.obj.state.accounted_size = accounted_size;
+  params.op.obj.owner = target->get_bucket_info().owner.id;
+  params.op.obj.category = meta.category;
+
+  if (meta.mtime) {
+    *meta.mtime = meta.set_mtime;
+  }
+
+  params.op.query_str = "meta";
+  params.op.obj.obj_id = target->obj_id;
+
+  /* Check if versioned */
+  bool is_versioned = !target->obj.key.instance.empty() && (target->obj.key.instance != "null");
+  params.op.obj.is_versioned = is_versioned;
+
+  if (is_versioned && (params.op.obj.category == RGWObjCategory::Main)) {
+    /* versioned object */
+    params.op.obj.flags |= rgw_bucket_dir_entry::FLAG_VER;
+  }
+  ret = store->ProcessOp(dpp, "PutObject", &params);
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"In PutObject failed err:(" <<ret<<")" << dendl;
+    goto out;
+  }
+
+
+out:
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: do_write_meta returned ret=" << ret << dendl;
+  }
+
+  // NOTE(review): 'canceled' is set unconditionally here, even when the
+  // write succeeded — confirm this is intended (callers may treat a
+  // successful write as canceled).
+  meta.canceled = true;
+
+  return ret;
+}
+
+// Public entry point for writing object metadata/head data; currently
+// always runs with assume_noent disabled.
+int DB::Object::Write::write_meta(const DoutPrefixProvider *dpp, uint64_t size, uint64_t accounted_size,
+                                  map<string, bufferlist>& attrs)
+{
+  /* handle assume_noent */
+  return _do_write_meta(dpp, size, accounted_size, attrs,
+                        /*assume_noent=*/false, meta.modify_tail);
+}
+
+// Delete an object, honoring the bucket's versioning state:
+//  - explicit version-id: delete exactly that version (or -ENOENT);
+//  - no version-id, versioning enabled: leave data, add a delete marker;
+//  - no version-id, versioning suspended: delete the "null" version and
+//    add a delete marker with version-id null;
+//  - unversioned bucket: plain delete.
+// When the head row is already gone, the version list decides whether a
+// delete marker must still be created.
+int DB::Object::Delete::delete_obj(const DoutPrefixProvider *dpp) {
+  int ret = 0;
+  DBOpParams del_params = {};
+  bool versioning_enabled = ((params.versioning_status & BUCKET_VERSIONED) == BUCKET_VERSIONED);
+  bool versioning_suspended = ((params.versioning_status & BUCKET_VERSIONS_SUSPENDED) == BUCKET_VERSIONS_SUSPENDED);
+  bool regular_obj = true;
+  std::string versionid = target->obj.key.instance;
+
+  ret = target->get_object_impl(dpp, del_params);
+
+  // -ENOENT is handled below via the version list; other errors abort.
+  if (ret < 0 && ret != -ENOENT) {
+    ldpp_dout(dpp, 0)<<"GetObject during delete failed err:(" <<ret<<")" << dendl;
+    return ret;
+  }
+
+  // Delete-marker handling only applies to regular (Main) objects.
+  regular_obj = (del_params.op.obj.category == RGWObjCategory::Main);
+  if (!ret) {
+    if (!versionid.empty()) {
+      // version-id is provided
+      ret = delete_obj_impl(dpp, del_params);
+      return ret;
+    } else { // version-id is empty..
+      /*
+       * case: bucket_versioned
+       *    create_delete_marker;
+       * case: bucket_suspended
+       *    delete entry
+       *    create delete marker with version-id null;
+       * default:
+       *    just delete the entry
+       */
+      if (versioning_suspended && regular_obj) {
+        ret = delete_obj_impl(dpp, del_params);
+        ret = create_dm(dpp, del_params);
+      } else if (versioning_enabled && regular_obj) {
+        ret = create_dm(dpp, del_params);
+      } else {
+        ret = delete_obj_impl(dpp, del_params);
+      }
+    }
+  } else { // ret == -ENOENT
+    /* case: VersionID given
+     *     return -ENOENT
+     * else: // may or may not be versioned object
+     *     Listversionedobjects
+     *     if (list_entries.empty()) {
+     *          nothing to do..return ENOENT
+     *     } else {
+     *          read top entry
+     *          if (top.flags | FLAG_DELETE_MARKER) {
+     *              // nothing to do
+     *              return -ENOENT;
+     *          }
+     *          if (bucket_versioned) {
+     *              // create delete marker with new version-id
+     *          } else if (bucket_suspended) {
+     *              // create delete marker with version-id null
+     *          }
+     *          bucket cannot be in unversioned state post having versions
+     *     }
+     */
+    if (!versionid.empty()) {
+      return -ENOENT;
+    }
+    ret = target->list_versioned_objects(dpp, del_params.op.obj.list_entries);
+    if (ret) {
+      ldpp_dout(dpp, 0)<<"ListVersionedObjects failed err:(" <<ret<<")" << dendl;
+      return ret;
+    }
+    if (del_params.op.obj.list_entries.empty()) {
+      return -ENOENT;
+    }
+    auto &ent = del_params.op.obj.list_entries.front();
+    if (ent.flags & rgw_bucket_dir_entry::FLAG_DELETE_MARKER) {
+      // for now do not create another delete marker..just exit
+      return 0;
+    }
+    // Newest version is live: mask it with a delete marker.
+    ret = create_dm(dpp, del_params);
+  }
+  return ret;
+}
+
/**
 * Remove one object row from the object table, then bump the mtime on its
 * tail (objectdata) rows so the GC grace period restarts for them.
 *
 * @param dpp        prefix provider for log output
 * @param del_params identifies the object/version to remove
 * @return 0 on success, negative errno on failure.
 */
int DB::Object::Delete::delete_obj_impl(const DoutPrefixProvider *dpp,
    DBOpParams& del_params) {
  int ret = 0;
  DB *store = target->get_store();

  ret = store->ProcessOp(dpp, "DeleteObject", &del_params);
  if (ret) {
    ldpp_dout(dpp, 0) << "In DeleteObject failed err:(" <<ret<<")" << dendl;
    return ret;
  }

  /* Now that tail objects are associated with objectID, they are not deleted
   * as part of this DeleteObj operation. Such tail objects (with no head object
   * in *.object.table are cleaned up later by GC thread.
   *
   * To avoid races between writes/reads & GC delete, mtime is maintained for each
   * tail object. This mtime is updated when tail object is written and also when
   * its corresponding head object is deleted (like here in this case).
   */
  DBOpParams update_params = del_params;
  update_params.op.obj.state.mtime = real_clock::now();
  ret = store->ProcessOp(dpp, "UpdateObjectData", &update_params);

  if (ret) {
    // Non-fatal for the caller's delete, but surfaced via ret anyway.
    ldpp_dout(dpp, 0) << "Updating tail objects mtime failed err:(" <<ret<<")" << dendl;
  }
  return ret;
}
+
+/*
+ * a) if no versionID specified,
+ * - create a delete marker with
+ * - new version/instanceID (if bucket versioned)
+ * - null versionID (if versioning suspended)
+ */
+int DB::Object::Delete::create_dm(const DoutPrefixProvider *dpp,
+ DBOpParams& del_params) {
+
+ DB *store = target->get_store();
+ bool versioning_suspended = ((params.versioning_status & BUCKET_VERSIONS_SUSPENDED) == BUCKET_VERSIONS_SUSPENDED);
+ int ret = -1;
+ DBOpParams olh_params = {};
+ std::string version_id;
+ DBOpParams next_params = del_params;
+
+ version_id = del_params.op.obj.state.obj.key.instance;
+
+ DBOpParams dm_params = del_params;
+
+ // create delete marker
+
+ store->InitializeParams(dpp, &dm_params);
+ target->InitializeParamsfromObject(dpp, &dm_params);
+ dm_params.op.obj.category = RGWObjCategory::None;
+
+ if (versioning_suspended) {
+ dm_params.op.obj.state.obj.key.instance = "null";
+ } else {
+ store->gen_rand_obj_instance_name(&dm_params.op.obj.state.obj.key);
+ dm_params.op.obj.obj_id = dm_params.op.obj.state.obj.key.instance;
+ }
+
+ dm_params.op.obj.flags |= (rgw_bucket_dir_entry::FLAG_DELETE_MARKER);
+
+ ret = store->ProcessOp(dpp, "PutObject", &dm_params);
+
+ if (ret) {
+ ldpp_dout(dpp, 0) << "delete_olh: failed to create delete marker - err:(" <<ret<<")" << dendl;
+ return ret;
+ }
+ result.delete_marker = true;
+ result.version_id = dm_params.op.obj.state.obj.key.instance;
+ return ret;
+}
+
+int DB::get_entry(const std::string& oid, const std::string& marker,
+ std::unique_ptr<rgw::sal::Lifecycle::LCEntry>* entry)
+{
+ int ret = 0;
+ const DoutPrefixProvider *dpp = get_def_dpp();
+
+ DBOpParams params = {};
+ InitializeParams(dpp, &params);
+
+ params.op.lc_entry.index = oid;
+ params.op.lc_entry.entry.set_bucket(marker);
+
+ params.op.query_str = "get_entry";
+ ret = ProcessOp(dpp, "GetLCEntry", &params);
+
+ if (ret) {
+ ldpp_dout(dpp, 0)<<"In GetLCEntry failed err:(" <<ret<<") " << dendl;
+ goto out;
+ }
+
+ if (!params.op.lc_entry.entry.get_start_time() == 0) { //ensure entry found
+ rgw::sal::Lifecycle::LCEntry* e;
+ e = new rgw::sal::StoreLifecycle::StoreLCEntry(params.op.lc_entry.entry);
+ if (!e) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ entry->reset(e);
+ }
+
+out:
+ return ret;
+}
+
+int DB::get_next_entry(const std::string& oid, const std::string& marker,
+ std::unique_ptr<rgw::sal::Lifecycle::LCEntry>* entry)
+{
+ int ret = 0;
+ const DoutPrefixProvider *dpp = get_def_dpp();
+
+ DBOpParams params = {};
+ InitializeParams(dpp, &params);
+
+ params.op.lc_entry.index = oid;
+ params.op.lc_entry.entry.set_bucket(marker);
+
+ params.op.query_str = "get_next_entry";
+ ret = ProcessOp(dpp, "GetLCEntry", &params);
+
+ if (ret) {
+ ldpp_dout(dpp, 0)<<"In GetLCEntry failed err:(" <<ret<<") " << dendl;
+ goto out;
+ }
+
+ if (!params.op.lc_entry.entry.get_start_time() == 0) { //ensure entry found
+ rgw::sal::Lifecycle::LCEntry* e;
+ e = new rgw::sal::StoreLifecycle::StoreLCEntry(params.op.lc_entry.entry);
+ if (!e) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ entry->reset(e);
+ }
+
+out:
+ return ret;
+}
+
+int DB::set_entry(const std::string& oid, rgw::sal::Lifecycle::LCEntry& entry)
+{
+ int ret = 0;
+ const DoutPrefixProvider *dpp = get_def_dpp();
+
+ DBOpParams params = {};
+ InitializeParams(dpp, &params);
+
+ params.op.lc_entry.index = oid;
+ params.op.lc_entry.entry = entry;
+
+ ret = ProcessOp(dpp, "InsertLCEntry", &params);
+
+ if (ret) {
+ ldpp_dout(dpp, 0)<<"In InsertLCEntry failed err:(" <<ret<<") " << dendl;
+ goto out;
+ }
+
+out:
+ return ret;
+}
+
+int DB::list_entries(const std::string& oid, const std::string& marker,
+ uint32_t max_entries, std::vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>>& entries)
+{
+ int ret = 0;
+ const DoutPrefixProvider *dpp = get_def_dpp();
+
+ entries.clear();
+
+ DBOpParams params = {};
+ InitializeParams(dpp, &params);
+
+ params.op.lc_entry.index = oid;
+ params.op.lc_entry.min_marker = marker;
+ params.op.list_max_count = max_entries;
+
+ ret = ProcessOp(dpp, "ListLCEntries", &params);
+
+ if (ret) {
+ ldpp_dout(dpp, 0)<<"In ListLCEntries failed err:(" <<ret<<") " << dendl;
+ goto out;
+ }
+
+ for (auto& entry : params.op.lc_entry.list_entries) {
+ entries.push_back(std::make_unique<rgw::sal::StoreLifecycle::StoreLCEntry>(std::move(entry)));
+ }
+
+out:
+ return ret;
+}
+
+int DB::rm_entry(const std::string& oid, rgw::sal::Lifecycle::LCEntry& entry)
+{
+ int ret = 0;
+ const DoutPrefixProvider *dpp = get_def_dpp();
+
+ DBOpParams params = {};
+ InitializeParams(dpp, &params);
+
+ params.op.lc_entry.index = oid;
+ params.op.lc_entry.entry = entry;
+
+ ret = ProcessOp(dpp, "RemoveLCEntry", &params);
+
+ if (ret) {
+ ldpp_dout(dpp, 0)<<"In RemoveLCEntry failed err:(" <<ret<<") " << dendl;
+ goto out;
+ }
+
+out:
+ return ret;
+}
+
+int DB::get_head(const std::string& oid, std::unique_ptr<rgw::sal::Lifecycle::LCHead>* head)
+{
+ int ret = 0;
+ const DoutPrefixProvider *dpp = get_def_dpp();
+
+ DBOpParams params = {};
+ InitializeParams(dpp, &params);
+
+ params.op.lc_head.index = oid;
+
+ ret = ProcessOp(dpp, "GetLCHead", &params);
+
+ if (ret) {
+ ldpp_dout(dpp, 0)<<"In GetLCHead failed err:(" <<ret<<") " << dendl;
+ goto out;
+ }
+
+ *head = std::make_unique<rgw::sal::StoreLifecycle::StoreLCHead>(params.op.lc_head.head);
+
+out:
+ return ret;
+}
+
+int DB::put_head(const std::string& oid, rgw::sal::Lifecycle::LCHead& head)
+{
+ int ret = 0;
+ const DoutPrefixProvider *dpp = get_def_dpp();
+
+ DBOpParams params = {};
+ InitializeParams(dpp, &params);
+
+ params.op.lc_head.index = oid;
+ params.op.lc_head.head = head;
+
+ ret = ProcessOp(dpp, "InsertLCHead", &params);
+
+ if (ret) {
+ ldpp_dout(dpp, 0)<<"In InsertLCHead failed err:(" <<ret<<") " << dendl;
+ goto out;
+ }
+
+out:
+ return ret;
+}
+
/**
 * Garbage-collect stale tail (objectdata) rows of `bucket`: rows whose
 * head object is gone and whose mtime is older than `min_wait` seconds.
 *
 * @param dpp      prefix provider for log output
 * @param bucket   bucket name whose stale tail data is purged
 * @param min_wait grace period (seconds) a tail row must age before removal
 * @return 0 on success, negative errno on failure.
 */
int DB::delete_stale_objs(const DoutPrefixProvider *dpp, const std::string& bucket,
    uint32_t min_wait) {
  DBOpParams params = {};
  int ret = -1;

  params.op.bucket.info.bucket.name = bucket;
  /* Verify if bucket exists.
   * XXX: This is needed for now to create objectmap of bucket
   * in SQLGetBucket
   */
  InitializeParams(dpp, &params);
  ret = ProcessOp(dpp, "GetBucket", &params);
  if (ret) {
    ldpp_dout(dpp, 0) << "In GetBucket failed err:(" <<ret<<")" << dendl;
    return ret;
  }

  ldpp_dout(dpp, 20) << " Deleting stale_objs of bucket( " << bucket <<")" << dendl;
  /* XXX: handle reads racing with delete here. Simple approach is maybe
   * to use locks or sqlite transactions.
   */
  InitializeParams(dpp, &params);
  // Cut-off: anything last touched before (now - min_wait) is stale.
  params.op.obj.state.mtime = (real_clock::now() - make_timespan(min_wait));
  ret = ProcessOp(dpp, "DeleteStaleObjectData", &params);
  if (ret) {
    ldpp_dout(dpp, 0) << "In DeleteStaleObjectData failed err:(" <<ret<<")" << dendl;
  }

  return ret;
}
+
/**
 * GC thread main loop.
 *
 * Repeatedly walks all buckets (paged by `max`, resuming from
 * bucket_marker/user_marker) and purges stale tail data per bucket via
 * delete_stale_objs(). Between buckets it sleeps briefly on the condvar,
 * which doubles as the stop signal: when stop_signalled is set, the loop
 * exits via the `done` label. After a full sweep it sleeps gc_interval*10 ms
 * before starting over.
 */
void *DB::GC::entry() {
  do {
    // Held for the whole sweep; released only inside cv.wait_for().
    std::unique_lock<std::mutex> lk(mtx);

    ldpp_dout(dpp, 2) << " DB GC started " << dendl;
    int max = 100;
    RGWUserBuckets buckets;
    bool is_truncated = false;

    do {
      // Resume listing from where the previous page (or run) stopped.
      std::string& marker = bucket_marker;
      rgw_user user;
      user.id = user_marker;
      buckets.clear();
      is_truncated = false;

      int r = db->list_buckets(dpp, "all", user, marker, string(),
          max, false, &buckets, &is_truncated);

      if (r < 0) { //do nothing? retry later ?
        break;
      }

      for (const auto& ent : buckets.get_buckets()) {
        const std::string &bname = ent.first;

        r = db->delete_stale_objs(dpp, bname, gc_obj_min_wait);

        if (r < 0) { //do nothing? skip to next entry?
          ldpp_dout(dpp, 2) << " delete_stale_objs failed for bucket( " << bname <<")" << dendl;
        }
        // Persist progress so the next page/run continues after this bucket.
        bucket_marker = bname;
        user_marker = user.id;

        /* XXX: If using locks, unlock here and reacquire in the next iteration */
        // Short wait: yields the mutex and lets stop() interrupt the sweep.
        cv.wait_for(lk, std::chrono::milliseconds(100));
        if (stop_signalled) {
          goto done;
        }
      }
    } while(is_truncated);

    // Full sweep done; restart from the first bucket next time.
    bucket_marker.clear();
    cv.wait_for(lk, std::chrono::milliseconds(gc_interval*10));
  } while(! stop_signalled);

done:
  return nullptr;
}
+
+} } // namespace rgw::store
+
diff --git a/src/rgw/driver/dbstore/common/dbstore.h b/src/rgw/driver/dbstore/common/dbstore.h
new file mode 100644
index 000000000..b26cc116e
--- /dev/null
+++ b/src/rgw/driver/dbstore/common/dbstore.h
@@ -0,0 +1,2016 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <errno.h>
+#include <stdlib.h>
+#include <string>
+#include <stdio.h>
+#include <iostream>
+#include <mutex>
+#include <condition_variable>
+#include "fmt/format.h"
+#include <map>
+#include "rgw_sal_store.h"
+#include "rgw_common.h"
+#include "driver/rados/rgw_bucket.h"
+#include "global/global_context.h"
+#include "global/global_init.h"
+#include "common/ceph_context.h"
+#include "rgw_obj_manifest.h"
+#include "rgw_multi.h"
+
+namespace rgw { namespace store {
+
+class DB;
+
// Per-operation user payload: the user record plus its version and xattrs.
struct DBOpUserInfo {
  RGWUserInfo uinfo = {};
  obj_version user_version;   // objv of the stored user record
  rgw::sal::Attrs user_attrs; // user xattrs blob
};
+
// Per-operation bucket payload: record, version, attrs, and list-query state.
struct DBOpBucketInfo {
  RGWBucketEnt ent; // maybe not needed. not used in create/get_bucket
  RGWBucketInfo info;
  RGWUser* owner = nullptr; // non-owning; bucket owner, when known
  rgw::sal::Attrs bucket_attrs;
  obj_version bucket_version;
  ceph::real_time mtime;
  // used for list query
  std::string min_marker;
  std::string max_marker;
  std::list<RGWBucketEnt> list_entries; // results of a bucket-list query
};
+
// Per-operation object payload: head-object state, dir-entry fields,
// manifest data, omap, multipart info, and list-query in/out parameters.
struct DBOpObjectInfo {
  RGWAccessControlPolicy acls;
  RGWObjState state = {};

  /* Below are taken from rgw_bucket_dir_entry */
  RGWObjCategory category;
  std::string etag;
  std::string owner;
  std::string owner_display_name;
  std::string content_type;
  std::string storage_class;
  bool appendable;
  uint64_t index_ver;
  std::string tag;
  uint16_t flags;            // rgw_bucket_dir_entry::FLAG_* bits
  uint64_t versioned_epoch;

  /* from state.manifest (RGWObjManifest) */
  std::map<uint64_t, RGWObjManifestPart> objs;
  uint64_t head_size{0};
  rgw_placement_rule head_placement_rule;
  uint64_t max_head_size{0};
  std::string obj_id;        // ties head row to its tail (objectdata) rows
  rgw_bucket_placement tail_placement; /* might be different than the original bucket,
                                          as object might have been copied across pools */
  std::map<uint64_t, RGWObjManifestRule> rules;
  std::string tail_instance; /* tail object's instance */


  /* Obj's omap <key,value> store */
  std::map<std::string, bufferlist> omap;

  /* Extra fields */
  bool is_multipart;
  std::list<RGWUploadPartInfo> mp_parts;

  bufferlist head_data;      // first chunk stored inline with the head row
  std::string min_marker;    // list-query lower bound
  std::string max_marker;    // list-query upper bound
  std::string prefix;        // list-query prefix filter
  std::list<rgw_bucket_dir_entry> list_entries;
  /* XXX: Maybe use std::vector instead of std::list */

  /* for versioned objects */
  bool is_versioned;
  uint64_t version_num = 0;  // maintained by the object-table trigger
};
+
// Per-operation payload for one tail-data (objectdata table) chunk.
struct DBOpObjectDataInfo {
  RGWObjState state;
  uint64_t part_num;               // stripe number within the part
  std::string multipart_part_str;  // "<uploadid>.<partnum>"; "0.0" for regular objects
  uint64_t offset;                 // byte offset of this chunk within the object
  uint64_t size;                   // chunk size in bytes
  bufferlist data{};
};
+
// Per-operation payload for the lifecycle head table (one row per LC index).
struct DBOpLCHeadInfo {
  std::string index; // LC shard index key
  rgw::sal::StoreLifecycle::StoreLCHead head;
};
+
// Per-operation payload for the lifecycle entry table, plus list-query state.
struct DBOpLCEntryInfo {
  std::string index; // LC shard index key
  rgw::sal::StoreLifecycle::StoreLCEntry entry;
  // used for list query
  std::string min_marker;
  std::list<rgw::sal::StoreLifecycle::StoreLCEntry> list_entries;
};
+
// Aggregate of all per-operation payloads; each DB op reads/writes only
// the sub-struct(s) relevant to it.
struct DBOpInfo {
  std::string name; // Op name
  /* Support only single access_key for now. So store
   * it separately as primary access_key_id & secret to
   * be able to query easily.
   *
   * XXX: Swift keys and subuser not supported for now */
  DBOpUserInfo user;
  std::string query_str; // selects a query variant (e.g. "get_entry" vs "get_next_entry")
  DBOpBucketInfo bucket;
  DBOpObjectInfo obj;
  DBOpObjectDataInfo obj_data;
  DBOpLCHeadInfo lc_head;
  DBOpLCEntryInfo lc_entry;
  uint64_t list_max_count; // max rows returned by list-style ops
};
+
// Everything a single backend operation needs: context, the table names it
// targets, and the op payload itself.
struct DBOpParams {
  CephContext *cct;

  /* Tables */
  std::string user_table;
  std::string bucket_table;
  std::string object_table;

  /* Ops*/
  DBOpInfo op;

  std::string objectdata_table;
  std::string object_trigger; // trigger maintaining per-object VersionNum
  std::string object_view;    // temp view joining object & objectdata tables
  std::string quota_table;
  std::string lc_head_table;
  std::string lc_entry_table;
  std::string obj;
};
+
+/* Used for prepared schemas.
+ * Difference with above structure is that all
+ * the fields are strings here to accommodate any
+ * style identifiers used by backend db. By default
+ * initialized with sqlitedb style, can be overriden
+ * using InitPrepareParams()
+ *
+ * These identifiers are used in prepare and bind statements
+ * to get the right index of each param.
+ */
// Named-parameter placeholders (sqlite ":name" style) for user-table
// prepared statements; override via InitPrepareParams() for other backends.
struct DBOpUserPrepareInfo {
  static constexpr const char* user_id = ":user_id";
  static constexpr const char* tenant = ":tenant";
  static constexpr const char* ns = ":ns";
  static constexpr const char* display_name = ":display_name";
  static constexpr const char* user_email = ":user_email";
  /* Support only single access_key for now. So store
   * it separately as primary access_key_id & secret to
   * be able to query easily.
   *
   * In future, when need to support & query from multiple
   * access keys, better to maintain them in a separate table.
   */
  static constexpr const char* access_keys_id = ":access_keys_id";
  static constexpr const char* access_keys_secret = ":access_keys_secret";
  static constexpr const char* access_keys = ":access_keys";
  static constexpr const char* swift_keys = ":swift_keys";
  static constexpr const char* subusers = ":subusers";
  static constexpr const char* suspended = ":suspended";
  static constexpr const char* max_buckets = ":max_buckets";
  static constexpr const char* op_mask = ":op_mask";
  static constexpr const char* user_caps = ":user_caps";
  static constexpr const char* admin = ":admin";
  static constexpr const char* system = ":system";
  static constexpr const char* placement_name = ":placement_name";
  static constexpr const char* placement_storage_class = ":placement_storage_class";
  static constexpr const char* placement_tags = ":placement_tags";
  static constexpr const char* bucket_quota = ":bucket_quota";
  static constexpr const char* temp_url_keys = ":temp_url_keys";
  static constexpr const char* user_quota = ":user_quota";
  static constexpr const char* type = ":type";
  static constexpr const char* mfa_ids = ":mfa_ids";
  static constexpr const char* user_attrs = ":user_attrs";
  static constexpr const char* user_ver = ":user_vers";
  static constexpr const char* user_ver_tag = ":user_ver_tag";
};
+
// Named-parameter placeholders for bucket-table prepared statements.
struct DBOpBucketPrepareInfo {
  static constexpr const char* bucket_name = ":bucket_name";
  static constexpr const char* tenant = ":tenant";
  static constexpr const char* marker = ":marker";
  static constexpr const char* bucket_id = ":bucket_id";
  static constexpr const char* size = ":size";
  static constexpr const char* size_rounded = ":size_rounded";
  static constexpr const char* creation_time = ":creation_time";
  static constexpr const char* count = ":count";
  static constexpr const char* placement_name = ":placement_name";
  static constexpr const char* placement_storage_class = ":placement_storage_class";
  /* ownerid - maps to DBOpUserPrepareInfo */
  static constexpr const char* flags = ":flags";
  static constexpr const char* zonegroup = ":zonegroup";
  static constexpr const char* has_instance_obj = ":has_instance_obj";
  static constexpr const char* quota = ":quota";
  static constexpr const char* requester_pays = ":requester_pays";
  static constexpr const char* has_website = ":has_website";
  static constexpr const char* website_conf = ":website_conf";
  static constexpr const char* swift_versioning = ":swift_versioning";
  static constexpr const char* swift_ver_location = ":swift_ver_location";
  static constexpr const char* mdsearch_config = ":mdsearch_config";
  static constexpr const char* new_bucket_instance_id = ":new_bucket_instance_id";
  static constexpr const char* obj_lock = ":obj_lock";
  static constexpr const char* sync_policy_info_groups = ":sync_policy_info_groups";
  static constexpr const char* bucket_attrs = ":bucket_attrs";
  static constexpr const char* bucket_ver = ":bucket_vers";
  static constexpr const char* bucket_ver_tag = ":bucket_ver_tag";
  static constexpr const char* mtime = ":mtime";
  // range bounds for bucket-list queries
  static constexpr const char* min_marker = ":min_marker";
  static constexpr const char* max_marker = ":max_marker";
};
+
// Named-parameter placeholders for object-table prepared statements.
struct DBOpObjectPrepareInfo {
  static constexpr const char* obj_name = ":obj_name";
  static constexpr const char* obj_instance = ":obj_instance";
  static constexpr const char* obj_ns = ":obj_ns";
  static constexpr const char* acls = ":acls";
  static constexpr const char* index_ver = ":index_ver";
  static constexpr const char* tag = ":tag";
  static constexpr const char* flags = ":flags";
  static constexpr const char* versioned_epoch = ":versioned_epoch";
  static constexpr const char* obj_category = ":obj_category";
  static constexpr const char* etag = ":etag";
  static constexpr const char* owner = ":owner";
  static constexpr const char* owner_display_name = ":owner_display_name";
  static constexpr const char* storage_class = ":storage_class";
  static constexpr const char* appendable = ":appendable";
  static constexpr const char* content_type = ":content_type";
  static constexpr const char* index_hash_source = ":index_hash_source";
  static constexpr const char* obj_size = ":obj_size";
  static constexpr const char* accounted_size = ":accounted_size";
  static constexpr const char* mtime = ":mtime";
  static constexpr const char* epoch = ":epoch";
  static constexpr const char* obj_tag = ":obj_tag";
  static constexpr const char* tail_tag = ":tail_tag";
  static constexpr const char* write_tag = ":write_tag";
  static constexpr const char* fake_tag = ":fake_tag";
  static constexpr const char* shadow_obj = ":shadow_obj";
  static constexpr const char* has_data = ":has_data";
  static constexpr const char* is_versioned = ":is_versioned";
  static constexpr const char* version_num = ":version_num";
  static constexpr const char* pg_ver = ":pg_ver";
  static constexpr const char* zone_short_id = ":zone_short_id";
  static constexpr const char* obj_version = ":obj_version";
  static constexpr const char* obj_version_tag = ":obj_version_tag";
  static constexpr const char* obj_attrs = ":obj_attrs";
  static constexpr const char* head_size = ":head_size";
  static constexpr const char* max_head_size = ":max_head_size";
  static constexpr const char* obj_id = ":obj_id";
  static constexpr const char* tail_instance = ":tail_instance";
  static constexpr const char* head_placement_rule_name = ":head_placement_rule_name";
  static constexpr const char* head_placement_storage_class  = ":head_placement_storage_class";
  static constexpr const char* tail_placement_rule_name = ":tail_placement_rule_name";
  static constexpr const char* tail_placement_storage_class = ":tail_placement_storage_class";
  static constexpr const char* manifest_part_objs = ":manifest_part_objs";
  static constexpr const char* manifest_part_rules = ":manifest_part_rules";
  static constexpr const char* omap = ":omap";
  static constexpr const char* is_multipart = ":is_multipart";
  static constexpr const char* mp_parts = ":mp_parts";
  static constexpr const char* head_data = ":head_data";
  // range bounds / prefix filter for object-list queries
  static constexpr const char* min_marker = ":min_marker";
  static constexpr const char* max_marker = ":max_marker";
  static constexpr const char* prefix = ":prefix";
  /* Below used to update mp_parts obj name
   * from meta object to src object on completion */
  static constexpr const char* new_obj_name = ":new_obj_name";
  static constexpr const char* new_obj_instance = ":new_obj_instance";
  static constexpr const char* new_obj_ns  = ":new_obj_ns";
};
+
// Named-parameter placeholders for objectdata-table prepared statements.
struct DBOpObjectDataPrepareInfo {
  static constexpr const char* part_num = ":part_num";
  static constexpr const char* offset = ":offset";
  static constexpr const char* data = ":data";
  static constexpr const char* size = ":size";
  static constexpr const char* multipart_part_str = ":multipart_part_str";
};
+
// Named-parameter placeholders for LC entry-table prepared statements.
struct DBOpLCEntryPrepareInfo {
  static constexpr const char* index = ":index";
  static constexpr const char* bucket_name = ":bucket_name";
  static constexpr const char* start_time = ":start_time";
  static constexpr const char* status = ":status";
  static constexpr const char* min_marker = ":min_marker";
};
+
// Named-parameter placeholders for LC head-table prepared statements.
struct DBOpLCHeadPrepareInfo {
  static constexpr const char* index = ":index";
  static constexpr const char* start_date = ":start_date";
  static constexpr const char* marker = ":marker";
};
+
// Aggregate of all placeholder sets used when preparing/binding statements;
// mirrors the layout of DBOpInfo.
struct DBOpPrepareInfo {
  DBOpUserPrepareInfo user;
  std::string_view query_str; // view into DBOpInfo::query_str
  DBOpBucketPrepareInfo bucket;
  DBOpObjectPrepareInfo obj;
  DBOpObjectDataPrepareInfo obj_data;
  DBOpLCHeadPrepareInfo lc_head;
  DBOpLCEntryPrepareInfo lc_entry;
  static constexpr const char* list_max_count = ":list_max_count";
};
+
// String-typed analogue of DBOpParams used at statement-preparation time
// (table names are substituted into the query templates).
struct DBOpPrepareParams {
  /* Tables */
  std::string user_table;
  std::string bucket_table;
  std::string object_table;

  /* Ops */
  DBOpPrepareInfo op;


  std::string objectdata_table;
  std::string object_trigger;
  std::string object_view;
  std::string quota_table;
  std::string lc_head_table;
  std::string lc_entry_table;
};
+
// Handles to the non-object prepared operations (user/bucket/LC),
// shared across the DB instance; looked up by name in ProcessOp().
struct DBOps {
  std::shared_ptr<class InsertUserOp> InsertUser;
  std::shared_ptr<class RemoveUserOp> RemoveUser;
  std::shared_ptr<class GetUserOp> GetUser;
  std::shared_ptr<class InsertBucketOp> InsertBucket;
  std::shared_ptr<class UpdateBucketOp> UpdateBucket;
  std::shared_ptr<class RemoveBucketOp> RemoveBucket;
  std::shared_ptr<class GetBucketOp> GetBucket;
  std::shared_ptr<class ListUserBucketsOp> ListUserBuckets;
  std::shared_ptr<class InsertLCEntryOp> InsertLCEntry;
  std::shared_ptr<class RemoveLCEntryOp> RemoveLCEntry;
  std::shared_ptr<class GetLCEntryOp> GetLCEntry;
  std::shared_ptr<class ListLCEntriesOp> ListLCEntries;
  std::shared_ptr<class InsertLCHeadOp> InsertLCHead;
  std::shared_ptr<class RemoveLCHeadOp> RemoveLCHead;
  std::shared_ptr<class GetLCHeadOp> GetLCHead;
};
+
// Handles to the object/objectdata prepared operations. Backends subclass
// this and override InitializeObjectOps() to prepare their statements.
class ObjectOp {
  public:
    ObjectOp() {};

    virtual ~ObjectOp() {}

    std::shared_ptr<class PutObjectOp> PutObject;
    std::shared_ptr<class DeleteObjectOp> DeleteObject;
    std::shared_ptr<class GetObjectOp> GetObject;
    std::shared_ptr<class UpdateObjectOp> UpdateObject;
    std::shared_ptr<class ListBucketObjectsOp> ListBucketObjects;
    std::shared_ptr<class ListVersionedObjectsOp> ListVersionedObjects;
    std::shared_ptr<class PutObjectDataOp> PutObjectData;
    std::shared_ptr<class UpdateObjectDataOp> UpdateObjectData;
    std::shared_ptr<class GetObjectDataOp> GetObjectData;
    std::shared_ptr<class DeleteObjectDataOp> DeleteObjectData;
    std::shared_ptr<class DeleteStaleObjectDataOp> DeleteStaleObjectData;

    // Base implementation is a no-op; backends prepare their statements here.
    virtual int InitializeObjectOps(std::string db_name, const DoutPrefixProvider *dpp) { return 0; }
};
+
+class DBOp {
+ private:
+ static constexpr std::string_view CreateUserTableQ =
+ /* Corresponds to rgw::sal::User
+ *
+ * For now only UserID is made Primary key.
+ * If multiple tenants are stored in single .db handle, should
+ * make both (UserID, Tenant) as Primary Key.
+ *
+ * XXX:
+ * - AccessKeys, SwiftKeys, Subusers (map<>) are stored as blob.
+ * To enable easy query, first accesskey is stored in separate fields
+ * AccessKeysID, AccessKeysSecret.
+ * In future, may be have separate table to store these keys and
+ * query on that table.
+ * - Quota stored as blob .. should be linked to quota table.
+ */
+ "CREATE TABLE IF NOT EXISTS '{}' ( \
+ UserID TEXT NOT NULL UNIQUE, \
+ Tenant TEXT , \
+ NS TEXT , \
+ DisplayName TEXT , \
+ UserEmail TEXT , \
+ AccessKeysID TEXT , \
+ AccessKeysSecret TEXT , \
+ AccessKeys BLOB , \
+ SwiftKeys BLOB , \
+ SubUsers BLOB , \
+ Suspended INTEGER , \
+ MaxBuckets INTEGER , \
+ OpMask INTEGER , \
+ UserCaps BLOB , \
+ Admin INTEGER , \
+ System INTEGER , \
+ PlacementName TEXT , \
+ PlacementStorageClass TEXT , \
+ PlacementTags BLOB , \
+ BucketQuota BLOB , \
+ TempURLKeys BLOB , \
+ UserQuota BLOB , \
+ TYPE INTEGER , \
+ MfaIDs BLOB , \
+ AssumedRoleARN TEXT , \
+ UserAttrs BLOB, \
+ UserVersion INTEGER, \
+ UserVersionTag TEXT, \
+ PRIMARY KEY (UserID) \n);";
+
+ static constexpr std::string_view CreateBucketTableQ =
+ /* Corresponds to rgw::sal::Bucket
+ *
+ * For now only BucketName is made Primary key. Since buckets should
+ * be unique across users in rgw, OwnerID is not made part of primary key.
+ * However it is still referenced as foreign key
+ *
+ * If multiple tenants are stored in single .db handle, should
+ * make both (BucketName, Tenant) as Primary Key. Also should
+ * reference (UserID, Tenant) as Foreign key.
+ *
+ * leaving below RADOS specific fields
+ * - rgw_data_placement_target explicit_placement (struct rgw_bucket)
+ * - rgw::BucketLayout layout (struct RGWBucketInfo)
+ * - const static uint32_t NUM_SHARDS_BLIND_BUCKET (struct RGWBucketInfo),
+ * should be '0' indicating no sharding.
+ * - cls_rgw_reshard_status reshard_status (struct RGWBucketInfo)
+ *
+ * XXX:
+ * - Quota stored as blob .. should be linked to quota table.
+ * - WebsiteConf stored as BLOB..if required, should be split
+ * - Storing bucket_version (struct RGWBucket), objv_tracker
+ * (struct RGWBucketInfo) separately. Are they same?
+ *
+ */
+ "CREATE TABLE IF NOT EXISTS '{}' ( \
+ BucketName TEXT NOT NULL UNIQUE , \
+ Tenant TEXT, \
+ Marker TEXT, \
+ BucketID TEXT, \
+ Size INTEGER, \
+ SizeRounded INTEGER,\
+ CreationTime BLOB, \
+ Count INTEGER, \
+ PlacementName TEXT , \
+ PlacementStorageClass TEXT , \
+ OwnerID TEXT NOT NULL, \
+ Flags INTEGER, \
+ Zonegroup TEXT, \
+ HasInstanceObj BOOLEAN, \
+ Quota BLOB, \
+ RequesterPays BOOLEAN, \
+ HasWebsite BOOLEAN, \
+ WebsiteConf BLOB, \
+ SwiftVersioning BOOLEAN, \
+ SwiftVerLocation TEXT, \
+ MdsearchConfig BLOB, \
+ NewBucketInstanceID TEXT,\
+ ObjectLock BLOB, \
+ SyncPolicyInfoGroups BLOB, \
+ BucketAttrs BLOB, \
+ BucketVersion INTEGER, \
+ BucketVersionTag TEXT, \
+ Mtime BLOB, \
+ PRIMARY KEY (BucketName) \
+ FOREIGN KEY (OwnerID) \
+ REFERENCES '{}' (UserID) ON DELETE CASCADE ON UPDATE CASCADE \n);";
+
+ static constexpr std::string_view CreateObjectTableTriggerQ =
+ "CREATE TRIGGER IF NOT EXISTS '{}' \
+ AFTER INSERT ON '{}' \
+ BEGIN \
+ UPDATE '{}' \
+ SET VersionNum = (SELECT COALESCE(max(VersionNum), 0) from '{}' where ObjName = new.ObjName) + 1 \
+ where ObjName = new.ObjName and ObjInstance = new.ObjInstance; \
+ END;";
+
+ static constexpr std::string_view CreateObjectTableQ =
+ /* Corresponds to rgw::sal::Object
+ *
+ * For now only BucketName, ObjName is made Primary key.
+ * If multiple tenants are stored in single .db handle, should
+ * include Tenant too in the Primary Key. Also should
+ * reference (BucketID, Tenant) as Foreign key.
+ *
+ * referring to
+ * - rgw_bucket_dir_entry - following are added for now
+ * flags,
+ * versioned_epoch
+ * tag
+ * index_ver
+ * meta.category
+ * meta.etag
+ * meta.storageclass
+ * meta.appendable
+ * meta.content_type
+ * meta.owner
+ * meta.owner_display_name
+ *
+ * - RGWObjState. Below are omitted from that struct
+ * as they seem in-memory variables
+ * * is_atomic, has_atts, exists, prefetch_data, keep_tail,
+ * - RGWObjManifest
+ *
+ * Extra field added "IsMultipart" to flag multipart uploads,
+ * HeadData to store first chunk data.
+ */
+ "CREATE TABLE IF NOT EXISTS '{}' ( \
+ ObjName TEXT NOT NULL , \
+ ObjInstance TEXT, \
+ ObjNS TEXT, \
+ BucketName TEXT NOT NULL , \
+ ACLs BLOB, \
+ IndexVer INTEGER, \
+ Tag TEXT, \
+ Flags INTEGER, \
+ VersionedEpoch INTEGER, \
+ ObjCategory INTEGER, \
+ Etag TEXT, \
+ Owner TEXT, \
+ OwnerDisplayName TEXT, \
+ StorageClass TEXT, \
+ Appendable BOOL, \
+ ContentType TEXT, \
+ IndexHashSource TEXT, \
+ ObjSize INTEGER, \
+ AccountedSize INTEGER, \
+ Mtime BLOB, \
+ Epoch INTEGER, \
+ ObjTag BLOB, \
+ TailTag BLOB, \
+ WriteTag TEXT, \
+ FakeTag BOOL, \
+ ShadowObj TEXT, \
+ HasData BOOL, \
+ IsVersioned BOOL, \
+ VersionNum INTEGER, \
+ PGVer INTEGER, \
+ ZoneShortID INTEGER, \
+ ObjVersion INTEGER, \
+ ObjVersionTag TEXT, \
+ ObjAttrs BLOB, \
+ HeadSize INTEGER, \
+ MaxHeadSize INTEGER, \
+ ObjID TEXT NOT NULL, \
+ TailInstance TEXT, \
+ HeadPlacementRuleName TEXT, \
+ HeadPlacementRuleStorageClass TEXT, \
+ TailPlacementRuleName TEXT, \
+ TailPlacementStorageClass TEXT, \
+ ManifestPartObjs BLOB, \
+ ManifestPartRules BLOB, \
+ Omap BLOB, \
+ IsMultipart BOOL, \
+ MPPartsList BLOB, \
+ HeadData BLOB, \
+ PRIMARY KEY (ObjName, ObjInstance, BucketName), \
+ FOREIGN KEY (BucketName) \
+ REFERENCES '{}' (BucketName) ON DELETE CASCADE ON UPDATE CASCADE \n);";
+
+ static constexpr std::string_view CreateObjectDataTableQ =
+ /* Extra field 'MultipartPartStr' added which signifies multipart
+ * <uploadid + partnum>. For regular object, it is '0.0'
+ *
+ * - part: a collection of stripes that make a contiguous part of an
+ object. A regular object will only have one part (although might have
+ many stripes), a multipart object might have many parts. Each part
+ has a fixed stripe size (ObjChunkSize), although the last stripe of a
+ part might be smaller than that.
+ */
+ "CREATE TABLE IF NOT EXISTS '{}' ( \
+ ObjName TEXT NOT NULL , \
+ ObjInstance TEXT, \
+ ObjNS TEXT, \
+ BucketName TEXT NOT NULL , \
+ ObjID TEXT NOT NULL , \
+ MultipartPartStr TEXT, \
+ PartNum INTEGER NOT NULL, \
+ Offset INTEGER, \
+ Size INTEGER, \
+ Mtime BLOB, \
+ Data BLOB, \
+ PRIMARY KEY (ObjName, BucketName, ObjInstance, ObjID, MultipartPartStr, PartNum), \
+ FOREIGN KEY (BucketName) \
+ REFERENCES '{}' (BucketName) ON DELETE CASCADE ON UPDATE CASCADE \n);";
+
+ static constexpr std::string_view CreateObjectViewQ =
+ /* This query creats temporary view with entries from ObjectData table which have
+ * corresponding head object (i.e, with same ObjName, ObjInstance, ObjNS, ObjID)
+ * in the Object table.
+ *
+ * GC thread can use this view to delete stale entries from the ObjectData table which
+ * do not exist in this view.
+ *
+ * XXX: This view is throwing ForeignKey mismatch error, mostly may be because all the keys
+ * of objectdata table are not referenced here. So this view is not used atm.
+ */
+ "CREATE TEMP VIEW IF NOT EXISTS '{}' AS \
+ SELECT s.ObjName, s.ObjInstance, s.ObjID from '{}' as s INNER JOIN '{}' USING \
+ (ObjName, BucketName, ObjInstance, ObjID);";
+
+
+ static constexpr std::string_view CreateQuotaTableQ =
+ "CREATE TABLE IF NOT EXISTS '{}' ( \
+ QuotaID INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE , \
+ MaxSizeSoftThreshold INTEGER , \
+ MaxObjsSoftThreshold INTEGER , \
+ MaxSize INTEGER , \
+ MaxObjects INTEGER , \
+ Enabled Boolean , \
+ CheckOnRaw Boolean \n);";
+
+ static constexpr std::string_view CreateLCEntryTableQ =
+ "CREATE TABLE IF NOT EXISTS '{}' ( \
+ LCIndex TEXT NOT NULL , \
+ BucketName TEXT NOT NULL , \
+ StartTime INTEGER , \
+ Status INTEGER , \
+ PRIMARY KEY (LCIndex, BucketName) \n);";
+
+ static constexpr std::string_view CreateLCHeadTableQ =
+ "CREATE TABLE IF NOT EXISTS '{}' ( \
+ LCIndex TEXT NOT NULL , \
+ Marker TEXT , \
+ StartDate INTEGER , \
+ PRIMARY KEY (LCIndex) \n);";
+
+ static constexpr std::string_view DropQ = "DROP TABLE IF EXISTS '{}'";
+ static constexpr std::string_view ListAllQ = "SELECT * from '{}'";
+
+ public:
+ DBOp() {}
+ virtual ~DBOp() {}
+ std::mutex mtx; // to protect prepared stmt
+
+ static std::string CreateTableSchema(std::string_view type,
+ const DBOpParams *params) {
+ if (!type.compare("User"))
+ return fmt::format(CreateUserTableQ,
+ params->user_table);
+ if (!type.compare("Bucket"))
+ return fmt::format(CreateBucketTableQ,
+ params->bucket_table,
+ params->user_table);
+ if (!type.compare("Object"))
+ return fmt::format(CreateObjectTableQ,
+ params->object_table,
+ params->bucket_table);
+ if (!type.compare("ObjectTrigger"))
+ return fmt::format(CreateObjectTableTriggerQ,
+ params->object_trigger,
+ params->object_table,
+ params->object_table,
+ params->object_table);
+ if (!type.compare("ObjectData"))
+ return fmt::format(CreateObjectDataTableQ,
+ params->objectdata_table,
+ params->bucket_table);
+ if (!type.compare("ObjectView"))
+ return fmt::format(CreateObjectTableQ,
+ params->object_view,
+ params->objectdata_table,
+ params->object_table);
+ if (!type.compare("Quota"))
+ return fmt::format(CreateQuotaTableQ,
+ params->quota_table);
+ if (!type.compare("LCHead"))
+ return fmt::format(CreateLCHeadTableQ,
+ params->lc_head_table);
+ if (!type.compare("LCEntry"))
+ return fmt::format(CreateLCEntryTableQ,
+ params->lc_entry_table,
+ params->bucket_table);
+
+ ceph_abort_msgf("incorrect table type %.*s", type.size(), type.data());
+ }
+
  /// Return a "DROP TABLE IF EXISTS" statement for the given table name.
  static std::string DeleteTableSchema(std::string_view table) {
    return fmt::format(DropQ, table);
  }
  /// Return a "SELECT * from <table>" statement listing every row.
  static std::string ListTableSchema(std::string_view table) {
    return fmt::format(ListAllQ, table);
  }
+
  /* Default no-op statement hooks; concrete backend ops (e.g. the SQLite
   * classes in sqliteDB.cc) override these to prepare, bind parameters to,
   * and execute the actual statement. All return 0 on success. */
  virtual int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params) { return 0; }
  virtual int Bind(const DoutPrefixProvider *dpp, DBOpParams *params) { return 0; }
  virtual int Execute(const DoutPrefixProvider *dpp, DBOpParams *params) { return 0; }
+};
+
/* Inserts (or fully replaces) a row in the user table. */
class InsertUserOp : virtual public DBOp {
  private:
    /* For existing entries:
     * (1) INSERT OR REPLACE - deletes the previous entry and then inserts
     *     the new one. Since it deletes the previous entry, it fires all
     *     foreign-key cascade deletes and any other triggers.
     * (2) INSERT OR UPDATE - would leave NULL values in unassigned fields.
     * more info: https://code-examples.net/en/q/377728
     *
     * For now using INSERT OR REPLACE. If updating an existing record is
     * required, another query will be used.
     */
    static constexpr std::string_view Query = "INSERT OR REPLACE INTO '{}' \
      (UserID, Tenant, NS, DisplayName, UserEmail, \
       AccessKeysID, AccessKeysSecret, AccessKeys, SwiftKeys,\
       SubUsers, Suspended, MaxBuckets, OpMask, UserCaps, Admin, \
       System, PlacementName, PlacementStorageClass, PlacementTags, \
       BucketQuota, TempURLKeys, UserQuota, Type, MfaIDs, \
       UserAttrs, UserVersion, UserVersionTag) \
      VALUES ({}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, \
          {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {});";

  public:
    virtual ~InsertUserOp() {}

    /* Format the insert statement. NOTE: the argument order below must match
     * the column list in Query exactly — 27 values after the table name. */
    static std::string Schema(DBOpPrepareParams &params) {
      return fmt::format(Query, params.user_table,
          params.op.user.user_id, params.op.user.tenant, params.op.user.ns,
          params.op.user.display_name, params.op.user.user_email,
          params.op.user.access_keys_id, params.op.user.access_keys_secret,
          params.op.user.access_keys, params.op.user.swift_keys,
          params.op.user.subusers, params.op.user.suspended,
          params.op.user.max_buckets, params.op.user.op_mask,
          params.op.user.user_caps, params.op.user.admin, params.op.user.system,
          params.op.user.placement_name, params.op.user.placement_storage_class,
          params.op.user.placement_tags, params.op.user.bucket_quota,
          params.op.user.temp_url_keys, params.op.user.user_quota,
          params.op.user.type, params.op.user.mfa_ids,
          params.op.user.user_attrs, params.op.user.user_ver,
          params.op.user.user_ver_tag);
    }

};
+
+class RemoveUserOp: virtual public DBOp {
+ private:
+ static constexpr std::string_view Query =
+ "DELETE from '{}' where UserID = {}";
+
+ public:
+ virtual ~RemoveUserOp() {}
+
+ static std::string Schema(DBOpPrepareParams &params) {
+ return fmt::format(Query, params.user_table,
+ params.op.user.user_id);
+ }
+};
+
/* Fetches a user row by id, email, or access key, selected via
 * params.op.query_str. */
class GetUserOp: virtual public DBOp {
  private:
    /* If below query columns are updated, make sure to update the indexes
     * in list_user() cbk in sqliteDB.cc */
    /* NOTE(review): these SELECTs include an AssumedRoleARN column that
     * InsertUserOp does not write — confirm the user-table schema actually
     * declares it. */
    static constexpr std::string_view Query = "SELECT \
      UserID, Tenant, NS, DisplayName, UserEmail, \
      AccessKeysID, AccessKeysSecret, AccessKeys, SwiftKeys,\
      SubUsers, Suspended, MaxBuckets, OpMask, UserCaps, Admin, \
      System, PlacementName, PlacementStorageClass, PlacementTags, \
      BucketQuota, TempURLKeys, UserQuota, Type, MfaIDs, AssumedRoleARN, \
      UserAttrs, UserVersion, UserVersionTag from '{}' where UserID = {}";

    static constexpr std::string_view QueryByEmail = "SELECT \
      UserID, Tenant, NS, DisplayName, UserEmail, \
      AccessKeysID, AccessKeysSecret, AccessKeys, SwiftKeys,\
      SubUsers, Suspended, MaxBuckets, OpMask, UserCaps, Admin, \
      System, PlacementName, PlacementStorageClass, PlacementTags, \
      BucketQuota, TempURLKeys, UserQuota, Type, MfaIDs, AssumedRoleARN, \
      UserAttrs, UserVersion, UserVersionTag from '{}' where UserEmail = {}";

    static constexpr std::string_view QueryByAccessKeys = "SELECT \
      UserID, Tenant, NS, DisplayName, UserEmail, \
      AccessKeysID, AccessKeysSecret, AccessKeys, SwiftKeys,\
      SubUsers, Suspended, MaxBuckets, OpMask, UserCaps, Admin, \
      System, PlacementName, PlacementStorageClass, PlacementTags, \
      BucketQuota, TempURLKeys, UserQuota, Type, MfaIDs, AssumedRoleARN, \
      UserAttrs, UserVersion, UserVersionTag from '{}' where AccessKeysID = {}";

    static constexpr std::string_view QueryByUserID = "SELECT \
      UserID, Tenant, NS, DisplayName, UserEmail, \
      AccessKeysID, AccessKeysSecret, AccessKeys, SwiftKeys,\
      SubUsers, Suspended, MaxBuckets, OpMask, UserCaps, Admin, \
      System, PlacementName, PlacementStorageClass, PlacementTags, \
      BucketQuota, TempURLKeys, UserQuota, Type, MfaIDs, AssumedRoleARN, \
      UserAttrs, UserVersion, UserVersionTag \
      from '{}' where UserID = {}";

  public:
    virtual ~GetUserOp() {}

    /* Pick the SELECT variant by query_str ("email", "access_key",
     * "user_id"); anything else falls back to the default UserID lookup. */
    static std::string Schema(DBOpPrepareParams &params) {
      if (params.op.query_str == "email") {
        return fmt::format(QueryByEmail, params.user_table,
            params.op.user.user_email);
      } else if (params.op.query_str == "access_key") {
        return fmt::format(QueryByAccessKeys,
            params.user_table,
            params.op.user.access_keys_id);
      } else if (params.op.query_str == "user_id") {
        return fmt::format(QueryByUserID,
            params.user_table,
            params.op.user.user_id);
      } else {
        return fmt::format(Query, params.user_table,
            params.op.user.user_id);
      }
    }
};
+
/* Inserts (or fully replaces) a row in the bucket table. */
class InsertBucketOp: virtual public DBOp {
  private:
    static constexpr std::string_view Query =
      "INSERT OR REPLACE INTO '{}' \
      (BucketName, Tenant, Marker, BucketID, Size, SizeRounded, CreationTime, \
      Count, PlacementName, PlacementStorageClass, OwnerID, Flags, Zonegroup, \
      HasInstanceObj, Quota, RequesterPays, HasWebsite, WebsiteConf, \
      SwiftVersioning, SwiftVerLocation, \
      MdsearchConfig, NewBucketInstanceID, ObjectLock, \
      SyncPolicyInfoGroups, BucketAttrs, BucketVersion, BucketVersionTag, Mtime) \
      VALUES ({}, {}, {}, {}, {}, {}, {}, {}, {}, \
          {}, {}, {}, {}, {}, {}, {}, {}, {}, \
          {}, {}, {}, {}, {}, {}, {}, {}, {}, {})";

  public:
    virtual ~InsertBucketOp() {}

    /* Format the insert. Argument order must track the column list exactly;
     * note OwnerID is supplied from params.op.user.user_id. */
    static std::string Schema(DBOpPrepareParams &params) {
      return fmt::format(Query, params.bucket_table,
          params.op.bucket.bucket_name, params.op.bucket.tenant,
          params.op.bucket.marker, params.op.bucket.bucket_id,
          params.op.bucket.size, params.op.bucket.size_rounded,
          params.op.bucket.creation_time, params.op.bucket.count,
          params.op.bucket.placement_name, params.op.bucket.placement_storage_class,
          params.op.user.user_id,
          params.op.bucket.flags, params.op.bucket.zonegroup, params.op.bucket.has_instance_obj,
          params.op.bucket.quota, params.op.bucket.requester_pays, params.op.bucket.has_website,
          params.op.bucket.website_conf, params.op.bucket.swift_versioning,
          params.op.bucket.swift_ver_location, params.op.bucket.mdsearch_config,
          params.op.bucket.new_bucket_instance_id, params.op.bucket.obj_lock,
          params.op.bucket.sync_policy_info_groups, params.op.bucket.bucket_attrs,
          params.op.bucket.bucket_ver, params.op.bucket.bucket_ver_tag,
          params.op.bucket.mtime);
    }
};
+
/* Updates a bucket row; the set of columns touched is chosen by
 * params.op.query_str ("info", "attrs", or "owner"). */
class UpdateBucketOp: virtual public DBOp {
  private:
    // Updates Info, Mtime, Version
    static constexpr std::string_view InfoQuery =
      "UPDATE '{}' SET Tenant = {}, Marker = {}, BucketID = {}, CreationTime = {}, \
      Count = {}, PlacementName = {}, PlacementStorageClass = {}, OwnerID = {}, Flags = {}, \
      Zonegroup = {}, HasInstanceObj = {}, Quota = {}, RequesterPays = {}, HasWebsite = {}, \
      WebsiteConf = {}, SwiftVersioning = {}, SwiftVerLocation = {}, MdsearchConfig = {}, \
      NewBucketInstanceID = {}, ObjectLock = {}, SyncPolicyInfoGroups = {}, \
      BucketVersion = {}, Mtime = {} WHERE BucketName = {}";
    // Updates Attrs, OwnerID, Mtime, Version
    static constexpr std::string_view AttrsQuery =
      "UPDATE '{}' SET OwnerID = {}, BucketAttrs = {}, Mtime = {}, BucketVersion = {} \
      WHERE BucketName = {}";
    // Updates OwnerID, CreationTime, Mtime, Version
    static constexpr std::string_view OwnerQuery =
      "UPDATE '{}' SET OwnerID = {}, CreationTime = {}, Mtime = {}, BucketVersion = {} WHERE BucketName = {}";

  public:
    virtual ~UpdateBucketOp() {}

    /* Returns the formatted UPDATE for the selected variant, or "" when
     * query_str matches none of them (callers treat that as unsupported). */
    static std::string Schema(DBOpPrepareParams &params) {
      if (params.op.query_str == "info") {
        return fmt::format(InfoQuery, params.bucket_table,
            params.op.bucket.tenant, params.op.bucket.marker, params.op.bucket.bucket_id,
            params.op.bucket.creation_time, params.op.bucket.count,
            params.op.bucket.placement_name, params.op.bucket.placement_storage_class,
            params.op.user.user_id,
            params.op.bucket.flags, params.op.bucket.zonegroup, params.op.bucket.has_instance_obj,
            params.op.bucket.quota, params.op.bucket.requester_pays, params.op.bucket.has_website,
            params.op.bucket.website_conf, params.op.bucket.swift_versioning,
            params.op.bucket.swift_ver_location, params.op.bucket.mdsearch_config,
            params.op.bucket.new_bucket_instance_id, params.op.bucket.obj_lock,
            params.op.bucket.sync_policy_info_groups,
            params.op.bucket.bucket_ver, params.op.bucket.mtime,
            params.op.bucket.bucket_name);
      }
      if (params.op.query_str == "attrs") {
        return fmt::format(AttrsQuery, params.bucket_table,
            params.op.user.user_id, params.op.bucket.bucket_attrs,
            params.op.bucket.mtime,
            params.op.bucket.bucket_ver, params.op.bucket.bucket_name);
      }
      if (params.op.query_str == "owner") {
        return fmt::format(OwnerQuery, params.bucket_table,
            params.op.user.user_id, params.op.bucket.creation_time,
            params.op.bucket.mtime,
            params.op.bucket.bucket_ver, params.op.bucket.bucket_name);
      }
      return "";
    }
};
+
+class RemoveBucketOp: virtual public DBOp {
+ private:
+ static constexpr std::string_view Query =
+ "DELETE from '{}' where BucketName = {}";
+
+ public:
+ virtual ~RemoveBucketOp() {}
+
+ static std::string Schema(DBOpPrepareParams &params) {
+ return fmt::format(Query, params.bucket_table,
+ params.op.bucket.bucket_name);
+ }
+};
+
+class GetBucketOp: virtual public DBOp {
+ private:
+ static constexpr std::string_view Query = "SELECT \
+ BucketName, BucketTable.Tenant, Marker, BucketID, Size, SizeRounded, CreationTime, \
+ Count, BucketTable.PlacementName, BucketTable.PlacementStorageClass, OwnerID, Flags, Zonegroup, \
+ HasInstanceObj, Quota, RequesterPays, HasWebsite, WebsiteConf, \
+ SwiftVersioning, SwiftVerLocation, \
+ MdsearchConfig, NewBucketInstanceID, ObjectLock, \
+ SyncPolicyInfoGroups, BucketAttrs, BucketVersion, BucketVersionTag, Mtime, NS \
+ from '{}' as BucketTable INNER JOIN '{}' ON OwnerID = UserID where BucketName = {}";
+
+ public:
+ virtual ~GetBucketOp() {}
+
+ static std::string Schema(DBOpPrepareParams &params) {
+ //return fmt::format(Query, params.op.bucket.bucket_name,
+ // params.bucket_table, params.user_table);
+ return fmt::format(Query,
+ params.bucket_table, params.user_table,
+ params.op.bucket.bucket_name);
+ }
+};
+
+class ListUserBucketsOp: virtual public DBOp {
+ private:
+ // once we have stats also stored, may have to update this query to join
+ // these two tables.
+ static constexpr std::string_view Query = "SELECT \
+ BucketName, Tenant, Marker, BucketID, Size, SizeRounded, CreationTime, \
+ Count, PlacementName, PlacementStorageClass, OwnerID, Flags, Zonegroup, \
+ HasInstanceObj, Quota, RequesterPays, HasWebsite, WebsiteConf, \
+ SwiftVersioning, SwiftVerLocation, \
+ MdsearchConfig, NewBucketInstanceID, ObjectLock, \
+ SyncPolicyInfoGroups, BucketAttrs, BucketVersion, BucketVersionTag, Mtime \
+ FROM '{}' WHERE OwnerID = {} AND BucketName > {} ORDER BY BucketName ASC LIMIT {}";
+
+ /* BucketNames are unique across users. Hence userid/OwnerID is not used as
+ * marker or for ordering here in the below query
+ */
+ static constexpr std::string_view AllQuery = "SELECT \
+ BucketName, Tenant, Marker, BucketID, Size, SizeRounded, CreationTime, \
+ Count, PlacementName, PlacementStorageClass, OwnerID, Flags, Zonegroup, \
+ HasInstanceObj, Quota, RequesterPays, HasWebsite, WebsiteConf, \
+ SwiftVersioning, SwiftVerLocation, \
+ MdsearchConfig, NewBucketInstanceID, ObjectLock, \
+ SyncPolicyInfoGroups, BucketAttrs, BucketVersion, BucketVersionTag, Mtime \
+ FROM '{}' WHERE BucketName > {} ORDER BY BucketName ASC LIMIT {}";
+
+ public:
+ virtual ~ListUserBucketsOp() {}
+
+ static std::string Schema(DBOpPrepareParams &params) {
+ if (params.op.query_str == "all") {
+ return fmt::format(AllQuery, params.bucket_table,
+ params.op.bucket.min_marker,
+ params.op.list_max_count);
+ } else {
+ return fmt::format(Query, params.bucket_table,
+ params.op.user.user_id, params.op.bucket.min_marker,
+ params.op.list_max_count);
+ }
+ }
+};
+
/* Inserts (or fully replaces) an object's head row in the per-bucket
 * object table. */
class PutObjectOp: virtual public DBOp {
  private:
    static constexpr std::string_view Query =
      "INSERT OR REPLACE INTO '{}' \
      (ObjName, ObjInstance, ObjNS, BucketName, ACLs, IndexVer, Tag, \
       Flags, VersionedEpoch, ObjCategory, Etag, Owner, OwnerDisplayName, \
       StorageClass, Appendable, ContentType, IndexHashSource, ObjSize, \
       AccountedSize, Mtime, Epoch, ObjTag, TailTag, WriteTag, FakeTag, \
       ShadowObj, HasData, IsVersioned, VersionNum, PGVer, ZoneShortID, \
       ObjVersion, ObjVersionTag, ObjAttrs, HeadSize, MaxHeadSize, \
       ObjID, TailInstance, HeadPlacementRuleName, HeadPlacementRuleStorageClass, \
       TailPlacementRuleName, TailPlacementStorageClass, \
       ManifestPartObjs, ManifestPartRules, Omap, IsMultipart, MPPartsList, \
       HeadData) \
      VALUES ({}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, \
          {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, \
          {}, {}, {}, \
          {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {})";

  public:
    virtual ~PutObjectOp() {}

    /* Format the insert. 48 positional values after the table name — the
     * order below must match the column list in Query exactly. */
    static std::string Schema(DBOpPrepareParams &params) {
      return fmt::format(Query,
          params.object_table, params.op.obj.obj_name,
          params.op.obj.obj_instance, params.op.obj.obj_ns,
          params.op.bucket.bucket_name, params.op.obj.acls, params.op.obj.index_ver,
          params.op.obj.tag, params.op.obj.flags, params.op.obj.versioned_epoch,
          params.op.obj.obj_category, params.op.obj.etag, params.op.obj.owner,
          params.op.obj.owner_display_name, params.op.obj.storage_class,
          params.op.obj.appendable, params.op.obj.content_type,
          params.op.obj.index_hash_source, params.op.obj.obj_size,
          params.op.obj.accounted_size, params.op.obj.mtime,
          params.op.obj.epoch, params.op.obj.obj_tag, params.op.obj.tail_tag,
          params.op.obj.write_tag, params.op.obj.fake_tag, params.op.obj.shadow_obj,
          params.op.obj.has_data, params.op.obj.is_versioned,
          params.op.obj.version_num,
          params.op.obj.pg_ver, params.op.obj.zone_short_id,
          params.op.obj.obj_version, params.op.obj.obj_version_tag,
          params.op.obj.obj_attrs, params.op.obj.head_size,
          params.op.obj.max_head_size, params.op.obj.obj_id,
          params.op.obj.tail_instance,
          params.op.obj.head_placement_rule_name,
          params.op.obj.head_placement_storage_class,
          params.op.obj.tail_placement_rule_name,
          params.op.obj.tail_placement_storage_class,
          params.op.obj.manifest_part_objs,
          params.op.obj.manifest_part_rules, params.op.obj.omap,
          params.op.obj.is_multipart, params.op.obj.mp_parts,
          params.op.obj.head_data);
    }
};
+
+class DeleteObjectOp: virtual public DBOp {
+ private:
+ static constexpr std::string_view Query =
+ "DELETE from '{}' where BucketName = {} and ObjName = {} and ObjInstance = {}";
+
+ public:
+ virtual ~DeleteObjectOp() {}
+
+ static std::string Schema(DBOpPrepareParams &params) {
+ return fmt::format(Query, params.object_table,
+ params.op.bucket.bucket_name,
+ params.op.obj.obj_name,
+ params.op.obj.obj_instance);
+ }
+};
+
+class GetObjectOp: virtual public DBOp {
+ private:
+ static constexpr std::string_view Query =
+ "SELECT \
+ ObjName, ObjInstance, ObjNS, BucketName, ACLs, IndexVer, Tag, \
+ Flags, VersionedEpoch, ObjCategory, Etag, Owner, OwnerDisplayName, \
+ StorageClass, Appendable, ContentType, IndexHashSource, ObjSize, \
+ AccountedSize, Mtime, Epoch, ObjTag, TailTag, WriteTag, FakeTag, \
+ ShadowObj, HasData, IsVersioned, VersionNum, PGVer, ZoneShortID, \
+ ObjVersion, ObjVersionTag, ObjAttrs, HeadSize, MaxHeadSize, \
+ ObjID, TailInstance, HeadPlacementRuleName, HeadPlacementRuleStorageClass, \
+ TailPlacementRuleName, TailPlacementStorageClass, \
+ ManifestPartObjs, ManifestPartRules, Omap, IsMultipart, MPPartsList, \
+ HeadData from '{}' \
+ where BucketName = {} and ObjName = {} and ObjInstance = {}";
+
+ public:
+ virtual ~GetObjectOp() {}
+
+ static std::string Schema(DBOpPrepareParams &params) {
+ return fmt::format(Query,
+ params.object_table,
+ params.op.bucket.bucket_name,
+ params.op.obj.obj_name,
+ params.op.obj.obj_instance);
+ }
+};
+
+class ListBucketObjectsOp: virtual public DBOp {
+ private:
+ // once we have stats also stored, may have to update this query to join
+ // these two tables.
+ static constexpr std::string_view Query =
+ "SELECT \
+ ObjName, ObjInstance, ObjNS, BucketName, ACLs, IndexVer, Tag, \
+ Flags, VersionedEpoch, ObjCategory, Etag, Owner, OwnerDisplayName, \
+ StorageClass, Appendable, ContentType, IndexHashSource, ObjSize, \
+ AccountedSize, Mtime, Epoch, ObjTag, TailTag, WriteTag, FakeTag, \
+ ShadowObj, HasData, IsVersioned, VersionNum, PGVer, ZoneShortID, \
+ ObjVersion, ObjVersionTag, ObjAttrs, HeadSize, MaxHeadSize, \
+ ObjID, TailInstance, HeadPlacementRuleName, HeadPlacementRuleStorageClass, \
+ TailPlacementRuleName, TailPlacementStorageClass, \
+ ManifestPartObjs, ManifestPartRules, Omap, IsMultipart, MPPartsList, HeadData from '{}' \
+ where BucketName = {} and ObjName >= {} and ObjName LIKE {} ORDER BY ObjName ASC, VersionNum DESC LIMIT {}";
+ public:
+ virtual ~ListBucketObjectsOp() {}
+
+ static std::string Schema(DBOpPrepareParams &params) {
+ /* XXX: Include obj_id, delim */
+ return fmt::format(Query,
+ params.object_table,
+ params.op.bucket.bucket_name,
+ params.op.obj.min_marker,
+ params.op.obj.prefix,
+ params.op.list_max_count);
+ }
+};
+
+#define MAX_VERSIONED_OBJECTS 20
+class ListVersionedObjectsOp: virtual public DBOp {
+ private:
+ // once we have stats also stored, may have to update this query to join
+ // these two tables.
+ static constexpr std::string_view Query =
+ "SELECT \
+ ObjName, ObjInstance, ObjNS, BucketName, ACLs, IndexVer, Tag, \
+ Flags, VersionedEpoch, ObjCategory, Etag, Owner, OwnerDisplayName, \
+ StorageClass, Appendable, ContentType, IndexHashSource, ObjSize, \
+ AccountedSize, Mtime, Epoch, ObjTag, TailTag, WriteTag, FakeTag, \
+ ShadowObj, HasData, IsVersioned, VersionNum, PGVer, ZoneShortID, \
+ ObjVersion, ObjVersionTag, ObjAttrs, HeadSize, MaxHeadSize, \
+ ObjID, TailInstance, HeadPlacementRuleName, HeadPlacementRuleStorageClass, \
+ TailPlacementRuleName, TailPlacementStorageClass, \
+ ManifestPartObjs, ManifestPartRules, Omap, IsMultipart, MPPartsList, \
+ HeadData from '{}' \
+ where BucketName = {} and ObjName = {} ORDER BY VersionNum DESC LIMIT {}";
+ public:
+ virtual ~ListVersionedObjectsOp() {}
+
+ static std::string Schema(DBOpPrepareParams &params) {
+ /* XXX: Include obj_id, delim */
+ return fmt::format(Query,
+ params.object_table,
+ params.op.bucket.bucket_name,
+ params.op.obj.obj_name,
+ params.op.list_max_count);
+ }
+};
+
/* Updates an object head row; the column set is selected by
 * params.op.query_str ("omap", "attrs", "mp", or "meta"). */
class UpdateObjectOp: virtual public DBOp {
  private:
    // Updates Omap
    static constexpr std::string_view OmapQuery =
      "UPDATE '{}' SET Omap = {}, Mtime = {} \
      where BucketName = {} and ObjName = {} and ObjInstance = {}";
    // Updates ObjAttrs
    static constexpr std::string_view AttrsQuery =
      "UPDATE '{}' SET ObjAttrs = {}, Mtime = {}  \
      where BucketName = {} and ObjName = {} and ObjInstance = {}";
    // Updates the multipart parts list
    static constexpr std::string_view MPQuery =
      "UPDATE '{}' SET MPPartsList = {}, Mtime = {}  \
      where BucketName = {} and ObjName = {} and ObjInstance = {}";
    // Full metadata rewrite (everything except the primary-key columns)
    static constexpr std::string_view MetaQuery =
      "UPDATE '{}' SET \
       ObjNS = {}, ACLs = {}, IndexVer = {}, Tag = {}, Flags = {}, VersionedEpoch = {}, \
       ObjCategory = {}, Etag = {}, Owner = {}, OwnerDisplayName = {}, \
       StorageClass = {}, Appendable = {}, ContentType = {}, \
       IndexHashSource = {}, ObjSize = {}, AccountedSize = {}, Mtime = {}, \
       Epoch = {}, ObjTag = {}, TailTag = {}, WriteTag = {}, FakeTag = {}, \
       ShadowObj = {}, HasData = {}, IsVersioned = {}, VersionNum = {}, PGVer = {}, \
       ZoneShortID = {}, ObjVersion = {}, ObjVersionTag = {}, ObjAttrs = {}, \
       HeadSize = {}, MaxHeadSize = {}, ObjID = {}, TailInstance = {}, \
       HeadPlacementRuleName = {}, HeadPlacementRuleStorageClass = {}, \
       TailPlacementRuleName = {}, TailPlacementStorageClass = {}, \
       ManifestPartObjs = {}, ManifestPartRules = {}, Omap = {}, \
       IsMultipart = {}, MPPartsList = {}, HeadData = {} \
       WHERE ObjName = {} and ObjInstance = {} and BucketName = {}";

  public:
    virtual ~UpdateObjectOp() {}

    /* Returns the formatted UPDATE for the selected variant, or "" when
     * query_str matches none (callers treat that as unsupported). The
     * positional arguments must track each query's placeholders exactly;
     * MetaQuery takes 45 SET values followed by the 3 key values. */
    static std::string Schema(DBOpPrepareParams &params) {
      if (params.op.query_str == "omap") {
        return fmt::format(OmapQuery,
            params.object_table, params.op.obj.omap,
            params.op.obj.mtime,
            params.op.bucket.bucket_name,
            params.op.obj.obj_name,
            params.op.obj.obj_instance);
      }
      if (params.op.query_str == "attrs") {
        return fmt::format(AttrsQuery,
            params.object_table, params.op.obj.obj_attrs,
            params.op.obj.mtime,
            params.op.bucket.bucket_name,
            params.op.obj.obj_name,
            params.op.obj.obj_instance);
      }
      if (params.op.query_str == "mp") {
        return fmt::format(MPQuery,
            params.object_table, params.op.obj.mp_parts,
            params.op.obj.mtime,
            params.op.bucket.bucket_name,
            params.op.obj.obj_name,
            params.op.obj.obj_instance);
      }
      if (params.op.query_str == "meta") {
        return fmt::format(MetaQuery,
            params.object_table,
            params.op.obj.obj_ns, params.op.obj.acls, params.op.obj.index_ver,
            params.op.obj.tag, params.op.obj.flags, params.op.obj.versioned_epoch,
            params.op.obj.obj_category, params.op.obj.etag, params.op.obj.owner,
            params.op.obj.owner_display_name, params.op.obj.storage_class,
            params.op.obj.appendable, params.op.obj.content_type,
            params.op.obj.index_hash_source, params.op.obj.obj_size,
            params.op.obj.accounted_size, params.op.obj.mtime,
            params.op.obj.epoch, params.op.obj.obj_tag, params.op.obj.tail_tag,
            params.op.obj.write_tag, params.op.obj.fake_tag, params.op.obj.shadow_obj,
            params.op.obj.has_data, params.op.obj.is_versioned, params.op.obj.version_num,
            params.op.obj.pg_ver, params.op.obj.zone_short_id,
            params.op.obj.obj_version, params.op.obj.obj_version_tag,
            params.op.obj.obj_attrs, params.op.obj.head_size,
            params.op.obj.max_head_size, params.op.obj.obj_id,
            params.op.obj.tail_instance,
            params.op.obj.head_placement_rule_name,
            params.op.obj.head_placement_storage_class,
            params.op.obj.tail_placement_rule_name,
            params.op.obj.tail_placement_storage_class,
            params.op.obj.manifest_part_objs,
            params.op.obj.manifest_part_rules, params.op.obj.omap,
            params.op.obj.is_multipart, params.op.obj.mp_parts,
            params.op.obj.head_data,
            params.op.obj.obj_name, params.op.obj.obj_instance,
            params.op.bucket.bucket_name);
      }
      return "";
    }
};
+
+class PutObjectDataOp: virtual public DBOp {
+ private:
+ static constexpr std::string_view Query =
+ "INSERT OR REPLACE INTO '{}' \
+ (ObjName, ObjInstance, ObjNS, BucketName, ObjID, MultipartPartStr, PartNum, Offset, Size, Mtime, Data) \
+ VALUES ({}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {})";
+
+ public:
+ virtual ~PutObjectDataOp() {}
+
+ static std::string Schema(DBOpPrepareParams &params) {
+ return fmt::format(Query,
+ params.objectdata_table,
+ params.op.obj.obj_name, params.op.obj.obj_instance,
+ params.op.obj.obj_ns,
+ params.op.bucket.bucket_name,
+ params.op.obj.obj_id,
+ params.op.obj_data.multipart_part_str,
+ params.op.obj_data.part_num,
+ params.op.obj_data.offset,
+ params.op.obj_data.size,
+ params.op.obj.mtime,
+ params.op.obj_data.data);
+ }
+};
+
+/* XXX: Recheck if this is really needed */
+class UpdateObjectDataOp: virtual public DBOp {
+ private:
+ static constexpr std::string_view Query =
+ "UPDATE '{}' \
+ SET Mtime = {} WHERE ObjName = {} and ObjInstance = {} and \
+ BucketName = {} and ObjID = {}";
+
+ public:
+ virtual ~UpdateObjectDataOp() {}
+
+ static std::string Schema(DBOpPrepareParams &params) {
+ return fmt::format(Query,
+ params.objectdata_table,
+ params.op.obj.mtime,
+ params.op.obj.obj_name, params.op.obj.obj_instance,
+ params.op.bucket.bucket_name,
+ params.op.obj.obj_id);
+ }
+};
+
+class GetObjectDataOp: virtual public DBOp {
+ private:
+ static constexpr std::string_view Query =
+ "SELECT \
+ ObjName, ObjInstance, ObjNS, BucketName, ObjID, MultipartPartStr, PartNum, Offset, Size, Mtime, Data \
+ from '{}' where BucketName = {} and ObjName = {} and ObjInstance = {} and ObjID = {} ORDER BY MultipartPartStr, PartNum";
+
+ public:
+ virtual ~GetObjectDataOp() {}
+
+ static std::string Schema(DBOpPrepareParams &params) {
+ return fmt::format(Query,
+ params.objectdata_table,
+ params.op.bucket.bucket_name,
+ params.op.obj.obj_name,
+ params.op.obj.obj_instance,
+ params.op.obj.obj_id);
+ }
+};
+
+class DeleteObjectDataOp: virtual public DBOp {
+ private:
+ static constexpr std::string_view Query =
+ "DELETE from '{}' where BucketName = {} and ObjName = {} and ObjInstance = {} and ObjID = {}";
+
+ public:
+ virtual ~DeleteObjectDataOp() {}
+
+ static std::string Schema(DBOpPrepareParams &params) {
+ return fmt::format(Query,
+ params.objectdata_table,
+ params.op.bucket.bucket_name,
+ params.op.obj.obj_name,
+ params.op.obj.obj_instance,
+ params.op.obj.obj_id);
+ }
+};
+
/* Garbage-collection op: deletes objectdata rows whose (ObjName,
 * ObjInstance, ObjID) have no matching head row in the object table and
 * whose Mtime is older than the supplied cutoff.
 *
 * NOTE(review): the outer NOT IN tuple omits BucketName even though the
 * inner join matches on it — confirm that name collisions across buckets
 * cannot cause a stale stripe to be retained. */
class DeleteStaleObjectDataOp: virtual public DBOp {
  private:
    static constexpr std::string_view Query =
      "DELETE from '{}' WHERE (ObjName, ObjInstance, ObjID) NOT IN (SELECT s.ObjName, s.ObjInstance, s.ObjID from '{}' as s INNER JOIN '{}' USING (ObjName, BucketName, ObjInstance, ObjID)) and Mtime < {}";

  public:
    virtual ~DeleteStaleObjectDataOp() {}

    /* Format the purge: objectdata table (delete target), objectdata table
     * again (join source), object table, then the mtime cutoff. */
    static std::string Schema(DBOpPrepareParams &params) {
      return fmt::format(Query,
          params.objectdata_table,
          params.objectdata_table,
          params.object_table,
          params.op.obj.mtime);
    }
};
+
+class InsertLCEntryOp: virtual public DBOp {
+ private:
+ static constexpr std::string_view Query =
+ "INSERT OR REPLACE INTO '{}' \
+ (LCIndex, BucketName, StartTime, Status) \
+ VALUES ({}, {}, {}, {})";
+
+ public:
+ virtual ~InsertLCEntryOp() {}
+
+ static std::string Schema(DBOpPrepareParams &params) {
+ return fmt::format(Query, params.lc_entry_table,
+ params.op.lc_entry.index, params.op.lc_entry.bucket_name,
+ params.op.lc_entry.start_time, params.op.lc_entry.status);
+ }
+};
+
+class RemoveLCEntryOp: virtual public DBOp {
+ private:
+ static constexpr std::string_view Query =
+ "DELETE from '{}' where LCIndex = {} and BucketName = {}";
+
+ public:
+ virtual ~RemoveLCEntryOp() {}
+
+ static std::string Schema(DBOpPrepareParams &params) {
+ return fmt::format(Query, params.lc_entry_table,
+ params.op.lc_entry.index, params.op.lc_entry.bucket_name);
+ }
+};
+
+class GetLCEntryOp: virtual public DBOp {
+ private:
+ static constexpr std::string_view Query = "SELECT \
+ LCIndex, BucketName, StartTime, Status \
+ from '{}' where LCIndex = {} and BucketName = {}";
+ static constexpr std::string_view NextQuery = "SELECT \
+ LCIndex, BucketName, StartTime, Status \
+ from '{}' where LCIndex = {} and BucketName > {} ORDER BY BucketName ASC";
+
+ public:
+ virtual ~GetLCEntryOp() {}
+
+ static std::string Schema(DBOpPrepareParams &params) {
+ if (params.op.query_str == "get_next_entry") {
+ return fmt::format(NextQuery, params.lc_entry_table,
+ params.op.lc_entry.index, params.op.lc_entry.bucket_name);
+ }
+ // default
+ return fmt::format(Query, params.lc_entry_table,
+ params.op.lc_entry.index, params.op.lc_entry.bucket_name);
+ }
+};
+
+class ListLCEntriesOp: virtual public DBOp {
+ private:
+ static constexpr std::string_view Query = "SELECT \
+ LCIndex, BucketName, StartTime, Status \
+ FROM '{}' WHERE LCIndex = {} AND BucketName > {} ORDER BY BucketName ASC LIMIT {}";
+
+ public:
+ virtual ~ListLCEntriesOp() {}
+
+ static std::string Schema(DBOpPrepareParams &params) {
+ return fmt::format(Query, params.lc_entry_table,
+ params.op.lc_entry.index, params.op.lc_entry.min_marker,
+ params.op.list_max_count);
+ }
+};
+
+class InsertLCHeadOp: virtual public DBOp {
+ private:
+ static constexpr std::string_view Query =
+ "INSERT OR REPLACE INTO '{}' \
+ (LCIndex, Marker, StartDate) \
+ VALUES ({}, {}, {})";
+
+ public:
+ virtual ~InsertLCHeadOp() {}
+
+ static std::string Schema(DBOpPrepareParams &params) {
+ return fmt::format(Query, params.lc_head_table,
+ params.op.lc_head.index, params.op.lc_head.marker,
+ params.op.lc_head.start_date);
+ }
+};
+
+class RemoveLCHeadOp: virtual public DBOp {
+ private:
+ static constexpr std::string_view Query =
+ "DELETE from '{}' where LCIndex = {}";
+
+ public:
+ virtual ~RemoveLCHeadOp() {}
+
+ static std::string Schema(DBOpPrepareParams &params) {
+ return fmt::format(Query, params.lc_head_table,
+ params.op.lc_head.index);
+ }
+};
+
+class GetLCHeadOp: virtual public DBOp {
+ private:
+ static constexpr std::string_view Query = "SELECT \
+ LCIndex, Marker, StartDate \
+ from '{}' where LCIndex = {}";
+
+ public:
+ virtual ~GetLCHeadOp() {}
+
+ static std::string Schema(DBOpPrepareParams &params) {
+ return fmt::format(Query, params.lc_head_table,
+ params.op.lc_head.index);
+ }
+};
+
/* taken from rgw_rados.h::RGWOLHInfo */
/* Object-logical-head info: the target object an OLH points at, plus a
 * removed flag. Serialized with ceph's standard encode/decode macros;
 * field order in encode() and decode() must stay identical. */
struct DBOLHInfo {
  rgw_obj target;   // object the logical head currently resolves to
  bool removed;     // true once the head has been logically deleted
  DBOLHInfo() : removed(false) {}
  // Versioned encoder, format version 1 (compat 1).
  void encode(bufferlist& bl) const {
    ENCODE_START(1, 1, bl);
    encode(target, bl);
    encode(removed, bl);
    ENCODE_FINISH(bl);
  }

  // Decoder matching encode(); fields must be read in the same order.
  void decode(bufferlist::const_iterator& bl) {
    DECODE_START(1, bl);
    decode(target, bl);
    decode(removed, bl);
    DECODE_FINISH(bl);
  }
};
WRITE_CLASS_ENCODER(DBOLHInfo)
+
+/* Core abstraction for the dbstore backend. Owns the database name,
+ * the per-store table names derived from it, and the raw db handle;
+ * exposes generic user/bucket/object/lifecycle operations. Backend
+ * specifics (open/close, statement preparation, blob limits) come
+ * from the virtual hooks below. */
+class DB {
+  private:
+    const std::string db_name;
+    rgw::sal::Driver* driver;
+    const std::string user_table;
+    const std::string bucket_table;
+    const std::string quota_table;
+    const std::string lc_head_table;
+    const std::string lc_entry_table;
+    // bucket name -> per-bucket ObjectOp cache; guarded by mtx below
+    static std::map<std::string, class ObjectOp*> objectmap;
+
+  protected:
+    void *db;
+    CephContext *cct;
+    const DoutPrefix dp;
+    uint64_t max_bucket_id = 0;
+    // XXX: default ObjStripeSize or ObjChunk size - 4M, make them configurable?
+    uint64_t ObjHeadSize = 1024; /* 1K - default head data size */
+    // NOTE(review): this initializer runs while the DB base subobject
+    // is constructed, so the virtual call resolves to
+    // DB::get_blob_limit() (which returns 0) and (0 - 1000) wraps to a
+    // huge uint64_t -- confirm derived stores re-assign ObjChunkSize.
+    uint64_t ObjChunkSize = (get_blob_limit() - 1000); /* 1000 to accommodate other fields */
+    // Below mutex is to protect objectmap and other shared
+    // objects if any.
+    std::mutex mtx;
+
+  public:
+    // Named-store constructor; all table names are derived from db_name.
+    DB(std::string db_name, CephContext *_cct) : db_name(db_name),
+    user_table(db_name+"_user_table"),
+    bucket_table(db_name+"_bucket_table"),
+    quota_table(db_name+"_quota_table"),
+    lc_head_table(db_name+"_lc_head_table"),
+    lc_entry_table(db_name+"_lc_entry_table"),
+    cct(_cct),
+    dp(_cct, ceph_subsys_rgw, "rgw DBStore backend: ")
+    {}
+    /* DB() {}*/
+
+    // Convenience constructor using the "default_db" store name.
+    DB(CephContext *_cct) : db_name("default_db"),
+    user_table(db_name+"_user_table"),
+    bucket_table(db_name+"_bucket_table"),
+    quota_table(db_name+"_quota_table"),
+    lc_head_table(db_name+"_lc_head_table"),
+    lc_entry_table(db_name+"_lc_entry_table"),
+    cct(_cct),
+    dp(_cct, ceph_subsys_rgw, "rgw DBStore backend: ")
+    {}
+    virtual ~DB() {}
+
+    const std::string getDBname() { return db_name; }
+    const std::string getDBfile() { return db_name + ".db"; }
+    const std::string getUserTable() { return user_table; }
+    const std::string getBucketTable() { return bucket_table; }
+    const std::string getQuotaTable() { return quota_table; }
+    const std::string getLCHeadTable() { return lc_head_table; }
+    const std::string getLCEntryTable() { return lc_entry_table; }
+    // Per-bucket table/view/trigger names, all "<db>_<bucket>_<suffix>".
+    const std::string getObjectTable(std::string bucket) {
+      return db_name+"_"+bucket+"_object_table"; }
+    const std::string getObjectDataTable(std::string bucket) {
+      return db_name+"_"+bucket+"_objectdata_table"; }
+    const std::string getObjectView(std::string bucket) {
+      return db_name+"_"+bucket+"_object_view"; }
+    const std::string getObjectTrigger(std::string bucket) {
+      return db_name+"_"+bucket+"_object_trigger"; }
+
+    std::map<std::string, class ObjectOp*> getObjectMap();
+
+    struct DBOps dbops; // DB operations, make it private?
+
+    void set_driver(rgw::sal::Driver* _driver) {
+      driver = _driver;
+    }
+
+    void set_context(CephContext *_cct) {
+      cct = _cct;
+    }
+
+    CephContext *ctx() { return cct; }
+    const DoutPrefixProvider *get_def_dpp() { return &dp; }
+
+    int Initialize(std::string logfile, int loglevel);
+    int Destroy(const DoutPrefixProvider *dpp);
+    int LockInit(const DoutPrefixProvider *dpp);
+    int LockDestroy(const DoutPrefixProvider *dpp);
+    int Lock(const DoutPrefixProvider *dpp);
+    int Unlock(const DoutPrefixProvider *dpp);
+
+    int InitializeParams(const DoutPrefixProvider *dpp, DBOpParams *params);
+    int ProcessOp(const DoutPrefixProvider *dpp, std::string_view Op, DBOpParams *params);
+    std::shared_ptr<class DBOp> getDBOp(const DoutPrefixProvider *dpp, std::string_view Op, const DBOpParams *params);
+    int objectmapInsert(const DoutPrefixProvider *dpp, std::string bucket, class ObjectOp* ptr);
+    int objectmapDelete(const DoutPrefixProvider *dpp, std::string bucket);
+
+    // Backend hooks; the base implementations are inert placeholders.
+    virtual uint64_t get_blob_limit() { return 0; };
+    virtual void *openDB(const DoutPrefixProvider *dpp) { return NULL; }
+    virtual int closeDB(const DoutPrefixProvider *dpp) { return 0; }
+    virtual int createTables(const DoutPrefixProvider *dpp) { return 0; }
+    virtual int InitializeDBOps(const DoutPrefixProvider *dpp) { return 0; }
+    virtual int InitPrepareParams(const DoutPrefixProvider *dpp,
+                                  DBOpPrepareParams &p_params,
+                                  DBOpParams* params) = 0;
+    virtual int createLCTables(const DoutPrefixProvider *dpp) = 0;
+
+    virtual int ListAllBuckets(const DoutPrefixProvider *dpp, DBOpParams *params) = 0;
+    virtual int ListAllUsers(const DoutPrefixProvider *dpp, DBOpParams *params) = 0;
+    virtual int ListAllObjects(const DoutPrefixProvider *dpp, DBOpParams *params) = 0;
+
+    int get_user(const DoutPrefixProvider *dpp,
+        const std::string& query_str, const std::string& query_str_val,
+        RGWUserInfo& uinfo, std::map<std::string, bufferlist> *pattrs,
+        RGWObjVersionTracker *pobjv_tracker);
+    int store_user(const DoutPrefixProvider *dpp,
+        RGWUserInfo& uinfo, bool exclusive, std::map<std::string, bufferlist> *pattrs,
+        RGWObjVersionTracker *pobjv_tracker, RGWUserInfo* pold_info);
+    int remove_user(const DoutPrefixProvider *dpp,
+        RGWUserInfo& uinfo, RGWObjVersionTracker *pobjv_tracker);
+    int get_bucket_info(const DoutPrefixProvider *dpp, const std::string& query_str,
+        const std::string& query_str_val,
+        RGWBucketInfo& info, rgw::sal::Attrs* pattrs, ceph::real_time* pmtime,
+        obj_version* pbucket_version);
+    int create_bucket(const DoutPrefixProvider *dpp,
+        const RGWUserInfo& owner, rgw_bucket& bucket,
+        const std::string& zonegroup_id,
+        const rgw_placement_rule& placement_rule,
+        const std::string& swift_ver_location,
+        const RGWQuotaInfo * pquota_info,
+        std::map<std::string, bufferlist>& attrs,
+        RGWBucketInfo& info,
+        obj_version *pobjv,
+        obj_version *pep_objv,
+        real_time creation_time,
+        rgw_bucket *pmaster_bucket,
+        uint32_t *pmaster_num_shards,
+        optional_yield y,
+        bool exclusive);
+
+    // NOTE(review): truncates the uint64_t counter to int -- confirm
+    // bucket ids cannot exceed INT_MAX in practice.
+    int next_bucket_id() { return ++max_bucket_id; };
+
+    int remove_bucket(const DoutPrefixProvider *dpp, const RGWBucketInfo info);
+    int list_buckets(const DoutPrefixProvider *dpp, const std::string& query_str,
+        rgw_user& user,
+        const std::string& marker,
+        const std::string& end_marker,
+        uint64_t max,
+        bool need_stats,
+        RGWUserBuckets *buckets,
+        bool *is_truncated);
+    int update_bucket(const DoutPrefixProvider *dpp, const std::string& query_str,
+        RGWBucketInfo& info, bool exclusive,
+        const rgw_user* powner_id, std::map<std::string, bufferlist>* pattrs,
+        ceph::real_time* pmtime, RGWObjVersionTracker* pobjv);
+
+    uint64_t get_max_head_size() { return ObjHeadSize; }
+    uint64_t get_max_chunk_size() { return ObjChunkSize; }
+    void gen_rand_obj_instance_name(rgw_obj_key *target_key);
+
+    // db raw obj string is of format -
+    // "<bucketname>_<objname>_<objinstance>_<multipart-part-str>_<partnum>"
+    // NOTE(review): only five indexed placeholders here, but to_oid()
+    // passes six values (obj_id is the 4th); fmt silently drops the
+    // trailing partnum while from_oid() parses six '_'-separated
+    // fields -- verify round-tripping of oids is actually exercised.
+    static constexpr std::string_view raw_obj_oid = "{0}_{1}_{2}_{3}_{4}";
+
+    std::string to_oid(std::string_view bucket, std::string_view obj_name,
+                       std::string_view obj_instance, std::string_view obj_id,
+                       std::string_view mp_str, uint64_t partnum) {
+      return fmt::format(raw_obj_oid, bucket, obj_name, obj_instance, obj_id, mp_str, partnum);
+    }
+    // Parse an oid back into its components. Always returns 0; stoi()
+    // throws (and result[] indexing is unchecked) if the oid does not
+    // have at least six '_'-separated fields.
+    int from_oid(const std::string& oid, std::string& bucket, std::string& obj_name, std::string& obj_id,
+                 std::string& obj_instance,
+                 std::string& mp_str, uint64_t& partnum) {
+      // TODO: use ceph::split() from common/split.h
+      // XXX: doesn't this break if obj_name has underscores in it?
+      std::vector<std::string> result;
+      boost::split(result, oid, boost::is_any_of("_"));
+      bucket = result[0];
+      obj_name = result[1];
+      obj_instance = result[2];
+      obj_id = result[3];
+      mp_str = result[4];
+      partnum = stoi(result[5]);
+
+      return 0;
+    }
+
+    // Non-owning handle naming one stored object part and the tables
+    // that hold its metadata and data chunks.
+    struct raw_obj {
+      DB* db;
+
+      std::string bucket_name;
+      std::string obj_name;
+      std::string obj_instance;
+      std::string obj_ns;
+      std::string obj_id;
+      std::string multipart_part_str;
+      uint64_t part_num;
+
+      std::string obj_table;
+      std::string obj_data_table;
+
+      raw_obj(DB* _db) {
+        db = _db;
+      }
+
+      // NOTE(review): this ctor builds table names as
+      // "<bucket>.object.table", whereas the oid ctor below uses
+      // getObjectTable()'s "<db>_<bucket>_object_table" -- confirm
+      // which convention callers rely on.
+      raw_obj(DB* _db, std::string& _bname, std::string& _obj_name, std::string& _obj_instance,
+              std::string& _obj_ns, std::string& _obj_id, std::string _mp_part_str, int _part_num) {
+        db = _db;
+        bucket_name = _bname;
+        obj_name = _obj_name;
+        obj_instance = _obj_instance;
+        obj_ns = _obj_ns;
+        obj_id = _obj_id;
+        multipart_part_str = _mp_part_str;
+        part_num = _part_num;
+
+        obj_table = bucket_name+".object.table";
+        obj_data_table = bucket_name+".objectdata.table";
+      }
+
+      // NOTE(review): obj_instance/obj_id are passed in swapped
+      // positions relative to from_oid()'s parameter order (obj_id
+      // comes 4th there), and from_oid() always returns 0 so the
+      // r < 0 fallback below is dead -- verify intended behavior.
+      raw_obj(DB* _db, std::string& oid) {
+        int r;
+
+        db = _db;
+        r = db->from_oid(oid, bucket_name, obj_name, obj_instance, obj_id, multipart_part_str,
+            part_num);
+        if (r < 0) {
+          multipart_part_str = "0.0";
+          part_num = 0;
+        }
+
+        obj_table = db->getObjectTable(bucket_name);
+        obj_data_table = db->getObjectDataTable(bucket_name);
+      }
+
+      int InitializeParamsfromRawObj (const DoutPrefixProvider *dpp, DBOpParams* params);
+
+      int read(const DoutPrefixProvider *dpp, int64_t ofs, uint64_t end, bufferlist& bl);
+      int write(const DoutPrefixProvider *dpp, int64_t ofs, int64_t write_ofs, uint64_t len, bufferlist& bl);
+    };
+
+    // Background thread that periodically deletes stale object data;
+    // signal_stop() wakes it early for shutdown.
+    class GC : public Thread {
+      const DoutPrefixProvider *dpp;
+      DB *db;
+      /* Default time interval for GC
+       * XXX: Make below options configurable
+       *
+       * gc_interval: The time between successive gc thread runs
+       * gc_obj_min_wait: Min. time to wait before deleting any data post its creation.
+       *
+       */
+      std::mutex mtx;
+      std::condition_variable cv;
+      bool stop_signalled = false;
+      uint32_t gc_interval = 24*60*60; //sec ; default: 24*60*60
+      uint32_t gc_obj_min_wait = 60*60; //60*60sec default
+      std::string bucket_marker;
+      std::string user_marker;
+
+      public:
+        GC(const DoutPrefixProvider *_dpp, DB* _db) :
+          dpp(_dpp), db(_db) {}
+
+        void *entry() override;
+
+        // Request shutdown; safe to call from another thread.
+        void signal_stop() {
+          std::lock_guard<std::mutex> lk_guard(mtx);
+          stop_signalled = true;
+          cv.notify_one();
+        }
+
+        friend class DB;
+    };
+    std::unique_ptr<DB::GC> gc_worker;
+
+    // Thin wrapper pairing the store with one bucket's metadata.
+    class Bucket {
+      friend class DB;
+      DB* store;
+
+      RGWBucketInfo bucket_info;
+
+      public:
+        Bucket(DB *_store, const RGWBucketInfo& _binfo) : store(_store), bucket_info(_binfo) {}
+        DB *get_store() { return store; }
+        rgw_bucket& get_bucket() { return bucket_info.bucket; }
+        RGWBucketInfo& get_bucket_info() { return bucket_info; }
+
+        class List {
+          protected:
+            // absolute maximum number of objects that
+            // list_objects_(un)ordered can return
+            static constexpr int64_t bucket_list_objects_absolute_max = 25000;
+
+            DB::Bucket *target;
+            rgw_obj_key next_marker;
+
+          public:
+
+            struct Params {
+              std::string prefix;
+              std::string delim;
+              rgw_obj_key marker;
+              rgw_obj_key end_marker;
+              std::string ns;
+              bool enforce_ns;
+              RGWAccessListFilter* access_list_filter;
+              RGWBucketListNameFilter force_check_filter;
+              bool list_versions;
+              bool allow_unordered;
+
+              Params() :
+                enforce_ns(true),
+                access_list_filter(nullptr),
+                list_versions(false),
+                allow_unordered(false)
+              {}
+            } params;
+
+            explicit List(DB::Bucket *_target) : target(_target) {}
+
+            /* XXX: Handle ordered and unordered separately.
+             * For now returning only ordered entries */
+            int list_objects(const DoutPrefixProvider *dpp, int64_t max,
+                             std::vector<rgw_bucket_dir_entry> *result,
+                             std::map<std::string, bool> *common_prefixes, bool *is_truncated);
+            rgw_obj_key& get_next_marker() {
+              return next_marker;
+            }
+        };
+    };
+
+    // Per-object operation context: read/write/delete state for one
+    // (bucket, object) pair.
+    class Object {
+      friend class DB;
+      DB* store;
+
+      RGWBucketInfo bucket_info;
+      rgw_obj obj;
+
+      RGWObjState obj_state;
+      std::string obj_id;
+
+      bool versioning_disabled;
+
+      bool bs_initialized;
+
+      public:
+        Object(DB *_store, const RGWBucketInfo& _bucket_info, const rgw_obj& _obj) : store(_store), bucket_info(_bucket_info),
+        obj(_obj),
+        versioning_disabled(false),
+        bs_initialized(false) {}
+
+        Object(DB *_store, const RGWBucketInfo& _bucket_info, const rgw_obj& _obj, const std::string& _obj_id) : store(_store), bucket_info(_bucket_info), obj(_obj), obj_id(_obj_id) {}
+
+        struct Read {
+          DB::Object *source;
+
+          struct GetObjState {
+            rgw_obj obj;
+          } state;
+
+          struct ConditionParams {
+            const ceph::real_time *mod_ptr;
+            const ceph::real_time *unmod_ptr;
+            bool high_precision_time;
+            uint32_t mod_zone_id;
+            uint64_t mod_pg_ver;
+            const char *if_match;
+            const char *if_nomatch;
+
+            ConditionParams() :
+              mod_ptr(NULL), unmod_ptr(NULL), high_precision_time(false), mod_zone_id(0), mod_pg_ver(0),
+              if_match(NULL), if_nomatch(NULL) {}
+          } conds;
+
+          struct Params {
+            ceph::real_time *lastmod;
+            uint64_t *obj_size;
+            std::map<std::string, bufferlist> *attrs;
+            rgw_obj *target_obj;
+
+            Params() : lastmod(nullptr), obj_size(nullptr), attrs(nullptr),
+            target_obj(nullptr) {}
+          } params;
+
+          explicit Read(DB::Object *_source) : source(_source) {}
+
+          int prepare(const DoutPrefixProvider *dpp);
+          static int range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end);
+          int read(int64_t ofs, int64_t end, bufferlist& bl, const DoutPrefixProvider *dpp);
+          int iterate(const DoutPrefixProvider *dpp, int64_t ofs, int64_t end, RGWGetDataCB *cb);
+          int get_attr(const DoutPrefixProvider *dpp, const char *name, bufferlist& dest);
+        };
+
+        struct Write {
+          DB::Object *target;
+          RGWObjState obj_state;
+          std::string mp_part_str = "0.0"; // multipart num
+
+          struct MetaParams {
+            ceph::real_time *mtime;
+            std::map<std::string, bufferlist>* rmattrs;
+            const bufferlist *data;
+            RGWObjManifest *manifest;
+            const std::string *ptag;
+            std::list<rgw_obj_index_key> *remove_objs;
+            ceph::real_time set_mtime;
+            rgw_user owner;
+            RGWObjCategory category;
+            int flags;
+            const char *if_match;
+            const char *if_nomatch;
+            std::optional<uint64_t> olh_epoch;
+            ceph::real_time delete_at;
+            bool canceled;
+            const std::string *user_data;
+            rgw_zone_set *zones_trace;
+            bool modify_tail;
+            bool completeMultipart;
+            bool appendable;
+
+            MetaParams() : mtime(NULL), rmattrs(NULL), data(NULL), manifest(NULL), ptag(NULL),
+            remove_objs(NULL), category(RGWObjCategory::Main), flags(0),
+            if_match(NULL), if_nomatch(NULL), canceled(false), user_data(nullptr), zones_trace(nullptr),
+            modify_tail(false),  completeMultipart(false), appendable(false) {}
+          } meta;
+
+          explicit Write(DB::Object *_target) : target(_target) {}
+
+          void set_mp_part_str(std::string _mp_part_str) { mp_part_str = _mp_part_str;}
+          int prepare(const DoutPrefixProvider* dpp);
+          int write_data(const DoutPrefixProvider* dpp,
+                         bufferlist& data, uint64_t ofs);
+          int _do_write_meta(const DoutPrefixProvider *dpp,
+              uint64_t size, uint64_t accounted_size,
+              std::map<std::string, bufferlist>& attrs,
+              bool assume_noent, bool modify_tail);
+          int write_meta(const DoutPrefixProvider *dpp, uint64_t size,
+              uint64_t accounted_size, std::map<std::string, bufferlist>& attrs);
+        };
+
+        struct Delete {
+          DB::Object *target;
+
+          struct DeleteParams {
+            rgw_user bucket_owner;
+            int versioning_status;
+            ACLOwner obj_owner; /* needed for creation of deletion marker */
+            uint64_t olh_epoch;
+            std::string marker_version_id;
+            uint32_t bilog_flags;
+            std::list<rgw_obj_index_key> *remove_objs;
+            ceph::real_time expiration_time;
+            ceph::real_time unmod_since;
+            ceph::real_time mtime; /* for setting delete marker mtime */
+            bool high_precision_time;
+            rgw_zone_set *zones_trace;
+            bool abortmp;
+            uint64_t parts_accounted_size;
+
+            DeleteParams() : versioning_status(0), olh_epoch(0), bilog_flags(0), remove_objs(NULL), high_precision_time(false), zones_trace(nullptr), abortmp(false), parts_accounted_size(0) {}
+          } params;
+
+          struct DeleteResult {
+            bool delete_marker;
+            std::string version_id;
+
+            DeleteResult() : delete_marker(false) {}
+          } result;
+
+          explicit Delete(DB::Object *_target) : target(_target) {}
+
+          int delete_obj(const DoutPrefixProvider *dpp);
+          int delete_obj_impl(const DoutPrefixProvider *dpp, DBOpParams& del_params);
+          int create_dm(const DoutPrefixProvider *dpp, DBOpParams& del_params);
+        };
+
+        /* XXX: the parameters may be subject to change. All we need is bucket name
+         * & obj name,instance - keys */
+        int get_object_impl(const DoutPrefixProvider *dpp, DBOpParams& params);
+        int get_obj_state(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info,
+                          const rgw_obj& obj,
+                          bool follow_olh, RGWObjState **state);
+        int get_state(const DoutPrefixProvider *dpp, RGWObjState **pstate, bool follow_olh);
+        int list_versioned_objects(const DoutPrefixProvider *dpp,
+                                   std::list<rgw_bucket_dir_entry>& list_entries);
+
+        DB *get_store() { return store; }
+        rgw_obj& get_obj() { return obj; }
+        RGWBucketInfo& get_bucket_info() { return bucket_info; }
+
+        int InitializeParamsfromObject(const DoutPrefixProvider *dpp, DBOpParams* params);
+        int set_attrs(const DoutPrefixProvider *dpp, std::map<std::string, bufferlist>& setattrs,
+                      std::map<std::string, bufferlist>* rmattrs);
+        int transition(const DoutPrefixProvider *dpp,
+                       const rgw_placement_rule& rule, const real_time& mtime,
+                       uint64_t olh_epoch);
+        int obj_omap_set_val_by_key(const DoutPrefixProvider *dpp, const std::string& key, bufferlist& val, bool must_exist);
+        int obj_omap_get_vals_by_keys(const DoutPrefixProvider *dpp, const std::string& oid,
+                                      const std::set<std::string>& keys,
+                                      std::map<std::string, bufferlist>* vals);
+        int obj_omap_get_all(const DoutPrefixProvider *dpp, std::map<std::string, bufferlist> *m);
+        int obj_omap_get_vals(const DoutPrefixProvider *dpp, const std::string& marker, uint64_t count,
+                              std::map<std::string, bufferlist> *m, bool* pmore);
+        using iterate_obj_cb = int (*)(const DoutPrefixProvider*, const raw_obj&, off_t, off_t,
+                                       bool, RGWObjState*, void*);
+        int add_mp_part(const DoutPrefixProvider *dpp, RGWUploadPartInfo info);
+        int get_mp_parts_list(const DoutPrefixProvider *dpp, std::list<RGWUploadPartInfo>& info);
+
+        int iterate_obj(const DoutPrefixProvider *dpp,
+            const RGWBucketInfo& bucket_info, const rgw_obj& obj,
+            off_t ofs, off_t end, uint64_t max_chunk_size,
+            iterate_obj_cb cb, void *arg);
+    };
+    int get_obj_iterate_cb(const DoutPrefixProvider *dpp,
+        const raw_obj& read_obj, off_t obj_ofs,
+        off_t len, bool is_head_obj,
+        RGWObjState *astate, void *arg);
+
+    // Lifecycle (LC) entry/head accessors backing rgw::sal::Lifecycle.
+    int get_entry(const std::string& oid, const std::string& marker,
+                  std::unique_ptr<rgw::sal::Lifecycle::LCEntry>* entry);
+    int get_next_entry(const std::string& oid, const std::string& marker,
+                       std::unique_ptr<rgw::sal::Lifecycle::LCEntry>* entry);
+    int set_entry(const std::string& oid, rgw::sal::Lifecycle::LCEntry& entry);
+    int list_entries(const std::string& oid, const std::string& marker,
+                     uint32_t max_entries, std::vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>>& entries);
+    int rm_entry(const std::string& oid, rgw::sal::Lifecycle::LCEntry& entry);
+    int get_head(const std::string& oid, std::unique_ptr<rgw::sal::Lifecycle::LCHead>* head);
+    int put_head(const std::string& oid, rgw::sal::Lifecycle::LCHead& head);
+    int delete_stale_objs(const DoutPrefixProvider *dpp, const std::string& bucket,
+                          uint32_t min_wait);
+    int createGC(const DoutPrefixProvider *_dpp);
+    int stopGC();
+};
+
+// Callback context threaded through object-read iteration: the store,
+// the client data callback, and the running output offset.
+struct db_get_obj_data {
+  DB* store;
+  RGWGetDataCB* client_cb = nullptr;
+  uint64_t offset; // next offset to write to client
+
+  db_get_obj_data(DB* db, RGWGetDataCB* cb, uint64_t offset) :
+    store(db), client_cb(cb), offset(offset) {}
+  ~db_get_obj_data() {}
+};
+
+} } // namespace rgw::store
diff --git a/src/rgw/driver/dbstore/common/dbstore_log.h b/src/rgw/driver/dbstore/common/dbstore_log.h
new file mode 100644
index 000000000..416508369
--- /dev/null
+++ b/src/rgw/driver/dbstore/common/dbstore_log.h
@@ -0,0 +1,15 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <cerrno>
+#include <cstdlib>
+#include <string>
+#include <cstdio>
+#include <iostream>
+#include <fstream>
+#include "common/dout.h"
+
+// Route all dbstore debug output through a common "rgw dbstore" prefix.
+#undef dout_prefix
+#define dout_prefix *_dout << "rgw dbstore: "
diff --git a/src/rgw/driver/dbstore/config/sqlite.cc b/src/rgw/driver/dbstore/config/sqlite.cc
new file mode 100644
index 000000000..a1b217735
--- /dev/null
+++ b/src/rgw/driver/dbstore/config/sqlite.cc
@@ -0,0 +1,2070 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <charconv>
+#include <initializer_list>
+#include <map>
+
+#include <fmt/format.h>
+
+#include <sqlite3.h>
+
+#include "include/buffer.h"
+#include "include/encoding.h"
+#include "common/dout.h"
+#include "common/random_string.h"
+#include "rgw_zone.h"
+
+#include "common/connection_pool.h"
+#include "sqlite/connection.h"
+#include "sqlite/error.h"
+#include "sqlite/statement.h"
+#include "sqlite_schema.h"
+#include "sqlite.h"
+
+#define dout_subsys ceph_subsys_rgw_dbstore
+
+namespace rgw::dbstore::config {
+
+// DoutPrefixPipe that prepends a fixed string (typically the operation
+// name) to every log line emitted under dout_subsys.
+struct Prefix : DoutPrefixPipe {
+  std::string_view prefix; // non-owning; caller's string must outlive this
+  Prefix(const DoutPrefixProvider& dpp, std::string_view prefix)
+    : DoutPrefixPipe(dpp), prefix(prefix) {}
+  unsigned get_subsys() const override { return dout_subsys; }
+  void add_prefix(std::ostream& out) const override {
+    out << prefix;
+  }
+};
+
+namespace {
+
+// parameter names for prepared statement bindings
+static constexpr const char* P1 = ":1";
+static constexpr const char* P2 = ":2";
+static constexpr const char* P3 = ":3";
+static constexpr const char* P4 = ":4";
+static constexpr const char* P5 = ":5";
+static constexpr const char* P6 = ":6";
+
+
+// Read up to entries.size() text rows into 'result'. A short read
+// means end of listing (clear the continuation token); a full page
+// uses the last entry as the marker for the next page.
+void read_text_rows(const DoutPrefixProvider* dpp,
+                    const sqlite::stmt_execution& stmt,
+                    std::span<std::string> entries,
+                    sal::ListResult<std::string>& result)
+{
+  result.entries = sqlite::read_text_rows(dpp, stmt, entries);
+  if (result.entries.size() < entries.size()) { // end of listing
+    result.next.clear();
+  } else {
+    result.next = result.entries.back();
+  }
+}
+
+// Realm row plus the (VersionNumber, Tag) pair used for optimistic
+// concurrency control on writes.
+struct RealmRow {
+  RGWRealm info;
+  int ver;
+  std::string tag;
+};
+
+// Column indices must match the realm SELECT statements in
+// sqlite_schema.h.
+void read_realm_row(const sqlite::stmt_execution& stmt, RealmRow& row)
+{
+  row.info.id = sqlite::column_text(stmt, 0);
+  row.info.name = sqlite::column_text(stmt, 1);
+  row.info.current_period = sqlite::column_text(stmt, 2);
+  row.info.epoch = sqlite::column_int(stmt, 3);
+  row.ver = sqlite::column_int(stmt, 4);
+  row.tag = sqlite::column_text(stmt, 5);
+}
+
+void read_period_row(const sqlite::stmt_execution& stmt, RGWPeriod& row)
+{
+  // just read the Data column and decode everything else from that
+  std::string data = sqlite::column_text(stmt, 3);
+
+  // static_from_string() aliases data's buffer without copying; data
+  // remains in scope for the whole decode, so this is safe here.
+  bufferlist bl = bufferlist::static_from_string(data);
+  auto p = bl.cbegin();
+  decode(row, p);
+}
+
+struct ZoneGroupRow {
+  RGWZoneGroup info;
+  int ver;
+  std::string tag;
+};
+
+// Decode the zonegroup from its Data column; ver/tag come from the
+// trailing VersionNumber/Tag columns.
+void read_zonegroup_row(const sqlite::stmt_execution& stmt, ZoneGroupRow& row)
+{
+  std::string data = sqlite::column_text(stmt, 3);
+  row.ver = sqlite::column_int(stmt, 4);
+  row.tag = sqlite::column_text(stmt, 5);
+
+  bufferlist bl = bufferlist::static_from_string(data);
+  auto p = bl.cbegin();
+  decode(row.info, p);
+}
+
+struct ZoneRow {
+  RGWZoneParams info;
+  int ver;
+  std::string tag;
+};
+
+// Same layout as read_zonegroup_row(), for zone params.
+void read_zone_row(const sqlite::stmt_execution& stmt, ZoneRow& row)
+{
+  std::string data = sqlite::column_text(stmt, 3);
+  row.ver = sqlite::column_int(stmt, 4);
+  row.tag = sqlite::column_text(stmt, 5);
+
+  bufferlist bl = bufferlist::static_from_string(data);
+  auto p = bl.cbegin();
+  decode(row.info, p);
+}
+
+// Random tag stored alongside VersionNumber so concurrent writers with
+// the same version can still be detected.
+std::string generate_version_tag(CephContext* cct)
+{
+  static constexpr auto TAG_LEN = 24;
+  return gen_rand_alphanumeric(cct, TAG_LEN);
+}
+
+using SQLiteConnectionHandle = ConnectionHandle<sqlite::Connection>;
+
+using SQLiteConnectionPool = ConnectionPool<
+    sqlite::Connection, sqlite::ConnectionFactory>;
+
+} // anonymous namespace
+
+// Concrete pool type kept out of the header so SQLiteConfigStore can
+// hold it behind a unique_ptr without exposing pool internals.
+class SQLiteImpl : public SQLiteConnectionPool {
+ public:
+  using SQLiteConnectionPool::SQLiteConnectionPool;
+};
+
+
+// Take ownership of the connection pool built by the factory.
+SQLiteConfigStore::SQLiteConfigStore(std::unique_ptr<SQLiteImpl> impl)
+  : impl(std::move(impl))
+{
+}
+
+// Defined here, where SQLiteImpl is a complete type.
+SQLiteConfigStore::~SQLiteConfigStore() = default;
+
+
+// Realm
+
+// Writer for a realm row loaded at a specific (VersionNumber, Tag).
+// Every UPDATE/DELETE is qualified by that pair; if sqlite reports no
+// rows changed, another writer won and the op returns -ECANCELED.
+// After a conflict or a remove(), 'impl' is cleared so all further
+// writes through this object fail with -EINVAL.
+class SQLiteRealmWriter : public sal::RealmWriter {
+  SQLiteImpl* impl;
+  int ver;
+  std::string tag;
+  std::string realm_id;
+  std::string realm_name;
+ public:
+  SQLiteRealmWriter(SQLiteImpl* impl, int ver, std::string tag,
+                    std::string_view realm_id, std::string_view realm_name)
+    : impl(impl), ver(ver), tag(std::move(tag)),
+      realm_id(realm_id), realm_name(realm_name)
+  {}
+
+  // Persist mutable realm fields (current_period, epoch); id and name
+  // must match the values this writer was created with.
+  int write(const DoutPrefixProvider* dpp, optional_yield y,
+            const RGWRealm& info) override
+  {
+    Prefix prefix{*dpp, "dbconfig:sqlite:realm_write "}; dpp = &prefix;
+
+    if (!impl) {
+      return -EINVAL; // can't write after a conflict or delete
+    }
+    if (realm_id != info.id || realm_name != info.name) {
+      return -EINVAL; // can't modify realm id or name directly
+    }
+
+    try {
+      auto conn = impl->get(dpp);
+      // statements are prepared once and cached per-connection
+      auto& stmt = conn->statements["realm_upd"];
+      if (!stmt) {
+        const std::string sql = fmt::format(schema::realm_update5,
+                                            P1, P2, P3, P4, P5);
+        stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+      }
+      auto binding = sqlite::stmt_binding{stmt.get()};
+      sqlite::bind_text(dpp, binding, P1, info.id);
+      sqlite::bind_text(dpp, binding, P2, info.current_period);
+      sqlite::bind_int(dpp, binding, P3, info.epoch);
+      sqlite::bind_int(dpp, binding, P4, ver);
+      sqlite::bind_text(dpp, binding, P5, tag);
+
+      auto reset = sqlite::stmt_execution{stmt.get()};
+      sqlite::eval0(dpp, reset);
+
+      if (!::sqlite3_changes(conn->db.get())) { // VersionNumber/Tag mismatch
+        // our version is no longer consistent, so later writes would fail too
+        impl = nullptr;
+        return -ECANCELED;
+      }
+    } catch (const sqlite::error& e) {
+      ldpp_dout(dpp, 20) << "realm update failed: " << e.what() << dendl;
+      if (e.code() == sqlite::errc::foreign_key_constraint) {
+        return -EINVAL; // refers to nonexistent CurrentPeriod
+      } else if (e.code() == sqlite::errc::busy) {
+        return -EBUSY;
+      }
+      return -EIO;
+    }
+    // track the version we just wrote so the next write still matches
+    ++ver;
+    return 0;
+  }
+
+  // Rename the realm; fails with -EEXIST if new_name is already taken.
+  int rename(const DoutPrefixProvider* dpp, optional_yield y,
+             RGWRealm& info, std::string_view new_name) override
+  {
+    Prefix prefix{*dpp, "dbconfig:sqlite:realm_rename "}; dpp = &prefix;
+
+    if (!impl) {
+      return -EINVAL; // can't write after conflict or delete
+    }
+    if (realm_id != info.id || realm_name != info.name) {
+      return -EINVAL; // can't modify realm id or name directly
+    }
+    if (new_name.empty()) {
+      ldpp_dout(dpp, 0) << "realm cannot have an empty name" << dendl;
+      return -EINVAL;
+    }
+
+    try {
+      auto conn = impl->get(dpp);
+      auto& stmt = conn->statements["realm_rename"];
+      if (!stmt) {
+        const std::string sql = fmt::format(schema::realm_rename4,
+                                            P1, P2, P3, P4);
+        stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+      }
+      auto binding = sqlite::stmt_binding{stmt.get()};
+      sqlite::bind_text(dpp, binding, P1, realm_id);
+      sqlite::bind_text(dpp, binding, P2, new_name);
+      sqlite::bind_int(dpp, binding, P3, ver);
+      sqlite::bind_text(dpp, binding, P4, tag);
+
+      auto reset = sqlite::stmt_execution{stmt.get()};
+      sqlite::eval0(dpp, reset);
+
+      if (!::sqlite3_changes(conn->db.get())) { // VersionNumber/Tag mismatch
+        impl = nullptr;
+        return -ECANCELED;
+      }
+    } catch (const sqlite::error& e) {
+      ldpp_dout(dpp, 20) << "realm rename failed: " << e.what() << dendl;
+      if (e.code() == sqlite::errc::unique_constraint) {
+        return -EEXIST; // Name already taken
+      } else if (e.code() == sqlite::errc::busy) {
+        return -EBUSY;
+      }
+      return -EIO;
+    }
+    info.name = std::string{new_name};
+    ++ver;
+    return 0;
+  }
+
+  // Delete the realm row; the writer becomes unusable either way.
+  int remove(const DoutPrefixProvider* dpp, optional_yield y) override
+  {
+    Prefix prefix{*dpp, "dbconfig:sqlite:realm_remove "}; dpp = &prefix;
+
+    if (!impl) {
+      return -EINVAL; // can't write after conflict or delete
+    }
+    try {
+      auto conn = impl->get(dpp);
+      auto& stmt = conn->statements["realm_del"];
+      if (!stmt) {
+        const std::string sql = fmt::format(schema::realm_delete3, P1, P2, P3);
+        stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+      }
+      auto binding = sqlite::stmt_binding{stmt.get()};
+      sqlite::bind_text(dpp, binding, P1, realm_id);
+      sqlite::bind_int(dpp, binding, P2, ver);
+      sqlite::bind_text(dpp, binding, P3, tag);
+
+      auto reset = sqlite::stmt_execution{stmt.get()};
+      sqlite::eval0(dpp, reset);
+
+      // invalidated unconditionally, even on version mismatch
+      impl = nullptr; // prevent any further writes after delete
+      if (!::sqlite3_changes(conn->db.get())) {
+        return -ECANCELED; // VersionNumber/Tag mismatch
+      }
+    } catch (const sqlite::error& e) {
+      ldpp_dout(dpp, 20) << "realm delete failed: " << e.what() << dendl;
+      if (e.code() == sqlite::errc::busy) {
+        return -EBUSY;
+      }
+      return -EIO;
+    }
+    return 0;
+  }
+}; // SQLiteRealmWriter
+
+
+// Record 'realm_id' as the cluster's default realm. With exclusive set
+// this is a plain INSERT that fails with -EEXIST if a default already
+// exists; otherwise an upsert that overwrites any existing default.
+int SQLiteConfigStore::write_default_realm_id(const DoutPrefixProvider* dpp,
+                                              optional_yield y, bool exclusive,
+                                              std::string_view realm_id)
+{
+  Prefix prefix{*dpp, "dbconfig:sqlite:write_default_realm_id "}; dpp = &prefix;
+
+  if (realm_id.empty()) {
+    ldpp_dout(dpp, 0) << "requires a realm id" << dendl;
+    return -EINVAL;
+  }
+
+  try {
+    auto conn = impl->get(dpp);
+    // pick the cached statement matching the requested semantics
+    sqlite::stmt_ptr* stmt = nullptr;
+    if (exclusive) {
+      stmt = &conn->statements["def_realm_ins"];
+      if (!*stmt) {
+        const std::string sql = fmt::format(schema::default_realm_insert1, P1);
+        *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+      }
+    } else {
+      stmt = &conn->statements["def_realm_ups"];
+      if (!*stmt) {
+        const std::string sql = fmt::format(schema::default_realm_upsert1, P1);
+        *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+      }
+    }
+    auto binding = sqlite::stmt_binding{stmt->get()};
+    sqlite::bind_text(dpp, binding, P1, realm_id);
+
+    auto reset = sqlite::stmt_execution{stmt->get()};
+    sqlite::eval0(dpp, reset);
+  } catch (const sqlite::error& e) {
+    ldpp_dout(dpp, 20) << "default realm insert failed: " << e.what() << dendl;
+    if (e.code() == sqlite::errc::primary_key_constraint) {
+      return -EEXIST;
+    } else if (e.code() == sqlite::errc::busy) {
+      return -EBUSY;
+    }
+    return -EIO;
+  }
+  return 0;
+}
+
+// Fetch the id of the cluster's default realm into realm_id.
+// Returns 0 on success, -EBUSY on database lock contention, and -EIO
+// for any other sqlite failure.
+int SQLiteConfigStore::read_default_realm_id(const DoutPrefixProvider* dpp,
+                                             optional_yield y,
+                                             std::string& realm_id)
+{
+  Prefix prefix{*dpp, "dbconfig:sqlite:read_default_realm_id "}; dpp = &prefix;
+
+  try {
+    auto db = impl->get(dpp);
+    // Prepare lazily and cache the SELECT on this connection.
+    sqlite::stmt_ptr& stmt = db->statements["def_realm_sel"];
+    if (stmt == nullptr) {
+      static constexpr std::string_view sql = schema::default_realm_select0;
+      stmt = sqlite::prepare_statement(dpp, db->db.get(), sql);
+    }
+    // Execute, expecting exactly one result row.
+    auto exec = sqlite::stmt_execution{stmt.get()};
+    sqlite::eval1(dpp, exec);
+    realm_id = sqlite::column_text(exec, 0);
+  } catch (const sqlite::error& e) {
+    ldpp_dout(dpp, 20) << "default realm select failed: " << e.what() << dendl;
+    return (e.code() == sqlite::errc::busy) ? -EBUSY : -EIO;
+  }
+  return 0;
+}
+
+// Remove the default-realm record. Returns -ENOENT if no default was
+// set, -EBUSY on lock contention, -EIO on other sqlite errors.
+int SQLiteConfigStore::delete_default_realm_id(const DoutPrefixProvider* dpp,
+                                               optional_yield y)
+
+{
+  Prefix prefix{*dpp, "dbconfig:sqlite:delete_default_realm_id "}; dpp = &prefix;
+
+  try {
+    auto conn = impl->get(dpp);
+    auto& stmt = conn->statements["def_realm_del"];
+    if (!stmt) {
+      static constexpr std::string_view sql = schema::default_realm_delete0;
+      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+    }
+    auto reset = sqlite::stmt_execution{stmt.get()};
+    sqlite::eval0(dpp, reset);
+
+    // no rows deleted means there was no default to remove
+    if (!::sqlite3_changes(conn->db.get())) {
+      return -ENOENT;
+    }
+  } catch (const sqlite::error& e) {
+    ldpp_dout(dpp, 20) << "default realm delete failed: " << e.what() << dendl;
+    if (e.code() == sqlite::errc::busy) {
+      return -EBUSY;
+    }
+    return -EIO;
+  }
+  return 0;
+}
+
+
+// Insert a realm row at version 1 with a fresh random tag. With
+// exclusive set, an existing id or name fails with -EEXIST; otherwise
+// the row is upserted. On success, optionally hands back a
+// SQLiteRealmWriter bound to the new (version, tag) pair.
+int SQLiteConfigStore::create_realm(const DoutPrefixProvider* dpp,
+                                    optional_yield y, bool exclusive,
+                                    const RGWRealm& info,
+                                    std::unique_ptr<sal::RealmWriter>* writer)
+{
+  Prefix prefix{*dpp, "dbconfig:sqlite:create_realm "}; dpp = &prefix;
+
+  if (info.id.empty()) {
+    ldpp_dout(dpp, 0) << "realm cannot have an empty id" << dendl;
+    return -EINVAL;
+  }
+  if (info.name.empty()) {
+    ldpp_dout(dpp, 0) << "realm cannot have an empty name" << dendl;
+    return -EINVAL;
+  }
+
+  // new rows always start the optimistic-concurrency sequence at 1
+  int ver = 1;
+  auto tag = generate_version_tag(dpp->get_cct());
+
+  try {
+    auto conn = impl->get(dpp);
+    sqlite::stmt_ptr* stmt = nullptr;
+    if (exclusive) {
+      stmt = &conn->statements["realm_ins"];
+      if (!*stmt) {
+        const std::string sql = fmt::format(schema::realm_insert4,
+                                            P1, P2, P3, P4);
+        *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+      }
+    } else {
+      stmt = &conn->statements["realm_ups"];
+      if (!*stmt) {
+        const std::string sql = fmt::format(schema::realm_upsert4,
+                                            P1, P2, P3, P4);
+        *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+      }
+    }
+    auto binding = sqlite::stmt_binding{stmt->get()};
+    sqlite::bind_text(dpp, binding, P1, info.id);
+    sqlite::bind_text(dpp, binding, P2, info.name);
+    sqlite::bind_int(dpp, binding, P3, ver);
+    sqlite::bind_text(dpp, binding, P4, tag);
+
+    auto reset = sqlite::stmt_execution{stmt->get()};
+    sqlite::eval0(dpp, reset);
+  } catch (const sqlite::error& e) {
+    ldpp_dout(dpp, 20) << "realm insert failed: " << e.what() << dendl;
+    if (e.code() == sqlite::errc::primary_key_constraint) {
+      return -EEXIST; // ID already taken
+    } else if (e.code() == sqlite::errc::unique_constraint) {
+      return -EEXIST; // Name already taken
+    } else if (e.code() == sqlite::errc::busy) {
+      return -EBUSY;
+    }
+    return -EIO;
+  }
+
+  if (writer) {
+    *writer = std::make_unique<SQLiteRealmWriter>(
+        impl.get(), ver, std::move(tag), info.id, info.name);
+  }
+  return 0;
+}
+
// Look up a realm by id and decode it into 'info'. Optionally returns a
// RealmWriter bound to the row's version/tag for subsequent updates.
// Errors: -EINVAL on empty id, -ENOENT if no row matched (sqlite 'done'),
// -EBUSY on lock contention, -EIO on decode or other sqlite failure.
int SQLiteConfigStore::read_realm_by_id(const DoutPrefixProvider* dpp,
                                        optional_yield y,
                                        std::string_view realm_id,
                                        RGWRealm& info,
                                        std::unique_ptr<sal::RealmWriter>* writer)
{
  Prefix prefix{*dpp, "dbconfig:sqlite:read_realm_by_id "}; dpp = &prefix;

  if (realm_id.empty()) {
    ldpp_dout(dpp, 0) << "requires a realm id" << dendl;
    return -EINVAL;
  }

  RealmRow row;
  try {
    auto conn = impl->get(dpp);
    auto& stmt = conn->statements["realm_sel_id"];
    if (!stmt) {
      const std::string sql = fmt::format(schema::realm_select_id1, P1);
      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
    }
    auto binding = sqlite::stmt_binding{stmt.get()};
    sqlite::bind_text(dpp, binding, P1, realm_id);

    // eval1 expects exactly one result row; a miss surfaces as errc::done
    auto reset = sqlite::stmt_execution{stmt.get()};
    sqlite::eval1(dpp, reset);

    read_realm_row(reset, row);
  } catch (const buffer::error& e) {
    ldpp_dout(dpp, 20) << "realm decode failed: " << e.what() << dendl;
    return -EIO;
  } catch (const sqlite::error& e) {
    ldpp_dout(dpp, 20) << "realm select failed: " << e.what() << dendl;
    if (e.code() == sqlite::errc::done) {
      return -ENOENT;
    } else if (e.code() == sqlite::errc::busy) {
      return -EBUSY;
    }
    return -EIO;
  }

  info = std::move(row.info);
  if (writer) {
    *writer = std::make_unique<SQLiteRealmWriter>(
        impl.get(), row.ver, std::move(row.tag), info.id, info.name);
  }
  return 0;
}
+
// Shared helper: select a realm row by name into 'row'. Throws sqlite::error
// (errc::done when not found) or buffer::error on decode failure; callers map
// those to errno-style return codes.
static void realm_select_by_name(const DoutPrefixProvider* dpp,
                                 sqlite::Connection& conn,
                                 std::string_view realm_name,
                                 RealmRow& row)
{
  auto& stmt = conn.statements["realm_sel_name"];
  if (!stmt) {
    const std::string sql = fmt::format(schema::realm_select_name1, P1);
    stmt = sqlite::prepare_statement(dpp, conn.db.get(), sql);
  }
  auto binding = sqlite::stmt_binding{stmt.get()};
  sqlite::bind_text(dpp, binding, P1, realm_name);

  auto reset = sqlite::stmt_execution{stmt.get()};
  sqlite::eval1(dpp, reset);

  read_realm_row(reset, row);
}
+
// Look up a realm by name and decode it into 'info'. Optionally returns a
// RealmWriter bound to the row's version/tag.
// Errors: -EINVAL on empty name, -ENOENT if not found, -EBUSY on lock
// contention, -EIO on decode or other sqlite failure.
int SQLiteConfigStore::read_realm_by_name(const DoutPrefixProvider* dpp,
                                          optional_yield y,
                                          std::string_view realm_name,
                                          RGWRealm& info,
                                          std::unique_ptr<sal::RealmWriter>* writer)
{
  Prefix prefix{*dpp, "dbconfig:sqlite:read_realm_by_name "}; dpp = &prefix;

  if (realm_name.empty()) {
    ldpp_dout(dpp, 0) << "requires a realm name" << dendl;
    return -EINVAL;
  }

  RealmRow row;
  try {
    auto conn = impl->get(dpp);
    realm_select_by_name(dpp, *conn, realm_name, row);
  } catch (const buffer::error& e) {
    ldpp_dout(dpp, 20) << "realm decode failed: " << e.what() << dendl;
    return -EIO;
  } catch (const sqlite::error& e) {
    ldpp_dout(dpp, 20) << "realm select failed: " << e.what() << dendl;
    if (e.code() == sqlite::errc::done) {
      return -ENOENT;
    } else if (e.code() == sqlite::errc::busy) {
      return -EBUSY;
    }
    return -EIO;
  }

  info = std::move(row.info);
  if (writer) {
    *writer = std::make_unique<SQLiteRealmWriter>(
        impl.get(), row.ver, std::move(row.tag), info.id, info.name);
  }
  return 0;
}
+
// Read the realm currently marked as default. Optionally returns a
// RealmWriter bound to the row's version/tag.
// Errors: -ENOENT if no default realm is set, -EBUSY on lock contention,
// -EIO on decode or other sqlite failure.
int SQLiteConfigStore::read_default_realm(const DoutPrefixProvider* dpp,
                                          optional_yield y,
                                          RGWRealm& info,
                                          std::unique_ptr<sal::RealmWriter>* writer)
{
  Prefix prefix{*dpp, "dbconfig:sqlite:read_default_realm "}; dpp = &prefix;

  RealmRow row;
  try {
    auto conn = impl->get(dpp);
    auto& stmt = conn->statements["realm_sel_def"];
    if (!stmt) {
      // the default-realm query takes no parameters
      static constexpr std::string_view sql = schema::realm_select_default0;
      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
    }
    auto reset = sqlite::stmt_execution{stmt.get()};
    sqlite::eval1(dpp, reset);

    read_realm_row(reset, row);
  } catch (const buffer::error& e) {
    ldpp_dout(dpp, 20) << "realm decode failed: " << e.what() << dendl;
    return -EIO;
  } catch (const sqlite::error& e) {
    ldpp_dout(dpp, 20) << "realm select failed: " << e.what() << dendl;
    if (e.code() == sqlite::errc::done) {
      return -ENOENT;
    } else if (e.code() == sqlite::errc::busy) {
      return -EBUSY;
    }
    return -EIO;
  }

  info = std::move(row.info);
  if (writer) {
    *writer = std::make_unique<SQLiteRealmWriter>(
        impl.get(), row.ver, std::move(row.tag), info.id, info.name);
  }
  return 0;
}
+
// Resolve a realm name to its id. Fetches the whole row via
// realm_select_by_name() and keeps only the id.
// Errors: -EINVAL on empty name, -ENOENT if not found, -EBUSY on lock
// contention, -EIO on decode or other sqlite failure.
int SQLiteConfigStore::read_realm_id(const DoutPrefixProvider* dpp,
                                     optional_yield y,
                                     std::string_view realm_name,
                                     std::string& realm_id)
{
  Prefix prefix{*dpp, "dbconfig:sqlite:read_realm_id "}; dpp = &prefix;

  if (realm_name.empty()) {
    ldpp_dout(dpp, 0) << "requires a realm name" << dendl;
    return -EINVAL;
  }

  try {
    auto conn = impl->get(dpp);

    RealmRow row;
    realm_select_by_name(dpp, *conn, realm_name, row);

    realm_id = std::move(row.info.id);
  } catch (const buffer::error& e) {
    ldpp_dout(dpp, 20) << "realm decode failed: " << e.what() << dendl;
    return -EIO;
  } catch (const sqlite::error& e) {
    ldpp_dout(dpp, 20) << "realm select failed: " << e.what() << dendl;
    if (e.code() == sqlite::errc::done) {
      return -ENOENT;
    } else if (e.code() == sqlite::errc::busy) {
      return -EBUSY;
    }
    return -EIO;
  }

  return 0;
}
+
// Period-commit notifications are not supported by the sqlite config backend;
// callers must handle -ENOTSUP.
int SQLiteConfigStore::realm_notify_new_period(const DoutPrefixProvider* dpp,
                                               optional_yield y,
                                               const RGWPeriod& period)
{
  return -ENOTSUP;
}
+
// Fill 'entries' with up to entries.size() realm names after 'marker';
// read_text_rows() records how many were written and whether more remain.
// Errors: -EBUSY on lock contention, -EIO on other sqlite failure.
int SQLiteConfigStore::list_realm_names(const DoutPrefixProvider* dpp,
                                        optional_yield y, const std::string& marker,
                                        std::span<std::string> entries,
                                        sal::ListResult<std::string>& result)
{
  Prefix prefix{*dpp, "dbconfig:sqlite:list_realm_names "}; dpp = &prefix;

  try {
    auto conn = impl->get(dpp);
    auto& stmt = conn->statements["realm_sel_names"];
    if (!stmt) {
      const std::string sql = fmt::format(schema::realm_select_names2, P1, P2);
      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
    }
    auto binding = sqlite::stmt_binding{stmt.get()};
    sqlite::bind_text(dpp, binding, P1, marker);   // pagination cursor
    sqlite::bind_int(dpp, binding, P2, entries.size()); // LIMIT

    auto reset = sqlite::stmt_execution{stmt.get()};
    read_text_rows(dpp, reset, entries, result);
  } catch (const sqlite::error& e) {
    ldpp_dout(dpp, 20) << "realm select failed: " << e.what() << dendl;
    if (e.code() == sqlite::errc::busy) {
      return -EBUSY;
    }
    return -EIO;
  }
  return 0;
}
+
+
+// Period
+
// Insert (exclusive=true) or upsert a period row. The RGWPeriod is serialized
// with encode() and stored as a blob alongside its id/epoch/realm_id.
// Errors: -EINVAL on empty id or a foreign-key violation (nonexistent realm),
// -EBUSY on lock contention, -EIO otherwise.
int SQLiteConfigStore::create_period(const DoutPrefixProvider* dpp,
                                     optional_yield y, bool exclusive,
                                     const RGWPeriod& info)
{
  Prefix prefix{*dpp, "dbconfig:sqlite:create_period "}; dpp = &prefix;

  if (info.id.empty()) {
    ldpp_dout(dpp, 0) << "period cannot have an empty id" << dendl;
    return -EINVAL;
  }

  bufferlist bl;
  encode(info, bl);
  // view over the encoded bytes; bl outlives the bind/eval below
  const auto data = std::string_view{bl.c_str(), bl.length()};

  try {
    auto conn = impl->get(dpp);
    sqlite::stmt_ptr* stmt = nullptr;
    if (exclusive) {
      stmt = &conn->statements["period_ins"];
      if (!*stmt) {
        const std::string sql = fmt::format(schema::period_insert4,
                                            P1, P2, P3, P4);
        *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
      }
    } else {
      stmt = &conn->statements["period_ups"];
      if (!*stmt) {
        const std::string sql = fmt::format(schema::period_upsert4,
                                            P1, P2, P3, P4);
        *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
      }
    }
    auto binding = sqlite::stmt_binding{stmt->get()};
    sqlite::bind_text(dpp, binding, P1, info.id);
    sqlite::bind_int(dpp, binding, P2, info.epoch);
    sqlite::bind_text(dpp, binding, P3, info.realm_id);
    sqlite::bind_text(dpp, binding, P4, data);

    auto reset = sqlite::stmt_execution{stmt->get()};
    sqlite::eval0(dpp, reset);
  } catch (const sqlite::error& e) {
    ldpp_dout(dpp, 20) << "period insert failed: " << e.what() << dendl;
    if (e.code() == sqlite::errc::foreign_key_constraint) {
      return -EINVAL; // refers to nonexistent RealmID
    } else if (e.code() == sqlite::errc::busy) {
      return -EBUSY;
    }
    return -EIO;
  }
  return 0;
}
+
// Helper: select a period row by (id, epoch) into 'row'. Throws sqlite::error
// (errc::done when not found) or buffer::error on decode failure.
static void period_select_epoch(const DoutPrefixProvider* dpp,
                                sqlite::Connection& conn,
                                std::string_view id, uint32_t epoch,
                                RGWPeriod& row)
{
  auto& stmt = conn.statements["period_sel_epoch"];
  if (!stmt) {
    const std::string sql = fmt::format(schema::period_select_epoch2, P1, P2);
    stmt = sqlite::prepare_statement(dpp, conn.db.get(), sql);
  }
  auto binding = sqlite::stmt_binding{stmt.get()};
  sqlite::bind_text(dpp, binding, P1, id);
  sqlite::bind_int(dpp, binding, P2, epoch);

  auto reset = sqlite::stmt_execution{stmt.get()};
  sqlite::eval1(dpp, reset);

  read_period_row(reset, row);
}
+
// Helper: select the latest-epoch row for a period id into 'row'. Throws
// sqlite::error (errc::done when not found) or buffer::error on decode
// failure.
static void period_select_latest(const DoutPrefixProvider* dpp,
                                 sqlite::Connection& conn,
                                 std::string_view id, RGWPeriod& row)
{
  auto& stmt = conn.statements["period_sel_latest"];
  if (!stmt) {
    const std::string sql = fmt::format(schema::period_select_latest1, P1);
    stmt = sqlite::prepare_statement(dpp, conn.db.get(), sql);
  }
  auto binding = sqlite::stmt_binding{stmt.get()};
  sqlite::bind_text(dpp, binding, P1, id);

  auto reset = sqlite::stmt_execution{stmt.get()};
  sqlite::eval1(dpp, reset);

  read_period_row(reset, row);
}
+
// Read a period by id — a specific epoch when 'epoch' is set, otherwise the
// latest epoch for that id.
// Errors: -EINVAL on empty id, -ENOENT if not found, -EBUSY on lock
// contention, -EIO on decode or other sqlite failure.
int SQLiteConfigStore::read_period(const DoutPrefixProvider* dpp,
                                   optional_yield y,
                                   std::string_view period_id,
                                   std::optional<uint32_t> epoch,
                                   RGWPeriod& info)
{
  Prefix prefix{*dpp, "dbconfig:sqlite:read_period "}; dpp = &prefix;

  if (period_id.empty()) {
    ldpp_dout(dpp, 0) << "requires a period id" << dendl;
    return -EINVAL;
  }

  try {
    auto conn = impl->get(dpp);
    if (epoch) {
      period_select_epoch(dpp, *conn, period_id, *epoch, info);
    } else {
      period_select_latest(dpp, *conn, period_id, info);
    }
  } catch (const buffer::error& e) {
    ldpp_dout(dpp, 20) << "period decode failed: " << e.what() << dendl;
    return -EIO;
  } catch (const sqlite::error& e) {
    ldpp_dout(dpp, 20) << "period select failed: " << e.what() << dendl;
    if (e.code() == sqlite::errc::done) {
      return -ENOENT;
    } else if (e.code() == sqlite::errc::busy) {
      return -EBUSY;
    }
    return -EIO;
  }
  return 0;
}
+
// Delete all rows for a period id (every epoch).
// Errors: -EINVAL on empty id, -ENOENT if nothing matched, -EBUSY on lock
// contention, -EIO on other sqlite failure.
int SQLiteConfigStore::delete_period(const DoutPrefixProvider* dpp,
                                     optional_yield y,
                                     std::string_view period_id)
{
  Prefix prefix{*dpp, "dbconfig:sqlite:delete_period "}; dpp = &prefix;

  if (period_id.empty()) {
    ldpp_dout(dpp, 0) << "requires a period id" << dendl;
    return -EINVAL;
  }

  try {
    auto conn = impl->get(dpp);
    auto& stmt = conn->statements["period_del"];
    if (!stmt) {
      const std::string sql = fmt::format(schema::period_delete1, P1);
      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
    }
    auto binding = sqlite::stmt_binding{stmt.get()};
    sqlite::bind_text(dpp, binding, P1, period_id);

    auto reset = sqlite::stmt_execution{stmt.get()};
    sqlite::eval0(dpp, reset);

    // zero affected rows means the period didn't exist
    if (!::sqlite3_changes(conn->db.get())) {
      return -ENOENT;
    }
  } catch (const sqlite::error& e) {
    ldpp_dout(dpp, 20) << "period delete failed: " << e.what() << dendl;
    if (e.code() == sqlite::errc::busy) {
      return -EBUSY;
    }
    return -EIO;
  }
  return 0;
}
+
// Fill 'entries' with up to entries.size() period ids after 'marker';
// read_text_rows() records the count and whether more remain.
// Errors: -EBUSY on lock contention, -EIO on other sqlite failure.
int SQLiteConfigStore::list_period_ids(const DoutPrefixProvider* dpp,
                                       optional_yield y,
                                       const std::string& marker,
                                       std::span<std::string> entries,
                                       sal::ListResult<std::string>& result)
{
  Prefix prefix{*dpp, "dbconfig:sqlite:list_period_ids "}; dpp = &prefix;

  try {
    auto conn = impl->get(dpp);
    auto& stmt = conn->statements["period_sel_ids"];
    if (!stmt) {
      const std::string sql = fmt::format(schema::period_select_ids2, P1, P2);
      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
    }
    auto binding = sqlite::stmt_binding{stmt.get()};
    sqlite::bind_text(dpp, binding, P1, marker);   // pagination cursor
    sqlite::bind_int(dpp, binding, P2, entries.size()); // LIMIT

    auto reset = sqlite::stmt_execution{stmt.get()};
    read_text_rows(dpp, reset, entries, result);
  } catch (const sqlite::error& e) {
    ldpp_dout(dpp, 20) << "period select failed: " << e.what() << dendl;
    if (e.code() == sqlite::errc::busy) {
      return -EBUSY;
    }
    return -EIO;
  }
  return 0;
}
+
+
+// ZoneGroup
+
+class SQLiteZoneGroupWriter : public sal::ZoneGroupWriter {
+ SQLiteImpl* impl;
+ int ver;
+ std::string tag;
+ std::string zonegroup_id;
+ std::string zonegroup_name;
+ public:
+ SQLiteZoneGroupWriter(SQLiteImpl* impl, int ver, std::string tag,
+ std::string_view zonegroup_id,
+ std::string_view zonegroup_name)
+ : impl(impl), ver(ver), tag(std::move(tag)),
+ zonegroup_id(zonegroup_id), zonegroup_name(zonegroup_name)
+ {}
+
+ int write(const DoutPrefixProvider* dpp, optional_yield y,
+ const RGWZoneGroup& info) override
+ {
+ Prefix prefix{*dpp, "dbconfig:sqlite:zonegroup_write "}; dpp = &prefix;
+
+ if (!impl) {
+ return -EINVAL; // can't write after conflict or delete
+ }
+ if (zonegroup_id != info.id || zonegroup_name != info.name) {
+ return -EINVAL; // can't modify zonegroup id or name directly
+ }
+
+ bufferlist bl;
+ encode(info, bl);
+ const auto data = std::string_view{bl.c_str(), bl.length()};
+
+ try {
+ auto conn = impl->get(dpp);
+ auto& stmt = conn->statements["zonegroup_upd"];
+ if (!stmt) {
+ const std::string sql = fmt::format(schema::zonegroup_update5,
+ P1, P2, P3, P4, P5);
+ stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+ }
+ auto binding = sqlite::stmt_binding{stmt.get()};
+ sqlite::bind_text(dpp, binding, P1, info.id);
+ sqlite::bind_text(dpp, binding, P2, info.realm_id);
+ sqlite::bind_text(dpp, binding, P3, data);
+ sqlite::bind_int(dpp, binding, P4, ver);
+ sqlite::bind_text(dpp, binding, P5, tag);
+
+ auto reset = sqlite::stmt_execution{stmt.get()};
+ sqlite::eval0(dpp, reset);
+
+ if (!::sqlite3_changes(conn->db.get())) { // VersionNumber/Tag mismatch
+ impl = nullptr;
+ return -ECANCELED;
+ }
+ } catch (const sqlite::error& e) {
+ ldpp_dout(dpp, 20) << "zonegroup update failed: " << e.what() << dendl;
+ if (e.code() == sqlite::errc::foreign_key_constraint) {
+ return -EINVAL; // refers to nonexistent RealmID
+ } else if (e.code() == sqlite::errc::busy) {
+ return -EBUSY;
+ }
+ return -EIO;
+ }
+ return 0;
+ }
+
+ int rename(const DoutPrefixProvider* dpp, optional_yield y,
+ RGWZoneGroup& info, std::string_view new_name) override
+ {
+ Prefix prefix{*dpp, "dbconfig:sqlite:zonegroup_rename "}; dpp = &prefix;
+
+ if (!impl) {
+ return -EINVAL; // can't write after conflict or delete
+ }
+ if (zonegroup_id != info.get_id() || zonegroup_name != info.get_name()) {
+ return -EINVAL; // can't modify zonegroup id or name directly
+ }
+ if (new_name.empty()) {
+ ldpp_dout(dpp, 0) << "zonegroup cannot have an empty name" << dendl;
+ return -EINVAL;
+ }
+
+ try {
+ auto conn = impl->get(dpp);
+ auto& stmt = conn->statements["zonegroup_rename"];
+ if (!stmt) {
+ const std::string sql = fmt::format(schema::zonegroup_rename4,
+ P1, P2, P3, P4);
+ stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+ }
+ auto binding = sqlite::stmt_binding{stmt.get()};
+ sqlite::bind_text(dpp, binding, P1, info.id);
+ sqlite::bind_text(dpp, binding, P2, new_name);
+ sqlite::bind_int(dpp, binding, P3, ver);
+ sqlite::bind_text(dpp, binding, P4, tag);
+
+ auto reset = sqlite::stmt_execution{stmt.get()};
+ sqlite::eval0(dpp, reset);
+
+ if (!::sqlite3_changes(conn->db.get())) { // VersionNumber/Tag mismatch
+ impl = nullptr;
+ return -ECANCELED;
+ }
+ } catch (const sqlite::error& e) {
+ ldpp_dout(dpp, 20) << "zonegroup rename failed: " << e.what() << dendl;
+ if (e.code() == sqlite::errc::unique_constraint) {
+ return -EEXIST; // Name already taken
+ } else if (e.code() == sqlite::errc::busy) {
+ return -EBUSY;
+ }
+ return -EIO;
+ }
+ info.name = std::string{new_name};
+ return 0;
+ }
+
+ int remove(const DoutPrefixProvider* dpp, optional_yield y) override
+ {
+ Prefix prefix{*dpp, "dbconfig:sqlite:zonegroup_remove "}; dpp = &prefix;
+
+ if (!impl) {
+ return -EINVAL; // can't write after conflict or delete
+ }
+ try {
+ auto conn = impl->get(dpp);
+ auto& stmt = conn->statements["zonegroup_del"];
+ if (!stmt) {
+ const std::string sql = fmt::format(schema::zonegroup_delete3,
+ P1, P2, P3);
+ stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+ }
+ auto binding = sqlite::stmt_binding{stmt.get()};
+ sqlite::bind_text(dpp, binding, P1, zonegroup_id);
+ sqlite::bind_int(dpp, binding, P2, ver);
+ sqlite::bind_text(dpp, binding, P3, tag);
+
+ auto reset = sqlite::stmt_execution{stmt.get()};
+ sqlite::eval0(dpp, reset);
+
+ impl = nullptr;
+ if (!::sqlite3_changes(conn->db.get())) { // VersionNumber/Tag mismatch
+ return -ECANCELED;
+ }
+ } catch (const sqlite::error& e) {
+ ldpp_dout(dpp, 20) << "zonegroup delete failed: " << e.what() << dendl;
+ if (e.code() == sqlite::errc::busy) {
+ return -EBUSY;
+ }
+ return -EIO;
+ }
+ return 0;
+ }
+}; // SQLiteZoneGroupWriter
+
+
// Set the default zonegroup for a realm: insert (exclusive=true) or upsert
// the (realm_id, zonegroup_id) row.
// Errors: -EBUSY on lock contention, -EIO on other sqlite failure.
// NOTE(review): unlike write_default_zone_id(), this does not reject an
// empty zonegroup_id — confirm whether that is intentional.
int SQLiteConfigStore::write_default_zonegroup_id(const DoutPrefixProvider* dpp,
                                                  optional_yield y, bool exclusive,
                                                  std::string_view realm_id,
                                                  std::string_view zonegroup_id)
{
  Prefix prefix{*dpp, "dbconfig:sqlite:write_default_zonegroup_id "}; dpp = &prefix;

  try {
    auto conn = impl->get(dpp);
    sqlite::stmt_ptr* stmt = nullptr;
    if (exclusive) {
      stmt = &conn->statements["def_zonegroup_ins"];
      if (!*stmt) {
        const std::string sql = fmt::format(schema::default_zonegroup_insert2,
                                            P1, P2);
        *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
      }
    } else {
      stmt = &conn->statements["def_zonegroup_ups"];
      if (!*stmt) {
        const std::string sql = fmt::format(schema::default_zonegroup_upsert2,
                                            P1, P2);
        *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
      }
    }
    auto binding = sqlite::stmt_binding{stmt->get()};
    sqlite::bind_text(dpp, binding, P1, realm_id);
    sqlite::bind_text(dpp, binding, P2, zonegroup_id);

    auto reset = sqlite::stmt_execution{stmt->get()};
    sqlite::eval0(dpp, reset);
  } catch (const sqlite::error& e) {
    ldpp_dout(dpp, 20) << "default zonegroup insert failed: " << e.what() << dendl;
    if (e.code() == sqlite::errc::busy) {
      return -EBUSY;
    }
    return -EIO;
  }
  return 0;
}
+
// Read the default zonegroup id recorded for a realm.
// Errors: -ENOENT if none is set, -EBUSY on lock contention, -EIO otherwise.
int SQLiteConfigStore::read_default_zonegroup_id(const DoutPrefixProvider* dpp,
                                                 optional_yield y,
                                                 std::string_view realm_id,
                                                 std::string& zonegroup_id)
{
  Prefix prefix{*dpp, "dbconfig:sqlite:read_default_zonegroup_id "}; dpp = &prefix;

  try {
    auto conn = impl->get(dpp);
    auto& stmt = conn->statements["def_zonegroup_sel"];
    if (!stmt) {
      const std::string sql = fmt::format(schema::default_zonegroup_select1, P1);
      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
    }
    auto binding = sqlite::stmt_binding{stmt.get()};
    sqlite::bind_text(dpp, binding, P1, realm_id);

    auto reset = sqlite::stmt_execution{stmt.get()};
    sqlite::eval1(dpp, reset);

    // single text column holding the zonegroup id
    zonegroup_id = sqlite::column_text(reset, 0);
  } catch (const sqlite::error& e) {
    ldpp_dout(dpp, 20) << "default zonegroup select failed: " << e.what() << dendl;
    if (e.code() == sqlite::errc::done) {
      return -ENOENT;
    } else if (e.code() == sqlite::errc::busy) {
      return -EBUSY;
    }
    return -EIO;
  }
  return 0;
}
+
// Delete the default-zonegroup row for a realm.
// Errors: -ENOENT if none was set, -EBUSY on lock contention, -EIO otherwise.
int SQLiteConfigStore::delete_default_zonegroup_id(const DoutPrefixProvider* dpp,
                                                   optional_yield y,
                                                   std::string_view realm_id)
{
  Prefix prefix{*dpp, "dbconfig:sqlite:delete_default_zonegroup_id "}; dpp = &prefix;

  try {
    auto conn = impl->get(dpp);
    auto& stmt = conn->statements["def_zonegroup_del"];
    if (!stmt) {
      const std::string sql = fmt::format(schema::default_zonegroup_delete1, P1);
      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
    }
    auto binding = sqlite::stmt_binding{stmt.get()};
    sqlite::bind_text(dpp, binding, P1, realm_id);

    auto reset = sqlite::stmt_execution{stmt.get()};
    sqlite::eval0(dpp, reset);

    // zero affected rows means no default was set for this realm
    if (!::sqlite3_changes(conn->db.get())) {
      return -ENOENT;
    }
  } catch (const sqlite::error& e) {
    ldpp_dout(dpp, 20) << "default zonegroup delete failed: " << e.what() << dendl;
    if (e.code() == sqlite::errc::busy) {
      return -EBUSY;
    }
    return -EIO;
  }
  return 0;
}
+
+
// Insert (exclusive=true) or upsert a zonegroup row. The RGWZoneGroup is
// serialized with encode() and stored alongside id/name/realm_id, starting
// at version 1 with a fresh tag. Optionally returns a ZoneGroupWriter bound
// to that version/tag.
// Errors: -EINVAL on empty id/name or foreign-key violation (nonexistent
// realm), -EEXIST on id/name collision, -EBUSY on contention, -EIO otherwise.
int SQLiteConfigStore::create_zonegroup(const DoutPrefixProvider* dpp,
                                        optional_yield y, bool exclusive,
                                        const RGWZoneGroup& info,
                                        std::unique_ptr<sal::ZoneGroupWriter>* writer)
{
  Prefix prefix{*dpp, "dbconfig:sqlite:create_zonegroup "}; dpp = &prefix;

  if (info.id.empty()) {
    ldpp_dout(dpp, 0) << "zonegroup cannot have an empty id" << dendl;
    return -EINVAL;
  }
  if (info.name.empty()) {
    ldpp_dout(dpp, 0) << "zonegroup cannot have an empty name" << dendl;
    return -EINVAL;
  }

  int ver = 1;
  auto tag = generate_version_tag(dpp->get_cct());

  bufferlist bl;
  encode(info, bl);
  const auto data = std::string_view{bl.c_str(), bl.length()};

  try {
    auto conn = impl->get(dpp);
    sqlite::stmt_ptr* stmt = nullptr;
    if (exclusive) {
      stmt = &conn->statements["zonegroup_ins"];
      if (!*stmt) {
        const std::string sql = fmt::format(schema::zonegroup_insert6,
                                            P1, P2, P3, P4, P5, P6);
        *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
      }
    } else {
      stmt = &conn->statements["zonegroup_ups"];
      if (!*stmt) {
        const std::string sql = fmt::format(schema::zonegroup_upsert6,
                                            P1, P2, P3, P4, P5, P6);
        *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
      }
    }
    auto binding = sqlite::stmt_binding{stmt->get()};
    sqlite::bind_text(dpp, binding, P1, info.id);
    sqlite::bind_text(dpp, binding, P2, info.name);
    sqlite::bind_text(dpp, binding, P3, info.realm_id);
    sqlite::bind_text(dpp, binding, P4, data);
    sqlite::bind_int(dpp, binding, P5, ver);
    sqlite::bind_text(dpp, binding, P6, tag);

    auto reset = sqlite::stmt_execution{stmt->get()};
    sqlite::eval0(dpp, reset);
  } catch (const sqlite::error& e) {
    ldpp_dout(dpp, 20) << "zonegroup insert failed: " << e.what() << dendl;
    if (e.code() == sqlite::errc::foreign_key_constraint) {
      return -EINVAL; // refers to nonexistent RealmID
    } else if (e.code() == sqlite::errc::primary_key_constraint) {
      return -EEXIST; // ID already taken
    } else if (e.code() == sqlite::errc::unique_constraint) {
      return -EEXIST; // Name already taken
    } else if (e.code() == sqlite::errc::busy) {
      return -EBUSY;
    }
    return -EIO;
  }

  if (writer) {
    *writer = std::make_unique<SQLiteZoneGroupWriter>(
        impl.get(), ver, std::move(tag), info.id, info.name);
  }
  return 0;
}
+
// Look up a zonegroup by id and decode it into 'info'. Optionally returns a
// ZoneGroupWriter bound to the row's version/tag.
// Errors: -EINVAL on empty id, -ENOENT if not found, -EBUSY on lock
// contention, -EIO on decode or other sqlite failure.
int SQLiteConfigStore::read_zonegroup_by_id(const DoutPrefixProvider* dpp,
                                            optional_yield y,
                                            std::string_view zonegroup_id,
                                            RGWZoneGroup& info,
                                            std::unique_ptr<sal::ZoneGroupWriter>* writer)
{
  Prefix prefix{*dpp, "dbconfig:sqlite:read_zonegroup_by_id "}; dpp = &prefix;

  if (zonegroup_id.empty()) {
    ldpp_dout(dpp, 0) << "requires a zonegroup id" << dendl;
    return -EINVAL;
  }

  ZoneGroupRow row;
  try {
    auto conn = impl->get(dpp);
    auto& stmt = conn->statements["zonegroup_sel_id"];
    if (!stmt) {
      const std::string sql = fmt::format(schema::zonegroup_select_id1, P1);
      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
    }
    auto binding = sqlite::stmt_binding{stmt.get()};
    sqlite::bind_text(dpp, binding, P1, zonegroup_id);

    auto reset = sqlite::stmt_execution{stmt.get()};
    sqlite::eval1(dpp, reset);

    read_zonegroup_row(reset, row);
  } catch (const buffer::error& e) {
    ldpp_dout(dpp, 20) << "zonegroup decode failed: " << e.what() << dendl;
    return -EIO;
  } catch (const sqlite::error& e) {
    ldpp_dout(dpp, 20) << "zonegroup select failed: " << e.what() << dendl;
    if (e.code() == sqlite::errc::done) {
      return -ENOENT;
    } else if (e.code() == sqlite::errc::busy) {
      return -EBUSY;
    }
    return -EIO;
  }

  info = std::move(row.info);
  if (writer) {
    *writer = std::make_unique<SQLiteZoneGroupWriter>(
        impl.get(), row.ver, std::move(row.tag), info.id, info.name);
  }
  return 0;
}
+
// Look up a zonegroup by name and decode it into 'info'. Optionally returns
// a ZoneGroupWriter bound to the row's version/tag.
// Errors: -EINVAL on empty name, -ENOENT if not found, -EBUSY on lock
// contention, -EIO on decode or other sqlite failure.
int SQLiteConfigStore::read_zonegroup_by_name(const DoutPrefixProvider* dpp,
                                              optional_yield y,
                                              std::string_view zonegroup_name,
                                              RGWZoneGroup& info,
                                              std::unique_ptr<sal::ZoneGroupWriter>* writer)
{
  Prefix prefix{*dpp, "dbconfig:sqlite:read_zonegroup_by_name "}; dpp = &prefix;

  if (zonegroup_name.empty()) {
    ldpp_dout(dpp, 0) << "requires a zonegroup name" << dendl;
    return -EINVAL;
  }

  ZoneGroupRow row;
  try {
    auto conn = impl->get(dpp);
    auto& stmt = conn->statements["zonegroup_sel_name"];
    if (!stmt) {
      const std::string sql = fmt::format(schema::zonegroup_select_name1, P1);
      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
    }
    auto binding = sqlite::stmt_binding{stmt.get()};
    sqlite::bind_text(dpp, binding, P1, zonegroup_name);

    auto reset = sqlite::stmt_execution{stmt.get()};
    sqlite::eval1(dpp, reset);

    read_zonegroup_row(reset, row);
  } catch (const buffer::error& e) {
    ldpp_dout(dpp, 20) << "zonegroup decode failed: " << e.what() << dendl;
    return -EIO;
  } catch (const sqlite::error& e) {
    ldpp_dout(dpp, 20) << "zonegroup select failed: " << e.what() << dendl;
    if (e.code() == sqlite::errc::done) {
      return -ENOENT;
    } else if (e.code() == sqlite::errc::busy) {
      return -EBUSY;
    }
    return -EIO;
  }

  info = std::move(row.info);
  if (writer) {
    *writer = std::make_unique<SQLiteZoneGroupWriter>(
        impl.get(), row.ver, std::move(row.tag), info.id, info.name);
  }
  return 0;
}
+
// Read the default zonegroup. Optionally returns a ZoneGroupWriter bound to
// the row's version/tag.
// NOTE(review): 'realm_id' is accepted but not bound into the query — the
// parameterless default-zonegroup statement is used; confirm the schema
// scopes the default appropriately.
// Errors: -ENOENT if no default is set, -EBUSY on lock contention, -EIO
// on decode or other sqlite failure.
int SQLiteConfigStore::read_default_zonegroup(const DoutPrefixProvider* dpp,
                                              optional_yield y,
                                              std::string_view realm_id,
                                              RGWZoneGroup& info,
                                              std::unique_ptr<sal::ZoneGroupWriter>* writer)
{
  Prefix prefix{*dpp, "dbconfig:sqlite:read_default_zonegroup "}; dpp = &prefix;

  ZoneGroupRow row;
  try {
    auto conn = impl->get(dpp);
    auto& stmt = conn->statements["zonegroup_sel_def"];
    if (!stmt) {
      static constexpr std::string_view sql = schema::zonegroup_select_default0;
      stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
    }
    auto reset = sqlite::stmt_execution{stmt.get()};
    sqlite::eval1(dpp, reset);

    read_zonegroup_row(reset, row);
  } catch (const buffer::error& e) {
    ldpp_dout(dpp, 20) << "zonegroup decode failed: " << e.what() << dendl;
    return -EIO;
  } catch (const sqlite::error& e) {
    ldpp_dout(dpp, 20) << "zonegroup select failed: " << e.what() << dendl;
    if (e.code() == sqlite::errc::done) {
      return -ENOENT;
    } else if (e.code() == sqlite::errc::busy) {
      return -EBUSY;
    }
    return -EIO;
  }

  info = std::move(row.info);
  if (writer) {
    *writer = std::make_unique<SQLiteZoneGroupWriter>(
        impl.get(), row.ver, std::move(row.tag), info.id, info.name);
  }
  return 0;
}
+
+int SQLiteConfigStore::list_zonegroup_names(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ const std::string& marker,
+ std::span<std::string> entries,
+ sal::ListResult<std::string>& result)
+{
+ Prefix prefix{*dpp, "dbconfig:sqlite:list_zonegroup_names "}; dpp = &prefix;
+
+ try {
+ auto conn = impl->get(dpp);
+ auto& stmt = conn->statements["zonegroup_sel_names"];
+ if (!stmt) {
+ const std::string sql = fmt::format(schema::zonegroup_select_names2, P1, P2);
+ stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+ }
+ auto binding = sqlite::stmt_binding{stmt.get()};
+ auto reset = sqlite::stmt_execution{stmt.get()};
+
+ sqlite::bind_text(dpp, binding, P1, marker);
+ sqlite::bind_int(dpp, binding, P2, entries.size());
+
+ read_text_rows(dpp, reset, entries, result);
+ } catch (const sqlite::error& e) {
+ ldpp_dout(dpp, 20) << "zonegroup select failed: " << e.what() << dendl;
+ if (e.code() == sqlite::errc::busy) {
+ return -EBUSY;
+ }
+ return -EIO;
+ }
+ return 0;
+}
+
+
+// Zone
+
+class SQLiteZoneWriter : public sal::ZoneWriter {
+ SQLiteImpl* impl;
+ int ver;
+ std::string tag;
+ std::string zone_id;
+ std::string zone_name;
+ public:
+ SQLiteZoneWriter(SQLiteImpl* impl, int ver, std::string tag,
+ std::string_view zone_id, std::string_view zone_name)
+ : impl(impl), ver(ver), tag(std::move(tag)),
+ zone_id(zone_id), zone_name(zone_name)
+ {}
+
+ int write(const DoutPrefixProvider* dpp, optional_yield y,
+ const RGWZoneParams& info) override
+ {
+ Prefix prefix{*dpp, "dbconfig:sqlite:zone_write "}; dpp = &prefix;
+
+ if (!impl) {
+ return -EINVAL; // can't write after conflict or delete
+ }
+ if (zone_id != info.id || zone_name != info.name) {
+ return -EINVAL; // can't modify zone id or name directly
+ }
+
+ bufferlist bl;
+ encode(info, bl);
+ const auto data = std::string_view{bl.c_str(), bl.length()};
+
+ try {
+ auto conn = impl->get(dpp);
+ auto& stmt = conn->statements["zone_upd"];
+ if (!stmt) {
+ const std::string sql = fmt::format(schema::zone_update5,
+ P1, P2, P3, P4, P5);
+ stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+ }
+ auto binding = sqlite::stmt_binding{stmt.get()};
+ sqlite::bind_text(dpp, binding, P1, info.id);
+ sqlite::bind_text(dpp, binding, P2, info.realm_id);
+ sqlite::bind_text(dpp, binding, P3, data);
+ sqlite::bind_int(dpp, binding, P4, ver);
+ sqlite::bind_text(dpp, binding, P5, tag);
+
+ auto reset = sqlite::stmt_execution{stmt.get()};
+ sqlite::eval0(dpp, reset);
+
+ if (!::sqlite3_changes(conn->db.get())) { // VersionNumber/Tag mismatch
+ impl = nullptr;
+ return -ECANCELED;
+ }
+ } catch (const sqlite::error& e) {
+ ldpp_dout(dpp, 20) << "zone update failed: " << e.what() << dendl;
+ if (e.code() == sqlite::errc::foreign_key_constraint) {
+ return -EINVAL; // refers to nonexistent RealmID
+ } else if (e.code() == sqlite::errc::busy) {
+ return -EBUSY;
+ }
+ return -EIO;
+ }
+ ++ver;
+ return 0;
+ }
+
+ int rename(const DoutPrefixProvider* dpp, optional_yield y,
+ RGWZoneParams& info, std::string_view new_name) override
+ {
+ Prefix prefix{*dpp, "dbconfig:sqlite:zone_rename "}; dpp = &prefix;
+
+ if (!impl) {
+ return -EINVAL; // can't write after conflict or delete
+ }
+ if (zone_id != info.id || zone_name != info.name) {
+ return -EINVAL; // can't modify zone id or name directly
+ }
+ if (new_name.empty()) {
+ ldpp_dout(dpp, 0) << "zonegroup cannot have an empty name" << dendl;
+ return -EINVAL;
+ }
+
+ try {
+ auto conn = impl->get(dpp);
+ auto& stmt = conn->statements["zone_rename"];
+ if (!stmt) {
+ const std::string sql = fmt::format(schema::zone_rename4, P1, P2, P2, P3);
+ stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+ }
+ auto binding = sqlite::stmt_binding{stmt.get()};
+ sqlite::bind_text(dpp, binding, P1, info.id);
+ sqlite::bind_text(dpp, binding, P2, new_name);
+ sqlite::bind_int(dpp, binding, P3, ver);
+ sqlite::bind_text(dpp, binding, P4, tag);
+
+ auto reset = sqlite::stmt_execution{stmt.get()};
+ sqlite::eval0(dpp, reset);
+
+ if (!::sqlite3_changes(conn->db.get())) { // VersionNumber/Tag mismatch
+ impl = nullptr;
+ return -ECANCELED;
+ }
+ } catch (const sqlite::error& e) {
+ ldpp_dout(dpp, 20) << "zone rename failed: " << e.what() << dendl;
+ if (e.code() == sqlite::errc::unique_constraint) {
+ return -EEXIST; // Name already taken
+ } else if (e.code() == sqlite::errc::busy) {
+ return -EBUSY;
+ }
+ return -EIO;
+ }
+ info.name = std::string{new_name};
+ ++ver;
+ return 0;
+ }
+
+ // Delete the zone tracked by this writer. The DELETE is guarded by the
+ // cached VersionNumber/VersionTag, so a concurrent modification causes it
+ // to match no rows and the call returns -ECANCELED.
+ int remove(const DoutPrefixProvider* dpp, optional_yield y) override
+ {
+ Prefix prefix{*dpp, "dbconfig:sqlite:zone_remove "}; dpp = &prefix;
+
+ if (!impl) {
+ return -EINVAL; // can't write after conflict or delete
+ }
+ try {
+ auto conn = impl->get(dpp);
+ auto& stmt = conn->statements["zone_del"];
+ if (!stmt) {
+ const std::string sql = fmt::format(schema::zone_delete3, P1, P2, P3);
+ stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+ }
+ auto binding = sqlite::stmt_binding{stmt.get()};
+ sqlite::bind_text(dpp, binding, P1, zone_id);
+ sqlite::bind_int(dpp, binding, P2, ver);
+ sqlite::bind_text(dpp, binding, P3, tag);
+
+ auto reset = sqlite::stmt_execution{stmt.get()};
+ sqlite::eval0(dpp, reset);
+
+ // disable the writer unconditionally: whether the row was deleted or
+ // the version check failed, no further writes are valid
+ impl = nullptr;
+ if (!::sqlite3_changes(conn->db.get())) { // VersionNumber/Tag mismatch
+ return -ECANCELED;
+ }
+ } catch (const sqlite::error& e) {
+ ldpp_dout(dpp, 20) << "zone delete failed: " << e.what() << dendl;
+ if (e.code() == sqlite::errc::busy) {
+ return -EBUSY;
+ }
+ return -EIO;
+ }
+ return 0;
+ }
+}; // SQLiteZoneWriter
+
+
+// Record 'zone_id' as the default zone for 'realm_id' in the DefaultZones
+// table. With 'exclusive' set, a plain INSERT is used so an existing row
+// causes a constraint failure; otherwise an upsert overwrites any prior
+// default for the realm.
+int SQLiteConfigStore::write_default_zone_id(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ std::string_view realm_id,
+ std::string_view zone_id)
+{
+ Prefix prefix{*dpp, "dbconfig:sqlite:write_default_zone_id "}; dpp = &prefix;
+
+ // a zone id is mandatory; realm_id may be empty
+ if (zone_id.empty()) {
+ ldpp_dout(dpp, 0) << "requires a zone id" << dendl;
+ return -EINVAL;
+ }
+
+ try {
+ auto conn = impl->get(dpp);
+ sqlite::stmt_ptr* stmt = nullptr;
+ if (exclusive) {
+ stmt = &conn->statements["def_zone_ins"];
+ if (!*stmt) {
+ *stmt = sqlite::prepare_statement(
+ dpp, conn->db.get(),
+ fmt::format(schema::default_zone_insert2, P1, P2));
+ }
+ } else {
+ stmt = &conn->statements["def_zone_ups"];
+ if (!*stmt) {
+ *stmt = sqlite::prepare_statement(
+ dpp, conn->db.get(),
+ fmt::format(schema::default_zone_upsert2, P1, P2));
+ }
+ }
+ auto params = sqlite::stmt_binding{stmt->get()};
+ sqlite::bind_text(dpp, params, P1, realm_id);
+ sqlite::bind_text(dpp, params, P2, zone_id);
+
+ auto exec = sqlite::stmt_execution{stmt->get()};
+ sqlite::eval0(dpp, exec);
+ } catch (const sqlite::error& e) {
+ ldpp_dout(dpp, 20) << "default zone insert failed: " << e.what() << dendl;
+ return e.code() == sqlite::errc::busy ? -EBUSY : -EIO;
+ }
+ return 0;
+}
+
+// Look up the default zone id recorded for 'realm_id'. Returns -ENOENT when
+// no default has been written for that realm.
+int SQLiteConfigStore::read_default_zone_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id,
+ std::string& zone_id)
+{
+ Prefix prefix{*dpp, "dbconfig:sqlite:read_default_zone_id "}; dpp = &prefix;
+
+ try {
+ auto conn = impl->get(dpp);
+ auto& stmt = conn->statements["def_zone_sel"];
+ if (!stmt) {
+ stmt = sqlite::prepare_statement(
+ dpp, conn->db.get(),
+ fmt::format(schema::default_zone_select1, P1));
+ }
+ auto params = sqlite::stmt_binding{stmt.get()};
+ sqlite::bind_text(dpp, params, P1, realm_id);
+
+ auto exec = sqlite::stmt_execution{stmt.get()};
+ sqlite::eval1(dpp, exec); // expect exactly one result row
+
+ zone_id = sqlite::column_text(exec, 0);
+ } catch (const sqlite::error& e) {
+ ldpp_dout(dpp, 20) << "default zone select failed: " << e.what() << dendl;
+ if (e.code() == sqlite::errc::done) {
+ return -ENOENT; // no default zone recorded for this realm
+ }
+ return e.code() == sqlite::errc::busy ? -EBUSY : -EIO;
+ }
+ return 0;
+}
+
+// Remove the default-zone record for 'realm_id'. Returns -ENOENT when no
+// such record existed.
+int SQLiteConfigStore::delete_default_zone_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id)
+{
+ Prefix prefix{*dpp, "dbconfig:sqlite:delete_default_zone_id "}; dpp = &prefix;
+
+ try {
+ auto conn = impl->get(dpp);
+ auto& stmt = conn->statements["def_zone_del"];
+ if (!stmt) {
+ const std::string sql = fmt::format(schema::default_zone_delete1, P1);
+ stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+ }
+ auto binding = sqlite::stmt_binding{stmt.get()};
+ sqlite::bind_text(dpp, binding, P1, realm_id);
+
+ auto reset = sqlite::stmt_execution{stmt.get()};
+ sqlite::eval0(dpp, reset);
+
+ // no rows changed means there was nothing to delete
+ if (!::sqlite3_changes(conn->db.get())) {
+ return -ENOENT;
+ }
+ } catch (const sqlite::error& e) {
+ ldpp_dout(dpp, 20) << "default zone delete failed: " << e.what() << dendl;
+ if (e.code() == sqlite::errc::busy) {
+ return -EBUSY;
+ }
+ return -EIO;
+ }
+ return 0;
+}
+
+
+// Insert (exclusive) or upsert a zone row from 'info'. The RGWZoneParams is
+// encoded into a blob for the Data column, and a fresh VersionNumber/Tag pair
+// is generated for optimistic-concurrency checks by later writers. On
+// success, optionally returns a ZoneWriter primed with that version.
+int SQLiteConfigStore::create_zone(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ const RGWZoneParams& info,
+ std::unique_ptr<sal::ZoneWriter>* writer)
+{
+ Prefix prefix{*dpp, "dbconfig:sqlite:create_zone "}; dpp = &prefix;
+
+ if (info.id.empty()) {
+ ldpp_dout(dpp, 0) << "zone cannot have an empty id" << dendl;
+ return -EINVAL;
+ }
+ if (info.name.empty()) {
+ ldpp_dout(dpp, 0) << "zone cannot have an empty name" << dendl;
+ return -EINVAL;
+ }
+
+ // new rows always start at version 1 with a fresh random tag
+ int ver = 1;
+ auto tag = generate_version_tag(dpp->get_cct());
+
+ bufferlist bl;
+ encode(info, bl);
+ const auto data = std::string_view{bl.c_str(), bl.length()};
+
+ try {
+ auto conn = impl->get(dpp);
+ sqlite::stmt_ptr* stmt = nullptr;
+ if (exclusive) {
+ // plain INSERT: fails on an existing ID or Name
+ stmt = &conn->statements["zone_ins"];
+ if (!*stmt) {
+ const std::string sql = fmt::format(schema::zone_insert6,
+ P1, P2, P3, P4, P5, P6);
+ *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+ }
+ } else {
+ // upsert: replaces an existing row with the same ID
+ stmt = &conn->statements["zone_ups"];
+ if (!*stmt) {
+ const std::string sql = fmt::format(schema::zone_upsert6,
+ P1, P2, P3, P4, P5, P6);
+ *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+ }
+ }
+ // bindings follow the column order of zone_insert6/zone_upsert6:
+ // (ID, Name, RealmID, Data, VersionNumber, VersionTag)
+ auto binding = sqlite::stmt_binding{stmt->get()};
+ sqlite::bind_text(dpp, binding, P1, info.id);
+ sqlite::bind_text(dpp, binding, P2, info.name);
+ sqlite::bind_text(dpp, binding, P3, info.realm_id);
+ sqlite::bind_text(dpp, binding, P4, data);
+ sqlite::bind_int(dpp, binding, P5, ver);
+ sqlite::bind_text(dpp, binding, P6, tag);
+
+ auto reset = sqlite::stmt_execution{stmt->get()};
+ sqlite::eval0(dpp, reset);
+ } catch (const sqlite::error& e) {
+ ldpp_dout(dpp, 20) << "zone insert failed: " << e.what() << dendl;
+ if (e.code() == sqlite::errc::foreign_key_constraint) {
+ return -EINVAL; // refers to nonexistent RealmID
+ } else if (e.code() == sqlite::errc::primary_key_constraint) {
+ return -EEXIST; // ID already taken
+ } else if (e.code() == sqlite::errc::unique_constraint) {
+ return -EEXIST; // Name already taken
+ } else if (e.code() == sqlite::errc::busy) {
+ return -EBUSY;
+ }
+ return -EIO;
+ }
+
+ if (writer) {
+ *writer = std::make_unique<SQLiteZoneWriter>(
+ impl.get(), ver, std::move(tag), info.id, info.name);
+ }
+ return 0;
+}
+
+// Load a zone by its ID. On success, fills 'info' from the decoded Data
+// column and, if requested, returns a ZoneWriter primed with the row's
+// VersionNumber/Tag for optimistic writes.
+int SQLiteConfigStore::read_zone_by_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view zone_id,
+ RGWZoneParams& info,
+ std::unique_ptr<sal::ZoneWriter>* writer)
+{
+ Prefix prefix{*dpp, "dbconfig:sqlite:read_zone_by_id "}; dpp = &prefix;
+
+ if (zone_id.empty()) {
+ ldpp_dout(dpp, 0) << "requires a zone id" << dendl;
+ return -EINVAL;
+ }
+
+ ZoneRow row;
+ try {
+ auto conn = impl->get(dpp);
+ auto& stmt = conn->statements["zone_sel_id"];
+ if (!stmt) {
+ stmt = sqlite::prepare_statement(
+ dpp, conn->db.get(), fmt::format(schema::zone_select_id1, P1));
+ }
+ auto params = sqlite::stmt_binding{stmt.get()};
+ sqlite::bind_text(dpp, params, P1, zone_id);
+
+ auto exec = sqlite::stmt_execution{stmt.get()};
+ sqlite::eval1(dpp, exec); // expect exactly one result row
+
+ read_zone_row(exec, row);
+ } catch (const sqlite::error& e) {
+ ldpp_dout(dpp, 20) << "zone select failed: " << e.what() << dendl;
+ if (e.code() == sqlite::errc::done) {
+ return -ENOENT; // no zone with this id
+ }
+ return e.code() == sqlite::errc::busy ? -EBUSY : -EIO;
+ }
+
+ info = std::move(row.info);
+ if (writer) {
+ *writer = std::make_unique<SQLiteZoneWriter>(
+ impl.get(), row.ver, std::move(row.tag), info.id, info.name);
+ }
+ return 0;
+}
+
+// Load a zone by its unique Name. Mirrors read_zone_by_id() but keys the
+// lookup on the Name column; returns -ENOENT if no zone matches.
+int SQLiteConfigStore::read_zone_by_name(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view zone_name,
+ RGWZoneParams& info,
+ std::unique_ptr<sal::ZoneWriter>* writer)
+{
+ Prefix prefix{*dpp, "dbconfig:sqlite:read_zone_by_name "}; dpp = &prefix;
+
+ if (zone_name.empty()) {
+ ldpp_dout(dpp, 0) << "requires a zone name" << dendl;
+ return -EINVAL;
+ }
+
+ ZoneRow row;
+ try {
+ auto conn = impl->get(dpp);
+ auto& stmt = conn->statements["zone_sel_name"];
+ if (!stmt) {
+ const std::string sql = fmt::format(schema::zone_select_name1, P1);
+ stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+ }
+ auto binding = sqlite::stmt_binding{stmt.get()};
+ sqlite::bind_text(dpp, binding, P1, zone_name);
+
+ auto reset = sqlite::stmt_execution{stmt.get()};
+ sqlite::eval1(dpp, reset);
+
+ read_zone_row(reset, row);
+ } catch (const sqlite::error& e) {
+ ldpp_dout(dpp, 20) << "zone select failed: " << e.what() << dendl;
+ if (e.code() == sqlite::errc::done) {
+ return -ENOENT;
+ } else if (e.code() == sqlite::errc::busy) {
+ return -EBUSY;
+ }
+ return -EIO;
+ }
+
+ info = std::move(row.info);
+ if (writer) {
+ // the writer carries the row's version for optimistic-concurrency writes
+ *writer = std::make_unique<SQLiteZoneWriter>(
+ impl.get(), row.ver, std::move(row.tag), info.id, info.name);
+ }
+ return 0;
+}
+
+// Load the zone referenced by the DefaultZones table. NOTE(review): the
+// 'realm_id' parameter is not used here — zone_select_default0 takes no
+// parameters and joins DefaultZones without filtering on RealmID, so this
+// returns an arbitrary default row when multiple realms have defaults.
+int SQLiteConfigStore::read_default_zone(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id,
+ RGWZoneParams& info,
+ std::unique_ptr<sal::ZoneWriter>* writer)
+{
+ Prefix prefix{*dpp, "dbconfig:sqlite:read_default_zone "}; dpp = &prefix;
+
+ ZoneRow row;
+ try {
+ auto conn = impl->get(dpp);
+ auto& stmt = conn->statements["zone_sel_def"];
+ if (!stmt) {
+ static constexpr std::string_view sql = schema::zone_select_default0;
+ stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+ }
+ // no bindings: the statement has no parameters
+ auto reset = sqlite::stmt_execution{stmt.get()};
+ sqlite::eval1(dpp, reset);
+
+ read_zone_row(reset, row);
+ } catch (const sqlite::error& e) {
+ ldpp_dout(dpp, 20) << "zone select failed: " << e.what() << dendl;
+ if (e.code() == sqlite::errc::done) {
+ return -ENOENT;
+ } else if (e.code() == sqlite::errc::busy) {
+ return -EBUSY;
+ }
+ return -EIO;
+ }
+
+ info = std::move(row.info);
+ if (writer) {
+ *writer = std::make_unique<SQLiteZoneWriter>(
+ impl.get(), row.ver, std::move(row.tag), info.id, info.name);
+ }
+ return 0;
+}
+
+// List zone names in ascending order, starting after 'marker', writing up to
+// entries.size() names into 'entries' and recording the page in 'result'.
+int SQLiteConfigStore::list_zone_names(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ const std::string& marker,
+ std::span<std::string> entries,
+ sal::ListResult<std::string>& result)
+{
+ Prefix prefix{*dpp, "dbconfig:sqlite:list_zone_names "}; dpp = &prefix;
+
+ try {
+ auto conn = impl->get(dpp);
+ auto& stmt = conn->statements["zone_sel_names"];
+ if (!stmt) {
+ stmt = sqlite::prepare_statement(
+ dpp, conn->db.get(),
+ fmt::format(schema::zone_select_names2, P1, P2));
+ }
+ // page: Name > marker, LIMIT entries.size()
+ auto params = sqlite::stmt_binding{stmt.get()};
+ sqlite::bind_text(dpp, params, P1, marker);
+ sqlite::bind_int(dpp, params, P2, entries.size());
+
+ auto exec = sqlite::stmt_execution{stmt.get()};
+ read_text_rows(dpp, exec, entries, result);
+ } catch (const sqlite::error& e) {
+ ldpp_dout(dpp, 20) << "zone select failed: " << e.what() << dendl;
+ return e.code() == sqlite::errc::busy ? -EBUSY : -EIO;
+ }
+ return 0;
+}
+
+
+// PeriodConfig
+
+// Read and decode the RGWPeriodConfig blob stored for 'realm_id'.
+// Returns -ENOENT when no row exists, -EIO on decode failure.
+int SQLiteConfigStore::read_period_config(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id,
+ RGWPeriodConfig& info)
+{
+ Prefix prefix{*dpp, "dbconfig:sqlite:read_period_config "}; dpp = &prefix;
+
+ try {
+ auto conn = impl->get(dpp);
+ auto& stmt = conn->statements["period_conf_sel"];
+ if (!stmt) {
+ const std::string sql = fmt::format(schema::period_config_select1, P1);
+ stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+ }
+ auto binding = sqlite::stmt_binding{stmt.get()};
+ sqlite::bind_text(dpp, binding, P1, realm_id);
+
+ auto reset = sqlite::stmt_execution{stmt.get()};
+ sqlite::eval1(dpp, reset);
+
+ std::string data = sqlite::column_text(reset, 0);
+ // static_from_string() aliases data's buffer without copying; 'data'
+ // stays in scope until decode() completes, so this is safe
+ bufferlist bl = bufferlist::static_from_string(data);
+ auto p = bl.cbegin();
+ decode(info, p);
+
+ } catch (const buffer::error& e) {
+ ldpp_dout(dpp, 20) << "period config decode failed: " << e.what() << dendl;
+ return -EIO;
+ } catch (const sqlite::error& e) {
+ ldpp_dout(dpp, 20) << "period config select failed: " << e.what() << dendl;
+ if (e.code() == sqlite::errc::done) {
+ return -ENOENT;
+ } else if (e.code() == sqlite::errc::busy) {
+ return -EBUSY;
+ }
+ return -EIO;
+ }
+ return 0;
+}
+
+// Insert (exclusive) or upsert the encoded RGWPeriodConfig blob for the
+// given realm. Returns -EEXIST if 'exclusive' and a row already exists.
+int SQLiteConfigStore::write_period_config(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ std::string_view realm_id,
+ const RGWPeriodConfig& info)
+{
+ Prefix prefix{*dpp, "dbconfig:sqlite:write_period_config "}; dpp = &prefix;
+
+ // serialize the config into the blob stored in the Data column
+ bufferlist bl;
+ encode(info, bl);
+ const auto data = std::string_view{bl.c_str(), bl.length()};
+
+ try {
+ auto conn = impl->get(dpp);
+ sqlite::stmt_ptr* stmt = nullptr;
+ if (exclusive) {
+ stmt = &conn->statements["period_conf_ins"];
+ if (!*stmt) {
+ const std::string sql = fmt::format(schema::period_config_insert2, P1, P2);
+ *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+ }
+ } else {
+ stmt = &conn->statements["period_conf_ups"];
+ if (!*stmt) {
+ const std::string sql = fmt::format(schema::period_config_upsert2, P1, P2);
+ *stmt = sqlite::prepare_statement(dpp, conn->db.get(), sql);
+ }
+ }
+ auto binding = sqlite::stmt_binding{stmt->get()};
+ sqlite::bind_text(dpp, binding, P1, realm_id);
+ sqlite::bind_text(dpp, binding, P2, data);
+
+ auto reset = sqlite::stmt_execution{stmt->get()};
+ sqlite::eval0(dpp, reset);
+ } catch (const buffer::error& e) {
+ // write path: a buffer error here would come from encoding, not decoding
+ ldpp_dout(dpp, 20) << "period config encode failed: " << e.what() << dendl;
+ return -EIO;
+ } catch (const sqlite::error& e) {
+ ldpp_dout(dpp, 20) << "period config insert failed: " << e.what() << dendl;
+ if (e.code() == sqlite::errc::primary_key_constraint) {
+ return -EEXIST;
+ } else if (e.code() == sqlite::errc::busy) {
+ return -EBUSY;
+ }
+ return -EIO;
+ }
+ return 0;
+}
+
+namespace {
+
+// sqlite3_exec() callback for "PRAGMA user_version": expects a single
+// "user_version" column and parses it into the uint32_t pointed to by
+// 'user'. Returns nonzero to abort the query on unexpected results.
+int version_cb(void* user, int count, char** values, char** names)
+{
+ if (count != 1) {
+ return EINVAL;
+ }
+ std::string_view name = names[0];
+ if (name != "user_version") {
+ return EINVAL;
+ }
+ std::string_view value = values[0];
+ // pass data()/data()+size(): std::from_chars requires const char*
+ // pointers, and string_view's iterator type is implementation-defined
+ auto result = std::from_chars(value.data(), value.data() + value.size(),
+ *reinterpret_cast<uint32_t*>(user));
+ if (result.ec != std::errc{}) {
+ return static_cast<int>(result.ec);
+ }
+ return 0;
+}
+
+// Bring the database schema up to date. PRAGMA user_version stores the
+// number of migrations already applied; any remaining entries from
+// schema::migrations are run inside a single transaction, then user_version
+// is updated and the transaction committed. Throws sqlite::error on failure.
+void apply_schema_migrations(const DoutPrefixProvider* dpp, sqlite3* db)
+{
+ sqlite::execute(dpp, db, "PRAGMA foreign_keys = ON", nullptr, nullptr);
+
+ // initiate a transaction and read the current schema version
+ uint32_t version = 0;
+ sqlite::execute(dpp, db, "BEGIN; PRAGMA user_version", version_cb, &version);
+
+ const uint32_t initial_version = version;
+ ldpp_dout(dpp, 4) << "current schema version " << version << dendl;
+
+ // use the version as an index into schema::migrations
+ auto m = std::next(schema::migrations.begin(), version);
+
+ for (; m != schema::migrations.end(); ++m, ++version) {
+ try {
+ sqlite::execute(dpp, db, m->up, nullptr, nullptr);
+ } catch (const sqlite::error&) {
+ // the exception propagates; the open transaction is never committed
+ ldpp_dout(dpp, -1) << "ERROR: schema migration failed on v" << version
+ << ": " << m->description << dendl;
+ throw;
+ }
+ }
+
+ if (version > initial_version) {
+ // update the user_version and commit the transaction
+ const auto commit = fmt::format("PRAGMA user_version = {}; COMMIT", version);
+ sqlite::execute(dpp, db, commit.c_str(), nullptr, nullptr);
+
+ ldpp_dout(dpp, 4) << "upgraded database schema to version " << version << dendl;
+ } else {
+ // nothing to commit
+ sqlite::execute(dpp, db, "ROLLBACK", nullptr, nullptr);
+ }
+}
+
+} // anonymous namespace
+
+
+// Open (creating if necessary) the sqlite database at 'uri', apply any
+// pending schema migrations, and return the ConfigStore wrapping it.
+// Throws on connection or migration failure.
+auto create_sqlite_store(const DoutPrefixProvider* dpp, const std::string& uri)
+ -> std::unique_ptr<config::SQLiteConfigStore>
+{
+ Prefix prefix{*dpp, "dbconfig:sqlite:create_sqlite_store "}; dpp = &prefix;
+
+ // build the connection pool
+ int flags = SQLITE_OPEN_CREATE | SQLITE_OPEN_URI | SQLITE_OPEN_READWRITE |
+ SQLITE_OPEN_NOMUTEX;
+ auto factory = sqlite::ConnectionFactory{uri, flags};
+
+ // sqlite does not support concurrent writers. we enforce this limitation by
+ // using a connection pool of size=1
+ static constexpr size_t max_connections = 1;
+ auto impl = std::make_unique<SQLiteImpl>(std::move(factory), max_connections);
+
+ // open a connection to apply schema migrations
+ auto conn = impl->get(dpp);
+ apply_schema_migrations(dpp, conn->db.get());
+
+ return std::make_unique<SQLiteConfigStore>(std::move(impl));
+}
+
+} // namespace rgw::dbstore::config
diff --git a/src/rgw/driver/dbstore/config/sqlite.h b/src/rgw/driver/dbstore/config/sqlite.h
new file mode 100644
index 000000000..d79e04072
--- /dev/null
+++ b/src/rgw/driver/dbstore/config/sqlite.h
@@ -0,0 +1,172 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "rgw_sal_config.h"
+
+class DoutPrefixProvider;
+
+namespace rgw::dbstore::config {
+
+struct SQLiteImpl;
+
+// sal::ConfigStore backed by a sqlite database. All operations share a
+// single-connection pool (see create_sqlite_store) and cache their prepared
+// statements per connection.
+class SQLiteConfigStore : public sal::ConfigStore {
+ public:
+ explicit SQLiteConfigStore(std::unique_ptr<SQLiteImpl> impl);
+ ~SQLiteConfigStore() override;
+
+ // default realm id
+ int write_default_realm_id(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ std::string_view realm_id) override;
+ int read_default_realm_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string& realm_id) override;
+ int delete_default_realm_id(const DoutPrefixProvider* dpp,
+ optional_yield y) override;
+
+ // realms
+ int create_realm(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ const RGWRealm& info,
+ std::unique_ptr<sal::RealmWriter>* writer) override;
+ int read_realm_by_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id,
+ RGWRealm& info,
+ std::unique_ptr<sal::RealmWriter>* writer) override;
+ int read_realm_by_name(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_name,
+ RGWRealm& info,
+ std::unique_ptr<sal::RealmWriter>* writer) override;
+ int read_default_realm(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ RGWRealm& info,
+ std::unique_ptr<sal::RealmWriter>* writer) override;
+ int read_realm_id(const DoutPrefixProvider* dpp,
+ optional_yield y, std::string_view realm_name,
+ std::string& realm_id) override;
+ int realm_notify_new_period(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ const RGWPeriod& period) override;
+ int list_realm_names(const DoutPrefixProvider* dpp,
+ optional_yield y, const std::string& marker,
+ std::span<std::string> entries,
+ sal::ListResult<std::string>& result) override;
+
+ // periods
+ int create_period(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ const RGWPeriod& info) override;
+ int read_period(const DoutPrefixProvider* dpp,
+ optional_yield y, std::string_view period_id,
+ std::optional<uint32_t> epoch, RGWPeriod& info) override;
+ int delete_period(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view period_id) override;
+ int list_period_ids(const DoutPrefixProvider* dpp,
+ optional_yield y, const std::string& marker,
+ std::span<std::string> entries,
+ sal::ListResult<std::string>& result) override;
+
+ // default zonegroup id (per realm)
+ int write_default_zonegroup_id(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ std::string_view realm_id,
+ std::string_view zonegroup_id) override;
+ int read_default_zonegroup_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id,
+ std::string& zonegroup_id) override;
+ int delete_default_zonegroup_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id) override;
+
+ // zonegroups
+ int create_zonegroup(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ const RGWZoneGroup& info,
+ std::unique_ptr<sal::ZoneGroupWriter>* writer) override;
+ int read_zonegroup_by_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view zonegroup_id,
+ RGWZoneGroup& info,
+ std::unique_ptr<sal::ZoneGroupWriter>* writer) override;
+ int read_zonegroup_by_name(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view zonegroup_name,
+ RGWZoneGroup& info,
+ std::unique_ptr<sal::ZoneGroupWriter>* writer) override;
+ int read_default_zonegroup(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id,
+ RGWZoneGroup& info,
+ std::unique_ptr<sal::ZoneGroupWriter>* writer) override;
+ int list_zonegroup_names(const DoutPrefixProvider* dpp,
+ optional_yield y, const std::string& marker,
+ std::span<std::string> entries,
+ sal::ListResult<std::string>& result) override;
+
+ // default zone id (per realm)
+ int write_default_zone_id(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ std::string_view realm_id,
+ std::string_view zone_id) override;
+ int read_default_zone_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id,
+ std::string& zone_id) override;
+ int delete_default_zone_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id) override;
+
+ // zones
+ int create_zone(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ const RGWZoneParams& info,
+ std::unique_ptr<sal::ZoneWriter>* writer) override;
+ int read_zone_by_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view zone_id,
+ RGWZoneParams& info,
+ std::unique_ptr<sal::ZoneWriter>* writer) override;
+ int read_zone_by_name(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view zone_name,
+ RGWZoneParams& info,
+ std::unique_ptr<sal::ZoneWriter>* writer) override;
+ int read_default_zone(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id,
+ RGWZoneParams& info,
+ std::unique_ptr<sal::ZoneWriter>* writer) override;
+ int list_zone_names(const DoutPrefixProvider* dpp,
+ optional_yield y, const std::string& marker,
+ std::span<std::string> entries,
+ sal::ListResult<std::string>& result) override;
+
+ // period config (per realm)
+ int read_period_config(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id,
+ RGWPeriodConfig& info) override;
+ int write_period_config(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ std::string_view realm_id,
+ const RGWPeriodConfig& info) override;
+
+ private:
+ // connection pool and prepared-statement cache
+ std::unique_ptr<SQLiteImpl> impl;
+}; // SQLiteConfigStore
+
+
+auto create_sqlite_store(const DoutPrefixProvider* dpp, const std::string& uri)
+ -> std::unique_ptr<config::SQLiteConfigStore>;
+
+} // namespace rgw::dbstore::config
diff --git a/src/rgw/driver/dbstore/config/sqlite_schema.h b/src/rgw/driver/dbstore/config/sqlite_schema.h
new file mode 100644
index 000000000..c8a8fce3e
--- /dev/null
+++ b/src/rgw/driver/dbstore/config/sqlite_schema.h
@@ -0,0 +1,299 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <initializer_list>
+
+namespace rgw::dbstore::config::schema {
+
+struct Migration {
+ // human-readable description to help with debugging migration errors
+ const char* description = nullptr;
+ // series of sql statements to apply the schema migration
+ const char* up = nullptr;
+ // series of sql statements to undo the schema migration
+ const char* down = nullptr;
+};
+
+// Ordered list of schema migrations. apply_schema_migrations() uses the
+// database's PRAGMA user_version as an index into this list, so entries must
+// only ever be appended, never reordered or removed.
+static constexpr std::initializer_list<Migration> migrations {{
+ .description = "create the initial ConfigStore tables",
+ .up = R"(
+CREATE TABLE IF NOT EXISTS Realms (
+ ID TEXT PRIMARY KEY NOT NULL,
+ Name TEXT UNIQUE NOT NULL,
+ CurrentPeriod TEXT,
+ Epoch INTEGER DEFAULT 0,
+ VersionNumber INTEGER,
+ VersionTag TEXT
+);
+CREATE TABLE IF NOT EXISTS Periods (
+ ID TEXT NOT NULL,
+ Epoch INTEGER DEFAULT 0,
+ RealmID TEXT NOT NULL REFERENCES Realms (ID),
+ Data TEXT NOT NULL,
+ PRIMARY KEY (ID, Epoch)
+);
+CREATE TABLE IF NOT EXISTS PeriodConfigs (
+ RealmID TEXT PRIMARY KEY NOT NULL REFERENCES Realms (ID),
+ Data TEXT NOT NULL
+);
+CREATE TABLE IF NOT EXISTS ZoneGroups (
+ ID TEXT PRIMARY KEY NOT NULL,
+ Name TEXT UNIQUE NOT NULL,
+ RealmID TEXT NOT NULL REFERENCES Realms (ID),
+ Data TEXT NOT NULL,
+ VersionNumber INTEGER,
+ VersionTag TEXT
+);
+CREATE TABLE IF NOT EXISTS Zones (
+ ID TEXT PRIMARY KEY NOT NULL,
+ Name TEXT UNIQUE NOT NULL,
+ RealmID TEXT NOT NULL REFERENCES Realms (ID),
+ Data TEXT NOT NULL,
+ VersionNumber INTEGER,
+ VersionTag TEXT
+);
+CREATE TABLE IF NOT EXISTS DefaultRealms (
+ ID TEXT,
+ Empty TEXT PRIMARY KEY
+);
+CREATE TABLE IF NOT EXISTS DefaultZoneGroups (
+ ID TEXT,
+ RealmID TEXT PRIMARY KEY REFERENCES Realms (ID)
+);
+CREATE TABLE IF NOT EXISTS DefaultZones (
+ ID TEXT,
+ RealmID TEXT PRIMARY KEY REFERENCES Realms (ID)
+);
+)",
+ .down = R"(
+DROP TABLE IF EXISTS Realms;
+DROP TABLE IF EXISTS Periods;
+DROP TABLE IF EXISTS PeriodConfigs;
+DROP TABLE IF EXISTS ZoneGroups;
+DROP TABLE IF EXISTS Zones;
+DROP TABLE IF EXISTS DefaultRealms;
+DROP TABLE IF EXISTS DefaultZoneGroups;
+DROP TABLE IF EXISTS DefaultZones;
+)"
+ }
+};
+
+
+// Statement templates below are fmt::format() patterns: each '{}' / '{n}'
+// placeholder is substituted with a named bound-parameter (P1..P6) at
+// prepare time, and the numeric suffix on each constant gives the number of
+// distinct parameters the statement takes.
+
+// DefaultRealms
+
+static constexpr const char* default_realm_insert1 =
+"INSERT INTO DefaultRealms (ID, Empty) VALUES ({}, '')";
+
+static constexpr const char* default_realm_upsert1 =
+R"(INSERT INTO DefaultRealms (ID, Empty) VALUES ({0}, '')
+ON CONFLICT(Empty) DO UPDATE SET ID = {0})";
+
+static constexpr const char* default_realm_select0 =
+"SELECT ID FROM DefaultRealms LIMIT 1";
+
+static constexpr const char* default_realm_delete0 =
+"DELETE FROM DefaultRealms";
+
+
+// Realms
+// updates/deletes are guarded by VersionNumber/VersionTag for optimistic
+// concurrency; a non-matching version updates zero rows
+
+static constexpr const char* realm_update5 =
+"UPDATE Realms SET CurrentPeriod = {1}, Epoch = {2}, VersionNumber = {3} + 1 \
+WHERE ID = {0} AND VersionNumber = {3} AND VersionTag = {4}";
+
+static constexpr const char* realm_rename4 =
+"UPDATE Realms SET Name = {1}, VersionNumber = {2} + 1 \
+WHERE ID = {0} AND VersionNumber = {2} AND VersionTag = {3}";
+
+static constexpr const char* realm_delete3 =
+"DELETE FROM Realms WHERE ID = {} AND VersionNumber = {} AND VersionTag = {}";
+
+static constexpr const char* realm_insert4 =
+"INSERT INTO Realms (ID, Name, VersionNumber, VersionTag) \
+VALUES ({}, {}, {}, {})";
+
+static constexpr const char* realm_upsert4 =
+"INSERT INTO Realms (ID, Name, VersionNumber, VersionTag) \
+VALUES ({0}, {1}, {2}, {3}) \
+ON CONFLICT(ID) DO UPDATE SET Name = {1}, \
+VersionNumber = {2}, VersionTag = {3}";
+
+static constexpr const char* realm_select_id1 =
+"SELECT * FROM Realms WHERE ID = {} LIMIT 1";
+
+static constexpr const char* realm_select_name1 =
+"SELECT * FROM Realms WHERE Name = {} LIMIT 1";
+
+static constexpr const char* realm_select_default0 =
+"SELECT r.* FROM Realms r \
+INNER JOIN DefaultRealms d \
+ON d.ID = r.ID LIMIT 1";
+
+static constexpr const char* realm_select_names2 =
+"SELECT Name FROM Realms WHERE Name > {} \
+ORDER BY Name ASC LIMIT {}";
+
+
+// Periods
+
+static constexpr const char* period_insert4 =
+"INSERT INTO Periods (ID, Epoch, RealmID, Data) \
+VALUES ({}, {}, {}, {})";
+
+static constexpr const char* period_upsert4 =
+"INSERT INTO Periods (ID, Epoch, RealmID, Data) \
+VALUES ({0}, {1}, {2}, {3}) \
+ON CONFLICT DO UPDATE SET RealmID = {2}, Data = {3}";
+
+static constexpr const char* period_select_epoch2 =
+"SELECT * FROM Periods WHERE ID = {} AND Epoch = {} LIMIT 1";
+
+static constexpr const char* period_select_latest1 =
+"SELECT * FROM Periods WHERE ID = {} ORDER BY Epoch DESC LIMIT 1";
+
+static constexpr const char* period_delete1 =
+"DELETE FROM Periods WHERE ID = {}";
+
+static constexpr const char* period_select_ids2 =
+"SELECT ID FROM Periods WHERE ID > {} ORDER BY ID ASC LIMIT {}";
+
+
+// DefaultZoneGroups
+
+static constexpr const char* default_zonegroup_insert2 =
+"INSERT INTO DefaultZoneGroups (RealmID, ID) VALUES ({}, {})";
+
+static constexpr const char* default_zonegroup_upsert2 =
+"INSERT INTO DefaultZoneGroups (RealmID, ID) \
+VALUES ({0}, {1}) \
+ON CONFLICT(RealmID) DO UPDATE SET ID = {1}";
+
+static constexpr const char* default_zonegroup_select1 =
+"SELECT ID FROM DefaultZoneGroups WHERE RealmID = {}";
+
+static constexpr const char* default_zonegroup_delete1 =
+"DELETE FROM DefaultZoneGroups WHERE RealmID = {}";
+
+
+// ZoneGroups
+
+static constexpr const char* zonegroup_update5 =
+"UPDATE ZoneGroups SET RealmID = {1}, Data = {2}, VersionNumber = {3} + 1 \
+WHERE ID = {0} AND VersionNumber = {3} AND VersionTag = {4}";
+
+static constexpr const char* zonegroup_rename4 =
+"UPDATE ZoneGroups SET Name = {1}, VersionNumber = {2} + 1 \
+WHERE ID = {0} AND VersionNumber = {2} AND VersionTag = {3}";
+
+static constexpr const char* zonegroup_delete3 =
+"DELETE FROM ZoneGroups WHERE ID = {} \
+AND VersionNumber = {} AND VersionTag = {}";
+
+static constexpr const char* zonegroup_insert6 =
+"INSERT INTO ZoneGroups (ID, Name, RealmID, Data, VersionNumber, VersionTag) \
+VALUES ({}, {}, {}, {}, {}, {})";
+
+static constexpr const char* zonegroup_upsert6 =
+"INSERT INTO ZoneGroups (ID, Name, RealmID, Data, VersionNumber, VersionTag) \
+VALUES ({0}, {1}, {2}, {3}, {4}, {5}) \
+ON CONFLICT (ID) DO UPDATE SET Name = {1}, RealmID = {2}, \
+Data = {3}, VersionNumber = {4}, VersionTag = {5}";
+
+static constexpr const char* zonegroup_select_id1 =
+"SELECT * FROM ZoneGroups WHERE ID = {} LIMIT 1";
+
+static constexpr const char* zonegroup_select_name1 =
+"SELECT * FROM ZoneGroups WHERE Name = {} LIMIT 1";
+
+static constexpr const char* zonegroup_select_default0 =
+"SELECT z.* FROM ZoneGroups z \
+INNER JOIN DefaultZoneGroups d \
+ON d.ID = z.ID LIMIT 1";
+
+static constexpr const char* zonegroup_select_names2 =
+"SELECT Name FROM ZoneGroups WHERE Name > {} \
+ORDER BY Name ASC LIMIT {}";
+
+
+// DefaultZones
+
+static constexpr const char* default_zone_insert2 =
+"INSERT INTO DefaultZones (RealmID, ID) VALUES ({}, {})";
+
+static constexpr const char* default_zone_upsert2 =
+"INSERT INTO DefaultZones (RealmID, ID) VALUES ({0}, {1}) \
+ON CONFLICT(RealmID) DO UPDATE SET ID = {1}";
+
+static constexpr const char* default_zone_select1 =
+"SELECT ID FROM DefaultZones WHERE RealmID = {}";
+
+static constexpr const char* default_zone_delete1 =
+"DELETE FROM DefaultZones WHERE RealmID = {}";
+
+
+// Zones
+
+static constexpr const char* zone_update5 =
+"UPDATE Zones SET RealmID = {1}, Data = {2}, VersionNumber = {3} + 1 \
+WHERE ID = {0} AND VersionNumber = {3} AND VersionTag = {4}";
+
+static constexpr const char* zone_rename4 =
+"UPDATE Zones SET Name = {1}, VersionNumber = {2} + 1 \
+WHERE ID = {0} AND VersionNumber = {2} AND VersionTag = {3}";
+
+static constexpr const char* zone_delete3 =
+"DELETE FROM Zones WHERE ID = {} AND VersionNumber = {} AND VersionTag = {}";
+
+static constexpr const char* zone_insert6 =
+"INSERT INTO Zones (ID, Name, RealmID, Data, VersionNumber, VersionTag) \
+VALUES ({}, {}, {}, {}, {}, {})";
+
+static constexpr const char* zone_upsert6 =
+"INSERT INTO Zones (ID, Name, RealmID, Data, VersionNumber, VersionTag) \
+VALUES ({0}, {1}, {2}, {3}, {4}, {5}) \
+ON CONFLICT (ID) DO UPDATE SET Name = {1}, RealmID = {2}, \
+Data = {3}, VersionNumber = {4}, VersionTag = {5}";
+
+static constexpr const char* zone_select_id1 =
+"SELECT * FROM Zones WHERE ID = {} LIMIT 1";
+
+static constexpr const char* zone_select_name1 =
+"SELECT * FROM Zones WHERE Name = {} LIMIT 1";
+
+static constexpr const char* zone_select_default0 =
+"SELECT z.* FROM Zones z \
+INNER JOIN DefaultZones d \
+ON d.ID = z.ID LIMIT 1";
+
+static constexpr const char* zone_select_names2 =
+"SELECT Name FROM Zones WHERE Name > {} \
+ORDER BY Name ASC LIMIT {}";
+
+
+// PeriodConfigs
+
+static constexpr const char* period_config_insert2 =
+"INSERT INTO PeriodConfigs (RealmID, Data) VALUES ({}, {})";
+
+static constexpr const char* period_config_upsert2 =
+"INSERT INTO PeriodConfigs (RealmID, Data) VALUES ({0}, {1}) \
+ON CONFLICT (RealmID) DO UPDATE SET Data = {1}";
+
+static constexpr const char* period_config_select1 =
+"SELECT Data FROM PeriodConfigs WHERE RealmID = {} LIMIT 1";
+
+} // namespace rgw::dbstore::config::schema
diff --git a/src/rgw/driver/dbstore/config/store.cc b/src/rgw/driver/dbstore/config/store.cc
new file mode 100644
index 000000000..569a093b7
--- /dev/null
+++ b/src/rgw/driver/dbstore/config/store.cc
@@ -0,0 +1,38 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <stdexcept>
+
+#include <fmt/format.h>
+
+#include "store.h"
+#ifdef SQLITE_ENABLED
+#include "sqlite.h"
+#endif
+
+namespace rgw::dbstore {
+
+auto create_config_store(const DoutPrefixProvider* dpp, const std::string& uri)
+ -> std::unique_ptr<sal::ConfigStore>
+{
+#ifdef SQLITE_ENABLED
+ if (uri.starts_with("file:")) {
+ return config::create_sqlite_store(dpp, uri);
+ }
+#endif
+ throw std::runtime_error(fmt::format("unrecognized URI {}", uri));
+}
+
+} // namespace rgw::dbstore
diff --git a/src/rgw/driver/dbstore/config/store.h b/src/rgw/driver/dbstore/config/store.h
new file mode 100644
index 000000000..553d9f709
--- /dev/null
+++ b/src/rgw/driver/dbstore/config/store.h
@@ -0,0 +1,27 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <memory>
+#include "rgw_sal_config.h"
+
+namespace rgw::dbstore {
+
// ConfigStore factory: builds a backend selected by the URI scheme;
// throws std::runtime_error if the URI is not recognized.
auto create_config_store(const DoutPrefixProvider* dpp, const std::string& uri)
  -> std::unique_ptr<sal::ConfigStore>;
+
+} // namespace rgw::dbstore
diff --git a/src/rgw/driver/dbstore/dbstore_main.cc b/src/rgw/driver/dbstore/dbstore_main.cc
new file mode 100644
index 000000000..4fff38ced
--- /dev/null
+++ b/src/rgw/driver/dbstore/dbstore_main.cc
@@ -0,0 +1,199 @@
+#include <stdio.h>
+#include <sqlite3.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pthread.h>
+
+#include "dbstore_mgr.h"
+#include <dbstore.h>
+#include <dbstore_log.h>
+
+using namespace std;
+using namespace rgw::store;
+using DB = rgw::store::DB;
+
// Arguments handed to each worker thread.
struct thr_args {
  DB *dbs;     // shared DB handle, owned by the DBStoreManager in main()
  int thr_id;  // ordinal used only for log messages
};
+
/* Worker thread body: exercises user/bucket insert, get, list and
 * remove operations against the shared DB handle.
 * `arg` is a thr_args* supplying the handle and a thread ordinal used
 * for logging. Always returns 0 (the joined result is not inspected).
 */
void* process(void *arg)
{
  struct thr_args *t_args = (struct thr_args*)arg;

  DB *db = t_args->dbs;
  int thr_id = t_args->thr_id;
  int ret = -1;

  cout<<"Entered thread:"<<thr_id<<"\n";

  // Fixed sample identifiers used to populate the tables.
  string user1 = "User1";
  string bucketa = "rgw";
  string objecta1 = "bugfixing";
  string objecta2 = "zipper";
  string bucketb = "gluster";
  string objectb1 = "bugfixing";
  string objectb2 = "delegations";

  string user2 = "User2";
  string bucketc = "qe";
  string objectc1 = "rhhi";
  string objectc2 = "cns";

  DBOpParams params = {};
  const DoutPrefixProvider *dpp = db->get_def_dpp();

  db->InitializeParams(dpp, &params);

  // Populate a sample user record, including two access keys and
  // two placement tags, then insert it.
  params.op.user.uinfo.display_name = user1;
  params.op.user.uinfo.user_id.tenant = "tenant";
  params.op.user.uinfo.user_id.id = user1;
  params.op.user.uinfo.suspended = 123;
  params.op.user.uinfo.max_buckets = 456;
  params.op.user.uinfo.placement_tags.push_back("tags1");
  params.op.user.uinfo.placement_tags.push_back("tags2");

  RGWAccessKey k1("id1", "key1");
  RGWAccessKey k2("id2", "key2");
  params.op.user.uinfo.access_keys.insert(make_pair("key1", k1));
  params.op.user.uinfo.access_keys.insert(make_pair("key2", k2));

  ret = db->ProcessOp(dpp, "InsertUser", &params);
  cout << "InsertUser return value: " << ret << "\n";

  DBOpParams params2 = {};
  // NOTE(review): this writes to 'params' (not 'params2') before
  // params2 is even initialized; it changes the tenant used by all of
  // the later Insert*/Get* calls that reuse 'params'. Confirm whether
  // 'params2' was intended here.
  params.op.user.uinfo.user_id.tenant = "tenant2";

  // Read the user back by display name into a fresh params block and
  // dump what came back.
  db->InitializeParams(dpp, &params2);
  params2.op.user.uinfo.display_name = user1;
  ret = db->ProcessOp(dpp, "GetUser", &params2);

  cout << "GetUser return value: " << ret << "\n";

  cout << "tenant: " << params2.op.user.uinfo.user_id.tenant << "\n";
  cout << "suspended: " << (int)params2.op.user.uinfo.suspended << "\n";

  list<string>::iterator it = params2.op.user.uinfo.placement_tags.begin();

  while (it != params2.op.user.uinfo.placement_tags.end()) {
    cout << "list = " << *it << "\n";
    it++;
  }

  map<string, RGWAccessKey>::iterator it2 = params2.op.user.uinfo.access_keys.begin();

  while (it2 != params2.op.user.uinfo.access_keys.end()) {
    cout << "keys = " << it2->first << "\n";
    RGWAccessKey k = it2->second;
    cout << "id = " << k.id << ", keys = " << k.key << "\n";
    it2++;
  }

  // Create a bucket for user1, then a second user with its own bucket.
  // Return values of these ProcessOp calls are intentionally ignored
  // (best-effort exercise; concurrent threads race on the same rows).
  params.op.bucket.info.bucket.name = bucketa;
  db->ProcessOp(dpp, "InsertBucket", &params);

  params.op.user.uinfo.display_name = user2;
  params.op.user.uinfo.user_id.id = user2;
  db->ProcessOp(dpp, "InsertUser", &params);

  params.op.bucket.info.bucket.name = bucketb;
  db->ProcessOp(dpp, "InsertBucket", &params);

  db->ProcessOp(dpp, "GetUser", &params);
  db->ProcessOp(dpp, "GetBucket", &params);

  db->ListAllUsers(dpp, &params);
  db->ListAllBuckets(dpp, &params);

  // Remove user2 and its bucket, then list again to show the delta.
  params.op.bucket.info.bucket.name = bucketb;

  db->ProcessOp(dpp, "RemoveBucket", &params);

  params.op.user.uinfo.user_id.id = user2;
  db->ProcessOp(dpp, "RemoveUser", &params);

  db->ListAllUsers(dpp, &params);
  db->ListAllBuckets(dpp, &params);
  cout<<"Exiting thread:"<<thr_id<<"\n";

  return 0;
}
+
+int main(int argc, char *argv[])
+{
+ string tenant = "Redhat";
+ string logfile = "rgw_dbstore_bin.log";
+ int loglevel = 20;
+
+ DBStoreManager *dbsm;
+ DB *dbs;
+ int rc = 0, tnum = 0;
+ void *res;
+
+ pthread_attr_t attr;
+ int num_thr = 2;
+ pthread_t threads[num_thr];
+ struct thr_args t_args[num_thr];
+
+
+ cout << "loglevel " << loglevel << "\n";
+ // format: ./dbstore-bin logfile loglevel
+ if (argc == 3) {
+ logfile = argv[1];
+ loglevel = (atoi)(argv[2]);
+ cout << "loglevel set to " << loglevel << "\n";
+ }
+
+ vector<const char*> args;
+ auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_DAEMON, CINIT_FLAG_NO_MON_CONFIG, 1);
+ dbsm = new DBStoreManager(cct.get(), logfile, loglevel);
+ dbs = dbsm->getDB(tenant, true);
+
+ cout<<"No. of threads being created = "<<num_thr<<"\n";
+
+ /* Initialize thread creation attributes */
+ rc = pthread_attr_init(&attr);
+
+ if (rc != 0) {
+ cout<<" error in pthread_attr_init \n";
+ goto out;
+ }
+
+ for (tnum = 0; tnum < num_thr; tnum++) {
+ t_args[tnum].dbs = dbs;
+ t_args[tnum].thr_id = tnum;
+ rc = pthread_create((pthread_t*)&threads[tnum], &attr, &process,
+ &t_args[tnum]);
+ if (rc != 0) {
+ cout<<" error in pthread_create \n";
+ goto out;
+ }
+
+ cout<<"Created thread (thread-id:"<<tnum<<")\n";
+ }
+
+ /* Destroy the thread attributes object, since it is no
+ longer needed */
+
+ rc = pthread_attr_destroy(&attr);
+ if (rc != 0) {
+ cout<<"error in pthread_attr_destroy \n";
+ }
+
+ /* Now join with each thread, and display its returned value */
+
+ for (tnum = 0; tnum < num_thr; tnum++) {
+ rc = pthread_join(threads[tnum], &res);
+ if (rc != 0) {
+ cout<<"error in pthread_join \n";
+ } else {
+ cout<<"Joined with thread "<<tnum<<"\n";
+ }
+ }
+
+out:
+ dbsm->destroyAllHandles();
+
+ return 0;
+}
diff --git a/src/rgw/driver/dbstore/dbstore_mgr.cc b/src/rgw/driver/dbstore/dbstore_mgr.cc
new file mode 100644
index 000000000..6835f526b
--- /dev/null
+++ b/src/rgw/driver/dbstore/dbstore_mgr.cc
@@ -0,0 +1,140 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "dbstore_mgr.h"
+#include "common/dbstore_log.h"
+
+#include <filesystem>
+
+static constexpr auto dout_subsys = ceph_subsys_rgw;
+
+using namespace std;
+
+
+/* Given a tenant, find and return the DBStore handle.
+ * If not found and 'create' set to true, create one
+ * and return
+ */
+DB *DBStoreManager::getDB (string tenant, bool create)
+{
+ map<string, DB*>::iterator iter;
+ DB *dbs = nullptr;
+ pair<map<string, DB*>::iterator,bool> ret;
+
+ if (tenant.empty())
+ return default_db;
+
+ if (DBStoreHandles.empty())
+ goto not_found;
+
+ iter = DBStoreHandles.find(tenant);
+
+ if (iter != DBStoreHandles.end())
+ return iter->second;
+
+not_found:
+ if (!create)
+ return nullptr;
+
+ dbs = createDB(tenant);
+
+ return dbs;
+}
+
/* Create DBStore instance for `tenant` and cache it in DBStoreHandles.
 * The db file lives at <dbstore_db_dir>/<dbstore_db_name_prefix>-<tenant>.
 * Returns the cached handle (possibly one created concurrently by
 * another thread), or nullptr if initialization failed. */
DB *DBStoreManager::createDB(std::string tenant) {
  DB *dbs = nullptr;
  pair<map<string, DB*>::iterator,bool> ret;
  const auto& db_path = g_conf().get_val<std::string>("dbstore_db_dir");
  const auto& db_name = g_conf().get_val<std::string>("dbstore_db_name_prefix") + "-" + tenant;

  auto db_full_path = std::filesystem::path(db_path) / db_name;
  ldout(cct, 0) << "DB initialization full db_path("<<db_full_path<<")" << dendl;

  /* Create the handle */
#ifdef SQLITE_ENABLED
  dbs = new SQLiteDB(db_full_path.string(), cct);
#else
  dbs = new DB(db_full_path.string(), cct);
#endif

  /* API is DB::Initialize(string logfile, int loglevel);
   * If none provided, by default write in to dbstore.log file
   * created in current working directory with loglevel L_EVENT.
   * XXX: need to align these logs to ceph location
   */
  if (dbs->Initialize("", -1) < 0) {
    ldout(cct, 0) << "DB initialization failed for tenant("<<tenant<<")" << dendl;

    delete dbs;
    return nullptr;
  }

  /* XXX: Do we need lock to protect this map?
   */
  ret = DBStoreHandles.insert(pair<string, DB*>(tenant, dbs));

  /*
   * Its safe to check for already existing entry (just
   * incase other thread raced and created the entry)
   */
  if (ret.second == false) {
    /* Entry already created by another thread: discard ours and
     * return the winner's handle. */
    delete dbs;

    dbs = ret.first->second;
  }

  return dbs;
}
+
+void DBStoreManager::deleteDB(string tenant) {
+ map<string, DB*>::iterator iter;
+ DB *dbs = nullptr;
+
+ if (tenant.empty() || DBStoreHandles.empty())
+ return;
+
+ /* XXX: Check if we need to perform this operation under a lock */
+ iter = DBStoreHandles.find(tenant);
+
+ if (iter == DBStoreHandles.end())
+ return;
+
+ dbs = iter->second;
+
+ DBStoreHandles.erase(iter);
+ dbs->Destroy(dbs->get_def_dpp());
+ delete dbs;
+
+ return;
+}
+
/* Remove and destroy the cached handle that corresponds to `dbs`.
 * NOTE(review): the handle map is keyed by tenant (see createDB()),
 * but this looks up by dbs->getDBname(); if getDBname() returns the
 * prefixed db file name rather than the tenant, the lookup will never
 * match — confirm getDBname() semantics. */
void DBStoreManager::deleteDB(DB *dbs) {
  if (!dbs)
    return;

  (void)deleteDB(dbs->getDBname());
}
+
+
+void DBStoreManager::destroyAllHandles(){
+ map<string, DB*>::iterator iter;
+ DB *dbs = nullptr;
+
+ if (DBStoreHandles.empty())
+ return;
+
+ for (iter = DBStoreHandles.begin(); iter != DBStoreHandles.end();
+ ++iter) {
+ dbs = iter->second;
+ dbs->Destroy(dbs->get_def_dpp());
+ delete dbs;
+ }
+
+ DBStoreHandles.clear();
+
+ return;
+}
+
+
diff --git a/src/rgw/driver/dbstore/dbstore_mgr.h b/src/rgw/driver/dbstore/dbstore_mgr.h
new file mode 100644
index 000000000..77fc3aaf7
--- /dev/null
+++ b/src/rgw/driver/dbstore/dbstore_mgr.h
@@ -0,0 +1,56 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <map>
+#include <cerrno>
+#include <cstdlib>
+#include <string>
+#include <cstdio>
+#include <iostream>
+#include <vector>
+
+#include "common/ceph_context.h"
+#include "common/dbstore.h"
+#include "sqlite/sqliteDB.h"
+
+using namespace rgw::store;
+using DB = rgw::store::DB;
+
/* Tenant used for the default DB handle (see DBStoreManager ctors).
 * XXX: Should be a dbstore config option */
const static std::string default_tenant = "default_ns";
+
/* Owns one DB handle per tenant, created lazily via getDB(tenant, true)
 * plus a default handle created at construction; all handles are
 * destroyed by destroyAllHandles() (called from the destructor). */
class DBStoreManager {
private:
  std::map<std::string, DB*> DBStoreHandles;  // tenant -> open DB handle
  DB *default_db = nullptr;  // handle for default_tenant, created in ctors
  CephContext *cct;

public:
  // Use the context's existing log configuration.
  DBStoreManager(CephContext *_cct): DBStoreHandles() {
    cct = _cct;
    default_db = createDB(default_tenant);
  };
  // Redirect the provided context's rgw logging to `logfile` at
  // `loglevel` before creating the default DB handle.
  DBStoreManager(CephContext *_cct, std::string logfile, int loglevel): DBStoreHandles() {
    cct = _cct;
    cct->_log->set_log_file(logfile);
    cct->_log->reopen_log_file();
    cct->_conf->subsys.set_log_level(ceph_subsys_rgw, loglevel);
    default_db = createDB(default_tenant);
  };
  ~DBStoreManager() { destroyAllHandles(); };

  /* XXX: TBD based on testing
   * 1) Lock to protect DBStoreHandles map.
   * 2) Refcount of each DBStore to protect from
   * being deleted while using it.
   */
  DB* getDB () { return default_db; };          // default-tenant handle
  DB* getDB (std::string tenant, bool create);  // lookup, optionally create
  DB* createDB (std::string tenant);            // create + cache a handle
  void deleteDB (std::string tenant);           // destroy one handle
  void deleteDB (DB* db);
  void destroyAllHandles();                     // destroy every handle
};
diff --git a/src/rgw/driver/dbstore/sqlite/CMakeLists.txt b/src/rgw/driver/dbstore/sqlite/CMakeLists.txt
new file mode 100644
index 000000000..909765e30
--- /dev/null
+++ b/src/rgw/driver/dbstore/sqlite/CMakeLists.txt
@@ -0,0 +1,16 @@
# Builds the sqlite backend of dbstore as a static library.
cmake_minimum_required(VERSION 3.14.0)
project(sqlite_db)

find_package(SQLite3 REQUIRED)

set(sqlite_db_srcs
  sqliteDB.h
  sqliteDB.cc)

include_directories(${CMAKE_INCLUDE_DIR})

# SQLITE_THREADSAFE=1 selects sqlite's serialized mode (safe for use
# from multiple threads).
# NOTE(review): appending to CMAKE_CXX_FLAGS applies the define to every
# target configured after this point, not just sqlite_db; consider
# target_compile_definitions(sqlite_db PRIVATE SQLITE_THREADSAFE=1).
set(SQLITE_COMPILE_FLAGS "-DSQLITE_THREADSAFE=1")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SQLITE_COMPILE_FLAGS}")

add_library(sqlite_db STATIC ${sqlite_db_srcs})
target_link_libraries(sqlite_db sqlite3 dbstore_lib rgw_common)
diff --git a/src/rgw/driver/dbstore/sqlite/connection.cc b/src/rgw/driver/dbstore/sqlite/connection.cc
new file mode 100644
index 000000000..143a3a0d5
--- /dev/null
+++ b/src/rgw/driver/dbstore/sqlite/connection.cc
@@ -0,0 +1,34 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/dout.h"
+#include "connection.h"
+#include "error.h"
+
+namespace rgw::dbstore::sqlite {
+
+db_ptr open_database(const char* filename, int flags)
+{
+ sqlite3* db = nullptr;
+ const int result = ::sqlite3_open_v2(filename, &db, flags, nullptr);
+ if (result != SQLITE_OK) {
+ throw std::system_error(result, sqlite::error_category());
+ }
+ // request extended result codes
+ (void) ::sqlite3_extended_result_codes(db, 1);
+ return db_ptr{db};
+}
+
+} // namespace rgw::dbstore::sqlite
diff --git a/src/rgw/driver/dbstore/sqlite/connection.h b/src/rgw/driver/dbstore/sqlite/connection.h
new file mode 100644
index 000000000..6088763fd
--- /dev/null
+++ b/src/rgw/driver/dbstore/sqlite/connection.h
@@ -0,0 +1,64 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <memory>
+#include <sqlite3.h>
+
+#include <fmt/format.h>
+
+#include "sqlite/statement.h"
+
+class DoutPrefixProvider;
+
+namespace rgw::dbstore::sqlite {
+
// owning sqlite3 pointer; the deleter releases the connection with
// sqlite3_close()
struct db_deleter {
  void operator()(sqlite3* p) const { ::sqlite3_close(p); }
};
using db_ptr = std::unique_ptr<sqlite3, db_deleter>;
+
+
// open the database file (enabling extended result codes) or throw
// std::system_error on error
db_ptr open_database(const char* filename, int flags);
+
+
// One open sqlite connection plus its cache of prepared statements.
struct Connection {
  db_ptr db;
  // map of statements, prepared on first use
  // NOTE(review): string_view keys do not own their characters — they
  // must view storage that outlives this map (e.g. string literals);
  // confirm at the call sites.
  std::map<std::string_view, stmt_ptr> statements;

  explicit Connection(db_ptr db) : db(std::move(db)) {}
};
+
// sqlite connection factory for ConnectionPool
class ConnectionFactory {
  std::string uri;  // database URI handed to sqlite3_open_v2()
  int flags;        // SQLITE_OPEN_* flags for the open call
 public:
  ConnectionFactory(std::string uri, int flags)
    : uri(std::move(uri)), flags(flags) {}

  // Open a fresh connection; throws on failure (see open_database()).
  // `dpp` is part of the pool's factory signature but unused here.
  auto operator()(const DoutPrefixProvider* dpp)
    -> std::unique_ptr<Connection>
  {
    auto db = open_database(uri.c_str(), flags);
    return std::make_unique<Connection>(std::move(db));
  }
};
+
+} // namespace rgw::dbstore::sqlite
diff --git a/src/rgw/driver/dbstore/sqlite/error.cc b/src/rgw/driver/dbstore/sqlite/error.cc
new file mode 100644
index 000000000..5fe9eb0ae
--- /dev/null
+++ b/src/rgw/driver/dbstore/sqlite/error.cc
@@ -0,0 +1,37 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "error.h"
+
+namespace rgw::dbstore::sqlite {
+
+const std::error_category& error_category()
+{
+ struct category : std::error_category {
+ const char* name() const noexcept override {
+ return "dbstore:sqlite";
+ }
+ std::string message(int ev) const override {
+ return ::sqlite3_errstr(ev);
+ }
+ std::error_condition default_error_condition(int code) const noexcept override {
+ return {code & 0xFF, category()};
+ }
+ };
+ static category instance;
+ return instance;
+}
+
+} // namespace rgw::dbstore::sqlite
diff --git a/src/rgw/driver/dbstore/sqlite/error.h b/src/rgw/driver/dbstore/sqlite/error.h
new file mode 100644
index 000000000..15396d8ca
--- /dev/null
+++ b/src/rgw/driver/dbstore/sqlite/error.h
@@ -0,0 +1,81 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <system_error>
+#include <sqlite3.h>
+
+namespace rgw::dbstore::sqlite {
+
+// error category for sqlite extended result codes:
+// https://www.sqlite.org/rescode.html
+const std::error_category& error_category();
+
+
// sqlite exception type that carries the extended error code and message
class error : public std::runtime_error {
  std::error_code ec;  // extended sqlite result code in error_category()
 public:
  // Explicit message + code.
  error(const char* errmsg, std::error_code ec)
    : runtime_error(errmsg), ec(ec) {}
  // Message pulled from the connection via sqlite3_errmsg().
  error(sqlite3* db, std::error_code ec) : error(::sqlite3_errmsg(db), ec) {}
  error(sqlite3* db, int result) : error(db, {result, error_category()}) {}
  // Message and code both pulled from the connection's last error.
  error(sqlite3* db) : error(db, ::sqlite3_extended_errcode(db)) {}
  std::error_code code() const { return ec; }
};
+
+
+// sqlite error conditions for primary and extended result codes
+//
+// 'primary' error_conditions will match 'primary' error_codes as well as any
+// 'extended' error_codes whose lowest 8 bits match that primary code. for
+// example, the error_condition for SQLITE_CONSTRAINT will match the error_codes
+// SQLITE_CONSTRAINT and SQLITE_CONSTRAINT_*
+enum class errc {
+ // primary result codes
+ ok = SQLITE_OK,
+ busy = SQLITE_BUSY,
+ constraint = SQLITE_CONSTRAINT,
+ row = SQLITE_ROW,
+ done = SQLITE_DONE,
+
+ // extended result codes
+ primary_key_constraint = SQLITE_CONSTRAINT_PRIMARYKEY,
+ foreign_key_constraint = SQLITE_CONSTRAINT_FOREIGNKEY,
+ unique_constraint = SQLITE_CONSTRAINT_UNIQUE,
+
+ // ..add conditions as needed
+};
+
+inline std::error_code make_error_code(errc e)
+{
+ return {static_cast<int>(e), error_category()};
+}
+
+inline std::error_condition make_error_condition(errc e)
+{
+ return {static_cast<int>(e), error_category()};
+}
+
+} // namespace rgw::dbstore::sqlite
+
namespace std {

// enable implicit conversions from sqlite::errc to std::error_condition
// (lets callers write `ec == sqlite::errc::busy` directly; the
// conversion goes through make_error_condition above)
template<> struct is_error_condition_enum<
  rgw::dbstore::sqlite::errc> : public true_type {};

} // namespace std
diff --git a/src/rgw/driver/dbstore/sqlite/sqliteDB.cc b/src/rgw/driver/dbstore/sqlite/sqliteDB.cc
new file mode 100644
index 000000000..dc244c07b
--- /dev/null
+++ b/src/rgw/driver/dbstore/sqlite/sqliteDB.cc
@@ -0,0 +1,2996 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "sqliteDB.h"
+
+using namespace std;
+
/* Helper macros shared by the SQLiteDB Op implementations. All of them
 * assume the calling scope declares an `out:` label and the status
 * variable they write (`ret` or `rc`); on failure they log, set the
 * status to -1 and `goto out`.
 * NOTE(review): every definition ends with a stray `;` after while(0),
 * which defeats the do{}while(0) single-statement idiom (e.g. inside an
 * un-braced if/else); harmless at the current call sites. */

/* Prepare `stmt` from the SQL returned by Schema(params) on connection
 * `*sdb`. Success is detected via a non-null stmt rather than the
 * sqlite3_prepare_v2() return code. */
#define SQL_PREPARE(dpp, params, sdb, stmt, ret, Op)     \
  do {                                                   \
    string schema;                                       \
    schema = Schema(params);                             \
    sqlite3_prepare_v2 (*sdb, schema.c_str(),            \
        -1, &stmt , NULL);                               \
    if (!stmt) {                                         \
      ldpp_dout(dpp, 0) <<"failed to prepare statement " \
        <<"for Op("<<Op<<"); Errmsg -"\
        <<sqlite3_errmsg(*sdb)<< dendl;\
      ret = -1;                                          \
      goto out;                                          \
    }                                                    \
    ldpp_dout(dpp, 20)<<"Successfully Prepared stmt for Op("<<Op \
      <<") schema("<<schema<<") stmt("<<stmt<<")"<< dendl; \
    ret = 0;                                             \
  } while(0);

/* Resolve the 1-based bind index of named parameter `str` in `stmt`
 * into `index`. */
#define SQL_BIND_INDEX(dpp, stmt, index, str, sdb)  \
  do {                                              \
    index = sqlite3_bind_parameter_index(stmt, str);     \
                                                    \
    if (index <=0)  {                               \
      ldpp_dout(dpp, 0) <<"failed to fetch bind parameter"\
        " index for str("<<str<<") in "             \
        <<"stmt("<<stmt<<"); Errmsg -"              \
        <<sqlite3_errmsg(*sdb)<< dendl;             \
      rc = -1;                                      \
      goto out;                                     \
    }                                               \
    ldpp_dout(dpp, 20)<<"Bind parameter index for str("  \
      <<str<<") in stmt("<<stmt<<") is "            \
      <<index<< dendl;                              \
  }while(0);

/* Bind a text value; SQLITE_TRANSIENT makes sqlite copy the bytes, so
 * `str` need not outlive the statement. */
#define SQL_BIND_TEXT(dpp, stmt, index, str, sdb)           \
  do {                                                      \
    rc = sqlite3_bind_text(stmt, index, str, -1, SQLITE_TRANSIENT); \
    if (rc != SQLITE_OK) {                                  \
      ldpp_dout(dpp, 0)<<"sqlite bind text failed for index("     \
        <<index<<"), str("<<str<<") in stmt("               \
        <<stmt<<"); Errmsg - "<<sqlite3_errmsg(*sdb)        \
        << dendl;                                           \
      rc = -1;                                              \
      goto out;                                             \
    }                                                       \
    ldpp_dout(dpp, 20)<<"Bind parameter text for index("    \
      <<index<<") in stmt("<<stmt<<") is "                  \
      <<str<< dendl;                                        \
  }while(0);

/* Bind an int value. */
#define SQL_BIND_INT(dpp, stmt, index, num, sdb)            \
  do {                                                      \
    rc = sqlite3_bind_int(stmt, index, num);                \
                                                            \
    if (rc != SQLITE_OK) {                                  \
      ldpp_dout(dpp, 0)<<"sqlite bind int failed for index("     \
        <<index<<"), num("<<num<<") in stmt("               \
        <<stmt<<"); Errmsg - "<<sqlite3_errmsg(*sdb)        \
        << dendl;                                           \
      rc = -1;                                              \
      goto out;                                             \
    }                                                       \
    ldpp_dout(dpp, 20)<<"Bind parameter int for index("     \
      <<index<<") in stmt("<<stmt<<") is "                  \
      <<num<< dendl;                                        \
  }while(0);

/* Bind a binary blob of `size` bytes (copied via SQLITE_TRANSIENT). */
#define SQL_BIND_BLOB(dpp, stmt, index, blob, size, sdb)    \
  do {                                                      \
    rc = sqlite3_bind_blob(stmt, index, blob, size, SQLITE_TRANSIENT); \
                                                            \
    if (rc != SQLITE_OK) {                                  \
      ldpp_dout(dpp, 0)<<"sqlite bind blob failed for index("     \
        <<index<<"), blob("<<blob<<") in stmt("             \
        <<stmt<<"); Errmsg - "<<sqlite3_errmsg(*sdb)        \
        << dendl;                                           \
      rc = -1;                                              \
      goto out;                                             \
    }                                                       \
  }while(0);

/* ceph-encode `param` into a bufferlist and bind it as a blob. */
#define SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, param, sdb) \
  do {                                                      \
    bufferlist b;                                           \
    encode(param, b);                                       \
    SQL_BIND_BLOB(dpp, stmt, index, b.c_str(), b.length(), sdb); \
  }while(0);

/* Fetch column `index` as a raw blob pointer + length; a NULL/empty
 * column is only logged, not treated as an error. */
#define SQL_READ_BLOB(dpp, stmt, index, void_ptr, len)      \
  do {                                                      \
    void_ptr = NULL;                                        \
    void_ptr = (void *)sqlite3_column_blob(stmt, index);    \
    len = sqlite3_column_bytes(stmt, index);                \
                                                            \
    if (!void_ptr || len == 0) {                            \
      ldpp_dout(dpp, 20)<<"Null value for blob index("      \
        <<index<<") in stmt("<<stmt<<") "<< dendl;          \
    }                                                       \
  }while(0);

/* Read column `index` as a blob and ceph-decode it into `param`.
 * (`sdb` is accepted for symmetry but unused in the expansion.) */
#define SQL_DECODE_BLOB_PARAM(dpp, stmt, index, param, sdb) \
  do {                                                      \
    bufferlist b;                                           \
    void *blob;                                             \
    int blob_len = 0;                                       \
                                                            \
    SQL_READ_BLOB(dpp, stmt, index, blob, blob_len);        \
                                                            \
    b.append(reinterpret_cast<char *>(blob), blob_len);     \
                                                            \
    decode(param, b);                                       \
  }while(0);

/* Serialized execute: under the DBOp mutex, lazily Prepare() the
 * statement, Bind() the params, Step() it (invoking `cbk` per row) and
 * Reset() it for reuse. Requires locals `ret` and `stmt` plus an
 * `out:` label in the caller. */
#define SQL_EXECUTE(dpp, params, stmt, cbk, args...) \
  do{                                                \
    const std::lock_guard<std::mutex> lk(((DBOp*)(this))->mtx); \
    if (!stmt) {                                     \
      ret = Prepare(dpp, params);                    \
    }                                                \
                                                     \
    if (!stmt) {                                     \
      ldpp_dout(dpp, 0) <<"No prepared statement "<< dendl;  \
      goto out;                                      \
    }                                                \
                                                     \
    ret = Bind(dpp, params);                         \
    if (ret) {                                       \
      ldpp_dout(dpp, 0) <<"Bind parameters failed for stmt(" <<stmt<<") "<< dendl; \
      goto out;                                      \
    }                                                \
                                                     \
    ret = Step(dpp, params->op, stmt, cbk);          \
                                                     \
    Reset(dpp, stmt);                                \
                                                     \
    if (ret) {                                       \
      ldpp_dout(dpp, 0) <<"Execution failed for stmt(" <<stmt<<")"<< dendl; \
      goto out;                                      \
    }                                                \
  }while(0);
+
+int SQLiteDB::InitPrepareParams(const DoutPrefixProvider *dpp,
+ DBOpPrepareParams &p_params,
+ DBOpParams* params)
+{
+ std::string bucket;
+
+ if (!params)
+ return -1;
+
+ if (params->user_table.empty()) {
+ params->user_table = getUserTable();
+ }
+ if (params->user_table.empty()) {
+ params->user_table = getUserTable();
+ }
+ if (params->bucket_table.empty()) {
+ params->bucket_table = getBucketTable();
+ }
+ if (params->quota_table.empty()) {
+ params->quota_table = getQuotaTable();
+ }
+ if (params->lc_entry_table.empty()) {
+ params->lc_entry_table = getLCEntryTable();
+ }
+ if (params->lc_head_table.empty()) {
+ params->lc_head_table = getLCHeadTable();
+ }
+
+ p_params.user_table = params->user_table;
+ p_params.bucket_table = params->bucket_table;
+ p_params.quota_table = params->quota_table;
+ p_params.lc_entry_table = params->lc_entry_table;
+ p_params.lc_head_table = params->lc_head_table;
+
+ p_params.op.query_str = params->op.query_str;
+
+ bucket = params->op.bucket.info.bucket.name;
+
+ if (!bucket.empty()) {
+ if (params->object_table.empty()) {
+ params->object_table = getObjectTable(bucket);
+ }
+ if (params->objectdata_table.empty()) {
+ params->objectdata_table = getObjectDataTable(bucket);
+ }
+ if (params->object_view.empty()) {
+ params->object_view = getObjectView(bucket);
+ }
+ if (params->object_trigger.empty()) {
+ params->object_trigger = getObjectTrigger(bucket);
+ }
+ p_params.object_table = params->object_table;
+ p_params.objectdata_table = params->objectdata_table;
+ p_params.object_view = params->object_view;
+ }
+
+ return 0;
+}
+
// sqlite3_exec() row callback: prints every column as "name = value"
// (the literal "NULL" for null columns) and returns 0 to keep iterating.
static int list_callback(void *None, int argc, char **argv, char **aname)
{
  for (int i = 0; i < argc; i++) {
    const std::string value = argv[i] ? argv[i] : "NULL";
    std::cout << aname[i] << " = " << value << "\n";
  }
  return 0;
}
+
/* Column indices for rows produced by the user/bucket/object/LC SELECT
 * statements; each enum must stay in sync with the column order of the
 * corresponding table schema. Values are passed straight to
 * sqlite3_column_*() in the list_* callbacks below. */
enum GetUser {
  UserID = 0,
  Tenant,
  NS,
  DisplayName,
  UserEmail,
  AccessKeysID,
  AccessKeysSecret,
  AccessKeys,
  SwiftKeys,
  SubUsers,
  Suspended,
  MaxBuckets,
  OpMask,
  UserCaps,
  Admin,
  System,
  PlacementName,
  PlacementStorageClass,
  PlacementTags,
  BucketQuota,
  TempURLKeys,
  UserQuota,
  TYPE,
  MfaIDs,
  AssumedRoleARN,
  UserAttrs,
  UserVersion,
  UserVersionTag,
};

// Bucket columns; names clashing with GetUser are prefixed "Bucket_".
enum GetBucket {
  BucketName = 0,
  Bucket_Tenant, //Tenant
  Marker,
  BucketID,
  Size,
  SizeRounded,
  CreationTime,
  Count,
  Bucket_PlacementName,
  Bucket_PlacementStorageClass,
  OwnerID,
  Flags,
  Zonegroup,
  HasInstanceObj,
  Quota,
  RequesterPays,
  HasWebsite,
  WebsiteConf,
  SwiftVersioning,
  SwiftVerLocation,
  MdsearchConfig,
  NewBucketInstanceID,
  ObjectLock,
  SyncPolicyInfoGroups,
  BucketAttrs,
  BucketVersion,
  BucketVersionTag,
  Mtime,
  Bucket_User_NS
};

// Object-head columns (metadata, manifest and version bookkeeping).
enum GetObject {
  ObjName,
  ObjInstance,
  ObjNS,
  ObjBucketName,
  ACLs,
  IndexVer,
  Tag,
  ObjFlags,
  VersionedEpoch,
  ObjCategory,
  Etag,
  Owner,
  OwnerDisplayName,
  StorageClass,
  Appendable,
  ContentType,
  IndexHashSource,
  ObjSize,
  AccountedSize,
  ObjMtime,
  Epoch,
  ObjTag,
  TailTag,
  WriteTag,
  FakeTag,
  ShadowObj,
  HasData,
  IsVersioned,
  VersionNum,
  PGVer,
  ZoneShortID,
  ObjVersion,
  ObjVersionTag,
  ObjAttrs,
  HeadSize,
  MaxHeadSize,
  ObjID,
  TailInstance,
  HeadPlacementRuleName,
  HeadPlacementRuleStorageClass,
  TailPlacementRuleName,
  TailPlacementStorageClass,
  ManifestPartObjs,
  ManifestPartRules,
  Omap,
  IsMultipart,
  MPPartsList,
  HeadData,
  Versions
};

// Object-data (payload chunk) columns.
enum GetObjectData {
  ObjDataName,
  ObjDataInstance,
  ObjDataNS,
  ObjDataBucketName,
  ObjDataID,
  MultipartPartStr,
  PartNum,
  Offset,
  ObjDataSize,
  ObjDataMtime,
  ObjData
};

// Lifecycle entry columns.
enum GetLCEntry {
  LCEntryIndex,
  LCEntryBucketName,
  LCEntryStartTime,
  LCEntryStatus
};

// Lifecycle head columns.
enum GetLCHead {
  LCHeadIndex,
  LCHeadMarker,
  LCHeadStartDate
};
+
/* Populate op.user from the current row of a user SELECT; column
 * positions come from the GetUser enum above. Returns 0 on success,
 * -1 for a null statement.
 * NOTE(review): sqlite3_column_text() returns NULL for NULL columns
 * and constructing std::string from a null const char* is undefined
 * behavior — confirm these columns are NOT NULL in the schema.
 * The `sdb` argument to SQL_DECODE_BLOB_PARAM names no local here;
 * that macro parameter is unused in the expansion, so it compiles. */
static int list_user(const DoutPrefixProvider *dpp, DBOpInfo &op, sqlite3_stmt *stmt) {
  if (!stmt)
    return -1;

  op.user.uinfo.user_id.tenant = (const char*)sqlite3_column_text(stmt, Tenant);
  op.user.uinfo.user_id.id = (const char*)sqlite3_column_text(stmt, UserID);
  op.user.uinfo.user_id.ns = (const char*)sqlite3_column_text(stmt, NS);
  op.user.uinfo.display_name = (const char*)sqlite3_column_text(stmt, DisplayName); // user_name
  op.user.uinfo.user_email = (const char*)sqlite3_column_text(stmt, UserEmail);

  // Compound fields are stored ceph-encoded in blob columns.
  SQL_DECODE_BLOB_PARAM(dpp, stmt, SwiftKeys, op.user.uinfo.swift_keys, sdb);
  SQL_DECODE_BLOB_PARAM(dpp, stmt, SubUsers, op.user.uinfo.subusers, sdb);
  SQL_DECODE_BLOB_PARAM(dpp, stmt, AccessKeys, op.user.uinfo.access_keys, sdb);

  op.user.uinfo.suspended = sqlite3_column_int(stmt, Suspended);
  op.user.uinfo.max_buckets = sqlite3_column_int(stmt, MaxBuckets);
  op.user.uinfo.op_mask = sqlite3_column_int(stmt, OpMask);

  SQL_DECODE_BLOB_PARAM(dpp, stmt, UserCaps, op.user.uinfo.caps, sdb);

  op.user.uinfo.admin = sqlite3_column_int(stmt, Admin);
  op.user.uinfo.system = sqlite3_column_int(stmt, System);

  op.user.uinfo.default_placement.name = (const char*)sqlite3_column_text(stmt, PlacementName);

  op.user.uinfo.default_placement.storage_class = (const char*)sqlite3_column_text(stmt, PlacementStorageClass);

  SQL_DECODE_BLOB_PARAM(dpp, stmt, PlacementTags, op.user.uinfo.placement_tags, sdb);

  SQL_DECODE_BLOB_PARAM(dpp, stmt, BucketQuota, op.user.uinfo.quota.bucket_quota, sdb);
  SQL_DECODE_BLOB_PARAM(dpp, stmt, TempURLKeys, op.user.uinfo.temp_url_keys, sdb);
  SQL_DECODE_BLOB_PARAM(dpp, stmt, UserQuota, op.user.uinfo.quota.user_quota, sdb);

  op.user.uinfo.type = sqlite3_column_int(stmt, TYPE);

  SQL_DECODE_BLOB_PARAM(dpp, stmt, MfaIDs, op.user.uinfo.mfa_ids, sdb);

  // Caller-visible attrs and the object version used for CAS updates.
  SQL_DECODE_BLOB_PARAM(dpp, stmt, UserAttrs, op.user.user_attrs, sdb);
  op.user.user_version.ver = sqlite3_column_int(stmt, UserVersion);
  op.user.user_version.tag = (const char*)sqlite3_column_text(stmt, UserVersionTag);

  return 0;
}
+
/* Populate op.bucket from the current row of a bucket SELECT (columns
 * per the GetBucket enum) and append the entry to op.bucket.list_entries.
 * Returns 0 on success, -1 for a null statement.
 * NOTE(review): as in list_user(), NULL text columns would make the
 * std::string assignments undefined behavior — confirm the schema. */
static int list_bucket(const DoutPrefixProvider *dpp, DBOpInfo &op, sqlite3_stmt *stmt) {
  if (!stmt)
    return -1;

  // Directory-entry (stats) portion of the row.
  op.bucket.ent.bucket.name = (const char*)sqlite3_column_text(stmt, BucketName);
  op.bucket.ent.bucket.tenant = (const char*)sqlite3_column_text(stmt, Bucket_Tenant);
  op.bucket.ent.bucket.marker = (const char*)sqlite3_column_text(stmt, Marker);
  op.bucket.ent.bucket.bucket_id = (const char*)sqlite3_column_text(stmt, BucketID);
  op.bucket.ent.size = sqlite3_column_int(stmt, Size);
  op.bucket.ent.size_rounded = sqlite3_column_int(stmt, SizeRounded);
  SQL_DECODE_BLOB_PARAM(dpp, stmt, CreationTime, op.bucket.ent.creation_time, sdb);
  op.bucket.ent.count = sqlite3_column_int(stmt, Count);
  op.bucket.ent.placement_rule.name = (const char*)sqlite3_column_text(stmt, Bucket_PlacementName);
  op.bucket.ent.placement_rule.storage_class = (const char*)sqlite3_column_text(stmt, Bucket_PlacementStorageClass);

  // Mirror shared fields into the full bucket info.
  op.bucket.info.bucket = op.bucket.ent.bucket;
  op.bucket.info.placement_rule = op.bucket.ent.placement_rule;
  op.bucket.info.creation_time = op.bucket.ent.creation_time;

  op.bucket.info.owner.id = (const char*)sqlite3_column_text(stmt, OwnerID);
  op.bucket.info.owner.tenant = op.bucket.ent.bucket.tenant;

  // Only the GetBucket query joins in the owning user's namespace.
  if (op.name == "GetBucket") {
    op.bucket.info.owner.ns = (const char*)sqlite3_column_text(stmt, Bucket_User_NS);
  }

  op.bucket.info.flags = sqlite3_column_int(stmt, Flags);
  op.bucket.info.zonegroup = (const char*)sqlite3_column_text(stmt, Zonegroup);
  op.bucket.info.has_instance_obj = sqlite3_column_int(stmt, HasInstanceObj);

  SQL_DECODE_BLOB_PARAM(dpp, stmt, Quota, op.bucket.info.quota, sdb);
  op.bucket.info.requester_pays = sqlite3_column_int(stmt, RequesterPays);
  op.bucket.info.has_website = sqlite3_column_int(stmt, HasWebsite);
  SQL_DECODE_BLOB_PARAM(dpp, stmt, WebsiteConf, op.bucket.info.website_conf, sdb);
  op.bucket.info.swift_versioning = sqlite3_column_int(stmt, SwiftVersioning);
  op.bucket.info.swift_ver_location = (const char*)sqlite3_column_text(stmt, SwiftVerLocation);
  SQL_DECODE_BLOB_PARAM(dpp, stmt, MdsearchConfig, op.bucket.info.mdsearch_config, sdb);
  op.bucket.info.new_bucket_instance_id = (const char*)sqlite3_column_text(stmt, NewBucketInstanceID);
  SQL_DECODE_BLOB_PARAM(dpp, stmt, ObjectLock, op.bucket.info.obj_lock, sdb);
  SQL_DECODE_BLOB_PARAM(dpp, stmt, SyncPolicyInfoGroups, op.bucket.info.sync_policy, sdb);
  SQL_DECODE_BLOB_PARAM(dpp, stmt, BucketAttrs, op.bucket.bucket_attrs, sdb);
  op.bucket.bucket_version.ver = sqlite3_column_int(stmt, BucketVersion);
  op.bucket.bucket_version.tag = (const char*)sqlite3_column_text(stmt, BucketVersionTag);

  /* Read bucket version into info.objv_tracker.read_ver. No need
   * to set write_ver as its not used anywhere. Still keeping its
   * value same as read_ver */
  op.bucket.info.objv_tracker.read_version = op.bucket.bucket_version;
  op.bucket.info.objv_tracker.write_version = op.bucket.bucket_version;

  SQL_DECODE_BLOB_PARAM(dpp, stmt, Mtime, op.bucket.mtime, sdb);

  op.bucket.list_entries.push_back(op.bucket.ent);

  return 0;
}
+
+/* Row-decode callback for object listing queries.
+ * Copies each column of the current result row (columns addressed via
+ * the ObjName..HeadData column-index enum) into op.obj / op.obj.state,
+ * then builds an rgw_bucket_dir_entry from the decoded fields and
+ * appends it to op.obj.list_entries.
+ * Returns 0 on success, -1 if stmt is null.
+ * NOTE(review): sqlite3_column_text() returns NULL for SQL NULL values;
+ * casting that to (const char*) and assigning to std::string is
+ * undefined behaviour - confirm these columns are NOT NULL in the
+ * schema or always bound. */
+static int list_object(const DoutPrefixProvider *dpp, DBOpInfo &op, sqlite3_stmt *stmt) {
+  if (!stmt)
+    return -1;
+
+  //cout<<sqlite3_column_text(stmt, 0)<<", ";
+  //cout<<sqlite3_column_text(stmt, 1) << "\n";
+
+  op.obj.state.exists = true;
+  op.obj.state.obj.key.name = (const char*)sqlite3_column_text(stmt, ObjName);
+  op.bucket.info.bucket.name = (const char*)sqlite3_column_text(stmt, ObjBucketName);
+  op.obj.state.obj.key.instance = (const char*)sqlite3_column_text(stmt, ObjInstance);
+  op.obj.state.obj.key.ns = (const char*)sqlite3_column_text(stmt, ObjNS);
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, ACLs, op.obj.acls, sdb);
+  op.obj.index_ver = sqlite3_column_int(stmt, IndexVer);
+  op.obj.tag = (const char*)sqlite3_column_text(stmt, Tag);
+  op.obj.flags = sqlite3_column_int(stmt, ObjFlags);
+  op.obj.versioned_epoch = sqlite3_column_int(stmt, VersionedEpoch);
+  op.obj.category = (RGWObjCategory)sqlite3_column_int(stmt, ObjCategory);
+  op.obj.etag = (const char*)sqlite3_column_text(stmt, Etag);
+  op.obj.owner = (const char*)sqlite3_column_text(stmt, Owner);
+  op.obj.owner_display_name = (const char*)sqlite3_column_text(stmt, OwnerDisplayName);
+  op.obj.storage_class = (const char*)sqlite3_column_text(stmt, StorageClass);
+  op.obj.appendable = sqlite3_column_int(stmt, Appendable);
+  op.obj.content_type = (const char*)sqlite3_column_text(stmt, ContentType);
+  op.obj.state.obj.index_hash_source = (const char*)sqlite3_column_text(stmt, IndexHashSource);
+  op.obj.state.size = sqlite3_column_int(stmt, ObjSize);
+  op.obj.state.accounted_size = sqlite3_column_int(stmt, AccountedSize);
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, ObjMtime, op.obj.state.mtime, sdb);
+  op.obj.state.epoch = sqlite3_column_int(stmt, Epoch);
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, ObjTag, op.obj.state.obj_tag, sdb);
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, TailTag, op.obj.state.tail_tag, sdb);
+  op.obj.state.write_tag = (const char*)sqlite3_column_text(stmt, WriteTag);
+  op.obj.state.fake_tag = sqlite3_column_int(stmt, FakeTag);
+  op.obj.state.shadow_obj = (const char*)sqlite3_column_text(stmt, ShadowObj);
+  op.obj.state.has_data = sqlite3_column_int(stmt, HasData);
+  op.obj.is_versioned = sqlite3_column_int(stmt, IsVersioned);
+  op.obj.version_num = sqlite3_column_int(stmt, VersionNum);
+  op.obj.state.pg_ver = sqlite3_column_int(stmt, PGVer);
+  op.obj.state.zone_short_id = sqlite3_column_int(stmt, ZoneShortID);
+  op.obj.state.objv_tracker.read_version.ver = sqlite3_column_int(stmt, ObjVersion);
+  op.obj.state.objv_tracker.read_version.tag = (const char*)sqlite3_column_text(stmt, ObjVersionTag);
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, ObjAttrs, op.obj.state.attrset, sdb);
+  op.obj.head_size = sqlite3_column_int(stmt, HeadSize);
+  op.obj.max_head_size = sqlite3_column_int(stmt, MaxHeadSize);
+  op.obj.obj_id = (const char*)sqlite3_column_text(stmt, ObjID);
+  op.obj.tail_instance = (const char*)sqlite3_column_text(stmt, TailInstance);
+  op.obj.head_placement_rule.name = (const char*)sqlite3_column_text(stmt, HeadPlacementRuleName);
+  op.obj.head_placement_rule.storage_class = (const char*)sqlite3_column_text(stmt, HeadPlacementRuleStorageClass);
+  op.obj.tail_placement.placement_rule.name = (const char*)sqlite3_column_text(stmt, TailPlacementRuleName);
+  op.obj.tail_placement.placement_rule.storage_class = (const char*)sqlite3_column_text(stmt, TailPlacementStorageClass);
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, ManifestPartObjs, op.obj.objs, sdb);
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, ManifestPartRules, op.obj.rules, sdb);
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, Omap, op.obj.omap, sdb);
+  op.obj.is_multipart = sqlite3_column_int(stmt, IsMultipart);
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, MPPartsList, op.obj.mp_parts, sdb);
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, HeadData, op.obj.head_data, sdb);
+  op.obj.state.data = op.obj.head_data;
+
+  /* Mirror the decoded object state into a bucket-index entry so the
+   * caller gets ready-to-use listing results. */
+  rgw_bucket_dir_entry dent;
+  dent.key.name = op.obj.state.obj.key.name;
+  dent.key.instance = op.obj.state.obj.key.instance;
+  dent.tag = op.obj.tag;
+  dent.flags = op.obj.flags;
+  dent.versioned_epoch = op.obj.versioned_epoch;
+  dent.index_ver = op.obj.index_ver;
+  dent.exists = true;
+  dent.meta.category = op.obj.category;
+  dent.meta.size = op.obj.state.size;
+  dent.meta.accounted_size = op.obj.state.accounted_size;
+  dent.meta.mtime = op.obj.state.mtime;
+  dent.meta.etag = op.obj.etag;
+  dent.meta.owner = op.obj.owner;
+  dent.meta.owner_display_name = op.obj.owner_display_name;
+  dent.meta.content_type = op.obj.content_type;
+  dent.meta.storage_class = op.obj.storage_class;
+  dent.meta.appendable = op.obj.appendable;
+
+  op.obj.list_entries.push_back(dent);
+  return 0;
+}
+
+/* Row-decode callback for object-data (tail/part) queries: fills the
+ * object key plus op.obj_data (part number, offset, size, multipart
+ * part string, mtime and the data blob itself) from the current row.
+ * Returns 0 on success, -1 if stmt is null.
+ * NOTE(review): as with the other decoders, sqlite3_column_text()
+ * returning NULL for a NULL column would be UB on std::string
+ * assignment - confirm schema constraints. */
+static int get_objectdata(const DoutPrefixProvider *dpp, DBOpInfo &op, sqlite3_stmt *stmt) {
+  if (!stmt)
+    return -1;
+
+  op.obj.state.obj.key.name = (const char*)sqlite3_column_text(stmt, ObjName);
+  op.bucket.info.bucket.name = (const char*)sqlite3_column_text(stmt, ObjBucketName);
+  op.obj.state.obj.key.instance = (const char*)sqlite3_column_text(stmt, ObjInstance);
+  op.obj.state.obj.key.ns = (const char*)sqlite3_column_text(stmt, ObjNS);
+  op.obj.obj_id = (const char*)sqlite3_column_text(stmt, ObjDataID);
+  op.obj_data.part_num = sqlite3_column_int(stmt, PartNum);
+  op.obj_data.offset = sqlite3_column_int(stmt, Offset);
+  op.obj_data.size = sqlite3_column_int(stmt, ObjDataSize);
+  op.obj_data.multipart_part_str = (const char*)sqlite3_column_text(stmt, MultipartPartStr);
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, ObjDataMtime, op.obj.state.mtime, sdb);
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, ObjData, op.obj_data.data, sdb);
+
+  return 0;
+}
+
+/* Row-decode callback for lifecycle-entry queries: decodes one LC
+ * entry (index, bucket name, start time, status) into op.lc_entry and
+ * appends it to op.lc_entry.list_entries.
+ * Returns 0 on success, -1 if stmt is null. */
+static int list_lc_entry(const DoutPrefixProvider *dpp, DBOpInfo &op, sqlite3_stmt *stmt) {
+  if (!stmt)
+    return -1;
+
+  op.lc_entry.index = (const char*)sqlite3_column_text(stmt, LCEntryIndex);
+  op.lc_entry.entry.set_bucket((const char*)sqlite3_column_text(stmt, LCEntryBucketName));
+  op.lc_entry.entry.set_start_time(sqlite3_column_int(stmt, LCEntryStartTime));
+  op.lc_entry.entry.set_status(sqlite3_column_int(stmt, LCEntryStatus));
+
+  op.lc_entry.list_entries.push_back(op.lc_entry.entry);
+
+  return 0;
+}
+
+/* Row-decode callback for LC head queries: decodes the head index,
+ * marker and start date into op.lc_head.
+ * Returns 0 on success, -1 if stmt is null. */
+static int list_lc_head(const DoutPrefixProvider *dpp, DBOpInfo &op, sqlite3_stmt *stmt) {
+  if (!stmt)
+    return -1;
+
+  /* Zero-initialize so a failed or empty blob decode cannot leave an
+   * indeterminate value that would then be stored into the LC head. */
+  int64_t start_date = 0;
+
+  op.lc_head.index = (const char*)sqlite3_column_text(stmt, LCHeadIndex);
+  op.lc_head.head.set_marker((const char*)sqlite3_column_text(stmt, LCHeadMarker));
+
+  SQL_DECODE_BLOB_PARAM(dpp, stmt, LCHeadStartDate, start_date, sdb);
+  op.lc_head.head.get_start_date() = start_date;
+
+  return 0;
+}
+
+/* Create the core tables and instantiate the prepared-statement
+ * wrapper object for every user/bucket/LC operation against this
+ * database handle. Always returns 0.
+ * NOTE(review): the createTables() result is deliberately discarded
+ * here; failures presumably surface later when the individual ops
+ * prepare their statements - confirm this is intended. */
+int SQLiteDB::InitializeDBOps(const DoutPrefixProvider *dpp)
+{
+  (void)createTables(dpp);
+  dbops.InsertUser = make_shared<SQLInsertUser>(&this->db, this->getDBname(), cct);
+  dbops.RemoveUser = make_shared<SQLRemoveUser>(&this->db, this->getDBname(), cct);
+  dbops.GetUser = make_shared<SQLGetUser>(&this->db, this->getDBname(), cct);
+  dbops.InsertBucket = make_shared<SQLInsertBucket>(&this->db, this->getDBname(), cct);
+  dbops.UpdateBucket = make_shared<SQLUpdateBucket>(&this->db, this->getDBname(), cct);
+  dbops.RemoveBucket = make_shared<SQLRemoveBucket>(&this->db, this->getDBname(), cct);
+  dbops.GetBucket = make_shared<SQLGetBucket>(&this->db, this->getDBname(), cct);
+  dbops.ListUserBuckets = make_shared<SQLListUserBuckets>(&this->db, this->getDBname(), cct);
+  dbops.InsertLCEntry = make_shared<SQLInsertLCEntry>(&this->db, this->getDBname(), cct);
+  dbops.RemoveLCEntry = make_shared<SQLRemoveLCEntry>(&this->db, this->getDBname(), cct);
+  dbops.GetLCEntry = make_shared<SQLGetLCEntry>(&this->db, this->getDBname(), cct);
+  dbops.ListLCEntries = make_shared<SQLListLCEntries>(&this->db, this->getDBname(), cct);
+  dbops.InsertLCHead = make_shared<SQLInsertLCHead>(&this->db, this->getDBname(), cct);
+  dbops.RemoveLCHead = make_shared<SQLRemoveLCHead>(&this->db, this->getDBname(), cct);
+  dbops.GetLCHead = make_shared<SQLGetLCHead>(&this->db, this->getDBname(), cct);
+
+  return 0;
+}
+
+/* Open (creating if necessary) the sqlite database file returned by
+ * getDBfile(), with serialized threading mode, and enable foreign-key
+ * enforcement. Returns the sqlite3* handle on success, NULL on
+ * failure. */
+void *SQLiteDB::openDB(const DoutPrefixProvider *dpp)
+{
+  string dbname;
+  int rc = 0;
+
+  dbname = getDBfile();
+  if (dbname.empty()) {
+    ldpp_dout(dpp, 0)<<"dbname is NULL" << dendl;
+    goto out;
+  }
+
+  rc = sqlite3_open_v2(dbname.c_str(), (sqlite3**)&db,
+                       SQLITE_OPEN_READWRITE |
+                       SQLITE_OPEN_CREATE |
+                       SQLITE_OPEN_FULLMUTEX,
+                       NULL);
+
+  if (rc) {
+    ldpp_dout(dpp, 0) <<"Cant open "<<dbname<<"; Errmsg - "\
+      <<sqlite3_errmsg((sqlite3*)db) << dendl;
+    /* Per the SQLite docs, sqlite3_open_v2() allocates a handle even
+     * on failure; it must be released with sqlite3_close() and must
+     * not be returned to the caller or used for the PRAGMA below. */
+    sqlite3_close((sqlite3*)db);
+    db = NULL;
+    goto out;
+  }
+
+  ldpp_dout(dpp, 0) <<"Opened database("<<dbname<<") successfully" << dendl;
+
+  /* Enforce the FOREIGN KEY constraints declared in the schema. */
+  exec(dpp, "PRAGMA foreign_keys=ON", NULL);
+
+out:
+  return db;
+}
+
+/* Release the sqlite handle if one is open; the member is always left
+ * nulled out afterwards. Always returns 0. */
+int SQLiteDB::closeDB(const DoutPrefixProvider *dpp)
+{
+  if (db != nullptr) {
+    sqlite3_close((sqlite3 *)db);
+  }
+  db = NULL;
+
+  return 0;
+}
+
+/* Clear all parameter bindings on stmt and reset it so it can be
+ * re-executed. Returns sqlite3_reset()'s status code, or -1 if stmt
+ * is null. */
+int SQLiteDB::Reset(const DoutPrefixProvider *dpp, sqlite3_stmt *stmt)
+{
+  if (stmt == nullptr) {
+    return -1;
+  }
+
+  sqlite3_clear_bindings(stmt);
+  return sqlite3_reset(stmt);
+}
+
+/* Step the prepared statement to completion, invoking the row-decode
+ * callback (if provided) once per result row.
+ * Returns 0 when the statement finishes with SQLITE_DONE, -1 on a
+ * null statement or any sqlite error. */
+int SQLiteDB::Step(const DoutPrefixProvider *dpp, DBOpInfo &op, sqlite3_stmt *stmt,
+    int (*cbk)(const DoutPrefixProvider *dpp, DBOpInfo &op, sqlite3_stmt *stmt))
+{
+  if (!stmt) {
+    return -1;
+  }
+
+  int rc;
+  /* Drain the statement one row at a time, handing each row to the
+   * decode callback. */
+  while ((rc = sqlite3_step(stmt)) == SQLITE_ROW) {
+    if (cbk) {
+      (*cbk)(dpp, op, stmt);
+    }
+  }
+
+  if (rc != SQLITE_DONE) {
+    ldpp_dout(dpp, 0)<<"sqlite step failed for stmt("<<stmt \
+      <<"); Errmsg - "<<sqlite3_errmsg((sqlite3*)db) << dendl;
+    return -1;
+  }
+
+  ldpp_dout(dpp, 20)<<"sqlite step successfully executed for stmt(" \
+    <<stmt<<") ret = " << rc << dendl;
+
+  return 0;
+}
+
+/* Run a complete SQL string through sqlite3_exec(), optionally routing
+ * result rows to 'callback'. Returns 0 on success, -1 when no db
+ * handle is open, otherwise the sqlite error code. */
+int SQLiteDB::exec(const DoutPrefixProvider *dpp, const char *schema,
+    int (*callback)(void*,int,char**,char**))
+{
+  if (!db)
+    return -1;
+
+  char *errmsg = NULL;
+  const int rc = sqlite3_exec((sqlite3*)db, schema, callback, 0, &errmsg);
+  if (rc != SQLITE_OK) {
+    ldpp_dout(dpp, 0) <<"sqlite exec failed for schema("<<schema \
+      <<"); Errmsg - "<<errmsg << dendl;
+    /* error strings from sqlite3_exec() must be freed by the caller */
+    sqlite3_free(errmsg);
+    return rc;
+  }
+
+  ldpp_dout(dpp, 10) <<"sqlite exec successfully processed for schema(" \
+    <<schema<<")" << dendl;
+  return 0;
+}
+
+/* Create the core (user, bucket, quota) tables. On any failure, roll
+ * back the tables that were already created successfully.
+ * Returns 0 on success, -1 otherwise. */
+int SQLiteDB::createTables(const DoutPrefixProvider *dpp)
+{
+  int ret = -1;
+  /* -1 == not attempted / failed, 0 == created successfully */
+  int cu = -1, cb = -1;
+  DBOpParams params = {};
+
+  params.user_table = getUserTable();
+  params.bucket_table = getBucketTable();
+
+  if ((cu = createUserTable(dpp, &params)))
+    goto out;
+
+  if ((cb = createBucketTable(dpp, &params)))
+    goto out;
+
+  if (createQuotaTable(dpp, &params))
+    goto out;
+
+  ret = 0;
+out:
+  if (ret) {
+    /* Drop only the tables whose creation succeeded. The previous code
+     * inverted this check (if (cu) / if (cb)) and tried to drop the
+     * table that had just FAILED to create, leaving earlier
+     * successfully-created tables behind. */
+    if (cu == 0)
+      DeleteUserTable(dpp, &params);
+    if (cb == 0)
+      DeleteBucketTable(dpp, &params);
+    ldpp_dout(dpp, 0)<<"Creation of tables failed" << dendl;
+  }
+
+  return ret;
+}
+
+/* Create the "User" table from CreateTableSchema().
+ * Returns 0 on success, non-zero on error. */
+int SQLiteDB::createUserTable(const DoutPrefixProvider *dpp, DBOpParams *params)
+{
+  int ret = -1;
+  string schema;
+
+  schema = CreateTableSchema("User", params);
+
+  ret = exec(dpp, schema.c_str(), NULL);
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"CreateUserTable failed" << dendl;
+  } else {
+    /* success is now only logged on success; it was previously logged
+     * unconditionally, right after the failure message */
+    ldpp_dout(dpp, 20)<<"CreateUserTable succeeded" << dendl;
+  }
+
+  return ret;
+}
+
+/* Create the "Bucket" table from CreateTableSchema().
+ * Returns 0 on success, non-zero on error. */
+int SQLiteDB::createBucketTable(const DoutPrefixProvider *dpp, DBOpParams *params)
+{
+  int ret = -1;
+  string schema;
+
+  schema = CreateTableSchema("Bucket", params);
+
+  ret = exec(dpp, schema.c_str(), NULL);
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"CreateBucketTable failed " << dendl;
+  } else {
+    /* log success only when exec() actually succeeded */
+    ldpp_dout(dpp, 20)<<"CreateBucketTable succeeded " << dendl;
+  }
+
+  return ret;
+}
+
+/* Create the per-bucket "Object" table from CreateTableSchema().
+ * Returns 0 on success, non-zero on error. */
+int SQLiteDB::createObjectTable(const DoutPrefixProvider *dpp, DBOpParams *params)
+{
+  int ret = -1;
+  string schema;
+
+  schema = CreateTableSchema("Object", params);
+
+  ret = exec(dpp, schema.c_str(), NULL);
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"CreateObjectTable failed " << dendl;
+  } else {
+    /* log success only when exec() actually succeeded */
+    ldpp_dout(dpp, 20)<<"CreateObjectTable succeeded " << dendl;
+  }
+
+  return ret;
+}
+
+/* Create the trigger attached to the per-bucket object table.
+ * Returns 0 on success, non-zero on error. */
+int SQLiteDB::createObjectTableTrigger(const DoutPrefixProvider *dpp, DBOpParams *params)
+{
+  int ret = -1;
+  string schema;
+
+  schema = CreateTableSchema("ObjectTrigger", params);
+
+  ret = exec(dpp, schema.c_str(), NULL);
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"CreateObjectTableTrigger failed " << dendl;
+  } else {
+    /* log success only when exec() actually succeeded */
+    ldpp_dout(dpp, 20)<<"CreateObjectTableTrigger succeeded " << dendl;
+  }
+
+  return ret;
+}
+
+/* Create the object view from CreateTableSchema().
+ * Returns 0 on success, non-zero on error. */
+int SQLiteDB::createObjectView(const DoutPrefixProvider *dpp, DBOpParams *params)
+{
+  int ret = -1;
+  string schema;
+
+  schema = CreateTableSchema("ObjectView", params);
+
+  ret = exec(dpp, schema.c_str(), NULL);
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"CreateObjectView failed " << dendl;
+  } else {
+    /* log success only when exec() actually succeeded */
+    ldpp_dout(dpp, 20)<<"CreateObjectView succeeded " << dendl;
+  }
+
+  return ret;
+}
+
+/* Create the "Quota" table from CreateTableSchema().
+ * Returns 0 on success, non-zero on error. */
+int SQLiteDB::createQuotaTable(const DoutPrefixProvider *dpp, DBOpParams *params)
+{
+  int ret = -1;
+  string schema;
+
+  schema = CreateTableSchema("Quota", params);
+
+  ret = exec(dpp, schema.c_str(), NULL);
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"CreateQuotaTable failed " << dendl;
+  } else {
+    /* log success only when exec() actually succeeded */
+    ldpp_dout(dpp, 20)<<"CreateQuotaTable succeeded " << dendl;
+  }
+
+  return ret;
+}
+
+/* Create the per-bucket "ObjectData" (tail data) table.
+ * Returns 0 on success, non-zero on error. */
+int SQLiteDB::createObjectDataTable(const DoutPrefixProvider *dpp, DBOpParams *params)
+{
+  int ret = -1;
+  string schema;
+
+  schema = CreateTableSchema("ObjectData", params);
+
+  ret = exec(dpp, schema.c_str(), NULL);
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"CreateObjectDataTable failed " << dendl;
+  } else {
+    /* log success only when exec() actually succeeded */
+    ldpp_dout(dpp, 20)<<"CreateObjectDataTable succeeded " << dendl;
+  }
+
+  return ret;
+}
+
+/* Create the lifecycle tables (LCEntry, then LCHead). If the LCHead
+ * table fails to create, the freshly-created LCEntry table is rolled
+ * back. Returns 0 on success, non-zero on error. */
+int SQLiteDB::createLCTables(const DoutPrefixProvider *dpp)
+{
+  int ret = -1;
+  string schema;
+  DBOpParams params = {};
+
+  params.lc_entry_table = getLCEntryTable();
+  params.lc_head_table = getLCHeadTable();
+  params.bucket_table = getBucketTable();
+
+  schema = CreateTableSchema("LCEntry", &params);
+  ret = exec(dpp, schema.c_str(), NULL);
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"CreateLCEntryTable failed" << dendl;
+    return ret;
+  }
+  ldpp_dout(dpp, 20)<<"CreateLCEntryTable succeeded" << dendl;
+
+  schema = CreateTableSchema("LCHead", &params);
+  ret = exec(dpp, schema.c_str(), NULL);
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"CreateLCHeadTable failed" << dendl;
+    /* roll back the LCEntry table created above; previously the code
+     * fell through and logged LCHead success even on this path */
+    (void)DeleteLCEntryTable(dpp, &params);
+    return ret;
+  }
+  ldpp_dout(dpp, 20)<<"CreateLCHeadTable succeeded" << dendl;
+
+  return ret;
+}
+
+int SQLiteDB::DeleteUserTable(const DoutPrefixProvider *dpp, DBOpParams *params)
+{
+ int ret = -1;
+ string schema;
+
+ schema = DeleteTableSchema(params->user_table);
+
+ ret = exec(dpp, schema.c_str(), NULL);
+ if (ret)
+ ldpp_dout(dpp, 0)<<"DeleteUserTable failed " << dendl;
+
+ ldpp_dout(dpp, 20)<<"DeleteUserTable suceeded " << dendl;
+
+ return ret;
+}
+
+int SQLiteDB::DeleteBucketTable(const DoutPrefixProvider *dpp, DBOpParams *params)
+{
+ int ret = -1;
+ string schema;
+
+ schema = DeleteTableSchema(params->bucket_table);
+
+ ret = exec(dpp, schema.c_str(), NULL);
+ if (ret)
+ ldpp_dout(dpp, 0)<<"DeletebucketTable failed " << dendl;
+
+ ldpp_dout(dpp, 20)<<"DeletebucketTable suceeded " << dendl;
+
+ return ret;
+}
+
+/* Drop a per-bucket object table. Returns 0 on success, non-zero on
+ * error. */
+int SQLiteDB::DeleteObjectTable(const DoutPrefixProvider *dpp, DBOpParams *params)
+{
+  int ret = -1;
+  string schema;
+
+  schema = DeleteTableSchema(params->object_table);
+
+  ret = exec(dpp, schema.c_str(), NULL);
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"DeleteObjectTable failed " << dendl;
+  } else {
+    /* log success only when exec() actually succeeded */
+    ldpp_dout(dpp, 20)<<"DeleteObjectTable succeeded " << dendl;
+  }
+
+  return ret;
+}
+
+/* Drop a per-bucket object-data table. Returns 0 on success, non-zero
+ * on error. */
+int SQLiteDB::DeleteObjectDataTable(const DoutPrefixProvider *dpp, DBOpParams *params)
+{
+  int ret = -1;
+  string schema;
+
+  schema = DeleteTableSchema(params->objectdata_table);
+
+  ret = exec(dpp, schema.c_str(), NULL);
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"DeleteObjectDataTable failed " << dendl;
+  } else {
+    /* log success only when exec() actually succeeded */
+    ldpp_dout(dpp, 20)<<"DeleteObjectDataTable succeeded " << dendl;
+  }
+
+  return ret;
+}
+
+int SQLiteDB::DeleteQuotaTable(const DoutPrefixProvider *dpp, DBOpParams *params)
+{
+ int ret = -1;
+ string schema;
+
+ schema = DeleteTableSchema(params->quota_table);
+
+ ret = exec(dpp, schema.c_str(), NULL);
+ if (ret)
+ ldpp_dout(dpp, 0)<<"DeleteQuotaTable failed " << dendl;
+
+ ldpp_dout(dpp, 20)<<"DeleteQuotaTable suceeded " << dendl;
+
+ return ret;
+}
+
+/* Drop the lifecycle-entry table. Returns 0 on success, non-zero on
+ * error. */
+int SQLiteDB::DeleteLCEntryTable(const DoutPrefixProvider *dpp, DBOpParams *params)
+{
+  int ret = -1;
+  string schema;
+
+  schema = DeleteTableSchema(params->lc_entry_table);
+  ret = exec(dpp, schema.c_str(), NULL);
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"DeleteLCEntryTable failed " << dendl;
+  } else {
+    /* log success only when exec() actually succeeded */
+    ldpp_dout(dpp, 20)<<"DeleteLCEntryTable succeeded " << dendl;
+  }
+
+  return ret;
+}
+
+/* Drop the lifecycle-head table. Returns 0 on success, non-zero on
+ * error. */
+int SQLiteDB::DeleteLCHeadTable(const DoutPrefixProvider *dpp, DBOpParams *params)
+{
+  int ret = -1;
+  string schema;
+
+  schema = DeleteTableSchema(params->lc_head_table);
+  ret = exec(dpp, schema.c_str(), NULL);
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"DeleteLCHeadTable failed " << dendl;
+  } else {
+    /* log success only when exec() actually succeeded */
+    ldpp_dout(dpp, 20)<<"DeleteLCHeadTable succeeded " << dendl;
+  }
+
+  return ret;
+}
+
+/* Dump every row of the user table through list_callback (debug aid).
+ * Returns 0 on success, non-zero on error. */
+int SQLiteDB::ListAllUsers(const DoutPrefixProvider *dpp, DBOpParams *params)
+{
+  int ret = -1;
+  string schema;
+
+  schema = ListTableSchema(params->user_table);
+  ret = exec(dpp, schema.c_str(), &list_callback);
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"GetUsertable failed " << dendl;
+  } else {
+    /* log success only when exec() actually succeeded */
+    ldpp_dout(dpp, 20)<<"GetUserTable succeeded " << dendl;
+  }
+
+  return ret;
+}
+
+/* Dump every row of the bucket table through list_callback (debug
+ * aid). Returns 0 on success, non-zero on error. */
+int SQLiteDB::ListAllBuckets(const DoutPrefixProvider *dpp, DBOpParams *params)
+{
+  int ret = -1;
+  string schema;
+
+  schema = ListTableSchema(params->bucket_table);
+
+  ret = exec(dpp, schema.c_str(), &list_callback);
+  if (ret) {
+    ldpp_dout(dpp, 0)<<"Listbuckettable failed " << dendl;
+  } else {
+    /* log success only when exec() actually succeeded */
+    ldpp_dout(dpp, 20)<<"ListbucketTable succeeded " << dendl;
+  }
+
+  return ret;
+}
+
+/* Dump every row of every per-bucket object table currently registered
+ * in the object map (debug aid).
+ * Returns 0 if all listings succeed (or the map is empty); otherwise
+ * the status of the last failing exec(). The previous version returned
+ * -1 for an empty map (no error had occurred) and reported only the
+ * final iteration's status. */
+int SQLiteDB::ListAllObjects(const DoutPrefixProvider *dpp, DBOpParams *params)
+{
+  int ret = 0;
+  string schema;
+  map<string, class ObjectOp*> objectmap;
+
+  objectmap = getObjectMap();
+
+  if (objectmap.empty())
+    ldpp_dout(dpp, 20)<<"objectmap empty " << dendl;
+
+  for (const auto& entry : objectmap) {
+    params->object_table = getObjectTable(entry.first);
+    schema = ListTableSchema(params->object_table);
+
+    int rc = exec(dpp, schema.c_str(), &list_callback);
+    if (rc) {
+      ldpp_dout(dpp, 0)<<"ListObjecttable failed " << dendl;
+      /* remember the failure but keep listing the remaining buckets */
+      ret = rc;
+    } else {
+      ldpp_dout(dpp, 20)<<"ListObjectTable succeeded " << dendl;
+    }
+  }
+
+  return ret;
+}
+
+/* Wire up the per-bucket object operation handlers (each wraps its own
+ * prepared SQL statement) against the given database handle and name.
+ * Always returns 0. */
+int SQLObjectOp::InitializeObjectOps(string db_name, const DoutPrefixProvider *dpp)
+{
+  PutObject = make_shared<SQLPutObject>(sdb, db_name, cct);
+  DeleteObject = make_shared<SQLDeleteObject>(sdb, db_name, cct);
+  GetObject = make_shared<SQLGetObject>(sdb, db_name, cct);
+  UpdateObject = make_shared<SQLUpdateObject>(sdb, db_name, cct);
+  ListBucketObjects = make_shared<SQLListBucketObjects>(sdb, db_name, cct);
+  ListVersionedObjects = make_shared<SQLListVersionedObjects>(sdb, db_name, cct);
+  PutObjectData = make_shared<SQLPutObjectData>(sdb, db_name, cct);
+  UpdateObjectData = make_shared<SQLUpdateObjectData>(sdb, db_name, cct);
+  GetObjectData = make_shared<SQLGetObjectData>(sdb, db_name, cct);
+  DeleteObjectData = make_shared<SQLDeleteObjectData>(sdb, db_name, cct);
+  DeleteStaleObjectData = make_shared<SQLDeleteStaleObjectData>(sdb, db_name, cct);
+
+  return 0;
+}
+
+/* Compile and cache the InsertUser statement. SQL_PREPARE stores the
+ * compiled statement in 'stmt' and sets 'ret' (0 on success).
+ * Returns non-zero if the db handle is unset or preparation fails. */
+int SQLInsertUser::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  if (!*sdb) {
+    ldpp_dout(dpp, 0)<<"In SQLInsertUser - no db" << dendl;
+    goto out;
+  }
+
+  /* substitute the runtime table/column names into the query template */
+  InitPrepareParams(dpp, p_params, params);
+
+  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareInsertUser");
+out:
+  return ret;
+}
+
+/* Bind every user column onto the prepared InsertUser statement.
+ * Column positions are resolved by name via SQL_BIND_INDEX; the bind
+ * macros set 'rc' and jump to 'out' on error, so statement order
+ * matters. Returns 0 on success. */
+int SQLInsertUser::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int index = -1;
+  int rc = 0;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.tenant, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.user.uinfo.user_id.tenant.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.user_id, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.user.uinfo.user_id.id.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.ns, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.user.uinfo.user_id.ns.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.display_name, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.user.uinfo.display_name.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.user_email, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.user.uinfo.user_email.c_str(), sdb);
+
+  /* Only the first access key's id/secret go into the dedicated
+   * columns; the complete key map is stored as a blob further below. */
+  if (!params->op.user.uinfo.access_keys.empty()) {
+    string access_key;
+    string key;
+    map<string, RGWAccessKey>::const_iterator it =
+      params->op.user.uinfo.access_keys.begin();
+    const RGWAccessKey& k = it->second;
+    access_key = k.id;
+    key = k.key;
+
+    SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.access_keys_id, sdb);
+    SQL_BIND_TEXT(dpp, stmt, index, access_key.c_str(), sdb);
+
+    SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.access_keys_secret, sdb);
+    SQL_BIND_TEXT(dpp, stmt, index, key.c_str(), sdb);
+
+  }
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.access_keys, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.user.uinfo.access_keys, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.swift_keys, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.user.uinfo.swift_keys, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.subusers, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.user.uinfo.subusers, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.suspended, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.user.uinfo.suspended, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.max_buckets, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.user.uinfo.max_buckets, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.op_mask, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.user.uinfo.op_mask, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.user_caps, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.user.uinfo.caps, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.admin, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.user.uinfo.admin, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.system, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.user.uinfo.system, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.placement_name, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.user.uinfo.default_placement.name.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.placement_storage_class, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.user.uinfo.default_placement.storage_class.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.placement_tags, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.user.uinfo.placement_tags, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.bucket_quota, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.user.uinfo.quota.bucket_quota, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.temp_url_keys, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.user.uinfo.temp_url_keys, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.user_quota, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.user.uinfo.quota.user_quota, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.type, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.user.uinfo.type, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.mfa_ids, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.user.uinfo.mfa_ids, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.user_attrs, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.user.user_attrs, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.user_ver, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.user.user_version.ver, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.user_ver_tag, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.user.user_version.tag.c_str(), sdb);
+
+out:
+  return rc;
+}
+
+/* Run the prepared insert. SQL_EXECUTE steps the statement (no
+ * row-decode callback is needed for an insert), sets 'ret' and jumps
+ * to 'out' on error. Returns 0 on success. */
+int SQLInsertUser::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+
+  SQL_EXECUTE(dpp, params, stmt, NULL);
+out:
+  return ret;
+}
+
+/* Compile and cache the RemoveUser (delete) statement. SQL_PREPARE
+ * stores the compiled statement in 'stmt' and sets 'ret'.
+ * Returns non-zero if the db handle is unset or preparation fails. */
+int SQLRemoveUser::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  if (!*sdb) {
+    ldpp_dout(dpp, 0)<<"In SQLRemoveUser - no db" << dendl;
+    goto out;
+  }
+
+  /* substitute the runtime table/column names into the query template */
+  InitPrepareParams(dpp, p_params, params);
+
+  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareRemoveUser");
+out:
+  return ret;
+}
+
+/* Bind the user id (the delete key) onto the RemoveUser statement.
+ * The bind macros set 'rc' and jump to 'out' on error.
+ * Returns 0 on success. */
+int SQLRemoveUser::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int index = -1;
+  int rc = 0;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.user_id, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.user.uinfo.user_id.id.c_str(), sdb);
+
+out:
+  return rc;
+}
+
+/* Run the prepared delete. SQL_EXECUTE steps the statement, sets
+ * 'ret' and jumps to 'out' on error. Returns 0 on success. */
+int SQLRemoveUser::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+
+  SQL_EXECUTE(dpp, params, stmt, NULL);
+out:
+  return ret;
+}
+
+/* Compile and cache the GetUser statement. A separate statement is
+ * kept per lookup key (email / access key / user id), selected by
+ * params->op.query_str; the "user_id" and default branches both look
+ * up by user id but keep distinct cached statements.
+ * Returns non-zero if the db handle is unset or preparation fails. */
+int SQLGetUser::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  if (!*sdb) {
+    ldpp_dout(dpp, 0)<<"In SQLGetUser - no db" << dendl;
+    goto out;
+  }
+
+  /* substitute the runtime table/column names into the query template */
+  InitPrepareParams(dpp, p_params, params);
+
+  if (params->op.query_str == "email") {
+    SQL_PREPARE(dpp, p_params, sdb, email_stmt, ret, "PrepareGetUser");
+  } else if (params->op.query_str == "access_key") {
+    SQL_PREPARE(dpp, p_params, sdb, ak_stmt, ret, "PrepareGetUser");
+  } else if (params->op.query_str == "user_id") {
+    SQL_PREPARE(dpp, p_params, sdb, userid_stmt, ret, "PrepareGetUser");
+  } else { // by default by userid
+    SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareGetUser");
+  }
+out:
+  return ret;
+}
+
+/* Bind the lookup key onto whichever cached statement matches
+ * params->op.query_str. The bind macros set 'rc' and jump to 'out' on
+ * error. Returns 0 on success.
+ * NOTE(review): for "access_key" with an empty access_keys map nothing
+ * is bound and rc stays 0 - confirm callers guarantee a key here. */
+int SQLGetUser::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int index = -1;
+  int rc = 0;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  if (params->op.query_str == "email") {
+    SQL_BIND_INDEX(dpp, email_stmt, index, p_params.op.user.user_email, sdb);
+    SQL_BIND_TEXT(dpp, email_stmt, index, params->op.user.uinfo.user_email.c_str(), sdb);
+  } else if (params->op.query_str == "access_key") {
+    if (!params->op.user.uinfo.access_keys.empty()) {
+      string access_key;
+      map<string, RGWAccessKey>::const_iterator it =
+        params->op.user.uinfo.access_keys.begin();
+      const RGWAccessKey& k = it->second;
+      access_key = k.id;
+
+      SQL_BIND_INDEX(dpp, ak_stmt, index, p_params.op.user.access_keys_id, sdb);
+      SQL_BIND_TEXT(dpp, ak_stmt, index, access_key.c_str(), sdb);
+    }
+  } else if (params->op.query_str == "user_id") {
+    SQL_BIND_INDEX(dpp, userid_stmt, index, p_params.op.user.user_id, sdb);
+    SQL_BIND_TEXT(dpp, userid_stmt, index, params->op.user.uinfo.user_id.id.c_str(), sdb);
+  } else { // by default by userid
+    SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.user_id, sdb);
+    SQL_BIND_TEXT(dpp, stmt, index, params->op.user.uinfo.user_id.id.c_str(), sdb);
+  }
+
+out:
+  return rc;
+}
+
+/* Run whichever cached lookup statement matches params->op.query_str,
+ * decoding each result row via the list_user callback. SQL_EXECUTE
+ * sets 'ret' and jumps to 'out' on error. Returns 0 on success. */
+int SQLGetUser::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+
+  if (params->op.query_str == "email") {
+    SQL_EXECUTE(dpp, params, email_stmt, list_user);
+  } else if (params->op.query_str == "access_key") {
+    SQL_EXECUTE(dpp, params, ak_stmt, list_user);
+  } else if (params->op.query_str == "user_id") {
+    SQL_EXECUTE(dpp, params, userid_stmt, list_user);
+  } else { // by default by userid
+    SQL_EXECUTE(dpp, params, stmt, list_user);
+  }
+
+out:
+  return ret;
+}
+
+/* Compile and cache the InsertBucket statement. SQL_PREPARE stores the
+ * compiled statement in 'stmt' and sets 'ret'.
+ * Returns non-zero if the db handle is unset or preparation fails. */
+int SQLInsertBucket::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int ret = -1;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  if (!*sdb) {
+    ldpp_dout(dpp, 0)<<"In SQLInsertBucket - no db" << dendl;
+    goto out;
+  }
+
+  /* substitute the runtime table/column names into the query template */
+  InitPrepareParams(dpp, p_params, params);
+
+  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareInsertBucket");
+
+out:
+  return ret;
+}
+
+/* Bind every bucket column onto the prepared InsertBucket statement.
+ * Column positions are resolved by name via SQL_BIND_INDEX; the bind
+ * macros set 'rc' and jump to 'out' on error, so statement order
+ * matters. Returns 0 on success. */
+int SQLInsertBucket::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+  int index = -1;
+  int rc = 0;
+  struct DBOpPrepareParams p_params = PrepareParams;
+
+  // user_id here is copied as OwnerID in the bucket table.
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.user.user_id, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.user.uinfo.user_id.id.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.tenant, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.tenant.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.marker, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.marker.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_id, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.bucket_id.c_str(), sdb);
+
+  /* usage stats come from the bucket dir entry (ent), the remaining
+   * metadata from bucket.info */
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.size, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.bucket.ent.size, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.size_rounded, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.bucket.ent.size_rounded, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.creation_time, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.bucket.info.creation_time, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.count, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.bucket.ent.count, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.placement_name, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.placement_rule.name.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.placement_storage_class, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.placement_rule.storage_class.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.flags, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.bucket.info.flags, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.zonegroup, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.zonegroup.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.has_instance_obj, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.bucket.info.has_instance_obj, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.quota, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.bucket.info.quota, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.requester_pays, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.bucket.info.requester_pays, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.has_website, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.bucket.info.has_website, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.website_conf, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.bucket.info.website_conf, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.swift_versioning, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.bucket.info.swift_versioning, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.swift_ver_location, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.swift_ver_location.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.mdsearch_config, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.bucket.info.mdsearch_config, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.new_bucket_instance_id, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.new_bucket_instance_id.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.obj_lock, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.bucket.info.obj_lock, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.sync_policy_info_groups, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.bucket.info.sync_policy, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_attrs, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.bucket.bucket_attrs, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_ver, sdb);
+  SQL_BIND_INT(dpp, stmt, index, params->op.bucket.bucket_version.ver, sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_ver_tag, sdb);
+  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.bucket_version.tag.c_str(), sdb);
+
+  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.mtime, sdb);
+  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.bucket.mtime, sdb);
+
+out:
+  return rc;
+}
+
// Execute the prepared bucket-insert statement, then create the
// per-bucket object tables. Returns 0 on success (set by SQL_EXECUTE),
// -1 on failure.
int SQLInsertBucket::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
{
  int ret = -1;
  class SQLObjectOp *ObPtr = NULL;
  string bucket_name = params->op.bucket.info.bucket.name;
  struct DBOpPrepareParams p_params = PrepareParams;

  // Register an object-op handler for the new bucket in the object map.
  // NOTE(review): ownership of ObPtr is presumably transferred to the
  // map by objectmapInsert — confirm it is freed on all paths.
  ObPtr = new SQLObjectOp(sdb, ctx());

  objectmapInsert(dpp, bucket_name, ObPtr);

  // Runs the statement prepared earlier; the macro sets 'ret' and jumps
  // to 'out' on failure.
  SQL_EXECUTE(dpp, params, stmt, NULL);

  /* Once Bucket is inserted created corresponding object(&data) tables
   */
  InitPrepareParams(dpp, p_params, params);

  // Best-effort: table/trigger creation errors are deliberately ignored.
  (void)createObjectTable(dpp, params);
  (void)createObjectDataTable(dpp, params);
  (void)createObjectTableTrigger(dpp, params);
out:
  return ret;
}
+
// Prepare one of three bucket-update statements, selected by
// params->op.query_str: "attrs", "owner" or "info". Each variant is
// cached in its own member statement handle. Returns 0 on success
// (set by SQL_PREPARE), -1 on missing db or unknown query_str.
int SQLUpdateBucket::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
{
  int ret = -1;
  struct DBOpPrepareParams p_params = PrepareParams;

  if (!*sdb) {
    ldpp_dout(dpp, 0)<<"In SQLUpdateBucket - no db" << dendl;
    goto out;
  }

  // Fill p_params (table names, schema strings) from the runtime params.
  InitPrepareParams(dpp, p_params, params);

  if (params->op.query_str == "attrs") {
    SQL_PREPARE(dpp, p_params, sdb, attrs_stmt, ret, "PrepareUpdateBucket");
  } else if (params->op.query_str == "owner") {
    SQL_PREPARE(dpp, p_params, sdb, owner_stmt, ret, "PrepareUpdateBucket");
  } else if (params->op.query_str == "info") {
    SQL_PREPARE(dpp, p_params, sdb, info_stmt, ret, "PrepareUpdateBucket");
  } else {
    ldpp_dout(dpp, 0)<<"In SQLUpdateBucket invalid query_str:" <<
      params->op.query_str << "" << dendl;
    goto out;
  }

out:
  return ret;
}
+
// Bind the parameters for the bucket-update variant selected by
// params->op.query_str ("attrs", "owner" or "info"). For every field the
// pattern is: SQL_BIND_INDEX resolves the named placeholder to an index,
// then the matching SQL_BIND_*/SQL_ENCODE_BLOB_PARAM binds the value.
// The macros jump to 'out' on error with 'rc' set. Returns 0 on success.
int SQLUpdateBucket::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
{
  int index = -1;
  int rc = 0;
  struct DBOpPrepareParams p_params = PrepareParams;
  sqlite3_stmt** stmt = NULL; // Prepared statement

  /* All below fields for attrs */
  // Pick the statement handle matching the query variant.
  if (params->op.query_str == "attrs") {
    stmt = &attrs_stmt;
  } else if (params->op.query_str == "owner") {
    stmt = &owner_stmt;
  } else if (params->op.query_str == "info") {
    stmt = &info_stmt;
  } else {
    ldpp_dout(dpp, 0)<<"In SQLUpdateBucket invalid query_str:" <<
      params->op.query_str << "" << dendl;
    goto out;
  }

  if (params->op.query_str == "attrs") {
    // "attrs": only the serialized bucket attributes change.
    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.bucket_attrs, sdb);
    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.bucket.bucket_attrs, sdb);
  } else if (params->op.query_str == "owner") {
    // "owner": binds creation_time here; the owning user_id itself is
    // bound in the common section below.
    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.creation_time, sdb);
    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.bucket.info.creation_time, sdb);
  } else if (params->op.query_str == "info") {
    // "info": rebind the full RGWBucketInfo column set.
    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.tenant, sdb);
    SQL_BIND_TEXT(dpp, *stmt, index, params->op.bucket.info.bucket.tenant.c_str(), sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.marker, sdb);
    SQL_BIND_TEXT(dpp, *stmt, index, params->op.bucket.info.bucket.marker.c_str(), sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.bucket_id, sdb);
    SQL_BIND_TEXT(dpp, *stmt, index, params->op.bucket.info.bucket.bucket_id.c_str(), sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.creation_time, sdb);
    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.bucket.info.creation_time, sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.count, sdb);
    SQL_BIND_INT(dpp, *stmt, index, params->op.bucket.ent.count, sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.placement_name, sdb);
    SQL_BIND_TEXT(dpp, *stmt, index, params->op.bucket.info.placement_rule.name.c_str(), sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.placement_storage_class, sdb);
    SQL_BIND_TEXT(dpp, *stmt, index, params->op.bucket.info.placement_rule.storage_class.c_str(), sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.flags, sdb);
    SQL_BIND_INT(dpp, *stmt, index, params->op.bucket.info.flags, sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.zonegroup, sdb);
    SQL_BIND_TEXT(dpp, *stmt, index, params->op.bucket.info.zonegroup.c_str(), sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.has_instance_obj, sdb);
    SQL_BIND_INT(dpp, *stmt, index, params->op.bucket.info.has_instance_obj, sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.quota, sdb);
    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.bucket.info.quota, sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.requester_pays, sdb);
    SQL_BIND_INT(dpp, *stmt, index, params->op.bucket.info.requester_pays, sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.has_website, sdb);
    SQL_BIND_INT(dpp, *stmt, index, params->op.bucket.info.has_website, sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.website_conf, sdb);
    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.bucket.info.website_conf, sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.swift_versioning, sdb);
    SQL_BIND_INT(dpp, *stmt, index, params->op.bucket.info.swift_versioning, sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.swift_ver_location, sdb);
    SQL_BIND_TEXT(dpp, *stmt, index, params->op.bucket.info.swift_ver_location.c_str(), sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.mdsearch_config, sdb);
    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.bucket.info.mdsearch_config, sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.new_bucket_instance_id, sdb);
    SQL_BIND_TEXT(dpp, *stmt, index, params->op.bucket.info.new_bucket_instance_id.c_str(), sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.obj_lock, sdb);
    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.bucket.info.obj_lock, sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.sync_policy_info_groups, sdb);
    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.bucket.info.sync_policy, sdb);
  }

  // Common bindings shared by all three variants: owner, key (bucket
  // name), version and mtime.
  SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.user.user_id, sdb);
  SQL_BIND_TEXT(dpp, *stmt, index, params->op.user.uinfo.user_id.id.c_str(), sdb);

  SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.bucket_name, sdb);
  SQL_BIND_TEXT(dpp, *stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb);

  SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.bucket_ver, sdb);
  SQL_BIND_INT(dpp, *stmt, index, params->op.bucket.bucket_version.ver, sdb);

  SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.mtime, sdb);
  SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.bucket.mtime, sdb);

out:
  return rc;
}
+
// Execute the bucket-update statement matching params->op.query_str.
// Returns 0 on success (set by SQL_EXECUTE), -1 on failure or unknown
// query_str.
int SQLUpdateBucket::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
{
  int ret = -1;
  sqlite3_stmt** stmt = NULL; // Prepared statement

  // Same variant selection as Prepare()/Bind().
  if (params->op.query_str == "attrs") {
    stmt = &attrs_stmt;
  } else if (params->op.query_str == "owner") {
    stmt = &owner_stmt;
  } else if (params->op.query_str == "info") {
    stmt = &info_stmt;
  } else {
    ldpp_dout(dpp, 0)<<"In SQLUpdateBucket invalid query_str:" <<
      params->op.query_str << "" << dendl;
    goto out;
  }

  SQL_EXECUTE(dpp, params, *stmt, NULL);
out:
  return ret;
}
+
// Prepare the bucket-delete statement. Returns 0 on success (set by
// SQL_PREPARE), -1 if the db handle is missing.
int SQLRemoveBucket::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
{
  int ret = -1;
  struct DBOpPrepareParams p_params = PrepareParams;

  if (!*sdb) {
    ldpp_dout(dpp, 0)<<"In SQLRemoveBucket - no db" << dendl;
    goto out;
  }

  InitPrepareParams(dpp, p_params, params);

  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareRemoveBucket");

out:
  return ret;
}
+
// Bind the single key (bucket name) of the delete statement.
// Returns 0 on success; the macros jump to 'out' with 'rc' set on error.
int SQLRemoveBucket::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
{
  int index = -1;
  int rc = 0;
  struct DBOpPrepareParams p_params = PrepareParams;

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb);

  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb);

out:
  return rc;
}
+
// Execute the bucket delete and drop the bucket's entry from the
// in-memory object map. Returns 0 on success (set by SQL_EXECUTE).
int SQLRemoveBucket::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
{
  int ret = -1;

  // Remove the cached object-op handler before deleting the row.
  objectmapDelete(dpp, params->op.bucket.info.bucket.name);

  SQL_EXECUTE(dpp, params, stmt, NULL);
out:
  return ret;
}
+
// Prepare the single-bucket lookup statement. Returns 0 on success
// (set by SQL_PREPARE), -1 if the db handle is missing.
int SQLGetBucket::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
{
  int ret = -1;
  struct DBOpPrepareParams p_params = PrepareParams;

  if (!*sdb) {
    ldpp_dout(dpp, 0)<<"In SQLGetBucket - no db" << dendl;
    goto out;
  }

  InitPrepareParams(dpp, p_params, params);

  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareGetBucket");

out:
  return ret;
}
+
// Bind the lookup key (bucket name). Returns 0 on success; the macros
// jump to 'out' with 'rc' set on error.
int SQLGetBucket::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
{
  int index = -1;
  int rc = 0;
  struct DBOpPrepareParams p_params = PrepareParams;

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb);

  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb);

out:
  return rc;
}
+
// Execute the bucket lookup; rows are decoded by the list_bucket
// callback passed to SQL_EXECUTE. Also (re)registers the bucket in the
// object map so object ops work after a server restart.
int SQLGetBucket::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
{
  int ret = -1;
  class SQLObjectOp *ObPtr = NULL;

  params->op.name = "GetBucket";

  // NOTE(review): ownership of ObPtr presumably passes to the map;
  // confirm objectmapInsert frees it if the key already exists.
  ObPtr = new SQLObjectOp(sdb, ctx());

  /* For the case when the server restarts, need to reinsert objectmap*/
  objectmapInsert(dpp, params->op.bucket.info.bucket.name, ObPtr);
  SQL_EXECUTE(dpp, params, stmt, list_bucket);
out:
  return ret;
}
+
// Prepare the bucket-listing statement. query_str "all" selects the
// variant that lists buckets of every user (all_stmt); anything else
// prepares the per-user variant (stmt). Returns 0 on success.
int SQLListUserBuckets::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
{
  int ret = -1;
  struct DBOpPrepareParams p_params = PrepareParams;

  if (!*sdb) {
    ldpp_dout(dpp, 0)<<"In SQLListUserBuckets - no db" << dendl;
    goto out;
  }

  InitPrepareParams(dpp, p_params, params);

  if (params->op.query_str == "all") {
    SQL_PREPARE(dpp, p_params, sdb, all_stmt, ret, "PrepareListUserBuckets");
  }else {
    SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareListUserBuckets");
  }

out:
  return ret;
}
+
// Bind the listing parameters: the owning user id (per-user variant
// only), the pagination marker and the max result count. Returns 0 on
// success; the macros jump to 'out' with 'rc' set on error.
int SQLListUserBuckets::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
{
  int index = -1;
  int rc = 0;
  struct DBOpPrepareParams p_params = PrepareParams;
  sqlite3_stmt** pstmt = NULL; // Prepared statement

  // Pick the statement handle matching the variant prepared earlier.
  if (params->op.query_str == "all") {
    pstmt = &all_stmt;
  } else {
    pstmt = &stmt;
  }

  // The "all" variant has no user_id placeholder.
  if (params->op.query_str != "all") {
    SQL_BIND_INDEX(dpp, *pstmt, index, p_params.op.user.user_id, sdb);
    SQL_BIND_TEXT(dpp, *pstmt, index, params->op.user.uinfo.user_id.id.c_str(), sdb);
  }

  // Pagination: resume after min_marker, return at most list_max_count.
  SQL_BIND_INDEX(dpp, *pstmt, index, p_params.op.bucket.min_marker, sdb);
  SQL_BIND_TEXT(dpp, *pstmt, index, params->op.bucket.min_marker.c_str(), sdb);

  SQL_BIND_INDEX(dpp, *pstmt, index, p_params.op.list_max_count, sdb);
  SQL_BIND_INT(dpp, *pstmt, index, params->op.list_max_count, sdb);

out:
  return rc;
}
+
// Execute the listing; rows are decoded by the list_bucket callback.
// Returns 0 on success (set by SQL_EXECUTE).
int SQLListUserBuckets::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
{
  int ret = -1;

  if (params->op.query_str == "all") {
    SQL_EXECUTE(dpp, params, all_stmt, list_bucket);
  } else {
    SQL_EXECUTE(dpp, params, stmt, list_bucket);
  }
out:
  return ret;
}
+
// Prepare the object-insert statement against this bucket's object
// table. Returns 0 on success (set by SQL_PREPARE), -1 if no db.
int SQLPutObject::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
{
  int ret = -1;
  struct DBOpPrepareParams p_params = PrepareParams;

  if (!*sdb) {
    ldpp_dout(dpp, 0)<<"In SQLPutObject - no db" << dendl;
    goto out;
  }

  InitPrepareParams(dpp, p_params, params);
  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PreparePutObject");

out:
  return ret;
}
+
// Bind every column of the object-insert statement from params->op.obj.
// Each field follows the same pattern: SQL_BIND_INDEX resolves the named
// placeholder, then SQL_BIND_*/SQL_ENCODE_BLOB_PARAM binds the value.
// The macros jump to 'out' with 'rc' set on error. Returns 0 on success.
int SQLPutObject::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
{
  int index = -1;
  int rc = 0;
  // New objects are inserted with version number 0; the object-table
  // trigger presumably maintains it afterwards — confirm in the schema.
  int VersionNum = 0;
  struct DBOpPrepareParams p_params = PrepareParams;

  // Non-versioned objects are stored with the literal instance "null".
  if (params->op.obj.state.obj.key.instance.empty()) {
    params->op.obj.state.obj.key.instance = "null";
  }

  // --- Object key: name / bucket / instance / namespace ---
  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_name, sdb);
  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.name.c_str(), sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb);
  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_instance, sdb);
  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.instance.c_str(), sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_ns, sdb);
  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.ns.c_str(), sdb);

  // --- Directory-entry metadata ---
  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.acls, sdb);
  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.acls, sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.index_ver, sdb);
  SQL_BIND_INT(dpp, stmt, index, params->op.obj.index_ver, sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.tag, sdb);
  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.tag.c_str(), sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.flags, sdb);
  SQL_BIND_INT(dpp, stmt, index, params->op.obj.flags, sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.versioned_epoch, sdb);
  SQL_BIND_INT(dpp, stmt, index, params->op.obj.versioned_epoch, sdb);

  // Category enum stored as its integer value.
  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_category, sdb);
  SQL_BIND_INT(dpp, stmt, index, (uint8_t)(params->op.obj.category), sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.etag, sdb);
  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.etag.c_str(), sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.owner, sdb);
  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.owner.c_str(), sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.owner_display_name, sdb);
  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.owner_display_name.c_str(), sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.storage_class, sdb);
  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.storage_class.c_str(), sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.appendable, sdb);
  SQL_BIND_INT(dpp, stmt, index, params->op.obj.appendable, sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.content_type, sdb);
  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.content_type.c_str(), sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.index_hash_source, sdb);
  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.index_hash_source.c_str(), sdb);

  // --- Object state: sizes, times, tags ---
  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_size, sdb);
  SQL_BIND_INT(dpp, stmt, index, params->op.obj.state.size, sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.accounted_size, sdb);
  SQL_BIND_INT(dpp, stmt, index, params->op.obj.state.accounted_size, sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.mtime, sdb);
  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.state.mtime, sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.epoch, sdb);
  SQL_BIND_INT(dpp, stmt, index, params->op.obj.state.epoch, sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_tag, sdb);
  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.state.obj_tag, sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.tail_tag, sdb);
  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.state.tail_tag, sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.write_tag, sdb);
  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.write_tag.c_str(), sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.fake_tag, sdb);
  SQL_BIND_INT(dpp, stmt, index, params->op.obj.state.fake_tag, sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.shadow_obj, sdb);
  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.shadow_obj.c_str(), sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.has_data, sdb);
  SQL_BIND_INT(dpp, stmt, index, params->op.obj.state.has_data, sdb);

  // --- Versioning ---
  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.is_versioned, sdb);
  SQL_BIND_INT(dpp, stmt, index, params->op.obj.is_versioned, sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.version_num, sdb);
  SQL_BIND_INT(dpp, stmt, index, VersionNum, sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.pg_ver, sdb);
  SQL_BIND_INT(dpp, stmt, index, params->op.obj.state.pg_ver, sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.zone_short_id, sdb);
  SQL_BIND_INT(dpp, stmt, index, params->op.obj.state.zone_short_id, sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_version, sdb);
  SQL_BIND_INT(dpp, stmt, index, params->op.obj.state.objv_tracker.read_version.ver, sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_version_tag, sdb);
  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.objv_tracker.read_version.tag.c_str(), sdb);

  // --- Attributes and head/tail layout ---
  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_attrs, sdb);
  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.state.attrset, sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.head_size, sdb);
  SQL_BIND_INT(dpp, stmt, index, params->op.obj.head_size, sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.max_head_size, sdb);
  SQL_BIND_INT(dpp, stmt, index, params->op.obj.max_head_size, sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_id, sdb);
  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.obj_id.c_str(), sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.tail_instance, sdb);
  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.tail_instance.c_str(), sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.head_placement_rule_name, sdb);
  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.head_placement_rule.name.c_str(), sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.head_placement_storage_class, sdb);
  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.head_placement_rule.storage_class.c_str(), sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.tail_placement_rule_name, sdb);
  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.tail_placement.placement_rule.name.c_str(), sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.tail_placement_storage_class, sdb);
  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.tail_placement.placement_rule.storage_class.c_str(), sdb);

  // --- Manifest, omap, multipart and inline head data (all blobs) ---
  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.manifest_part_objs, sdb);
  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.objs, sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.manifest_part_rules, sdb);
  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.rules, sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.omap, sdb);
  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.omap, sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.is_multipart, sdb);
  SQL_BIND_INT(dpp, stmt, index, params->op.obj.is_multipart, sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.mp_parts, sdb);
  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.mp_parts, sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.head_data, sdb);
  SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.head_data, sdb);

out:
  return rc;
}
+
// Execute the prepared object insert. Returns 0 on success (set by
// SQL_EXECUTE), -1 on failure.
int SQLPutObject::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
{
  int ret = -1;

  SQL_EXECUTE(dpp, params, stmt, NULL);
out:
  return ret;
}
+
// Prepare the object-delete statement. Returns 0 on success (set by
// SQL_PREPARE), -1 if no db.
int SQLDeleteObject::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
{
  int ret = -1;
  struct DBOpPrepareParams p_params = PrepareParams;

  if (!*sdb) {
    ldpp_dout(dpp, 0)<<"In SQLDeleteObject - no db" << dendl;
    goto out;
  }

  InitPrepareParams(dpp, p_params, params);
  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareDeleteObject");

out:
  return ret;
}
+
// Bind the delete key: bucket name, object name and instance.
// Returns 0 on success; the macros jump to 'out' with 'rc' set on error.
int SQLDeleteObject::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
{
  int index = -1;
  int rc = 0;
  struct DBOpPrepareParams p_params = PrepareParams;

  // Non-versioned objects are stored with the literal instance "null".
  if (params->op.obj.state.obj.key.instance.empty()) {
    params->op.obj.state.obj.key.instance = "null";
  }

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb);
  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_name, sdb);
  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.name.c_str(), sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_instance, sdb);
  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.instance.c_str(), sdb);
out:
  return rc;
}
+
// Execute the prepared object delete. Returns 0 on success (set by
// SQL_EXECUTE), -1 on failure.
int SQLDeleteObject::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
{
  int ret = -1;

  SQL_EXECUTE(dpp, params, stmt, NULL);
out:
  return ret;
}
+
// Prepare the single-object lookup statement. Returns 0 on success
// (set by SQL_PREPARE), -1 if no db.
int SQLGetObject::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
{
  int ret = -1;
  struct DBOpPrepareParams p_params = PrepareParams;

  if (!*sdb) {
    ldpp_dout(dpp, 0)<<"In SQLGetObject - no db" << dendl;
    goto out;
  }

  InitPrepareParams(dpp, p_params, params);
  SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareGetObject");

out:
  return ret;
}
+
// Bind the lookup key: bucket name, object name and instance.
// Returns 0 on success; the macros jump to 'out' with 'rc' set on error.
int SQLGetObject::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
{
  int index = -1;
  int rc = 0;
  struct DBOpPrepareParams p_params = PrepareParams;

  // Non-versioned objects are stored with the literal instance "null".
  if (params->op.obj.state.obj.key.instance.empty()) {
    params->op.obj.state.obj.key.instance = "null";
  }

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb);
  SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_name, sdb);
  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.name.c_str(), sdb);

  SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_instance, sdb);
  SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.instance.c_str(), sdb);

out:
  return rc;
}
+
// Execute the object lookup; rows are decoded by the list_object
// callback. Returns 0 on success (set by SQL_EXECUTE).
int SQLGetObject::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
{
  int ret = -1;

  SQL_EXECUTE(dpp, params, stmt, list_object);
out:
  return ret;
}
+
// Prepare one of four object-update statements, selected by
// params->op.query_str: "omap", "attrs", "meta" or "mp". Each variant
// is cached in its own member statement handle. Returns 0 on success.
int SQLUpdateObject::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
{
  int ret = -1;
  struct DBOpPrepareParams p_params = PrepareParams;
  // NOTE(review): 'copy' and 'bucket_name' are unused here — presumably
  // leftovers; confirm before removing.
  struct DBOpParams copy = *params;
  string bucket_name;

  if (!*sdb) {
    ldpp_dout(dpp, 0)<<"In SQLUpdateObject - no db" << dendl;
    goto out;
  }

  InitPrepareParams(dpp, p_params, params);

  if (params->op.query_str == "omap") {
    SQL_PREPARE(dpp, p_params, sdb, omap_stmt, ret, "PrepareUpdateObject");
  } else if (params->op.query_str == "attrs") {
    SQL_PREPARE(dpp, p_params, sdb, attrs_stmt, ret, "PrepareUpdateObject");
  } else if (params->op.query_str == "meta") {
    SQL_PREPARE(dpp, p_params, sdb, meta_stmt, ret, "PrepareUpdateObject");
  } else if (params->op.query_str == "mp") {
    SQL_PREPARE(dpp, p_params, sdb, mp_stmt, ret, "PrepareUpdateObject");
  } else {
    ldpp_dout(dpp, 0)<<"In SQLUpdateObject invalid query_str:" <<
      params->op.query_str << dendl;
    goto out;
  }

out:
  return ret;
}
+
// Bind parameters for the object-update variant selected by
// params->op.query_str ("omap", "attrs", "meta" or "mp"). The object
// key and mtime are bound for every variant; the variant-specific
// columns follow. The SQL_* macros jump to 'out' with 'rc' set on
// error. Returns 0 on success.
int SQLUpdateObject::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
{
  int index = -1;
  int rc = 0;
  struct DBOpPrepareParams p_params = PrepareParams;
  sqlite3_stmt** stmt = NULL; // Prepared statement

  /* All below fields for attrs */
  // Pick the statement handle matching the query variant.
  if (params->op.query_str == "omap") {
    stmt = &omap_stmt;
  } else if (params->op.query_str == "attrs") {
    stmt = &attrs_stmt;
  } else if (params->op.query_str == "meta") {
    stmt = &meta_stmt;
  } else if (params->op.query_str == "mp") {
    stmt = &mp_stmt;
  } else {
    ldpp_dout(dpp, 0)<<"In SQLUpdateObject invalid query_str:" <<
      params->op.query_str << dendl;
    goto out;
  }

  // Non-versioned objects are stored with the literal instance "null".
  if (params->op.obj.state.obj.key.instance.empty()) {
    params->op.obj.state.obj.key.instance = "null";
  }

  // --- Key + mtime: common to all variants ---
  SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.bucket.bucket_name, sdb);
  SQL_BIND_TEXT(dpp, *stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb);

  SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.obj_name, sdb);
  SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.state.obj.key.name.c_str(), sdb);

  SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.obj_instance, sdb);
  SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.state.obj.key.instance.c_str(), sdb);

  SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.mtime, sdb);
  SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.state.mtime, sdb);

  // --- Variant-specific columns ---
  if (params->op.query_str == "omap") {
    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.omap, sdb);
    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.omap, sdb);
  }
  if (params->op.query_str == "attrs") {
    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.obj_attrs, sdb);
    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.state.attrset, sdb);
  }
  if (params->op.query_str == "mp") {
    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.mp_parts, sdb);
    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.mp_parts, sdb);
  }
  if (params->op.query_str == "meta") {
    // "meta" rewrites the full metadata column set (mirrors
    // SQLPutObject::Bind, minus mtime which is already bound above).
    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.obj_ns, sdb);
    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.state.obj.key.ns.c_str(), sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.acls, sdb);
    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.acls, sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.index_ver, sdb);
    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.index_ver, sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.tag, sdb);
    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.tag.c_str(), sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.flags, sdb);
    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.flags, sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.versioned_epoch, sdb);
    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.versioned_epoch, sdb);

    // Category enum stored as its integer value.
    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.obj_category, sdb);
    SQL_BIND_INT(dpp, *stmt, index, (uint8_t)(params->op.obj.category), sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.etag, sdb);
    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.etag.c_str(), sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.owner, sdb);
    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.owner.c_str(), sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.owner_display_name, sdb);
    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.owner_display_name.c_str(), sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.storage_class, sdb);
    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.storage_class.c_str(), sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.appendable, sdb);
    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.appendable, sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.content_type, sdb);
    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.content_type.c_str(), sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.index_hash_source, sdb);
    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.state.obj.index_hash_source.c_str(), sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.obj_size, sdb);
    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.state.size, sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.accounted_size, sdb);
    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.state.accounted_size, sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.epoch, sdb);
    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.state.epoch, sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.obj_tag, sdb);
    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.state.obj_tag, sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.tail_tag, sdb);
    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.state.tail_tag, sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.write_tag, sdb);
    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.state.write_tag.c_str(), sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.fake_tag, sdb);
    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.state.fake_tag, sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.shadow_obj, sdb);
    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.state.shadow_obj.c_str(), sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.has_data, sdb);
    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.state.has_data, sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.is_versioned, sdb);
    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.is_versioned, sdb);

    // Unlike insert (which uses 0), update rebinds the caller-supplied
    // version number.
    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.version_num, sdb);
    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.version_num, sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.pg_ver, sdb);
    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.state.pg_ver, sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.zone_short_id, sdb);
    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.state.zone_short_id, sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.obj_version, sdb);
    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.state.objv_tracker.read_version.ver, sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.obj_version_tag, sdb);
    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.state.objv_tracker.read_version.tag.c_str(), sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.obj_attrs, sdb);
    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.state.attrset, sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.head_size, sdb);
    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.head_size, sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.max_head_size, sdb);
    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.max_head_size, sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.obj_id, sdb);
    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.obj_id.c_str(), sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.tail_instance, sdb);
    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.tail_instance.c_str(), sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.head_placement_rule_name, sdb);
    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.head_placement_rule.name.c_str(), sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.head_placement_storage_class, sdb);
    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.head_placement_rule.storage_class.c_str(), sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.tail_placement_rule_name, sdb);
    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.tail_placement.placement_rule.name.c_str(), sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.tail_placement_storage_class, sdb);
    SQL_BIND_TEXT(dpp, *stmt, index, params->op.obj.tail_placement.placement_rule.storage_class.c_str(), sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.manifest_part_objs, sdb);
    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.objs, sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.manifest_part_rules, sdb);
    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.rules, sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.omap, sdb);
    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.omap, sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.is_multipart, sdb);
    SQL_BIND_INT(dpp, *stmt, index, params->op.obj.is_multipart, sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.mp_parts, sdb);
    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.mp_parts, sdb);

    SQL_BIND_INDEX(dpp, *stmt, index, p_params.op.obj.head_data, sdb);
    SQL_ENCODE_BLOB_PARAM(dpp, *stmt, index, params->op.obj.head_data, sdb);
  }

out:
  return rc;
}
+
+// Execute the UpdateObject prepared statement selected by
+// params->op.query_str ("omap" | "attrs" | "meta" | "mp").
+// Any other query_str is logged and rejected (returns -1).
+int SQLUpdateObject::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+ int ret = -1;
+ sqlite3_stmt** stmt = NULL; // Prepared statement
+
+ if (params->op.query_str == "omap") {
+ stmt = &omap_stmt;
+ } else if (params->op.query_str == "attrs") {
+ stmt = &attrs_stmt;
+ } else if (params->op.query_str == "meta") {
+ stmt = &meta_stmt;
+ } else if (params->op.query_str == "mp") {
+ stmt = &mp_stmt;
+ } else {
+ ldpp_dout(dpp, 0)<<"In SQLUpdateObject invalid query_str:" <<
+ params->op.query_str << dendl;
+ goto out;
+ }
+
+ SQL_EXECUTE(dpp, params, *stmt, NULL);
+out:
+ return ret;
+}
+
+// Prepare the ListBucketObjects statement; fails (-1) if there is no
+// open db handle. SQL_PREPARE presumably jumps to out on error.
+int SQLListBucketObjects::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+ int ret = -1;
+ struct DBOpPrepareParams p_params = PrepareParams;
+
+ if (!*sdb) {
+ ldpp_dout(dpp, 0)<<"In SQLListBucketObjects - no db" << dendl;
+ goto out;
+ }
+
+ InitPrepareParams(dpp, p_params, params);
+
+ SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareListBucketObjects");
+
+out:
+ return ret;
+}
+
+// Bind bucket name, list marker, prefix and max row count.
+// An empty object-key instance is normalized to the literal "null".
+int SQLListBucketObjects::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+ int index = -1;
+ int rc = 0;
+ struct DBOpPrepareParams p_params = PrepareParams;
+
+ if (params->op.obj.state.obj.key.instance.empty()) {
+ params->op.obj.state.obj.key.instance = "null";
+ }
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb);
+ SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb);
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.min_marker, sdb);
+ SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.min_marker.c_str(), sdb);
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.prefix, sdb);
+ SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.prefix.c_str(), sdb);
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.list_max_count, sdb);
+ SQL_BIND_INT(dpp, stmt, index, params->op.list_max_count, sdb);
+
+out:
+ return rc;
+}
+
+// Run the prepared list query with the list_object row callback.
+int SQLListBucketObjects::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+ int ret = -1;
+
+ SQL_EXECUTE(dpp, params, stmt, list_object);
+out:
+ return ret;
+}
+
+// Prepare the ListVersionedObjects statement; fails (-1) without a db handle.
+int SQLListVersionedObjects::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+ int ret = -1;
+ struct DBOpPrepareParams p_params = PrepareParams;
+
+ if (!*sdb) {
+ ldpp_dout(dpp, 0)<<"In SQLListVersionedObjects - no db" << dendl;
+ goto out;
+ }
+
+ InitPrepareParams(dpp, p_params, params);
+
+ SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareListVersionedObjects");
+
+out:
+ return ret;
+}
+
+// Bind bucket name, object name and max row count for the versions query.
+// An empty object-key instance is normalized to the literal "null".
+int SQLListVersionedObjects::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+ int index = -1;
+ int rc = 0;
+ struct DBOpPrepareParams p_params = PrepareParams;
+
+ if (params->op.obj.state.obj.key.instance.empty()) {
+ params->op.obj.state.obj.key.instance = "null";
+ }
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb);
+ SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb);
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_name, sdb);
+ SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.name.c_str(), sdb);
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.list_max_count, sdb);
+ SQL_BIND_INT(dpp, stmt, index, params->op.list_max_count, sdb);
+
+out:
+ return rc;
+}
+
+// Run the prepared versions query with the list_object row callback.
+int SQLListVersionedObjects::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+ int ret = -1;
+
+ SQL_EXECUTE(dpp, params, stmt, list_object);
+out:
+ return ret;
+}
+
+// Prepare the PutObjectData statement; fails (-1) without a db handle.
+int SQLPutObjectData::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+ int ret = -1;
+ struct DBOpPrepareParams p_params = PrepareParams;
+
+ if (!*sdb) {
+ ldpp_dout(dpp, 0)<<"In SQLPutObjectData - no db" << dendl;
+ goto out;
+ }
+
+ InitPrepareParams(dpp, p_params, params);
+
+ SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PreparePutObjectData");
+
+out:
+ return ret;
+}
+
+// Bind one object-data chunk: object key (name/instance/ns), bucket,
+// obj_id, part number, offset, raw data blob, size, multipart part
+// string and mtime. Empty key instance is normalized to "null".
+int SQLPutObjectData::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+ int index = -1;
+ int rc = 0;
+ struct DBOpPrepareParams p_params = PrepareParams;
+
+ if (params->op.obj.state.obj.key.instance.empty()) {
+ params->op.obj.state.obj.key.instance = "null";
+ }
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_name, sdb);
+
+ SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.name.c_str(), sdb);
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_instance, sdb);
+
+ SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.instance.c_str(), sdb);
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_ns, sdb);
+
+ SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.ns.c_str(), sdb);
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb);
+
+ SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb);
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_id, sdb);
+ SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.obj_id.c_str(), sdb);
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj_data.part_num, sdb);
+
+ SQL_BIND_INT(dpp, stmt, index, params->op.obj_data.part_num, sdb);
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj_data.offset, sdb);
+
+ SQL_BIND_INT(dpp, stmt, index, params->op.obj_data.offset, sdb);
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj_data.data, sdb);
+ SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj_data.data, sdb);
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj_data.size, sdb);
+
+ SQL_BIND_INT(dpp, stmt, index, params->op.obj_data.size, sdb);
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj_data.multipart_part_str, sdb);
+
+ SQL_BIND_TEXT(dpp, stmt, index, params->op.obj_data.multipart_part_str.c_str(), sdb);
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.mtime, sdb);
+ SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.state.mtime, sdb);
+
+out:
+ return rc;
+}
+
+// Run the prepared insert; no row callback needed.
+int SQLPutObjectData::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+ int ret = -1;
+
+ SQL_EXECUTE(dpp, params, stmt, NULL);
+out:
+ return ret;
+}
+
+// Prepare the UpdateObjectData statement; fails (-1) without a db handle.
+int SQLUpdateObjectData::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+ int ret = -1;
+ struct DBOpPrepareParams p_params = PrepareParams;
+
+ if (!*sdb) {
+ ldpp_dout(dpp, 0)<<"In SQLUpdateObjectData - no db" << dendl;
+ goto out;
+ }
+
+ InitPrepareParams(dpp, p_params, params);
+ SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareUpdateObjectData");
+
+out:
+ return ret;
+}
+
+// Bind the object key (name/instance), bucket, obj_id and new mtime.
+// Empty key instance is normalized to the literal "null".
+int SQLUpdateObjectData::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+ int index = -1;
+ int rc = 0;
+ struct DBOpPrepareParams p_params = PrepareParams;
+
+ if (params->op.obj.state.obj.key.instance.empty()) {
+ params->op.obj.state.obj.key.instance = "null";
+ }
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_name, sdb);
+ SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.name.c_str(), sdb);
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_instance, sdb);
+ SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.instance.c_str(), sdb);
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb);
+ SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb);
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_id, sdb);
+ SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.obj_id.c_str(), sdb);
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.mtime, sdb);
+ SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.state.mtime, sdb);
+
+out:
+ return rc;
+}
+
+// Run the prepared update; no row callback needed.
+int SQLUpdateObjectData::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+ int ret = -1;
+
+ SQL_EXECUTE(dpp, params, stmt, NULL);
+out:
+ return ret;
+}
+
+// Prepare the GetObjectData statement; fails (-1) without a db handle.
+int SQLGetObjectData::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+ int ret = -1;
+ struct DBOpPrepareParams p_params = PrepareParams;
+
+ if (!*sdb) {
+ ldpp_dout(dpp, 0)<<"In SQLGetObjectData - no db" << dendl;
+ goto out;
+ }
+
+ InitPrepareParams(dpp, p_params, params);
+ SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareGetObjectData");
+
+out:
+ return ret;
+}
+
+// Bind bucket, object key (name/instance) and obj_id for the read.
+// Empty key instance is normalized to the literal "null".
+int SQLGetObjectData::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+ int index = -1;
+ int rc = 0;
+ struct DBOpPrepareParams p_params = PrepareParams;
+
+ if (params->op.obj.state.obj.key.instance.empty()) {
+ params->op.obj.state.obj.key.instance = "null";
+ }
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb);
+ SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb);
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_name, sdb);
+ SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.name.c_str(), sdb);
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_instance, sdb);
+ SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.instance.c_str(), sdb);
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_id, sdb);
+ SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.obj_id.c_str(), sdb);
+
+out:
+ return rc;
+}
+
+// Run the prepared select with the get_objectdata row callback.
+int SQLGetObjectData::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+ int ret = -1;
+
+ SQL_EXECUTE(dpp, params, stmt, get_objectdata);
+out:
+ return ret;
+}
+
+// Prepare the DeleteObjectData statement; fails (-1) without a db handle.
+int SQLDeleteObjectData::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+ int ret = -1;
+ struct DBOpPrepareParams p_params = PrepareParams;
+
+ if (!*sdb) {
+ ldpp_dout(dpp, 0)<<"In SQLDeleteObjectData - no db" << dendl;
+ goto out;
+ }
+
+ InitPrepareParams(dpp, p_params, params);
+ SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareDeleteObjectData");
+
+out:
+ return ret;
+}
+
+// Bind bucket, object key (name/instance) and obj_id for the delete.
+// Empty key instance is normalized to the literal "null".
+int SQLDeleteObjectData::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+ int index = -1;
+ int rc = 0;
+ struct DBOpPrepareParams p_params = PrepareParams;
+
+ if (params->op.obj.state.obj.key.instance.empty()) {
+ params->op.obj.state.obj.key.instance = "null";
+ }
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.bucket.bucket_name, sdb);
+ SQL_BIND_TEXT(dpp, stmt, index, params->op.bucket.info.bucket.name.c_str(), sdb);
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_name, sdb);
+ SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.name.c_str(), sdb);
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_instance, sdb);
+ SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.state.obj.key.instance.c_str(), sdb);
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.obj_id, sdb);
+ SQL_BIND_TEXT(dpp, stmt, index, params->op.obj.obj_id.c_str(), sdb);
+
+out:
+ return rc;
+}
+
+// Run the prepared delete; no row callback needed.
+int SQLDeleteObjectData::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+ int ret = -1;
+
+ SQL_EXECUTE(dpp, params, stmt, NULL);
+out:
+ return ret;
+}
+
+// Prepare the DeleteStaleObjectData statement; fails (-1) without a db handle.
+int SQLDeleteStaleObjectData::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+ int ret = -1;
+ struct DBOpPrepareParams p_params = PrepareParams;
+
+ if (!*sdb) {
+ ldpp_dout(dpp, 0)<<"In SQLDeleteStaleObjectData - no db" << dendl;
+ goto out;
+ }
+
+ InitPrepareParams(dpp, p_params, params);
+ SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareDeleteStaleObjectData");
+
+out:
+ return ret;
+}
+
+// Bind only the mtime cutoff; rows older than it are candidates for deletion
+// (exact predicate lives in the prepared SQL, not visible here).
+int SQLDeleteStaleObjectData::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+ int index = -1;
+ int rc = 0;
+ struct DBOpPrepareParams p_params = PrepareParams;
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.obj.mtime, sdb);
+ SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, params->op.obj.state.mtime, sdb);
+
+out:
+ return rc;
+}
+
+// Run the prepared delete; no row callback needed.
+int SQLDeleteStaleObjectData::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+ int ret = -1;
+
+ SQL_EXECUTE(dpp, params, stmt, NULL);
+out:
+ return ret;
+}
+
+// Prepare the InsertLCEntry (lifecycle entry) statement.
+int SQLInsertLCEntry::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+ int ret = -1;
+ struct DBOpPrepareParams p_params = PrepareParams;
+
+ if (!*sdb) {
+ ldpp_dout(dpp, 0)<<"In SQLInsertLCEntry - no db" << dendl;
+ goto out;
+ }
+
+ InitPrepareParams(dpp, p_params, params);
+
+ SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareInsertLCEntry");
+
+out:
+ return ret;
+}
+
+// Bind LC entry fields: index, bucket name, status and start time.
+int SQLInsertLCEntry::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+ int index = -1;
+ int rc = 0;
+ struct DBOpPrepareParams p_params = PrepareParams;
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_entry.index, sdb);
+ SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_entry.index.c_str(), sdb);
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_entry.bucket_name, sdb);
+ SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_entry.entry.get_bucket().c_str(), sdb);
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_entry.status, sdb);
+ SQL_BIND_INT(dpp, stmt, index, params->op.lc_entry.entry.get_status(), sdb);
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_entry.start_time, sdb);
+ SQL_BIND_INT(dpp, stmt, index, params->op.lc_entry.entry.get_start_time(), sdb);
+
+out:
+ return rc;
+}
+
+// Run the prepared insert; no row callback needed.
+int SQLInsertLCEntry::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+ int ret = -1;
+
+ SQL_EXECUTE(dpp, params, stmt, NULL);
+out:
+ return ret;
+}
+
+// Prepare the RemoveLCEntry statement; fails (-1) without a db handle.
+int SQLRemoveLCEntry::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+ int ret = -1;
+ struct DBOpPrepareParams p_params = PrepareParams;
+
+ if (!*sdb) {
+ ldpp_dout(dpp, 0)<<"In SQLRemoveLCEntry - no db" << dendl;
+ goto out;
+ }
+
+ InitPrepareParams(dpp, p_params, params);
+
+ SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareRemoveLCEntry");
+
+out:
+ return ret;
+}
+
+// Bind the LC entry key: index plus bucket name.
+int SQLRemoveLCEntry::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+ int index = -1;
+ int rc = 0;
+ struct DBOpPrepareParams p_params = PrepareParams;
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_entry.index, sdb);
+ SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_entry.index.c_str(), sdb);
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_entry.bucket_name, sdb);
+ SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_entry.entry.get_bucket().c_str(), sdb);
+
+out:
+ return rc;
+}
+
+// Run the prepared delete; no row callback needed.
+int SQLRemoveLCEntry::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+ int ret = -1;
+
+ SQL_EXECUTE(dpp, params, stmt, NULL);
+out:
+ return ret;
+}
+
+// Prepare one of two LC-entry lookups: query_str "get_next_entry"
+// selects next_stmt (iterate past a marker), anything else selects the
+// exact-match stmt. Both use the same "PrepareGetLCEntry" params key.
+int SQLGetLCEntry::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+ int ret = -1;
+ sqlite3_stmt** pstmt = NULL; // Prepared statement
+ struct DBOpPrepareParams p_params = PrepareParams;
+
+ if (!*sdb) {
+ ldpp_dout(dpp, 0)<<"In SQLGetLCEntry - no db" << dendl;
+ goto out;
+ }
+
+ InitPrepareParams(dpp, p_params, params);
+
+ if (params->op.query_str == "get_next_entry") {
+ pstmt = &next_stmt;
+ } else {
+ pstmt = &stmt;
+ }
+ SQL_PREPARE(dpp, p_params, sdb, *pstmt, ret, "PrepareGetLCEntry");
+
+out:
+ return ret;
+}
+
+// Bind index and bucket name on whichever statement Prepare selected;
+// the same query_str dispatch must be repeated here.
+int SQLGetLCEntry::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+ int index = -1;
+ int rc = 0;
+ struct DBOpPrepareParams p_params = PrepareParams;
+ sqlite3_stmt** pstmt = NULL; // Prepared statement
+
+ if (params->op.query_str == "get_next_entry") {
+ pstmt = &next_stmt;
+ } else {
+ pstmt = &stmt;
+ }
+ SQL_BIND_INDEX(dpp, *pstmt, index, p_params.op.lc_entry.index, sdb);
+ SQL_BIND_TEXT(dpp, *pstmt, index, params->op.lc_entry.index.c_str(), sdb);
+
+ SQL_BIND_INDEX(dpp, *pstmt, index, p_params.op.lc_entry.bucket_name, sdb);
+ SQL_BIND_TEXT(dpp, *pstmt, index, params->op.lc_entry.entry.get_bucket().c_str(), sdb);
+
+out:
+ return rc;
+}
+
+// Run the selected statement with the list_lc_entry row callback.
+int SQLGetLCEntry::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+ int ret = -1;
+ sqlite3_stmt** pstmt = NULL; // Prepared statement
+
+ if (params->op.query_str == "get_next_entry") {
+ pstmt = &next_stmt;
+ } else {
+ pstmt = &stmt;
+ }
+
+ SQL_EXECUTE(dpp, params, *pstmt, list_lc_entry);
+out:
+ return ret;
+}
+
+// Prepare the ListLCEntries statement; fails (-1) without a db handle.
+int SQLListLCEntries::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+ int ret = -1;
+ struct DBOpPrepareParams p_params = PrepareParams;
+
+ if (!*sdb) {
+ ldpp_dout(dpp, 0)<<"In SQLListLCEntries - no db" << dendl;
+ goto out;
+ }
+
+ InitPrepareParams(dpp, p_params, params);
+
+ SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareListLCEntries");
+
+out:
+ return ret;
+}
+
+// Bind LC index, pagination marker and max row count.
+int SQLListLCEntries::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+ int index = -1;
+ int rc = 0;
+ struct DBOpPrepareParams p_params = PrepareParams;
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_entry.index, sdb);
+ SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_entry.index.c_str(), sdb);
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_entry.min_marker, sdb);
+ SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_entry.min_marker.c_str(), sdb);
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.list_max_count, sdb);
+ SQL_BIND_INT(dpp, stmt, index, params->op.list_max_count, sdb);
+
+out:
+ return rc;
+}
+
+// Run the prepared list query with the list_lc_entry row callback.
+int SQLListLCEntries::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+ int ret = -1;
+
+ SQL_EXECUTE(dpp, params, stmt, list_lc_entry);
+out:
+ return ret;
+}
+
+// Prepare the InsertLCHead (lifecycle head) statement.
+int SQLInsertLCHead::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+ int ret = -1;
+ struct DBOpPrepareParams p_params = PrepareParams;
+
+ if (!*sdb) {
+ ldpp_dout(dpp, 0)<<"In SQLInsertLCHead - no db" << dendl;
+ goto out;
+ }
+
+ InitPrepareParams(dpp, p_params, params);
+
+ SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareInsertLCHead");
+
+out:
+ return ret;
+}
+
+// Bind LC head fields: index, marker and start_date.
+// NOTE(review): start_date is blob-encoded via an int64_t cast while
+// other numeric columns use SQL_BIND_INT — presumably the column is a
+// blob to preserve width; confirm against the schema.
+int SQLInsertLCHead::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+ int index = -1;
+ int rc = 0;
+ struct DBOpPrepareParams p_params = PrepareParams;
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_head.index, sdb);
+ SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_head.index.c_str(), sdb);
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_head.marker, sdb);
+ SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_head.head.get_marker().c_str(), sdb);
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_head.start_date, sdb);
+ SQL_ENCODE_BLOB_PARAM(dpp, stmt, index, static_cast<int64_t>(params->op.lc_head.head.start_date), sdb);
+
+out:
+ return rc;
+}
+
+// Run the prepared insert; no row callback needed.
+int SQLInsertLCHead::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+ int ret = -1;
+
+ SQL_EXECUTE(dpp, params, stmt, NULL);
+out:
+ return ret;
+}
+
+// Prepare the RemoveLCHead statement; fails (-1) without a db handle.
+int SQLRemoveLCHead::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+ int ret = -1;
+ struct DBOpPrepareParams p_params = PrepareParams;
+
+ if (!*sdb) {
+ ldpp_dout(dpp, 0)<<"In SQLRemoveLCHead - no db" << dendl;
+ goto out;
+ }
+
+ InitPrepareParams(dpp, p_params, params);
+
+ SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareRemoveLCHead");
+
+out:
+ return ret;
+}
+
+// Bind only the LC head index (the row key).
+int SQLRemoveLCHead::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+ int index = -1;
+ int rc = 0;
+ struct DBOpPrepareParams p_params = PrepareParams;
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_head.index, sdb);
+ SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_head.index.c_str(), sdb);
+
+out:
+ return rc;
+}
+
+// Run the prepared delete; no row callback needed.
+int SQLRemoveLCHead::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+ int ret = -1;
+
+ SQL_EXECUTE(dpp, params, stmt, NULL);
+out:
+ return ret;
+}
+
+// Prepare the GetLCHead statement; fails (-1) without a db handle.
+int SQLGetLCHead::Prepare(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+ int ret = -1;
+ struct DBOpPrepareParams p_params = PrepareParams;
+
+ if (!*sdb) {
+ ldpp_dout(dpp, 0)<<"In SQLGetLCHead - no db" << dendl;
+ goto out;
+ }
+
+ InitPrepareParams(dpp, p_params, params);
+
+ SQL_PREPARE(dpp, p_params, sdb, stmt, ret, "PrepareGetLCHead");
+
+out:
+ return ret;
+}
+
+// Bind only the LC head index (the row key).
+int SQLGetLCHead::Bind(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+ int index = -1;
+ int rc = 0;
+ struct DBOpPrepareParams p_params = PrepareParams;
+
+ SQL_BIND_INDEX(dpp, stmt, index, p_params.op.lc_head.index, sdb);
+ SQL_BIND_TEXT(dpp, stmt, index, params->op.lc_head.index.c_str(), sdb);
+
+out:
+ return rc;
+}
+
+// Run the prepared select with the list_lc_head row callback.
+// The output head is reset first so a miss leaves a default value
+// rather than stale data from a previous call.
+int SQLGetLCHead::Execute(const DoutPrefixProvider *dpp, struct DBOpParams *params)
+{
+ int ret = -1;
+
+ // clear the params before fetching the entry
+ params->op.lc_head.head = {};
+ SQL_EXECUTE(dpp, params, stmt, list_lc_head);
+out:
+ return ret;
+}
diff --git a/src/rgw/driver/dbstore/sqlite/sqliteDB.h b/src/rgw/driver/dbstore/sqlite/sqliteDB.h
new file mode 100644
index 000000000..ec0ef2bb2
--- /dev/null
+++ b/src/rgw/driver/dbstore/sqlite/sqliteDB.h
@@ -0,0 +1,551 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <errno.h>
+#include <stdlib.h>
+#include <string>
+#include <sqlite3.h>
+#include "rgw/driver/dbstore/common/dbstore.h"
+
+using namespace rgw::store;
+
+// SQLite implementation of the dbstore DB backend: owns the sqlite3
+// handle (stored in the base-class 'db' as void*), creates/drops the
+// schema tables and drives prepared-statement execution.
+class SQLiteDB : public DB, virtual public DBOp {
+ private:
+ sqlite3_mutex *mutex = NULL;
+
+ protected:
+ CephContext *cct;
+
+ public:
+ sqlite3_stmt *stmt = NULL;
+ DBOpPrepareParams PrepareParams;
+
+ // Wrap an already-open sqlite3 handle.
+ SQLiteDB(sqlite3 *dbi, std::string db_name, CephContext *_cct) : DB(db_name, _cct), cct(_cct) {
+ db = (void*)dbi;
+ }
+ // Construct without a handle; openDB() establishes the connection later.
+ SQLiteDB(std::string db_name, CephContext *_cct) : DB(db_name, _cct), cct(_cct) {
+ }
+ ~SQLiteDB() {}
+
+ // NOTE(review): SQLITE_LIMIT_LENGTH is sqlite's limit *category id*
+ // (value 0 in sqlite3.h), not a byte count; a true limit would come
+ // from sqlite3_limit(db, SQLITE_LIMIT_LENGTH, -1). Confirm callers'
+ // expectations before changing.
+ uint64_t get_blob_limit() override { return SQLITE_LIMIT_LENGTH; }
+ void *openDB(const DoutPrefixProvider *dpp) override;
+ int closeDB(const DoutPrefixProvider *dpp) override;
+ int InitializeDBOps(const DoutPrefixProvider *dpp) override;
+
+ int InitPrepareParams(const DoutPrefixProvider *dpp, DBOpPrepareParams &p_params,
+ DBOpParams* params) override;
+
+ // Run raw SQL text (schema DDL) with an optional sqlite3_exec callback.
+ int exec(const DoutPrefixProvider *dpp, const char *schema,
+ int (*callback)(void*,int,char**,char**));
+ // Step a prepared statement, invoking cbk as rows are produced.
+ int Step(const DoutPrefixProvider *dpp, DBOpInfo &op, sqlite3_stmt *stmt,
+ int (*cbk)(const DoutPrefixProvider *dpp, DBOpInfo &op, sqlite3_stmt *stmt));
+ // Reset a prepared statement so it can be re-bound and re-executed.
+ int Reset(const DoutPrefixProvider *dpp, sqlite3_stmt *stmt);
+ /* default value matches with sqliteDB style */
+
+ int createTables(const DoutPrefixProvider *dpp) override;
+ int createBucketTable(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int createUserTable(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int createObjectTable(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int createObjectDataTable(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int createObjectView(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int createObjectTableTrigger(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int createQuotaTable(const DoutPrefixProvider *dpp, DBOpParams *params);
+ void populate_object_params(const DoutPrefixProvider *dpp,
+ struct DBOpPrepareParams& p_params,
+ struct DBOpParams* params, bool data);
+
+ int createLCTables(const DoutPrefixProvider *dpp) override;
+
+ int DeleteBucketTable(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int DeleteUserTable(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int DeleteObjectTable(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int DeleteObjectDataTable(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int DeleteQuotaTable(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int DeleteLCEntryTable(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int DeleteLCHeadTable(const DoutPrefixProvider *dpp, DBOpParams *params);
+
+ int ListAllBuckets(const DoutPrefixProvider *dpp, DBOpParams *params) override;
+ int ListAllUsers(const DoutPrefixProvider *dpp, DBOpParams *params) override;
+ int ListAllObjects(const DoutPrefixProvider *dpp, DBOpParams *params) override;
+};
+
+// Per-object op container for SQLite; holds a non-owning pointer to the
+// shared sqlite3 handle so ops see reconnects made by the owner.
+class SQLObjectOp : public ObjectOp {
+ private:
+ sqlite3 **sdb = NULL;
+ CephContext *cct;
+
+ public:
+ SQLObjectOp(sqlite3 **sdbi, CephContext *_cct) : sdb(sdbi), cct(_cct) {};
+ ~SQLObjectOp() {}
+
+ int InitializeObjectOps(std::string db_name, const DoutPrefixProvider *dpp);
+};
+
+// InsertUser op: owns one prepared statement, finalized on destruction.
+class SQLInsertUser : public SQLiteDB, public InsertUserOp {
+ private:
+ sqlite3 **sdb = NULL;
+ sqlite3_stmt *stmt = NULL; // Prepared statement
+
+ public:
+ SQLInsertUser(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+ ~SQLInsertUser() {
+ if (stmt)
+ sqlite3_finalize(stmt);
+ }
+ int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+// RemoveUser op: owns one prepared statement, finalized on destruction.
+class SQLRemoveUser : public SQLiteDB, public RemoveUserOp {
+ private:
+ sqlite3 **sdb = NULL;
+ sqlite3_stmt *stmt = NULL; // Prepared statement
+
+ public:
+ SQLRemoveUser(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+ ~SQLRemoveUser() {
+ if (stmt)
+ sqlite3_finalize(stmt);
+ }
+ int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+// GetUser op: one prepared statement per supported lookup key
+// (default/user, email, access key, user id); all finalized in dtor.
+class SQLGetUser : public SQLiteDB, public GetUserOp {
+ private:
+ sqlite3 **sdb = NULL;
+ sqlite3_stmt *stmt = NULL; // Prepared statement
+ sqlite3_stmt *email_stmt = NULL; // Prepared statement to query by useremail
+ sqlite3_stmt *ak_stmt = NULL; // Prepared statement to query by access_key_id
+ sqlite3_stmt *userid_stmt = NULL; // Prepared statement to query by user_id
+
+ public:
+ SQLGetUser(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+ ~SQLGetUser() {
+ if (stmt)
+ sqlite3_finalize(stmt);
+ if (email_stmt)
+ sqlite3_finalize(email_stmt);
+ if (ak_stmt)
+ sqlite3_finalize(ak_stmt);
+ if (userid_stmt)
+ sqlite3_finalize(userid_stmt);
+ }
+ int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+// InsertBucket op: owns one prepared statement, finalized on destruction.
+class SQLInsertBucket : public SQLiteDB, public InsertBucketOp {
+ private:
+ sqlite3 **sdb = NULL;
+ sqlite3_stmt *stmt = NULL; // Prepared statement
+
+ public:
+ SQLInsertBucket(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+ ~SQLInsertBucket() {
+ if (stmt)
+ sqlite3_finalize(stmt);
+ }
+ int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+// UpdateBucket op: separate prepared statements for the info, attrs and
+// owner update variants; all finalized in the destructor.
+class SQLUpdateBucket : public SQLiteDB, public UpdateBucketOp {
+ private:
+ sqlite3 **sdb = NULL;
+ sqlite3_stmt *info_stmt = NULL; // Prepared statement
+ sqlite3_stmt *attrs_stmt = NULL; // Prepared statement
+ sqlite3_stmt *owner_stmt = NULL; // Prepared statement
+
+ public:
+ SQLUpdateBucket(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+ ~SQLUpdateBucket() {
+ if (info_stmt)
+ sqlite3_finalize(info_stmt);
+ if (attrs_stmt)
+ sqlite3_finalize(attrs_stmt);
+ if (owner_stmt)
+ sqlite3_finalize(owner_stmt);
+ }
+ int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+// RemoveBucket op: owns one prepared statement, finalized on destruction.
+class SQLRemoveBucket : public SQLiteDB, public RemoveBucketOp {
+ private:
+ sqlite3 **sdb = NULL;
+ sqlite3_stmt *stmt = NULL; // Prepared statement
+
+ public:
+ SQLRemoveBucket(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+ ~SQLRemoveBucket() {
+ if (stmt)
+ sqlite3_finalize(stmt);
+ }
+ int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+// GetBucket op: owns one prepared statement, finalized on destruction.
+class SQLGetBucket : public SQLiteDB, public GetBucketOp {
+ private:
+ sqlite3 **sdb = NULL;
+ sqlite3_stmt *stmt = NULL; // Prepared statement
+
+ public:
+ SQLGetBucket(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+ ~SQLGetBucket() {
+ if (stmt)
+ sqlite3_finalize(stmt);
+ }
+ int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+// ListUserBuckets op: one statement for per-user listing and one
+// (all_stmt) for listing every bucket; both finalized in the dtor.
+class SQLListUserBuckets : public SQLiteDB, public ListUserBucketsOp {
+ private:
+ sqlite3 **sdb = NULL;
+ sqlite3_stmt *stmt = NULL; // Prepared statement
+ sqlite3_stmt *all_stmt = NULL; // Prepared statement
+
+ public:
+ SQLListUserBuckets(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+ ~SQLListUserBuckets() {
+ if (stmt)
+ sqlite3_finalize(stmt);
+ if (all_stmt)
+ sqlite3_finalize(all_stmt);
+ }
+ int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+// PutObject op: owns one prepared statement, finalized on destruction.
+// Second constructor accepts a typed sqlite3** directly.
+class SQLPutObject : public SQLiteDB, public PutObjectOp {
+ private:
+ sqlite3 **sdb = NULL;
+ sqlite3_stmt *stmt = NULL; // Prepared statement
+
+ public:
+ SQLPutObject(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+ SQLPutObject(sqlite3 **sdbi, std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {}
+
+ ~SQLPutObject() {
+ if (stmt)
+ sqlite3_finalize(stmt);
+ }
+ int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+// DeleteObject op: owns one prepared statement, finalized on destruction.
+class SQLDeleteObject : public SQLiteDB, public DeleteObjectOp {
+ private:
+ sqlite3 **sdb = NULL;
+ sqlite3_stmt *stmt = NULL; // Prepared statement
+
+ public:
+ SQLDeleteObject(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+ SQLDeleteObject(sqlite3 **sdbi, std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {}
+
+ ~SQLDeleteObject() {
+ if (stmt)
+ sqlite3_finalize(stmt);
+ }
+ int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+// GetObject op: owns one prepared statement, finalized on destruction.
+class SQLGetObject : public SQLiteDB, public GetObjectOp {
+ private:
+ sqlite3 **sdb = NULL;
+ sqlite3_stmt *stmt = NULL; // Prepared statement
+
+ public:
+ SQLGetObject(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+ SQLGetObject(sqlite3 **sdbi, std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {}
+
+ ~SQLGetObject() {
+ if (stmt)
+ sqlite3_finalize(stmt);
+ }
+ int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+// UpdateObject op: one prepared statement per update variant
+// ("omap" | "attrs" | "meta" | "mp", dispatched on op.query_str).
+class SQLUpdateObject : public SQLiteDB, public UpdateObjectOp {
+ private:
+ sqlite3 **sdb = NULL;
+ sqlite3_stmt *omap_stmt = NULL; // Prepared statement
+ sqlite3_stmt *attrs_stmt = NULL; // Prepared statement
+ sqlite3_stmt *meta_stmt = NULL; // Prepared statement
+ sqlite3_stmt *mp_stmt = NULL; // Prepared statement
+
+ public:
+ SQLUpdateObject(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+ SQLUpdateObject(sqlite3 **sdbi, std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {}
+
+ // Finalize every owned statement. Fix: the original destructor
+ // skipped mp_stmt, leaking the "mp" prepared statement.
+ ~SQLUpdateObject() {
+ if (omap_stmt)
+ sqlite3_finalize(omap_stmt);
+ if (attrs_stmt)
+ sqlite3_finalize(attrs_stmt);
+ if (meta_stmt)
+ sqlite3_finalize(meta_stmt);
+ if (mp_stmt)
+ sqlite3_finalize(mp_stmt);
+ }
+
+ int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+class SQLListBucketObjects : public SQLiteDB, public ListBucketObjectsOp {
+ private:
+ sqlite3 **sdb = NULL;
+ sqlite3_stmt *stmt = NULL; // Prepared statement
+
+ public:
+ SQLListBucketObjects(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+ SQLListBucketObjects(sqlite3 **sdbi, std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {}
+
+ ~SQLListBucketObjects() {
+ if (stmt)
+ sqlite3_finalize(stmt);
+ }
+ int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+class SQLListVersionedObjects : public SQLiteDB, public ListVersionedObjectsOp {
+ private:
+ sqlite3 **sdb = NULL;
+ sqlite3_stmt *stmt = NULL; // Prepared statement
+
+ public:
+ SQLListVersionedObjects(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+ SQLListVersionedObjects(sqlite3 **sdbi, std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {}
+
+ ~SQLListVersionedObjects() {
+ if (stmt)
+ sqlite3_finalize(stmt);
+ }
+ int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+class SQLPutObjectData : public SQLiteDB, public PutObjectDataOp {
+ private:
+ sqlite3 **sdb = NULL;
+ sqlite3_stmt *stmt = NULL; // Prepared statement
+
+ public:
+ SQLPutObjectData(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+ SQLPutObjectData(sqlite3 **sdbi, std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {}
+
+ ~SQLPutObjectData() {
+ if (stmt)
+ sqlite3_finalize(stmt);
+ }
+ int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+class SQLUpdateObjectData : public SQLiteDB, public UpdateObjectDataOp {
+ private:
+ sqlite3 **sdb = NULL;
+ sqlite3_stmt *stmt = NULL; // Prepared statement
+
+ public:
+ SQLUpdateObjectData(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+ SQLUpdateObjectData(sqlite3 **sdbi, std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {}
+
+ ~SQLUpdateObjectData() {
+ if (stmt)
+ sqlite3_finalize(stmt);
+ }
+ int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+class SQLGetObjectData : public SQLiteDB, public GetObjectDataOp {
+ private:
+ sqlite3 **sdb = NULL;
+ sqlite3_stmt *stmt = NULL; // Prepared statement
+
+ public:
+ SQLGetObjectData(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+ SQLGetObjectData(sqlite3 **sdbi, std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {}
+
+ ~SQLGetObjectData() {
+ if (stmt)
+ sqlite3_finalize(stmt);
+ }
+ int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+class SQLDeleteObjectData : public SQLiteDB, public DeleteObjectDataOp {
+ private:
+ sqlite3 **sdb = NULL;
+ sqlite3_stmt *stmt = NULL; // Prepared statement
+
+ public:
+ SQLDeleteObjectData(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+ SQLDeleteObjectData(sqlite3 **sdbi, std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {}
+
+ ~SQLDeleteObjectData() {
+ if (stmt)
+ sqlite3_finalize(stmt);
+ }
+ int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+class SQLDeleteStaleObjectData : public SQLiteDB, public DeleteStaleObjectDataOp {
+ private:
+ sqlite3 **sdb = NULL;
+ sqlite3_stmt *stmt = NULL; // Prepared statement
+
+ public:
+ SQLDeleteStaleObjectData(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+ SQLDeleteStaleObjectData(sqlite3 **sdbi, std::string db_name, CephContext *cct) : SQLiteDB(*sdbi, db_name, cct), sdb(sdbi) {}
+
+ ~SQLDeleteStaleObjectData() {
+ if (stmt)
+ sqlite3_finalize(stmt);
+ }
+ int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+class SQLInsertLCEntry : public SQLiteDB, public InsertLCEntryOp {
+ private:
+ sqlite3 **sdb = NULL;
+ sqlite3_stmt *stmt = NULL; // Prepared statement
+
+ public:
+ SQLInsertLCEntry(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+ ~SQLInsertLCEntry() {
+ if (stmt)
+ sqlite3_finalize(stmt);
+ }
+ int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+class SQLRemoveLCEntry : public SQLiteDB, public RemoveLCEntryOp {
+ private:
+ sqlite3 **sdb = NULL;
+ sqlite3_stmt *stmt = NULL; // Prepared statement
+
+ public:
+ SQLRemoveLCEntry(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+ ~SQLRemoveLCEntry() {
+ if (stmt)
+ sqlite3_finalize(stmt);
+ }
+ int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+class SQLGetLCEntry : public SQLiteDB, public GetLCEntryOp {
+ private:
+ sqlite3 **sdb = NULL;
+ sqlite3_stmt *stmt = NULL; // Prepared statement
+ sqlite3_stmt *next_stmt = NULL; // Prepared statement
+
+ public:
+ SQLGetLCEntry(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+ ~SQLGetLCEntry() {
+ if (stmt)
+ sqlite3_finalize(stmt);
+ if (next_stmt)
+ sqlite3_finalize(next_stmt);
+ }
+ int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+class SQLListLCEntries : public SQLiteDB, public ListLCEntriesOp {
+ private:
+ sqlite3 **sdb = NULL;
+ sqlite3_stmt *stmt = NULL; // Prepared statement
+
+ public:
+ SQLListLCEntries(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+ ~SQLListLCEntries() {
+ if (stmt)
+ sqlite3_finalize(stmt);
+ }
+ int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+class SQLInsertLCHead : public SQLiteDB, public InsertLCHeadOp {
+ private:
+ sqlite3 **sdb = NULL;
+ sqlite3_stmt *stmt = NULL; // Prepared statement
+
+ public:
+ SQLInsertLCHead(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+ ~SQLInsertLCHead() {
+ if (stmt)
+ sqlite3_finalize(stmt);
+ }
+ int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+class SQLRemoveLCHead : public SQLiteDB, public RemoveLCHeadOp {
+ private:
+ sqlite3 **sdb = NULL;
+ sqlite3_stmt *stmt = NULL; // Prepared statement
+
+ public:
+ SQLRemoveLCHead(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+ ~SQLRemoveLCHead() {
+ if (stmt)
+ sqlite3_finalize(stmt);
+ }
+ int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
+
+class SQLGetLCHead : public SQLiteDB, public GetLCHeadOp {
+ private:
+ sqlite3 **sdb = NULL;
+ sqlite3_stmt *stmt = NULL; // Prepared statement
+
+ public:
+ SQLGetLCHead(void **db, std::string db_name, CephContext *cct) : SQLiteDB((sqlite3 *)(*db), db_name, cct), sdb((sqlite3 **)db) {}
+ ~SQLGetLCHead() {
+ if (stmt)
+ sqlite3_finalize(stmt);
+ }
+ int Prepare(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Execute(const DoutPrefixProvider *dpp, DBOpParams *params);
+ int Bind(const DoutPrefixProvider *dpp, DBOpParams *params);
+};
diff --git a/src/rgw/driver/dbstore/sqlite/statement.cc b/src/rgw/driver/dbstore/sqlite/statement.cc
new file mode 100644
index 000000000..dcf7dba9c
--- /dev/null
+++ b/src/rgw/driver/dbstore/sqlite/statement.cc
@@ -0,0 +1,196 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/dout.h"
+#include "error.h"
+#include "statement.h"
+
+#define dout_subsys ceph_subsys_rgw_dbstore
+
+namespace rgw::dbstore::sqlite {
+
+// owning pointer to arbitrary memory allocated and returned by sqlite3
+struct sqlite_deleter {
+ template <typename T>
+ void operator()(T* p) { ::sqlite3_free(p); }
+};
+template <typename T>
+using sqlite_ptr = std::unique_ptr<T, sqlite_deleter>;
+
+
+stmt_ptr prepare_statement(const DoutPrefixProvider* dpp,
+ sqlite3* db, std::string_view sql)
+{
+ sqlite3_stmt* stmt = nullptr;
+ int result = ::sqlite3_prepare_v2(db, sql.data(), sql.size(), &stmt, nullptr);
+ auto ec = std::error_code{result, sqlite::error_category()};
+ if (ec != sqlite::errc::ok) {
+ const char* errmsg = ::sqlite3_errmsg(db);
+ ldpp_dout(dpp, 1) << "preparation failed: " << errmsg
+ << " (" << ec << ")\nstatement: " << sql << dendl;
+ throw sqlite::error(errmsg, ec);
+ }
+ return stmt_ptr{stmt};
+}
+
+static int bind_index(const DoutPrefixProvider* dpp,
+ const stmt_binding& stmt, const char* name)
+{
+ const int index = ::sqlite3_bind_parameter_index(stmt.get(), name);
+ if (index <= 0) {
+ ldpp_dout(dpp, 1) << "binding failed on parameter name="
+ << name << dendl;
+ sqlite3* db = ::sqlite3_db_handle(stmt.get());
+ throw sqlite::error(db);
+ }
+ return index;
+}
+
+void bind_text(const DoutPrefixProvider* dpp, const stmt_binding& stmt,
+ const char* name, std::string_view value)
+{
+ const int index = bind_index(dpp, stmt, name);
+
+ int result = ::sqlite3_bind_text(stmt.get(), index, value.data(),
+ value.size(), SQLITE_STATIC);
+ auto ec = std::error_code{result, sqlite::error_category()};
+ if (ec != sqlite::errc::ok) {
+ ldpp_dout(dpp, 1) << "binding failed on parameter name="
+ << name << " value=" << value << dendl;
+ sqlite3* db = ::sqlite3_db_handle(stmt.get());
+ throw sqlite::error(db, ec);
+ }
+}
+
+void bind_int(const DoutPrefixProvider* dpp, const stmt_binding& stmt,
+ const char* name, int value)
+{
+ const int index = bind_index(dpp, stmt, name);
+
+ int result = ::sqlite3_bind_int(stmt.get(), index, value);
+ auto ec = std::error_code{result, sqlite::error_category()};
+ if (ec != sqlite::errc::ok) {
+ ldpp_dout(dpp, 1) << "binding failed on parameter name="
+ << name << " value=" << value << dendl;
+ sqlite3* db = ::sqlite3_db_handle(stmt.get());
+ throw sqlite::error(db, ec);
+ }
+}
+
+void eval0(const DoutPrefixProvider* dpp, const stmt_execution& stmt)
+{
+ sqlite_ptr<char> sql;
+ if (dpp->get_cct()->_conf->subsys.should_gather<dout_subsys, 20>()) {
+ sql.reset(::sqlite3_expanded_sql(stmt.get()));
+ }
+
+ const int result = ::sqlite3_step(stmt.get());
+ auto ec = std::error_code{result, sqlite::error_category()};
+ sqlite3* db = ::sqlite3_db_handle(stmt.get());
+
+ if (ec != sqlite::errc::done) {
+ const char* errmsg = ::sqlite3_errmsg(db);
+ ldpp_dout(dpp, 20) << "evaluation failed: " << errmsg
+ << " (" << ec << ")\nstatement: " << sql.get() << dendl;
+ throw sqlite::error(errmsg, ec);
+ }
+ ldpp_dout(dpp, 20) << "evaluation succeeded: " << sql.get() << dendl;
+}
+
+void eval1(const DoutPrefixProvider* dpp, const stmt_execution& stmt)
+{
+ sqlite_ptr<char> sql;
+ if (dpp->get_cct()->_conf->subsys.should_gather<dout_subsys, 20>()) {
+ sql.reset(::sqlite3_expanded_sql(stmt.get()));
+ }
+
+ const int result = ::sqlite3_step(stmt.get());
+ auto ec = std::error_code{result, sqlite::error_category()};
+ if (ec != sqlite::errc::row) {
+ sqlite3* db = ::sqlite3_db_handle(stmt.get());
+ const char* errmsg = ::sqlite3_errmsg(db);
+ ldpp_dout(dpp, 1) << "evaluation failed: " << errmsg << " (" << ec
+ << ")\nstatement: " << sql.get() << dendl;
+ throw sqlite::error(errmsg, ec);
+ }
+ ldpp_dout(dpp, 20) << "evaluation succeeded: " << sql.get() << dendl;
+}
+
+int column_int(const stmt_execution& stmt, int column)
+{
+ return ::sqlite3_column_int(stmt.get(), column);
+}
+
+std::string column_text(const stmt_execution& stmt, int column)
+{
+ const unsigned char* text = ::sqlite3_column_text(stmt.get(), column);
+ // may be NULL
+ if (text) {
+ const std::size_t size = ::sqlite3_column_bytes(stmt.get(), column);
+ return {reinterpret_cast<const char*>(text), size};
+ } else {
+ return {};
+ }
+}
+
+auto read_text_rows(const DoutPrefixProvider* dpp,
+ const stmt_execution& stmt,
+ std::span<std::string> entries)
+ -> std::span<std::string>
+{
+ sqlite_ptr<char> sql;
+ if (dpp->get_cct()->_conf->subsys.should_gather<dout_subsys, 20>()) {
+ sql.reset(::sqlite3_expanded_sql(stmt.get()));
+ }
+
+ std::size_t count = 0;
+ while (count < entries.size()) {
+ const int result = ::sqlite3_step(stmt.get());
+ auto ec = std::error_code{result, sqlite::error_category()};
+ if (ec == sqlite::errc::done) {
+ break;
+ }
+ if (ec != sqlite::errc::row) {
+ sqlite3* db = ::sqlite3_db_handle(stmt.get());
+ const char* errmsg = ::sqlite3_errmsg(db);
+ ldpp_dout(dpp, 1) << "evaluation failed: " << errmsg << " (" << ec
+ << ")\nstatement: " << sql.get() << dendl;
+ throw sqlite::error(errmsg, ec);
+ }
+ entries[count] = column_text(stmt, 0);
+ ++count;
+ }
+ ldpp_dout(dpp, 20) << "statement evaluation produced " << count
+ << " results: " << sql.get() << dendl;
+
+ return entries.first(count);
+}
+
+void execute(const DoutPrefixProvider* dpp, sqlite3* db, const char* query,
+ sqlite3_callback callback, void* arg)
+{
+ char* errmsg = nullptr;
+ const int result = ::sqlite3_exec(db, query, callback, arg, &errmsg);
+ auto ec = std::error_code{result, sqlite::error_category()};
+ auto ptr = sqlite_ptr<char>{errmsg}; // free on destruction
+ if (ec != sqlite::errc::ok) {
+ ldpp_dout(dpp, 1) << "query execution failed: " << errmsg << " (" << ec
+ << ")\nquery: " << query << dendl;
+ throw sqlite::error(errmsg, ec);
+ }
+ ldpp_dout(dpp, 20) << "query execution succeeded: " << query << dendl;
+}
+
+} // namespace rgw::dbstore::sqlite
diff --git a/src/rgw/driver/dbstore/sqlite/statement.h b/src/rgw/driver/dbstore/sqlite/statement.h
new file mode 100644
index 000000000..98b4acfea
--- /dev/null
+++ b/src/rgw/driver/dbstore/sqlite/statement.h
@@ -0,0 +1,83 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <memory>
+#include <span>
+#include <string>
+
+#include <sqlite3.h>
+
+class DoutPrefixProvider;
+
+namespace rgw::dbstore::sqlite {
+
+// owning sqlite3_stmt pointer
+struct stmt_deleter {
+ void operator()(sqlite3_stmt* p) const { ::sqlite3_finalize(p); }
+};
+using stmt_ptr = std::unique_ptr<sqlite3_stmt, stmt_deleter>;
+
+// non-owning sqlite3_stmt pointer that clears binding state on destruction
+struct stmt_binding_deleter {
+ void operator()(sqlite3_stmt* p) const { ::sqlite3_clear_bindings(p); }
+};
+using stmt_binding = std::unique_ptr<sqlite3_stmt, stmt_binding_deleter>;
+
+// non-owning sqlite3_stmt pointer that clears execution state on destruction
+struct stmt_execution_deleter {
+ void operator()(sqlite3_stmt* p) const { ::sqlite3_reset(p); }
+};
+using stmt_execution = std::unique_ptr<sqlite3_stmt, stmt_execution_deleter>;
+
+
+// prepare the sql statement or throw on error
+stmt_ptr prepare_statement(const DoutPrefixProvider* dpp,
+ sqlite3* db, std::string_view sql);
+
+// bind an input string for the given parameter name
+void bind_text(const DoutPrefixProvider* dpp, const stmt_binding& stmt,
+ const char* name, std::string_view value);
+
+// bind an input integer for the given parameter name
+void bind_int(const DoutPrefixProvider* dpp, const stmt_binding& stmt,
+ const char* name, int value);
+
+// evaluate a prepared statement, expecting no result rows
+void eval0(const DoutPrefixProvider* dpp, const stmt_execution& stmt);
+
+// evaluate a prepared statement, expecting a single result row
+void eval1(const DoutPrefixProvider* dpp, const stmt_execution& stmt);
+
+// return the given column as an integer
+int column_int(const stmt_execution& stmt, int column);
+
+// return the given column as text, or an empty string on NULL
+std::string column_text(const stmt_execution& stmt, int column);
+
+// read the text column from each result row into the given entries, and return
+// the sub-span of entries that contain results
+auto read_text_rows(const DoutPrefixProvider* dpp,
+ const stmt_execution& stmt,
+ std::span<std::string> entries)
+ -> std::span<std::string>;
+
+// execute a raw query without preparing a statement. the optional callback
+// can be used to read results
+void execute(const DoutPrefixProvider* dpp, sqlite3* db, const char* query,
+ sqlite3_callback callback, void* arg);
+
+} // namespace rgw::dbstore::sqlite
diff --git a/src/rgw/driver/dbstore/tests/CMakeLists.txt b/src/rgw/driver/dbstore/tests/CMakeLists.txt
new file mode 100644
index 000000000..4e60dcf5e
--- /dev/null
+++ b/src/rgw/driver/dbstore/tests/CMakeLists.txt
@@ -0,0 +1,17 @@
+cmake_minimum_required(VERSION 3.14.0)
+project(dbstore-tests)
+
+set (CMAKE_LINK_LIBRARIES ${CMAKE_LINK_LIBRARIES} gtest)
+
+set(dbstore_tests_srcs
+ dbstore_tests.cc)
+
+include_directories(${CMAKE_INCLUDE_DIR})
+
+add_executable(unittest_dbstore_tests ${dbstore_tests_srcs})
+target_link_libraries(unittest_dbstore_tests ${CMAKE_LINK_LIBRARIES})
+add_ceph_unittest(unittest_dbstore_tests)
+
+add_executable(unittest_dbstore_mgr_tests dbstore_mgr_tests.cc)
+target_link_libraries(unittest_dbstore_mgr_tests dbstore gtest_main)
+add_ceph_unittest(unittest_dbstore_mgr_tests)
diff --git a/src/rgw/driver/dbstore/tests/dbstore_mgr_tests.cc b/src/rgw/driver/dbstore/tests/dbstore_mgr_tests.cc
new file mode 100644
index 000000000..02ecd9f15
--- /dev/null
+++ b/src/rgw/driver/dbstore/tests/dbstore_mgr_tests.cc
@@ -0,0 +1,157 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/ceph_context.h"
+#include "rgw/driver/dbstore/dbstore_mgr.h"
+
+#include <filesystem>
+#include <gtest/gtest.h>
+#include <memory>
+
+using namespace rgw;
+namespace fs = std::filesystem;
+const static std::string TEST_DIR = "rgw_dbstore_tests";
+
+bool endsWith(const std::string &mainStr, const std::string &toMatch)
+{
+ if(mainStr.size() >= toMatch.size() &&
+ mainStr.compare(mainStr.size() - toMatch.size(), toMatch.size(), toMatch) == 0)
+ return true;
+ else
+ return false;
+}
+
+class TestDBStoreManager : public ::testing::Test {
+protected:
+ void SetUp() override {
+ ctx_ = std::make_shared<CephContext>(CEPH_ENTITY_TYPE_CLIENT);
+ g_ceph_context = ctx_.get();
+ fs::current_path(fs::temp_directory_path());
+ fs::create_directory(TEST_DIR);
+ }
+
+ void TearDown() override {
+ fs::current_path(fs::temp_directory_path());
+ fs::remove_all(TEST_DIR);
+ }
+
+ std::string getTestDir() const {
+ auto test_dir = fs::temp_directory_path() / TEST_DIR;
+ return test_dir.string();
+ }
+
+ fs::path getDBFullPath(const std::string & base_dir,
+ const std::string & tenant) const {
+ auto db_path = ctx_->_conf.get_val<std::string>("dbstore_db_dir");
+ const auto& db_name = ctx_->_conf.get_val<std::string>("dbstore_db_name_prefix") + "-" + tenant + ".db";
+
+ auto db_full_path = std::filesystem::path(db_path) / db_name;
+ auto db_full_path_test = fs::path(base_dir) / db_full_path;
+ return db_full_path_test;
+ }
+
+ std::string getDBTenant(const std::string & base_dir,
+ const std::string & tenant) const {
+ auto db_name = ctx_->_conf.get_val<std::string>("dbstore_db_name_prefix");
+ db_name += "-" + tenant;
+ auto db_full_path = fs::path(base_dir) / db_name;
+ return db_full_path.string();
+ }
+
+ std::string getDBTenant(const std::string & tenant = default_tenant) const {
+ return getDBTenant(getTestDir(), tenant);
+ }
+
+ fs::path getDBFullPath(const std::string & tenant) const {
+ return getDBFullPath(getTestDir(), tenant);
+ }
+
+ fs::path getLogFilePath(const std::string & log_file) {
+ return fs::temp_directory_path() / log_file;
+ }
+
+ std::shared_ptr<CephContext> getContext() const {
+ return ctx_;
+ }
+
+ private:
+ std::shared_ptr<CephContext> ctx_;
+};
+
+TEST_F(TestDBStoreManager, BasicInstantiateUsingDBDir) {
+ getContext()->_conf.set_val("dbstore_db_dir", getTestDir());
+
+ EXPECT_FALSE(fs::exists(getDBFullPath(default_tenant)));
+ auto dbstore_mgr = std::make_shared<DBStoreManager>(getContext().get());
+ EXPECT_TRUE(fs::exists(getDBFullPath(default_tenant)));
+}
+
+TEST_F(TestDBStoreManager, DBNamePrefix) {
+ getContext()->_conf.set_val("dbstore_db_dir", getTestDir());
+ std::string prefix = "testprefix";
+ getContext()->_conf.set_val("dbstore_db_name_prefix", prefix);
+
+ EXPECT_FALSE(fs::exists(getDBFullPath(default_tenant)));
+ auto dbstore_mgr = std::make_shared<DBStoreManager>(getContext().get());
+ EXPECT_TRUE(fs::exists(getDBFullPath(default_tenant)));
+
+ // check that the database name contains the given prefix
+ std::string expected_db_name = prefix + "-" + default_tenant + ".db";
+ EXPECT_TRUE(endsWith(getDBFullPath(default_tenant), expected_db_name));
+}
+
+TEST_F(TestDBStoreManager, BasicInstantiateSecondConstructor) {
+ getContext()->_conf.set_val("dbstore_db_dir", getTestDir());
+
+ EXPECT_FALSE(fs::exists(getDBFullPath(default_tenant)));
+ auto dbstore_mgr = std::make_shared<DBStoreManager>(getContext().get(), getLogFilePath("test.log").string(), 10);
+ EXPECT_TRUE(fs::exists(getDBFullPath(default_tenant)));
+}
+
+TEST_F(TestDBStoreManager, TestDBName) {
+ getContext()->_conf.set_val("dbstore_db_dir", getTestDir());
+
+ auto dbstore_mgr = std::make_shared<DBStoreManager>(getContext().get());
+ auto db = dbstore_mgr->getDB(default_tenant, false);
+ ASSERT_NE(nullptr, db);
+ EXPECT_EQ(getDBTenant(), db->getDBname());
+}
+
+
+TEST_F(TestDBStoreManager, TestDBNameDefaultDB) {
+ getContext()->_conf.set_val("dbstore_db_dir", getTestDir());
+
+ auto dbstore_mgr = std::make_shared<DBStoreManager>(getContext().get());
+ // passing an empty tenant should return the default_db
+ auto db = dbstore_mgr->getDB("", false);
+ ASSERT_NE(nullptr, db);
+ EXPECT_EQ(getDBTenant(), db->getDBname());
+}
+
+TEST_F(TestDBStoreManager, TestDBBadTenant) {
+ getContext()->_conf.set_val("dbstore_db_dir", getTestDir());
+
+ auto dbstore_mgr = std::make_shared<DBStoreManager>(getContext().get());
+ auto db = dbstore_mgr->getDB("does-not-exist", false);
+ ASSERT_EQ(nullptr, db);
+}
+
+TEST_F(TestDBStoreManager, TestGetNewDB) {
+ getContext()->_conf.set_val("dbstore_db_dir", getTestDir());
+
+ auto dbstore_mgr = std::make_shared<DBStoreManager>(getContext().get());
+
+ auto new_tenant_path = "new_tenant";
+ auto db = dbstore_mgr->getDB(new_tenant_path, true);
+ ASSERT_NE(nullptr, db);
+ EXPECT_EQ(getDBTenant(new_tenant_path), db->getDBname());
+}
+
+TEST_F(TestDBStoreManager, TestDelete) {
+ getContext()->_conf.set_val("dbstore_db_dir", getTestDir());
+
+ auto dbstore_mgr = std::make_shared<DBStoreManager>(getContext().get());
+ dbstore_mgr->deleteDB(default_tenant);
+ auto db = dbstore_mgr->getDB(default_tenant, false);
+ ASSERT_EQ(nullptr, db);
+}
diff --git a/src/rgw/driver/dbstore/tests/dbstore_tests.cc b/src/rgw/driver/dbstore/tests/dbstore_tests.cc
new file mode 100644
index 000000000..27edb7b85
--- /dev/null
+++ b/src/rgw/driver/dbstore/tests/dbstore_tests.cc
@@ -0,0 +1,1417 @@
+#include "gtest/gtest.h"
+#include <iostream>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <dbstore.h>
+#include <sqliteDB.h>
+#include "rgw_common.h"
+
+using namespace std;
+using DB = rgw::store::DB;
+
+vector<const char*> args;
+
+namespace gtest {
+ class Environment* env;
+
+ class Environment : public ::testing::Environment {
+ public:
+ Environment(): tenant("default_ns"), db(nullptr),
+ db_type("SQLite"), ret(-1) {}
+
+ Environment(string tenantname, string db_typename):
+ tenant(tenantname), db(nullptr),
+ db_type(db_typename), ret(-1) {}
+
+ virtual ~Environment() {}
+
+ void SetUp() override {
+ cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_DAEMON,
+ CINIT_FLAG_NO_DEFAULT_CONFIG_FILE | CINIT_FLAG_NO_MON_CONFIG | CINIT_FLAG_NO_DAEMON_ACTIONS);
+ if (!db_type.compare("SQLite")) {
+ db = new SQLiteDB(tenant, cct.get());
+ ASSERT_TRUE(db != nullptr);
+ ret = db->Initialize(logfile, loglevel);
+ ASSERT_GE(ret, 0);
+ }
+ }
+
+ void TearDown() override {
+ if (!db)
+ return;
+ db->Destroy(db->get_def_dpp());
+ delete db;
+ }
+
+ string tenant;
+ DB *db;
+ string db_type;
+ int ret;
+ string logfile = "rgw_dbstore_tests.log";
+ int loglevel = 30;
+ boost::intrusive_ptr<CephContext> cct;
+ };
+}
+
+ceph::real_time bucket_mtime = real_clock::now();
+string marker1;
+
+class DBGetDataCB : public RGWGetDataCB {
+ public:
+ bufferlist data_bl;
+ off_t data_ofs, data_len;
+
+ int handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) {
+ data_bl = bl;
+ data_ofs = bl_ofs;
+ data_len = bl_len;
+ return 0;
+ }
+};
+
+namespace {
+
+ class DBStoreTest : public ::testing::Test {
+ protected:
+ int ret;
+ DB *db = nullptr;
+ string user1 = "user1";
+ string user_id1 = "user_id1";
+ string bucket1 = "bucket1";
+ string object1 = "object1";
+ string data = "Hello World";
+ DBOpParams GlobalParams = {};
+ const DoutPrefixProvider *dpp;
+
+ DBStoreTest() {}
+ void SetUp() {
+ db = gtest::env->db;
+ ASSERT_TRUE(db != nullptr);
+ dpp = db->get_def_dpp();
+ ASSERT_TRUE(dpp != nullptr);
+
+ GlobalParams.op.user.uinfo.display_name = user1;
+ GlobalParams.op.user.uinfo.user_id.id = user_id1;
+ GlobalParams.op.bucket.info.bucket.name = bucket1;
+ GlobalParams.op.obj.state.obj.bucket = GlobalParams.op.bucket.info.bucket;
+ GlobalParams.op.obj.state.obj.key.name = object1;
+ GlobalParams.op.obj.state.obj.key.instance = "inst1";
+ GlobalParams.op.obj.obj_id = "obj_id1";
+ GlobalParams.op.obj_data.part_num = 0;
+
+      /* As of now, InitializeParams doesn't do anything
+       * special based on fop. Hence it's okay to do
+       * global initialization once.
+       */
+ ret = db->InitializeParams(dpp, &GlobalParams);
+ ASSERT_EQ(ret, 0);
+ }
+
+ void TearDown() {
+ }
+
+ int write_object(const DoutPrefixProvider *dpp, DBOpParams params) {
+ DB::Object op_target(db, params.op.bucket.info,
+ params.op.obj.state.obj);
+ DB::Object::Write write_op(&op_target);
+ map<string, bufferlist> setattrs;
+ ret = write_op.prepare(dpp);
+ if (ret)
+ return ret;
+
+ write_op.meta.mtime = &bucket_mtime;
+ write_op.meta.category = RGWObjCategory::Main;
+ write_op.meta.owner = params.op.user.uinfo.user_id;
+
+ bufferlist b1 = params.op.obj.head_data;
+ write_op.meta.data = &b1;
+
+ bufferlist b2;
+ encode("ACL", b2);
+ setattrs[RGW_ATTR_ACL] = b2;
+
+ ret = write_op.write_meta(0, params.op.obj.state.size, b1.length()+1, setattrs);
+ return ret;
+ }
+ };
+}
+
+TEST_F(DBStoreTest, InsertUser) {
+ struct DBOpParams params = GlobalParams;
+ int ret = -1;
+
+ params.op.user.uinfo.user_id.tenant = "tenant";
+ params.op.user.uinfo.user_email = "user1@dbstore.com";
+ params.op.user.uinfo.suspended = 123;
+ params.op.user.uinfo.max_buckets = 456;
+ params.op.user.uinfo.placement_tags.push_back("tags");
+ RGWAccessKey k1("id1", "key1");
+ RGWAccessKey k2("id2", "key2");
+ params.op.user.uinfo.access_keys["id1"] = k1;
+ params.op.user.uinfo.access_keys["id2"] = k2;
+ params.op.user.user_version.ver = 1;
+ params.op.user.user_version.tag = "UserTAG";
+
+ ret = db->ProcessOp(dpp, "InsertUser", &params);
+ ASSERT_EQ(ret, 0);
+}
+
+TEST_F(DBStoreTest, GetUser) {
+ struct DBOpParams params = GlobalParams;
+ int ret = -1;
+
+ ret = db->ProcessOp(dpp, "GetUser", &params);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(params.op.user.uinfo.user_id.tenant, "tenant");
+ ASSERT_EQ(params.op.user.uinfo.user_email, "user1@dbstore.com");
+ ASSERT_EQ(params.op.user.uinfo.user_id.id, "user_id1");
+ ASSERT_EQ(params.op.user.uinfo.suspended, 123);
+ ASSERT_EQ(params.op.user.uinfo.max_buckets, 456);
+ ASSERT_EQ(params.op.user.uinfo.placement_tags.back(), "tags");
+ RGWAccessKey k;
+ map<string, RGWAccessKey>::iterator it2 = params.op.user.uinfo.access_keys.begin();
+ k = it2->second;
+ ASSERT_EQ(k.id, "id1");
+ ASSERT_EQ(k.key, "key1");
+ it2++;
+ k = it2->second;
+ ASSERT_EQ(k.id, "id2");
+ ASSERT_EQ(k.key, "key2");
+
+}
+
+TEST_F(DBStoreTest, GetUserQuery) {
+ struct DBOpParams params = GlobalParams;
+ int ret = -1;
+
+ params.op.query_str = "email";
+ params.op.user.uinfo.user_email = "user1@dbstore.com";
+
+ ret = db->ProcessOp(dpp, "GetUser", &params);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(params.op.user.uinfo.user_id.tenant, "tenant");
+ ASSERT_EQ(params.op.user.uinfo.user_email, "user1@dbstore.com");
+ ASSERT_EQ(params.op.user.uinfo.user_id.id, "user_id1");
+ ASSERT_EQ(params.op.user.uinfo.suspended, 123);
+ ASSERT_EQ(params.op.user.uinfo.max_buckets, 456);
+ ASSERT_EQ(params.op.user.uinfo.placement_tags.back(), "tags");
+ RGWAccessKey k;
+ map<string, RGWAccessKey>::iterator it2 = params.op.user.uinfo.access_keys.begin();
+ k = it2->second;
+ ASSERT_EQ(k.id, "id1");
+ ASSERT_EQ(k.key, "key1");
+ it2++;
+ k = it2->second;
+ ASSERT_EQ(k.id, "id2");
+ ASSERT_EQ(k.key, "key2");
+
+}
+
+TEST_F(DBStoreTest, GetUserQueryByEmail) {
+ int ret = -1;
+ RGWUserInfo uinfo;
+ string email = "user1@dbstore.com";
+ map<std::string, bufferlist> attrs;
+ RGWObjVersionTracker objv;
+
+ ret = db->get_user(dpp, "email", email, uinfo, &attrs, &objv);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(uinfo.user_id.tenant, "tenant");
+ ASSERT_EQ(uinfo.user_email, "user1@dbstore.com");
+ ASSERT_EQ(uinfo.user_id.id, "user_id1");
+ ASSERT_EQ(uinfo.suspended, 123);
+ ASSERT_EQ(uinfo.max_buckets, 456);
+ ASSERT_EQ(uinfo.placement_tags.back(), "tags");
+ RGWAccessKey k;
+ map<string, RGWAccessKey>::iterator it2 = uinfo.access_keys.begin();
+ k = it2->second;
+ ASSERT_EQ(k.id, "id1");
+ ASSERT_EQ(k.key, "key1");
+ it2++;
+ k = it2->second;
+ ASSERT_EQ(k.id, "id2");
+ ASSERT_EQ(k.key, "key2");
+ ASSERT_EQ(objv.read_version.ver, 1);
+}
+
+TEST_F(DBStoreTest, GetUserQueryByAccessKey) {
+  int ret = -1;
+  RGWUserInfo uinfo;
+  string key = "id1";
+
+  // Look up the same user by one of its access-key ids; attrs and objv
+  // out-params are optional and passed as nullptr here.
+  ret = db->get_user(dpp, "access_key", key, uinfo, nullptr, nullptr);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(uinfo.user_id.tenant, "tenant");
+  ASSERT_EQ(uinfo.user_email, "user1@dbstore.com");
+  ASSERT_EQ(uinfo.user_id.id, "user_id1");
+  ASSERT_EQ(uinfo.suspended, 123);
+  ASSERT_EQ(uinfo.max_buckets, 456);
+  ASSERT_EQ(uinfo.placement_tags.back(), "tags");
+  RGWAccessKey k;
+  map<string, RGWAccessKey>::iterator it2 = uinfo.access_keys.begin();
+  k = it2->second;
+  ASSERT_EQ(k.id, "id1");
+  ASSERT_EQ(k.key, "key1");
+  it2++;
+  k = it2->second;
+  ASSERT_EQ(k.id, "id2");
+  ASSERT_EQ(k.key, "key2");
+}
+
+TEST_F(DBStoreTest, StoreUser) {
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+  RGWUserInfo uinfo, old_uinfo;
+  map<std::string, bufferlist> attrs;
+  RGWObjVersionTracker objv_tracker;
+
+  // Exercise store_user(): initial create, version-mismatch rejection,
+  // exclusive overwrite with the right version, and a plain update.
+  bufferlist attr1, attr2;
+  encode("attrs1", attr1);
+  attrs["attr1"] = attr1;
+  encode("attrs2", attr2);
+  attrs["attr2"] = attr2;
+
+  uinfo.user_id.id = "user_id2";
+  uinfo.user_id.tenant = "tenant";
+  uinfo.user_email = "user2@dbstore.com";
+  uinfo.suspended = 123;
+  uinfo.max_buckets = 456;
+  uinfo.placement_tags.push_back("tags");
+  RGWAccessKey k1("id1", "key1");
+  RGWAccessKey k2("id2", "key2");
+  uinfo.access_keys["id1"] = k1;
+  uinfo.access_keys["id2"] = k2;
+
+  /* non exclusive create..should create new one */
+  ret = db->store_user(dpp, uinfo, false, &attrs, &objv_tracker, &old_uinfo);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(old_uinfo.user_email, ""); // no previous record existed
+  ASSERT_EQ(objv_tracker.read_version.ver, 1);
+  ASSERT_EQ(objv_tracker.read_version.tag, "UserTAG");
+
+  /* invalid version number */
+  objv_tracker.read_version.ver = 4;
+  ret = db->store_user(dpp, uinfo, true, &attrs, &objv_tracker, &old_uinfo);
+  ASSERT_EQ(ret, -125); /* returns ECANCELED */
+  ASSERT_EQ(old_uinfo.user_id.id, uinfo.user_id.id);
+  ASSERT_EQ(old_uinfo.user_email, uinfo.user_email);
+
+  /* exclusive create..should not create new one */
+  uinfo.user_email = "user2_new@dbstore.com";
+  objv_tracker.read_version.ver = 1;
+  ret = db->store_user(dpp, uinfo, true, &attrs, &objv_tracker, &old_uinfo);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(old_uinfo.user_email, "user2@dbstore.com");
+  ASSERT_EQ(objv_tracker.read_version.ver, 1);
+
+  // Non-exclusive update succeeds and bumps the stored version to 2.
+  ret = db->store_user(dpp, uinfo, false, &attrs, &objv_tracker, &old_uinfo);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(old_uinfo.user_email, "user2@dbstore.com");
+  ASSERT_EQ(objv_tracker.read_version.ver, 2);
+  ASSERT_EQ(objv_tracker.read_version.tag, "UserTAG");
+}
+
+TEST_F(DBStoreTest, GetUserQueryByUserID) {
+  int ret = -1;
+  RGWUserInfo uinfo;
+  map<std::string, bufferlist> attrs;
+  RGWObjVersionTracker objv;
+
+  // Fetch user_id2 (written by StoreUser) by user-id and verify the
+  // updated email, version, access keys and attrs all persisted.
+  uinfo.user_id.tenant = "tenant";
+  uinfo.user_id.id = "user_id2";
+
+  ret = db->get_user(dpp, "user_id", "user_id2", uinfo, &attrs, &objv);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(uinfo.user_id.tenant, "tenant");
+  ASSERT_EQ(uinfo.user_email, "user2_new@dbstore.com");
+  ASSERT_EQ(uinfo.user_id.id, "user_id2");
+  ASSERT_EQ(uinfo.suspended, 123);
+  ASSERT_EQ(uinfo.max_buckets, 456);
+  ASSERT_EQ(uinfo.placement_tags.back(), "tags");
+  RGWAccessKey k;
+  map<string, RGWAccessKey>::iterator it = uinfo.access_keys.begin();
+  k = it->second;
+  ASSERT_EQ(k.id, "id1");
+  ASSERT_EQ(k.key, "key1");
+  it++;
+  k = it->second;
+  ASSERT_EQ(k.id, "id2");
+  ASSERT_EQ(k.key, "key2");
+
+  // StoreUser ended at version 2.
+  ASSERT_EQ(objv.read_version.ver, 2);
+
+  bufferlist k1, k2;
+  string attr;
+  map<std::string, bufferlist>::iterator it2 = attrs.begin();
+  k1 = it2->second;
+  decode(attr, k1);
+  ASSERT_EQ(attr, "attrs1");
+  it2++;
+  k2 = it2->second;
+  decode(attr, k2);
+  ASSERT_EQ(attr, "attrs2");
+}
+
+TEST_F(DBStoreTest, ListAllUsers) {
+  // Dump every user row; success status is all this test checks.
+  struct DBOpParams params = GlobalParams;
+
+  int ret = db->ListAllUsers(dpp, &params);
+  ASSERT_EQ(ret, 0);
+}
+
+TEST_F(DBStoreTest, InsertBucket) {
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+
+  // Seed "bucket1" via the low-level "InsertBucket" op; later bucket
+  // tests read back and update this row.
+  params.op.bucket.info.bucket.name = "bucket1";
+  params.op.bucket.info.bucket.tenant = "tenant";
+  params.op.bucket.info.bucket.marker = "marker1";
+
+  params.op.bucket.ent.size = 1024;
+
+  params.op.bucket.info.has_instance_obj = false;
+  params.op.bucket.bucket_version.ver = 1;
+  params.op.bucket.bucket_version.tag = "read_tag";
+
+  params.op.bucket.mtime = bucket_mtime;
+
+  ret = db->ProcessOp(dpp, "InsertBucket", &params);
+  ASSERT_EQ(ret, 0);
+}
+
+TEST_F(DBStoreTest, UpdateBucketAttrs) {
+  int ret = -1;
+  RGWBucketInfo info;
+  map<std::string, bufferlist> attrs;
+  RGWObjVersionTracker objv;
+
+  // Replace bucket1's attrs via update_bucket("attrs", ...); the update
+  // is guarded by the object version (CAS semantics).
+  bufferlist aclbl, aclbl2;
+  encode("attrs1", aclbl);
+  attrs["attr1"] = aclbl;
+  encode("attrs2", aclbl2);
+  attrs["attr2"] = aclbl2;
+
+  info.bucket.name = "bucket1";
+
+  /* invalid version number */
+  objv.read_version.ver = 4;
+  ret = db->update_bucket(dpp, "attrs", info, false, nullptr, &attrs, &bucket_mtime, &objv);
+  ASSERT_EQ(ret, -125); /* returns ECANCELED */
+
+  /* right version number */
+  objv.read_version.ver = 1;
+  ret = db->update_bucket(dpp, "attrs", info, false, nullptr, &attrs, &bucket_mtime, &objv);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(objv.read_version.ver, 2); // successful update bumps the version
+}
+
+TEST_F(DBStoreTest, UpdateBucketInfo) {
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+  RGWBucketInfo info;
+
+  // Read bucket1 back, change its marker, and write it via
+  // update_bucket("info", ...); version should advance 2 -> 3.
+  params.op.bucket.info.bucket.name = "bucket1";
+
+  ret = db->ProcessOp(dpp, "GetBucket", &params);
+  ASSERT_EQ(ret, 0);
+
+  info = params.op.bucket.info;
+
+  info.bucket.marker = "marker2";
+  ret = db->update_bucket(dpp, "info", info, false, nullptr, nullptr, &bucket_mtime, nullptr);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(info.objv_tracker.read_version.ver, 3);
+}
+
+TEST_F(DBStoreTest, GetBucket) {
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+
+  // Verify the cumulative state of bucket1 after InsertBucket,
+  // UpdateBucketAttrs (attrs, ver 2) and UpdateBucketInfo (marker2, ver 3).
+  params.op.bucket.info.bucket.name = "bucket1";
+  ret = db->ProcessOp(dpp, "GetBucket", &params);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(params.op.bucket.info.bucket.name, "bucket1");
+  ASSERT_EQ(params.op.bucket.info.bucket.tenant, "tenant");
+  ASSERT_EQ(params.op.bucket.info.bucket.marker, "marker2");
+  ASSERT_EQ(params.op.bucket.ent.size, 1024);
+  ASSERT_EQ(params.op.bucket.ent.bucket.name, "bucket1");
+  ASSERT_EQ(params.op.bucket.ent.bucket.tenant, "tenant");
+  ASSERT_EQ(params.op.bucket.info.has_instance_obj, false);
+  ASSERT_EQ(params.op.bucket.info.objv_tracker.read_version.ver, 3);
+  ASSERT_EQ(params.op.bucket.info.objv_tracker.read_version.tag, "read_tag");
+  ASSERT_EQ(params.op.bucket.mtime, bucket_mtime);
+  ASSERT_EQ(params.op.bucket.info.owner.id, "user_id1");
+  bufferlist k, k2;
+  string acl;
+  map<std::string, bufferlist>::iterator it2 = params.op.bucket.bucket_attrs.begin();
+  k = it2->second;
+  decode(acl, k);
+  ASSERT_EQ(acl, "attrs1");
+  it2++;
+  k2 = it2->second;
+  decode(acl, k2);
+  ASSERT_EQ(acl, "attrs2");
+}
+
+TEST_F(DBStoreTest, CreateBucket) {
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+  RGWBucketInfo info;
+  RGWUserInfo owner;
+  rgw_bucket bucket;
+  obj_version objv;
+  rgw_placement_rule rule;
+  map<std::string, bufferlist> attrs;
+
+  // Create five buckets ("bucket1".."bucket5") owned by user_id1, all with
+  // the same placement rule and an explicit object version. The original
+  // repeated the call five times verbatim; a loop keeps the calls identical.
+  owner.user_id.id = "user_id1";
+  bucket.tenant = "tenant";
+
+  objv.ver = 2;
+  objv.tag = "write_tag";
+
+  rule.name = "rule1";
+  rule.storage_class = "sc1";
+
+  for (int i = 1; i <= 5; i++) {
+    bucket.name = "bucket" + std::to_string(i);
+    ret = db->create_bucket(dpp, owner, bucket, "zid", rule, "swift_ver", NULL,
+        attrs, info, &objv, NULL, bucket_mtime, NULL, NULL,
+        null_yield, false);
+    ASSERT_EQ(ret, 0);
+  }
+}
+
+TEST_F(DBStoreTest, GetBucketQueryByName) {
+  int ret = -1;
+  RGWBucketInfo binfo;
+  binfo.bucket.name = "bucket2";
+  rgw::sal::Attrs attrs;
+  ceph::real_time mtime;
+  obj_version objv;
+
+  // Fetch bucket2 (created by CreateBucket) by name and verify the
+  // fields written at creation time.
+  ret = db->get_bucket_info(dpp, "name", "", binfo, &attrs, &mtime, &objv);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(binfo.bucket.name, "bucket2");
+  ASSERT_EQ(binfo.bucket.tenant, "tenant");
+  ASSERT_EQ(binfo.owner.id, "user_id1");
+  ASSERT_EQ(binfo.objv_tracker.read_version.ver, 2);
+  ASSERT_EQ(binfo.objv_tracker.read_version.tag, "write_tag");
+  ASSERT_EQ(binfo.zonegroup, "zid");
+  ASSERT_EQ(binfo.creation_time, bucket_mtime);
+  ASSERT_EQ(binfo.placement_rule.name, "rule1");
+  ASSERT_EQ(binfo.placement_rule.storage_class, "sc1");
+  ASSERT_EQ(objv.ver, 2);
+  ASSERT_EQ(objv.tag, "write_tag");
+
+  // Stash the marker in the shared test variable for later list tests.
+  marker1 = binfo.bucket.marker;
+}
+
+TEST_F(DBStoreTest, ListUserBuckets) {
+  int ret = -1;
+  rgw_user owner;
+  int max = 2;
+  bool need_stats = true;
+  bool is_truncated = false;
+  RGWUserBuckets ulist;
+
+  // Page through user_id1's buckets two at a time, advancing the marker
+  // to the last bucket name returned on each page, until not truncated.
+  // (The unused local DBOpParams copy was removed.)
+  owner.id = "user_id1";
+
+  marker1 = "";
+  do {
+    is_truncated = false;
+    ret = db->list_buckets(dpp, "", owner, marker1, "", max, need_stats, &ulist,
+        &is_truncated);
+    ASSERT_EQ(ret, 0);
+
+    cout << "marker1 :" << marker1 << "\n";
+
+    cout << "is_truncated :" << is_truncated << "\n";
+
+    for (const auto& ent: ulist.get_buckets()) {
+      RGWBucketEnt e = ent.second;
+      cout << "###################### \n";
+      cout << "ent.bucket.id : " << e.bucket.name << "\n";
+      cout << "ent.bucket.marker : " << e.bucket.marker << "\n";
+      cout << "ent.bucket.bucket_id : " << e.bucket.bucket_id << "\n";
+      cout << "ent.size : " << e.size << "\n";
+      cout << "ent.rule.name : " << e.placement_rule.name << "\n";
+
+      marker1 = e.bucket.name;
+    }
+    ulist.clear();
+  } while(is_truncated);
+}
+
+TEST_F(DBStoreTest, BucketChown) {
+  int ret = -1;
+  RGWBucketInfo info;
+  rgw_user user;
+  user.id = "user_id2";
+
+  // Transfer bucket5 to user_id2 via update_bucket("owner", ...).
+  info.bucket.name = "bucket5";
+
+  ret = db->update_bucket(dpp, "owner", info, false, &user, nullptr, &bucket_mtime, nullptr);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(info.objv_tracker.read_version.ver, 3);
+}
+
+TEST_F(DBStoreTest, ListAllBuckets) {
+  // Dump every bucket row; success status is all this test checks.
+  struct DBOpParams params = GlobalParams;
+
+  int ret = db->ListAllBuckets(dpp, &params);
+  ASSERT_EQ(ret, 0);
+}
+
+TEST_F(DBStoreTest, ListAllBuckets2) {
+  int ret = -1;
+  rgw_user owner;
+  int max = 2;
+  bool need_stats = true;
+  bool is_truncated = false;
+  RGWUserBuckets ulist;
+
+  // Page through ALL buckets (query_str "all") regardless of owner,
+  // two at a time. (The unused local DBOpParams copy was removed.)
+  marker1 = "";
+  do {
+    is_truncated = false;
+    ret = db->list_buckets(dpp, "all", owner, marker1, "", max, need_stats, &ulist,
+        &is_truncated);
+    ASSERT_EQ(ret, 0);
+
+    cout << "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ \n";
+    cout << "ownerID : " << owner.id << "\n";
+    cout << "marker1 :" << marker1 << "\n";
+
+    cout << "is_truncated :" << is_truncated << "\n";
+
+    for (const auto& ent: ulist.get_buckets()) {
+      RGWBucketEnt e = ent.second;
+      cout << "###################### \n";
+      cout << "ent.bucket.id : " << e.bucket.name << "\n";
+      cout << "ent.bucket.marker : " << e.bucket.marker << "\n";
+      cout << "ent.bucket.bucket_id : " << e.bucket.bucket_id << "\n";
+      cout << "ent.size : " << e.size << "\n";
+      cout << "ent.rule.name : " << e.placement_rule.name << "\n";
+
+      marker1 = e.bucket.name;
+    }
+    ulist.clear();
+  } while(is_truncated);
+}
+
+TEST_F(DBStoreTest, RemoveBucketAPI) {
+  // Remove bucket5 (created by CreateBucket, chowned by BucketChown)
+  // through the high-level remove_bucket() API.
+  RGWBucketInfo info;
+  info.bucket.name = "bucket5";
+
+  int ret = db->remove_bucket(dpp, info);
+  ASSERT_EQ(ret, 0);
+}
+
+TEST_F(DBStoreTest, RemoveUserAPI) {
+  int ret = -1;
+  RGWUserInfo uinfo;
+  RGWObjVersionTracker objv;
+
+  // Remove user_id2; the delete is version-guarded like store_user().
+  uinfo.user_id.tenant = "tenant";
+  uinfo.user_id.id = "user_id2";
+
+  /* invalid version number...should fail */
+  objv.read_version.ver = 4;
+  ret = db->remove_user(dpp, uinfo, &objv);
+  ASSERT_EQ(ret, -125); // -ECANCELED on version mismatch
+
+  objv.read_version.ver = 2;
+  ret = db->remove_user(dpp, uinfo, &objv);
+  ASSERT_EQ(ret, 0);
+}
+
+TEST_F(DBStoreTest, PutObject) {
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+
+  // Insert the default object from GlobalParams plus object2/object3,
+  // all with the same 12-byte encoded payload.
+  params.op.obj.category = RGWObjCategory::Main;
+  params.op.obj.storage_class = "STANDARD";
+  bufferlist b1;
+  encode("HELLO WORLD", b1);
+  cout<<"XXXXXXXXX Insert b1.length " << b1.length() << "\n";
+  params.op.obj.head_data = b1;
+  params.op.obj.state.size = 12;
+  params.op.obj.state.is_olh = false;
+  ret = db->ProcessOp(dpp, "PutObject", &params);
+  ASSERT_EQ(ret, 0);
+
+  /* Insert another objects */
+  params.op.obj.state.obj.key.name = "object2";
+  params.op.obj.state.obj.key.instance = "inst2";
+  ret = db->ProcessOp(dpp, "PutObject", &params);
+  ASSERT_EQ(ret, 0);
+
+  params.op.obj.state.obj.key.name = "object3";
+  params.op.obj.state.obj.key.instance = "inst3";
+  ret = db->ProcessOp(dpp, "PutObject", &params);
+  ASSERT_EQ(ret, 0);
+}
+
+TEST_F(DBStoreTest, ListAllObjects) {
+  // Dump every object row; a non-negative return means success.
+  struct DBOpParams params = GlobalParams;
+
+  int ret = db->ListAllObjects(dpp, &params);
+  ASSERT_GE(ret, 0);
+}
+
+TEST_F(DBStoreTest, GetObject) {
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+
+  // Read back the default object inserted by PutObject and verify
+  // category, storage class, payload and size.
+  ret = db->ProcessOp(dpp, "GetObject", &params);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(params.op.obj.category, RGWObjCategory::Main);
+  ASSERT_EQ(params.op.obj.storage_class, "STANDARD");
+  string data;
+  decode(data, params.op.obj.head_data);
+  ASSERT_EQ(data, "HELLO WORLD");
+  ASSERT_EQ(params.op.obj.state.size, 12);
+  cout << "versionNum :" << params.op.obj.version_num << "\n";
+}
+
+TEST_F(DBStoreTest, GetObjectState) {
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+  RGWObjState* s;
+
+  // Fetch object2's state both via get_obj_state() and the cached
+  // get_state() wrapper; both must agree.
+  params.op.obj.state.obj.key.name = "object2";
+  params.op.obj.state.obj.key.instance = "inst2";
+  DB::Object op_target(db, params.op.bucket.info,
+      params.op.obj.state.obj);
+
+  ret = op_target.get_obj_state(dpp, params.op.bucket.info, params.op.obj.state.obj,
+      false, &s);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(s->size, 12);
+  ASSERT_EQ(s->is_olh, false);
+  cout << "versionNum :" << params.op.obj.version_num << "\n";
+
+  /* Recheck with get_state API */
+  ret = op_target.get_state(dpp, &s, false);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(s->size, 12);
+  ASSERT_EQ(s->is_olh, false);
+  cout << "versionNum :" << params.op.obj.version_num << "\n";
+}
+
+TEST_F(DBStoreTest, ObjAttrs) {
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+  map<string, bufferlist> setattrs;
+  map<string, bufferlist> rmattrs;
+  map<string, bufferlist> readattrs;
+
+  // Set three attrs on the default object, read them back, then remove
+  // one and confirm only the remaining two survive.
+  bufferlist b1, b2, b3;
+  encode("ACL", b1);
+  setattrs[RGW_ATTR_ACL] = b1;
+  encode("LC", b2);
+  setattrs[RGW_ATTR_LC] = b2;
+  encode("ETAG", b3);
+  setattrs[RGW_ATTR_ETAG] = b3;
+
+  DB::Object op_target(db, params.op.bucket.info,
+      params.op.obj.state.obj);
+
+  /* Set some attrs */
+  ret = op_target.set_attrs(dpp, setattrs, nullptr);
+  ASSERT_EQ(ret, 0);
+
+  /* read those attrs */
+  DB::Object::Read read_op(&op_target);
+  read_op.params.attrs = &readattrs;
+  ret = read_op.prepare(dpp);
+  ASSERT_EQ(ret, 0);
+
+  string val;
+  decode(val, readattrs[RGW_ATTR_ACL]);
+  ASSERT_EQ(val, "ACL");
+  decode(val, readattrs[RGW_ATTR_LC]);
+  ASSERT_EQ(val, "LC");
+  decode(val, readattrs[RGW_ATTR_ETAG]);
+  ASSERT_EQ(val, "ETAG");
+
+  /* Remove some attrs */
+  rmattrs[RGW_ATTR_ACL] = b1;
+  map<string, bufferlist> empty;
+  ret = op_target.set_attrs(dpp, empty, &rmattrs);
+  ASSERT_EQ(ret, 0);
+
+  /* read those attrs */
+  ret = read_op.prepare(dpp);
+  ASSERT_EQ(ret, 0);
+
+  ASSERT_EQ(readattrs.count(RGW_ATTR_ACL), 0); // removed attr must be gone
+  decode(val, readattrs[RGW_ATTR_LC]);
+  ASSERT_EQ(val, "LC");
+  decode(val, readattrs[RGW_ATTR_ETAG]);
+  ASSERT_EQ(val, "ETAG");
+}
+
+TEST_F(DBStoreTest, WriteObject) {
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+  // Overwrite object3 with a 22-byte payload using the test helper
+  // write_object(); ReadObject below verifies this content.
+  params.op.obj.state.obj.key.name = "object3";
+  params.op.obj.state.obj.key.instance = "inst3";
+  DB::Object op_target(db, params.op.bucket.info,
+      params.op.obj.state.obj);
+
+  bufferlist b1;
+  encode("HELLO WORLD - Object3", b1);
+  params.op.obj.head_data = b1;
+  params.op.obj.state.size = 22;
+
+  ret = write_object(dpp, params);
+  ASSERT_EQ(ret, 0);
+}
+
+TEST_F(DBStoreTest, ReadObject) {
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+  map<string, bufferlist> readattrs;
+  params.op.obj.state.obj.key.name = "object3";
+  params.op.obj.state.obj.key.instance = "inst3";
+  uint64_t obj_size;
+  DB::Object op_target(db, params.op.bucket.info,
+      params.op.obj.state.obj);
+  DB::Object::Read read_op(&op_target);
+  read_op.params.attrs = &readattrs;
+  read_op.params.obj_size = &obj_size;
+  ret = read_op.prepare(dpp);
+  ASSERT_EQ(ret, 0);
+
+  // Request 25 bytes from offset 0; read() returns the number of bytes
+  // actually read (25 here, including encode() framing).
+  bufferlist bl;
+  ret = read_op.read(0, 25, bl, dpp);
+  cout<<"XXXXXXXXX Insert bl.length " << bl.length() << "\n";
+  ASSERT_EQ(ret, 25);
+
+  string data;
+  decode(data, bl);
+  ASSERT_EQ(data, "HELLO WORLD - Object3");
+  ASSERT_EQ(obj_size, 22);
+}
+
+TEST_F(DBStoreTest, IterateObject) {
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+  map<string, bufferlist> readattrs;
+  uint64_t obj_size;
+  DBGetDataCB cb;
+
+  // Stream the default object's data through the callback-based
+  // iterate() API for the byte range [0, 15).
+  DB::Object op_target(db, params.op.bucket.info,
+      params.op.obj.state.obj);
+  DB::Object::Read read_op(&op_target);
+  read_op.params.attrs = &readattrs;
+  read_op.params.obj_size = &obj_size;
+  ret = read_op.prepare(dpp);
+  ASSERT_EQ(ret, 0);
+
+  bufferlist bl;
+  ret = read_op.iterate(dpp, 0, 15, &cb);
+  ASSERT_EQ(ret, 0);
+  string data;
+  decode(data, cb.data_bl);
+  cout << "XXXXXXXXXX iterate data is " << data << ", bl_ofs = " << cb.data_ofs << ", bl_len = " << cb.data_len << "\n";
+  ASSERT_EQ(data, "HELLO WORLD");
+  ASSERT_EQ(cb.data_ofs, 0);
+  ASSERT_EQ(cb.data_len, 15);
+}
+
+TEST_F(DBStoreTest, ListBucketObjects) {
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+
+  // Page through the bucket's objects two at a time, resuming from the
+  // list op's next-marker until the listing is no longer truncated.
+  int max = 2;
+  bool is_truncated = false;
+  rgw_obj_key marker1;
+  DB::Bucket target(db, params.op.bucket.info);
+  DB::Bucket::List list_op(&target);
+
+  vector<rgw_bucket_dir_entry> dir_list;
+
+  marker1.name = "";
+  do {
+    is_truncated = false;
+    list_op.params.marker = marker1;
+    ret = list_op.list_objects(dpp, max, &dir_list, nullptr, &is_truncated);
+    ASSERT_EQ(ret, 0);
+
+    cout << "marker1 :" << marker1.name << "\n";
+
+    cout << "is_truncated :" << is_truncated << "\n";
+
+    for (const auto& ent: dir_list) {
+      cls_rgw_obj_key key = ent.key;
+      cout << "###################### \n";
+      cout << "key.name : " << key.name << "\n";
+      cout << "key.instance : " << key.instance << "\n";
+
+      marker1 = list_op.get_next_marker();
+    }
+    dir_list.clear();
+  } while(is_truncated);
+}
+
+TEST_F(DBStoreTest, DeleteObj) {
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+  RGWObjState *s;
+
+  /* delete object2 */
+  params.op.obj.state.obj.key.name = "object2";
+  params.op.obj.state.obj.key.instance = "inst2";
+  DB::Object op_target(db, params.op.bucket.info,
+      params.op.obj.state.obj);
+
+  DB::Object::Delete delete_op(&op_target);
+  ret = delete_op.delete_obj(dpp);
+  ASSERT_EQ(ret, 0);
+
+  /* Fetching the state of a deleted object should fail with -ENOENT
+   * (was the magic constant -2; -ENOENT matches the rest of the file). */
+  ret = op_target.get_state(dpp, &s, false);
+  ASSERT_EQ(ret, -ENOENT);
+}
+
+TEST_F(DBStoreTest, WriteVersionedObject) {
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+  std::string instances[] = {"inst1", "inst2", "inst3"};
+  std::string payloads[] = {"HELLO WORLD", "HELLO WORLD ABC", "HELLO WORLD A"};
+  uint64_t sizes[] = {12, 16, 14};
+  bufferlist b1;
+
+  // Write three versions of "object1", each with a distinct payload and
+  // size; FLAG_CURRENT marks each write as the latest version.
+  // (The unused op_target/write_op locals of the original were removed:
+  // the writes go through the write_object() test helper.)
+  params.op.obj.flags |= rgw_bucket_dir_entry::FLAG_CURRENT;
+  params.op.obj.state.obj.key.name = "object1";
+
+  for (int i = 0; i < 3; i++) {
+    params.op.obj.state.obj.key.instance = instances[i];
+    b1.clear();
+    encode(payloads[i], b1);
+    params.op.obj.head_data = b1;
+    params.op.obj.state.size = sizes[i];
+    ret = write_object(dpp, params);
+    ASSERT_EQ(ret, 0);
+  }
+}
+
+TEST_F(DBStoreTest, ListVersionedObject) {
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+  std::string instances[] = {"inst1", "inst2", "inst3"};
+
+  /* List every version of object1; entries arrive newest-first, so the
+   * instances come back in reverse write order (inst3, inst2, inst1). */
+  params.op.obj.state.obj.key.instance.clear();
+  params.op.list_max_count = MAX_VERSIONED_OBJECTS;
+  ret = db->ProcessOp(dpp, "ListVersionedObjects", &params);
+  ASSERT_EQ(ret, 0);
+
+  int idx = 2;
+  for (const auto& ent : params.op.obj.list_entries) {
+    ASSERT_EQ(ent.key.instance, instances[idx]);
+    idx--;
+  }
+}
+
+TEST_F(DBStoreTest, ReadVersionedObject) {
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+  std::string instances[] = {"inst1", "inst2", "inst3"};
+  std::string data;
+
+  /* read object.. should fetch latest version */
+  RGWObjState* s;
+  params = GlobalParams;
+  params.op.obj.state.obj.key.instance.clear();
+  DB::Object op_target2(db, params.op.bucket.info, params.op.obj.state.obj);
+  ret = op_target2.get_obj_state(dpp, params.op.bucket.info, params.op.obj.state.obj,
+      true, &s);
+  ASSERT_EQ(ret, 0);
+  // Empty instance resolves to the current version (inst3).
+  ASSERT_EQ(s->obj.key.instance, instances[2]);
+  decode(data, s->data);
+  ASSERT_EQ(data, "HELLO WORLD A");
+  ASSERT_EQ(s->size, 14);
+
+  /* read a particular non-current version */
+  params.op.obj.state.obj.key.instance = instances[1];
+  DB::Object op_target3(db, params.op.bucket.info, params.op.obj.state.obj);
+  ret = op_target3.get_obj_state(dpp, params.op.bucket.info, params.op.obj.state.obj,
+      true, &s);
+  ASSERT_EQ(ret, 0);
+  decode(data, s->data);
+  ASSERT_EQ(data, "HELLO WORLD ABC");
+  ASSERT_EQ(s->size, 16);
+}
+
+TEST_F(DBStoreTest, DeleteVersionedObject) {
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+  std::string instances[] = {"inst1", "inst2", "inst3"};
+  std::string data;
+  std::string dm_instance;
+  int i = 0;
+
+  /* Delete object..should create delete marker */
+  params.op.obj.state.obj.key.instance.clear();
+  DB::Object op_target(db, params.op.bucket.info, params.op.obj.state.obj);
+  DB::Object::Delete delete_op(&op_target);
+  delete_op.params.versioning_status |= BUCKET_VERSIONED;
+
+  ret = delete_op.delete_obj(dpp);
+  ASSERT_EQ(ret, 0);
+
+  /* list versioned objects: newest entry must be the delete marker */
+  params = GlobalParams;
+  params.op.obj.state.obj.key.instance.clear();
+  params.op.list_max_count = MAX_VERSIONED_OBJECTS;
+  ret = db->ProcessOp(dpp, "ListVersionedObjects", &params);
+  ASSERT_EQ(ret, 0); /* was previously unchecked */
+
+  i = 3;
+  for (auto ent: params.op.obj.list_entries) {
+    string is_delete_marker = (ent.flags & rgw_bucket_dir_entry::FLAG_DELETE_MARKER)? "true" : "false";
+    cout << "ent.name: " << ent.key.name << ". ent.instance: " << ent.key.instance << " is_delete_marker = " << is_delete_marker << "\n";
+
+    if (i == 3) {
+      ASSERT_EQ(is_delete_marker, "true");
+      dm_instance = ent.key.instance;
+    } else {
+      ASSERT_EQ(is_delete_marker, "false");
+      ASSERT_EQ(ent.key.instance, instances[i]);
+    }
+
+    i--;
+  }
+
+  /* read object.. should return -ENOENT */
+  RGWObjState* s;
+  params = GlobalParams;
+  params.op.obj.state.obj.key.instance.clear();
+  DB::Object op_target2(db, params.op.bucket.info, params.op.obj.state.obj);
+  ret = op_target2.get_obj_state(dpp, params.op.bucket.info, params.op.obj.state.obj,
+      true, &s);
+  ASSERT_EQ(ret, -ENOENT);
+
+  /* Delete delete marker..should be able to read object now */
+  params.op.obj.state.obj.key.instance = dm_instance;
+  DB::Object op_target3(db, params.op.bucket.info, params.op.obj.state.obj);
+  DB::Object::Delete delete_op2(&op_target3);
+  delete_op2.params.versioning_status |= BUCKET_VERSIONED;
+
+  ret = delete_op2.delete_obj(dpp);
+  ASSERT_EQ(ret, 0);
+
+  /* read object.. should fetch latest version */
+  params = GlobalParams;
+  params.op.obj.state.obj.key.instance.clear();
+  DB::Object op_target4(db, params.op.bucket.info, params.op.obj.state.obj);
+  ret = op_target4.get_obj_state(dpp, params.op.bucket.info, params.op.obj.state.obj,
+      true, &s);
+  ASSERT_EQ(ret, 0); /* was unchecked before dereferencing s below */
+  ASSERT_EQ(s->obj.key.instance, instances[2]);
+  decode(data, s->data);
+  ASSERT_EQ(data, "HELLO WORLD A");
+  ASSERT_EQ(s->size, 14);
+
+  /* delete latest version using version-id. Next version should get promoted */
+  params.op.obj.state.obj.key.instance = instances[2];
+  DB::Object op_target5(db, params.op.bucket.info, params.op.obj.state.obj);
+  DB::Object::Delete delete_op3(&op_target5);
+  delete_op3.params.versioning_status |= BUCKET_VERSIONED;
+
+  ret = delete_op3.delete_obj(dpp);
+  ASSERT_EQ(ret, 0);
+
+  /* list versioned objects..only two versions should be present
+   * with second version marked as CURRENT */
+  params = GlobalParams;
+  params.op.obj.state.obj.key.instance.clear();
+  params.op.list_max_count = MAX_VERSIONED_OBJECTS;
+  ret = db->ProcessOp(dpp, "ListVersionedObjects", &params);
+  ASSERT_EQ(ret, 0); /* was previously unchecked */
+
+  i = 1;
+  for (auto ent: params.op.obj.list_entries) {
+    if (i == 1) {
+      dm_instance = ent.key.instance;
+    } else {
+      ASSERT_EQ(ent.key.instance, instances[i]);
+    }
+    i--;
+  }
+}
+
+TEST_F(DBStoreTest, ObjectOmapSetVal) {
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+
+  DB::Object op_target(db, params.op.bucket.info,
+      params.op.obj.state.obj);
+
+  // Store four omap entries ("part1".."part4"), each mapping to
+  // "<key>_val". The original repeated the same four statements verbatim.
+  for (int i = 1; i <= 4; i++) {
+    string key = "part" + std::to_string(i);
+    bufferlist bl;
+    encode(key + "_val", bl);
+    ret = op_target.obj_omap_set_val_by_key(dpp, key, bl, false);
+    ASSERT_EQ(ret, 0);
+  }
+}
+
+TEST_F(DBStoreTest, ObjectOmapGetValsByKeys) {
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+  std::set<std::string> keys;
+  std::map<std::string, bufferlist> vals;
+
+  // Fetch a subset (part2, part4) of the omap entries written by
+  // ObjectOmapSetVal using the keyed-lookup API.
+  DB::Object op_target(db, params.op.bucket.info,
+      params.op.obj.state.obj);
+
+  keys.insert("part2");
+  keys.insert("part4");
+
+  ret = op_target.obj_omap_get_vals_by_keys(dpp, "", keys, &vals);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(vals.size(), 2);
+
+  string val;
+  decode(val, vals["part2"]);
+  ASSERT_EQ(val, "part2_val");
+  decode(val, vals["part4"]);
+  ASSERT_EQ(val, "part4_val");
+}
+
+TEST_F(DBStoreTest, ObjectOmapGetAll) {
+  struct DBOpParams params = GlobalParams;
+  std::map<std::string, bufferlist> vals;
+
+  DB::Object op_target(db, params.op.bucket.info,
+      params.op.obj.state.obj);
+
+  // All four omap entries written by ObjectOmapSetVal must come back.
+  int ret = op_target.obj_omap_get_all(dpp, &vals);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(vals.size(), 4);
+
+  for (int i = 1; i <= 4; i++) {
+    string key = "part" + std::to_string(i);
+    string val;
+    decode(val, vals[key]);
+    ASSERT_EQ(val, key + "_val");
+  }
+}
+
+TEST_F(DBStoreTest, ObjectOmapGetVals) {
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+  std::map<std::string, bufferlist> vals;
+  bool pmore;
+
+  DB::Object op_target(db, params.op.bucket.info,
+      params.op.obj.state.obj);
+
+  // Fetch up to 10 omap values starting at marker "part3"; only part3 and
+  // part4 remain past the marker. (Unused local `keys` set was removed.)
+  ret = op_target.obj_omap_get_vals(dpp, "part3", 10, &vals, &pmore);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(vals.size(), 2);
+
+  string val;
+  decode(val, vals["part3"]);
+  ASSERT_EQ(val, "part3_val");
+  decode(val, vals["part4"]);
+  ASSERT_EQ(val, "part4_val");
+}
+
+TEST_F(DBStoreTest, PutObjectData) {
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+
+  // Write one data chunk (part 1, offset 10, multipart part "2") for the
+  // default object; GetObjectData reads it back below.
+  params.op.obj_data.part_num = 1;
+  params.op.obj_data.offset = 10;
+  params.op.obj_data.multipart_part_str = "2";
+  bufferlist b1;
+  encode("HELLO WORLD", b1);
+  params.op.obj_data.data = b1;
+  params.op.obj_data.size = 12;
+  params.op.obj.state.mtime = real_clock::now();
+  ret = db->ProcessOp(dpp, "PutObjectData", &params);
+  ASSERT_EQ(ret, 0);
+}
+
+TEST_F(DBStoreTest, UpdateObjectData) {
+  // Refresh the mtime on the object-data row written by PutObjectData;
+  // GetObjectData asserts this mtime afterwards.
+  struct DBOpParams params = GlobalParams;
+
+  params.op.obj.state.mtime = bucket_mtime;
+  int ret = db->ProcessOp(dpp, "UpdateObjectData", &params);
+  ASSERT_EQ(ret, 0);
+}
+
+TEST_F(DBStoreTest, GetObjectData) {
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+
+  // Read back the data chunk written by PutObjectData and verify the
+  // part metadata, key, mtime (set by UpdateObjectData) and payload.
+  params.op.obj.state.obj.key.instance = "inst1";
+  params.op.obj.state.obj.key.name = "object1";
+  ret = db->ProcessOp(dpp, "GetObjectData", &params);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(params.op.obj_data.part_num, 1);
+  ASSERT_EQ(params.op.obj_data.offset, 10);
+  ASSERT_EQ(params.op.obj_data.multipart_part_str, "2");
+  ASSERT_EQ(params.op.obj.state.obj.key.instance, "inst1");
+  ASSERT_EQ(params.op.obj.state.obj.key.name, "object1");
+  ASSERT_EQ(params.op.obj.state.mtime, bucket_mtime);
+  string data;
+  decode(data, params.op.obj_data.data);
+  ASSERT_EQ(data, "HELLO WORLD");
+}
+
+TEST_F(DBStoreTest, DeleteObjectData) {
+  // Drop the object-data rows for the default object.
+  struct DBOpParams params = GlobalParams;
+
+  int ret = db->ProcessOp(dpp, "DeleteObjectData", &params);
+  ASSERT_EQ(ret, 0);
+}
+
+TEST_F(DBStoreTest, DeleteObject) {
+  // Drop the default object row itself.
+  struct DBOpParams params = GlobalParams;
+
+  int ret = db->ProcessOp(dpp, "DeleteObject", &params);
+  ASSERT_EQ(ret, 0);
+}
+
+TEST_F(DBStoreTest, LCTables) {
+  // Create the lifecycle (LC) tables; a non-negative return means success.
+  // (Unused local DBOpParams copy was removed.)
+  int ret = db->createLCTables(dpp);
+  ASSERT_GE(ret, 0);
+}
+
+TEST_F(DBStoreTest, LCHead) {
+  int ret = -1;
+  std::string index1 = "bucket1";
+  std::string index2 = "bucket2";
+  time_t lc_time = ceph_clock_now();
+  std::unique_ptr<rgw::sal::Lifecycle::LCHead> head;
+  std::string ents[] = {"entry1", "entry2", "entry3"};
+  rgw::sal::StoreLifecycle::StoreLCHead head1(lc_time, 0, ents[0]);
+  rgw::sal::StoreLifecycle::StoreLCHead head2(lc_time, 0, ents[1]);
+  rgw::sal::StoreLifecycle::StoreLCHead head3(lc_time, 0, ents[2]);
+
+  // Store one LC head per index, read both back, then overwrite index1
+  // and confirm the update took. (Unused DBOpParams local removed.)
+  ret = db->put_head(index1, head1);
+  ASSERT_EQ(ret, 0);
+  ret = db->put_head(index2, head2);
+  ASSERT_EQ(ret, 0);
+
+  ret = db->get_head(index1, &head);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(head->get_marker(), "entry1");
+
+  ret = db->get_head(index2, &head);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(head->get_marker(), "entry2");
+
+  // update index1
+  ret = db->put_head(index1, head3);
+  ASSERT_EQ(ret, 0);
+  ret = db->get_head(index1, &head);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(head->get_marker(), "entry3");
+}
+TEST_F(DBStoreTest, LCEntry) {
+  int ret = -1;
+  uint64_t lc_time = ceph_clock_now();
+  std::string index1 = "lcindex1";
+  std::string index2 = "lcindex2";
+  typedef enum {lc_uninitial = 1, lc_complete} status;
+  std::string ents[] = {"bucket1", "bucket2", "bucket3", "bucket4"};
+  std::unique_ptr<rgw::sal::Lifecycle::LCEntry> entry;
+  rgw::sal::StoreLifecycle::StoreLCEntry entry1(ents[0], lc_time, lc_uninitial);
+  rgw::sal::StoreLifecycle::StoreLCEntry entry2(ents[1], lc_time, lc_uninitial);
+  rgw::sal::StoreLifecycle::StoreLCEntry entry3(ents[2], lc_time, lc_uninitial);
+  rgw::sal::StoreLifecycle::StoreLCEntry entry4(ents[3], lc_time, lc_uninitial);
+
+  vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>> lc_entries;
+
+  // Exercise set/get/get_next/list/rm on LC entries across two indexes.
+  // (Unused DBOpParams local removed.)
+  ret = db->set_entry(index1, entry1);
+  ASSERT_EQ(ret, 0);
+  ret = db->set_entry(index1, entry2);
+  ASSERT_EQ(ret, 0);
+  ret = db->set_entry(index1, entry3);
+  ASSERT_EQ(ret, 0);
+  ret = db->set_entry(index2, entry4);
+  ASSERT_EQ(ret, 0);
+
+  // get entry index1, entry1
+  ret = db->get_entry(index1, ents[0], &entry);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(entry->get_status(), lc_uninitial);
+  ASSERT_EQ(entry->get_start_time(), lc_time);
+
+  // get next entry index1, entry2
+  ret = db->get_next_entry(index1, ents[1], &entry);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(entry->get_bucket(), ents[2]);
+  ASSERT_EQ(entry->get_status(), lc_uninitial);
+  ASSERT_EQ(entry->get_start_time(), lc_time);
+
+  // update entry4 to entry5
+  entry4.status = lc_complete;
+  ret = db->set_entry(index2, entry4);
+  ASSERT_EQ(ret, 0);
+  ret = db->get_entry(index2, ents[3], &entry);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(entry->get_status(), lc_complete);
+
+  // list entries
+  ret = db->list_entries(index1, "", 5, lc_entries);
+  ASSERT_EQ(ret, 0);
+  for (const auto& ent: lc_entries) {
+    cout << "###################### \n";
+    cout << "lc entry.bucket : " << ent->get_bucket() << "\n";
+    cout << "lc entry.status : " << ent->get_status() << "\n";
+  }
+
+  // remove index1, entry3
+  ret = db->rm_entry(index1, entry3);
+  ASSERT_EQ(ret, 0);
+
+  // get next entry index1, entry2.. should be null
+  entry.reset(); // was entry.release(), which leaked the owned LCEntry
+  ret = db->get_next_entry(index1, ents[1], &entry);
+  ASSERT_EQ(ret, 0);
+  ASSERT_EQ(entry.get(), nullptr);
+}
+
+TEST_F(DBStoreTest, RemoveBucket) {
+  // Drop the bucket row via the low-level "RemoveBucket" op.
+  struct DBOpParams params = GlobalParams;
+
+  int ret = db->ProcessOp(dpp, "RemoveBucket", &params);
+  ASSERT_EQ(ret, 0);
+}
+
+TEST_F(DBStoreTest, RemoveUser) {
+  // Drop the user row via the low-level "RemoveUser" op.
+  struct DBOpParams params = GlobalParams;
+
+  int ret = db->ProcessOp(dpp, "RemoveUser", &params);
+  ASSERT_EQ(ret, 0);
+}
+
+TEST_F(DBStoreTest, InsertTestIDUser) {
+  struct DBOpParams params = GlobalParams;
+  int ret = -1;
+
+  // Insert the well-known "testid" user (the standard Ceph vstart/s3test
+  // credentials) so external s3 test suites can run against this store.
+  params.op.user.uinfo.user_id.id = "testid";
+  params.op.user.uinfo.display_name = "M. Tester";
+  params.op.user.uinfo.user_id.tenant = "tenant";
+  params.op.user.uinfo.user_email = "tester@ceph.com";
+  RGWAccessKey k1("0555b35654ad1656d804", "h7GhxuBLTrlhVUyxSPUKUV8r/2EI4ngqJxD7iBdBYLhwluN30JaT3Q==");
+  params.op.user.uinfo.access_keys["0555b35654ad1656d804"] = k1;
+  params.op.user.user_version.ver = 1;
+  params.op.user.user_version.tag = "UserTAG";
+
+  ret = db->ProcessOp(dpp, "InsertUser", &params);
+  ASSERT_EQ(ret, 0);
+}
+
+int main(int argc, char **argv)
+{
+  // Optional CLI override, format: ./dbstore-tests logfile loglevel
+  string c_logfile = "rgw_dbstore_tests.log";
+  int c_loglevel = 20;
+
+  if (argc == 3) {
+    c_logfile = argv[1];
+    c_loglevel = atoi(argv[2]);
+    cout << "logfile:" << c_logfile << ", loglevel set to " << c_loglevel << "\n";
+  }
+
+  ::testing::InitGoogleTest(&argc, argv);
+
+  // Hand the log settings to the global test environment before running.
+  gtest::env = new gtest::Environment();
+  gtest::env->logfile = c_logfile;
+  gtest::env->loglevel = c_loglevel;
+  ::testing::AddGlobalTestEnvironment(gtest::env);
+
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rgw/driver/immutable_config/store.cc b/src/rgw/driver/immutable_config/store.cc
new file mode 100644
index 000000000..8d3e0765f
--- /dev/null
+++ b/src/rgw/driver/immutable_config/store.cc
@@ -0,0 +1,422 @@
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "rgw_zone.h"
+#include "store.h"
+
+namespace rgw::sal {
+
+// Copy the supplied zonegroup/zone/period configuration into const
+// members; the store serves these fixed copies for its whole lifetime.
+ImmutableConfigStore::ImmutableConfigStore(const RGWZoneGroup& zonegroup,
+                                           const RGWZoneParams& zone,
+                                           const RGWPeriodConfig& period_config)
+  : zonegroup(zonegroup), zone(zone), period_config(period_config)
+{
+}
+
+// Realm
+int ImmutableConfigStore::write_default_realm_id(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ std::string_view realm_id)
+{
+ return -EROFS;
+}
+
+int ImmutableConfigStore::read_default_realm_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string& realm_id)
+{
+ return -ENOENT;
+}
+
+int ImmutableConfigStore::delete_default_realm_id(const DoutPrefixProvider* dpp,
+ optional_yield y)
+{
+ return -EROFS;
+}
+
+
+int ImmutableConfigStore::create_realm(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ const RGWRealm& info,
+ std::unique_ptr<RealmWriter>* writer)
+{
+ return -EROFS;
+}
+
+int ImmutableConfigStore::read_realm_by_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id,
+ RGWRealm& info,
+ std::unique_ptr<RealmWriter>* writer)
+{
+ return -ENOENT;
+}
+
+int ImmutableConfigStore::read_realm_by_name(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_name,
+ RGWRealm& info,
+ std::unique_ptr<RealmWriter>* writer)
+{
+ return -ENOENT;
+}
+
+int ImmutableConfigStore::read_default_realm(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ RGWRealm& info,
+ std::unique_ptr<RealmWriter>* writer)
+{
+ return -ENOENT;
+}
+
+int ImmutableConfigStore::read_realm_id(const DoutPrefixProvider* dpp,
+ optional_yield y, std::string_view realm_name,
+ std::string& realm_id)
+{
+ return -ENOENT;
+}
+
+int ImmutableConfigStore::realm_notify_new_period(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ const RGWPeriod& period)
+{
+ return -ENOTSUP;
+}
+
+// An immutable store holds no realms, so listing always succeeds with
+// an empty page and a cleared continuation marker.
+int ImmutableConfigStore::list_realm_names(const DoutPrefixProvider* dpp,
+                                           optional_yield y, const std::string& marker,
+                                           std::span<std::string> entries,
+                                           ListResult<std::string>& result)
+{
+  result.next.clear();
+  result.entries = entries.first(0);
+  return 0;
+}
+
+
+// Period
+int ImmutableConfigStore::create_period(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ const RGWPeriod& info)
+{
+ return -EROFS;
+}
+
+int ImmutableConfigStore::read_period(const DoutPrefixProvider* dpp,
+ optional_yield y, std::string_view period_id,
+ std::optional<uint32_t> epoch, RGWPeriod& info)
+{
+ return -ENOENT;
+}
+
+int ImmutableConfigStore::delete_period(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view period_id)
+{
+ return -EROFS;
+}
+
+int ImmutableConfigStore::list_period_ids(const DoutPrefixProvider* dpp,
+ optional_yield y, const std::string& marker,
+ std::span<std::string> entries,
+ ListResult<std::string>& result)
+{
+ result.next.clear();
+ result.entries = entries.first(0);
+ return 0;
+}
+
+
+// ZoneGroup
+
+// Writer handed back from the zonegroup read paths; every mutation
+// fails with -EROFS because the backing store is read-only.
+class ImmutableZoneGroupWriter : public ZoneGroupWriter {
+ public:
+  int write(const DoutPrefixProvider* dpp, optional_yield y,
+            const RGWZoneGroup& info) override
+  {
+    return -EROFS;
+  }
+  int rename(const DoutPrefixProvider* dpp, optional_yield y,
+             RGWZoneGroup& info, std::string_view new_name) override
+  {
+    return -EROFS;
+  }
+  int remove(const DoutPrefixProvider* dpp, optional_yield y) override
+  {
+    return -EROFS;
+  }
+};
+
+int ImmutableConfigStore::write_default_zonegroup_id(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ std::string_view realm_id,
+ std::string_view zonegroup_id)
+{
+ return -EROFS;
+}
+
+// Serve the store's single zonegroup id as the default, but only for
+// the default (empty) realm; a named realm has no default zonegroup.
+int ImmutableConfigStore::read_default_zonegroup_id(const DoutPrefixProvider* dpp,
+                                                    optional_yield y,
+                                                    std::string_view realm_id,
+                                                    std::string& zonegroup_id)
+{
+  if (!realm_id.empty()) {
+    return -ENOENT;
+  }
+  zonegroup_id = zonegroup.id;
+  return 0;
+}
+
+int ImmutableConfigStore::delete_default_zonegroup_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id)
+{
+ return -EROFS;
+}
+
+
+int ImmutableConfigStore::create_zonegroup(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ const RGWZoneGroup& info,
+ std::unique_ptr<ZoneGroupWriter>* writer)
+{
+ return -EROFS;
+}
+
+int ImmutableConfigStore::read_zonegroup_by_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view zonegroup_id,
+ RGWZoneGroup& info,
+ std::unique_ptr<ZoneGroupWriter>* writer)
+{
+ if (zonegroup_id != zonegroup.id) {
+ return -ENOENT;
+ }
+
+ info = zonegroup;
+
+ if (writer) {
+ *writer = std::make_unique<ImmutableZoneGroupWriter>();
+ }
+ return 0;
+}
+int ImmutableConfigStore::read_zonegroup_by_name(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view zonegroup_name,
+ RGWZoneGroup& info,
+ std::unique_ptr<ZoneGroupWriter>* writer)
+{
+ if (zonegroup_name != zonegroup.name) {
+ return -ENOENT;
+ }
+
+ info = zonegroup;
+
+ if (writer) {
+ *writer = std::make_unique<ImmutableZoneGroupWriter>();
+ }
+ return 0;
+}
+
+int ImmutableConfigStore::read_default_zonegroup(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id,
+ RGWZoneGroup& info,
+ std::unique_ptr<ZoneGroupWriter>* writer)
+{
+ info = zonegroup;
+
+ if (writer) {
+ *writer = std::make_unique<ImmutableZoneGroupWriter>();
+ }
+ return 0;
+}
+
+// List the single zonegroup name.  `marker` implements keyset paging:
+// the name is returned only when the marker sorts strictly before it,
+// so a follow-up call passing result.next yields an empty final page.
+int ImmutableConfigStore::list_zonegroup_names(const DoutPrefixProvider* dpp,
+                                               optional_yield y, const std::string& marker,
+                                               std::span<std::string> entries,
+                                               ListResult<std::string>& result)
+{
+  if (marker < zonegroup.name) {
+    entries[0] = zonegroup.name;
+    result.next = zonegroup.name;
+    result.entries = entries.first(1);
+  } else {
+    result.next.clear();
+    result.entries = entries.first(0);
+  }
+  return 0;
+}
+
+// Zone
+
+// Writer handed back from the zone read paths; every mutation fails
+// with -EROFS because the backing store is read-only.
+class ImmutableZoneWriter : public ZoneWriter {
+ public:
+  int write(const DoutPrefixProvider* dpp, optional_yield y,
+            const RGWZoneParams& info) override
+  {
+    return -EROFS;
+  }
+  int rename(const DoutPrefixProvider* dpp, optional_yield y,
+             RGWZoneParams& info, std::string_view new_name) override
+  {
+    return -EROFS;
+  }
+  int remove(const DoutPrefixProvider* dpp, optional_yield y) override
+  {
+    return -EROFS;
+  }
+};
+
+int ImmutableConfigStore::write_default_zone_id(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ std::string_view realm_id,
+ std::string_view zone_id)
+{
+ return -EROFS;
+}
+
+// Serve the store's single zone id as the default, but only for the
+// default (empty) realm; a named realm has no default zone.
+//
+// Bug fix: the condition was inverted (`realm_id.empty()` returned
+// -ENOENT), which rejected the default realm and served a default zone
+// for arbitrary realms.  Every sibling accessor in this file
+// (read_default_zonegroup_id, read_default_zone, read_period_config)
+// gates on !realm_id.empty(); this now matches them.
+int ImmutableConfigStore::read_default_zone_id(const DoutPrefixProvider* dpp,
+                                               optional_yield y,
+                                               std::string_view realm_id,
+                                               std::string& zone_id)
+{
+  if (!realm_id.empty()) {
+    return -ENOENT;
+  }
+  zone_id = zone.id;
+  return 0;
+}
+
+int ImmutableConfigStore::delete_default_zone_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id)
+{
+ return -EROFS;
+}
+
+
+int ImmutableConfigStore::create_zone(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ const RGWZoneParams& info,
+ std::unique_ptr<ZoneWriter>* writer)
+{
+ return -EROFS;
+}
+
+int ImmutableConfigStore::read_zone_by_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view zone_id,
+ RGWZoneParams& info,
+ std::unique_ptr<ZoneWriter>* writer)
+{
+ if (zone_id != zone.id) {
+ return -ENOENT;
+ }
+
+ info = zone;
+
+ if (writer) {
+ *writer = std::make_unique<ImmutableZoneWriter>();
+ }
+ return 0;
+}
+
+int ImmutableConfigStore::read_zone_by_name(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view zone_name,
+ RGWZoneParams& info,
+ std::unique_ptr<ZoneWriter>* writer)
+{
+ if (zone_name != zone.name) {
+ return -ENOENT;
+ }
+
+ info = zone;
+
+ if (writer) {
+ *writer = std::make_unique<ImmutableZoneWriter>();
+ }
+ return 0;
+}
+
+// Return the store's single zone as the default zone of the default
+// (empty) realm; named realms have none.  A read-only writer is handed
+// back only when the caller asked for one.
+int ImmutableConfigStore::read_default_zone(const DoutPrefixProvider* dpp,
+                                            optional_yield y,
+                                            std::string_view realm_id,
+                                            RGWZoneParams& info,
+                                            std::unique_ptr<ZoneWriter>* writer)
+{
+  if (!realm_id.empty()) {
+    return -ENOENT;
+  }
+
+  info = zone;
+
+  if (writer) {
+    *writer = std::make_unique<ImmutableZoneWriter>();
+  }
+  return 0;
+}
+
+// List the single zone name.  Same keyset paging as
+// list_zonegroup_names: return the name only while the marker sorts
+// strictly before it, otherwise an empty page.
+int ImmutableConfigStore::list_zone_names(const DoutPrefixProvider* dpp,
+                                          optional_yield y, const std::string& marker,
+                                          std::span<std::string> entries,
+                                          ListResult<std::string>& result)
+{
+  if (marker < zone.name) {
+    entries[0] = zone.name;
+    result.next = zone.name;
+    result.entries = entries.first(1);
+  } else {
+    result.next.clear();
+    result.entries = entries.first(0);
+  }
+  return 0;
+}
+
+
+// PeriodConfig
+// Serve the fixed period configuration, but only for the default
+// (empty) realm; named realms have none.
+int ImmutableConfigStore::read_period_config(const DoutPrefixProvider* dpp,
+                                             optional_yield y,
+                                             std::string_view realm_id,
+                                             RGWPeriodConfig& info)
+{
+  if (!realm_id.empty()) {
+    return -ENOENT;
+  }
+
+  info = period_config;
+  return 0;
+}
+
+int ImmutableConfigStore::write_period_config(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ std::string_view realm_id,
+ const RGWPeriodConfig& info)
+{
+ return -EROFS;
+}
+
+
+/// ImmutableConfigStore factory function
+auto create_immutable_config_store(const DoutPrefixProvider* dpp,
+ const RGWZoneGroup& zonegroup,
+ const RGWZoneParams& zone,
+ const RGWPeriodConfig& period_config)
+ -> std::unique_ptr<ConfigStore>
+{
+ return std::make_unique<ImmutableConfigStore>(zonegroup, zone, period_config);
+}
+
+} // namespace rgw::sal
diff --git a/src/rgw/driver/immutable_config/store.h b/src/rgw/driver/immutable_config/store.h
new file mode 100644
index 000000000..9a1ac5f14
--- /dev/null
+++ b/src/rgw/driver/immutable_config/store.h
@@ -0,0 +1,180 @@
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "rgw_sal_config.h"
+
+namespace rgw::sal {
+
+/// A read-only ConfigStore that serves the given default zonegroup and zone.
+class ImmutableConfigStore : public ConfigStore {
+ public:
+ explicit ImmutableConfigStore(const RGWZoneGroup& zonegroup,
+ const RGWZoneParams& zone,
+ const RGWPeriodConfig& period_config);
+
+ // Realm
+ virtual int write_default_realm_id(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ std::string_view realm_id) override;
+ virtual int read_default_realm_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string& realm_id) override;
+ virtual int delete_default_realm_id(const DoutPrefixProvider* dpp,
+ optional_yield y) override;
+
+ virtual int create_realm(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ const RGWRealm& info,
+ std::unique_ptr<RealmWriter>* writer) override;
+ virtual int read_realm_by_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id,
+ RGWRealm& info,
+ std::unique_ptr<RealmWriter>* writer) override;
+ virtual int read_realm_by_name(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_name,
+ RGWRealm& info,
+ std::unique_ptr<RealmWriter>* writer) override;
+ virtual int read_default_realm(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ RGWRealm& info,
+ std::unique_ptr<RealmWriter>* writer) override;
+ virtual int read_realm_id(const DoutPrefixProvider* dpp,
+ optional_yield y, std::string_view realm_name,
+ std::string& realm_id) override;
+ virtual int realm_notify_new_period(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ const RGWPeriod& period) override;
+ virtual int list_realm_names(const DoutPrefixProvider* dpp,
+ optional_yield y, const std::string& marker,
+ std::span<std::string> entries,
+ ListResult<std::string>& result) override;
+
+ // Period
+ virtual int create_period(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ const RGWPeriod& info) override;
+ virtual int read_period(const DoutPrefixProvider* dpp,
+ optional_yield y, std::string_view period_id,
+ std::optional<uint32_t> epoch, RGWPeriod& info) override;
+ virtual int delete_period(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view period_id) override;
+ virtual int list_period_ids(const DoutPrefixProvider* dpp,
+ optional_yield y, const std::string& marker,
+ std::span<std::string> entries,
+ ListResult<std::string>& result) override;
+
+ // ZoneGroup
+ virtual int write_default_zonegroup_id(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ std::string_view realm_id,
+ std::string_view zonegroup_id) override;
+ virtual int read_default_zonegroup_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id,
+ std::string& zonegroup_id) override;
+ virtual int delete_default_zonegroup_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id) override;
+
+ virtual int create_zonegroup(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ const RGWZoneGroup& info,
+ std::unique_ptr<ZoneGroupWriter>* writer) override;
+ virtual int read_zonegroup_by_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view zonegroup_id,
+ RGWZoneGroup& info,
+ std::unique_ptr<ZoneGroupWriter>* writer) override;
+ virtual int read_zonegroup_by_name(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view zonegroup_name,
+ RGWZoneGroup& info,
+ std::unique_ptr<ZoneGroupWriter>* writer) override;
+ virtual int read_default_zonegroup(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id,
+ RGWZoneGroup& info,
+ std::unique_ptr<ZoneGroupWriter>* writer) override;
+ virtual int list_zonegroup_names(const DoutPrefixProvider* dpp,
+ optional_yield y, const std::string& marker,
+ std::span<std::string> entries,
+ ListResult<std::string>& result) override;
+
+ // Zone
+ virtual int write_default_zone_id(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ std::string_view realm_id,
+ std::string_view zone_id) override;
+ virtual int read_default_zone_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id,
+ std::string& zone_id) override;
+ virtual int delete_default_zone_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id) override;
+
+ virtual int create_zone(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ const RGWZoneParams& info,
+ std::unique_ptr<ZoneWriter>* writer) override;
+ virtual int read_zone_by_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view zone_id,
+ RGWZoneParams& info,
+ std::unique_ptr<ZoneWriter>* writer) override;
+ virtual int read_zone_by_name(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view zone_name,
+ RGWZoneParams& info,
+ std::unique_ptr<ZoneWriter>* writer) override;
+ virtual int read_default_zone(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id,
+ RGWZoneParams& info,
+ std::unique_ptr<ZoneWriter>* writer) override;
+ virtual int list_zone_names(const DoutPrefixProvider* dpp,
+ optional_yield y, const std::string& marker,
+ std::span<std::string> entries,
+ ListResult<std::string>& result) override;
+
+ // PeriodConfig
+ virtual int read_period_config(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id,
+ RGWPeriodConfig& info) override;
+ virtual int write_period_config(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ std::string_view realm_id,
+ const RGWPeriodConfig& info) override;
+
+ private:
+ const RGWZoneGroup zonegroup;
+ const RGWZoneParams zone;
+ const RGWPeriodConfig period_config;
+}; // ImmutableConfigStore
+
+
+/// ImmutableConfigStore factory function
+auto create_immutable_config_store(const DoutPrefixProvider* dpp,
+ const RGWZoneGroup& zonegroup,
+ const RGWZoneParams& zone,
+ const RGWPeriodConfig& period_config)
+ -> std::unique_ptr<ConfigStore>;
+
+} // namespace rgw::sal
diff --git a/src/rgw/driver/json_config/store.cc b/src/rgw/driver/json_config/store.cc
new file mode 100644
index 000000000..cf5adda25
--- /dev/null
+++ b/src/rgw/driver/json_config/store.cc
@@ -0,0 +1,177 @@
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <system_error>
+#include "include/buffer.h"
+#include "common/errno.h"
+#include "common/ceph_json.h"
+#include "rgw_zone.h"
+#include "driver/immutable_config/store.h"
+#include "store.h"
+
+namespace rgw::sal {
+
+namespace {
+
+// Aggregate decoded from the top-level JSON object of the config file:
+//   {"zonegroup": {...}, "zone": {...}, "period_config": {...}}
+// Missing keys leave the corresponding member default-constructed.
+struct DecodedConfig {
+  RGWZoneGroup zonegroup;
+  RGWZoneParams zone;
+  RGWPeriodConfig period_config;
+
+  void decode_json(JSONObj *obj)
+  {
+    JSONDecoder::decode_json("zonegroup", zonegroup, obj);
+    JSONDecoder::decode_json("zone", zone, obj);
+    JSONDecoder::decode_json("period_config", period_config, obj);
+  }
+};
+
+// Read and JSON-decode `filename` into `config`.  Throws
+// std::system_error on read, parse, or decode failure.
+//
+// Bug fix: parse_config previously decoded into a function-local
+// DecodedConfig and discarded it, so create_json_config_store went on
+// to sanity-check and serve a default-constructed config -- the file's
+// contents were ignored entirely.  The decoded result is now returned
+// through an out-parameter.
+static void parse_config(const DoutPrefixProvider* dpp, const char* filename,
+                         DecodedConfig& config)
+{
+  bufferlist bl;
+  std::string errmsg;
+  int r = bl.read_file(filename, &errmsg);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "failed to read json config file '" << filename
+        << "': " << errmsg << dendl;
+    throw std::system_error(-r, std::system_category());
+  }
+
+  JSONParser p;
+  if (!p.parse(bl.c_str(), bl.length())) {
+    ldpp_dout(dpp, 0) << "failed to parse json config file" << dendl;
+    throw std::system_error(make_error_code(std::errc::invalid_argument));
+  }
+
+  try {
+    decode_json_obj(config, &p);
+  } catch (const JSONDecoder::err& e) {
+    ldpp_dout(dpp, 0) << "failed to decode JSON input: " << e.what() << dendl;
+    throw std::system_error(make_error_code(std::errc::invalid_argument));
+  }
+}
+
+// Fill in defaults (ids/names, placement, pool names) and verify that
+// the zonegroup and zone agree with each other; throws
+// std::system_error on any inconsistency.
+void sanity_check_config(const DoutPrefixProvider* dpp, DecodedConfig& config)
+{
+  if (config.zonegroup.id.empty()) {
+    config.zonegroup.id = "default";
+  }
+  if (config.zonegroup.name.empty()) {
+    config.zonegroup.name = "default";
+  }
+  if (config.zonegroup.api_name.empty()) {
+    config.zonegroup.api_name = config.zonegroup.name;
+  }
+
+  if (config.zone.id.empty()) {
+    config.zone.id = "default";
+  }
+  if (config.zone.name.empty()) {
+    config.zone.name = "default";
+  }
+
+  // add default placement if it doesn't exist
+  rgw_pool pool;
+  RGWZonePlacementInfo placement;
+  placement.storage_classes.set_storage_class(
+      RGW_STORAGE_CLASS_STANDARD, &pool, nullptr);
+  config.zone.placement_pools.emplace("default-placement",
+                                      std::move(placement));
+
+  std::set<rgw_pool> pools;
+  int r = rgw::init_zone_pool_names(dpp, null_yield, pools, config.zone);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "failed to set default zone pool names" << dendl;
+    throw std::system_error(-r, std::system_category());
+  }
+
+  // verify that config.zonegroup only contains config.zone
+  if (config.zonegroup.zones.size() > 1) {
+    ldpp_dout(dpp, 0) << "zonegroup cannot contain multiple zones" << dendl;
+    throw std::system_error(make_error_code(std::errc::invalid_argument));
+  }
+
+  if (config.zonegroup.zones.size() == 1) {
+    auto z = config.zonegroup.zones.begin();
+    if (z->first != config.zone.id) {
+      ldpp_dout(dpp, 0) << "zonegroup contains unknown zone id="
+          << z->first << dendl;
+      throw std::system_error(make_error_code(std::errc::invalid_argument));
+    }
+    if (z->second.id != config.zone.id) {
+      ldpp_dout(dpp, 0) << "zonegroup contains unknown zone id="
+          << z->second.id << dendl;
+      throw std::system_error(make_error_code(std::errc::invalid_argument));
+    }
+    if (z->second.name != config.zone.name) {
+      ldpp_dout(dpp, 0) << "zonegroup contains unknown zone name="
+          << z->second.name << dendl;
+      throw std::system_error(make_error_code(std::errc::invalid_argument));
+    }
+    if (config.zonegroup.master_zone != config.zone.id) {
+      ldpp_dout(dpp, 0) << "zonegroup contains unknown master_zone="
+          << config.zonegroup.master_zone << dendl;
+      throw std::system_error(make_error_code(std::errc::invalid_argument));
+    }
+  } else {
+    // add the zone to the group as its master, with all supported
+    // zone features enabled
+    const bool is_master = true;
+    const bool read_only = false;
+    std::list<std::string> endpoints;
+    std::list<std::string> sync_from;
+    std::list<std::string> sync_from_rm;
+    rgw::zone_features::set enable_features;
+    rgw::zone_features::set disable_features;
+
+    enable_features.insert(rgw::zone_features::supported.begin(),
+                           rgw::zone_features::supported.end());
+
+    int r = rgw::add_zone_to_group(dpp, config.zonegroup, config.zone,
+                                   &is_master, &read_only, endpoints,
+                                   nullptr, nullptr, sync_from, sync_from_rm,
+                                   nullptr, std::nullopt,
+                                   enable_features, disable_features);
+    if (r < 0) {
+      ldpp_dout(dpp, 0) << "failed to add zone to zonegroup: "
+          << cpp_strerror(r) << dendl;
+      throw std::system_error(-r, std::system_category());
+    }
+
+    config.zonegroup.enabled_features.insert(rgw::zone_features::enabled.begin(),
+                                             rgw::zone_features::enabled.end());
+  }
+
+  // insert the default placement target if it doesn't exist
+  auto target = RGWZoneGroupPlacementTarget{.name = "default-placement"};
+  config.zonegroup.placement_targets.emplace(target.name, target);
+  if (config.zonegroup.default_placement.name.empty()) {
+    config.zonegroup.default_placement.name = target.name;
+  }
+}
+
+} // anonymous namespace
+
+// Parse the given json file, apply defaults/sanity checks, and wrap
+// the result in a read-only ImmutableConfigStore.
+auto create_json_config_store(const DoutPrefixProvider* dpp,
+                              const std::string& filename)
+  -> std::unique_ptr<ConfigStore>
+{
+  DecodedConfig config;
+  parse_config(dpp, filename.c_str(), config);
+  sanity_check_config(dpp, config);
+  return create_immutable_config_store(dpp, config.zonegroup, config.zone,
+                                       config.period_config);
+}
+
+} // namespace rgw::sal
diff --git a/src/rgw/driver/json_config/store.h b/src/rgw/driver/json_config/store.h
new file mode 100644
index 000000000..4482f6716
--- /dev/null
+++ b/src/rgw/driver/json_config/store.h
@@ -0,0 +1,27 @@
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "driver/immutable_config/store.h"
+
+namespace rgw::sal {
+
+/// Create an immutable ConfigStore by parsing the zonegroup and zone from the
+/// given json filename.
+auto create_json_config_store(const DoutPrefixProvider* dpp,
+ const std::string& filename)
+ -> std::unique_ptr<ConfigStore>;
+
+} // namespace rgw::sal
diff --git a/src/rgw/driver/rados/cls_fifo_legacy.cc b/src/rgw/driver/rados/cls_fifo_legacy.cc
new file mode 100644
index 000000000..f5bb485fa
--- /dev/null
+++ b/src/rgw/driver/rados/cls_fifo_legacy.cc
@@ -0,0 +1,2539 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 Red Hat <contact@redhat.com>
+ * Author: Adam C. Emerson
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <algorithm>
+#include <cstdint>
+#include <numeric>
+#include <optional>
+#include <string_view>
+
+#include <fmt/format.h>
+
+#include "include/rados/librados.hpp"
+
+#include "include/buffer.h"
+
+#include "common/async/yield_context.h"
+#include "common/random_string.h"
+
+#include "cls/fifo/cls_fifo_types.h"
+#include "cls/fifo/cls_fifo_ops.h"
+
+#include "cls_fifo_legacy.h"
+
+namespace rgw::cls::fifo {
+namespace cb = ceph::buffer;
+namespace fifo = rados::cls::fifo;
+
+using ceph::from_error_code;
+
+inline constexpr auto MAX_RACE_RETRIES = 10;
+
+// Queue a fifo CREATE_META call on `op`: encode the creation
+// parameters and dispatch them to the FIFO object class.  The op is
+// only staged here; the caller submits it.
+void create_meta(lr::ObjectWriteOperation* op,
+                 std::string_view id,
+                 std::optional<fifo::objv> objv,
+                 std::optional<std::string_view> oid_prefix,
+                 bool exclusive,
+                 std::uint64_t max_part_size,
+                 std::uint64_t max_entry_size)
+{
+  fifo::op::create_meta cm;
+
+  cm.id = id;
+  cm.version = objv;
+  cm.oid_prefix = oid_prefix;
+  cm.max_part_size = max_part_size;
+  cm.max_entry_size = max_entry_size;
+  cm.exclusive = exclusive;
+
+  cb::list in;
+  encode(cm, in);
+  op->exec(fifo::op::CLASS, fifo::op::CREATE_META, in);
+}
+
+// Read the FIFO head object's metadata.  All out-params are optional;
+// null pointers are skipped.  With `probe` set, ENOENT/ENODATA are
+// expected outcomes and are not logged as errors.  Returns 0 on
+// success or a negative error code; a reply that fails to decode is
+// converted through from_error_code.
+int get_meta(const DoutPrefixProvider *dpp, lr::IoCtx& ioctx, const std::string& oid,
+             std::optional<fifo::objv> objv, fifo::info* info,
+             std::uint32_t* part_header_size,
+             std::uint32_t* part_entry_overhead,
+             uint64_t tid, optional_yield y,
+             bool probe)
+{
+  lr::ObjectReadOperation op;
+  fifo::op::get_meta gm;
+  gm.version = objv;
+  cb::list in;
+  encode(gm, in);
+  cb::list bl;
+
+  op.exec(fifo::op::CLASS, fifo::op::GET_META, in,
+          &bl, nullptr);
+  auto r = rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, y);
+  // `try` attaches to the if-branch: decode only on success, and the
+  // `else` below logs only unexpected failures.
+  if (r >= 0) try {
+    fifo::op::get_meta_reply reply;
+    auto iter = bl.cbegin();
+    decode(reply, iter);
+    if (info) *info = std::move(reply.info);
+    if (part_header_size) *part_header_size = reply.part_header_size;
+    if (part_entry_overhead)
+      *part_entry_overhead = reply.part_entry_overhead;
+  } catch (const cb::error& err) {
+    ldpp_dout(dpp, -1)
+      << __PRETTY_FUNCTION__ << ":" << __LINE__
+      << " decode failed: " << err.what()
+      << " tid=" << tid << dendl;
+    r = from_error_code(err.code());
+  } else if (!(probe && (r == -ENOENT || r == -ENODATA))) {
+    ldpp_dout(dpp, -1)
+      << __PRETTY_FUNCTION__ << ":" << __LINE__
+      << " fifo::op::GET_META failed r=" << r << " tid=" << tid
+      << dendl;
+  }
+  return r;
+};
+
+namespace {
+void update_meta(lr::ObjectWriteOperation* op, const fifo::objv& objv,
+ const fifo::update& update)
+{
+ fifo::op::update_meta um;
+
+ um.version = objv;
+ um.tail_part_num = update.tail_part_num();
+ um.head_part_num = update.head_part_num();
+ um.min_push_part_num = update.min_push_part_num();
+ um.max_push_part_num = update.max_push_part_num();
+ um.journal_entries_add = std::move(update).journal_entries_add();
+ um.journal_entries_rm = std::move(update).journal_entries_rm();
+
+ cb::list in;
+ encode(um, in);
+ op->exec(fifo::op::CLASS, fifo::op::UPDATE_META, in);
+}
+
+void part_init(lr::ObjectWriteOperation* op, fifo::data_params params)
+{
+ fifo::op::init_part ip;
+
+ ip.params = params;
+
+ cb::list in;
+ encode(ip, in);
+ op->exec(fifo::op::CLASS, fifo::op::INIT_PART, in);
+}
+
+// Synchronously push a batch of entry buffers to one FIFO part
+// object.  Uses OPERATION_RETURNVEC so the cls op's own return value
+// (`retval`) is recovered even from a compound write; asserts the
+// part object already exists.  Returns the rados error if the operate
+// call failed, otherwise the cls op's return value.
+int push_part(const DoutPrefixProvider *dpp, lr::IoCtx& ioctx, const std::string& oid,
+              std::deque<cb::list> data_bufs, std::uint64_t tid,
+              optional_yield y)
+{
+  lr::ObjectWriteOperation op;
+  fifo::op::push_part pp;
+
+  op.assert_exists();
+
+  pp.data_bufs = data_bufs;
+  pp.total_len = 0;
+
+  for (const auto& bl : data_bufs)
+    pp.total_len += bl.length();
+
+  cb::list in;
+  encode(pp, in);
+  auto retval = 0;
+  op.exec(fifo::op::CLASS, fifo::op::PUSH_PART, in, nullptr, &retval);
+  auto r = rgw_rados_operate(dpp, ioctx, oid, &op, y, lr::OPERATION_RETURNVEC);
+  if (r < 0) {
+    ldpp_dout(dpp, -1)
+      << __PRETTY_FUNCTION__ << ":" << __LINE__
+      << " fifo::op::PUSH_PART failed r=" << r
+      << " tid=" << tid << dendl;
+    return r;
+  }
+  if (retval < 0) {
+    ldpp_dout(dpp, -1)
+      << __PRETTY_FUNCTION__ << ":" << __LINE__
+      << " error handling response retval=" << retval
+      << " tid=" << tid << dendl;
+  }
+  return retval;
+}
+
+void push_part(lr::IoCtx& ioctx, const std::string& oid,
+ std::deque<cb::list> data_bufs, std::uint64_t tid,
+ lr::AioCompletion* c)
+{
+ lr::ObjectWriteOperation op;
+ fifo::op::push_part pp;
+
+ pp.data_bufs = data_bufs;
+ pp.total_len = 0;
+
+ for (const auto& bl : data_bufs)
+ pp.total_len += bl.length();
+
+ cb::list in;
+ encode(pp, in);
+ op.exec(fifo::op::CLASS, fifo::op::PUSH_PART, in);
+ auto r = ioctx.aio_operate(oid, c, &op, lr::OPERATION_RETURNVEC);
+ ceph_assert(r >= 0);
+}
+
+void trim_part(lr::ObjectWriteOperation* op,
+ std::uint64_t ofs, bool exclusive)
+{
+ fifo::op::trim_part tp;
+
+ tp.ofs = ofs;
+ tp.exclusive = exclusive;
+
+ cb::list in;
+ encode(tp, in);
+ op->exec(fifo::op::CLASS, fifo::op::TRIM_PART, in);
+}
+
+int list_part(const DoutPrefixProvider *dpp, lr::IoCtx& ioctx, const std::string& oid,
+ std::uint64_t ofs, std::uint64_t max_entries,
+ std::vector<fifo::part_list_entry>* entries,
+ bool* more, bool* full_part,
+ std::uint64_t tid, optional_yield y)
+{
+ lr::ObjectReadOperation op;
+ fifo::op::list_part lp;
+
+ lp.ofs = ofs;
+ lp.max_entries = max_entries;
+
+ cb::list in;
+ encode(lp, in);
+ cb::list bl;
+ op.exec(fifo::op::CLASS, fifo::op::LIST_PART, in, &bl, nullptr);
+ auto r = rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, y);
+ if (r >= 0) try {
+ fifo::op::list_part_reply reply;
+ auto iter = bl.cbegin();
+ decode(reply, iter);
+ if (entries) *entries = std::move(reply.entries);
+ if (more) *more = reply.more;
+ if (full_part) *full_part = reply.full_part;
+ } catch (const cb::error& err) {
+ ldpp_dout(dpp, -1)
+ << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " decode failed: " << err.what()
+ << " tid=" << tid << dendl;
+ r = from_error_code(err.code());
+ } else if (r != -ENOENT) {
+ ldpp_dout(dpp, -1)
+ << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " fifo::op::LIST_PART failed r=" << r << " tid=" << tid
+ << dendl;
+ }
+ return r;
+}
+
+struct list_entry_completion : public lr::ObjectOperationCompletion {
+ CephContext* cct;
+ int* r_out;
+ std::vector<fifo::part_list_entry>* entries;
+ bool* more;
+ bool* full_part;
+ std::uint64_t tid;
+
+ list_entry_completion(CephContext* cct, int* r_out, std::vector<fifo::part_list_entry>* entries,
+ bool* more, bool* full_part, std::uint64_t tid)
+ : cct(cct), r_out(r_out), entries(entries), more(more),
+ full_part(full_part), tid(tid) {}
+ virtual ~list_entry_completion() = default;
+ void handle_completion(int r, bufferlist& bl) override {
+ if (r >= 0) try {
+ fifo::op::list_part_reply reply;
+ auto iter = bl.cbegin();
+ decode(reply, iter);
+ if (entries) *entries = std::move(reply.entries);
+ if (more) *more = reply.more;
+ if (full_part) *full_part = reply.full_part;
+ } catch (const cb::error& err) {
+ lderr(cct)
+ << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " decode failed: " << err.what()
+ << " tid=" << tid << dendl;
+ r = from_error_code(err.code());
+ } else if (r < 0) {
+ lderr(cct)
+ << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " fifo::op::LIST_PART failed r=" << r << " tid=" << tid
+ << dendl;
+ }
+ if (r_out) *r_out = r;
+ }
+};
+
+lr::ObjectReadOperation list_part(CephContext* cct,
+ std::uint64_t ofs,
+ std::uint64_t max_entries,
+ int* r_out,
+ std::vector<fifo::part_list_entry>* entries,
+ bool* more, bool* full_part,
+ std::uint64_t tid)
+{
+ lr::ObjectReadOperation op;
+ fifo::op::list_part lp;
+
+ lp.ofs = ofs;
+ lp.max_entries = max_entries;
+
+ cb::list in;
+ encode(lp, in);
+ op.exec(fifo::op::CLASS, fifo::op::LIST_PART, in,
+ new list_entry_completion(cct, r_out, entries, more, full_part,
+ tid));
+ return op;
+}
+
+// Synchronously fetch a part's header via cls_fifo GET_PART_INFO.
+// Returns 0 on success, a negative error on RADOS or decode failure.
+int get_part_info(const DoutPrefixProvider *dpp, lr::IoCtx& ioctx, const std::string& oid,
+                  fifo::part_header* header,
+                  std::uint64_t tid, optional_yield y)
+{
+  fifo::op::get_part_info gpi;
+  cb::list in;
+  encode(gpi, in);
+
+  cb::list out;
+  lr::ObjectReadOperation op;
+  op.exec(fifo::op::CLASS, fifo::op::GET_PART_INFO, in, &out, nullptr);
+  auto r = rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, y);
+  if (r < 0) {
+    ldpp_dout(dpp, -1)
+      << __PRETTY_FUNCTION__ << ":" << __LINE__
+      << " fifo::op::GET_PART_INFO failed r=" << r << " tid=" << tid
+      << dendl;
+    return r;
+  }
+  try {
+    fifo::op::get_part_info_reply reply;
+    auto iter = out.cbegin();
+    decode(reply, iter);
+    if (header) *header = std::move(reply.header);
+  } catch (const cb::error& err) {
+    ldpp_dout(dpp, -1)
+      << __PRETTY_FUNCTION__ << ":" << __LINE__
+      << " decode failed: " << err.what()
+      << " tid=" << tid << dendl;
+    r = from_error_code(err.code());
+  }
+  return r;
+}
+
+// Completion for async GET_PART_INFO: decodes the reply into *h and
+// reports the final status through *rp (either may be null).
+struct partinfo_completion : public lr::ObjectOperationCompletion {
+  CephContext* cct;
+  int* rp;
+  fifo::part_header* h;
+  std::uint64_t tid;
+  partinfo_completion(CephContext* cct, int* rp, fifo::part_header* h,
+                      std::uint64_t tid)
+    : cct(cct), rp(rp), h(h), tid(tid) {}
+  virtual ~partinfo_completion() = default;
+  void handle_completion(int r, bufferlist& bl) override {
+    if (r < 0) {
+      lderr(cct) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                 << " fifo::op::GET_PART_INFO failed r=" << r << " tid=" << tid
+                 << dendl;
+    } else {
+      try {
+        fifo::op::get_part_info_reply reply;
+        auto iter = bl.cbegin();
+        decode(reply, iter);
+        if (h) *h = std::move(reply.header);
+      } catch (const cb::error& err) {
+        r = from_error_code(err.code());
+        lderr(cct) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                   << " decode failed: " << err.what()
+                   << " tid=" << tid << dendl;
+      }
+    }
+    if (rp) *rp = r;
+  }
+};
+
+// Build a read op invoking cls_fifo GET_PART_INFO; the reply is decoded
+// into *header by partinfo_completion and the final status lands in *r
+// (may be null when the caller does not care).
+lr::ObjectReadOperation get_part_info(CephContext* cct,
+                                      fifo::part_header* header,
+                                      std::uint64_t tid, int* r = nullptr)
+{
+  lr::ObjectReadOperation op;
+  fifo::op::get_part_info gpi;
+
+  cb::list in;
+  encode(gpi, in);
+  // The completion receives the reply buffer directly, so no out
+  // bufferlist is needed here (an unused local was removed).
+  op.exec(fifo::op::CLASS, fifo::op::GET_PART_INFO, in,
+          new partinfo_completion(cct, r, header, tid));
+  return op;
+}
+}
+
+// Parse a marker string of the form "<part>:<ofs>". An empty string
+// denotes the current tail part at offset 0; malformed input yields
+// std::nullopt.
+std::optional<marker> FIFO::to_marker(std::string_view s)
+{
+  if (s.empty()) {
+    marker m;
+    m.num = info.tail_part_num;
+    m.ofs = 0;
+    return m;
+  }
+
+  const auto colon = s.find(':');
+  if (colon == s.npos) {
+    return std::nullopt;
+  }
+
+  marker m;
+  const auto num = ceph::parse<decltype(m.num)>(s.substr(0, colon));
+  if (!num) {
+    return std::nullopt;
+  }
+  const auto ofs = ceph::parse<decltype(m.ofs)>(s.substr(colon + 1));
+  if (!ofs) {
+    return std::nullopt;
+  }
+  m.num = *num;
+  m.ofs = *ofs;
+  return m;
+}
+
+// Apply `update` to the in-memory metadata, but only if the caller's
+// view of the object version (objv) still matches; otherwise return
+// -ECANCELED so the caller rereads and retries.
+int FIFO::apply_update(const DoutPrefixProvider *dpp,
+                       fifo::info* info,
+                       const fifo::objv& objv,
+                       const fifo::update& update,
+                       std::uint64_t tid)
+{
+  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                     << " entering: tid=" << tid << dendl;
+  std::unique_lock l(m);
+  if (objv != info->version) {
+    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                       << " version mismatch, canceling: tid=" << tid << dendl;
+    return -ECANCELED;
+  }
+  info->apply_update(update);
+  return 0;
+}
+
+// Synchronously push `update` to the metadata object guarded by
+// `version`, mirror it into the local cache, and refresh from the
+// cluster when the compare-and-swap raced. *pcanceled tells the caller
+// whether a retry is warranted.
+int FIFO::_update_meta(const DoutPrefixProvider *dpp, const fifo::update& update,
+                       fifo::objv version, bool* pcanceled,
+                       std::uint64_t tid, optional_yield y)
+{
+  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                     << " entering: tid=" << tid << dendl;
+  lr::ObjectWriteOperation op;
+  update_meta(&op, version, update);
+  auto r = rgw_rados_operate(dpp, ioctx, oid, &op, y);
+  bool canceled = false;
+  if (r >= 0 || r == -ECANCELED) {
+    canceled = (r == -ECANCELED);
+    if (!canceled) {
+      r = apply_update(dpp, &info, version, update, tid);
+      if (r < 0) canceled = true;
+    }
+    if (canceled) {
+      // Our view raced with another writer; refresh the cached copy.
+      r = read_meta(dpp, tid, y);
+      canceled = (r >= 0);
+    }
+  }
+  if (pcanceled) *pcanceled = canceled;
+  if (canceled) {
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                       << " canceled: tid=" << tid << dendl;
+  }
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                       << " returning error: r=" << r << " tid=" << tid << dendl;
+  }
+  return r;
+}
+
+// Completion chain implementing the async _update_meta() path: stage
+// one handles the RADOS metadata update; stage two (reread == true)
+// handles the read_meta() issued when that update raced.
+struct Updater : public Completion<Updater> {
+  FIFO* fifo;
+  fifo::update update;
+  fifo::objv version;
+  bool reread = false;
+  bool* pcanceled = nullptr;
+  std::uint64_t tid;
+  Updater(const DoutPrefixProvider *dpp, FIFO* fifo, lr::AioCompletion* super,
+          const fifo::update& update, fifo::objv version,
+          bool* pcanceled, std::uint64_t tid)
+    : Completion(dpp, super), fifo(fifo), update(update), version(version),
+      pcanceled(pcanceled), tid(tid) {}
+  // BUG FIX above: `tid` was missing from the initializer list, so it
+  // was read uninitialized (UB) in every log statement below.
+
+  // Dispatch to the stage selected by `reread`.
+  void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                       << " entering: tid=" << tid << dendl;
+    if (reread)
+      handle_reread(dpp, std::move(p), r);
+    else
+      handle_update(dpp, std::move(p), r);
+  }
+
+  // Stage one: mirror the committed update locally; if either the RADOS
+  // op or the local apply detected a version race, schedule a reread.
+  void handle_update(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                       << " handling async update_meta: tid="
+                       << tid << dendl;
+    if (r < 0 && r != -ECANCELED) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                         << " update failed: r=" << r << " tid=" << tid << dendl;
+      complete(std::move(p), r);
+      return;
+    }
+    bool canceled = (r == -ECANCELED);
+    if (!canceled) {
+      int r = fifo->apply_update(dpp, &fifo->info, version, update, tid);
+      if (r < 0) {
+        ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                           << " update failed, marking canceled: r=" << r
+                           << " tid=" << tid << dendl;
+        canceled = true;
+      }
+    }
+    if (canceled) {
+      reread = true;
+      fifo->read_meta(dpp, tid, call(std::move(p)));
+      return;
+    }
+    if (pcanceled)
+      *pcanceled = false;
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                       << " completing: tid=" << tid << dendl;
+    complete(std::move(p), 0);
+  }
+
+  // Stage two: the reread finished. *pcanceled reports "the original
+  // update was canceled" only when the reread itself succeeded.
+  void handle_reread(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                       << " handling async read_meta: tid="
+                       << tid << dendl;
+    if (pcanceled)
+      *pcanceled = (r >= 0);
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                         << " failed dispatching read_meta: r=" << r << " tid="
+                         << tid << dendl;
+    } else {
+      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                         << " completing: tid=" << tid << dendl;
+    }
+    complete(std::move(p), r);
+  }
+};
+
+// Asynchronous metadata update: conditions the RADOS op on the cached
+// object version and hands completion handling to an Updater, which
+// applies the update locally (guarded by `version`) and rereads the
+// metadata on a race before firing `c`.
+void FIFO::_update_meta(const DoutPrefixProvider *dpp, const fifo::update& update,
+ fifo::objv version, bool* pcanceled,
+ std::uint64_t tid, lr::AioCompletion* c)
+{
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+ lr::ObjectWriteOperation op;
+ // NOTE(review): unlike the synchronous overload, this guards the RADOS
+ // op with info.version (read here without holding m) rather than the
+ // caller-supplied `version` -- confirm the asymmetry is intentional.
+ update_meta(&op, info.version, update);
+ auto updater = std::make_unique<Updater>(dpp, this, c, update, version, pcanceled,
+ tid);
+ auto r = ioctx.aio_operate(oid, Updater::call(std::move(updater)), &op);
+ assert(r >= 0); // aio_operate only fails on programmer error here
+}
+
+// Create and initialize the RADOS object backing part `part_num`.
+int FIFO::create_part(const DoutPrefixProvider *dpp, int64_t part_num, std::uint64_t tid,
+                      optional_yield y)
+{
+  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                     << " entering: tid=" << tid << dendl;
+  lr::ObjectWriteOperation op;
+  /* Exclusive create is unnecessary: part_init ensures we're creating
+     from the same journal entry. */
+  op.create(false);
+
+  std::unique_lock lock(m);
+  part_init(&op, info.params);
+  const auto part_oid = info.part_oid(part_num);
+  lock.unlock();
+
+  auto r = rgw_rados_operate(dpp, ioctx, part_oid, &op, y);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                       << " part_init failed: r=" << r << " tid="
+                       << tid << dendl;
+  }
+  return r;
+}
+
+// Delete the RADOS object backing part `part_num`.
+int FIFO::remove_part(const DoutPrefixProvider *dpp, int64_t part_num, std::uint64_t tid,
+                      optional_yield y)
+{
+  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                     << " entering: tid=" << tid << dendl;
+  lr::ObjectWriteOperation op;
+  op.remove();
+
+  std::unique_lock lock(m);
+  const auto part_oid = info.part_oid(part_num);
+  lock.unlock();
+
+  auto r = rgw_rados_operate(dpp, ioctx, part_oid, &op, y);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                       << " remove failed: r=" << r << " tid="
+                       << tid << dendl;
+  }
+  return r;
+}
+
+// Replay every entry in a snapshot of the journal (create / set_head /
+// remove), then commit the resulting tail/head/max pointer movement and
+// strip the processed entries from the on-disk journal, retrying the
+// version-guarded commit up to MAX_RACE_RETRIES times.
+int FIFO::process_journal(const DoutPrefixProvider *dpp, std::uint64_t tid, optional_yield y)
+{
+  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                     << " entering: tid=" << tid << dendl;
+  std::vector<fifo::journal_entry> processed;
+
+  std::unique_lock l(m);
+  auto tmpjournal = info.journal;
+  auto new_tail = info.tail_part_num;
+  auto new_head = info.head_part_num;
+  auto new_max = info.max_push_part_num;
+  l.unlock();
+
+  int r = 0;
+  for (auto& entry : tmpjournal) {
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                       << " processing entry: entry=" << entry << " tid=" << tid
+                       << dendl;
+    switch (entry.op) {
+      using enum fifo::journal_entry::Op;
+    case create:
+      r = create_part(dpp, entry.part_num, tid, y);
+      if (entry.part_num > new_max) {
+        new_max = entry.part_num;
+      }
+      break;
+    case set_head:
+      r = 0;
+      if (entry.part_num > new_head) {
+        new_head = entry.part_num;
+      }
+      break;
+    case remove:
+      r = remove_part(dpp, entry.part_num, tid, y);
+      if (r == -ENOENT) r = 0; // already removed by someone else: fine
+      if (entry.part_num >= new_tail) {
+        new_tail = entry.part_num + 1;
+      }
+      break;
+    default:
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                         << " unknown journaled op: entry=" << entry << " tid="
+                         << tid << dendl;
+      return -EIO;
+    }
+
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                         << " processing entry failed: entry=" << entry
+                         << " r=" << r << " tid=" << tid << dendl;
+      // BUG FIX: was `return -r;`, which negated the errno into a
+      // positive value that callers checking r < 0 read as success.
+      return r;
+    }
+
+    processed.push_back(std::move(entry));
+  }
+
+  // Postprocess: commit pointer movement and remove processed entries.
+  bool canceled = true;
+
+  for (auto i = 0; canceled && i < MAX_RACE_RETRIES; ++i) {
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                       << " postprocessing: i=" << i << " tid=" << tid << dendl;
+
+    std::optional<int64_t> tail_part_num;
+    std::optional<int64_t> head_part_num;
+    std::optional<int64_t> max_part_num;
+
+    std::unique_lock l(m);
+    auto objv = info.version;
+    // BUG FIX: compare against the live value (info.tail_part_num), in
+    // line with the two checks below; the old code compared against the
+    // freshly-declared empty optional, which is always true.
+    if (new_tail > info.tail_part_num) tail_part_num = new_tail;
+    if (new_head > info.head_part_num) head_part_num = new_head;
+    if (new_max > info.max_push_part_num) max_part_num = new_max;
+    l.unlock();
+
+    if (processed.empty() &&
+        !tail_part_num &&
+        !max_part_num) {
+      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                         << " nothing to update any more: i=" << i << " tid="
+                         << tid << dendl;
+      canceled = false;
+      break;
+    }
+    auto u = fifo::update().tail_part_num(tail_part_num)
+      .head_part_num(head_part_num).max_push_part_num(max_part_num)
+      .journal_entries_rm(processed);
+    r = _update_meta(dpp, u, objv, &canceled, tid, y);
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                         << " _update_meta failed: update=" << u
+                         << " r=" << r << " tid=" << tid << dendl;
+      break;
+    }
+
+    if (canceled) {
+      // Keep only the entries that are still journaled after the reread;
+      // anything already removed by the racing writer is done.
+      std::vector<fifo::journal_entry> new_processed;
+      std::unique_lock l(m);
+      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                         << " update canceled, retrying: i=" << i << " tid="
+                         << tid << dendl;
+      for (auto& e : processed) {
+        if (info.journal.contains(e)) {
+          new_processed.push_back(e);
+        }
+      }
+      processed = std::move(new_processed);
+    }
+  }
+  if (r == 0 && canceled) {
+    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                       << " canceled too many times, giving up: tid=" << tid << dendl;
+    r = -ECANCELED;
+  }
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                       << " failed, r=: " << r << " tid=" << tid << dendl;
+  }
+  return r;
+}
+
+// Ensure part `new_part_num` exists (and, when is_head, becomes the
+// head): journal a create (+ set_head) entry under a version-guarded
+// metadata update retried up to MAX_RACE_RETRIES, then replay the
+// journal so the part object actually gets created.
+int FIFO::_prepare_new_part(const DoutPrefixProvider *dpp,
+ std::int64_t new_part_num, bool is_head,
+ std::uint64_t tid, optional_yield y)
+{
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+ std::unique_lock l(m);
+ using enum fifo::journal_entry::Op;
+ std::vector<fifo::journal_entry> jentries{{ create, new_part_num }};
+ // Someone already journaled this exact part; just finish their work.
+ if (info.journal.contains({create, new_part_num}) &&
+ (!is_head || info.journal.contains({set_head, new_part_num}))) {
+ l.unlock();
+ ldpp_dout(dpp, 5) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " new part journaled, but not processed: tid="
+ << tid << dendl;
+ auto r = process_journal(dpp, tid, y);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " process_journal failed: r=" << r << " tid=" << tid << dendl;
+ }
+ return r;
+ }
+ auto version = info.version;
+
+ if (is_head) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " needs new head: tid=" << tid << dendl;
+ jentries.push_back({ set_head, new_part_num });
+ }
+ l.unlock();
+
+ int r = 0;
+ bool canceled = true;
+ // Compare-and-swap loop: append our journal entries against `version`,
+ // re-evaluating the (refreshed) state after every race.
+ for (auto i = 0; canceled && i < MAX_RACE_RETRIES; ++i) {
+ canceled = false;
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " updating metadata: i=" << i << " tid=" << tid << dendl;
+ auto u = fifo::update{}.journal_entries_add(jentries);
+ r = _update_meta(dpp, u, version, &canceled, tid, y);
+ if (r >= 0 && canceled) {
+ // We lost the race; _update_meta already reread `info`, so check
+ // whether the winner journaled and/or processed our part for us.
+ std::unique_lock l(m);
+ version = info.version;
+ auto found = (info.journal.contains({create, new_part_num}) ||
+ info.journal.contains({set_head, new_part_num}));
+ if ((info.max_push_part_num >= new_part_num &&
+ info.head_part_num >= new_part_num)) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " raced, but journaled and processed: i=" << i
+ << " tid=" << tid << dendl;
+ return 0;
+ }
+ if (found) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " raced, journaled but not processed: i=" << i
+ << " tid=" << tid << dendl;
+ canceled = false;
+ }
+ l.unlock();
+ }
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " _update_meta failed: update=" << u << " r=" << r
+ << " tid=" << tid << dendl;
+ return r;
+ }
+ }
+ if (canceled) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " canceled too many times, giving up: tid=" << tid << dendl;
+ return -ECANCELED;
+ }
+ // Journal committed; now create the part object and advance pointers.
+ r = process_journal(dpp, tid, y);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " process_journal failed: r=" << r << " tid=" << tid << dendl;
+ }
+ return r;
+}
+
+// Make `new_head_part_num` the head part. If the part does not exist
+// yet, delegate to _prepare_new_part(); otherwise journal a set_head
+// entry under a version-guarded, race-retried metadata update and then
+// replay the journal.
+int FIFO::_prepare_new_head(const DoutPrefixProvider *dpp,
+ std::int64_t new_head_part_num,
+ std::uint64_t tid, optional_yield y)
+{
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+ std::unique_lock l(m);
+ auto max_push_part_num = info.max_push_part_num;
+ auto version = info.version;
+ l.unlock();
+
+ int r = 0;
+ if (max_push_part_num < new_head_part_num) {
+ // Part not created yet: create it with set_head in the same journal
+ // transaction.
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " need new part: tid=" << tid << dendl;
+ r = _prepare_new_part(dpp, new_head_part_num, true, tid, y);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " _prepare_new_part failed: r=" << r
+ << " tid=" << tid << dendl;
+ return r;
+ }
+ std::unique_lock l(m);
+ if (info.max_push_part_num < new_head_part_num) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " inconsistency, push part less than head part: "
+ << " tid=" << tid << dendl;
+ return -EIO;
+ }
+ l.unlock();
+ return 0;
+ }
+
+ using enum fifo::journal_entry::Op;
+ fifo::journal_entry jentry;
+ jentry.op = set_head;
+ jentry.part_num = new_head_part_num;
+
+ r = 0;
+ bool canceled = true;
+ // Compare-and-swap loop mirroring _prepare_new_part()'s retry logic.
+ for (auto i = 0; canceled && i < MAX_RACE_RETRIES; ++i) {
+ canceled = false;
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " updating metadata: i=" << i << " tid=" << tid << dendl;
+ auto u = fifo::update{}.journal_entries_add({{ jentry }});
+ r = _update_meta(dpp, u, version, &canceled, tid, y);
+ if (r >= 0 && canceled) {
+ // Raced: see whether the winner already journaled or processed the
+ // head advance for us.
+ std::unique_lock l(m);
+ auto found = (info.journal.contains({create, new_head_part_num}) ||
+ info.journal.contains({set_head, new_head_part_num}));
+ version = info.version;
+ if ((info.head_part_num >= new_head_part_num)) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " raced, but journaled and processed: i=" << i
+ << " tid=" << tid << dendl;
+ return 0;
+ }
+ if (found) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " raced, journaled but not processed: i=" << i
+ << " tid=" << tid << dendl;
+ canceled = false;
+ }
+ l.unlock();
+ }
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " _update_meta failed: update=" << u << " r=" << r
+ << " tid=" << tid << dendl;
+ return r;
+ }
+ }
+ if (canceled) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " canceled too many times, giving up: tid=" << tid << dendl;
+ return -ECANCELED;
+ }
+ r = process_journal(dpp, tid, y);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " process_journal failed: r=" << r << " tid=" << tid << dendl;
+ }
+ return r;
+}
+
+// Async counterpart of the retry loop in the synchronous
+// _prepare_new_part(): re-issues the journal-append on races (up to
+// MAX_RACE_RETRIES), short-circuits when another writer finished the
+// work, and finally kicks off process_journal().
+struct NewPartPreparer : public Completion<NewPartPreparer> {
+ FIFO* f;
+ std::vector<fifo::journal_entry> jentries;
+ int i = 0; // retry counter across re-dispatches
+ std::int64_t new_part_num;
+ bool canceled = false; // set by _update_meta on a version race
+ uint64_t tid;
+
+ NewPartPreparer(const DoutPrefixProvider *dpp, FIFO* f, lr::AioCompletion* super,
+ std::vector<fifo::journal_entry> jentries,
+ std::int64_t new_part_num,
+ std::uint64_t tid)
+ : Completion(dpp, super), f(f), jentries(std::move(jentries)),
+ new_part_num(new_part_num), tid(tid) {}
+
+ void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " _update_meta failed: r=" << r
+ << " tid=" << tid << dendl;
+ complete(std::move(p), r);
+ return;
+ }
+
+ if (canceled) {
+ using enum fifo::journal_entry::Op;
+ std::unique_lock l(f->m);
+ auto found = (f->info.journal.contains({create, new_part_num}) ||
+ f->info.journal.contains({set_head, new_part_num}));
+ auto max_push_part_num = f->info.max_push_part_num;
+ auto head_part_num = f->info.head_part_num;
+ auto version = f->info.version;
+ l.unlock();
+ // Winner already journaled and processed our part: nothing to do.
+ if ((max_push_part_num >= new_part_num &&
+ head_part_num >= new_part_num)) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " raced, but journaled and processed: i=" << i
+ << " tid=" << tid << dendl;
+ complete(std::move(p), 0);
+ return;
+ }
+ if (i >= MAX_RACE_RETRIES) {
+ complete(std::move(p), -ECANCELED);
+ return;
+ }
+ if (!found) {
+ // Not journaled yet: retry the append against the new version.
+ ++i;
+ f->_update_meta(dpp, fifo::update{}
+ .journal_entries_add(jentries),
+ version, &canceled, tid, call(std::move(p)));
+ return;
+ } else {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " raced, journaled but not processed: i=" << i
+ << " tid=" << tid << dendl;
+ canceled = false;
+ }
+ // Fall through. We still need to process the journal.
+ }
+ f->process_journal(dpp, tid, super());
+ return;
+ }
+};
+
+// Async version of _prepare_new_part(): journals a create (+ set_head)
+// entry and delegates race handling and journal replay to a
+// NewPartPreparer completion that eventually fires `c`.
+void FIFO::_prepare_new_part(const DoutPrefixProvider *dpp, std::int64_t new_part_num,
+ bool is_head, std::uint64_t tid, lr::AioCompletion* c)
+{
+ std::unique_lock l(m);
+ using enum fifo::journal_entry::Op;
+ std::vector<fifo::journal_entry> jentries{{create, new_part_num}};
+ // Already journaled by someone else: just replay the journal.
+ if (info.journal.contains({create, new_part_num}) &&
+ (!is_head || info.journal.contains({set_head, new_part_num}))) {
+ l.unlock();
+ ldpp_dout(dpp, 5) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " new part journaled, but not processed: tid="
+ << tid << dendl;
+ process_journal(dpp, tid, c);
+ return;
+ }
+ auto version = info.version;
+
+ if (is_head) {
+ jentries.push_back({ set_head, new_part_num });
+ }
+ l.unlock();
+
+ auto n = std::make_unique<NewPartPreparer>(dpp, this, c, jentries,
+ new_part_num, tid);
+ auto np = n.get();
+ // np->canceled is written by _update_meta before the completion runs.
+ _update_meta(dpp, fifo::update{}.journal_entries_add(jentries), version,
+ &np->canceled, tid, NewPartPreparer::call(std::move(n)));
+}
+
+// Async counterpart of _prepare_new_head()'s two paths: when `newpart`
+// it validates the result of _prepare_new_part(); otherwise it runs the
+// set_head journal-append retry loop and then process_journal().
+struct NewHeadPreparer : public Completion<NewHeadPreparer> {
+ FIFO* f;
+ int i = 0; // retry counter across re-dispatches
+ bool newpart; // which completion path we are on
+ std::int64_t new_head_part_num;
+ bool canceled = false; // set by _update_meta on a version race
+ std::uint64_t tid;
+
+ NewHeadPreparer(const DoutPrefixProvider *dpp, FIFO* f, lr::AioCompletion* super,
+ bool newpart, std::int64_t new_head_part_num,
+ std::uint64_t tid)
+ : Completion(dpp, super), f(f), newpart(newpart),
+ new_head_part_num(new_head_part_num), tid(tid) {}
+
+ void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
+ if (newpart)
+ handle_newpart(std::move(p), r);
+ else
+ handle_update(dpp, std::move(p), r);
+ }
+
+ // Path one: _prepare_new_part() finished; sanity-check that the new
+ // part is now pushable.
+ void handle_newpart(Ptr&& p, int r) {
+ if (r < 0) {
+ lderr(f->cct) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " _prepare_new_part failed: r=" << r
+ << " tid=" << tid << dendl;
+ complete(std::move(p), r);
+ return;
+ }
+ std::unique_lock l(f->m);
+ if (f->info.max_push_part_num < new_head_part_num) {
+ l.unlock();
+ // NOTE(review): this is the push-part/head-part inconsistency case
+ // (r >= 0 here), yet the message reuses the "_prepare_new_part
+ // failed" text -- consider a distinct message.
+ lderr(f->cct) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " _prepare_new_part failed: r=" << r
+ << " tid=" << tid << dendl;
+ complete(std::move(p), -EIO);
+ } else {
+ l.unlock();
+ complete(std::move(p), 0);
+ }
+ }
+
+ // Path two: the set_head metadata update finished; handle races the
+ // same way the synchronous _prepare_new_head() does.
+ void handle_update(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " _update_meta failed: r=" << r
+ << " tid=" << tid << dendl;
+ complete(std::move(p), r);
+ return;
+ }
+
+ if (canceled) {
+ using enum fifo::journal_entry::Op;
+ std::unique_lock l(f->m);
+ auto found = (f->info.journal.contains({create, new_head_part_num }) ||
+ f->info.journal.contains({set_head, new_head_part_num }));
+ auto head_part_num = f->info.head_part_num;
+ auto version = f->info.version;
+
+ l.unlock();
+ // Winner already advanced the head: nothing left to do.
+ if ((head_part_num >= new_head_part_num)) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " raced, but journaled and processed: i=" << i
+ << " tid=" << tid << dendl;
+ complete(std::move(p), 0);
+ return;
+ }
+ if (i >= MAX_RACE_RETRIES) {
+ complete(std::move(p), -ECANCELED);
+ return;
+ }
+ if (!found) {
+ // Not journaled yet: retry the append against the new version.
+ ++i;
+ fifo::journal_entry jentry;
+ jentry.op = set_head;
+ jentry.part_num = new_head_part_num;
+ f->_update_meta(dpp, fifo::update{}
+ .journal_entries_add({{jentry}}),
+ version, &canceled, tid, call(std::move(p)));
+ return;
+ } else {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " raced, journaled but not processed: i=" << i
+ << " tid=" << tid << dendl;
+ canceled = false;
+ }
+ // Fall through. We still need to process the journal.
+ }
+ f->process_journal(dpp, tid, super());
+ return;
+ }
+};
+
+// Async version of _prepare_new_head(): either creates the new head
+// part (via _prepare_new_part) or journals a set_head entry, with a
+// NewHeadPreparer completion handling races before firing `c`.
+void FIFO::_prepare_new_head(const DoutPrefixProvider *dpp, std::int64_t new_head_part_num,
+ std::uint64_t tid, lr::AioCompletion* c)
+{
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+ std::unique_lock l(m);
+ auto max_push_part_num = info.max_push_part_num;
+ auto version = info.version;
+ l.unlock();
+
+ if (max_push_part_num < new_head_part_num) {
+ // Part does not exist yet: create it with set_head in one journal
+ // transaction.
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " need new part: tid=" << tid << dendl;
+ auto n = std::make_unique<NewHeadPreparer>(dpp, this, c, true, new_head_part_num,
+ tid);
+ _prepare_new_part(dpp, new_head_part_num, true, tid,
+ NewHeadPreparer::call(std::move(n)));
+ } else {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " updating head: tid=" << tid << dendl;
+ auto n = std::make_unique<NewHeadPreparer>(dpp, this, c, false, new_head_part_num,
+ tid);
+ auto np = n.get();
+ using enum fifo::journal_entry::Op;
+ fifo::journal_entry jentry;
+ jentry.op = set_head;
+ jentry.part_num = new_head_part_num;
+ // np->canceled is written by _update_meta before the completion runs.
+ _update_meta(dpp, fifo::update{}.journal_entries_add({{jentry}}), version,
+ &np->canceled, tid, NewHeadPreparer::call(std::move(n)));
+ }
+}
+
+// Synchronously push a batch of entries to the current head part.
+int FIFO::push_entries(const DoutPrefixProvider *dpp, const std::deque<cb::list>& data_bufs,
+                       std::uint64_t tid, optional_yield y)
+{
+  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                     << " entering: tid=" << tid << dendl;
+  std::unique_lock lock(m);
+  const auto part_oid = info.part_oid(info.head_part_num);
+  lock.unlock();
+
+  auto r = push_part(dpp, ioctx, part_oid, data_bufs, tid, y);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                       << " push_part failed: r=" << r << " tid=" << tid << dendl;
+  }
+  return r;
+}
+
+// Asynchronously push a batch of entries to the current head part;
+// completion is reported through `c`.
+void FIFO::push_entries(const std::deque<cb::list>& data_bufs,
+                        std::uint64_t tid, lr::AioCompletion* c)
+{
+  std::unique_lock lock(m);
+  const auto part_oid = info.part_oid(info.head_part_num);
+  lock.unlock();
+
+  push_part(ioctx, part_oid, data_bufs, tid, c);
+}
+
+// Synchronously trim part `part_num` up to `ofs` (exclusivity passed
+// through to the cls trim_part op). Returns 0 on success, negative on
+// error.
+int FIFO::trim_part(const DoutPrefixProvider *dpp, int64_t part_num, uint64_t ofs,
+                    bool exclusive, std::uint64_t tid,
+                    optional_yield y)
+{
+  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                     << " entering: tid=" << tid << dendl;
+  lr::ObjectWriteOperation op;
+  std::unique_lock l(m);
+  const auto part_oid = info.part_oid(part_num);
+  l.unlock();
+  rgw::cls::fifo::trim_part(&op, ofs, exclusive);
+  auto r = rgw_rados_operate(dpp, ioctx, part_oid, &op, y);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                       << " trim_part failed: r=" << r << " tid=" << tid << dendl;
+    // BUG FIX: propagate the failure; previously execution fell through
+    // to `return 0`, silently swallowing the error after logging it.
+    return r;
+  }
+  return 0;
+}
+
+// Asynchronously trim part `part_num` up to `ofs`; completion is
+// reported through `c`.
+void FIFO::trim_part(const DoutPrefixProvider *dpp, int64_t part_num, uint64_t ofs,
+                     bool exclusive, std::uint64_t tid,
+                     lr::AioCompletion* c)
+{
+  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                     << " entering: tid=" << tid << dendl;
+  std::unique_lock lock(m);
+  const auto part_oid = info.part_oid(part_num);
+  lock.unlock();
+
+  lr::ObjectWriteOperation op;
+  rgw::cls::fifo::trim_part(&op, ofs, exclusive);
+  const auto r = ioctx.aio_operate(part_oid, c, &op);
+  ceph_assert(r >= 0);
+}
+
+// Open an existing FIFO: fetch its metadata and, if a previous writer
+// crashed mid-transaction, replay any leftover journal entries. With
+// `probe`, ENOENT/ENODATA are expected outcomes and not logged as
+// errors.
+int FIFO::open(const DoutPrefixProvider *dpp, lr::IoCtx ioctx, std::string oid, std::unique_ptr<FIFO>* fifo,
+               optional_yield y, std::optional<fifo::objv> objv,
+               bool probe)
+{
+  ldpp_dout(dpp, 20)
+    << __PRETTY_FUNCTION__ << ":" << __LINE__
+    << " entering" << dendl;
+  fifo::info info;
+  std::uint32_t size;
+  std::uint32_t over;
+  // BUG FIX: do not std::move(oid) into get_meta -- oid is used again
+  // below when constructing the FIFO, so moving it first is (at best)
+  // misleading and (at worst) a use-after-move.
+  int r = get_meta(dpp, ioctx, oid, objv, &info, &size, &over, 0, y,
+                   probe);
+  if (r < 0) {
+    if (!(probe && (r == -ENOENT || r == -ENODATA))) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                         << " get_meta failed: r=" << r << dendl;
+    }
+    return r;
+  }
+  std::unique_ptr<FIFO> f(new FIFO(std::move(ioctx), std::move(oid)));
+  f->info = info;
+  f->part_header_size = size;
+  f->part_entry_overhead = over;
+  // If there are journal entries, process them, in case
+  // someone crashed mid-transaction.
+  if (!info.journal.empty()) {
+    ldpp_dout(dpp, 20)
+      << __PRETTY_FUNCTION__ << ":" << __LINE__
+      << " processing leftover journal" << dendl;
+    r = f->process_journal(dpp, 0, y);
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                         << " process_journal failed: r=" << r << dendl;
+      return r;
+    }
+  }
+  *fifo = std::move(f);
+  return 0;
+}
+
+// Create the FIFO metadata object (optionally exclusive) and then open
+// the resulting FIFO into *fifo.
+int FIFO::create(const DoutPrefixProvider *dpp, lr::IoCtx ioctx, std::string oid, std::unique_ptr<FIFO>* fifo,
+                 optional_yield y, std::optional<fifo::objv> objv,
+                 std::optional<std::string_view> oid_prefix,
+                 bool exclusive, std::uint64_t max_part_size,
+                 std::uint64_t max_entry_size)
+{
+  ldpp_dout(dpp, 20)
+    << __PRETTY_FUNCTION__ << ":" << __LINE__
+    << " entering" << dendl;
+  lr::ObjectWriteOperation op;
+  create_meta(&op, oid, objv, oid_prefix, exclusive, max_part_size,
+              max_entry_size);
+  auto r = rgw_rados_operate(dpp, ioctx, oid, &op, y);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                       << " create_meta failed: r=" << r << dendl;
+    return r;
+  }
+  return open(dpp, std::move(ioctx), std::move(oid), fifo, y, objv);
+}
+
+// Synchronously reread the metadata object and install the result into
+// the local cache, unless our cached copy is already newer.
+int FIFO::read_meta(const DoutPrefixProvider *dpp, std::uint64_t tid, optional_yield y) {
+  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                     << " entering: tid=" << tid << dendl;
+  fifo::info fresh;
+  std::uint32_t header_size;
+  std::uint32_t entry_overhead;
+
+  auto r = get_meta(dpp, ioctx, oid, std::nullopt, &fresh, &header_size,
+                    &entry_overhead, tid, y);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                       << " get_meta failed: r=" << r << " tid=" << tid << dendl;
+    return r;
+  }
+  std::unique_lock lock(m);
+  // Install the fetched copy only if it is not older than what we hold.
+  if (fresh.version.same_or_later(info.version)) {
+    info = std::move(fresh);
+    part_header_size = header_size;
+    part_entry_overhead = entry_overhead;
+  }
+  return 0;
+}
+
+// Convenience overload: allocate a fresh transaction id and reread.
+int FIFO::read_meta(const DoutPrefixProvider *dpp, optional_yield y) {
+  std::unique_lock lock(m);
+  const auto tid = ++next_tid;
+  lock.unlock();
+  return read_meta(dpp, tid, y);
+}
+
+// Completion for the async GET_META call: decodes the reply and, if it
+// is not older than our cached copy, installs it under the FIFO lock.
+struct Reader : public Completion<Reader> {
+  FIFO* fifo;
+  cb::list bl;
+  std::uint64_t tid;
+  Reader(const DoutPrefixProvider *dpp, FIFO* fifo, lr::AioCompletion* super, std::uint64_t tid)
+    : Completion(dpp, super), fifo(fifo), tid(tid) {}
+
+  void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                       << " entering: tid=" << tid << dendl;
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                         << " read_meta failed r=" << r
+                         << " tid=" << tid << dendl;
+    } else {
+      try {
+        fifo::op::get_meta_reply reply;
+        auto iter = bl.cbegin();
+        decode(reply, iter);
+        std::unique_lock l(fifo->m);
+        if (reply.info.version.same_or_later(fifo->info.version)) {
+          fifo->info = std::move(reply.info);
+          fifo->part_header_size = reply.part_header_size;
+          fifo->part_entry_overhead = reply.part_entry_overhead;
+        }
+      } catch (const cb::error& err) {
+        ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                           << " failed to decode response err=" << err.what()
+                           << " tid=" << tid << dendl;
+        r = from_error_code(err.code());
+      }
+    }
+    complete(std::move(p), r);
+  }
+};
+
+// Asynchronously reread the metadata object; the Reader completion
+// installs the result and fires `c`.
+void FIFO::read_meta(const DoutPrefixProvider *dpp, std::uint64_t tid, lr::AioCompletion* c)
+{
+  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                     << " entering: tid=" << tid << dendl;
+  // (An unused lr::ObjectReadOperation local was removed: aio_exec
+  // issues the class call directly.)
+  fifo::op::get_meta gm;
+  cb::list in;
+  encode(gm, in);
+  auto reader = std::make_unique<Reader>(dpp, this, c, tid);
+  auto rp = reader.get();
+  auto r = ioctx.aio_exec(oid, Reader::call(std::move(reader)), fifo::op::CLASS,
+                          fifo::op::GET_META, in, &rp->bl);
+  assert(r >= 0);
+}
+
+// Return a reference to the cached FIFO metadata.
+// NOTE(review): returned without taking m; callers must tolerate
+// concurrent updates or provide their own synchronization.
+const fifo::info& FIFO::meta() const {
+ return info;
+}
+
+// Return {part_header_size, part_entry_overhead}, the per-part and
+// per-entry byte overheads used when sizing pushes.
+std::pair<std::uint32_t, std::uint32_t> FIFO::get_part_layout_info() const {
+  return std::make_pair(part_header_size, part_entry_overhead);
+}
+
+// Synchronously push a single entry; delegates to the vector overload.
+int FIFO::push(const DoutPrefixProvider *dpp, const cb::list& bl, optional_yield y) {
+  return push(dpp, std::vector<cb::list>{bl}, y);
+}
+
+// Asynchronously push a single entry; delegates to the vector overload.
+void FIFO::push(const DoutPrefixProvider *dpp, const cb::list& bl, lr::AioCompletion* c) {
+  push(dpp, std::vector<cb::list>{bl}, c);
+}
+
// Synchronously push a batch of entries onto the FIFO.
//
// Entries are packed into batches that fit in one part object
// (max_part_size, accounting for per-entry overhead) and pushed with
// push_entries().  A full head part (-ERANGE) triggers creation of a new
// head; a part trimmed by a racing client (-ENOENT) triggers a metadata
// re-read.  Races are retried up to MAX_RACE_RETRIES times before giving
// up with -ECANCELED.
//
// Returns 0 on success, -E2BIG if any entry exceeds max_entry_size, or a
// negative error code from the underlying operations.
int FIFO::push(const DoutPrefixProvider *dpp, const std::vector<cb::list>& data_bufs, optional_yield y)
{
  // Snapshot the metadata we need under the lock.
  std::unique_lock l(m);
  auto tid = ++next_tid;
  auto max_entry_size = info.params.max_entry_size;
  auto need_new_head = info.need_new_head();
  auto head_part_num = info.head_part_num;
  l.unlock();
  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
		     << " entering: tid=" << tid << dendl;
  if (data_bufs.empty()) {
    // An empty push is trivially successful.
    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
		       << " empty push, returning success tid=" << tid << dendl;
    return 0;
  }

  // Validate sizes
  for (const auto& bl : data_bufs) {
    if (bl.length() > max_entry_size) {
      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
			 << " entry bigger than max_entry_size tid=" << tid << dendl;
      return -E2BIG;
    }
  }

  int r = 0;
  if (need_new_head) {
    // The current head cannot take more entries; prepare the next head
    // part before pushing anything.
    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
		       << " need new head tid=" << tid << dendl;
    r = _prepare_new_head(dpp, head_part_num + 1, tid, y);
    if (r < 0) {
      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
			 << " _prepare_new_head failed: r=" << r
			 << " tid=" << tid << dendl;
      return r;
    }
  }

  // Work queues: entries still to push, and the batch currently being
  // assembled or retried.
  std::deque<cb::list> remaining(data_bufs.begin(), data_bufs.end());
  std::deque<cb::list> batch;

  uint64_t batch_len = 0;
  auto retries = 0;
  bool canceled = true;
  while ((!remaining.empty() || !batch.empty()) &&
	 (retries <= MAX_RACE_RETRIES)) {
    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
		       << " preparing push: remaining=" << remaining.size()
		       << " batch=" << batch.size() << " retries=" << retries
		       << " tid=" << tid << dendl;
    std::unique_lock l(m);
    head_part_num = info.head_part_num;
    auto max_part_size = info.params.max_part_size;
    auto overhead = part_entry_overhead;
    l.unlock();

    // Refill the batch with as many entries as fit in one part.
    while (!remaining.empty() &&
	   (remaining.front().length() + batch_len <= max_part_size)) {
      /* We can send entries with data_len up to max_entry_size,
	 however, we want to also account the overhead when
	 dealing with multiple entries. Previous check doesn't
	 account for overhead on purpose. */
      batch_len += remaining.front().length() + overhead;
      batch.push_back(std::move(remaining.front()));
      remaining.pop_front();
    }
    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
		       << " prepared push: remaining=" << remaining.size()
		       << " batch=" << batch.size() << " retries=" << retries
		       << " batch_len=" << batch_len
		       << " tid=" << tid << dendl;

    auto r = push_entries(dpp, batch, tid, y);
    if (r == -ERANGE) {
      // Head part filled up under us; make a new head and retry the batch.
      canceled = true;
      ++retries;
      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
			 << " need new head tid=" << tid << dendl;
      r = _prepare_new_head(dpp, head_part_num + 1, tid, y);
      if (r < 0) {
	ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
			   << " prepare_new_head failed: r=" << r
			   << " tid=" << tid << dendl;
	return r;
      }
      r = 0;
      continue;
    }
    if (r == -ENOENT) {
      // Part vanished: a racing client trimmed it.  Refresh metadata.
      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
			 << " racing client trimmed part, rereading metadata "
			 << "tid=" << tid << dendl;
      canceled = true;
      ++retries;
      r = read_meta(dpp, y);
      if (r < 0) {
	ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
			   << " read_meta failed: r=" << r
			   << " tid=" << tid << dendl;
	return r;
      }
      r = 0;
      continue;
    }
    if (r < 0) {
      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
			 << " push_entries failed: r=" << r
			 << " tid=" << tid << dendl;
      return r;
    }
    // Made forward progress!
    canceled = false;
    retries = 0;
    batch_len = 0;
    // push_entries returns the number of entries actually pushed: drop
    // those, and re-account the length of any leftovers for the next pass.
    if (r == ssize(batch)) {
      batch.clear();
    } else {
      batch.erase(batch.begin(), batch.begin() + r);
      for (const auto& b : batch) {
	batch_len += b.length() + part_entry_overhead;
      }
    }
  }
  if (canceled) {
    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
		       << " canceled too many times, giving up: tid=" << tid << dendl;
    return -ECANCELED;
  }
  return 0;
}
+
+struct Pusher : public Completion<Pusher> {
+ FIFO* f;
+ std::deque<cb::list> remaining;
+ std::deque<cb::list> batch;
+ int i = 0;
+ std::int64_t head_part_num;
+ std::uint64_t tid;
+ enum { pushing, new_heading, meta_reading } state = pushing;
+
+ void prep_then_push(const DoutPrefixProvider *dpp, Ptr&& p, const unsigned successes) {
+ std::unique_lock l(f->m);
+ auto max_part_size = f->info.params.max_part_size;
+ auto part_entry_overhead = f->part_entry_overhead;
+ head_part_num = f->info.head_part_num;
+ l.unlock();
+
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " preparing push: remaining=" << remaining.size()
+ << " batch=" << batch.size() << " i=" << i
+ << " tid=" << tid << dendl;
+
+ uint64_t batch_len = 0;
+ if (successes > 0) {
+ if (successes == batch.size()) {
+ batch.clear();
+ } else {
+ batch.erase(batch.begin(), batch.begin() + successes);
+ for (const auto& b : batch) {
+ batch_len += b.length() + part_entry_overhead;
+ }
+ }
+ }
+
+ if (batch.empty() && remaining.empty()) {
+ complete(std::move(p), 0);
+ return;
+ }
+
+ while (!remaining.empty() &&
+ (remaining.front().length() + batch_len <= max_part_size)) {
+
+ /* We can send entries with data_len up to max_entry_size,
+ however, we want to also account the overhead when
+ dealing with multiple entries. Previous check doesn't
+ account for overhead on purpose. */
+ batch_len += remaining.front().length() + part_entry_overhead;
+ batch.push_back(std::move(remaining.front()));
+ remaining.pop_front();
+ }
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " prepared push: remaining=" << remaining.size()
+ << " batch=" << batch.size() << " i=" << i
+ << " batch_len=" << batch_len
+ << " tid=" << tid << dendl;
+ push(std::move(p));
+ }
+
+ void push(Ptr&& p) {
+ f->push_entries(batch, tid, call(std::move(p)));
+ }
+
+ void new_head(const DoutPrefixProvider *dpp, Ptr&& p) {
+ state = new_heading;
+ f->_prepare_new_head(dpp, head_part_num + 1, tid, call(std::move(p)));
+ }
+
+ void read_meta(const DoutPrefixProvider *dpp, Ptr&& p) {
+ ++i;
+ state = meta_reading;
+ f->read_meta(dpp, tid, call(std::move(p)));
+ }
+
+ void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
+ switch (state) {
+ case pushing:
+ if (r == -ERANGE) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " need new head tid=" << tid << dendl;
+ new_head(dpp, std::move(p));
+ return;
+ }
+ if (r == -ENOENT) {
+ if (i > MAX_RACE_RETRIES) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " racing client deleted part, but we're out"
+ << " of retries: tid=" << tid << dendl;
+ complete(std::move(p), r);
+ }
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " racing client deleted part: tid=" << tid << dendl;
+ read_meta(dpp, std::move(p));
+ return;
+ }
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " push_entries failed: r=" << r
+ << " tid=" << tid << dendl;
+ complete(std::move(p), r);
+ return;
+ }
+ i = 0; // We've made forward progress, so reset the race counter!
+ prep_then_push(dpp, std::move(p), r);
+ break;
+
+ case new_heading:
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " prepare_new_head failed: r=" << r
+ << " tid=" << tid << dendl;
+ complete(std::move(p), r);
+ return;
+ }
+ state = pushing;
+ handle_new_head(dpp, std::move(p), r);
+ break;
+
+ case meta_reading:
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " read_meta failed: r=" << r
+ << " tid=" << tid << dendl;
+ complete(std::move(p), r);
+ return;
+ }
+ state = pushing;
+ prep_then_push(dpp, std::move(p), r);
+ break;
+ }
+ }
+
+ void handle_new_head(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
+ if (r == -ECANCELED) {
+ if (p->i == MAX_RACE_RETRIES) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " canceled too many times, giving up: tid=" << tid << dendl;
+ complete(std::move(p), -ECANCELED);
+ return;
+ }
+ ++p->i;
+ } else if (r) {
+ complete(std::move(p), r);
+ return;
+ }
+
+ if (p->batch.empty()) {
+ prep_then_push(dpp, std::move(p), 0);
+ return;
+ } else {
+ push(std::move(p));
+ return;
+ }
+ }
+
+ Pusher(const DoutPrefixProvider *dpp, FIFO* f, std::deque<cb::list>&& remaining,
+ std::int64_t head_part_num, std::uint64_t tid,
+ lr::AioCompletion* super)
+ : Completion(dpp, super), f(f), remaining(std::move(remaining)),
+ head_part_num(head_part_num), tid(tid) {}
+};
+
// Asynchronously push a batch of entries onto the FIFO.  Validates entry
// sizes, then hands the work to a Pusher completion which batches and
// retries exactly like the synchronous path.  `c` fires with -E2BIG if an
// entry exceeds max_entry_size, 0 on success (including an empty batch),
// or a negative error code.
void FIFO::push(const DoutPrefixProvider *dpp, const std::vector<cb::list>& data_bufs,
		lr::AioCompletion* c)
{
  // Snapshot the metadata we need under the lock.
  std::unique_lock l(m);
  auto tid = ++next_tid;
  auto max_entry_size = info.params.max_entry_size;
  auto need_new_head = info.need_new_head();
  auto head_part_num = info.head_part_num;
  l.unlock();
  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
		     << " entering: tid=" << tid << dendl;
  auto p = std::make_unique<Pusher>(dpp, this, std::deque<cb::list>(data_bufs.begin(), data_bufs.end()),
				    head_part_num, tid, c);
  // Validate sizes
  for (const auto& bl : data_bufs) {
    if (bl.length() > max_entry_size) {
      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
			 << " entry bigger than max_entry_size tid=" << tid << dendl;
      Pusher::complete(std::move(p), -E2BIG);
      return;
    }
  }

  if (data_bufs.empty() ) {
    // An empty push is trivially successful.
    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
		       << " empty push, returning success tid=" << tid << dendl;
    Pusher::complete(std::move(p), 0);
    return;
  }

  if (need_new_head) {
    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
		       << " need new head tid=" << tid << dendl;
    // p->new_head(...) is safe: the postfix expression is evaluated
    // before its arguments (and thus before p is moved from).
    p->new_head(dpp, std::move(p));
  } else {
    p->prep_then_push(dpp, std::move(p), 0);
  }
}
+
+int FIFO::list(const DoutPrefixProvider *dpp, int max_entries,
+ std::optional<std::string_view> markstr,
+ std::vector<list_entry>* presult, bool* pmore,
+ optional_yield y)
+{
+ std::unique_lock l(m);
+ auto tid = ++next_tid;
+ std::int64_t part_num = info.tail_part_num;
+ l.unlock();
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+ std::uint64_t ofs = 0;
+ if (markstr) {
+ auto marker = to_marker(*markstr);
+ if (!marker) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " invalid marker string: " << markstr
+ << " tid= "<< tid << dendl;
+ return -EINVAL;
+ }
+ part_num = marker->num;
+ ofs = marker->ofs;
+ }
+
+ std::vector<list_entry> result;
+ result.reserve(max_entries);
+ bool more = false;
+
+ std::vector<fifo::part_list_entry> entries;
+ int r = 0;
+ while (max_entries > 0) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " max_entries=" << max_entries << " tid=" << tid << dendl;
+ bool part_more = false;
+ bool part_full = false;
+
+ std::unique_lock l(m);
+ auto part_oid = info.part_oid(part_num);
+ l.unlock();
+
+ r = list_part(dpp, ioctx, part_oid, ofs, max_entries, &entries,
+ &part_more, &part_full, tid, y);
+ if (r == -ENOENT) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " missing part, rereading metadata"
+ << " tid= "<< tid << dendl;
+ r = read_meta(dpp, tid, y);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " read_meta failed: r=" << r
+ << " tid= "<< tid << dendl;
+ return r;
+ }
+ if (part_num < info.tail_part_num) {
+ /* raced with trim? restart */
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " raced with trim, restarting: tid=" << tid << dendl;
+ max_entries += result.size();
+ result.clear();
+ std::unique_lock l(m);
+ part_num = info.tail_part_num;
+ l.unlock();
+ ofs = 0;
+ continue;
+ }
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " assuming part was not written yet, so end of data: "
+ << "tid=" << tid << dendl;
+ more = false;
+ r = 0;
+ break;
+ }
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " list_entries failed: r=" << r
+ << " tid= "<< tid << dendl;
+ return r;
+ }
+ more = part_full || part_more;
+ for (auto& entry : entries) {
+ list_entry e;
+ e.data = std::move(entry.data);
+ e.marker = marker{part_num, entry.ofs}.to_string();
+ e.mtime = entry.mtime;
+ result.push_back(std::move(e));
+ --max_entries;
+ if (max_entries == 0)
+ break;
+ }
+ entries.clear();
+ if (max_entries > 0 &&
+ part_more) {
+ }
+
+ if (!part_full) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " head part is not full, so we can assume we're done: "
+ << "tid=" << tid << dendl;
+ break;
+ }
+ if (!part_more) {
+ ++part_num;
+ ofs = 0;
+ }
+ }
+ if (presult)
+ *presult = std::move(result);
+ if (pmore)
+ *pmore = more;
+ return 0;
+}
+
+int FIFO::trim(const DoutPrefixProvider *dpp, std::string_view markstr, bool exclusive, optional_yield y)
+{
+ bool overshoot = false;
+ auto marker = to_marker(markstr);
+ if (!marker) {
+ return -EINVAL;
+ }
+ auto part_num = marker->num;
+ auto ofs = marker->ofs;
+ std::unique_lock l(m);
+ auto tid = ++next_tid;
+ auto hn = info.head_part_num;
+ const auto max_part_size = info.params.max_part_size;
+ if (part_num > hn) {
+ l.unlock();
+ auto r = read_meta(dpp, tid, y);
+ if (r < 0) {
+ return r;
+ }
+ l.lock();
+ auto hn = info.head_part_num;
+ if (part_num > hn) {
+ overshoot = true;
+ part_num = hn;
+ ofs = max_part_size;
+ }
+ }
+ if (part_num < info.tail_part_num) {
+ return -ENODATA;
+ }
+ auto pn = info.tail_part_num;
+ l.unlock();
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+
+ int r = 0;
+ while (pn < part_num) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " pn=" << pn << " tid=" << tid << dendl;
+ std::unique_lock l(m);
+ l.unlock();
+ r = trim_part(dpp, pn, max_part_size, false, tid, y);
+ if (r < 0 && r == -ENOENT) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " trim_part failed: r=" << r
+ << " tid= "<< tid << dendl;
+ return r;
+ }
+ ++pn;
+ }
+ r = trim_part(dpp, part_num, ofs, exclusive, tid, y);
+ if (r < 0 && r != -ENOENT) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " trim_part failed: r=" << r
+ << " tid= "<< tid << dendl;
+ return r;
+ }
+
+ l.lock();
+ auto tail_part_num = info.tail_part_num;
+ auto objv = info.version;
+ l.unlock();
+ bool canceled = tail_part_num < part_num;
+ int retries = 0;
+ while ((tail_part_num < part_num) &&
+ canceled &&
+ (retries <= MAX_RACE_RETRIES)) {
+ r = _update_meta(dpp, fifo::update{}.tail_part_num(part_num), objv, &canceled,
+ tid, y);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " _update_meta failed: r=" << r
+ << " tid= "<< tid << dendl;
+ return r;
+ }
+ if (canceled) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " canceled: retries=" << retries
+ << " tid=" << tid << dendl;
+ l.lock();
+ tail_part_num = info.tail_part_num;
+ objv = info.version;
+ l.unlock();
+ ++retries;
+ }
+ }
+ if (canceled) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " canceled too many times, giving up: tid=" << tid << dendl;
+ return -EIO;
+ }
+ return overshoot ? -ENODATA : 0;
+}
+
// Asynchronous trimmer: completion-based counterpart of the synchronous
// FIFO::trim().  Trims the parts preceding `part_num` one by one, then
// trims `part_num` itself up to `ofs`, and finally updates the FIFO
// metadata's tail_part_num, retrying races up to MAX_RACE_RETRIES times.
struct Trimmer : public Completion<Trimmer> {
  FIFO* fifo;
  std::int64_t part_num;  // the marker's part (final part to trim)
  std::uint64_t ofs;      // trim offset within the final part
  std::int64_t pn;        // next preceding part to trim fully
  bool exclusive;
  std::uint64_t tid;
  bool update = false;    // next callback completes the final trim / metadata update
  bool reread = false;    // next callback completes a metadata re-read
  bool canceled = false;
  bool overshoot = false; // marker was beyond the head; report -ENODATA
  int retries = 0;

  Trimmer(const DoutPrefixProvider *dpp, FIFO* fifo, std::int64_t part_num, std::uint64_t ofs, std::int64_t pn,
	  bool exclusive, lr::AioCompletion* super, std::uint64_t tid)
    : Completion(dpp, super), fifo(fifo), part_num(part_num), ofs(ofs), pn(pn),
      exclusive(exclusive), tid(tid) {}

  void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
		       << " entering: tid=" << tid << dendl;

    if (reread) {
      // Continuation of a metadata re-read (marker was beyond our head):
      // clamp to the refreshed head and start the trim sequence.
      reread = false;
      if (r < 0) {
	ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
			   << " read_meta failed: r="
			   << r << " tid=" << tid << dendl;
	complete(std::move(p), r);
	return;
      }
      std::unique_lock l(fifo->m);
      auto hn = fifo->info.head_part_num;
      const auto max_part_size = fifo->info.params.max_part_size;
      const auto tail_part_num = fifo->info.tail_part_num;
      l.unlock();
      if (part_num > hn) {
	part_num = hn;
	ofs = max_part_size;
	overshoot = true;
      }
      if (part_num < tail_part_num) {
	complete(std::move(p), -ENODATA);
	return;
      }
      pn = tail_part_num;
      if (pn < part_num) {
	// Trim the first preceding part; handle() is re-entered below.
	ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
			   << " pn=" << pn << " tid=" << tid << dendl;
	fifo->trim_part(dpp, pn++, max_part_size, false, tid,
			call(std::move(p)));
      } else {
	// No preceding parts; go straight to the final trim.
	update = true;
	canceled = tail_part_num < part_num;
	fifo->trim_part(dpp, part_num, ofs, exclusive, tid, call(std::move(p)));
      }
      return;
    }

    // A part that is already gone counts as trimmed.
    if (r == -ENOENT) {
      r = 0;
    }

    if (r < 0) {
      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
			 << (update ? " update_meta " : " trim ") << "failed: r="
			 << r << " tid=" << tid << dendl;
      complete(std::move(p), r);
      return;
    }

    if (!update) {
      // Continuation of a preceding-part trim: trim the next one, or move
      // on to the final part.
      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
			 << " handling preceding trim callback: tid=" << tid << dendl;
      retries = 0;
      if (pn < part_num) {
	ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
			   << " pn=" << pn << " tid=" << tid << dendl;
	std::unique_lock l(fifo->m);
	const auto max_part_size = fifo->info.params.max_part_size;
	l.unlock();
	fifo->trim_part(dpp, pn++, max_part_size, false, tid,
			call(std::move(p)));
	return;
      }

      std::unique_lock l(fifo->m);
      const auto tail_part_num = fifo->info.tail_part_num;
      l.unlock();
      update = true;
      canceled = tail_part_num < part_num;
      fifo->trim_part(dpp, part_num, ofs, exclusive, tid, call(std::move(p)));
      return;
    }

    // Continuation of the final trim / metadata update: advance the tail
    // in the metadata, retrying while racing writers cancel our update.
    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
		       << " handling update-needed callback: tid=" << tid << dendl;
    std::unique_lock l(fifo->m);
    auto tail_part_num = fifo->info.tail_part_num;
    auto objv = fifo->info.version;
    l.unlock();
    if ((tail_part_num < part_num) &&
	canceled) {
      if (retries > MAX_RACE_RETRIES) {
	ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
			   << " canceled too many times, giving up: tid=" << tid << dendl;
	complete(std::move(p), -EIO);
	return;
      }
      ++retries;
      fifo->_update_meta(dpp, fifo::update{}
			 .tail_part_num(part_num), objv, &canceled,
			 tid, call(std::move(p)));
    } else {
      complete(std::move(p), overshoot ? -ENODATA : 0);
    }
  }
};
+
+void FIFO::trim(const DoutPrefixProvider *dpp, std::string_view markstr, bool exclusive,
+ lr::AioCompletion* c) {
+ auto marker = to_marker(markstr);
+ auto realmark = marker.value_or(::rgw::cls::fifo::marker{});
+ std::unique_lock l(m);
+ const auto hn = info.head_part_num;
+ const auto max_part_size = info.params.max_part_size;
+ const auto pn = info.tail_part_num;
+ const auto part_oid = info.part_oid(pn);
+ auto tid = ++next_tid;
+ l.unlock();
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+ auto trimmer = std::make_unique<Trimmer>(dpp, this, realmark.num, realmark.ofs,
+ pn, exclusive, c, tid);
+ if (!marker) {
+ Trimmer::complete(std::move(trimmer), -EINVAL);
+ return;
+ }
+ ++trimmer->pn;
+ auto ofs = marker->ofs;
+ if (marker->num > hn) {
+ trimmer->reread = true;
+ read_meta(dpp, tid, Trimmer::call(std::move(trimmer)));
+ return;
+ }
+ if (pn < marker->num) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " pn=" << pn << " tid=" << tid << dendl;
+ ofs = max_part_size;
+ } else {
+ trimmer->update = true;
+ }
+ trim_part(dpp, pn, ofs, exclusive, tid, Trimmer::call(std::move(trimmer)));
+}
+
+int FIFO::get_part_info(const DoutPrefixProvider *dpp, int64_t part_num,
+ fifo::part_header* header,
+ optional_yield y)
+{
+ std::unique_lock l(m);
+ const auto part_oid = info.part_oid(part_num);
+ auto tid = ++next_tid;
+ l.unlock();
+ auto r = rgw::cls::fifo::get_part_info(dpp, ioctx, part_oid, header, tid, y);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " get_part_info failed: r="
+ << r << " tid=" << tid << dendl;
+ }
+ return r;
+}
+
+void FIFO::get_part_info(int64_t part_num,
+ fifo::part_header* header,
+ lr::AioCompletion* c)
+{
+ std::unique_lock l(m);
+ const auto part_oid = info.part_oid(part_num);
+ auto tid = ++next_tid;
+ l.unlock();
+ auto op = rgw::cls::fifo::get_part_info(cct, header, tid);
+ auto r = ioctx.aio_operate(part_oid, c, &op, nullptr);
+ ceph_assert(r >= 0);
+}
+
// Two-step completion used by FIFO::get_head_info(): first a metadata
// read finishes, then a get_part_info on the current head part; finally
// the user callback `f` is invoked with the result and the part header.
struct InfoGetter : Completion<InfoGetter> {
  FIFO* fifo;
  fifo::part_header header;
  fu2::function<void(int r, fifo::part_header&&)> f;
  std::uint64_t tid;
  bool headerread = false; // false: metadata-read callback; true: part-info callback

  InfoGetter(const DoutPrefixProvider *dpp, FIFO* fifo, fu2::function<void(int r, fifo::part_header&&)> f,
	     std::uint64_t tid, lr::AioCompletion* super)
    : Completion(dpp, super), fifo(fifo), f(std::move(f)), tid(tid) {}
  void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
    if (!headerread) {
      // Continuation of the metadata read.
      if (r < 0) {
	ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
			   << " read_meta failed: r="
			   << r << " tid=" << tid << dendl;
	if (f)
	  f(r, {});
	complete(std::move(p), r);
	return;
      }

      auto info = fifo->meta();
      auto hpn = info.head_part_num;
      if (hpn < 0) {
	// No head part exists yet: report success with an empty header.
	ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
			   << " no head, returning empty partinfo r="
			   << r << " tid=" << tid << dendl;
	if (f)
	  f(0, {});
	complete(std::move(p), r);
	return;
      }
      // Issue the part-info read on the head part; handle() re-enters
      // with headerread == true.
      headerread = true;
      auto op = rgw::cls::fifo::get_part_info(fifo->cct, &header, tid);
      std::unique_lock l(fifo->m);
      auto oid = fifo->info.part_oid(hpn);
      l.unlock();
      r = fifo->ioctx.aio_operate(oid, call(std::move(p)), &op,
				  nullptr);
      ceph_assert(r >= 0);
      return;
    }

    // Continuation of the part-info read: hand the header to the user
    // callback and finish.
    if (r < 0) {
      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
			 << " get_part_info failed: r="
			 << r << " tid=" << tid << dendl;
    }

    if (f)
      f(r, std::move(header));
    complete(std::move(p), r);
    return;
  }
};
+
+void FIFO::get_head_info(const DoutPrefixProvider *dpp, fu2::unique_function<void(int r,
+ fifo::part_header&&)> f,
+ lr::AioCompletion* c)
+{
+ std::unique_lock l(m);
+ auto tid = ++next_tid;
+ l.unlock();
+ auto ig = std::make_unique<InfoGetter>(dpp, this, std::move(f), tid, c);
+ read_meta(dpp, tid, InfoGetter::call(std::move(ig)));
+}
+
+struct JournalProcessor : public Completion<JournalProcessor> {
+private:
+ FIFO* const fifo;
+
+ std::vector<fifo::journal_entry> processed;
+ decltype(fifo->info.journal) journal;
+ decltype(journal)::iterator iter;
+ std::int64_t new_tail;
+ std::int64_t new_head;
+ std::int64_t new_max;
+ int race_retries = 0;
+ bool first_pp = true;
+ bool canceled = false;
+ std::uint64_t tid;
+
+ enum {
+ entry_callback,
+ pp_callback,
+ } state;
+
+ void create_part(const DoutPrefixProvider *dpp, Ptr&& p, int64_t part_num) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+ state = entry_callback;
+ lr::ObjectWriteOperation op;
+ op.create(false); /* We don't need exclusivity, part_init ensures
+ we're creating from the same journal entry. */
+ std::unique_lock l(fifo->m);
+ part_init(&op, fifo->info.params);
+ auto oid = fifo->info.part_oid(part_num);
+ l.unlock();
+ auto r = fifo->ioctx.aio_operate(oid, call(std::move(p)), &op);
+ ceph_assert(r >= 0);
+ return;
+ }
+
+ void remove_part(const DoutPrefixProvider *dpp, Ptr&& p, int64_t part_num) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+ state = entry_callback;
+ lr::ObjectWriteOperation op;
+ op.remove();
+ std::unique_lock l(fifo->m);
+ auto oid = fifo->info.part_oid(part_num);
+ l.unlock();
+ auto r = fifo->ioctx.aio_operate(oid, call(std::move(p)), &op);
+ ceph_assert(r >= 0);
+ return;
+ }
+
+ void finish_je(const DoutPrefixProvider *dpp, Ptr&& p, int r,
+ const fifo::journal_entry& entry) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " finishing entry: entry=" << entry
+ << " tid=" << tid << dendl;
+
+ using enum fifo::journal_entry::Op;
+ if (entry.op == remove && r == -ENOENT)
+ r = 0;
+
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " processing entry failed: entry=" << entry
+ << " r=" << r << " tid=" << tid << dendl;
+ complete(std::move(p), r);
+ return;
+ } else {
+ switch (entry.op) {
+ case unknown:
+ case set_head:
+ // Can't happen. Filtered out in process.
+ complete(std::move(p), -EIO);
+ return;
+
+ case create:
+ if (entry.part_num > new_max) {
+ new_max = entry.part_num;
+ }
+ break;
+ case remove:
+ if (entry.part_num >= new_tail) {
+ new_tail = entry.part_num + 1;
+ }
+ break;
+ }
+ processed.push_back(entry);
+ }
+ ++iter;
+ process(dpp, std::move(p));
+ }
+
+ void postprocess(const DoutPrefixProvider *dpp, Ptr&& p) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+ if (processed.empty()) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " nothing to update any more: race_retries="
+ << race_retries << " tid=" << tid << dendl;
+ complete(std::move(p), 0);
+ return;
+ }
+ pp_run(dpp, std::move(p), 0, false);
+ }
+
+public:
+
+ JournalProcessor(const DoutPrefixProvider *dpp, FIFO* fifo, std::uint64_t tid, lr::AioCompletion* super)
+ : Completion(dpp, super), fifo(fifo), tid(tid) {
+ std::unique_lock l(fifo->m);
+ journal = fifo->info.journal;
+ iter = journal.begin();
+ new_tail = fifo->info.tail_part_num;
+ new_head = fifo->info.head_part_num;
+ new_max = fifo->info.max_push_part_num;
+ }
+
+ void pp_run(const DoutPrefixProvider *dpp, Ptr&& p, int r, bool canceled) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+ std::optional<int64_t> tail_part_num;
+ std::optional<int64_t> head_part_num;
+ std::optional<int64_t> max_part_num;
+
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " failed, r=: " << r << " tid=" << tid << dendl;
+ complete(std::move(p), r);
+ }
+
+
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " postprocessing: race_retries="
+ << race_retries << " tid=" << tid << dendl;
+
+ if (!first_pp && r == 0 && !canceled) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " nothing to update any more: race_retries="
+ << race_retries << " tid=" << tid << dendl;
+ complete(std::move(p), 0);
+ return;
+ }
+
+ first_pp = false;
+
+ if (canceled) {
+ if (race_retries >= MAX_RACE_RETRIES) {
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " canceled too many times, giving up: tid="
+ << tid << dendl;
+ complete(std::move(p), -ECANCELED);
+ return;
+ }
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " update canceled, retrying: race_retries="
+ << race_retries << " tid=" << tid << dendl;
+
+ ++race_retries;
+
+ std::vector<fifo::journal_entry> new_processed;
+ std::unique_lock l(fifo->m);
+ for (auto& e : processed) {
+ if (fifo->info.journal.contains(e)) {
+ new_processed.push_back(e);
+ }
+ }
+ processed = std::move(new_processed);
+ }
+
+ std::unique_lock l(fifo->m);
+ auto objv = fifo->info.version;
+ if (new_tail > fifo->info.tail_part_num) {
+ tail_part_num = new_tail;
+ }
+
+ if (new_head > fifo->info.head_part_num) {
+ head_part_num = new_head;
+ }
+
+ if (new_max > fifo->info.max_push_part_num) {
+ max_part_num = new_max;
+ }
+ l.unlock();
+
+ if (processed.empty() &&
+ !tail_part_num &&
+ !max_part_num) {
+ /* nothing to update anymore */
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " nothing to update any more: race_retries="
+ << race_retries << " tid=" << tid << dendl;
+ complete(std::move(p), 0);
+ return;
+ }
+ state = pp_callback;
+ fifo->_update_meta(dpp, fifo::update{}
+ .tail_part_num(tail_part_num)
+ .head_part_num(head_part_num)
+ .max_push_part_num(max_part_num)
+ .journal_entries_rm(processed),
+ objv, &this->canceled, tid, call(std::move(p)));
+ return;
+ }
+
+ JournalProcessor(const JournalProcessor&) = delete;
+ JournalProcessor& operator =(const JournalProcessor&) = delete;
+ JournalProcessor(JournalProcessor&&) = delete;
+ JournalProcessor& operator =(JournalProcessor&&) = delete;
+
+ void process(const DoutPrefixProvider *dpp, Ptr&& p) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+ while (iter != journal.end()) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " processing entry: entry=" << *iter
+ << " tid=" << tid << dendl;
+ const auto entry = *iter;
+ switch (entry.op) {
+ using enum fifo::journal_entry::Op;
+ case create:
+ create_part(dpp, std::move(p), entry.part_num);
+ return;
+ case set_head:
+ if (entry.part_num > new_head) {
+ new_head = entry.part_num;
+ }
+ processed.push_back(entry);
+ ++iter;
+ continue;
+ case remove:
+ remove_part(dpp, std::move(p), entry.part_num);
+ return;
+ default:
+ ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " unknown journaled op: entry=" << entry << " tid="
+ << tid << dendl;
+ complete(std::move(p), -EIO);
+ return;
+ }
+ }
+ postprocess(dpp, std::move(p));
+ return;
+ }
+
+ void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
+ ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " entering: tid=" << tid << dendl;
+ switch (state) {
+ case entry_callback:
+ finish_je(dpp, std::move(p), r, *iter);
+ return;
+ case pp_callback:
+ auto c = canceled;
+ canceled = false;
+ pp_run(dpp, std::move(p), r, c);
+ return;
+ }
+
+ abort();
+ }
+
+};
+
+void FIFO::process_journal(const DoutPrefixProvider *dpp, std::uint64_t tid, lr::AioCompletion* c) {
+ auto p = std::make_unique<JournalProcessor>(dpp, this, tid, c);
+ p->process(dpp, std::move(p));
+}
+
+struct Lister : Completion<Lister> {
+ FIFO* f;
+ std::vector<list_entry> result;
+ bool more = false;
+ std::int64_t part_num;
+ std::uint64_t ofs;
+ int max_entries;
+ int r_out = 0;
+ std::vector<fifo::part_list_entry> entries;
+ bool part_more = false;
+ bool part_full = false;
+ std::vector<list_entry>* entries_out;
+ bool* more_out;
+ std::uint64_t tid;
+
+ bool read = false;
+
+ void complete(Ptr&& p, int r) {
+ if (r >= 0) {
+ if (more_out) *more_out = more;
+ if (entries_out) *entries_out = std::move(result);
+ }
+ Completion::complete(std::move(p), r);
+ }
+
+public:
+ Lister(const DoutPrefixProvider *dpp, FIFO* f, std::int64_t part_num, std::uint64_t ofs, int max_entries,
+ std::vector<list_entry>* entries_out, bool* more_out,
+ std::uint64_t tid, lr::AioCompletion* super)
+ : Completion(dpp, super), f(f), part_num(part_num), ofs(ofs), max_entries(max_entries),
+ entries_out(entries_out), more_out(more_out), tid(tid) {
+ result.reserve(max_entries);
+ }
+
+ Lister(const Lister&) = delete;
+ Lister& operator =(const Lister&) = delete;
+ Lister(Lister&&) = delete;
+ Lister& operator =(Lister&&) = delete;
+
+ void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
+ if (read)
+ handle_read(std::move(p), r);
+ else
+ handle_list(dpp, std::move(p), r);
+ }
+
+ void list(Ptr&& p) {
+ if (max_entries > 0) {
+ part_more = false;
+ part_full = false;
+ entries.clear();
+
+ std::unique_lock l(f->m);
+ auto part_oid = f->info.part_oid(part_num);
+ l.unlock();
+
+ read = false;
+ auto op = list_part(f->cct, ofs, max_entries, &r_out,
+ &entries, &part_more, &part_full, tid);
+ f->ioctx.aio_operate(part_oid, call(std::move(p)), &op, nullptr);
+ } else {
+ complete(std::move(p), 0);
+ }
+ }
+
+ void handle_read(Ptr&& p, int r) {
+ read = false;
+ if (r >= 0) r = r_out;
+ r_out = 0;
+
+ if (r < 0) {
+ complete(std::move(p), r);
+ return;
+ }
+
+ if (part_num < f->info.tail_part_num) {
+ /* raced with trim? restart */
+ max_entries += result.size();
+ result.clear();
+ part_num = f->info.tail_part_num;
+ ofs = 0;
+ list(std::move(p));
+ return;
+ }
+ /* assuming part was not written yet, so end of data */
+ more = false;
+ complete(std::move(p), 0);
+ return;
+ }
+
+ void handle_list(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
+ if (r >= 0) r = r_out;
+ r_out = 0;
+ std::unique_lock l(f->m);
+ auto part_oid = f->info.part_oid(part_num);
+ l.unlock();
+ if (r == -ENOENT) {
+ read = true;
+ f->read_meta(dpp, tid, call(std::move(p)));
+ return;
+ }
+ if (r < 0) {
+ complete(std::move(p), r);
+ return;
+ }
+
+ more = part_full || part_more;
+ for (auto& entry : entries) {
+ list_entry e;
+ e.data = std::move(entry.data);
+ e.marker = marker{part_num, entry.ofs}.to_string();
+ e.mtime = entry.mtime;
+ result.push_back(std::move(e));
+ }
+ max_entries -= entries.size();
+ entries.clear();
+ if (max_entries > 0 && part_more) {
+ list(std::move(p));
+ return;
+ }
+
+ if (!part_full) { /* head part is not full */
+ complete(std::move(p), 0);
+ return;
+ }
+ ++part_num;
+ ofs = 0;
+ list(std::move(p));
+ }
+};
+
+void FIFO::list(const DoutPrefixProvider *dpp, int max_entries,
+ std::optional<std::string_view> markstr,
+ std::vector<list_entry>* out,
+ bool* more,
+ lr::AioCompletion* c) {
+ std::unique_lock l(m);
+ auto tid = ++next_tid;
+ std::int64_t part_num = info.tail_part_num;
+ l.unlock();
+ std::uint64_t ofs = 0;
+ std::optional<::rgw::cls::fifo::marker> marker;
+
+ if (markstr) {
+ marker = to_marker(*markstr);
+ if (marker) {
+ part_num = marker->num;
+ ofs = marker->ofs;
+ }
+ }
+
+ auto ls = std::make_unique<Lister>(dpp, this, part_num, ofs, max_entries, out,
+ more, tid, c);
+ if (markstr && !marker) {
+ auto l = ls.get();
+ l->complete(std::move(ls), -EINVAL);
+ } else {
+ ls->list(std::move(ls));
+ }
+}
+}
diff --git a/src/rgw/driver/rados/cls_fifo_legacy.h b/src/rgw/driver/rados/cls_fifo_legacy.h
new file mode 100644
index 000000000..b0a68157e
--- /dev/null
+++ b/src/rgw/driver/rados/cls_fifo_legacy.h
@@ -0,0 +1,334 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 Red Hat <contact@redhat.com>
+ * Author: Adam C. Emerson
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <deque>
+#include <limits>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include <fmt/format.h>
+
+#include "include/rados/librados.hpp"
+#include "include/buffer.h"
+#include "include/function2.hpp"
+
+#include "common/async/yield_context.h"
+
+#include "cls/fifo/cls_fifo_types.h"
+#include "cls/fifo/cls_fifo_ops.h"
+
+#include "librados/AioCompletionImpl.h"
+
+#include "rgw_tools.h"
+
+namespace rgw::cls::fifo {
+namespace cb = ceph::buffer;
+namespace fifo = rados::cls::fifo;
+namespace lr = librados;
+
+inline constexpr std::uint64_t default_max_part_size = 4 * 1024 * 1024;
+inline constexpr std::uint64_t default_max_entry_size = 32 * 1024;
+
+/// Append the cls ops that create a FIFO metadata object to *op.
+void create_meta(lr::ObjectWriteOperation* op, std::string_view id,
+ std::optional<fifo::objv> objv,
+ std::optional<std::string_view> oid_prefix,
+ bool exclusive = false,
+ std::uint64_t max_part_size = default_max_part_size,
+ std::uint64_t max_entry_size = default_max_entry_size);
+/// Fetch FIFO metadata (and part layout sizes) from the metadata object,
+/// optionally requiring a specific object version. Returns negative errno.
+int get_meta(const DoutPrefixProvider *dpp, lr::IoCtx& ioctx, const std::string& oid,
+ std::optional<fifo::objv> objv, fifo::info* info,
+ std::uint32_t* part_header_size,
+ std::uint32_t* part_entry_overhead,
+ std::uint64_t tid, optional_yield y,
+ bool probe = false);
+/// A position within the FIFO: part number plus byte offset into that part.
+struct marker {
+  std::int64_t num = 0;
+  std::uint64_t ofs = 0;
+
+  marker() = default;
+  marker(std::int64_t num, std::uint64_t ofs) : num(num), ofs(ofs) {}
+  /// Sentinel marker that orders after every real position.
+  static marker max() {
+    return { std::numeric_limits<decltype(num)>::max(),
+	     std::numeric_limits<decltype(ofs)>::max() };
+  }
+
+  /// Render as "num:ofs" with each field zero-padded to 20 digits, so the
+  /// string ordering matches the numeric ordering. Const-qualified so it
+  /// can be called on const references and temporaries alike.
+  std::string to_string() const {
+    return fmt::format("{:0>20}:{:0>20}", num, ofs);
+  }
+};
+
+/// One entry returned by FIFO::list().
+struct list_entry {
+ /// entry payload as pushed
+ cb::list data;
+ /// opaque position string for resuming list() or trimming (see marker)
+ std::string marker;
+ /// entry modification time as reported by the part listing
+ ceph::real_time mtime;
+};
+
+using part_info = fifo::part_header;
+
+/// This is an implementation of FIFO using librados to facilitate
+/// backports. Please see /src/neorados/cls/fifo.h for full
+/// information.
+///
+/// This library uses optional_yield. Please see
+/// /src/common/async/yield_context.h. In summary, optional_yield
+/// contains either a spawn::yield_context (in which case the current
+/// coroutine is suspended until completion) or null_yield (in which
+/// case the current thread is blocked until completion.)
+///
+/// Please see the librados documentation for information on
+/// AioCompletion and IoCtx.
+
+class FIFO {
+ /* Asynchronous completion state machines defined in the .cc; they need
+  * access to the private _-prefixed primitives below. */
+ friend struct Reader;
+ friend struct Updater;
+ friend struct Trimmer;
+ friend struct InfoGetter;
+ friend struct Pusher;
+ friend struct NewPartPreparer;
+ friend struct NewHeadPreparer;
+ friend struct JournalProcessor;
+ friend struct Lister;
+
+ mutable lr::IoCtx ioctx;
+ CephContext* cct = static_cast<CephContext*>(ioctx.cct());
+ const std::string oid;
+ /* m guards info and next_tid. */
+ std::mutex m;
+ /* Monotonic id handed to each logical operation, for log correlation. */
+ std::uint64_t next_tid = 0;
+
+ /* Cached copy of the FIFO metadata; refreshed by read_meta(). */
+ fifo::info info;
+
+ /* 0xdeadbeef is a poison value, overwritten with real sizes when the
+  * metadata is first fetched. */
+ std::uint32_t part_header_size = 0xdeadbeef;
+ std::uint32_t part_entry_overhead = 0xdeadbeef;
+
+ /* Parse a marker string produced by marker::to_string(); nullopt on
+  * malformed input. */
+ std::optional<marker> to_marker(std::string_view s);
+
+ /* Construction happens only through open()/create(). */
+ FIFO(lr::IoCtx&& ioc,
+ std::string oid)
+ : ioctx(std::move(ioc)), oid(oid) {}
+
+ int apply_update(const DoutPrefixProvider *dpp,
+ fifo::info* info,
+ const fifo::objv& objv,
+ const fifo::update& update,
+ std::uint64_t tid);
+ /* Each primitive below comes in two flavors: a synchronous form taking
+  * optional_yield and an asynchronous form taking an AioCompletion. */
+ int _update_meta(const DoutPrefixProvider *dpp, const fifo::update& update,
+ fifo::objv version, bool* pcanceled,
+ std::uint64_t tid, optional_yield y);
+ void _update_meta(const DoutPrefixProvider *dpp, const fifo::update& update,
+ fifo::objv version, bool* pcanceled,
+ std::uint64_t tid, lr::AioCompletion* c);
+ int create_part(const DoutPrefixProvider *dpp, int64_t part_num, std::uint64_t tid,
+ optional_yield y);
+ int remove_part(const DoutPrefixProvider *dpp, int64_t part_num, std::uint64_t tid,
+ optional_yield y);
+ int process_journal(const DoutPrefixProvider *dpp, std::uint64_t tid, optional_yield y);
+ void process_journal(const DoutPrefixProvider *dpp, std::uint64_t tid, lr::AioCompletion* c);
+ int _prepare_new_part(const DoutPrefixProvider *dpp, std::int64_t new_part_num, bool is_head, std::uint64_t tid, optional_yield y);
+ void _prepare_new_part(const DoutPrefixProvider *dpp, std::int64_t new_part_num, bool is_head, std::uint64_t tid, lr::AioCompletion* c);
+ int _prepare_new_head(const DoutPrefixProvider *dpp, std::int64_t new_head_part_num,
+ std::uint64_t tid, optional_yield y);
+ void _prepare_new_head(const DoutPrefixProvider *dpp, std::int64_t new_head_part_num, std::uint64_t tid, lr::AioCompletion* c);
+ int push_entries(const DoutPrefixProvider *dpp, const std::deque<cb::list>& data_bufs,
+ std::uint64_t tid, optional_yield y);
+ void push_entries(const std::deque<cb::list>& data_bufs,
+ std::uint64_t tid, lr::AioCompletion* c);
+ int trim_part(const DoutPrefixProvider *dpp, int64_t part_num, uint64_t ofs,
+ bool exclusive, std::uint64_t tid, optional_yield y);
+ void trim_part(const DoutPrefixProvider *dpp, int64_t part_num, uint64_t ofs,
+ bool exclusive, std::uint64_t tid, lr::AioCompletion* c);
+
+ /// Force refresh of metadata, yielding/blocking style
+ int read_meta(const DoutPrefixProvider *dpp, std::uint64_t tid, optional_yield y);
+ /// Force refresh of metadata, with a librados Completion
+ void read_meta(const DoutPrefixProvider *dpp, std::uint64_t tid, lr::AioCompletion* c);
+
+public:
+
+ /* Non-copyable and non-movable: completions hold raw back-pointers to
+  * this object. */
+ FIFO(const FIFO&) = delete;
+ FIFO& operator =(const FIFO&) = delete;
+ FIFO(FIFO&&) = delete;
+ FIFO& operator =(FIFO&&) = delete;
+
+ /// Open an existing FIFO.
+ static int open(const DoutPrefixProvider *dpp, lr::IoCtx ioctx, //< IO Context
+ std::string oid, //< OID for metadata object
+ std::unique_ptr<FIFO>* fifo, //< OUT: Pointer to FIFO object
+ optional_yield y, //< Optional yield context
+ /// Operation will fail if FIFO is not at this version
+ std::optional<fifo::objv> objv = std::nullopt,
+ /// Probing for existence, don't print errors if we
+ /// can't find it.
+ bool probe = false);
+ /// Create a new or open an existing FIFO.
+ static int create(const DoutPrefixProvider *dpp, lr::IoCtx ioctx, //< IO Context
+ std::string oid, //< OID for metadata object
+ std::unique_ptr<FIFO>* fifo, //< OUT: Pointer to FIFO object
+ optional_yield y, //< Optional yield context
+ /// Operation will fail if the FIFO exists and is
+ /// not of this version.
+ std::optional<fifo::objv> objv = std::nullopt,
+ /// Prefix for all objects
+ std::optional<std::string_view> oid_prefix = std::nullopt,
+ /// Fail if the FIFO already exists
+ bool exclusive = false,
+ /// Maximum allowed size of parts
+ std::uint64_t max_part_size = default_max_part_size,
+ /// Maximum allowed size of entries
+ std::uint64_t max_entry_size = default_max_entry_size);
+
+ /// Force refresh of metadata, yielding/blocking style
+ int read_meta(const DoutPrefixProvider *dpp, optional_yield y);
+ /// Get currently known metadata
+ const fifo::info& meta() const;
+ /// Get partition header and entry overhead size
+ std::pair<std::uint32_t, std::uint32_t> get_part_layout_info() const;
+ /// Push an entry to the FIFO
+ int push(const DoutPrefixProvider *dpp,
+ const cb::list& bl, //< Entry to push
+ optional_yield y //< Optional yield
+ );
+ /// Push an entry to the FIFO
+ void push(const DoutPrefixProvider *dpp, const cb::list& bl, //< Entry to push
+ lr::AioCompletion* c //< Async Completion
+ );
+ /// Push entries to the FIFO
+ int push(const DoutPrefixProvider *dpp,
+ const std::vector<cb::list>& data_bufs, //< Entries to push
+ optional_yield y //< Optional yield
+ );
+ /// Push entries to the FIFO
+ void push(const DoutPrefixProvider *dpp, const std::vector<cb::list>& data_bufs, //< Entries to push
+ lr::AioCompletion* c //< Async Completion
+ );
+ /// List entries
+ int list(const DoutPrefixProvider *dpp,
+ int max_entries, //< Maximum entries to list
+ /// Point after which to begin listing. Start at tail if null
+ std::optional<std::string_view> markstr,
+ std::vector<list_entry>* out, //< OUT: entries
+ /// OUT: True if more entries in FIFO beyond the last returned
+ bool* more,
+ optional_yield y //< Optional yield
+ );
+ void list(const DoutPrefixProvider *dpp,
+ int max_entries, //< Maximum entries to list
+ /// Point after which to begin listing. Start at tail if null
+ std::optional<std::string_view> markstr,
+ std::vector<list_entry>* out, //< OUT: entries
+ /// OUT: True if more entries in FIFO beyond the last returned
+ bool* more,
+ lr::AioCompletion* c //< Async Completion
+ );
+ /// Trim entries, coroutine/block style
+ int trim(const DoutPrefixProvider *dpp,
+ std::string_view markstr, //< Position to which to trim, inclusive
+ bool exclusive, //< If true, do not trim the target entry
+ //< itself, just all those before it.
+ optional_yield y //< Optional yield
+ );
+ /// Trim entries, librados AioCompletion style
+ void trim(const DoutPrefixProvider *dpp,
+ std::string_view markstr, //< Position to which to trim, inclusive
+ bool exclusive, //< If true, do not trim the target entry
+ //< itself, just all those before it.
+ lr::AioCompletion* c //< librados AIO Completion
+ );
+ /// Get part info
+ int get_part_info(const DoutPrefixProvider *dpp, int64_t part_num, /// Part number
+ fifo::part_header* header, //< OUT: Information
+ optional_yield y //< Optional yield
+ );
+ /// Get part info
+ void get_part_info(int64_t part_num, //< Part number
+ fifo::part_header* header, //< OUT: Information
+ lr::AioCompletion* c //< AIO Completion
+ );
+ /// A convenience method to fetch the part information for the FIFO
+ /// head, using librados::AioCompletion, since
+ /// libradio::AioCompletions compose lousily.
+ void get_head_info(const DoutPrefixProvider *dpp, fu2::unique_function< //< Function to receive info
+ void(int r, fifo::part_header&&)>,
+ lr::AioCompletion* c //< AIO Completion
+ );
+};
+
+/* CRTP-style base for chained librados completions. T derives from
+ * Completion<T> and defines handle(dpp, Ptr&&, int); each async step is
+ * launched with call() (which hands ownership of the state object to the
+ * librados callback) and finished with complete() (which fires the
+ * caller-supplied "super" completion). Lifetime is managed manually via
+ * unique_ptr release/reacquire — treat with care. */
+template<typename T>
+struct Completion {
+private:
+ const DoutPrefixProvider *_dpp;
+ /* Completion for the step currently in flight, if any. */
+ lr::AioCompletion* _cur = nullptr;
+ /* The caller's completion, signaled once the whole chain finishes. */
+ lr::AioCompletion* _super;
+public:
+
+ using Ptr = std::unique_ptr<T>;
+
+ lr::AioCompletion* cur() const {
+ return _cur;
+ }
+ lr::AioCompletion* super() const {
+ return _super;
+ }
+
+ Completion(const DoutPrefixProvider *dpp, lr::AioCompletion* super) : _dpp(dpp), _super(super) {
+ /* Pin the caller's completion for the lifetime of this state object. */
+ super->pc->get();
+ }
+
+ ~Completion() {
+ if (_super) {
+ _super->pc->put();
+ }
+ if (_cur)
+ _cur->release();
+ _super = nullptr;
+ _cur = nullptr;
+ }
+
+ // The only times that aio_operate can return an error are:
+ // 1. The completion contains a null pointer. This should just
+ // crash, and in our case it does.
+ // 2. An attempt is made to write to a snapshot. RGW doesn't use
+ // snapshots, so we don't care.
+ //
+ // So we will just assert that initiating an Aio operation succeeds
+ // and not worry about recovering.
+ /* Mint a completion for the next async step. Ownership of *p passes to
+  * the librados callback (p.release()); cb() reconstructs the Ptr. */
+ static lr::AioCompletion* call(Ptr&& p) {
+ p->_cur = lr::Rados::aio_create_completion(static_cast<void*>(p.get()),
+ &cb);
+ auto c = p->_cur;
+ p.release();
+ return c;
+ }
+ /* Finish the chain: signal the caller's completion with r and let the
+  * state object destroy itself (p goes out of scope). _super is cleared
+  * first so the destructor does not double-release it. */
+ static void complete(Ptr&& p, int r) {
+ auto c = p->_super;
+ p->_super = nullptr;
+ rgw_complete_aio_completion(c, r);
+ }
+
+ /* librados callback: reclaim ownership into a Ptr and dispatch to the
+  * derived class's handle() with the step's return value. */
+ static void cb(lr::completion_t, void* arg) {
+ auto t = static_cast<T*>(arg);
+ auto r = t->_cur->get_return_value();
+ t->_cur->release();
+ t->_cur = nullptr;
+ t->handle(t->_dpp, Ptr(t), r);
+ }
+};
+
+}
diff --git a/src/rgw/driver/rados/config/impl.cc b/src/rgw/driver/rados/config/impl.cc
new file mode 100644
index 000000000..f1b2befad
--- /dev/null
+++ b/src/rgw/driver/rados/config/impl.cc
@@ -0,0 +1,129 @@
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "impl.h"
+
+#include "common/async/yield_context.h"
+#include "common/errno.h"
+#include "rgw_string.h"
+#include "rgw_zone.h"
+
+namespace rgw::rados {
+
+// default pool names
+constexpr std::string_view default_zone_root_pool = "rgw.root";
+constexpr std::string_view default_zonegroup_root_pool = "rgw.root";
+constexpr std::string_view default_realm_root_pool = "rgw.root";
+constexpr std::string_view default_period_root_pool = "rgw.root";
+
+// Resolve a configured pool name, falling back to default_name when the
+// config value is empty.
+static rgw_pool default_pool(std::string_view name,
+ std::string_view default_name)
+{
+ return std::string{name_or_default(name, default_name)};
+}
+
+// Resolve the four config-object pools from ceph.conf, defaulting each to
+// "rgw.root" when unset.
+ConfigImpl::ConfigImpl(const ceph::common::ConfigProxy& conf)
+ : realm_pool(default_pool(conf->rgw_realm_root_pool,
+ default_realm_root_pool)),
+ period_pool(default_pool(conf->rgw_period_root_pool,
+ default_period_root_pool)),
+ zonegroup_pool(default_pool(conf->rgw_zonegroup_root_pool,
+ default_zonegroup_root_pool)),
+ zone_pool(default_pool(conf->rgw_zone_root_pool,
+ default_zone_root_pool))
+{
+}
+
+// Read the full contents of pool:oid into bl, optionally asserting the
+// tracked object version first. Returns 0 or a negative errno.
+int ConfigImpl::read(const DoutPrefixProvider* dpp, optional_yield y,
+ const rgw_pool& pool, const std::string& oid,
+ bufferlist& bl, RGWObjVersionTracker* objv)
+{
+ librados::IoCtx ioctx;
+ int r = rgw_init_ioctx(dpp, &rados, pool, ioctx, true, false);
+ if (r < 0) {
+ return r;
+ }
+ librados::ObjectReadOperation op;
+ if (objv) {
+ objv->prepare_op_for_read(&op);
+ }
+ // len=0, off=0 reads the whole object
+ op.read(0, 0, &bl, nullptr);
+ return rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, y);
+}
+
+// Overwrite pool:oid with bl. The Create mode selects the existence
+// precondition (see enum Create); objv, when given, adds a version assert
+// and is advanced locally on success.
+int ConfigImpl::write(const DoutPrefixProvider* dpp, optional_yield y,
+ const rgw_pool& pool, const std::string& oid,
+ Create create, const bufferlist& bl,
+ RGWObjVersionTracker* objv)
+{
+ librados::IoCtx ioctx;
+ int r = rgw_init_ioctx(dpp, &rados, pool, ioctx, true, false);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::ObjectWriteOperation op;
+ switch (create) {
+ case Create::MustNotExist: op.create(true); break;
+ case Create::MayExist: op.create(false); break;
+ case Create::MustExist: op.assert_exists(); break;
+ }
+ if (objv) {
+ objv->prepare_op_for_write(&op);
+ }
+ op.write_full(bl);
+
+ r = rgw_rados_operate(dpp, ioctx, oid, &op, y);
+ if (r >= 0 && objv) {
+ // keep the local version tracker in sync with what we just wrote
+ objv->apply_write();
+ }
+ return r;
+}
+
+// Delete pool:oid, optionally guarded by the tracked object version.
+// Returns 0 or a negative errno.
+int ConfigImpl::remove(const DoutPrefixProvider* dpp, optional_yield y,
+ const rgw_pool& pool, const std::string& oid,
+ RGWObjVersionTracker* objv)
+{
+ librados::IoCtx ioctx;
+ int r = rgw_init_ioctx(dpp, &rados, pool, ioctx, true, false);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::ObjectWriteOperation op;
+ if (objv) {
+ objv->prepare_op_for_write(&op);
+ }
+ op.remove();
+
+ r = rgw_rados_operate(dpp, ioctx, oid, &op, y);
+ if (r >= 0 && objv) {
+ objv->apply_write();
+ }
+ return r;
+}
+
+// Send a watch/notify payload to pool:oid with the given timeout; replies
+// are discarded (nullptr reply bufferlist).
+int ConfigImpl::notify(const DoutPrefixProvider* dpp, optional_yield y,
+ const rgw_pool& pool, const std::string& oid,
+ bufferlist& bl, uint64_t timeout_ms)
+{
+ librados::IoCtx ioctx;
+ int r = rgw_init_ioctx(dpp, &rados, pool, ioctx, true, false);
+ if (r < 0) {
+ return r;
+ }
+ return rgw_rados_notify(dpp, ioctx, oid, bl, timeout_ms, nullptr, y);
+}
+
+} // namespace rgw::rados
diff --git a/src/rgw/driver/rados/config/impl.h b/src/rgw/driver/rados/config/impl.h
new file mode 100644
index 000000000..3aed451f9
--- /dev/null
+++ b/src/rgw/driver/rados/config/impl.h
@@ -0,0 +1,139 @@
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "include/rados/librados.hpp"
+#include "common/dout.h"
+#include "rgw_basic_types.h"
+#include "rgw_tools.h"
+#include "rgw_sal_config.h"
+
+namespace rgw::rados {
+
+// write options that control object creation
+enum class Create {
+ MustNotExist, // fail with EEXIST if the object already exists
+ MayExist, // create if the object didn't exist, overwrite if it did
+ MustExist, // fail with ENOENT if the object doesn't exist
+};
+
+/// Shared rados plumbing for the realm/period/zonegroup/zone config store:
+/// raw object read/write/remove/list/notify against the config root pools.
+struct ConfigImpl {
+ librados::Rados rados;
+
+ // resolved config root pools (see ConfigImpl's constructor)
+ const rgw_pool realm_pool;
+ const rgw_pool period_pool;
+ const rgw_pool zonegroup_pool;
+ const rgw_pool zone_pool;
+
+ ConfigImpl(const ceph::common::ConfigProxy& conf);
+
+ int read(const DoutPrefixProvider* dpp, optional_yield y,
+ const rgw_pool& pool, const std::string& oid,
+ bufferlist& bl, RGWObjVersionTracker* objv);
+
+ /// Read pool:oid and ceph-decode it into data; -EIO on decode failure.
+ template <typename T>
+ int read(const DoutPrefixProvider* dpp, optional_yield y,
+ const rgw_pool& pool, const std::string& oid,
+ T& data, RGWObjVersionTracker* objv)
+ {
+ bufferlist bl;
+ int r = read(dpp, y, pool, oid, bl, objv);
+ if (r < 0) {
+ return r;
+ }
+ try {
+ auto p = bl.cbegin();
+ decode(data, p);
+ } catch (const buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to decode obj from "
+ << pool << ":" << oid << dendl;
+ return -EIO;
+ }
+ return 0;
+ }
+
+ int write(const DoutPrefixProvider* dpp, optional_yield y,
+ const rgw_pool& pool, const std::string& oid, Create create,
+ const bufferlist& bl, RGWObjVersionTracker* objv);
+
+ /// Ceph-encode data and write it to pool:oid (see Create for semantics).
+ template <typename T>
+ int write(const DoutPrefixProvider* dpp, optional_yield y,
+ const rgw_pool& pool, const std::string& oid, Create create,
+ const T& data, RGWObjVersionTracker* objv)
+ {
+ bufferlist bl;
+ encode(data, bl);
+
+ return write(dpp, y, pool, oid, create, bl, objv);
+ }
+
+ int remove(const DoutPrefixProvider* dpp, optional_yield y,
+ const rgw_pool& pool, const std::string& oid,
+ RGWObjVersionTracker* objv);
+
+ /// Enumerate objects in pool starting at the cursor in marker. filter
+ /// maps each oid to an entry string (empty string = skip). Fills at most
+ /// entries.size() results; result.next is the resume cursor, cleared on
+ /// end of listing.
+ int list(const DoutPrefixProvider* dpp, optional_yield y,
+ const rgw_pool& pool, const std::string& marker,
+ std::regular_invocable<std::string> auto filter,
+ std::span<std::string> entries,
+ sal::ListResult<std::string>& result)
+ {
+ librados::IoCtx ioctx;
+ int r = rgw_init_ioctx(dpp, &rados, pool, ioctx, true, false);
+ if (r < 0) {
+ return r;
+ }
+ librados::ObjectCursor oc;
+ if (!oc.from_str(marker)) {
+ ldpp_dout(dpp, 10) << "failed to parse cursor: " << marker << dendl;
+ return -EINVAL;
+ }
+ std::size_t count = 0;
+ try {
+ auto iter = ioctx.nobjects_begin(oc);
+ const auto end = ioctx.nobjects_end();
+ for (; count < entries.size() && iter != end; ++iter) {
+ std::string entry = filter(iter->get_oid());
+ if (!entry.empty()) {
+ entries[count++] = std::move(entry);
+ }
+ }
+ if (iter == end) {
+ result.next.clear();
+ } else {
+ result.next = iter.get_cursor().to_str();
+ }
+ } catch (const std::exception& e) {
+ // NObjectIterator operations can throw; surface as -EIO
+ ldpp_dout(dpp, 10) << "NObjectIterator exception " << e.what() << dendl;
+ return -EIO;
+ }
+ result.entries = entries.first(count);
+ return 0;
+ }
+
+ int notify(const DoutPrefixProvider* dpp, optional_yield y,
+ const rgw_pool& pool, const std::string& oid,
+ bufferlist& bl, uint64_t timeout_ms);
+};
+
+/// Return name unless it is empty, in which case return default_name.
+/// Both views must outlive the returned view.
+inline std::string_view name_or_default(std::string_view name,
+ std::string_view default_name)
+{
+ if (!name.empty()) {
+ return name;
+ }
+ return default_name;
+}
+
+} // namespace rgw::rados
diff --git a/src/rgw/driver/rados/config/period.cc b/src/rgw/driver/rados/config/period.cc
new file mode 100644
index 000000000..bc3fa27e7
--- /dev/null
+++ b/src/rgw/driver/rados/config/period.cc
@@ -0,0 +1,230 @@
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/dout.h"
+#include "common/errno.h"
+#include "rgw_zone.h"
+#include "driver/rados/config/store.h"
+
+#include "impl.h"
+
+namespace rgw::rados {
+
+// period oids
+constexpr std::string_view period_info_oid_prefix = "periods.";
+constexpr std::string_view period_latest_epoch_info_oid = ".latest_epoch";
+constexpr std::string_view period_staging_suffix = ":staging";
+
+// Object name for one epoch of a period: "periods.<id>.<epoch>". The
+// staging period ("<id>:staging") is a single object with no epoch suffix.
+static std::string period_oid(std::string_view period_id, uint32_t epoch)
+{
+ // omit the epoch for the staging period
+ if (period_id.ends_with(period_staging_suffix)) {
+ return string_cat_reserve(period_info_oid_prefix, period_id);
+ }
+ return fmt::format("{}{}.{}", period_info_oid_prefix, period_id, epoch);
+}
+
+// Object name holding the latest-epoch record of a period; the suffix is
+// configurable via rgw_period_latest_epoch_info_oid.
+static std::string latest_epoch_oid(const ceph::common::ConfigProxy& conf,
+ std::string_view period_id)
+{
+ return string_cat_reserve(
+ period_info_oid_prefix, period_id,
+ name_or_default(conf->rgw_period_latest_epoch_info_oid,
+ period_latest_epoch_info_oid));
+}
+
+// Load the latest-epoch record for period_id into epoch. objv, when given,
+// captures the record's version for later conditional writes.
+static int read_latest_epoch(const DoutPrefixProvider* dpp, optional_yield y,
+ ConfigImpl* impl, std::string_view period_id,
+ uint32_t& epoch, RGWObjVersionTracker* objv)
+{
+ const auto& pool = impl->period_pool;
+ const auto latest_oid = latest_epoch_oid(dpp->get_cct()->_conf, period_id);
+ RGWPeriodLatestEpochInfo latest;
+ int r = impl->read(dpp, y, pool, latest_oid, latest, objv);
+ if (r >= 0) {
+ epoch = latest.epoch;
+ }
+ return r;
+}
+
+// Store epoch as the latest-epoch record for period_id; exclusive selects
+// create-if-absent semantics (fails with EEXIST if the record exists).
+static int write_latest_epoch(const DoutPrefixProvider* dpp, optional_yield y,
+ ConfigImpl* impl, bool exclusive,
+ std::string_view period_id, uint32_t epoch,
+ RGWObjVersionTracker* objv)
+{
+ const auto& pool = impl->period_pool;
+ const auto latest_oid = latest_epoch_oid(dpp->get_cct()->_conf, period_id);
+ const auto create = exclusive ? Create::MustNotExist : Create::MayExist;
+ RGWPeriodLatestEpochInfo latest{epoch};
+ return impl->write(dpp, y, pool, latest_oid, create, latest, objv);
+}
+
+// Delete the latest-epoch record for period_id, guarded by objv.
+static int delete_latest_epoch(const DoutPrefixProvider* dpp, optional_yield y,
+ ConfigImpl* impl, std::string_view period_id,
+ RGWObjVersionTracker* objv)
+{
+ const auto& pool = impl->period_pool;
+ const auto latest_oid = latest_epoch_oid(dpp->get_cct()->_conf, period_id);
+ return impl->remove(dpp, y, pool, latest_oid, objv);
+}
+
+// Advance the latest-epoch record to epoch using optimistic concurrency:
+// read-modify-write with version asserts, retrying on create/write races.
+// Fails with -EEXIST if epoch is not newer than the stored value, and
+// -ECANCELED after exhausting retries.
+static int update_latest_epoch(const DoutPrefixProvider* dpp, optional_yield y,
+ ConfigImpl* impl, std::string_view period_id,
+ uint32_t epoch)
+{
+ static constexpr int MAX_RETRIES = 20;
+
+ for (int i = 0; i < MAX_RETRIES; i++) {
+ uint32_t existing_epoch = 0;
+ RGWObjVersionTracker objv;
+ bool exclusive = false;
+
+ // read existing epoch
+ int r = read_latest_epoch(dpp, y, impl, period_id, existing_epoch, &objv);
+ if (r == -ENOENT) {
+ // use an exclusive create to set the epoch atomically
+ exclusive = true;
+ objv.generate_new_write_ver(dpp->get_cct());
+ ldpp_dout(dpp, 20) << "creating initial latest_epoch=" << epoch
+ << " for period=" << period_id << dendl;
+ } else if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to read latest_epoch" << dendl;
+ return r;
+ } else if (epoch <= existing_epoch) {
+ r = -EEXIST; // fail with EEXIST if epoch is not newer
+ ldpp_dout(dpp, 10) << "found existing latest_epoch " << existing_epoch
+ << " >= given epoch " << epoch << ", returning r=" << r << dendl;
+ return r;
+ } else {
+ ldpp_dout(dpp, 20) << "updating latest_epoch from " << existing_epoch
+ << " -> " << epoch << " on period=" << period_id << dendl;
+ }
+
+ r = write_latest_epoch(dpp, y, impl, exclusive, period_id, epoch, &objv);
+ if (r == -EEXIST) {
+ continue; // exclusive create raced with another update, retry
+ } else if (r == -ECANCELED) {
+ continue; // write raced with a conflicting version, retry
+ }
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to write latest_epoch" << dendl;
+ return r;
+ }
+ return 0; // return success
+ }
+
+ return -ECANCELED; // fail after max retries
+}
+
+// Store a period object for info's id+epoch, then best-effort advance the
+// period's latest-epoch record. Requires a non-empty id and epoch.
+int RadosConfigStore::create_period(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ const RGWPeriod& info)
+{
+ if (info.get_id().empty()) {
+ ldpp_dout(dpp, 0) << "period cannot have an empty id" << dendl;
+ return -EINVAL;
+ }
+ if (info.get_epoch() == 0) {
+ ldpp_dout(dpp, 0) << "period cannot have an empty epoch" << dendl;
+ return -EINVAL;
+ }
+ const auto& pool = impl->period_pool;
+ const auto info_oid = period_oid(info.get_id(), info.get_epoch());
+ const auto create = exclusive ? Create::MustNotExist : Create::MayExist;
+ RGWObjVersionTracker objv;
+ objv.generate_new_write_ver(dpp->get_cct());
+ int r = impl->write(dpp, y, pool, info_oid, create, info, &objv);
+ if (r < 0) {
+ return r;
+ }
+
+ // deliberately ignore the result: the period object itself was written,
+ // and latest_epoch may legitimately already be at or past this epoch
+ (void) update_latest_epoch(dpp, y, impl.get(), info.get_id(), info.get_epoch());
+ return 0;
+}
+
+// Load a period by id, at the given epoch or — when epoch is nullopt — at
+// the latest epoch recorded for that period.
+int RadosConfigStore::read_period(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view period_id,
+ std::optional<uint32_t> epoch,
+ RGWPeriod& info)
+{
+ int r = 0;
+ if (!epoch) {
+ epoch = 0;
+ r = read_latest_epoch(dpp, y, impl.get(), period_id, *epoch, nullptr);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ const auto& pool = impl->period_pool;
+ const auto info_oid = period_oid(period_id, *epoch);
+ return impl->read(dpp, y, pool, info_oid, info, nullptr);
+}
+
+// Delete every epoch object of a period (0..latest_epoch) and finally its
+// latest-epoch record. Missing objects (ENOENT) are ignored.
+int RadosConfigStore::delete_period(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view period_id)
+{
+ const auto& pool = impl->period_pool;
+
+ // read the latest_epoch
+ uint32_t latest_epoch = 0;
+ RGWObjVersionTracker latest_objv;
+ int r = read_latest_epoch(dpp, y, impl.get(), period_id,
+ latest_epoch, &latest_objv);
+ if (r < 0 && r != -ENOENT) { // just delete epoch=0 on ENOENT
+ ldpp_dout(dpp, 0) << "failed to read latest epoch for period "
+ << period_id << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ for (uint32_t epoch = 0; epoch <= latest_epoch; epoch++) {
+ const auto info_oid = period_oid(period_id, epoch);
+ r = impl->remove(dpp, y, pool, info_oid, nullptr);
+ if (r < 0 && r != -ENOENT) { // ignore ENOENT
+ ldpp_dout(dpp, 0) << "failed to delete period " << info_oid
+ << ": " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+
+ return delete_latest_epoch(dpp, y, impl.get(), period_id, &latest_objv);
+}
+
+// List period ids by scanning the period pool for latest-epoch objects
+// ("periods.<id>.latest_epoch") and extracting <id> from each oid; other
+// objects (per-epoch period data) are filtered out.
+int RadosConfigStore::list_period_ids(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ const std::string& marker,
+ std::span<std::string> entries,
+ sal::ListResult<std::string>& result)
+{
+ const auto& pool = impl->period_pool;
+ // maps an oid to a period id, or "" to skip it
+ constexpr auto prefix = [] (std::string oid) -> std::string {
+ if (!oid.starts_with(period_info_oid_prefix)) {
+ return {};
+ }
+ if (!oid.ends_with(period_latest_epoch_info_oid)) {
+ return {};
+ }
+ // trim the prefix and suffix
+ const std::size_t count = oid.size() -
+ period_info_oid_prefix.size() -
+ period_latest_epoch_info_oid.size();
+ return oid.substr(period_info_oid_prefix.size(), count);
+ };
+
+ return impl->list(dpp, y, pool, marker, prefix, entries, result);
+}
+
+} // namespace rgw::rados
diff --git a/src/rgw/driver/rados/config/period_config.cc b/src/rgw/driver/rados/config/period_config.cc
new file mode 100644
index 000000000..ec984ebdc
--- /dev/null
+++ b/src/rgw/driver/rados/config/period_config.cc
@@ -0,0 +1,55 @@
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "rgw_zone.h"
+#include "driver/rados/config/store.h"
+
+#include "impl.h"
+
+namespace rgw::rados {
+
+// period config oids
+constexpr std::string_view period_config_prefix = "period_config.";
+constexpr std::string_view period_config_realm_default = "default";
+
+// Object name for a realm's period config: "period_config.<realm_id>",
+// with an empty realm id mapped to "period_config.default".
+std::string period_config_oid(std::string_view realm_id)
+{
+ if (realm_id.empty()) {
+ realm_id = period_config_realm_default;
+ }
+ return string_cat_reserve(period_config_prefix, realm_id);
+}
+
+// Load the period config object for realm_id from the period pool.
+int RadosConfigStore::read_period_config(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id,
+ RGWPeriodConfig& info)
+{
+ const auto& pool = impl->period_pool;
+ const auto oid = period_config_oid(realm_id);
+ return impl->read(dpp, y, pool, oid, info, nullptr);
+}
+
+// Store the period config object for realm_id; exclusive fails with
+// EEXIST if it already exists.
+int RadosConfigStore::write_period_config(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ std::string_view realm_id,
+ const RGWPeriodConfig& info)
+{
+ const auto& pool = impl->period_pool;
+ const auto oid = period_config_oid(realm_id);
+ const auto create = exclusive ? Create::MustNotExist : Create::MayExist;
+ return impl->write(dpp, y, pool, oid, create, info, nullptr);
+}
+
+} // namespace rgw::rados
diff --git a/src/rgw/driver/rados/config/realm.cc b/src/rgw/driver/rados/config/realm.cc
new file mode 100644
index 000000000..331e0ffd2
--- /dev/null
+++ b/src/rgw/driver/rados/config/realm.cc
@@ -0,0 +1,364 @@
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/dout.h"
+#include "common/errno.h"
+#include "rgw_realm_watcher.h"
+#include "rgw_zone.h"
+#include "driver/rados/config/store.h"
+
+#include "impl.h"
+
+namespace rgw::rados {
+
+// realm oids
+constexpr std::string_view realm_names_oid_prefix = "realms_names.";
+constexpr std::string_view realm_info_oid_prefix = "realms.";
+constexpr std::string_view realm_control_oid_suffix = ".control";
+constexpr std::string_view default_realm_info_oid = "default.realm";
+
+// object that stores a realm's RGWRealm info, keyed by realm id
+static std::string realm_info_oid(std::string_view realm_id)
+{
+  return string_cat_reserve(realm_info_oid_prefix, realm_id);
+}
+// object that maps a realm name to its id (RGWNameToId); despite the
+// parameter name, callers pass the realm *name* here
+static std::string realm_name_oid(std::string_view realm_id)
+{
+  return string_cat_reserve(realm_names_oid_prefix, realm_id);
+}
+// watch/notify control object for a realm: "realms.<id>.control"
+static std::string realm_control_oid(std::string_view realm_id)
+{
+  return string_cat_reserve(realm_info_oid_prefix, realm_id,
+                            realm_control_oid_suffix);
+}
+// object holding the default realm id; the oid is configurable and
+// falls back to "default.realm" when the option is unset
+static std::string default_realm_oid(const ceph::common::ConfigProxy& conf)
+{
+  return std::string{name_or_default(conf->rgw_default_realm_info_oid,
+                                     default_realm_info_oid)};
+}
+
+
+int RadosConfigStore::write_default_realm_id(const DoutPrefixProvider* dpp,
+                                             optional_yield y, bool exclusive,
+                                             std::string_view realm_id)
+{
+  // record the given realm id as the cluster-wide default
+  RGWDefaultSystemMetaObjInfo default_info;
+  default_info.default_id = realm_id;
+
+  return impl->write(dpp, y, impl->realm_pool,
+                     default_realm_oid(dpp->get_cct()->_conf),
+                     exclusive ? Create::MustNotExist : Create::MayExist,
+                     default_info, nullptr);
+}
+
+int RadosConfigStore::read_default_realm_id(const DoutPrefixProvider* dpp,
+                                            optional_yield y,
+                                            std::string& realm_id)
+{
+  RGWDefaultSystemMetaObjInfo default_info;
+  const int r = impl->read(dpp, y, impl->realm_pool,
+                           default_realm_oid(dpp->get_cct()->_conf),
+                           default_info, nullptr);
+  // only touch the output argument on success
+  if (r >= 0) {
+    realm_id = default_info.default_id;
+  }
+  return r;
+}
+
+int RadosConfigStore::delete_default_realm_id(const DoutPrefixProvider* dpp,
+                                              optional_yield y)
+{
+  // drop the object that records the default realm id
+  return impl->remove(dpp, y, impl->realm_pool,
+                      default_realm_oid(dpp->get_cct()->_conf), nullptr);
+}
+
+
+// sal::RealmWriter backed by the rados config objects. it remembers the
+// id/name and object version captured when the realm was loaded, so later
+// writes can reject renames-by-write and detect racing updates.
+class RadosRealmWriter : public sal::RealmWriter {
+  ConfigImpl* impl;
+  RGWObjVersionTracker objv;
+  std::string realm_id;
+  std::string realm_name;
+ public:
+  RadosRealmWriter(ConfigImpl* impl, RGWObjVersionTracker objv,
+                   std::string_view realm_id, std::string_view realm_name)
+    : impl(impl), objv(std::move(objv)),
+      realm_id(realm_id), realm_name(realm_name)
+  {
+  }
+
+  // overwrite the realm info object, guarded by the tracked version
+  int write(const DoutPrefixProvider* dpp, optional_yield y,
+            const RGWRealm& info) override
+  {
+    if (realm_id != info.get_id() || realm_name != info.get_name()) {
+      return -EINVAL; // can't modify realm id or name directly
+    }
+
+    const auto& pool = impl->realm_pool;
+    const auto info_oid = realm_info_oid(info.get_id());
+    return impl->write(dpp, y, pool, info_oid, Create::MustExist, info, &objv);
+  }
+
+  // rename in three steps: link the new name object, rewrite the info
+  // with the new name, then unlink the old name. a failed info write
+  // rolls back the new name link; the final old-name unlink and any
+  // rollback removals are best-effort (results deliberately ignored).
+  int rename(const DoutPrefixProvider* dpp, optional_yield y,
+             RGWRealm& info, std::string_view new_name) override
+  {
+    if (realm_id != info.get_id() || realm_name != info.get_name()) {
+      return -EINVAL; // can't modify realm id or name directly
+    }
+    if (new_name.empty()) {
+      ldpp_dout(dpp, 0) << "realm cannot have an empty name" << dendl;
+      return -EINVAL;
+    }
+
+    const auto& pool = impl->realm_pool;
+    const auto name = RGWNameToId{info.get_id()};
+    const auto info_oid = realm_info_oid(info.get_id());
+    const auto old_oid = realm_name_oid(info.get_name());
+    const auto new_oid = realm_name_oid(new_name);
+
+    // link the new name
+    RGWObjVersionTracker new_objv;
+    new_objv.generate_new_write_ver(dpp->get_cct());
+    int r = impl->write(dpp, y, pool, new_oid, Create::MustNotExist,
+                        name, &new_objv);
+    if (r < 0) {
+      return r;
+    }
+
+    // write the info with updated name
+    info.set_name(std::string{new_name});
+    r = impl->write(dpp, y, pool, info_oid, Create::MustExist, info, &objv);
+    if (r < 0) {
+      // on failure, unlink the new name
+      (void) impl->remove(dpp, y, pool, new_oid, &new_objv);
+      return r;
+    }
+
+    // unlink the old name
+    (void) impl->remove(dpp, y, pool, old_oid, nullptr);
+
+    realm_name = new_name;
+    return 0;
+  }
+
+  // remove the realm: the info object must go first (and its failure is
+  // fatal); the name link and control object removals are best-effort
+  int remove(const DoutPrefixProvider* dpp, optional_yield y) override
+  {
+    const auto& pool = impl->realm_pool;
+    const auto info_oid = realm_info_oid(realm_id);
+    int r = impl->remove(dpp, y, pool, info_oid, &objv);
+    if (r < 0) {
+      return r;
+    }
+    const auto name_oid = realm_name_oid(realm_name);
+    (void) impl->remove(dpp, y, pool, name_oid, nullptr);
+    const auto control_oid = realm_control_oid(realm_id);
+    (void) impl->remove(dpp, y, pool, control_oid, nullptr);
+    return 0;
+  }
+}; // RadosRealmWriter
+
+
+// create the three objects that make up a realm: info (by id), the
+// name->id link, and the watch/notify control object. later failures
+// roll back the earlier writes (best-effort removals).
+int RadosConfigStore::create_realm(const DoutPrefixProvider* dpp,
+                                   optional_yield y, bool exclusive,
+                                   const RGWRealm& info,
+                                   std::unique_ptr<sal::RealmWriter>* writer)
+{
+  if (info.get_id().empty()) {
+    ldpp_dout(dpp, 0) << "realm cannot have an empty id" << dendl;
+    return -EINVAL;
+  }
+  if (info.get_name().empty()) {
+    ldpp_dout(dpp, 0) << "realm cannot have an empty name" << dendl;
+    return -EINVAL;
+  }
+
+  const auto& pool = impl->realm_pool;
+  const auto create = exclusive ? Create::MustNotExist : Create::MayExist;
+
+  // write the realm info
+  const auto info_oid = realm_info_oid(info.get_id());
+  RGWObjVersionTracker objv;
+  objv.generate_new_write_ver(dpp->get_cct());
+
+  int r = impl->write(dpp, y, pool, info_oid, create, info, &objv);
+  if (r < 0) {
+    return r;
+  }
+
+  // write the realm name
+  const auto name_oid = realm_name_oid(info.get_name());
+  const auto name = RGWNameToId{info.get_id()};
+  RGWObjVersionTracker name_objv;
+  name_objv.generate_new_write_ver(dpp->get_cct());
+
+  r = impl->write(dpp, y, pool, name_oid, create, name, &name_objv);
+  if (r < 0) {
+    (void) impl->remove(dpp, y, pool, info_oid, &objv);
+    return r;
+  }
+
+  // create control object for watch/notify
+  const auto control_oid = realm_control_oid(info.get_id());
+  bufferlist empty_bl;
+  r = impl->write(dpp, y, pool, control_oid, Create::MayExist,
+                  empty_bl, nullptr);
+  if (r < 0) {
+    (void) impl->remove(dpp, y, pool, name_oid, &name_objv);
+    (void) impl->remove(dpp, y, pool, info_oid, &objv);
+    return r;
+  }
+
+  // optionally hand back a writer bound to the freshly written version
+  if (writer) {
+    *writer = std::make_unique<RadosRealmWriter>(
+        impl.get(), std::move(objv), info.get_id(), info.get_name());
+  }
+  return 0;
+}
+
+int RadosConfigStore::read_realm_by_id(const DoutPrefixProvider* dpp,
+                                       optional_yield y,
+                                       std::string_view realm_id,
+                                       RGWRealm& info,
+                                       std::unique_ptr<sal::RealmWriter>* writer)
+{
+  const auto& pool = impl->realm_pool;
+  RGWObjVersionTracker objv;
+  int r = impl->read(dpp, y, pool, realm_info_oid(realm_id), info, &objv);
+  if (r < 0) {
+    return r;
+  }
+
+  // hand back a writer bound to the version just read so later updates
+  // can detect racing writes
+  if (writer) {
+    *writer = std::make_unique<RadosRealmWriter>(
+        impl.get(), std::move(objv), info.get_id(), info.get_name());
+  }
+  return 0;
+}
+
+int RadosConfigStore::read_realm_by_name(const DoutPrefixProvider* dpp,
+                                         optional_yield y,
+                                         std::string_view realm_name,
+                                         RGWRealm& info,
+                                         std::unique_ptr<sal::RealmWriter>* writer)
+{
+  const auto& pool = impl->realm_pool;
+
+  // two-step lookup: resolve the name to an id, then load the info
+  RGWNameToId name;
+  int r = impl->read(dpp, y, pool, realm_name_oid(realm_name), name, nullptr);
+  if (r < 0) {
+    return r;
+  }
+
+  RGWObjVersionTracker objv;
+  r = impl->read(dpp, y, pool, realm_info_oid(name.obj_id), info, &objv);
+  if (r < 0) {
+    return r;
+  }
+
+  // bind the returned writer to the version just read
+  if (writer) {
+    *writer = std::make_unique<RadosRealmWriter>(
+        impl.get(), std::move(objv), info.get_id(), info.get_name());
+  }
+  return 0;
+}
+
+int RadosConfigStore::read_default_realm(const DoutPrefixProvider* dpp,
+                                         optional_yield y,
+                                         RGWRealm& info,
+                                         std::unique_ptr<sal::RealmWriter>* writer)
+{
+  const auto& pool = impl->realm_pool;
+
+  // resolve the default realm id, then load that realm's info
+  RGWDefaultSystemMetaObjInfo default_info;
+  int r = impl->read(dpp, y, pool,
+                     default_realm_oid(dpp->get_cct()->_conf),
+                     default_info, nullptr);
+  if (r < 0) {
+    return r;
+  }
+
+  RGWObjVersionTracker objv;
+  r = impl->read(dpp, y, pool, realm_info_oid(default_info.default_id),
+                 info, &objv);
+  if (r < 0) {
+    return r;
+  }
+
+  // bind the returned writer to the version just read
+  if (writer) {
+    *writer = std::make_unique<RadosRealmWriter>(
+        impl.get(), std::move(objv), info.get_id(), info.get_name());
+  }
+  return 0;
+}
+
+int RadosConfigStore::read_realm_id(const DoutPrefixProvider* dpp,
+                                    optional_yield y,
+                                    std::string_view realm_name,
+                                    std::string& realm_id)
+{
+  // resolve the realm name through its name->id mapping object
+  RGWNameToId name;
+  int r = impl->read(dpp, y, impl->realm_pool,
+                     realm_name_oid(realm_name), name, nullptr);
+  if (r < 0) {
+    return r;
+  }
+  realm_id = std::move(name.obj_id);
+  return 0;
+}
+
+// broadcast a new period on the realm's watch/notify control object.
+// the payload encodes two notifications back to back; watchers decode
+// them in order, so the encode sequence below is a wire contract.
+int RadosConfigStore::realm_notify_new_period(const DoutPrefixProvider* dpp,
+                                              optional_yield y,
+                                              const RGWPeriod& period)
+{
+  const auto& pool = impl->realm_pool;
+  const auto control_oid = realm_control_oid(period.get_realm());
+
+  bufferlist bl;
+  using ceph::encode;
+  // push the period to dependent zonegroups/zones
+  encode(RGWRealmNotify::ZonesNeedPeriod, bl);
+  encode(period, bl);
+  // reload the gateway with the new period
+  encode(RGWRealmNotify::Reload, bl);
+
+  // NOTE(review): 0 presumably selects the client's default notify
+  // timeout — confirm against ConfigImpl::notify
+  constexpr uint64_t timeout_ms = 0;
+  return impl->notify(dpp, y, pool, control_oid, bl, timeout_ms);
+}
+
+// list realm names by scanning the name-link objects in the realm pool.
+// 'marker' resumes a previous listing; results land in 'entries'/'result'.
+int RadosConfigStore::list_realm_names(const DoutPrefixProvider* dpp,
+                                       optional_yield y,
+                                       const std::string& marker,
+                                       std::span<std::string> entries,
+                                       sal::ListResult<std::string>& result)
+{
+  const auto& pool = impl->realm_pool;
+  // strip the name prefix from each oid; non-matching oids map to an
+  // empty string. take the oid by const reference to avoid copying
+  // every listed object name.
+  constexpr auto prefix = [] (const std::string& oid) -> std::string {
+    if (!oid.starts_with(realm_names_oid_prefix)) {
+      return {};
+    }
+    return oid.substr(realm_names_oid_prefix.size());
+  };
+  return impl->list(dpp, y, pool, marker, prefix, entries, result);
+}
+
+} // namespace rgw::rados
diff --git a/src/rgw/driver/rados/config/store.cc b/src/rgw/driver/rados/config/store.cc
new file mode 100644
index 000000000..ec2b034a8
--- /dev/null
+++ b/src/rgw/driver/rados/config/store.cc
@@ -0,0 +1,52 @@
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/rados/librados.hpp"
+#include "common/errno.h"
+#include "impl.h"
+#include "store.h"
+
+namespace rgw::rados {
+
+// take ownership of the fully constructed impl (pools + rados client)
+RadosConfigStore::RadosConfigStore(std::unique_ptr<ConfigImpl> impl)
+  : impl(std::move(impl))
+{
+}
+
+// out-of-line so the unique_ptr<ConfigImpl> deleter sees the complete type
+RadosConfigStore::~RadosConfigStore() = default;
+
+
+// factory: build a ConfigImpl, bring up its librados client, and wrap
+// it in a RadosConfigStore. returns nullptr (after logging) on failure.
+auto create_config_store(const DoutPrefixProvider* dpp)
+  -> std::unique_ptr<RadosConfigStore>
+{
+  auto cfg = std::make_unique<ConfigImpl>(dpp->get_cct()->_conf);
+
+  // initialize the rados client with this process's ceph context,
+  // then connect to the cluster
+  int ret = cfg->rados.init_with_context(dpp->get_cct());
+  if (ret < 0) {
+    ldpp_dout(dpp, -1) << "Rados client initialization failed with "
+        << cpp_strerror(-ret) << dendl;
+    return nullptr;
+  }
+  ret = cfg->rados.connect();
+  if (ret < 0) {
+    ldpp_dout(dpp, -1) << "Rados client connection failed with "
+        << cpp_strerror(-ret) << dendl;
+    return nullptr;
+  }
+
+  return std::make_unique<RadosConfigStore>(std::move(cfg));
+}
+
+} // namespace rgw::rados
diff --git a/src/rgw/driver/rados/config/store.h b/src/rgw/driver/rados/config/store.h
new file mode 100644
index 000000000..1b93a803d
--- /dev/null
+++ b/src/rgw/driver/rados/config/store.h
@@ -0,0 +1,182 @@
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <list>
+#include <memory>
+#include <string>
+#include "rgw_common.h"
+#include "rgw_sal_config.h"
+
+class DoutPrefixProvider;
+class optional_yield;
+
+namespace rgw::rados {
+
+struct ConfigImpl;
+
+// sal::ConfigStore implementation that persists realm/period/zonegroup/
+// zone configuration as rados objects. all state lives in ConfigImpl;
+// the 'writer' out-params below are optional (may be null) and, when
+// set, return a handle bound to the object version that was read.
+class RadosConfigStore : public sal::ConfigStore {
+ public:
+  explicit RadosConfigStore(std::unique_ptr<ConfigImpl> impl);
+  virtual ~RadosConfigStore() override;
+
+  // Realm
+  virtual int write_default_realm_id(const DoutPrefixProvider* dpp,
+                                     optional_yield y, bool exclusive,
+                                     std::string_view realm_id) override;
+  virtual int read_default_realm_id(const DoutPrefixProvider* dpp,
+                                    optional_yield y,
+                                    std::string& realm_id) override;
+  virtual int delete_default_realm_id(const DoutPrefixProvider* dpp,
+                                      optional_yield y) override;
+
+  virtual int create_realm(const DoutPrefixProvider* dpp,
+                           optional_yield y, bool exclusive,
+                           const RGWRealm& info,
+                           std::unique_ptr<sal::RealmWriter>* writer) override;
+  virtual int read_realm_by_id(const DoutPrefixProvider* dpp,
+                               optional_yield y,
+                               std::string_view realm_id,
+                               RGWRealm& info,
+                               std::unique_ptr<sal::RealmWriter>* writer) override;
+  virtual int read_realm_by_name(const DoutPrefixProvider* dpp,
+                                 optional_yield y,
+                                 std::string_view realm_name,
+                                 RGWRealm& info,
+                                 std::unique_ptr<sal::RealmWriter>* writer) override;
+  virtual int read_default_realm(const DoutPrefixProvider* dpp,
+                                 optional_yield y,
+                                 RGWRealm& info,
+                                 std::unique_ptr<sal::RealmWriter>* writer) override;
+  virtual int read_realm_id(const DoutPrefixProvider* dpp,
+                            optional_yield y, std::string_view realm_name,
+                            std::string& realm_id) override;
+  virtual int realm_notify_new_period(const DoutPrefixProvider* dpp,
+                                      optional_yield y,
+                                      const RGWPeriod& period) override;
+  virtual int list_realm_names(const DoutPrefixProvider* dpp,
+                               optional_yield y, const std::string& marker,
+                               std::span<std::string> entries,
+                               sal::ListResult<std::string>& result) override;
+
+  // Period
+  virtual int create_period(const DoutPrefixProvider* dpp,
+                            optional_yield y, bool exclusive,
+                            const RGWPeriod& info) override;
+  virtual int read_period(const DoutPrefixProvider* dpp,
+                          optional_yield y, std::string_view period_id,
+                          std::optional<uint32_t> epoch, RGWPeriod& info) override;
+  virtual int delete_period(const DoutPrefixProvider* dpp,
+                            optional_yield y,
+                            std::string_view period_id) override;
+  virtual int list_period_ids(const DoutPrefixProvider* dpp,
+                              optional_yield y, const std::string& marker,
+                              std::span<std::string> entries,
+                              sal::ListResult<std::string>& result) override;
+
+  // ZoneGroup
+  virtual int write_default_zonegroup_id(const DoutPrefixProvider* dpp,
+                                         optional_yield y, bool exclusive,
+                                         std::string_view realm_id,
+                                         std::string_view zonegroup_id) override;
+  virtual int read_default_zonegroup_id(const DoutPrefixProvider* dpp,
+                                        optional_yield y,
+                                        std::string_view realm_id,
+                                        std::string& zonegroup_id) override;
+  virtual int delete_default_zonegroup_id(const DoutPrefixProvider* dpp,
+                                          optional_yield y,
+                                          std::string_view realm_id) override;
+
+  virtual int create_zonegroup(const DoutPrefixProvider* dpp,
+                               optional_yield y, bool exclusive,
+                               const RGWZoneGroup& info,
+                               std::unique_ptr<sal::ZoneGroupWriter>* writer) override;
+  virtual int read_zonegroup_by_id(const DoutPrefixProvider* dpp,
+                                   optional_yield y,
+                                   std::string_view zonegroup_id,
+                                   RGWZoneGroup& info,
+                                   std::unique_ptr<sal::ZoneGroupWriter>* writer) override;
+  virtual int read_zonegroup_by_name(const DoutPrefixProvider* dpp,
+                                     optional_yield y,
+                                     std::string_view zonegroup_name,
+                                     RGWZoneGroup& info,
+                                     std::unique_ptr<sal::ZoneGroupWriter>* writer) override;
+  virtual int read_default_zonegroup(const DoutPrefixProvider* dpp,
+                                     optional_yield y,
+                                     std::string_view realm_id,
+                                     RGWZoneGroup& info,
+                                     std::unique_ptr<sal::ZoneGroupWriter>* writer) override;
+  virtual int list_zonegroup_names(const DoutPrefixProvider* dpp,
+                                   optional_yield y, const std::string& marker,
+                                   std::span<std::string> entries,
+                                   sal::ListResult<std::string>& result) override;
+
+  // Zone
+  virtual int write_default_zone_id(const DoutPrefixProvider* dpp,
+                                    optional_yield y, bool exclusive,
+                                    std::string_view realm_id,
+                                    std::string_view zone_id) override;
+  virtual int read_default_zone_id(const DoutPrefixProvider* dpp,
+                                   optional_yield y,
+                                   std::string_view realm_id,
+                                   std::string& zone_id) override;
+  virtual int delete_default_zone_id(const DoutPrefixProvider* dpp,
+                                     optional_yield y,
+                                     std::string_view realm_id) override;
+
+  virtual int create_zone(const DoutPrefixProvider* dpp,
+                          optional_yield y, bool exclusive,
+                          const RGWZoneParams& info,
+                          std::unique_ptr<sal::ZoneWriter>* writer) override;
+  virtual int read_zone_by_id(const DoutPrefixProvider* dpp,
+                              optional_yield y,
+                              std::string_view zone_id,
+                              RGWZoneParams& info,
+                              std::unique_ptr<sal::ZoneWriter>* writer) override;
+  virtual int read_zone_by_name(const DoutPrefixProvider* dpp,
+                                optional_yield y,
+                                std::string_view zone_name,
+                                RGWZoneParams& info,
+                                std::unique_ptr<sal::ZoneWriter>* writer) override;
+  virtual int read_default_zone(const DoutPrefixProvider* dpp,
+                                optional_yield y,
+                                std::string_view realm_id,
+                                RGWZoneParams& info,
+                                std::unique_ptr<sal::ZoneWriter>* writer) override;
+  virtual int list_zone_names(const DoutPrefixProvider* dpp,
+                              optional_yield y, const std::string& marker,
+                              std::span<std::string> entries,
+                              sal::ListResult<std::string>& result) override;
+
+  // PeriodConfig
+  virtual int read_period_config(const DoutPrefixProvider* dpp,
+                                 optional_yield y,
+                                 std::string_view realm_id,
+                                 RGWPeriodConfig& info) override;
+  virtual int write_period_config(const DoutPrefixProvider* dpp,
+                                  optional_yield y, bool exclusive,
+                                  std::string_view realm_id,
+                                  const RGWPeriodConfig& info) override;
+
+ private:
+  // pools, rados client, and read/write/list/notify primitives
+  std::unique_ptr<ConfigImpl> impl;
+}; // RadosConfigStore
+
+
+/// RadosConfigStore factory function
+auto create_config_store(const DoutPrefixProvider* dpp)
+ -> std::unique_ptr<RadosConfigStore>;
+
+} // namespace rgw::rados
diff --git a/src/rgw/driver/rados/config/zone.cc b/src/rgw/driver/rados/config/zone.cc
new file mode 100644
index 000000000..e06c1606c
--- /dev/null
+++ b/src/rgw/driver/rados/config/zone.cc
@@ -0,0 +1,312 @@
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/dout.h"
+#include "common/errno.h"
+#include "rgw_zone.h"
+#include "driver/rados/config/store.h"
+
+#include "impl.h"
+
+namespace rgw::rados {
+
+// zone oids
+constexpr std::string_view zone_info_oid_prefix = "zone_info.";
+constexpr std::string_view zone_names_oid_prefix = "zone_names.";
+constexpr std::string_view default_zone_info_oid = "default.zone";
+
+// object that stores a zone's RGWZoneParams, keyed by zone id.
+// marked static for internal linkage, matching the realm/zonegroup units
+static std::string zone_info_oid(std::string_view zone_id)
+{
+  return string_cat_reserve(zone_info_oid_prefix, zone_id);
+}
+// object that maps a zone name to its id (RGWNameToId)
+static std::string zone_name_oid(std::string_view zone_id)
+{
+  return string_cat_reserve(zone_names_oid_prefix, zone_id);
+}
+// per-realm object that records the default zone id. fall back to
+// "default.zone" when the conf option is empty, consistent with the
+// realm and zonegroup default-oid helpers
+static std::string default_zone_oid(const ceph::common::ConfigProxy& conf,
+                                    std::string_view realm_id)
+{
+  const auto prefix = name_or_default(conf->rgw_default_zone_info_oid,
+                                      default_zone_info_oid);
+  return fmt::format("{}.{}", prefix, realm_id);
+}
+
+
+int RadosConfigStore::write_default_zone_id(const DoutPrefixProvider* dpp,
+                                            optional_yield y,
+                                            bool exclusive,
+                                            std::string_view realm_id,
+                                            std::string_view zone_id)
+{
+  // record the default zone id for this realm
+  RGWDefaultSystemMetaObjInfo default_info;
+  default_info.default_id = zone_id;
+
+  return impl->write(dpp, y, impl->zone_pool,
+                     default_zone_oid(dpp->get_cct()->_conf, realm_id),
+                     exclusive ? Create::MustNotExist : Create::MayExist,
+                     default_info, nullptr);
+}
+
+int RadosConfigStore::read_default_zone_id(const DoutPrefixProvider* dpp,
+                                           optional_yield y,
+                                           std::string_view realm_id,
+                                           std::string& zone_id)
+{
+  RGWDefaultSystemMetaObjInfo default_info;
+  const int r = impl->read(dpp, y, impl->zone_pool,
+                           default_zone_oid(dpp->get_cct()->_conf, realm_id),
+                           default_info, nullptr);
+  // only touch the output argument on success
+  if (r >= 0) {
+    zone_id = default_info.default_id;
+  }
+  return r;
+}
+
+int RadosConfigStore::delete_default_zone_id(const DoutPrefixProvider* dpp,
+                                             optional_yield y,
+                                             std::string_view realm_id)
+{
+  // drop the object that records this realm's default zone id
+  return impl->remove(dpp, y, impl->zone_pool,
+                      default_zone_oid(dpp->get_cct()->_conf, realm_id),
+                      nullptr);
+}
+
+
+// sal::ZoneWriter backed by the rados config objects. remembers the
+// id/name and object version captured at load time so writes can reject
+// renames-by-write and detect racing updates.
+class RadosZoneWriter : public sal::ZoneWriter {
+  ConfigImpl* impl;
+  RGWObjVersionTracker objv;
+  std::string zone_id;
+  std::string zone_name;
+ public:
+  RadosZoneWriter(ConfigImpl* impl, RGWObjVersionTracker objv,
+                  std::string_view zone_id, std::string_view zone_name)
+    : impl(impl), objv(std::move(objv)),
+      zone_id(zone_id), zone_name(zone_name)
+  {
+  }
+
+  // overwrite the zone info object, guarded by the tracked version
+  int write(const DoutPrefixProvider* dpp, optional_yield y,
+            const RGWZoneParams& info) override
+  {
+    if (zone_id != info.get_id() || zone_name != info.get_name()) {
+      return -EINVAL; // can't modify zone id or name directly
+    }
+
+    const auto& pool = impl->zone_pool;
+    const auto info_oid = zone_info_oid(info.get_id());
+    return impl->write(dpp, y, pool, info_oid, Create::MustExist, info, &objv);
+  }
+
+  // rename in three steps: link the new name object, rewrite the info
+  // with the new name, then unlink the old name. a failed info write
+  // rolls back the new name link; the final old-name unlink is
+  // best-effort (result deliberately ignored).
+  int rename(const DoutPrefixProvider* dpp, optional_yield y,
+             RGWZoneParams& info, std::string_view new_name) override
+  {
+    if (zone_id != info.get_id() || zone_name != info.get_name()) {
+      return -EINVAL; // can't modify zone id or name directly
+    }
+    if (new_name.empty()) {
+      ldpp_dout(dpp, 0) << "zone cannot have an empty name" << dendl;
+      return -EINVAL;
+    }
+
+    const auto& pool = impl->zone_pool;
+    const auto name = RGWNameToId{info.get_id()};
+    const auto info_oid = zone_info_oid(info.get_id());
+    const auto old_oid = zone_name_oid(info.get_name());
+    const auto new_oid = zone_name_oid(new_name);
+
+    // link the new name
+    RGWObjVersionTracker new_objv;
+    new_objv.generate_new_write_ver(dpp->get_cct());
+    int r = impl->write(dpp, y, pool, new_oid, Create::MustNotExist,
+                        name, &new_objv);
+    if (r < 0) {
+      return r;
+    }
+
+    // write the info with updated name
+    info.set_name(std::string{new_name});
+    r = impl->write(dpp, y, pool, info_oid, Create::MustExist, info, &objv);
+    if (r < 0) {
+      // on failure, unlink the new name
+      (void) impl->remove(dpp, y, pool, new_oid, &new_objv);
+      return r;
+    }
+
+    // unlink the old name
+    (void) impl->remove(dpp, y, pool, old_oid, nullptr);
+
+    zone_name = new_name;
+    return 0;
+  }
+
+  // remove the zone: the info object must go first (its failure is
+  // fatal); removing the name link is best-effort
+  int remove(const DoutPrefixProvider* dpp, optional_yield y) override
+  {
+    const auto& pool = impl->zone_pool;
+    const auto info_oid = zone_info_oid(zone_id);
+    int r = impl->remove(dpp, y, pool, info_oid, &objv);
+    if (r < 0) {
+      return r;
+    }
+    const auto name_oid = zone_name_oid(zone_name);
+    (void) impl->remove(dpp, y, pool, name_oid, nullptr);
+    return 0;
+  }
+}; // RadosZoneWriter
+
+
+// create the two objects that make up a zone: info (by id) and the
+// name->id link. a failed name write rolls back the info write.
+int RadosConfigStore::create_zone(const DoutPrefixProvider* dpp,
+                                  optional_yield y, bool exclusive,
+                                  const RGWZoneParams& info,
+                                  std::unique_ptr<sal::ZoneWriter>* writer)
+{
+  if (info.get_id().empty()) {
+    ldpp_dout(dpp, 0) << "zone cannot have an empty id" << dendl;
+    return -EINVAL;
+  }
+  if (info.get_name().empty()) {
+    ldpp_dout(dpp, 0) << "zone cannot have an empty name" << dendl;
+    return -EINVAL;
+  }
+
+  const auto& pool = impl->zone_pool;
+  const auto create = exclusive ? Create::MustNotExist : Create::MayExist;
+
+  // write the zone info
+  const auto info_oid = zone_info_oid(info.get_id());
+  RGWObjVersionTracker objv;
+  objv.generate_new_write_ver(dpp->get_cct());
+
+  int r = impl->write(dpp, y, pool, info_oid, create, info, &objv);
+  if (r < 0) {
+    return r;
+  }
+
+  // write the zone name
+  const auto name_oid = zone_name_oid(info.get_name());
+  const auto name = RGWNameToId{info.get_id()};
+  RGWObjVersionTracker name_objv;
+  name_objv.generate_new_write_ver(dpp->get_cct());
+
+  r = impl->write(dpp, y, pool, name_oid, create, name, &name_objv);
+  if (r < 0) {
+    (void) impl->remove(dpp, y, pool, info_oid, &objv);
+    return r;
+  }
+
+  // optionally hand back a writer bound to the freshly written version
+  if (writer) {
+    *writer = std::make_unique<RadosZoneWriter>(
+        impl.get(), std::move(objv), info.get_id(), info.get_name());
+  }
+  return 0;
+}
+
+int RadosConfigStore::read_zone_by_id(const DoutPrefixProvider* dpp,
+                                      optional_yield y,
+                                      std::string_view zone_id,
+                                      RGWZoneParams& info,
+                                      std::unique_ptr<sal::ZoneWriter>* writer)
+{
+  const auto& pool = impl->zone_pool;
+  RGWObjVersionTracker objv;
+  int r = impl->read(dpp, y, pool, zone_info_oid(zone_id), info, &objv);
+  if (r < 0) {
+    return r;
+  }
+
+  // hand back a writer bound to the version just read so later updates
+  // can detect racing writes
+  if (writer) {
+    *writer = std::make_unique<RadosZoneWriter>(
+        impl.get(), std::move(objv), info.get_id(), info.get_name());
+  }
+  return 0;
+}
+
+int RadosConfigStore::read_zone_by_name(const DoutPrefixProvider* dpp,
+                                        optional_yield y,
+                                        std::string_view zone_name,
+                                        RGWZoneParams& info,
+                                        std::unique_ptr<sal::ZoneWriter>* writer)
+{
+  const auto& pool = impl->zone_pool;
+
+  // two-step lookup: resolve the name to an id, then load the info
+  RGWNameToId name;
+  int r = impl->read(dpp, y, pool, zone_name_oid(zone_name), name, nullptr);
+  if (r < 0) {
+    return r;
+  }
+
+  RGWObjVersionTracker objv;
+  r = impl->read(dpp, y, pool, zone_info_oid(name.obj_id), info, &objv);
+  if (r < 0) {
+    return r;
+  }
+
+  // bind the returned writer to the version just read
+  if (writer) {
+    *writer = std::make_unique<RadosZoneWriter>(
+        impl.get(), std::move(objv), info.get_id(), info.get_name());
+  }
+  return 0;
+}
+
+int RadosConfigStore::read_default_zone(const DoutPrefixProvider* dpp,
+                                        optional_yield y,
+                                        std::string_view realm_id,
+                                        RGWZoneParams& info,
+                                        std::unique_ptr<sal::ZoneWriter>* writer)
+{
+  const auto& pool = impl->zone_pool;
+
+  // resolve this realm's default zone id, then load that zone's info
+  RGWDefaultSystemMetaObjInfo default_info;
+  int r = impl->read(dpp, y, pool,
+                     default_zone_oid(dpp->get_cct()->_conf, realm_id),
+                     default_info, nullptr);
+  if (r < 0) {
+    return r;
+  }
+
+  RGWObjVersionTracker objv;
+  r = impl->read(dpp, y, pool, zone_info_oid(default_info.default_id),
+                 info, &objv);
+  if (r < 0) {
+    return r;
+  }
+
+  // bind the returned writer to the version just read
+  if (writer) {
+    *writer = std::make_unique<RadosZoneWriter>(
+        impl.get(), std::move(objv), info.get_id(), info.get_name());
+  }
+  return 0;
+}
+
+// list zone names by scanning the name-link objects in the zone pool.
+// 'marker' resumes a previous listing; results land in 'entries'/'result'.
+int RadosConfigStore::list_zone_names(const DoutPrefixProvider* dpp,
+                                      optional_yield y,
+                                      const std::string& marker,
+                                      std::span<std::string> entries,
+                                      sal::ListResult<std::string>& result)
+{
+  const auto& pool = impl->zone_pool;
+  // strip the name prefix from each oid; non-matching oids map to an
+  // empty string. take the oid by const reference to avoid copying
+  // every listed object name.
+  constexpr auto prefix = [] (const std::string& oid) -> std::string {
+    if (!oid.starts_with(zone_names_oid_prefix)) {
+      return {};
+    }
+    return oid.substr(zone_names_oid_prefix.size());
+  };
+  return impl->list(dpp, y, pool, marker, prefix, entries, result);
+}
+
+} // namespace rgw::rados
diff --git a/src/rgw/driver/rados/config/zonegroup.cc b/src/rgw/driver/rados/config/zonegroup.cc
new file mode 100644
index 000000000..1766a68ce
--- /dev/null
+++ b/src/rgw/driver/rados/config/zonegroup.cc
@@ -0,0 +1,315 @@
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/dout.h"
+#include "common/errno.h"
+#include "rgw_zone.h"
+#include "driver/rados/config/store.h"
+
+#include "impl.h"
+
+namespace rgw::rados {
+
+// zonegroup oids
+constexpr std::string_view zonegroup_names_oid_prefix = "zonegroups_names.";
+constexpr std::string_view zonegroup_info_oid_prefix = "zonegroup_info.";
+constexpr std::string_view default_zonegroup_info_oid = "default.zonegroup";
+
+// oid of the object holding a zonegroup's RGWZoneGroup info, keyed by id
+static std::string zonegroup_info_oid(std::string_view zonegroup_id)
+{
+  return string_cat_reserve(zonegroup_info_oid_prefix, zonegroup_id);
+}
+// oid of the name->id link object for a zonegroup
+// (note: callers below pass the zonegroup *name* despite the parameter name)
+static std::string zonegroup_name_oid(std::string_view zonegroup_id)
+{
+  return string_cat_reserve(zonegroup_names_oid_prefix, zonegroup_id);
+}
+// per-realm oid of the default-zonegroup object; the prefix can be
+// overridden via the rgw_default_zonegroup_info_oid config option
+static std::string default_zonegroup_oid(const ceph::common::ConfigProxy& conf,
+                                         std::string_view realm_id)
+{
+  const auto prefix = name_or_default(conf->rgw_default_zonegroup_info_oid,
+                                      default_zonegroup_info_oid);
+  return fmt::format("{}.{}", prefix, realm_id);
+}
+
+
+int RadosConfigStore::write_default_zonegroup_id(const DoutPrefixProvider* dpp,
+                                                 optional_yield y,
+                                                 bool exclusive,
+                                                 std::string_view realm_id,
+                                                 std::string_view zonegroup_id)
+{
+  // record the given zonegroup as the realm's default
+  RGWDefaultSystemMetaObjInfo default_zonegroup;
+  default_zonegroup.default_id = zonegroup_id;
+
+  const auto oid = default_zonegroup_oid(dpp->get_cct()->_conf, realm_id);
+  const auto create = exclusive ? Create::MustNotExist : Create::MayExist;
+  return impl->write(dpp, y, impl->zonegroup_pool, oid, create,
+                     default_zonegroup, nullptr);
+}
+
+int RadosConfigStore::read_default_zonegroup_id(const DoutPrefixProvider* dpp,
+                                                optional_yield y,
+                                                std::string_view realm_id,
+                                                std::string& zonegroup_id)
+{
+  // fetch the realm's default-zonegroup object and copy out its id
+  RGWDefaultSystemMetaObjInfo default_zonegroup;
+  const auto oid = default_zonegroup_oid(dpp->get_cct()->_conf, realm_id);
+
+  const int ret = impl->read(dpp, y, impl->zonegroup_pool, oid,
+                             default_zonegroup, nullptr);
+  if (ret < 0) {
+    return ret;
+  }
+  zonegroup_id = default_zonegroup.default_id;
+  return ret;
+}
+
+int RadosConfigStore::delete_default_zonegroup_id(const DoutPrefixProvider* dpp,
+                                                  optional_yield y,
+                                                  std::string_view realm_id)
+{
+  // remove the per-realm default-zonegroup object; no version tracker is
+  // passed (nullptr), so the deletion is unconditional
+  const auto& pool = impl->zonegroup_pool;
+  const auto oid = default_zonegroup_oid(dpp->get_cct()->_conf, realm_id);
+  return impl->remove(dpp, y, pool, oid, nullptr);
+}
+
+
+// sal::ZoneGroupWriter backed by the rados config objects.  It captures
+// the object version observed when the zonegroup was read/created, so
+// every subsequent write is a conditional update that fails if another
+// writer raced us.
+class RadosZoneGroupWriter : public sal::ZoneGroupWriter {
+  ConfigImpl* impl;           // borrowed from RadosConfigStore; not owned
+  RGWObjVersionTracker objv;  // version of the zonegroup info object
+  std::string zonegroup_id;   // id captured at construction
+  std::string zonegroup_name; // name captured at construction
+ public:
+  RadosZoneGroupWriter(ConfigImpl* impl, RGWObjVersionTracker objv,
+                       std::string_view zonegroup_id,
+                       std::string_view zonegroup_name)
+    : impl(impl), objv(std::move(objv)),
+      zonegroup_id(zonegroup_id), zonegroup_name(zonegroup_name)
+  {
+  }
+
+  // Overwrite the zonegroup info object, guarded by the stored version.
+  // The id and name must match what this writer was created with.
+  int write(const DoutPrefixProvider* dpp, optional_yield y,
+            const RGWZoneGroup& info) override
+  {
+    if (zonegroup_id != info.get_id() || zonegroup_name != info.get_name()) {
+      return -EINVAL; // can't modify zonegroup id or name directly
+    }
+
+    const auto& pool = impl->zonegroup_pool;
+    const auto info_oid = zonegroup_info_oid(info.get_id());
+    return impl->write(dpp, y, pool, info_oid, Create::MustExist, info, &objv);
+  }
+
+  // Rename the zonegroup: link the new name object first, then rewrite
+  // the info with the new name, then unlink the old name.  If the info
+  // write fails, the new name link is rolled back; the old-name unlink
+  // and the rollback are best-effort (their errors are ignored).
+  int rename(const DoutPrefixProvider* dpp, optional_yield y,
+             RGWZoneGroup& info, std::string_view new_name) override
+  {
+    if (zonegroup_id != info.get_id() || zonegroup_name != info.get_name()) {
+      return -EINVAL; // can't modify zonegroup id or name directly
+    }
+    if (new_name.empty()) {
+      ldpp_dout(dpp, 0) << "zonegroup cannot have an empty name" << dendl;
+      return -EINVAL;
+    }
+
+    const auto& pool = impl->zonegroup_pool;
+    const auto name = RGWNameToId{info.get_id()};
+    const auto info_oid = zonegroup_info_oid(info.get_id());
+    const auto old_oid = zonegroup_name_oid(info.get_name());
+    const auto new_oid = zonegroup_name_oid(new_name);
+
+    // link the new name
+    RGWObjVersionTracker new_objv;
+    new_objv.generate_new_write_ver(dpp->get_cct());
+    int r = impl->write(dpp, y, pool, new_oid, Create::MustNotExist,
+                        name, &new_objv);
+    if (r < 0) {
+      return r;
+    }
+
+    // write the info with updated name
+    info.set_name(std::string{new_name});
+    r = impl->write(dpp, y, pool, info_oid, Create::MustExist, info, &objv);
+    if (r < 0) {
+      // on failure, unlink the new name
+      (void) impl->remove(dpp, y, pool, new_oid, &new_objv);
+      return r;
+    }
+
+    // unlink the old name
+    (void) impl->remove(dpp, y, pool, old_oid, nullptr);
+
+    zonegroup_name = new_name;
+    return 0;
+  }
+
+  // Delete the zonegroup: the version-checked info object first, then a
+  // best-effort unlink of the name object.
+  int remove(const DoutPrefixProvider* dpp, optional_yield y) override
+  {
+    const auto& pool = impl->zonegroup_pool;
+    const auto info_oid = zonegroup_info_oid(zonegroup_id);
+    int r = impl->remove(dpp, y, pool, info_oid, &objv);
+    if (r < 0) {
+      return r;
+    }
+    const auto name_oid = zonegroup_name_oid(zonegroup_name);
+    (void) impl->remove(dpp, y, pool, name_oid, nullptr);
+    return 0;
+  }
+}; // RadosZoneGroupWriter
+
+
+// Create both config objects for a zonegroup: the info object (keyed by
+// id) and the name->id link object (keyed by name).  If the name link
+// cannot be written, the freshly written info object is rolled back so
+// the pair stays consistent.  On success, optionally returns a writer
+// pinned to the new object version.
+int RadosConfigStore::create_zonegroup(const DoutPrefixProvider* dpp,
+                                       optional_yield y, bool exclusive,
+                                       const RGWZoneGroup& info,
+                                       std::unique_ptr<sal::ZoneGroupWriter>* writer)
+{
+  if (info.get_id().empty()) {
+    ldpp_dout(dpp, 0) << "zonegroup cannot have an empty id" << dendl;
+    return -EINVAL;
+  }
+  if (info.get_name().empty()) {
+    ldpp_dout(dpp, 0) << "zonegroup cannot have an empty name" << dendl;
+    return -EINVAL;
+  }
+
+  const auto& pool = impl->zonegroup_pool;
+  const auto create = exclusive ? Create::MustNotExist : Create::MayExist;
+
+  // write the zonegroup info
+  const auto info_oid = zonegroup_info_oid(info.get_id());
+  RGWObjVersionTracker objv;
+  objv.generate_new_write_ver(dpp->get_cct());
+
+  int r = impl->write(dpp, y, pool, info_oid, create, info, &objv);
+  if (r < 0) {
+    return r;
+  }
+
+  // write the zonegroup name
+  const auto name_oid = zonegroup_name_oid(info.get_name());
+  const auto name = RGWNameToId{info.get_id()};
+  RGWObjVersionTracker name_objv;
+  name_objv.generate_new_write_ver(dpp->get_cct());
+
+  r = impl->write(dpp, y, pool, name_oid, create, name, &name_objv);
+  if (r < 0) {
+    // roll back the info object so we don't leave an unlinked zonegroup
+    (void) impl->remove(dpp, y, pool, info_oid, &objv);
+    return r;
+  }
+
+  if (writer) {
+    *writer = std::make_unique<RadosZoneGroupWriter>(
+        impl.get(), std::move(objv), info.get_id(), info.get_name());
+  }
+  return 0;
+}
+
+int RadosConfigStore::read_zonegroup_by_id(const DoutPrefixProvider* dpp,
+                                           optional_yield y,
+                                           std::string_view zonegroup_id,
+                                           RGWZoneGroup& info,
+                                           std::unique_ptr<sal::ZoneGroupWriter>* writer)
+{
+  // load the zonegroup info object directly by id, tracking its version
+  RGWObjVersionTracker objv;
+  const int ret = impl->read(dpp, y, impl->zonegroup_pool,
+                             zonegroup_info_oid(zonegroup_id), info, &objv);
+  if (ret < 0) {
+    return ret;
+  }
+
+  // optionally hand back a writer pinned to the version we just observed
+  if (writer) {
+    *writer = std::make_unique<RadosZoneGroupWriter>(
+        impl.get(), std::move(objv), info.get_id(), info.get_name());
+  }
+  return 0;
+}
+
+int RadosConfigStore::read_zonegroup_by_name(const DoutPrefixProvider* dpp,
+                                             optional_yield y,
+                                             std::string_view zonegroup_name,
+                                             RGWZoneGroup& info,
+                                             std::unique_ptr<sal::ZoneGroupWriter>* writer)
+{
+  const auto& pool = impl->zonegroup_pool;
+
+  // resolve the zonegroup id through the name->id link object
+  RGWNameToId name_link;
+  int ret = impl->read(dpp, y, pool,
+                       zonegroup_name_oid(zonegroup_name), name_link, nullptr);
+  if (ret < 0) {
+    return ret;
+  }
+
+  // load the info object for the resolved id, tracking its version
+  RGWObjVersionTracker objv;
+  ret = impl->read(dpp, y, pool,
+                   zonegroup_info_oid(name_link.obj_id), info, &objv);
+  if (ret < 0) {
+    return ret;
+  }
+
+  // optionally hand back a writer pinned to the version we just observed
+  if (writer) {
+    *writer = std::make_unique<RadosZoneGroupWriter>(
+        impl.get(), std::move(objv), info.get_id(), info.get_name());
+  }
+  return 0;
+}
+
+int RadosConfigStore::read_default_zonegroup(const DoutPrefixProvider* dpp,
+                                             optional_yield y,
+                                             std::string_view realm_id,
+                                             RGWZoneGroup& info,
+                                             std::unique_ptr<sal::ZoneGroupWriter>* writer)
+{
+  const auto& pool = impl->zonegroup_pool;
+
+  // resolve the realm's default zonegroup id
+  RGWDefaultSystemMetaObjInfo default_zonegroup;
+  const auto default_oid = default_zonegroup_oid(dpp->get_cct()->_conf, realm_id);
+  int ret = impl->read(dpp, y, pool, default_oid, default_zonegroup, nullptr);
+  if (ret < 0) {
+    return ret;
+  }
+
+  // load the info object for that id, tracking its version
+  RGWObjVersionTracker objv;
+  ret = impl->read(dpp, y, pool,
+                   zonegroup_info_oid(default_zonegroup.default_id),
+                   info, &objv);
+  if (ret < 0) {
+    return ret;
+  }
+
+  // optionally hand back a writer pinned to the version we just observed
+  if (writer) {
+    *writer = std::make_unique<RadosZoneGroupWriter>(
+        impl.get(), std::move(objv), info.get_id(), info.get_name());
+  }
+  return 0;
+}
+
+int RadosConfigStore::list_zonegroup_names(const DoutPrefixProvider* dpp,
+                                           optional_yield y,
+                                           const std::string& marker,
+                                           std::span<std::string> entries,
+                                           sal::ListResult<std::string>& result)
+{
+  // filter that maps a raw pool oid to a zonegroup name by stripping the
+  // zonegroup-name oid prefix; oids from other namespaces map to ""
+  constexpr auto name_filter = [] (std::string oid) -> std::string {
+    return oid.starts_with(zonegroup_names_oid_prefix)
+        ? oid.substr(zonegroup_names_oid_prefix.size())
+        : std::string{};
+  };
+  return impl->list(dpp, y, impl->zonegroup_pool, marker, name_filter,
+                    entries, result);
+}
+
+} // namespace rgw::rados
diff --git a/src/rgw/driver/rados/rgw_bucket.cc b/src/rgw/driver/rados/rgw_bucket.cc
new file mode 100644
index 000000000..32cd1ccf9
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_bucket.cc
@@ -0,0 +1,3316 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_acl_s3.h"
+#include "rgw_tag_s3.h"
+
+#include "rgw_bucket.h"
+#include "rgw_op.h"
+#include "rgw_bucket_sync.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_bucket.h"
+#include "services/svc_user.h"
+
+#include "rgw_reshard.h"
+
+// stolen from src/cls/version/cls_version.cc
+#define VERSION_ATTR "ceph.objclass.version"
+
+#include "cls/user/cls_user_types.h"
+
+#include "rgw_sal_rados.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+// seconds for timeout during RGWBucket::check_object_index
+constexpr uint64_t BUCKET_TAG_QUICK_TIMEOUT = 30;
+
+using namespace std;
+
+// these values are copied from cls/rgw/cls_rgw.cc; they are used below as
+// bi_list() start markers to seek directly to the olh / instance entry
+// namespaces of the bucket index
+static const string BI_OLH_ENTRY_NS_START = "\x80" "1001_";
+static const string BI_INSTANCE_ENTRY_NS_START = "\x80" "1000_";
+
+// number of characters that we should allow to be buffered by the formatter
+// before flushing (used by index check methods with dump_keys=true)
+static constexpr int FORMATTER_LEN_FLUSH_THRESHOLD = 4 * 1024 * 1024;
+
+// default number of entries to list with each bucket listing call
+// (use marker to bridge between calls)
+static constexpr size_t listing_max_entries = 1000;
+
+/*
+ * The tenant_name is always returned on purpose. May be empty, of course.
+ */
+/*
+ * Split a bucket spec of the form [tenant/]bucket[:bucket_instance]
+ * (or tenant:bucket:bucket_instance) into its components.
+ * The tenant_name is always returned on purpose. May be empty, of course.
+ */
+static void parse_bucket(const string& bucket,
+                         string *tenant_name,
+                         string *bucket_name,
+                         string *bucket_instance = nullptr /* optional */)
+{
+  /*
+   * expected format: [tenant/]bucket:bucket_instance
+   */
+  // use string::size_type/npos rather than truncating find() into an int
+  auto pos = bucket.find('/');
+  if (pos != string::npos) {
+    *tenant_name = bucket.substr(0, pos);
+  } else {
+    tenant_name->clear();
+  }
+  // when pos == npos, npos + 1 wraps to 0, so bn is the whole input
+  string bn = bucket.substr(pos + 1);
+  pos = bn.find(':');
+  if (pos == string::npos) {
+    *bucket_name = std::move(bn);
+    return;
+  }
+  *bucket_name = bn.substr(0, pos);
+  if (bucket_instance) {
+    *bucket_instance = bn.substr(pos + 1);
+  }
+
+  /*
+   * deal with the possible tenant:bucket:bucket_instance case
+   */
+  // guard bucket_instance: the caller may not have asked for it, in which
+  // case the previous code dereferenced a null pointer here
+  if (tenant_name->empty() && bucket_instance) {
+    pos = bucket_instance->find(':');
+    if (pos != string::npos) {
+      *tenant_name = *bucket_name;
+      *bucket_name = bucket_instance->substr(0, pos);
+      *bucket_instance = bucket_instance->substr(pos + 1);
+    }
+  }
+}
+
+// emit one "object" entry per index key slated for unlinking
+// (sic: "mulipart" typo kept -- the name is referenced elsewhere in this file)
+static void dump_mulipart_index_results(list<rgw_obj_index_key>& objs_to_unlink,
+                                        Formatter *f)
+{
+  for (auto iter = objs_to_unlink.cbegin(); iter != objs_to_unlink.cend(); ++iter) {
+    f->dump_string("object", iter->name);
+  }
+}
+
+/**
+ * Walk all buckets listed for the given user (in chunks of
+ * rgw_list_buckets_max_chunk) and compare each listing entry against the
+ * authoritative bucket info fetched from the driver.  Mismatches in name,
+ * tenant, marker or bucket id are reported on stdout; with fix=true the
+ * ownership linkage is rewritten via Bucket::chown().  Best-effort admin
+ * check: errors are printed/logged and the function returns nothing.
+ */
+void check_bad_user_bucket_mapping(rgw::sal::Driver* driver, rgw::sal::User& user,
+                                   bool fix,
+                                   optional_yield y,
+                                   const DoutPrefixProvider *dpp)
+{
+  rgw::sal::BucketList user_buckets;
+  string marker;
+
+  CephContext *cct = driver->ctx();
+
+  size_t max_entries = cct->_conf->rgw_list_buckets_max_chunk;
+
+  do {
+    int ret = user.list_buckets(dpp, marker, string(), max_entries, false, user_buckets, y);
+    if (ret < 0) {
+      ldout(driver->ctx(), 0) << "failed to read user buckets: "
+                              << cpp_strerror(-ret) << dendl;
+      return;
+    }
+
+    map<string, std::unique_ptr<rgw::sal::Bucket>>& buckets = user_buckets.get_buckets();
+    for (auto i = buckets.begin();
+         i != buckets.end();
+         ++i) {
+      // remember the last key seen so the next list_buckets call resumes here
+      marker = i->first;
+
+      auto& bucket = i->second;
+
+      // fetch the authoritative info for this bucket name
+      std::unique_ptr<rgw::sal::Bucket> actual_bucket;
+      int r = driver->get_bucket(dpp, &user, user.get_tenant(), bucket->get_name(), &actual_bucket, y);
+      if (r < 0) {
+        ldout(driver->ctx(), 0) << "could not get bucket info for bucket=" << bucket << dendl;
+        continue;
+      }
+
+      // compare the listing entry against the authoritative record
+      if (actual_bucket->get_name().compare(bucket->get_name()) != 0 ||
+          actual_bucket->get_tenant().compare(bucket->get_tenant()) != 0 ||
+          actual_bucket->get_marker().compare(bucket->get_marker()) != 0 ||
+          actual_bucket->get_bucket_id().compare(bucket->get_bucket_id()) != 0) {
+        cout << "bucket info mismatch: expected " << actual_bucket << " got " << bucket << std::endl;
+        if (fix) {
+          cout << "fixing" << std::endl;
+          // rewrite ownership; failure is reported but does not stop the scan
+          r = actual_bucket->chown(dpp, user, y);
+          if (r < 0) {
+            cerr << "failed to fix bucket: " << cpp_strerror(-r) << std::endl;
+          }
+        }
+      }
+    }
+  } while (user_buckets.is_truncated());
+}
+
+// returns true if entry is in the empty namespace. note: function
+// type conforms to type RGWBucketListNameFilter
+bool rgw_bucket_object_check_filter(const std::string& oid)
+{
+  static const std::string empty_ns;
+  rgw_obj_key parsed_key; // parse result is discarded; only success matters
+  return rgw_obj_key::oid_to_key_in_ns(oid, &parsed_key, empty_ns);
+}
+
+// delete a single object (or its null version when no instance is given)
+// from the bucket; returns the result of Object::delete_object()
+int rgw_remove_object(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, rgw::sal::Bucket* bucket, rgw_obj_key& key)
+{
+  // an empty instance denotes the null version
+  if (key.instance.empty()) {
+    key.instance = "null";
+  }
+  auto object = bucket->get_object(key);
+  return object->delete_object(dpp, null_yield);
+}
+
+static void set_err_msg(std::string *sink, std::string msg)
+{
+ if (sink && !msg.empty())
+ *sink = msg;
+}
+
+/**
+ * Initialize this admin-op helper from op_state: resolve the user, split
+ * an optional "tenant/name" bucket spec, fetch the bucket, and load the
+ * user record.  Stores a clone of the bucket back into op_state and the
+ * user's display name into op_state.display_name.
+ * Returns 0 on success or a negative error (reported via *err_msg).
+ */
+int RGWBucket::init(rgw::sal::Driver* _driver, RGWBucketAdminOpState& op_state,
+                    optional_yield y, const DoutPrefixProvider *dpp, std::string *err_msg)
+{
+  if (!_driver) {
+    set_err_msg(err_msg, "no storage!");
+    return -EINVAL;
+  }
+
+  driver = _driver;
+
+  std::string bucket_name = op_state.get_bucket_name();
+
+  // need at least one of bucket name or user id to do anything useful
+  if (bucket_name.empty() && op_state.get_user_id().empty())
+    return -EINVAL;
+
+  user = driver->get_user(op_state.get_user_id());
+  std::string tenant = user->get_tenant();
+
+  // split possible tenant/name
+  auto pos = bucket_name.find('/');
+  if (pos != string::npos) {
+    // explicit tenant in the bucket spec overrides the user's tenant
+    tenant = bucket_name.substr(0, pos);
+    bucket_name = bucket_name.substr(pos + 1);
+  }
+
+  int r = driver->get_bucket(dpp, user.get(), tenant, bucket_name, &bucket, y);
+  if (r < 0) {
+    set_err_msg(err_msg, "failed to fetch bucket info for bucket=" + bucket_name);
+    return r;
+  }
+
+  op_state.set_bucket(bucket->clone());
+
+  // load full user info only when a real user id was supplied
+  if (!rgw::sal::User::empty(user.get())) {
+    r = user->load_user(dpp, y);
+    if (r < 0) {
+      set_err_msg(err_msg, "failed to fetch user info");
+      return r;
+    }
+  }
+
+  op_state.display_name = user->get_display_name();
+
+  clear_failure();
+  return 0;
+}
+
+/**
+ * Search the "bucket.instance" metadata section, starting from 'marker',
+ * for an entry whose bucket_id matches the given id.
+ *
+ * Returns true and fills *bucket_out on a match; returns false when no
+ * match exists or when listing fails (failures are reported on stderr).
+ */
+bool rgw_find_bucket_by_id(const DoutPrefixProvider *dpp, CephContext *cct, rgw::sal::Driver* driver,
+                           const string& marker, const string& bucket_id, rgw_bucket* bucket_out)
+{
+  void *handle = nullptr;
+  bool truncated = false;
+  string s;
+
+  int ret = driver->meta_list_keys_init(dpp, "bucket.instance", marker, &handle);
+  if (ret < 0) {
+    cerr << "ERROR: can't get key: " << cpp_strerror(-ret) << std::endl;
+    driver->meta_list_keys_complete(handle);
+    // note: the previous code returned -ret here, which the bool return
+    // type converted to 'true' (i.e. "found") on every failure
+    return false;
+  }
+  do {
+    list<string> keys;
+    ret = driver->meta_list_keys_next(dpp, handle, 1000, keys, &truncated);
+    if (ret < 0) {
+      cerr << "ERROR: lists_keys_next(): " << cpp_strerror(-ret) << std::endl;
+      driver->meta_list_keys_complete(handle);
+      return false; // see note above: was 'return -ret'
+    }
+    for (list<string>::iterator iter = keys.begin(); iter != keys.end(); ++iter) {
+      s = *iter;
+      // skip keys that don't parse as a bucket key
+      ret = rgw_bucket_parse_bucket_key(cct, s, bucket_out, nullptr);
+      if (ret < 0) {
+        continue;
+      }
+      if (bucket_id == bucket_out->bucket_id) {
+        driver->meta_list_keys_complete(handle);
+        return true;
+      }
+    }
+  } while (truncated);
+  driver->meta_list_keys_complete(handle);
+  return false;
+}
+
+/**
+ * Transfer ownership of the bucket (and its objects) to this->user,
+ * resuming object iteration at 'marker'.  Returns the result of
+ * rgw_chown_bucket_and_objects(); errors are reported via *err_msg.
+ */
+int RGWBucket::chown(RGWBucketAdminOpState& op_state, const string& marker,
+                     optional_yield y, const DoutPrefixProvider *dpp, std::string *err_msg)
+{
+  /* User passed in by rgw_admin is the new user; get the current user and set it in
+   * the bucket */
+  std::unique_ptr<rgw::sal::User> old_user = driver->get_user(bucket->get_info().owner);
+  bucket->set_owner(old_user.get());
+
+  return rgw_chown_bucket_and_objects(driver, bucket.get(), user.get(), marker, err_msg, dpp, y);
+}
+
+// apply the quota settings from op_state to the bucket and persist them
+int RGWBucket::set_quota(RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp, std::string *err_msg)
+{
+  // operate on a fresh clone of the bucket handle held by op_state
+  bucket = op_state.get_bucket()->clone();
+  bucket->get_info().quota = op_state.quota;
+
+  const int ret = bucket->put_info(dpp, false, real_time());
+  if (ret < 0) {
+    set_err_msg(err_msg, "ERROR: failed writing bucket instance info: " + cpp_strerror(-ret));
+  }
+  return ret;
+}
+
+// delete the object named in op_state from the bucket held by op_state
+int RGWBucket::remove_object(const DoutPrefixProvider *dpp, RGWBucketAdminOpState& op_state, std::string *err_msg)
+{
+  // operate on a fresh clone of the bucket handle held by op_state
+  bucket = op_state.get_bucket()->clone();
+
+  rgw_obj_key key(op_state.get_object_name());
+  const int ret = rgw_remove_object(dpp, driver, bucket.get(), key);
+  if (ret < 0) {
+    set_err_msg(err_msg, "unable to remove object" + cpp_strerror(-ret));
+    return ret;
+  }
+
+  return 0;
+}
+
+// emit one "object" entry per listed bucket-index entry
+static void dump_bucket_index(const vector<rgw_bucket_dir_entry>& objs, Formatter *f)
+{
+  for (const auto& entry : objs) {
+    f->dump_string("object", entry.key.name);
+  }
+}
+
+// dump per-category storage stats as a "usage" section with one
+// subsection per category, keyed by the category name
+static void dump_bucket_usage(map<RGWObjCategory, RGWStorageStats>& stats, Formatter *formatter)
+{
+  formatter->open_object_section("usage");
+  for (auto& [category, category_stats] : stats) {
+    formatter->open_object_section(to_string(category));
+    category_stats.dump(formatter);
+    formatter->close_section();
+  }
+  formatter->close_section();
+}
+
+// Dump a "check_result" section comparing the stats recorded in the
+// index header against the freshly calculated ones.
+// NOTE(review): both maps are taken by value (a full copy each); they
+// can't simply become const& because dump_bucket_usage takes a non-const
+// reference -- confirm and constify both if nothing relies on the copies.
+static void dump_index_check(map<RGWObjCategory, RGWStorageStats> existing_stats,
+                             map<RGWObjCategory, RGWStorageStats> calculated_stats,
+                             Formatter *formatter)
+{
+  formatter->open_object_section("check_result");
+  formatter->open_object_section("existing_header");
+  dump_bucket_usage(existing_stats, formatter);
+  formatter->close_section();
+  formatter->open_object_section("calculated_header");
+  dump_bucket_usage(calculated_stats, formatter);
+  formatter->close_section();
+  formatter->close_section();
+}
+
+/**
+ * Scan the multipart namespace of the bucket index for orphaned entries:
+ * entries whose oid has no matching "<name>.meta" object.  Orphans are
+ * dumped under "invalid_multipart_entries" and, when op_state requests a
+ * fix, removed from the index in batches.  Returns 0 on success or a
+ * negative error (reported via *err_msg).
+ */
+int RGWBucket::check_bad_index_multipart(RGWBucketAdminOpState& op_state,
+                                         RGWFormatterFlusher& flusher,
+                                         const DoutPrefixProvider *dpp,
+                                         std::string *err_msg)
+{
+  const bool fix_index = op_state.will_fix_index();
+
+  bucket = op_state.get_bucket()->clone();
+
+  rgw::sal::Bucket::ListParams params;
+  params.list_versions = true;
+  params.ns = RGW_OBJ_NS_MULTIPART;
+
+  // phase 1: list the whole multipart namespace, separating ".meta"
+  // objects from everything else
+  std::map<std::string, bool> meta_objs;
+  std::map<rgw_obj_index_key, std::string> all_objs;
+  bool is_truncated;
+  do {
+    rgw::sal::Bucket::ListResults results;
+    int r = bucket->list(dpp, params, listing_max_entries, results, null_yield);
+    if (r < 0) {
+      set_err_msg(err_msg, "failed to list objects in bucket=" + bucket->get_name() +
+              " err=" + cpp_strerror(-r));
+
+      return r;
+    }
+    is_truncated = results.is_truncated;
+
+    for (const auto& o : results.objs) {
+      rgw_obj_index_key key = o.key;
+      rgw_obj obj(bucket->get_key(), key);
+      std::string oid = obj.get_oid();
+
+      int pos = oid.find_last_of('.');
+      if (pos < 0) {
+        /* obj has no suffix */
+        all_objs[key] = oid;
+      } else {
+        /* obj has suffix */
+        std::string name = oid.substr(0, pos);
+        std::string suffix = oid.substr(pos + 1);
+
+        if (suffix.compare("meta") == 0) {
+          meta_objs[name] = true;
+        } else {
+          // remember the base name so it can be matched against meta_objs
+          all_objs[key] = name;
+        }
+      }
+    }
+  } while (is_truncated);
+
+  // phase 2: anything without a matching .meta object is an orphan
+  std::list<rgw_obj_index_key> objs_to_unlink;
+  Formatter *f =  flusher.get_formatter();
+
+  f->open_array_section("invalid_multipart_entries");
+
+  for (const auto& o : all_objs) {
+    const std::string& name = o.second;
+    if (meta_objs.find(name) == meta_objs.end()) {
+      objs_to_unlink.push_back(o.first);
+    }
+
+    // process (and optionally fix) orphans in bounded batches
+    if (objs_to_unlink.size() > listing_max_entries) {
+      if (fix_index) {
+        // note: under rados this removes directly from rados index objects
+        int r = bucket->remove_objs_from_index(dpp, objs_to_unlink);
+        if (r < 0) {
+          set_err_msg(err_msg, "ERROR: remove_obj_from_index() returned error: " +
+                      cpp_strerror(-r));
+          return r;
+        }
+      }
+
+      dump_mulipart_index_results(objs_to_unlink, f);
+      flusher.flush();
+      objs_to_unlink.clear();
+    }
+  }
+
+  // final partial batch
+  if (fix_index) {
+    // note: under rados this removes directly from rados index objects
+    int r = bucket->remove_objs_from_index(dpp, objs_to_unlink);
+    if (r < 0) {
+      set_err_msg(err_msg, "ERROR: remove_obj_from_index() returned error: " +
+              cpp_strerror(-r));
+
+      return r;
+    }
+  }
+
+  dump_mulipart_index_results(objs_to_unlink, f);
+  f->close_section();
+  flusher.flush();
+
+  return 0;
+}
+
+/**
+ * Re-list the whole bucket with the force-check filter (which rewrites
+ * stale index entries as a side effect under rados), dumping every key
+ * seen.  Requires op_state.fix_index; a quick tag timeout is used for
+ * the duration of the scan and restored afterwards.
+ * Returns 0 on success or a negative error (reported via *err_msg).
+ */
+int RGWBucket::check_object_index(const DoutPrefixProvider *dpp,
+                                  RGWBucketAdminOpState& op_state,
+                                  RGWFormatterFlusher& flusher,
+                                  optional_yield y,
+                                  std::string *err_msg)
+{
+
+  bool fix_index = op_state.will_fix_index();
+
+  if (!fix_index) {
+    set_err_msg(err_msg, "check-objects flag requires fix index enabled");
+    return -EINVAL;
+  }
+
+  // use a quicker/shorter tag timeout during this process
+  bucket->set_tag_timeout(dpp, BUCKET_TAG_QUICK_TIMEOUT);
+
+  rgw::sal::Bucket::ListResults results;
+  results.is_truncated = true;
+
+  Formatter *formatter = flusher.get_formatter();
+  formatter->open_object_section("objects");
+
+  while (results.is_truncated) {
+    rgw::sal::Bucket::ListParams params;
+    params.marker = results.next_marker;
+    params.force_check_filter = rgw_bucket_object_check_filter;
+
+    int r = bucket->list(dpp, params, listing_max_entries, results, y);
+
+    if (r == -ENOENT) {
+      break;
+    } else if (r < 0) {
+      set_err_msg(err_msg, "ERROR: failed operation r=" + cpp_strerror(-r));
+      // previously the error was recorded but the loop continued with a
+      // stale is_truncated=true, which could spin forever and the caller
+      // never saw the failure; clean up and propagate it instead
+      formatter->close_section();
+      bucket->set_tag_timeout(dpp, 0); // restore normal tag timeout
+      return r;
+    }
+
+    dump_bucket_index(results.objs, formatter);
+    flusher.flush();
+  }
+
+  formatter->close_section();
+
+  // restore normal tag timeout for bucket
+  bucket->set_tag_timeout(dpp, 0);
+
+  return 0;
+}
+
+/**
+ * Loops over all olh entries in a bucket shard and finds ones with
+ * exists=false and pending_removal=true. If the pending log is empty on
+ * these entries, they were left behind after the last remaining version of
+ * an object was deleted or after an incomplete upload. This was known to
+ * happen historically due to concurrency conflicts among requests referencing
+ * the same object key. If op_state.fix_index is true, we continue where the
+ * request left off by calling RGWRados::clear_olh. If the pending log is not
+ * empty, we attempt to apply it.
+ */
+static int check_index_olh(rgw::sal::RadosStore* const rados_store,
+                           rgw::sal::Bucket* const bucket,
+                           const DoutPrefixProvider *dpp,
+                           RGWBucketAdminOpState& op_state,
+                           RGWFormatterFlusher& flusher,
+                           const int shard,
+                           uint64_t* const count_out,
+                           optional_yield y)
+{
+  // start the bi_list scan at the beginning of the olh entry namespace
+  string marker = BI_OLH_ENTRY_NS_START;
+  bool is_truncated = true;
+  list<rgw_cls_bi_entry> entries;
+
+  RGWObjectCtx obj_ctx(rados_store);
+  RGWRados* store = rados_store->getRados();
+  RGWRados::BucketShard bs(store);
+
+  int ret = bs.init(dpp, bucket->get_info(), bucket->get_info().layout.current_index, shard);
+  if (ret < 0) {
+    ldpp_dout(dpp, -1) << "ERROR bs.init(bucket=" << bucket << "): " << cpp_strerror(-ret) << dendl;
+    return ret;
+  }
+
+  *count_out = 0;
+  do {
+    entries.clear();
+    ret = store->bi_list(bs, "", marker, -1, &entries, &is_truncated);
+    if (ret < 0) {
+      ldpp_dout(dpp, -1) << "ERROR bi_list(): " << cpp_strerror(-ret) << dendl;
+      break;
+    }
+    list<rgw_cls_bi_entry>::iterator iter;
+    for (iter = entries.begin(); iter != entries.end(); ++iter) {
+      rgw_cls_bi_entry& entry = *iter;
+      // advance the marker so the next bi_list call resumes after this key
+      marker = entry.idx;
+      if (entry.type != BIIndexType::OLH) {
+        // entries are namespace-ordered; a non-olh entry ends the scan
+        is_truncated = false;
+        break;
+      }
+      rgw_bucket_olh_entry olh_entry;
+      auto iiter = entry.data.cbegin();
+      try {
+        decode(olh_entry, iiter);
+      } catch (buffer::error& err) {
+        ldpp_dout(dpp, -1) << "ERROR failed to decode olh entry for key: " << entry.idx << dendl;
+        continue;
+      }
+      // only leftover entries (no live object, flagged for removal) qualify
+      if (olh_entry.exists || !olh_entry.pending_removal) {
+        continue;
+      }
+      if (op_state.will_fix_index()) {
+        rgw_obj obj(bucket->get_key(), olh_entry.key.name);
+        if (olh_entry.pending_log.empty()) {
+          // nothing left to apply: finish the interrupted removal
+          ret = store->clear_olh(dpp, obj_ctx, obj, bucket->get_info(), olh_entry.tag, olh_entry.epoch, y);
+          if (ret < 0) {
+            ldpp_dout(dpp, -1) << "ERROR failed to clear olh for: " << olh_entry.key.name << " clear_olh(): " << cpp_strerror(-ret) << dendl;
+            continue;
+          }
+        } else {
+          // pending log present: try to apply it via update_olh
+          std::unique_ptr<rgw::sal::Object> object = bucket->get_object({olh_entry.key.name});
+          RGWObjState *state;
+          ret = object->get_obj_state(dpp, &state, y, false);
+          if (ret < 0) {
+            ldpp_dout(dpp, -1) << "ERROR failed to get state for: " << olh_entry.key.name << " get_obj_state(): " << cpp_strerror(-ret) << dendl;
+            continue;
+          }
+          ret = store->update_olh(dpp, obj_ctx, state, bucket->get_info(), obj);
+          if (ret < 0) {
+            ldpp_dout(dpp, -1) << "ERROR failed to update olh for: " << olh_entry.key.name << " update_olh(): " << cpp_strerror(-ret) << dendl;
+            continue;
+          }
+        }
+      }
+      if (op_state.dump_keys) {
+        flusher.get_formatter()->dump_string("", olh_entry.key.name);
+        // flush periodically so the formatter buffer stays bounded
+        if (flusher.get_formatter()->get_len() > FORMATTER_LEN_FLUSH_THRESHOLD) {
+          flusher.flush();
+        }
+      }
+      *count_out += 1;
+    }
+  } while (is_truncated);
+  flusher.flush();
+  return 0;
+}
+
+
+/**
+ * Spawns separate coroutines to check each bucket shard for leftover
+ * olh entries (and remove them if op_state.fix_index is true).
+ */
+int RGWBucket::check_index_olh(rgw::sal::RadosStore* const rados_store,
+                               const DoutPrefixProvider *dpp,
+                               RGWBucketAdminOpState& op_state,
+                               RGWFormatterFlusher& flusher)
+{
+  // olh entries only exist in versioned buckets; nothing to do otherwise
+  const RGWBucketInfo& bucket_info = get_bucket_info();
+  if ((bucket_info.versioning_status() & BUCKET_VERSIONED) == 0) {
+    ldpp_dout(dpp, 0) << "WARNING: this command is only applicable to versioned buckets" << dendl;
+    return 0;
+  }
+
+  Formatter* formatter = flusher.get_formatter();
+  if (op_state.dump_keys) {
+    formatter->open_array_section("");
+  }
+
+  const int max_shards = rgw::num_shards(bucket_info.layout.current_index);
+  std::string verb = op_state.will_fix_index() ? "removed" : "found";
+  uint64_t count_out = 0;
+
+  boost::asio::io_context context;
+  int next_shard = 0;
+
+  // run up to max_aio shard scans concurrently
+  const int max_aio = std::max(1, op_state.get_max_aio());
+
+  for (int i=0; i<max_aio; i++) {
+    spawn::spawn(context, [&](yield_context yield) {
+      // each coroutine repeatedly claims the next unprocessed shard;
+      // the shared counters are touched only from coroutines driven by
+      // the single context.run() call below, so no locking is needed
+      while (true) {
+        int shard = next_shard;
+        next_shard += 1;
+        if (shard >= max_shards) {
+          return;
+        }
+        optional_yield y(context, yield);
+        uint64_t shard_count;
+        int r = ::check_index_olh(rados_store, &*bucket, dpp, op_state, flusher, shard, &shard_count, y);
+        if (r < 0) {
+          // per-shard failures are logged but do not abort the others
+          ldpp_dout(dpp, -1) << "NOTICE: error processing shard " << shard <<
+            " check_index_olh(): " << r << dendl;
+        }
+        count_out += shard_count;
+        if (!op_state.hide_progress) {
+          ldpp_dout(dpp, 1) << "NOTICE: finished shard " << shard << " (" << shard_count <<
+            " entries " << verb << ")" << dendl;
+        }
+      }
+    });
+  }
+  try {
+    context.run();
+  } catch (const std::system_error& e) {
+    return -e.code().value();
+  }
+  if (!op_state.hide_progress) {
+    ldpp_dout(dpp, 1) << "NOTICE: finished all shards (" << count_out <<
+      " entries " << verb << ")" << dendl;
+  }
+  if (op_state.dump_keys) {
+    formatter->close_section();
+    flusher.flush();
+  }
+  return 0;
+}
+
+/**
+ * Indicates whether a versioned bucket instance entry is listable in the
+ * index. It does this by looping over all plain entries with prefix equal to
+ * the key name, and checking whether any have an instance ID matching the one
+ * on the specified key. The existence of an instance entry without a matching
+ * plain entry indicates that the object was uploaded successfully, but the
+ * request exited prior to linking the object into the index (via the creation
+ * of a plain entry).
+ */
+static int is_versioned_instance_listable(const DoutPrefixProvider *dpp,
+                                          RGWRados::BucketShard& bs,
+                                          const cls_rgw_obj_key& key,
+                                          bool& listable,
+                                          optional_yield y)
+{
+  const std::string empty_delim;
+  cls_rgw_obj_key marker;
+  rgw_cls_list_ret result;
+  listable = false;
+
+  do {
+    // list plain entries with prefix == key.name, 1000 at a time
+    librados::ObjectReadOperation op;
+    cls_rgw_bucket_list_op(op, marker, key.name, empty_delim, 1000,
+                           true, &result);
+    bufferlist ibl;
+    int r = bs.bucket_obj.operate(dpp, &op, &ibl, y);
+    if (r < 0) {
+      return r;
+    }
+
+    for (auto const& entry : result.dir.m) {
+      if (entry.second.key == key) {
+        // found a plain entry with a matching instance id
+        listable = true;
+        return 0;
+      }
+      // remember the last key so the next listing call resumes after it
+      marker = entry.second.key;
+    }
+  } while (result.is_truncated);
+  return 0;
+}
+
+/**
+ * Loops over all instance entries in a bucket shard and finds ones with
+ * versioned_epoch=0 and an mtime that is earlier than op_state.min_age
+ * relative to the current time. These entries represent objects that were
+ * uploaded successfully but were not successfully linked into the object
+ * index. As an extra precaution, we also verify that these entries are indeed
+ * non listable (have no corresponding plain entry in the index). We can assume
+ * that clients received an error response for the associated upload requests
+ * since the bucket index linking transaction did not complete. Therefore, if
+ * op_state.fix_index is true, we remove the object that is associated with the
+ * instance entry.
+ */
+static int check_index_unlinked(rgw::sal::RadosStore* const rados_store,
+                                rgw::sal::Bucket* const bucket,
+                                const DoutPrefixProvider *dpp,
+                                RGWBucketAdminOpState& op_state,
+                                RGWFormatterFlusher& flusher,
+                                const int shard,
+                                uint64_t* const count_out,
+                                optional_yield y)
+{
+  // start the bi_list scan at the beginning of the instance entry namespace
+  string marker = BI_INSTANCE_ENTRY_NS_START;
+  bool is_truncated = true;
+  list<rgw_cls_bi_entry> entries;
+
+  RGWObjectCtx obj_ctx(rados_store);
+  RGWRados* store = rados_store->getRados();
+  RGWRados::BucketShard bs(store);
+
+  int ret = bs.init(dpp, bucket->get_info(), bucket->get_info().layout.current_index, shard);
+  if (ret < 0) {
+    ldpp_dout(dpp, -1) << "ERROR bs.init(bucket=" << bucket << "): " << cpp_strerror(-ret) << dendl;
+    return ret;
+  }
+
+  // entries younger than min_age are still considered in-flight
+  ceph::real_clock::time_point now = ceph::real_clock::now();
+  ceph::real_clock::time_point not_after = now - op_state.min_age;
+
+  *count_out = 0;
+  do {
+    entries.clear();
+    ret = store->bi_list(bs, "", marker, -1, &entries, &is_truncated);
+    if (ret < 0) {
+      ldpp_dout(dpp, -1) << "ERROR bi_list(): " << cpp_strerror(-ret) << dendl;
+      break;
+    }
+    list<rgw_cls_bi_entry>::iterator iter;
+    for (iter = entries.begin(); iter != entries.end(); ++iter) {
+      rgw_cls_bi_entry& entry = *iter;
+      // advance the marker so the next bi_list call resumes after this key
+      marker = entry.idx;
+      if (entry.type != BIIndexType::Instance) {
+        // entries are namespace-ordered; a non-instance entry ends the scan
+        is_truncated = false;
+        break;
+      }
+      rgw_bucket_dir_entry dir_entry;
+      auto iiter = entry.data.cbegin();
+      try {
+        decode(dir_entry, iiter);
+      } catch (buffer::error& err) {
+        ldpp_dout(dpp, -1) << "ERROR failed to decode instance entry for key: " <<
+          entry.idx << dendl;
+        continue;
+      }
+      // skip linked entries and anything too recent to judge
+      if (dir_entry.versioned_epoch != 0 || dir_entry.meta.mtime > not_after) {
+        continue;
+      }
+      // double-check: a listable entry has a plain entry and is healthy
+      bool listable;
+      ret = is_versioned_instance_listable(dpp, bs, dir_entry.key, listable, y);
+      if (ret < 0) {
+        ldpp_dout(dpp, -1) << "ERROR is_versioned_instance_listable(key='" <<
+          dir_entry.key << "'): " << cpp_strerror(-ret) << dendl;
+        continue;
+      }
+      if (listable) {
+        continue;
+      }
+      if (op_state.will_fix_index()) {
+        // remove the orphaned object version
+        rgw_obj_key key(dir_entry.key.name, dir_entry.key.instance);
+        ret = rgw_remove_object(dpp, rados_store, bucket, key);
+        if (ret < 0) {
+          ldpp_dout(dpp, -1) << "ERROR rgw_remove_obj(key='" <<
+            dir_entry.key << "'): " << cpp_strerror(-ret) << dendl;
+          continue;
+        }
+      }
+      if (op_state.dump_keys) {
+        Formatter* const formatter = flusher.get_formatter();
+        formatter->open_object_section("object_instance");
+        formatter->dump_string("name", dir_entry.key.name);
+        formatter->dump_string("instance", dir_entry.key.instance);
+        formatter->close_section();
+        // flush periodically so the formatter buffer stays bounded
+        if (formatter->get_len() > FORMATTER_LEN_FLUSH_THRESHOLD) {
+          flusher.flush();
+        }
+      }
+      *count_out += 1;
+    }
+  } while (is_truncated);
+  flusher.flush();
+  return 0;
+}
+
/**
 * Spawns separate coroutines to check each bucket shard for unlinked
 * instance entries (and remove them if op_state.fix_index is true).
 *
 * Per-shard work is done by the file-local ::check_index_unlinked();
 * shard failures are logged but do not abort the remaining shards.
 * Returns 0, or a negative error if the io_context run fails.
 */
int RGWBucket::check_index_unlinked(rgw::sal::RadosStore* const rados_store,
                                    const DoutPrefixProvider *dpp,
                                    RGWBucketAdminOpState& op_state,
                                    RGWFormatterFlusher& flusher)
{
  const RGWBucketInfo& bucket_info = get_bucket_info();
  // unlinked instance entries can only exist on versioned buckets
  if ((bucket_info.versioning_status() & BUCKET_VERSIONED) == 0) {
    ldpp_dout(dpp, 0) << "WARNING: this command is only applicable to versioned buckets" << dendl;
    return 0;
  }

  Formatter* formatter = flusher.get_formatter();
  if (op_state.dump_keys) {
    formatter->open_array_section("");
  }

  const int max_shards = rgw::num_shards(bucket_info.layout.current_index);
  std::string verb = op_state.will_fix_index() ? "removed" : "found";
  uint64_t count_out = 0;

  // spawn up to max_aio coroutines, each claiming the next unprocessed
  // shard; context.run() is driven from this single thread, so the shared
  // next_shard/count_out updates are not concurrent
  int max_aio = std::max(1, op_state.get_max_aio());
  int next_shard = 0;
  boost::asio::io_context context;
  for (int i=0; i<max_aio; i++) {
    spawn::spawn(context, [&](yield_context yield) {
      while (true) {
        int shard = next_shard;
        next_shard += 1;
        if (shard >= max_shards) {
          return;
        }
        uint64_t shard_count;
        optional_yield y {context, yield};
        // best-effort per shard: log failures and keep going
        int r = ::check_index_unlinked(rados_store, &*bucket, dpp, op_state, flusher, shard, &shard_count, y);
        if (r < 0) {
          ldpp_dout(dpp, -1) << "ERROR: error processing shard " << shard <<
            " check_index_unlinked(): " << r << dendl;
        }
        count_out += shard_count;
        if (!op_state.hide_progress) {
          ldpp_dout(dpp, 1) << "NOTICE: finished shard " << shard << " (" << shard_count <<
            " entries " << verb << ")" << dendl;
        }
      }
    });
  }
  try {
    context.run();
  } catch (const std::system_error& e) {
    // convert the escaped system_error into a negative errno-style return
    return -e.code().value();
  }

  if (!op_state.hide_progress) {
    ldpp_dout(dpp, 1) << "NOTICE: finished all shards (" << count_out <<
      " entries " << verb << ")" << dendl;
  }
  if (op_state.dump_keys) {
    formatter->close_section();
    flusher.flush();
  }
  return 0;
}
+
+int RGWBucket::check_index(const DoutPrefixProvider *dpp,
+ RGWBucketAdminOpState& op_state,
+ map<RGWObjCategory, RGWStorageStats>& existing_stats,
+ map<RGWObjCategory, RGWStorageStats>& calculated_stats,
+ std::string *err_msg)
+{
+ bool fix_index = op_state.will_fix_index();
+
+ int r = bucket->check_index(dpp, existing_stats, calculated_stats);
+ if (r < 0) {
+ set_err_msg(err_msg, "failed to check index error=" + cpp_strerror(-r));
+ return r;
+ }
+
+ if (fix_index) {
+ r = bucket->rebuild_index(dpp);
+ if (r < 0) {
+ set_err_msg(err_msg, "failed to rebuild index err=" + cpp_strerror(-r));
+ return r;
+ }
+ }
+
+ return 0;
+}
+
+int RGWBucket::sync(RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp, std::string *err_msg)
+{
+ if (!driver->is_meta_master()) {
+ set_err_msg(err_msg, "ERROR: failed to update bucket sync: only allowed on meta master zone");
+ return -EINVAL;
+ }
+ bool sync = op_state.will_sync_bucket();
+ if (sync) {
+ bucket->get_info().flags &= ~BUCKET_DATASYNC_DISABLED;
+ } else {
+ bucket->get_info().flags |= BUCKET_DATASYNC_DISABLED;
+ }
+
+ // when writing this metadata, RGWSI_BucketIndex_RADOS::handle_overwrite()
+ // will write the corresponding datalog and bilog entries
+ int r = bucket->put_info(dpp, false, real_time());
+ if (r < 0) {
+ set_err_msg(err_msg, "ERROR: failed writing bucket instance info:" + cpp_strerror(-r));
+ return r;
+ }
+
+ return 0;
+}
+
+
+int RGWBucket::policy_bl_to_stream(bufferlist& bl, ostream& o)
+{
+ RGWAccessControlPolicy_S3 policy(g_ceph_context);
+ int ret = decode_bl(bl, policy);
+ if (ret < 0) {
+ ldout(driver->ctx(),0) << "failed to decode RGWAccessControlPolicy" << dendl;
+ }
+ policy.to_xml(o);
+ return 0;
+}
+
+int rgw_object_get_attr(const DoutPrefixProvider *dpp,
+ rgw::sal::Driver* driver, rgw::sal::Object* obj,
+ const char* attr_name, bufferlist& out_bl, optional_yield y)
+{
+ std::unique_ptr<rgw::sal::Object::ReadOp> rop = obj->get_read_op();
+
+ return rop->get_attr(dpp, attr_name, out_bl, y);
+}
+
+int RGWBucket::get_policy(RGWBucketAdminOpState& op_state, RGWAccessControlPolicy& policy, optional_yield y, const DoutPrefixProvider *dpp)
+{
+ int ret;
+ std::string object_name = op_state.get_object_name();
+
+ bucket = op_state.get_bucket()->clone();
+
+ if (!object_name.empty()) {
+ bufferlist bl;
+ std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(rgw_obj_key(object_name));
+
+ ret = rgw_object_get_attr(dpp, driver, obj.get(), RGW_ATTR_ACL, bl, y);
+ if (ret < 0){
+ return ret;
+ }
+
+ ret = decode_bl(bl, policy);
+ if (ret < 0) {
+ ldout(driver->ctx(),0) << "failed to decode RGWAccessControlPolicy" << dendl;
+ }
+ return ret;
+ }
+
+ map<string, bufferlist>::iterator aiter = bucket->get_attrs().find(RGW_ATTR_ACL);
+ if (aiter == bucket->get_attrs().end()) {
+ return -ENOENT;
+ }
+
+ ret = decode_bl(aiter->second, policy);
+ if (ret < 0) {
+ ldout(driver->ctx(),0) << "failed to decode RGWAccessControlPolicy" << dendl;
+ }
+
+ return ret;
+}
+
+
+int RGWBucketAdminOp::get_policy(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+ RGWAccessControlPolicy& policy, const DoutPrefixProvider *dpp)
+{
+ RGWBucket bucket;
+
+ int ret = bucket.init(driver, op_state, null_yield, dpp);
+ if (ret < 0)
+ return ret;
+
+ ret = bucket.get_policy(op_state, policy, null_yield, dpp);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+/* Wrappers to facilitate RESTful interface */
+
+
+int RGWBucketAdminOp::get_policy(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+ RGWFormatterFlusher& flusher, const DoutPrefixProvider *dpp)
+{
+ RGWAccessControlPolicy policy(driver->ctx());
+
+ int ret = get_policy(driver, op_state, policy, dpp);
+ if (ret < 0)
+ return ret;
+
+ Formatter *formatter = flusher.get_formatter();
+
+ flusher.start(0);
+
+ formatter->open_object_section("policy");
+ policy.dump(formatter);
+ formatter->close_section();
+
+ flusher.flush();
+
+ return 0;
+}
+
+int RGWBucketAdminOp::dump_s3_policy(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+ ostream& os, const DoutPrefixProvider *dpp)
+{
+ RGWAccessControlPolicy_S3 policy(driver->ctx());
+
+ int ret = get_policy(driver, op_state, policy, dpp);
+ if (ret < 0)
+ return ret;
+
+ policy.to_xml(os);
+
+ return 0;
+}
+
+int RGWBucketAdminOp::unlink(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp)
+{
+ RGWBucket bucket;
+
+ int ret = bucket.init(driver, op_state, null_yield, dpp);
+ if (ret < 0)
+ return ret;
+
+ return static_cast<rgw::sal::RadosStore*>(driver)->ctl()->bucket->unlink_bucket(op_state.get_user_id(), op_state.get_bucket()->get_info().bucket, null_yield, dpp, true);
+}
+
/**
 * Link (re-own, and optionally rename/re-tenant) the bucket named in
 * op_state to the user named in op_state. The sequence is:
 *   1. validate the optional bucket-id hint,
 *   2. decode the bucket's current ACL to find the old owner,
 *   3. unlink the bucket from the old owner,
 *   4. install a fresh default ACL owned by the new user,
 *   5. write the (possibly renamed) bucket instance info,
 *   6. link the entrypoint to the new user,
 *   7. if the bucket key changed, remove the old entrypoint and
 *      instance metadata.
 * Returns 0 on success; fills *err and returns a negative error code on
 * failure. Note: a failure after step 3 can leave the bucket unlinked.
 */
int RGWBucketAdminOp::link(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp, string *err)
{
  if (!op_state.is_user_op()) {
    set_err_msg(err, "empty user id");
    return -EINVAL;
  }

  RGWBucket bucket;
  int ret = bucket.init(driver, op_state, null_yield, dpp, err);
  if (ret < 0)
    return ret;

  string bucket_id = op_state.get_bucket_id();
  std::string display_name = op_state.get_user_display_name();
  std::unique_ptr<rgw::sal::Bucket> loc_bucket;
  std::unique_ptr<rgw::sal::Bucket> old_bucket;

  loc_bucket = op_state.get_bucket()->clone();

  // if a bucket id was supplied, it must match the bucket we resolved
  if (!bucket_id.empty() && bucket_id != loc_bucket->get_bucket_id()) {
    set_err_msg(err,
        "specified bucket id does not match " + loc_bucket->get_bucket_id());
    return -EINVAL;
  }

  // keep a copy of the original key so we can detect a rename/re-tenant
  // and clean up the old metadata afterwards
  old_bucket = loc_bucket->clone();

  loc_bucket->get_key().tenant = op_state.get_user_id().tenant;

  // optional rename; "tenant/name" form overrides the tenant as well
  if (!op_state.new_bucket_name.empty()) {
    auto pos = op_state.new_bucket_name.find('/');
    if (pos != string::npos) {
      loc_bucket->get_key().tenant = op_state.new_bucket_name.substr(0, pos);
      loc_bucket->get_key().name = op_state.new_bucket_name.substr(pos + 1);
    } else {
      loc_bucket->get_key().name = op_state.new_bucket_name;
    }
  }

  RGWObjVersionTracker objv_tracker;
  RGWObjVersionTracker old_version = loc_bucket->get_info().objv_tracker;

  // the current ACL tells us who the old owner is, so we can unlink them
  map<string, bufferlist>::iterator aiter = loc_bucket->get_attrs().find(RGW_ATTR_ACL);
  if (aiter == loc_bucket->get_attrs().end()) {
	// should never happen; only pre-argonaut buckets lacked this.
    ldpp_dout(dpp, 0) << "WARNING: can't bucket link because no acl on bucket=" << old_bucket << dendl;
    set_err_msg(err,
	"While crossing the Anavros you have displeased the goddess Hera."
	"  You must sacrifice your ancient bucket " + loc_bucket->get_bucket_id());
    return -EINVAL;
  }
  bufferlist& aclbl = aiter->second;
  RGWAccessControlPolicy policy;
  ACLOwner owner;
  try {
   auto iter = aclbl.cbegin();
   decode(policy, iter);
   owner = policy.get_owner();
  } catch (buffer::error& e) {
    set_err_msg(err, "couldn't decode policy");
    return -EIO;
  }

  // step 3: detach the bucket from its current owner (no entrypoint update)
  int r = static_cast<rgw::sal::RadosStore*>(driver)->ctl()->bucket->unlink_bucket(owner.get_id(), old_bucket->get_info().bucket, null_yield, dpp, false);
  if (r < 0) {
    set_err_msg(err, "could not unlink policy from user " + owner.get_id().to_str());
    return r;
  }

  // now update the user for the bucket...
  if (display_name.empty()) {
    ldpp_dout(dpp, 0) << "WARNING: user " << op_state.get_user_id() << " has no display name set" << dendl;
  }

  // step 4: replace the ACL with a default policy owned by the new user
  RGWAccessControlPolicy policy_instance;
  policy_instance.create_default(op_state.get_user_id(), display_name);
  owner = policy_instance.get_owner();

  aclbl.clear();
  policy_instance.encode(aclbl);

  bool exclusive = false;
  loc_bucket->get_info().owner = op_state.get_user_id();
  if (*loc_bucket != *old_bucket) {
    // renamed/re-tenanted: write under the new key as a brand-new object
    loc_bucket->get_info().bucket = loc_bucket->get_key();
    loc_bucket->get_info().objv_tracker.version_for_read()->ver = 0;
    exclusive = true;
  }

  r = loc_bucket->put_info(dpp, exclusive, ceph::real_time());
  if (r < 0) {
    set_err_msg(err, "ERROR: failed writing bucket instance info: " + cpp_strerror(-r));
    return r;
  }

  /* link to user */
  RGWBucketEntryPoint ep;
  ep.bucket = loc_bucket->get_info().bucket;
  ep.owner = op_state.get_user_id();
  ep.creation_time = loc_bucket->get_info().creation_time;
  ep.linked = true;
  rgw::sal::Attrs ep_attrs;
  rgw_ep_info ep_data{ep, ep_attrs};

  r = static_cast<rgw::sal::RadosStore*>(driver)->ctl()->bucket->link_bucket(op_state.get_user_id(), loc_bucket->get_info().bucket, loc_bucket->get_info().creation_time, null_yield, dpp, true, &ep_data);
  if (r < 0) {
    set_err_msg(err, "failed to relink bucket");
    return r;
  }

  if (*loc_bucket != *old_bucket) {
    // like RGWRados::delete_bucket -- excepting no bucket_index work.
    r = static_cast<rgw::sal::RadosStore*>(driver)->ctl()->bucket->remove_bucket_entrypoint_info(
					old_bucket->get_key(), null_yield, dpp,
					RGWBucketCtl::Bucket::RemoveParams()
					.set_objv_tracker(&ep_data.ep_objv));
    if (r < 0) {
      set_err_msg(err, "failed to unlink old bucket " + old_bucket->get_tenant() + "/" + old_bucket->get_name());
      return r;
    }
    r = static_cast<rgw::sal::RadosStore*>(driver)->ctl()->bucket->remove_bucket_instance_info(
					old_bucket->get_key(), old_bucket->get_info(),
					null_yield, dpp,
					RGWBucketCtl::BucketInstance::RemoveParams()
					.set_objv_tracker(&ep_data.ep_objv));
    if (r < 0) {
      set_err_msg(err, "failed to unlink old bucket " + old_bucket->get_tenant() + "/" + old_bucket->get_name());
      return r;
    }
  }

  return 0;
}
+
+int RGWBucketAdminOp::chown(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const string& marker, const DoutPrefixProvider *dpp, string *err)
+{
+ RGWBucket bucket;
+
+ int ret = bucket.init(driver, op_state, null_yield, dpp, err);
+ if (ret < 0)
+ return ret;
+
+ return bucket.chown(op_state, marker, null_yield, dpp, err);
+
+}
+
+int RGWBucketAdminOp::check_index_olh(rgw::sal::RadosStore* store, RGWBucketAdminOpState& op_state,
+ RGWFormatterFlusher& flusher, const DoutPrefixProvider *dpp)
+{
+ RGWBucket bucket;
+ int ret = bucket.init(store, op_state, null_yield, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "bucket.init(): " << ret << dendl;
+ return ret;
+ }
+ flusher.start(0);
+ ret = bucket.check_index_olh(store, dpp, op_state, flusher);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "check_index_olh(): " << ret << dendl;
+ return ret;
+ }
+ flusher.flush();
+ return 0;
+}
+
+int RGWBucketAdminOp::check_index_unlinked(rgw::sal::RadosStore* store,
+ RGWBucketAdminOpState& op_state,
+ RGWFormatterFlusher& flusher,
+ const DoutPrefixProvider *dpp)
+{
+ flusher.start(0);
+ RGWBucket bucket;
+ int ret = bucket.init(store, op_state, null_yield, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "bucket.init(): " << ret << dendl;
+ return ret;
+ }
+ ret = bucket.check_index_unlinked(store, dpp, op_state, flusher);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "check_index_unlinked(): " << ret << dendl;
+ return ret;
+ }
+ flusher.flush();
+ return 0;
+}
+
+int RGWBucketAdminOp::check_index(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+ RGWFormatterFlusher& flusher, optional_yield y, const DoutPrefixProvider *dpp)
+{
+ int ret;
+ map<RGWObjCategory, RGWStorageStats> existing_stats;
+ map<RGWObjCategory, RGWStorageStats> calculated_stats;
+
+
+ RGWBucket bucket;
+
+ ret = bucket.init(driver, op_state, null_yield, dpp);
+ if (ret < 0)
+ return ret;
+
+ Formatter *formatter = flusher.get_formatter();
+ flusher.start(0);
+ formatter->open_object_section("bucket_check");
+
+ ret = bucket.check_bad_index_multipart(op_state, flusher, dpp);
+ if (ret < 0)
+ return ret;
+
+ if (op_state.will_check_objects()) {
+ ret = bucket.check_object_index(dpp, op_state, flusher, y);
+ if (ret < 0)
+ return ret;
+ }
+
+ ret = bucket.check_index(dpp, op_state, existing_stats, calculated_stats);
+ if (ret < 0)
+ return ret;
+
+ dump_index_check(existing_stats, calculated_stats, formatter);
+
+ formatter->close_section();
+ flusher.flush();
+
+ return 0;
+}
+
+int RGWBucketAdminOp::remove_bucket(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+ optional_yield y, const DoutPrefixProvider *dpp,
+ bool bypass_gc, bool keep_index_consistent)
+{
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+ std::unique_ptr<rgw::sal::User> user = driver->get_user(op_state.get_user_id());
+
+ int ret = driver->get_bucket(dpp, user.get(), user->get_tenant(), op_state.get_bucket_name(),
+ &bucket, y);
+ if (ret < 0)
+ return ret;
+
+ if (bypass_gc)
+ ret = bucket->remove_bucket_bypass_gc(op_state.get_max_aio(), keep_index_consistent, y, dpp);
+ else
+ ret = bucket->remove_bucket(dpp, op_state.will_delete_children(),
+ false, nullptr, y);
+
+ return ret;
+}
+
+int RGWBucketAdminOp::remove_object(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp)
+{
+ RGWBucket bucket;
+
+ int ret = bucket.init(driver, op_state, null_yield, dpp);
+ if (ret < 0)
+ return ret;
+
+ return bucket.remove_object(dpp, op_state);
+}
+
+int RGWBucketAdminOp::sync_bucket(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp, string *err_msg)
+{
+ RGWBucket bucket;
+ int ret = bucket.init(driver, op_state, null_yield, dpp, err_msg);
+ if (ret < 0)
+ {
+ return ret;
+ }
+ return bucket.sync(op_state, dpp, err_msg);
+}
+
/**
 * Load the named bucket and dump a "stats" section to the formatter:
 * identity fields (name, tenant, zonegroup, placement, id, marker),
 * versioning/object-lock/mfa flags, index version strings, mtime and
 * creation time, per-category usage, quota, and (when present) bucket
 * tags. Indexless buckets are rejected since they keep no stats.
 * Returns 0 on success or a negative error code.
 */
static int bucket_stats(rgw::sal::Driver* driver,
			const std::string& tenant_name,
			const std::string& bucket_name,
			Formatter *formatter,
			const DoutPrefixProvider *dpp)
{
  std::unique_ptr<rgw::sal::Bucket> bucket;
  map<RGWObjCategory, RGWStorageStats> stats;

  int ret = driver->get_bucket(dpp, nullptr, tenant_name, bucket_name, &bucket, null_yield);
  if (ret < 0) {
    return ret;
  }

  const RGWBucketInfo& bucket_info = bucket->get_info();

  // stats only exist for indexed layouts
  const auto& index = bucket->get_info().get_current_index();
  if (is_layout_indexless(index)) {
    cerr << "error, indexless buckets do not maintain stats; bucket=" <<
      bucket->get_name() << std::endl;
    return -EINVAL;
  }

  std::string bucket_ver, master_ver;
  std::string max_marker;
  // RGW_NO_SHARD aggregates stats across all index shards
  ret = bucket->read_stats(dpp, index, RGW_NO_SHARD, &bucket_ver, &master_ver, stats, &max_marker);
  if (ret < 0) {
    cerr << "error getting bucket stats bucket=" << bucket->get_name() << " ret=" << ret << std::endl;
    return ret;
  }

  utime_t ut(bucket->get_modification_time());
  utime_t ctime_ut(bucket->get_creation_time());

  formatter->open_object_section("stats");
  formatter->dump_string("bucket", bucket->get_name());
  formatter->dump_int("num_shards",
		      bucket->get_info().layout.current_index.layout.normal.num_shards);
  formatter->dump_string("tenant", bucket->get_tenant());
  formatter->dump_string("zonegroup", bucket->get_info().zonegroup);
  formatter->dump_string("placement_rule", bucket->get_info().placement_rule.to_str());
  ::encode_json("explicit_placement", bucket->get_key().explicit_placement, formatter);
  formatter->dump_string("id", bucket->get_bucket_id());
  formatter->dump_string("marker", bucket->get_marker());
  formatter->dump_stream("index_type") << bucket->get_info().layout.current_index.layout.type;
  formatter->dump_bool("versioned", bucket_info.versioned());
  formatter->dump_bool("versioning_enabled", bucket_info.versioning_enabled());
  formatter->dump_bool("object_lock_enabled", bucket_info.obj_lock_enabled());
  formatter->dump_bool("mfa_enabled", bucket_info.mfa_enabled());
  ::encode_json("owner", bucket->get_info().owner, formatter);
  formatter->dump_string("ver", bucket_ver);
  formatter->dump_string("master_ver", master_ver);
  ut.gmtime(formatter->dump_stream("mtime"));
  ctime_ut.gmtime(formatter->dump_stream("creation_time"));
  formatter->dump_string("max_marker", max_marker);
  dump_bucket_usage(stats, formatter);
  encode_json("bucket_quota", bucket->get_info().quota, formatter);

  // bucket tags (best-effort: a corrupt tag set is reported, not fatal)
  auto iter = bucket->get_attrs().find(RGW_ATTR_TAGS);
  if (iter != bucket->get_attrs().end()) {
    RGWObjTagSet_S3 tagset;
    bufferlist::const_iterator piter{&iter->second};
    try {
      tagset.decode(piter);
      tagset.dump(formatter);
    } catch (buffer::error& err) {
      cerr << "ERROR: caught buffer:error, couldn't decode TagSet" << std::endl;
    }
  }

  // TODO: bucket CORS
  // TODO: bucket LC
  formatter->close_section();

  return 0;
}
+
/**
 * For each user in user_ids, walk all their buckets and report how full
 * each bucket's index shards are relative to rgw_safe_max_objects_per_shard.
 * Each bucket gets a fill_status of "OK", "WARN <pct>%" (at or above
 * rgw_shard_warning_threshold) or "OVER <pct>%". With warnings_only set,
 * only WARN/OVER buckets are emitted. Per-bucket load/stat errors are
 * skipped; listing errors abort with a negative return.
 */
int RGWBucketAdminOp::limit_check(rgw::sal::Driver* driver,
				  RGWBucketAdminOpState& op_state,
				  const std::list<std::string>& user_ids,
				  RGWFormatterFlusher& flusher, optional_yield y,
				  const DoutPrefixProvider *dpp,
				  bool warnings_only)
{
  int ret = 0;
  const size_t max_entries =
    driver->ctx()->_conf->rgw_list_buckets_max_chunk;

  const size_t safe_max_objs_per_shard =
    driver->ctx()->_conf->rgw_safe_max_objects_per_shard;

  // clamp a nonsensical (>100%) warning threshold back to the default 90
  uint16_t shard_warn_pct =
    driver->ctx()->_conf->rgw_shard_warning_threshold;
  if (shard_warn_pct > 100)
    shard_warn_pct = 90;

  Formatter *formatter = flusher.get_formatter();
  flusher.start(0);

  formatter->open_array_section("users");

  for (const auto& user_id : user_ids) {

    formatter->open_object_section("user");
    formatter->dump_string("user_id", user_id);
    formatter->open_array_section("buckets");

    // paginated listing of the user's buckets, max_entries at a time
    string marker;
    rgw::sal::BucketList buckets;
    do {
      std::unique_ptr<rgw::sal::User> user = driver->get_user(rgw_user(user_id));

      ret = user->list_buckets(dpp, marker, string(), max_entries, false, buckets, y);

      if (ret < 0)
        return ret;

      map<string, std::unique_ptr<rgw::sal::Bucket>>& m_buckets = buckets.get_buckets();

      for (const auto& iter : m_buckets) {
	auto& bucket = iter.second;
	uint64_t num_objects = 0;

	marker = bucket->get_name(); /* Casey's location for marker update,
				      * as we may now not reach the end of
				      * the loop body */

	ret = bucket->load_bucket(dpp, y);
	if (ret < 0)
	  continue;

	const auto& index = bucket->get_info().get_current_index();
	if (is_layout_indexless(index)) {
	  continue; // indexless buckets don't have stats
	}

	/* need stats for num_entries */
	string bucket_ver, master_ver;
	std::map<RGWObjCategory, RGWStorageStats> stats;
	ret = bucket->read_stats(dpp, index, RGW_NO_SHARD, &bucket_ver, &master_ver, stats, nullptr);

	if (ret < 0)
	  continue;

	// total object count across all categories
	for (const auto& s : stats) {
	  num_objects += s.second.num_objects;
	}

	const uint32_t num_shards = rgw::num_shards(index.layout.normal);
	uint64_t objs_per_shard =
	  (num_shards) ? num_objects/num_shards : num_objects;
	{
	  bool warn;
	  stringstream ss;
	  uint64_t fill_pct = objs_per_shard * 100 / safe_max_objs_per_shard;
	  if (fill_pct > 100) {
	    ss << "OVER " << fill_pct << "%";
	    warn = true;
	  } else if (fill_pct >= shard_warn_pct) {
	    ss << "WARN " << fill_pct << "%";
	    warn = true;
	  } else {
	    ss << "OK";
	    warn = false;
	  }

	  if (warn || !warnings_only) {
	    formatter->open_object_section("bucket");
	    formatter->dump_string("bucket", bucket->get_name());
	    formatter->dump_string("tenant", bucket->get_tenant());
	    formatter->dump_int("num_objects", num_objects);
	    formatter->dump_int("num_shards", num_shards);
	    formatter->dump_int("objects_per_shard", objs_per_shard);
	    formatter->dump_string("fill_status", ss.str());
	    formatter->close_section();
	  }
	}
      }
      formatter->flush(cout);
    } while (buckets.is_truncated()); /* foreach: bucket */

    formatter->close_section();
    formatter->close_section();
    formatter->flush(cout);

  } /* foreach: user_id */

  formatter->close_section();
  formatter->flush(cout);

  return ret;
} /* RGWBucketAdminOp::limit_check */
+
/**
 * Dump bucket information in one of three modes:
 *  - user op: list (and optionally stat) all buckets of the named user,
 *    optionally filtered to a single bucket name;
 *  - bucket name only: stat that one bucket;
 *  - neither: enumerate every bucket from the metadata backend.
 * Stats are included when op_state.will_fetch_stats() is set.
 * Returns 0 on success, -ERR_NO_SUCH_BUCKET if the named bucket does
 * not exist, or another negative error code.
 */
int RGWBucketAdminOp::info(rgw::sal::Driver* driver,
			   RGWBucketAdminOpState& op_state,
			   RGWFormatterFlusher& flusher,
			   optional_yield y,
			   const DoutPrefixProvider *dpp)
{
  RGWBucket bucket;
  int ret = 0;
  const std::string& bucket_name = op_state.get_bucket_name();
  if (!bucket_name.empty()) {
    ret = bucket.init(driver, op_state, y, dpp);
    if (-ENOENT == ret)
      return -ERR_NO_SUCH_BUCKET;
    else if (ret < 0)
      return ret;
  }

  Formatter *formatter = flusher.get_formatter();
  flusher.start(0);

  CephContext *cct = driver->ctx();

  const size_t max_entries = cct->_conf->rgw_list_buckets_max_chunk;

  const bool show_stats = op_state.will_fetch_stats();
  const rgw_user& user_id = op_state.get_user_id();
  if (op_state.is_user_op()) {
    formatter->open_array_section("buckets");

    rgw::sal::BucketList buckets;
    std::unique_ptr<rgw::sal::User> user = driver->get_user(op_state.get_user_id());
    std::string marker;
    const std::string empty_end_marker;
    constexpr bool no_need_stats = false; // set need_stats to false

    // paginated listing; marker advances to the last bucket dumped
    do {
      ret = user->list_buckets(dpp, marker, empty_end_marker, max_entries,
			      no_need_stats, buckets, y);
      if (ret < 0) {
        return ret;
      }

      const std::string* marker_cursor = nullptr;
      map<string, std::unique_ptr<rgw::sal::Bucket>>& m = buckets.get_buckets();

      for (const auto& i : m) {
        const std::string& obj_name = i.first;
        // optional filter down to a single bucket name
        if (!bucket_name.empty() && bucket_name != obj_name) {
          continue;
        }

        if (show_stats) {
          bucket_stats(driver, user_id.tenant, obj_name, formatter, dpp);
	} else {
          formatter->dump_string("bucket", obj_name);
	}

        marker_cursor = &obj_name;
      } // for loop
      if (marker_cursor) {
	marker = *marker_cursor;
      }

      flusher.flush();
    } while (buckets.is_truncated());

    formatter->close_section();
  } else if (!bucket_name.empty()) {
    ret = bucket_stats(driver, user_id.tenant, bucket_name, formatter, dpp);
    if (ret < 0) {
      return ret;
    }
  } else {
    // no user, no bucket: walk all bucket metadata keys
    void *handle = nullptr;
    bool truncated = true;

    formatter->open_array_section("buckets");
    ret = driver->meta_list_keys_init(dpp, "bucket", string(), &handle);
    while (ret == 0 && truncated) {
      std::list<std::string> buckets;
      constexpr int max_keys = 1000;
      ret = driver->meta_list_keys_next(dpp, handle, max_keys, buckets,
					&truncated);
      for (auto& bucket_name : buckets) {
        if (show_stats) {
          bucket_stats(driver, user_id.tenant, bucket_name, formatter, dpp);
	} else {
          formatter->dump_string("bucket", bucket_name);
	}
      }
    }
    driver->meta_list_keys_complete(handle);

    formatter->close_section();
  }

  flusher.flush();

  return 0;
}
+
+int RGWBucketAdminOp::set_quota(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp)
+{
+ RGWBucket bucket;
+
+ int ret = bucket.init(driver, op_state, null_yield, dpp);
+ if (ret < 0)
+ return ret;
+ return bucket.set_quota(op_state, dpp);
+}
+
// Split a "tenant/bucket" string on the first '/' into a
// (tenant, bucket) pair; with no '/' the tenant component is empty.
inline auto split_tenant(const std::string& bucket_name){
  const auto pos = bucket_name.find('/');
  if (pos == std::string::npos) {
    return std::make_pair(std::string(), bucket_name);
  }
  return std::make_pair(bucket_name.substr(0, pos), bucket_name.substr(pos + 1));
}
+
using bucket_instance_ls = std::vector<RGWBucketInfo>;
/**
 * Given all "bucket.instance" metadata keys (lst) that belong to one
 * bucket name, collect into stale_instances the instances that are no
 * longer needed:
 *  - instances whose reshard status is DONE are always stale;
 *  - if the bucket itself no longer exists, every instance is stale;
 *  - otherwise, instances other than the current one (and the reshard
 *    target) are collected under the bucket's reshard lock, so a
 *    concurrent reshard cannot race with us.
 * If the bucket is mid-reshard (IN_PROGRESS, or the lock can't be
 * taken), only the sure-shot DONE instances are returned.
 */
void get_stale_instances(rgw::sal::Driver* driver, const std::string& bucket_name,
                         const vector<std::string>& lst,
                         bucket_instance_ls& stale_instances,
                         const DoutPrefixProvider *dpp)
{

  bucket_instance_ls other_instances;
// first iterate over the entries, and pick up the done buckets; these
// are guaranteed to be stale
  for (const auto& bucket_instance : lst){
    RGWBucketInfo binfo;
    std::unique_ptr<rgw::sal::Bucket> bucket;
    rgw_bucket rbucket;
    rgw_bucket_parse_bucket_key(driver->ctx(), bucket_instance, &rbucket, nullptr);
    int r = driver->get_bucket(dpp, nullptr, rbucket, &bucket, null_yield);
    if (r < 0){
      // this can only happen if someone deletes us right when we're processing
      ldpp_dout(dpp, -1) << "Bucket instance is invalid: " << bucket_instance
                          << cpp_strerror(-r) << dendl;
      continue;
    }
    binfo = bucket->get_info();
    if (binfo.reshard_status == cls_rgw_reshard_status::DONE)
      stale_instances.emplace_back(std::move(binfo));
    else {
      other_instances.emplace_back(std::move(binfo));
    }
  }

  // Read the cur bucket info, if the bucket doesn't exist we can simply return
  // all the instances
  auto [tenant, bname] = split_tenant(bucket_name);
  RGWBucketInfo cur_bucket_info;
  std::unique_ptr<rgw::sal::Bucket> cur_bucket;
  int r = driver->get_bucket(dpp, nullptr, tenant, bname, &cur_bucket, null_yield);
  if (r < 0) {
    if (r == -ENOENT) {
      // bucket doesn't exist, everything is stale then
      stale_instances.insert(std::end(stale_instances),
                             std::make_move_iterator(other_instances.begin()),
                             std::make_move_iterator(other_instances.end()));
    } else {
      // all bets are off if we can't read the bucket, just return the sureshot stale instances
      ldpp_dout(dpp, -1) << "error: reading bucket info for bucket: "
                          << bname << cpp_strerror(-r) << dendl;
    }
    return;
  }

  // Don't process further in this round if bucket is resharding
  cur_bucket_info = cur_bucket->get_info();
  if (cur_bucket_info.reshard_status == cls_rgw_reshard_status::IN_PROGRESS)
    return;

  // drop the live instance and the reshard target from the candidates
  other_instances.erase(std::remove_if(other_instances.begin(), other_instances.end(),
                                       [&cur_bucket_info](const RGWBucketInfo& b){
                                         return (b.bucket.bucket_id == cur_bucket_info.bucket.bucket_id ||
                                                 b.bucket.bucket_id == cur_bucket_info.new_bucket_instance_id);
                                       }),
                        other_instances.end());

  // check if there are still instances left
  if (other_instances.empty()) {
    return;
  }

  // Now we have a bucket with instances where the reshard status is none, this
  // usually happens when the reshard process couldn't complete, lockdown the
  // bucket and walk through these instances to make sure no one else interferes
  // with these
  {
    RGWBucketReshardLock reshard_lock(static_cast<rgw::sal::RadosStore*>(driver), cur_bucket->get_info(), true);
    r = reshard_lock.lock(dpp);
    if (r < 0) {
      // most likely bucket is under reshard, return the sureshot stale instances
      ldpp_dout(dpp, 5) << __func__
                             << "failed to take reshard lock; reshard underway likey" << dendl;
      return;
    }
    // release the lock on every exit path from this scope
    auto sg = make_scope_guard([&reshard_lock](){ reshard_lock.unlock();} );
    // this should be fast enough that we may not need to renew locks and check
    // exit status?, should we read the values of the instances again?
    stale_instances.insert(std::end(stale_instances),
                           std::make_move_iterator(other_instances.begin()),
                           std::make_move_iterator(other_instances.end()));
  }

  return;
}
+
+static int process_stale_instances(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+ RGWFormatterFlusher& flusher,
+ const DoutPrefixProvider *dpp,
+ std::function<void(const bucket_instance_ls&,
+ Formatter *,
+ rgw::sal::Driver*)> process_f)
+{
+ std::string marker;
+ void *handle;
+ Formatter *formatter = flusher.get_formatter();
+ static constexpr auto default_max_keys = 1000;
+
+ int ret = driver->meta_list_keys_init(dpp, "bucket.instance", marker, &handle);
+ if (ret < 0) {
+ cerr << "ERROR: can't get key: " << cpp_strerror(-ret) << std::endl;
+ return ret;
+ }
+
+ bool truncated;
+
+ formatter->open_array_section("keys");
+ auto g = make_scope_guard([&driver, &handle, &formatter]() {
+ driver->meta_list_keys_complete(handle);
+ formatter->close_section(); // keys
+ formatter->flush(cout);
+ });
+
+ do {
+ list<std::string> keys;
+
+ ret = driver->meta_list_keys_next(dpp, handle, default_max_keys, keys, &truncated);
+ if (ret < 0 && ret != -ENOENT) {
+ cerr << "ERROR: lists_keys_next(): " << cpp_strerror(-ret) << std::endl;
+ return ret;
+ } if (ret != -ENOENT) {
+ // partition the list of buckets by buckets as the listing is un sorted,
+ // since it would minimize the reads to bucket_info
+ std::unordered_map<std::string, std::vector<std::string>> bucket_instance_map;
+ for (auto &key: keys) {
+ auto pos = key.find(':');
+ if(pos != std::string::npos)
+ bucket_instance_map[key.substr(0,pos)].emplace_back(std::move(key));
+ }
+ for (const auto& kv: bucket_instance_map) {
+ bucket_instance_ls stale_lst;
+ get_stale_instances(driver, kv.first, kv.second, stale_lst, dpp);
+ process_f(stale_lst, formatter, driver);
+ }
+ }
+ } while (truncated);
+
+ return 0;
+}
+
+int RGWBucketAdminOp::list_stale_instances(rgw::sal::Driver* driver,
+ RGWBucketAdminOpState& op_state,
+ RGWFormatterFlusher& flusher,
+ const DoutPrefixProvider *dpp)
+{
+ auto process_f = [](const bucket_instance_ls& lst,
+ Formatter *formatter,
+ rgw::sal::Driver*){
+ for (const auto& binfo: lst)
+ formatter->dump_string("key", binfo.bucket.get_key());
+ };
+ return process_stale_instances(driver, op_state, flusher, dpp, process_f);
+}
+
+
+int RGWBucketAdminOp::clear_stale_instances(rgw::sal::Driver* driver,
+ RGWBucketAdminOpState& op_state,
+ RGWFormatterFlusher& flusher,
+ const DoutPrefixProvider *dpp)
+{
+ auto process_f = [dpp](const bucket_instance_ls& lst,
+ Formatter *formatter,
+ rgw::sal::Driver* driver){
+ for (const auto &binfo: lst) {
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+ driver->get_bucket(nullptr, binfo, &bucket);
+ int ret = bucket->purge_instance(dpp);
+ if (ret == 0){
+ auto md_key = "bucket.instance:" + binfo.bucket.get_key();
+ ret = driver->meta_remove(dpp, md_key, null_yield);
+ }
+ formatter->open_object_section("delete_status");
+ formatter->dump_string("bucket_instance", binfo.bucket.get_key());
+ formatter->dump_int("status", -ret);
+ formatter->close_section();
+ }
+ };
+
+ return process_stale_instances(driver, op_state, flusher, dpp, process_f);
+}
+
+// Load one bucket and repair its lifecycle shard entry (re-registering the
+// bucket with the LC subsystem if its entry is missing or misplaced).
+// Returns 0 on success or a negative errno.
+static int fix_single_bucket_lc(rgw::sal::Driver* driver,
+                                const std::string& tenant_name,
+                                const std::string& bucket_name,
+                                const DoutPrefixProvider *dpp)
+{
+  std::unique_ptr<rgw::sal::Bucket> bucket;
+  int ret = driver->get_bucket(dpp, nullptr, tenant_name, bucket_name, &bucket, null_yield);
+  if (ret < 0) {
+    // TODO: Should we handle the case where the bucket could've been removed between
+    // listing and fetching?
+    return ret;
+  }
+
+  return rgw::lc::fix_lc_shard_entry(dpp, driver, driver->get_rgwlc()->get_lc(), bucket.get());
+}
+
+// Emit a single "bucket_entry" JSON object describing the LC-fix outcome for
+// one bucket: the (optionally tenant-qualified) bucket name and a status code.
+static void format_lc_status(Formatter* formatter,
+                             const std::string& tenant_name,
+                             const std::string& bucket_name,
+                             int status)
+{
+  // Qualify the bucket with its tenant ("tenant/bucket") unless untenanted.
+  std::string entry;
+  if (tenant_name.empty()) {
+    entry = bucket_name;
+  } else {
+    entry = tenant_name + "/" + bucket_name;
+  }
+  formatter->open_object_section("bucket_entry");
+  formatter->dump_string("bucket", entry);
+  formatter->dump_int("status", status);
+  formatter->close_section(); // bucket_entry
+}
+
+// Fix the LC shard entry for one bucket and report the result through the
+// formatter; errors are surfaced as a positive errno status, never returned.
+static void process_single_lc_entry(rgw::sal::Driver* driver,
+                                    Formatter *formatter,
+                                    const std::string& tenant_name,
+                                    const std::string& bucket_name,
+                                    const DoutPrefixProvider *dpp)
+{
+  int ret = fix_single_bucket_lc(driver, tenant_name, bucket_name, dpp);
+  format_lc_status(formatter, tenant_name, bucket_name, -ret);
+}
+
+// Repair lifecycle shard entries either for one named bucket (when op_state
+// carries a bucket name) or, otherwise, for every bucket found by walking the
+// "bucket" metadata section. Results are streamed to stdout via the formatter.
+int RGWBucketAdminOp::fix_lc_shards(rgw::sal::Driver* driver,
+                                    RGWBucketAdminOpState& op_state,
+                                    RGWFormatterFlusher& flusher,
+                                    const DoutPrefixProvider *dpp)
+{
+  std::string marker;
+  void *handle;
+  Formatter *formatter = flusher.get_formatter();
+  static constexpr auto default_max_keys = 1000;
+
+  bool truncated;
+  if (const std::string& bucket_name = op_state.get_bucket_name();
+      ! bucket_name.empty()) {
+    // single-bucket mode: tenant comes from the requesting user
+    const rgw_user user_id = op_state.get_user_id();
+    process_single_lc_entry(driver, formatter, user_id.tenant, bucket_name, dpp);
+    formatter->flush(cout);
+  } else {
+    int ret = driver->meta_list_keys_init(dpp, "bucket", marker, &handle);
+    if (ret < 0) {
+      std::cerr << "ERROR: can't get key: " << cpp_strerror(-ret) << std::endl;
+      return ret;
+    }
+
+    {
+      formatter->open_array_section("lc_fix_status");
+      // scope guard closes the listing handle and the JSON array even on the
+      // early-return error paths inside the loop below
+      auto sg = make_scope_guard([&driver, &handle, &formatter](){
+                driver->meta_list_keys_complete(handle);
+                formatter->close_section(); // lc_fix_status
+                formatter->flush(cout);
+              });
+      do {
+        list<std::string> keys;
+        ret = driver->meta_list_keys_next(dpp, handle, default_max_keys, keys, &truncated);
+        if (ret < 0 && ret != -ENOENT) {
+          std::cerr << "ERROR: lists_keys_next(): " << cpp_strerror(-ret) << std::endl;
+          return ret;
+        } if (ret != -ENOENT) {
+          // metadata keys are "tenant/bucket" (or bare bucket names)
+          for (const auto &key:keys) {
+            auto [tenant_name, bucket_name] = split_tenant(key);
+            process_single_lc_entry(driver, formatter, tenant_name, bucket_name, dpp);
+          }
+        }
+        formatter->flush(cout); // regularly flush every 1k entries
+      } while (truncated);
+    }
+
+  }
+  return 0;
+
+}
+
+// Return true iff the object carries a RGW_ATTR_DELETE_AT attribute whose
+// timestamp is non-zero and already in the past; on success delete_at is
+// filled with that timestamp. Any read/decode failure is treated as
+// "not expired" so processing can proceed.
+static bool has_object_expired(const DoutPrefixProvider *dpp,
+                               rgw::sal::Driver* driver,
+                               rgw::sal::Bucket* bucket,
+                               const rgw_obj_key& key, utime_t& delete_at)
+{
+  std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(key);
+  bufferlist delete_at_bl;
+
+  int ret = rgw_object_get_attr(dpp, driver, obj.get(), RGW_ATTR_DELETE_AT, delete_at_bl, null_yield);
+  if (ret < 0) {
+    return false; // no delete at attr, proceed
+  }
+
+  ret = decode_bl(delete_at_bl, delete_at);
+  if (ret < 0) {
+    return false; // failed to parse
+  }
+
+  // a zero timestamp means "no expiry", not "expired at epoch"
+  if (delete_at <= ceph_clock_now() && !delete_at.is_zero()) {
+    return true;
+  }
+
+  return false;
+}
+
+// Scan a (resharded) bucket for objects whose delete-at time has passed and
+// delete them (unless dry_run), streaming a per-object report. Buckets whose
+// bucket_id equals their marker have never been resharded and are skipped,
+// since only resharding could have orphaned expiry hints.
+static int fix_bucket_obj_expiry(const DoutPrefixProvider *dpp,
+                                 rgw::sal::Driver* driver,
+                                 rgw::sal::Bucket* bucket,
+                                 RGWFormatterFlusher& flusher, bool dry_run)
+{
+  if (bucket->get_key().bucket_id == bucket->get_key().marker) {
+    ldpp_dout(dpp, -1) << "Not a resharded bucket skipping" << dendl;
+    return 0; // not a resharded bucket, move along
+  }
+
+  Formatter *formatter = flusher.get_formatter();
+  formatter->open_array_section("expired_deletion_status");
+  // guard closes the array and flushes even on the error return inside the loop
+  auto sg = make_scope_guard([&formatter] {
+                               formatter->close_section();
+                               formatter->flush(std::cout);
+                             });
+
+  rgw::sal::Bucket::ListParams params;
+  rgw::sal::Bucket::ListResults results;
+
+  // list all versions on versioned buckets; unordered listing is cheaper and
+  // order is irrelevant here
+  params.list_versions = bucket->versioned();
+  params.allow_unordered = true;
+
+  do {
+    int ret = bucket->list(dpp, params, listing_max_entries, results, null_yield);
+    if (ret < 0) {
+      ldpp_dout(dpp, -1) << "ERROR failed to list objects in the bucket" << dendl;
+      return ret;
+    }
+    for (const auto& obj : results.objs) {
+      rgw_obj_key key(obj.key);
+      utime_t delete_at;
+      if (has_object_expired(dpp, driver, bucket, key, delete_at)) {
+        formatter->open_object_section("object_status");
+        formatter->dump_string("object", key.name);
+        formatter->dump_stream("delete_at") << delete_at;
+
+        if (!dry_run) {
+          ret = rgw_remove_object(dpp, driver, bucket, key);
+          formatter->dump_int("status", ret);
+        }
+
+        formatter->close_section(); // object_status
+      }
+    }
+    formatter->flush(cout); // regularly flush every 1k entries
+  } while (results.is_truncated);
+
+  return 0;
+}
+
+// Admin entry point for repairing object expiry on a single bucket named in
+// op_state: resolves the bucket, then delegates to fix_bucket_obj_expiry().
+int RGWBucketAdminOp::fix_obj_expiry(rgw::sal::Driver* driver,
+                                     RGWBucketAdminOpState& op_state,
+                                     RGWFormatterFlusher& flusher,
+                                     const DoutPrefixProvider *dpp, bool dry_run)
+{
+  RGWBucket admin_bucket;
+  int ret = admin_bucket.init(driver, op_state, null_yield, dpp);
+  if (ret < 0) {
+    ldpp_dout(dpp, -1) << "failed to initialize bucket" << dendl;
+    return ret;
+  }
+  std::unique_ptr<rgw::sal::Bucket> bucket;
+  ret = driver->get_bucket(nullptr, admin_bucket.get_bucket_info(), &bucket);
+  if (ret < 0) {
+    return ret;
+  }
+
+  return fix_bucket_obj_expiry(dpp, driver, bucket.get(), flusher, dry_run);
+}
+
+// JSON-serialize bucket info together with its xattrs.
+void RGWBucketCompleteInfo::dump(Formatter *f) const {
+  encode_json("bucket_info", info, f);
+  encode_json("attrs", attrs, f);
+}
+
+// Inverse of dump(): populate bucket info and xattrs from JSON.
+void RGWBucketCompleteInfo::decode_json(JSONObj *obj) {
+  JSONDecoder::decode_json("bucket_info", info, obj);
+  JSONDecoder::decode_json("attrs", attrs, obj);
+}
+
+// Metadata handler for the "bucket" section: bucket *entrypoint* objects
+// (the tenant/name -> instance mapping plus owner/creation info). Backed by
+// RGWSI_Bucket for storage and RGWBucketCtl for user link/unlink bookkeeping.
+class RGWBucketMetadataHandler : public RGWBucketMetadataHandlerBase {
+public:
+  struct Svc {
+    RGWSI_Bucket *bucket{nullptr};
+  } svc;
+
+  struct Ctl {
+    RGWBucketCtl *bucket{nullptr};
+  } ctl;
+
+  RGWBucketMetadataHandler() {}
+
+  void init(RGWSI_Bucket *bucket_svc,
+            RGWBucketCtl *bucket_ctl) override {
+    base_init(bucket_svc->ctx(),
+              bucket_svc->get_ep_be_handler().get());
+    svc.bucket = bucket_svc;
+    ctl.bucket = bucket_ctl;
+  }
+
+  string get_type() override { return "bucket"; }
+
+  // Build a metadata object from admin-supplied JSON; returns nullptr on
+  // decode failure.
+  RGWMetadataObject *get_meta_obj(JSONObj *jo, const obj_version& objv, const ceph::real_time& mtime) override {
+    RGWBucketEntryPoint be;
+
+    try {
+      decode_json_obj(be, jo);
+    } catch (JSONDecoder::err& e) {
+      return nullptr;
+    }
+
+    return new RGWBucketEntryMetadataObject(be, objv, mtime);
+  }
+
+  // Read the entrypoint for 'entry' and wrap it (with version/mtime/attrs)
+  // in a heap-allocated metadata object owned by the caller.
+  int do_get(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWMetadataObject **obj, optional_yield y, const DoutPrefixProvider *dpp) override {
+    RGWObjVersionTracker ot;
+    RGWBucketEntryPoint be;
+
+    real_time mtime;
+    map<string, bufferlist> attrs;
+
+    RGWSI_Bucket_EP_Ctx ctx(op->ctx());
+
+    int ret = svc.bucket->read_bucket_entrypoint_info(ctx, entry, &be, &ot, &mtime, &attrs, y, dpp);
+    if (ret < 0)
+      return ret;
+
+    RGWBucketEntryMetadataObject *mdo = new RGWBucketEntryMetadataObject(be, ot.read_version, mtime, std::move(attrs));
+
+    *obj = mdo;
+
+    return 0;
+  }
+
+  int do_put(RGWSI_MetaBackend_Handler::Op *op, string& entry,
+             RGWMetadataObject *obj,
+             RGWObjVersionTracker& objv_tracker,
+             optional_yield y,
+             const DoutPrefixProvider *dpp,
+             RGWMDLogSyncType type, bool from_remote_zone) override;
+
+  // Unlink the bucket from its owner and delete the entrypoint. Failures in
+  // either step are logged but not returned, so removal stays idempotent.
+  int do_remove(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWObjVersionTracker& objv_tracker,
+                optional_yield y, const DoutPrefixProvider *dpp) override {
+    RGWBucketEntryPoint be;
+
+    real_time orig_mtime;
+
+    RGWSI_Bucket_EP_Ctx ctx(op->ctx());
+
+    int ret = svc.bucket->read_bucket_entrypoint_info(ctx, entry, &be, &objv_tracker, &orig_mtime, nullptr, y, dpp);
+    if (ret < 0)
+      return ret;
+
+    /*
+     * We're unlinking the bucket but we don't want to update the entrypoint here - we're removing
+     * it immediately and don't want to invalidate our cached objv_version or the bucket obj removal
+     * will incorrectly fail.
+     */
+    ret = ctl.bucket->unlink_bucket(be.owner, be.bucket, y, dpp, false);
+    if (ret < 0) {
+      ldpp_dout(dpp, -1) << "could not unlink bucket=" << entry << " owner=" << be.owner << dendl;
+    }
+
+    ret = svc.bucket->remove_bucket_entrypoint_info(ctx, entry, &objv_tracker, y, dpp);
+    if (ret < 0) {
+      ldpp_dout(dpp, -1) << "could not delete bucket=" << entry << dendl;
+    }
+    /* idempotent */
+    return 0;
+  }
+
+  // Run f inside a backend op context (entrypoint flavor).
+  int call(std::function<int(RGWSI_Bucket_EP_Ctx& ctx)> f) {
+    return call(nullopt, f);
+  }
+
+  int call(std::optional<RGWSI_MetaBackend_CtxParams> bectx_params,
+           std::function<int(RGWSI_Bucket_EP_Ctx& ctx)> f) {
+    return be_handler->call(bectx_params, [&](RGWSI_MetaBackend_Handler::Op *op) {
+      RGWSI_Bucket_EP_Ctx ctx(op->ctx());
+      return f(ctx);
+    });
+  }
+};
+
+// Put operation for bucket entrypoint metadata: encodes the entrypoint for
+// storage (encode_obj) and performs store + link/unlink via put_checked() /
+// put_post() defined below.
+class RGWMetadataHandlerPut_Bucket : public RGWMetadataHandlerPut_SObj
+{
+  RGWBucketMetadataHandler *bhandler;
+  RGWBucketEntryMetadataObject *obj;
+public:
+  RGWMetadataHandlerPut_Bucket(RGWBucketMetadataHandler *_handler,
+                               RGWSI_MetaBackend_Handler::Op *op, string& entry,
+                               RGWMetadataObject *_obj, RGWObjVersionTracker& objv_tracker,
+                               optional_yield y,
+                               RGWMDLogSyncType type, bool from_remote_zone)
+      // Pass the constructor argument _obj to the base class. The previous
+      // code passed the member 'obj', which is not yet initialized at that
+      // point, handing the base an indeterminate pointer (the sibling
+      // RGWMetadataHandlerPut_BucketInstance already passes _obj correctly).
+      : RGWMetadataHandlerPut_SObj(_handler, op, entry, _obj, objv_tracker, y, type, from_remote_zone),
+        bhandler(_handler),
+        obj(static_cast<RGWBucketEntryMetadataObject *>(_obj)) {}
+  ~RGWMetadataHandlerPut_Bucket() {}
+
+  // Serialize the entrypoint into bl for the backend store.
+  void encode_obj(bufferlist *bl) override {
+    obj->get_ep().encode(*bl);
+  }
+
+  int put_checked(const DoutPrefixProvider *dpp) override;
+  int put_post(const DoutPrefixProvider *dpp) override;
+};
+
+// Apply a metadata put on a bucket entrypoint by delegating to the generic
+// put state machine with a bucket-specific put operation.
+int RGWBucketMetadataHandler::do_put(RGWSI_MetaBackend_Handler::Op *op, string& entry,
+                                     RGWMetadataObject *obj,
+                                     RGWObjVersionTracker& objv_tracker,
+                                     optional_yield y,
+                                     const DoutPrefixProvider *dpp,
+                                     RGWMDLogSyncType type, bool from_remote_zone)
+{
+  RGWMetadataHandlerPut_Bucket put_op(this, op, entry, obj, objv_tracker, y, type, from_remote_zone);
+  return do_put_operate(&put_op, dpp);
+}
+
+// Store the (already version-checked) entrypoint. If a previous version of
+// the object exists, carry its attrs over so they are preserved on write.
+int RGWMetadataHandlerPut_Bucket::put_checked(const DoutPrefixProvider *dpp)
+{
+  RGWBucketEntryMetadataObject *orig_obj = static_cast<RGWBucketEntryMetadataObject *>(old_obj);
+
+  if (orig_obj) {
+    obj->set_pattrs(&orig_obj->get_attrs());
+  }
+
+  auto& be = obj->get_ep();
+  auto mtime = obj->get_mtime();
+  auto pattrs = obj->get_pattrs();
+
+  RGWSI_Bucket_EP_Ctx ctx(op->ctx());
+
+  return bhandler->svc.bucket->store_bucket_entrypoint_info(ctx, entry,
+                                                           be,
+                                                           false,
+                                                           mtime,
+                                                           pattrs,
+                                                           &objv_tracker,
+                                                           y,
+                                                           dpp);
+}
+
+// After the entrypoint is stored, bring the owner's bucket list in sync:
+// link the bucket to its owner when the entrypoint says it is linked,
+// otherwise unlink it. Neither path updates the entrypoint again.
+int RGWMetadataHandlerPut_Bucket::put_post(const DoutPrefixProvider *dpp)
+{
+  auto& be = obj->get_ep();
+
+  int ret;
+
+  /* link bucket */
+  if (be.linked) {
+    ret = bhandler->ctl.bucket->link_bucket(be.owner, be.bucket, be.creation_time, y, dpp, false);
+  } else {
+    ret = bhandler->ctl.bucket->unlink_bucket(be.owner, be.bucket, y, dpp, false);
+  }
+
+  return ret;
+}
+
+// Compute the hex MD5 digest of the JSON encoding of a bucket entrypoint;
+// used by the archive zone to derive a unique "-deleted-" bucket name.
+static void get_md5_digest(const RGWBucketEntryPoint *be, string& md5_digest) {
+
+  char md5[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
+  unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE];
+  bufferlist bl;
+
+  // Stack-allocated formatter (RAII) instead of naked new/delete: the old
+  // code leaked the formatter if dump()/flush() threw before the delete.
+  JSONFormatter f(false);
+  be->dump(&f);
+  f.flush(bl);
+
+  MD5 hash;
+  // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
+  hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+  hash.Update((const unsigned char *)bl.c_str(), bl.length());
+  hash.Final(m);
+
+  buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, md5);
+
+  md5_digest = md5;
+}
+
+#define ARCHIVE_META_ATTR RGW_ATTR_PREFIX "zone.archive.info"
+
+// Sidecar info stored in a bucket-instance xattr on the archive zone,
+// remembering the original bucket identity before the rename-on-delete.
+struct archive_meta_info {
+  rgw_bucket orig_bucket;
+
+  // Load from the ARCHIVE_META_ATTR xattr; returns false if absent or
+  // undecodable (a decode failure is logged).
+  bool from_attrs(CephContext *cct, map<string, bufferlist>& attrs) {
+    auto iter = attrs.find(ARCHIVE_META_ATTR);
+    if (iter == attrs.end()) {
+      return false;
+    }
+
+    auto bliter = iter->second.cbegin();
+    try {
+      decode(bliter);
+    } catch (buffer::error& err) {
+      ldout(cct, 0) << "ERROR: failed to decode archive meta info" << dendl;
+      return false;
+    }
+
+    return true;
+  }
+
+  // Serialize into the ARCHIVE_META_ATTR slot of an attr map.
+  void store_in_attrs(map<string, bufferlist>& attrs) const {
+    encode(attrs[ARCHIVE_META_ATTR]);
+  }
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(orig_bucket, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(orig_bucket, bl);
+    DECODE_FINISH(bl);
+  }
+};
+WRITE_CLASS_ENCODER(archive_meta_info)
+
+// Archive-zone variant of the bucket entrypoint handler. Instead of deleting
+// a bucket, do_remove() renames it to "<orig>-deleted-<md5>" so its data is
+// retained; do_put() first clears any previously-archived copy of the same
+// entry so the sync can recreate it.
+class RGWArchiveBucketMetadataHandler : public RGWBucketMetadataHandler {
+public:
+  RGWArchiveBucketMetadataHandler() {}
+
+  int do_remove(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWObjVersionTracker& objv_tracker,
+                optional_yield y, const DoutPrefixProvider *dpp) override {
+    auto cct = svc.bucket->ctx();
+
+    RGWSI_Bucket_EP_Ctx ctx(op->ctx());
+
+    ldpp_dout(dpp, 5) << "SKIP: bucket removal is not allowed on archive zone: bucket:" << entry << " ... proceeding to rename" << dendl;
+
+    string tenant_name, bucket_name;
+    parse_bucket(entry, &tenant_name, &bucket_name);
+    rgw_bucket entry_bucket;
+    entry_bucket.tenant = tenant_name;
+    entry_bucket.name = bucket_name;
+
+    real_time mtime;
+
+    /* read original entrypoint */
+
+    RGWBucketEntryPoint be;
+    map<string, bufferlist> attrs;
+    int ret = svc.bucket->read_bucket_entrypoint_info(ctx, entry, &be, &objv_tracker, &mtime, &attrs, y, dpp);
+    if (ret < 0) {
+      return ret;
+    }
+
+    string bi_meta_name = RGWSI_Bucket::get_bi_meta_key(be.bucket);
+
+    /* read original bucket instance info */
+
+    map<string, bufferlist> attrs_m;
+    ceph::real_time orig_mtime;
+    RGWBucketInfo old_bi;
+
+    ret = ctl.bucket->read_bucket_instance_info(be.bucket, &old_bi, y, dpp, RGWBucketCtl::BucketInstance::GetParams()
+                                                                    .set_mtime(&orig_mtime)
+                                                                    .set_attrs(&attrs_m));
+    if (ret < 0) {
+      return ret;
+    }
+
+    archive_meta_info ami;
+
+    // remember the very first (pre-archive) bucket identity; if the attr is
+    // already present this bucket was archived before and we keep it as-is
+    if (!ami.from_attrs(svc.bucket->ctx(), attrs_m)) {
+      ami.orig_bucket = old_bi.bucket;
+      ami.store_in_attrs(attrs_m);
+    }
+
+    /* generate a new bucket instance. We could have avoided this if we could just point a new
+     * bucket entry point to the old bucket instance, however, due to limitation in the way
+     * we index buckets under the user, bucket entrypoint and bucket instance of the same
+     * bucket need to have the same name, so we need to copy the old bucket instance into
+     * to a new entry with the new name
+     */
+
+    string new_bucket_name;
+
+    RGWBucketInfo new_bi = old_bi;
+    RGWBucketEntryPoint new_be = be;
+
+    string md5_digest;
+
+    // digest of the entrypoint makes the archived name unique per deletion
+    get_md5_digest(&new_be, md5_digest);
+    new_bucket_name = ami.orig_bucket.name + "-deleted-" + md5_digest;
+
+    new_bi.bucket.name = new_bucket_name;
+    new_bi.objv_tracker.clear();
+
+    new_be.bucket.name = new_bucket_name;
+
+    ret = ctl.bucket->store_bucket_instance_info(new_be.bucket, new_bi, y, dpp, RGWBucketCtl::BucketInstance::PutParams()
+                                                                    .set_exclusive(false)
+                                                                    .set_mtime(orig_mtime)
+                                                                    .set_attrs(&attrs_m)
+                                                                    .set_orig_info(&old_bi));
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to put new bucket instance info for bucket=" << new_bi.bucket << " ret=" << ret << dendl;
+      return ret;
+    }
+
+    /* store a new entrypoint */
+
+    RGWObjVersionTracker ot;
+    ot.generate_new_write_ver(cct);
+
+    ret = svc.bucket->store_bucket_entrypoint_info(ctx, RGWSI_Bucket::get_entrypoint_meta_key(new_be.bucket),
+                                                   new_be, true, mtime, &attrs, nullptr, y, dpp);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to put new bucket entrypoint for bucket=" << new_be.bucket << " ret=" << ret << dendl;
+      return ret;
+    }
+
+    /* link new bucket */
+
+    ret = ctl.bucket->link_bucket(new_be.owner, new_be.bucket, new_be.creation_time, y, dpp, false);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to link new bucket for bucket=" << new_be.bucket << " ret=" << ret << dendl;
+      return ret;
+    }
+
+    /* clean up old stuff */
+
+    ret = ctl.bucket->unlink_bucket(be.owner, entry_bucket, y, dpp, false);
+    if (ret < 0) {
+      ldpp_dout(dpp, -1) << "could not unlink bucket=" << entry << " owner=" << be.owner << dendl;
+    }
+
+    // if (ret == -ECANCELED) it means that there was a race here, and someone
+    // wrote to the bucket entrypoint just before we removed it. The question is
+    // whether it was a newly created bucket entrypoint ... in which case we
+    // should ignore the error and move forward, or whether it is a higher version
+    // of the same bucket instance ... in which we should retry
+    ret = svc.bucket->remove_bucket_entrypoint_info(ctx,
+                                                    RGWSI_Bucket::get_entrypoint_meta_key(be.bucket),
+                                                    &objv_tracker,
+                                                    y,
+                                                    dpp);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to put new bucket entrypoint for bucket=" << new_be.bucket << " ret=" << ret << dendl;
+      return ret;
+    }
+
+    ret = ctl.bucket->remove_bucket_instance_info(be.bucket, old_bi, y, dpp);
+    if (ret < 0) {
+      ldpp_dout(dpp, -1) << "could not delete bucket=" << entry << dendl;
+    }
+
+
+    /* idempotent */
+
+    return 0;
+  }
+
+  // On put of an entry that itself carries the "-deleted-" marker (i.e. a
+  // previously archived bucket being synced again), first remove/re-archive
+  // the stale copy, then fall through to the normal put.
+  int do_put(RGWSI_MetaBackend_Handler::Op *op, string& entry,
+             RGWMetadataObject *obj,
+             RGWObjVersionTracker& objv_tracker,
+             optional_yield y, const DoutPrefixProvider *dpp,
+             RGWMDLogSyncType type, bool from_remote_zone) override {
+    if (entry.find("-deleted-") != string::npos) {
+      RGWObjVersionTracker ot;
+      RGWMetadataObject *robj;
+      int ret = do_get(op, entry, &robj, y, dpp);
+      if (ret != -ENOENT) {
+        if (ret < 0) {
+          return ret;
+        }
+        ot.read_version = robj->get_version();
+        delete robj;
+
+        ret = do_remove(op, entry, ot, y, dpp);
+        if (ret < 0) {
+          return ret;
+        }
+      }
+    }
+
+    return RGWBucketMetadataHandler::do_put(op, entry, obj,
+                                            objv_tracker, y, dpp, type, from_remote_zone);
+  }
+
+};
+
+// Metadata handler for the "bucket.instance" section: the per-instance
+// bucket info (layout, placement, flags) plus its xattrs, stored/loaded via
+// RGWSI_Bucket with the bucket-instance backend.
+class RGWBucketInstanceMetadataHandler : public RGWBucketInstanceMetadataHandlerBase {
+  // Read instance info + attrs for 'entry' into *bi (helper for do_remove).
+  int read_bucket_instance_entry(RGWSI_Bucket_BI_Ctx& ctx,
+                                 const string& entry,
+                                 RGWBucketCompleteInfo *bi,
+                                 ceph::real_time *pmtime,
+                                 optional_yield y,
+                                 const DoutPrefixProvider *dpp) {
+    return svc.bucket->read_bucket_instance_info(ctx,
+                                                 entry,
+                                                 &bi->info,
+                                                 pmtime, &bi->attrs,
+                                                 y,
+                                                 dpp);
+  }
+
+public:
+  struct Svc {
+    RGWSI_Zone *zone{nullptr};
+    RGWSI_Bucket *bucket{nullptr};
+    RGWSI_BucketIndex *bi{nullptr};
+  } svc;
+
+  rgw::sal::Driver* driver;
+
+  RGWBucketInstanceMetadataHandler(rgw::sal::Driver* driver)
+    : driver(driver) {}
+
+  void init(RGWSI_Zone *zone_svc,
+	    RGWSI_Bucket *bucket_svc,
+	    RGWSI_BucketIndex *bi_svc) override {
+    base_init(bucket_svc->ctx(),
+              bucket_svc->get_bi_be_handler().get());
+    svc.zone = zone_svc;
+    svc.bucket = bucket_svc;
+    svc.bi = bi_svc;
+  }
+
+  string get_type() override { return "bucket.instance"; }
+
+  // Build a metadata object from admin-supplied JSON; nullptr on decode error.
+  RGWMetadataObject *get_meta_obj(JSONObj *jo, const obj_version& objv, const ceph::real_time& mtime) override {
+    RGWBucketCompleteInfo bci;
+
+    try {
+      decode_json_obj(bci, jo);
+    } catch (JSONDecoder::err& e) {
+      return nullptr;
+    }
+
+    return new RGWBucketInstanceMetadataObject(bci, objv, mtime);
+  }
+
+  // Read the instance info for 'entry' into a caller-owned metadata object;
+  // the version reported is the instance's own objv, not a separate tracker.
+  int do_get(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWMetadataObject **obj, optional_yield y, const DoutPrefixProvider *dpp) override {
+    RGWBucketCompleteInfo bci;
+    real_time mtime;
+
+    RGWSI_Bucket_BI_Ctx ctx(op->ctx());
+
+    int ret = svc.bucket->read_bucket_instance_info(ctx, entry, &bci.info, &mtime, &bci.attrs, y, dpp);
+    if (ret < 0)
+      return ret;
+
+    RGWBucketInstanceMetadataObject *mdo = new RGWBucketInstanceMetadataObject(bci, bci.info.objv_tracker.read_version, mtime);
+
+    *obj = mdo;
+
+    return 0;
+  }
+
+  int do_put(RGWSI_MetaBackend_Handler::Op *op, string& entry,
+             RGWMetadataObject *_obj, RGWObjVersionTracker& objv_tracker,
+             optional_yield y, const DoutPrefixProvider *dpp,
+             RGWMDLogSyncType sync_type, bool from_remote_zone) override;
+
+  // Remove the instance entry; a missing entry (-ENOENT on read) still
+  // proceeds to the backend remove with default-constructed info.
+  int do_remove(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWObjVersionTracker& objv_tracker,
+                optional_yield y, const DoutPrefixProvider *dpp) override {
+    RGWBucketCompleteInfo bci;
+
+    RGWSI_Bucket_BI_Ctx ctx(op->ctx());
+
+    int ret = read_bucket_instance_entry(ctx, entry, &bci, nullptr, y, dpp);
+    if (ret < 0 && ret != -ENOENT)
+      return ret;
+
+    return svc.bucket->remove_bucket_instance_info(ctx, entry, bci.info, &bci.info.objv_tracker, y, dpp);
+  }
+
+  // Run f inside a backend op context (bucket-instance flavor).
+  int call(std::function<int(RGWSI_Bucket_BI_Ctx& ctx)> f) {
+    return call(nullopt, f);
+  }
+
+  int call(std::optional<RGWSI_MetaBackend_CtxParams> bectx_params,
+           std::function<int(RGWSI_Bucket_BI_Ctx& ctx)> f) {
+    return be_handler->call(bectx_params, [&](RGWSI_MetaBackend_Handler::Op *op) {
+      RGWSI_Bucket_BI_Ctx ctx(op->ctx());
+      return f(ctx);
+    });
+  }
+};
+
+// Put operation for bucket-instance metadata; checks/adjusts layout and
+// placement (put_check), stores the info (put_checked), and re-initializes
+// the index and LC config afterwards (put_post).
+class RGWMetadataHandlerPut_BucketInstance : public RGWMetadataHandlerPut_SObj
+{
+  CephContext *cct;
+  RGWBucketInstanceMetadataHandler *bihandler;
+  RGWBucketInstanceMetadataObject *obj;
+public:
+  RGWMetadataHandlerPut_BucketInstance(CephContext *_cct,
+                                       RGWBucketInstanceMetadataHandler *_handler,
+                                       RGWSI_MetaBackend_Handler::Op *_op, string& entry,
+                                       RGWMetadataObject *_obj, RGWObjVersionTracker& objv_tracker,
+                                       optional_yield y,
+                                       RGWMDLogSyncType type, bool from_remote_zone) : RGWMetadataHandlerPut_SObj(_handler, _op, entry, _obj, objv_tracker, y, type, from_remote_zone),
+                                                                                       cct(_cct), bihandler(_handler) {
+    obj = static_cast<RGWBucketInstanceMetadataObject *>(_obj);
+
+    // write the object's own attrs back out alongside the instance info
+    auto& bci = obj->get_bci();
+    obj->set_pattrs(&bci.attrs);
+  }
+
+  // Serialize the bucket instance info for the backend store.
+  void encode_obj(bufferlist *bl) override {
+    obj->get_bucket_info().encode(*bl);
+  }
+
+  int put_check(const DoutPrefixProvider *dpp) override;
+  int put_checked(const DoutPrefixProvider *dpp) override;
+  int put_post(const DoutPrefixProvider *dpp) override;
+};
+
+// Apply a metadata put on a bucket instance by delegating to the generic put
+// state machine with an instance-specific put operation.
+int RGWBucketInstanceMetadataHandler::do_put(RGWSI_MetaBackend_Handler::Op *op,
+                                             string& entry,
+                                             RGWMetadataObject *obj,
+                                             RGWObjVersionTracker& objv_tracker,
+                                             optional_yield y,
+                                             const DoutPrefixProvider *dpp,
+                                             RGWMDLogSyncType type, bool from_remote_zone)
+{
+  RGWMetadataHandlerPut_BucketInstance put_op(svc.bucket->ctx(), this, op, entry, obj,
+                                              objv_tracker, y, type, from_remote_zone);
+  return do_put_operate(&put_op, dpp);
+}
+
+// Populate a fresh bucket index layout: generation 0, Mod hashing, index
+// type taken from 'type' (defaulting to Normal), and a shard count chosen
+// from, in priority order: the explicit 'shards' argument, the
+// rgw_override_bucket_index_max_shards config option (if positive), or the
+// zone's default. Normal-type indexes also get an initial in-index log layout.
+void init_default_bucket_layout(CephContext *cct, rgw::BucketLayout& layout,
+				const RGWZone& zone,
+				std::optional<uint32_t> shards,
+				std::optional<rgw::BucketIndexType> type) {
+  auto& index = layout.current_index;
+  index.gen = 0;
+  index.layout.normal.hash_type = rgw::BucketHashType::Mod;
+  index.layout.type = type.value_or(rgw::BucketIndexType::Normal);
+
+  uint32_t num_shards;
+  if (shards) {
+    num_shards = *shards;
+  } else if (cct->_conf->rgw_override_bucket_index_max_shards > 0) {
+    num_shards = cct->_conf->rgw_override_bucket_index_max_shards;
+  } else {
+    num_shards = zone.bucket_index_max_shards;
+  }
+  index.layout.normal.num_shards = num_shards;
+
+  if (index.layout.type == rgw::BucketIndexType::Normal) {
+    layout.logs.push_back(log_layout_from_index(0, index));
+  }
+}
+
+// Pre-store validation/fixup of the incoming bucket instance:
+//  - for remote-zone syncs, never import the peer's index layout (keep the
+//    local one, or build a default for brand-new buckets)
+//  - for new buckets (or a changed bucket_id), select a local placement rule
+//  - for existing buckets, preserve placement, and on archive zones force
+//    versioning on
+int RGWMetadataHandlerPut_BucketInstance::put_check(const DoutPrefixProvider *dpp)
+{
+  int ret;
+
+  RGWBucketCompleteInfo& bci = obj->get_bci();
+
+  RGWBucketInstanceMetadataObject *orig_obj = static_cast<RGWBucketInstanceMetadataObject *>(old_obj);
+
+  RGWBucketCompleteInfo *old_bci = (orig_obj ? &orig_obj->get_bci() : nullptr);
+
+  const bool exists = (!!orig_obj);
+
+  if (from_remote_zone) {
+    // don't sync bucket layout changes
+    if (!exists) {
+      // replace peer's layout with default-constructed, then apply our defaults
+      bci.info.layout = rgw::BucketLayout{};
+      init_default_bucket_layout(cct, bci.info.layout,
+				 bihandler->svc.zone->get_zone(),
+				 std::nullopt, std::nullopt);
+    } else {
+      bci.info.layout = old_bci->info.layout;
+    }
+  }
+
+  if (!exists || old_bci->info.bucket.bucket_id != bci.info.bucket.bucket_id) {
+    /* a new bucket, we need to select a new bucket placement for it */
+    string tenant_name;
+    string bucket_name;
+    string bucket_instance;
+    parse_bucket(entry, &tenant_name, &bucket_name, &bucket_instance);
+
+    // NOTE(review): when sync_module_supports_writes() is false, rule_info
+    // stays default-constructed and its index_type is still copied below —
+    // presumably the default index type is intended in that case; confirm.
+    RGWZonePlacementInfo rule_info;
+    bci.info.bucket.name = bucket_name;
+    bci.info.bucket.bucket_id = bucket_instance;
+    bci.info.bucket.tenant = tenant_name;
+    // if the sync module never writes data, don't require the zone to specify all placement targets
+    if (bihandler->svc.zone->sync_module_supports_writes()) {
+      ret = bihandler->svc.zone->select_bucket_location_by_rule(dpp, bci.info.placement_rule, &rule_info, y);
+      if (ret < 0) {
+        ldpp_dout(dpp, 0) << "ERROR: select_bucket_placement() returned " << ret << dendl;
+        return ret;
+      }
+    }
+    bci.info.layout.current_index.layout.type = rule_info.index_type;
+  } else {
+    /* always keep bucket versioning enabled on archive zone */
+    if (bihandler->driver->get_zone()->get_tier_type() == "archive") {
+      bci.info.flags = (bci.info.flags & ~BUCKET_VERSIONS_SUSPENDED) | BUCKET_VERSIONED;
+    }
+    /* existing bucket, keep its placement */
+    bci.info.bucket.explicit_placement = old_bci->info.bucket.explicit_placement;
+    bci.info.placement_rule = old_bci->info.placement_rule;
+  }
+
+  /* record the read version (if any), store the new version */
+  bci.info.objv_tracker.read_version = objv_tracker.read_version;
+  bci.info.objv_tracker.write_version = objv_tracker.write_version;
+
+  return 0;
+}
+
+// Store the validated bucket instance info, handing the backend the previous
+// version (if any) so it can detect/merge concurrent changes.
+int RGWMetadataHandlerPut_BucketInstance::put_checked(const DoutPrefixProvider *dpp)
+{
+  RGWBucketInstanceMetadataObject *orig_obj = static_cast<RGWBucketInstanceMetadataObject *>(old_obj);
+
+  RGWBucketInfo *orig_info = (orig_obj ? &orig_obj->get_bucket_info() : nullptr);
+
+  auto& info = obj->get_bucket_info();
+  auto mtime = obj->get_mtime();
+  auto pattrs = obj->get_pattrs();
+
+  RGWSI_Bucket_BI_Ctx ctx(op->ctx());
+
+  return bihandler->svc.bucket->store_bucket_instance_info(ctx,
+                                                         entry,
+                                                         info,
+                                                         orig_info,
+                                                         false,
+                                                         mtime,
+                                                         pattrs,
+                                                         y,
+                                                         dpp);
+}
+
+// After storing the instance: propagate the stored objv back to the caller's
+// tracker, (re)initialize the bucket index for the current layout, and sync
+// the bucket's lifecycle registration with the RGW_ATTR_LC xattr (set the LC
+// config if present, remove it otherwise). Returns STATUS_APPLIED on success.
+int RGWMetadataHandlerPut_BucketInstance::put_post(const DoutPrefixProvider *dpp)
+{
+  RGWBucketCompleteInfo& bci = obj->get_bci();
+
+  objv_tracker = bci.info.objv_tracker;
+
+  int ret = bihandler->svc.bi->init_index(dpp, bci.info, bci.info.layout.current_index);
+  if (ret < 0) {
+    return ret;
+  }
+
+  /* update lifecyle policy */
+  {
+    std::unique_ptr<rgw::sal::Bucket> bucket;
+    ret = bihandler->driver->get_bucket(nullptr, bci.info, &bucket);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << __func__ << " failed to get_bucket(...) for "
+			<< bci.info.bucket.name
+			<< dendl;
+      return ret;
+    }
+
+    auto lc = bihandler->driver->get_rgwlc();
+
+    auto lc_it = bci.attrs.find(RGW_ATTR_LC);
+    if (lc_it != bci.attrs.end()) {
+      ldpp_dout(dpp, 20) << "set lc config for " << bci.info.bucket.name << dendl;
+      ret = lc->set_bucket_config(bucket.get(), bci.attrs, nullptr);
+      if (ret < 0) {
+	      ldpp_dout(dpp, 0) << __func__ << " failed to set lc config for "
+			<< bci.info.bucket.name
+			<< dendl;
+	      return ret;
+      }
+
+    } else {
+      ldpp_dout(dpp, 20) << "remove lc config for " << bci.info.bucket.name << dendl;
+      ret = lc->remove_bucket_config(bucket.get(), bci.attrs, false /* cannot merge attrs */);
+      if (ret < 0) {
+	      ldpp_dout(dpp, 0) << __func__ << " failed to remove lc config for "
+			<< bci.info.bucket.name
+			<< dendl;
+	      return ret;
+      }
+    }
+  } /* update lc */
+
+  return STATUS_APPLIED;
+}
+
+// Archive-zone variant of the instance handler: instance removal is a no-op
+// (the archive zone keeps everything); puts go through the base class.
+class RGWArchiveBucketInstanceMetadataHandler : public RGWBucketInstanceMetadataHandler {
+public:
+  RGWArchiveBucketInstanceMetadataHandler(rgw::sal::Driver* driver)
+    : RGWBucketInstanceMetadataHandler(driver) {}
+
+  // N.B. replication of lifecycle policy relies on logic in RGWBucketInstanceMetadataHandler::do_put(...), override with caution
+
+  int do_remove(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWObjVersionTracker& objv_tracker, optional_yield y, const DoutPrefixProvider *dpp) override {
+    ldpp_dout(dpp, 0) << "SKIP: bucket instance removal is not allowed on archive zone: bucket.instance:" << entry << dendl;
+    return 0;
+  }
+};
+
+// Wire up the service layer pointers; handlers are attached later in init().
+RGWBucketCtl::RGWBucketCtl(RGWSI_Zone *zone_svc,
+                           RGWSI_Bucket *bucket_svc,
+                           RGWSI_Bucket_Sync *bucket_sync_svc,
+                           RGWSI_BucketIndex *bi_svc,
+                           RGWSI_User* user_svc)
+  : cct(zone_svc->ctx())
+{
+  svc.zone = zone_svc;
+  svc.bucket = bucket_svc;
+  svc.bucket_sync = bucket_sync_svc;
+  svc.bi = bi_svc;
+  svc.user = user_svc;
+}
+
+// Attach the metadata handlers and register a datalog filter so only buckets
+// that actually export data generate datalog entries.
+void RGWBucketCtl::init(RGWUserCtl *user_ctl,
+                        RGWBucketMetadataHandler *_bm_handler,
+                        RGWBucketInstanceMetadataHandler *_bmi_handler,
+                        RGWDataChangesLog *datalog,
+                        const DoutPrefixProvider *dpp)
+{
+  ctl.user = user_ctl;
+
+  bm_handler = _bm_handler;
+  bmi_handler = _bmi_handler;
+
+  bucket_be_handler = bm_handler->get_be_handler();
+  bi_be_handler = bmi_handler->get_be_handler();
+
+  datalog->set_bucket_filter(
+    [this](const rgw_bucket& bucket, optional_yield y, const DoutPrefixProvider *dpp) {
+      return bucket_exports_data(bucket, y, dpp);
+    });
+}
+
+// Run f with a combined context that spans both the entrypoint and the
+// bucket-instance backends (nests the two handlers' call() scopes).
+int RGWBucketCtl::call(std::function<int(RGWSI_Bucket_X_Ctx& ctx)> f) {
+  return bm_handler->call([&](RGWSI_Bucket_EP_Ctx& ep_ctx) {
+    return bmi_handler->call([&](RGWSI_Bucket_BI_Ctx& bi_ctx) {
+      RGWSI_Bucket_X_Ctx ctx{ep_ctx, bi_ctx};
+      return f(ctx);
+    });
+  });
+}
+
+// Read the bucket entrypoint for 'bucket' inside an EP backend context,
+// forwarding the optional outputs (objv/mtime/attrs/cache) from params.
+int RGWBucketCtl::read_bucket_entrypoint_info(const rgw_bucket& bucket,
+                                              RGWBucketEntryPoint *info,
+                                              optional_yield y, const DoutPrefixProvider *dpp,
+                                              const Bucket::GetParams& params)
+{
+  return bm_handler->call(params.bectx_params, [&](RGWSI_Bucket_EP_Ctx& ctx) {
+    return svc.bucket->read_bucket_entrypoint_info(ctx,
+                                                   RGWSI_Bucket::get_entrypoint_meta_key(bucket),
+                                                   info,
+                                                   params.objv_tracker,
+                                                   params.mtime,
+                                                   params.attrs,
+                                                   y,
+                                                   dpp,
+                                                   params.cache_info,
+                                                   params.refresh_version);
+  });
+}
+
+// Store the bucket entrypoint inside an EP backend context, honoring the
+// exclusive/mtime/attrs/objv options carried in params.
+int RGWBucketCtl::store_bucket_entrypoint_info(const rgw_bucket& bucket,
+                                               RGWBucketEntryPoint& info,
+                                               optional_yield y,
+                                               const DoutPrefixProvider *dpp,
+                                               const Bucket::PutParams& params)
+{
+  return bm_handler->call([&](RGWSI_Bucket_EP_Ctx& ctx) {
+    return svc.bucket->store_bucket_entrypoint_info(ctx,
+                                                    RGWSI_Bucket::get_entrypoint_meta_key(bucket),
+                                                    info,
+                                                    params.exclusive,
+                                                    params.mtime,
+                                                    params.attrs,
+                                                    params.objv_tracker,
+                                                    y,
+                                                    dpp);
+  });
+}
+
+// Remove the bucket entrypoint inside an EP backend context.
+int RGWBucketCtl::remove_bucket_entrypoint_info(const rgw_bucket& bucket,
+                                                optional_yield y,
+                                                const DoutPrefixProvider *dpp,
+                                                const Bucket::RemoveParams& params)
+{
+  return bm_handler->call([&](RGWSI_Bucket_EP_Ctx& ctx) {
+    return svc.bucket->remove_bucket_entrypoint_info(ctx,
+                                                     RGWSI_Bucket::get_entrypoint_meta_key(bucket),
+                                                     params.objv_tracker,
+                                                     y,
+                                                     dpp);
+  });
+}
+
+// Read bucket-instance info inside a BI backend context; on success also
+// copies the instance's own object version into params.objv_tracker.
+int RGWBucketCtl::read_bucket_instance_info(const rgw_bucket& bucket,
+                                            RGWBucketInfo *info,
+                                            optional_yield y,
+                                            const DoutPrefixProvider *dpp,
+                                            const BucketInstance::GetParams& params)
+{
+  int ret = bmi_handler->call(params.bectx_params, [&](RGWSI_Bucket_BI_Ctx& ctx) {
+    return svc.bucket->read_bucket_instance_info(ctx,
+                                                 RGWSI_Bucket::get_bi_meta_key(bucket),
+                                                 info,
+                                                 params.mtime,
+                                                 params.attrs,
+                                                 y,
+                                                 dpp,
+                                                 params.cache_info,
+                                                 params.refresh_version);
+  });
+
+  if (ret < 0) {
+    return ret;
+  }
+
+  if (params.objv_tracker) {
+    *params.objv_tracker = info->objv_tracker;
+  }
+
+  return 0;
+}
+
+// Read bucket info when the caller may only know tenant/name: if bucket_id
+// is empty, first resolve the entrypoint to find the current instance, then
+// read that instance's info. ep_objv_tracker (optional) receives the
+// entrypoint's version; params.objv_tracker receives the instance's.
+int RGWBucketCtl::read_bucket_info(const rgw_bucket& bucket,
+                                   RGWBucketInfo *info,
+                                   optional_yield y,
+                                   const DoutPrefixProvider *dpp,
+                                   const BucketInstance::GetParams& params,
+                                   RGWObjVersionTracker *ep_objv_tracker)
+{
+  const rgw_bucket *b = &bucket;
+
+  std::optional<RGWBucketEntryPoint> ep;
+
+  if (b->bucket_id.empty()) {
+    ep.emplace();
+
+    int r = read_bucket_entrypoint_info(*b, &(*ep), y, dpp, RGWBucketCtl::Bucket::GetParams()
+                                                    .set_bectx_params(params.bectx_params)
+                                                    .set_objv_tracker(ep_objv_tracker));
+    if (r < 0) {
+      return r;
+    }
+
+    // continue with the fully-qualified bucket from the entrypoint
+    b = &ep->bucket;
+  }
+
+  int ret = bmi_handler->call(params.bectx_params, [&](RGWSI_Bucket_BI_Ctx& ctx) {
+    return svc.bucket->read_bucket_instance_info(ctx,
+                                                 RGWSI_Bucket::get_bi_meta_key(*b),
+                                                 info,
+                                                 params.mtime,
+                                                 params.attrs,
+                                                 y, dpp,
+                                                 params.cache_info,
+                                                 params.refresh_version);
+  });
+
+  if (ret < 0) {
+    return ret;
+  }
+
+  if (params.objv_tracker) {
+    *params.objv_tracker = info->objv_tracker;
+  }
+
+  return 0;
+}
+
+// Store instance info using an existing BI context; if params carries an
+// objv tracker it overrides the one embedded in info before the write.
+int RGWBucketCtl::do_store_bucket_instance_info(RGWSI_Bucket_BI_Ctx& ctx,
+                                                const rgw_bucket& bucket,
+                                                RGWBucketInfo& info,
+                                                optional_yield y,
+                                                const DoutPrefixProvider *dpp,
+                                                const BucketInstance::PutParams& params)
+{
+  if (params.objv_tracker) {
+    info.objv_tracker = *params.objv_tracker;
+  }
+
+  return svc.bucket->store_bucket_instance_info(ctx,
+                                                RGWSI_Bucket::get_bi_meta_key(bucket),
+                                                info,
+                                                params.orig_info,
+                                                params.exclusive,
+                                                params.mtime,
+                                                params.attrs,
+                                                y,
+                                                dpp);
+}
+
+// Store a bucket instance record; acquires a bucket-instance metadata
+// context from the handler and delegates to the shared helper.
+int RGWBucketCtl::store_bucket_instance_info(const rgw_bucket& bucket,
+                                             RGWBucketInfo& info,
+                                             optional_yield y,
+                                             const DoutPrefixProvider *dpp,
+                                             const BucketInstance::PutParams& params)
+{
+  auto store_op = [&](RGWSI_Bucket_BI_Ctx& bi_ctx) {
+    return do_store_bucket_instance_info(bi_ctx, bucket, info, y, dpp, params);
+  };
+  return bmi_handler->call(store_op);
+}
+
+// Remove a bucket instance record. A caller-supplied objv tracker (if any)
+// is copied into 'info' first, and the removal is conditioned on
+// info.objv_tracker.
+int RGWBucketCtl::remove_bucket_instance_info(const rgw_bucket& bucket,
+                                              RGWBucketInfo& info,
+                                              optional_yield y,
+                                              const DoutPrefixProvider *dpp,
+                                              const BucketInstance::RemoveParams& params)
+{
+  if (params.objv_tracker) {
+    info.objv_tracker = *params.objv_tracker;
+  }
+
+  return bmi_handler->call([&](RGWSI_Bucket_BI_Ctx& ctx) {
+    return svc.bucket->remove_bucket_instance_info(ctx,
+                                                   RGWSI_Bucket::get_bi_meta_key(bucket),
+                                                   info,
+                                                   &info.objv_tracker,
+                                                   y,
+                                                   dpp);
+  });
+}
+
+// Store a bucket instance and, when needed, a matching entrypoint that
+// points at it. The entrypoint ("head") is written when the instance is
+// new (!info.has_instance_obj) or when explicitly requested via
+// create_entry_point. pep_objv, if non-null, either supplies the
+// entrypoint write version (when it carries a tag) or receives the newly
+// generated one.
+int RGWBucketCtl::do_store_linked_bucket_info(RGWSI_Bucket_X_Ctx& ctx,
+                                              RGWBucketInfo& info,
+                                              RGWBucketInfo *orig_info,
+                                              bool exclusive, real_time mtime,
+                                              obj_version *pep_objv,
+                                              map<string, bufferlist> *pattrs,
+                                              bool create_entry_point,
+                                              optional_yield y, const DoutPrefixProvider *dpp)
+{
+  bool create_head = !info.has_instance_obj || create_entry_point;
+
+  int ret = svc.bucket->store_bucket_instance_info(ctx.bi,
+                                                   RGWSI_Bucket::get_bi_meta_key(info.bucket),
+                                                   info,
+                                                   orig_info,
+                                                   exclusive,
+                                                   mtime, pattrs,
+                                                   y, dpp);
+  if (ret < 0) {
+    return ret;
+  }
+
+  if (!create_head)
+    return 0; /* done! */
+
+  // build the linked entrypoint from the instance's identity fields
+  RGWBucketEntryPoint entry_point;
+  entry_point.bucket = info.bucket;
+  entry_point.owner = info.owner;
+  entry_point.creation_time = info.creation_time;
+  entry_point.linked = true;
+  RGWObjVersionTracker ot;
+  if (pep_objv && !pep_objv->tag.empty()) {
+    // caller dictates the entrypoint version
+    ot.write_version = *pep_objv;
+  } else {
+    ot.generate_new_write_ver(cct);
+    if (pep_objv) {
+      // report the generated version back to the caller
+      *pep_objv = ot.write_version;
+    }
+  }
+  ret = svc.bucket->store_bucket_entrypoint_info(ctx.ep,
+                                                 RGWSI_Bucket::get_entrypoint_meta_key(info.bucket),
+                                                 entry_point,
+                                                 exclusive,
+                                                 mtime,
+                                                 pattrs,
+                                                 &ot,
+                                                 y,
+                                                 dpp);
+  if (ret < 0)
+    return ret;
+
+  return 0;
+}
+// Convert a pre-instance-object bucket (whose RGWBucketInfo is embedded in
+// the entrypoint) into the split entrypoint + instance-object layout.
+// No-op if the entrypoint no longer embeds bucket info.
+int RGWBucketCtl::convert_old_bucket_info(RGWSI_Bucket_X_Ctx& ctx,
+                                          const rgw_bucket& bucket,
+                                          optional_yield y,
+                                          const DoutPrefixProvider *dpp)
+{
+  RGWBucketEntryPoint entry_point;
+  real_time ep_mtime;
+  RGWObjVersionTracker ot;
+  map<string, bufferlist> attrs;
+  RGWBucketInfo info;
+  auto cct = svc.bucket->ctx();
+
+  // NOTE: log prefixes previously named RGWRados/put_linked_bucket_info,
+  // which do not match this code path; corrected below.
+  ldpp_dout(dpp, 10) << "RGWBucketCtl::convert_old_bucket_info(): bucket=" << bucket << dendl;
+
+  int ret = svc.bucket->read_bucket_entrypoint_info(ctx.ep,
+                                                    RGWSI_Bucket::get_entrypoint_meta_key(bucket),
+                                                    &entry_point, &ot, &ep_mtime, &attrs, y, dpp);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: get_bucket_entrypoint_info() returned " << ret << " bucket=" << bucket << dendl;
+    return ret;
+  }
+
+  if (!entry_point.has_bucket_info) {
+    /* already converted! */
+    return 0;
+  }
+
+  info = entry_point.old_bucket_info;
+
+  ot.generate_new_write_ver(cct);
+
+  // write the instance object and rewrite the (now info-less) entrypoint
+  ret = do_store_linked_bucket_info(ctx, info, nullptr, false, ep_mtime, &ot.write_version, &attrs, true, y, dpp);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to do_store_linked_bucket_info(): " << ret << dendl;
+    return ret;
+  }
+
+  return 0;
+}
+
+// Replace a bucket instance's xattrs. Buckets still in the legacy
+// (pre-instance-object) format are converted first, since attrs live on
+// the instance object.
+int RGWBucketCtl::set_bucket_instance_attrs(RGWBucketInfo& bucket_info,
+                                            map<string, bufferlist>& attrs,
+                                            RGWObjVersionTracker *objv_tracker,
+                                            optional_yield y,
+                                            const DoutPrefixProvider *dpp)
+{
+  return call([&](RGWSI_Bucket_X_Ctx& ctx) {
+    rgw_bucket& bucket = bucket_info.bucket;
+
+    if (!bucket_info.has_instance_obj) {
+      /* an old bucket object, need to convert it */
+      int ret = convert_old_bucket_info(ctx, bucket, y, dpp);
+      if (ret < 0) {
+        ldpp_dout(dpp, 0) << "ERROR: failed converting old bucket info: " << ret << dendl;
+        return ret;
+      }
+    }
+
+    // orig_info is the current in-memory info: store as an update, not create
+    return do_store_bucket_instance_info(ctx.bi,
+                                         bucket,
+                                         bucket_info,
+                                         y,
+                                         dpp,
+                                         BucketInstance::PutParams().set_attrs(&attrs)
+                                                                    .set_objv_tracker(objv_tracker)
+                                                                    .set_orig_info(&bucket_info));
+  });
+}
+
+
+// Public entry point: link a bucket to a user, running the low-level
+// helper under an entrypoint metadata context.
+int RGWBucketCtl::link_bucket(const rgw_user& user_id,
+                              const rgw_bucket& bucket,
+                              ceph::real_time creation_time,
+                              optional_yield y,
+                              const DoutPrefixProvider *dpp,
+                              bool update_entrypoint,
+                              rgw_ep_info *pinfo)
+{
+  auto link_op = [&](RGWSI_Bucket_EP_Ctx& ep_ctx) {
+    return do_link_bucket(ep_ctx, user_id, bucket, creation_time,
+                          update_entrypoint, pinfo, y, dpp);
+  };
+  return bm_handler->call(link_op);
+}
+
+// Low-level link: add the bucket to the user's bucket directory and
+// (optionally) rewrite the entrypoint with linked=true/owner set. If the
+// entrypoint write fails after the directory update succeeded, the
+// directory entry is best-effort rolled back via do_unlink_bucket.
+int RGWBucketCtl::do_link_bucket(RGWSI_Bucket_EP_Ctx& ctx,
+                                 const rgw_user& user_id,
+                                 const rgw_bucket& bucket,
+                                 ceph::real_time creation_time,
+                                 bool update_entrypoint,
+                                 rgw_ep_info *pinfo,
+                                 optional_yield y,
+                                 const DoutPrefixProvider *dpp)
+{
+  int ret;
+
+  RGWBucketEntryPoint ep;
+  RGWObjVersionTracker ot;
+  // use the caller's entrypoint objv when one was pre-fetched
+  RGWObjVersionTracker& rot = (pinfo) ? pinfo->ep_objv : ot;
+  map<string, bufferlist> attrs, *pattrs = nullptr;
+  string meta_key;
+
+  if (update_entrypoint) {
+    meta_key = RGWSI_Bucket::get_entrypoint_meta_key(bucket);
+    if (pinfo) {
+      // caller already read the entrypoint + attrs; reuse them
+      ep = pinfo->ep;
+      pattrs = &pinfo->attrs;
+    } else {
+      ret = svc.bucket->read_bucket_entrypoint_info(ctx,
+                                                    meta_key,
+                                                    &ep, &rot,
+                                                    nullptr, &attrs,
+                                                    y, dpp);
+      if (ret < 0 && ret != -ENOENT) {
+        // deliberately non-fatal: proceed with an empty entrypoint
+        ldpp_dout(dpp, 0) << "ERROR: read_bucket_entrypoint_info() returned: "
+                          << cpp_strerror(-ret) << dendl;
+      }
+      pattrs = &attrs;
+    }
+  }
+
+  ret = svc.user->add_bucket(dpp, user_id, bucket, creation_time, y);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: error adding bucket to user directory:"
+		      << " user=" << user_id
+                      << " bucket=" << bucket
+		      << " err=" << cpp_strerror(-ret)
+		      << dendl;
+    goto done_err;
+  }
+
+  if (!update_entrypoint)
+    return 0;
+
+  ep.linked = true;
+  ep.owner = user_id;
+  ep.bucket = bucket;
+  ret = svc.bucket->store_bucket_entrypoint_info(
+    ctx, meta_key, ep, false, real_time(), pattrs, &rot, y, dpp);
+  if (ret < 0)
+    goto done_err;
+
+  return 0;
+
+done_err:
+  // rollback: report the cleanup failure but return the original error
+  int r = do_unlink_bucket(ctx, user_id, bucket, true, y, dpp);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed unlinking bucket on error cleanup: "
+                      << cpp_strerror(-r) << dendl;
+  }
+  return ret;
+}
+
+// Public entry point: detach a bucket from a user under an entrypoint
+// metadata context.
+int RGWBucketCtl::unlink_bucket(const rgw_user& user_id, const rgw_bucket& bucket, optional_yield y, const DoutPrefixProvider *dpp, bool update_entrypoint)
+{
+  auto unlink_op = [&](RGWSI_Bucket_EP_Ctx& ep_ctx) {
+    return do_unlink_bucket(ep_ctx, user_id, bucket, update_entrypoint, y, dpp);
+  };
+  return bm_handler->call(unlink_op);
+}
+
+// Low-level unlink: drop the bucket from the user's directory, then
+// (optionally) mark the entrypoint unlinked. A missing or already-unlinked
+// entrypoint is treated as success; an owner mismatch is an error.
+int RGWBucketCtl::do_unlink_bucket(RGWSI_Bucket_EP_Ctx& ctx,
+                                   const rgw_user& user_id,
+                                   const rgw_bucket& bucket,
+                                   bool update_entrypoint,
+                                   optional_yield y,
+                                   const DoutPrefixProvider *dpp)
+{
+  int ret = svc.user->remove_bucket(dpp, user_id, bucket, y);
+  if (ret < 0) {
+    // logged but not returned: continue to the entrypoint update regardless
+    ldpp_dout(dpp, 0) << "ERROR: error removing bucket from directory: "
+                      << cpp_strerror(-ret)<< dendl;
+  }
+
+  if (!update_entrypoint)
+    return 0;
+
+  RGWBucketEntryPoint ep;
+  RGWObjVersionTracker ot;
+  map<string, bufferlist> attrs;
+  string meta_key = RGWSI_Bucket::get_entrypoint_meta_key(bucket);
+  ret = svc.bucket->read_bucket_entrypoint_info(ctx, meta_key, &ep, &ot, nullptr, &attrs, y, dpp);
+  if (ret == -ENOENT)
+    return 0;
+  if (ret < 0)
+    return ret;
+
+  if (!ep.linked)
+    return 0;
+
+  if (ep.owner != user_id) {
+    ldpp_dout(dpp, 0) << "bucket entry point user mismatch, can't unlink bucket: " << ep.owner << " != " << user_id << dendl;
+    return -EINVAL;
+  }
+
+  // keep the entrypoint but record it as unlinked
+  ep.linked = false;
+  return svc.bucket->store_bucket_entrypoint_info(ctx, meta_key, ep, false, real_time(), &attrs, &ot, y, dpp);
+}
+
+// Fetch usage/stats for a single bucket via the bucket service.
+int RGWBucketCtl::read_bucket_stats(const rgw_bucket& bucket,
+                                    RGWBucketEnt *result,
+                                    optional_yield y,
+                                    const DoutPrefixProvider *dpp)
+{
+  auto stats_op = [&](RGWSI_Bucket_X_Ctx& sctx) {
+    return svc.bucket->read_bucket_stats(sctx, bucket, result, y, dpp);
+  };
+  return call(stats_op);
+}
+
+// Fetch usage/stats for a batch of buckets (map keyed by bucket name).
+int RGWBucketCtl::read_buckets_stats(map<string, RGWBucketEnt>& m,
+                                     optional_yield y, const DoutPrefixProvider *dpp)
+{
+  auto stats_op = [&](RGWSI_Bucket_X_Ctx& sctx) {
+    return svc.bucket->read_buckets_stats(sctx, m, y, dpp);
+  };
+  return call(stats_op);
+}
+
+// Read the bucket's current index stats and flush them into the owning
+// user's aggregated stats. If pent is null a local RGWBucketEnt is used;
+// otherwise the caller also receives the stats that were flushed.
+int RGWBucketCtl::sync_user_stats(const DoutPrefixProvider *dpp,
+                                  const rgw_user& user_id,
+                                  const RGWBucketInfo& bucket_info,
+                                  optional_yield y,
+                                  RGWBucketEnt* pent)
+{
+  RGWBucketEnt ent;
+  if (!pent) {
+    pent = &ent;
+  }
+  int r = svc.bi->read_stats(dpp, bucket_info, pent, y);
+  if (r < 0) {
+    ldpp_dout(dpp, 20) << __func__ << "(): failed to read bucket stats (r=" << r << ")" << dendl;
+    return r;
+  }
+
+  return svc.user->flush_bucket_stats(dpp, user_id, *pent, y);
+}
+
+// Resolve the sync policy handler for the given (zone, bucket) pair; both
+// are optional and default to the local zone / zone-wide policy.
+int RGWBucketCtl::get_sync_policy_handler(std::optional<rgw_zone_id> zone,
+                                          std::optional<rgw_bucket> bucket,
+                                          RGWBucketSyncPolicyHandlerRef *phandler,
+                                          optional_yield y,
+                                          const DoutPrefixProvider *dpp)
+{
+  int r = call([&](RGWSI_Bucket_X_Ctx& ctx) {
+    return svc.bucket_sync->get_policy_handler(ctx, zone, bucket, phandler, y, dpp);
+  });
+  if (r < 0) {
+    ldpp_dout(dpp, 20) << __func__ << "(): failed to get policy handler for bucket=" << bucket << " (r=" << r << ")" << dendl;
+    return r;
+  }
+  return 0;
+}
+
+// Whether this bucket's sync policy exports data to other zones.
+// Returns a negative error if the policy handler cannot be resolved.
+int RGWBucketCtl::bucket_exports_data(const rgw_bucket& bucket,
+                                      optional_yield y,
+                                      const DoutPrefixProvider *dpp)
+{
+  RGWBucketSyncPolicyHandlerRef policy_handler;
+  int ret = get_sync_policy_handler(std::nullopt, bucket, &policy_handler, y, dpp);
+  if (ret < 0) {
+    return ret;
+  }
+  return policy_handler->bucket_exports_data();
+}
+
+// Whether this bucket's sync policy imports data from other zones.
+// Returns a negative error if the policy handler cannot be resolved.
+int RGWBucketCtl::bucket_imports_data(const rgw_bucket& bucket,
+                                      optional_yield y, const DoutPrefixProvider *dpp)
+{
+  RGWBucketSyncPolicyHandlerRef policy_handler;
+  int ret = get_sync_policy_handler(std::nullopt, bucket, &policy_handler, y, dpp);
+  if (ret < 0) {
+    return ret;
+  }
+  return policy_handler->bucket_imports_data();
+}
+
+// Factory: default bucket-entrypoint metadata handler (caller owns result).
+RGWBucketMetadataHandlerBase* RGWBucketMetaHandlerAllocator::alloc()
+{
+  return new RGWBucketMetadataHandler();
+}
+
+// Factory: default bucket-instance metadata handler (caller owns result).
+RGWBucketInstanceMetadataHandlerBase* RGWBucketInstanceMetaHandlerAllocator::alloc(rgw::sal::Driver* driver)
+{
+  return new RGWBucketInstanceMetadataHandler(driver);
+}
+
+// Factory: archive-zone variant of the bucket-entrypoint metadata handler.
+RGWBucketMetadataHandlerBase* RGWArchiveBucketMetaHandlerAllocator::alloc()
+{
+  return new RGWArchiveBucketMetadataHandler();
+}
+
+// Factory: archive-zone variant of the bucket-instance metadata handler.
+RGWBucketInstanceMetadataHandlerBase* RGWArchiveBucketInstanceMetaHandlerAllocator::alloc(rgw::sal::Driver* driver)
+{
+  return new RGWArchiveBucketInstanceMetadataHandler(driver);
+}
+
+
+// Produce sample instances for encode/decode unit tests: one populated
+// entrypoint and one default-constructed (callers take ownership).
+void RGWBucketEntryPoint::generate_test_instances(list<RGWBucketEntryPoint*>& o)
+{
+  RGWBucketEntryPoint *bp = new RGWBucketEntryPoint();
+  init_bucket(&bp->bucket, "tenant", "bucket", "pool", ".index.pool", "marker", "10");
+  bp->owner = "owner";
+  bp->creation_time = ceph::real_clock::from_ceph_timespec({ceph_le32(2), ceph_le32(3)});
+
+  o.push_back(bp);
+  o.push_back(new RGWBucketEntryPoint);
+}
+
+// JSON dump; field order/names mirror decode_json below. old_bucket_info
+// is emitted only for legacy (pre-conversion) entrypoints.
+void RGWBucketEntryPoint::dump(Formatter *f) const
+{
+  encode_json("bucket", bucket, f);
+  encode_json("owner", owner, f);
+  utime_t ut(creation_time);
+  encode_json("creation_time", ut, f);
+  encode_json("linked", linked, f);
+  encode_json("has_bucket_info", has_bucket_info, f);
+  if (has_bucket_info) {
+    encode_json("old_bucket_info", old_bucket_info, f);
+  }
+}
+
+// JSON decode; inverse of dump(). old_bucket_info is read only when
+// has_bucket_info was decoded as true.
+void RGWBucketEntryPoint::decode_json(JSONObj *obj) {
+  JSONDecoder::decode_json("bucket", bucket, obj);
+  JSONDecoder::decode_json("owner", owner, obj);
+  utime_t ut;
+  JSONDecoder::decode_json("creation_time", ut, obj);
+  creation_time = ut.to_real_time();
+  JSONDecoder::decode_json("linked", linked, obj);
+  JSONDecoder::decode_json("has_bucket_info", has_bucket_info, obj);
+  if (has_bucket_info) {
+    JSONDecoder::decode_json("old_bucket_info", old_bucket_info, obj);
+  }
+}
+
diff --git a/src/rgw/driver/rados/rgw_bucket.h b/src/rgw/driver/rados/rgw_bucket.h
new file mode 100644
index 000000000..c13e737ce
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_bucket.h
@@ -0,0 +1,766 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <string>
+#include <memory>
+#include <variant>
+
+#include <boost/container/flat_map.hpp>
+#include <boost/container/flat_set.hpp>
+
+#include "include/types.h"
+#include "rgw_common.h"
+#include "rgw_tools.h"
+#include "rgw_metadata.h"
+#include "rgw/rgw_bucket.h"
+
+#include "rgw_string.h"
+#include "rgw_sal.h"
+
+#include "common/Formatter.h"
+#include "common/lru_map.h"
+#include "common/ceph_time.h"
+
+#include "rgw_formats.h"
+
+#include "services/svc_bucket_types.h"
+#include "services/svc_bucket_sync.h"
+
+// define as static when RGWBucket implementation completes
+extern void rgw_get_buckets_obj(const rgw_user& user_id, std::string& buckets_obj_id);
+
+class RGWSI_Meta;
+class RGWBucketMetadataHandler;
+class RGWBucketInstanceMetadataHandler;
+class RGWUserCtl;
+class RGWBucketCtl;
+class RGWZone;
+struct RGWZoneParams;
+
+// this is used as a filter to RGWRados::cls_bucket_list_ordered; it
+// conforms to the type RGWBucketListNameFilter
+extern bool rgw_bucket_object_check_filter(const std::string& oid);
+
+void init_default_bucket_layout(CephContext *cct, rgw::BucketLayout& layout,
+ const RGWZone& zone,
+ std::optional<uint32_t> shards,
+ std::optional<rgw::BucketIndexType> type);
+
+// Bucket instance info together with its xattrs, as one JSON-serializable
+// unit (used by the bucket-instance metadata object below).
+struct RGWBucketCompleteInfo {
+  RGWBucketInfo info;
+  std::map<std::string, bufferlist> attrs;
+
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+};
+
+// Metadata-object wrapper around a bucket entrypoint (+ its xattrs) for
+// the generic metadata machinery. Both ctors register the member attrs map
+// via set_pattrs, so get_attrs() and the base-class pattrs stay in sync.
+class RGWBucketEntryMetadataObject : public RGWMetadataObject {
+  RGWBucketEntryPoint ep;
+  std::map<std::string, bufferlist> attrs;
+public:
+  RGWBucketEntryMetadataObject(RGWBucketEntryPoint& _ep, const obj_version& v, real_time m) : ep(_ep) {
+    objv = v;
+    mtime = m;
+    set_pattrs (&attrs);
+  }
+  RGWBucketEntryMetadataObject(RGWBucketEntryPoint& _ep, const obj_version& v, real_time m, std::map<std::string, bufferlist>&& _attrs) :
+    ep(_ep), attrs(std::move(_attrs)) {
+    objv = v;
+    mtime = m;
+    set_pattrs (&attrs);
+  }
+
+  void dump(Formatter *f) const override {
+    ep.dump(f);
+  }
+
+  RGWBucketEntryPoint& get_ep() {
+    return ep;
+  }
+
+  std::map<std::string, bufferlist>& get_attrs() {
+    return attrs;
+  }
+};
+
+// Metadata-object wrapper around a complete bucket instance
+// (info + attrs) for the generic metadata machinery.
+class RGWBucketInstanceMetadataObject : public RGWMetadataObject {
+  RGWBucketCompleteInfo info;
+public:
+  RGWBucketInstanceMetadataObject() {}
+  RGWBucketInstanceMetadataObject(RGWBucketCompleteInfo& i, const obj_version& v, real_time m) : info(i) {
+    objv = v;
+    mtime = m;
+  }
+
+  void dump(Formatter *f) const override {
+    info.dump(f);
+  }
+
+  void decode_json(JSONObj *obj) {
+    info.decode_json(obj);
+  }
+
+  RGWBucketCompleteInfo& get_bci() {
+    return info;
+  }
+  RGWBucketInfo& get_bucket_info() {
+    return info.info;
+  }
+};
+
+/**
+ * Store a list of the user's buckets, with associated functions.
+ */
+class RGWUserBuckets {
+  std::map<std::string, RGWBucketEnt> buckets;
+
+public:
+  RGWUserBuckets() = default;
+  RGWUserBuckets(RGWUserBuckets&&) = default;
+
+  RGWUserBuckets& operator=(const RGWUserBuckets&) = default;
+
+  void encode(bufferlist& bl) const {
+    using ceph::encode;
+    encode(buckets, bl);
+  }
+  void decode(bufferlist::const_iterator& bl) {
+    using ceph::decode;
+    decode(buckets, bl);
+  }
+  /**
+   * Check if the user owns a bucket by the given name.
+   * (const-correct: takes a const name and does not modify *this)
+   */
+  bool owns(const std::string& name) const {
+    return buckets.find(name) != buckets.end();
+  }
+
+  /**
+   * Add a (created) bucket to the user's bucket list.
+   */
+  void add(const RGWBucketEnt& bucket) {
+    buckets[bucket.bucket.name] = bucket;
+  }
+
+  /**
+   * Remove a bucket from the user's list by name (no-op if absent).
+   */
+  void remove(const std::string& name) {
+    buckets.erase(name);
+  }
+
+  /**
+   * Get the user's buckets as a map.
+   */
+  std::map<std::string, RGWBucketEnt>& get_buckets() { return buckets; }
+
+  /**
+   * Cleanup data structure
+   */
+  void clear() { buckets.clear(); }
+
+  size_t count() const { return buckets.size(); }
+};
+WRITE_CLASS_ENCODER(RGWUserBuckets)
+
+// Abstract base for bucket-entrypoint metadata handlers; concrete
+// subclasses are created via the allocator classes below and wired up
+// through init().
+class RGWBucketMetadataHandlerBase : public RGWMetadataHandler_GenericMetaBE {
+public:
+  virtual ~RGWBucketMetadataHandlerBase() {}
+  virtual void init(RGWSI_Bucket *bucket_svc,
+                    RGWBucketCtl *bucket_ctl) = 0;
+
+};
+
+// Abstract base for bucket-instance metadata handlers; concrete
+// subclasses are created via the allocator classes below.
+class RGWBucketInstanceMetadataHandlerBase : public RGWMetadataHandler_GenericMetaBE {
+public:
+  virtual ~RGWBucketInstanceMetadataHandlerBase() {}
+  virtual void init(RGWSI_Zone *zone_svc,
+                    RGWSI_Bucket *bucket_svc,
+                    RGWSI_BucketIndex *bi_svc) = 0;
+};
+
+// Allocator for the default bucket-entrypoint metadata handler.
+class RGWBucketMetaHandlerAllocator {
+public:
+  static RGWBucketMetadataHandlerBase *alloc();
+};
+
+// Allocator for the default bucket-instance metadata handler.
+class RGWBucketInstanceMetaHandlerAllocator {
+public:
+  static RGWBucketInstanceMetadataHandlerBase *alloc(rgw::sal::Driver* driver);
+};
+
+// Allocator for the archive-zone bucket-entrypoint metadata handler.
+class RGWArchiveBucketMetaHandlerAllocator {
+public:
+  static RGWBucketMetadataHandlerBase *alloc();
+};
+
+// Allocator for the archive-zone bucket-instance metadata handler.
+class RGWArchiveBucketInstanceMetaHandlerAllocator {
+public:
+  static RGWBucketInstanceMetadataHandlerBase *alloc(rgw::sal::Driver* driver);
+};
+
+extern int rgw_remove_object(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, rgw::sal::Bucket* bucket, rgw_obj_key& key);
+
+extern int rgw_object_get_attr(rgw::sal::Driver* driver, rgw::sal::Object* obj,
+ const char* attr_name, bufferlist& out_bl,
+ optional_yield y);
+
+extern void check_bad_user_bucket_mapping(rgw::sal::Driver* driver, rgw::sal::User& user, bool fix, optional_yield y, const DoutPrefixProvider *dpp);
+
+/**
+ * Option/state carrier for administrative (radosgw-admin style) bucket
+ * operations. Pure data holder: setters/getters only, no I/O.
+ *
+ * All flags now use in-class initializers — consistent with max_aio and
+ * min_age, which already did — replacing the constructor init-list that
+ * previously duplicated the defaults.
+ */
+struct RGWBucketAdminOpState {
+  rgw_user uid;
+  std::string display_name;
+  std::string bucket_name;
+  std::string bucket_id;
+  std::string object_name;
+  std::string new_bucket_name;
+
+  bool list_buckets{false};
+  bool stat_buckets{false};
+  bool check_objects{false};
+  bool fix_index{false};
+  bool delete_child_objects{false};
+  bool bucket_stored{false};
+  bool sync_bucket{true};
+  bool dump_keys{false};
+  bool hide_progress{false};
+  int max_aio = 0;
+  ceph::timespan min_age = std::chrono::hours::zero();
+
+  std::unique_ptr<rgw::sal::Bucket> bucket;
+
+  RGWQuotaInfo quota;
+  RGWRateLimitInfo ratelimit_info;
+
+  void set_fetch_stats(bool value) { stat_buckets = value; }
+  void set_check_objects(bool value) { check_objects = value; }
+  void set_fix_index(bool value) { fix_index = value; }
+  void set_delete_children(bool value) { delete_child_objects = value; }
+  void set_hide_progress(bool value) { hide_progress = value; }
+  void set_dump_keys(bool value) { dump_keys = value; }
+
+  void set_max_aio(int value) { max_aio = value; }
+  void set_min_age(ceph::timespan value) { min_age = value; }
+
+  // empty user ids are ignored so a previously-set uid is not clobbered
+  void set_user_id(const rgw_user& user_id) {
+    if (!user_id.empty())
+      uid = user_id;
+  }
+  void set_tenant(const std::string& tenant_str) {
+    uid.tenant = tenant_str;
+  }
+  void set_bucket_name(const std::string& bucket_str) {
+    bucket_name = bucket_str;
+  }
+  void set_object(std::string& object_str) {
+    object_name = object_str;
+  }
+  void set_new_bucket_name(std::string& new_bucket_str) {
+    new_bucket_name = new_bucket_str;
+  }
+  void set_quota(RGWQuotaInfo& value) {
+    quota = value;
+  }
+  void set_bucket_ratelimit(RGWRateLimitInfo& value) {
+    ratelimit_info = value;
+  }
+
+
+  void set_sync_bucket(bool value) { sync_bucket = value; }
+
+  rgw_user& get_user_id() { return uid; }
+  std::string& get_user_display_name() { return display_name; }
+  std::string& get_bucket_name() { return bucket_name; }
+  std::string& get_object_name() { return object_name; }
+  std::string& get_tenant() { return uid.tenant; }
+
+  rgw::sal::Bucket* get_bucket() { return bucket.get(); }
+  // takes ownership and marks the bucket as resolved/stored
+  void set_bucket(std::unique_ptr<rgw::sal::Bucket> _bucket) {
+    bucket = std::move(_bucket);
+    bucket_stored = true;
+  }
+
+  void set_bucket_id(const std::string& bi) {
+    bucket_id = bi;
+  }
+  const std::string& get_bucket_id() { return bucket_id; }
+
+  bool will_fetch_stats() { return stat_buckets; }
+  bool will_fix_index() { return fix_index; }
+  bool will_delete_children() { return delete_child_objects; }
+  bool will_check_objects() { return check_objects; }
+  bool is_user_op() { return !uid.empty(); }
+  bool is_system_op() { return uid.empty(); }
+  bool has_bucket_stored() { return bucket_stored; }
+  int get_max_aio() { return max_aio; }
+  bool will_sync_bucket() { return sync_bucket; }
+
+  RGWBucketAdminOpState() = default;
+};
+
+
+/*
+ * A simple wrapper class for administrative bucket operations
+ */
+/*
+ * A simple wrapper class for administrative bucket operations (check/fix
+ * index, chown, quota, policy, sync toggling). init() must be called
+ * before any other operation. Uses in-class member initializers and
+ * nullptr instead of the NULL macro (the ctor previously did this by hand).
+ */
+class RGWBucket {
+  RGWUserBuckets buckets;
+  rgw::sal::Driver* driver{nullptr};
+  RGWAccessHandle handle{nullptr};
+
+  std::unique_ptr<rgw::sal::Bucket> bucket;
+  std::unique_ptr<rgw::sal::User> user;
+
+  bool failure{false};
+
+  RGWObjVersionTracker ep_objv; // entrypoint object version
+
+public:
+  RGWBucket() = default;
+  // resolve driver/user/bucket from op_state; err_msg receives a
+  // human-readable failure description when provided
+  int init(rgw::sal::Driver* storage, RGWBucketAdminOpState& op_state, optional_yield y,
+           const DoutPrefixProvider *dpp, std::string *err_msg = NULL);
+
+  int check_bad_index_multipart(RGWBucketAdminOpState& op_state,
+                                RGWFormatterFlusher& flusher,
+                                const DoutPrefixProvider *dpp, std::string *err_msg = NULL);
+
+  int check_object_index(const DoutPrefixProvider *dpp,
+                         RGWBucketAdminOpState& op_state,
+                         RGWFormatterFlusher& flusher,
+                         optional_yield y,
+                         std::string *err_msg = NULL);
+  int check_index_olh(rgw::sal::RadosStore* rados_store, const DoutPrefixProvider *dpp, RGWBucketAdminOpState& op_state,
+                      RGWFormatterFlusher& flusher);
+  int check_index_unlinked(rgw::sal::RadosStore* rados_store, const DoutPrefixProvider *dpp, RGWBucketAdminOpState& op_state,
+                           RGWFormatterFlusher& flusher);
+
+  int check_index(const DoutPrefixProvider *dpp,
+                  RGWBucketAdminOpState& op_state,
+                  std::map<RGWObjCategory, RGWStorageStats>& existing_stats,
+                  std::map<RGWObjCategory, RGWStorageStats>& calculated_stats,
+                  std::string *err_msg = NULL);
+
+  int chown(RGWBucketAdminOpState& op_state, const std::string& marker,
+            optional_yield y, const DoutPrefixProvider *dpp, std::string *err_msg = NULL);
+  int set_quota(RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp, std::string *err_msg = NULL);
+
+  int remove_object(const DoutPrefixProvider *dpp, RGWBucketAdminOpState& op_state, std::string *err_msg = NULL);
+  int policy_bl_to_stream(bufferlist& bl, std::ostream& o);
+  int get_policy(RGWBucketAdminOpState& op_state, RGWAccessControlPolicy& policy, optional_yield y, const DoutPrefixProvider *dpp);
+  int sync(RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp, std::string *err_msg = NULL);
+
+  void clear_failure() { failure = false; }
+
+  const RGWBucketInfo& get_bucket_info() const { return bucket->get_info(); }
+};
+
+// Stateless facade of static helpers behind the radosgw-admin bucket
+// subcommands; each wraps RGWBucket setup plus one operation.
+class RGWBucketAdminOp {
+public:
+  // policy inspection
+  static int get_policy(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+                        RGWFormatterFlusher& flusher, const DoutPrefixProvider *dpp);
+  static int get_policy(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+                        RGWAccessControlPolicy& policy, const DoutPrefixProvider *dpp);
+  static int dump_s3_policy(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+                            std::ostream& os, const DoutPrefixProvider *dpp);
+
+  // user/bucket ownership
+  static int unlink(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp);
+  static int link(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp, std::string *err_msg = NULL);
+  static int chown(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const std::string& marker, const DoutPrefixProvider *dpp, std::string *err_msg = NULL);
+
+  // index consistency checks
+  static int check_index(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+                         RGWFormatterFlusher& flusher, optional_yield y, const DoutPrefixProvider *dpp);
+  static int check_index_olh(rgw::sal::RadosStore* driver, RGWBucketAdminOpState& op_state,
+                             RGWFormatterFlusher& flusher, const DoutPrefixProvider *dpp);
+  static int check_index_unlinked(rgw::sal::RadosStore* driver, RGWBucketAdminOpState& op_state,
+                                  RGWFormatterFlusher& flusher, const DoutPrefixProvider *dpp);
+
+  // removal / info / quota
+  static int remove_bucket(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, optional_yield y,
+                           const DoutPrefixProvider *dpp, bool bypass_gc = false, bool keep_index_consistent = true);
+  static int remove_object(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp);
+  static int info(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, RGWFormatterFlusher& flusher, optional_yield y, const DoutPrefixProvider *dpp);
+  static int limit_check(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+                         const std::list<std::string>& user_ids,
+                         RGWFormatterFlusher& flusher, optional_yield y,
+                         const DoutPrefixProvider *dpp,
+                         bool warnings_only = false);
+  static int set_quota(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp);
+
+  // maintenance / repair
+  static int list_stale_instances(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+                                  RGWFormatterFlusher& flusher, const DoutPrefixProvider *dpp);
+
+  static int clear_stale_instances(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+                                   RGWFormatterFlusher& flusher, const DoutPrefixProvider *dpp);
+  static int fix_lc_shards(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+                           RGWFormatterFlusher& flusher, const DoutPrefixProvider *dpp);
+  static int fix_obj_expiry(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state,
+                            RGWFormatterFlusher& flusher, const DoutPrefixProvider *dpp, bool dry_run = false);
+
+  static int sync_bucket(rgw::sal::Driver* driver, RGWBucketAdminOpState& op_state, const DoutPrefixProvider *dpp, std::string *err_msg = NULL);
+};
+
+// Pre-fetched bucket entrypoint state handed into link operations so they
+// can skip re-reading it; holds references, so the referenced ep/attrs
+// must outlive this struct.
+struct rgw_ep_info {
+  RGWBucketEntryPoint &ep;
+  std::map<std::string, buffer::list>& attrs;
+  RGWObjVersionTracker ep_objv;
+  rgw_ep_info(RGWBucketEntryPoint &ep, std::map<std::string, bufferlist>& attrs)
+    : ep(ep), attrs(attrs) {}
+};
+
+// Control layer over the bucket services: CRUD for bucket entrypoint and
+// bucket-instance metadata, user<->bucket linkage, bucket stats, and sync
+// policy lookups. Operations run inside metadata-backend contexts obtained
+// from the entrypoint/instance handlers.
+class RGWBucketCtl {
+  CephContext *cct;
+
+  struct Svc {
+    RGWSI_Zone *zone{nullptr};
+    RGWSI_Bucket *bucket{nullptr};
+    RGWSI_Bucket_Sync *bucket_sync{nullptr};
+    RGWSI_BucketIndex *bi{nullptr};
+    RGWSI_User* user = nullptr;
+  } svc;
+
+  struct Ctl {
+    RGWUserCtl *user{nullptr};
+  } ctl;
+
+  RGWBucketMetadataHandler *bm_handler;
+  RGWBucketInstanceMetadataHandler *bmi_handler;
+
+  RGWSI_Bucket_BE_Handler bucket_be_handler; /* bucket backend handler */
+  RGWSI_BucketInstance_BE_Handler bi_be_handler; /* bucket instance backend handler */
+
+  // run f under a combined entrypoint + instance metadata context
+  int call(std::function<int(RGWSI_Bucket_X_Ctx& ctx)> f);
+
+public:
+  RGWBucketCtl(RGWSI_Zone *zone_svc,
+               RGWSI_Bucket *bucket_svc,
+               RGWSI_Bucket_Sync *bucket_sync_svc,
+               RGWSI_BucketIndex *bi_svc,
+               RGWSI_User* user_svc);
+
+  void init(RGWUserCtl *user_ctl,
+            RGWBucketMetadataHandler *_bm_handler,
+            RGWBucketInstanceMetadataHandler *_bmi_handler,
+            RGWDataChangesLog *datalog,
+            const DoutPrefixProvider *dpp);
+
+  // builder-style parameter packs for bucket *entrypoint* operations
+  struct Bucket {
+    struct GetParams {
+      RGWObjVersionTracker *objv_tracker{nullptr};
+      real_time *mtime{nullptr};
+      std::map<std::string, bufferlist> *attrs{nullptr};
+      rgw_cache_entry_info *cache_info{nullptr};
+      boost::optional<obj_version> refresh_version;
+      std::optional<RGWSI_MetaBackend_CtxParams> bectx_params;
+
+      GetParams() {}
+
+      GetParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
+        objv_tracker = _objv_tracker;
+        return *this;
+      }
+
+      GetParams& set_mtime(ceph::real_time *_mtime) {
+        mtime = _mtime;
+        return *this;
+      }
+
+      GetParams& set_attrs(std::map<std::string, bufferlist> *_attrs) {
+        attrs = _attrs;
+        return *this;
+      }
+
+      GetParams& set_cache_info(rgw_cache_entry_info *_cache_info) {
+        cache_info = _cache_info;
+        return *this;
+      }
+
+      GetParams& set_refresh_version(const obj_version& _refresh_version) {
+        refresh_version = _refresh_version;
+        return *this;
+      }
+
+      GetParams& set_bectx_params(std::optional<RGWSI_MetaBackend_CtxParams> _bectx_params) {
+        bectx_params = _bectx_params;
+        return *this;
+      }
+    };
+
+    struct PutParams {
+      RGWObjVersionTracker *objv_tracker{nullptr};
+      ceph::real_time mtime;
+      bool exclusive{false};
+      std::map<std::string, bufferlist> *attrs{nullptr};
+
+      PutParams() {}
+
+      PutParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
+        objv_tracker = _objv_tracker;
+        return *this;
+      }
+
+      PutParams& set_mtime(const ceph::real_time& _mtime) {
+        mtime = _mtime;
+        return *this;
+      }
+
+      PutParams& set_exclusive(bool _exclusive) {
+        exclusive = _exclusive;
+        return *this;
+      }
+
+      PutParams& set_attrs(std::map<std::string, bufferlist> *_attrs) {
+        attrs = _attrs;
+        return *this;
+      }
+    };
+
+    struct RemoveParams {
+      RGWObjVersionTracker *objv_tracker{nullptr};
+
+      RemoveParams() {}
+
+      RemoveParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
+        objv_tracker = _objv_tracker;
+        return *this;
+      }
+    };
+  };
+
+  // builder-style parameter packs for bucket *instance* operations
+  struct BucketInstance {
+    struct GetParams {
+      real_time *mtime{nullptr};
+      std::map<std::string, bufferlist> *attrs{nullptr};
+      rgw_cache_entry_info *cache_info{nullptr};
+      boost::optional<obj_version> refresh_version;
+      RGWObjVersionTracker *objv_tracker{nullptr};
+      std::optional<RGWSI_MetaBackend_CtxParams> bectx_params;
+
+      GetParams() {}
+
+      GetParams& set_mtime(ceph::real_time *_mtime) {
+        mtime = _mtime;
+        return *this;
+      }
+
+      GetParams& set_attrs(std::map<std::string, bufferlist> *_attrs) {
+        attrs = _attrs;
+        return *this;
+      }
+
+      GetParams& set_cache_info(rgw_cache_entry_info *_cache_info) {
+        cache_info = _cache_info;
+        return *this;
+      }
+
+      GetParams& set_refresh_version(const obj_version& _refresh_version) {
+        refresh_version = _refresh_version;
+        return *this;
+      }
+
+      GetParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
+        objv_tracker = _objv_tracker;
+        return *this;
+      }
+
+      GetParams& set_bectx_params(std::optional<RGWSI_MetaBackend_CtxParams> _bectx_params) {
+        bectx_params = _bectx_params;
+        return *this;
+      }
+    };
+
+    struct PutParams {
+      std::optional<RGWBucketInfo *> orig_info; /* nullopt: orig_info was not fetched,
+                                                   nullptr: orig_info was not found (new bucket instance) */
+      ceph::real_time mtime;
+      bool exclusive{false};
+      std::map<std::string, bufferlist> *attrs{nullptr};
+      RGWObjVersionTracker *objv_tracker{nullptr};
+
+      PutParams() {}
+
+      PutParams& set_orig_info(RGWBucketInfo *pinfo) {
+        orig_info = pinfo;
+        return *this;
+      }
+
+      PutParams& set_mtime(const ceph::real_time& _mtime) {
+        mtime = _mtime;
+        return *this;
+      }
+
+      PutParams& set_exclusive(bool _exclusive) {
+        exclusive = _exclusive;
+        return *this;
+      }
+
+      PutParams& set_attrs(std::map<std::string, bufferlist> *_attrs) {
+        attrs = _attrs;
+        return *this;
+      }
+
+      PutParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
+        objv_tracker = _objv_tracker;
+        return *this;
+      }
+    };
+
+    struct RemoveParams {
+      RGWObjVersionTracker *objv_tracker{nullptr};
+
+      RemoveParams() {}
+
+      RemoveParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
+        objv_tracker = _objv_tracker;
+        return *this;
+      }
+    };
+  };
+
+  /* bucket entrypoint */
+  int read_bucket_entrypoint_info(const rgw_bucket& bucket,
+                                  RGWBucketEntryPoint *info,
+                                  optional_yield y,
+                                  const DoutPrefixProvider *dpp,
+                                  const Bucket::GetParams& params = {});
+  int store_bucket_entrypoint_info(const rgw_bucket& bucket,
+                                   RGWBucketEntryPoint& info,
+                                   optional_yield y,
+                                   const DoutPrefixProvider *dpp,
+                                   const Bucket::PutParams& params = {});
+  int remove_bucket_entrypoint_info(const rgw_bucket& bucket,
+                                    optional_yield y,
+                                    const DoutPrefixProvider *dpp,
+                                    const Bucket::RemoveParams& params = {});
+
+  /* bucket instance */
+  int read_bucket_instance_info(const rgw_bucket& bucket,
+                                RGWBucketInfo *info,
+                                optional_yield y,
+                                const DoutPrefixProvider *dpp,
+                                const BucketInstance::GetParams& params = {});
+  int store_bucket_instance_info(const rgw_bucket& bucket,
+                                 RGWBucketInfo& info,
+                                 optional_yield y,
+                                 const DoutPrefixProvider *dpp,
+                                 const BucketInstance::PutParams& params = {});
+  int remove_bucket_instance_info(const rgw_bucket& bucket,
+                                  RGWBucketInfo& info,
+                                  optional_yield y,
+                                  const DoutPrefixProvider *dpp,
+                                  const BucketInstance::RemoveParams& params = {});
+
+  /*
+   * bucket_id may or may not be provided
+   *
+   * ep_objv_tracker might not be populated even if provided. Will only be set if entrypoint is read
+   * (that is: if bucket_id is empty).
+   */
+  int read_bucket_info(const rgw_bucket& bucket,
+                       RGWBucketInfo *info,
+                       optional_yield y,
+                       const DoutPrefixProvider *dpp,
+                       const BucketInstance::GetParams& params = {},
+                       RGWObjVersionTracker *ep_objv_tracker = nullptr);
+
+
+  int set_bucket_instance_attrs(RGWBucketInfo& bucket_info,
+                                std::map<std::string, bufferlist>& attrs,
+                                RGWObjVersionTracker *objv_tracker,
+                                optional_yield y,
+                                const DoutPrefixProvider *dpp);
+
+  /* user/bucket */
+  int link_bucket(const rgw_user& user_id,
+                  const rgw_bucket& bucket,
+                  ceph::real_time creation_time,
+                  optional_yield y,
+                  const DoutPrefixProvider *dpp,
+                  bool update_entrypoint = true,
+                  rgw_ep_info *pinfo = nullptr);
+
+  int unlink_bucket(const rgw_user& user_id,
+                    const rgw_bucket& bucket,
+                    optional_yield y,
+                    const DoutPrefixProvider *dpp,
+                    bool update_entrypoint = true);
+
+  int read_buckets_stats(std::map<std::string, RGWBucketEnt>& m,
+                         optional_yield y,
+                         const DoutPrefixProvider *dpp);
+
+  int read_bucket_stats(const rgw_bucket& bucket,
+                        RGWBucketEnt *result,
+                        optional_yield y,
+                        const DoutPrefixProvider *dpp);
+
+  /* quota related */
+  int sync_user_stats(const DoutPrefixProvider *dpp,
+                      const rgw_user& user_id, const RGWBucketInfo& bucket_info,
+                      optional_yield y,
+                      RGWBucketEnt* pent);
+
+  /* bucket sync */
+  int get_sync_policy_handler(std::optional<rgw_zone_id> zone,
+                              std::optional<rgw_bucket> bucket,
+                              RGWBucketSyncPolicyHandlerRef *phandler,
+                              optional_yield y,
+                              const DoutPrefixProvider *dpp);
+  int bucket_exports_data(const rgw_bucket& bucket,
+                          optional_yield y,
+                          const DoutPrefixProvider *dpp);
+  int bucket_imports_data(const rgw_bucket& bucket,
+                          optional_yield y,
+                          const DoutPrefixProvider *dpp);
+
+private:
+  // internal helpers that assume a context is already held
+  int convert_old_bucket_info(RGWSI_Bucket_X_Ctx& ctx,
+                              const rgw_bucket& bucket,
+                              optional_yield y,
+                              const DoutPrefixProvider *dpp);
+
+  int do_store_bucket_instance_info(RGWSI_Bucket_BI_Ctx& ctx,
+                                    const rgw_bucket& bucket,
+                                    RGWBucketInfo& info,
+                                    optional_yield y,
+                                    const DoutPrefixProvider *dpp,
+                                    const BucketInstance::PutParams& params);
+
+  int do_store_linked_bucket_info(RGWSI_Bucket_X_Ctx& ctx,
+                                  RGWBucketInfo& info,
+                                  RGWBucketInfo *orig_info,
+                                  bool exclusive, real_time mtime,
+                                  obj_version *pep_objv,
+                                  std::map<std::string, bufferlist> *pattrs,
+                                  bool create_entry_point,
+                                  optional_yield,
+                                  const DoutPrefixProvider *dpp);
+
+  int do_link_bucket(RGWSI_Bucket_EP_Ctx& ctx,
+                     const rgw_user& user,
+                     const rgw_bucket& bucket,
+                     ceph::real_time creation_time,
+                     bool update_entrypoint,
+                     rgw_ep_info *pinfo,
+                     optional_yield y,
+                     const DoutPrefixProvider *dpp);
+
+  int do_unlink_bucket(RGWSI_Bucket_EP_Ctx& ctx,
+                       const rgw_user& user_id,
+                       const rgw_bucket& bucket,
+                       bool update_entrypoint,
+                       optional_yield y,
+                       const DoutPrefixProvider *dpp);
+
+};
+
+// Locate a bucket by its instance id, resuming from 'marker'; on success
+// fills *bucket_out and returns true.
+bool rgw_find_bucket_by_id(const DoutPrefixProvider *dpp, CephContext *cct, rgw::sal::Driver* driver, const std::string& marker,
+                           const std::string& bucket_id, rgw_bucket* bucket_out);
diff --git a/src/rgw/driver/rados/rgw_bucket_sync.cc b/src/rgw/driver/rados/rgw_bucket_sync.cc
new file mode 100644
index 000000000..6ff76c16a
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_bucket_sync.cc
@@ -0,0 +1,1018 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_common.h"
+#include "rgw_bucket_sync.h"
+#include "rgw_data_sync.h"
+#include "rgw_zone.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_bucket_sync.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+// Render a bucket-sync entity as {b=<bucket key>,z=<zone id>,az=<all_zones>}.
+ostream& operator<<(ostream& os, const rgw_sync_bucket_entity& e) {
+  return os << "{b=" << rgw_sync_bucket_entities::bucket_key(e.bucket)
+            << ",z=" << e.zone.value_or(rgw_zone_id())
+            << ",az=" << (int)e.all_zones << "}";
+}
+
+// Render a concrete pipe as {s=<source entity>,d=<dest entity>}.
+ostream& operator<<(ostream& os, const rgw_sync_bucket_pipe& pipe) {
+  return os << "{s=" << pipe.source << ",d=" << pipe.dest << "}";
+}
+
+// Render an entity set as {b=<bucket key>,z=<zone set>}; an unset zone
+// list prints as an empty set.
+ostream& operator<<(ostream& os, const rgw_sync_bucket_entities& e) {
+  return os << "{b=" << rgw_sync_bucket_entities::bucket_key(e.bucket)
+            << ",z=" << e.zones.value_or(std::set<rgw_zone_id>()) << "}";
+}
+
+// Render a configured pipe group as {id=<id>,s=<sources>,d=<dests>}.
+ostream& operator<<(ostream& os, const rgw_sync_bucket_pipes& pipe) {
+  return os << "{id=" << pipe.id
+            << ",s=" << pipe.source
+            << ",d=" << pipe.dest << "}";
+}
+
+// Select the configured pipe groups that connect source_zone -> dest_zone,
+// expand each group into concrete single-bucket pipes, and pin both
+// endpoints to the given zones. Pipes that don't match either zone are
+// dropped.
+static std::vector<rgw_sync_bucket_pipe> filter_relevant_pipes(const std::vector<rgw_sync_bucket_pipes>& pipes,
+                                                               const rgw_zone_id& source_zone,
+                                                               const rgw_zone_id& dest_zone)
+{
+  std::vector<rgw_sync_bucket_pipe> relevant_pipes;
+  for (auto& p : pipes) {
+    if (p.source.match_zone(source_zone) &&
+        p.dest.match_zone(dest_zone)) {
+      // iterate by value on purpose: each expanded pipe is mutated
+      // (zones applied) before being stored
+      for (auto pipe : p.expand()) {
+        pipe.source.apply_zone(source_zone);
+        pipe.dest.apply_zone(dest_zone);
+        relevant_pipes.push_back(pipe);
+      }
+    }
+  }
+
+  return relevant_pipes;
+}
+
+// A bucket entity with an empty name acts as a wildcard (matches any bucket).
+static bool is_wildcard_bucket(const rgw_bucket& bucket)
+{
+  return bucket.name.empty();
+}
+
+// JSON-dump this pipe map: owning zone, bucket key, and both the
+// source and dest pipe multimaps.
+void rgw_sync_group_pipe_map::dump(ceph::Formatter *f) const
+{
+  encode_json("zone", zone.id, f);
+  encode_json("buckets", rgw_sync_bucket_entities::bucket_key(bucket), f);
+  encode_json("sources", sources, f);
+  encode_json("dests", dests, f);
+}
+
+
+// Insert into *pipe_map every pipe from 'pipes' that connects
+// source_zone -> dest_zone. filter_cb is consulted first at zone
+// granularity (no buckets) and can veto the whole pair; for each
+// expanded pipe, call_filter_cb both fills in the map key (*zb, the
+// zone/bucket entity) and decides whether to keep the pipe.
+template <typename CB1, typename CB2>
+void rgw_sync_group_pipe_map::try_add_to_pipe_map(const rgw_zone_id& source_zone,
+                                                  const rgw_zone_id& dest_zone,
+                                                  const std::vector<rgw_sync_bucket_pipes>& pipes,
+                                                  zb_pipe_map_t *pipe_map,
+                                                  CB1 filter_cb,
+                                                  CB2 call_filter_cb)
+{
+  if (!filter_cb(source_zone, nullopt, dest_zone, nullopt)) {
+    return;  // flow between these two zones is not allowed at all
+  }
+  auto relevant_pipes = filter_relevant_pipes(pipes, source_zone, dest_zone);
+
+  for (auto& pipe : relevant_pipes) {
+    rgw_sync_bucket_entity zb;
+    if (!call_filter_cb(pipe, &zb)) {
+      continue;
+    }
+    pipe_map->insert(make_pair(zb, pipe));
+  }
+}
+
+// Register pipes that pull data INTO this zone from source_zone,
+// keyed by the source zone/bucket entity. filter_cb can veto each pipe.
+template <typename CB>
+void rgw_sync_group_pipe_map::try_add_source(const rgw_zone_id& source_zone,
+                                             const rgw_zone_id& dest_zone,
+                                             const std::vector<rgw_sync_bucket_pipes>& pipes,
+                                             CB filter_cb)
+{
+  return try_add_to_pipe_map(source_zone, dest_zone, pipes,
+                             &sources,
+                             filter_cb,
+                             [&](const rgw_sync_bucket_pipe& pipe, rgw_sync_bucket_entity *zb) {
+                               *zb = rgw_sync_bucket_entity{source_zone, pipe.source.get_bucket()};
+                               return filter_cb(source_zone, zb->bucket, dest_zone, pipe.dest.get_bucket());
+                             });
+}
+
+// Register pipes that push data OUT of this zone to dest_zone,
+// keyed by the destination zone/bucket entity. filter_cb can veto each pipe.
+template <typename CB>
+void rgw_sync_group_pipe_map::try_add_dest(const rgw_zone_id& source_zone,
+                                           const rgw_zone_id& dest_zone,
+                                           const std::vector<rgw_sync_bucket_pipes>& pipes,
+                                           CB filter_cb)
+{
+  return try_add_to_pipe_map(source_zone, dest_zone, pipes,
+                             &dests,
+                             filter_cb,
+                             [&](const rgw_sync_bucket_pipe& pipe, rgw_sync_bucket_entity *zb) {
+                               *zb = rgw_sync_bucket_entity{dest_zone, pipe.dest.get_bucket()};
+                               return filter_cb(source_zone, pipe.source.get_bucket(), dest_zone, zb->bucket);
+                             });
+}
+
+using zb_pipe_map_t = rgw_sync_group_pipe_map::zb_pipe_map_t;
+
+// Look up the range of pipes in 'm' keyed by {zone, b}. If no pipes are
+// registered for the specific bucket, fall back to the zone's wildcard
+// entry (empty bucket). Passing nullopt for b queries the wildcard entry
+// directly.
+pair<zb_pipe_map_t::const_iterator, zb_pipe_map_t::const_iterator> rgw_sync_group_pipe_map::find_pipes(const zb_pipe_map_t& m,
+                                                                                                       const rgw_zone_id& zone,
+                                                                                                       std::optional<rgw_bucket> b) const
+{
+  if (!b) {
+    return m.equal_range(rgw_sync_bucket_entity{zone, rgw_bucket()});
+  }
+
+  auto zb = rgw_sync_bucket_entity{zone, *b};
+
+  auto range = m.equal_range(zb);
+  if (range.first == range.second &&
+      !is_wildcard_bucket(*b)) {
+    /* couldn't find the specific bucket, try to find by wildcard */
+    zb.bucket = rgw_bucket();
+    range = m.equal_range(zb);
+  }
+
+  return range;
+}
+
+
+// Build this group's source/dest pipe maps for (zone, bucket) from one
+// sync policy group. Only pipes touching the zone/bucket are collected;
+// the group's data flow (or *_default_flow when the group defines none)
+// is then walked to register pipes per peer zone. Every participating
+// zone is accumulated into *_pall_zones. filter_cb(source_zone,
+// source_bucket, dest_zone, dest_bucket) can veto any pipe.
+template <typename CB>
+void rgw_sync_group_pipe_map::init(const DoutPrefixProvider *dpp,
+                                   CephContext *cct,
+                                   const rgw_zone_id& _zone,
+                                   std::optional<rgw_bucket> _bucket,
+                                   const rgw_sync_policy_group& group,
+                                   rgw_sync_data_flow_group *_default_flow,
+                                   std::set<rgw_zone_id> *_pall_zones,
+                                   CB filter_cb) {
+  zone = _zone;
+  bucket = _bucket;
+  default_flow = _default_flow;
+  pall_zones = _pall_zones;
+
+  // (removed an unused local 'rgw_sync_bucket_entity zb(zone, bucket)'
+  // that was constructed and never read)
+
+  status = group.status;
+
+  std::vector<rgw_sync_bucket_pipes> zone_pipes;
+
+  string bucket_key = (bucket ? bucket->get_key() : "*");
+
+  /* only look at pipes that touch the specific zone and bucket */
+  for (auto& pipe : group.pipes) {
+    if (pipe.contains_zone_bucket(zone, bucket)) {
+      ldpp_dout(dpp, 20) << __func__ << "(): pipe_map (zone=" << zone << " bucket=" << bucket_key << "): adding potential pipe: " << pipe << dendl;
+      zone_pipes.push_back(pipe);
+    }
+  }
+
+  const rgw_sync_data_flow_group *pflow;
+
+  if (!group.data_flow.empty()) {
+    pflow = &group.data_flow;
+  } else {
+    if (!default_flow) {
+      return;  // no flow configured anywhere: nothing to register
+    }
+    pflow = default_flow;
+  }
+
+  auto& flow = *pflow;
+
+  pall_zones->insert(zone);
+
+  /* symmetrical: every other zone in the set is both a source and a dest */
+  for (auto& symmetrical_group : flow.symmetrical) {
+    if (symmetrical_group.zones.find(zone) != symmetrical_group.zones.end()) {
+      for (auto& z : symmetrical_group.zones) {
+        if (z != zone) {
+          pall_zones->insert(z);
+          try_add_source(z, zone, zone_pipes, filter_cb);
+          try_add_dest(zone, z, zone_pipes, filter_cb);
+        }
+      }
+    }
+  }
+
+  /* directional: register only the side of the rule this zone sits on */
+  for (auto& rule : flow.directional) {
+    if (rule.source_zone == zone) {
+      pall_zones->insert(rule.dest_zone);
+      try_add_dest(zone, rule.dest_zone, zone_pipes, filter_cb);
+    } else if (rule.dest_zone == zone) {
+      pall_zones->insert(rule.source_zone);
+      try_add_source(rule.source_zone, zone, zone_pipes, filter_cb);
+    }
+  }
+}
+
+/*
+ * find all relevant pipes in our zone that match {dest_bucket} <- {source_zone, source_bucket}
+ */
+vector<rgw_sync_bucket_pipe> rgw_sync_group_pipe_map::find_source_pipes(const rgw_zone_id& source_zone,
+                                                                        std::optional<rgw_bucket> source_bucket,
+                                                                        std::optional<rgw_bucket> dest_bucket) const {
+  vector<rgw_sync_bucket_pipe> result;
+
+  auto range = find_pipes(sources, source_zone, source_bucket);
+
+  for (auto iter = range.first; iter != range.second; ++iter) {
+    // bind by const reference: the previous by-value binding copied the
+    // pipe even when the dest-bucket filter rejected it
+    const auto& pipe = iter->second;
+    if (pipe.dest.match_bucket(dest_bucket)) {
+      result.push_back(pipe);
+    }
+  }
+  return result;
+}
+
+/*
+ * find all relevant pipes in other zones that pull from a specific
+ * source bucket in our zone {source_bucket} -> {dest_zone, dest_bucket}
+ */
+vector<rgw_sync_bucket_pipe> rgw_sync_group_pipe_map::find_dest_pipes(std::optional<rgw_bucket> source_bucket,
+                                                                      const rgw_zone_id& dest_zone,
+                                                                      std::optional<rgw_bucket> dest_bucket) const {
+  vector<rgw_sync_bucket_pipe> result;
+
+  auto range = find_pipes(dests, dest_zone, dest_bucket);
+
+  for (auto iter = range.first; iter != range.second; ++iter) {
+    // bind by const reference: the previous by-value binding copied the
+    // pipe even when the source-bucket filter rejected it
+    const auto& pipe = iter->second;
+    if (pipe.source.match_bucket(source_bucket)) {
+      result.push_back(pipe);
+    }
+  }
+
+  return result;
+}
+
+/*
+ * find all relevant pipes from {source_zone, source_bucket} -> {dest_zone, dest_bucket}
+ */
+vector<rgw_sync_bucket_pipe> rgw_sync_group_pipe_map::find_pipes(const rgw_zone_id& source_zone,
+                                                                 std::optional<rgw_bucket> source_bucket,
+                                                                 const rgw_zone_id& dest_zone,
+                                                                 std::optional<rgw_bucket> dest_bucket) const {
+  // this zone can only be one endpoint of a relevant pipe: when it is
+  // the destination, search the sources map; when it is the source,
+  // search the dests map; otherwise nothing here applies
+  if (dest_zone == zone) {
+    return find_source_pipes(source_zone, source_bucket, dest_bucket);
+  }
+  if (source_zone == zone) {
+    return find_dest_pipes(source_bucket, dest_zone, dest_bucket);
+  }
+  return {};
+}
+
+// Store a pipe and index it by its source prefix filter and by each of
+// its source tag filters. prefix_refs/tag_refs hold pointers into
+// 'pipes', which must therefore be a container with stable element
+// addresses (pushing back must not invalidate earlier pointers).
+void RGWBucketSyncFlowManager::pipe_rules::insert(const rgw_sync_bucket_pipe& pipe)
+{
+  pipes.push_back(pipe);
+
+  auto ppipe = &pipes.back();
+  auto prefix = ppipe->params.source.filter.prefix.value_or(string());
+
+  prefix_refs.insert(make_pair(prefix, ppipe));
+
+  for (auto& t : ppipe->params.source.filter.tags) {
+    string tag = t.key + "=" + t.value;
+    auto titer = tag_refs.find(tag);
+    // for a duplicate tag, keep whichever pipe has the higher priority
+    if (titer != tag_refs.end() &&
+        ppipe->params.priority > titer->second->params.priority) {
+      titer->second = ppipe;
+    } else {
+      tag_refs[tag] = ppipe;
+    }
+  }
+}
+
+// Resolve the sync parameters (user, ACL-translation owner, storage
+// class, mode) that would apply to 'key', considering only prefix
+// filters (tags are unknown at this point). Returns true when the
+// params are unambiguous. *need_more_info is set when the answer cannot
+// be determined without reading the source object's tags, or when the
+// matching highest-priority rules disagree on their params.
+bool RGWBucketSyncFlowManager::pipe_rules::find_basic_info_without_tags(const rgw_obj_key& key,
+                                                                        std::optional<rgw_user> *user,
+                                                                        std::optional<rgw_user> *acl_translation_owner,
+                                                                        std::optional<string> *storage_class,
+                                                                        rgw_sync_pipe_params::Mode *mode,
+                                                                        bool *need_more_info) const
+{
+  // NOTE(review): 'owner' appears unused in this function
+  std::optional<string> owner;
+
+  *need_more_info = false;
+
+  if (prefix_refs.empty()) {
+    return false;
+  }
+
+  // candidate rules are those whose prefix could be a prefix of the key;
+  // step back one entry from upper_bound to reach the longest such prefix
+  auto end = prefix_refs.upper_bound(key.name);
+  auto iter = end;
+  if (iter != prefix_refs.begin()) {
+    --iter;
+  }
+  if (iter == prefix_refs.end()) {
+    return false;
+  }
+
+  if (iter != prefix_refs.begin()) {
+    iter = prefix_refs.find(iter->first); /* prefix_refs is a multimap: rewind to the
+                                             first element holding that key */
+  }
+
+  std::vector<decltype(iter)> iters;
+
+  std::optional<int> priority;
+
+  for (; iter != end; ++iter) {
+    auto& prefix = iter->first;
+    if (!boost::starts_with(key.name, prefix)) {
+      continue;
+    }
+
+    auto& rule_params = iter->second->params;
+    auto& filter = rule_params.source.filter;
+
+    // track only rules at the (strictly increasing) highest priority seen
+    if (rule_params.priority > priority) {
+      priority = rule_params.priority;
+
+      if (!filter.has_tags()) {
+        iters.clear();
+      }
+      iters.push_back(iter);
+
+      *need_more_info = filter.has_tags(); /* if highest priority filter has tags, then
+                                              we can't be sure if it would be used.
+                                              We need to first read the info from the source object */
+    }
+  }
+
+  if (iters.empty()) {
+    return false;
+  }
+
+  std::optional<rgw_user> _user;
+  std::optional<rgw_sync_pipe_acl_translation> _acl_translation;
+  std::optional<string> _storage_class;
+  rgw_sync_pipe_params::Mode _mode{rgw_sync_pipe_params::Mode::MODE_SYSTEM};
+
+  // make sure all params are the same by saving the first one
+  // encountered and comparing all subsequent to it
+  bool first_iter = true;
+  for (auto& iter : iters) {
+    const rgw_sync_pipe_params& rule_params = iter->second->params;
+    if (first_iter) {
+      _user = rule_params.user;
+      _acl_translation = rule_params.dest.acl_translation;
+      _storage_class = rule_params.dest.storage_class;
+      _mode = rule_params.mode;
+      first_iter = false;
+    } else {
+      // note: three of these == operators are comparing std::optional
+      // against std::optional; as one would expect they are equal a)
+      // if both do not contain values or b) if both do and those
+      // contained values are the same
+      const bool conflict =
+	!(_user == rule_params.user &&
+	  _acl_translation == rule_params.dest.acl_translation &&
+	  _storage_class == rule_params.dest.storage_class &&
+	  _mode == rule_params.mode);
+      if (conflict) {
+        *need_more_info = true;
+        return false;
+      }
+    }
+  }
+
+  *user = _user;
+  if (_acl_translation) {
+    *acl_translation_owner = _acl_translation->owner;
+  }
+  *storage_class = _storage_class;
+  *mode = _mode;
+
+  return true;
+}
+
+// Find the sync params for 'key' given the object's actual tags: among
+// the rules whose prefix matches the key AND whose tag filter accepts
+// 'tags', pick the one with the highest priority. Returns false when no
+// rule matches.
+bool RGWBucketSyncFlowManager::pipe_rules::find_obj_params(const rgw_obj_key& key,
+                                                           const RGWObjTags::tag_map_t& tags,
+                                                           rgw_sync_pipe_params *params) const
+{
+  if (prefix_refs.empty()) {
+    return false;
+  }
+
+  // step back one entry from upper_bound to reach the longest prefix
+  // that could match the key
+  auto iter = prefix_refs.upper_bound(key.name);
+  if (iter != prefix_refs.begin()) {
+    --iter;
+  }
+  if (iter == prefix_refs.end()) {
+    return false;
+  }
+
+  auto end = prefix_refs.upper_bound(key.name);
+  auto max = end;
+
+  std::optional<int> priority;
+
+  for (; iter != end; ++iter) {
+    /* NOTE: this is not the most efficient way to do it,
+     * a trie data structure would be better
+     */
+    auto& prefix = iter->first;
+    if (!boost::starts_with(key.name, prefix)) {
+      continue;
+    }
+
+    auto& rule_params = iter->second->params;
+    auto& filter = rule_params.source.filter;
+
+    if (!filter.check_tags(tags)) {
+      continue;
+    }
+
+    if (rule_params.priority > priority) {
+      priority = rule_params.priority;
+      max = iter;
+    }
+  }
+
+  if (max == end) {
+    return false;  // no rule matched both prefix and tags
+  }
+
+  *params = max->second->params;
+  return true;
+}
+
+/*
+ * Return an iterator to the prefix entry that covers 's' (i.e. 's'
+ * starts with that prefix), or the next entry after 's' if no prefix
+ * covers it; prefix_refs.end() when the map is empty.
+ */
+RGWBucketSyncFlowManager::pipe_rules::prefix_map_t::const_iterator RGWBucketSyncFlowManager::pipe_rules::prefix_search(const std::string& s) const
+{
+  if (prefix_refs.empty()) {
+    return prefix_refs.end();
+  }
+  auto next = prefix_refs.upper_bound(s);
+  auto iter = next;
+  if (iter != prefix_refs.begin()) {
+    --iter;
+  }
+  // the candidate (previous) entry only counts if it is a prefix of s
+  if (!boost::starts_with(s, iter->first)) {
+    return next;
+  }
+
+  return iter;
+}
+
+// Add a pipe to the set unless a previously disabled pipe matches it.
+// Also registers the pipe with the per-endpoint-pair rules object and
+// creates a handler for it.
+void RGWBucketSyncFlowManager::pipe_set::insert(const rgw_sync_bucket_pipe& pipe) {
+  /* Ensure this pipe doesn't match with any disabled pipes */
+  // iterate by const reference: iterating by value copied the whole map
+  // entry (id string + pipe) on every iteration
+  for (const auto& p : disabled_pipe_map) {
+    if (p.second.source.match(pipe.source) && p.second.dest.match(pipe.dest)) {
+      return;
+    }
+  }
+  pipe_map.insert(make_pair(pipe.id, pipe));
+
+  auto& rules_ref = rules[endpoints_pair(pipe)];
+
+  if (!rules_ref) {
+    rules_ref = make_shared<RGWBucketSyncFlowManager::pipe_rules>();
+  }
+
+  rules_ref->insert(pipe);
+
+  pipe_handler h(rules_ref, pipe);
+
+  handlers.insert(h);
+}
+
+// Reset the set to empty: drop all pipes, disabled entries, rules, and handlers.
+void RGWBucketSyncFlowManager::pipe_set::remove_all() {
+  pipe_map.clear();
+  disabled_pipe_map.clear();
+  rules.clear();
+  handlers.clear();
+}
+
+// Record 'pipe' as disabled and evict every already-inserted pipe it
+// matches, along with that pipe's rules entry and handler. Future
+// insert() calls check disabled_pipe_map first.
+void RGWBucketSyncFlowManager::pipe_set::disable(const rgw_sync_bucket_pipe& pipe) {
+  /* This pipe is disabled. Add it to disabled pipes & remove any
+   * matching pipes already inserted
+   */
+  disabled_pipe_map.insert(make_pair(pipe.id, pipe));
+  // advance the iterator before possibly erasing the current element
+  for (auto iter_p = pipe_map.begin(); iter_p != pipe_map.end(); ) {
+    auto p = iter_p++;
+    if (p->second.source.match(pipe.source) && p->second.dest.match(pipe.dest)) {
+      auto& rules_ref = rules[endpoints_pair(p->second)];
+      if (rules_ref) {
+        pipe_handler h(rules_ref, p->second);
+        handlers.erase(h);
+      }
+      rules.erase(endpoints_pair(p->second));
+      pipe_map.erase(p);
+    }
+  }
+}
+
+// JSON-dump the active pipes of this set.
+void RGWBucketSyncFlowManager::pipe_set::dump(ceph::Formatter *f) const
+{
+  encode_json("pipes", pipe_map, f);
+}
+
+// Decide whether data may flow from {source_zone, source_bucket} to
+// {dest_zone, dest_bucket} under this manager's flow groups. A single
+// FORBIDDEN group with a matching pipe vetoes the flow outright. When
+// check_activated is true, an ENABLED match answers immediately;
+// otherwise any ENABLED/ALLOWED match permits the flow.
+bool RGWBucketSyncFlowManager::allowed_data_flow(const rgw_zone_id& source_zone,
+                                                 std::optional<rgw_bucket> source_bucket,
+                                                 const rgw_zone_id& dest_zone,
+                                                 std::optional<rgw_bucket> dest_bucket,
+                                                 bool check_activated) const
+{
+  bool found = false;
+  bool found_activated = false;
+
+  // iterate by const reference: iterating by value copied every flow
+  // group map entry (including its pipe multimaps) per iteration
+  for (const auto& m : flow_groups) {
+    auto& fm = m.second;
+    auto pipes = fm.find_pipes(source_zone, source_bucket,
+                               dest_zone, dest_bucket);
+
+    bool is_found = !pipes.empty();
+
+    if (is_found) {
+      switch (fm.status) {
+        case rgw_sync_policy_group::Status::FORBIDDEN:
+          return false;
+        case rgw_sync_policy_group::Status::ENABLED:
+          found = true;
+          found_activated = true;
+          break;
+        case rgw_sync_policy_group::Status::ALLOWED:
+          found = true;
+          break;
+        default:
+          break; /* unknown -- ignore */
+      }
+    }
+  }
+
+  if (check_activated && found_activated) {
+    return true;
+  }
+
+  return found;
+}
+
+// Build one rgw_sync_group_pipe_map per policy group. When this manager
+// has a parent (i.e. it is bucket-level under a zonegroup-level one),
+// a default symmetric flow over the parent's zones is supplied for
+// groups that define no flow, and each pipe is filtered through the
+// parent's allowed_data_flow() so a bucket policy cannot exceed what
+// the zonegroup permits.
+void RGWBucketSyncFlowManager::init(const DoutPrefixProvider *dpp, const rgw_sync_policy_info& sync_policy) {
+  std::optional<rgw_sync_data_flow_group> default_flow;
+  if (parent) {
+    default_flow.emplace();
+    default_flow->init_default(parent->all_zones);
+  }
+
+  for (auto& item : sync_policy.groups) {
+    auto& group = item.second;
+    auto& flow_group_map = flow_groups[group.id];
+
+    flow_group_map.init(dpp, cct, zone_id, bucket, group,
+                        (default_flow ? &(*default_flow) : nullptr),
+                        &all_zones,
+                        [&](const rgw_zone_id& source_zone,
+                            std::optional<rgw_bucket> source_bucket,
+                            const rgw_zone_id& dest_zone,
+                            std::optional<rgw_bucket> dest_bucket) {
+                        if (!parent) {
+                          return true;
+                        }
+                        return parent->allowed_data_flow(source_zone,
+                                                         source_bucket,
+                                                         dest_zone,
+                                                         dest_bucket,
+                                                         false); /* just check that it's not disabled */
+                        });
+  }
+}
+
+/*
+* These are the semantics to be followed while resolving the policy
+* conflicts -
+*
+* ==================================================
+* zonegroup bucket Result
+* ==================================================
+* enabled enabled enabled
+* allowed enabled
+* forbidden disabled
+* allowed enabled enabled
+* allowed disabled
+* forbidden disabled
+* forbidden enabled disabled
+* allowed disabled
+* forbidden disabled
+*
+* In case multiple group policies are set to reflect for any sync pair
+* (<source-zone,source-bucket>, <dest-zone,dest-bucket>), the following
+* rules are applied in the order-
+* 1) Even if one policy status is FORBIDDEN, the sync will be disabled
+* 2) Atleast one policy should be ENABLED for the sync to be allowed.
+*
+*/
+// Project this manager's flow groups (and, recursively, the parent's)
+// onto *source_pipes / *dest_pipes for 'effective_bucket'. A FORBIDDEN
+// group disables matching pipes (overriding anything added earlier);
+// ENABLED groups always contribute, ALLOWED groups only when
+// only_enabled is false. See the semantics table above.
+void RGWBucketSyncFlowManager::reflect(const DoutPrefixProvider *dpp,
+                                       std::optional<rgw_bucket> effective_bucket,
+                                       RGWBucketSyncFlowManager::pipe_set *source_pipes,
+                                       RGWBucketSyncFlowManager::pipe_set *dest_pipes,
+                                       bool only_enabled) const
+
+{
+  string effective_bucket_key;
+  bool is_forbidden = false;
+  if (effective_bucket) {
+    effective_bucket_key = effective_bucket->get_key();
+  }
+  // parent (zonegroup-level) pipes go in first so bucket-level groups
+  // can refine or disable them below
+  if (parent) {
+    parent->reflect(dpp, effective_bucket, source_pipes, dest_pipes, only_enabled);
+  }
+
+  for (auto& item : flow_groups) {
+    auto& flow_group_map = item.second;
+    is_forbidden = false;
+
+    if (flow_group_map.status == rgw_sync_policy_group::Status::FORBIDDEN) {
+      /* FORBIDDEN takes precedence over all the other rules.
+       * Remove any other pipes which may allow access.
+       */
+      is_forbidden = true;
+    } else if (flow_group_map.status != rgw_sync_policy_group::Status::ENABLED &&
+               (only_enabled || flow_group_map.status != rgw_sync_policy_group::Status::ALLOWED)) {
+      /* only return enabled groups */
+      continue;
+    }
+
+    for (auto& entry : flow_group_map.sources) {
+      rgw_sync_bucket_pipe pipe = entry.second;
+      if (!pipe.dest.match_bucket(effective_bucket)) {
+        continue;
+      }
+
+      pipe.source.apply_bucket(effective_bucket);
+      pipe.dest.apply_bucket(effective_bucket);
+
+      if (is_forbidden) {
+        ldpp_dout(dpp, 20) << __func__ << "(): flow manager (bucket=" << effective_bucket_key << "): removing source pipe: " << pipe << dendl;
+        source_pipes->disable(pipe);
+      } else {
+        ldpp_dout(dpp, 20) << __func__ << "(): flow manager (bucket=" << effective_bucket_key << "): adding source pipe: " << pipe << dendl;
+        source_pipes->insert(pipe);
+      }
+    }
+
+    for (auto& entry : flow_group_map.dests) {
+      rgw_sync_bucket_pipe pipe = entry.second;
+
+      if (!pipe.source.match_bucket(effective_bucket)) {
+        continue;
+      }
+
+      pipe.source.apply_bucket(effective_bucket);
+      pipe.dest.apply_bucket(effective_bucket);
+
+      if (is_forbidden) {
+        ldpp_dout(dpp, 20) << __func__ << "(): flow manager (bucket=" << effective_bucket_key << "): removing dest pipe: " << pipe << dendl;
+        dest_pipes->disable(pipe);
+      } else {
+        ldpp_dout(dpp, 20) << __func__ << "(): flow manager (bucket=" << effective_bucket_key << "): adding dest pipe: " << pipe << dendl;
+        dest_pipes->insert(pipe);
+      }
+    }
+  }
+}
+
+
+// Construct a flow manager for a zone (and optionally a specific
+// bucket). _parent is the zonegroup-level manager when this one is
+// bucket-level, or nullptr at the top.
+RGWBucketSyncFlowManager::RGWBucketSyncFlowManager(CephContext *_cct,
+                                                   const rgw_zone_id& _zone_id,
+                                                   std::optional<rgw_bucket> _bucket,
+                                                   const RGWBucketSyncFlowManager *_parent) : cct(_cct),
+                                                                                              zone_id(_zone_id),
+                                                                                              bucket(_bucket),
+                                                                                              parent(_parent) {}
+
+
+// Translate the legacy (pre-sync-policy) zonegroup sync configuration
+// into an equivalent sync policy: one ENABLED "default" group with a
+// directional flow rule for every zone pair where z1 syncs from z2, and
+// a single all-zones -> all-zones pipe. Leaves *ppolicy untouched when
+// no zone syncs from any other.
+void RGWSyncPolicyCompat::convert_old_sync_config(RGWSI_Zone *zone_svc,
+                                                  RGWSI_SyncModules *sync_modules_svc,
+                                                  rgw_sync_policy_info *ppolicy)
+{
+  bool found = false;
+
+  rgw_sync_policy_info policy;
+
+  auto& group = policy.groups["default"];
+  auto& zonegroup = zone_svc->get_zonegroup();
+
+  for (const auto& ziter1 : zonegroup.zones) {
+    auto& id1 = ziter1.first;
+    const RGWZone& z1 = ziter1.second;
+
+    for (const auto& ziter2 : zonegroup.zones) {
+      auto& id2 = ziter2.first;
+      const RGWZone& z2 = ziter2.second;
+
+      if (id1 == id2) {
+        continue;
+      }
+
+      if (z1.syncs_from(z2.name)) {
+        found = true;
+        rgw_sync_directional_rule *rule;
+        group.data_flow.find_or_create_directional(id2,
+                                                   id1,
+                                                   &rule);
+      }
+    }
+  }
+
+  if (!found) { /* nothing syncs */
+    return;
+  }
+
+  rgw_sync_bucket_pipes pipes;
+  pipes.id = "all";
+  pipes.source.all_zones = true;
+  pipes.dest.all_zones = true;
+
+  group.pipes.emplace_back(std::move(pipes));
+
+
+  group.status = rgw_sync_policy_group::Status::ENABLED;
+
+  *ppolicy = std::move(policy);
+}
+
+// Zonegroup-level handler: takes the sync policy from the zonegroup;
+// when none is configured, synthesizes one from the legacy sync config
+// and marks legacy_config. 'effective_zone' overrides the local zone id.
+RGWBucketSyncPolicyHandler::RGWBucketSyncPolicyHandler(RGWSI_Zone *_zone_svc,
+                                                       RGWSI_SyncModules *sync_modules_svc,
+                                                       RGWSI_Bucket_Sync *_bucket_sync_svc,
+                                                       std::optional<rgw_zone_id> effective_zone) : zone_svc(_zone_svc) ,
+                                                                                                    bucket_sync_svc(_bucket_sync_svc) {
+  zone_id = effective_zone.value_or(zone_svc->zone_id());
+  flow_mgr.reset(new RGWBucketSyncFlowManager(zone_svc->ctx(),
+                                              zone_id,
+                                              nullopt,
+                                              nullptr));
+  sync_policy = zone_svc->get_zonegroup().sync_policy;
+
+  if (sync_policy.empty()) {
+    RGWSyncPolicyCompat::convert_old_sync_config(zone_svc, sync_modules_svc, &sync_policy);
+    legacy_config = true;
+  }
+}
+
+// Bucket-level handler built from full bucket info: adopts the bucket's
+// own sync policy (if any), defaulting each user-mode pipe's user to the
+// bucket owner, and chains its flow manager under the parent's.
+RGWBucketSyncPolicyHandler::RGWBucketSyncPolicyHandler(const RGWBucketSyncPolicyHandler *_parent,
+                                                       const RGWBucketInfo& _bucket_info,
+                                                       map<string, bufferlist>&& _bucket_attrs) : parent(_parent),
+                                                                                                  bucket_info(_bucket_info),
+                                                                                                  bucket_attrs(std::move(_bucket_attrs)) {
+  if (_bucket_info.sync_policy) {
+    sync_policy = *_bucket_info.sync_policy;
+
+    // user-mode pipes with no explicit user default to the bucket owner
+    for (auto& entry : sync_policy.groups) {
+      for (auto& pipe : entry.second.pipes) {
+        if (pipe.params.mode == rgw_sync_pipe_params::MODE_USER &&
+            pipe.params.user.empty()) {
+          pipe.params.user = _bucket_info.owner;
+        }
+      }
+    }
+  }
+  legacy_config = parent->legacy_config;
+  bucket = _bucket_info.bucket;
+  zone_svc = parent->zone_svc;
+  bucket_sync_svc = parent->bucket_sync_svc;
+  flow_mgr.reset(new RGWBucketSyncFlowManager(zone_svc->ctx(),
+                                              parent->zone_id,
+                                              _bucket_info.bucket,
+                                              parent->flow_mgr.get()));
+}
+
+// Bucket-level handler built from just a bucket name and an optional
+// explicit policy (no bucket info/attrs available); chains its flow
+// manager under the parent's.
+RGWBucketSyncPolicyHandler::RGWBucketSyncPolicyHandler(const RGWBucketSyncPolicyHandler *_parent,
+                                                       const rgw_bucket& _bucket,
+                                                       std::optional<rgw_sync_policy_info> _sync_policy) : parent(_parent) {
+  if (_sync_policy) {
+    sync_policy = *_sync_policy;
+  }
+  legacy_config = parent->legacy_config;
+  bucket = _bucket;
+  zone_svc = parent->zone_svc;
+  bucket_sync_svc = parent->bucket_sync_svc;
+  flow_mgr.reset(new RGWBucketSyncFlowManager(zone_svc->ctx(),
+                                              parent->zone_id,
+                                              _bucket,
+                                              parent->flow_mgr.get()));
+}
+
+// Create a bucket-level child handler from full bucket info; the caller
+// owns the returned pointer.
+RGWBucketSyncPolicyHandler *RGWBucketSyncPolicyHandler::alloc_child(const RGWBucketInfo& bucket_info,
+                                                                    map<string, bufferlist>&& bucket_attrs) const
+{
+  return new RGWBucketSyncPolicyHandler(this, bucket_info, std::move(bucket_attrs));
+}
+
+// Create a bucket-level child handler from a bucket name and optional
+// policy; the caller owns the returned pointer.
+RGWBucketSyncPolicyHandler *RGWBucketSyncPolicyHandler::alloc_child(const rgw_bucket& bucket,
+                                                                    std::optional<rgw_sync_policy_info> sync_policy) const
+{
+  return new RGWBucketSyncPolicyHandler(this, bucket, sync_policy);
+}
+
+// Initialize the handler: fetch the bucket's sync hints, build the flow
+// manager from the sync policy, and materialize the enabled source/
+// target pipe sets and zone sets. Returns 0 on success or a negative
+// error code from the hints lookup.
+int RGWBucketSyncPolicyHandler::init(const DoutPrefixProvider *dpp, optional_yield y)
+{
+  int r = bucket_sync_svc->get_bucket_sync_hints(dpp, bucket.value_or(rgw_bucket()),
+                                                 &source_hints,
+                                                 &target_hints,
+                                                 y);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to initialize bucket sync policy handler: get_bucket_sync_hints() on bucket="
+      << bucket << " returned r=" << r << dendl;
+    return r;
+  }
+
+  flow_mgr->init(dpp, sync_policy);
+
+  // only_enabled=true: active sync state considers ENABLED groups only
+  reflect(dpp, &source_pipes,
+          &target_pipes,
+          &sources,
+          &targets,
+          &source_zones,
+          &target_zones,
+          true);
+
+  return 0;
+}
+
+// Compute the effective pipes for this handler's bucket and split them
+// by peer zone. Results are built locally and moved into whichever of
+// the output pointers are non-null, so callers may request any subset.
+void RGWBucketSyncPolicyHandler::reflect(const DoutPrefixProvider *dpp, RGWBucketSyncFlowManager::pipe_set *psource_pipes,
+                                         RGWBucketSyncFlowManager::pipe_set *ptarget_pipes,
+                                         map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set> *psources,
+                                         map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set> *ptargets,
+                                         std::set<rgw_zone_id> *psource_zones,
+                                         std::set<rgw_zone_id> *ptarget_zones,
+                                         bool only_enabled) const
+{
+  RGWBucketSyncFlowManager::pipe_set _source_pipes;
+  RGWBucketSyncFlowManager::pipe_set _target_pipes;
+  map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set> _sources;
+  map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set> _targets;
+  std::set<rgw_zone_id> _source_zones;
+  std::set<rgw_zone_id> _target_zones;
+
+  flow_mgr->reflect(dpp, bucket, &_source_pipes, &_target_pipes, only_enabled);
+
+  // group source pipes per source zone; pipes without a resolved source
+  // zone are skipped
+  for (auto& entry : _source_pipes.pipe_map) {
+    auto& pipe = entry.second;
+    if (!pipe.source.zone) {
+      continue;
+    }
+    _source_zones.insert(*pipe.source.zone);
+    _sources[*pipe.source.zone].insert(pipe);
+  }
+
+  // likewise for target pipes per destination zone
+  for (auto& entry : _target_pipes.pipe_map) {
+    auto& pipe = entry.second;
+    if (!pipe.dest.zone) {
+      continue;
+    }
+    _target_zones.insert(*pipe.dest.zone);
+    _targets[*pipe.dest.zone].insert(pipe);
+  }
+
+  if (psource_pipes) {
+    *psource_pipes = std::move(_source_pipes);
+  }
+  if (ptarget_pipes) {
+    *ptarget_pipes = std::move(_target_pipes);
+  }
+  if (psources) {
+    *psources = std::move(_sources);
+  }
+  if (ptargets) {
+    *ptargets = std::move(_targets);
+  }
+  if (psource_zones) {
+    *psource_zones = std::move(_source_zones);
+  }
+  if (ptarget_zones) {
+    *ptarget_zones = std::move(_target_zones);
+  }
+}
+
+// Collect every pipe this bucket syncs FROM, keyed by source zone:
+// the policy-derived pipes plus the resolved hint pipes (those lacking
+// a source zone are skipped).
+multimap<rgw_zone_id, rgw_sync_bucket_pipe> RGWBucketSyncPolicyHandler::get_all_sources() const
+{
+  multimap<rgw_zone_id, rgw_sync_bucket_pipe> m;
+
+  for (const auto& [zid, zpipes] : sources) {
+    for (const auto& [id, pipe] : zpipes.pipe_map) {
+      m.emplace(zid, pipe);
+    }
+  }
+
+  for (const auto& pipe : resolved_sources) {
+    if (!pipe.source.zone) {
+      continue;
+    }
+    m.emplace(*pipe.source.zone, pipe);
+  }
+
+  return m;
+}
+
+// Collect every pipe this bucket syncs TO, keyed by destination zone:
+// the policy-derived pipes plus the resolved hint pipes (those lacking
+// a dest zone are skipped).
+multimap<rgw_zone_id, rgw_sync_bucket_pipe> RGWBucketSyncPolicyHandler::get_all_dests() const
+{
+  multimap<rgw_zone_id, rgw_sync_bucket_pipe> m;
+
+  for (const auto& [zid, zpipes] : targets) {
+    for (const auto& [id, pipe] : zpipes.pipe_map) {
+      m.emplace(zid, pipe);
+    }
+  }
+
+  for (const auto& pipe : resolved_dests) {
+    if (!pipe.dest.zone) {
+      continue;
+    }
+    m.emplace(*pipe.dest.zone, pipe);
+  }
+
+  return m;
+}
+
+// Like get_all_dests(), but restricted to pipes whose destination is
+// the given zone.
+multimap<rgw_zone_id, rgw_sync_bucket_pipe> RGWBucketSyncPolicyHandler::get_all_dests_in_zone(const rgw_zone_id& zone_id) const
+{
+  multimap<rgw_zone_id, rgw_sync_bucket_pipe> m;
+
+  if (auto iter = targets.find(zone_id); iter != targets.end()) {
+    for (const auto& [id, pipe] : iter->second.pipe_map) {
+      m.emplace(zone_id, pipe);
+    }
+  }
+
+  for (const auto& pipe : resolved_dests) {
+    if (!pipe.dest.zone || *pipe.dest.zone != zone_id) {
+      continue;
+    }
+    m.emplace(*pipe.dest.zone, pipe);
+  }
+
+  return m;
+}
+
+// Copy the raw source/target pipes into the given sets. When
+// filter_peer is supplied, only pipes whose remote endpoint (source for
+// sources, dest for targets) matches it are returned.
+void RGWBucketSyncPolicyHandler::get_pipes(std::set<rgw_sync_bucket_pipe> *_sources, std::set<rgw_sync_bucket_pipe> *_targets,
+                                           std::optional<rgw_sync_bucket_entity> filter_peer) { /* return raw pipes */
+  for (auto& entry : source_pipes.pipe_map) {
+    auto& source_pipe = entry.second;
+    if (!filter_peer ||
+        source_pipe.source.match(*filter_peer)) {
+      _sources->insert(source_pipe);
+    }
+  }
+
+  for (auto& entry : target_pipes.pipe_map) {
+    auto& target_pipe = entry.second;
+    if (!filter_peer ||
+        target_pipe.dest.match(*filter_peer)) {
+      _targets->insert(target_pipe);
+    }
+  }
+}
+
+// Whether this bucket's data is exported anywhere: requires a bucket
+// and an exporting sync module, and then either an explicit sync-source
+// role or (legacy path) data logging with the bucket's datasync flag on.
+bool RGWBucketSyncPolicyHandler::bucket_exports_data() const
+{
+  if (!bucket) {
+    return false;
+  }
+
+  if (!zone_svc->sync_module_exports_data()) {
+    return false;
+  }
+
+  if (bucket_is_sync_source()) {
+    return true;
+  }
+
+  return (zone_svc->need_to_log_data() &&
+          bucket_info->datasync_flag_enabled());
+}
+
+// A bucket imports data exactly when it is a sync target for some source.
+bool RGWBucketSyncPolicyHandler::bucket_imports_data() const
+{
+  return bucket_is_sync_target();
+}
+
diff --git a/src/rgw/driver/rados/rgw_bucket_sync.h b/src/rgw/driver/rados/rgw_bucket_sync.h
new file mode 100644
index 000000000..d425ecf17
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_bucket_sync.h
@@ -0,0 +1,416 @@
+
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "rgw_common.h"
+#include "rgw_sync_policy.h"
+
+class RGWSI_Zone;
+class RGWSI_SyncModules;
+class RGWSI_Bucket_Sync;
+
+struct rgw_sync_group_pipe_map;
+struct rgw_sync_bucket_pipes;
+struct rgw_sync_policy_info;
+
+/* For a single sync policy group: the set of pipes that a given zone
+ * (optionally narrowed to one bucket) participates in, split by
+ * direction (pipes it pulls from vs. pipes that pull from it). */
+struct rgw_sync_group_pipe_map {
+  rgw_zone_id zone;
+  std::optional<rgw_bucket> bucket;
+
+  rgw_sync_policy_group::Status status{rgw_sync_policy_group::Status::UNKNOWN};
+
+  using zb_pipe_map_t = std::multimap<rgw_sync_bucket_entity, rgw_sync_bucket_pipe>;
+
+  zb_pipe_map_t sources; /* all the pipes where zone is pulling from */
+  zb_pipe_map_t dests; /* all the pipes that pull from zone */
+
+  std::set<rgw_zone_id> *pall_zones{nullptr}; /* shared accumulator of every zone seen; owned by caller of init() */
+  rgw_sync_data_flow_group *default_flow{nullptr}; /* flow to use if policy doesn't define it,
+                                                      used in the case of bucket sync policy, not at the
+                                                      zonegroup level */
+
+  void dump(ceph::Formatter *f) const;
+
+  /* shared helper for try_add_source()/try_add_dest(): add matching
+   * entries from 'pipes' into *pipe_map, subject to the two filters */
+  template <typename CB1, typename CB2>
+  void try_add_to_pipe_map(const rgw_zone_id& source_zone,
+                           const rgw_zone_id& dest_zone,
+                           const std::vector<rgw_sync_bucket_pipes>& pipes,
+                           zb_pipe_map_t *pipe_map,
+                           CB1 filter_cb,
+                           CB2 call_filter_cb);
+
+  template <typename CB>
+  void try_add_source(const rgw_zone_id& source_zone,
+                      const rgw_zone_id& dest_zone,
+                      const std::vector<rgw_sync_bucket_pipes>& pipes,
+                      CB filter_cb);
+
+  template <typename CB>
+  void try_add_dest(const rgw_zone_id& source_zone,
+                    const rgw_zone_id& dest_zone,
+                    const std::vector<rgw_sync_bucket_pipes>& pipes,
+                    CB filter_cb);
+
+  /* return the [begin, end) iterator range of pipes in 'm' matching the
+   * given zone and (optional) bucket */
+  std::pair<zb_pipe_map_t::const_iterator, zb_pipe_map_t::const_iterator> find_pipes(const zb_pipe_map_t& m,
+                                                                                     const rgw_zone_id& zone,
+                                                                                     std::optional<rgw_bucket> b) const;
+
+  /* populate sources/dests from 'group' for zone '_zone' (optionally
+   * restricted to '_bucket'), applying filter_cb to each candidate pipe */
+  template <typename CB>
+  void init(const DoutPrefixProvider *dpp, CephContext *cct,
+            const rgw_zone_id& _zone,
+            std::optional<rgw_bucket> _bucket,
+            const rgw_sync_policy_group& group,
+            rgw_sync_data_flow_group *_default_flow,
+            std::set<rgw_zone_id> *_pall_zones,
+            CB filter_cb);
+
+  /*
+   * find all relevant pipes in our zone that match {dest_bucket} <- {source_zone, source_bucket}
+   */
+  std::vector<rgw_sync_bucket_pipe> find_source_pipes(const rgw_zone_id& source_zone,
+                                                      std::optional<rgw_bucket> source_bucket,
+                                                      std::optional<rgw_bucket> dest_bucket) const;
+
+  /*
+   * find all relevant pipes in other zones that pull from a specific
+   * source bucket in out zone {source_bucket} -> {dest_zone, dest_bucket}
+   */
+  std::vector<rgw_sync_bucket_pipe> find_dest_pipes(std::optional<rgw_bucket> source_bucket,
+                                                    const rgw_zone_id& dest_zone,
+                                                    std::optional<rgw_bucket> dest_bucket) const;
+
+  /*
+   * find all relevant pipes from {source_zone, source_bucket} -> {dest_zone, dest_bucket}
+   */
+  std::vector<rgw_sync_bucket_pipe> find_pipes(const rgw_zone_id& source_zone,
+                                               std::optional<rgw_bucket> source_bucket,
+                                               const rgw_zone_id& dest_zone,
+                                               std::optional<rgw_bucket> dest_bucket) const;
+};
+
+/* compatibility shim: translate the pre-sync-policy (legacy) zone sync
+ * configuration into an equivalent rgw_sync_policy_info */
+class RGWSyncPolicyCompat {
+public:
+  static void convert_old_sync_config(RGWSI_Zone *zone_svc,
+                                      RGWSI_SyncModules *sync_modules_svc,
+                                      rgw_sync_policy_info *ppolicy);
+};
+
+/* Computes, from a sync policy, the effective set of sync pipes flowing
+ * through a given zone (and optional bucket). A bucket-level manager
+ * chains to the zonegroup-level one through 'parent'. */
+class RGWBucketSyncFlowManager {
+  friend class RGWBucketSyncPolicyHandler;
+public:
+  /* the two endpoints of a pipe; ordered lexicographically by
+   * (source, dest) so it can key a std::map */
+  struct endpoints_pair {
+    rgw_sync_bucket_entity source;
+    rgw_sync_bucket_entity dest;
+
+    endpoints_pair() {}
+    endpoints_pair(const rgw_sync_bucket_pipe& pipe) {
+      source = pipe.source;
+      dest = pipe.dest;
+    }
+
+    bool operator<(const endpoints_pair& e) const {
+      if (source < e.source) {
+        return true;
+      }
+      if (e.source < source) {
+        return false;
+      }
+      return (dest < e.dest);
+    }
+  };
+
+  /*
+   * pipe_rules: deal with a set of pipes that have common endpoints_pair
+   */
+  class pipe_rules {
+    std::list<rgw_sync_bucket_pipe> pipes; /* owns the pipes; tag_refs/prefix_refs point into it */
+
+  public:
+    using prefix_map_t = std::multimap<std::string, rgw_sync_bucket_pipe *>;
+
+    std::map<std::string, rgw_sync_bucket_pipe *> tag_refs;
+    prefix_map_t prefix_refs;
+
+    void insert(const rgw_sync_bucket_pipe& pipe);
+
+    /* resolve per-object sync params when object tags are not available;
+     * sets *need_more_info if tags are required to decide */
+    bool find_basic_info_without_tags(const rgw_obj_key& key,
+                                      std::optional<rgw_user> *user,
+                                      std::optional<rgw_user> *acl_translation,
+                                      std::optional<std::string> *storage_class,
+                                      rgw_sync_pipe_params::Mode *mode,
+                                      bool *need_more_info) const;
+    bool find_obj_params(const rgw_obj_key& key,
+                         const RGWObjTags::tag_map_t& tags,
+                         rgw_sync_pipe_params *params) const;
+
+    void scan_prefixes(std::vector<std::string> *prefixes) const;
+
+    prefix_map_t::const_iterator prefix_begin() const {
+      return prefix_refs.begin();
+    }
+    prefix_map_t::const_iterator prefix_search(const std::string& s) const;
+    prefix_map_t::const_iterator prefix_end() const {
+      return prefix_refs.end();
+    }
+  };
+
+  using pipe_rules_ref = std::shared_ptr<pipe_rules>;
+
+  /*
+   * pipe_handler: extends endpoints_rule to point at the corresponding rules handler
+   */
+  struct pipe_handler : public endpoints_pair {
+    pipe_rules_ref rules;
+
+    pipe_handler() {}
+    pipe_handler(pipe_rules_ref& _rules,
+                 const rgw_sync_bucket_pipe& _pipe) : endpoints_pair(_pipe),
+                                                      rules(_rules) {}
+    /* true when both endpoints name a concrete bucket (no wildcards) */
+    bool specific() const {
+      return source.specific() && dest.specific();
+    }
+
+    /* delegate to rules, if any; false when no rules are attached */
+    bool find_basic_info_without_tags(const rgw_obj_key& key,
+                                      std::optional<rgw_user> *user,
+                                      std::optional<rgw_user> *acl_translation,
+                                      std::optional<std::string> *storage_class,
+                                      rgw_sync_pipe_params::Mode *mode,
+                                      bool *need_more_info) const {
+      if (!rules) {
+        return false;
+      }
+      return rules->find_basic_info_without_tags(key, user, acl_translation, storage_class, mode, need_more_info);
+    }
+
+    bool find_obj_params(const rgw_obj_key& key,
+                         const RGWObjTags::tag_map_t& tags,
+                         rgw_sync_pipe_params *params) const {
+      if (!rules) {
+        return false;
+      }
+      return rules->find_obj_params(key, tags, params);
+    }
+  };
+
+  /* a set of pipes plus the rules/handlers derived from them; pipes are
+   * keyed by string in pipe_map, disabled ones kept separately */
+  struct pipe_set {
+    std::map<endpoints_pair, pipe_rules_ref> rules;
+    std::multimap<std::string, rgw_sync_bucket_pipe> pipe_map;
+    std::multimap<std::string, rgw_sync_bucket_pipe> disabled_pipe_map;
+
+    std::set<pipe_handler> handlers;
+
+    using iterator = std::set<pipe_handler>::iterator;
+
+    void clear() {
+      rules.clear();
+      pipe_map.clear();
+      disabled_pipe_map.clear();
+      handlers.clear();
+    }
+
+    void insert(const rgw_sync_bucket_pipe& pipe);
+    void remove_all();
+    void disable(const rgw_sync_bucket_pipe& pipe);
+
+    /* iteration is over the derived handlers, not the raw pipes */
+    iterator begin() const {
+      return handlers.begin();
+    }
+
+    iterator end() const {
+      return handlers.end();
+    }
+
+    void dump(ceph::Formatter *f) const;
+  };
+
+private:
+
+  CephContext *cct;
+
+  rgw_zone_id zone_id;
+  std::optional<rgw_bucket> bucket; /* set for a bucket-level manager, empty at zonegroup level */
+
+  const RGWBucketSyncFlowManager *parent{nullptr}; /* zonegroup-level manager; not owned */
+
+  std::map<std::string, rgw_sync_group_pipe_map> flow_groups; /* keyed by policy group id */
+
+  std::set<rgw_zone_id> all_zones;
+
+  /* whether data may flow source->dest under the configured flow groups;
+   * check_activated additionally requires the group be enabled */
+  bool allowed_data_flow(const rgw_zone_id& source_zone,
+                         std::optional<rgw_bucket> source_bucket,
+                         const rgw_zone_id& dest_zone,
+                         std::optional<rgw_bucket> dest_bucket,
+                         bool check_activated) const;
+
+  /*
+   * find all the matching flows om a flow map for a specific bucket
+   */
+  void update_flow_maps(const rgw_sync_bucket_pipes& pipe);
+
+  void init(const DoutPrefixProvider *dpp, const rgw_sync_policy_info& sync_policy);
+
+public:
+
+  RGWBucketSyncFlowManager(CephContext *_cct,
+                           const rgw_zone_id& _zone_id,
+                           std::optional<rgw_bucket> _bucket,
+                           const RGWBucketSyncFlowManager *_parent);
+
+  /* materialize the effective pipes for 'effective_bucket' into the two
+   * output sets, indexed by source and by destination respectively;
+   * only_enabled skips pipes from non-enabled groups */
+  void reflect(const DoutPrefixProvider *dpp, std::optional<rgw_bucket> effective_bucket,
+               pipe_set *flow_by_source,
+               pipe_set *flow_by_dest,
+               bool only_enabled) const;
+
+};
+
+/* print an endpoints pair in flow order. A sync pipe carries data from
+ * its source to its destination, so render it as "source -> dest";
+ * the previous "dest -> source" output read backwards in logs. */
+static inline std::ostream& operator<<(std::ostream& os, const RGWBucketSyncFlowManager::endpoints_pair& e) {
+  os << e.source << " -> " << e.dest;
+  return os;
+}
+
+/* Per-bucket (or per-zone, when 'bucket' is empty) view of the sync
+ * policy: resolves the policy into concrete source/target pipe sets and
+ * answers whether the bucket imports/exports data. Bucket handlers are
+ * allocated as children of the zone-level handler via alloc_child(). */
+class RGWBucketSyncPolicyHandler {
+  bool legacy_config{false}; /* true when policy was converted from pre-policy config */
+  const RGWBucketSyncPolicyHandler *parent{nullptr}; /* zone-level handler; not owned */
+  RGWSI_Zone *zone_svc;
+  RGWSI_Bucket_Sync *bucket_sync_svc;
+  rgw_zone_id zone_id;
+  std::optional<RGWBucketInfo> bucket_info;
+  std::optional<std::map<std::string, bufferlist> > bucket_attrs;
+  std::optional<rgw_bucket> bucket;
+  std::unique_ptr<RGWBucketSyncFlowManager> flow_mgr;
+  rgw_sync_policy_info sync_policy;
+
+  RGWBucketSyncFlowManager::pipe_set source_pipes;
+  RGWBucketSyncFlowManager::pipe_set target_pipes;
+
+  std::map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set> sources; /* source pipes by source zone id */
+  std::map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set> targets; /* target pipes by target zone id */
+
+  std::set<rgw_zone_id> source_zones;
+  std::set<rgw_zone_id> target_zones;
+
+  std::set<rgw_bucket> source_hints;
+  std::set<rgw_bucket> target_hints;
+  std::set<rgw_sync_bucket_pipe> resolved_sources; /* hint pipes resolved via set_resolved_hints() */
+  std::set<rgw_sync_bucket_pipe> resolved_dests;
+
+
+  bool bucket_is_sync_source() const {
+    return !targets.empty() || !resolved_dests.empty();
+  }
+
+  bool bucket_is_sync_target() const {
+    return !sources.empty() || !resolved_sources.empty();
+  }
+
+  /* child constructors; used by alloc_child() only */
+  RGWBucketSyncPolicyHandler(const RGWBucketSyncPolicyHandler *_parent,
+                             const RGWBucketInfo& _bucket_info,
+                             std::map<std::string, bufferlist>&& _bucket_attrs);
+
+  RGWBucketSyncPolicyHandler(const RGWBucketSyncPolicyHandler *_parent,
+                             const rgw_bucket& _bucket,
+                             std::optional<rgw_sync_policy_info> _sync_policy);
+public:
+  RGWBucketSyncPolicyHandler(RGWSI_Zone *_zone_svc,
+                             RGWSI_SyncModules *sync_modules_svc,
+                             RGWSI_Bucket_Sync *bucket_sync_svc,
+                             std::optional<rgw_zone_id> effective_zone = std::nullopt);
+
+  /* callers own the returned handler */
+  RGWBucketSyncPolicyHandler *alloc_child(const RGWBucketInfo& bucket_info,
+                                          std::map<std::string, bufferlist>&& bucket_attrs) const;
+  RGWBucketSyncPolicyHandler *alloc_child(const rgw_bucket& bucket,
+                                          std::optional<rgw_sync_policy_info> sync_policy) const;
+
+  int init(const DoutPrefixProvider *dpp, optional_yield y);
+
+  /* recompute pipe sets/zone sets into the supplied outputs; any output
+   * pointer may be used by callers to receive the corresponding view */
+  void reflect(const DoutPrefixProvider *dpp, RGWBucketSyncFlowManager::pipe_set *psource_pipes,
+               RGWBucketSyncFlowManager::pipe_set *ptarget_pipes,
+               std::map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set> *psources,
+               std::map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set> *ptargets,
+               std::set<rgw_zone_id> *psource_zones,
+               std::set<rgw_zone_id> *ptarget_zones,
+               bool only_enabled) const;
+
+  void set_resolved_hints(std::set<rgw_sync_bucket_pipe>&& _resolved_sources,
+                          std::set<rgw_sync_bucket_pipe>&& _resolved_dests) {
+    resolved_sources = std::move(_resolved_sources);
+    resolved_dests = std::move(_resolved_dests);
+  }
+
+  const std::set<rgw_sync_bucket_pipe>& get_resolved_source_hints() {
+    return resolved_sources;
+  }
+
+  const std::set<rgw_sync_bucket_pipe>& get_resolved_dest_hints() {
+    return resolved_dests;
+  }
+
+  const std::set<rgw_zone_id>& get_source_zones() const {
+    return source_zones;
+  }
+
+  const std::set<rgw_zone_id>& get_target_zones() const {
+    return target_zones;
+  }
+
+  const std::map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set>& get_sources() {
+    return sources;
+  }
+
+  std::multimap<rgw_zone_id, rgw_sync_bucket_pipe> get_all_sources() const;
+  std::multimap<rgw_zone_id, rgw_sync_bucket_pipe> get_all_dests() const;
+  std::multimap<rgw_zone_id, rgw_sync_bucket_pipe> get_all_dests_in_zone(const rgw_zone_id& zone_id) const;
+
+  const std::map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set>& get_targets() {
+    return targets;
+  }
+
+  const std::optional<RGWBucketInfo>& get_bucket_info() const {
+    return bucket_info;
+  }
+
+  const std::optional<std::map<std::string, bufferlist> >& get_bucket_attrs() const {
+    return bucket_attrs;
+  }
+
+  void get_pipes(RGWBucketSyncFlowManager::pipe_set **_sources, RGWBucketSyncFlowManager::pipe_set **_targets) { /* return raw pipes (with zone name) */
+    *_sources = &source_pipes;
+    *_targets = &target_pipes;
+  }
+  void get_pipes(std::set<rgw_sync_bucket_pipe> *sources, std::set<rgw_sync_bucket_pipe> *targets,
+                 std::optional<rgw_sync_bucket_entity> filter_peer);
+
+  const std::set<rgw_bucket>& get_source_hints() const {
+    return source_hints;
+  }
+
+  const std::set<rgw_bucket>& get_target_hints() const {
+    return target_hints;
+  }
+
+  bool bucket_exports_data() const;
+  bool bucket_imports_data() const;
+
+  const rgw_sync_policy_info& get_sync_policy() const {
+    return sync_policy;
+  }
+
+  bool is_legacy_config() const {
+    return legacy_config;
+  }
+};
+
diff --git a/src/rgw/driver/rados/rgw_cr_rados.cc b/src/rgw/driver/rados/rgw_cr_rados.cc
new file mode 100644
index 000000000..d8e0ecba6
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_cr_rados.cc
@@ -0,0 +1,1165 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "include/compat.h"
+#include "rgw_sal.h"
+#include "rgw_zone.h"
+#include "rgw_coroutine.h"
+#include "rgw_cr_rados.h"
+#include "rgw_sync_counters.h"
+#include "rgw_bucket.h"
+#include "rgw_datalog_notify.h"
+#include "rgw_cr_rest.h"
+#include "rgw_rest_conn.h"
+#include "rgw_rados.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_zone_utils.h"
+#include "services/svc_sys_obj.h"
+#include "services/svc_cls.h"
+
+#include "cls/lock/cls_lock_client.h"
+#include "cls/rgw/cls_rgw_client.h"
+
+#include <boost/asio/yield.hpp>
+#include <boost/container/flat_set.hpp>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+/* enqueue a request for the worker pool; refuses new work once the
+ * processor is shutting down. Takes a ref on req, dropped after the
+ * request is processed (see handle_request). */
+bool RGWAsyncRadosProcessor::RGWWQ::_enqueue(RGWAsyncRadosRequest *req) {
+  if (processor->is_going_down()) {
+    return false;
+  }
+  req->get();
+  processor->m_req_queue.push_back(req);
+  dout(20) << "enqueued request req=" << hex << req << dec << dendl;
+  _dump_queue();
+  return true;
+}
+
+bool RGWAsyncRadosProcessor::RGWWQ::_empty() {
+  return processor->m_req_queue.empty();
+}
+
+/* pop the oldest queued request (FIFO); NULL when the queue is empty */
+RGWAsyncRadosRequest *RGWAsyncRadosProcessor::RGWWQ::_dequeue() {
+  if (processor->m_req_queue.empty())
+    return NULL;
+  RGWAsyncRadosRequest *req = processor->m_req_queue.front();
+  processor->m_req_queue.pop_front();
+  dout(20) << "dequeued request req=" << hex << req << dec << dendl;
+  _dump_queue();
+  return req;
+}
+
+/* run one request on a thread-pool worker, then release the throttle
+ * slot acquired in queue(). NOTE(review): 'this' is passed where a
+ * DoutPrefixProvider is expected — presumably RGWWQ derives from it;
+ * confirm against the class declaration. */
+void RGWAsyncRadosProcessor::RGWWQ::_process(RGWAsyncRadosRequest *req, ThreadPool::TPHandle& handle) {
+  processor->handle_request(this, req);
+  processor->req_throttle.put(1);
+}
+
+/* debug helper: log the queue contents at level 20 (no-op unless that
+ * subsystem level is gathered) */
+void RGWAsyncRadosProcessor::RGWWQ::_dump_queue() {
+  if (!g_conf()->subsys.should_gather<ceph_subsys_rgw, 20>()) {
+    return;
+  }
+  deque<RGWAsyncRadosRequest *>::iterator iter;
+  if (processor->m_req_queue.empty()) {
+    dout(20) << "RGWWQ: empty" << dendl;
+    return;
+  }
+  dout(20) << "RGWWQ:" << dendl;
+  for (iter = processor->m_req_queue.begin(); iter != processor->m_req_queue.end(); ++iter) {
+    dout(20) << "req: " << hex << *iter << dec << dendl;
+  }
+}
+
+/* thread pool driving async rados requests; throttle allows at most
+ * 2 in-flight ops per worker thread */
+RGWAsyncRadosProcessor::RGWAsyncRadosProcessor(CephContext *_cct, int num_threads)
+  : cct(_cct), m_tp(cct, "RGWAsyncRadosProcessor::m_tp", "rados_async", num_threads),
+    req_throttle(_cct, "rgw_async_rados_ops", num_threads * 2),
+    req_wq(this,
+           ceph::make_timespan(g_conf()->rgw_op_thread_timeout),
+           ceph::make_timespan(g_conf()->rgw_op_thread_suicide_timeout),
+           &m_tp) {
+}
+
+void RGWAsyncRadosProcessor::start() {
+  m_tp.start();
+}
+
+/* stop accepting work, drain outstanding requests, then drop the refs
+ * of anything still queued */
+void RGWAsyncRadosProcessor::stop() {
+  going_down = true;
+  m_tp.drain(&req_wq);
+  m_tp.stop();
+  for (auto iter = m_req_queue.begin(); iter != m_req_queue.end(); ++iter) {
+    (*iter)->put();
+  }
+}
+
+/* execute one request synchronously and drop the ref taken at enqueue */
+void RGWAsyncRadosProcessor::handle_request(const DoutPrefixProvider *dpp, RGWAsyncRadosRequest *req) {
+  req->send_request(dpp);
+  req->put();
+}
+
+/* submit a request; blocks on the throttle when too many are in flight */
+void RGWAsyncRadosProcessor::queue(RGWAsyncRadosRequest *req) {
+  req_throttle.get(1);
+  req_wq.queue(req);
+}
+
+/* synchronously read a system object (and optionally its xattrs) via
+ * the sysobj service; runs on an async-rados worker thread */
+int RGWAsyncGetSystemObj::_send_request(const DoutPrefixProvider *dpp)
+{
+  map<string, bufferlist> *pattrs = want_attrs ? &attrs : nullptr;
+
+  auto sysobj = svc_sysobj->get_obj(obj);
+  return sysobj.rop()
+               .set_objv_tracker(&objv_tracker)
+               .set_attrs(pattrs)
+               .set_raw_attrs(raw_attrs)
+               .read(dpp, &bl, null_yield);
+}
+
+RGWAsyncGetSystemObj::RGWAsyncGetSystemObj(const DoutPrefixProvider *_dpp, RGWCoroutine *caller, RGWAioCompletionNotifier *cn, RGWSI_SysObj *_svc,
+                                           RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
+                                           bool want_attrs, bool raw_attrs)
+  : RGWAsyncRadosRequest(caller, cn), dpp(_dpp), svc_sysobj(_svc),
+    obj(_obj), want_attrs(want_attrs), raw_attrs(raw_attrs)
+{
+  /* copy the tracker so the request holds its own snapshot */
+  if (_objv_tracker) {
+    objv_tracker = *_objv_tracker;
+  }
+}
+
+/* issue an async getxattrs on the raw object; when raw_attrs is false
+ * the attrs are read into a temporary and filtered in request_complete */
+int RGWSimpleRadosReadAttrsCR::send_request(const DoutPrefixProvider *dpp)
+{
+  int r = store->getRados()->get_raw_obj_ref(dpp, obj, &ref);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret="
+                       << r << dendl;
+    return r;
+  }
+
+  set_status() << "sending request";
+
+  librados::ObjectReadOperation op;
+  if (objv_tracker) {
+    objv_tracker->prepare_op_for_read(&op);
+  }
+
+  if (raw_attrs && pattrs) {
+    op.getxattrs(pattrs, nullptr);
+  } else {
+    op.getxattrs(&unfiltered_attrs, nullptr);
+  }
+
+  cn = stack->create_completion_notifier();
+  return ref.pool.ioctx().aio_operate(ref.obj.oid, cn->completion(), &op,
+                                      nullptr);
+}
+
+/* on completion, filter down to the rgw. attr namespace unless the
+ * caller asked for raw attrs */
+int RGWSimpleRadosReadAttrsCR::request_complete()
+{
+  int ret = cn->completion()->get_return_value();
+  set_status() << "request complete; ret=" << ret;
+  if (!raw_attrs && pattrs) {
+    rgw_filter_attrset(unfiltered_attrs, RGW_ATTR_PREFIX, pattrs);
+  }
+  return ret;
+}
+
+/* synchronously write a system object's data via the sysobj service */
+int RGWAsyncPutSystemObj::_send_request(const DoutPrefixProvider *dpp)
+{
+  auto sysobj = svc->get_obj(obj);
+  return sysobj.wop()
+               .set_objv_tracker(&objv_tracker)
+               .set_exclusive(exclusive)
+               .write_data(dpp, bl, null_yield);
+}
+
+RGWAsyncPutSystemObj::RGWAsyncPutSystemObj(const DoutPrefixProvider *_dpp,
+                                           RGWCoroutine *caller,
+                                           RGWAioCompletionNotifier *cn,
+                                           RGWSI_SysObj *_svc,
+                                           RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
+                                           bool _exclusive, bufferlist _bl)
+  : RGWAsyncRadosRequest(caller, cn), dpp(_dpp), svc(_svc),
+    obj(_obj), exclusive(_exclusive), bl(std::move(_bl))
+{
+  /* copy the tracker so the request holds its own snapshot */
+  if (_objv_tracker) {
+    objv_tracker = *_objv_tracker;
+  }
+}
+
+/* synchronously write only the attrs of a system object */
+int RGWAsyncPutSystemObjAttrs::_send_request(const DoutPrefixProvider *dpp)
+{
+  auto sysobj = svc->get_obj(obj);
+  return sysobj.wop()
+               .set_objv_tracker(&objv_tracker)
+               .set_exclusive(exclusive)
+               .set_attrs(attrs)
+               .write_attrs(dpp, null_yield);
+}
+
+RGWAsyncPutSystemObjAttrs::RGWAsyncPutSystemObjAttrs(const DoutPrefixProvider *_dpp, RGWCoroutine *caller, RGWAioCompletionNotifier *cn,
+                                                     RGWSI_SysObj *_svc,
+                                                     RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
+                                                     map<string, bufferlist> _attrs, bool exclusive)
+  : RGWAsyncRadosRequest(caller, cn), dpp(_dpp), svc(_svc),
+    obj(_obj), attrs(std::move(_attrs)), exclusive(exclusive)
+{
+  /* copy the tracker so the request holds its own snapshot */
+  if (_objv_tracker) {
+    objv_tracker = *_objv_tracker;
+  }
+}
+
+
+/* consumer coroutine that batches appended keys and flushes them to the
+ * target object's omap in windows of _window_size entries */
+RGWOmapAppend::RGWOmapAppend(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store, const rgw_raw_obj& _obj,
+                             uint64_t _window_size)
+  : RGWConsumerCR<string>(_store->ctx()), async_rados(_async_rados),
+    store(_store), obj(_obj), going_down(false), num_pending_entries(0), window_size(_window_size), total_entries(0)
+{
+}
+
+/* take a renewable exclusive cls lock on the raw object, valid for
+ * duration_secs */
+int RGWAsyncLockSystemObj::_send_request(const DoutPrefixProvider *dpp)
+{
+  rgw_rados_ref ref;
+  int r = store->getRados()->get_raw_obj_ref(dpp, obj, &ref);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret=" << r << dendl;
+    return r;
+  }
+
+  rados::cls::lock::Lock l(lock_name);
+  utime_t duration(duration_secs, 0);
+  l.set_duration(duration);
+  l.set_cookie(cookie);
+  l.set_may_renew(true); /* re-locking with the same cookie extends the lock */
+
+  return l.lock_exclusive(&ref.pool.ioctx(), ref.obj.oid);
+}
+
+/* NOTE(review): _objv_tracker is accepted but not stored by this ctor;
+ * presumably kept for signature symmetry — confirm before removing */
+RGWAsyncLockSystemObj::RGWAsyncLockSystemObj(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* _store,
+                                             RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
+                                             const string& _name, const string& _cookie, uint32_t _duration_secs) : RGWAsyncRadosRequest(caller, cn), store(_store),
+                                                                                                                    obj(_obj),
+                                                                                                                    lock_name(_name),
+                                                                                                                    cookie(_cookie),
+                                                                                                                    duration_secs(_duration_secs)
+{
+}
+
+/* release a cls lock previously taken with the same name/cookie */
+int RGWAsyncUnlockSystemObj::_send_request(const DoutPrefixProvider *dpp)
+{
+  rgw_rados_ref ref;
+  int r = store->getRados()->get_raw_obj_ref(dpp, obj, &ref);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret=" << r << dendl;
+    return r;
+  }
+
+  rados::cls::lock::Lock l(lock_name);
+
+  l.set_cookie(cookie);
+
+  return l.unlock(&ref.pool.ioctx(), ref.obj.oid);
+}
+
+/* NOTE(review): _objv_tracker is accepted but not stored here either */
+RGWAsyncUnlockSystemObj::RGWAsyncUnlockSystemObj(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* _store,
+                                                 RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
+                                                 const string& _name, const string& _cookie) : RGWAsyncRadosRequest(caller, cn), store(_store),
+                                                                                               obj(_obj),
+                                                                                               lock_name(_name), cookie(_cookie)
+{
+}
+
+/* coroutine that sets a batch of omap keys on a raw object */
+RGWRadosSetOmapKeysCR::RGWRadosSetOmapKeysCR(rgw::sal::RadosStore* _store,
+                                             const rgw_raw_obj& _obj,
+                                             map<string, bufferlist>& _entries) : RGWSimpleCoroutine(_store->ctx()),
+                                                                                  store(_store),
+                                                                                  entries(_entries),
+                                                                                  obj(_obj), cn(NULL)
+{
+  /* Build a readable description: "set omap keys dest=<obj> keys=[k1, k2]".
+   * The previous code streamed s.str() back into s and emitted a stray
+   * ']' before the key list, garbling the description. */
+  stringstream& s = set_description();
+  s << "set omap keys dest=" << obj << " keys=[";
+  for (auto i = entries.begin(); i != entries.end(); ++i) {
+    if (i != entries.begin()) {
+      s << ", ";
+    }
+    s << i->first;
+  }
+  s << "]";
+}
+
+/* issue an async omap_set for all entries */
+int RGWRadosSetOmapKeysCR::send_request(const DoutPrefixProvider *dpp)
+{
+  int r = store->getRados()->get_raw_obj_ref(dpp, obj, &ref);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret=" << r << dendl;
+    return r;
+  }
+
+  set_status() << "sending request";
+
+  librados::ObjectWriteOperation op;
+  op.omap_set(entries);
+
+  cn = stack->create_completion_notifier();
+  return ref.pool.ioctx().aio_operate(ref.obj.oid, cn->completion(), &op);
+}
+
+/* report the async op's return value */
+int RGWRadosSetOmapKeysCR::request_complete()
+{
+  int r = cn->completion()->get_return_value();
+
+  set_status() << "request complete; ret=" << r;
+
+  return r;
+}
+
+/* coroutine that lists up to max_entries omap keys after 'marker';
+ * results land in the shared Result so they outlive the coroutine */
+RGWRadosGetOmapKeysCR::RGWRadosGetOmapKeysCR(rgw::sal::RadosStore* _store,
+                                             const rgw_raw_obj& _obj,
+                                             const string& _marker,
+                                             int _max_entries,
+                                             ResultPtr _result)
+  : RGWSimpleCoroutine(_store->ctx()), store(_store), obj(_obj),
+    marker(_marker), max_entries(_max_entries),
+    result(std::move(_result))
+{
+  ceph_assert(result); // must be allocated
+  set_description() << "get omap keys dest=" << obj << " marker=" << marker;
+}
+
+int RGWRadosGetOmapKeysCR::send_request(const DoutPrefixProvider *dpp) {
+  int r = store->getRados()->get_raw_obj_ref(dpp, obj, &result->ref);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret=" << r << dendl;
+    return r;
+  }
+
+  set_status() << "send request";
+
+  librados::ObjectReadOperation op;
+  op.omap_get_keys2(marker, max_entries, &result->entries, &result->more, nullptr);
+
+  /* the notifier keeps 'result' alive until the aio completes */
+  cn = stack->create_completion_notifier(result);
+  return result->ref.pool.ioctx().aio_operate(result->ref.obj.oid, cn->completion(), &op, NULL);
+}
+
+int RGWRadosGetOmapKeysCR::request_complete()
+{
+  int r = cn->completion()->get_return_value();
+
+  set_status() << "request complete; ret=" << r;
+
+  return r;
+}
+
+/* coroutine that lists up to max_entries omap key/value pairs after
+ * 'marker'; results land in the shared Result so they outlive the CR */
+RGWRadosGetOmapValsCR::RGWRadosGetOmapValsCR(rgw::sal::RadosStore* _store,
+                                             const rgw_raw_obj& _obj,
+                                             const string& _marker,
+                                             int _max_entries,
+                                             ResultPtr _result)
+  : RGWSimpleCoroutine(_store->ctx()), store(_store), obj(_obj),
+    marker(_marker), max_entries(_max_entries),
+    result(std::move(_result))
+{
+  ceph_assert(result); // must be allocated
+  // fixed copy-paste: this CR reads omap vals, not keys
+  set_description() << "get omap vals dest=" << obj << " marker=" << marker;
+}
+
+int RGWRadosGetOmapValsCR::send_request(const DoutPrefixProvider *dpp) {
+  int r = store->getRados()->get_raw_obj_ref(dpp, obj, &result->ref);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret=" << r << dendl;
+    return r;
+  }
+
+  set_status() << "send request";
+
+  librados::ObjectReadOperation op;
+  op.omap_get_vals2(marker, max_entries, &result->entries, &result->more, nullptr);
+
+  /* the notifier keeps 'result' alive until the aio completes */
+  cn = stack->create_completion_notifier(result);
+  return result->ref.pool.ioctx().aio_operate(result->ref.obj.oid, cn->completion(), &op, NULL);
+}
+
+int RGWRadosGetOmapValsCR::request_complete()
+{
+  int r = cn->completion()->get_return_value();
+
+  set_status() << "request complete; ret=" << r;
+
+  return r;
+}
+
+/* coroutine that removes a set of omap keys from a raw object */
+RGWRadosRemoveOmapKeysCR::RGWRadosRemoveOmapKeysCR(rgw::sal::RadosStore* _store,
+                                                   const rgw_raw_obj& _obj,
+                                                   const set<string>& _keys) : RGWSimpleCoroutine(_store->ctx()),
+                                                                               store(_store),
+                                                                               keys(_keys),
+                                                                               obj(_obj), cn(NULL)
+{
+  set_description() << "remove omap keys dest=" << obj << " keys=" << keys;
+}
+
+int RGWRadosRemoveOmapKeysCR::send_request(const DoutPrefixProvider *dpp) {
+  int r = store->getRados()->get_raw_obj_ref(dpp, obj, &ref);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret=" << r << dendl;
+    return r;
+  }
+
+  set_status() << "send request";
+
+  librados::ObjectWriteOperation op;
+  op.omap_rm_keys(keys);
+
+  cn = stack->create_completion_notifier();
+  return ref.pool.ioctx().aio_operate(ref.obj.oid, cn->completion(), &op);
+}
+
+int RGWRadosRemoveOmapKeysCR::request_complete()
+{
+  int r = cn->completion()->get_return_value();
+
+  set_status() << "request complete; ret=" << r;
+
+  return r;
+}
+
+/* coroutine that removes a raw object (optionally guarded by an object
+ * version tracker) using a freshly created ioctx */
+RGWRadosRemoveCR::RGWRadosRemoveCR(rgw::sal::RadosStore* store, const rgw_raw_obj& obj,
+                                   RGWObjVersionTracker* objv_tracker)
+  : RGWSimpleCoroutine(store->ctx()),
+    store(store), obj(obj), objv_tracker(objv_tracker)
+{
+  set_description() << "remove dest=" << obj;
+}
+
+int RGWRadosRemoveCR::send_request(const DoutPrefixProvider *dpp)
+{
+  auto rados = store->getRados()->get_rados_handle();
+  int r = rados->ioctx_create(obj.pool.name.c_str(), ioctx);
+  if (r < 0) {
+    // use the prefixed logger like every other send_request() in this
+    // file (was lderr(cct), losing the dout prefix)
+    ldpp_dout(dpp, -1) << "ERROR: failed to open pool (" << obj.pool.name << ") ret=" << r << dendl;
+    return r;
+  }
+  ioctx.locator_set_key(obj.loc);
+
+  set_status() << "send request";
+
+  librados::ObjectWriteOperation op;
+  if (objv_tracker) {
+    objv_tracker->prepare_op_for_write(&op);
+  }
+  op.remove();
+
+  cn = stack->create_completion_notifier();
+  return ioctx.aio_operate(obj.oid, cn->completion(), &op);
+}
+
+int RGWRadosRemoveCR::request_complete()
+{
+  int r = cn->completion()->get_return_value();
+
+  set_status() << "request complete; ret=" << r;
+
+  return r;
+}
+
+/* coroutine that removes an object by oid using a caller-supplied ioctx */
+RGWRadosRemoveOidCR::RGWRadosRemoveOidCR(rgw::sal::RadosStore* store,
+                                         librados::IoCtx&& ioctx,
+                                         std::string_view oid,
+                                         RGWObjVersionTracker* objv_tracker)
+  : RGWSimpleCoroutine(store->ctx()), ioctx(std::move(ioctx)),
+    oid(std::string(oid)), objv_tracker(objv_tracker)
+{
+  set_description() << "remove dest=" << oid;
+}
+
+/* convenience overload: copy ioctx/oid out of an existing rados obj ref */
+RGWRadosRemoveOidCR::RGWRadosRemoveOidCR(rgw::sal::RadosStore* store,
+                                         RGWSI_RADOS::Obj& obj,
+                                         RGWObjVersionTracker* objv_tracker)
+  : RGWSimpleCoroutine(store->ctx()),
+    ioctx(librados::IoCtx(obj.get_ref().pool.ioctx())),
+    oid(obj.get_ref().obj.oid),
+    objv_tracker(objv_tracker)
+{
+  set_description() << "remove dest=" << oid;
+}
+
+/* rvalue overload: steal ioctx/oid from a temporary rados obj */
+RGWRadosRemoveOidCR::RGWRadosRemoveOidCR(rgw::sal::RadosStore* store,
+                                         RGWSI_RADOS::Obj&& obj,
+                                         RGWObjVersionTracker* objv_tracker)
+  : RGWSimpleCoroutine(store->ctx()),
+    ioctx(std::move(obj.get_ref().pool.ioctx())),
+    oid(std::move(obj.get_ref().obj.oid)),
+    objv_tracker(objv_tracker)
+{
+  set_description() << "remove dest=" << oid;
+}
+
+int RGWRadosRemoveOidCR::send_request(const DoutPrefixProvider *dpp)
+{
+  librados::ObjectWriteOperation op;
+  if (objv_tracker) {
+    objv_tracker->prepare_op_for_write(&op);
+  }
+  op.remove();
+
+  cn = stack->create_completion_notifier();
+  return ioctx.aio_operate(oid, cn->completion(), &op);
+}
+
+int RGWRadosRemoveOidCR::request_complete()
+{
+  int r = cn->completion()->get_return_value();
+
+  set_status() << "request complete; ret=" << r;
+
+  return r;
+}
+
+/* coroutine wrapper around RGWAsyncLockSystemObj: takes an exclusive
+ * cls lock on 'obj' via the async-rados thread pool */
+RGWSimpleRadosLockCR::RGWSimpleRadosLockCR(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store,
+                                           const rgw_raw_obj& _obj,
+                                           const string& _lock_name,
+                                           const string& _cookie,
+                                           uint32_t _duration) : RGWSimpleCoroutine(_store->ctx()),
+                                                                 async_rados(_async_rados),
+                                                                 store(_store),
+                                                                 lock_name(_lock_name),
+                                                                 cookie(_cookie),
+                                                                 duration(_duration),
+                                                                 obj(_obj),
+                                                                 req(NULL)
+{
+  set_description() << "rados lock dest=" << obj << " lock=" << lock_name << " cookie=" << cookie << " duration=" << duration;
+}
+
+/* drop the async request if the coroutine is torn down early */
+void RGWSimpleRadosLockCR::request_cleanup()
+{
+  if (req) {
+    req->finish();
+    req = NULL;
+  }
+}
+
+int RGWSimpleRadosLockCR::send_request(const DoutPrefixProvider *dpp)
+{
+  set_status() << "sending request";
+  req = new RGWAsyncLockSystemObj(this, stack->create_completion_notifier(),
+                                  store, NULL, obj, lock_name, cookie, duration);
+  async_rados->queue(req);
+  return 0;
+}
+
+int RGWSimpleRadosLockCR::request_complete()
+{
+  set_status() << "request complete; ret=" << req->get_ret_status();
+  return req->get_ret_status();
+}
+
+/* coroutine wrapper around RGWAsyncUnlockSystemObj: releases the cls
+ * lock taken with the same lock name and cookie */
+RGWSimpleRadosUnlockCR::RGWSimpleRadosUnlockCR(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store,
+                                               const rgw_raw_obj& _obj,
+                                               const string& _lock_name,
+                                               const string& _cookie) : RGWSimpleCoroutine(_store->ctx()),
+                                                                        async_rados(_async_rados),
+                                                                        store(_store),
+                                                                        lock_name(_lock_name),
+                                                                        cookie(_cookie),
+                                                                        obj(_obj),
+                                                                        req(NULL)
+{
+  set_description() << "rados unlock dest=" << obj << " lock=" << lock_name << " cookie=" << cookie;
+}
+
+/* drop the async request if the coroutine is torn down early */
+void RGWSimpleRadosUnlockCR::request_cleanup()
+{
+  if (req) {
+    req->finish();
+    req = NULL;
+  }
+}
+
+int RGWSimpleRadosUnlockCR::send_request(const DoutPrefixProvider *dpp)
+{
+  set_status() << "sending request";
+
+  req = new RGWAsyncUnlockSystemObj(this, stack->create_completion_notifier(),
+                                    store, NULL, obj, lock_name, cookie);
+  async_rados->queue(req);
+  return 0;
+}
+
+int RGWSimpleRadosUnlockCR::request_complete()
+{
+  set_status() << "request complete; ret=" << req->get_ret_status();
+  return req->get_ret_status();
+}
+
+/* main coroutine loop: wait for produced entries, accumulate them into
+ * 'entries', and flush a batch to the object's omap whenever the window
+ * fills or we are shutting down. Exits once going_down is set and all
+ * pending product has been drained. */
+int RGWOmapAppend::operate(const DoutPrefixProvider *dpp) {
+  reenter(this) {
+    for (;;) {
+      if (!has_product() && going_down) {
+        set_status() << "going down";
+        break;
+      }
+      set_status() << "waiting for product";
+      yield wait_for_product();
+      yield {
+        string entry;
+        while (consume(&entry)) {
+          set_status() << "adding entry: " << entry;
+          entries[entry] = bufferlist();
+          if (entries.size() >= window_size) {
+            break;
+          }
+        }
+        if (entries.size() >= window_size || going_down) {
+          set_status() << "flushing to omap";
+          call(new RGWRadosSetOmapKeysCR(store, obj, entries));
+          entries.clear();
+        }
+      }
+      if (get_ret_status() < 0) {
+        ldout(cct, 0) << "ERROR: failed to store entries in omap" << dendl;
+        return set_state(RGWCoroutine_Error);
+      }
+    }
+    /* done with coroutine */
+    return set_state(RGWCoroutine_Done);
+  }
+  return 0;
+}
+
+/* hand the locally buffered entries to the consumer coroutine */
+void RGWOmapAppend::flush_pending() {
+  receive(pending_entries);
+  num_pending_entries = 0;
+}
+
+/* buffer one key for appending; flushes when the window fills.
+ * returns false if the coroutine has already finished. */
+bool RGWOmapAppend::append(const string& s) {
+  if (is_done()) {
+    return false;
+  }
+  ++total_entries;
+  pending_entries.push_back(s);
+  if (++num_pending_entries >= (int)window_size) {
+    flush_pending();
+  }
+  return true;
+}
+
+/* signal shutdown, push any remaining entries, and wake the coroutine;
+ * returns true while the coroutine is still running */
+bool RGWOmapAppend::finish() {
+  going_down = true;
+  flush_pending();
+  set_sleeping(false);
+  return (!is_done());
+}
+
+/* fetch bucket instance info: directly by instance id when present,
+ * otherwise resolve through the bucket metadata handler */
+int RGWAsyncGetBucketInstanceInfo::_send_request(const DoutPrefixProvider *dpp)
+{
+  int r;
+  if (!bucket.bucket_id.empty()) {
+    r = store->getRados()->get_bucket_instance_info(bucket, bucket_info, nullptr, &attrs, null_yield, dpp);
+  } else {
+    r = store->ctl()->bucket->read_bucket_info(bucket, &bucket_info, null_yield, dpp,
+                                               RGWBucketCtl::BucketInstance::GetParams().set_attrs(&attrs));
+  }
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to get bucket instance info for "
+                      << bucket << dendl;
+    return r;
+  }
+
+  return 0;
+}
+
+/* write back bucket instance info (and attrs) with the given mtime */
+int RGWAsyncPutBucketInstanceInfo::_send_request(const DoutPrefixProvider *dpp)
+{
+  auto r = store->getRados()->put_bucket_instance_info(bucket_info, exclusive,
+                                                       mtime, attrs, dpp, null_yield);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to put bucket instance info for "
+                      << bucket_info.bucket << dendl;
+    return r;
+  }
+
+  return 0;
+}
+
+// Coroutine that trims one shard of a bucket index log.  The start/end
+// markers are converted to this shard's local markers up front.
+RGWRadosBILogTrimCR::RGWRadosBILogTrimCR(
+  const DoutPrefixProvider *dpp,
+  rgw::sal::RadosStore* store,
+  const RGWBucketInfo& bucket_info,
+  int shard_id,
+  const rgw::bucket_index_layout_generation& generation,
+  const std::string& start_marker,
+  const std::string& end_marker)
+  : RGWSimpleCoroutine(store->ctx()), bucket_info(bucket_info),
+    shard_id(shard_id), generation(generation), bs(store->getRados()),
+    start_marker(BucketIndexShardsManager::get_shard_marker(start_marker)),
+    end_marker(BucketIndexShardsManager::get_shard_marker(end_marker))
+{
+}
+
+// Open the bucket index shard and issue an async cls bi-log trim op on it.
+int RGWRadosBILogTrimCR::send_request(const DoutPrefixProvider *dpp)
+{
+  int r = bs.init(dpp, bucket_info, generation, shard_id);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: bucket shard init failed ret=" << r << dendl;
+    return r;
+  }
+
+  bufferlist in;
+  cls_rgw_bi_log_trim_op call;
+  // NOTE: the markers are moved out of the members here, so this
+  // send_request() is effectively single-shot
+  call.start_marker = std::move(start_marker);
+  call.end_marker = std::move(end_marker);
+  encode(call, in);
+
+  librados::ObjectWriteOperation op;
+  op.exec(RGW_CLASS, RGW_BI_LOG_TRIM, in);
+
+  // async exec; the completion notifier wakes the coroutine stack
+  cn = stack->create_completion_notifier();
+  return bs.bucket_obj.aio_operate(cn->completion(), &op);
+}
+
+// Collect the result of the async bi-log trim operation.
+int RGWRadosBILogTrimCR::request_complete()
+{
+  const int ret = cn->completion()->get_return_value();
+  set_status() << "request complete; ret=" << ret;
+  return ret;
+}
+
+// Pull one object from a remote zone into the local destination bucket, then
+// (on an actual transfer) fire an ObjectSyncedCreate bucket notification and
+// bump the sync perf counters.  Returns the fetch result; notification
+// failures are logged but do not fail the sync.
+int RGWAsyncFetchRemoteObj::_send_request(const DoutPrefixProvider *dpp)
+{
+  RGWObjectCtx obj_ctx(store);
+
+  char buf[16];
+  snprintf(buf, sizeof(buf), ".%lld", (long long)store->getRados()->instance_id());
+  rgw::sal::Attrs attrs;
+
+  rgw_obj src_obj(src_bucket, key);
+
+  rgw::sal::RadosBucket dest_bucket(store, dest_bucket_info);
+  // if no explicit destination key was given, mirror the source key
+  rgw::sal::RadosObject dest_obj(store, dest_key.value_or(key), &dest_bucket);
+
+  std::string etag;
+
+  // set only when bytes were actually copied (i.e. not a not-modified result)
+  std::optional<uint64_t> bytes_transferred;
+  int r = store->getRados()->fetch_remote_obj(obj_ctx,
+                       user_id.value_or(rgw_user()),
+                       NULL, /* req_info */
+                       source_zone,
+                       dest_obj.get_obj(),
+                       src_obj,
+                       dest_bucket_info, /* dest */
+                       nullptr, /* source */
+                       dest_placement_rule,
+                       nullptr, /* real_time* src_mtime, */
+                       NULL, /* real_time* mtime, */
+                       NULL, /* const real_time* mod_ptr, */
+                       NULL, /* const real_time* unmod_ptr, */
+                       false, /* high precision time */
+                       NULL, /* const char *if_match, */
+                       NULL, /* const char *if_nomatch, */
+                       RGWRados::ATTRSMOD_NONE,
+                       copy_if_newer,
+                       attrs,
+                       RGWObjCategory::Main,
+                       versioned_epoch,
+                       real_time(), /* delete_at */
+                       NULL, /* string *ptag, */
+                       &etag, /* string *petag, */
+                       NULL, /* void (*progress_cb)(off_t, void *), */
+                       NULL, /* void *progress_data*); */
+                       dpp,
+                       filter.get(),
+                       source_trace_entry,
+                       &zones_trace,
+                       &bytes_transferred);
+
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "store->fetch_remote_obj() returned r=" << r << dendl;
+    if (counters) {
+      counters->inc(sync_counters::l_fetch_err, 1);
+    }
+  } else {
+      // r >= 0
+      if (bytes_transferred) {
+        // send notification that object was successfully synced
+        std::string user_id = "rgw sync";
+        std::string req_id = "0";
+
+        // decode the destination object's tag set, if any, so it can be
+        // attached to the notification
+        RGWObjTags obj_tags;
+        auto iter = attrs.find(RGW_ATTR_TAGS);
+        if (iter != attrs.end()) {
+          try {
+            auto it = iter->second.cbegin();
+            obj_tags.decode(it);
+          } catch (buffer::error &err) {
+            ldpp_dout(dpp, 1) << "ERROR: " << __func__ << ": caught buffer::error couldn't decode TagSet " << dendl;
+          }
+        }
+
+        // NOTE: we create a mutable copy of bucket.get_tenant as the get_notification function expects a std::string&, not const
+        std::string tenant(dest_bucket.get_tenant());
+
+        std::unique_ptr<rgw::sal::Notification> notify
+                 = store->get_notification(dpp, &dest_obj, nullptr, rgw::notify::ObjectSyncedCreate,
+                  &dest_bucket, user_id,
+                  tenant,
+                  req_id, null_yield);
+
+        auto notify_res = static_cast<rgw::sal::RadosNotification*>(notify.get())->get_reservation();
+        int ret = rgw::notify::publish_reserve(dpp, rgw::notify::ObjectSyncedCreate, notify_res, &obj_tags);
+        if (ret < 0) {
+          ldpp_dout(dpp, 1) << "ERROR: reserving notification failed, with error: " << ret << dendl;
+          // no need to return, the sync already happened
+        } else {
+          ret = rgw::notify::publish_commit(&dest_obj, *bytes_transferred, ceph::real_clock::now(), etag, dest_obj.get_instance(), rgw::notify::ObjectSyncedCreate, notify_res, dpp);
+          if (ret < 0) {
+            ldpp_dout(dpp, 1) << "ERROR: publishing notification failed, with error: " << ret << dendl;
+          }
+        }
+      }
+
+      if (counters) {
+        if (bytes_transferred) {
+          counters->inc(sync_counters::l_fetch, *bytes_transferred);
+        } else {
+          // remote object was not modified; count it separately
+          counters->inc(sync_counters::l_fetch_not_modified);
+        }
+      }
+  }
+  return r;
+}
+
+// Issue a stat of an object on a remote zone, filling the caller-provided
+// out-params (pmtime, psize, pattrs, pheaders, petag).  Returns the result of
+// stat_remote_obj(); errors are logged.
+// Fix: the original declared a char buf[16] and snprintf'd the instance id
+// into it but never used it — dead code removed.
+int RGWAsyncStatRemoteObj::_send_request(const DoutPrefixProvider *dpp)
+{
+  RGWObjectCtx obj_ctx(store);
+
+  string user_id;  // no acting user for sync stat requests
+
+  rgw_obj src_obj(src_bucket, key);
+
+  int r = store->getRados()->stat_remote_obj(dpp,
+                       obj_ctx,
+                       rgw_user(user_id),
+                       nullptr, /* req_info */
+                       source_zone,
+                       src_obj,
+                       nullptr, /* source */
+                       pmtime, /* real_time* src_mtime, */
+                       psize, /* uint64_t * */
+                       nullptr, /* const real_time* mod_ptr, */
+                       nullptr, /* const real_time* unmod_ptr, */
+                       true, /* high precision time */
+                       nullptr, /* const char *if_match, */
+                       nullptr, /* const char *if_nomatch, */
+                       pattrs,
+                       pheaders,
+                       nullptr,
+                       nullptr, /* string *ptag, */
+                       petag); /* string *petag, */
+
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "store->stat_remote_obj() returned r=" << r << dendl;
+  }
+  return r;
+}
+
+
+// Delete an object as part of sync: read its current state, skip the delete
+// if a newer local write raced us (del_if_older), carry over the object's ACL
+// owner, and perform a (possibly versioned) delete with the sync timestamp
+// and zones trace attached.
+int RGWAsyncRemoveObj::_send_request(const DoutPrefixProvider *dpp)
+{
+  ldpp_dout(dpp, 0) << __func__ << "(): deleting obj=" << obj << dendl;
+
+  obj->set_atomic();
+
+  RGWObjState *state;
+
+  int ret = obj->get_obj_state(dpp, &state, null_yield);
+  if (ret < 0) {
+    ldpp_dout(dpp, 20) << __func__ << "(): get_obj_state() obj=" << obj << " returned ret=" << ret << dendl;
+    return ret;
+  }
+
+  /* has there been any racing object write? */
+  if (del_if_older && (state->mtime > timestamp)) {
+    ldpp_dout(dpp, 20) << __func__ << "(): skipping object removal obj=" << obj << " (obj mtime=" << state->mtime << ", request timestamp=" << timestamp << ")" << dendl;
+    return 0;
+  }
+
+  RGWAccessControlPolicy policy;
+
+  /* decode policy */
+  map<string, bufferlist>::iterator iter = state->attrset.find(RGW_ATTR_ACL);
+  if (iter != state->attrset.end()) {
+    auto bliter = iter->second.cbegin();
+    try {
+      policy.decode(bliter);
+    } catch (buffer::error& err) {
+      ldpp_dout(dpp, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
+      return -EIO;
+    }
+  }
+
+  std::unique_ptr<rgw::sal::Object::DeleteOp> del_op = obj->get_delete_op();
+
+  del_op->params.bucket_owner = bucket->get_info().owner;
+  del_op->params.obj_owner = policy.get_owner();
+  if (del_if_older) {
+    // only delete if the object has not been modified since 'timestamp'
+    del_op->params.unmod_since = timestamp;
+  }
+  if (versioned) {
+    del_op->params.versioning_status = BUCKET_VERSIONED;
+  }
+  del_op->params.olh_epoch = versioned_epoch;
+  del_op->params.marker_version_id = marker_version_id;
+  // owner identity comes from the remote side's log entry
+  del_op->params.obj_owner.set_id(rgw_user(owner));
+  del_op->params.obj_owner.set_name(owner_display_name);
+  del_op->params.mtime = timestamp;
+  del_op->params.high_precision_time = true;
+  del_op->params.zones_trace = &zones_trace;
+
+  ret = del_op->delete_obj(dpp, null_yield);
+  if (ret < 0) {
+    ldpp_dout(dpp, 20) << __func__ << "(): delete_obj() obj=" << obj << " returned ret=" << ret << dendl;
+  }
+  return ret;
+}
+
+// Hold a lock on obj:lock_name by renewing it every interval/2 seconds until
+// shutdown (going_down) is requested, then release it.  Updates the caller's
+// locked state via set_locked() and wakes the caller after each renewal try.
+// Fix: the warning log used "90\%" — '\%' is not a valid C++ escape sequence
+// (compilers emit "unknown escape sequence" warnings); a plain '%' needs no
+// escaping in a string literal.
+int RGWContinuousLeaseCR::operate(const DoutPrefixProvider *dpp)
+{
+  if (aborted) {
+    caller->set_sleeping(false);
+    return set_cr_done();
+  }
+  reenter(this) {
+    last_renew_try_time = ceph::coarse_mono_clock::now();
+    while (!going_down) {
+      current_time = ceph::coarse_mono_clock::now();
+      yield call(new RGWSimpleRadosLockCR(async_rados, store, obj, lock_name, cookie, interval));
+      if (latency) {
+        latency->add_latency(ceph::coarse_mono_clock::now() - current_time);
+      }
+      current_time = ceph::coarse_mono_clock::now();
+      if (current_time - last_renew_try_time > interval_tolerance) {
+        // renewal should happen between 50%-90% of interval
+        ldout(store->ctx(), 1) << *this << ": WARNING: did not renew lock " << obj << ":" << lock_name << ": within 90% of interval. " <<
+          (current_time - last_renew_try_time) << " > " << interval_tolerance << dendl;
+      }
+      last_renew_try_time = current_time;
+
+      caller->set_sleeping(false); /* will only be relevant when we return, that's why we can do it early */
+      if (retcode < 0) {
+        set_locked(false);
+        ldout(store->ctx(), 20) << *this << ": couldn't lock " << obj << ":" << lock_name << ": retcode=" << retcode << dendl;
+        return set_state(RGWCoroutine_Error, retcode);
+      }
+      ldout(store->ctx(), 20) << *this << ": successfully locked " << obj << ":" << lock_name << dendl;
+      set_locked(true);
+      yield wait(utime_t(interval / 2, 0));
+    }
+    set_locked(false); /* moot at this point anyway */
+    current_time = ceph::coarse_mono_clock::now();
+    yield call(new RGWSimpleRadosUnlockCR(async_rados, store, obj, lock_name, cookie));
+    if (latency) {
+      latency->add_latency(ceph::coarse_mono_clock::now() - current_time);
+    }
+    return set_state(RGWCoroutine_Done);
+  }
+  return 0;
+}
+
+// Coroutine that appends a single cls_log entry to the timelog object 'oid'.
+// Fix: the status description was missing a separator between the oid and
+// "entry={...}", producing run-together text like "oid=fooentry={...}".
+RGWRadosTimelogAddCR::RGWRadosTimelogAddCR(const DoutPrefixProvider *_dpp, rgw::sal::RadosStore* _store, const string& _oid,
+                      const cls_log_entry& entry) : RGWSimpleCoroutine(_store->ctx()),
+                                                dpp(_dpp),
+                                                store(_store),
+                                                oid(_oid), cn(NULL)
+{
+  stringstream& s = set_description();
+  s << "timelog add entry oid=" << oid << " entry={id=" << entry.id << ", section=" << entry.section << ", name=" << entry.name << "}";
+  entries.push_back(entry);
+}
+
+// Queue the async timelog add; the completion notifier wakes the stack.
+int RGWRadosTimelogAddCR::send_request(const DoutPrefixProvider *dpp)
+{
+  set_status() << "sending request";
+
+  cn = stack->create_completion_notifier();
+  return store->svc()->cls->timelog.add(dpp, oid, entries, cn->completion(),
+                                        true, null_yield);
+}
+
+// Collect the result of the async timelog add.
+int RGWRadosTimelogAddCR::request_complete()
+{
+  const int ret = cn->completion()->get_return_value();
+  set_status() << "request complete; ret=" << ret;
+  return ret;
+}
+
+// Coroutine that trims a timelog object either by time range or by marker
+// range; empty/zero values leave the corresponding bound open.
+RGWRadosTimelogTrimCR::RGWRadosTimelogTrimCR(const DoutPrefixProvider *dpp,
+                                             rgw::sal::RadosStore* store,
+                                             const std::string& oid,
+                                             const real_time& start_time,
+                                             const real_time& end_time,
+                                             const std::string& from_marker,
+                                             const std::string& to_marker)
+  : RGWSimpleCoroutine(store->ctx()), dpp(dpp), store(store), oid(oid),
+    start_time(start_time), end_time(end_time),
+    from_marker(from_marker), to_marker(to_marker)
+{
+  set_description() << "timelog trim oid=" << oid
+      << " start_time=" << start_time << " end_time=" << end_time
+      << " from_marker=" << from_marker << " to_marker=" << to_marker;
+}
+
+// Queue the async timelog trim; the completion notifier wakes the stack.
+int RGWRadosTimelogTrimCR::send_request(const DoutPrefixProvider *dpp)
+{
+  set_status() << "sending request";
+
+  cn = stack->create_completion_notifier();
+  return store->svc()->cls->timelog.trim(dpp, oid, start_time, end_time,
+                                         from_marker, to_marker,
+                                         cn->completion(), null_yield);
+}
+
+// Collect the result of the async timelog trim.
+int RGWRadosTimelogTrimCR::request_complete()
+{
+  const int ret = cn->completion()->get_return_value();
+  set_status() << "request complete; ret=" << ret;
+  return ret;
+}
+
+
+// Marker-based timelog trim used by sync: trims everything up to to_marker
+// and tracks the last successfully trimmed position in *last_trim_marker.
+RGWSyncLogTrimCR::RGWSyncLogTrimCR(const DoutPrefixProvider *dpp,
+                                   rgw::sal::RadosStore* store, const std::string& oid,
+                                   const std::string& to_marker,
+                                   std::string *last_trim_marker)
+  : RGWRadosTimelogTrimCR(dpp, store, oid, real_time{}, real_time{},
+                          std::string{}, to_marker),
+    cct(store->ctx()), last_trim_marker(last_trim_marker)
+{
+}
+
+// Treat -ENODATA ("nothing left to trim") as success and advance
+// *last_trim_marker to to_marker, unless to_marker is the max_marker
+// sentinel; any other result is passed through unchanged.
+int RGWSyncLogTrimCR::request_complete()
+{
+  int r = RGWRadosTimelogTrimCR::request_complete();
+  if (r != -ENODATA) {
+    return r;
+  }
+  // nothing left to trim, update last_trim_marker
+  if (*last_trim_marker < to_marker && to_marker != max_marker) {
+    *last_trim_marker = to_marker;
+  }
+  return 0;
+}
+
+
+// Map the logical object to its raw rados object and stat it, filling the
+// caller-provided size/mtime/epoch out-params.
+int RGWAsyncStatObj::_send_request(const DoutPrefixProvider *dpp)
+{
+  rgw_raw_obj raw_obj;
+  store->getRados()->obj_to_raw(bucket_info.placement_rule, obj, &raw_obj);
+  return store->getRados()->raw_obj_stat(dpp, raw_obj, psize, pmtime, pepoch,
+                                         nullptr, nullptr, objv_tracker, null_yield);
+}
+
+// Coroutine wrapper around RGWAsyncStatObj; out-params may be null if the
+// caller does not need them.
+RGWStatObjCR::RGWStatObjCR(const DoutPrefixProvider *dpp,
+                           RGWAsyncRadosProcessor *async_rados, rgw::sal::RadosStore* store,
+                           const RGWBucketInfo& _bucket_info, const rgw_obj& obj, uint64_t *psize,
+                           real_time* pmtime, uint64_t *pepoch,
+                           RGWObjVersionTracker *objv_tracker)
+  : RGWSimpleCoroutine(store->ctx()), dpp(dpp), store(store), async_rados(async_rados),
+    bucket_info(_bucket_info), obj(obj), psize(psize), pmtime(pmtime), pepoch(pepoch),
+    objv_tracker(objv_tracker)
+{
+}
+
+// Release the outstanding async request, if any; finish() drops the
+// request's notifier reference and its own.
+void RGWStatObjCR::request_cleanup()
+{
+  if (!req) {
+    return;
+  }
+  req->finish();
+  req = nullptr;
+}
+
+// Hand the stat off to the async rados thread pool; completion is delivered
+// through the stack's completion notifier.
+int RGWStatObjCR::send_request(const DoutPrefixProvider *dpp)
+{
+  req = new RGWAsyncStatObj(dpp, this, stack->create_completion_notifier(),
+                            store, bucket_info, obj, psize, pmtime, pepoch, objv_tracker);
+  async_rados->queue(req);
+  return 0;
+}
+
+// Report the async request's final status.
+int RGWStatObjCR::request_complete()
+{
+  return req->get_ret_status();
+}
+
+// Coroutine that sends a rados watch/notify to 'obj' and optionally collects
+// the aggregated responses into *response.
+RGWRadosNotifyCR::RGWRadosNotifyCR(rgw::sal::RadosStore* store, const rgw_raw_obj& obj,
+                                   bufferlist& request, uint64_t timeout_ms,
+                                   bufferlist *response)
+  : RGWSimpleCoroutine(store->ctx()), store(store), obj(obj),
+    request(request), timeout_ms(timeout_ms), response(response)
+{
+  set_description() << "notify dest=" << obj;
+}
+
+// Resolve the raw object reference and issue the async notify on it.
+int RGWRadosNotifyCR::send_request(const DoutPrefixProvider *dpp)
+{
+  int r = store->getRados()->get_raw_obj_ref(dpp, obj, &ref);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret=" << r << dendl;
+    return r;
+  }
+
+  set_status() << "sending request";
+
+  cn = stack->create_completion_notifier();
+  return ref.pool.ioctx().aio_notify(ref.obj.oid, cn->completion(), request,
+                                     timeout_ms, response);
+}
+
+// Collect the result of the async notify.
+int RGWRadosNotifyCR::request_complete()
+{
+  const int ret = cn->completion()->get_return_value();
+  set_status() << "request complete; ret=" << ret;
+  return ret;
+}
+
+
+// Notify a peer zone of updated datalog shards.  Tries the v2 "notify2"
+// endpoint first; if the peer is too old (-ERR_METHOD_NOT_ALLOWED) falls
+// back to the legacy v1 "notify" encoding of the same shard map.
+int RGWDataPostNotifyCR::operate(const DoutPrefixProvider* dpp)
+{
+  reenter(this) {
+    using PostNotify2 = RGWPostRESTResourceCR<bc::flat_map<int, bc::flat_set<rgw_data_notify_entry>>, int>;
+    yield {
+      rgw_http_param_pair pairs[] = { { "type", "data" },
+                                      { "notify2", NULL },
+                                      { "source-zone", source_zone },
+                                      { NULL, NULL } };
+      call(new PostNotify2(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, nullptr));
+    }
+    // peer doesn't support notify2; retry with the v1 wire format
+    if (retcode == -ERR_METHOD_NOT_ALLOWED) {
+      using PostNotify1 = RGWPostRESTResourceCR<rgw_data_notify_v1_encoder, int>;
+      yield {
+        rgw_http_param_pair pairs[] = { { "type", "data" },
+                                        { "notify", NULL },
+                                        { "source-zone", source_zone },
+                                        { NULL, NULL } };
+        auto encoder = rgw_data_notify_v1_encoder{shards};
+        call(new PostNotify1(store->ctx(), conn, &http_manager, "/admin/log", pairs, encoder, nullptr));
+      }
+    }
+    if (retcode < 0) {
+      return set_cr_error(retcode);
+    }
+    return set_cr_done();
+  }
+  return 0;
+}
diff --git a/src/rgw/driver/rados/rgw_cr_rados.h b/src/rgw/driver/rados/rgw_cr_rados.h
new file mode 100644
index 000000000..7bda18878
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_cr_rados.h
@@ -0,0 +1,1647 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <boost/intrusive_ptr.hpp>
+#include "include/ceph_assert.h"
+#include "rgw_coroutine.h"
+#include "rgw_sal.h"
+#include "rgw_sal_rados.h"
+#include "common/WorkQueue.h"
+#include "common/Throttle.h"
+
+#include <atomic>
+#include "common/ceph_time.h"
+
+#include "services/svc_sys_obj.h"
+#include "services/svc_bucket.h"
+
+struct rgw_http_param_pair;
+class RGWRESTConn;
+
+// Base class for blocking rados work executed on the RGWAsyncRadosProcessor
+// thread pool on behalf of a coroutine.  Subclasses implement _send_request();
+// completion is reported back to the coroutine through the notifier.  The
+// internal lock arbitrates between the worker thread (send_request) and the
+// owning coroutine (finish) racing to consume the notifier.
+class RGWAsyncRadosRequest : public RefCountedObject {
+  RGWCoroutine *caller;
+  RGWAioCompletionNotifier *notifier;
+
+  int retcode;
+
+  ceph::mutex lock = ceph::make_mutex("RGWAsyncRadosRequest::lock");
+
+protected:
+  virtual int _send_request(const DoutPrefixProvider *dpp) = 0;
+public:
+  RGWAsyncRadosRequest(RGWCoroutine *_caller, RGWAioCompletionNotifier *_cn)
+    : caller(_caller), notifier(_cn), retcode(0) {
+  }
+  ~RGWAsyncRadosRequest() override {
+    if (notifier) {
+      notifier->put();
+    }
+  }
+
+  // Run the blocking work and wake the coroutine (called from a pool thread).
+  void send_request(const DoutPrefixProvider *dpp) {
+    get();
+    retcode = _send_request(dpp);
+    {
+      std::lock_guard l{lock};
+      if (notifier) {
+        notifier->cb(); // drops its own ref
+        notifier = nullptr;
+      }
+    }
+    put();
+  }
+
+  int get_ret_status() { return retcode; }
+
+  // Detach from the coroutine: drop the notifier (if still unconsumed) and
+  // the coroutine's reference to this request.
+  void finish() {
+    {
+      std::lock_guard l{lock};
+      if (notifier) {
+        // we won't call notifier->cb() to drop its ref, so drop it here
+        notifier->put();
+        notifier = nullptr;
+      }
+    }
+    put();
+  }
+};
+
+
+// Thread pool that executes RGWAsyncRadosRequest work items queued by
+// coroutines, with a throttle bounding the number of in-flight requests.
+class RGWAsyncRadosProcessor {
+  std::deque<RGWAsyncRadosRequest *> m_req_queue;
+  std::atomic<bool> going_down = { false };
+protected:
+  CephContext *cct;
+  ThreadPool m_tp;
+  Throttle req_throttle;
+
+  // Work queue adapter feeding the thread pool from m_req_queue.
+  struct RGWWQ : public DoutPrefixProvider, public ThreadPool::WorkQueue<RGWAsyncRadosRequest> {
+    RGWAsyncRadosProcessor *processor;
+    RGWWQ(RGWAsyncRadosProcessor *p,
+	  ceph::timespan timeout, ceph::timespan suicide_timeout,
+	  ThreadPool *tp)
+      : ThreadPool::WorkQueue<RGWAsyncRadosRequest>("RGWWQ", timeout, suicide_timeout, tp), processor(p) {}
+
+    bool _enqueue(RGWAsyncRadosRequest *req) override;
+    // individual dequeue of a specific item is not supported
+    void _dequeue(RGWAsyncRadosRequest *req) override {
+      ceph_abort();
+    }
+    bool _empty() override;
+    RGWAsyncRadosRequest *_dequeue() override;
+    using ThreadPool::WorkQueue<RGWAsyncRadosRequest>::_process;
+    void _process(RGWAsyncRadosRequest *req, ThreadPool::TPHandle& handle) override;
+    void _dump_queue();
+    void _clear() override {
+      ceph_assert(processor->m_req_queue.empty());
+    }
+
+    CephContext *get_cct() const { return processor->cct; }
+    unsigned get_subsys() const { return ceph_subsys_rgw; }
+    std::ostream& gen_prefix(std::ostream& out) const { return out << "rgw async rados processor: ";}
+
+  } req_wq;
+
+public:
+  RGWAsyncRadosProcessor(CephContext *_cct, int num_threads);
+  ~RGWAsyncRadosProcessor() {}
+  void start();
+  void stop();
+  void handle_request(const DoutPrefixProvider *dpp, RGWAsyncRadosRequest *req);
+  void queue(RGWAsyncRadosRequest *req);
+
+  bool is_going_down() {
+    return going_down;
+  }
+
+};
+
+// Generic coroutine that runs a write-only async operation parameterized by
+// P; the caller supplies a specialization of Request::_send_request() for
+// each P.  No result is returned beyond the status code.
+template <class P>
+class RGWSimpleWriteOnlyAsyncCR : public RGWSimpleCoroutine {
+  RGWAsyncRadosProcessor *async_rados;
+  rgw::sal::RadosStore* store;
+
+  P params;
+  const DoutPrefixProvider *dpp;
+
+  // Async work item carrying a copy of the params to the pool thread.
+  class Request : public RGWAsyncRadosRequest {
+    rgw::sal::RadosStore* store;
+    P params;
+    const DoutPrefixProvider *dpp;
+  protected:
+    int _send_request(const DoutPrefixProvider *dpp) override;
+  public:
+    Request(RGWCoroutine *caller,
+            RGWAioCompletionNotifier *cn,
+            rgw::sal::RadosStore* store,
+            const P& _params,
+            const DoutPrefixProvider *dpp) : RGWAsyncRadosRequest(caller, cn),
+                                store(store),
+                                params(_params),
+                                dpp(dpp) {}
+  } *req{nullptr};
+
+ public:
+  RGWSimpleWriteOnlyAsyncCR(RGWAsyncRadosProcessor *_async_rados,
+			    rgw::sal::RadosStore* _store,
+			    const P& _params,
+                            const DoutPrefixProvider *_dpp) : RGWSimpleCoroutine(_store->ctx()),
+                                                async_rados(_async_rados),
+                                                store(_store),
+				                params(_params),
+                                                dpp(_dpp) {}
+
+  ~RGWSimpleWriteOnlyAsyncCR() override {
+    request_cleanup();
+  }
+  void request_cleanup() override {
+    if (req) {
+      req->finish();
+      req = NULL;
+    }
+  }
+
+  int send_request(const DoutPrefixProvider *dpp) override {
+    req = new Request(this,
+                      stack->create_completion_notifier(),
+                      store,
+                      params,
+                      dpp);
+
+    async_rados->queue(req);
+    return 0;
+  }
+  int request_complete() override {
+    return req->get_ret_status();
+  }
+};
+
+
+// Generic coroutine that runs an async operation parameterized by P and
+// delivers a result of type R through a shared_ptr (shared so the result
+// outlives either side).  Callers specialize Request::_send_request().
+template <class P, class R>
+class RGWSimpleAsyncCR : public RGWSimpleCoroutine {
+  RGWAsyncRadosProcessor *async_rados;
+  rgw::sal::RadosStore* store;
+
+  P params;
+  std::shared_ptr<R> result;
+  const DoutPrefixProvider *dpp;
+
+  // Async work item carrying params in and the shared result out.
+  class Request : public RGWAsyncRadosRequest {
+    rgw::sal::RadosStore* store;
+    P params;
+    std::shared_ptr<R> result;
+    const DoutPrefixProvider *dpp;
+  protected:
+    int _send_request(const DoutPrefixProvider *dpp) override;
+  public:
+    Request(const DoutPrefixProvider *dpp,
+            RGWCoroutine *caller,
+            RGWAioCompletionNotifier *cn,
+            rgw::sal::RadosStore* _store,
+            const P& _params,
+            std::shared_ptr<R>& _result,
+            const DoutPrefixProvider *_dpp) : RGWAsyncRadosRequest(caller, cn),
+                                              store(_store),
+                                              params(_params),
+                                              result(_result),
+                                              dpp(_dpp) {}
+  } *req{nullptr};
+
+ public:
+  RGWSimpleAsyncCR(RGWAsyncRadosProcessor *_async_rados,
+                   rgw::sal::RadosStore* _store,
+                   const P& _params,
+                   std::shared_ptr<R>& _result,
+                   const DoutPrefixProvider *_dpp) : RGWSimpleCoroutine(_store->ctx()),
+                                                     async_rados(_async_rados),
+                                                     store(_store),
+                                                     params(_params),
+                                                     result(_result),
+                                                     dpp(_dpp) {}
+
+  ~RGWSimpleAsyncCR() override {
+    request_cleanup();
+  }
+  void request_cleanup() override {
+    if (req) {
+      req->finish();
+      req = NULL;
+    }
+  }
+
+  int send_request(const DoutPrefixProvider *dpp) override {
+    req = new Request(dpp,
+                      this,
+                      stack->create_completion_notifier(),
+                      store,
+                      params,
+                      result,
+                      dpp);
+
+    async_rados->queue(req);
+    return 0;
+  }
+  int request_complete() override {
+    return req->get_ret_status();
+  }
+};
+
+// Coroutine that runs an arbitrary caller-supplied Action::operate() on the
+// async rados thread pool; useful for one-off blocking calls without writing
+// a dedicated request class.
+class RGWGenericAsyncCR : public RGWSimpleCoroutine {
+  RGWAsyncRadosProcessor *async_rados;
+  rgw::sal::RadosStore* store;
+
+
+public:
+  // Callers subclass this and implement the blocking work in operate().
+  class Action {
+  public:
+    virtual ~Action() {}
+    virtual int operate() = 0;
+  };
+
+private:
+  std::shared_ptr<Action> action;
+
+  class Request : public RGWAsyncRadosRequest {
+    std::shared_ptr<Action> action;
+  protected:
+    // a null action is treated as a no-op success
+    int _send_request(const DoutPrefixProvider *dpp) override {
+      if (!action) {
+	return 0;
+      }
+      return action->operate();
+    }
+  public:
+    Request(const DoutPrefixProvider *dpp,
+            RGWCoroutine *caller,
+            RGWAioCompletionNotifier *cn,
+            std::shared_ptr<Action>& _action) : RGWAsyncRadosRequest(caller, cn),
+                                                action(_action) {}
+  } *req{nullptr};
+
+ public:
+  RGWGenericAsyncCR(CephContext *_cct,
+		    RGWAsyncRadosProcessor *_async_rados,
+		    std::shared_ptr<Action>& _action) : RGWSimpleCoroutine(_cct),
+                                     async_rados(_async_rados),
+                                     action(_action) {}
+  // convenience overload: accepts any shared_ptr<T> where T derives from Action
+  template<typename T>
+  RGWGenericAsyncCR(CephContext *_cct,
+		    RGWAsyncRadosProcessor *_async_rados,
+		    std::shared_ptr<T>& _action) : RGWSimpleCoroutine(_cct),
+                                     async_rados(_async_rados),
+                                     action(std::static_pointer_cast<Action>(_action)) {}
+
+  ~RGWGenericAsyncCR() override {
+    request_cleanup();
+  }
+  void request_cleanup() override {
+    if (req) {
+      req->finish();
+      req = NULL;
+    }
+  }
+
+  int send_request(const DoutPrefixProvider *dpp) override {
+    req = new Request(dpp, this,
+                      stack->create_completion_notifier(),
+                      action);
+
+    async_rados->queue(req);
+    return 0;
+  }
+  int request_complete() override {
+    return req->get_ret_status();
+  }
+};
+
+
+// Async read of a system object; the data lands in 'bl' and, when requested,
+// its xattrs in 'attrs' (raw or filtered per raw_attrs).
+class RGWAsyncGetSystemObj : public RGWAsyncRadosRequest {
+  const DoutPrefixProvider *dpp;
+  RGWSI_SysObj* svc_sysobj;
+  rgw_raw_obj obj;
+  const bool want_attrs;
+  const bool raw_attrs;
+protected:
+  int _send_request(const DoutPrefixProvider *dpp) override;
+public:
+  RGWAsyncGetSystemObj(const DoutPrefixProvider *dpp,
+                       RGWCoroutine *caller, RGWAioCompletionNotifier *cn, RGWSI_SysObj *_svc,
+                       RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
+                       bool want_attrs, bool raw_attrs);
+
+  bufferlist bl;
+  std::map<std::string, bufferlist> attrs;
+  RGWObjVersionTracker objv_tracker;
+};
+
+// Async full-object write of a system object, optionally exclusive (create
+// only); version tracking via objv_tracker.
+class RGWAsyncPutSystemObj : public RGWAsyncRadosRequest {
+  const DoutPrefixProvider *dpp;
+  RGWSI_SysObj *svc;
+  rgw_raw_obj obj;
+  bool exclusive;
+  bufferlist bl;
+
+protected:
+  int _send_request(const DoutPrefixProvider *dpp) override;
+public:
+  RGWAsyncPutSystemObj(const DoutPrefixProvider *dpp, RGWCoroutine *caller,
+                       RGWAioCompletionNotifier *cn, RGWSI_SysObj *_svc,
+                       RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
+                       bool _exclusive, bufferlist _bl);
+
+  RGWObjVersionTracker objv_tracker;
+};
+
+// Async write of a system object's xattrs (no data payload), optionally
+// exclusive; version tracking via objv_tracker.
+class RGWAsyncPutSystemObjAttrs : public RGWAsyncRadosRequest {
+  const DoutPrefixProvider *dpp;
+  RGWSI_SysObj *svc;
+  rgw_raw_obj obj;
+  std::map<std::string, bufferlist> attrs;
+  bool exclusive;
+
+protected:
+  int _send_request(const DoutPrefixProvider *dpp) override;
+public:
+  RGWAsyncPutSystemObjAttrs(const DoutPrefixProvider *dpp, RGWCoroutine *caller, RGWAioCompletionNotifier *cn, RGWSI_SysObj *_svc,
+                            RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
+                            std::map<std::string, bufferlist> _attrs, bool exclusive);
+
+  RGWObjVersionTracker objv_tracker;
+};
+
+// Async acquisition of a cls lock on a system object, held for duration_secs
+// and identified by lock_name/cookie.
+class RGWAsyncLockSystemObj : public RGWAsyncRadosRequest {
+  rgw::sal::RadosStore* store;
+  rgw_raw_obj obj;
+  std::string lock_name;
+  std::string cookie;
+  uint32_t duration_secs;
+
+protected:
+  int _send_request(const DoutPrefixProvider *dpp) override;
+public:
+  RGWAsyncLockSystemObj(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* _store,
+                        RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
+		        const std::string& _name, const std::string& _cookie, uint32_t _duration_secs);
+};
+
+// Async release of a cls lock previously taken with RGWAsyncLockSystemObj;
+// the cookie must match the one used to acquire.
+class RGWAsyncUnlockSystemObj : public RGWAsyncRadosRequest {
+  rgw::sal::RadosStore* store;
+  rgw_raw_obj obj;
+  std::string lock_name;
+  std::string cookie;
+
+protected:
+  int _send_request(const DoutPrefixProvider *dpp) override;
+public:
+  RGWAsyncUnlockSystemObj(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* _store,
+                          RGWObjVersionTracker *_objv_tracker, const rgw_raw_obj& _obj,
+		          const std::string& _name, const std::string& _cookie);
+};
+
+// Coroutine that reads a raw object and decodes its contents into *result
+// (type T).  By default an ENOENT (or an empty object) yields a
+// default-constructed T instead of an error; subclasses may post-process via
+// handle_data().
+template <class T>
+class RGWSimpleRadosReadCR : public RGWSimpleCoroutine {
+  const DoutPrefixProvider* dpp;
+  rgw::sal::RadosStore* store;
+  rgw_raw_obj obj;
+  T* result;
+  /// on ENOENT, call handle_data() with an empty object instead of failing
+  const bool empty_on_enoent;
+  RGWObjVersionTracker* objv_tracker;
+
+  T val;
+  rgw_rados_ref ref;
+  ceph::buffer::list bl;
+  boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+
+public:
+  RGWSimpleRadosReadCR(const DoutPrefixProvider* dpp,
+		       rgw::sal::RadosStore* store,
+		       const rgw_raw_obj& obj,
+		       T* result, bool empty_on_enoent = true,
+		       RGWObjVersionTracker* objv_tracker = nullptr)
+    : RGWSimpleCoroutine(store->ctx()), dpp(dpp), store(store),
+      obj(obj), result(result), empty_on_enoent(empty_on_enoent),
+      objv_tracker(objv_tracker) {
+    // if the caller doesn't want the value, decode into the internal member
+    if (!result) {
+      result = &val;
+    }
+  }
+
+  int send_request(const DoutPrefixProvider *dpp) {
+    int r = store->getRados()->get_raw_obj_ref(dpp, obj, &ref);
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret="
+			 << r << dendl;
+      return r;
+    }
+
+    set_status() << "sending request";
+
+    librados::ObjectReadOperation op;
+    if (objv_tracker) {
+      objv_tracker->prepare_op_for_read(&op);
+    }
+
+    // read the whole object
+    op.read(0, -1, &bl, nullptr);
+
+    cn = stack->create_completion_notifier();
+    return ref.pool.ioctx().aio_operate(ref.obj.oid, cn->completion(), &op,
+					nullptr);
+  }
+
+  int request_complete() {
+    int ret = cn->completion()->get_return_value();
+    set_status() << "request complete; ret=" << ret;
+
+    if (ret == -ENOENT && empty_on_enoent) {
+      *result = T();
+    } else {
+      if (ret < 0) {
+	return ret;
+      }
+      try {
+	auto iter = bl.cbegin();
+	if (iter.end()) {
+	  // allow successful reads with empty buffers. ReadSyncStatus coroutines
+	  // depend on this to be able to read without locking, because the
+	  // cls lock from InitSyncStatus will create an empty object if it didn't
+	  // exist
+	  *result = T();
+	} else {
+	  decode(*result, iter);
+	}
+      } catch (buffer::error& err) {
+	return -EIO;
+      }
+    }
+
+    return handle_data(*result);
+  }
+
+  // hook for subclasses to act on the decoded value; default is a no-op
+  virtual int handle_data(T& data) {
+    return 0;
+  }
+};
+
+// Coroutine that reads a raw object's xattrs into *pattrs; raw_attrs selects
+// whether the attrs are returned unfiltered or filtered (implementation in
+// the .cc file).
+class RGWSimpleRadosReadAttrsCR : public RGWSimpleCoroutine {
+  const DoutPrefixProvider* dpp;
+  rgw::sal::RadosStore* const store;
+
+  const rgw_raw_obj obj;
+  std::map<std::string, bufferlist>* const pattrs;
+  const bool raw_attrs;
+  RGWObjVersionTracker* const objv_tracker;
+
+  rgw_rados_ref ref;
+  std::map<std::string, bufferlist> unfiltered_attrs;
+  boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+
+public:
+  RGWSimpleRadosReadAttrsCR(const DoutPrefixProvider* dpp,
+			    rgw::sal::RadosStore* store,
+			    rgw_raw_obj obj,
+			    std::map<std::string, bufferlist>* pattrs,
+			    bool raw_attrs,
+			    RGWObjVersionTracker* objv_tracker = nullptr)
+    : RGWSimpleCoroutine(store->ctx()), dpp(dpp), store(store),
+      obj(std::move(obj)), pattrs(pattrs), raw_attrs(raw_attrs),
+      objv_tracker(objv_tracker) {}
+
+  int send_request(const DoutPrefixProvider *dpp) override;
+  int request_complete() override;
+};
+
+// Coroutine that encodes a value of type T and writes it as the full
+// contents of a raw object, optionally create-exclusive, with version
+// tracking applied on success.
+template <class T>
+class RGWSimpleRadosWriteCR : public RGWSimpleCoroutine {
+  const DoutPrefixProvider* dpp;
+  rgw::sal::RadosStore* const store;
+  rgw_raw_obj obj;
+  RGWObjVersionTracker* objv_tracker;
+  bool exclusive;
+
+  bufferlist bl;
+  rgw_rados_ref ref;
+  std::map<std::string, bufferlist> unfiltered_attrs;
+  boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+
+
+public:
+  RGWSimpleRadosWriteCR(const DoutPrefixProvider* dpp,
+			rgw::sal::RadosStore* const store,
+			rgw_raw_obj obj, const T& data,
+			RGWObjVersionTracker* objv_tracker = nullptr,
+			bool exclusive = false)
+    : RGWSimpleCoroutine(store->ctx()), dpp(dpp), store(store),
+      obj(std::move(obj)), objv_tracker(objv_tracker), exclusive(exclusive) {
+    // serialize up front; the write op just ships the buffer
+    encode(data, bl);
+  }
+
+  int send_request(const DoutPrefixProvider *dpp) override {
+    int r = store->getRados()->get_raw_obj_ref(dpp, obj, &ref);
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret="
+			 << r << dendl;
+      return r;
+    }
+
+    set_status() << "sending request";
+
+    librados::ObjectWriteOperation op;
+    if (exclusive) {
+      op.create(true);
+    }
+    if (objv_tracker) {
+      objv_tracker->prepare_op_for_write(&op);
+    }
+    op.write_full(bl);
+
+    cn = stack->create_completion_notifier();
+    return ref.pool.ioctx().aio_operate(ref.obj.oid, cn->completion(), &op);
+  }
+
+  int request_complete() override {
+    int ret = cn->completion()->get_return_value();
+    set_status() << "request complete; ret=" << ret;
+    // advance the tracked version only if the write actually landed
+    if (ret >= 0 && objv_tracker) {
+      objv_tracker->apply_write();
+    }
+    return ret;
+  }
+};
+
+// Coroutine that writes a set of xattrs to a raw object (skipping
+// empty-valued attrs), optionally create-exclusive, with version tracking
+// applied on success.  If the op ends up empty it completes immediately.
+class RGWSimpleRadosWriteAttrsCR : public RGWSimpleCoroutine {
+  const DoutPrefixProvider* dpp;
+  rgw::sal::RadosStore* const store;
+  RGWObjVersionTracker* objv_tracker;
+  rgw_raw_obj obj;
+  std::map<std::string, bufferlist> attrs;
+  bool exclusive;
+
+  rgw_rados_ref ref;
+  boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+
+
+public:
+  RGWSimpleRadosWriteAttrsCR(const DoutPrefixProvider* dpp,
+			     rgw::sal::RadosStore* const store,
+			     rgw_raw_obj obj,
+			     std::map<std::string, bufferlist> attrs,
+			     RGWObjVersionTracker* objv_tracker = nullptr,
+			     bool exclusive = false)
+    : RGWSimpleCoroutine(store->ctx()), dpp(dpp),
+      store(store), objv_tracker(objv_tracker),
+      obj(std::move(obj)), attrs(std::move(attrs)),
+      exclusive(exclusive) {}
+
+  int send_request(const DoutPrefixProvider *dpp) override {
+    int r = store->getRados()->get_raw_obj_ref(dpp, obj, &ref);
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << "ERROR: failed to get ref for (" << obj << ") ret="
+			 << r << dendl;
+      return r;
+    }
+
+    set_status() << "sending request";
+
+    librados::ObjectWriteOperation op;
+    if (exclusive) {
+      op.create(true);
+    }
+    if (objv_tracker) {
+      objv_tracker->prepare_op_for_write(&op);
+    }
+
+    // empty-valued attrs are not written
+    for (const auto& [name, bl] : attrs) {
+      if (!bl.length())
+	continue;
+      op.setxattr(name.c_str(), bl);
+    }
+
+    cn = stack->create_completion_notifier();
+    // nothing to do: fire the completion directly instead of issuing an op
+    if (!op.size()) {
+      cn->cb();
+      return 0;
+    }
+
+    return ref.pool.ioctx().aio_operate(ref.obj.oid, cn->completion(), &op);
+  }
+
+  int request_complete() override {
+    int ret = cn->completion()->get_return_value();
+    set_status() << "request complete; ret=" << ret;
+    // advance the tracked version only if the write actually landed
+    if (ret >= 0 && objv_tracker) {
+      objv_tracker->apply_write();
+    }
+    return ret;
+  }
+};
+
+// Coroutine that sets a batch of omap key/value pairs on a raw object
+// (implementation in the .cc file).
+class RGWRadosSetOmapKeysCR : public RGWSimpleCoroutine {
+  rgw::sal::RadosStore* store;
+  std::map<std::string, bufferlist> entries;
+
+  rgw_rados_ref ref;
+
+  rgw_raw_obj obj;
+
+  boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+
+public:
+  RGWRadosSetOmapKeysCR(rgw::sal::RadosStore* _store,
+		        const rgw_raw_obj& _obj,
+		        std::map<std::string, bufferlist>& _entries);
+
+  int send_request(const DoutPrefixProvider *dpp) override;
+  int request_complete() override;
+};
+
+// Coroutine that lists omap keys of a raw rados object, starting after
+// a marker.  The caller shares ownership of Result so the output stays
+// valid even if the coroutine outlives the caller's frame.
+class RGWRadosGetOmapKeysCR : public RGWSimpleCoroutine {
+ public:
+ struct Result {
+ rgw_rados_ref ref; // resolved object ref (kept for reuse by callers)
+ std::set<std::string> entries; // keys found, in omap order
+ bool more = false; // true if listing was truncated at max_entries
+ };
+ using ResultPtr = std::shared_ptr<Result>;
+
+ // _marker: list keys strictly after this key; _max_entries caps the batch.
+ RGWRadosGetOmapKeysCR(rgw::sal::RadosStore* _store, const rgw_raw_obj& _obj,
+ const std::string& _marker, int _max_entries,
+ ResultPtr result);
+
+ int send_request(const DoutPrefixProvider *dpp) override;
+ int request_complete() override;
+
+ private:
+ rgw::sal::RadosStore* store;
+ rgw_raw_obj obj;
+ std::string marker;
+ int max_entries;
+ ResultPtr result;
+ boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+};
+
+// Like RGWRadosGetOmapKeysCR, but fetches keys together with their
+// values (omap get-vals instead of get-keys).
+class RGWRadosGetOmapValsCR : public RGWSimpleCoroutine {
+ public:
+ struct Result {
+ rgw_rados_ref ref; // resolved object ref
+ std::map<std::string, bufferlist> entries; // key -> value
+ bool more = false; // true if truncated at max_entries
+ };
+ using ResultPtr = std::shared_ptr<Result>;
+
+ // _marker: list entries strictly after this key; _max_entries caps the batch.
+ RGWRadosGetOmapValsCR(rgw::sal::RadosStore* _store, const rgw_raw_obj& _obj,
+ const std::string& _marker, int _max_entries,
+ ResultPtr result);
+
+ int send_request(const DoutPrefixProvider *dpp) override;
+ int request_complete() override;
+
+ private:
+ rgw::sal::RadosStore* store;
+ rgw_raw_obj obj;
+ std::string marker;
+ int max_entries;
+ ResultPtr result;
+ boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+};
+
+// Coroutine that removes a set of omap keys from a raw rados object.
+class RGWRadosRemoveOmapKeysCR : public RGWSimpleCoroutine {
+ rgw::sal::RadosStore* store;
+
+ rgw_rados_ref ref; // resolved in send_request()
+
+ std::set<std::string> keys; // keys to remove
+
+ rgw_raw_obj obj;
+
+ boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+
+public:
+ RGWRadosRemoveOmapKeysCR(rgw::sal::RadosStore* _store,
+ const rgw_raw_obj& _obj,
+ const std::set<std::string>& _keys);
+
+ int send_request(const DoutPrefixProvider *dpp) override;
+
+ int request_complete() override;
+};
+
+// Coroutine that removes a raw rados object, optionally conditioned on
+// an object-version tracker (conditional delete).
+class RGWRadosRemoveCR : public RGWSimpleCoroutine {
+ rgw::sal::RadosStore* store;
+ librados::IoCtx ioctx;
+ const rgw_raw_obj obj;
+ RGWObjVersionTracker* objv_tracker; // optional; may be nullptr
+ boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+
+public:
+ RGWRadosRemoveCR(rgw::sal::RadosStore* store, const rgw_raw_obj& obj,
+ RGWObjVersionTracker* objv_tracker = nullptr);
+
+ int send_request(const DoutPrefixProvider *dpp) override;
+ int request_complete() override;
+};
+
+// Variant of RGWRadosRemoveCR that operates on an already-open IoCtx
+// plus a bare oid, or on an RGWSI_RADOS::Obj handle.
+class RGWRadosRemoveOidCR : public RGWSimpleCoroutine {
+ librados::IoCtx ioctx; // owned copy of the pool io context
+ const std::string oid;
+ RGWObjVersionTracker* objv_tracker; // optional; may be nullptr
+ boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+
+public:
+ RGWRadosRemoveOidCR(rgw::sal::RadosStore* store,
+ librados::IoCtx&& ioctx, std::string_view oid,
+ RGWObjVersionTracker* objv_tracker = nullptr);
+
+ RGWRadosRemoveOidCR(rgw::sal::RadosStore* store,
+ RGWSI_RADOS::Obj& obj,
+ RGWObjVersionTracker* objv_tracker = nullptr);
+
+ RGWRadosRemoveOidCR(rgw::sal::RadosStore* store,
+ RGWSI_RADOS::Obj&& obj,
+ RGWObjVersionTracker* objv_tracker = nullptr);
+
+ int send_request(const DoutPrefixProvider *dpp) override;
+ int request_complete() override;
+};
+
+// Coroutine that takes a timed advisory lock (cls_lock style) on a raw
+// rados object via the async-rados thread pool.  The lock auto-expires
+// after `duration` seconds unless renewed.
+class RGWSimpleRadosLockCR : public RGWSimpleCoroutine {
+ RGWAsyncRadosProcessor *async_rados;
+ rgw::sal::RadosStore* store;
+ std::string lock_name;
+ std::string cookie; // identifies this lock holder for renew/unlock
+ uint32_t duration; // lock lifetime in seconds
+
+ rgw_raw_obj obj;
+
+ RGWAsyncLockSystemObj *req; // outstanding async request, owned until finish()
+
+public:
+ RGWSimpleRadosLockCR(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store,
+ const rgw_raw_obj& _obj,
+ const std::string& _lock_name,
+ const std::string& _cookie,
+ uint32_t _duration);
+ ~RGWSimpleRadosLockCR() override {
+ request_cleanup();
+ }
+ void request_cleanup() override;
+
+ int send_request(const DoutPrefixProvider *dpp) override;
+ int request_complete() override;
+
+ // Generate a 16-character random alphanumeric cookie; the extra byte
+ // in buf leaves room for the NUL terminator.
+ static std::string gen_random_cookie(CephContext* cct) {
+ static constexpr std::size_t COOKIE_LEN = 16;
+ char buf[COOKIE_LEN + 1];
+ gen_rand_alphanumeric(cct, buf, sizeof(buf) - 1);
+ return buf;
+ }
+};
+
+// Coroutine that releases a lock previously taken by
+// RGWSimpleRadosLockCR (matched by lock name and cookie).
+class RGWSimpleRadosUnlockCR : public RGWSimpleCoroutine {
+ RGWAsyncRadosProcessor *async_rados;
+ rgw::sal::RadosStore* store;
+ std::string lock_name;
+ std::string cookie; // must match the cookie used when locking
+
+ rgw_raw_obj obj;
+
+ RGWAsyncUnlockSystemObj *req; // outstanding async request
+
+public:
+ RGWSimpleRadosUnlockCR(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store,
+ const rgw_raw_obj& _obj,
+ const std::string& _lock_name,
+ const std::string& _cookie);
+ ~RGWSimpleRadosUnlockCR() override {
+ request_cleanup();
+ }
+ void request_cleanup() override;
+
+ int send_request(const DoutPrefixProvider *dpp) override;
+ int request_complete() override;
+};
+
+#define OMAP_APPEND_MAX_ENTRIES_DEFAULT 100
+
+// Consumer coroutine that batches appended strings and flushes them as
+// omap keys on a single rados object once `window_size` entries are
+// pending (or on finish()).
+class RGWOmapAppend : public RGWConsumerCR<std::string> {
+ RGWAsyncRadosProcessor *async_rados;
+ rgw::sal::RadosStore* store;
+
+ rgw_raw_obj obj; // target object holding the omap
+
+ bool going_down; // set when finish() was requested
+
+ int num_pending_entries;
+ std::list<std::string> pending_entries; // received, not yet staged
+
+ std::map<std::string, bufferlist> entries; // staged for next flush
+
+ uint64_t window_size; // flush threshold
+ uint64_t total_entries; // lifetime count of appended entries
+public:
+ RGWOmapAppend(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store,
+ const rgw_raw_obj& _obj,
+ uint64_t _window_size = OMAP_APPEND_MAX_ENTRIES_DEFAULT);
+ int operate(const DoutPrefixProvider *dpp) override;
+ // Move pending_entries into the staged batch.
+ void flush_pending();
+ // Queue one entry; returns false if the coroutine cannot accept it.
+ bool append(const std::string& s);
+ // Flush remaining entries and shut the coroutine down.
+ bool finish();
+
+ uint64_t get_total_entries() {
+ return total_entries;
+ }
+
+ const rgw_raw_obj& get_obj() {
+ return obj;
+ }
+};
+
+// Manages a fixed set of RGWOmapAppend coroutines, one per shard, so
+// entries can be distributed across `num_shards` omap objects named
+// "<oid_prefix>.<shard_id>".  Shards are spawned on the owning
+// coroutine's stack and ref-counted by this manager.
+class RGWShardedOmapCRManager {
+ RGWAsyncRadosProcessor *async_rados;
+ rgw::sal::RadosStore* store;
+ RGWCoroutine *op; // owning coroutine the shards are spawned under
+
+ int num_shards;
+
+ std::vector<RGWOmapAppend *> shards;
+public:
+ RGWShardedOmapCRManager(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store, RGWCoroutine *_op, int _num_shards, const rgw_pool& pool, const std::string& oid_prefix)
+ : async_rados(_async_rados),
+ store(_store), op(_op), num_shards(_num_shards) {
+ shards.reserve(num_shards);
+ for (int i = 0; i < num_shards; ++i) {
+ // Build "<prefix>.<i>" with std::string instead of the previous
+ // snprintf into a variable-length array (VLAs are a non-standard
+ // extension in C++); the resulting oid is identical.
+ const std::string oid = oid_prefix + "." + std::to_string(i);
+ RGWOmapAppend *shard = new RGWOmapAppend(async_rados, store, rgw_raw_obj(pool, oid));
+ shard->get(); // hold a ref so the shard outlives the spawn
+ shards.push_back(shard);
+ op->spawn(shard, false);
+ }
+ }
+
+ ~RGWShardedOmapCRManager() {
+ for (auto shard : shards) {
+ shard->put();
+ }
+ }
+
+ // Route one entry to its shard; returns false if the shard refused it.
+ bool append(const std::string& entry, int shard_id) {
+ return shards[shard_id]->append(entry);
+ }
+ // Finish all shards; true only if every shard flushed without error.
+ bool finish() {
+ bool success = true;
+ for (auto& append_op : shards) {
+ success &= (append_op->finish() && (!append_op->is_error()));
+ }
+ return success;
+ }
+
+ uint64_t get_total_entries(int shard_id) {
+ return shards[shard_id]->get_total_entries();
+ }
+};
+
+// Async-rados request that reads a bucket's instance info (and attrs)
+// on the async thread pool.  Results are left in the public members
+// for the driving coroutine to collect.
+class RGWAsyncGetBucketInstanceInfo : public RGWAsyncRadosRequest {
+ rgw::sal::RadosStore* store;
+ rgw_bucket bucket;
+ const DoutPrefixProvider *dpp;
+
+protected:
+ int _send_request(const DoutPrefixProvider *dpp) override;
+public:
+ RGWAsyncGetBucketInstanceInfo(RGWCoroutine *caller, RGWAioCompletionNotifier *cn,
+ rgw::sal::RadosStore* _store, const rgw_bucket& bucket,
+ const DoutPrefixProvider *dpp)
+ : RGWAsyncRadosRequest(caller, cn), store(_store), bucket(bucket), dpp(dpp) {}
+
+ RGWBucketInfo bucket_info; // output: filled by _send_request()
+ std::map<std::string, bufferlist> attrs; // output: bucket instance attrs
+};
+
+// Async-rados request that writes a bucket's instance info (and,
+// optionally, its attrs) on the async thread pool.
+class RGWAsyncPutBucketInstanceInfo : public RGWAsyncRadosRequest {
+ rgw::sal::RadosStore* store;
+ RGWBucketInfo& bucket_info; // caller-owned; must outlive the request
+ bool exclusive; // fail if the instance already exists
+ real_time mtime;
+ std::map<std::string, ceph::bufferlist>* attrs; // optional; may be nullptr
+ const DoutPrefixProvider *dpp;
+
+protected:
+ int _send_request(const DoutPrefixProvider *dpp) override;
+public:
+ RGWAsyncPutBucketInstanceInfo(RGWCoroutine* caller,
+ RGWAioCompletionNotifier* cn,
+ rgw::sal::RadosStore* store,
+ RGWBucketInfo& bucket_info,
+ bool exclusive,
+ real_time mtime,
+ std::map<std::string, ceph::bufferlist>* attrs,
+ const DoutPrefixProvider* dpp)
+ : RGWAsyncRadosRequest(caller, cn), store(store), bucket_info(bucket_info),
+ exclusive(exclusive), mtime(mtime), attrs(attrs), dpp(dpp) {}
+};
+
+// Coroutine wrapper over RGWAsyncGetBucketInstanceInfo: queues the
+// request on the async-rados pool and moves the resulting bucket info
+// and attrs into caller-provided output pointers.
+class RGWGetBucketInstanceInfoCR : public RGWSimpleCoroutine {
+ RGWAsyncRadosProcessor *async_rados;
+ rgw::sal::RadosStore* store;
+ rgw_bucket bucket;
+ RGWBucketInfo *bucket_info; // output; may be nullptr if not wanted
+ std::map<std::string, bufferlist> *pattrs; // output; may be nullptr
+ const DoutPrefixProvider *dpp;
+
+ RGWAsyncGetBucketInstanceInfo *req{nullptr};
+
+public:
+ // rgw_bucket constructor
+ RGWGetBucketInstanceInfoCR(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store,
+ const rgw_bucket& _bucket, RGWBucketInfo *_bucket_info,
+ std::map<std::string, bufferlist> *_pattrs, const DoutPrefixProvider *dpp)
+ : RGWSimpleCoroutine(_store->ctx()), async_rados(_async_rados), store(_store),
+ bucket(_bucket), bucket_info(_bucket_info), pattrs(_pattrs), dpp(dpp) {}
+ ~RGWGetBucketInstanceInfoCR() override {
+ request_cleanup();
+ }
+ void request_cleanup() override {
+ if (req) {
+ req->finish();
+ req = nullptr; // nullptr for consistency with the sibling coroutines
+ }
+ }
+
+ int send_request(const DoutPrefixProvider *dpp) override {
+ req = new RGWAsyncGetBucketInstanceInfo(this, stack->create_completion_notifier(), store, bucket, dpp);
+ async_rados->queue(req);
+ return 0;
+ }
+ // Move the async results into the caller's outputs, then report the
+ // request's status.
+ int request_complete() override {
+ if (bucket_info) {
+ *bucket_info = std::move(req->bucket_info);
+ }
+ if (pattrs) {
+ *pattrs = std::move(req->attrs);
+ }
+ return req->get_ret_status();
+ }
+};
+
+// Coroutine wrapper over RGWAsyncPutBucketInstanceInfo: queues the
+// bucket-instance write on the async-rados pool.
+class RGWPutBucketInstanceInfoCR : public RGWSimpleCoroutine {
+ RGWAsyncRadosProcessor *async_rados;
+ rgw::sal::RadosStore* store;
+ RGWBucketInfo& bucket_info; // caller-owned; must outlive the coroutine
+ bool exclusive;
+ real_time mtime;
+ std::map<std::string, ceph::bufferlist>* attrs; // optional; may be nullptr
+ const DoutPrefixProvider *dpp;
+
+ RGWAsyncPutBucketInstanceInfo* req = nullptr;
+
+public:
+ // rgw_bucket constructor
+ RGWPutBucketInstanceInfoCR(RGWAsyncRadosProcessor *async_rados,
+ rgw::sal::RadosStore* store,
+ RGWBucketInfo& bucket_info,
+ bool exclusive,
+ real_time mtime,
+ std::map<std::string, ceph::bufferlist>* attrs,
+ const DoutPrefixProvider *dpp)
+ : RGWSimpleCoroutine(store->ctx()), async_rados(async_rados), store(store),
+ bucket_info(bucket_info), exclusive(exclusive),
+ mtime(mtime), attrs(attrs), dpp(dpp) {}
+ ~RGWPutBucketInstanceInfoCR() override {
+ request_cleanup();
+ }
+ void request_cleanup() override {
+ if (req) {
+ req->finish();
+ req = nullptr;
+ }
+ }
+
+ int send_request(const DoutPrefixProvider *dpp) override {
+ req = new RGWAsyncPutBucketInstanceInfo(this,
+ stack->create_completion_notifier(),
+ store, bucket_info, exclusive,
+ mtime, attrs, dpp);
+ async_rados->queue(req);
+ return 0;
+ }
+ int request_complete() override {
+ return req->get_ret_status();
+ }
+};
+
+// Coroutine that trims a bucket-index log shard between two markers,
+// for a specific index layout generation.
+class RGWRadosBILogTrimCR : public RGWSimpleCoroutine {
+ const RGWBucketInfo& bucket_info; // caller-owned; must outlive the coroutine
+ int shard_id;
+ const rgw::bucket_index_layout_generation generation;
+ RGWRados::BucketShard bs;
+ std::string start_marker;
+ std::string end_marker;
+ boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+ public:
+ RGWRadosBILogTrimCR(const DoutPrefixProvider *dpp,
+ rgw::sal::RadosStore* store, const RGWBucketInfo& bucket_info,
+ int shard_id,
+ const rgw::bucket_index_layout_generation& generation,
+ const std::string& start_marker,
+ const std::string& end_marker);
+
+ int send_request(const DoutPrefixProvider *dpp) override;
+ int request_complete() override;
+};
+
+// Async-rados request that fetches an object from a remote zone into a
+// local bucket (the core of multisite object sync), with optional
+// conditional copy-if-newer semantics and a fetch filter.
+class RGWAsyncFetchRemoteObj : public RGWAsyncRadosRequest {
+ rgw::sal::RadosStore* store;
+ rgw_zone_id source_zone;
+
+ std::optional<rgw_user> user_id;
+
+ rgw_bucket src_bucket;
+ std::optional<rgw_placement_rule> dest_placement_rule;
+ RGWBucketInfo dest_bucket_info;
+
+ rgw_obj_key key;
+ std::optional<rgw_obj_key> dest_key; // defaults to key when unset
+ std::optional<uint64_t> versioned_epoch;
+
+ // NOTE(review): not set by the constructor; presumably filled by
+ // _send_request() — confirm against the .cc implementation.
+ real_time src_mtime;
+
+ bool copy_if_newer; // skip the copy if the local object is newer
+ std::shared_ptr<RGWFetchObjFilter> filter;
+ rgw_zone_set_entry source_trace_entry;
+ rgw_zone_set zones_trace; // zones already visited, to break sync loops
+ PerfCounters* counters; // optional; may be nullptr
+ const DoutPrefixProvider *dpp;
+
+protected:
+ int _send_request(const DoutPrefixProvider *dpp) override;
+public:
+ RGWAsyncFetchRemoteObj(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* _store,
+ const rgw_zone_id& _source_zone,
+ std::optional<rgw_user>& _user_id,
+ const rgw_bucket& _src_bucket,
+ std::optional<rgw_placement_rule> _dest_placement_rule,
+ const RGWBucketInfo& _dest_bucket_info,
+ const rgw_obj_key& _key,
+ const std::optional<rgw_obj_key>& _dest_key,
+ std::optional<uint64_t> _versioned_epoch,
+ bool _if_newer,
+ std::shared_ptr<RGWFetchObjFilter> _filter,
+ const rgw_zone_set_entry& source_trace_entry,
+ rgw_zone_set *_zones_trace,
+ PerfCounters* counters, const DoutPrefixProvider *dpp)
+ : RGWAsyncRadosRequest(caller, cn), store(_store),
+ source_zone(_source_zone),
+ user_id(_user_id),
+ src_bucket(_src_bucket),
+ dest_placement_rule(_dest_placement_rule),
+ dest_bucket_info(_dest_bucket_info),
+ key(_key),
+ dest_key(_dest_key),
+ versioned_epoch(_versioned_epoch),
+ copy_if_newer(_if_newer),
+ filter(_filter),
+ source_trace_entry(source_trace_entry),
+ counters(counters),
+ dpp(dpp)
+ {
+ // a null trace pointer leaves zones_trace empty
+ if (_zones_trace) {
+ zones_trace = *_zones_trace;
+ }
+ }
+};
+
+// Coroutine wrapper over RGWAsyncFetchRemoteObj: queues the remote-object
+// fetch on the async-rados pool and reports its completion status.
+class RGWFetchRemoteObjCR : public RGWSimpleCoroutine {
+ CephContext *cct;
+ RGWAsyncRadosProcessor *async_rados;
+ rgw::sal::RadosStore* store;
+ rgw_zone_id source_zone;
+
+ std::optional<rgw_user> user_id;
+
+ rgw_bucket src_bucket;
+ std::optional<rgw_placement_rule> dest_placement_rule;
+ RGWBucketInfo dest_bucket_info;
+
+ rgw_obj_key key;
+ std::optional<rgw_obj_key> dest_key; // defaults to key when unset
+ std::optional<uint64_t> versioned_epoch;
+
+ real_time src_mtime;
+
+ bool copy_if_newer;
+
+ std::shared_ptr<RGWFetchObjFilter> filter;
+
+ RGWAsyncFetchRemoteObj *req;
+ // reference members: the referenced trace entry must outlive this coroutine
+ const rgw_zone_set_entry& source_trace_entry;
+ rgw_zone_set *zones_trace; // optional; may be nullptr
+ PerfCounters* counters; // optional; may be nullptr
+ const DoutPrefixProvider *dpp;
+
+public:
+ RGWFetchRemoteObjCR(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store,
+ const rgw_zone_id& _source_zone,
+ std::optional<rgw_user> _user_id,
+ const rgw_bucket& _src_bucket,
+ std::optional<rgw_placement_rule> _dest_placement_rule,
+ const RGWBucketInfo& _dest_bucket_info,
+ const rgw_obj_key& _key,
+ const std::optional<rgw_obj_key>& _dest_key,
+ std::optional<uint64_t> _versioned_epoch,
+ bool _if_newer,
+ std::shared_ptr<RGWFetchObjFilter> _filter,
+ const rgw_zone_set_entry& source_trace_entry,
+ rgw_zone_set *_zones_trace,
+ PerfCounters* counters, const DoutPrefixProvider *dpp)
+ : RGWSimpleCoroutine(_store->ctx()), cct(_store->ctx()),
+ async_rados(_async_rados), store(_store),
+ source_zone(_source_zone),
+ user_id(_user_id),
+ src_bucket(_src_bucket),
+ dest_placement_rule(_dest_placement_rule),
+ dest_bucket_info(_dest_bucket_info),
+ key(_key),
+ dest_key(_dest_key),
+ versioned_epoch(_versioned_epoch),
+ copy_if_newer(_if_newer),
+ filter(_filter),
+ req(NULL),
+ source_trace_entry(source_trace_entry),
+ zones_trace(_zones_trace), counters(counters), dpp(dpp) {}
+
+
+ ~RGWFetchRemoteObjCR() override {
+ request_cleanup();
+ }
+
+ void request_cleanup() override {
+ if (req) {
+ req->finish();
+ req = NULL;
+ }
+ }
+
+ int send_request(const DoutPrefixProvider *dpp) override {
+ req = new RGWAsyncFetchRemoteObj(this, stack->create_completion_notifier(), store,
+ source_zone, user_id, src_bucket, dest_placement_rule, dest_bucket_info,
+ key, dest_key, versioned_epoch, copy_if_newer, filter,
+ source_trace_entry, zones_trace, counters, dpp);
+ async_rados->queue(req);
+ return 0;
+ }
+
+ int request_complete() override {
+ return req->get_ret_status();
+ }
+};
+
+// Async-rados request that stats an object in a remote zone, returning
+// mtime/size/etag/attrs/headers through caller-provided out pointers.
+class RGWAsyncStatRemoteObj : public RGWAsyncRadosRequest {
+ rgw::sal::RadosStore* store;
+ rgw_zone_id source_zone;
+
+ rgw_bucket src_bucket;
+ rgw_obj_key key;
+
+ // output pointers; each may be nullptr if the caller is not interested
+ ceph::real_time *pmtime;
+ uint64_t *psize;
+ std::string *petag;
+ std::map<std::string, bufferlist> *pattrs;
+ std::map<std::string, std::string> *pheaders;
+
+protected:
+ int _send_request(const DoutPrefixProvider *dpp) override;
+public:
+ RGWAsyncStatRemoteObj(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* _store,
+ const rgw_zone_id& _source_zone,
+ rgw_bucket& _src_bucket,
+ const rgw_obj_key& _key,
+ ceph::real_time *_pmtime,
+ uint64_t *_psize,
+ std::string *_petag,
+ std::map<std::string, bufferlist> *_pattrs,
+ std::map<std::string, std::string> *_pheaders) : RGWAsyncRadosRequest(caller, cn), store(_store),
+ source_zone(_source_zone),
+ src_bucket(_src_bucket),
+ key(_key),
+ pmtime(_pmtime),
+ psize(_psize),
+ petag(_petag),
+ pattrs(_pattrs),
+ pheaders(_pheaders) {}
+};
+
+// Coroutine wrapper over RGWAsyncStatRemoteObj: queues the remote stat
+// on the async-rados pool; results land directly in the caller's
+// out-pointers, so request_complete() only reports status.
+class RGWStatRemoteObjCR : public RGWSimpleCoroutine {
+ CephContext *cct;
+ RGWAsyncRadosProcessor *async_rados;
+ rgw::sal::RadosStore* store;
+ rgw_zone_id source_zone;
+
+ rgw_bucket src_bucket;
+ rgw_obj_key key;
+
+ // output pointers forwarded to the async request; may be nullptr
+ ceph::real_time *pmtime;
+ uint64_t *psize;
+ std::string *petag;
+ std::map<std::string, bufferlist> *pattrs;
+ std::map<std::string, std::string> *pheaders;
+
+ RGWAsyncStatRemoteObj *req;
+
+public:
+ RGWStatRemoteObjCR(RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store,
+ const rgw_zone_id& _source_zone,
+ rgw_bucket& _src_bucket,
+ const rgw_obj_key& _key,
+ ceph::real_time *_pmtime,
+ uint64_t *_psize,
+ std::string *_petag,
+ std::map<std::string, bufferlist> *_pattrs,
+ std::map<std::string, std::string> *_pheaders) : RGWSimpleCoroutine(_store->ctx()), cct(_store->ctx()),
+ async_rados(_async_rados), store(_store),
+ source_zone(_source_zone),
+ src_bucket(_src_bucket),
+ key(_key),
+ pmtime(_pmtime),
+ psize(_psize),
+ petag(_petag),
+ pattrs(_pattrs),
+ pheaders(_pheaders),
+ req(NULL) {}
+
+
+ ~RGWStatRemoteObjCR() override {
+ request_cleanup();
+ }
+
+ void request_cleanup() override {
+ if (req) {
+ req->finish();
+ req = NULL;
+ }
+ }
+
+ int send_request(const DoutPrefixProvider *dpp) override {
+ req = new RGWAsyncStatRemoteObj(this, stack->create_completion_notifier(), store, source_zone,
+ src_bucket, key, pmtime, psize, petag, pattrs, pheaders);
+ async_rados->queue(req);
+ return 0;
+ }
+
+ int request_complete() override {
+ return req->get_ret_status();
+ }
+};
+
+// Async-rados request that deletes an object during multisite sync,
+// with support for versioning, delete markers, conditional
+// delete-if-older, and loop-breaking zone traces.
+class RGWAsyncRemoveObj : public RGWAsyncRadosRequest {
+ const DoutPrefixProvider *dpp;
+ rgw::sal::RadosStore* store;
+ rgw_zone_id source_zone;
+
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+ std::unique_ptr<rgw::sal::Object> obj;
+
+ std::string owner;
+ std::string owner_display_name;
+ bool versioned;
+ uint64_t versioned_epoch;
+ std::string marker_version_id; // set only when deleting a delete marker
+
+ bool del_if_older; // only delete if existing object is older than timestamp
+ ceph::real_time timestamp;
+ rgw_zone_set zones_trace;
+
+protected:
+ int _send_request(const DoutPrefixProvider *dpp) override;
+public:
+ RGWAsyncRemoveObj(const DoutPrefixProvider *_dpp, RGWCoroutine *caller, RGWAioCompletionNotifier *cn,
+ rgw::sal::RadosStore* _store,
+ const rgw_zone_id& _source_zone,
+ RGWBucketInfo& _bucket_info,
+ const rgw_obj_key& _key,
+ const std::string& _owner,
+ const std::string& _owner_display_name,
+ bool _versioned,
+ uint64_t _versioned_epoch,
+ bool _delete_marker,
+ bool _if_older,
+ real_time& _timestamp,
+ rgw_zone_set* _zones_trace) : RGWAsyncRadosRequest(caller, cn), dpp(_dpp), store(_store),
+ source_zone(_source_zone),
+ owner(_owner),
+ owner_display_name(_owner_display_name),
+ versioned(_versioned),
+ versioned_epoch(_versioned_epoch),
+ del_if_older(_if_older),
+ timestamp(_timestamp) {
+ if (_delete_marker) {
+ marker_version_id = _key.instance;
+ }
+
+ if (_zones_trace) {
+ zones_trace = *_zones_trace;
+ }
+ // NOTE(review): the int return of get_bucket() is ignored here;
+ // presumably constructing from an in-memory RGWBucketInfo cannot
+ // fail — confirm against the sal implementation.
+ store->get_bucket(nullptr, _bucket_info, &bucket);
+ obj = bucket->get_object(_key);
+ }
+};
+
+// Coroutine wrapper over RGWAsyncRemoveObj: queues the sync-side object
+// delete on the async-rados pool.  Optional owner/timestamp pointers
+// are copied in the constructor; a non-null timestamp additionally
+// enables delete-if-older semantics.
+class RGWRemoveObjCR : public RGWSimpleCoroutine {
+ const DoutPrefixProvider *dpp;
+ CephContext *cct;
+ RGWAsyncRadosProcessor *async_rados;
+ rgw::sal::RadosStore* store;
+ rgw_zone_id source_zone;
+
+ RGWBucketInfo bucket_info;
+
+ rgw_obj_key key;
+ bool versioned;
+ uint64_t versioned_epoch;
+ bool delete_marker;
+ std::string owner;
+ std::string owner_display_name;
+
+ bool del_if_older; // derived from whether a timestamp was supplied
+ real_time timestamp;
+
+ RGWAsyncRemoveObj *req;
+
+ rgw_zone_set *zones_trace; // optional; may be nullptr
+
+public:
+ RGWRemoveObjCR(const DoutPrefixProvider *_dpp, RGWAsyncRadosProcessor *_async_rados, rgw::sal::RadosStore* _store,
+ const rgw_zone_id& _source_zone,
+ RGWBucketInfo& _bucket_info,
+ const rgw_obj_key& _key,
+ bool _versioned,
+ uint64_t _versioned_epoch,
+ std::string *_owner,
+ std::string *_owner_display_name,
+ bool _delete_marker,
+ real_time *_timestamp,
+ rgw_zone_set *_zones_trace) : RGWSimpleCoroutine(_store->ctx()), dpp(_dpp), cct(_store->ctx()),
+ async_rados(_async_rados), store(_store),
+ source_zone(_source_zone),
+ bucket_info(_bucket_info),
+ key(_key),
+ versioned(_versioned),
+ versioned_epoch(_versioned_epoch),
+ delete_marker(_delete_marker), req(NULL), zones_trace(_zones_trace) {
+ del_if_older = (_timestamp != NULL);
+ if (_timestamp) {
+ timestamp = *_timestamp;
+ }
+
+ if (_owner) {
+ owner = *_owner;
+ }
+
+ if (_owner_display_name) {
+ owner_display_name = *_owner_display_name;
+ }
+ }
+ ~RGWRemoveObjCR() override {
+ request_cleanup();
+ }
+
+ void request_cleanup() override {
+ if (req) {
+ req->finish();
+ req = NULL;
+ }
+ }
+
+ int send_request(const DoutPrefixProvider *dpp) override {
+ req = new RGWAsyncRemoveObj(dpp, this, stack->create_completion_notifier(), store, source_zone, bucket_info,
+ key, owner, owner_display_name, versioned, versioned_epoch,
+ delete_marker, del_if_older, timestamp, zones_trace);
+ async_rados->queue(req);
+ return 0;
+ }
+
+ int request_complete() override {
+ return req->get_ret_status();
+ }
+};
+
+/// \brief Collect average latency
+///
+/// Used in data sync to back off on concurrency when latency of lock
+/// operations rises.
+///
+/// \warning This class is not thread safe. We do not use a mutex
+/// because all coroutines spawned by RGWDataSyncCR share a single thread.
+class LatencyMonitor {
+ // Explicitly zero-initialize: with `LatencyMonitor() = default;` a
+ // default-initialized instance would otherwise leave this chrono
+ // duration's rep indeterminate, corrupting every average computed
+ // from it.
+ ceph::timespan total = ceph::timespan::zero();
+ std::uint64_t count = 0;
+
+public:
+
+ LatencyMonitor() = default;
+ // Fold one latency sample into the running total.
+ void add_latency(ceph::timespan latency) {
+ total += latency;
+ ++count;
+ }
+
+ // Mean of all samples so far; returns 0s when no samples have been
+ // recorded (avoids dividing by zero).
+ ceph::timespan avg_latency() {
+ using namespace std::literals;
+ return count == 0 ? 0s : total / count;
+ }
+};
+
+// Coroutine that holds a cls_lock lease on a rados object and renews it
+// every `interval` seconds until go_down() is called.  Callers poll
+// is_locked() to learn whether the lease is still believed valid.
+class RGWContinuousLeaseCR : public RGWCoroutine {
+ RGWAsyncRadosProcessor* async_rados;
+ rgw::sal::RadosStore* store;
+
+ const rgw_raw_obj obj;
+
+ const std::string lock_name;
+ const std::string cookie{RGWSimpleRadosLockCR::gen_random_cookie(cct)};
+
+ int interval; // renewal period, seconds
+ bool going_down{false}; // go_down() requested; stop renewing
+ bool locked{false};
+
+ // renewals later than interval_tolerance (90% of interval) count as late
+ const ceph::timespan interval_tolerance;
+ const ceph::timespan ts_interval; // interval as a timespan, for staleness checks
+
+ RGWCoroutine* caller;
+
+ bool aborted{false};
+
+ ceph::coarse_mono_time last_renew_try_time;
+ ceph::coarse_mono_time current_time;
+
+ LatencyMonitor* latency; // shared monitor for lock-op latency backoff
+
+public:
+ RGWContinuousLeaseCR(RGWAsyncRadosProcessor* async_rados,
+ rgw::sal::RadosStore* _store,
+ rgw_raw_obj obj, std::string lock_name,
+ int interval, RGWCoroutine* caller,
+ LatencyMonitor* const latency)
+ : RGWCoroutine(_store->ctx()), async_rados(async_rados), store(_store),
+ obj(std::move(obj)), lock_name(std::move(lock_name)),
+ interval(interval), interval_tolerance(ceph::make_timespan(9*interval/10)),
+ ts_interval(ceph::make_timespan(interval)), caller(caller), latency(latency)
+ {}
+
+ virtual ~RGWContinuousLeaseCR() override;
+
+ int operate(const DoutPrefixProvider *dpp) override;
+
+ // The lease is only trusted if a renewal was attempted within the
+ // last interval; a stale renewal means the lock may have expired.
+ bool is_locked() const {
+ if (ceph::coarse_mono_clock::now() - last_renew_try_time > ts_interval) {
+ return false;
+ }
+ return locked;
+ }
+
+ void set_locked(bool status) {
+ locked = status;
+ }
+
+ // Request shutdown and wake the coroutine so it can release the lock.
+ void go_down() {
+ going_down = true;
+ wakeup();
+ }
+
+ void abort() {
+ aborted = true;
+ }
+};
+
+// Coroutine that appends entries to a cls_log timelog object.
+class RGWRadosTimelogAddCR : public RGWSimpleCoroutine {
+ const DoutPrefixProvider *dpp;
+ rgw::sal::RadosStore* store;
+ std::list<cls_log_entry> entries; // entries to append
+
+ std::string oid; // target timelog object
+
+ boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+
+public:
+ RGWRadosTimelogAddCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* _store, const std::string& _oid,
+ const cls_log_entry& entry);
+
+ int send_request(const DoutPrefixProvider *dpp) override;
+ int request_complete() override;
+};
+
+// Coroutine that trims a cls_log timelog object by time range and/or
+// marker range.  Fields are protected so RGWSyncLogTrimCR can reuse them.
+class RGWRadosTimelogTrimCR : public RGWSimpleCoroutine {
+ const DoutPrefixProvider *dpp;
+ rgw::sal::RadosStore* store;
+ boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+ protected:
+ std::string oid;
+ real_time start_time;
+ real_time end_time;
+ std::string from_marker;
+ std::string to_marker;
+
+ public:
+ RGWRadosTimelogTrimCR(const DoutPrefixProvider *dpp,
+ rgw::sal::RadosStore* store, const std::string& oid,
+ const real_time& start_time, const real_time& end_time,
+ const std::string& from_marker,
+ const std::string& to_marker);
+
+ int send_request(const DoutPrefixProvider *dpp) override;
+ int request_complete() override;
+};
+
+// wrapper to update last_trim_marker on success
+class RGWSyncLogTrimCR : public RGWRadosTimelogTrimCR {
+ CephContext *cct;
+ std::string *last_trim_marker;
+ public:
+ static constexpr const char* max_marker = "99999999";
+
+ RGWSyncLogTrimCR(const DoutPrefixProvider *dpp,
+ rgw::sal::RadosStore* store, const std::string& oid,
+ const std::string& to_marker, std::string *last_trim_marker);
+ int request_complete() override;
+};
+
+// Async-rados request that stats a local rados object, returning
+// size/mtime/epoch through the optional out-pointers.
+class RGWAsyncStatObj : public RGWAsyncRadosRequest {
+ const DoutPrefixProvider *dpp;
+ rgw::sal::RadosStore* store;
+ RGWBucketInfo bucket_info;
+ rgw_obj obj;
+ uint64_t *psize; // optional outputs; each may be nullptr
+ real_time *pmtime;
+ uint64_t *pepoch;
+ RGWObjVersionTracker *objv_tracker;
+protected:
+ int _send_request(const DoutPrefixProvider *dpp) override;
+public:
+ RGWAsyncStatObj(const DoutPrefixProvider *dpp, RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* store,
+ const RGWBucketInfo& _bucket_info, const rgw_obj& obj, uint64_t *psize = nullptr,
+ real_time *pmtime = nullptr, uint64_t *pepoch = nullptr,
+ RGWObjVersionTracker *objv_tracker = nullptr)
+ : RGWAsyncRadosRequest(caller, cn), dpp(dpp), store(store),
+ // previously _bucket_info was accepted but silently ignored,
+ // leaving the member default-constructed
+ bucket_info(_bucket_info),
+ obj(obj), psize(psize),
+ pmtime(pmtime), pepoch(pepoch), objv_tracker(objv_tracker) {}
+};
+
+// Coroutine wrapper over RGWAsyncStatObj: queues the local object stat
+// on the async-rados pool.
+class RGWStatObjCR : public RGWSimpleCoroutine {
+ const DoutPrefixProvider *dpp;
+ rgw::sal::RadosStore* store;
+ RGWAsyncRadosProcessor *async_rados;
+ RGWBucketInfo bucket_info;
+ rgw_obj obj;
+ uint64_t *psize; // optional outputs; each may be nullptr
+ real_time *pmtime;
+ uint64_t *pepoch;
+ RGWObjVersionTracker *objv_tracker;
+ RGWAsyncStatObj *req = nullptr;
+ public:
+ RGWStatObjCR(const DoutPrefixProvider *dpp, RGWAsyncRadosProcessor *async_rados, rgw::sal::RadosStore* store,
+ const RGWBucketInfo& _bucket_info, const rgw_obj& obj, uint64_t *psize = nullptr,
+ real_time* pmtime = nullptr, uint64_t *pepoch = nullptr,
+ RGWObjVersionTracker *objv_tracker = nullptr);
+ ~RGWStatObjCR() override {
+ request_cleanup();
+ }
+ void request_cleanup() override;
+
+ int send_request(const DoutPrefixProvider *dpp) override;
+ int request_complete() override;
+};
+
+/// coroutine wrapper for IoCtx::aio_notify()
+class RGWRadosNotifyCR : public RGWSimpleCoroutine {
+ rgw::sal::RadosStore* const store;
+ const rgw_raw_obj obj;
+ bufferlist request; // notify payload
+ const uint64_t timeout_ms; // notify timeout in milliseconds
+ bufferlist *response; // optional aggregated replies; may be nullptr
+ rgw_rados_ref ref; // resolved in send_request()
+ boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+
+public:
+ RGWRadosNotifyCR(rgw::sal::RadosStore* store, const rgw_raw_obj& obj,
+ bufferlist& request, uint64_t timeout_ms,
+ bufferlist *response);
+
+ int send_request(const DoutPrefixProvider *dpp) override;
+ int request_complete() override;
+};
+
+// Coroutine that POSTs a per-shard data-change notification to a peer
+// zone over its REST connection.
+class RGWDataPostNotifyCR : public RGWCoroutine {
+ RGWRados *store;
+ RGWHTTPManager& http_manager;
+ bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >& shards; // shard id -> changed entries
+ const char *source_zone;
+ RGWRESTConn *conn; // connection to the zone being notified
+
+public:
+ RGWDataPostNotifyCR(RGWRados *_store, RGWHTTPManager& _http_manager, bc::flat_map<int,
+ bc::flat_set<rgw_data_notify_entry> >& _shards, const char *_zone, RGWRESTConn *_conn)
+ : RGWCoroutine(_store->ctx()), store(_store), http_manager(_http_manager),
+ shards(_shards), source_zone(_zone), conn(_conn) {}
+
+ int operate(const DoutPrefixProvider* dpp) override;
+};
+
diff --git a/src/rgw/driver/rados/rgw_cr_tools.cc b/src/rgw/driver/rados/rgw_cr_tools.cc
new file mode 100644
index 000000000..94665a35a
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_cr_tools.cc
@@ -0,0 +1,292 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "common/errno.h"
+
+#include "rgw_cr_tools.h"
+#include "rgw_bucket.h"
+#include "rgw_user.h"
+#include "rgw_op.h"
+#include "rgw_acl_s3.h"
+#include "rgw_zone.h"
+
+#include "services/svc_zone.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+/* Create a local RGW user from rgw_user_create_params, mirroring the
+ * 'radosgw-admin user create' flow: translate the params into an
+ * RGWUserAdminOpState, optionally apply the configured default
+ * bucket/user quotas, then delegate to RGWUserAdminOp_User::create().
+ * Returns that call's result (0 on success, negative error code). */
+template<>
+int RGWUserCreateCR::Request::_send_request(const DoutPrefixProvider *dpp)
+{
+  CephContext *cct = store->ctx();
+
+  const int32_t default_max_buckets =
+    cct->_conf.get_val<int64_t>("rgw_user_max_buckets");
+
+  RGWUserAdminOpState op_state(store);
+
+  auto& user = params.user;
+
+  op_state.set_user_id(user);
+  op_state.set_display_name(params.display_name);
+  op_state.set_user_email(params.email);
+  op_state.set_caps(params.caps);
+  op_state.set_access_key(params.access_key);
+  op_state.set_secret_key(params.secret_key);
+
+  if (!params.key_type.empty()) {
+    int32_t key_type = KEY_TYPE_S3;  // anything other than "swift" falls back to s3
+    if (params.key_type == "swift") {
+      key_type = KEY_TYPE_SWIFT;
+    }
+
+    op_state.set_key_type(key_type);
+  }
+
+  op_state.set_max_buckets(params.max_buckets.value_or(default_max_buckets));
+  op_state.set_suspension(params.suspended);
+  op_state.set_system(params.system);
+  op_state.set_exclusive(params.exclusive);
+
+  if (params.generate_key) {
+    op_state.set_generate_key();
+  }
+
+
+  if (params.apply_quota) {
+    RGWQuota quota;
+
+    // a negative conf value means "no default quota of that kind"
+    if (cct->_conf->rgw_bucket_default_quota_max_objects >= 0) {
+      quota.bucket_quota.max_objects = cct->_conf->rgw_bucket_default_quota_max_objects;
+      quota.bucket_quota.enabled = true;
+    }
+
+    if (cct->_conf->rgw_bucket_default_quota_max_size >= 0) {
+      quota.bucket_quota.max_size = cct->_conf->rgw_bucket_default_quota_max_size;
+      quota.bucket_quota.enabled = true;
+    }
+
+    if (cct->_conf->rgw_user_default_quota_max_objects >= 0) {
+      quota.user_quota.max_objects = cct->_conf->rgw_user_default_quota_max_objects;
+      quota.user_quota.enabled = true;
+    }
+
+    if (cct->_conf->rgw_user_default_quota_max_size >= 0) {
+      quota.user_quota.max_size = cct->_conf->rgw_user_default_quota_max_size;
+      quota.user_quota.enabled = true;
+    }
+
+    if (quota.bucket_quota.enabled) {
+      op_state.set_bucket_quota(quota.bucket_quota);
+    }
+
+    if (quota.user_quota.enabled) {
+      op_state.set_user_quota(quota.user_quota);
+    }
+  }
+
+  RGWNullFlusher flusher;  // no formatted output wanted from the admin op
+  return RGWUserAdminOp_User::create(dpp, store, op_state, flusher, null_yield);
+}
+
+/* Fetch RGWUserInfo for params.user into the shared result buffer. */
+template<>
+int RGWGetUserInfoCR::Request::_send_request(const DoutPrefixProvider *dpp)
+{
+  return store->ctl()->user->get_info_by_uid(dpp, params.user, result.get(), null_yield);
+}
+
+/* Load the sal::Bucket handle for (params.tenant, params.bucket_name)
+ * into result->bucket. */
+template<>
+int RGWGetBucketInfoCR::Request::_send_request(const DoutPrefixProvider *dpp)
+{
+  return store->get_bucket(dpp, nullptr, params.tenant, params.bucket_name, &result->bucket, null_yield);
+}
+
+/* Create a bucket on the local zone (no forwarding to a master zone).
+ *
+ * Validates the requested placement rule against the zonegroup, re-reads
+ * any pre-existing bucket to enforce same-owner / same-placement
+ * semantics, creates the bucket with a canned private ACL, then links it
+ * to the owning user.  Returns 0 on success, -ERR_BUCKET_EXISTS/-EEXIST
+ * on ownership or placement conflicts, other negative codes on error. */
+template<>
+int RGWBucketCreateLocalCR::Request::_send_request(const DoutPrefixProvider *dpp)
+{
+  CephContext *cct = store->ctx();
+  auto& zone_svc = store->svc()->zone;
+
+  const auto& user_info = params.user_info.get();
+  const auto& user = user_info->user_id;
+  const auto& bucket_name = params.bucket_name;
+  auto& placement_rule = params.placement_rule;
+
+  if (!placement_rule.empty() &&
+      !zone_svc->get_zone_params().valid_placement(placement_rule)) {
+    ldpp_dout(dpp, 0) << "placement target (" << placement_rule << ")"
+      << " doesn't exist in the placement targets of zonegroup"
+      << " (" << zone_svc->get_zonegroup().api_name << ")" << dendl;
+    return -ERR_INVALID_LOCATION_CONSTRAINT;
+  }
+
+  /* we need to make sure we read bucket info, it's not read before for this
+   * specific request */
+  RGWBucketInfo bucket_info;
+  map<string, bufferlist> bucket_attrs;
+
+  int ret = store->getRados()->get_bucket_info(store->svc(), user.tenant, bucket_name,
+                                               bucket_info, nullptr, null_yield, dpp, &bucket_attrs);
+  if (ret < 0 && ret != -ENOENT)
+    return ret;
+  bool bucket_exists = (ret != -ENOENT);
+
+  RGWAccessControlPolicy old_policy(cct);
+  ACLOwner bucket_owner;
+  bucket_owner.set_id(user);
+  bucket_owner.set_name(user_info->display_name);
+  if (bucket_exists) {
+    /* an existing bucket is only acceptable when owned by this user */
+    ret = rgw_op_get_bucket_policy_from_attr(dpp, cct, store, bucket_info,
+                                             bucket_attrs, &old_policy, null_yield);
+    if (ret >= 0)  {
+      if (old_policy.get_owner().get_id().compare(user) != 0) {
+        return -EEXIST;
+      }
+    }
+  }
+
+  RGWBucketInfo master_info;
+  rgw_bucket *pmaster_bucket = nullptr;       // no master zone involved
+  uint32_t *pmaster_num_shards = nullptr;
+  real_time creation_time;
+
+  string zonegroup_id = zone_svc->get_zonegroup().get_id();
+
+  if (bucket_exists) {
+    rgw_placement_rule selected_placement_rule;
+    rgw_bucket bucket;
+    bucket.tenant = user.tenant;
+    bucket.name = bucket_name;
+    ret = zone_svc->select_bucket_placement(dpp, *user_info, zonegroup_id,
+                                            placement_rule,
+                                            &selected_placement_rule, nullptr, null_yield);
+    if (ret < 0) {
+      /* bug fix: previously unchecked -- on failure selected_placement_rule
+       * was indeterminate and the comparison below meaningless */
+      ldpp_dout(dpp, 0) << "failed to select bucket placement: " << cpp_strerror(-ret) << dendl;
+      return ret;
+    }
+    if (selected_placement_rule != bucket_info.placement_rule) {
+      ldpp_dout(dpp, 0) << "bucket already exists on a different placement rule: "
+        << " selected_rule= " << selected_placement_rule
+        << " existing_rule= " << bucket_info.placement_rule << dendl;
+      return -EEXIST;
+    }
+  }
+
+  /* Encode special metadata first as we're using std::map::emplace under
+   * the hood. This method will add the new items only if the map doesn't
+   * contain such keys yet. */
+  RGWAccessControlPolicy_S3 policy(cct);
+  policy.create_canned(bucket_owner, bucket_owner, string()); /* default private policy */
+  bufferlist aclbl;
+  policy.encode(aclbl);
+  map<string, buffer::list> attrs;
+  attrs.emplace(RGW_ATTR_ACL, std::move(aclbl)); /* std::move on the key macro literal was a no-op */
+
+  RGWQuotaInfo quota_info;
+  const RGWQuotaInfo * pquota_info = nullptr;
+
+  rgw_bucket bucket;
+  bucket.tenant = user.tenant;
+  bucket.name = bucket_name;
+
+  RGWBucketInfo info;
+  obj_version ep_objv;
+
+  ret = store->getRados()->create_bucket(*user_info, bucket, zonegroup_id,
+                                         placement_rule, bucket_info.swift_ver_location,
+                                         pquota_info, attrs,
+                                         info, nullptr, &ep_objv, creation_time,
+                                         pmaster_bucket, pmaster_num_shards, null_yield, dpp, true);
+
+
+  if (ret && ret != -EEXIST)
+    return ret;
+
+  bool existed = (ret == -EEXIST);
+
+  if (existed) {
+    if (info.owner != user) {
+      ldpp_dout(dpp, 20) << "NOTICE: bucket already exists under a different user (bucket=" << bucket << " user=" << user << " bucket_owner=" << info.owner << dendl;
+      return -EEXIST;
+    }
+    bucket = info.bucket;
+  }
+
+  ret = store->ctl()->bucket->link_bucket(user, bucket, info.creation_time, null_yield, dpp, false);
+  if (ret && !existed && ret != -EEXIST) {
+    /* if it exists (or previously existed), don't remove it! */
+    int r = store->ctl()->bucket->unlink_bucket(user, bucket, null_yield, dpp);
+    if (r < 0) {
+      ldpp_dout(dpp, 0) << "WARNING: failed to unlink bucket: ret=" << r << dendl;
+    }
+  } else if (ret == -EEXIST || (ret == 0 && existed)) {
+    ret = -ERR_BUCKET_EXISTS;
+  }
+
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: bucket creation (bucket=" << bucket << ") return ret=" << ret << dendl;
+  }
+
+  return ret;
+}
+
+/* Write a small object through RGWDataAccess.  Returns 0 on success or a
+ * negative error code on failure. */
+template<>
+int RGWObjectSimplePutCR::Request::_send_request(const DoutPrefixProvider *dpp)
+{
+  RGWDataAccess::ObjectRef obj;
+
+  CephContext *cct = store->ctx();
+
+  int ret = params.bucket->get_object(params.key, &obj);
+  if (ret < 0) {
+    lderr(cct) << "ERROR: failed to get object: " << cpp_strerror(-ret) << dendl;
+    return ret; /* bug fix: was 'return -ret', flipping the error positive so callers saw success */
+  }
+
+  if (params.user_data) {
+    obj->set_user_data(*params.user_data);
+  }
+
+  ret = obj->put(params.data, params.attrs, dpp, null_yield);
+  if (ret < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: put object returned error: " << cpp_strerror(-ret) << dendl;
+    return ret; /* bug fix: the error used to be logged but 0 was returned */
+  }
+
+  return 0;
+}
+
+/* Install a lifecycle configuration on a bucket via the LC subsystem.
+ * Returns 0 on success, -EIO when LC is not initialized, or the
+ * set_bucket_config() error. */
+template<>
+int RGWBucketLifecycleConfigCR::Request::_send_request(const DoutPrefixProvider *dpp)
+{
+  CephContext *cct = store->ctx();
+
+  RGWLC *lc = store->getRados()->get_lc();
+  if (!lc) {
+    lderr(cct) << "ERROR: lifecycle object is not initialized!" << dendl;
+    return -EIO;
+  }
+
+  int ret = lc->set_bucket_config(params.bucket,
+                                  params.bucket_attrs,
+                                  &params.config);
+  if (ret < 0) {
+    /* bug fix: log-message typo ("bucke") and 'return -ret' sign flip */
+    lderr(cct) << "ERROR: failed to set lifecycle on bucket: " << cpp_strerror(-ret) << dendl;
+    return ret;
+  }
+
+  return 0;
+}
+
+/* Resolve the sync-policy handler for the (zone, bucket) pair in params
+ * and store it in result->policy_handler. */
+template<>
+int RGWBucketGetSyncPolicyHandlerCR::Request::_send_request(const DoutPrefixProvider *dpp)
+{
+  const int ret = store->ctl()->bucket->get_sync_policy_handler(params.zone,
+                                                                params.bucket,
+                                                                &result->policy_handler,
+                                                                null_yield,
+                                                                dpp);
+  if (ret >= 0) {
+    return 0;
+  }
+  ldpp_dout(dpp, -1) << "ERROR: " << __func__ << "(): get_sync_policy_handler() returned " << ret << dendl;
+  return ret;
+}
diff --git a/src/rgw/driver/rados/rgw_cr_tools.h b/src/rgw/driver/rados/rgw_cr_tools.h
new file mode 100644
index 000000000..4cd97aa82
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_cr_tools.h
@@ -0,0 +1,85 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_cr_rados.h"
+#include "rgw_tools.h"
+#include "rgw_lc.h"
+
+#include "services/svc_bucket_sync.h"
+
+/// Inputs for RGWUserCreateCR; mirrors the 'radosgw-admin user create'
+/// options.  Only 'user' is required.
+struct rgw_user_create_params {
+  rgw_user user;
+  std::string display_name;
+  std::string email;
+  std::string access_key;
+  std::string secret_key;
+  std::string key_type; /* "swift" or "s3" */
+  std::string caps;
+
+  bool generate_key{true};
+  bool suspended{false};
+  std::optional<int32_t> max_buckets;  // unset -> rgw_user_max_buckets conf default
+  bool system{false};
+  bool exclusive{false};
+  bool apply_quota{true};  // apply configured default bucket/user quotas
+};
+
+using RGWUserCreateCR = RGWSimpleWriteOnlyAsyncCR<rgw_user_create_params>;
+
+/// Input for RGWGetUserInfoCR; the coroutine's result is RGWUserInfo.
+struct rgw_get_user_info_params {
+  rgw_user user;
+};
+
+using RGWGetUserInfoCR = RGWSimpleAsyncCR<rgw_get_user_info_params, RGWUserInfo>;
+
+/// Input for RGWGetBucketInfoCR: bucket identified by tenant + name.
+struct rgw_get_bucket_info_params {
+  std::string tenant;
+  std::string bucket_name;
+};
+
+/// Result: an owning handle to the resolved bucket.
+struct rgw_get_bucket_info_result {
+  std::unique_ptr<rgw::sal::Bucket> bucket;
+};
+
+using RGWGetBucketInfoCR = RGWSimpleAsyncCR<rgw_get_bucket_info_params, rgw_get_bucket_info_result>;
+
+/// Inputs for RGWBucketCreateLocalCR (bucket creation on the local zone).
+struct rgw_bucket_create_local_params {
+  std::shared_ptr<RGWUserInfo> user_info;  // owning user; must be non-null
+  std::string bucket_name;
+  rgw_placement_rule placement_rule;       // empty -> zonegroup default placement
+};
+
+using RGWBucketCreateLocalCR = RGWSimpleWriteOnlyAsyncCR<rgw_bucket_create_local_params>;
+
+/// Inputs for RGWObjectSimplePutCR: whole-object write of 'data' to
+/// 'key' inside 'bucket', with optional attrs and user metadata.
+struct rgw_object_simple_put_params {
+  RGWDataAccess::BucketRef bucket;
+  rgw_obj_key key;
+  bufferlist data;
+  std::map<std::string, bufferlist> attrs;
+  std::optional<std::string> user_data;
+};
+
+using RGWObjectSimplePutCR = RGWSimpleWriteOnlyAsyncCR<rgw_object_simple_put_params>;
+
+
+/// Inputs for RGWBucketLifecycleConfigCR.
+struct rgw_bucket_lifecycle_config_params {
+  rgw::sal::Bucket* bucket;  // non-owning; presumably the caller keeps it alive for the CR's duration -- TODO confirm
+  rgw::sal::Attrs bucket_attrs;
+  RGWLifecycleConfiguration config;
+};
+
+using RGWBucketLifecycleConfigCR = RGWSimpleWriteOnlyAsyncCR<rgw_bucket_lifecycle_config_params>;
+
+/// Inputs for RGWBucketGetSyncPolicyHandlerCR; both fields optional and
+/// forwarded as-is to get_sync_policy_handler().
+struct rgw_bucket_get_sync_policy_params {
+  std::optional<rgw_zone_id> zone;
+  std::optional<rgw_bucket> bucket;
+};
+
+/// Result: shared handler describing the bucket's sync policy.
+struct rgw_bucket_get_sync_policy_result {
+  RGWBucketSyncPolicyHandlerRef policy_handler;
+};
+
+using RGWBucketGetSyncPolicyHandlerCR = RGWSimpleAsyncCR<rgw_bucket_get_sync_policy_params, rgw_bucket_get_sync_policy_result>;
+
+
diff --git a/src/rgw/driver/rados/rgw_d3n_datacache.cc b/src/rgw/driver/rados/rgw_d3n_datacache.cc
new file mode 100644
index 000000000..f1bf731ae
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_d3n_datacache.cc
@@ -0,0 +1,369 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_d3n_datacache.h"
+#include "rgw_rest_client.h"
+#include "rgw_auth_s3.h"
+#include "rgw_op.h"
+#include "rgw_common.h"
+#include "rgw_auth_s3.h"
+#include "rgw_op.h"
+#include "rgw_crypt_sanitize.h"
+#if defined(__linux__)
+#include <features.h>
+#endif
+
+#if __has_include(<filesystem>)
+#include <filesystem>
+namespace efs = std::filesystem;
+#else
+#include <experimental/filesystem>
+namespace efs = std::experimental::filesystem;
+#endif
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+/* Open the cache file for 'oid', stage 'bl' in a private buffer and fill
+ * the aiocb for a later aio_write().  Returns 0 on success, a negative
+ * errno-style code on failure; on failure the fd is closed and reset. */
+int D3nCacheAioWriteRequest::d3n_libaio_prepare_write_op(bufferlist& bl, unsigned int len, string oid, string cache_location)
+{
+  std::string location = cache_location + url_encode(oid, true);
+  int r = 0;
+
+  lsubdout(g_ceph_context, rgw_datacache, 20) << "D3nDataCache: " << __func__ << "(): Write To Cache, location=" << location << dendl;
+  cb = new struct aiocb;
+  mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH;
+  memset(cb, 0, sizeof(struct aiocb));
+  r = fd = ::open(location.c_str(), O_WRONLY | O_CREAT | O_TRUNC, mode);
+  if (fd < 0) {
+    r = -errno; // bug fix: open() returns -1, which discarded the real reason
+    ldout(cct, 0) << "ERROR: D3nCacheAioWriteRequest::create_io: open file failed, errno=" << errno << ", location='" << location.c_str() << "'" << dendl;
+    goto done;
+  }
+  if (g_conf()->rgw_d3n_l1_fadvise != POSIX_FADV_NORMAL)
+    posix_fadvise(fd, 0, 0, g_conf()->rgw_d3n_l1_fadvise);
+  cb->aio_fildes = fd;
+
+  data = malloc(len);
+  if (!data) {
+    r = -ENOMEM; // bug fix: r still held the (non-negative) fd, so callers saw success
+    ldout(cct, 0) << "ERROR: D3nCacheAioWriteRequest::create_io: memory allocation failed" << dendl;
+    goto close_file;
+  }
+  cb->aio_buf = data;
+  memcpy((void*)data, bl.c_str(), len);
+  cb->aio_nbytes = len;
+  goto done;
+
+close_file:
+  ::close(fd);
+  fd = -1; // bug fix: prevent a second close in the destructor
+done:
+  return r;
+}
+
+// Default-construct an empty cache; cct and all sizing are supplied
+// later via init(), until which the cache must not be used.
+D3nDataCache::D3nDataCache()
+  : cct(nullptr), io_type(_io_type::ASYNC_IO), free_data_cache_size(0), outstanding_write_size(0)
+{
+  lsubdout(g_ceph_context, rgw_datacache, 5) << "D3nDataCache: " << __func__ << "()" << dendl;
+}
+
+/* Configure the cache from ceph conf: sizing, persistent directory
+ * (created or optionally purged on start), eviction policy, and -- on
+ * glibc platforms -- libaio thread-pool tuning. */
+void D3nDataCache::init(CephContext *_cct) {
+  cct = _cct;
+  free_data_cache_size = cct->_conf->rgw_d3n_l1_datacache_size;
+  head = nullptr;
+  tail = nullptr;
+  cache_location = cct->_conf->rgw_d3n_l1_datacache_persistent_path;
+  // normalize so later 'cache_location + url_encode(oid)' concatenations work
+  if(cache_location.back() != '/') {
+      cache_location += "/";
+  }
+  try {
+    if (efs::exists(cache_location)) {
+      // d3n: evict the cache storage directory
+      if (g_conf()->rgw_d3n_l1_evict_cache_on_start) {
+        lsubdout(g_ceph_context, rgw, 5) << "D3nDataCache: init: evicting the persistent storage directory on start" << dendl;
+        for (auto& p : efs::directory_iterator(cache_location)) {
+          efs::remove_all(p.path());
+        }
+      }
+    } else {
+      // create the cache storage directory
+      lsubdout(g_ceph_context, rgw, 5) << "D3nDataCache: init: creating the persistent storage directory on start" << dendl;
+      efs::create_directories(cache_location);
+    }
+  } catch (const efs::filesystem_error& e) {
+    // non-fatal here; subsequent cache writes will fail and be logged
+    lderr(g_ceph_context) << "D3nDataCache: init: ERROR initializing the cache storage directory '" << cache_location <<
+                              "' : " << e.what() << dendl;
+  }
+
+  auto conf_eviction_policy = cct->_conf.get_val<std::string>("rgw_d3n_l1_eviction_policy");
+  ceph_assert(conf_eviction_policy == "lru" || conf_eviction_policy == "random");
+  if (conf_eviction_policy == "lru")
+    eviction_policy = _eviction_policy::LRU;
+  if (conf_eviction_policy == "random")
+    eviction_policy = _eviction_policy::RANDOM;
+
+#if defined(HAVE_LIBAIO) && defined(__GLIBC__)
+  // libaio setup
+  struct aioinit ainit{0};
+  ainit.aio_threads = cct->_conf.get_val<int64_t>("rgw_d3n_libaio_aio_threads");
+  ainit.aio_num = cct->_conf.get_val<int64_t>("rgw_d3n_libaio_aio_num");
+  ainit.aio_idle_time = 10;
+  aio_init(&ainit);
+#endif
+}
+
+/* Synchronous cache write: persist the chunk to the cache directory and
+ * register it in d3n_cache_map.  Returns 0 on success or a negative
+ * error code; on failure nothing is inserted into the map. */
+int D3nDataCache::d3n_io_write(bufferlist& bl, unsigned int len, std::string oid)
+{
+  std::string location = cache_location + url_encode(oid, true);
+
+  lsubdout(g_ceph_context, rgw_datacache, 20) << "D3nDataCache: " << __func__ << "(): location=" << location << dendl;
+  FILE *cache_file = nullptr;
+  int r = 0;
+  size_t nbytes = 0;
+
+  cache_file = fopen(location.c_str(), "w+");
+  if (cache_file == nullptr) {
+    ldout(cct, 0) << "ERROR: D3nDataCache::fopen file has return error, errno=" << errno << dendl;
+    return -errno;
+  }
+
+  nbytes = fwrite(bl.c_str(), 1, len, cache_file);
+  if (nbytes != len) {
+    ldout(cct, 0) << "ERROR: D3nDataCache::io_write: fwrite has returned error: nbytes!=len, nbytes=" << nbytes << ", len=" << len << dendl;
+    fclose(cache_file); // bug fix: the stream used to leak on short writes
+    return -EIO;
+  }
+
+  r = fclose(cache_file);
+  if (r != 0) {
+    ldout(cct, 0) << "ERROR: D3nDataCache::fclose file has return error, errno=" << errno << dendl;
+    return -errno;
+  }
+
+  { // update cache_map entries for new chunk in cache
+    const std::lock_guard l(d3n_cache_lock);
+    // bug fix: chunk_info was allocated before the I/O and leaked on
+    // every error path above; allocate it only once the write succeeded
+    D3nChunkDataInfo* chunk_info = new D3nChunkDataInfo;
+    chunk_info->oid = oid;
+    chunk_info->set_ctx(cct);
+    chunk_info->size = len;
+    d3n_cache_map.insert(pair<string, D3nChunkDataInfo*>(oid, chunk_info));
+  }
+
+  return r;
+}
+
+/// libaio SIGEV_THREAD completion trampoline: recover the request from
+/// the signal payload and hand it back to its owning cache instance.
+void d3n_libaio_write_cb(sigval sigval)
+{
+  lsubdout(g_ceph_context, rgw_datacache, 30) << "D3nDataCache: " << __func__ << "()" << dendl;
+  auto* req = static_cast<D3nCacheAioWriteRequest*>(sigval.sival_ptr);
+  req->priv_data->d3n_libaio_write_completion_cb(req);
+}
+
+
+/* Completion handler for an async cache write: register the chunk in the
+ * cache map, settle the size accounting, insert the chunk at the LRU
+ * head, and destroy the request (which closes the fd and frees the
+ * staging buffer). */
+void D3nDataCache::d3n_libaio_write_completion_cb(D3nCacheAioWriteRequest* c)
+{
+  D3nChunkDataInfo* chunk_info{nullptr};
+
+  ldout(cct, 5) << "D3nDataCache: " << __func__ << "(): oid=" << c->oid << dendl;
+
+  { // update cache_map entries for new chunk in cache
+    const std::lock_guard l(d3n_cache_lock);
+    d3n_outstanding_write_list.erase(c->oid);
+    chunk_info = new D3nChunkDataInfo;
+    chunk_info->oid = c->oid;
+    chunk_info->set_ctx(cct);
+    chunk_info->size = c->cb->aio_nbytes;
+    d3n_cache_map.insert(pair<string, D3nChunkDataInfo*>(c->oid, chunk_info));
+  }
+
+  { // update free size
+    const std::lock_guard l(d3n_eviction_lock);
+    free_data_cache_size -= c->cb->aio_nbytes;
+    outstanding_write_size -= c->cb->aio_nbytes;
+    lru_insert_head(chunk_info);
+  }
+  delete c;
+  c = nullptr;  // dead store: 'c' is a local about to go out of scope
+}
+
+/* Allocate and submit a libaio write of the chunk to the cache
+ * directory; completion arrives via d3n_libaio_write_cb (SIGEV_THREAD),
+ * which takes ownership of the request.  Returns 0 once the write is
+ * queued, a negative code otherwise (the request is freed on failure). */
+int D3nDataCache::d3n_libaio_create_write_request(bufferlist& bl, unsigned int len, std::string oid)
+{
+  lsubdout(g_ceph_context, rgw_datacache, 30) << "D3nDataCache: " << __func__ << "(): Write To Cache, oid=" << oid << ", len=" << len << dendl;
+  struct D3nCacheAioWriteRequest* wr = new struct D3nCacheAioWriteRequest(cct);
+  int r=0;
+  if ((r = wr->d3n_libaio_prepare_write_op(bl, len, oid, cache_location)) < 0) {
+    ldout(cct, 0) << "ERROR: D3nDataCache: " << __func__ << "() prepare libaio write op r=" << r << dendl;
+    goto error; // bug fix: 'goto done' used to leak wr on this path
+  }
+  wr->cb->aio_sigevent.sigev_notify = SIGEV_THREAD;
+  wr->cb->aio_sigevent.sigev_notify_function = d3n_libaio_write_cb;
+  wr->cb->aio_sigevent.sigev_notify_attributes = nullptr;
+  wr->cb->aio_sigevent.sigev_value.sival_ptr = (void*)wr;
+  wr->oid = oid;
+  wr->priv_data = this;
+
+  if ((r = ::aio_write(wr->cb)) != 0) {
+    ldout(cct, 0) << "ERROR: D3nDataCache: " << __func__ << "() aio_write r=" << r << dendl;
+    goto error;
+  }
+  return 0;
+
+error:
+  delete wr;
+  return r;
+}
+
+/* Insert an object chunk into the cache (async write path).
+ *
+ * Skips the write when the chunk is already cached or a write for it is
+ * in flight; otherwise evicts entries until the chunk fits and issues a
+ * libaio write.  Size accounting is settled here and again in the
+ * completion callback once the write lands. */
+void D3nDataCache::put(bufferlist& bl, unsigned int len, std::string& oid)
+{
+  size_t sr = 0;
+  uint64_t freed_size = 0, _free_data_cache_size = 0, _outstanding_write_size = 0;
+
+  ldout(cct, 10) << "D3nDataCache::" << __func__ << "(): oid=" << oid << ", len=" << len << dendl;
+  {
+    const std::lock_guard l(d3n_cache_lock);
+    std::unordered_map<string, D3nChunkDataInfo*>::iterator iter = d3n_cache_map.find(oid);
+    if (iter != d3n_cache_map.end()) {
+      ldout(cct, 10) << "D3nDataCache::" << __func__ << "(): data already cached, no rewrite" << dendl;
+      return;
+    }
+    auto it = d3n_outstanding_write_list.find(oid);
+    if (it != d3n_outstanding_write_list.end()) {
+      ldout(cct, 10) << "D3nDataCache: NOTE: data put in cache already issued, no rewrite" << dendl;
+      return;
+    }
+    d3n_outstanding_write_list.insert(oid);
+  }
+  {
+    const std::lock_guard l(d3n_eviction_lock);
+    _free_data_cache_size = free_data_cache_size;
+    _outstanding_write_size = outstanding_write_size;
+  }
+  ldout(cct, 20) << "D3nDataCache: Before eviction _free_data_cache_size:" << _free_data_cache_size << ", _outstanding_write_size:" << _outstanding_write_size << ", freed_size:" << freed_size << dendl;
+  while (len > (_free_data_cache_size - _outstanding_write_size + freed_size)) {
+    ldout(cct, 20) << "D3nDataCache: enter eviction" << dendl;
+    if (eviction_policy == _eviction_policy::LRU) {
+      sr = lru_eviction();
+    } else if (eviction_policy == _eviction_policy::RANDOM) {
+      sr = random_eviction();
+    } else {
+      ldout(cct, 0) << "D3nDataCache: Warning: unknown cache eviction policy, defaulting to lru eviction" << dendl;
+      sr = lru_eviction();
+    }
+    if (sr == 0) {
+      ldout(cct, 2) << "D3nDataCache: Warning: eviction was not able to free disk space, not writing to cache" << dendl;
+      // bug fix: this erase used to run without d3n_cache_lock, racing
+      // against concurrent insert/erase on d3n_outstanding_write_list
+      const std::lock_guard l(d3n_cache_lock);
+      d3n_outstanding_write_list.erase(oid);
+      return;
+    }
+    ldout(cct, 20) << "D3nDataCache: completed eviction of " << sr << " bytes" << dendl;
+    freed_size += sr;
+  }
+  int r = 0;
+  r = d3n_libaio_create_write_request(bl, len, oid);
+  if (r < 0) {
+    const std::lock_guard l(d3n_cache_lock);
+    d3n_outstanding_write_list.erase(oid);
+    ldout(cct, 1) << "D3nDataCache: create_aio_write_request fail, r=" << r << dendl;
+    return;
+  }
+
+  const std::lock_guard l(d3n_eviction_lock);
+  free_data_cache_size += freed_size;
+  outstanding_write_size += len;
+}
+
+/* Return true when 'oid' is cached AND its on-disk file exists with
+ * exactly 'len' bytes; refreshes the entry's LRU position on a hit and
+ * drops the stale map/LRU entry on a size mismatch or missing file.
+ * NOTE(review): this takes d3n_cache_lock then d3n_eviction_lock, while
+ * lru_eviction() takes them in the opposite order -- confirm there is no
+ * lock-order inversion in concurrent use. */
+bool D3nDataCache::get(const string& oid, const off_t len)
+{
+  const std::lock_guard l(d3n_cache_lock);
+  bool exist = false;
+  string location = cache_location + url_encode(oid, true);
+
+  lsubdout(g_ceph_context, rgw_datacache, 20) << "D3nDataCache: " << __func__ << "(): location=" << location << dendl;
+  std::unordered_map<string, D3nChunkDataInfo*>::iterator iter = d3n_cache_map.find(oid);
+  if (!(iter == d3n_cache_map.end())) {
+    // check inside cache whether file exists or not!!!! then make exist true;
+    struct D3nChunkDataInfo* chdo = iter->second;
+    struct stat st;
+    int r = stat(location.c_str(), &st);
+    if ( r != -1 && st.st_size == len) { // file exists and containes required data range length
+      exist = true;
+      /*LRU*/
+      /*get D3nChunkDataInfo*/
+      const std::lock_guard l(d3n_eviction_lock);
+      lru_remove(chdo);
+      lru_insert_head(chdo);
+    } else {
+      // stale entry: evict it from both the map and the LRU list
+      d3n_cache_map.erase(oid);
+      const std::lock_guard l(d3n_eviction_lock);
+      lru_remove(chdo);
+      delete chdo;
+      exist = false;
+    }
+  }
+  return exist;
+}
+
+/* Evict one randomly-chosen cache entry.  Returns the number of bytes
+ * freed, or 0 when the cache is empty (nothing could be evicted). */
+size_t D3nDataCache::random_eviction()
+{
+  lsubdout(g_ceph_context, rgw_datacache, 20) << "D3nDataCache: " << __func__ << "()" << dendl;
+  int n_entries = 0;
+  int random_index = 0;
+  size_t freed_size = 0;
+  D3nChunkDataInfo* del_entry;
+  string del_oid, location;
+  {
+    const std::lock_guard l(d3n_cache_lock);
+    n_entries = d3n_cache_map.size();
+    if (n_entries <= 0) {
+      // bug fix: 'return -1' from a size_t function yielded SIZE_MAX,
+      // which put() would have counted as a successful huge eviction
+      return 0;
+    }
+    // note: the former srand(time(NULL)) call was dead code -- the C
+    // rand() stream is not used by generate_random_number()
+    random_index = ceph::util::generate_random_number<int>(0, n_entries-1);
+    std::unordered_map<string, D3nChunkDataInfo*>::iterator iter = d3n_cache_map.begin();
+    std::advance(iter, random_index);
+    del_oid = iter->first;
+    del_entry = iter->second;
+    ldout(cct, 20) << "D3nDataCache: random_eviction: index:" << random_index << ", free size: " << del_entry->size << dendl;
+    freed_size = del_entry->size;
+    // NOTE(review): the entry is not unlinked from the LRU list here, so
+    // with the random policy the list can retain a dangling pointer --
+    // confirm whether LRU bookkeeping is expected under this policy
+    delete del_entry;
+    del_entry = nullptr;
+    d3n_cache_map.erase(del_oid); // oid
+  }
+
+  location = cache_location + url_encode(del_oid, true);
+  ::remove(location.c_str());
+  return freed_size;
+}
+
+/* Evict the least-recently-used cache entry (LRU tail).  Returns the
+ * number of bytes freed, or 0 when there is nothing to evict. */
+size_t D3nDataCache::lru_eviction()
+{
+  lsubdout(g_ceph_context, rgw_datacache, 20) << "D3nDataCache: " << __func__ << "()" << dendl;
+  int n_entries = 0;
+  size_t freed_size = 0;
+  D3nChunkDataInfo* del_entry;
+  string del_oid, location;
+
+  {
+    const std::lock_guard l(d3n_eviction_lock);
+    del_entry = tail;
+    if (del_entry == nullptr) {
+      ldout(cct, 2) << "D3nDataCache: lru_eviction: del_entry=null_ptr" << dendl;
+      return 0;
+    }
+    lru_remove(del_entry);
+  }
+
+  {
+    const std::lock_guard l(d3n_cache_lock);
+    n_entries = d3n_cache_map.size();
+    if (n_entries <= 0) {
+      ldout(cct, 2) << "D3nDataCache: lru_eviction: cache_map.size<=0" << dendl;
+      // bug fix: was 'return -1', i.e. SIZE_MAX from a size_t function,
+      // which put() treated as freed space; also free the entry we just
+      // unlinked from the LRU list (it was leaked before)
+      delete del_entry;
+      return 0;
+    }
+    del_oid = del_entry->oid;
+    ldout(cct, 20) << "D3nDataCache: lru_eviction: oid to remove: " << del_oid << dendl;
+    d3n_cache_map.erase(del_oid); // oid
+  }
+  freed_size = del_entry->size;
+  delete del_entry;
+  location = cache_location + url_encode(del_oid, true);
+  ::remove(location.c_str());
+  return freed_size;
+}
diff --git a/src/rgw/driver/rados/rgw_d3n_datacache.h b/src/rgw/driver/rados/rgw_d3n_datacache.h
new file mode 100644
index 000000000..feaa3f2b7
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_d3n_datacache.h
@@ -0,0 +1,259 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_rados.h"
+#include <curl/curl.h>
+
+#include "rgw_common.h"
+
+#include <unistd.h>
+#include <signal.h>
+#include "include/Context.h"
+#include "include/lru.h"
+#include "rgw_d3n_cacherequest.h"
+
+
+/*D3nDataCache*/
+struct D3nDataCache;
+
+
+/* Metadata describing one cached chunk; doubly linked into the cache's
+ * intrusive LRU list via lru_prev/lru_next. */
+struct D3nChunkDataInfo : public LRUObject {
+  CephContext *cct = nullptr;
+  uint64_t size = 0;
+  time_t access_time = 0;
+  std::string address;
+  std::string oid;
+  bool complete = false;
+  struct D3nChunkDataInfo* lru_prev = nullptr;
+  struct D3nChunkDataInfo* lru_next = nullptr;
+
+  // bug fix: only 'size' used to be initialized, leaving the pointer and
+  // flag members indeterminate
+  D3nChunkDataInfo() = default;
+
+  void set_ctx(CephContext *_cct) {
+    cct = _cct;
+  }
+
+  void dump(Formatter *f) const;
+  static void generate_test_instances(std::list<D3nChunkDataInfo*>& o);
+};
+
+/* One in-flight libaio write of a chunk to the cache directory.  Owns
+ * the aiocb, the destination fd and the staging buffer; all released by
+ * the destructor when the completion callback deletes the request. */
+struct D3nCacheAioWriteRequest {
+  std::string oid;
+  void *data = nullptr;        // staging buffer handed to aio_write()
+  int fd = -1;                 // destination cache-file descriptor
+  struct aiocb *cb = nullptr;  // control block; zeroed/filled by prepare
+  D3nDataCache *priv_data = nullptr;  // owning cache, for the completion cb
+  CephContext *cct;
+
+  D3nCacheAioWriteRequest(CephContext *_cct) : cct(_cct) {}
+  int d3n_libaio_prepare_write_op(bufferlist& bl, unsigned int len, std::string oid, std::string cache_location);
+
+  ~D3nCacheAioWriteRequest() {
+    // bug fix: members used to be uninitialized and released
+    // unconditionally, so destroying a request whose prepare step never
+    // ran (or failed) closed/freed/deleted indeterminate values
+    if (fd >= 0) {
+      ::close(fd);
+    }
+    if (cb) {
+      cb->aio_buf = nullptr;
+    }
+    free(data);
+    data = nullptr;
+    delete cb;
+  }
+};
+
+/* Local (L1) data cache for RGW: chunks are persisted as files under
+ * cache_location and tracked in d3n_cache_map, with eviction by LRU or
+ * random policy.  d3n_cache_lock guards the map and the outstanding
+ * write set; d3n_eviction_lock guards the LRU list and size counters. */
+struct D3nDataCache {
+
+private:
+  std::unordered_map<std::string, D3nChunkDataInfo*> d3n_cache_map;  // oid -> cached-chunk metadata (owned)
+  std::set<std::string> d3n_outstanding_write_list;  // oids with an async write in flight
+  std::mutex d3n_cache_lock;
+  std::mutex d3n_eviction_lock;
+
+  CephContext *cct;
+  enum class _io_type {
+    SYNC_IO = 1,
+    ASYNC_IO = 2,
+    SEND_FILE = 3
+  } io_type;
+  enum class _eviction_policy {
+    LRU=0, RANDOM=1
+  } eviction_policy;
+
+  struct sigaction action;
+  uint64_t free_data_cache_size = 0;     // bytes still available for caching
+  uint64_t outstanding_write_size = 0;   // bytes reserved by in-flight writes
+  struct D3nChunkDataInfo* head;         // LRU list head (most recently used)
+  struct D3nChunkDataInfo* tail;         // LRU list tail (eviction candidate)
+
+private:
+  void add_io();
+
+public:
+  D3nDataCache();
+  ~D3nDataCache() {
+    while (lru_eviction() > 0);
+  }
+
+  std::string cache_location;  // persistent cache dir, trailing '/' ensured by init()
+
+  bool get(const std::string& oid, const off_t len);
+  void put(bufferlist& bl, unsigned int len, std::string& obj_key);
+  int d3n_io_write(bufferlist& bl, unsigned int len, std::string oid);
+  int d3n_libaio_create_write_request(bufferlist& bl, unsigned int len, std::string oid);
+  void d3n_libaio_write_completion_cb(D3nCacheAioWriteRequest* c);
+  size_t random_eviction();
+  size_t lru_eviction();
+
+  void init(CephContext *_cct);
+
+  // Intrusive LRU helpers; callers must hold d3n_eviction_lock.
+  void lru_insert_head(struct D3nChunkDataInfo* o) {
+    lsubdout(g_ceph_context, rgw_datacache, 30) << "D3nDataCache: " << __func__ << "()" << dendl;
+    o->lru_next = head;
+    o->lru_prev = nullptr;
+    if (head) {
+      head->lru_prev = o;
+    } else {
+      tail = o;
+    }
+    head = o;
+  }
+
+  void lru_insert_tail(struct D3nChunkDataInfo* o) {
+    lsubdout(g_ceph_context, rgw_datacache, 30) << "D3nDataCache: " << __func__ << "()" << dendl;
+    o->lru_next = nullptr;
+    o->lru_prev = tail;
+    if (tail) {
+      tail->lru_next = o;
+    } else {
+      head = o;
+    }
+    tail = o;
+  }
+
+  void lru_remove(struct D3nChunkDataInfo* o) {
+    lsubdout(g_ceph_context, rgw_datacache, 30) << "D3nDataCache: " << __func__ << "()" << dendl;
+    if (o->lru_next)
+      o->lru_next->lru_prev = o->lru_prev;
+    else
+      tail = o->lru_prev;
+    if (o->lru_prev)
+      o->lru_prev->lru_next = o->lru_next;
+    else
+      head = o->lru_next;
+    o->lru_next = o->lru_prev = nullptr;
+  }
+};
+
+
+/// Mixin that layers the D3N read cache over a RADOS store type T by
+/// overriding the GetObj iterate callback.
+template <class T>
+class D3nRGWDataCache : public T {
+
+public:
+  D3nRGWDataCache() {}
+
+  int init_rados() override {
+    int ret;
+    ret = T::init_rados();
+    if (ret < 0)
+      return ret;
+
+    return 0;
+  }
+
+  int get_obj_iterate_cb(const DoutPrefixProvider *dpp, const rgw_raw_obj& read_obj, off_t obj_ofs,
+                         off_t read_ofs, off_t len, bool is_head_obj,
+                         RGWObjState *astate, void *arg) override;
+};
+
+/* Per-chunk GetObj callback with D3N caching.
+ *
+ * Head object: serve any prefetched bytes from astate->data, then read
+ * the remainder directly from RADOS (head chunks are not cached).
+ * Tail objects: serve from the local cache file on a hit; on a miss (or
+ * when caching is bypassed for offset/compressed/encrypted reads) read
+ * from RADOS -- the cache insert itself happens elsewhere on the
+ * completion path. */
+template<typename T>
+int D3nRGWDataCache<T>::get_obj_iterate_cb(const DoutPrefixProvider *dpp, const rgw_raw_obj& read_obj, off_t obj_ofs,
+                                           off_t read_ofs, off_t len, bool is_head_obj,
+                                           RGWObjState *astate, void *arg) {
+  lsubdout(g_ceph_context, rgw_datacache, 30) << "D3nDataCache::" << __func__ << "(): is head object : " << is_head_obj << dendl;
+  librados::ObjectReadOperation op;
+  struct get_obj_data* d = static_cast<struct get_obj_data*>(arg);
+  std::string oid, key;
+
+  if (is_head_obj) {
+    // only when reading from the head object do we need to do the atomic test
+    int r = T::append_atomic_test(dpp, astate, op);
+    if (r < 0)
+      return r;
+
+    if (astate &&
+        obj_ofs < astate->data.length()) {
+      // serve the inline/prefetched head data first
+      unsigned chunk_len = std::min((uint64_t)astate->data.length() - obj_ofs, (uint64_t)len);
+
+      r = d->client_cb->handle_data(astate->data, obj_ofs, chunk_len);
+      if (r < 0)
+        return r;
+
+      len -= chunk_len;
+      d->offset += chunk_len;
+      read_ofs += chunk_len;
+      obj_ofs += chunk_len;
+      if (!len)
+        return 0;
+    }
+
+    auto obj = d->rgwrados->svc.rados->obj(read_obj);
+    r = obj.open(dpp);
+    if (r < 0) {
+      lsubdout(g_ceph_context, rgw, 4) << "failed to open rados context for " << read_obj << dendl;
+      return r;
+    }
+
+    ldpp_dout(dpp, 20) << "D3nDataCache::" << __func__ << "(): oid=" << read_obj.oid << " obj-ofs=" << obj_ofs << " read_ofs=" << read_ofs << " len=" << len << dendl;
+    op.read(read_ofs, len, nullptr, nullptr);
+
+    const uint64_t cost = len;
+    const uint64_t id = obj_ofs; // use logical object offset for sorting replies
+
+    auto completed = d->aio->get(obj, rgw::Aio::librados_op(std::move(op), d->yield), cost, id);
+    return d->flush(std::move(completed));
+  } else {
+    ldpp_dout(dpp, 20) << "D3nDataCache::" << __func__ << "(): oid=" << read_obj.oid << ", is_head_obj=" << is_head_obj << ", obj-ofs=" << obj_ofs << ", read_ofs=" << read_ofs << ", len=" << len << dendl;
+    int r;
+
+    op.read(read_ofs, len, nullptr, nullptr);
+
+    const uint64_t cost = len;
+    const uint64_t id = obj_ofs; // use logical object offset for sorting replies
+    oid = read_obj.oid;
+
+    auto obj = d->rgwrados->svc.rados->obj(read_obj);
+    r = obj.open(dpp);
+    if (r < 0) {
+      lsubdout(g_ceph_context, rgw, 0) << "D3nDataCache: Error: failed to open rados context for " << read_obj << ", r=" << r << dendl;
+      return r;
+    }
+
+    // the cache stores whole plaintext chunks only, so partial,
+    // compressed or encrypted reads bypass it entirely
+    const bool is_compressed = (astate->attrset.find(RGW_ATTR_COMPRESSION) != astate->attrset.end());
+    const bool is_encrypted = (astate->attrset.find(RGW_ATTR_CRYPT_MODE) != astate->attrset.end());
+    if (read_ofs != 0 || astate->size != astate->accounted_size || is_compressed || is_encrypted) {
+      d->d3n_bypass_cache_write = true;
+      lsubdout(g_ceph_context, rgw, 5) << "D3nDataCache: " << __func__ << "(): Note - bypassing datacache: oid=" << read_obj.oid << ", read_ofs!=0 = " << read_ofs << ", size=" << astate->size << " != accounted_size=" << astate->accounted_size << ", is_compressed=" << is_compressed << ", is_encrypted=" << is_encrypted  << dendl;
+      auto completed = d->aio->get(obj, rgw::Aio::librados_op(std::move(op), d->yield), cost, id);
+      r = d->flush(std::move(completed));
+      return r;
+    }
+
+    if (d->rgwrados->d3n_data_cache->get(oid, len)) {
+      // Read From Cache
+      ldpp_dout(dpp, 20) << "D3nDataCache: " << __func__ << "(): READ FROM CACHE: oid=" << read_obj.oid << ", obj-ofs=" << obj_ofs << ", read_ofs=" << read_ofs << ", len=" << len << dendl;
+      auto completed = d->aio->get(obj, rgw::Aio::d3n_cache_op(dpp, d->yield, read_ofs, len, d->rgwrados->d3n_data_cache->cache_location), cost, id);
+      r = d->flush(std::move(completed));
+      if (r < 0) {
+        lsubdout(g_ceph_context, rgw, 0) << "D3nDataCache: " << __func__ << "(): Error: failed to drain/flush, r= " << r << dendl;
+      }
+      return r;
+    } else {
+      // Write To Cache
+      ldpp_dout(dpp, 20) << "D3nDataCache: " << __func__ << "(): WRITE TO CACHE: oid=" << read_obj.oid << ", obj-ofs=" << obj_ofs << ", read_ofs=" << read_ofs << " len=" << len << dendl;
+      auto completed = d->aio->get(obj, rgw::Aio::librados_op(std::move(op), d->yield), cost, id);
+      return d->flush(std::move(completed));
+    }
+  }
+  // unreachable: both branches above return
+  lsubdout(g_ceph_context, rgw, 1) << "D3nDataCache: " << __func__ << "(): Warning: Check head object cache handling flow, oid=" << read_obj.oid << dendl;
+
+  return 0;
+}
+
diff --git a/src/rgw/driver/rados/rgw_data_sync.cc b/src/rgw/driver/rados/rgw_data_sync.cc
new file mode 100644
index 000000000..a5730e51d
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_data_sync.cc
@@ -0,0 +1,6762 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "common/ceph_json.h"
+#include "common/RefCountedObj.h"
+#include "common/WorkQueue.h"
+#include "common/Throttle.h"
+#include "common/errno.h"
+
+#include "rgw_common.h"
+#include "rgw_zone.h"
+#include "rgw_sync.h"
+#include "rgw_data_sync.h"
+#include "rgw_rest_conn.h"
+#include "rgw_cr_rados.h"
+#include "rgw_cr_rest.h"
+#include "rgw_cr_tools.h"
+#include "rgw_http_client.h"
+#include "rgw_bucket.h"
+#include "rgw_bucket_sync.h"
+#include "rgw_bucket_sync_cache.h"
+#include "rgw_datalog.h"
+#include "rgw_metadata.h"
+#include "rgw_sync_counters.h"
+#include "rgw_sync_error_repo.h"
+#include "rgw_sync_module.h"
+#include "rgw_sal.h"
+
+#include "cls/lock/cls_lock_client.h"
+#include "cls/rgw/cls_rgw_client.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_sync_modules.h"
+
+#include "include/common_fwd.h"
+#include "include/random.h"
+
+#include <boost/asio/yield.hpp>
+#include <string_view>
+
+#define dout_subsys ceph_subsys_rgw
+
+#undef dout_prefix
+#define dout_prefix (*_dout << "data sync: ")
+
+using namespace std;
+
+// Object-name prefixes for the sync-status RADOS objects kept in the
+// zone's log pool.
+static const string datalog_sync_status_oid_prefix = "datalog.sync-status";
+static const string datalog_sync_status_shard_prefix = "datalog.sync-status.shard";
+static const string datalog_sync_full_sync_index_prefix = "data.full-sync.index";
+static const string bucket_full_status_oid_prefix = "bucket.full-sync-status";
+static const string bucket_status_oid_prefix = "bucket.sync-status";
+// NOTE(review): intentionally(?) shares its value with
+// bucket_status_oid_prefix — per-object status appears to live under the
+// bucket sync-status namespace; confirm before changing either constant.
+static const string object_status_oid_prefix = "bucket.sync-status";
+
+// Decode a remote datalog-info response. The admin REST API reports the
+// shard count under the JSON key "num_objects".
+void rgw_datalog_info::decode_json(JSONObj *obj) {
+  JSONDecoder::decode_json("num_objects", num_shards, obj);
+}
+
+// Decode one datalog entry. The wire format carries the timestamp as a
+// utime_t; convert it to ceph::real_time for internal use.
+void rgw_datalog_entry::decode_json(JSONObj *obj) {
+  JSONDecoder::decode_json("key", key, obj);
+  utime_t ut;
+  JSONDecoder::decode_json("timestamp", ut, obj);
+  timestamp = ut.to_real_time();
+}
+
+// Decode a datalog shard listing: continuation marker, truncation flag,
+// and the entries themselves.
+// (fix: dropped the stray ';' that followed the function body at
+// namespace scope)
+void rgw_datalog_shard_data::decode_json(JSONObj *obj) {
+  JSONDecoder::decode_json("marker", marker, obj);
+  JSONDecoder::decode_json("truncated", truncated, obj);
+  JSONDecoder::decode_json("entries", entries, obj);
+}
+
+// print a bucket shard with [gen]
+// Formats "tenant/name:id:shard[gen]"; a disengaged gen prints as [0].
+std::string to_string(const rgw_bucket_shard& bs, std::optional<uint64_t> gen)
+{
+  constexpr auto digits10 = std::numeric_limits<uint64_t>::digits10;
+  constexpr auto reserve = 2 + digits10; // [value]
+  // ask get_key() to reserve extra capacity so the appends below don't
+  // reallocate
+  auto str = bs.get_key('/', ':', ':', reserve);
+  str.append(1, '[');
+  str.append(std::to_string(gen.value_or(0)));
+  str.append(1, ']');
+  return str;
+}
+
+// Reads the per-shard rgw_data_sync_marker objects for all shards, at
+// most MAX_CONCURRENT_SHARDS in flight at a time. ENOENT (status not yet
+// written) is treated as success by handle_result().
+class RGWReadDataSyncStatusMarkersCR : public RGWShardCollectCR {
+  static constexpr int MAX_CONCURRENT_SHARDS = 16;
+
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *env;
+  const int num_shards;
+  int shard_id{0}; // next shard to spawn (fix: removed duplicated ';')
+
+  map<uint32_t, rgw_data_sync_marker>& markers; // output, keyed by shard
+  std::vector<RGWObjVersionTracker>& objvs;     // output version trackers
+
+  int handle_result(int r) override {
+    if (r == -ENOENT) { // ENOENT is not a fatal error
+      return 0;
+    }
+    if (r < 0) {
+      ldout(cct, 4) << "failed to read data sync status: "
+          << cpp_strerror(r) << dendl;
+    }
+    return r;
+  }
+ public:
+  RGWReadDataSyncStatusMarkersCR(RGWDataSyncCtx *sc, int num_shards,
+                                 map<uint32_t, rgw_data_sync_marker>& markers,
+                                 std::vector<RGWObjVersionTracker>& objvs)
+    : RGWShardCollectCR(sc->cct, MAX_CONCURRENT_SHARDS),
+      sc(sc), env(sc->env), num_shards(num_shards), markers(markers), objvs(objvs)
+  {}
+  bool spawn_next() override;
+};
+
+// Spawn a read of the next shard's sync marker into markers[shard_id];
+// returns false once every shard has been spawned.
+bool RGWReadDataSyncStatusMarkersCR::spawn_next()
+{
+  if (shard_id >= num_shards) {
+    return false;
+  }
+  using CR = RGWSimpleRadosReadCR<rgw_data_sync_marker>;
+  // operator[] default-constructs the marker slot; 'true' = empty_on_enoent
+  spawn(new CR(env->dpp, env->driver,
+               rgw_raw_obj(env->svc->zone->get_zone_params().log_pool, RGWDataSyncStatusManager::shard_obj_name(sc->source_zone, shard_id)),
+               &markers[shard_id], true, &objvs[shard_id]),
+        false);
+  shard_id++;
+  return true;
+}
+
+// Lists the ".retry" omap keys (error-repo entries) of every data sync
+// shard, up to MAX_CONCURRENT_SHARDS at a time, to find shards that are
+// still recovering. ENOENT means "no retry object yet" and is not fatal.
+class RGWReadDataSyncRecoveringShardsCR : public RGWShardCollectCR {
+  static constexpr int MAX_CONCURRENT_SHARDS = 16;
+
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *env;
+
+  uint64_t max_entries;  // max omap keys to fetch per shard
+  int num_shards;
+  int shard_id{0};       // next shard to spawn
+
+  string marker;         // omap listing start marker (empty: from start)
+  std::vector<RGWRadosGetOmapKeysCR::ResultPtr>& omapkeys;  // per-shard results
+
+  int handle_result(int r) override {
+    if (r == -ENOENT) { // ENOENT is not a fatal error
+      return 0;
+    }
+    if (r < 0) {
+      ldout(cct, 4) << "failed to list recovering data sync: "
+          << cpp_strerror(r) << dendl;
+    }
+    return r;
+  }
+ public:
+  RGWReadDataSyncRecoveringShardsCR(RGWDataSyncCtx *sc, uint64_t _max_entries, int _num_shards,
+                                    std::vector<RGWRadosGetOmapKeysCR::ResultPtr>& omapkeys)
+    : RGWShardCollectCR(sc->cct, MAX_CONCURRENT_SHARDS), sc(sc), env(sc->env),
+      max_entries(_max_entries), num_shards(_num_shards), omapkeys(omapkeys)
+  {}
+  bool spawn_next() override;
+};
+
+// Spawn an omap-keys listing of the next shard's ".retry" object;
+// returns false once every shard has been spawned.
+bool RGWReadDataSyncRecoveringShardsCR::spawn_next()
+{
+  if (shard_id >= num_shards)
+    return false;
+
+  string error_oid = RGWDataSyncStatusManager::shard_obj_name(sc->source_zone, shard_id) + ".retry";
+  auto& shard_keys = omapkeys[shard_id];
+  shard_keys = std::make_shared<RGWRadosGetOmapKeysCR::Result>();
+  spawn(new RGWRadosGetOmapKeysCR(env->driver, rgw_raw_obj(env->svc->zone->get_zone_params().log_pool, error_oid),
+                                  marker, max_entries, shard_keys), false);
+
+  ++shard_id;
+  return true;
+}
+
+// Reads the whole data sync status: the global sync_info object plus one
+// marker object per shard (via RGWReadDataSyncStatusMarkersCR).
+class RGWReadDataSyncStatusCoroutine : public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+  rgw_data_sync_status *sync_status;          // output
+  RGWObjVersionTracker* objv_tracker;         // version of the sync_info object
+  std::vector<RGWObjVersionTracker>& objvs;   // per-shard marker versions
+
+public:
+  RGWReadDataSyncStatusCoroutine(RGWDataSyncCtx *_sc,
+                                 rgw_data_sync_status *_status,
+                                 RGWObjVersionTracker* objv_tracker,
+                                 std::vector<RGWObjVersionTracker>& objvs)
+    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(sc->env), sync_status(_status),
+      objv_tracker(objv_tracker), objvs(objvs)
+  {}
+  int operate(const DoutPrefixProvider *dpp) override;
+};
+
+// Two-step read: sync_info first (ENOENT is fatal here — status must
+// exist), then all shard markers sized from sync_info.num_shards.
+int RGWReadDataSyncStatusCoroutine::operate(const DoutPrefixProvider *dpp)
+{
+  reenter(this) {
+    // read sync info
+    using ReadInfoCR = RGWSimpleRadosReadCR<rgw_data_sync_info>;
+    yield {
+      bool empty_on_enoent = false; // fail on ENOENT
+      call(new ReadInfoCR(dpp, sync_env->driver,
+                          rgw_raw_obj(sync_env->svc->zone->get_zone_params().log_pool, RGWDataSyncStatusManager::sync_status_oid(sc->source_zone)),
+                          &sync_status->sync_info, empty_on_enoent, objv_tracker));
+    }
+    if (retcode < 0) {
+      ldpp_dout(dpp, 4) << "failed to read sync status info with "
+          << cpp_strerror(retcode) << dendl;
+      return set_cr_error(retcode);
+    }
+    // read shard markers
+    objvs.resize(sync_status->sync_info.num_shards);
+    using ReadMarkersCR = RGWReadDataSyncStatusMarkersCR;
+    yield call(new ReadMarkersCR(sc, sync_status->sync_info.num_shards,
+                                 sync_status->sync_markers, objvs));
+    if (retcode < 0) {
+      ldpp_dout(dpp, 4) << "failed to read sync status markers with "
+          << cpp_strerror(retcode) << dendl;
+      return set_cr_error(retcode);
+    }
+    return set_cr_done();
+  }
+  return 0;
+}
+
+// Fetches one remote datalog shard's info (current marker etc.) via the
+// peer zone's /admin/log REST endpoint ("type=data&id=<shard>&info").
+class RGWReadRemoteDataLogShardInfoCR : public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+
+  RGWRESTReadResource *http_op;  // owned; released via put() in dtor
+
+  int shard_id;
+  RGWDataChangesLogInfo *shard_info;  // output
+
+public:
+  RGWReadRemoteDataLogShardInfoCR(RGWDataSyncCtx *_sc,
+                                  int _shard_id, RGWDataChangesLogInfo *_shard_info) : RGWCoroutine(_sc->cct),
+                                                      sc(_sc),
+                                                      sync_env(_sc->env),
+                                                      http_op(NULL),
+                                                      shard_id(_shard_id),
+                                                      shard_info(_shard_info) {
+  }
+
+  ~RGWReadRemoteDataLogShardInfoCR() override {
+    if (http_op) {
+      http_op->put();
+    }
+  }
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      yield {
+        char buf[16];
+        snprintf(buf, sizeof(buf), "%d", shard_id);
+        rgw_http_param_pair pairs[] = { { "type" , "data" },
+	                                { "id", buf },
+	                                { "info" , NULL },
+	                                { NULL, NULL } };
+
+        string p = "/admin/log/";
+
+        http_op = new RGWRESTReadResource(sc->conn, p, pairs, NULL, sync_env->http_manager);
+
+        init_new_io(http_op);
+
+        int ret = http_op->aio_read(dpp);
+        if (ret < 0) {
+          ldpp_dout(dpp, 0) << "ERROR: failed to read from " << p << dendl;
+          log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
+          return set_cr_error(ret);
+        }
+
+        // suspend until the async read completes
+        return io_block(0);
+      }
+      yield {
+        // parse the JSON response into *shard_info
+        int ret = http_op->wait(shard_info, null_yield);
+        if (ret < 0) {
+          return set_cr_error(ret);
+        }
+        return set_cr_done();
+      }
+    }
+    return 0;
+  }
+};
+
+// JSON shape of a remote datalog listing: next marker, truncation flag,
+// and the entries (with extra-info) themselves.
+struct read_remote_data_log_response {
+  string marker;
+  bool truncated;
+  vector<rgw_data_change_log_entry> entries;
+
+  read_remote_data_log_response() : truncated(false) {}
+
+  void decode_json(JSONObj *obj) {
+    JSONDecoder::decode_json("marker", marker, obj);
+    JSONDecoder::decode_json("truncated", truncated, obj);
+    JSONDecoder::decode_json("entries", entries, obj);
+  };
+};
+
+// Lists one remote datalog shard from a given marker, returning the
+// entries, the next marker, and whether the listing was truncated.
+// Optionally times the poll via the sync perf counters.
+class RGWReadRemoteDataLogShardCR : public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+
+  RGWRESTReadResource *http_op = nullptr;  // owned; released via put() in dtor
+
+  int shard_id;
+  const std::string& marker;     // start marker (caller-owned)
+  string *pnext_marker;          // output: continuation marker
+  vector<rgw_data_change_log_entry> *entries;  // output
+  bool *truncated;               // output
+
+  read_remote_data_log_response response;
+  std::optional<TOPNSPC::common::PerfGuard> timer;  // measures poll latency
+
+public:
+  RGWReadRemoteDataLogShardCR(RGWDataSyncCtx *_sc, int _shard_id,
+                              const std::string& marker, string *pnext_marker,
+                              vector<rgw_data_change_log_entry> *_entries,
+                              bool *_truncated)
+    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
+      shard_id(_shard_id), marker(marker), pnext_marker(pnext_marker),
+      entries(_entries), truncated(_truncated) {
+  }
+  ~RGWReadRemoteDataLogShardCR() override {
+    if (http_op) {
+      http_op->put();
+    }
+  }
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      yield {
+        char buf[16];
+        snprintf(buf, sizeof(buf), "%d", shard_id);
+        rgw_http_param_pair pairs[] = { { "type" , "data" },
+	                                { "id", buf },
+	                                { "marker", marker.c_str() },
+	                                { "extra-info", "true" },
+	                                { NULL, NULL } };
+
+        string p = "/admin/log/";
+
+        http_op = new RGWRESTReadResource(sc->conn, p, pairs, NULL, sync_env->http_manager);
+
+        init_new_io(http_op);
+
+        if (sync_env->counters) {
+          timer.emplace(sync_env->counters, sync_counters::l_poll);
+        }
+        int ret = http_op->aio_read(dpp);
+        if (ret < 0) {
+          ldpp_dout(dpp, 0) << "ERROR: failed to read from " << p << dendl;
+          log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
+          if (sync_env->counters) {
+            sync_env->counters->inc(sync_counters::l_poll_err);
+          }
+          return set_cr_error(ret);
+        }
+
+        return io_block(0);
+      }
+      yield {
+        timer.reset();  // stop the poll timer before parsing
+        int ret = http_op->wait(&response, null_yield);
+        if (ret < 0) {
+          // ENOENT is expected for missing shards; don't count it as error
+          if (sync_env->counters && ret != -ENOENT) {
+            sync_env->counters->inc(sync_counters::l_poll_err);
+          }
+          return set_cr_error(ret);
+        }
+        entries->clear();
+        entries->swap(response.entries);  // hand entries over without copying
+        *pnext_marker = response.marker;
+        *truncated = response.truncated;
+        return set_cr_done();
+      }
+    }
+    return 0;
+  }
+};
+
+// Fans out RGWReadRemoteDataLogShardInfoCR over all shards, at most
+// READ_DATALOG_MAX_CONCURRENT in flight; fills *datalog_info per shard.
+class RGWReadRemoteDataLogInfoCR : public RGWShardCollectCR {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+
+  int num_shards;
+  map<int, RGWDataChangesLogInfo> *datalog_info;  // output, keyed by shard
+
+  int shard_id;
+// NOTE(review): also #defined (identically) further down the file —
+// harmless identical redefinition, but consider a shared constant.
+#define READ_DATALOG_MAX_CONCURRENT 10
+
+  int handle_result(int r) override {
+    if (r == -ENOENT) { // ENOENT is not a fatal error
+      return 0;
+    }
+    if (r < 0) {
+      ldout(cct, 4) << "failed to fetch remote datalog info: "
+          << cpp_strerror(r) << dendl;
+    }
+    return r;
+  }
+public:
+  RGWReadRemoteDataLogInfoCR(RGWDataSyncCtx *_sc,
+                     int _num_shards,
+                     map<int, RGWDataChangesLogInfo> *_datalog_info) : RGWShardCollectCR(_sc->cct, READ_DATALOG_MAX_CONCURRENT),
+                                                                 sc(_sc), sync_env(_sc->env), num_shards(_num_shards),
+                                                                 datalog_info(_datalog_info), shard_id(0) {}
+  bool spawn_next() override;
+};
+
+// Spawn an info fetch for the next shard; false once all are spawned.
+bool RGWReadRemoteDataLogInfoCR::spawn_next() {
+  if (shard_id >= num_shards) {
+    return false;
+  }
+  spawn(new RGWReadRemoteDataLogShardInfoCR(sc, shard_id, &(*datalog_info)[shard_id]), false);
+  shard_id++;
+  return true;
+}
+
+// Simple (one-shot) coroutine that lists a single remote datalog shard
+// from a marker, with a max-entries cap, into *result.
+class RGWListRemoteDataLogShardCR : public RGWSimpleCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+  RGWRESTReadResource *http_op;  // released via put() in both paths below
+
+  int shard_id;
+  string marker;
+  uint32_t max_entries;
+  rgw_datalog_shard_data *result;  // output
+
+public:
+  RGWListRemoteDataLogShardCR(RGWDataSyncCtx *sc, int _shard_id,
+                              const string& _marker, uint32_t _max_entries,
+                              rgw_datalog_shard_data *_result)
+    : RGWSimpleCoroutine(sc->cct), sc(sc), sync_env(sc->env), http_op(NULL),
+      shard_id(_shard_id), marker(_marker), max_entries(_max_entries), result(_result) {}
+
+  int send_request(const DoutPrefixProvider *dpp) override {
+    RGWRESTConn *conn = sc->conn;
+
+    char buf[32];
+    snprintf(buf, sizeof(buf), "%d", shard_id);
+
+    char max_entries_buf[32];
+    snprintf(max_entries_buf, sizeof(max_entries_buf), "%d", (int)max_entries);
+
+    // an empty key makes the pair a no-op when no marker was given
+    const char *marker_key = (marker.empty() ? "" : "marker");
+
+    rgw_http_param_pair pairs[] = { { "type", "data" },
+      { "id", buf },
+      { "max-entries", max_entries_buf },
+      { marker_key, marker.c_str() },
+      { NULL, NULL } };
+
+    string p = "/admin/log/";
+
+    http_op = new RGWRESTReadResource(conn, p, pairs, NULL, sync_env->http_manager);
+    init_new_io(http_op);
+
+    int ret = http_op->aio_read(dpp);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to read from " << p << dendl;
+      log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
+      http_op->put();
+      return ret;
+    }
+
+    return 0;
+  }
+
+  int request_complete() override {
+    int ret = http_op->wait(result, null_yield);
+    http_op->put();
+    if (ret < 0 && ret != -ENOENT) {  // missing shard is not an error
+      ldpp_dout(sync_env->dpp, 0) << "ERROR: failed to list remote datalog shard, ret=" << ret << dendl;
+      return ret;
+    }
+    return 0;
+  }
+};
+
+// Fans out RGWListRemoteDataLogShardCR over a caller-supplied map of
+// shard-id -> start-marker (moved in via swap), capped per shard.
+class RGWListRemoteDataLogCR : public RGWShardCollectCR {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+
+  map<int, string> shards;  // shard-id -> marker; swapped from caller
+  int max_entries_per_shard;
+  map<int, rgw_datalog_shard_data> *result;  // output, keyed by shard
+
+  map<int, string>::iterator iter;
+#define READ_DATALOG_MAX_CONCURRENT 10
+
+  int handle_result(int r) override {
+    if (r == -ENOENT) { // ENOENT is not a fatal error
+      return 0;
+    }
+    if (r < 0) {
+      ldout(cct, 4) << "failed to list remote datalog: "
+          << cpp_strerror(r) << dendl;
+    }
+    return r;
+  }
+public:
+  RGWListRemoteDataLogCR(RGWDataSyncCtx *_sc,
+                     map<int, string>& _shards,
+                     int _max_entries_per_shard,
+                     map<int, rgw_datalog_shard_data> *_result) : RGWShardCollectCR(_sc->cct, READ_DATALOG_MAX_CONCURRENT),
+                                                                 sc(_sc), sync_env(_sc->env), max_entries_per_shard(_max_entries_per_shard),
+                                                                 result(_result) {
+    shards.swap(_shards);  // take ownership of the marker map
+    iter = shards.begin();
+  }
+  bool spawn_next() override;
+};
+
+// Spawn a listing for the next (shard, marker) pair; false at map end.
+bool RGWListRemoteDataLogCR::spawn_next() {
+  if (iter == shards.end()) {
+    return false;
+  }
+
+  spawn(new RGWListRemoteDataLogShardCR(sc, iter->first, iter->second, max_entries_per_shard, &(*result)[iter->first]), false);
+  ++iter;
+  return true;
+}
+
+// Initializes the data sync status objects under an already-held lease:
+// writes sync_info, snapshots every remote shard's current log position
+// into per-shard markers, then advances the state to
+// StateBuildingFullSyncMaps. The caller owns the lease (see
+// continuous_lease_cr()); this CR only checks it is still locked.
+class RGWInitDataSyncStatusCoroutine : public RGWCoroutine {
+  static constexpr auto lock_name{ "sync_lock"sv };
+  RGWDataSyncCtx* const sc;
+  RGWDataSyncEnv* const sync_env{ sc->env };
+  const uint32_t num_shards;
+  rgw_data_sync_status* const status;  // in/out: sync_info + markers
+  RGWSyncTraceNodeRef tn;
+  boost::intrusive_ptr<RGWContinuousLeaseCR> lease_cr;  // caller's lease
+  RGWObjVersionTracker& objv_tracker;         // version of sync_info object
+  std::vector<RGWObjVersionTracker>& objvs;   // per-shard marker versions
+
+  const rgw_pool& pool{ sync_env->svc->zone->get_zone_params().log_pool };
+  const string sync_status_oid{
+    RGWDataSyncStatusManager::sync_status_oid(sc->source_zone) };
+
+  map<int, RGWDataChangesLogInfo> shards_info;  // remote shard positions
+
+
+public:
+  RGWInitDataSyncStatusCoroutine(
+    RGWDataSyncCtx* _sc, uint32_t num_shards, uint64_t instance_id,
+    const RGWSyncTraceNodeRef& tn_parent, rgw_data_sync_status* status,
+    boost::intrusive_ptr<RGWContinuousLeaseCR> lease_cr,
+    RGWObjVersionTracker& objv_tracker,
+    std::vector<RGWObjVersionTracker>& objvs)
+    : RGWCoroutine(_sc->cct), sc(_sc), num_shards(num_shards), status(status),
+      tn(sync_env->sync_tracer->add_node(tn_parent, "init_data_sync_status")),
+      lease_cr(std::move(lease_cr)), objv_tracker(objv_tracker), objvs(objvs) {
+    status->sync_info.instance_id = instance_id;
+  }
+
+  // Factory for the lease CR guarding the sync-status object; shared so
+  // callers take the same lock this CR expects to hold.
+  static auto continuous_lease_cr(RGWDataSyncCtx* const sc,
+                                  RGWCoroutine* const caller) {
+    auto lock_duration = sc->cct->_conf->rgw_sync_lease_period;
+    return new RGWContinuousLeaseCR(
+      sc->env->async_rados, sc->env->driver,
+      { sc->env->svc->zone->get_zone_params().log_pool,
+        RGWDataSyncStatusManager::sync_status_oid(sc->source_zone) },
+      string(lock_name), lock_duration, caller, &sc->lcc);
+  }
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    int ret;
+    reenter(this) {
+      if (!lease_cr->is_locked()) {
+        drain_all();
+        return set_cr_error(-ECANCELED);
+      }
+
+      using WriteInfoCR = RGWSimpleRadosWriteCR<rgw_data_sync_info>;
+      yield call(new WriteInfoCR(dpp, sync_env->driver,
+                                 rgw_raw_obj{pool, sync_status_oid},
+                                 status->sync_info, &objv_tracker));
+      if (retcode < 0) {
+        tn->log(0, SSTR("ERROR: failed to write sync status info with " << retcode));
+        return set_cr_error(retcode);
+      }
+
+      // In the original code we reacquired the lock. Since
+      // RGWSimpleRadosWriteCR doesn't appear to touch the attributes
+      // and cls_version works across it, this should be unnecessary.
+      // Putting a note here just in case. If we see ECANCELED where
+      // we expect EBUSY, we can revisit this.
+
+      /* fetch current position in logs */
+      yield {
+        RGWRESTConn *conn = sync_env->svc->zone->get_zone_conn(sc->source_zone);
+        if (!conn) {
+          tn->log(0, SSTR("ERROR: connection to zone " << sc->source_zone << " does not exist!"));
+          return set_cr_error(-EIO);
+        }
+        for (uint32_t i = 0; i < num_shards; i++) {
+          spawn(new RGWReadRemoteDataLogShardInfoCR(sc, i, &shards_info[i]), true);
+        }
+      }
+      // wait for all shard-info fetches; any failure aborts the init
+      while (collect(&ret, NULL)) {
+        if (ret < 0) {
+          tn->log(0, SSTR("ERROR: failed to read remote data log shards"));
+          return set_state(RGWCoroutine_Error);
+        }
+        yield;
+      }
+      yield {
+        objvs.resize(num_shards);
+        for (uint32_t i = 0; i < num_shards; i++) {
+          RGWDataChangesLogInfo& info = shards_info[i];
+          auto& marker = status->sync_markers[i];
+          // incremental sync will start from the position observed now
+          marker.next_step_marker = info.marker;
+          marker.timestamp = info.last_update;
+          const auto& oid = RGWDataSyncStatusManager::shard_obj_name(sc->source_zone, i);
+          auto& objv = objvs[i];
+          objv.generate_new_write_ver(cct);
+          using WriteMarkerCR = RGWSimpleRadosWriteCR<rgw_data_sync_marker>;
+          spawn(new WriteMarkerCR(dpp, sync_env->driver,
+                                  rgw_raw_obj{pool, oid}, marker, &objv), true);
+        }
+      }
+      while (collect(&ret, NULL)) {
+        if (ret < 0) {
+          tn->log(0, SSTR("ERROR: failed to write data sync status markers"));
+          return set_state(RGWCoroutine_Error);
+        }
+        yield;
+      }
+
+      status->sync_info.state = rgw_data_sync_info::StateBuildingFullSyncMaps;
+      yield call(new WriteInfoCR(dpp, sync_env->driver,
+                                 rgw_raw_obj{pool, sync_status_oid},
+                                 status->sync_info, &objv_tracker));
+      if (retcode < 0) {
+        tn->log(0, SSTR("ERROR: failed to write sync status info with " << retcode));
+        return set_cr_error(retcode);
+      }
+      return set_cr_done();
+    }
+    return 0;
+  }
+};
+
+// Constructs the remote-datalog manager; the coroutines manager base is
+// initialized with the store's shared CR registry, and the HTTP manager
+// is bound to this manager's completion manager.
+RGWRemoteDataLog::RGWRemoteDataLog(const DoutPrefixProvider *dpp,
+                                   rgw::sal::RadosStore* driver,
+                                   RGWAsyncRadosProcessor *async_rados)
+  : RGWCoroutinesManager(driver->ctx(), driver->getRados()->get_cr_registry()),
+    dpp(dpp), driver(driver),
+    cct(driver->ctx()), cr_registry(driver->getRados()->get_cr_registry()),
+    async_rados(async_rados),
+    http_manager(driver->ctx(), completion_mgr),
+    data_sync_cr(NULL),
+    initialized(false)
+{
+}
+
+// Synchronously fetch the remote datalog's global info (shard count)
+// from the peer's /admin/log endpoint.
+int RGWRemoteDataLog::read_log_info(const DoutPrefixProvider *dpp, rgw_datalog_info *log_info)
+{
+  rgw_http_param_pair pairs[] = { { "type", "data" },
+                                  { NULL, NULL } };
+
+  int ret = sc.conn->get_json_resource(dpp, "/admin/log", pairs, null_yield, *log_info);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to fetch datalog info" << dendl;
+    return ret;
+  }
+
+  ldpp_dout(dpp, 20) << "remote datalog, num_shards=" << log_info->num_shards << dendl;
+
+  return 0;
+}
+
+// Fetch per-shard info for every remote datalog shard: first query the
+// shard count, then fan out info requests via a collect CR.
+int RGWRemoteDataLog::read_source_log_shards_info(const DoutPrefixProvider *dpp, map<int, RGWDataChangesLogInfo> *shards_info)
+{
+  rgw_datalog_info log_info;
+  int ret = read_log_info(dpp, &log_info);
+  if (ret < 0) {
+    return ret;
+  }
+
+  return run(dpp, new RGWReadRemoteDataLogInfoCR(&sc, log_info.num_shards, shards_info));
+}
+
+// List one entry past each given shard marker (max-entries=1), used to
+// peek whether shards have new changes.
+// NOTE(review): shard_markers is taken by value — the copy is then
+// swapped away inside RGWListRemoteDataLogCR; changing to const& would
+// need a header change, so leaving as-is.
+int RGWRemoteDataLog::read_source_log_shards_next(const DoutPrefixProvider *dpp, map<int, string> shard_markers, map<int, rgw_datalog_shard_data> *result)
+{
+  return run(dpp, new RGWListRemoteDataLogCR(&sc, shard_markers, 1, result));
+}
+
+// Bind the sync environment/context to a source zone and connection and
+// start the HTTP manager. Re-binding is allowed; the HTTP manager and
+// trace node are only set up on the first call (guarded by 'initialized').
+int RGWRemoteDataLog::init(const rgw_zone_id& _source_zone, RGWRESTConn *_conn, RGWSyncErrorLogger *_error_logger,
+                           RGWSyncTraceManager *_sync_tracer, RGWSyncModuleInstanceRef& _sync_module,
+                           PerfCounters* counters)
+{
+  sync_env.init(dpp, cct, driver, driver->svc(), async_rados, &http_manager, _error_logger,
+                _sync_tracer, _sync_module, counters);
+  sc.init(&sync_env, _conn, _source_zone);
+
+  if (initialized) {
+    return 0;
+  }
+
+  int ret = http_manager.start();
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl;
+    return ret;
+  }
+
+  tn = sync_env.sync_tracer->add_node(sync_env.sync_tracer->root_node, "data");
+
+  initialized = true;
+
+  return 0;
+}
+
+// Shut down: stop the coroutines manager (inherited stop()).
+void RGWRemoteDataLog::finish()
+{
+  stop();
+}
+
+// Read the full data sync status synchronously. Uses a private
+// coroutines manager + HTTP manager because this may be called while
+// run_sync() owns the shared one.
+int RGWRemoteDataLog::read_sync_status(const DoutPrefixProvider *dpp, rgw_data_sync_status *sync_status)
+{
+  // cannot run concurrently with run_sync(), so run in a separate manager
+  RGWObjVersionTracker objv;
+  std::vector<RGWObjVersionTracker> shard_objvs;
+  RGWCoroutinesManager crs(cct, cr_registry);
+  RGWHTTPManager http_manager(cct, crs.get_completion_mgr());
+  int ret = http_manager.start();
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl;
+    return ret;
+  }
+  // shallow copies of env/ctx rebound to the local http manager
+  RGWDataSyncEnv sync_env_local = sync_env;
+  sync_env_local.http_manager = &http_manager;
+
+  RGWDataSyncCtx sc_local = sc;
+  sc_local.env = &sync_env_local;
+
+  ret = crs.run(dpp, new RGWReadDataSyncStatusCoroutine(&sc_local, sync_status,
+                                                        &objv, shard_objvs));
+  http_manager.stop();
+  return ret;
+}
+
+// Determine which shards still have retry (error-repo) entries: list at
+// most one omap key per shard; any non-empty listing marks the shard as
+// recovering. Runs in a private manager for the same reason as
+// read_sync_status().
+int RGWRemoteDataLog::read_recovering_shards(const DoutPrefixProvider *dpp, const int num_shards, set<int>& recovering_shards)
+{
+  // cannot run concurrently with run_sync(), so run in a separate manager
+  RGWCoroutinesManager crs(cct, cr_registry);
+  RGWHTTPManager http_manager(cct, crs.get_completion_mgr());
+  int ret = http_manager.start();
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl;
+    return ret;
+  }
+  RGWDataSyncEnv sync_env_local = sync_env;
+  sync_env_local.http_manager = &http_manager;
+
+  RGWDataSyncCtx sc_local = sc;
+  sc_local.env = &sync_env_local;
+
+  std::vector<RGWRadosGetOmapKeysCR::ResultPtr> omapkeys;
+  omapkeys.resize(num_shards);
+  uint64_t max_entries{1};  // existence check only — one key is enough
+
+  ret = crs.run(dpp, new RGWReadDataSyncRecoveringShardsCR(&sc_local, max_entries, num_shards, omapkeys));
+  http_manager.stop();
+
+  if (ret == 0) {
+    for (int i = 0; i < num_shards; i++) {
+      if (omapkeys[i]->entries.size() != 0) {
+        recovering_shards.insert(i);
+      }
+    }
+  }
+
+  return ret;
+}
+
+namespace RGWRDL {
+// Takes the sync-status lease, runs RGWInitDataSyncStatusCoroutine under
+// it, then releases the lease and propagates the inner result.
+class DataSyncInitCR : public RGWCoroutine {
+  RGWDataSyncCtx* const sc;
+  const uint32_t num_shards;
+  uint64_t instance_id;
+  const RGWSyncTraceNodeRef& tn;
+  rgw_data_sync_status* const sync_status;    // filled by the inner CR
+  std::vector<RGWObjVersionTracker>& objvs;   // per-shard marker versions
+
+  boost::intrusive_ptr<RGWContinuousLeaseCR> lease_cr;
+
+  RGWObjVersionTracker objv_tracker;  // version of the sync_info object
+
+public:
+
+  DataSyncInitCR(RGWDataSyncCtx* sc, uint32_t num_shards, uint64_t instance_id,
+		 const RGWSyncTraceNodeRef& tn,
+		 rgw_data_sync_status* sync_status,
+		 std::vector<RGWObjVersionTracker>& objvs)
+    : RGWCoroutine(sc->cct), sc(sc), num_shards(num_shards),
+      instance_id(instance_id), tn(tn),
+      sync_status(sync_status), objvs(objvs) {}
+
+  ~DataSyncInitCR() override {
+    // make sure the lease renewal loop stops if we are torn down early
+    if (lease_cr) {
+      lease_cr->abort();
+    }
+  }
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      lease_cr.reset(
+	RGWInitDataSyncStatusCoroutine::continuous_lease_cr(sc, this));
+
+      yield spawn(lease_cr.get(), false);
+      // wait until the lease is held (or the lease CR gave up)
+      while (!lease_cr->is_locked()) {
+	if (lease_cr->is_done()) {
+	  tn->log(5, "ERROR: failed to take data sync status lease");
+	  set_status("lease lock failed, early abort");
+	  drain_all();
+	  return set_cr_error(lease_cr->get_ret_status());
+	}
+	tn->log(5, "waiting on data sync status lease");
+	yield set_sleeping(true);
+      }
+      tn->log(5, "acquired data sync status lease");
+      objv_tracker.generate_new_write_ver(sc->cct);
+      yield call(new RGWInitDataSyncStatusCoroutine(sc, num_shards, instance_id,
+						    tn, sync_status, lease_cr,
+						    objv_tracker, objvs));
+      lease_cr->go_down();
+      lease_cr.reset();
+      drain_all();
+      if (retcode < 0) {
+	// fix: must return here — falling through to set_cr_done() would
+	// overwrite the error state and report success to the caller
+	return set_cr_error(retcode);
+      }
+      return set_cr_done();
+    }
+    return 0;
+  }
+};
+}
+
+// Create/reset the data sync status for num_shards shards, stamping a
+// fresh random instance id. Runs DataSyncInitCR in a private manager.
+int RGWRemoteDataLog::init_sync_status(const DoutPrefixProvider *dpp, int num_shards)
+{
+  rgw_data_sync_status sync_status;
+  std::vector<RGWObjVersionTracker> objvs;
+  sync_status.sync_info.num_shards = num_shards;
+
+  RGWCoroutinesManager crs(cct, cr_registry);
+  RGWHTTPManager http_manager(cct, crs.get_completion_mgr());
+  int ret = http_manager.start();
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl;
+    return ret;
+  }
+  RGWDataSyncEnv sync_env_local = sync_env;
+  sync_env_local.http_manager = &http_manager;
+  auto instance_id = ceph::util::generate_random_number<uint64_t>();
+  RGWDataSyncCtx sc_local = sc;
+  sc_local.env = &sync_env_local;
+  ret = crs.run(dpp, new RGWRDL::DataSyncInitCR(&sc_local, num_shards,
+						instance_id, tn, &sync_status, objvs));
+  http_manager.stop();
+  return ret;
+}
+
+// Build the oid "<prefix>.<zone-id>.<shard_id>" for a full-sync index
+// shard.
+// (fix: the old version used a runtime-sized char array — a VLA, which
+// is a non-standard extension in C++ — plus snprintf; plain string
+// concatenation produces the identical result.)
+static string full_data_sync_index_shard_oid(const rgw_zone_id& source_zone, int shard_id)
+{
+  return datalog_sync_full_sync_index_prefix + "." + source_zone.id + "." +
+      std::to_string(shard_id);
+}
+
+// JSON shape of a metadata-list response from /admin/metadata: paging
+// marker, truncation flag, the keys, and a count.
+struct read_metadata_list {
+  string marker;
+  bool truncated;
+  list<string> keys;
+  int count;
+
+  read_metadata_list() : truncated(false), count(0) {}
+
+  void decode_json(JSONObj *obj) {
+    JSONDecoder::decode_json("marker", marker, obj);
+    JSONDecoder::decode_json("truncated", truncated, obj);
+    JSONDecoder::decode_json("keys", keys, obj);
+    JSONDecoder::decode_json("count", count, obj);
+  }
+};
+
+// JSON shape of a single bucket.instance metadata entry fetched from
+// the remote /admin/metadata endpoint.
+struct bucket_instance_meta_info {
+  string key;
+  obj_version ver;
+  utime_t mtime;
+  RGWBucketInstanceMetadataObject data;
+
+  bucket_instance_meta_info() {}
+
+  void decode_json(JSONObj *obj) {
+    JSONDecoder::decode_json("key", key, obj);
+    JSONDecoder::decode_json("ver", ver, obj);
+    JSONDecoder::decode_json("mtime", mtime, obj);
+    JSONDecoder::decode_json("data", data, obj);
+  }
+};
+
+// Fetches a remote bucket's index-log marker info via
+// /admin/log?type=bucket-index&bucket-instance=<key>&info.
+class RGWReadRemoteBucketIndexLogInfoCR : public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+  const string instance_key;  // bucket key used as the bucket-instance param
+
+  rgw_bucket_index_marker_info *info;  // output
+
+public:
+  RGWReadRemoteBucketIndexLogInfoCR(RGWDataSyncCtx *_sc,
+                                    const rgw_bucket& bucket,
+                                    rgw_bucket_index_marker_info *_info)
+    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
+      instance_key(bucket.get_key()), info(_info) {}
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      yield {
+        rgw_http_param_pair pairs[] = { { "type" , "bucket-index" },
+	                                { "bucket-instance", instance_key.c_str() },
+					{ "info" , NULL },
+	                                { NULL, NULL } };
+
+        string p = "/admin/log/";
+        call(new RGWReadRESTResourceCR<rgw_bucket_index_marker_info>(sync_env->cct, sc->conn, sync_env->http_manager, p, pairs, info));
+      }
+      if (retcode < 0) {
+        return set_cr_error(retcode);
+      }
+
+      return set_cr_done();
+    }
+    return 0;
+  }
+};
+
+
+// Builds the full-sync index: pages through the remote bucket.instance
+// metadata section, registers one entry per bucket in the sharded omap
+// index, then persists per-shard totals into the sync markers.
+class RGWListBucketIndexesCR : public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env = sc->env;
+
+  rgw::sal::RadosStore* driver = sync_env->driver;
+
+  rgw_data_sync_status *sync_status;          // markers get total_entries
+  std::vector<RGWObjVersionTracker>& objvs;   // per-shard marker versions
+
+  int req_ret = 0;  // first error seen while collecting marker writes
+  int ret = 0;
+
+  list<string>::iterator iter;
+
+  unique_ptr<RGWShardedOmapCRManager> entries_index;
+  string oid_prefix =
+    datalog_sync_full_sync_index_prefix + "." + sc->source_zone.id;
+
+  string path = "/admin/metadata/bucket.instance";
+  bucket_instance_meta_info meta_info;
+  string key;
+
+  bool failed = false;     // set when the omap index fails to finish
+  bool truncated = false;
+  read_metadata_list result;
+
+public:
+  RGWListBucketIndexesCR(RGWDataSyncCtx* sc,
+                         rgw_data_sync_status* sync_status, std::vector<RGWObjVersionTracker>& objvs)
+    : RGWCoroutine(sc->cct), sc(sc), sync_status(sync_status), objvs(objvs) {}
+  ~RGWListBucketIndexesCR() override { }
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      entries_index = std::make_unique<RGWShardedOmapCRManager>(
+	sync_env->async_rados, driver, this,
+	cct->_conf->rgw_data_log_num_shards,
+	sync_env->svc->zone->get_zone_params().log_pool,
+	oid_prefix);
+      yield; // yield so OmapAppendCRs can start
+
+      do {
+	// page through the remote metadata listing, 1000 keys at a time
+	yield {
+	  string entrypoint = "/admin/metadata/bucket.instance"s;
+
+	  rgw_http_param_pair pairs[] = {{"max-entries", "1000"},
+					 {"marker", result.marker.c_str()},
+					 {NULL, NULL}};
+
+	  call(new RGWReadRESTResourceCR<read_metadata_list>(
+		 sync_env->cct, sc->conn, sync_env->http_manager,
+		 entrypoint, pairs, &result));
+	}
+	if (retcode < 0) {
+	  ldpp_dout(dpp, 0)
+	    << "ERROR: failed to fetch metadata for section bucket.instance"
+	    << dendl;
+	  return set_cr_error(retcode);
+	}
+
+	for (iter = result.keys.begin(); iter != result.keys.end(); ++iter) {
+	  ldpp_dout(dpp, 20) << "list metadata: section=bucket.instance key="
+			     << *iter << dendl;
+	  key = *iter;
+
+	  yield {
+	    rgw_http_param_pair pairs[] = {{"key", key.c_str()},
+					   {NULL, NULL}};
+
+	    call(new RGWReadRESTResourceCR<bucket_instance_meta_info>(
+		   sync_env->cct, sc->conn, sync_env->http_manager, path, pairs,
+		   &meta_info));
+	  }
+	  if (retcode < 0) {
+	    ldpp_dout(dpp, 0) << "ERROR: failed to fetch metadata for key: "
+			      << key << dendl;
+	    return set_cr_error(retcode);
+	  }
+	  // Now that bucket full sync is bucket-wide instead of
+	  // per-shard, we only need to register a single shard of
+	  // each bucket to guarantee that sync will see everything
+	  // that happened before data full sync starts. This also
+	  // means we don't have to care about the bucket's current
+	  // shard count.
+	  yield entries_index->append(
+	    fmt::format("{}:{}", key, 0),
+	    sync_env->svc->datalog_rados->get_log_shard_id(
+	      meta_info.data.get_bucket_info().bucket, 0));
+	}
+	truncated = result.truncated;
+      } while (truncated);
+
+      yield {
+	if (!entries_index->finish()) {
+	  failed = true;
+	}
+      }
+      if (!failed) {
+	// persist per-shard totals into the sync markers
+	for (auto iter = sync_status->sync_markers.begin();
+	     iter != sync_status->sync_markers.end();
+	     ++iter) {
+	  int shard_id = (int)iter->first;
+	  rgw_data_sync_marker& marker = iter->second;
+	  marker.total_entries = entries_index->get_total_entries(shard_id);
+	  spawn(new RGWSimpleRadosWriteCR<rgw_data_sync_marker>(
+		  dpp, sync_env->driver,
+		  rgw_raw_obj(sync_env->svc->zone->get_zone_params().log_pool,
+			      RGWDataSyncStatusManager::shard_obj_name(
+				sc->source_zone, shard_id)),
+		  marker, &objvs[shard_id]),
+		true);
+	}
+      } else {
+	yield call(sync_env->error_logger->log_error_cr(
+		     dpp, sc->conn->get_remote_id(), "data.init", "",
+		     EIO, string("failed to build bucket instances map")));
+      }
+      // collect the spawned marker writes; remember the first failure
+      while (collect(&ret, NULL)) {
+	if (ret < 0) {
+	  yield call(sync_env->error_logger->log_error_cr(
+		       dpp, sc->conn->get_remote_id(), "data.init", "",
+		       -ret, string("failed to driver sync status: ") +
+		       cpp_strerror(-ret)));
+	  req_ret = ret;
+	}
+	yield;
+      }
+      drain_all();
+      if (req_ret < 0) {
+	yield return set_cr_error(req_ret);
+      }
+      yield return set_cr_done();
+    }
+    return 0;
+  }
+};
+
+#define DATA_SYNC_UPDATE_MARKER_WINDOW 1
+
+// Marker tracker for a data sync shard: batches marker updates (window
+// of DATA_SYNC_UPDATE_MARKER_WINDOW) and persists them to the shard's
+// status object with cls_version protection.
+class RGWDataSyncShardMarkerTrack : public RGWSyncShardMarkerTrack<string, string> {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+  string marker_oid;              // shard status object to update
+  rgw_data_sync_marker sync_marker;
+  RGWSyncTraceNodeRef tn;
+  RGWObjVersionTracker& objv;
+
+public:
+  RGWDataSyncShardMarkerTrack(RGWDataSyncCtx *_sc,
+                              const string& _marker_oid,
+                              const rgw_data_sync_marker& _marker,
+                              RGWSyncTraceNodeRef& _tn, RGWObjVersionTracker& objv) : RGWSyncShardMarkerTrack(DATA_SYNC_UPDATE_MARKER_WINDOW),
+                                                                sc(_sc), sync_env(_sc->env),
+                                                                marker_oid(_marker_oid),
+                                                                sync_marker(_marker),
+                                                                tn(_tn), objv(objv) {}
+
+  RGWCoroutine* store_marker(const string& new_marker, uint64_t index_pos, const real_time& timestamp) override {
+    sync_marker.marker = new_marker;
+    sync_marker.pos = index_pos;
+    sync_marker.timestamp = timestamp;
+
+    tn->log(20, SSTR("updating marker marker_oid=" << marker_oid << " marker=" << new_marker));
+
+    return new RGWSimpleRadosWriteCR<rgw_data_sync_marker>(sync_env->dpp, sync_env->driver,
+                                                           rgw_raw_obj(sync_env->svc->zone->get_zone_params().log_pool, marker_oid),
+                                                           sync_marker, &objv);
+  }
+
+  // last marker-update wins when several complete out of order
+  RGWOrderCallCR *allocate_order_control_cr() override {
+    return new RGWLastCallerWinsCR(sync_env->cct);
+  }
+};
+
+// ostream wrappers to print buckets without copying strings
+struct bucket_str {
+ const rgw_bucket& b;
+ explicit bucket_str(const rgw_bucket& b) : b(b) {}
+};
+std::ostream& operator<<(std::ostream& out, const bucket_str& rhs) {
+ auto& b = rhs.b;
+ if (!b.tenant.empty()) {
+ out << b.tenant << '/';
+ }
+ out << b.name;
+ if (!b.bucket_id.empty()) {
+ out << ':' << b.bucket_id;
+ }
+ return out;
+}
+
+struct bucket_str_noinstance {
+ const rgw_bucket& b;
+ explicit bucket_str_noinstance(const rgw_bucket& b) : b(b) {}
+};
+std::ostream& operator<<(std::ostream& out, const bucket_str_noinstance& rhs) {
+ auto& b = rhs.b;
+ if (!b.tenant.empty()) {
+ out << b.tenant << '/';
+ }
+ out << b.name;
+ return out;
+}
+
+struct bucket_shard_str {
+ const rgw_bucket_shard& bs;
+ explicit bucket_shard_str(const rgw_bucket_shard& bs) : bs(bs) {}
+};
+std::ostream& operator<<(std::ostream& out, const bucket_shard_str& rhs) {
+ auto& bs = rhs.bs;
+ out << bucket_str{bs.bucket};
+ if (bs.shard_id >= 0) {
+ out << ':' << bs.shard_id;
+ }
+ return out;
+}
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<bucket_shard_str> : fmt::ostream_formatter {};
+#endif
+
// Bucket metadata together with the attrs stored alongside it.
struct all_bucket_info {
  RGWBucketInfo bucket_info;
  map<string, bufferlist> attrs;
};
+
// One endpoint (source or destination) of a sync pipe: a zone plus the
// bucket it refers to. Full bucket metadata may be attached at
// construction or filled in later via update_empty_bucket_info().
struct rgw_sync_pipe_info_entity
{
private:
  RGWBucketInfo bucket_info;
  map<string, bufferlist> bucket_attrs;
  bool _has_bucket_info{false}; // true once set_bucket_info() has run

public:
  rgw_zone_id zone;

  rgw_sync_pipe_info_entity() {}
  rgw_sync_pipe_info_entity(const rgw_sync_bucket_entity& e,
                            std::optional<all_bucket_info>& binfo) {
    if (e.zone) {
      zone = *e.zone;
    }
    if (!e.bucket) {
      return;
    }
    // adopt the provided metadata only if it describes this same bucket;
    // otherwise record just the bucket key and stay "empty"
    if (!binfo ||
        binfo->bucket_info.bucket != *e.bucket) {
      bucket_info.bucket = *e.bucket;
    } else {
      set_bucket_info(*binfo);
    }
  }

  // Fill in bucket metadata from the given map when only the bucket key
  // is known so far. No-op if metadata is already set, no bucket is set,
  // or the map has no entry for this bucket.
  void update_empty_bucket_info(const std::map<rgw_bucket, all_bucket_info>& buckets_info) {
    if (_has_bucket_info) {
      return;
    }
    if (bucket_info.bucket.name.empty()) {
      return;
    }

    auto iter = buckets_info.find(bucket_info.bucket);
    if (iter == buckets_info.end()) {
      return;
    }

    set_bucket_info(iter->second);
  }

  bool has_bucket_info() const {
    return _has_bucket_info;
  }

  void set_bucket_info(const all_bucket_info& all_info) {
    bucket_info = all_info.bucket_info;
    bucket_attrs = all_info.attrs;
    _has_bucket_info = true;
  }

  const RGWBucketInfo& get_bucket_info() const {
    return bucket_info;
  }

  const rgw_bucket& get_bucket() const {
    return bucket_info.bucket;
  }

  // NOTE(review): this intentionally orders zones in *descending* order
  // (returns false when zone < e.zone) while buckets sort ascending; it is
  // still a valid strict weak ordering. Do not "fix" the apparent
  // inversion without auditing set iteration order in the pipe handlers.
  bool operator<(const rgw_sync_pipe_info_entity& e) const {
    if (zone < e.zone) {
      return false;
    }
    if (zone > e.zone) {
      return true;
    }
    return (bucket_info.bucket < e.bucket_info.bucket);
  }
};
+
+std::ostream& operator<<(std::ostream& out, const rgw_sync_pipe_info_entity& e) {
+ auto& bucket = e.get_bucket_info().bucket;
+
+ out << e.zone << ":" << bucket.get_key();
+ return out;
+}
+
+struct rgw_sync_pipe_handler_info {
+ RGWBucketSyncFlowManager::pipe_handler handler;
+ rgw_sync_pipe_info_entity source;
+ rgw_sync_pipe_info_entity target;
+
+ rgw_sync_pipe_handler_info() {}
+ rgw_sync_pipe_handler_info(const RGWBucketSyncFlowManager::pipe_handler& _handler,
+ std::optional<all_bucket_info> source_bucket_info,
+ std::optional<all_bucket_info> target_bucket_info) : handler(_handler),
+ source(handler.source, source_bucket_info),
+ target(handler.dest, target_bucket_info) {
+ }
+
+ bool operator<(const rgw_sync_pipe_handler_info& p) const {
+ if (source < p.source) {
+ return true;
+ }
+ if (p.source < source) {
+ return false;
+ }
+ return (target < p.target);
+ }
+
+ void update_empty_bucket_info(const std::map<rgw_bucket, all_bucket_info>& buckets_info) {
+ source.update_empty_bucket_info(buckets_info);
+ target.update_empty_bucket_info(buckets_info);
+ }
+};
+
+std::ostream& operator<<(std::ostream& out, const rgw_sync_pipe_handler_info& p) {
+ out << p.source << ">" << p.target;
+ return out;
+}
+
+struct rgw_sync_pipe_info_set {
+ std::set<rgw_sync_pipe_handler_info> handlers;
+
+ using iterator = std::set<rgw_sync_pipe_handler_info>::iterator;
+
+ void clear() {
+ handlers.clear();
+ }
+
+ void insert(const RGWBucketSyncFlowManager::pipe_handler& handler,
+ std::optional<all_bucket_info>& source_bucket_info,
+ std::optional<all_bucket_info>& target_bucket_info) {
+ rgw_sync_pipe_handler_info p(handler, source_bucket_info, target_bucket_info);
+ handlers.insert(p);
+ }
+
+ iterator begin() {
+ return handlers.begin();
+ }
+
+ iterator end() {
+ return handlers.end();
+ }
+
+ size_t size() const {
+ return handlers.size();
+ }
+
+ bool empty() const {
+ return handlers.empty();
+ }
+
+ void update_empty_bucket_info(const std::map<rgw_bucket, all_bucket_info>& buckets_info) {
+ if (buckets_info.empty()) {
+ return;
+ }
+
+ std::set<rgw_sync_pipe_handler_info> p;
+
+ for (auto pipe : handlers) {
+ pipe.update_empty_bucket_info(buckets_info);
+ p.insert(pipe);
+ }
+
+ handlers = std::move(p);
+ }
+};
+
// Runs bucket sync for the pipes whose source is the given bucket shard.
// Declaration only: operate() is defined later in this file, after its
// dependencies.
class RGWRunBucketSourcesSyncCR : public RGWCoroutine {
  RGWDataSyncCtx *sc;
  RGWDataSyncEnv *sync_env;
  boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr;

  rgw_sync_pipe_info_set pipes; // pipes to run, presumably keyed off source_bs
  rgw_sync_pipe_info_set::iterator siter;

  rgw_bucket_sync_pair_info sync_pair;

  RGWSyncTraceNodeRef tn;
  ceph::real_time* progress; // aggregate progress reported back to the caller
  std::vector<ceph::real_time> shard_progress; // per-spawned-sync progress slots
  std::vector<ceph::real_time>::iterator cur_shard_progress;

  RGWRESTConn *conn{nullptr};
  rgw_zone_id last_zone;

  std::optional<uint64_t> gen; // bilog generation to sync, when pinned
  rgw_bucket_index_marker_info marker_info;
  BucketIndexShardsManager marker_mgr;

public:
  RGWRunBucketSourcesSyncCR(RGWDataSyncCtx *_sc,
                            boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
                            const rgw_bucket_shard& source_bs,
                            const RGWSyncTraceNodeRef& _tn_parent,
                            std::optional<uint64_t> gen,
                            ceph::real_time* progress);

  int operate(const DoutPrefixProvider *dpp) override;
};
+
// Syncs a single data-change obligation (one bucket shard, optional bilog
// generation). Obligations are deduplicated through the shared
// rgw::bucket_sync::Handle: if a sibling coroutine already owns this
// shard's obligation, the older of the two obligations is completed
// immediately and the newer one is left on the handle for the running
// coroutine to pick up on its next loop iteration.
class RGWDataSyncSingleEntryCR : public RGWCoroutine {
  RGWDataSyncCtx *sc;
  RGWDataSyncEnv *sync_env;
  rgw::bucket_sync::Handle state; // cached bucket-shard state
  rgw_data_sync_obligation obligation; // input obligation
  std::optional<rgw_data_sync_obligation> complete; // obligation to complete
  uint32_t obligation_counter = 0; // snapshot of state->counter to detect updates
  RGWDataSyncShardMarkerTrack *marker_tracker; // may be null (full sync path)
  rgw_raw_obj error_repo;
  boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr;
  RGWSyncTraceNodeRef tn;

  ceph::real_time progress;
  int sync_status = 0;
public:
  RGWDataSyncSingleEntryCR(RGWDataSyncCtx *_sc, rgw::bucket_sync::Handle state,
                           rgw_data_sync_obligation _obligation,
                           RGWDataSyncShardMarkerTrack *_marker_tracker,
                           const rgw_raw_obj& error_repo,
                           boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
                           const RGWSyncTraceNodeRef& _tn_parent)
    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
      state(std::move(state)), obligation(std::move(_obligation)),
      marker_tracker(_marker_tracker), error_repo(error_repo),
      lease_cr(std::move(lease_cr)) {
    set_description() << "data sync single entry (source_zone=" << sc->source_zone << ") " << obligation;
    tn = sync_env->sync_tracer->add_node(_tn_parent, "entry", to_string(obligation.bs, obligation.gen));
  }

  int operate(const DoutPrefixProvider *dpp) override {
    reenter(this) {
      if (state->obligation) {
        // this is already syncing in another DataSyncSingleEntryCR
        if (state->obligation->timestamp < obligation.timestamp) {
          // cancel existing obligation and overwrite it
          tn->log(10, SSTR("canceling existing obligation " << *state->obligation));
          complete = std::move(*state->obligation);
          *state->obligation = std::move(obligation);
          state->counter++;
        } else {
          // cancel new obligation
          tn->log(10, SSTR("canceling new obligation " << obligation));
          complete = std::move(obligation);
        }
      } else {
        // start syncing a new obligation
        state->obligation = obligation;
        obligation_counter = state->counter;
        state->counter++;

        // loop until the latest obligation is satisfied, because other callers
        // may update the obligation while we're syncing
        while ((state->obligation->timestamp == ceph::real_time() ||
                state->progress_timestamp < state->obligation->timestamp) &&
               obligation_counter != state->counter) {
          obligation_counter = state->counter;
          progress = ceph::real_time{};

          ldout(cct, 4) << "starting sync on " << bucket_shard_str{state->key.first}
              << ' ' << *state->obligation << " progress timestamp " << state->progress_timestamp
              << " progress " << progress << dendl;
          yield call(new RGWRunBucketSourcesSyncCR(sc, lease_cr,
                                                   state->key.first, tn,
                                                   state->obligation->gen,
                                                   &progress));
          if (retcode < 0) {
            break;
          }
          state->progress_timestamp = std::max(progress, state->progress_timestamp);
        }
        // any new obligations will process themselves
        complete = std::move(*state->obligation);
        state->obligation.reset();

        tn->log(10, SSTR("sync finished on " << bucket_shard_str{state->key.first}
                         << " progress=" << progress << ' ' << complete << " r=" << retcode));
      }
      sync_status = retcode;

      if (sync_status == -ENOENT) {
        // this was added when 'tenant/' was added to datalog entries, because
        // preexisting tenant buckets could never sync and would stay in the
        // error_repo forever
        tn->log(0, SSTR("WARNING: skipping data log entry for missing bucket " << complete->bs));
        sync_status = 0;
      }

      if (sync_status < 0) {
        // write actual sync failures for 'radosgw-admin sync error list'
        if (sync_status != -EBUSY && sync_status != -EAGAIN) {
          yield call(sync_env->error_logger->log_error_cr(dpp, sc->conn->get_remote_id(), "data",
                                                          to_string(complete->bs, complete->gen),
                                                          -sync_status, string("failed to sync bucket instance: ") + cpp_strerror(-sync_status)));
          if (retcode < 0) {
            tn->log(0, SSTR("ERROR: failed to log sync failure: retcode=" << retcode));
          }
        }
        // a nonzero timestamp marks a retryable obligation; queue it in the
        // error repo so the incremental shard loop picks it up again
        if (complete->timestamp != ceph::real_time{}) {
          tn->log(10, SSTR("writing " << *complete << " to error repo for retry"));
          yield call(rgw::error_repo::write_cr(sync_env->driver->svc()->rados, error_repo,
                                               rgw::error_repo::encode_key(complete->bs, complete->gen),
                                               complete->timestamp));
          if (retcode < 0) {
            tn->log(0, SSTR("ERROR: failed to log sync failure in error repo: retcode=" << retcode));
          }
        }
      } else if (complete->retry) {
        // this obligation came from the error repo and succeeded: drop its
        // omap key so it is not retried again
        yield call(rgw::error_repo::remove_cr(sync_env->driver->svc()->rados, error_repo,
                                              rgw::error_repo::encode_key(complete->bs, complete->gen),
                                              complete->timestamp));
        if (retcode < 0) {
          tn->log(0, SSTR("ERROR: failed to remove omap key from error repo ("
                          << error_repo << " retcode=" << retcode));
        }
      }
      /* FIXME: what to do in case of error */
      if (marker_tracker && !complete->marker.empty()) {
        /* update marker */
        yield call(marker_tracker->finish(complete->marker));
        if (retcode < 0) {
          return set_cr_error(retcode);
        }
      }
      if (sync_status == 0) {
        sync_status = retcode;
      }
      if (sync_status < 0) {
        return set_cr_error(sync_status);
      }
      return set_cr_done();
    }
    return 0;
  }
};
+
+rgw_raw_obj datalog_oid_for_error_repo(RGWDataSyncCtx *sc, rgw::sal::RadosStore* driver,
+ rgw_pool& pool, rgw_bucket_shard& bs) {
+ int datalog_shard = driver->svc()->datalog_rados->choose_oid(bs);
+ string oid = RGWDataSyncStatusManager::shard_obj_name(sc->source_zone, datalog_shard);
+ return rgw_raw_obj(pool, oid + ".retry");
+ }
+
// Expands a generation-less (full sync) obligation found in the error repo
// into explicit per-shard, per-generation retry entries, then removes the
// original key. Incremental sync then retries each expanded entry normally.
class RGWDataIncrementalSyncFullObligationCR: public RGWCoroutine {
  RGWDataSyncCtx *sc;
  RGWDataSyncEnv *sync_env;
  rgw_bucket_shard source_bs; // the bucket shard the obligation was keyed on
  rgw_raw_obj error_repo; // input repo; overwritten per-bucket inside operate()
  std::string error_marker; // omap key of the original obligation
  ceph::real_time timestamp;
  RGWSyncTraceNodeRef tn;
  rgw_bucket_index_marker_info remote_info;
  rgw_pool pool;
  uint32_t sid;
  rgw_bucket_shard bs;
  std::vector<store_gen_shards>::const_iterator each;

public:
  RGWDataIncrementalSyncFullObligationCR(RGWDataSyncCtx *_sc, rgw_bucket_shard& _source_bs,
                                         const rgw_raw_obj& error_repo, const std::string& _error_marker,
                                         ceph::real_time& _timestamp, RGWSyncTraceNodeRef& _tn)
    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env), source_bs(_source_bs),
      error_repo(error_repo), error_marker(_error_marker), timestamp(_timestamp),
      tn(sync_env->sync_tracer->add_node(_tn, "error_repo", SSTR(bucket_shard_str(source_bs))))
  {}

  int operate(const DoutPrefixProvider *dpp) override {
    reenter(this) {
      // fetch the bucket's generations and shard counts from the source zone
      yield call(new RGWReadRemoteBucketIndexLogInfoCR(sc, source_bs.bucket, &remote_info));
      if (retcode < 0) {
        return set_cr_error(retcode);
      }

      each = remote_info.generations.cbegin();
      for (; each != remote_info.generations.cend(); each++) {
        for (sid = 0; sid < each->num_shards; sid++) {
          bs.bucket = source_bs.bucket;
          bs.shard_id = sid;
          pool = sync_env->svc->zone->get_zone_params().log_pool;
          // NOTE(review): the repo object is chosen from source_bs (the
          // original obligation's shard), not from bs, so all expanded
          // entries land in the same error repo the obligation came from —
          // presumably intentional; confirm against choose_oid() semantics.
          error_repo = datalog_oid_for_error_repo(sc, sync_env->driver, pool, source_bs);
          tn->log(10, SSTR("writing shard_id " << sid << " of gen " << each->gen << " to error repo for retry"));
          yield_spawn_window(rgw::error_repo::write_cr(sync_env->driver->svc()->rados, error_repo,
                                                       rgw::error_repo::encode_key(bs, each->gen),
                                                       timestamp), sc->lcc.adj_concurrency(cct->_conf->rgw_data_sync_spawn_window),
                             [&](uint64_t stack_id, int ret) {
                               if (ret < 0) {
                                 retcode = ret;
                               }
                               return 0;
                             });
        }
      }
      drain_all_cb([&](uint64_t stack_id, int ret) {
        if (ret < 0) {
          tn->log(10, SSTR("writing to error repo returned error: " << ret));
        }
        return ret;
      });

      // once everything succeeds, remove the full sync obligation from the error repo
      yield call(rgw::error_repo::remove_cr(sync_env->driver->svc()->rados, error_repo,
                                            error_marker, timestamp));
      return set_cr_done();
    }
    return 0;
  }
};
+
+RGWCoroutine* data_sync_single_entry(RGWDataSyncCtx *sc, const rgw_bucket_shard& src,
+ std::optional<uint64_t> gen,
+ const std::string marker,
+ ceph::real_time timestamp,
+ boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
+ boost::intrusive_ptr<rgw::bucket_sync::Cache> bucket_shard_cache,
+ RGWDataSyncShardMarkerTrack* marker_tracker,
+ rgw_raw_obj error_repo,
+ RGWSyncTraceNodeRef& tn,
+ bool retry) {
+ auto state = bucket_shard_cache->get(src, gen);
+ auto obligation = rgw_data_sync_obligation{src, gen, marker, timestamp, retry};
+ return new RGWDataSyncSingleEntryCR(sc, std::move(state), std::move(obligation),
+ &*marker_tracker, error_repo,
+ lease_cr.get(), tn);
+}
+
+static ceph::real_time timestamp_for_bucket_shard(rgw::sal::RadosStore* driver,
+ const rgw_data_sync_status& sync_status,
+ const rgw_bucket_shard& bs) {
+ int datalog_shard = driver->svc()->datalog_rados->choose_oid(bs);
+ auto status = sync_status.sync_markers.find(datalog_shard);
+ if (status == sync_status.sync_markers.end()) {
+ return ceph::real_clock::zero();
+ }
+ return status->second.timestamp;
+}
+
// Full-sync handler for one bucket listed in the full-sync index: reads the
// remote bucket index log info, then syncs every shard of every generation.
// The first shard of the oldest generation is awaited inline; the rest are
// spawned within the configured spawn window. Failures are written to the
// per-datalog-shard error repo for later retry.
class RGWDataFullSyncSingleEntryCR : public RGWCoroutine {
  RGWDataSyncCtx *sc;
  RGWDataSyncEnv *sync_env;
  rgw_pool pool;
  rgw_bucket_shard source_bs;
  const std::string key; // full-sync index omap key for this bucket
  rgw_data_sync_status sync_status;
  rgw_raw_obj error_repo; // recomputed per shard inside operate()
  ceph::real_time timestamp;
  boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr;
  boost::intrusive_ptr<rgw::bucket_sync::Cache> bucket_shard_cache;
  RGWDataSyncShardMarkerTrack* marker_tracker;
  RGWSyncTraceNodeRef tn;
  rgw_bucket_index_marker_info remote_info;
  uint32_t sid;
  std::vector<store_gen_shards>::iterator each;
  uint64_t i{0};
  RGWCoroutine* shard_cr = nullptr;
  bool first_shard = true;
  bool error_inject; // test hook: randomly fail the remote bilog read

public:
  RGWDataFullSyncSingleEntryCR(RGWDataSyncCtx *_sc, const rgw_pool& _pool, const rgw_bucket_shard& _source_bs,
                               const std::string& _key, const rgw_data_sync_status& sync_status, const rgw_raw_obj& _error_repo,
                               ceph::real_time _timestamp, boost::intrusive_ptr<const RGWContinuousLeaseCR> _lease_cr,
                               boost::intrusive_ptr<rgw::bucket_sync::Cache> _bucket_shard_cache,
                               RGWDataSyncShardMarkerTrack* _marker_tracker,
                               RGWSyncTraceNodeRef& _tn)
    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env), pool(_pool), source_bs(_source_bs), key(_key),
      error_repo(_error_repo), timestamp(_timestamp), lease_cr(std::move(_lease_cr)),
      bucket_shard_cache(_bucket_shard_cache), marker_tracker(_marker_tracker), tn(_tn) {
    error_inject = (sync_env->cct->_conf->rgw_sync_data_full_inject_err_probability > 0);
  }


  int operate(const DoutPrefixProvider *dpp) override {
    reenter(this) {
      if (error_inject &&
          rand() % 10000 < cct->_conf->rgw_sync_data_full_inject_err_probability * 10000.0) {
        tn->log(0, SSTR("injecting read bilog info error on key=" << key));
        retcode = -ENOENT;
      } else {
        tn->log(0, SSTR("read bilog info key=" << key));
        yield call(new RGWReadRemoteBucketIndexLogInfoCR(sc, source_bs.bucket, &remote_info));
      }

      if (retcode < 0) {
        tn->log(10, SSTR("full sync: failed to read remote bucket info. Writing "
                         << source_bs.shard_id << " to error repo for retry"));
        yield call(rgw::error_repo::write_cr(sync_env->driver->svc()->rados, error_repo,
                                             rgw::error_repo::encode_key(source_bs, std::nullopt),
                                             timestamp));
        if (retcode < 0) {
          tn->log(0, SSTR("ERROR: failed to log " << source_bs.shard_id << " in error repo: retcode=" << retcode));
        }
        yield call(marker_tracker->finish(key));
        return set_cr_error(retcode);
      }

      //wait to sync the first shard of the oldest generation and then sync all other shards.
      //if any of the operations fail at any time, write them into error repo for later retry.

      each = remote_info.generations.begin();
      for (; each != remote_info.generations.end(); each++) {
        for (sid = 0; sid < each->num_shards; sid++) {
          source_bs.shard_id = sid;
          // use the error repo and sync status timestamp from the datalog shard corresponding to source_bs
          error_repo = datalog_oid_for_error_repo(sc, sync_env->driver, pool, source_bs);
          timestamp = timestamp_for_bucket_shard(sync_env->driver, sync_status, source_bs);
          // retcode is sticky across iterations: once any shard sync fails,
          // all remaining shards are queued in the error repo instead of synced
          if (retcode < 0) {
            tn->log(10, SSTR("Write " << source_bs.shard_id << " to error repo for retry"));
            yield_spawn_window(rgw::error_repo::write_cr(sync_env->driver->svc()->rados, error_repo,
                                                         rgw::error_repo::encode_key(source_bs, each->gen),
                                                         timestamp), sc->lcc.adj_concurrency(cct->_conf->rgw_data_sync_spawn_window), std::nullopt);
          } else {
            // note: marker_tracker is intentionally nullptr here; this CR
            // finishes `key` itself after all shards drain
            shard_cr = data_sync_single_entry(sc, source_bs, each->gen, key, timestamp,
                                              lease_cr, bucket_shard_cache, nullptr, error_repo, tn, false);
            tn->log(10, SSTR("full sync: syncing shard_id " << sid << " of gen " << each->gen));
            if (first_shard) {
              yield call(shard_cr);
              first_shard = false;
            } else {
              yield_spawn_window(shard_cr, sc->lcc.adj_concurrency(cct->_conf->rgw_data_sync_spawn_window),
                                 [&](uint64_t stack_id, int ret) {
                                   if (ret < 0) {
                                     retcode = ret;
                                   }
                                   return retcode;
                                 });
            }
          }
        }
        drain_all_cb([&](uint64_t stack_id, int ret) {
          if (ret < 0) {
            retcode = ret;
          }
          return retcode;
        });
      }

      yield call(marker_tracker->finish(key));
      if (retcode < 0) {
        return set_cr_error(retcode);
      }

      return set_cr_done();
    }
    return 0;
  }
};
+
+class RGWDataBaseSyncShardCR : public RGWCoroutine {
+protected:
+ RGWDataSyncCtx *const sc;
+ const rgw_pool& pool;
+ const uint32_t shard_id;
+ rgw_data_sync_marker& sync_marker;
+ RGWSyncTraceNodeRef tn;
+ const string& status_oid;
+ const rgw_raw_obj& error_repo;
+ boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr;
+ const rgw_data_sync_status& sync_status;
+ RGWObjVersionTracker& objv;
+ boost::intrusive_ptr<rgw::bucket_sync::Cache> bucket_shard_cache;
+
+ std::optional<RGWDataSyncShardMarkerTrack> marker_tracker;
+ RGWRadosGetOmapValsCR::ResultPtr omapvals;
+ rgw_bucket_shard source_bs;
+
+ int parse_bucket_key(const std::string& key, rgw_bucket_shard& bs) const {
+ int ret = rgw_bucket_parse_bucket_key(sc->env->cct, key,
+ &bs.bucket, &bs.shard_id);
+ //for the case of num_shards 0, shard_id gets a value of -1
+ //because of the way bucket instance gets parsed in the absence of shard_id delimiter.
+ //interpret it as a non-negative value.
+ if (ret == 0) {
+ if (bs.shard_id < 0) {
+ bs.shard_id = 0;
+ }
+ }
+ return ret;
+ }
+
+ RGWDataBaseSyncShardCR(
+ RGWDataSyncCtx *const _sc, const rgw_pool& pool, const uint32_t shard_id,
+ rgw_data_sync_marker& sync_marker, RGWSyncTraceNodeRef tn,
+ const string& status_oid, const rgw_raw_obj& error_repo,
+ boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
+ const rgw_data_sync_status& sync_status,
+ RGWObjVersionTracker& objv,
+ const boost::intrusive_ptr<rgw::bucket_sync::Cache>& bucket_shard_cache)
+ : RGWCoroutine(_sc->cct), sc(_sc), pool(pool), shard_id(shard_id),
+ sync_marker(sync_marker), tn(tn), status_oid(status_oid),
+ error_repo(error_repo), lease_cr(std::move(lease_cr)),
+ sync_status(sync_status), objv(objv),
+ bucket_shard_cache(bucket_shard_cache) {}
+};
+
// Full sync of one datalog shard: pages through the shard's full-sync
// omap index, spawning an RGWDataFullSyncSingleEntryCR per bucket entry,
// then flips the persisted marker state to IncrementalSync and removes
// the index object.
class RGWDataFullSyncShardCR : public RGWDataBaseSyncShardCR {
  static constexpr auto OMAP_GET_MAX_ENTRIES = 100;

  string oid; // full-sync index object for this shard
  uint64_t total_entries = 0;
  ceph::real_time entry_timestamp;
  std::map<std::string, bufferlist> entries;
  std::map<std::string, bufferlist>::iterator iter;
  string error_marker;

public:

  RGWDataFullSyncShardCR(
    RGWDataSyncCtx *const sc, const rgw_pool& pool, const uint32_t shard_id,
    rgw_data_sync_marker& sync_marker, RGWSyncTraceNodeRef tn,
    const string& status_oid, const rgw_raw_obj& error_repo,
    boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
    const rgw_data_sync_status& sync_status, RGWObjVersionTracker& objv,
    const boost::intrusive_ptr<rgw::bucket_sync::Cache>& bucket_shard_cache)
    : RGWDataBaseSyncShardCR(sc, pool, shard_id, sync_marker, tn,
                             status_oid, error_repo, std::move(lease_cr),
                             sync_status, objv, bucket_shard_cache) {}

  int operate(const DoutPrefixProvider *dpp) override {
    reenter(this) {
      tn->log(10, "start full sync");
      oid = full_data_sync_index_shard_oid(sc->source_zone, shard_id);
      marker_tracker.emplace(sc, status_oid, sync_marker, tn, objv);
      total_entries = sync_marker.pos;
      entry_timestamp = sync_marker.timestamp; // time when full sync started
      do {
        if (!lease_cr->is_locked()) {
          drain_all();
          tn->log(1, "lease is lost, abort");
          return set_cr_error(-ECANCELED);
        }
        // read the next page of bucket keys from the index, starting after
        // the last persisted marker
        omapvals = std::make_shared<RGWRadosGetOmapValsCR::Result>();
        yield call(new RGWRadosGetOmapValsCR(sc->env->driver,
                                             rgw_raw_obj(pool, oid),
                                             sync_marker.marker,
                                             OMAP_GET_MAX_ENTRIES, omapvals));
        if (retcode < 0) {
          drain_all();
          return set_cr_error(retcode);
        }
        entries = std::move(omapvals->entries);
        if (entries.size() > 0) {
          tn->set_flag(RGW_SNS_FLAG_ACTIVE); /* actually have entries to sync */
        }
        tn->log(20, SSTR("retrieved " << entries.size() << " entries to sync"));
        iter = entries.begin();
        for (; iter != entries.end(); ++iter) {
          retcode = parse_bucket_key(iter->first, source_bs);
          if (retcode < 0) {
            // unparseable key: advance the high marker past it so it is
            // skipped permanently rather than retried forever
            tn->log(1, SSTR("failed to parse bucket shard: " << iter->first));
            marker_tracker->try_update_high_marker(iter->first, 0,
                                                   entry_timestamp);
            continue;
          }
          tn->log(20, SSTR("full sync: " << iter->first));
          total_entries++;
          if (!marker_tracker->start(iter->first, total_entries,
                                     entry_timestamp)) {
            tn->log(0, SSTR("ERROR: cannot start syncing " << iter->first
                            << ". Duplicate entry?"));
          } else {
            tn->log(10, SSTR("timestamp for " << iter->first << " is :" << entry_timestamp));
            yield_spawn_window(new RGWDataFullSyncSingleEntryCR(
                                 sc, pool, source_bs, iter->first, sync_status,
                                 error_repo, entry_timestamp, lease_cr,
                                 bucket_shard_cache, &*marker_tracker, tn),
                               sc->lcc.adj_concurrency(cct->_conf->rgw_data_sync_spawn_window),
                               std::nullopt);
          }
          sync_marker.marker = iter->first;
        }
      } while (omapvals->more);
      omapvals.reset();

      drain_all();

      tn->unset_flag(RGW_SNS_FLAG_ACTIVE);

      /* update marker to reflect we're done with full sync */
      sync_marker.state = rgw_data_sync_marker::IncrementalSync;
      sync_marker.marker = sync_marker.next_step_marker;
      sync_marker.next_step_marker.clear();
      yield call(new RGWSimpleRadosWriteCR<rgw_data_sync_marker>(
                   sc->env->dpp, sc->env->driver,
                   rgw_raw_obj(pool, status_oid), sync_marker, &objv));
      if (retcode < 0) {
        tn->log(0, SSTR("ERROR: failed to set sync marker: retcode=" << retcode));
        return set_cr_error(retcode);
      }

      // clean up full sync index, ignoring errors
      yield call(new RGWRadosRemoveCR(sc->env->driver, {pool, oid}));

      // transition to incremental sync
      return set_cr_done();
    }
    return 0;
  }
};
+
// Incremental sync of one datalog shard. Loops forever, each pass handling
// three sources of work: (1) out-of-band change notifications delivered via
// modified_shards, (2) previously failed entries replayed from the error
// repo (rate-limited by a retry backoff), and (3) the remote datalog shard
// listing itself. Aborts with -ECANCELED whenever the lease is lost.
class RGWDataIncSyncShardCR : public RGWDataBaseSyncShardCR {
  static constexpr int max_error_entries = 10;
  static constexpr uint32_t retry_backoff_secs = 60;

  // inc_lock guards modified_shards, which is filled by notification
  // handlers on other threads
  ceph::mutex& inc_lock;
  bc::flat_set<rgw_data_notify_entry>& modified_shards;

  bc::flat_set<rgw_data_notify_entry> current_modified;
  decltype(current_modified)::iterator modified_iter;

  ceph::coarse_real_time error_retry_time; // next time to scan the error repo
  string error_marker;
  std::map<std::string, bufferlist> error_entries;
  decltype(error_entries)::iterator iter;
  ceph::real_time entry_timestamp;
  std::optional<uint64_t> gen;

  string next_marker;
  vector<rgw_data_change_log_entry> log_entries;
  decltype(log_entries)::iterator log_iter;
  bool truncated = false;
  int cbret = 0; // first error reported by a spawned single-entry sync

  // Idle wait between polls: the configured poll interval, shortened so we
  // wake up in time for a pending error-repo retry deadline.
  utime_t get_idle_interval() const {
    ceph::timespan interval = std::chrono::seconds(cct->_conf->rgw_data_sync_poll_interval);
    if (!ceph::coarse_real_clock::is_zero(error_retry_time)) {
      auto now = ceph::coarse_real_clock::now();
      if (error_retry_time > now) {
        auto d = error_retry_time - now;
        if (interval > d) {
          interval = d;
        }
      }
    }
    // convert timespan -> time_point -> utime_t
    return utime_t(ceph::coarse_real_clock::zero() + interval);
  }


public:

  RGWDataIncSyncShardCR(
    RGWDataSyncCtx *const sc, const rgw_pool& pool, const uint32_t shard_id,
    rgw_data_sync_marker& sync_marker, RGWSyncTraceNodeRef tn,
    const string& status_oid, const rgw_raw_obj& error_repo,
    boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
    const rgw_data_sync_status& sync_status, RGWObjVersionTracker& objv,
    const boost::intrusive_ptr<rgw::bucket_sync::Cache>& bucket_shard_cache,
    ceph::mutex& inc_lock,
    bc::flat_set<rgw_data_notify_entry>& modified_shards)
    : RGWDataBaseSyncShardCR(sc, pool, shard_id, sync_marker, tn,
                             status_oid, error_repo, std::move(lease_cr),
                             sync_status, objv, bucket_shard_cache),
      inc_lock(inc_lock), modified_shards(modified_shards) {}

  int operate(const DoutPrefixProvider *dpp) override {
    reenter(this) {
      tn->log(10, "start incremental sync");
      marker_tracker.emplace(sc, status_oid, sync_marker, tn, objv);
      do {
        if (!lease_cr->is_locked()) {
          drain_all();
          tn->log(1, "lease is lost, abort");
          return set_cr_error(-ECANCELED);
        }
        {
          // take a private snapshot of the pending notifications
          current_modified.clear();
          std::unique_lock il(inc_lock);
          current_modified.swap(modified_shards);
          il.unlock();
        }

        if (current_modified.size() > 0) {
          tn->set_flag(RGW_SNS_FLAG_ACTIVE); /* actually have entries to sync */
        }
        /* process out of band updates */
        for (modified_iter = current_modified.begin();
             modified_iter != current_modified.end();
             ++modified_iter) {
          if (!lease_cr->is_locked()) {
            drain_all();
            yield call(marker_tracker->flush());
            if (retcode < 0) {
              tn->log(0, SSTR("ERROR: data sync marker_tracker.flush() returned retcode=" << retcode));
              return set_cr_error(retcode);
            }
            return set_cr_error(-ECANCELED);
          }
          retcode = parse_bucket_key(modified_iter->key, source_bs);
          if (retcode < 0) {
            tn->log(1, SSTR("failed to parse bucket shard: "
                            << modified_iter->key));
            continue;
          }
          tn->log(20, SSTR("received async update notification: "
                           << modified_iter->key));
          // notifications carry no datalog marker, so marker tracking is a no-op
          spawn(data_sync_single_entry(sc, source_bs, modified_iter->gen, {},
                                       ceph::real_time{}, lease_cr,
                                       bucket_shard_cache, &*marker_tracker,
                                       error_repo, tn, false), false);
        }

        if (error_retry_time <= ceph::coarse_real_clock::now()) {
          /* process bucket shards that previously failed */
          omapvals = std::make_shared<RGWRadosGetOmapValsCR::Result>();
          yield call(new RGWRadosGetOmapValsCR(sc->env->driver, error_repo,
                                               error_marker, max_error_entries,
                                               omapvals));
          // NOTE(review): retcode from this omap read is not checked; on
          // failure we proceed with an empty result — confirm intended.
          error_entries = std::move(omapvals->entries);
          tn->log(20, SSTR("read error repo, got " << error_entries.size()
                           << " entries"));
          iter = error_entries.begin();
          for (; iter != error_entries.end(); ++iter) {
            if (!lease_cr->is_locked()) {
              drain_all();
              yield call(marker_tracker->flush());
              if (retcode < 0) {
                tn->log(0, SSTR("ERROR: data sync marker_tracker.flush() returned retcode=" << retcode));
                return set_cr_error(retcode);
              }
              return set_cr_error(-ECANCELED);
            }
            error_marker = iter->first;
            entry_timestamp = rgw::error_repo::decode_value(iter->second);
            retcode = rgw::error_repo::decode_key(iter->first, source_bs, gen);
            if (retcode == -EINVAL) {
              // backward compatibility for string keys that don't encode a gen
              retcode = parse_bucket_key(error_marker, source_bs);
            }
            if (retcode < 0) {
              // undecodable key: drop it from the repo instead of retrying forever
              tn->log(1, SSTR("failed to parse bucket shard: " << error_marker));
              spawn(rgw::error_repo::remove_cr(sc->env->driver->svc()->rados,
                                               error_repo, error_marker,
                                               entry_timestamp),
                    false);
              continue;
            }
            tn->log(10, SSTR("gen is " << gen));
            if (!gen) {
              // write all full sync obligations for the bucket to error repo
              spawn(new RGWDataIncrementalSyncFullObligationCR(sc, source_bs,
                                                               error_repo, error_marker, entry_timestamp, tn), false);
            } else {
              tn->log(20, SSTR("handle error entry key="
                               << to_string(source_bs, gen)
                               << " timestamp=" << entry_timestamp));
              spawn(data_sync_single_entry(sc, source_bs, gen, "",
                                           entry_timestamp, lease_cr,
                                           bucket_shard_cache, &*marker_tracker,
                                           error_repo, tn, true), false);
            }
          }
          if (!omapvals->more) {
            // finished a full pass over the error repo; back off before
            // scanning it again from the beginning
            error_retry_time = ceph::coarse_real_clock::now() +
              make_timespan(retry_backoff_secs);
            error_marker.clear();
          }
        }
        omapvals.reset();

        tn->log(20, SSTR("shard_id=" << shard_id << " sync_marker="
                         << sync_marker.marker));
        yield call(new RGWReadRemoteDataLogShardCR(sc, shard_id,
                                                   sync_marker.marker,
                                                   &next_marker, &log_entries,
                                                   &truncated));
        if (retcode < 0 && retcode != -ENOENT) {
          tn->log(0, SSTR("ERROR: failed to read remote data log info: ret="
                          << retcode));
          drain_all();
          return set_cr_error(retcode);
        }

        if (log_entries.size() > 0) {
          tn->set_flag(RGW_SNS_FLAG_ACTIVE); /* actually have entries to sync */
        }

        for (log_iter = log_entries.begin();
             log_iter != log_entries.end();
             ++log_iter) {
          if (!lease_cr->is_locked()) {
            drain_all();
            yield call(marker_tracker->flush());
            if (retcode < 0) {
              tn->log(0, SSTR("ERROR: data sync marker_tracker.flush() returned retcode=" << retcode));
              return set_cr_error(retcode);
            }
            return set_cr_error(-ECANCELED);
          }

          tn->log(20, SSTR("shard_id=" << shard_id << " log_entry: " << log_iter->log_id << ":" << log_iter->log_timestamp << ":" << log_iter->entry.key));
          retcode = parse_bucket_key(log_iter->entry.key, source_bs);
          if (retcode < 0) {
            // unparseable entry: advance the high marker past it so it is
            // skipped permanently rather than retried forever
            tn->log(1, SSTR("failed to parse bucket shard: "
                            << log_iter->entry.key));
            marker_tracker->try_update_high_marker(log_iter->log_id, 0,
                                                   log_iter->log_timestamp);
            continue;
          }
          if (!marker_tracker->start(log_iter->log_id, 0,
                                     log_iter->log_timestamp)) {
            tn->log(0, SSTR("ERROR: cannot start syncing " << log_iter->log_id
                            << ". Duplicate entry?"));
          } else {
            tn->log(1, SSTR("incremental sync on " << log_iter->entry.key << "shard: " << shard_id << "on gen " << log_iter->entry.gen));
            yield_spawn_window(data_sync_single_entry(sc, source_bs, log_iter->entry.gen, log_iter->log_id,
                                                      log_iter->log_timestamp, lease_cr,bucket_shard_cache,
                                                      &*marker_tracker, error_repo, tn, false),
                               sc->lcc.adj_concurrency(cct->_conf->rgw_data_sync_spawn_window),
                               [&](uint64_t stack_id, int ret) {
                                 if (ret < 0) {
                                   tn->log(10, SSTR("data_sync_single_entry returned error: " << ret));
                                   cbret = ret;
                                 }
                                 return 0;
                               });
          }
        }
        if (cbret < 0 ) {
          retcode = cbret;
          drain_all();
          return set_cr_error(retcode);
        }

        tn->log(20, SSTR("shard_id=" << shard_id <<
                         " sync_marker="<< sync_marker.marker
                         << " next_marker=" << next_marker
                         << " truncated=" << truncated));
        if (!next_marker.empty()) {
          sync_marker.marker = next_marker;
        } else if (!log_entries.empty()) {
          sync_marker.marker = log_entries.back().log_id;
        }
        if (!truncated) {
          // we reached the end, wait a while before checking for more
          tn->unset_flag(RGW_SNS_FLAG_ACTIVE);
          yield wait(get_idle_interval());
        }
      } while (true);
    }
    return 0;
  }
};
+
// Long-lived coroutine that drives replication for a single data log shard.
// It acquires (and holds, via RGWContinuousLeaseCR) the shard's sync lease,
// re-reads the persisted shard marker, then loops forever dispatching to the
// full-sync or incremental-sync sub-coroutine based on the marker state.
class RGWDataSyncShardCR : public RGWCoroutine {
  RGWDataSyncCtx *const sc;
  const rgw_pool pool;          // log pool holding status/error objects
  const uint32_t shard_id;
  rgw_data_sync_marker& sync_marker; // persisted position; re-read after lease
  rgw_data_sync_status sync_status;
  const RGWSyncTraceNodeRef tn;
  RGWObjVersionTracker& objv;   // guards racing writers of the shard marker
  bool *reset_backoff;          // tells RGWBackoffControlCR to reset its delay

  // protects modified_shards against concurrent wakeup() notifications
  ceph::mutex inc_lock = ceph::make_mutex("RGWDataSyncShardCR::inc_lock");
  ceph::condition_variable inc_cond;

  RGWDataSyncEnv *const sync_env{ sc->env };

  const string status_oid{ RGWDataSyncStatusManager::shard_obj_name(
      sc->source_zone, shard_id) };
  // omap object collecting entries that failed to sync and must be retried
  const rgw_raw_obj error_repo{ pool, status_oid + ".retry" };

  // target number of entries to cache before recycling idle ones
  static constexpr size_t target_cache_size = 256;
  boost::intrusive_ptr<rgw::bucket_sync::Cache> bucket_shard_cache {
    rgw::bucket_sync::Cache::create(target_cache_size) };

  boost::intrusive_ptr<RGWContinuousLeaseCR> lease_cr;
  boost::intrusive_ptr<RGWCoroutinesStack> lease_stack;

  // bucket shards flagged as changed by incoming data notifications;
  // consumed by the incremental sync sub-coroutine
  bc::flat_set<rgw_data_notify_entry> modified_shards;

public:
  RGWDataSyncShardCR(RGWDataSyncCtx* const _sc, const rgw_pool& pool,
                     const uint32_t shard_id, rgw_data_sync_marker& marker,
                     const rgw_data_sync_status& sync_status,
                     RGWSyncTraceNodeRef& tn, RGWObjVersionTracker& objv, bool *reset_backoff)
    : RGWCoroutine(_sc->cct), sc(_sc), pool(pool), shard_id(shard_id),
      sync_marker(marker), sync_status(sync_status), tn(tn),
      objv(objv), reset_backoff(reset_backoff) {
    set_description() << "data sync shard source_zone=" << sc->source_zone
                      << " shard_id=" << shard_id;
  }

  ~RGWDataSyncShardCR() override {
    // abort (not go_down) so an in-flight lease renewal stops immediately
    if (lease_cr) {
      lease_cr->abort();
    }
  }

  // Called from the control CR's wakeup path to queue notification entries.
  void append_modified_shards(bc::flat_set<rgw_data_notify_entry>& entries) {
    std::lock_guard l{inc_lock};
    modified_shards.insert(entries.begin(), entries.end());
  }

  int operate(const DoutPrefixProvider *dpp) override {
    reenter(this) {
      // 1) take the shard lease; sleep/retry until locked or failed
      yield init_lease_cr();
      while (!lease_cr->is_locked()) {
        if (lease_cr->is_done()) {
          tn->log(5, "failed to take lease");
          set_status("lease lock failed, early abort");
          drain_all();
          return set_cr_error(lease_cr->get_ret_status());
        }
        set_sleeping(true);
        yield;
      }
      *reset_backoff = true;
      tn->log(10, "took lease");
      /* Reread data sync status to fetch latest marker and objv */
      objv.clear();
      yield call(new RGWSimpleRadosReadCR<rgw_data_sync_marker>(sync_env->dpp, sync_env->driver,
                                                                rgw_raw_obj(pool, status_oid),
                                                                &sync_marker, true, &objv));
      if (retcode < 0) {
        lease_cr->go_down();
        drain_all();
        return set_cr_error(retcode);
      }

      // 2) dispatch on the persisted state; full sync transitions into
      //    incremental sync by updating sync_marker.state on completion
      while (true) {
        if (sync_marker.state == rgw_data_sync_marker::FullSync) {
          yield call(new RGWDataFullSyncShardCR(sc, pool, shard_id,
                                                sync_marker, tn,
                                                status_oid, error_repo,
                                                lease_cr, sync_status,
                                                objv, bucket_shard_cache));
          if (retcode < 0) {
            if (retcode != -EBUSY) {
              tn->log(10, SSTR("full sync failed (retcode=" << retcode << ")"));
            }
            lease_cr->go_down();
            drain_all();
            return set_cr_error(retcode);
          }
        } else if (sync_marker.state == rgw_data_sync_marker::IncrementalSync) {
          yield call(new RGWDataIncSyncShardCR(sc, pool, shard_id,
                                               sync_marker, tn,
                                               status_oid, error_repo,
                                               lease_cr, sync_status,
                                               objv, bucket_shard_cache,
                                               inc_lock, modified_shards));
          if (retcode < 0) {
            if (retcode != -EBUSY) {
              tn->log(10, SSTR("incremental sync failed (retcode=" << retcode
                               << ")"));
            }
            lease_cr->go_down();
            drain_all();
            return set_cr_error(retcode);
          }
        } else {
          // unknown/corrupt marker state: give up rather than spin
          lease_cr->go_down();
          drain_all();
          return set_cr_error(-EIO);
        }
      }
    }
    return 0;
  }

  // (Re)spawn the continuous lease coroutine for this shard's status object.
  void init_lease_cr() {
    set_status("acquiring sync lock");
    uint32_t lock_duration = cct->_conf->rgw_sync_lease_period;
    string lock_name = "sync_lock";
    if (lease_cr) {
      lease_cr->abort();
    }
    auto driver = sync_env->driver;
    lease_cr.reset(new RGWContinuousLeaseCR(sync_env->async_rados, driver,
                                            rgw_raw_obj(pool, status_oid),
                                            lock_name, lock_duration, this,
                                            &sc->lcc));
    lease_stack.reset(spawn(lease_cr.get(), false));
  }
};
+
+class RGWDataSyncShardControlCR : public RGWBackoffControlCR {
+ RGWDataSyncCtx *sc;
+ RGWDataSyncEnv *sync_env;
+
+ rgw_pool pool;
+
+ uint32_t shard_id;
+ rgw_data_sync_marker sync_marker;
+ rgw_data_sync_status sync_status;
+
+ RGWSyncTraceNodeRef tn;
+ RGWObjVersionTracker& objv;
+public:
+ RGWDataSyncShardControlCR(RGWDataSyncCtx *_sc, const rgw_pool& _pool,
+ uint32_t _shard_id, rgw_data_sync_marker& _marker,
+ const rgw_data_sync_status& sync_status,
+ RGWObjVersionTracker& objv,
+ RGWSyncTraceNodeRef& _tn_parent)
+ : RGWBackoffControlCR(_sc->cct, false),
+ sc(_sc), sync_env(_sc->env),
+ pool(_pool),
+ shard_id(_shard_id),
+ sync_marker(_marker), objv(objv) {
+ tn = sync_env->sync_tracer->add_node(_tn_parent, "shard", std::to_string(shard_id));
+ }
+
+ RGWCoroutine *alloc_cr() override {
+ return new RGWDataSyncShardCR(sc, pool, shard_id, sync_marker, sync_status, tn, objv, backoff_ptr());
+ }
+
+ RGWCoroutine *alloc_finisher_cr() override {
+ return new RGWSimpleRadosReadCR<rgw_data_sync_marker>(sync_env->dpp, sync_env->driver,
+ rgw_raw_obj(sync_env->svc->zone->get_zone_params().log_pool, RGWDataSyncStatusManager::shard_obj_name(sc->source_zone, shard_id)),
+ &sync_marker, true, &objv);
+ }
+
+ void append_modified_shards(bc::flat_set<rgw_data_notify_entry>& keys) {
+ std::lock_guard l{cr_lock()};
+
+ RGWDataSyncShardCR *cr = static_cast<RGWDataSyncShardCR *>(get_cr());
+ if (!cr) {
+ return;
+ }
+
+ cr->append_modified_shards(keys);
+ }
+};
+
// Top-level data sync coroutine for one source zone. Walks the sync state
// machine (Init -> BuildingFullSyncMaps -> Sync), holding the status lease
// for the first two phases, then spawns one RGWDataSyncShardControlCR per
// data log shard and lets them run under their own per-shard leases.
class RGWDataSyncCR : public RGWCoroutine {
  RGWDataSyncCtx *sc;
  RGWDataSyncEnv *sync_env;
  uint32_t num_shards;

  rgw_data_sync_status sync_status;
  std::vector<RGWObjVersionTracker> objvs; // one per shard status object

  // protects shard_crs; wakeup() runs outside the coroutine thread
  ceph::mutex shard_crs_lock =
    ceph::make_mutex("RGWDataSyncCR::shard_crs_lock");
  map<int, RGWDataSyncShardControlCR *> shard_crs;

  bool *reset_backoff;

  RGWSyncTraceNodeRef tn;

  RGWDataSyncModule *data_sync_module{nullptr};

  // lease over the whole sync-status object; only held until shards spawn
  boost::intrusive_ptr<RGWContinuousLeaseCR> init_lease;
  boost::intrusive_ptr<RGWCoroutinesStack> lease_stack;

  RGWObjVersionTracker obj_version;
public:
  RGWDataSyncCR(RGWDataSyncCtx *_sc, uint32_t _num_shards, RGWSyncTraceNodeRef& _tn, bool *_reset_backoff) : RGWCoroutine(_sc->cct),
                                                                                                             sc(_sc), sync_env(_sc->env),
                                                                                                             num_shards(_num_shards),
                                                                                                             reset_backoff(_reset_backoff), tn(_tn) {

  }

  ~RGWDataSyncCR() override {
    // drop the extra refs taken before spawn() in operate()
    for (auto iter : shard_crs) {
      iter.second->put();
    }
    if (init_lease) {
      init_lease->abort();
    }
  }

  int operate(const DoutPrefixProvider *dpp) override {
    reenter(this) {

      /* read sync status */
      yield call(new RGWReadDataSyncStatusCoroutine(sc, &sync_status,
                                                    &obj_version, objvs));

      data_sync_module = sync_env->sync_module->get_data_handler();

      // -ENOENT just means sync has never run; anything else is fatal
      if (retcode < 0 && retcode != -ENOENT) {
        tn->log(0, SSTR("ERROR: failed to fetch sync status, retcode=" << retcode));
        return set_cr_error(retcode);
      }

      // Not yet in steady-state sync: take the status lease before touching
      // the status object, then re-read it under the lock.
      if ((rgw_data_sync_info::SyncState)sync_status.sync_info.state !=
          rgw_data_sync_info::StateSync) {
        init_lease.reset(
          RGWInitDataSyncStatusCoroutine::continuous_lease_cr(sc, this));
        yield lease_stack.reset(spawn(init_lease.get(), false));

        while (!init_lease->is_locked()) {
          if (init_lease->is_done()) {
            tn->log(5, "ERROR: failed to take data sync status lease");
            set_status("lease lock failed, early abort");
            drain_all();
            return set_cr_error(init_lease->get_ret_status());
          }
          tn->log(5, "waiting on data sync status lease");
          yield set_sleeping(true);
        }
        tn->log(5, "acquired data sync status lease");

        // Reread sync status now that we've acquired the lock!
        obj_version.clear();
        yield call(new RGWReadDataSyncStatusCoroutine(sc, &sync_status, &obj_version, objvs));
        if (retcode < 0) {
          tn->log(0, SSTR("ERROR: failed to fetch sync status, retcode=" << retcode));
          return set_cr_error(retcode);
        }
      }

      /* state: init status */
      if ((rgw_data_sync_info::SyncState)sync_status.sync_info.state == rgw_data_sync_info::StateInit) {
        tn->log(20, SSTR("init"));
        sync_status.sync_info.num_shards = num_shards;
        uint64_t instance_id;
        instance_id = ceph::util::generate_random_number<uint64_t>();
        yield call(new RGWInitDataSyncStatusCoroutine(sc, num_shards, instance_id, tn,
                                                      &sync_status, init_lease, obj_version, objvs));
        if (retcode < 0) {
          tn->log(0, SSTR("ERROR: failed to init sync, retcode=" << retcode));
          init_lease->go_down();
          drain_all();
          return set_cr_error(retcode);
        }
        // sets state = StateBuildingFullSyncMaps

        *reset_backoff = true;
      }

      data_sync_module->init(sc, sync_status.sync_info.instance_id);

      if ((rgw_data_sync_info::SyncState)sync_status.sync_info.state == rgw_data_sync_info::StateBuildingFullSyncMaps) {
        tn->log(10, SSTR("building full sync maps"));
        /* call sync module init here */
        sync_status.sync_info.num_shards = num_shards;
        yield call(data_sync_module->init_sync(dpp, sc));
        if (retcode < 0) {
          tn->log(0, SSTR("ERROR: sync module init_sync() failed, retcode=" << retcode));
          return set_cr_error(retcode);
        }

        // lease may have lapsed during init_sync; bail rather than write
        if (!init_lease->is_locked()) {
          init_lease->go_down();
          drain_all();
          return set_cr_error(-ECANCELED);
        }
        /* state: building full sync maps */
        yield call(new RGWListBucketIndexesCR(sc, &sync_status, objvs));
        if (retcode < 0) {
          tn->log(0, SSTR("ERROR: failed to build full sync maps, retcode=" << retcode));
          return set_cr_error(retcode);
        }
        sync_status.sync_info.state = rgw_data_sync_info::StateSync;

        if (!init_lease->is_locked()) {
          init_lease->go_down();
          drain_all();
          return set_cr_error(-ECANCELED);
        }
        /* update new state */
        yield call(set_sync_info_cr());
        if (retcode < 0) {
          tn->log(0, SSTR("ERROR: failed to write sync status, retcode=" << retcode));
          return set_cr_error(retcode);
        }

        *reset_backoff = true;
      }

      yield call(data_sync_module->start_sync(dpp, sc));
      if (retcode < 0) {
        tn->log(0, SSTR("ERROR: failed to start sync, retcode=" << retcode));
        return set_cr_error(retcode);
      }

      if ((rgw_data_sync_info::SyncState)sync_status.sync_info.state == rgw_data_sync_info::StateSync) {
        // steady state: release the status lease, shards lock individually
        if (init_lease) {
          init_lease->go_down();
          drain_all();
          init_lease.reset();
          lease_stack.reset();
        }
        yield {
          tn->log(10, SSTR("spawning " << num_shards << " shards sync"));
          for (map<uint32_t, rgw_data_sync_marker>::iterator iter = sync_status.sync_markers.begin();
               iter != sync_status.sync_markers.end(); ++iter) {
            RGWDataSyncShardControlCR *cr = new RGWDataSyncShardControlCR(sc, sync_env->svc->zone->get_zone_params().log_pool,
                                                                          iter->first, iter->second, sync_status, objvs[iter->first], tn);
            // extra ref so wakeup() can reach the CR until our destructor
            cr->get();
            shard_crs_lock.lock();
            shard_crs[iter->first] = cr;
            shard_crs_lock.unlock();
            spawn(cr, true);
          }
        }
      }

      return set_cr_done();
    }
    return 0;
  }

  // Persist sync_status.sync_info guarded by obj_version.
  RGWCoroutine *set_sync_info_cr() {
    return new RGWSimpleRadosWriteCR<rgw_data_sync_info>(sync_env->dpp, sync_env->driver,
                                                         rgw_raw_obj(sync_env->svc->zone->get_zone_params().log_pool, RGWDataSyncStatusManager::sync_status_oid(sc->source_zone)),
                                                         sync_status.sync_info, &obj_version);
  }

  // Route a data-changed notification to the matching shard control CR.
  void wakeup(int shard_id, bc::flat_set<rgw_data_notify_entry>& entries) {
    std::lock_guard l{shard_crs_lock};
    map<int, RGWDataSyncShardControlCR *>::iterator iter = shard_crs.find(shard_id);
    if (iter == shard_crs.end()) {
      return;
    }
    iter->second->append_modified_shards(entries);
    iter->second->wakeup();
  }
};
+
// Default data sync handler: replicates the source zone's objects verbatim
// (fetch on write, delete on delete, delete-marker on delete-marker).
class RGWDefaultDataSyncModule : public RGWDataSyncModule {
public:
  RGWDefaultDataSyncModule() {}

  // Fetch one object (and version) from the source zone into the dest bucket.
  RGWCoroutine *sync_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc,
                            rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key,
                            std::optional<uint64_t> versioned_epoch,
                            const rgw_zone_set_entry& source_trace_entry,
                            rgw_zone_set *zones_trace) override;
  // Mirror a source-side object removal on the destination.
  RGWCoroutine *remove_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override;
  // Reproduce a source-side delete marker (versioned buckets).
  RGWCoroutine *create_delete_marker(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime,
                                     rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override;
};
+
// Instance wrapper exposing the default data handler to the sync machinery.
class RGWDefaultSyncModuleInstance : public RGWSyncModuleInstance {
  RGWDefaultDataSyncModule data_handler;
public:
  RGWDefaultSyncModuleInstance() {}
  RGWDataSyncModule *get_data_handler() override {
    return &data_handler;
  }
  // Zones running the default module accept regular user writes.
  bool supports_user_writes() override {
    return true;
  }
};
+
+int RGWDefaultSyncModule::create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance)
+{
+ instance->reset(new RGWDefaultSyncModuleInstance());
+ return 0;
+}
+
// Resolves a user's identity and ACLs asynchronously (via RGWGenericAsyncCR)
// so that user-mode sync pipes can enforce source/dest permissions without
// blocking the coroutine threads. Usage: construct, yield on init_cr(), then
// build per-bucket Bucket permission checkers with init_bucket().
class RGWUserPermHandler {
  friend struct Init;
  friend class Bucket;

  RGWDataSyncEnv *sync_env;
  rgw_user uid;

  // data filled in by the async Init action and shared with Bucket checkers
  struct _info {
    RGWUserInfo user_info;
    rgw::IAM::Environment env;
    std::unique_ptr<rgw::auth::Identity> identity;
    RGWAccessControlPolicy user_acl;
  };

  std::shared_ptr<_info> info;

  struct Init;

  std::shared_ptr<Init> init_action;

  // Blocking action run on the async_rados thread pool: loads user info,
  // builds an identity, and decodes the user's ACL from its attrs.
  struct Init : public RGWGenericAsyncCR::Action {
    RGWDataSyncEnv *sync_env;

    rgw_user uid;
    std::shared_ptr<RGWUserPermHandler::_info> info;

    int ret{0};

    Init(RGWUserPermHandler *handler) : sync_env(handler->sync_env),
                                        uid(handler->uid),
                                        info(handler->info) {}
    int operate() override {
      auto user_ctl = sync_env->driver->getRados()->ctl.user;

      ret = user_ctl->get_info_by_uid(sync_env->dpp, uid, &info->user_info, null_yield);
      if (ret < 0) {
        return ret;
      }

      info->identity = rgw::auth::transform_old_authinfo(sync_env->cct,
                                                         uid,
                                                         RGW_PERM_FULL_CONTROL,
                                                         false, /* system_request? */
                                                         TYPE_RGW);

      map<string, bufferlist> uattrs;

      ret = user_ctl->get_attrs_by_uid(sync_env->dpp, uid, &uattrs, null_yield);
      if (ret == 0) {
        ret = RGWUserPermHandler::policy_from_attrs(sync_env->cct, uattrs, &info->user_acl);
      }
      if (ret == -ENOENT) {
        // no ACL attr stored: fall back to a default owner-only policy
        info->user_acl.create_default(uid, info->user_info.display_name);
      }

      // NOTE(review): attr-read/decode failures other than -ENOENT are
      // swallowed here (operate() still returns 0, leaving user_acl
      // default-constructed) — confirm whether this best-effort behavior
      // is intentional
      return 0;
    }
  };

public:
  RGWUserPermHandler(RGWDataSyncEnv *_sync_env,
                     const rgw_user& _uid) : sync_env(_sync_env),
                                             uid(_uid) {}

  // Returns a coroutine that performs the blocking Init work off-thread.
  RGWCoroutine *init_cr() {
    info = make_shared<_info>();
    init_action = make_shared<Init>(this);

    return new RGWGenericAsyncCR(sync_env->cct,
                                 sync_env->async_rados,
                                 init_action);
  }

  // Per-bucket permission checker bound to the handler's resolved identity.
  class Bucket {
    RGWDataSyncEnv *sync_env;
    std::shared_ptr<_info> info;
    RGWAccessControlPolicy bucket_acl;
    std::optional<perm_state> ps;
  public:
    Bucket() {}

    int init(RGWUserPermHandler *handler,
             const RGWBucketInfo& bucket_info,
             const map<string, bufferlist>& bucket_attrs);

    bool verify_bucket_permission(int perm);
    bool verify_object_permission(const map<string, bufferlist>& obj_attrs,
                                  int perm);
  };

  // Decode an RGWAccessControlPolicy out of an attribute map.
  // Returns -ENOENT if no ACL attr is present, -EIO on decode failure.
  static int policy_from_attrs(CephContext *cct,
                               const map<string, bufferlist>& attrs,
                               RGWAccessControlPolicy *acl) {
    acl->set_ctx(cct);

    auto aiter = attrs.find(RGW_ATTR_ACL);
    if (aiter == attrs.end()) {
      return -ENOENT;
    }
    auto iter = aiter->second.begin();
    try {
      acl->decode(iter);
    } catch (buffer::error& err) {
      ldout(cct, 0) << "ERROR: " << __func__ << "(): could not decode policy, caught buffer::error" << dendl;
      return -EIO;
    }

    return 0;
  }

  // Bind a Bucket checker to this handler's user for the given bucket.
  int init_bucket(const RGWBucketInfo& bucket_info,
                  const map<string, bufferlist>& bucket_attrs,
                  Bucket *bs) {
    return bs->init(this, bucket_info, bucket_attrs);
  }
};
+
// Bind this Bucket checker to a handler's resolved user identity and the
// target bucket: decode the bucket ACL from its attrs and build the
// perm_state used by the verify_* checks. Returns 0 on success or the
// negative error from ACL decoding.
int RGWUserPermHandler::Bucket::init(RGWUserPermHandler *handler,
                                     const RGWBucketInfo& bucket_info,
                                     const map<string, bufferlist>& bucket_attrs)
{
  sync_env = handler->sync_env;
  info = handler->info;

  int r = RGWUserPermHandler::policy_from_attrs(sync_env->cct, bucket_attrs, &bucket_acl);
  if (r < 0) {
    return r;
  }

  ps.emplace(sync_env->cct,
             info->env,
             info->identity.get(),
             bucket_info,
             info->identity->get_perm_mask(),
             false, /* defer to bucket acls */
             nullptr, /* referer */
             false); /* request_payer */

  return 0;
}
+
// Check whether the bound user holds `perm` on the bucket, using only the
// user and bucket ACLs (no bucket policy evaluation).
bool RGWUserPermHandler::Bucket::verify_bucket_permission(int perm)
{
  return verify_bucket_permission_no_policy(sync_env->dpp,
                                            &(*ps),
                                            &info->user_acl,
                                            &bucket_acl,
                                            perm);
}
+
+bool RGWUserPermHandler::Bucket::verify_object_permission(const map<string, bufferlist>& obj_attrs,
+ int perm)
+{
+ RGWAccessControlPolicy obj_acl;
+
+ int r = policy_from_attrs(sync_env->cct, obj_attrs, &obj_acl);
+ if (r < 0) {
+ return r;
+ }
+
+ return verify_bucket_permission_no_policy(sync_env->dpp,
+ &(*ps),
+ &bucket_acl,
+ &obj_acl,
+ perm);
+}
+
// Fetch filter applied while pulling an object from the source zone:
// enforces the sync pipe's policy (ACL translation, user-mode permission
// checks, storage-class override) on top of the default filter behavior.
class RGWFetchObjFilter_Sync : public RGWFetchObjFilter_Default {
  rgw_bucket_sync_pipe sync_pipe;

  // source-bucket permission checker (user-mode pipes only; may hold null)
  std::shared_ptr<RGWUserPermHandler::Bucket> bucket_perms;
  // dest params resolved before the fetch; a mismatch at filter time means
  // the pipe config raced with us and the fetch must be retried
  std::optional<rgw_sync_pipe_dest_params> verify_dest_params;

  std::optional<ceph::real_time> mtime;
  std::optional<string> etag;
  std::optional<uint64_t> obj_size;

  std::unique_ptr<rgw::auth::Identity> identity;

  // shared with the owning RGWObjFetchCR; set to true to request a retry
  std::shared_ptr<bool> need_retry;

public:
  RGWFetchObjFilter_Sync(rgw_bucket_sync_pipe& _sync_pipe,
                         std::shared_ptr<RGWUserPermHandler::Bucket>& _bucket_perms,
                         std::optional<rgw_sync_pipe_dest_params>&& _verify_dest_params,
                         std::shared_ptr<bool>& _need_retry) : sync_pipe(_sync_pipe),
                                                               bucket_perms(_bucket_perms),
                                                               verify_dest_params(std::move(_verify_dest_params)),
                                                               need_retry(_need_retry) {
    // reset the shared flag so a previous attempt's value can't leak through
    *need_retry = false;
  }

  int filter(CephContext *cct,
             const rgw_obj_key& source_key,
             const RGWBucketInfo& dest_bucket_info,
             std::optional<rgw_placement_rule> dest_placement_rule,
             const map<string, bufferlist>& obj_attrs,
             std::optional<rgw_user> *poverride_owner,
             const rgw_placement_rule **prule) override;
};
+
// Apply sync-pipe policy to a fetched object before it is written locally:
// resolve per-object pipe params (tags included), detect config races,
// apply ACL-translation ownership, enforce user-mode read permission, and
// override the destination storage class when configured.
int RGWFetchObjFilter_Sync::filter(CephContext *cct,
                                   const rgw_obj_key& source_key,
                                   const RGWBucketInfo& dest_bucket_info,
                                   std::optional<rgw_placement_rule> dest_placement_rule,
                                   const map<string, bufferlist>& obj_attrs,
                                   std::optional<rgw_user> *poverride_owner,
                                   const rgw_placement_rule **prule)
{
  int abort_err = -ERR_PRECONDITION_FAILED;

  rgw_sync_pipe_params params;

  RGWObjTags obj_tags;

  // decode the object's tag set (if any); decode failure is tolerated and
  // simply leaves obj_tags empty
  auto iter = obj_attrs.find(RGW_ATTR_TAGS);
  if (iter != obj_attrs.end()) {
    try {
      auto it = iter->second.cbegin();
      obj_tags.decode(it);
    } catch (buffer::error &err) {
      ldout(cct, 0) << "ERROR: " << __func__ << ": caught buffer::error couldn't decode TagSet " << dendl;
    }
  }

  // no matching pipe rule for this object => don't sync it
  if (!sync_pipe.info.handler.find_obj_params(source_key,
                                              obj_tags.get_tags(),
                                              &params)) {
    return abort_err;
  }

  if (verify_dest_params &&
      !(*verify_dest_params == params.dest)) {
    /* raced! original dest params were different, will need to retry */
    ldout(cct, 0) << "WARNING: " << __func__ << ": pipe dest params are different than original params, must have raced with object rewrite, retrying" << dendl;
    *need_retry = true;
    return -ECANCELED;
  }

  std::optional<std::map<string, bufferlist> > new_attrs;

  // ACL translation: rewrite object ownership to the configured owner;
  // in user mode this is only allowed for the dest bucket owner
  if (params.dest.acl_translation) {
    rgw_user& acl_translation_owner = params.dest.acl_translation->owner;
    if (!acl_translation_owner.empty()) {
      if (params.mode == rgw_sync_pipe_params::MODE_USER &&
          acl_translation_owner != dest_bucket_info.owner) {
        ldout(cct, 0) << "ERROR: " << __func__ << ": acl translation was requested, but user (" << acl_translation_owner
                      << ") is not dest bucket owner (" << dest_bucket_info.owner << ")" << dendl;
        return -EPERM;
      }
      *poverride_owner = acl_translation_owner;
    }
  }
  // user-mode pipe: the acting user must be able to read the source object
  if (params.mode == rgw_sync_pipe_params::MODE_USER) {
    if (!bucket_perms->verify_object_permission(obj_attrs, RGW_PERM_READ)) {
      ldout(cct, 0) << "ERROR: " << __func__ << ": permission check failed: user not allowed to fetch object" << dendl;
      return -EPERM;
    }
  }

  // storage-class override; dest_rule is a member inherited from
  // RGWFetchObjFilter_Default (declared outside this view) so *prule
  // remains valid after this function returns — confirm against base class
  if (!dest_placement_rule &&
      params.dest.storage_class) {
    dest_rule.storage_class = *params.dest.storage_class;
    dest_rule.inherit_from(dest_bucket_info.placement_rule);
    dest_placement_rule = dest_rule;
    *prule = &dest_rule;
  }

  return RGWFetchObjFilter_Default::filter(cct,
                                           source_key,
                                           dest_bucket_info,
                                           dest_placement_rule,
                                           obj_attrs,
                                           poverride_owner,
                                           prule);
}
+
+class RGWObjFetchCR : public RGWCoroutine {
+ RGWDataSyncCtx *sc;
+ RGWDataSyncEnv *sync_env;
+ rgw_bucket_sync_pipe& sync_pipe;
+ rgw_obj_key& key;
+ std::optional<rgw_obj_key> dest_key;
+ std::optional<uint64_t> versioned_epoch;
+ const rgw_zone_set_entry& source_trace_entry;
+ rgw_zone_set *zones_trace;
+
+ bool need_more_info{false};
+ bool check_change{false};
+
+ ceph::real_time src_mtime;
+ uint64_t src_size;
+ string src_etag;
+ map<string, bufferlist> src_attrs;
+ map<string, string> src_headers;
+
+ std::optional<rgw_user> param_user;
+ rgw_sync_pipe_params::Mode param_mode;
+
+ std::optional<RGWUserPermHandler> user_perms;
+ std::shared_ptr<RGWUserPermHandler::Bucket> source_bucket_perms;
+ RGWUserPermHandler::Bucket dest_bucket_perms;
+
+ std::optional<rgw_sync_pipe_dest_params> dest_params;
+
+ int try_num{0};
+ std::shared_ptr<bool> need_retry;
+public:
+ RGWObjFetchCR(RGWDataSyncCtx *_sc,
+ rgw_bucket_sync_pipe& _sync_pipe,
+ rgw_obj_key& _key,
+ std::optional<rgw_obj_key> _dest_key,
+ std::optional<uint64_t> _versioned_epoch,
+ const rgw_zone_set_entry& source_trace_entry,
+ rgw_zone_set *_zones_trace) : RGWCoroutine(_sc->cct),
+ sc(_sc), sync_env(_sc->env),
+ sync_pipe(_sync_pipe),
+ key(_key),
+ dest_key(_dest_key),
+ versioned_epoch(_versioned_epoch),
+ source_trace_entry(source_trace_entry),
+ zones_trace(_zones_trace) {
+ }
+
+
+ int operate(const DoutPrefixProvider *dpp) override {
+ reenter(this) {
+
+#define MAX_RACE_RETRIES_OBJ_FETCH 10
+ for (try_num = 0; try_num < MAX_RACE_RETRIES_OBJ_FETCH; ++try_num) {
+
+ {
+ std::optional<rgw_user> param_acl_translation;
+ std::optional<string> param_storage_class;
+
+ if (!sync_pipe.info.handler.find_basic_info_without_tags(key,
+ &param_user,
+ &param_acl_translation,
+ &param_storage_class,
+ &param_mode,
+ &need_more_info)) {
+ if (!need_more_info) {
+ return set_cr_error(-ERR_PRECONDITION_FAILED);
+ }
+ }
+ }
+
+ if (need_more_info) {
+ ldout(cct, 20) << "Could not determine exact policy rule for obj=" << key << ", will read source object attributes" << dendl;
+ /*
+ * we need to fetch info about source object, so that we can determine
+ * the correct policy configuration. This can happen if there are multiple
+ * policy rules, and some depend on the object tagging */
+ yield call(new RGWStatRemoteObjCR(sync_env->async_rados,
+ sync_env->driver,
+ sc->source_zone,
+ sync_pipe.info.source_bs.bucket,
+ key,
+ &src_mtime,
+ &src_size,
+ &src_etag,
+ &src_attrs,
+ &src_headers));
+ if (retcode < 0) {
+ return set_cr_error(retcode);
+ }
+
+ RGWObjTags obj_tags;
+
+ auto iter = src_attrs.find(RGW_ATTR_TAGS);
+ if (iter != src_attrs.end()) {
+ try {
+ auto it = iter->second.cbegin();
+ obj_tags.decode(it);
+ } catch (buffer::error &err) {
+ ldout(cct, 0) << "ERROR: " << __func__ << ": caught buffer::error couldn't decode TagSet " << dendl;
+ }
+ }
+
+ rgw_sync_pipe_params params;
+ if (!sync_pipe.info.handler.find_obj_params(key,
+ obj_tags.get_tags(),
+ &params)) {
+ return set_cr_error(-ERR_PRECONDITION_FAILED);
+ }
+
+ param_user = params.user;
+ param_mode = params.mode;
+
+ dest_params = params.dest;
+ }
+
+ if (param_mode == rgw_sync_pipe_params::MODE_USER) {
+ if (!param_user) {
+ ldout(cct, 20) << "ERROR: " << __func__ << ": user level sync but user param not set" << dendl;
+ return set_cr_error(-EPERM);
+ }
+ user_perms.emplace(sync_env, *param_user);
+
+ yield call(user_perms->init_cr());
+ if (retcode < 0) {
+ ldout(cct, 20) << "ERROR: " << __func__ << ": failed to init user perms manager for uid=" << *param_user << dendl;
+ return set_cr_error(retcode);
+ }
+
+ /* verify that user is allowed to write at the target bucket */
+ int r = user_perms->init_bucket(sync_pipe.dest_bucket_info,
+ sync_pipe.dest_bucket_attrs,
+ &dest_bucket_perms);
+ if (r < 0) {
+ ldout(cct, 20) << "ERROR: " << __func__ << ": failed to init bucket perms manager for uid=" << *param_user << " bucket=" << sync_pipe.source_bucket_info.bucket.get_key() << dendl;
+ return set_cr_error(retcode);
+ }
+
+ if (!dest_bucket_perms.verify_bucket_permission(RGW_PERM_WRITE)) {
+ ldout(cct, 0) << "ERROR: " << __func__ << ": permission check failed: user not allowed to write into bucket (bucket=" << sync_pipe.info.dest_bucket.get_key() << ")" << dendl;
+ return -EPERM;
+ }
+
+ /* init source bucket permission structure */
+ source_bucket_perms = make_shared<RGWUserPermHandler::Bucket>();
+ r = user_perms->init_bucket(sync_pipe.source_bucket_info,
+ sync_pipe.source_bucket_attrs,
+ source_bucket_perms.get());
+ if (r < 0) {
+ ldout(cct, 20) << "ERROR: " << __func__ << ": failed to init bucket perms manager for uid=" << *param_user << " bucket=" << sync_pipe.source_bucket_info.bucket.get_key() << dendl;
+ return set_cr_error(retcode);
+ }
+ }
+
+ yield {
+ if (!need_retry) {
+ need_retry = make_shared<bool>();
+ }
+ auto filter = make_shared<RGWFetchObjFilter_Sync>(sync_pipe,
+ source_bucket_perms,
+ std::move(dest_params),
+ need_retry);
+
+ call(new RGWFetchRemoteObjCR(sync_env->async_rados, sync_env->driver, sc->source_zone,
+ nullopt,
+ sync_pipe.info.source_bs.bucket,
+ std::nullopt, sync_pipe.dest_bucket_info,
+ key, dest_key, versioned_epoch,
+ true,
+ std::static_pointer_cast<RGWFetchObjFilter>(filter),
+ source_trace_entry, zones_trace,
+ sync_env->counters, dpp));
+ }
+ if (retcode < 0) {
+ if (*need_retry) {
+ continue;
+ }
+ return set_cr_error(retcode);
+ }
+
+ return set_cr_done();
+ }
+
+ ldout(cct, 0) << "ERROR: " << __func__ << ": Too many retries trying to fetch object, possibly a bug: bucket=" << sync_pipe.source_bucket_info.bucket.get_key() << " key=" << key << dendl;
+
+ return set_cr_error(-EIO);
+ }
+ return 0;
+ }
+};
+
+RGWCoroutine *RGWDefaultDataSyncModule::sync_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc,
+ rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key,
+ std::optional<uint64_t> versioned_epoch,
+ const rgw_zone_set_entry& source_trace_entry,
+ rgw_zone_set *zones_trace)
+{
+ return new RGWObjFetchCR(sc, sync_pipe, key, std::nullopt, versioned_epoch,
+ source_trace_entry, zones_trace);
+}
+
+RGWCoroutine *RGWDefaultDataSyncModule::remove_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key,
+ real_time& mtime, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace)
+{
+ auto sync_env = sc->env;
+ return new RGWRemoveObjCR(sync_env->dpp, sync_env->async_rados, sync_env->driver, sc->source_zone,
+ sync_pipe.dest_bucket_info, key, versioned, versioned_epoch,
+ NULL, NULL, false, &mtime, zones_trace);
+}
+
+RGWCoroutine *RGWDefaultDataSyncModule::create_delete_marker(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime,
+ rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace)
+{
+ auto sync_env = sc->env;
+ return new RGWRemoveObjCR(sync_env->dpp, sync_env->async_rados, sync_env->driver, sc->source_zone,
+ sync_pipe.dest_bucket_info, key, versioned, versioned_epoch,
+ &owner.id, &owner.display_name, true, &mtime, zones_trace);
+}
+
// Archive-zone data sync handler: keeps every version of every object and
// never deletes — overrides the default module's delete paths to no-ops
// (remove) or delete markers only.
class RGWArchiveDataSyncModule : public RGWDefaultDataSyncModule {
public:
  RGWArchiveDataSyncModule() {}

  // Fetch an object, forcing versioning on the destination bucket.
  RGWCoroutine *sync_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc,
                            rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key,
                            std::optional<uint64_t> versioned_epoch,
                            const rgw_zone_set_entry& source_trace_entry,
                            rgw_zone_set *zones_trace) override;
  // Archive zones never delete: returns no coroutine.
  RGWCoroutine *remove_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override;
  // Record the deletion as a delete marker instead of removing data.
  RGWCoroutine *create_delete_marker(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime,
                                     rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override;
};
+
// Instance wrapper for the archive zone: archive data handler plus custom
// bucket metadata handlers that preserve deleted buckets' instances.
class RGWArchiveSyncModuleInstance : public RGWDefaultSyncModuleInstance {
  RGWArchiveDataSyncModule data_handler;
public:
  RGWArchiveSyncModuleInstance() {}
  RGWDataSyncModule *get_data_handler() override {
    return &data_handler;
  }
  RGWMetadataHandler *alloc_bucket_meta_handler() override {
    return RGWArchiveBucketMetaHandlerAllocator::alloc();
  }
  RGWBucketInstanceMetadataHandlerBase *alloc_bucket_instance_meta_handler(rgw::sal::Driver* driver) override {
    return RGWArchiveBucketInstanceMetaHandlerAllocator::alloc(driver);
  }
};
+
+int RGWArchiveSyncModule::create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance)
+{
+ instance->reset(new RGWArchiveSyncModuleInstance());
+ return 0;
+}
+
// Archive module: fetch an object while guaranteeing the destination bucket
// is versioned (enabling versioning on the fly if needed), and generate a
// fresh version instance for null-versioned source keys so every sync
// produces a distinct archived version.
RGWCoroutine *RGWArchiveDataSyncModule::sync_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc,
                                                    rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key,
                                                    std::optional<uint64_t> versioned_epoch,
                                                    const rgw_zone_set_entry& source_trace_entry,
                                                    rgw_zone_set *zones_trace)
{
  auto sync_env = sc->env;
  ldout(sc->cct, 5) << "SYNC_ARCHIVE: sync_object: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " versioned_epoch=" << versioned_epoch.value_or(0) << dendl;
  // force-enable versioning on the archive bucket before the first write
  if (!sync_pipe.dest_bucket_info.versioned() ||
      (sync_pipe.dest_bucket_info.flags & BUCKET_VERSIONS_SUSPENDED)) {
    ldout(sc->cct, 0) << "SYNC_ARCHIVE: sync_object: enabling object versioning for archive bucket" << dendl;
    sync_pipe.dest_bucket_info.flags = (sync_pipe.dest_bucket_info.flags & ~BUCKET_VERSIONS_SUSPENDED) | BUCKET_VERSIONED;
    int op_ret = sync_env->driver->getRados()->put_bucket_instance_info(sync_pipe.dest_bucket_info, false, real_time(), NULL, sync_env->dpp, null_yield);
    if (op_ret < 0) {
      ldpp_dout(sync_env->dpp, 0) << "SYNC_ARCHIVE: sync_object: error versioning archive bucket" << dendl;
      return NULL;
    }
  }

  std::optional<rgw_obj_key> dest_key;

  if (versioned_epoch.value_or(0) == 0) { /* force version if not set */
    versioned_epoch = 0;
    dest_key = key;
  }

  // null-instance source keys get a random instance so each archived copy
  // is a distinct version on the destination
  if (key.instance.empty()) {
    dest_key = key;
    sync_env->driver->getRados()->gen_rand_obj_instance_name(&(*dest_key));
  }

  return new RGWObjFetchCR(sc, sync_pipe, key, dest_key, versioned_epoch,
                           source_trace_entry, zones_trace);
}
+
+/// Archive-zone handling of a source-side object removal: log the event and
+/// return no coroutine, i.e. the delete is not applied locally (the archive
+/// zone keeps the object). A null return tells the caller there is nothing
+/// to schedule.
+RGWCoroutine *RGWArchiveDataSyncModule::remove_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key,
+                                                      real_time& mtime, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace)
+{
+  ldout(sc->cct, 0) << "SYNC_ARCHIVE: remove_object: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " versioned_epoch=" << versioned_epoch << dendl;
+  return nullptr;  // nullptr (not NULL): no removal coroutine is spawned
+}
+
+/// Archive-zone handling of a delete marker: rather than dropping data, it
+/// replays the marker via RGWRemoveObjCR against the destination bucket,
+/// carrying the owner identity and mtime from the source entry.
+RGWCoroutine *RGWArchiveDataSyncModule::create_delete_marker(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime,
+                                                             rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace)
+{
+  ldout(sc->cct, 0) << "SYNC_ARCHIVE: create_delete_marker: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime
+	            << " versioned=" << versioned << " versioned_epoch=" << versioned_epoch << dendl;
+  auto sync_env = sc->env;
+  return new RGWRemoveObjCR(sync_env->dpp, sync_env->async_rados, sync_env->driver, sc->source_zone,
+                            sync_pipe.dest_bucket_info, key, versioned, versioned_epoch,
+                            &owner.id, &owner.display_name, true, &mtime, zones_trace);
+}
+
+/// Control coroutine that (re)spawns RGWDataSyncCR under the backoff/retry
+/// policy of RGWBackoffControlCR; it never gives up (exit_on_error = false).
+class RGWDataSyncControlCR : public RGWBackoffControlCR
+{
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+  uint32_t num_shards;
+
+  RGWSyncTraceNodeRef tn;
+
+  static constexpr bool exit_on_error = false; // retry on all errors
+public:
+  RGWDataSyncControlCR(RGWDataSyncCtx *_sc, uint32_t _num_shards,
+                       RGWSyncTraceNodeRef& _tn_parent) : RGWBackoffControlCR(_sc->cct, exit_on_error),
+                                                          sc(_sc), sync_env(_sc->env), num_shards(_num_shards) {
+    tn = sync_env->sync_tracer->add_node(_tn_parent, "sync");
+  }
+
+  RGWCoroutine *alloc_cr() override {
+    return new RGWDataSyncCR(sc, num_shards, tn, backoff_ptr());
+  }
+
+  /// Forward a datalog-shard notification to the currently running
+  /// RGWDataSyncCR, if any. A reference is taken on the coroutine while the
+  /// lock is held so that it stays alive for the wakeup() call made after
+  /// the lock is released.
+  void wakeup(int shard_id, bc::flat_set<rgw_data_notify_entry>& entries) {
+    ceph::mutex& m = cr_lock();
+
+    m.lock();
+    RGWDataSyncCR *cr = static_cast<RGWDataSyncCR *>(get_cr());
+    if (!cr) {
+      m.unlock();
+      return;
+    }
+
+    cr->get();
+    m.unlock();
+
+    // cr is guaranteed non-null here (early return above), so the previous
+    // redundant 'if (cr)' guard has been removed
+    cr->wakeup(shard_id, entries);
+    cr->put();
+  }
+};
+
+// Notify the active data-sync control coroutine, if one is running, that new
+// datalog entries exist for the given shard. The lock is taken in shared
+// mode; run_sync() holds it exclusively while it swaps data_sync_cr.
+void RGWRemoteDataLog::wakeup(int shard_id, bc::flat_set<rgw_data_notify_entry>& entries) {
+  std::shared_lock rl{lock};
+  if (data_sync_cr) {
+    data_sync_cr->wakeup(shard_id, entries);
+  }
+}
+
+/// Run the full data-sync control coroutine to completion. data_sync_cr is
+/// published under the exclusive lock so concurrent wakeup() calls can reach
+/// it, and cleared again (still under the lock) once run() returns.
+int RGWRemoteDataLog::run_sync(const DoutPrefixProvider *dpp, int num_shards)
+{
+  lock.lock();
+  data_sync_cr = new RGWDataSyncControlCR(&sc, num_shards, tn);
+  data_sync_cr->get(); // run() will drop a ref, so take another
+  lock.unlock();
+
+  int r = run(dpp, data_sync_cr);
+
+  // drop our extra ref and unpublish the coroutine before reporting status
+  lock.lock();
+  data_sync_cr->put();
+  data_sync_cr = NULL;
+  lock.unlock();
+
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to run sync" << dendl;
+    return r;
+  }
+  return 0;
+}
+
+/// DoutPrefixProvider requirement: expose the driver's CephContext for logging.
+CephContext *RGWDataSyncStatusManager::get_cct() const
+{
+  return driver->ctx();
+}
+
+/// Prepare this manager for syncing from 'source_zone': resolve the zone
+/// config and connection, create the error logger, initialize the remote
+/// datalog handle and discover its shard count. Returns 0 on success,
+/// -EIO/-ENOTSUP/-EINVAL or a propagated error otherwise; on failure after
+/// the error logger was created, finalize() releases it.
+int RGWDataSyncStatusManager::init(const DoutPrefixProvider *dpp)
+{
+  RGWZone *zone_def;
+
+  if (!(zone_def = driver->svc()->zone->find_zone(source_zone))) {
+    ldpp_dout(this, 0) << "ERROR: failed to find zone config info for zone=" << source_zone << dendl;
+    return -EIO;
+  }
+
+  // the source zone's tier type must be capable of exporting data
+  if (!driver->svc()->sync_modules->get_manager()->supports_data_export(zone_def->tier_type)) {
+    return -ENOTSUP;
+  }
+
+  const RGWZoneParams& zone_params = driver->svc()->zone->get_zone_params();
+
+  if (sync_module == nullptr) {
+    sync_module = driver->get_sync_module();
+  }
+
+  conn = driver->svc()->zone->get_zone_conn(source_zone);
+  if (!conn) {
+    ldpp_dout(this, 0) << "connection object to zone " << source_zone << " does not exist" << dendl;
+    return -EINVAL;
+  }
+
+  // owned by this manager; released in finalize()
+  error_logger = new RGWSyncErrorLogger(driver, RGW_SYNC_ERROR_LOG_SHARD_PREFIX, ERROR_LOGGER_SHARDS);
+
+  int r = source_log.init(source_zone, conn, error_logger, driver->getRados()->get_sync_tracer(),
+                          sync_module, counters);
+  if (r < 0) {
+    ldpp_dout(this, 0) << "ERROR: failed to init remote log, r=" << r << dendl;
+    finalize();
+    return r;
+  }
+
+  rgw_datalog_info datalog_info;
+  r = source_log.read_log_info(dpp, &datalog_info);
+  if (r < 0) {
+    ldpp_dout(this, 5) << "ERROR: master.read_log_info() returned r=" << r << dendl;
+    finalize();
+    return r;
+  }
+
+  num_shards = datalog_info.num_shards;
+
+  // one status object per remote datalog shard, all in the zone's log pool
+  for (int i = 0; i < num_shards; i++) {
+    shard_objs[i] = rgw_raw_obj(zone_params.log_pool, shard_obj_name(source_zone, i));
+  }
+
+  return 0;
+}
+
+// Release resources acquired by init(). Idempotent: error_logger is reset to
+// nullptr after deletion, so repeated calls are harmless.
+void RGWDataSyncStatusManager::finalize()
+{
+  if (error_logger) {
+    delete error_logger;
+    error_logger = nullptr;
+  }
+}
+
+/// DoutPrefixProvider requirement: log under this file's debug subsystem.
+unsigned RGWDataSyncStatusManager::get_subsys() const
+{
+  return dout_subsys;
+}
+
+// DoutPrefixProvider requirement: prefix log lines with a truncated
+// (first 8 chars) source zone id, via a non-allocating string_view.
+std::ostream& RGWDataSyncStatusManager::gen_prefix(std::ostream& out) const
+{
+  const std::string_view zone_id{source_zone.id};
+  out << "data sync zone:" << zone_id.substr(0, 8) << ' ';
+  return out;
+}
+
+/// Name of the per-source-zone data sync status object:
+/// "<datalog_sync_status_oid_prefix>.<zone-id>".
+/// Built with std::string concatenation instead of snprintf() into a
+/// runtime-sized stack array (a VLA, which is a compiler extension rather
+/// than standard C++).
+string RGWDataSyncStatusManager::sync_status_oid(const rgw_zone_id& source_zone)
+{
+  return datalog_sync_status_oid_prefix + "." + source_zone.id;
+}
+
+/// Name of the per-shard sync status object:
+/// "<datalog_sync_status_shard_prefix>.<zone-id>.<shard-id>".
+/// Uses std::to_string and string concatenation instead of snprintf() into a
+/// runtime-sized stack array (a VLA, which is a compiler extension rather
+/// than standard C++).
+string RGWDataSyncStatusManager::shard_obj_name(const rgw_zone_id& source_zone, int shard_id)
+{
+  return datalog_sync_status_shard_prefix + "." + source_zone.id + "." + std::to_string(shard_id);
+}
+
+/// Coroutine that writes the initial per-shard incremental sync status
+/// object. The starting bilog position comes from marker_mgr when the sync
+/// module wants a full sync; state is always set to StateIncrementalSync
+/// since incremental sync follows in either case.
+class RGWInitBucketShardSyncStatusCoroutine : public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+
+  const rgw_bucket_sync_pair_info& sync_pair;
+  const string sync_status_oid;
+
+  rgw_bucket_shard_sync_info& status;    // filled in and persisted by operate()
+  RGWObjVersionTracker& objv_tracker;    // guards against racing writers
+  const BucketIndexShardsManager& marker_mgr;
+  bool exclusive;                        // exclusive-create the status object?
+public:
+  RGWInitBucketShardSyncStatusCoroutine(RGWDataSyncCtx *_sc,
+                                        const rgw_bucket_sync_pair_info& _sync_pair,
+                                        rgw_bucket_shard_sync_info& _status,
+                                        uint64_t gen,
+                                        const BucketIndexShardsManager& _marker_mgr,
+                                        RGWObjVersionTracker& objv_tracker,
+                                        bool exclusive)
+    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
+      sync_pair(_sync_pair),
+      sync_status_oid(RGWBucketPipeSyncStatusManager::inc_status_oid(sc->source_zone, _sync_pair, gen)),
+      status(_status), objv_tracker(objv_tracker), marker_mgr(_marker_mgr), exclusive(exclusive)
+  {}
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      yield {
+        rgw_raw_obj obj(sync_env->svc->zone->get_zone_params().log_pool, sync_status_oid);
+
+        // whether or not to do full sync, incremental sync will follow anyway
+        if (sync_env->sync_module->should_full_sync()) {
+          // start incremental sync from the shard's current max marker
+          const auto max_marker = marker_mgr.get(sync_pair.source_bs.shard_id, "");
+          status.inc_marker.position = max_marker;
+        }
+        status.inc_marker.timestamp = ceph::real_clock::now();
+        status.state = rgw_bucket_shard_sync_info::StateIncrementalSync;
+
+        map<string, bufferlist> attrs;
+        status.encode_all_attrs(attrs);
+        call(new RGWSimpleRadosWriteAttrsCR(dpp, sync_env->driver,
+                                            obj, attrs, &objv_tracker, exclusive));
+      }
+
+      if (retcode < 0) {
+        ldout(cct, 20) << "ERROR: init marker position failed. error: " << retcode << dendl;
+        return set_cr_error(retcode);
+      }
+      ldout(cct, 20) << "init marker position: " << status.inc_marker.position <<
+        ". written to shard status object: " << sync_status_oid << dendl;
+      return set_cr_done();
+    }
+    return 0;
+  }
+};
+
+#define BUCKET_SYNC_ATTR_PREFIX RGW_ATTR_PREFIX "bucket-sync."
+
+/**
+ * Decode the attribute named @attr_name from @attrs into *val.
+ * Returns false and value-initializes *val when the attribute is missing;
+ * returns false (leaving *val possibly partially decoded) when decoding
+ * throws; returns true on success.
+ */
+template <class T>
+static bool decode_attr(CephContext *cct, map<string, bufferlist>& attrs, const string& attr_name, T *val)
+{
+  map<string, bufferlist>::iterator iter = attrs.find(attr_name);
+  if (iter == attrs.end()) {
+    *val = T();
+    return false;
+  }
+
+  auto biter = iter->second.cbegin();
+  try {
+    decode(*val, biter);
+  } catch (const buffer::error&) {  // catch by const&; exception object unused
+    ldout(cct, 0) << "ERROR: failed to decode attribute: " << attr_name << dendl;
+    return false;
+  }
+  return true;
+}
+
+/// Populate this status from rados xattrs. Each field is first looked up
+/// under the BUCKET_SYNC_ATTR_PREFIX namespace, falling back to the legacy
+/// un-prefixed attribute name for objects written by older code.
+void rgw_bucket_shard_sync_info::decode_from_attrs(CephContext *cct, map<string, bufferlist>& attrs)
+{
+  if (!decode_attr(cct, attrs, BUCKET_SYNC_ATTR_PREFIX "state", &state)) {
+    decode_attr(cct, attrs, "state", &state);
+  }
+  if (!decode_attr(cct, attrs, BUCKET_SYNC_ATTR_PREFIX "inc_marker", &inc_marker)) {
+    decode_attr(cct, attrs, "inc_marker", &inc_marker);
+  }
+}
+
+/// Serialize the persisted fields (state and incremental marker) into the
+/// attrs map for a rados xattr write. Note the full-sync marker is not
+/// included here; it has its own encode_attr().
+void rgw_bucket_shard_sync_info::encode_all_attrs(map<string, bufferlist>& attrs)
+{
+  encode_state_attr(attrs);
+  inc_marker.encode_attr(attrs);
+}
+
+/// Encode just the sync-state enum under its prefixed attribute name.
+void rgw_bucket_shard_sync_info::encode_state_attr(map<string, bufferlist>& attrs)
+{
+  using ceph::encode;
+  encode(state, attrs[BUCKET_SYNC_ATTR_PREFIX "state"]);
+}
+
+/// Encode the whole full-sync marker under its prefixed attribute name.
+void rgw_bucket_shard_full_sync_marker::encode_attr(map<string, bufferlist>& attrs)
+{
+  using ceph::encode;
+  encode(*this, attrs[BUCKET_SYNC_ATTR_PREFIX "full_marker"]);
+}
+
+/// Encode the whole incremental-sync marker under its prefixed attribute name.
+void rgw_bucket_shard_inc_sync_marker::encode_attr(map<string, bufferlist>& attrs)
+{
+  using ceph::encode;
+  encode(*this, attrs[BUCKET_SYNC_ATTR_PREFIX "inc_marker"]);
+}
+
+/// Coroutine that reads a per-shard incremental sync status object (by
+/// generation) and decodes it into *status. objv_tracker may be null when
+/// the caller does not need versioned reads.
+class RGWReadBucketPipeSyncStatusCoroutine : public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+  string oid;                            // status object name, derived from sync_pair+gen
+  rgw_bucket_shard_sync_info *status;    // out: decoded status
+  RGWObjVersionTracker* objv_tracker;    // optional
+  map<string, bufferlist> attrs;
+public:
+  RGWReadBucketPipeSyncStatusCoroutine(RGWDataSyncCtx *_sc,
+                                       const rgw_bucket_sync_pair_info& sync_pair,
+                                       rgw_bucket_shard_sync_info *_status,
+                                       RGWObjVersionTracker* objv_tracker,
+                                       uint64_t gen)
+    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
+      oid(RGWBucketPipeSyncStatusManager::inc_status_oid(sc->source_zone, sync_pair, gen)),
+      status(_status), objv_tracker(objv_tracker)
+  {}
+  int operate(const DoutPrefixProvider *dpp) override;
+};
+
+/// Fetch the status object's xattrs and decode them. A missing object
+/// (-ENOENT) is not an error: *status is reset to defaults and the
+/// coroutine completes successfully.
+int RGWReadBucketPipeSyncStatusCoroutine::operate(const DoutPrefixProvider *dpp)
+{
+  reenter(this) {
+    yield call(new RGWSimpleRadosReadAttrsCR(dpp, sync_env->driver,
+                                             rgw_raw_obj(sync_env->svc->zone->get_zone_params().log_pool, oid),
+                                             &attrs, true, objv_tracker));
+    if (retcode == -ENOENT) {
+      *status = rgw_bucket_shard_sync_info();
+      return set_cr_done();
+    }
+    if (retcode < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to call fetch bucket shard info oid=" << oid << " ret=" << retcode << dendl;
+      return set_cr_error(retcode);
+    }
+    status->decode_from_attrs(sync_env->cct, attrs);
+    return set_cr_done();
+  }
+  return 0;
+}
+
+// wrap ReadSyncStatus and set a flag if it's not in incremental
+class CheckBucketShardStatusIsIncremental : public RGWReadBucketPipeSyncStatusCoroutine {
+  bool* result;                         // cleared when shard is not incremental
+  rgw_bucket_shard_sync_info status;    // local storage for the read
+ public:
+  CheckBucketShardStatusIsIncremental(RGWDataSyncCtx* sc,
+                                      const rgw_bucket_sync_pair_info& sync_pair,
+                                      bool* result)
+    : RGWReadBucketPipeSyncStatusCoroutine(sc, sync_pair, &status, nullptr, 0 /*no gen in compat mode*/),
+      result(result)
+  {}
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    // delegate the read, then inspect the decoded state once the base
+    // coroutine has finished
+    int r = RGWReadBucketPipeSyncStatusCoroutine::operate(dpp);
+    if (state == RGWCoroutine_Done &&
+        status.state != rgw_bucket_shard_sync_info::StateIncrementalSync) {
+      *result = false;  // only ever cleared, never set back to true
+    }
+    return r;
+  }
+};
+
+/// Fan-out check over every shard of generation 0: *result stays true only
+/// if all shards are already in incremental sync.
+class CheckAllBucketShardStatusIsIncremental : public RGWShardCollectCR {
+  // start with 1 shard, and only spawn more if we detect an existing shard.
+  // this makes the backward compatibility check far less expensive in the
+  // general case where no shards exist
+  static constexpr int initial_concurrent_shards = 1;
+  static constexpr int max_concurrent_shards = 16;
+
+  RGWDataSyncCtx* sc;
+  rgw_bucket_sync_pair_info sync_pair;  // local copy; shard_id is rewritten per spawn
+  const int num_shards;
+  bool* result;
+  int shard = 0;
+ public:
+  CheckAllBucketShardStatusIsIncremental(RGWDataSyncCtx* sc,
+                                         const rgw_bucket_sync_pair_info& sync_pair,
+                                         int num_shards, bool* result)
+    : RGWShardCollectCR(sc->cct, initial_concurrent_shards),
+      sc(sc), sync_pair(sync_pair), num_shards(num_shards), result(result)
+  {}
+
+  bool spawn_next() override {
+    // stop spawning if we saw any errors or non-incremental shards
+    if (shard >= num_shards || status < 0 || !*result) {
+      return false;
+    }
+    sync_pair.source_bs.shard_id = shard++;
+    spawn(new CheckBucketShardStatusIsIncremental(sc, sync_pair, result), false);
+    return true;
+  }
+
+ private:
+  int handle_result(int r) override {
+    if (r < 0) {
+      ldout(cct, 4) << "failed to read bucket shard status: "
+          << cpp_strerror(r) << dendl;
+    } else if (shard == 0) {
+      // enable concurrency once the first shard succeeds
+      max_concurrent = max_concurrent_shards;
+    }
+    return r;
+  }
+};
+
+// wrap InitBucketShardSyncStatus with local storage for 'status' and 'objv';
+// the create is non-exclusive, so a racing writer does not fail the init
+class InitBucketShardStatusCR : public RGWCoroutine {
+  RGWDataSyncCtx* sc;
+  rgw_bucket_sync_pair_info pair;
+  rgw_bucket_shard_sync_info status;  // local storage for the written status
+  RGWObjVersionTracker objv;          // local objv tracker for the write
+  const uint64_t gen;
+  const BucketIndexShardsManager& marker_mgr;
+
+ public:
+  InitBucketShardStatusCR(RGWDataSyncCtx* sc,
+                          const rgw_bucket_sync_pair_info& pair,
+                          uint64_t gen,
+                          const BucketIndexShardsManager& marker_mgr)
+    : RGWCoroutine(sc->cct), sc(sc), pair(pair), gen(gen), marker_mgr(marker_mgr)
+  {}
+  // 'override' added: this overrides RGWCoroutine::operate()
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      // non exclusive create with empty status
+      objv.generate_new_write_ver(cct);
+      yield call(new RGWInitBucketShardSyncStatusCoroutine(sc, pair, status, gen, marker_mgr, objv, false));
+      if (retcode < 0) {
+        return set_cr_error(retcode);
+      }
+      return set_cr_done();
+    }
+    return 0;
+  }
+};
+
+/// Fan-out initialization of per-shard status objects for a generation,
+/// up to max_concurrent_shards in flight; stops spawning on the first error.
+class InitBucketShardStatusCollectCR : public RGWShardCollectCR {
+  static constexpr int max_concurrent_shards = 16;
+  RGWDataSyncCtx* sc;
+  rgw_bucket_sync_pair_info sync_pair;  // local copy; shard_id rewritten per spawn
+  const uint64_t gen;
+  const BucketIndexShardsManager& marker_mgr;
+
+  const int num_shards;
+  int shard = 0;
+
+  int handle_result(int r) override {
+    if (r < 0) {
+      ldout(cct, 4) << "failed to init bucket shard status: "
+          << cpp_strerror(r) << dendl;
+    }
+    return r;
+  }
+ public:
+  InitBucketShardStatusCollectCR(RGWDataSyncCtx* sc,
+                                 const rgw_bucket_sync_pair_info& sync_pair,
+                                 uint64_t gen,
+                                 const BucketIndexShardsManager& marker_mgr,
+                                 int num_shards)
+    : RGWShardCollectCR(sc->cct, max_concurrent_shards),
+      sc(sc), sync_pair(sync_pair), gen(gen), marker_mgr(marker_mgr), num_shards(num_shards)
+  {}
+
+  bool spawn_next() override {
+    if (shard >= num_shards || status < 0) { // stop spawning on any errors
+      return false;
+    }
+    sync_pair.source_bs.shard_id = shard++;
+    spawn(new InitBucketShardStatusCR(sc, sync_pair, gen, marker_mgr), false);
+    return true;
+  }
+};
+
+/// Coroutine that deletes one per-shard incremental status object for a
+/// given generation. A missing object (-ENOENT) is treated as success.
+class RemoveBucketShardStatusCR : public RGWCoroutine {
+  RGWDataSyncCtx* const sc;
+  RGWDataSyncEnv* const sync_env;
+
+  rgw_bucket_sync_pair_info sync_pair;
+  rgw_raw_obj obj;           // the status object to remove
+  RGWObjVersionTracker objv;
+
+public:
+  RemoveBucketShardStatusCR(RGWDataSyncCtx* sc,
+                            const rgw_bucket_sync_pair_info& sync_pair, uint64_t gen)
+    : RGWCoroutine(sc->cct), sc(sc), sync_env(sc->env),
+      sync_pair(sync_pair),
+      obj(sync_env->svc->zone->get_zone_params().log_pool,
+          RGWBucketPipeSyncStatusManager::inc_status_oid(sc->source_zone, sync_pair, gen))
+  {}
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      yield call(new RGWRadosRemoveCR(sync_env->driver, obj, &objv));
+      if (retcode < 0 && retcode != -ENOENT) {
+        ldout(cct, 20) << "ERROR: failed to remove bucket shard status for: " << sync_pair <<
+          ". with error: " << retcode << dendl;
+        return set_cr_error(retcode);
+      }
+      ldout(cct, 20) << "removed bucket shard status object: " << obj.oid << dendl;
+      return set_cr_done();
+    }
+    return 0;
+  }
+};
+
+/// Fan-out removal of every per-shard status object of a generation. Unlike
+/// the init collector, errors are only logged; spawning continues until all
+/// shards have been attempted.
+class RemoveBucketShardStatusCollectCR : public RGWShardCollectCR {
+  static constexpr int max_concurrent_shards = 16;
+  RGWDataSyncCtx* const sc;
+  RGWDataSyncEnv* const sync_env;
+  rgw_bucket_sync_pair_info sync_pair;  // local copy; shard_id rewritten per spawn
+  const uint64_t gen;
+
+  const int num_shards;
+  int shard = 0;
+
+  int handle_result(int r) override {
+    if (r < 0) {
+      ldout(cct, 4) << "failed to remove bucket shard status object: "
+          << cpp_strerror(r) << dendl;
+    }
+    return r;
+  }
+ public:
+  RemoveBucketShardStatusCollectCR(RGWDataSyncCtx* sc,
+                                   const rgw_bucket_sync_pair_info& sync_pair,
+                                   uint64_t gen,
+                                   int num_shards)
+    : RGWShardCollectCR(sc->cct, max_concurrent_shards),
+      sc(sc), sync_env(sc->env), sync_pair(sync_pair), gen(gen), num_shards(num_shards)
+  {}
+
+  bool spawn_next() override {
+    if (shard >= num_shards) {
+      return false;
+    }
+    sync_pair.source_bs.shard_id = shard++;
+    spawn(new RemoveBucketShardStatusCR(sc, sync_pair, gen), false);
+    return true;
+  }
+};
+
+/// Coroutine that initializes the top-level bucket sync status object.
+/// Decides between Full and Incremental starting state: when the remote's
+/// oldest log generation is 0 and compat checking is requested, existing
+/// per-shard gen-0 status objects that are all incremental let us skip full
+/// sync entirely. Otherwise per-shard status is (re)initialized and the
+/// state is chosen by the sync module's should_full_sync().
+class InitBucketFullSyncStatusCR : public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+
+  const rgw_bucket_sync_pair_info& sync_pair;
+  const rgw_raw_obj& status_obj;      // where the bucket sync status is written
+  rgw_bucket_sync_status& status;     // out: initialized status
+  RGWObjVersionTracker& objv;
+  const RGWBucketInfo& source_info;
+  const bool check_compat;            // attempt gen-0 backward-compat resume?
+
+  const rgw_bucket_index_marker_info& info;  // remote bilog info (markers, gens)
+  BucketIndexShardsManager marker_mgr;       // parsed from info.max_marker
+
+  bool all_incremental = true;
+  bool no_zero = false;               // set when gen 0 is absent/unrecognized
+
+public:
+  InitBucketFullSyncStatusCR(RGWDataSyncCtx* sc,
+                             const rgw_bucket_sync_pair_info& sync_pair,
+                             const rgw_raw_obj& status_obj,
+                             rgw_bucket_sync_status& status,
+                             RGWObjVersionTracker& objv,
+                             const RGWBucketInfo& source_info,
+                             bool check_compat,
+                             const rgw_bucket_index_marker_info& info)
+    : RGWCoroutine(sc->cct), sc(sc), sync_env(sc->env),
+      sync_pair(sync_pair), status_obj(status_obj),
+      status(status), objv(objv), source_info(source_info),
+      check_compat(check_compat), info(info)
+  {}
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      retcode = marker_mgr.from_string(info.max_marker, -1);
+      if (retcode < 0) {
+        lderr(cct) << "failed to parse bilog shard markers: "
+            << cpp_strerror(retcode) << dendl;
+        return set_cr_error(retcode);
+      }
+
+      status.state = BucketSyncState::Init;
+
+      if (info.oldest_gen == 0) {
+        if (check_compat) {
+          // use shard count from our log gen=0
+          // try to convert existing per-shard incremental status for backward compatibility
+          if (source_info.layout.logs.empty() ||
+              source_info.layout.logs.front().gen > 0) {
+            ldpp_dout(dpp, 20) << "no generation zero when checking compatibility" << dendl;
+            no_zero = true;
+          } else if (auto& log = source_info.layout.logs.front();
+                     log.layout.type != rgw::BucketLogType::InIndex) {
+            ldpp_dout(dpp, 20) << "unrecognized log layout type when checking compatibility " << log.layout.type << dendl;
+            no_zero = true;
+          }
+          if (!no_zero) {
+            yield {
+              const int num_shards0 = rgw::num_shards(
+                  source_info.layout.logs.front().layout.in_index.layout);
+              call(new CheckAllBucketShardStatusIsIncremental(sc, sync_pair,
+                                                              num_shards0,
+                                                              &all_incremental));
+            }
+            if (retcode < 0) {
+              return set_cr_error(retcode);
+            }
+            if (all_incremental) {
+              // we can use existing status and resume incremental sync
+              status.state = BucketSyncState::Incremental;
+            }
+          } else {
+            all_incremental = false;
+          }
+        }
+      }
+
+      if (status.state != BucketSyncState::Incremental) {
+        // initialize all shard sync status. this will populate the log marker
+        // positions where incremental sync will resume after full sync
+        yield {
+          const int num_shards = marker_mgr.get().size();
+          call(new InitBucketShardStatusCollectCR(sc, sync_pair, info.latest_gen, marker_mgr, num_shards));
+        }
+        if (retcode < 0) {
+          ldout(cct, 20) << "failed to init bucket shard status: "
+              << cpp_strerror(retcode) << dendl;
+          return set_cr_error(retcode);
+        }
+
+        if (sync_env->sync_module->should_full_sync()) {
+          status.state = BucketSyncState::Full;
+        } else {
+          status.state = BucketSyncState::Incremental;
+        }
+      }
+
+      status.shards_done_with_gen.resize(marker_mgr.get().size());
+      status.incremental_gen = info.latest_gen;
+
+      ldout(cct, 20) << "writing bucket sync status during init. state=" << status.state << ". marker=" << status.full.position << dendl;
+
+      // write bucket sync status
+      using CR = RGWSimpleRadosWriteCR<rgw_bucket_sync_status>;
+      yield call(new CR(dpp, sync_env->driver,
+                        status_obj, status, &objv, false));
+      if (retcode < 0) {
+        ldout(cct, 20) << "failed to write bucket shard status: "
+            << cpp_strerror(retcode) << dendl;
+        return set_cr_error(retcode);
+      }
+      return set_cr_done();
+    }
+    return 0;
+  }
+};
+
+#define OMAP_READ_MAX_ENTRIES 10
+/// Coroutine that lists bucket shards with pending sync-error retries for a
+/// datalog shard, by paging through the omap keys of that shard's ".retry"
+/// error-repo object. Results are formatted into recovering_buckets.
+class RGWReadRecoveringBucketShardsCoroutine : public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+  rgw::sal::RadosStore* driver;
+
+  const int shard_id;
+  int max_entries;                 // soft cap on total keys collected
+
+  set<string>& recovering_buckets; // out: formatted "bucket:shard[gen]" keys
+  string marker;                   // omap paging cursor
+  string error_oid;                // "<shard status oid>.retry"
+
+  RGWRadosGetOmapKeysCR::ResultPtr omapkeys;
+  set<string> error_entries;
+  int max_omap_entries;            // page size (OMAP_READ_MAX_ENTRIES)
+  int count;
+
+public:
+  RGWReadRecoveringBucketShardsCoroutine(RGWDataSyncCtx *_sc, const int _shard_id,
+                                         set<string>& _recovering_buckets, const int _max_entries)
+    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
+      driver(sync_env->driver), shard_id(_shard_id), max_entries(_max_entries),
+      recovering_buckets(_recovering_buckets), max_omap_entries(OMAP_READ_MAX_ENTRIES)
+  {
+    error_oid = RGWDataSyncStatusManager::shard_obj_name(sc->source_zone, shard_id) + ".retry";
+  }
+
+  int operate(const DoutPrefixProvider *dpp) override;
+};
+
+int RGWReadRecoveringBucketShardsCoroutine::operate(const DoutPrefixProvider *dpp)
+{
+ reenter(this){
+ //read recovering bucket shards
+ count = 0;
+ do {
+ omapkeys = std::make_shared<RGWRadosGetOmapKeysCR::Result>();
+ yield call(new RGWRadosGetOmapKeysCR(driver, rgw_raw_obj(sync_env->svc->zone->get_zone_params().log_pool, error_oid),
+ marker, max_omap_entries, omapkeys));
+
+ if (retcode == -ENOENT) {
+ break;
+ }
+
+ if (retcode < 0) {
+ ldpp_dout(dpp, 0) << "failed to read recovering bucket shards with "
+ << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+
+ error_entries = std::move(omapkeys->entries);
+ if (error_entries.empty()) {
+ break;
+ }
+
+ count += error_entries.size();
+ marker = *error_entries.rbegin();
+ for (const std::string& key : error_entries) {
+ rgw_bucket_shard bs;
+ std::optional<uint64_t> gen;
+ if (int r = rgw::error_repo::decode_key(key, bs, gen); r < 0) {
+ // insert the key as-is
+ recovering_buckets.insert(std::move(key));
+ } else if (gen) {
+ recovering_buckets.insert(fmt::format("{}[{}]", bucket_shard_str{bs}, *gen));
+ } else {
+ recovering_buckets.insert(fmt::format("{}[full]", bucket_shard_str{bs}));
+ }
+ }
+ } while (omapkeys->more && count < max_entries);
+
+ return set_cr_done();
+ }
+
+ return 0;
+}
+
+/// Coroutine that lists bucket shards with datalog entries not yet consumed
+/// by sync: reads the shard's sync marker, then pages through the remote
+/// datalog from that marker, collecting entry keys into pending_buckets.
+class RGWReadPendingBucketShardsCoroutine : public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+  rgw::sal::RadosStore* driver;
+
+  const int shard_id;
+  int max_entries;               // soft cap on total entries collected
+
+  set<string>& pending_buckets;  // out: datalog entry keys
+  string marker;                 // datalog paging cursor (starts at sync marker)
+  string status_oid;             // per-shard sync status object name
+
+  rgw_data_sync_marker* sync_marker;  // out: the marker read from rados
+  int count;
+
+  std::string next_marker;
+  vector<rgw_data_change_log_entry> log_entries;
+  bool truncated;
+
+public:
+  RGWReadPendingBucketShardsCoroutine(RGWDataSyncCtx *_sc, const int _shard_id,
+                                      set<string>& _pending_buckets,
+                                      rgw_data_sync_marker* _sync_marker, const int _max_entries)
+    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
+      driver(sync_env->driver), shard_id(_shard_id), max_entries(_max_entries),
+      pending_buckets(_pending_buckets), sync_marker(_sync_marker)
+  {
+    status_oid = RGWDataSyncStatusManager::shard_obj_name(sc->source_zone, shard_id);
+  }
+
+  int operate(const DoutPrefixProvider *dpp) override;
+};
+
+/// Read the shard's sync marker, then page the remote datalog from that
+/// position, accumulating entry keys. -ENOENT from the remote log ends the
+/// listing successfully.
+int RGWReadPendingBucketShardsCoroutine::operate(const DoutPrefixProvider *dpp)
+{
+  reenter(this){
+    //read sync status marker
+    using CR = RGWSimpleRadosReadCR<rgw_data_sync_marker>;
+    yield call(new CR(dpp, sync_env->driver,
+                      rgw_raw_obj(sync_env->svc->zone->get_zone_params().log_pool, status_oid),
+                      sync_marker));
+    if (retcode < 0) {
+      ldpp_dout(dpp, 0) << "failed to read sync status marker with "
+          << cpp_strerror(retcode) << dendl;
+      return set_cr_error(retcode);
+    }
+
+    //read pending bucket shards
+    marker = sync_marker->marker;
+    count = 0;
+    do{
+      yield call(new RGWReadRemoteDataLogShardCR(sc, shard_id, marker,
+                                                 &next_marker, &log_entries, &truncated));
+
+      if (retcode == -ENOENT) {
+        break;
+      }
+
+      if (retcode < 0) {
+        ldpp_dout(dpp, 0) << "failed to read remote data log info with "
+            << cpp_strerror(retcode) << dendl;
+        return set_cr_error(retcode);
+      }
+
+      if (log_entries.empty()) {
+        break;
+      }
+
+      count += log_entries.size();
+      for (const auto& entry : log_entries) {
+        pending_buckets.insert(entry.entry.key);
+      }
+    }while(truncated && count < max_entries);
+
+    return set_cr_done();
+  }
+
+  return 0;
+}
+
+/// Gather both the "recovering" (error-repo) and "pending" (datalog backlog)
+/// bucket shards for one datalog shard, running the two listing coroutines
+/// concurrently in a dedicated coroutine manager.
+int RGWRemoteDataLog::read_shard_status(const DoutPrefixProvider *dpp, int shard_id, set<string>& pending_buckets, set<string>& recovering_buckets, rgw_data_sync_marker *sync_marker, const int max_entries)
+{
+  // cannot run concurrently with run_sync(), so run in a separate manager
+  RGWCoroutinesManager crs(driver->ctx(), driver->getRados()->get_cr_registry());
+  RGWHTTPManager http_manager(driver->ctx(), crs.get_completion_mgr());
+  int ret = http_manager.start();
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl;
+    return ret;
+  }
+  // clone the sync env/ctx so the local http manager is used
+  RGWDataSyncEnv sync_env_local = sync_env;
+  sync_env_local.http_manager = &http_manager;
+  RGWDataSyncCtx sc_local = sc;
+  sc_local.env = &sync_env_local;
+  // run both listings on parallel stacks under the same manager
+  list<RGWCoroutinesStack *> stacks;
+  RGWCoroutinesStack* recovering_stack = new RGWCoroutinesStack(driver->ctx(), &crs);
+  recovering_stack->call(new RGWReadRecoveringBucketShardsCoroutine(&sc_local, shard_id, recovering_buckets, max_entries));
+  stacks.push_back(recovering_stack);
+  RGWCoroutinesStack* pending_stack = new RGWCoroutinesStack(driver->ctx(), &crs);
+  pending_stack->call(new RGWReadPendingBucketShardsCoroutine(&sc_local, shard_id, pending_buckets, sync_marker, max_entries));
+  stacks.push_back(pending_stack);
+  ret = crs.run(dpp, stacks);
+  http_manager.stop();
+  return ret;
+}
+
+/// DoutPrefixProvider requirement: expose the driver's CephContext for logging.
+CephContext *RGWBucketPipeSyncStatusManager::get_cct() const
+{
+  return driver->ctx();
+}
+
+/// Decode an S3-style owner element ("ID" / "DisplayName") from JSON.
+void rgw_bucket_entry_owner::decode_json(JSONObj *obj)
+{
+  JSONDecoder::decode_json("ID", id, obj);
+  JSONDecoder::decode_json("DisplayName", display_name, obj);
+}
+
+/// One entry from a remote versioned bucket listing (the rgwx-extended JSON
+/// form), plus a helper mapping it to the bilog modify-op it represents.
+struct bucket_list_entry {
+  bool delete_marker;
+  rgw_obj_key key;
+  bool is_latest;
+  real_time mtime;
+  string etag;
+  uint64_t size;
+  string storage_class;
+  rgw_bucket_entry_owner owner;
+  uint64_t versioned_epoch;
+  string rgw_tag;
+
+  bucket_list_entry() : delete_marker(false), is_latest(false), size(0), versioned_epoch(0) {}
+
+  void decode_json(JSONObj *obj) {
+    JSONDecoder::decode_json("IsDeleteMarker", delete_marker, obj);
+    JSONDecoder::decode_json("Key", key.name, obj);
+    JSONDecoder::decode_json("VersionId", key.instance, obj);
+    JSONDecoder::decode_json("IsLatest", is_latest, obj);
+    string mtime_str;
+    JSONDecoder::decode_json("RgwxMtime", mtime_str, obj);
+
+    // "RgwxMtime" is ISO8601; mtime is left default-initialized if parsing fails
+    struct tm t;
+    uint32_t nsec;
+    if (parse_iso8601(mtime_str.c_str(), &t, &nsec)) {
+      ceph_timespec ts;
+      ts.tv_sec = (uint64_t)internal_timegm(&t);
+      ts.tv_nsec = nsec;
+      mtime = real_clock::from_ceph_timespec(ts);
+    }
+    JSONDecoder::decode_json("ETag", etag, obj);
+    JSONDecoder::decode_json("Size", size, obj);
+    JSONDecoder::decode_json("StorageClass", storage_class, obj);
+    JSONDecoder::decode_json("Owner", owner, obj);
+    JSONDecoder::decode_json("VersionedEpoch", versioned_epoch, obj);
+    JSONDecoder::decode_json("RgwxTag", rgw_tag, obj);
+    // a "null" version id with no versioned epoch means a plain (unversioned)
+    // object; normalize to an empty instance
+    if (key.instance == "null" && !versioned_epoch) {
+      key.instance.clear();
+    }
+  }
+
+  /// Map this listing entry to the bilog operation it corresponds to:
+  /// delete-marker link, versioned link, or plain add.
+  RGWModifyOp get_modify_op() const {
+    if (delete_marker) {
+      return CLS_RGW_OP_LINK_OLH_DM;
+    } else if (!key.instance.empty() && key.instance != "null") {
+      return CLS_RGW_OP_LINK_OLH;
+    } else {
+      return CLS_RGW_OP_ADD;
+    }
+  }
+};
+
+/// JSON-decodable result of a remote versioned bucket listing request.
+struct bucket_list_result {
+  string name;
+  string prefix;
+  string key_marker;
+  string version_id_marker;
+  int max_keys;
+  bool is_truncated;   // more pages remain beyond this result
+  list<bucket_list_entry> entries;
+
+  bucket_list_result() : max_keys(0), is_truncated(false) {}
+
+  void decode_json(JSONObj *obj) {
+    JSONDecoder::decode_json("Name", name, obj);
+    JSONDecoder::decode_json("Prefix", prefix, obj);
+    JSONDecoder::decode_json("KeyMarker", key_marker, obj);
+    JSONDecoder::decode_json("VersionIdMarker", version_id_marker, obj);
+    JSONDecoder::decode_json("MaxKeys", max_keys, obj);
+    JSONDecoder::decode_json("IsTruncated", is_truncated, obj);
+    JSONDecoder::decode_json("Entries", entries, obj);
+  }
+};
+
+/// Coroutine that lists one page of object versions from the remote bucket
+/// over the REST admin connection, starting at marker_position.
+class RGWListRemoteBucketCR: public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+  const rgw_bucket_shard& bs;
+  rgw_obj_key marker_position;   // resume point (key + version id)
+
+  bucket_list_result *result;    // out: decoded listing page
+
+public:
+  RGWListRemoteBucketCR(RGWDataSyncCtx *_sc, const rgw_bucket_shard& bs,
+                        rgw_obj_key& _marker_position, bucket_list_result *_result)
+    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env), bs(bs),
+      marker_position(_marker_position), result(_result) {}
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      yield {
+        // versioned listing with rgwx extensions (objs-container, RgwxMtime...)
+        rgw_http_param_pair pairs[] = { { "versions" , NULL },
+                                        { "format" , "json" },
+                                        { "objs-container" , "true" },
+                                        { "key-marker" , marker_position.name.c_str() },
+                                        { "version-id-marker" , marker_position.instance.c_str() },
+                                        { NULL, NULL } };
+        string p = string("/") + bs.bucket.get_key(':', 0);
+        call(new RGWReadRESTResourceCR<bucket_list_result>(sync_env->cct, sc->conn, sync_env->http_manager, p, pairs, result));
+      }
+      if (retcode < 0) {
+        return set_cr_error(retcode);
+      }
+      return set_cr_done();
+    }
+    return 0;
+  }
+};
+
+/// JSON-decodable pointer to the next bilog generation advertised by the
+/// remote when the current generation has been fully consumed.
+struct next_bilog_result {
+  uint64_t generation = 0;
+  int num_shards = 0;
+
+  void decode_json(JSONObj *obj) {
+    JSONDecoder::decode_json("generation", generation, obj);
+    JSONDecoder::decode_json("num_shards", num_shards, obj);
+  }
+};
+
+/// JSON-decodable page of remote bucket-index log entries; next_log is only
+/// present when the listed generation is exhausted and a newer one exists.
+struct bilog_list_result {
+  list<rgw_bi_log_entry> entries;
+  bool truncated{false};
+  std::optional<next_bilog_result> next_log;
+
+  void decode_json(JSONObj *obj) {
+    JSONDecoder::decode_json("entries", entries, obj);
+    JSONDecoder::decode_json("truncated", truncated, obj);
+    JSONDecoder::decode_json("next_log", next_log, obj);
+  }
+};
+
+/// Coroutine that fetches one page of the remote bucket-index log for a
+/// given shard and generation via /admin/log, recording poll latency and
+/// error counters when perf counters are enabled.
+class RGWListBucketIndexLogCR: public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+  const string instance_key;
+  string marker;                 // bilog resume marker
+
+  bilog_list_result *result;     // out: decoded page
+  std::optional<PerfGuard> timer;
+  uint64_t generation;
+  // NOTE: gen_str's default member initializer reads 'generation', which is
+  // safe only because 'generation' is declared (and therefore initialized)
+  // first — keep this declaration order intact
+  std::string gen_str = std::to_string(generation);
+  // NOTE(review): format_ver appears unused; the request hardcodes
+  // "format-ver" = "2" below — confirm whether this member can be removed
+  uint32_t format_ver{1};
+
+public:
+  RGWListBucketIndexLogCR(RGWDataSyncCtx *_sc, const rgw_bucket_shard& bs, string& _marker,
+                          uint64_t _generation, bilog_list_result *_result)
+    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
+      instance_key(bs.get_key()), marker(_marker), result(_result), generation(_generation) {}
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      if (sync_env->counters) {
+        timer.emplace(sync_env->counters, sync_counters::l_poll);
+      }
+      yield {
+        rgw_http_param_pair pairs[] = { { "bucket-instance", instance_key.c_str() },
+                                        { "format" , "json" },
+                                        { "marker" , marker.c_str() },
+                                        { "type", "bucket-index" },
+                                        { "generation", gen_str.c_str() },
+                                        { "format-ver", "2"},
+                                        { NULL, NULL } };
+
+        call(new RGWReadRESTResourceCR<bilog_list_result>(sync_env->cct, sc->conn, sync_env->http_manager,
+                                                          "/admin/log", pairs, result));
+      }
+      timer.reset();
+      if (retcode < 0) {
+        if (sync_env->counters) {
+          sync_env->counters->inc(sync_counters::l_poll_err);
+        }
+        return set_cr_error(retcode);
+      }
+      return set_cr_done();
+    }
+    return 0;
+  }
+};
+
+#define BUCKET_SYNC_UPDATE_MARKER_WINDOW 10
+
+/// Marker tracker for bucket full sync: batches marker updates (window of
+/// BUCKET_SYNC_UPDATE_MARKER_WINDOW) and persists the position/count into
+/// the bucket sync status object, guarded by its objv tracker.
+class RGWBucketFullSyncMarkerTrack : public RGWSyncShardMarkerTrack<rgw_obj_key, rgw_obj_key> {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+
+  const rgw_raw_obj& status_obj;
+  rgw_bucket_sync_status& sync_status;
+  RGWSyncTraceNodeRef tn;
+  RGWObjVersionTracker& objv_tracker;
+
+public:
+  RGWBucketFullSyncMarkerTrack(RGWDataSyncCtx *_sc,
+                               const rgw_raw_obj& status_obj,
+                               rgw_bucket_sync_status& sync_status,
+                               RGWSyncTraceNodeRef tn,
+                               RGWObjVersionTracker& objv_tracker)
+    : RGWSyncShardMarkerTrack(BUCKET_SYNC_UPDATE_MARKER_WINDOW),
+      sc(_sc), sync_env(_sc->env), status_obj(status_obj),
+      sync_status(sync_status), tn(std::move(tn)), objv_tracker(objv_tracker)
+  {}
+
+
+  /// Record the new high-water position and write the whole status object.
+  RGWCoroutine *store_marker(const rgw_obj_key& new_marker, uint64_t index_pos, const real_time& timestamp) override {
+    sync_status.full.position = new_marker;
+    sync_status.full.count = index_pos;
+
+    tn->log(20, SSTR("updating marker oid=" << status_obj.oid << " marker=" << new_marker));
+    return new RGWSimpleRadosWriteCR<rgw_bucket_sync_status>(
+        sync_env->dpp, sync_env->driver,
+        status_obj, sync_status, &objv_tracker);
+  }
+
+  RGWOrderCallCR *allocate_order_control_cr() override {
+    return new RGWLastCallerWinsCR(sync_env->cct);
+  }
+};
+
+// write the incremental sync status and update 'stable_timestamp' on success
+class RGWWriteBucketShardIncSyncStatus : public RGWCoroutine {
+ RGWDataSyncEnv *sync_env;
+ rgw_raw_obj obj;
+ rgw_bucket_shard_inc_sync_marker sync_marker;
+ ceph::real_time* stable_timestamp;
+ RGWObjVersionTracker& objv_tracker;
+ std::map<std::string, bufferlist> attrs;
+ public:
+ RGWWriteBucketShardIncSyncStatus(RGWDataSyncEnv *sync_env,
+ const rgw_raw_obj& obj,
+ const rgw_bucket_shard_inc_sync_marker& sync_marker,
+ ceph::real_time* stable_timestamp,
+ RGWObjVersionTracker& objv_tracker)
+ : RGWCoroutine(sync_env->cct), sync_env(sync_env), obj(obj),
+ sync_marker(sync_marker), stable_timestamp(stable_timestamp),
+ objv_tracker(objv_tracker)
+ {}
+ int operate(const DoutPrefixProvider *dpp) {
+ reenter(this) {
+ sync_marker.encode_attr(attrs);
+
+ yield call(new RGWSimpleRadosWriteAttrsCR(sync_env->dpp, sync_env->driver,
+ obj, attrs, &objv_tracker));
+ if (retcode < 0) {
+ return set_cr_error(retcode);
+ }
+ if (stable_timestamp) {
+ *stable_timestamp = sync_marker.timestamp;
+ }
+ return set_cr_done();
+ }
+ return 0;
+ }
+};
+
// Marker tracker for bucket *incremental* sync. Markers are bilog entry id
// strings; keys are object keys. Besides windowed marker persistence (via
// RGWWriteBucketShardIncSyncStatus), it maintains bookkeeping to ensure:
//  - at most one in-flight sync operation per object key, and
//  - OLH (versioning) operations on the same object name are serialized.
class RGWBucketIncSyncShardMarkerTrack : public RGWSyncShardMarkerTrack<string, rgw_obj_key> {
  RGWDataSyncCtx *sc;
  RGWDataSyncEnv *sync_env;

  rgw_raw_obj obj;  // shard status object, in the zone's log pool
  rgw_bucket_shard_inc_sync_marker sync_marker;

  map<rgw_obj_key, string> key_to_marker;  // in-flight key -> its bilog marker

  struct operation {
    rgw_obj_key key;
    bool is_olh;
  };
  map<string, operation> marker_to_op;  // reverse index: marker -> in-flight op
  std::set<std::string> pending_olh; // object names with pending olh operations

  RGWSyncTraceNodeRef tn;
  RGWObjVersionTracker& objv_tracker;
  ceph::real_time* stable_timestamp;

  // Called when an entry completes: drop both index directions, clear any
  // retry flag for the key, and release the olh serialization slot.
  void handle_finish(const string& marker) override {
    auto iter = marker_to_op.find(marker);
    if (iter == marker_to_op.end()) {
      return;
    }
    auto& op = iter->second;
    key_to_marker.erase(op.key);
    reset_need_retry(op.key);
    if (op.is_olh) {
      pending_olh.erase(op.key.name);
    }
    marker_to_op.erase(iter);
  }

public:
  RGWBucketIncSyncShardMarkerTrack(RGWDataSyncCtx *_sc,
                                   const string& _marker_oid,
                                   const rgw_bucket_shard_inc_sync_marker& _marker,
                                   RGWSyncTraceNodeRef tn,
                                   RGWObjVersionTracker& objv_tracker,
                                   ceph::real_time* stable_timestamp)
    : RGWSyncShardMarkerTrack(BUCKET_SYNC_UPDATE_MARKER_WINDOW),
      sc(_sc), sync_env(_sc->env),
      obj(sync_env->svc->zone->get_zone_params().log_pool, _marker_oid),
      sync_marker(_marker), tn(std::move(tn)), objv_tracker(objv_tracker),
      stable_timestamp(stable_timestamp)
  {}

  const rgw_raw_obj& get_obj() const { return obj; }

  // Persist the new incremental position/timestamp; 'index_pos' is unused
  // for incremental sync.
  RGWCoroutine* store_marker(const string& new_marker, uint64_t index_pos, const real_time& timestamp) override {
    sync_marker.position = new_marker;
    sync_marker.timestamp = timestamp;

    tn->log(20, SSTR("updating marker marker_oid=" << obj.oid << " marker=" << new_marker << " timestamp=" << timestamp));
    return new RGWWriteBucketShardIncSyncStatus(sync_env, obj, sync_marker,
                                                stable_timestamp, objv_tracker);
  }

  /*
   * create index from key -> <op, marker>, and from marker -> key
   * this is useful so that we can insure that we only have one
   * entry for any key that is used. This is needed when doing
   * incremenatl sync of data, and we don't want to run multiple
   * concurrent sync operations for the same bucket shard
   * Also, we should make sure that we don't run concurrent operations on the same key with
   * different ops.
   */
  bool index_key_to_marker(const rgw_obj_key& key, const string& marker, bool is_olh) {
    auto result = key_to_marker.emplace(key, marker);
    if (!result.second) { // exists
      set_need_retry(key);
      return false;
    }
    marker_to_op[marker] = operation{key, is_olh};
    if (is_olh) {
      // prevent other olh ops from starting on this object name
      pending_olh.insert(key.name);
    }
    return true;
  }

  // Returns true when a sync op may start on 'key': no op in flight for the
  // key, and (for olh ops) no other olh op pending on the same object name.
  bool can_do_op(const rgw_obj_key& key, bool is_olh) {
    // serialize olh ops on the same object name
    if (is_olh && pending_olh.count(key.name)) {
      tn->log(20, SSTR("sync of " << key << " waiting for pending olh op"));
      return false;
    }
    return (key_to_marker.find(key) == key_to_marker.end());
  }

  RGWOrderCallCR *allocate_order_control_cr() override {
    return new RGWLastCallerWinsCR(sync_env->cct);
  }
};
+
// Returns true for sync errors that should not be recorded in the error
// log: the source object disappearing (-ENOENT) or access being denied
// (-EPERM) are expected conditions rather than sync failures.
static bool ignore_sync_error(int err) {
  return (err == -ENOENT || err == -EPERM);
}
+
// Syncs a single bucket-index log entry (one object operation) from the
// source zone. T is the marker type used by the caller's tracker
// (rgw_obj_key for full sync, string for incremental); K is the tracker's
// key type. Completed/ignorable entries finish their marker in the
// tracker; hard failures are recorded via the sync error logger.
template <class T, class K>
class RGWBucketSyncSingleEntryCR : public RGWCoroutine {
  RGWDataSyncCtx *sc;
  RGWDataSyncEnv *sync_env;

  rgw_bucket_sync_pipe& sync_pipe;
  rgw_bucket_shard& bs;  // source bucket shard of the pipe

  rgw_obj_key key;
  bool versioned;
  std::optional<uint64_t> versioned_epoch;
  rgw_bucket_entry_owner owner;
  real_time timestamp;
  RGWModifyOp op;            // bilog operation type (add/del/olh-link/...)
  RGWPendingState op_state;  // only CLS_RGW_STATE_COMPLETE entries are processed

  T entry_marker;
  RGWSyncShardMarkerTrack<T, K> *marker_tracker;

  int sync_status;

  stringstream error_ss;  // non-empty iff an error worth logging occurred

  bool error_injection;   // test hook driven by rgw_sync_data_inject_err_probability

  RGWDataSyncModule *data_sync_module;

  rgw_zone_set_entry source_trace_entry;
  rgw_zone_set zones_trace;  // zones that already saw this change (loop prevention)

  RGWSyncTraceNodeRef tn;
  std::string zone_name;  // source zone name, only resolved for pretty_print output

public:
  RGWBucketSyncSingleEntryCR(RGWDataSyncCtx *_sc,
                             rgw_bucket_sync_pipe& _sync_pipe,
                             const rgw_obj_key& _key, bool _versioned,
                             std::optional<uint64_t> _versioned_epoch,
                             real_time& _timestamp,
                             const rgw_bucket_entry_owner& _owner,
                             RGWModifyOp _op, RGWPendingState _op_state,
                             const T& _entry_marker, RGWSyncShardMarkerTrack<T, K> *_marker_tracker, rgw_zone_set& _zones_trace,
                             RGWSyncTraceNodeRef& _tn_parent) : RGWCoroutine(_sc->cct),
                                                sc(_sc), sync_env(_sc->env),
                                                sync_pipe(_sync_pipe), bs(_sync_pipe.info.source_bs),
                                                key(_key), versioned(_versioned), versioned_epoch(_versioned_epoch),
                                                owner(_owner),
                                                timestamp(_timestamp), op(_op),
                                                op_state(_op_state),
                                                entry_marker(_entry_marker),
                                                marker_tracker(_marker_tracker),
                                                sync_status(0){
    stringstream ss;
    ss << bucket_shard_str{bs} << "/" << key << "[" << versioned_epoch.value_or(0) << "]";
    set_description() << "bucket sync single entry (source_zone=" << sc->source_zone << ") b=" << ss.str() << " log_entry=" << entry_marker << " op=" << (int)op << " op_state=" << (int)op_state;
    set_status("init");

    tn = sync_env->sync_tracer->add_node(_tn_parent, "entry", SSTR(key));

    tn->log(20, SSTR("bucket sync single entry (source_zone=" << sc->source_zone << ") b=" << ss.str() << " log_entry=" << entry_marker << " op=" << (int)op << " op_state=" << (int)op_state));
    error_injection = (sync_env->cct->_conf->rgw_sync_data_inject_err_probability > 0);

    data_sync_module = sync_env->sync_module->get_data_handler();

    // record where this change came from, and extend the trace with our
    // own zone so downstream peers won't sync it back to us
    source_trace_entry.zone = sc->source_zone.id;
    source_trace_entry.location_key = _sync_pipe.info.source_bs.bucket.get_key();

    zones_trace = _zones_trace;
    zones_trace.insert(sync_env->svc->zone->get_zone().id, _sync_pipe.info.dest_bucket.get_key());

    if (sc->env->ostr) {
      RGWZone* z;
      if ((z = sc->env->driver->svc()->zone->find_zone(sc->source_zone))) {
        zone_name = z->name;
      }
    }
  }

  int operate(const DoutPrefixProvider *dpp) override {
    reenter(this) {
      /* skip entries that are not complete */
      if (op_state != CLS_RGW_STATE_COMPLETE) {
        goto done;
      }
      tn->set_flag(RGW_SNS_FLAG_ACTIVE);
      // retry loop: repeats while the marker tracker flags this key as
      // needing a retry (e.g. a conflicting op finished meanwhile)
      do {
        yield {
          marker_tracker->reset_need_retry(key);
          if (key.name.empty()) {
            /* shouldn't happen */
            set_status("skipping empty entry");
            tn->log(0, "entry with empty obj name, skipping");
            goto done;
          }
          if (error_injection &&
              rand() % 10000 < cct->_conf->rgw_sync_data_inject_err_probability * 10000.0) {
            tn->log(0, SSTR(": injecting data sync error on key=" << key.name));
            retcode = -EIO;
          } else if (op == CLS_RGW_OP_ADD ||
                     op == CLS_RGW_OP_LINK_OLH) {
            set_status("syncing obj");
            tn->log(5, SSTR("bucket sync: sync obj: " << sc->source_zone << "/" << bs.bucket << "/" << key << "[" << versioned_epoch.value_or(0) << "]"));
            if (versioned_epoch) {
              pretty_print(sc->env, "Syncing object s3://{}/{} version {} in sync from zone {}\n",
                           bs.bucket.name, key, *versioned_epoch, zone_name);
            } else {
              pretty_print(sc->env, "Syncing object s3://{}/{} in sync from zone {}\n",
                           bs.bucket.name, key, zone_name);
            }
            call(data_sync_module->sync_object(dpp, sc, sync_pipe, key, versioned_epoch,
                                               source_trace_entry, &zones_trace));
          } else if (op == CLS_RGW_OP_DEL || op == CLS_RGW_OP_UNLINK_INSTANCE) {
            set_status("removing obj");
            if (versioned_epoch) {
              pretty_print(sc->env, "Deleting object s3://{}/{} version {} in sync from zone {}\n",
                           bs.bucket.name, key, *versioned_epoch, zone_name);
            } else {
              pretty_print(sc->env, "Deleting object s3://{}/{} in sync from zone {}\n",
                           bs.bucket.name, key, zone_name);
            }
            if (op == CLS_RGW_OP_UNLINK_INSTANCE) {
              versioned = true;
            }
            tn->log(10, SSTR("removing obj: " << sc->source_zone << "/" << bs.bucket << "/" << key << "[" << versioned_epoch.value_or(0) << "]"));
            call(data_sync_module->remove_object(dpp, sc, sync_pipe, key, timestamp, versioned, versioned_epoch.value_or(0), &zones_trace));
            // our copy of the object is more recent, continue as if it succeeded
          } else if (op == CLS_RGW_OP_LINK_OLH_DM) {
            set_status("creating delete marker");
            tn->log(10, SSTR("creating delete marker: obj: " << sc->source_zone << "/" << bs.bucket << "/" << key << "[" << versioned_epoch.value_or(0) << "]"));
            call(data_sync_module->create_delete_marker(dpp, sc, sync_pipe, key, timestamp, owner, versioned, versioned_epoch.value_or(0), &zones_trace));
          }
          tn->set_resource_name(SSTR(bucket_str_noinstance(bs.bucket) << "/" << key));
        }
        if (retcode == -ERR_PRECONDITION_FAILED) {
          // destination already has a newer change, or the sync policy
          // rejected the object; treat as a successful no-op
          pretty_print(sc->env, "Skipping object s3://{}/{} in sync from zone {}\n",
                       bs.bucket.name, key, zone_name);
          set_status("Skipping object sync: precondition failed (object contains newer change or policy doesn't allow sync)");
          tn->log(0, "Skipping object sync: precondition failed (object contains newer change or policy doesn't allow sync)");
          retcode = 0;
        }
      } while (marker_tracker->need_retry(key));
      {
        tn->unset_flag(RGW_SNS_FLAG_ACTIVE);
        if (retcode >= 0) {
          tn->log(10, "success");
        } else {
          tn->log(10, SSTR("failed, retcode=" << retcode << " (" << cpp_strerror(-retcode) << ")"));
        }
      }

      if (retcode < 0 && retcode != -ENOENT) {
        set_status() << "failed to sync obj; retcode=" << retcode;
        tn->log(0, SSTR("ERROR: failed to sync object: "
            << bucket_shard_str{bs} << "/" << key.name));
        if (!ignore_sync_error(retcode)) {
          error_ss << bucket_shard_str{bs} << "/" << key.name;
          sync_status = retcode;
        }
      }
      if (!error_ss.str().empty()) {
        yield call(sync_env->error_logger->log_error_cr(dpp, sc->conn->get_remote_id(), "data", error_ss.str(), -retcode, string("failed to sync object") + cpp_strerror(-sync_status)));
      }
done:
      // always try to advance the marker unless a hard error was recorded
      if (sync_status == 0) {
        /* update marker */
        set_status() << "calling marker_tracker->finish(" << entry_marker << ")";
        yield call(marker_tracker->finish(entry_marker));
        sync_status = retcode;
      }
      if (sync_status < 0) {
        return set_cr_error(sync_status);
      }
      return set_cr_done();
    }
    return 0;
  }
};
+
// Coroutine that drives *full* sync of one bucket: lists the remote bucket
// (restricted to the prefixes allowed by the sync-policy rules) and spawns
// an RGWBucketSyncSingleEntryCR per listed object, tracking progress via
// RGWBucketFullSyncMarkerTrack. On success it transitions the bucket sync
// state to Incremental.
class RGWBucketFullSyncCR : public RGWCoroutine {
  RGWDataSyncCtx *sc;
  RGWDataSyncEnv *sync_env;
  rgw_bucket_sync_pipe& sync_pipe;
  rgw_bucket_sync_status& sync_status;
  rgw_bucket_shard& bs;
  boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr;  // optional cross-process lock
  bucket_list_result list_result;
  list<bucket_list_entry>::iterator entries_iter;
  rgw_obj_key list_marker;   // resume position for the remote listing
  bucket_list_entry *entry{nullptr};

  int total_entries{0};      // running count persisted with the marker

  int sync_result{0};        // first error reported by a spawned entry CR

  const rgw_raw_obj& status_obj;
  RGWObjVersionTracker& objv;

  rgw_zone_set zones_trace;

  RGWSyncTraceNodeRef tn;
  RGWBucketFullSyncMarkerTrack marker_tracker;

  // Restricts listing/syncing to the prefixes permitted by the pipe's
  // sync-policy rules, and advances the marker across prefix boundaries.
  struct _prefix_handler {
    RGWBucketSyncFlowManager::pipe_rules_ref rules;
    RGWBucketSyncFlowManager::pipe_rules::prefix_map_t::const_iterator iter;
    std::optional<string> cur_prefix;  // prefix currently being iterated

    void set_rules(RGWBucketSyncFlowManager::pipe_rules_ref& _rules) {
      rules = _rules;
    }

    // Ensure 'marker' lies within some allowed prefix; if not, jump it to
    // the start of the next matching prefix. Returns false when no further
    // prefix applies (i.e. iteration is finished).
    bool revalidate_marker(rgw_obj_key *marker) {
      if (cur_prefix &&
          boost::starts_with(marker->name, *cur_prefix)) {
        return true;
      }
      if (!rules) {
        return false;
      }
      iter = rules->prefix_search(marker->name);
      if (iter == rules->prefix_end()) {
        return false;
      }
      cur_prefix = iter->first;
      marker->name = *cur_prefix;
      marker->instance.clear();
      return true;
    }

    // True when 'key' falls under a prefix handled by the rules.
    bool check_key_handled(const rgw_obj_key& key) {
      if (!rules) {
        return false;
      }
      if (cur_prefix &&
          boost::starts_with(key.name, *cur_prefix)) {
        return true;
      }
      iter = rules->prefix_search(key.name);
      if (iter == rules->prefix_end()) {
        return false;
      }
      cur_prefix = iter->first;
      return boost::starts_with(key.name, iter->first);
    }
  } prefix_handler;

public:
  RGWBucketFullSyncCR(RGWDataSyncCtx *_sc,
                      rgw_bucket_sync_pipe& _sync_pipe,
                      const rgw_raw_obj& status_obj,
                      boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
                      rgw_bucket_sync_status& sync_status,
                      RGWSyncTraceNodeRef tn_parent,
                      RGWObjVersionTracker& objv_tracker)
    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
      sync_pipe(_sync_pipe), sync_status(sync_status),
      bs(_sync_pipe.info.source_bs),
      lease_cr(std::move(lease_cr)), status_obj(status_obj), objv(objv_tracker),
      tn(sync_env->sync_tracer->add_node(tn_parent, "full_sync",
                                         SSTR(bucket_shard_str{bs}))),
      marker_tracker(sc, status_obj, sync_status, tn, objv_tracker)
  {
    zones_trace.insert(sc->source_zone.id, sync_pipe.info.dest_bucket.get_key());
    prefix_handler.set_rules(sync_pipe.get_rules());
  }

  int operate(const DoutPrefixProvider *dpp) override;
};
+
// Full-sync main loop: resume listing from the stored marker, spawn one
// entry coroutine per listed object (bounded by the spawn window), flush
// the marker tracker on exit paths, and on success flip the bucket sync
// state to Incremental. Lease loss at any point aborts with -ECANCELED
// after flushing pending marker updates.
int RGWBucketFullSyncCR::operate(const DoutPrefixProvider *dpp)
{
  reenter(this) {
    list_marker = sync_status.full.position;

    total_entries = sync_status.full.count;
    do {
      if (lease_cr && !lease_cr->is_locked()) {
        tn->log(1, "no lease or lease is lost, abort");
        drain_all();
        yield call(marker_tracker.flush());
        if (retcode < 0) {
          tn->log(0, SSTR("ERROR: bucket full sync marker_tracker.flush() returned retcode=" << retcode));
          return set_cr_error(retcode);
        }
        return set_cr_error(-ECANCELED);
      }
      set_status("listing remote bucket");
      tn->log(20, "listing bucket for full sync");

      if (!prefix_handler.revalidate_marker(&list_marker)) {
        set_status() << "finished iterating over all available prefixes: last marker=" << list_marker;
        tn->log(20, SSTR("finished iterating over all available prefixes: last marker=" << list_marker));
        break;
      }

      yield call(new RGWListRemoteBucketCR(sc, bs, list_marker, &list_result));
      if (retcode < 0 && retcode != -ENOENT) {
        set_status("failed bucket listing, going down");
        drain_all();
        yield spawn(marker_tracker.flush(), true);
        return set_cr_error(retcode);
      }
      if (list_result.entries.size() > 0) {
        tn->set_flag(RGW_SNS_FLAG_ACTIVE); /* actually have entries to sync */
      }
      entries_iter = list_result.entries.begin();
      for (; entries_iter != list_result.entries.end(); ++entries_iter) {
        if (lease_cr && !lease_cr->is_locked()) {
          drain_all();
          yield call(marker_tracker.flush());
          tn->log(1, "no lease or lease is lost, abort");
          if (retcode < 0) {
            tn->log(0, SSTR("ERROR: bucket full sync marker_tracker.flush() returned retcode=" << retcode));
            return set_cr_error(retcode);
          }
          return set_cr_error(-ECANCELED);
        }
        tn->log(20, SSTR("[full sync] syncing object: "
            << bucket_shard_str{bs} << "/" << entries_iter->key));
        entry = &(*entries_iter);
        // advance the local marker even for skipped entries
        list_marker = entries_iter->key;
        if (!prefix_handler.check_key_handled(entries_iter->key)) {
          set_status() << "skipping entry due to policy rules: " << entries_iter->key;
          tn->log(20, SSTR("skipping entry due to policy rules: " << entries_iter->key));
          continue;
        }
        total_entries++;
        if (!marker_tracker.start(entry->key, total_entries, real_time())) {
          tn->log(0, SSTR("ERROR: cannot start syncing " << entry->key << ". Duplicate entry?"));
        } else {
          using SyncCR = RGWBucketSyncSingleEntryCR<rgw_obj_key, rgw_obj_key>;
          yield spawn(new SyncCR(sc, sync_pipe, entry->key,
                                 false, /* versioned, only matters for object removal */
                                 entry->versioned_epoch, entry->mtime,
                                 entry->owner, entry->get_modify_op(), CLS_RGW_STATE_COMPLETE,
                                 entry->key, &marker_tracker, zones_trace, tn),
                      false);
        }
        // cap concurrency at the configured spawn window, collecting child
        // results as they finish
        drain_with_cb(sc->lcc.adj_concurrency(cct->_conf->rgw_bucket_sync_spawn_window),
                      [&](uint64_t stack_id, int ret) {
                if (ret < 0) {
                  tn->log(10, "a sync operation returned error");
                  sync_result = ret;
                }
                return 0;
              });
      }
    } while (list_result.is_truncated && sync_result == 0);
    set_status("done iterating over all objects");

    /* wait for all operations to complete */
    drain_all_cb([&](uint64_t stack_id, int ret) {
      if (ret < 0) {
        tn->log(10, "a sync operation returned error");
        sync_result = ret;
      }
      return 0;
    });
    tn->unset_flag(RGW_SNS_FLAG_ACTIVE);
    if (lease_cr && !lease_cr->is_locked()) {
      tn->log(1, "no lease or lease is lost, abort");
      yield call(marker_tracker.flush());
      if (retcode < 0) {
        tn->log(0, SSTR("ERROR: bucket full sync marker_tracker.flush() returned retcode=" << retcode));
        return set_cr_error(retcode);
      }
      return set_cr_error(-ECANCELED);
    }
    yield call(marker_tracker.flush());
    if (retcode < 0) {
      tn->log(0, SSTR("ERROR: bucket full sync marker_tracker.flush() returned retcode=" << retcode));
      return set_cr_error(retcode);
    }
    /* update sync state to incremental */
    if (sync_result == 0) {
      sync_status.state = BucketSyncState::Incremental;
      tn->log(5, SSTR("set bucket state=" << sync_status.state));
      yield call(new RGWSimpleRadosWriteCR<rgw_bucket_sync_status>(
              dpp, sync_env->driver, status_obj, sync_status, &objv));
      tn->log(5, SSTR("bucket status objv=" << objv));
    } else {
      tn->log(10, SSTR("backing out with sync_status=" << sync_result));
    }
    if (retcode < 0 && sync_result == 0) { /* actually tried to set incremental state and failed */
      tn->log(0, SSTR("ERROR: failed to set sync state on bucket "
          << bucket_shard_str{bs} << " retcode=" << retcode));
      return set_cr_error(retcode);
    }
    if (sync_result < 0) {
      return set_cr_error(sync_result);
    }
    return set_cr_done();
  }
  return 0;
}
+
+static bool has_olh_epoch(RGWModifyOp op) {
+ return op == CLS_RGW_OP_LINK_OLH || op == CLS_RGW_OP_UNLINK_INSTANCE;
+}
+
// Marks one bucket shard as finished with the current incremental-sync log
// generation. When every shard is done, advances the bucket to the next
// generation described by 'next_log'. Uses a read-modify-write loop on the
// bucket sync status object, retrying on -ECANCELED version conflicts.
class RGWBucketShardIsDoneCR : public RGWCoroutine {
  RGWDataSyncCtx *sc;
  RGWDataSyncEnv *sync_env;
  rgw_bucket_sync_status bucket_status;  // re-read on every retry iteration
  const rgw_raw_obj& bucket_status_obj;
  const int shard_id;
  RGWObjVersionTracker objv_tracker;
  const next_bilog_result& next_log;  // generation/shard-count to advance to
  const uint64_t generation;          // generation this shard just completed

public:
  RGWBucketShardIsDoneCR(RGWDataSyncCtx *_sc, const rgw_raw_obj& _bucket_status_obj,
                         int _shard_id, const next_bilog_result& _next_log, const uint64_t _gen)
    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
      bucket_status_obj(_bucket_status_obj),
      shard_id(_shard_id), next_log(_next_log), generation(_gen) {}

  int operate(const DoutPrefixProvider* dpp) override
  {
    reenter(this) {
      do {
        // read bucket sync status
        objv_tracker.clear();
        using ReadCR = RGWSimpleRadosReadCR<rgw_bucket_sync_status>;
        yield call(new ReadCR(dpp, sync_env->driver,
                              bucket_status_obj, &bucket_status, false, &objv_tracker));
        if (retcode < 0) {
          ldpp_dout(dpp, 20) << "failed to read bucket shard status: "
              << cpp_strerror(retcode) << dendl;
          return set_cr_error(retcode);
        }

        if (bucket_status.state != BucketSyncState::Incremental) {
          // exit with success to avoid stale shard being
          // retried in error repo if we lost a race
          ldpp_dout(dpp, 20) << "RGWBucketShardIsDoneCR found sync state = " << bucket_status.state << dendl;
          return set_cr_done();
        }

        if (bucket_status.incremental_gen != generation) {
          // exit with success to avoid stale shard being
          // retried in error repo if we lost a race
          ldpp_dout(dpp, 20) << "RGWBucketShardIsDoneCR expected gen: " << generation
              << ", got: " << bucket_status.incremental_gen << dendl;
          return set_cr_done();
        }

        yield {
          // update bucket_status after a shard is done with current gen
          auto& done = bucket_status.shards_done_with_gen;
          done[shard_id] = true;

          // increment gen if all shards are already done with current gen
          if (std::all_of(done.begin(), done.end(),
            [] (const bool done){return done; } )) {
            bucket_status.incremental_gen = next_log.generation;
            done.clear();
            done.resize(next_log.num_shards, false);
          }
          ldpp_dout(dpp, 20) << "bucket status incremental gen is " << bucket_status.incremental_gen << dendl;
          using WriteCR = RGWSimpleRadosWriteCR<rgw_bucket_sync_status>;
          call(new WriteCR(dpp, sync_env->driver,
                           bucket_status_obj, bucket_status, &objv_tracker, false));
        }
        if (retcode < 0 && retcode != -ECANCELED) {
          ldpp_dout(dpp, 20) << "failed to write bucket sync status: " << cpp_strerror(retcode) << dendl;
          return set_cr_error(retcode);
        } else if (retcode >= 0) {
          return set_cr_done();
        }
      } while (retcode == -ECANCELED);  // version conflict: re-read and retry
    }
    return 0;
  }
};
+
// Coroutine driving *incremental* sync of one bucket shard: repeatedly
// lists the shard's bilog from the stored position, squashes redundant
// entries per object, and spawns an RGWBucketSyncSingleEntryCR per
// surviving entry. When the log is exhausted and a next generation exists,
// marks this shard done via RGWBucketShardIsDoneCR.
class RGWBucketShardIncrementalSyncCR : public RGWCoroutine {
  RGWDataSyncCtx *sc;
  RGWDataSyncEnv *sync_env;
  rgw_bucket_sync_pipe& sync_pipe;
  RGWBucketSyncFlowManager::pipe_rules_ref rules;  // sync-policy prefix rules
  rgw_bucket_shard& bs;
  const rgw_raw_obj& bucket_status_obj;
  boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr;
  bilog_list_result extended_result;   // raw listing result from the remote
  list<rgw_bi_log_entry> list_result;  // entries moved out of extended_result
  int next_num_shards;  // shard count of the next log generation, if any
  uint64_t next_gen;    // next log generation id, if any
  bool truncated;

  list<rgw_bi_log_entry>::iterator entries_iter, entries_end;
  // per (object, instance): latest timestamp/op — used to skip superseded entries
  map<pair<string, string>, pair<real_time, RGWModifyOp> > squash_map;
  rgw_bucket_shard_sync_info& sync_info;
  uint64_t generation;
  rgw_obj_key key;
  rgw_bi_log_entry *entry{nullptr};
  bool updated_status{false};
  rgw_zone_id zone_id;  // our own zone, for zones_trace loop detection
  string target_location_key;

  string cur_id;  // current bilog entry id with any shard prefix stripped

  int sync_status{0};
  bool syncstopped{false};  // a SYNCSTOP entry was seen in the log

  RGWSyncTraceNodeRef tn;
  RGWBucketIncSyncShardMarkerTrack marker_tracker;

public:
  RGWBucketShardIncrementalSyncCR(RGWDataSyncCtx *_sc,
                                  rgw_bucket_sync_pipe& _sync_pipe,
                                  const std::string& shard_status_oid,
                                  const rgw_raw_obj& _bucket_status_obj,
                                  boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
                                  rgw_bucket_shard_sync_info& sync_info,
                                  uint64_t generation,
                                  RGWSyncTraceNodeRef& _tn_parent,
                                  RGWObjVersionTracker& objv_tracker,
                                  ceph::real_time* stable_timestamp)
    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
      sync_pipe(_sync_pipe), bs(_sync_pipe.info.source_bs),
      bucket_status_obj(_bucket_status_obj), lease_cr(std::move(lease_cr)),
      sync_info(sync_info), generation(generation), zone_id(sync_env->svc->zone->get_zone().id),
      tn(sync_env->sync_tracer->add_node(_tn_parent, "inc_sync",
                                         SSTR(bucket_shard_str{bs}))),
      marker_tracker(sc, shard_status_oid, sync_info.inc_marker, tn,
                     objv_tracker, stable_timestamp)
  {
    set_description() << "bucket shard incremental sync bucket="
        << bucket_shard_str{bs};
    set_status("init");
    rules = sync_pipe.get_rules();
    target_location_key = sync_pipe.info.dest_bucket.get_key();
  }

  // True when 'key' falls under a prefix permitted by the sync-policy rules.
  bool check_key_handled(const rgw_obj_key& key) {
    if (!rules) {
      return false;
    }
    auto iter = rules->prefix_search(key.name);
    if (iter == rules->prefix_end()) {
      return false;
    }
    return boost::starts_with(key.name, iter->first);
  }

  int operate(const DoutPrefixProvider *dpp) override;
};
+
+int RGWBucketShardIncrementalSyncCR::operate(const DoutPrefixProvider *dpp)
+{
+ int ret;
+ reenter(this) {
+ do {
+ if (lease_cr && !lease_cr->is_locked()) {
+ tn->log(1, "no lease or lease is lost, abort");
+ drain_all();
+ yield call(marker_tracker.flush());
+ if (retcode < 0) {
+ tn->log(0, SSTR("ERROR: incremental sync marker_tracker.flush() returned retcode=" << retcode));
+ return set_cr_error(retcode);
+ }
+ return set_cr_error(-ECANCELED);
+ }
+ tn->log(20, SSTR("listing bilog for incremental sync; position=" << sync_info.inc_marker.position));
+ set_status() << "listing bilog; position=" << sync_info.inc_marker.position;
+ yield call(new RGWListBucketIndexLogCR(sc, bs, sync_info.inc_marker.position, generation, &extended_result));
+ if (retcode < 0 && retcode != -ENOENT) {
+ /* wait for all operations to complete */
+ drain_all();
+ yield spawn(marker_tracker.flush(), true);
+ return set_cr_error(retcode);
+ }
+ list_result = std::move(extended_result.entries);
+ truncated = extended_result.truncated;
+ if (extended_result.next_log) {
+ next_gen = extended_result.next_log->generation;
+ next_num_shards = extended_result.next_log->num_shards;
+ }
+
+ squash_map.clear();
+ entries_iter = list_result.begin();
+ entries_end = list_result.end();
+ for (; entries_iter != entries_end; ++entries_iter) {
+ auto e = *entries_iter;
+ if (e.op == RGWModifyOp::CLS_RGW_OP_SYNCSTOP) {
+ ldpp_dout(dpp, 20) << "syncstop at: " << e.timestamp << ". marker: " << e.id << dendl;
+ syncstopped = true;
+ entries_end = std::next(entries_iter); // stop after this entry
+ break;
+ }
+ if (e.op == RGWModifyOp::CLS_RGW_OP_RESYNC) {
+ ldpp_dout(dpp, 20) << "syncstart at: " << e.timestamp << ". marker: " << e.id << dendl;
+ continue;
+ }
+ if (e.op == CLS_RGW_OP_CANCEL) {
+ continue;
+ }
+ if (e.state != CLS_RGW_STATE_COMPLETE) {
+ continue;
+ }
+ if (e.zones_trace.exists(zone_id.id, target_location_key)) {
+ continue;
+ }
+ auto& squash_entry = squash_map[make_pair(e.object, e.instance)];
+ // don't squash over olh entries - we need to apply their olh_epoch
+ if (has_olh_epoch(squash_entry.second) && !has_olh_epoch(e.op)) {
+ continue;
+ }
+ if (squash_entry.first <= e.timestamp) {
+ squash_entry = make_pair<>(e.timestamp, e.op);
+ }
+ }
+
+ entries_iter = list_result.begin();
+ for (; entries_iter != entries_end; ++entries_iter) {
+ if (lease_cr && !lease_cr->is_locked()) {
+ tn->log(1, "no lease or lease is lost, abort");
+ drain_all();
+ yield call(marker_tracker.flush());
+ if (retcode < 0) {
+ tn->log(0, SSTR("ERROR: incremental sync marker_tracker.flush() returned retcode=" << retcode));
+ return set_cr_error(retcode);
+ }
+ return set_cr_error(-ECANCELED);
+ }
+ entry = &(*entries_iter);
+ {
+ ssize_t p = entry->id.find('#'); /* entries might have explicit shard info in them, e.g., 6#00000000004.94.3 */
+ if (p < 0) {
+ cur_id = entry->id;
+ } else {
+ cur_id = entry->id.substr(p + 1);
+ }
+ }
+ sync_info.inc_marker.position = cur_id;
+
+ if (entry->op == RGWModifyOp::CLS_RGW_OP_SYNCSTOP || entry->op == RGWModifyOp::CLS_RGW_OP_RESYNC) {
+ ldpp_dout(dpp, 20) << "detected syncstop or resync on " << entries_iter->timestamp << ", skipping entry" << dendl;
+ marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp);
+ continue;
+ }
+
+ if (!key.set(rgw_obj_index_key{entry->object, entry->instance})) {
+ set_status() << "parse_raw_oid() on " << entry->object << " returned false, skipping entry";
+ tn->log(20, SSTR("parse_raw_oid() on " << entry->object << " returned false, skipping entry"));
+ marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp);
+ continue;
+ }
+
+ tn->log(20, SSTR("parsed entry: id=" << cur_id << " iter->object=" << entry->object << " iter->instance=" << entry->instance << " name=" << key.name << " instance=" << key.instance << " ns=" << key.ns));
+
+ if (!key.ns.empty()) {
+ set_status() << "skipping entry in namespace: " << entry->object;
+ tn->log(20, SSTR("skipping entry in namespace: " << entry->object));
+ marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp);
+ continue;
+ }
+
+ if (!check_key_handled(key)) {
+ set_status() << "skipping entry due to policy rules: " << entry->object;
+ tn->log(20, SSTR("skipping entry due to policy rules: " << entry->object));
+ marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp);
+ continue;
+ }
+
+ set_status() << "got entry.id=" << cur_id << " key=" << key << " op=" << (int)entry->op;
+ if (entry->op == CLS_RGW_OP_CANCEL) {
+ set_status() << "canceled operation, skipping";
+ tn->log(20, SSTR("skipping object: "
+ << bucket_shard_str{bs} << "/" << key << ": canceled operation"));
+ marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp);
+ continue;
+ }
+ if (entry->state != CLS_RGW_STATE_COMPLETE) {
+ set_status() << "non-complete operation, skipping";
+ tn->log(20, SSTR("skipping object: "
+ << bucket_shard_str{bs} << "/" << key << ": non-complete operation"));
+ marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp);
+ continue;
+ }
+ if (entry->zones_trace.exists(zone_id.id, target_location_key)) {
+ set_status() << "redundant operation, skipping";
+ tn->log(20, SSTR("skipping object: "
+ <<bucket_shard_str{bs} <<"/"<<key<<": redundant operation"));
+ marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp);
+ continue;
+ }
+ if (make_pair<>(entry->timestamp, entry->op) != squash_map[make_pair(entry->object, entry->instance)]) {
+ set_status() << "squashed operation, skipping";
+ tn->log(20, SSTR("skipping object: "
+ << bucket_shard_str{bs} << "/" << key << ": squashed operation"));
+ marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp);
+ continue;
+ }
+ tn->set_flag(RGW_SNS_FLAG_ACTIVE);
+ tn->log(20, SSTR("syncing object: "
+ << bucket_shard_str{bs} << "/" << key));
+ updated_status = false;
+ while (!marker_tracker.can_do_op(key, has_olh_epoch(entry->op))) {
+ if (!updated_status) {
+ set_status() << "can't do op, conflicting inflight operation";
+ updated_status = true;
+ }
+ tn->log(5, SSTR("can't do op on key=" << key << " need to wait for conflicting operation to complete"));
+ yield wait_for_child();
+ bool again = true;
+ while (again) {
+ again = collect(&ret, nullptr);
+ if (ret < 0) {
+ tn->log(0, SSTR("ERROR: a child operation returned error (ret=" << ret << ")"));
+ sync_status = ret;
+ /* we have reported this error */
+ }
+ }
+ if (sync_status != 0)
+ break;
+ }
+ if (sync_status != 0) {
+ /* get error, stop */
+ break;
+ }
+ if (!marker_tracker.index_key_to_marker(key, cur_id, has_olh_epoch(entry->op))) {
+ set_status() << "can't do op, sync already in progress for object";
+ tn->log(20, SSTR("skipping sync of entry: " << cur_id << ":" << key << " sync already in progress for object"));
+ marker_tracker.try_update_high_marker(cur_id, 0, entry->timestamp);
+ continue;
+ }
+ // yield {
+ set_status() << "start object sync";
+ if (!marker_tracker.start(cur_id, 0, entry->timestamp)) {
+ tn->log(0, SSTR("ERROR: cannot start syncing " << cur_id << ". Duplicate entry?"));
+ } else {
+ std::optional<uint64_t> versioned_epoch;
+ rgw_bucket_entry_owner owner(entry->owner, entry->owner_display_name);
+ if (entry->ver.pool < 0) {
+ versioned_epoch = entry->ver.epoch;
+ }
+ tn->log(20, SSTR("entry->timestamp=" << entry->timestamp));
+ using SyncCR = RGWBucketSyncSingleEntryCR<string, rgw_obj_key>;
+ spawn(new SyncCR(sc, sync_pipe, key,
+ entry->is_versioned(), versioned_epoch,
+ entry->timestamp, owner, entry->op, entry->state,
+ cur_id, &marker_tracker, entry->zones_trace, tn),
+ false);
+ }
+ // }
+ drain_with_cb(sc->lcc.adj_concurrency(cct->_conf->rgw_bucket_sync_spawn_window),
+ [&](uint64_t stack_id, int ret) {
+ if (ret < 0) {
+ tn->log(10, "a sync operation returned error");
+ sync_status = ret;
+ }
+ return 0;
+ });
+ }
+
+ } while (!list_result.empty() && sync_status == 0 && !syncstopped);
+
+ drain_all_cb([&](uint64_t stack_id, int ret) {
+ if (ret < 0) {
+ tn->log(10, "a sync operation returned error");
+ sync_status = ret;
+ }
+ return 0;
+ });
+ tn->unset_flag(RGW_SNS_FLAG_ACTIVE);
+
+ if (syncstopped) {
+ // transition to StateStopped in RGWSyncBucketShardCR. if sync is
+ // still disabled, we'll delete the sync status object. otherwise we'll
+ // restart full sync to catch any changes that happened while sync was
+ // disabled
+ sync_info.state = rgw_bucket_shard_sync_info::StateStopped;
+ return set_cr_done();
+ }
+
+ yield call(marker_tracker.flush());
+ if (retcode < 0) {
+ tn->log(0, SSTR("ERROR: incremental sync marker_tracker.flush() returned retcode=" << retcode));
+ return set_cr_error(retcode);
+ }
+ if (sync_status < 0) {
+ tn->log(10, SSTR("backing out with sync_status=" << sync_status));
+ return set_cr_error(sync_status);
+ }
+
+ if (!truncated && extended_result.next_log) {
+ yield call(new RGWBucketShardIsDoneCR(sc, bucket_status_obj, bs.shard_id, *extended_result.next_log, generation));
+ if (retcode < 0) {
+ ldout(cct, 20) << "failed to update bucket sync status: "
+ << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+ yield {
+ // delete the shard status object
+ auto status_obj = sync_env->svc->rados->obj(marker_tracker.get_obj());
+ retcode = status_obj.open(dpp);
+ if (retcode < 0) {
+ return set_cr_error(retcode);
+ }
+ call(new RGWRadosRemoveOidCR(sync_env->driver, std::move(status_obj)));
+ if (retcode < 0) {
+ ldpp_dout(dpp, 20) << "failed to remove shard status object: " << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+ }
+ }
+
+ return set_cr_done();
+ }
+ return 0;
+}
+
+class RGWGetBucketPeersCR : public RGWCoroutine {
+ RGWDataSyncEnv *sync_env;
+
+ std::optional<rgw_bucket> target_bucket;
+ std::optional<rgw_zone_id> source_zone;
+ std::optional<rgw_bucket> source_bucket;
+
+ rgw_sync_pipe_info_set *pipes;
+ map<rgw_bucket, all_bucket_info> buckets_info;
+ map<rgw_bucket, all_bucket_info>::iterator siiter;
+ std::optional<all_bucket_info> target_bucket_info;
+ std::optional<all_bucket_info> source_bucket_info;
+
+ rgw_sync_pipe_info_set::iterator siter;
+
+ std::shared_ptr<rgw_bucket_get_sync_policy_result> source_policy;
+ std::shared_ptr<rgw_bucket_get_sync_policy_result> target_policy;
+
+ RGWSyncTraceNodeRef tn;
+
+ using pipe_const_iter = map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set>::const_iterator;
+
+ static pair<pipe_const_iter, pipe_const_iter> get_pipe_iters(const map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set>& m, std::optional<rgw_zone_id> zone) {
+ if (!zone) {
+ return { m.begin(), m.end() };
+ }
+
+ auto b = m.find(*zone);
+ if (b == m.end()) {
+ return { b, b };
+ }
+ return { b, std::next(b) };
+ }
+
+ void filter_sources(std::optional<rgw_zone_id> source_zone,
+ std::optional<rgw_bucket> source_bucket,
+ const map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set>& all_sources,
+ rgw_sync_pipe_info_set *result) {
+ ldpp_dout(sync_env->dpp, 20) << __func__ << ": source_zone=" << source_zone.value_or(rgw_zone_id("*")).id
+ << " source_bucket=" << source_bucket.value_or(rgw_bucket())
+ << " all_sources.size()=" << all_sources.size() << dendl;
+ auto iters = get_pipe_iters(all_sources, source_zone);
+ for (auto i = iters.first; i != iters.second; ++i) {
+ for (auto& handler : i->second) {
+ if (!handler.specific()) {
+ ldpp_dout(sync_env->dpp, 20) << __func__ << ": pipe_handler=" << handler << ": skipping" << dendl;
+ continue;
+ }
+ if (source_bucket &&
+ !source_bucket->match(*handler.source.bucket)) {
+ continue;
+ }
+ ldpp_dout(sync_env->dpp, 20) << __func__ << ": pipe_handler=" << handler << ": adding" << dendl;
+ result->insert(handler, source_bucket_info, target_bucket_info);
+ }
+ }
+ }
+
+ void filter_targets(std::optional<rgw_zone_id> target_zone,
+ std::optional<rgw_bucket> target_bucket,
+ const map<rgw_zone_id, RGWBucketSyncFlowManager::pipe_set>& all_targets,
+ rgw_sync_pipe_info_set *result) {
+ ldpp_dout(sync_env->dpp, 20) << __func__ << ": target_zone=" << source_zone.value_or(rgw_zone_id("*")).id
+ << " target_bucket=" << source_bucket.value_or(rgw_bucket())
+ << " all_targets.size()=" << all_targets.size() << dendl;
+ auto iters = get_pipe_iters(all_targets, target_zone);
+ for (auto i = iters.first; i != iters.second; ++i) {
+ for (auto& handler : i->second) {
+ if (target_bucket &&
+ handler.dest.bucket &&
+ !target_bucket->match(*handler.dest.bucket)) {
+ ldpp_dout(sync_env->dpp, 20) << __func__ << ": pipe_handler=" << handler << ": skipping" << dendl;
+ continue;
+ }
+ ldpp_dout(sync_env->dpp, 20) << __func__ << ": pipe_handler=" << handler << ": adding" << dendl;
+ result->insert(handler, source_bucket_info, target_bucket_info);
+ }
+ }
+ }
+
+ void update_from_target_bucket_policy();
+ void update_from_source_bucket_policy();
+
+ struct GetHintTargets : public RGWGenericAsyncCR::Action {
+ RGWDataSyncEnv *sync_env;
+ rgw_bucket source_bucket;
+ std::set<rgw_bucket> targets;
+
+ GetHintTargets(RGWDataSyncEnv *_sync_env,
+ const rgw_bucket& _source_bucket) : sync_env(_sync_env),
+ source_bucket(_source_bucket) {}
+ int operate() override {
+ int r = sync_env->svc->bucket_sync->get_bucket_sync_hints(sync_env->dpp,
+ source_bucket,
+ nullptr,
+ &targets,
+ null_yield);
+ if (r < 0) {
+ ldpp_dout(sync_env->dpp, 0) << "ERROR: " << __func__ << "(): failed to fetch bucket sync hints for bucket=" << source_bucket << dendl;
+ return r;
+ }
+
+ return 0;
+ }
+ };
+
+ std::shared_ptr<GetHintTargets> get_hint_targets_action;
+ std::set<rgw_bucket>::iterator hiter;
+
+public:
+ RGWGetBucketPeersCR(RGWDataSyncEnv *_sync_env,
+ std::optional<rgw_bucket> _target_bucket,
+ std::optional<rgw_zone_id> _source_zone,
+ std::optional<rgw_bucket> _source_bucket,
+ rgw_sync_pipe_info_set *_pipes,
+ const RGWSyncTraceNodeRef& _tn_parent)
+ : RGWCoroutine(_sync_env->cct),
+ sync_env(_sync_env),
+ target_bucket(_target_bucket),
+ source_zone(_source_zone),
+ source_bucket(_source_bucket),
+ pipes(_pipes),
+ tn(sync_env->sync_tracer->add_node(_tn_parent, "get_bucket_peers",
+ SSTR( "target=" << target_bucket.value_or(rgw_bucket())
+ << ":source=" << target_bucket.value_or(rgw_bucket())
+ << ":source_zone=" << source_zone.value_or(rgw_zone_id("*")).id))) {
+ }
+
+ int operate(const DoutPrefixProvider *dpp) override;
+};
+
+std::ostream& operator<<(std::ostream& out, std::optional<rgw_bucket_shard>& bs) {
+ if (!bs) {
+ out << "*";
+ } else {
+ out << *bs;
+ }
+ return out;
+}
+
+static RGWCoroutine* sync_bucket_shard_cr(RGWDataSyncCtx* sc,
+ boost::intrusive_ptr<const RGWContinuousLeaseCR> lease,
+ const rgw_bucket_sync_pair_info& sync_pair,
+ std::optional<uint64_t> gen,
+ const RGWSyncTraceNodeRef& tn,
+ ceph::real_time* progress);
+
// Syncs a single source bucket shard into every destination that is
// configured to pull from it. 'gen' optionally pins the bilog generation to
// sync; 'progress' (may be null) receives the minimum timestamp synced
// across all matching pipes. The lease keeps the caller's data-sync lock
// alive while the per-pipe coroutines run.
RGWRunBucketSourcesSyncCR::RGWRunBucketSourcesSyncCR(RGWDataSyncCtx *_sc,
                                                     boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
                                                     const rgw_bucket_shard& source_bs,
                                                     const RGWSyncTraceNodeRef& _tn_parent,
                                                     std::optional<uint64_t> gen,
                                                     ceph::real_time* progress)
  : RGWCoroutine(_sc->env->cct), sc(_sc), sync_env(_sc->env),
    lease_cr(std::move(lease_cr)),
    tn(sync_env->sync_tracer->add_node(
         _tn_parent, "bucket_sync_sources",
         SSTR( "source=" << source_bs << ":source_zone=" << sc->source_zone))),
    progress(progress),
    gen(gen)
{
  // dest_bucket/handler are filled in per pipe inside operate()
  sync_pair.source_bs = source_bs;
}
+
// Discover all sync pipes sourced from this bucket shard, then spawn one
// bucket-shard sync coroutine per pipe (bounded by the spawn window) and
// wait for them all to finish.
int RGWRunBucketSourcesSyncCR::operate(const DoutPrefixProvider *dpp)
{
  reenter(this) {
    // find all pipes whose source matches this bucket (any target)
    yield call(new RGWGetBucketPeersCR(sync_env, std::nullopt, sc->source_zone,
                                       sync_pair.source_bs.bucket, &pipes, tn));
    if (retcode < 0 && retcode != -ENOENT) {
      tn->log(0, SSTR("ERROR: failed to read sync status for bucket. error: " << retcode));
      return set_cr_error(retcode);
    }

    ldpp_dout(dpp, 20) << __func__ << "(): requested source_bs=" << sync_pair.source_bs << dendl;

    if (pipes.empty()) {
      // nothing configured to pull from this bucket; success
      ldpp_dout(dpp, 20) << __func__ << "(): no relevant sync pipes found" << dendl;
      return set_cr_done();
    }

    // one progress slot per pipe; the overall progress reported to the
    // caller is the minimum over all pipes
    shard_progress.resize(pipes.size());
    cur_shard_progress = shard_progress.begin();

    for (siter = pipes.begin(); siter != pipes.end(); ++siter, ++cur_shard_progress) {
      ldpp_dout(dpp, 20) << __func__ << "(): sync pipe=" << *siter << dendl;

      sync_pair.dest_bucket = siter->target.get_bucket();
      sync_pair.handler = siter->handler;

      ldpp_dout(dpp, 20) << __func__ << "(): sync_pair=" << sync_pair << dendl;

      // spawn a shard-sync coroutine for this pipe, throttled by the
      // configured spawn window; child errors are propagated via the callback
      yield_spawn_window(sync_bucket_shard_cr(sc, lease_cr, sync_pair,
                                              gen, tn, &*cur_shard_progress),
                         sc->lcc.adj_concurrency(cct->_conf->rgw_bucket_sync_spawn_window),
                         [&](uint64_t stack_id, int ret) {
                           if (ret < 0) {
                             tn->log(10, SSTR("ERROR: a sync operation returned error: " << ret));
                           }
                           return ret;
                         });
    }
    // wait for all remaining children; an error return from the callback
    // propagates as this coroutine's result
    drain_all_cb([&](uint64_t stack_id, int ret) {
                   if (ret < 0) {
                     tn->log(10, SSTR("a sync operation returned error: " << ret));
                   }
                   return ret;
                 });
    if (progress) {
      // non-empty by construction: pipes.empty() returned early above
      *progress = *std::min_element(shard_progress.begin(), shard_progress.end());
    }
    return set_cr_done();
  }

  return 0;
}
+
// Reads a bucket's instance info (and optionally its attrs) from the local
// cluster; if not present locally, triggers metadata sync of the bucket
// instance from the master zone and retries.
class RGWSyncGetBucketInfoCR : public RGWCoroutine {
  RGWDataSyncEnv *sync_env;
  rgw_bucket bucket;
  RGWBucketInfo *pbucket_info;   // out: bucket instance info (may be null)
  map<string, bufferlist> *pattrs;  // out: bucket attrs (may be null)
  RGWMetaSyncEnv meta_sync_env;  // used only when metadata sync is needed

  RGWSyncTraceNodeRef tn;

public:
  RGWSyncGetBucketInfoCR(RGWDataSyncEnv *_sync_env,
                         const rgw_bucket& _bucket,
                         RGWBucketInfo *_pbucket_info,
                         map<string, bufferlist> *_pattrs,
                         const RGWSyncTraceNodeRef& _tn_parent)
    : RGWCoroutine(_sync_env->cct),
      sync_env(_sync_env),
      bucket(_bucket),
      pbucket_info(_pbucket_info),
      pattrs(_pattrs),
      tn(sync_env->sync_tracer->add_node(_tn_parent, "get_bucket_info",
                                         SSTR(bucket))) {
  }

  int operate(const DoutPrefixProvider *dpp) override;
};
+
+int RGWSyncGetBucketInfoCR::operate(const DoutPrefixProvider *dpp)
+{
+ reenter(this) {
+ yield call(new RGWGetBucketInstanceInfoCR(sync_env->async_rados, sync_env->driver, bucket, pbucket_info, pattrs, dpp));
+ if (retcode == -ENOENT) {
+ /* bucket instance info has not been synced in yet, fetch it now */
+ yield {
+ tn->log(10, SSTR("no local info for bucket:" << ": fetching metadata"));
+ string raw_key = string("bucket.instance:") + bucket.get_key();
+
+ meta_sync_env.init(dpp, cct, sync_env->driver, sync_env->svc->zone->get_master_conn(), sync_env->async_rados,
+ sync_env->http_manager, sync_env->error_logger, sync_env->sync_tracer);
+
+ call(new RGWMetaSyncSingleEntryCR(&meta_sync_env, raw_key,
+ string() /* no marker */,
+ MDLOG_STATUS_COMPLETE,
+ NULL /* no marker tracker */,
+ tn));
+ }
+ if (retcode < 0) {
+ tn->log(0, SSTR("ERROR: failed to fetch bucket instance info for " << bucket_str{bucket}));
+ return set_cr_error(retcode);
+ }
+
+ yield call(new RGWGetBucketInstanceInfoCR(sync_env->async_rados, sync_env->driver, bucket, pbucket_info, pattrs, dpp));
+ }
+ if (retcode < 0) {
+ tn->log(0, SSTR("ERROR: failed to retrieve bucket info for bucket=" << bucket_str{bucket}));
+ return set_cr_error(retcode);
+ }
+
+ return set_cr_done();
+ }
+
+ return 0;
+}
+
// After fetching the target bucket's sync policy, add its matching source
// pipes to *pipes and register any endpoint buckets whose info is still
// missing so operate() can fetch them later.
void RGWGetBucketPeersCR::update_from_target_bucket_policy()
{
  if (!target_policy ||
      !target_policy->policy_handler ||
      !pipes) {
    return;
  }

  auto handler = target_policy->policy_handler.get();

  filter_sources(source_zone,
                 source_bucket,
                 handler->get_sources(),
                 pipes);

  // queue placeholder entries for buckets lacking info; emplace() is a
  // no-op for keys already present
  for (siter = pipes->begin(); siter != pipes->end(); ++siter) {
    if (!siter->source.has_bucket_info()) {
      buckets_info.emplace(siter->source.get_bucket(), all_bucket_info());
    }
    if (!siter->target.has_bucket_info()) {
      buckets_info.emplace(siter->target.get_bucket(), all_bucket_info());
    }
  }
}
+
// After fetching the source bucket's sync policy, add its pipes that target
// the local zone to *pipes and register any endpoint buckets whose info is
// still missing so operate() can fetch them later.
void RGWGetBucketPeersCR::update_from_source_bucket_policy()
{
  if (!source_policy ||
      !source_policy->policy_handler ||
      !pipes) {
    return;
  }

  auto handler = source_policy->policy_handler.get();

  // only targets in the local zone are relevant for this data-sync instance
  filter_targets(sync_env->svc->zone->get_zone().id,
                 target_bucket,
                 handler->get_targets(),
                 pipes);

  // queue placeholder entries for buckets lacking info; emplace() is a
  // no-op for keys already present
  for (siter = pipes->begin(); siter != pipes->end(); ++siter) {
    if (!siter->source.has_bucket_info()) {
      buckets_info.emplace(siter->source.get_bucket(), all_bucket_info());
    }
    if (!siter->target.has_bucket_info()) {
      buckets_info.emplace(siter->target.get_bucket(), all_bucket_info());
    }
  }
}
+
+
// Fetches the sync-policy handler for a bucket (as seen from an optional
// zone). If the bucket instance isn't known locally yet, it pulls the
// instance metadata via RGWSyncGetBucketInfoCR and retries once.
class RGWSyncGetBucketSyncPolicyHandlerCR : public RGWCoroutine {
  RGWDataSyncEnv *sync_env;
  rgw_bucket bucket;
  rgw_bucket_get_sync_policy_params get_policy_params;

  std::shared_ptr<rgw_bucket_get_sync_policy_result> policy;  // out: result shared with caller

  RGWSyncTraceNodeRef tn;

  int i;  // retry counter; must be a member to survive yields

public:
  RGWSyncGetBucketSyncPolicyHandlerCR(RGWDataSyncEnv *_sync_env,
                                      std::optional<rgw_zone_id> zone,
                                      const rgw_bucket& _bucket,
                                      std::shared_ptr<rgw_bucket_get_sync_policy_result>& _policy,
                                      const RGWSyncTraceNodeRef& _tn_parent)
    : RGWCoroutine(_sync_env->cct),
      sync_env(_sync_env),
      bucket(_bucket),
      policy(_policy),
      tn(sync_env->sync_tracer->add_node(_tn_parent, "get_sync_policy_handler",
                                         SSTR(bucket))) {
    get_policy_params.zone = zone;
    get_policy_params.bucket = bucket;
  }

  int operate(const DoutPrefixProvider *dpp) override {
    reenter(this) {
      // at most two attempts: the second one after pulling bucket metadata
      for (i = 0; i < 2; ++i) {
        yield call(new RGWBucketGetSyncPolicyHandlerCR(sync_env->async_rados,
                                                       sync_env->driver,
                                                       get_policy_params,
                                                       policy,
                                                       dpp));
        if (retcode < 0 &&
            retcode != -ENOENT) {
          return set_cr_error(retcode);
        }

        if (retcode == 0) {
          return set_cr_done();
        }

        /* bucket instance was not found,
         * try to get bucket instance info, can trigger
         * metadata sync of bucket instance
         */
        yield call(new RGWSyncGetBucketInfoCR(sync_env,
                                              bucket,
                                              nullptr,
                                              nullptr,
                                              tn));
        if (retcode < 0) {
          return set_cr_error(retcode);
        }
      }
    }

    // NOTE: falling out of the retry loop (second attempt still ENOENT)
    // ends the coroutine without set_cr_done/set_cr_error
    return 0;
  }
};
+
+
// Main flow: fetch the target bucket's policy (sources), then the source
// bucket's policy (targets) plus its sync hints, then fill in any missing
// bucket info and publish it to the pipe set.
int RGWGetBucketPeersCR::operate(const DoutPrefixProvider *dpp)
{
  reenter(this) {
    if (pipes) {
      pipes->clear();
    }
    if (target_bucket) {
      // pull the target bucket's policy and add its matching source pipes
      target_policy = make_shared<rgw_bucket_get_sync_policy_result>();
      yield call(new RGWSyncGetBucketSyncPolicyHandlerCR(sync_env,
                                                         nullopt,
                                                         *target_bucket,
                                                         target_policy,
                                                         tn));
      if (retcode < 0 &&
          retcode != -ENOENT) {
        return set_cr_error(retcode);
      }

      update_from_target_bucket_policy();
    }

    if (source_bucket && source_zone) {
      // pull the source bucket's policy from its own zone
      source_policy = make_shared<rgw_bucket_get_sync_policy_result>();
      yield call(new RGWSyncGetBucketSyncPolicyHandlerCR(sync_env,
                                                         source_zone,
                                                         *source_bucket,
                                                         source_policy,
                                                         tn));
      if (retcode < 0 &&
          retcode != -ENOENT) {
        return set_cr_error(retcode);
      }

      if (source_policy->policy_handler) {
        // cache the source bucket's info/attrs if the handler already has them
        auto& opt_bucket_info = source_policy->policy_handler->get_bucket_info();
        auto& opt_attrs = source_policy->policy_handler->get_bucket_attrs();
        if (opt_bucket_info && opt_attrs) {
          source_bucket_info.emplace();
          source_bucket_info->bucket_info = *opt_bucket_info;
          source_bucket_info->attrs = *opt_attrs;
        }
      }

      if (!target_bucket) {
        // no explicit target: expand via sync hints on the source bucket
        get_hint_targets_action = make_shared<GetHintTargets>(sync_env, *source_bucket);

        yield call(new RGWGenericAsyncCR(cct, sync_env->async_rados,
                                         get_hint_targets_action));
        if (retcode < 0) {
          return set_cr_error(retcode);
        }

        /* hints might have incomplete bucket ids,
         * in which case we need to figure out the current
         * bucket_id
         */
        for (hiter = get_hint_targets_action->targets.begin();
             hiter != get_hint_targets_action->targets.end();
             ++hiter) {
          ldpp_dout(dpp, 20) << "Got sync hint for bucket=" << *source_bucket << ": " << hiter->get_key() << dendl;

          target_policy = make_shared<rgw_bucket_get_sync_policy_result>();
          yield call(new RGWSyncGetBucketSyncPolicyHandlerCR(sync_env,
                                                             nullopt,
                                                             *hiter,
                                                             target_policy,
                                                             tn));
          if (retcode < 0 &&
              retcode != -ENOENT) {
            return set_cr_error(retcode);
          }
          update_from_target_bucket_policy();
        }
      }
    }

    update_from_source_bucket_policy();

    // fetch info for any bucket endpoint that still lacks it
    for (siiter = buckets_info.begin(); siiter != buckets_info.end(); ++siiter) {
      if (siiter->second.bucket_info.bucket.name.empty()) {
        yield call(new RGWSyncGetBucketInfoCR(sync_env, siiter->first,
                                              &siiter->second.bucket_info,
                                              &siiter->second.attrs,
                                              tn));
      }
    }

    if (pipes) {
      // propagate the freshly fetched bucket info into the pipe entries
      pipes->update_empty_bucket_info(buckets_info);
    }

    return set_cr_done();
  }

  return 0;
}
+
// Runs incremental sync for one bucket shard pair of a given generation and
// reports whether the remote signaled a sync-stop for that shard.
class RGWSyncBucketShardCR : public RGWCoroutine {
  RGWDataSyncCtx *sc;
  RGWDataSyncEnv *sync_env;
  boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr;
  rgw_bucket_sync_pair_info sync_pair;
  rgw_bucket_sync_pipe& sync_pipe;
  bool& bucket_stopped;  // out: set true when the shard reports syncstopped
  uint64_t generation;
  ceph::real_time* progress;  // out: incremental sync progress (may be null)

  const std::string shard_status_oid;   // per-shard incremental status object
  const rgw_raw_obj bucket_status_obj;  // bucket-wide full status object
  rgw_bucket_shard_sync_info sync_status;
  RGWObjVersionTracker objv_tracker;

  RGWSyncTraceNodeRef tn;

public:
  RGWSyncBucketShardCR(RGWDataSyncCtx *_sc,
                       boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
                       const rgw_bucket_sync_pair_info& _sync_pair,
                       rgw_bucket_sync_pipe& sync_pipe,
                       bool& bucket_stopped,
                       uint64_t generation,
                       const RGWSyncTraceNodeRef& tn,
                       ceph::real_time* progress)
    : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
      lease_cr(std::move(lease_cr)), sync_pair(_sync_pair),
      sync_pipe(sync_pipe), bucket_stopped(bucket_stopped), generation(generation), progress(progress),
      shard_status_oid(RGWBucketPipeSyncStatusManager::inc_status_oid(sc->source_zone, sync_pair, generation)),
      bucket_status_obj(sc->env->svc->zone->get_zone_params().log_pool,
                        RGWBucketPipeSyncStatusManager::full_status_oid(sc->source_zone,
                                                                        sync_pair.source_bs.bucket,
                                                                        sync_pair.dest_bucket)),
      tn(tn) {
  }

  int operate(const DoutPrefixProvider *dpp) override;
};
+
// Read the shard's sync marker, then run incremental sync; surface a
// remote "sync stopped" indication to the caller via bucket_stopped.
int RGWSyncBucketShardCR::operate(const DoutPrefixProvider *dpp)
{
  reenter(this) {
    objv_tracker.clear();
    yield call(new RGWReadBucketPipeSyncStatusCoroutine(sc, sync_pair, &sync_status, &objv_tracker, generation));
    if (retcode < 0 && retcode != -ENOENT) {
      tn->log(0, SSTR("ERROR: failed to read sync status for bucket. error: " << retcode));
      return set_cr_error(retcode);
    }

    tn->log(20, SSTR("sync status for source bucket shard: " << sync_status.state));
    // full sync is handled at the bucket level; here we always run incremental
    sync_status.state = rgw_bucket_shard_sync_info::StateIncrementalSync;
    if (progress) {
      *progress = sync_status.inc_marker.timestamp;
    }

    yield call(new RGWBucketShardIncrementalSyncCR(sc, sync_pipe,
                                                   shard_status_oid, bucket_status_obj, lease_cr,
                                                   sync_status, generation, tn,
                                                   objv_tracker, progress));
    if (retcode < 0) {
      tn->log(5, SSTR("incremental sync on bucket failed, retcode=" << retcode));
      return set_cr_error(retcode);
    }

    if (sync_status.state == rgw_bucket_shard_sync_info::StateStopped) {
      tn->log(20, SSTR("syncstopped indication for source bucket shard"));
      bucket_stopped = true;  // caller decides whether to transition to Stopped
    }

    return set_cr_done();
  }

  return 0;
}
+
// Top-level bucket sync coroutine: drives the bucket sync state machine
// (Init -> Full -> Incremental, or Stopped), taking a bucket-wide lease for
// the non-incremental phases.
class RGWSyncBucketCR : public RGWCoroutine {
  RGWDataSyncCtx *sc;
  RGWDataSyncEnv *env;
  boost::intrusive_ptr<const RGWContinuousLeaseCR> data_lease_cr;  // caller's data sync lease
  boost::intrusive_ptr<RGWContinuousLeaseCR> bucket_lease_cr;      // bucket-wide lease, taken on demand
  rgw_bucket_sync_pair_info sync_pair;
  rgw_bucket_sync_pipe sync_pipe;
  std::optional<uint64_t> gen;  // if set, only sync this bilog generation
  ceph::real_time* progress;

  const std::string lock_name = "bucket sync";
  const uint32_t lock_duration;
  const rgw_raw_obj status_obj;  // bucket-wide sync status object
  rgw_bucket_sync_status bucket_status;
  bool bucket_stopped = false;
  RGWObjVersionTracker objv;
  bool init_check_compat = false;  // run backward-compat init if status object was missing
  rgw_bucket_index_marker_info info;
  rgw_raw_obj error_repo;
  rgw_bucket_shard source_bs;
  rgw_pool pool;
  uint64_t current_gen = 0;

  RGWSyncTraceNodeRef tn;

public:
  RGWSyncBucketCR(RGWDataSyncCtx *_sc,
                  boost::intrusive_ptr<const RGWContinuousLeaseCR> lease_cr,
                  const rgw_bucket_sync_pair_info& _sync_pair,
                  std::optional<uint64_t> gen,
                  const RGWSyncTraceNodeRef& _tn_parent,
                  ceph::real_time* progress)
    : RGWCoroutine(_sc->cct), sc(_sc), env(_sc->env),
      data_lease_cr(std::move(lease_cr)), sync_pair(_sync_pair),
      gen(gen), progress(progress),
      lock_duration(cct->_conf->rgw_sync_lease_period),
      status_obj(env->svc->zone->get_zone_params().log_pool,
                 RGWBucketPipeSyncStatusManager::full_status_oid(sc->source_zone,
                                                                 sync_pair.source_bs.bucket,
                                                                 sync_pair.dest_bucket)),
      tn(env->sync_tracer->add_node(_tn_parent, "bucket",
                                    SSTR(bucket_str{_sync_pair.dest_bucket} << "<-" << bucket_shard_str{_sync_pair.source_bs} ))) {
  }

  int operate(const DoutPrefixProvider *dpp) override;
};
+
// Factory for the per-bucket-shard sync coroutine (definition of the
// forward declaration above).
static RGWCoroutine* sync_bucket_shard_cr(RGWDataSyncCtx* sc,
                                          boost::intrusive_ptr<const RGWContinuousLeaseCR> lease,
                                          const rgw_bucket_sync_pair_info& sync_pair,
                                          std::optional<uint64_t> gen,
                                          const RGWSyncTraceNodeRef& tn,
                                          ceph::real_time* progress)
{
  return new RGWSyncBucketCR(sc, std::move(lease), sync_pair,
                             gen, tn, progress);
}
+
// Release a held lease coroutine: signal it to go down, drain spawned
// stacks, and drop the reference. Wrapped in do/while(0) so the macro acts
// as a single statement and binds correctly in unbraced if/else chains.
#define RELEASE_LOCK(cr) \
  do { if (cr) {cr->go_down(); drain_all(); cr.reset();} } while (0)
+
+int RGWSyncBucketCR::operate(const DoutPrefixProvider *dpp)
+{
+ reenter(this) {
+ // read source/destination bucket info
+ yield call(new RGWSyncGetBucketInfoCR(env, sync_pair.source_bs.bucket, &sync_pipe.source_bucket_info,
+ &sync_pipe.source_bucket_attrs, tn));
+ if (retcode < 0) {
+ tn->log(0, SSTR("ERROR: failed to retrieve bucket info for bucket=" << bucket_str{sync_pair.source_bs.bucket}));
+ return set_cr_error(retcode);
+ }
+
+ yield call(new RGWSyncGetBucketInfoCR(env, sync_pair.dest_bucket, &sync_pipe.dest_bucket_info,
+ &sync_pipe.dest_bucket_attrs, tn));
+ if (retcode < 0) {
+ tn->log(0, SSTR("ERROR: failed to retrieve bucket info for bucket=" << bucket_str{sync_pair.source_bs.bucket}));
+ return set_cr_error(retcode);
+ }
+
+ sync_pipe.info = sync_pair;
+
+ // read bucket sync status
+ using ReadCR = RGWSimpleRadosReadCR<rgw_bucket_sync_status>;
+ using WriteCR = RGWSimpleRadosWriteCR<rgw_bucket_sync_status>;
+
+ objv.clear();
+ yield call(new ReadCR(dpp, env->driver,
+ status_obj, &bucket_status, false, &objv));
+ if (retcode == -ENOENT) {
+ // if the full sync status object didn't exist yet, run the backward
+ // compatability logic in InitBucketFullSyncStatusCR below. if it did
+ // exist, a `bucket sync init` probably requested its re-initialization,
+ // and shouldn't try to resume incremental sync
+ init_check_compat = true;
+
+ // use exclusive create to set state=Init
+ objv.generate_new_write_ver(cct);
+ yield call(new WriteCR(dpp, env->driver, status_obj, bucket_status, &objv, true));
+ tn->log(20, "bucket status object does not exist, create a new one");
+ if (retcode == -EEXIST) {
+ // raced with another create, read its status
+ tn->log(20, "raced with another create, read its status");
+ objv.clear();
+ yield call(new ReadCR(dpp, env->driver,
+ status_obj, &bucket_status, false, &objv));
+ }
+ }
+ if (retcode < 0) {
+ tn->log(20, SSTR("ERROR: failed to read bucket status object. error: " << retcode));
+ return set_cr_error(retcode);
+ }
+
+ do {
+ tn->log(20, SSTR("sync status for source bucket: " << bucket_status.state <<
+ ". lease is: " << (bucket_lease_cr ? "taken" : "not taken") << ". stop indications is: " << bucket_stopped));
+
+ if (bucket_status.state != BucketSyncState::Incremental ||
+ bucket_stopped) {
+
+ if (!bucket_lease_cr) {
+ bucket_lease_cr.reset(new RGWContinuousLeaseCR(env->async_rados, env->driver, status_obj,
+ lock_name, lock_duration, this, &sc->lcc));
+ yield spawn(bucket_lease_cr.get(), false);
+ while (!bucket_lease_cr->is_locked()) {
+ if (bucket_lease_cr->is_done()) {
+ tn->log(5, "failed to take lease");
+ set_status("lease lock failed, early abort");
+ drain_all();
+ return set_cr_error(bucket_lease_cr->get_ret_status());
+ }
+ tn->log(5, "waiting on bucket lease");
+ yield set_sleeping(true);
+ }
+ }
+
+ // if state is Init or Stopped, we query the remote RGW for ther state
+ yield call(new RGWReadRemoteBucketIndexLogInfoCR(sc, sync_pair.source_bs.bucket, &info));
+ if (retcode < 0) {
+ RELEASE_LOCK(bucket_lease_cr);
+ return set_cr_error(retcode);
+ }
+ if (info.syncstopped) {
+ // remote indicates stopped state
+ tn->log(20, "remote bilog indicates that sync was stopped");
+
+ // if state was incremental, remove all per-shard status objects
+ if (bucket_status.state == BucketSyncState::Incremental) {
+ yield {
+ const auto num_shards = bucket_status.shards_done_with_gen.size();
+ const auto gen = bucket_status.incremental_gen;
+ call(new RemoveBucketShardStatusCollectCR(sc, sync_pair, gen, num_shards));
+ }
+ }
+
+ // check if local state is "stopped"
+ objv.clear();
+ yield call(new ReadCR(dpp, env->driver,
+ status_obj, &bucket_status, false, &objv));
+ if (retcode < 0) {
+ tn->log(20, SSTR("ERROR: failed to read status before writing 'stopped'. error: " << retcode));
+ RELEASE_LOCK(bucket_lease_cr);
+ return set_cr_error(retcode);
+ }
+ if (bucket_status.state != BucketSyncState::Stopped) {
+ // make sure that state is changed to stopped localy
+ bucket_status.state = BucketSyncState::Stopped;
+ yield call(new WriteCR(dpp, env->driver, status_obj, bucket_status,
+ &objv, false));
+ if (retcode < 0) {
+ tn->log(20, SSTR("ERROR: failed to write 'stopped' status. error: " << retcode));
+ RELEASE_LOCK(bucket_lease_cr);
+ return set_cr_error(retcode);
+ }
+ }
+ RELEASE_LOCK(bucket_lease_cr);
+ return set_cr_done();
+ }
+ if (bucket_stopped) {
+ tn->log(20, SSTR("ERROR: switched from 'stop' to 'start' sync. while state is: " << bucket_status.state));
+ bucket_stopped = false;
+ bucket_status.state = BucketSyncState::Init;
+ }
+ }
+
+ if (bucket_status.state != BucketSyncState::Incremental) {
+ // if the state wasn't Incremental, take a bucket-wide lease to prevent
+ // different shards from duplicating the init and full sync
+ if (!bucket_lease_cr) {
+ bucket_lease_cr.reset(new RGWContinuousLeaseCR(env->async_rados, env->driver, status_obj,
+ lock_name, lock_duration, this, &sc->lcc));
+ yield spawn(bucket_lease_cr.get(), false);
+ while (!bucket_lease_cr->is_locked()) {
+ if (bucket_lease_cr->is_done()) {
+ tn->log(5, "failed to take lease");
+ set_status("lease lock failed, early abort");
+ drain_all();
+ return set_cr_error(bucket_lease_cr->get_ret_status());
+ }
+ tn->log(5, "waiting on bucket lease");
+ yield set_sleeping(true);
+ }
+ }
+
+ // reread the status after acquiring the lock
+ objv.clear();
+ yield call(new ReadCR(dpp, env->driver, status_obj,
+ &bucket_status, false, &objv));
+ if (retcode < 0) {
+ RELEASE_LOCK(bucket_lease_cr);
+ tn->log(20, SSTR("ERROR: reading the status after acquiring the lock failed. error: " << retcode));
+ return set_cr_error(retcode);
+ }
+ tn->log(20, SSTR("status after acquiring the lock is: " << bucket_status.state));
+
+ yield call(new InitBucketFullSyncStatusCR(sc, sync_pair, status_obj,
+ bucket_status, objv,
+ sync_pipe.source_bucket_info,
+ init_check_compat, info));
+
+ if (retcode < 0) {
+ tn->log(20, SSTR("ERROR: init full sync failed. error: " << retcode));
+ RELEASE_LOCK(bucket_lease_cr);
+ return set_cr_error(retcode);
+ }
+ }
+
+ assert(bucket_status.state == BucketSyncState::Incremental ||
+ bucket_status.state == BucketSyncState::Full);
+
+ if (bucket_status.state == BucketSyncState::Full) {
+ assert(bucket_lease_cr);
+ yield call(new RGWBucketFullSyncCR(sc, sync_pipe, status_obj,
+ bucket_lease_cr, bucket_status,
+ tn, objv));
+ if (retcode < 0) {
+ tn->log(20, SSTR("ERROR: full sync failed. error: " << retcode));
+ RELEASE_LOCK(bucket_lease_cr);
+ return set_cr_error(retcode);
+ }
+ }
+
+ if (bucket_status.state == BucketSyncState::Incremental) {
+ // lease not required for incremental sync
+ RELEASE_LOCK(bucket_lease_cr);
+
+ assert(sync_pair.source_bs.shard_id >= 0);
+ // if a specific gen was requested, compare that to the sync status
+ if (gen) {
+ current_gen = bucket_status.incremental_gen;
+ source_bs = sync_pair.source_bs;
+ if (*gen > current_gen) {
+ /* In case the data log entry is missing for previous gen, it may
+ * not be marked complete and the sync can get stuck. To avoid it,
+ * may be we can add this (shardid, gen) to error repo to force
+ * sync and mark that shard as completed.
+ */
+ pool = sc->env->svc->zone->get_zone_params().log_pool;
+ if ((static_cast<std::size_t>(source_bs.shard_id) < bucket_status.shards_done_with_gen.size()) &&
+ !bucket_status.shards_done_with_gen[source_bs.shard_id]) {
+ // use the error repo and sync status timestamp from the datalog shard corresponding to source_bs
+ error_repo = datalog_oid_for_error_repo(sc, sc->env->driver,
+ pool, source_bs);
+ yield call(rgw::error_repo::write_cr(sc->env->driver->svc()->rados, error_repo,
+ rgw::error_repo::encode_key(source_bs, current_gen),
+ ceph::real_clock::zero()));
+ if (retcode < 0) {
+ tn->log(0, SSTR("ERROR: failed to log prev gen entry (bucket=" << source_bs.bucket << ", shard_id=" << source_bs.shard_id << ", gen=" << current_gen << " in error repo: retcode=" << retcode));
+ } else {
+ tn->log(20, SSTR("logged prev gen entry (bucket=" << source_bs.bucket << ", shard_id=" << source_bs.shard_id << ", gen=" << current_gen << " in error repo: retcode=" << retcode));
+ }
+ }
+ retcode = -EAGAIN;
+ tn->log(10, SSTR("ERROR: requested sync of future generation "
+ << *gen << " > " << current_gen
+ << ", returning " << retcode << " for later retry"));
+ return set_cr_error(retcode);
+ } else if (*gen < current_gen) {
+ tn->log(10, SSTR("WARNING: requested sync of past generation "
+ << *gen << " < " << current_gen
+ << ", returning success"));
+ return set_cr_done();
+ }
+ }
+
+ if (static_cast<std::size_t>(sync_pair.source_bs.shard_id) >= bucket_status.shards_done_with_gen.size()) {
+ tn->log(1, SSTR("bucket shard " << sync_pair.source_bs << " index out of bounds"));
+ return set_cr_done(); // return success so we don't retry
+ }
+ if (bucket_status.shards_done_with_gen[sync_pair.source_bs.shard_id]) {
+ tn->log(10, SSTR("bucket shard " << sync_pair.source_bs << " of gen " <<
+ gen << " already synced."));
+ return set_cr_done();
+ }
+
+ yield call(new RGWSyncBucketShardCR(sc, data_lease_cr, sync_pair,
+ sync_pipe, bucket_stopped,
+ bucket_status.incremental_gen, tn, progress));
+ if (retcode < 0) {
+ tn->log(20, SSTR("ERROR: incremental sync failed. error: " << retcode));
+ return set_cr_error(retcode);
+ }
+ }
+ // loop back to previous states unless incremental sync returns normally
+ } while (bucket_status.state != BucketSyncState::Incremental || bucket_stopped);
+
+ return set_cr_done();
+ }
+
+ return 0;
+}
+
// Initialize the sync-status manager: start the HTTP manager, build the
// data-sync environment, resolve the set of source->dest pipes feeding
// dest_bucket, and record one `source` entry per pipe.
// Returns 0 on success or a negative error code.
int RGWBucketPipeSyncStatusManager::do_init(const DoutPrefixProvider *dpp,
                                            std::ostream* ostr)
{
  int ret = http_manager.start();
  if (ret < 0) {
    ldpp_dout(this, 0) << "failed in http_manager.start() ret=" << ret << dendl;
    return ret;
  }

  // Status inspection always runs with the default sync module.
  sync_module.reset(new RGWDefaultSyncModuleInstance());
  auto async_rados = driver->svc()->rados->get_async_processor();

  sync_env.init(this, driver->ctx(), driver,
                driver->svc(), async_rados, &http_manager,
                error_logger.get(), driver->getRados()->get_sync_tracer(),
                sync_module, nullptr);

  // Optional stream for human-readable progress output (pretty_print).
  sync_env.ostr = ostr;

  rgw_sync_pipe_info_set pipes;

  // Discover all peers that sync into dest_bucket, optionally filtered
  // by source_zone/source_bucket.
  ret = cr_mgr.run(dpp, new RGWGetBucketPeersCR(&sync_env,
                                                dest_bucket,
                                                source_zone,
                                                source_bucket,
                                                &pipes,
                                                sync_env.sync_tracer->root_node));
  if (ret < 0) {
    ldpp_dout(this, 0) << "failed to get bucket source peers info: (ret=" << ret << "): " << cpp_strerror(-ret) << dendl;
    return ret;
  }

  if (pipes.empty()) {
    ldpp_dout(this, 0) << "No peers. This is not a valid multisite configuration." << dendl;
    return -EINVAL;
  }

  for (auto& pipe : pipes) {
    auto& szone = pipe.source.zone;

    // Each source zone must have both a REST connection and a zone record.
    auto conn = driver->svc()->zone->get_zone_conn(szone);
    if (!conn) {
      ldpp_dout(this, 0) << "connection object to zone " << szone << " does not exist" << dendl;
      return -EINVAL;
    }

    RGWZone* z;
    if (!(z = driver->svc()->zone->find_zone(szone))) {
      ldpp_dout(this, 0) << "zone " << szone << " does not exist" << dendl;
      return -EINVAL;
    }
    sources.emplace_back(&sync_env, szone, conn,
                         pipe.source.get_bucket_info(),
                         pipe.target.get_bucket(),
                         pipe.handler, z->name);
  }

  return 0;
}

// Fetch the remote bucket-index log info for source `s` and report the
// oldest/latest generation and the shard count. Each output pointer may
// be null when the caller doesn't need that value.
int RGWBucketPipeSyncStatusManager::remote_info(const DoutPrefixProvider *dpp,
                                                source& s,
                                                uint64_t* oldest_gen,
                                                uint64_t* latest_gen,
                                                uint64_t* num_shards)
{
  rgw_bucket_index_marker_info remote_info;
  BucketIndexShardsManager remote_markers;
  auto r = rgw_read_remote_bilog_info(dpp, s.sc.conn, s.info.bucket,
                                      remote_info, remote_markers,
                                      null_yield);

  if (r < 0) {
    ldpp_dout(dpp, 0) << __PRETTY_FUNCTION__ << ":" << __LINE__
                      << " rgw_read_remote_bilog_info: r="
                      << r << dendl;
    return r;
  }
  if (oldest_gen)
    *oldest_gen = remote_info.oldest_gen;

  if (latest_gen)
    *latest_gen = remote_info.latest_gen;

  // Shard count is derived from the number of per-shard max markers.
  if (num_shards)
    *num_shards = remote_markers.get().size();

  return 0;
}

// Factory: build a manager and run do_init(); returns the manager on
// success, or the negative error code from do_init() via tl::unexpected.
tl::expected<std::unique_ptr<RGWBucketPipeSyncStatusManager>, int>
RGWBucketPipeSyncStatusManager::construct(
  const DoutPrefixProvider* dpp,
  rgw::sal::RadosStore* driver,
  std::optional<rgw_zone_id> source_zone,
  std::optional<rgw_bucket> source_bucket,
  const rgw_bucket& dest_bucket,
  std::ostream* ostr)
{
  std::unique_ptr<RGWBucketPipeSyncStatusManager> self{
    new RGWBucketPipeSyncStatusManager(driver, source_zone, source_bucket,
                                       dest_bucket)};
  auto r = self->do_init(dpp, ostr);
  if (r < 0) {
    return tl::unexpected(r);
  }
  return self;
}

// Reset each source's full-status object to a default-constructed
// rgw_bucket_sync_status (i.e. back to "init"). Per-source failures are
// reported via pretty_print but do not abort the remaining sources; the
// function always returns 0.
int RGWBucketPipeSyncStatusManager::init_sync_status(
  const DoutPrefixProvider *dpp)
{
  // Just running one at a time saves us from buildup/teardown and in
  // practice we only do one zone at a time.
  for (auto& source : sources) {
    list<RGWCoroutinesStack*> stacks;
    RGWCoroutinesStack *stack = new RGWCoroutinesStack(driver->ctx(), &cr_mgr);
    pretty_print(source.sc.env, "Initializing sync state of bucket {} with zone {}.\n",
                 source.info.bucket.name, source.zone_name);
    stack->call(new RGWSimpleRadosWriteCR<rgw_bucket_sync_status>(
                  dpp, source.sc.env->driver,
                  {sync_env.svc->zone->get_zone_params().log_pool,
                   full_status_oid(source.sc.source_zone,
                                   source.info.bucket,
                                   source.dest)},
                  rgw_bucket_sync_status{}));
    stacks.push_back(stack);
    auto r = cr_mgr.run(dpp, stacks);
    if (r < 0) {
      pretty_print(source.sc.env,
                   "Initialization of sync state for bucket {} with zone {} "
                   "failed with error {}\n",
                   source.info.bucket.name, source.zone_name, cpp_strerror(r));
    }
  }
  return 0;
}
+
+tl::expected<std::map<int, rgw_bucket_shard_sync_info>, int>
+RGWBucketPipeSyncStatusManager::read_sync_status(
+ const DoutPrefixProvider *dpp)
+{
+ std::map<int, rgw_bucket_shard_sync_info> sync_status;
+ list<RGWCoroutinesStack *> stacks;
+
+ auto sz = sources.begin();
+
+ if (source_zone) {
+ sz = std::find_if(sources.begin(), sources.end(),
+ [this](const source& s) {
+ return s.sc.source_zone == *source_zone;
+ }
+ );
+ if (sz == sources.end()) {
+ ldpp_dout(this, 0) << "ERROR: failed to find source zone: "
+ << *source_zone << dendl;
+ return tl::unexpected(-ENOENT);
+ }
+ } else {
+ ldpp_dout(this, 5) << "No source zone specified, using source zone: "
+ << sz->sc.source_zone << dendl;
+ return tl::unexpected(-ENOENT);
+ }
+ uint64_t num_shards, latest_gen;
+ auto ret = remote_info(dpp, *sz, nullptr, &latest_gen, &num_shards);
+ if (ret < 0) {
+ ldpp_dout(this, 5) << "Unable to get remote info: "
+ << ret << dendl;
+ return tl::unexpected(ret);
+ }
+ auto stack = new RGWCoroutinesStack(driver->ctx(), &cr_mgr);
+ std::vector<rgw_bucket_sync_pair_info> pairs(num_shards);
+ for (auto shard = 0u; shard < num_shards; ++shard) {
+ auto& pair = pairs[shard];
+ pair.source_bs.bucket = sz->info.bucket;
+ pair.dest_bucket = sz->dest;
+ pair.source_bs.shard_id = shard;
+ stack->call(new RGWReadBucketPipeSyncStatusCoroutine(
+ &sz->sc, pair, &sync_status[shard],
+ nullptr, latest_gen));
+ }
+
+ stacks.push_back(stack);
+
+ ret = cr_mgr.run(dpp, stacks);
+ if (ret < 0) {
+ ldpp_dout(this, 0) << "ERROR: failed to read sync status for "
+ << bucket_str{dest_bucket} << dendl;
+ return tl::unexpected(ret);
+ }
+
+ return sync_status;
+}
+
namespace rgw::bucket_sync_run {
// Retry-loop over calls to sync_bucket_shard_cr
class ShardCR : public RGWCoroutine {
  static constexpr auto allowed_retries = 10u;

  RGWDataSyncCtx& sc;
  const rgw_bucket_sync_pair_info& pair;  // shard being synced
  const uint64_t gen;                     // bilog generation to sync
  unsigned retries = 0;

  // Progress timestamps from the last two attempts; used to decide
  // whether a failed attempt still advanced the sync position.
  ceph::real_time prev_progress;
  ceph::real_time progress;

public:

  ShardCR(RGWDataSyncCtx& sc, const rgw_bucket_sync_pair_info& pair,
          const uint64_t gen)
    : RGWCoroutine(sc.cct), sc(sc), pair(pair), gen(gen) {}

  int operate(const DoutPrefixProvider *dpp) override {
    reenter(this) {
      // Since all errors (except ECANCELED) are considered retryable,
      // retry other errors so long as we're making progress.
      // retcode starts at -EDOM purely as a nonzero sentinel so the
      // loop body executes at least once.
      for (retries = 0u, retcode = -EDOM;
           (retries < allowed_retries) && (retcode != 0);
           ++retries) {
        ldpp_dout(dpp, 5) << "ShardCR: syncing bucket shard on: "
                          << "zone=" << sc.source_zone
                          << ", bucket=" << pair.source_bs.bucket.name
                          << ", shard=" << pair.source_bs.shard_id
                          << ", gen=" << gen
                          << dendl;
        yield call(sync_bucket_shard_cr(&sc, nullptr, pair, gen,
                                        sc.env->sync_tracer->root_node,
                                        &progress));

        if (retcode == -ECANCELED) {
          // ECANCELED is fatal: bail out without retrying.
          ldpp_dout(dpp, -1) << "ERROR: Got -ECANCELED for "
                             << pair.source_bs << dendl;
          drain_all();
          return set_cr_error(retcode);
        } else if (retcode < 0) {
          ldpp_dout(dpp, 5) << "WARNING: Got error, retcode=" << retcode << " for "
                            << pair.source_bs << "on retry "
                            << retries + 1 << " of " << allowed_retries
                            << " allowed" << dendl;
          // Reset the retry counter if we made any progress
          if (progress != prev_progress) {
            retries = 0;
          }
          prev_progress = progress;
        }
      }
      if (retcode < 0) {
        ldpp_dout(dpp, -1) << "ERROR: Exhausted retries for "
                           << pair.source_bs << " retcode="
                           << retcode << dendl;
        drain_all();
        return set_cr_error(retcode);
      }

      drain_all();
      return set_cr_done();
    }
    return 0;
  }
};

// Loop over calls to ShardCR with limited concurrency
class GenCR : public RGWShardCollectCR {
  static constexpr auto MAX_CONCURRENT_SHARDS = 64;

  RGWDataSyncCtx& sc;
  const uint64_t gen;

  // One sync-pair per shard; `iter` tracks the next shard to spawn.
  std::vector<rgw_bucket_sync_pair_info> pairs;
  decltype(pairs)::const_iterator iter;

public:
  GenCR(RGWDataSyncCtx& sc, const rgw_bucket& source, const rgw_bucket& dest,
        const uint64_t gen, const uint64_t shards,
        const RGWBucketSyncFlowManager::pipe_handler& handler)
    : RGWShardCollectCR(sc.cct, MAX_CONCURRENT_SHARDS),
      sc(sc), gen(gen) {
    pairs.resize(shards);
    for (auto shard = 0u; shard < shards; ++shard) {
      auto& pair = pairs[shard];
      pair.handler = handler;
      pair.source_bs.bucket = source;
      pair.dest_bucket = dest;
      pair.source_bs.shard_id = shard;
    }
    iter = pairs.cbegin();
    assert(pairs.size() == shards);
  }

  // Spawn the next shard's ShardCR, or report completion.
  virtual bool spawn_next() override {
    if (iter == pairs.cend()) {
      return false;
    }
    spawn(new ShardCR(sc, *iter, gen), false);
    ++iter;
    return true;
  }

  int handle_result(int r) override {
    if (r < 0) {
      ldpp_dout(sc.env->dpp, 4) << "ERROR: Error syncing shard: "
                                << cpp_strerror(r) << dendl;
    }
    return r;
  }
};

// Read sync status, loop over calls to GenCR
class SourceCR : public RGWCoroutine {
  RGWDataSyncCtx& sc;
  const RGWBucketInfo& info;
  const rgw_bucket& dest;
  const RGWBucketSyncFlowManager::pipe_handler& handler;
  // RADOS object holding this pipe's full sync status.
  const rgw_raw_obj status_obj{
    sc.env->svc->zone->get_zone_params().log_pool,
    RGWBucketPipeSyncStatusManager::full_status_oid(sc.source_zone, info.bucket,
                                                    dest)};

  // Snapshot of the status read at the top of each loop iteration.
  BucketSyncState state = BucketSyncState::Incremental;
  uint64_t gen = 0;
  uint64_t num_shards = 0;
  rgw_bucket_sync_status status;
  std::string zone_name;

public:

  SourceCR(RGWDataSyncCtx& sc, const RGWBucketInfo& info,
           const rgw_bucket& dest,
           const RGWBucketSyncFlowManager::pipe_handler& handler,
           const std::string& zone_name)
    : RGWCoroutine(sc.cct), sc(sc), info(info), dest(dest), handler(handler),
      zone_name(zone_name) {}

  int operate(const DoutPrefixProvider *dpp) override {
    reenter(this) {
      // Get the source's status. In incremental sync, this gives us
      // the generation and shard count that is next needed to be run.
      yield call(new RGWSimpleRadosReadCR<rgw_bucket_sync_status>(
                   dpp, sc.env->driver, status_obj, &status));
      if (retcode < 0) {
        ldpp_dout(dpp, -1) << "ERROR: Unable to fetch status for zone="
                           << sc.source_zone << " retcode="
                           << retcode << dendl;
        drain_all();
        return set_cr_error(retcode);
      }

      if (status.state == BucketSyncState::Stopped) {
        // Nothing to do.
        pretty_print(sc.env, "Sync of bucket {} from source zone {} is in state Stopped. "
                     "Nothing to do.\n", dest.name, zone_name);
        ldpp_dout(dpp, 5) << "SourceCR: Bucket is in state Stopped, returning."
                          << dendl;
        drain_all();
        return set_cr_done();
      }

      do {
        state = status.state;
        gen = status.incremental_gen;
        num_shards = status.shards_done_with_gen.size();

        ldpp_dout(dpp, 5) << "SourceCR: "
                          << "state=" << state
                          << ", gen=" << gen
                          << ", num_shards=" << num_shards
                          << dendl;

        // Special case to handle full sync. Since full sync no longer
        // uses shards and has no generations, we sync shard zero,
        // though use the current generation so a following
        // incremental sync can carry on.
        if (state != BucketSyncState::Incremental) {
          pretty_print(sc.env, "Beginning full sync of bucket {} from source zone {}.\n",
                       dest.name, zone_name);
          ldpp_dout(dpp, 5) << "SourceCR: Calling GenCR with "
                            << "gen=" << gen
                            << ", num_shards=" << 1
                            << dendl;
          yield call(new GenCR(sc, info.bucket, dest, gen, 1, handler));
        } else {
          pretty_print(sc.env, "Beginning incremental sync of bucket {}, generation {} from source zone {}.\n",
                       dest.name, gen, zone_name);
          ldpp_dout(dpp, 5) << "SourceCR: Calling GenCR with "
                            << "gen=" << gen
                            << ", num_shards=" << num_shards
                            << dendl;
          yield call(new GenCR(sc, info.bucket, dest, gen, num_shards,
                               handler));
        }
        if (retcode < 0) {
          ldpp_dout(dpp, -1) << "ERROR: Giving up syncing from "
                             << sc.source_zone << " retcode="
                             << retcode << dendl;
          drain_all();
          return set_cr_error(retcode);
        }

        pretty_print(sc.env, "Completed.\n");

        // Re-read the status so the loop condition sees any state or
        // generation change made while we were syncing.
        yield call(new RGWSimpleRadosReadCR<rgw_bucket_sync_status>(
                     dpp, sc.env->driver, status_obj, &status));
        if (retcode < 0) {
          ldpp_dout(dpp, -1) << "ERROR: Unable to fetch status for zone="
                             << sc.source_zone << " retcode="
                             << retcode << dendl;
          drain_all();
          return set_cr_error(retcode);
        }
        // Repeat until we have done an incremental run and the
        // generation remains unchanged.
        ldpp_dout(dpp, 5) << "SourceCR: "
                          << "state=" << state
                          << ", gen=" << gen
                          << ", num_shards=" << num_shards
                          << ", status.state=" << status.state
                          << ", status.incremental_gen=" << status.incremental_gen
                          << ", status.shards_done_with_gen.size()=" << status.shards_done_with_gen.size()
                          << dendl;
      } while (state != BucketSyncState::Incremental ||
               gen != status.incremental_gen);
      drain_all();
      return set_cr_done();
    }
    return 0;
  }
};
} // namespace rgw::bucket_sync_run

// Run a full `bucket sync run` pass: one SourceCR per discovered source,
// executed concurrently on separate stacks. Returns the manager's
// aggregate result (negative on any failure).
int RGWBucketPipeSyncStatusManager::run(const DoutPrefixProvider *dpp)
{
  list<RGWCoroutinesStack *> stacks;
  for (auto& source : sources) {
    auto stack = new RGWCoroutinesStack(driver->ctx(), &cr_mgr);
    stack->call(new rgw::bucket_sync_run::SourceCR(
                  source.sc, source.info, source.dest, source.handler,
                  source.zone_name));
    stacks.push_back(stack);
  }
  auto ret = cr_mgr.run(dpp, stacks);
  if (ret < 0) {
    ldpp_dout(this, 0) << "ERROR: Sync unsuccessful on bucket "
                       << bucket_str{dest_bucket} << dendl;
  }
  return ret;
}

// DoutPrefixProvider: log under the rgw subsystem.
unsigned RGWBucketPipeSyncStatusManager::get_subsys() const
{
  return dout_subsys;
}

// DoutPrefixProvider: prefix log lines with a truncated source zone id
// ("*" when no source zone was specified) and the destination bucket.
std::ostream& RGWBucketPipeSyncStatusManager::gen_prefix(std::ostream& out) const
{
  auto zone = std::string_view{source_zone.value_or(rgw_zone_id("*")).id};
  return out << "bucket sync zone:" << zone.substr(0, 8)
             << " bucket:" << dest_bucket << ' ';
}
+
+string RGWBucketPipeSyncStatusManager::full_status_oid(const rgw_zone_id& source_zone,
+ const rgw_bucket& source_bucket,
+ const rgw_bucket& dest_bucket)
+{
+ if (source_bucket == dest_bucket) {
+ return bucket_full_status_oid_prefix + "." + source_zone.id + ":"
+ + dest_bucket.get_key();
+ } else {
+ return bucket_full_status_oid_prefix + "." + source_zone.id + ":"
+ + dest_bucket.get_key() + ":" + source_bucket.get_key();
+ }
+}
+
// Suffix appended to incremental status object names for a given bilog
// generation. Generation 0 keeps the legacy, unsuffixed name; later
// generations get ":<gen>".
inline std::string generation_token(uint64_t gen) {
  if (gen == 0) {
    return {};
  }
  return ":" + std::to_string(gen);
}
+
+string RGWBucketPipeSyncStatusManager::inc_status_oid(const rgw_zone_id& source_zone,
+ const rgw_bucket_sync_pair_info& sync_pair,
+ uint64_t gen)
+{
+ if (sync_pair.source_bs.bucket == sync_pair.dest_bucket) {
+ return bucket_status_oid_prefix + "." + source_zone.id + ":" + sync_pair.source_bs.get_key() +
+ generation_token(gen);
+ } else {
+ return bucket_status_oid_prefix + "." + source_zone.id + ":" + sync_pair.dest_bucket.get_key() + ":" + sync_pair.source_bs.get_key() +
+ generation_token(gen);
+ }
+}
+
+string RGWBucketPipeSyncStatusManager::obj_status_oid(const rgw_bucket_sync_pipe& sync_pipe,
+ const rgw_zone_id& source_zone,
+ const rgw_obj& obj)
+{
+ string prefix = object_status_oid_prefix + "." + source_zone.id + ":" + obj.bucket.get_key();
+ if (sync_pipe.source_bucket_info.bucket !=
+ sync_pipe.dest_bucket_info.bucket) {
+ prefix += string("/") + sync_pipe.dest_bucket_info.bucket.get_key();
+ }
+ return prefix + ":" + obj.key.name + ":" + obj.key.instance;
+}
+
// Fetch bucket-index log info (generations + per-shard max markers) from
// a remote zone via its admin REST API. On success fills `info` and
// parses info.max_marker into per-shard `markers`. Returns 0 or a
// negative error code.
int rgw_read_remote_bilog_info(const DoutPrefixProvider *dpp,
                               RGWRESTConn* conn,
                               const rgw_bucket& bucket,
                               rgw_bucket_index_marker_info& info,
                               BucketIndexShardsManager& markers,
                               optional_yield y)
{
  const auto instance_key = bucket.get_key();
  const rgw_http_param_pair params[] = {
    { "type" , "bucket-index" },
    { "bucket-instance", instance_key.c_str() },
    { "info" , nullptr },
    { nullptr, nullptr }
  };
  int r = conn->get_json_resource(dpp, "/admin/log/", params, y, info);
  if (r < 0) {
    ldpp_dout(dpp, -1) << "failed to fetch remote log markers: " << cpp_strerror(r) << dendl;
    return r;
  }
  // parse shard markers
  r = markers.from_string(info.max_marker, -1);
  if (r < 0) {
    ldpp_dout(dpp, -1) << "failed to decode remote log markers" << dendl;
    return r;
  }
  return 0;
}

// Collect per-shard incremental sync status for one sync pair, reading
// up to max_concurrent_shards shards at a time. The output vector's size
// (set by the caller) determines how many shards are read, starting at
// the shard id carried by `sync_pair`.
class RGWCollectBucketSyncStatusCR : public RGWShardCollectCR {
  static constexpr int max_concurrent_shards = 16;
  rgw::sal::RadosStore* const driver;
  RGWDataSyncCtx *const sc;
  RGWDataSyncEnv *const env;
  const uint64_t gen;

  rgw_bucket_sync_pair_info sync_pair;
  using Vector = std::vector<rgw_bucket_shard_sync_info>;
  Vector::iterator i, end;  // next output slot / one-past-last

  int handle_result(int r) override {
    if (r == -ENOENT) { // ENOENT is not a fatal error
      return 0;
    }
    if (r < 0) {
      ldout(cct, 4) << "failed to read bucket shard sync status: "
                    << cpp_strerror(r) << dendl;
    }
    return r;
  }
 public:
  RGWCollectBucketSyncStatusCR(rgw::sal::RadosStore* driver, RGWDataSyncCtx *sc,
                               const rgw_bucket_sync_pair_info& sync_pair,
                               uint64_t gen,
                               Vector *status)
    : RGWShardCollectCR(sc->cct, max_concurrent_shards),
      driver(driver), sc(sc), env(sc->env), gen(gen), sync_pair(sync_pair),
      i(status->begin()), end(status->end())
  {}

  bool spawn_next() override {
    if (i == end) {
      return false;
    }
    // Spawn a reader for the current shard, then advance both the
    // output slot and the shard id for the next spawn.
    spawn(new RGWReadBucketPipeSyncStatusCoroutine(sc, sync_pair, &*i, nullptr, gen), false);
    ++i;
    ++sync_pair.source_bs.shard_id;
    return true;
  }
};

// Read and decode the full-status object for a sync pipe directly from
// RADOS. Returns 0 on success, the read error, or -EIO if the stored
// blob fails to decode.
int rgw_read_bucket_full_sync_status(const DoutPrefixProvider *dpp,
                                     rgw::sal::RadosStore *driver,
                                     const rgw_sync_bucket_pipe& pipe,
                                     rgw_bucket_sync_status *status,
                                     optional_yield y)
{
  auto get_oid = RGWBucketPipeSyncStatusManager::full_status_oid;
  const rgw_raw_obj obj{driver->svc()->zone->get_zone_params().log_pool,
                        get_oid(*pipe.source.zone, *pipe.source.bucket, *pipe.dest.bucket)};

  auto svc = driver->svc()->sysobj;
  auto sysobj = svc->get_obj(obj);
  bufferlist bl;
  int ret = sysobj.rop().read(dpp, &bl, y);
  if (ret < 0)
    return ret;

  try {
    auto iter = bl.cbegin();
    using ceph::decode;
    // Decode into a temporary so *status is untouched on failure.
    rgw_bucket_sync_status result;
    decode(result, iter);
    *status = result;
    return 0;
  } catch (const buffer::error& err) {
    lderr(svc->ctx()) << "error decoding " << obj << ": " << err.what() << dendl;
    return -EIO;
  }
}

// Read per-shard incremental sync status for a fully-specified pipe
// (both zones and both buckets must be set, else -EINVAL). The caller
// sizes *status to the number of shards to read.
int rgw_read_bucket_inc_sync_status(const DoutPrefixProvider *dpp,
                                    rgw::sal::RadosStore *driver,
                                    const rgw_sync_bucket_pipe& pipe,
                                    uint64_t gen,
                                    std::vector<rgw_bucket_shard_sync_info> *status)
{
  if (!pipe.source.zone ||
      !pipe.source.bucket ||
      !pipe.dest.zone ||
      !pipe.dest.bucket) {
    return -EINVAL;
  }

  rgw_bucket_sync_pair_info sync_pair;
  sync_pair.source_bs.bucket = *pipe.source.bucket;
  sync_pair.source_bs.shard_id = 0;
  sync_pair.dest_bucket = *pipe.dest.bucket;

  // Minimal environment: status reads need no error logger, tracer,
  // counters, or sync module.
  RGWDataSyncEnv env;
  RGWSyncModuleInstanceRef module; // null sync module
  env.init(dpp, driver->ctx(), driver, driver->svc(), driver->svc()->rados->get_async_processor(),
           nullptr, nullptr, nullptr, module, nullptr);

  RGWDataSyncCtx sc;
  sc.init(&env, nullptr, *pipe.source.zone);

  RGWCoroutinesManager crs(driver->ctx(), driver->getRados()->get_cr_registry());
  return crs.run(dpp, new RGWCollectBucketSyncStatusCR(driver, &sc,
                                                       sync_pair,
                                                       gen,
                                                       status));
}
+
// Test instances for ceph-dencoder: one populated value plus a default.
void rgw_data_sync_info::generate_test_instances(list<rgw_data_sync_info*>& o)
{
  auto info = new rgw_data_sync_info;
  info->state = rgw_data_sync_info::StateBuildingFullSyncMaps;
  info->num_shards = 8;
  o.push_back(info);
  o.push_back(new rgw_data_sync_info);
}

// Test instances for ceph-dencoder: one populated marker plus a default.
void rgw_data_sync_marker::generate_test_instances(list<rgw_data_sync_marker*>& o)
{
  auto marker = new rgw_data_sync_marker;
  marker->state = rgw_data_sync_marker::IncrementalSync;
  marker->marker = "01234";
  marker->pos = 5;
  o.push_back(marker);
  o.push_back(new rgw_data_sync_marker);
}

// Test instances for ceph-dencoder: only the default (markers are
// encoded separately from the status object).
void rgw_data_sync_status::generate_test_instances(list<rgw_data_sync_status*>& o)
{
  o.push_back(new rgw_data_sync_status);
}

// JSON output for the full-sync marker of a bucket shard.
void rgw_bucket_shard_full_sync_marker::dump(Formatter *f) const
{
  encode_json("position", position, f);
  encode_json("count", count, f);
}

// JSON input for the incremental-sync marker of a bucket shard.
void rgw_bucket_shard_inc_sync_marker::decode_json(JSONObj *obj)
{
  JSONDecoder::decode_json("position", position, obj);
  JSONDecoder::decode_json("timestamp", timestamp, obj);
}

// JSON output for the incremental-sync marker of a bucket shard.
void rgw_bucket_shard_inc_sync_marker::dump(Formatter *f) const
{
  encode_json("position", position, f);
  encode_json("timestamp", timestamp, f);
}

// JSON input: map the wire status name back to the numeric state;
// unrecognized strings fall back to StateInit.
void rgw_bucket_shard_sync_info::decode_json(JSONObj *obj)
{
  std::string s;
  JSONDecoder::decode_json("status", s, obj);
  if (s == "full-sync") {
    state = StateFullSync;
  } else if (s == "incremental-sync") {
    state = StateIncrementalSync;
  } else if (s == "stopped") {
    state = StateStopped;
  } else {
    state = StateInit;
  }
  JSONDecoder::decode_json("inc_marker", inc_marker, obj);
}

// JSON input for the full-sync marker of a bucket shard.
void rgw_bucket_shard_full_sync_marker::decode_json(JSONObj *obj)
{
  JSONDecoder::decode_json("position", position, obj);
  JSONDecoder::decode_json("count", count, obj);
}
+
+void rgw_bucket_shard_sync_info::dump(Formatter *f) const
+{
+ const char *s{nullptr};
+ switch ((SyncState)state) {
+ case StateInit:
+ s = "init";
+ break;
+ case StateFullSync:
+ s = "full-sync";
+ break;
+ case StateIncrementalSync:
+ s = "incremental-sync";
+ break;
+ case StateStopped:
+ s = "stopped";
+ break;
+ default:
+ s = "unknown";
+ break;
+ }
+ encode_json("status", s, f);
+ encode_json("inc_marker", inc_marker, f);
+}
+
// JSON input for the bucket-wide full-sync progress record.
void rgw_bucket_full_sync_status::decode_json(JSONObj *obj)
{
  JSONDecoder::decode_json("position", position, obj);
  JSONDecoder::decode_json("count", count, obj);
}

// JSON output for the bucket-wide full-sync progress record.
void rgw_bucket_full_sync_status::dump(Formatter *f) const
{
  encode_json("position", position, f);
  encode_json("count", count, f);
}
+
+void encode_json(const char *name, BucketSyncState state, Formatter *f)
+{
+ switch (state) {
+ case BucketSyncState::Init:
+ encode_json(name, "init", f);
+ break;
+ case BucketSyncState::Full:
+ encode_json(name, "full-sync", f);
+ break;
+ case BucketSyncState::Incremental:
+ encode_json(name, "incremental-sync", f);
+ break;
+ case BucketSyncState::Stopped:
+ encode_json(name, "stopped", f);
+ break;
+ default:
+ encode_json(name, "unknown", f);
+ break;
+ }
+}
+
+void decode_json_obj(BucketSyncState& state, JSONObj *obj)
+{
+ std::string s;
+ decode_json_obj(s, obj);
+ if (s == "full-sync") {
+ state = BucketSyncState::Full;
+ } else if (s == "incremental-sync") {
+ state = BucketSyncState::Incremental;
+ } else if (s == "stopped") {
+ state = BucketSyncState::Stopped;
+ } else {
+ state = BucketSyncState::Init;
+ }
+}
+
// JSON input for the per-pipe sync status (state, full-sync progress,
// current incremental generation).
void rgw_bucket_sync_status::decode_json(JSONObj *obj)
{
  JSONDecoder::decode_json("state", state, obj);
  JSONDecoder::decode_json("full", full, obj);
  JSONDecoder::decode_json("incremental_gen", incremental_gen, obj);
}

// JSON output for the per-pipe sync status.
void rgw_bucket_sync_status::dump(Formatter *f) const
{
  encode_json("state", state, f);
  encode_json("full", full, f);
  encode_json("incremental_gen", incremental_gen, f);
}


// JSON output for the v2 bilog status (overall pipe status plus
// per-shard incremental status).
void bilog_status_v2::dump(Formatter *f) const
{
  encode_json("sync_status", sync_status, f);
  encode_json("inc_status", inc_status, f);
}

// JSON input for the v2 bilog status.
void bilog_status_v2::decode_json(JSONObj *obj)
{
  JSONDecoder::decode_json("sync_status", sync_status, obj);
  JSONDecoder::decode_json("inc_status", inc_status, obj);
}
diff --git a/src/rgw/driver/rados/rgw_data_sync.h b/src/rgw/driver/rados/rgw_data_sync.h
new file mode 100644
index 000000000..b9a39343f
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_data_sync.h
@@ -0,0 +1,868 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+
+#include "include/encoding.h"
+
+#include "common/ceph_json.h"
+#include "common/likely.h"
+
+#include "rgw_coroutine.h"
+#include "rgw_cr_rados.h"
+#include "rgw_http_client.h"
+#include "rgw_sal_rados.h"
+
+#include "rgw_datalog.h"
+#include "rgw_sync.h"
+#include "rgw_sync_module.h"
+#include "rgw_sync_trace.h"
+#include "rgw_sync_policy.h"
+
+#include "rgw_bucket_sync.h"
+
// represents an obligation to sync an entry up a given time
struct rgw_data_sync_obligation {
  rgw_bucket_shard bs;          // bucket shard to sync
  std::optional<uint64_t> gen;  // bilog generation, if known
  std::string marker;           // datalog marker that produced this obligation
  ceph::real_time timestamp;
  bool retry = false;           // true when re-queued after a failure
};

// Compact one-line rendering; empty/unset fields are omitted.
inline std::ostream& operator<<(std::ostream& out, const rgw_data_sync_obligation& o) {
  out << "key=" << o.bs;
  if (o.gen) {
    out << '[' << *o.gen << ']';
  }
  if (!o.marker.empty()) {
    out << " marker=" << o.marker;
  }
  if (o.timestamp != ceph::real_time{}) {
    out << " timestamp=" << o.timestamp;
  }
  if (o.retry) {
    out << " retry";
  }
  return out;
}

class JSONObj;
struct rgw_sync_bucket_pipe;

// One (source shard -> dest bucket) sync pairing plus the pipe handler
// that applies the sync-policy filters for it.
struct rgw_bucket_sync_pair_info {
  RGWBucketSyncFlowManager::pipe_handler handler; /* responsible for sync filters */
  rgw_bucket_shard source_bs;
  rgw_bucket dest_bucket;
};

// Prints just the source shard when source and dest buckets match,
// otherwise "source->dest".
inline std::ostream& operator<<(std::ostream& out, const rgw_bucket_sync_pair_info& p) {
  if (p.source_bs.bucket == p.dest_bucket) {
    return out << p.source_bs;
  }
  return out << p.source_bs << "->" << p.dest_bucket;
}

// A sync pair resolved to full bucket metadata on both ends.
struct rgw_bucket_sync_pipe {
  rgw_bucket_sync_pair_info info;
  RGWBucketInfo source_bucket_info;
  std::map<std::string, bufferlist> source_bucket_attrs;
  RGWBucketInfo dest_bucket_info;
  std::map<std::string, bufferlist> dest_bucket_attrs;

  RGWBucketSyncFlowManager::pipe_rules_ref& get_rules() {
    return info.handler.rules;
  }
};

inline std::ostream& operator<<(std::ostream& out, const rgw_bucket_sync_pipe& p) {
  return out << p.info;
}

// Remote datalog description: only the shard count is exchanged.
struct rgw_datalog_info {
  uint32_t num_shards;

  rgw_datalog_info() : num_shards(0) {}

  void decode_json(JSONObj *obj);
};
+
// Overall data-sync state for a source zone: which phase the sync is in
// and how many datalog shards it covers. Persisted (encoded) in the
// sync-status object, so the encode/decode layout is a wire format.
struct rgw_data_sync_info {
  enum SyncState {
    StateInit = 0,
    StateBuildingFullSyncMaps = 1,
    StateSync = 2,
  };

  uint16_t state;
  uint32_t num_shards;

  // Random id regenerated on (re)init; distinguishes sync incarnations.
  uint64_t instance_id{0};

  void encode(bufferlist& bl) const {
    ENCODE_START(2, 1, bl);
    encode(state, bl);
    encode(num_shards, bl);
    encode(instance_id, bl);
    ENCODE_FINISH(bl);
  }

  void decode(bufferlist::const_iterator& bl) {
    DECODE_START(2, bl);
    decode(state, bl);
    decode(num_shards, bl);
    // instance_id was added in v2; older encodings leave the default 0.
    if (struct_v >= 2) {
      decode(instance_id, bl);
    }
    DECODE_FINISH(bl);
  }

  void dump(Formatter *f) const {
    std::string s;
    switch ((SyncState)state) {
      case StateInit:
	s = "init";
	break;
      case StateBuildingFullSyncMaps:
	s = "building-full-sync-maps";
	break;
      case StateSync:
	s = "sync";
	break;
      default:
	s = "unknown";
	break;
    }
    encode_json("status", s, f);
    encode_json("num_shards", num_shards, f);
    encode_json("instance_id", instance_id, f);
  }
  // Unrecognized status strings fall back to StateInit.
  void decode_json(JSONObj *obj) {
    std::string s;
    JSONDecoder::decode_json("status", s, obj);
    if (s == "building-full-sync-maps") {
      state = StateBuildingFullSyncMaps;
    } else if (s == "sync") {
      state = StateSync;
    } else {
      state = StateInit;
    }
    JSONDecoder::decode_json("num_shards", num_shards, obj);
    JSONDecoder::decode_json("instance_id", instance_id, obj);
  }
  static void generate_test_instances(std::list<rgw_data_sync_info*>& o);

  rgw_data_sync_info() : state((int)StateInit), num_shards(0) {}
};
WRITE_CLASS_ENCODER(rgw_data_sync_info)
+
// Per-shard data-sync progress marker. Persisted (encoded), so the
// encode/decode layout is a wire format.
struct rgw_data_sync_marker {
  enum SyncState {
    FullSync = 0,
    IncrementalSync = 1,
  };
  uint16_t state;
  std::string marker;           // current position in the shard's log
  std::string next_step_marker; // where incremental sync resumes after full sync
  uint64_t total_entries;
  uint64_t pos;
  real_time timestamp;

  rgw_data_sync_marker() : state(FullSync), total_entries(0), pos(0) {}

  void encode(bufferlist& bl) const {
    ENCODE_START(1, 1, bl);
    encode(state, bl);
    encode(marker, bl);
    encode(next_step_marker, bl);
    encode(total_entries, bl);
    encode(pos, bl);
    encode(timestamp, bl);
    ENCODE_FINISH(bl);
  }

  void decode(bufferlist::const_iterator& bl) {
    DECODE_START(1, bl);
    decode(state, bl);
    decode(marker, bl);
    decode(next_step_marker, bl);
    decode(total_entries, bl);
    decode(pos, bl);
    decode(timestamp, bl);
    DECODE_FINISH(bl);
  }

  void dump(Formatter *f) const {
    const char *s{nullptr};
    switch ((SyncState)state) {
      case FullSync:
	s = "full-sync";
	break;
      case IncrementalSync:
	s = "incremental-sync";
	break;
      default:
	s = "unknown";
	break;
    }
    encode_json("status", s, f);
    encode_json("marker", marker, f);
    encode_json("next_step_marker", next_step_marker, f);
    encode_json("total_entries", total_entries, f);
    encode_json("pos", pos, f);
    encode_json("timestamp", utime_t(timestamp), f);
  }
  // NOTE: unrecognized status strings leave `state` unchanged here
  // (unlike other decoders in this file, there is no fallback branch).
  void decode_json(JSONObj *obj) {
    std::string s;
    JSONDecoder::decode_json("status", s, obj);
    if (s == "full-sync") {
      state = FullSync;
    } else if (s == "incremental-sync") {
      state = IncrementalSync;
    }
    JSONDecoder::decode_json("marker", marker, obj);
    JSONDecoder::decode_json("next_step_marker", next_step_marker, obj);
    JSONDecoder::decode_json("total_entries", total_entries, obj);
    JSONDecoder::decode_json("pos", pos, obj);
    utime_t t;
    JSONDecoder::decode_json("timestamp", t, obj);
    timestamp = t.to_real_time();
  }
  static void generate_test_instances(std::list<rgw_data_sync_marker*>& o);
};
WRITE_CLASS_ENCODER(rgw_data_sync_marker)
+
// Aggregate data-sync status: overall info plus per-shard markers.
// Only `sync_info` is part of this object's encoding; the markers are
// stored in separate per-shard objects.
struct rgw_data_sync_status {
  rgw_data_sync_info sync_info;
  std::map<uint32_t, rgw_data_sync_marker> sync_markers;

  rgw_data_sync_status() {}

  void encode(bufferlist& bl) const {
    ENCODE_START(1, 1, bl);
    encode(sync_info, bl);
    /* sync markers are encoded separately */
    ENCODE_FINISH(bl);
  }

  void decode(bufferlist::const_iterator& bl) {
    DECODE_START(1, bl);
    decode(sync_info, bl);
    /* sync markers are decoded separately */
    DECODE_FINISH(bl);
  }

  void dump(Formatter *f) const {
    encode_json("info", sync_info, f);
    encode_json("markers", sync_markers, f);
  }
  void decode_json(JSONObj *obj) {
    JSONDecoder::decode_json("info", sync_info, obj);
    JSONDecoder::decode_json("markers", sync_markers, obj);
  }
  static void generate_test_instances(std::list<rgw_data_sync_status*>& o);
};
WRITE_CLASS_ENCODER(rgw_data_sync_status)

// One entry of a remote datalog shard listing.
struct rgw_datalog_entry {
  std::string key;
  ceph::real_time timestamp;

  void decode_json(JSONObj *obj);
};

// One page of a remote datalog shard listing.
struct rgw_datalog_shard_data {
  std::string marker;  // continuation marker for the next page
  bool truncated;      // true when more entries remain
  std::vector<rgw_datalog_entry> entries;

  void decode_json(JSONObj *obj);
};

class RGWAsyncRadosProcessor;
class RGWDataSyncControlCR;

// Owner (id + display name) attached to a bucket-log entry.
struct rgw_bucket_entry_owner {
  std::string id;
  std::string display_name;

  rgw_bucket_entry_owner() {}
  rgw_bucket_entry_owner(const std::string& _id, const std::string& _display_name) : id(_id), display_name(_display_name) {}

  void decode_json(JSONObj *obj);
};
+
class RGWSyncErrorLogger;
class RGWRESTConn;
class RGWServices;

// Bag of non-owning pointers shared by all data-sync coroutines for one
// sync run. All members are borrowed; callers keep them alive for the
// duration of the sync.
struct RGWDataSyncEnv {
  const DoutPrefixProvider *dpp{nullptr};
  CephContext *cct{nullptr};
  rgw::sal::RadosStore* driver{nullptr};
  RGWServices *svc{nullptr};
  RGWAsyncRadosProcessor *async_rados{nullptr};
  RGWHTTPManager *http_manager{nullptr};
  RGWSyncErrorLogger *error_logger{nullptr};
  RGWSyncTraceManager *sync_tracer{nullptr};
  RGWSyncModuleInstanceRef sync_module{nullptr};
  PerfCounters* counters{nullptr};

  RGWDataSyncEnv() {}

  // Populate every pointer at once; any of them may be null when the
  // corresponding facility isn't needed (see rgw_read_bucket_inc_sync_status).
  void init(const DoutPrefixProvider *_dpp, CephContext *_cct, rgw::sal::RadosStore* _driver, RGWServices *_svc,
	    RGWAsyncRadosProcessor *_async_rados, RGWHTTPManager *_http_manager,
	    RGWSyncErrorLogger *_error_logger, RGWSyncTraceManager *_sync_tracer,
	    RGWSyncModuleInstanceRef& _sync_module,
	    PerfCounters* _counters) {
    dpp = _dpp;
    cct = _cct;
    driver = _driver;
    svc = _svc;
    async_rados = _async_rados;
    http_manager = _http_manager;
    error_logger = _error_logger;
    sync_tracer = _sync_tracer;
    sync_module = _sync_module;
    counters = _counters;
  }

  std::string shard_obj_name(int shard_id);
  std::string status_oid();

  std::ostream* ostr{nullptr}; // For pretty printing progress
};

// pretty ostream output for `radosgw-admin bucket sync run`
// No-op unless env->ostr is set; flushes after each message so progress
// appears promptly. The #if selects the fmt>=9 checked-format API.
#if FMT_VERSION >= 90000
template<typename ...T>
void pretty_print(const RGWDataSyncEnv* env, fmt::format_string<T...> fmt, T&& ...t) {
#else
template<typename S, typename ...T>
void pretty_print(const RGWDataSyncEnv* env, const S& fmt, T&& ...t) {
#endif
  if (unlikely(!!env->ostr)) {
    fmt::print(*env->ostr, fmt, std::forward<T>(t)...);
    env->ostr->flush();
  }
}
+
/// \brief Adjust concurrency based on latency
///
/// Keep a running average of operation latency and scale concurrency
/// down when latency rises.
class LatencyConcurrencyControl : public LatencyMonitor {
  static constexpr auto dout_subsys = ceph_subsys_rgw;
  // Timestamp of the last overload warning; used to rate-limit it to
  // one message per five minutes.
  ceph::coarse_mono_time last_warning;
public:
  CephContext* cct;

  LatencyConcurrencyControl(CephContext* cct)
    : cct(cct) {}

  /// \brief Lower concurrency when latency rises
  ///
  /// Since we have multiple spawn windows (data sync overall and
  /// bucket), accept a number of concurrent operations to spawn and,
  /// if latency is high, cut it in half. If latency is really high,
  /// cut it to 1.
  int64_t adj_concurrency(int64_t concurrency) {
    using namespace std::literals;
    // Threshold is 1/12 of the sync lease period.
    auto threshold = (cct->_conf->rgw_sync_lease_period * 1s) / 12;

    if (avg_latency() >= 2 * threshold) [[unlikely]] {
      auto now = ceph::coarse_mono_clock::now();
      if (now - last_warning > 5min) {
	ldout(cct, -1)
	  << "WARNING: The OSD cluster is overloaded and struggling to "
	  << "complete ops. You need more capacity to serve this level "
	  << "of demand." << dendl;
	last_warning = now;
      }
      return 1;
    } else if (avg_latency() >= threshold) [[unlikely]] {
      return concurrency / 2;
    } else [[likely]] {
      return concurrency;
    }
  }
};

// Per-source-zone sync context: the shared environment plus the REST
// connection and latency control for one source zone.
struct RGWDataSyncCtx {
  RGWDataSyncEnv *env{nullptr};
  CephContext *cct{nullptr};

  RGWRESTConn *conn{nullptr};
  rgw_zone_id source_zone;

  LatencyConcurrencyControl lcc{nullptr};

  RGWDataSyncCtx() = default;

  RGWDataSyncCtx(RGWDataSyncEnv* env,
		 RGWRESTConn* conn,
		 const rgw_zone_id& source_zone)
    : env(env), cct(env->cct), conn(conn), source_zone(source_zone), lcc(cct) {}

  // Two-phase initialization used where default construction is needed
  // first; also rebinds lcc to the environment's CephContext.
  void init(RGWDataSyncEnv *_env,
	    RGWRESTConn *_conn,
	    const rgw_zone_id& _source_zone) {
    cct = _env->cct;
    env = _env;
    conn = _conn;
    source_zone = _source_zone;
    lcc.cct = cct;
  }
};
+
+class RGWRados;
+
+// Coroutine manager that drives data sync against one remote zone's
+// data changes log: reads remote log/shard info, tracks local sync
+// status, and runs the sync itself.
+class RGWRemoteDataLog : public RGWCoroutinesManager {
+  const DoutPrefixProvider *dpp;
+  rgw::sal::RadosStore* driver;
+  CephContext *cct;
+  RGWCoroutinesManagerRegistry *cr_registry;
+  RGWAsyncRadosProcessor *async_rados;
+  RGWHTTPManager http_manager;
+
+  RGWDataSyncEnv sync_env;
+  RGWDataSyncCtx sc;
+
+  // Guards data_sync_cr across run_sync()/wakeup()/finish().
+  ceph::shared_mutex lock = ceph::make_shared_mutex("RGWRemoteDataLog::lock");
+  RGWDataSyncControlCR *data_sync_cr;
+
+  RGWSyncTraceNodeRef tn;
+
+  bool initialized;
+
+public:
+  RGWRemoteDataLog(const DoutPrefixProvider *dpp,
+                   rgw::sal::RadosStore* _store,
+                   RGWAsyncRadosProcessor *async_rados);
+  // Bind to a source zone/connection; must be called before the
+  // read_*/run_sync entry points below.
+  int init(const rgw_zone_id& _source_zone, RGWRESTConn *_conn, RGWSyncErrorLogger *_error_logger,
+           RGWSyncTraceManager *_sync_tracer, RGWSyncModuleInstanceRef& module,
+           PerfCounters* _counters);
+  void finish();
+
+  int read_log_info(const DoutPrefixProvider *dpp, rgw_datalog_info *log_info);
+  int read_source_log_shards_info(const DoutPrefixProvider *dpp, std::map<int, RGWDataChangesLogInfo> *shards_info);
+  int read_source_log_shards_next(const DoutPrefixProvider *dpp, std::map<int, std::string> shard_markers, std::map<int, rgw_datalog_shard_data> *result);
+  int read_sync_status(const DoutPrefixProvider *dpp, rgw_data_sync_status *sync_status);
+  int read_recovering_shards(const DoutPrefixProvider *dpp, const int num_shards, std::set<int>& recovering_shards);
+  int read_shard_status(const DoutPrefixProvider *dpp, int shard_id, std::set<std::string>& lagging_buckets,std::set<std::string>& recovering_buckets, rgw_data_sync_marker* sync_marker, const int max_entries);
+  int init_sync_status(const DoutPrefixProvider *dpp, int num_shards);
+  int run_sync(const DoutPrefixProvider *dpp, int num_shards);
+
+  // Notify the running sync of new entries for a shard.
+  void wakeup(int shard_id, bc::flat_set<rgw_data_notify_entry>& entries);
+};
+
+// Thin facade over RGWRemoteDataLog for one source zone: owns the
+// connection/error-logger plumbing and forwards status/run calls to
+// the underlying source_log.
+class RGWDataSyncStatusManager : public DoutPrefixProvider {
+  rgw::sal::RadosStore* driver;
+
+  rgw_zone_id source_zone;
+  RGWRESTConn *conn;
+  RGWSyncErrorLogger *error_logger;
+  RGWSyncModuleInstanceRef sync_module;
+  PerfCounters* counters;
+
+  RGWRemoteDataLog source_log;
+
+  std::string source_status_oid;
+  std::string source_shard_status_oid_prefix;
+
+  std::map<int, rgw_raw_obj> shard_objs;
+
+  // Number of datalog shards; set during init(), 0 until then.
+  int num_shards;
+
+public:
+  RGWDataSyncStatusManager(rgw::sal::RadosStore* _driver, RGWAsyncRadosProcessor *async_rados,
+                           const rgw_zone_id& _source_zone, PerfCounters* counters)
+    : driver(_driver), source_zone(_source_zone), conn(NULL), error_logger(NULL),
+      sync_module(nullptr), counters(counters),
+      source_log(this, driver, async_rados), num_shards(0) {}
+  RGWDataSyncStatusManager(rgw::sal::RadosStore* _driver, RGWAsyncRadosProcessor *async_rados,
+                           const rgw_zone_id& _source_zone, PerfCounters* counters,
+                           const RGWSyncModuleInstanceRef& _sync_module)
+    : driver(_driver), source_zone(_source_zone), conn(NULL), error_logger(NULL),
+      sync_module(_sync_module), counters(counters),
+      source_log(this, driver, async_rados), num_shards(0) {}
+  ~RGWDataSyncStatusManager() {
+    finalize();
+  }
+  int init(const DoutPrefixProvider *dpp);
+  void finalize();
+
+  // Naming helpers for the per-shard / overall sync status RADOS objects.
+  static std::string shard_obj_name(const rgw_zone_id& source_zone, int shard_id);
+  static std::string sync_status_oid(const rgw_zone_id& source_zone);
+
+  int read_sync_status(const DoutPrefixProvider *dpp, rgw_data_sync_status *sync_status) {
+    return source_log.read_sync_status(dpp, sync_status);
+  }
+
+  int read_recovering_shards(const DoutPrefixProvider *dpp, const int num_shards, std::set<int>& recovering_shards) {
+    return source_log.read_recovering_shards(dpp, num_shards, recovering_shards);
+  }
+
+  int read_shard_status(const DoutPrefixProvider *dpp, int shard_id, std::set<std::string>& lagging_buckets, std::set<std::string>& recovering_buckets, rgw_data_sync_marker *sync_marker, const int max_entries) {
+    return source_log.read_shard_status(dpp, shard_id, lagging_buckets, recovering_buckets,sync_marker, max_entries);
+  }
+  int init_sync_status(const DoutPrefixProvider *dpp) { return source_log.init_sync_status(dpp, num_shards); }
+
+  int read_log_info(const DoutPrefixProvider *dpp, rgw_datalog_info *log_info) {
+    return source_log.read_log_info(dpp, log_info);
+  }
+  int read_source_log_shards_info(const DoutPrefixProvider *dpp, std::map<int, RGWDataChangesLogInfo> *shards_info) {
+    return source_log.read_source_log_shards_info(dpp, shards_info);
+  }
+  int read_source_log_shards_next(const DoutPrefixProvider *dpp, std::map<int, std::string> shard_markers, std::map<int, rgw_datalog_shard_data> *result) {
+    return source_log.read_source_log_shards_next(dpp, shard_markers, result);
+  }
+
+  int run(const DoutPrefixProvider *dpp) { return source_log.run_sync(dpp, num_shards); }
+
+  void wakeup(int shard_id, bc::flat_set<rgw_data_notify_entry>& entries) { return source_log.wakeup(shard_id, entries); }
+
+  void stop() {
+    source_log.finish();
+  }
+
+  // implements DoutPrefixProvider
+  CephContext *get_cct() const override;
+  unsigned get_subsys() const override;
+  std::ostream& gen_prefix(std::ostream& out) const override;
+};
+
+class RGWBucketPipeSyncStatusManager;
+class RGWBucketSyncCR;
+
+// Progress marker for a bucket shard's full-sync phase: last object
+// key processed plus a running count of synced entries.
+struct rgw_bucket_shard_full_sync_marker {
+  rgw_obj_key position;
+  uint64_t count;
+
+  rgw_bucket_shard_full_sync_marker() : count(0) {}
+
+  // Serialize into the attribute map used for shard status objects.
+  void encode_attr(std::map<std::string, bufferlist>& attrs);
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(position, bl);
+    encode(count, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(position, bl);
+    decode(count, bl);
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(rgw_bucket_shard_full_sync_marker)
+
+// Progress marker for a bucket shard's incremental-sync phase: bilog
+// position string and the timestamp of the last processed entry.
+struct rgw_bucket_shard_inc_sync_marker {
+  std::string position;
+  ceph::real_time timestamp;
+
+  // Serialize into the attribute map used for shard status objects.
+  void encode_attr(std::map<std::string, bufferlist>& attrs);
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(2, 1, bl);
+    encode(position, bl);
+    encode(timestamp, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(2, bl);
+    decode(position, bl);
+    // timestamp was added in v2; older encodings leave it default.
+    if (struct_v >= 2) {
+      decode(timestamp, bl);
+    }
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(rgw_bucket_shard_inc_sync_marker)
+
+// Per-bucket-shard sync state: which phase the shard is in plus its
+// incremental marker.  The v1 full-sync marker is decoded and
+// discarded for backward compatibility.
+struct rgw_bucket_shard_sync_info {
+  enum SyncState {
+    StateInit = 0,
+    StateFullSync = 1,
+    StateIncrementalSync = 2,
+    StateStopped = 3,
+  };
+
+  // Holds a SyncState value; stored as uint16_t for encoding stability.
+  uint16_t state;
+  rgw_bucket_shard_inc_sync_marker inc_marker;
+
+  void decode_from_attrs(CephContext *cct, std::map<std::string, bufferlist>& attrs);
+  void encode_all_attrs(std::map<std::string, bufferlist>& attrs);
+  void encode_state_attr(std::map<std::string, bufferlist>& attrs);
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(2, 1, bl);
+    encode(state, bl);
+    encode(inc_marker, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(2, bl);
+    decode(state, bl);
+    if (struct_v <= 1) {
+      // v1 carried a full-sync marker here; consume it to keep the
+      // stream aligned, then drop it (full status moved elsewhere).
+      rgw_bucket_shard_full_sync_marker full_marker;
+      decode(full_marker, bl);
+    }
+    decode(inc_marker, bl);
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+
+  rgw_bucket_shard_sync_info() : state((int)StateInit) {}
+
+};
+WRITE_CLASS_ENCODER(rgw_bucket_shard_sync_info)
+
+// Bucket-wide full-sync progress: last listed object key and the
+// number of objects synced so far.
+struct rgw_bucket_full_sync_status {
+  rgw_obj_key position;
+  uint64_t count = 0;
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(position, bl);
+    encode(count, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(position, bl);
+    decode(count, bl);
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(rgw_bucket_full_sync_status)
+
+// Overall sync phase of a bucket (not per shard); values are
+// persisted, so existing entries must keep their numeric identity.
+enum class BucketSyncState : uint8_t {
+  Init = 0,
+  Full,
+  Incremental,
+  Stopped,
+};
+// Human-readable rendering of BucketSyncState for logs and status
+// output; unknown values print nothing.
+inline std::ostream& operator<<(std::ostream& out, const BucketSyncState& s) {
+  switch (s) {
+  case BucketSyncState::Init: out << "init"; break;
+  case BucketSyncState::Full: out << "full"; break;
+  case BucketSyncState::Incremental: out << "incremental"; break;
+  case BucketSyncState::Stopped: out << "stopped"; break;
+  }
+  return out;
+}
+
+void encode_json(const char *name, BucketSyncState state, Formatter *f);
+void decode_json_obj(BucketSyncState& state, JSONObj *obj);
+
+// Persistent bucket sync status: phase, full-sync progress, current
+// incremental generation, and which shards have finished that
+// generation.
+struct rgw_bucket_sync_status {
+  BucketSyncState state = BucketSyncState::Init;
+  rgw_bucket_full_sync_status full;
+  uint64_t incremental_gen = 0;
+  std::vector<bool> shards_done_with_gen;
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(2, 1, bl);
+    encode(state, bl);
+    encode(full, bl);
+    encode(incremental_gen, bl);
+    encode(shards_done_with_gen, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(2, bl);
+    decode(state, bl);
+    decode(full, bl);
+    // generation tracking was added in v2; v1 status defaults to gen 0.
+    if (struct_v > 1) {
+      decode(incremental_gen, bl);
+      decode(shards_done_with_gen, bl);
+    }
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(rgw_bucket_sync_status)
+
+// Combined view returned by bilog status queries: the bucket-wide
+// status plus the per-shard incremental status vector.
+struct bilog_status_v2 {
+  rgw_bucket_sync_status sync_status;
+  std::vector<rgw_bucket_shard_sync_info> inc_status;
+
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+};
+
+// One bilog generation as reported by a remote zone: its id and the
+// shard count in effect for that generation.
+struct store_gen_shards {
+  uint64_t gen = 0;
+  uint32_t num_shards = 0;
+
+  void dump(Formatter *f) const {
+    encode_json("gen", gen, f);
+    encode_json("num_shards", num_shards, f);
+  }
+
+  void decode_json(JSONObj *obj) {
+    JSONDecoder::decode_json("gen", gen, obj);
+    JSONDecoder::decode_json("num_shards", num_shards, obj);
+  }
+};
+
+// Remote bucket-index log metadata, parsed from the JSON returned by
+// the bilog info REST call: version markers, stop flag, and the range
+// of generations the remote still stores.
+struct rgw_bucket_index_marker_info {
+  std::string bucket_ver;
+  std::string master_ver;
+  std::string max_marker;
+  bool syncstopped{false};
+  uint64_t oldest_gen = 0;
+  uint64_t latest_gen = 0;
+  std::vector<store_gen_shards> generations;
+
+  void decode_json(JSONObj *obj) {
+    JSONDecoder::decode_json("bucket_ver", bucket_ver, obj);
+    JSONDecoder::decode_json("master_ver", master_ver, obj);
+    JSONDecoder::decode_json("max_marker", max_marker, obj);
+    JSONDecoder::decode_json("syncstopped", syncstopped, obj);
+    JSONDecoder::decode_json("oldest_gen", oldest_gen, obj);
+    JSONDecoder::decode_json("latest_gen", latest_gen, obj);
+    JSONDecoder::decode_json("generations", generations, obj);
+  }
+};
+
+
+class BucketIndexShardsManager;
+
+int rgw_read_remote_bilog_info(const DoutPrefixProvider *dpp,
+ RGWRESTConn* conn,
+ const rgw_bucket& bucket,
+ rgw_bucket_index_marker_info& info,
+ BucketIndexShardsManager& markers,
+ optional_yield y);
+
+// Manages sync status for one destination bucket across all of its
+// source pipes.  Construction goes through the static construct()
+// factory (the constructor is private) so that do_init() can fail
+// cleanly before an instance is handed out.
+class RGWBucketPipeSyncStatusManager : public DoutPrefixProvider {
+  rgw::sal::RadosStore* driver;
+
+  RGWDataSyncEnv sync_env;
+
+  RGWCoroutinesManager cr_mgr{driver->ctx(),
+                              driver->getRados()->get_cr_registry()};
+
+  RGWHTTPManager http_manager{driver->ctx(), cr_mgr.get_completion_mgr()};
+
+  // Optional filters: when unset, all source zones/buckets are used.
+  std::optional<rgw_zone_id> source_zone;
+  std::optional<rgw_bucket> source_bucket;
+
+  std::unique_ptr<RGWSyncErrorLogger> error_logger =
+    std::make_unique<RGWSyncErrorLogger>(driver, RGW_SYNC_ERROR_LOG_SHARD_PREFIX,
+                                         ERROR_LOGGER_SHARDS);
+  RGWSyncModuleInstanceRef sync_module;
+
+  rgw_bucket dest_bucket;
+
+  // One entry per source pipe feeding dest_bucket.
+  struct source {
+    RGWDataSyncCtx sc;
+    RGWBucketInfo info;
+    rgw_bucket dest;
+    RGWBucketSyncFlowManager::pipe_handler handler;
+    std::string zone_name;
+
+    source(RGWDataSyncEnv* env, const rgw_zone_id& zone, RGWRESTConn* conn,
+           const RGWBucketInfo& info, const rgw_bucket& dest,
+           const RGWBucketSyncFlowManager::pipe_handler& handler,
+           const std::string& zone_name)
+      : sc(env, conn, zone), info(info), dest(dest), handler(handler),
+        zone_name(zone_name) {}
+  };
+  std::vector<source> sources;
+
+  int do_init(const DoutPrefixProvider *dpp, std::ostream* ostr);
+  RGWBucketPipeSyncStatusManager(rgw::sal::RadosStore* driver,
+                                 std::optional<rgw_zone_id> source_zone,
+                                 std::optional<rgw_bucket> source_bucket,
+                                 const rgw_bucket& dest_bucket)
+    : driver(driver), source_zone(source_zone), source_bucket(source_bucket),
+      dest_bucket(dest_bucket) {}
+
+  // Query a source for its oldest/latest bilog generation and shard count.
+  int remote_info(const DoutPrefixProvider *dpp, source& s,
+                  uint64_t* oldest_gen, uint64_t* latest_gen,
+                  uint64_t* num_shards);
+public:
+  // Factory: builds and initializes an instance, or returns the
+  // negative error code from initialization.
+  static tl::expected<std::unique_ptr<RGWBucketPipeSyncStatusManager>, int>
+  construct(const DoutPrefixProvider* dpp, rgw::sal::RadosStore* driver,
+            std::optional<rgw_zone_id> source_zone,
+            std::optional<rgw_bucket> source_bucket,
+            const rgw_bucket& dest_bucket, std::ostream *ostream);
+  ~RGWBucketPipeSyncStatusManager() = default;
+
+
+  // Object-name helpers for the various status objects.
+  static std::string full_status_oid(const rgw_zone_id& source_zone,
+                                     const rgw_bucket& source_bucket,
+                                     const rgw_bucket& dest_bucket);
+  static std::string inc_status_oid(const rgw_zone_id& source_zone,
+                                    const rgw_bucket_sync_pair_info& bs,
+                                    uint64_t gen);
+  // specific source obj sync status, can be used by sync modules
+  static std::string obj_status_oid(const rgw_bucket_sync_pipe& sync_pipe,
+                                    const rgw_zone_id& source_zone,
+                                    const rgw_obj& obj);
+
+  // implements DoutPrefixProvider
+  CephContext *get_cct() const override;
+  unsigned get_subsys() const override;
+  std::ostream& gen_prefix(std::ostream& out) const override;
+
+  int init_sync_status(const DoutPrefixProvider *dpp);
+  tl::expected<std::map<int, rgw_bucket_shard_sync_info>, int> read_sync_status(
+    const DoutPrefixProvider *dpp);
+  int run(const DoutPrefixProvider *dpp);
+};
+
+/// read the full sync status with respect to a source bucket
+int rgw_read_bucket_full_sync_status(const DoutPrefixProvider *dpp,
+ rgw::sal::RadosStore *driver,
+ const rgw_sync_bucket_pipe& pipe,
+ rgw_bucket_sync_status *status,
+ optional_yield y);
+
+/// read the incremental sync status of all bucket shards from the given source zone
+int rgw_read_bucket_inc_sync_status(const DoutPrefixProvider *dpp,
+ rgw::sal::RadosStore *driver,
+ const rgw_sync_bucket_pipe& pipe,
+ uint64_t gen,
+ std::vector<rgw_bucket_shard_sync_info> *status);
+
+// Default sync module: full object replication, supports both writes
+// and exporting data to other zones.
+class RGWDefaultSyncModule : public RGWSyncModule {
+public:
+  RGWDefaultSyncModule() {}
+  bool supports_writes() override { return true; }
+  bool supports_data_export() override { return true; }
+  int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) override;
+};
+
+// Archive sync module: accepts writes like the default module but is a
+// sync sink only — it does not export data onward to other zones.
+class RGWArchiveSyncModule : public RGWDefaultSyncModule {
+public:
+  RGWArchiveSyncModule() {}
+  bool supports_writes() override { return true; }
+  bool supports_data_export() override { return false; }
+  int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) override;
+};
diff --git a/src/rgw/driver/rados/rgw_datalog.cc b/src/rgw/driver/rados/rgw_datalog.cc
new file mode 100644
index 000000000..7ca37abf6
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_datalog.cc
@@ -0,0 +1,1090 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <vector>
+
+#include "common/async/yield_context.h"
+#include "common/debug.h"
+#include "common/containers.h"
+#include "common/errno.h"
+#include "common/error_code.h"
+
+#include "common/async/blocked_completion.h"
+#include "common/async/librados_completion.h"
+
+#include "cls/fifo/cls_fifo_types.h"
+#include "cls/log/cls_log_client.h"
+
+#include "cls_fifo_legacy.h"
+#include "rgw_bucket_layout.h"
+#include "rgw_datalog.h"
+#include "rgw_log_backing.h"
+#include "rgw_tools.h"
+
+#define dout_context g_ceph_context
+static constexpr auto dout_subsys = ceph_subsys_rgw;
+
+namespace bs = boost::system;
+namespace lr = librados;
+
+using ceph::containers::tiny_vector;
+
+// Emit this change entry as JSON; the numeric entity_type is rendered
+// as a string ("bucket" or "unknown") and the timestamp as utime_t.
+void rgw_data_change::dump(ceph::Formatter *f) const
+{
+  std::string type;
+  switch (entity_type) {
+    case ENTITY_TYPE_BUCKET:
+      type = "bucket";
+      break;
+    default:
+      type = "unknown";
+  }
+  encode_json("entity_type", type, f);
+  encode_json("key", key, f);
+  utime_t ut(timestamp);
+  encode_json("timestamp", ut, f);
+  encode_json("gen", gen, f);
+}
+
+// Parse a change entry from JSON (inverse of dump()); any entity_type
+// string other than "bucket" maps to ENTITY_TYPE_UNKNOWN.
+void rgw_data_change::decode_json(JSONObj *obj) {
+  std::string s;
+  JSONDecoder::decode_json("entity_type", s, obj);
+  if (s == "bucket") {
+    entity_type = ENTITY_TYPE_BUCKET;
+  } else {
+    entity_type = ENTITY_TYPE_UNKNOWN;
+  }
+  JSONDecoder::decode_json("key", key, obj);
+  utime_t ut;
+  JSONDecoder::decode_json("timestamp", ut, obj);
+  timestamp = ut.to_real_time();
+  JSONDecoder::decode_json("gen", gen, obj);
+}
+
+// Emit a log entry (id, timestamp, nested change) as JSON.
+void rgw_data_change_log_entry::dump(Formatter *f) const
+{
+  encode_json("log_id", log_id, f);
+  utime_t ut(log_timestamp);
+  encode_json("log_timestamp", ut, f);
+  encode_json("entry", entry, f);
+}
+
+// Parse a log entry from JSON (inverse of dump()).
+void rgw_data_change_log_entry::decode_json(JSONObj *obj) {
+  JSONDecoder::decode_json("log_id", log_id, obj);
+  utime_t ut;
+  JSONDecoder::decode_json("log_timestamp", ut, obj);
+  log_timestamp = ut.to_real_time();
+  JSONDecoder::decode_json("entry", entry, obj);
+}
+
+// Emit a notify entry (bucket-shard key and generation) as JSON.
+void rgw_data_notify_entry::dump(Formatter *f) const
+{
+  encode_json("key", key, f);
+  encode_json("gen", gen, f);
+}
+
+// Parse a notify entry from JSON (inverse of dump()).
+void rgw_data_notify_entry::decode_json(JSONObj *obj) {
+  JSONDecoder::decode_json("key", key, obj);
+  JSONDecoder::decode_json("gen", gen, obj);
+}
+
+// Datalog backend storing entries in cls_log (omap) objects, one RADOS
+// object per shard.  Object names are precomputed in `oids` at
+// construction.  Fix: both trim() overloads logged a copy-pasted
+// "failed to get info from" message; they now report the trim failure.
+class RGWDataChangesOmap final : public RGWDataChangesBE {
+  using centries = std::list<cls_log_entry>;
+  std::vector<std::string> oids;
+
+public:
+  RGWDataChangesOmap(lr::IoCtx& ioctx,
+                     RGWDataChangesLog& datalog,
+                     uint64_t gen_id,
+                     int num_shards)
+    : RGWDataChangesBE(ioctx, datalog, gen_id) {
+    oids.reserve(num_shards);
+    for (auto i = 0; i < num_shards; ++i) {
+      oids.push_back(get_oid(i));
+    }
+  }
+  ~RGWDataChangesOmap() override = default;
+
+  // Append one serialized change to `out`, initializing it to the
+  // omap-backend entry type on first use.
+  void prepare(ceph::real_time ut, const std::string& key,
+               ceph::buffer::list&& entry, entries& out) override {
+    if (!std::holds_alternative<centries>(out)) {
+      ceph_assert(std::visit([](const auto& v) { return std::empty(v); }, out));
+      out = centries();
+    }
+
+    cls_log_entry e;
+    cls_log_add_prepare_entry(e, utime_t(ut), {}, key, entry);
+    std::get<centries>(out).push_back(std::move(e));
+  }
+  // Batch-write prepared entries to the shard object; returns 0 or a
+  // negative error code.
+  int push(const DoutPrefixProvider *dpp, int index, entries&& items, optional_yield y) override {
+    lr::ObjectWriteOperation op;
+    cls_log_add(op, std::get<centries>(items), true);
+    auto r = rgw_rados_operate(dpp, ioctx, oids[index], &op, y);
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+                         << ": failed to push to " << oids[index] << cpp_strerror(-r)
+                         << dendl;
+    }
+    return r;
+  }
+  // Write a single entry to the shard object.
+  int push(const DoutPrefixProvider *dpp, int index, ceph::real_time now,
+           const std::string& key, ceph::buffer::list&& bl,
+           optional_yield y) override {
+    lr::ObjectWriteOperation op;
+    cls_log_add(op, utime_t(now), {}, key, bl);
+    auto r = rgw_rados_operate(dpp, ioctx, oids[index], &op, y);
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+                         << ": failed to push to " << oids[index]
+                         << cpp_strerror(-r) << dendl;
+    }
+    return r;
+  }
+  // List up to max_entries from `marker`; a missing shard object is
+  // treated as an empty, non-truncated listing.  Returns -EIO if a
+  // stored entry fails to decode.
+  int list(const DoutPrefixProvider *dpp, int index, int max_entries,
+           std::vector<rgw_data_change_log_entry>& entries,
+           std::optional<std::string_view> marker,
+           std::string* out_marker, bool* truncated,
+           optional_yield y) override {
+    std::list<cls_log_entry> log_entries;
+    lr::ObjectReadOperation op;
+    cls_log_list(op, {}, {}, std::string(marker.value_or("")),
+                 max_entries, log_entries, out_marker, truncated);
+    auto r = rgw_rados_operate(dpp, ioctx, oids[index], &op, nullptr, y);
+    if (r == -ENOENT) {
+      *truncated = false;
+      return 0;
+    }
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+                         << ": failed to list " << oids[index]
+                         << cpp_strerror(-r) << dendl;
+      return r;
+    }
+    for (auto iter = log_entries.begin(); iter != log_entries.end(); ++iter) {
+      rgw_data_change_log_entry log_entry;
+      log_entry.log_id = iter->id;
+      auto rt = iter->timestamp.to_real_time();
+      log_entry.log_timestamp = rt;
+      auto liter = iter->data.cbegin();
+      try {
+        decode(log_entry.entry, liter);
+      } catch (ceph::buffer::error& err) {
+        ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+                           << ": failed to decode data changes log entry: "
+                           << err.what() << dendl;
+        return -EIO;
+      }
+      entries.push_back(log_entry);
+    }
+    return 0;
+  }
+  // Read the shard's max marker and last update time from the cls_log
+  // header; a missing object yields default (empty) info.
+  int get_info(const DoutPrefixProvider *dpp, int index,
+               RGWDataChangesLogInfo *info, optional_yield y) override {
+    cls_log_header header;
+    lr::ObjectReadOperation op;
+    cls_log_info(op, &header);
+    auto r = rgw_rados_operate(dpp, ioctx, oids[index], &op, nullptr, y);
+    if (r == -ENOENT) r = 0;
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+                         << ": failed to get info from " << oids[index]
+                         << cpp_strerror(-r) << dendl;
+    } else {
+      info->marker = header.max_marker;
+      info->last_update = header.max_time.to_real_time();
+    }
+    return r;
+  }
+  // Synchronously trim entries up to `marker`; -ENOENT is mapped to
+  // -ENODATA ("nothing left to trim") and not logged as an error.
+  int trim(const DoutPrefixProvider *dpp, int index, std::string_view marker,
+           optional_yield y) override {
+    lr::ObjectWriteOperation op;
+    cls_log_trim(op, {}, {}, {}, std::string(marker));
+    auto r = rgw_rados_operate(dpp, ioctx, oids[index], &op, y);
+    if (r == -ENOENT) r = -ENODATA;
+    if (r < 0 && r != -ENODATA) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+                         << ": failed to trim " << oids[index]
+                         << cpp_strerror(-r) << dendl;
+    }
+    return r;
+  }
+  // Asynchronous trim via AioCompletion; only the submission result is
+  // checked here, the operation's outcome is reported through `c`.
+  int trim(const DoutPrefixProvider *dpp, int index, std::string_view marker,
+           lr::AioCompletion* c) override {
+    lr::ObjectWriteOperation op;
+    cls_log_trim(op, {}, {}, {}, std::string(marker));
+    auto r = ioctx.aio_operate(oids[index], c, &op, 0);
+    if (r == -ENOENT) r = -ENODATA;
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+                         << ": failed to trim " << oids[index]
+                         << cpp_strerror(-r) << dendl;
+    }
+    return r;
+  }
+  // cls_log markers are timestamps rendered as strings; this sorts
+  // after any real marker.
+  std::string_view max_marker() const override {
+    return "99999999";
+  }
+  // Returns 1 if every shard object is absent or empty, 0 if any entry
+  // exists, negative error code on failure.
+  int is_empty(const DoutPrefixProvider *dpp, optional_yield y) override {
+    for (auto shard = 0u; shard < oids.size(); ++shard) {
+      std::list<cls_log_entry> log_entries;
+      lr::ObjectReadOperation op;
+      std::string out_marker;
+      bool truncated;
+      cls_log_list(op, {}, {}, {}, 1, log_entries, &out_marker, &truncated);
+      auto r = rgw_rados_operate(dpp, ioctx, oids[shard], &op, nullptr, y);
+      if (r == -ENOENT) {
+        continue;
+      }
+      if (r < 0) {
+        ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+                           << ": failed to list " << oids[shard]
+                           << cpp_strerror(-r) << dendl;
+        return r;
+      }
+      if (!log_entries.empty()) {
+        return 0;
+      }
+    }
+    return 1;
+  }
+};
+
+// Datalog backend storing entries in cls_fifo queues, one lazily
+// opened FIFO per shard.
+class RGWDataChangesFIFO final : public RGWDataChangesBE {
+  using centries = std::vector<ceph::buffer::list>;
+  tiny_vector<LazyFIFO> fifos;
+
+public:
+  RGWDataChangesFIFO(lr::IoCtx& ioctx,
+                     RGWDataChangesLog& datalog,
+                     uint64_t gen_id, int shards)
+    : RGWDataChangesBE(ioctx, datalog, gen_id),
+      fifos(shards, [&ioctx, this](std::size_t i, auto emplacer) {
+        emplacer.emplace(ioctx, get_oid(i));
+      }) {}
+  ~RGWDataChangesFIFO() override = default;
+  // Append one serialized change to `out`, initializing it to the
+  // FIFO-backend entry type on first use.  Time/key are unused: the
+  // FIFO stores only the raw buffer.
+  void prepare(ceph::real_time, const std::string&,
+               ceph::buffer::list&& entry, entries& out) override {
+    if (!std::holds_alternative<centries>(out)) {
+      ceph_assert(std::visit([](auto& v) { return std::empty(v); }, out));
+      out = centries();
+    }
+    std::get<centries>(out).push_back(std::move(entry));
+  }
+  // Batch-push prepared entries into the shard's FIFO.
+  int push(const DoutPrefixProvider *dpp, int index, entries&& items,
+           optional_yield y) override {
+    auto r = fifos[index].push(dpp, std::get<centries>(items), y);
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+                         << ": unable to push to FIFO: " << get_oid(index)
+                         << ": " << cpp_strerror(-r) << dendl;
+    }
+    return r;
+  }
+  // Push a single entry into the shard's FIFO.
+  int push(const DoutPrefixProvider *dpp, int index, ceph::real_time,
+           const std::string&, ceph::buffer::list&& bl,
+           optional_yield y) override {
+    auto r = fifos[index].push(dpp, std::move(bl), y);
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+                         << ": unable to push to FIFO: " << get_oid(index)
+                         << ": " << cpp_strerror(-r) << dendl;
+    }
+    return r;
+  }
+  // List up to max_entries from `marker`; returns -EIO if a stored
+  // entry fails to decode.  out_marker is the last returned entry's
+  // FIFO marker.
+  int list(const DoutPrefixProvider *dpp, int index, int max_entries,
+           std::vector<rgw_data_change_log_entry>& entries,
+           std::optional<std::string_view> marker, std::string* out_marker,
+           bool* truncated, optional_yield y) override {
+    std::vector<rgw::cls::fifo::list_entry> log_entries;
+    bool more = false;
+    auto r = fifos[index].list(dpp, max_entries, marker, &log_entries, &more,
+                               y);
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+                         << ": unable to list FIFO: " << get_oid(index)
+                         << ": " << cpp_strerror(-r) << dendl;
+      return r;
+    }
+    for (const auto& entry : log_entries) {
+      rgw_data_change_log_entry log_entry;
+      log_entry.log_id = entry.marker;
+      log_entry.log_timestamp = entry.mtime;
+      auto liter = entry.data.cbegin();
+      try {
+        decode(log_entry.entry, liter);
+      } catch (const buffer::error& err) {
+        ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+                           << ": failed to decode data changes log entry: "
+                           << err.what() << dendl;
+        return -EIO;
+      }
+      entries.push_back(std::move(log_entry));
+    }
+    if (truncated)
+      *truncated = more;
+    if (out_marker && !log_entries.empty()) {
+      *out_marker = log_entries.back().marker;
+    }
+    return 0;
+  }
+  // Derive marker/last-update info from the FIFO's head part; an
+  // empty FIFO (no head part) reports an empty marker and zero time.
+  int get_info(const DoutPrefixProvider *dpp, int index,
+               RGWDataChangesLogInfo *info, optional_yield y) override {
+    auto& fifo = fifos[index];
+    auto r = fifo.read_meta(dpp, y);
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+                         << ": unable to get FIFO metadata: " << get_oid(index)
+                         << ": " << cpp_strerror(-r) << dendl;
+      return r;
+    }
+    rados::cls::fifo::info m;
+    // NOTE(review): meta()'s return value is ignored here — confirm it
+    // cannot fail after a successful read_meta().
+    fifo.meta(dpp, m, y);
+    auto p = m.head_part_num;
+    if (p < 0) {
+      info->marker = "";
+      info->last_update = ceph::real_clock::zero();
+      return 0;
+    }
+    rgw::cls::fifo::part_info h;
+    r = fifo.get_part_info(dpp, p, &h, y);
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+                         << ": unable to get part info: " << get_oid(index) << "/" << p
+                         << ": " << cpp_strerror(-r) << dendl;
+      return r;
+    }
+    info->marker = rgw::cls::fifo::marker{p, h.last_ofs}.to_string();
+    info->last_update = h.max_time;
+    return 0;
+  }
+  // Synchronously trim the FIFO up to (exclusive of) `marker`.
+  int trim(const DoutPrefixProvider *dpp, int index, std::string_view marker,
+           optional_yield y) override {
+    auto r = fifos[index].trim(dpp, marker, false, y);
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+                         << ": unable to trim FIFO: " << get_oid(index)
+                         << ": " << cpp_strerror(-r) << dendl;
+    }
+    return r;
+  }
+  // Asynchronous trim: a trim to the zero marker means "nothing to
+  // trim", so complete `c` immediately with -ENODATA.
+  int trim(const DoutPrefixProvider *dpp, int index, std::string_view marker,
+           librados::AioCompletion* c) override {
+    int r = 0;
+    if (marker == rgw::cls::fifo::marker(0, 0).to_string()) {
+      rgw_complete_aio_completion(c, -ENODATA);
+    } else {
+      // This null_yield is used for lazily opening FIFOs.
+      //
+      // shouldn't exist, but it can't be eliminated
+      // since your caller is an RGWCoroutine in the data sync code.
+      //
+      // It can be eliminated after Reef when we can get rid of
+      // AioCompletion entirely.
+      fifos[index].trim(dpp, marker, false, c, null_yield);
+    }
+    return r;
+  }
+  // Largest possible FIFO marker; computed once and cached.
+  std::string_view max_marker() const override {
+    static const std::string mm =
+      rgw::cls::fifo::marker::max().to_string();
+    return std::string_view(mm);
+  }
+  // Returns 1 if every shard FIFO is empty, 0 if any entry exists,
+  // negative error code on failure.
+  int is_empty(const DoutPrefixProvider *dpp, optional_yield y) override {
+    std::vector<rgw::cls::fifo::list_entry> log_entries;
+    bool more = false;
+    for (auto shard = 0u; shard < fifos.size(); ++shard) {
+      auto r = fifos[shard].list(dpp, 1, {}, &log_entries, &more, y);
+      if (r < 0) {
+        ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+                           << ": unable to list FIFO: " << get_oid(shard)
+                           << ": " << cpp_strerror(-r) << dendl;
+        return r;
+      }
+      if (!log_entries.empty()) {
+        return 0;
+      }
+    }
+    return 1;
+  }
+};
+
+// Construct the datalog with shard count and change-cache size taken
+// from configuration; backends are attached later in start().
+RGWDataChangesLog::RGWDataChangesLog(CephContext* cct)
+  : cct(cct),
+    num_shards(cct->_conf->rgw_data_log_num_shards),
+    prefix(get_prefix()),
+    changes(cct->_conf->rgw_data_log_changes_size) {}
+
+// Instantiate a backend object (omap or FIFO) for every generation
+// discovered at startup.  Pruned/duplicate generations are logged but
+// not treated as fatal; an unknown log type or backend construction
+// failure aborts with an error code.
+bs::error_code DataLogBackends::handle_init(entries_t e) noexcept {
+  std::unique_lock l(m);
+
+  for (const auto& [gen_id, gen] : e) {
+    if (gen.pruned) {
+      // NOTE(review): message says "empty generation" but the check is
+      // gen.pruned — confirm the wording matches the intent.
+      lderr(datalog.cct)
+        << __PRETTY_FUNCTION__ << ":" << __LINE__
+        << ": ERROR: given empty generation: gen_id=" << gen_id << dendl;
+    }
+    if (count(gen_id) != 0) {
+      lderr(datalog.cct)
+        << __PRETTY_FUNCTION__ << ":" << __LINE__
+        << ": ERROR: generation already exists: gen_id=" << gen_id << dendl;
+    }
+    try {
+      switch (gen.type) {
+      case log_type::omap:
+        emplace(gen_id, new RGWDataChangesOmap(ioctx, datalog, gen_id, shards));
+        break;
+      case log_type::fifo:
+        emplace(gen_id, new RGWDataChangesFIFO(ioctx, datalog, gen_id, shards));
+        break;
+      default:
+        lderr(datalog.cct)
+          << __PRETTY_FUNCTION__ << ":" << __LINE__
+          << ": IMPOSSIBLE: invalid log type: gen_id=" << gen_id
+          << ", type" << gen.type << dendl;
+        return bs::error_code(EFAULT, bs::system_category());
+      }
+    } catch (const bs::system_error& err) {
+      lderr(datalog.cct)
+        << __PRETTY_FUNCTION__ << ":" << __LINE__
+        << ": error setting up backend: gen_id=" << gen_id
+        << ", err=" << err.what() << dendl;
+      return err.code();
+    }
+  }
+  return {};
+}
+// Newly announced generations are set up exactly like the initial set.
+bs::error_code DataLogBackends::handle_new_gens(entries_t e) noexcept {
+  return handle_init(std::move(e));
+}
+// Drop backend objects for generations at or below new_tail, which
+// another node has declared empty.  Refuses to erase the newest
+// (head) generation.
+bs::error_code DataLogBackends::handle_empty_to(uint64_t new_tail) noexcept {
+  std::unique_lock l(m);
+  auto i = cbegin();
+  // NOTE(review): this early-out fires when the *oldest* generation is
+  // below new_tail, which looks inverted relative to the erase below —
+  // confirm the intended guard against upstream behavior.
+  if (i->first < new_tail) {
+    return {};
+  }
+  if (new_tail >= (cend() - 1)->first) {
+    lderr(datalog.cct)
+      << __PRETTY_FUNCTION__ << ":" << __LINE__
+      << ": ERROR: attempt to trim head: new_tail=" << new_tail << dendl;
+    return bs::error_code(EFAULT, bs::system_category());
+  }
+  erase(i, upper_bound(new_tail));
+  return {};
+}
+
+
+// Initialize the datalog: open the log pool ioctx, build the
+// per-generation backends, and launch the renew thread.  Returns 0 on
+// success or a nonzero error value.
+int RGWDataChangesLog::start(const DoutPrefixProvider *dpp, const RGWZone* _zone,
+                             const RGWZoneParams& zoneparams,
+                             librados::Rados* lr)
+{
+  zone = _zone;
+  ceph_assert(zone);
+  auto defbacking = to_log_type(
+    cct->_conf.get_val<std::string>("rgw_default_data_log_backing"));
+  // Should be guaranteed by `set_enum_allowed`
+  ceph_assert(defbacking);
+  auto log_pool = zoneparams.log_pool;
+  auto r = rgw_init_ioctx(dpp, lr, log_pool, ioctx, true, false);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__
+                       << ": Failed to initialized ioctx, r=" << r
+                       << ", pool=" << log_pool << dendl;
+    // NOTE(review): r is already negative here, so `-r` returns a
+    // positive value while callers test `< 0` — confirm this sign is
+    // intentional.
+    return -r;
+  }
+
+  // This null_yield is in startup code, so it doesn't matter that much.
+  auto besr = logback_generations::init<DataLogBackends>(
+    dpp, ioctx, metadata_log_oid(), [this](uint64_t gen_id, int shard) {
+      return get_oid(gen_id, shard);
+    },
+    num_shards, *defbacking, null_yield, *this);
+
+
+  if (!besr) {
+    lderr(cct) << __PRETTY_FUNCTION__
+               << ": Error initializing backends: "
+               << besr.error().message() << dendl;
+    return ceph::from_error_code(besr.error());
+  }
+
+  bes = std::move(*besr);
+  renew_thread = make_named_thread("rgw_dt_lg_renew",
+                                   &RGWDataChangesLog::renew_run, this);
+  return 0;
+}
+
+// Map a bucket shard to a datalog shard index: hash the bucket name,
+// offset by the shard id (negative ids, e.g. unsharded buckets,
+// contribute 0), modulo the shard count.
+int RGWDataChangesLog::choose_oid(const rgw_bucket_shard& bs) {
+  const auto& name = bs.bucket.name;
+  auto shard_shift = (bs.shard_id > 0 ? bs.shard_id : 0);
+  auto r = (ceph_str_hash_linux(name.data(), name.size()) +
+            shard_shift) % num_shards;
+  return static_cast<int>(r);
+}
+
+// Flush the accumulated "current cycle" of changed bucket shards to
+// the head log generation, then push each shard's renewal expiration
+// forward by the configured window.  No-op when the zone doesn't log
+// data changes.
+int RGWDataChangesLog::renew_entries(const DoutPrefixProvider *dpp)
+{
+  if (!zone->log_data)
+    return 0;
+
+  /* we can't keep the bucket name as part of the cls_log_entry, and we need
+   * it later, so we keep two lists under the map */
+  bc::flat_map<int, std::pair<std::vector<BucketGen>,
+                              RGWDataChangesBE::entries>> m;
+
+  // Swap the pending set out under the lock, then work on it unlocked.
+  std::unique_lock l(lock);
+  decltype(cur_cycle) entries;
+  entries.swap(cur_cycle);
+  l.unlock();
+
+  auto ut = real_clock::now();
+  auto be = bes->head();
+  for (const auto& [bs, gen] : entries) {
+    auto index = choose_oid(bs);
+
+    rgw_data_change change;
+    bufferlist bl;
+    change.entity_type = ENTITY_TYPE_BUCKET;
+    change.key = bs.get_key();
+    change.timestamp = ut;
+    change.gen = gen;
+    encode(change, bl);
+
+    m[index].first.push_back({bs, gen});
+    be->prepare(ut, change.key, std::move(bl), m[index].second);
+  }
+
+  for (auto& [index, p] : m) {
+    auto& [buckets, entries] = p;
+
+    auto now = real_clock::now();
+
+    // This null_yield can stay (for now) as we're in our own thread.
+    auto ret = be->push(dpp, index, std::move(entries), null_yield);
+    if (ret < 0) {
+      /* we don't really need to have a special handling for failed cases here,
+       * as this is just an optimization. */
+      ldpp_dout(dpp, -1) << "ERROR: svc.cls->timelog.add() returned " << ret << dendl;
+      return ret;
+    }
+
+    // Mark every shard in this batch as renewed until now + window.
+    auto expiration = now;
+    expiration += ceph::make_timespan(cct->_conf->rgw_data_log_window);
+    for (auto& [bs, gen] : buckets) {
+      update_renewed(bs, gen, expiration);
+    }
+  }
+
+  return 0;
+}
+
+auto RGWDataChangesLog::_get_change(const rgw_bucket_shard& bs,
+ uint64_t gen)
+ -> ChangeStatusPtr
+{
+ ceph_assert(ceph_mutex_is_locked(lock));
+ ChangeStatusPtr status;
+ if (!changes.find({bs, gen}, status)) {
+ status = std::make_shared<ChangeStatus>();
+ changes.add({bs, gen}, status);
+ }
+ return status;
+}
+
+void RGWDataChangesLog::register_renew(const rgw_bucket_shard& bs,
+ const rgw::bucket_log_layout_generation& gen)
+{
+ std::scoped_lock l{lock};
+ cur_cycle.insert({bs, gen.gen});
+}
+
+void RGWDataChangesLog::update_renewed(const rgw_bucket_shard& bs,
+ uint64_t gen,
+ real_time expiration)
+{
+ std::unique_lock l{lock};
+ auto status = _get_change(bs, gen);
+ l.unlock();
+
+ ldout(cct, 20) << "RGWDataChangesLog::update_renewd() bucket_name="
+ << bs.bucket.name << " shard_id=" << bs.shard_id
+ << " expiration=" << expiration << dendl;
+
+ std::unique_lock sl(status->lock);
+ status->cur_expiration = expiration;
+}
+
+int RGWDataChangesLog::get_log_shard_id(rgw_bucket& bucket, int shard_id) {
+ rgw_bucket_shard bs(bucket, shard_id);
+ return choose_oid(bs);
+}
+
+bool RGWDataChangesLog::filter_bucket(const DoutPrefixProvider *dpp,
+ const rgw_bucket& bucket,
+ optional_yield y) const
+{
+ if (!bucket_filter) {
+ return true;
+ }
+
+ return bucket_filter(bucket, y, dpp);
+}
+
+std::string RGWDataChangesLog::get_oid(uint64_t gen_id, int i) const {
+ return (gen_id > 0 ?
+ fmt::format("{}@G{}.{}", prefix, gen_id, i) :
+ fmt::format("{}.{}", prefix, i));
+}
+
// Publish a change for one bucket shard. Writes at most once per
// expiration window: recent writes are coalesced into the renewal
// cycle, and concurrent writers for the same (shard, gen) wait on a
// shared RefCountedCond instead of issuing duplicate pushes.
int RGWDataChangesLog::add_entry(const DoutPrefixProvider *dpp,
				 const RGWBucketInfo& bucket_info,
				 const rgw::bucket_log_layout_generation& gen,
				 int shard_id, optional_yield y)
{
  auto& bucket = bucket_info.bucket;

  // Buckets excluded by the filter are not logged at all.
  if (!filter_bucket(dpp, bucket, y)) {
    return 0;
  }

  if (observer) {
    observer->on_bucket_changed(bucket.get_key());
  }

  rgw_bucket_shard bs(bucket, shard_id);

  int index = choose_oid(bs);

  // Record the change for the notify mechanism.
  mark_modified(index, bs, gen.gen);

  std::unique_lock l(lock);

  auto status = _get_change(bs, gen.gen);
  l.unlock();

  auto now = real_clock::now();

  std::unique_lock sl(status->lock);

  ldpp_dout(dpp, 20) << "RGWDataChangesLog::add_entry() bucket.name=" << bucket.name
		     << " shard_id=" << shard_id << " now=" << now
		     << " cur_expiration=" << status->cur_expiration << dendl;

  if (now < status->cur_expiration) {
    /* no need to send, recently completed */
    sl.unlock();
    register_renew(bs, gen);
    return 0;
  }

  RefCountedCond* cond;

  if (status->pending) {
    // Another thread is already writing this entry: wait for its result
    // and on success just queue the shard for the renewal cycle.
    cond = status->cond;

    ceph_assert(cond);

    status->cond->get();
    sl.unlock();

    int ret = cond->wait();
    cond->put();
    if (!ret) {
      register_renew(bs, gen);
    }
    return ret;
  }

  // We are the writer; later arrivals wait on this cond while `pending`.
  status->cond = new RefCountedCond;
  status->pending = true;

  ceph::real_time expiration;

  int ret;

  // Retry if the push itself took longer than the expiration window,
  // so the published timestamp stays fresh.
  do {
    status->cur_sent = now;

    expiration = now;
    expiration += ceph::make_timespan(cct->_conf->rgw_data_log_window);

    sl.unlock();

    ceph::buffer::list bl;
    rgw_data_change change;
    change.entity_type = ENTITY_TYPE_BUCKET;
    change.key = bs.get_key();
    change.timestamp = now;
    change.gen = gen.gen;
    encode(change, bl);

    ldpp_dout(dpp, 20) << "RGWDataChangesLog::add_entry() sending update with now=" << now << " cur_expiration=" << expiration << dendl;

    auto be = bes->head();
    ret = be->push(dpp, index, now, change.key, std::move(bl), y);

    now = real_clock::now();

    sl.lock();

  } while (!ret && real_clock::now() > expiration);

  // Publish the result to all waiters and clear the pending state.
  cond = status->cond;

  status->pending = false;
  /* time of when operation started, not completed */
  status->cur_expiration = status->cur_sent;
  status->cur_expiration += make_timespan(cct->_conf->rgw_data_log_window);
  status->cond = nullptr;
  sl.unlock();

  cond->done(ret);
  cond->put();

  return ret;
}
+
// List up to max_entries entries for one shard, walking generations in
// ascending order and re-qualifying markers/ids as "<gen>@<cursor>".
int DataLogBackends::list(const DoutPrefixProvider *dpp, int shard, int max_entries,
			  std::vector<rgw_data_change_log_entry>& entries,
			  std::string_view marker, std::string* out_marker,
			  bool* truncated, optional_yield y)
{
  // Split the composite marker into the generation it points into and
  // the backend-local cursor within that generation.
  const auto [start_id, start_cursor] = cursorgen(marker);
  auto gen_id = start_id;
  std::string out_cursor;
  while (max_entries > 0) {
    std::vector<rgw_data_change_log_entry> gentries;
    // Find the first backend at or after gen_id under the lock, then
    // release it for the listing I/O.
    std::unique_lock l(m);
    auto i = lower_bound(gen_id);
    if (i == end()) return 0;
    auto be = i->second;
    l.unlock();
    gen_id = be->gen_id;
    // Only the generation the marker points into resumes from the saved
    // cursor; later generations are listed from their start.
    auto r = be->list(dpp, shard, max_entries, gentries,
		      gen_id == start_id ? start_cursor : std::string{},
		      &out_cursor, truncated, y);
    if (r < 0)
      return r;

    if (out_marker && !out_cursor.empty()) {
      *out_marker = gencursor(gen_id, out_cursor);
    }
    for (auto& g : gentries) {
      g.log_id = gencursor(gen_id, g.log_id);
    }
    // NOTE(review): size() is never negative; the s < 0 arm only guards
    // against int overflow on huge result sets.
    if (int s = gentries.size(); s < 0 || s > max_entries)
      max_entries = 0;
    else
      max_entries -= gentries.size();

    std::move(gentries.begin(), gentries.end(),
	      std::back_inserter(entries));
    ++gen_id;
  }
  return 0;
}
+
// Single-shard listing: thin forwarder to the generation-aware
// DataLogBackends::list().
int RGWDataChangesLog::list_entries(const DoutPrefixProvider *dpp, int shard, int max_entries,
				    std::vector<rgw_data_change_log_entry>& entries,
				    std::string_view marker,
				    std::string* out_marker, bool* truncated,
				    optional_yield y)
{
  assert(shard < num_shards);
  return bes->list(dpp, shard, max_entries, entries, marker, out_marker,
		   truncated, y);
}
+
// All-shards listing: walks shards starting at marker.shard, filling
// `entries` up to max_entries. `marker` is advanced in place so a
// subsequent call resumes where this one stopped.
int RGWDataChangesLog::list_entries(const DoutPrefixProvider *dpp, int max_entries,
				    std::vector<rgw_data_change_log_entry>& entries,
				    LogMarker& marker, bool *ptruncated,
				    optional_yield y)
{
  bool truncated;
  entries.clear();
  for (; marker.shard < num_shards && int(entries.size()) < max_entries;
       marker.shard++, marker.marker.clear()) {
    int ret = list_entries(dpp, marker.shard, max_entries - entries.size(),
			   entries, marker.marker, NULL, &truncated, y);
    if (ret == -ENOENT) {
      // A missing shard object simply has no entries yet.
      continue;
    }
    if (ret < 0) {
      return ret;
    }
    if (!truncated) {
      *ptruncated = false;
      return 0;
    }
  }
  // Truncated if the entry budget ran out before the shards did.
  *ptruncated = (marker.shard < num_shards);
  return 0;
}
+
+int RGWDataChangesLog::get_info(const DoutPrefixProvider *dpp, int shard_id,
+ RGWDataChangesLogInfo *info, optional_yield y)
+{
+ assert(shard_id < num_shards);
+ auto be = bes->head();
+ auto r = be->get_info(dpp, shard_id, info, y);
+ if (!info->marker.empty()) {
+ info->marker = gencursor(be->gen_id, info->marker);
+ }
+ return r;
+}
+
// Trim one shard up to a composite marker: generations before the
// marker's generation are trimmed completely, the target generation is
// trimmed up to the marker's cursor.
int DataLogBackends::trim_entries(const DoutPrefixProvider *dpp, int shard_id,
				  std::string_view marker, optional_yield y)
{
  auto [target_gen, cursor] = cursorgen(marker);
  std::unique_lock l(m);
  const auto head_gen = (end() - 1)->second->gen_id;
  const auto tail_gen = begin()->first;
  // Everything before the tail is already gone.
  if (target_gen < tail_gen) return 0;
  auto r = 0;
  // The lock is dropped around each trim I/O and re-taken before
  // advancing to the next generation's backend.
  for (auto be = lower_bound(0)->second;
       be->gen_id <= target_gen && be->gen_id <= head_gen && r >= 0;
       be = upper_bound(be->gen_id)->second) {
    l.unlock();
    auto c = be->gen_id == target_gen ? cursor : be->max_marker();
    r = be->trim(dpp, shard_id, c, y);
    if (r == -ENOENT)
      r = -ENODATA;
    // An already-empty generation before the target is not an error.
    if (r == -ENODATA && be->gen_id < target_gen)
      r = 0;
    if (be->gen_id == target_gen)
      break;
    l.lock();
  };
  return r;
}
+
+int RGWDataChangesLog::trim_entries(const DoutPrefixProvider *dpp, int shard_id,
+ std::string_view marker, optional_yield y)
+{
+ assert(shard_id < num_shards);
+ return bes->trim_entries(dpp, shard_id, marker, y);
+}
+
// Asynchronous multi-generation trim state machine: each completed trim
// re-arms (via handle()) for the next generation until the target
// generation's cursor has been trimmed or an error occurs.
class GenTrim : public rgw::cls::fifo::Completion<GenTrim> {
public:
  DataLogBackends* const bes;
  const int shard_id;
  const uint64_t target_gen;  // generation the trim marker points into
  const std::string cursor;   // cursor within target_gen
  const uint64_t head_gen;
  const uint64_t tail_gen;
  boost::intrusive_ptr<RGWDataChangesBE> be;  // backend currently trimming

  GenTrim(const DoutPrefixProvider *dpp, DataLogBackends* bes, int shard_id, uint64_t target_gen,
	  std::string cursor, uint64_t head_gen, uint64_t tail_gen,
	  boost::intrusive_ptr<RGWDataChangesBE> be,
	  lr::AioCompletion* super)
    : Completion(dpp, super), bes(bes), shard_id(shard_id), target_gen(target_gen),
      cursor(std::move(cursor)), head_gen(head_gen), tail_gen(tail_gen),
      be(std::move(be)) {}

  // Invoked when the trim of `be` finishes with result r; either
  // completes the whole operation or starts the next generation's trim.
  void handle(const DoutPrefixProvider *dpp, Ptr&& p, int r) {
    auto gen_id = be->gen_id;
    be.reset();
    if (r == -ENOENT)
      r = -ENODATA;
    // An already-empty generation before the target is fine.
    if (r == -ENODATA && gen_id < target_gen)
      r = 0;
    if (r < 0) {
      complete(std::move(p), r);
      return;
    }

    {
      // Pick the next live generation still within range, if any.
      std::unique_lock l(bes->m);
      auto i = bes->upper_bound(gen_id);
      if (i == bes->end() || i->first > target_gen || i->first > head_gen) {
	l.unlock();
	complete(std::move(p), -ENODATA);
	return;
      }
      be = i->second;
    }
    auto c = be->gen_id == target_gen ? cursor : be->max_marker();
    be->trim(dpp, shard_id, c, call(std::move(p)));
  }
};
+
// Async counterpart of trim_entries(): chains per-generation trims
// through GenTrim and completes `c` when the target is reached.
void DataLogBackends::trim_entries(const DoutPrefixProvider *dpp, int shard_id, std::string_view marker,
				   librados::AioCompletion* c)
{
  auto [target_gen, cursor] = cursorgen(marker);
  std::unique_lock l(m);
  const auto head_gen = (end() - 1)->second->gen_id;
  const auto tail_gen = begin()->first;
  if (target_gen < tail_gen) {
    // Everything before the tail is already gone.
    l.unlock();
    rgw_complete_aio_completion(c, -ENODATA);
    return;
  }
  auto be = begin()->second;
  l.unlock();
  auto gt = std::make_unique<GenTrim>(dpp, this, shard_id, target_gen,
				      std::string(cursor), head_gen, tail_gen,
				      be, c);

  // Start with the oldest generation; GenTrim::handle() advances.
  auto cc = be->gen_id == target_gen ? cursor : be->max_marker();
  be->trim(dpp, shard_id, cc, GenTrim::call(std::move(gt)));
}
+
// Prune generations that are completely empty: find the highest run of
// consecutive empty generations below the head (the head is never
// pruned), advance the tail past them, then delete the now-dead ones.
// On success `through` reports the highest pruned generation, if any.
int DataLogBackends::trim_generations(const DoutPrefixProvider *dpp,
				      std::optional<uint64_t>& through,
				      optional_yield y) {
  if (size() != 1) {
    // Snapshot all generations except the head under the lock.
    std::vector<mapped_type> candidates;
    {
      std::scoped_lock l(m);
      auto e = cend() - 1;
      for (auto i = cbegin(); i < e; ++i) {
	candidates.push_back(i->second);
      }
    }

    std::optional<uint64_t> highest;
    for (auto& be : candidates) {
      auto r = be->is_empty(dpp, y);
      if (r < 0) {
	return r;
      } else if (r == 1) {
	highest = be->gen_id;
      } else {
	// Stop at the first non-empty generation; trimming beyond it
	// would lose entries.
	break;
      }
    }

    through = highest;
    if (!highest) {
      return 0;
    }
    auto ec = empty_to(dpp, *highest, y);
    if (ec) {
      return ceph::from_error_code(ec);
    }
  }

  return ceph::from_error_code(remove_empty(dpp, y));
}
+
+
// Async trim; the result is delivered through the completion `c`.
int RGWDataChangesLog::trim_entries(const DoutPrefixProvider *dpp, int shard_id, std::string_view marker,
				    librados::AioCompletion* c)
{
  assert(shard_id < num_shards);
  bes->trim_entries(dpp, shard_id, marker, c);
  return 0;
}
+
+bool RGWDataChangesLog::going_down() const
+{
+ return down_flag;
+}
+
+RGWDataChangesLog::~RGWDataChangesLog() {
+ down_flag = true;
+ if (renew_thread.joinable()) {
+ renew_stop();
+ renew_thread.join();
+ }
+}
+
+void RGWDataChangesLog::renew_run() noexcept {
+ static constexpr auto runs_per_prune = 150;
+ auto run = 0;
+ for (;;) {
+ const DoutPrefix dp(cct, dout_subsys, "rgw data changes log: ");
+ ldpp_dout(&dp, 2) << "RGWDataChangesLog::ChangesRenewThread: start" << dendl;
+ int r = renew_entries(&dp);
+ if (r < 0) {
+ ldpp_dout(&dp, 0) << "ERROR: RGWDataChangesLog::renew_entries returned error r=" << r << dendl;
+ }
+
+ if (going_down())
+ break;
+
+ if (run == runs_per_prune) {
+ std::optional<uint64_t> through;
+ ldpp_dout(&dp, 2) << "RGWDataChangesLog::ChangesRenewThread: pruning old generations" << dendl;
+ // This null_yield can stay, for now, as it's in its own thread.
+ trim_generations(&dp, through, null_yield);
+ if (r < 0) {
+ derr << "RGWDataChangesLog::ChangesRenewThread: failed pruning r="
+ << r << dendl;
+ } else if (through) {
+ ldpp_dout(&dp, 2) << "RGWDataChangesLog::ChangesRenewThread: pruned generations "
+ << "through " << *through << "." << dendl;
+ } else {
+ ldpp_dout(&dp, 2) << "RGWDataChangesLog::ChangesRenewThread: nothing to prune."
+ << dendl;
+ }
+ run = 0;
+ } else {
+ ++run;
+ }
+
+ int interval = cct->_conf->rgw_data_log_window * 3 / 4;
+ std::unique_lock locker{renew_lock};
+ renew_cond.wait_for(locker, std::chrono::seconds(interval));
+ }
+}
+
+void RGWDataChangesLog::renew_stop()
+{
+ std::lock_guard l{renew_lock};
+ renew_cond.notify_all();
+}
+
// Record that a bucket shard changed, for the data-notify mechanism.
// Uses a shared-lock existence check first so the common already-marked
// case avoids the exclusive lock.
void RGWDataChangesLog::mark_modified(int shard_id, const rgw_bucket_shard& bs, uint64_t gen)
{
  // A notify interval of 0 disables notifications entirely.
  if (!cct->_conf->rgw_data_notify_interval_msec) {
    return;
  }

  auto key = bs.get_key();
  {
    std::shared_lock rl{modified_lock}; // read lock to check for existence
    auto shard = modified_shards.find(shard_id);
    if (shard != modified_shards.end() && shard->second.count({key, gen})) {
      return;
    }
  }

  // Benign race: if another thread inserted between the two locks, the
  // set insert below is simply a no-op.
  std::unique_lock wl{modified_lock}; // write lock for insertion
  modified_shards[shard_id].insert(rgw_data_notify_entry{key, gen});
}
+
+std::string RGWDataChangesLog::max_marker() const {
+ return gencursor(std::numeric_limits<uint64_t>::max(),
+ "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~");
+}
+
// Switch the log to a new backing of the given type; delegates to
// logback_generations::new_backing and maps the error code to an int.
int RGWDataChangesLog::change_format(const DoutPrefixProvider *dpp, log_type type, optional_yield y) {
  return ceph::from_error_code(bes->new_backing(dpp, type, y));
}
+
// Prune empty generations; on success `through` holds the highest
// pruned generation, if any.
int RGWDataChangesLog::trim_generations(const DoutPrefixProvider *dpp,
					std::optional<uint64_t>& through,
					optional_yield y) {
  return bes->trim_generations(dpp, through, y);
}
+
+void RGWDataChangesLogInfo::dump(Formatter *f) const
+{
+ encode_json("marker", marker, f);
+ utime_t ut(last_update);
+ encode_json("last_update", ut, f);
+}
+
+void RGWDataChangesLogInfo::decode_json(JSONObj *obj)
+{
+ JSONDecoder::decode_json("marker", marker, obj);
+ utime_t ut;
+ JSONDecoder::decode_json("last_update", ut, obj);
+ last_update = ut.to_real_time();
+}
+
+
diff --git a/src/rgw/driver/rados/rgw_datalog.h b/src/rgw/driver/rados/rgw_datalog.h
new file mode 100644
index 000000000..174cf86de
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_datalog.h
@@ -0,0 +1,394 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <cstdint>
+#include <list>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <variant>
+#include <vector>
+
+#include <boost/container/flat_map.hpp>
+#include <boost/container/flat_set.hpp>
+#include <boost/smart_ptr/intrusive_ptr.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+
+#include <fmt/format.h>
+
+#include "common/async/yield_context.h"
+#include "include/buffer.h"
+#include "include/encoding.h"
+#include "include/function2.hpp"
+
+#include "include/rados/librados.hpp"
+
+#include "common/ceph_context.h"
+#include "common/ceph_json.h"
+#include "common/ceph_time.h"
+#include "common/Formatter.h"
+#include "common/lru_map.h"
+#include "common/RefCountedObj.h"
+
+#include "cls/log/cls_log_types.h"
+
+#include "rgw_basic_types.h"
+#include "rgw_log_backing.h"
+#include "rgw_sync_policy.h"
+#include "rgw_zone.h"
+#include "rgw_trim_bilog.h"
+
+namespace bc = boost::container;
+
// Kind of entity a datalog entry refers to; only buckets currently
// generate data-change entries.
enum DataLogEntityType {
  ENTITY_TYPE_UNKNOWN = 0,
  ENTITY_TYPE_BUCKET = 1,
};
+
// One data-change record: "this bucket shard (log generation `gen`)
// changed at `timestamp`". `key` is the bucket shard key.
struct rgw_data_change {
  DataLogEntityType entity_type;
  std::string key;
  ceph::real_time timestamp;
  uint64_t gen = 0;  // 0 for entries written before generations existed

  void encode(ceph::buffer::list& bl) const {
    // require decoders to recognize v2 when gen>0
    const uint8_t compat = (gen == 0) ? 1 : 2;
    ENCODE_START(2, compat, bl);
    auto t = std::uint8_t(entity_type);
    encode(t, bl);
    encode(key, bl);
    encode(timestamp, bl);
    encode(gen, bl);
    ENCODE_FINISH(bl);
  }

  void decode(bufferlist::const_iterator& bl) {
    DECODE_START(2, bl);
    std::uint8_t t;
    decode(t, bl);
    entity_type = DataLogEntityType(t);
    decode(key, bl);
    decode(timestamp, bl);
    if (struct_v < 2) {
      // v1 encodings predate generations.
      gen = 0;
    } else {
      decode(gen, bl);
    }
    DECODE_FINISH(bl);
  }

  void dump(ceph::Formatter* f) const;
  void decode_json(JSONObj* obj);
};
WRITE_CLASS_ENCODER(rgw_data_change)
+
// A datalog entry as returned by listing: the change itself plus the
// backend-assigned id and timestamp.
struct rgw_data_change_log_entry {
  std::string log_id;
  ceph::real_time log_timestamp;
  rgw_data_change entry;

  void encode(ceph::buffer::list& bl) const {
    ENCODE_START(1, 1, bl);
    encode(log_id, bl);
    encode(log_timestamp, bl);
    encode(entry, bl);
    ENCODE_FINISH(bl);
  }

  void decode(ceph::buffer::list::const_iterator& bl) {
    DECODE_START(1, bl);
    decode(log_id, bl);
    decode(log_timestamp, bl);
    decode(entry, bl);
    DECODE_FINISH(bl);
  }

  void dump(ceph::Formatter* f) const;
  void decode_json(JSONObj* obj);
};
WRITE_CLASS_ENCODER(rgw_data_change_log_entry)
+
// Per-shard log state: the highest marker and time of last update.
struct RGWDataChangesLogInfo {
  std::string marker;
  ceph::real_time last_update;

  void dump(ceph::Formatter* f) const;
  void decode_json(JSONObj* obj);
};
+
// Resume point for the all-shards list_entries(): the current shard and
// the in-shard marker to continue from.
struct RGWDataChangesLogMarker {
  int shard = 0;
  std::string marker;

  RGWDataChangesLogMarker() = default;
};
+
+class RGWDataChangesLog;
+
+struct rgw_data_notify_entry {
+ std::string key;
+ uint64_t gen = 0;
+
+ void dump(ceph::Formatter* f) const;
+ void decode_json(JSONObj* obj);
+
+ rgw_data_notify_entry& operator=(const rgw_data_notify_entry&) = default;
+
+ bool operator <(const rgw_data_notify_entry& d) const {
+ if (key < d.key) {
+ return true;
+ }
+ if (d.key < key) {
+ return false;
+ }
+ return gen < d.gen;
+ }
+ friend std::ostream& operator <<(std::ostream& m,
+ const rgw_data_notify_entry& e) {
+ return m << "[key: " << e.key << ", gen: " << e.gen << "]";
+ }
+};
+
+class RGWDataChangesBE;
+
// The per-generation log backends, keyed by generation id. Inherits the
// generation bookkeeping from logback_generations and privately a
// flat_map of generation -> backend.
class DataLogBackends final
  : public logback_generations,
    private bc::flat_map<uint64_t, boost::intrusive_ptr<RGWDataChangesBE>> {
  friend class logback_generations;
  friend class GenTrim;

  std::mutex m;  // guards the underlying generation map
  RGWDataChangesLog& datalog;

  DataLogBackends(librados::IoCtx& ioctx,
		  std::string oid,
		  fu2::unique_function<std::string(
		    uint64_t, int) const>&& get_oid,
		  int shards, RGWDataChangesLog& datalog) noexcept
    : logback_generations(ioctx, oid, std::move(get_oid),
			  shards), datalog(datalog) {}
public:

  // The newest (highest) generation; new entries are pushed here.
  boost::intrusive_ptr<RGWDataChangesBE> head() {
    std::unique_lock l(m);
    auto i = end();
    --i;
    return i->second;
  }
  int list(const DoutPrefixProvider *dpp, int shard, int max_entries,
	   std::vector<rgw_data_change_log_entry>& entries,
	   std::string_view marker, std::string* out_marker, bool* truncated,
	   optional_yield y);
  int trim_entries(const DoutPrefixProvider *dpp, int shard_id,
		   std::string_view marker, optional_yield y);
  void trim_entries(const DoutPrefixProvider *dpp, int shard_id, std::string_view marker,
		    librados::AioCompletion* c);
  // Install the generation-zero backend.
  void set_zero(RGWDataChangesBE* be) {
    emplace(0, be);
  }

  bs::error_code handle_init(entries_t e) noexcept override;
  bs::error_code handle_new_gens(entries_t e) noexcept override;
  bs::error_code handle_empty_to(uint64_t new_tail) noexcept override;

  int trim_generations(const DoutPrefixProvider *dpp,
		       std::optional<uint64_t>& through,
		       optional_yield y);
};
+
// A (bucket shard, log generation) pair; used as the key for tracking
// per-shard change status and renewal cycles.
struct BucketGen {
  rgw_bucket_shard shard;
  uint64_t gen;

  BucketGen(const rgw_bucket_shard& shard, uint64_t gen)
    : shard(shard), gen(gen) {}

  BucketGen(rgw_bucket_shard&& shard, uint64_t gen)
    : shard(std::move(shard)), gen(gen) {}

  // Remaining special members are the compiler defaults.
  BucketGen(const BucketGen&) = default;
  BucketGen(BucketGen&&) = default;
  BucketGen& operator =(const BucketGen&) = default;
  BucketGen& operator =(BucketGen&&) = default;

  ~BucketGen() = default;
};
+
+inline bool operator ==(const BucketGen& l, const BucketGen& r) {
+ return (l.shard == r.shard) && (l.gen == r.gen);
+}
+
+inline bool operator <(const BucketGen& l, const BucketGen& r) {
+ if (l.shard < r.shard) {
+ return true;
+ } else if (l.shard == r.shard) {
+ return l.gen < r.gen;
+ } else {
+ return false;
+ }
+}
+
+class RGWDataChangesLog {
+ friend DataLogBackends;
+ CephContext *cct;
+ librados::IoCtx ioctx;
+ rgw::BucketChangeObserver *observer = nullptr;
+ const RGWZone* zone;
+ std::unique_ptr<DataLogBackends> bes;
+
+ const int num_shards;
+ std::string get_prefix() {
+ auto prefix = cct->_conf->rgw_data_log_obj_prefix;
+ return prefix.empty() ? prefix : "data_log";
+ }
+ std::string metadata_log_oid() {
+ return get_prefix() + "generations_metadata";
+ }
+ std::string prefix;
+
+ ceph::mutex lock = ceph::make_mutex("RGWDataChangesLog::lock");
+ ceph::shared_mutex modified_lock =
+ ceph::make_shared_mutex("RGWDataChangesLog::modified_lock");
+ bc::flat_map<int, bc::flat_set<rgw_data_notify_entry>> modified_shards;
+
+ std::atomic<bool> down_flag = { false };
+
+ struct ChangeStatus {
+ std::shared_ptr<const rgw_sync_policy_info> sync_policy;
+ ceph::real_time cur_expiration;
+ ceph::real_time cur_sent;
+ bool pending = false;
+ RefCountedCond* cond = nullptr;
+ ceph::mutex lock = ceph::make_mutex("RGWDataChangesLog::ChangeStatus");
+ };
+
+ using ChangeStatusPtr = std::shared_ptr<ChangeStatus>;
+
+ lru_map<BucketGen, ChangeStatusPtr> changes;
+
+ bc::flat_set<BucketGen> cur_cycle;
+
+ ChangeStatusPtr _get_change(const rgw_bucket_shard& bs, uint64_t gen);
+ void register_renew(const rgw_bucket_shard& bs,
+ const rgw::bucket_log_layout_generation& gen);
+ void update_renewed(const rgw_bucket_shard& bs,
+ uint64_t gen,
+ ceph::real_time expiration);
+
+ ceph::mutex renew_lock = ceph::make_mutex("ChangesRenewThread::lock");
+ ceph::condition_variable renew_cond;
+ void renew_run() noexcept;
+ void renew_stop();
+ std::thread renew_thread;
+
+ std::function<bool(const rgw_bucket& bucket, optional_yield y, const DoutPrefixProvider *dpp)> bucket_filter;
+ bool going_down() const;
+ bool filter_bucket(const DoutPrefixProvider *dpp, const rgw_bucket& bucket, optional_yield y) const;
+ int renew_entries(const DoutPrefixProvider *dpp);
+
+public:
+
+ RGWDataChangesLog(CephContext* cct);
+ ~RGWDataChangesLog();
+
+ int start(const DoutPrefixProvider *dpp, const RGWZone* _zone, const RGWZoneParams& zoneparams,
+ librados::Rados* lr);
+ int choose_oid(const rgw_bucket_shard& bs);
+ int add_entry(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info,
+ const rgw::bucket_log_layout_generation& gen, int shard_id,
+ optional_yield y);
+ int get_log_shard_id(rgw_bucket& bucket, int shard_id);
+ int list_entries(const DoutPrefixProvider *dpp, int shard, int max_entries,
+ std::vector<rgw_data_change_log_entry>& entries,
+ std::string_view marker, std::string* out_marker,
+ bool* truncated, optional_yield y);
+ int trim_entries(const DoutPrefixProvider *dpp, int shard_id,
+ std::string_view marker, optional_yield y);
+ int trim_entries(const DoutPrefixProvider *dpp, int shard_id, std::string_view marker,
+ librados::AioCompletion* c); // :(
+ int get_info(const DoutPrefixProvider *dpp, int shard_id,
+ RGWDataChangesLogInfo *info, optional_yield y);
+
+ using LogMarker = RGWDataChangesLogMarker;
+
+ int list_entries(const DoutPrefixProvider *dpp, int max_entries,
+ std::vector<rgw_data_change_log_entry>& entries,
+ LogMarker& marker, bool* ptruncated,
+ optional_yield y);
+
+ void mark_modified(int shard_id, const rgw_bucket_shard& bs, uint64_t gen);
+ auto read_clear_modified() {
+ std::unique_lock wl{modified_lock};
+ decltype(modified_shards) modified;
+ modified.swap(modified_shards);
+ modified_shards.clear();
+ return modified;
+ }
+
+ void set_observer(rgw::BucketChangeObserver *observer) {
+ this->observer = observer;
+ }
+
+ void set_bucket_filter(decltype(bucket_filter)&& f) {
+ bucket_filter = std::move(f);
+ }
+ // a marker that compares greater than any other
+ std::string max_marker() const;
+ std::string get_oid(uint64_t gen_id, int shard_id) const;
+
+
+ int change_format(const DoutPrefixProvider *dpp, log_type type, optional_yield y);
+ int trim_generations(const DoutPrefixProvider *dpp,
+ std::optional<uint64_t>& through,
+ optional_yield y);
+};
+
// Abstract interface to the storage of one log generation. Ref-counted;
// instances are owned by DataLogBackends.
class RGWDataChangesBE : public boost::intrusive_ref_counter<RGWDataChangesBE> {
protected:
  librados::IoCtx& ioctx;
  CephContext* const cct;
  RGWDataChangesLog& datalog;

  // Name of this generation's object for the given shard.
  std::string get_oid(int shard_id) {
    return datalog.get_oid(gen_id, shard_id);
  }
public:
  // Backend-specific batch representation used by prepare()/push().
  using entries = std::variant<std::list<cls_log_entry>,
			       std::vector<ceph::buffer::list>>;

  const uint64_t gen_id;

  RGWDataChangesBE(librados::IoCtx& ioctx,
		   RGWDataChangesLog& datalog,
		   uint64_t gen_id)
    : ioctx(ioctx), cct(static_cast<CephContext*>(ioctx.cct())),
      datalog(datalog), gen_id(gen_id) {}
  virtual ~RGWDataChangesBE() = default;

  // Append one serialized entry to `out` for a later batched push().
  virtual void prepare(ceph::real_time now,
		       const std::string& key,
		       ceph::buffer::list&& entry,
		       entries& out) = 0;
  // Write a prepared batch to the given shard.
  virtual int push(const DoutPrefixProvider *dpp, int index, entries&& items,
		   optional_yield y) = 0;
  // Write a single entry to the given shard.
  virtual int push(const DoutPrefixProvider *dpp, int index, ceph::real_time now,
		   const std::string& key, ceph::buffer::list&& bl,
		   optional_yield y) = 0;
  virtual int list(const DoutPrefixProvider *dpp, int shard, int max_entries,
		   std::vector<rgw_data_change_log_entry>& entries,
		   std::optional<std::string_view> marker,
		   std::string* out_marker, bool* truncated,
		   optional_yield y) = 0;
  virtual int get_info(const DoutPrefixProvider *dpp, int index,
		       RGWDataChangesLogInfo *info, optional_yield y) = 0;
  virtual int trim(const DoutPrefixProvider *dpp, int index,
		   std::string_view marker, optional_yield y) = 0;
  virtual int trim(const DoutPrefixProvider *dpp, int index,
		   std::string_view marker, librados::AioCompletion* c) = 0;
  // Greatest in-generation marker this backend type can produce.
  virtual std::string_view max_marker() const = 0;
  // 1 on empty, 0 on non-empty, negative on error.
  virtual int is_empty(const DoutPrefixProvider *dpp, optional_yield y) = 0;
};
diff --git a/src/rgw/driver/rados/rgw_datalog_notify.cc b/src/rgw/driver/rados/rgw_datalog_notify.cc
new file mode 100644
index 000000000..12cdc532f
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_datalog_notify.cc
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_datalog_notify.h"
+#include "rgw_datalog.h"
+
+// custom encoding for v1 notify API
// Wraps a notify entry so overload resolution selects the v1
// (string-only) JSON encoding below.
struct EntryEncoderV1 {
  const rgw_data_notify_entry& entry;
};
// Same wrapper for a whole set of entries (encoded as set<string>).
struct SetEncoderV1 {
  const bc::flat_set<rgw_data_notify_entry>& entries;
};
+
// encode rgw_data_notify_entry as string (the v1 format has no gen)
void encode_json(const char *name, const EntryEncoderV1& e, Formatter *f)
{
  f->dump_string(name, e.entry.key); // encode the key only
}
+// encode set<rgw_data_notify_entry> as set<string>
+void encode_json(const char *name, const SetEncoderV1& e, Formatter *f)
+{
+ f->open_array_section(name);
+ for (auto& entry : e.entries) {
+ encode_json("obj", EntryEncoderV1{entry}, f);
+ }
+ f->close_section();
+}
+// encode map<int, set<rgw_data_notify_entry>> as map<int, set<string>>
+void encode_json(const char *name, const rgw_data_notify_v1_encoder& e, Formatter *f)
+{
+ f->open_array_section(name);
+ for (auto& [key, val] : e.shards) {
+ f->open_object_section("entry");
+ encode_json("key", key, f);
+ encode_json("val", SetEncoderV1{val}, f);
+ f->close_section();
+ }
+ f->close_section();
+}
+
// Decoding counterparts of the v1 encoder wrappers: targets for
// filling from v1 (string-only) JSON payloads.
struct EntryDecoderV1 {
  rgw_data_notify_entry& entry;
};
struct SetDecoderV1 {
  bc::flat_set<rgw_data_notify_entry>& entries;
};
+
+// decode string into rgw_data_notify_entry
+void decode_json_obj(EntryDecoderV1& d, JSONObj *obj)
+{
+ decode_json_obj(d.entry.key, obj);
+ d.entry.gen = 0;
+}
+// decode set<string> into set<rgw_data_notify_entry>
+void decode_json_obj(SetDecoderV1& d, JSONObj *obj)
+{
+ for (JSONObjIter o = obj->find_first(); !o.end(); ++o) {
+ rgw_data_notify_entry val;
+ auto decoder = EntryDecoderV1{val};
+ decode_json_obj(decoder, *o);
+ d.entries.insert(std::move(val));
+ }
+}
+// decode map<int, set<string>> into map<int, set<rgw_data_notify_entry>>
+void decode_json_obj(rgw_data_notify_v1_decoder& d, JSONObj *obj)
+{
+ for (JSONObjIter o = obj->find_first(); !o.end(); ++o) {
+ int shard_id = 0;
+ JSONDecoder::decode_json("key", shard_id, *o);
+ bc::flat_set<rgw_data_notify_entry> val;
+ SetDecoderV1 decoder{val};
+ JSONDecoder::decode_json("val", decoder, *o);
+ d.shards[shard_id] = std::move(val);
+ }
+}
diff --git a/src/rgw/driver/rados/rgw_datalog_notify.h b/src/rgw/driver/rados/rgw_datalog_notify.h
new file mode 100644
index 000000000..4cd1b3c11
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_datalog_notify.h
@@ -0,0 +1,31 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <boost/container/flat_map.hpp>
+#include <boost/container/flat_set.hpp>
+
+#include "rgw_datalog.h"
+
+namespace bc = boost::container;
+
+namespace ceph { class Formatter; }
+class JSONObj;
+
+class RGWCoroutine;
+class RGWHTTPManager;
+class RGWRESTConn;
+
+struct rgw_data_notify_entry;
+
// json encoder and decoder for notify v1 API
// Encodes the shard map using the v1 wire format: entries are plain
// strings (the generation is dropped; see rgw_datalog_notify.cc).
struct rgw_data_notify_v1_encoder {
  const bc::flat_map<int, bc::flat_set<rgw_data_notify_entry>>& shards;
};
void encode_json(const char *name, const rgw_data_notify_v1_encoder& e,
		 ceph::Formatter *f);
// Decodes the v1 wire format into the shard map; decoded entries get
// generation 0.
struct rgw_data_notify_v1_decoder {
  bc::flat_map<int, bc::flat_set<rgw_data_notify_entry>>& shards;
};
void decode_json_obj(rgw_data_notify_v1_decoder& d, JSONObj *obj);
diff --git a/src/rgw/driver/rados/rgw_etag_verifier.cc b/src/rgw/driver/rados/rgw_etag_verifier.cc
new file mode 100644
index 000000000..52f7c7948
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_etag_verifier.cc
@@ -0,0 +1,191 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_etag_verifier.h"
+#include "rgw_obj_manifest.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+namespace rgw::putobj {
+
// Build the appropriate ETag verifier for an object from its decoded manifest:
// an atomic (single-part) object gets ETagVerifier_Atomic, a multipart object
// gets ETagVerifier_MPU primed with the logical offset of each part.
//
// Returns 0 on success (verifier is emplaced), or -EIO when the manifest
// cannot be decoded, has no rule 0, or a compressed part offset cannot be
// mapped back to an uncompressed offset (etag verification is then skipped).
int create_etag_verifier(const DoutPrefixProvider *dpp,
                         CephContext* cct, rgw::sal::DataProcessor* filter,
                         const bufferlist& manifest_bl,
                         const std::optional<RGWCompressionInfo>& compression,
                         etag_verifier_ptr& verifier)
{
  RGWObjManifest manifest;

  try {
    auto miter = manifest_bl.cbegin();
    decode(manifest, miter);
  } catch (buffer::error& err) {
    ldpp_dout(dpp, 0) << "ERROR: couldn't decode manifest" << dendl;
    return -EIO;
  }

  RGWObjManifestRule rule;
  bool found = manifest.get_rule(0, &rule);
  if (!found) {
    ldpp_dout(dpp, -1) << "ERROR: manifest->get_rule() could not find rule" << dendl;
    return -EIO;
  }

  if (rule.start_part_num == 0) {
    /* Atomic object */
    verifier.emplace<ETagVerifier_Atomic>(cct, filter);
    return 0;
  }

  uint64_t cur_part_ofs = UINT64_MAX;
  std::vector<uint64_t> part_ofs;

  /*
   * We must store the offset of each part to calculate the ETAGs for each
   * MPU part. These part ETags then become the input for the MPU object
   * Etag.
   */
  for (auto mi = manifest.obj_begin(dpp); mi != manifest.obj_end(dpp); ++mi) {
    // only record a part offset the first time it is seen; subsequent
    // manifest entries with the same part offset belong to the same part
    if (cur_part_ofs == mi.get_part_ofs())
      continue;
    cur_part_ofs = mi.get_part_ofs();
    ldpp_dout(dpp, 20) << "MPU Part offset:" << cur_part_ofs << dendl;
    part_ofs.push_back(cur_part_ofs);
  }

  if (compression) {
    // if the source object was compressed, the manifest is storing
    // compressed part offsets. transform the compressed offsets back to
    // their original offsets by finding the first block of each part
    const auto& blocks = compression->blocks;
    auto block = blocks.begin();
    for (auto& ofs : part_ofs) {
      // find the compression_block with new_ofs == ofs
      constexpr auto less = [] (const compression_block& block, uint64_t ofs) {
        return block.new_ofs < ofs;
      };
      // part_ofs is ascending, so resume the search from the previous match
      block = std::lower_bound(block, blocks.end(), ofs, less);
      if (block == blocks.end() || block->new_ofs != ofs) {
        ldpp_dout(dpp, 4) << "no match for compressed offset " << ofs
                          << ", disabling etag verification" << dendl;
        return -EIO;
      }
      ofs = block->old_ofs;
      ldpp_dout(dpp, 20) << "MPU Part uncompressed offset:" << ofs << dendl;
    }
  }

  verifier.emplace<ETagVerifier_MPU>(cct, std::move(part_ofs), filter);
  return 0;
}
+
+int ETagVerifier_Atomic::process(bufferlist&& in, uint64_t logical_offset)
+{
+ bufferlist out;
+ if (in.length() > 0)
+ hash.Update((const unsigned char *)in.c_str(), in.length());
+
+ return Pipe::process(std::move(in), logical_offset);
+}
+
// Finalize the MD5 over all processed bytes and cache its hex form in
// calculated_etag. Idempotent: a second call returns the cached value
// without touching the digest again (hash.Final() is presumably single-use
// — TODO confirm against the MD5 wrapper).
void ETagVerifier_Atomic::calculate_etag()
{
  unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE];
  char calc_md5[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];

  /* Return early if ETag has already been calculated */
  if (!calculated_etag.empty())
    return;

  hash.Final(m);
  buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, calc_md5);
  calculated_etag = calc_md5;
  ldout(cct, 20) << "Single part object: " << " etag:" << calculated_etag
                 << dendl;
}
+
// Close out the current MPU part: finalize its MD5, feed the raw digest into
// the whole-object MPU hash (the MPU ETag is the MD5 of the concatenated
// part digests), reset the per-part hash, and advance the part indices.
void ETagVerifier_MPU::process_end_of_MPU_part()
{
  unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE];
  char calc_md5_part[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
  std::string calculated_etag_part;

  hash.Final(m);
  mpu_etag_hash.Update((const unsigned char *)m, sizeof(m));
  hash.Restart();

  // only hex-format the part digest if the debug level would actually log it
  if (cct->_conf->subsys.should_gather(dout_subsys, 20)) {
    buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, calc_md5_part);
    calculated_etag_part = calc_md5_part;
    ldout(cct, 20) << "Part etag: " << calculated_etag_part << dendl;
  }

  cur_part_index++;
  next_part_index++;
}
+
// Hash incoming data into the current part's MD5, splitting a buffer that
// straddles a part boundary so each part's digest covers exactly its bytes.
int ETagVerifier_MPU::process(bufferlist&& in, uint64_t logical_offset)
{
  // exclusive end offset of this buffer within the object
  uint64_t bl_end = in.length() + logical_offset;

  /* Handle the last MPU part */
  if (size_t(next_part_index) == part_ofs.size()) {
    hash.Update((const unsigned char *)in.c_str(), in.length());
    goto done;
  }

  /* Incoming bufferlist spans two MPU parts. Calculate separate ETags */
  if (bl_end > part_ofs[next_part_index]) {

    uint64_t part_one_len = part_ofs[next_part_index] - logical_offset;
    hash.Update((const unsigned char *)in.c_str(), part_one_len);
    process_end_of_MPU_part();

    // after process_end_of_MPU_part(), cur_part_index names the new part
    hash.Update((const unsigned char *)in.c_str() + part_one_len,
                bl_end - part_ofs[cur_part_index]);
    /*
     * If we've moved to the last part of the MPU, avoid usage of
     * part_ofs[next_part_index] as it will lead to out-of-range access.
     */
    if (size_t(next_part_index) == part_ofs.size())
      goto done;
  } else {
    hash.Update((const unsigned char *)in.c_str(), in.length());
  }

  /* Update the MPU Etag if the current part has ended */
  // NOTE(review): this fires when the buffer ends one byte BEFORE
  // part_ofs[next_part_index] (bl_end + 1 == boundary). A buffer ending
  // exactly on the boundary is instead closed by the spanning branch above
  // on the next call (with a zero-length first chunk) — confirm the +1 is
  // intended and not an off-by-one.
  if (logical_offset + in.length() + 1 == part_ofs[next_part_index])
    process_end_of_MPU_part();

done:
  return Pipe::process(std::move(in), logical_offset);
}
+
// Finalize the MPU ETag: close the last part's digest into mpu_etag_hash,
// finalize that hash, and format it as "<hex-md5>-<part-count>" (the same
// form S3 uses for multipart uploads). Idempotent via the cached result.
void ETagVerifier_MPU::calculate_etag()
{
  const uint32_t parts = part_ofs.size();
  constexpr auto digits10 = std::numeric_limits<uint32_t>::digits10;
  constexpr auto extra = 2 + digits10; // add "-%u\0" at the end

  unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE], mpu_m[CEPH_CRYPTO_MD5_DIGESTSIZE];
  char final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + extra];

  /* Return early if ETag has already been calculated */
  if (!calculated_etag.empty())
    return;

  // fold the final (still-open) part's digest into the MPU hash
  hash.Final(m);
  mpu_etag_hash.Update((const unsigned char *)m, sizeof(m));

  /* Refer RGWCompleteMultipart::execute() for ETag calculation for MPU object */
  mpu_etag_hash.Final(mpu_m);
  buf_to_hex(mpu_m, CEPH_CRYPTO_MD5_DIGESTSIZE, final_etag_str);
  snprintf(&final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2],
           sizeof(final_etag_str) - CEPH_CRYPTO_MD5_DIGESTSIZE * 2,
           "-%u", parts);

  calculated_etag = final_etag_str;
  ldout(cct, 20) << "MPU calculated ETag:" << calculated_etag << dendl;
}
+
+} // namespace rgw::putobj
diff --git a/src/rgw/driver/rados/rgw_etag_verifier.h b/src/rgw/driver/rados/rgw_etag_verifier.h
new file mode 100644
index 000000000..18a4f5a3f
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_etag_verifier.h
@@ -0,0 +1,90 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * RGW Etag Verifier is an RGW filter which enables the objects copied using
+ * multisite sync to be verified using their ETag from source i.e. the MD5
+ * checksum of the object is computed at the destination and is verified to be
+ * identical to the ETag stored in the object HEAD at source cluster.
+ *
+ * For MPU objects, a different filter named RGWMultipartEtagFilter is applied
+ * which re-computes ETag using RGWObjManifest. This computes the ETag using the
+ * same algorithm used at the source cluster i.e. MD5 sum of the individual ETag
+ * on the MPU parts.
+ */
+
+#pragma once
+
+#include "rgw_putobj.h"
+#include "rgw_op.h"
+#include "common/static_ptr.h"
+
+namespace rgw::putobj {
+
// Base class for ETag-verifying data filters. Sits in the put-object filter
// chain, hashing bytes as they stream through; after the upload completes,
// calculate_etag() produces the ETag to compare against the source's.
class ETagVerifier : public rgw::putobj::Pipe
{
protected:
  CephContext* cct;
  MD5 hash;                    // running MD5 of the streamed payload
  std::string calculated_etag; // cached result; empty until calculate_etag()

public:
  ETagVerifier(CephContext* cct_, rgw::sal::DataProcessor *next)
    : Pipe(next), cct(cct_) {
    // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
    hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
  }

  // finalize the hash(es) and populate calculated_etag
  virtual void calculate_etag() = 0;
  std::string get_calculated_etag() { return calculated_etag;}

}; /* ETagVerifier */
+
// Verifier for atomic (non-multipart) objects: the ETag is simply the MD5
// of the whole payload.
class ETagVerifier_Atomic : public ETagVerifier
{
public:
  ETagVerifier_Atomic(CephContext* cct_, rgw::sal::DataProcessor *next)
    : ETagVerifier(cct_, next) {}

  int process(bufferlist&& data, uint64_t logical_offset) override;
  void calculate_etag() override;

}; /* ETagVerifier_Atomic */
+
+class ETagVerifier_MPU : public ETagVerifier
+{
+ std::vector<uint64_t> part_ofs;
+ uint64_t cur_part_index{0}, next_part_index{1};
+ MD5 mpu_etag_hash;
+
+ void process_end_of_MPU_part();
+
+public:
+ ETagVerifier_MPU(CephContext* cct,
+ std::vector<uint64_t> part_ofs,
+ rgw::sal::DataProcessor *next)
+ : ETagVerifier(cct, next),
+ part_ofs(std::move(part_ofs))
+ {
+ // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
+ hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+ }
+
+ int process(bufferlist&& data, uint64_t logical_offset) override;
+ void calculate_etag() override;
+
+}; /* ETagVerifier_MPU */
+
// static_ptr storage must be large enough for either concrete verifier, so
// callers can hold one without a heap allocation.
constexpr auto max_etag_verifier_size = std::max(
    sizeof(ETagVerifier_Atomic),
    sizeof(ETagVerifier_MPU)
  );
using etag_verifier_ptr = ceph::static_ptr<ETagVerifier, max_etag_verifier_size>;

// Decode `manifest_bl` and emplace the matching verifier into `verifier`.
// Returns 0 on success or -EIO when the manifest can't be decoded/mapped.
int create_etag_verifier(const DoutPrefixProvider *dpp,
                         CephContext* cct, rgw::sal::DataProcessor* next,
                         const bufferlist& manifest_bl,
                         const std::optional<RGWCompressionInfo>& compression,
                         etag_verifier_ptr& verifier);
+
+} // namespace rgw::putobj
diff --git a/src/rgw/driver/rados/rgw_gc.cc b/src/rgw/driver/rados/rgw_gc.cc
new file mode 100644
index 000000000..bd16bde1b
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_gc.cc
@@ -0,0 +1,811 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_gc.h"
+
+#include "rgw_tools.h"
+#include "include/scope_guard.h"
+#include "include/rados/librados.hpp"
+#include "cls/rgw/cls_rgw_client.h"
+#include "cls/rgw_gc/cls_rgw_gc_client.h"
+#include "cls/refcount/cls_refcount_client.h"
+#include "cls/version/cls_version_client.h"
+#include "rgw_perf_counters.h"
+#include "cls/lock/cls_lock_client.h"
+#include "include/random.h"
+#include "rgw_gc_log.h"
+
+#include <list> // XXX
+#include <sstream>
+#include "xxhash.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
using namespace std;
using namespace librados;

// per-shard gc omap/queue objects are named "gc.0", "gc.1", ...; the lock
// below is taken exclusively while a shard is being processed
static string gc_oid_prefix = "gc";
static string gc_index_lock_name = "gc_process";
+
+void RGWGC::initialize(CephContext *_cct, RGWRados *_store) {
+ cct = _cct;
+ store = _store;
+
+ max_objs = min(static_cast<int>(cct->_conf->rgw_gc_max_objs), rgw_shards_max());
+
+ obj_names = new string[max_objs];
+
+ for (int i = 0; i < max_objs; i++) {
+ obj_names[i] = gc_oid_prefix;
+ char buf[32];
+ snprintf(buf, 32, ".%d", i);
+ obj_names[i].append(buf);
+
+ auto it = transitioned_objects_cache.begin() + i;
+ transitioned_objects_cache.insert(it, false);
+
+ //version = 0 -> not ready for transition
+ //version = 1 -> marked ready for transition
+ librados::ObjectWriteOperation op;
+ op.create(false);
+ const uint64_t queue_size = cct->_conf->rgw_gc_max_queue_size, num_deferred_entries = cct->_conf->rgw_gc_max_deferred;
+ gc_log_init2(op, queue_size, num_deferred_entries);
+ store->gc_operate(this, obj_names[i], &op);
+ }
+}
+
// Release the shard-name array allocated by initialize().
void RGWGC::finalize()
{
  delete[] obj_names;
}
+
// Map a chain tag to a gc shard via a seeded xxHash, so a given tag always
// lands on the same shard object.
int RGWGC::tag_index(const string& tag)
{
  return rgw_shards_mod(XXH64(tag.c_str(), tag.size(), seed), max_objs);
}
+
+std::tuple<int, std::optional<cls_rgw_obj_chain>> RGWGC::send_split_chain(const cls_rgw_obj_chain& chain, const std::string& tag)
+{
+ ldpp_dout(this, 20) << "RGWGC::send_split_chain - tag is: " << tag << dendl;
+
+ if (cct->_conf->rgw_max_chunk_size) {
+ cls_rgw_obj_chain broken_chain;
+ ldpp_dout(this, 20) << "RGWGC::send_split_chain - rgw_max_chunk_size is: " << cct->_conf->rgw_max_chunk_size << dendl;
+
+ for (auto it = chain.objs.begin(); it != chain.objs.end(); it++) {
+ ldpp_dout(this, 20) << "RGWGC::send_split_chain - adding obj with name: " << it->key << dendl;
+ broken_chain.objs.emplace_back(*it);
+ cls_rgw_gc_obj_info info;
+ info.tag = tag;
+ info.chain = broken_chain;
+ cls_rgw_gc_set_entry_op op;
+ op.info = info;
+ size_t total_encoded_size = op.estimate_encoded_size();
+ ldpp_dout(this, 20) << "RGWGC::send_split_chain - total_encoded_size is: " << total_encoded_size << dendl;
+
+ if (total_encoded_size > cct->_conf->rgw_max_chunk_size) { //dont add to chain, and send to gc
+ broken_chain.objs.pop_back();
+ --it;
+ ldpp_dout(this, 20) << "RGWGC::send_split_chain - more than, dont add to broken chain and send chain" << dendl;
+ auto ret = send_chain(broken_chain, tag);
+ if (ret < 0) {
+ broken_chain.objs.insert(broken_chain.objs.end(), it, chain.objs.end()); // add all the remainder objs to the list to be deleted inline
+ ldpp_dout(this, 0) << "RGWGC::send_split_chain - send chain returned error: " << ret << dendl;
+ return {ret, {broken_chain}};
+ }
+ broken_chain.objs.clear();
+ }
+ }
+ if (!broken_chain.objs.empty()) { //when the chain is smaller than or equal to rgw_max_chunk_size
+ ldpp_dout(this, 20) << "RGWGC::send_split_chain - sending leftover objects" << dendl;
+ auto ret = send_chain(broken_chain, tag);
+ if (ret < 0) {
+ ldpp_dout(this, 0) << "RGWGC::send_split_chain - send chain returned error: " << ret << dendl;
+ return {ret, {broken_chain}};
+ }
+ }
+ } else {
+ auto ret = send_chain(chain, tag);
+ if (ret < 0) {
+ ldpp_dout(this, 0) << "RGWGC::send_split_chain - send chain returned error: " << ret << dendl;
+ return {ret, {std::move(chain)}};
+ }
+ }
+ return {0, {}};
+}
+
// Enqueue one delete chain on the shard chosen by the tag. First attempts
// the queue-format enqueue (gc_log_enqueue2); -ECANCELED/-EPERM indicate the
// shard hasn't transitioned to the queue format — TODO confirm — so fall
// back to writing a legacy omap entry.
int RGWGC::send_chain(const cls_rgw_obj_chain& chain, const string& tag)
{
  ObjectWriteOperation op;
  cls_rgw_gc_obj_info info;
  info.chain = chain;
  info.tag = tag;
  gc_log_enqueue2(op, cct->_conf->rgw_gc_obj_min_wait, info);

  int i = tag_index(tag);

  ldpp_dout(this, 20) << "RGWGC::send_chain - on object name: " << obj_names[i] << "tag is: " << tag << dendl;

  auto ret = store->gc_operate(this, obj_names[i], &op);
  if (ret != -ECANCELED && ret != -EPERM) {
    return ret;
  }
  // legacy path: record the entry in the shard's omap instead
  ObjectWriteOperation set_entry_op;
  cls_rgw_gc_set_entry(set_entry_op, cct->_conf->rgw_gc_obj_min_wait, info);
  return store->gc_operate(this, obj_names[i], &set_entry_op);
}
+
// Heap-allocated context passed to the async defer completion; owns the
// AioCompletion so it is released exactly once, whichever path destroys us.
struct defer_chain_state {
  librados::AioCompletion* completion = nullptr;
  // TODO: hold a reference on the state in RGWGC to avoid use-after-free if
  // RGWGC destructs before this completion fires
  RGWGC* gc = nullptr;
  cls_rgw_gc_obj_info info;

  ~defer_chain_state() {
    if (completion) {
      completion->release();
    }
  }
};
+
// Completion callback for async_defer_chain(). Takes back ownership of the
// state (freed on return); -ECANCELED signals the shard transitioned to the
// queue format, so retry the defer through the queue path.
static void async_defer_callback(librados::completion_t, void* arg)
{
  std::unique_ptr<defer_chain_state> state{static_cast<defer_chain_state*>(arg)};
  if (state->completion->get_return_value() == -ECANCELED) {
    state->gc->on_defer_canceled(state->info);
  }
}
+
// Retry a defer after the omap-path attempt was canceled: remember that this
// shard has transitioned, re-issue the defer through the cls_rgw_gc queue,
// and drop any leftover omap entry for the tag. Fire-and-forget (the
// completion is released immediately; no result is checked).
void RGWGC::on_defer_canceled(const cls_rgw_gc_obj_info& info)
{
  const std::string& tag = info.tag;
  const int i = tag_index(tag);

  // ECANCELED from cls_version_check() tells us that we've transitioned
  transitioned_objects_cache[i] = true;

  ObjectWriteOperation op;
  cls_rgw_gc_queue_defer_entry(op, cct->_conf->rgw_gc_obj_min_wait, info);
  cls_rgw_gc_remove(op, {tag});

  auto c = librados::Rados::aio_create_completion(nullptr, nullptr);
  store->gc_aio_operate(obj_names[i], c, &op);
  c->release();
}
+
// Asynchronously push back (defer) the gc time of a chain whose object is
// still being read. Uses the queue format if the shard is known to have
// transitioned; otherwise writes via the legacy omap path with a callback
// that detects the transition (ECANCELED) and retries through the queue.
int RGWGC::async_defer_chain(const string& tag, const cls_rgw_obj_chain& chain)
{
  const int i = tag_index(tag);
  cls_rgw_gc_obj_info info;
  info.chain = chain;
  info.tag = tag;

  // if we've transitioned this shard object, we can rely on the cls_rgw_gc queue
  if (transitioned_objects_cache[i]) {
    ObjectWriteOperation op;
    cls_rgw_gc_queue_defer_entry(op, cct->_conf->rgw_gc_obj_min_wait, info);

    // this tag may still be present in omap, so remove it once the cls_rgw_gc
    // enqueue succeeds
    cls_rgw_gc_remove(op, {tag});

    auto c = librados::Rados::aio_create_completion(nullptr, nullptr);
    int ret = store->gc_aio_operate(obj_names[i], c, &op);
    c->release();
    return ret;
  }

  // if we haven't seen the transition yet, write the defer to omap with cls_rgw
  ObjectWriteOperation op;

  // assert that we haven't initialized cls_rgw_gc queue. this prevents us
  // from writing new entries to omap after the transition
  gc_log_defer1(op, cct->_conf->rgw_gc_obj_min_wait, info);

  // prepare a callback to detect the transition via ECANCELED from cls_version_check()
  auto state = std::make_unique<defer_chain_state>();
  state->gc = this;
  state->info.chain = chain;
  state->info.tag = tag;
  state->completion = librados::Rados::aio_create_completion(
      state.get(), async_defer_callback);

  int ret = store->gc_aio_operate(obj_names[i], state->completion, &op);
  if (ret == 0) {
    state.release(); // release ownership until async_defer_callback()
  }
  return ret;
}
+
+int RGWGC::remove(int index, const std::vector<string>& tags, AioCompletion **pc)
+{
+ ObjectWriteOperation op;
+ cls_rgw_gc_remove(op, tags);
+
+ auto c = librados::Rados::aio_create_completion(nullptr, nullptr);
+ int ret = store->gc_aio_operate(obj_names[index], c, &op);
+ if (ret < 0) {
+ c->release();
+ } else {
+ *pc = c;
+ }
+ return ret;
+}
+
// Synchronously trim the first num_entries entries from a shard's
// queue-format gc log.
int RGWGC::remove(int index, int num_entries)
{
  ObjectWriteOperation op;
  cls_rgw_gc_queue_remove_entries(op, num_entries);

  return store->gc_operate(this, obj_names[index], &op);
}
+
// List up to `max` gc entries starting at shard *index / `marker`, merging
// the legacy omap listing with the queue-format listing per shard, and
// updating transitioned_objects_cache as shards are discovered to have
// transitioned. On return, *index/marker/processing_queue carry the resume
// state for the next call; *truncated reports whether more entries remain.
int RGWGC::list(int *index, string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated, bool& processing_queue)
{
  result.clear();
  string next_marker;
  bool check_queue = false;

  for (; *index < max_objs && result.size() < max; (*index)++, marker.clear(), check_queue = false) {
    std::list<cls_rgw_gc_obj_info> entries, queue_entries;
    int ret = 0;

    //processing_queue is set to true from previous iteration if the queue was under process and probably has more elements in it.
    if (! transitioned_objects_cache[*index] && ! check_queue && ! processing_queue) {
      // legacy omap listing for this shard
      ret = cls_rgw_gc_list(store->gc_pool_ctx, obj_names[*index], marker, max - result.size(), expired_only, entries, truncated, next_marker);
      if (ret != -ENOENT && ret < 0) {
        return ret;
      }
      // objv.ver > 0 means the shard was marked ready for the queue format
      obj_version objv;
      cls_version_read(store->gc_pool_ctx, obj_names[*index], &objv);
      if (ret == -ENOENT || entries.size() == 0) {
        if (objv.ver == 0) {
          continue;
        } else {
          if (! expired_only) {
            transitioned_objects_cache[*index] = true;
            marker.clear();
          } else {
            // only declare the transition once no non-expired omap entries
            // remain either
            std::list<cls_rgw_gc_obj_info> non_expired_entries;
            ret = cls_rgw_gc_list(store->gc_pool_ctx, obj_names[*index], marker, 1, false, non_expired_entries, truncated, next_marker);
            if (non_expired_entries.size() == 0) {
              transitioned_objects_cache[*index] = true;
              marker.clear();
            }
          }
        }
      }
      if ((objv.ver == 1) && (entries.size() < max - result.size())) {
        check_queue = true;
        marker.clear();
      }
    }
    if (transitioned_objects_cache[*index] || check_queue || processing_queue) {
      processing_queue = false;
      ret = cls_rgw_gc_queue_list_entries(store->gc_pool_ctx, obj_names[*index], marker, (max - result.size()) - entries.size(), expired_only, queue_entries, truncated, next_marker);
      if (ret < 0) {
        return ret;
      }
    }
    if (entries.size() == 0 && queue_entries.size() == 0)
      continue;

    std::list<cls_rgw_gc_obj_info>::iterator iter;
    for (iter = entries.begin(); iter != entries.end(); ++iter) {
      result.push_back(*iter);
    }

    for (iter = queue_entries.begin(); iter != queue_entries.end(); ++iter) {
      result.push_back(*iter);
    }

    marker = next_marker;

    if (*index == max_objs - 1) {
      if (queue_entries.size() > 0 && *truncated) {
        processing_queue = true;
      } else {
        processing_queue = false;
      }
      /* we cut short here, truncated will hold the correct value */
      return 0;
    }

    if (result.size() == max) {
      if (queue_entries.size() > 0 && *truncated) {
        processing_queue = true;
      } else {
        processing_queue = false;
        *index += 1; //move to next gc object
      }

      /* close approximation, it might be that the next of the objects don't hold
       * anything, in this case truncated should have been false, but we can find
       * that out on the next iteration
       */
      *truncated = true;
      return 0;
    }
  }
  *truncated = false;
  processing_queue = false;

  return 0;
}
+
// Manages the bounded pool of in-flight async rados deletes issued by
// RGWGC::process(), and batches up tag removals so a tag is only retired
// once every shadow object it covers has been deleted.
class RGWGCIOManager {
  const DoutPrefixProvider* dpp;
  CephContext *cct;
  RGWGC *gc;

  // one pending async op: either a tail-object delete (TailIO) or a batched
  // tag removal on a gc shard (IndexIO)
  struct IO {
    enum Type {
      UnknownIO = 0,
      TailIO = 1,
      IndexIO = 2,
    } type{UnknownIO};
    librados::AioCompletion *c{nullptr};
    string oid;
    int index{-1};
    string tag;
  };

  deque<IO> ios;                          // in-flight ops, oldest first
  vector<std::vector<string> > remove_tags; // per-shard tags ready to retire
  /* tracks the number of remaining shadow objects for a given tag in order to
   * only remove the tag once all shadow objects have themselves been removed
   */
  vector<map<string, size_t> > tag_io_size;

#define MAX_AIO_DEFAULT 10
  size_t max_aio{MAX_AIO_DEFAULT};

public:
  RGWGCIOManager(const DoutPrefixProvider* _dpp, CephContext *_cct, RGWGC *_gc) : dpp(_dpp),
                                                                                  cct(_cct),
                                                                                  gc(_gc) {
    max_aio = cct->_conf->rgw_gc_max_concurrent_io;
    remove_tags.resize(min(static_cast<int>(cct->_conf->rgw_gc_max_objs), rgw_shards_max()));
    tag_io_size.resize(min(static_cast<int>(cct->_conf->rgw_gc_max_objs), rgw_shards_max()));
  }

  ~RGWGCIOManager() {
    // release any completions still pending; their results are discarded
    for (auto io : ios) {
      io.c->release();
    }
  }

  // Issue an async delete of `oid`, first draining completions until we are
  // back under max_aio. Completion errors only propagate when the shard uses
  // the queue format (where they would block entry trimming).
  int schedule_io(IoCtx *ioctx, const string& oid, ObjectWriteOperation *op,
                  int index, const string& tag) {
    while (ios.size() > max_aio) {
      if (gc->going_down()) {
        return 0;
      }
      auto ret = handle_next_completion();
      //Return error if we are using queue, else ignore it
      if (gc->transitioned_objects_cache[index] && ret < 0) {
        return ret;
      }
    }

    auto c = librados::Rados::aio_create_completion(nullptr, nullptr);
    int ret = ioctx->aio_operate(oid, c, op);
    if (ret < 0) {
      return ret;
    }
    ios.push_back(IO{IO::TailIO, c, oid, index, tag});

    return 0;
  }

  // Block on the oldest in-flight op, then (for omap-format shards) account
  // the finished shadow-object delete toward retiring its tag.
  int handle_next_completion() {
    ceph_assert(!ios.empty());
    IO& io = ios.front();
    io.c->wait_for_complete();
    int ret = io.c->get_return_value();
    io.c->release();

    // a missing object is as good as deleted
    if (ret == -ENOENT) {
      ret = 0;
    }

    if (io.type == IO::IndexIO && ! gc->transitioned_objects_cache[io.index]) {
      if (ret < 0) {
        ldpp_dout(dpp, 0) << "WARNING: gc cleanup of tags on gc shard index=" <<
          io.index << " returned error, ret=" << ret << dendl;
      }
      goto done;
    }

    if (ret < 0) {
      ldpp_dout(dpp, 0) << "WARNING: gc could not remove oid=" << io.oid <<
        ", ret=" << ret << dendl;
      goto done;
    }

    if (! gc->transitioned_objects_cache[io.index]) {
      schedule_tag_removal(io.index, io.tag);
    }

  done:
    ios.pop_front();
    return ret;
  }

  /* This is a request to schedule a tag removal. It will be called once when
   * there are no shadow objects. But it will also be called for every shadow
   * object when there are any. Since we do not want the tag to be removed
   * until all shadow objects have been successfully removed, the scheduling
   * will not happen until the shadow object count goes down to zero
   */
  void schedule_tag_removal(int index, string tag) {
    auto& ts = tag_io_size[index];
    auto ts_it = ts.find(tag);
    if (ts_it != ts.end()) {
      auto& size = ts_it->second;
      --size;
      // wait all shadow obj delete return
      if (size != 0)
        return;

      ts.erase(ts_it);
    }

    auto& rt = remove_tags[index];

    rt.push_back(tag);
    if (rt.size() >= (size_t)cct->_conf->rgw_gc_max_trim_chunk) {
      flush_remove_tags(index, rt);
    }
  }

  // Record how many shadow-object deletes must complete before `tag` may be
  // retired (see schedule_tag_removal above).
  void add_tag_io_size(int index, string tag, size_t size) {
    auto& ts = tag_io_size[index];
    ts.emplace(tag, size);
  }

  // Wait out every in-flight op; returns the last error seen (0 if none),
  // or -EAGAIN if shutdown was requested mid-drain.
  int drain_ios() {
    int ret_val = 0;
    while (!ios.empty()) {
      if (gc->going_down()) {
        return -EAGAIN;
      }
      auto ret = handle_next_completion();
      if (ret < 0) {
        ret_val = ret;
      }
    }
    return ret_val;
  }

  void drain() {
    drain_ios();
    flush_remove_tags();
    /* the tags draining might have generated more ios, drain those too */
    drain_ios();
  }

  // Submit one async batch removing shard `index`'s accumulated tags. The
  // tag list is cleared whether or not submission succeeds (see below).
  void flush_remove_tags(int index, vector<string>& rt) {
    IO index_io;
    index_io.type = IO::IndexIO;
    index_io.index = index;

    ldpp_dout(dpp, 20) << __func__ <<
      " removing entries from gc log shard index=" << index << ", size=" <<
      rt.size() << ", entries=" << rt << dendl;

    auto rt_guard = make_scope_guard(
      [&]
      {
        rt.clear();
      }
      );

    int ret = gc->remove(index, rt, &index_io.c);
    if (ret < 0) {
      /* we already cleared list of tags, this prevents us from
       * ballooning in case of a persistent problem
       */
      ldpp_dout(dpp, 0) << "WARNING: failed to remove tags on gc shard index=" <<
        index << " ret=" << ret << dendl;
      return;
    }
    if (perfcounter) {
      /* log the count of tags retired for rate estimation */
      perfcounter->inc(l_rgw_gc_retire, rt.size());
    }
    ios.push_back(index_io);
  }

  // Flush pending tag removals for every shard still in omap format.
  void flush_remove_tags() {
    int index = 0;
    for (auto& rt : remove_tags) {
      if (! gc->transitioned_objects_cache[index]) {
        flush_remove_tags(index, rt);
      }
      ++index;
    }
  }

  // Synchronously trim processed entries from a queue-format shard.
  int remove_queue_entries(int index, int num_entries) {
    int ret = gc->remove(index, num_entries);
    if (ret < 0) {
      ldpp_dout(dpp, 0) << "ERROR: failed to remove queue entries on index=" <<
        index << " ret=" << ret << dendl;
      return ret;
    }
    if (perfcounter) {
      /* log the count of tags retired for rate estimation */
      perfcounter->inc(l_rgw_gc_retire, num_entries);
    }
    return 0;
  }
}; // class RGWGCIOManager
+
// Process one gc shard: take its exclusive lock (self-expiring after
// max_secs so a crashed processor can't wedge the shard), list entries in
// the shard's current format, schedule async deletes of each chain's tail
// objects, and retire completed entries. Returns 0 when the shard is locked
// by another processor (not an error) and always 0 at the end — per-entry
// errors are logged and skipped.
int RGWGC::process(int index, int max_secs, bool expired_only,
                   RGWGCIOManager& io_manager)
{
  ldpp_dout(this, 20) << "RGWGC::process entered with GC index_shard=" <<
    index << ", max_secs=" << max_secs << ", expired_only=" <<
    expired_only << dendl;

  rados::cls::lock::Lock l(gc_index_lock_name);
  utime_t end = ceph_clock_now();

  /* max_secs should be greater than zero. We don't want a zero max_secs
   * to be translated as no timeout, since we'd then need to break the
   * lock and that would require a manual intervention. In this case
   * we can just wait it out. */
  if (max_secs <= 0)
    return -EAGAIN;

  end += max_secs;
  utime_t time(max_secs, 0);
  l.set_duration(time);

  int ret = l.lock_exclusive(&store->gc_pool_ctx, obj_names[index]);
  if (ret == -EBUSY) { /* already locked by another gc processor */
    ldpp_dout(this, 10) << "RGWGC::process failed to acquire lock on " <<
      obj_names[index] << dendl;
    return 0;
  }
  if (ret < 0)
    return ret;

  string marker;
  string next_marker;
  bool truncated;
  // NOTE(review): raw new/delete across the gotos below; a
  // std::unique_ptr<IoCtx> would make the cleanup paths leak-proof.
  IoCtx *ctx = new IoCtx;
  do {
    int max = 100;
    std::list<cls_rgw_gc_obj_info> entries;

    int ret = 0;

    // omap-format shard: list entries and probe whether it has transitioned
    if (! transitioned_objects_cache[index]) {
      ret = cls_rgw_gc_list(store->gc_pool_ctx, obj_names[index], marker, max, expired_only, entries, &truncated, next_marker);
      ldpp_dout(this, 20) <<
        "RGWGC::process cls_rgw_gc_list returned with returned:" << ret <<
        ", entries.size=" << entries.size() << ", truncated=" << truncated <<
        ", next_marker='" << next_marker << "'" << dendl;
      obj_version objv;
      cls_version_read(store->gc_pool_ctx, obj_names[index], &objv);
      if ((objv.ver == 1) && entries.size() == 0) {
        std::list<cls_rgw_gc_obj_info> non_expired_entries;
        ret = cls_rgw_gc_list(store->gc_pool_ctx, obj_names[index], marker, 1, false, non_expired_entries, &truncated, next_marker);
        if (non_expired_entries.size() == 0) {
          transitioned_objects_cache[index] = true;
          marker.clear();
          ldpp_dout(this, 20) << "RGWGC::process cls_rgw_gc_list returned NO non expired entries, so setting cache entry to TRUE" << dendl;
        } else {
          ret = 0;
          goto done;
        }
      }
      if ((objv.ver == 0) && (ret == -ENOENT || entries.size() == 0)) {
        ret = 0;
        goto done;
      }
    }

    // queue-format shard (possibly just detected above)
    if (transitioned_objects_cache[index]) {
      ret = cls_rgw_gc_queue_list_entries(store->gc_pool_ctx, obj_names[index], marker, max, expired_only, entries, &truncated, next_marker);
      ldpp_dout(this, 20) <<
        "RGWGC::process cls_rgw_gc_queue_list_entries returned with return value:" << ret <<
        ", entries.size=" << entries.size() << ", truncated=" << truncated <<
        ", next_marker='" << next_marker << "'" << dendl;
      if (entries.size() == 0) {
        ret = 0;
        goto done;
      }
    }

    if (ret < 0)
      goto done;

    marker = next_marker;

    string last_pool;
    std::list<cls_rgw_gc_obj_info>::iterator iter;
    for (iter = entries.begin(); iter != entries.end(); ++iter) {
      cls_rgw_gc_obj_info& info = *iter;

      ldpp_dout(this, 20) << "RGWGC::process iterating over entry tag='" <<
        info.tag << "', time=" << info.time << ", chain.objs.size()=" <<
        info.chain.objs.size() << dendl;

      std::list<cls_rgw_obj>::iterator liter;
      cls_rgw_obj_chain& chain = info.chain;

      // respect the shard lock's expiry; stop before it lapses
      utime_t now = ceph_clock_now();
      if (now >= end) {
        goto done;
      }
      if (! transitioned_objects_cache[index]) {
        if (chain.objs.empty()) {
          io_manager.schedule_tag_removal(index, info.tag);
        } else {
          io_manager.add_tag_io_size(index, info.tag, chain.objs.size());
        }
      }
      if (! chain.objs.empty()) {
        for (liter = chain.objs.begin(); liter != chain.objs.end(); ++liter) {
          cls_rgw_obj& obj = *liter;

          // reuse the IoCtx until the chain crosses into a different pool
          if (obj.pool != last_pool) {
            delete ctx;
            ctx = new IoCtx;
            ret = rgw_init_ioctx(this, store->get_rados_handle(), obj.pool, *ctx);
            if (ret < 0) {
              if (transitioned_objects_cache[index]) {
                goto done;
              }
              last_pool = "";
              ldpp_dout(this, 0) << "ERROR: failed to create ioctx pool=" <<
                obj.pool << dendl;
              continue;
            }
            last_pool = obj.pool;
          }

          ctx->locator_set_key(obj.loc);

          const string& oid = obj.key.name; /* just stored raw oid there */

          ldpp_dout(this, 5) << "RGWGC::process removing " << obj.pool <<
            ":" << obj.key.name << dendl;
          ObjectWriteOperation op;
          cls_refcount_put(op, info.tag, true);

          ret = io_manager.schedule_io(ctx, oid, &op, index, info.tag);
          if (ret < 0) {
            ldpp_dout(this, 0) <<
              "WARNING: failed to schedule deletion for oid=" << oid << dendl;
            if (transitioned_objects_cache[index]) {
              //If deleting oid failed for any of them, we will not delete queue entries
              goto done;
            }
          }
          if (going_down()) {
            // leave early, even if tag isn't removed, it's ok since it
            // will be picked up next time around
            goto done;
          }
        } // chains loop
      } // else -- chains not empty
    } // entries loop
    if (transitioned_objects_cache[index] && entries.size() > 0) {
      ret = io_manager.drain_ios();
      if (ret < 0) {
        goto done;
      }
      //Remove the entries from the queue
      ldpp_dout(this, 5) << "RGWGC::process removing entries, marker: " << marker << dendl;
      ret = io_manager.remove_queue_entries(index, entries.size());
      if (ret < 0) {
        ldpp_dout(this, 0) <<
          "WARNING: failed to remove queue entries" << dendl;
        goto done;
      }
    }
  } while (truncated);

done:
  /* we don't drain here, because if we're going down we don't want to
   * hold the system if backend is unresponsive
   */
  l.unlock(&store->gc_pool_ctx, obj_names[index]);
  delete ctx;

  return 0;
}
+
+int RGWGC::process(bool expired_only)
+{
+ int max_secs = cct->_conf->rgw_gc_processor_max_time;
+
+ const int start = ceph::util::generate_random_number(0, max_objs - 1);
+
+ RGWGCIOManager io_manager(this, store->ctx(), this);
+
+ for (int i = 0; i < max_objs; i++) {
+ int index = (i + start) % max_objs;
+ int ret = process(index, max_secs, expired_only, io_manager);
+ if (ret < 0)
+ return ret;
+ }
+ if (!going_down()) {
+ io_manager.drain();
+ }
+
+ return 0;
+}
+
// True once stop_processor() has requested shutdown (atomic read).
bool RGWGC::going_down()
{
  return down_flag;
}
+
// Spawn the background gc worker thread ("rgw_gc").
void RGWGC::start_processor()
{
  worker = new GCWorker(this, cct, this);
  worker->create("rgw_gc");
}
+
+void RGWGC::stop_processor()
+{
+ down_flag = true;
+ if (worker) {
+ worker->stop();
+ worker->join();
+ }
+ delete worker;
+ worker = NULL;
+}
+
// DoutPrefixProvider: log under the rgw debug subsystem.
unsigned RGWGC::get_subsys() const
{
  return dout_subsys;
}
+
// DoutPrefixProvider: prefix for all log lines emitted via this object.
std::ostream& RGWGC::gen_prefix(std::ostream& out) const
{
  return out << "garbage collection: ";
}
+
+void *RGWGC::GCWorker::entry() {
+ do {
+ utime_t start = ceph_clock_now();
+ ldpp_dout(dpp, 2) << "garbage collection: start" << dendl;
+ int r = gc->process(true);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: garbage collection process() returned error r=" << r << dendl;
+ }
+ ldpp_dout(dpp, 2) << "garbage collection: stop" << dendl;
+
+ if (gc->going_down())
+ break;
+
+ utime_t end = ceph_clock_now();
+ end -= start;
+ int secs = cct->_conf->rgw_gc_processor_period;
+
+ if (secs <= end.sec())
+ continue; // next round
+
+ secs -= end.sec();
+
+ std::unique_lock locker{lock};
+ cond.wait_for(locker, std::chrono::seconds(secs));
+ } while (!gc->going_down());
+
+ return NULL;
+}
+
// Wake the worker out of its inter-pass sleep so it can notice shutdown.
// (The down flag itself is set by RGWGC::stop_processor().)
void RGWGC::GCWorker::stop()
{
  std::lock_guard l{lock};
  cond.notify_all();
}
diff --git a/src/rgw/driver/rados/rgw_gc.h b/src/rgw/driver/rados/rgw_gc.h
new file mode 100644
index 000000000..f3df64099
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_gc.h
@@ -0,0 +1,82 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "include/types.h"
+#include "include/rados/librados.hpp"
+#include "common/ceph_mutex.h"
+#include "common/Cond.h"
+#include "common/Thread.h"
+#include "rgw_common.h"
+#include "rgw_sal.h"
+#include "rgw_rados.h"
+#include "cls/rgw/cls_rgw_types.h"
+
+#include <atomic>
+
+class RGWGCIOManager;
+
+// Garbage collector for RADOS objects left behind by deleted/overwritten
+// RGW objects. Entries are recorded per shard ("obj_names") and processed
+// by a background GCWorker thread.
+class RGWGC : public DoutPrefixProvider {
+ CephContext *cct;
+ RGWRados *store;
+ int max_objs;
+ std::string *obj_names; // per-shard gc object names, [max_objs] entries
+ std::atomic<bool> down_flag = { false };
+
+ static constexpr uint64_t seed = 8675309;
+
+ // map a chain tag to its gc shard index
+ int tag_index(const std::string& tag);
+ int send_chain(const cls_rgw_obj_chain& chain, const std::string& tag);
+
+ // background thread that periodically runs process(true)
+ class GCWorker : public Thread {
+ const DoutPrefixProvider *dpp;
+ CephContext *cct;
+ RGWGC *gc;
+ ceph::mutex lock = ceph::make_mutex("GCWorker");
+ ceph::condition_variable cond;
+
+ public:
+ GCWorker(const DoutPrefixProvider *_dpp, CephContext *_cct, RGWGC *_gc) : dpp(_dpp), cct(_cct), gc(_gc) {}
+ void *entry() override;
+ void stop();
+ };
+
+ GCWorker *worker;
+public:
+ RGWGC() : cct(NULL), store(NULL), max_objs(0), obj_names(NULL), worker(NULL) {}
+ ~RGWGC() {
+ stop_processor();
+ finalize();
+ }
+ std::vector<bool> transitioned_objects_cache;
+ std::tuple<int, std::optional<cls_rgw_obj_chain>> send_split_chain(const cls_rgw_obj_chain& chain, const std::string& tag);
+
+ // asynchronously defer garbage collection on an object that's still being read
+ int async_defer_chain(const std::string& tag, const cls_rgw_obj_chain& info);
+
+ // callback for when async_defer_chain() fails with ECANCELED
+ void on_defer_canceled(const cls_rgw_gc_obj_info& info);
+
+ int remove(int index, const std::vector<std::string>& tags, librados::AioCompletion **pc);
+ int remove(int index, int num_entries);
+
+ void initialize(CephContext *_cct, RGWRados *_store);
+ void finalize();
+
+ int list(int *index, std::string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated, bool& processing_queue);
+ void list_init(int *index) { *index = 0; }
+ // process a single shard / all shards respectively
+ int process(int index, int process_max_secs, bool expired_only,
+ RGWGCIOManager& io_manager);
+ int process(bool expired_only);
+
+ bool going_down();
+ void start_processor();
+ void stop_processor();
+
+ CephContext *get_cct() const override { return store->ctx(); }
+ unsigned get_subsys() const;
+
+ std::ostream& gen_prefix(std::ostream& out) const;
+
+};
diff --git a/src/rgw/driver/rados/rgw_gc_log.cc b/src/rgw/driver/rados/rgw_gc_log.cc
new file mode 100644
index 000000000..ad819eddc
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_gc_log.cc
@@ -0,0 +1,55 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_gc_log.h"
+
+#include "cls/rgw/cls_rgw_client.h"
+#include "cls/rgw_gc/cls_rgw_gc_client.h"
+#include "cls/version/cls_version_client.h"
+
+
+// Initialize a GC log object in the v1 (queue-based) format. The version
+// check guards against double-initialization: the op only applies while the
+// object is still at version 0, after which the version is bumped to 1.
+void gc_log_init2(librados::ObjectWriteOperation& op,
+ uint64_t max_size, uint64_t max_deferred)
+{
+ obj_version objv; // objv.ver = 0
+ cls_version_check(op, objv, VER_COND_EQ);
+ cls_rgw_gc_queue_init(op, max_size, max_deferred);
+ objv.ver = 1;
+ cls_version_set(op, objv);
+}
+
+// Enqueue into the legacy (version 0, omap-based) GC log. The version check
+// makes the op apply only while the object has not been upgraded to v1.
+void gc_log_enqueue1(librados::ObjectWriteOperation& op,
+ uint32_t expiration, cls_rgw_gc_obj_info& info)
+{
+ obj_version objv; // objv.ver = 0
+ cls_version_check(op, objv, VER_COND_EQ);
+ cls_rgw_gc_set_entry(op, expiration, info);
+}
+
+// Enqueue into the v1 (queue-based) GC log; applies only if the object has
+// been initialized to version 1 by gc_log_init2().
+void gc_log_enqueue2(librados::ObjectWriteOperation& op,
+ uint32_t expiration, const cls_rgw_gc_obj_info& info)
+{
+ obj_version objv;
+ objv.ver = 1;
+ cls_version_check(op, objv, VER_COND_EQ);
+ cls_rgw_gc_queue_enqueue(op, expiration, info);
+}
+
+// Defer a legacy (version 0) GC log entry by tag; guarded so it only
+// applies while the object is still in the pre-queue format.
+void gc_log_defer1(librados::ObjectWriteOperation& op,
+ uint32_t expiration, const cls_rgw_gc_obj_info& info)
+{
+ obj_version objv; // objv.ver = 0
+ cls_version_check(op, objv, VER_COND_EQ);
+ cls_rgw_gc_defer_entry(op, expiration, info.tag);
+}
+
+// Defer a v1 (queue-based) GC log entry, and also remove any legacy omap
+// entry with the same tag left over from before the upgrade.
+void gc_log_defer2(librados::ObjectWriteOperation& op,
+ uint32_t expiration, const cls_rgw_gc_obj_info& info)
+{
+ obj_version objv;
+ objv.ver = 1;
+ cls_version_check(op, objv, VER_COND_EQ);
+ cls_rgw_gc_queue_defer_entry(op, expiration, info);
+ // TODO: conditional on whether omap is known to be empty
+ cls_rgw_gc_remove(op, {info.tag});
+}
diff --git a/src/rgw/driver/rados/rgw_lc_tier.cc b/src/rgw/driver/rados/rgw_lc_tier.cc
new file mode 100644
index 000000000..c52acef65
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_lc_tier.cc
@@ -0,0 +1,1310 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <string.h>
+#include <iostream>
+#include <map>
+
+#include "common/Formatter.h"
+#include <common/errno.h>
+#include "rgw_lc.h"
+#include "rgw_lc_tier.h"
+#include "rgw_string.h"
+#include "rgw_zone.h"
+#include "rgw_common.h"
+#include "rgw_rest.h"
+#include "svc_zone.h"
+
+#include <boost/algorithm/string/split.hpp>
+#include <boost/algorithm/string.hpp>
+#include <boost/algorithm/string/predicate.hpp>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+// Bookkeeping for a single part of a cloud-tier multipart upload:
+// its part number, byte offset/size in the source object, and the etag
+// returned by the remote endpoint (needed for CompleteMultipartUpload).
+struct rgw_lc_multipart_part_info {
+ int part_num{0};
+ uint64_t ofs{0};
+ uint64_t size{0};
+ std::string etag;
+};
+
+// Source-object properties carried through a cloud transition.
+// NOTE: target_acl_mappings is held by reference — the map passed to the
+// constructor must outlive this struct.
+struct rgw_lc_obj_properties {
+ ceph::real_time mtime;
+ std::string etag;
+ uint64_t versioned_epoch{0};
+ std::map<std::string, RGWTierACLMapping>& target_acl_mappings;
+ std::string target_storage_class;
+
+ rgw_lc_obj_properties(ceph::real_time _mtime, std::string _etag,
+ uint64_t _versioned_epoch, std::map<std::string,
+ RGWTierACLMapping>& _t_acl_mappings,
+ std::string _t_storage_class) :
+ mtime(_mtime), etag(_etag),
+ versioned_epoch(_versioned_epoch),
+ target_acl_mappings(_t_acl_mappings),
+ target_storage_class(_t_storage_class) {}
+};
+
+// Multipart-upload resume state persisted in a system object (see
+// read_upload_status/put_upload_status) so an interrupted transition can
+// be resumed or aborted on a later lifecycle run.
+struct rgw_lc_multipart_upload_info {
+ std::string upload_id;
+ uint64_t obj_size;
+ ceph::real_time mtime;
+ std::string etag;
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(upload_id, bl);
+ encode(obj_size, bl);
+ encode(mtime, bl);
+ encode(etag, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(upload_id, bl);
+ decode(obj_size, bl);
+ decode(mtime, bl);
+ decode(etag, bl);
+ DECODE_FINISH(bl);
+ }
+};
+WRITE_CLASS_ENCODER(rgw_lc_multipart_upload_info)
+
+// Return the "-<instance>" suffix for a versioned key, or "" for
+// unversioned/null-instance keys.
+static inline string get_key_instance(const rgw_obj_key& key)
+{
+ if (!key.instance.empty() &&
+ !key.have_null_instance()) {
+ return "-" + key.instance;
+ }
+ return "";
+}
+
+// Build the remote object name for a key: the plain name, with
+// "-<instance>" appended for non-null versioned instances.
+static inline string get_key_oid(const rgw_obj_key& key)
+{
+ string oid = key.name;
+ if (!key.instance.empty() &&
+ !key.have_null_instance()) {
+ oid += string("-") + key.instance;
+ }
+ return oid;
+}
+
+// REST resource path ("bucket/key") for an object on the cloud endpoint.
+static inline string obj_to_aws_path(const rgw_obj& obj)
+{
+ string path = obj.bucket.name + "/" + get_key_oid(obj.key);
+ return path;
+}
+
+// Load persisted multipart resume state from the given system object.
+// Returns 0 on success, -EIO on empty/corrupt data, or the underlying
+// read error (e.g. -ENOENT when no status object exists yet).
+static int read_upload_status(const DoutPrefixProvider *dpp, rgw::sal::Driver *driver,
+ const rgw_raw_obj *status_obj, rgw_lc_multipart_upload_info *status)
+{
+ int ret = 0;
+ // sysobj access is rados-specific; other backends can't tier to cloud
+ rgw::sal::RadosStore *rados = dynamic_cast<rgw::sal::RadosStore*>(driver);
+
+ if (!rados) {
+ ldpp_dout(dpp, 0) << "ERROR: Not a RadosStore. Cannot be transitioned to cloud." << dendl;
+ return -1;
+ }
+
+ auto& pool = status_obj->pool;
+ const auto oid = status_obj->oid;
+ auto sysobj = rados->svc()->sysobj;
+ bufferlist bl;
+
+ ret = rgw_get_system_obj(sysobj, pool, oid, bl, nullptr, nullptr,
+ null_yield, dpp);
+
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (bl.length() > 0) {
+ try {
+ auto p = bl.cbegin();
+ status->decode(p);
+ } catch (buffer::error& e) {
+ ldpp_dout(dpp, 10) << "failed to decode status obj: "
+ << e.what() << dendl;
+ return -EIO;
+ }
+ } else {
+ // zero-length status object is treated as corrupt
+ return -EIO;
+ }
+
+ return 0;
+}
+
+// Persist multipart resume state to the given system object
+// (exclusive-create semantics per rgw_put_system_obj's flag).
+static int put_upload_status(const DoutPrefixProvider *dpp, rgw::sal::Driver *driver,
+ const rgw_raw_obj *status_obj, rgw_lc_multipart_upload_info *status)
+{
+ int ret = 0;
+ rgw::sal::RadosStore *rados = dynamic_cast<rgw::sal::RadosStore*>(driver);
+
+ if (!rados) {
+ ldpp_dout(dpp, 0) << "ERROR: Not a RadosStore. Cannot be transitioned to cloud." << dendl;
+ return -1;
+ }
+
+ auto& pool = status_obj->pool;
+ const auto oid = status_obj->oid;
+ auto sysobj = rados->svc()->sysobj;
+ bufferlist bl;
+ status->encode(bl);
+
+ ret = rgw_put_system_obj(dpp, sysobj, pool, oid, bl, true, nullptr,
+ real_time{}, null_yield);
+
+ return ret;
+}
+
+// Remove the persisted multipart resume state (after completion or abort).
+static int delete_upload_status(const DoutPrefixProvider *dpp, rgw::sal::Driver *driver,
+ const rgw_raw_obj *status_obj)
+{
+ int ret = 0;
+ rgw::sal::RadosStore *rados = dynamic_cast<rgw::sal::RadosStore*>(driver);
+
+ if (!rados) {
+ ldpp_dout(dpp, 0) << "ERROR: Not a RadosStore. Cannot be transitioned to cloud." << dendl;
+ return -1;
+ }
+
+ auto& pool = status_obj->pool;
+ const auto oid = status_obj->oid;
+ auto sysobj = rados->svc()->sysobj;
+
+ ret = rgw_delete_system_obj(dpp, sysobj, pool, oid, nullptr, null_yield);
+
+ return ret;
+}
+
+// Attribute names forwarded verbatim to the cloud endpoint; everything
+// else is filtered/remapped in RGWLCCloudStreamPut::init_send_attrs().
+static std::set<string> keep_headers = { "CONTENT_TYPE",
+ "CONTENT_ENCODING",
+ "CONTENT_DISPOSITION",
+ "CONTENT_LANGUAGE" };
+
+/*
+ * mapping between rgw object attrs and output http fields
+ *
+ static const struct rgw_http_attr base_rgw_to_http_attrs[] = {
+ { RGW_ATTR_CONTENT_LANG, "Content-Language" },
+ { RGW_ATTR_EXPIRES, "Expires" },
+ { RGW_ATTR_CACHE_CONTROL, "Cache-Control" },
+ { RGW_ATTR_CONTENT_DISP, "Content-Disposition" },
+ { RGW_ATTR_CONTENT_ENC, "Content-Encoding" },
+ { RGW_ATTR_USER_MANIFEST, "X-Object-Manifest" },
+ { RGW_ATTR_X_ROBOTS_TAG , "X-Robots-Tag" },
+ { RGW_ATTR_STORAGE_CLASS , "X-Amz-Storage-Class" },
+// RGW_ATTR_AMZ_WEBSITE_REDIRECT_LOCATION header depends on access mode:
+// S3 endpoint: x-amz-website-redirect-location
+// S3Website endpoint: Location
+{ RGW_ATTR_AMZ_WEBSITE_REDIRECT_LOCATION, "x-amz-website-redirect-location" },
+}; */
+
+// Translate rgw object attrs into HTTP-style headers:
+// - known attrs map via rgw_to_http_attrs,
+// - user metadata keeps its RGW_ATTR_META_PREFIX-prefixed name,
+// - content type is exported under "CONTENT_TYPE".
+static void init_headers(map<string, bufferlist>& attrs,
+ map<string, string>& headers)
+{
+ for (auto& kv : attrs) {
+ const char * name = kv.first.c_str();
+ const auto aiter = rgw_to_http_attrs.find(name);
+
+ if (aiter != std::end(rgw_to_http_attrs)) {
+ headers[aiter->second] = rgw_bl_str(kv.second);
+ } else if (strncmp(name, RGW_ATTR_META_PREFIX,
+ sizeof(RGW_ATTR_META_PREFIX)-1) == 0) {
+ // strip the attr prefix, then re-prepend it to form the header name
+ name += sizeof(RGW_ATTR_META_PREFIX) - 1;
+ string sname(name);
+ string name_prefix = RGW_ATTR_META_PREFIX;
+ char full_name_buf[name_prefix.size() + sname.size() + 1];
+ snprintf(full_name_buf, sizeof(full_name_buf), "%.*s%.*s",
+ static_cast<int>(name_prefix.length()),
+ name_prefix.data(),
+ static_cast<int>(sname.length()),
+ sname.data());
+ headers[full_name_buf] = rgw_bl_str(kv.second);
+ } else if (strcmp(name,RGW_ATTR_CONTENT_TYPE) == 0) {
+ headers["CONTENT_TYPE"] = rgw_bl_str(kv.second);
+ }
+ }
+}
+
+/* Read object or just head from remote endpoint. For now initializes only headers,
+ * but can be extended to fetch etag, mtime etc if needed.
+ */
+/* Read object or just head from remote endpoint. For now initializes only headers,
+ * but can be extended to fetch etag, mtime etc if needed.
+ */
+static int cloud_tier_get_object(RGWLCCloudTierCtx& tier_ctx, bool head,
+ std::map<std::string, std::string>& headers) {
+ RGWRESTConn::get_obj_params req_params;
+ std::string target_obj_name;
+ int ret = 0;
+ rgw_lc_obj_properties obj_properties(tier_ctx.o.meta.mtime, tier_ctx.o.meta.etag,
+ tier_ctx.o.versioned_epoch, tier_ctx.acl_mappings,
+ tier_ctx.target_storage_class);
+ std::string etag;
+ RGWRESTStreamRWRequest *in_req;
+
+ // remote name is "<src-bucket>/<obj>", suffixed with the version instance
+ // for non-current versions
+ rgw_bucket dest_bucket;
+ dest_bucket.name = tier_ctx.target_bucket_name;
+ target_obj_name = tier_ctx.bucket_info.bucket.name + "/" +
+ tier_ctx.obj->get_name();
+ if (!tier_ctx.o.is_current()) {
+ target_obj_name += get_key_instance(tier_ctx.obj->get_key());
+ }
+
+ rgw_obj dest_obj(dest_bucket, rgw_obj_key(target_obj_name));
+
+ /* init input connection */
+ req_params.get_op = !head;
+ req_params.prepend_metadata = true;
+ req_params.rgwx_stat = true;
+ req_params.sync_manifest = true;
+ req_params.skip_decrypt = true;
+
+ ret = tier_ctx.conn.get_obj(tier_ctx.dpp, dest_obj, req_params, true /* send */, &in_req);
+ if (ret < 0) {
+ ldpp_dout(tier_ctx.dpp, 0) << "ERROR: " << __func__ << "(): conn.get_obj() returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ /* fetch headers */
+ // ENOENT is not an error here: a missing remote object just yields no headers
+ ret = tier_ctx.conn.complete_request(in_req, nullptr, nullptr, nullptr, nullptr, &headers, null_yield);
+ if (ret < 0 && ret != -ENOENT) {
+ ldpp_dout(tier_ctx.dpp, 20) << "ERROR: " << __func__ << "(): conn.complete_request() returned ret=" << ret << dendl;
+ return ret;
+ }
+ return 0;
+}
+
+/* Check whether the object was already transitioned to the cloud endpoint:
+ * the local mtime must match the x-amz-meta-rgwx-source-mtime header that
+ * was stamped on the remote copy at upload time. */
+static bool is_already_tiered(const DoutPrefixProvider *dpp,
+ std::map<std::string, std::string>& headers,
+ ceph::real_time& mtime) {
+ char buf[32];
+ map<string, string> attrs = headers;
+
+ for (const auto& a : attrs) {
+ ldpp_dout(dpp, 20) << "GetCrf attr[" << a.first << "] = " << a.second <<dendl;
+ }
+ utime_t ut(mtime);
+ snprintf(buf, sizeof(buf), "%lld.%09lld",
+ (long long)ut.sec(),
+ (long long)ut.nsec());
+
+ // header name casing differs between server implementations; try both
+ string s = attrs["X_AMZ_META_RGWX_SOURCE_MTIME"];
+
+ if (s.empty())
+ s = attrs["x_amz_meta_rgwx_source_mtime"];
+
+ ldpp_dout(dpp, 20) << "is_already_tiered attrs[X_AMZ_META_RGWX_SOURCE_MTIME] = " << s <<dendl;
+ ldpp_dout(dpp, 20) << "is_already_tiered mtime buf = " << buf <<dendl;
+
+ if (!s.empty() && !strcmp(s.c_str(), buf)){
+ return true;
+ }
+ return false;
+}
+
+/* Read object locally & also initialize dest rest obj based on read attrs */
+class RGWLCStreamRead
+{
+ CephContext *cct;
+ const DoutPrefixProvider *dpp;
+ std::map<std::string, bufferlist> attrs;
+ uint64_t obj_size;
+ rgw::sal::Object *obj;
+ const real_time &mtime;
+
+ bool multipart{false};
+ uint64_t m_part_size{0};
+ off_t m_part_off{0};
+ off_t m_part_end{0};
+
+ std::unique_ptr<rgw::sal::Object::ReadOp> read_op;
+ off_t ofs{0};
+ off_t end{0};
+ rgw_rest_obj rest_obj;
+
+ int retcode{0};
+
+ public:
+ RGWLCStreamRead(CephContext *_cct, const DoutPrefixProvider *_dpp,
+ rgw::sal::Object *_obj, const real_time &_mtime) :
+ cct(_cct), dpp(_dpp), obj(_obj), mtime(_mtime),
+ read_op(obj->get_read_op()) {}
+
+ ~RGWLCStreamRead() {};
+ int set_range(off_t _ofs, off_t _end);
+ int get_range(off_t &_ofs, off_t &_end);
+ rgw_rest_obj& get_rest_obj();
+ void set_multipart(uint64_t part_size, off_t part_off, off_t part_end);
+ int init();
+ int init_rest_obj();
+ int read(off_t ofs, off_t end, RGWGetDataCB *out_cb);
+};
+
+/* Send PUT op to remote endpoint */
+class RGWLCCloudStreamPut
+{
+ const DoutPrefixProvider *dpp;
+ rgw_lc_obj_properties obj_properties;
+ RGWRESTConn& conn;
+ const rgw_obj& dest_obj;
+ std::string etag;
+ RGWRESTStreamS3PutObj *out_req{nullptr};
+
+ struct multipart_info {
+ bool is_multipart{false};
+ std::string upload_id;
+ int part_num{0};
+ uint64_t part_size;
+ } multipart;
+
+ int retcode;
+
+ public:
+ RGWLCCloudStreamPut(const DoutPrefixProvider *_dpp,
+ const rgw_lc_obj_properties& _obj_properties,
+ RGWRESTConn& _conn,
+ const rgw_obj& _dest_obj) :
+ dpp(_dpp), obj_properties(_obj_properties), conn(_conn), dest_obj(_dest_obj) {
+ }
+ int init();
+ static bool keep_attr(const std::string& h);
+ static void init_send_attrs(const DoutPrefixProvider *dpp, const rgw_rest_obj& rest_obj,
+ const rgw_lc_obj_properties& obj_properties,
+ std::map<std::string, std::string>& attrs);
+ void send_ready(const DoutPrefixProvider *dpp, const rgw_rest_obj& rest_obj);
+ void handle_headers(const std::map<std::string, std::string>& headers);
+ bool get_etag(std::string *petag);
+ void set_multipart(const std::string& upload_id, int part_num, uint64_t part_size);
+ int send();
+ RGWGetDataCB *get_cb();
+ int complete_request();
+};
+
+// Record the byte range [ofs, end] to read from the source object.
+int RGWLCStreamRead::set_range(off_t _ofs, off_t _end) {
+ ofs = _ofs;
+ end = _end;
+
+ return 0;
+}
+
+// Return the byte range established by init()/set_range().
+int RGWLCStreamRead::get_range(off_t &_ofs, off_t &_end) {
+ _ofs = ofs;
+ _end = end;
+
+ return 0;
+}
+
+// Accessor for the rest_obj populated by init_rest_obj().
+rgw_rest_obj& RGWLCStreamRead::get_rest_obj() {
+ return rest_obj;
+}
+
+// Switch to multipart mode: init() will read only this part's slice.
+void RGWLCStreamRead::set_multipart(uint64_t part_size, off_t part_off, off_t part_end) {
+ multipart = true;
+ m_part_size = part_size;
+ m_part_off = part_off;
+ m_part_end = part_end;
+}
+
+// Prepare the local read: verify the object's mtime still matches what the
+// lifecycle pass saw (-ECANCELED on a race), snapshot attrs/size, build the
+// rest_obj, and select the byte range (whole object or the multipart slice).
+int RGWLCStreamRead::init() {
+ optional_yield y = null_yield;
+ real_time read_mtime;
+
+ read_op->params.lastmod = &read_mtime;
+
+ int ret = read_op->prepare(y, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: fail to prepare read_op, ret = " << ret << dendl;
+ return ret;
+ }
+
+ if (read_mtime != mtime) {
+ /* raced */
+ return -ECANCELED;
+ }
+
+ attrs = obj->get_attrs();
+ obj_size = obj->get_obj_size();
+
+ ret = init_rest_obj();
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: fail to initialize rest_obj, ret = " << ret << dendl;
+ return ret;
+ }
+
+ if (!multipart) {
+ set_range(0, obj_size - 1);
+ } else {
+ set_range(m_part_off, m_part_end);
+ }
+ return 0;
+}
+
+// Populate rest_obj (key, content length, headers, ACLs) from the local
+// object's attrs so it can be replayed as a remote PUT.
+int RGWLCStreamRead::init_rest_obj() {
+ /* Initialize rgw_rest_obj.
+ * Reference: do_decode_rest_obj
+ * Check how to copy headers content */
+ rest_obj.init(obj->get_key());
+
+ if (!multipart) {
+ rest_obj.content_len = obj_size;
+ } else {
+ rest_obj.content_len = m_part_size;
+ }
+
+ /* For mulitpart attrs are sent as part of InitMultipartCR itself */
+ if (multipart) {
+ return 0;
+ }
+
+ /*
+ * XXX: verify if its right way to copy attrs into rest obj
+ */
+ init_headers(attrs, rest_obj.attrs);
+
+ // decode the source ACL so grants can be mapped onto the destination
+ rest_obj.acls.set_ctx(cct);
+ const auto aiter = attrs.find(RGW_ATTR_ACL);
+ if (aiter != attrs.end()) {
+ bufferlist& bl = aiter->second;
+ auto bliter = bl.cbegin();
+ try {
+ rest_obj.acls.decode(bliter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to decode policy off attrs" << dendl;
+ return -EIO;
+ }
+ } else {
+ // tolerated: object is sent without mapped grants
+ ldpp_dout(dpp, 0) << "WARNING: acl attrs not provided" << dendl;
+ }
+ return 0;
+}
+
+// Stream [ofs, end] of the source object through the given data callback.
+int RGWLCStreamRead::read(off_t ofs, off_t end, RGWGetDataCB *out_cb) {
+ int ret = read_op->iterate(dpp, ofs, end, out_cb, null_yield);
+ return ret;
+}
+
+// Create the outbound PUT request; for multipart parts, attach the
+// uploadId/partNumber query parameters expected by S3 UploadPart.
+int RGWLCCloudStreamPut::init() {
+ /* init output connection */
+ if (multipart.is_multipart) {
+ char buf[32];
+ snprintf(buf, sizeof(buf), "%d", multipart.part_num);
+ rgw_http_param_pair params[] = { { "uploadId", multipart.upload_id.c_str() },
+ { "partNumber", buf },
+ { nullptr, nullptr } };
+ conn.put_obj_send_init(dest_obj, params, &out_req);
+ } else {
+ conn.put_obj_send_init(dest_obj, nullptr, &out_req);
+ }
+
+ return 0;
+}
+
+// True for headers forwarded verbatim (see keep_headers above).
+bool RGWLCCloudStreamPut::keep_attr(const string& h) {
+ return (keep_headers.find(h) != keep_headers.end());
+}
+
+// Build the header/attr set for the outbound PUT:
+// - forward keep_headers verbatim, pass through x-amz-* headers,
+// - translate mapped ACL grants into x-amz-grant-* headers,
+// - set the target storage class,
+// - stamp rgwx-source metadata (mtime/etag/key/version) so a later pass
+//   can detect the object is already tiered (see is_already_tiered()).
+void RGWLCCloudStreamPut::init_send_attrs(const DoutPrefixProvider *dpp,
+ const rgw_rest_obj& rest_obj,
+ const rgw_lc_obj_properties& obj_properties,
+ std::map<string, string>& attrs) {
+
+ map<string, RGWTierACLMapping>& acl_mappings(obj_properties.target_acl_mappings);
+ const std::string& target_storage_class = obj_properties.target_storage_class;
+
+ attrs.clear();
+
+ for (auto& hi : rest_obj.attrs) {
+ if (keep_attr(hi.first)) {
+ attrs.insert(hi);
+ } else {
+ // pass through anything that looks like an x-amz header (lowercased)
+ std::string s1 = boost::algorithm::to_lower_copy(hi.first);
+ const char* k = std::strstr(s1.c_str(), "x-amz");
+ if (k) {
+ attrs[k] = hi.second;
+ }
+ }
+ }
+
+ const auto acl = rest_obj.acls.get_acl();
+
+ // permission bits -> list of "type=grantee" strings
+ map<int, vector<string> > access_map;
+
+ if (!acl_mappings.empty()) {
+ for (auto& grant : acl.get_grant_map()) {
+ auto& orig_grantee = grant.first;
+ auto& perm = grant.second;
+
+ string grantee;
+
+ const auto& am = acl_mappings;
+
+ // only grantees with an explicit mapping are carried over
+ const auto iter = am.find(orig_grantee);
+ if (iter == am.end()) {
+ ldpp_dout(dpp, 20) << "acl_mappings: Could not find " << orig_grantee << " .. ignoring" << dendl;
+ continue;
+ }
+
+ grantee = iter->second.dest_id;
+
+ string type;
+
+ switch (iter->second.type) {
+ case ACL_TYPE_CANON_USER:
+ type = "id";
+ break;
+ case ACL_TYPE_EMAIL_USER:
+ type = "emailAddress";
+ break;
+ case ACL_TYPE_GROUP:
+ type = "uri";
+ break;
+ default:
+ continue;
+ }
+
+ string tv = type + "=" + grantee;
+
+ int flags = perm.get_permission().get_permissions();
+ if ((flags & RGW_PERM_FULL_CONTROL) == RGW_PERM_FULL_CONTROL) {
+ access_map[flags].push_back(tv);
+ continue;
+ }
+
+ // otherwise record each individual permission bit separately
+ for (int i = 1; i <= RGW_PERM_WRITE_ACP; i <<= 1) {
+ if (flags & i) {
+ access_map[i].push_back(tv);
+ }
+ }
+ }
+ }
+
+ for (const auto& aiter : access_map) {
+ int grant_type = aiter.first;
+
+ string header_str("x-amz-grant-");
+
+ switch (grant_type) {
+ case RGW_PERM_READ:
+ header_str.append("read");
+ break;
+ case RGW_PERM_WRITE:
+ header_str.append("write");
+ break;
+ case RGW_PERM_READ_ACP:
+ header_str.append("read-acp");
+ break;
+ case RGW_PERM_WRITE_ACP:
+ header_str.append("write-acp");
+ break;
+ case RGW_PERM_FULL_CONTROL:
+ header_str.append("full-control");
+ break;
+ }
+
+ string s;
+
+ // comma-separated grantee list per grant header
+ for (const auto& viter : aiter.second) {
+ if (!s.empty()) {
+ s.append(", ");
+ }
+ s.append(viter);
+ }
+
+ ldpp_dout(dpp, 20) << "acl_mappings: set acl: " << header_str << "=" << s << dendl;
+
+ attrs[header_str] = s;
+ }
+
+ /* Copy target storage class */
+ if (!target_storage_class.empty()) {
+ attrs["x-amz-storage-class"] = target_storage_class;
+ } else {
+ attrs["x-amz-storage-class"] = "STANDARD";
+ }
+
+ /* New attribute to specify its transitioned from RGW */
+ attrs["x-amz-meta-rgwx-source"] = "rgw";
+ attrs["x-rgw-cloud"] = "true";
+ attrs["x-rgw-cloud-keep-attrs"] = "true";
+
+ char buf[32];
+ snprintf(buf, sizeof(buf), "%llu", (long long)obj_properties.versioned_epoch);
+ attrs["x-amz-meta-rgwx-versioned-epoch"] = buf;
+
+ utime_t ut(obj_properties.mtime);
+ snprintf(buf, sizeof(buf), "%lld.%09lld",
+ (long long)ut.sec(),
+ (long long)ut.nsec());
+
+ attrs["x-amz-meta-rgwx-source-mtime"] = buf;
+ attrs["x-amz-meta-rgwx-source-etag"] = obj_properties.etag;
+ attrs["x-amz-meta-rgwx-source-key"] = rest_obj.key.name;
+ if (!rest_obj.key.instance.empty()) {
+ attrs["x-amz-meta-rgwx-source-version-id"] = rest_obj.key.instance;
+ }
+ for (const auto& a : attrs) {
+ ldpp_dout(dpp, 30) << "init_send_attrs attr[" << a.first << "] = " << a.second <<dendl;
+ }
+}
+
+// Finalize request headers and hand them to the stream request. For
+// multipart parts new_attrs stays empty: attrs were sent with the
+// InitMultipartUpload request instead.
+void RGWLCCloudStreamPut::send_ready(const DoutPrefixProvider *dpp, const rgw_rest_obj& rest_obj) {
+ auto r = static_cast<RGWRESTStreamS3PutObj *>(out_req);
+
+ std::map<std::string, std::string> new_attrs;
+ if (!multipart.is_multipart) {
+ init_send_attrs(dpp, rest_obj, obj_properties, new_attrs);
+ }
+
+ r->set_send_length(rest_obj.content_len);
+
+ RGWAccessControlPolicy policy;
+
+ r->send_ready(dpp, conn.get_key(), new_attrs, policy);
+}
+
+// Capture the ETAG response header from the remote endpoint.
+void RGWLCCloudStreamPut::handle_headers(const map<string, string>& headers) {
+ for (const auto& h : headers) {
+ if (h.first == "ETAG") {
+ etag = h.second;
+ }
+ }
+}
+
+// Return the remote etag, if one was received; false otherwise.
+bool RGWLCCloudStreamPut::get_etag(string *petag) {
+ if (etag.empty()) {
+ return false;
+ }
+ *petag = etag;
+ return true;
+}
+
+// Mark this PUT as an UploadPart of the given multipart upload.
+void RGWLCCloudStreamPut::set_multipart(const string& upload_id, int part_num, uint64_t part_size) {
+ multipart.is_multipart = true;
+ multipart.upload_id = upload_id;
+ multipart.part_num = part_num;
+ multipart.part_size = part_size;
+}
+
+// Kick off the HTTP request (headers go out; body is streamed via get_cb()).
+int RGWLCCloudStreamPut::send() {
+ int ret = RGWHTTP::send(out_req);
+ return ret;
+}
+
+// Data callback that feeds read chunks into the outbound request body.
+RGWGetDataCB *RGWLCCloudStreamPut::get_cb() {
+ return out_req->get_out_cb();
+}
+
+// Wait for the PUT to finish; also collects the response etag/mtime.
+int RGWLCCloudStreamPut::complete_request() {
+ int ret = conn.complete_request(out_req, etag, &obj_properties.mtime, null_yield);
+ return ret;
+}
+
+/* Read local copy and write to Cloud endpoint */
+static int cloud_tier_transfer_object(const DoutPrefixProvider* dpp,
+ RGWLCStreamRead* readf, RGWLCCloudStreamPut* writef) {
+ std::string url;
+ bufferlist bl;
+ bool sent_attrs{false};
+ int ret{0};
+ off_t ofs;
+ off_t end;
+
+ ret = readf->init();
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: fail to initialize in_crf, ret = " << ret << dendl;
+ return ret;
+ }
+ readf->get_range(ofs, end);
+ rgw_rest_obj& rest_obj = readf->get_rest_obj();
+ if (!sent_attrs) {
+ ret = writef->init();
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: fail to initialize out_crf, ret = " << ret << dendl;
+ return ret;
+ }
+
+ writef->send_ready(dpp, rest_obj);
+ ret = writef->send();
+ if (ret < 0) {
+ return ret;
+ }
+ sent_attrs = true;
+ }
+
+ ret = readf->read(ofs, end, writef->get_cb());
+
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: fail to read from in_crf, ret = " << ret << dendl;
+ return ret;
+ }
+
+ ret = writef->complete_request();
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: fail to complete request, ret = " << ret << dendl;
+ return ret;
+ }
+
+ return 0;
+}
+
+// Transition an object to the cloud endpoint with a single (non-multipart)
+// PUT. The remote name is "<src-bucket>/<obj>", version-suffixed for
+// non-current versions.
+static int cloud_tier_plain_transfer(RGWLCCloudTierCtx& tier_ctx) {
+ int ret;
+
+ rgw_lc_obj_properties obj_properties(tier_ctx.o.meta.mtime, tier_ctx.o.meta.etag,
+ tier_ctx.o.versioned_epoch, tier_ctx.acl_mappings,
+ tier_ctx.target_storage_class);
+ std::string target_obj_name;
+
+ rgw_bucket dest_bucket;
+ dest_bucket.name = tier_ctx.target_bucket_name;
+
+ target_obj_name = tier_ctx.bucket_info.bucket.name + "/" +
+ tier_ctx.obj->get_name();
+ if (!tier_ctx.o.is_current()) {
+ target_obj_name += get_key_instance(tier_ctx.obj->get_key());
+ }
+
+ rgw_obj dest_obj(dest_bucket, rgw_obj_key(target_obj_name));
+
+ tier_ctx.obj->set_atomic();
+
+ /* Prepare Read from source */
+ /* TODO: Define readf, writef as stack variables. For some reason,
+ * when used as stack variables (esp., readf), the transition seems to
+ * be taking lot of time eventually erroring out at times.
+ */
+ std::shared_ptr<RGWLCStreamRead> readf;
+ readf.reset(new RGWLCStreamRead(tier_ctx.cct, tier_ctx.dpp,
+ tier_ctx.obj, tier_ctx.o.meta.mtime));
+
+ std::shared_ptr<RGWLCCloudStreamPut> writef;
+ writef.reset(new RGWLCCloudStreamPut(tier_ctx.dpp, obj_properties, tier_ctx.conn,
+ dest_obj));
+
+ /* actual Read & Write */
+ ret = cloud_tier_transfer_object(tier_ctx.dpp, readf.get(), writef.get());
+
+ return ret;
+}
+
+// Upload one part of a multipart transition: stream the part's byte slice
+// as an UploadPart request and return the part's etag via *petag
+// (required later by CompleteMultipartUpload).
+static int cloud_tier_send_multipart_part(RGWLCCloudTierCtx& tier_ctx,
+ const std::string& upload_id,
+ const rgw_lc_multipart_part_info& part_info,
+ std::string *petag) {
+ int ret;
+
+ rgw_lc_obj_properties obj_properties(tier_ctx.o.meta.mtime, tier_ctx.o.meta.etag,
+ tier_ctx.o.versioned_epoch, tier_ctx.acl_mappings,
+ tier_ctx.target_storage_class);
+ std::string target_obj_name;
+ off_t end;
+
+ rgw_bucket dest_bucket;
+ dest_bucket.name = tier_ctx.target_bucket_name;
+
+ target_obj_name = tier_ctx.bucket_info.bucket.name + "/" +
+ tier_ctx.obj->get_name();
+ if (!tier_ctx.o.is_current()) {
+ target_obj_name += get_key_instance(tier_ctx.obj->get_key());
+ }
+
+ rgw_obj dest_obj(dest_bucket, rgw_obj_key(target_obj_name));
+
+ tier_ctx.obj->set_atomic();
+
+ /* TODO: Define readf, writef as stack variables. For some reason,
+ * when used as stack variables (esp., readf), the transition seems to
+ * be taking lot of time eventually erroring out at times. */
+ std::shared_ptr<RGWLCStreamRead> readf;
+ readf.reset(new RGWLCStreamRead(tier_ctx.cct, tier_ctx.dpp,
+ tier_ctx.obj, tier_ctx.o.meta.mtime));
+
+ std::shared_ptr<RGWLCCloudStreamPut> writef;
+ writef.reset(new RGWLCCloudStreamPut(tier_ctx.dpp, obj_properties, tier_ctx.conn,
+ dest_obj));
+
+ /* Prepare Read from source */
+ end = part_info.ofs + part_info.size - 1;
+ readf->set_multipart(part_info.size, part_info.ofs, end);
+
+ /* Prepare write */
+ writef->set_multipart(upload_id, part_info.part_num, part_info.size);
+
+ /* actual Read & Write */
+ ret = cloud_tier_transfer_object(tier_ctx.dpp, readf.get(), writef.get());
+ if (ret < 0) {
+ return ret;
+ }
+
+ // a missing etag means the part can never be completed — treat as I/O error
+ if (!(writef->get_etag(petag))) {
+ ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to get etag from PUT request" << dendl;
+ return -EIO;
+ }
+
+ return 0;
+}
+
+// Issue an S3 AbortMultipartUpload (DELETE with uploadId) against the
+// cloud endpoint for the given destination object.
+static int cloud_tier_abort_multipart(const DoutPrefixProvider *dpp,
+ RGWRESTConn& dest_conn, const rgw_obj& dest_obj,
+ const std::string& upload_id) {
+ int ret;
+ bufferlist out_bl;
+ bufferlist bl;
+ rgw_http_param_pair params[] = { { "uploadId", upload_id.c_str() }, {nullptr, nullptr} };
+
+ string resource = obj_to_aws_path(dest_obj);
+ ret = dest_conn.send_resource(dpp, "DELETE", resource, params, nullptr,
+ out_bl, &bl, nullptr, null_yield);
+
+
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to abort multipart upload for dest object=" << dest_obj << " (ret=" << ret << ")" << dendl;
+ return ret;
+ }
+
+ return 0;
+}
+
+// Issue an S3 InitiateMultipartUpload (POST ?uploads) with the prepared
+// attrs and parse the UploadId out of the XML response.
+static int cloud_tier_init_multipart(const DoutPrefixProvider *dpp,
+ RGWRESTConn& dest_conn, const rgw_obj& dest_obj,
+ uint64_t obj_size, std::map<std::string, std::string>& attrs,
+ std::string& upload_id) {
+ bufferlist out_bl;
+ bufferlist bl;
+
+ // shape of the InitiateMultipartUploadResult response body
+ struct InitMultipartResult {
+ std::string bucket;
+ std::string key;
+ std::string upload_id;
+
+ void decode_xml(XMLObj *obj) {
+ RGWXMLDecoder::decode_xml("Bucket", bucket, obj);
+ RGWXMLDecoder::decode_xml("Key", key, obj);
+ RGWXMLDecoder::decode_xml("UploadId", upload_id, obj);
+ }
+ } result;
+
+ int ret;
+ rgw_http_param_pair params[] = { { "uploads", nullptr }, {nullptr, nullptr} };
+
+ string resource = obj_to_aws_path(dest_obj);
+
+ ret = dest_conn.send_resource(dpp, "POST", resource, params, &attrs,
+ out_bl, &bl, nullptr, null_yield);
+
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to initialize multipart upload for dest object=" << dest_obj << dendl;
+ return ret;
+ }
+ /*
+ * If one of the following fails we cannot abort upload, as we cannot
+ * extract the upload id. If one of these fail it's very likely that that's
+ * the least of our problem.
+ */
+ RGWXMLDecoder::XMLParser parser;
+ if (!parser.init()) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to initialize xml parser for parsing multipart init response from server" << dendl;
+ return -EIO;
+ }
+
+ if (!parser.parse(out_bl.c_str(), out_bl.length(), 1)) {
+ string str(out_bl.c_str(), out_bl.length());
+ ldpp_dout(dpp, 5) << "ERROR: failed to parse xml initmultipart: " << str << dendl;
+ return -EIO;
+ }
+
+ try {
+ RGWXMLDecoder::decode_xml("InitiateMultipartUploadResult", result, &parser, true);
+ } catch (RGWXMLDecoder::err& err) {
+ string str(out_bl.c_str(), out_bl.length());
+ ldpp_dout(dpp, 5) << "ERROR: unexpected xml: " << str << dendl;
+ return -EIO;
+ }
+
+ ldpp_dout(dpp, 20) << "init multipart result: bucket=" << result.bucket << " key=" << result.key << " upload_id=" << result.upload_id << dendl;
+
+ upload_id = result.upload_id;
+
+ return 0;
+}
+
+// Issue an S3 CompleteMultipartUpload: POST an XML body listing every
+// (PartNumber, ETag) pair, then parse and log the response.
+static int cloud_tier_complete_multipart(const DoutPrefixProvider *dpp,
+ RGWRESTConn& dest_conn, const rgw_obj& dest_obj,
+ std::string& upload_id,
+ const std::map<int, rgw_lc_multipart_part_info>& parts) {
+ rgw_http_param_pair params[] = { { "uploadId", upload_id.c_str() }, {nullptr, nullptr} };
+
+ stringstream ss;
+ XMLFormatter formatter;
+ int ret;
+
+ bufferlist bl, out_bl;
+ string resource = obj_to_aws_path(dest_obj);
+
+ // request body: <CompleteMultipartUpload><Part>...</Part>...</...>
+ struct CompleteMultipartReq {
+ std::map<int, rgw_lc_multipart_part_info> parts;
+
+ explicit CompleteMultipartReq(const std::map<int, rgw_lc_multipart_part_info>& _parts) : parts(_parts) {}
+
+ void dump_xml(Formatter *f) const {
+ for (const auto& p : parts) {
+ f->open_object_section("Part");
+ encode_xml("PartNumber", p.first, f);
+ encode_xml("ETag", p.second.etag, f);
+ f->close_section();
+ };
+ }
+ } req_enc(parts);
+
+ struct CompleteMultipartResult {
+ std::string location;
+ std::string bucket;
+ std::string key;
+ std::string etag;
+
+ void decode_xml(XMLObj *obj) {
+ // decode Location into its own field (was mistakenly decoded into
+ // `bucket`, leaving `location` always empty in the log below)
+ RGWXMLDecoder::decode_xml("Location", location, obj);
+ RGWXMLDecoder::decode_xml("Bucket", bucket, obj);
+ RGWXMLDecoder::decode_xml("Key", key, obj);
+ RGWXMLDecoder::decode_xml("ETag", etag, obj);
+ }
+ } result;
+
+ encode_xml("CompleteMultipartUpload", req_enc, &formatter);
+
+ formatter.flush(ss);
+ bl.append(ss.str());
+
+ ret = dest_conn.send_resource(dpp, "POST", resource, params, nullptr,
+ out_bl, &bl, nullptr, null_yield);
+
+
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to complete multipart upload for dest object=" << dest_obj << dendl;
+ return ret;
+ }
+ /*
+ * If one of the following fails we cannot abort upload, as we cannot
+ * extract the upload id. If one of these fail it's very likely that that's
+ * the least of our problem.
+ */
+ RGWXMLDecoder::XMLParser parser;
+ if (!parser.init()) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to initialize xml parser for parsing multipart init response from server" << dendl;
+ return -EIO;
+ }
+
+ if (!parser.parse(out_bl.c_str(), out_bl.length(), 1)) {
+ string str(out_bl.c_str(), out_bl.length());
+ ldpp_dout(dpp, 5) << "ERROR: failed to parse xml Completemultipart: " << str << dendl;
+ return -EIO;
+ }
+
+ try {
+ RGWXMLDecoder::decode_xml("CompleteMultipartUploadResult", result, &parser, true);
+ } catch (RGWXMLDecoder::err& err) {
+ string str(out_bl.c_str(), out_bl.length());
+ ldpp_dout(dpp, 5) << "ERROR: unexpected xml: " << str << dendl;
+ return -EIO;
+ }
+
+ ldpp_dout(dpp, 20) << "complete multipart result: location=" << result.location << " bucket=" << result.bucket << " key=" << result.key << " etag=" << result.etag << dendl;
+
+ return ret;
+}
+
+/*
+ * Best-effort cleanup of a failed/stale multipart transition: ask the
+ * remote endpoint to drop the pending upload, then remove the local
+ * status object. Neither failure is fatal, so errors are only logged.
+ * Always returns 0.
+ */
+static int cloud_tier_abort_multipart_upload(RGWLCCloudTierCtx& tier_ctx,
+                                             const rgw_obj& dest_obj, const rgw_raw_obj& status_obj,
+                                             const std::string& upload_id) {
+  int r = cloud_tier_abort_multipart(tier_ctx.dpp, tier_ctx.conn, dest_obj, upload_id);
+  if (r < 0) {
+    /* ignore error, best effort */
+    ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to abort multipart upload dest obj=" << dest_obj << " upload_id=" << upload_id << " ret=" << r << dendl;
+  }
+
+  /* remove status obj */
+  r = delete_upload_status(tier_ctx.dpp, tier_ctx.driver, &status_obj);
+  if (r < 0) {
+    // ignore error, best effort
+    ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to remove sync status obj obj=" << status_obj << " ret=" << r << dendl;
+  }
+  return 0;
+}
+
+/*
+ * Transition a large object to the cloud endpoint via multipart upload.
+ *
+ * A status object ("lc_multipart_<oid>") in the log pool records the
+ * upload id plus the source mtime/size/etag so an interrupted transition
+ * can be detected; if the source changed since that state was written,
+ * the stale remote upload is aborted and the transfer restarted.
+ *
+ * Returns 0 on success, a negative error code on failure.
+ */
+static int cloud_tier_multipart_transfer(RGWLCCloudTierCtx& tier_ctx) {
+  rgw_obj src_obj;
+  rgw_obj dest_obj;
+
+  uint64_t obj_size;
+  std::string src_etag;
+  rgw_rest_obj rest_obj;
+
+  rgw_lc_multipart_upload_info status;
+
+  std::map<std::string, std::string> new_attrs;
+
+  rgw_raw_obj status_obj;
+
+  RGWBucketInfo b;
+  std::string target_obj_name;
+  rgw_bucket target_bucket;
+
+  int ret;
+
+  rgw_lc_obj_properties obj_properties(tier_ctx.o.meta.mtime, tier_ctx.o.meta.etag,
+                                       tier_ctx.o.versioned_epoch, tier_ctx.acl_mappings,
+                                       tier_ctx.target_storage_class);
+
+  uint32_t part_size{0};
+  uint32_t num_parts{0};
+
+  int cur_part{0};
+  uint64_t cur_ofs{0};
+  std::map<int, rgw_lc_multipart_part_info> parts;
+
+  obj_size = tier_ctx.o.meta.size;
+
+  target_bucket.name = tier_ctx.target_bucket_name;
+
+  // Target key is "<src bucket>/<src object>[<version instance>]".
+  target_obj_name = tier_ctx.bucket_info.bucket.name + "/" +
+                    tier_ctx.obj->get_name();
+  if (!tier_ctx.o.is_current()) {
+    target_obj_name += get_key_instance(tier_ctx.obj->get_key());
+  }
+  dest_obj.init(target_bucket, target_obj_name);
+
+  rgw_pool pool = static_cast<rgw::sal::RadosStore*>(tier_ctx.driver)->svc()->zone->get_zone_params().log_pool;
+  status_obj = rgw_raw_obj(pool, "lc_multipart_" + tier_ctx.obj->get_oid());
+
+  ret = read_upload_status(tier_ctx.dpp, tier_ctx.driver, &status_obj, &status);
+
+  if (ret < 0 && ret != -ENOENT) {
+    ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to read sync status of object " << src_obj << " ret=" << ret << dendl;
+    return ret;
+  }
+
+  if (ret >= 0) {
+    // A previous attempt exists; restart from scratch if the source
+    // changed (mtime/size/etag mismatch) since the status was written.
+    if (status.mtime != obj_properties.mtime || status.obj_size != obj_size ||
+        status.etag != obj_properties.etag) {
+      cloud_tier_abort_multipart_upload(tier_ctx, dest_obj, status_obj, status.upload_id);
+      ret = -ENOENT;
+    }
+    // NOTE(review): when the recorded state still matches, num_parts stays
+    // 0 so no parts are (re)sent before CompleteMultipartUpload below —
+    // confirm that resuming an unchanged upload is handled as intended.
+  }
+
+  if (ret == -ENOENT) {
+    // Fresh upload: initiate it remotely and persist the status object.
+    RGWLCStreamRead readf(tier_ctx.cct, tier_ctx.dpp, tier_ctx.obj, tier_ctx.o.meta.mtime);
+
+    readf.init();
+
+    rest_obj = readf.get_rest_obj();
+
+    RGWLCCloudStreamPut::init_send_attrs(tier_ctx.dpp, rest_obj, obj_properties, new_attrs);
+
+    ret = cloud_tier_init_multipart(tier_ctx.dpp, tier_ctx.conn, dest_obj, obj_size, new_attrs, status.upload_id);
+    if (ret < 0) {
+      return ret;
+    }
+
+    status.obj_size = obj_size;
+    status.mtime = obj_properties.mtime;
+    status.etag = obj_properties.etag;
+
+    ret = put_upload_status(tier_ctx.dpp, tier_ctx.driver, &status_obj, &status);
+
+    if (ret < 0) {
+      ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to store multipart upload state, ret=" << ret << dendl;
+      // continue with upload anyway
+    }
+
+// S3 caps a multipart upload at 10000 parts; derive a part size large
+// enough that the whole object fits within that limit.
+// (FIX: this macro was previously defined twice on consecutive lines.)
+#define MULTIPART_MAX_PARTS 10000
+    uint64_t min_part_size = obj_size / MULTIPART_MAX_PARTS;
+    uint64_t min_conf_size = tier_ctx.multipart_min_part_size;
+
+    if (min_conf_size < MULTIPART_MIN_POSSIBLE_PART_SIZE) {
+      min_conf_size = MULTIPART_MIN_POSSIBLE_PART_SIZE;
+    }
+
+    part_size = std::max(min_conf_size, min_part_size);
+    num_parts = (obj_size + part_size - 1) / part_size;
+    cur_part = 1;
+    cur_ofs = 0;
+  }
+
+  // Stream each part; the collected etags are needed for the complete call.
+  for (; (uint32_t)cur_part <= num_parts; ++cur_part) {
+    ldpp_dout(tier_ctx.dpp, 20) << "cur_part = "<< cur_part << ", info.ofs = " << cur_ofs << ", info.size = " << part_size << ", obj size = " << obj_size<< ", num_parts:" << num_parts << dendl;
+    rgw_lc_multipart_part_info& cur_part_info = parts[cur_part];
+    cur_part_info.part_num = cur_part;
+    cur_part_info.ofs = cur_ofs;
+    // Last part may be shorter than part_size.
+    cur_part_info.size = std::min((uint64_t)part_size, obj_size - cur_ofs);
+
+    cur_ofs += cur_part_info.size;
+
+    ret = cloud_tier_send_multipart_part(tier_ctx,
+                                         status.upload_id,
+                                         cur_part_info,
+                                         &cur_part_info.etag);
+
+    if (ret < 0) {
+      ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to send multipart part of obj=" << tier_ctx.obj << ", sync via multipart upload, upload_id=" << status.upload_id << " part number " << cur_part << " (error: " << cpp_strerror(-ret) << ")" << dendl;
+      cloud_tier_abort_multipart_upload(tier_ctx, dest_obj, status_obj, status.upload_id);
+      return ret;
+    }
+
+  }
+
+  ret = cloud_tier_complete_multipart(tier_ctx.dpp, tier_ctx.conn, dest_obj, status.upload_id, parts);
+  if (ret < 0) {
+    ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to complete multipart upload of obj=" << tier_ctx.obj << " (error: " << cpp_strerror(-ret) << ")" << dendl;
+    cloud_tier_abort_multipart_upload(tier_ctx, dest_obj, status_obj, status.upload_id);
+    return ret;
+  }
+
+  /* remove status obj */
+  ret = delete_upload_status(tier_ctx.dpp, tier_ctx.driver, &status_obj);
+  if (ret < 0) {
+    // FIX: the old message wrongly claimed an abort failure here.
+    ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to remove sync status obj=" << status_obj << " upload_id=" << status.upload_id << " (" << cpp_strerror(-ret) << ")" << dendl;
+    // ignore error, best effort
+  }
+  return 0;
+}
+
+/* Check if object has already been transitioned */
+static int cloud_tier_check_object(RGWLCCloudTierCtx& tier_ctx, bool& already_tiered) {
+  std::map<std::string, std::string> headers;
+
+  /* HEAD the object on the cloud endpoint; its headers are compared
+   * against the local mtime to decide whether a transfer is needed. */
+  int r = cloud_tier_get_object(tier_ctx, true, headers);
+  if (r < 0) {
+    ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to fetch HEAD from cloud for obj=" << tier_ctx.obj << " , ret = " << r << dendl;
+    return r;
+  }
+
+  already_tiered = is_already_tiered(tier_ctx.dpp, headers, tier_ctx.o.meta.mtime);
+
+  if (already_tiered) {
+    ldpp_dout(tier_ctx.dpp, 20) << "is_already_tiered true" << dendl;
+    return r;
+  }
+  ldpp_dout(tier_ctx.dpp, 20) << "is_already_tiered false..going with out_crf writing" << dendl;
+  return r;
+}
+
+/*
+ * Create the target bucket on the remote cloud endpoint (PUT <bucket>).
+ *
+ * "BucketAlreadyOwnedByYou"/"BucketAlreadyExists" error responses are
+ * treated as success so repeated or concurrent transitions are harmless.
+ * Returns 0 on success (including the benign errors above), -EIO on any
+ * other remote error or unparseable response.
+ */
+static int cloud_tier_create_bucket(RGWLCCloudTierCtx& tier_ctx) {
+  bufferlist out_bl;
+  int ret = 0;
+  pair<string, string> key(tier_ctx.storage_class, tier_ctx.target_bucket_name);
+  // Expected shape of an S3 <Error> response body.
+  struct CreateBucketResult {
+    std::string code;
+
+    void decode_xml(XMLObj *obj) {
+      RGWXMLDecoder::decode_xml("Code", code, obj);
+    }
+  } result;
+
+  ldpp_dout(tier_ctx.dpp, 30) << "Cloud_tier_ctx: creating bucket:" << tier_ctx.target_bucket_name << dendl;
+  bufferlist bl;
+  string resource = tier_ctx.target_bucket_name;
+
+  ret = tier_ctx.conn.send_resource(tier_ctx.dpp, "PUT", resource, nullptr, nullptr,
+                                    out_bl, &bl, nullptr, null_yield);
+
+  // Deliberately no early return on ret < 0: the response body (if any)
+  // is parsed below to distinguish benign "already exists" errors.
+  if (ret < 0 ) {
+    ldpp_dout(tier_ctx.dpp, 0) << "create target bucket : " << tier_ctx.target_bucket_name << " returned ret:" << ret << dendl;
+  }
+  if (out_bl.length() > 0) {
+    RGWXMLDecoder::XMLParser parser;
+    if (!parser.init()) {
+      ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to initialize xml parser for parsing create_bucket response from server" << dendl;
+      return -EIO;
+    }
+
+    if (!parser.parse(out_bl.c_str(), out_bl.length(), 1)) {
+      string str(out_bl.c_str(), out_bl.length());
+      ldpp_dout(tier_ctx.dpp, 5) << "ERROR: failed to parse xml createbucket: " << str << dendl;
+      return -EIO;
+    }
+
+    try {
+      RGWXMLDecoder::decode_xml("Error", result, &parser, true);
+    } catch (RGWXMLDecoder::err& err) {
+      string str(out_bl.c_str(), out_bl.length());
+      ldpp_dout(tier_ctx.dpp, 5) << "ERROR: unexpected xml: " << str << dendl;
+      return -EIO;
+    }
+
+    // Only "already exists/owned" codes are tolerated; anything else fails.
+    if (result.code != "BucketAlreadyOwnedByYou" && result.code != "BucketAlreadyExists") {
+      ldpp_dout(tier_ctx.dpp, 0) << "ERROR: Creating target bucket failed with error: " << result.code << dendl;
+      return -EIO;
+    }
+  }
+
+  return 0;
+}
+
+/*
+ * Transition one object to the configured cloud endpoint. Creates the
+ * target bucket the first time a target path is seen (tracked in
+ * cloud_targets), skips objects that are already tiered, then performs a
+ * plain or multipart transfer depending on object size.
+ */
+int rgw_cloud_tier_transfer_object(RGWLCCloudTierCtx& tier_ctx, std::set<std::string>& cloud_targets) {
+  int r = 0;
+
+  // check if target_path is already created
+  tier_ctx.target_bucket_created =
+    (cloud_targets.find(tier_ctx.target_bucket_name) != cloud_targets.end());
+
+  /* If run first time attempt to create the target bucket */
+  if (!tier_ctx.target_bucket_created) {
+    r = cloud_tier_create_bucket(tier_ctx);
+    if (r < 0) {
+      ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to create target bucket on the cloud endpoint ret=" << r << dendl;
+      return r;
+    }
+    tier_ctx.target_bucket_created = true;
+    cloud_targets.insert(tier_ctx.target_bucket_name);
+  }
+
+  /* Since multiple zones may try to transition the same object to the cloud,
+   * verify if the object is already transitioned. And since its just a best
+   * effort, do not bail out in case of any errors.
+   */
+  bool already_tiered = false;
+  r = cloud_tier_check_object(tier_ctx, already_tiered);
+  if (r < 0) {
+    ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to check object on the cloud endpoint ret=" << r << dendl;
+  }
+  if (already_tiered) {
+    ldpp_dout(tier_ctx.dpp, 20) << "Object (" << tier_ctx.o.key << ") is already tiered" << dendl;
+    return 0;
+  }
+
+  // Pick plain vs. multipart transfer by size; the threshold is clamped
+  // to the minimum part size S3 permits.
+  uint64_t threshold = tier_ctx.multipart_sync_threshold;
+  if (threshold < MULTIPART_MIN_POSSIBLE_PART_SIZE) {
+    threshold = MULTIPART_MIN_POSSIBLE_PART_SIZE;
+  }
+
+  if (tier_ctx.o.meta.size < threshold) {
+    r = cloud_tier_plain_transfer(tier_ctx);
+  } else {
+    tier_ctx.is_multipart_upload = true;
+    r = cloud_tier_multipart_transfer(tier_ctx);
+  }
+
+  if (r < 0) {
+    ldpp_dout(tier_ctx.dpp, 0) << "ERROR: failed to transition object ret=" << r << dendl;
+  }
+
+  return r;
+}
diff --git a/src/rgw/driver/rados/rgw_lc_tier.h b/src/rgw/driver/rados/rgw_lc_tier.h
new file mode 100644
index 000000000..729c4c304
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_lc_tier.h
@@ -0,0 +1,51 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_lc.h"
+#include "rgw_rest_conn.h"
+#include "rgw_rados.h"
+#include "rgw_zone.h"
+#include "rgw_sal_rados.h"
+#include "rgw_cr_rest.h"
+
+#define DEFAULT_MULTIPART_SYNC_PART_SIZE (32 * 1024 * 1024)
+#define MULTIPART_MIN_POSSIBLE_PART_SIZE (5 * 1024 * 1024)
+
+/* Context for transitioning one object to a cloud endpoint; bundles the
+ * source object/bucket state, the remote connection/target, and the
+ * multipart tuning knobs consumed by rgw_cloud_tier_transfer_object(). */
+struct RGWLCCloudTierCtx {
+  CephContext *cct;
+  const DoutPrefixProvider *dpp;
+
+  /* Source */
+  rgw_bucket_dir_entry& o;          // bucket-index entry being transitioned
+  rgw::sal::Driver *driver;
+  RGWBucketInfo& bucket_info;
+  std::string storage_class;        // source storage class
+
+  rgw::sal::Object *obj;
+
+  /* Remote */
+  RGWRESTConn& conn;                // connection to the cloud endpoint
+  std::string target_bucket_name;
+  std::string target_storage_class;
+
+  std::map<std::string, RGWTierACLMapping> acl_mappings;
+  uint64_t multipart_min_part_size; // lower bound for each part
+  uint64_t multipart_sync_threshold; // objects >= this go multipart
+
+  bool is_multipart_upload{false};  // set once multipart path is chosen
+  bool target_bucket_created{true}; // false triggers remote bucket create
+
+  RGWLCCloudTierCtx(CephContext* _cct, const DoutPrefixProvider *_dpp,
+                    rgw_bucket_dir_entry& _o, rgw::sal::Driver *_driver,
+                    RGWBucketInfo &_binfo, rgw::sal::Object *_obj,
+                    RGWRESTConn& _conn, std::string& _bucket,
+                    std::string& _storage_class) :
+    cct(_cct), dpp(_dpp), o(_o), driver(_driver), bucket_info(_binfo),
+    obj(_obj), conn(_conn), target_bucket_name(_bucket),
+    target_storage_class(_storage_class) {}
+};
+
+/* Transition object to cloud endpoint */
+int rgw_cloud_tier_transfer_object(RGWLCCloudTierCtx& tier_ctx, std::set<std::string>& cloud_targets);
diff --git a/src/rgw/driver/rados/rgw_log_backing.cc b/src/rgw/driver/rados/rgw_log_backing.cc
new file mode 100644
index 000000000..7c9dafe7e
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_log_backing.cc
@@ -0,0 +1,708 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "cls/log/cls_log_client.h"
+#include "cls/version/cls_version_client.h"
+
+#include "rgw_log_backing.h"
+#include "rgw_tools.h"
+#include "cls_fifo_legacy.h"
+
+using namespace std::chrono_literals;
+namespace cb = ceph::buffer;
+
+static constexpr auto dout_subsys = ceph_subsys_rgw;
+
+enum class shard_check { dne, omap, fifo, corrupt };
+// Render the enumerator name; unknown values are printed numerically.
+inline std::ostream& operator <<(std::ostream& m, const shard_check& t) {
+  if (t == shard_check::dne) {
+    return m << "shard_check::dne";
+  }
+  if (t == shard_check::omap) {
+    return m << "shard_check::omap";
+  }
+  if (t == shard_check::fifo) {
+    return m << "shard_check::fifo";
+  }
+  if (t == shard_check::corrupt) {
+    return m << "shard_check::corrupt";
+  }
+  return m << "shard_check::UNKNOWN=" << static_cast<uint32_t>(t);
+}
+
+namespace {
+/// Return the shard type, and a bool to see whether it has entries.
+/// Probe a single log shard object and classify its backing.
+/// Result mapping from the FIFO probe-only open:
+///   0        -> FIFO-backed
+///   -ENODATA -> object exists but is empty: treat as OMAP
+///   -ENOENT  -> shard does not exist
+///   -EPERM   -> OSD lacks FIFO support; fifo_unsupported is set so all
+///               later shards skip the probe and default to OMAP
+///   other    -> corrupt (caller aborts)
+shard_check
+probe_shard(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid,
+            bool& fifo_unsupported, optional_yield y)
+{
+  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                     << " probing oid=" << oid
+                     << dendl;
+  if (!fifo_unsupported) {
+    std::unique_ptr<rgw::cls::fifo::FIFO> fifo;
+    // probe=true: open for inspection only, do not create.
+    auto r = rgw::cls::fifo::FIFO::open(dpp, ioctx, oid,
+                                        &fifo, y,
+                                        std::nullopt, true);
+    switch (r) {
+    case 0:
+      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                         << ": oid=" << oid << " is FIFO"
+                         << dendl;
+      return shard_check::fifo;
+
+    case -ENODATA:
+      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                         << ": oid=" << oid << " is empty and therefore OMAP"
+                         << dendl;
+      return shard_check::omap;
+
+    case -ENOENT:
+      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                         << ": oid=" << oid << " does not exist"
+                         << dendl;
+      return shard_check::dne;
+
+    case -EPERM:
+      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                         << ": FIFO is unsupported, marking."
+                         << dendl;
+      fifo_unsupported = true;
+      return shard_check::omap;
+
+    default:
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                         << ": error probing: r=" << r
+                         << ", oid=" << oid << dendl;
+      return shard_check::corrupt;
+    }
+  } else {
+    // Since FIFO is unsupported, OMAP is the only alternative
+    return shard_check::omap;
+  }
+}
+
+/// Decide the backing type when no shard exists yet. If the default is
+/// FIFO and the OSD supports it, eagerly create the FIFO for shard 0
+/// (oid); otherwise fall back to / return OMAP.
+tl::expected<log_type, bs::error_code>
+handle_dne(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx,
+           log_type def,
+           std::string oid,
+           bool fifo_unsupported,
+           optional_yield y)
+{
+  if (def == log_type::fifo) {
+    if (fifo_unsupported) {
+      // Requested default cannot be honored on this cluster.
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                         << " WARNING: FIFO set as default but not supported by OSD. "
+                         << "Falling back to OMAP." << dendl;
+      return log_type::omap;
+    }
+    std::unique_ptr<rgw::cls::fifo::FIFO> fifo;
+    auto r = rgw::cls::fifo::FIFO::create(dpp, ioctx, oid,
+                                          &fifo, y,
+                                          std::nullopt);
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                         << " error creating FIFO: r=" << r
+                         << ", oid=" << oid << dendl;
+      return tl::unexpected(bs::error_code(-r, bs::system_category()));
+    }
+  }
+  return def;
+}
+}
+
+/// Probe every shard of a log and report the single backing type in use.
+/// All existing shards must agree (a mix is an EIO-level inconsistency);
+/// if no shard exists, defer to handle_dne() which may create shard 0
+/// with the default type.
+tl::expected<log_type, bs::error_code>
+log_backing_type(const DoutPrefixProvider *dpp,
+                 librados::IoCtx& ioctx,
+                 log_type def,
+                 int shards,
+                 const fu2::unique_function<std::string(int) const>& get_oid,
+                 optional_yield y)
+{
+  auto check = shard_check::dne;
+  bool fifo_unsupported = false;
+  for (int i = 0; i < shards; ++i) {
+    auto c = probe_shard(dpp, ioctx, get_oid(i), fifo_unsupported, y);
+    if (c == shard_check::corrupt)
+      return tl::unexpected(bs::error_code(EIO, bs::system_category()));
+    if (c == shard_check::dne) continue;
+    // First existing shard determines the expected type.
+    if (check == shard_check::dne) {
+      check = c;
+      continue;
+    }
+
+    // Any later shard disagreeing with the first is an inconsistency.
+    if (check != c) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                         << " clashing types: check=" << check
+                         << ", c=" << c << dendl;
+      return tl::unexpected(bs::error_code(EIO, bs::system_category()));
+    }
+  }
+  // Defensive: corrupt shards always return above.
+  if (check == shard_check::corrupt) {
+    ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                       << " should be unreachable!" << dendl;
+    return tl::unexpected(bs::error_code(EIO, bs::system_category()));
+  }
+
+  if (check == shard_check::dne)
+    return handle_dne(dpp, ioctx,
+                      def,
+                      get_oid(0),
+                      fifo_unsupported,
+                      y);
+
+  return (check == shard_check::fifo ? log_type::fifo : log_type::omap);
+}
+
+/// Remove every shard of a log, FIFO parts included. Best-effort: keeps
+/// going after individual failures and returns the first error code seen
+/// (default-constructed on full success). With leave_zero, shard 0 is
+/// truncated/cleared instead of removed so cls_lock xattr state survives.
+bs::error_code log_remove(const DoutPrefixProvider *dpp,
+                          librados::IoCtx& ioctx,
+                          int shards,
+                          const fu2::unique_function<std::string(int) const>& get_oid,
+                          bool leave_zero,
+                          optional_yield y)
+{
+  bs::error_code ec;
+  for (int i = 0; i < shards; ++i) {
+    auto oid = get_oid(i);
+    rados::cls::fifo::info info;
+    uint32_t part_header_size = 0, part_entry_overhead = 0;
+
+    // If the shard is a FIFO, fetch its metadata so the part objects can
+    // be removed before the head object itself.
+    auto r = rgw::cls::fifo::get_meta(dpp, ioctx, oid, std::nullopt, &info,
+                                      &part_header_size, &part_entry_overhead,
+                                      0, y, true);
+    if (r == -ENOENT) continue;
+    if (r == 0 && info.head_part_num > -1) {
+      for (auto j = info.tail_part_num; j <= info.head_part_num; ++j) {
+        librados::ObjectWriteOperation op;
+        op.remove();
+        auto part_oid = info.part_oid(j);
+        auto subr = rgw_rados_operate(dpp, ioctx, part_oid, &op, null_yield);
+        if (subr < 0 && subr != -ENOENT) {
+          // Remember only the first error; keep cleaning up.
+          if (!ec)
+            ec = bs::error_code(-subr, bs::system_category());
+          ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                             << ": failed removing FIFO part: part_oid=" << part_oid
+                             << ", subr=" << subr << dendl;
+        }
+      }
+    }
+    // -ENODATA just means the shard was OMAP, not FIFO; not an error.
+    if (r < 0 && r != -ENODATA) {
+      if (!ec)
+        ec = bs::error_code(-r, bs::system_category());
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                         << ": failed checking FIFO part: oid=" << oid
+                         << ", r=" << r << dendl;
+    }
+    librados::ObjectWriteOperation op;
+    if (i == 0 && leave_zero) {
+      // Leave shard 0 in existence, but remove contents and
+      // omap. cls_lock stores things in the xattrs. And sync needs to
+      // rendezvous with locks on generation 0 shard 0.
+      op.omap_set_header({});
+      op.omap_clear();
+      op.truncate(0);
+    } else {
+      op.remove();
+    }
+    r = rgw_rados_operate(dpp, ioctx, oid, &op, null_yield);
+    if (r < 0 && r != -ENOENT) {
+      if (!ec)
+        ec = bs::error_code(-r, bs::system_category());
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                         << ": failed removing shard: oid=" << oid
+                         << ", r=" << r << dendl;
+    }
+  }
+  return ec;
+}
+
+// Tear down the notify watch, if one was established; a failure here can
+// only be logged since a destructor has no way to report it.
+logback_generations::~logback_generations() {
+  if (watchcookie == 0) {
+    return;
+  }
+  auto r = ioctx.unwatch2(watchcookie);
+  if (r >= 0) {
+    return;
+  }
+  auto cct = static_cast<CephContext*>(ioctx.cct());
+  lderr(cct) << __PRETTY_FUNCTION__ << ":" << __LINE__
+             << ": failed unwatching oid=" << oid
+             << ", r=" << r << dendl;
+}
+
+/*
+ * Load (or bootstrap) the generations metadata object.
+ *
+ * Reads the metadata; if it does not exist yet, probes the shards to pick
+ * a backing type, creates generation 0 exclusively (re-reading on -EEXIST
+ * to handle a racing creator), establishes the watch, and hands all
+ * non-empty generations to handle_init().
+ */
+bs::error_code logback_generations::setup(const DoutPrefixProvider *dpp,
+                                          log_type def,
+                                          optional_yield y) noexcept
+{
+  try {
+    // First, read.
+    auto cct = static_cast<CephContext*>(ioctx.cct());
+    auto res = read(dpp, y);
+    if (!res && res.error() != bs::errc::no_such_file_or_directory) {
+      return res.error();
+    }
+    if (res) {
+      std::unique_lock lock(m);
+      std::tie(entries_, version) = std::move(*res);
+    } else {
+      // Are we the first? Then create generation 0 and the generations
+      // metadata.
+      librados::ObjectWriteOperation op;
+      auto type = log_backing_type(dpp, ioctx, def, shards,
+                                   [this](int shard) {
+                                     return this->get_oid(0, shard);
+                                   }, y);
+      if (!type)
+        return type.error();
+
+      logback_generation l;
+      l.type = *type;
+
+      std::unique_lock lock(m);
+      version.ver = 1;
+      static constexpr auto TAG_LEN = 24;
+      version.tag.clear();
+      append_rand_alpha(cct, version.tag, version.tag, TAG_LEN);
+      op.create(true);  // exclusive create: -EEXIST if someone raced us
+      cls_version_set(op, version);
+      cb::list bl;
+      entries_.emplace(0, std::move(l));
+      encode(entries_, bl);
+      lock.unlock();
+
+      op.write_full(bl);
+      auto r = rgw_rados_operate(dpp, ioctx, oid, &op, y);
+      if (r < 0 && r != -EEXIST) {
+        ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                           << ": failed writing oid=" << oid
+                           << ", r=" << r << dendl;
+        // FIX: previously a bs::system_error temporary was constructed
+        // here and discarded; propagate the failure to the caller.
+        return bs::error_code(-r, bs::system_category());
+      }
+      // Did someone race us? Then re-read.
+      if (r != 0) {
+        res = read(dpp, y);
+        if (!res)
+          return res.error();
+        if (res->first.empty())
+          return bs::error_code(EIO, bs::system_category());
+        auto l = res->first.begin()->second;
+        // In the unlikely event that someone raced us, created
+        // generation zero, incremented, then erased generation zero,
+        // don't leave generation zero lying around.
+        if (l.gen_id != 0) {
+          auto ec = log_remove(dpp, ioctx, shards,
+                               [this](int shard) {
+                                 return this->get_oid(0, shard);
+                               }, true, y);
+          if (ec) return ec;
+        }
+        std::unique_lock lock(m);
+        std::tie(entries_, version) = std::move(*res);
+      }
+    }
+    // Pass all non-empty generations to the handler
+    std::unique_lock lock(m);
+    auto i = lowest_nomempty(entries_);
+    entries_t e;
+    std::copy(i, entries_.cend(),
+              std::inserter(e, e.end()));
+    // FIX: was m.unlock(); unlocking the raw mutex while the unique_lock
+    // still owned it made the guard's destructor unlock a second time
+    // (undefined behavior). Release through the guard instead.
+    lock.unlock();
+    auto ec = watch();
+    if (ec) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                         << ": failed to re-establish watch, unsafe to continue: oid="
+                         << oid << ", ec=" << ec.message() << dendl;
+    }
+    return handle_init(std::move(e));
+  } catch (const std::bad_alloc&) {
+    return bs::error_code(ENOMEM, bs::system_category());
+  }
+}
+
+/*
+ * Re-read the generations metadata and reconcile the in-memory copy.
+ *
+ * Sanity-checks that the on-disk map only moved "forward" (tail and head
+ * never move backwards), then notifies the subclass of any newly-emptied
+ * range (handle_empty_to) and any newly-added generations
+ * (handle_new_gens). Returns EFAULT-class errors on inconsistency.
+ */
+bs::error_code logback_generations::update(const DoutPrefixProvider *dpp, optional_yield y) noexcept
+{
+  try {
+    auto res = read(dpp, y);
+    if (!res) {
+      return res.error();
+    }
+
+    std::unique_lock l(m);
+    auto& [es, v] = *res;
+    if (v == version) {
+      // Nothing to do!
+      return {};
+    }
+
+    // Check consistency and prepare update
+    if (es.empty()) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                         << ": INCONSISTENCY! Read empty update." << dendl;
+      return bs::error_code(EFAULT, bs::system_category());
+    }
+    auto cur_lowest = lowest_nomempty(entries_);
+    // Straight up can't happen
+    assert(cur_lowest != entries_.cend());
+    auto new_lowest = lowest_nomempty(es);
+    if (new_lowest == es.cend()) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                         << ": INCONSISTENCY! Read update with no active head." << dendl;
+      return bs::error_code(EFAULT, bs::system_category());
+    }
+    if (new_lowest->first < cur_lowest->first) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                         << ": INCONSISTENCY! Tail moved wrong way." << dendl;
+      return bs::error_code(EFAULT, bs::system_category());
+    }
+
+    // If the tail advanced, remember the highest newly-pruned gen id so
+    // the subclass can be told after the maps are swapped.
+    std::optional<uint64_t> highest_empty;
+    if (new_lowest->first > cur_lowest->first && new_lowest != es.begin()) {
+      --new_lowest;
+      highest_empty = new_lowest->first;
+    }
+
+    entries_t new_entries;
+
+    if ((es.end() - 1)->first < (entries_.end() - 1)->first) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                         << ": INCONSISTENCY! Head moved wrong way." << dendl;
+      return bs::error_code(EFAULT, bs::system_category());
+    }
+
+    // Collect generations added since our last view.
+    if ((es.end() - 1)->first > (entries_.end() - 1)->first) {
+      auto ei = es.lower_bound((entries_.end() - 1)->first + 1);
+      std::copy(ei, es.end(), std::inserter(new_entries, new_entries.end()));
+    }
+
+    // Everything checks out!
+
+    version = v;
+    entries_ = es;
+    l.unlock();
+
+    // Callbacks run without the lock held.
+    if (highest_empty) {
+      auto ec = handle_empty_to(*highest_empty);
+      if (ec) return ec;
+    }
+
+    if (!new_entries.empty()) {
+      auto ec = handle_new_gens(std::move(new_entries));
+      if (ec) return ec;
+    }
+  } catch (const std::bad_alloc&) {
+    return bs::error_code(ENOMEM, bs::system_category());
+  }
+  return {};
+}
+
+/*
+ * Read the generations map and its version from the metadata object.
+ *
+ * The read is guarded with cls_version_check(VER_COND_GE) against the
+ * cached version so a stale object fails the operation rather than
+ * silently regressing. Returns {entries, version} or an error code
+ * (ENOENT if the object does not exist yet).
+ */
+auto logback_generations::read(const DoutPrefixProvider *dpp, optional_yield y) noexcept ->
+  tl::expected<std::pair<entries_t, obj_version>, bs::error_code>
+{
+  try {
+    librados::ObjectReadOperation op;
+    // Snapshot the cached version under the lock; the I/O itself runs
+    // without it.
+    std::unique_lock l(m);
+    cls_version_check(op, version, VER_COND_GE);
+    l.unlock();
+    obj_version v2;
+    cls_version_read(op, &v2);
+    cb::list bl;
+    op.read(0, 0, &bl, nullptr);  // whole-object read
+    auto r = rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, y);
+    if (r < 0) {
+      if (r == -ENOENT) {
+        ldpp_dout(dpp, 5) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                          << ": oid=" << oid
+                          << " not found" << dendl;
+      } else {
+        ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                           << ": failed reading oid=" << oid
+                           << ", r=" << r << dendl;
+      }
+      return tl::unexpected(bs::error_code(-r, bs::system_category()));
+    }
+    auto bi = bl.cbegin();
+    entries_t e;
+    try {
+      decode(e, bi);
+    } catch (const cb::error& err) {
+      return tl::unexpected(err.code());
+    }
+    return std::pair{ std::move(e), std::move(v2) };
+  } catch (const std::bad_alloc&) {
+    return tl::unexpected(bs::error_code(ENOMEM, bs::system_category()));
+  }
+}
+
+/*
+ * Persist a new generations map, guarded by the cached object version.
+ *
+ * Takes ownership of the caller's (held) lock. On success the cached
+ * entries/version are updated before the lock is released by the guard's
+ * destructor. If the version check fails (-ECANCELED), the cache is
+ * refreshed via update() and ECANCELED is returned so the caller can
+ * retry with fresh state.
+ */
+bs::error_code logback_generations::write(const DoutPrefixProvider *dpp, entries_t&& e,
+                                          std::unique_lock<std::mutex>&& l_,
+                                          optional_yield y) noexcept
+{
+  auto l = std::move(l_);
+  // The caller must hand us its held lock on our own mutex.
+  ceph_assert(l.mutex() == &m &&
+              l.owns_lock());
+  try {
+    librados::ObjectWriteOperation op;
+    cls_version_check(op, version, VER_COND_GE);
+    cb::list bl;
+    encode(e, bl);
+    op.write_full(bl);
+    cls_version_inc(op);
+    auto r = rgw_rados_operate(dpp, ioctx, oid, &op, y);
+    if (r == 0) {
+      // Success: commit the new state while still holding the lock.
+      entries_ = std::move(e);
+      version.inc();
+      return {};
+    }
+    l.unlock();
+    if (r < 0 && r != -ECANCELED) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                         << ": failed reading oid=" << oid
+                         << ", r=" << r << dendl;
+      return { -r, bs::system_category() };
+    }
+    if (r == -ECANCELED) {
+      // Version race: refresh our view, then report the cancellation.
+      auto ec = update(dpp, y);
+      if (ec) {
+        return ec;
+      } else {
+        return { ECANCELED, bs::system_category() };
+      }
+    }
+  } catch (const std::bad_alloc&) {
+    return { ENOMEM, bs::system_category() };
+  }
+  return {};
+}
+
+
+// Establish the notify watch on the metadata object; the cookie is kept
+// for unwatching in the destructor / handle_error().
+bs::error_code logback_generations::watch() noexcept {
+  try {
+    auto r = ioctx.watch2(oid, &watchcookie, this);
+    if (r >= 0) {
+      return {};
+    }
+    auto cct = static_cast<CephContext*>(ioctx.cct());
+    lderr(cct) << __PRETTY_FUNCTION__ << ":" << __LINE__
+               << ": failed to set watch oid=" << oid
+               << ", r=" << r << dendl;
+    return { -r, bs::system_category() };
+  } catch (const std::bad_alloc&) {
+    return bs::error_code(ENOMEM, bs::system_category());
+  }
+}
+
+/*
+ * Append a new generation with the given backing type.
+ *
+ * Retries the versioned write up to max_tries on -ECANCELED (write()
+ * refreshes the cache on that path), notifies other watchers, and passes
+ * the new generation(s) to handle_new_gens(). No-op if the newest
+ * generation already has the requested type.
+ */
+bs::error_code logback_generations::new_backing(const DoutPrefixProvider *dpp,
+                                                log_type type,
+                                                optional_yield y) noexcept {
+  static constexpr auto max_tries = 10;
+  try {
+    auto ec = update(dpp, y);
+    if (ec) return ec;
+    auto tries = 0;
+    entries_t new_entries;
+    do {
+      std::unique_lock l(m);
+      auto last = entries_.end() - 1;
+      if (last->second.type == type) {
+        // Nothing to be done
+        return {};
+      }
+      auto newgenid = last->first + 1;
+      logback_generation newgen;
+      newgen.gen_id = newgenid;
+      newgen.type = type;
+      // NOTE(review): on an -ECANCELED retry, the entry from the failed
+      // attempt stays in new_entries while a fresh gen id is computed, so
+      // handle_new_gens() below may also receive the stale id — confirm
+      // this is intended.
+      new_entries.emplace(newgenid, newgen);
+      auto es = entries_;
+      es.emplace(newgenid, std::move(newgen));
+      ec = write(dpp, std::move(es), std::move(l), y);
+      ++tries;
+    } while (ec == bs::errc::operation_canceled &&
+             tries < max_tries);
+    if (tries >= max_tries) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                         << ": exhausted retry attempts." << dendl;
+      return ec;
+    }
+
+    if (ec) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                         << ": write failed with ec=" << ec.message() << dendl;
+      return ec;
+    }
+
+    cb::list bl, rbl;
+
+    // Tell the other watchers to re-read (10s timeout).
+    auto r = rgw_rados_notify(dpp, ioctx, oid, bl, 10'000, &rbl, y);
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                         << ": notify failed with r=" << r << dendl;
+      return { -r, bs::system_category() };
+    }
+    ec = handle_new_gens(new_entries);
+  } catch (const std::bad_alloc&) {
+    return bs::error_code(ENOMEM, bs::system_category());
+  }
+  return {};
+}
+
+/*
+ * Mark all generations up to and including gen_id as pruned (emptied).
+ *
+ * Refuses to prune the newest generation. Retries the versioned write up
+ * to max_tries on -ECANCELED, notifies other watchers, and finally calls
+ * handle_empty_to() with the highest gen id that was marked.
+ */
+bs::error_code logback_generations::empty_to(const DoutPrefixProvider *dpp,
+                                             uint64_t gen_id,
+                                             optional_yield y) noexcept {
+  static constexpr auto max_tries = 10;
+  try {
+    auto ec = update(dpp, y);
+    if (ec) return ec;
+    auto tries = 0;
+    uint64_t newtail = 0;
+    do {
+      std::unique_lock l(m);
+      {
+        // The head generation must always remain un-pruned.
+        auto last = entries_.end() - 1;
+        if (gen_id >= last->first) {
+          ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                             << ": Attempt to trim beyond the possible." << dendl;
+          return bs::error_code(EINVAL, bs::system_category());
+        }
+      }
+      auto es = entries_;
+      auto ei = es.upper_bound(gen_id);
+      if (ei == es.begin()) {
+        // Nothing to be done.
+        return {};
+      }
+      // Timestamp every generation at or below gen_id as pruned;
+      // newtail ends up as the highest pruned id.
+      for (auto i = es.begin(); i < ei; ++i) {
+        newtail = i->first;
+        i->second.pruned = ceph::real_clock::now();
+      }
+      ec = write(dpp, std::move(es), std::move(l), y);
+      ++tries;
+    } while (ec == bs::errc::operation_canceled &&
+             tries < max_tries);
+    if (tries >= max_tries) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                         << ": exhausted retry attempts." << dendl;
+      return ec;
+    }
+
+    if (ec) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                         << ": write failed with ec=" << ec.message() << dendl;
+      return ec;
+    }
+
+    cb::list bl, rbl;
+
+    // Tell the other watchers to re-read (10s timeout).
+    auto r = rgw_rados_notify(dpp, ioctx, oid, bl, 10'000, &rbl, y);
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                         << ": notify failed with r=" << r << dendl;
+      return { -r, bs::system_category() };
+    }
+    ec = handle_empty_to(newtail);
+  } catch (const std::bad_alloc&) {
+    return bs::error_code(ENOMEM, bs::system_category());
+  }
+  return {};
+}
+
+/*
+ * Physically delete generations that were pruned more than an hour ago.
+ *
+ * Selects entries whose pruned timestamp is >= 1h old, removes their
+ * backing objects via log_remove() (generation 0 is only truncated, see
+ * log_remove), erases them from the map, and writes the result back with
+ * up to max_tries retries on version races.
+ */
+bs::error_code logback_generations::remove_empty(const DoutPrefixProvider *dpp, optional_yield y) noexcept {
+  static constexpr auto max_tries = 10;
+  try {
+    auto ec = update(dpp, y);
+    if (ec) return ec;
+    auto tries = 0;
+    entries_t new_entries;
+    std::unique_lock l(m);
+    ceph_assert(!entries_.empty());
+    {
+      // Fast path: nothing below the lowest non-empty generation.
+      auto i = lowest_nomempty(entries_);
+      if (i == entries_.begin()) {
+        return {};
+      }
+    }
+    entries_t es;
+    auto now = ceph::real_clock::now();
+    l.unlock();
+    do {
+      // NOTE(review): entries_ is read here without holding m (l was
+      // unlocked above and is only re-taken before write()) — confirm
+      // concurrent update() cannot race this copy.
+      std::copy_if(entries_.cbegin(), entries_.cend(),
+                   std::inserter(es, es.end()),
+                   [now](const auto& e) {
+                     if (!e.second.pruned)
+                       return false;
+
+                     // Only collect generations pruned at least an hour ago.
+                     auto pruned = *e.second.pruned;
+                     return (now - pruned) >= 1h;
+                   });
+      auto es2 = entries_;
+      for (const auto& [gen_id, e] : es) {
+        ceph_assert(e.pruned);
+        // Best effort: removal failures are logged but the entry is
+        // still dropped from the map below.
+        auto ec = log_remove(dpp, ioctx, shards,
+                             [this, gen_id = gen_id](int shard) {
+                               return this->get_oid(gen_id, shard);
+                             }, (gen_id == 0), y);
+        if (ec) {
+          ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                             << ": Error pruning: gen_id=" << gen_id
+                             << " ec=" << ec.message() << dendl;
+        }
+        if (auto i = es2.find(gen_id); i != es2.end()) {
+          es2.erase(i);
+        }
+      }
+      l.lock();
+      es.clear();  // reset the selection before a possible retry
+      ec = write(dpp, std::move(es2), std::move(l), y);
+      ++tries;
+    } while (ec == bs::errc::operation_canceled &&
+             tries < max_tries);
+    if (tries >= max_tries) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                         << ": exhausted retry attempts." << dendl;
+      return ec;
+    }
+
+    if (ec) {
+      ldpp_dout(dpp, -1) << __PRETTY_FUNCTION__ << ":" << __LINE__
+                         << ": write failed with ec=" << ec.message() << dendl;
+      return ec;
+    }
+  } catch (const std::bad_alloc&) {
+    return bs::error_code(ENOMEM, bs::system_category());
+  }
+  return {};
+}
+
+/*
+ * Watch callback: another writer changed the metadata object.
+ *
+ * Self-originated notifies are ignored (my_id check); otherwise the
+ * cached state is refreshed via update(). A failed refresh aborts the
+ * process, since continuing with stale generations is unsafe. The notify
+ * is always acked.
+ */
+void logback_generations::handle_notify(uint64_t notify_id,
+                                        uint64_t cookie,
+                                        uint64_t notifier_id,
+                                        bufferlist& bl)
+{
+  auto cct = static_cast<CephContext*>(ioctx.cct());
+  const DoutPrefix dp(cct, dout_subsys, "logback generations handle_notify: ");
+  if (notifier_id != my_id) {
+    auto ec = update(&dp, null_yield);
+    if (ec) {
+      lderr(cct)
+        << __PRETTY_FUNCTION__ << ":" << __LINE__
+        << ": update failed, no one to report to and no safe way to continue."
+        << dendl;
+      abort();
+    }
+  }
+  cb::list rbl;
+  ioctx.notify_ack(oid, notify_id, watchcookie, rbl);
+}
+
+void logback_generations::handle_error(uint64_t cookie, int err) {
+ auto cct = static_cast<CephContext*>(ioctx.cct());
+ auto r = ioctx.unwatch2(watchcookie);
+ if (r < 0) {
+ lderr(cct) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << ": failed to set unwatch oid=" << oid
+ << ", r=" << r << dendl;
+ }
+
+ auto ec = watch();
+ if (ec) {
+ lderr(cct) << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << ": failed to re-establish watch, unsafe to continue: oid="
+ << oid << ", ec=" << ec.message() << dendl;
+ }
+}
diff --git a/src/rgw/driver/rados/rgw_log_backing.h b/src/rgw/driver/rados/rgw_log_backing.h
new file mode 100644
index 000000000..3dfdb8ee4
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_log_backing.h
@@ -0,0 +1,394 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <optional>
+#include <iostream>
+#include <string>
+#include <string_view>
+
+#include <strings.h>
+
+#include <boost/container/flat_map.hpp>
+#include <boost/system/error_code.hpp>
+
+#include <fmt/format.h>
+
+#include "include/rados/librados.hpp"
+#include "include/encoding.h"
+#include "include/expected.hpp"
+#include "include/function2.hpp"
+
+#include "cls/version/cls_version_types.h"
+
+#include "common/async/yield_context.h"
+#include "common/Formatter.h"
+#include "common/strtol.h"
+
+namespace bc = boost::container;
+namespace bs = boost::system;
+
+#include "cls_fifo_legacy.h"
+
+/// Type of log backing, stored in the mark used in the quick check,
+/// and passed to checking functions.
+enum class log_type {
+  // Values are persisted as a uint8_t (see encode()/decode() below);
+  // do not renumber existing entries.
+  omap = 0,
+  fifo = 1
+};
+
+/// Serialize a log_type as its single-byte on-disk representation.
+inline void encode(const log_type& type, ceph::buffer::list& bl) {
+  const uint8_t raw = static_cast<uint8_t>(type);
+  encode(raw, bl);
+}
+
+/// Deserialize a log_type from its single-byte encoding.
+inline void decode(log_type& type, bufferlist::const_iterator& bl) {
+  uint8_t raw;
+  decode(raw, bl);
+  type = static_cast<log_type>(raw);
+}
+
+/// Parse a log type name (case-insensitive). Returns nullopt for anything
+/// other than an exact match of "omap" or "fifo".
+inline std::optional<log_type> to_log_type(std::string_view s) {
+  // strncasecmp() bounded by s.length() would accept any prefix — including
+  // the empty string — as a match, so require the lengths to agree as well.
+  if (s.length() == 4 && strncasecmp(s.data(), "omap", 4) == 0) {
+    return log_type::omap;
+  } else if (s.length() == 4 && strncasecmp(s.data(), "fifo", 4) == 0) {
+    return log_type::fifo;
+  } else {
+    return std::nullopt;
+  }
+}
+/// Print a human-readable name for a log_type.
+inline std::ostream& operator <<(std::ostream& m, const log_type& t) {
+  if (t == log_type::omap) {
+    return m << "log_type::omap";
+  }
+  if (t == log_type::fifo) {
+    return m << "log_type::fifo";
+  }
+  return m << "log_type::UNKNOWN=" << static_cast<uint32_t>(t);
+}
+
+/// Look over the shards in a log and determine the type.
+tl::expected<log_type, bs::error_code>
+log_backing_type(const DoutPrefixProvider *dpp,
+ librados::IoCtx& ioctx,
+ log_type def,
+ int shards, //< Total number of shards
+ /// A function taking a shard number and
+ /// returning an oid.
+ const fu2::unique_function<std::string(int) const>& get_oid,
+ optional_yield y);
+
+/// Remove all log shards and associated parts of fifos.
+bs::error_code log_remove(librados::IoCtx& ioctx,
+ int shards, //< Total number of shards
+ /// A function taking a shard number and
+ /// returning an oid.
+ const fu2::unique_function<std::string(int) const>& get_oid,
+ bool leave_zero,
+ optional_yield y);
+
+
+// One log generation: its numeric id, backing type, and — once it has been
+// marked pruned — a timestamp recorded at that point.
+struct logback_generation {
+  uint64_t gen_id = 0;
+  log_type type;
+  std::optional<ceph::real_time> pruned;
+
+  void encode(ceph::buffer::list& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(gen_id, bl);
+    encode(type, bl);
+    encode(pruned, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(gen_id, bl);
+    decode(type, bl);
+    decode(pruned, bl);
+    DECODE_FINISH(bl);
+  }
+};
+WRITE_CLASS_ENCODER(logback_generation)
+/// Print a logback_generation as "[gen_id,type,PRUNED|NOT PRUNED]".
+inline std::ostream& operator <<(std::ostream& m, const logback_generation& g) {
+  const char* pruned_str = g.pruned ? "PRUNED" : "NOT PRUNED";
+  m << "[" << g.gen_id << "," << g.type << "," << pruned_str << "]";
+  return m;
+}
+
+/// Tracks the set of log "generations" stored in a single RADOS object,
+/// keeping a cached copy coherent across instances via watch/notify.
+class logback_generations : public librados::WatchCtx2 {
+public:
+  using entries_t = bc::flat_map<uint64_t, logback_generation>;
+
+protected:
+  librados::IoCtx& ioctx;
+  logback_generations(librados::IoCtx& ioctx,
+		      std::string oid,
+		      fu2::unique_function<std::string(
+			uint64_t, int) const>&& get_oid,
+		      int shards) noexcept
+    : ioctx(ioctx), oid(oid), get_oid(std::move(get_oid)),
+      shards(shards) {}
+
+  // Used by handle_notify() to recognize notifications we sent ourselves.
+  uint64_t my_id = ioctx.get_instance_id();
+
+private:
+  const std::string oid;
+  const fu2::unique_function<std::string(uint64_t, int) const> get_oid;
+
+protected:
+  const int shards;
+
+private:
+
+  uint64_t watchcookie = 0;
+
+  // Version from the last read of the generations object.
+  obj_version version;
+  std::mutex m;
+  entries_t entries_;
+
+  tl::expected<std::pair<entries_t, obj_version>, bs::error_code>
+  read(const DoutPrefixProvider *dpp, optional_yield y) noexcept;
+  bs::error_code write(const DoutPrefixProvider *dpp, entries_t&& e, std::unique_lock<std::mutex>&& l_,
+		       optional_yield y) noexcept;
+  bs::error_code setup(const DoutPrefixProvider *dpp, log_type def, optional_yield y) noexcept;
+
+  bs::error_code watch() noexcept;
+
+  // First generation not yet marked pruned.
+  auto lowest_nomempty(const entries_t& es) {
+    return std::find_if(es.begin(), es.end(),
+			[](const auto& e) {
+			  return !e.second.pruned;
+			});
+  }
+
+public:
+
+  /// For the use of watch/notify.
+
+  void handle_notify(uint64_t notify_id,
+		     uint64_t cookie,
+		     uint64_t notifier_id,
+		     bufferlist& bl) override final;
+
+  void handle_error(uint64_t cookie, int err) override final;
+
+  /// Public interface
+
+  virtual ~logback_generations();
+
+  /// Construct a T (derived from logback_generations), run setup() on it,
+  /// and return it; returns the setup error instead on failure.
+  template<typename T, typename... Args>
+  static tl::expected<std::unique_ptr<T>, bs::error_code>
+  init(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx_, std::string oid_,
+       fu2::unique_function<std::string(uint64_t, int) const>&& get_oid_,
+       int shards_, log_type def, optional_yield y,
+       Args&& ...args) noexcept {
+    try {
+      T* lgp = new T(ioctx_, std::move(oid_),
+		     std::move(get_oid_),
+		     shards_, std::forward<Args>(args)...);
+      std::unique_ptr<T> lg(lgp);
+      lgp = nullptr;
+      auto ec = lg->setup(dpp, def, y);
+      if (ec)
+	return tl::unexpected(ec);
+      // Obnoxiousness for C++ Compiler in Bionic Beaver
+      return tl::expected<std::unique_ptr<T>, bs::error_code>(std::move(lg));
+    } catch (const std::bad_alloc&) {
+      return tl::unexpected(bs::error_code(ENOMEM, bs::system_category()));
+    }
+  }
+
+  bs::error_code update(const DoutPrefixProvider *dpp, optional_yield y) noexcept;
+
+  // NOTE(review): returns a copy of entries_ without taking m, so it may
+  // race with a concurrent update — confirm callers tolerate a stale copy.
+  entries_t entries() const {
+    return entries_;
+  }
+
+  bs::error_code new_backing(const DoutPrefixProvider *dpp, log_type type, optional_yield y) noexcept;
+
+  bs::error_code empty_to(const DoutPrefixProvider *dpp, uint64_t gen_id, optional_yield y) noexcept;
+
+  bs::error_code remove_empty(const DoutPrefixProvider *dpp, optional_yield y) noexcept;
+
+  // Callbacks, to be defined by descendant.
+
+  /// Handle initialization on startup
+  ///
+  /// @param e All non-empty generations
+  virtual bs::error_code handle_init(entries_t e) noexcept = 0;
+
+  /// Handle new generations.
+  ///
+  /// @param e Map of generations added since last update
+  virtual bs::error_code handle_new_gens(entries_t e) noexcept = 0;
+
+  /// Handle generations being marked empty
+  ///
+  /// @param new_tail Lowest non-empty generation
+  virtual bs::error_code handle_empty_to(uint64_t new_tail) noexcept = 0;
+};
+
+/// Prefix a cursor with its generation ("G<zero-padded id>@<cursor>").
+/// Generation zero yields the bare cursor unchanged.
+inline std::string gencursor(uint64_t gen_id, std::string_view cursor) {
+  if (gen_id > 0) {
+    return fmt::format("G{:0>20}@{}", gen_id, cursor);
+  }
+  return std::string(cursor);
+}
+
+/// Split a generation-prefixed cursor (as produced by gencursor()) back into
+/// its generation id and raw cursor. Anything that does not parse as
+/// "G<digits>@..." is treated as a plain generation-zero cursor.
+inline std::pair<uint64_t, std::string_view>
+cursorgen(std::string_view cursor_) {
+  if (cursor_.empty()) {
+    return { 0, "" };
+  }
+  std::string_view cursor = cursor_;
+  if (cursor[0] != 'G') {
+    return { 0, cursor };
+  }
+  cursor.remove_prefix(1);
+  auto gen_id = ceph::consume<uint64_t>(cursor);
+  // Guard against an empty remainder before peeking at cursor[0]: for input
+  // like "G123" the consume above eats everything, and indexing an empty
+  // string_view is undefined behavior.
+  if (!gen_id || cursor.empty() || cursor[0] != '@') {
+    return { 0, cursor_ };
+  }
+  cursor.remove_prefix(1);
+  return { *gen_id, cursor };
+}
+
+// Wraps rgw::cls::fifo::FIFO, deferring open/create of the FIFO until the
+// first operation that needs it. Every public method is a thin forwarder
+// that first runs lazy_init().
+class LazyFIFO {
+  librados::IoCtx& ioctx;
+  std::string oid;
+  std::mutex m;
+  std::unique_ptr<rgw::cls::fifo::FIFO> fifo;
+
+  // Open/create the FIFO on first use; no-op once `fifo` is set.
+  // Serialized by `m` so concurrent first calls are safe.
+  int lazy_init(const DoutPrefixProvider *dpp, optional_yield y) {
+    std::unique_lock l(m);
+    if (fifo) return 0;
+    auto r = rgw::cls::fifo::FIFO::create(dpp, ioctx, oid, &fifo, y);
+    if (r) {
+      fifo.reset();
+    }
+    return r;
+  }
+
+public:
+
+  LazyFIFO(librados::IoCtx& ioctx, std::string oid)
+    : ioctx(ioctx), oid(std::move(oid)) {}
+
+  int read_meta(const DoutPrefixProvider *dpp, optional_yield y) {
+    auto r = lazy_init(dpp, y);
+    if (r < 0) return r;
+    return fifo->read_meta(dpp, y);
+  }
+
+  // Copy the FIFO's cached metadata into `info`.
+  int meta(const DoutPrefixProvider *dpp, rados::cls::fifo::info& info, optional_yield y) {
+    auto r = lazy_init(dpp, y);
+    if (r < 0) return r;
+    info = fifo->meta();
+    return 0;
+  }
+
+  int get_part_layout_info(const DoutPrefixProvider *dpp,
+			   std::uint32_t& part_header_size,
+			   std::uint32_t& part_entry_overhead,
+			   optional_yield y) {
+    auto r = lazy_init(dpp, y);
+    if (r < 0) return r;
+    std::tie(part_header_size, part_entry_overhead)
+      = fifo->get_part_layout_info();
+    return 0;
+  }
+
+  // Synchronous single-entry push.
+  int push(const DoutPrefixProvider *dpp,
+	   const ceph::buffer::list& bl,
+	   optional_yield y) {
+    auto r = lazy_init(dpp, y);
+    if (r < 0) return r;
+    return fifo->push(dpp, bl, y);
+  }
+
+  // Asynchronous single-entry push; the result is delivered through `c`.
+  int push(const DoutPrefixProvider *dpp,
+	   ceph::buffer::list& bl,
+	   librados::AioCompletion* c,
+	   optional_yield y) {
+    auto r = lazy_init(dpp, y);
+    if (r < 0) return r;
+    fifo->push(dpp, bl, c);
+    return 0;
+  }
+
+  // Synchronous batch push.
+  int push(const DoutPrefixProvider *dpp,
+	   const std::vector<ceph::buffer::list>& data_bufs,
+	   optional_yield y) {
+    auto r = lazy_init(dpp, y);
+    if (r < 0) return r;
+    return fifo->push(dpp, data_bufs, y);
+  }
+
+  // Asynchronous batch push; the result is delivered through `c`.
+  int push(const DoutPrefixProvider *dpp,
+	   const std::vector<ceph::buffer::list>& data_bufs,
+	   librados::AioCompletion* c,
+	   optional_yield y) {
+    auto r = lazy_init(dpp, y);
+    if (r < 0) return r;
+    fifo->push(dpp, data_bufs, c);
+    return 0;
+  }
+
+  int list(const DoutPrefixProvider *dpp,
+	   int max_entries, std::optional<std::string_view> markstr,
+	   std::vector<rgw::cls::fifo::list_entry>* out,
+	   bool* more, optional_yield y) {
+    auto r = lazy_init(dpp, y);
+    if (r < 0) return r;
+    return fifo->list(dpp, max_entries, markstr, out, more, y);
+  }
+
+  int list(const DoutPrefixProvider *dpp, int max_entries, std::optional<std::string_view> markstr,
+	   std::vector<rgw::cls::fifo::list_entry>* out, bool* more,
+	   librados::AioCompletion* c, optional_yield y) {
+    auto r = lazy_init(dpp, y);
+    if (r < 0) return r;
+    fifo->list(dpp, max_entries, markstr, out, more, c);
+    return 0;
+  }
+
+  int trim(const DoutPrefixProvider *dpp, std::string_view markstr, bool exclusive, optional_yield y) {
+    auto r = lazy_init(dpp, y);
+    if (r < 0) return r;
+    return fifo->trim(dpp, markstr, exclusive, y);
+  }
+
+  int trim(const DoutPrefixProvider *dpp, std::string_view markstr, bool exclusive, librados::AioCompletion* c,
+	   optional_yield y) {
+    auto r = lazy_init(dpp, y);
+    if (r < 0) return r;
+    fifo->trim(dpp, markstr, exclusive, c);
+    return 0;
+  }
+
+  int get_part_info(const DoutPrefixProvider *dpp, int64_t part_num, rados::cls::fifo::part_header* header,
+		    optional_yield y) {
+    auto r = lazy_init(dpp, y);
+    if (r < 0) return r;
+    return fifo->get_part_info(dpp, part_num, header, y);
+  }
+
+  int get_part_info(const DoutPrefixProvider *dpp, int64_t part_num, rados::cls::fifo::part_header* header,
+		    librados::AioCompletion* c, optional_yield y) {
+    auto r = lazy_init(dpp, y);
+    if (r < 0) return r;
+    // dpp is only consumed by lazy_init(); the async FIFO call does not
+    // take it.
+    fifo->get_part_info(part_num, header, c);
+    return 0;
+  }
+
+  int get_head_info(const DoutPrefixProvider *dpp, fu2::unique_function<
+		    void(int r, rados::cls::fifo::part_header&&)>&& f,
+		    librados::AioCompletion* c,
+		    optional_yield y) {
+    auto r = lazy_init(dpp, y);
+    if (r < 0) return r;
+    fifo->get_head_info(dpp, std::move(f), c);
+    return 0;
+  }
+};
diff --git a/src/rgw/driver/rados/rgw_metadata.cc b/src/rgw/driver/rados/rgw_metadata.cc
new file mode 100644
index 000000000..e3e49316e
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_metadata.cc
@@ -0,0 +1,233 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_metadata.h"
+
+#include "rgw_zone.h"
+#include "rgw_mdlog.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_cls.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+const std::string RGWMetadataLogHistory::oid = "meta.history";
+
+struct obj_version;
+
+// Compute the sharded object name for a key: hash it, reduce modulo
+// max_shards, and append the shard number to the prefix. Optionally reports
+// the chosen shard via *shard_id.
+void rgw_shard_name(const string& prefix, unsigned max_shards, const string& key, string& name, int *shard_id)
+{
+  const uint32_t val = ceph_str_hash_linux(key.c_str(), key.size());
+  const unsigned shard = val % max_shards;
+  if (shard_id) {
+    *shard_id = shard;
+  }
+  name = prefix + std::to_string(shard);
+}
+
+// Compute a sharded object name from both section and key: the two hashes
+// are combined with XOR before taking the modulus.
+void rgw_shard_name(const string& prefix, unsigned max_shards, const string& section, const string& key, string& name)
+{
+  uint32_t val = ceph_str_hash_linux(key.c_str(), key.size());
+  val ^= ceph_str_hash_linux(section.c_str(), section.size());
+  name = prefix + std::to_string(val % max_shards);
+}
+
+// Build the object name for an explicitly chosen shard number.
+void rgw_shard_name(const string& prefix, unsigned shard_id, string& name)
+{
+  name = prefix + std::to_string(shard_id);
+}
+
+// Append one entry to the metadata log, unless this zone does not log
+// metadata at all. The destination shard is derived from hash_key, and the
+// shard is flagged as modified for the sync machinery.
+int RGWMetadataLog::add_entry(const DoutPrefixProvider *dpp, const string& hash_key, const string& section, const string& key, bufferlist& bl) {
+  if (!svc.zone->need_to_log_metadata()) {
+    return 0;
+  }
+
+  string oid;
+  int shard_id;
+  rgw_shard_name(prefix, cct->_conf->rgw_md_log_max_shards, hash_key, oid, &shard_id);
+  mark_modified(shard_id);
+  const real_time now = real_clock::now();
+  return svc.cls->timelog.add(dpp, oid, now, section, key, bl, null_yield);
+}
+
+// Report which shard hash_key maps to, without touching the log itself.
+int RGWMetadataLog::get_shard_id(const string& hash_key, int *shard_id)
+{
+  string unused_oid;
+  rgw_shard_name(prefix, cct->_conf->rgw_md_log_max_shards, hash_key, unused_oid, shard_id);
+  return 0;
+}
+
+// Asynchronously write a batch of entries to one specific shard, marking it
+// modified first.
+int RGWMetadataLog::store_entries_in_shard(const DoutPrefixProvider *dpp, list<cls_log_entry>& entries, int shard_id, librados::AioCompletion *completion)
+{
+  mark_modified(shard_id);
+  string oid;
+  rgw_shard_name(prefix, shard_id, oid);
+  return svc.cls->timelog.add(dpp, oid, entries, completion, false, null_yield);
+}
+
+// Allocate an iteration context for list_entries(); the caller must release
+// it with complete_list_entries().
+void RGWMetadataLog::init_list_entries(int shard_id, const real_time& from_time, const real_time& end_time,
+				       const string& marker, void **handle)
+{
+  auto ctx = new LogListCtx();
+  ctx->cur_shard = shard_id;
+  ctx->from_time = from_time;
+  ctx->end_time = end_time;
+  ctx->marker = marker;
+  get_shard_oid(ctx->cur_shard, ctx->cur_oid);
+  *handle = static_cast<void *>(ctx);
+}
+
+// Release a context created by init_list_entries().
+void RGWMetadataLog::complete_list_entries(void *handle) {
+  delete static_cast<LogListCtx *>(handle);
+}
+
+// Fetch up to max_entries from the current shard, advancing the context's
+// marker (optionally reported through last_marker). A missing shard object
+// (-ENOENT) is treated as an empty, fully-consumed log.
+int RGWMetadataLog::list_entries(const DoutPrefixProvider *dpp, void *handle,
+				 int max_entries,
+				 list<cls_log_entry>& entries,
+				 string *last_marker,
+				 bool *truncated) {
+  auto ctx = static_cast<LogListCtx *>(handle);
+
+  if (max_entries == 0) {
+    *truncated = false;
+    return 0;
+  }
+
+  std::string next_marker;
+  const int ret = svc.cls->timelog.list(dpp, ctx->cur_oid, ctx->from_time, ctx->end_time,
+					max_entries, entries, ctx->marker,
+					&next_marker, truncated, null_yield);
+  if (ret < 0 && ret != -ENOENT)
+    return ret;
+
+  ctx->marker = std::move(next_marker);
+  if (last_marker) {
+    *last_marker = ctx->marker;
+  }
+  if (ret == -ENOENT) {
+    *truncated = false;
+  }
+  return 0;
+}
+
+// Read one shard's log header and report its max marker and last update
+// time; a missing shard object yields default (empty) header values.
+int RGWMetadataLog::get_info(const DoutPrefixProvider *dpp, int shard_id, RGWMetadataLogInfo *info)
+{
+  string oid;
+  get_shard_oid(shard_id, oid);
+
+  cls_log_header header;
+  const int ret = svc.cls->timelog.info(dpp, oid, &header, null_yield);
+  if (ret < 0 && ret != -ENOENT)
+    return ret;
+
+  info->marker = header.max_marker;
+  info->last_update = header.max_time.to_real_time();
+  return 0;
+}
+
+// librados completion callback: forwards the result into the
+// RGWMetadataLogInfoCompletion, then drops the reference that
+// get_info_async() took before issuing the request.
+static void _mdlog_info_completion(librados::completion_t cb, void *arg)
+{
+  auto infoc = static_cast<RGWMetadataLogInfoCompletion *>(arg);
+  infoc->finish(cb);
+  infoc->put(); // drop the ref from get_info_async()
+}
+
+// Allocates the underlying librados aio completion, wired to call
+// _mdlog_info_completion() (with `this` as the argument) when it fires.
+RGWMetadataLogInfoCompletion::RGWMetadataLogInfoCompletion(info_callback_t cb)
+  : completion(librados::Rados::aio_create_completion((void *)this,
+                                                      _mdlog_info_completion)),
+    callback(cb)
+{
+}
+
+// Releases the librados completion allocated in the constructor.
+RGWMetadataLogInfoCompletion::~RGWMetadataLogInfoCompletion()
+{
+  completion->release();
+}
+
+// Issue an asynchronous header read for one shard. Takes a reference on
+// `completion` that is dropped by _mdlog_info_completion() when the request
+// finishes.
+int RGWMetadataLog::get_info_async(const DoutPrefixProvider *dpp, int shard_id, RGWMetadataLogInfoCompletion *completion)
+{
+  string oid;
+  get_shard_oid(shard_id, oid);
+
+  completion->get(); // hold a ref until the completion fires
+
+  return svc.cls->timelog.info_async(dpp, completion->get_io_obj(), oid,
+                                     &completion->get_header(),
+                                     completion->get_completion());
+}
+
+// Trim entries within the given time/marker window from one shard.
+int RGWMetadataLog::trim(const DoutPrefixProvider *dpp, int shard_id, const real_time& from_time, const real_time& end_time,
+			 const string& start_marker, const string& end_marker)
+{
+  string oid;
+  get_shard_oid(shard_id, oid);
+  return svc.cls->timelog.trim(dpp, oid, from_time, end_time, start_marker,
+			       end_marker, nullptr, null_yield);
+}
+
+// Take an exclusive lease on the shard's lock object for `duration`.
+int RGWMetadataLog::lock_exclusive(const DoutPrefixProvider *dpp, int shard_id, timespan duration, string& zone_id, string& owner_id) {
+  string oid;
+  get_shard_oid(shard_id, oid);
+  return svc.cls->lock.lock_exclusive(dpp, svc.zone->get_zone_params().log_pool, oid, duration, zone_id, owner_id);
+}
+
+// Release the lease taken by lock_exclusive() on the shard's lock object.
+int RGWMetadataLog::unlock(const DoutPrefixProvider *dpp, int shard_id, string& zone_id, string& owner_id) {
+  string oid;
+  get_shard_oid(shard_id, oid);
+  return svc.cls->lock.unlock(dpp, svc.zone->get_zone_params().log_pool, oid, zone_id, owner_id);
+}
+
+// Record that a shard has been written to since the last read_clear_modified().
+void RGWMetadataLog::mark_modified(int shard_id)
+{
+  // Fast path: check under the read lock whether the shard is already
+  // recorded, so the common repeated append avoids the write lock.
+  lock.get_read();
+  if (modified_shards.find(shard_id) != modified_shards.end()) {
+    lock.unlock();
+    return;
+  }
+  lock.unlock();
+
+  // Between the unlock above and the write lock below another thread may
+  // insert the same shard; std::set::insert makes that race harmless.
+  std::unique_lock wl{lock};
+  modified_shards.insert(shard_id);
+}
+
+// Hand the accumulated set of modified shards to the caller and reset the
+// tracked set, all under the write lock.
+void RGWMetadataLog::read_clear_modified(set<int> &modified)
+{
+  std::unique_lock wl{lock};
+  modified.swap(modified_shards);
+  // swap() left the caller's previous contents in modified_shards; clear so
+  // the tracked set is empty regardless of what was passed in.
+  modified_shards.clear();
+}
+
+// Serialize marker and last_update (converted to utime_t) as JSON.
+void RGWMetadataLogInfo::dump(Formatter *f) const
+{
+  encode_json("marker", marker, f);
+  utime_t ut(last_update);
+  encode_json("last_update", ut, f);
+}
+
+// Parse marker and last_update from JSON, converting the decoded utime_t
+// back into a real_time.
+void RGWMetadataLogInfo::decode_json(JSONObj *obj)
+{
+  JSONDecoder::decode_json("marker", marker, obj);
+  utime_t ut;
+  JSONDecoder::decode_json("last_update", ut, obj);
+  last_update = ut.to_real_time();
+}
+
diff --git a/src/rgw/driver/rados/rgw_metadata.h b/src/rgw/driver/rados/rgw_metadata.h
new file mode 100644
index 000000000..c83db7c40
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_metadata.h
@@ -0,0 +1,298 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <string>
+#include <utility>
+#include <boost/optional.hpp>
+
+#include "include/types.h"
+#include "rgw_common.h"
+#include "rgw_period_history.h"
+#include "rgw_mdlog_types.h"
+#include "cls/version/cls_version_types.h"
+#include "cls/log/cls_log_types.h"
+#include "common/RefCountedObj.h"
+#include "common/ceph_time.h"
+#include "services/svc_meta_be.h"
+#include "rgw_sal_fwd.h"
+
+
+class RGWCoroutine;
+class JSONObj;
+struct RGWObjVersionTracker;
+
+struct obj_version;
+
+
+// In-memory representation of a single metadata entry: its version, mtime,
+// and an optional pointer to associated attributes. Concrete handlers
+// derive from this to carry the actual payload.
+class RGWMetadataObject {
+protected:
+  obj_version objv;
+  ceph::real_time mtime;
+  // Externally-owned attribute map; never freed here.
+  std::map<std::string, bufferlist> *pattrs{nullptr};
+
+public:
+  RGWMetadataObject() {}
+  RGWMetadataObject(const obj_version& v,
+		    real_time m) : objv(v), mtime(m) {}
+  virtual ~RGWMetadataObject() {}
+  obj_version& get_version();
+  real_time& get_mtime() { return mtime; }
+  // Attach an attribute map; the caller retains ownership.
+  void set_pattrs(std::map<std::string, bufferlist> *_pattrs) {
+    pattrs = _pattrs;
+  }
+  std::map<std::string, bufferlist> *get_pattrs() {
+    return pattrs;
+  }
+
+  virtual void dump(Formatter *f) const {}
+};
+
+class RGWMetadataManager;
+
+// Abstract interface for one metadata section: CRUD on entries, key
+// listing, and sharding hints for the metadata log. Handlers register with
+// an RGWMetadataManager via attach().
+class RGWMetadataHandler {
+  friend class RGWMetadataManager;
+
+protected:
+  CephContext *cct;
+
+public:
+  RGWMetadataHandler() {}
+  virtual ~RGWMetadataHandler();
+  virtual std::string get_type() = 0;
+
+  void base_init(CephContext *_cct) {
+    cct = _cct;
+  }
+
+  // Build a metadata object from its JSON representation.
+  virtual RGWMetadataObject *get_meta_obj(JSONObj *jo, const obj_version& objv, const ceph::real_time& mtime) = 0;
+
+  virtual int get(std::string& entry, RGWMetadataObject **obj, optional_yield, const DoutPrefixProvider *dpp) = 0;
+  virtual int put(std::string& entry,
+                  RGWMetadataObject *obj,
+                  RGWObjVersionTracker& objv_tracker,
+                  optional_yield,
+                  const DoutPrefixProvider *dpp,
+                  RGWMDLogSyncType type,
+                  bool from_remote_zone) = 0;
+  virtual int remove(std::string& entry, RGWObjVersionTracker& objv_tracker, optional_yield, const DoutPrefixProvider *dpp) = 0;
+
+  virtual int mutate(const std::string& entry,
+		     const ceph::real_time& mtime,
+		     RGWObjVersionTracker *objv_tracker,
+                     optional_yield y,
+                     const DoutPrefixProvider *dpp,
+		     RGWMDLogStatus op_type,
+		     std::function<int()> f) = 0;
+
+  virtual int list_keys_init(const DoutPrefixProvider *dpp, const std::string& marker, void **phandle) = 0;
+  virtual int list_keys_next(const DoutPrefixProvider *dpp, void *handle, int max, std::list<std::string>& keys, bool *truncated) = 0;
+  virtual void list_keys_complete(void *handle) = 0;
+
+  virtual std::string get_marker(void *handle) = 0;
+
+  // Default: everything maps to shard 0; handlers with sharded logs override.
+  virtual int get_shard_id(const std::string& entry, int *shard_id) {
+    *shard_id = 0;
+    return 0;
+  }
+  virtual int attach(RGWMetadataManager *manager);
+};
+
+// Metadata handler layered over a generic RGWSI_MetaBackend; the
+// backend-specific work is delegated to the do_get/do_put/do_remove hooks
+// and the nested Put operation object.
+class RGWMetadataHandler_GenericMetaBE : public RGWMetadataHandler {
+  friend class RGWSI_MetaBackend;
+  friend class RGWMetadataManager;
+  friend class Put;
+
+public:
+  class Put;
+
+protected:
+  RGWSI_MetaBackend_Handler *be_handler;
+
+  virtual int do_get(RGWSI_MetaBackend_Handler::Op *op, std::string& entry, RGWMetadataObject **obj, optional_yield y, const DoutPrefixProvider *dpp) = 0;
+  virtual int do_put(RGWSI_MetaBackend_Handler::Op *op, std::string& entry, RGWMetadataObject *obj,
+                     RGWObjVersionTracker& objv_tracker, optional_yield y,
+                     const DoutPrefixProvider *dpp, RGWMDLogSyncType type,
+                     bool from_remote_zone) = 0;
+  virtual int do_put_operate(Put *put_op, const DoutPrefixProvider *dpp);
+  virtual int do_remove(RGWSI_MetaBackend_Handler::Op *op, std::string& entry, RGWObjVersionTracker& objv_tracker, optional_yield y, const DoutPrefixProvider *dpp) = 0;
+
+public:
+  RGWMetadataHandler_GenericMetaBE() {}
+
+  void base_init(CephContext *_cct,
+                 RGWSI_MetaBackend_Handler *_be_handler) {
+    RGWMetadataHandler::base_init(_cct);
+    be_handler = _be_handler;
+  }
+
+  RGWSI_MetaBackend_Handler *get_be_handler() {
+    return be_handler;
+  }
+
+  // State object for one put operation; subclasses override the
+  // put_pre/put/put_post phases, driven by do_put_operate().
+  class Put {
+  protected:
+    RGWMetadataHandler_GenericMetaBE *handler;
+    RGWSI_MetaBackend_Handler::Op *op;
+    std::string& entry;
+    RGWMetadataObject *obj;
+    RGWObjVersionTracker& objv_tracker;
+    RGWMDLogSyncType apply_type;
+    optional_yield y;
+    bool from_remote_zone{false};
+
+    // Fetch the currently-stored object through the handler's backend.
+    int get(RGWMetadataObject **obj, const DoutPrefixProvider *dpp) {
+      return handler->do_get(op, entry, obj, y, dpp);
+    }
+  public:
+    Put(RGWMetadataHandler_GenericMetaBE *_handler, RGWSI_MetaBackend_Handler::Op *_op,
+        std::string& _entry, RGWMetadataObject *_obj,
+        RGWObjVersionTracker& _objv_tracker, optional_yield _y,
+        RGWMDLogSyncType _type, bool from_remote_zone);
+
+    virtual ~Put() {}
+
+    virtual int put_pre(const DoutPrefixProvider *dpp) {
+      return 0;
+    }
+    virtual int put(const DoutPrefixProvider *dpp) {
+      return 0;
+    }
+    virtual int put_post(const DoutPrefixProvider *dpp) {
+      return 0;
+    }
+    virtual int finalize() {
+      return 0;
+    }
+  };
+
+  int get(std::string& entry, RGWMetadataObject **obj, optional_yield, const DoutPrefixProvider *dpp) override;
+  int put(std::string& entry, RGWMetadataObject *obj, RGWObjVersionTracker& objv_tracker, optional_yield, const DoutPrefixProvider *dpp, RGWMDLogSyncType type, bool from_remote_zone) override;
+  int remove(std::string& entry, RGWObjVersionTracker& objv_tracker, optional_yield, const DoutPrefixProvider *dpp) override;
+
+  int mutate(const std::string& entry,
+	     const ceph::real_time& mtime,
+	     RGWObjVersionTracker *objv_tracker,
+	     optional_yield y,
+             const DoutPrefixProvider *dpp,
+	     RGWMDLogStatus op_type,
+	     std::function<int()> f) override;
+
+  int get_shard_id(const std::string& entry, int *shard_id) override;
+
+  int list_keys_init(const DoutPrefixProvider *dpp, const std::string& marker, void **phandle) override;
+  int list_keys_next(const DoutPrefixProvider *dpp, void *handle, int max, std::list<std::string>& keys, bool *truncated) override;
+  void list_keys_complete(void *handle) override;
+
+  std::string get_marker(void *handle) override;
+
+  /**
+   * Compare an incoming versus on-disk tag/version+mtime combo against
+   * the sync mode to see if the new one should replace the on-disk one.
+   *
+   * @return true if the update should proceed, false otherwise.
+   */
+  static bool check_versions(bool exists,
+                             const obj_version& ondisk, const real_time& ondisk_time,
+                             const obj_version& incoming, const real_time& incoming_time,
+                             RGWMDLogSyncType sync_mode) {
+    switch (sync_mode) {
+    case APPLY_UPDATES:
+      // Apply only within the same tag lineage and with a strictly newer
+      // version number.
+      if ((ondisk.tag != incoming.tag) ||
+	  (ondisk.ver >= incoming.ver))
+        return false;
+      break;
+    case APPLY_NEWER:
+      if (ondisk_time >= incoming_time)
+        return false;
+      break;
+    case APPLY_EXCLUSIVE:
+      // Only create, never overwrite an existing entry.
+      if (exists)
+        return false;
+      break;
+    case APPLY_ALWAYS: //deliberate fall-thru -- we always apply!
+    default: break;
+    }
+    return true;
+  }
+};
+
+class RGWMetadataTopHandler;
+
+// Central registry mapping metadata section names to their handlers;
+// parses metadata keys into section and entry and forwards the
+// get/put/remove/mutate/list operations to the matching handler.
+class RGWMetadataManager {
+  friend class RGWMetadataHandler;
+
+  CephContext *cct;
+  RGWSI_Meta *meta_svc;
+  std::map<std::string, RGWMetadataHandler *> handlers;
+  std::unique_ptr<RGWMetadataTopHandler> md_top_handler;
+
+  // Resolve metadata_key to its section handler plus the remaining entry.
+  int find_handler(const std::string& metadata_key, RGWMetadataHandler **handler, std::string& entry);
+  int register_handler(RGWMetadataHandler *handler);
+
+public:
+  RGWMetadataManager(RGWSI_Meta *_meta_svc);
+  ~RGWMetadataManager();
+
+  RGWMetadataHandler *get_handler(const std::string& type);
+
+  int get(std::string& metadata_key, Formatter *f, optional_yield y, const DoutPrefixProvider *dpp);
+  int put(std::string& metadata_key, bufferlist& bl, optional_yield y,
+          const DoutPrefixProvider *dpp,
+          RGWMDLogSyncType sync_mode,
+          bool from_remote_zone,
+          obj_version *existing_version = NULL);
+  int remove(std::string& metadata_key, optional_yield y, const DoutPrefixProvider *dpp);
+
+  int mutate(const std::string& metadata_key,
+	     const ceph::real_time& mtime,
+	     RGWObjVersionTracker *objv_tracker,
+	     optional_yield y,
+             const DoutPrefixProvider *dpp,
+	     RGWMDLogStatus op_type,
+	     std::function<int()> f);
+
+  int list_keys_init(const DoutPrefixProvider *dpp, const std::string& section, void **phandle);
+  int list_keys_init(const DoutPrefixProvider *dpp, const std::string& section, const std::string& marker, void **phandle);
+  int list_keys_next(const DoutPrefixProvider *dpp, void *handle, int max, std::list<std::string>& keys, bool *truncated);
+  void list_keys_complete(void *handle);
+
+  std::string get_marker(void *handle);
+
+  void dump_log_entry(cls_log_entry& entry, Formatter *f);
+
+  void get_sections(std::list<std::string>& sections);
+
+  void parse_metadata_key(const std::string& metadata_key, std::string& type, std::string& entry);
+
+  int get_shard_id(const std::string& section, const std::string& key, int *shard_id);
+};
+
+// Base Put operation for SObj-backed handlers, tracking the previously
+// stored object and whether it existed, for use by version checks.
+// NOTE(review): `oo` appears to own the object that `old_obj` points at —
+// confirm against the .cc implementation.
+class RGWMetadataHandlerPut_SObj : public RGWMetadataHandler_GenericMetaBE::Put
+{
+protected:
+  std::unique_ptr<RGWMetadataObject> oo;
+  RGWMetadataObject *old_obj{nullptr};
+  bool exists{false};
+
+public:
+  RGWMetadataHandlerPut_SObj(RGWMetadataHandler_GenericMetaBE *handler, RGWSI_MetaBackend_Handler::Op *op,
+                             std::string& entry, RGWMetadataObject *obj, RGWObjVersionTracker& objv_tracker,
+			     optional_yield y,
+                             RGWMDLogSyncType type, bool from_remote_zone);
+  ~RGWMetadataHandlerPut_SObj();
+
+  int put_pre(const DoutPrefixProvider *dpp) override;
+  int put(const DoutPrefixProvider *dpp) override;
+  virtual int put_check(const DoutPrefixProvider *dpp) {
+    return 0;
+  }
+  virtual int put_checked(const DoutPrefixProvider *dpp);
+  virtual void encode_obj(bufferlist *bl) {}
+};
+
+void rgw_shard_name(const std::string& prefix, unsigned max_shards, const std::string& key, std::string& name, int *shard_id);
+void rgw_shard_name(const std::string& prefix, unsigned max_shards, const std::string& section, const std::string& key, std::string& name);
+void rgw_shard_name(const std::string& prefix, unsigned shard_id, std::string& name);
+
diff --git a/src/rgw/driver/rados/rgw_notify.cc b/src/rgw/driver/rados/rgw_notify.cc
new file mode 100644
index 000000000..b1835016e
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_notify.cc
@@ -0,0 +1,1023 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "rgw_notify.h"
+#include "cls/2pc_queue/cls_2pc_queue_client.h"
+#include "cls/lock/cls_lock_client.h"
+#include <memory>
+#include <boost/algorithm/hex.hpp>
+#include <boost/context/protected_fixedsize_stack.hpp>
+#include <spawn/spawn.hpp>
+#include "rgw_sal_rados.h"
+#include "rgw_pubsub.h"
+#include "rgw_pubsub_push.h"
+#include "rgw_perf_counters.h"
+#include "common/dout.h"
+#include <chrono>
+
+#define dout_subsys ceph_subsys_rgw
+
+namespace rgw::notify {
+
+// an event together with its delivery parameters, as serialized into a
+// persistent topic queue entry
+struct event_entry_t {
+  rgw_pubsub_s3_event event;        // the S3 notification payload
+  std::string push_endpoint;        // endpoint URI to push the event to
+  std::string push_endpoint_args;   // endpoint URI arguments
+  std::string arn_topic;            // topic name taken from the endpoint ARN
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(event, bl);
+    encode(push_endpoint, bl);
+    encode(push_endpoint_args, bl);
+    encode(arn_topic, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(event, bl);
+    decode(push_endpoint, bl);
+    decode(push_endpoint_args, bl);
+    decode(arn_topic, bl);
+    DECODE_FINISH(bl);
+  }
+};
+WRITE_CLASS_ENCODER(event_entry_t)
+
+using queues_t = std::set<std::string>;
+
+// coroutine stack allocator: guard-paged (mmap/mprotect) stacks of 128k each
+auto make_stack_allocator() {
+  constexpr auto stack_size = 128 * 1024;
+  return boost::context::protected_fixedsize_stack{stack_size};
+}
+
+// rados object whose omap keys form the cluster-wide list of persistent topic queues
+const std::string Q_LIST_OBJECT_NAME = "queues_list_object";
+
+// the notification manager: owns an io_context and worker threads that
+// dequeue the persistent topic queues and push their events to the
+// configured endpoints. queue ownership across RGW daemons is coordinated
+// via exclusive cls_lock locks, renewed periodically
+class Manager : public DoutPrefixProvider {
+  const size_t max_queue_size;
+  const uint32_t queues_update_period_ms;
+  const uint32_t queues_update_retry_ms;
+  const uint32_t queue_idle_sleep_us;
+  const utime_t failover_time;
+  CephContext* const cct;
+  static constexpr auto COOKIE_LEN = 16;
+  // random cookie identifying this daemon as a queue lock owner
+  const std::string lock_cookie;
+  boost::asio::io_context io_context;
+  boost::asio::executor_work_guard<boost::asio::io_context::executor_type> work_guard;
+  const uint32_t worker_count;
+  std::vector<std::thread> workers;
+  const uint32_t stale_reservations_period_s;
+  const uint32_t reservations_cleanup_period_s;
+public:
+  librados::IoCtx& rados_ioctx;
+private:
+
+  CephContext *get_cct() const override { return cct; }
+  unsigned get_subsys() const override { return dout_subsys; }
+  std::ostream& gen_prefix(std::ostream& out) const override { return out << "rgw notify: "; }
+
+  // read the list of queues from the queue list object (stored as omap keys)
+  // returns 0 when the object does not exist yet (no queues created)
+  int read_queue_list(queues_t& queues, optional_yield y) {
+    constexpr auto max_chunk = 1024U;
+    std::string start_after;
+    bool more = true;
+    int rval;
+    while (more) {
+      librados::ObjectReadOperation op;
+      queues_t queues_chunk;
+      op.omap_get_keys2(start_after, max_chunk, &queues_chunk, &more, &rval);
+      const auto ret = rgw_rados_operate(this, rados_ioctx, Q_LIST_OBJECT_NAME, &op, nullptr, y);
+      if (ret == -ENOENT) {
+        // queue list object was not created - nothing to do
+        return 0;
+      }
+      if (ret < 0) {
+        // TODO: do we need to check on rval as well as ret?
+        ldpp_dout(this, 1) << "ERROR: failed to read queue list. error: " << ret << dendl;
+        return ret;
+      }
+      if (more && !queues_chunk.empty()) {
+        // advance the pagination cursor before merging (merge moves the
+        // elements out); without this, the same chunk would be listed
+        // forever once there are more than max_chunk queues
+        start_after = *queues_chunk.rbegin();
+      }
+      queues.merge(queues_chunk);
+    }
+    return 0;
+  }
+
+  // set m1 to be the minimum between m1 and m2
+  // (m2 is taken by const reference to avoid a needless copy)
+  static int set_min_marker(std::string& m1, const std::string& m2) {
+    cls_queue_marker mr1;
+    cls_queue_marker mr2;
+    if (mr1.from_str(m1.c_str()) < 0 || mr2.from_str(m2.c_str()) < 0) {
+      return -EINVAL;
+    }
+    if (mr2.gen <= mr1.gen && mr2.offset < mr1.offset) {
+      m1 = m2;
+    }
+    return 0;
+  }
+
+  using Clock = ceph::coarse_mono_clock;
+  using Executor = boost::asio::io_context::executor_type;
+  using Timer = boost::asio::basic_waitable_timer<Clock,
+        boost::asio::wait_traits<Clock>, Executor>;
+
+  // barrier that lets a coroutine wait until all outstanding tokens are
+  // destroyed: each spawned task holds a token; the waiter sleeps on a
+  // far-future timer that the last token cancels
+  class tokens_waiter {
+    const std::chrono::hours infinite_duration;
+    size_t pending_tokens;
+    Timer timer;
+
+    struct token {
+      tokens_waiter& waiter;
+      token(tokens_waiter& _waiter) : waiter(_waiter) {
+        ++waiter.pending_tokens;
+      }
+
+      ~token() {
+        --waiter.pending_tokens;
+        if (waiter.pending_tokens == 0) {
+          waiter.timer.cancel();
+        }
+      }
+    };
+
+  public:
+
+    tokens_waiter(boost::asio::io_context& io_context) :
+      infinite_duration(1000),
+      pending_tokens(0),
+      timer(io_context) {}
+
+    void async_wait(yield_context yield) {
+      if (pending_tokens == 0) {
+        return;
+      }
+      timer.expires_from_now(infinite_duration);
+      boost::system::error_code ec;
+      timer.async_wait(yield[ec]);
+      // the timer must have been cancelled by the last token's destructor
+      ceph_assert(ec == boost::system::errc::operation_canceled);
+    }
+
+    token make_token() {
+      return token(*this);
+    }
+  };
+
+  // processing of a specific entry
+  // return whether processing was successful (true) or not (false)
+  bool process_entry(const cls_queue_entry& entry, yield_context yield) {
+    event_entry_t event_entry;
+    auto iter = entry.data.cbegin();
+    try {
+      decode(event_entry, iter);
+    } catch (buffer::error& err) {
+      ldpp_dout(this, 5) << "WARNING: failed to decode entry. error: " << err.what() << dendl;
+      return false;
+    }
+    try {
+      // TODO move endpoint creation to queue level
+      const auto push_endpoint = RGWPubSubEndpoint::create(event_entry.push_endpoint, event_entry.arn_topic,
+          RGWHTTPArgs(event_entry.push_endpoint_args, this),
+          cct);
+      ldpp_dout(this, 20) << "INFO: push endpoint created: " << event_entry.push_endpoint <<
+        " for entry: " << entry.marker << dendl;
+      const auto ret = push_endpoint->send_to_completion_async(cct, event_entry.event, optional_yield(io_context, yield));
+      if (ret < 0) {
+        ldpp_dout(this, 5) << "WARNING: push entry: " << entry.marker << " to endpoint: " << event_entry.push_endpoint
+          << " failed. error: " << ret << " (will retry)" << dendl;
+        return false;
+      } else {
+        ldpp_dout(this, 20) << "INFO: push entry: " << entry.marker << " to endpoint: " << event_entry.push_endpoint
+          << " ok" <<  dendl;
+        if (perfcounter) perfcounter->inc(l_rgw_pubsub_push_ok);
+        return true;
+      }
+    } catch (const RGWPubSubEndpoint::configuration_error& e) {
+      ldpp_dout(this, 5) << "WARNING: failed to create push endpoint: "
+          << event_entry.push_endpoint << " for entry: " << entry.marker << ". error: " << e.what() << " (will retry) " << dendl;
+      return false;
+    }
+  }
+
+  // clean stale reservations from the queue, periodically, while this
+  // daemon still owns (holds the lock on) the queue
+  void cleanup_queue(const std::string& queue_name, yield_context yield) {
+    while (true) {
+      ldpp_dout(this, 20) << "INFO: trying to perform stale reservation cleanup for queue: " << queue_name << dendl;
+      const auto now = ceph::coarse_real_time::clock::now();
+      const auto stale_time = now - std::chrono::seconds(stale_reservations_period_s);
+      librados::ObjectWriteOperation op;
+      op.assert_exists();
+      rados::cls::lock::assert_locked(&op, queue_name+"_lock",
+        ClsLockType::EXCLUSIVE,
+        lock_cookie,
+        "" /*no tag*/);
+      cls_2pc_queue_expire_reservations(op, stale_time);
+      // check ownership and do reservation cleanup in one batch
+      auto ret = rgw_rados_operate(this, rados_ioctx, queue_name, &op, optional_yield(io_context, yield));
+      if (ret == -ENOENT) {
+        // queue was deleted
+        ldpp_dout(this, 5) << "INFO: queue: "
+          << queue_name << ". was removed. cleanup will stop" << dendl;
+        return;
+      }
+      if (ret == -EBUSY) {
+        ldpp_dout(this, 5) << "WARNING: queue: " << queue_name << " ownership moved to another daemon. processing will stop" << dendl;
+        return;
+      }
+      if (ret < 0) {
+        ldpp_dout(this, 5) << "WARNING: failed to cleanup stale reservation from queue and/or lock queue: " << queue_name
+          << ". error: " << ret << dendl;
+      }
+      Timer timer(io_context);
+      timer.expires_from_now(std::chrono::seconds(reservations_cleanup_period_s));
+      boost::system::error_code ec;
+	    timer.async_wait(yield[ec]);
+    }
+  }
+
+  // processing of a specific queue: list entries, push them to their
+  // endpoints, and trim the successfully delivered prefix
+  void process_queue(const std::string& queue_name, yield_context yield) {
+    constexpr auto max_elements = 1024;
+    auto is_idle = false;
+    const std::string start_marker;
+
+    // start the cleanup coroutine for the queue
+    spawn::spawn(io_context, [this, queue_name](yield_context yield) {
+            cleanup_queue(queue_name, yield);
+            }, make_stack_allocator());
+
+    while (true) {
+      // if queue was empty the last time, sleep for idle timeout
+      if (is_idle) {
+        Timer timer(io_context);
+        timer.expires_from_now(std::chrono::microseconds(queue_idle_sleep_us));
+        boost::system::error_code ec;
+	      timer.async_wait(yield[ec]);
+      }
+
+      // get list of entries in the queue
+      is_idle = true;
+      bool truncated = false;
+      std::string end_marker;
+      std::vector<cls_queue_entry> entries;
+      auto total_entries = 0U;
+      {
+        librados::ObjectReadOperation op;
+        op.assert_exists();
+        bufferlist obl;
+        int rval;
+        rados::cls::lock::assert_locked(&op, queue_name+"_lock",
+          ClsLockType::EXCLUSIVE,
+          lock_cookie,
+          "" /*no tag*/);
+        cls_2pc_queue_list_entries(op, start_marker, max_elements, &obl, &rval);
+        // check ownership and list entries in one batch
+        auto ret = rgw_rados_operate(this, rados_ioctx, queue_name, &op, nullptr, optional_yield(io_context, yield));
+        if (ret == -ENOENT) {
+          // queue was deleted
+          ldpp_dout(this, 5) << "INFO: queue: "
+            << queue_name << ". was removed. processing will stop" << dendl;
+          return;
+        }
+        if (ret == -EBUSY) {
+          ldpp_dout(this, 5) << "WARNING: queue: " << queue_name << " ownership moved to another daemon. processing will stop" << dendl;
+          return;
+        }
+        if (ret < 0) {
+          ldpp_dout(this, 5) << "WARNING: failed to get list of entries in queue and/or lock queue: "
+            << queue_name << ". error: " << ret << " (will retry)" << dendl;
+          continue;
+        }
+        ret = cls_2pc_queue_list_entries_result(obl, entries, &truncated, end_marker);
+        if (ret < 0) {
+          ldpp_dout(this, 5) << "WARNING: failed to parse list of entries in queue: "
+            << queue_name << ". error: " << ret << " (will retry)" << dendl;
+          continue;
+        }
+      }
+      total_entries = entries.size();
+      if (total_entries == 0) {
+        // nothing in the queue
+        continue;
+      }
+      // log when queue is not idle
+      ldpp_dout(this, 20) << "INFO: found: " << total_entries << " entries in: " << queue_name <<
+        ". end marker is: " << end_marker << dendl;
+
+      is_idle = false;
+      auto has_error = false;
+      auto remove_entries = false;
+      auto entry_idx = 1U;
+      tokens_waiter waiter(io_context);
+      // push entries concurrently; on the first failure, stop spawning and
+      // move end_marker back so failed entries are not removed
+      for (auto& entry : entries) {
+        if (has_error) {
+          // bail out on first error
+          break;
+        }
+        // TODO pass entry pointer instead of by-value
+        spawn::spawn(yield, [this, &queue_name, entry_idx, total_entries, &end_marker, &remove_entries, &has_error, &waiter, entry](yield_context yield) {
+            const auto token = waiter.make_token();
+            if (process_entry(entry, yield)) {
+              ldpp_dout(this, 20) << "INFO: processing of entry: " <<
+                entry.marker << " (" << entry_idx << "/" << total_entries << ") from: " << queue_name << " ok" << dendl;
+              remove_entries = true;
+            }  else {
+              if (set_min_marker(end_marker, entry.marker) < 0) {
+                ldpp_dout(this, 1) << "ERROR: cannot determine minimum between malformed markers: " << end_marker << ", " << entry.marker << dendl;
+              } else {
+                ldpp_dout(this, 20) << "INFO: new end marker for removal: " << end_marker << " from: " << queue_name << dendl;
+              }
+              has_error = true;
+              ldpp_dout(this, 20) << "INFO: processing of entry: " <<
+                entry.marker << " (" << entry_idx << "/" << total_entries << ") from: " << queue_name << " failed" << dendl;
+            }
+        }, make_stack_allocator());
+        ++entry_idx;
+      }
+
+      // wait for all pending work to finish
+      waiter.async_wait(yield);
+
+      // delete all published entries from queue
+      if (remove_entries) {
+        librados::ObjectWriteOperation op;
+        op.assert_exists();
+        rados::cls::lock::assert_locked(&op, queue_name+"_lock",
+          ClsLockType::EXCLUSIVE,
+          lock_cookie,
+          "" /*no tag*/);
+        cls_2pc_queue_remove_entries(op, end_marker);
+        // check ownership and deleted entries in one batch
+        const auto ret = rgw_rados_operate(this, rados_ioctx, queue_name, &op, optional_yield(io_context, yield));
+        if (ret == -ENOENT) {
+          // queue was deleted
+          ldpp_dout(this, 5) << "INFO: queue: "
+            << queue_name << ". was removed. processing will stop" << dendl;
+          return;
+        }
+        if (ret == -EBUSY) {
+          ldpp_dout(this, 5) << "WARNING: queue: " << queue_name << " ownership moved to another daemon. processing will stop" << dendl;
+          return;
+        }
+        if (ret < 0) {
+          ldpp_dout(this, 1) << "ERROR: failed to remove entries and/or lock queue up to: " << end_marker <<  " from queue: "
+            << queue_name << ". error: " << ret << dendl;
+        } else {
+          ldpp_dout(this, 20) << "INFO: removed entries up to: " << end_marker <<  " from queue: "
+          << queue_name << dendl;
+        }
+      }
+    }
+  }
+
+  // list of owned queues
+  using owned_queues_t = std::unordered_set<std::string>;
+
+  // process all queues
+  // find which of the queues is owned by this daemon and process it
+  void process_queues(yield_context yield) {
+    auto has_error = false;
+    owned_queues_t owned_queues;
+
+    // add randomness to the duration between queue checking
+    // to make sure that different daemons are not synced
+    std::random_device seed;
+    std::mt19937 rnd_gen(seed());
+    const auto min_jitter = 100; // ms
+    const auto max_jitter = 500; // ms
+    std::uniform_int_distribution<> duration_jitter(min_jitter, max_jitter);
+
+    std::vector<std::string> queue_gc;
+    std::mutex queue_gc_lock;
+    while (true) {
+      Timer timer(io_context);
+      const auto duration = (has_error ?
+        std::chrono::milliseconds(queues_update_retry_ms) : std::chrono::milliseconds(queues_update_period_ms)) +
+        std::chrono::milliseconds(duration_jitter(rnd_gen));
+      timer.expires_from_now(duration);
+      const auto tp = ceph::coarse_real_time::clock::to_time_t(ceph::coarse_real_time::clock::now() + duration);
+      ldpp_dout(this, 20) << "INFO: next queues processing will happen at: " << std::ctime(&tp)  << dendl;
+      boost::system::error_code ec;
+      timer.async_wait(yield[ec]);
+
+      queues_t queues;
+      auto ret = read_queue_list(queues, optional_yield(io_context, yield));
+      if (ret < 0) {
+        has_error = true;
+        continue;
+      }
+
+      for (const auto& queue_name : queues) {
+        // try to lock the queue to check if it is owned by this rgw
+        // or if ownership needs to be taken
+        librados::ObjectWriteOperation op;
+        op.assert_exists();
+        rados::cls::lock::lock(&op, queue_name+"_lock",
+              ClsLockType::EXCLUSIVE,
+              lock_cookie,
+              "" /*no tag*/,
+              "" /*no description*/,
+              failover_time,
+              LOCK_FLAG_MAY_RENEW);
+
+        ret = rgw_rados_operate(this, rados_ioctx, queue_name, &op, optional_yield(io_context, yield));
+        if (ret == -EBUSY) {
+          // lock is already taken by another RGW
+          ldpp_dout(this, 20) << "INFO: queue: " << queue_name << " owned (locked) by another daemon" << dendl;
+          // if queue was owned by this RGW, processing should be stopped, queue would be deleted from list afterwards
+          continue;
+        }
+        if (ret == -ENOENT) {
+          // queue is deleted - processing will stop the next time we try to read from the queue
+          ldpp_dout(this, 10) << "INFO: queue: " << queue_name << " should not be locked - already deleted" << dendl;
+          continue;
+        }
+        if (ret < 0) {
+          // failed to lock for another reason, continue to process other queues
+          ldpp_dout(this, 1) << "ERROR: failed to lock queue: " << queue_name << ". error: " << ret << dendl;
+          has_error = true;
+          continue;
+        }
+        // add queue to list of owned queues
+        if (owned_queues.insert(queue_name).second) {
+          ldpp_dout(this, 10) << "INFO: queue: " << queue_name << " now owned (locked) by this daemon" << dendl;
+          // start processing this queue
+          spawn::spawn(io_context, [this, &queue_gc, &queue_gc_lock, queue_name](yield_context yield) {
+            process_queue(queue_name, yield);
+            // if queue processing ended, it means that the queue was removed or not owned anymore
+            // mark it for deletion
+            std::lock_guard lock_guard(queue_gc_lock);
+            queue_gc.push_back(queue_name);
+            ldpp_dout(this, 10) << "INFO: queue: " << queue_name << " marked for removal" << dendl;
+          }, make_stack_allocator());
+        } else {
+          ldpp_dout(this, 20) << "INFO: queue: " << queue_name << " ownership (lock) renewed" << dendl;
+        }
+      }
+      // erase all queue that were deleted
+      {
+        std::lock_guard lock_guard(queue_gc_lock);
+        std::for_each(queue_gc.begin(), queue_gc.end(), [this, &owned_queues](const std::string& queue_name) {
+          owned_queues.erase(queue_name);
+          ldpp_dout(this, 20) << "INFO: queue: " << queue_name << " removed" << dendl;
+        });
+        queue_gc.clear();
+      }
+    }
+  }
+
+public:
+
+  // dtor: release the work guard, stop the io_context and join all workers
+  ~Manager() {
+    work_guard.reset();
+    io_context.stop();
+    std::for_each(workers.begin(), workers.end(), [] (auto& worker) { worker.join(); });
+  }
+
+  // ctor: start all threads
+  // (failover_time_ms was declared as non-standard u_int32_t; uint32_t is equivalent)
+  Manager(CephContext* _cct, uint32_t _max_queue_size, uint32_t _queues_update_period_ms,
+          uint32_t _queues_update_retry_ms, uint32_t _queue_idle_sleep_us, uint32_t failover_time_ms,
+          uint32_t _stale_reservations_period_s, uint32_t _reservations_cleanup_period_s,
+          uint32_t _worker_count, rgw::sal::RadosStore* store) :
+    max_queue_size(_max_queue_size),
+    queues_update_period_ms(_queues_update_period_ms),
+    queues_update_retry_ms(_queues_update_retry_ms),
+    queue_idle_sleep_us(_queue_idle_sleep_us),
+    failover_time(std::chrono::milliseconds(failover_time_ms)),
+    cct(_cct),
+    lock_cookie(gen_rand_alphanumeric(cct, COOKIE_LEN)),
+    work_guard(boost::asio::make_work_guard(io_context)),
+    worker_count(_worker_count),
+    stale_reservations_period_s(_stale_reservations_period_s),
+    reservations_cleanup_period_s(_reservations_cleanup_period_s),
+    rados_ioctx(store->getRados()->get_notif_pool_ctx())
+    {
+      spawn::spawn(io_context, [this] (yield_context yield) {
+            process_queues(yield);
+          }, make_stack_allocator());
+
+      // start the worker threads to do the actual queue processing
+      const std::string WORKER_THREAD_NAME = "notif-worker";
+      for (auto worker_id = 0U; worker_id < worker_count; ++worker_id) {
+        workers.emplace_back([this]() {
+          try {
+            io_context.run();
+          } catch (const std::exception& err) {
+            ldpp_dout(this, 10) << "Notification worker failed with error: " << err.what() << dendl;
+            // rethrow the original exception; "throw(err)" would have
+            // rethrown a sliced copy of the caught base reference
+            throw;
+          }
+        });
+        const auto rc = ceph_pthread_setname(workers.back().native_handle(),
+          (WORKER_THREAD_NAME+std::to_string(worker_id)).c_str());
+        ceph_assert(rc == 0);
+      }
+      ldpp_dout(this, 10) << "Started notification manager with: " << worker_count << " workers" << dendl;
+    }
+
+  // create the persistent queue for a topic and register it in the queue
+  // list object; idempotent when the queue already exists
+  int add_persistent_topic(const std::string& topic_name, optional_yield y) {
+    if (topic_name == Q_LIST_OBJECT_NAME) {
+      ldpp_dout(this, 1) << "ERROR: topic name cannot be: " << Q_LIST_OBJECT_NAME << " (conflict with queue list object name)" << dendl;
+      return -EINVAL;
+    }
+    librados::ObjectWriteOperation op;
+    op.create(true);
+    cls_2pc_queue_init(op, topic_name, max_queue_size);
+    auto ret = rgw_rados_operate(this, rados_ioctx, topic_name, &op, y);
+    if (ret == -EEXIST) {
+      // queue already exists - nothing to do
+      ldpp_dout(this, 20) << "INFO: queue for topic: " << topic_name << " already exists. nothing to do" << dendl;
+      return 0;
+    }
+    if (ret < 0) {
+      // failed to create queue
+      ldpp_dout(this, 1) << "ERROR: failed to create queue for topic: " << topic_name << ". error: " << ret << dendl;
+      return ret;
+    }
+
+    bufferlist empty_bl;
+    std::map<std::string, bufferlist> new_topic{{topic_name, empty_bl}};
+    // NOTE(review): op is reused after the first operate; this assumes the
+    // objecter consumed its op list on submission - confirm against librados
+    op.omap_set(new_topic);
+    ret = rgw_rados_operate(this, rados_ioctx, Q_LIST_OBJECT_NAME, &op, y);
+    if (ret < 0) {
+      ldpp_dout(this, 1) << "ERROR: failed to add queue: " << topic_name << " to queue list. error: " << ret << dendl;
+      return ret;
+    }
+    ldpp_dout(this, 20) << "INFO: queue: " << topic_name << " added to queue list"  << dendl;
+    return 0;
+  }
+};
+
+// process-wide manager instance used by the free functions below.
+// note that the Manager class itself is not a singleton, and multiple instances may co-exist
+// TODO make the pointer atomic in allocation and deallocation to avoid race conditions
+static Manager* s_manager = nullptr;
+
+// default tuning values (TODO: should come from configuration)
+constexpr size_t MAX_QUEUE_SIZE = 128*1000*1000; // 128MB
+constexpr uint32_t Q_LIST_UPDATE_MSEC = 1000*30; // check queue list every 30seconds
+constexpr uint32_t Q_LIST_RETRY_MSEC = 1000; // retry every second if queue list update failed
+constexpr uint32_t IDLE_TIMEOUT_USEC = 100*1000; // idle sleep 100ms
+constexpr uint32_t FAILOVER_TIME_MSEC = 3*Q_LIST_UPDATE_MSEC; // FAILOVER TIME 3x renew time
+constexpr uint32_t WORKER_COUNT = 1; // 1 worker thread
+constexpr uint32_t STALE_RESERVATIONS_PERIOD_S = 120; // cleanup reservations that are more than 2 minutes old
+constexpr uint32_t RESERVATIONS_CLEANUP_PERIOD_S = 30; // reservation cleanup every 30 seconds
+
+// create the process-wide notification manager.
+// returns false (and does nothing) when a manager already exists
+bool init(CephContext* cct, rgw::sal::RadosStore* store, const DoutPrefixProvider *dpp) {
+  if (s_manager != nullptr) {
+    // already initialized
+    return false;
+  }
+  // TODO: take conf from CephContext
+  s_manager = new Manager(cct, MAX_QUEUE_SIZE,
+      Q_LIST_UPDATE_MSEC, Q_LIST_RETRY_MSEC,
+      IDLE_TIMEOUT_USEC, FAILOVER_TIME_MSEC,
+      STALE_RESERVATIONS_PERIOD_S, RESERVATIONS_CLEANUP_PERIOD_S,
+      WORKER_COUNT, store);
+  return true;
+}
+
+// destroy the process-wide manager (joins all of its worker threads);
+// safe to call when no manager was started
+void shutdown() {
+  if (s_manager != nullptr) {
+    delete s_manager;
+    s_manager = nullptr;
+  }
+}
+
+// forward queue creation to the process-wide manager.
+// fails with -EAGAIN when the manager was not started yet
+int add_persistent_topic(const std::string& topic_name, optional_yield y) {
+  return s_manager ? s_manager->add_persistent_topic(topic_name, y) : -EAGAIN;
+}
+
+// remove the persistent queue object of a topic and unregister it from the
+// queue list object; idempotent when the queue is already gone
+int remove_persistent_topic(const DoutPrefixProvider* dpp, librados::IoCtx& rados_ioctx, const std::string& topic_name, optional_yield y) {
+  librados::ObjectWriteOperation op;
+  op.remove();
+  auto ret = rgw_rados_operate(dpp, rados_ioctx, topic_name, &op, y);
+  if (ret == -ENOENT) {
+    // queue already removed - nothing to do
+    ldpp_dout(dpp, 20) << "INFO: queue for topic: " << topic_name << " already removed. nothing to do" << dendl;
+    return 0;
+  }
+  if (ret < 0) {
+    // failed to remove queue
+    ldpp_dout(dpp, 1) << "ERROR: failed to remove queue for topic: " << topic_name << ". error: " << ret << dendl;
+    return ret;
+  }
+
+  std::set<std::string> topic_to_remove{{topic_name}};
+  // NOTE(review): op is reused after the first operate; this assumes the
+  // objecter consumed its op list on submission - confirm against librados
+  op.omap_rm_keys(topic_to_remove);
+  ret = rgw_rados_operate(dpp, rados_ioctx, Q_LIST_OBJECT_NAME, &op, y);
+  if (ret < 0) {
+    ldpp_dout(dpp, 1) << "ERROR: failed to remove queue: " << topic_name << " from queue list. error: " << ret << dendl;
+    return ret;
+  }
+  ldpp_dout(dpp, 20) << "INFO: queue: " << topic_name << " removed from queue list"  << dendl;
+  return 0;
+}
+
+// convenience overload using the manager's notification-pool IoCtx.
+// fails with -EAGAIN when the manager was not started yet
+int remove_persistent_topic(const std::string& topic_name, optional_yield y) {
+  if (s_manager == nullptr) {
+    return -EAGAIN;
+  }
+  return remove_persistent_topic(s_manager, s_manager->rados_ioctx, topic_name, y);
+}
+
+// return the object whose attributes should be used for tags/metadata:
+// the copy source when one is set, otherwise the object itself.
+// attributes are fetched lazily; returns nullptr when the fetch fails
+rgw::sal::Object* get_object_with_atttributes(
+  const reservation_t& res, rgw::sal::Object* obj) {
+  // in case of copy obj, the tags and metadata are taken from source
+  auto* const src_obj = res.src_object ? res.src_object : obj;
+  if (!src_obj->get_attrs().empty()) {
+    // attributes already loaded
+    return src_obj;
+  }
+  if (!src_obj->get_bucket()) {
+    src_obj->set_bucket(res.bucket);
+  }
+  const auto ret = src_obj->get_obj_attrs(res.yield, res.dpp);
+  if (ret < 0) {
+    ldpp_dout(res.dpp, 20) << "failed to get attributes from object: " <<
+      src_obj->get_key() << ". ret = " << ret << dendl;
+    return nullptr;
+  }
+  return src_obj;
+}
+
+// copy into dest only the entries of src whose key carries the
+// "x-amz-meta-" prefix
+static inline void filter_amz_meta(meta_map_t& dest, const meta_map_t& src) {
+  for (const auto& kv : src) {
+    if (boost::algorithm::starts_with(kv.first, RGW_AMZ_META_PREFIX)) {
+      dest.insert(kv);
+    }
+  }
+}
+
+
+// populate res.x_meta_map from the object's (or its copy source's)
+// "user.rgw.x-amz-meta-" attributes and mark the metadata as fetched
+static inline void metadata_from_attributes(
+  reservation_t& res, rgw::sal::Object* obj) {
+  auto* const src_obj = get_object_with_atttributes(res, obj);
+  if (src_obj == nullptr) {
+    return;
+  }
+  res.metadata_fetched_from_attributes = true;
+  for (const auto& [attr_name, attr_value] : src_obj->get_attrs()) {
+    if (!boost::algorithm::starts_with(attr_name, RGW_ATTR_META_PREFIX)) {
+      continue;
+    }
+    std::string_view key(attr_name);
+    key.remove_prefix(sizeof(RGW_ATTR_PREFIX)-1);
+    // we want to pass a null terminated version
+    // of the bufferlist, hence "to_str().c_str()"
+    res.x_meta_map.emplace(key, attr_value.to_str().c_str());
+  }
+}
+
+// populate tags from the RGW_ATTR_TAGS attribute of the object (or its copy
+// source). leaves tags untouched when the attribute is missing or undecodable
+static inline void tags_from_attributes(
+  const reservation_t& res, rgw::sal::Object* obj, KeyMultiValueMap& tags) {
+  const auto src_obj = get_object_with_atttributes(res, obj);
+  if (!src_obj) {
+    return;
+  }
+  const auto& attrs = src_obj->get_attrs();
+  const auto attr_iter = attrs.find(RGW_ATTR_TAGS);
+  if (attr_iter != attrs.end()) {
+    auto bliter = attr_iter->second.cbegin();
+    RGWObjTags obj_tags;
+    try {
+      ::decode(obj_tags, bliter);
+    } catch(buffer::error&) {
+      // not able to decode tags - treat as "no tags"
+      return;
+    }
+    tags = std::move(obj_tags.get_tags());
+  }
+}
+
+// populate event from request: fill an S3 event record from the reservation
+// context and the object's size/mtime/etag/version.
+// note: may lazily fetch metadata/tags from the object's attributes,
+// mutating res.x_meta_map / res.metadata_fetched_from_attributes
+static inline void populate_event(reservation_t& res,
+        rgw::sal::Object* obj,
+        uint64_t size,
+        const ceph::real_time& mtime,
+        const std::string& etag,
+        const std::string& version,
+        EventType event_type,
+        rgw_pubsub_s3_event& event) {
+  event.eventTime = mtime;
+  event.eventName = to_event_string(event_type);
+  event.userIdentity = res.user_id;    // user that triggered the change
+  event.x_amz_request_id = res.req_id; // request ID of the original change
+  event.x_amz_id_2 = res.store->getRados()->host_id; // RGW on which the change was made
+  // configurationId is filled from notification configuration
+  event.bucket_name = res.bucket->get_name();
+  // prefer the explicit owner; fall back to the owner recorded in bucket info
+  event.bucket_ownerIdentity = res.bucket->get_owner() ?
+    res.bucket->get_owner()->get_id().id : res.bucket->get_info().owner.id;
+  const auto region = res.store->get_zone()->get_zonegroup().get_api_name();
+  rgw::ARN bucket_arn(res.bucket->get_key());
+  bucket_arn.region = region;
+  event.bucket_arn = to_string(bucket_arn);
+  // a caller-supplied object name (e.g. multipart upload) overrides the object's own
+  event.object_key = res.object_name ? *res.object_name : obj->get_name();
+  event.object_size = size;
+  event.object_etag = etag;
+  event.object_versionId = version;
+  event.awsRegion = region;
+  // use timestamp as per key sequence id (hex encoded)
+  const utime_t ts(real_clock::now());
+  boost::algorithm::hex((const char*)&ts, (const char*)&ts + sizeof(utime_t),
+          std::back_inserter(event.object_sequencer));
+  set_event_id(event.id, etag, ts);
+  event.bucket_id = res.bucket->get_bucket_id();
+  // pass meta data
+  if (!res.metadata_fetched_from_attributes) {
+    // either no metadata exist or no metadata filter was used
+    metadata_from_attributes(res, obj);
+  }
+  event.x_meta_map = res.x_meta_map;
+  // pass tags
+  if (!res.tagset ||
+      (*res.tagset).get_tags().empty()) {
+    // try to fetch the tags from the attributes
+    tags_from_attributes(res, obj, event.tags);
+  } else {
+    event.tags = (*res.tagset).get_tags();
+  }
+  // opaque data will be filled from topic configuration
+}
+
+// check whether the event matches a topic's notification filter:
+// event type, then object key filter, then (if configured) metadata and
+// tag filters. note: evaluating the metadata filter may lazily populate
+// res.x_meta_map from the request and/or the object's attributes
+static inline bool notification_match(reservation_t& res,
+				      const rgw_pubsub_topic_filter& filter,
+				      EventType event,
+				      const RGWObjTags* req_tags) {
+  if (!match(filter.events, event)) {
+    return false;
+  }
+  const auto obj = res.object;
+  if (!match(filter.s3_filter.key_filter,
+	     res.object_name ? *res.object_name : obj->get_name())) {
+    return false;
+  }
+
+  if (!filter.s3_filter.metadata_filter.kv.empty()) {
+    // metadata filter exists
+    if (res.s) {
+      // merge the request's x-amz-meta- headers first
+      filter_amz_meta(res.x_meta_map, res.s->info.x_meta_map);
+    }
+    metadata_from_attributes(res, obj);
+    if (!match(filter.s3_filter.metadata_filter, res.x_meta_map)) {
+      return false;
+    }
+  }
+
+  if (!filter.s3_filter.tag_filter.kv.empty()) {
+    // tag filter exists
+    if (req_tags) {
+      // tags in the request
+      if (!match(filter.s3_filter.tag_filter, req_tags->get_tags())) {
+        return false;
+      }
+    } else if (res.tagset && !(*res.tagset).get_tags().empty()) {
+      // tags were cached in req_state
+      if (!match(filter.s3_filter.tag_filter, (*res.tagset).get_tags())) {
+        return false;
+      }
+    } else {
+      // try to fetch tags from the attributes
+      KeyMultiValueMap tags;
+      tags_from_attributes(res, obj, tags);
+      if (!match(filter.s3_filter.tag_filter, tags)) {
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+// reserve space for one notification on every topic configured on the
+// bucket whose filter matches the event. for persistent topics a 2pc
+// reservation is made in the queue; the reservation ids are recorded in
+// res.topics for the later publish_commit/publish_abort.
+// returns 0 on success, -ERR_RATE_LIMITED when a queue is full
+int publish_reserve(const DoutPrefixProvider* dpp,
+                    EventType event_type,
+                    reservation_t& res,
+                    const RGWObjTags* req_tags)
+{
+  const RGWPubSub ps(res.store, res.user_tenant);
+  const RGWPubSub::Bucket ps_bucket(ps, res.bucket);
+  rgw_pubsub_bucket_topics bucket_topics;
+  auto rc = ps_bucket.get_topics(res.dpp, bucket_topics, res.yield);
+  if (rc < 0) {
+    // failed to fetch bucket topics
+    return rc;
+  }
+  for (const auto& bucket_topic : bucket_topics.topics) {
+    const rgw_pubsub_topic_filter& topic_filter = bucket_topic.second;
+    const rgw_pubsub_topic& topic_cfg = topic_filter.topic;
+    if (!notification_match(res, topic_filter, event_type, req_tags)) {
+      // notification does not apply to req_state
+      continue;
+    }
+    ldpp_dout(res.dpp, 20) << "INFO: notification: '" << topic_filter.s3_id <<
+      "' on topic: '" << topic_cfg.dest.arn_topic <<
+      "' and bucket: '" << res.bucket->get_name() <<
+      "' (unique topic: '" << topic_cfg.name <<
+      "') apply to event of type: '" << to_string(event_type) << "'" << dendl;
+
+    // initialize to NO_ID so that non-persistent topics do not store an
+    // indeterminate reservation id (it was previously left uninitialized)
+    cls_2pc_reservation::id_t res_id = cls_2pc_reservation::NO_ID;
+    if (topic_cfg.dest.persistent) {
+      // TODO: take default reservation size from conf
+      constexpr auto DEFAULT_RESERVATION = 4*1024U; // 4K
+      res.size = DEFAULT_RESERVATION;
+      librados::ObjectWriteOperation op;
+      bufferlist obl;
+      int rval;
+      const auto& queue_name = topic_cfg.dest.arn_topic;
+      cls_2pc_queue_reserve(op, res.size, 1, &obl, &rval);
+      auto ret = rgw_rados_operate(
+	res.dpp, res.store->getRados()->get_notif_pool_ctx(),
+	queue_name, &op, res.yield, librados::OPERATION_RETURNVEC);
+      if (ret < 0) {
+        ldpp_dout(res.dpp, 1) <<
+	  "ERROR: failed to reserve notification on queue: "
+	  << queue_name << ". error: " << ret << dendl;
+        // if no space is left in queue we ask client to slow down
+        return (ret == -ENOSPC) ? -ERR_RATE_LIMITED : ret;
+      }
+      ret = cls_2pc_queue_reserve_result(obl, res_id);
+      if (ret < 0) {
+        ldpp_dout(res.dpp, 1) << "ERROR: failed to parse reservation id. error: " << ret << dendl;
+        return ret;
+      }
+    }
+    res.topics.emplace_back(topic_filter.s3_id, topic_cfg, res_id);
+  }
+  return 0;
+}
+
+// commit all reservations made by publish_reserve: for persistent topics,
+// serialize the event into the queue (growing the reservation when the
+// encoded entry exceeds the reserved size); for non-persistent topics,
+// push the event synchronously to the endpoint.
+// returns 0 on success, or the first delivery/queue error
+int publish_commit(rgw::sal::Object* obj,
+		   uint64_t size,
+		   const ceph::real_time& mtime,
+		   const std::string& etag,
+		   const std::string& version,
+		   EventType event_type,
+		   reservation_t& res,
+		   const DoutPrefixProvider* dpp)
+{
+  for (auto& topic : res.topics) {
+    if (topic.cfg.dest.persistent &&
+	topic.res_id == cls_2pc_reservation::NO_ID) {
+      // nothing to commit or already committed/aborted
+      continue;
+    }
+    event_entry_t event_entry;
+    populate_event(res, obj, size, mtime, etag, version, event_type, event_entry.event);
+    event_entry.event.configurationId = topic.configurationId;
+    event_entry.event.opaque_data = topic.cfg.opaque_data;
+    if (topic.cfg.dest.persistent) {
+      // topic.cfg is a const member, so the previous std::move here
+      // degenerated to a copy anyway; make the copies explicit
+      event_entry.push_endpoint = topic.cfg.dest.push_endpoint;
+      event_entry.push_endpoint_args = topic.cfg.dest.push_endpoint_args;
+      event_entry.arn_topic = topic.cfg.dest.arn_topic;
+      bufferlist bl;
+      encode(event_entry, bl);
+      const auto& queue_name = topic.cfg.dest.arn_topic;
+      if (bl.length() > res.size) {
+        // try to make a larger reservation, fail only if this is not possible
+        ldpp_dout(dpp, 5) << "WARNING: committed size: " << bl.length()
+			  << " exceeded reserved size: " << res.size
+			  <<
+	  " . trying to make a larger reservation on queue:" << queue_name
+			  << dendl;
+        // first cancel the existing reservation
+        librados::ObjectWriteOperation op;
+        cls_2pc_queue_abort(op, topic.res_id);
+        auto ret = rgw_rados_operate(
+	  dpp, res.store->getRados()->get_notif_pool_ctx(),
+	  topic.cfg.dest.arn_topic, &op,
+	  res.yield);
+        if (ret < 0) {
+          ldpp_dout(dpp, 1) << "ERROR: failed to abort reservation: "
+			    << topic.res_id <<
+            " when trying to make a larger reservation on queue: " << queue_name
+			    << ". error: " << ret << dendl;
+          return ret;
+        }
+        // now try to make a bigger one
+        // NOTE(review): op is reused after the abort was submitted; this
+        // assumes the objecter consumed its op list - confirm against librados
+        buffer::list obl;
+        int rval;
+        cls_2pc_queue_reserve(op, bl.length(), 1, &obl, &rval);
+        ret = rgw_rados_operate(
+	  dpp, res.store->getRados()->get_notif_pool_ctx(),
+	  queue_name, &op, res.yield, librados::OPERATION_RETURNVEC);
+        if (ret < 0) {
+          ldpp_dout(dpp, 1) << "ERROR: failed to reserve extra space on queue: "
+			    << queue_name
+			    << ". error: " << ret << dendl;
+          // if no space is left in queue we ask client to slow down
+          return (ret == -ENOSPC) ? -ERR_RATE_LIMITED : ret;
+        }
+        ret = cls_2pc_queue_reserve_result(obl, topic.res_id);
+        if (ret < 0) {
+          ldpp_dout(dpp, 1) << "ERROR: failed to parse reservation id for "
+	    "extra space. error: " << ret << dendl;
+          return ret;
+        }
+      }
+      std::vector<buffer::list> bl_data_vec{std::move(bl)};
+      librados::ObjectWriteOperation op;
+      cls_2pc_queue_commit(op, bl_data_vec, topic.res_id);
+      const auto ret = rgw_rados_operate(
+	dpp, res.store->getRados()->get_notif_pool_ctx(),
+	queue_name, &op, res.yield);
+      // reservation is consumed regardless of the commit result
+      topic.res_id = cls_2pc_reservation::NO_ID;
+      if (ret < 0) {
+        ldpp_dout(dpp, 1) << "ERROR: failed to commit reservation to queue: "
+			  << queue_name << ". error: " << ret
+			  << dendl;
+        return ret;
+      }
+    } else {
+      try {
+        // TODO add endpoint LRU cache
+        const auto push_endpoint = RGWPubSubEndpoint::create(
+	  topic.cfg.dest.push_endpoint,
+	  topic.cfg.dest.arn_topic,
+	  RGWHTTPArgs(topic.cfg.dest.push_endpoint_args, dpp),
+	  dpp->get_cct());
+        ldpp_dout(res.dpp, 20) << "INFO: push endpoint created: "
+			       << topic.cfg.dest.push_endpoint << dendl;
+        const auto ret = push_endpoint->send_to_completion_async(
+	  dpp->get_cct(), event_entry.event, res.yield);
+        if (ret < 0) {
+          ldpp_dout(dpp, 1) << "ERROR: push to endpoint "
+			    << topic.cfg.dest.push_endpoint
+			    << " failed. error: " << ret << dendl;
+          if (perfcounter) perfcounter->inc(l_rgw_pubsub_push_failed);
+          return ret;
+        }
+        if (perfcounter) perfcounter->inc(l_rgw_pubsub_push_ok);
+      } catch (const RGWPubSubEndpoint::configuration_error& e) {
+        ldpp_dout(dpp, 1) << "ERROR: failed to create push endpoint: "
+			  << topic.cfg.dest.push_endpoint << ". error: " << e.what() << dendl;
+        if (perfcounter) perfcounter->inc(l_rgw_pubsub_push_failed);
+        return -EINVAL;
+      }
+    }
+  }
+  return 0;
+}
+
+// abort all outstanding 2pc reservations held by this reservation object.
+// stops and returns the error on the first failed abort
+int publish_abort(reservation_t& res) {
+  for (auto& topic : res.topics) {
+    const auto pending = topic.cfg.dest.persistent &&
+      topic.res_id != cls_2pc_reservation::NO_ID;
+    if (!pending) {
+      // nothing to abort or already committed/aborted
+      continue;
+    }
+    const auto& queue_name = topic.cfg.dest.arn_topic;
+    librados::ObjectWriteOperation op;
+    cls_2pc_queue_abort(op, topic.res_id);
+    const auto ret = rgw_rados_operate(
+      res.dpp, res.store->getRados()->get_notif_pool_ctx(),
+      queue_name, &op, res.yield);
+    if (ret < 0) {
+      ldpp_dout(res.dpp, 1) << "ERROR: failed to abort reservation: "
+        << topic.res_id <<
+        " from queue: " << queue_name << ". error: " << ret << dendl;
+      return ret;
+    }
+    topic.res_id = cls_2pc_reservation::NO_ID;
+  }
+  return 0;
+}
+
+// ctor for reservations created in a request context:
+// most fields are taken from the req_state
+reservation_t::reservation_t(const DoutPrefixProvider* _dpp,
+			     rgw::sal::RadosStore* _store,
+			     const req_state* _s,
+			     rgw::sal::Object* _object,
+			     rgw::sal::Object* _src_object,
+			     const std::string* _object_name,
+			     optional_yield y) :
+  // NOTE(review): the req_state is used as the log prefix provider here,
+  // leaving _dpp unused - confirm this is intentional
+  dpp(_s), store(_store), s(_s), size(0) /* XXX */,
+  object(_object), src_object(_src_object), bucket(_s->bucket.get()),
+  object_name(_object_name),
+  tagset(_s->tagset),
+  metadata_fetched_from_attributes(false),
+  user_id(_s->user->get_id().id),
+  user_tenant(_s->user->get_id().tenant),
+  req_id(_s->req_id),
+  yield(y)
+{
+  // pre-populate the metadata map with the request's x-amz-meta- headers
+  filter_amz_meta(x_meta_map, _s->info.x_meta_map);
+}
+
+// ctor for reservations created outside of a request context (e.g.
+// lifecycle/sync): caller provides identity and bucket explicitly.
+// note: s is nullptr and tagset stays empty, so tags/metadata can only
+// come from the object's attributes
+reservation_t::reservation_t(const DoutPrefixProvider* _dpp,
+			     rgw::sal::RadosStore* _store,
+			     rgw::sal::Object* _object,
+			     rgw::sal::Object* _src_object,
+			     rgw::sal::Bucket* _bucket,
+			     const std::string& _user_id,
+			     const std::string& _user_tenant,
+			     const std::string& _req_id,
+			     optional_yield y) :
+  dpp(_dpp), store(_store), s(nullptr), size(0) /* XXX */,
+  object(_object), src_object(_src_object), bucket(_bucket),
+  object_name(nullptr),
+  metadata_fetched_from_attributes(false),
+  user_id(_user_id),
+  user_tenant(_user_tenant),
+  req_id(_req_id),
+  yield(y)
+{}
+
+// dtor: abort any reservations that were not explicitly committed/aborted
+reservation_t::~reservation_t() {
+  publish_abort(*this);
+}
+
+} // namespace rgw::notify
diff --git a/src/rgw/driver/rados/rgw_notify.h b/src/rgw/driver/rados/rgw_notify.h
new file mode 100644
index 000000000..9269611e4
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_notify.h
@@ -0,0 +1,121 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <string>
+#include "common/ceph_time.h"
+#include "include/common_fwd.h"
+#include "rgw_notify_event_type.h"
+#include "common/async/yield_context.h"
+#include "cls/2pc_queue/cls_2pc_queue_types.h"
+#include "rgw_pubsub.h"
+
+// forward declarations
+namespace rgw::sal {
+ class RadosStore;
+ class RGWObject;
+}
+
+class RGWRados;
+struct rgw_obj_key;
+
+namespace rgw::notify {
+
+// initialize the notification manager
+// the notification manager dequeues the 2-phase-commit queues
+// and sends the notifications to the endpoints
+bool init(CephContext* cct, rgw::sal::RadosStore* store, const DoutPrefixProvider *dpp);
+
+// shutdown the notification manager
+void shutdown();
+
+// create persistent delivery queue for a topic (endpoint)
+// this operation also adds the topic name to the common (to all RGWs) list of all topics
+int add_persistent_topic(const std::string& topic_name, optional_yield y);
+
+// remove persistent delivery queue for a topic (endpoint)
+// this operation also removes the topic name from the common (to all RGWs) list of all topics
+int remove_persistent_topic(const std::string& topic_name, optional_yield y);
+
+// same as the above, except that you need to provide the IoCtx; the above uses rgw::notify::Manager::rados_ioctx
+int remove_persistent_topic(const DoutPrefixProvider* dpp, librados::IoCtx& rados_ioctx, const std::string& topic_name, optional_yield y);
+
+// struct holding reservation information
+// populated in the publish_reserve call
+// then used to commit or abort the reservation
+struct reservation_t {
+ struct topic_t { // one pending 2pc-queue reservation on a single topic
+ topic_t(const std::string& _configurationId, const rgw_pubsub_topic& _cfg,
+ cls_2pc_reservation::id_t _res_id) :
+ configurationId(_configurationId), cfg(_cfg), res_id(_res_id) {}
+
+ const std::string configurationId;
+ const rgw_pubsub_topic cfg;
+ // res_id is reset after topic is committed/aborted
+ cls_2pc_reservation::id_t res_id;
+ };
+
+ const DoutPrefixProvider* const dpp;
+ std::vector<topic_t> topics; // populated by publish_reserve; one entry per matching topic
+ rgw::sal::RadosStore* const store;
+ const req_state* const s; // null when constructed via the non-request ctor
+ size_t size;
+ rgw::sal::Object* const object;
+ rgw::sal::Object* const src_object; // may differ from object
+ rgw::sal::Bucket* const bucket;
+ const std::string* const object_name; // optional override; null when object itself carries the name
+ boost::optional<const RGWObjTags&> tagset;
+ meta_map_t x_meta_map; // metadata cached by value
+ bool metadata_fetched_from_attributes;
+ const std::string user_id;
+ const std::string user_tenant;
+ const std::string req_id;
+ optional_yield yield;
+
+ /* ctor for rgw_op callers */
+ reservation_t(const DoutPrefixProvider* _dpp,
+ rgw::sal::RadosStore* _store,
+ const req_state* _s,
+ rgw::sal::Object* _object,
+ rgw::sal::Object* _src_object,
+ const std::string* _object_name,
+ optional_yield y);
+
+ /* ctor for non-request caller (e.g., lifecycle) */
+ reservation_t(const DoutPrefixProvider* _dpp,
+ rgw::sal::RadosStore* _store,
+ rgw::sal::Object* _object,
+ rgw::sal::Object* _src_object,
+ rgw::sal::Bucket* _bucket,
+ const std::string& _user_id,
+ const std::string& _user_tenant,
+ const std::string& _req_id,
+ optional_yield y);
+
+ // dtor doing resource leak guarding
+ // aborting the reservation if not already committed or aborted
+ ~reservation_t();
+};
+
+// create a reservation on the 2-phase-commit queue
+ int publish_reserve(const DoutPrefixProvider *dpp,
+ EventType event_type,
+ reservation_t& reservation,
+ const RGWObjTags* req_tags);
+
+// commit the reservation to the queue
+int publish_commit(rgw::sal::Object* obj,
+ uint64_t size,
+ const ceph::real_time& mtime,
+ const std::string& etag,
+ const std::string& version,
+ EventType event_type,
+ reservation_t& reservation,
+ const DoutPrefixProvider *dpp);
+
+// cancel the reservation
+int publish_abort(reservation_t& reservation);
+
+}
+
diff --git a/src/rgw/driver/rados/rgw_obj_manifest.cc b/src/rgw/driver/rados/rgw_obj_manifest.cc
new file mode 100644
index 000000000..92ade8120
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_obj_manifest.cc
@@ -0,0 +1,409 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_obj_manifest.h"
+
+#include "services/svc_zone.h"
+#include "rgw_rados.h"
+#include "rgw_bucket.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+int RGWObjManifest::generator::create_next(uint64_t ofs) // advance generator to absolute offset ofs; returns -EINVAL if ofs moves backwards
+{
+ if (ofs < last_ofs) /* only going forward */
+ return -EINVAL;
+
+ uint64_t max_head_size = manifest->get_max_head_size();
+
+ if (ofs < max_head_size) {
+ manifest->set_head_size(ofs); // still within the head: head size tracks ofs
+ }
+
+ if (ofs >= max_head_size) {
+ manifest->set_head_size(max_head_size); // head is full; the rest lives in tail stripes
+ cur_stripe = (ofs - max_head_size) / rule.stripe_max_size;
+ cur_stripe_size = rule.stripe_max_size;
+
+ if (cur_part_id == 0 && max_head_size > 0) {
+ cur_stripe++; // in part 0 the head occupies stripe 0, so tail stripes start at 1
+ }
+ }
+
+ last_ofs = ofs;
+ manifest->set_obj_size(ofs);
+
+ manifest->get_implicit_location(cur_part_id, cur_stripe, ofs, NULL, &cur_obj); // recompute cur_obj for the new position
+
+ return 0;
+}
+
+int RGWObjManifest::append(const DoutPrefixProvider *dpp, RGWObjManifest& m, const RGWZoneGroup& zonegroup, // append manifest m after this one, merging compatible rules
+ const RGWZoneParams& zone_params)
+{
+ if (explicit_objs || m.explicit_objs) {
+ return append_explicit(dpp, m, zonegroup, zone_params); // either side is an old explicit-objs manifest: fall back to explicit merge
+ }
+
+ if (rules.empty()) {
+ *this = m; // nothing on our side yet; just take m wholesale
+ return 0;
+ }
+
+ string override_prefix; // NOTE(review): assigned below but never read; next_rule_prefix is used instead
+
+ if (prefix.empty()) {
+ prefix = m.prefix;
+ }
+
+ if (prefix != m.prefix) {
+ override_prefix = m.prefix;
+ }
+
+ map<uint64_t, RGWObjManifestRule>::iterator miter = m.rules.begin();
+ if (miter == m.rules.end()) {
+ return append_explicit(dpp, m, zonegroup, zone_params);
+ }
+
+ for (; miter != m.rules.end(); ++miter) {
+ map<uint64_t, RGWObjManifestRule>::reverse_iterator last_rule = rules.rbegin(); // compare m's rule against our last rule
+
+ RGWObjManifestRule& rule = last_rule->second;
+
+ if (rule.part_size == 0) {
+ rule.part_size = obj_size - rule.start_ofs; // pin down an open-ended part size before comparing
+ }
+
+ RGWObjManifestRule& next_rule = miter->second;
+ if (!next_rule.part_size) {
+ next_rule.part_size = m.obj_size - next_rule.start_ofs;
+ }
+
+ string rule_prefix = prefix;
+ if (!rule.override_prefix.empty()) {
+ rule_prefix = rule.override_prefix;
+ }
+
+ string next_rule_prefix = m.prefix;
+ if (!next_rule.override_prefix.empty()) {
+ next_rule_prefix = next_rule.override_prefix;
+ }
+
+ if (rule.part_size != next_rule.part_size ||
+ rule.stripe_max_size != next_rule.stripe_max_size ||
+ rule_prefix != next_rule_prefix) { // rules differ: cannot merge, copy the remainder of m's rules shifted by our size
+ if (next_rule_prefix != prefix) {
+ append_rules(m, miter, &next_rule_prefix);
+ } else {
+ append_rules(m, miter, NULL);
+ }
+ break;
+ }
+
+ uint64_t expected_part_num = rule.start_part_num + 1;
+ if (rule.part_size > 0) {
+ expected_part_num = rule.start_part_num + (obj_size + next_rule.start_ofs - rule.start_ofs) / rule.part_size;
+ }
+
+ if (expected_part_num != next_rule.start_part_num) { // part numbering not contiguous: copy remaining rules as-is
+ append_rules(m, miter, NULL);
+ break;
+ }
+ }
+
+ set_obj_size(obj_size + m.obj_size);
+
+ return 0;
+}
+
+void RGWObjManifest::append_rules(RGWObjManifest& m, map<uint64_t, RGWObjManifestRule>::iterator& miter, // copy m's rules from miter onward, shifted past our current obj_size
+ string *override_prefix)
+{
+ for (; miter != m.rules.end(); ++miter) {
+ RGWObjManifestRule rule = miter->second; // copy by value: the shifted start_ofs must not touch m
+ rule.start_ofs += obj_size;
+ if (override_prefix)
+ rule.override_prefix = *override_prefix; // stamp the foreign prefix so tail objects resolve correctly
+ rules[rule.start_ofs] = rule;
+ }
+}
+
+void RGWObjManifest::convert_to_explicit(const DoutPrefixProvider *dpp, const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params) // rewrite a rule-based manifest as an explicit per-stripe object list
+{
+ if (explicit_objs) {
+ return; // already explicit, nothing to do
+ }
+ obj_iterator iter = obj_begin(dpp);
+
+ while (iter != obj_end(dpp)) {
+ RGWObjManifestPart& part = objs[iter.get_stripe_ofs()]; // one explicit part per stripe, keyed by stripe start offset
+ const rgw_obj_select& os = iter.get_location();
+ const rgw_raw_obj& raw_loc = os.get_raw_obj(zonegroup, zone_params);
+ part.loc_ofs = 0;
+
+ uint64_t ofs = iter.get_stripe_ofs();
+
+ if (ofs == 0) {
+ part.loc = obj; // stripe 0 is the head object itself
+ } else {
+ RGWSI_Tier_RADOS::raw_obj_to_obj(tail_placement.bucket, raw_loc, &part.loc);
+ }
+ ++iter;
+ uint64_t next_ofs = iter.get_stripe_ofs(); // at end(), this is obj_size, giving the last part its tail length
+
+ part.size = next_ofs - ofs;
+ }
+
+ explicit_objs = true; // drop the rule machinery now that parts are materialized
+ rules.clear();
+ prefix.clear();
+}
+
+int RGWObjManifest::append_explicit(const DoutPrefixProvider *dpp, RGWObjManifest& m, const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params) // merge m into this after converting both sides to explicit-objs form
+{
+ if (!explicit_objs) {
+ convert_to_explicit(dpp, zonegroup, zone_params);
+ }
+ if (!m.explicit_objs) {
+ m.convert_to_explicit(dpp, zonegroup, zone_params); // note: mutates the argument manifest
+ }
+ map<uint64_t, RGWObjManifestPart>::iterator iter;
+ uint64_t base = obj_size; // m's parts are re-keyed past the end of our data
+ for (iter = m.objs.begin(); iter != m.objs.end(); ++iter) {
+ RGWObjManifestPart& part = iter->second;
+ objs[base + iter->first] = part;
+ }
+ obj_size += m.obj_size;
+
+ return 0;
+}
+
+bool RGWObjManifest::get_rule(uint64_t ofs, RGWObjManifestRule *rule) // find the rule covering offset ofs; false only when there are no rules
+{
+ if (rules.empty()) {
+ return false;
+ }
+
+ map<uint64_t, RGWObjManifestRule>::iterator iter = rules.upper_bound(ofs); // first rule starting strictly after ofs...
+ if (iter != rules.begin()) {
+ --iter; // ...so the previous one is the rule that contains ofs
+ }
+
+ *rule = iter->second;
+
+ return true;
+}
+
+int RGWObjManifest::generator::create_begin(CephContext *cct, RGWObjManifest *_m, // initialize generator state for manifest _m; returns -EIO if no rule at ofs 0
+ const rgw_placement_rule& head_placement_rule,
+ const rgw_placement_rule *tail_placement_rule,
+ const rgw_bucket& _b, const rgw_obj& _obj)
+{
+ manifest = _m;
+
+ if (!tail_placement_rule) {
+ manifest->set_tail_placement(head_placement_rule, _b); // no explicit tail rule: tail follows the head placement
+ } else {
+ rgw_placement_rule new_tail_rule = *tail_placement_rule;
+ new_tail_rule.inherit_from(head_placement_rule); // fill unset fields of the tail rule from the head rule
+ manifest->set_tail_placement(new_tail_rule, _b);
+ }
+
+ manifest->set_head(head_placement_rule, _obj, 0);
+ last_ofs = 0;
+
+ if (manifest->get_prefix().empty()) {
+ char buf[33];
+ gen_rand_alphanumeric(cct, buf, sizeof(buf) - 1); // random 32-char tail prefix: ".<rand>_"
+
+ string oid_prefix = ".";
+ oid_prefix.append(buf);
+ oid_prefix.append("_");
+
+ manifest->set_prefix(oid_prefix);
+ }
+
+ bool found = manifest->get_rule(0, &rule);
+ if (!found) {
+ derr << "ERROR: manifest->get_rule() could not find rule" << dendl;
+ return -EIO;
+ }
+
+ uint64_t head_size = manifest->get_head_size();
+
+ if (head_size > 0) {
+ cur_stripe_size = head_size; // first stripe is the head
+ } else {
+ cur_stripe_size = rule.stripe_max_size;
+ }
+
+ cur_part_id = rule.start_part_num;
+
+ manifest->get_implicit_location(cur_part_id, cur_stripe, 0, NULL, &cur_obj);
+
+ // normal object (not created via a copy): tail instance matches the head object's instance
+ manifest->set_tail_instance(_obj.key.instance);
+
+ return 0;
+}
+
+void RGWObjManifestPart::generate_test_instances(std::list<RGWObjManifestPart*>& o) // encode/decode test fixtures: one default, one populated part
+{
+ o.push_back(new RGWObjManifestPart);
+
+ RGWObjManifestPart *p = new RGWObjManifestPart;
+ rgw_bucket b;
+ init_bucket(&b, "tenant", "bucket", ".pool", ".index_pool", "marker_", "12");
+
+ p->loc = rgw_obj(b, "object");
+ p->loc_ofs = 512 * 1024;
+ p->size = 128 * 1024;
+ o.push_back(p);
+}
+
+void RGWObjManifest::generate_test_instances(std::list<RGWObjManifest*>& o) // encode/decode test fixtures: one 10-part explicit manifest, one default
+{
+ RGWObjManifest *m = new RGWObjManifest;
+ map<uint64_t, RGWObjManifestPart> objs;
+ uint64_t total_size = 0;
+ for (int i = 0; i<10; i++) {
+ RGWObjManifestPart p;
+ rgw_bucket b;
+ init_bucket(&b, "tenant", "bucket", ".pool", ".index_pool", "marker_", "12");
+ p.loc = rgw_obj(b, "object");
+ p.loc_ofs = 0;
+ p.size = 512 * 1024;
+ total_size += p.size;
+ objs[total_size] = p; // NOTE(review): keys are cumulative END offsets (first key 512K, not 0) -- fine for codec tests, but not the start-offset layout real manifests use
+ }
+ m->set_explicit(total_size, objs);
+ o.push_back(m);
+ o.push_back(new RGWObjManifest);
+}
+
+void RGWObjManifestPart::dump(Formatter *f) const // format one explicit part (location + offset + size) for admin/debug output
+{
+ f->open_object_section("loc");
+ loc.dump(f);
+ f->close_section();
+ f->dump_unsigned("loc_ofs", loc_ofs);
+ f->dump_unsigned("size", size);
+}
+
+void RGWObjManifest::obj_iterator::dump(Formatter *f) const // dump the iterator's position/bookkeeping fields for debugging
+{
+ f->dump_unsigned("part_ofs", part_ofs);
+ f->dump_unsigned("stripe_ofs", stripe_ofs);
+ f->dump_unsigned("ofs", ofs);
+ f->dump_unsigned("stripe_size", stripe_size);
+ f->dump_int("cur_part_id", cur_part_id);
+ f->dump_int("cur_stripe", cur_stripe);
+ f->dump_string("cur_override_prefix", cur_override_prefix);
+ f->dump_object("location", location);
+}
+
+void RGWObjManifest::dump(Formatter *f) const // dump the whole manifest (explicit parts, rules, placement, tier) for admin/debug output
+{
+ map<uint64_t, RGWObjManifestPart>::const_iterator iter = objs.begin();
+ f->open_array_section("objs");
+ for (; iter != objs.end(); ++iter) {
+ f->dump_unsigned("ofs", iter->first);
+ f->open_object_section("part");
+ iter->second.dump(f);
+ f->close_section();
+ }
+ f->close_section();
+ f->dump_unsigned("obj_size", obj_size);
+ ::encode_json("explicit_objs", explicit_objs, f);
+ ::encode_json("head_size", head_size, f);
+ ::encode_json("max_head_size", max_head_size, f);
+ ::encode_json("prefix", prefix, f);
+ ::encode_json("rules", rules, f);
+ ::encode_json("tail_instance", tail_instance, f);
+ ::encode_json("tail_placement", tail_placement, f);
+ ::encode_json("tier_type", tier_type, f);
+
+ if (tier_type == "cloud-s3") {
+ ::encode_json("tier_config", tier_config, f); // tier_config is only meaningful for the cloud-s3 tier type
+ }
+
+ // nullptr is passed as the DoutPrefixProvider since there
+ // is no cct here, and nothing we do with these
+ // iterators would write to the log
+ f->dump_object("begin_iter", obj_begin(nullptr));
+ f->dump_object("end_iter", obj_end(nullptr));
+}
+
+void RGWObjManifestRule::dump(Formatter *f) const // JSON-format one striping rule
+{
+ encode_json("start_part_num", start_part_num, f);
+ encode_json("start_ofs", start_ofs, f);
+ encode_json("part_size", part_size, f);
+ encode_json("stripe_max_size", stripe_max_size, f);
+ encode_json("override_prefix", override_prefix, f);
+}
+
+void rgw_obj_select::dump(Formatter *f) const // dump both representations plus the is_raw discriminator
+{
+ f->dump_string("placement_rule", placement_rule.to_str());
+ f->dump_object("obj", obj);
+ f->dump_object("raw_obj", raw_obj);
+ f->dump_bool("is_raw", is_raw);
+}
+
+void RGWObjTier::dump(Formatter *f) const // JSON-format the cloud-tier transition config
+{
+ encode_json("name", name, f);
+ encode_json("tier_placement", tier_placement, f);
+ encode_json("is_multipart_upload", is_multipart_upload, f);
+}
+
+// returns true on success, false on failure
+// returns true on success, false on failure
+static bool rgw_get_obj_data_pool(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params, // resolve the data pool for obj, falling back to the zonegroup default placement
+ const rgw_placement_rule& head_placement_rule,
+ const rgw_obj& obj, rgw_pool *pool)
+{
+ if (!zone_params.get_head_data_pool(head_placement_rule, obj, pool)) {
+ RGWZonePlacementInfo placement; // head placement lookup failed: use the zonegroup default placement instead
+ if (!zone_params.get_placement(zonegroup.default_placement.name, &placement)) {
+ return false;
+ }
+
+ if (!obj.in_extra_data) {
+ *pool = placement.get_data_pool(zonegroup.default_placement.storage_class);
+ } else {
+ *pool = placement.get_data_extra_pool(); // extra-data objects (e.g. multipart meta) live in the extra pool
+ }
+ }
+
+ return true;
+}
+
+static bool rgw_obj_to_raw(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params, // translate a logical rgw_obj into a raw rados object (oid/loc/pool); returns false if no pool could be resolved
+ const rgw_placement_rule& head_placement_rule,
+ const rgw_obj& obj, rgw_raw_obj *raw_obj)
+{
+ get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
+
+ return rgw_get_obj_data_pool(zonegroup, zone_params, head_placement_rule, obj, &raw_obj->pool);
+}
+
+rgw_raw_obj rgw_obj_select::get_raw_obj(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params) const // return the raw object, translating from the logical form if needed
+{
+ if (!is_raw) {
+ rgw_raw_obj r;
+ rgw_obj_to_raw(zonegroup, zone_params, placement_rule, obj, &r); // NOTE(review): translation failure (false) is ignored; r keeps a default pool
+ return r;
+ }
+ return raw_obj;
+}
+
+// returns true on success, false on failure
+bool RGWRados::get_obj_data_pool(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_pool *pool)
+{
+ return rgw_get_obj_data_pool(svc.zone->get_zonegroup(), svc.zone->get_zone_params(), placement_rule, obj, pool);
+}
+
diff --git a/src/rgw/driver/rados/rgw_obj_manifest.h b/src/rgw/driver/rados/rgw_obj_manifest.h
new file mode 100644
index 000000000..6984184aa
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_obj_manifest.h
@@ -0,0 +1,622 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+/* N.B., this header defines fundamental serialized types. Do not
+ * introduce changes or include files which can only be compiled in
+ * radosgw or OSD contexts (e.g., rgw_sal.h, rgw_common.h)
+ */
+
+#pragma once
+
+#include "rgw_zone_types.h"
+#include "rgw_bucket_types.h"
+#include "rgw_obj_types.h"
+#include "rgw_placement_types.h"
+
+#include "common/dout.h"
+#include "common/Formatter.h"
+
+class RGWSI_Zone;
+struct RGWZoneGroup;
+struct RGWZoneParams;
+class RGWRados;
+
+namespace rgw { namespace sal {
+ class RadosStore;
+} };
+
+class rgw_obj_select { // tagged either/or holder: a logical rgw_obj OR a raw rgw_raw_obj, discriminated by is_raw
+ rgw_placement_rule placement_rule;
+ rgw_obj obj;
+ rgw_raw_obj raw_obj;
+ bool is_raw;
+
+public:
+ rgw_obj_select() : is_raw(false) {}
+ explicit rgw_obj_select(const rgw_obj& _obj) : obj(_obj), is_raw(false) {}
+ explicit rgw_obj_select(const rgw_raw_obj& _raw_obj) : raw_obj(_raw_obj), is_raw(true) {}
+ rgw_obj_select(const rgw_obj_select& rhs) {
+ placement_rule = rhs.placement_rule;
+ is_raw = rhs.is_raw;
+ if (is_raw) { // only the active representation is copied; the inactive one stays default-constructed
+ raw_obj = rhs.raw_obj;
+ } else {
+ obj = rhs.obj;
+ }
+ }
+
+ rgw_raw_obj get_raw_obj(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params) const;
+ rgw_raw_obj get_raw_obj(RGWRados* store) const;
+
+ rgw_obj_select& operator=(const rgw_obj& rhs) { // assigning a logical obj flips the tag; placement_rule is kept
+ obj = rhs;
+ is_raw = false;
+ return *this;
+ }
+
+ rgw_obj_select& operator=(const rgw_raw_obj& rhs) {
+ raw_obj = rhs;
+ is_raw = true;
+ return *this;
+ }
+
+ void set_placement_rule(const rgw_placement_rule& rule) {
+ placement_rule = rule;
+ }
+ void dump(Formatter *f) const;
+};
+
+struct RGWObjManifestPart {
+ rgw_obj loc; /* the object where the data is located */
+ uint64_t loc_ofs; /* the offset at that object where the data is located */
+ uint64_t size; /* the part size */
+
+ RGWObjManifestPart() : loc_ofs(0), size(0) {}
+
+ void encode(bufferlist& bl) const { // serialized format v2 (compat v2)
+ ENCODE_START(2, 2, bl);
+ encode(loc, bl);
+ encode(loc_ofs, bl);
+ encode(size, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START_LEGACY_COMPAT_LEN_32(2, 2, 2, bl);
+ decode(loc, bl);
+ decode(loc_ofs, bl);
+ decode(size, bl);
+ DECODE_FINISH(bl);
+ }
+
+ void dump(Formatter *f) const;
+ static void generate_test_instances(std::list<RGWObjManifestPart*>& o);
+};
+WRITE_CLASS_ENCODER(RGWObjManifestPart)
+
+/*
+ The manifest defines a set of rules for structuring the object parts.
+ There are a few terms to note:
+ - head: the head part of the object, which is the part that contains
+ the first chunk of data. An object might not have a head (as in the
+ case of multipart-part objects).
+ - stripe: data portion of a single rgw object that resides on a single
+ rados object.
+ - part: a collection of stripes that make a contiguous part of an
+ object. A regular object will only have one part (although might have
+ many stripes), a multipart object might have many parts. Each part
+ has a fixed stripe size, although the last stripe of a part might
+ be smaller than that. Consecutive parts may be merged if their stripe
+ value is the same.
+*/
+
+struct RGWObjManifestRule { // describes striping for a contiguous range of the object starting at start_ofs
+ uint32_t start_part_num;
+ uint64_t start_ofs;
+ uint64_t part_size; /* each part size, 0 if there's no part size, meaning it's unlimited */
+ uint64_t stripe_max_size; /* underlying obj max size */
+ std::string override_prefix;
+
+ RGWObjManifestRule() : start_part_num(0), start_ofs(0), part_size(0), stripe_max_size(0) {}
+ RGWObjManifestRule(uint32_t _start_part_num, uint64_t _start_ofs, uint64_t _part_size, uint64_t _stripe_max_size) :
+ start_part_num(_start_part_num), start_ofs(_start_ofs), part_size(_part_size), stripe_max_size(_stripe_max_size) {}
+
+ void encode(bufferlist& bl) const { // v2 added override_prefix; compat stays v1
+ ENCODE_START(2, 1, bl);
+ encode(start_part_num, bl);
+ encode(start_ofs, bl);
+ encode(part_size, bl);
+ encode(stripe_max_size, bl);
+ encode(override_prefix, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(2, bl);
+ decode(start_part_num, bl);
+ decode(start_ofs, bl);
+ decode(part_size, bl);
+ decode(stripe_max_size, bl);
+ if (struct_v >= 2)
+ decode(override_prefix, bl); // v1 payloads have no prefix; member stays empty
+ DECODE_FINISH(bl);
+ }
+ void dump(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(RGWObjManifestRule)
+
+struct RGWObjTier { // cloud-tier transition target for an object ("none" when not tiered)
+ std::string name;
+ RGWZoneGroupPlacementTier tier_placement;
+ bool is_multipart_upload{false};
+
+ RGWObjTier(): name("none") {}
+
+ void encode(bufferlist& bl) const { // serialized format v2 (compat v2)
+ ENCODE_START(2, 2, bl);
+ encode(name, bl);
+ encode(tier_placement, bl);
+ encode(is_multipart_upload, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+ decode(name, bl);
+ decode(tier_placement, bl);
+ decode(is_multipart_upload, bl);
+ DECODE_FINISH(bl);
+ }
+ void dump(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(RGWObjTier)
+
+class RGWObjManifest { // describes how an RGW object's data maps onto rados objects (head + striped tail, or legacy explicit part list)
+protected:
+ bool explicit_objs{false}; /* really old manifest? */
+ std::map<uint64_t, RGWObjManifestPart> objs;
+
+ uint64_t obj_size{0};
+
+ rgw_obj obj;
+ uint64_t head_size{0};
+ rgw_placement_rule head_placement_rule;
+
+ uint64_t max_head_size{0};
+ std::string prefix;
+ rgw_bucket_placement tail_placement; /* might be different than the original bucket,
+ as object might have been copied across pools */
+ std::map<uint64_t, RGWObjManifestRule> rules;
+
+ std::string tail_instance; /* tail object's instance */
+
+ std::string tier_type;
+ RGWObjTier tier_config;
+
+ void convert_to_explicit(const DoutPrefixProvider *dpp, const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params);
+ int append_explicit(const DoutPrefixProvider *dpp, RGWObjManifest& m, const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params);
+ void append_rules(RGWObjManifest& m, std::map<uint64_t, RGWObjManifestRule>::iterator& iter, std::string *override_prefix);
+
+public:
+
+ RGWObjManifest() = default;
+ RGWObjManifest(const RGWObjManifest& rhs) {
+ *this = rhs;
+ }
+ RGWObjManifest& operator=(const RGWObjManifest& rhs) {
+ explicit_objs = rhs.explicit_objs;
+ objs = rhs.objs;
+ obj_size = rhs.obj_size;
+ obj = rhs.obj;
+ head_size = rhs.head_size; head_placement_rule = rhs.head_placement_rule; // fix: head_placement_rule was not copied, so copy/assign silently dropped it even though encode() persists it
+ max_head_size = rhs.max_head_size;
+ prefix = rhs.prefix;
+ tail_placement = rhs.tail_placement;
+ rules = rhs.rules;
+ tail_instance = rhs.tail_instance;
+ tier_type = rhs.tier_type;
+ tier_config = rhs.tier_config;
+ return *this;
+ }
+
+ std::map<uint64_t, RGWObjManifestPart>& get_explicit_objs() {
+ return objs;
+ }
+
+
+ void set_explicit(uint64_t _size, std::map<uint64_t, RGWObjManifestPart>& _objs) { // switch to explicit mode, taking ownership of _objs (swapped out)
+ explicit_objs = true;
+ objs.swap(_objs);
+ set_obj_size(_size);
+ }
+
+ void get_implicit_location(uint64_t cur_part_id, uint64_t cur_stripe, uint64_t ofs,
+ std::string *override_prefix, rgw_obj_select *location) const;
+
+ void set_trivial_rule(uint64_t tail_ofs, uint64_t stripe_max_size) { // single-rule layout: head up to tail_ofs, then fixed-size stripes
+ RGWObjManifestRule rule(0, tail_ofs, 0, stripe_max_size);
+ rules[0] = rule;
+ max_head_size = tail_ofs;
+ }
+
+ void set_multipart_part_rule(uint64_t stripe_max_size, uint64_t part_num) { // multipart parts have no head (max_head_size = 0)
+ RGWObjManifestRule rule(0, 0, 0, stripe_max_size);
+ rule.start_part_num = part_num;
+ rules[0] = rule;
+ max_head_size = 0;
+ }
+
+ void encode(bufferlist& bl) const { // serialized format v8 (compat v6); tail bucket/instance are only written when they differ from the head's
+ ENCODE_START(8, 6, bl);
+ encode(obj_size, bl);
+ encode(objs, bl);
+ encode(explicit_objs, bl);
+ encode(obj, bl);
+ encode(head_size, bl);
+ encode(max_head_size, bl);
+ encode(prefix, bl);
+ encode(rules, bl);
+ bool encode_tail_bucket = !(tail_placement.bucket == obj.bucket);
+ encode(encode_tail_bucket, bl);
+ if (encode_tail_bucket) {
+ encode(tail_placement.bucket, bl);
+ }
+ bool encode_tail_instance = (tail_instance != obj.key.instance);
+ encode(encode_tail_instance, bl);
+ if (encode_tail_instance) {
+ encode(tail_instance, bl);
+ }
+ encode(head_placement_rule, bl);
+ encode(tail_placement.placement_rule, bl);
+ encode(tier_type, bl);
+ encode(tier_config, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) { // handles every historical format; fields absent in old versions are reconstructed from the head object
+ DECODE_START_LEGACY_COMPAT_LEN_32(7, 2, 2, bl);
+ decode(obj_size, bl);
+ decode(objs, bl);
+ if (struct_v >= 3) {
+ decode(explicit_objs, bl);
+ decode(obj, bl);
+ decode(head_size, bl);
+ decode(max_head_size, bl);
+ decode(prefix, bl);
+ decode(rules, bl);
+ } else {
+ explicit_objs = true; // pre-v3 manifests were always explicit
+ if (!objs.empty()) {
+ std::map<uint64_t, RGWObjManifestPart>::iterator iter = objs.begin();
+ obj = iter->second.loc;
+ head_size = iter->second.size;
+ max_head_size = head_size;
+ }
+ }
+
+ if (explicit_objs && head_size > 0 && !objs.empty()) {
+ /* patch up manifest due to issue 16435:
+ * the first object in the explicit objs list might not be the one we need to access, use the
+ * head object instead if set. This would happen if we had an old object that was created
+ * when the explicit objs manifest was around, and it got copied.
+ */
+ rgw_obj& obj_0 = objs[0].loc;
+ if (!obj_0.get_oid().empty() && obj_0.key.ns.empty()) {
+ objs[0].loc = obj;
+ objs[0].size = head_size;
+ }
+ }
+
+ if (struct_v >= 4) {
+ if (struct_v < 6) {
+ decode(tail_placement.bucket, bl);
+ } else {
+ bool need_to_decode; // v6+: bucket only present when it differs from the head's bucket
+ decode(need_to_decode, bl);
+ if (need_to_decode) {
+ decode(tail_placement.bucket, bl);
+ } else {
+ tail_placement.bucket = obj.bucket;
+ }
+ }
+ }
+
+ if (struct_v >= 5) {
+ if (struct_v < 6) {
+ decode(tail_instance, bl);
+ } else {
+ bool need_to_decode; // v6+: instance only present when it differs from the head's instance
+ decode(need_to_decode, bl);
+ if (need_to_decode) {
+ decode(tail_instance, bl);
+ } else {
+ tail_instance = obj.key.instance;
+ }
+ }
+ } else { // old object created before 'tail_instance' field added to manifest
+ tail_instance = obj.key.instance;
+ }
+
+ if (struct_v >= 7) {
+ decode(head_placement_rule, bl);
+ decode(tail_placement.placement_rule, bl);
+ }
+
+ if (struct_v >= 8) {
+ decode(tier_type, bl);
+ decode(tier_config, bl);
+ }
+
+ DECODE_FINISH(bl);
+ }
+
+ void dump(Formatter *f) const;
+ static void generate_test_instances(std::list<RGWObjManifest*>& o);
+
+ int append(const DoutPrefixProvider *dpp, RGWObjManifest& m, const RGWZoneGroup& zonegroup,
+ const RGWZoneParams& zone_params);
+
+ bool get_rule(uint64_t ofs, RGWObjManifestRule *rule);
+
+ bool empty() const {
+ if (explicit_objs)
+ return objs.empty();
+ return rules.empty();
+ }
+
+ bool has_explicit_objs() const {
+ return explicit_objs;
+ }
+
+ bool has_tail() const { // true when any data lives outside the head object
+ if (explicit_objs) {
+ if (objs.size() == 1) {
+ auto iter = objs.begin();
+ const rgw_obj& o = iter->second.loc;
+ return !(obj == o);
+ }
+ return (objs.size() >= 2);
+ }
+ return (obj_size > head_size);
+ }
+
+ void set_head(const rgw_placement_rule& placement_rule, const rgw_obj& _o, uint64_t _s) {
+ head_placement_rule = placement_rule;
+ obj = _o;
+ head_size = _s;
+
+ if (explicit_objs && head_size > 0) { // keep part 0 in sync with the head in explicit mode
+ objs[0].loc = obj;
+ objs[0].size = head_size;
+ }
+ }
+
+ const rgw_obj& get_obj() const {
+ return obj;
+ }
+
+ void set_tail_placement(const rgw_placement_rule& placement_rule, const rgw_bucket& _b) {
+ tail_placement.placement_rule = placement_rule;
+ tail_placement.bucket = _b;
+ }
+
+ const rgw_bucket_placement& get_tail_placement() const {
+ return tail_placement;
+ }
+
+ const rgw_placement_rule& get_head_placement_rule() const {
+ return head_placement_rule;
+ }
+
+ void set_prefix(const std::string& _p) {
+ prefix = _p;
+ }
+
+ const std::string& get_prefix() const {
+ return prefix;
+ }
+
+ void set_tail_instance(const std::string& _ti) {
+ tail_instance = _ti;
+ }
+
+ const std::string& get_tail_instance() const {
+ return tail_instance;
+ }
+
+ void set_head_size(uint64_t _s) {
+ head_size = _s;
+ }
+
+ void set_obj_size(uint64_t s) {
+ obj_size = s;
+ }
+
+ uint64_t get_obj_size() const {
+ return obj_size;
+ }
+
+ uint64_t get_head_size() const {
+ return head_size;
+ }
+
+ uint64_t get_max_head_size() const {
+ return max_head_size;
+ }
+
+ const std::string& get_tier_type() {
+ return tier_type;
+ }
+
+ inline void set_tier_type(std::string value) {
+ /* Only "cloud-s3" tier-type is supported for now */
+ if (value == "cloud-s3") {
+ tier_type = value;
+ }
+ }
+
+ inline void set_tier_config(RGWObjTier t) {
+ /* Set only if tier_type set to "cloud-s3" */
+ if (tier_type != "cloud-s3")
+ return;
+
+ tier_config.name = t.name;
+ tier_config.tier_placement = t.tier_placement;
+ tier_config.is_multipart_upload = t.is_multipart_upload;
+ }
+
+ inline void get_tier_config(RGWObjTier* t) { // fix: return type was 'const void' -- a meaningless top-level const on void
+ if (tier_type != "cloud-s3")
+ return;
+
+ t->name = tier_config.name;
+ t->tier_placement = tier_config.tier_placement;
+ t->is_multipart_upload = tier_config.is_multipart_upload;
+ }
+
+ class obj_iterator { // forward iterator over the manifest's stripes; equality compares byte offsets only
+ const DoutPrefixProvider *dpp;
+ const RGWObjManifest *manifest = nullptr;
+ uint64_t part_ofs = 0; /* where current part starts */
+ uint64_t stripe_ofs = 0; /* where current stripe starts */
+ uint64_t ofs = 0; /* current position within the object */
+ uint64_t stripe_size = 0; /* current part size */
+
+ int cur_part_id = 0;
+ int cur_stripe = 0;
+ std::string cur_override_prefix;
+
+ rgw_obj_select location;
+
+ std::map<uint64_t, RGWObjManifestRule>::const_iterator rule_iter;
+ std::map<uint64_t, RGWObjManifestRule>::const_iterator next_rule_iter;
+ std::map<uint64_t, RGWObjManifestPart>::const_iterator explicit_iter;
+
+ void update_explicit_pos();
+
+ public:
+ obj_iterator() = default;
+ explicit obj_iterator(const DoutPrefixProvider *_dpp, const RGWObjManifest *_m)
+ : obj_iterator(_dpp, _m, 0)
+ {}
+ obj_iterator(const DoutPrefixProvider *_dpp, const RGWObjManifest *_m, uint64_t _ofs) : dpp(_dpp), manifest(_m) {
+ seek(_ofs);
+ }
+ void seek(uint64_t ofs);
+
+ void operator++();
+ bool operator==(const obj_iterator& rhs) const {
+ return (ofs == rhs.ofs);
+ }
+ bool operator!=(const obj_iterator& rhs) const {
+ return (ofs != rhs.ofs);
+ }
+ const rgw_obj_select& get_location() {
+ return location;
+ }
+
+ /* where current part starts */
+ uint64_t get_part_ofs() const {
+ return part_ofs;
+ }
+
+ /* start of current stripe */
+ uint64_t get_stripe_ofs() {
+ if (manifest->explicit_objs) {
+ return explicit_iter->first;
+ }
+ return stripe_ofs;
+ }
+
+ /* current ofs relative to start of rgw object */
+ uint64_t get_ofs() const {
+ return ofs;
+ }
+
+ int get_cur_part_id() const {
+ return cur_part_id;
+ }
+
+ /* stripe number */
+ int get_cur_stripe() const {
+ return cur_stripe;
+ }
+
+ /* current stripe size */
+ uint64_t get_stripe_size() {
+ if (manifest->explicit_objs) {
+ return explicit_iter->second.size;
+ }
+ return stripe_size;
+ }
+
+ /* offset where data starts within current stripe */
+ uint64_t location_ofs() {
+ if (manifest->explicit_objs) {
+ return explicit_iter->second.loc_ofs;
+ }
+ return 0; /* all stripes start at zero offset */
+ }
+
+ void update_location();
+
+ void dump(Formatter *f) const;
+ }; // class obj_iterator
+
+ obj_iterator obj_begin(const DoutPrefixProvider *dpp) const { return obj_iterator{dpp, this}; }
+ obj_iterator obj_end(const DoutPrefixProvider *dpp) const { return obj_iterator{dpp, this, obj_size}; }
+ obj_iterator obj_find(const DoutPrefixProvider *dpp, uint64_t ofs) const {
+ return obj_iterator{dpp, this, std::min(ofs, obj_size)}; // clamp to obj_size so a past-the-end seek lands on end()
+ }
+
+ /*
+ * simple object generator. Using a simple single rule manifest.
+ */
+ class generator {
+ RGWObjManifest *manifest;
+ uint64_t last_ofs;
+ uint64_t cur_part_ofs;
+ int cur_part_id;
+ int cur_stripe;
+ uint64_t cur_stripe_size;
+ std::string cur_oid;
+
+ std::string oid_prefix;
+
+ rgw_obj_select cur_obj;
+
+ RGWObjManifestRule rule;
+
+ public:
+ generator() : manifest(NULL), last_ofs(0), cur_part_ofs(0), cur_part_id(0),
+ cur_stripe(0), cur_stripe_size(0) {}
+ int create_begin(CephContext *cct, RGWObjManifest *manifest,
+ const rgw_placement_rule& head_placement_rule,
+ const rgw_placement_rule *tail_placement_rule,
+ const rgw_bucket& bucket,
+ const rgw_obj& obj);
+
+ int create_next(uint64_t ofs);
+
+ rgw_raw_obj get_cur_obj(RGWZoneGroup& zonegroup, RGWZoneParams& zone_params) { return cur_obj.get_raw_obj(zonegroup, zone_params); }
+ rgw_raw_obj get_cur_obj(RGWRados* store) const { return cur_obj.get_raw_obj(store); }
+
+ /* total max size of current stripe (including head obj) */
+ uint64_t cur_stripe_max_size() const {
+ return cur_stripe_size;
+ }
+ };
+};
+WRITE_CLASS_ENCODER(RGWObjManifest)
diff --git a/src/rgw/driver/rados/rgw_object_expirer_core.cc b/src/rgw/driver/rados/rgw_object_expirer_core.cc
new file mode 100644
index 000000000..ec1bf3fb6
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_object_expirer_core.cc
@@ -0,0 +1,442 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <errno.h>
+#include <iostream>
+#include <sstream>
+#include <string>
+
+
+#include "auth/Crypto.h"
+
+#include "common/armor.h"
+#include "common/ceph_json.h"
+#include "common/config.h"
+#include "common/ceph_argparse.h"
+#include "common/Formatter.h"
+#include "common/errno.h"
+
+#include "global/global_init.h"
+
+#include "include/utime.h"
+#include "include/str_list.h"
+
+#include "rgw_user.h"
+#include "rgw_bucket.h"
+#include "rgw_acl.h"
+#include "rgw_acl_s3.h"
+#include "rgw_log.h"
+#include "rgw_formats.h"
+#include "rgw_usage.h"
+#include "rgw_object_expirer_core.h"
+#include "rgw_zone.h"
+#include "rgw_sal_rados.h"
+
+#include "services/svc_rados.h"
+#include "services/svc_zone.h"
+#include "services/svc_sys_obj.h"
+#include "services/svc_bi_rados.h"
+
+#include "cls/lock/cls_lock_client.h"
+#include "cls/timeindex/cls_timeindex_client.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+static string objexp_lock_name = "gc_process";
+
// Build the rados object name for hint shard 'shard_num':
// "obj_delete_at_hint." followed by the shard number zero-padded to
// 10 digits (e.g. "obj_delete_at_hint.0000000005").
static std::string objexp_hint_get_shardname(int shard_num)
{
  std::ostringstream oss;
  oss << "obj_delete_at_hint." << std::setw(10) << std::setfill('0')
      << static_cast<unsigned>(shard_num);
  return oss.str();
}
+
// Map an object index key to one of 'num_shards' hint shards by hashing
// the concatenated "<name><instance>" with the bucket-index shard function,
// so hints for the same object always land in the same shard.
static int objexp_key_shard(const rgw_obj_index_key& key, int num_shards)
{
  string obj_key = key.name + key.instance;
  return RGWSI_BucketIndex_RADOS::bucket_shard_index(obj_key, num_shards);
}
+
+static string objexp_hint_get_keyext(const string& tenant_name,
+ const string& bucket_name,
+ const string& bucket_id,
+ const rgw_obj_key& obj_key) {
+ return tenant_name + (tenant_name.empty() ? "" : ":") + bucket_name + ":" + bucket_id +
+ ":" + obj_key.name + ":" + obj_key.instance;
+}
+
// Out-parameter wrapper mapping a shard number to its rados object name.
static void objexp_get_shard(int shard_num,
                             string *shard)
{
  *shard = objexp_hint_get_shardname(shard_num);
}
+
+static int objexp_hint_parse(const DoutPrefixProvider *dpp, CephContext *cct, cls_timeindex_entry &ti_entry,
+ objexp_hint_entry *hint_entry)
+{
+ try {
+ auto iter = ti_entry.value.cbegin();
+ decode(*hint_entry, iter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: couldn't decode avail_pools" << dendl;
+ }
+
+ return 0;
+}
+
// Record a deferred-deletion hint for an object expiring at 'delete_at'.
// The hint is encoded and appended to one of the time-indexed hint shard
// objects in the zone's log pool; the expirer worker later lists, applies
// and trims these entries.  Returns 0 on success, negative error otherwise.
int RGWObjExpStore::objexp_hint_add(const DoutPrefixProvider *dpp,
                                    const ceph::real_time& delete_at,
                                    const string& tenant_name,
                                    const string& bucket_name,
                                    const string& bucket_id,
                                    const rgw_obj_index_key& obj_key)
{
  const string keyext = objexp_hint_get_keyext(tenant_name, bucket_name,
                                               bucket_id, obj_key);
  objexp_hint_entry he = {
      .tenant = tenant_name,
      .bucket_name = bucket_name,
      .bucket_id = bucket_id,
      .obj_key = obj_key,
      .exp_time = delete_at };
  bufferlist hebl;
  encode(he, hebl);
  librados::ObjectWriteOperation op;
  cls_timeindex_add(op, utime_t(delete_at), keyext, hebl);

  // shard choice is derived from the object key, spreading load across
  // rgw_objexp_hints_num_shards shard objects
  string shard_name = objexp_hint_get_shardname(objexp_key_shard(obj_key, cct->_conf->rgw_objexp_hints_num_shards));
  auto obj = rados_svc->obj(rgw_raw_obj(driver->svc()->zone->get_zone_params().log_pool, shard_name));
  int r = obj.open(dpp);
  if (r < 0) {
    ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): failed to open obj=" << obj << " (r=" << r << ")" << dendl;
    return r;
  }
  return obj.operate(dpp, &op, null_yield);
}
+
// List up to 'max_entries' hint entries from shard object 'oid' whose
// timestamps lie between start_time and end_time, resuming after 'marker'.
// On success, 'out_marker' and 'truncated' describe how to continue the
// listing.  A missing shard object (-ENOENT) is treated as an empty,
// non-truncated listing and reported as success.
int RGWObjExpStore::objexp_hint_list(const DoutPrefixProvider *dpp,
                                     const string& oid,
                                     const ceph::real_time& start_time,
                                     const ceph::real_time& end_time,
                                     const int max_entries,
                                     const string& marker,
                                     list<cls_timeindex_entry>& entries, /* out */
                                     string *out_marker, /* out */
                                     bool *truncated) /* out */
{
  librados::ObjectReadOperation op;
  cls_timeindex_list(op, utime_t(start_time), utime_t(end_time), marker, max_entries, entries,
                     out_marker, truncated);

  auto obj = rados_svc->obj(rgw_raw_obj(driver->svc()->zone->get_zone_params().log_pool, oid));
  int r = obj.open(dpp);
  if (r < 0) {
    ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): failed to open obj=" << obj << " (r=" << r << ")" << dendl;
    return r;
  }
  bufferlist obl;
  int ret = obj.operate(dpp, &op, &obl, null_yield);

  if ((ret < 0 ) && (ret != -ENOENT)) {
    return ret;
  }

  // a shard object that was never written is equivalent to an empty listing
  if ((ret == -ENOENT) && truncated) {
    *truncated = false;
  }

  return 0;
}
+
// Repeatedly issue cls_timeindex_trim on 'oid' until the requested
// time/marker range is exhausted.  The cls op trims a bounded number of
// entries per call and signals completion with -ENODATA, which is mapped
// to success here.
static int cls_timeindex_trim_repeat(const DoutPrefixProvider *dpp,
                                     rgw_rados_ref ref,
                                     const string& oid,
                                     const utime_t& from_time,
                                     const utime_t& to_time,
                                     const string& from_marker,
                                     const string& to_marker)
{
  bool done = false;
  do {
    librados::ObjectWriteOperation op;
    cls_timeindex_trim(op, from_time, to_time, from_marker, to_marker);
    int r = rgw_rados_operate(dpp, ref.pool.ioctx(), oid, &op, null_yield);
    if (r == -ENODATA)
      done = true;
    else if (r < 0)
      return r;
  } while (!done);

  return 0;
}
+
// Trim already-applied hint entries from shard object 'oid' within the
// given time window and marker range.  A missing shard (-ENOENT) is
// treated as success.
int RGWObjExpStore::objexp_hint_trim(const DoutPrefixProvider *dpp,
                                     const string& oid,
                                     const ceph::real_time& start_time,
                                     const ceph::real_time& end_time,
                                     const string& from_marker,
                                     const string& to_marker)
{
  auto obj = rados_svc->obj(rgw_raw_obj(driver->svc()->zone->get_zone_params().log_pool, oid));
  int r = obj.open(dpp);
  if (r < 0) {
    ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): failed to open obj=" << obj << " (r=" << r << ")" << dendl;
    return r;
  }
  auto& ref = obj.get_ref();
  int ret = cls_timeindex_trim_repeat(dpp, ref, oid, utime_t(start_time), utime_t(end_time),
                                      from_marker, to_marker);
  if ((ret < 0 ) && (ret != -ENOENT)) {
    return ret;
  }

  return 0;
}
+
+int RGWObjectExpirer::garbage_single_object(const DoutPrefixProvider *dpp, objexp_hint_entry& hint)
+{
+ RGWBucketInfo bucket_info;
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+
+ int ret = driver->get_bucket(dpp, nullptr, rgw_bucket(hint.tenant, hint.bucket_name, hint.bucket_id), &bucket, null_yield);
+ if (-ENOENT == ret) {
+ ldpp_dout(dpp, 15) << "NOTICE: cannot find bucket = " \
+ << hint.bucket_name << ". The object must be already removed" << dendl;
+ return -ERR_PRECONDITION_FAILED;
+ } else if (ret < 0) {
+ ldpp_dout(dpp, 1) << "ERROR: could not init bucket = " \
+ << hint.bucket_name << "due to ret = " << ret << dendl;
+ return ret;
+ }
+
+ rgw_obj_key key = hint.obj_key;
+ if (key.instance.empty()) {
+ key.instance = "null";
+ }
+
+ std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(key);
+ obj->set_atomic();
+ ret = obj->delete_object(dpp, null_yield);
+
+ return ret;
+}
+
// Apply one chunk of listed hint entries: decode each hint and delete the
// corresponding object.  'need_trim' is set to true as soon as at least one
// entry was consumed, signalling the caller that the processed range can be
// trimmed.  Individual failures are logged and skipped so a bad entry never
// blocks the rest of the chunk.
void RGWObjectExpirer::garbage_chunk(const DoutPrefixProvider *dpp,
                                     list<cls_timeindex_entry>& entries, /* in */
                                     bool& need_trim) /* out */
{
  need_trim = false;

  for (list<cls_timeindex_entry>::iterator iter = entries.begin();
       iter != entries.end();
       ++iter)
  {
    objexp_hint_entry hint;
    ldpp_dout(dpp, 15) << "got removal hint for: " << iter->key_ts.sec() \
        << " - " << iter->key_ext << dendl;

    int ret = objexp_hint_parse(dpp, driver->ctx(), *iter, &hint);
    if (ret < 0) {
      ldpp_dout(dpp, 1) << "cannot parse removal hint for " << hint.obj_key << dendl;
      continue;
    }

    /* PRECOND_FAILED simply means that our hint is not valid.
     * We can silently ignore that and move forward. */
    ret = garbage_single_object(dpp, hint);
    if (ret == -ERR_PRECONDITION_FAILED) {
      ldpp_dout(dpp, 15) << "not actual hint for object: " << hint.obj_key << dendl;
    } else if (ret < 0) {
      ldpp_dout(dpp, 1) << "cannot remove expired object: " << hint.obj_key << dendl;
    }

    need_trim = true;
  }

  return;
}
+
// Best-effort trim of the hint range just processed by garbage_chunk();
// a trim failure is only logged, since the entries will simply be retried
// (and re-skipped) on a later round.
void RGWObjectExpirer::trim_chunk(const DoutPrefixProvider *dpp,
                                  const string& shard,
                                  const utime_t& from,
                                  const utime_t& to,
                                  const string& from_marker,
                                  const string& to_marker)
{
  ldpp_dout(dpp, 20) << "trying to trim removal hints to=" << to
                     << ", to_marker=" << to_marker << dendl;

  real_time rt_from = from.to_real_time();
  real_time rt_to = to.to_real_time();

  int ret = exp_store.objexp_hint_trim(dpp, shard, rt_from, rt_to,
                                       from_marker, to_marker);
  if (ret < 0) {
    ldpp_dout(dpp, 0) << "ERROR during trim: " << ret << dendl;
  }

  return;
}
+
+bool RGWObjectExpirer::process_single_shard(const DoutPrefixProvider *dpp,
+ const string& shard,
+ const utime_t& last_run,
+ const utime_t& round_start)
+{
+ string marker;
+ string out_marker;
+ bool truncated = false;
+ bool done = true;
+
+ CephContext *cct = driver->ctx();
+ int num_entries = cct->_conf->rgw_objexp_chunk_size;
+
+ int max_secs = cct->_conf->rgw_objexp_gc_interval;
+ utime_t end = ceph_clock_now();
+ end += max_secs;
+
+ rados::cls::lock::Lock l(objexp_lock_name);
+
+ utime_t time(max_secs, 0);
+ l.set_duration(time);
+
+ int ret = l.lock_exclusive(&static_cast<rgw::sal::RadosStore*>(driver)->getRados()->objexp_pool_ctx, shard);
+ if (ret == -EBUSY) { /* already locked by another processor */
+ ldpp_dout(dpp, 5) << __func__ << "(): failed to acquire lock on " << shard << dendl;
+ return false;
+ }
+
+ do {
+ real_time rt_last = last_run.to_real_time();
+ real_time rt_start = round_start.to_real_time();
+
+ list<cls_timeindex_entry> entries;
+ ret = exp_store.objexp_hint_list(dpp, shard, rt_last, rt_start,
+ num_entries, marker, entries,
+ &out_marker, &truncated);
+ if (ret < 0) {
+ ldpp_dout(dpp, 10) << "cannot get removal hints from shard: " << shard
+ << dendl;
+ continue;
+ }
+
+ bool need_trim;
+ garbage_chunk(dpp, entries, need_trim);
+
+ if (need_trim) {
+ trim_chunk(dpp, shard, last_run, round_start, marker, out_marker);
+ }
+
+ utime_t now = ceph_clock_now();
+ if (now >= end) {
+ done = false;
+ break;
+ }
+
+ marker = out_marker;
+ } while (truncated);
+
+ l.unlock(&static_cast<rgw::sal::RadosStore*>(driver)->getRados()->objexp_pool_ctx, shard);
+ return done;
+}
+
/* Returns true if all shards have been processed successfully. */
bool RGWObjectExpirer::inspect_all_shards(const DoutPrefixProvider *dpp,
                                          const utime_t& last_run,
                                          const utime_t& round_start)
{
  CephContext * const cct = driver->ctx();
  int num_shards = cct->_conf->rgw_objexp_hints_num_shards;
  bool all_done = true;

  for (int i = 0; i < num_shards; i++) {
    string shard;
    objexp_get_shard(i, &shard);

    ldpp_dout(dpp, 20) << "processing shard = " << shard << dendl;

    // a shard that was locked elsewhere or ran out of time keeps all_done
    // false, so the caller does not advance its last_run window
    if (! process_single_shard(dpp, shard, last_run, round_start)) {
      all_done = false;
    }
  }

  return all_done;
}
+
// Whether stop_processor() has been requested; polled by the worker loop.
bool RGWObjectExpirer::going_down()
{
  return down_flag;
}
+
// Spawn the background expirer thread. Intended to be called at most once
// before stop_processor().
void RGWObjectExpirer::start_processor()
{
  worker = new OEWorker(driver->ctx(), this);
  worker->create("rgw_obj_expirer");
}
+
// Signal the worker to shut down, wake it from its inter-round sleep, and
// join it.  Safe to call when the worker was never started, and called
// again from the destructor.
void RGWObjectExpirer::stop_processor()
{
  down_flag = true;
  if (worker) {
    worker->stop();
    worker->join();
  }
  delete worker;
  worker = NULL;
}
+
// Worker main loop: run one expiration round over all shards, then sleep
// out whatever remains of rgw_objexp_gc_interval (woken early by stop()).
void *RGWObjectExpirer::OEWorker::entry() {
  utime_t last_run;
  do {
    utime_t start = ceph_clock_now();
    ldpp_dout(this, 2) << "object expiration: start" << dendl;
    if (oe->inspect_all_shards(this, last_run, start)) {
      /* All shards have been processed properly. Next time we can start
       * from this moment. */
      last_run = start;
    }
    ldpp_dout(this, 2) << "object expiration: stop" << dendl;

    if (oe->going_down())
      break;

    // subtract the round's runtime from the interval; if the round already
    // took longer than the interval, start the next one immediately
    utime_t end = ceph_clock_now();
    end -= start;
    int secs = cct->_conf->rgw_objexp_gc_interval;

    if (secs <= end.sec())
      continue; // next round

    secs -= end.sec();

    std::unique_lock l{lock};
    cond.wait_for(l, std::chrono::seconds(secs));
  } while (!oe->going_down());

  return NULL;
}
+
// Wake the worker if it is sleeping between rounds; the caller
// (stop_processor) sets down_flag before invoking this.
void RGWObjectExpirer::OEWorker::stop()
{
  std::lock_guard l{lock};
  cond.notify_all();
}
+
// DoutPrefixProvider: context used for the worker's log output.
CephContext *RGWObjectExpirer::OEWorker::get_cct() const
{
  return cct;
}
+
// DoutPrefixProvider: log under the rgw subsystem.
unsigned RGWObjectExpirer::OEWorker::get_subsys() const
{
  return dout_subsys;
}
+
// DoutPrefixProvider: prefix prepended to every log line of this thread.
std::ostream& RGWObjectExpirer::OEWorker::gen_prefix(std::ostream& out) const
{
  return out << "rgw object expirer Worker thread: ";
}
diff --git a/src/rgw/driver/rados/rgw_object_expirer_core.h b/src/rgw/driver/rados/rgw_object_expirer_core.h
new file mode 100644
index 000000000..be63815c1
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_object_expirer_core.h
@@ -0,0 +1,146 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <atomic>
+#include <string>
+#include <cerrno>
+#include <sstream>
+#include <iostream>
+
+#include "auth/Crypto.h"
+
+#include "common/armor.h"
+#include "common/ceph_json.h"
+#include "common/config.h"
+#include "common/ceph_argparse.h"
+#include "common/Formatter.h"
+#include "common/errno.h"
+
+#include "common/ceph_mutex.h"
+#include "common/Cond.h"
+#include "common/Thread.h"
+
+#include "global/global_init.h"
+
+#include "include/common_fwd.h"
+#include "include/utime.h"
+#include "include/str_list.h"
+
+#include "rgw_sal_rados.h"
+
+class RGWSI_RADOS;
+class RGWSI_Zone;
+class RGWBucketInfo;
+class cls_timeindex_entry;
+
+class RGWObjExpStore {
+ CephContext *cct;
+ RGWSI_RADOS *rados_svc;
+ rgw::sal::RadosStore* driver;
+public:
+ RGWObjExpStore(CephContext *_cct, RGWSI_RADOS *_rados_svc, rgw::sal::RadosStore* _driver) : cct(_cct),
+ rados_svc(_rados_svc),
+ driver(_driver) {}
+
+ int objexp_hint_add(const DoutPrefixProvider *dpp,
+ const ceph::real_time& delete_at,
+ const std::string& tenant_name,
+ const std::string& bucket_name,
+ const std::string& bucket_id,
+ const rgw_obj_index_key& obj_key);
+
+ int objexp_hint_list(const DoutPrefixProvider *dpp,
+ const std::string& oid,
+ const ceph::real_time& start_time,
+ const ceph::real_time& end_time,
+ const int max_entries,
+ const std::string& marker,
+ std::list<cls_timeindex_entry>& entries, /* out */
+ std::string *out_marker, /* out */
+ bool *truncated); /* out */
+
+ int objexp_hint_trim(const DoutPrefixProvider *dpp,
+ const std::string& oid,
+ const ceph::real_time& start_time,
+ const ceph::real_time& end_time,
+ const std::string& from_marker,
+ const std::string& to_marker);
+};
+
+class RGWObjectExpirer {
+protected:
+ rgw::sal::Driver* driver;
+ RGWObjExpStore exp_store;
+
+ class OEWorker : public Thread, public DoutPrefixProvider {
+ CephContext *cct;
+ RGWObjectExpirer *oe;
+ ceph::mutex lock = ceph::make_mutex("OEWorker");
+ ceph::condition_variable cond;
+
+ public:
+ OEWorker(CephContext * const cct,
+ RGWObjectExpirer * const oe)
+ : cct(cct),
+ oe(oe) {
+ }
+
+ void *entry() override;
+ void stop();
+
+ CephContext *get_cct() const override;
+ unsigned get_subsys() const override;
+ std::ostream& gen_prefix(std::ostream& out) const override;
+ };
+
+ OEWorker *worker{nullptr};
+ std::atomic<bool> down_flag = { false };
+
+public:
+ explicit RGWObjectExpirer(rgw::sal::Driver* _driver)
+ : driver(_driver),
+ exp_store(_driver->ctx(), static_cast<rgw::sal::RadosStore*>(driver)->svc()->rados, static_cast<rgw::sal::RadosStore*>(driver)),
+ worker(NULL) {
+ }
+ ~RGWObjectExpirer() {
+ stop_processor();
+ }
+
+ int hint_add(const DoutPrefixProvider *dpp,
+ const ceph::real_time& delete_at,
+ const std::string& tenant_name,
+ const std::string& bucket_name,
+ const std::string& bucket_id,
+ const rgw_obj_index_key& obj_key) {
+ return exp_store.objexp_hint_add(dpp, delete_at, tenant_name, bucket_name,
+ bucket_id, obj_key);
+ }
+
+ int garbage_single_object(const DoutPrefixProvider *dpp, objexp_hint_entry& hint);
+
+ void garbage_chunk(const DoutPrefixProvider *dpp,
+ std::list<cls_timeindex_entry>& entries, /* in */
+ bool& need_trim); /* out */
+
+ void trim_chunk(const DoutPrefixProvider *dpp,
+ const std::string& shard,
+ const utime_t& from,
+ const utime_t& to,
+ const std::string& from_marker,
+ const std::string& to_marker);
+
+ bool process_single_shard(const DoutPrefixProvider *dpp,
+ const std::string& shard,
+ const utime_t& last_run,
+ const utime_t& round_start);
+
+ bool inspect_all_shards(const DoutPrefixProvider *dpp,
+ const utime_t& last_run,
+ const utime_t& round_start);
+
+ bool going_down();
+ void start_processor();
+ void stop_processor();
+};
diff --git a/src/rgw/driver/rados/rgw_otp.cc b/src/rgw/driver/rados/rgw_otp.cc
new file mode 100644
index 000000000..07cc14f11
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_otp.cc
@@ -0,0 +1,211 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <errno.h>
+
+#include <string>
+#include <map>
+#include <boost/algorithm/string.hpp>
+
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/ceph_json.h"
+#include "rgw_otp.h"
+#include "rgw_zone.h"
+#include "rgw_metadata.h"
+
+#include "include/types.h"
+
+#include "rgw_common.h"
+#include "rgw_tools.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_meta.h"
+#include "services/svc_meta_be.h"
+#include "services/svc_meta_be_otp.h"
+#include "services/svc_otp.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+
+class RGWOTPMetadataHandler;
+
// In-memory form of one "otp" metadata entry: the list of OTP devices for
// a user, plus the object version and mtime carried by the base
// RGWMetadataObject.
class RGWOTPMetadataObject : public RGWMetadataObject {
  friend class RGWOTPMetadataHandler;

  otp_devices_list_t devices;
public:
  RGWOTPMetadataObject() {}
  RGWOTPMetadataObject(otp_devices_list_t&& _devices, const obj_version& v, const real_time m) {
    devices = std::move(_devices);
    objv = v;
    mtime = m;
  }

  // JSON form: { "devices": [...] } — mirrored by
  // RGWOTPMetadataHandler::get_meta_obj() on decode.
  void dump(Formatter *f) const override {
    encode_json("devices", devices, f);
  }

  otp_devices_list_t& get_devs() {
    return devices;
  }
};
+
+
+class RGWOTPMetadataHandler : public RGWOTPMetadataHandlerBase {
+ friend class RGWOTPCtl;
+
+ struct Svc {
+ RGWSI_Zone *zone;
+ RGWSI_MetaBackend *meta_be;
+ RGWSI_OTP *otp;
+ } svc;
+
+ int init(RGWSI_Zone *zone,
+ RGWSI_MetaBackend *_meta_be,
+ RGWSI_OTP *_otp) {
+ base_init(zone->ctx(), _otp->get_be_handler().get());
+ svc.zone = zone;
+ svc.meta_be = _meta_be;
+ svc.otp = _otp;
+ return 0;
+ }
+
+ int call(std::function<int(RGWSI_OTP_BE_Ctx& ctx)> f) {
+ return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) {
+ RGWSI_OTP_BE_Ctx ctx(op->ctx());
+ return f(ctx);
+ });
+ }
+
+ RGWMetadataObject *get_meta_obj(JSONObj *jo, const obj_version& objv, const ceph::real_time& mtime) override {
+ otp_devices_list_t devices;
+ try {
+ JSONDecoder::decode_json("devices", devices, jo);
+ } catch (JSONDecoder::err& e) {
+ return nullptr;
+ }
+
+ return new RGWOTPMetadataObject(std::move(devices), objv, mtime);
+ }
+
+ int do_get(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWMetadataObject **obj, optional_yield y, const DoutPrefixProvider *dpp) override {
+ RGWObjVersionTracker objv_tracker;
+
+ std::unique_ptr<RGWOTPMetadataObject> mdo(new RGWOTPMetadataObject);
+
+
+ RGWSI_OTP_BE_Ctx be_ctx(op->ctx());
+
+ int ret = svc.otp->read_all(be_ctx,
+ entry,
+ &mdo->get_devs(),
+ &mdo->get_mtime(),
+ &objv_tracker,
+ y,
+ dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ mdo->objv = objv_tracker.read_version;
+
+ *obj = mdo.release();
+
+ return 0;
+ }
+
+ int do_put(RGWSI_MetaBackend_Handler::Op *op, string& entry,
+ RGWMetadataObject *_obj, RGWObjVersionTracker& objv_tracker,
+ optional_yield y,
+ const DoutPrefixProvider *dpp,
+ RGWMDLogSyncType type, bool from_remote_zone) override {
+ RGWOTPMetadataObject *obj = static_cast<RGWOTPMetadataObject *>(_obj);
+
+ RGWSI_OTP_BE_Ctx be_ctx(op->ctx());
+
+ int ret = svc.otp->store_all(dpp, be_ctx,
+ entry,
+ obj->devices,
+ obj->mtime,
+ &objv_tracker,
+ y);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return STATUS_APPLIED;
+ }
+
+ int do_remove(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWObjVersionTracker& objv_tracker,
+ optional_yield y, const DoutPrefixProvider *dpp) override {
+ RGWSI_MBOTP_RemoveParams params;
+
+ RGWSI_OTP_BE_Ctx be_ctx(op->ctx());
+
+ return svc.otp->remove_all(dpp, be_ctx,
+ entry,
+ &objv_tracker,
+ y);
+ }
+
+public:
+ RGWOTPMetadataHandler() {}
+
+ string get_type() override { return "otp"; }
+};
+
+
// Bind the controller to its zone and OTP services; the metadata handler
// is attached separately via init().
RGWOTPCtl::RGWOTPCtl(RGWSI_Zone *zone_svc,
                     RGWSI_OTP *otp_svc)
{
  svc.zone = zone_svc;
  svc.otp = otp_svc;
}
+
+
// Attach the metadata handler (not owned) and cache its backend handler.
void RGWOTPCtl::init(RGWOTPMetadataHandler *_meta_handler)
{
  meta_handler = _meta_handler;
  be_handler = meta_handler->get_be_handler();
}
+
// Read all OTP devices for 'uid' into *info (info->uid is set here);
// optional mtime/objv outputs come through 'params'.
int RGWOTPCtl::read_all(const rgw_user& uid,
                        RGWOTPInfo *info,
                        optional_yield y,
                        const DoutPrefixProvider *dpp,
                        const GetParams& params)
{
  info->uid = uid;
  return meta_handler->call([&](RGWSI_OTP_BE_Ctx& ctx) {
    return svc.otp->read_all(ctx, uid, &info->devices, params.mtime, params.objv_tracker, y, dpp);
  });
}
+
// Store the complete device list from 'info', replacing what is on disk.
int RGWOTPCtl::store_all(const DoutPrefixProvider *dpp,
                         const RGWOTPInfo& info,
                         optional_yield y,
                         const PutParams& params)
{
  return meta_handler->call([&](RGWSI_OTP_BE_Ctx& ctx) {
    return svc.otp->store_all(dpp, ctx, info.uid, info.devices, params.mtime, params.objv_tracker, y);
  });
}
+
// Remove every OTP device registered for 'uid'.
int RGWOTPCtl::remove_all(const DoutPrefixProvider *dpp,
                          const rgw_user& uid,
                          optional_yield y,
                          const RemoveParams& params)
{
  return meta_handler->call([&](RGWSI_OTP_BE_Ctx& ctx) {
    return svc.otp->remove_all(dpp, ctx, uid, params.objv_tracker, y);
  });
}
+
+
// Factory for the "otp" metadata handler; caller owns the returned object.
RGWMetadataHandler *RGWOTPMetaHandlerAllocator::alloc()
{
  return new RGWOTPMetadataHandler();
}
diff --git a/src/rgw/driver/rados/rgw_otp.h b/src/rgw/driver/rados/rgw_otp.h
new file mode 100644
index 000000000..885e8abb8
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_otp.h
@@ -0,0 +1,110 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_sal_fwd.h"
+#include "cls/otp/cls_otp_types.h"
+#include "services/svc_meta_be_otp.h"
+
+#include "rgw_basic_types.h"
+#include "rgw_metadata.h"
+
+
+class RGWObjVersionTracker;
+class RGWMetadataHandler;
+class RGWOTPMetadataHandler;
+class RGWSI_Zone;
+class RGWSI_OTP;
+class RGWSI_MetaBackend;
+
// Interface for the "otp" metadata handler; adds a service-wiring init()
// step on top of the generic metadata-backend handler.
class RGWOTPMetadataHandlerBase : public RGWMetadataHandler_GenericMetaBE {
public:
  virtual ~RGWOTPMetadataHandlerBase() {}
  virtual int init(RGWSI_Zone *zone,
                   RGWSI_MetaBackend *_meta_be,
                   RGWSI_OTP *_otp) = 0;
};
+
// Factory: allocates the concrete RGWOTPMetadataHandler (defined in the
// .cc) without exposing its type here.
class RGWOTPMetaHandlerAllocator {
public:
  static RGWMetadataHandler *alloc();
};
+
// User id plus that user's registered OTP devices, as read/written by
// RGWOTPCtl.
struct RGWOTPInfo {
  rgw_user uid;
  otp_devices_list_t devices;
};
+
+
+class RGWOTPCtl
+{
+ struct Svc {
+ RGWSI_Zone *zone{nullptr};
+ RGWSI_OTP *otp{nullptr};
+ } svc;
+
+ RGWOTPMetadataHandler *meta_handler;
+ RGWSI_MetaBackend_Handler *be_handler;
+
+public:
+ RGWOTPCtl(RGWSI_Zone *zone_svc,
+ RGWSI_OTP *otp_svc);
+
+ void init(RGWOTPMetadataHandler *_meta_handler);
+
+ struct GetParams {
+ RGWObjVersionTracker *objv_tracker{nullptr};
+ ceph::real_time *mtime{nullptr};
+
+ GetParams() {}
+
+ GetParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
+ objv_tracker = _objv_tracker;
+ return *this;
+ }
+
+ GetParams& set_mtime(ceph::real_time *_mtime) {
+ mtime = _mtime;
+ return *this;
+ }
+ };
+
+ struct PutParams {
+ RGWObjVersionTracker *objv_tracker{nullptr};
+ ceph::real_time mtime;
+
+ PutParams() {}
+
+ PutParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
+ objv_tracker = _objv_tracker;
+ return *this;
+ }
+
+ PutParams& set_mtime(const ceph::real_time& _mtime) {
+ mtime = _mtime;
+ return *this;
+ }
+ };
+
+ struct RemoveParams {
+ RGWObjVersionTracker *objv_tracker{nullptr};
+
+ RemoveParams() {}
+
+ RemoveParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
+ objv_tracker = _objv_tracker;
+ return *this;
+ }
+ };
+
+ int read_all(const rgw_user& uid, RGWOTPInfo *info, optional_yield y,
+ const DoutPrefixProvider *dpp,
+ const GetParams& params = {});
+ int store_all(const DoutPrefixProvider *dpp,
+ const RGWOTPInfo& info, optional_yield y,
+ const PutParams& params = {});
+ int remove_all(const DoutPrefixProvider *dpp,
+ const rgw_user& user, optional_yield y,
+ const RemoveParams& params = {});
+};
diff --git a/src/rgw/driver/rados/rgw_period.cc b/src/rgw/driver/rados/rgw_period.cc
new file mode 100644
index 000000000..61602b354
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_period.cc
@@ -0,0 +1,324 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_sync.h"
+
+#include "services/svc_zone.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+using namespace rgw_zone_defaults;
+
+int RGWPeriod::get_zonegroup(RGWZoneGroup& zonegroup,
+ const string& zonegroup_id) const
+{
+ map<string, RGWZoneGroup>::const_iterator iter;
+ if (!zonegroup_id.empty()) {
+ iter = period_map.zonegroups.find(zonegroup_id);
+ } else {
+ iter = period_map.zonegroups.find("default");
+ }
+ if (iter != period_map.zonegroups.end()) {
+ zonegroup = iter->second;
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+int RGWPeriod::get_latest_epoch(const DoutPrefixProvider *dpp, epoch_t& latest_epoch, optional_yield y)
+{
+ RGWPeriodLatestEpochInfo info;
+
+ int ret = read_latest_epoch(dpp, info, y);
+ if (ret < 0) {
+ return ret;
+ }
+
+ latest_epoch = info.epoch;
+
+ return 0;
+}
+
// Delete this period's objects: one rados object per epoch (epochs start
// at 1) plus the shared .latest_epoch object.  Per-epoch failures are
// logged but not fatal (best effort); the returned status is that of the
// final .latest_epoch removal.
int RGWPeriod::delete_obj(const DoutPrefixProvider *dpp, optional_yield y)
{
  rgw_pool pool(get_pool(cct));

  // delete the object for each period epoch
  for (epoch_t e = 1; e <= epoch; e++) {
    RGWPeriod p{get_id(), e};
    rgw_raw_obj oid{pool, p.get_period_oid()};
    auto sysobj = sysobj_svc->get_obj(oid);
    int ret = sysobj.wop().remove(dpp, y);
    if (ret < 0) {
      ldpp_dout(dpp, 0) << "WARNING: failed to delete period object " << oid
          << ": " << cpp_strerror(-ret) << dendl;
    }
  }

  // delete the .latest_epoch object
  rgw_raw_obj oid{pool, get_period_oid_prefix() + get_latest_epoch_oid()};
  auto sysobj = sysobj_svc->get_obj(oid);
  int ret = sysobj.wop().remove(dpp, y);
  if (ret < 0) {
    ldpp_dout(dpp, 0) << "WARNING: failed to delete period object " << oid
        << ": " << cpp_strerror(-ret) << dendl;
  }
  return ret;
}
+
+int RGWPeriod::add_zonegroup(const DoutPrefixProvider *dpp, const RGWZoneGroup& zonegroup, optional_yield y)
+{
+ if (zonegroup.realm_id != realm_id) {
+ return 0;
+ }
+ int ret = period_map.update(zonegroup, cct);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: updating period map: " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ return store_info(dpp, false, y);
+}
+
+int RGWPeriod::update(const DoutPrefixProvider *dpp, optional_yield y)
+{
+ auto zone_svc = sysobj_svc->get_zone_svc();
+ ldpp_dout(dpp, 20) << __func__ << " realm " << realm_id << " period " << get_id() << dendl;
+ list<string> zonegroups;
+ int ret = zone_svc->list_zonegroups(dpp, zonegroups);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to list zonegroups: " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ // clear zone short ids of removed zones. period_map.update() will add the
+ // remaining zones back
+ period_map.short_zone_ids.clear();
+
+ for (auto& iter : zonegroups) {
+ RGWZoneGroup zg(string(), iter);
+ ret = zg.init(dpp, cct, sysobj_svc, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "WARNING: zg.init() failed: " << cpp_strerror(-ret) << dendl;
+ continue;
+ }
+
+ if (zg.realm_id != realm_id) {
+ ldpp_dout(dpp, 20) << "skipping zonegroup " << zg.get_name() << " zone realm id " << zg.realm_id << ", not on our realm " << realm_id << dendl;
+ continue;
+ }
+
+ if (zg.master_zone.empty()) {
+ ldpp_dout(dpp, 0) << "ERROR: zonegroup " << zg.get_name() << " should have a master zone " << dendl;
+ return -EINVAL;
+ }
+
+ if (zg.zones.find(zg.master_zone) == zg.zones.end()) {
+ ldpp_dout(dpp, 0) << "ERROR: zonegroup " << zg.get_name()
+ << " has a non existent master zone "<< dendl;
+ return -EINVAL;
+ }
+
+ if (zg.is_master_zonegroup()) {
+ master_zonegroup = zg.get_id();
+ master_zone = zg.master_zone;
+ }
+
+ int ret = period_map.update(zg, cct);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+ ret = period_config.read(dpp, sysobj_svc, realm_id, y);
+ if (ret < 0 && ret != -ENOENT) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to read period config: "
+ << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ return 0;
+}
+
// Turn this period into a staging successor: remember the current id as
// predecessor, switch to the realm's staging id, clear the zonegroup map
// and advance the realm epoch.
void RGWPeriod::fork()
{
  ldout(cct, 20) << __func__ << " realm " << realm_id << " period " << id << dendl;
  predecessor_uuid = id;
  id = get_staging_id(realm_id);
  period_map.reset();
  realm_epoch++;
}
+
// Read the metadata sync status of this zone via a temporary
// RGWMetaSyncStatusManager (torn down before returning).
static int read_sync_status(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, rgw_meta_sync_status *sync_status)
{
  rgw::sal::RadosStore* rados_store = static_cast<rgw::sal::RadosStore*>(driver);
  // initialize a sync status manager to read the status
  RGWMetaSyncStatusManager mgr(rados_store, rados_store->svc()->rados->get_async_processor());
  int r = mgr.init(dpp);
  if (r < 0) {
    return r;
  }
  r = mgr.read_sync_status(dpp, sync_status);
  mgr.stop();
  return r;
}
+
// Capture this zone's metadata sync markers into the period before a
// master-zone change. If the zone is behind the current period's realm
// epoch, refuse (unless force_if_stale) because promoting would drop
// un-synced metadata; in the forced case, empty markers are stored so
// other zones skip this period during incremental sync.
int RGWPeriod::update_sync_status(const DoutPrefixProvider *dpp,
                                  rgw::sal::Driver* driver, /* for now */
                                  const RGWPeriod &current_period,
                                  std::ostream& error_stream,
                                  bool force_if_stale)
{
  rgw_meta_sync_status status;
  int r = read_sync_status(dpp, driver, &status);
  if (r < 0) {
    ldpp_dout(dpp, 0) << "period failed to read sync status: "
                      << cpp_strerror(-r) << dendl;
    return r;
  }

  std::vector<std::string> markers;

  const auto current_epoch = current_period.get_realm_epoch();
  if (current_epoch != status.sync_info.realm_epoch) {
    // no sync status markers for the current period
    ceph_assert(current_epoch > status.sync_info.realm_epoch);
    const int behind = current_epoch - status.sync_info.realm_epoch;
    if (!force_if_stale && current_epoch > 1) {
      error_stream << "ERROR: This zone is " << behind << " period(s) behind "
          "the current master zone in metadata sync. If this zone is promoted "
          "to master, any metadata changes during that time are likely to "
          "be lost.\n"
          "Waiting for this zone to catch up on metadata sync (see "
          "'radosgw-admin sync status') is recommended.\n"
          "To promote this zone to master anyway, add the flag "
          "--yes-i-really-mean-it." << std::endl;
      return -EINVAL;
    }
    // empty sync status markers - other zones will skip this period during
    // incremental metadata sync
    markers.resize(status.sync_info.num_shards);
  } else {
    markers.reserve(status.sync_info.num_shards);
    for (auto& i : status.sync_markers) {
      auto& marker = i.second;
      // filter out markers from other periods
      if (marker.realm_epoch != current_epoch) {
        marker.marker.clear();
      }
      markers.emplace_back(std::move(marker.marker));
    }
  }

  std::swap(sync_status, markers);
  return 0;
}
+
// Commit a staged period against 'current_period'.  Only the period's
// master zone may commit; the staged period must directly succeed the
// current one (matching predecessor, realm epoch + 1).  A master-zone
// change creates a brand-new period and promotes it to current on the
// realm; otherwise the change is written as the next epoch of the current
// period.  Human-readable rejection reasons go to 'error_stream';
// validation failures return -EINVAL.
int RGWPeriod::commit(const DoutPrefixProvider *dpp,
                      rgw::sal::Driver* driver,
                      RGWRealm& realm, const RGWPeriod& current_period,
                      std::ostream& error_stream, optional_yield y,
                      bool force_if_stale)
{
  auto zone_svc = sysobj_svc->get_zone_svc();
  ldpp_dout(dpp, 20) << __func__ << " realm " << realm.get_id() << " period " << current_period.get_id() << dendl;
  // gateway must be in the master zone to commit
  if (master_zone != zone_svc->get_zone_params().get_id()) {
    error_stream << "Cannot commit period on zone "
        << zone_svc->get_zone_params().get_id() << ", it must be sent to "
        "the period's master zone " << master_zone << '.' << std::endl;
    return -EINVAL;
  }
  // period predecessor must match current period
  if (predecessor_uuid != current_period.get_id()) {
    error_stream << "Period predecessor " << predecessor_uuid
        << " does not match current period " << current_period.get_id()
        << ". Use 'period pull' to get the latest period from the master, "
        "reapply your changes, and try again." << std::endl;
    return -EINVAL;
  }
  // realm epoch must be 1 greater than current period
  if (realm_epoch != current_period.get_realm_epoch() + 1) {
    error_stream << "Period's realm epoch " << realm_epoch
        << " does not come directly after current realm epoch "
        << current_period.get_realm_epoch() << ". Use 'realm pull' to get the "
        "latest realm and period from the master zone, reapply your changes, "
        "and try again." << std::endl;
    return -EINVAL;
  }
  // did the master zone change?
  if (master_zone != current_period.get_master_zone()) {
    // store the current metadata sync status in the period
    int r = update_sync_status(dpp, driver, current_period, error_stream, force_if_stale);
    if (r < 0) {
      ldpp_dout(dpp, 0) << "failed to update metadata sync status: "
          << cpp_strerror(-r) << dendl;
      return r;
    }
    // create an object with a new period id
    r = create(dpp, y, true);
    if (r < 0) {
      ldpp_dout(dpp, 0) << "failed to create new period: " << cpp_strerror(-r) << dendl;
      return r;
    }
    // set as current period
    r = realm.set_current_period(dpp, *this, y);
    if (r < 0) {
      ldpp_dout(dpp, 0) << "failed to update realm's current period: "
          << cpp_strerror(-r) << dendl;
      return r;
    }
    ldpp_dout(dpp, 4) << "Promoted to master zone and committed new period "
        << id << dendl;
    realm.notify_new_period(dpp, *this, y);
    return 0;
  }
  // period must be based on current epoch
  if (epoch != current_period.get_epoch()) {
    error_stream << "Period epoch " << epoch << " does not match "
        "predecessor epoch " << current_period.get_epoch()
        << ". Use 'period pull' to get the latest epoch from the master zone, "
        "reapply your changes, and try again." << std::endl;
    return -EINVAL;
  }
  // set period as next epoch
  set_id(current_period.get_id());
  set_epoch(current_period.get_epoch() + 1);
  set_predecessor(current_period.get_predecessor());
  realm_epoch = current_period.get_realm_epoch();
  // write the period to rados
  int r = store_info(dpp, false, y);
  if (r < 0) {
    ldpp_dout(dpp, 0) << "failed to store period: " << cpp_strerror(-r) << dendl;
    return r;
  }
  // set as latest epoch
  r = update_latest_epoch(dpp, epoch, y);
  if (r == -EEXIST) {
    // already have this epoch (or a more recent one)
    return 0;
  }
  if (r < 0) {
    ldpp_dout(dpp, 0) << "failed to set latest epoch: " << cpp_strerror(-r) << dendl;
    return r;
  }
  r = reflect(dpp, y);
  if (r < 0) {
    ldpp_dout(dpp, 0) << "failed to update local objects: " << cpp_strerror(-r) << dendl;
    return r;
  }
  ldpp_dout(dpp, 4) << "Committed new epoch " << epoch
      << " for period " << id << dendl;
  realm.notify_new_period(dpp, *this, y);
  return 0;
}
+
+void RGWPeriod::generate_test_instances(list<RGWPeriod*> &o)
+{
+ RGWPeriod *z = new RGWPeriod;
+ o.push_back(z);
+ o.push_back(new RGWPeriod);
+}
+
+
diff --git a/src/rgw/driver/rados/rgw_pubsub_push.cc b/src/rgw/driver/rados/rgw_pubsub_push.cc
new file mode 100644
index 000000000..bdb24ce9a
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_pubsub_push.cc
@@ -0,0 +1,460 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_pubsub_push.h"
+#include <string>
+#include <sstream>
+#include <algorithm>
+#include "include/buffer_fwd.h"
+#include "common/Formatter.h"
+#include "common/iso_8601.h"
+#include "common/async/completion.h"
+#include "rgw_common.h"
+#include "rgw_data_sync.h"
+#include "rgw_pubsub.h"
+#include "acconfig.h"
+#ifdef WITH_RADOSGW_AMQP_ENDPOINT
+#include "rgw_amqp.h"
+#endif
+#ifdef WITH_RADOSGW_KAFKA_ENDPOINT
+#include "rgw_kafka.h"
+#endif
+#include <boost/asio/yield.hpp>
+#include <boost/algorithm/string.hpp>
+#include <functional>
+#include "rgw_perf_counters.h"
+
+using namespace rgw;
+
// serialize an event into the S3 notification JSON layout: an outer object
// holding a single array member, both named EventType::json_type_plural
// (for S3 events this is presumably {"Records":[{...}]} — matches the AWS
// event message structure)
template<typename EventType>
std::string json_format_pubsub_event(const EventType& event) {
  std::stringstream ss;
  JSONFormatter f(false);
  {
    // outer: { "<plural>": ... }
    Formatter::ObjectSection s(f, EventType::json_type_plural);
    {
      // inner: [ <event> ] — sections close in reverse order on scope exit
      Formatter::ArraySection s(f, EventType::json_type_plural);
      encode_json("", event, &f);
    }
  }
  f.flush(ss);
  return ss.str();
}
+
+bool get_bool(const RGWHTTPArgs& args, const std::string& name, bool default_value) {
+ bool value;
+ bool exists;
+ if (args.get_bool(name.c_str(), &value, &exists) == -EINVAL) {
+ throw RGWPubSubEndpoint::configuration_error("invalid boolean value for " + name);
+ }
+ if (!exists) {
+ return default_value;
+ }
+ return value;
+}
+
// delivers notifications to an HTTP/S server ("webhook" endpoint) by
// POSTing the JSON-formatted event to the configured URI
class RGWPubSubHTTPEndpoint : public RGWPubSubEndpoint {
private:
  const std::string endpoint;
  typedef unsigned ack_level_t;
  ack_level_t ack_level; // TODO: not used for now
  const bool verify_ssl;
  const bool cloudevents;
  // sentinel ack levels; any other value is an HTTP status-code threshold
  static const ack_level_t ACK_LEVEL_ANY = 0;
  static const ack_level_t ACK_LEVEL_NON_ERROR = 1;

public:
  // parse configuration from HTTP args:
  //   verify-ssl     (bool, default true)
  //   cloudevents    (bool, default false)
  //   http-ack-level ("any" | "non-error" | numeric status in [100,600))
  // throws configuration_error on an invalid http-ack-level
  RGWPubSubHTTPEndpoint(const std::string& _endpoint, const RGWHTTPArgs& args) :
    endpoint(_endpoint), verify_ssl(get_bool(args, "verify-ssl", true)), cloudevents(get_bool(args, "cloudevents", false))
  {
    bool exists;
    const auto& str_ack_level = args.get("http-ack-level", &exists);
    if (!exists || str_ack_level == "any") {
      // "any" is default
      ack_level = ACK_LEVEL_ANY;
    } else if (str_ack_level == "non-error") {
      ack_level = ACK_LEVEL_NON_ERROR;
    } else {
      // std::atoi yields 0 on a non-numeric value, which the range check
      // below then rejects
      ack_level = std::atoi(str_ack_level.c_str());
      if (ack_level < 100 || ack_level >= 600) {
        throw configuration_error("HTTP/S: invalid http-ack-level: " + str_ack_level);
      }
    }
  }

  // POST the event to the endpoint and wait for completion (blocking or
  // yielding per 'y'); returns the RGWHTTP::process() result
  int send_to_completion_async(CephContext* cct, const rgw_pubsub_s3_event& event, optional_yield y) override {
    bufferlist read_bl;
    RGWPostHTTPData request(cct, "POST", endpoint, &read_bl, verify_ssl);
    const auto post_data = json_format_pubsub_event(event);
    if (cloudevents) {
      // following: https://github.com/cloudevents/spec/blob/v1.0.1/http-protocol-binding.md
      // using "Binary Content Mode": event metadata goes into ce-* headers,
      // the JSON payload stays in the body
      request.append_header("ce-specversion", "1.0");
      request.append_header("ce-type", "com.amazonaws." + event.eventName);
      request.append_header("ce-time", to_iso_8601(event.eventTime));
      // default output of iso8601 is also RFC3339 compatible
      request.append_header("ce-id", event.x_amz_request_id + "." + event.x_amz_id_2);
      request.append_header("ce-source", event.eventSource + "." + event.awsRegion + "." + event.bucket_name);
      request.append_header("ce-subject", event.object_key);
    }
    request.set_post_data(post_data);
    request.set_send_length(post_data.length());
    request.append_header("Content-Type", "application/json");
    // track in-flight pushes in perf counters (if enabled)
    if (perfcounter) perfcounter->inc(l_rgw_pubsub_push_pending);
    const auto rc = RGWHTTP::process(&request, y);
    if (perfcounter) perfcounter->dec(l_rgw_pubsub_push_pending);
    // TODO: use read_bl to process return code and handle according to ack level
    return rc;
  }

  // human-readable description for logging/"get topic" output
  // (note: cloudevents mode is not reflected here)
  std::string to_str() const override {
    std::string str("HTTP/S Endpoint");
    str += "\nURI: " + endpoint;
    str += (verify_ssl ? "\nverify SSL" : "\ndon't verify SSL");
    return str;
  }
};
+
+#ifdef WITH_RADOSGW_AMQP_ENDPOINT
// AMQP 0.9.1 endpoint: publishes notifications to a broker through the
// rgw_amqp connection manager. ack levels: "none" = fire-and-forget,
// "broker"/"routable" = wait for broker confirmation (currently identical)
class RGWPubSubAMQPEndpoint : public RGWPubSubEndpoint {
private:
  enum class ack_level_t {
    None,
    Broker,
    Routable
  };
  CephContext* const cct;
  const std::string endpoint;
  const std::string topic;
  const std::string exchange;
  ack_level_t ack_level;
  amqp::connection_id_t conn_id;

  // parse optional "verify-ssl" arg (default: true);
  // throws configuration_error if the value is not "true"/"false"
  bool get_verify_ssl(const RGWHTTPArgs& args) {
    bool exists;
    auto str_verify_ssl = args.get("verify-ssl", &exists);
    if (!exists) {
      // verify server certificate by default
      return true;
    }
    boost::algorithm::to_lower(str_verify_ssl);
    if (str_verify_ssl == "true") {
      return true;
    }
    if (str_verify_ssl == "false") {
      return false;
    }
    throw configuration_error("'verify-ssl' must be true/false, not: " + str_verify_ssl);
  }

  // mandatory "amqp-exchange" arg; throws configuration_error if missing
  std::string get_exchange(const RGWHTTPArgs& args) {
    bool exists;
    const auto exchange = args.get("amqp-exchange", &exists);
    if (!exists) {
      throw configuration_error("AMQP: missing amqp-exchange");
    }
    return exchange;
  }

  // optional "amqp-ack-level" arg: "none" | "broker" (default) | "routable";
  // throws configuration_error on any other value
  ack_level_t get_ack_level(const RGWHTTPArgs& args) {
    bool exists;
    const auto& str_ack_level = args.get("amqp-ack-level", &exists);
    if (!exists || str_ack_level == "broker") {
      // "broker" is default
      return ack_level_t::Broker;
    }
    if (str_ack_level == "none") {
      return ack_level_t::None;
    }
    if (str_ack_level == "routable") {
      return ack_level_t::Routable;
    }
    throw configuration_error("AMQP: invalid amqp-ack-level: " + str_ack_level);
  }

public:
  // establishes the broker connection at construction time;
  // throws configuration_error on invalid args or connection failure
  RGWPubSubAMQPEndpoint(const std::string& _endpoint,
      const std::string& _topic,
      const RGWHTTPArgs& args,
      CephContext* _cct) :
    cct(_cct),
    endpoint(_endpoint),
    topic(_topic),
    exchange(get_exchange(args)),
    ack_level(get_ack_level(args)) {
    if (!amqp::connect(conn_id, endpoint, exchange, (ack_level == ack_level_t::Broker), get_verify_ssl(args), args.get_optional("ca-location"))) {
      throw configuration_error("AMQP: failed to create connection to: " + endpoint);
    }
  }

  // allows waiting until "finish()" is called from a different thread;
  // the wait either blocks the calling thread (condition variable) or
  // yields (asio coroutine), depending on whether optional_yield is set
  class Waiter {
    using Signature = void(boost::system::error_code);
    using Completion = ceph::async::Completion<Signature>;
    std::unique_ptr<Completion> completion = nullptr;
    int ret;

    mutable std::atomic<bool> done = false;
    mutable std::mutex lock;
    mutable std::condition_variable cond;

    // register an asio completion handler that finish() will post
    template <typename ExecutionContext, typename CompletionToken>
    auto async_wait(ExecutionContext& ctx, CompletionToken&& token) {
      boost::asio::async_completion<CompletionToken, Signature> init(token);
      auto& handler = init.completion_handler;
      {
        std::unique_lock l{lock};
        completion = Completion::create(ctx.get_executor(), std::move(handler));
      }
      return init.result.get();
    }

  public:
    // wait for finish() and return the code it delivered.
    // NOTE(review): if finish() runs between the lock-free 'done' check and
    // the completion registration in async_wait(), the registered completion
    // is never posted — verify this window cannot occur in practice
    int wait(optional_yield y) {
      if (done) {
        return ret;
      }
      if (y) {
        auto& io_ctx = y.get_io_context();
        auto& yield_ctx = y.get_yield_context();
        boost::system::error_code ec;
        async_wait(io_ctx, yield_ctx[ec]);
        // error_code holds a positive errno-style value; negate back
        return -ec.value();
      }
      std::unique_lock l(lock);
      cond.wait(l, [this]{return (done==true);});
      return ret;
    }

    // deliver the result: posts the registered asio completion if the
    // waiter is yielding, otherwise wakes the blocking waiter via condvar.
    // 'r' is a 0-or-negative return code, negated into the error_code
    void finish(int r) {
      std::unique_lock l{lock};
      ret = r;
      done = true;
      if (completion) {
        boost::system::error_code ec(-ret, boost::system::system_category());
        Completion::post(std::move(completion), ec);
      } else {
        cond.notify_all();
      }
    }
  };

  // publish the event; with ack level "none" returns right after queueing,
  // otherwise waits (blocking or yielding) for the broker's confirmation
  int send_to_completion_async(CephContext* cct, const rgw_pubsub_s3_event& event, optional_yield y) override {
    if (ack_level == ack_level_t::None) {
      return amqp::publish(conn_id, topic, json_format_pubsub_event(event));
    } else {
      // TODO: currently broker and routable are the same - this will require different flags but the same mechanism
      // note: dynamic allocation of Waiter is needed when this is invoked from a beast coroutine
      auto w = std::unique_ptr<Waiter>(new Waiter);
      const auto rc = amqp::publish_with_confirm(conn_id,
        topic,
        json_format_pubsub_event(event),
        std::bind(&Waiter::finish, w.get(), std::placeholders::_1));
      if (rc < 0) {
        // failed to publish, does not wait for reply
        return rc;
      }
      return w->wait(y);
    }
  }

  // human-readable description for logging/"get topic" output
  std::string to_str() const override {
    std::string str("AMQP(0.9.1) Endpoint");
    str += "\nURI: " + endpoint;
    str += "\nTopic: " + topic;
    str += "\nExchange: " + exchange;
    return str;
  }
};
+
+static const std::string AMQP_0_9_1("0-9-1");
+static const std::string AMQP_1_0("1-0");
+static const std::string AMQP_SCHEMA("amqp");
+#endif // ifdef WITH_RADOSGW_AMQP_ENDPOINT
+
+#ifdef WITH_RADOSGW_KAFKA_ENDPOINT
// Kafka endpoint: publishes notifications through the rgw_kafka connection
// manager. ack levels: "none" = fire-and-forget, "broker" (default) = wait
// for broker acknowledgement
class RGWPubSubKafkaEndpoint : public RGWPubSubEndpoint {
private:
  enum class ack_level_t {
    None,
    Broker,
  };
  CephContext* const cct;
  const std::string topic;
  const ack_level_t ack_level;
  // broker connection name as registered with rgw_kafka (filled by connect)
  std::string conn_name;


  // optional "kafka-ack-level" arg: "none" | "broker" (default);
  // throws configuration_error on any other value
  ack_level_t get_ack_level(const RGWHTTPArgs& args) {
    bool exists;
    const auto& str_ack_level = args.get("kafka-ack-level", &exists);
    if (!exists || str_ack_level == "broker") {
      // "broker" is default
      return ack_level_t::Broker;
    }
    if (str_ack_level == "none") {
      return ack_level_t::None;
    }
    throw configuration_error("Kafka: invalid kafka-ack-level: " + str_ack_level);
  }

public:
  // establishes the broker connection at construction time, honoring
  // use-ssl (default false), verify-ssl (default true), ca-location and
  // mechanism args; throws configuration_error on failure
  RGWPubSubKafkaEndpoint(const std::string& _endpoint,
      const std::string& _topic,
      const RGWHTTPArgs& args,
      CephContext* _cct) :
    cct(_cct),
    topic(_topic),
    ack_level(get_ack_level(args)) {
    if (!kafka::connect(conn_name, _endpoint, get_bool(args, "use-ssl", false), get_bool(args, "verify-ssl", true),
        args.get_optional("ca-location"), args.get_optional("mechanism"))) {
      throw configuration_error("Kafka: failed to create connection to: " + _endpoint);
    }
  }

  // allows waiting until "finish()" is called from a different thread;
  // the wait either blocks the calling thread (condition variable) or
  // yields (asio coroutine), depending on whether optional_yield is set
  class Waiter {
    using Signature = void(boost::system::error_code);
    using Completion = ceph::async::Completion<Signature>;
    std::unique_ptr<Completion> completion = nullptr;
    int ret;

    mutable std::atomic<bool> done = false;
    mutable std::mutex lock;
    mutable std::condition_variable cond;

    // register an asio completion handler that finish() will post
    template <typename ExecutionContext, typename CompletionToken>
    auto async_wait(ExecutionContext& ctx, CompletionToken&& token) {
      boost::asio::async_completion<CompletionToken, Signature> init(token);
      auto& handler = init.completion_handler;
      {
        std::unique_lock l{lock};
        completion = Completion::create(ctx.get_executor(), std::move(handler));
      }
      return init.result.get();
    }

  public:
    // wait for finish() and return the code it delivered.
    // NOTE(review): same caveat as the AMQP waiter — a finish() racing the
    // 'done' check and the completion registration would never be posted
    int wait(optional_yield y) {
      if (done) {
        return ret;
      }
      if (y) {
        auto& io_ctx = y.get_io_context();
        auto& yield_ctx = y.get_yield_context();
        boost::system::error_code ec;
        async_wait(io_ctx, yield_ctx[ec]);
        // error_code holds a positive errno-style value; negate back
        return -ec.value();
      }
      std::unique_lock l(lock);
      cond.wait(l, [this]{return (done==true);});
      return ret;
    }

    // deliver the result: posts the registered asio completion if the
    // waiter is yielding, otherwise wakes the blocking waiter via condvar
    void finish(int r) {
      std::unique_lock l{lock};
      ret = r;
      done = true;
      if (completion) {
        boost::system::error_code ec(-ret, boost::system::system_category());
        Completion::post(std::move(completion), ec);
      } else {
        cond.notify_all();
      }
    }
  };

  // publish the event; with ack level "none" returns right after queueing,
  // otherwise waits (blocking or yielding) for the broker acknowledgement
  int send_to_completion_async(CephContext* cct, const rgw_pubsub_s3_event& event, optional_yield y) override {
    if (ack_level == ack_level_t::None) {
      return kafka::publish(conn_name, topic, json_format_pubsub_event(event));
    } else {
      // note: dynamic allocation of Waiter is needed when this is invoked from a beast coroutine
      auto w = std::unique_ptr<Waiter>(new Waiter);
      const auto rc = kafka::publish_with_confirm(conn_name,
        topic,
        json_format_pubsub_event(event),
        std::bind(&Waiter::finish, w.get(), std::placeholders::_1));
      if (rc < 0) {
        // failed to publish, does not wait for reply
        return rc;
      }
      return w->wait(y);
    }
  }

  // human-readable description for logging/"get topic" output
  std::string to_str() const override {
    std::string str("Kafka Endpoint");
    str += "\nBroker: " + conn_name;
    str += "\nTopic: " + topic;
    return str;
  }
};
+
+static const std::string KAFKA_SCHEMA("kafka");
+#endif // ifdef WITH_RADOSGW_KAFKA_ENDPOINT
+
static const std::string WEBHOOK_SCHEMA("webhook");
static const std::string UNKNOWN_SCHEMA("unknown");
static const std::string NO_SCHEMA("");

// map an endpoint URI to the internal schema token used for endpoint
// dispatch; returns a reference to one of the file-scoped constants above.
// empty input -> NO_SCHEMA, no ':' or unrecognized scheme -> UNKNOWN_SCHEMA
const std::string& get_schema(const std::string& endpoint) {
  if (endpoint.empty()) {
    return NO_SCHEMA;
  }
  const auto colon = endpoint.find(':');
  if (colon == std::string::npos) {
    return UNKNOWN_SCHEMA;
  }
  const std::string schema = endpoint.substr(0, colon);
  if (schema == "http" || schema == "https") {
    return WEBHOOK_SCHEMA;
  }
#ifdef WITH_RADOSGW_AMQP_ENDPOINT
  if (schema == "amqp" || schema == "amqps") {
    return AMQP_SCHEMA;
  }
#endif
#ifdef WITH_RADOSGW_KAFKA_ENDPOINT
  if (schema == "kafka") {
    return KAFKA_SCHEMA;
  }
#endif
  return UNKNOWN_SCHEMA;
}
+
// factory: select the endpoint implementation by the URI scheme of
// 'endpoint'. throws configuration_error for unknown schemes, unsupported
// AMQP versions, or any error raised by an endpoint constructor
RGWPubSubEndpoint::Ptr RGWPubSubEndpoint::create(const std::string& endpoint,
    const std::string& topic,
    const RGWHTTPArgs& args,
    CephContext* cct) {
  const auto& schema = get_schema(endpoint);
  if (schema == WEBHOOK_SCHEMA) {
    return Ptr(new RGWPubSubHTTPEndpoint(endpoint, args));
#ifdef WITH_RADOSGW_AMQP_ENDPOINT
  } else if (schema == AMQP_SCHEMA) {
    // optional "amqp-version" arg selects the protocol; only 0-9-1 is
    // implemented
    bool exists;
    std::string version = args.get("amqp-version", &exists);
    if (!exists) {
      version = AMQP_0_9_1;
    }
    if (version == AMQP_0_9_1) {
      return Ptr(new RGWPubSubAMQPEndpoint(endpoint, topic, args, cct));
    } else if (version == AMQP_1_0) {
      throw configuration_error("AMQP: v1.0 not supported");
      return nullptr; // unreachable after throw
    } else {
      throw configuration_error("AMQP: unknown version: " + version);
      return nullptr; // unreachable after throw
    }
#endif
#ifdef WITH_RADOSGW_KAFKA_ENDPOINT
  } else if (schema == KAFKA_SCHEMA) {
    return Ptr(new RGWPubSubKafkaEndpoint(endpoint, topic, args, cct));
#endif
  }

  throw configuration_error("unknown schema in: " + endpoint);
  return nullptr; // unreachable after throw
}
+
diff --git a/src/rgw/driver/rados/rgw_pubsub_push.h b/src/rgw/driver/rados/rgw_pubsub_push.h
new file mode 100644
index 000000000..17905937c
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_pubsub_push.h
@@ -0,0 +1,47 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+#pragma once
+
+#include <string>
+#include <memory>
+#include <stdexcept>
+#include "include/buffer_fwd.h"
+#include "include/common_fwd.h"
+#include "common/async/yield_context.h"
+
+// TODO the env should be used as a template parameter to differentiate the source that triggers the pushes
+class RGWDataSyncEnv;
+class RGWHTTPArgs;
+struct rgw_pubsub_s3_event;
+
// base class for all notification endpoint types; concrete endpoints
// (HTTP/S, AMQP, Kafka, ...) derive from it
class RGWPubSubEndpoint {
public:
  RGWPubSubEndpoint() = default;
  // endpoints hold live connections/state and should not be copied
  RGWPubSubEndpoint(const RGWPubSubEndpoint&) = delete;
  const RGWPubSubEndpoint& operator=(const RGWPubSubEndpoint&) = delete;

  typedef std::unique_ptr<RGWPubSubEndpoint> Ptr;

  // factory method for the actual notification endpoint
  // derived class specific arguments are passed in http args format
  // may throw a configuration_error if creation fails
  static Ptr create(const std::string& endpoint, const std::string& topic, const RGWHTTPArgs& args, CephContext *cct=nullptr);

  // this method is used in order to send notification (S3 compliant) and wait for completion
  // in async manner via a coroutine when invoked in the frontend environment;
  // returns 0 on success, a negative error code otherwise
  virtual int send_to_completion_async(CephContext* cct, const rgw_pubsub_s3_event& event, optional_yield y) = 0;

  // present as string (e.g. for logging); default is empty
  virtual std::string to_str() const { return ""; }

  virtual ~RGWPubSubEndpoint() = default;

  // exception object for configuration error, thrown by create() and by
  // derived-class constructors
  struct configuration_error : public std::logic_error {
    configuration_error(const std::string& what_arg) :
      std::logic_error("pubsub endpoint configuration error: " + what_arg) {}
  };
};
+
diff --git a/src/rgw/driver/rados/rgw_putobj_processor.cc b/src/rgw/driver/rados/rgw_putobj_processor.cc
new file mode 100644
index 000000000..e453db5a9
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_putobj_processor.cc
@@ -0,0 +1,761 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/rados/librados.hpp"
+#include "rgw_aio.h"
+#include "rgw_putobj_processor.h"
+#include "rgw_multi.h"
+#include "rgw_compression.h"
+#include "services/svc_sys_obj.h"
+#include "services/svc_zone.h"
+#include "rgw_sal_rados.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+namespace rgw::putobj {
+
/*
 * For cloud-tiered objects, update the object manifest with the
 * cloudtier config info read from the attrs.
 * Since these attrs are used internally only for replication, do not store
 * them in the head object: both attrs are erased from 'attrs' once consumed.
 */
void read_cloudtier_info_from_attrs(rgw::sal::Attrs& attrs, RGWObjCategory& category,
                                    RGWObjManifest& manifest) {
  auto attr_iter = attrs.find(RGW_ATTR_CLOUD_TIER_TYPE);
  if (attr_iter != attrs.end()) {
    auto i = attr_iter->second;
    string m = i.to_str();

    // only the "cloud-s3" tier type is recognized here
    if (m == "cloud-s3") {
      category = RGWObjCategory::CloudTiered;
      manifest.set_tier_type("cloud-s3");

      auto config_iter = attrs.find(RGW_ATTR_CLOUD_TIER_CONFIG);
      if (config_iter != attrs.end()) {
        auto i = config_iter->second.cbegin();
        RGWObjTier tier_config;

        try {
          using ceph::decode;
          decode(tier_config, i);
          manifest.set_tier_config(tier_config);
          attrs.erase(config_iter);
        } catch (buffer::error& err) {
          // NOTE(review): decode errors are silently swallowed; the object
          // stays CloudTiered with no tier config — consider logging here
        }
      }
    }
    attrs.erase(attr_iter);
  }
}
+
// stream data through the processor, diverting the first 'head_chunk_size'
// bytes into 'head_data' so the head object can be handled specially by
// process_first_chunk(). an empty 'data' buffer signals a flush.
int HeadObjectProcessor::process(bufferlist&& data, uint64_t logical_offset)
{
  const bool flush = (data.length() == 0);

  // capture the first chunk for special handling
  // (the 'data_offset == 0' clause makes this path run at least once even
  // when head_chunk_size is 0, so process_first_chunk() is always called)
  if (data_offset < head_chunk_size || data_offset == 0) {
    if (flush) {
      // flush partial chunk
      return process_first_chunk(std::move(head_data), &processor);
    }

    // move up to 'remaining' bytes from the front of 'data' into head_data
    auto remaining = head_chunk_size - data_offset;
    auto count = std::min<uint64_t>(data.length(), remaining);
    data.splice(0, count, &head_data);
    data_offset += count;

    if (data_offset == head_chunk_size) {
      // process the first complete chunk
      ceph_assert(head_data.length() == head_chunk_size);
      int r = process_first_chunk(std::move(head_data), &processor);
      if (r < 0) {
        return r;
      }
    }
    if (data.length() == 0) { // avoid flushing stripe processor
      return 0;
    }
  }
  ceph_assert(processor); // process_first_chunk() must initialize

  // send everything else through the processor
  auto write_offset = data_offset;
  data_offset += data.length();
  return processor->process(std::move(data), write_offset);
}
+
+
+static int process_completed(const AioResultList& completed, RawObjSet *written)
+{
+ std::optional<int> error;
+ for (auto& r : completed) {
+ if (r.result >= 0) {
+ written->insert(r.obj.get_ref().obj);
+ } else if (!error) { // record first error code
+ error = r.result;
+ }
+ }
+ return error.value_or(0);
+}
+
// attach rados allocation hints to the write op; when the object state is
// flagged as holding compressed data, hint INCOMPRESSIBLE (presumably so
// the OSD skips re-compressing it — confirm against bluestore behavior)
void RadosWriter::add_write_hint(librados::ObjectWriteOperation& op) {
  const RGWObjStateManifest *sm = obj_ctx.get_state(head_obj);
  const bool compressed = sm->state.compressed;
  uint32_t alloc_hint_flags = 0;
  if (compressed) {
    alloc_hint_flags |= librados::ALLOC_HINT_FLAG_INCOMPRESSIBLE;
  }

  // expected object/write sizes unknown here, so both are passed as 0
  op.set_alloc_hint2(0, 0, alloc_hint_flags);
}
+
// point the writer at a new stripe object and open its rados reference;
// subsequent process()/write_exclusive() calls target this object
int RadosWriter::set_stripe_obj(const rgw_raw_obj& raw_obj)
{
  stripe_obj = store->svc.rados->obj(raw_obj);
  return stripe_obj.open(dpp);
}
+
// asynchronously write a buffer to the current stripe object at 'offset'.
// the data length is used as the aio throttle cost. results of previously
// completed aios are folded into 'written' and the first error is returned.
int RadosWriter::process(bufferlist&& bl, uint64_t offset)
{
  bufferlist data = std::move(bl);
  const uint64_t cost = data.length();
  if (cost == 0) { // no empty writes, use aio directly for creates
    return 0;
  }
  librados::ObjectWriteOperation op;
  add_write_hint(op);
  if (offset == 0) {
    // write_full() replaces any existing object content
    op.write_full(data);
  } else {
    op.write(offset, data);
  }
  constexpr uint64_t id = 0; // unused
  auto c = aio->get(stripe_obj, Aio::librados_op(std::move(op), y), cost, id);
  return process_completed(c, &written);
}
+
// write the full object content with an exclusive create, draining all
// pending aio so the result (notably -EEXIST) is known synchronously on
// return — callers rely on this to detect oid collisions
int RadosWriter::write_exclusive(const bufferlist& data)
{
  const uint64_t cost = data.length();

  librados::ObjectWriteOperation op;
  op.create(true); // exclusive create
  add_write_hint(op);
  op.write_full(data);

  constexpr uint64_t id = 0; // unused
  auto c = aio->get(stripe_obj, Aio::librados_op(std::move(op), y), cost, id);
  // wait for everything, including this write, before reporting
  auto d = aio->drain();
  c.splice(c.end(), d);
  return process_completed(c, &written);
}
+
// wait for all outstanding aio writes, record successes in 'written',
// and return the first error encountered (0 if none)
int RadosWriter::drain()
{
  return process_completed(aio->drain(), &written);
}
+
// destructor: best-effort cleanup of objects written by a canceled or
// failed upload (on success, complete() clears 'written' so nothing is
// removed here)
RadosWriter::~RadosWriter()
{
  // wait on any outstanding aio completions
  process_completed(aio->drain(), &written);

  bool need_to_remove_head = false;
  std::optional<rgw_raw_obj> raw_head;
  if (!head_obj.empty()) {
    raw_head.emplace();
    store->obj_to_raw(bucket_info.placement_rule, head_obj, &*raw_head);
  }

  /**
   * We should delete the object in the "multipart" namespace to avoid race condition.
   * Such race condition is caused by the fact that the multipart object is the gatekeeper of a multipart
   * upload, when it is deleted, a second upload would start with the same suffix("2/"), therefore, objects
   * written by the second upload may be deleted by the first upload.
   * details are described on #11749
   *
   * The above comment still stands, but instead of searching for a specific object in the multipart
   * namespace, we just make sure that we remove the object that is marked as the head object after
   * we remove all the other raw objects. Note that we use a different call to remove the head object,
   * as this one needs to go via the bucket index prepare/complete 2-phase commit scheme.
   */
  for (const auto& obj : written) {
    if (raw_head && obj == *raw_head) {
      ldpp_dout(dpp, 5) << "NOTE: we should not process the head object (" << obj << ") here" << dendl;
      need_to_remove_head = true;
      continue;
    }

    // removal failures are logged and ignored; the object is leaked
    int r = store->delete_raw_obj(dpp, obj);
    if (r < 0 && r != -ENOENT) {
      ldpp_dout(dpp, 0) << "WARNING: failed to remove obj (" << obj << "), leaked" << dendl;
    }
  }

  if (need_to_remove_head) {
    std::string version_id;
    ldpp_dout(dpp, 5) << "NOTE: we are going to process the head obj (" << *raw_head << ")" << dendl;
    int r = store->delete_obj(dpp, obj_ctx, bucket_info, head_obj, 0, 0);
    if (r < 0 && r != -ENOENT) {
      ldpp_dout(dpp, 0) << "WARNING: failed to remove obj (" << *raw_head << "), leaked" << dendl;
    }
  }
}
+
+
+// advance to the next stripe
+int ManifestObjectProcessor::next(uint64_t offset, uint64_t *pstripe_size)
+{
+ // advance the manifest
+ int r = manifest_gen.create_next(offset);
+ if (r < 0) {
+ return r;
+ }
+
+ rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store);
+
+ uint64_t chunk_size = 0;
+ r = store->get_max_chunk_size(stripe_obj.pool, &chunk_size, dpp);
+ if (r < 0) {
+ return r;
+ }
+ r = writer.set_stripe_obj(stripe_obj);
+ if (r < 0) {
+ return r;
+ }
+
+ chunk = ChunkProcessor(&writer, chunk_size);
+ *pstripe_size = manifest_gen.cur_stripe_max_size();
+ return 0;
+}
+
+
+
// stash the first chunk; it is written as the head object's inline data by
// complete() (see obj_op.meta.data). all later data goes through 'stripe'.
int AtomicObjectProcessor::process_first_chunk(bufferlist&& data,
                                               DataProcessor **processor)
{
  first_chunk = std::move(data);
  *processor = &stripe;
  return 0;
}
+
// set up the manifest and stripe/chunk processors for an atomic PUT.
// decides how much data may live inline in the head object: none when the
// tail placement maps to a different pool (or inline data is disabled for
// the placement), otherwise up to one chunk.
int AtomicObjectProcessor::prepare(optional_yield y)
{
  uint64_t max_head_chunk_size;
  uint64_t head_max_size;
  uint64_t chunk_size = 0;
  uint64_t alignment;
  rgw_pool head_pool;

  if (!store->get_obj_data_pool(bucket_info.placement_rule, head_obj, &head_pool)) {
    return -EIO;
  }

  int r = store->get_max_chunk_size(head_pool, &max_head_chunk_size, dpp, &alignment);
  if (r < 0) {
    return r;
  }

  bool same_pool = true;
  if (bucket_info.placement_rule != tail_placement_rule) {
    rgw_pool tail_pool;
    if (!store->get_obj_data_pool(tail_placement_rule, head_obj, &tail_pool)) {
      return -EIO;
    }

    if (tail_pool != head_pool) {
      same_pool = false;

      r = store->get_max_chunk_size(tail_pool, &chunk_size, dpp);
      if (r < 0) {
        return r;
      }

      // tail lives elsewhere: keep no data in the head object
      head_max_size = 0;
    }
  }

  if (same_pool) {
    // head may hold inline data unless the zone placement disables it
    RGWZonePlacementInfo placement_info;
    if (!store->svc.zone->get_zone_params().get_placement(bucket_info.placement_rule.name, &placement_info) || placement_info.inline_data) {
      head_max_size = max_head_chunk_size;
    } else {
      head_max_size = 0;
    }
    chunk_size = max_head_chunk_size;
  }

  uint64_t stripe_size;
  const uint64_t default_stripe_size = store->ctx()->_conf->rgw_obj_stripe_size;

  store->get_max_aligned_size(default_stripe_size, alignment, &stripe_size);

  manifest.set_trivial_rule(head_max_size, stripe_size);

  r = manifest_gen.create_begin(store->ctx(), &manifest,
                                bucket_info.placement_rule,
                                &tail_placement_rule,
                                head_obj.bucket, head_obj);
  if (r < 0) {
    return r;
  }

  rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store);

  r = writer.set_stripe_obj(stripe_obj);
  if (r < 0) {
    return r;
  }

  set_head_chunk_size(head_max_size);
  // initialize the processors
  chunk = ChunkProcessor(&writer, chunk_size);
  stripe = StripeProcessor(&chunk, this, head_max_size);
  return 0;
}
+
// finish an atomic PUT: drain outstanding writes, close out the manifest,
// then write the head object (inline data + attrs + manifest) through the
// bucket-index 2-phase write. on success the writer's deletion set is
// cleared so the tail objects are preserved.
int AtomicObjectProcessor::complete(size_t accounted_size,
                                    const std::string& etag,
                                    ceph::real_time *mtime,
                                    ceph::real_time set_mtime,
                                    rgw::sal::Attrs& attrs,
                                    ceph::real_time delete_at,
                                    const char *if_match,
                                    const char *if_nomatch,
                                    const std::string *user_data,
                                    rgw_zone_set *zones_trace,
                                    bool *pcanceled, optional_yield y)
{
  int r = writer.drain();
  if (r < 0) {
    return r;
  }
  // record the final object size in the manifest
  const uint64_t actual_size = get_actual_size();
  r = manifest_gen.create_next(actual_size);
  if (r < 0) {
    return r;
  }

  obj_ctx.set_atomic(head_obj);

  RGWRados::Object op_target(store, bucket_info, obj_ctx, head_obj);

  /* some object types shouldn't be versioned, e.g., multipart parts */
  op_target.set_versioning_disabled(!bucket_info.versioning_enabled());

  RGWRados::Object::Write obj_op(&op_target);
  obj_op.meta.data = &first_chunk;
  obj_op.meta.manifest = &manifest;
  obj_op.meta.ptag = &unique_tag; /* use req_id as operation tag */
  obj_op.meta.if_match = if_match;
  obj_op.meta.if_nomatch = if_nomatch;
  obj_op.meta.mtime = mtime;
  obj_op.meta.set_mtime = set_mtime;
  obj_op.meta.owner = owner;
  obj_op.meta.flags = PUT_OBJ_CREATE;
  obj_op.meta.olh_epoch = olh_epoch;
  obj_op.meta.delete_at = delete_at;
  obj_op.meta.user_data = user_data;
  obj_op.meta.zones_trace = zones_trace;
  obj_op.meta.modify_tail = true;

  // cloud-tier attrs (if present) are folded into the manifest and removed
  read_cloudtier_info_from_attrs(attrs, obj_op.meta.category, manifest);

  r = obj_op.write_meta(dpp, actual_size, accounted_size, attrs, y);
  if (r < 0) {
    if (r == -ETIMEDOUT) {
      // The head object write may eventually succeed, clear the set of objects for deletion. if it
      // doesn't ever succeed, we'll orphan any tail objects as if we'd crashed before that write
      writer.clear_written();
    }
    return r;
  }
  if (!obj_op.meta.canceled) {
    // on success, clear the set of objects for deletion
    writer.clear_written();
  }
  if (pcanceled) {
    *pcanceled = obj_op.meta.canceled;
  }
  return 0;
}
+
+
// write the part's first chunk; an -EEXIST from the exclusive create means
// another upload already used this oid, so the prefix is re-randomized and
// the write retried once against the new head object
int MultipartObjectProcessor::process_first_chunk(bufferlist&& data,
                                                  DataProcessor **processor)
{
  // write the first chunk of the head object as part of an exclusive create,
  // then drain to wait for the result in case of EEXIST
  int r = writer.write_exclusive(data);
  if (r == -EEXIST) {
    // randomize the oid prefix and reprepare the head/manifest
    std::string oid_rand = gen_rand_alphanumeric(store->ctx(), 32);

    mp.init(target_obj.key.name, upload_id, oid_rand);
    manifest.set_prefix(target_obj.key.name + "." + oid_rand);

    r = prepare_head();
    if (r < 0) {
      return r;
    }
    // resubmit the write op on the new head object
    r = writer.write_exclusive(data);
  }
  if (r < 0) {
    return r;
  }
  *processor = &stripe;
  return 0;
}
+
// (re)initialize the manifest and processors for this part and derive the
// part's head object from the first stripe of the manifest. called from
// prepare(), and again from process_first_chunk() after an oid collision.
int MultipartObjectProcessor::prepare_head()
{
  const uint64_t default_stripe_size = store->ctx()->_conf->rgw_obj_stripe_size;
  uint64_t chunk_size;
  uint64_t stripe_size;
  uint64_t alignment;

  int r = store->get_max_chunk_size(tail_placement_rule, target_obj, &chunk_size, dpp, &alignment);
  if (r < 0) {
    ldpp_dout(dpp, 0) << "ERROR: unexpected: get_max_chunk_size(): placement_rule=" << tail_placement_rule.to_str() << " obj=" << target_obj << " returned r=" << r << dendl;
    return r;
  }
  store->get_max_aligned_size(default_stripe_size, alignment, &stripe_size);

  manifest.set_multipart_part_rule(stripe_size, part_num);

  r = manifest_gen.create_begin(store->ctx(), &manifest,
                                bucket_info.placement_rule,
                                &tail_placement_rule,
                                target_obj.bucket, target_obj);
  if (r < 0) {
    return r;
  }

  // the part's head object is the manifest's first stripe object;
  // hash bucket-index shard placement by the final object name
  rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store);
  RGWSI_Tier_RADOS::raw_obj_to_obj(head_obj.bucket, stripe_obj, &head_obj);
  head_obj.index_hash_source = target_obj.key.name;

  r = writer.set_stripe_obj(stripe_obj);
  if (r < 0) {
    return r;
  }
  // head chunking follows the first stripe's size
  stripe_size = manifest_gen.cur_stripe_max_size();
  set_head_chunk_size(stripe_size);

  chunk = ChunkProcessor(&writer, chunk_size);
  stripe = StripeProcessor(&chunk, this, stripe_size);
  return 0;
}
+
// prepare a multipart part upload: seed the manifest prefix from the
// object name and upload id, then set up head/stripe state
int MultipartObjectProcessor::prepare(optional_yield y)
{
  manifest.set_prefix(target_obj.key.name + "." + upload_id);

  return prepare_head();
}
+
// finish a part upload: drain writes, close the manifest, write the part's
// head object meta, then register the part info on the multipart meta
// object (via CLS call, with an omap fallback for older OSDs)
int MultipartObjectProcessor::complete(size_t accounted_size,
                                       const std::string& etag,
                                       ceph::real_time *mtime,
                                       ceph::real_time set_mtime,
                                       std::map<std::string, bufferlist>& attrs,
                                       ceph::real_time delete_at,
                                       const char *if_match,
                                       const char *if_nomatch,
                                       const std::string *user_data,
                                       rgw_zone_set *zones_trace,
                                       bool *pcanceled, optional_yield y)
{
  int r = writer.drain();
  if (r < 0) {
    return r;
  }
  const uint64_t actual_size = get_actual_size();
  r = manifest_gen.create_next(actual_size);
  if (r < 0) {
    return r;
  }

  RGWRados::Object op_target(store, bucket_info, obj_ctx, head_obj);
  // parts are never versioned; their placement follows the tail rule
  op_target.set_versioning_disabled(true);
  op_target.set_meta_placement_rule(&tail_placement_rule);

  RGWRados::Object::Write obj_op(&op_target);
  obj_op.meta.set_mtime = set_mtime;
  obj_op.meta.mtime = mtime;
  obj_op.meta.owner = owner;
  obj_op.meta.delete_at = delete_at;
  obj_op.meta.zones_trace = zones_trace;
  obj_op.meta.modify_tail = true;

  r = obj_op.write_meta(dpp, actual_size, accounted_size, attrs, y);
  if (r < 0)
    return r;

  RGWUploadPartInfo info;
  string p = "part.";
  bool sorted_omap = is_v2_upload_id(upload_id);

  if (sorted_omap) {
    // v2 upload ids zero-pad the part number so omap keys sort numerically
    char buf[32];
    snprintf(buf, sizeof(buf), "%08d", part_num);
    p.append(buf);
  } else {
    p.append(part_num_str);
  }
  info.num = part_num;
  info.etag = etag;
  info.size = actual_size;
  info.accounted_size = accounted_size;
  info.modified = real_clock::now();
  info.manifest = manifest;

  bool compressed;
  r = rgw_compression_info_from_attrset(attrs, compressed, info.cs_info);
  if (r < 0) {
    ldpp_dout(dpp, 1) << "cannot get compression info" << dendl;
    return r;
  }

  // the multipart meta object tracks all parts of this upload
  rgw_obj meta_obj;
  meta_obj.init_ns(bucket_info.bucket, mp.get_meta(), RGW_OBJ_NS_MULTIPART);
  meta_obj.set_in_extra_data(true);

  rgw_raw_obj meta_raw_obj;
  store->obj_to_raw(bucket_info.placement_rule, meta_obj, &meta_raw_obj);

  rgw_rados_ref meta_obj_ref;
  r = store->get_raw_obj_ref(dpp, meta_raw_obj, &meta_obj_ref);
  if (r < 0) {
    ldpp_dout(dpp, -1) << "ERROR: failed to get obj ref of meta obj with ret=" << r << dendl;
    return r;
  }

  librados::ObjectWriteOperation op;
  cls_rgw_mp_upload_part_info_update(op, p, info);
  r = rgw_rados_operate(dpp, meta_obj_ref.pool.ioctx(), meta_obj_ref.obj.oid, &op, y);
  ldpp_dout(dpp, 20) << "Update meta: " << meta_obj_ref.obj.oid << " part " << p << " prefix " << info.manifest.get_prefix() << " return " << r << dendl;

  if (r == -EOPNOTSUPP) {
    // New CLS call to update part info is not yet supported. Fall back to the old handling.
    bufferlist bl;
    encode(info, bl);

    map<string, bufferlist> m;
    m[p] = bl;

    op = librados::ObjectWriteOperation{};
    op.assert_exists(); // detect races with abort
    op.omap_set(m);
    r = rgw_rados_operate(dpp, meta_obj_ref.pool.ioctx(), meta_obj_ref.obj.oid, &op, y);
  }
  if (r < 0) {
    // a missing meta object means the upload was aborted concurrently
    return r == -ENOENT ? -ERR_NO_SUCH_UPLOAD : r;
  }

  if (!obj_op.meta.canceled) {
    // on success, clear the set of objects for deletion
    writer.clear_written();
  }
  if (pcanceled) {
    *pcanceled = obj_op.meta.canceled;
  }
  return 0;
}
+
+// The first chunk of an append is written to the head stripe with an
+// exclusive create; everything after it flows through the stripe processor.
+int AppendObjectProcessor::process_first_chunk(bufferlist &&data, rgw::sal::DataProcessor **processor)
+{
+  if (const int r = writer.write_exclusive(data); r < 0) {
+    return r;
+  }
+  *processor = &stripe;
+  return 0;
+}
+
+// Prepare an append: load the head object's current state, validate the
+// requested append position, derive the next append part number and the
+// manifest prefix, then initialize the manifest generator and the
+// chunk/stripe processors for the tail writes.
+int AppendObjectProcessor::prepare(optional_yield y)
+{
+ RGWObjState *astate;
+ int r = store->get_obj_state(dpp, &obj_ctx, bucket_info, head_obj,
+ &astate, &cur_manifest, y);
+ if (r < 0) {
+ return r;
+ }
+ cur_size = astate->size;
+ *cur_accounted_size = astate->accounted_size;
+ if (!astate->exists) {
+ // first append to a brand-new object: the position must be 0
+ if (position != 0) {
+ ldpp_dout(dpp, 5) << "ERROR: Append position should be zero" << dendl;
+ return -ERR_POSITION_NOT_EQUAL_TO_LENGTH;
+ } else {
+ cur_part_num = 1;
+ //set the prefix
+ char buf[33];
+ gen_rand_alphanumeric(store->ctx(), buf, sizeof(buf) - 1);
+ string oid_prefix = head_obj.key.name;
+ oid_prefix.append(".");
+ oid_prefix.append(buf);
+ oid_prefix.append("_");
+ manifest.set_prefix(oid_prefix);
+ }
+ } else {
+ // check whether the object appendable
+ map<string, bufferlist>::iterator iter = astate->attrset.find(RGW_ATTR_APPEND_PART_NUM);
+ if (iter == astate->attrset.end()) {
+ ldpp_dout(dpp, 5) << "ERROR: The object is not appendable" << dendl;
+ return -ERR_OBJECT_NOT_APPENDABLE;
+ }
+ // an append must start exactly at the current accounted size
+ if (position != *cur_accounted_size) {
+ ldpp_dout(dpp, 5) << "ERROR: Append position should be equal to the obj size" << dendl;
+ return -ERR_POSITION_NOT_EQUAL_TO_LENGTH;
+ }
+ try {
+ using ceph::decode;
+ decode(cur_part_num, iter->second);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 5) << "ERROR: failed to decode part num" << dendl;
+ return -EIO;
+ }
+ cur_part_num++;
+ //get the current obj etag
+ iter = astate->attrset.find(RGW_ATTR_ETAG);
+ if (iter != astate->attrset.end()) {
+ // keep only the md5 portion, dropping any "-<parts>" suffix
+ string s = rgw_string_unquote(iter->second.c_str());
+ size_t pos = s.find("-");
+ cur_etag = s.substr(0, pos);
+ }
+
+ // tail stripes inherit the storage class of the existing object
+ iter = astate->attrset.find(RGW_ATTR_STORAGE_CLASS);
+ if (iter != astate->attrset.end()) {
+ tail_placement_rule.storage_class = iter->second.to_str();
+ } else {
+ tail_placement_rule.storage_class = RGW_STORAGE_CLASS_STANDARD;
+ }
+ // reuse the existing object's tail prefix and keep its tail alive
+ manifest.set_prefix(cur_manifest->get_prefix());
+ astate->keep_tail = true;
+ }
+ manifest.set_multipart_part_rule(store->ctx()->_conf->rgw_obj_stripe_size, cur_part_num);
+
+ r = manifest_gen.create_begin(store->ctx(), &manifest, bucket_info.placement_rule, &tail_placement_rule, head_obj.bucket, head_obj);
+ if (r < 0) {
+ return r;
+ }
+ rgw_raw_obj stripe_obj = manifest_gen.get_cur_obj(store);
+
+ uint64_t chunk_size = 0;
+ r = store->get_max_chunk_size(stripe_obj.pool, &chunk_size, dpp);
+ if (r < 0) {
+ return r;
+ }
+ r = writer.set_stripe_obj(std::move(stripe_obj));
+ if (r < 0) {
+ return r;
+ }
+
+ uint64_t stripe_size = manifest_gen.cur_stripe_max_size();
+
+ // the head holds at most one chunk and never more than one stripe
+ uint64_t max_head_size = std::min(chunk_size, stripe_size);
+ set_head_chunk_size(max_head_size);
+
+ // initialize the processors
+ chunk = ChunkProcessor(&writer, chunk_size);
+ stripe = StripeProcessor(&chunk, this, stripe_size);
+
+ return 0;
+}
+
+// Finish an append: drain outstanding tail writes, extend (or create) the
+// head object's manifest, stamp the append part number and a combined etag,
+// and write the updated head metadata. Sizes accumulate across appends.
+int AppendObjectProcessor::complete(size_t accounted_size, const string &etag, ceph::real_time *mtime,
+ ceph::real_time set_mtime, rgw::sal::Attrs& attrs,
+ ceph::real_time delete_at, const char *if_match, const char *if_nomatch,
+ const string *user_data, rgw_zone_set *zones_trace, bool *pcanceled,
+ optional_yield y)
+{
+ int r = writer.drain();
+ if (r < 0)
+ return r;
+ const uint64_t actual_size = get_actual_size();
+ r = manifest_gen.create_next(actual_size);
+ if (r < 0) {
+ return r;
+ }
+ // mark the head object atomic for the metadata write below
+ obj_ctx.set_atomic(head_obj);
+ RGWRados::Object op_target(store, bucket_info, obj_ctx, head_obj);
+ //For Append obj, disable versioning
+ op_target.set_versioning_disabled(true);
+ RGWRados::Object::Write obj_op(&op_target);
+ // splice the newly written stripes onto the existing manifest, if any
+ if (cur_manifest) {
+ cur_manifest->append(dpp, manifest, store->svc.zone->get_zonegroup(), store->svc.zone->get_zone_params());
+ obj_op.meta.manifest = cur_manifest;
+ } else {
+ obj_op.meta.manifest = &manifest;
+ }
+ obj_op.meta.ptag = &unique_tag; /* use req_id as operation tag */
+ obj_op.meta.mtime = mtime;
+ obj_op.meta.set_mtime = set_mtime;
+ obj_op.meta.owner = owner;
+ obj_op.meta.flags = PUT_OBJ_CREATE;
+ obj_op.meta.delete_at = delete_at;
+ obj_op.meta.user_data = user_data;
+ obj_op.meta.zones_trace = zones_trace;
+ obj_op.meta.modify_tail = true;
+ obj_op.meta.appendable = true;
+ //Add the append part number
+ bufferlist cur_part_num_bl;
+ using ceph::encode;
+ encode(cur_part_num, cur_part_num_bl);
+ attrs[RGW_ATTR_APPEND_PART_NUM] = cur_part_num_bl;
+ //calculate the etag
+ // combined etag = MD5(old md5 || new md5) with a "-<part num>" suffix
+ if (!cur_etag.empty()) {
+ MD5 hash;
+ // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
+ hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+ char petag[CEPH_CRYPTO_MD5_DIGESTSIZE];
+ char final_etag[CEPH_CRYPTO_MD5_DIGESTSIZE];
+ char final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16];
+ hex_to_buf(cur_etag.c_str(), petag, CEPH_CRYPTO_MD5_DIGESTSIZE);
+ hash.Update((const unsigned char *)petag, sizeof(petag));
+ hex_to_buf(etag.c_str(), petag, CEPH_CRYPTO_MD5_DIGESTSIZE);
+ hash.Update((const unsigned char *)petag, sizeof(petag));
+ hash.Final((unsigned char *)final_etag);
+ buf_to_hex((unsigned char *)final_etag, sizeof(final_etag), final_etag_str);
+ snprintf(&final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2], sizeof(final_etag_str) - CEPH_CRYPTO_MD5_DIGESTSIZE * 2,
+ "-%lld", (long long)cur_part_num);
+ bufferlist etag_bl;
+ etag_bl.append(final_etag_str, strlen(final_etag_str) + 1);
+ attrs[RGW_ATTR_ETAG] = etag_bl;
+ }
+ // head sizes are cumulative: new data plus what was already there
+ r = obj_op.write_meta(dpp, actual_size + cur_size,
+ accounted_size + *cur_accounted_size,
+ attrs, y);
+ if (r < 0) {
+ return r;
+ }
+ if (!obj_op.meta.canceled) {
+ // on success, clear the set of objects for deletion
+ writer.clear_written();
+ }
+ if (pcanceled) {
+ *pcanceled = obj_op.meta.canceled;
+ }
+ *cur_accounted_size += accounted_size;
+
+ return 0;
+}
+
+} // namespace rgw::putobj
diff --git a/src/rgw/driver/rados/rgw_putobj_processor.h b/src/rgw/driver/rados/rgw_putobj_processor.h
new file mode 100644
index 000000000..fa9200f32
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_putobj_processor.h
@@ -0,0 +1,282 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <optional>
+#include <utility>
+
+#include "rgw_putobj.h"
+#include "services/svc_rados.h"
+#include "services/svc_tier_rados.h"
+#include "rgw_sal.h"
+#include "rgw_obj_manifest.h"
+
+namespace rgw {
+
+namespace sal {
+ class RadosStore;
+}
+
+class Aio;
+
+namespace putobj {
+
+// an object processor with special handling for the first chunk of the head.
+// the virtual process_first_chunk() function returns a processor to handle the
+// rest of the object
+class HeadObjectProcessor : public rgw::sal::ObjectProcessor {
+ uint64_t head_chunk_size;
+ // buffer to capture the first chunk of the head object
+ bufferlist head_data;
+ // initialized after process_first_chunk() to process everything else
+ rgw::sal::DataProcessor *processor = nullptr;
+ uint64_t data_offset = 0; // maximum offset of data written (ie compressed)
+ protected:
+ uint64_t get_actual_size() const { return data_offset; }
+
+ // process the first chunk of data and return a processor for the rest
+ virtual int process_first_chunk(bufferlist&& data,
+ rgw::sal::DataProcessor **processor) = 0;
+ public:
+ // NOTE(review): single-argument constructor is implicit; consider 'explicit'
+ HeadObjectProcessor(uint64_t head_chunk_size)
+ : head_chunk_size(head_chunk_size)
+ {}
+
+ // adjust how much data is buffered for the head (called from prepare()
+ // once the real head size is known)
+ void set_head_chunk_size(uint64_t size) { head_chunk_size = size; }
+
+ // cache first chunk for process_first_chunk(), then forward everything else
+ // to the returned processor
+ int process(bufferlist&& data, uint64_t logical_offset) final override;
+};
+
+using RawObjSet = std::set<rgw_raw_obj>;
+
+// a data sink that writes to rados objects and deletes them on cancelation
+class RadosWriter : public rgw::sal::DataProcessor {
+ Aio *const aio;
+ RGWRados *const store;
+ const RGWBucketInfo& bucket_info;
+ RGWObjectCtx& obj_ctx;
+ const rgw_obj head_obj;
+ RGWSI_RADOS::Obj stripe_obj; // current stripe object
+ RawObjSet written; // set of written objects for deletion
+ const DoutPrefixProvider *dpp;
+ optional_yield y;
+
+ public:
+ RadosWriter(Aio *aio, RGWRados *store,
+ const RGWBucketInfo& bucket_info,
+ RGWObjectCtx& obj_ctx, const rgw_obj& _head_obj,
+ const DoutPrefixProvider *dpp, optional_yield y)
+ : aio(aio), store(store), bucket_info(bucket_info),
+ obj_ctx(obj_ctx), head_obj(_head_obj), dpp(dpp), y(y)
+ {}
+ // NOTE(review): destructor defined out of line; presumably it removes the
+ // objects left in 'written'. The class declares a destructor but no
+ // copy/move operations (rule of five) -- copying would duplicate
+ // 'written' and double-delete; confirm instances are never copied.
+ ~RadosWriter();
+
+ // add alloc hint to osd
+ void add_write_hint(librados::ObjectWriteOperation& op);
+
+ // change the current stripe object
+ int set_stripe_obj(const rgw_raw_obj& obj);
+
+ // write the data at the given offset of the current stripe object
+ int process(bufferlist&& data, uint64_t stripe_offset) override;
+
+ // write the data as an exclusive create and wait for it to complete
+ int write_exclusive(const bufferlist& data);
+
+ // wait for pending writes to complete
+ int drain();
+
+ // when the operation completes successfully, clear the set of written objects
+ // so they aren't deleted on destruction
+ void clear_written() { written.clear(); }
+
+};
+
+
+// a rados object processor that stripes according to RGWObjManifest.
+// Shared base for the atomic/multipart/append processors below: owns the
+// RadosWriter and the chunk/stripe pipeline, and implements StripeGenerator
+// to advance the manifest as stripes fill up.
+class ManifestObjectProcessor : public HeadObjectProcessor,
+                                public StripeGenerator {
+ protected:
+  RGWRados* const store;
+  RGWBucketInfo& bucket_info;
+  rgw_placement_rule tail_placement_rule;
+  rgw_user owner;
+  RGWObjectCtx& obj_ctx;
+  rgw_obj head_obj;
+
+  RadosWriter writer;
+  RGWObjManifest manifest;
+  RGWObjManifest::generator manifest_gen;
+  ChunkProcessor chunk;
+  StripeProcessor stripe;
+  const DoutPrefixProvider *dpp;
+
+  // implements StripeGenerator
+  int next(uint64_t offset, uint64_t *stripe_size) override;
+
+ public:
+  // head_chunk_size starts at 0; subclasses set it during prepare()
+  ManifestObjectProcessor(Aio *aio, RGWRados* store,
+                          RGWBucketInfo& bucket_info,
+                          const rgw_placement_rule *ptail_placement_rule,
+                          const rgw_user& owner, RGWObjectCtx& _obj_ctx,
+                          const rgw_obj& _head_obj,
+                          const DoutPrefixProvider* dpp, optional_yield y)
+    : HeadObjectProcessor(0),
+      store(store), bucket_info(bucket_info),
+      owner(owner),
+      obj_ctx(_obj_ctx), head_obj(_head_obj),
+      writer(aio, store, bucket_info, obj_ctx, head_obj, dpp, y),
+      chunk(&writer, 0), stripe(&chunk, this, 0), dpp(dpp) {
+    if (ptail_placement_rule) {
+      tail_placement_rule = *ptail_placement_rule;
+    }
+  }
+
+  void set_owner(const rgw_user& _owner) {
+    owner = _owner;
+  }
+
+  void set_tail_placement(const rgw_placement_rule& tpr) {
+    tail_placement_rule = tpr;
+  }
+  // rvalue overload: previously took 'const rgw_placement_rule&&', which
+  // cannot be moved from and silently copied; take a mutable rvalue and
+  // actually move from it
+  void set_tail_placement(rgw_placement_rule&& tpr) {
+    tail_placement_rule = std::move(tpr);
+  }
+
+};
+
+
+// a processor that completes with an atomic write to the head object as part of
+// a bucket index transaction
+class AtomicObjectProcessor : public ManifestObjectProcessor {
+ const std::optional<uint64_t> olh_epoch;
+ const std::string unique_tag;
+ bufferlist first_chunk; // written with the head in complete()
+
+ // buffers the first chunk in memory so it can be written together with
+ // the head object's metadata at complete() time
+ int process_first_chunk(bufferlist&& data, rgw::sal::DataProcessor **processor) override;
+ public:
+ AtomicObjectProcessor(Aio *aio, RGWRados* store,
+ RGWBucketInfo& bucket_info,
+ const rgw_placement_rule *ptail_placement_rule,
+ const rgw_user& owner,
+ RGWObjectCtx& obj_ctx, const rgw_obj& _head_obj,
+ std::optional<uint64_t> olh_epoch,
+ const std::string& unique_tag,
+ const DoutPrefixProvider *dpp, optional_yield y)
+ : ManifestObjectProcessor(aio, store, bucket_info, ptail_placement_rule,
+ owner, obj_ctx, _head_obj, dpp, y),
+ olh_epoch(olh_epoch), unique_tag(unique_tag)
+ {}
+
+ // prepare a trivial manifest
+ int prepare(optional_yield y) override;
+ // write the head object atomically in a bucket index transaction
+ int complete(size_t accounted_size, const std::string& etag,
+ ceph::real_time *mtime, ceph::real_time set_mtime,
+ std::map<std::string, bufferlist>& attrs,
+ ceph::real_time delete_at,
+ const char *if_match, const char *if_nomatch,
+ const std::string *user_data,
+ rgw_zone_set *zones_trace, bool *canceled,
+ optional_yield y) override;
+
+};
+
+
+// a processor for multipart parts, which don't require atomic completion. the
+// part's head is written with an exclusive create to detect racing uploads of
+// the same part/upload id, which are restarted with a random oid prefix
+class MultipartObjectProcessor : public ManifestObjectProcessor {
+ const rgw_obj target_obj; // target multipart object
+ const std::string upload_id;
+ // NOTE(review): the constructor accepts part_num as uint64_t but stores a
+ // const int -- values above INT_MAX would be truncated; confirm callers
+ // bound the part number.
+ const int part_num;
+ const std::string part_num_str;
+ RGWMPObj mp; // multipart meta-object helper keyed by object name + upload id
+
+ // write the first chunk and wait on aio->drain() for its completion.
+ // on EEXIST, retry with random prefix
+ int process_first_chunk(bufferlist&& data, rgw::sal::DataProcessor **processor) override;
+ // prepare the head stripe and manifest
+ int prepare_head();
+ public:
+ MultipartObjectProcessor(Aio *aio, RGWRados* store,
+ RGWBucketInfo& bucket_info,
+ const rgw_placement_rule *ptail_placement_rule,
+ const rgw_user& owner, RGWObjectCtx& obj_ctx,
+ const rgw_obj& _head_obj,
+ const std::string& upload_id, uint64_t part_num,
+ const std::string& part_num_str,
+ const DoutPrefixProvider *dpp, optional_yield y)
+ : ManifestObjectProcessor(aio, store, bucket_info, ptail_placement_rule,
+ owner, obj_ctx, _head_obj, dpp, y),
+ target_obj(head_obj), upload_id(upload_id),
+ part_num(part_num), part_num_str(part_num_str),
+ mp(head_obj.key.name, upload_id)
+ {}
+
+ // prepare a multipart manifest
+ int prepare(optional_yield y) override;
+ // write the head object attributes in a bucket index transaction, then
+ // register the completed part with the multipart meta object
+ int complete(size_t accounted_size, const std::string& etag,
+ ceph::real_time *mtime, ceph::real_time set_mtime,
+ std::map<std::string, bufferlist>& attrs,
+ ceph::real_time delete_at,
+ const char *if_match, const char *if_nomatch,
+ const std::string *user_data,
+ rgw_zone_set *zones_trace, bool *canceled,
+ optional_yield y) override;
+
+};
+
+ // a processor for S3 object append: validates the append position against
+ // the object's current size and extends the existing manifest with the
+ // newly written stripes
+ class AppendObjectProcessor : public ManifestObjectProcessor {
+ // part number this append will write (previous count + 1)
+ uint64_t cur_part_num;
+ // client-requested append offset; must equal the current accounted size
+ uint64_t position;
+ // size of the existing head object, captured in prepare()
+ uint64_t cur_size;
+ // in/out: accounted size of the object, updated on complete()
+ uint64_t *cur_accounted_size;
+ // md5 portion of the existing object's etag (empty for a new object)
+ std::string cur_etag;
+ const std::string unique_tag;
+
+ // manifest of the existing head object; nullptr until prepare()
+ RGWObjManifest *cur_manifest;
+
+ int process_first_chunk(bufferlist&& data, rgw::sal::DataProcessor **processor) override;
+
+ public:
+ AppendObjectProcessor(Aio *aio, RGWRados* store,
+ RGWBucketInfo& bucket_info,
+ const rgw_placement_rule *ptail_placement_rule,
+ const rgw_user& owner, RGWObjectCtx& obj_ctx,
+ const rgw_obj& _head_obj,
+ const std::string& unique_tag, uint64_t position,
+ uint64_t *cur_accounted_size,
+ const DoutPrefixProvider *dpp, optional_yield y)
+ : ManifestObjectProcessor(aio, store, bucket_info, ptail_placement_rule,
+ owner, obj_ctx, _head_obj, dpp, y),
+ position(position), cur_size(0), cur_accounted_size(cur_accounted_size),
+ unique_tag(unique_tag), cur_manifest(nullptr)
+ {}
+ int prepare(optional_yield y) override;
+ int complete(size_t accounted_size, const std::string& etag,
+ ceph::real_time *mtime, ceph::real_time set_mtime,
+ std::map<std::string, bufferlist>& attrs, ceph::real_time delete_at,
+ const char *if_match, const char *if_nomatch, const std::string *user_data,
+ rgw_zone_set *zones_trace, bool *canceled,
+ optional_yield y) override;
+ };
+
+} // namespace putobj
+} // namespace rgw
+
diff --git a/src/rgw/driver/rados/rgw_rados.cc b/src/rgw/driver/rados/rgw_rados.cc
new file mode 100644
index 000000000..10018d4a6
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_rados.cc
@@ -0,0 +1,10076 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "include/compat.h"
+#include <errno.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sstream>
+
+#include <boost/algorithm/string.hpp>
+#include <string_view>
+
+#include <boost/container/flat_set.hpp>
+#include <boost/format.hpp>
+#include <boost/optional.hpp>
+#include <boost/utility/in_place_factory.hpp>
+
+#include "common/ceph_json.h"
+
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/Throttle.h"
+#include "common/BackTrace.h"
+
+#include "rgw_sal.h"
+#include "rgw_zone.h"
+#include "rgw_cache.h"
+#include "rgw_acl.h"
+#include "rgw_acl_s3.h" /* for dumping s3policy in debug log */
+#include "rgw_aio_throttle.h"
+#include "driver/rados/rgw_bucket.h"
+#include "rgw_rest_conn.h"
+#include "rgw_cr_rados.h"
+#include "rgw_cr_rest.h"
+#include "rgw_datalog.h"
+#include "rgw_putobj_processor.h"
+
+#include "cls/rgw/cls_rgw_ops.h"
+#include "cls/rgw/cls_rgw_client.h"
+#include "cls/rgw/cls_rgw_const.h"
+#include "cls/refcount/cls_refcount_client.h"
+#include "cls/version/cls_version_client.h"
+#include "osd/osd_types.h"
+
+#include "rgw_tools.h"
+#include "rgw_coroutine.h"
+#include "rgw_compression.h"
+#include "rgw_crypt.h"
+#include "rgw_etag_verifier.h"
+#include "rgw_worker.h"
+#include "rgw_notify.h"
+#include "rgw_http_errors.h"
+
+#undef fork // fails to compile RGWPeriod::fork() below
+
+#include "common/Clock.h"
+
+#include <string>
+#include <iostream>
+#include <vector>
+#include <atomic>
+#include <list>
+#include <map>
+#include "include/random.h"
+
+#include "rgw_gc.h"
+#include "rgw_lc.h"
+
+#include "rgw_object_expirer_core.h"
+#include "rgw_sync.h"
+#include "rgw_sync_counters.h"
+#include "rgw_sync_trace.h"
+#include "rgw_trim_datalog.h"
+#include "rgw_trim_mdlog.h"
+#include "rgw_data_sync.h"
+#include "rgw_realm_watcher.h"
+#include "rgw_reshard.h"
+#include "rgw_cr_rados.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_zone_utils.h"
+#include "services/svc_quota.h"
+#include "services/svc_sync_modules.h"
+#include "services/svc_sys_obj.h"
+#include "services/svc_sys_obj_cache.h"
+#include "services/svc_bucket.h"
+#include "services/svc_mdlog.h"
+
+#include "compressor/Compressor.h"
+
+#include "rgw_d3n_datacache.h"
+
+#ifdef WITH_LTTNG
+#define TRACEPOINT_DEFINE
+#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#include "tracing/rgw_rados.h"
+#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#undef TRACEPOINT_DEFINE
+#else
+#define tracepoint(...)
+#endif
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+using namespace librados;
+
+#define ldout_bitx(_bitx, _dpp, _level) if(_bitx) { ldpp_dout(_dpp, 0) << "BITX: "
+#define ldout_bitx_c(_bitx, _ctx, _level) if(_bitx) { ldout(_ctx, 0) << "BITX: "
+#define dendl_bitx dendl ; }
+
+static string shadow_ns = "shadow";
+static string default_bucket_index_pool_suffix = "rgw.buckets.index";
+static string default_storage_extra_pool_suffix = "rgw.buckets.non-ec";
+
+static RGWObjCategory main_category = RGWObjCategory::Main;
+#define RGW_USAGE_OBJ_PREFIX "usage."
+
+// Resolve this selection to a raw rados object, translating a logical obj
+// through its placement rule when necessary.
+rgw_raw_obj rgw_obj_select::get_raw_obj(RGWRados* store) const
+{
+  if (is_raw) {
+    // already raw: nothing to translate
+    return raw_obj;
+  }
+  rgw_raw_obj translated;
+  store->obj_to_raw(placement_rule, obj, &translated);
+  return translated;
+}
+
+// Attach version handling to a read op: optionally assert the stored
+// version matches, then read the current version into read_version.
+void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation* op)
+{
+  if (obj_version* cond = version_for_check(); cond != nullptr) {
+    cls_version_check(*op, *cond, VER_COND_EQ);
+  }
+
+  cls_version_read(*op, &read_version);
+}
+
+// Attach version handling to a write op: optionally assert the stored
+// version matches, then either stamp an explicit new version or bump the
+// existing one.
+void RGWObjVersionTracker::prepare_op_for_write(ObjectWriteOperation *op)
+{
+  if (obj_version* cond = version_for_check(); cond != nullptr) {
+    cls_version_check(*op, *cond, VER_COND_EQ);
+  }
+
+  if (obj_version* next = version_for_write(); next != nullptr) {
+    cls_version_set(*op, *next);
+  } else {
+    cls_version_inc(*op);
+  }
+}
+
+// Fold a successful write back into the tracker. A write that used
+// cls_version_inc() (no explicit write version) bumps the version we read
+// by one so the next conditional check still matches; otherwise the
+// explicit write version becomes the current one.
+void RGWObjVersionTracker::apply_write()
+{
+  if (read_version.ver != 0 && write_version.ver == 0) {
+    ++read_version.ver;
+  } else {
+    read_version = write_version;
+  }
+  write_version = obj_version();
+}
+
+// Look up (or lazily create) the cached state for 'obj'. Takes the shared
+// lock for the common read-hit case; on a miss it drops the shared lock and
+// re-acquires exclusively to insert a default entry. Returned pointers stay
+// valid because std::map never invalidates element addresses on insert.
+RGWObjStateManifest *RGWObjectCtx::get_state(const rgw_obj& obj) {
+ RGWObjStateManifest *result;
+ typename std::map<rgw_obj, RGWObjStateManifest>::iterator iter;
+ lock.lock_shared();
+ assert (!obj.empty());
+ iter = objs_state.find(obj);
+ if (iter != objs_state.end()) {
+ result = &iter->second;
+ lock.unlock_shared();
+ } else {
+ // NOTE: the lock is released between the shared find() and the exclusive
+ // insert, so another thread may insert first; operator[] makes that race
+ // benign (both callers get the same entry)
+ lock.unlock_shared();
+ lock.lock();
+ result = &objs_state[obj];
+ lock.unlock();
+ }
+ return result;
+}
+
+// Flag the cached state for 'obj' as compressed, creating the entry if
+// it does not exist yet.
+void RGWObjectCtx::set_compressed(const rgw_obj& obj) {
+  assert (!obj.empty());
+  std::unique_lock guard{lock};
+  objs_state[obj].state.compressed = true;
+}
+
+// Flag the cached state for 'obj' as atomic, creating the entry if it
+// does not exist yet.
+void RGWObjectCtx::set_atomic(const rgw_obj& obj) {
+  assert (!obj.empty());
+  std::unique_lock guard{lock};
+  objs_state[obj].state.is_atomic = true;
+}
+// Flag the cached state for 'obj' so reads prefetch data, creating the
+// entry if it does not exist yet.
+void RGWObjectCtx::set_prefetch_data(const rgw_obj& obj) {
+  assert (!obj.empty());
+  std::unique_lock guard{lock};
+  objs_state[obj].state.prefetch_data = true;
+}
+
+// Drop the cached state for 'obj'. The is_atomic and prefetch_data flags
+// are sticky: if either was set, a fresh empty entry is re-created that
+// carries all three flags (including compressed) forward.
+void RGWObjectCtx::invalidate(const rgw_obj& obj) {
+ std::unique_lock wl{lock};
+ auto iter = objs_state.find(obj);
+ if (iter == objs_state.end()) {
+ return;
+ }
+ // capture the sticky flags before erasing the entry
+ bool is_atomic = iter->second.state.is_atomic;
+ bool prefetch_data = iter->second.state.prefetch_data;
+ bool compressed = iter->second.state.compressed;
+
+ objs_state.erase(iter);
+
+ if (is_atomic || prefetch_data) {
+ auto& sm = objs_state[obj];
+ sm.state.is_atomic = is_atomic;
+ sm.state.prefetch_data = prefetch_data;
+ sm.state.compressed = compressed;
+ }
+}
+
+// Coroutine manager that POSTs mdlog "notify" requests to peer zones so
+// they pull new metadata entries.
+class RGWMetaNotifierManager : public RGWCoroutinesManager {
+ RGWRados* store;
+ RGWHTTPManager http_manager;
+
+public:
+ RGWMetaNotifierManager(RGWRados *_driver) : RGWCoroutinesManager(_driver->ctx(), _driver->get_cr_registry()), store(_driver),
+ http_manager(store->ctx(), completion_mgr) {
+ http_manager.start();
+ }
+
+ // Send the set of modified mdlog shards to every connected zone.
+ // NOTE(review): the stacks allocated with new are presumably reaped by
+ // run() -- confirm against RGWCoroutinesManager::run().
+ int notify_all(const DoutPrefixProvider *dpp, map<rgw_zone_id, RGWRESTConn *>& conn_map, set<int>& shards) {
+ rgw_http_param_pair pairs[] = { { "type", "metadata" },
+ { "notify", NULL },
+ { NULL, NULL } };
+
+ list<RGWCoroutinesStack *> stacks;
+ for (auto iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
+ RGWRESTConn *conn = iter->second;
+ RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
+ stack->call(new RGWPostRESTResourceCR<set<int>, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
+
+ stacks.push_back(stack);
+ }
+ return run(dpp, stacks);
+ }
+};
+
+// Coroutine manager that POSTs datalog "notify" requests to peer zones
+// that sync data from this zone.
+class RGWDataNotifierManager : public RGWCoroutinesManager {
+ RGWRados* store;
+ RGWHTTPManager http_manager;
+
+public:
+ RGWDataNotifierManager(RGWRados *_driver) : RGWCoroutinesManager(_driver->ctx(), _driver->get_cr_registry()), store(_driver),
+ http_manager(store->ctx(), completion_mgr) {
+ http_manager.start();
+ }
+
+ // Send the modified datalog shard entries to every peer connection,
+ // tagged with this zone's id as the notification source.
+ int notify_all(const DoutPrefixProvider *dpp, map<rgw_zone_id, RGWRESTConn *>& conn_map,
+ bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >& shards) {
+
+ list<RGWCoroutinesStack *> stacks;
+ const char *source_zone = store->svc.zone->get_zone_params().get_id().c_str();
+ for (auto iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
+ RGWRESTConn *conn = iter->second;
+ RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
+ stack->call(new RGWDataPostNotifyCR(store, http_manager, shards, source_zone, conn));
+ stacks.push_back(stack);
+ }
+
+ return run(dpp, stacks);
+ }
+};
+
+/* class RGWRadosThread */
+
+// Spawn the worker thread; it runs Worker::entry() until stop() is called.
+void RGWRadosThread::start()
+{
+ worker = new Worker(cct, this);
+ worker->create(thread_name.c_str());
+}
+
+// Stop and reap the worker thread. Safe to call when start() was never
+// called (worker is NULL).
+void RGWRadosThread::stop()
+{
+ down_flag = true;
+ // let the subclass interrupt any blocking work before we join
+ stop_process();
+ if (worker) {
+ worker->signal();
+ worker->join();
+ }
+ delete worker;
+ worker = NULL;
+}
+
+// Worker thread main loop: run process() repeatedly until shutdown,
+// sleeping between rounds for the configured interval (which may be
+// reconfigured at runtime). An interval of 0 means "block until signaled".
+void *RGWRadosThread::Worker::entry() {
+ uint64_t msec = processor->interval_msec();
+ auto interval = std::chrono::milliseconds(msec);
+
+ do {
+ auto start = ceph::real_clock::now();
+ int r = processor->process(this);
+ if (r < 0) {
+ ldpp_dout(this, 0) << "ERROR: processor->process() returned error r=" << r << dendl;
+ }
+
+ if (processor->going_down())
+ break;
+
+ // 'end' is the elapsed duration of this round, not a time point
+ auto end = ceph::real_clock::now() - start;
+
+ uint64_t cur_msec = processor->interval_msec();
+ if (cur_msec != msec) { /* was it reconfigured? */
+ msec = cur_msec;
+ interval = std::chrono::milliseconds(msec);
+ }
+
+ if (cur_msec > 0) {
+ if (interval <= end)
+ continue; // next round
+
+ // sleep only for the part of the interval process() didn't consume
+ auto wait_time = interval - end;
+ wait_interval(wait_time);
+ } else {
+ wait();
+ }
+ } while (!processor->going_down());
+
+ return NULL;
+}
+
+// Periodic thread that forwards mdlog change notifications to peer zones.
+class RGWMetaNotifier : public RGWRadosThread {
+ RGWMetaNotifierManager notify_mgr;
+ RGWMetadataLog *const log;
+
+ // poll interval between notification rounds
+ uint64_t interval_msec() override {
+ return cct->_conf->rgw_md_notify_interval_msec;
+ }
+ void stop_process() override {
+ notify_mgr.stop();
+ }
+public:
+ RGWMetaNotifier(RGWRados *_driver, RGWMetadataLog* log)
+ : RGWRadosThread(_driver, "meta-notifier"), notify_mgr(_driver), log(log) {}
+
+ int process(const DoutPrefixProvider *dpp) override;
+};
+
+// Read the set of mdlog shards modified since the last round and notify
+// all peer zones so they can pull the new metadata entries.
+int RGWMetaNotifier::process(const DoutPrefixProvider *dpp)
+{
+  set<int> shards;
+
+  log->read_clear_modified(shards);
+
+  if (shards.empty()) {
+    return 0;
+  }
+
+  for (set<int>::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
+    ldpp_dout(dpp, 20) << __func__ << "(): notifying mdlog change, shard_id=" << *iter << dendl;
+  }
+
+  // notification is best effort: the modified shards were already consumed
+  // above, so log failures instead of silently dropping the return value
+  int r = notify_mgr.notify_all(dpp, store->svc.zone->get_zone_conn_map(), shards);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): notify_all() returned r=" << r << dendl;
+  }
+
+  return 0;
+}
+
+// Periodic thread that forwards datalog change notifications to peer zones
+// that sync data from this zone.
+// (An unused private member 'bc::flat_set<rgw_data_notify_entry> entry' was
+// removed; neither the class nor its out-of-line process() referenced it.)
+class RGWDataNotifier : public RGWRadosThread {
+  RGWDataNotifierManager notify_mgr;
+
+  // poll interval between notification rounds
+  uint64_t interval_msec() override {
+    return cct->_conf.get_val<int64_t>("rgw_data_notify_interval_msec");
+  }
+  void stop_process() override {
+    notify_mgr.stop();
+  }
+public:
+  RGWDataNotifier(RGWRados *_driver) : RGWRadosThread(_driver, "data-notifier"), notify_mgr(_driver) {}
+
+  int process(const DoutPrefixProvider *dpp) override;
+};
+
+// Read the datalog shards modified since the last round and notify every
+// zone that pulls data from us.
+int RGWDataNotifier::process(const DoutPrefixProvider *dpp)
+{
+  auto data_log = store->svc.datalog_rados;
+  if (!data_log) {
+    return 0;
+  }
+
+  auto shards = data_log->read_clear_modified();
+
+  if (shards.empty()) {
+    return 0;
+  }
+
+  // (removed an unused local iterator declaration from the inner loop)
+  for (const auto& [shard_id, entries] : shards) {
+    for (const auto& entry : entries) {
+      ldpp_dout(dpp, 20) << __func__ << "(): notifying datalog change, shard_id="
+                         << shard_id << ":" << entry.gen << ":" << entry.key << dendl;
+    }
+  }
+
+  // best effort: the modified shards were already consumed above, so log
+  // failures instead of silently dropping the return value
+  int r = notify_mgr.notify_all(dpp, store->svc.zone->get_zone_data_notify_to_map(), shards);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): notify_all() returned r=" << r << dendl;
+  }
+
+  return 0;
+}
+
+// Common base for sync worker threads; concrete subclasses implement
+// init() and process().
+class RGWSyncProcessorThread : public RGWRadosThread {
+public:
+  RGWSyncProcessorThread(RGWRados *_driver, const string& thread_name = "radosgw") : RGWRadosThread(_driver, thread_name) {}
+  // NOTE: a second RGWSyncProcessorThread(RGWRados*) constructor was removed:
+  // together with the defaulted thread_name above it made every one-argument
+  // construction ambiguous, so it could never be called.
+  ~RGWSyncProcessorThread() override {}
+  int init(const DoutPrefixProvider *dpp) override = 0;
+  int process(const DoutPrefixProvider *dpp) override = 0;
+};
+
+// Runs metadata sync on a dedicated thread; process() blocks inside
+// sync.run() until the thread is stopped.
+class RGWMetaSyncProcessorThread : public RGWSyncProcessorThread
+{
+ RGWMetaSyncStatusManager sync;
+
+ uint64_t interval_msec() override {
+ return 0; /* no interval associated, it'll run once until stopped */
+ }
+ void stop_process() override {
+ sync.stop();
+ }
+public:
+ RGWMetaSyncProcessorThread(rgw::sal::RadosStore* _driver, RGWAsyncRadosProcessor *async_rados)
+ : RGWSyncProcessorThread(_driver->getRados(), "meta-sync"), sync(_driver, async_rados) {}
+
+ // wake the given mdlog shards so sync picks up their new entries
+ void wakeup_sync_shards(set<int>& shard_ids) {
+ for (set<int>::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
+ sync.wakeup(*iter);
+ }
+ }
+ RGWMetaSyncStatusManager* get_manager() { return &sync; }
+
+ int init(const DoutPrefixProvider *dpp) override {
+ int ret = sync.init(dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: sync.init() returned " << ret << dendl;
+ return ret;
+ }
+ return 0;
+ }
+
+ int process(const DoutPrefixProvider *dpp) override {
+ sync.run(dpp, null_yield);
+ return 0;
+ }
+};
+
+// Runs data sync from one source zone on a dedicated thread. Until
+// sync.init() succeeds, interval_msec() reports a 20s retry period so
+// process() keeps being re-invoked; once initialized it runs until stopped.
+class RGWDataSyncProcessorThread : public RGWSyncProcessorThread
+{
+ PerfCountersRef counters;
+ RGWDataSyncStatusManager sync;
+ bool initialized;
+
+ uint64_t interval_msec() override {
+ if (initialized) {
+ return 0; /* no interval associated, it'll run once until stopped */
+ } else {
+#define DATA_SYNC_INIT_WAIT_SEC 20
+ return DATA_SYNC_INIT_WAIT_SEC * 1000;
+ }
+ }
+ void stop_process() override {
+ sync.stop();
+ }
+public:
+ RGWDataSyncProcessorThread(rgw::sal::RadosStore* _driver, RGWAsyncRadosProcessor *async_rados,
+ const RGWZone* source_zone)
+ : RGWSyncProcessorThread(_driver->getRados(), "data-sync"),
+ counters(sync_counters::build(store->ctx(), std::string("data-sync-from-") + source_zone->name)),
+ sync(_driver, async_rados, source_zone->id, counters.get()),
+ initialized(false) {}
+
+ // wake the given datalog shards/entries so sync picks them up
+ void wakeup_sync_shards(bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >& entries) {
+ for (bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >::iterator iter = entries.begin(); iter != entries.end(); ++iter) {
+ sync.wakeup(iter->first, iter->second);
+ }
+ }
+
+ RGWDataSyncStatusManager* get_manager() { return &sync; }
+
+ int init(const DoutPrefixProvider *dpp) override {
+ // real initialization is deferred to process() and retried on a timer
+ return 0;
+ }
+
+ int process(const DoutPrefixProvider *dpp) override {
+ while (!initialized) {
+ if (going_down()) {
+ return 0;
+ }
+ int ret = sync.init(dpp);
+ if (ret >= 0) {
+ initialized = true;
+ break;
+ }
+ /* we'll be back! */
+ return 0;
+ }
+ sync.run(dpp);
+ return 0;
+ }
+};
+
+// Thread that trims the metadata, data, and bucket-index sync logs via
+// coroutine stacks. Implements DoutPrefixProvider so the trim coroutines
+// log with a stable "sync log trim:" prefix.
+class RGWSyncLogTrimThread : public RGWSyncProcessorThread, DoutPrefixProvider
+{
+ RGWCoroutinesManager crs;
+ rgw::sal::RadosStore* store;
+ rgw::BucketTrimManager *bucket_trim;
+ RGWHTTPManager http;
+ const utime_t trim_interval;
+
+ uint64_t interval_msec() override { return 0; }
+ void stop_process() override { crs.stop(); }
+public:
+ RGWSyncLogTrimThread(rgw::sal::RadosStore* store, rgw::BucketTrimManager *bucket_trim,
+ int interval)
+ : RGWSyncProcessorThread(store->getRados(), "sync-log-trim"),
+ crs(store->ctx(), store->getRados()->get_cr_registry()), store(store),
+ bucket_trim(bucket_trim),
+ http(store->ctx(), crs.get_completion_mgr()),
+ trim_interval(interval, 0)
+ {}
+
+ int init(const DoutPrefixProvider *dpp) override {
+ return http.start();
+ }
+ int process(const DoutPrefixProvider *dpp) override {
+ list<RGWCoroutinesStack*> stacks;
+ auto metatrimcr = create_meta_log_trim_cr(this, static_cast<rgw::sal::RadosStore*>(store), &http,
+ cct->_conf->rgw_md_log_max_shards,
+ trim_interval);
+ if (!metatrimcr) {
+ ldpp_dout(dpp, -1) << "Bailing out of trim thread!" << dendl;
+ return -EINVAL;
+ }
+ auto meta = new RGWCoroutinesStack(store->ctx(), &crs);
+ meta->call(metatrimcr);
+
+ stacks.push_back(meta);
+
+ // data and bucket-index trim only run when this zone exports data
+ if (store->svc()->zone->sync_module_exports_data()) {
+ auto data = new RGWCoroutinesStack(store->ctx(), &crs);
+ data->call(create_data_log_trim_cr(dpp, static_cast<rgw::sal::RadosStore*>(store), &http,
+ cct->_conf->rgw_data_log_num_shards,
+ trim_interval));
+ stacks.push_back(data);
+
+ auto bucket = new RGWCoroutinesStack(store->ctx(), &crs);
+ bucket->call(bucket_trim->create_bucket_trim_cr(&http));
+ stacks.push_back(bucket);
+ }
+
+ crs.run(dpp, stacks);
+ return 0;
+ }
+
+ // implements DoutPrefixProvider
+ CephContext *get_cct() const override { return store->ctx(); }
+ unsigned get_subsys() const override
+ {
+ return dout_subsys;
+ }
+
+ std::ostream& gen_prefix(std::ostream& out) const override
+ {
+ return out << "sync log trim: ";
+ }
+
+};
+
+void RGWRados::wakeup_meta_sync_shards(set<int>& shard_ids)
+{
+ std::lock_guard l{meta_sync_thread_lock};
+ if (meta_sync_processor_thread) {
+ meta_sync_processor_thread->wakeup_sync_shards(shard_ids);
+ }
+}
+
+void RGWRados::wakeup_data_sync_shards(const DoutPrefixProvider *dpp, const rgw_zone_id& source_zone, bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >& entries)
+{
+ ldpp_dout(dpp, 20) << __func__ << ": source_zone=" << source_zone << ", entries=" << entries << dendl;
+ for (bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >::iterator iter = entries.begin(); iter != entries.end(); ++iter) {
+ ldpp_dout(dpp, 20) << __func__ << "(): updated shard=" << iter->first << dendl;
+ bc::flat_set<rgw_data_notify_entry>& entries = iter->second;
+ for (const auto& [key, gen] : entries) {
+ ldpp_dout(dpp, 20) << __func__ << ": source_zone=" << source_zone << ", key=" << key
+ << ", gen=" << gen << dendl;
+ }
+ }
+
+ std::lock_guard l{data_sync_thread_lock};
+ auto iter = data_sync_processor_threads.find(source_zone);
+ if (iter == data_sync_processor_threads.end()) {
+ ldpp_dout(dpp, 10) << __func__ << ": couldn't find sync thread for zone " << source_zone << ", skipping async data sync processing" << dendl;
+ return;
+ }
+
+ RGWDataSyncProcessorThread *thread = iter->second;
+ ceph_assert(thread);
+ thread->wakeup_sync_shards(entries);
+}
+
+RGWMetaSyncStatusManager* RGWRados::get_meta_sync_manager()
+{
+ std::lock_guard l{meta_sync_thread_lock};
+ if (meta_sync_processor_thread) {
+ return meta_sync_processor_thread->get_manager();
+ }
+ return nullptr;
+}
+
+RGWDataSyncStatusManager* RGWRados::get_data_sync_manager(const rgw_zone_id& source_zone)
+{
+ std::lock_guard l{data_sync_thread_lock};
+ auto thread = data_sync_processor_threads.find(source_zone);
+ if (thread == data_sync_processor_threads.end()) {
+ return nullptr;
+ }
+ return thread->second->get_manager();
+}
+
// Query the pool for its required write alignment (nonzero for EC pools
// whose plugin demands aligned appends).  On success *alignment is the
// pool's alignment, or 0 when the pool imposes no alignment requirement.
int RGWRados::get_required_alignment(const DoutPrefixProvider *dpp, const rgw_pool& pool, uint64_t *alignment)
{
  IoCtx ioctx;
  // open (and create if needed) without the mostly-omap hint; bulk=true
  int r = open_pool_ctx(dpp, pool, ioctx, false, true);
  if (r < 0) {
    ldpp_dout(dpp, 0) << "ERROR: open_pool_ctx() returned " << r << dendl;
    return r;
  }

  bool req;
  r = ioctx.pool_requires_alignment2(&req);
  if (r < 0) {
    ldpp_dout(dpp, 0) << "ERROR: ioctx.pool_requires_alignment2() returned "
      << r << dendl;
    return r;
  }

  if (!req) {
    *alignment = 0;
    return 0;
  }

  uint64_t align;
  r = ioctx.pool_required_alignment2(&align);
  if (r < 0) {
    ldpp_dout(dpp, 0) << "ERROR: ioctx.pool_required_alignment2() returned "
      << r << dendl;
    return r;
  }
  if (align != 0) {
    ldpp_dout(dpp, 20) << "required alignment=" << align << dendl;
  }
  *alignment = align;
  return 0;
}
+
+void RGWRados::get_max_aligned_size(uint64_t size, uint64_t alignment, uint64_t *max_size)
+{
+ if (alignment == 0) {
+ *max_size = size;
+ return;
+ }
+
+ if (size <= alignment) {
+ *max_size = alignment;
+ return;
+ }
+
+ *max_size = size - (size % alignment);
+}
+
+int RGWRados::get_max_chunk_size(const rgw_pool& pool, uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment)
+{
+ uint64_t alignment;
+ int r = get_required_alignment(dpp, pool, &alignment);
+ if (r < 0) {
+ return r;
+ }
+
+ if (palignment) {
+ *palignment = alignment;
+ }
+
+ uint64_t config_chunk_size = cct->_conf->rgw_max_chunk_size;
+
+ get_max_aligned_size(config_chunk_size, alignment, max_chunk_size);
+
+ ldpp_dout(dpp, 20) << "max_chunk_size=" << *max_chunk_size << dendl;
+
+ return 0;
+}
+
+int RGWRados::get_max_chunk_size(const rgw_placement_rule& placement_rule, const rgw_obj& obj,
+ uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment)
+{
+ rgw_pool pool;
+ if (!get_obj_data_pool(placement_rule, obj, &pool)) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to get data pool for object " << obj << dendl;
+ return -EIO;
+ }
+ return get_max_chunk_size(pool, max_chunk_size, dpp, palignment);
+}
+
// Record a change for this bucket shard in the data changes log so peer
// zones can notice and sync it.  Buckets with no log layouts are skipped.
// A datalog write failure is logged but deliberately not propagated:
// replication may fall behind, but the client operation must still succeed.
void add_datalog_entry(const DoutPrefixProvider* dpp,
                       RGWDataChangesLog* datalog,
                       const RGWBucketInfo& bucket_info,
                       uint32_t shard_id, optional_yield y)
{
  const auto& logs = bucket_info.layout.logs;
  if (logs.empty()) {
    return;
  }
  // always log against the latest (current) log generation
  int r = datalog->add_entry(dpp, bucket_info, logs.back(), shard_id, y);
  if (r < 0) {
    ldpp_dout(dpp, -1) << "ERROR: failed writing data log" << dendl;
  } // datalog error is not fatal
}
+
+class RGWIndexCompletionManager;
+
// State carried by one asynchronous bucket-index completion.  Created by
// RGWIndexCompletionManager::create_completion(), handed to librados as
// the aio callback argument, and either deleted by the callback or
// re-queued for retry when the bucket is resharding.
struct complete_op_data {
  ceph::mutex lock = ceph::make_mutex("complete_op_data");
  AioCompletion *rados_completion{nullptr};
  // which of the manager's completion shards tracks this entry
  int manager_shard_id{-1};
  RGWIndexCompletionManager *manager{nullptr};
  rgw_obj obj;
  RGWModifyOp op;
  string tag;
  rgw_bucket_entry_ver ver;
  cls_rgw_obj_key key;
  rgw_bucket_dir_entry_meta dir_meta;
  list<cls_rgw_obj_key> remove_objs;
  bool log_op;
  uint16_t bilog_op;
  rgw_zone_set zones_trace;

  // set during manager shutdown; tells the aio callback to just free us
  bool stopped{false};

  void stop() {
    std::lock_guard l{lock};
    stopped = true;
  }
};
+
// Tracks in-flight asynchronous bucket-index completions, sharded across
// `num_shards` lock-protected sets, and runs a retry thread (process())
// that re-executes completions which failed with -ERR_BUSY_RESHARDING.
class RGWIndexCompletionManager {
  RGWRados* const store;
  const uint32_t num_shards;
  // one mutex per completion shard; guards the matching completions[i] set
  ceph::containers::tiny_vector<ceph::mutex> locks;
  std::vector<set<complete_op_data*>> completions;
  // completions waiting to be retried by the retry thread
  std::vector<complete_op_data*> retry_completions;

  std::condition_variable cond;
  std::mutex retry_completions_lock;
  bool _stop{false};
  std::thread retry_thread;

  // used to distribute the completions and the locks they use across
  // their respective vectors; it will get incremented and can wrap
  // around back to 0 without issue
  std::atomic<uint32_t> cur_shard {0};

  void process();

  void add_completion(complete_op_data *completion);

  // Shut down: join the retry thread, then mark every tracked completion
  // stopped so its aio callback frees it instead of touching the manager.
  void stop() {
    if (retry_thread.joinable()) {
      _stop = true;
      cond.notify_all();
      retry_thread.join();
    }

    for (uint32_t i = 0; i < num_shards; ++i) {
      std::lock_guard l{locks[i]};
      for (auto c : completions[i]) {
        c->stop();
      }
    }
    completions.clear();
  }

  uint32_t next_shard() {
    return cur_shard++ % num_shards;
  }

public:
  RGWIndexCompletionManager(RGWRados *_driver) :
    store(_driver),
    num_shards(store->ctx()->_conf->rgw_thread_pool_size),
    locks{ceph::make_lock_container<ceph::mutex>(
      num_shards,
      [](const size_t i) {
        return ceph::make_mutex("RGWIndexCompletionManager::lock::" +
                                std::to_string(i));
      })},
    completions(num_shards),
    retry_thread(&RGWIndexCompletionManager::process, this)
  {}

  ~RGWIndexCompletionManager() {
    stop();
  }

  // Allocate and register a completion; *result is owned by the aio
  // machinery from here on (freed by obj_complete_cb / the retry thread).
  void create_completion(const rgw_obj& obj,
                         RGWModifyOp op, string& tag,
                         rgw_bucket_entry_ver& ver,
                         const cls_rgw_obj_key& key,
                         rgw_bucket_dir_entry_meta& dir_meta,
                         list<cls_rgw_obj_key> *remove_objs, bool log_op,
                         uint16_t bilog_op,
                         rgw_zone_set *zones_trace,
                         complete_op_data **result);

  // Returns true when the caller should delete `arg` (done or untracked);
  // false when the completion was re-queued for retry.
  bool handle_completion(completion_t cb, complete_op_data *arg);

  CephContext* ctx() {
    return store->ctx();
  }
};
+
// librados aio callback for bucket-index completions.  If the manager is
// shutting down (stopped flag set) the completion just frees itself;
// otherwise the manager decides whether it is finished (delete) or has
// been re-queued for retry (keep alive).  Note the deliberate manual
// unlock-before-delete ordering: a lock_guard would unlock a destroyed
// mutex on the delete paths.
static void obj_complete_cb(completion_t cb, void *arg)
{
  complete_op_data *completion = reinterpret_cast<complete_op_data*>(arg);
  completion->lock.lock();
  if (completion->stopped) {
    completion->lock.unlock(); /* can drop lock, no one else is referencing us */
    delete completion;
    return;
  }
  bool need_delete = completion->manager->handle_completion(cb, completion);
  completion->lock.unlock();
  if (need_delete) {
    delete completion;
  }
}
+
+void RGWIndexCompletionManager::process()
+{
+ DoutPrefix dpp(store->ctx(), dout_subsys, "rgw index completion thread: ");
+ while(!_stop) {
+ std::vector<complete_op_data*> comps;
+
+ {
+ std::unique_lock l{retry_completions_lock};
+ cond.wait(l, [this](){return _stop || !retry_completions.empty();});
+ if (_stop) {
+ return;
+ }
+ retry_completions.swap(comps);
+ }
+
+ for (auto c : comps) {
+ std::unique_ptr<complete_op_data> up{c};
+
+ ldpp_dout(&dpp, 20) << __func__ << "(): handling completion for key=" << c->key << dendl;
+
+ RGWRados::BucketShard bs(store);
+ RGWBucketInfo bucket_info;
+
+ int r = bs.init(c->obj.bucket, c->obj, &bucket_info, &dpp);
+ if (r < 0) {
+ ldpp_dout(&dpp, 0) << "ERROR: " << __func__ << "(): failed to initialize BucketShard, obj=" << c->obj << " r=" << r << dendl;
+ /* not much to do */
+ continue;
+ }
+
+ r = store->guard_reshard(&dpp, &bs, c->obj, bucket_info,
+ [&](RGWRados::BucketShard *bs) -> int {
+ const bool bitx = ctx()->_conf->rgw_bucket_index_transaction_instrumentation;
+ ldout_bitx(bitx, &dpp, 10) <<
+ "ENTERING " << __func__ << ": bucket-shard=" << bs <<
+ " obj=" << c->obj << " tag=" << c->tag <<
+ " op=" << c->op << ", remove_objs=" << c->remove_objs << dendl_bitx;
+ ldout_bitx(bitx, &dpp, 25) <<
+ "BACKTRACE: " << __func__ << ": " << ClibBackTrace(1) << dendl_bitx;
+
+ librados::ObjectWriteOperation o;
+ o.assert_exists();
+ cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
+ cls_rgw_bucket_complete_op(o, c->op, c->tag, c->ver, c->key, c->dir_meta, &c->remove_objs,
+ c->log_op, c->bilog_op, &c->zones_trace);
+ int ret = bs->bucket_obj.operate(&dpp, &o, null_yield);
+ ldout_bitx(bitx, &dpp, 10) <<
+ "EXITING " << __func__ << ": ret=" << dendl_bitx;
+ return ret;
+ });
+ if (r < 0) {
+ ldpp_dout(&dpp, 0) << "ERROR: " << __func__ << "(): bucket index completion failed, obj=" << c->obj << " r=" << r << dendl;
+ /* ignoring error, can't do anything about it */
+ continue;
+ }
+
+ // This null_yield can stay, for now, since we're in our own thread
+ add_datalog_entry(&dpp, store->svc.datalog_rados, bucket_info,
+ bs.shard_id, null_yield);
+ }
+ }
+}
+
+void RGWIndexCompletionManager::create_completion(const rgw_obj& obj,
+ RGWModifyOp op, string& tag,
+ rgw_bucket_entry_ver& ver,
+ const cls_rgw_obj_key& key,
+ rgw_bucket_dir_entry_meta& dir_meta,
+ list<cls_rgw_obj_key> *remove_objs, bool log_op,
+ uint16_t bilog_op,
+ rgw_zone_set *zones_trace,
+ complete_op_data **result)
+{
+ complete_op_data *entry = new complete_op_data;
+
+ int shard_id = next_shard();
+
+ entry->manager_shard_id = shard_id;
+ entry->manager = this;
+ entry->obj = obj;
+ entry->op = op;
+ entry->tag = tag;
+ entry->ver = ver;
+ entry->key = key;
+ entry->dir_meta = dir_meta;
+ entry->log_op = log_op;
+ entry->bilog_op = bilog_op;
+
+ if (remove_objs) {
+ for (auto iter = remove_objs->begin(); iter != remove_objs->end(); ++iter) {
+ entry->remove_objs.push_back(*iter);
+ }
+ }
+
+ if (zones_trace) {
+ entry->zones_trace = *zones_trace;
+ } else {
+ entry->zones_trace.insert(store->svc.zone->get_zone().id, obj.bucket.get_key());
+ }
+
+ *result = entry;
+
+ entry->rados_completion = librados::Rados::aio_create_completion(entry, obj_complete_cb);
+
+ std::lock_guard l{locks[shard_id]};
+ const auto ok = completions[shard_id].insert(entry).second;
+ ceph_assert(ok);
+}
+
+void RGWIndexCompletionManager::add_completion(complete_op_data *completion) {
+ {
+ std::lock_guard l{retry_completions_lock};
+ retry_completions.push_back(completion);
+ }
+ cond.notify_all();
+}
+
// Called from the aio callback once the bucket-index op finishes.  Removes
// the entry from its tracking shard, then either signals the caller to
// delete it (done, failed, or untracked) by returning true, or re-queues
// it for the retry thread (bucket busy resharding) and returns false --
// in that case the retry thread takes ownership.
bool RGWIndexCompletionManager::handle_completion(completion_t cb, complete_op_data *arg)
{
  int shard_id = arg->manager_shard_id;
  {
    std::lock_guard l{locks[shard_id]};

    auto& comps = completions[shard_id];

    auto iter = comps.find(arg);
    if (iter == comps.end()) {
      // not tracked (e.g. raced with stop()); let the caller free it
      ldout(arg->manager->ctx(), 0) << __func__ << "(): cannot find completion for obj=" << arg->key << dendl;
      return true;
    }

    comps.erase(iter);
  }

  int r = rados_aio_get_return_value(cb);
  if (r != -ERR_BUSY_RESHARDING) {
    ldout(arg->manager->ctx(), 20) << __func__ << "(): completion " <<
      (r == 0 ? "ok" : "failed with " + to_string(r)) <<
      " for obj=" << arg->key << dendl;
    return true;
  }
  add_completion(arg);
  ldout(arg->manager->ctx(), 20) << __func__ << "(): async completion added for obj=" << arg->key << dendl;
  return false;
}
+
// Tear down everything init_complete() started.  Ordering matters:
// drain async requests first, stop the sync/trim threads before deleting
// them, stop notifiers before services shut down, and only then release
// caches, the coroutine registry and the services themselves.
void RGWRados::finalize()
{
  /* Before joining any sync threads, drain outstanding requests &
   * mark the async_processor as going_down() */
  if (svc.rados) {
    svc.rados->stop_processor();
  }

  if (run_sync_thread) {
    std::lock_guard l{meta_sync_thread_lock};
    meta_sync_processor_thread->stop();

    std::lock_guard dl{data_sync_thread_lock};
    for (auto iter : data_sync_processor_threads) {
      RGWDataSyncProcessorThread *thread = iter.second;
      thread->stop();
    }
    if (sync_log_trimmer) {
      sync_log_trimmer->stop();
    }
  }
  // second pass: all threads are stopped, now it's safe to delete them
  if (run_sync_thread) {
    delete meta_sync_processor_thread;
    meta_sync_processor_thread = NULL;
    std::lock_guard dl{data_sync_thread_lock};
    for (auto iter : data_sync_processor_threads) {
      RGWDataSyncProcessorThread *thread = iter.second;
      delete thread;
    }
    data_sync_processor_threads.clear();
    delete sync_log_trimmer;
    sync_log_trimmer = nullptr;
    bucket_trim = boost::none;
  }
  if (meta_notifier) {
    meta_notifier->stop();
    delete meta_notifier;
  }
  if (data_notifier) {
    data_notifier->stop();
    delete data_notifier;
  }
  delete sync_tracer;

  delete lc;
  lc = NULL;

  delete gc;
  gc = NULL;

  delete obj_expirer;
  obj_expirer = NULL;

  RGWQuotaHandler::free_handler(quota_handler);
  if (cr_registry) {
    // ref-counted: put() rather than delete
    cr_registry->put();
  }

  svc.shutdown();

  delete binfo_cache;
  delete obj_tombstone_cache;
  if (d3n_data_cache)
    delete d3n_data_cache;

  if (reshard_wait.get()) {
    reshard_wait->stop();
    reshard_wait.reset();
  }

  if (run_reshard_thread) {
    reshard->stop_processor();
  }
  delete reshard;
  delete index_completion_manager;

  rgw::notify::shutdown();
}
+
+/**
+ * Initialize the RADOS instance and prepare to do other ops
+ * Returns 0 on success, -ERR# on failure.
+ */
+int RGWRados::init_rados()
+{
+ int ret = 0;
+
+ ret = rados.init_with_context(cct);
+ if (ret < 0) {
+ return ret;
+ }
+ ret = rados.connect();
+ if (ret < 0) {
+ return ret;
+ }
+
+ auto crs = std::unique_ptr<RGWCoroutinesManagerRegistry>{
+ new RGWCoroutinesManagerRegistry(cct)};
+ ret = crs->hook_to_admin_command("cr dump");
+ if (ret < 0) {
+ return ret;
+ }
+
+ cr_registry = crs.release();
+
+ if (use_datacache) {
+ d3n_data_cache = new D3nDataCache();
+ d3n_data_cache->init(cct);
+ }
+
+ return ret;
+}
+
+int RGWRados::register_to_service_map(const DoutPrefixProvider *dpp, const string& daemon_type, const map<string, string>& meta)
+{
+ string name = cct->_conf->name.get_id();
+ if (name.compare(0, 4, "rgw.") == 0) {
+ name = name.substr(4);
+ }
+ map<string,string> metadata = meta;
+ metadata["num_handles"] = "1"s;
+ metadata["zonegroup_id"] = svc.zone->get_zonegroup().get_id();
+ metadata["zonegroup_name"] = svc.zone->get_zonegroup().get_name();
+ metadata["zone_name"] = svc.zone->zone_name();
+ metadata["zone_id"] = svc.zone->zone_id().id;
+ metadata["realm_name"] = svc.zone->get_realm().get_name();
+ metadata["realm_id"] = svc.zone->get_realm().get_id();
+ metadata["id"] = name;
+ int ret = rados.service_daemon_register(
+ daemon_type,
+ stringify(rados.get_instance_id()),
+ metadata);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: service_daemon_register() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ return 0;
+}
+
+int RGWRados::update_service_map(const DoutPrefixProvider *dpp, std::map<std::string, std::string>&& status)
+{
+ int ret = rados.service_daemon_update_status(move(status));
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: service_daemon_update_status() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ return 0;
+}
+
+/**
+ * Initialize the RADOS instance and prepare to do other ops
+ * Returns 0 on success, -ERR# on failure.
+ */
/**
 * Finish RGWRados initialization once services are up: open the control
 * pools, start GC / lifecycle / object-expirer / sync / trim / reshard
 * machinery as configured, and build the caches and managers the request
 * path relies on.  Returns 0 on success, -ERR# on failure.
 */
int RGWRados::init_complete(const DoutPrefixProvider *dpp)
{
  int ret;

  /*
   * create sync module instance even if we don't run sync thread, might need it for radosgw-admin
   */
  sync_module = svc.sync_modules->get_sync_module();

  // open (creating if necessary) every internal control pool
  ret = open_root_pool_ctx(dpp);
  if (ret < 0)
    return ret;

  ret = open_gc_pool_ctx(dpp);
  if (ret < 0)
    return ret;

  ret = open_lc_pool_ctx(dpp);
  if (ret < 0)
    return ret;

  ret = open_objexp_pool_ctx(dpp);
  if (ret < 0)
    return ret;

  ret = open_reshard_pool_ctx(dpp);
  if (ret < 0)
    return ret;

  ret = open_notif_pool_ctx(dpp);
  if (ret < 0)
    return ret;

  pools_initialized = true;

  if (use_gc) {
    gc = new RGWGC();
    gc->initialize(cct, this);
  } else {
    ldpp_dout(dpp, 5) << "note: GC not initialized" << dendl;
  }

  obj_expirer = new RGWObjectExpirer(this->driver);

  if (use_gc_thread && use_gc) {
    gc->start_processor();
    obj_expirer->start_processor();
  }

  auto& current_period = svc.zone->get_current_period();
  auto& zonegroup = svc.zone->get_zonegroup();
  auto& zone_params = svc.zone->get_zone_params();
  auto& zone = svc.zone->get_zone();

  /* no point of running sync thread if we don't have a master zone configured
     or there is no rest_master_conn */
  if (!svc.zone->need_to_sync()) {
    run_sync_thread = false;
  }

  // the meta master notifies peers about mdlog updates
  if (svc.zone->is_meta_master()) {
    auto md_log = svc.mdlog->get_log(current_period.get_id());
    meta_notifier = new RGWMetaNotifier(this, md_log);
    meta_notifier->start();
  }

  /* init it anyway, might run sync through radosgw-admin explicitly */
  sync_tracer = new RGWSyncTraceManager(cct, cct->_conf->rgw_sync_trace_history_size);
  sync_tracer->init(this);
  ret = sync_tracer->hook_to_admin_command();
  if (ret < 0) {
    return ret;
  }

  if (run_sync_thread) {
    // warn about placement targets defined in the zonegroup but missing
    // from this zone's placement pools
    for (const auto &pt: zonegroup.placement_targets) {
      if (zone_params.placement_pools.find(pt.second.name)
          == zone_params.placement_pools.end()){
        ldpp_dout(dpp, 0) << "WARNING: This zone does not contain the placement target "
                          << pt.second.name << " present in zonegroup" << dendl;
      }
    }
    auto async_processor = svc.rados->get_async_processor();
    std::lock_guard l{meta_sync_thread_lock};
    meta_sync_processor_thread = new RGWMetaSyncProcessorThread(this->driver, async_processor);
    ret = meta_sync_processor_thread->init(dpp);
    if (ret < 0) {
      ldpp_dout(dpp, 0) << "ERROR: failed to initialize meta sync thread" << dendl;
      return ret;
    }
    meta_sync_processor_thread->start();

    // configure the bucket trim manager
    rgw::BucketTrimConfig config;
    rgw::configure_bucket_trim(cct, config);

    bucket_trim.emplace(this->driver, config);
    ret = bucket_trim->init();
    if (ret < 0) {
      ldpp_dout(dpp, 0) << "ERROR: failed to start bucket trim manager" << dendl;
      return ret;
    }
    svc.datalog_rados->set_observer(&*bucket_trim);

    // one data sync thread per sync-source zone
    std::lock_guard dl{data_sync_thread_lock};
    for (auto source_zone : svc.zone->get_data_sync_source_zones()) {
      ldpp_dout(dpp, 5) << "starting data sync thread for zone " << source_zone->name << dendl;
      auto *thread = new RGWDataSyncProcessorThread(this->driver, svc.rados->get_async_processor(), source_zone);
      ret = thread->init(dpp);
      if (ret < 0) {
        ldpp_dout(dpp, 0) << "ERROR: failed to initialize data sync thread" << dendl;
        return ret;
      }
      thread->start();
      data_sync_processor_threads[rgw_zone_id(source_zone->id)] = thread;
    }
    // sync-log trimming is optional (interval <= 0 disables it)
    auto interval = cct->_conf->rgw_sync_log_trim_interval;
    if (interval > 0) {
      sync_log_trimmer = new RGWSyncLogTrimThread(this->driver, &*bucket_trim, interval);
      ret = sync_log_trimmer->init(dpp);
      if (ret < 0) {
        ldpp_dout(dpp, 0) << "ERROR: failed to initialize sync log trim thread" << dendl;
        return ret;
      }
      sync_log_trimmer->start();
    }
  }
  if (cct->_conf->rgw_data_notify_interval_msec) {
    data_notifier = new RGWDataNotifier(this);
    data_notifier->start();
  }

  binfo_cache = new RGWChainedCacheImpl<bucket_info_entry>;
  binfo_cache->init(svc.cache);

  lc = new RGWLC();
  lc->initialize(cct, this->driver);

  if (use_lc_thread)
    lc->start_processor();

  quota_handler = RGWQuotaHandler::generate_handler(dpp, this->driver, quota_threads);

  // per-zone override wins, but never exceed the hard maximum
  bucket_index_max_shards = (cct->_conf->rgw_override_bucket_index_max_shards ? cct->_conf->rgw_override_bucket_index_max_shards :
                             zone.bucket_index_max_shards);
  if (bucket_index_max_shards > get_max_bucket_shards()) {
    bucket_index_max_shards = get_max_bucket_shards();
    ldpp_dout(dpp, 1) << __func__ << " bucket index max shards is too large, reset to value: "
      << get_max_bucket_shards() << dendl;
  }
  ldpp_dout(dpp, 20) << __func__ << " bucket index max shards: " << bucket_index_max_shards << dendl;

  bool need_tombstone_cache = !svc.zone->get_zone_data_notify_to_map().empty(); /* have zones syncing from us */

  if (need_tombstone_cache) {
    obj_tombstone_cache = new tombstone_cache_t(cct->_conf->rgw_obj_tombstone_cache_size);
  }

  reshard_wait = std::make_shared<RGWReshardWait>();

  reshard = new RGWReshard(this->driver);

  // disable reshard thread based on zone/zonegroup support
  run_reshard_thread = run_reshard_thread && svc.zone->can_reshard();

  if (run_reshard_thread)  {
    reshard->start_processor();
  }

  index_completion_manager = new RGWIndexCompletionManager(this);
  ret = rgw::notify::init(cct, driver, dpp);
  if (ret < 0 ) {
    ldpp_dout(dpp, 1) << "ERROR: failed to initialize notification manager" << dendl;
  }

  return ret;
}
+
+int RGWRados::init_svc(bool raw, const DoutPrefixProvider *dpp)
+{
+ if (raw) {
+ return svc.init_raw(cct, use_cache, null_yield, dpp);
+ }
+
+ return svc.init(cct, use_cache, run_sync_thread, null_yield, dpp);
+}
+
// Wire up the control layer on top of the already-initialized services.
int RGWRados::init_ctl(const DoutPrefixProvider *dpp)
{
  return ctl.init(&svc, driver, dpp);
}
+
+/**
+ * Initialize the RADOS instance and prepare to do other ops
+ * Returns 0 on success, -ERR# on failure.
+ */
+int RGWRados::init_begin(const DoutPrefixProvider *dpp)
+{
+ int ret = init_svc(false, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to init services (ret=" << cpp_strerror(-ret) << ")" << dendl;
+ return ret;
+ }
+
+ ret = init_ctl(dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to init ctls (ret=" << cpp_strerror(-ret) << ")" << dendl;
+ return ret;
+ }
+
+ host_id = svc.zone_utils->gen_host_id();
+
+ return init_rados();
+}
+
+/**
+ * Open the pool used as root for this gateway
+ * Returns: 0 on success, -ERR# otherwise.
+ */
// Open the zone's domain-root pool (creating it if needed, mostly-omap hint on).
int RGWRados::open_root_pool_ctx(const DoutPrefixProvider *dpp)
{
  return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().domain_root, root_pool_ctx, true, true);
}
+
// Open the garbage-collection pool (creating it if needed, mostly-omap hint on).
int RGWRados::open_gc_pool_ctx(const DoutPrefixProvider *dpp)
{
  return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().gc_pool, gc_pool_ctx, true, true);
}
+
// Open the lifecycle pool (creating it if needed, mostly-omap hint on).
int RGWRados::open_lc_pool_ctx(const DoutPrefixProvider *dpp)
{
  return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().lc_pool, lc_pool_ctx, true, true);
}
+
// Open the log pool used by the object expirer (creating it if needed).
int RGWRados::open_objexp_pool_ctx(const DoutPrefixProvider *dpp)
{
  return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, objexp_pool_ctx, true, true);
}
+
// Open the reshard pool (creating it if needed, mostly-omap hint on).
int RGWRados::open_reshard_pool_ctx(const DoutPrefixProvider *dpp)
{
  return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().reshard_pool, reshard_pool_ctx, true, true);
}
+
// Open the bucket-notifications pool (creating it if needed).
int RGWRados::open_notif_pool_ctx(const DoutPrefixProvider *dpp)
{
  return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().notif_pool, notif_pool_ctx, true, true);
}
+
// Open an IoCtx on an arbitrary pool, creating the pool if it doesn't
// exist.  `mostly_omap` and `bulk` are pool-creation hints forwarded to
// rgw_init_ioctx.
int RGWRados::open_pool_ctx(const DoutPrefixProvider *dpp, const rgw_pool& pool, librados::IoCtx& io_ctx,
                            bool mostly_omap, bool bulk)
{
  constexpr bool create = true; // create the pool if it doesn't exist
  return rgw_init_ioctx(dpp, get_rados_handle(), pool, io_ctx, create, mostly_omap, bulk);
}
+
+/**** logs ****/
+
// Cursor state behind the opaque RGWAccessHandle returned by
// log_list_init(); advanced by log_list_next() and freed by it on -ENOENT.
struct log_list_state {
  string prefix;                     // only list objects whose oid starts with this
  librados::IoCtx io_ctx;            // open handle on the zone's log pool
  librados::NObjectIterator obit;    // current position in the pool listing
};
+
+int RGWRados::log_list_init(const DoutPrefixProvider *dpp, const string& prefix, RGWAccessHandle *handle)
+{
+ log_list_state *state = new log_list_state;
+ int r = rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, state->io_ctx);
+ if (r < 0) {
+ delete state;
+ return r;
+ }
+ try {
+ state->prefix = prefix;
+ state->obit = state->io_ctx.nobjects_begin();
+ *handle = (RGWAccessHandle)state;
+ return 0;
+ } catch (const std::system_error& e) {
+ r = -e.code().value();
+ ldpp_dout(dpp, 10) << "nobjects_begin threw " << e.what()
+ << ", returning " << r << dendl;
+ return r;
+ } catch (const std::exception& e) {
+ ldpp_dout(dpp, 10) << "nobjects_begin threw " << e.what()
+ << ", returning -5" << dendl;
+ return -EIO;
+ }
+}
+
// Return the next log object name matching the prefix, or -ENOENT at end
// of listing.  NOTE: on -ENOENT the handle's state is freed here -- the
// caller must not use the handle again afterwards.
int RGWRados::log_list_next(RGWAccessHandle handle, string *name)
{
  log_list_state *state = static_cast<log_list_state *>(handle);
  while (true) {
    if (state->obit == state->io_ctx.nobjects_end()) {
      delete state;
      return -ENOENT;
    }
    // skip entries that don't start with the requested prefix
    if (state->prefix.length() &&
        state->obit->get_oid().find(state->prefix) != 0) {
      state->obit++;
      continue;
    }
    *name = state->obit->get_oid();
    state->obit++;
    break;
  }
  return 0;
}
+
+int RGWRados::log_remove(const DoutPrefixProvider *dpp, const string& name)
+{
+ librados::IoCtx io_ctx;
+ int r = rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, io_ctx);
+ if (r < 0)
+ return r;
+ return io_ctx.remove(name);
+}
+
+struct log_show_state {
+ librados::IoCtx io_ctx;
+ bufferlist bl;
+ bufferlist::const_iterator p;
+ string name;
+ uint64_t pos;
+ bool eof;
+ log_show_state() : pos(0), eof(false) {}
+};
+
+int RGWRados::log_show_init(const DoutPrefixProvider *dpp, const string& name, RGWAccessHandle *handle)
+{
+ log_show_state *state = new log_show_state;
+ int r = rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, state->io_ctx);
+ if (r < 0) {
+ delete state;
+ return r;
+ }
+ state->name = name;
+ *handle = (RGWAccessHandle)state;
+ return 0;
+}
+
// Decode the next rgw_log_entry from the log object, refilling the
// in-memory window from RADOS as needed.  Returns 1 with *entry filled,
// 0 at end of file, or a negative error.
int RGWRados::log_show_next(const DoutPrefixProvider *dpp, RGWAccessHandle handle, rgw_log_entry *entry)
{
  log_show_state *state = static_cast<log_show_state *>(handle);
  off_t off = state->p.get_off();

  ldpp_dout(dpp, 10) << "log_show_next pos " << state->pos << " bl " << state->bl.length()
	     << " off " << off
	     << " eof " << (int)state->eof
	     << dendl;
  // read some?
  unsigned chunk = 1024*1024;
  if ((state->bl.length() - off) < chunk/2 && !state->eof) {
    bufferlist more;
    int r = state->io_ctx.read(state->name, more, chunk, state->pos);
    if (r < 0)
      return r;
    state->pos += r;
    // keep only the undecoded tail of the old buffer, then append the new
    // chunk; the decode iterator restarts at the head of the new window
    bufferlist old;
    try {
      old.substr_of(state->bl, off, state->bl.length() - off);
    } catch (buffer::error& err) {
      return -EINVAL;
    }
    state->bl = std::move(old);
    state->bl.claim_append(more);
    state->p = state->bl.cbegin();
    // a short read means we've hit the end of the object
    if ((unsigned)r < chunk)
      state->eof = true;
    ldpp_dout(dpp, 10) << " read " << r << dendl;
  }

  if (state->p.end())
    return 0;  // end of file
  try {
    decode(*entry, state->p);
  }
  catch (const buffer::error &e) {
    return -EINVAL;
  }
  return 1;
}
+
+/**
+ * usage_log_hash: get usage log key hash, based on name and index
+ *
+ * Get the usage object name. Since a user may have more than 1
+ * object holding that info (multiple shards), we use index to
+ * specify that shard number. Once index exceeds max shards it
+ * wraps.
+ * If name is not being set, results for all users will be returned
+ * and index will wrap only after total shards number.
+ *
+ * @param cct [in] ceph context
+ * @param name [in] user name
+ * @param hash [out] hash value
+ * @param index [in] shard index number
+ */
static void usage_log_hash(CephContext *cct, const string& name, string& hash, uint32_t index)
{
  uint32_t val = index;

  if (!name.empty()) {
    // per-user sharding: wrap the index over the user's shard count, then
    // offset by a hash of the user name so users spread across objects
    int max_user_shards = cct->_conf->rgw_usage_max_user_shards;
    val %= max_user_shards;
    val += ceph_str_hash_linux(name.c_str(), name.size());
  }
  char buf[17];
  // final object name wraps over the global shard count
  int max_shards = cct->_conf->rgw_usage_max_shards;
  snprintf(buf, sizeof(buf), RGW_USAGE_OBJ_PREFIX "%u", (unsigned)(val % max_shards));
  hash = buf;
}
+
+int RGWRados::log_usage(const DoutPrefixProvider *dpp, map<rgw_user_bucket, RGWUsageBatch>& usage_info)
+{
+ uint32_t index = 0;
+
+ map<string, rgw_usage_log_info> log_objs;
+
+ string hash;
+ string last_user;
+
+ /* restructure usage map, zone by object hash */
+ map<rgw_user_bucket, RGWUsageBatch>::iterator iter;
+ for (iter = usage_info.begin(); iter != usage_info.end(); ++iter) {
+ const rgw_user_bucket& ub = iter->first;
+ RGWUsageBatch& info = iter->second;
+
+ if (ub.user.empty()) {
+ ldpp_dout(dpp, 0) << "WARNING: RGWRados::log_usage(): user name empty (bucket=" << ub.bucket << "), skipping" << dendl;
+ continue;
+ }
+
+ if (ub.user != last_user) {
+ /* index *should* be random, but why waste extra cycles
+ in most cases max user shards is not going to exceed 1,
+ so just incrementing it */
+ usage_log_hash(cct, ub.user, hash, index++);
+ }
+ last_user = ub.user;
+ vector<rgw_usage_log_entry>& v = log_objs[hash].entries;
+
+ for (auto miter = info.m.begin(); miter != info.m.end(); ++miter) {
+ v.push_back(miter->second);
+ }
+ }
+
+ map<string, rgw_usage_log_info>::iterator liter;
+
+ for (liter = log_objs.begin(); liter != log_objs.end(); ++liter) {
+ int r = cls_obj_usage_log_add(dpp, liter->first, liter->second);
+ if (r < 0)
+ return r;
+ }
+ return 0;
+}
+
// Aggregate usage records for `user` (optionally limited to one bucket and
// an epoch range) across all usage-log shard objects.  Iteration resumes
// from the shard recorded in `usage_iter` and wraps until it returns to
// the first shard, `max_entries` are collected, or a shard reports
// truncation (caller re-invokes with the same iter to continue).
int RGWRados::read_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const string& bucket_name, uint64_t start_epoch, uint64_t end_epoch,
                         uint32_t max_entries, bool *is_truncated, RGWUsageIter& usage_iter, map<rgw_user_bucket,
			 rgw_usage_log_entry>& usage)
{
  uint32_t num = max_entries;
  string hash, first_hash;
  string user_str = user.to_str();
  usage_log_hash(cct, user_str, first_hash, 0);

  if (usage_iter.index) {
    // resume from the shard a previous (truncated) call stopped at
    usage_log_hash(cct, user_str, hash, usage_iter.index);
  } else {
    hash = first_hash;
  }

  usage.clear();

  do {
    map<rgw_user_bucket, rgw_usage_log_entry> ret_usage;
    map<rgw_user_bucket, rgw_usage_log_entry>::iterator iter;

    int ret = cls_obj_usage_log_read(dpp, hash, user_str, bucket_name, start_epoch, end_epoch, num,
                                     usage_iter.read_iter, ret_usage, is_truncated);
    // a missing shard object just means nothing was logged there yet
    if (ret == -ENOENT)
      goto next;

    if (ret < 0)
      return ret;

    num -= ret_usage.size();

    for (iter = ret_usage.begin(); iter != ret_usage.end(); ++iter) {
      usage[iter->first].aggregate(iter->second);
    }

next:
    // shard exhausted: clear the intra-shard cursor and advance to the
    // next shard; the loop stops once we wrap back to the first shard
    if (!*is_truncated) {
      usage_iter.read_iter.clear();
      usage_log_hash(cct, user_str, hash, ++usage_iter.index);
    }
  } while (num && !*is_truncated && hash != first_hash);
  return 0;
}
+
+int RGWRados::trim_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const string& bucket_name, uint64_t start_epoch, uint64_t end_epoch)
+{
+ uint32_t index = 0;
+ string hash, first_hash;
+ string user_str = user.to_str();
+ usage_log_hash(cct, user_str, first_hash, index);
+
+ hash = first_hash;
+ do {
+ int ret = cls_obj_usage_log_trim(dpp, hash, user_str, bucket_name, start_epoch, end_epoch);
+
+ if (ret < 0 && ret != -ENOENT)
+ return ret;
+
+ usage_log_hash(cct, user_str, hash, ++index);
+ } while (hash != first_hash);
+
+ return 0;
+}
+
+
+int RGWRados::clear_usage(const DoutPrefixProvider *dpp)
+{
+ auto max_shards = cct->_conf->rgw_usage_max_shards;
+ int ret=0;
+ for (unsigned i=0; i < max_shards; i++){
+ string oid = RGW_USAGE_OBJ_PREFIX + to_string(i);
+ ret = cls_obj_usage_log_clear(dpp, oid);
+ if (ret < 0){
+ ldpp_dout(dpp,0) << "usage clear on oid="<< oid << "failed with ret=" << ret << dendl;
+ return ret;
+ }
+ }
+ return ret;
+}
+
+int RGWRados::decode_policy(const DoutPrefixProvider *dpp,
+ ceph::buffer::list& bl,
+ ACLOwner *owner)
+{
+ auto i = bl.cbegin();
+ RGWAccessControlPolicy policy(cct);
+ try {
+ policy.decode_owner(i);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
+ return -EIO;
+ }
+ *owner = policy.get_owner();
+ return 0;
+}
+
+// Re-point this Bucket at a different bucket-instance id and reload the
+// matching bucket instance info from the store.
+int RGWRados::Bucket::update_bucket_id(const string& new_bucket_id, const DoutPrefixProvider *dpp)
+{
+  rgw_bucket b = bucket_info.bucket;
+  b.update_bucket_id(new_bucket_id);
+
+  // drop any cached object version before re-reading the instance info
+  bucket_info.objv_tracker.clear();
+  int r = store->get_bucket_instance_info(b, bucket_info, nullptr, nullptr,
+                                          null_yield, dpp);
+  return r < 0 ? r : 0;
+}
+
+
+/**
+ * Get ordered listing of the objects in a bucket.
+ *
+ * max_p: maximum number of results to return
+ * bucket: bucket to list contents of
+ * prefix: only return results that match this prefix
+ * delim: do not include results that match this string.
+ * Any skipped results will have the matching portion of their name
+ * inserted in common_prefixes with a "true" mark.
+ * marker: if filled in, begin the listing with this object.
+ * end_marker: if filled in, end the listing with this object.
+ * result: the objects are put in here.
+ * common_prefixes: if delim is filled in, any matching prefixes are
+ * placed here.
+ * is_truncated: if number of objects in the bucket is bigger than
+ * max, then truncated.
+ */
+int RGWRados::Bucket::List::list_objects_ordered(
+  const DoutPrefixProvider *dpp,
+  int64_t max_p,
+  std::vector<rgw_bucket_dir_entry> *result,
+  std::map<std::string, bool> *common_prefixes,
+  bool *is_truncated,
+  optional_yield y)
+{
+  RGWRados *store = target->get_store();
+  CephContext *cct = store->ctx();
+  int shard_id = target->get_shard_id();
+  const auto& current_index = target->get_bucket_info().layout.current_index;
+
+  int count = 0;
+  bool truncated = true;
+  bool cls_filtered = false;
+  const int64_t max = // protect against memory issues and negative vals
+    std::min(bucket_list_objects_absolute_max, std::max(int64_t(0), max_p));
+  int read_ahead = std::max(cct->_conf->rgw_list_bucket_min_readahead, max);
+
+  result->clear();
+
+  // use a local marker; either the marker will have a previous entry
+  // or it will be empty; either way it's OK to copy
+  rgw_obj_key marker_obj(params.marker.name,
+			 params.marker.instance,
+			 params.ns.empty() ? params.marker.ns : params.ns);
+  rgw_obj_index_key cur_marker;
+  marker_obj.get_index_key(&cur_marker);
+
+  rgw_obj_key end_marker_obj(params.end_marker.name,
+			     params.end_marker.instance,
+			     params.ns.empty() ? params.end_marker.ns : params.ns);
+  rgw_obj_index_key cur_end_marker;
+  end_marker_obj.get_index_key(&cur_end_marker);
+  const bool cur_end_marker_valid = !params.end_marker.empty();
+
+  rgw_obj_key prefix_obj(params.prefix);
+  prefix_obj.set_ns(params.ns);
+  std::string cur_prefix = prefix_obj.get_index_key_name();
+  std::string after_delim_s; /* needed in !params.delim.empty() AND later */
+
+  if (!params.delim.empty()) {
+    after_delim_s = cls_rgw_after_delim(params.delim);
+    /* if marker points at a common prefix, fast forward it into its
+     * upper bound string */
+    int delim_pos = cur_marker.name.find(params.delim, cur_prefix.size());
+    if (delim_pos >= 0) {
+      string s = cur_marker.name.substr(0, delim_pos);
+      s.append(after_delim_s);
+      cur_marker = s;
+    }
+  }
+
+  // we'll stop after this many attempts as long we return at least
+  // one entry; but we will also go beyond this number of attempts
+  // until we return at least one entry
+  constexpr uint16_t SOFT_MAX_ATTEMPTS = 8;
+
+  rgw_obj_index_key prev_marker;
+  for (uint16_t attempt = 1; /* empty */; ++attempt) {
+    ldpp_dout(dpp, 20) << __func__ <<
+      ": starting attempt " << attempt << dendl;
+
+    if (attempt > 1 && !(prev_marker < cur_marker)) {
+      // we've failed to make forward progress
+      ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+	" marker failed to make forward progress; attempt=" << attempt <<
+	", prev_marker=" << prev_marker <<
+	", cur_marker=" << cur_marker << dendl;
+      break;
+    }
+    prev_marker = cur_marker;
+
+    ent_map_t ent_map;
+    ent_map.reserve(read_ahead);
+    int r = store->cls_bucket_list_ordered(dpp,
+                                           target->get_bucket_info(),
+                                           current_index,
+                                           shard_id,
+                                           cur_marker,
+                                           cur_prefix,
+                                           params.delim,
+                                           read_ahead + 1 - count,
+                                           params.list_versions,
+                                           attempt,
+                                           ent_map,
+                                           &truncated,
+                                           &cls_filtered,
+                                           &cur_marker,
+                                           y,
+                                           params.force_check_filter);
+    if (r < 0) {
+      return r;
+    }
+
+    for (auto eiter = ent_map.begin(); eiter != ent_map.end(); ++eiter) {
+      rgw_bucket_dir_entry& entry = eiter->second;
+      rgw_obj_index_key index_key = entry.key;
+      rgw_obj_key obj(index_key);
+
+      ldpp_dout(dpp, 20) << __func__ <<
+	": considering entry " << entry.key << dendl;
+
+      /* note that parse_raw_oid() here will not set the correct
+       * object's instance, as rgw_obj_index_key encodes that
+       * separately. We don't need to set the instance because it's
+       * not needed for the checks here and we end up using the raw
+       * entry for the return vector
+       */
+      bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
+      if (!valid) {
+	ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+	  " could not parse object name: " << obj.name << dendl;
+	continue;
+      }
+
+      bool matched_ns = (obj.ns == params.ns);
+      if (!params.list_versions && !entry.is_visible()) {
+	ldpp_dout(dpp, 10) << __func__ <<
+	  ": skipping not visible entry \"" << entry.key << "\"" << dendl;
+	continue;
+      }
+
+      if (params.enforce_ns && !matched_ns) {
+	if (!params.ns.empty()) {
+	  /* we've iterated past the namespace we're searching -- done now */
+	  truncated = false;
+	  ldpp_dout(dpp, 10) << __func__ <<
+	    ": finished due to getting past requested namespace \"" <<
+	    params.ns << "\"" << dendl;
+	  goto done;
+	}
+
+	/* we're skipping past namespaced objects */
+	ldpp_dout(dpp, 20) << __func__ <<
+	  ": skipping past namespaced objects, including \"" << entry.key <<
+	  "\"" << dendl;
+	continue;
+      }
+
+      if (cur_end_marker_valid && cur_end_marker <= index_key) {
+	truncated = false;
+	// fixed typo: "gitting" -> "hitting"
+	ldpp_dout(dpp, 10) << __func__ <<
+	  ": finished due to hitting end marker of \"" << cur_end_marker <<
+	  "\" with \"" << entry.key << "\"" << dendl;
+	goto done;
+      }
+
+      if (count < max) {
+	params.marker = index_key;
+	next_marker = index_key;
+      }
+
+      if (params.access_list_filter &&
+	  ! params.access_list_filter->filter(obj.name, index_key.name)) {
+	// fixed copy-pasted namespace message; this is the filter skip path
+	ldpp_dout(dpp, 20) << __func__ <<
+	  ": skipping entry \"" << entry.key <<
+	  "\" because it does not pass the access list filter" << dendl;
+	continue;
+      }
+
+      if (params.prefix.size() &&
+	  0 != obj.name.compare(0, params.prefix.size(), params.prefix)) {
+	ldpp_dout(dpp, 20) << __func__ <<
+	  ": skipping object \"" << entry.key <<
+	  "\" that doesn't match prefix \"" << params.prefix << "\"" << dendl;
+	continue;
+      }
+
+      if (!params.delim.empty()) {
+	const int delim_pos = obj.name.find(params.delim, params.prefix.size());
+	if (delim_pos >= 0) {
+	  // run either the code where delimiter filtering is done a)
+	  // in the OSD/CLS or b) here.
+	  if (cls_filtered) {
+	    // NOTE: this condition is for the newer versions of the
+	    // OSD that does filtering on the CLS side should only
+	    // find one delimiter at the end if it finds any after the
+	    // prefix
+	    if (delim_pos !=
+		int(obj.name.length() - params.delim.length())) {
+	      ldpp_dout(dpp, 0) << "WARNING: " << __func__ <<
+		" found delimiter in place other than the end of "
+		"the prefix; obj.name=" << obj.name <<
+		", prefix=" << params.prefix << dendl;
+	    }
+	    if (common_prefixes) {
+	      if (count >= max) {
+		truncated = true;
+		ldpp_dout(dpp, 10) << __func__ <<
+		  ": stopping early with common prefix \"" << entry.key <<
+		  "\" because requested number (" << max <<
+		  ") reached (cls filtered)" << dendl;
+		goto done;
+	      }
+
+	      (*common_prefixes)[obj.name] = true;
+	      count++;
+	    }
+
+	    ldpp_dout(dpp, 20) << __func__ <<
+	      ": finished entry with common prefix \"" << entry.key <<
+	      "\" so continuing loop (cls filtered)" << dendl;
+	    continue;
+	  } else {
+	    // NOTE: this condition is for older versions of the OSD
+	    // that do not filter on the CLS side, so the following code
+	    // must do the filtering; once we reach version 16 of ceph,
+	    // this code can be removed along with the conditional that
+	    // can lead this way
+
+	    /* extract key -with trailing delimiter- for CommonPrefix */
+	    string prefix_key =
+	      obj.name.substr(0, delim_pos + params.delim.length());
+
+	    if (common_prefixes &&
+		common_prefixes->find(prefix_key) == common_prefixes->end()) {
+	      if (count >= max) {
+		truncated = true;
+		ldpp_dout(dpp, 10) << __func__ <<
+		  ": stopping early with common prefix \"" << entry.key <<
+		  "\" because requested number (" << max <<
+		  ") reached (not cls filtered)" << dendl;
+		goto done;
+	      }
+	      next_marker = prefix_key;
+	      (*common_prefixes)[prefix_key] = true;
+
+	      count++;
+	    }
+
+	    ldpp_dout(dpp, 20) << __func__ <<
+	      ": finished entry with common prefix \"" << entry.key <<
+	      "\" so continuing loop (not cls filtered)" << dendl;
+	    continue;
+	  } // if we're running an older OSD version
+	} // if a delimiter was found after prefix
+      } // if a delimiter was passed in
+
+      if (count >= max) {
+	truncated = true;
+	ldpp_dout(dpp, 10) << __func__ <<
+	  ": stopping early with entry \"" << entry.key <<
+	  "\" because requested number (" << max <<
+	  ") reached" << dendl;
+	goto done;
+      }
+
+      ldpp_dout(dpp, 20) << __func__ <<
+	": adding entry " << entry.key << " to result" << dendl;
+
+      result->emplace_back(std::move(entry));
+      count++;
+    } // eiter for loop
+
+    // NOTE: the following conditional is needed by older versions of
+    // the OSD that don't do delimiter filtering on the CLS side; once
+    // we reach version 16 of ceph, the following conditional and the
+    // code within can be removed
+    if (!cls_filtered && !params.delim.empty()) {
+      int marker_delim_pos =
+	cur_marker.name.find(params.delim, cur_prefix.size());
+      if (marker_delim_pos >= 0) {
+	std::string skip_after_delim =
+	  cur_marker.name.substr(0, marker_delim_pos);
+	skip_after_delim.append(after_delim_s);
+
+	ldpp_dout(dpp, 20) << __func__ <<
+	  ": skip_after_delim=" << skip_after_delim << dendl;
+
+	if (skip_after_delim > cur_marker.name) {
+	  cur_marker = skip_after_delim;
+	  ldpp_dout(dpp, 20) << __func__ <<
+	    ": setting cur_marker=" << cur_marker.name <<
+	    "[" << cur_marker.instance << "]" << dendl;
+	}
+      }
+    } // if older osd didn't do delimiter filtering
+
+    ldpp_dout(dpp, 10) << __func__ <<
+      ": end of outer loop, truncated=" << truncated <<
+      ", count=" << count << ", attempt=" << attempt << dendl;
+
+    if (!truncated || count >= (max + 1) / 2) {
+      // if we finished listing, or if we're returning at least half the
+      // requested entries, that's enough; S3 and swift protocols allow
+      // returning fewer than max entries
+      ldpp_dout(dpp, 10) << __func__ <<
+	": exiting attempt loop because we reached end (" << truncated <<
+	") or we're returning half the requested entries (" << count <<
+	" of " << max << ")" << dendl;
+      break;
+    } else if (attempt > SOFT_MAX_ATTEMPTS && count >= 1) {
+      // if we've made at least 8 attempts and we have some, but very
+      // few, results, return with what we have
+      ldpp_dout(dpp, 10) << __func__ <<
+	": exiting attempt loop because we made " << attempt <<
+	" attempts and we're returning " << count << " entries" << dendl;
+      break;
+    }
+  } // for (uint16_t attempt...
+
+done:
+
+  if (is_truncated) {
+    *is_truncated = truncated;
+  }
+
+  return 0;
+} // list_objects_ordered
+
+
+/**
+ * Get listing of the objects in a bucket and allow the results to be out
+ * of order.
+ *
+ * Even though there are key differences with the ordered counterpart,
+ * the parameters are the same to maintain some compatibility.
+ *
+ * max: maximum number of results to return
+ * bucket: bucket to list contents of
+ * prefix: only return results that match this prefix
+ * delim: should not be set; if it is we should have indicated an error
+ * marker: if filled in, begin the listing with this object.
+ * end_marker: if filled in, end the listing with this object.
+ * result: the objects are put in here.
+ * common_prefixes: this is never filled with an unordered list; the param
+ * is maintained for compatibility
+ * is_truncated: if number of objects in the bucket is bigger than max, then
+ * truncated.
+ */
+int RGWRados::Bucket::List::list_objects_unordered(const DoutPrefixProvider *dpp,
+                                                   int64_t max_p,
+                                                   std::vector<rgw_bucket_dir_entry>* result,
+                                                   std::map<std::string, bool>* common_prefixes,
+                                                   bool* is_truncated,
+                                                   optional_yield y)
+{
+  RGWRados *store = target->get_store();
+  int shard_id = target->get_shard_id();
+  const auto& current_index = target->get_bucket_info().layout.current_index;
+
+  int count = 0;
+  bool truncated = true;
+
+  const int64_t max = // protect against memory issues and negative vals
+    std::min(bucket_list_objects_absolute_max, std::max(int64_t(0), max_p));
+
+  // read a few extra in each call to cls_bucket_list_unordered in
+  // case some are filtered out due to namespace matching, versioning,
+  // filtering, etc.
+  const int64_t max_read_ahead = 100;
+  const uint32_t read_ahead = uint32_t(max + std::min(max, max_read_ahead));
+
+  result->clear();
+
+  // use a local marker; either the marker will have a previous entry
+  // or it will be empty; either way it's OK to copy
+  rgw_obj_key marker_obj(params.marker.name,
+			 params.marker.instance,
+			 params.ns.empty() ? params.marker.ns : params.ns);
+  rgw_obj_index_key cur_marker;
+  marker_obj.get_index_key(&cur_marker);
+
+  rgw_obj_key end_marker_obj(params.end_marker.name,
+			     params.end_marker.instance,
+			     params.ns.empty() ? params.end_marker.ns : params.ns);
+  rgw_obj_index_key cur_end_marker;
+  end_marker_obj.get_index_key(&cur_end_marker);
+  const bool cur_end_marker_valid = !params.end_marker.empty();
+
+  rgw_obj_key prefix_obj(params.prefix);
+  prefix_obj.set_ns(params.ns);
+  std::string cur_prefix = prefix_obj.get_index_key_name();
+
+  while (truncated && count <= max) {
+    std::vector<rgw_bucket_dir_entry> ent_list;
+    ent_list.reserve(read_ahead);
+
+    int r = store->cls_bucket_list_unordered(dpp,
+                                             target->get_bucket_info(),
+                                             current_index,
+                                             shard_id,
+                                             cur_marker,
+                                             cur_prefix,
+                                             read_ahead,
+                                             params.list_versions,
+                                             ent_list,
+                                             &truncated,
+                                             &cur_marker,
+                                             y);
+    if (r < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+	" cls_bucket_list_unordered returned " << r << " for " <<
+	target->get_bucket_info().bucket << dendl;
+      return r;
+    }
+
+    // NB: while regions of ent_list will be sorted, we have no
+    // guarantee that all items will be sorted since they can cross
+    // shard boundaries
+
+    for (auto& entry : ent_list) {
+      rgw_obj_index_key index_key = entry.key;
+      rgw_obj_key obj(index_key);
+
+      if (count < max) {
+	params.marker.set(index_key);
+	next_marker.set(index_key);
+      }
+
+      /* note that parse_raw_oid() here will not set the correct
+       * object's instance, as rgw_obj_index_key encodes that
+       * separately. We don't need to set the instance because it's
+       * not needed for the checks here and we end up using the raw
+       * entry for the return vector
+       */
+      bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
+      if (!valid) {
+	ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+	  " could not parse object name: " << obj.name << dendl;
+	continue;
+      }
+
+      // log typos fixed below: "skippping" -> "skipping", "visibile" -> "visible"
+      if (!params.list_versions && !entry.is_visible()) {
+	ldpp_dout(dpp, 20) << __func__ <<
+	  ": skipping \"" << index_key <<
+	  "\" because not listing versions and entry not visible" << dendl;
+	continue;
+      }
+
+      if (params.enforce_ns && obj.ns != params.ns) {
+	ldpp_dout(dpp, 20) << __func__ <<
+	  ": skipping \"" << index_key <<
+	  "\" because namespace does not match" << dendl;
+	continue;
+      }
+
+      if (cur_end_marker_valid && cur_end_marker <= index_key) {
+	// we're not guaranteed items will come in order, so we have
+	// to loop through all
+	ldpp_dout(dpp, 20) << __func__ <<
+	  ": skipping \"" << index_key <<
+	  "\" because after end_marker" << dendl;
+	continue;
+      }
+
+      if (params.access_list_filter &&
+	  !params.access_list_filter->filter(obj.name, index_key.name)) {
+	ldpp_dout(dpp, 20) << __func__ <<
+	  ": skipping \"" << index_key <<
+	  "\" because doesn't match filter" << dendl;
+	continue;
+      }
+
+      if (params.prefix.size() &&
+	  (0 != obj.name.compare(0, params.prefix.size(), params.prefix))) {
+	ldpp_dout(dpp, 20) << __func__ <<
+	  ": skipping \"" << index_key <<
+	  "\" because doesn't match prefix" << dendl;
+	continue;
+      }
+
+      if (count >= max) {
+	truncated = true;
+	goto done;
+      }
+
+      result->emplace_back(std::move(entry));
+      count++;
+    } // for (auto& entry : ent_list)
+  } // while (truncated && count <= max)
+
+done:
+
+  if (is_truncated) {
+    *is_truncated = truncated;
+  }
+
+  return 0;
+} // list_objects_unordered
+
+
+/**
+ * create a rados pool, associated meta info
+ * returns 0 on success, -ERR# otherwise.
+ */
+int RGWRados::create_pool(const DoutPrefixProvider *dpp, const rgw_pool& pool)
+{
+  // delegate to the shared ioctx initializer, asking it to create the
+  // pool if it does not already exist
+  librados::IoCtx ioctx;
+  return rgw_init_ioctx(dpp, get_rados_handle(), pool, ioctx, true /* create */);
+}
+
+// Generate a unique bucket id of the form "<zone-id>.<instance-id>.<seq>".
+// Uses std::string concatenation instead of snprintf into a
+// variable-length array (VLAs are a non-standard C++ extension); the
+// output is identical to the previous "%s.%" PRIu64 ".%" PRIu64 format.
+void RGWRados::create_bucket_id(string *bucket_id)
+{
+  const uint64_t iid = instance_id();
+  const uint64_t bid = next_bucket_id();
+  *bucket_id = svc.zone->get_zone_params().get_id() + "." +
+      std::to_string(iid) + "." + std::to_string(bid);
+}
+
+// Create a new bucket instance (placement selection, id generation,
+// index init) and link it into the bucket metadata, retrying when we
+// race with a concurrent create/remove of the same bucket name.
+// Returns 0 on success, -EEXIST (with `info` filled from the existing
+// instance) when the bucket already exists, or another negative error.
+int RGWRados::create_bucket(const RGWUserInfo& owner, rgw_bucket& bucket,
+                            const string& zonegroup_id,
+                            const rgw_placement_rule& placement_rule,
+                            const string& swift_ver_location,
+                            const RGWQuotaInfo * pquota_info,
+                            map<std::string, bufferlist>& attrs,
+                            RGWBucketInfo& info,
+                            obj_version *pobjv,
+                            obj_version *pep_objv,
+                            real_time creation_time,
+                            rgw_bucket *pmaster_bucket,
+                            uint32_t *pmaster_num_shards,
+                            optional_yield y,
+                            const DoutPrefixProvider *dpp,
+                            bool exclusive)
+{
+#define MAX_CREATE_RETRIES 20 /* need to bound retries */
+  rgw_placement_rule selected_placement_rule;
+  RGWZonePlacementInfo rule_info;
+
+  for (int i = 0; i < MAX_CREATE_RETRIES; i++) {
+    int ret = 0;
+    ret = svc.zone->select_bucket_placement(dpp, owner, zonegroup_id, placement_rule,
+                                            &selected_placement_rule, &rule_info, y);
+    if (ret < 0)
+      return ret;
+
+    // without a master bucket we mint a fresh id; otherwise mirror the
+    // master's marker/id so the instance matches across zones
+    if (!pmaster_bucket) {
+      create_bucket_id(&bucket.marker);
+      bucket.bucket_id = bucket.marker;
+    } else {
+      bucket.marker = pmaster_bucket->marker;
+      bucket.bucket_id = pmaster_bucket->bucket_id;
+    }
+
+    RGWObjVersionTracker& objv_tracker = info.objv_tracker;
+
+    // start from a clean read version; the write version is either the
+    // caller-supplied one or newly generated
+    objv_tracker.read_version.clear();
+
+    if (pobjv) {
+      objv_tracker.write_version = *pobjv;
+    } else {
+      objv_tracker.generate_new_write_ver(cct);
+    }
+
+    info.bucket = bucket;
+    info.owner = owner.user_id;
+    info.zonegroup = zonegroup_id;
+    info.placement_rule = selected_placement_rule;
+    info.swift_ver_location = swift_ver_location;
+    info.swift_versioning = (!swift_ver_location.empty());
+
+    init_default_bucket_layout(cct, info.layout, svc.zone->get_zone(),
+			       pmaster_num_shards ?
+			       std::optional{*pmaster_num_shards} :
+			       std::nullopt,
+			       rule_info.index_type);
+
+    info.requester_pays = false;
+    if (real_clock::is_zero(creation_time)) {
+      info.creation_time = ceph::real_clock::now();
+    } else {
+      info.creation_time = creation_time;
+    }
+    if (pquota_info) {
+      info.quota = *pquota_info;
+    }
+
+    int r = svc.bi->init_index(dpp, info, info.layout.current_index);
+    if (r < 0) {
+      return r;
+    }
+
+    ret = put_linked_bucket_info(info, exclusive, ceph::real_time(), pep_objv, &attrs, true, dpp, y);
+    // a racing write surfaces as -ECANCELED; treat it like "already exists"
+    if (ret == -ECANCELED) {
+      ret = -EEXIST;
+    }
+    if (ret == -EEXIST) {
+      /* we need to reread the info and return it, caller will have a use for it */
+      RGWBucketInfo orig_info;
+      r = get_bucket_info(&svc, bucket.tenant, bucket.name, orig_info, NULL, null_yield, NULL);
+      if (r < 0) {
+        // the racing bucket vanished before we could read it back; retry
+        if (r == -ENOENT) {
+          continue;
+        }
+        ldpp_dout(dpp, 0) << "get_bucket_info returned " << r << dendl;
+        return r;
+      }
+
+      /* only remove it if it's a different bucket instance */
+      if (orig_info.bucket.bucket_id != bucket.bucket_id) {
+        int r = svc.bi->clean_index(dpp, info, info.layout.current_index);
+        if (r < 0) {
+          ldpp_dout(dpp, 0) << "WARNING: could not remove bucket index (r=" << r << ")" << dendl;
+        }
+        r = ctl.bucket->remove_bucket_instance_info(info.bucket, info, null_yield, dpp);
+        if (r < 0) {
+          ldpp_dout(dpp, 0) << "WARNING: " << __func__ << "(): failed to remove bucket instance info: bucket instance=" << info.bucket.get_key() << ": r=" << r << dendl;
+          /* continue anyway */
+        }
+      }
+
+      info = std::move(orig_info);
+      /* ret == -EEXIST here */
+    }
+    return ret;
+  }
+
+  /* this is highly unlikely */
+  ldpp_dout(dpp, 0) << "ERROR: could not create bucket, continuously raced with bucket creation and removal" << dendl;
+  return -ENOENT;
+}
+
+// Translate a logical object into raw-object coordinates (oid, locator
+// and data pool).  Returns false when no data pool can be resolved for
+// the given placement rule.
+bool RGWRados::obj_to_raw(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_raw_obj *raw_obj)
+{
+  bool have_pool = get_obj_data_pool(placement_rule, obj, &raw_obj->pool);
+  get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
+  return have_pool;
+}
+
+// Return the fsid of the backing RADOS cluster.  The dpp/y parameters
+// are part of the interface but unused by this implementation.
+std::string RGWRados::get_cluster_fsid(const DoutPrefixProvider *dpp, optional_yield y)
+{
+  return svc.rados->cluster_fsid();
+}
+
+// Open an IoCtx on the data pool that holds the head object of @obj,
+// and set the ioctx's locator key for that object.  Returns 0 on
+// success, -EIO if the data pool cannot be resolved, or the pool-open
+// error code.
+int RGWRados::get_obj_head_ioctx(const DoutPrefixProvider *dpp,
+                                 const RGWBucketInfo& bucket_info,
+                                 const rgw_obj& obj,
+                                 librados::IoCtx *ioctx)
+{
+  std::string oid;
+  std::string loc_key;
+  get_obj_bucket_and_oid_loc(obj, oid, loc_key);
+
+  rgw_pool pool;
+  bool have_pool = get_obj_data_pool(bucket_info.placement_rule, obj, &pool);
+  if (!have_pool) {
+    ldpp_dout(dpp, 0) << "ERROR: cannot get data pool for obj=" << obj <<
+      ", probably misconfiguration" << dendl;
+    return -EIO;
+  }
+
+  int ret = open_pool_ctx(dpp, pool, *ioctx, false, true);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: unable to open data-pool=" << pool.to_str() <<
+      " for obj=" << obj << " with error-code=" << ret << dendl;
+    return ret;
+  }
+
+  ioctx->locator_set_key(loc_key);
+  return 0;
+}
+
+// Resolve a rgw_rados_ref (pool handle + oid + locator) for the head
+// object of @obj under the given placement rule.  Returns 0 on
+// success, -EIO if no data pool can be resolved, or the pool-open
+// error.
+int RGWRados::get_obj_head_ref(const DoutPrefixProvider *dpp,
+                               const rgw_placement_rule& target_placement_rule,
+                               const rgw_obj& obj,
+                               rgw_rados_ref *ref)
+{
+  get_obj_bucket_and_oid_loc(obj, ref->obj.oid, ref->obj.loc);
+
+  rgw_pool pool;
+  if (!get_obj_data_pool(target_placement_rule, obj, &pool)) {
+    ldpp_dout(dpp, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
+    return -EIO;
+  }
+
+  ref->pool = svc.rados->pool(pool);
+
+  // data pools are not omap-heavy, hence mostly_omap=false
+  int r = ref->pool.open(dpp, RGWSI_RADOS::OpenParams()
+                         .set_mostly_omap(false));
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed opening data pool (pool=" << pool << "); r=" << r << dendl;
+    return r;
+  }
+
+  ref->pool.ioctx().locator_set_key(ref->obj.loc);
+
+  return 0;
+}
+
+// Convenience overload: resolve the head-object ref using the bucket's
+// configured placement rule.
+int RGWRados::get_obj_head_ref(const DoutPrefixProvider *dpp,
+                               const RGWBucketInfo& bucket_info,
+                               const rgw_obj& obj,
+                               rgw_rados_ref *ref)
+{
+  return get_obj_head_ref(dpp, bucket_info.placement_rule, obj, ref);
+}
+
+// Resolve a rgw_rados_ref for an arbitrary raw object and open its
+// pool.  Returns 0 on success or the pool-open error.
+int RGWRados::get_raw_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref)
+{
+  ref->obj = obj;
+
+  // legacy fallback: an empty oid means the pool name itself is the
+  // object, stored in the zone's domain_root pool
+  if (ref->obj.oid.empty()) {
+    ref->obj.oid = obj.pool.to_str();
+    ref->obj.pool = svc.zone->get_zone_params().domain_root;
+  }
+  ref->pool = svc.rados->pool(obj.pool);
+  int r = ref->pool.open(dpp, RGWSI_RADOS::OpenParams()
+                         .set_mostly_omap(false));
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed opening pool (pool=" << obj.pool << "); r=" << r << dendl;
+    return r;
+  }
+
+  ref->pool.ioctx().locator_set_key(ref->obj.loc);
+
+  return 0;
+}
+
+// System objects are plain raw objects; reuse the raw-object resolver.
+int RGWRados::get_system_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref)
+{
+  return get_raw_obj_ref(dpp, obj, ref);
+}
+
+/*
+ * fixes an issue where head objects were supposed to have a locator created, but ended
+ * up without one
+ */
+// Repair a head object that was written without its expected locator:
+// optionally copy it to the correct locator (copy_obj) and optionally
+// remove the mislocated original (remove_bad).  Returns 0 on success
+// or a negative error code.
+int RGWRados::fix_head_obj_locator(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, bool copy_obj, bool remove_bad, rgw_obj_key& key)
+{
+  const rgw_bucket& bucket = bucket_info.bucket;
+  string oid;
+  string locator;
+
+  rgw_obj obj(bucket, key);
+
+  get_obj_bucket_and_oid_loc(obj, oid, locator);
+
+  if (locator.empty()) {
+    ldpp_dout(dpp, 20) << "object does not have a locator, nothing to fix" << dendl;
+    return 0;
+  }
+
+  librados::IoCtx ioctx;
+
+  int ret = get_obj_head_ioctx(dpp, bucket_info, obj, &ioctx);
+  if (ret < 0) {
+    // use the prefixed logger rather than cerr, consistent with the
+    // rest of this file
+    ldpp_dout(dpp, -1) << "ERROR: get_obj_head_ioctx() returned ret=" << ret << dendl;
+    return ret;
+  }
+  ioctx.locator_set_key(string()); /* override locator for this object, use empty locator */
+
+  uint64_t size;
+  bufferlist data;
+
+  struct timespec mtime_ts;
+  map<string, bufferlist> attrs;
+  librados::ObjectReadOperation op;
+  op.getxattrs(&attrs, NULL);
+  op.stat2(&size, &mtime_ts, NULL);
+// parenthesized so the macro is safe in any expression context
+#define HEAD_SIZE (512 * 1024)
+  op.read(0, HEAD_SIZE, &data, NULL);
+
+  ret = rgw_rados_operate(dpp, ioctx, oid, &op, &data, null_yield);
+  if (ret < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: rgw_rados_operate(oid=" << oid << ") returned ret=" << ret << dendl;
+    return ret;
+  }
+
+  if (size > HEAD_SIZE) {
+    ldpp_dout(dpp, -1) << "ERROR: returned object size (" << size << ") > HEAD_SIZE (" << HEAD_SIZE << ")" << dendl;
+    return -EIO;
+  }
+
+  if (size != data.length()) {
+    ldpp_dout(dpp, -1) << "ERROR: returned object size (" << size << ") != data.length() (" << data.length() << ")" << dendl;
+    return -EIO;
+  }
+
+  if (copy_obj) {
+    librados::ObjectWriteOperation wop;
+
+    wop.mtime2(&mtime_ts);
+
+    map<string, bufferlist>::iterator iter;
+    for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
+      wop.setxattr(iter->first.c_str(), iter->second);
+    }
+
+    wop.write(0, data);
+
+    ioctx.locator_set_key(locator);
+    // check the write result: if the copy failed we must not fall
+    // through and (with remove_bad) delete the only remaining copy
+    ret = rgw_rados_operate(dpp, ioctx, oid, &wop, null_yield);
+    if (ret < 0) {
+      ldpp_dout(dpp, -1) << "ERROR: failed to write object with locator, ret=" << ret << dendl;
+      return ret;
+    }
+  }
+
+  if (remove_bad) {
+    ioctx.locator_set_key(string());
+
+    ret = ioctx.remove(oid);
+    if (ret < 0) {
+      ldpp_dout(dpp, -1) << "ERROR: failed to remove original bad object" << dendl;
+      return ret;
+    }
+  }
+
+  return 0;
+}
+
+// Move a rados object from (src_oid, src_locator) to (dst_oid,
+// dst_locator): copy chunk-by-chunk, verify the byte count, then
+// remove the source.  No-op when source and destination coordinates
+// are identical.  Returns 0 on success or a negative error code.
+int RGWRados::move_rados_obj(const DoutPrefixProvider *dpp,
+                             librados::IoCtx& src_ioctx,
+                             const string& src_oid, const string& src_locator,
+                             librados::IoCtx& dst_ioctx,
+                             const string& dst_oid, const string& dst_locator)
+{
+
+#define COPY_BUF_SIZE (4 * 1024 * 1024)
+  bool done = false;
+  uint64_t chunk_size = COPY_BUF_SIZE;
+  uint64_t ofs = 0;
+  int ret = 0;
+  real_time mtime;
+  struct timespec mtime_ts;
+  uint64_t size;
+
+  if (src_oid == dst_oid && src_locator == dst_locator) {
+    return 0;
+  }
+
+  src_ioctx.locator_set_key(src_locator);
+  dst_ioctx.locator_set_key(dst_locator);
+
+  do {
+    bufferlist data;
+    ObjectReadOperation rop;
+    ObjectWriteOperation wop;
+
+    // on the first chunk, also stat the source to learn its size/mtime
+    if (ofs == 0) {
+      rop.stat2(&size, &mtime_ts, NULL);
+      mtime = real_clock::from_timespec(mtime_ts);
+    }
+    rop.read(ofs, chunk_size, &data, NULL);
+    ret = rgw_rados_operate(dpp, src_ioctx, src_oid, &rop, &data, null_yield);
+    if (ret < 0) {
+      goto done_err;
+    }
+
+    if (data.length() == 0) {
+      break;
+    }
+
+    if (ofs == 0) {
+      wop.create(true); /* make it exclusive */
+      wop.mtime2(&mtime_ts);
+      mtime = real_clock::from_timespec(mtime_ts);
+    }
+    wop.write(ofs, data);
+    ret = rgw_rados_operate(dpp, dst_ioctx, dst_oid, &wop, null_yield);
+    if (ret < 0) {
+      goto done_err;
+    }
+    ofs += data.length();
+    // a short read means we've hit the end of the source object
+    done = data.length() != chunk_size;
+  } while (!done);
+
+  // sanity check: total bytes copied must equal the stat'ed size
+  if (ofs != size) {
+    ldpp_dout(dpp, -1) << "ERROR: " << __func__ << ": copying " << src_oid << " -> " << dst_oid
+                   << ": expected " << size << " bytes to copy, ended up with " << ofs << dendl;
+    ret = -EIO;
+    goto done_err;
+  }
+
+  // NOTE(review): remove() return value is ignored; a failure here
+  // would leave the source object behind — confirm this is acceptable
+  src_ioctx.remove(src_oid);
+
+  return 0;
+
+done_err:
+  // TODO: clean up dst_oid if we created it
+  ldpp_dout(dpp, -1) << "ERROR: failed to copy " << src_oid << " -> " << dst_oid << dendl;
+  return ret;
+}
+
+/*
+ * fixes an issue where tail objects were supposed to have a locator created, but ended
+ * up without one
+ */
+// Scan the manifest of @key and, for each tail object missing at its
+// correct locator but present at the legacy (bucket-marker-prefixed)
+// locator, report it via *need_fix and, when @fix is set, move it to
+// the correct locator.  Returns 0 on success or a negative error.
+int RGWRados::fix_tail_obj_locator(const DoutPrefixProvider *dpp,
+                                   RGWBucketInfo& bucket_info, rgw_obj_key& key,
+                                   bool fix, bool *need_fix, optional_yield y)
+{
+  const rgw_bucket& bucket = bucket_info.bucket;
+  rgw_obj obj(bucket, key);
+
+  if (need_fix) {
+    *need_fix = false;
+  }
+
+  rgw_rados_ref ref;
+  int r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
+  if (r < 0) {
+    return r;
+  }
+
+  RGWObjState *astate = nullptr;
+  RGWObjManifest* manifest = nullptr;
+  RGWObjectCtx rctx(this->driver);
+  r = get_obj_state(dpp, &rctx, bucket_info, obj, &astate, &manifest, false, y);
+  if (r < 0)
+    return r;
+
+  if (manifest) {
+    RGWObjManifest::obj_iterator miter;
+    for (miter = manifest->obj_begin(dpp); miter != manifest->obj_end(dpp); ++miter) {
+      rgw_raw_obj raw_loc = miter.get_location().get_raw_obj(this);
+      rgw_obj loc;
+      string oid;
+      string locator;
+
+      RGWSI_Tier_RADOS::raw_obj_to_obj(manifest->get_tail_placement().bucket, raw_loc, &loc);
+
+      if (loc.key.ns.empty()) {
+	/* continue, we're only interested in tail objects */
+	continue;
+      }
+
+      auto& ioctx = ref.pool.ioctx();
+
+      get_obj_bucket_and_oid_loc(loc, oid, locator);
+      // use the local alias consistently (was ref.pool.ioctx())
+      ioctx.locator_set_key(locator);
+
+      ldpp_dout(dpp, 20) << __func__ << ": key=" << key << " oid=" << oid << " locator=" << locator << dendl;
+
+      // if the part exists at the correct locator there is nothing to fix
+      r = ioctx.stat(oid, NULL, NULL);
+      if (r != -ENOENT) {
+	continue;
+      }
+
+      string bad_loc;
+      prepend_bucket_marker(bucket, loc.key.name, bad_loc);
+
+      /* create a new ioctx with the bad locator */
+      librados::IoCtx src_ioctx;
+      src_ioctx.dup(ioctx);
+      src_ioctx.locator_set_key(bad_loc);
+
+      r = src_ioctx.stat(oid, NULL, NULL);
+      if (r != 0) {
+	/* cannot find a broken part */
+	continue;
+      }
+      ldpp_dout(dpp, 20) << __func__ << ": found bad object part: " << loc << dendl;
+      if (need_fix) {
+        *need_fix = true;
+      }
+      if (fix) {
+        r = move_rados_obj(dpp, src_ioctx, oid, bad_loc, ioctx, oid, locator);
+        if (r < 0) {
+          // fixed log message: the call above is move_rados_obj()
+          ldpp_dout(dpp, -1) << "ERROR: move_rados_obj() on oid=" << oid << " returned r=" << r << dendl;
+        }
+      }
+    }
+  }
+
+  return 0;
+}
+
+// Initialize this shard for @obj in @_bucket: load the bucket instance
+// info (optionally returning it via @bucket_info_out) and open the
+// index shard that @obj hashes to.  Returns 0 on success or a negative
+// error code.
+int RGWRados::BucketShard::init(const rgw_bucket& _bucket,
+                                const rgw_obj& obj,
+                                RGWBucketInfo* bucket_info_out,
+                                const DoutPrefixProvider *dpp)
+{
+  bucket = _bucket;
+
+  // read into the caller's struct if provided, otherwise a local one
+  RGWBucketInfo bucket_info;
+  RGWBucketInfo* bucket_info_p =
+    bucket_info_out ? bucket_info_out : &bucket_info;
+
+  int ret = store->get_bucket_instance_info(bucket, *bucket_info_p, NULL, NULL, null_yield, dpp);
+  if (ret < 0) {
+    return ret;
+  }
+
+  // (removed an unused local `string oid;` that was never populated)
+  ret = store->svc.bi_rados->open_bucket_index_shard(dpp, *bucket_info_p, obj.get_hash_object(), &bucket_obj, &shard_id);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
+    return ret;
+  }
+  ldpp_dout(dpp, 20) << " bucket index object: " << bucket_obj.get_raw_obj() << dendl;
+
+  return 0;
+}
+
+// Initialize this shard from already-loaded bucket info, picking the
+// index shard that @obj hashes to.
+int RGWRados::BucketShard::init(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info,
+                                const rgw_obj& obj)
+{
+  bucket = bucket_info.bucket;
+
+  int r = store->svc.bi_rados->open_bucket_index_shard(dpp, bucket_info,
+                                                       obj.get_hash_object(),
+                                                       &bucket_obj,
+                                                       &shard_id);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: open_bucket_index_shard() returned ret=" << r << dendl;
+    return r;
+  }
+
+  ldpp_dout(dpp, 20) << " bucket index object: " << bucket_obj << dendl;
+  return 0;
+}
+
+// Initialize this shard for an explicit shard id within the given
+// index layout generation.
+int RGWRados::BucketShard::init(const DoutPrefixProvider *dpp,
+                                const RGWBucketInfo& bucket_info,
+                                const rgw::bucket_index_layout_generation& index,
+                                int sid)
+{
+  bucket = bucket_info.bucket;
+  shard_id = sid;
+
+  int r = store->svc.bi_rados->open_bucket_index_shard(dpp, bucket_info, index,
+                                                       shard_id, &bucket_obj);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: open_bucket_index_shard() returned ret=" << r << dendl;
+    return r;
+  }
+
+  ldpp_dout(dpp, 20) << " bucket index object: " << bucket_obj << dendl;
+  return 0;
+}
+
+
+/* Execute @handler on last item in bucket listing for bucket specified
+ * in @bucket_info. @obj_prefix and @obj_delim narrow down the listing
+ * to objects matching these criteria. */
+// Walk the (possibly paginated) ordered listing to its final entry and
+// invoke @handler on it; returns the handler's result, 0 for an empty
+// listing, or a listing error.
+int RGWRados::on_last_entry_in_listing(const DoutPrefixProvider *dpp,
+                                       RGWBucketInfo& bucket_info,
+                                       const std::string& obj_prefix,
+                                       const std::string& obj_delim,
+                                       std::function<int(const rgw_bucket_dir_entry&)> handler)
+{
+  RGWRados::Bucket target(this, bucket_info);
+  RGWRados::Bucket::List list_op(&target);
+
+  list_op.params.prefix = obj_prefix;
+  list_op.params.delim = obj_delim;
+
+  ldpp_dout(dpp, 20) << "iterating listing for bucket=" << bucket_info.bucket.name
+                 << ", obj_prefix=" << obj_prefix
+                 << ", obj_delim=" << obj_delim
+                 << dendl;
+
+  bool is_truncated = false;
+
+  boost::optional<rgw_bucket_dir_entry> last_entry;
+  /* We need to rewind to the last object in a listing. */
+  do {
+    /* List bucket entries in chunks. */
+    static constexpr int MAX_LIST_OBJS = 100;
+    std::vector<rgw_bucket_dir_entry> entries(MAX_LIST_OBJS);
+
+    int ret = list_op.list_objects(dpp, MAX_LIST_OBJS, &entries, nullptr,
+                                   &is_truncated, null_yield);
+    if (ret < 0) {
+      return ret;
+    } else if (!entries.empty()) {
+      // remember the last entry of this chunk; the final chunk's last
+      // entry is the listing's last entry
+      last_entry = entries.back();
+    }
+  } while (is_truncated);
+
+  if (last_entry) {
+    return handler(*last_entry);
+  }
+
+  /* Empty listing - no items we can run handler on. */
+  return 0;
+}
+
+bool RGWRados::swift_versioning_enabled(const RGWBucketInfo& bucket_info) const
+{
+ return bucket_info.has_swift_versioning() &&
+ bucket_info.swift_ver_location.size();
+}
+
/* Archive the current version of @obj into the bucket's Swift version
 * location (bucket_info.swift_ver_location) before it is overwritten.
 * No-op when Swift versioning is disabled for the bucket or the source
 * object does not exist. Returns 0 on success (including the lost-race
 * case), a negative error otherwise. */
int RGWRados::swift_versioning_copy(RGWObjectCtx& obj_ctx,
                                    const rgw_user& user,
                                    RGWBucketInfo& bucket_info,
                                    const rgw_obj& obj,
                                    const DoutPrefixProvider *dpp,
                                    optional_yield y)
{
  if (! swift_versioning_enabled(bucket_info)) {
    return 0;
  }

  obj_ctx.set_atomic(obj);

  RGWObjState * state = nullptr;
  RGWObjManifest *manifest = nullptr;
  int r = get_obj_state(dpp, &obj_ctx, bucket_info, obj, &state, &manifest, false, y);
  if (r < 0) {
    return r;
  }

  /* Nothing to archive if the source object doesn't exist. */
  if (!state->exists) {
    return 0;
  }

  /* Build the archive object name: "<3-hex-digit name length><name>/<secs>.<usecs>",
   * i.e. Swift's versioned-object naming scheme.
   * NOTE(review): buf is a variable-length array (compiler extension). */
  const string& src_name = obj.get_oid();
  char buf[src_name.size() + 32];
  struct timespec ts = ceph::real_clock::to_timespec(state->mtime);
  snprintf(buf, sizeof(buf), "%03x%s/%lld.%06ld", (int)src_name.size(),
           src_name.c_str(), (long long)ts.tv_sec, ts.tv_nsec / 1000);

  RGWBucketInfo dest_bucket_info;

  /* The archive bucket lives in the same tenant as the source bucket. */
  r = get_bucket_info(&svc, bucket_info.bucket.tenant, bucket_info.swift_ver_location, dest_bucket_info, NULL, null_yield, NULL);
  if (r < 0) {
    ldpp_dout(dpp, 10) << "failed to read dest bucket info: r=" << r << dendl;
    if (r == -ENOENT) {
      /* Missing archive bucket maps to a Swift precondition failure. */
      return -ERR_PRECONDITION_FAILED;
    }
    return r;
  }

  /* Refuse to archive into a bucket owned by someone else. */
  if (dest_bucket_info.owner != bucket_info.owner) {
    return -ERR_PRECONDITION_FAILED;
  }

  rgw_obj dest_obj(dest_bucket_info.bucket, buf);

  if (dest_bucket_info.versioning_enabled()){
    gen_rand_obj_instance_name(&dest_obj);
  }

  obj_ctx.set_atomic(dest_obj);

  rgw_zone_id no_zone;

  /* Server-side copy of the source object into the archive bucket;
   * copy_if_newer=true so a concurrent archiver can win harmlessly. */
  r = copy_obj(obj_ctx,
               user,
               NULL, /* req_info *info */
               no_zone,
               dest_obj,
               obj,
               dest_bucket_info,
               bucket_info,
               bucket_info.placement_rule,
               NULL, /* time_t *src_mtime */
               NULL, /* time_t *mtime */
               NULL, /* const time_t *mod_ptr */
               NULL, /* const time_t *unmod_ptr */
               false, /* bool high_precision_time */
               NULL, /* const char *if_match */
               NULL, /* const char *if_nomatch */
               RGWRados::ATTRSMOD_NONE,
               true, /* bool copy_if_newer */
               state->attrset,
               RGWObjCategory::Main,
               0, /* uint64_t olh_epoch */
               real_time(), /* time_t delete_at */
               NULL, /* string *version_id */
               NULL, /* string *ptag */
               NULL, /* string *petag */
               NULL, /* void (*progress_cb)(off_t, void *) */
               NULL, /* void *progress_data */
               dpp,
               null_yield);
  if (r == -ECANCELED || r == -ENOENT) {
    /* Has already been overwritten, meaning another rgw process already
     * copied it out */
    return 0;
  }

  return r;
}
+
/* Restore the most recent archived version of @obj from the bucket's
 * Swift version location back into place, then delete the archived copy.
 * @restored is set true only when a copy actually happened. No-op when
 * Swift versioning is disabled. */
int RGWRados::swift_versioning_restore(RGWObjectCtx& obj_ctx,
                                       const rgw_user& user,
                                       RGWBucketInfo& bucket_info,
                                       rgw_obj& obj,
                                       bool& restored,
                                       const DoutPrefixProvider *dpp)
{
  if (! swift_versioning_enabled(bucket_info)) {
    return 0;
  }

  /* Bucket info of the bucket that stores previous versions of our object. */
  RGWBucketInfo archive_binfo;

  int ret = get_bucket_info(&svc, bucket_info.bucket.tenant,
                            bucket_info.swift_ver_location,
                            archive_binfo, nullptr, null_yield, nullptr);
  if (ret < 0) {
    return ret;
  }

  /* Abort the operation if the bucket storing our archive belongs to someone
   * else. This is a limitation in comparison to Swift as we aren't taking ACLs
   * into consideration. For we can live with that.
   *
   * TODO: delegate this check to un upper layer and compare with ACLs. */
  if (bucket_info.owner != archive_binfo.owner) {
    return -EPERM;
  }

  /* This code will be executed on latest version of the object. */
  const auto handler = [&](const rgw_bucket_dir_entry& entry) -> int {
    rgw_zone_id no_zone;

    /* We don't support object versioning of Swift API on those buckets that
     * are already versioned using the S3 mechanism. This affects also bucket
     * storing archived objects. Otherwise the delete operation would create
     * a deletion marker. */
    if (archive_binfo.versioned()) {
      restored = false;
      return -ERR_PRECONDITION_FAILED;
    }

    /* We are requesting ATTRSMOD_NONE so the attr attribute is perfectly
     * irrelevant and may be safely skipped. */
    std::map<std::string, ceph::bufferlist> no_attrs;

    rgw_obj archive_obj(archive_binfo.bucket, entry.key);

    if (bucket_info.versioning_enabled()){
      gen_rand_obj_instance_name(&obj);
    }

    obj_ctx.set_atomic(archive_obj);
    obj_ctx.set_atomic(obj);

    /* Copy the archived version back over the destination object;
     * copy_if_newer=true so a concurrent restore can win harmlessly. */
    int ret = copy_obj(obj_ctx,
                       user,
                       nullptr, /* req_info *info */
                       no_zone,
                       obj, /* dest obj */
                       archive_obj, /* src obj */
                       bucket_info, /* dest bucket info */
                       archive_binfo, /* src bucket info */
                       bucket_info.placement_rule, /* placement_rule */
                       nullptr, /* time_t *src_mtime */
                       nullptr, /* time_t *mtime */
                       nullptr, /* const time_t *mod_ptr */
                       nullptr, /* const time_t *unmod_ptr */
                       false, /* bool high_precision_time */
                       nullptr, /* const char *if_match */
                       nullptr, /* const char *if_nomatch */
                       RGWRados::ATTRSMOD_NONE,
                       true, /* bool copy_if_newer */
                       no_attrs,
                       RGWObjCategory::Main,
                       0, /* uint64_t olh_epoch */
                       real_time(), /* time_t delete_at */
                       nullptr, /* string *version_id */
                       nullptr, /* string *ptag */
                       nullptr, /* string *petag */
                       nullptr, /* void (*progress_cb)(off_t, void *) */
                       nullptr, /* void *progress_data */
                       dpp,
                       null_yield);
    if (ret == -ECANCELED || ret == -ENOENT) {
      /* Has already been overwritten, meaning another rgw process already
       * copied it out */
      return 0;
    } else if (ret < 0) {
      return ret;
    } else {
      restored = true;
    }

    /* Need to remove the archived copy. */
    ret = delete_obj(dpp, obj_ctx, archive_binfo, archive_obj,
                     archive_binfo.versioning_status());

    return ret;
  };

  /* Archive entries for this object all share the prefix
   * "<3-hex-digit name length><name>" (see swift_versioning_copy);
   * the newest version sorts last within that prefix. */
  const std::string& obj_name = obj.get_oid();
  const auto prefix = boost::str(boost::format("%03x%s") % obj_name.size()
                                 % obj_name);

  return on_last_entry_in_listing(dpp, archive_binfo, prefix, std::string(),
                                  handler);
}
+
/* Core implementation of writing an object's head/metadata.
 *
 * Builds one librados ObjectWriteOperation carrying all attr changes
 * (and optionally the full data for small objects), brackets it with
 * bucket-index prepare()/complete() so the index stays consistent, and
 * translates the race errors (-ECANCELED/-ENOENT/-EEXIST) according to
 * the caller's if-match/if-nomatch preconditions.
 *
 * @param assume_noent  optimistically assume the object does not exist
 *                      yet; on -EEXIST the caller (write_meta) retries
 *                      with assume_noent=false.
 * @param modify_tail   whether tail objects are also being modified.
 * @param _index_op     actually a RGWRados::Bucket::UpdateIndex*,
 *                      passed as void* (see write_meta).
 */
int RGWRados::Object::Write::_do_write_meta(const DoutPrefixProvider *dpp,
                                            uint64_t size, uint64_t accounted_size,
                                            map<string, bufferlist>& attrs,
                                            bool assume_noent, bool modify_tail,
                                            void *_index_op, optional_yield y)
{
  RGWRados::Bucket::UpdateIndex *index_op = static_cast<RGWRados::Bucket::UpdateIndex *>(_index_op);
  RGWRados *store = target->get_store();

  ObjectWriteOperation op;
#ifdef WITH_LTTNG
  /* Tracepoints need a request id; synthesize one when there is no
   * req_state (e.g. internal callers). */
  const req_state* s = get_req_state();
  string req_id;
  if (!s) {
    // fake req_id
    req_id = store->svc.zone_utils->unique_id(store->driver->get_new_req_id());
  } else {
    req_id = s->req_id;
  }
#endif

  RGWObjState *state;
  RGWObjManifest *manifest = nullptr;
  int r = target->get_state(dpp, &state, &manifest, false, y, assume_noent);
  if (r < 0)
    return r;

  rgw_obj& obj = target->get_obj();

  if (obj.get_oid().empty()) {
    ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): cannot write object with empty name" << dendl;
    return -EIO;
  }

  rgw_rados_ref ref;
  r = store->get_obj_head_ref(dpp, target->get_meta_placement_rule(), obj, &ref);
  if (r < 0)
    return r;

  bool is_olh = state->is_olh;

  /* PUT_OBJ_CREATE means we are (re)writing the whole object. */
  bool reset_obj = (meta.flags & PUT_OBJ_CREATE) != 0;

  /* Prefer the caller's ptag; otherwise reuse the index op's optag so
   * the atomic-modification tag matches the index prepare entry. */
  const string *ptag = meta.ptag;
  if (!ptag && !index_op->get_optag()->empty()) {
    ptag = index_op->get_optag();
  }
  r = target->prepare_atomic_modification(dpp, op, reset_obj, ptag, meta.if_match, meta.if_nomatch, false, modify_tail, y);
  if (r < 0)
    return r;

  if (real_clock::is_zero(meta.set_mtime)) {
    meta.set_mtime = real_clock::now();
  }

  /* Apply the bucket's default object-lock retention to newly created
   * objects that don't carry an explicit retention attr. */
  if (target->get_bucket_info().obj_lock_enabled() && target->get_bucket_info().obj_lock.has_rule() && meta.flags == PUT_OBJ_CREATE) {
    auto iter = attrs.find(RGW_ATTR_OBJECT_RETENTION);
    if (iter == attrs.end()) {
      real_time lock_until_date = target->get_bucket_info().obj_lock.get_lock_until_date(meta.set_mtime);
      string mode = target->get_bucket_info().obj_lock.get_mode();
      RGWObjectRetention obj_retention(mode, lock_until_date);
      bufferlist bl;
      obj_retention.encode(bl);
      op.setxattr(RGW_ATTR_OBJECT_RETENTION, bl);
    }
  }

  if (state->is_olh) {
    op.setxattr(RGW_ATTR_OLH_ID_TAG, state->olh_tag);
  }

  struct timespec mtime_ts = real_clock::to_timespec(meta.set_mtime);
  op.mtime2(&mtime_ts);

  if (meta.data) {
    /* if we want to overwrite the data, we also want to overwrite the
       xattrs, so just remove the object */
    op.write_full(*meta.data);
    if (state->compressed) {
      /* already-compressed data won't compress again at the OSD level */
      uint32_t alloc_hint_flags = librados::ALLOC_HINT_FLAG_INCOMPRESSIBLE;
      op.set_alloc_hint2(0, 0, alloc_hint_flags);
    }
  }

  /* Values captured from attrs below; they are also needed for the
   * bucket-index complete() call. */
  string etag;
  string content_type;
  bufferlist acl_bl;
  string storage_class;

  map<string, bufferlist>::iterator iter;
  if (meta.rmattrs) {
    for (iter = meta.rmattrs->begin(); iter != meta.rmattrs->end(); ++iter) {
      const string& name = iter->first;
      op.rmxattr(name.c_str());
    }
  }

  if (meta.manifest) {
    storage_class = meta.manifest->get_tail_placement().placement_rule.storage_class;

    /* remove existing manifest attr */
    iter = attrs.find(RGW_ATTR_MANIFEST);
    if (iter != attrs.end())
      attrs.erase(iter);

    bufferlist bl;
    encode(*meta.manifest, bl);
    op.setxattr(RGW_ATTR_MANIFEST, bl);
  }

  /* Set every non-empty attr, picking out the ones the bucket index
   * also needs (etag, content type, ACL). */
  for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
    const string& name = iter->first;
    bufferlist& bl = iter->second;

    if (!bl.length())
      continue;

    op.setxattr(name.c_str(), bl);

    if (name.compare(RGW_ATTR_ETAG) == 0) {
      etag = rgw_bl_str(bl);
    } else if (name.compare(RGW_ATTR_CONTENT_TYPE) == 0) {
      content_type = rgw_bl_str(bl);
    } else if (name.compare(RGW_ATTR_ACL) == 0) {
      acl_bl = bl;
    }
  }
  if (attrs.find(RGW_ATTR_PG_VER) == attrs.end()) {
    cls_rgw_obj_store_pg_ver(op, RGW_ATTR_PG_VER);
  }

  /* Record the writing zone so multisite sync can weigh object copies. */
  if (attrs.find(RGW_ATTR_SOURCE_ZONE) == attrs.end()) {
    bufferlist bl;
    encode(store->svc.zone->get_zone_short_id(), bl);
    op.setxattr(RGW_ATTR_SOURCE_ZONE, bl);
  }

  if (!storage_class.empty()) {
    bufferlist bl;
    bl.append(storage_class);
    op.setxattr(RGW_ATTR_STORAGE_CLASS, bl);
  }

  /* Nothing queued in the operation - nothing to do. */
  if (!op.size())
    return 0;

  uint64_t epoch;
  int64_t poolid;
  bool orig_exists;
  uint64_t orig_size;

  if (!reset_obj) {  //Multipart upload, it has immutable head.
    orig_exists = false;
    orig_size = 0;
  } else {
    orig_exists = state->exists;
    orig_size = state->accounted_size;
  }

  bool versioned_target = (meta.olh_epoch && *meta.olh_epoch > 0) ||
                          !obj.key.instance.empty();

  bool versioned_op = (target->versioning_enabled() || is_olh || versioned_target);

  if (versioned_op) {
    index_op->set_bilog_flags(RGW_BILOG_FLAG_VERSIONED_OP);
  }

  /* Two-phase bucket index update: prepare before the head write... */
  if (!index_op->is_prepared()) {
    tracepoint(rgw_rados, prepare_enter, req_id.c_str());
    r = index_op->prepare(dpp, CLS_RGW_OP_ADD, &state->write_tag, y);
    tracepoint(rgw_rados, prepare_exit, req_id.c_str());
    if (r < 0)
      return r;
  }

  auto& ioctx = ref.pool.ioctx();

  tracepoint(rgw_rados, operate_enter, req_id.c_str());
  r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
  tracepoint(rgw_rados, operate_exit, req_id.c_str());
  if (r < 0) { /* we can expect to get -ECANCELED if object was replaced under,
                  or -ENOENT if was removed, or -EEXIST if it did not exist
                  before and now it does */
    if (r == -EEXIST && assume_noent) {
      /* Let write_meta() retry with assume_noent=false. */
      target->invalidate_state();
      return r;
    }
    goto done_cancel;
  }

  epoch = ioctx.get_last_version();
  poolid = ioctx.get_id();

  r = target->complete_atomic_modification(dpp);
  if (r < 0) {
    ldpp_dout(dpp, 0) << "ERROR: complete_atomic_modification returned r=" << r << dendl;
  }

  /* ...and complete after it succeeded. */
  tracepoint(rgw_rados, complete_enter, req_id.c_str());
  r = index_op->complete(dpp, poolid, epoch, size, accounted_size,
                         meta.set_mtime, etag, content_type,
                         storage_class, &acl_bl,
                         meta.category, meta.remove_objs, y,
                         meta.user_data, meta.appendable);
  tracepoint(rgw_rados, complete_exit, req_id.c_str());
  if (r < 0)
    goto done_cancel;

  if (meta.mtime) {
    *meta.mtime = meta.set_mtime;
  }

  /* note that index_op was using state so we couldn't invalidate it earlier */
  target->invalidate_state();
  state = NULL;

  if (versioned_op && meta.olh_epoch) {
    r = store->set_olh(dpp, target->get_ctx(), target->get_bucket_info(), obj, false, NULL, *meta.olh_epoch, real_time(), false, y, meta.zones_trace);
    if (r < 0) {
      return r;
    }
  }

  /* Register the object-expiration hint when a delete-at time was set. */
  if (!real_clock::is_zero(meta.delete_at)) {
    rgw_obj_index_key obj_key;
    obj.key.get_index_key(&obj_key);

    r = store->obj_expirer->hint_add(dpp, meta.delete_at, obj.bucket.tenant, obj.bucket.name,
                                     obj.bucket.bucket_id, obj_key);
    if (r < 0) {
      ldpp_dout(dpp, 0) << "ERROR: objexp_hint_add() returned r=" << r << ", object will not get removed" << dendl;
      /* ignoring error, nothing we can do at this point */
    }
  }
  meta.canceled = false;

  /* update quota cache */
  if (meta.completeMultipart){
    /* multipart complete: bytes were already accounted per part */
    store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
                                       0, orig_size);
  }
  else {
    store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
                                       accounted_size, orig_size);
  }
  return 0;

done_cancel:
  /* Roll back the prepared bucket-index entry. */
  int ret = index_op->cancel(dpp, meta.remove_objs, y);
  if (ret < 0) {
    ldpp_dout(dpp, 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl;
  }

  meta.canceled = true;

  /* we lost in a race. There are a few options:
   * - existing object was rewritten (ECANCELED)
   * - non existing object was created (EEXIST)
   * - object was removed (ENOENT)
   * should treat it as a success
   */
  if (meta.if_match == NULL && meta.if_nomatch == NULL) {
    if (r == -ECANCELED || r == -ENOENT || r == -EEXIST) {
      r = 0;
    }
  } else {
    if (meta.if_match != NULL) {
      // only overwrite existing object
      if (strcmp(meta.if_match, "*") == 0) {
        if (r == -ENOENT) {
          r = -ERR_PRECONDITION_FAILED;
        } else if (r == -ECANCELED) {
          r = 0;
        }
      }
    }

    if (meta.if_nomatch != NULL) {
      // only create a new object
      if (strcmp(meta.if_nomatch, "*") == 0) {
        if (r == -EEXIST) {
          r = -ERR_PRECONDITION_FAILED;
        } else if (r == -ENOENT) {
          r = 0;
        }
      }
    }
  }

  return r;
}
+
+int RGWRados::Object::Write::write_meta(const DoutPrefixProvider *dpp, uint64_t size, uint64_t accounted_size,
+ map<string, bufferlist>& attrs, optional_yield y)
+{
+ RGWBucketInfo& bucket_info = target->get_bucket_info();
+
+ RGWRados::Bucket bop(target->get_store(), bucket_info);
+ RGWRados::Bucket::UpdateIndex index_op(&bop, target->get_obj());
+ index_op.set_zones_trace(meta.zones_trace);
+
+ bool assume_noent = (meta.if_match == NULL && meta.if_nomatch == NULL);
+ int r;
+ if (assume_noent) {
+ r = _do_write_meta(dpp, size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op, y);
+ if (r == -EEXIST) {
+ assume_noent = false;
+ }
+ }
+ if (!assume_noent) {
+ r = _do_write_meta(dpp, size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op, y);
+ }
+ return r;
+}
+
/* Streaming sink for an object fetched over HTTP from a remote zone.
 *
 * The remote response starts with a JSON "extra data" preamble carrying
 * the source attrs (length announced via set_extra_data_len()), followed
 * by the object payload. Once the preamble is complete, process_attrs()
 * builds the data-processing pipeline: optional compression (with a
 * buffering stage), and an optional etag verifier, all feeding the
 * underlying ObjectProcessor. */
class RGWRadosPutObj : public RGWHTTPStreamRWRequest::ReceiveCB
{
  const DoutPrefixProvider *dpp;
  CephContext* cct;
  rgw_obj obj;
  rgw::sal::DataProcessor *filter;        // head of the current filter chain
  boost::optional<RGWPutObj_Compress>& compressor;
  bool try_etag_verify;                   // rgw_sync_obj_etag_verify, may be disabled on the fly
  rgw::putobj::etag_verifier_ptr etag_verifier;
  boost::optional<rgw::putobj::ChunkProcessor> buffering;
  CompressorRef& plugin;
  rgw::sal::ObjectProcessor *processor;
  void (*progress_cb)(off_t, void *);
  void *progress_data;
  bufferlist extra_data_bl, manifest_bl;
  std::optional<RGWCompressionInfo> compression_info;
  uint64_t extra_data_left{0};            // preamble bytes still expected
  bool need_to_process_attrs{true};
  uint64_t data_len{0};                   // payload bytes seen so far
  map<string, bufferlist> src_attrs;
  uint64_t ofs{0};                        // total bytes seen (preamble + payload)
  uint64_t lofs{0}; /* logical ofs */     // NOTE(review): unused; handle_data uses a local lofs
  std::function<int(map<string, bufferlist>&)> attrs_handler;

public:
  RGWRadosPutObj(const DoutPrefixProvider *dpp,
                 CephContext* cct,
                 CompressorRef& plugin,
                 boost::optional<RGWPutObj_Compress>& compressor,
                 rgw::sal::ObjectProcessor *p,
                 void (*_progress_cb)(off_t, void *),
                 void *_progress_data,
                 std::function<int(map<string, bufferlist>&)> _attrs_handler) :
    dpp(dpp),
    cct(cct),
    filter(p),
    compressor(compressor),
    try_etag_verify(cct->_conf->rgw_sync_obj_etag_verify),
    plugin(plugin),
    processor(p),
    progress_cb(_progress_cb),
    progress_data(_progress_data),
    attrs_handler(_attrs_handler) {}


  /* Parse the JSON preamble into src_attrs, let attrs_handler inspect
   * it, and assemble the filter chain. Called once, either when the
   * preamble is fully received or on first payload data. */
  int process_attrs(void) {
    bool encrypted = false;
    if (extra_data_bl.length()) {
      JSONParser jp;
      if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
        ldpp_dout(dpp, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
        return -EIO;
      }

      JSONDecoder::decode_json("attrs", src_attrs, &jp);

      encrypted = src_attrs.count(RGW_ATTR_CRYPT_MODE);
      if (encrypted) {
        // we won't have access to the decrypted data for checksumming
        try_etag_verify = false;
      }

      // if the object is both compressed and encrypted, it was transferred
      // in its encrypted+compressed form. we need to preserve the original
      // RGW_ATTR_COMPRESSION instead of falling back to default compression
      // settings
      auto iter = src_attrs.find(RGW_ATTR_COMPRESSION);
      if (iter != src_attrs.end() && !encrypted) {
        const bufferlist bl = std::move(iter->second);
        src_attrs.erase(iter); // don't preserve source compression info

        if (try_etag_verify) {
          // if we're trying to verify etags, we need to convert compressed
          // ranges in the manifest back into logical multipart part offsets
          RGWCompressionInfo info;
          bool compressed = false;
          int r = rgw_compression_info_from_attr(bl, compressed, info);
          if (r < 0) {
            ldpp_dout(dpp, 4) << "failed to decode compression info, "
                "disabling etag verification" << dendl;
            try_etag_verify = false;
          } else if (compressed) {
            compression_info = std::move(info);
          }
        }
      }

      /* We need the manifest to recompute the ETag for verification */
      iter = src_attrs.find(RGW_ATTR_MANIFEST);
      if (iter != src_attrs.end()) {
        manifest_bl = std::move(iter->second);
        src_attrs.erase(iter);

        // if the source object was encrypted, preserve the part lengths from
        // the original object's manifest in RGW_ATTR_CRYPT_PARTS. if the object
        // already replicated and has the RGW_ATTR_CRYPT_PARTS attr, preserve it
        if (src_attrs.count(RGW_ATTR_CRYPT_MODE) &&
            !src_attrs.count(RGW_ATTR_CRYPT_PARTS)) {
          std::vector<size_t> parts_len;
          int r = RGWGetObj_BlockDecrypt::read_manifest_parts(dpp, manifest_bl,
                                                              parts_len);
          if (r < 0) {
            ldpp_dout(dpp, 4) << "failed to read part lengths from the manifest" << dendl;
          } else {
            // store the encoded part lenghts in RGW_ATTR_CRYPT_PARTS
            bufferlist parts_bl;
            encode(parts_len, parts_bl);
            src_attrs[RGW_ATTR_CRYPT_PARTS] = std::move(parts_bl);
          }
        }
      }

      // filter out olh attributes
      iter = src_attrs.lower_bound(RGW_ATTR_OLH_PREFIX);
      while (iter != src_attrs.end()) {
        if (!boost::algorithm::starts_with(iter->first, RGW_ATTR_OLH_PREFIX)) {
          break;
        }
        iter = src_attrs.erase(iter);
      }
    }

    int ret = attrs_handler(src_attrs);
    if (ret < 0) {
      return ret;
    }

    // do not compress if object is encrypted
    if (plugin && !encrypted) {
      compressor = boost::in_place(cct, plugin, filter);
      // add a filter that buffers data so we don't try to compress tiny blocks.
      // libcurl reads in 16k at a time, and we need at least 64k to get a good
      // compression ratio
      constexpr unsigned buffer_size = 512 * 1024;
      buffering = boost::in_place(&*compressor, buffer_size);
      filter = &*buffering;
    }

    if (try_etag_verify) {
      ret = rgw::putobj::create_etag_verifier(dpp, cct, filter, manifest_bl,
                                              compression_info,
                                              etag_verifier);
      if (ret < 0) {
        ldpp_dout(dpp, 4) << "failed to initial etag verifier, "
            "disabling etag verification" << dendl;
      } else {
        filter = etag_verifier.get();
      }
    }

    need_to_process_attrs = false;

    return 0;
  }

  /* Receive one chunk: peel off any remaining preamble bytes, make
   * sure attrs have been processed, then push the payload down the
   * filter chain at its logical offset. */
  int handle_data(bufferlist& bl, bool *pause) override {
    if (progress_cb) {
      progress_cb(data_len, progress_data);
    }
    if (extra_data_left) {
      uint64_t extra_len = bl.length();
      if (extra_len > extra_data_left)
        extra_len = extra_data_left;

      bufferlist extra;
      bl.splice(0, extra_len, &extra);
      extra_data_bl.append(extra);

      extra_data_left -= extra_len;
      if (extra_data_left == 0) {
        int res = process_attrs();
        if (res < 0)
          return res;
      }
      ofs += extra_len;
      if (bl.length() == 0) {
        return 0;
      }
    }
    if (need_to_process_attrs) {
      /* need to call process_attrs() even if we don't get any attrs,
       * need it to call attrs_handler().
       */
      int res = process_attrs();
      if (res < 0) {
        return res;
      }
    }

    ceph_assert(uint64_t(ofs) >= extra_data_len);

    uint64_t size = bl.length();
    ofs += size;

    const uint64_t lofs = data_len;
    data_len += size;

    return filter->process(std::move(bl), lofs);
  }

  // Flush the filter chain (empty buffer signals end of data).
  int flush() {
    return filter->process({}, data_len);
  }

  bufferlist& get_extra_data() { return extra_data_bl; }

  map<string, bufferlist>& get_attrs() { return src_attrs; }

  void set_extra_data_len(uint64_t len) override {
    extra_data_left = len;
    RGWHTTPStreamRWRequest::ReceiveCB::set_extra_data_len(len);
  }

  uint64_t get_data_len() {
    return data_len;
  }

  // Returns the recomputed ETag, or "" when verification was disabled.
  std::string get_verifier_etag() {
    if (etag_verifier) {
      etag_verifier->calculate_etag();
      return etag_verifier->get_calculated_etag();
    } else {
      return "";
    }
  }
};
+
+/*
+ * prepare attrset depending on attrs_mod.
+ */
+static void set_copy_attrs(map<string, bufferlist>& src_attrs,
+ map<string, bufferlist>& attrs,
+ RGWRados::AttrsMod attrs_mod)
+{
+ switch (attrs_mod) {
+ case RGWRados::ATTRSMOD_NONE:
+ attrs = src_attrs;
+ break;
+ case RGWRados::ATTRSMOD_REPLACE:
+ if (!attrs[RGW_ATTR_ETAG].length()) {
+ attrs[RGW_ATTR_ETAG] = src_attrs[RGW_ATTR_ETAG];
+ }
+ if (!attrs[RGW_ATTR_TAIL_TAG].length()) {
+ auto ttiter = src_attrs.find(RGW_ATTR_TAIL_TAG);
+ if (ttiter != src_attrs.end()) {
+ attrs[RGW_ATTR_TAIL_TAG] = src_attrs[RGW_ATTR_TAIL_TAG];
+ }
+ }
+ break;
+ case RGWRados::ATTRSMOD_MERGE:
+ for (map<string, bufferlist>::iterator it = src_attrs.begin(); it != src_attrs.end(); ++it) {
+ if (attrs.find(it->first) == attrs.end()) {
+ attrs[it->first] = it->second;
+ }
+ }
+ break;
+ }
+}
+
+int RGWRados::rewrite_obj(RGWBucketInfo& dest_bucket_info, const rgw_obj& obj, const DoutPrefixProvider *dpp, optional_yield y)
+{
+ RGWObjectCtx rctx(this->driver);
+ rgw::sal::Attrs attrset;
+ uint64_t obj_size;
+ ceph::real_time mtime;
+ RGWRados::Object op_target(this, dest_bucket_info, rctx, obj);
+ RGWRados::Object::Read read_op(&op_target);
+
+ read_op.params.attrs = &attrset;
+ read_op.params.obj_size = &obj_size;
+ read_op.params.lastmod = &mtime;
+
+ int ret = read_op.prepare(y, dpp);
+ if (ret < 0)
+ return ret;
+
+ attrset.erase(RGW_ATTR_ID_TAG);
+ attrset.erase(RGW_ATTR_TAIL_TAG);
+ attrset.erase(RGW_ATTR_STORAGE_CLASS);
+
+ return copy_obj_data(rctx, dest_bucket_info, dest_bucket_info.placement_rule,
+ read_op, obj_size - 1, obj, NULL, mtime,
+ attrset, 0, real_time(), NULL, dpp, y);
+}
+
+int RGWRados::reindex_obj(const RGWBucketInfo& bucket_info,
+ const rgw_obj& obj,
+ const DoutPrefixProvider* dpp,
+ optional_yield y)
+{
+ if (bucket_info.versioned()) {
+ ldpp_dout(dpp, 10) << "WARNING: " << __func__ <<
+ ": cannot process versioned bucket \"" <<
+ bucket_info.bucket.get_key() << "\"" <<
+ dendl;
+ return -ENOTSUP;
+ }
+
+ Bucket target(this, bucket_info);
+ RGWRados::Bucket::UpdateIndex update_idx(&target, obj);
+ const std::string* no_write_tag = nullptr;
+
+ int ret = update_idx.prepare(dpp, RGWModifyOp::CLS_RGW_OP_ADD, no_write_tag, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+ ": update index prepare for \"" << obj << "\" returned: " <<
+ cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ return 0;
+}
+
+struct obj_time_weight {
+ real_time mtime;
+ uint32_t zone_short_id;
+ uint64_t pg_ver;
+ bool high_precision;
+
+ obj_time_weight() : zone_short_id(0), pg_ver(0), high_precision(false) {}
+
+ bool compare_low_precision(const obj_time_weight& rhs) {
+ struct timespec l = ceph::real_clock::to_timespec(mtime);
+ struct timespec r = ceph::real_clock::to_timespec(rhs.mtime);
+ l.tv_nsec = 0;
+ r.tv_nsec = 0;
+ if (l > r) {
+ return false;
+ }
+ if (l < r) {
+ return true;
+ }
+ if (!zone_short_id || !rhs.zone_short_id) {
+ /* don't compare zone ids, if one wasn't provided */
+ return false;
+ }
+ if (zone_short_id != rhs.zone_short_id) {
+ return (zone_short_id < rhs.zone_short_id);
+ }
+ return (pg_ver < rhs.pg_ver);
+
+ }
+
+ bool operator<(const obj_time_weight& rhs) {
+ if (!high_precision || !rhs.high_precision) {
+ return compare_low_precision(rhs);
+ }
+ if (mtime > rhs.mtime) {
+ return false;
+ }
+ if (mtime < rhs.mtime) {
+ return true;
+ }
+ if (!zone_short_id || !rhs.zone_short_id) {
+ /* don't compare zone ids, if one wasn't provided */
+ return false;
+ }
+ if (zone_short_id != rhs.zone_short_id) {
+ return (zone_short_id < rhs.zone_short_id);
+ }
+ return (pg_ver < rhs.pg_ver);
+ }
+
+ void init(const real_time& _mtime, uint32_t _short_id, uint64_t _pg_ver) {
+ mtime = _mtime;
+ zone_short_id = _short_id;
+ pg_ver = _pg_ver;
+ }
+
+ void init(RGWObjState *state) {
+ mtime = state->mtime;
+ zone_short_id = state->zone_short_id;
+ pg_ver = state->pg_ver;
+ }
+};
+
+inline ostream& operator<<(ostream& out, const obj_time_weight &o) {
+ out << o.mtime;
+
+ if (o.zone_short_id != 0 || o.pg_ver != 0) {
+ out << "[zid=" << o.zone_short_id << ", pgv=" << o.pg_ver << "]";
+ }
+
+ return out;
+}
+
+class RGWGetExtraDataCB : public RGWHTTPStreamRWRequest::ReceiveCB {
+ bufferlist extra_data;
+public:
+ RGWGetExtraDataCB() {}
+ int handle_data(bufferlist& bl, bool *pause) override {
+ int bl_len = (int)bl.length();
+ if (extra_data.length() < extra_data_len) {
+ off_t max = extra_data_len - extra_data.length();
+ if (max > bl_len) {
+ max = bl_len;
+ }
+ bl.splice(0, max, &extra_data);
+ }
+ return bl_len;
+ }
+
+ bufferlist& get_extra_data() {
+ return extra_data;
+ }
+};
+
/* Stat an object that lives in another zone/zonegroup by issuing a
 * "stat" GET through the appropriate REST connection. On success fills
 * the requested out-params (mtime, size, attrs, headers, etag). The
 * source manifest attr is intentionally dropped from the returned attrs.
 *
 * NOTE(review): if_match/if_nomatch/version_id/ptag are currently not
 * forwarded to the remote request by this implementation. */
int RGWRados::stat_remote_obj(const DoutPrefixProvider *dpp,
                              RGWObjectCtx& obj_ctx,
                              const rgw_user& user_id,
                              req_info *info,
                              const rgw_zone_id& source_zone,
                              const rgw_obj& src_obj,
                              const RGWBucketInfo *src_bucket_info,
                              real_time *src_mtime,
                              uint64_t *psize,
                              const real_time *mod_ptr,
                              const real_time *unmod_ptr,
                              bool high_precision_time,
                              const char *if_match,
                              const char *if_nomatch,
                              map<string, bufferlist> *pattrs,
                              map<string, string> *pheaders,
                              string *version_id,
                              string *ptag,
                              string *petag)
{
  /* source is in a different zonegroup, copy from there */

  RGWRESTStreamRWRequest *in_stream_req;
  string tag;
  map<string, bufferlist> src_attrs;
  append_rand_alpha(cct, tag, tag, 32);
  obj_time_weight set_mtime_weight;
  set_mtime_weight.high_precision = high_precision_time;

  /* Pick the REST connection: explicit source zone, the source
   * bucket's zonegroup, or the master zonegroup as a fallback. */
  RGWRESTConn *conn;
  if (source_zone.empty()) {
    if (!src_bucket_info || src_bucket_info->zonegroup.empty()) {
      /* source is in the master zonegroup */
      conn = svc.zone->get_master_conn();
    } else {
      auto& zonegroup_conn_map = svc.zone->get_zonegroup_conn_map();
      map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info->zonegroup);
      if (iter == zonegroup_conn_map.end()) {
        ldpp_dout(dpp, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
        return -ENOENT;
      }
      conn = iter->second;
    }
  } else {
    auto& zone_conn_map = svc.zone->get_zone_conn_map();
    auto iter = zone_conn_map.find(source_zone);
    if (iter == zone_conn_map.end()) {
      ldpp_dout(dpp, 0) << "could not find zone connection to zone: " << source_zone << dendl;
      return -ENOENT;
    }
    conn = iter->second;
  }

  RGWGetExtraDataCB cb;
  map<string, string> req_headers;
  real_time set_mtime;

  const real_time *pmod = mod_ptr;

  obj_time_weight dest_mtime_weight;

  /* rgwx_stat=true: we only want metadata, not the object payload. */
  constexpr bool prepend_meta = true;
  constexpr bool get_op = true;
  constexpr bool rgwx_stat = true;
  constexpr bool sync_manifest = true;
  constexpr bool skip_decrypt = true;
  constexpr bool sync_cloudtiered = true;
  int ret = conn->get_obj(dpp, user_id, info, src_obj, pmod, unmod_ptr,
                          dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
                          prepend_meta, get_op, rgwx_stat,
                          sync_manifest, skip_decrypt, nullptr, sync_cloudtiered,
                          true, &cb, &in_stream_req);
  if (ret < 0) {
    return ret;
  }

  ret = conn->complete_request(in_stream_req, nullptr, &set_mtime, psize,
                               nullptr, pheaders, null_yield);
  if (ret < 0) {
    return ret;
  }

  /* The response preamble is JSON carrying the source attrs. */
  bufferlist& extra_data_bl = cb.get_extra_data();
  if (extra_data_bl.length()) {
    JSONParser jp;
    if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
      ldpp_dout(dpp, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
      return -EIO;
    }

    JSONDecoder::decode_json("attrs", src_attrs, &jp);

    src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
  }

  if (src_mtime) {
    *src_mtime = set_mtime;
  }

  if (petag) {
    map<string, bufferlist>::iterator iter = src_attrs.find(RGW_ATTR_ETAG);
    if (iter != src_attrs.end()) {
      bufferlist& etagbl = iter->second;
      *petag = etagbl.to_str();
      /* strip any trailing NUL bytes the stored etag may carry */
      while (petag->size() > 0 && (*petag)[petag->size() - 1] == '\0') {
        *petag = petag->substr(0, petag->size() - 1);
      }
    }
  }

  if (pattrs) {
    *pattrs = std::move(src_attrs);
  }

  return 0;
}
+
+int RGWFetchObjFilter_Default::filter(CephContext *cct,
+ const rgw_obj_key& source_key,
+ const RGWBucketInfo& dest_bucket_info,
+ std::optional<rgw_placement_rule> dest_placement_rule,
+ const map<string, bufferlist>& obj_attrs,
+ std::optional<rgw_user> *poverride_owner,
+ const rgw_placement_rule **prule)
+{
+ const rgw_placement_rule *ptail_rule = (dest_placement_rule ? &(*dest_placement_rule) : nullptr);
+ if (!ptail_rule) {
+ auto iter = obj_attrs.find(RGW_ATTR_STORAGE_CLASS);
+ if (iter != obj_attrs.end()) {
+ dest_rule.storage_class = iter->second.to_str();
+ dest_rule.inherit_from(dest_bucket_info.placement_rule);
+ ptail_rule = &dest_rule;
+ } else {
+ ptail_rule = &dest_bucket_info.placement_rule;
+ }
+ }
+ *prule = ptail_rule;
+ return 0;
+}
+
+// Fetch an object from a remote zone/zonegroup over RGW REST and write it
+// locally through an AtomicObjectProcessor. Used by multisite sync and by
+// copy_obj() when the source is remote. On success returns 0 and, when
+// requested, reports the byte count via *bytes_transferred; on failure
+// returns a negative error code. When copy_if_newer is set, the final
+// write is retried (up to MAX_COMPLETE_RETRY times) if it races with a
+// newer local write of the same object.
+int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
+ const rgw_user& user_id,
+ req_info *info,
+ const rgw_zone_id& source_zone,
+ const rgw_obj& dest_obj,
+ const rgw_obj& src_obj,
+ RGWBucketInfo& dest_bucket_info,
+ RGWBucketInfo *src_bucket_info,
+ std::optional<rgw_placement_rule> dest_placement_rule,
+ real_time *src_mtime,
+ real_time *mtime,
+ const real_time *mod_ptr,
+ const real_time *unmod_ptr,
+ bool high_precision_time,
+ const char *if_match,
+ const char *if_nomatch,
+ AttrsMod attrs_mod,
+ bool copy_if_newer,
+ rgw::sal::Attrs& attrs,
+ RGWObjCategory category,
+ std::optional<uint64_t> olh_epoch,
+ real_time delete_at,
+ string *ptag,
+ string *petag,
+ void (*progress_cb)(off_t, void *),
+ void *progress_data,
+ const DoutPrefixProvider *dpp,
+ RGWFetchObjFilter *filter,
+ const rgw_zone_set_entry& source_trace_entry,
+ rgw_zone_set *zones_trace,
+ std::optional<uint64_t>* bytes_transferred)
+{
+ /* source is in a different zonegroup, copy from there */
+
+ RGWRESTStreamRWRequest *in_stream_req;
+ string tag;
+ int i;
+ // random write tag identifying this atomic write attempt
+ append_rand_alpha(cct, tag, tag, 32);
+ obj_time_weight set_mtime_weight;
+ set_mtime_weight.high_precision = high_precision_time;
+ int ret;
+
+ rgw::BlockingAioThrottle aio(cct->_conf->rgw_put_obj_min_window_size);
+ using namespace rgw::putobj;
+ AtomicObjectProcessor processor(&aio, this, dest_bucket_info, nullptr,
+ user_id, obj_ctx, dest_obj, olh_epoch,
+ tag, dpp, null_yield);
+ // pick the REST connection: an explicit source zone wins; otherwise route
+ // by the source bucket's zonegroup (empty -> master zonegroup)
+ RGWRESTConn *conn;
+ auto& zone_conn_map = svc.zone->get_zone_conn_map();
+ auto& zonegroup_conn_map = svc.zone->get_zonegroup_conn_map();
+ if (source_zone.empty()) {
+ if (!src_bucket_info || src_bucket_info->zonegroup.empty()) {
+ /* source is in the master zonegroup */
+ conn = svc.zone->get_master_conn();
+ } else {
+ map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info->zonegroup);
+ if (iter == zonegroup_conn_map.end()) {
+ ldpp_dout(dpp, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
+ return -ENOENT;
+ }
+ conn = iter->second;
+ }
+ } else {
+ auto iter = zone_conn_map.find(source_zone);
+ if (iter == zone_conn_map.end()) {
+ ldpp_dout(dpp, 0) << "could not find zone connection to zone: " << source_zone << dendl;
+ return -ENOENT;
+ }
+ conn = iter->second;
+ }
+
+ boost::optional<RGWPutObj_Compress> compressor;
+ CompressorRef plugin;
+
+ RGWFetchObjFilter_Default source_filter;
+ if (!filter) {
+ filter = &source_filter;
+ }
+
+ std::optional<rgw_user> override_owner;
+
+ // attrs callback: invoked once the remote object's attributes are known,
+ // before any data is written. It resolves the tail placement (possibly
+ // overriding the owner), sets up compression for that placement, and
+ // prepares the processor.
+ RGWRadosPutObj cb(dpp, cct, plugin, compressor, &processor, progress_cb, progress_data,
+ [&](map<string, bufferlist>& obj_attrs) {
+ const rgw_placement_rule *ptail_rule;
+
+ int ret = filter->filter(cct,
+ src_obj.key,
+ dest_bucket_info,
+ dest_placement_rule,
+ obj_attrs,
+ &override_owner,
+ &ptail_rule);
+ if (ret < 0) {
+ ldpp_dout(dpp, 5) << "Aborting fetch: source object filter returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ processor.set_tail_placement(*ptail_rule);
+
+ const auto& compression_type = svc.zone->get_zone_params().get_compression_type(*ptail_rule);
+ if (compression_type != "none") {
+ plugin = Compressor::create(cct, compression_type);
+ if (!plugin) {
+ // missing plugin is non-fatal: the object is stored uncompressed
+ ldpp_dout(dpp, 1) << "Cannot load plugin for compression type "
+ << compression_type << dendl;
+ }
+ }
+
+ ret = processor.prepare(null_yield);
+ if (ret < 0) {
+ return ret;
+ }
+ return 0;
+ });
+
+ string etag;
+ real_time set_mtime;
+ uint64_t accounted_size = 0;
+
+ RGWObjState *dest_state = NULL;
+ RGWObjManifest *manifest = nullptr;
+
+ const real_time *pmod = mod_ptr;
+
+ obj_time_weight dest_mtime_weight;
+ rgw_zone_set_entry dst_zone_trace(svc.zone->get_zone().id, dest_bucket_info.bucket.get_key());
+
+ if (copy_if_newer) {
+ /* need to get mtime for destination */
+ ret = get_obj_state(dpp, &obj_ctx, dest_bucket_info, dest_obj, &dest_state, &manifest, false, null_yield);
+ if (ret < 0)
+ goto set_err_state;
+
+ if (!real_clock::is_zero(dest_state->mtime)) {
+ // only fetch the remote copy if it is newer than what we already have
+ dest_mtime_weight.init(dest_state);
+ pmod = &dest_mtime_weight.mtime;
+ }
+ }
+
+ static constexpr bool prepend_meta = true;
+ static constexpr bool get_op = true;
+ static constexpr bool rgwx_stat = false;
+ static constexpr bool sync_manifest = true;
+ static constexpr bool skip_decrypt = true;
+ static constexpr bool sync_cloudtiered = true;
+ ret = conn->get_obj(dpp, user_id, info, src_obj, pmod, unmod_ptr,
+ dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
+ prepend_meta, get_op, rgwx_stat,
+ sync_manifest, skip_decrypt, &dst_zone_trace,
+ sync_cloudtiered, true,
+ &cb, &in_stream_req);
+ if (ret < 0) {
+ goto set_err_state;
+ }
+
+ ret = conn->complete_request(in_stream_req, &etag, &set_mtime,
+ &accounted_size, nullptr, nullptr, null_yield);
+ if (ret < 0) {
+ goto set_err_state;
+ }
+ ret = cb.flush();
+ if (ret < 0) {
+ goto set_err_state;
+ }
+ // sanity check: the bytes we wrote must match what the sender accounted
+ if (cb.get_data_len() != accounted_size) {
+ ret = -EIO;
+ ldpp_dout(dpp, 0) << "ERROR: object truncated during fetching, expected "
+ << accounted_size << " bytes but received " << cb.get_data_len() << dendl;
+ goto set_err_state;
+ }
+
+ // if we compressed on ingest, record the compression metadata; otherwise,
+ // if the object arrived already compressed, keep accounting in terms of
+ // its original (uncompressed) size
+ if (compressor && compressor->is_compressed()) {
+ bufferlist tmp;
+ RGWCompressionInfo cs_info;
+ cs_info.compression_type = plugin->get_type_name();
+ cs_info.orig_size = accounted_size;
+ cs_info.compressor_message = compressor->get_compressor_message();
+ cs_info.blocks = move(compressor->get_compression_blocks());
+ encode(cs_info, tmp);
+ cb.get_attrs()[RGW_ATTR_COMPRESSION] = tmp;
+ } else if (auto c = cb.get_attrs().find(RGW_ATTR_COMPRESSION);
+ c != cb.get_attrs().end()) {
+ // if the object was transferred in its compressed+encrypted form, use its
+ // original uncompressed size
+ try {
+ RGWCompressionInfo info;
+ auto p = c->second.cbegin();
+ decode(info, p);
+ accounted_size = info.orig_size;
+ } catch (const buffer::error&) {
+ ldpp_dout(dpp, 0) << "ERROR: could not decode compression attr for "
+ "replicated object " << dest_obj << dendl;
+ // decode error isn't fatal, but we might put the wrong size in the index
+ }
+ }
+
+ // the filter may have redirected ownership; rewrite both the processor's
+ // owner and the stored ACL's owner accordingly
+ if (override_owner) {
+ processor.set_owner(*override_owner);
+
+ auto& obj_attrs = cb.get_attrs();
+
+ RGWUserInfo owner_info;
+ if (ctl.user->get_info_by_uid(dpp, *override_owner, &owner_info, null_yield) < 0) {
+ ldpp_dout(dpp, 10) << "owner info does not exist" << dendl;
+ return -EINVAL;
+ }
+
+ RGWAccessControlPolicy acl;
+
+ auto aiter = obj_attrs.find(RGW_ATTR_ACL);
+ if (aiter == obj_attrs.end()) {
+ ldpp_dout(dpp, 0) << "WARNING: " << __func__ << "(): object doesn't have ACL attribute, setting default ACLs" << dendl;
+ acl.create_default(owner_info.user_id, owner_info.display_name);
+ } else {
+ auto iter = aiter->second.cbegin();
+ try {
+ acl.decode(iter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): could not decode policy, caught buffer::error" << dendl;
+ return -EIO;
+ }
+ }
+
+ ACLOwner new_owner;
+ new_owner.set_id(*override_owner);
+ new_owner.set_name(owner_info.display_name);
+
+ acl.set_owner(new_owner);
+
+ bufferlist bl;
+ acl.encode(bl);
+ obj_attrs[RGW_ATTR_ACL] = std::move(bl);
+ }
+
+ if (source_zone.empty()) { /* need to preserve expiration if copy in the same zonegroup */
+ cb.get_attrs().erase(RGW_ATTR_DELETE_AT);
+ } else {
+ map<string, bufferlist>::iterator iter = cb.get_attrs().find(RGW_ATTR_DELETE_AT);
+ if (iter != cb.get_attrs().end()) {
+ try {
+ decode(delete_at, iter->second);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to decode delete_at field in intra zone copy" << dendl;
+ }
+ }
+ }
+
+ if (src_mtime) {
+ *src_mtime = set_mtime;
+ }
+
+ if (petag) {
+ const auto iter = cb.get_attrs().find(RGW_ATTR_ETAG);
+ if (iter != cb.get_attrs().end()) {
+ *petag = iter->second.to_str();
+ }
+ }
+
+ //erase the append attr
+ cb.get_attrs().erase(RGW_ATTR_APPEND_PART_NUM);
+
+ { // add x-amz-replication-status=REPLICA
+ auto& bl = cb.get_attrs()[RGW_ATTR_OBJ_REPLICATION_STATUS];
+ bl.clear(); // overwrite source's status
+ bl.append("REPLICA");
+ }
+ { // update replication trace
+ std::vector<rgw_zone_set_entry> trace;
+ if (auto i = cb.get_attrs().find(RGW_ATTR_OBJ_REPLICATION_TRACE);
+ i != cb.get_attrs().end()) {
+ try {
+ decode(trace, i->second);
+ } catch (const buffer::error&) {}
+ }
+ // add the source entry to the end
+ trace.push_back(source_trace_entry);
+
+ bufferlist bl;
+ encode(trace, bl);
+ cb.get_attrs()[RGW_ATTR_OBJ_REPLICATION_TRACE] = std::move(bl);
+ }
+
+ if (source_zone.empty()) {
+ set_copy_attrs(cb.get_attrs(), attrs, attrs_mod);
+ } else {
+ attrs = cb.get_attrs();
+ }
+
+ if (copy_if_newer) {
+ uint64_t pg_ver = 0;
+ auto i = attrs.find(RGW_ATTR_PG_VER);
+ if (i != attrs.end() && i->second.length() > 0) {
+ auto iter = i->second.cbegin();
+ try {
+ decode(pg_ver, iter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to decode pg ver attribute, ignoring" << dendl;
+ /* non critical error */
+ }
+ }
+ set_mtime_weight.init(set_mtime, svc.zone->get_zone_short_id(), pg_ver);
+ }
+
+ /* Perform ETag verification if we have computed the object's MD5 sum at our end */
+ if (const auto& verifier_etag = cb.get_verifier_etag();
+ !verifier_etag.empty()) {
+ string trimmed_etag = etag;
+
+ /* Remove the leading and trailing double quotes from etag */
+ trimmed_etag.erase(std::remove(trimmed_etag.begin(), trimmed_etag.end(),'\"'),
+ trimmed_etag.end());
+
+ if (verifier_etag != trimmed_etag) {
+ ret = -EIO;
+ ldpp_dout(dpp, 0) << "ERROR: source and destination objects don't match. Expected etag:"
+ << trimmed_etag << " Computed etag:" << verifier_etag << dendl;
+ goto set_err_state;
+ }
+ }
+
+ // complete the write; with copy_if_newer, re-check and retry if another
+ // writer raced us with an older (lower-weight) mtime
+#define MAX_COMPLETE_RETRY 100
+ for (i = 0; i < MAX_COMPLETE_RETRY; i++) {
+ bool canceled = false;
+ ret = processor.complete(accounted_size, etag, mtime, set_mtime,
+ attrs, delete_at, nullptr, nullptr, nullptr,
+ zones_trace, &canceled, null_yield);
+ if (ret < 0) {
+ goto set_err_state;
+ }
+
+ if (copy_if_newer && canceled) {
+ ldpp_dout(dpp, 20) << "raced with another write of obj: " << dest_obj << dendl;
+ obj_ctx.invalidate(dest_obj); /* object was overwritten */
+ ret = get_obj_state(dpp, &obj_ctx, dest_bucket_info, dest_obj, &dest_state, &manifest, false, null_yield);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << ": get_err_state() returned ret=" << ret << dendl;
+ goto set_err_state;
+ }
+ dest_mtime_weight.init(dest_state);
+ dest_mtime_weight.high_precision = high_precision_time;
+ if (!dest_state->exists ||
+ dest_mtime_weight < set_mtime_weight) {
+ ldpp_dout(dpp, 20) << "retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
+ continue;
+ } else {
+ ldpp_dout(dpp, 20) << "not retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
+ }
+ }
+ break;
+ }
+
+ if (i == MAX_COMPLETE_RETRY) {
+ ldpp_dout(dpp, 0) << "ERROR: retried object completion too many times, something is wrong!" << dendl;
+ ret = -EIO;
+ goto set_err_state;
+ }
+
+ if (bytes_transferred) {
+ *bytes_transferred = cb.get_data_len();
+ }
+ return 0;
+set_err_state:
+ if (copy_if_newer && ret == -ERR_NOT_MODIFIED) {
+ // we may have already fetched during sync of OP_ADD, but were waiting
+ // for OP_LINK_OLH to call set_olh() with a real olh_epoch
+ if (olh_epoch && *olh_epoch > 0) {
+ constexpr bool log_data_change = true;
+ ret = set_olh(dpp, obj_ctx, dest_bucket_info, dest_obj, false, nullptr,
+ *olh_epoch, real_time(), false, null_yield, zones_trace, log_data_change);
+ } else {
+ // we already have the latest copy
+ ret = 0;
+ }
+ }
+ return ret;
+}
+
+
+// Stream a local object to the master zonegroup's endpoint with an async
+// S3 PUT. The caller supplies an already-prepared read_op plus the source's
+// state (for the size) and attributes. On success fills *mtime with the
+// mtime reported by the remote side; returns 0 or a negative error code.
+int RGWRados::copy_obj_to_remote_dest(const DoutPrefixProvider *dpp,
+ RGWObjState *astate,
+ map<string, bufferlist>& src_attrs,
+ RGWRados::Object::Read& read_op,
+ const rgw_user& user_id,
+ const rgw_obj& dest_obj,
+ real_time *mtime)
+{
+ string etag;
+
+ RGWRESTStreamS3PutObj *out_stream_req;
+
+ auto rest_master_conn = svc.zone->get_master_conn();
+
+ int ret = rest_master_conn->put_obj_async_init(dpp, user_id, dest_obj, src_attrs, &out_stream_req);
+ if (ret < 0) {
+ return ret;
+ }
+
+ out_stream_req->set_send_length(astate->size);
+
+ // on failure before complete_request() we own the request object and must
+ // delete it ourselves; complete_request() takes over on the success path
+ ret = RGWHTTP::send(out_stream_req);
+ if (ret < 0) {
+ delete out_stream_req;
+ return ret;
+ }
+
+ // pump the object's bytes through the PUT request's output callback
+ ret = read_op.iterate(dpp, 0, astate->size - 1, out_stream_req->get_out_cb(), null_yield);
+ if (ret < 0) {
+ delete out_stream_req;
+ return ret;
+ }
+
+ ret = rest_master_conn->complete_request(out_stream_req, etag, mtime, null_yield);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+/**
+ * Copy an object.
+ * dest_obj: the object to copy into
+ * src_obj: the object to copy from
+ * attrs: usage depends on attrs_mod parameter
+ * attrs_mod: the modification mode of the attrs, may have the following values:
+ * ATTRSMOD_NONE - the attributes of the source object will be
+ * copied without modifications, attrs parameter is ignored;
+ * ATTRSMOD_REPLACE - new object will have the attributes provided by attrs
+ * parameter, source object attributes are not copied;
+ * ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes
+ * are overwritten by values contained in attrs parameter.
+ * Errors encountered while reading the original object are returned
+ * directly as the (negative) return value.
+ * Returns: 0 on success, -ERR# otherwise.
+ */
+// Copy an object (see the block comment above for attrs_mod semantics).
+// Dispatches on locality: a remote source goes through fetch_remote_obj(),
+// a remote destination through copy_obj_to_remote_dest(). For a local copy
+// it either rewrites the data (copy_obj_data) or, when source and
+// destination share placement/pool, clones the manifest and bumps a
+// refcount on each tail object instead of copying bytes.
+int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
+ const rgw_user& user_id,
+ req_info *info,
+ const rgw_zone_id& source_zone,
+ const rgw_obj& dest_obj,
+ const rgw_obj& src_obj,
+ RGWBucketInfo& dest_bucket_info,
+ RGWBucketInfo& src_bucket_info,
+ const rgw_placement_rule& dest_placement,
+ real_time *src_mtime,
+ real_time *mtime,
+ const real_time *mod_ptr,
+ const real_time *unmod_ptr,
+ bool high_precision_time,
+ const char *if_match,
+ const char *if_nomatch,
+ AttrsMod attrs_mod,
+ bool copy_if_newer,
+ rgw::sal::Attrs& attrs,
+ RGWObjCategory category,
+ uint64_t olh_epoch,
+ real_time delete_at,
+ string *version_id,
+ string *ptag,
+ string *petag,
+ void (*progress_cb)(off_t, void *),
+ void *progress_data,
+ const DoutPrefixProvider *dpp,
+ optional_yield y)
+{
+ int ret;
+ uint64_t obj_size;
+ rgw_obj shadow_obj = dest_obj;
+ string shadow_oid;
+
+ bool remote_src;
+ bool remote_dest;
+
+ append_rand_alpha(cct, dest_obj.get_oid(), shadow_oid, 32);
+ shadow_obj.init_ns(dest_obj.bucket, shadow_oid, shadow_ns);
+
+ auto& zonegroup = svc.zone->get_zonegroup();
+
+ // a bucket is "remote" when it lives in a zonegroup other than ours
+ remote_dest = !zonegroup.equals(dest_bucket_info.zonegroup);
+ remote_src = !zonegroup.equals(src_bucket_info.zonegroup);
+
+ if (remote_src && remote_dest) {
+ ldpp_dout(dpp, 0) << "ERROR: can't copy object when both src and dest buckets are remote" << dendl;
+ return -EINVAL;
+ }
+
+ ldpp_dout(dpp, 5) << "Copy object " << src_obj.bucket << ":" << src_obj.get_oid() << " => " << dest_obj.bucket << ":" << dest_obj.get_oid() << dendl;
+
+ if (remote_src || !source_zone.empty()) {
+ rgw_zone_set_entry source_trace_entry{source_zone.id, std::nullopt};
+ return fetch_remote_obj(obj_ctx, user_id, info, source_zone,
+ dest_obj, src_obj, dest_bucket_info, &src_bucket_info,
+ dest_placement, src_mtime, mtime, mod_ptr,
+ unmod_ptr, high_precision_time,
+ if_match, if_nomatch, attrs_mod, copy_if_newer, attrs, category,
+ olh_epoch, delete_at, ptag, petag, progress_cb, progress_data, dpp,
+ nullptr /* filter */, source_trace_entry);
+ }
+
+ map<string, bufferlist> src_attrs;
+ RGWRados::Object src_op_target(this, src_bucket_info, obj_ctx, src_obj);
+ RGWRados::Object::Read read_op(&src_op_target);
+
+ // apply the caller's conditional-copy constraints to the source read
+ read_op.conds.mod_ptr = mod_ptr;
+ read_op.conds.unmod_ptr = unmod_ptr;
+ read_op.conds.high_precision_time = high_precision_time;
+ read_op.conds.if_match = if_match;
+ read_op.conds.if_nomatch = if_nomatch;
+ read_op.params.attrs = &src_attrs;
+ read_op.params.lastmod = src_mtime;
+ read_op.params.obj_size = &obj_size;
+
+ ret = read_op.prepare(y, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+ if (src_attrs.count(RGW_ATTR_CRYPT_MODE)) {
+ // Current implementation does not follow S3 spec and even
+ // may result in data corruption silently when copying
+ // multipart objects across pools. So reject COPY operations
+ // on encrypted objects before it is fully functional.
+ ldpp_dout(dpp, 0) << "ERROR: copy op for encrypted object " << src_obj
+ << " has not been implemented." << dendl;
+ return -ERR_NOT_IMPLEMENTED;
+ }
+
+ // the destination gets the caller-provided ACL, never the source's
+ src_attrs[RGW_ATTR_ACL] = attrs[RGW_ATTR_ACL];
+ src_attrs.erase(RGW_ATTR_DELETE_AT);
+
+ // retention/legal-hold never carry over implicitly; only the values the
+ // caller explicitly supplied in attrs are applied
+ src_attrs.erase(RGW_ATTR_OBJECT_RETENTION);
+ src_attrs.erase(RGW_ATTR_OBJECT_LEGAL_HOLD);
+ map<string, bufferlist>::iterator rt = attrs.find(RGW_ATTR_OBJECT_RETENTION);
+ if (rt != attrs.end())
+ src_attrs[RGW_ATTR_OBJECT_RETENTION] = rt->second;
+ map<string, bufferlist>::iterator lh = attrs.find(RGW_ATTR_OBJECT_LEGAL_HOLD);
+ if (lh != attrs.end())
+ src_attrs[RGW_ATTR_OBJECT_LEGAL_HOLD] = lh->second;
+
+ set_copy_attrs(src_attrs, attrs, attrs_mod);
+ attrs.erase(RGW_ATTR_ID_TAG);
+ attrs.erase(RGW_ATTR_PG_VER);
+ attrs.erase(RGW_ATTR_SOURCE_ZONE);
+ map<string, bufferlist>::iterator cmp = src_attrs.find(RGW_ATTR_COMPRESSION);
+ if (cmp != src_attrs.end())
+ attrs[RGW_ATTR_COMPRESSION] = cmp->second;
+
+ RGWObjManifest manifest;
+ RGWObjState *astate = NULL;
+ RGWObjManifest *amanifest = nullptr;
+
+ ret = get_obj_state(dpp, &obj_ctx, src_bucket_info, src_obj, &astate, &amanifest, y);
+ if (ret < 0) {
+ return ret;
+ }
+
+ vector<rgw_raw_obj> ref_objs;
+
+ if (remote_dest) {
+ /* dest is in a different zonegroup, copy it there */
+ return copy_obj_to_remote_dest(dpp, astate, attrs, read_op, user_id, dest_obj, mtime);
+ }
+ uint64_t max_chunk_size;
+
+ ret = get_max_chunk_size(dest_bucket_info.placement_rule, dest_obj, &max_chunk_size, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj.bucket << dendl;
+ return ret;
+ }
+
+ rgw_pool src_pool;
+ rgw_pool dest_pool;
+
+ // decide whether the source tail can be shared: resolve both sides'
+ // placement rules and data pools and compare them below
+ const rgw_placement_rule *src_rule{nullptr};
+
+ if (amanifest) {
+ src_rule = &amanifest->get_tail_placement().placement_rule;
+ ldpp_dout(dpp, 20) << __func__ << "(): manifest src_rule=" << src_rule->to_str() << dendl;
+ }
+
+ if (!src_rule || src_rule->empty()) {
+ src_rule = &src_bucket_info.placement_rule;
+ }
+
+ if (!get_obj_data_pool(*src_rule, src_obj, &src_pool)) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to locate data pool for " << src_obj << dendl;
+ return -EIO;
+ }
+
+ if (!get_obj_data_pool(dest_placement, dest_obj, &dest_pool)) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to locate data pool for " << dest_obj << dendl;
+ return -EIO;
+ }
+
+ ldpp_dout(dpp, 20) << __func__ << "(): src_rule=" << src_rule->to_str() << " src_pool=" << src_pool
+ << " dest_rule=" << dest_placement.to_str() << " dest_pool=" << dest_pool << dendl;
+
+ bool copy_data = (!amanifest) ||
+ (*src_rule != dest_placement) ||
+ (src_pool != dest_pool);
+
+ bool copy_first = false;
+ if (amanifest) {
+ if (!amanifest->has_tail()) {
+ copy_data = true;
+ } else {
+ uint64_t head_size = amanifest->get_head_size();
+
+ if (head_size > 0) {
+ if (head_size > max_chunk_size) {
+ copy_data = true;
+ } else {
+ // head data fits in one chunk: copy just the head, share the tail
+ copy_first = true;
+ }
+ }
+ }
+ }
+
+ if (petag) {
+ const auto iter = attrs.find(RGW_ATTR_ETAG);
+ if (iter != attrs.end()) {
+ *petag = iter->second.to_str();
+ }
+ }
+
+ if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
+ attrs.erase(RGW_ATTR_TAIL_TAG);
+ return copy_obj_data(obj_ctx, dest_bucket_info, dest_placement, read_op, obj_size - 1, dest_obj,
+ mtime, real_time(), attrs, olh_epoch, delete_at, petag, dpp, y);
+ }
+
+ /* This has been in for 2 years, so we can safely assume amanifest is not NULL */
+ RGWObjManifest::obj_iterator miter = amanifest->obj_begin(dpp);
+
+ if (copy_first) { // we need to copy first chunk, not increase refcount
+ ++miter;
+ }
+
+ bufferlist first_chunk;
+
+ const bool copy_itself = (dest_obj == src_obj);
+ RGWObjManifest *pmanifest;
+ ldpp_dout(dpp, 20) << "dest_obj=" << dest_obj << " src_obj=" << src_obj << " copy_itself=" << (int)copy_itself << dendl;
+
+ RGWRados::Object dest_op_target(this, dest_bucket_info, obj_ctx, dest_obj);
+ RGWRados::Object::Write write_op(&dest_op_target);
+
+ string tag;
+
+ if (ptag) {
+ tag = *ptag;
+ }
+
+ if (tag.empty()) {
+ append_rand_alpha(cct, tag, tag, 32);
+ }
+
+ std::unique_ptr<rgw::Aio> aio;
+ rgw::AioResultList all_results;
+ if (!copy_itself) {
+ // shared-tail copy: take an extra refcount (under our tag) on every
+ // tail object of the source manifest, issued asynchronously with a
+ // throttle; any failure jumps to done_ret for rollback
+ aio = rgw::make_throttle(cct->_conf->rgw_max_copy_obj_concurrent_io, y);
+ attrs.erase(RGW_ATTR_TAIL_TAG);
+ manifest = *amanifest;
+ const rgw_bucket_placement& tail_placement = manifest.get_tail_placement();
+ if (tail_placement.bucket.name.empty()) {
+ manifest.set_tail_placement(tail_placement.placement_rule, src_obj.bucket);
+ }
+ string ref_tag;
+ for (; miter != amanifest->obj_end(dpp); ++miter) {
+ ObjectWriteOperation op;
+ ref_tag = tag + '\0';
+ cls_refcount_get(op, ref_tag, true);
+
+ auto obj = svc.rados->obj(miter.get_location().get_raw_obj(this));
+ ret = obj.open(dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "failed to open rados context for " << obj << dendl;
+ goto done_ret;
+ }
+
+ static constexpr uint64_t cost = 1; // 1 throttle unit per request
+ static constexpr uint64_t id = 0; // ids unused
+ rgw::AioResultList completed = aio->get(obj, rgw::Aio::librados_op(std::move(op), y), cost, id);
+ ret = rgw::check_for_errors(completed);
+ all_results.splice(all_results.end(), completed);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to copy obj=" << obj << ", the error code = " << ret << dendl;
+ goto done_ret;
+ }
+ }
+
+ rgw::AioResultList completed = aio->drain();
+ ret = rgw::check_for_errors(completed);
+ all_results.splice(all_results.end(), completed);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to drain ios, the error code = " << ret <<dendl;
+ goto done_ret;
+ }
+
+ pmanifest = &manifest;
+ } else {
+ pmanifest = amanifest;
+ /* don't send the object's tail for garbage collection */
+ astate->keep_tail = true;
+ }
+
+ if (copy_first) {
+ ret = read_op.read(0, max_chunk_size, first_chunk, y, dpp);
+ if (ret < 0) {
+ goto done_ret;
+ }
+
+ pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, first_chunk.length());
+ } else {
+ pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, 0);
+ }
+
+ write_op.meta.data = &first_chunk;
+ write_op.meta.manifest = pmanifest;
+ write_op.meta.ptag = &tag;
+ write_op.meta.owner = dest_bucket_info.owner;
+ write_op.meta.mtime = mtime;
+ write_op.meta.flags = PUT_OBJ_CREATE;
+ write_op.meta.category = category;
+ write_op.meta.olh_epoch = olh_epoch;
+ write_op.meta.delete_at = delete_at;
+ write_op.meta.modify_tail = !copy_itself;
+
+ ret = write_op.write_meta(dpp, obj_size, astate->accounted_size, attrs, y);
+ if (ret < 0) {
+ goto done_ret;
+ }
+
+ return 0;
+
+done_ret:
+ // error rollback: drop every refcount we successfully took above
+ if (!copy_itself) {
+
+ /* wait all pending op done */
+ rgw::AioResultList completed = aio->drain();
+ all_results.splice(all_results.end(), completed);
+
+ /* rollback reference */
+ string ref_tag = tag + '\0';
+ int ret2 = 0;
+ for (auto& r : all_results) {
+ if (r.result < 0) {
+ continue; // skip errors
+ }
+ ObjectWriteOperation op;
+ cls_refcount_put(op, ref_tag, true);
+
+ static constexpr uint64_t cost = 1; // 1 throttle unit per request
+ static constexpr uint64_t id = 0; // ids unused
+ rgw::AioResultList completed = aio->get(r.obj, rgw::Aio::librados_op(std::move(op), y), cost, id);
+ ret2 = rgw::check_for_errors(completed);
+ if (ret2 < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: cleanup after error failed to drop reference on obj=" << r.obj << dendl;
+ }
+ }
+ completed = aio->drain();
+ ret2 = rgw::check_for_errors(completed);
+ if (ret2 < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to drain rollback ios, the error code = " << ret2 <<dendl;
+ }
+ }
+ return ret;
+}
+
+
+// Copy an object's bytes: read from the prepared read_op and write a brand
+// new object (fresh head+tail) through an AtomicObjectProcessor under
+// dest_placement. `end` is the last byte offset to read (inclusive).
+// Used when tail refcount sharing is not possible (cross-pool copies,
+// storage-class transitions). Returns 0 or a negative error code.
+int RGWRados::copy_obj_data(RGWObjectCtx& obj_ctx,
+ RGWBucketInfo& dest_bucket_info,
+ const rgw_placement_rule& dest_placement,
+ RGWRados::Object::Read& read_op, off_t end,
+ const rgw_obj& dest_obj,
+ real_time *mtime,
+ real_time set_mtime,
+ rgw::sal::Attrs& attrs,
+ uint64_t olh_epoch,
+ real_time delete_at,
+ string *petag,
+ const DoutPrefixProvider *dpp,
+ optional_yield y)
+{
+ string tag;
+ append_rand_alpha(cct, tag, tag, 32);
+
+ rgw::BlockingAioThrottle aio(cct->_conf->rgw_put_obj_min_window_size);
+ using namespace rgw::putobj;
+ // do not change the null_yield in the initialization of this AtomicObjectProcessor
+ // it causes crashes in the ragweed tests
+ AtomicObjectProcessor processor(&aio, this, dest_bucket_info, &dest_placement,
+ dest_bucket_info.owner, obj_ctx,
+ dest_obj, olh_epoch, tag,
+ dpp, null_yield);
+ int ret = processor.prepare(y);
+ if (ret < 0)
+ return ret;
+
+ off_t ofs = 0;
+
+ do {
+ bufferlist bl;
+ // read() returns the number of bytes read (>0) or a negative error
+ ret = read_op.read(ofs, end, bl, y, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: fail to read object data, ret = " << ret << dendl;
+ return ret;
+ }
+
+ uint64_t read_len = ret;
+ ret = processor.process(std::move(bl), ofs);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ofs += read_len;
+ } while (ofs <= end);
+
+ // flush
+ ret = processor.process({}, ofs);
+ if (ret < 0) {
+ return ret;
+ }
+
+ string etag;
+ auto iter = attrs.find(RGW_ATTR_ETAG);
+ if (iter != attrs.end()) {
+ bufferlist& bl = iter->second;
+ etag = bl.to_str();
+ if (petag) {
+ *petag = etag;
+ }
+ }
+
+ // if the source was stored compressed, account for the original
+ // (uncompressed) size rather than the bytes we actually moved
+ uint64_t accounted_size;
+ {
+ bool compressed{false};
+ RGWCompressionInfo cs_info;
+ ret = rgw_compression_info_from_attrset(attrs, compressed, cs_info);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to read compression info" << dendl;
+ return ret;
+ }
+ // pass original size if compressed
+ accounted_size = compressed ? cs_info.orig_size : ofs;
+ }
+
+ return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at,
+ nullptr, nullptr, nullptr, nullptr, nullptr, y);
+}
+
+// Transition an object to a different placement rule (e.g. a lifecycle
+// storage-class transition) by rewriting its data via copy_obj_data().
+// The caller passes the mtime it observed; if the object changed in the
+// meantime the transition is aborted with -ECANCELED.
+int RGWRados::transition_obj(RGWObjectCtx& obj_ctx,
+ RGWBucketInfo& bucket_info,
+ const rgw_obj& obj,
+ const rgw_placement_rule& placement_rule,
+ const real_time& mtime,
+ uint64_t olh_epoch,
+ const DoutPrefixProvider *dpp,
+ optional_yield y)
+{
+ rgw::sal::Attrs attrs;
+ real_time read_mtime;
+ uint64_t obj_size;
+
+ obj_ctx.set_atomic(obj);
+ RGWRados::Object op_target(this, bucket_info, obj_ctx, obj);
+ RGWRados::Object::Read read_op(&op_target);
+
+ read_op.params.attrs = &attrs;
+ read_op.params.lastmod = &read_mtime;
+ read_op.params.obj_size = &obj_size;
+
+ int ret = read_op.prepare(y, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (read_mtime != mtime) {
+ /* raced */
+ ldpp_dout(dpp, 0) << __func__ << " ERROR: failed to transition obj(" << obj.key << ") read_mtime = " << read_mtime << " doesn't match mtime = " << mtime << dendl;
+ return -ECANCELED;
+ }
+
+ // drop per-instance tags so the rewrite gets fresh ones
+ attrs.erase(RGW_ATTR_ID_TAG);
+ attrs.erase(RGW_ATTR_TAIL_TAG);
+
+ ret = copy_obj_data(obj_ctx,
+ bucket_info,
+ placement_rule,
+ read_op,
+ obj_size - 1,
+ obj,
+ nullptr /* pmtime */,
+ mtime,
+ attrs,
+ olh_epoch,
+ real_time(),
+ nullptr /* petag */,
+ dpp,
+ y);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return 0;
+}
+
+// Return 0 if the bucket holds no user-visible objects, -ENOTEMPTY if any
+// exist, or another negative error on listing failure. Lists unordered in
+// chunks of NUM_ENTRIES until an object is found or the listing ends.
+int RGWRados::check_bucket_empty(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, optional_yield y)
+{
+ constexpr uint NUM_ENTRIES = 1000u;
+
+ rgw_obj_index_key marker;
+ string prefix;
+ bool is_truncated;
+
+ do {
+ std::vector<rgw_bucket_dir_entry> ent_list;
+ ent_list.reserve(NUM_ENTRIES);
+
+ int r = cls_bucket_list_unordered(dpp,
+ bucket_info,
+ bucket_info.layout.current_index,
+ RGW_NO_SHARD,
+ marker,
+ prefix,
+ NUM_ENTRIES,
+ true,
+ ent_list,
+ &is_truncated,
+ &marker,
+ y);
+ if (r < 0) {
+ return r;
+ }
+
+ string ns;
+ for (auto const& dirent : ent_list) {
+ rgw_obj_key obj;
+
+ // only entries in the plain (empty) namespace count; internal
+ // entries (multipart parts, shadow objects, ...) are ignored
+ if (rgw_obj_key::oid_to_key_in_ns(dirent.key.name, &obj, ns)) {
+ return -ENOTEMPTY;
+ }
+ }
+ } while (is_truncated);
+
+ return 0;
+}
+
+/**
+ * Delete a bucket.
+ * bucket_info: metadata of the bucket to delete
+ * Returns 0 on success, -ERR# otherwise.
+ */
+// Delete a bucket: optionally verify it is empty, remove its entrypoint
+// (unless it no longer points at this bucket instance), and — when bucket
+// metadata isn't being synced — remove the instance info and clean the
+// index shard objects asynchronously.
+int RGWRados::delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& objv_tracker, optional_yield y, const DoutPrefixProvider *dpp, bool check_empty)
+{
+ const rgw_bucket& bucket = bucket_info.bucket;
+ RGWSI_RADOS::Pool index_pool;
+ map<int, string> bucket_objs;
+ int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &bucket_objs, nullptr);
+ if (r < 0)
+ return r;
+
+ if (check_empty) {
+ r = check_bucket_empty(dpp, bucket_info, y);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ bool remove_ep = true;
+
+ // if the caller didn't pin a version, read the entrypoint here so the
+ // removal below is conditional on the version we just observed
+ if (objv_tracker.read_version.empty()) {
+ RGWBucketEntryPoint ep;
+ r = ctl.bucket->read_bucket_entrypoint_info(bucket_info.bucket,
+ &ep,
+ null_yield,
+ dpp,
+ RGWBucketCtl::Bucket::GetParams()
+ .set_objv_tracker(&objv_tracker));
+ if (r < 0 ||
+ (!bucket_info.bucket.bucket_id.empty() &&
+ ep.bucket.bucket_id != bucket_info.bucket.bucket_id)) {
+ if (r != -ENOENT) {
+ ldpp_dout(dpp, 0) << "ERROR: read_bucket_entrypoint_info() bucket=" << bucket_info.bucket << " returned error: r=" << r << dendl;
+ /* we have no idea what caused the error, will not try to remove it */
+ }
+ /*
+ * either failed to read bucket entrypoint, or it points to a different bucket instance than
+ * requested
+ */
+ remove_ep = false;
+ }
+ }
+
+ if (remove_ep) {
+ r = ctl.bucket->remove_bucket_entrypoint_info(bucket_info.bucket, null_yield, dpp,
+ RGWBucketCtl::Bucket::RemoveParams()
+ .set_objv_tracker(&objv_tracker));
+ if (r < 0)
+ return r;
+ }
+
+ /* if the bucket is not synced we can remove the meta file */
+ if (!svc.zone->is_syncing_bucket_meta(bucket)) {
+ RGWObjVersionTracker objv_tracker; // NOTE(review): shadows the parameter and is never used — looks like a leftover; confirm before removing
+ r = ctl.bucket->remove_bucket_instance_info(bucket, bucket_info, null_yield, dpp);
+ if (r < 0) {
+ return r;
+ }
+
+ /* remove bucket index objects asynchronously by best effort */
+ (void) CLSRGWIssueBucketIndexClean(index_pool.ioctx(),
+ bucket_objs,
+ cct->_conf->rgw_bucket_index_max_aio)();
+ }
+
+ return 0;
+}
+
+// Set a bucket's owner: load the bucket info (by instance id when known,
+// otherwise by tenant/name), rewrite info.owner, and persist the instance
+// info with its attrs preserved. Returns 0 or a negative error code.
+int RGWRados::set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner, const DoutPrefixProvider *dpp)
+{
+ RGWBucketInfo info;
+ map<string, bufferlist> attrs;
+ int r;
+
+ if (bucket.bucket_id.empty()) {
+ r = get_bucket_info(&svc, bucket.tenant, bucket.name, info, NULL, null_yield, dpp, &attrs);
+ } else {
+ r = get_bucket_instance_info(bucket, info, nullptr, &attrs, null_yield, dpp);
+ }
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
+ return r;
+ }
+
+ info.owner = owner.get_id();
+
+ r = put_bucket_instance_info(info, false, real_time(), &attrs, dpp, null_yield);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+
+// Enable (clear BUCKET_SUSPENDED) or disable (set BUCKET_SUSPENDED) every
+// bucket in `buckets`. Best-effort: a failing bucket is logged and skipped,
+// and the last error encountered (if any) becomes the return value.
+int RGWRados::set_buckets_enabled(vector<rgw_bucket>& buckets, bool enabled, const DoutPrefixProvider *dpp)
+{
+ int ret = 0;
+
+ vector<rgw_bucket>::iterator iter;
+
+ for (iter = buckets.begin(); iter != buckets.end(); ++iter) {
+ rgw_bucket& bucket = *iter;
+ if (enabled) {
+ ldpp_dout(dpp, 20) << "enabling bucket name=" << bucket.name << dendl;
+ } else {
+ ldpp_dout(dpp, 20) << "disabling bucket name=" << bucket.name << dendl;
+ }
+
+ RGWBucketInfo info;
+ map<string, bufferlist> attrs;
+ int r = get_bucket_info(&svc, bucket.tenant, bucket.name, info, NULL, null_yield, dpp, &attrs);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
+ ret = r;
+ continue;
+ }
+ if (enabled) {
+ info.flags &= ~BUCKET_SUSPENDED;
+ } else {
+ info.flags |= BUCKET_SUSPENDED;
+ }
+
+ r = put_bucket_instance_info(info, false, real_time(), &attrs, dpp, null_yield);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
+ ret = r;
+ continue;
+ }
+ }
+ return ret;
+}
+
+// Query whether a bucket is suspended; on success sets *suspended from the
+// BUCKET_SUSPENDED flag of the bucket's info and returns 0.
+int RGWRados::bucket_suspended(const DoutPrefixProvider *dpp, rgw_bucket& bucket, bool *suspended)
+{
+ RGWBucketInfo bucket_info;
+ int ret = get_bucket_info(&svc, bucket.tenant, bucket.name, bucket_info, NULL, null_yield, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ *suspended = ((bucket_info.flags & BUCKET_SUSPENDED) != 0);
+ return 0;
+}
+
+// After an atomic overwrite/removal, hand the old object's tail chain to
+// garbage collection. No-op when there is no manifest or the tail is being
+// kept (e.g. the tail is shared with another object). Falls back to inline
+// deletion when GC is unavailable or refuses part of the chain.
+int RGWRados::Object::complete_atomic_modification(const DoutPrefixProvider *dpp)
+{
+ if ((!manifest)|| state->keep_tail)
+ return 0;
+
+ cls_rgw_obj_chain chain;
+ store->update_gc_chain(dpp, obj, *manifest, &chain);
+
+ if (chain.empty()) {
+ return 0;
+ }
+
+ // prefer the tail tag (shared-tail aware); fall back to the object tag
+ string tag = (state->tail_tag.length() > 0 ? state->tail_tag.to_str() : state->obj_tag.to_str());
+ if (store->gc == nullptr) {
+ ldpp_dout(dpp, 0) << "deleting objects inline since gc isn't initialized" << dendl;
+ //Delete objects inline just in case gc hasn't been initialised, prevents crashes
+ store->delete_objs_inline(dpp, chain, tag);
+ } else {
+ auto [ret, leftover_chain] = store->gc->send_split_chain(chain, tag); // do it synchronously
+ if (ret < 0 && leftover_chain) {
+ //Delete objects inline if send chain to gc fails
+ store->delete_objs_inline(dpp, *leftover_chain, tag);
+ }
+ }
+ return 0;
+}
+
+// Append every raw object of the manifest — except the head itself — to the
+// GC chain so the tail can be reclaimed.
+void RGWRados::update_gc_chain(const DoutPrefixProvider *dpp, rgw_obj head_obj, RGWObjManifest& manifest, cls_rgw_obj_chain *chain)
+{
+ RGWObjManifest::obj_iterator iter;
+ rgw_raw_obj raw_head;
+ obj_to_raw(manifest.get_head_placement_rule(), head_obj, &raw_head);
+ for (iter = manifest.obj_begin(dpp); iter != manifest.obj_end(dpp); ++iter) {
+ const rgw_raw_obj& mobj = iter.get_location().get_raw_obj(this);
+ if (mobj == raw_head)
+ continue; // the head is deleted separately, never via GC
+ cls_rgw_obj_key key(mobj.oid);
+ chain->push_obj(mobj.pool.to_str(), key, mobj.loc);
+ }
+}
+
+std::tuple<int, std::optional<cls_rgw_obj_chain>> RGWRados::send_chain_to_gc(cls_rgw_obj_chain& chain, const string& tag)
+{
+ if (chain.empty()) {
+ return {0, std::nullopt};
+ }
+
+ return gc->send_split_chain(chain, tag);
+}
+
// Synchronously drop our refcount on every object in `chain`, bypassing
// the garbage collector. Used as a fallback when gc is unavailable or
// rejected the chain. Per-object errors are logged and skipped; the
// function itself cannot fail.
void RGWRados::delete_objs_inline(const DoutPrefixProvider *dpp, cls_rgw_obj_chain& chain, const string& tag)
{
  string last_pool;
  std::unique_ptr<IoCtx> ctx(new IoCtx);
  int ret = 0;
  for (auto liter = chain.objs.begin(); liter != chain.objs.end(); ++liter) {
    cls_rgw_obj& obj = *liter;
    // re-open the ioctx only when the pool changes between chain entries
    if (obj.pool != last_pool) {
      ctx.reset(new IoCtx);
      ret = rgw_init_ioctx(dpp, get_rados_handle(), obj.pool, *ctx);
      if (ret < 0) {
        // reset so the next entry retries opening its pool
        last_pool = "";
        ldpp_dout(dpp, 0) << "ERROR: failed to create ioctx pool=" <<
          obj.pool << dendl;
        continue;
      }
      last_pool = obj.pool;
    }
    ctx->locator_set_key(obj.loc);
    const string& oid = obj.key.name; /* just stored raw oid there */
    ldpp_dout(dpp, 5) << "delete_objs_inline: removing " << obj.pool <<
      ":" << obj.key.name << dendl;
    ObjectWriteOperation op;
    // drop the reference held under `tag`; cls_refcount deletes the
    // object once the last reference is gone
    cls_refcount_put(op, tag, true);
    ret = ctx->operate(oid, &op);
    if (ret < 0) {
      ldpp_dout(dpp, 5) << "delete_objs_inline: refcount put returned error " << ret << dendl;
    }
  }
}
+
+static void accumulate_raw_stats(const rgw_bucket_dir_header& header,
+ map<RGWObjCategory, RGWStorageStats>& stats)
+{
+ for (const auto& pair : header.stats) {
+ const RGWObjCategory category = static_cast<RGWObjCategory>(pair.first);
+ const rgw_bucket_category_stats& header_stats = pair.second;
+
+ RGWStorageStats& s = stats[category];
+
+ s.category = category;
+ s.size += header_stats.total_size;
+ s.size_rounded += header_stats.total_size_rounded;
+ s.size_utilized += header_stats.actual_size;
+ s.num_objects += header_stats.num_entries;
+ }
+}
+
// Run the cls "check index" operation on every shard of the bucket's
// current index layout and aggregate the results.
//   existing_stats   - stats as currently recorded in the index headers
//   calculated_stats - stats recomputed by the OSD-side check
// Callers compare the two maps to detect index inconsistencies.
int RGWRados::bucket_check_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info,
                                 map<RGWObjCategory, RGWStorageStats> *existing_stats,
                                 map<RGWObjCategory, RGWStorageStats> *calculated_stats)
{
  RGWSI_RADOS::Pool index_pool;

  // key - bucket index object id
  // value - bucket index check OP returned result with the given bucket index object (shard)
  map<int, string> oids;

  int ret = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &oids, nullptr);
  if (ret < 0) {
    return ret;
  }

  // declare and pre-populate
  map<int, struct rgw_cls_check_index_ret> bucket_objs_ret;
  for (auto& iter : oids) {
    bucket_objs_ret.emplace(iter.first, rgw_cls_check_index_ret());
  }

  // issue the check to all shards in parallel (bounded by the aio conf)
  ret = CLSRGWIssueBucketCheck(index_pool.ioctx(), oids, bucket_objs_ret, cct->_conf->rgw_bucket_index_max_aio)();
  if (ret < 0) {
    return ret;
  }

  // aggregate results (from different shards if there are any)
  for (const auto& iter : bucket_objs_ret) {
    accumulate_raw_stats(iter.second.existing_header, *existing_stats);
    accumulate_raw_stats(iter.second.calculated_header, *calculated_stats);
  }

  return 0;
}
+
+int RGWRados::bucket_rebuild_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info)
+{
+ RGWSI_RADOS::Pool index_pool;
+ map<int, string> bucket_objs;
+
+ int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &bucket_objs, nullptr);
+ if (r < 0) {
+ return r;
+ }
+
+ return CLSRGWIssueBucketRebuild(index_pool.ioctx(), bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
+}
+
// Rewrite an encrypted multipart object's metadata so that multisite
// peers re-fetch it: bump the mtime by one nanosecond via set_attrs(),
// which records the change in bilog/datalog and thus triggers resync.
// The extra xattr marks why the object was rewritten.
static int resync_encrypted_multipart(const DoutPrefixProvider* dpp,
                                      optional_yield y, RGWRados* store,
                                      RGWBucketInfo& bucket_info,
                                      RGWObjectCtx& obj_ctx,
                                      const RGWObjState& state)
{
  // only overwrite if the tag hasn't changed
  obj_ctx.set_atomic(state.obj);

  // make a tiny adjustment to the existing mtime so that fetch_remote_obj()
  // won't return ERR_NOT_MODIFIED when resyncing the object
  const auto set_mtime = state.mtime + std::chrono::nanoseconds(1);

  // use set_attrs() to update the mtime in a bucket index transaction so the
  // change is recorded in bilog and datalog entries. this will cause any peer
  // zones to resync the object
  auto add_attrs = std::map<std::string, bufferlist>{
    { RGW_ATTR_PREFIX "resync-encrypted-multipart", bufferlist{} },
  };

  return store->set_attrs(dpp, &obj_ctx, bucket_info, state.obj,
                          add_attrs, nullptr, y, set_mtime);
}
+
// If `dirent` refers to an encrypted multipart object, force a multisite
// resync of it (see resync_encrypted_multipart) and report the touched
// object through the formatter. Objects that don't qualify, or that fail,
// are logged and silently skipped.
static void try_resync_encrypted_multipart(const DoutPrefixProvider* dpp,
                                           optional_yield y, RGWRados* store,
                                           RGWBucketInfo& bucket_info,
                                           RGWObjectCtx& obj_ctx,
                                           const rgw_bucket_dir_entry& dirent,
                                           Formatter* f)
{
  const auto obj = rgw_obj{bucket_info.bucket, dirent.key};

  RGWObjState* astate = nullptr;
  RGWObjManifest* manifest = nullptr;
  constexpr bool follow_olh = false; // dirent will have version ids
  int ret = store->get_obj_state(dpp, &obj_ctx, bucket_info, obj,
                                 &astate, &manifest, follow_olh, y);
  if (ret < 0) {
    ldpp_dout(dpp, 4) << obj << " does not exist" << dendl;
    return;
  }

  // check whether the object is encrypted
  if (auto i = astate->attrset.find(RGW_ATTR_CRYPT_MODE);
      i == astate->attrset.end()) {
    ldpp_dout(dpp, 4) << obj << " is not encrypted" << dendl;
    return;
  }

  // check whether the object is multipart
  if (!manifest) {
    ldpp_dout(dpp, 4) << obj << " has no manifest so is not multipart" << dendl;
    return;
  }
  // a multipart manifest ends on a part with a non-zero part id
  const RGWObjManifest::obj_iterator end = manifest->obj_end(dpp);
  if (end.get_cur_part_id() == 0) {
    ldpp_dout(dpp, 4) << obj << " manifest is not multipart" << dendl;
    return;
  }

  ret = resync_encrypted_multipart(dpp, y, store, bucket_info,
                                   obj_ctx, *astate);
  if (ret < 0) {
    ldpp_dout(dpp, 0) << "ERROR: failed to update " << obj
        << ": " << cpp_strerror(ret) << dendl;
    return;
  }

  // report the object we touched
  f->open_object_section("object");
  encode_json("name", obj.key.name, f);
  if (!obj.key.instance.empty()) {
    encode_json("version", obj.key.instance, f);
  }
  encode_json("mtime", astate->mtime, f);
  f->close_section(); // "object"
}
+
// Walk all versions of all objects in a bucket (starting at `marker`),
// force a multisite resync of every encrypted multipart object found,
// and stream progress as JSON through `flusher`, one "batch" per listing
// chunk.
int RGWRados::bucket_resync_encrypted_multipart(const DoutPrefixProvider* dpp,
                                                optional_yield y,
                                                rgw::sal::RadosStore* driver,
                                                RGWBucketInfo& bucket_info,
                                                const std::string& marker,
                                                RGWFormatterFlusher& flusher)
{
  RGWRados::Bucket target(this, bucket_info);
  RGWRados::Bucket::List list_op(&target);

  list_op.params.marker.name = marker;
  list_op.params.enforce_ns = true; // only empty ns
  list_op.params.list_versions = true;
  list_op.params.allow_unordered = true;

  /* List bucket entries in chunks. */
  static constexpr int MAX_LIST_OBJS = 100;
  std::vector<rgw_bucket_dir_entry> entries;
  entries.reserve(MAX_LIST_OBJS);

  int processed = 0;
  bool is_truncated = true;

  Formatter* f = flusher.get_formatter();
  f->open_array_section("progress");

  do {
    int ret = list_op.list_objects(dpp, MAX_LIST_OBJS, &entries, nullptr,
                                   &is_truncated, y);
    if (ret < 0) {
      // NOTE(review): returning here leaves the "progress" array section
      // unclosed, so the streamed JSON is truncated on error -- confirm
      // whether callers tolerate that.
      return ret;
    }

    f->open_object_section("batch");
    f->open_array_section("modified");

    for (const auto& dirent : entries) {
      // fresh object ctx per entry so cached state doesn't accumulate
      RGWObjectCtx obj_ctx{driver};
      try_resync_encrypted_multipart(dpp, y, this, bucket_info,
                                     obj_ctx, dirent, f);
    }

    f->close_section(); // "modified"

    processed += entries.size();
    encode_json("total processed", processed, f);
    encode_json("marker", list_op.get_next_marker().name, f);
    f->close_section(); // "batch"

    flusher.flush(); // flush after each 'chunk'
  } while (is_truncated);

  f->close_section(); // "progress" array
  return 0;
}
+
+int RGWRados::bucket_set_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry)
+{
+ RGWSI_RADOS::Pool index_pool;
+ map<int, string> bucket_objs;
+
+ int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &bucket_objs, nullptr);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+ ": unable to open bucket index, r=" << r << " (" <<
+ cpp_strerror(-r) << ")" << dendl;
+ return r;
+ }
+
+ r = CLSRGWIssueSetBucketResharding(index_pool.ioctx(), bucket_objs, entry, cct->_conf->rgw_bucket_index_max_aio)();
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+ ": unable to issue set bucket resharding, r=" << r << " (" <<
+ cpp_strerror(-r) << ")" << dendl;
+ }
+ return r;
+}
+
+int RGWRados::defer_gc(const DoutPrefixProvider *dpp, RGWObjectCtx* rctx, RGWBucketInfo& bucket_info, const rgw_obj& obj, optional_yield y)
+{
+ std::string oid, key;
+ get_obj_bucket_and_oid_loc(obj, oid, key);
+ if (!rctx)
+ return 0;
+
+ RGWObjState *state = NULL;
+ RGWObjManifest *manifest = nullptr;
+
+ int r = get_obj_state(dpp, rctx, bucket_info, obj, &state, &manifest, false, y);
+ if (r < 0)
+ return r;
+
+ if (!state->is_atomic) {
+ ldpp_dout(dpp, 20) << "state for obj=" << obj << " is not atomic, not deferring gc operation" << dendl;
+ return -EINVAL;
+ }
+
+ string tag;
+
+ if (state->tail_tag.length() > 0) {
+ tag = state->tail_tag.c_str();
+ } else if (state->obj_tag.length() > 0) {
+ tag = state->obj_tag.c_str();
+ } else {
+ ldpp_dout(dpp, 20) << "state->obj_tag is empty, not deferring gc operation" << dendl;
+ return -EINVAL;
+ }
+
+ ldpp_dout(dpp, 0) << "defer chain tag=" << tag << dendl;
+
+ cls_rgw_obj_chain chain;
+ update_gc_chain(dpp, state->obj, *manifest, &chain);
+ return gc->async_defer_chain(tag, chain);
+}
+
+void RGWRados::remove_rgw_head_obj(ObjectWriteOperation& op)
+{
+ list<string> prefixes;
+ prefixes.push_back(RGW_ATTR_OLH_PREFIX);
+ cls_rgw_remove_obj(op, prefixes);
+}
+
// Thin wrapper over cls_rgw_obj_check_attrs_prefix(): appends a check on
// `op` against xattrs whose names start with `prefix` (failing the op on
// existence when fail_if_exist is set -- see cls_rgw for exact semantics).
void RGWRados::cls_obj_check_prefix_exist(ObjectOperation& op, const string& prefix, bool fail_if_exist)
{
  cls_rgw_obj_check_attrs_prefix(op, prefix, fail_if_exist);
}
+
// Thin wrapper over cls_rgw_obj_check_mtime(): appends an mtime
// comparison of kind `type` against `mtime` to `op`, so the enclosing
// operation fails if the object's mtime doesn't satisfy the check.
void RGWRados::cls_obj_check_mtime(ObjectOperation& op, const real_time& mtime, bool high_precision_time, RGWCheckMTimeType type)
{
  cls_rgw_obj_check_mtime(op, mtime, high_precision_time, type);
}
+
/**
 * Cache entry remembering metadata of a recently deleted object
 * (see the tombstone cache used by get_obj_state_impl()). Lets a later
 * stat of a deleted object report its final mtime / source zone / pg
 * version instead of nothing.
 */
struct tombstone_entry {
  ceph::real_time mtime;   // mtime the object had when deleted
  uint32_t zone_short_id;  // short id of the object's source zone
  uint64_t pg_ver;         // pg version at deletion time

  tombstone_entry() = default;
  // capture the relevant fields from the object's last known state
  explicit tombstone_entry(const RGWObjState& state)
    : mtime(state.mtime), zone_short_id(state.zone_short_id),
      pg_ver(state.pg_ver) {}
};
+
/**
 * Delete the object this Delete op was constructed for.
 * The target object/bucket come from the parent RGWRados::Object;
 * deletion parameters (versioning status, preconditions, expiration,
 * bilog flags, ...) come from this->params.
 * Returns: 0 on success, -ERR# otherwise.
 */
int RGWRados::Object::Delete::delete_obj(optional_yield y, const DoutPrefixProvider *dpp)
{
  RGWRados *store = target->get_store();
  const rgw_obj& src_obj = target->get_obj();
  const string& instance = src_obj.key.instance;
  rgw_obj obj = target->get_obj();

  // "null" is the pseudo version id for the null version; internally the
  // null version is represented by an empty instance
  if (instance == "null") {
    obj.key.instance.clear();
  }

  bool explicit_marker_version = (!params.marker_version_id.empty());

  if (params.versioning_status & BUCKET_VERSIONED || explicit_marker_version) {
    // versioned bucket: deleting without a version (or with an explicit
    // marker version id) creates a delete marker instead of removing data
    if (instance.empty() || explicit_marker_version) {
      rgw_obj marker = obj;
      marker.key.instance.clear();

      if (!params.marker_version_id.empty()) {
        if (params.marker_version_id != "null") {
          marker.key.set_instance(params.marker_version_id);
        }
      } else if ((params.versioning_status & BUCKET_VERSIONS_SUSPENDED) == 0) {
        // active versioning: mint a fresh instance id for the marker
        store->gen_rand_obj_instance_name(&marker);
      }

      result.version_id = marker.key.instance;
      if (result.version_id.empty())
        result.version_id = "null";
      result.delete_marker = true;

      struct rgw_bucket_dir_entry_meta meta;

      meta.owner = params.obj_owner.get_id().to_str();
      meta.owner_display_name = params.obj_owner.get_display_name();

      if (real_clock::is_zero(params.mtime)) {
        meta.mtime = real_clock::now();
      } else {
        meta.mtime = params.mtime;
      }

      // record the delete marker in the object's OLH
      int r = store->set_olh(dpp, target->get_ctx(), target->get_bucket_info(), marker, true, &meta, params.olh_epoch, params.unmod_since, params.high_precision_time, y, params.zones_trace);
      if (r < 0) {
        return r;
      }
    } else {
      // deleting a specific version: unlink that instance from the OLH
      rgw_bucket_dir_entry dirent;

      int r = store->bi_get_instance(dpp, target->get_bucket_info(), obj, &dirent);
      if (r < 0) {
        return r;
      }
      result.delete_marker = dirent.is_delete_marker();
      r = store->unlink_obj_instance(dpp, target->get_ctx(), target->get_bucket_info(), obj, params.olh_epoch, y, params.zones_trace);
      if (r < 0) {
        return r;
      }
      result.version_id = instance;
    }

    BucketShard *bs = nullptr;
    int r = target->get_bucket_shard(&bs, dpp);
    if (r < 0) {
      ldpp_dout(dpp, 5) << "failed to get BucketShard object: r=" << r << dendl;
      return r;
    }

    // announce the change so peer zones can sync it
    add_datalog_entry(dpp, store->svc.datalog_rados,
                      target->get_bucket_info(), bs->shard_id, y);

    return 0;
  }

  // non-versioned path: physically remove the head object and keep the
  // bucket index consistent via a prepare/complete transaction

  rgw_rados_ref ref;
  int r = store->get_obj_head_ref(dpp, target->get_bucket_info(), obj, &ref);
  if (r < 0) {
    return r;
  }

  RGWObjState *state;
  RGWObjManifest *manifest = nullptr;
  r = target->get_state(dpp, &state, &manifest, false, y);
  if (r < 0)
    return r;

  ObjectWriteOperation op;

  if (!real_clock::is_zero(params.unmod_since)) {
    struct timespec ctime = ceph::real_clock::to_timespec(state->mtime);
    struct timespec unmod = ceph::real_clock::to_timespec(params.unmod_since);
    if (!params.high_precision_time) {
      // second-granularity comparison: drop the sub-second parts
      ctime.tv_nsec = 0;
      unmod.tv_nsec = 0;
    }

    ldpp_dout(dpp, 10) << "If-UnModified-Since: " << params.unmod_since << " Last-Modified: " << ctime << dendl;
    if (ctime > unmod) {
      return -ERR_PRECONDITION_FAILED;
    }

    /* only delete object if mtime is less than or equal to params.unmod_since */
    store->cls_obj_check_mtime(op, params.unmod_since, params.high_precision_time, CLS_RGW_CHECK_TIME_MTIME_LE);
  }
  uint64_t obj_accounted_size = state->accounted_size;

  // multipart abort accounts the size of the uploaded parts instead
  if(params.abortmp) {
    obj_accounted_size = params.parts_accounted_size;
  }

  if (!real_clock::is_zero(params.expiration_time)) {
    bufferlist bl;
    real_time delete_at;

    if (state->get_attr(RGW_ATTR_DELETE_AT, bl)) {
      try {
        auto iter = bl.cbegin();
        decode(delete_at, iter);
      } catch (buffer::error& err) {
        ldpp_dout(dpp, 0) << "ERROR: couldn't decode RGW_ATTR_DELETE_AT" << dendl;
        return -EIO;
      }

      // expiration-driven delete: only proceed if the stored expiration
      // still matches the one the caller observed
      if (params.expiration_time != delete_at) {
        return -ERR_PRECONDITION_FAILED;
      }
    } else {
      return -ERR_PRECONDITION_FAILED;
    }
  }

  if (!state->exists) {
    target->invalidate_state();
    return -ENOENT;
  }

  // guard against concurrent writers via the object's id tag
  r = target->prepare_atomic_modification(dpp, op, false, NULL, NULL, NULL, true, false, y);
  if (r < 0)
    return r;

  RGWBucketInfo& bucket_info = target->get_bucket_info();

  RGWRados::Bucket bop(store, bucket_info);
  RGWRados::Bucket::UpdateIndex index_op(&bop, obj);

  index_op.set_zones_trace(params.zones_trace);
  index_op.set_bilog_flags(params.bilog_flags);

  // phase 1: mark the pending delete in the bucket index
  r = index_op.prepare(dpp, CLS_RGW_OP_DEL, &state->write_tag, y);
  if (r < 0)
    return r;

  store->remove_rgw_head_obj(op);

  auto& ioctx = ref.pool.ioctx();
  r = rgw_rados_operate(dpp, ioctx, ref.obj.oid, &op, y);

  /* raced with another operation, object state is indeterminate */
  const bool need_invalidate = (r == -ECANCELED);

  int64_t poolid = ioctx.get_id();
  if (r >= 0) {
    // remember the deletion so a subsequent stat can report the old mtime
    tombstone_cache_t *obj_tombstone_cache = store->get_tombstone_cache();
    if (obj_tombstone_cache) {
      tombstone_entry entry{*state};
      obj_tombstone_cache->add(obj, entry);
    }
    // phase 2: finalize the index entry removal
    r = index_op.complete_del(dpp, poolid, ioctx.get_last_version(), state->mtime, params.remove_objs, y);

    // dispose of the tail objects (via gc or inline)
    int ret = target->complete_atomic_modification(dpp);
    if (ret < 0) {
      ldpp_dout(dpp, 0) << "ERROR: complete_atomic_modification returned ret=" << ret << dendl;
    }
    /* other than that, no need to propagate error */
  } else {
    // head removal failed: roll back the pending index entry
    int ret = index_op.cancel(dpp, params.remove_objs, y);
    if (ret < 0) {
      ldpp_dout(dpp, 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl;
    }
  }

  if (need_invalidate) {
    target->invalidate_state();
  }

  if (r < 0)
    return r;

  /* update quota cache */
  store->quota_handler->update_stats(params.bucket_owner, obj.bucket, -1, 0, obj_accounted_size);

  return 0;
}
+
+int RGWRados::delete_obj(const DoutPrefixProvider *dpp,
+ RGWObjectCtx& obj_ctx,
+ const RGWBucketInfo& bucket_info,
+ const rgw_obj& obj,
+ int versioning_status, // versioning flags defined in enum RGWBucketFlags
+ uint16_t bilog_flags,
+ const real_time& expiration_time,
+ rgw_zone_set *zones_trace)
+{
+ RGWRados::Object del_target(this, bucket_info, obj_ctx, obj);
+ RGWRados::Object::Delete del_op(&del_target);
+
+ del_op.params.bucket_owner = bucket_info.owner;
+ del_op.params.versioning_status = versioning_status;
+ del_op.params.bilog_flags = bilog_flags;
+ del_op.params.expiration_time = expiration_time;
+ del_op.params.zones_trace = zones_trace;
+
+ return del_op.delete_obj(null_yield, dpp);
+}
+
+int RGWRados::delete_raw_obj(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj)
+{
+ rgw_rados_ref ref;
+ int r = get_raw_obj_ref(dpp, obj, &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ ObjectWriteOperation op;
+
+ op.remove();
+ r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+int RGWRados::delete_obj_index(const rgw_obj& obj, ceph::real_time mtime,
+ const DoutPrefixProvider *dpp, optional_yield y)
+{
+ std::string oid, key;
+ get_obj_bucket_and_oid_loc(obj, oid, key);
+
+ RGWBucketInfo bucket_info;
+ int ret = get_bucket_instance_info(obj.bucket, bucket_info, NULL, NULL, null_yield, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "() get_bucket_instance_info(bucket=" << obj.bucket << ") returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ RGWRados::Bucket bop(this, bucket_info);
+ RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
+
+ return index_op.complete_del(dpp, -1 /* pool */, 0, mtime, nullptr, y);
+}
+
// Synthesize a tag for an object that has a manifest but lost its ID tag:
// combine a tail rados oid (for uniqueness across objects) with an MD5
// over the raw manifest bytes and, if present, the etag. The result is
// appended to `tag_bl` including its trailing NUL.
static void generate_fake_tag(const DoutPrefixProvider *dpp, RGWRados* store, map<string, bufferlist>& attrset, RGWObjManifest& manifest, bufferlist& manifest_bl, bufferlist& tag_bl)
{
  string tag;

  RGWObjManifest::obj_iterator mi = manifest.obj_begin(dpp);
  if (mi != manifest.obj_end(dpp)) {
    if (manifest.has_tail()) // first object usually points at the head, let's skip to a more unique part
      ++mi;
    tag = mi.get_location().get_raw_obj(store).oid;
    tag.append("_");
  }

  unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
  char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
  MD5 hash;
  // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
  hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
  hash.Update((const unsigned char *)manifest_bl.c_str(), manifest_bl.length());

  // mix in the etag when available, for extra entropy
  map<string, bufferlist>::iterator iter = attrset.find(RGW_ATTR_ETAG);
  if (iter != attrset.end()) {
    bufferlist& bl = iter->second;
    hash.Update((const unsigned char *)bl.c_str(), bl.length());
  }

  hash.Final(md5);
  buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str);
  tag.append(md5_str);

  ldpp_dout(dpp, 10) << "generate_fake_tag new tag=" << tag << dendl;

  // include the terminating NUL, matching how real tags are stored
  tag_bl.append(tag.c_str(), tag.size() + 1);
}
+
+static bool is_olh(map<string, bufferlist>& attrs)
+{
+ map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_VER);
+ return (iter != attrs.end());
+}
+
+static bool has_olh_tag(map<string, bufferlist>& attrs)
+{
+ map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_ID_TAG);
+ return (iter != attrs.end());
+}
+
+int RGWRados::get_olh_target_state(const DoutPrefixProvider *dpp, RGWObjectCtx&
+ obj_ctx, RGWBucketInfo& bucket_info,
+ const rgw_obj& obj, RGWObjState *olh_state,
+ RGWObjState **target_state,
+ RGWObjManifest **target_manifest, optional_yield y)
+{
+ ceph_assert(olh_state->is_olh);
+
+ rgw_obj target;
+ int r = RGWRados::follow_olh(dpp, bucket_info, obj_ctx, olh_state, obj, &target); /* might return -EAGAIN */
+ if (r < 0) {
+ return r;
+ }
+
+ r = get_obj_state(dpp, &obj_ctx, bucket_info, target, target_state,
+ target_manifest, false, y);
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
// Core of get_obj_state(): populate (and cache in rctx) the RGWObjState
// and RGWObjManifest for `obj` by stat'ing the head object and decoding
// its xattrs. With follow_olh, an OLH is resolved to its current target
// (may yield -EAGAIN, which the wrapper retries). With assume_noent, the
// rados stat is skipped and the object is treated as absent.
int RGWRados::get_obj_state_impl(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx,
                                 RGWBucketInfo& bucket_info, const rgw_obj& obj,
                                 RGWObjState **state, RGWObjManifest** manifest,
                                 bool follow_olh, optional_yield y, bool assume_noent)
{
  if (obj.empty()) {
    return -EINVAL;
  }

  // only resolve the OLH when no specific version was requested
  bool need_follow_olh = follow_olh && obj.key.instance.empty();
  *manifest = nullptr;

  RGWObjStateManifest *sm = rctx->get_state(obj);
  RGWObjState *s = &(sm->state);
  ldpp_dout(dpp, 20) << "get_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl;
  *state = s;
  if (sm->manifest) {
    *manifest = &(*sm->manifest);
  }
  // cached state is complete once its attrs have been loaded
  if (s->has_attrs) {
    if (s->is_olh && need_follow_olh) {
      return get_olh_target_state(dpp, *rctx, bucket_info, obj, s, state, manifest, y);
    }
    return 0;
  }

  s->obj = obj;

  rgw_raw_obj raw_obj;
  obj_to_raw(bucket_info.placement_rule, obj, &raw_obj);

  int r = -ENOENT;

  if (!assume_noent) {
    r = RGWRados::raw_obj_stat(dpp, raw_obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), NULL, y);
  }

  if (r == -ENOENT) {
    s->exists = false;
    s->has_attrs = true;
    // recently deleted? serve mtime/zone/pg_ver from the tombstone cache
    tombstone_entry entry;
    if (obj_tombstone_cache && obj_tombstone_cache->find(obj, entry)) {
      s->mtime = entry.mtime;
      s->zone_short_id = entry.zone_short_id;
      s->pg_ver = entry.pg_ver;
      ldpp_dout(dpp, 20) << __func__ << "(): found obj in tombstone cache: obj=" << obj
        << " mtime=" << s->mtime << " pgv=" << s->pg_ver << dendl;
    } else {
      s->mtime = real_time();
    }
    return 0;
  }
  if (r < 0)
    return r;

  s->exists = true;
  s->has_attrs = true;
  s->accounted_size = s->size;

  auto iter = s->attrset.find(RGW_ATTR_ETAG);
  if (iter != s->attrset.end()) {
    /* get rid of extra null character at the end of the etag, as we used to store it like that */
    bufferlist& bletag = iter->second;
    if (bletag.length() > 0 && bletag[bletag.length() - 1] == '\0') {
      bufferlist newbl;
      bletag.splice(0, bletag.length() - 1, &newbl);
      bletag = std::move(newbl);
    }
  }

  iter = s->attrset.find(RGW_ATTR_COMPRESSION);
  const bool compressed = (iter != s->attrset.end());
  if (compressed) {
    // use uncompressed size for accounted_size
    try {
      RGWCompressionInfo info;
      auto p = iter->second.cbegin();
      decode(info, p);
      s->accounted_size = info.orig_size;
    } catch (buffer::error&) {
      ldpp_dout(dpp, 0) << "ERROR: could not decode compression info for object: " << obj << dendl;
      return -EIO;
    }
  }

  if (iter = s->attrset.find(RGW_ATTR_SHADOW_OBJ); iter != s->attrset.end()) {
    // shadow object name is stored as raw bytes; NUL-terminate the copy
    const bufferlist& bl = iter->second;
    auto it = bl.begin();
    it.copy(bl.length(), s->shadow_obj);
    s->shadow_obj[bl.length()] = '\0';
  }
  if (iter = s->attrset.find(RGW_ATTR_ID_TAG); iter != s->attrset.end()) {
    s->obj_tag = iter->second;
  }
  if (iter = s->attrset.find(RGW_ATTR_TAIL_TAG); iter != s->attrset.end()) {
    s->tail_tag = iter->second;
  }

  if (iter = s->attrset.find(RGW_ATTR_MANIFEST); iter != s->attrset.end()) {
    bufferlist manifest_bl = iter->second;
    auto miter = manifest_bl.cbegin();
    try {
      sm->manifest.emplace();
      decode(*sm->manifest, miter);
      sm->manifest->set_head(bucket_info.placement_rule, obj, s->size); /* patch manifest to reflect the head we just read, some manifests might be
                                                                          broken due to old bugs */
      s->size = sm->manifest->get_obj_size();
      if (!compressed)
        s->accounted_size = s->size;
    } catch (buffer::error& err) {
      ldpp_dout(dpp, 0) << "ERROR: couldn't decode manifest" << dendl;
      return -EIO;
    }
    *manifest = &(*sm->manifest);
    ldpp_dout(dpp, 10) << "manifest: total_size = " << sm->manifest->get_obj_size() << dendl;
    // at high debug levels, dump every explicit manifest location
    if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>() && \
        sm->manifest->has_explicit_objs()) {
      RGWObjManifest::obj_iterator mi;
      for (mi = sm->manifest->obj_begin(dpp); mi != sm->manifest->obj_end(dpp); ++mi) {
        ldpp_dout(dpp, 20) << "manifest: ofs=" << mi.get_ofs() << " loc=" << mi.get_location().get_raw_obj(this) << dendl;
      }
    }

    if (!s->obj_tag.length()) {
      /*
       * Uh oh, something's wrong, object with manifest should have tag. Let's
       * create one out of the manifest, would be unique
       */
      generate_fake_tag(dpp, this, s->attrset, *sm->manifest, manifest_bl, s->obj_tag);
      s->fake_tag = true;
    }
  }
  if (iter = s->attrset.find(RGW_ATTR_PG_VER); iter != s->attrset.end()) {
    const bufferlist& pg_ver_bl = iter->second;
    if (pg_ver_bl.length()) {
      auto pgbl = pg_ver_bl.cbegin();
      try {
        decode(s->pg_ver, pgbl);
      } catch (buffer::error& err) {
        ldpp_dout(dpp, 0) << "ERROR: couldn't decode pg ver attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
      }
    }
  }
  if (iter = s->attrset.find(RGW_ATTR_SOURCE_ZONE); iter != s->attrset.end()) {
    const bufferlist& zone_short_id_bl = iter->second;
    if (zone_short_id_bl.length()) {
      auto zbl = zone_short_id_bl.cbegin();
      try {
        decode(s->zone_short_id, zbl);
      } catch (buffer::error& err) {
        ldpp_dout(dpp, 0) << "ERROR: couldn't decode zone short id attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
      }
    }
  }
  if (s->obj_tag.length()) {
    ldpp_dout(dpp, 20) << "get_obj_state: setting s->obj_tag to " << s->obj_tag.c_str() << dendl;
  } else {
    ldpp_dout(dpp, 20) << "get_obj_state: s->obj_tag was set empty" << dendl;
  }

  /* an object might not be olh yet, but could have olh id tag, so we should set it anyway if
   * it exist, and not only if is_olh() returns true
   */
  if (iter = s->attrset.find(RGW_ATTR_OLH_ID_TAG); iter != s->attrset.end()) {
    s->olh_tag = iter->second;
  }

  if (is_olh(s->attrset)) {
    s->is_olh = true;

    ldpp_dout(dpp, 20) << __func__ << ": setting s->olh_tag to " << string(s->olh_tag.c_str(), s->olh_tag.length()) << dendl;

    if (need_follow_olh) {
      return get_olh_target_state(dpp, *rctx, bucket_info, obj, s, state, manifest, y);
    } else if (obj.key.have_null_instance() && !sm->manifest) {
      // read null version, and the head object only have olh info
      s->exists = false;
      return -ENOENT;
    }
  }

  return 0;
}
+
+int RGWRados::get_obj_state(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWObjState **state, RGWObjManifest** manifest,
+ bool follow_olh, optional_yield y, bool assume_noent)
+{
+ int ret;
+
+ do {
+ ret = get_obj_state_impl(dpp, rctx, bucket_info, obj, state, manifest, follow_olh, y, assume_noent);
+ } while (ret == -EAGAIN);
+
+ return ret;
+}
+
+int RGWRados::Object::get_manifest(const DoutPrefixProvider *dpp, RGWObjManifest **pmanifest, optional_yield y)
+{
+ RGWObjState *astate;
+ int r = get_state(dpp, &astate, pmanifest, true, y);
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+int RGWRados::Object::Read::get_attr(const DoutPrefixProvider *dpp, const char *name, bufferlist& dest, optional_yield y)
+{
+ RGWObjState *state;
+ RGWObjManifest *manifest = nullptr;
+ int r = source->get_state(dpp, &state, &manifest, true, y);
+ if (r < 0)
+ return r;
+ if (!state->exists)
+ return -ENOENT;
+ if (!state->get_attr(name, dest))
+ return -ENODATA;
+
+ return 0;
+}
+
// Kick off an asynchronous stat (size/mtime/xattrs) of the object's head.
// If the object context already holds complete cached state, the result
// is filled in synchronously and no rados op is issued (state.completion
// stays null). Otherwise an aio stat2+getxattrs is started; the caller
// must invoke wait() to collect the result.
int RGWRados::Object::Stat::stat_async(const DoutPrefixProvider *dpp)
{
  RGWObjectCtx& ctx = source->get_ctx();
  rgw_obj& obj = source->get_obj();
  RGWRados *store = source->get_store();

  RGWObjStateManifest *sm = ctx.get_state(obj);
  result.obj = obj;
  // fast path: serve entirely from the cached state
  if (sm->state.has_attrs) {
    state.ret = 0;
    result.size = sm->state.size;
    result.mtime = ceph::real_clock::to_timespec(sm->state.mtime);
    result.attrs = sm->state.attrset;
    result.manifest = sm->manifest;
    return 0;
  }

  string oid;
  string loc;
  get_obj_bucket_and_oid_loc(obj, oid, loc);

  int r = store->get_obj_head_ioctx(dpp, source->get_bucket_info(), obj, &state.io_ctx);
  if (r < 0) {
    return r;
  }

  librados::ObjectReadOperation op;
  op.stat2(&result.size, &result.mtime, NULL);
  op.getxattrs(&result.attrs, NULL);
  // completion is released in wait()
  state.completion = librados::Rados::aio_create_completion(nullptr, nullptr);
  state.io_ctx.locator_set_key(loc);
  r = state.io_ctx.aio_operate(oid, state.completion, &op, NULL);
  if (r < 0) {
    ldpp_dout(dpp, 5) << __func__
        << ": ERROR: aio_operate() returned ret=" << r
        << dendl;
    return r;
  }

  return 0;
}
+
+
+int RGWRados::Object::Stat::wait(const DoutPrefixProvider *dpp)
+{
+ if (!state.completion) {
+ return state.ret;
+ }
+
+ state.completion->wait_for_complete();
+ state.ret = state.completion->get_return_value();
+ state.completion->release();
+
+ if (state.ret != 0) {
+ return state.ret;
+ }
+
+ return finish(dpp);
+}
+
+int RGWRados::Object::Stat::finish(const DoutPrefixProvider *dpp)
+{
+ map<string, bufferlist>::iterator iter = result.attrs.find(RGW_ATTR_MANIFEST);
+ if (iter != result.attrs.end()) {
+ bufferlist& bl = iter->second;
+ auto biter = bl.cbegin();
+ try {
+ result.manifest.emplace();
+ decode(*result.manifest, biter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << ": failed to decode manifest" << dendl;
+ return -EIO;
+ }
+ }
+
+ return 0;
+}
+
+int RGWRados::append_atomic_test(const DoutPrefixProvider *dpp, RGWObjectCtx* rctx,
+ RGWBucketInfo& bucket_info, const rgw_obj& obj,
+ ObjectOperation& op, RGWObjState **pstate,
+ RGWObjManifest** pmanifest, optional_yield y)
+{
+ if (!rctx)
+ return 0;
+
+ int r = get_obj_state(dpp, rctx, bucket_info, obj, pstate, pmanifest, false, y);
+ if (r < 0)
+ return r;
+
+ return append_atomic_test(dpp, *pstate, op);
+}
+
+int RGWRados::append_atomic_test(const DoutPrefixProvider *dpp,
+ const RGWObjState* state,
+ librados::ObjectOperation& op)
+{
+ if (!state->is_atomic) {
+ ldpp_dout(dpp, 20) << "state for obj=" << state->obj << " is not atomic, not appending atomic test" << dendl;
+ return 0;
+ }
+
+ if (state->obj_tag.length() > 0 && !state->fake_tag) {// check for backward compatibility
+ op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
+ } else {
+ ldpp_dout(dpp, 20) << "state->obj_tag is empty, not appending atomic test" << dendl;
+ }
+ return 0;
+}
+
// Forwarder: fetch (possibly cached) state + manifest for this Object
// through the store, using this Object's ctx/bucket_info/obj.
int RGWRados::Object::get_state(const DoutPrefixProvider *dpp, RGWObjState **pstate, RGWObjManifest **pmanifest, bool follow_olh, optional_yield y, bool assume_noent)
{
  return store->get_obj_state(dpp, &ctx, bucket_info, obj, pstate, pmanifest, follow_olh, y, assume_noent);
}
+
// Drop any cached RGWObjState for this object so the next get_state()
// re-reads it from rados (used after a failed/raced modification).
void RGWRados::Object::invalidate_state()
{
  ctx.invalidate(obj);
}
+
// Prepare `op` for an atomic modification (write or removal) of this
// object's head:
//  - appends a cmpxattr guard on RGW_ATTR_ID_TAG so a concurrent writer
//    makes the op fail instead of clobbering state
//  - enforces if_match / if_nomatch (ETag-style preconditions)
//  - with reset_obj, recreates/clears the head as appropriate
//  - unless removal_op, generates a new write tag and adds it to `op`
//    (also as the tail tag when modify_tail is set)
int RGWRados::Object::prepare_atomic_modification(const DoutPrefixProvider *dpp,
                                                  ObjectWriteOperation& op, bool reset_obj, const string *ptag,
                                                  const char *if_match, const char *if_nomatch, bool removal_op,
                                                  bool modify_tail, optional_yield y)
{
  int r = get_state(dpp, &state, &manifest, false, y);
  if (r < 0)
    return r;

  // a guard is needed when there is prior state to protect (manifest or a
  // real tag) or when the caller supplied preconditions; fake tags were
  // synthesized locally and can't be compared against the stored xattr
  bool need_guard = ((manifest) || (state->obj_tag.length() != 0) ||
                     if_match != NULL || if_nomatch != NULL) &&
                    (!state->fake_tag);

  if (!state->is_atomic) {
    ldpp_dout(dpp, 20) << "prepare_atomic_modification: state is not atomic. state=" << (void *)state << dendl;

    if (reset_obj) {
      op.create(false);
      store->remove_rgw_head_obj(op); // we're not dropping reference here, actually removing object
    }

    return 0;
  }

  if (need_guard) {
    /* first verify that the object wasn't replaced under */
    if (if_nomatch == NULL || strcmp(if_nomatch, "*") != 0) {
      op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
      // FIXME: need to add FAIL_NOTEXIST_OK for racing deletion
    }

    if (if_match) {
      if (strcmp(if_match, "*") == 0) {
        // test the object is existing
        if (!state->exists) {
          return -ERR_PRECONDITION_FAILED;
        }
      } else {
        // compare the caller's ETag against the stored one
        bufferlist bl;
        if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
            strncmp(if_match, bl.c_str(), bl.length()) != 0) {
          return -ERR_PRECONDITION_FAILED;
        }
      }
    }

    if (if_nomatch) {
      if (strcmp(if_nomatch, "*") == 0) {
        // test the object is NOT existing
        if (state->exists) {
          return -ERR_PRECONDITION_FAILED;
        }
      } else {
        // fail if the stored ETag matches the caller's
        bufferlist bl;
        if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
            strncmp(if_nomatch, bl.c_str(), bl.length()) == 0) {
          return -ERR_PRECONDITION_FAILED;
        }
      }
    }
  }

  if (reset_obj) {
    if (state->exists) {
      // recreate in place: clear the existing head's contents/attrs
      op.create(false);
      store->remove_rgw_head_obj(op);
    } else {
      // exclusive create: fail if someone created it meanwhile
      op.create(true);
    }
  }

  if (removal_op) {
    /* the object is being removed, no need to update its tag */
    return 0;
  }

  // pick the new write tag: caller-supplied or random
  if (ptag) {
    state->write_tag = *ptag;
  } else {
    append_rand_alpha(store->ctx(), state->write_tag, state->write_tag, 32);
  }
  bufferlist bl;
  bl.append(state->write_tag.c_str(), state->write_tag.size() + 1);

  ldpp_dout(dpp, 10) << "setting object write_tag=" << state->write_tag << dendl;

  op.setxattr(RGW_ATTR_ID_TAG, bl);
  if (modify_tail) {
    op.setxattr(RGW_ATTR_TAIL_TAG, bl);
  }

  return 0;
}
+
+/**
+ * Set a single attr on an object (thin wrapper over set_attrs()).
+ * rctx: object context used for cached state and the atomic guard
+ * bucket_info: info of the bucket holding the object
+ * obj: the object to set the attr on
+ * name: the attr to set
+ * bl: the contents of the attr
+ * Returns: 0 on success, -ERR# otherwise.
+ */
+int RGWRados::set_attr(const DoutPrefixProvider *dpp, RGWObjectCtx* rctx, RGWBucketInfo& bucket_info, const rgw_obj& obj, const char *name, bufferlist& bl)
+{
+  map<string, bufferlist> attrs;
+  attrs[name] = bl;
+  return set_attrs(dpp, rctx, bucket_info, obj, attrs, NULL, null_yield);
+}
+
+/*
+ * Set and/or remove xattrs on an object's head, keeping the bucket index
+ * entry in sync via a two-phase (prepare/complete) index update.
+ *
+ * rctx:      object context (cached state + atomic guard)
+ * attrs:     attrs to set; entries with an empty value are skipped
+ * rmattrs:   optional attrs to remove
+ * set_mtime: overrides the preserved mtime when non-zero
+ * Returns: 0 on success, negative error code otherwise.
+ */
+int RGWRados::set_attrs(const DoutPrefixProvider *dpp, RGWObjectCtx* rctx, RGWBucketInfo& bucket_info, const rgw_obj& src_obj,
+                        map<string, bufferlist>& attrs,
+                        map<string, bufferlist>* rmattrs,
+                        optional_yield y,
+                        ceph::real_time set_mtime /* = zero() */)
+{
+  // a "null" version instance addresses the same rados object as the
+  // unversioned key
+  rgw_obj obj = src_obj;
+  if (obj.key.instance == "null") {
+    obj.key.instance.clear();
+  }
+
+  rgw_rados_ref ref;
+  int r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
+  if (r < 0) {
+    return r;
+  }
+
+  ObjectWriteOperation op;
+  RGWObjState *state = NULL;
+  RGWObjManifest *manifest = nullptr;
+
+  r = append_atomic_test(dpp, rctx, bucket_info, obj, op, &state, &manifest, y);
+  if (r < 0)
+    return r;
+
+  // ensure the null version object exists
+  if (src_obj.key.instance == "null" && !manifest) {
+    return -ENOENT;
+  }
+
+  map<string, bufferlist>::iterator iter;
+  if (rmattrs) {
+    for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
+      const string& name = iter->first;
+      op.rmxattr(name.c_str());
+    }
+  }
+
+  const rgw_bucket& bucket = obj.bucket;
+
+  for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
+    const string& name = iter->first;
+    bufferlist& bl = iter->second;
+
+    if (!bl.length())
+      continue;
+
+    op.setxattr(name.c_str(), bl);
+
+    // a delete-at attr also feeds the object expirer's hint list
+    if (name.compare(RGW_ATTR_DELETE_AT) == 0) {
+      real_time ts;
+      try {
+        decode(ts, bl);
+
+        rgw_obj_index_key obj_key;
+        obj.key.get_index_key(&obj_key);
+
+        obj_expirer->hint_add(dpp, ts, bucket.tenant, bucket.name, bucket.bucket_id, obj_key);
+      } catch (buffer::error& err) {
+        ldpp_dout(dpp, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT << " attr" << dendl;
+      }
+    }
+  }
+
+  if (!op.size())
+    return 0;  // nothing to set or remove
+
+  bufferlist bl;
+  RGWRados::Bucket bop(this, bucket_info);
+  RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
+
+  if (state) {
+    // phase one of the index update, correlated by a fresh write tag
+    string tag;
+    append_rand_alpha(cct, tag, tag, 32);
+    state->write_tag = tag;
+    r = index_op.prepare(dpp, CLS_RGW_OP_ADD, &state->write_tag, y);
+
+    if (r < 0)
+      return r;
+
+    bl.append(tag.c_str(), tag.size() + 1); // include trailing NUL
+    op.setxattr(RGW_ATTR_ID_TAG, bl);
+  }
+
+
+  /* As per https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingMetadata.html,
+   * the only way for users to modify object metadata is to make a copy of the object and
+   * set the metadata.
+   * Hence do not update mtime for any other attr changes */
+  // NOTE(review): state is dereferenced unconditionally here, while both the
+  // preceding and following code guard on a null state -- presumably
+  // append_atomic_test() always yields a state when the op is non-empty;
+  // confirm.
+  real_time mtime = state->mtime;
+  if (set_mtime != ceph::real_clock::zero()) {
+    mtime = set_mtime;
+  }
+  struct timespec mtime_ts = real_clock::to_timespec(mtime);
+  op.mtime2(&mtime_ts);
+  auto& ioctx = ref.pool.ioctx();
+  r = rgw_rados_operate(dpp, ioctx, ref.obj.oid, &op, null_yield);
+  if (state) {
+    if (r >= 0) {
+      // phase two: mirror the new metadata into the bucket index entry
+      bufferlist acl_bl;
+      if (iter = attrs.find(RGW_ATTR_ACL); iter != attrs.end()) {
+        acl_bl = iter->second;
+      }
+      std::string etag;
+      if (iter = attrs.find(RGW_ATTR_ETAG); iter != attrs.end()) {
+        etag = rgw_bl_str(iter->second);
+      }
+      std::string content_type;
+      if (iter = attrs.find(RGW_ATTR_CONTENT_TYPE); iter != attrs.end()) {
+        content_type = rgw_bl_str(iter->second);
+      }
+      string storage_class;
+      if (iter = attrs.find(RGW_ATTR_STORAGE_CLASS); iter != attrs.end()) {
+        storage_class = rgw_bl_str(iter->second);
+      }
+      uint64_t epoch = ioctx.get_last_version();
+      int64_t poolid = ioctx.get_id();
+      r = index_op.complete(dpp, poolid, epoch, state->size, state->accounted_size,
+                            mtime, etag, content_type, storage_class, &acl_bl,
+                            RGWObjCategory::Main, nullptr, y);
+    } else {
+      // head write failed: cancel the prepared index op
+      int ret = index_op.cancel(dpp, nullptr, y);
+      if (ret < 0) {
+        ldpp_dout(dpp, 0) << "ERROR: complete_update_index_cancel() returned ret=" << ret << dendl;
+      }
+    }
+  }
+  if (r < 0)
+    return r;
+
+  if (state) {
+    // refresh the cached state to match what was just written
+    state->obj_tag.swap(bl);
+    if (rmattrs) {
+      for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
+        state->attrset.erase(iter->first);
+      }
+    }
+
+    for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
+      state->attrset[iter->first] = iter->second;
+    }
+
+    auto iter = state->attrset.find(RGW_ATTR_ID_TAG);
+    if (iter != state->attrset.end()) {
+      iter->second = state->obj_tag;
+    }
+
+    state->mtime = mtime;
+  }
+
+  return 0;
+}
+
+/*
+ * Validate and stage a read of the object:
+ *  - load the object state (following OLH); -ENOENT if it doesn't exist,
+ *  - resolve the head object and its pool ioctx,
+ *  - evaluate conditional-GET headers (If-(Un)Modified-Since,
+ *    If-Match / If-None-Match),
+ *  - publish attrs/size/mtime through the params output pointers.
+ */
+int RGWRados::Object::Read::prepare(optional_yield y, const DoutPrefixProvider *dpp)
+{
+  RGWRados *store = source->get_store();
+  CephContext *cct = store->ctx();
+
+  bufferlist etag;
+
+  map<string, bufferlist>::iterator iter;
+
+  RGWObjState *astate;
+  RGWObjManifest *manifest = nullptr;
+  int r = source->get_state(dpp, &astate, &manifest, true, y);
+  if (r < 0)
+    return r;
+
+  if (!astate->exists) {
+    return -ENOENT;
+  }
+
+  const RGWBucketInfo& bucket_info = source->get_bucket_info();
+
+  state.obj = astate->obj;
+  store->obj_to_raw(bucket_info.placement_rule, state.obj, &state.head_obj);
+
+  state.cur_pool = state.head_obj.pool;
+  state.cur_ioctx = &state.io_ctxs[state.cur_pool];
+
+  r = store->get_obj_head_ioctx(dpp, bucket_info, state.obj, state.cur_ioctx);
+  if (r < 0) {
+    return r;
+  }
+  if (params.target_obj) {
+    *params.target_obj = state.obj;
+  }
+  if (params.attrs) {
+    *params.attrs = astate->attrset;
+    if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
+      for (iter = params.attrs->begin(); iter != params.attrs->end(); ++iter) {
+        ldpp_dout(dpp, 20) << "Read xattr rgw_rados: " << iter->first << dendl;
+      }
+    }
+  }
+
+  /* Convert all times to GMT to make them compatible */
+  if (conds.mod_ptr || conds.unmod_ptr) {
+    obj_time_weight src_weight;
+    src_weight.init(astate);
+    src_weight.high_precision = conds.high_precision_time;
+
+    obj_time_weight dest_weight;
+    dest_weight.high_precision = conds.high_precision_time;
+
+    // each time condition is only applied when the corresponding ETag
+    // condition is absent (ETag conditions take precedence)
+    if (conds.mod_ptr && !conds.if_nomatch) {
+      dest_weight.init(*conds.mod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
+      ldpp_dout(dpp, 10) << "If-Modified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
+      if (!(dest_weight < src_weight)) {
+        return -ERR_NOT_MODIFIED;
+      }
+    }
+
+    if (conds.unmod_ptr && !conds.if_match) {
+      dest_weight.init(*conds.unmod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
+      ldpp_dout(dpp, 10) << "If-UnModified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
+      if (dest_weight < src_weight) {
+        return -ERR_PRECONDITION_FAILED;
+      }
+    }
+  }
+  if (conds.if_match || conds.if_nomatch) {
+    r = get_attr(dpp, RGW_ATTR_ETAG, etag, y);
+    if (r < 0)
+      return r;
+
+    if (conds.if_match) {
+      string if_match_str = rgw_string_unquote(conds.if_match);
+      ldpp_dout(dpp, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-Match: " << if_match_str << dendl;
+      if (if_match_str.compare(0, etag.length(), etag.c_str(), etag.length()) != 0) {
+        return -ERR_PRECONDITION_FAILED;
+      }
+    }
+
+    if (conds.if_nomatch) {
+      string if_nomatch_str = rgw_string_unquote(conds.if_nomatch);
+      ldpp_dout(dpp, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-NoMatch: " << if_nomatch_str << dendl;
+      if (if_nomatch_str.compare(0, etag.length(), etag.c_str(), etag.length()) == 0) {
+        return -ERR_NOT_MODIFIED;
+      }
+    }
+  }
+
+  if (params.obj_size)
+    *params.obj_size = astate->size;
+  if (params.lastmod)
+    *params.lastmod = astate->mtime;
+
+  return 0;
+}
+
+/*
+ * Translate a caller-supplied byte range into [ofs, end] within an object
+ * of size obj_size.  A negative ofs means "the last -ofs bytes"; a
+ * negative end means "through the last byte".  Returns -ERANGE when ofs
+ * lies at or beyond the end of a non-empty object, otherwise 0.
+ */
+int RGWRados::Object::Read::range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end)
+{
+  if (ofs < 0) {
+    // suffix range: count backwards from the end of the object
+    ofs += obj_size;
+    if (ofs < 0) {
+      ofs = 0;
+    }
+    end = obj_size - 1;
+  } else if (end < 0) {
+    // open-ended range: read through the last byte
+    end = obj_size - 1;
+  }
+
+  if (obj_size == 0) {
+    return 0; // nothing to clamp against
+  }
+
+  if (ofs >= (off_t)obj_size) {
+    return -ERANGE;
+  }
+  if (end >= (off_t)obj_size) {
+    end = obj_size - 1;
+  }
+  return 0;
+}
+
+/*
+ * Run 'call' against this update's bucket shard, retrying when the shard
+ * reports -ERR_BUSY_RESHARDING: block until the reshard finishes,
+ * invalidate the cached shard so it is re-resolved against the new
+ * layout, and retry (the retry budget is reset after each completed
+ * reshard).  On success, *pbs (if non-null) receives the shard used.
+ */
+int RGWRados::Bucket::UpdateIndex::guard_reshard(const DoutPrefixProvider *dpp, const rgw_obj& obj_instance, BucketShard **pbs, std::function<int(BucketShard *)> call)
+{
+  RGWRados *store = target->get_store();
+  BucketShard *bs = nullptr;
+  int r;
+
+#define NUM_RESHARD_RETRIES 10
+  for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
+    int ret = get_bucket_shard(&bs, dpp);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to get BucketShard object. obj=" <<
+	obj_instance.key << ". ret=" << ret << dendl;
+      return ret;
+    }
+
+    r = call(bs);
+    if (r != -ERR_BUSY_RESHARDING) {
+      break;
+    }
+
+    ldpp_dout(dpp, 10) <<
+      "NOTICE: resharding operation on bucket index detected, blocking. obj=" <<
+      obj_instance.key << dendl;
+
+    r = store->block_while_resharding(bs, obj_instance, target->bucket_info, null_yield, dpp);
+    if (r == -ERR_BUSY_RESHARDING) {
+      ldpp_dout(dpp, 10) << __func__ <<
+	" NOTICE: block_while_resharding() still busy. obj=" <<
+	obj_instance.key << dendl;
+      continue;
+    } else if (r < 0) {
+      ldpp_dout(dpp, 0) << __func__ <<
+	" ERROR: block_while_resharding() failed. obj=" <<
+	obj_instance.key << ". ret=" << cpp_strerror(-r) << dendl;
+      return r;
+    }
+
+    ldpp_dout(dpp, 20) << "reshard completion identified. obj=" << obj_instance.key << dendl;
+    i = 0; /* resharding is finished, make sure we can retry */
+    invalidate_bs(); // shard mapping changed; force a fresh lookup
+  } // for loop
+
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: bucket shard callback failed. obj=" <<
+      obj_instance.key << ". ret=" << cpp_strerror(-r) << dendl;
+    return r;
+  }
+
+  if (pbs) {
+    *pbs = bs;
+  }
+
+  return 0;
+}
+
+/*
+ * First phase of the two-phase bucket index update: log a pending
+ * operation for 'obj' in the bucket index, tagged with write_tag (or a
+ * random optag) so the matching complete()/cancel() can be correlated.
+ */
+int RGWRados::Bucket::UpdateIndex::prepare(const DoutPrefixProvider *dpp, RGWModifyOp op, const string *write_tag, optional_yield y)
+{
+  if (blind) {
+    return 0; // indexless bucket: nothing to prepare
+  }
+  RGWRados *store = target->get_store();
+
+  if (write_tag && write_tag->length()) {
+    optag = string(write_tag->c_str(), write_tag->length());
+  } else {
+    if (optag.empty()) {
+      append_rand_alpha(store->ctx(), optag, optag, 32);
+    }
+  }
+
+  int r = guard_reshard(dpp, obj, nullptr, [&](BucketShard *bs) -> int {
+    return store->cls_obj_prepare_op(dpp, *bs, op, optag, obj, bilog_flags, y, zones_trace);
+  });
+
+  if (r < 0) {
+    return r;
+  }
+  prepared = true;
+
+  return 0;
+}
+
+/*
+ * Second phase of the index update for a successful write: record the
+ * object's final metadata (size, mtime, etag, owner, content type, ...)
+ * in the bucket index entry and append a data-log entry for the shard.
+ */
+int RGWRados::Bucket::UpdateIndex::complete(const DoutPrefixProvider *dpp, int64_t poolid, uint64_t epoch,
+                                            uint64_t size, uint64_t accounted_size,
+                                            ceph::real_time& ut, const string& etag,
+                                            const string& content_type, const string& storage_class,
+                                            bufferlist *acl_bl,
+                                            RGWObjCategory category,
+                                            list<rgw_obj_index_key> *remove_objs,
+                                            optional_yield y,
+                                            const string *user_data,
+                                            bool appendable)
+{
+  if (blind) {
+    return 0; // indexless bucket: nothing to complete
+  }
+  RGWRados *store = target->get_store();
+  BucketShard *bs = nullptr;
+
+  int ret = get_bucket_shard(&bs, dpp);
+  if (ret < 0) {
+    ldpp_dout(dpp, 5) << "failed to get BucketShard object: ret=" << ret << dendl;
+    return ret;
+  }
+
+  rgw_bucket_dir_entry ent;
+  obj.key.get_index_key(&ent.key);
+  ent.meta.size = size;
+  ent.meta.accounted_size = accounted_size;
+  ent.meta.mtime = ut;
+  ent.meta.etag = etag;
+  ent.meta.storage_class = storage_class;
+  if (user_data)
+    ent.meta.user_data = *user_data;
+
+  ACLOwner owner;
+  if (acl_bl && acl_bl->length()) {
+    // decode failure is only logged (note the shadowed 'ret'); the entry
+    // then falls back to a default-constructed owner
+    int ret = store->decode_policy(dpp, *acl_bl, &owner);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "WARNING: could not decode policy ret=" << ret << dendl;
+    }
+  }
+  ent.meta.owner = owner.get_id().to_str();
+  ent.meta.owner_display_name = owner.get_display_name();
+  ent.meta.content_type = content_type;
+  ent.meta.appendable = appendable;
+
+  ret = store->cls_obj_complete_add(*bs, obj, optag, poolid, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
+
+  add_datalog_entry(dpp, store->svc.datalog_rados,
+		    target->bucket_info, bs->shard_id, y);
+
+  return ret;
+}
+
+/*
+ * Second phase of the index update for a completed delete: mark the
+ * index entry removed and append a data-log entry for the shard.
+ */
+int RGWRados::Bucket::UpdateIndex::complete_del(const DoutPrefixProvider *dpp,
+                                                int64_t poolid, uint64_t epoch,
+                                                real_time& removed_mtime,
+                                                list<rgw_obj_index_key> *remove_objs,
+                                                optional_yield y)
+{
+  if (blind) {
+    return 0; // indexless bucket: nothing to complete
+  }
+  RGWRados *store = target->get_store();
+  BucketShard *bs = nullptr;
+
+  int ret = get_bucket_shard(&bs, dpp);
+  if (ret < 0) {
+    ldpp_dout(dpp, 5) << "failed to get BucketShard object: ret=" << ret << dendl;
+    return ret;
+  }
+
+  ret = store->cls_obj_complete_del(*bs, optag, poolid, epoch, obj, removed_mtime, remove_objs, bilog_flags, zones_trace);
+
+  add_datalog_entry(dpp, store->svc.datalog_rados,
+		    target->bucket_info, bs->shard_id, y);
+
+  return ret;
+}
+
+
+/*
+ * Cancel a previously prepared index operation.  Even on cancel we still
+ * append a data-log entry so that log followers advance their markers.
+ */
+int RGWRados::Bucket::UpdateIndex::cancel(const DoutPrefixProvider *dpp,
+                                          list<rgw_obj_index_key> *remove_objs,
+                                          optional_yield y)
+{
+  if (blind) {
+    return 0; // indexless bucket: nothing to cancel
+  }
+  RGWRados *store = target->get_store();
+  // must be initialized: guard_reshard() only sets *pbs on success, so a
+  // failure path would otherwise leave bs uninitialized and the
+  // bs->shard_id dereference below would be undefined behavior
+  BucketShard *bs = nullptr;
+
+  int ret = guard_reshard(dpp, obj, &bs, [&](BucketShard *bs) -> int {
+    return store->cls_obj_complete_cancel(*bs, optag, obj, remove_objs, bilog_flags, zones_trace);
+  });
+
+  /*
+   * need to update data log anyhow, so that whoever follows needs to update its internal markers
+   * for following the specific bucket shard log. Otherwise they end up staying behind, and users
+   * have no way to tell that they're all caught up
+   */
+  if (bs) { // only known when guard_reshard() resolved the shard
+    add_datalog_entry(dpp, store->svc.datalog_rados,
+		      target->bucket_info, bs->shard_id, y);
+  }
+
+  return ret;
+}
+
+/*
+ * Read up through index `end` inclusive. Number of bytes read is up
+ * to `end - ofs + 1`, further clamped to the object size, the containing
+ * stripe, and the pool's max chunk size -- callers loop for more data.
+ * Returns the number of bytes appended to `bl`, or a negative error.
+ */
+int RGWRados::Object::Read::read(int64_t ofs, int64_t end,
+                                 bufferlist& bl, optional_yield y,
+                                 const DoutPrefixProvider *dpp)
+{
+  RGWRados *store = source->get_store();
+
+  rgw_raw_obj read_obj;
+  uint64_t read_ofs = ofs;
+  uint64_t len, read_len;
+  bool reading_from_head = true;
+  ObjectReadOperation op;
+
+  bool merge_bl = false;
+  bufferlist *pbl = &bl;
+  bufferlist read_bl;
+  uint64_t max_chunk_size;
+
+  RGWObjState *astate;
+  RGWObjManifest *manifest = nullptr;
+  int r = source->get_state(dpp, &astate, &manifest, true, y);
+  if (r < 0)
+    return r;
+
+  // clamp the requested range to the actual object size
+  if (astate->size == 0) {
+    end = 0;
+  } else if (end >= (int64_t)astate->size) {
+    end = astate->size - 1;
+  }
+
+  if (end < 0)
+    len = 0;
+  else
+    len = end - ofs + 1;
+
+  if (manifest && manifest->has_tail()) {
+    /* now get the relevant object part */
+    RGWObjManifest::obj_iterator iter = manifest->obj_find(dpp, ofs);
+
+    uint64_t stripe_ofs = iter.get_stripe_ofs();
+    read_obj = iter.get_location().get_raw_obj(store);
+    len = std::min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
+    read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
+    reading_from_head = (read_obj == state.head_obj);
+  } else {
+    read_obj = state.head_obj;
+  }
+
+  r = store->get_max_chunk_size(read_obj.pool, &max_chunk_size, dpp);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to get max_chunk_size() for pool " << read_obj.pool << dendl;
+    return r;
+  }
+
+  if (len > max_chunk_size)
+    len = max_chunk_size;
+
+
+  read_len = len;
+
+  if (reading_from_head) {
+    /* only when reading from the head object do we need to do the atomic test */
+    r = store->append_atomic_test(dpp, &source->get_ctx(), source->get_bucket_info(), state.obj, op, &astate, &manifest, y);
+    if (r < 0)
+      return r;
+
+    // serve as much as possible from data prefetched with the state
+    if (astate && astate->prefetch_data) {
+      if (!ofs && astate->data.length() >= len) {
+        bl = astate->data;
+        return bl.length();
+      }
+
+      if (ofs < astate->data.length()) {
+        unsigned copy_len = std::min((uint64_t)astate->data.length() - ofs, len);
+        astate->data.begin(ofs).copy(copy_len, bl);
+        read_len -= copy_len;
+        read_ofs += copy_len;
+        if (!read_len)
+          return bl.length();
+
+        // remainder goes into read_bl and is appended after the rados read
+        merge_bl = true;
+        pbl = &read_bl;
+      }
+    }
+  }
+
+  ldpp_dout(dpp, 20) << "rados->read obj-ofs=" << ofs << " read_ofs=" << read_ofs << " read_len=" << read_len << dendl;
+  op.read(read_ofs, read_len, pbl, NULL);
+
+  // switch/cache the ioctx when the target lives in a different pool
+  if (state.cur_pool != read_obj.pool) {
+    auto iter = state.io_ctxs.find(read_obj.pool);
+    if (iter == state.io_ctxs.end()) {
+      state.cur_ioctx = &state.io_ctxs[read_obj.pool];
+      r = store->open_pool_ctx(dpp, read_obj.pool, *state.cur_ioctx, false, true);
+      if (r < 0) {
+        ldpp_dout(dpp, 20) << "ERROR: failed to open pool context for pool=" << read_obj.pool << " r=" << r << dendl;
+        return r;
+      }
+    } else {
+      state.cur_ioctx = &iter->second;
+    }
+    state.cur_pool = read_obj.pool;
+  }
+
+  state.cur_ioctx->locator_set_key(read_obj.loc);
+
+  r = state.cur_ioctx->operate(read_obj.oid, &op, NULL);
+  ldpp_dout(dpp, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
+
+  if (r < 0) {
+    return r;
+  }
+
+  if (merge_bl) {
+    bl.append(read_bl);
+  }
+
+  return bl.length();
+}
+
+// Deliver completed AIO reads to the client callback in offset order,
+// optionally populating the D3N data cache.  'results' are merged into
+// the sorted 'completed' list; chunks contiguous from 'offset' are handed
+// to client_cb.  Returns 0 on success or the first error encountered.
+// (The former 'bl_list' accumulator was dead code -- it was appended to
+// but never read, copying every bufferlist once -- and has been removed.)
+int get_obj_data::flush(rgw::AioResultList&& results) {
+  int r = rgw::check_for_errors(results);
+  if (r < 0) {
+    return r;
+  }
+
+  auto cmp = [](const auto& lhs, const auto& rhs) { return lhs.id < rhs.id; };
+  results.sort(cmp); // merge() requires results to be sorted first
+  completed.merge(results, cmp); // merge results in sorted order
+
+  while (!completed.empty() && completed.front().id == offset) {
+    auto bl = std::move(completed.front().data);
+
+    offset += bl.length();
+    int r = client_cb->handle_data(bl, 0, bl.length());
+    if (r < 0) {
+      return r;
+    }
+
+    if (rgwrados->get_use_datacache()) {
+      const std::lock_guard l(d3n_get_data.d3n_lock);
+      auto oid = completed.front().obj.get_ref().obj.oid;
+      if (bl.length() <= g_conf()->rgw_get_obj_max_req_size && !d3n_bypass_cache_write) {
+        lsubdout(g_ceph_context, rgw_datacache, 10) << "D3nDataCache: " << __func__ << "(): bl.length <= rgw_get_obj_max_req_size (default 4MB) - write to datacache, bl.length=" << bl.length() << dendl;
+        rgwrados->d3n_data_cache->put(bl, bl.length(), oid);
+      } else {
+        lsubdout(g_ceph_context, rgw_datacache, 10) << "D3nDataCache: " << __func__ << "(): not writing to datacache - bl.length > rgw_get_obj_max_req_size (default 4MB), bl.length=" << bl.length() << " or d3n_bypass_cache_write=" << d3n_bypass_cache_write << dendl;
+      }
+    }
+    completed.pop_front_and_dispose(std::default_delete<rgw::AioResultEntry>{});
+  }
+  return 0;
+}
+
+// Trampoline: iterate_obj() takes a C-style callback, so unpack the
+// get_obj_data context from 'arg' and forward to the member function.
+static int _get_obj_iterate_cb(const DoutPrefixProvider *dpp,
+                               const rgw_raw_obj& read_obj, off_t obj_ofs,
+                               off_t read_ofs, off_t len, bool is_head_obj,
+                               RGWObjState *astate, void *arg)
+{
+  auto* data = static_cast<get_obj_data*>(arg);
+  return data->rgwrados->get_obj_iterate_cb(dpp, read_obj, obj_ofs, read_ofs,
+                                            len, is_head_obj, astate, arg);
+}
+
+/*
+ * Issue one asynchronous ranged read as part of iterate_obj().  For the
+ * head object, install the atomic guard and serve any prefetched data
+ * synchronously first; the remaining bytes are read through the aio
+ * throttle and flushed to the client callback in offset order.
+ */
+int RGWRados::get_obj_iterate_cb(const DoutPrefixProvider *dpp,
+                                 const rgw_raw_obj& read_obj, off_t obj_ofs,
+                                 off_t read_ofs, off_t len, bool is_head_obj,
+                                 RGWObjState *astate, void *arg)
+{
+  ObjectReadOperation op;
+  struct get_obj_data* d = static_cast<struct get_obj_data*>(arg);
+  string oid, key;
+
+  if (is_head_obj) {
+    /* only when reading from the head object do we need to do the atomic test */
+    int r = append_atomic_test(dpp, astate, op);
+    if (r < 0)
+      return r;
+
+    // hand prefetched head data straight to the client callback
+    if (astate &&
+        obj_ofs < astate->data.length()) {
+      unsigned chunk_len = std::min((uint64_t)astate->data.length() - obj_ofs, (uint64_t)len);
+
+      r = d->client_cb->handle_data(astate->data, obj_ofs, chunk_len);
+      if (r < 0)
+        return r;
+
+      len -= chunk_len;
+      d->offset += chunk_len;
+      read_ofs += chunk_len;
+      obj_ofs += chunk_len;
+      if (!len)
+        return 0;
+    }
+  }
+
+  auto obj = d->rgwrados->svc.rados->obj(read_obj);
+  int r = obj.open(dpp);
+  if (r < 0) {
+    ldpp_dout(dpp, 4) << "failed to open rados context for " << read_obj << dendl;
+    return r;
+  }
+
+  ldpp_dout(dpp, 20) << "rados->get_obj_iterate_cb oid=" << read_obj.oid << " obj-ofs=" << obj_ofs << " read_ofs=" << read_ofs << " len=" << len << dendl;
+  op.read(read_ofs, len, nullptr, nullptr);
+
+  const uint64_t cost = len;
+  const uint64_t id = obj_ofs; // use logical object offset for sorting replies
+
+  auto completed = d->aio->get(obj, rgw::Aio::librados_op(std::move(op), d->yield), cost, id);
+
+  return d->flush(std::move(completed));
+}
+
+/*
+ * Stream the byte range [ofs, end] to 'cb' using a window of throttled
+ * asynchronous chunk reads; drain() waits for all outstanding reads
+ * before returning.
+ */
+int RGWRados::Object::Read::iterate(const DoutPrefixProvider *dpp, int64_t ofs, int64_t end, RGWGetDataCB *cb,
+                                    optional_yield y)
+{
+  RGWRados *store = source->get_store();
+  CephContext *cct = store->ctx();
+  const uint64_t chunk_size = cct->_conf->rgw_get_obj_max_req_size;
+  const uint64_t window_size = cct->_conf->rgw_get_obj_window_size;
+
+  auto aio = rgw::make_throttle(window_size, y);
+  get_obj_data data(store, cb, &*aio, ofs, y);
+
+  int r = store->iterate_obj(dpp, source->get_ctx(), source->get_bucket_info(), state.obj,
+                             ofs, end, chunk_size, _get_obj_iterate_cb, &data, y);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "iterate_obj() failed with " << r << dendl;
+    data.cancel(); // drain completions without writing back to client
+    return r;
+  }
+
+  return data.drain();
+}
+
+/*
+ * Walk the object's stripes (via the manifest, when present) and invoke
+ * 'cb' for each chunk of at most max_chunk_size bytes covering the range
+ * [ofs, end].  Without a manifest, the whole range is read from the head
+ * object in max_chunk_size pieces.
+ */
+int RGWRados::iterate_obj(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx,
+                          RGWBucketInfo& bucket_info, const rgw_obj& obj,
+                          off_t ofs, off_t end, uint64_t max_chunk_size,
+                          iterate_obj_cb cb, void *arg, optional_yield y)
+{
+  rgw_raw_obj head_obj;
+  rgw_raw_obj read_obj;
+  uint64_t read_ofs = ofs;
+  uint64_t len;
+  bool reading_from_head = true;
+  RGWObjState *astate = NULL;
+  RGWObjManifest *manifest = nullptr;
+
+  obj_to_raw(bucket_info.placement_rule, obj, &head_obj);
+
+  int r = get_obj_state(dpp, &obj_ctx, bucket_info, obj, &astate, &manifest, false, y);
+  if (r < 0) {
+    return r;
+  }
+
+  if (end < 0)
+    len = 0;
+  else
+    len = end - ofs + 1;
+
+  if (manifest) {
+    /* now get the relevant object stripe */
+    RGWObjManifest::obj_iterator iter = manifest->obj_find(dpp, ofs);
+
+    RGWObjManifest::obj_iterator obj_end = manifest->obj_end(dpp);
+
+    for (; iter != obj_end && ofs <= end; ++iter) {
+      off_t stripe_ofs = iter.get_stripe_ofs();
+      off_t next_stripe_ofs = stripe_ofs + iter.get_stripe_size();
+
+      // emit max_chunk_size-bounded chunks until this stripe is consumed
+      while (ofs < next_stripe_ofs && ofs <= end) {
+        read_obj = iter.get_location().get_raw_obj(this);
+        uint64_t read_len = std::min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
+        read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
+
+        if (read_len > max_chunk_size) {
+          read_len = max_chunk_size;
+        }
+
+        reading_from_head = (read_obj == head_obj);
+        r = cb(dpp, read_obj, ofs, read_ofs, read_len, reading_from_head, astate, arg);
+	if (r < 0) {
+	  return r;
+        }
+
+	len -= read_len;
+	ofs += read_len;
+      }
+    }
+  } else {
+    // no manifest: all the data lives in the head object
+    while (ofs <= end) {
+      read_obj = head_obj;
+      uint64_t read_len = std::min(len, max_chunk_size);
+
+      r = cb(dpp, read_obj, ofs, ofs, read_len, reading_from_head, astate, arg);
+      if (r < 0) {
+	return r;
+      }
+
+      len -= read_len;
+      ofs += read_len;
+    }
+  }
+
+  return 0;
+}
+
+// Resolve the object's head rados ref and apply a write operation to it.
+int RGWRados::obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectWriteOperation *op)
+{
+  rgw_rados_ref ref;
+  if (int r = get_obj_head_ref(dpp, bucket_info, obj, &ref); r < 0) {
+    return r;
+  }
+  return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, op, null_yield);
+}
+
+// Resolve the object's head rados ref and apply a read operation to it;
+// the raw output bufferlist is discarded (op-level out params still apply).
+int RGWRados::obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectReadOperation *op)
+{
+  rgw_rados_ref ref;
+  if (int r = get_obj_head_ref(dpp, bucket_info, obj, &ref); r < 0) {
+    return r;
+  }
+
+  bufferlist outbl;
+  return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, op, &outbl, null_yield);
+}
+
+/*
+ * Best-effort rollback after a failed OLH modification: remove the
+ * pending-op xattr added by olh_init_modification(), and, if the OLH
+ * carries no info attr and no remaining pending ops, remove the OLH
+ * object itself.  Errors are logged and swallowed by design (void
+ * return); a debug config option can simulate the failure path.
+ */
+void RGWRados::olh_cancel_modification(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info,
+                                       RGWObjState& state, const rgw_obj& olh_obj,
+                                       const std::string& op_tag, optional_yield y)
+{
+  if (cct->_conf->rgw_debug_inject_olh_cancel_modification_err) {
+    // simulate the scenario where we fail to remove the pending xattr
+    return;
+  }
+
+  rgw_rados_ref ref;
+  int r = get_obj_head_ref(dpp, bucket_info, olh_obj, &ref);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << __func__ << " target_obj=" << olh_obj << " get_obj_head_ref() returned " << r << dendl;
+    return;
+  }
+  string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
+  attr_name.append(op_tag);
+
+  // first remove the relevant pending prefix
+  ObjectWriteOperation op;
+  bucket_index_guard_olh_op(dpp, state, op); // guard on the olh tag
+  op.rmxattr(attr_name.c_str());
+  r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, y);
+  if (r < 0) {
+    // ENOENT/ECANCELED mean the object is gone or the tag changed; both
+    // are expected races and not worth logging
+    if (r != -ENOENT && r != -ECANCELED) {
+      ldpp_dout(dpp, 0) << __func__ << " target_obj=" << olh_obj << " rmxattr rgw_rados_operate() returned " << r << dendl;
+    }
+    return;
+  }
+
+  if (auto iter = state.attrset.find(RGW_ATTR_OLH_INFO); iter == state.attrset.end()) {
+    // attempt to remove the OLH object if there are no pending ops,
+    // its olh info attr is empty, and its tag hasn't changed
+    ObjectWriteOperation rm_op;
+    bucket_index_guard_olh_op(dpp, state, rm_op);
+    rm_op.cmpxattr(RGW_ATTR_OLH_INFO, CEPH_OSD_CMPXATTR_OP_EQ, bufferlist());
+    cls_obj_check_prefix_exist(rm_op, RGW_ATTR_OLH_PENDING_PREFIX, true);
+    rm_op.remove();
+    r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &rm_op, y);
+  }
+  if (r < 0 && (r != -ENOENT && r != -ECANCELED)) {
+    ldpp_dout(dpp, 0) << __func__ << " target_obj=" << olh_obj << " olh rm rgw_rados_operate() returned " << r << dendl;
+  }
+}
+
+/*
+ * Stamp the OLH object (creating it if necessary) with id/olh tags and a
+ * time-ordered "pending operation" xattr; *op_tag receives the pending
+ * tag so the caller can later complete or cancel the modification.  The
+ * cached 'state' is updated to reflect what was written.
+ */
+int RGWRados::olh_init_modification_impl(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, string *op_tag)
+{
+  ObjectWriteOperation op;
+
+  // the OLH is always the instance-less form of the key
+  ceph_assert(olh_obj.key.instance.empty());
+
+  bool has_tag = (state.exists && has_olh_tag(state.attrset));
+
+  if (!state.exists) {
+    op.create(true); // exclusive create: racing creators get -EEXIST
+  } else {
+    op.assert_exists();
+    struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
+    op.mtime2(&mtime_ts); // preserve the existing mtime
+  }
+
+  /*
+   * 3 possible cases: olh object doesn't exist, it exists as an olh, it exists as a regular object.
+   * If it exists as a regular object we'll need to transform it into an olh. We'll do it in two
+   * steps, first change its tag and set the olh pending attrs. Once write is done we'll need to
+   * truncate it, remove extra attrs, and send it to the garbage collection. The bucket index olh
+   * log will reflect that.
+   *
+   * Need to generate separate olh and obj tags, as olh can be colocated with object data. obj_tag
+   * is used for object data instance, olh_tag for olh instance.
+   */
+  if (has_tag) {
+    /* guard against racing writes */
+    bucket_index_guard_olh_op(dpp, state, op);
+  } else if (state.exists) {
+    // This is the case where a null versioned object already exists for this key
+    // but it hasn't been initialized as an OLH object yet. We immediately add
+    // the RGW_ATTR_OLH_INFO attr so that the OLH points back to itself and
+    // therefore effectively makes this an unobservable modification.
+    op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, bufferlist());
+    RGWOLHInfo info;
+    info.target = olh_obj;
+    info.removed = false;
+    bufferlist bl;
+    encode(info, bl);
+    op.setxattr(RGW_ATTR_OLH_INFO, bl);
+  }
+
+  if (!has_tag) {
+    /* obj tag */
+    string obj_tag = gen_rand_alphanumeric_lower(cct, 32);
+
+    bufferlist bl;
+    bl.append(obj_tag.c_str(), obj_tag.size());
+    op.setxattr(RGW_ATTR_ID_TAG, bl);
+
+    state.attrset[RGW_ATTR_ID_TAG] = bl;
+    state.obj_tag = bl;
+
+    /* olh tag */
+    string olh_tag = gen_rand_alphanumeric_lower(cct, 32);
+
+    bufferlist olh_bl;
+    olh_bl.append(olh_tag.c_str(), olh_tag.size());
+    op.setxattr(RGW_ATTR_OLH_ID_TAG, olh_bl);
+
+    state.attrset[RGW_ATTR_OLH_ID_TAG] = olh_bl;
+    state.olh_tag = olh_bl;
+    state.is_olh = true;
+
+    // empty version attr; maintained by the bucket index olh log
+    bufferlist verbl;
+    op.setxattr(RGW_ATTR_OLH_VER, verbl);
+  }
+
+  bufferlist bl;
+  RGWOLHPendingInfo pending_info;
+  pending_info.time = real_clock::now();
+  encode(pending_info, bl);
+
+#define OLH_PENDING_TAG_LEN 32
+  /* tag will start with current time epoch, this so that entries are sorted by time */
+  char buf[32];
+  utime_t ut(pending_info.time);
+  snprintf(buf, sizeof(buf), "%016llx", (unsigned long long)ut.sec());
+  *op_tag = buf;
+
+  // pad the tag to OLH_PENDING_TAG_LEN with random characters
+  string s = gen_rand_alphanumeric_lower(cct, OLH_PENDING_TAG_LEN - op_tag->size());
+
+  op_tag->append(s);
+
+  string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
+  attr_name.append(*op_tag);
+
+  op.setxattr(attr_name.c_str(), bl);
+
+  int ret = obj_operate(dpp, bucket_info, olh_obj, &op);
+  if (ret < 0) {
+    return ret;
+  }
+
+  state.exists = true;
+  state.attrset[attr_name] = bl;
+
+  return 0;
+}
+
+// Wrapper over olh_init_modification_impl() that maps a racing exclusive
+// create (-EEXIST) to -ECANCELED so callers retry the OLH update.
+int RGWRados::olh_init_modification(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj, string *op_tag)
+{
+  const int ret = olh_init_modification_impl(dpp, bucket_info, state, obj, op_tag);
+  return (ret == -EEXIST) ? -ECANCELED : ret;
+}
+
+/*
+ * Run 'call' against a freshly initialized bucket shard, retrying when
+ * the shard reports -ERR_BUSY_RESHARDING: block until the reshard
+ * finishes, then retry with a re-initialized shard (the retry budget is
+ * reset after each completed reshard).  Unlike the UpdateIndex variant,
+ * bs is re-init()ed on every loop iteration rather than cached.
+ */
+int RGWRados::guard_reshard(const DoutPrefixProvider *dpp,
+                            BucketShard *bs,
+                            const rgw_obj& obj_instance,
+                            RGWBucketInfo& bucket_info,
+                            std::function<int(BucketShard *)> call)
+{
+  rgw_obj obj;
+  const rgw_obj *pobj = &obj_instance;
+  int r;
+
+  for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
+    r = bs->init(pobj->bucket, *pobj, nullptr /* no RGWBucketInfo */, dpp);
+    if (r < 0) {
+      ldpp_dout(dpp, 5) << "bs.init() returned ret=" << r << dendl;
+      return r;
+    }
+
+    r = call(bs);
+    if (r != -ERR_BUSY_RESHARDING) {
+      break;
+    }
+
+    ldpp_dout(dpp, 10) <<
+      "NOTICE: resharding operation on bucket index detected, blocking. obj=" <<
+      obj_instance.key << dendl;
+
+    r = block_while_resharding(bs, obj_instance, bucket_info, null_yield, dpp);
+    if (r == -ERR_BUSY_RESHARDING) {
+      ldpp_dout(dpp, 10) << __func__ <<
+	" NOTICE: block_while_resharding() still busy. obj=" <<
+	obj_instance.key << dendl;
+      continue;
+    } else if (r < 0) {
+      ldpp_dout(dpp, 0) << __func__ <<
+	" ERROR: block_while_resharding() failed. obj=" <<
+	obj_instance.key << ". ret=" << cpp_strerror(-r) << dendl;
+      return r;
+    }
+
+    ldpp_dout(dpp, 20) << "reshard completion identified" << dendl;
+    i = 0; /* resharding is finished, make sure we can retry */
+  } // for loop
+
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: bucket shard callback failed. obj=" <<
+      obj_instance.key << ". ret=" << cpp_strerror(-r) << dendl;
+    return r;
+  }
+
+  return 0;
+}
+
+
+int RGWRados::block_while_resharding(RGWRados::BucketShard *bs,
+ const rgw_obj& obj_instance,
+ RGWBucketInfo& bucket_info,
+ optional_yield y,
+ const DoutPrefixProvider *dpp)
+{
+ int ret = 0;
+ cls_rgw_bucket_instance_entry entry;
+
+ // gets loaded by fetch_new_bucket_info; can be used by
+ // clear_resharding
+ std::map<std::string, bufferlist> bucket_attrs;
+
+ // since we want to run this recovery code from two distinct places,
+ // let's just put it in a lambda so we can easily re-use; if the
+ // lambda successfully fetches a new bucket id, it sets
+ // new_bucket_id and returns 0, otherwise it returns a negative
+ // error code
+ auto fetch_new_bucket_info =
+ [this, bs, &obj_instance, &bucket_info, &bucket_attrs, &y, dpp](const std::string& log_tag) -> int {
+ int ret = get_bucket_info(&svc, bs->bucket.tenant, bs->bucket.name,
+ bucket_info, nullptr, y, dpp, &bucket_attrs);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ " ERROR: failed to refresh bucket info after reshard at " <<
+ log_tag << ": " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ ret = bs->init(dpp, bucket_info, obj_instance);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ " ERROR: failed to refresh bucket shard generation after reshard at " <<
+ log_tag << ": " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ const auto gen = bucket_info.layout.logs.empty() ? -1 : bucket_info.layout.logs.back().gen;
+ ldpp_dout(dpp, 20) << __func__ <<
+ " INFO: refreshed bucket info after reshard at " <<
+ log_tag << ". new shard_id=" << bs->shard_id << ". gen=" << gen << dendl;
+
+ return 0;
+ }; // lambda fetch_new_bucket_info
+
+ constexpr int num_retries = 10;
+ for (int i = 1; i <= num_retries; i++) { // nb: 1-based for loop
+ auto& ref = bs->bucket_obj.get_ref();
+ ret = cls_rgw_get_bucket_resharding(ref.pool.ioctx(), ref.obj.oid, &entry);
+ if (ret == -ENOENT) {
+ ret = fetch_new_bucket_info("get_bucket_resharding_failed");
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+ " failed to refresh bucket info after reshard when get bucket "
+ "resharding failed, error: " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+ } else if (ret < 0) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ " ERROR: failed to get bucket resharding : " << cpp_strerror(-ret) <<
+ dendl;
+ return ret;
+ }
+
+ if (!entry.resharding_in_progress()) {
+ ret = fetch_new_bucket_info("get_bucket_resharding_succeeded");
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+ " failed to refresh bucket info after reshard when get bucket "
+ "resharding succeeded, error: " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+ }
+
+ ldpp_dout(dpp, 20) << __func__ << " NOTICE: reshard still in progress; " <<
+ (i < num_retries ? "retrying" : "too many retries") << dendl;
+
+ if (i == num_retries) {
+ break;
+ }
+
+ // If bucket is erroneously marked as resharding (e.g., crash or
+ // other error) then fix it. If we can take the bucket reshard
+ // lock then it means no other resharding should be taking place,
+ // and we're free to clear the flags.
+ {
+ // since we expect to do this rarely, we'll do our work in a
+ // block and erase our work after each try
+
+ RGWObjectCtx obj_ctx(this->driver);
+ const rgw_bucket& b = bs->bucket;
+ std::string bucket_id = b.get_key();
+ RGWBucketReshardLock reshard_lock(this->driver, bucket_info, true);
+ ret = reshard_lock.lock(dpp);
+ if (ret == -ENOENT) {
+ continue;
+ } else if (ret < 0) {
+ ldpp_dout(dpp, 20) << __func__ <<
+ " ERROR: failed to take reshard lock for bucket " <<
+ bucket_id << "; expected if resharding underway" << dendl;
+ } else {
+ ldpp_dout(dpp, 10) << __func__ <<
+ " INFO: was able to take reshard lock for bucket " <<
+ bucket_id << dendl;
+ // the reshard may have finished, so call clear_resharding()
+ // with its current bucket info; ALSO this will load
+ // bucket_attrs for call to clear_resharding below
+ ret = fetch_new_bucket_info("trying_to_clear_resharding");
+ if (ret < 0) {
+ reshard_lock.unlock();
+ ldpp_dout(dpp, 0) << __func__ <<
+ " ERROR: failed to update bucket info before clear resharding for bucket " <<
+ bucket_id << dendl;
+ continue; // try again
+ }
+
+ ret = RGWBucketReshard::clear_resharding(this->driver, bucket_info, bucket_attrs, dpp);
+ reshard_lock.unlock();
+ if (ret == -ENOENT) {
+ ldpp_dout(dpp, 5) << __func__ <<
+ " INFO: no need to reset reshard flags; old shards apparently"
+ " removed after successful resharding of bucket " <<
+ bucket_id << dendl;
+ continue; // immediately test again
+ } else if (ret < 0) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ " ERROR: failed to clear resharding flags for bucket " <<
+ bucket_id << ", " << cpp_strerror(-ret) << dendl;
+ // wait and then test again
+ } else {
+ ldpp_dout(dpp, 5) << __func__ <<
+ " INFO: apparently successfully cleared resharding flags for "
+ "bucket " << bucket_id << dendl;
+ continue; // if we apparently succeed immediately test again
+ } // if clear resharding succeeded
+ } // if taking of lock succeeded
+ } // block to encapsulate recovery from incomplete reshard
+
+ ret = reshard_wait->wait(y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ " ERROR: bucket is still resharding, please retry" << dendl;
+ return ret;
+ }
+ } // for loop
+
+ ldpp_dout(dpp, 0) << __func__ <<
+ " ERROR: bucket is still resharding, please retry" << dendl;
+ return -ERR_BUSY_RESHARDING;
+}
+
// Record an OLH "link" (or delete-marker) entry for obj_instance in the
// bucket index, retrying through guard_reshard() if the index is being
// resharded. The index op is guarded against resharding and applied via
// cls_rgw_bucket_link_olh(); afterwards a datalog entry is written when
// log_data_change is set so sync peers notice the change.
int RGWRados::bucket_index_link_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info,
                                    RGWObjState& olh_state, const rgw_obj& obj_instance,
                                    bool delete_marker, const string& op_tag,
                                    struct rgw_bucket_dir_entry_meta *meta,
                                    uint64_t olh_epoch,
                                    real_time unmod_since, bool high_precision_time,
                                    optional_yield y,
                                    rgw_zone_set *_zones_trace, bool log_data_change)
{
  rgw_rados_ref ref;
  // resolve the head object's rados location
  int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
  if (r < 0) {
    return r;
  }

  // add the local zone to the trace so multisite sync does not replay
  // this change back to us
  rgw_zone_set zones_trace;
  if (_zones_trace) {
    zones_trace = *_zones_trace;
  }
  zones_trace.insert(svc.zone->get_zone().id, bucket_info.bucket.get_key());

  BucketShard bs(this);

  // guard_reshard() initializes bs for obj_instance and re-runs the
  // lambda if the shard reports -ERR_BUSY_RESHARDING
  r = guard_reshard(dpp, &bs, obj_instance, bucket_info,
		    [&](BucketShard *bs) -> int {
		      cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
		      auto& ref = bs->bucket_obj.get_ref();
		      librados::ObjectWriteOperation op;
		      op.assert_exists(); // bucket index shard must exist
		      cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
		      cls_rgw_bucket_link_olh(op, key, olh_state.olh_tag,
					      delete_marker, op_tag, meta, olh_epoch,
					      unmod_since, high_precision_time,
					      svc.zone->need_to_log_data(), zones_trace);
		      return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
                    });
  if (r < 0) {
    ldpp_dout(dpp, 20) << "rgw_rados_operate() after cls_rgw_bucket_link_olh() returned r=" << r << dendl;
    return r;
  }

  if (log_data_change) {
    add_datalog_entry(dpp, svc.datalog_rados, bucket_info, bs.shard_id, y);
  }

  return 0;
}
+
+void RGWRados::bucket_index_guard_olh_op(const DoutPrefixProvider *dpp, RGWObjState& olh_state, ObjectOperation& op)
+{
+ ldpp_dout(dpp, 20) << __func__ << "(): olh_state.olh_tag=" << string(olh_state.olh_tag.c_str(), olh_state.olh_tag.length()) << dendl;
+ op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_state.olh_tag);
+}
+
+int RGWRados::bucket_index_unlink_instance(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info,
+ const rgw_obj& obj_instance,
+ const string& op_tag, const string& olh_tag,
+ uint64_t olh_epoch, rgw_zone_set *_zones_trace)
+{
+ rgw_rados_ref ref;
+ int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ rgw_zone_set zones_trace;
+ if (_zones_trace) {
+ zones_trace = *_zones_trace;
+ }
+ zones_trace.insert(svc.zone->get_zone().id, bucket_info.bucket.get_key());
+
+ BucketShard bs(this);
+
+ cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
+ r = guard_reshard(dpp, &bs, obj_instance, bucket_info,
+ [&](BucketShard *bs) -> int {
+ auto& ref = bs->bucket_obj.get_ref();
+ librados::ObjectWriteOperation op;
+ op.assert_exists(); // bucket index shard must exist
+ cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
+ cls_rgw_bucket_unlink_instance(op, key, op_tag,
+ olh_tag, olh_epoch, svc.zone->need_to_log_data(), zones_trace);
+ return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
+ });
+ if (r < 0) {
+ ldpp_dout(dpp, 20) << "rgw_rados_operate() after cls_rgw_bucket_link_instance() returned r=" << r << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
// Read a page of the OLH (versioning) log for obj_instance from its
// bucket index shard, starting after ver_marker. On success *log holds
// the entries keyed by epoch and *is_truncated reports whether further
// pages remain.
int RGWRados::bucket_index_read_olh_log(const DoutPrefixProvider *dpp,
                                        RGWBucketInfo& bucket_info, RGWObjState& state,
                                        const rgw_obj& obj_instance, uint64_t ver_marker,
                                        std::map<uint64_t, std::vector<rgw_bucket_olh_log_entry> > *log,
                                        bool *is_truncated)
{
  rgw_rados_ref ref;
  // NOTE(review): the head ref is resolved here but never used below —
  // the read goes to the index shard object. Presumably this is only a
  // placement/existence check; confirm before removing.
  int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
  if (r < 0) {
    return r;
  }

  BucketShard bs(this);
  int ret =
    bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */, dpp);
  if (ret < 0) {
    ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
    return ret;
  }

  string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());

  // the OLH index entry is keyed by name only, with an empty instance
  cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());

  auto& shard_ref = bs.bucket_obj.get_ref();
  ObjectReadOperation op;

  rgw_cls_read_olh_log_ret log_ret;
  int op_ret = 0;
  cls_rgw_get_olh_log(op, key, ver_marker, olh_tag, log_ret, op_ret);
  bufferlist outbl;
  r = rgw_rados_operate(dpp, shard_ref.pool.ioctx(), shard_ref.obj.oid, &op, &outbl, null_yield);
  if (r < 0) {
    return r;
  }
  // op_ret carries the per-op result of the cls call itself
  if (op_ret < 0) {
    ldpp_dout(dpp, 20) << "cls_rgw_get_olh_log() returned op_ret=" << op_ret << dendl;
    return op_ret;
  }

  *log = std::move(log_ret.log);
  *is_truncated = log_ret.is_truncated;

  return 0;
}
+
+// a multisite sync bug resulted in the OLH head attributes being overwritten by
+// the attributes from another zone, causing link_olh() to fail endlessly due to
+// olh_tag mismatch. this attempts to detect this case and reconstruct the OLH
+// attributes from the bucket index. see http://tracker.ceph.com/issues/37792
int RGWRados::repair_olh(const DoutPrefixProvider *dpp, RGWObjState* state, const RGWBucketInfo& bucket_info,
                         const rgw_obj& obj)
{
  // fetch the current olh entry from the bucket index
  rgw_bucket_olh_entry olh;
  int r = bi_get_olh(dpp, bucket_info, obj, &olh);
  if (r < 0) {
    ldpp_dout(dpp, 0) << "repair_olh failed to read olh entry for " << obj << dendl;
    return r;
  }
  if (olh.tag == rgw_bl_str(state->olh_tag)) { // mismatch already resolved?
    return 0;
  }

  ldpp_dout(dpp, 4) << "repair_olh setting olh_tag=" << olh.tag
      << " key=" << olh.key << " delete_marker=" << olh.delete_marker << dendl;

  // rewrite OLH_ID_TAG and OLH_INFO from current olh
  ObjectWriteOperation op;
  // assert this is the same olh tag we think we're fixing; a concurrent
  // writer changing the tag will fail this op instead of being clobbered
  bucket_index_guard_olh_op(dpp, *state, op);
  // preserve existing mtime
  struct timespec mtime_ts = ceph::real_clock::to_timespec(state->mtime);
  op.mtime2(&mtime_ts);
  {
    // replace the head's olh id-tag with the tag from the bucket index
    bufferlist bl;
    bl.append(olh.tag.c_str(), olh.tag.size());
    op.setxattr(RGW_ATTR_OLH_ID_TAG, bl);
  }
  {
    // rebuild the olh info (current target + delete-marker flag) from
    // the bucket index entry
    RGWOLHInfo info;
    info.target = rgw_obj(bucket_info.bucket, olh.key);
    info.removed = olh.delete_marker;
    bufferlist bl;
    encode(info, bl);
    op.setxattr(RGW_ATTR_OLH_INFO, bl);
  }
  rgw_rados_ref ref;
  r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
  if (r < 0) {
    return r;
  }
  r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
  if (r < 0) {
    ldpp_dout(dpp, 0) << "repair_olh failed to write olh attributes with "
        << cpp_strerror(r) << dendl;
    return r;
  }
  return 0;
}
+
+int RGWRados::bucket_index_trim_olh_log(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info,
+ RGWObjState& state,
+ const rgw_obj& obj_instance, uint64_t ver)
+{
+ rgw_rados_ref ref;
+ int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ BucketShard bs(this);
+ int ret =
+ bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
+
+ cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
+
+ ret = guard_reshard(dpp, &bs, obj_instance, bucket_info,
+ [&](BucketShard *pbs) -> int {
+ ObjectWriteOperation op;
+ op.assert_exists(); // bucket index shard must exist
+ cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
+ cls_rgw_trim_olh_log(op, key, ver, olh_tag);
+ return pbs->bucket_obj.operate(dpp, &op, null_yield);
+ });
+ if (ret < 0) {
+ ldpp_dout(dpp, 20) << "cls_rgw_trim_olh_log() returned r=" << ret << dendl;
+ return ret;
+ }
+
+ return 0;
+}
+
+int RGWRados::bucket_index_clear_olh(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info,
+ const std::string& olh_tag,
+ const rgw_obj& obj_instance)
+{
+ rgw_rados_ref ref;
+ int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ BucketShard bs(this);
+
+ cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
+
+ int ret = guard_reshard(dpp, &bs, obj_instance, bucket_info,
+ [&](BucketShard *pbs) -> int {
+ ObjectWriteOperation op;
+ op.assert_exists(); // bucket index shard must exist
+ auto& ref = pbs->bucket_obj.get_ref();
+ cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
+ cls_rgw_clear_olh(op, key, olh_tag);
+ return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
+ });
+ if (ret < 0) {
+ ldpp_dout(dpp, 5) << "rgw_rados_operate() after cls_rgw_clear_olh() returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ return 0;
+}
+
+static int decode_olh_info(const DoutPrefixProvider *dpp, CephContext* cct, const bufferlist& bl, RGWOLHInfo *olh)
+{
+ try {
+ auto biter = bl.cbegin();
+ decode(*olh, biter);
+ return 0;
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to decode olh info" << dendl;
+ return -EIO;
+ }
+}
+
// Apply a page of OLH log entries to the OLH head object and the
// objects it governs: remove unlinked instances, update the head's
// target/delete-marker info, bump its version xattr, and finally either
// remove the head (if the log ends in an unlink) or trim the applied
// portion of the log. The whole head update is a single guarded write
// op so a concurrent change to the OLH tag/version cancels it.
//
// log       - entries keyed by epoch, as returned by
//             bucket_index_read_olh_log()
// plast_ver - set to the highest epoch present in `log`
int RGWRados::apply_olh_log(const DoutPrefixProvider *dpp,
			    RGWObjectCtx& obj_ctx,
			    RGWObjState& state,
			    RGWBucketInfo& bucket_info,
			    const rgw_obj& obj,
			    bufferlist& olh_tag,
			    std::map<uint64_t, std::vector<rgw_bucket_olh_log_entry> >& log,
			    uint64_t *plast_ver,
			    rgw_zone_set* zones_trace)
{
  if (log.empty()) {
    return 0;
  }

  librados::ObjectWriteOperation op;

  // highest epoch in this page of the log
  uint64_t last_ver = log.rbegin()->first;
  *plast_ver = last_ver;

  map<uint64_t, vector<rgw_bucket_olh_log_entry> >::iterator iter = log.begin();

  // guard the head update: fail with -ECANCELED if the OLH tag changed
  // or the head was already advanced past last_ver by another writer
  op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
  op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GTE, last_ver);

  // record the new version as a plain decimal string xattr
  bufferlist ver_bl;
  string last_ver_s = to_string(last_ver);
  ver_bl.append(last_ver_s.c_str(), last_ver_s.size());
  op.setxattr(RGW_ATTR_OLH_VER, ver_bl);

  // preserve the head object's mtime across the xattr rewrite
  struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
  op.mtime2(&mtime_ts);

  bool need_to_link = false;
  uint64_t link_epoch = 0;
  cls_rgw_obj_key key;
  bool delete_marker = false;
  list<cls_rgw_obj_key> remove_instances;
  bool need_to_remove = false;

  // decode current epoch and instance
  auto olh_ver = state.attrset.find(RGW_ATTR_OLH_VER);
  if (olh_ver != state.attrset.end()) {
    std::string str = olh_ver->second.to_str();
    std::string err;
    link_epoch = strict_strtoll(str.c_str(), 10, &err);
  }
  auto olh_info = state.attrset.find(RGW_ATTR_OLH_INFO);
  if (olh_info != state.attrset.end()) {
    RGWOLHInfo info;
    int r = decode_olh_info(dpp, cct, olh_info->second, &info);
    if (r < 0) {
      return r;
    }
    info.target.key.get_index_key(&key);
    delete_marker = info.removed;
  }

  // replay the log entries in epoch order, tracking the final
  // link/unlink outcome and collecting instances to delete
  for (iter = log.begin(); iter != log.end(); ++iter) {
    vector<rgw_bucket_olh_log_entry>::iterator viter = iter->second.begin();
    for (; viter != iter->second.end(); ++viter) {
      rgw_bucket_olh_log_entry& entry = *viter;

      ldpp_dout(dpp, 20) << "olh_log_entry: epoch=" << iter->first << " op=" << (int)entry.op
                     << " key=" << entry.key.name << "[" << entry.key.instance << "] "
                     << (entry.delete_marker ? "(delete)" : "") << dendl;
      switch (entry.op) {
      case CLS_RGW_OLH_OP_REMOVE_INSTANCE:
        remove_instances.push_back(entry.key);
        break;
      case CLS_RGW_OLH_OP_LINK_OLH:
        // only overwrite a link of the same epoch if its key sorts before
        if (link_epoch < iter->first || key.instance.empty() ||
            key.instance > entry.key.instance) {
          ldpp_dout(dpp, 20) << "apply_olh_log applying key=" << entry.key << " epoch=" << iter->first << " delete_marker=" << entry.delete_marker
              << " over current=" << key << " epoch=" << link_epoch << " delete_marker=" << delete_marker << dendl;
          need_to_link = true;
          need_to_remove = false;
          key = entry.key;
          delete_marker = entry.delete_marker;
        } else {
          ldpp_dout(dpp, 20) << "apply_olh skipping key=" << entry.key<< " epoch=" << iter->first << " delete_marker=" << entry.delete_marker
              << " before current=" << key << " epoch=" << link_epoch << " delete_marker=" << delete_marker << dendl;
        }
        break;
      case CLS_RGW_OLH_OP_UNLINK_OLH:
        need_to_remove = true;
        need_to_link = false;
        break;
      default:
        ldpp_dout(dpp, 0) << "ERROR: apply_olh_log: invalid op: " << (int)entry.op << dendl;
        return -EIO;
      }
      // drop the pending-change marker for this log entry from the head
      string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
      attr_name.append(entry.op_tag);
      op.rmxattr(attr_name.c_str());
    }
  }

  rgw_rados_ref ref;
  int r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
  if (r < 0) {
    return r;
  }

  const rgw_bucket& bucket = obj.bucket;

  if (need_to_link) {
    // record the new current version (or delete marker) in OLH_INFO
    rgw_obj target(bucket, key);
    RGWOLHInfo info;
    info.target = target;
    info.removed = delete_marker;
    bufferlist bl;
    encode(info, bl);
    op.setxattr(RGW_ATTR_OLH_INFO, bl);
  }

  /* first remove object instances */
  for (list<cls_rgw_obj_key>::iterator liter = remove_instances.begin();
       liter != remove_instances.end(); ++liter) {
    cls_rgw_obj_key& key = *liter;
    rgw_obj obj_instance(bucket, key);
    int ret = delete_obj(dpp, obj_ctx, bucket_info, obj_instance, 0, RGW_BILOG_FLAG_VERSIONED_OP, ceph::real_time(), zones_trace);
    if (ret < 0 && ret != -ENOENT) {
      ldpp_dout(dpp, 0) << "ERROR: delete_obj() returned " << ret << " obj_instance=" << obj_instance << dendl;
      return ret;
    }
  }

  /* update olh object */
  r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
  if (r < 0) {
    ldpp_dout(dpp, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
    return r;
  }


  if (need_to_remove) {
    // log ended in an unlink: remove the OLH head itself and its index
    // entries; -ECANCELED means we raced with another writer
    string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
    r = clear_olh(dpp, obj_ctx, obj, bucket_info, ref, olh_tag, last_ver, null_yield);
    if (r < 0 && r != -ECANCELED) {
      ldpp_dout(dpp, 0) << "ERROR: could not clear olh, r=" << r << dendl;
      return r;
    }
  } else {
    // otherwise trim the portion of the log we just applied
    r = bucket_index_trim_olh_log(dpp, bucket_info, state, obj, last_ver);
    if (r < 0 && r != -ECANCELED) {
      ldpp_dout(dpp, 0) << "ERROR: could not trim olh log, r=" << r << dendl;
      return r;
    }
  }

  return 0;
}
+
// Convenience overload of clear_olh(): resolve the head object's rados
// ref, then delegate to the variant that takes an explicit ref.
int RGWRados::clear_olh(const DoutPrefixProvider *dpp,
                        RGWObjectCtx& obj_ctx,
                        const rgw_obj& obj,
                        RGWBucketInfo& bucket_info,
                        const std::string& tag,
                        const uint64_t ver,
                        optional_yield y) {
  rgw_rados_ref head_ref;
  if (const int r = get_obj_head_ref(dpp, bucket_info, obj, &head_ref); r < 0) {
    return r;
  }
  return clear_olh(dpp, obj_ctx, obj, bucket_info, head_ref, tag, ver, y);
}
+
// Remove an OLH head object whose log has been fully applied, then
// clear its entries from the bucket index. The head removal is guarded
// on the OLH tag, the exact version, and the absence of any
// pending-change xattrs, so a concurrent writer yields -ECANCELED
// rather than losing its update.
int RGWRados::clear_olh(const DoutPrefixProvider *dpp,
                        RGWObjectCtx& obj_ctx,
                        const rgw_obj& obj,
                        RGWBucketInfo& bucket_info,
                        rgw_rados_ref& ref,
                        const std::string& tag,
                        const uint64_t ver,
                        optional_yield y) {
  ObjectWriteOperation rm_op;

  RGWObjManifest *manifest = nullptr;
  RGWObjState *s = nullptr;

  int r = get_obj_state(dpp, &obj_ctx, bucket_info, obj, &s, &manifest, false, y);
  if (r < 0) {
    return r;
  }
  // collect the pending-change xattrs currently on the head...
  map<string, bufferlist> pending_entries;
  rgw_filter_attrset(s->attrset, RGW_ATTR_OLH_PENDING_PREFIX, &pending_entries);

  // ...and remove the ones that have timed out, so stale markers from
  // dead writers don't block the removal below forever
  map<string, bufferlist> rm_pending_entries;
  check_pending_olh_entries(dpp, pending_entries, &rm_pending_entries);

  if (!rm_pending_entries.empty()) {
    r = remove_olh_pending_entries(dpp, bucket_info, *s, obj, rm_pending_entries);
    if (r < 0) {
      ldpp_dout(dpp, 0) << "ERROR: rm_pending_entries returned ret=" << r << dendl;
      return r;
    }
  }

  // guarded remove: tag must match, version must be exactly `ver`, and
  // no pending-change xattrs may remain
  bufferlist tag_bl;
  tag_bl.append(tag.c_str(), tag.length());
  rm_op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, tag_bl);
  rm_op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_EQ, ver);
  cls_obj_check_prefix_exist(rm_op, RGW_ATTR_OLH_PENDING_PREFIX, true); /* fail if found one of these, pending modification */
  rm_op.remove();

  r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &rm_op, y);
  if (r == -ECANCELED) {
    return r; /* someone else made a modification in the meantime */
  }
  /*
   * only clear if was successful, otherwise we might clobber pending operations on this object
   *
   * NOTE(review): errors other than -ECANCELED (e.g. -ENOENT) still fall
   * through to clearing the index entries below — confirm this is the
   * intended behavior.
   */
  r = bucket_index_clear_olh(dpp, bucket_info, tag, obj);
  if (r < 0) {
    ldpp_dout(dpp, 0) << "ERROR: could not clear bucket index olh entries r=" << r << dendl;
    return r;
  }
  return 0;
}
+
+/*
+ * read olh log and apply it
+ */
+int RGWRados::update_olh(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, RGWObjState *state, RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_zone_set *zones_trace)
+{
+ map<uint64_t, vector<rgw_bucket_olh_log_entry> > log;
+ bool is_truncated;
+ uint64_t ver_marker = 0;
+
+ do {
+ int ret = bucket_index_read_olh_log(dpp, bucket_info, *state, obj, ver_marker, &log, &is_truncated);
+ if (ret < 0) {
+ return ret;
+ }
+ ret = apply_olh_log(dpp, obj_ctx, *state, bucket_info, obj, state->olh_tag, log, &ver_marker, zones_trace);
+ if (ret < 0) {
+ return ret;
+ }
+ } while (is_truncated);
+
+ return 0;
+}
+
// Point the OLH (object logical head) of target_obj's name at the given
// instance (or a delete marker): register a pending-change marker on
// the head, link the instance in the bucket index, then apply the
// resulting OLH log. Retries on -ECANCELED (OLH tag/version races) up
// to MAX_ECANCELED_RETRY times, invalidating the cached object state
// between attempts.
int RGWRados::set_olh(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx,
                      RGWBucketInfo& bucket_info,
                      const rgw_obj& target_obj, bool delete_marker,
                      rgw_bucket_dir_entry_meta *meta,
                      uint64_t olh_epoch, real_time unmod_since, bool high_precision_time,
                      optional_yield y, rgw_zone_set *zones_trace, bool log_data_change)
{
  string op_tag;

  // the OLH object shares the name but has no instance
  rgw_obj olh_obj = target_obj;
  olh_obj.key.instance.clear();

  RGWObjState *state = NULL;
  RGWObjManifest *manifest = nullptr;

  int ret = 0;
  int i;

#define MAX_ECANCELED_RETRY 100
  for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
    if (ret == -ECANCELED) {
      // previous attempt raced; drop the cached state before re-reading
      obj_ctx.invalidate(olh_obj);
    }

    ret = get_obj_state(dpp, &obj_ctx, bucket_info, olh_obj, &state, &manifest, false, y); /* don't follow olh */
    if (ret < 0) {
      return ret;
    }

    // registers a pending-change xattr on the head and yields op_tag
    ret = olh_init_modification(dpp, bucket_info, *state, olh_obj, &op_tag);
    if (ret < 0) {
      ldpp_dout(dpp, 20) << "olh_init_modification() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
      if (ret == -ECANCELED) {
        continue;
      }
      return ret;
    }
    if (cct->_conf->rgw_debug_inject_set_olh_err) {
      // fail here to simulate the scenario of an unlinked object instance
      ret = -cct->_conf->rgw_debug_inject_set_olh_err;
    } else {
      ret = bucket_index_link_olh(dpp, bucket_info, *state, target_obj,
                                  delete_marker, op_tag, meta, olh_epoch, unmod_since,
                                  high_precision_time, y, zones_trace, log_data_change);
    }
    if (ret < 0) {
      ldpp_dout(dpp, 20) << "bucket_index_link_olh() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
      // undo the pending-change marker registered above
      olh_cancel_modification(dpp, bucket_info, *state, olh_obj, op_tag, y);
      if (ret == -ECANCELED) {
        // the bucket index rejected the link_olh() due to olh tag mismatch;
        // attempt to reconstruct olh head attributes based on the bucket index
        int r2 = repair_olh(dpp, state, bucket_info, olh_obj);
        if (r2 < 0 && r2 != -ECANCELED) {
          return r2;
        }
        continue;
      }
      // it's possible that the pending xattr from this op prevented the olh
      // object from being cleaned by another thread that was deleting the last
      // existing version. We invoke a best-effort update_olh here to handle this case.
      int r = update_olh(dpp, obj_ctx, state, bucket_info, olh_obj);
      if (r < 0 && r != -ECANCELED) {
        ldpp_dout(dpp, 20) << "update_olh() target_obj=" << olh_obj << " returned " << r << dendl;
      }
      return ret;
    }
    break;
  }

  if (i == MAX_ECANCELED_RETRY) {
    ldpp_dout(dpp, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
    return -EIO;
  }

  // apply the OLH log entries generated by the link above
  ret = update_olh(dpp, obj_ctx, state, bucket_info, olh_obj);
  if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
    ret = 0;
  }
  if (ret < 0) {
    ldpp_dout(dpp, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
    return ret;
  }

  return 0;
}
+
// Unlink a specific object instance (version) from its OLH: register a
// pending-change marker on the head, unlink the instance in the bucket
// index, then apply the resulting OLH log. Mirrors set_olh()'s
// -ECANCELED retry loop (up to MAX_ECANCELED_RETRY attempts).
int RGWRados::unlink_obj_instance(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj,
                                  uint64_t olh_epoch, optional_yield y, rgw_zone_set *zones_trace)
{
  string op_tag;

  // the OLH object shares the name but has no instance
  rgw_obj olh_obj = target_obj;
  olh_obj.key.instance.clear();

  RGWObjState *state = NULL;
  RGWObjManifest *manifest = NULL;

  int ret = 0;
  int i;

  for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
    if (ret == -ECANCELED) {
      // previous attempt raced; drop the cached state before re-reading
      obj_ctx.invalidate(olh_obj);
    }

    ret = get_obj_state(dpp, &obj_ctx, bucket_info, olh_obj, &state, &manifest, false, y); /* don't follow olh */
    if (ret < 0)
      return ret;

    // registers a pending-change xattr on the head and yields op_tag
    ret = olh_init_modification(dpp, bucket_info, *state, olh_obj, &op_tag);
    if (ret < 0) {
      ldpp_dout(dpp, 20) << "olh_init_modification() target_obj=" << target_obj << " returned " << ret << dendl;
      if (ret == -ECANCELED) {
        continue;
      }
      return ret;
    }

    string olh_tag(state->olh_tag.c_str(), state->olh_tag.length());

    ret = bucket_index_unlink_instance(dpp, bucket_info, target_obj, op_tag, olh_tag, olh_epoch, zones_trace);
    if (ret < 0) {
      // undo the pending-change marker registered above
      olh_cancel_modification(dpp, bucket_info, *state, olh_obj, op_tag, y);
      ldpp_dout(dpp, 20) << "bucket_index_unlink_instance() target_obj=" << target_obj << " returned " << ret << dendl;
      if (ret == -ECANCELED) {
        continue;
      }
      // it's possible that the pending xattr from this op prevented the olh
      // object from being cleaned by another thread that was deleting the last
      // existing version. We invoke a best-effort update_olh here to handle this case.
      int r = update_olh(dpp, obj_ctx, state, bucket_info, olh_obj, zones_trace);
      if (r < 0 && r != -ECANCELED) {
        ldpp_dout(dpp, 20) << "update_olh() target_obj=" << olh_obj << " returned " << r << dendl;
      }
      return ret;
    }
    break;
  }

  if (i == MAX_ECANCELED_RETRY) {
    ldpp_dout(dpp, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
    return -EIO;
  }

  // apply the OLH log entries generated by the unlink above
  ret = update_olh(dpp, obj_ctx, state, bucket_info, olh_obj, zones_trace);
  if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
    return 0;
  }
  if (ret < 0) {
    ldpp_dout(dpp, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
    return ret;
  }

  return 0;
}
+
+void RGWRados::gen_rand_obj_instance_name(rgw_obj_key *target_key)
+{
+#define OBJ_INSTANCE_LEN 32
+ char buf[OBJ_INSTANCE_LEN + 1];
+
+ gen_rand_alphanumeric_no_underscore(cct, buf, OBJ_INSTANCE_LEN); /* don't want it to get url escaped,
+ no underscore for instance name due to the way we encode the raw keys */
+
+ target_key->set_instance(buf);
+}
+
+void RGWRados::gen_rand_obj_instance_name(rgw_obj *target_obj)
+{
+ gen_rand_obj_instance_name(&target_obj->key);
+}
+
+int RGWRados::get_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWOLHInfo *olh)
+{
+ map<string, bufferlist> attrset;
+
+ ObjectReadOperation op;
+ op.getxattrs(&attrset, NULL);
+
+ int r = obj_operate(dpp, bucket_info, obj, &op);
+ if (r < 0) {
+ return r;
+ }
+
+ auto iter = attrset.find(RGW_ATTR_OLH_VER);
+ if (iter == attrset.end()) { /* not an olh */
+ return -EINVAL;
+ }
+
+ return decode_olh_info(dpp, cct, iter->second, olh);
+}
+
+void RGWRados::check_pending_olh_entries(const DoutPrefixProvider *dpp,
+ map<string, bufferlist>& pending_entries,
+ map<string, bufferlist> *rm_pending_entries)
+{
+ map<string, bufferlist>::iterator iter = pending_entries.begin();
+
+ real_time now = real_clock::now();
+
+ while (iter != pending_entries.end()) {
+ auto biter = iter->second.cbegin();
+ RGWOLHPendingInfo pending_info;
+ try {
+ decode(pending_info, biter);
+ } catch (buffer::error& err) {
+ /* skipping bad entry, we could remove it but it might hide a bug */
+ ldpp_dout(dpp, 0) << "ERROR: failed to decode pending entry " << iter->first << dendl;
+ ++iter;
+ continue;
+ }
+
+ map<string, bufferlist>::iterator cur_iter = iter;
+ ++iter;
+ if (now - pending_info.time >= make_timespan(cct->_conf->rgw_olh_pending_timeout_sec)) {
+ (*rm_pending_entries)[cur_iter->first] = cur_iter->second;
+ pending_entries.erase(cur_iter);
+ } else {
+ /* entries names are sorted by time (rounded to a second) */
+ break;
+ }
+ }
+}
+
// Remove the given pending-change xattrs from the OLH head object,
// batching at most 1000 rmxattr calls per osd op. Each op is guarded on
// the OLH id-tag; -ENOENT/-ECANCELED are treated as benign races and
// reported as success.
int RGWRados::remove_olh_pending_entries(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, map<string, bufferlist>& pending_attrs)
{
  rgw_rados_ref ref;
  int r = get_obj_head_ref(dpp, bucket_info, olh_obj, &ref);
  if (r < 0) {
    return r;
  }

  // trim no more than 1000 entries per osd op
  constexpr int max_entries = 1000;

  auto i = pending_attrs.begin();
  while (i != pending_attrs.end()) {
    ObjectWriteOperation op;
    // guard on the OLH id-tag so a replaced OLH is not touched
    bucket_index_guard_olh_op(dpp, state, op);

    for (int n = 0; n < max_entries && i != pending_attrs.end(); ++n, ++i) {
      op.rmxattr(i->first.c_str());
    }

    r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
    if (r == -ENOENT || r == -ECANCELED) {
      /* raced with some other change, shouldn't sweat about it */
      return 0;
    }
    if (r < 0) {
      ldpp_dout(dpp, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
      return r;
    }
  }
  return 0;
}
+
// Resolve an OLH object to the rgw_obj it currently points at. Expired
// pending-change markers are cleaned up first, and if any (unexpired)
// pending changes remain the OLH log is applied before reading the
// target. Returns -EINVAL if olh_obj is not an OLH, -ENOENT if the
// current version is removed (delete marker) or the OLH disappeared.
int RGWRados::follow_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjectCtx& obj_ctx, RGWObjState *state, const rgw_obj& olh_obj, rgw_obj *target)
{
  // pending-change markers currently on the head
  map<string, bufferlist> pending_entries;
  rgw_filter_attrset(state->attrset, RGW_ATTR_OLH_PENDING_PREFIX, &pending_entries);

  // separate out the markers that have timed out
  map<string, bufferlist> rm_pending_entries;
  check_pending_olh_entries(dpp, pending_entries, &rm_pending_entries);

  if (!rm_pending_entries.empty()) {
    int ret = remove_olh_pending_entries(dpp, bucket_info, *state, olh_obj, rm_pending_entries);
    if (ret < 0) {
      ldpp_dout(dpp, 20) << "ERROR: rm_pending_entries returned ret=" << ret << dendl;
      return ret;
    }
  }
  if (!pending_entries.empty()) {
    // unapplied changes remain: bring the OLH up to date before reading it
    ldpp_dout(dpp, 20) << __func__ << "(): found pending entries, need to update_olh() on bucket=" << olh_obj.bucket << dendl;

    int ret = update_olh(dpp, obj_ctx, state, bucket_info, olh_obj);
    if (ret < 0) {
      if (ret == -ECANCELED) {
        // In this context, ECANCELED means that the OLH tag changed in either the bucket index entry or the OLH object.
        // If the OLH tag changed, it indicates that a previous OLH entry was removed since this request started. We
        // return ENOENT to indicate that the OLH object was removed.
        ret = -ENOENT;
      }
      return ret;
    }
  }

  // OLH_VER present => this really is an OLH object
  auto iter = state->attrset.find(RGW_ATTR_OLH_VER);
  if (iter == state->attrset.end()) {
    return -EINVAL;
  }
  iter = state->attrset.find(RGW_ATTR_OLH_INFO);
  if (iter == state->attrset.end()) {
    return -ENOENT;
  }

  RGWOLHInfo olh;
  int ret = decode_olh_info(dpp, cct, iter->second, &olh);
  if (ret < 0) {
    return ret;
  }

  if (olh.removed) {
    // current version is a delete marker
    return -ENOENT;
  }

  *target = olh.target;

  return 0;
}
+
// Stat a raw rados object in one round trip. Every output pointer is
// optional (pass nullptr to skip): psize/pmtime from stat2, xattrs
// (filtered to the RGW_ATTR_PREFIX namespace) into *attrs, up to
// rgw_max_chunk_size bytes of data into *first_chunk, the osd version
// into *epoch, and an objv read-guard via objv_tracker.
int RGWRados::raw_obj_stat(const DoutPrefixProvider *dpp,
                           rgw_raw_obj& obj, uint64_t *psize, real_time *pmtime, uint64_t *epoch,
                           map<string, bufferlist> *attrs, bufferlist *first_chunk,
                           RGWObjVersionTracker *objv_tracker, optional_yield y)
{
  rgw_rados_ref ref;
  int r = get_raw_obj_ref(dpp, obj, &ref);
  if (r < 0) {
    return r;
  }

  map<string, bufferlist> unfiltered_attrset;
  uint64_t size = 0;
  struct timespec mtime_ts;

  // compound read op: only request the pieces the caller asked for
  ObjectReadOperation op;
  if (objv_tracker) {
    objv_tracker->prepare_op_for_read(&op);
  }
  if (attrs) {
    op.getxattrs(&unfiltered_attrset, NULL);
  }
  if (psize || pmtime) {
    op.stat2(&size, &mtime_ts, NULL);
  }
  if (first_chunk) {
    op.read(0, cct->_conf->rgw_max_chunk_size, first_chunk, NULL);
  }
  bufferlist outbl;
  r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, &outbl, y);

  // note: the epoch is captured even when the operate call failed
  if (epoch) {
    *epoch = ref.pool.ioctx().get_last_version();
  }

  if (r < 0)
    return r;

  if (psize)
    *psize = size;
  if (pmtime)
    *pmtime = ceph::real_clock::from_timespec(mtime_ts);
  if (attrs) {
    // keep only the rgw attribute namespace
    rgw_filter_attrset(unfiltered_attrset, RGW_ATTR_PREFIX, attrs);
  }

  return 0;
}
+
// Aggregate bucket statistics across index shards (or a single shard
// when shard_id >= 0): per-category storage stats into `stats`,
// per-shard index/master versions into *bucket_ver / *master_ver, the
// bilog max-marker(s) into *max_marker, and the sync-stopped flag into
// *syncstopped (if non-null).
int RGWRados::get_bucket_stats(const DoutPrefixProvider *dpp,
			       RGWBucketInfo& bucket_info,
			       const rgw::bucket_index_layout_generation& idx_layout,
			       int shard_id, string *bucket_ver, string *master_ver,
			       map<RGWObjCategory, RGWStorageStats>& stats,
			       string *max_marker, bool *syncstopped)
{
  vector<rgw_bucket_dir_header> headers;
  map<int, string> bucket_instance_ids;
  // one dir header (and instance id) per queried shard
  int r = cls_bucket_head(dpp, bucket_info, idx_layout, shard_id, headers, &bucket_instance_ids);
  if (r < 0) {
    return r;
  }

  ceph_assert(headers.size() == bucket_instance_ids.size());

  // walk headers and instance ids in lockstep
  auto iter = headers.begin();
  map<int, string>::iterator viter = bucket_instance_ids.begin();
  BucketIndexShardsManager ver_mgr;
  BucketIndexShardsManager master_ver_mgr;
  BucketIndexShardsManager marker_mgr;
  char buf[64];
  for(; iter != headers.end(); ++iter, ++viter) {
    accumulate_raw_stats(*iter, stats);
    snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->ver);
    ver_mgr.add(viter->first, string(buf));
    snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->master_ver);
    master_ver_mgr.add(viter->first, string(buf));
    if (shard_id >= 0) {
      // single-shard query: report that shard's marker directly
      *max_marker = iter->max_marker;
    } else {
      marker_mgr.add(viter->first, iter->max_marker);
    }
    if (syncstopped != NULL)
      *syncstopped = iter->syncstopped;
  }
  // serialize the per-shard values into their composite string forms
  ver_mgr.to_string(bucket_ver);
  master_ver_mgr.to_string(master_ver);
  if (shard_id < 0) {
    marker_mgr.to_string(max_marker);
  }
  return 0;
}
+
+// Aggregates the asynchronous per-shard bucket-index header responses issued
+// by get_bucket_stats_async(). Each shard completion invokes
+// handle_response(); once the last pending response arrives, the user
+// callback fires exactly once with the accumulated stats (or the first error
+// observed). unset_cb() mutes delivery when the request is abandoned.
+class RGWGetBucketStatsContext : public RGWGetDirHeader_CB {
+  RGWGetBucketStats_CB *cb;  // user callback; put() after delivery
+  uint32_t pendings;         // shard responses still outstanding
+  map<RGWObjCategory, RGWStorageStats> stats;
+  int ret_code;              // first error seen, 0 if none
+  bool should_cb;            // cleared by unset_cb() to suppress delivery
+  ceph::mutex lock = ceph::make_mutex("RGWGetBucketStatsContext");
+
+public:
+  RGWGetBucketStatsContext(RGWGetBucketStats_CB *_cb, uint32_t _pendings)
+    : cb(_cb), pendings(_pendings), stats(), ret_code(0), should_cb(true)
+  {}
+
+  // Called once per shard completion. Accumulates stats on success, records
+  // the first failure otherwise, and fires the user callback when all
+  // pending responses have arrived.
+  void handle_response(int r, rgw_bucket_dir_header& header) override {
+    std::lock_guard l{lock};
+    if (should_cb) {
+      if ( r >= 0) {
+        accumulate_raw_stats(header, stats);
+      } else {
+        ret_code = r;
+      }
+
+      // Are we all done?
+      if (--pendings == 0) {
+        if (!ret_code) {
+          cb->set_response(&stats);
+        }
+        cb->handle_response(ret_code);
+        cb->put();
+      }
+    }
+  }
+
+  // Prevent any further use of the user callback (used when the issuing
+  // request failed and already released its reference to cb).
+  void unset_cb() {
+    std::lock_guard l{lock};
+    should_cb = false;
+  }
+};
+
+// Issue asynchronous bucket-index header reads and deliver aggregated stats
+// through ctx (see RGWGetBucketStatsContext). On failure the caller's ref on
+// ctx is dropped here and, if some AIOs were already issued, their
+// completions are muted via unset_cb(). The trailing get_ctx->put() drops
+// the reference taken at construction.
+// NOTE(review): pendings is derived from
+// bucket_info.layout.current_index.layout.normal.num_shards (or 1), not from
+// idx_layout/shard_id -- confirm this matches the number of AIOs
+// cls_bucket_head_async issues when a single shard is requested.
+int RGWRados::get_bucket_stats_async(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, RGWGetBucketStats_CB *ctx)
+{
+  int num_aio = 0;
+  RGWGetBucketStatsContext *get_ctx = new RGWGetBucketStatsContext(ctx, bucket_info.layout.current_index.layout.normal.num_shards ? : 1);
+  ceph_assert(get_ctx);
+  int r = cls_bucket_head_async(dpp, bucket_info, idx_layout, shard_id, get_ctx, &num_aio);
+  if (r < 0) {
+    ctx->put();
+    if (num_aio) {
+      get_ctx->unset_cb();
+    }
+  }
+  get_ctx->put();
+  return r;
+}
+
+// Look up bucket instance info by its metadata key. The key is parsed into
+// an rgw_bucket and the lookup is delegated to the rgw_bucket overload.
+int RGWRados::get_bucket_instance_info(const string& meta_key,
+                                       RGWBucketInfo& info,
+                                       real_time *pmtime,
+                                       map<string, bufferlist> *pattrs,
+                                       optional_yield y,
+                                       const DoutPrefixProvider *dpp)
+{
+  rgw_bucket parsed_bucket;
+  rgw_bucket_parse_bucket_key(cct, meta_key, &parsed_bucket, nullptr);
+  return get_bucket_instance_info(parsed_bucket, info, pmtime, pattrs, y, dpp);
+}
+
+// Read the bucket instance info through the bucket ctl layer, optionally
+// fetching the mtime and attrs alongside.
+int RGWRados::get_bucket_instance_info(const rgw_bucket& bucket, RGWBucketInfo& info,
+                                       real_time *pmtime, map<string, bufferlist> *pattrs, optional_yield y,
+                                       const DoutPrefixProvider *dpp)
+{
+  auto params = RGWBucketCtl::BucketInstance::GetParams()
+    .set_mtime(pmtime)
+    .set_attrs(pattrs);
+  return ctl.bucket->read_bucket_instance_info(bucket, &info, y, dpp, params);
+}
+
+// Read bucket info by tenant + bucket name through the bucket ctl layer.
+int RGWRados::get_bucket_info(RGWServices *svc,
+                              const string& tenant, const string& bucket_name,
+                              RGWBucketInfo& info,
+                              real_time *pmtime,
+                              optional_yield y,
+                              const DoutPrefixProvider *dpp, map<string, bufferlist> *pattrs)
+{
+  // assemble the bucket key from tenant and name
+  rgw_bucket b;
+  b.tenant = tenant;
+  b.name = bucket_name;
+
+  auto params = RGWBucketCtl::BucketInstance::GetParams()
+    .set_mtime(pmtime)
+    .set_attrs(pattrs);
+  return ctl.bucket->read_bucket_info(b, &info, y, dpp, params);
+}
+
+// Re-read bucket info in place, looking the bucket up by tenant/name (the
+// bucket_id is cleared first so the lookup does not pin a stale instance).
+// The current read_version is passed via set_refresh_version(); see
+// RGWBucketCtl for how that hint interacts with cached versions.
+int RGWRados::try_refresh_bucket_info(RGWBucketInfo& info,
+                                      ceph::real_time *pmtime,
+                                      const DoutPrefixProvider *dpp,
+                                      map<string, bufferlist> *pattrs)
+{
+  rgw_bucket bucket = info.bucket;
+  bucket.bucket_id.clear();
+
+  auto rv = info.objv_tracker.read_version;
+
+  return ctl.bucket->read_bucket_info(bucket, &info, null_yield, dpp,
+                                      RGWBucketCtl::BucketInstance::GetParams()
+                                      .set_mtime(pmtime)
+                                      .set_attrs(pattrs)
+                                      .set_refresh_version(rv));
+}
+
+// Store the bucket instance object through the bucket ctl layer.
+int RGWRados::put_bucket_instance_info(RGWBucketInfo& info, bool exclusive,
+                                       real_time mtime, map<string, bufferlist> *pattrs,
+                                       const DoutPrefixProvider *dpp, optional_yield y)
+{
+  auto params = RGWBucketCtl::BucketInstance::PutParams()
+    .set_exclusive(exclusive)
+    .set_mtime(mtime)
+    .set_attrs(pattrs);
+  return ctl.bucket->store_bucket_instance_info(info.bucket, info, y, dpp, params);
+}
+
+// Write the bucket instance object and, when needed, the bucket entry point
+// that links to it. The entry point is (re)written when the info has no
+// instance object yet or when create_entry_point is set. If pep_objv carries
+// a tag it supplies the entry point's write version; otherwise a fresh
+// version is generated (and reported back through pep_objv when non-null).
+int RGWRados::put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, real_time mtime, obj_version *pep_objv,
+                                     map<string, bufferlist> *pattrs, bool create_entry_point,
+                                     const DoutPrefixProvider *dpp, optional_yield y)
+{
+  bool create_head = !info.has_instance_obj || create_entry_point;
+
+  int ret = put_bucket_instance_info(info, exclusive, mtime, pattrs, dpp, y);
+  if (ret < 0) {
+    return ret;
+  }
+
+  if (!create_head)
+    return 0; /* done! */
+
+  // build the entry point from the instance info and link it
+  RGWBucketEntryPoint entry_point;
+  entry_point.bucket = info.bucket;
+  entry_point.owner = info.owner;
+  entry_point.creation_time = info.creation_time;
+  entry_point.linked = true;
+  RGWObjVersionTracker ot;
+  if (pep_objv && !pep_objv->tag.empty()) {
+    ot.write_version = *pep_objv;
+  } else {
+    ot.generate_new_write_ver(cct);
+    if (pep_objv) {
+      *pep_objv = ot.write_version;
+    }
+  }
+  ret = ctl.bucket->store_bucket_entrypoint_info(info.bucket, entry_point, y, dpp, RGWBucketCtl::Bucket::PutParams()
+                                                 .set_exclusive(exclusive)
+                                                 .set_objv_tracker(&ot)
+                                                 .set_mtime(mtime));
+  if (ret < 0)
+    return ret;
+
+  return 0;
+}
+
+// Refresh the usage stats (object count, bytes used, rounded bytes) for each
+// bucket in the map by reading its bucket index headers, and copy in the
+// bucket's placement rule for swift's per-storage-policy statistics.
+// On success returns the number of buckets processed (m.size()); on the
+// first failure, returns that negative error code.
+// Fix: the inner stats iterator previously shadowed the outer map iterator
+// `iter`; loops are now range-based and the inner iterator renamed.
+int RGWRados::update_containers_stats(map<string, RGWBucketEnt>& m, const DoutPrefixProvider *dpp)
+{
+  for (auto& entry : m) {
+    RGWBucketEnt& ent = entry.second;
+    rgw_bucket& bucket = ent.bucket;
+    ent.count = 0;
+    ent.size = 0;
+    ent.size_rounded = 0;
+
+    vector<rgw_bucket_dir_header> headers;
+
+    RGWBucketInfo bucket_info;
+    int ret = get_bucket_instance_info(bucket, bucket_info, NULL, NULL, null_yield, dpp);
+    if (ret < 0) {
+      return ret;
+    }
+
+    int r = cls_bucket_head(dpp, bucket_info, bucket_info.layout.current_index, RGW_NO_SHARD, headers);
+    if (r < 0)
+      return r;
+
+    for (auto& header : headers) {
+      // only the "main" category contributes to container-level stats
+      auto stats_iter = header.stats.find(main_category);
+      if (stats_iter != header.stats.end()) {
+        const rgw_bucket_category_stats& stats = stats_iter->second;
+        ent.count += stats.num_entries;
+        ent.size += stats.total_size;
+        ent.size_rounded += stats.total_size_rounded;
+      }
+    }
+
+    // fill in placement_rule from the bucket instance for use in swift's
+    // per-storage policy statistics
+    ent.placement_rule = std::move(bucket_info.placement_rule);
+  }
+
+  return m.size();
+}
+
+// Fire-and-forget append to a raw object: the aio is created with no
+// callback and the completion is released immediately, so the return value
+// only reflects whether the append was queued, not whether it succeeded.
+int RGWRados::append_async(const DoutPrefixProvider *dpp, rgw_raw_obj& obj, size_t size, bufferlist& bl)
+{
+  rgw_rados_ref ref;
+  int r = get_raw_obj_ref(dpp, obj, &ref);
+  if (r < 0) {
+    return r;
+  }
+  librados::Rados *rad = get_rados_handle();
+  librados::AioCompletion *completion = rad->aio_create_completion(nullptr, nullptr);
+
+  r = ref.pool.ioctx().aio_append(ref.obj.oid, completion, bl, size);
+  completion->release();
+  return r;
+}
+
+// Initialize a pool-iteration context positioned at the first object of the
+// pool. Both optional open_pool_ctx behavior flags are left disabled, same
+// as the cursor-based overload below.
+int RGWRados::pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, RGWPoolIterCtx& ctx)
+{
+  int ret = open_pool_ctx(dpp, pool, ctx.io_ctx, false, false);
+  if (ret < 0) {
+    return ret;
+  }
+
+  ctx.iter = ctx.io_ctx.nobjects_begin();
+  return 0;
+}
+
+// Begin pool iteration at an explicit cursor position (as produced by
+// pool_iterate_get_cursor()). Returns -EINVAL when the cursor string cannot
+// be parsed. nobjects_begin(cursor) can throw: a system_error is mapped to
+// its negative error code, anything else to -EIO.
+int RGWRados::pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, const string& cursor, RGWPoolIterCtx& ctx)
+{
+  librados::IoCtx& io_ctx = ctx.io_ctx;
+  librados::NObjectIterator& iter = ctx.iter;
+
+  int r = open_pool_ctx(dpp, pool, io_ctx, false, false);
+  if (r < 0)
+    return r;
+
+  librados::ObjectCursor oc;
+  if (!oc.from_str(cursor)) {
+    ldpp_dout(dpp, 10) << "failed to parse cursor: " << cursor << dendl;
+    return -EINVAL;
+  }
+
+  try {
+    iter = io_ctx.nobjects_begin(oc);
+    return 0;
+  } catch (const std::system_error& e) {
+    r = -e.code().value();
+    ldpp_dout(dpp, 10) << "nobjects_begin threw " << e.what()
+       << ", returning " << r << dendl;
+    return r;
+  } catch (const std::exception& e) {
+    ldpp_dout(dpp, 10) << "nobjects_begin threw " << e.what()
+       << ", returning -5" << dendl;
+    return -EIO;
+  }
+}
+
+// Serialize the iteration's current position so a later
+// pool_iterate_begin(pool, cursor, ctx) call can resume from it.
+string RGWRados::pool_iterate_get_cursor(RGWPoolIterCtx& ctx)
+{
+  auto cursor = ctx.iter.get_cursor();
+  return cursor.to_str();
+}
+
+// Pull up to `num` object names from the pool iterator into `objs`,
+// honoring the optional caller filter. Returns the number of entries in
+// `objs`, or -ENOENT when iteration is already exhausted; *is_truncated is
+// set when more objects remain.
+static int do_pool_iterate(const DoutPrefixProvider *dpp, CephContext* cct, RGWPoolIterCtx& ctx, uint32_t num,
+                           vector<rgw_bucket_dir_entry>& objs,
+                           bool *is_truncated, RGWAccessListFilter *filter)
+{
+  librados::IoCtx& io_ctx = ctx.io_ctx;
+  librados::NObjectIterator& iter = ctx.iter;
+
+  // nothing left to iterate
+  if (iter == io_ctx.nobjects_end())
+    return -ENOENT;
+
+  for (uint32_t visited = 0; visited < num && iter != io_ctx.nobjects_end(); ++visited, ++iter) {
+    string oid = iter->get_oid();
+    ldpp_dout(dpp, 20) << "RGWRados::pool_iterate: got " << oid << dendl;
+
+    // skip entries the caller's filter rejects (they still count toward num)
+    if (filter && !filter->filter(oid, oid))
+      continue;
+
+    rgw_bucket_dir_entry e;
+    e.key = oid;
+    objs.push_back(e);
+  }
+
+  if (is_truncated)
+    *is_truncated = (iter != io_ctx.nobjects_end());
+
+  return objs.size();
+}
+
+// Advance an in-progress pool iteration, appending up to num entries to
+// objs. NObjectIterator::operator++() can throw, so do_pool_iterate() is
+// wrapped here: a system_error is mapped to its negative error code,
+// anything else to -EIO.
+int RGWRados::pool_iterate(const DoutPrefixProvider *dpp, RGWPoolIterCtx& ctx, uint32_t num, vector<rgw_bucket_dir_entry>& objs,
+                           bool *is_truncated, RGWAccessListFilter *filter)
+{
+  // catch exceptions from NObjectIterator::operator++()
+  try {
+    return do_pool_iterate(dpp, cct, ctx, num, objs, is_truncated, filter);
+  } catch (const std::system_error& e) {
+    int r = -e.code().value();
+    ldpp_dout(dpp, 10) << "NObjectIterator threw exception " << e.what()
+       << ", returning " << r << dendl;
+    return r;
+  } catch (const std::exception& e) {
+    ldpp_dout(dpp, 10) << "NObjectIterator threw exception " << e.what()
+       << ", returning -5" << dendl;
+    return -EIO;
+  }
+}
+
+// Initialize a raw-object listing context starting from `marker`.
+// Idempotent: an already-initialized context is left untouched.
+int RGWRados::list_raw_objects_init(const DoutPrefixProvider *dpp, const rgw_pool& pool, const string& marker, RGWListRawObjsCtx *ctx)
+{
+  if (ctx->initialized) {
+    return 0;
+  }
+
+  int ret = pool_iterate_begin(dpp, pool, marker, ctx->iter_ctx);
+  if (ret < 0) {
+    ldpp_dout(dpp, 10) << "failed to list objects pool_iterate_begin() returned r=" << ret << dendl;
+    return ret;
+  }
+  ctx->initialized = true;
+  return 0;
+}
+
+// Fetch the next batch of raw object names (at most `max`) matching the
+// prefix filter, appending them to `oids`. Requires a context initialized
+// by list_raw_objects_init(); returns the size of `oids` on success.
+int RGWRados::list_raw_objects_next(const DoutPrefixProvider *dpp, const string& prefix_filter, int max,
+                                    RGWListRawObjsCtx& ctx, list<string>& oids,
+                                    bool *is_truncated)
+{
+  if (!ctx.initialized) {
+    return -EINVAL;
+  }
+
+  RGWAccessListFilterPrefix filter(prefix_filter);
+  vector<rgw_bucket_dir_entry> entries;
+  int ret = pool_iterate(dpp, ctx.iter_ctx, max, entries, is_truncated, &filter);
+  if (ret < 0) {
+    // -ENOENT simply means iteration is exhausted; don't log it
+    if (ret != -ENOENT)
+      ldpp_dout(dpp, 10) << "failed to list objects pool_iterate returned r=" << ret << dendl;
+    return ret;
+  }
+
+  // keep only the object names
+  for (const auto& entry : entries) {
+    oids.push_back(entry.key.name);
+  }
+
+  return oids.size();
+}
+
+// Convenience wrapper: lazily initialize the listing context (with an empty
+// marker) and fetch the next batch of raw object names.
+int RGWRados::list_raw_objects(const DoutPrefixProvider *dpp, const rgw_pool& pool, const string& prefix_filter,
+                               int max, RGWListRawObjsCtx& ctx, list<string>& oids,
+                               bool *is_truncated)
+{
+  if (!ctx.initialized) {
+    int ret = list_raw_objects_init(dpp, pool, string(), &ctx);
+    if (ret < 0) {
+      return ret;
+    }
+  }
+
+  return list_raw_objects_next(dpp, prefix_filter, max, ctx, oids, is_truncated);
+}
+
+// Return the resumable cursor for an in-progress raw-object listing.
+string RGWRados::list_raw_objs_get_cursor(RGWListRawObjsCtx& ctx)
+{
+  return pool_iterate_get_cursor(ctx.iter_ctx);
+}
+
+// Fetch the instance-typed bucket index entry for obj and decode it into
+// *dirent. -ENOENT (no such entry) is returned without logging; a decode
+// failure yields -EIO.
+int RGWRados::bi_get_instance(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
+                              rgw_bucket_dir_entry *dirent)
+{
+  rgw_cls_bi_entry bi_entry;
+  int r = bi_get(dpp, bucket_info, obj, BIIndexType::Instance, &bi_entry);
+  if (r < 0 && r != -ENOENT) {
+    ldpp_dout(dpp, 0) << "ERROR: bi_get() returned r=" << r << dendl;
+  }
+  if (r < 0) {
+    return r;
+  }
+  auto iter = bi_entry.data.cbegin();
+  try {
+    decode(*dirent, iter);
+  } catch (buffer::error& err) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to decode bi_entry()" << dendl;
+    return -EIO;
+  }
+
+  return 0;
+}
+
+// Fetch the OLH-typed bucket index entry for obj and decode it into *olh.
+// -ENOENT (no such entry) is returned without logging; a decode failure
+// yields -EIO.
+int RGWRados::bi_get_olh(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
+                         rgw_bucket_olh_entry *olh)
+{
+  rgw_cls_bi_entry bi_entry;
+  int r = bi_get(dpp, bucket_info, obj, BIIndexType::OLH, &bi_entry);
+  if (r < 0 && r != -ENOENT) {
+    ldpp_dout(dpp, 0) << "ERROR: bi_get() returned r=" << r << dendl;
+  }
+  if (r < 0) {
+    return r;
+  }
+  auto iter = bi_entry.data.cbegin();
+  try {
+    decode(*olh, iter);
+  } catch (buffer::error& err) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to decode bi_entry()" << dendl;
+    return -EIO;
+  }
+
+  return 0;
+}
+
+// Read a single raw bucket index entry of the given type for obj, directly
+// from the bucket index shard that owns the object's key.
+int RGWRados::bi_get(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
+                     BIIndexType index_type, rgw_cls_bi_entry *entry)
+{
+  BucketShard bs(this);
+  int ret = bs.init(dpp, bucket_info, obj);
+  if (ret < 0) {
+    ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
+    return ret;
+  }
+
+  cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
+
+  auto& ref = bs.bucket_obj.get_ref();
+
+  return cls_rgw_bi_get(ref.pool.ioctx(), ref.obj.oid, index_type, key, entry);
+}
+
+// Queue a bucket-index entry write onto the caller's write op; nothing is
+// submitted here.
+void RGWRados::bi_put(ObjectWriteOperation& op, BucketShard& bs, rgw_cls_bi_entry& entry)
+{
+  auto& shard_ref = bs.bucket_obj.get_ref();
+  cls_rgw_bi_put(op, shard_ref.obj.oid, entry);
+}
+
+// Write a bucket-index entry synchronously to the given shard, normalizing
+// any non-negative result to 0.
+int RGWRados::bi_put(BucketShard& bs, rgw_cls_bi_entry& entry)
+{
+  auto& shard_ref = bs.bucket_obj.get_ref();
+  const int r = cls_rgw_bi_put(shard_ref.pool.ioctx(), shard_ref.obj.oid, entry);
+  return r < 0 ? r : 0;
+}
+
+// Write a bucket index entry, resolving the shard from the bucket and object
+// key alone (no RGWBucketInfo is available on this path).
+int RGWRados::bi_put(const DoutPrefixProvider *dpp, rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry)
+{
+  // make sure incomplete multipart uploads are hashed correctly
+  if (obj.key.ns == RGW_OBJ_NS_MULTIPART) {
+    RGWMPObj mp;
+    mp.from_meta(obj.key.name);
+    obj.index_hash_source = mp.get_key();
+  }
+  BucketShard bs(this);
+
+  int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */, dpp);
+  if (ret < 0) {
+    ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
+    return ret;
+  }
+
+  return bi_put(bs, entry);
+}
+
+// List raw bucket index entries from the shard selected by obj_name_filter.
+// Note: -ENOENT clears *is_truncated but is still propagated to the caller.
+int RGWRados::bi_list(const DoutPrefixProvider *dpp, rgw_bucket& bucket,
+                      const string& obj_name_filter, const string& marker, uint32_t max,
+                      list<rgw_cls_bi_entry> *entries, bool *is_truncated)
+{
+  rgw_obj obj(bucket, obj_name_filter);
+  BucketShard bs(this);
+  int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */, dpp);
+  if (ret < 0) {
+    ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
+    return ret;
+  }
+
+  auto& ref = bs.bucket_obj.get_ref();
+  ret = cls_rgw_bi_list(ref.pool.ioctx(), ref.obj.oid, obj_name_filter, marker, max, entries, is_truncated);
+  if (ret == -ENOENT) {
+    *is_truncated = false;
+  }
+  if (ret < 0)
+    return ret;
+
+  return 0;
+}
+
+// List raw bucket index entries from an already-initialized shard,
+// normalizing any non-negative result to 0.
+int RGWRados::bi_list(BucketShard& bs, const string& obj_name_filter, const string& marker, uint32_t max,
+                      list<rgw_cls_bi_entry> *entries, bool *is_truncated)
+{
+  auto& shard_ref = bs.bucket_obj.get_ref();
+  const int r = cls_rgw_bi_list(shard_ref.pool.ioctx(), shard_ref.obj.oid, obj_name_filter,
+                                marker, max, entries, is_truncated);
+  return r < 0 ? r : 0;
+}
+
+// List raw bucket index entries from one shard of the bucket's *current*
+// index layout (note: always current_index, regardless of other layouts).
+int RGWRados::bi_list(const DoutPrefixProvider *dpp,
+                      const RGWBucketInfo& bucket_info, int shard_id, const string& obj_name_filter, const string& marker, uint32_t max,
+                      list<rgw_cls_bi_entry> *entries, bool *is_truncated)
+{
+  BucketShard bs(this);
+  int ret = bs.init(dpp, bucket_info,
+                    bucket_info.layout.current_index,
+                    shard_id);
+  if (ret < 0) {
+    ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
+    return ret;
+  }
+
+  return bi_list(bs, obj_name_filter, marker, max, entries, is_truncated);
+}
+
+// Remove a bucket index shard object entirely. A missing shard (-ENOENT)
+// counts as success.
+int RGWRados::bi_remove(const DoutPrefixProvider *dpp, BucketShard& bs)
+{
+  auto& shard_ref = bs.bucket_obj.get_ref();
+  int ret = shard_ref.pool.ioctx().remove(shard_ref.obj.oid);
+  if (ret == -ENOENT) {
+    ret = 0;  // already gone
+  } else if (ret < 0) {
+    ldpp_dout(dpp, 5) << "bs.index_ctx.remove(" << bs.bucket_obj << ") returned ret=" << ret << dendl;
+    return ret;
+  }
+
+  return 0;
+}
+
+// Synchronous write operation against the garbage-collection pool.
+int RGWRados::gc_operate(const DoutPrefixProvider *dpp, string& oid, librados::ObjectWriteOperation *op)
+{
+  return rgw_rados_operate(dpp, gc_pool_ctx, oid, op, null_yield);
+}
+
+// Asynchronous write to the GC pool; completion is signaled through the
+// caller-supplied AioCompletion.
+int RGWRados::gc_aio_operate(const string& oid, librados::AioCompletion *c,
+                             librados::ObjectWriteOperation *op)
+{
+  return gc_pool_ctx.aio_operate(oid, c, op);
+}
+
+// Synchronous read operation against the garbage-collection pool.
+int RGWRados::gc_operate(const DoutPrefixProvider *dpp, string& oid, librados::ObjectReadOperation *op, bufferlist *pbl)
+{
+  return rgw_rados_operate(dpp, gc_pool_ctx, oid, op, pbl, null_yield);
+}
+
+// List pending garbage-collection entries; forwarded to the GC module.
+int RGWRados::list_gc_objs(int *index, string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated, bool& processing_queue)
+{
+  return gc->list(index, marker, max, expired_only, result, truncated, processing_queue);
+}
+
+// Run one garbage-collection pass; forwarded to the GC module.
+int RGWRados::process_gc(bool expired_only)
+{
+  return gc->process(expired_only);
+}
+
+// List lifecycle processing progress entries; forwarded to the lifecycle
+// manager.
+int RGWRados::list_lc_progress(string& marker, uint32_t max_entries,
+                               vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>>& progress_map,
+                               int& index)
+{
+  return lc->list_lc_progress(marker, max_entries, progress_map, index);
+}
+
+// Run a single synchronous lifecycle pass ("once" mode), driven by a locally
+// constructed RGWLC instance and worker rather than the background threads.
+int RGWRados::process_lc(const std::unique_ptr<rgw::sal::Bucket>& optional_bucket)
+{
+  RGWLC lc;
+  lc.initialize(cct, this->driver);
+  RGWLC::LCWorker worker(&lc, cct, &lc, 0);
+  auto ret = lc.process(&worker, optional_bucket, true /* once */);
+  lc.stop_processor(); // sets down_flag, but returns immediately
+  return ret;
+}
+
+// Inspect all object-expirer shards from the epoch up to the current time.
+bool RGWRados::process_expire_objects(const DoutPrefixProvider *dpp)
+{
+  return obj_expirer->inspect_all_shards(dpp, utime_t(), ceph_clock_now());
+}
+
+// Synchronously record a "prepare" (pending) entry on the object's bucket
+// index shard ahead of an object write/delete; `tag` identifies the
+// transaction for the matching cls_obj_complete_*() call. The local zone is
+// added to the forwarded zone trace, and the op is guarded against an
+// in-progress reshard (-ERR_BUSY_RESHARDING).
+int RGWRados::cls_obj_prepare_op(const DoutPrefixProvider *dpp, BucketShard& bs, RGWModifyOp op, string& tag,
+                                 rgw_obj& obj, uint16_t bilog_flags, optional_yield y, rgw_zone_set *_zones_trace)
+{
+  const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation;
+  ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": bucket-shard=" << bs << " obj=" << obj << " tag=" << tag << " op=" << op << dendl_bitx;
+  ldout_bitx(bitx, dpp, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx;
+
+  rgw_zone_set zones_trace;
+  if (_zones_trace) {
+    zones_trace = *_zones_trace;
+  }
+  zones_trace.insert(svc.zone->get_zone().id, bs.bucket.get_key());
+
+  ObjectWriteOperation o;
+  o.assert_exists(); // bucket index shard must exist
+
+  cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
+  cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
+  cls_rgw_bucket_prepare_op(o, op, tag, key, obj.key.get_loc(), svc.zone->need_to_log_data(), bilog_flags, zones_trace);
+  int ret = bs.bucket_obj.operate(dpp, &o, y);
+  ldout_bitx(bitx, dpp, 10) << "EXITING " << __func__ << ": ret=" << ret << dendl_bitx;
+  return ret;
+}
+
+// Complete a bucket-index transaction started by cls_obj_prepare_op(). The
+// completion is submitted asynchronously with a completion object created by
+// the index completion manager; this function returns once the aio is
+// queued, not once the index update has been applied.
+int RGWRados::cls_obj_complete_op(BucketShard& bs, const rgw_obj& obj, RGWModifyOp op, string& tag,
+                                  int64_t pool, uint64_t epoch,
+                                  rgw_bucket_dir_entry& ent, RGWObjCategory category,
+                                  list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
+{
+  const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation;
+  ldout_bitx_c(bitx, cct, 10) << "ENTERING " << __func__ << ": bucket-shard=" << bs <<
+    " obj=" << obj << " tag=" << tag << " op=" << op <<
+    ", remove_objs=" << (remove_objs ? *remove_objs : std::list<rgw_obj_index_key>()) << dendl_bitx;
+  ldout_bitx_c(bitx, cct, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx;
+
+  ObjectWriteOperation o;
+  o.assert_exists(); // bucket index shard must exist
+
+  // entry metadata is taken from the caller's entry, with the category
+  // overridden by the explicit parameter
+  rgw_bucket_dir_entry_meta dir_meta;
+  dir_meta = ent.meta;
+  dir_meta.category = category;
+
+  // add the local zone to the trace set forwarded with the op
+  rgw_zone_set zones_trace;
+  if (_zones_trace) {
+    zones_trace = *_zones_trace;
+  }
+  zones_trace.insert(svc.zone->get_zone().id, bs.bucket.get_key());
+
+  rgw_bucket_entry_ver ver;
+  ver.pool = pool;
+  ver.epoch = epoch;
+  cls_rgw_obj_key key(ent.key.name, ent.key.instance);
+  cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
+  cls_rgw_bucket_complete_op(o, op, tag, ver, key, dir_meta, remove_objs,
+                             svc.zone->need_to_log_data(), bilog_flags, &zones_trace);
+  complete_op_data *arg;
+  index_completion_manager->create_completion(obj, op, tag, ver, key, dir_meta, remove_objs,
+                                              svc.zone->need_to_log_data(), bilog_flags, &zones_trace, &arg);
+  librados::AioCompletion *completion = arg->rados_completion;
+  int ret = bs.bucket_obj.aio_operate(arg->rados_completion, &o);
+  completion->release(); /* can't reference arg here, as it might have already been released */
+
+  ldout_bitx_c(bitx, cct, 10) << "EXITING " << __func__ << ": ret=" << ret << dendl_bitx;
+  return ret;
+}
+
+// Finalize a prepared bucket-index transaction as an ADD (CLS_RGW_OP_ADD).
+int RGWRados::cls_obj_complete_add(BucketShard& bs, const rgw_obj& obj, string& tag,
+                                   int64_t pool, uint64_t epoch,
+                                   rgw_bucket_dir_entry& ent, RGWObjCategory category,
+                                   list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace)
+{
+  return cls_obj_complete_op(bs, obj, CLS_RGW_OP_ADD, tag, pool, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
+}
+
+// Finalize a prepared bucket-index transaction as a DEL (CLS_RGW_OP_DEL),
+// carrying the deletion mtime in a minimal dir entry.
+int RGWRados::cls_obj_complete_del(BucketShard& bs, string& tag,
+                                   int64_t pool, uint64_t epoch,
+                                   rgw_obj& obj,
+                                   real_time& removed_mtime,
+                                   list<rgw_obj_index_key> *remove_objs,
+                                   uint16_t bilog_flags,
+                                   rgw_zone_set *zones_trace)
+{
+  rgw_bucket_dir_entry entry;
+  obj.key.get_index_key(&entry.key);
+  entry.meta.mtime = removed_mtime;
+  return cls_obj_complete_op(bs, obj, CLS_RGW_OP_DEL, tag, pool, epoch,
+                             entry, RGWObjCategory::None, remove_objs,
+                             bilog_flags, zones_trace);
+}
+
+// Cancel a prepared bucket-index transaction (CLS_RGW_OP_CANCEL). Only the
+// index key matters here; the entry version is a placeholder (-1/0).
+int RGWRados::cls_obj_complete_cancel(BucketShard& bs, string& tag, rgw_obj& obj,
+                                      list<rgw_obj_index_key> *remove_objs,
+                                      uint16_t bilog_flags, rgw_zone_set *zones_trace)
+{
+  rgw_bucket_dir_entry entry;
+  obj.key.get_index_key(&entry.key);
+  return cls_obj_complete_op(bs, obj, CLS_RGW_OP_CANCEL, tag,
+                             -1 /* pool id */, 0, entry,
+                             RGWObjCategory::None, remove_objs, bilog_flags,
+                             zones_trace);
+}
+
+// Set the pending-operation tag timeout on every shard of the bucket's
+// current index, issuing the per-shard calls with bounded parallelism
+// (rgw_bucket_index_max_aio).
+int RGWRados::cls_obj_set_bucket_tag_timeout(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, uint64_t timeout)
+{
+  RGWSI_RADOS::Pool index_pool;
+  map<int, string> bucket_objs;
+  int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, bucket_info.layout.current_index, &index_pool, &bucket_objs, nullptr);
+  if (r < 0)
+    return r;
+
+  return CLSRGWIssueSetTagTimeout(index_pool.ioctx(), bucket_objs, cct->_conf->rgw_bucket_index_max_aio, timeout)();
+}
+
+
+// Compute how many entries to request from each bucket-index shard when
+// assembling an ordered listing of num_entries entries across num_shards
+// shards. Returns 0 when num_shards is 0 (an error), since the math below
+// would otherwise divide by zero.
+uint32_t RGWRados::calc_ordered_bucket_list_per_shard(uint32_t num_entries,
+						      uint32_t num_shards)
+{
+  if (num_shards == 0) {
+    // guard against the division by num_shards below
+    return 0;
+  }
+
+  // Floor on the per-shard request size: when num_shards greatly exceeds
+  // num_entries, asking each shard for a handful of entries costs little
+  // more (per cls call overhead) than asking for one, and it reduces the
+  // chance of returning far fewer than num_entries to the client. This
+  // minimum might be better tuned based on future experiments.
+  constexpr uint32_t min_read = 8;
+
+  // Derived from "'Balls into Bins' -- A Simple and Tight Analysis" by Raab
+  // and Steger, with alpha taken as 1.0 and factored out of the formula.
+  // The added 1 acts roughly as a ceiling and also covers the case where
+  // num_shards greatly exceeds num_entries. Future work could memoize the
+  // transcendental calls, which tend to repeat with the same parameters.
+  const uint32_t balanced_read =
+    1 +
+    static_cast<uint32_t>((num_entries / num_shards) +
+			  sqrt((2 * num_entries) *
+			       log(num_shards) / num_shards));
+
+  return std::max(min_read, balanced_read);
+}
+
+
+int RGWRados::cls_bucket_list_ordered(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info,
+ const rgw::bucket_index_layout_generation& idx_layout,
+ const int shard_id,
+ const rgw_obj_index_key& start_after,
+ const std::string& prefix,
+ const std::string& delimiter,
+ const uint32_t num_entries,
+ const bool list_versions,
+ const uint16_t expansion_factor,
+ ent_map_t& m,
+ bool* is_truncated,
+ bool* cls_filtered,
+ rgw_obj_index_key* last_entry,
+ optional_yield y,
+ RGWBucketListNameFilter force_check_filter)
+{
+ const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation;
+
+ /* expansion_factor allows the number of entries to read to grow
+ * exponentially; this is used when earlier reads are producing too
+ * few results, perhaps due to filtering or to a series of
+ * namespaced entries */
+
+ ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": " << bucket_info.bucket <<
+ " start_after=\"" << start_after.to_string() <<
+ "\", prefix=\"" << prefix <<
+ ", delimiter=\"" << delimiter <<
+ "\", shard_id=" << shard_id <<
+ "\", num_entries=" << num_entries <<
+ ", shard_id=" << shard_id <<
+ ", list_versions=" << list_versions <<
+ ", expansion_factor=" << expansion_factor <<
+ ", force_check_filter is " <<
+ (force_check_filter ? "set" : "unset") << dendl_bitx;
+ ldout_bitx(bitx, dpp, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx;
+
+ m.clear();
+
+ RGWSI_RADOS::Pool index_pool;
+ // key - oid (for different shards if there is any)
+ // value - list result for the corresponding oid (shard), it is filled by
+ // the AIO callback
+ std::map<int, std::string> shard_oids;
+ int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, idx_layout,
+ &index_pool, &shard_oids,
+ nullptr);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ ": open_bucket_index for " << bucket_info.bucket << " failed" << dendl;
+ return r;
+ }
+
+ const uint32_t shard_count = shard_oids.size();
+ if (shard_count == 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+ ": the bucket index shard count appears to be 0, "
+ "which is an illegal value" << dendl;
+ return -ERR_INVALID_BUCKET_STATE;
+ }
+
+ uint32_t num_entries_per_shard;
+ if (expansion_factor == 0) {
+ num_entries_per_shard =
+ calc_ordered_bucket_list_per_shard(num_entries, shard_count);
+ } else if (expansion_factor <= 11) {
+ // we'll max out the exponential multiplication factor at 1024 (2<<10)
+ num_entries_per_shard =
+ std::min(num_entries,
+ (uint32_t(1 << (expansion_factor - 1)) *
+ calc_ordered_bucket_list_per_shard(num_entries, shard_count)));
+ } else {
+ num_entries_per_shard = num_entries;
+ }
+
+ if (num_entries_per_shard == 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+ ": unable to calculate the number of entries to read from each "
+ "bucket index shard" << dendl;
+ return -ERR_INVALID_BUCKET_STATE;
+ }
+
+ ldpp_dout(dpp, 10) << __func__ <<
+ ": request from each of " << shard_count <<
+ " shard(s) for " << num_entries_per_shard << " entries to get " <<
+ num_entries << " total entries" << dendl;
+
+ auto& ioctx = index_pool.ioctx();
+ std::map<int, rgw_cls_list_ret> shard_list_results;
+ cls_rgw_obj_key start_after_key(start_after.name, start_after.instance);
+ r = CLSRGWIssueBucketList(ioctx, start_after_key, prefix, delimiter,
+ num_entries_per_shard,
+ list_versions, shard_oids, shard_list_results,
+ cct->_conf->rgw_bucket_index_max_aio)();
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ ": CLSRGWIssueBucketList for " << bucket_info.bucket <<
+ " failed" << dendl;
+ return r;
+ }
+
+ // to manage the iterators through each shard's list results
+ struct ShardTracker {
+ const size_t shard_idx;
+ rgw_cls_list_ret& result;
+ const std::string& oid_name;
+ RGWRados::ent_map_t::iterator cursor;
+ RGWRados::ent_map_t::iterator end;
+
+ // manages an iterator through a shard and provides other
+ // accessors
+ ShardTracker(size_t _shard_idx,
+ rgw_cls_list_ret& _result,
+ const std::string& _oid_name):
+ shard_idx(_shard_idx),
+ result(_result),
+ oid_name(_oid_name),
+ cursor(_result.dir.m.begin()),
+ end(_result.dir.m.end())
+ {}
+
+ inline const std::string& entry_name() const {
+ return cursor->first;
+ }
+ rgw_bucket_dir_entry& dir_entry() const {
+ return cursor->second;
+ }
+ inline bool is_truncated() const {
+ return result.is_truncated;
+ }
+ inline ShardTracker& advance() {
+ ++cursor;
+ // return a self-reference to allow for chaining of calls, such
+ // as x.advance().at_end()
+ return *this;
+ }
+ inline bool at_end() const {
+ return cursor == end;
+ }
+ }; // ShardTracker
+
+ // add the next unique candidate, or return false if we reach the end
+ auto next_candidate = [] (CephContext *cct, ShardTracker& t,
+ std::multimap<std::string, size_t>& candidates,
+ size_t tracker_idx) {
+ if (!t.at_end()) {
+ candidates.emplace(t.entry_name(), tracker_idx);
+ }
+ return;
+ };
+
+ // one tracker per shard requested (may not be all shards)
+ std::vector<ShardTracker> results_trackers;
+ results_trackers.reserve(shard_list_results.size());
+ for (auto& r : shard_list_results) {
+ results_trackers.emplace_back(r.first, r.second, shard_oids[r.first]);
+
+ // if any *one* shard's result is trucated, the entire result is
+ // truncated
+ *is_truncated = *is_truncated || r.second.is_truncated;
+
+ // unless *all* are shards are cls_filtered, the entire result is
+ // not filtered
+ *cls_filtered = *cls_filtered && r.second.cls_filtered;
+ }
+
+ // create a map to track the next candidate entry from ShardTracker
+ // (key=candidate, value=index into results_trackers); as we consume
+ // entries from shards, we replace them with the next entries in the
+ // shards until we run out
+ std::multimap<std::string, size_t> candidates;
+ size_t tracker_idx = 0;
+ std::vector<size_t> vidx;
+ vidx.reserve(shard_list_results.size());
+ for (auto& t : results_trackers) {
+ // it's important that the values in the map refer to the index
+ // into the results_trackers vector, which may not be the same
+ // as the shard number (i.e., when not all shards are requested)
+ next_candidate(cct, t, candidates, tracker_idx);
+ ++tracker_idx;
+ }
+
+ rgw_bucket_dir_entry*
+ last_entry_visited = nullptr; // to set last_entry (marker)
+ std::map<std::string, bufferlist> updates;
+ uint32_t count = 0;
+ while (count < num_entries && !candidates.empty()) {
+ r = 0;
+ // select the next entry in lexical order (first key in map);
+ // again tracker_idx is not necessarily shard number, but is index
+ // into results_trackers vector
+ tracker_idx = candidates.begin()->second;
+ auto& tracker = results_trackers.at(tracker_idx);
+
+ const std::string& name = tracker.entry_name();
+ rgw_bucket_dir_entry& dirent = tracker.dir_entry();
+
+ ldpp_dout(dpp, 20) << __func__ << ": currently processing " <<
+ dirent.key << " from shard " << tracker.shard_idx << dendl;
+
+ const bool force_check =
+ force_check_filter && force_check_filter(dirent.key.name);
+
+ if ((!dirent.exists &&
+ !dirent.is_delete_marker() &&
+ !dirent.is_common_prefix()) ||
+ !dirent.pending_map.empty() ||
+ force_check) {
+ /* there are uncommitted ops. We need to check the current
+ * state, and if the tags are old we need to do clean-up as
+ * well. */
+ librados::IoCtx sub_ctx;
+ sub_ctx.dup(ioctx);
+ ldout_bitx(bitx, dpp, 20) << "INFO: " << __func__ <<
+ " calling check_disk_state bucket=" << bucket_info.bucket <<
+ " entry=" << dirent.key << dendl_bitx;
+ r = check_disk_state(dpp, sub_ctx, bucket_info, dirent, dirent,
+ updates[tracker.oid_name], y);
+ if (r < 0 && r != -ENOENT) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ ": check_disk_state for \"" << dirent.key <<
+ "\" failed with r=" << r << dendl;
+ return r;
+ }
+ } else {
+ r = 0;
+ }
+
+ // at this point either r >= 0 or r == -ENOENT
+ if (r >= 0) { // i.e., if r != -ENOENT
+ ldpp_dout(dpp, 10) << __func__ << ": got " <<
+ dirent.key << dendl;
+
+ auto [it, inserted] = m.insert_or_assign(name, std::move(dirent));
+ last_entry_visited = &it->second;
+ if (inserted) {
+ ++count;
+ } else {
+ ldpp_dout(dpp, 0) << "WARNING: " << __func__ <<
+ " reassigned map value at \"" << name <<
+ "\", which should not happen" << dendl;
+ }
+ } else {
+ ldpp_dout(dpp, 10) << __func__ << ": skipping " <<
+ dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
+ last_entry_visited = &tracker.dir_entry();
+ }
+
+ // refresh the candidates map
+ vidx.clear();
+ bool need_to_stop = false;
+ auto range = candidates.equal_range(name);
+ for (auto i = range.first; i != range.second; ++i) {
+ vidx.push_back(i->second);
+ }
+ candidates.erase(range.first, range.second);
+ for (auto idx : vidx) {
+ auto& tracker_match = results_trackers.at(idx);
+ tracker_match.advance();
+ next_candidate(cct, tracker_match, candidates, idx);
+ if (tracker_match.at_end() && tracker_match.is_truncated()) {
+ need_to_stop = true;
+ break;
+ }
+ }
+ if (need_to_stop) {
+ // once we exhaust one shard that is truncated, we need to stop,
+ // as we cannot be certain that one of the next entries needs to
+ // come from that shard; S3 and swift protocols allow returning
+ // fewer than what was requested
+ ldpp_dout(dpp, 10) << __func__ <<
+ ": stopped accumulating results at count=" << count <<
+ ", dirent=\"" << dirent.key <<
+ "\", because its shard is truncated and exhausted" << dendl;
+ break;
+ }
+ } // while we haven't provided requested # of result entries
+
+ // suggest updates if there are any
+ for (auto& miter : updates) {
+ if (miter.second.length()) {
+ ObjectWriteOperation o;
+ cls_rgw_suggest_changes(o, miter.second);
+ // we don't care if we lose suggested updates, send them off blindly
+ AioCompletion *c =
+ librados::Rados::aio_create_completion(nullptr, nullptr);
+
+ ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ <<
+ ": doing dir_suggest on " << miter.first << dendl_bitx;
+ ioctx.aio_operate(miter.first, c, &o);
+ c->release();
+ }
+ } // updates loop
+
+ // determine truncation by checking if all the returned entries are
+ // consumed or not
+ *is_truncated = false;
+ for (const auto& t : results_trackers) {
+ if (!t.at_end() || t.is_truncated()) {
+ *is_truncated = true;
+ break;
+ }
+ }
+
+ ldpp_dout(dpp, 20) << __func__ <<
+ ": returning, count=" << count << ", is_truncated=" << *is_truncated <<
+ dendl;
+
+ if (*is_truncated && count < num_entries) {
+ ldpp_dout(dpp, 10) << __func__ <<
+ ": requested " << num_entries << " entries but returning " <<
+ count << ", which is truncated" << dendl;
+ }
+
+ if (last_entry_visited != nullptr && last_entry) {
+ *last_entry = last_entry_visited->key;
+ ldpp_dout(dpp, 20) << __func__ <<
+ ": returning, last_entry=" << *last_entry << dendl;
+ } else {
+ ldpp_dout(dpp, 20) << __func__ <<
+ ": returning, last_entry NOT SET" << dendl;
+ }
+
+ ldout_bitx(bitx, dpp, 10) << "EXITING " << __func__ << dendl_bitx;
+ return 0;
+} // RGWRados::cls_bucket_list_ordered
+
+
// A helper function to retrieve the hash source from an incomplete
// multipart entry by removing everything from the second-to-last
// period on. Returns 0 on success, -EINVAL if the oid does not contain
// two periods at usable positions.
static int parse_index_hash_source(const std::string& oid_wo_ns, std::string *index_hash_source) {
  // the last period must exist and cannot be the very first character
  const auto last_dot = oid_wo_ns.rfind('.');
  if (last_dot == std::string::npos || last_dot < 1) {
    return -EINVAL;
  }
  // likewise for the period preceding it
  const auto second_last_dot = oid_wo_ns.rfind('.', last_dot - 1);
  if (second_last_dot == std::string::npos || second_last_dot < 1) {
    return -EINVAL;
  }
  // everything before the second-to-last period is the hash source
  index_hash_source->assign(oid_wo_ns, 0, second_last_dot);
  return 0;
}
+
+
+int RGWRados::cls_bucket_list_unordered(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info,
+ const rgw::bucket_index_layout_generation& idx_layout,
+ int shard_id,
+ const rgw_obj_index_key& start_after,
+ const std::string& prefix,
+ uint32_t num_entries,
+ bool list_versions,
+ std::vector<rgw_bucket_dir_entry>& ent_list,
+ bool *is_truncated,
+ rgw_obj_index_key *last_entry,
+ optional_yield y,
+ RGWBucketListNameFilter force_check_filter) {
+ const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation;
+
+ ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": " << bucket_info.bucket <<
+ " start_after=\"" << start_after <<
+ "\", prefix=\"" << prefix <<
+ "\", shard_id=" << shard_id <<
+ "\", num_entries=" << num_entries <<
+ ", list_versions=" << list_versions <<
+ (force_check_filter ? "set" : "unset") << dendl_bitx;
+ ldout_bitx(bitx, dpp, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx;
+
+ ent_list.clear();
+ static MultipartMetaFilter multipart_meta_filter;
+
+ *is_truncated = false;
+ RGWSI_RADOS::Pool index_pool;
+
+ std::map<int, std::string> oids;
+ int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, idx_layout, &index_pool, &oids, nullptr);
+ if (r < 0) {
+ return r;
+ }
+
+ auto& ioctx = index_pool.ioctx();
+
+ const uint32_t num_shards = oids.size();
+
+ rgw_obj_index_key marker = start_after;
+ uint32_t current_shard;
+ if (shard_id >= 0) {
+ current_shard = shard_id;
+ } else if (start_after.empty()) {
+ current_shard = 0u;
+ } else {
+ // at this point we have a marker (start_after) that has something
+ // in it, so we need to get to the bucket shard index, so we can
+ // start reading from there
+
+
+ // now convert the key (oid) to an rgw_obj_key since that will
+ // separate out the namespace, name, and instance
+ rgw_obj_key obj_key;
+ bool parsed = rgw_obj_key::parse_raw_oid(start_after.name, &obj_key);
+ if (!parsed) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+ " received an invalid start marker: \"" << start_after << "\"" <<
+ dendl;
+ return -EINVAL;
+ } else if (obj_key.name.empty()) {
+ // if the name is empty that means the object name came in with
+ // a namespace only, and therefore we need to start our scan at
+ // the first bucket index shard
+ current_shard = 0u;
+ } else {
+ // so now we have the key used to compute the bucket index shard
+ // and can extract the specific shard from it
+ if (obj_key.ns == RGW_OBJ_NS_MULTIPART) {
+ // Use obj_key.ns == RGW_OBJ_NS_MULTIPART instead of
+ // the implementation relying on MultipartMetaFilter
+ // because MultipartMetaFilter only checks .meta suffix, which may
+ // exclude data multiparts but include some regular objects with .meta suffix
+ // by mistake.
+ string index_hash_source;
+ r = parse_index_hash_source(obj_key.name, &index_hash_source);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+ " parse_index_hash_source unable to parse \"" << obj_key.name <<
+ "\", r=" << r << dendl;
+ return r;
+ }
+ current_shard = svc.bi_rados->bucket_shard_index(index_hash_source, num_shards);
+ } else {
+ current_shard = svc.bi_rados->bucket_shard_index(obj_key.name, num_shards);
+ }
+ }
+ }
+
+ uint32_t count = 0u;
+ std::map<std::string, bufferlist> updates;
+ rgw_obj_index_key last_added_entry;
+ while (count <= num_entries &&
+ ((shard_id >= 0 && current_shard == uint32_t(shard_id)) ||
+ current_shard < num_shards)) {
+ const std::string& oid = oids[current_shard];
+ rgw_cls_list_ret result;
+
+ librados::ObjectReadOperation op;
+ const std::string empty_delimiter;
+ cls_rgw_bucket_list_op(op, marker, prefix, empty_delimiter,
+ num_entries,
+ list_versions, &result);
+ r = rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, null_yield);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+ ": error in rgw_rados_operate (bucket list op), r=" << r << dendl;
+ return r;
+ }
+
+ for (auto& entry : result.dir.m) {
+ rgw_bucket_dir_entry& dirent = entry.second;
+
+ bool force_check = force_check_filter &&
+ force_check_filter(dirent.key.name);
+ if ((!dirent.exists && !dirent.is_delete_marker()) ||
+ !dirent.pending_map.empty() ||
+ force_check) {
+ /* there are uncommitted ops. We need to check the current state,
+ * and if the tags are old we need to do cleanup as well. */
+ librados::IoCtx sub_ctx;
+ sub_ctx.dup(ioctx);
+ ldout_bitx(bitx, dpp, 20) << "INFO: " << __func__ <<
+ ": calling check_disk_state bucket=" << bucket_info.bucket <<
+ " entry=" << dirent.key << dendl_bitx;
+ r = check_disk_state(dpp, sub_ctx, bucket_info, dirent, dirent, updates[oid], y);
+ if (r < 0 && r != -ENOENT) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+ ": error in check_disk_state, r=" << r << dendl;
+ return r;
+ }
+ } else {
+ r = 0;
+ }
+
+ // at this point either r >= 0 or r == -ENOENT
+ if (r >= 0) { // i.e., if r != -ENOENT
+ ldpp_dout(dpp, 10) << __func__ << ": got " <<
+ dirent.key << dendl;
+
+ if (count < num_entries) {
+ marker = last_added_entry = dirent.key; // double assign
+ ent_list.emplace_back(std::move(dirent));
+ ++count;
+ } else {
+ last_added_entry = dirent.key;
+ *is_truncated = true;
+ ldpp_dout(dpp, 10) << "INFO: " << __func__ <<
+ ": reached max entries (" << num_entries << ") to return at \"" <<
+ dirent.key << "\"" << dendl;
+ goto check_updates;
+ }
+ } else { // r == -ENOENT
+ // in the case of -ENOENT, make sure we're advancing marker
+ // for possible next call to CLSRGWIssueBucketList
+ marker = dirent.key;
+ }
+ } // entry for loop
+
+ if (!result.is_truncated) {
+ // if we reached the end of the shard read next shard
+ ++current_shard;
+ marker = rgw_obj_index_key();
+ }
+ } // shard loop
+
+check_updates:
+
+ // suggest updates if there is any
+ std::map<std::string, bufferlist>::iterator miter = updates.begin();
+ for (; miter != updates.end(); ++miter) {
+ if (miter->second.length()) {
+ ObjectWriteOperation o;
+ cls_rgw_suggest_changes(o, miter->second);
+ // we don't care if we lose suggested updates, send them off blindly
+ AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr);
+
+ ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ <<
+ " doing dir_suggest on " << miter->first << dendl_bitx;
+ ioctx.aio_operate(miter->first, c, &o);
+ c->release();
+ }
+ }
+
+ if (last_entry && !ent_list.empty()) {
+ *last_entry = last_added_entry;
+ }
+
+ ldout_bitx(bitx, dpp, 10) << "EXITING " << __func__ << dendl_bitx;
+ return 0;
+} // RGWRados::cls_bucket_list_unordered
+
+
+int RGWRados::cls_obj_usage_log_add(const DoutPrefixProvider *dpp, const string& oid,
+ rgw_usage_log_info& info)
+{
+ rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
+
+ rgw_rados_ref ref;
+ int r = get_raw_obj_ref(dpp, obj, &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ ObjectWriteOperation op;
+ cls_rgw_usage_log_add(op, info);
+
+ r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
+ return r;
+}
+
+int RGWRados::cls_obj_usage_log_read(const DoutPrefixProvider *dpp, const string& oid, const string& user, const string& bucket,
+ uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
+ string& read_iter, map<rgw_user_bucket, rgw_usage_log_entry>& usage,
+ bool *is_truncated)
+{
+ rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
+
+ rgw_rados_ref ref;
+ int r = get_raw_obj_ref(dpp, obj, &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ *is_truncated = false;
+
+ r = cls_rgw_usage_log_read(ref.pool.ioctx(), ref.obj.oid, user, bucket, start_epoch, end_epoch,
+ max_entries, read_iter, usage, is_truncated);
+
+ return r;
+}
+
+static int cls_rgw_usage_log_trim_repeat(const DoutPrefixProvider *dpp, rgw_rados_ref ref, const string& user, const string& bucket, uint64_t start_epoch, uint64_t end_epoch)
+{
+ bool done = false;
+ do {
+ librados::ObjectWriteOperation op;
+ cls_rgw_usage_log_trim(op, user, bucket, start_epoch, end_epoch);
+ int r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
+ if (r == -ENODATA)
+ done = true;
+ else if (r < 0)
+ return r;
+ } while (!done);
+
+ return 0;
+}
+
+int RGWRados::cls_obj_usage_log_trim(const DoutPrefixProvider *dpp, const string& oid, const string& user, const string& bucket,
+ uint64_t start_epoch, uint64_t end_epoch)
+{
+ rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
+
+ rgw_rados_ref ref;
+ int r = get_raw_obj_ref(dpp, obj, &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ r = cls_rgw_usage_log_trim_repeat(dpp, ref, user, bucket, start_epoch, end_epoch);
+ return r;
+}
+
+int RGWRados::cls_obj_usage_log_clear(const DoutPrefixProvider *dpp, string& oid)
+{
+ rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
+
+ rgw_rados_ref ref;
+ int r = get_raw_obj_ref(dpp, obj, &ref);
+ if (r < 0) {
+ return r;
+ }
+ librados::ObjectWriteOperation op;
+ cls_rgw_usage_log_clear(op);
+ r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
+ return r;
+}
+
+
+// note: this removes entries from the rados bucket index objects
+// without going through CLS; this is known to be called from
+// "radosgw-admin unlink" and "radosgw-admin bucket check --fix"
+int RGWRados::remove_objs_from_index(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info,
+ const std::list<rgw_obj_index_key>& entry_key_list)
+{
+ const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation;
+ ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": bucket=" << bucket_info.bucket <<
+ " entry_key_list.size()=" << entry_key_list.size() << dendl_bitx;
+ ldout_bitx(bitx, dpp, 25) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl_bitx;
+
+ const auto& current_index = bucket_info.get_current_index();
+ if (is_layout_indexless(current_index)) {
+ return -EINVAL;
+ }
+ const uint32_t num_shards = current_index.layout.normal.num_shards;
+
+ RGWSI_RADOS::Pool index_pool;
+ std::map<int, std::string> index_oids;
+ int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt,
+ bucket_info.layout.current_index,
+ &index_pool, &index_oids, nullptr);
+ if (r < 0) {
+ ldout_bitx(bitx, dpp, 0) << "ERROR: " << __func__ <<
+ " open_bucket_index returned " << r << dendl_bitx;
+ return r;
+ }
+
+ // split up removals by shard
+ std::map<int, std::set<std::string>> sharded_removals;
+ for (const auto& entry_key : entry_key_list) {
+ const rgw_obj_key obj_key(entry_key);
+ const uint32_t shard =
+ RGWSI_BucketIndex_RADOS::bucket_shard_index(obj_key, num_shards);
+
+ // entry_key already combines namespace and name, so we first have
+ // to break that apart before we can then combine with instance
+ std::string name;
+ std::string ns; // namespace
+ rgw_obj_key::parse_index_key(entry_key.name, &name, &ns);
+ rgw_obj_key full_key(name, entry_key.instance, ns);
+ std::string combined_key = full_key.get_oid();
+
+ sharded_removals[shard].insert(combined_key);
+
+ ldout_bitx(bitx, dpp, 20) << "INFO: " << __func__ <<
+ ": removal from bucket index, bucket=" << bucket_info.bucket <<
+ " key=" << combined_key << " designated for shard " << shard <<
+ dendl_bitx;
+ }
+
+ for (const auto& removals : sharded_removals) {
+ const int shard = removals.first;
+ const std::string& oid = index_oids[shard];
+
+ ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ <<
+ ": removal from bucket index, bucket=" << bucket_info.bucket <<
+ ", shard=" << shard << ", oid=" << oid << ", num_keys=" <<
+ removals.second.size() << dendl_bitx;
+
+ r = index_pool.ioctx().omap_rm_keys(oid, removals.second);
+ if (r < 0) {
+ ldout_bitx(bitx, dpp, 0) << "ERROR: " << __func__ <<
+ ": omap_rm_keys returned ret=" << r <<
+ dendl_bitx;
+ return r;
+ }
+ }
+
+ ldout_bitx(bitx, dpp, 5) <<
+ "EXITING " << __func__ << " and returning " << r << dendl_bitx;
+
+ return r;
+}
+
// Reconciles a bucket-index entry (list_state) against the actual head
// object on disk: fills `object` with the authoritative metadata read
// from the object state, and appends either a CEPH_RGW_REMOVE or a
// CEPH_RGW_UPDATE suggestion to suggested_updates for the bucket index
// to apply later.
// Returns -ENOENT when the object no longer exists on disk (after
// encoding a removal suggestion), another negative error on failure,
// or 0 on success.
int RGWRados::check_disk_state(const DoutPrefixProvider *dpp,
                               librados::IoCtx io_ctx,
                               RGWBucketInfo& bucket_info,
                               rgw_bucket_dir_entry& list_state,
                               rgw_bucket_dir_entry& object,
                               bufferlist& suggested_updates,
                               optional_yield y)
{
  const bool bitx = cct->_conf->rgw_bucket_index_transaction_instrumentation;
  ldout_bitx(bitx, dpp, 10) << "ENTERING " << __func__ << ": bucket=" <<
    bucket_info.bucket << " dir_entry=" << list_state.key << dendl_bitx;

  // in multisite deployments, suggested changes must also be logged
  uint8_t suggest_flag = (svc.zone->need_to_log_data() ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);

  std::string loc;

  rgw_obj obj(bucket_info.bucket, list_state.key);

  // multipart meta objects live in the bucket's extra-data pool
  MultipartMetaFilter multipart_meta_filter;
  string temp_key;
  if (multipart_meta_filter.filter(list_state.key.name, temp_key)) {
    obj.in_extra_data = true;
  }

  string oid;
  get_obj_bucket_and_oid_loc(obj, oid, loc);

  // sanity check: the locator we derive should match what the index holds
  if (loc != list_state.locator) {
    ldpp_dout(dpp, 0) << "WARNING: generated locator (" << loc << ") is different from listed locator (" << list_state.locator << ")" << dendl;
  }

  io_ctx.locator_set_key(list_state.locator);

  // read the object's current state (and manifest, if any) from rados
  RGWObjState *astate = NULL;
  RGWObjManifest *manifest = nullptr;
  RGWObjectCtx rctx(this->driver);
  int r = get_obj_state(dpp, &rctx, bucket_info, obj, &astate, &manifest, false, y);
  if (r < 0)
    return r;

  list_state.pending_map.clear(); // we don't need this and it inflates size
  if (!list_state.is_delete_marker() && !astate->exists) {
    // NOTE(review): this branch runs when the object does NOT exist on
    // disk; the "disk state exists" wording below looks inverted —
    // confirm against upstream intent
    ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << ": disk state exists" << dendl_bitx;
    /* object doesn't exist right now -- hopefully because it's
     * marked as !exists and got deleted */
    if (list_state.exists) {
      ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << ": index list state exists" << dendl_bitx;
      /* FIXME: what should happen now? Work out if there are any
       * non-bad ways this could happen (there probably are, but annoying
       * to handle!) */
    }

    // encode a suggested removal of that key
    list_state.ver.epoch = io_ctx.get_last_version();
    list_state.ver.pool = io_ctx.get_id();
    ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << ": encoding remove of " << list_state.key << " on suggested_updates" << dendl_bitx;
    cls_rgw_encode_suggestion(CEPH_RGW_REMOVE | suggest_flag, list_state, suggested_updates);
    return -ENOENT;
  }

  string etag;
  string content_type;
  string storage_class;
  ACLOwner owner;
  bool appendable = false;

  // copy authoritative sizes/mtime from the on-disk state
  object.meta.size = astate->size;
  object.meta.accounted_size = astate->accounted_size;
  object.meta.mtime = astate->mtime;

  // pull the metadata we mirror into the index out of the xattr set
  map<string, bufferlist>::iterator iter = astate->attrset.find(RGW_ATTR_ETAG);
  if (iter != astate->attrset.end()) {
    etag = rgw_bl_str(iter->second);
  }
  iter = astate->attrset.find(RGW_ATTR_CONTENT_TYPE);
  if (iter != astate->attrset.end()) {
    content_type = rgw_bl_str(iter->second);
  }
  iter = astate->attrset.find(RGW_ATTR_STORAGE_CLASS);
  if (iter != astate->attrset.end()) {
    storage_class = rgw_bl_str(iter->second);
  }
  iter = astate->attrset.find(RGW_ATTR_ACL);
  if (iter != astate->attrset.end()) {
    r = decode_policy(dpp, iter->second, &owner);
    if (r < 0) {
      ldpp_dout(dpp, 0) << "WARNING: could not decode policy for object: " << obj << dendl;
    }
  }
  iter = astate->attrset.find(RGW_ATTR_APPEND_PART_NUM);
  if (iter != astate->attrset.end()) {
    appendable = true;
  }

  if (manifest) {
    // walk the manifest and drop index entries for multipart parts
    RGWObjManifest::obj_iterator miter;
    for (miter = manifest->obj_begin(dpp); miter != manifest->obj_end(dpp); ++miter) {
      const rgw_raw_obj& raw_loc = miter.get_location().get_raw_obj(this);
      rgw_obj loc;
      RGWSI_Tier_RADOS::raw_obj_to_obj(manifest->get_obj().bucket, raw_loc, &loc);

      if (loc.key.ns == RGW_OBJ_NS_MULTIPART) {
	ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ << " removing manifest part from index loc=" << loc << dendl_bitx;
	r = delete_obj_index(loc, astate->mtime, dpp, y);
	if (r < 0) {
	  // best-effort: log and continue with the remaining parts
	  ldout_bitx(bitx, dpp, 0) <<
	    "WARNING: " << __func__ << ": delete_obj_index returned r=" << r << dendl_bitx;
	}
      }
    }
  }

  object.meta.etag = etag;
  object.meta.content_type = content_type;
  object.meta.storage_class = storage_class;
  object.meta.owner = owner.get_id().to_str();
  object.meta.owner_display_name = owner.get_display_name();
  object.meta.appendable = appendable;

  // encode suggested updates

  list_state.meta.size = object.meta.size;
  list_state.meta.accounted_size = object.meta.accounted_size;
  list_state.meta.mtime = object.meta.mtime;
  list_state.meta.category = main_category;
  list_state.meta.etag = etag;
  list_state.meta.appendable = appendable;
  list_state.meta.content_type = content_type;
  list_state.meta.storage_class = storage_class;

  librados::IoCtx head_obj_ctx; // initialize to data pool so we can get pool id
  r = get_obj_head_ioctx(dpp, bucket_info, obj, &head_obj_ctx);
  if (r < 0) {
    // non-fatal: the suggestion is still encoded without ver info
    ldpp_dout(dpp, 0) << __func__ <<
      " WARNING: unable to find head object data pool for \"" <<
      obj << "\", not updating version pool/epoch" << dendl;
  } else {
    list_state.ver.pool = head_obj_ctx.get_id();
    list_state.ver.epoch = astate->epoch;
  }

  if (astate->obj_tag.length() > 0) {
    list_state.tag = astate->obj_tag.c_str();
  }

  list_state.meta.owner = owner.get_id().to_str();
  list_state.meta.owner_display_name = owner.get_display_name();

  list_state.exists = true;

  ldout_bitx(bitx, dpp, 10) << "INFO: " << __func__ <<
    ": encoding update of " << list_state.key << " on suggested_updates" << dendl_bitx;
  cls_rgw_encode_suggestion(CEPH_RGW_UPDATE | suggest_flag, list_state, suggested_updates);

  ldout_bitx(bitx, dpp, 10) << "EXITING " << __func__ << dendl_bitx;
  return 0;
} // RGWRados::check_disk_state
+
+int RGWRados::cls_bucket_head(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, vector<rgw_bucket_dir_header>& headers, map<int, string> *bucket_instance_ids)
+{
+ RGWSI_RADOS::Pool index_pool;
+ map<int, string> oids;
+ map<int, struct rgw_cls_list_ret> list_results;
+ int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, idx_layout, &index_pool, &oids, bucket_instance_ids);
+ if (r < 0) {
+ ldpp_dout(dpp, 20) << "cls_bucket_head: open_bucket_index() returned "
+ << r << dendl;
+ return r;
+ }
+
+ r = CLSRGWIssueGetDirHeader(index_pool.ioctx(), oids, list_results, cct->_conf->rgw_bucket_index_max_aio)();
+ if (r < 0) {
+ ldpp_dout(dpp, 20) << "cls_bucket_head: CLSRGWIssueGetDirHeader() returned "
+ << r << dendl;
+ return r;
+ }
+
+ map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
+ for(; iter != list_results.end(); ++iter) {
+ headers.push_back(std::move(iter->second.dir.header));
+ }
+ return 0;
+}
+
+int RGWRados::cls_bucket_head_async(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio)
+{
+ RGWSI_RADOS::Pool index_pool;
+ map<int, string> bucket_objs;
+ int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, idx_layout, &index_pool, &bucket_objs, nullptr);
+ if (r < 0)
+ return r;
+
+ map<int, string>::iterator iter = bucket_objs.begin();
+ for (; iter != bucket_objs.end(); ++iter) {
+ r = cls_rgw_get_dir_header_async(index_pool.ioctx(), iter->second, static_cast<RGWGetDirHeader_CB*>(ctx->get()));
+ if (r < 0) {
+ ctx->put();
+ break;
+ } else {
+ (*num_aio)++;
+ }
+ }
+ return r;
+}
+
+int RGWRados::check_bucket_shards(const RGWBucketInfo& bucket_info,
+ const rgw_bucket& bucket,
+ uint64_t num_objs,
+ const DoutPrefixProvider *dpp)
+{
+ if (! cct->_conf.get_val<bool>("rgw_dynamic_resharding")) {
+ return 0;
+ }
+
+ bool need_resharding = false;
+ uint32_t num_source_shards = rgw::current_num_shards(bucket_info.layout);
+ const uint32_t max_dynamic_shards =
+ uint32_t(cct->_conf.get_val<uint64_t>("rgw_max_dynamic_shards"));
+
+ if (num_source_shards >= max_dynamic_shards) {
+ return 0;
+ }
+
+ uint32_t suggested_num_shards = 0;
+ const uint64_t max_objs_per_shard =
+ cct->_conf.get_val<uint64_t>("rgw_max_objs_per_shard");
+
+ // TODO: consider per-bucket sync policy here?
+ const bool is_multisite = svc.zone->need_to_log_data();
+
+ quota_handler->check_bucket_shards(dpp, max_objs_per_shard, num_source_shards,
+ num_objs, is_multisite, need_resharding,
+ &suggested_num_shards);
+ if (! need_resharding) {
+ return 0;
+ }
+
+ const uint32_t final_num_shards =
+ RGWBucketReshard::get_preferred_shards(suggested_num_shards,
+ max_dynamic_shards);
+ // final verification, so we don't reduce number of shards
+ if (final_num_shards <= num_source_shards) {
+ return 0;
+ }
+
+ ldpp_dout(dpp, 1) << "RGWRados::" << __func__ << " bucket " << bucket.name <<
+ " needs resharding; current num shards " << bucket_info.layout.current_index.layout.normal.num_shards <<
+ "; new num shards " << final_num_shards << " (suggested " <<
+ suggested_num_shards << ")" << dendl;
+
+ return add_bucket_to_reshard(dpp, bucket_info, final_num_shards);
+}
+
+int RGWRados::add_bucket_to_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, uint32_t new_num_shards)
+{
+ RGWReshard reshard(this->driver, dpp);
+
+ uint32_t num_source_shards = rgw::current_num_shards(bucket_info.layout);
+
+ new_num_shards = std::min(new_num_shards, get_max_bucket_shards());
+ if (new_num_shards <= num_source_shards) {
+ ldpp_dout(dpp, 20) << "not resharding bucket name=" << bucket_info.bucket.name << ", orig_num=" << num_source_shards << ", new_num_shards=" << new_num_shards << dendl;
+ return 0;
+ }
+
+ cls_rgw_reshard_entry entry;
+ entry.time = real_clock::now();
+ entry.tenant = bucket_info.owner.tenant;
+ entry.bucket_name = bucket_info.bucket.name;
+ entry.bucket_id = bucket_info.bucket.bucket_id;
+ entry.old_num_shards = num_source_shards;
+ entry.new_num_shards = new_num_shards;
+
+ return reshard.add(dpp, entry);
+}
+
+int RGWRados::check_quota(const DoutPrefixProvider *dpp, const rgw_user& bucket_owner, rgw_bucket& bucket,
+ RGWQuota& quota,
+ uint64_t obj_size, optional_yield y,
+ bool check_size_only)
+{
+ // if we only check size, then num_objs will set to 0
+ if(check_size_only)
+ return quota_handler->check_quota(dpp, bucket_owner, bucket, quota, 0, obj_size, y);
+
+ return quota_handler->check_quota(dpp, bucket_owner, bucket, quota, 1, obj_size, y);
+}
+
+int RGWRados::get_target_shard_id(const rgw::bucket_index_normal_layout& layout, const string& obj_key,
+ int *shard_id)
+{
+ int r = 0;
+ switch (layout.hash_type) {
+ case rgw::BucketHashType::Mod:
+ if (!layout.num_shards) {
+ if (shard_id) {
+ *shard_id = -1;
+ }
+ } else {
+ uint32_t sid = svc.bi_rados->bucket_shard_index(obj_key, layout.num_shards);
+ if (shard_id) {
+ *shard_id = (int)sid;
+ }
+ }
+ break;
+ default:
+ r = -ENOTSUP;
+ }
+ return r;
+}
+
+uint64_t RGWRados::instance_id()
+{
+ return get_rados_handle()->get_instance_id();
+}
+
+uint64_t RGWRados::next_bucket_id()
+{
+ std::lock_guard l{bucket_id_lock};
+ return ++max_bucket_id;
+}
+
// Accessor for the librados client handle owned by this RGWRados
// instance.
librados::Rados* RGWRados::get_rados_handle()
{
  return &rados;
}
+
+int RGWRados::delete_raw_obj_aio(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, list<librados::AioCompletion *>& handles)
+{
+ rgw_rados_ref ref;
+ int ret = get_raw_obj_ref(dpp, obj, &ref);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
+ return ret;
+ }
+
+ ObjectWriteOperation op;
+ list<string> prefixes;
+ cls_rgw_remove_obj(op, prefixes);
+
+ AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr);
+ ret = ref.pool.ioctx().aio_operate(ref.obj.oid, c, &op);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: AioOperate failed with ret=" << ret << dendl;
+ c->release();
+ return ret;
+ }
+
+ handles.push_back(c);
+
+ return 0;
+}
+
// Asynchronously deletes an object's head, optionally keeping the bucket
// index consistent by preparing a CLS_RGW_OP_DEL before the removal and
// completing the index entry afterwards. On success the AioCompletion is
// appended to `handles` for the caller to wait on and release.
int RGWRados::delete_obj_aio(const DoutPrefixProvider *dpp, const rgw_obj& obj,
                             RGWBucketInfo& bucket_info, RGWObjState *astate,
                             list<librados::AioCompletion *>& handles, bool keep_index_consistent,
                             optional_yield y)
{
  rgw_rados_ref ref;
  int ret = get_obj_head_ref(dpp, bucket_info, obj, &ref);
  if (ret < 0) {
    ldpp_dout(dpp, -1) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
    return ret;
  }

  if (keep_index_consistent) {
    // announce the pending delete to the bucket index before touching
    // the head object
    RGWRados::Bucket bop(this, bucket_info);
    RGWRados::Bucket::UpdateIndex index_op(&bop, obj);

    ret = index_op.prepare(dpp, CLS_RGW_OP_DEL, &astate->write_tag, y);
    if (ret < 0) {
      ldpp_dout(dpp, -1) << "ERROR: failed to prepare index op with ret=" << ret << dendl;
      return ret;
    }
  }

  // build and submit the asynchronous head removal
  ObjectWriteOperation op;
  list<string> prefixes;
  cls_rgw_remove_obj(op, prefixes);

  AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr);
  ret = ref.pool.ioctx().aio_operate(ref.obj.oid, c, &op);
  if (ret < 0) {
    ldpp_dout(dpp, -1) << "ERROR: AioOperate failed with ret=" << ret << dendl;
    c->release();
    return ret;
  }

  handles.push_back(c);

  if (keep_index_consistent) {
    // complete the index side of the delete; note this runs after the
    // aio submit, not after its completion
    ret = delete_obj_index(obj, astate->mtime, dpp, y);
    if (ret < 0) {
      ldpp_dout(dpp, -1) << "ERROR: failed to delete obj index with ret=" << ret << dendl;
      return ret;
    }
  }
  return ret;
}
+
+void objexp_hint_entry::generate_test_instances(list<objexp_hint_entry*>& o)
+{
+ auto it = new objexp_hint_entry;
+ it->tenant = "tenant1";
+ it->bucket_name = "bucket1";
+ it->bucket_id = "1234";
+ it->obj_key = rgw_obj_key("obj");
+ o.push_back(it);
+ o.push_back(new objexp_hint_entry);
+}
+
// Serializes this expiration-hint entry to JSON via the formatter.
void objexp_hint_entry::dump(Formatter *f) const
{
  f->open_object_section("objexp_hint_entry");
  encode_json("tenant", tenant, f);
  encode_json("bucket_name", bucket_name, f);
  encode_json("bucket_id", bucket_id, f);
  encode_json("rgw_obj_key", obj_key, f);
  // the expiration time is rendered through utime_t for formatting
  utime_t ut(exp_time);
  encode_json("exp_time", ut, f);
  f->close_section();
}
+
+void RGWOLHInfo::generate_test_instances(list<RGWOLHInfo*> &o)
+{
+ RGWOLHInfo *olh = new RGWOLHInfo;
+ olh->removed = false;
+ o.push_back(olh);
+ o.push_back(new RGWOLHInfo);
+}
+
// Serializes the OLH target key to JSON.
// NOTE(review): the `removed` flag is not emitted here — confirm that
// is intentional.
void RGWOLHInfo::dump(Formatter *f) const
{
  encode_json("target", target, f);
}
+
// Serializes the pending-operation timestamp to JSON via utime_t.
void RGWOLHPendingInfo::dump(Formatter *f) const
{
  utime_t ut(time);
  encode_json("time", ut, f);
}
+
diff --git a/src/rgw/driver/rados/rgw_rados.h b/src/rgw/driver/rados/rgw_rados.h
new file mode 100644
index 000000000..75a5e1b54
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_rados.h
@@ -0,0 +1,1661 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <iostream>
+#include <functional>
+#include <boost/container/flat_map.hpp>
+#include <boost/container/flat_set.hpp>
+
+#include "include/rados/librados.hpp"
+#include "include/Context.h"
+#include "include/random.h"
+#include "common/RefCountedObj.h"
+#include "common/ceph_time.h"
+#include "common/Timer.h"
+#include "rgw_common.h"
+#include "cls/rgw/cls_rgw_types.h"
+#include "cls/version/cls_version_types.h"
+#include "cls/log/cls_log_types.h"
+#include "cls/timeindex/cls_timeindex_types.h"
+#include "cls/otp/cls_otp_types.h"
+#include "rgw_quota.h"
+#include "rgw_log.h"
+#include "rgw_metadata.h"
+#include "rgw_meta_sync_status.h"
+#include "rgw_period_puller.h"
+#include "rgw_obj_manifest.h"
+#include "rgw_sync_module.h"
+#include "rgw_trim_bilog.h"
+#include "rgw_service.h"
+#include "rgw_sal.h"
+#include "rgw_aio.h"
+#include "rgw_d3n_cacherequest.h"
+
+#include "services/svc_rados.h"
+#include "services/svc_bi_rados.h"
+#include "common/Throttle.h"
+#include "common/ceph_mutex.h"
+#include "rgw_cache.h"
+#include "rgw_sal_fwd.h"
+
+struct D3nDataCache;
+
+class RGWWatcher;
+class ACLOwner;
+class RGWGC;
+class RGWMetaNotifier;
+class RGWDataNotifier;
+class RGWLC;
+class RGWObjectExpirer;
+class RGWMetaSyncProcessorThread;
+class RGWDataSyncProcessorThread;
+class RGWSyncLogTrimThread;
+class RGWSyncTraceManager;
+struct RGWZoneGroup;
+struct RGWZoneParams;
+class RGWReshard;
+class RGWReshardWait;
+
+struct get_obj_data;
+
+/* flags for put_obj_meta() */
+#define PUT_OBJ_CREATE 0x01
+#define PUT_OBJ_EXCL 0x02
+#define PUT_OBJ_CREATE_EXCL (PUT_OBJ_CREATE | PUT_OBJ_EXCL)
+
+static inline void prepend_bucket_marker(const rgw_bucket& bucket, const std::string& orig_oid, std::string& oid)
+{
+  // When both the bucket marker and the oid are non-empty, the oid is
+  // namespaced as "<marker>_<oid>"; otherwise it passes through unchanged.
+  if (!bucket.marker.empty() && !orig_oid.empty()) {
+    oid = bucket.marker;
+    oid.append("_");
+    oid.append(orig_oid);
+  } else {
+    oid = orig_oid;
+  }
+}
+
+static inline void get_obj_bucket_and_oid_loc(const rgw_obj& obj, std::string& oid, std::string& locator)
+{
+  // Derive the RADOS oid (bucket-marker-prefixed) and the locator for obj.
+  // The locator is cleared when the key carries no explicit location.
+  prepend_bucket_marker(obj.bucket, obj.get_oid(), oid);
+  const std::string& loc = obj.key.get_loc();
+  if (loc.empty()) {
+    locator.clear();
+  } else {
+    prepend_bucket_marker(obj.bucket, loc, locator);
+  }
+}
+
+// Object Logical Head metadata: points a versioned-object head at its
+// current target instance.
+struct RGWOLHInfo {
+  rgw_obj target;  // object instance the OLH currently resolves to
+  bool removed;    // NOTE(review): presumably set once the logical object
+                   // was deleted — confirm against the OLH update paths
+
+  RGWOLHInfo() : removed(false) {}
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(target, bl);
+    encode(removed, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(target, bl);
+    decode(removed, bl);
+    DECODE_FINISH(bl);
+  }
+  static void generate_test_instances(std::list<RGWOLHInfo*>& o);
+  void dump(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(RGWOLHInfo)
+
+// Pending-operation marker attached to an OLH; carries only a timestamp.
+struct RGWOLHPendingInfo {
+  ceph::real_time time;  // when the pending entry was created
+
+  RGWOLHPendingInfo() {}
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(time, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(time, bl);
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(RGWOLHPendingInfo)
+
+// Batch of usage-log entries keyed by timestamp.
+struct RGWUsageBatch {
+  std::map<ceph::real_time, rgw_usage_log_entry> m;
+
+  /// Aggregate 'entry' into the slot for timestamp 't'.
+  /// *account is set to true iff 't' was not already present, i.e. the
+  /// caller should account this as a brand-new entry.
+  void insert(ceph::real_time& t, rgw_usage_log_entry& entry, bool *account) {
+    // try_emplace default-constructs the mapped value only when the key is
+    // absent — same semantics as the old find() + operator[] pair, but with
+    // a single map lookup instead of two.
+    auto [it, inserted] = m.try_emplace(t);
+    *account = inserted;
+    it->second.aggregate(entry);
+  }
+};
+
+// Describes one byte range to clone from a source object.
+struct RGWCloneRangeInfo {
+  rgw_obj src;    // source object
+  off_t src_ofs;  // byte offset within the source
+  off_t dst_ofs;  // byte offset within the destination
+  uint64_t len;   // number of bytes in the range
+};
+
+// Hook interface invoked while fetching a remote object; lets the caller
+// override the destination owner and placement rule based on the source
+// key and attributes.
+class RGWFetchObjFilter {
+public:
+  virtual ~RGWFetchObjFilter() {}
+
+  // Returns 0 on success; may set *poverride_owner and *prule.
+  // NOTE(review): exact error contract not visible here — confirm callers.
+  virtual int filter(CephContext *cct,
+                     const rgw_obj_key& source_key,
+                     const RGWBucketInfo& dest_bucket_info,
+                     std::optional<rgw_placement_rule> dest_placement_rule,
+                     const std::map<std::string, bufferlist>& obj_attrs,
+                     std::optional<rgw_user> *poverride_owner,
+                     const rgw_placement_rule **prule) = 0;
+};
+
+// Default fetch filter; caches the destination rule it selects in
+// dest_rule (implementation lives in the .cc file).
+class RGWFetchObjFilter_Default : public RGWFetchObjFilter {
+protected:
+  rgw_placement_rule dest_rule;  // rule chosen for the destination object
+public:
+  RGWFetchObjFilter_Default() {}
+
+  int filter(CephContext *cct,
+             const rgw_obj_key& source_key,
+             const RGWBucketInfo& dest_bucket_info,
+             std::optional<rgw_placement_rule> dest_placement_rule,
+             const std::map<std::string, bufferlist>& obj_attrs,
+             std::optional<rgw_user> *poverride_owner,
+             const rgw_placement_rule **prule) override;
+};
+
+// Per-object entry stored in RGWObjectCtx::objs_state.
+struct RGWObjStateManifest {
+  RGWObjState state;                       // object state
+  std::optional<RGWObjManifest> manifest;  // manifest, when the object has one
+};
+
+// Per-request cache of object state/manifest entries, guarded by a
+// shared mutex.
+class RGWObjectCtx {
+  rgw::sal::Driver* driver;
+  ceph::shared_mutex lock = ceph::make_shared_mutex("RGWObjectCtx");
+
+  std::map<rgw_obj, RGWObjStateManifest> objs_state;
+public:
+  explicit RGWObjectCtx(rgw::sal::Driver* _driver) : driver(_driver) {}
+  RGWObjectCtx(RGWObjectCtx& _o) {
+    // Bug fix: lock the *source* context while copying its state. The
+    // previous code locked this->lock, a freshly constructed mutex that
+    // can never be contended, so _o.objs_state was read without any
+    // protection against concurrent mutation of _o.
+    std::unique_lock wl{_o.lock};
+    this->driver = _o.driver;
+    this->objs_state = _o.objs_state;
+  }
+
+  rgw::sal::Driver* get_driver() {
+    return driver;
+  }
+
+  // Returns the state entry for obj (see .cc for insertion semantics).
+  RGWObjStateManifest *get_state(const rgw_obj& obj);
+
+  // Flag mutators and invalidation for a single object's cached entry.
+  void set_compressed(const rgw_obj& obj);
+  void set_atomic(const rgw_obj& obj);
+  void set_prefetch_data(const rgw_obj& obj);
+  void invalidate(const rgw_obj& obj);
+};
+
+
+// Cached state of a raw RADOS object (stat results, version tracker,
+// optionally prefetched data).
+struct RGWRawObjState {
+  rgw_raw_obj obj;
+  bool has_attrs{false};      // attrset has been populated
+  bool exists{false};         // object exists in RADOS
+  uint64_t size{0};           // object size in bytes
+  ceph::real_time mtime;
+  uint64_t epoch{0};
+  bufferlist obj_tag;
+  bool has_data{false};       // 'data' holds (some of) the object contents
+  bufferlist data;
+  bool prefetch_data{false};  // request data prefetch on next read
+  uint64_t pg_ver{0};
+
+  /* important! don't forget to update copy constructor */
+
+  RGWObjVersionTracker objv_tracker;
+
+  std::map<std::string, bufferlist> attrset;
+  RGWRawObjState() {}
+  // Member-by-member copy; bufferlists are copied only when non-empty.
+  RGWRawObjState(const RGWRawObjState& rhs) : obj (rhs.obj) {
+    has_attrs = rhs.has_attrs;
+    exists = rhs.exists;
+    size = rhs.size;
+    mtime = rhs.mtime;
+    epoch = rhs.epoch;
+    if (rhs.obj_tag.length()) {
+      obj_tag = rhs.obj_tag;
+    }
+    has_data = rhs.has_data;
+    if (rhs.data.length()) {
+      data = rhs.data;
+    }
+    prefetch_data = rhs.prefetch_data;
+    pg_ver = rhs.pg_ver;
+    objv_tracker = rhs.objv_tracker;
+    // NOTE(review): attrset is NOT copied here — confirm whether that is
+    // intentional before relying on copies carrying attributes.
+  }
+};
+
+// Iteration state for listing the objects of one pool.
+struct RGWPoolIterCtx {
+  librados::IoCtx io_ctx;           // io context of the pool being listed
+  librados::NObjectIterator iter;   // current position in the listing
+};
+
+// Cursor for list_raw_objects_init()/next(); 'initialized' flips once the
+// underlying pool iteration has been set up.
+struct RGWListRawObjsCtx {
+  // In-class default member initializer replaces the hand-written
+  // constructor (same default-constructed behavior, less boilerplate —
+  // matches the {0}/{false} style used elsewhere in this header).
+  bool initialized = false;
+  RGWPoolIterCtx iter_ctx;
+};
+
+// Hint record telling the object expirer which object to expire and when.
+struct objexp_hint_entry {
+  std::string tenant;
+  std::string bucket_name;
+  std::string bucket_id;
+  rgw_obj_key obj_key;
+  ceph::real_time exp_time;  // when the object becomes eligible for expiry
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(2, 1, bl);
+    encode(bucket_name, bl);
+    encode(bucket_id, bl);
+    encode(obj_key, bl);
+    encode(exp_time, bl);
+    // tenant was added in v2, so it is encoded last to keep v1 decoders
+    // reading the earlier fields at their original offsets
+    encode(tenant, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    // XXX Do we want DECODE_START_LEGACY_COMPAT_LEN(2, 1, 1, bl); ?
+    DECODE_START(2, bl);
+    decode(bucket_name, bl);
+    decode(bucket_id, bl);
+    decode(obj_key, bl);
+    decode(exp_time, bl);
+    if (struct_v >= 2) {
+      decode(tenant, bl);
+    } else {
+      // v1 payloads predate multitenancy: leave tenant empty
+      tenant.clear();
+    }
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+  static void generate_test_instances(std::list<objexp_hint_entry*>& o);
+};
+WRITE_CLASS_ENCODER(objexp_hint_entry)
+
+class RGWMetaSyncStatusManager;
+class RGWDataSyncStatusManager;
+class RGWCoroutinesManagerRegistry;
+
+class RGWGetDirHeader_CB;
+class RGWGetUserHeader_CB;
+namespace rgw { namespace sal {
+ class RadosStore;
+ class MPRadosSerializer;
+ class LCRadosSerializer;
+} }
+
+class RGWAsyncRadosProcessor;
+
+template <class T>
+class RGWChainedCacheImpl;
+
+struct bucket_info_entry {
+ RGWBucketInfo info;
+ real_time mtime;
+ std::map<std::string, bufferlist> attrs;
+};
+
+struct tombstone_entry;
+
+template <class K, class V>
+class lru_map;
+using tombstone_cache_t = lru_map<rgw_obj, tombstone_entry>;
+
+class RGWIndexCompletionManager;
+
+class RGWRados
+{
+ friend class RGWGC;
+ friend class RGWMetaNotifier;
+ friend class RGWDataNotifier;
+ friend class RGWObjectExpirer;
+ friend class RGWMetaSyncProcessorThread;
+ friend class RGWDataSyncProcessorThread;
+ friend class RGWReshard;
+ friend class RGWBucketReshard;
+ friend class RGWBucketReshardLock;
+ friend class BucketIndexLockGuard;
+ friend class rgw::sal::MPRadosSerializer;
+ friend class rgw::sal::LCRadosSerializer;
+ friend class rgw::sal::RadosStore;
+
+ /** Open the pool used as root for this gateway */
+ int open_root_pool_ctx(const DoutPrefixProvider *dpp);
+ int open_gc_pool_ctx(const DoutPrefixProvider *dpp);
+ int open_lc_pool_ctx(const DoutPrefixProvider *dpp);
+ int open_objexp_pool_ctx(const DoutPrefixProvider *dpp);
+ int open_reshard_pool_ctx(const DoutPrefixProvider *dpp);
+ int open_notif_pool_ctx(const DoutPrefixProvider *dpp);
+
+ int open_pool_ctx(const DoutPrefixProvider *dpp, const rgw_pool& pool, librados::IoCtx& io_ctx,
+ bool mostly_omap, bool bulk);
+
+
+ ceph::mutex lock = ceph::make_mutex("rados_timer_lock");
+ SafeTimer *timer;
+
+ rgw::sal::RadosStore* driver = nullptr;
+ RGWGC *gc = nullptr;
+ RGWLC *lc;
+ RGWObjectExpirer *obj_expirer;
+ bool use_gc_thread;
+ bool use_lc_thread;
+ bool quota_threads;
+ bool run_sync_thread;
+ bool run_reshard_thread;
+
+ RGWMetaNotifier *meta_notifier;
+ RGWDataNotifier *data_notifier;
+ RGWMetaSyncProcessorThread *meta_sync_processor_thread;
+ RGWSyncTraceManager *sync_tracer = nullptr;
+ std::map<rgw_zone_id, RGWDataSyncProcessorThread *> data_sync_processor_threads;
+
+ boost::optional<rgw::BucketTrimManager> bucket_trim;
+ RGWSyncLogTrimThread *sync_log_trimmer{nullptr};
+
+ ceph::mutex meta_sync_thread_lock = ceph::make_mutex("meta_sync_thread_lock");
+ ceph::mutex data_sync_thread_lock = ceph::make_mutex("data_sync_thread_lock");
+
+ librados::IoCtx root_pool_ctx; // .rgw
+
+ ceph::mutex bucket_id_lock{ceph::make_mutex("rados_bucket_id")};
+
+ // This field represents the number of bucket index object shards
+ uint32_t bucket_index_max_shards;
+
+ std::string get_cluster_fsid(const DoutPrefixProvider *dpp, optional_yield y);
+
+ int get_obj_head_ref(const DoutPrefixProvider *dpp, const rgw_placement_rule& target_placement_rule, const rgw_obj& obj, rgw_rados_ref *ref);
+ int get_obj_head_ref(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_rados_ref *ref);
+ int get_system_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref);
+ uint64_t max_bucket_id;
+
+ int clear_olh(const DoutPrefixProvider *dpp,
+ RGWObjectCtx& obj_ctx,
+ const rgw_obj& obj,
+ RGWBucketInfo& bucket_info,
+ rgw_rados_ref& ref,
+ const std::string& tag,
+ const uint64_t ver,
+ optional_yield y);
+
+ int get_olh_target_state(const DoutPrefixProvider *dpp, RGWObjectCtx& rctx,
+ RGWBucketInfo& bucket_info, const rgw_obj& obj,
+ RGWObjState *olh_state, RGWObjState **target_state,
+ RGWObjManifest **target_manifest, optional_yield y);
+ int get_obj_state_impl(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWObjState **state, RGWObjManifest** manifest,
+ bool follow_olh, optional_yield y, bool assume_noent = false);
+ int append_atomic_test(const DoutPrefixProvider *dpp, RGWObjectCtx* rctx, RGWBucketInfo& bucket_info, const rgw_obj& obj,
+ librados::ObjectOperation& op, RGWObjState **state,
+ RGWObjManifest** pmanifest, optional_yield y);
+
+ int update_placement_map();
+ int store_bucket_info(RGWBucketInfo& info, std::map<std::string, bufferlist> *pattrs, RGWObjVersionTracker *objv_tracker, bool exclusive);
+
+ void remove_rgw_head_obj(librados::ObjectWriteOperation& op);
+ void cls_obj_check_prefix_exist(librados::ObjectOperation& op, const std::string& prefix, bool fail_if_exist);
+ void cls_obj_check_mtime(librados::ObjectOperation& op, const real_time& mtime, bool high_precision_time, RGWCheckMTimeType type);
+protected:
+ CephContext *cct;
+
+ librados::Rados rados;
+
+ using RGWChainedCacheImpl_bucket_info_entry = RGWChainedCacheImpl<bucket_info_entry>;
+ RGWChainedCacheImpl_bucket_info_entry *binfo_cache;
+
+ tombstone_cache_t *obj_tombstone_cache;
+
+ librados::IoCtx gc_pool_ctx; // .rgw.gc
+ librados::IoCtx lc_pool_ctx; // .rgw.lc
+ librados::IoCtx objexp_pool_ctx;
+ librados::IoCtx reshard_pool_ctx;
+ librados::IoCtx notif_pool_ctx; // .rgw.notif
+
+ bool pools_initialized;
+
+ RGWQuotaHandler *quota_handler;
+
+ RGWCoroutinesManagerRegistry *cr_registry;
+
+ RGWSyncModuleInstanceRef sync_module;
+ bool writeable_zone{false};
+
+ RGWIndexCompletionManager *index_completion_manager{nullptr};
+
+ bool use_cache{false};
+ bool use_gc{true};
+ bool use_datacache{false};
+
+ int get_obj_head_ioctx(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::IoCtx *ioctx);
+public:
+ RGWRados(): timer(NULL),
+ gc(NULL), lc(NULL), obj_expirer(NULL), use_gc_thread(false), use_lc_thread(false), quota_threads(false),
+ run_sync_thread(false), run_reshard_thread(false), meta_notifier(NULL),
+ data_notifier(NULL), meta_sync_processor_thread(NULL),
+ bucket_index_max_shards(0),
+ max_bucket_id(0), cct(NULL),
+ binfo_cache(NULL), obj_tombstone_cache(nullptr),
+ pools_initialized(false),
+ quota_handler(NULL),
+ cr_registry(NULL),
+ pctl(&ctl),
+ reshard(NULL) {}
+
+ RGWRados& set_use_cache(bool status) {
+ use_cache = status;
+ return *this;
+ }
+
+ RGWRados& set_use_gc(bool status) {
+ use_gc = status;
+ return *this;
+ }
+
+ RGWRados& set_use_datacache(bool status) {
+ use_datacache = status;
+ return *this;
+ }
+
+ bool get_use_datacache() {
+ return use_datacache;
+ }
+
+ RGWLC *get_lc() {
+ return lc;
+ }
+
+ RGWGC *get_gc() {
+ return gc;
+ }
+
+ RGWRados& set_run_gc_thread(bool _use_gc_thread) {
+ use_gc_thread = _use_gc_thread;
+ return *this;
+ }
+
+ RGWRados& set_run_lc_thread(bool _use_lc_thread) {
+ use_lc_thread = _use_lc_thread;
+ return *this;
+ }
+
+ RGWRados& set_run_quota_threads(bool _run_quota_threads) {
+ quota_threads = _run_quota_threads;
+ return *this;
+ }
+
+ RGWRados& set_run_sync_thread(bool _run_sync_thread) {
+ run_sync_thread = _run_sync_thread;
+ return *this;
+ }
+
+ RGWRados& set_run_reshard_thread(bool _run_reshard_thread) {
+ run_reshard_thread = _run_reshard_thread;
+ return *this;
+ }
+
+ librados::IoCtx* get_lc_pool_ctx() {
+ return &lc_pool_ctx;
+ }
+
+ librados::IoCtx& get_notif_pool_ctx() {
+ return notif_pool_ctx;
+ }
+
+ void set_context(CephContext *_cct) {
+ cct = _cct;
+ }
+ void set_store(rgw::sal::RadosStore* _driver) {
+ driver = _driver;
+ }
+
+ RGWServices svc;
+ RGWCtl ctl;
+
+ RGWCtl *pctl{nullptr};
+
+ /**
+ * AmazonS3 errors contain a HostId string, but is an opaque base64 blob; we
+ * try to be more transparent. This has a wrapper so we can update it when zonegroup/zone are changed.
+ */
+ std::string host_id;
+
+ RGWReshard *reshard;
+ std::shared_ptr<RGWReshardWait> reshard_wait;
+
+ virtual ~RGWRados() = default;
+
+ tombstone_cache_t *get_tombstone_cache() {
+ return obj_tombstone_cache;
+ }
+ const RGWSyncModuleInstanceRef& get_sync_module() {
+ return sync_module;
+ }
+ RGWSyncTraceManager *get_sync_tracer() {
+ return sync_tracer;
+ }
+
+ int get_required_alignment(const DoutPrefixProvider *dpp, const rgw_pool& pool, uint64_t *alignment);
+ void get_max_aligned_size(uint64_t size, uint64_t alignment, uint64_t *max_size);
+ int get_max_chunk_size(const rgw_pool& pool, uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment = nullptr);
+ int get_max_chunk_size(const rgw_placement_rule& placement_rule, const rgw_obj& obj, uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment = nullptr);
+
+ uint32_t get_max_bucket_shards() {
+ return RGWSI_BucketIndex_RADOS::shards_max();
+ }
+
+
+ int get_raw_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref);
+
+ int list_raw_objects_init(const DoutPrefixProvider *dpp, const rgw_pool& pool, const std::string& marker, RGWListRawObjsCtx *ctx);
+ int list_raw_objects_next(const DoutPrefixProvider *dpp, const std::string& prefix_filter, int max,
+ RGWListRawObjsCtx& ctx, std::list<std::string>& oids,
+ bool *is_truncated);
+ int list_raw_objects(const DoutPrefixProvider *dpp, const rgw_pool& pool, const std::string& prefix_filter, int max,
+ RGWListRawObjsCtx& ctx, std::list<std::string>& oids,
+ bool *is_truncated);
+ std::string list_raw_objs_get_cursor(RGWListRawObjsCtx& ctx);
+
+ CephContext *ctx() { return cct; }
+ /** do all necessary setup of the storage device */
+ int init_begin(CephContext *_cct, const DoutPrefixProvider *dpp) {
+ set_context(_cct);
+ return init_begin(dpp);
+ }
+ /** Initialize the RADOS instance and prepare to do other ops */
+ int init_svc(bool raw, const DoutPrefixProvider *dpp);
+ int init_ctl(const DoutPrefixProvider *dpp);
+ virtual int init_rados();
+ int init_begin(const DoutPrefixProvider *dpp);
+ int init_complete(const DoutPrefixProvider *dpp);
+ void finalize();
+
+ int register_to_service_map(const DoutPrefixProvider *dpp, const std::string& daemon_type, const std::map<std::string, std::string>& meta);
+ int update_service_map(const DoutPrefixProvider *dpp, std::map<std::string, std::string>&& status);
+
+ /// list logs
+ int log_list_init(const DoutPrefixProvider *dpp, const std::string& prefix, RGWAccessHandle *handle);
+ int log_list_next(RGWAccessHandle handle, std::string *name);
+
+ /// remove log
+ int log_remove(const DoutPrefixProvider *dpp, const std::string& name);
+
+ /// show log
+ int log_show_init(const DoutPrefixProvider *dpp, const std::string& name, RGWAccessHandle *handle);
+ int log_show_next(const DoutPrefixProvider *dpp, RGWAccessHandle handle, rgw_log_entry *entry);
+
+ // log bandwidth info
+ int log_usage(const DoutPrefixProvider *dpp, std::map<rgw_user_bucket, RGWUsageBatch>& usage_info);
+ int read_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const std::string& bucket_name, uint64_t start_epoch, uint64_t end_epoch,
+ uint32_t max_entries, bool *is_truncated, RGWUsageIter& read_iter, std::map<rgw_user_bucket,
+ rgw_usage_log_entry>& usage);
+ int trim_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const std::string& bucket_name, uint64_t start_epoch, uint64_t end_epoch);
+ int clear_usage(const DoutPrefixProvider *dpp);
+
+ int create_pool(const DoutPrefixProvider *dpp, const rgw_pool& pool);
+
+ void create_bucket_id(std::string *bucket_id);
+
+ bool get_obj_data_pool(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_pool *pool);
+ bool obj_to_raw(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_raw_obj *raw_obj);
+
+ int create_bucket(const RGWUserInfo& owner, rgw_bucket& bucket,
+ const std::string& zonegroup_id,
+ const rgw_placement_rule& placement_rule,
+ const std::string& swift_ver_location,
+ const RGWQuotaInfo * pquota_info,
+ std::map<std::string,bufferlist>& attrs,
+ RGWBucketInfo& bucket_info,
+ obj_version *pobjv,
+ obj_version *pep_objv,
+ ceph::real_time creation_time,
+ rgw_bucket *master_bucket,
+ uint32_t *master_num_shards,
+ optional_yield y,
+ const DoutPrefixProvider *dpp,
+ bool exclusive = true);
+
+ RGWCoroutinesManagerRegistry *get_cr_registry() { return cr_registry; }
+
+  // Handle to a single bucket-index shard object; init() resolves the
+  // shard oid for a given object or explicit shard id.
+  struct BucketShard {
+    RGWRados *store;
+    rgw_bucket bucket;
+    int shard_id;                 // -1 until init() selects a shard
+    RGWSI_RADOS::Obj bucket_obj;  // the resolved index shard object
+
+    explicit BucketShard(RGWRados *_store) : store(_store), shard_id(-1) {}
+    int init(const rgw_bucket& _bucket, const rgw_obj& obj,
+             RGWBucketInfo* out, const DoutPrefixProvider *dpp);
+    int init(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj);
+    int init(const DoutPrefixProvider *dpp,
+             const RGWBucketInfo& bucket_info,
+             const rgw::bucket_index_layout_generation& index, int sid);
+
+    friend std::ostream& operator<<(std::ostream& out, const BucketShard& bs) {
+      out << "BucketShard:{ bucket=" << bs.bucket <<
+        ", shard_id=" << bs.shard_id <<
+        // fixed misspelled "bucket_ojb" in the diagnostic output
+        ", bucket_obj=" << bs.bucket_obj << "}";
+      return out;
+    }
+  };
+
+ class Object {
+ RGWRados *store;
+ RGWBucketInfo bucket_info;
+ RGWObjectCtx& ctx;
+ rgw_obj obj;
+
+ BucketShard bs;
+
+ RGWObjState *state;
+ RGWObjManifest *manifest;
+
+ bool versioning_disabled;
+
+ bool bs_initialized;
+
+ const rgw_placement_rule *pmeta_placement_rule;
+
+ protected:
+ int get_state(const DoutPrefixProvider *dpp, RGWObjState **pstate, RGWObjManifest **pmanifest, bool follow_olh, optional_yield y, bool assume_noent = false);
+ void invalidate_state();
+
+ int prepare_atomic_modification(const DoutPrefixProvider *dpp, librados::ObjectWriteOperation& op, bool reset_obj, const std::string *ptag,
+ const char *ifmatch, const char *ifnomatch, bool removal_op, bool modify_tail, optional_yield y);
+ int complete_atomic_modification(const DoutPrefixProvider *dpp);
+
+ public:
+ Object(RGWRados *_store, const RGWBucketInfo& _bucket_info, RGWObjectCtx& _ctx, const rgw_obj& _obj) : store(_store), bucket_info(_bucket_info),
+ ctx(_ctx), obj(_obj), bs(store),
+ state(NULL), manifest(nullptr), versioning_disabled(false),
+ bs_initialized(false),
+ pmeta_placement_rule(nullptr) {}
+
+ RGWRados *get_store() { return store; }
+ rgw_obj& get_obj() { return obj; }
+ RGWObjectCtx& get_ctx() { return ctx; }
+ RGWBucketInfo& get_bucket_info() { return bucket_info; }
+ //const std::string& get_instance() { return obj->get_instance(); }
+ //rgw::sal::Object* get_target() { return obj; }
+ int get_manifest(const DoutPrefixProvider *dpp, RGWObjManifest **pmanifest, optional_yield y);
+
+ int get_bucket_shard(BucketShard **pbs, const DoutPrefixProvider *dpp) {
+ if (!bs_initialized) {
+ int r =
+ bs.init(bucket_info.bucket, obj, nullptr /* no RGWBucketInfo */, dpp);
+ if (r < 0) {
+ return r;
+ }
+ bs_initialized = true;
+ }
+ *pbs = &bs;
+ return 0;
+ }
+
+ void set_versioning_disabled(bool status) {
+ versioning_disabled = status;
+ }
+
+ bool versioning_enabled() {
+ return (!versioning_disabled && bucket_info.versioning_enabled());
+ }
+
+ void set_meta_placement_rule(const rgw_placement_rule *p) {
+ pmeta_placement_rule = p;
+ }
+
+ const rgw_placement_rule& get_meta_placement_rule() {
+ return pmeta_placement_rule ? *pmeta_placement_rule : bucket_info.placement_rule;
+ }
+
+ struct Read {
+ RGWRados::Object *source;
+
+ struct GetObjState {
+ std::map<rgw_pool, librados::IoCtx> io_ctxs;
+ rgw_pool cur_pool;
+ librados::IoCtx *cur_ioctx{nullptr};
+ rgw_obj obj;
+ rgw_raw_obj head_obj;
+ } state;
+
+ struct ConditionParams {
+ const ceph::real_time *mod_ptr;
+ const ceph::real_time *unmod_ptr;
+ bool high_precision_time;
+ uint32_t mod_zone_id;
+ uint64_t mod_pg_ver;
+ const char *if_match;
+ const char *if_nomatch;
+
+ ConditionParams() :
+ mod_ptr(NULL), unmod_ptr(NULL), high_precision_time(false), mod_zone_id(0), mod_pg_ver(0),
+ if_match(NULL), if_nomatch(NULL) {}
+ } conds;
+
+ struct Params {
+ ceph::real_time *lastmod;
+ uint64_t *obj_size;
+ std::map<std::string, bufferlist> *attrs;
+ rgw_obj *target_obj;
+
+ Params() : lastmod(nullptr), obj_size(nullptr), attrs(nullptr),
+ target_obj(nullptr) {}
+ } params;
+
+ explicit Read(RGWRados::Object *_source) : source(_source) {}
+
+ int prepare(optional_yield y, const DoutPrefixProvider *dpp);
+ static int range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end);
+ int read(int64_t ofs, int64_t end, bufferlist& bl, optional_yield y, const DoutPrefixProvider *dpp);
+ int iterate(const DoutPrefixProvider *dpp, int64_t ofs, int64_t end, RGWGetDataCB *cb, optional_yield y);
+ int get_attr(const DoutPrefixProvider *dpp, const char *name, bufferlist& dest, optional_yield y);
+ };
+
+ struct Write {
+ RGWRados::Object *target;
+
+ struct MetaParams {
+ ceph::real_time *mtime;
+ std::map<std::string, bufferlist>* rmattrs;
+ const bufferlist *data;
+ RGWObjManifest *manifest;
+ const std::string *ptag;
+ std::list<rgw_obj_index_key> *remove_objs;
+ ceph::real_time set_mtime;
+ rgw_user owner;
+ RGWObjCategory category;
+ int flags;
+ const char *if_match;
+ const char *if_nomatch;
+ std::optional<uint64_t> olh_epoch;
+ ceph::real_time delete_at;
+ bool canceled;
+ const std::string *user_data;
+ rgw_zone_set *zones_trace;
+ bool modify_tail;
+ bool completeMultipart;
+ bool appendable;
+
+ MetaParams() : mtime(NULL), rmattrs(NULL), data(NULL), manifest(NULL), ptag(NULL),
+ remove_objs(NULL), category(RGWObjCategory::Main), flags(0),
+ if_match(NULL), if_nomatch(NULL), canceled(false), user_data(nullptr), zones_trace(nullptr),
+ modify_tail(false), completeMultipart(false), appendable(false) {}
+ } meta;
+
+ explicit Write(RGWRados::Object *_target) : target(_target) {}
+
+ int _do_write_meta(const DoutPrefixProvider *dpp,
+ uint64_t size, uint64_t accounted_size,
+ std::map<std::string, bufferlist>& attrs,
+ bool modify_tail, bool assume_noent,
+ void *index_op, optional_yield y);
+ int write_meta(const DoutPrefixProvider *dpp, uint64_t size, uint64_t accounted_size,
+ std::map<std::string, bufferlist>& attrs, optional_yield y);
+ int write_data(const char *data, uint64_t ofs, uint64_t len, bool exclusive);
+ const req_state* get_req_state() {
+ return nullptr; /* XXX dang Only used by LTTng, and it handles null anyway */
+ }
+ };
+
+ struct Delete {
+ RGWRados::Object *target;
+
+ struct DeleteParams {
+ rgw_user bucket_owner;
+ int versioning_status; // versioning flags defined in enum RGWBucketFlags
+ ACLOwner obj_owner; // needed for creation of deletion marker
+ uint64_t olh_epoch;
+ std::string marker_version_id;
+ uint32_t bilog_flags;
+ std::list<rgw_obj_index_key> *remove_objs;
+ ceph::real_time expiration_time;
+ ceph::real_time unmod_since;
+ ceph::real_time mtime; /* for setting delete marker mtime */
+ bool high_precision_time;
+ rgw_zone_set *zones_trace;
+ bool abortmp;
+ uint64_t parts_accounted_size;
+
+ DeleteParams() : versioning_status(0), olh_epoch(0), bilog_flags(0), remove_objs(NULL), high_precision_time(false), zones_trace(nullptr), abortmp(false), parts_accounted_size(0) {}
+ } params;
+
+ struct DeleteResult {
+ bool delete_marker;
+ std::string version_id;
+
+ DeleteResult() : delete_marker(false) {}
+ } result;
+
+ explicit Delete(RGWRados::Object *_target) : target(_target) {}
+
+ int delete_obj(optional_yield y, const DoutPrefixProvider *dpp);
+ };
+
+ struct Stat {
+ RGWRados::Object *source;
+
+ struct Result {
+ rgw_obj obj;
+ std::optional<RGWObjManifest> manifest;
+ uint64_t size{0};
+ struct timespec mtime {};
+ std::map<std::string, bufferlist> attrs;
+ } result;
+
+ struct State {
+ librados::IoCtx io_ctx;
+ librados::AioCompletion *completion;
+ int ret;
+
+ State() : completion(NULL), ret(0) {}
+ } state;
+
+
+ explicit Stat(RGWRados::Object *_source) : source(_source) {}
+
+ int stat_async(const DoutPrefixProvider *dpp);
+ int wait(const DoutPrefixProvider *dpp);
+ int stat();
+ private:
+ int finish(const DoutPrefixProvider *dpp);
+ };
+ };
+
+ class Bucket {
+ RGWRados *store;
+ RGWBucketInfo bucket_info;
+ rgw_bucket& bucket;
+ int shard_id;
+
+ public:
+ Bucket(RGWRados *_store, const RGWBucketInfo& _bucket_info) : store(_store), bucket_info(_bucket_info), bucket(bucket_info.bucket),
+ shard_id(RGW_NO_SHARD) {}
+ RGWRados *get_store() { return store; }
+ rgw_bucket& get_bucket() { return bucket; }
+ RGWBucketInfo& get_bucket_info() { return bucket_info; }
+
+ int update_bucket_id(const std::string& new_bucket_id, const DoutPrefixProvider *dpp);
+
+ int get_shard_id() { return shard_id; }
+ void set_shard_id(int id) {
+ shard_id = id;
+ }
+
+ class UpdateIndex {
+ RGWRados::Bucket *target;
+ std::string optag;
+ rgw_obj obj;
+ uint16_t bilog_flags{0};
+ BucketShard bs;
+ bool bs_initialized{false};
+ bool blind;
+ bool prepared{false};
+ rgw_zone_set *zones_trace{nullptr};
+
+ int init_bs(const DoutPrefixProvider *dpp) {
+ int r =
+ bs.init(target->get_bucket(), obj, &target->bucket_info, dpp);
+ if (r < 0) {
+ return r;
+ }
+ bs_initialized = true;
+ return 0;
+ }
+
+ void invalidate_bs() {
+ bs_initialized = false;
+ }
+
+ int guard_reshard(const DoutPrefixProvider *dpp, const rgw_obj& obj_instance, BucketShard **pbs, std::function<int(BucketShard *)> call);
+ public:
+
+ UpdateIndex(RGWRados::Bucket *_target, const rgw_obj& _obj) : target(_target), obj(_obj),
+ bs(target->get_store()) {
+ blind = (target->get_bucket_info().layout.current_index.layout.type == rgw::BucketIndexType::Indexless);
+ }
+
+ int get_bucket_shard(BucketShard **pbs, const DoutPrefixProvider *dpp) {
+ if (!bs_initialized) {
+ int r = init_bs(dpp);
+ if (r < 0) {
+ return r;
+ }
+ }
+ *pbs = &bs;
+ return 0;
+ }
+
+ void set_bilog_flags(uint16_t flags) {
+ bilog_flags = flags;
+ }
+
+ void set_zones_trace(rgw_zone_set *_zones_trace) {
+ zones_trace = _zones_trace;
+ }
+
+ int prepare(const DoutPrefixProvider *dpp, RGWModifyOp, const std::string *write_tag, optional_yield y);
+ int complete(const DoutPrefixProvider *dpp, int64_t poolid, uint64_t epoch, uint64_t size,
+ uint64_t accounted_size, ceph::real_time& ut,
+ const std::string& etag, const std::string& content_type,
+ const std::string& storage_class,
+ bufferlist *acl_bl, RGWObjCategory category,
+ std::list<rgw_obj_index_key> *remove_objs,
+ optional_yield y,
+ const std::string *user_data = nullptr,
+ bool appendable = false);
+ int complete_del(const DoutPrefixProvider *dpp,
+ int64_t poolid, uint64_t epoch,
+ ceph::real_time& removed_mtime, /* mtime of removed object */
+ std::list<rgw_obj_index_key> *remove_objs,
+ optional_yield y);
+ int cancel(const DoutPrefixProvider *dpp,
+ std::list<rgw_obj_index_key> *remove_objs,
+ optional_yield y);
+
+ const std::string *get_optag() { return &optag; }
+
+ bool is_prepared() { return prepared; }
+ }; // class UpdateIndex
+
+ class List {
+ protected:
+ // absolute maximum number of objects that
+ // list_objects_(un)ordered can return
+ static constexpr int64_t bucket_list_objects_absolute_max = 25000;
+
+ RGWRados::Bucket *target;
+ rgw_obj_key next_marker;
+
+ int list_objects_ordered(const DoutPrefixProvider *dpp,
+ int64_t max,
+ std::vector<rgw_bucket_dir_entry> *result,
+ std::map<std::string, bool> *common_prefixes,
+ bool *is_truncated,
+ optional_yield y);
+ int list_objects_unordered(const DoutPrefixProvider *dpp,
+ int64_t max,
+ std::vector<rgw_bucket_dir_entry> *result,
+ std::map<std::string, bool> *common_prefixes,
+ bool *is_truncated,
+ optional_yield y);
+
+ public:
+
+ struct Params {
+ std::string prefix;
+ std::string delim;
+ rgw_obj_key marker;
+ rgw_obj_key end_marker;
+ std::string ns;
+ bool enforce_ns;
+ RGWAccessListFilter* access_list_filter;
+ RGWBucketListNameFilter force_check_filter;
+ bool list_versions;
+ bool allow_unordered;
+
+ Params() :
+ enforce_ns(true),
+ access_list_filter(nullptr),
+ list_versions(false),
+ allow_unordered(false)
+ {}
+ } params;
+
+ explicit List(RGWRados::Bucket *_target) : target(_target) {}
+
+ int list_objects(const DoutPrefixProvider *dpp, int64_t max,
+ std::vector<rgw_bucket_dir_entry> *result,
+ std::map<std::string, bool> *common_prefixes,
+ bool *is_truncated,
+ optional_yield y) {
+ if (params.allow_unordered) {
+ return list_objects_unordered(dpp, max, result, common_prefixes,
+ is_truncated, y);
+ } else {
+ return list_objects_ordered(dpp, max, result, common_prefixes,
+ is_truncated, y);
+ }
+ }
+ rgw_obj_key& get_next_marker() {
+ return next_marker;
+ }
+ }; // class List
+ }; // class Bucket
+
+ int on_last_entry_in_listing(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info,
+ const std::string& obj_prefix,
+ const std::string& obj_delim,
+ std::function<int(const rgw_bucket_dir_entry&)> handler);
+
+ bool swift_versioning_enabled(const RGWBucketInfo& bucket_info) const;
+
+ int swift_versioning_copy(RGWObjectCtx& obj_ctx, /* in/out */
+ const rgw_user& user, /* in */
+ RGWBucketInfo& bucket_info, /* in */
+ const rgw_obj& obj, /* in */
+ const DoutPrefixProvider *dpp, /* in */
+ optional_yield y); /* in */
+ int swift_versioning_restore(RGWObjectCtx& obj_ctx, /* in/out */
+ const rgw_user& user, /* in */
+ RGWBucketInfo& bucket_info, /* in */
+ rgw_obj& obj, /* in/out */
+ bool& restored, /* out */
+ const DoutPrefixProvider *dpp); /* in */
+ int copy_obj_to_remote_dest(const DoutPrefixProvider *dpp,
+ RGWObjState *astate,
+ std::map<std::string, bufferlist>& src_attrs,
+ RGWRados::Object::Read& read_op,
+ const rgw_user& user_id,
+ const rgw_obj& dest_obj,
+ ceph::real_time *mtime);
+
+  // Controls how object attributes are combined during copy/fetch
+  // operations; exact semantics are spelled out on copy_obj() below.
+  enum AttrsMod {
+    ATTRSMOD_NONE = 0,    // copy source attrs verbatim; attrs param ignored
+    ATTRSMOD_REPLACE = 1, // use only the provided attrs; source attrs dropped
+    ATTRSMOD_MERGE = 2    // merge; provided attrs win on conflicting keys
+  };
+
+ D3nDataCache* d3n_data_cache{nullptr};
+
+ int rewrite_obj(RGWBucketInfo& dest_bucket_info, const rgw_obj& obj, const DoutPrefixProvider *dpp, optional_yield y);
+ int reindex_obj(const RGWBucketInfo& dest_bucket_info,
+ const rgw_obj& obj,
+ const DoutPrefixProvider* dpp,
+ optional_yield y);
+
+ int stat_remote_obj(const DoutPrefixProvider *dpp,
+ RGWObjectCtx& obj_ctx,
+ const rgw_user& user_id,
+ req_info *info,
+ const rgw_zone_id& source_zone,
+ const rgw_obj& src_obj,
+ const RGWBucketInfo *src_bucket_info,
+ real_time *src_mtime,
+ uint64_t *psize,
+ const real_time *mod_ptr,
+ const real_time *unmod_ptr,
+ bool high_precision_time,
+ const char *if_match,
+ const char *if_nomatch,
+ std::map<std::string, bufferlist> *pattrs,
+ std::map<std::string, std::string> *pheaders,
+ std::string *version_id,
+ std::string *ptag,
+ std::string *petag);
+
+  /**
+   * Fetch an object from a remote zone and write it into dest_obj.
+   * Attribute handling follows attrs_mod (see copy_obj() documentation
+   * below).  Returns 0 on success, -ERR# otherwise.
+   */
+  int fetch_remote_obj(RGWObjectCtx& obj_ctx,
+                       const rgw_user& user_id,
+                       req_info *info,
+                       const rgw_zone_id& source_zone,
+                       const rgw_obj& dest_obj,
+                       const rgw_obj& src_obj,
+                       RGWBucketInfo& dest_bucket_info,
+                       RGWBucketInfo *src_bucket_info,
+                       std::optional<rgw_placement_rule> dest_placement,
+                       ceph::real_time *src_mtime,
+                       ceph::real_time *mtime,
+                       const ceph::real_time *mod_ptr,
+                       const ceph::real_time *unmod_ptr,
+                       bool high_precision_time,
+                       const char *if_match,
+                       const char *if_nomatch,
+                       AttrsMod attrs_mod,
+                       bool copy_if_newer,
+                       rgw::sal::Attrs& attrs,
+                       RGWObjCategory category,
+                       std::optional<uint64_t> olh_epoch,
+                       ceph::real_time delete_at,
+                       std::string *ptag,
+                       std::string *petag,
+                       void (*progress_cb)(off_t, void *),
+                       void *progress_data,
+                       const DoutPrefixProvider *dpp,
+                       RGWFetchObjFilter *filter,
+                       const rgw_zone_set_entry& source_trace_entry,
+                       rgw_zone_set *zones_trace = nullptr,
+                       // default was the literal `0`; use nullptr for a
+                       // pointer default (same value, idiomatic C++)
+                       std::optional<uint64_t>* bytes_transferred = nullptr);
+ /**
+ * Copy an object.
+ * dest_obj: the object to copy into
+ * src_obj: the object to copy from
+ * attrs: usage depends on attrs_mod parameter
+ * attrs_mod: the modification mode of the attrs, may have the following values:
+ * ATTRSMOD_NONE - the attributes of the source object will be
+ * copied without modifications, attrs parameter is ignored;
+ * ATTRSMOD_REPLACE - new object will have the attributes provided by attrs
+ * parameter, source object attributes are not copied;
+ * ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes
+ * are overwritten by values contained in attrs parameter.
+ * Returns: 0 on success, -ERR# otherwise.
+ */
+ int copy_obj(RGWObjectCtx& obj_ctx,
+ const rgw_user& user_id,
+ req_info *info,
+ const rgw_zone_id& source_zone,
+ const rgw_obj& dest_obj,
+ const rgw_obj& src_obj,
+ RGWBucketInfo& dest_bucket_info,
+ RGWBucketInfo& src_bucket_info,
+ const rgw_placement_rule& dest_placement,
+ ceph::real_time *src_mtime,
+ ceph::real_time *mtime,
+ const ceph::real_time *mod_ptr,
+ const ceph::real_time *unmod_ptr,
+ bool high_precision_time,
+ const char *if_match,
+ const char *if_nomatch,
+ AttrsMod attrs_mod,
+ bool copy_if_newer,
+ std::map<std::string, bufferlist>& attrs,
+ RGWObjCategory category,
+ uint64_t olh_epoch,
+ ceph::real_time delete_at,
+ std::string *version_id,
+ std::string *ptag,
+ std::string *petag,
+ void (*progress_cb)(off_t, void *),
+ void *progress_data,
+ const DoutPrefixProvider *dpp,
+ optional_yield y);
+
+ int copy_obj_data(RGWObjectCtx& obj_ctx,
+ RGWBucketInfo& dest_bucket_info,
+ const rgw_placement_rule& dest_placement,
+ RGWRados::Object::Read& read_op, off_t end,
+ const rgw_obj& dest_obj,
+ ceph::real_time *mtime,
+ ceph::real_time set_mtime,
+ std::map<std::string, bufferlist>& attrs,
+ uint64_t olh_epoch,
+ ceph::real_time delete_at,
+ std::string *petag,
+ const DoutPrefixProvider *dpp,
+ optional_yield y);
+
+ int transition_obj(RGWObjectCtx& obj_ctx,
+ RGWBucketInfo& bucket_info,
+ const rgw_obj& obj,
+ const rgw_placement_rule& placement_rule,
+ const real_time& mtime,
+ uint64_t olh_epoch,
+ const DoutPrefixProvider *dpp,
+ optional_yield y);
+
+ int check_bucket_empty(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, optional_yield y);
+
+ /**
+ * Delete a bucket.
+ * bucket: the name of the bucket to delete
+ * Returns 0 on success, -ERR# otherwise.
+ */
+ int delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& objv_tracker, optional_yield y, const DoutPrefixProvider *dpp, bool check_empty = true);
+
+ void wakeup_meta_sync_shards(std::set<int>& shard_ids);
+
+ void wakeup_data_sync_shards(const DoutPrefixProvider *dpp, const rgw_zone_id& source_zone, bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >& entries);
+
+ RGWMetaSyncStatusManager* get_meta_sync_manager();
+ RGWDataSyncStatusManager* get_data_sync_manager(const rgw_zone_id& source_zone);
+
+ int set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner, const DoutPrefixProvider *dpp);
+ int set_buckets_enabled(std::vector<rgw_bucket>& buckets, bool enabled, const DoutPrefixProvider *dpp);
+ int bucket_suspended(const DoutPrefixProvider *dpp, rgw_bucket& bucket, bool *suspended);
+
+ /** Delete an object.*/
+ int delete_obj(const DoutPrefixProvider *dpp,
+ RGWObjectCtx& obj_ctx,
+ const RGWBucketInfo& bucket_info,
+ const rgw_obj& obj,
+ int versioning_status, // versioning flags defined in enum RGWBucketFlags
+ uint16_t bilog_flags = 0,
+ const ceph::real_time& expiration_time = ceph::real_time(),
+ rgw_zone_set *zones_trace = nullptr);
+
+ int delete_raw_obj(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj);
+
+ /** Remove an object from the bucket index */
+ int delete_obj_index(const rgw_obj& obj, ceph::real_time mtime,
+ const DoutPrefixProvider *dpp, optional_yield y);
+
+ /**
+ * Set an attr on an object.
+ * bucket: name of the bucket holding the object
+ * obj: name of the object to set the attr on
+ * name: the attr to set
+ * bl: the contents of the attr
+ * Returns: 0 on success, -ERR# otherwise.
+ */
+ int set_attr(const DoutPrefixProvider *dpp, RGWObjectCtx* ctx, RGWBucketInfo& bucket_info, const rgw_obj& obj, const char *name, bufferlist& bl);
+
+ int set_attrs(const DoutPrefixProvider *dpp, RGWObjectCtx* ctx, RGWBucketInfo& bucket_info, const rgw_obj& obj,
+ std::map<std::string, bufferlist>& attrs,
+ std::map<std::string, bufferlist>* rmattrs,
+ optional_yield y,
+ ceph::real_time set_mtime = ceph::real_clock::zero());
+
+ int get_obj_state(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWObjState **state, RGWObjManifest** manifest,
+ bool follow_olh, optional_yield y, bool assume_noent = false);
+  // Convenience overload: forwards to the full get_obj_state() with
+  // follow_olh=true (and the default assume_noent=false).
+  int get_obj_state(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWObjState **state, RGWObjManifest** manifest, optional_yield y) {
+    return get_obj_state(dpp, rctx, bucket_info, obj, state, manifest, true, y);
+  }
+
+ using iterate_obj_cb = int (*)(const DoutPrefixProvider*, const rgw_raw_obj&, off_t, off_t,
+ off_t, bool, RGWObjState*, void*);
+
+ int iterate_obj(const DoutPrefixProvider *dpp, RGWObjectCtx& ctx, RGWBucketInfo& bucket_info,
+ const rgw_obj& obj, off_t ofs, off_t end,
+ uint64_t max_chunk_size, iterate_obj_cb cb, void *arg,
+ optional_yield y);
+
+ int append_atomic_test(const DoutPrefixProvider *dpp, const RGWObjState* astate, librados::ObjectOperation& op);
+
+ virtual int get_obj_iterate_cb(const DoutPrefixProvider *dpp,
+ const rgw_raw_obj& read_obj, off_t obj_ofs,
+ off_t read_ofs, off_t len, bool is_head_obj,
+ RGWObjState *astate, void *arg);
+
+ /**
+ * a simple object read without keeping state
+ */
+
+ int raw_obj_stat(const DoutPrefixProvider *dpp,
+ rgw_raw_obj& obj, uint64_t *psize, ceph::real_time *pmtime, uint64_t *epoch,
+ std::map<std::string, bufferlist> *attrs, bufferlist *first_chunk,
+ RGWObjVersionTracker *objv_tracker, optional_yield y);
+
+ int obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::ObjectWriteOperation *op);
+ int obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::ObjectReadOperation *op);
+
+ int guard_reshard(const DoutPrefixProvider *dpp,
+ BucketShard *bs,
+ const rgw_obj& obj_instance,
+ RGWBucketInfo& bucket_info,
+ std::function<int(BucketShard *)> call);
+ int block_while_resharding(RGWRados::BucketShard *bs,
+ const rgw_obj& obj_instance,
+ RGWBucketInfo& bucket_info,
+ optional_yield y,
+ const DoutPrefixProvider *dpp);
+
+ void bucket_index_guard_olh_op(const DoutPrefixProvider *dpp, RGWObjState& olh_state, librados::ObjectOperation& op);
+ void olh_cancel_modification(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, const std::string& op_tag, optional_yield y);
+ int olh_init_modification(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, std::string *op_tag);
+ int olh_init_modification_impl(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, std::string *op_tag);
+ int bucket_index_link_olh(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info, RGWObjState& olh_state,
+ const rgw_obj& obj_instance, bool delete_marker,
+ const std::string& op_tag, struct rgw_bucket_dir_entry_meta *meta,
+ uint64_t olh_epoch,
+ ceph::real_time unmod_since, bool high_precision_time,
+ optional_yield y,
+ rgw_zone_set *zones_trace = nullptr,
+ bool log_data_change = false);
+ int bucket_index_unlink_instance(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info,
+ const rgw_obj& obj_instance,
+ const std::string& op_tag, const std::string& olh_tag,
+ uint64_t olh_epoch, rgw_zone_set *zones_trace = nullptr);
+ int bucket_index_read_olh_log(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info, RGWObjState& state,
+ const rgw_obj& obj_instance, uint64_t ver_marker,
+ std::map<uint64_t, std::vector<rgw_bucket_olh_log_entry> > *log, bool *is_truncated);
+ int bucket_index_trim_olh_log(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjState& obj_state, const rgw_obj& obj_instance, uint64_t ver);
+ int bucket_index_clear_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const std::string& olh_tag, const rgw_obj& obj_instance);
+ int apply_olh_log(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, RGWObjState& obj_state, RGWBucketInfo& bucket_info, const rgw_obj& obj,
+ bufferlist& obj_tag, std::map<uint64_t, std::vector<rgw_bucket_olh_log_entry> >& log,
+ uint64_t *plast_ver, rgw_zone_set *zones_trace = nullptr);
+ int update_olh(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, RGWObjState *state, RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_zone_set *zones_trace = nullptr);
+ int clear_olh(const DoutPrefixProvider *dpp,
+ RGWObjectCtx& obj_ctx,
+ const rgw_obj& obj,
+ RGWBucketInfo& bucket_info,
+ const std::string& tag,
+ const uint64_t ver,
+ optional_yield y);
+ int set_olh(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj, bool delete_marker, rgw_bucket_dir_entry_meta *meta,
+ uint64_t olh_epoch, ceph::real_time unmod_since, bool high_precision_time,
+ optional_yield y, rgw_zone_set *zones_trace = nullptr, bool log_data_change = false);
+ int repair_olh(const DoutPrefixProvider *dpp, RGWObjState* state, const RGWBucketInfo& bucket_info,
+ const rgw_obj& obj);
+ int unlink_obj_instance(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj,
+ uint64_t olh_epoch, optional_yield y, rgw_zone_set *zones_trace = nullptr);
+
+ void check_pending_olh_entries(const DoutPrefixProvider *dpp, std::map<std::string, bufferlist>& pending_entries, std::map<std::string, bufferlist> *rm_pending_entries);
+ int remove_olh_pending_entries(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, std::map<std::string, bufferlist>& pending_attrs);
+ int follow_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, RGWObjectCtx& ctx, RGWObjState *state, const rgw_obj& olh_obj, rgw_obj *target);
+ int get_olh(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWOLHInfo *olh);
+
+ void gen_rand_obj_instance_name(rgw_obj_key *target_key);
+ void gen_rand_obj_instance_name(rgw_obj *target);
+
+ int update_containers_stats(std::map<std::string, RGWBucketEnt>& m, const DoutPrefixProvider *dpp);
+ int append_async(const DoutPrefixProvider *dpp, rgw_raw_obj& obj, size_t size, bufferlist& bl);
+
+public:
+  // Flag `obj` for atomic access within the supplied object context.
+  // `ctx` must actually point at an RGWObjectCtx (passed as void*).
+  void set_atomic(void *ctx, const rgw_obj& obj) {
+    static_cast<RGWObjectCtx *>(ctx)->set_atomic(obj);
+  }
+  // Flag `obj` for data prefetch within the supplied object context.
+  // `ctx` must actually point at an RGWObjectCtx (passed as void*).
+  void set_prefetch_data(void *ctx, const rgw_obj& obj) {
+    static_cast<RGWObjectCtx *>(ctx)->set_prefetch_data(obj);
+  }
+  // Mark `obj` as compressed within the supplied object context.
+  // `ctx` must actually point at an RGWObjectCtx (passed as void*).
+  void set_compressed(void *ctx, const rgw_obj& obj) {
+    static_cast<RGWObjectCtx *>(ctx)->set_compressed(obj);
+  }
+ int decode_policy(const DoutPrefixProvider *dpp, bufferlist& bl, ACLOwner *owner);
+ int get_bucket_stats(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, std::string *bucket_ver, std::string *master_ver,
+ std::map<RGWObjCategory, RGWStorageStats>& stats, std::string *max_marker, bool* syncstopped = NULL);
+ int get_bucket_stats_async(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int shard_id, RGWGetBucketStats_CB *cb);
+
+ int put_bucket_instance_info(RGWBucketInfo& info, bool exclusive, ceph::real_time mtime, std::map<std::string, bufferlist> *pattrs, const DoutPrefixProvider *dpp, optional_yield y);
+ /* xxx dang obj_ctx -> svc */
+ int get_bucket_instance_info(const std::string& meta_key, RGWBucketInfo& info, ceph::real_time *pmtime, std::map<std::string, bufferlist> *pattrs, optional_yield y, const DoutPrefixProvider *dpp);
+ int get_bucket_instance_info(const rgw_bucket& bucket, RGWBucketInfo& info, ceph::real_time *pmtime, std::map<std::string, bufferlist> *pattrs, optional_yield y, const DoutPrefixProvider *dpp);
+
+ static void make_bucket_entry_name(const std::string& tenant_name, const std::string& bucket_name, std::string& bucket_entry);
+
+ int get_bucket_info(RGWServices *svc,
+ const std::string& tenant_name, const std::string& bucket_name,
+ RGWBucketInfo& info,
+ ceph::real_time *pmtime, optional_yield y,
+ const DoutPrefixProvider *dpp, std::map<std::string, bufferlist> *pattrs = NULL);
+
+ // Returns 0 on successful refresh. Returns error code if there was
+ // an error or the version stored on the OSD is the same as that
+ // presented in the BucketInfo structure.
+ //
+ int try_refresh_bucket_info(RGWBucketInfo& info,
+ ceph::real_time *pmtime,
+ const DoutPrefixProvider *dpp,
+ std::map<std::string, bufferlist> *pattrs = nullptr);
+
+ int put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, ceph::real_time mtime, obj_version *pep_objv,
+ std::map<std::string, bufferlist> *pattrs, bool create_entry_point,
+ const DoutPrefixProvider *dpp, optional_yield y);
+
+ int cls_obj_prepare_op(const DoutPrefixProvider *dpp, BucketShard& bs, RGWModifyOp op, std::string& tag, rgw_obj& obj, uint16_t bilog_flags, optional_yield y, rgw_zone_set *zones_trace = nullptr);
+ int cls_obj_complete_op(BucketShard& bs, const rgw_obj& obj, RGWModifyOp op, std::string& tag, int64_t pool, uint64_t epoch,
+ rgw_bucket_dir_entry& ent, RGWObjCategory category, std::list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr);
+ int cls_obj_complete_add(BucketShard& bs, const rgw_obj& obj, std::string& tag, int64_t pool, uint64_t epoch, rgw_bucket_dir_entry& ent,
+ RGWObjCategory category, std::list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr);
+ int cls_obj_complete_del(BucketShard& bs, std::string& tag, int64_t pool, uint64_t epoch, rgw_obj& obj,
+ ceph::real_time& removed_mtime, std::list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr);
+ int cls_obj_complete_cancel(BucketShard& bs, std::string& tag, rgw_obj& obj,
+ std::list<rgw_obj_index_key> *remove_objs,
+ uint16_t bilog_flags, rgw_zone_set *zones_trace = nullptr);
+ int cls_obj_set_bucket_tag_timeout(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, uint64_t timeout);
+
+ using ent_map_t =
+ boost::container::flat_map<std::string, rgw_bucket_dir_entry>;
+
+ int cls_bucket_list_ordered(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info,
+ const rgw::bucket_index_layout_generation& idx_layout,
+ const int shard_id,
+ const rgw_obj_index_key& start_after,
+ const std::string& prefix,
+ const std::string& delimiter,
+ const uint32_t num_entries,
+ const bool list_versions,
+ const uint16_t exp_factor, // 0 means ignore
+ ent_map_t& m,
+ bool* is_truncated,
+ bool* cls_filtered,
+ rgw_obj_index_key *last_entry,
+ optional_yield y,
+ RGWBucketListNameFilter force_check_filter = {});
+ int cls_bucket_list_unordered(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info,
+ const rgw::bucket_index_layout_generation& idx_layout,
+ int shard_id,
+ const rgw_obj_index_key& start_after,
+ const std::string& prefix,
+ uint32_t num_entries,
+ bool list_versions,
+ std::vector<rgw_bucket_dir_entry>& ent_list,
+ bool *is_truncated,
+ rgw_obj_index_key *last_entry,
+ optional_yield y,
+ RGWBucketListNameFilter force_check_filter = {});
+ int cls_bucket_head(const DoutPrefixProvider *dpp,
+ const RGWBucketInfo& bucket_info,
+ const rgw::bucket_index_layout_generation& idx_layout,
+ int shard_id, std::vector<rgw_bucket_dir_header>& headers,
+ std::map<int, std::string> *bucket_instance_ids = NULL);
+ int cls_bucket_head_async(const DoutPrefixProvider *dpp,
+ const RGWBucketInfo& bucket_info,
+ const rgw::bucket_index_layout_generation& idx_layout,
+ int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio);
+ int bi_get_instance(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_bucket_dir_entry *dirent);
+ int bi_get_olh(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_bucket_olh_entry *olh);
+ int bi_get(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, BIIndexType index_type, rgw_cls_bi_entry *entry);
+ void bi_put(librados::ObjectWriteOperation& op, BucketShard& bs, rgw_cls_bi_entry& entry);
+ int bi_put(BucketShard& bs, rgw_cls_bi_entry& entry);
+ int bi_put(const DoutPrefixProvider *dpp, rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry);
+ int bi_list(const DoutPrefixProvider *dpp,
+ const RGWBucketInfo& bucket_info,
+ int shard_id,
+ const std::string& filter_obj,
+ const std::string& marker,
+ uint32_t max,
+ std::list<rgw_cls_bi_entry> *entries,
+ bool *is_truncated);
+ int bi_list(BucketShard& bs, const std::string& filter_obj, const std::string& marker, uint32_t max, std::list<rgw_cls_bi_entry> *entries, bool *is_truncated);
+ int bi_list(const DoutPrefixProvider *dpp, rgw_bucket& bucket, const std::string& obj_name, const std::string& marker, uint32_t max,
+ std::list<rgw_cls_bi_entry> *entries, bool *is_truncated);
+ int bi_remove(const DoutPrefixProvider *dpp, BucketShard& bs);
+
+ int cls_obj_usage_log_add(const DoutPrefixProvider *dpp, const std::string& oid, rgw_usage_log_info& info);
+ int cls_obj_usage_log_read(const DoutPrefixProvider *dpp, const std::string& oid, const std::string& user, const std::string& bucket, uint64_t start_epoch,
+ uint64_t end_epoch, uint32_t max_entries, std::string& read_iter,
+ std::map<rgw_user_bucket, rgw_usage_log_entry>& usage, bool *is_truncated);
+ int cls_obj_usage_log_trim(const DoutPrefixProvider *dpp, const std::string& oid, const std::string& user, const std::string& bucket, uint64_t start_epoch,
+ uint64_t end_epoch);
+ int cls_obj_usage_log_clear(const DoutPrefixProvider *dpp, std::string& oid);
+
+ int get_target_shard_id(const rgw::bucket_index_normal_layout& layout, const std::string& obj_key, int *shard_id);
+
+ int lock_exclusive(const rgw_pool& pool, const std::string& oid, ceph::timespan& duration, rgw_zone_id& zone_id, std::string& owner_id);
+ int unlock(const rgw_pool& pool, const std::string& oid, rgw_zone_id& zone_id, std::string& owner_id);
+
+ void update_gc_chain(const DoutPrefixProvider *dpp, rgw_obj head_obj, RGWObjManifest& manifest, cls_rgw_obj_chain *chain);
+ std::tuple<int, std::optional<cls_rgw_obj_chain>> send_chain_to_gc(cls_rgw_obj_chain& chain, const std::string& tag);
+ void delete_objs_inline(const DoutPrefixProvider *dpp, cls_rgw_obj_chain& chain, const std::string& tag);
+ int gc_operate(const DoutPrefixProvider *dpp, std::string& oid, librados::ObjectWriteOperation *op);
+ int gc_aio_operate(const std::string& oid, librados::AioCompletion *c,
+ librados::ObjectWriteOperation *op);
+ int gc_operate(const DoutPrefixProvider *dpp, std::string& oid, librados::ObjectReadOperation *op, bufferlist *pbl);
+
+ int list_gc_objs(int *index, std::string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated, bool& processing_queue);
+ int process_gc(bool expired_only);
+ bool process_expire_objects(const DoutPrefixProvider *dpp);
+ int defer_gc(const DoutPrefixProvider *dpp, RGWObjectCtx* ctx, RGWBucketInfo& bucket_info, const rgw_obj& obj, optional_yield y);
+
+ int process_lc(const std::unique_ptr<rgw::sal::Bucket>& optional_bucket);
+ int list_lc_progress(std::string& marker, uint32_t max_entries,
+ std::vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>>& progress_map,
+ int& index);
+
+ int bucket_check_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info,
+ std::map<RGWObjCategory, RGWStorageStats> *existing_stats,
+ std::map<RGWObjCategory, RGWStorageStats> *calculated_stats);
+ int bucket_rebuild_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info);
+
+ // Search the bucket for encrypted multipart uploads, and increase their mtime
+ // slightly to generate a bilog entry to trigger a resync to repair any
+ // corrupted replicas. See https://tracker.ceph.com/issues/46062
+ int bucket_resync_encrypted_multipart(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ rgw::sal::RadosStore* driver,
+ RGWBucketInfo& bucket_info,
+ const std::string& marker,
+ RGWFormatterFlusher& flusher);
+
+ int bucket_set_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry);
+ int remove_objs_from_index(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info,
+ const std::list<rgw_obj_index_key>& oid_list);
+ int move_rados_obj(const DoutPrefixProvider *dpp,
+ librados::IoCtx& src_ioctx,
+ const std::string& src_oid, const std::string& src_locator,
+ librados::IoCtx& dst_ioctx,
+ const std::string& dst_oid, const std::string& dst_locator);
+ int fix_head_obj_locator(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, bool copy_obj, bool remove_bad, rgw_obj_key& key);
+ int fix_tail_obj_locator(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info,
+ rgw_obj_key& key, bool fix, bool *need_fix, optional_yield y);
+
+ int check_quota(const DoutPrefixProvider *dpp, const rgw_user& bucket_owner, rgw_bucket& bucket,
+ RGWQuota& quota, uint64_t obj_size,
+ optional_yield y, bool check_size_only = false);
+
+ int check_bucket_shards(const RGWBucketInfo& bucket_info, const rgw_bucket& bucket,
+ uint64_t num_objs, const DoutPrefixProvider *dpp);
+
+ int add_bucket_to_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, uint32_t new_num_shards);
+
+ uint64_t instance_id();
+
+ librados::Rados* get_rados_handle();
+
+ int delete_raw_obj_aio(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, std::list<librados::AioCompletion *>& handles);
+ int delete_obj_aio(const DoutPrefixProvider *dpp, const rgw_obj& obj, RGWBucketInfo& info, RGWObjState *astate,
+ std::list<librados::AioCompletion *>& handles, bool keep_index_consistent,
+ optional_yield y);
+
+ private:
+ /**
+ * Check the actual on-disk state of the object specified
+ * by list_state, and fill in the time and size of object.
+ * Then append any changes to suggested_updates for
+ * the rgw class' dir_suggest_changes function.
+ *
+ * Note that this can maul list_state; don't use it afterwards. Also
+ * it expects object to already be filled in from list_state; it only
+ * sets the size and mtime.
+ *
+ * Returns 0 on success, -ENOENT if the object doesn't exist on disk,
+ * and -errno on other failures. (-ENOENT is not a failure, and it
+ * will encode that info as a suggested update.)
+ */
+ int check_disk_state(const DoutPrefixProvider *dpp,
+ librados::IoCtx io_ctx,
+ RGWBucketInfo& bucket_info,
+ rgw_bucket_dir_entry& list_state,
+ rgw_bucket_dir_entry& object,
+ bufferlist& suggested_updates,
+ optional_yield y);
+
+ /**
+ * Init pool iteration
+ * pool: pool to use for the ctx initialization
+ * ctx: context object to use for the iteration
+ * Returns: 0 on success, -ERR# otherwise.
+ */
+ int pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, RGWPoolIterCtx& ctx);
+
+ /**
+ * Init pool iteration
+ * pool: pool to use
+ * cursor: position to start iteration
+ * ctx: context object to use for the iteration
+ * Returns: 0 on success, -ERR# otherwise.
+ */
+ int pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, const std::string& cursor, RGWPoolIterCtx& ctx);
+
+ /**
+ * Get pool iteration position
+ * ctx: context object to use for the iteration
+ * Returns: std::string representation of position
+ */
+ std::string pool_iterate_get_cursor(RGWPoolIterCtx& ctx);
+
+ /**
+ * Iterate over pool return object names, use optional filter
+ * ctx: iteration context, initialized with pool_iterate_begin()
+ * num: max number of objects to return
+ * objs: a vector that the results will append into
+   * is_truncated: if not NULL, will hold true iff the listing was truncated
+   *               (i.e. more objects remain to iterate over)
+ * filter: if not NULL, will be used to filter returned objects
+ * Returns: 0 on success, -ERR# otherwise.
+ */
+ int pool_iterate(const DoutPrefixProvider *dpp, RGWPoolIterCtx& ctx, uint32_t num,
+ std::vector<rgw_bucket_dir_entry>& objs,
+ bool *is_truncated, RGWAccessListFilter *filter);
+
+ uint64_t next_bucket_id();
+
+ /**
+ * This is broken out to facilitate unit testing.
+ */
+ static uint32_t calc_ordered_bucket_list_per_shard(uint32_t num_entries,
+ uint32_t num_shards);
+};
+
+
+// Shared state for the asynchronous rados reads that service a single
+// object GET: reads are issued through `aio` and their results are
+// flushed to `client_cb` in offset order.
+struct get_obj_data {
+  RGWRados* rgwrados;
+  RGWGetDataCB* client_cb = nullptr; // receives the data destined for the client
+  rgw::Aio* aio;                     // issues and collects the async reads
+  uint64_t offset; // next offset to write to client
+  rgw::AioResultList completed; // completed read results, sorted by offset
+  optional_yield yield;
+
+  get_obj_data(RGWRados* rgwrados, RGWGetDataCB* cb, rgw::Aio* aio,
+               uint64_t offset, optional_yield yield)
+    : rgwrados(rgwrados), client_cb(cb), aio(aio), offset(offset), yield(yield) {}
+  ~get_obj_data() {
+    if (rgwrados->get_use_datacache()) {
+      // Briefly acquire and release the D3N cache lock.  NOTE(review):
+      // presumably this is a barrier so that any in-flight cache callback
+      // still holding d3n_lock finishes before this state is destroyed --
+      // confirm against the D3N data cache implementation.
+      const std::lock_guard l(d3n_get_data.d3n_lock);
+    }
+  }
+
+  D3nGetObjData d3n_get_data;                     // per-request D3N cache state
+  std::atomic_bool d3n_bypass_cache_write{false}; // skip cache writes for this request
+
+  // Deliver a batch of completed reads toward the client; a negative
+  // return aborts the request.
+  int flush(rgw::AioResultList&& results);
+
+  void cancel() {
+    // wait for all completions to drain and ignore the results
+    aio->drain();
+  }
+
+  // Wait for every outstanding read and flush each completed batch.  On a
+  // flush error, cancel the remaining i/o and return that error; the final
+  // flush() call handles anything still buffered once `aio` is empty.
+  int drain() {
+    auto c = aio->wait();
+    while (!c.empty()) {
+      int r = flush(std::move(c));
+      if (r < 0) {
+        cancel();
+        return r;
+      }
+      c = aio->wait();
+    }
+    return flush(std::move(c));
+  }
+};
diff --git a/src/rgw/driver/rados/rgw_reshard.cc b/src/rgw/driver/rados/rgw_reshard.cc
new file mode 100644
index 000000000..2abf02908
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_reshard.cc
@@ -0,0 +1,1419 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <limits>
+#include <sstream>
+
+#include "rgw_zone.h"
+#include "driver/rados/rgw_bucket.h"
+#include "rgw_reshard.h"
+#include "rgw_sal.h"
+#include "rgw_sal_rados.h"
+#include "cls/rgw/cls_rgw_client.h"
+#include "cls/lock/cls_lock_client.h"
+#include "common/errno.h"
+#include "common/ceph_json.h"
+
+#include "common/dout.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_sys_obj.h"
+#include "services/svc_tier_rados.h"
+#include "services/svc_bilog_rados.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
// object-name prefix for the reshard queue logshard objects
const string reshard_oid_prefix = "reshard.";
// rados lock name used while a logshard is being processed
const string reshard_lock_name = "reshard_process";
// rados lock name used when locking a bucket instance
const string bucket_instance_lock_name = "bucket_instance_lock";
+
/* All primes up to 2000 used to attempt to make dynamic sharding use
 * a prime numbers of shards. Note: this list also includes 1 for when
 * 1 shard is the most appropriate, even though 1 is not prime.
 * Using a prime shard count spreads hashed keys more evenly across
 * shards.
 */
const std::initializer_list<uint16_t> RGWBucketReshard::reshard_primes = {
  1, 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61,
  67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137,
  139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211,
  223, 227, 229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281, 283,
  293, 307, 311, 313, 317, 331, 337, 347, 349, 353, 359, 367, 373, 379,
  383, 389, 397, 401, 409, 419, 421, 431, 433, 439, 443, 449, 457, 461,
  463, 467, 479, 487, 491, 499, 503, 509, 521, 523, 541, 547, 557, 563,
  569, 571, 577, 587, 593, 599, 601, 607, 613, 617, 619, 631, 641, 643,
  647, 653, 659, 661, 673, 677, 683, 691, 701, 709, 719, 727, 733, 739,
  743, 751, 757, 761, 769, 773, 787, 797, 809, 811, 821, 823, 827, 829,
  839, 853, 857, 859, 863, 877, 881, 883, 887, 907, 911, 919, 929, 937,
  941, 947, 953, 967, 971, 977, 983, 991, 997, 1009, 1013, 1019, 1021,
  1031, 1033, 1039, 1049, 1051, 1061, 1063, 1069, 1087, 1091, 1093,
  1097, 1103, 1109, 1117, 1123, 1129, 1151, 1153, 1163, 1171, 1181,
  1187, 1193, 1201, 1213, 1217, 1223, 1229, 1231, 1237, 1249, 1259,
  1277, 1279, 1283, 1289, 1291, 1297, 1301, 1303, 1307, 1319, 1321,
  1327, 1361, 1367, 1373, 1381, 1399, 1409, 1423, 1427, 1429, 1433,
  1439, 1447, 1451, 1453, 1459, 1471, 1481, 1483, 1487, 1489, 1493,
  1499, 1511, 1523, 1531, 1543, 1549, 1553, 1559, 1567, 1571, 1579,
  1583, 1597, 1601, 1607, 1609, 1613, 1619, 1621, 1627, 1637, 1657,
  1663, 1667, 1669, 1693, 1697, 1699, 1709, 1721, 1723, 1733, 1741,
  1747, 1753, 1759, 1777, 1783, 1787, 1789, 1801, 1811, 1823, 1831,
  1847, 1861, 1867, 1871, 1873, 1877, 1879, 1889, 1901, 1907, 1913,
  1931, 1933, 1949, 1951, 1973, 1979, 1987, 1993, 1997, 1999
};
+
// Accumulates bucket-index entries destined for one target shard and
// writes them out in batches via async rados operations. The AIO
// completion deque is shared across all shards (owned by the manager)
// so the in-flight limit is global.
class BucketReshardShard {
  rgw::sal::RadosStore* store;
  const RGWBucketInfo& bucket_info;
  int shard_id;
  RGWRados::BucketShard bs;
  vector<rgw_cls_bi_entry> entries; // batched entries not yet written
  map<RGWObjCategory, rgw_bucket_category_stats> stats; // batched stat deltas
  deque<librados::AioCompletion *>& aio_completions; // shared, in-flight ops
  uint64_t max_aio_completions;      // cap on outstanding AIOs
  uint64_t reshard_shard_batch_size; // entries per flush

  // pop the oldest completion, block until it finishes, and return its result
  int wait_next_completion() {
    librados::AioCompletion *c = aio_completions.front();
    aio_completions.pop_front();

    c->wait_for_complete();

    int ret = c->get_return_value();
    c->release();

    if (ret < 0) {
      derr << "ERROR: reshard rados operation failed: " << cpp_strerror(-ret) << dendl;
      return ret;
    }

    return 0;
  }

  // allocate a new completion, first draining one if we're at the limit
  int get_completion(librados::AioCompletion **c) {
    if (aio_completions.size() >= max_aio_completions) {
      int ret = wait_next_completion();
      if (ret < 0) {
        return ret;
      }
    }

    *c = librados::Rados::aio_create_completion(nullptr, nullptr);
    aio_completions.push_back(*c);

    return 0;
  }

public:
  BucketReshardShard(const DoutPrefixProvider *dpp,
		     rgw::sal::RadosStore *_store, const RGWBucketInfo& _bucket_info,
                     const rgw::bucket_index_layout_generation& index,
                     int shard_id, deque<librados::AioCompletion *>& _completions) :
    store(_store), bucket_info(_bucket_info), shard_id(shard_id),
    bs(store->getRados()), aio_completions(_completions)
  {
    // NOTE(review): bs.init() return value is ignored here; a failure
    // would surface later on the first aio_operate -- confirm intended
    bs.init(dpp, bucket_info, index, shard_id);

    max_aio_completions =
      store->ctx()->_conf.get_val<uint64_t>("rgw_reshard_max_aio");
    reshard_shard_batch_size =
      store->ctx()->_conf.get_val<uint64_t>("rgw_reshard_batch_size");
  }

  int get_shard_id() const {
    return shard_id;
  }

  // queue one entry (and, if account, its stat deltas); flushes
  // automatically once the batch size is reached
  int add_entry(rgw_cls_bi_entry& entry, bool account, RGWObjCategory category,
                const rgw_bucket_category_stats& entry_stats) {
    entries.push_back(entry);
    if (account) {
      rgw_bucket_category_stats& target = stats[category];
      target.num_entries += entry_stats.num_entries;
      target.total_size += entry_stats.total_size;
      target.total_size_rounded += entry_stats.total_size_rounded;
      target.actual_size += entry_stats.actual_size;
    }
    if (entries.size() >= reshard_shard_batch_size) {
      int ret = flush();
      if (ret < 0) {
        return ret;
      }
    }

    return 0;
  }

  // write all batched entries + stat deltas to the target shard object
  // as a single async write op; clears the batch on successful submit
  int flush() {
    if (entries.size() == 0) {
      return 0;
    }

    librados::ObjectWriteOperation op;
    for (auto& entry : entries) {
      store->getRados()->bi_put(op, bs, entry);
    }
    cls_rgw_bucket_update_stats(op, false, stats);

    librados::AioCompletion *c;
    int ret = get_completion(&c);
    if (ret < 0) {
      return ret;
    }
    ret = bs.bucket_obj.aio_operate(c, &op);
    if (ret < 0) {
      derr << "ERROR: failed to store entries in target bucket shard (bs=" << bs.bucket << "/" << bs.shard_id << ") error=" << cpp_strerror(-ret) << dendl;
      return ret;
    }
    entries.clear();
    stats.clear();
    return 0;
  }

  // drain every outstanding completion; returns the last error seen
  // (keeps draining even after an error)
  int wait_all_aio() {
    int ret = 0;
    while (!aio_completions.empty()) {
      int r = wait_next_completion();
      if (r < 0) {
        ret = r;
      }
    }
    return ret;
  }
}; // class BucketReshardShard
+
+
// Owns one BucketReshardShard per target shard plus the shared AIO
// completion deque; routes entries to shards and finalizes all writes.
class BucketReshardManager {
  rgw::sal::RadosStore *store;
  deque<librados::AioCompletion *> completions; // shared by all target shards
  vector<BucketReshardShard> target_shards;

public:
  BucketReshardManager(const DoutPrefixProvider *dpp,
		       rgw::sal::RadosStore *_store,
		       const RGWBucketInfo& bucket_info,
		       const rgw::bucket_index_layout_generation& target)
    : store(_store)
  {
    const uint32_t num_shards = rgw::num_shards(target.layout.normal);
    target_shards.reserve(num_shards);
    for (uint32_t i = 0; i < num_shards; ++i) {
      target_shards.emplace_back(dpp, store, bucket_info, target, i, completions);
    }
  }

  // safety net: drain any AIO still outstanding if finish() wasn't
  // called (or failed partway); errors are only logged
  ~BucketReshardManager() {
    for (auto& shard : target_shards) {
      int ret = shard.wait_all_aio();
      if (ret < 0) {
        ldout(store->ctx(), 20) << __func__ <<
          ": shard->wait_all_aio() returned ret=" << ret << dendl;
      }
    }
  }

  // forward an entry to its target shard; shard_index must be within
  // [0, num_shards) -- callers are responsible for the bounds
  int add_entry(int shard_index,
                rgw_cls_bi_entry& entry, bool account, RGWObjCategory category,
                const rgw_bucket_category_stats& entry_stats) {
    int ret = target_shards[shard_index].add_entry(entry, account, category,
                                                   entry_stats);
    if (ret < 0) {
      derr << "ERROR: target_shards.add_entry(" << entry.idx <<
        ") returned error: " << cpp_strerror(-ret) << dendl;
      return ret;
    }

    return 0;
  }

  // flush every shard's remaining batch, then wait for all AIO;
  // returns the last error seen but always completes both passes
  int finish() {
    int ret = 0;
    for (auto& shard : target_shards) {
      int r = shard.flush();
      if (r < 0) {
        derr << "ERROR: target_shards[" << shard.get_shard_id() << "].flush() returned error: " << cpp_strerror(-r) << dendl;
        ret = r;
      }
    }
    for (auto& shard : target_shards) {
      int r = shard.wait_all_aio();
      if (r < 0) {
        derr << "ERROR: target_shards[" << shard.get_shard_id() << "].wait_all_aio() returned error: " << cpp_strerror(-r) << dendl;
        ret = r;
      }
    }
    target_shards.clear();
    return ret;
  }
}; // class BucketReshardManager
+
// Construct a resharder for one bucket. reshard_lock is created as an
// ephemeral lock (third arg true); outer_reshard_lock, if non-null, is
// a logshard-level lock the caller holds and we must keep renewed.
RGWBucketReshard::RGWBucketReshard(rgw::sal::RadosStore* _store,
				   const RGWBucketInfo& _bucket_info,
				   const std::map<std::string, bufferlist>& _bucket_attrs,
				   RGWBucketReshardLock* _outer_reshard_lock) :
  store(_store), bucket_info(_bucket_info), bucket_attrs(_bucket_attrs),
  reshard_lock(store, bucket_info, true),
  outer_reshard_lock(_outer_reshard_lock)
{ }
+
+// sets reshard status of bucket index shards for the current index layout
+static int set_resharding_status(const DoutPrefixProvider *dpp,
+ rgw::sal::RadosStore* store,
+ const RGWBucketInfo& bucket_info,
+ cls_rgw_reshard_status status)
+{
+ cls_rgw_bucket_instance_entry instance_entry;
+ instance_entry.set_status(status);
+
+ int ret = store->getRados()->bucket_set_reshard(dpp, bucket_info, instance_entry);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "RGWReshard::" << __func__ << " ERROR: error setting bucket resharding flag on bucket index: "
+ << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+ return 0;
+}
+
+static int remove_old_reshard_instance(rgw::sal::RadosStore* store,
+ const rgw_bucket& bucket,
+ const DoutPrefixProvider* dpp)
+{
+ RGWBucketInfo info;
+ int r = store->getRados()->get_bucket_instance_info(bucket, info, nullptr,
+ nullptr, null_yield, dpp);
+ if (r < 0) {
+ return r;
+ }
+
+ // delete its shard objects (ignore errors)
+ store->svc()->bi->clean_index(dpp, info, info.layout.current_index);
+ // delete the bucket instance metadata
+ return store->ctl()->bucket->remove_bucket_instance_info(bucket, info, null_yield, dpp);
+}
+
+// initialize the new bucket index shard objects
+static int init_target_index(rgw::sal::RadosStore* store,
+ RGWBucketInfo& bucket_info,
+ const rgw::bucket_index_layout_generation& index,
+ const DoutPrefixProvider* dpp)
+{
+ int ret = store->svc()->bi->init_index(dpp, bucket_info, index);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to initialize "
+ "target index shard objects: " << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ if (!bucket_info.datasync_flag_enabled()) {
+ // if bucket sync is disabled, disable it on each of the new shards too
+ auto log = rgw::log_layout_from_index(0, index);
+ ret = store->svc()->bilog_rados->log_stop(dpp, bucket_info, log, -1);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to disable "
+ "bucket sync on the target index shard objects: "
+ << cpp_strerror(ret) << dendl;
+ store->svc()->bi->clean_index(dpp, bucket_info, index);
+ return ret;
+ }
+ }
+
+ return ret;
+}
+
// initialize a target index layout, create its bucket index shard objects, and
// write the target layout to the bucket instance metadata
static int init_target_layout(rgw::sal::RadosStore* store,
                              RGWBucketInfo& bucket_info,
			      std::map<std::string, bufferlist>& bucket_attrs,
                              ReshardFaultInjector& fault,
                              uint32_t new_num_shards,
                              const DoutPrefixProvider* dpp)
{
  auto prev = bucket_info.layout; // make a copy for cleanup
  const auto current = prev.current_index;

  // initialize a new normal target index layout generation
  rgw::bucket_index_layout_generation target;
  target.layout.type = rgw::BucketIndexType::Normal;
  target.layout.normal.num_shards = new_num_shards;
  target.gen = current.gen + 1;

  if (bucket_info.reshard_status == cls_rgw_reshard_status::IN_PROGRESS) {
    // backward-compatible cleanup of old reshards, where the target was in a
    // different bucket instance
    if (!bucket_info.new_bucket_instance_id.empty()) {
      rgw_bucket new_bucket = bucket_info.bucket;
      new_bucket.bucket_id = bucket_info.new_bucket_instance_id;
      ldout(store->ctx(), 10) << __func__ << " removing target bucket instance "
          "from a previous reshard attempt" << dendl;
      // ignore errors
      remove_old_reshard_instance(store, new_bucket, dpp);
    }
    bucket_info.reshard_status = cls_rgw_reshard_status::NOT_RESHARDING;
  }

  if (bucket_info.layout.target_index) {
    // a previous reshard failed or stalled, and its reshard lock dropped
    ldpp_dout(dpp, 10) << __func__ << " removing existing target index "
        "objects from a previous reshard attempt" << dendl;
    // delete its existing shard objects (ignore errors)
    store->svc()->bi->clean_index(dpp, bucket_info, *bucket_info.layout.target_index);
    // don't reuse this same generation in the new target layout, in case
    // something is still trying to operate on its shard objects
    target.gen = bucket_info.layout.target_index->gen + 1;
  }

  // create the index shard objects
  int ret = init_target_index(store, bucket_info, target, dpp);
  if (ret < 0) {
    return ret;
  }

  // retry in case of racing writes to the bucket instance metadata
  static constexpr auto max_retries = 10;
  int tries = 0;
  do {
    // update resharding state
    bucket_info.layout.target_index = target;
    bucket_info.layout.resharding = rgw::BucketReshardState::InProgress;

    if (ret = fault.check("set_target_layout");
        ret == 0) { // no fault injected, write the bucket instance metadata
      ret = store->getRados()->put_bucket_instance_info(bucket_info, false,
                                                        real_time(), &bucket_attrs, dpp, null_yield);
    } else if (ret == -ECANCELED) {
      fault.clear(); // clear the fault so a retry can succeed
    }

    if (ret == -ECANCELED) {
      // racing write detected, read the latest bucket info and try again
      int ret2 = store->getRados()->get_bucket_instance_info(
          bucket_info.bucket, bucket_info,
          nullptr, &bucket_attrs, null_yield, dpp);
      if (ret2 < 0) {
        ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to read "
            "bucket info: " << cpp_strerror(ret2) << dendl;
        ret = ret2;
        break;
      }

      // check that we're still in the reshard state we started in
      if (bucket_info.layout.resharding != rgw::BucketReshardState::None ||
          bucket_info.layout.current_index != current) {
        ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " raced with "
            "another reshard" << dendl;
        break;
      }

      prev = bucket_info.layout; // update the copy
    }
    ++tries;
  } while (ret == -ECANCELED && tries < max_retries);

  if (ret < 0) {
    ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to write "
        "target index layout to bucket info: " << cpp_strerror(ret) << dendl;

    bucket_info.layout = std::move(prev); // restore in-memory layout

    // delete the target shard objects (ignore errors)
    store->svc()->bi->clean_index(dpp, bucket_info, target);
    return ret;
  }
  return 0;
} // init_target_layout
+
// delete the bucket index shards associated with the target layout and remove
// it from the bucket instance metadata
static int revert_target_layout(rgw::sal::RadosStore* store,
                                RGWBucketInfo& bucket_info,
				std::map<std::string, bufferlist>& bucket_attrs,
                                ReshardFaultInjector& fault,
                                const DoutPrefixProvider* dpp)
{
  auto prev = bucket_info.layout; // make a copy for cleanup

  // remove target index shard objects
  int ret = store->svc()->bi->clean_index(dpp, bucket_info, *prev.target_index);
  if (ret < 0) {
    ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " failed to remove "
        "target index with: " << cpp_strerror(ret) << dendl;
    ret = 0; // non-fatal error
  }

  // retry in case of racing writes to the bucket instance metadata
  static constexpr auto max_retries = 10;
  int tries = 0;
  do {
    // clear target_index and resharding state
    bucket_info.layout.target_index = std::nullopt;
    bucket_info.layout.resharding = rgw::BucketReshardState::None;

    if (ret = fault.check("revert_target_layout");
        ret == 0) { // no fault injected, revert the bucket instance metadata
      ret = store->getRados()->put_bucket_instance_info(bucket_info, false,
                                                        real_time(),
                                                        &bucket_attrs, dpp, null_yield);
    } else if (ret == -ECANCELED) {
      fault.clear(); // clear the fault so a retry can succeed
    }

    if (ret == -ECANCELED) {
      // racing write detected, read the latest bucket info and try again
      int ret2 = store->getRados()->get_bucket_instance_info(
          bucket_info.bucket, bucket_info,
          nullptr, &bucket_attrs, null_yield, dpp);
      if (ret2 < 0) {
        ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to read "
            "bucket info: " << cpp_strerror(ret2) << dendl;
        ret = ret2;
        break;
      }

      // check that we're still in the reshard state we started in;
      // if another writer already cleared it, they own the cleanup
      if (bucket_info.layout.resharding == rgw::BucketReshardState::None) {
        ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " raced with "
            "reshard cancel" << dendl;
        return -ECANCELED;
      }
      if (bucket_info.layout.current_index != prev.current_index ||
          bucket_info.layout.target_index != prev.target_index) {
        ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " raced with "
            "another reshard" << dendl;
        return -ECANCELED;
      }

      prev = bucket_info.layout; // update the copy
    }
    ++tries;
  } while (ret == -ECANCELED && tries < max_retries);

  if (ret < 0) {
    ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to clear "
        "target index layout in bucket info: " << cpp_strerror(ret) << dendl;

    bucket_info.layout = std::move(prev); // restore in-memory layout
    return ret;
  }
  return 0;
} // remove_target_layout
+
+static int init_reshard(rgw::sal::RadosStore* store,
+ RGWBucketInfo& bucket_info,
+ std::map<std::string, bufferlist>& bucket_attrs,
+ ReshardFaultInjector& fault,
+ uint32_t new_num_shards,
+ const DoutPrefixProvider *dpp)
+{
+ if (new_num_shards == 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " got invalid new_num_shards=0" << dendl;
+ return -EINVAL;
+ }
+
+ int ret = init_target_layout(store, bucket_info, bucket_attrs, fault, new_num_shards, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (ret = fault.check("block_writes");
+ ret == 0) { // no fault injected, block writes to the current index shards
+ ret = set_resharding_status(dpp, store, bucket_info,
+ cls_rgw_reshard_status::IN_PROGRESS);
+ }
+
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to pause "
+ "writes to the current index: " << cpp_strerror(ret) << dendl;
+ // clean up the target layout (ignore errors)
+ revert_target_layout(store, bucket_info, bucket_attrs, fault, dpp);
+ return ret;
+ }
+ return 0;
+} // init_reshard
+
+static int cancel_reshard(rgw::sal::RadosStore* store,
+ RGWBucketInfo& bucket_info,
+ std::map<std::string, bufferlist>& bucket_attrs,
+ ReshardFaultInjector& fault,
+ const DoutPrefixProvider *dpp)
+{
+ // unblock writes to the current index shard objects
+ int ret = set_resharding_status(dpp, store, bucket_info,
+ cls_rgw_reshard_status::NOT_RESHARDING);
+ if (ret < 0) {
+ ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " failed to unblock "
+ "writes to current index objects: " << cpp_strerror(ret) << dendl;
+ ret = 0; // non-fatal error
+ }
+
+ if (bucket_info.layout.target_index) {
+ return revert_target_layout(store, bucket_info, bucket_attrs, fault, dpp);
+ }
+ // there is nothing to revert
+ return 0;
+} // cancel_reshard
+
// Promote the target index layout to current in the in-memory
// bucket_info, append the new in-index log generation, and persist the
// bucket instance metadata (subject to fault injection). Returns the
// raw put result, including -ECANCELED on racing writes.
static int commit_target_layout(rgw::sal::RadosStore* store,
                                RGWBucketInfo& bucket_info,
                                std::map<std::string, bufferlist>& bucket_attrs,
                                ReshardFaultInjector& fault,
                                const DoutPrefixProvider *dpp)
{
  auto& layout = bucket_info.layout;
  const auto next_log_gen = layout.logs.empty() ? 1 :
      layout.logs.back().gen + 1;

  if (!store->svc()->zone->need_to_log_data()) {
    // if we're not syncing data, we can drop any existing logs
    layout.logs.clear();
  }

  // use the new index layout as current
  ceph_assert(layout.target_index);
  layout.current_index = std::move(*layout.target_index);
  layout.target_index = std::nullopt;
  layout.resharding = rgw::BucketReshardState::None;
  // add the in-index log layout
  layout.logs.push_back(log_layout_from_index(next_log_gen, layout.current_index));

  int ret = fault.check("commit_target_layout");
  if (ret == 0) { // no fault injected, write the bucket instance metadata
    ret = store->getRados()->put_bucket_instance_info(
        bucket_info, false, real_time(), &bucket_attrs, dpp, null_yield);
  } else if (ret == -ECANCELED) {
    fault.clear(); // clear the fault so a retry can succeed
  }
  return ret;
} // commit_target_layout
+
// Finalize a reshard: commit the target layout (retrying on racing
// metadata writes), notify data sync about the retired index shards,
// and delete the old index objects once no bilog generation needs them.
static int commit_reshard(rgw::sal::RadosStore* store,
                          RGWBucketInfo& bucket_info,
			  std::map<std::string, bufferlist>& bucket_attrs,
                          ReshardFaultInjector& fault,
                          const DoutPrefixProvider *dpp)
{
  auto prev = bucket_info.layout; // make a copy for cleanup

  // retry in case of racing writes to the bucket instance metadata
  static constexpr auto max_retries = 10;
  int tries = 0;
  int ret = 0;
  do {
    ret = commit_target_layout(store, bucket_info, bucket_attrs, fault, dpp);
    if (ret == -ECANCELED) {
      // racing write detected, read the latest bucket info and try again
      int ret2 = store->getRados()->get_bucket_instance_info(
          bucket_info.bucket, bucket_info,
          nullptr, &bucket_attrs, null_yield, dpp);
      if (ret2 < 0) {
        ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to read "
            "bucket info: " << cpp_strerror(ret2) << dendl;
        ret = ret2;
        break;
      }

      // check that we're still in the reshard state we started in
      if (bucket_info.layout.resharding != rgw::BucketReshardState::InProgress) {
        ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " raced with "
            "reshard cancel" << dendl;
        return -ECANCELED; // whatever canceled us already did the cleanup
      }
      if (bucket_info.layout.current_index != prev.current_index ||
          bucket_info.layout.target_index != prev.target_index) {
        ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " raced with "
            "another reshard" << dendl;
        return -ECANCELED; // whatever canceled us already did the cleanup
      }

      prev = bucket_info.layout; // update the copy
    }
    ++tries;
  } while (ret == -ECANCELED && tries < max_retries);

  if (ret < 0) {
    ldpp_dout(dpp, 0) << "ERROR: " << __func__ << " failed to commit "
        "target index layout: " << cpp_strerror(ret) << dendl;

    bucket_info.layout = std::move(prev); // restore in-memory layout

    // unblock writes to the current index shard objects
    int ret2 = set_resharding_status(dpp, store, bucket_info,
                                     cls_rgw_reshard_status::NOT_RESHARDING);
    if (ret2 < 0) {
      ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " failed to unblock "
          "writes to current index objects: " << cpp_strerror(ret2) << dendl;
      // non-fatal error
    }
    return ret;
  }

  if (store->svc()->zone->need_to_log_data() && !prev.logs.empty() &&
      prev.current_index.layout.type == rgw::BucketIndexType::Normal) {
    // write a datalog entry for each shard of the previous index. triggering
    // sync on the old shards will force them to detect the end-of-log for that
    // generation, and eventually transition to the next
    // TODO: use a log layout to support types other than BucketLogType::InIndex
    for (uint32_t shard_id = 0; shard_id < rgw::num_shards(prev.current_index.layout.normal); ++shard_id) {
      // This null_yield can stay, for now, since we're in our own thread
      ret = store->svc()->datalog_rados->add_entry(dpp, bucket_info, prev.logs.back(), shard_id,
                                                   null_yield);
      if (ret < 0) {
        ldpp_dout(dpp, 1) << "WARNING: failed writing data log (bucket_info.bucket="
            << bucket_info.bucket << ", shard_id=" << shard_id << "of generation="
            << prev.logs.back().gen << ")" << dendl;
      } // datalog error is not fatal
    }
  }

  // check whether the old index objects are still needed for bilogs
  const auto& logs = bucket_info.layout.logs;
  auto log = std::find_if(logs.begin(), logs.end(),
      [&prev] (const rgw::bucket_log_layout_generation& log) {
        return log.layout.type == rgw::BucketLogType::InIndex
            && log.layout.in_index.gen == prev.current_index.gen;
      });
  if (log == logs.end()) {
    // delete the index objects (ignore errors)
    store->svc()->bi->clean_index(dpp, bucket_info, prev.current_index);
  }
  return 0;
} // commit_reshard
+
+int RGWBucketReshard::clear_resharding(rgw::sal::RadosStore* store,
+ RGWBucketInfo& bucket_info,
+ std::map<std::string, bufferlist>& bucket_attrs,
+ const DoutPrefixProvider* dpp)
+{
+ ReshardFaultInjector no_fault;
+ return cancel_reshard(store, bucket_info, bucket_attrs, no_fault, dpp);
+}
+
+int RGWBucketReshard::cancel(const DoutPrefixProvider* dpp)
+{
+ int ret = reshard_lock.lock(dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (bucket_info.layout.resharding != rgw::BucketReshardState::InProgress) {
+ ldpp_dout(dpp, -1) << "ERROR: bucket is not resharding" << dendl;
+ ret = -EINVAL;
+ } else {
+ ret = clear_resharding(store, bucket_info, bucket_attrs, dpp);
+ }
+
+ reshard_lock.unlock();
+ return ret;
+}
+
// Set up a (possibly ephemeral) exclusive rados lock on the given oid,
// with a random cookie and a duration from configuration.
RGWBucketReshardLock::RGWBucketReshardLock(rgw::sal::RadosStore* _store,
					   const std::string& reshard_lock_oid,
					   bool _ephemeral) :
  store(_store),
  lock_oid(reshard_lock_oid),
  ephemeral(_ephemeral),
  internal_lock(reshard_lock_name)
{
  // NOTE(review): uint64_t config value narrowed into int -- assumes
  // rgw_reshard_bucket_lock_duration fits in int; confirm bounds
  const int lock_dur_secs = store->ctx()->_conf.get_val<uint64_t>(
      "rgw_reshard_bucket_lock_duration");
  duration = std::chrono::seconds(lock_dur_secs);

#define COOKIE_LEN 16
  char cookie_buf[COOKIE_LEN + 1];
  // random cookie distinguishes this locker instance from others
  gen_rand_alphanumeric(store->ctx(), cookie_buf, sizeof(cookie_buf) - 1);
  cookie_buf[COOKIE_LEN] = '\0';

  internal_lock.set_cookie(cookie_buf);
  internal_lock.set_duration(duration);
}
+
+int RGWBucketReshardLock::lock(const DoutPrefixProvider *dpp) {
+ internal_lock.set_must_renew(false);
+
+ int ret;
+ if (ephemeral) {
+ ret = internal_lock.lock_exclusive_ephemeral(&store->getRados()->reshard_pool_ctx,
+ lock_oid);
+ } else {
+ ret = internal_lock.lock_exclusive(&store->getRados()->reshard_pool_ctx, lock_oid);
+ }
+
+ if (ret == -EBUSY) {
+ ldout(store->ctx(), 0) << "INFO: RGWReshardLock::" << __func__ <<
+ " found lock on " << lock_oid <<
+ " to be held by another RGW process; skipping for now" << dendl;
+ return ret;
+ } else if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: RGWReshardLock::" << __func__ <<
+ " failed to acquire lock on " << lock_oid << ": " <<
+ cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ reset_time(Clock::now());
+
+ return 0;
+}
+
+void RGWBucketReshardLock::unlock() {
+ int ret = internal_lock.unlock(&store->getRados()->reshard_pool_ctx, lock_oid);
+ if (ret < 0) {
+ ldout(store->ctx(), 0) << "WARNING: RGWBucketReshardLock::" << __func__ <<
+ " failed to drop lock on " << lock_oid << " ret=" << ret << dendl;
+ }
+}
+
+int RGWBucketReshardLock::renew(const Clock::time_point& now) {
+ internal_lock.set_must_renew(true);
+ int ret;
+ if (ephemeral) {
+ ret = internal_lock.lock_exclusive_ephemeral(&store->getRados()->reshard_pool_ctx,
+ lock_oid);
+ } else {
+ ret = internal_lock.lock_exclusive(&store->getRados()->reshard_pool_ctx, lock_oid);
+ }
+ if (ret < 0) { /* expired or already locked by another processor */
+ std::stringstream error_s;
+ if (-ENOENT == ret) {
+ error_s << "ENOENT (lock expired or never initially locked)";
+ } else {
+ error_s << ret << " (" << cpp_strerror(-ret) << ")";
+ }
+ ldout(store->ctx(), 5) << __func__ << "(): failed to renew lock on " <<
+ lock_oid << " with error " << error_s.str() << dendl;
+ return ret;
+ }
+ internal_lock.set_must_renew(false);
+
+ reset_time(now);
+ ldout(store->ctx(), 20) << __func__ << "(): successfully renewed lock on " <<
+ lock_oid << dendl;
+
+ return 0;
+}
+
+
// Copy every bucket-index entry from the current index shards into the
// target shards, renewing the reshard lock(s) along the way. Progress
// is optionally reported to *out (JSON when verbose).
int RGWBucketReshard::do_reshard(const rgw::bucket_index_layout_generation& current,
                                 const rgw::bucket_index_layout_generation& target,
				 int max_entries,
				 bool verbose,
				 ostream *out,
				 Formatter *formatter,
                                 const DoutPrefixProvider *dpp)
{
  if (out) {
    (*out) << "tenant: " << bucket_info.bucket.tenant << std::endl;
    (*out) << "bucket name: " << bucket_info.bucket.name << std::endl;
  }

  /* update bucket info -- in progress*/
  list<rgw_cls_bi_entry> entries;

  if (max_entries < 0) {
    ldpp_dout(dpp, 0) << __func__ <<
      ": can't reshard, negative max_entries" << dendl;
    return -EINVAL;
  }

  // batches writes to the target shards and manages the shared AIOs
  BucketReshardManager target_shards_mgr(dpp, store, bucket_info, target);

  bool verbose_json_out = verbose && (formatter != nullptr) && (out != nullptr);

  if (verbose_json_out) {
    formatter->open_array_section("entries");
  }

  uint64_t total_entries = 0;

  if (!verbose_json_out && out) {
    (*out) << "total entries:";
  }

  const uint32_t num_source_shards = rgw::num_shards(current.layout.normal);
  string marker;
  // walk each source shard, listing its entries page by page
  for (uint32_t i = 0; i < num_source_shards; ++i) {
    bool is_truncated = true;
    marker.clear();
    const std::string null_object_filter; // empty string since we're not filtering by object
    while (is_truncated) {
      entries.clear();
      int ret = store->getRados()->bi_list(dpp, bucket_info, i, null_object_filter, marker, max_entries, &entries, &is_truncated);
      if (ret == -ENOENT) {
        ldpp_dout(dpp, 1) << "WARNING: " << __func__ << " failed to find shard "
            << i << ", skipping" << dendl;
        // break out of the is_truncated loop and move on to the next shard
        break;
      } else if (ret < 0) {
        derr << "ERROR: bi_list(): " << cpp_strerror(-ret) << dendl;
        return ret;
      }

      for (auto iter = entries.begin(); iter != entries.end(); ++iter) {
        rgw_cls_bi_entry& entry = *iter;
        if (verbose_json_out) {
          formatter->open_object_section("entry");

          encode_json("shard_id", i, formatter);
          encode_json("num_entry", total_entries, formatter);
          encode_json("entry", entry, formatter);
        }
        total_entries++;

        // advance the listing marker past this entry
        marker = entry.idx;

        int target_shard_id;
        cls_rgw_obj_key cls_key;
        RGWObjCategory category;
        rgw_bucket_category_stats stats;
        bool account = entry.get_info(&cls_key, &category, &stats);
        rgw_obj_key key(cls_key);
        if (entry.type == BIIndexType::OLH && key.empty()) {
          // bogus entry created by https://tracker.ceph.com/issues/46456
          // to fix, skip so it doesn't get include in the new bucket instance
          total_entries--;
          ldpp_dout(dpp, 10) << "Dropping entry with empty name, idx=" << marker << dendl;
          continue;
        }
        rgw_obj obj(bucket_info.bucket, key);
        RGWMPObj mp;
        if (key.ns == RGW_OBJ_NS_MULTIPART && mp.from_meta(key.name)) {
          // place the multipart .meta object on the same shard as its head object
          obj.index_hash_source = mp.get_key();
        }
        // compute which target shard this entry hashes to
        ret = store->getRados()->get_target_shard_id(bucket_info.layout.target_index->layout.normal,
                                                     obj.get_hash_object(), &target_shard_id);
        if (ret < 0) {
          ldpp_dout(dpp, -1) << "ERROR: get_target_shard_id() returned ret=" << ret << dendl;
          return ret;
        }

        int shard_index = (target_shard_id > 0 ? target_shard_id : 0);

        ret = target_shards_mgr.add_entry(shard_index, entry, account,
                                          category, stats);
        if (ret < 0) {
          return ret;
        }

        // periodically renew the reshard lock(s) so they don't expire
        // mid-copy
        Clock::time_point now = Clock::now();
        if (reshard_lock.should_renew(now)) {
          // assume outer locks have timespans at least the size of ours, so
          // can call inside conditional
          if (outer_reshard_lock) {
            ret = outer_reshard_lock->renew(now);
            if (ret < 0) {
              return ret;
            }
          }
          ret = reshard_lock.renew(now);
          if (ret < 0) {
            ldpp_dout(dpp, -1) << "Error renewing bucket lock: " << ret << dendl;
            return ret;
          }
        }
        if (verbose_json_out) {
          formatter->close_section();
          formatter->flush(*out);
        } else if (out && !(total_entries % 1000)) {
          (*out) << " " << total_entries;
        }
      } // entries loop
    }
  }

  if (verbose_json_out) {
    formatter->close_section();
    formatter->flush(*out);
  } else if (out) {
    (*out) << " " << total_entries << std::endl;
  }

  // flush remaining batches and wait for all async writes
  int ret = target_shards_mgr.finish();
  if (ret < 0) {
    ldpp_dout(dpp, -1) << "ERROR: failed to reshard" << dendl;
    return -EIO;
  }
  return 0;
} // RGWBucketReshard::do_reshard
+
// Fetch the per-shard reshard status entries for this bucket's index.
int RGWBucketReshard::get_status(const DoutPrefixProvider *dpp, list<cls_rgw_bucket_instance_entry> *status)
{
  return store->svc()->bi_rados->get_reshard_status(dpp, bucket_info, status);
}
+
// Run a full reshard to num_shards: lock the bucket, prepare the
// target layout, copy all index entries, and commit (or cancel on
// error). fault allows tests to inject failures at named checkpoints.
int RGWBucketReshard::execute(int num_shards,
                              ReshardFaultInjector& fault,
                              int max_op_entries,
                              const DoutPrefixProvider *dpp,
                              bool verbose, ostream *out,
                              Formatter *formatter,
                              RGWReshard* reshard_log)
{
  // take a reshard lock on the bucket
  int ret = reshard_lock.lock(dpp);
  if (ret < 0) {
    return ret;
  }
  // unlock when scope exits
  auto unlock = make_scope_guard([this] { reshard_lock.unlock(); });

  if (reshard_log) {
    ret = reshard_log->update(dpp, bucket_info);
    if (ret < 0) {
      return ret;
    }
  }

  // prepare the target index and add its layout the bucket info
  ret = init_reshard(store, bucket_info, bucket_attrs, fault, num_shards, dpp);
  if (ret < 0) {
    return ret;
  }

  if (ret = fault.check("do_reshard");
      ret == 0) { // no fault injected, do the reshard
    ret = do_reshard(bucket_info.layout.current_index,
                     *bucket_info.layout.target_index,
                     max_op_entries, verbose, out, formatter, dpp);
  }

  if (ret < 0) {
    // roll back: unblock writes and drop the target layout (best effort)
    cancel_reshard(store, bucket_info, bucket_attrs, fault, dpp);

    ldpp_dout(dpp, 1) << __func__ << " INFO: reshard of bucket \""
        << bucket_info.bucket.name << "\" canceled due to errors" << dendl;
    return ret;
  }

  ret = commit_reshard(store, bucket_info, bucket_attrs, fault, dpp);
  if (ret < 0) {
    return ret;
  }

  ldpp_dout(dpp, 1) << __func__ << " INFO: reshard of bucket \""
      << bucket_info.bucket.name << "\" completed successfully" << dendl;
  return 0;
} // execute
+
+// A bucket may be resharded unless multisite data logging is active
+// and its retained bilog history has already reached the cap.
+bool RGWBucketReshard::can_reshard(const RGWBucketInfo& bucket,
+                                   const RGWSI_Zone* zone_svc)
+{
+  if (!zone_svc->need_to_log_data()) {
+    // no multisite logging, so there is no log history to bound
+    return true;
+  }
+  // peer zones must sync and trim old generations before we add more
+  return bucket.layout.logs.size() < max_bilog_history;
+}
+
+
+// Construct the reshard-queue manager. Output/formatter arguments are
+// only used for verbose admin-driven runs.
+RGWReshard::RGWReshard(rgw::sal::RadosStore* _store, bool _verbose, ostream *_out,
+                       Formatter *_formatter) :
+  store(_store), instance_lock(bucket_instance_lock_name),
+  verbose(_verbose), out(_out), formatter(_formatter)
+{
+  // NOTE(review): conf value is uint64_t but num_logshards is int --
+  // implicit narrowing; presumably the configured value is small
+  num_logshards = store->ctx()->_conf.get_val<uint64_t>("rgw_reshard_num_logs");
+}
+
+// Build the hashing key "<tenant>:<bucket>" so a given bucket always
+// maps to the same reshard log shard.
+string RGWReshard::get_logshard_key(const string& tenant,
+				    const string& bucket_name)
+{
+  string key = tenant;
+  key.append(1, ':');
+  key.append(bucket_name);
+  return key;
+}
+
+#define MAX_RESHARD_LOGSHARDS_PRIME 7877
+
+// Map a tenant/bucket pair to the oid of the reshard log shard that
+// holds (or will hold) its queue entry.
+void RGWReshard::get_bucket_logshard_oid(const string& tenant, const string& bucket_name, string *oid)
+{
+  const string key = get_logshard_key(tenant, bucket_name);
+
+  // hash the key, fold the low byte back into the high bits to mix it,
+  // then reduce modulo a prime before the configured shard count
+  uint32_t hash = ceph_str_hash_linux(key.c_str(), key.size());
+  const uint32_t mixed = hash ^ ((hash & 0xFF) << 24);
+  const uint32_t shard = mixed % MAX_RESHARD_LOGSHARDS_PRIME % num_logshards;
+
+  get_logshard_oid(int(shard), oid);
+}
+
+// Add (or overwrite) a bucket's entry in its reshard log shard.
+// Returns 0 on success -- including when resharding is disabled, in
+// which case the request is silently dropped.
+int RGWReshard::add(const DoutPrefixProvider *dpp, cls_rgw_reshard_entry& entry)
+{
+  if (!store->svc()->zone->can_reshard()) {
+    ldpp_dout(dpp, 20) << __func__ << " Resharding is disabled" << dendl;
+    return 0;
+  }
+
+  string logshard_oid;
+
+  get_bucket_logshard_oid(entry.tenant, entry.bucket_name, &logshard_oid);
+
+  // the cls operation performs the insert/update server-side
+  librados::ObjectWriteOperation op;
+  cls_rgw_reshard_add(op, entry);
+
+  int ret = rgw_rados_operate(dpp, store->getRados()->reshard_pool_ctx, logshard_oid, &op, null_yield);
+  if (ret < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: failed to add entry to reshard log, oid=" << logshard_oid << " tenant=" << entry.tenant << " bucket=" << entry.bucket_name << dendl;
+    return ret;
+  }
+  return 0;
+}
+
+// Refresh the reshard-queue entry for a bucket: fetch the existing
+// entry (keyed by tenant/name/bucket_id), then re-add it so its state
+// is rewritten. Returns the error from get() or add().
+int RGWReshard::update(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info)
+{
+  cls_rgw_reshard_entry entry;
+  entry.bucket_name = bucket_info.bucket.name;
+  entry.bucket_id = bucket_info.bucket.bucket_id;
+  entry.tenant = bucket_info.owner.tenant;
+
+  // get() fills in the remaining fields of the entry on success
+  int ret = get(dpp, entry);
+  if (ret < 0) {
+    return ret;
+  }
+
+  ret = add(dpp, entry);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << __func__ << ":Error in updating entry bucket " << entry.bucket_name << ": " <<
+      cpp_strerror(-ret) << dendl;
+  }
+
+  return ret;
+}
+
+
+// List up to `max` queue entries from one log shard, starting after
+// `marker`. Sets *is_truncated when more entries remain. A missing
+// shard object is reported as an empty, non-truncated listing.
+int RGWReshard::list(const DoutPrefixProvider *dpp, int logshard_num, string& marker, uint32_t max, std::list<cls_rgw_reshard_entry>& entries, bool *is_truncated)
+{
+  string logshard_oid;
+
+  get_logshard_oid(logshard_num, &logshard_oid);
+
+  int ret = cls_rgw_reshard_list(store->getRados()->reshard_pool_ctx, logshard_oid, marker, max, entries, is_truncated);
+
+  if (ret == -ENOENT) {
+    // these shard objects aren't created until we actually write something to
+    // them, so treat ENOENT as a successful empty listing
+    *is_truncated = false;
+    ret = 0;
+  } else if (ret == -EACCES) {
+    ldpp_dout(dpp, -1) << "ERROR: access denied to pool " << store->svc()->zone->get_zone_params().reshard_pool
+                      << ". Fix the pool access permissions of your client" << dendl;
+  } else if (ret < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: failed to list reshard log entries, oid="
+        << logshard_oid << " marker=" << marker << " " << cpp_strerror(ret) << dendl;
+  }
+
+  return ret;
+}
+
+// Look up a bucket's reshard-queue entry. On input, entry's tenant and
+// bucket_name select the shard; on success the cls call fills in the
+// remaining fields. Returns -ENOENT (without logging) if absent.
+int RGWReshard::get(const DoutPrefixProvider *dpp, cls_rgw_reshard_entry& entry)
+{
+  string oid;
+  get_bucket_logshard_oid(entry.tenant, entry.bucket_name, &oid);
+
+  const int r = cls_rgw_reshard_get(store->getRados()->reshard_pool_ctx, oid, entry);
+  if (r >= 0) {
+    return 0;
+  }
+  if (r != -ENOENT) {
+    // a missing entry is an expected condition; anything else is logged
+    ldpp_dout(dpp, -1) << "ERROR: failed to get entry from reshard log, oid=" << oid << " tenant=" << entry.tenant <<
+      " bucket=" << entry.bucket_name << dendl;
+  }
+  return r;
+}
+
+// Remove a bucket's entry from its reshard log shard.
+int RGWReshard::remove(const DoutPrefixProvider *dpp, const cls_rgw_reshard_entry& entry)
+{
+  string oid;
+  get_bucket_logshard_oid(entry.tenant, entry.bucket_name, &oid);
+
+  // server-side removal via the cls operation
+  librados::ObjectWriteOperation wop;
+  cls_rgw_reshard_remove(wop, entry);
+
+  const int r = rgw_rados_operate(dpp, store->getRados()->reshard_pool_ctx, oid, &wop, null_yield);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: failed to remove entry from reshard log, oid=" << oid << " tenant=" << entry.tenant << " bucket=" << entry.bucket_name << dendl;
+  }
+  return r;
+}
+
+// Clear the in-progress resharding flag on a bucket instance object.
+// The reshard-queue entry itself is removed separately via remove().
+int RGWReshard::clear_bucket_resharding(const DoutPrefixProvider *dpp, const string& bucket_instance_oid, cls_rgw_reshard_entry& entry)
+{
+  const int r = cls_rgw_clear_bucket_resharding(store->getRados()->reshard_pool_ctx, bucket_instance_oid);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: failed to clear bucket resharding, bucket_instance_oid=" << bucket_instance_oid << dendl;
+    return r;
+  }
+  return 0;
+}
+
+// Wait up to `duration` for an in-progress reshard to finish. With a
+// yield context the wait is asynchronous on a timer registered in
+// `waiters` (so stop() can cancel it); otherwise it blocks on the
+// condition variable. Returns 0 after a normal wait/timeout, a
+// negative errno if the timer wait failed or was cancelled, or
+// -ECANCELED once stop() has been called.
+int RGWReshardWait::wait(optional_yield y)
+{
+  std::unique_lock lock(mutex);
+
+  if (going_down) {
+    return -ECANCELED;
+  }
+
+  if (y) {
+    auto& context = y.get_io_context();
+    auto& yield = y.get_yield_context();
+
+    // register so stop() can cancel the timer; drop the mutex while
+    // suspended on the timer
+    Waiter waiter(context);
+    waiters.push_back(waiter);
+    lock.unlock();
+
+    waiter.timer.expires_after(duration);
+
+    boost::system::error_code ec;
+    waiter.timer.async_wait(yield[ec]);
+
+    // re-acquire before touching the intrusive waiters list
+    lock.lock();
+    waiters.erase(waiters.iterator_to(waiter));
+    // ec is 0 on normal expiry; cancellation yields a nonzero code
+    return -ec.value();
+  }
+
+  // synchronous path: sleep on the condvar, which stop() notifies
+  cond.wait_for(lock, duration);
+
+  if (going_down) {
+    return -ECANCELED;
+  }
+
+  return 0;
+}
+
+// Shut down: mark going_down, wake the blocking waiters, and cancel
+// the timers of any coroutine waiters so they return with an error.
+void RGWReshardWait::stop()
+{
+  std::scoped_lock lock(mutex);
+  going_down = true;
+  cond.notify_all();
+  for (auto& w : waiters) {
+    // unblock any waiters with ECANCELED
+    w.timer.cancel();
+  }
+}
+
+// Execute the reshard described by one queue entry, then drop the
+// entry from the queue. Stale entries -- the bucket is gone, or its
+// bucket_id no longer matches (already resharded) -- are cleaned up
+// and treated as success. Returns 0 on success or cleanup, otherwise
+// a negative error code.
+int RGWReshard::process_entry(const cls_rgw_reshard_entry& entry,
+                              int max_entries, const DoutPrefixProvider *dpp)
+{
+  ldpp_dout(dpp, 20) << __func__ << " resharding " <<
+    entry.bucket_name  << dendl;
+
+  rgw_bucket bucket;
+  RGWBucketInfo bucket_info;
+  std::map<std::string, bufferlist> bucket_attrs;
+
+  int ret = store->getRados()->get_bucket_info(store->svc(),
+                                               entry.tenant,
+                                               entry.bucket_name,
+                                               bucket_info, nullptr,
+                                               null_yield, dpp,
+                                               &bucket_attrs);
+  // either the lookup failed or the id changed under us: both mean the
+  // queue entry is stale
+  if (ret < 0 || bucket_info.bucket.bucket_id != entry.bucket_id) {
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) <<  __func__ <<
+	": Error in get_bucket_info for bucket " << entry.bucket_name <<
+	": " << cpp_strerror(-ret) << dendl;
+      if (ret != -ENOENT) {
+	// any error other than ENOENT will abort
+	return ret;
+      }
+    } else {
+      ldpp_dout(dpp, 0) << __func__ <<
+	": Bucket: " << entry.bucket_name <<
+	" already resharded by someone, skipping " << dendl;
+    }
+
+    // we've encountered a reshard queue entry for an apparently
+    // non-existent bucket; let's try to recover by cleaning up
+    ldpp_dout(dpp, 0) <<  __func__ <<
+      ": removing reshard queue entry for a resharded or non-existent bucket" <<
+      entry.bucket_name << dendl;
+
+    ret = remove(dpp, entry);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << __func__ <<
+	": Error removing non-existent bucket " <<
+	entry.bucket_name << " from resharding queue: " <<
+	cpp_strerror(-ret) << dendl;
+      return ret;
+    }
+
+    // we cleaned up, move on to the next entry
+    return 0;
+  }
+
+  // multisite: don't grow bilog history past the cap; drop the entry
+  // and let it be re-queued once peers catch up
+  if (!RGWBucketReshard::can_reshard(bucket_info, store->svc()->zone)) {
+    ldpp_dout(dpp, 1) << "Bucket " << bucket_info.bucket << " is not "
+        "eligible for resharding until peer zones finish syncing one "
+        "or more of its old log generations" << dendl;
+    return remove(dpp, entry);
+  }
+
+  RGWBucketReshard br(store, bucket_info, bucket_attrs, nullptr);
+
+  ReshardFaultInjector f; // no fault injected
+  ret = br.execute(entry.new_num_shards, f, max_entries, dpp,
+                   false, nullptr, nullptr, this);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) <<  __func__ <<
+      ": Error during resharding bucket " << entry.bucket_name << ":" <<
+      cpp_strerror(-ret)<< dendl;
+    return ret;
+  }
+
+  ldpp_dout(dpp, 20) << __func__ <<
+    " removing reshard queue entry for bucket " << entry.bucket_name <<
+    dendl;
+
+  ret = remove(dpp, entry);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << __func__ << ": Error removing bucket " <<
+      entry.bucket_name << " from resharding queue: " <<
+      cpp_strerror(-ret) << dendl;
+    return ret;
+  }
+  return 0;
+}
+
+// Drain one reshard log shard: take the shard lock, page through its
+// queue entries, reshard each referenced bucket, renewing the lock as
+// needed. Returns 0 when the shard is fully processed, or a negative
+// error code (lock acquisition, entry processing, or renewal failure).
+int RGWReshard::process_single_logshard(int logshard_num, const DoutPrefixProvider *dpp)
+{
+  string marker;
+  bool truncated = true;
+
+  constexpr uint32_t max_entries = 1000;
+
+  string logshard_oid;
+  get_logshard_oid(logshard_num, &logshard_oid);
+
+  // only one daemon may process a given log shard at a time
+  RGWBucketReshardLock logshard_lock(store, logshard_oid, false);
+
+  int ret = logshard_lock.lock(dpp);
+  if (ret < 0) { 
+    ldpp_dout(dpp, 5) << __func__ << "(): failed to acquire lock on " <<
+      logshard_oid << ", ret = " << ret <<dendl;
+    return ret;
+  }
+  
+  do {
+    std::list<cls_rgw_reshard_entry> entries;
+    ret = list(dpp, logshard_num, marker, max_entries, entries, &truncated);
+    if (ret < 0) {
+      ldpp_dout(dpp, 10) << "cannot list all reshards in logshard oid=" <<
+	logshard_oid << dendl;
+      // NOTE(review): retrying without advancing the marker can spin
+      // forever on a persistent listing error -- consider a retry cap
+      continue;
+    }
+
+    for(auto& entry: entries) { // logshard entries
+      // capture the result; previously it was discarded, so the error
+      // check below tested the stale return value of list() instead
+      ret = process_entry(entry, max_entries, dpp);
+      if (ret < 0) {
+        // release the shard lock rather than holding it until expiry
+        logshard_lock.unlock();
+        return ret;
+      }
+
+      // renew the lock before it expires so we keep exclusive access
+      Clock::time_point now = Clock::now();
+      if (logshard_lock.should_renew(now)) {
+        ret = logshard_lock.renew(now);
+        if (ret < 0) {
+          // renewal failed; the lease may already be lost, so bail out
+          return ret;
+        }
+      }
+
+      // advance the listing marker past this entry
+      entry.get_key(&marker);
+    } // entry for loop
+  } while (truncated);
+
+  logshard_lock.unlock();
+  return 0;
+}
+
+
+// Compose a log shard oid: the common prefix followed by the shard
+// number rendered as a fixed-width, zero-padded decimal suffix.
+void RGWReshard::get_logshard_oid(int shard_num, string *logshard)
+{
+  char suffix[32];
+  snprintf(suffix, sizeof(suffix), "%010u", (unsigned)shard_num);
+
+  *logshard = string(reshard_oid_prefix) + suffix;
+}
+
+// Sweep every reshard log shard in turn. Per-shard failures are
+// logged but do not stop the sweep; always returns 0.
+int RGWReshard::process_all_logshards(const DoutPrefixProvider *dpp)
+{
+  for (int shard = 0; shard < num_logshards; shard++) {
+    string logshard;
+    get_logshard_oid(shard, &logshard);
+
+    ldpp_dout(dpp, 20) << "processing logshard = " << logshard << dendl;
+
+    const int r = process_single_logshard(shard, dpp);
+
+    ldpp_dout(dpp, 20) << "finish processing logshard = " << logshard << " , ret = " << r << dendl;
+  }
+
+  return 0;
+}
+
+// True once stop_processor() has requested shutdown (atomic flag).
+bool RGWReshard::going_down()
+{
+  return down_flag;
+}
+
+// Spawn the background reshard worker thread.
+// NOTE(review): assumes no worker is already running; a second call
+// would leak the previous ReshardWorker -- confirm callers start once.
+void RGWReshard::start_processor()
+{
+  worker = new ReshardWorker(store->ctx(), this);
+  worker->create("rgw_reshard");
+}
+
+// Request shutdown of the worker thread, wait for it to exit, then
+// reclaim it. Safe to call when no worker was started.
+void RGWReshard::stop_processor()
+{
+  down_flag = true;
+  if (worker) {
+    worker->stop();
+    worker->join();
+    delete worker;
+    worker = nullptr;
+  }
+}
+
+// Worker thread main loop: sweep all log shards, then sleep out the
+// remainder of the configured interval (interruptible by stop()),
+// until shutdown is requested.
+void *RGWReshard::ReshardWorker::entry() {
+  do {
+    utime_t start = ceph_clock_now();
+    reshard->process_all_logshards(this);
+
+    if (reshard->going_down())
+      break;
+
+    // subtract the time the sweep took from the interval
+    utime_t end = ceph_clock_now();
+    end -= start;
+    // NOTE(review): uint64_t conf value narrowed to int -- assumes the
+    // configured interval fits; confirm option bounds
+    int secs = cct->_conf.get_val<uint64_t>("rgw_reshard_thread_interval");
+
+    if (secs <= end.sec())
+      continue; // next round
+
+    secs -= end.sec();
+
+    // sleep, but wake early if stop() notifies the condvar
+    std::unique_lock locker{lock};
+    cond.wait_for(locker, std::chrono::seconds(secs));
+  } while (!reshard->going_down());
+
+  return NULL;
+}
+
+// Wake the worker from its inter-round sleep; the worker then
+// re-checks reshard->going_down() and exits.
+void RGWReshard::ReshardWorker::stop()
+{
+  std::lock_guard l{lock};
+  cond.notify_all();
+}
+
+// DoutPrefixProvider: context used for logging.
+CephContext *RGWReshard::ReshardWorker::get_cct() const
+{
+  return cct;
+}
+
+// DoutPrefixProvider: logging subsystem (rgw).
+unsigned RGWReshard::ReshardWorker::get_subsys() const
+{
+  return dout_subsys;
+}
+
+// DoutPrefixProvider: prefix prepended to this thread's log lines.
+std::ostream& RGWReshard::ReshardWorker::gen_prefix(std::ostream& out) const
+{
+  return out << "rgw reshard worker thread: ";
+}
diff --git a/src/rgw/driver/rados/rgw_reshard.h b/src/rgw/driver/rados/rgw_reshard.h
new file mode 100644
index 000000000..59819f3a5
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_reshard.h
@@ -0,0 +1,274 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <vector>
+#include <initializer_list>
+#include <functional>
+#include <iterator>
+#include <algorithm>
+
+#include <boost/intrusive/list.hpp>
+#include <boost/asio/basic_waitable_timer.hpp>
+
+#include "include/common_fwd.h"
+#include "include/rados/librados.hpp"
+#include "common/ceph_time.h"
+#include "common/async/yield_context.h"
+#include "cls/rgw/cls_rgw_types.h"
+#include "cls/lock/cls_lock_client.h"
+
+#include "rgw_common.h"
+#include "common/fault_injector.h"
+
+
+class RGWReshard;
+namespace rgw { namespace sal {
+ class RadosStore;
+} }
+
+using ReshardFaultInjector = FaultInjector<std::string_view>;
+
+// Timed cls advisory lock guarding a bucket (or log shard) during
+// resharding. The holder must call renew() before the lease expires;
+// should_renew() reports when half the lease duration has elapsed.
+// NOTE(review): semantics of `ephemeral` are defined in the .cc --
+// presumably the lock object is removed on unlock; confirm there.
+class RGWBucketReshardLock {
+  using Clock = ceph::coarse_mono_clock;
+
+  rgw::sal::RadosStore* store;
+  const std::string lock_oid;
+  const bool ephemeral;
+  rados::cls::lock::Lock internal_lock;
+  std::chrono::seconds duration;
+
+  Clock::time_point start_time;
+  Clock::time_point renew_thresh;
+
+  // restart the lease clock; renewal is due at half the duration
+  void reset_time(const Clock::time_point& now) {
+    start_time = now;
+    renew_thresh = start_time + duration / 2;
+  }
+
+public:
+  RGWBucketReshardLock(rgw::sal::RadosStore* _store,
+                       const std::string& reshard_lock_oid,
+                       bool _ephemeral);
+  // convenience: lock oid derived from the bucket's key
+  RGWBucketReshardLock(rgw::sal::RadosStore* _store,
+                       const RGWBucketInfo& bucket_info,
+                       bool _ephemeral) :
+    RGWBucketReshardLock(_store, bucket_info.bucket.get_key(':'), _ephemeral)
+  {}
+
+  int lock(const DoutPrefixProvider *dpp);
+  void unlock();
+  int renew(const Clock::time_point&);
+
+  bool should_renew(const Clock::time_point& now) const {
+    return now >= renew_thresh;
+  }
+}; // class RGWBucketReshardLock
+
+// Drives the reshard of a single bucket's index: creates the target
+// layout, copies entries, and commits or cancels. Also hosts the
+// static helpers that pick prime shard counts.
+class RGWBucketReshard {
+ public:
+  using Clock = ceph::coarse_mono_clock;
+
+ private:
+  rgw::sal::RadosStore *store;
+  RGWBucketInfo bucket_info;
+  std::map<std::string, bufferlist> bucket_attrs;
+
+  RGWBucketReshardLock reshard_lock;
+  RGWBucketReshardLock* outer_reshard_lock;
+
+  // using an initializer_list as an array in contiguous memory
+  // allocated all at once; must be sorted ascending, as required by
+  // the binary searches (upper_bound/lower_bound) below
+  static const std::initializer_list<uint16_t> reshard_primes;
+
+  // copy index entries from the current layout into the target layout
+  int do_reshard(const rgw::bucket_index_layout_generation& current,
+                 const rgw::bucket_index_layout_generation& target,
+                 int max_entries,
+                 bool verbose,
+                 std::ostream *os,
+                 Formatter *formatter,
+                 const DoutPrefixProvider *dpp);
+public:
+
+  // pass nullptr for the final parameter if no outer reshard lock to
+  // manage
+  RGWBucketReshard(rgw::sal::RadosStore* _store,
+		   const RGWBucketInfo& _bucket_info,
+		   const std::map<std::string, bufferlist>& _bucket_attrs,
+		   RGWBucketReshardLock* _outer_reshard_lock);
+  // run the full reshard; locks the bucket for the duration
+  int execute(int num_shards, ReshardFaultInjector& f,
+              int max_op_entries, const DoutPrefixProvider *dpp,
+              bool verbose = false, std::ostream *out = nullptr,
+              ceph::Formatter *formatter = nullptr,
+              RGWReshard *reshard_log = nullptr);
+  int get_status(const DoutPrefixProvider *dpp, std::list<cls_rgw_bucket_instance_entry> *status);
+  int cancel(const DoutPrefixProvider* dpp);
+
+  static int clear_resharding(rgw::sal::RadosStore* store,
+			      RGWBucketInfo& bucket_info,
+			      std::map<std::string, bufferlist>& bucket_attrs,
+			      const DoutPrefixProvider* dpp);
+
+  // largest prime available (last element of the sorted list)
+  static uint32_t get_max_prime_shards() {
+    return *std::crbegin(reshard_primes);
+  }
+
+  // returns the prime in our list less than or equal to the
+  // parameter; the lowest value that can be returned is 1
+  static uint32_t get_prime_shards_less_or_equal(uint32_t requested_shards) {
+    auto it = std::upper_bound(reshard_primes.begin(), reshard_primes.end(),
+			       requested_shards);
+    if (it == reshard_primes.begin()) {
+      return 1;
+    } else {
+      return *(--it);
+    }
+  }
+
+  // returns the prime in our list greater than or equal to the
+  // parameter; if we do not have such a prime, 0 is returned
+  static uint32_t get_prime_shards_greater_or_equal(
+    uint32_t requested_shards)
+  {
+    auto it = std::lower_bound(reshard_primes.begin(), reshard_primes.end(),
+			       requested_shards);
+    if (it == reshard_primes.end()) {
+      return 0;
+    } else {
+      return *it;
+    }
+  }
+
+  // returns a preferred number of shards given a calculated number of
+  // shards based on max_dynamic_shards and the list of prime values
+  static uint32_t get_preferred_shards(uint32_t suggested_shards,
+				       uint32_t max_dynamic_shards) {
+
+    // use a prime if max is within our prime range, otherwise use
+    // specified max
+    const uint32_t absolute_max =
+      max_dynamic_shards >= get_max_prime_shards() ?
+      max_dynamic_shards :
+      get_prime_shards_less_or_equal(max_dynamic_shards);
+
+    // if we can use a prime number, use it, otherwise use suggested;
+    // note get_prime_shards_greater_or_equal will return 0 if no prime in
+    // prime range
+    const uint32_t prime_ish_num_shards =
+      std::max(get_prime_shards_greater_or_equal(suggested_shards),
+	       suggested_shards);
+
+    // dynamic sharding cannot reshard more than defined maximum
+    const uint32_t final_num_shards =
+      std::min(prime_ish_num_shards, absolute_max);
+
+    return final_num_shards;
+  }
+
+  const std::map<std::string, bufferlist>& get_bucket_attrs() const {
+    return bucket_attrs;
+  }
+
+  // for multisite, the RGWBucketInfo keeps a history of old log generations
+  // until all peers are done with them. prevent this log history from growing
+  // too large by refusing to reshard the bucket until the old logs get trimmed
+  static constexpr size_t max_bilog_history = 4;
+
+  static bool can_reshard(const RGWBucketInfo& bucket,
+			  const RGWSI_Zone* zone_svc);
+}; // RGWBucketReshard
+
+
+// Manages the sharded reshard queue (one rados object per log shard)
+// and the background worker thread that drains it.
+class RGWReshard {
+public:
+    using Clock = ceph::coarse_mono_clock;
+
+private:
+    rgw::sal::RadosStore* store;
+    std::string lock_name;
+    rados::cls::lock::Lock instance_lock;
+    int num_logshards;
+
+    bool verbose;
+    std::ostream *out;
+    Formatter *formatter;
+
+    void get_logshard_oid(int shard_num, std::string *shard);
+protected:
+  // background thread that periodically sweeps all log shards
+  class ReshardWorker : public Thread, public DoutPrefixProvider {
+    CephContext *cct;
+    RGWReshard *reshard;
+    ceph::mutex lock = ceph::make_mutex("ReshardWorker");
+    ceph::condition_variable cond;
+
+  public:
+    ReshardWorker(CephContext * const _cct,
+		  RGWReshard * const _reshard)
+      : cct(_cct),
+        reshard(_reshard) {}
+
+    void *entry() override;
+    void stop();
+
+    CephContext *get_cct() const override;
+    unsigned get_subsys() const override;
+    std::ostream& gen_prefix(std::ostream& out) const override;
+  };
+
+  ReshardWorker *worker = nullptr;
+  std::atomic<bool> down_flag = { false };
+
+  // helpers mapping a bucket to its reshard log shard object
+  std::string get_logshard_key(const std::string& tenant, const std::string& bucket_name);
+  void get_bucket_logshard_oid(const std::string& tenant, const std::string& bucket_name, std::string *oid);
+
+public:
+  RGWReshard(rgw::sal::RadosStore* _store, bool _verbose = false, std::ostream *_out = nullptr, Formatter *_formatter = nullptr);
+  // queue entry CRUD on the reshard log shards
+  int add(const DoutPrefixProvider *dpp, cls_rgw_reshard_entry& entry);
+  int update(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info);
+  int get(const DoutPrefixProvider *dpp, cls_rgw_reshard_entry& entry);
+  int remove(const DoutPrefixProvider *dpp, const cls_rgw_reshard_entry& entry);
+  int list(const DoutPrefixProvider *dpp, int logshard_num, std::string& marker, uint32_t max, std::list<cls_rgw_reshard_entry>& entries, bool *is_truncated);
+  int clear_bucket_resharding(const DoutPrefixProvider *dpp, const std::string& bucket_instance_oid, cls_rgw_reshard_entry& entry);
+
+  /* reshard thread */
+  int process_entry(const cls_rgw_reshard_entry& entry, int max_entries,
+                    const DoutPrefixProvider *dpp);
+  int process_single_logshard(int logshard_num, const DoutPrefixProvider *dpp);
+  int process_all_logshards(const DoutPrefixProvider *dpp);
+  bool going_down();
+  void start_processor();
+  void stop_processor();
+};
+
+// Lets request threads/coroutines wait (with timeout) for a bucket
+// reshard to complete; stop() unblocks all waiters on shutdown.
+class RGWReshardWait {
+ public:
+  // the blocking wait uses std::condition_variable::wait_for(), which uses the
+  // std::chrono::steady_clock. use that for the async waits as well
+  using Clock = std::chrono::steady_clock;
+ private:
+  const ceph::timespan duration;
+  ceph::mutex mutex = ceph::make_mutex("RGWReshardWait::lock");
+  ceph::condition_variable cond;
+
+  // timer entry for an asynchronous (yield-context) waiter; linked
+  // into `waiters` so stop() can cancel it
+  struct Waiter : boost::intrusive::list_base_hook<> {
+    using Executor = boost::asio::io_context::executor_type;
+    using Timer = boost::asio::basic_waitable_timer<Clock,
+          boost::asio::wait_traits<Clock>, Executor>;
+    Timer timer;
+    explicit Waiter(boost::asio::io_context& ioc) : timer(ioc) {}
+  };
+  boost::intrusive::list<Waiter> waiters;
+
+  bool going_down{false};
+
+public:
+  RGWReshardWait(ceph::timespan duration = std::chrono::seconds(5))
+    : duration(duration) {}
+  ~RGWReshardWait() {
+    // stop() must have been called before destruction
+    ceph_assert(going_down);
+  }
+  int wait(optional_yield y);
+  // unblock any threads waiting on reshard
+  void stop();
+};
diff --git a/src/rgw/driver/rados/rgw_rest_bucket.cc b/src/rgw/driver/rados/rgw_rest_bucket.cc
new file mode 100644
index 000000000..ebe4e429c
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_rest_bucket.cc
@@ -0,0 +1,413 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_op.h"
+#include "driver/rados/rgw_bucket.h"
+#include "rgw_rest_bucket.h"
+#include "rgw_sal.h"
+
+#include "include/str_list.h"
+
+#include "services/svc_sys_obj.h"
+#include "services/svc_zone.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+// Admin op: GET bucket info (optionally with stats) for ?uid/?bucket.
+class RGWOp_Bucket_Info : public RGWRESTOp {
+
+public:
+  RGWOp_Bucket_Info() {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("buckets", RGW_CAP_READ);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "get_bucket_info"; }
+};
+
+void RGWOp_Bucket_Info::execute(optional_yield y)
+{
+  RGWBucketAdminOpState op_state;
+
+  bool fetch_stats;
+
+  std::string bucket;
+
+  string uid_str;
+
+  // all parameters are optional; "stats" defaults to false
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
+  RESTArgs::get_string(s, "bucket", bucket, &bucket);
+  RESTArgs::get_bool(s, "stats", false, &fetch_stats);
+
+  op_state.set_user_id(uid);
+  op_state.set_bucket_name(bucket);
+  op_state.set_fetch_stats(fetch_stats);
+
+  op_ret = RGWBucketAdminOp::info(driver, op_state, flusher, y, this);
+}
+
+// Admin op: GET the ACL policy of a bucket or of one object in it.
+class RGWOp_Get_Policy : public RGWRESTOp {
+
+public:
+  RGWOp_Get_Policy() {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("buckets", RGW_CAP_READ);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "get_policy"; }
+};
+
+void RGWOp_Get_Policy::execute(optional_yield y)
+{
+  RGWBucketAdminOpState op_state;
+
+  std::string bucket;
+  std::string object;
+
+  // empty "object" means the bucket's own policy
+  RESTArgs::get_string(s, "bucket", bucket, &bucket);
+  RESTArgs::get_string(s, "object", object, &object);
+
+  op_state.set_bucket_name(bucket);
+  op_state.set_object(object);
+
+  op_ret = RGWBucketAdminOp::get_policy(driver, op_state, flusher, this);
+}
+
+// Admin op: check (and optionally repair) a bucket's index.
+class RGWOp_Check_Bucket_Index : public RGWRESTOp {
+
+public:
+  RGWOp_Check_Bucket_Index() {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("buckets", RGW_CAP_WRITE);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "check_bucket_index"; }
+};
+
+void RGWOp_Check_Bucket_Index::execute(optional_yield y)
+{
+  std::string bucket;
+
+  bool fix_index;
+  bool check_objects;
+
+  RGWBucketAdminOpState op_state;
+
+  // "fix" repairs inconsistencies; "check-objects" verifies the
+  // objects behind the index entries; both default to false
+  RESTArgs::get_string(s, "bucket", bucket, &bucket);
+  RESTArgs::get_bool(s, "fix", false, &fix_index);
+  RESTArgs::get_bool(s, "check-objects", false, &check_objects);
+
+  op_state.set_bucket_name(bucket);
+  op_state.set_fix_index(fix_index);
+  op_state.set_check_objects(check_objects);
+
+  op_ret = RGWBucketAdminOp::check_index(driver, op_state, flusher, s->yield, s);
+}
+
+// Admin op: link a bucket to a user (optionally renaming it).
+class RGWOp_Bucket_Link : public RGWRESTOp {
+
+public:
+  RGWOp_Bucket_Link() {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("buckets", RGW_CAP_WRITE);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "link_bucket"; }
+};
+
+void RGWOp_Bucket_Link::execute(optional_yield y)
+{
+  std::string uid_str;
+  std::string bucket;
+  std::string bucket_id;
+  std::string new_bucket_name;
+
+  RGWBucketAdminOpState op_state;
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  RESTArgs::get_string(s, "bucket", bucket, &bucket);
+  RESTArgs::get_string(s, "bucket-id", bucket_id, &bucket_id);
+  RESTArgs::get_string(s, "new-bucket-name", new_bucket_name, &new_bucket_name);
+
+  rgw_user uid(uid_str);
+  op_state.set_user_id(uid);
+  op_state.set_bucket_name(bucket);
+  op_state.set_bucket_id(bucket_id);
+  op_state.set_new_bucket_name(new_bucket_name);
+
+  // multisite: replicate the metadata change to the master zone first;
+  // only apply locally if forwarding succeeded
+  bufferlist data;
+  op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
+    return;
+  }
+  op_ret = RGWBucketAdminOp::link(driver, op_state, s);
+}
+
+// Admin op: unlink a bucket from its owning user.
+class RGWOp_Bucket_Unlink : public RGWRESTOp {
+
+public:
+  RGWOp_Bucket_Unlink() {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("buckets", RGW_CAP_WRITE);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "unlink_bucket"; }
+};
+
+void RGWOp_Bucket_Unlink::execute(optional_yield y)
+{
+  std::string uid_str;
+  std::string bucket;
+
+  RGWBucketAdminOpState op_state;
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
+  RESTArgs::get_string(s, "bucket", bucket, &bucket);
+
+  op_state.set_user_id(uid);
+  op_state.set_bucket_name(bucket);
+
+  // multisite: forward to the master zone before applying locally
+  bufferlist data;
+  op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
+    return;
+  }
+  op_ret = RGWBucketAdminOp::unlink(driver, op_state, s);
+}
+
+// Admin op: delete a bucket, optionally purging its objects.
+class RGWOp_Bucket_Remove : public RGWRESTOp {
+
+public:
+  RGWOp_Bucket_Remove() {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("buckets", RGW_CAP_WRITE);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "remove_bucket"; }
+};
+
+void RGWOp_Bucket_Remove::execute(optional_yield y)
+{
+  std::string bucket_name;
+  bool delete_children;
+  std::unique_ptr<rgw::sal::Bucket> bucket;
+
+  // "purge-objects" deletes contained objects first (default false)
+  RESTArgs::get_string(s, "bucket", bucket_name, &bucket_name);
+  RESTArgs::get_bool(s, "purge-objects", false, &delete_children);
+
+  /* FIXME We're abusing the owner of the bucket to pass the user, so that it can be forwarded to
+   * the master. This user is actually the OP caller, not the bucket owner. */
+  op_ret = driver->get_bucket(s, s->user.get(), string(), bucket_name, &bucket, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 0) << "get_bucket returned ret=" << op_ret << dendl;
+    if (op_ret == -ENOENT) {
+      // map the internal lookup miss to the S3-style error code
+      op_ret = -ERR_NO_SUCH_BUCKET;
+    }
+    return;
+  }
+
+  op_ret = bucket->remove_bucket(s, delete_children, true, &s->info, s->yield);
+}
+
+// Admin op: set a bucket's quota, either from a JSON request body or
+// from HTTP query parameters.
+class RGWOp_Set_Bucket_Quota : public RGWRESTOp {
+
+public:
+  RGWOp_Set_Bucket_Quota() {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("buckets", RGW_CAP_WRITE);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "set_bucket_quota"; }
+};
+
+#define QUOTA_INPUT_MAX_LEN 1024
+
+void RGWOp_Set_Bucket_Quota::execute(optional_yield y)
+{
+  // "uid" and "bucket" are mandatory
+  bool uid_arg_existed = false;
+  std::string uid_str;
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str, &uid_arg_existed);
+  if (! uid_arg_existed) {
+    op_ret = -EINVAL;
+    return;
+  }
+  rgw_user uid(uid_str);
+  bool bucket_arg_existed = false;
+  std::string bucket_name;
+  RESTArgs::get_string(s, "bucket", bucket_name, &bucket_name, &bucket_arg_existed);
+  if (! bucket_arg_existed) {
+    op_ret = -EINVAL;
+    return;
+  }
+
+  // decide where the quota values come from: a request body is
+  // preferred; with no content length and no chunked encoding, fall
+  // back to query parameters
+  bool use_http_params;
+
+  if (s->content_length > 0) {
+    use_http_params = false;
+  } else {
+    const char *encoding = s->info.env->get("HTTP_TRANSFER_ENCODING");
+    use_http_params = (!encoding || strcmp(encoding, "chunked") != 0);
+  }
+  RGWQuotaInfo quota;
+  if (!use_http_params) {
+    bool empty;
+    op_ret = get_json_input(driver->ctx(), s, quota, QUOTA_INPUT_MAX_LEN, &empty);
+    if (op_ret < 0) {
+      if (!empty)
+        return;
+      /* was probably chunked input, but no content provided, configure via http params */
+      use_http_params = true;
+    }
+  }
+  if (use_http_params) {
+    // missing parameters default to the bucket's current quota values;
+    // "max-size-kb", when present, overrides "max-size"
+    std::unique_ptr<rgw::sal::Bucket> bucket;
+    op_ret = driver->get_bucket(s, nullptr, uid.tenant, bucket_name, &bucket, s->yield);
+    if (op_ret < 0) {
+      return;
+    }
+    RGWQuotaInfo *old_quota = &bucket->get_info().quota;
+    int64_t old_max_size_kb = rgw_rounded_kb(old_quota->max_size);
+    int64_t max_size_kb;
+    bool has_max_size_kb = false;
+    RESTArgs::get_int64(s, "max-objects", old_quota->max_objects, &quota.max_objects);
+    RESTArgs::get_int64(s, "max-size", old_quota->max_size, &quota.max_size);
+    RESTArgs::get_int64(s, "max-size-kb", old_max_size_kb, &max_size_kb, &has_max_size_kb);
+    if (has_max_size_kb)
+      quota.max_size = max_size_kb * 1024;
+    RESTArgs::get_bool(s, "enabled", old_quota->enabled, &quota.enabled);
+  }
+
+  RGWBucketAdminOpState op_state;
+  op_state.set_user_id(uid);
+  op_state.set_bucket_name(bucket_name);
+  op_state.set_quota(quota);
+
+  op_ret = RGWBucketAdminOp::set_quota(driver, op_state, s);
+}
+
+// Admin op: enable or disable multisite sync for a bucket.
+class RGWOp_Sync_Bucket : public RGWRESTOp {
+
+public:
+  RGWOp_Sync_Bucket() {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("buckets", RGW_CAP_WRITE);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "sync_bucket"; }
+};
+
+void RGWOp_Sync_Bucket::execute(optional_yield y)
+{
+  std::string bucket;
+  std::string tenant;
+  bool sync_bucket;
+
+  RGWBucketAdminOpState op_state;
+  // "sync" defaults to true (enable syncing)
+  RESTArgs::get_string(s, "bucket", bucket, &bucket);
+  RESTArgs::get_string(s, "tenant", tenant, &tenant);
+  RESTArgs::get_bool(s, "sync", true, &sync_bucket);
+
+  op_state.set_bucket_name(bucket);
+  op_state.set_tenant(tenant);
+  op_state.set_sync_bucket(sync_bucket);
+
+  op_ret = RGWBucketAdminOp::sync_bucket(driver, op_state, s);
+}
+
+// Admin op: delete a single object from a bucket.
+class RGWOp_Object_Remove: public RGWRESTOp {
+
+public:
+  RGWOp_Object_Remove() {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("buckets", RGW_CAP_WRITE);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "remove_object"; }
+};
+
+void RGWOp_Object_Remove::execute(optional_yield y)
+{
+  std::string bucket;
+  std::string object;
+
+  RGWBucketAdminOpState op_state;
+
+  RESTArgs::get_string(s, "bucket", bucket, &bucket);
+  RESTArgs::get_string(s, "object", object, &object);
+
+  op_state.set_bucket_name(bucket);
+  op_state.set_object(object);
+
+  op_ret = RGWBucketAdminOp::remove_object(driver, op_state, s);
+}
+
+
+// GET dispatch: ?policy and ?index have dedicated ops; everything
+// else returns bucket info.
+RGWOp *RGWHandler_Bucket::op_get()
+{
+  const auto& args = s->info.args;
+
+  if (args.sub_resource_exists("policy")) {
+    return new RGWOp_Get_Policy;
+  }
+  if (args.sub_resource_exists("index")) {
+    return new RGWOp_Check_Bucket_Index;
+  }
+  return new RGWOp_Bucket_Info;
+}
+
+// PUT dispatch: ?quota and ?sync have dedicated ops; the default PUT
+// links a bucket to a user.
+RGWOp *RGWHandler_Bucket::op_put()
+{
+  const auto& args = s->info.args;
+
+  if (args.sub_resource_exists("quota")) {
+    return new RGWOp_Set_Bucket_Quota;
+  }
+  if (args.sub_resource_exists("sync")) {
+    return new RGWOp_Sync_Bucket;
+  }
+  return new RGWOp_Bucket_Link;
+}
+
+// POST dispatch: the only POST operation is unlinking a bucket.
+RGWOp *RGWHandler_Bucket::op_post()
+{
+  return new RGWOp_Bucket_Unlink;
+}
+
+// DELETE dispatch: ?object removes a single object; otherwise the
+// bucket itself is removed.
+RGWOp *RGWHandler_Bucket::op_delete()
+{
+  if (s->info.args.sub_resource_exists("object")) {
+    return new RGWOp_Object_Remove;
+  }
+  return new RGWOp_Bucket_Remove;
+}
diff --git a/src/rgw/driver/rados/rgw_rest_bucket.h b/src/rgw/driver/rados/rgw_rest_bucket.h
new file mode 100644
index 000000000..00f0b6439
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_rest_bucket.h
@@ -0,0 +1,36 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_rest.h"
+#include "rgw_rest_s3.h"
+
+
// REST handler for the bucket admin resource. Dispatches GET/PUT/POST/
// DELETE to the bucket admin ops defined in rgw_rest_bucket.cc.
class RGWHandler_Bucket : public RGWHandler_Auth_S3 {
protected:
  RGWOp *op_get() override;
  RGWOp *op_put() override;
  RGWOp *op_post() override;
  RGWOp *op_delete() override;
public:
  using RGWHandler_Auth_S3::RGWHandler_Auth_S3;
  ~RGWHandler_Bucket() override = default;

  // Admin ops enforce access via their own check_caps(), so no extra
  // per-request read-permission check is performed here.
  int read_permissions(RGWOp*, optional_yield y) override {
    return 0;
  }
};
+
// REST manager that creates an RGWHandler_Bucket for every request that
// reaches the bucket admin endpoint.
class RGWRESTMgr_Bucket : public RGWRESTMgr {
public:
  RGWRESTMgr_Bucket() = default;
  ~RGWRESTMgr_Bucket() override = default;

  // Returns a new handler; ownership passes to the caller (the REST
  // front-end frees it after the request completes).
  RGWHandler_REST* get_handler(rgw::sal::Driver* driver,
			       req_state*,
			       const rgw::auth::StrategyRegistry& auth_registry,
			       const std::string&) override {
    return new RGWHandler_Bucket(auth_registry);
  }
};
diff --git a/src/rgw/driver/rados/rgw_rest_log.cc b/src/rgw/driver/rados/rgw_rest_log.cc
new file mode 100644
index 000000000..f4099807d
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_rest_log.cc
@@ -0,0 +1,1268 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 eNovance SAS <licensing@enovance.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/ceph_json.h"
+#include "common/strtol.h"
+#include "rgw_rest.h"
+#include "rgw_op.h"
+#include "rgw_rest_s3.h"
+#include "rgw_rest_log.h"
+#include "rgw_client_io.h"
+#include "rgw_sync.h"
+#include "rgw_data_sync.h"
+#include "rgw_common.h"
+#include "rgw_zone.h"
+#include "rgw_mdlog.h"
+#include "rgw_datalog_notify.h"
+#include "rgw_trim_bilog.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_mdlog.h"
+#include "services/svc_bilog_rados.h"
+
+#include "common/errno.h"
+#include "include/ceph_assert.h"
+
+#define dout_context g_ceph_context
+#define LOG_CLASS_LIST_MAX_ENTRIES (1000)
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
// List entries from one shard of the metadata changes log.
// Args: id (shard, required), period (defaults to the current period),
// marker (resume point), max-entries (capped at LOG_CLASS_LIST_MAX_ENTRIES).
// Results are stored in entries/last_marker/truncated for send_response().
void RGWOp_MDLog_List::execute(optional_yield y) {
  string period = s->info.args.get("period");
  string shard = s->info.args.get("id");
  string max_entries_str = s->info.args.get("max-entries");
  string marker = s->info.args.get("marker"),
         err;
  void *handle;
  unsigned shard_id, max_entries = LOG_CLASS_LIST_MAX_ENTRIES;

  // time-bounded listing was removed; reject the legacy parameters
  if (s->info.args.exists("start-time") ||
      s->info.args.exists("end-time")) {
    ldpp_dout(this, 5) << "start-time and end-time are no longer accepted" << dendl;
    op_ret = -EINVAL;
    return;
  }

  shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err);
  if (!err.empty()) {
    ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl;
    op_ret = -EINVAL;
    return;
  }

  if (!max_entries_str.empty()) {
    max_entries = (unsigned)strict_strtol(max_entries_str.c_str(), 10, &err);
    if (!err.empty()) {
      ldpp_dout(this, 5) << "Error parsing max-entries " << max_entries_str << dendl;
      op_ret = -EINVAL;
      return;
    }
    // clamp to the server-side maximum
    if (max_entries > LOG_CLASS_LIST_MAX_ENTRIES) {
      max_entries = LOG_CLASS_LIST_MAX_ENTRIES;
    }
  }

  // fall back to the zone's current period when none was given
  if (period.empty()) {
    ldpp_dout(this, 5) << "Missing period id trying to use current" << dendl;
    period = driver->get_zone()->get_current_period_id();
    if (period.empty()) {
      ldpp_dout(this, 5) << "Missing period id" << dendl;
      op_ret = -EINVAL;
      return;
    }
  }

  RGWMetadataLog meta_log{s->cct, static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone, static_cast<rgw::sal::RadosStore*>(driver)->svc()->cls, period};

  // list one page of entries; handle is created/consumed by the meta log
  meta_log.init_list_entries(shard_id, {}, {}, marker, &handle);

  op_ret = meta_log.list_entries(this, handle, max_entries, entries,
				 &last_marker, &truncated);

  meta_log.complete_list_entries(handle);
}
+
+void RGWOp_MDLog_List::send_response() {
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s);
+
+ if (op_ret < 0)
+ return;
+
+ s->formatter->open_object_section("log_entries");
+ s->formatter->dump_string("marker", last_marker);
+ s->formatter->dump_bool("truncated", truncated);
+ {
+ s->formatter->open_array_section("entries");
+ for (list<cls_log_entry>::iterator iter = entries.begin();
+ iter != entries.end(); ++iter) {
+ cls_log_entry& entry = *iter;
+ static_cast<rgw::sal::RadosStore*>(driver)->ctl()->meta.mgr->dump_log_entry(entry, s->formatter);
+ flusher.flush();
+ }
+ s->formatter->close_section();
+ }
+ s->formatter->close_section();
+ flusher.flush();
+}
+
// Report the mdlog shard count and the oldest log period.
// period carries its own error state; propagate it into op_ret.
void RGWOp_MDLog_Info::execute(optional_yield y) {
  num_objects = s->cct->_conf->rgw_md_log_max_shards;
  period = static_cast<rgw::sal::RadosStore*>(driver)->svc()->mdlog->read_oldest_log_period(y, s);
  op_ret = period.get_error();
}
+
// Emit mdlog info as JSON. The period section is only present when
// read_oldest_log_period() succeeded (period converts to true).
void RGWOp_MDLog_Info::send_response() {
  set_req_state_err(s, op_ret);
  dump_errno(s);
  end_header(s);

  s->formatter->open_object_section("mdlog");
  s->formatter->dump_unsigned("num_objects", num_objects);
  if (period) {
    s->formatter->dump_string("period", period.get_period().get_id());
    s->formatter->dump_unsigned("realm_epoch", period.get_epoch());
  }
  s->formatter->close_section();
  flusher.flush();
}
+
// Fetch info (e.g. markers) for one shard of the metadata log.
// Args: id (shard, required), period (defaults to the current period).
void RGWOp_MDLog_ShardInfo::execute(optional_yield y) {
  string period = s->info.args.get("period");
  string shard = s->info.args.get("id");
  string err;

  unsigned shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err);
  if (!err.empty()) {
    ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl;
    op_ret = -EINVAL;
    return;
  }

  // fall back to the zone's current period when none was given
  if (period.empty()) {
    ldpp_dout(this, 5) << "Missing period id trying to use current" << dendl;
    period = driver->get_zone()->get_current_period_id();

    if (period.empty()) {
      ldpp_dout(this, 5) << "Missing period id" << dendl;
      op_ret = -EINVAL;
      return;
    }
  }
  RGWMetadataLog meta_log{s->cct, static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone, static_cast<rgw::sal::RadosStore*>(driver)->svc()->cls, period};

  op_ret = meta_log.get_info(this, shard_id, &info);
}
+
// Emit the shard info collected by execute() as JSON.
void RGWOp_MDLog_ShardInfo::send_response() {
  set_req_state_err(s, op_ret);
  dump_errno(s);
  end_header(s);

  encode_json("info", info, s->formatter);
  flusher.flush();
}
+
+void RGWOp_MDLog_Delete::execute(optional_yield y) {
+ string marker = s->info.args.get("marker"),
+ period = s->info.args.get("period"),
+ shard = s->info.args.get("id"),
+ err;
+ unsigned shard_id;
+
+
+ if (s->info.args.exists("start-time") ||
+ s->info.args.exists("end-time")) {
+ ldpp_dout(this, 5) << "start-time and end-time are no longer accepted" << dendl;
+ op_ret = -EINVAL;
+ }
+
+ if (s->info.args.exists("start-marker")) {
+ ldpp_dout(this, 5) << "start-marker is no longer accepted" << dendl;
+ op_ret = -EINVAL;
+ }
+
+ if (s->info.args.exists("end-marker")) {
+ if (!s->info.args.exists("marker")) {
+ marker = s->info.args.get("end-marker");
+ } else {
+ ldpp_dout(this, 5) << "end-marker and marker cannot both be provided" << dendl;
+ op_ret = -EINVAL;
+ }
+ }
+
+ op_ret = 0;
+
+ shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err);
+ if (!err.empty()) {
+ ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ if (marker.empty()) { /* bounding end */
+ op_ret = -EINVAL;
+ return;
+ }
+
+ if (period.empty()) {
+ ldpp_dout(this, 5) << "Missing period id trying to use current" << dendl;
+ period = driver->get_zone()->get_current_period_id();
+
+ if (period.empty()) {
+ ldpp_dout(this, 5) << "Missing period id" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+ }
+ RGWMetadataLog meta_log{s->cct, static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone, static_cast<rgw::sal::RadosStore*>(driver)->svc()->cls, period};
+
+ op_ret = meta_log.trim(this, shard_id, {}, {}, {}, marker);
+}
+
+void RGWOp_MDLog_Lock::execute(optional_yield y) {
+ string period, shard_id_str, duration_str, locker_id, zone_id;
+ unsigned shard_id;
+
+ op_ret = 0;
+
+ period = s->info.args.get("period");
+ shard_id_str = s->info.args.get("id");
+ duration_str = s->info.args.get("length");
+ locker_id = s->info.args.get("locker-id");
+ zone_id = s->info.args.get("zone-id");
+
+ if (period.empty()) {
+ ldpp_dout(this, 5) << "Missing period id trying to use current" << dendl;
+ period = driver->get_zone()->get_current_period_id();
+ }
+
+ if (period.empty() ||
+ shard_id_str.empty() ||
+ (duration_str.empty()) ||
+ locker_id.empty() ||
+ zone_id.empty()) {
+ ldpp_dout(this, 5) << "Error invalid parameter list" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ string err;
+ shard_id = (unsigned)strict_strtol(shard_id_str.c_str(), 10, &err);
+ if (!err.empty()) {
+ ldpp_dout(this, 5) << "Error parsing shard_id param " << shard_id_str << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ RGWMetadataLog meta_log{s->cct, static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone, static_cast<rgw::sal::RadosStore*>(driver)->svc()->cls, period};
+ unsigned dur;
+ dur = (unsigned)strict_strtol(duration_str.c_str(), 10, &err);
+ if (!err.empty() || dur <= 0) {
+ ldpp_dout(this, 5) << "invalid length param " << duration_str << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+ op_ret = meta_log.lock_exclusive(s, shard_id, make_timespan(dur), zone_id,
+ locker_id);
+ if (op_ret == -EBUSY)
+ op_ret = -ERR_LOCKED;
+}
+
// Release an exclusive lock on one mdlog shard.
// Args: id (shard), locker-id, zone-id — all required; period defaults
// to the current period. Must match the values used when locking.
void RGWOp_MDLog_Unlock::execute(optional_yield y) {
  string period, shard_id_str, locker_id, zone_id;
  unsigned shard_id;

  op_ret = 0;

  period = s->info.args.get("period");
  shard_id_str = s->info.args.get("id");
  locker_id = s->info.args.get("locker-id");
  zone_id = s->info.args.get("zone-id");

  // fall back to the zone's current period when none was given
  if (period.empty()) {
    ldpp_dout(this, 5) << "Missing period id trying to use current" << dendl;
    period = driver->get_zone()->get_current_period_id();
  }

  if (period.empty() ||
      shard_id_str.empty() ||
      locker_id.empty() ||
      zone_id.empty()) {
    ldpp_dout(this, 5) << "Error invalid parameter list" << dendl;
    op_ret = -EINVAL;
    return;
  }

  string err;
  shard_id = (unsigned)strict_strtol(shard_id_str.c_str(), 10, &err);
  if (!err.empty()) {
    ldpp_dout(this, 5) << "Error parsing shard_id param " << shard_id_str << dendl;
    op_ret = -EINVAL;
    return;
  }

  RGWMetadataLog meta_log{s->cct, static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone, static_cast<rgw::sal::RadosStore*>(driver)->svc()->cls, period};
  op_ret = meta_log.unlock(s, shard_id, zone_id, locker_id);
}
+
// Handle a metadata-log notification from a peer zone: the request body
// is a JSON array of shard ids that have new entries; wake up the local
// metadata sync for exactly those shards.
void RGWOp_MDLog_Notify::execute(optional_yield y) {
#define LARGE_ENOUGH_BUF (128 * 1024)

  int r = 0;
  bufferlist data;
  // bound the body read so a misbehaving peer cannot exhaust memory
  std::tie(r, data) = read_all_input(s, LARGE_ENOUGH_BUF);
  if (r < 0) {
    op_ret = r;
    return;
  }

  char* buf = data.c_str();
  ldpp_dout(this, 20) << __func__ << "(): read data: " << buf << dendl;

  JSONParser p;
  r = p.parse(buf, data.length());
  if (r < 0) {
    ldpp_dout(this, 0) << "ERROR: failed to parse JSON" << dendl;
    op_ret = r;
    return;
  }

  set<int> updated_shards;
  try {
    decode_json_obj(updated_shards, &p);
  } catch (JSONDecoder::err& err) {
    ldpp_dout(this, 0) << "ERROR: failed to decode JSON" << dendl;
    op_ret = -EINVAL;
    return;
  }

  // only iterate for logging when debug level 20 is actually enabled
  if (driver->ctx()->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
    for (set<int>::iterator iter = updated_shards.begin(); iter != updated_shards.end(); ++iter) {
      ldpp_dout(this, 20) << __func__ << "(): updated shard=" << *iter << dendl;
    }
  }

  driver->wakeup_meta_sync_shards(updated_shards);

  op_ret = 0;
}
+
+void RGWOp_BILog_List::execute(optional_yield y) {
+ bool gen_specified = false;
+ string tenant_name = s->info.args.get("tenant"),
+ bucket_name = s->info.args.get("bucket"),
+ marker = s->info.args.get("marker"),
+ max_entries_str = s->info.args.get("max-entries"),
+ bucket_instance = s->info.args.get("bucket-instance"),
+ gen_str = s->info.args.get("generation", &gen_specified),
+ format_version_str = s->info.args.get("format-ver");
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+ rgw_bucket b(rgw_bucket_key(tenant_name, bucket_name));
+
+ unsigned max_entries;
+
+ if (bucket_name.empty() && bucket_instance.empty()) {
+ ldpp_dout(this, 5) << "ERROR: neither bucket nor bucket instance specified" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ string err;
+ std::optional<uint64_t> gen;
+ if (gen_specified) {
+ gen = strict_strtoll(gen_str.c_str(), 10, &err);
+ if (!err.empty()) {
+ ldpp_dout(s, 5) << "Error parsing generation param " << gen_str << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+ }
+
+ if (!format_version_str.empty()) {
+ format_ver = strict_strtoll(format_version_str.c_str(), 10, &err);
+ if (!err.empty()) {
+ ldpp_dout(s, 5) << "Failed to parse format-ver param: " << format_ver << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+ }
+
+ int shard_id;
+ string bn;
+ op_ret = rgw_bucket_parse_bucket_instance(bucket_instance, &bn, &bucket_instance, &shard_id);
+ if (op_ret < 0) {
+ return;
+ }
+
+ if (!bucket_instance.empty()) {
+ b.name = bn;
+ b.bucket_id = bucket_instance;
+ }
+ op_ret = driver->get_bucket(s, nullptr, b, &bucket, y);
+ if (op_ret < 0) {
+ ldpp_dout(this, 5) << "could not get bucket info for bucket=" << bucket_name << dendl;
+ return;
+ }
+
+ const auto& logs = bucket->get_info().layout.logs;
+ if (logs.empty()) {
+ ldpp_dout(s, 5) << "ERROR: bucket=" << bucket_name << " has no log layouts" << dendl;
+ op_ret = -ENOENT;
+ return;
+ }
+
+ auto log = std::prev(logs.end());
+ if (gen) {
+ log = std::find_if(logs.begin(), logs.end(), rgw::matches_gen(*gen));
+ if (log == logs.end()) {
+ ldpp_dout(s, 5) << "ERROR: no log layout with gen=" << *gen << dendl;
+ op_ret = -ENOENT;
+ return;
+ }
+ }
+ if (auto next = std::next(log); next != logs.end()) {
+ next_log_layout = *next; // get the next log after the current latest
+ }
+ auto& log_layout = *log; // current log layout for log listing
+
+ unsigned count = 0;
+
+
+ max_entries = (unsigned)strict_strtol(max_entries_str.c_str(), 10, &err);
+ if (!err.empty())
+ max_entries = LOG_CLASS_LIST_MAX_ENTRIES;
+
+ send_response();
+ do {
+ list<rgw_bi_log_entry> entries;
+ int ret = static_cast<rgw::sal::RadosStore*>(driver)->svc()->bilog_rados->log_list(s, bucket->get_info(), log_layout, shard_id,
+ marker, max_entries - count,
+ entries, &truncated);
+ if (ret < 0) {
+ ldpp_dout(this, 5) << "ERROR: list_bi_log_entries()" << dendl;
+ return;
+ }
+
+ count += entries.size();
+
+ send_response(entries, marker);
+ } while (truncated && count < max_entries);
+
+ send_response_end();
+}
+
// Send the HTTP header and open the JSON envelope for streamed listing.
// Called once before any entries are emitted; subsequent calls are no-ops
// (guarded by sent_header) so execute() can invoke it unconditionally.
void RGWOp_BILog_List::send_response() {
  if (sent_header)
    return;

  set_req_state_err(s, op_ret);
  dump_errno(s);
  end_header(s);

  sent_header = true;

  if (op_ret < 0)
    return;

  // format v2 wraps the entries in a "result" object that also carries
  // "truncated" and "next_log" (closed in send_response_end())
  if (format_ver >= 2) {
    s->formatter->open_object_section("result");
  }

  s->formatter->open_array_section("entries");
}
+
+void RGWOp_BILog_List::send_response(list<rgw_bi_log_entry>& entries, string& marker)
+{
+ for (list<rgw_bi_log_entry>::iterator iter = entries.begin(); iter != entries.end(); ++iter) {
+ rgw_bi_log_entry& entry = *iter;
+ encode_json("entry", entry, s->formatter);
+
+ marker = entry.id;
+ flusher.flush();
+ }
+}
+
// Close the JSON envelope opened by send_response(). For format v2,
// also emit the truncation flag and, when a newer log generation exists,
// a "next_log" object the client can use to continue on that generation.
void RGWOp_BILog_List::send_response_end() {
  s->formatter->close_section();

  if (format_ver >= 2) {
    encode_json("truncated", truncated, s->formatter);

    if (next_log_layout) {
      s->formatter->open_object_section("next_log");
      encode_json("generation", next_log_layout->gen, s->formatter);
      encode_json("num_shards", rgw::num_shards(next_log_layout->layout.in_index.layout), s->formatter);
      s->formatter->close_section(); // next_log
    }

    s->formatter->close_section(); // result
  }

  flusher.flush();
}
+
// Gather bucket-index log info: bucket/master versions, max marker and
// sync-stopped flag from the latest index layout, plus the oldest/latest
// log generations and the shard count of every generation.
void RGWOp_BILog_Info::execute(optional_yield y) {
  string tenant_name = s->info.args.get("tenant"),
         bucket_name = s->info.args.get("bucket"),
         bucket_instance = s->info.args.get("bucket-instance");
  std::unique_ptr<rgw::sal::Bucket> bucket;
  rgw_bucket b(rgw_bucket_key(tenant_name, bucket_name));

  if (bucket_name.empty() && bucket_instance.empty()) {
    ldpp_dout(this, 5) << "ERROR: neither bucket nor bucket instance specified" << dendl;
    op_ret = -EINVAL;
    return;
  }

  int shard_id;
  string bn;
  // bucket-instance may embed name/instance-id/shard; split it apart
  op_ret = rgw_bucket_parse_bucket_instance(bucket_instance, &bn, &bucket_instance, &shard_id);
  if (op_ret < 0) {
    return;
  }

  if (!bucket_instance.empty()) {
    b.name = bn;
    b.bucket_id = bucket_instance;
  }
  op_ret = driver->get_bucket(s, nullptr, b, &bucket, y);
  if (op_ret < 0) {
    ldpp_dout(this, 5) << "could not get bucket info for bucket=" << bucket_name << dendl;
    return;
  }

  const auto& logs = bucket->get_info().layout.logs;
  if (logs.empty()) {
    ldpp_dout(s, 5) << "ERROR: bucket=" << bucket_name << " has no log layouts" << dendl;
    op_ret = -ENOENT;
    return;
  }

  map<RGWObjCategory, RGWStorageStats> stats;
  // stats/markers come from the index of the most recent log generation
  const auto& index = log_to_index_layout(logs.back());

  // ENOENT is tolerated: a bucket with no index stats still reports gens
  int ret = bucket->read_stats(s, index, shard_id, &bucket_ver, &master_ver, stats, &max_marker, &syncstopped);
  if (ret < 0 && ret != -ENOENT) {
    op_ret = ret;
    return;
  }

  oldest_gen = logs.front().gen;
  latest_gen = logs.back().gen;

  for (auto& log : logs) {
    uint32_t num_shards = rgw::num_shards(log.layout.in_index.layout);
    generations.push_back({log.gen, num_shards});
  }
}
+
// Emit the bucket-index log info gathered by execute() as JSON.
void RGWOp_BILog_Info::send_response() {
  set_req_state_err(s, op_ret);
  dump_errno(s);
  end_header(s);

  if (op_ret < 0)
    return;

  s->formatter->open_object_section("info");
  encode_json("bucket_ver", bucket_ver, s->formatter);
  encode_json("master_ver", master_ver, s->formatter);
  encode_json("max_marker", max_marker, s->formatter);
  encode_json("syncstopped", syncstopped, s->formatter);
  encode_json("oldest_gen", oldest_gen, s->formatter);
  encode_json("latest_gen", latest_gen, s->formatter);
  encode_json("generations", generations, s->formatter);
  s->formatter->close_section();

  flusher.flush();
}
+
// Trim bucket-index log entries in [start-marker, end-marker] for one
// bucket (or bucket instance). end-marker is mandatory; an optional
// generation selects which log generation to trim.
void RGWOp_BILog_Delete::execute(optional_yield y) {
  bool gen_specified = false;
  string tenant_name = s->info.args.get("tenant"),
         bucket_name = s->info.args.get("bucket"),
         start_marker = s->info.args.get("start-marker"),
         end_marker = s->info.args.get("end-marker"),
         bucket_instance = s->info.args.get("bucket-instance"),
         gen_str = s->info.args.get("generation", &gen_specified);

  std::unique_ptr<rgw::sal::Bucket> bucket;
  rgw_bucket b(rgw_bucket_key(tenant_name, bucket_name));

  op_ret = 0;
  if ((bucket_name.empty() && bucket_instance.empty()) ||
      end_marker.empty()) {
    ldpp_dout(this, 5) << "ERROR: one of bucket or bucket instance, and also end-marker is mandatory" << dendl;
    op_ret = -EINVAL;
    return;
  }

  string err;
  uint64_t gen = 0;
  if (gen_specified) {
    gen = strict_strtoll(gen_str.c_str(), 10, &err);
    if (!err.empty()) {
      ldpp_dout(s, 5) << "Error parsing generation param " << gen_str << dendl;
      op_ret = -EINVAL;
      return;
    }
  }

  int shard_id;
  string bn;
  // bucket-instance may embed name/instance-id/shard; split it apart
  op_ret = rgw_bucket_parse_bucket_instance(bucket_instance, &bn, &bucket_instance, &shard_id);
  if (op_ret < 0) {
    return;
  }

  if (!bucket_instance.empty()) {
    b.name = bn;
    b.bucket_id = bucket_instance;
  }
  op_ret = driver->get_bucket(s, nullptr, b, &bucket, y);
  if (op_ret < 0) {
    ldpp_dout(this, 5) << "could not get bucket info for bucket=" << bucket_name << dendl;
    return;
  }

  op_ret = bilog_trim(this, static_cast<rgw::sal::RadosStore*>(driver),
		      bucket->get_info(), gen, shard_id,
		      start_marker, end_marker);
  if (op_ret < 0) {
    ldpp_dout(s, 5) << "bilog_trim failed with op_ret=" << op_ret << dendl;
  }

  return;
}
+
+void RGWOp_DATALog_List::execute(optional_yield y) {
+ string shard = s->info.args.get("id");
+
+ string max_entries_str = s->info.args.get("max-entries"),
+ marker = s->info.args.get("marker"),
+ err;
+ unsigned shard_id, max_entries = LOG_CLASS_LIST_MAX_ENTRIES;
+
+ if (s->info.args.exists("start-time") ||
+ s->info.args.exists("end-time")) {
+ ldpp_dout(this, 5) << "start-time and end-time are no longer accepted" << dendl;
+ op_ret = -EINVAL;
+ }
+
+ s->info.args.get_bool("extra-info", &extra_info, false);
+
+ shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err);
+ if (!err.empty()) {
+ ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ if (!max_entries_str.empty()) {
+ max_entries = (unsigned)strict_strtol(max_entries_str.c_str(), 10, &err);
+ if (!err.empty()) {
+ ldpp_dout(this, 5) << "Error parsing max-entries " << max_entries_str << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+ if (max_entries > LOG_CLASS_LIST_MAX_ENTRIES) {
+ max_entries = LOG_CLASS_LIST_MAX_ENTRIES;
+ }
+ }
+
+ // Note that last_marker is updated to be the marker of the last
+ // entry listed
+ op_ret = static_cast<rgw::sal::RadosStore*>(driver)->svc()->
+ datalog_rados->list_entries(this, shard_id, max_entries, entries,
+ marker, &last_marker, &truncated, y);
+}
+
+void RGWOp_DATALog_List::send_response() {
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s);
+
+ if (op_ret < 0)
+ return;
+
+ s->formatter->open_object_section("log_entries");
+ s->formatter->dump_string("marker", last_marker);
+ s->formatter->dump_bool("truncated", truncated);
+ {
+ s->formatter->open_array_section("entries");
+ for (const auto& entry : entries) {
+ if (!extra_info) {
+ encode_json("entry", entry.entry, s->formatter);
+ } else {
+ encode_json("entry", entry, s->formatter);
+ }
+ flusher.flush();
+ }
+ s->formatter->close_section();
+ }
+ s->formatter->close_section();
+ flusher.flush();
+}
+
+
// Report the configured number of data-log shards.
void RGWOp_DATALog_Info::execute(optional_yield y) {
  num_objects = s->cct->_conf->rgw_data_log_num_shards;
  op_ret = 0;
}
+
// Emit the datalog shard count as JSON.
void RGWOp_DATALog_Info::send_response() {
  set_req_state_err(s, op_ret);
  dump_errno(s);
  end_header(s);

  s->formatter->open_object_section("num_objects");
  s->formatter->dump_unsigned("num_objects", num_objects);
  s->formatter->close_section();
  flusher.flush();
}
+
// Fetch info (e.g. markers) for one shard of the data changes log.
// Args: id (shard, required).
void RGWOp_DATALog_ShardInfo::execute(optional_yield y) {
  string shard = s->info.args.get("id");
  string err;

  unsigned shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err);
  if (!err.empty()) {
    ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl;
    op_ret = -EINVAL;
    return;
  }

  op_ret = static_cast<rgw::sal::RadosStore*>(driver)->svc()->
    datalog_rados->get_info(this, shard_id, &info, y);
}
+
// Emit the shard info collected by execute() as JSON.
void RGWOp_DATALog_ShardInfo::send_response() {
  set_req_state_err(s, op_ret);
  dump_errno(s);
  end_header(s);

  encode_json("info", info, s->formatter);
  flusher.flush();
}
+
// Handle a v1 data-log notification from a peer zone: the body is JSON
// mapping shard ids to changed entries (decoded via the v1 decoder);
// wake up data sync from "source-zone" for those shards.
void RGWOp_DATALog_Notify::execute(optional_yield y) {
  string source_zone = s->info.args.get("source-zone");
#define LARGE_ENOUGH_BUF (128 * 1024)

  int r = 0;
  bufferlist data;
  // bound the body read so a misbehaving peer cannot exhaust memory
  std::tie(r, data) = read_all_input(s, LARGE_ENOUGH_BUF);
  if (r < 0) {
    op_ret = r;
    return;
  }

  char* buf = data.c_str();
  ldpp_dout(this, 20) << __func__ << "(): read data: " << buf << dendl;

  JSONParser p;
  r = p.parse(buf, data.length());
  if (r < 0) {
    ldpp_dout(this, 0) << "ERROR: failed to parse JSON" << dendl;
    op_ret = r;
    return;
  }

  bc::flat_map<int, bc::flat_set<rgw_data_notify_entry>> updated_shards;
  try {
    // v1 wire format needs the compatibility decoder
    auto decoder = rgw_data_notify_v1_decoder{updated_shards};
    decode_json_obj(decoder, &p);
  } catch (JSONDecoder::err& err) {
    ldpp_dout(this, 0) << "ERROR: failed to decode JSON" << dendl;
    op_ret = -EINVAL;
    return;
  }

  // only iterate for logging when debug level 20 is actually enabled
  if (driver->ctx()->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
    for (bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >::iterator iter = updated_shards.begin(); iter != updated_shards.end(); ++iter) {
      ldpp_dout(this, 20) << __func__ << "(): updated shard=" << iter->first << dendl;
      bc::flat_set<rgw_data_notify_entry>& entries = iter->second;
      for (const auto& [key, gen] : entries) {
        ldpp_dout(this, 20) << __func__ << "(): modified key=" << key
                            << " of gen=" << gen << dendl;
      }
    }
  }

  driver->wakeup_data_sync_shards(this, source_zone, updated_shards);

  op_ret = 0;
}
+
+void RGWOp_DATALog_Notify2::execute(optional_yield y) {
+ string source_zone = s->info.args.get("source-zone");
+#define LARGE_ENOUGH_BUF (128 * 1024)
+
+ int r = 0;
+ bufferlist data;
+ std::tie(r, data) = rgw_rest_read_all_input(s, LARGE_ENOUGH_BUF);
+ if (r < 0) {
+ op_ret = r;
+ return;
+ }
+
+ char* buf = data.c_str();
+ ldout(s->cct, 20) << __func__ << "(): read data: " << buf << dendl;
+
+ JSONParser p;
+ r = p.parse(buf, data.length());
+ if (r < 0) {
+ ldout(s->cct, 0) << "ERROR: failed to parse JSON" << dendl;
+ op_ret = r;
+ return;
+ }
+
+ bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> > updated_shards;
+ try {
+ decode_json_obj(updated_shards, &p);
+ } catch (JSONDecoder::err& err) {
+ ldpp_dout(this, 0) << "ERROR: failed to decode JSON" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ if (driver->ctx()->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
+ for (bc::flat_map<int, bc::flat_set<rgw_data_notify_entry> >::iterator iter =
+ updated_shards.begin(); iter != updated_shards.end(); ++iter) {
+ ldpp_dout(this, 20) << __func__ << "(): updated shard=" << iter->first << dendl;
+ bc::flat_set<rgw_data_notify_entry>& entries = iter->second;
+ for (const auto& [key, gen] : entries) {
+ ldpp_dout(this, 20) << __func__ << "(): modified key=" << key <<
+ " of generation=" << gen << dendl;
+ }
+ }
+ }
+
+ driver->wakeup_data_sync_shards(this, source_zone, updated_shards);
+
+ op_ret = 0;
+}
+
+void RGWOp_DATALog_Delete::execute(optional_yield y) {
+ string marker = s->info.args.get("marker"),
+ shard = s->info.args.get("id"),
+ err;
+ unsigned shard_id;
+
+ op_ret = 0;
+
+ if (s->info.args.exists("start-time") ||
+ s->info.args.exists("end-time")) {
+ ldpp_dout(this, 5) << "start-time and end-time are no longer accepted" << dendl;
+ op_ret = -EINVAL;
+ }
+
+ if (s->info.args.exists("start-marker")) {
+ ldpp_dout(this, 5) << "start-marker is no longer accepted" << dendl;
+ op_ret = -EINVAL;
+ }
+
+ if (s->info.args.exists("end-marker")) {
+ if (!s->info.args.exists("marker")) {
+ marker = s->info.args.get("end-marker");
+ } else {
+ ldpp_dout(this, 5) << "end-marker and marker cannot both be provided" << dendl;
+ op_ret = -EINVAL;
+ }
+ }
+
+ shard_id = (unsigned)strict_strtol(shard.c_str(), 10, &err);
+ if (!err.empty()) {
+ ldpp_dout(this, 5) << "Error parsing shard_id " << shard << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+ if (marker.empty()) { /* bounding end */
+ op_ret = -EINVAL;
+ return;
+ }
+
+ op_ret = static_cast<rgw::sal::RadosStore*>(driver)->svc()->
+ datalog_rados->trim_entries(this, shard_id, marker, y);
+}
+
+// not in header to avoid pulling in rgw_sync.h
+class RGWOp_MDLog_Status : public RGWRESTOp {
+ rgw_meta_sync_status status;
+public:
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("mdlog", RGW_CAP_READ);
+ }
+ int verify_permission(optional_yield) override {
+ return check_caps(s->user->get_caps());
+ }
+ void execute(optional_yield y) override;
+ void send_response() override;
+ const char* name() const override { return "get_metadata_log_status"; }
+};
+
+void RGWOp_MDLog_Status::execute(optional_yield y)
+{
+ auto sync = static_cast<rgw::sal::RadosStore*>(driver)->getRados()->get_meta_sync_manager();
+ if (sync == nullptr) {
+ ldpp_dout(this, 1) << "no sync manager" << dendl;
+ op_ret = -ENOENT;
+ return;
+ }
+ op_ret = sync->read_sync_status(this, &status);
+}
+
// Emit the metadata sync status as JSON (only on success).
void RGWOp_MDLog_Status::send_response()
{
  set_req_state_err(s, op_ret);
  dump_errno(s);
  end_header(s);

  if (op_ret >= 0) {
    encode_json("status", status, s->formatter);
  }
  flusher.flush();
}
+
+// not in header to avoid pulling in rgw_data_sync.h
+class RGWOp_BILog_Status : public RGWRESTOp {
+ bilog_status_v2 status;
+ int version = 1;
+public:
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("bilog", RGW_CAP_READ);
+ }
+ int verify_permission(optional_yield y) override {
+ return check_caps(s->user->get_caps());
+ }
+ void execute(optional_yield y) override;
+ void send_response() override;
+ const char* name() const override { return "get_bucket_index_log_status"; }
+};
+
// Compute bucket sync status for a bucket.
// Without options=merge: status of the single pipe (source-zone,
// source-bucket) -> local bucket. With merge: iterate all pipes syncing
// into this zone from the source and merge their shard statuses, keeping
// the smallest incremental marker per shard (the least-synced position).
void RGWOp_BILog_Status::execute(optional_yield y)
{
  const auto options = s->info.args.get("options");
  bool merge = (options == "merge");
  const auto source_zone = s->info.args.get("source-zone");
  const auto source_key = s->info.args.get("source-bucket");
  auto key = s->info.args.get("bucket");
  op_ret = s->info.args.get_int("version", &version, 1);

  // "source-bucket" doubles as the target when "bucket" is absent
  if (key.empty()) {
    key = source_key;
  }
  if (key.empty()) {
    ldpp_dout(this, 4) << "no 'bucket' provided" << dendl;
    op_ret = -EINVAL;
    return;
  }

  rgw_bucket b;
  int shard_id{-1}; // unused
  op_ret = rgw_bucket_parse_bucket_key(s->cct, key, &b, &shard_id);
  if (op_ret < 0) {
    ldpp_dout(this, 4) << "invalid 'bucket' provided" << dendl;
    op_ret = -EINVAL;
    return;
  }

  // read the bucket instance info for num_shards
  std::unique_ptr<rgw::sal::Bucket> bucket;
  op_ret = driver->get_bucket(s, nullptr, b, &bucket, y);
  if (op_ret < 0) {
    ldpp_dout(this, 4) << "failed to read bucket info: " << cpp_strerror(op_ret) << dendl;
    return;
  }

  rgw_bucket source_bucket;

  if (source_key.empty() ||
      source_key == key) {
    // same-bucket sync: source is the local bucket itself
    source_bucket = bucket->get_key();
  } else {
    op_ret = rgw_bucket_parse_bucket_key(s->cct, source_key, &source_bucket, nullptr);
    if (op_ret < 0) {
      ldpp_dout(this, 4) << "invalid 'source-bucket' provided (key=" << source_key << ")" << dendl;
      return;
    }
  }

  const auto& local_zone_id = driver->get_zone()->get_id();

  if (!merge) {
    // single-pipe status: full sync status plus per-shard incremental
    rgw_sync_bucket_pipe pipe;
    pipe.source.zone = source_zone;
    pipe.source.bucket = source_bucket;
    pipe.dest.zone = local_zone_id;
    pipe.dest.bucket = bucket->get_key();

    ldpp_dout(this, 20) << "RGWOp_BILog_Status::execute(optional_yield y): getting sync status for pipe=" << pipe << dendl;

    op_ret = rgw_read_bucket_full_sync_status(
      this,
      static_cast<rgw::sal::RadosStore*>(driver),
      pipe,
      &status.sync_status,
      s->yield);
    if (op_ret < 0) {
      ldpp_dout(this, -1) << "ERROR: rgw_read_bucket_full_sync_status() on pipe=" << pipe << " returned ret=" << op_ret << dendl;
      return;
    }
    status.inc_status.resize(status.sync_status.shards_done_with_gen.size());

    op_ret = rgw_read_bucket_inc_sync_status(
      this,
      static_cast<rgw::sal::RadosStore*>(driver),
      pipe,
      status.sync_status.incremental_gen,
      &status.inc_status);
    if (op_ret < 0) {
      ldpp_dout(this, -1) << "ERROR: rgw_read_bucket_inc_sync_status() on pipe=" << pipe << " returned ret=" << op_ret << dendl;
    }
    return;
  }

  rgw_zone_id source_zone_id(source_zone);

  RGWBucketSyncPolicyHandlerRef source_handler;
  op_ret = driver->get_sync_policy_handler(s, source_zone_id, source_bucket, &source_handler, y);
  if (op_ret < 0) {
    ldpp_dout(this, -1) << "could not get bucket sync policy handler (r=" << op_ret << ")" << dendl;
    return;
  }

  // merge mode: walk every pipe that syncs into this zone
  auto local_dests = source_handler->get_all_dests_in_zone(local_zone_id);

  std::vector<rgw_bucket_shard_sync_info> current_status;
  for (auto& entry : local_dests) {
    auto pipe = entry.second;

    ldpp_dout(this, 20) << "RGWOp_BILog_Status::execute(optional_yield y): getting sync status for pipe=" << pipe << dendl;

    RGWBucketInfo *pinfo = &bucket->get_info();
    std::optional<RGWBucketInfo> opt_dest_info;

    if (!pipe.dest.bucket) {
      /* Uh oh, something went wrong */
      ldpp_dout(this, 20) << "ERROR: RGWOp_BILog_Status::execute(optional_yield y): BUG: pipe.dest.bucket was not initialized" << pipe << dendl;
      op_ret = -EIO;
      return;
    }

    // pipe destination may differ from the queried bucket; load its info
    if (*pipe.dest.bucket != pinfo->bucket) {
      opt_dest_info.emplace();
      std::unique_ptr<rgw::sal::Bucket> dest_bucket;
      op_ret = driver->get_bucket(s, nullptr, *pipe.dest.bucket, &dest_bucket, y);
      if (op_ret < 0) {
        ldpp_dout(this, 4) << "failed to read target bucket info (bucket=: " << cpp_strerror(op_ret) << dendl;
        return;
      }

      *opt_dest_info = dest_bucket->get_info();
      pinfo = &(*opt_dest_info);
      pipe.dest.bucket = pinfo->bucket;
    }

    op_ret = rgw_read_bucket_full_sync_status(
      this,
      static_cast<rgw::sal::RadosStore*>(driver),
      pipe,
      &status.sync_status,
      s->yield);
    if (op_ret < 0) {
      ldpp_dout(this, -1) << "ERROR: rgw_read_bucket_full_sync_status() on pipe=" << pipe << " returned ret=" << op_ret << dendl;
      return;
    }

    current_status.resize(status.sync_status.shards_done_with_gen.size());
    int r = rgw_read_bucket_inc_sync_status(this, static_cast<rgw::sal::RadosStore*>(driver),
					    pipe, status.sync_status.incremental_gen, &current_status);
    if (r < 0) {
      ldpp_dout(this, -1) << "ERROR: rgw_read_bucket_inc_sync_status() on pipe=" << pipe << " returned ret=" << r << dendl;
      op_ret = r;
      return;
    }

    if (status.inc_status.empty()) {
      status.inc_status = std::move(current_status);
    } else {
      // all pipes from one source must report the same shard count
      if (current_status.size() != status.inc_status.size()) {
        op_ret = -EINVAL;
        ldpp_dout(this, -1) << "ERROR: different number of shards for sync status of buckets "
          "syncing from the same source: status.size()= "
			    << status.inc_status.size()
			    << " current_status.size()="
			    << current_status.size() << dendl;
	return;
      }
      auto m = status.inc_status.begin();
      for (auto& cur_shard_status : current_status) {
	auto& result_shard_status = *m++;
        // always take the first marker, or any later marker that's smaller
        if (cur_shard_status.inc_marker.position < result_shard_status.inc_marker.position) {
          result_shard_status = std::move(cur_shard_status);
        }
      }
    }
  }
}
+
+// Emit the sync status gathered by execute() as the JSON response body.
+// Format v2 reports the full per-pipe status; v1 only the incremental part.
+void RGWOp_BILog_Status::send_response()
+{
+  set_req_state_err(s, op_ret);
+  dump_errno(s);
+  end_header(s);
+
+  if (op_ret >= 0) {
+    if (version >= 2) {
+      encode_json("status", status, s->formatter);
+    } else {
+      encode_json("status", status.inc_status, s->formatter);
+    }
+  }
+  flusher.flush();
+}
+
+// not in header to avoid pulling in rgw_data_sync.h
+// GET /admin/log?type=data&status : report data sync status from a source zone.
+class RGWOp_DATALog_Status : public RGWRESTOp {
+  rgw_data_sync_status status;  // filled by execute(), serialized by send_response()
+public:
+  // Requires the "datalog" read capability.
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("datalog", RGW_CAP_READ);
+  }
+  int verify_permission(optional_yield y) override {
+    return check_caps(s->user->get_caps());
+  }
+  void execute(optional_yield y) override ;
+  void send_response() override;
+  const char* name() const override { return "get_data_changes_log_status"; }
+};
+
+// Read the overall data sync status for the requested source zone.
+void RGWOp_DATALog_Status::execute(optional_yield y)
+{
+  const auto source_zone = s->info.args.get("source-zone");
+  // A sync manager exists only for zones this gateway syncs from.
+  auto mgr = driver->get_data_sync_manager(source_zone);
+  if (!mgr) {
+    ldpp_dout(this, 1) << "no sync manager for source-zone " << source_zone << dendl;
+    op_ret = -ENOENT;
+    return;
+  }
+  op_ret = mgr->read_sync_status(this, &status);
+}
+
+// Serialize the data sync status as JSON; body is only written on success.
+void RGWOp_DATALog_Status::send_response()
+{
+  set_req_state_err(s, op_ret);
+  dump_errno(s);
+  end_header(s);
+
+  if (op_ret >= 0) {
+    encode_json("status", status, s->formatter);
+  }
+  flusher.flush();
+}
+
+
+// Route GET /admin/log requests by the "type" query parameter
+// (metadata | bucket-index | data) and its sub-flags (id/info/status).
+RGWOp *RGWHandler_Log::op_get() {
+  bool exists;
+  string type = s->info.args.get("type", &exists);
+
+  if (!exists) {
+    return nullptr;
+  }
+
+  if (type == "metadata") {
+    if (s->info.args.exists("id")) {
+      // shard-scoped: info for one shard, or list its entries
+      if (s->info.args.exists("info")) {
+        return new RGWOp_MDLog_ShardInfo;
+      }
+      return new RGWOp_MDLog_List;
+    }
+    if (s->info.args.exists("status")) {
+      return new RGWOp_MDLog_Status;
+    }
+    return new RGWOp_MDLog_Info;
+  }
+  if (type == "bucket-index") {
+    if (s->info.args.exists("info")) {
+      return new RGWOp_BILog_Info;
+    }
+    if (s->info.args.exists("status")) {
+      return new RGWOp_BILog_Status;
+    }
+    return new RGWOp_BILog_List;
+  }
+  if (type == "data") {
+    if (s->info.args.exists("id")) {
+      if (s->info.args.exists("info")) {
+        return new RGWOp_DATALog_ShardInfo;
+      }
+      return new RGWOp_DATALog_List;
+    }
+    if (s->info.args.exists("status")) {
+      return new RGWOp_DATALog_Status;
+    }
+    return new RGWOp_DATALog_Info;
+  }
+  return nullptr;
+}
+
+// Route DELETE /admin/log requests: each log type has a trim operation.
+RGWOp *RGWHandler_Log::op_delete() {
+  bool exists;
+  string type = s->info.args.get("type", &exists);
+
+  if (!exists) {
+    return nullptr;
+  }
+
+  if (type == "metadata") {
+    return new RGWOp_MDLog_Delete;
+  }
+  if (type == "bucket-index") {
+    return new RGWOp_BILog_Delete;
+  }
+  if (type == "data") {
+    return new RGWOp_DATALog_Delete;
+  }
+  return nullptr;
+}
+
+// Route POST /admin/log requests: mdlog lock/unlock/notify and the two
+// datalog notify variants.
+RGWOp *RGWHandler_Log::op_post() {
+  bool exists;
+  string type = s->info.args.get("type", &exists);
+
+  if (!exists) {
+    return nullptr;
+  }
+
+  if (type == "metadata") {
+    if (s->info.args.exists("lock")) {
+      return new RGWOp_MDLog_Lock;
+    }
+    if (s->info.args.exists("unlock")) {
+      return new RGWOp_MDLog_Unlock;
+    }
+    if (s->info.args.exists("notify")) {
+      return new RGWOp_MDLog_Notify;
+    }
+  } else if (type == "data") {
+    if (s->info.args.exists("notify")) {
+      return new RGWOp_DATALog_Notify;
+    }
+    if (s->info.args.exists("notify2")) {
+      return new RGWOp_DATALog_Notify2;
+    }
+  }
+  return nullptr;
+}
+
diff --git a/src/rgw/driver/rados/rgw_rest_log.h b/src/rgw/driver/rados/rgw_rest_log.h
new file mode 100644
index 000000000..02b1d133f
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_rest_log.h
@@ -0,0 +1,337 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 eNovance SAS <licensing@enovance.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "rgw_datalog.h"
+#include "rgw_rest.h"
+#include "rgw_rest_s3.h"
+#include "rgw_metadata.h"
+#include "rgw_mdlog.h"
+#include "rgw_data_sync.h"
+
+// GET /admin/log?type=bucket-index : stream bucket index log entries.
+// Fix: `sent_header` was initialized in the constructor init-list while its
+// sibling members used in-class initializers; unify on in-class initializers
+// and default the special members (behavior unchanged).
+class RGWOp_BILog_List : public RGWRESTOp {
+  bool sent_header{false};   // true once the response header has been emitted
+  uint32_t format_ver{0};
+  bool truncated{false};
+  std::optional<rgw::bucket_log_layout_generation> next_log_layout;
+
+public:
+  RGWOp_BILog_List() = default;
+  ~RGWOp_BILog_List() override = default;
+
+  // Requires the "bilog" read capability.
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("bilog", RGW_CAP_READ);
+  }
+  int verify_permission(optional_yield y) override {
+    return check_caps(s->user->get_caps());
+  }
+  void send_response() override;
+  // Streaming helpers: emit one batch of entries, then terminate the response.
+  virtual void send_response(std::list<rgw_bi_log_entry>& entries, std::string& marker);
+  virtual void send_response_end();
+  void execute(optional_yield y) override;
+  const char* name() const override {
+    return "list_bucket_index_log";
+  }
+};
+
+// GET /admin/log?type=bucket-index&info : bucket index log metadata
+// (versions, max marker, generations).
+// Fix: the constructor redundantly value-initialized the strings and set
+// `syncstopped` away from its declaration while other members used in-class
+// initializers; unify on in-class initializers (behavior unchanged).
+class RGWOp_BILog_Info : public RGWRESTOp {
+  std::string bucket_ver;
+  std::string master_ver;
+  std::string max_marker;
+  bool syncstopped{false};
+  uint64_t oldest_gen = 0;
+  uint64_t latest_gen = 0;
+  std::vector<store_gen_shards> generations;
+
+public:
+  RGWOp_BILog_Info() = default;
+  ~RGWOp_BILog_Info() override = default;
+
+  // Requires the "bilog" read capability.
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("bilog", RGW_CAP_READ);
+  }
+  int verify_permission(optional_yield y) override {
+    return check_caps(s->user->get_caps());
+  }
+  void send_response() override;
+  void execute(optional_yield y) override;
+  const char* name() const override {
+    return "bucket_index_log_info";
+  }
+};
+
+// DELETE /admin/log?type=bucket-index : trim bucket index log entries.
+class RGWOp_BILog_Delete : public RGWRESTOp {
+public:
+  RGWOp_BILog_Delete() {}
+  ~RGWOp_BILog_Delete() override {}
+
+  // Trimming is a mutation: requires the "bilog" write capability.
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("bilog", RGW_CAP_WRITE);
+  }
+  void execute(optional_yield y) override;
+  const char* name() const override {
+    return "trim_bucket_index_log";
+  }
+};
+
+// GET /admin/log?type=metadata&id=<shard> : list metadata log entries.
+class RGWOp_MDLog_List : public RGWRESTOp {
+  std::list<cls_log_entry> entries;  // results collected by execute()
+  std::string last_marker;           // resume marker for the next page
+  bool truncated;                    // more entries remain after this page
+public:
+  RGWOp_MDLog_List() : truncated(false) {}
+  ~RGWOp_MDLog_List() override {}
+
+  // Requires the "mdlog" read capability.
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("mdlog", RGW_CAP_READ);
+  }
+  int verify_permission(optional_yield y) override {
+    return check_caps(s->user->get_caps());
+  }
+  void execute(optional_yield y) override;
+  void send_response() override;
+  const char* name() const override {
+    return "list_metadata_log";
+  }
+};
+
+// GET /admin/log?type=metadata : metadata log info (shard count, period).
+class RGWOp_MDLog_Info : public RGWRESTOp {
+  unsigned num_objects;            // number of mdlog shard objects
+  RGWPeriodHistory::Cursor period; // current period the log belongs to
+public:
+  RGWOp_MDLog_Info() : num_objects(0) {}
+  ~RGWOp_MDLog_Info() override {}
+
+  // Requires the "mdlog" read capability.
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("mdlog", RGW_CAP_READ);
+  }
+  int verify_permission(optional_yield y) override {
+    return check_caps(s->user->get_caps());
+  }
+  void execute(optional_yield y) override;
+  void send_response() override;
+  const char* name() const override {
+    return "get_metadata_log_info";
+  }
+};
+
+// GET /admin/log?type=metadata&id=<shard>&info : info for one mdlog shard.
+class RGWOp_MDLog_ShardInfo : public RGWRESTOp {
+  RGWMetadataLogInfo info;  // filled by execute(), serialized by send_response()
+public:
+  RGWOp_MDLog_ShardInfo() {}
+  ~RGWOp_MDLog_ShardInfo() override {}
+
+  // Requires the "mdlog" read capability.
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("mdlog", RGW_CAP_READ);
+  }
+  int verify_permission(optional_yield y) override {
+    return check_caps(s->user->get_caps());
+  }
+  void execute(optional_yield y) override;
+  void send_response() override;
+  const char* name() const override {
+    return "get_metadata_log_shard_info";
+  }
+};
+
+// POST /admin/log?type=metadata&lock : take a lease on an mdlog shard object.
+class RGWOp_MDLog_Lock : public RGWRESTOp {
+public:
+  RGWOp_MDLog_Lock() {}
+  ~RGWOp_MDLog_Lock() override {}
+
+  // Locking mutates log state: requires the "mdlog" write capability.
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("mdlog", RGW_CAP_WRITE);
+  }
+  void execute(optional_yield y) override;
+  const char* name() const override {
+    return "lock_mdlog_object";
+  }
+};
+
+// POST /admin/log?type=metadata&unlock : release a lease on an mdlog shard.
+class RGWOp_MDLog_Unlock : public RGWRESTOp {
+public:
+  RGWOp_MDLog_Unlock() {}
+  ~RGWOp_MDLog_Unlock() override {}
+
+  // Requires the "mdlog" write capability.
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("mdlog", RGW_CAP_WRITE);
+  }
+  void execute(optional_yield y) override;
+  const char* name() const override {
+    return "unlock_mdlog_object";
+  }
+};
+
+// POST /admin/log?type=metadata&notify : peer notification that mdlog
+// shards have new entries.
+class RGWOp_MDLog_Notify : public RGWRESTOp {
+public:
+  RGWOp_MDLog_Notify() {}
+  ~RGWOp_MDLog_Notify() override {}
+
+  // Requires the "mdlog" write capability.
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("mdlog", RGW_CAP_WRITE);
+  }
+  void execute(optional_yield y) override;
+  const char* name() const override {
+    return "mdlog_notify";
+  }
+  RGWOpType get_type() override { return RGW_OP_SYNC_MDLOG_NOTIFY; }
+};
+
+// DELETE /admin/log?type=metadata : trim metadata log entries.
+class RGWOp_MDLog_Delete : public RGWRESTOp {
+public:
+  RGWOp_MDLog_Delete() {}
+  ~RGWOp_MDLog_Delete() override {}
+
+  // Requires the "mdlog" write capability.
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("mdlog", RGW_CAP_WRITE);
+  }
+  void execute(optional_yield y) override;
+  const char* name() const override {
+    return "trim_metadata_log";
+  }
+};
+
+// GET /admin/log?type=data&id=<shard> : list data changes log entries.
+class RGWOp_DATALog_List : public RGWRESTOp {
+  std::vector<rgw_data_change_log_entry> entries;  // results from execute()
+  std::string last_marker;                         // resume marker for paging
+  bool truncated;                                  // more entries remain
+  bool extra_info;                                 // include extended entry info
+public:
+  RGWOp_DATALog_List() : truncated(false), extra_info(false) {}
+  ~RGWOp_DATALog_List() override {}
+
+  // Requires the "datalog" read capability.
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("datalog", RGW_CAP_READ);
+  }
+  int verify_permission(optional_yield y) override {
+    return check_caps(s->user->get_caps());
+  }
+  void execute(optional_yield y) override;
+  void send_response() override;
+  const char* name() const override {
+    return "list_data_changes_log";
+  }
+};
+
+// GET /admin/log?type=data : data changes log info (shard count).
+class RGWOp_DATALog_Info : public RGWRESTOp {
+  unsigned num_objects;  // number of datalog shard objects
+public:
+  RGWOp_DATALog_Info() : num_objects(0) {}
+  ~RGWOp_DATALog_Info() override {}
+
+  // Requires the "datalog" read capability.
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("datalog", RGW_CAP_READ);
+  }
+  int verify_permission(optional_yield y) override {
+    return check_caps(s->user->get_caps());
+  }
+  void execute(optional_yield y) override;
+  void send_response() override;
+  const char* name() const override {
+    return "get_data_changes_log_info";
+  }
+};
+
+// GET /admin/log?type=data&id=<shard>&info : info for one datalog shard.
+class RGWOp_DATALog_ShardInfo : public RGWRESTOp {
+  RGWDataChangesLogInfo info;  // filled by execute(), serialized by send_response()
+public:
+  RGWOp_DATALog_ShardInfo() {}
+  ~RGWOp_DATALog_ShardInfo() override {}
+
+  // Requires the "datalog" read capability.
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("datalog", RGW_CAP_READ);
+  }
+  int verify_permission(optional_yield y) override {
+    return check_caps(s->user->get_caps());
+  }
+  void execute(optional_yield y) override;
+  void send_response() override;
+  const char* name() const override {
+    return "get_data_changes_log_shard_info";
+  }
+};
+
+// POST /admin/log?type=data&notify : peer notification of datalog changes
+// (legacy wire format; see Notify2 for the current one).
+class RGWOp_DATALog_Notify : public RGWRESTOp {
+public:
+  RGWOp_DATALog_Notify() {}
+  ~RGWOp_DATALog_Notify() override {}
+
+  // Requires the "datalog" write capability.
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("datalog", RGW_CAP_WRITE);
+  }
+  void execute(optional_yield y) override;
+  const char* name() const override {
+    return "datalog_notify";
+  }
+  RGWOpType get_type() override { return RGW_OP_SYNC_DATALOG_NOTIFY; }
+};
+
+// POST /admin/log?type=data&notify2 : peer notification of datalog changes
+// using the newer rgw_data_notify_entry payload.
+class RGWOp_DATALog_Notify2 : public RGWRESTOp {
+  rgw_data_notify_entry data_notify;  // decoded notification payload
+public:
+  RGWOp_DATALog_Notify2() {}
+  ~RGWOp_DATALog_Notify2() override {}
+
+  // Requires the "datalog" write capability.
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("datalog", RGW_CAP_WRITE);
+  }
+  void execute(optional_yield y) override;
+  const char* name() const override {
+    return "datalog_notify2";
+  }
+  RGWOpType get_type() override { return RGW_OP_SYNC_DATALOG_NOTIFY2; }
+};
+
+// DELETE /admin/log?type=data : trim data changes log entries.
+class RGWOp_DATALog_Delete : public RGWRESTOp {
+public:
+  RGWOp_DATALog_Delete() {}
+  ~RGWOp_DATALog_Delete() override {}
+
+  // Requires the "datalog" write capability.
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("datalog", RGW_CAP_WRITE);
+  }
+  void execute(optional_yield y) override;
+  const char* name() const override {
+    return "trim_data_changes_log";
+  }
+};
+
+// Handler for /admin/log: dispatches GET/DELETE/POST to the ops above based
+// on the "type" query parameter. Per-op authorization happens in each op's
+// verify_permission(), so read_permissions() is a no-op here.
+class RGWHandler_Log : public RGWHandler_Auth_S3 {
+protected:
+  RGWOp *op_get() override;
+  RGWOp *op_delete() override;
+  RGWOp *op_post() override;
+
+  int read_permissions(RGWOp*, optional_yield) override {
+    return 0;
+  }
+public:
+  using RGWHandler_Auth_S3::RGWHandler_Auth_S3;
+  ~RGWHandler_Log() override = default;
+};
+
+// REST manager for /admin/log: every request gets an RGWHandler_Log.
+// Fix: parameter name typo "frontend_prefixs" -> "frontend_prefix"
+// (parameter names are not part of the ABI, so callers are unaffected).
+class RGWRESTMgr_Log : public RGWRESTMgr {
+public:
+  RGWRESTMgr_Log() = default;
+  ~RGWRESTMgr_Log() override = default;
+
+  // driver and frontend_prefix are unused; only the auth registry is needed.
+  RGWHandler_REST* get_handler(rgw::sal::Driver* driver,
+			       req_state* const,
+			       const rgw::auth::StrategyRegistry& auth_registry,
+			       const std::string& frontend_prefix) override {
+    return new RGWHandler_Log(auth_registry);
+  }
+};
diff --git a/src/rgw/driver/rados/rgw_rest_pubsub.h b/src/rgw/driver/rados/rgw_rest_pubsub.h
new file mode 100644
index 000000000..27bde7a95
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_rest_pubsub.h
@@ -0,0 +1,38 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#pragma once
+
+#include "rgw_rest_s3.h"
+
+// s3 compliant notification handler factory
+// Handles bucket-notification subresource requests; permission checks are
+// deferred to the individual ops, so init/read_permissions are no-ops.
+class RGWHandler_REST_PSNotifs_S3 : public RGWHandler_REST_S3 {
+protected:
+  int init_permissions(RGWOp* op, optional_yield y) override {return 0;}
+  int read_permissions(RGWOp* op, optional_yield y) override {return 0;}
+  bool supports_quota() override {return false;}
+  RGWOp* op_get() override;
+  RGWOp* op_put() override;
+  RGWOp* op_delete() override;
+public:
+  using RGWHandler_REST_S3::RGWHandler_REST_S3;
+  virtual ~RGWHandler_REST_PSNotifs_S3() = default;
+  // following are used to generate the operations when invoked by another REST handler
+  static RGWOp* create_get_op();
+  static RGWOp* create_put_op();
+  static RGWOp* create_delete_op();
+};
+
+// AWS compliant topics handler factory
+// POST-only handler for SNS-style topic actions; authorization is performed
+// explicitly via authorize() against the stored strategy registry.
+class RGWHandler_REST_PSTopic_AWS : public RGWHandler_REST {
+  const rgw::auth::StrategyRegistry& auth_registry;
+protected:
+  RGWOp* op_post() override;
+public:
+  RGWHandler_REST_PSTopic_AWS(const rgw::auth::StrategyRegistry& _auth_registry) :
+    auth_registry(_auth_registry) {}
+  virtual ~RGWHandler_REST_PSTopic_AWS() = default;
+  int postauth_init(optional_yield) override { return 0; }
+  int authorize(const DoutPrefixProvider* dpp, optional_yield y) override;
+  // true if the request's "Action" parameter names a supported topic action
+  static bool action_exists(const req_state* s);
+};
+
diff --git a/src/rgw/driver/rados/rgw_rest_realm.cc b/src/rgw/driver/rados/rgw_rest_realm.cc
new file mode 100644
index 000000000..79640a2a1
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_rest_realm.cc
@@ -0,0 +1,376 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "common/errno.h"
+#include "rgw_rest_realm.h"
+#include "rgw_rest_s3.h"
+#include "rgw_rest_config.h"
+#include "rgw_zone.h"
+#include "rgw_sal_rados.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_mdlog.h"
+
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+// reject 'period push' if we would have to fetch too many intermediate periods
+static const uint32_t PERIOD_HISTORY_FETCH_MAX = 64;
+
+// base period op, shared between Get and Post
+// Holds the period being read/written and an error stream whose contents
+// become the error message in the response.
+class RGWOp_Period_Base : public RGWRESTOp {
+ protected:
+  RGWPeriod period;
+  std::ostringstream error_stream;
+ public:
+  int verify_permission(optional_yield) override { return 0; }
+  void send_response() override;
+};
+
+// reply with the period object on success
+void RGWOp_Period_Base::send_response()
+{
+  // error_stream may carry a detailed message assembled by the subclass
+  set_req_state_err(s, op_ret, error_stream.str());
+  dump_errno(s);
+
+  if (op_ret < 0) {
+    if (!s->err.message.empty()) {
+      ldpp_dout(this, 4) << "Request failed with " << op_ret
+          << ": " << s->err.message << dendl;
+    }
+    end_header(s);
+    return;
+  }
+
+  // success: the period itself is the JSON response body
+  encode_json("period", period, s->formatter);
+  end_header(s, NULL, "application/json", s->formatter->get_len());
+  flusher.flush();
+}
+
+// GET /admin/realm/period
+// Read a period (optionally a specific id/epoch) from the local store.
+class RGWOp_Period_Get : public RGWOp_Period_Base {
+ public:
+  void execute(optional_yield y) override;
+  // Requires the "zone" read capability.
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("zone", RGW_CAP_READ);
+  }
+  int verify_permission(optional_yield) override {
+    return check_caps(s->user->get_caps());
+  }
+  const char* name() const override { return "get_period"; }
+};
+
+// Load the requested period from rados; selectors default to the current
+// period/epoch when the corresponding query args are absent.
+void RGWOp_Period_Get::execute(optional_yield y)
+{
+  string realm_id, realm_name, period_id;
+  epoch_t epoch = 0;
+  RESTArgs::get_string(s, "realm_id", realm_id, &realm_id);
+  RESTArgs::get_string(s, "realm_name", realm_name, &realm_name);
+  RESTArgs::get_string(s, "period_id", period_id, &period_id);
+  RESTArgs::get_uint32(s, "epoch", 0, &epoch);
+
+  period.set_id(period_id);
+  period.set_epoch(epoch);
+
+  // init() performs the actual read; op_ret is reported by send_response()
+  op_ret = period.init(this, driver->ctx(), static_cast<rgw::sal::RadosStore*>(driver)->svc()->sysobj, realm_id, y, realm_name);
+  if (op_ret < 0)
+    ldpp_dout(this, 5) << "failed to read period" << dendl;
+}
+
+// POST /admin/realm/period
+// Accept a period pushed from a peer zone (or a 'period commit' when the
+// posted period has no id).
+class RGWOp_Period_Post : public RGWOp_Period_Base {
+ public:
+  void execute(optional_yield y) override;
+  // Requires the "zone" write capability.
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("zone", RGW_CAP_WRITE);
+  }
+  int verify_permission(optional_yield) override {
+    return check_caps(s->user->get_caps());
+  }
+  const char* name() const override { return "post_period"; }
+  RGWOpType get_type() override { return RGW_OP_PERIOD_POST; }
+};
+
+// Handle a posted period. Flow: decode the period from the request body,
+// validate it against our realm, then either (a) commit it when no period id
+// is given, or (b) store it and, depending on its realm epoch relative to the
+// current period, attach it to the period history and possibly make it
+// current. The statement order below is significant: the period must be
+// stored and its latest epoch set before the history is updated.
+void RGWOp_Period_Post::execute(optional_yield y)
+{
+  auto cct = driver->ctx();
+
+  // initialize the period without reading from rados
+  period.init(this, cct, static_cast<rgw::sal::RadosStore*>(driver)->svc()->sysobj, y, false);
+
+  // decode the period from input
+  const auto max_size = cct->_conf->rgw_max_put_param_size;
+  bool empty;
+  op_ret = get_json_input(cct, s, period, max_size, &empty);
+  if (op_ret < 0) {
+    ldpp_dout(this, -1) << "failed to decode period" << dendl;
+    return;
+  }
+
+  // require period.realm_id to match our realm
+  if (period.get_realm() != static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone->get_realm().get_id()) {
+    error_stream << "period with realm id " << period.get_realm()
+        << " doesn't match current realm " << static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone->get_realm().get_id() << std::endl;
+    op_ret = -EINVAL;
+    return;
+  }
+
+  // load the realm and current period from rados; there may be a more recent
+  // period that we haven't restarted with yet. we also don't want to modify
+  // the objects in use by RGWRados
+  RGWRealm realm(period.get_realm());
+  op_ret = realm.init(this, cct, static_cast<rgw::sal::RadosStore*>(driver)->svc()->sysobj, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, -1) << "failed to read current realm: "
+        << cpp_strerror(-op_ret) << dendl;
+    return;
+  }
+
+  RGWPeriod current_period;
+  op_ret = current_period.init(this, cct, static_cast<rgw::sal::RadosStore*>(driver)->svc()->sysobj, realm.get_id(), y);
+  if (op_ret < 0) {
+    ldpp_dout(this, -1) << "failed to read current period: "
+        << cpp_strerror(-op_ret) << dendl;
+    return;
+  }
+
+  // if period id is empty, handle as 'period commit'
+  if (period.get_id().empty()) {
+    op_ret = period.commit(this, driver, realm, current_period, error_stream, y);
+    if (op_ret < 0) {
+      ldpp_dout(this, -1) << "master zone failed to commit period" << dendl;
+    }
+    return;
+  }
+
+  // if it's not period commit, nobody is allowed to push to the master zone
+  if (period.get_master_zone() == static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone->get_zone_params().get_id()) {
+    ldpp_dout(this, 10) << "master zone rejecting period id="
+        << period.get_id() << " epoch=" << period.get_epoch() << dendl;
+    op_ret = -EINVAL; // XXX: error code
+    return;
+  }
+
+  // write the period to rados
+  op_ret = period.store_info(this, false, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, -1) << "failed to store period " << period.get_id() << dendl;
+    return;
+  }
+  // set as latest epoch
+  op_ret = period.update_latest_epoch(this, period.get_epoch(), y);
+  if (op_ret == -EEXIST) {
+    // already have this epoch (or a more recent one)
+    ldpp_dout(this, 4) << "already have epoch >= " << period.get_epoch()
+        << " for period " << period.get_id() << dendl;
+    op_ret = 0;
+    return;
+  }
+  if (op_ret < 0) {
+    ldpp_dout(this, -1) << "failed to set latest epoch" << dendl;
+    return;
+  }
+
+  auto period_history = static_cast<rgw::sal::RadosStore*>(driver)->svc()->mdlog->get_period_history();
+
+  // decide whether we can set_current_period() or set_latest_epoch()
+  if (period.get_id() != current_period.get_id()) {
+    auto current_epoch = current_period.get_realm_epoch();
+    // discard periods in the past
+    if (period.get_realm_epoch() < current_epoch) {
+      ldpp_dout(this, 10) << "discarding period " << period.get_id()
+          << " with realm epoch " << period.get_realm_epoch()
+          << " older than current epoch " << current_epoch << dendl;
+      // return success to ack that we have this period
+      return;
+    }
+    // discard periods too far in the future
+    if (period.get_realm_epoch() > current_epoch + PERIOD_HISTORY_FETCH_MAX) {
+      ldpp_dout(this, -1) << "discarding period " << period.get_id()
+          << " with realm epoch " << period.get_realm_epoch() << " too far in "
+          "the future from current epoch " << current_epoch << dendl;
+      op_ret = -ENOENT; // XXX: error code
+      return;
+    }
+    // attach a copy of the period into the period history
+    auto cursor = period_history->attach(this, RGWPeriod{period}, y);
+    if (!cursor) {
+      // we're missing some history between the new period and current_period
+      op_ret = cursor.get_error();
+      ldpp_dout(this, -1) << "failed to collect the periods between current period "
+          << current_period.get_id() << " (realm epoch " << current_epoch
+          << ") and the new period " << period.get_id()
+          << " (realm epoch " << period.get_realm_epoch()
+          << "): " << cpp_strerror(-op_ret) << dendl;
+      return;
+    }
+    if (cursor.has_next()) {
+      // don't switch if we have a newer period in our history
+      ldpp_dout(this, 4) << "attached period " << period.get_id()
+          << " to history, but the history contains newer periods" << dendl;
+      return;
+    }
+    // set as current period
+    op_ret = realm.set_current_period(this, period, y);
+    if (op_ret < 0) {
+      ldpp_dout(this, -1) << "failed to update realm's current period" << dendl;
+      return;
+    }
+    ldpp_dout(this, 4) << "period " << period.get_id()
+        << " is newer than current period " << current_period.get_id()
+        << ", updating realm's current period and notifying zone" << dendl;
+    realm.notify_new_period(this, period, y);
+    return;
+  }
+  // reflect the period into our local objects
+  op_ret = period.reflect(this, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, -1) << "failed to update local objects: "
+        << cpp_strerror(-op_ret) << dendl;
+    return;
+  }
+  ldpp_dout(this, 4) << "period epoch " << period.get_epoch()
+      << " is newer than current epoch " << current_period.get_epoch()
+      << ", updating period's latest epoch and notifying zone" << dendl;
+  realm.notify_new_period(this, period, y);
+  // update the period history
+  period_history->insert(RGWPeriod{period});
+}
+
+// Handler for /admin/realm/period: GET reads a period, POST pushes/commits one.
+class RGWHandler_Period : public RGWHandler_Auth_S3 {
+ protected:
+  using RGWHandler_Auth_S3::RGWHandler_Auth_S3;
+
+  RGWOp *op_get() override { return new RGWOp_Period_Get; }
+  RGWOp *op_post() override { return new RGWOp_Period_Post; }
+};
+
+// REST manager for the /admin/realm/period resource.
+class RGWRESTMgr_Period : public RGWRESTMgr {
+ public:
+  RGWHandler_REST* get_handler(rgw::sal::Driver* driver,
+                               req_state*,
+                               const rgw::auth::StrategyRegistry& auth_registry,
+                               const std::string&) override {
+    return new RGWHandler_Period(auth_registry);
+  }
+};
+
+
+// GET /admin/realm
+// Read a realm by id and/or name.
+class RGWOp_Realm_Get : public RGWRESTOp {
+  std::unique_ptr<RGWRealm> realm;  // loaded by execute(), emitted by send_response()
+public:
+  // Requires the "zone" read capability.
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("zone", RGW_CAP_READ);
+  }
+  int verify_permission(optional_yield) override {
+    return check_caps(s->user->get_caps());
+  }
+  void execute(optional_yield y) override;
+  void send_response() override;
+  const char* name() const override { return "get_realm"; }
+};
+
+// Load the realm selected by the "id"/"name" query args (both optional).
+void RGWOp_Realm_Get::execute(optional_yield y)
+{
+  string id;
+  string name;
+  RESTArgs::get_string(s, "id", id, &id);
+  RESTArgs::get_string(s, "name", name, &name);
+
+  // read realm
+  realm = std::make_unique<RGWRealm>(id, name);
+  op_ret = realm->init(this, g_ceph_context, static_cast<rgw::sal::RadosStore*>(driver)->svc()->sysobj, y);
+  if (op_ret < 0)
+    ldpp_dout(this, -1) << "failed to read realm id=" << id
+        << " name=" << name << dendl;
+}
+
+// On success, reply with the realm serialized as JSON; otherwise just headers.
+void RGWOp_Realm_Get::send_response()
+{
+  set_req_state_err(s, op_ret);
+  dump_errno(s);
+
+  if (op_ret < 0) {
+    end_header(s);
+    return;
+  }
+
+  encode_json("realm", *realm, s->formatter);
+  end_header(s, NULL, "application/json", s->formatter->get_len());
+  flusher.flush();
+}
+
+// GET /admin/realm?list
+// List all realm names plus the id of the default realm.
+class RGWOp_Realm_List : public RGWRESTOp {
+  std::string default_id;           // id of the default realm (may stay empty)
+  std::list<std::string> realms;    // realm names from the zone service
+public:
+  // Requires the "zone" read capability.
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("zone", RGW_CAP_READ);
+  }
+  int verify_permission(optional_yield) override {
+    return check_caps(s->user->get_caps());
+  }
+  void execute(optional_yield y) override;
+  void send_response() override;
+  const char* name() const override { return "list_realms"; }
+};
+
+// Gather the default realm id (best-effort; its failure is ignored) and the
+// full realm list, which determines op_ret.
+void RGWOp_Realm_List::execute(optional_yield y)
+{
+  {
+    // read default realm
+    RGWRealm realm(driver->ctx(), static_cast<rgw::sal::RadosStore*>(driver)->svc()->sysobj);
+    [[maybe_unused]] int ret = realm.read_default_id(this, default_id, y);
+  }
+  op_ret = static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone->list_realms(this, realms);
+  if (op_ret < 0)
+    ldpp_dout(this, -1) << "failed to list realms" << dendl;
+}
+
+// Emit {"default_info": <id>, "realms": [...]} on success.
+void RGWOp_Realm_List::send_response()
+{
+  set_req_state_err(s, op_ret);
+  dump_errno(s);
+
+  if (op_ret < 0) {
+    end_header(s);
+    return;
+  }
+
+  s->formatter->open_object_section("realms_list");
+  encode_json("default_info", default_id, s->formatter);
+  encode_json("realms", realms, s->formatter);
+  s->formatter->close_section();
+  end_header(s, NULL, "application/json", s->formatter->get_len());
+  flusher.flush();
+}
+
+// Handler for /admin/realm: GET lists realms when the "list" subresource is
+// present, otherwise reads a single realm.
+class RGWHandler_Realm : public RGWHandler_Auth_S3 {
+protected:
+  using RGWHandler_Auth_S3::RGWHandler_Auth_S3;
+  RGWOp *op_get() override {
+    if (s->info.args.sub_resource_exists("list"))
+      return new RGWOp_Realm_List;
+    return new RGWOp_Realm_Get;
+  }
+};
+
+// Register the nested /admin/realm/period resource under /admin/realm.
+RGWRESTMgr_Realm::RGWRESTMgr_Realm()
+{
+  // add the /admin/realm/period resource
+  register_resource("period", new RGWRESTMgr_Period);
+}
+
+// Every /admin/realm request is served by an RGWHandler_Realm; only the auth
+// registry is needed to construct it.
+RGWHandler_REST*
+RGWRESTMgr_Realm::get_handler(rgw::sal::Driver* driver,
+                              req_state*,
+                              const rgw::auth::StrategyRegistry& auth_registry,
+                              const std::string&)
+{
+  return new RGWHandler_Realm(auth_registry);
+}
diff --git a/src/rgw/driver/rados/rgw_rest_realm.h b/src/rgw/driver/rados/rgw_rest_realm.h
new file mode 100644
index 000000000..a0d1dc1c9
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_rest_realm.h
@@ -0,0 +1,16 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_rest.h"
+
+// REST manager for /admin/realm; the constructor registers the nested
+// "period" resource (see rgw_rest_realm.cc).
+class RGWRESTMgr_Realm : public RGWRESTMgr {
+public:
+  RGWRESTMgr_Realm();
+
+  RGWHandler_REST* get_handler(rgw::sal::Driver* driver,
+                               req_state*,
+                               const rgw::auth::StrategyRegistry& auth_registry,
+                               const std::string&) override;
+};
diff --git a/src/rgw/driver/rados/rgw_rest_user.cc b/src/rgw/driver/rados/rgw_rest_user.cc
new file mode 100644
index 000000000..361ceb0f7
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_rest_user.cc
@@ -0,0 +1,1137 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "common/ceph_json.h"
+
+#include "rgw_op.h"
+#include "rgw_user.h"
+#include "rgw_rest_user.h"
+#include "rgw_sal.h"
+
+#include "include/str_list.h"
+#include "include/ceph_assert.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_sys_obj.h"
+#include "rgw_zone.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+// Forward the current request to the master zone and copy the access keys
+// from the master's JSON reply into op_state, so this (non-master) zone
+// stores the same credentials the master generated.  Returns 0 on success
+// or the negative error from forwarding.
+int fetch_access_keys_from_master(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, RGWUserAdminOpState &op_state, req_state *s, optional_yield y) {
+  bufferlist data;
+  JSONParser jp;
+  RGWUserInfo ui;
+  int op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, &jp, s->info, y);
+  if (op_ret < 0) {
+    ldpp_dout(dpp, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
+    return op_ret;
+  }
+  // the master replies with a full RGWUserInfo; only its keys are needed
+  ui.decode_json(&jp);
+  op_state.op_access_keys = std::move(ui.access_keys);
+
+  return 0;
+}
+
+// GET /admin/user?list - paginated enumeration of all user ids.
+class RGWOp_User_List : public RGWRESTOp {
+
+public:
+  RGWOp_User_List() {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("users", RGW_CAP_READ);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "list_user"; }
+};
+
+void RGWOp_User_List::execute(optional_yield y)
+{
+  RGWUserAdminOpState op_state(driver);
+
+  // "max-entries" caps the page size (default 1000); "marker" resumes a
+  // previous listing from where it left off
+  uint32_t max_entries;
+  std::string marker;
+  RESTArgs::get_uint32(s, "max-entries", 1000, &max_entries);
+  RESTArgs::get_string(s, "marker", marker, &marker);
+
+  op_state.max_entries = max_entries;
+  op_state.marker = marker;
+  op_ret = RGWUserAdminOp_User::list(this, driver, op_state, flusher);
+}
+
+// GET /admin/user - look up a single user by uid or access key and return
+// its info, optionally with usage stats ("stats") and a stats re-sync
+// ("sync").
+class RGWOp_User_Info : public RGWRESTOp {
+
+public:
+  RGWOp_User_Info() {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("users", RGW_CAP_READ);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "get_user_info"; }
+};
+
+void RGWOp_User_Info::execute(optional_yield y)
+{
+  RGWUserAdminOpState op_state(driver);
+
+  std::string uid_str, access_key_str;
+  bool fetch_stats;
+  bool sync_stats;
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  RESTArgs::get_string(s, "access-key", access_key_str, &access_key_str);
+
+  // if uid was not supplied in rest argument, error out now, otherwise we'll
+  // end up initializing anonymous user, for which keys.init will eventually
+  // return -EACCES
+  if (uid_str.empty() && access_key_str.empty()){
+    op_ret=-EINVAL;
+    return;
+  }
+
+  rgw_user uid(uid_str);
+
+  RESTArgs::get_bool(s, "stats", false, &fetch_stats);
+
+  RESTArgs::get_bool(s, "sync", false, &sync_stats);
+
+  op_state.set_user_id(uid);
+  op_state.set_access_key(access_key_str);
+  op_state.set_fetch_stats(fetch_stats);
+  op_state.set_sync_stats(sync_stats);
+
+  op_ret = RGWUserAdminOp_User::info(s, driver, op_state, flusher, y);
+}
+
+// PUT /admin/user - create a new user.
+class RGWOp_User_Create : public RGWRESTOp {
+
+public:
+  RGWOp_User_Create() {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("users", RGW_CAP_WRITE);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "create_user"; }
+};
+
+// Build an RGWUserAdminOpState from the REST args and create the user.
+// On a non-master zone the request is first forwarded to the master and
+// the access keys it generated are adopted instead of generating locally.
+void RGWOp_User_Create::execute(optional_yield y)
+{
+  std::string uid_str;
+  std::string display_name;
+  std::string email;
+  std::string access_key;
+  std::string secret_key;
+  std::string key_type_str;
+  std::string caps;
+  std::string tenant_name;
+  std::string op_mask_str;
+  std::string default_placement_str;
+  std::string placement_tags_str;
+
+  bool gen_key;
+  bool suspended;
+  bool system;
+  bool exclusive;
+
+  int32_t max_buckets;
+  const int32_t default_max_buckets =
+    s->cct->_conf.get_val<int64_t>("rgw_user_max_buckets");
+
+  RGWUserAdminOpState op_state(driver);
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
+  RESTArgs::get_string(s, "display-name", display_name, &display_name);
+  RESTArgs::get_string(s, "email", email, &email);
+  RESTArgs::get_string(s, "access-key", access_key, &access_key);
+  RESTArgs::get_string(s, "secret-key", secret_key, &secret_key);
+  RESTArgs::get_string(s, "key-type", key_type_str, &key_type_str);
+  RESTArgs::get_string(s, "user-caps", caps, &caps);
+  RESTArgs::get_string(s, "tenant", tenant_name, &tenant_name);
+  RESTArgs::get_bool(s, "generate-key", true, &gen_key);
+  RESTArgs::get_bool(s, "suspended", false, &suspended);
+  RESTArgs::get_int32(s, "max-buckets", default_max_buckets, &max_buckets);
+  RESTArgs::get_bool(s, "system", false, &system);
+  RESTArgs::get_bool(s, "exclusive", false, &exclusive);
+  RESTArgs::get_string(s, "op-mask", op_mask_str, &op_mask_str);
+  RESTArgs::get_string(s, "default-placement", default_placement_str, &default_placement_str);
+  RESTArgs::get_string(s, "placement-tags", placement_tags_str, &placement_tags_str);
+
+  // only a system user may create another system user
+  if (!s->user->get_info().system && system) {
+    ldpp_dout(this, 0) << "cannot set system flag by non-system user" << dendl;
+    op_ret = -EINVAL;
+    return;
+  }
+
+  if (!tenant_name.empty()) {
+    uid.tenant = tenant_name;
+  }
+
+  // TODO: validate required args are passed in. (for eg. uid and display_name here)
+  op_state.set_user_id(uid);
+  op_state.set_display_name(display_name);
+  op_state.set_user_email(email);
+  op_state.set_caps(caps);
+  op_state.set_access_key(access_key);
+  op_state.set_secret_key(secret_key);
+
+  if (!op_mask_str.empty()) {
+    uint32_t op_mask;
+    int ret = rgw_parse_op_type_list(op_mask_str, &op_mask);
+    if (ret < 0) {
+      ldpp_dout(this, 0) << "failed to parse op_mask: " << ret << dendl;
+      op_ret = -EINVAL;
+      return;
+    }
+    op_state.set_op_mask(op_mask);
+  }
+
+  if (!key_type_str.empty()) {
+    int32_t key_type = KEY_TYPE_UNDEFINED;
+    if (key_type_str.compare("swift") == 0)
+      key_type = KEY_TYPE_SWIFT;
+    else if (key_type_str.compare("s3") == 0)
+      key_type = KEY_TYPE_S3;
+
+    op_state.set_key_type(key_type);
+  }
+
+  // only override max_buckets when the arg differs from the default;
+  // negative values are normalized to -1 (unlimited)
+  if (max_buckets != default_max_buckets) {
+    if (max_buckets < 0) {
+      max_buckets = -1;
+    }
+    op_state.set_max_buckets(max_buckets);
+  }
+  // tri-state flags: only applied when the arg was actually present
+  if (s->info.args.exists("suspended"))
+    op_state.set_suspension(suspended);
+
+  if (s->info.args.exists("system"))
+    op_state.set_system(system);
+
+  if (s->info.args.exists("exclusive"))
+    op_state.set_exclusive(exclusive);
+
+  if (!default_placement_str.empty()) {
+    rgw_placement_rule target_rule;
+    target_rule.from_str(default_placement_str);
+    if (!driver->valid_placement(target_rule)) {
+      ldpp_dout(this, 0) << "NOTICE: invalid dest placement: " << target_rule.to_str() << dendl;
+      op_ret = -EINVAL;
+      return;
+    }
+    op_state.set_default_placement(target_rule);
+  }
+
+  if (!placement_tags_str.empty()) {
+    list<string> placement_tags_list;
+    get_str_list(placement_tags_str, ",", placement_tags_list);
+    op_state.set_placement_tags(placement_tags_list);
+  }
+
+  if(!(driver->is_meta_master())) {
+    op_ret = fetch_access_keys_from_master(this, driver, op_state, s, y);
+
+    if(op_ret < 0) {
+      return;
+    } else {
+      // set_generate_key() is not set if keys have already been fetched from master zone
+      gen_key = false;
+    }
+  }
+
+  if (gen_key) {
+    op_state.set_generate_key();
+  }
+
+  op_ret = RGWUserAdminOp_User::create(s, driver, op_state, flusher, y);
+}
+
+// POST /admin/user - modify an existing user.
+class RGWOp_User_Modify : public RGWRESTOp {
+
+public:
+  RGWOp_User_Modify() {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("users", RGW_CAP_WRITE);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "modify_user"; }
+};
+
+// Apply the supplied REST args to an existing user; args absent from the
+// request leave the corresponding fields untouched.  On a non-master zone
+// the request is forwarded to the master and the resulting access keys are
+// adopted instead of generating keys locally.
+void RGWOp_User_Modify::execute(optional_yield y)
+{
+  std::string uid_str;
+  std::string display_name;
+  std::string email;
+  std::string access_key;
+  std::string secret_key;
+  std::string key_type_str;
+  std::string op_mask_str;
+  std::string default_placement_str;
+  std::string placement_tags_str;
+
+  bool gen_key;
+  bool suspended;
+  bool system;
+  bool email_set;
+  bool quota_set;   // true when "max-buckets" was supplied
+  int32_t max_buckets;
+
+  RGWUserAdminOpState op_state(driver);
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
+  RESTArgs::get_string(s, "display-name", display_name, &display_name);
+  RESTArgs::get_string(s, "email", email, &email, &email_set);
+  RESTArgs::get_string(s, "access-key", access_key, &access_key);
+  RESTArgs::get_string(s, "secret-key", secret_key, &secret_key);
+  RESTArgs::get_bool(s, "generate-key", false, &gen_key);
+  RESTArgs::get_bool(s, "suspended", false, &suspended);
+  RESTArgs::get_int32(s, "max-buckets", RGW_DEFAULT_MAX_BUCKETS, &max_buckets, &quota_set);
+  RESTArgs::get_string(s, "key-type", key_type_str, &key_type_str);
+
+  RESTArgs::get_bool(s, "system", false, &system);
+  RESTArgs::get_string(s, "op-mask", op_mask_str, &op_mask_str);
+  RESTArgs::get_string(s, "default-placement", default_placement_str, &default_placement_str);
+  RESTArgs::get_string(s, "placement-tags", placement_tags_str, &placement_tags_str);
+
+  // only a system user may grant the system flag
+  if (!s->user->get_info().system && system) {
+    ldpp_dout(this, 0) << "cannot set system flag by non-system user" << dendl;
+    op_ret = -EINVAL;
+    return;
+  }
+
+  op_state.set_user_id(uid);
+  op_state.set_display_name(display_name);
+
+  if (email_set)
+    op_state.set_user_email(email);
+
+  op_state.set_access_key(access_key);
+  op_state.set_secret_key(secret_key);
+
+  if (quota_set) {
+    if (max_buckets < 0 ) {
+      max_buckets = -1;   // negative means unlimited
+    }
+    op_state.set_max_buckets(max_buckets);
+  }
+
+  if (!key_type_str.empty()) {
+    int32_t key_type = KEY_TYPE_UNDEFINED;
+    if (key_type_str.compare("swift") == 0)
+      key_type = KEY_TYPE_SWIFT;
+    else if (key_type_str.compare("s3") == 0)
+      key_type = KEY_TYPE_S3;
+
+    op_state.set_key_type(key_type);
+  }
+
+  // parse the op mask exactly once (the original code repeated this block
+  // a second time below, redundantly re-parsing and re-setting the value)
+  if (!op_mask_str.empty()) {
+    uint32_t op_mask;
+    int ret = rgw_parse_op_type_list(op_mask_str, &op_mask);
+    if (ret < 0) {
+      ldpp_dout(this, 0) << "failed to parse op_mask: " << ret << dendl;
+      op_ret = -EINVAL;
+      return;
+    }
+    op_state.set_op_mask(op_mask);
+  }
+
+  // tri-state flags: only applied when the arg was actually present
+  if (s->info.args.exists("suspended"))
+    op_state.set_suspension(suspended);
+
+  if (s->info.args.exists("system"))
+    op_state.set_system(system);
+
+  if (!default_placement_str.empty()) {
+    rgw_placement_rule target_rule;
+    target_rule.from_str(default_placement_str);
+    if (!driver->valid_placement(target_rule)) {
+      ldpp_dout(this, 0) << "NOTICE: invalid dest placement: " << target_rule.to_str() << dendl;
+      op_ret = -EINVAL;
+      return;
+    }
+    op_state.set_default_placement(target_rule);
+  }
+
+  if (!placement_tags_str.empty()) {
+    list<string> placement_tags_list;
+    get_str_list(placement_tags_str, ",", placement_tags_list);
+    op_state.set_placement_tags(placement_tags_list);
+  }
+
+  if(!(driver->is_meta_master())) {
+    op_ret = fetch_access_keys_from_master(this, driver, op_state, s, y);
+
+    if(op_ret < 0) {
+      return;
+    } else {
+      // set_generate_key() is not set if keys have already been fetched from master zone
+      gen_key = false;
+    }
+  }
+
+  if (gen_key) {
+    op_state.set_generate_key();
+  }
+
+  op_ret = RGWUserAdminOp_User::modify(s, driver, op_state, flusher, y);
+}
+
+// DELETE /admin/user - remove a user, optionally purging its data.
+class RGWOp_User_Remove : public RGWRESTOp {
+
+public:
+  RGWOp_User_Remove() {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("users", RGW_CAP_WRITE);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "remove_user"; }
+};
+
+void RGWOp_User_Remove::execute(optional_yield y)
+{
+  std::string uid_str;
+  bool purge_data;
+
+  RGWUserAdminOpState op_state(driver);
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
+  RESTArgs::get_bool(s, "purge-data", false, &purge_data);
+
+  // FIXME: no double checking
+  if (!uid.empty())
+    op_state.set_user_id(uid);
+
+  op_state.set_purge_data(purge_data);
+
+  // the mutation is first forwarded to the metadata master zone
+  bufferlist data;
+  op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
+    return;
+  }
+  op_ret = RGWUserAdminOp_User::remove(s, driver, op_state, flusher, s->yield);
+}
+
+// PUT /admin/user?subuser - create a subuser under an existing user.
+class RGWOp_Subuser_Create : public RGWRESTOp {
+
+public:
+  RGWOp_Subuser_Create() {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("users", RGW_CAP_WRITE);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "create_subuser"; }
+};
+
+void RGWOp_Subuser_Create::execute(optional_yield y)
+{
+  std::string uid_str;
+  std::string subuser;
+  std::string secret_key;
+  std::string access_key;
+  std::string perm_str;
+  std::string key_type_str;
+
+  bool gen_subuser = false; // FIXME placeholder
+  bool gen_secret;
+  bool gen_access;
+
+  uint32_t perm_mask = 0;
+  int32_t key_type = KEY_TYPE_SWIFT;  // subuser keys default to swift
+
+  RGWUserAdminOpState op_state(driver);
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
+  RESTArgs::get_string(s, "subuser", subuser, &subuser);
+  RESTArgs::get_string(s, "access-key", access_key, &access_key);
+  RESTArgs::get_string(s, "secret-key", secret_key, &secret_key);
+  RESTArgs::get_string(s, "access", perm_str, &perm_str);
+  RESTArgs::get_string(s, "key-type", key_type_str, &key_type_str);
+  RESTArgs::get_bool(s, "generate-secret", false, &gen_secret);
+  RESTArgs::get_bool(s, "gen-access-key", false, &gen_access);
+
+  // "access" carries a symbolic permission string converted to a bit mask
+  perm_mask = rgw_str_to_perm(perm_str.c_str());
+  op_state.set_perm(perm_mask);
+
+  op_state.set_user_id(uid);
+  op_state.set_subuser(subuser);
+  op_state.set_access_key(access_key);
+  op_state.set_secret_key(secret_key);
+  op_state.set_generate_subuser(gen_subuser);
+
+  if (gen_access)
+    op_state.set_gen_access();
+
+  if (gen_secret)
+    op_state.set_gen_secret();
+
+  if (!key_type_str.empty()) {
+    if (key_type_str.compare("swift") == 0)
+      key_type = KEY_TYPE_SWIFT;
+    else if (key_type_str.compare("s3") == 0)
+      key_type = KEY_TYPE_S3;
+  }
+  op_state.set_key_type(key_type);
+
+  // the mutation is first forwarded to the metadata master zone
+  bufferlist data;
+  op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
+    return;
+  }
+  op_ret = RGWUserAdminOp_Subuser::create(s, driver, op_state, flusher, y);
+}
+
+// POST /admin/user?subuser - modify an existing subuser.
+class RGWOp_Subuser_Modify : public RGWRESTOp {
+
+public:
+  RGWOp_Subuser_Modify() {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("users", RGW_CAP_WRITE);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "modify_subuser"; }
+};
+
+void RGWOp_Subuser_Modify::execute(optional_yield y)
+{
+  std::string uid_str;
+  std::string subuser;
+  std::string secret_key;
+  std::string key_type_str;
+  std::string perm_str;
+
+  RGWUserAdminOpState op_state(driver);
+
+  uint32_t perm_mask;
+  int32_t key_type = KEY_TYPE_SWIFT;  // subuser keys default to swift
+
+  bool gen_secret;
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
+  RESTArgs::get_string(s, "subuser", subuser, &subuser);
+  RESTArgs::get_string(s, "secret-key", secret_key, &secret_key);
+  RESTArgs::get_string(s, "access", perm_str, &perm_str);
+  RESTArgs::get_string(s, "key-type", key_type_str, &key_type_str);
+  RESTArgs::get_bool(s, "generate-secret", false, &gen_secret);
+
+  // "access" carries a symbolic permission string converted to a bit mask
+  perm_mask = rgw_str_to_perm(perm_str.c_str());
+  op_state.set_perm(perm_mask);
+
+  op_state.set_user_id(uid);
+  op_state.set_subuser(subuser);
+
+  if (!secret_key.empty())
+    op_state.set_secret_key(secret_key);
+
+  if (gen_secret)
+    op_state.set_gen_secret();
+
+  if (!key_type_str.empty()) {
+    if (key_type_str.compare("swift") == 0)
+      key_type = KEY_TYPE_SWIFT;
+    else if (key_type_str.compare("s3") == 0)
+      key_type = KEY_TYPE_S3;
+  }
+  op_state.set_key_type(key_type);
+
+  // the mutation is first forwarded to the metadata master zone
+  bufferlist data;
+  op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
+    return;
+  }
+  op_ret = RGWUserAdminOp_Subuser::modify(s, driver, op_state, flusher, y);
+}
+
+// DELETE /admin/user?subuser - remove a subuser (its keys are purged by
+// default unless purge-keys=false is passed).
+class RGWOp_Subuser_Remove : public RGWRESTOp {
+
+public:
+  RGWOp_Subuser_Remove() {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("users", RGW_CAP_WRITE);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "remove_subuser"; }
+};
+
+void RGWOp_Subuser_Remove::execute(optional_yield y)
+{
+  std::string uid_str;
+  std::string subuser;
+  bool purge_keys;
+
+  RGWUserAdminOpState op_state(driver);
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
+  RESTArgs::get_string(s, "subuser", subuser, &subuser);
+  RESTArgs::get_bool(s, "purge-keys", true, &purge_keys);
+
+  op_state.set_user_id(uid);
+  op_state.set_subuser(subuser);
+
+  if (purge_keys)
+    op_state.set_purge_keys();
+
+  // the mutation is first forwarded to the metadata master zone
+  bufferlist data;
+  op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
+    return;
+  }
+  op_ret = RGWUserAdminOp_Subuser::remove(s, driver, op_state, flusher, y);
+}
+
+// PUT /admin/user?key - create an access key for a user or subuser.
+class RGWOp_Key_Create : public RGWRESTOp {
+
+public:
+  RGWOp_Key_Create() {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("users", RGW_CAP_WRITE);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "create_access_key"; }
+};
+
+// NOTE(review): unlike most mutating ops in this file, this one is not
+// forwarded to the metadata master zone here — confirm that is intentional
+// or handled at another layer.
+void RGWOp_Key_Create::execute(optional_yield y)
+{
+  std::string uid_str;
+  std::string subuser;
+  std::string access_key;
+  std::string secret_key;
+  std::string key_type_str;
+
+  bool gen_key;
+
+  RGWUserAdminOpState op_state(driver);
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
+  RESTArgs::get_string(s, "subuser", subuser, &subuser);
+  RESTArgs::get_string(s, "access-key", access_key, &access_key);
+  RESTArgs::get_string(s, "secret-key", secret_key, &secret_key);
+  RESTArgs::get_string(s, "key-type", key_type_str, &key_type_str);
+  RESTArgs::get_bool(s, "generate-key", true, &gen_key);
+
+  op_state.set_user_id(uid);
+  op_state.set_subuser(subuser);
+  op_state.set_access_key(access_key);
+  op_state.set_secret_key(secret_key);
+
+  if (gen_key)
+    op_state.set_generate_key();
+
+  if (!key_type_str.empty()) {
+    int32_t key_type = KEY_TYPE_UNDEFINED;
+    if (key_type_str.compare("swift") == 0)
+      key_type = KEY_TYPE_SWIFT;
+    else if (key_type_str.compare("s3") == 0)
+      key_type = KEY_TYPE_S3;
+
+    op_state.set_key_type(key_type);
+  }
+
+  op_ret = RGWUserAdminOp_Key::create(s, driver, op_state, flusher, y);
+}
+
+// DELETE /admin/user?key - remove an access key from a user or subuser.
+class RGWOp_Key_Remove : public RGWRESTOp {
+
+public:
+  RGWOp_Key_Remove() {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("users", RGW_CAP_WRITE);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "remove_access_key"; }
+};
+
+// NOTE(review): like key create, this op is not forwarded to the metadata
+// master zone here — confirm that is intentional.
+void RGWOp_Key_Remove::execute(optional_yield y)
+{
+  std::string uid_str;
+  std::string subuser;
+  std::string access_key;
+  std::string key_type_str;
+
+  RGWUserAdminOpState op_state(driver);
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
+  RESTArgs::get_string(s, "subuser", subuser, &subuser);
+  RESTArgs::get_string(s, "access-key", access_key, &access_key);
+  RESTArgs::get_string(s, "key-type", key_type_str, &key_type_str);
+
+  op_state.set_user_id(uid);
+  op_state.set_subuser(subuser);
+  op_state.set_access_key(access_key);
+
+  if (!key_type_str.empty()) {
+    int32_t key_type = KEY_TYPE_UNDEFINED;
+    if (key_type_str.compare("swift") == 0)
+      key_type = KEY_TYPE_SWIFT;
+    else if (key_type_str.compare("s3") == 0)
+      key_type = KEY_TYPE_S3;
+
+    op_state.set_key_type(key_type);
+  }
+
+  op_ret = RGWUserAdminOp_Key::remove(s, driver, op_state, flusher, y);
+}
+
+// PUT /admin/user?caps - add admin capabilities to a user.
+class RGWOp_Caps_Add : public RGWRESTOp {
+
+public:
+  RGWOp_Caps_Add() {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("users", RGW_CAP_WRITE);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "add_user_caps"; }
+};
+
+void RGWOp_Caps_Add::execute(optional_yield y)
+{
+  std::string uid_str;
+  std::string caps;
+
+  RGWUserAdminOpState op_state(driver);
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
+  RESTArgs::get_string(s, "user-caps", caps, &caps);
+
+  op_state.set_user_id(uid);
+  op_state.set_caps(caps);
+
+  // the mutation is first forwarded to the metadata master zone
+  bufferlist data;
+  op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
+    return;
+  }
+  op_ret = RGWUserAdminOp_Caps::add(s, driver, op_state, flusher, y);
+}
+
+// DELETE /admin/user?caps - remove admin capabilities from a user.
+class RGWOp_Caps_Remove : public RGWRESTOp {
+
+public:
+  RGWOp_Caps_Remove() {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("users", RGW_CAP_WRITE);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "remove_user_caps"; }
+};
+
+void RGWOp_Caps_Remove::execute(optional_yield y)
+{
+  std::string uid_str;
+  std::string caps;
+
+  RGWUserAdminOpState op_state(driver);
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  rgw_user uid(uid_str);
+
+  RESTArgs::get_string(s, "user-caps", caps, &caps);
+
+  op_state.set_user_id(uid);
+  op_state.set_caps(caps);
+
+  // the mutation is first forwarded to the metadata master zone
+  bufferlist data;
+  op_ret = driver->forward_request_to_master(s, s->user.get(), nullptr, data, nullptr, s->info, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
+    return;
+  }
+  op_ret = RGWUserAdminOp_Caps::remove(s, driver, op_state, flusher, y);
+}
+
+// Aggregate of a user's bucket and user quotas with JSON (de)serialization
+// helpers, used by the quota GET/PUT ops below.
+struct UserQuotas {
+  RGWQuota quota;
+
+  UserQuotas() {}
+
+  explicit UserQuotas(RGWUserInfo& info){
+    quota.bucket_quota = info.quota.bucket_quota;
+    quota.user_quota = info.quota.user_quota;
+  }
+
+  void dump(Formatter *f) const {
+    encode_json("bucket_quota", quota.bucket_quota, f);
+    encode_json("user_quota", quota.user_quota, f);
+  }
+  void decode_json(JSONObj *obj) {
+    JSONDecoder::decode_json("bucket_quota", quota.bucket_quota, obj);
+    JSONDecoder::decode_json("user_quota", quota.user_quota, obj);
+  }
+};
+
+// GET /admin/user?quota - read a user's quota configuration; "quota-type"
+// selects "user", "bucket", or (when empty) both.
+class RGWOp_Quota_Info : public RGWRESTOp {
+
+public:
+  RGWOp_Quota_Info() {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("users", RGW_CAP_READ);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "get_quota_info"; }
+};
+
+
+void RGWOp_Quota_Info::execute(optional_yield y)
+{
+  RGWUserAdminOpState op_state(driver);
+
+  std::string uid_str;
+  std::string quota_type;
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  RESTArgs::get_string(s, "quota-type", quota_type, &quota_type);
+
+  if (uid_str.empty()) {
+    op_ret = -EINVAL;
+    return;
+  }
+
+  rgw_user uid(uid_str);
+
+  // empty quota-type returns both quotas; otherwise only the requested one
+  bool show_all = quota_type.empty();
+  bool show_bucket = show_all || (quota_type == "bucket");
+  bool show_user = show_all || (quota_type == "user");
+
+  // any other quota-type value leaves all three flags false -> reject
+  if (!(show_all || show_bucket || show_user)) {
+    op_ret = -EINVAL;
+    return;
+  }
+
+  op_state.set_user_id(uid);
+
+  RGWUser user;
+  op_ret = user.init(s, driver, op_state, y);
+  if (op_ret < 0)
+    return;
+
+  if (!op_state.has_existing_user()) {
+    op_ret = -ERR_NO_SUCH_USER;
+    return;
+  }
+
+  RGWUserInfo info;
+  string err_msg;
+  op_ret = user.info(info, &err_msg);
+  if (op_ret < 0)
+    return;
+
+  flusher.start(0);
+  if (show_all) {
+    UserQuotas quotas(info);
+    encode_json("quota", quotas, s->formatter);
+  } else if (show_user) {
+    encode_json("user_quota", info.quota.user_quota, s->formatter);
+  } else {
+    encode_json("bucket_quota", info.quota.bucket_quota, s->formatter);
+  }
+
+  flusher.flush();
+}
+
+// PUT /admin/user?quota - set a user's quota configuration (see the
+// comment block below for the accepted input forms).
+class RGWOp_Quota_Set : public RGWRESTOp {
+
+public:
+  RGWOp_Quota_Set() {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("users", RGW_CAP_WRITE);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "set_quota_info"; }
+};
+
+/**
+ * set quota
+ *
+ * two different ways to set the quota info: as json struct in the message body or via http params.
+ *
+ * as json:
+ *
+ * PUT /admin/user?uid=<uid>[&quota-type=<type>]
+ *
+ * whereas quota-type is optional and is either user, or bucket
+ *
+ * if quota-type is not specified then we expect to get a structure that contains both quotas,
+ * otherwise we'll only get the relevant configuration.
+ *
+ * E.g., if quota type not specified:
+ * {
+ * "user_quota" : {
+ * "max_size_kb" : 4096,
+ * "max_objects" : -1,
+ * "enabled" : false
+ * },
+ * "bucket_quota" : {
+ * "max_size_kb" : 1024,
+ * "max_objects" : -1,
+ * "enabled" : true
+ * }
+ * }
+ *
+ *
+ * or if quota type is specified:
+ * {
+ * "max_size_kb" : 4096,
+ * "max_objects" : -1,
+ * "enabled" : false
+ * }
+ *
+ * Another option is not to pass any body and set the following http params:
+ *
+ *
+ * max-size-kb=<size>
+ * max-objects=<max objects>
+ * enabled[={true,false}]
+ *
+ * all params are optional and default to the current settings. With this type of configuration the
+ * quota-type param is mandatory.
+ *
+ */
+
+void RGWOp_Quota_Set::execute(optional_yield y)
+{
+  RGWUserAdminOpState op_state(driver);
+
+  std::string uid_str;
+  std::string quota_type;
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  RESTArgs::get_string(s, "quota-type", quota_type, &quota_type);
+
+  if (uid_str.empty()) {
+    op_ret = -EINVAL;
+    return;
+  }
+
+  rgw_user uid(uid_str);
+
+  // empty quota-type means "set both quotas" (the body must carry both)
+  bool set_all = quota_type.empty();
+  bool set_bucket = set_all || (quota_type == "bucket");
+  bool set_user = set_all || (quota_type == "user");
+
+  if (!(set_all || set_bucket || set_user)) {
+    ldpp_dout(this, 20) << "invalid quota type" << dendl;
+    op_ret = -EINVAL;
+    return;
+  }
+
+  // decide between a JSON body and http params: a body is assumed when a
+  // content length is given or the transfer encoding is chunked
+  bool use_http_params;
+
+  if (s->content_length > 0) {
+    use_http_params = false;
+  } else {
+    const char *encoding = s->info.env->get("HTTP_TRANSFER_ENCODING");
+    use_http_params = (!encoding || strcmp(encoding, "chunked") != 0);
+  }
+
+  if (use_http_params && set_all) {
+    ldpp_dout(this, 20) << "quota type was not specified, can't set all quotas via http headers" << dendl;
+    op_ret = -EINVAL;
+    return;
+  }
+
+  op_state.set_user_id(uid);
+
+  RGWUser user;
+  op_ret = user.init(s, driver, op_state, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 20) << "failed initializing user info: " << op_ret << dendl;
+    return;
+  }
+
+  if (!op_state.has_existing_user()) {
+    op_ret = -ERR_NO_SUCH_USER;
+    return;
+  }
+
+#define QUOTA_INPUT_MAX_LEN 1024
+  if (set_all) {
+    UserQuotas quotas;
+
+    if ((op_ret = get_json_input(driver->ctx(), s, quotas, QUOTA_INPUT_MAX_LEN, NULL)) < 0) {
+      ldpp_dout(this, 20) << "failed to retrieve input" << dendl;
+      return;
+    }
+
+    op_state.set_user_quota(quotas.quota.user_quota);
+    op_state.set_bucket_quota(quotas.quota.bucket_quota);
+  } else {
+    RGWQuotaInfo quota;
+
+    if (!use_http_params) {
+      bool empty;
+      op_ret = get_json_input(driver->ctx(), s, quota, QUOTA_INPUT_MAX_LEN, &empty);
+      if (op_ret < 0) {
+        ldpp_dout(this, 20) << "failed to retrieve input" << dendl;
+        if (!empty)
+          return;
+
+        /* was probably chunked input, but no content provided, configure via http params */
+        use_http_params = true;
+      }
+    }
+
+    if (use_http_params) {
+      // start from the user's current quota so params that were not
+      // supplied keep their existing values
+      RGWUserInfo info;
+      string err_msg;
+      op_ret = user.info(info, &err_msg);
+      if (op_ret < 0) {
+        ldpp_dout(this, 20) << "failed to get user info: " << op_ret << dendl;
+        return;
+      }
+      RGWQuotaInfo *old_quota;
+      if (set_user) {
+        old_quota = &info.quota.user_quota;
+      } else {
+        old_quota = &info.quota.bucket_quota;
+      }
+
+      RESTArgs::get_int64(s, "max-objects", old_quota->max_objects, &quota.max_objects);
+      RESTArgs::get_int64(s, "max-size", old_quota->max_size, &quota.max_size);
+      int64_t max_size_kb;
+      bool has_max_size_kb = false;
+      RESTArgs::get_int64(s, "max-size-kb", 0, &max_size_kb, &has_max_size_kb);
+      if (has_max_size_kb) {
+        // "max-size-kb" takes precedence over "max-size" when both appear
+        quota.max_size = max_size_kb * 1024;
+      }
+      RESTArgs::get_bool(s, "enabled", old_quota->enabled, &quota.enabled);
+    }
+
+    if (set_user) {
+      op_state.set_user_quota(quota);
+    } else {
+      op_state.set_bucket_quota(quota);
+    }
+  }
+
+  string err;
+  op_ret = user.modify(s, op_state, y, &err);
+  if (op_ret < 0) {
+    ldpp_dout(this, 20) << "failed updating user info: " << op_ret << ": " << err << dendl;
+    return;
+  }
+}
+
+// GET dispatch: ?quota -> quota info, ?list -> user listing,
+// otherwise single-user info.
+RGWOp *RGWHandler_User::op_get()
+{
+  if (s->info.args.sub_resource_exists("quota"))
+    return new RGWOp_Quota_Info;
+
+  if (s->info.args.sub_resource_exists("list"))
+    return new RGWOp_User_List;
+
+  return new RGWOp_User_Info;
+}
+
+// PUT dispatch: subresources select subuser/key/caps/quota creation or
+// update; the bare resource creates a user.
+RGWOp *RGWHandler_User::op_put()
+{
+  if (s->info.args.sub_resource_exists("subuser"))
+    return new RGWOp_Subuser_Create;
+
+  if (s->info.args.sub_resource_exists("key"))
+    return new RGWOp_Key_Create;
+
+  if (s->info.args.sub_resource_exists("caps"))
+    return new RGWOp_Caps_Add;
+
+  if (s->info.args.sub_resource_exists("quota"))
+    return new RGWOp_Quota_Set;
+
+  return new RGWOp_User_Create;
+}
+
+// POST dispatch: ?subuser modifies a subuser, otherwise the user itself.
+RGWOp *RGWHandler_User::op_post()
+{
+  if (s->info.args.sub_resource_exists("subuser"))
+    return new RGWOp_Subuser_Modify;
+
+  return new RGWOp_User_Modify;
+}
+
+// DELETE dispatch: subresources remove subuser/key/caps; the bare
+// resource removes the user.
+RGWOp *RGWHandler_User::op_delete()
+{
+  if (s->info.args.sub_resource_exists("subuser"))
+    return new RGWOp_Subuser_Remove;
+
+  if (s->info.args.sub_resource_exists("key"))
+    return new RGWOp_Key_Remove;
+
+  if (s->info.args.sub_resource_exists("caps"))
+    return new RGWOp_Caps_Remove;
+
+  return new RGWOp_User_Remove;
+}
+
+
diff --git a/src/rgw/driver/rados/rgw_rest_user.h b/src/rgw/driver/rados/rgw_rest_user.h
new file mode 100644
index 000000000..ee585be45
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_rest_user.h
@@ -0,0 +1,36 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_rest.h"
+#include "rgw_rest_s3.h"
+
+
+// Handler for /admin/user requests; the individual admin ops perform their
+// own capability checks (check_caps), so generic permission reading is a
+// no-op here.
+class RGWHandler_User : public RGWHandler_Auth_S3 {
+protected:
+  RGWOp *op_get() override;
+  RGWOp *op_put() override;
+  RGWOp *op_post() override;
+  RGWOp *op_delete() override;
+public:
+  using RGWHandler_Auth_S3::RGWHandler_Auth_S3;
+  ~RGWHandler_User() override = default;
+
+  int read_permissions(RGWOp*, optional_yield) override {
+    return 0;
+  }
+};
+
+// REST manager producing RGWHandler_User for the /admin/user resource.
+class RGWRESTMgr_User : public RGWRESTMgr {
+public:
+  RGWRESTMgr_User() = default;
+  ~RGWRESTMgr_User() override = default;
+
+  RGWHandler_REST *get_handler(rgw::sal::Driver* driver,
+                               req_state*,
+                               const rgw::auth::StrategyRegistry& auth_registry,
+                               const std::string&) override {
+    return new RGWHandler_User(auth_registry);
+  }
+};
diff --git a/src/rgw/driver/rados/rgw_sal_rados.cc b/src/rgw/driver/rados/rgw_sal_rados.cc
new file mode 100644
index 000000000..9acdb79d3
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_sal_rados.cc
@@ -0,0 +1,3846 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <errno.h>
+#include <stdlib.h>
+#include <system_error>
+#include <filesystem>
+#include <unistd.h>
+#include <sstream>
+#include <boost/algorithm/string.hpp>
+#include <boost/process.hpp>
+
+#include "common/Clock.h"
+#include "common/errno.h"
+
+#include "rgw_sal.h"
+#include "rgw_sal_rados.h"
+#include "rgw_bucket.h"
+#include "rgw_multi.h"
+#include "rgw_acl.h"
+#include "rgw_acl_s3.h"
+#include "rgw_aio.h"
+#include "rgw_aio_throttle.h"
+#include "rgw_tracer.h"
+
+#include "rgw_zone.h"
+#include "rgw_rest_conn.h"
+#include "rgw_service.h"
+#include "rgw_lc.h"
+#include "rgw_lc_tier.h"
+#include "rgw_rest_admin.h"
+#include "rgw_rest_bucket.h"
+#include "rgw_rest_metadata.h"
+#include "rgw_rest_log.h"
+#include "rgw_rest_config.h"
+#include "rgw_rest_ratelimit.h"
+#include "rgw_rest_realm.h"
+#include "rgw_rest_user.h"
+#include "services/svc_sys_obj.h"
+#include "services/svc_meta.h"
+#include "services/svc_meta_be_sobj.h"
+#include "services/svc_cls.h"
+#include "services/svc_zone.h"
+#include "services/svc_tier_rados.h"
+#include "services/svc_quota.h"
+#include "services/svc_config_key.h"
+#include "services/svc_zone_utils.h"
+#include "services/svc_role_rados.h"
+#include "services/svc_user.h"
+#include "cls/rgw/cls_rgw_client.h"
+
+#include "rgw_pubsub.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+static string mp_ns = RGW_OBJ_NS_MULTIPART;
+
+namespace rgw::sal {
+
+// default number of entries to list with each bucket listing call
+// (use marker to bridge between calls)
+static constexpr size_t listing_max_entries = 1000;
+static std::string pubsub_oid_prefix = "pubsub.";
+
+static int decode_policy(CephContext* cct,
+ bufferlist& bl,
+ RGWAccessControlPolicy* policy) // decode an ACL policy blob into *policy; returns -EIO on malformed data
+{
+ auto iter = bl.cbegin();
+ try {
+ policy->decode(iter);
+ } catch (buffer::error& err) {
+ ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
+ return -EIO;
+ }
+ if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 15>()) {
+ ldout(cct, 15) << __func__ << " Read AccessControlPolicy"; // at debug >=15, dump the decoded policy as S3 XML
+ RGWAccessControlPolicy_S3* s3policy = static_cast<RGWAccessControlPolicy_S3 *>(policy);
+ s3policy->to_xml(*_dout);
+ *_dout << dendl;
+ }
+ return 0;
+}
+
+static int rgw_op_get_bucket_policy_from_attr(const DoutPrefixProvider* dpp,
+ RadosStore* store,
+ User* user,
+ Attrs& bucket_attrs,
+ RGWAccessControlPolicy* policy,
+ optional_yield y) // read the bucket ACL from RGW_ATTR_ACL; if absent, synthesize a default policy owned by 'user'
+{
+ auto aiter = bucket_attrs.find(RGW_ATTR_ACL);
+
+ if (aiter != bucket_attrs.end()) {
+ int ret = decode_policy(store->ctx(), aiter->second, policy);
+ if (ret < 0)
+ return ret;
+ } else {
+ ldout(store->ctx(), 0) << "WARNING: couldn't find acl header for bucket, generating default" << dendl;
+ /* object exists, but policy is broken */
+ int r = user->load_user(dpp, y); // need the user's id/display name to build the default owner
+ if (r < 0)
+ return r;
+
+ policy->create_default(user->get_id(), user->get_display_name());
+ }
+ return 0;
+}
+
+static int drain_aio(std::list<librados::AioCompletion*>& handles) // wait for and release every pending aio completion; returns the last error seen (0 if none)
+{
+ int ret = 0;
+ while (!handles.empty()) {
+ librados::AioCompletion* handle = handles.front();
+ handles.pop_front();
+ handle->wait_for_complete();
+ int r = handle->get_return_value();
+ handle->release(); // always release, even on error, to avoid leaking the completion
+ if (r < 0) {
+ ret = r;
+ }
+ }
+ return ret;
+}
+
+int RadosCompletions::drain() // flush this batch of queued rados completions
+{
+ return drain_aio(handles);
+}
+
+int RadosUser::list_buckets(const DoutPrefixProvider* dpp, const std::string& marker,
+ const std::string& end_marker, uint64_t max, bool need_stats,
+ BucketList &buckets, optional_yield y) // paginated listing of this user's buckets, wrapped as RadosBucket handles
+{
+ RGWUserBuckets ulist;
+ bool is_truncated = false;
+ int ret;
+
+ buckets.clear();
+ ret = store->ctl()->user->list_buckets(dpp, info.user_id, marker, end_marker, max,
+ need_stats, &ulist, &is_truncated, y);
+ if (ret < 0)
+ return ret;
+
+ buckets.set_truncated(is_truncated); // caller resumes with 'marker' when truncated
+ for (const auto& ent : ulist.get_buckets()) {
+ buckets.add(std::unique_ptr<Bucket>(new RadosBucket(this->store, ent.second, this)));
+ }
+
+ return 0;
+}
+
+int RadosUser::create_bucket(const DoutPrefixProvider* dpp,
+ const rgw_bucket& b,
+ const std::string& zonegroup_id,
+ rgw_placement_rule& placement_rule,
+ std::string& swift_ver_location,
+ const RGWQuotaInfo * pquota_info,
+ const RGWAccessControlPolicy& policy,
+ Attrs& attrs,
+ RGWBucketInfo& info,
+ obj_version& ep_objv,
+ bool exclusive,
+ bool obj_lock_enabled,
+ bool* existed,
+ req_info& req_info,
+ std::unique_ptr<Bucket>* bucket_out,
+ optional_yield y) // create a bucket for this user, or adopt an existing one; *existed reports whether it was already there
+{
+ int ret;
+ bufferlist in_data;
+ RGWBucketInfo master_info;
+ rgw_bucket* pmaster_bucket;
+ uint32_t* pmaster_num_shards;
+ real_time creation_time;
+ std::unique_ptr<Bucket> bucket;
+ obj_version objv,* pobjv = NULL;
+
+ /* If it exists, look it up; otherwise create it */
+ ret = store->get_bucket(dpp, this, b, &bucket, y);
+ if (ret < 0 && ret != -ENOENT)
+ return ret;
+
+ if (ret != -ENOENT) {
+ RGWAccessControlPolicy old_policy(store->ctx());
+ *existed = true;
+ if (swift_ver_location.empty()) {
+ swift_ver_location = bucket->get_info().swift_ver_location;
+ }
+ placement_rule.inherit_from(bucket->get_info().placement_rule);
+
+ // don't allow changes to the acl policy
+ int r = rgw_op_get_bucket_policy_from_attr(dpp, store, this, bucket->get_attrs(),
+ &old_policy, y);
+ if (r >= 0 && old_policy != policy) { // re-create with a different ACL is a name conflict
+ bucket_out->swap(bucket);
+ return -EEXIST;
+ }
+ } else {
+ bucket = std::unique_ptr<Bucket>(new RadosBucket(store, b, this));
+ *existed = false;
+ bucket->set_attrs(attrs);
+ }
+
+ if (!store->svc()->zone->is_meta_master()) { // multisite: the metadata master performs the actual creation; mirror its result locally
+ JSONParser jp;
+ ret = store->forward_request_to_master(dpp, this, NULL, in_data, &jp, req_info, y);
+ if (ret < 0) {
+ return ret;
+ }
+
+ JSONDecoder::decode_json("entry_point_object_ver", ep_objv, &jp);
+ JSONDecoder::decode_json("object_ver", objv, &jp);
+ JSONDecoder::decode_json("bucket_info", master_info, &jp);
+ ldpp_dout(dpp, 20) << "parsed: objv.tag=" << objv.tag << " objv.ver=" << objv.ver << dendl;
+ std::time_t ctime = ceph::real_clock::to_time_t(master_info.creation_time);
+ ldpp_dout(dpp, 20) << "got creation time: << " << std::put_time(std::localtime(&ctime), "%F %T") << dendl;
+ pmaster_bucket= &master_info.bucket;
+ creation_time = master_info.creation_time;
+ pmaster_num_shards = &master_info.layout.current_index.layout.normal.num_shards;
+ pobjv = &objv;
+ if (master_info.obj_lock_enabled()) {
+ info.flags = BUCKET_VERSIONED | BUCKET_OBJ_LOCK_ENABLED;
+ }
+ } else {
+ pmaster_bucket = NULL;
+ pmaster_num_shards = NULL;
+ if (obj_lock_enabled)
+ info.flags = BUCKET_VERSIONED | BUCKET_OBJ_LOCK_ENABLED; // object lock requires versioning
+ }
+
+ std::string zid = zonegroup_id;
+ if (zid.empty()) {
+ zid = store->svc()->zone->get_zonegroup().get_id(); // default to the local zonegroup
+ }
+
+ if (*existed) {
+ rgw_placement_rule selected_placement_rule;
+ ret = store->svc()->zone->select_bucket_placement(dpp, this->get_info(),
+ zid, placement_rule,
+ &selected_placement_rule, nullptr, y);
+ if (selected_placement_rule != info.placement_rule) { // NOTE(review): ret is not checked before using selected_placement_rule — confirm intended
+ ret = -EEXIST;
+ bucket_out->swap(bucket);
+ return ret;
+ }
+ } else {
+
+ ret = store->getRados()->create_bucket(this->get_info(), bucket->get_key(),
+ zid, placement_rule, swift_ver_location, pquota_info,
+ attrs, info, pobjv, &ep_objv, creation_time,
+ pmaster_bucket, pmaster_num_shards, y, dpp,
+ exclusive);
+ if (ret == -EEXIST) {
+ *existed = true;
+ /* bucket already existed, might have raced with another bucket creation,
+ * or might be partial bucket creation that never completed. Read existing
+ * bucket info, verify that the reported bucket owner is the current user.
+ * If all is ok then update the user's list of buckets. Otherwise inform
+ * client about a name conflict.
+ */
+ if (info.owner.compare(this->get_id()) != 0) {
+ return -EEXIST;
+ }
+ ret = 0;
+ } else if (ret != 0) {
+ return ret;
+ }
+ }
+
+ bucket->set_version(ep_objv);
+ bucket->get_info() = info;
+
+ RadosBucket* rbucket = static_cast<RadosBucket*>(bucket.get());
+ ret = rbucket->link(dpp, this, y, false); // attach the bucket to this user's bucket list
+ if (ret && !*existed && ret != -EEXIST) {
+ /* if it exists (or previously existed), don't remove it! */
+ ret = rbucket->unlink(dpp, this, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "WARNING: failed to unlink bucket: ret=" << ret
+ << dendl;
+ }
+ } else if (ret == -EEXIST || (ret == 0 && *existed)) {
+ ret = -ERR_BUCKET_EXISTS;
+ }
+
+ bucket_out->swap(bucket);
+
+ return ret;
+}
+
+int RadosUser::read_attrs(const DoutPrefixProvider* dpp, optional_yield y) // load this user's xattrs into 'attrs', tracking the object version
+{
+ return store->ctl()->user->get_attrs_by_uid(dpp, get_id(), &attrs, y, &objv_tracker);
+}
+
+int RadosUser::merge_and_store_attrs(const DoutPrefixProvider* dpp, Attrs& new_attrs, optional_yield y) // overlay new_attrs onto the cached attrs, then persist the user record
+{
+ for(auto& it : new_attrs) {
+ attrs[it.first] = it.second;
+ }
+ return store_user(dpp, y, false);
+}
+
+int RadosUser::read_stats(const DoutPrefixProvider *dpp,
+ optional_yield y, RGWStorageStats* stats,
+ ceph::real_time* last_stats_sync,
+ ceph::real_time* last_stats_update) // synchronous per-user storage stats read
+{
+ return store->ctl()->user->read_stats(dpp, get_id(), stats, y, last_stats_sync, last_stats_update);
+}
+
+int RadosUser::read_stats_async(const DoutPrefixProvider *dpp, RGWGetUserStats_CB* cb) // async variant; 'cb' fires when stats are available
+{
+ return store->svc()->user->read_stats_async(dpp, get_id(), cb);
+}
+
+int RadosUser::complete_flush_stats(const DoutPrefixProvider *dpp, optional_yield y) // flush pending user stats updates to rados
+{
+ return store->svc()->user->complete_flush_stats(dpp, get_id(), y);
+}
+
+int RadosUser::read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch,
+ uint32_t max_entries, bool* is_truncated,
+ RGWUsageIter& usage_iter,
+ map<rgw_user_bucket, rgw_usage_log_entry>& usage) // read usage log entries for this user across all buckets (empty bucket_name = no filter)
+{
+ std::string bucket_name;
+ return store->getRados()->read_usage(dpp, get_id(), bucket_name, start_epoch,
+ end_epoch, max_entries, is_truncated,
+ usage_iter, usage);
+}
+
+int RadosUser::trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) // drop usage log entries for this user in [start_epoch, end_epoch)
+{
+ std::string bucket_name;
+
+ return store->getRados()->trim_usage(dpp, get_id(), bucket_name, start_epoch, end_epoch);
+}
+
+int RadosUser::load_user(const DoutPrefixProvider* dpp, optional_yield y) // (re)load user info + attrs by uid, refreshing the version tracker
+{
+ return store->ctl()->user->get_info_by_uid(dpp, info.user_id, &info, y, RGWUserCtl::GetParams().set_objv_tracker(&objv_tracker).set_attrs(&attrs));
+}
+
+int RadosUser::store_user(const DoutPrefixProvider* dpp, optional_yield y, bool exclusive, RGWUserInfo* old_info) // persist user info + attrs; 'exclusive' fails if the record already exists
+{
+ return store->ctl()->user->store_info(dpp, info, y,
+ RGWUserCtl::PutParams().set_objv_tracker(&objv_tracker)
+ .set_exclusive(exclusive)
+ .set_attrs(&attrs)
+ .set_old_info(old_info));
+}
+
+int RadosUser::remove_user(const DoutPrefixProvider* dpp, optional_yield y) // delete the user record (version-checked via objv_tracker)
+{
+ return store->ctl()->user->remove_info(dpp, info, y,
+ RGWUserCtl::RemoveParams().set_objv_tracker(&objv_tracker));
+}
+
+int RadosUser::verify_mfa(const std::string& mfa_str, bool* verified,
+ const DoutPrefixProvider* dpp, optional_yield y) // validate "serial pin" against the user's registered MFA devices; *verified set only on success
+{
+ vector<string> params;
+ get_str_vec(mfa_str, " ", params);
+
+ if (params.size() != 2) { // expect exactly "serial pin"
+ ldpp_dout(dpp, 5) << "NOTICE: invalid mfa string provided: " << mfa_str << dendl;
+ return -EINVAL;
+ }
+
+ string& serial = params[0];
+ string& pin = params[1];
+
+ auto i = info.mfa_ids.find(serial);
+ if (i == info.mfa_ids.end()) {
+ ldpp_dout(dpp, 5) << "NOTICE: user does not have mfa device with serial=" << serial << dendl;
+ return -EACCES;
+ }
+
+ int ret = store->svc()->cls->mfa.check_mfa(dpp, info.user_id, serial, pin, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 20) << "NOTICE: failed to check MFA, serial=" << serial << dendl;
+ return -EACCES; // any backend failure is treated as access denied
+ }
+
+ *verified = true;
+
+ return 0;
+}
+
+RadosBucket::~RadosBucket() {}
+
+int RadosBucket::remove_bucket(const DoutPrefixProvider* dpp,
+ bool delete_children,
+ bool forward_to_master,
+ req_info* req_info,
+ optional_yield y) // delete this bucket; with delete_children, first delete every object (versions included)
+{
+ int ret;
+
+ // Refresh info
+ ret = load_bucket(dpp, y);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ListParams params;
+ params.list_versions = true;
+ params.allow_unordered = true; // ordering is irrelevant for deletion; unordered listing is cheaper
+
+ ListResults results;
+
+ do {
+ results.objs.clear();
+
+ ret = list(dpp, params, 1000, results, y);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (!results.objs.empty() && !delete_children) {
+ ldpp_dout(dpp, -1) << "ERROR: could not remove non-empty bucket " << info.bucket.name <<
+ dendl;
+ return -ENOTEMPTY;
+ }
+
+ for (const auto& obj : results.objs) {
+ rgw_obj_key key(obj.key);
+ /* xxx dang */
+ ret = rgw_remove_object(dpp, store, this, key);
+ if (ret < 0 && ret != -ENOENT) { // -ENOENT tolerated: object may have raced away
+ return ret;
+ }
+ }
+ } while(results.is_truncated);
+
+ ret = abort_multiparts(dpp, store->ctx()); // also discard incomplete multipart uploads
+ if (ret < 0) {
+ return ret;
+ }
+
+ // remove lifecycle config, if any (XXX note could be made generic)
+ (void) store->getRados()->get_lc()->remove_bucket_config(
+ this, get_attrs());
+
+ ret = store->ctl()->bucket->sync_user_stats(dpp, info.owner, info, y, nullptr);
+ if (ret < 0) {
+ ldout(store->ctx(), 1) << "WARNING: failed sync user stats before bucket delete. ret=" << ret << dendl;
+ }
+
+ RGWObjVersionTracker ot;
+
+ // if we deleted children above we will force delete, as any that
+ // remain is detritus from a prior bug
+ ret = store->getRados()->delete_bucket(info, ot, y, dpp, !delete_children);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: could not remove bucket " <<
+ info.bucket.name << dendl;
+ return ret;
+ }
+
+ // if bucket has notification definitions associated with it
+ // they should be removed (note that any pending notifications on the bucket are still going to be sent)
+ const RGWPubSub ps(store, info.owner.tenant);
+ const RGWPubSub::Bucket ps_bucket(ps, this);
+ const auto ps_ret = ps_bucket.remove_notifications(dpp, y);
+ if (ps_ret < 0 && ps_ret != -ENOENT) { // best-effort: log but don't fail the delete
+ ldpp_dout(dpp, -1) << "ERROR: unable to remove notifications from bucket. ret=" << ps_ret << dendl;
+ }
+
+ ret = store->ctl()->bucket->unlink_bucket(info.owner, info.bucket, y, dpp, false);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: unable to remove user bucket information" << dendl;
+ }
+
+ if (forward_to_master) { // multisite: propagate the delete to the metadata master
+ bufferlist in_data;
+ ret = store->forward_request_to_master(dpp, owner, &ot.read_version, in_data, nullptr, *req_info, y);
+ if (ret < 0) {
+ if (ret == -ENOENT) {
+ /* adjust error, we want to return with NoSuchBucket and not
+ * NoSuchKey */
+ ret = -ERR_NO_SUCH_BUCKET;
+ }
+ return ret;
+ }
+ }
+
+ return ret;
+}
+
+int RadosBucket::remove_bucket_bypass_gc(int concurrent_max, bool
+ keep_index_consistent,
+ optional_yield y, const
+ DoutPrefixProvider *dpp)
+{ // Forcibly delete the bucket and all objects, deleting rados objects directly instead of scheduling GC; concurrent_max bounds in-flight async deletes
+ int ret;
+ map<RGWObjCategory, RGWStorageStats> stats;
+ // (removed unused local 'common_prefixes'; it was never referenced)
+ RGWObjectCtx obj_ctx(store);
+ CephContext *cct = store->ctx();
+
+ string bucket_ver, master_ver;
+
+ ret = load_bucket(dpp, y); // refresh bucket info before acting on it
+ if (ret < 0)
+ return ret;
+
+ const auto& index = info.get_current_index();
+ ret = read_stats(dpp, index, RGW_NO_SHARD, &bucket_ver, &master_ver, stats, NULL);
+ if (ret < 0)
+ return ret;
+
+ ret = abort_multiparts(dpp, cct); // drop incomplete multipart uploads first
+ if (ret < 0) {
+ return ret;
+ }
+
+ rgw::sal::Bucket::ListParams params;
+ rgw::sal::Bucket::ListResults results;
+
+ params.list_versions = true;
+ params.allow_unordered = true;
+
+ std::list<librados::AioCompletion*> handles; // outstanding async delete completions
+
+ int max_aio = concurrent_max;
+ results.is_truncated = true;
+
+ while (results.is_truncated) {
+ ret = list(dpp, params, listing_max_entries, results, y);
+ if (ret < 0)
+ return ret;
+
+ std::vector<rgw_bucket_dir_entry>::iterator it = results.objs.begin();
+ for (; it != results.objs.end(); ++it) {
+ RGWObjState *astate = NULL;
+ RGWObjManifest *amanifest = nullptr;
+ rgw_obj obj{get_key(), it->key};
+
+ ret = store->getRados()->get_obj_state(dpp, &obj_ctx, get_info(),
+ obj, &astate, &amanifest,
+ false, y);
+ if (ret == -ENOENT) {
+ ldpp_dout(dpp, 1) << "WARNING: cannot find obj state for obj " << obj << dendl;
+ continue; // already gone; nothing to free
+ }
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: get obj state returned with error " << ret << dendl;
+ return ret;
+ }
+
+ if (amanifest) { // delete each manifest part directly, saving the head object for last
+ RGWObjManifest& manifest = *amanifest;
+ RGWObjManifest::obj_iterator miter = manifest.obj_begin(dpp);
+ const rgw_obj head_obj = manifest.get_obj();
+ rgw_raw_obj raw_head_obj;
+ store->get_raw_obj(manifest.get_head_placement_rule(), head_obj, &raw_head_obj);
+
+ for (; miter != manifest.obj_end(dpp) && max_aio--; ++miter) {
+ if (!max_aio) { // concurrency cap reached: wait for outstanding deletes
+ ret = drain_aio(handles);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: could not drain handles as aio completion returned with " << ret << dendl;
+ return ret;
+ }
+ max_aio = concurrent_max;
+ }
+
+ rgw_raw_obj last_obj = miter.get_location().get_raw_obj(store->getRados());
+ if (last_obj == raw_head_obj) {
+ // have the head obj deleted at the end
+ continue;
+ }
+
+ ret = store->getRados()->delete_raw_obj_aio(dpp, last_obj, handles);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: delete obj aio failed with " << ret << dendl;
+ return ret;
+ }
+ } // for all shadow objs
+
+ ret = store->getRados()->delete_obj_aio(dpp, head_obj, get_info(), astate,
+ handles, keep_index_consistent, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: delete obj aio failed with " << ret << dendl;
+ return ret;
+ }
+ }
+
+ if (!max_aio) {
+ ret = drain_aio(handles);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: could not drain handles as aio completion returned with " << ret << dendl;
+ return ret;
+ }
+ max_aio = concurrent_max;
+ }
+ obj_ctx.invalidate(obj); // drop cached state for the now-deleted object
+ } // for all RGW objects in results
+ } // while is_truncated
+
+ ret = drain_aio(handles); // wait for any deletes still in flight
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: could not drain handles as aio completion returned with " << ret << dendl;
+ return ret;
+ }
+
+ ret = sync_user_stats(dpp, y); // fix: capture the return value so the check below tests this call, not the stale drain_aio result
+ if (ret < 0) {
+ ldpp_dout(dpp, 1) << "WARNING: failed sync user stats before bucket delete. ret=" << ret << dendl;
+ }
+
+ // (removed unused local 'objv_tracker'; remove_bucket() manages its own version tracker)
+
+ // this function can only be run if caller wanted children to be
+ // deleted, so we can ignore the check for children as any that
+ // remain are detritus from a prior bug
+ ret = remove_bucket(dpp, true, false, nullptr, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: could not remove bucket " << this << dendl;
+ return ret;
+ }
+
+ return ret;
+}
+
+int RadosBucket::load_bucket(const DoutPrefixProvider* dpp, optional_yield y, bool get_stats) // load bucket info/attrs, by name (entry point) or by instance id; optionally read stats into 'ent'
+{
+ int ret;
+
+ RGWSI_MetaBackend_CtxParams bectx_params = RGWSI_MetaBackend_CtxParams_SObj();
+ RGWObjVersionTracker ep_ot;
+ if (info.bucket.bucket_id.empty()) { // no instance id yet: resolve via the bucket entry point
+ ret = store->ctl()->bucket->read_bucket_info(info.bucket, &info, y, dpp,
+ RGWBucketCtl::BucketInstance::GetParams()
+ .set_mtime(&mtime)
+ .set_attrs(&attrs)
+ .set_bectx_params(bectx_params),
+ &ep_ot);
+ } else {
+ ret = store->ctl()->bucket->read_bucket_instance_info(info.bucket, &info, y, dpp,
+ RGWBucketCtl::BucketInstance::GetParams()
+ .set_mtime(&mtime)
+ .set_attrs(&attrs)
+ .set_bectx_params(bectx_params));
+ }
+ if (ret != 0) {
+ return ret;
+ }
+
+ bucket_version = ep_ot.read_version; // NOTE(review): ep_ot is only filled by the entry-point branch; the instance-id branch leaves it default — confirm intended
+
+ if (get_stats) {
+ ret = store->ctl()->bucket->read_bucket_stats(info.bucket, &ent, y, dpp);
+ }
+
+ return ret;
+}
+
+int RadosBucket::read_stats(const DoutPrefixProvider *dpp,
+ const bucket_index_layout_generation& idx_layout,
+ int shard_id, std::string* bucket_ver, std::string* master_ver,
+ std::map<RGWObjCategory, RGWStorageStats>& stats,
+ std::string* max_marker, bool* syncstopped) // synchronous per-shard bucket stats read (RGW_NO_SHARD aggregates)
+{
+ return store->getRados()->get_bucket_stats(dpp, info, idx_layout, shard_id, bucket_ver, master_ver, stats, max_marker, syncstopped);
+}
+
+int RadosBucket::read_stats_async(const DoutPrefixProvider *dpp,
+ const bucket_index_layout_generation& idx_layout,
+ int shard_id, RGWGetBucketStats_CB* ctx) // async variant; 'ctx' fires when stats are available
+{
+ return store->getRados()->get_bucket_stats_async(dpp, get_info(), idx_layout, shard_id, ctx);
+}
+
+int RadosBucket::sync_user_stats(const DoutPrefixProvider *dpp, optional_yield y) // fold this bucket's stats into the owner's user stats
+{
+ return store->ctl()->bucket->sync_user_stats(dpp, owner->get_id(), info, y, &ent);
+}
+
+int RadosBucket::update_container_stats(const DoutPrefixProvider* dpp) // refresh cached entry ('ent') from the bucket index; mirrors selected fields into 'info'
+{
+ int ret;
+ map<std::string, RGWBucketEnt> m;
+
+ m[info.bucket.name] = ent;
+ ret = store->getRados()->update_containers_stats(m, dpp);
+ if (!ret)
+ return -EEXIST; // 0 entries updated: treated as "bucket entry not found/raced" per upstream convention
+ if (ret < 0)
+ return ret;
+
+ map<std::string, RGWBucketEnt>::iterator iter = m.find(info.bucket.name);
+ if (iter == m.end())
+ return -EINVAL;
+
+ ent.count = iter->second.count;
+ ent.size = iter->second.size;
+ ent.size_rounded = iter->second.size_rounded;
+ ent.creation_time = iter->second.creation_time;
+ ent.placement_rule = std::move(iter->second.placement_rule);
+
+ info.creation_time = ent.creation_time;
+ info.placement_rule = ent.placement_rule;
+
+ return 0;
+}
+
+int RadosBucket::check_bucket_shards(const DoutPrefixProvider* dpp) // trigger dynamic resharding check based on current object count
+{
+ return store->getRados()->check_bucket_shards(info, info.bucket, get_count(), dpp);
+}
+
+int RadosBucket::link(const DoutPrefixProvider* dpp, User* new_user, optional_yield y, bool update_entrypoint, RGWObjVersionTracker* objv) // attach this bucket to new_user's bucket list (optionally rewriting the entry point)
+{
+ RGWBucketEntryPoint ep;
+ ep.bucket = info.bucket;
+ ep.owner = new_user->get_id();
+ ep.creation_time = get_creation_time();
+ ep.linked = true;
+ Attrs ep_attrs;
+ rgw_ep_info ep_data{ep, ep_attrs};
+
+ int r = store->ctl()->bucket->link_bucket(new_user->get_id(), info.bucket,
+ get_creation_time(), y, dpp, update_entrypoint,
+ &ep_data);
+ if (r < 0)
+ return r;
+
+ if (objv)
+ *objv = ep_data.ep_objv; // report the entry point's version to the caller
+
+ return r;
+}
+
+int RadosBucket::unlink(const DoutPrefixProvider* dpp, User* new_user, optional_yield y, bool update_entrypoint) // detach this bucket from the given user's bucket list
+{
+ return store->ctl()->bucket->unlink_bucket(new_user->get_id(), info.bucket, y, dpp, update_entrypoint);
+}
+
+int RadosBucket::chown(const DoutPrefixProvider* dpp, User& new_user, optional_yield y) // change ownership: unlink from current owner, then link to new_user
+{
+ std::string obj_marker;
+ int r;
+
+ if (!owner) { // need the current owner to unlink from
+ ldpp_dout(dpp, 0) << __func__ << " Cannot chown without an owner " << dendl;
+ return -EINVAL;
+ }
+
+ r = this->unlink(dpp, owner, y);
+ if (r < 0) {
+ return r;
+ }
+
+ return this->link(dpp, &new_user, y);
+}
+
+int RadosBucket::put_info(const DoutPrefixProvider* dpp, bool exclusive, ceph::real_time _mtime) // persist bucket instance info + attrs with the given mtime
+{
+ mtime = _mtime;
+ return store->getRados()->put_bucket_instance_info(info, exclusive, mtime, &attrs, dpp, null_yield);
+}
+
+/* Make sure to call get_bucket_info() if you need it first */
+bool RadosBucket::is_owner(User* user)
+{
+ return (info.owner.compare(user->get_id()) == 0);
+}
+
+int RadosBucket::check_empty(const DoutPrefixProvider* dpp, optional_yield y) // succeeds only if the bucket contains no objects
+{
+ return store->getRados()->check_bucket_empty(dpp, info, y);
+}
+
+int RadosBucket::check_quota(const DoutPrefixProvider *dpp, RGWQuota& quota, uint64_t obj_size,
+ optional_yield y, bool check_size_only) // verify the given object size fits within bucket/user quota
+{
+ return store->getRados()->check_quota(dpp, info.owner, get_key(),
+ quota, obj_size, y, check_size_only);
+}
+
+int RadosBucket::merge_and_store_attrs(const DoutPrefixProvider* dpp, Attrs& new_attrs, optional_yield y) // overlay new_attrs onto the cached attrs, then persist
+{
+ for(auto& it : new_attrs) {
+ attrs[it.first] = it.second;
+ }
+ return store->ctl()->bucket->set_bucket_instance_attrs(get_info(),
+ new_attrs, &get_info().objv_tracker, y, dpp); // NOTE(review): only new_attrs are passed down, not the merged map — confirm intended
+}
+
+int RadosBucket::try_refresh_info(const DoutPrefixProvider* dpp, ceph::real_time* pmtime) // refresh info/attrs from the backing store if stale
+{
+ return store->getRados()->try_refresh_bucket_info(info, pmtime, dpp, &attrs);
+}
+
+int RadosBucket::read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch,
+ uint32_t max_entries, bool* is_truncated,
+ RGWUsageIter& usage_iter,
+ map<rgw_user_bucket, rgw_usage_log_entry>& usage) // read usage log entries scoped to this bucket's owner + name
+{
+ return store->getRados()->read_usage(dpp, owner->get_id(), get_name(), start_epoch,
+ end_epoch, max_entries, is_truncated,
+ usage_iter, usage);
+}
+
+int RadosBucket::trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) // drop this bucket's usage log entries in [start_epoch, end_epoch)
+{
+ return store->getRados()->trim_usage(dpp, owner->get_id(), get_name(), start_epoch, end_epoch);
+}
+
+int RadosBucket::remove_objs_from_index(const DoutPrefixProvider *dpp, std::list<rgw_obj_index_key>& objs_to_unlink) // remove stale entries from the bucket index
+{
+ return store->getRados()->remove_objs_from_index(dpp, info, objs_to_unlink);
+}
+
+int RadosBucket::check_index(const DoutPrefixProvider *dpp, std::map<RGWObjCategory, RGWStorageStats>& existing_stats, std::map<RGWObjCategory, RGWStorageStats>& calculated_stats) // compare recorded vs recomputed index stats
+{
+ return store->getRados()->bucket_check_index(dpp, info, &existing_stats, &calculated_stats);
+}
+
+int RadosBucket::rebuild_index(const DoutPrefixProvider *dpp) // rebuild the bucket index from scratch
+{
+ return store->getRados()->bucket_rebuild_index(dpp, info);
+}
+
+int RadosBucket::set_tag_timeout(const DoutPrefixProvider *dpp, uint64_t timeout) // set the index tag timeout used to expire stale pending operations
+{
+ return store->getRados()->cls_obj_set_bucket_tag_timeout(dpp, info, timeout);
+}
+
+int RadosBucket::purge_instance(const DoutPrefixProvider* dpp) // delete every bucket index shard object for this instance
+{
+ int max_shards = (info.layout.current_index.layout.normal.num_shards > 0 ? info.layout.current_index.layout.normal.num_shards : 1);
+ for (int i = 0; i < max_shards; i++) {
+ RGWRados::BucketShard bs(store->getRados());
+ int shard_id = (info.layout.current_index.layout.normal.num_shards > 0 ? i : -1); // -1 means the single unsharded index object
+ int ret = bs.init(dpp, info, info.layout.current_index, shard_id);
+ if (ret < 0) {
+ cerr << "ERROR: bs.init(bucket=" << info.bucket << ", shard=" << shard_id
+ << "): " << cpp_strerror(-ret) << std::endl;
+ return ret;
+ }
+ ret = store->getRados()->bi_remove(dpp, bs);
+ if (ret < 0) {
+ cerr << "ERROR: failed to remove bucket index object: "
+ << cpp_strerror(-ret) << std::endl;
+ return ret;
+ }
+ }
+ return 0;
+}
+
+int RadosBucket::set_acl(const DoutPrefixProvider* dpp, RGWAccessControlPolicy &acl, optional_yield y) // install a new ACL: cache it, store it in RGW_ATTR_ACL, and adopt its owner as bucket owner
+{
+ bufferlist aclbl;
+
+ acls = acl;
+ acl.encode(aclbl);
+ map<string, bufferlist>& attrs = get_attrs();
+
+ attrs[RGW_ATTR_ACL] = aclbl;
+ info.owner = acl.get_owner().get_id(); // bucket owner follows the ACL owner
+
+ int r = store->ctl()->bucket->store_bucket_instance_info(info.bucket,
+ info, y, dpp,
+ RGWBucketCtl::BucketInstance::PutParams().set_attrs(&attrs));
+ if (r < 0) {
+ cerr << "ERROR: failed to set bucket owner: " << cpp_strerror(-r) << std::endl;
+ return r;
+ }
+
+ return 0;
+}
+
+std::unique_ptr<Object> RadosBucket::get_object(const rgw_obj_key& k) // wrap a key as a RadosObject handle in this bucket
+{
+ return std::make_unique<RadosObject>(this->store, k, this);
+}
+
+int RadosBucket::list(const DoutPrefixProvider* dpp, ListParams& params, int max, ListResults& results, optional_yield y) // list up to 'max' entries; advances params.marker for the next page
+{
+ RGWRados::Bucket target(store->getRados(), get_info());
+ if (params.shard_id >= 0) {
+ target.set_shard_id(params.shard_id);
+ }
+ RGWRados::Bucket::List list_op(&target);
+
+ list_op.params.prefix = params.prefix;
+ list_op.params.delim = params.delim;
+ list_op.params.marker = params.marker;
+ list_op.params.ns = params.ns;
+ list_op.params.end_marker = params.end_marker;
+ list_op.params.ns = params.ns; // NOTE(review): duplicate of the assignment above — harmless, candidate for removal
+ list_op.params.enforce_ns = params.enforce_ns;
+ list_op.params.access_list_filter = params.access_list_filter;
+ list_op.params.force_check_filter = params.force_check_filter;
+ list_op.params.list_versions = params.list_versions;
+ list_op.params.allow_unordered = params.allow_unordered;
+
+ int ret = list_op.list_objects(dpp, max, &results.objs, &results.common_prefixes, &results.is_truncated, y);
+ if (ret >= 0) {
+ results.next_marker = list_op.get_next_marker();
+ params.marker = results.next_marker; // let the caller resume directly with the same params
+ }
+
+ return ret;
+}
+
+std::unique_ptr<MultipartUpload> RadosBucket::get_multipart_upload(
+ const std::string& oid,
+ std::optional<std::string> upload_id,
+ ACLOwner owner, ceph::real_time mtime) // wrap an (oid, upload_id) pair as a multipart-upload handle
+{
+ return std::make_unique<RadosMultipartUpload>(this->store, this, oid, upload_id,
+ std::move(owner), mtime);
+}
+
+int RadosBucket::list_multiparts(const DoutPrefixProvider *dpp,
+ const string& prefix,
+ string& marker,
+ const string& delim,
+ const int& max_uploads,
+ vector<std::unique_ptr<MultipartUpload>>& uploads,
+ map<string, bool> *common_prefixes,
+ bool *is_truncated) // list in-progress multipart uploads by scanning the multipart meta namespace
+{
+ rgw::sal::Bucket::ListParams params;
+ rgw::sal::Bucket::ListResults results;
+ MultipartMetaFilter mp_filter; // restricts the listing to multipart meta objects
+
+ params.prefix = prefix;
+ params.delim = delim;
+ params.marker = marker;
+ params.ns = RGW_OBJ_NS_MULTIPART;
+ params.access_list_filter = &mp_filter;
+
+ int ret = list(dpp, params, max_uploads, results, null_yield);
+
+ if (ret < 0)
+ return ret;
+
+ if (!results.objs.empty()) {
+ for (const rgw_bucket_dir_entry& dentry : results.objs) {
+ rgw_obj_key key(dentry.key);
+ ACLOwner owner(rgw_user(dentry.meta.owner));
+ owner.set_name(dentry.meta.owner_display_name);
+ uploads.push_back(this->get_multipart_upload(key.name,
+ std::nullopt, std::move(owner), dentry.meta.mtime));
+ }
+ }
+ if (common_prefixes) {
+ *common_prefixes = std::move(results.common_prefixes);
+ }
+ *is_truncated = results.is_truncated;
+ marker = params.marker.name; // out-param: resume marker for the next page
+
+ return 0;
+}
+
+int RadosBucket::abort_multiparts(const DoutPrefixProvider* dpp,
+ CephContext* cct) // best-effort abort and cleanup of every in-progress multipart upload in this bucket
+{
+ constexpr int max = 1000;
+ int ret, num_deleted = 0;
+ vector<std::unique_ptr<MultipartUpload>> uploads;
+ string marker;
+ bool is_truncated;
+
+ const std::string empty_delim;
+ const std::string empty_prefix;
+
+ do {
+ ret = list_multiparts(dpp, empty_prefix, marker, empty_delim,
+ max, uploads, nullptr, &is_truncated);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ " ERROR : calling list_bucket_multiparts; ret=" << ret <<
+ "; bucket=\"" << this << "\"" << dendl;
+ return ret;
+ }
+ ldpp_dout(dpp, 20) << __func__ <<
+ " INFO: aborting and cleaning up multipart upload(s); bucket=\"" <<
+ this << "\"; uploads.size()=" << uploads.size() <<
+ "; is_truncated=" << is_truncated << dendl;
+
+ if (!uploads.empty()) {
+ for (const auto& upload : uploads) {
+ ret = upload->abort(dpp, cct);
+ if (ret < 0) {
+ // we're doing a best-effort; if something cannot be found,
+ // log it and keep moving forward
+ if (ret != -ENOENT && ret != -ERR_NO_SUCH_UPLOAD) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ " ERROR : failed to abort and clean-up multipart upload \"" <<
+ upload->get_meta() << "\"" << dendl;
+ return ret;
+ } else {
+ ldpp_dout(dpp, 10) << __func__ <<
+ " NOTE : unable to find part(s) of "
+ "aborted multipart upload of \"" << upload->get_meta() <<
+ "\" for cleaning up" << dendl;
+ }
+ }
+ num_deleted++;
+ }
+ if (num_deleted) {
+ ldpp_dout(dpp, 0) << __func__ <<
+ " WARNING : aborted " << num_deleted <<
+ " incomplete multipart uploads" << dendl;
+ }
+ }
+ } while (is_truncated);
+
+ return 0;
+}
+
+std::string RadosBucket::topics_oid() const { // rados oid holding this bucket's notification config: "pubsub.<tenant>.bucket.<name>/<marker>"
+ return pubsub_oid_prefix + get_tenant() + ".bucket." + get_name() + "/" + get_marker();
+}
+
+int RadosBucket::read_topics(rgw_pubsub_bucket_topics& notifications,
+ RGWObjVersionTracker* objv_tracker, optional_yield y, const DoutPrefixProvider *dpp) // load and decode the bucket's notification config from the log pool
+{
+ bufferlist bl;
+ const int ret = rgw_get_system_obj(store->svc()->sysobj,
+ store->svc()->zone->get_zone_params().log_pool,
+ topics_oid(),
+ bl,
+ objv_tracker,
+ nullptr, y, dpp, nullptr);
+ if (ret < 0) {
+ return ret;
+ }
+
+ auto iter = bl.cbegin();
+ try {
+ decode(notifications, iter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 20) << " failed to decode bucket notifications from oid: " << topics_oid() << ". for bucket: "
+ << get_name() << ". error: " << err.what() << dendl;
+ return -EIO; // corrupted blob
+ }
+
+ return 0;
+}
+
+int RadosBucket::write_topics(const rgw_pubsub_bucket_topics& notifications,
+ RGWObjVersionTracker* objv_tracker, optional_yield y, const DoutPrefixProvider *dpp) { // encode and store the bucket's notification config (non-exclusive overwrite)
+ bufferlist bl;
+ encode(notifications, bl);
+
+ return rgw_put_system_obj(dpp, store->svc()->sysobj,
+ store->svc()->zone->get_zone_params().log_pool,
+ topics_oid(),
+ bl, false, objv_tracker, real_time(), y);
+}
+
+int RadosBucket::remove_topics(RGWObjVersionTracker* objv_tracker,
+ optional_yield y, const DoutPrefixProvider *dpp) { // delete the bucket's notification config object
+ return rgw_delete_system_obj(dpp, store->svc()->sysobj,
+ store->svc()->zone->get_zone_params().log_pool,
+ topics_oid(),
+ objv_tracker, y);
+}
+
// Factory: wrap a user id in a RadosUser handle (no I/O performed;
// caller loads the user info separately if needed).
std::unique_ptr<User> RadosStore::get_user(const rgw_user &u)
{
  return std::make_unique<RadosUser>(this, u);
}

// Report the backing Ceph cluster's fsid as the cluster identifier.
std::string RadosStore::get_cluster_id(const DoutPrefixProvider* dpp, optional_yield y)
{
  return getRados()->get_cluster_fsid(dpp, y);
}
+
+int RadosStore::get_user_by_access_key(const DoutPrefixProvider* dpp, const std::string& key, optional_yield y, std::unique_ptr<User>* user)
+{
+ RGWUserInfo uinfo;
+ User* u;
+ RGWObjVersionTracker objv_tracker;
+
+ int r = ctl()->user->get_info_by_access_key(dpp, key, &uinfo, y, RGWUserCtl::GetParams().set_objv_tracker(&objv_tracker));
+ if (r < 0)
+ return r;
+
+ u = new RadosUser(this, uinfo);
+ if (!u)
+ return -ENOMEM;
+
+ u->get_version_tracker() = objv_tracker;
+
+ user->reset(u);
+ return 0;
+}
+
+int RadosStore::get_user_by_email(const DoutPrefixProvider* dpp, const std::string& email, optional_yield y, std::unique_ptr<User>* user)
+{
+ RGWUserInfo uinfo;
+ User* u;
+ RGWObjVersionTracker objv_tracker;
+
+ int r = ctl()->user->get_info_by_email(dpp, email, &uinfo, y, RGWUserCtl::GetParams().set_objv_tracker(&objv_tracker));
+ if (r < 0)
+ return r;
+
+ u = new RadosUser(this, uinfo);
+ if (!u)
+ return -ENOMEM;
+
+ u->get_version_tracker() = objv_tracker;
+
+ user->reset(u);
+ return 0;
+}
+
+int RadosStore::get_user_by_swift(const DoutPrefixProvider* dpp, const std::string& user_str, optional_yield y, std::unique_ptr<User>* user)
+{
+ RGWUserInfo uinfo;
+ User* u;
+ RGWObjVersionTracker objv_tracker;
+
+ int r = ctl()->user->get_info_by_swift(dpp, user_str, &uinfo, y, RGWUserCtl::GetParams().set_objv_tracker(&objv_tracker));
+ if (r < 0)
+ return r;
+
+ u = new RadosUser(this, uinfo);
+ if (!u)
+ return -ENOMEM;
+
+ u->get_version_tracker() = objv_tracker;
+
+ user->reset(u);
+ return 0;
+}
+
// Factory: wrap an object key in a RadosObject handle (no I/O).
std::unique_ptr<Object> RadosStore::get_object(const rgw_obj_key& k)
{
  return std::make_unique<RadosObject>(this, k);
}
+
+int RadosStore::get_bucket(const DoutPrefixProvider* dpp, User* u, const rgw_bucket& b, std::unique_ptr<Bucket>* bucket, optional_yield y)
+{
+ int ret;
+ Bucket* bp;
+
+ bp = new RadosBucket(this, b, u);
+ ret = bp->load_bucket(dpp, y);
+ if (ret < 0) {
+ delete bp;
+ return ret;
+ }
+
+ bucket->reset(bp);
+ return 0;
+}
+
// Wrap already-fetched bucket info in a RadosBucket handle; performs
// no I/O and never fails.
int RadosStore::get_bucket(User* u, const RGWBucketInfo& i, std::unique_ptr<Bucket>* bucket)
{
  Bucket* bp;

  bp = new RadosBucket(this, i, u);
  /* Don't need to fetch the bucket info, use the provided one */

  bucket->reset(bp);
  return 0;
}

// Convenience overload: build the rgw_bucket key from tenant + name
// and delegate to the loading overload above.
int RadosStore::get_bucket(const DoutPrefixProvider* dpp, User* u, const std::string& tenant, const std::string& name, std::unique_ptr<Bucket>* bucket, optional_yield y)
{
  rgw_bucket b;

  b.tenant = tenant;
  b.name = name;

  return get_bucket(dpp, u, b, bucket, y);
}
+
// True when this zone is the metadata master (requests need not be
// forwarded to another zonegroup).
bool RadosStore::is_meta_master()
{
  return svc()->zone->is_meta_master();
}

// Forward a metadata-mutating request to the master zonegroup when this
// zone is not the master. On success the (JSON) response is parsed into
// *jp if a parser was supplied. No-op (returns 0) on the master itself.
int RadosStore::forward_request_to_master(const DoutPrefixProvider *dpp, User* user, obj_version* objv,
					  bufferlist& in_data,
					  JSONParser* jp, req_info& info,
					  optional_yield y)
{
  if (is_meta_master()) {
    /* We're master, don't forward */
    return 0;
  }

  if (!svc()->zone->get_master_conn()) {
    ldpp_dout(dpp, 0) << "rest connection is invalid" << dendl;
    return -EINVAL;
  }
  ldpp_dout(dpp, 0) << "sending request to master zonegroup" << dendl;
  bufferlist response;
  std::string uid_str = user->get_id().to_str();
#define MAX_REST_RESPONSE (128 * 1024) // we expect a very small response
  int ret = svc()->zone->get_master_conn()->forward(dpp, rgw_user(uid_str), info,
                                                    objv, MAX_REST_RESPONSE,
                                                    &in_data, &response, y);
  if (ret < 0)
    return ret;

  // NOTE(review): bufferlist::c_str() is not guaranteed NUL-terminated;
  // the length-bounded parse below is safe, the log line presumably
  // relies on the buffer implementation -- confirm.
  ldpp_dout(dpp, 20) << "response: " << response.c_str() << dendl;
  if (jp && !jp->parse(response.c_str(), response.length())) {
    ldpp_dout(dpp, 0) << "failed parsing response from master zonegroup" << dendl;
    return -EINVAL;
  }

  return 0;
}

// Forward an IAM request (signed with `key`) to the master zonegroup.
// The XML response is unescaped (&quot; -> ") before parsing because
// the master HTML-encodes quotes inside policy documents.
int RadosStore::forward_iam_request_to_master(const DoutPrefixProvider *dpp, const RGWAccessKey& key, obj_version* objv,
					     bufferlist& in_data,
					     RGWXMLDecoder::XMLParser* parser, req_info& info,
					     optional_yield y)
{
  if (is_meta_master()) {
    /* We're master, don't forward */
    return 0;
  }

  if (!svc()->zone->get_master_conn()) {
    ldpp_dout(dpp, 0) << "rest connection is invalid" << dendl;
    return -EINVAL;
  }
  ldpp_dout(dpp, 0) << "sending request to master zonegroup" << dendl;
  bufferlist response;
#define MAX_REST_RESPONSE (128 * 1024) // we expect a very small response
  int ret = svc()->zone->get_master_conn()->forward_iam_request(dpp, key, info,
                                                                objv, MAX_REST_RESPONSE,
                                                                &in_data, &response, y);
  if (ret < 0)
    return ret;

  ldpp_dout(dpp, 20) << "response: " << response.c_str() << dendl;

  std::string r = response.c_str();
  std::string str_to_search = "&quot;";
  std::string str_to_replace = "\"";
  boost::replace_all(r, str_to_search, str_to_replace);
  ldpp_dout(dpp, 20) << "r: " << r.c_str() << dendl;

  if (parser && !parser->parse(r.c_str(), r.length(), 1)) {
    ldpp_dout(dpp, 0) << "ERROR: failed to parse response from master zonegroup" << dendl;
    return -EIO;
  }

  return 0;
}

// Zone-unique id for `unique_num` (delegates to the zone_utils svc).
std::string RadosStore::zone_unique_id(uint64_t unique_num)
{
  return svc()->zone_utils->unique_id(unique_num);
}

// Zone-unique transaction id for `unique_num`.
std::string RadosStore::zone_unique_trans_id(const uint64_t unique_num)
{
  return svc()->zone_utils->unique_trans_id(unique_num);
}
+
+int RadosStore::get_zonegroup(const std::string& id,
+ std::unique_ptr<ZoneGroup>* zonegroup)
+{
+ ZoneGroup* zg;
+ RGWZoneGroup rzg;
+ int r = svc()->zone->get_zonegroup(id, rzg);
+ if (r < 0)
+ return r;
+
+ zg = new RadosZoneGroup(this, rzg);
+ if (!zg)
+ return -ENOMEM;
+
+ zonegroup->reset(zg);
+ return 0;
+}
+
// List the ids of all zones known to this cluster's realm/period.
int RadosStore::list_all_zones(const DoutPrefixProvider* dpp, std::list<std::string>& zone_ids)
{
  return svc()->zone->list_zones(dpp, zone_ids);
}
+
+int RadosStore::cluster_stat(RGWClusterStat& stats)
+{
+ rados_cluster_stat_t rados_stats;
+ int ret;
+
+ ret = rados->get_rados_handle()->cluster_stat(rados_stats);
+ if (ret < 0)
+ return ret;
+
+ stats.kb = rados_stats.kb;
+ stats.kb_used = rados_stats.kb_used;
+ stats.kb_avail = rados_stats.kb_avail;
+ stats.num_objects = rados_stats.num_objects;
+
+ return ret;
+}
+
// Factory for the lifecycle (LC) driver backed by this store.
std::unique_ptr<Lifecycle> RadosStore::get_lifecycle(void)
{
  return std::make_unique<RadosLifecycle>(this);
}

// Factory for an empty async-completions container.
std::unique_ptr<Completions> RadosStore::get_completions(void)
{
  return std::make_unique<RadosCompletions>();
}

// Build a bucket notification in request context; req_state `s` serves
// as both the prefix provider and the source of request metadata.
std::unique_ptr<Notification> RadosStore::get_notification(
    rgw::sal::Object* obj, rgw::sal::Object* src_obj, req_state* s, rgw::notify::EventType event_type, optional_yield y, const std::string* object_name)
{
  return std::make_unique<RadosNotification>(s, this, obj, src_obj, s, event_type, y, object_name);
}

// Build a bucket notification outside request context (e.g. from
// lifecycle processing), with identity/request fields passed explicitly.
std::unique_ptr<Notification> RadosStore::get_notification(const DoutPrefixProvider* dpp, rgw::sal::Object* obj, rgw::sal::Object* src_obj, rgw::notify::EventType event_type, rgw::sal::Bucket* _bucket, std::string& _user_id, std::string& _user_tenant, std::string& _req_id, optional_yield y)
{
  return std::make_unique<RadosNotification>(dpp, this, obj, src_obj, event_type, _bucket, _user_id, _user_tenant, _req_id, y);
}
+
+std::string RadosStore::topics_oid(const std::string& tenant) const {
+ return pubsub_oid_prefix + tenant;
+}
+
// Load the tenant's pubsub topics from the log pool. Propagates
// rgw_get_system_obj() errors (e.g. -ENOENT when the tenant has no
// topics yet) and returns -EIO on decode failure.
int RadosStore::read_topics(const std::string& tenant, rgw_pubsub_topics& topics, RGWObjVersionTracker* objv_tracker,
    optional_yield y, const DoutPrefixProvider *dpp) {
  bufferlist bl;
  const int ret = rgw_get_system_obj(svc()->sysobj,
      svc()->zone->get_zone_params().log_pool,
      topics_oid(tenant),
      bl,
      objv_tracker,
      nullptr, y, dpp, nullptr);
  if (ret < 0) {
    return ret;
  }

  auto iter = bl.cbegin();
  try {
    decode(topics, iter);
  } catch (buffer::error& err) {
    ldpp_dout(dpp, 20) << " failed to decode topics from oid: " << topics_oid(tenant) <<
        ". error: " << err.what() << dendl;
    return -EIO;
  }

  return 0;
}

// Persist the tenant's pubsub topics, guarded by the version tracker.
int RadosStore::write_topics(const std::string& tenant, const rgw_pubsub_topics& topics, RGWObjVersionTracker* objv_tracker,
    optional_yield y, const DoutPrefixProvider *dpp) {
  bufferlist bl;
  encode(topics, bl);

  return rgw_put_system_obj(dpp, svc()->sysobj,
      svc()->zone->get_zone_params().log_pool,
      topics_oid(tenant),
      bl, false, objv_tracker, real_time(), y);
}

// Delete the tenant's pubsub topics object from the log pool.
int RadosStore::remove_topics(const std::string& tenant, RGWObjVersionTracker* objv_tracker,
    optional_yield y, const DoutPrefixProvider *dpp) {
  return rgw_delete_system_obj(dpp, svc()->sysobj,
      svc()->zone->get_zone_params().log_pool,
      topics_oid(tenant),
      objv_tracker, y);
}

// Synchronously delete a raw rados object.
int RadosStore::delete_raw_obj(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj)
{
  return rados->delete_raw_obj(dpp, obj);
}

// Asynchronously delete a raw rados object; the completion handle is
// appended to the caller-supplied RadosCompletions set.
int RadosStore::delete_raw_obj_aio(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, Completions* aio)
{
  RadosCompletions* raio = static_cast<RadosCompletions*>(aio);

  return rados->delete_raw_obj_aio(dpp, obj, raio->handles);
}

// Translate a logical rgw_obj into its raw pool/oid location under the
// given placement rule.
void RadosStore::get_raw_obj(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_raw_obj* raw_obj)
{
  rados->obj_to_raw(placement_rule, obj, raw_obj);
}

// Maximum write-chunk size for the pool holding `obj`.
int RadosStore::get_raw_chunk_size(const DoutPrefixProvider* dpp, const rgw_raw_obj& obj, uint64_t* chunk_size)
{
  return rados->get_max_chunk_size(obj.pool, chunk_size, dpp);
}

// Late initialization: bind the store's Zone wrapper to the currently
// configured zonegroup. Always succeeds.
int RadosStore::initialize(CephContext *cct, const DoutPrefixProvider *dpp)
{
  std::unique_ptr<ZoneGroup> zg =
    std::make_unique<RadosZoneGroup>(this, svc()->zone->get_zonegroup());
  zone = make_unique<RadosZone>(this, std::move(zg));
  return 0;
}

// Record a batch of usage (billing) entries.
int RadosStore::log_usage(const DoutPrefixProvider *dpp, map<rgw_user_bucket, RGWUsageBatch>& usage_info)
{
  return rados->log_usage(dpp, usage_info);
}

// Append an ops-log record to `oid` in the log pool, creating the pool
// on first use (-ENOENT triggers create + one retry).
int RadosStore::log_op(const DoutPrefixProvider *dpp, std::string& oid, bufferlist& bl)
{
  rgw_raw_obj obj(svc()->zone->get_zone_params().log_pool, oid);

  int ret = rados->append_async(dpp, obj, bl.length(), bl);
  if (ret == -ENOENT) {
    ret = rados->create_pool(dpp, svc()->zone->get_zone_params().log_pool);
    if (ret < 0)
      return ret;
    // retry
    ret = rados->append_async(dpp, obj, bl.length(), bl);
  }

  return ret;
}

// Register this daemon in the cluster service map (visible to `ceph -s`).
int RadosStore::register_to_service_map(const DoutPrefixProvider *dpp, const std::string& daemon_type,
					const map<std::string, std::string>& meta)
{
  return rados->register_to_service_map(dpp, daemon_type, meta);
}

// Fetch the configured default bucket and user quotas.
void RadosStore::get_quota(RGWQuota& quota)
{
  quota.bucket_quota = svc()->quota->get_bucket_quota();
  quota.user_quota = svc()->quota->get_user_quota();
}
+
+void RadosStore::get_ratelimit(RGWRateLimitInfo& bucket_ratelimit, RGWRateLimitInfo& user_ratelimit, RGWRateLimitInfo& anon_ratelimit)
+{
+ bucket_ratelimit = svc()->zone->get_current_period().get_config().bucket_ratelimit;
+ user_ratelimit = svc()->zone->get_current_period().get_config().user_ratelimit;
+ anon_ratelimit = svc()->zone->get_current_period().get_config().anon_ratelimit;
+}
+
// Enable or disable (suspend) a list of buckets in one call.
int RadosStore::set_buckets_enabled(const DoutPrefixProvider* dpp, vector<rgw_bucket>& buckets, bool enabled)
{
  return rados->set_buckets_enabled(buckets, enabled, dpp);
}

// Resolve the sync-policy handler for an optional zone/bucket pair.
int RadosStore::get_sync_policy_handler(const DoutPrefixProvider* dpp,
					std::optional<rgw_zone_id> zone,
					std::optional<rgw_bucket> bucket,
					RGWBucketSyncPolicyHandlerRef* phandler,
					optional_yield y)
{
  return ctl()->bucket->get_sync_policy_handler(zone, bucket, phandler, y, dpp);
}

// Data-sync status manager for replication from `source_zone`.
RGWDataSyncStatusManager* RadosStore::get_data_sync_manager(const rgw_zone_id& source_zone)
{
  return rados->get_data_sync_manager(source_zone);
}

// Read usage records across all users/buckets (empty uid/bucket filter)
// for the given epoch window, paged via `usage_iter`.
int RadosStore::read_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch,
			       uint32_t max_entries, bool* is_truncated,
			       RGWUsageIter& usage_iter,
			       map<rgw_user_bucket, rgw_usage_log_entry>& usage)
{
  rgw_user uid;
  std::string bucket_name;

  return rados->read_usage(dpp, uid, bucket_name, start_epoch, end_epoch, max_entries,
			   is_truncated, usage_iter, usage);
}

// Trim usage records across all users/buckets for the epoch window.
int RadosStore::trim_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch)
{
  rgw_user uid;
  std::string bucket_name;

  return rados->trim_usage(dpp, uid, bucket_name, start_epoch, end_epoch);
}

// Read a value from the cluster's config-key store (secrets allowed).
int RadosStore::get_config_key_val(std::string name, bufferlist* bl)
{
  return svc()->config_key->get(name, true, bl);
}

// Begin a metadata-key listing for `section`; *phandle is the opaque
// cursor passed to the _next/_complete calls below.
int RadosStore::meta_list_keys_init(const DoutPrefixProvider *dpp, const std::string& section, const std::string& marker, void** phandle)
{
  return ctl()->meta.mgr->list_keys_init(dpp, section, marker, phandle);
}

// Fetch the next page of metadata keys for an open listing.
int RadosStore::meta_list_keys_next(const DoutPrefixProvider *dpp, void* handle, int max, list<std::string>& keys, bool* truncated)
{
  return ctl()->meta.mgr->list_keys_next(dpp, handle, max, keys, truncated);
}

// Release a metadata-key listing handle.
void RadosStore::meta_list_keys_complete(void* handle)
{
  ctl()->meta.mgr->list_keys_complete(handle);
}

// Current resume marker of an open metadata-key listing.
std::string RadosStore::meta_get_marker(void* handle)
{
  return ctl()->meta.mgr->get_marker(handle);
}

// Remove a metadata entry by its "section:key" string.
int RadosStore::meta_remove(const DoutPrefixProvider* dpp, std::string& metadata_key, optional_yield y)
{
  return ctl()->meta.mgr->remove(metadata_key, y, dpp);
}

// Shut down the underlying RGWRados instance, if one was created.
void RadosStore::finalize(void)
{
  if (rados)
    rados->finalize();
}

// Attach all /admin REST resources supported by the rados backend.
void RadosStore::register_admin_apis(RGWRESTMgr* mgr)
{
  mgr->register_resource("user", new RGWRESTMgr_User);
  mgr->register_resource("bucket", new RGWRESTMgr_Bucket);
  /*Registering resource for /admin/metadata */
  mgr->register_resource("metadata", new RGWRESTMgr_Metadata);
  mgr->register_resource("log", new RGWRESTMgr_Log);
  /* XXX These may become global when cbodley is done with his zone work */
  mgr->register_resource("config", new RGWRESTMgr_Config);
  mgr->register_resource("realm", new RGWRESTMgr_Realm);
  mgr->register_resource("ratelimit", new RGWRESTMgr_Ratelimit);
}

// Factory for the Lua scripting manager backed by this store.
std::unique_ptr<LuaManager> RadosStore::get_lua_manager()
{
  return std::make_unique<RadosLuaManager>(this);
}
+
+std::unique_ptr<RGWRole> RadosStore::get_role(std::string name,
+ std::string tenant,
+ std::string path,
+ std::string trust_policy,
+ std::string max_session_duration_str,
+ std::multimap<std::string,std::string> tags)
+{
+ return std::make_unique<RadosRole>(this, name, tenant, path, trust_policy, max_session_duration_str, tags);
+}
+
// Factory: build an (unloaded) RadosRole referenced by role id.
std::unique_ptr<RGWRole> RadosStore::get_role(std::string id)
{
  return std::make_unique<RadosRole>(this, id);
}

// Factory: wrap already-loaded role info in a RadosRole.
std::unique_ptr<RGWRole> RadosStore::get_role(const RGWRoleInfo& info)
{
  return std::make_unique<RadosRole>(this, info);
}

// Enumerate roles under `tenant` whose path starts with `path_prefix`
// (all roles when the prefix is empty), loading each one's info.
int RadosStore::get_roles(const DoutPrefixProvider *dpp,
			  optional_yield y,
			  const std::string& path_prefix,
			  const std::string& tenant,
			  vector<std::unique_ptr<RGWRole>>& roles)
{
  auto pool = svc()->zone->get_zone_params().roles_pool;
  std::string prefix;

  // List all roles if path prefix is empty
  if (! path_prefix.empty()) {
    prefix = tenant + RGWRole::role_path_oid_prefix + path_prefix;
  } else {
    prefix = tenant + RGWRole::role_path_oid_prefix;
  }

  //Get the filtered objects
  list<std::string> result;
  bool is_truncated;
  RGWListRawObjsCtx ctx;
  do {
    list<std::string> oids;
    int r = rados->list_raw_objects(dpp, pool, prefix, 1000, ctx, oids, &is_truncated);
    if (r < 0) {
      ldpp_dout(dpp, 0) << "ERROR: listing filtered objects failed: "
                  << prefix << ": " << cpp_strerror(-r) << dendl;
      return r;
    }
    // NOTE(review): only role_path_oid_prefix's length is stripped here,
    // while the listed oids start with tenant + role_path_oid_prefix --
    // confirm the intended behavior for non-empty tenants.
    for (const auto& iter : oids) {
      result.push_back(iter.substr(RGWRole::role_path_oid_prefix.size()));
    }
  } while (is_truncated);

  for (const auto& it : result) {
    //Find the role oid prefix from the end
    size_t pos = it.rfind(RGWRole::role_oid_prefix);
    if (pos == std::string::npos) {
      continue;
    }
    // Split the result into path and info_oid + id
    std::string path = it.substr(0, pos);

    /*Make sure that prefix is part of path (False results could've been returned)
      because of the role info oid + id appended to the path)*/
    if(path_prefix.empty() || path.find(path_prefix) != std::string::npos) {
      //Get id from info oid prefix + id
      std::string id = it.substr(pos + RGWRole::role_oid_prefix.length());

      std::unique_ptr<rgw::sal::RGWRole> role = get_role(id);
      int ret = role->read_info(dpp, y);
      if (ret < 0) {
        return ret;
      }
      roles.push_back(std::move(role));
    }
  }

  return 0;
}

// Factory: build an (unloaded) OIDC provider handle.
std::unique_ptr<RGWOIDCProvider> RadosStore::get_oidc_provider()
{
  return std::make_unique<RadosOIDCProvider>(this);
}
+
+int RadosStore::get_oidc_providers(const DoutPrefixProvider *dpp,
+ const std::string& tenant,
+ vector<std::unique_ptr<RGWOIDCProvider>>& providers)
+{
+ std::string prefix = tenant + RGWOIDCProvider::oidc_url_oid_prefix;
+ auto pool = svc()->zone->get_zone_params().oidc_pool;
+
+ //Get the filtered objects
+ list<std::string> result;
+ bool is_truncated;
+ RGWListRawObjsCtx ctx;
+ do {
+ list<std::string> oids;
+ int r = rados->list_raw_objects(dpp, pool, prefix, 1000, ctx, oids, &is_truncated);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: listing filtered objects failed: OIDC pool: "
+ << pool.name << ": " << prefix << ": " << cpp_strerror(-r) << dendl;
+ return r;
+ }
+ for (const auto& iter : oids) {
+ std::unique_ptr<rgw::sal::RGWOIDCProvider> provider = get_oidc_provider();
+ bufferlist bl;
+
+ r = rgw_get_system_obj(svc()->sysobj, pool, iter, bl, nullptr, nullptr, null_yield, dpp);
+ if (r < 0) {
+ return r;
+ }
+
+ try {
+ using ceph::decode;
+ auto iter = bl.cbegin();
+ decode(*provider, iter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to decode oidc provider info from pool: "
+ << pool.name << ": " << iter << dendl;
+ return -EIO;
+ }
+
+ providers.push_back(std::move(provider));
+ }
+ } while (is_truncated);
+
+ return 0;
+}
+
// Build a writer that appends at `position` of an existing append-mode
// object; throttled by the configured put-obj window size.
std::unique_ptr<Writer> RadosStore::get_append_writer(const DoutPrefixProvider *dpp,
				  optional_yield y,
				  rgw::sal::Object* obj,
				  const rgw_user& owner,
				  const rgw_placement_rule *ptail_placement_rule,
				  const std::string& unique_tag,
				  uint64_t position,
				  uint64_t *cur_accounted_size)
{
  RGWBucketInfo& bucket_info = obj->get_bucket()->get_info();
  RGWObjectCtx& obj_ctx = static_cast<RadosObject*>(obj)->get_ctx();
  auto aio = rgw::make_throttle(ctx()->_conf->rgw_put_obj_min_window_size, y);
  return std::make_unique<RadosAppendWriter>(dpp, y,
				 bucket_info, obj_ctx, obj->get_obj(),
				 this, std::move(aio), owner,
				 ptail_placement_rule,
				 unique_tag, position,
				 cur_accounted_size);
}

// Build a writer that atomically replaces the whole object (normal PUT
// path); throttled by the configured put-obj window size.
std::unique_ptr<Writer> RadosStore::get_atomic_writer(const DoutPrefixProvider *dpp,
				  optional_yield y,
				  rgw::sal::Object* obj,
				  const rgw_user& owner,
				  const rgw_placement_rule *ptail_placement_rule,
				  uint64_t olh_epoch,
				  const std::string& unique_tag)
{
  RGWBucketInfo& bucket_info = obj->get_bucket()->get_info();
  RGWObjectCtx& obj_ctx = static_cast<RadosObject*>(obj)->get_ctx();
  auto aio = rgw::make_throttle(ctx()->_conf->rgw_put_obj_min_window_size, y);
  return std::make_unique<RadosAtomicWriter>(dpp, y,
				 bucket_info, obj_ctx, obj->get_obj(),
				 this, std::move(aio), owner,
				 ptail_placement_rule,
				 olh_epoch, unique_tag);
}

// Compression type configured for the given placement rule.
const std::string& RadosStore::get_compression_type(const rgw_placement_rule& rule)
{
  return svc()->zone->get_zone_params().get_compression_type(rule);
}

// Whether the placement rule exists in this zone's configuration.
bool RadosStore::valid_placement(const rgw_placement_rule& rule)
{
  return svc()->zone->get_zone_params().valid_placement(rule);
}

// IoCtx for the pool holding the object's head.
int RadosStore::get_obj_head_ioctx(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::IoCtx* ioctx)
{
  return rados->get_obj_head_ioctx(dpp, bucket_info, obj, ioctx);
}

// Free the object context only when this object owns it (contexts may
// be shared across objects of one request).
RadosObject::~RadosObject()
{
  if (rados_ctx_owned)
    delete rados_ctx;
}

// Refresh this object's cached RGWObjState (and manifest) from RGWRados,
// preserving the locally-set obj key, atomic and prefetch flags across
// the wholesale state copy.
int RadosObject::get_obj_state(const DoutPrefixProvider* dpp, RGWObjState **pstate, optional_yield y, bool follow_olh)
{
  int ret = store->getRados()->get_obj_state(dpp, rados_ctx, bucket->get_info(), get_obj(), pstate, &manifest, follow_olh, y);
  if (ret < 0) {
    return ret;
  }

  /* Don't overwrite obj, atomic, or prefetch */
  rgw_obj obj = get_obj();
  bool is_atomic = state.is_atomic;
  bool prefetch_data = state.prefetch_data;

  state = **pstate;

  state.obj = obj;
  state.is_atomic = is_atomic;
  state.prefetch_data = prefetch_data;
  return ret;
}

// Wire a prepared Read op to fill this object's cached attrs, size and
// mtime, then execute its prepare step.
int RadosObject::read_attrs(const DoutPrefixProvider* dpp, RGWRados::Object::Read &read_op, optional_yield y, rgw_obj* target_obj)
{
  read_op.params.attrs = &state.attrset;
  read_op.params.target_obj = target_obj;
  read_op.params.obj_size = &state.size;
  read_op.params.lastmod = &state.mtime;

  return read_op.prepare(y, dpp);
}

// Apply attribute changes on the stored object: `setattrs` are written,
// `delattrs` removed; either may be null.
int RadosObject::set_obj_attrs(const DoutPrefixProvider* dpp, Attrs* setattrs, Attrs* delattrs, optional_yield y)
{
  Attrs empty;
  return store->getRados()->set_attrs(dpp, rados_ctx,
			bucket->get_info(),
			get_obj(),
			setattrs ? *setattrs : empty,
			delattrs ? delattrs : nullptr,
			y);
}

// Read the object's attributes (optionally resolving into *target_obj)
// into the cached state via read_attrs().
int RadosObject::get_obj_attrs(optional_yield y, const DoutPrefixProvider* dpp, rgw_obj* target_obj)
{
  RGWRados::Object op_target(store->getRados(), bucket->get_info(), *rados_ctx, get_obj());
  RGWRados::Object::Read read_op(&op_target);

  return read_attrs(dpp, read_op, y, target_obj);
}
+
+bool RadosObject::is_expired() {
+ auto iter = state.attrset.find(RGW_ATTR_DELETE_AT);
+ if (iter == state.attrset.end()) {
+ return false;
+ }
+ utime_t delete_at;
+ try {
+ auto bufit = iter->second.cbegin();
+ decode(delete_at, bufit);
+ } catch (buffer::error& err) {
+ ldout(store->ctx(), 0) << "ERROR: " << __func__ << ": failed to decode " RGW_ATTR_DELETE_AT " attr" << dendl;
+ return false;
+ }
+
+ return delete_at <= ceph_clock_now() && !delete_at.is_zero();
+}
+
// Assign a random version (instance) id to this object's key.
void RadosObject::gen_rand_obj_instance_name()
{
  store->getRados()->gen_rand_obj_instance_name(&state.obj.key);
}

// Re-point this object's key at the logical object that `raw_obj`
// belongs to within this bucket.
void RadosObject::raw_obj_to_obj(const rgw_raw_obj& raw_obj)
{
  rgw_obj tobj = get_obj();
  RGWSI_Tier_RADOS::raw_obj_to_obj(get_bucket()->get_key(), raw_obj, &tobj);
  set_key(tobj.key);
}

// Raw pool/oid location of this object under the bucket's placement.
void RadosObject::get_raw_obj(rgw_raw_obj* raw_obj)
{
  store->getRados()->obj_to_raw((bucket->get_info()).placement_rule, get_obj(), raw_obj);
}

// Page through this object's omap starting after `marker`, returning up
// to `count` key/value pairs; *pmore signals further pages.
int RadosObject::omap_get_vals(const DoutPrefixProvider *dpp, const std::string& marker, uint64_t count,
				  std::map<std::string, bufferlist> *m,
				  bool* pmore, optional_yield y)
{
  rgw_raw_obj raw_obj;
  get_raw_obj(&raw_obj);
  auto sysobj = store->svc()->sysobj->get_obj(raw_obj);

  return sysobj.omap().get_vals(dpp, marker, count, m, pmore, y);
}

// Fetch the object's entire omap in one call.
int RadosObject::omap_get_all(const DoutPrefixProvider *dpp, std::map<std::string, bufferlist> *m,
				 optional_yield y)
{
  rgw_raw_obj raw_obj;
  get_raw_obj(&raw_obj);
  auto sysobj = store->svc()->sysobj->get_obj(raw_obj);

  return sysobj.omap().get_all(dpp, m, y);
}

// Fetch selected omap values of `oid` (in the object's head pool) for
// the given key set.
int RadosObject::omap_get_vals_by_keys(const DoutPrefixProvider *dpp, const std::string& oid,
					  const std::set<std::string>& keys,
					  Attrs* vals)
{
  int ret;
  rgw_raw_obj head_obj;
  librados::IoCtx cur_ioctx;
  rgw_obj obj = get_obj();

  store->getRados()->obj_to_raw(bucket->get_placement_rule(), obj, &head_obj);
  ret = store->get_obj_head_ioctx(dpp, bucket->get_info(), obj, &cur_ioctx);
  if (ret < 0) {
    return ret;
  }

  return cur_ioctx.omap_get_vals_by_keys(oid, keys, vals);
}

// Set a single omap key on this object, optionally requiring that the
// object already exists.
int RadosObject::omap_set_val_by_key(const DoutPrefixProvider *dpp, const std::string& key, bufferlist& val,
					bool must_exist, optional_yield y)
{
  rgw_raw_obj raw_meta_obj;
  rgw_obj obj = get_obj();

  store->getRados()->obj_to_raw(bucket->get_placement_rule(), obj, &raw_meta_obj);

  auto sysobj = store->svc()->sysobj->get_obj(raw_meta_obj);

  return sysobj.omap().set_must_exist(must_exist).set(dpp, key, val, y);
}

// Transfer object ownership to `new_user`: decode the stored ACL,
// remove the old owner's canonical-user grant, grant FULL_CONTROL to
// the new owner, update the policy owner and write the ACL back.
int RadosObject::chown(User& new_user, const DoutPrefixProvider* dpp, optional_yield y)
{
  int r = get_obj_attrs(y, dpp);
  if (r < 0) {
    ldpp_dout(dpp, 0) << "ERROR: failed to read object attrs " << get_name() << cpp_strerror(-r) << dendl;
    return r;
  }

  const auto& aiter = get_attrs().find(RGW_ATTR_ACL);
  if (aiter == get_attrs().end()) {
    ldpp_dout(dpp, 0) << "ERROR: no acls found for object " << get_name() << dendl;
    return -EINVAL;
  }

  bufferlist& bl = aiter->second;
  RGWAccessControlPolicy policy(store->ctx());
  ACLOwner owner;
  auto bliter = bl.cbegin();
  try {
    policy.decode(bliter);
    owner = policy.get_owner();
  } catch (buffer::error& err) {
    ldpp_dout(dpp, 0) << "ERROR: decode policy failed" << err.what()
      << dendl;
    return -EIO;
  }

  //Get the ACL from the policy
  RGWAccessControlList& acl = policy.get_acl();

  //Remove grant that is set to old owner
  acl.remove_canon_user_grant(owner.get_id());

  //Create a grant and add grant
  ACLGrant grant;
  grant.set_canon(new_user.get_id(), new_user.get_display_name(), RGW_PERM_FULL_CONTROL);
  acl.add_grant(&grant);

  //Update the ACL owner to the new user
  owner.set_id(new_user.get_id());
  owner.set_name(new_user.get_display_name());
  policy.set_owner(owner);

  bl.clear();
  encode(policy, bl);

  set_atomic();
  map<string, bufferlist> attrs;
  attrs[RGW_ATTR_ACL] = bl;
  r = set_obj_attrs(dpp, &attrs, nullptr, y);
  if (r < 0) {
    ldpp_dout(dpp, 0) << "ERROR: modify attr failed " << cpp_strerror(-r) << dendl;
    return r;
  }

  return 0;
}

// Serializer (rados lock) used to guard multipart completion on this
// object.
std::unique_ptr<MPSerializer> RadosObject::get_serializer(const DoutPrefixProvider *dpp, const std::string& lock_name)
{
  return std::make_unique<MPRadosSerializer>(dpp, store, this, lock_name);
}

// Transition the object's data to a different (local) placement rule.
int RadosObject::transition(Bucket* bucket,
			    const rgw_placement_rule& placement_rule,
			    const real_time& mtime,
			    uint64_t olh_epoch,
			    const DoutPrefixProvider* dpp,
			    optional_yield y)
{
  return store->getRados()->transition_obj(*rados_ctx, bucket->get_info(), get_obj(), placement_rule, mtime, olh_epoch, dpp, y);
}

// Lifecycle cloud transition: copy this object to the tier's remote S3
// endpoint, then (when update_object is set and the object's mtime has
// not raced) rewrite the local head as a zero-size cloud-tiered stub
// via write_cloud_tier().
int RadosObject::transition_to_cloud(Bucket* bucket,
			   rgw::sal::PlacementTier* tier,
			   rgw_bucket_dir_entry& o,
			   std::set<std::string>& cloud_targets,
			   CephContext* cct,
			   bool update_object,
			   const DoutPrefixProvider* dpp,
			   optional_yield y)
{
  /* init */
  rgw::sal::RadosPlacementTier* rtier = static_cast<rgw::sal::RadosPlacementTier*>(tier);
  string id = "cloudid";
  string endpoint = rtier->get_rt().t.s3.endpoint;
  RGWAccessKey key = rtier->get_rt().t.s3.key;
  string region = rtier->get_rt().t.s3.region;
  HostStyle host_style = rtier->get_rt().t.s3.host_style;
  string bucket_name = rtier->get_rt().t.s3.target_path;
  const rgw::sal::ZoneGroup& zonegroup = store->get_zone()->get_zonegroup();

  // Default target bucket name when the tier config doesn't set one.
  if (bucket_name.empty()) {
    bucket_name = "rgwx-" + zonegroup.get_name() + "-" + tier->get_storage_class() +
                    "-cloud-bucket";
    boost::algorithm::to_lower(bucket_name);
  }

  /* Create RGW REST connection */
  S3RESTConn conn(cct, id, { endpoint }, key, zonegroup.get_id(), region, host_style);

  RGWLCCloudTierCtx tier_ctx(cct, dpp, o, store, bucket->get_info(),
			this, conn, bucket_name,
			rtier->get_rt().t.s3.target_storage_class);
  tier_ctx.acl_mappings = rtier->get_rt().t.s3.acl_mappings;
  tier_ctx.multipart_min_part_size = rtier->get_rt().t.s3.multipart_min_part_size;
  tier_ctx.multipart_sync_threshold = rtier->get_rt().t.s3.multipart_sync_threshold;
  tier_ctx.storage_class = tier->get_storage_class();

  ldpp_dout(dpp, 0) << "Transitioning object(" << o.key << ") to the cloud endpoint(" << endpoint << ")" << dendl;

  /* Transition object to cloud end point */
  int ret = rgw_cloud_tier_transfer_object(tier_ctx, cloud_targets);

  if (ret < 0) {
    ldpp_dout(dpp, 0) << "ERROR: failed to transfer object(" << o.key << ") to the cloud endpoint(" << endpoint << ") ret=" << ret << dendl;
    return ret;

  }

  if (update_object) {
    real_time read_mtime;

    std::unique_ptr<rgw::sal::Object::ReadOp> read_op(get_read_op());
    read_op->params.lastmod = &read_mtime;

    ret = read_op->prepare(y, dpp);
    if (ret < 0) {
      ldpp_dout(dpp, 0) << "ERROR: Updating tier object(" << o.key << ") failed ret=" << ret << dendl;
      return ret;
    }

    // Abort if the object changed between the transfer above and now.
    if (read_mtime != tier_ctx.o.meta.mtime) {
      /* raced */
      ldpp_dout(dpp, 0) << "ERROR: Updating tier object(" << o.key << ") failed ret=" << -ECANCELED << dendl;
      return -ECANCELED;
    }

    rgw_placement_rule target_placement;
    target_placement.inherit_from(tier_ctx.bucket_info.placement_rule);
    target_placement.storage_class = tier->get_storage_class();

    ret = write_cloud_tier(dpp, y, tier_ctx.o.versioned_epoch,
			tier, tier_ctx.is_multipart_upload,
			target_placement, tier_ctx.obj);

  }

  return ret;
}
+
+int RadosObject::write_cloud_tier(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ uint64_t olh_epoch,
+ PlacementTier* tier,
+ bool is_multipart_upload,
+ rgw_placement_rule& target_placement,
+ Object* head_obj)
+{
+ rgw::sal::RadosPlacementTier* rtier = static_cast<rgw::sal::RadosPlacementTier*>(tier);
+ map<string, bufferlist> attrs = get_attrs();
+ RGWRados::Object op_target(store->getRados(), bucket->get_info(), *rados_ctx, get_obj());
+ RGWRados::Object::Write obj_op(&op_target);
+
+ obj_op.meta.modify_tail = true;
+ obj_op.meta.flags = PUT_OBJ_CREATE;
+ obj_op.meta.category = RGWObjCategory::CloudTiered;
+ obj_op.meta.delete_at = real_time();
+ bufferlist blo;
+ obj_op.meta.data = &blo;
+ obj_op.meta.if_match = NULL;
+ obj_op.meta.if_nomatch = NULL;
+ obj_op.meta.user_data = NULL;
+ obj_op.meta.zones_trace = NULL;
+ obj_op.meta.delete_at = real_time();
+ obj_op.meta.olh_epoch = olh_epoch;
+
+ RGWObjManifest *pmanifest;
+ RGWObjManifest manifest;
+
+ pmanifest = &manifest;
+ RGWObjTier tier_config;
+ tier_config.name = tier->get_storage_class();
+ tier_config.tier_placement = rtier->get_rt();
+ tier_config.is_multipart_upload = is_multipart_upload;
+
+ pmanifest->set_tier_type("cloud-s3");
+ pmanifest->set_tier_config(tier_config);
+
+ /* check if its necessary */
+ pmanifest->set_head(target_placement, head_obj->get_obj(), 0);
+ pmanifest->set_tail_placement(target_placement, head_obj->get_obj().bucket);
+ pmanifest->set_obj_size(0);
+ obj_op.meta.manifest = pmanifest;
+
+ /* update storage class */
+ bufferlist bl;
+ bl.append(tier->get_storage_class());
+ attrs[RGW_ATTR_STORAGE_CLASS] = bl;
+
+ attrs.erase(RGW_ATTR_ID_TAG);
+ attrs.erase(RGW_ATTR_TAIL_TAG);
+
+ return obj_op.write_meta(dpp, 0, 0, attrs, y);
+}
+
+int RadosObject::get_max_chunk_size(const DoutPrefixProvider* dpp, rgw_placement_rule placement_rule, uint64_t* max_chunk_size, uint64_t* alignment)
+{
+ return store->getRados()->get_max_chunk_size(placement_rule, get_obj(), max_chunk_size, dpp, alignment);
+}
+
+void RadosObject::get_max_aligned_size(uint64_t size, uint64_t alignment,
+ uint64_t* max_size)
+{
+ store->getRados()->get_max_aligned_size(size, alignment, max_size);
+}
+
+bool RadosObject::placement_rules_match(rgw_placement_rule& r1, rgw_placement_rule& r2)
+{
+ rgw_obj obj;
+ rgw_pool p1, p2;
+
+ obj = get_obj();
+
+ if (r1 == r2)
+ return true;
+
+ if (!store->getRados()->get_obj_data_pool(r1, obj, &p1)) {
+ return false;
+ }
+ if (!store->getRados()->get_obj_data_pool(r2, obj, &p2)) {
+ return false;
+ }
+
+ return p1 == p2;
+}
+
+// Dump the object's head location, full manifest, and per-stripe rados
+// layout to `f` as JSON.  Returns 0 on success or a negative error from
+// the head read / manifest fetch.
+int RadosObject::dump_obj_layout(const DoutPrefixProvider *dpp, optional_yield y, Formatter* f)
+{
+  int ret;
+  RGWObjManifest *amanifest{nullptr};
+  rgw_raw_obj head_obj;
+
+  RGWRados::Object op_target(store->getRados(), bucket->get_info(), *rados_ctx, get_obj());
+  RGWRados::Object::Read parent_op(&op_target);
+  uint64_t obj_size;
+
+  parent_op.params.obj_size = &obj_size;
+  parent_op.params.attrs = &get_attrs();
+
+  // prepare() reads the object head and fills parent_op.state.head_obj.
+  ret = parent_op.prepare(y, dpp);
+  if (ret < 0) {
+    return ret;
+  }
+
+  head_obj = parent_op.state.head_obj;
+
+  ret = op_target.get_manifest(dpp, &amanifest, y);
+  if (ret < 0) {
+    return ret;
+  }
+
+  ::encode_json("head", head_obj, f);
+  ::encode_json("manifest", *amanifest, f);
+  f->open_array_section("data_location");
+  for (auto miter = amanifest->obj_begin(dpp); miter != amanifest->obj_end(dpp); ++miter) {
+    f->open_object_section("obj");
+    rgw_raw_obj raw_loc = miter.get_location().get_raw_obj(store->getRados());
+    uint64_t ofs = miter.get_ofs();
+    uint64_t left = amanifest->get_obj_size() - ofs;
+    ::encode_json("ofs", miter.get_ofs(), f);
+    ::encode_json("loc", raw_loc, f);
+    ::encode_json("loc_ofs", miter.location_ofs(), f);
+    uint64_t loc_size = miter.get_stripe_size();
+    // the final stripe may extend past the logical object size; clamp it
+    if (loc_size > left) {
+      loc_size = left;
+    }
+    ::encode_json("loc_size", loc_size, f);
+    f->close_section();
+  }
+  f->close_section();
+
+  return 0;
+}
+
+// Factory for a ReadOp bound to this object and its cached RGWObjectCtx.
+std::unique_ptr<Object::ReadOp> RadosObject::get_read_op()
+{
+  return std::make_unique<RadosObject::RadosReadOp>(this, rados_ctx);
+}
+
+// Bind the low-level RGWRados::Object / Object::Read pair to the source
+// object; op_target must be initialized before parent_op (declaration
+// order in the class), since parent_op holds a pointer to it.
+RadosObject::RadosReadOp::RadosReadOp(RadosObject *_source, RGWObjectCtx *_rctx) :
+	source(_source),
+	rctx(_rctx),
+	op_target(_source->store->getRados(),
+		  _source->get_bucket()->get_info(),
+		  *static_cast<RGWObjectCtx *>(rctx),
+		  _source->get_obj()),
+	parent_op(&op_target)
+{ }
+
+// Copy the SAL-level read parameters/conditionals into the RGWRados read
+// op and prepare it (reads the head, loads attrs).  On success the source
+// object's key and size are refreshed from the resolved read state.
+int RadosObject::RadosReadOp::prepare(optional_yield y, const DoutPrefixProvider* dpp)
+{
+  uint64_t obj_size;
+
+  parent_op.conds.mod_ptr = params.mod_ptr;
+  parent_op.conds.unmod_ptr = params.unmod_ptr;
+  parent_op.conds.high_precision_time = params.high_precision_time;
+  parent_op.conds.mod_zone_id = params.mod_zone_id;
+  parent_op.conds.mod_pg_ver = params.mod_pg_ver;
+  parent_op.conds.if_match = params.if_match;
+  parent_op.conds.if_nomatch = params.if_nomatch;
+  parent_op.params.lastmod = params.lastmod;
+  parent_op.params.target_obj = params.target_obj;
+  parent_op.params.obj_size = &obj_size;
+  // attrs are filled into the source object's attr map by prepare()
+  parent_op.params.attrs = &source->get_attrs();
+
+  int ret = parent_op.prepare(y, dpp);
+  if (ret < 0)
+    return ret;
+
+  // reflect the actually-read instance (e.g. resolved version id) and size
+  source->set_key(parent_op.state.obj.key);
+  source->set_obj_size(obj_size);
+
+  return ret;
+}
+
+// Synchronously read bytes [ofs, end] of the prepared object into bl.
+int RadosObject::RadosReadOp::read(int64_t ofs, int64_t end, bufferlist& bl, optional_yield y, const DoutPrefixProvider* dpp)
+{
+  return parent_op.read(ofs, end, bl, y, dpp);
+}
+
+// Fetch a single xattr of the prepared object into dest.
+int RadosObject::RadosReadOp::get_attr(const DoutPrefixProvider* dpp, const char* name, bufferlist& dest, optional_yield y)
+{
+  return parent_op.get_attr(dpp, name, dest, y);
+}
+
+// Factory for a DeleteOp bound to this object.
+std::unique_ptr<Object::DeleteOp> RadosObject::get_delete_op()
+{
+  return std::make_unique<RadosObject::RadosDeleteOp>(this);
+}
+
+// Bind the low-level RGWRados delete target/op to the source object;
+// op_target must outlive parent_op, which keeps a pointer to it.
+RadosObject::RadosDeleteOp::RadosDeleteOp(RadosObject *_source) :
+	source(_source),
+	op_target(_source->store->getRados(),
+		  _source->get_bucket()->get_info(),
+		  _source->get_ctx(),
+		  _source->get_obj()),
+	parent_op(&op_target)
+{ }
+
+// Copy SAL delete parameters into the RGWRados delete op, execute it, and
+// surface the delete-marker / version-id results back to the caller.
+int RadosObject::RadosDeleteOp::delete_obj(const DoutPrefixProvider* dpp, optional_yield y)
+{
+  parent_op.params.bucket_owner = params.bucket_owner.get_id();
+  parent_op.params.versioning_status = params.versioning_status;
+  parent_op.params.obj_owner = params.obj_owner;
+  parent_op.params.olh_epoch = params.olh_epoch;
+  parent_op.params.marker_version_id = params.marker_version_id;
+  parent_op.params.bilog_flags = params.bilog_flags;
+  parent_op.params.remove_objs = params.remove_objs;
+  parent_op.params.expiration_time = params.expiration_time;
+  parent_op.params.unmod_since = params.unmod_since;
+  parent_op.params.mtime = params.mtime;
+  parent_op.params.high_precision_time = params.high_precision_time;
+  parent_op.params.zones_trace = params.zones_trace;
+  parent_op.params.abortmp = params.abortmp;
+  parent_op.params.parts_accounted_size = params.parts_accounted_size;
+
+  int ret = parent_op.delete_obj(y, dpp);
+  if (ret < 0)
+    return ret;
+
+  // propagate what the delete actually did (versioned buckets may have
+  // created a delete marker and/or a new version id)
+  result.delete_marker = parent_op.result.delete_marker;
+  result.version_id = parent_op.result.version_id;
+
+  return ret;
+}
+
+// Delete this object directly through RGWRados.  When `prevent_versioning`
+// is set the versioning status is forced to 0, which removes the object
+// outright instead of leaving a delete marker.
+int RadosObject::delete_object(const DoutPrefixProvider* dpp,
+			       optional_yield y,
+			       bool prevent_versioning)
+{
+  RGWRados::Object target(store->getRados(), bucket->get_info(), *rados_ctx, get_obj());
+  RGWRados::Object::Delete op(&target);
+
+  op.params.bucket_owner = bucket->get_info().owner;
+  op.params.versioning_status =
+    prevent_versioning ? 0 : bucket->get_info().versioning_status();
+
+  return op.delete_obj(y, dpp);
+}
+
+// Asynchronous object delete: handles are accumulated into the caller's
+// RadosCompletions so completion can be awaited later.
+int RadosObject::delete_obj_aio(const DoutPrefixProvider* dpp, RGWObjState* astate,
+				Completions* aio, bool keep_index_consistent,
+				optional_yield y)
+{
+  // `aio` is required to be a RadosCompletions in this driver
+  RadosCompletions* raio = static_cast<RadosCompletions*>(aio);
+
+  return store->getRados()->delete_obj_aio(dpp, get_obj(), bucket->get_info(), astate,
+					   raio->handles, keep_index_consistent, y);
+}
+
+// Server-side object copy: forwards all SAL parameters to
+// RGWRados::copy_obj with *this* object as the source and `dest_object`
+// as the target.  AttrsMod is translated to the RGWRados enum and an
+// unset delete_at becomes real_time() (i.e. no expiration).
+int RadosObject::copy_object(User* user,
+			     req_info* info,
+			     const rgw_zone_id& source_zone,
+			     rgw::sal::Object* dest_object,
+			     rgw::sal::Bucket* dest_bucket,
+			     rgw::sal::Bucket* src_bucket,
+			     const rgw_placement_rule& dest_placement,
+			     ceph::real_time* src_mtime,
+			     ceph::real_time* mtime,
+			     const ceph::real_time* mod_ptr,
+			     const ceph::real_time* unmod_ptr,
+			     bool high_precision_time,
+			     const char* if_match,
+			     const char* if_nomatch,
+			     AttrsMod attrs_mod,
+			     bool copy_if_newer,
+			     Attrs& attrs,
+			     RGWObjCategory category,
+			     uint64_t olh_epoch,
+			     boost::optional<ceph::real_time> delete_at,
+			     std::string* version_id,
+			     std::string* tag,
+			     std::string* etag,
+			     void (*progress_cb)(off_t, void *),
+			     void* progress_data,
+			     const DoutPrefixProvider* dpp,
+			     optional_yield y)
+{
+  return store->getRados()->copy_obj(*rados_ctx,
+				     user->get_id(),
+				     info,
+				     source_zone,
+				     dest_object->get_obj(),
+				     get_obj(),
+				     dest_bucket->get_info(),
+				     src_bucket->get_info(),
+				     dest_placement,
+				     src_mtime,
+				     mtime,
+				     mod_ptr,
+				     unmod_ptr,
+				     high_precision_time,
+				     if_match,
+				     if_nomatch,
+				     static_cast<RGWRados::AttrsMod>(attrs_mod),
+				     copy_if_newer,
+				     attrs,
+				     category,
+				     olh_epoch,
+				     (delete_at ? *delete_at : real_time()),
+				     version_id,
+				     tag,
+				     etag,
+				     progress_cb,
+				     progress_data,
+				     dpp,
+				     y);
+}
+
+// Stream bytes [ofs, end] of the prepared object through `cb`.
+int RadosObject::RadosReadOp::iterate(const DoutPrefixProvider* dpp, int64_t ofs, int64_t end, RGWGetDataCB* cb, optional_yield y)
+{
+  return parent_op.iterate(dpp, ofs, end, cb, y);
+}
+
+// Swift object versioning: try to restore the most recent archived copy
+// of this object; `restored` reports whether a copy was brought back.
+int RadosObject::swift_versioning_restore(bool& restored,
+					  const DoutPrefixProvider* dpp)
+{
+  // keep an lvalue here; the RGWRados call takes the object by reference
+  rgw_obj target = get_obj();
+  return store->getRados()->swift_versioning_restore(*rados_ctx,
+						     bucket->get_owner()->get_id(),
+						     bucket->get_info(),
+						     target,
+						     restored,
+						     dpp);
+}
+
+// Swift object versioning: archive the current version of this object
+// before it is overwritten/deleted; thin forward to RGWRados.
+int RadosObject::swift_versioning_copy(const DoutPrefixProvider* dpp, optional_yield y)
+{
+  return store->getRados()->swift_versioning_copy(*rados_ctx,
+						  bucket->get_info().owner,
+						  bucket->get_info(),
+						  get_obj(),
+						  dpp,
+						  y);
+}
+
+// Clean up earlier uploads of the same part number: for every past
+// manifest prefix recorded on `part`, add the old part's bucket-index key
+// to `remove_objs` and queue all of its rados tail objects on a GC chain.
+// The chain is sent to the garbage collector when one is available;
+// otherwise (or if sending fails for a reason other than -ENOENT) the
+// objects are deleted inline.
+int RadosMultipartUpload::cleanup_part_history(const DoutPrefixProvider* dpp,
+                                               optional_yield y,
+                                               RadosMultipartPart *part,
+                                               list<rgw_obj_index_key>& remove_objs)
+{
+  cls_rgw_obj_chain chain;
+  for (auto& ppfx : part->get_past_prefixes()) {
+    rgw_obj past_obj;
+    past_obj.init_ns(bucket->get_key(), ppfx + "." + std::to_string(part->info.num), mp_ns);
+    rgw_obj_index_key past_key;
+    past_obj.key.get_index_key(&past_key);
+    // Remove past upload part objects from index, too.
+    remove_objs.push_back(past_key);
+
+    // walk the part's manifest as it looked under the old prefix to find
+    // the rados objects that belong to that superseded upload
+    RGWObjManifest manifest = part->get_manifest();
+    manifest.set_prefix(ppfx);
+    RGWObjManifest::obj_iterator miter = manifest.obj_begin(dpp);
+    for (; miter != manifest.obj_end(dpp); ++miter) {
+      rgw_raw_obj raw_part_obj = miter.get_location().get_raw_obj(store->getRados());
+      cls_rgw_obj_key part_key(raw_part_obj.oid);
+      chain.push_obj(raw_part_obj.pool.to_str(), part_key, raw_part_obj.loc);
+    }
+  }
+  if (store->getRados()->get_gc() == nullptr) {
+    // Delete objects inline if gc hasn't been initialised (in case when bypass gc is specified)
+    store->getRados()->delete_objs_inline(dpp, chain, mp_obj.get_upload_id());
+  } else {
+    // use upload id as tag and do it synchronously
+    auto [ret, leftover_chain] = store->getRados()->send_chain_to_gc(chain, mp_obj.get_upload_id());
+    if (ret < 0 && leftover_chain) {
+      ldpp_dout(dpp, 5) << __func__ << ": gc->send_chain() returned " << ret << dendl;
+      if (ret == -ENOENT) {
+        return -ERR_NO_SUCH_UPLOAD;
+      }
+      // Delete objects inline if send chain to gc fails
+      store->getRados()->delete_objs_inline(dpp, *leftover_chain, mp_obj.get_upload_id());
+    }
+  }
+  return 0;
+}
+
+
+// Abort a multipart upload: iterate all uploaded parts, delete (or queue
+// on a GC chain) their data, collect bucket-index keys to remove, and
+// finally delete the upload's meta object.  -ENOENT anywhere is mapped to
+// -ERR_NO_SUCH_UPLOAD so a concurrent abort/complete surfaces correctly.
+int RadosMultipartUpload::abort(const DoutPrefixProvider *dpp, CephContext *cct)
+{
+  std::unique_ptr<rgw::sal::Object> meta_obj = get_meta_obj();
+  meta_obj->set_in_extra_data(true);
+  meta_obj->set_hash_source(mp_obj.get_key());
+  cls_rgw_obj_chain chain;
+  list<rgw_obj_index_key> remove_objs;
+  bool truncated;
+  int marker = 0;
+  int ret;
+  uint64_t parts_accounted_size = 0;
+
+  do {
+    ret = list_parts(dpp, cct, 1000, marker, &marker, &truncated);
+    if (ret < 0) {
+      ldpp_dout(dpp, 20) << __func__ << ": RadosMultipartUpload::list_parts returned " <<
+	ret << dendl;
+      return (ret == -ENOENT) ? -ERR_NO_SUCH_UPLOAD : ret;
+    }
+
+    for (auto part_it = parts.begin();
+	 part_it != parts.end();
+	 ++part_it) {
+      RadosMultipartPart* obj_part = dynamic_cast<RadosMultipartPart*>(part_it->second.get());
+      if (obj_part->info.manifest.empty()) {
+	// part has no manifest: it is a single rados object; delete directly
+	std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(
+				    rgw_obj_key(obj_part->oid, std::string(), RGW_OBJ_NS_MULTIPART));
+	obj->set_hash_source(mp_obj.get_key());
+	ret = obj->delete_object(dpp, null_yield);
+	if (ret < 0 && ret != -ENOENT)
+	  return ret;
+      } else {
+	// part has a manifest: queue its tail objects on the GC chain and
+	// record the part head's index key for removal
+	auto target = meta_obj->get_obj();
+	store->getRados()->update_gc_chain(dpp, target, obj_part->info.manifest, &chain);
+	RGWObjManifest::obj_iterator oiter = obj_part->info.manifest.obj_begin(dpp);
+	if (oiter != obj_part->info.manifest.obj_end(dpp)) {
+	  std::unique_ptr<rgw::sal::Object> head = bucket->get_object(rgw_obj_key());
+	  rgw_raw_obj raw_head = oiter.get_location().get_raw_obj(store->getRados());
+	  dynamic_cast<rgw::sal::RadosObject*>(head.get())->raw_obj_to_obj(raw_head);
+
+	  rgw_obj_index_key key;
+	  head->get_key().get_index_key(&key);
+	  remove_objs.push_back(key);
+
+	  // also clean up superseded re-uploads of this part
+	  cleanup_part_history(dpp, null_yield, obj_part, remove_objs);
+	}
+      }
+      parts_accounted_size += obj_part->info.accounted_size;
+    }
+  } while (truncated);
+
+  if (store->getRados()->get_gc() == nullptr) {
+    //Delete objects inline if gc hasn't been initialised (in case when bypass gc is specified)
+    store->getRados()->delete_objs_inline(dpp, chain, mp_obj.get_upload_id());
+  } else {
+    /* use upload id as tag and do it synchronously */
+    // NOTE(review): this `ret` shadows the outer one; a gc failure that is
+    // handled inline below intentionally does not affect the outer ret
+    auto [ret, leftover_chain] = store->getRados()->send_chain_to_gc(chain, mp_obj.get_upload_id());
+    if (ret < 0 && leftover_chain) {
+      ldpp_dout(dpp, 5) << __func__ << ": gc->send_chain() returned " << ret << dendl;
+      if (ret == -ENOENT) {
+        return -ERR_NO_SUCH_UPLOAD;
+      }
+      //Delete objects inline if send chain to gc fails
+      store->getRados()->delete_objs_inline(dpp, *leftover_chain, mp_obj.get_upload_id());
+    }
+  }
+
+  std::unique_ptr<rgw::sal::Object::DeleteOp> del_op = meta_obj->get_delete_op();
+  del_op->params.bucket_owner = bucket->get_acl_owner();
+  del_op->params.versioning_status = 0;
+  if (!remove_objs.empty()) {
+    del_op->params.remove_objs = &remove_objs;
+  }
+
+  // flag the delete as an abort so quota/stats account the removed parts
+  del_op->params.abortmp = true;
+  del_op->params.parts_accounted_size = parts_accounted_size;
+
+  // and also remove the metadata obj
+  ret = del_op->delete_obj(dpp, null_yield);
+  if (ret < 0) {
+    ldpp_dout(dpp, 20) << __func__ << ": del_op.delete_obj returned " <<
+      ret << dendl;
+  }
+  return (ret == -ENOENT) ? -ERR_NO_SUCH_UPLOAD : ret;
+}
+
+// Return the upload's meta object (lives in the multipart namespace).
+std::unique_ptr<rgw::sal::Object> RadosMultipartUpload::get_meta_obj()
+{
+  return bucket->get_object(rgw_obj_key(get_meta(), string(), mp_ns));
+}
+
+// Initiate a multipart upload: generate a random v2 upload id and write
+// the meta object (exclusive create) carrying the destination placement.
+// On an upload-id collision (-EEXIST) a new id is generated and the write
+// is retried.
+int RadosMultipartUpload::init(const DoutPrefixProvider *dpp, optional_yield y, ACLOwner& owner, rgw_placement_rule& dest_placement, rgw::sal::Attrs& attrs)
+{
+  int ret;
+  std::string oid = mp_obj.get_key();
+  RGWObjectCtx obj_ctx(store);
+
+  do {
+    char buf[33];
+    string tmp_obj_name;
+    std::unique_ptr<rgw::sal::Object> obj;
+    gen_rand_alphanumeric(store->ctx(), buf, sizeof(buf) - 1);
+    std::string upload_id = MULTIPART_UPLOAD_ID_PREFIX; /* v2 upload id */
+    upload_id.append(buf);
+
+    mp_obj.init(oid, upload_id);
+    tmp_obj_name = mp_obj.get_meta();
+
+    obj = bucket->get_object(rgw_obj_key(tmp_obj_name, string(), mp_ns));
+    // the meta object will be indexed with 0 size; it lives in the
+    // extra-data namespace rather than holding object data
+    obj->set_in_extra_data(true);
+    obj->set_hash_source(oid);
+
+    RGWRados::Object op_target(store->getRados(),
+			       obj->get_bucket()->get_info(),
+			       obj_ctx, obj->get_obj());
+    RGWRados::Object::Write obj_op(&op_target);
+
+    op_target.set_versioning_disabled(true); /* no versioning for multipart meta */
+    obj_op.meta.owner = owner.get_id();
+    obj_op.meta.category = RGWObjCategory::MultiMeta;
+    // exclusive create: fails with -EEXIST if this upload id is taken
+    obj_op.meta.flags = PUT_OBJ_CREATE_EXCL;
+    obj_op.meta.mtime = &mtime;
+
+    multipart_upload_info upload_info;
+    upload_info.dest_placement = dest_placement;
+
+    bufferlist bl;
+    encode(upload_info, bl);
+    obj_op.meta.data = &bl;
+
+    ret = obj_op.write_meta(dpp, bl.length(), 0, attrs, y);
+  } while (ret == -EEXIST);
+
+  return ret;
+}
+
+// List up to `num_parts` parts of this upload after `marker`, populating
+// the `parts` map from the meta object's omap.
+//
+// For v2 upload ids the omap keys sort numerically ("part.%08d"), so a
+// ranged omap read is used; a gap in the expected part sequence means the
+// omap was written by a gateway without sorted keys, and the listing is
+// restarted in unsorted mode (full omap read + client-side filtering).
+// *next_marker receives the last part number seen; *truncated reports
+// whether more parts remain.  Returns 0 on success, -EIO on a corrupt
+// part entry, or a negative rados error.
+int RadosMultipartUpload::list_parts(const DoutPrefixProvider *dpp, CephContext *cct,
+				     int num_parts, int marker,
+				     int *next_marker, bool *truncated,
+				     bool assume_unsorted)
+{
+  map<string, bufferlist> parts_map;
+  map<string, bufferlist>::iterator iter;
+
+  std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(
+		       rgw_obj_key(get_meta(), std::string(), RGW_OBJ_NS_MULTIPART));
+  obj->set_in_extra_data(true);
+
+  bool sorted_omap = is_v2_upload_id(get_upload_id()) && !assume_unsorted;
+
+  parts.clear();
+
+  int ret;
+  if (sorted_omap) {
+    // keys are zero-padded, so a ranged read from "part.%08d" suffices;
+    // fetch one extra entry to detect truncation
+    string p;
+    p = "part.";
+    char buf[32];
+
+    snprintf(buf, sizeof(buf), "%08d", marker);
+    p.append(buf);
+
+    ret = obj->omap_get_vals(dpp, p, num_parts + 1, &parts_map,
+                                 nullptr, null_yield);
+  } else {
+    ret = obj->omap_get_all(dpp, &parts_map, null_yield);
+  }
+  if (ret < 0) {
+    return ret;
+  }
+
+  int i;
+  int last_num = 0;
+
+  uint32_t expected_next = marker + 1;
+
+  for (i = 0, iter = parts_map.begin();
+       (i < num_parts || !sorted_omap) && iter != parts_map.end();
+       ++iter, ++i) {
+    bufferlist& bl = iter->second;
+    auto bli = bl.cbegin();
+    std::unique_ptr<RadosMultipartPart> part = std::make_unique<RadosMultipartPart>();
+    try {
+      decode(part->info, bli);
+    } catch (buffer::error& err) {
+      // was: "could not part info" — the verb was missing from the log line
+      ldpp_dout(dpp, 0) << "ERROR: could not decode part info, caught buffer::error" <<
+	dendl;
+      return -EIO;
+    }
+    if (sorted_omap) {
+      if (part->info.num != expected_next) {
+	/* ouch, we expected a specific part num here, but we got a
+	 * different one. Either a part is missing, or it could be a
+	 * case of mixed rgw versions working on the same upload,
+	 * where one gateway doesn't support correctly sorted omap
+	 * keys for multipart upload just assume data is unsorted.
+	 */
+	return list_parts(dpp, cct, num_parts, marker, next_marker, truncated, true);
+      }
+      expected_next++;
+    }
+    if (sorted_omap ||
+      (int)part->info.num > marker) {
+      last_num = part->info.num;
+      parts[part->info.num] = std::move(part);
+    }
+  }
+
+  if (sorted_omap) {
+    if (truncated) {
+      *truncated = (iter != parts_map.end());
+    }
+  } else {
+    /* rebuild a map with only num_parts entries */
+    std::map<uint32_t, std::unique_ptr<MultipartPart>> new_parts;
+    std::map<uint32_t, std::unique_ptr<MultipartPart>>::iterator piter;
+    for (i = 0, piter = parts.begin();
+	 i < num_parts && piter != parts.end();
+	 ++i, ++piter) {
+      last_num = piter->first;
+      new_parts[piter->first] = std::move(piter->second);
+    }
+
+    if (truncated) {
+      *truncated = (piter != parts.end());
+    }
+
+    parts.swap(new_parts);
+  }
+
+  if (next_marker) {
+    *next_marker = last_num;
+  }
+
+  return 0;
+}
+
+// Complete a multipart upload: validate the client-supplied part/etag
+// list against the stored parts, stitch the per-part manifests (and any
+// compression block maps) into one object manifest, compute the final
+// "md5-of-md5s-N" etag, and atomically write the head object's metadata.
+// Part head index keys are queued in `remove_objs` so the index entries
+// for the individual parts are dropped in the same index transaction.
+int RadosMultipartUpload::complete(const DoutPrefixProvider *dpp,
+				   optional_yield y, CephContext* cct,
+				   map<int, string>& part_etags,
+				   list<rgw_obj_index_key>& remove_objs,
+				   uint64_t& accounted_size, bool& compressed,
+				   RGWCompressionInfo& cs_info, off_t& ofs,
+				   std::string& tag, ACLOwner& owner,
+				   uint64_t olh_epoch,
+				   rgw::sal::Object* target_obj)
+{
+  char final_etag[CEPH_CRYPTO_MD5_DIGESTSIZE];
+  char final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16];
+  std::string etag;
+  bufferlist etag_bl;
+  MD5 hash;
+  // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
+  hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+  bool truncated;
+  int ret;
+
+  int total_parts = 0;
+  int handled_parts = 0;
+  int max_parts = 1000;
+  int marker = 0;
+  uint64_t min_part_size = cct->_conf->rgw_multipart_min_part_size;
+  auto etags_iter = part_etags.begin();
+  rgw::sal::Attrs attrs = target_obj->get_attrs();
+
+  do {
+    ret = list_parts(dpp, cct, max_parts, marker, &marker, &truncated);
+    if (ret == -ENOENT) {
+      ret = -ERR_NO_SUCH_UPLOAD;
+    }
+    if (ret < 0)
+      return ret;
+
+    total_parts += parts.size();
+    if (!truncated && total_parts != (int)part_etags.size()) {
+      ldpp_dout(dpp, 0) << "NOTICE: total parts mismatch: have: " << total_parts
+		       << " expected: " << part_etags.size() << dendl;
+      ret = -ERR_INVALID_PART;
+      return ret;
+    }
+
+    for (auto obj_iter = parts.begin(); etags_iter != part_etags.end() && obj_iter != parts.end(); ++etags_iter, ++obj_iter, ++handled_parts) {
+      RadosMultipartPart* part = dynamic_cast<rgw::sal::RadosMultipartPart*>(obj_iter->second.get());
+      uint64_t part_size = part->get_size();
+      // every part except the last must meet the configured minimum size
+      if (handled_parts < (int)part_etags.size() - 1 &&
+          part_size < min_part_size) {
+        ret = -ERR_TOO_SMALL;
+        return ret;
+      }
+
+      char petag[CEPH_CRYPTO_MD5_DIGESTSIZE];
+      if (etags_iter->first != (int)obj_iter->first) {
+        ldpp_dout(dpp, 0) << "NOTICE: parts num mismatch: next requested: "
+			 << etags_iter->first << " next uploaded: "
+			 << obj_iter->first << dendl;
+        ret = -ERR_INVALID_PART;
+        return ret;
+      }
+      string part_etag = rgw_string_unquote(etags_iter->second);
+      if (part_etag.compare(part->get_etag()) != 0) {
+        ldpp_dout(dpp, 0) << "NOTICE: etag mismatch: part: " << etags_iter->first
+			 << " etag: " << etags_iter->second << dendl;
+        ret = -ERR_INVALID_PART;
+        return ret;
+      }
+
+      // fold this part's binary md5 into the whole-object digest
+      hex_to_buf(part->get_etag().c_str(), petag,
+		CEPH_CRYPTO_MD5_DIGESTSIZE);
+      hash.Update((const unsigned char *)petag, sizeof(petag));
+
+      RGWUploadPartInfo& obj_part = part->info;
+
+      /* update manifest for part */
+      string oid = mp_obj.get_part(part->info.num);
+      rgw_obj src_obj;
+      src_obj.init_ns(bucket->get_key(), oid, mp_ns);
+
+      if (obj_part.manifest.empty()) {
+        ldpp_dout(dpp, 0) << "ERROR: empty manifest for object part: obj="
+			 << src_obj << dendl;
+        ret = -ERR_INVALID_PART;
+        return ret;
+      } else {
+        manifest.append(dpp, obj_part.manifest, store->svc()->zone->get_zonegroup(), store->svc()->zone->get_zone_params());
+        auto manifest_prefix = part->info.manifest.get_prefix();
+        if (not manifest_prefix.empty()) {
+          // It has an explicit prefix. Override the default one.
+          src_obj.init_ns(bucket->get_key(), manifest_prefix + "." + std::to_string(part->info.num), mp_ns);
+        }
+      }
+
+      // all parts must agree on compression type
+      bool part_compressed = (obj_part.cs_info.compression_type != "none");
+      if ((handled_parts > 0) &&
+          ((part_compressed != compressed) ||
+            (cs_info.compression_type != obj_part.cs_info.compression_type))) {
+          ldpp_dout(dpp, 0) << "ERROR: compression type was changed during multipart upload ("
+                           << cs_info.compression_type << ">>" << obj_part.cs_info.compression_type << ")" << dendl;
+          ret = -ERR_INVALID_PART;
+          return ret;
+      }
+
+      if (part_compressed) {
+        // splice the part's compression block map onto the end of the
+        // object-wide map, shifting both old and new offsets
+        int64_t new_ofs; // offset in compression data for new part
+        if (cs_info.blocks.size() > 0)
+          new_ofs = cs_info.blocks.back().new_ofs + cs_info.blocks.back().len;
+        else
+          new_ofs = 0;
+        for (const auto& block : obj_part.cs_info.blocks) {
+          compression_block cb;
+          cb.old_ofs = block.old_ofs + cs_info.orig_size;
+          cb.new_ofs = new_ofs;
+          cb.len = block.len;
+          cs_info.blocks.push_back(cb);
+          new_ofs = cb.new_ofs + cb.len;
+        }
+        if (!compressed)
+          cs_info.compression_type = obj_part.cs_info.compression_type;
+        cs_info.orig_size += obj_part.cs_info.orig_size;
+        compressed = true;
+      }
+
+      // drop the part's own index entry when the head is written
+      rgw_obj_index_key remove_key;
+      src_obj.key.get_index_key(&remove_key);
+
+      remove_objs.push_back(remove_key);
+
+      cleanup_part_history(dpp, y, part, remove_objs);
+
+      ofs += obj_part.size;
+      accounted_size += obj_part.accounted_size;
+    }
+  } while (truncated);
+  hash.Final((unsigned char *)final_etag);
+
+  // S3-style multipart etag: hex(md5 of part md5s) + "-" + part count
+  buf_to_hex((unsigned char *)final_etag, sizeof(final_etag), final_etag_str);
+  snprintf(&final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2],
+	   sizeof(final_etag_str) - CEPH_CRYPTO_MD5_DIGESTSIZE * 2,
+           "-%lld", (long long)part_etags.size());
+  etag = final_etag_str;
+  ldpp_dout(dpp, 10) << "calculated etag: " << etag << dendl;
+
+  etag_bl.append(etag);
+
+  attrs[RGW_ATTR_ETAG] = etag_bl;
+
+  if (compressed) {
+    // write compression attribute to full object
+    bufferlist tmp;
+    encode(cs_info, tmp);
+    attrs[RGW_ATTR_COMPRESSION] = tmp;
+  }
+
+  target_obj->set_atomic();
+
+  RGWRados::Object op_target(store->getRados(),
+			     target_obj->get_bucket()->get_info(),
+			     dynamic_cast<RadosObject*>(target_obj)->get_ctx(),
+			     target_obj->get_obj());
+  RGWRados::Object::Write obj_op(&op_target);
+
+  obj_op.meta.manifest = &manifest;
+  obj_op.meta.remove_objs = &remove_objs;
+
+  obj_op.meta.ptag = &tag; /* use req_id as operation tag */
+  obj_op.meta.owner = owner.get_id();
+  obj_op.meta.flags = PUT_OBJ_CREATE;
+  obj_op.meta.modify_tail = true;
+  obj_op.meta.completeMultipart = true;
+  obj_op.meta.olh_epoch = olh_epoch;
+
+  ret = obj_op.write_meta(dpp, ofs, accounted_size, attrs, y);
+  if (ret < 0)
+    return ret;
+
+  return ret;
+}
+
+// Fetch the upload's placement rule and/or attributes from its meta
+// object.  The placement is cached in `placement` after the first read;
+// when it is already cached (or the caller wants only attrs) the head
+// read is skipped.  -ENOENT is mapped to -ERR_NO_SUCH_UPLOAD.
+int RadosMultipartUpload::get_info(const DoutPrefixProvider *dpp, optional_yield y, rgw_placement_rule** rule, rgw::sal::Attrs* attrs)
+{
+  if (!rule && !attrs) {
+    return 0;
+  }
+
+  if (rule) {
+    if (!placement.empty()) {
+      // cached from a previous call
+      *rule = &placement;
+      if (!attrs) {
+	/* Don't need attrs, done */
+	return 0;
+      }
+    } else {
+      *rule = nullptr;
+    }
+  }
+
+  /* We need either attributes or placement, so we need a read */
+  std::unique_ptr<rgw::sal::Object> meta_obj;
+  meta_obj = get_meta_obj();
+  meta_obj->set_in_extra_data(true);
+
+  multipart_upload_info upload_info;
+  bufferlist headbl;
+
+  /* Read the obj head which contains the multipart_upload_info */
+  std::unique_ptr<rgw::sal::Object::ReadOp> read_op = meta_obj->get_read_op();
+  meta_obj->set_prefetch_data();
+
+  int ret = read_op->prepare(y, dpp);
+  if (ret < 0) {
+    if (ret == -ENOENT) {
+      return -ERR_NO_SUCH_UPLOAD;
+    }
+    return ret;
+  }
+
+  // pick up any tracing span propagated via the meta object's attrs
+  extract_span_context(meta_obj->get_attrs(), trace_ctx);
+
+  if (attrs) {
+    /* Attrs are filled in by prepare */
+    *attrs = meta_obj->get_attrs();
+    if (!rule || *rule != nullptr) {
+      /* placement was cached; don't actually read */
+      return 0;
+    }
+  }
+
+  /* Now read the placement from the head */
+  ret = read_op->read(0, store->ctx()->_conf->rgw_max_chunk_size, headbl, y, dpp);
+  if (ret < 0) {
+    if (ret == -ENOENT) {
+      return -ERR_NO_SUCH_UPLOAD;
+    }
+    return ret;
+  }
+
+  if (headbl.length() <= 0) {
+    return -ERR_NO_SUCH_UPLOAD;
+  }
+
+  /* Decode multipart_upload_info */
+  auto hiter = headbl.cbegin();
+  try {
+    decode(upload_info, hiter);
+  } catch (buffer::error& err) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to decode multipart upload info" << dendl;
+    return -EIO;
+  }
+  placement = upload_info.dest_placement;
+  *rule = &placement;
+
+  return 0;
+}
+
+// Build a writer for one part of this upload, with a put-throttle sized
+// by rgw_put_obj_min_window_size.
+std::unique_ptr<Writer> RadosMultipartUpload::get_writer(
+				  const DoutPrefixProvider *dpp,
+				  optional_yield y,
+				  rgw::sal::Object* obj,
+				  const rgw_user& owner,
+				  const rgw_placement_rule *ptail_placement_rule,
+				  uint64_t part_num,
+				  const std::string& part_num_str)
+{
+  RGWBucketInfo& bucket_info = obj->get_bucket()->get_info();
+  RGWObjectCtx& obj_ctx = static_cast<RadosObject*>(obj)->get_ctx();
+  auto aio = rgw::make_throttle(store->ctx()->_conf->rgw_put_obj_min_window_size, y);
+  return std::make_unique<RadosMultipartWriter>(dpp, y, get_upload_id(),
+				bucket_info, obj_ctx,
+				obj->get_obj(), store, std::move(aio), owner,
+				ptail_placement_rule, part_num, part_num_str);
+}
+
+// Serializer for multipart completion: resolves the object's raw oid and
+// opens an ioctx on its data pool so try_lock() can take a cls lock on it.
+// NOTE(review): the return values of get_obj_data_pool() and
+// open_pool_ctx() are ignored here; a failure would surface later as an
+// error from try_lock() — confirm this is the intended behavior.
+MPRadosSerializer::MPRadosSerializer(const DoutPrefixProvider *dpp, RadosStore* store, RadosObject* obj, const std::string& lock_name) :
+  lock(lock_name)
+{
+  rgw_pool meta_pool;
+  rgw_raw_obj raw_obj;
+
+  obj->get_raw_obj(&raw_obj);
+  oid = raw_obj.oid;
+  store->getRados()->get_obj_data_pool(obj->get_bucket()->get_placement_rule(),
+				       obj->get_obj(), &meta_pool);
+  store->getRados()->open_pool_ctx(dpp, meta_pool, ioctx, true, true);
+}
+
+// Attempt an exclusive cls lock on the serializer's object for `dur`.
+// The object must already exist; `locked` records a successful acquire.
+int MPRadosSerializer::try_lock(const DoutPrefixProvider *dpp, utime_t dur, optional_yield y)
+{
+  // stack the existence assertion and the lock into one compound op
+  op.assert_exists();
+  lock.set_duration(dur);
+  lock.lock_exclusive(&op);
+
+  const int r = rgw_rados_operate(dpp, ioctx, oid, &op, y);
+  if (r == 0) {
+    locked = true;
+  }
+  return r;
+}
+
+// Serializer for lifecycle shard processing: locks `_oid` in the LC pool
+// using `cookie` to identify this locker instance.
+LCRadosSerializer::LCRadosSerializer(RadosStore* store, const std::string& _oid, const std::string& lock_name, const std::string& cookie) :
+  StoreLCSerializer(_oid),
+  lock(lock_name)
+{
+  ioctx = &store->getRados()->lc_pool_ctx;
+  lock.set_cookie(cookie);
+}
+
+// Attempt an exclusive cls lock on the LC shard for `dur`.
+// Note: dpp and y are unused; this variant locks synchronously.
+int LCRadosSerializer::try_lock(const DoutPrefixProvider *dpp, utime_t dur, optional_yield y)
+{
+  lock.set_duration(dur);
+  return lock.lock_exclusive(ioctx, oid);
+}
+
+// Fetch the lifecycle entry for `marker` from LC shard object `oid`.
+// On success *entry owns a newly allocated StoreLCEntry; non-zero cls
+// return codes are propagated unchanged.
+int RadosLifecycle::get_entry(const std::string& oid, const std::string& marker,
+			      std::unique_ptr<LCEntry>* entry)
+{
+  cls_rgw_lc_entry cls_entry;
+  int ret = cls_rgw_lc_get_entry(*store->getRados()->get_lc_pool_ctx(), oid, marker, cls_entry);
+  if (ret)
+    return ret;
+
+  // operator new throws on failure rather than returning nullptr, so the
+  // old `if (!e) return -ENOMEM;` check was dead code; make_unique
+  // expresses the ownership transfer directly.
+  *entry = std::make_unique<StoreLCEntry>(cls_entry.bucket, cls_entry.start_time,
+					  cls_entry.status);
+  return 0;
+}
+
+// Fetch the first lifecycle entry after `marker` from LC shard `oid`.
+// On success *entry owns a newly allocated StoreLCEntry; non-zero cls
+// return codes are propagated unchanged.
+int RadosLifecycle::get_next_entry(const std::string& oid, const std::string& marker,
+				   std::unique_ptr<LCEntry>* entry)
+{
+  cls_rgw_lc_entry cls_entry;
+  int ret = cls_rgw_lc_get_next_entry(*store->getRados()->get_lc_pool_ctx(), oid, marker,
+				      cls_entry);
+  if (ret)
+    return ret;
+
+  // operator new throws on failure rather than returning nullptr, so the
+  // old `if (!e) return -ENOMEM;` check was dead code; make_unique
+  // expresses the ownership transfer directly.
+  *entry = std::make_unique<StoreLCEntry>(cls_entry.bucket, cls_entry.start_time,
+					  cls_entry.status);
+  return 0;
+}
+
+// Persist a lifecycle entry into LC shard `oid` via its cls form.
+int RadosLifecycle::set_entry(const std::string& oid, LCEntry& entry)
+{
+  cls_rgw_lc_entry e;
+  e.bucket = entry.get_bucket();
+  e.start_time = entry.get_start_time();
+  e.status = entry.get_status();
+
+  return cls_rgw_lc_set_entry(*store->getRados()->get_lc_pool_ctx(), oid, e);
+}
+
+// List up to `max_entries` lifecycle entries of shard `oid` starting
+// after `marker`, converting each cls entry into a StoreLCEntry.
+int RadosLifecycle::list_entries(const std::string& oid, const std::string& marker,
+				 uint32_t max_entries, std::vector<std::unique_ptr<LCEntry>>& entries)
+{
+  entries.clear();
+
+  vector<cls_rgw_lc_entry> raw_entries;
+  const int ret = cls_rgw_lc_list(*store->getRados()->get_lc_pool_ctx(), oid,
+				  marker, max_entries, raw_entries);
+  if (ret < 0) {
+    return ret;
+  }
+
+  entries.reserve(raw_entries.size());
+  for (const auto& e : raw_entries) {
+    entries.push_back(std::make_unique<StoreLCEntry>(e.bucket, oid,
+						     e.start_time, e.status));
+  }
+
+  return ret;
+}
+
+// Remove a lifecycle entry from LC shard `oid` via its cls form.
+int RadosLifecycle::rm_entry(const std::string& oid, LCEntry& entry)
+{
+  cls_rgw_lc_entry e;
+  e.bucket = entry.get_bucket();
+  e.start_time = entry.get_start_time();
+  e.status = entry.get_status();
+
+  return cls_rgw_lc_rm_entry(*store->getRados()->get_lc_pool_ctx(), oid, e);
+}
+
+// Fetch the lifecycle head record of shard `oid`.  On success *head owns
+// a newly allocated StoreLCHead; non-zero cls return codes propagate.
+int RadosLifecycle::get_head(const std::string& oid, std::unique_ptr<LCHead>* head)
+{
+  cls_rgw_lc_obj_head cls_head;
+  int ret = cls_rgw_lc_get_head(*store->getRados()->get_lc_pool_ctx(), oid, cls_head);
+  if (ret)
+    return ret;
+
+  // operator new throws on failure rather than returning nullptr, so the
+  // old `if (!h) return -ENOMEM;` check was dead code; make_unique
+  // expresses the ownership transfer directly.
+  *head = std::make_unique<StoreLCHead>(cls_head.start_date,
+					cls_head.shard_rollover_date,
+					cls_head.marker);
+  return 0;
+}
+
+// Persist the lifecycle head record of shard `oid` via its cls form.
+int RadosLifecycle::put_head(const std::string& oid, LCHead& head)
+{
+  cls_rgw_lc_obj_head h;
+  h.marker = head.get_marker();
+  h.start_date = head.get_start_date();
+  h.shard_rollover_date = head.get_shard_rollover_date();
+
+  return cls_rgw_lc_put_head(*store->getRados()->get_lc_pool_ctx(), oid, h);
+}
+
+// Factory for the rados-backed LC shard lock serializer.
+std::unique_ptr<LCSerializer> RadosLifecycle::get_serializer(const std::string& lock_name,
+							     const std::string& oid,
+							     const std::string& cookie)
+{
+  return std::make_unique<LCRadosSerializer>(store, oid, lock_name, cookie);
+}
+
+// Reserve a bucket-notification slot for this event (pre-commit phase).
+int RadosNotification::publish_reserve(const DoutPrefixProvider *dpp, RGWObjTags* obj_tags)
+{
+  return rgw::notify::publish_reserve(dpp, event_type, res, obj_tags);
+}
+
+// Commit the previously reserved notification with the final object state.
+int RadosNotification::publish_commit(const DoutPrefixProvider* dpp, uint64_t size,
+				     const ceph::real_time& mtime, const std::string& etag, const std::string& version)
+{
+  return rgw::notify::publish_commit(obj, size, mtime, etag, version, event_type, res, dpp);
+}
+
+// Forward to the atomic object processor.
+int RadosAtomicWriter::prepare(optional_yield y)
+{
+  return processor.prepare(y);
+}
+
+// Forward one data chunk at `offset` to the atomic object processor.
+int RadosAtomicWriter::process(bufferlist&& data, uint64_t offset)
+{
+  return processor.process(std::move(data), offset);
+}
+
+// Finalize the atomic write (metadata, attrs, conditionals) via the
+// underlying processor.
+int RadosAtomicWriter::complete(size_t accounted_size, const std::string& etag,
+                       ceph::real_time *mtime, ceph::real_time set_mtime,
+                       std::map<std::string, bufferlist>& attrs,
+                       ceph::real_time delete_at,
+                       const char *if_match, const char *if_nomatch,
+                       const std::string *user_data,
+                       rgw_zone_set *zones_trace, bool *canceled,
+                       optional_yield y)
+{
+  return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at,
+			    if_match, if_nomatch, user_data, zones_trace, canceled, y);
+}
+
+// Forward to the append object processor.
+int RadosAppendWriter::prepare(optional_yield y)
+{
+  return processor.prepare(y);
+}
+
+// Forward one data chunk at `offset` to the append object processor.
+int RadosAppendWriter::process(bufferlist&& data, uint64_t offset)
+{
+  return processor.process(std::move(data), offset);
+}
+
+// Finalize the append write via the underlying processor.
+int RadosAppendWriter::complete(size_t accounted_size, const std::string& etag,
+                       ceph::real_time *mtime, ceph::real_time set_mtime,
+                       std::map<std::string, bufferlist>& attrs,
+                       ceph::real_time delete_at,
+                       const char *if_match, const char *if_nomatch,
+                       const std::string *user_data,
+                       rgw_zone_set *zones_trace, bool *canceled,
+                       optional_yield y)
+{
+  return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at,
+			    if_match, if_nomatch, user_data, zones_trace, canceled, y);
+}
+
+// Forward to the multipart object processor.
+int RadosMultipartWriter::prepare(optional_yield y)
+{
+  return processor.prepare(y);
+}
+
+// Forward one data chunk at `offset` to the multipart object processor.
+int RadosMultipartWriter::process(bufferlist&& data, uint64_t offset)
+{
+  return processor.process(std::move(data), offset);
+}
+
+// Finalize the part write via the underlying processor.
+int RadosMultipartWriter::complete(size_t accounted_size, const std::string& etag,
+                       ceph::real_time *mtime, ceph::real_time set_mtime,
+                       std::map<std::string, bufferlist>& attrs,
+                       ceph::real_time delete_at,
+                       const char *if_match, const char *if_nomatch,
+                       const std::string *user_data,
+                       rgw_zone_set *zones_trace, bool *canceled,
+                       optional_yield y)
+{
+  return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at,
+			    if_match, if_nomatch, user_data, zones_trace, canceled, y);
+}
+
+// Return the zonegroup's first configured endpoint, falling back to the
+// master zone's first endpoint; `empty` when neither is configured.
+const std::string& RadosZoneGroup::get_endpoint() const
+{
+  if (!group.endpoints.empty()) {
+    return group.endpoints.front();
+  }
+
+  // use zonegroup's master zone endpoints
+  auto master = group.zones.find(group.master_zone);
+  if (master != group.zones.end() && !master->second.endpoints.empty()) {
+    return master->second.endpoints.front();
+  }
+
+  return empty;
+}
+
+// True when this zonegroup has a placement target named `target`.
+bool RadosZoneGroup::placement_target_exists(std::string& target) const
+{
+  return group.placement_targets.count(target) > 0;
+}
+
+// Collect the name of every placement target configured on this group.
+void RadosZoneGroup::get_placement_target_names(std::set<std::string>& names) const
+{
+  for (const auto& [key, placement_target] : group.placement_targets) {
+    names.emplace(placement_target.name);
+  }
+}
+
+// Look up the cloud-tier target for `rule` (placement-target name plus
+// storage class) in this zonegroup.  On success *tier owns a newly
+// allocated RadosPlacementTier; returns -ENOENT when either the target
+// or its storage-class tier entry does not exist.
+int RadosZoneGroup::get_placement_tier(const rgw_placement_rule& rule,
+				       std::unique_ptr<PlacementTier>* tier)
+{
+  std::map<std::string, RGWZoneGroupPlacementTarget>::const_iterator titer;
+  titer = group.placement_targets.find(rule.name);
+  if (titer == group.placement_targets.end()) {
+    return -ENOENT;
+  }
+
+  const auto& target_rule = titer->second;
+  std::map<std::string, RGWZoneGroupPlacementTier>::const_iterator ttier;
+  ttier = target_rule.tier_targets.find(rule.storage_class);
+  if (ttier == target_rule.tier_targets.end()) {
+    // not found
+    return -ENOENT;
+  }
+
+  // operator new throws on failure rather than returning nullptr, so the
+  // old `if (!t) return -ENOMEM;` check was dead code; make_unique
+  // expresses the ownership transfer directly.
+  *tier = std::make_unique<RadosPlacementTier>(store, ttier->second);
+  return 0;
+}
+
+int RadosZoneGroup::get_zone_by_id(const std::string& id, std::unique_ptr<Zone>* zone)
+{
+ RGWZone* rz = store->svc()->zone->find_zone(id);
+ if (!rz)
+ return -ENOENT;
+
+ Zone* z = new RadosZone(store, clone(), *rz);
+ zone->reset(z);
+ return 0;
+}
+
+int RadosZoneGroup::get_zone_by_name(const std::string& name, std::unique_ptr<Zone>* zone)
+{
+ rgw_zone_id id;
+ int ret = store->svc()->zone->find_zone_id_by_name(name, &id);
+ if (ret < 0)
+ return ret;
+
+ RGWZone* rz = store->svc()->zone->find_zone(id.id);
+ if (!rz)
+ return -ENOENT;
+
+ Zone* z = new RadosZone(store, clone(), *rz);
+ zone->reset(z);
+ return 0;
+}
+
+int RadosZoneGroup::list_zones(std::list<std::string>& zone_ids)
+{
+ for (const auto& entry : group.zones)
+ {
+ zone_ids.push_back(entry.second.id);
+ }
+ return 0;
+}
+
+std::unique_ptr<Zone> RadosZone::clone()
+{
+ if (local_zone)
+ return std::make_unique<RadosZone>(store, group->clone());
+
+ return std::make_unique<RadosZone>(store, group->clone(), rgw_zone);
+}
+
+const std::string& RadosZone::get_id()
+{
+ if (local_zone)
+ return store->svc()->zone->zone_id().id;
+
+ return rgw_zone.id;
+}
+
+const std::string& RadosZone::get_name() const
+{
+ if (local_zone)
+ return store->svc()->zone->zone_name();
+
+ return rgw_zone.name;
+}
+
+bool RadosZone::is_writeable()
+{
+ if (local_zone)
+ return store->svc()->zone->zone_is_writeable();
+
+ return !rgw_zone.read_only;
+}
+
+bool RadosZone::get_redirect_endpoint(std::string* endpoint)
+{
+ if (local_zone)
+ return store->svc()->zone->get_redirect_zone_endpoint(endpoint);
+
+ endpoint = &rgw_zone.redirect_zone;
+ return true;
+}
+
// True if some zonegroup in the current configuration advertises 'api'.
// NOTE(review): unlike the other accessors this does not branch on
// local_zone — presumably the api set is configuration-wide; confirm.
bool RadosZone::has_zonegroup_api(const std::string& api) const
{
  return store->svc()->zone->has_zonegroup_api(api);
}
+
// Current period id, as reported by the zone service.
const std::string& RadosZone::get_current_period_id()
{
  return store->svc()->zone->get_current_period_id();
}
+
// System-user access key from the local zone's parameters (used for
// inter-zone authenticated requests).
const RGWAccessKey& RadosZone::get_system_key()
{
  return store->svc()->zone->get_zone_params().system_key;
}
+
// Name of the realm this zone belongs to.
const std::string& RadosZone::get_realm_name()
{
  return store->svc()->zone->get_realm().get_name();
}
+
// Id of the realm this zone belongs to.
const std::string& RadosZone::get_realm_id()
{
  return store->svc()->zone->get_realm().get_id();
}
+
+const std::string_view RadosZone::get_tier_type()
+{
+ if (local_zone)
+ return store->svc()->zone->get_zone().tier_type;
+
+ return rgw_zone.id;
+}
+
// Bucket-sync policy handler for this zone, looked up by the zone's id.
RGWBucketSyncPolicyHandlerRef RadosZone::get_sync_policy_handler()
{
  return store->svc()->zone->get_sync_policy_handler(get_id());
}
+
// Cache the zone's log pool at construction. When the zone services are not
// available yet, an empty (default) rgw_pool is stored; the script accessors
// below treat an empty pool as "no storage configured" and no-op.
RadosLuaManager::RadosLuaManager(RadosStore* _s) :
  store(_s),
  pool((store->svc() && store->svc()->zone) ? store->svc()->zone->get_zone_params().log_pool : rgw_pool())
{ }
+
+int RadosLuaManager::get_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, std::string& script)
+{
+ if (pool.empty()) {
+ ldpp_dout(dpp, 10) << "WARNING: missing pool when reading lua script " << dendl;
+ return 0;
+ }
+ bufferlist bl;
+
+ int r = rgw_get_system_obj(store->svc()->sysobj, pool, key, bl, nullptr, nullptr, y, dpp);
+ if (r < 0) {
+ return r;
+ }
+
+ auto iter = bl.cbegin();
+ try {
+ ceph::decode(script, iter);
+ } catch (buffer::error& err) {
+ return -EIO;
+ }
+
+ return 0;
+}
+
+int RadosLuaManager::put_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, const std::string& script)
+{
+ if (pool.empty()) {
+ ldpp_dout(dpp, 10) << "WARNING: missing pool when writing lua script " << dendl;
+ return 0;
+ }
+ bufferlist bl;
+ ceph::encode(script, bl);
+
+ int r = rgw_put_system_obj(dpp, store->svc()->sysobj, pool, key, bl, false, nullptr, real_time(), y);
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+int RadosLuaManager::del_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key)
+{
+ if (pool.empty()) {
+ ldpp_dout(dpp, 10) << "WARNING: missing pool when deleting lua script " << dendl;
+ return 0;
+ }
+ int r = rgw_delete_system_obj(dpp, store->svc()->sysobj, pool, key, nullptr, y);
+ if (r < 0 && r != -ENOENT) {
+ return r;
+ }
+
+ return 0;
+}
+
+const std::string PACKAGE_LIST_OBJECT_NAME = "lua_package_allowlist";
+
+int RadosLuaManager::add_package(const DoutPrefixProvider *dpp, optional_yield y, const std::string& package_name)
+{
+ // add package to list
+ const bufferlist empty_bl;
+ std::map<std::string, bufferlist> new_package{{package_name, empty_bl}};
+ librados::ObjectWriteOperation op;
+ op.omap_set(new_package);
+ auto ret = rgw_rados_operate(dpp, *(store->getRados()->get_lc_pool_ctx()),
+ PACKAGE_LIST_OBJECT_NAME, &op, y);
+
+ if (ret < 0) {
+ return ret;
+ }
+ return 0;
+}
+
// Remove a package from the allowlist. If 'package_name' contains a space it
// is treated as "name version" and only that exact entry is removed;
// otherwise every versioned entry whose name part matches is removed.
int RadosLuaManager::remove_package(const DoutPrefixProvider *dpp, optional_yield y, const std::string& package_name)
{
  librados::ObjectWriteOperation op;
  size_t pos = package_name.find(" ");
  if (pos != package_name.npos) {
    // remove the specific version of the package
    op.omap_rm_keys(std::set<std::string>({package_name}));
    auto ret = rgw_rados_operate(dpp, *(store->getRados()->get_lc_pool_ctx()),
        PACKAGE_LIST_OBJECT_NAME, &op, y);
    if (ret < 0) {
      return ret;
    }
    return 0;
  }
  // otherwise, remove any existing versions of the package
  rgw::lua::packages_t packages;
  auto ret = list_packages(dpp, y, packages);
  if (ret < 0 && ret != -ENOENT) {
    return ret;
  }
  for (const auto& package : packages) {
    // strip the " version" suffix before comparing names
    const std::string package_no_version = package.substr(0, package.find(" "));
    if (package_no_version.compare(package_name) == 0) {
      op.omap_rm_keys(std::set<std::string>({package}));
      ret = rgw_rados_operate(dpp, *(store->getRados()->get_lc_pool_ctx()),
          PACKAGE_LIST_OBJECT_NAME, &op, y);
      if (ret < 0) {
        return ret;
      }
    }
  }
  return 0;
}
+
+int RadosLuaManager::list_packages(const DoutPrefixProvider *dpp, optional_yield y, rgw::lua::packages_t& packages)
+{
+ constexpr auto max_chunk = 1024U;
+ std::string start_after;
+ bool more = true;
+ int rval;
+ while (more) {
+ librados::ObjectReadOperation op;
+ rgw::lua::packages_t packages_chunk;
+ op.omap_get_keys2(start_after, max_chunk, &packages_chunk, &more, &rval);
+ const auto ret = rgw_rados_operate(dpp, *(store->getRados()->get_lc_pool_ctx()),
+ PACKAGE_LIST_OBJECT_NAME, &op, nullptr, y);
+
+ if (ret < 0) {
+ return ret;
+ }
+
+ packages.merge(packages_chunk);
+ }
+
+ return 0;
+}
+
+int RadosOIDCProvider::store_url(const DoutPrefixProvider *dpp, const std::string& url, bool exclusive, optional_yield y)
+{
+ auto sysobj = store->svc()->sysobj;
+ std::string oid = tenant + get_url_oid_prefix() + url;
+
+ bufferlist bl;
+ using ceph::encode;
+ encode(*this, bl);
+ return rgw_put_system_obj(dpp, sysobj, store->svc()->zone->get_zone_params().oidc_pool, oid, bl, exclusive, nullptr, real_time(), y);
+}
+
// Load this provider's state from the oidc pool, keyed by tenant + url.
// NOTE(review): the 'tenant' parameter shadows the member of the same name;
// the oid here is built from the parameter — confirm callers pass the
// intended tenant. Uses null_yield (blocking read).
int RadosOIDCProvider::read_url(const DoutPrefixProvider *dpp, const std::string& url, const std::string& tenant)
{
  auto sysobj = store->svc()->sysobj;
  auto& pool = store->svc()->zone->get_zone_params().oidc_pool;
  std::string oid = tenant + get_url_oid_prefix() + url;
  bufferlist bl;

  int ret = rgw_get_system_obj(sysobj, pool, oid, bl, nullptr, nullptr, null_yield, dpp);
  if (ret < 0) {
    return ret;
  }

  // decode the provider record in place; corruption maps to -EIO
  try {
    using ceph::decode;
    auto iter = bl.cbegin();
    decode(*this, iter);
  } catch (buffer::error& err) {
    ldpp_dout(dpp, 0) << "ERROR: failed to decode oidc provider info from pool: " << pool.name <<
      ": " << url << dendl;
    return -EIO;
  }

  return 0;
}
+
// Delete this provider's object from the oidc pool. The tenant and url are
// parsed from the provider's ARN and the ARN tenant must match this
// object's tenant; on mismatch or parse failure returns -EINVAL.
int RadosOIDCProvider::delete_obj(const DoutPrefixProvider *dpp, optional_yield y)
{
  auto& pool = store->svc()->zone->get_zone_params().oidc_pool;

  std::string url, tenant;
  auto ret = get_tenant_url_from_arn(tenant, url);
  if (ret < 0) {
    ldpp_dout(dpp, 0) << "ERROR: failed to parse arn" << dendl;
    return -EINVAL;
  }

  // the ARN's tenant must agree with the tenant this object was loaded for
  if (this->tenant != tenant) {
    ldpp_dout(dpp, 0) << "ERROR: tenant in arn doesn't match that of user " << this->tenant << ", "
      << tenant << ": " << dendl;
    return -EINVAL;
  }

  // Delete url
  std::string oid = tenant + get_url_oid_prefix() + url;
  ret = rgw_delete_system_obj(dpp, store->svc()->sysobj, pool, oid, nullptr, y);
  if (ret < 0) {
    ldpp_dout(dpp, 0) << "ERROR: deleting oidc url from pool: " << pool.name << ": "
      << provider_url << ": " << cpp_strerror(-ret) << dendl;
  }

  return ret;
}
+
+int RadosRole::store_info(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y)
+{
+ using ceph::encode;
+ std::string oid;
+
+ oid = info.id;
+
+ bufferlist bl;
+ encode(this->info, bl);
+
+ if (!this->info.tags.empty()) {
+ bufferlist bl_tags;
+ encode(this->info.tags, bl_tags);
+ map<string, bufferlist> attrs;
+ attrs.emplace("tagging", bl_tags);
+
+ RGWSI_MBSObj_PutParams params(bl, &attrs, info.mtime, exclusive);
+ std::unique_ptr<RGWSI_MetaBackend::Context> ctx(store->svc()->role->svc.meta_be->alloc_ctx());
+ ctx->init(store->svc()->role->get_be_handler());
+ return store->svc()->role->svc.meta_be->put(ctx.get(), oid, params, &info.objv_tracker, y, dpp);
+ } else {
+ RGWSI_MBSObj_PutParams params(bl, nullptr, info.mtime, exclusive);
+ std::unique_ptr<RGWSI_MetaBackend::Context> ctx(store->svc()->role->svc.meta_be->alloc_ctx());
+ ctx->init(store->svc()->role->get_be_handler());
+ return store->svc()->role->svc.meta_be->put(ctx.get(), oid, params, &info.objv_tracker, y, dpp);
+ }
+}
+
+int RadosRole::store_name(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y)
+{
+ auto sysobj = store->svc()->sysobj;
+ RGWNameToId nameToId;
+ nameToId.obj_id = info.id;
+
+ std::string oid = info.tenant + get_names_oid_prefix() + info.name;
+
+ bufferlist bl;
+ using ceph::encode;
+ encode(nameToId, bl);
+
+ return rgw_put_system_obj(dpp, sysobj, store->svc()->zone->get_zone_params().roles_pool, oid, bl, exclusive, &info.objv_tracker, real_time(), y);
+}
+
// Write the secondary path index object for this role. The object's name
// (tenant + path-prefix + path + id-prefix + id) carries all the
// information; its payload is deliberately an empty bufferlist.
int RadosRole::store_path(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y)
{
  auto sysobj = store->svc()->sysobj;
  std::string oid = info.tenant + get_path_oid_prefix() + info.path + get_info_oid_prefix() + info.id;

  bufferlist bl;

  return rgw_put_system_obj(dpp, sysobj, store->svc()->zone->get_zone_params().roles_pool, oid, bl, exclusive, &info.objv_tracker, real_time(), y);
}
+
+int RadosRole::read_id(const DoutPrefixProvider *dpp, const std::string& role_name, const std::string& tenant, std::string& role_id, optional_yield y)
+{
+ auto sysobj = store->svc()->sysobj;
+ std::string oid = info.tenant + get_names_oid_prefix() + role_name;
+ bufferlist bl;
+
+ int ret = rgw_get_system_obj(sysobj, store->svc()->zone->get_zone_params().roles_pool, oid, bl, nullptr, nullptr, y, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ RGWNameToId nameToId;
+ try {
+ auto iter = bl.cbegin();
+ using ceph::decode;
+ decode(nameToId, iter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to decode role from Role pool: " << role_name << dendl;
+ return -EIO;
+ }
+ role_id = nameToId.obj_id;
+ return 0;
+}
+
+int RadosRole::read_name(const DoutPrefixProvider *dpp, optional_yield y)
+{
+ auto sysobj = store->svc()->sysobj;
+ std::string oid = info.tenant + get_names_oid_prefix() + info.name;
+ bufferlist bl;
+
+ int ret = rgw_get_system_obj(sysobj, store->svc()->zone->get_zone_params().roles_pool, oid, bl, nullptr, nullptr, y, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed reading role name from Role pool: " << info.name <<
+ ": " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ RGWNameToId nameToId;
+ try {
+ using ceph::decode;
+ auto iter = bl.cbegin();
+ decode(nameToId, iter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to decode role name from Role pool: " << info.name << dendl;
+ return -EIO;
+ }
+ info.id = nameToId.obj_id;
+ return 0;
+}
+
// Load this role's full record from the metadata backend, keyed by info.id:
// decodes the info blob into this->info and, when present, the "tagging"
// attr into info.tags. Decode failures map to -EIO.
int RadosRole::read_info(const DoutPrefixProvider *dpp, optional_yield y)
{
  std::string oid;

  oid = info.id;
  ldpp_dout(dpp, 20) << "INFO: oid in read_info is: " << oid << dendl;

  bufferlist bl;

  // fetch blob + attrs + mtime through the role metadata backend
  RGWSI_MBSObj_GetParams params(&bl, &info.attrs, &info.mtime);
  std::unique_ptr<RGWSI_MetaBackend::Context> ctx(store->svc()->role->svc.meta_be->alloc_ctx());
  ctx->init(store->svc()->role->get_be_handler());
  int ret = store->svc()->role->svc.meta_be->get(ctx.get(), oid, params, &info.objv_tracker, y, dpp, true);
  if (ret < 0) {
    ldpp_dout(dpp, 0) << "ERROR: failed reading role info from Role pool: " << info.id << ": " << cpp_strerror(-ret) << dendl;
    return ret;
  }

  // decode the main info blob (note: overwrites this->info, whose id was
  // used as the key above)
  try {
    using ceph::decode;
    auto iter = bl.cbegin();
    decode(this->info, iter);
  } catch (buffer::error& err) {
    ldpp_dout(dpp, 0) << "ERROR: failed to decode role info from Role pool: " << info.id << dendl;
    return -EIO;
  }

  // tags, if any, are stored in a separate "tagging" attribute
  auto it = info.attrs.find("tagging");
  if (it != info.attrs.end()) {
    bufferlist bl_tags = it->second;
    try {
      using ceph::decode;
      auto iter = bl_tags.cbegin();
      decode(info.tags, iter);
    } catch (buffer::error& err) {
      ldpp_dout(dpp, 0) << "ERROR: failed to decode attrs" << info.id << dendl;
      return -EIO;
    }
  }

  return 0;
}
+
+int RadosRole::create(const DoutPrefixProvider *dpp, bool exclusive, const std::string& role_id, optional_yield y)
+{
+ int ret;
+
+ if (! validate_input(dpp)) {
+ return -EINVAL;
+ }
+
+ if (!role_id.empty()) {
+ info.id = role_id;
+ }
+
+ /* check to see the name is not used */
+ ret = read_id(dpp, info.name, info.tenant, info.id, y);
+ if (exclusive && ret == 0) {
+ ldpp_dout(dpp, 0) << "ERROR: name " << info.name << " already in use for role id "
+ << info.id << dendl;
+ return -EEXIST;
+ } else if ( ret < 0 && ret != -ENOENT) {
+ ldpp_dout(dpp, 0) << "failed reading role id " << info.id << ": "
+ << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ if (info.id.empty()) {
+ /* create unique id */
+ uuid_d new_uuid;
+ char uuid_str[37];
+ new_uuid.generate_random();
+ new_uuid.print(uuid_str);
+ info.id = uuid_str;
+ }
+
+ //arn
+ info.arn = role_arn_prefix + info.tenant + ":role" + info.path + info.name;
+
+ // Creation time
+ real_clock::time_point t = real_clock::now();
+
+ struct timeval tv;
+ real_clock::to_timeval(t, tv);
+
+ char buf[30];
+ struct tm result;
+ gmtime_r(&tv.tv_sec, &result);
+ strftime(buf,30,"%Y-%m-%dT%H:%M:%S", &result);
+ sprintf(buf + strlen(buf),".%dZ",(int)tv.tv_usec/1000);
+ info.creation_date.assign(buf, strlen(buf));
+
+ auto& pool = store->svc()->zone->get_zone_params().roles_pool;
+ ret = store_info(dpp, exclusive, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: storing role info in Role pool: "
+ << info.id << ": " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ ret = store_name(dpp, exclusive, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: storing role name in Role pool: "
+ << info.name << ": " << cpp_strerror(-ret) << dendl;
+
+ //Delete the role info that was stored in the previous call
+ std::string oid = get_info_oid_prefix() + info.id;
+ int info_ret = rgw_delete_system_obj(dpp, store->svc()->sysobj, pool, oid, nullptr, y);
+ if (info_ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: cleanup of role id from Role pool: "
+ << info.id << ": " << cpp_strerror(-info_ret) << dendl;
+ }
+ return ret;
+ }
+
+ ret = store_path(dpp, exclusive, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: storing role path in Role pool: "
+ << info.path << ": " << cpp_strerror(-ret) << dendl;
+ //Delete the role info that was stored in the previous call
+ std::string oid = get_info_oid_prefix() + info.id;
+ int info_ret = rgw_delete_system_obj(dpp, store->svc()->sysobj, pool, oid, nullptr, y);
+ if (info_ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: cleanup of role id from Role pool: "
+ << info.id << ": " << cpp_strerror(-info_ret) << dendl;
+ }
+ //Delete role name that was stored in previous call
+ oid = info.tenant + get_names_oid_prefix() + info.name;
+ int name_ret = rgw_delete_system_obj(dpp, store->svc()->sysobj, pool, oid, nullptr, y);
+ if (name_ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: cleanup of role name from Role pool: "
+ << info.name << ": " << cpp_strerror(-name_ret) << dendl;
+ }
+ return ret;
+ }
+ return 0;
+}
+
// Delete the role: resolves the id via the name index, loads the info to
// check for attached permission policies (-ERR_DELETE_CONFLICT if any),
// then removes the id, name and path objects. The three deletes are
// best-effort — each error is logged and the result of the LAST delete is
// what gets returned.
int RadosRole::delete_obj(const DoutPrefixProvider *dpp, optional_yield y)
{
  auto& pool = store->svc()->zone->get_zone_params().roles_pool;

  int ret = read_name(dpp, y);
  if (ret < 0) {
    return ret;
  }

  ret = read_info(dpp, y);
  if (ret < 0) {
    return ret;
  }

  // refuse to delete a role that still has inline permission policies
  if (! info.perm_policy_map.empty()) {
    return -ERR_DELETE_CONFLICT;
  }

  // Delete id
  std::string oid = get_info_oid_prefix() + info.id;
  ret = rgw_delete_system_obj(dpp, store->svc()->sysobj, pool, oid, nullptr, y);
  if (ret < 0) {
    ldpp_dout(dpp, 0) << "ERROR: deleting role id from Role pool: "
      << info.id << ": " << cpp_strerror(-ret) << dendl;
  }

  // Delete name
  oid = info.tenant + get_names_oid_prefix() + info.name;
  ret = rgw_delete_system_obj(dpp, store->svc()->sysobj, pool, oid, nullptr, y);
  if (ret < 0) {
    ldpp_dout(dpp, 0) << "ERROR: deleting role name from Role pool: "
      << info.name << ": " << cpp_strerror(-ret) << dendl;
  }

  // Delete path
  oid = info.tenant + get_path_oid_prefix() + info.path + get_info_oid_prefix() + info.id;
  ret = rgw_delete_system_obj(dpp, store->svc()->sysobj, pool, oid, nullptr, y);
  if (ret < 0) {
    ldpp_dout(dpp, 0) << "ERROR: deleting role path from Role pool: "
      << info.path << ": " << cpp_strerror(-ret) << dendl;
  }
  return ret;
}
+
+} // namespace rgw::sal
+
+extern "C" {
+
+void* newRadosStore(void)
+{
+ rgw::sal::RadosStore* store = new rgw::sal::RadosStore();
+ if (store) {
+ RGWRados* rados = new RGWRados();
+
+ if (!rados) {
+ delete store; store = nullptr;
+ } else {
+ store->setRados(rados);
+ rados->set_store(store);
+ }
+ }
+
+ return store;
+}
+
+}
diff --git a/src/rgw/driver/rados/rgw_sal_rados.h b/src/rgw/driver/rados/rgw_sal_rados.h
new file mode 100644
index 000000000..4d2dc9709
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_sal_rados.h
@@ -0,0 +1,978 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "rgw_sal_store.h"
+#include "rgw_rados.h"
+#include "rgw_notify.h"
+#include "rgw_oidc_provider.h"
+#include "rgw_role.h"
+#include "rgw_multi.h"
+#include "rgw_putobj_processor.h"
+#include "services/svc_tier_rados.h"
+#include "cls/lock/cls_lock_client.h"
+
+namespace rgw { namespace sal {
+
+class RadosMultipartUpload;
+
// Collects outstanding librados AIO completions so callers can wait for
// ("drain") all of them at once.
class RadosCompletions : public Completions {
  public:
    // raw AioCompletion handles; drained (and released) by drain()
    std::list<librados::AioCompletion*> handles;
    RadosCompletions() {}
    ~RadosCompletions() = default;
    virtual int drain() override;
};
+
// SAL wrapper around a zonegroup placement-tier record (cloud tiering
// configuration); holds a copy of the RGWZoneGroupPlacementTier.
class RadosPlacementTier: public StorePlacementTier {
  RadosStore* store;
  RGWZoneGroupPlacementTier tier;
public:
  RadosPlacementTier(RadosStore* _store, const RGWZoneGroupPlacementTier& _tier) : store(_store), tier(_tier) {}
  virtual ~RadosPlacementTier() = default;

  virtual const std::string& get_tier_type() { return tier.tier_type; }
  virtual const std::string& get_storage_class() { return tier.storage_class; }
  virtual bool retain_head_object() { return tier.retain_head_object; }
  // access to the wrapped record (RadosStore-internal use)
  RGWZoneGroupPlacementTier& get_rt() { return tier; }
};
+
// SAL wrapper around an RGWZoneGroup record. Holds an immutable copy of
// the zonegroup configuration; accessors mostly forward to it.
class RadosZoneGroup : public StoreZoneGroup {
  RadosStore* store;
  const RGWZoneGroup group;
  std::string empty; // returned by reference when no endpoint exists
public:
  RadosZoneGroup(RadosStore* _store, const RGWZoneGroup& _group) : store(_store), group(_group) {}
  virtual ~RadosZoneGroup() = default;

  virtual const std::string& get_id() const override { return group.get_id(); };
  virtual const std::string& get_name() const override { return group.get_name(); };
  virtual int equals(const std::string& other_zonegroup) const override {
    return group.equals(other_zonegroup);
  };
  /** Get the endpoint from zonegroup, or from master zone if not set */
  virtual const std::string& get_endpoint() const override;
  virtual bool placement_target_exists(std::string& target) const override;
  virtual bool is_master_zonegroup() const override {
    return group.is_master_zonegroup();
  };
  virtual const std::string& get_api_name() const override { return group.api_name; };
  virtual void get_placement_target_names(std::set<std::string>& names) const override;
  virtual const std::string& get_default_placement_name() const override {
    return group.default_placement.name; };
  virtual int get_hostnames(std::list<std::string>& names) const override {
    names = group.hostnames;
    return 0;
  };
  virtual int get_s3website_hostnames(std::list<std::string>& names) const override {
    names = group.hostnames_s3website;
    return 0;
  };
  virtual int get_zone_count() const override {
    return group.zones.size();
  }
  virtual int get_placement_tier(const rgw_placement_rule& rule, std::unique_ptr<PlacementTier>* tier);
  virtual int get_zone_by_id(const std::string& id, std::unique_ptr<Zone>* zone) override;
  virtual int get_zone_by_name(const std::string& name, std::unique_ptr<Zone>* zone) override;
  virtual int list_zones(std::list<std::string>& zone_ids) override;
  bool supports(std::string_view feature) const override {
    return group.supports(feature);
  }
  virtual std::unique_ptr<ZoneGroup> clone() override {
    return std::make_unique<RadosZoneGroup>(store, group);
  }
  // access to the wrapped record (RadosStore-internal use)
  const RGWZoneGroup& get_group() const { return group; }
};
+
// SAL wrapper around a zone. Two flavours: the process-local zone (queries
// go through the zone service) and a remote zone described by an explicit
// RGWZone record; 'local_zone' selects which path each accessor takes.
class RadosZone : public StoreZone {
  protected:
    RadosStore* store;
    std::unique_ptr<ZoneGroup> group;
    RGWZone rgw_zone;        // only meaningful when !local_zone
    bool local_zone{false};
  public:
    // local-zone constructor: state comes from the zone service
    RadosZone(RadosStore* _store, std::unique_ptr<ZoneGroup> _zg) : store(_store), group(std::move(_zg)), local_zone(true) {}
    // remote-zone constructor: state comes from the copied RGWZone record
    RadosZone(RadosStore* _store, std::unique_ptr<ZoneGroup> _zg, RGWZone& z) : store(_store), group(std::move(_zg)), rgw_zone(z) {}
    ~RadosZone() = default;

    virtual std::unique_ptr<Zone> clone() override;
    virtual ZoneGroup& get_zonegroup() override { return *(group.get()); }
    virtual const std::string& get_id() override;
    virtual const std::string& get_name() const override;
    virtual bool is_writeable() override;
    virtual bool get_redirect_endpoint(std::string* endpoint) override;
    virtual bool has_zonegroup_api(const std::string& api) const override;
    virtual const std::string& get_current_period_id() override;
    virtual const RGWAccessKey& get_system_key() override;
    virtual const std::string& get_realm_name() override;
    virtual const std::string& get_realm_id() override;
    virtual const std::string_view get_tier_type() override;
    virtual RGWBucketSyncPolicyHandlerRef get_sync_policy_handler() override;
};
+
+class RadosStore : public StoreDriver {
+ private:
+ RGWRados* rados;
+ RGWUserCtl* user_ctl;
+ std::unique_ptr<RadosZone> zone;
+ std::string topics_oid(const std::string& tenant) const;
+
+ public:
+ RadosStore()
+ : rados(nullptr) {
+ }
+ ~RadosStore() {
+ delete rados;
+ }
+
+ virtual int initialize(CephContext *cct, const DoutPrefixProvider *dpp) override;
+ virtual const std::string get_name() const override {
+ return "rados";
+ }
+ virtual std::string get_cluster_id(const DoutPrefixProvider* dpp, optional_yield y) override;
+ virtual std::unique_ptr<User> get_user(const rgw_user& u) override;
+ virtual int get_user_by_access_key(const DoutPrefixProvider* dpp, const std::string& key, optional_yield y, std::unique_ptr<User>* user) override;
+ virtual int get_user_by_email(const DoutPrefixProvider* dpp, const std::string& email, optional_yield y, std::unique_ptr<User>* user) override;
+ virtual int get_user_by_swift(const DoutPrefixProvider* dpp, const std::string& user_str, optional_yield y, std::unique_ptr<User>* user) override;
+ virtual std::unique_ptr<Object> get_object(const rgw_obj_key& k) override;
+ virtual int get_bucket(const DoutPrefixProvider* dpp, User* u, const rgw_bucket& b, std::unique_ptr<Bucket>* bucket, optional_yield y) override;
+ virtual int get_bucket(User* u, const RGWBucketInfo& i, std::unique_ptr<Bucket>* bucket) override;
+ virtual int get_bucket(const DoutPrefixProvider* dpp, User* u, const std::string& tenant, const std::string&name, std::unique_ptr<Bucket>* bucket, optional_yield y) override;
+ virtual bool is_meta_master() override;
+ virtual int forward_request_to_master(const DoutPrefixProvider *dpp, User* user, obj_version* objv,
+ bufferlist& in_data, JSONParser* jp, req_info& info,
+ optional_yield y) override;
+ virtual int forward_iam_request_to_master(const DoutPrefixProvider *dpp, const RGWAccessKey& key, obj_version* objv,
+ bufferlist& in_data,
+ RGWXMLDecoder::XMLParser* parser, req_info& info,
+ optional_yield y) override;
+ virtual Zone* get_zone() { return zone.get(); }
+ virtual std::string zone_unique_id(uint64_t unique_num) override;
+ virtual std::string zone_unique_trans_id(const uint64_t unique_num) override;
+ virtual int get_zonegroup(const std::string& id, std::unique_ptr<ZoneGroup>* zonegroup) override;
+ virtual int list_all_zones(const DoutPrefixProvider* dpp, std::list<std::string>& zone_ids) override;
+ virtual int cluster_stat(RGWClusterStat& stats) override;
+ virtual std::unique_ptr<Lifecycle> get_lifecycle(void) override;
+ virtual std::unique_ptr<Completions> get_completions(void) override;
+ virtual std::unique_ptr<Notification> get_notification(rgw::sal::Object* obj, rgw::sal::Object* src_obj, req_state* s, rgw::notify::EventType event_type, optional_yield y, const std::string* object_name=nullptr) override;
+ virtual std::unique_ptr<Notification> get_notification(
+ const DoutPrefixProvider* dpp, rgw::sal::Object* obj, rgw::sal::Object* src_obj,
+ rgw::notify::EventType event_type, rgw::sal::Bucket* _bucket, std::string& _user_id, std::string& _user_tenant,
+ std::string& _req_id, optional_yield y) override;
+ int read_topics(const std::string& tenant, rgw_pubsub_topics& topics, RGWObjVersionTracker* objv_tracker,
+ optional_yield y, const DoutPrefixProvider *dpp) override;
+ int write_topics(const std::string& tenant, const rgw_pubsub_topics& topics, RGWObjVersionTracker* objv_tracker,
+ optional_yield y, const DoutPrefixProvider *dpp) override;
+ int remove_topics(const std::string& tenant, RGWObjVersionTracker* objv_tracker,
+ optional_yield y, const DoutPrefixProvider *dpp) override;
+ virtual RGWLC* get_rgwlc(void) override { return rados->get_lc(); }
+ virtual RGWCoroutinesManagerRegistry* get_cr_registry() override { return rados->get_cr_registry(); }
+
+ virtual int log_usage(const DoutPrefixProvider *dpp, std::map<rgw_user_bucket, RGWUsageBatch>& usage_info) override;
+ virtual int log_op(const DoutPrefixProvider *dpp, std::string& oid, bufferlist& bl) override;
+ virtual int register_to_service_map(const DoutPrefixProvider *dpp, const std::string& daemon_type,
+ const std::map<std::string, std::string>& meta) override;
+ virtual void get_quota(RGWQuota& quota) override;
+ virtual void get_ratelimit(RGWRateLimitInfo& bucket_ratelimit, RGWRateLimitInfo& user_ratelimit, RGWRateLimitInfo& anon_ratelimit) override;
+ virtual int set_buckets_enabled(const DoutPrefixProvider* dpp, std::vector<rgw_bucket>& buckets, bool enabled) override;
+ virtual int get_sync_policy_handler(const DoutPrefixProvider* dpp,
+ std::optional<rgw_zone_id> zone,
+ std::optional<rgw_bucket> bucket,
+ RGWBucketSyncPolicyHandlerRef* phandler,
+ optional_yield y) override;
+ virtual RGWDataSyncStatusManager* get_data_sync_manager(const rgw_zone_id& source_zone) override;
+ virtual void wakeup_meta_sync_shards(std::set<int>& shard_ids) override { rados->wakeup_meta_sync_shards(shard_ids); }
+ virtual void wakeup_data_sync_shards(const DoutPrefixProvider *dpp, const rgw_zone_id& source_zone, boost::container::flat_map<int, boost::container::flat_set<rgw_data_notify_entry>>& shard_ids) override { rados->wakeup_data_sync_shards(dpp, source_zone, shard_ids); }
+ virtual int clear_usage(const DoutPrefixProvider *dpp) override { return rados->clear_usage(dpp); }
+ virtual int read_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch,
+ uint32_t max_entries, bool* is_truncated,
+ RGWUsageIter& usage_iter,
+ std::map<rgw_user_bucket, rgw_usage_log_entry>& usage) override;
+ virtual int trim_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) override;
+ virtual int get_config_key_val(std::string name, bufferlist* bl) override;
+ virtual int meta_list_keys_init(const DoutPrefixProvider *dpp, const std::string& section, const std::string& marker, void** phandle) override;
+ virtual int meta_list_keys_next(const DoutPrefixProvider *dpp, void* handle, int max, std::list<std::string>& keys, bool* truncated) override;
+ virtual void meta_list_keys_complete(void* handle) override;
+ virtual std::string meta_get_marker(void* handle) override;
+ virtual int meta_remove(const DoutPrefixProvider* dpp, std::string& metadata_key, optional_yield y) override;
+ virtual const RGWSyncModuleInstanceRef& get_sync_module() { return rados->get_sync_module(); }
+ virtual std::string get_host_id() { return rados->host_id; }
+ virtual std::unique_ptr<LuaManager> get_lua_manager() override;
+ virtual std::unique_ptr<RGWRole> get_role(std::string name,
+ std::string tenant,
+ std::string path="",
+ std::string trust_policy="",
+ std::string max_session_duration_str="",
+ std::multimap<std::string,std::string> tags={}) override;
+ virtual std::unique_ptr<RGWRole> get_role(std::string id) override;
+ virtual std::unique_ptr<RGWRole> get_role(const RGWRoleInfo& info) override;
+ virtual int get_roles(const DoutPrefixProvider *dpp,
+ optional_yield y,
+ const std::string& path_prefix,
+ const std::string& tenant,
+ std::vector<std::unique_ptr<RGWRole>>& roles) override;
+ virtual std::unique_ptr<RGWOIDCProvider> get_oidc_provider() override;
+ virtual int get_oidc_providers(const DoutPrefixProvider *dpp,
+ const std::string& tenant,
+ std::vector<std::unique_ptr<RGWOIDCProvider>>& providers) override;
+ virtual std::unique_ptr<Writer> get_append_writer(const DoutPrefixProvider *dpp,
+ optional_yield y,
+ rgw::sal::Object* obj,
+ const rgw_user& owner,
+ const rgw_placement_rule *ptail_placement_rule,
+ const std::string& unique_tag,
+ uint64_t position,
+ uint64_t *cur_accounted_size) override;
+ virtual std::unique_ptr<Writer> get_atomic_writer(const DoutPrefixProvider *dpp,
+ optional_yield y,
+ rgw::sal::Object* obj,
+ const rgw_user& owner,
+ const rgw_placement_rule *ptail_placement_rule,
+ uint64_t olh_epoch,
+ const std::string& unique_tag) override;
+ virtual const std::string& get_compression_type(const rgw_placement_rule& rule) override;
+ virtual bool valid_placement(const rgw_placement_rule& rule) override;
+
+ virtual void finalize(void) override;
+
+ virtual CephContext* ctx(void) override { return rados->ctx(); }
+
+ virtual void register_admin_apis(RGWRESTMgr* mgr) override;
+
+ /* Unique to RadosStore */
+ int get_obj_head_ioctx(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
+ librados::IoCtx* ioctx);
+ int delete_raw_obj(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj);
+ int delete_raw_obj_aio(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, Completions* aio);
+ void get_raw_obj(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_raw_obj* raw_obj);
+ int get_raw_chunk_size(const DoutPrefixProvider* dpp, const rgw_raw_obj& obj, uint64_t* chunk_size);
+
+ void setRados(RGWRados * st) { rados = st; }
+ RGWRados* getRados(void) { return rados; }
+
+ RGWServices* svc() { return &rados->svc; }
+ const RGWServices* svc() const { return &rados->svc; }
+ RGWCtl* ctl() { return &rados->ctl; }
+ const RGWCtl* ctl() const { return &rados->ctl; }
+
+ void setUserCtl(RGWUserCtl *_ctl) { user_ctl = _ctl; }
+};
+
+/* RADOS-backed implementation of rgw::sal::User.  'store' is a borrowed
+ * pointer to the owning RadosStore; the user's identity/metadata live in
+ * the inherited StoreUser state. */
+class RadosUser : public StoreUser {
+ private:
+ RadosStore* store;
+
+ public:
+ /* Construct from a bare user id, or from already-loaded user info. */
+ RadosUser(RadosStore *_st, const rgw_user& _u) : StoreUser(_u), store(_st) { }
+ RadosUser(RadosStore *_st, const RGWUserInfo& _i) : StoreUser(_i), store(_st) { }
+ /* Empty user; identity expected to be filled in later (e.g. load_user()). */
+ RadosUser(RadosStore *_st) : store(_st) { }
+ /* NOTE(review): copy ctor takes a non-const reference, so a const
+  * RadosUser cannot be copied — confirm this is intentional. */
+ RadosUser(RadosUser& _o) = default;
+
+ virtual std::unique_ptr<User> clone() override {
+ return std::unique_ptr<User>(new RadosUser(*this));
+ }
+ int list_buckets(const DoutPrefixProvider* dpp, const std::string& marker, const std::string& end_marker,
+ uint64_t max, bool need_stats, BucketList& buckets,
+ optional_yield y) override;
+ virtual int create_bucket(const DoutPrefixProvider* dpp,
+ const rgw_bucket& b,
+ const std::string& zonegroup_id,
+ rgw_placement_rule& placement_rule,
+ std::string& swift_ver_location,
+ const RGWQuotaInfo * pquota_info,
+ const RGWAccessControlPolicy& policy,
+ Attrs& attrs,
+ RGWBucketInfo& info,
+ obj_version& ep_objv,
+ bool exclusive,
+ bool obj_lock_enabled,
+ bool* existed,
+ req_info& req_info,
+ std::unique_ptr<Bucket>* bucket,
+ optional_yield y) override;
+ virtual int read_attrs(const DoutPrefixProvider* dpp, optional_yield y) override;
+ virtual int merge_and_store_attrs(const DoutPrefixProvider* dpp, Attrs& new_attrs, optional_yield y) override;
+ /* Usage/quota statistics for this user (sync and async variants). */
+ virtual int read_stats(const DoutPrefixProvider *dpp,
+ optional_yield y, RGWStorageStats* stats,
+ ceph::real_time* last_stats_sync = nullptr,
+ ceph::real_time* last_stats_update = nullptr) override;
+ virtual int read_stats_async(const DoutPrefixProvider *dpp, RGWGetUserStats_CB* cb) override;
+ virtual int complete_flush_stats(const DoutPrefixProvider *dpp, optional_yield y) override;
+ virtual int read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
+ bool* is_truncated, RGWUsageIter& usage_iter,
+ std::map<rgw_user_bucket, rgw_usage_log_entry>& usage) override;
+ virtual int trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) override;
+
+ /* User metadata lifecycle: load/store/remove the user record. */
+ virtual int load_user(const DoutPrefixProvider* dpp, optional_yield y) override;
+ virtual int store_user(const DoutPrefixProvider* dpp, optional_yield y, bool exclusive, RGWUserInfo* old_info = nullptr) override;
+ virtual int remove_user(const DoutPrefixProvider* dpp, optional_yield y) override;
+ virtual int verify_mfa(const std::string& mfa_str, bool* verified, const DoutPrefixProvider* dpp, optional_yield y) override;
+
+ friend class RadosBucket;
+};
+
+/* RADOS-backed implementation of rgw::sal::Object.  Carries its own ACLs,
+ * a non-owning manifest pointer, and an RGWObjectCtx that is either owned
+ * by this instance or shared with the object it was copied from (see
+ * rados_ctx_owned). */
+class RadosObject : public StoreObject {
+ private:
+ RadosStore* store;
+ RGWAccessControlPolicy acls;
+ /* Non-owning; null until a manifest is attached elsewhere. */
+ RGWObjManifest *manifest{nullptr};
+ RGWObjectCtx* rados_ctx;
+ /* True when this instance allocated rados_ctx (and the destructor must
+  * free it); false for copies, which share the source's context. */
+ bool rados_ctx_owned;
+
+ public:
+
+ /* Read operation; thin wrapper over RGWRados::Object::Read. */
+ struct RadosReadOp : public ReadOp {
+ private:
+ RadosObject* source;
+ RGWObjectCtx* rctx;
+ RGWRados::Object op_target;
+ RGWRados::Object::Read parent_op;
+
+ public:
+ RadosReadOp(RadosObject *_source, RGWObjectCtx *_rctx);
+
+ virtual int prepare(optional_yield y, const DoutPrefixProvider* dpp) override;
+
+ /*
+ * Both `read` and `iterate` read up through index `end`
+ * *inclusive*. The number of bytes that could be returned is
+ * `end - ofs + 1`.
+ */
+ virtual int read(int64_t ofs, int64_t end,
+ bufferlist& bl, optional_yield y,
+ const DoutPrefixProvider* dpp) override;
+ virtual int iterate(const DoutPrefixProvider* dpp,
+ int64_t ofs, int64_t end,
+ RGWGetDataCB* cb, optional_yield y) override;
+
+ virtual int get_attr(const DoutPrefixProvider* dpp, const char* name, bufferlist& dest, optional_yield y) override;
+ };
+
+ /* Delete operation; thin wrapper over RGWRados::Object::Delete. */
+ struct RadosDeleteOp : public DeleteOp {
+ private:
+ RadosObject* source;
+ RGWRados::Object op_target;
+ RGWRados::Object::Delete parent_op;
+
+ public:
+ RadosDeleteOp(RadosObject* _source);
+
+ virtual int delete_obj(const DoutPrefixProvider* dpp, optional_yield y) override;
+ };
+
+ /* Fresh objects allocate and own their RGWObjectCtx. */
+ RadosObject(RadosStore *_st, const rgw_obj_key& _k)
+ : StoreObject(_k),
+ store(_st),
+ acls(),
+ rados_ctx(new RGWObjectCtx(dynamic_cast<Driver*>(store))),
+ rados_ctx_owned(true) {
+ }
+ RadosObject(RadosStore *_st, const rgw_obj_key& _k, Bucket* _b)
+ : StoreObject(_k, _b),
+ store(_st),
+ acls(),
+ rados_ctx(new RGWObjectCtx(dynamic_cast<Driver*>(store))) ,
+ rados_ctx_owned(true) {
+ }
+ /* Copies share the source's RGWObjectCtx and do not own it, so only the
+  * original instance frees the context. */
+ RadosObject(RadosObject& _o) : StoreObject(_o) {
+ store = _o.store;
+ acls = _o.acls;
+ manifest = _o.manifest;
+ rados_ctx = _o.rados_ctx;
+ rados_ctx_owned = false;
+ }
+
+ virtual ~RadosObject();
+
+ /* Drop cached state for this object from the shared RGWObjectCtx too. */
+ virtual void invalidate() override {
+ StoreObject::invalidate();
+ rados_ctx->invalidate(get_obj());
+ }
+ virtual int delete_object(const DoutPrefixProvider* dpp,
+ optional_yield y, bool prevent_versioning) override;
+ virtual int delete_obj_aio(const DoutPrefixProvider* dpp, RGWObjState* astate, Completions* aio,
+ bool keep_index_consistent, optional_yield y) override;
+ virtual int copy_object(User* user,
+ req_info* info, const rgw_zone_id& source_zone,
+ rgw::sal::Object* dest_object, rgw::sal::Bucket* dest_bucket,
+ rgw::sal::Bucket* src_bucket,
+ const rgw_placement_rule& dest_placement,
+ ceph::real_time* src_mtime, ceph::real_time* mtime,
+ const ceph::real_time* mod_ptr, const ceph::real_time* unmod_ptr,
+ bool high_precision_time,
+ const char* if_match, const char* if_nomatch,
+ AttrsMod attrs_mod, bool copy_if_newer, Attrs& attrs,
+ RGWObjCategory category, uint64_t olh_epoch,
+ boost::optional<ceph::real_time> delete_at,
+ std::string* version_id, std::string* tag, std::string* etag,
+ void (*progress_cb)(off_t, void *), void* progress_data,
+ const DoutPrefixProvider* dpp, optional_yield y) override;
+ virtual RGWAccessControlPolicy& get_acl(void) override { return acls; }
+ virtual int set_acl(const RGWAccessControlPolicy& acl) override { acls = acl; return 0; }
+ /* The set_* flags below are mirrored into the RGWObjectCtx so lower
+  * RGWRados layers observe them as well. */
+ virtual void set_atomic() override {
+ rados_ctx->set_atomic(state.obj);
+ StoreObject::set_atomic();
+ }
+ virtual void set_prefetch_data() override {
+ rados_ctx->set_prefetch_data(state.obj);
+ StoreObject::set_prefetch_data();
+ }
+ virtual void set_compressed() override {
+ rados_ctx->set_compressed(state.obj);
+ StoreObject::set_compressed();
+ }
+
+ virtual int get_obj_state(const DoutPrefixProvider* dpp, RGWObjState **state, optional_yield y, bool follow_olh = true) override;
+ virtual int set_obj_attrs(const DoutPrefixProvider* dpp, Attrs* setattrs, Attrs* delattrs, optional_yield y) override;
+ virtual int get_obj_attrs(optional_yield y, const DoutPrefixProvider* dpp, rgw_obj* target_obj = NULL) override;
+ virtual int modify_obj_attrs(const char* attr_name, bufferlist& attr_val, optional_yield y, const DoutPrefixProvider* dpp) override;
+ virtual int delete_obj_attrs(const DoutPrefixProvider* dpp, const char* attr_name, optional_yield y) override;
+ virtual bool is_expired() override;
+ virtual void gen_rand_obj_instance_name() override;
+ void get_raw_obj(rgw_raw_obj* raw_obj);
+ virtual std::unique_ptr<Object> clone() override {
+ return std::unique_ptr<Object>(new RadosObject(*this));
+ }
+ virtual std::unique_ptr<MPSerializer> get_serializer(const DoutPrefixProvider *dpp,
+ const std::string& lock_name) override;
+ /* Storage-class / placement transitions, including tiering to cloud. */
+ virtual int transition(Bucket* bucket,
+ const rgw_placement_rule& placement_rule,
+ const real_time& mtime,
+ uint64_t olh_epoch,
+ const DoutPrefixProvider* dpp,
+ optional_yield y) override;
+ virtual int transition_to_cloud(Bucket* bucket,
+ rgw::sal::PlacementTier* tier,
+ rgw_bucket_dir_entry& o,
+ std::set<std::string>& cloud_targets,
+ CephContext* cct,
+ bool update_object,
+ const DoutPrefixProvider* dpp,
+ optional_yield y) override;
+ virtual bool placement_rules_match(rgw_placement_rule& r1, rgw_placement_rule& r2) override;
+ virtual int dump_obj_layout(const DoutPrefixProvider *dpp, optional_yield y, Formatter* f) override;
+
+ /* Swift versioning */
+ virtual int swift_versioning_restore(bool& restored,
+ const DoutPrefixProvider* dpp) override;
+ virtual int swift_versioning_copy(const DoutPrefixProvider* dpp,
+ optional_yield y) override;
+
+ /* OPs */
+ virtual std::unique_ptr<ReadOp> get_read_op() override;
+ virtual std::unique_ptr<DeleteOp> get_delete_op() override;
+
+ /* OMAP */
+ virtual int omap_get_vals(const DoutPrefixProvider *dpp, const std::string& marker, uint64_t count,
+ std::map<std::string, bufferlist> *m,
+ bool* pmore, optional_yield y) override;
+ virtual int omap_get_all(const DoutPrefixProvider *dpp, std::map<std::string, bufferlist> *m,
+ optional_yield y) override;
+ virtual int omap_get_vals_by_keys(const DoutPrefixProvider *dpp, const std::string& oid,
+ const std::set<std::string>& keys,
+ Attrs* vals) override;
+ virtual int omap_set_val_by_key(const DoutPrefixProvider *dpp, const std::string& key, bufferlist& val,
+ bool must_exist, optional_yield y) override;
+ virtual int chown(User& new_user, const DoutPrefixProvider* dpp, optional_yield y) override;
+
+ /* Internal to RadosStore */
+ int get_max_chunk_size(const DoutPrefixProvider* dpp,
+ rgw_placement_rule placement_rule,
+ uint64_t* max_chunk_size,
+ uint64_t* alignment = nullptr);
+ void get_max_aligned_size(uint64_t size, uint64_t alignment, uint64_t* max_size);
+ void raw_obj_to_obj(const rgw_raw_obj& raw_obj);
+ int write_cloud_tier(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ uint64_t olh_epoch,
+ rgw::sal::PlacementTier* tier,
+ bool is_multipart_upload,
+ rgw_placement_rule& target_placement,
+ Object* head_obj);
+ RGWObjManifest* get_manifest() { return manifest; }
+ RGWObjectCtx& get_ctx() { return *rados_ctx; }
+
+ private:
+ /* Shared attribute-read helper used by the public attr accessors. */
+ int read_attrs(const DoutPrefixProvider* dpp, RGWRados::Object::Read &read_op, optional_yield y, rgw_obj* target_obj = nullptr);
+};
+
+/* RADOS-backed implementation of rgw::sal::Bucket.  Bucket identity and
+ * metadata live in the inherited StoreBucket state; this class adds the
+ * RADOS-specific operations (index, stats, pubsub topics, linking). */
+class RadosBucket : public StoreBucket {
+ private:
+ RadosStore* store;
+ RGWAccessControlPolicy acls;
+ /* oid of the RADOS object used by read/write/remove_topics() below. */
+ std::string topics_oid() const;
+
+ public:
+ /* Constructors cover the combinations of known bucket identity
+  * (none / rgw_bucket key / entrypoint / full info) and owning user. */
+ RadosBucket(RadosStore *_st)
+ : store(_st),
+ acls() {
+ }
+
+ RadosBucket(RadosStore *_st, User* _u)
+ : StoreBucket(_u),
+ store(_st),
+ acls() {
+ }
+
+ RadosBucket(RadosStore *_st, const rgw_bucket& _b)
+ : StoreBucket(_b),
+ store(_st),
+ acls() {
+ }
+
+ RadosBucket(RadosStore *_st, const RGWBucketEnt& _e)
+ : StoreBucket(_e),
+ store(_st),
+ acls() {
+ }
+
+ RadosBucket(RadosStore *_st, const RGWBucketInfo& _i)
+ : StoreBucket(_i),
+ store(_st),
+ acls() {
+ }
+
+ RadosBucket(RadosStore *_st, const rgw_bucket& _b, User* _u)
+ : StoreBucket(_b, _u),
+ store(_st),
+ acls() {
+ }
+
+ RadosBucket(RadosStore *_st, const RGWBucketEnt& _e, User* _u)
+ : StoreBucket(_e, _u),
+ store(_st),
+ acls() {
+ }
+
+ RadosBucket(RadosStore *_st, const RGWBucketInfo& _i, User* _u)
+ : StoreBucket(_i, _u),
+ store(_st),
+ acls() {
+ }
+
+ virtual ~RadosBucket();
+ virtual std::unique_ptr<Object> get_object(const rgw_obj_key& k) override;
+ virtual int list(const DoutPrefixProvider* dpp, ListParams&, int, ListResults&, optional_yield y) override;
+ virtual int remove_bucket(const DoutPrefixProvider* dpp, bool delete_children, bool forward_to_master, req_info* req_info, optional_yield y) override;
+ virtual int remove_bucket_bypass_gc(int concurrent_max, bool
+ keep_index_consistent,
+ optional_yield y, const
+ DoutPrefixProvider *dpp) override;
+ virtual RGWAccessControlPolicy& get_acl(void) override { return acls; }
+ virtual int set_acl(const DoutPrefixProvider* dpp, RGWAccessControlPolicy& acl, optional_yield y) override;
+ virtual int load_bucket(const DoutPrefixProvider* dpp, optional_yield y, bool get_stats = false) override;
+ /* Per-shard bucket-index statistics (sync and async variants). */
+ virtual int read_stats(const DoutPrefixProvider *dpp,
+ const bucket_index_layout_generation& idx_layout,
+ int shard_id, std::string* bucket_ver, std::string* master_ver,
+ std::map<RGWObjCategory, RGWStorageStats>& stats,
+ std::string* max_marker = nullptr,
+ bool* syncstopped = nullptr) override;
+ virtual int read_stats_async(const DoutPrefixProvider *dpp,
+ const bucket_index_layout_generation& idx_layout,
+ int shard_id, RGWGetBucketStats_CB* ctx) override;
+ virtual int sync_user_stats(const DoutPrefixProvider *dpp, optional_yield y) override;
+ virtual int update_container_stats(const DoutPrefixProvider* dpp) override;
+ virtual int check_bucket_shards(const DoutPrefixProvider* dpp) override;
+ virtual int chown(const DoutPrefixProvider* dpp, User& new_user, optional_yield y) override;
+ virtual int put_info(const DoutPrefixProvider* dpp, bool exclusive, ceph::real_time mtime) override;
+ virtual bool is_owner(User* user) override;
+ virtual int check_empty(const DoutPrefixProvider* dpp, optional_yield y) override;
+ virtual int check_quota(const DoutPrefixProvider *dpp, RGWQuota& quota, uint64_t obj_size, optional_yield y, bool check_size_only = false) override;
+ virtual int merge_and_store_attrs(const DoutPrefixProvider* dpp, Attrs& attrs, optional_yield y) override;
+ virtual int try_refresh_info(const DoutPrefixProvider* dpp, ceph::real_time* pmtime) override;
+ virtual int read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
+ bool* is_truncated, RGWUsageIter& usage_iter,
+ std::map<rgw_user_bucket, rgw_usage_log_entry>& usage) override;
+ virtual int trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) override;
+ /* Bucket-index maintenance operations. */
+ virtual int remove_objs_from_index(const DoutPrefixProvider *dpp, std::list<rgw_obj_index_key>& objs_to_unlink) override;
+ virtual int check_index(const DoutPrefixProvider *dpp, std::map<RGWObjCategory, RGWStorageStats>& existing_stats, std::map<RGWObjCategory, RGWStorageStats>& calculated_stats) override;
+ virtual int rebuild_index(const DoutPrefixProvider *dpp) override;
+ virtual int set_tag_timeout(const DoutPrefixProvider *dpp, uint64_t timeout) override;
+ virtual int purge_instance(const DoutPrefixProvider* dpp) override;
+ virtual std::unique_ptr<Bucket> clone() override {
+ return std::make_unique<RadosBucket>(*this);
+ }
+ /* Multipart uploads rooted at this bucket. */
+ virtual std::unique_ptr<MultipartUpload> get_multipart_upload(
+ const std::string& oid,
+ std::optional<std::string> upload_id=std::nullopt,
+ ACLOwner owner={}, ceph::real_time mtime=real_clock::now()) override;
+ virtual int list_multiparts(const DoutPrefixProvider *dpp,
+ const std::string& prefix,
+ std::string& marker,
+ const std::string& delim,
+ const int& max_uploads,
+ std::vector<std::unique_ptr<MultipartUpload>>& uploads,
+ std::map<std::string, bool> *common_prefixes,
+ bool *is_truncated) override;
+ virtual int abort_multiparts(const DoutPrefixProvider* dpp,
+ CephContext* cct) override;
+ /* Per-bucket pubsub topic configuration, stored under topics_oid(). */
+ int read_topics(rgw_pubsub_bucket_topics& notifications, RGWObjVersionTracker* objv_tracker,
+ optional_yield y, const DoutPrefixProvider *dpp) override;
+ int write_topics(const rgw_pubsub_bucket_topics& notifications, RGWObjVersionTracker* objv_tracker,
+ optional_yield y, const DoutPrefixProvider *dpp) override;
+ int remove_topics(RGWObjVersionTracker* objv_tracker,
+ optional_yield y, const DoutPrefixProvider *dpp) override;
+
+ private:
+ /* Link/unlink this bucket to/from a user; RadosUser drives these. */
+ int link(const DoutPrefixProvider* dpp, User* new_user, optional_yield y, bool update_entrypoint = true, RGWObjVersionTracker* objv = nullptr);
+ int unlink(const DoutPrefixProvider* dpp, User* new_user, optional_yield y, bool update_entrypoint = true);
+ friend class RadosUser;
+};
+
+/* One part of a RADOS multipart upload: a thin wrapper over the
+ * RGWUploadPartInfo record for that part. */
+class RadosMultipartPart : public StoreMultipartPart {
+protected:
+ RGWUploadPartInfo info;
+
+public:
+ RadosMultipartPart() = default;
+ virtual ~RadosMultipartPart() = default;
+
+ /* 'override' added: these implement the MultipartPart interface, and
+  * marking them keeps this class consistent with the rest of the header
+  * and catches any future base-signature drift at compile time. */
+ virtual uint32_t get_num() override { return info.num; }
+ virtual uint64_t get_size() override { return info.accounted_size; }
+ virtual const std::string& get_etag() override { return info.etag; }
+ virtual ceph::real_time& get_mtime() override { return info.modified; }
+
+ /* For RadosStore code */
+ RGWObjManifest& get_manifest() { return info.manifest; }
+ /* Prefixes recorded from earlier uploads of this part number; consumed
+  * by RadosMultipartUpload::cleanup_part_history(). */
+ const std::set<std::string>& get_past_prefixes() const { return info.past_prefixes; }
+
+ friend class RadosMultipartUpload;
+};
+
+/* RADOS-backed multipart upload.  mp_obj encapsulates the upload's meta
+ * object key; placement and manifest are populated as the upload
+ * progresses and completes. */
+class RadosMultipartUpload : public StoreMultipartUpload {
+ RadosStore* store;
+ RGWMPObj mp_obj;
+ ACLOwner owner;
+ ceph::real_time mtime;
+ rgw_placement_rule placement;
+ RGWObjManifest manifest;
+
+public:
+ RadosMultipartUpload(RadosStore* _store, Bucket* _bucket, const std::string& oid,
+ std::optional<std::string> upload_id, ACLOwner owner,
+ ceph::real_time _mtime)
+ : StoreMultipartUpload(_bucket), store(_store), mp_obj(oid, upload_id),
+ owner(owner), mtime(_mtime) {}
+ virtual ~RadosMultipartUpload() = default;
+
+ /* Identity accessors delegate to the embedded RGWMPObj. */
+ virtual const std::string& get_meta() const override { return mp_obj.get_meta(); }
+ virtual const std::string& get_key() const override { return mp_obj.get_key(); }
+ virtual const std::string& get_upload_id() const override { return mp_obj.get_upload_id(); }
+ virtual const ACLOwner& get_owner() const override { return owner; }
+ virtual ceph::real_time& get_mtime() override { return mtime; }
+ virtual std::unique_ptr<rgw::sal::Object> get_meta_obj() override;
+ virtual int init(const DoutPrefixProvider* dpp, optional_yield y, ACLOwner& owner, rgw_placement_rule& dest_placement, rgw::sal::Attrs& attrs) override;
+ virtual int list_parts(const DoutPrefixProvider* dpp, CephContext* cct,
+ int num_parts, int marker,
+ int* next_marker, bool* truncated,
+ bool assume_unsorted = false) override;
+ virtual int abort(const DoutPrefixProvider* dpp, CephContext* cct) override;
+ /* Assemble the final object from the uploaded parts. */
+ virtual int complete(const DoutPrefixProvider* dpp,
+ optional_yield y, CephContext* cct,
+ std::map<int, std::string>& part_etags,
+ std::list<rgw_obj_index_key>& remove_objs,
+ uint64_t& accounted_size, bool& compressed,
+ RGWCompressionInfo& cs_info, off_t& ofs,
+ std::string& tag, ACLOwner& owner,
+ uint64_t olh_epoch,
+ rgw::sal::Object* target_obj) override;
+ virtual int get_info(const DoutPrefixProvider *dpp, optional_yield y, rgw_placement_rule** rule, rgw::sal::Attrs* attrs = nullptr) override;
+ virtual std::unique_ptr<Writer> get_writer(const DoutPrefixProvider *dpp,
+ optional_yield y,
+ rgw::sal::Object* obj,
+ const rgw_user& owner,
+ const rgw_placement_rule *ptail_placement_rule,
+ uint64_t part_num,
+ const std::string& part_num_str) override;
+protected:
+ /* Queue removal of objects left behind by earlier uploads of a
+  * re-uploaded part (see RGWUploadPartInfo past_prefixes). */
+ int cleanup_part_history(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ RadosMultipartPart* part,
+ std::list<rgw_obj_index_key>& remove_objs);
+};
+
+/* Multipart-upload serializer based on a RADOS cls_lock held on the
+ * upload's meta object.  Owns its IoCtx by value. */
+class MPRadosSerializer : public StoreMPSerializer {
+ librados::IoCtx ioctx;
+ rados::cls::lock::Lock lock;
+ librados::ObjectWriteOperation op;
+
+public:
+ MPRadosSerializer(const DoutPrefixProvider *dpp, RadosStore* store, RadosObject* obj, const std::string& lock_name);
+
+ virtual int try_lock(const DoutPrefixProvider *dpp, utime_t dur, optional_yield y) override;
+ /* Release the lock on 'oid' (inherited from StoreMPSerializer). */
+ virtual int unlock() override {
+ return lock.unlock(&ioctx, oid);
+ }
+};
+
+/* Lifecycle-processing serializer based on a RADOS cls_lock.  Unlike
+ * MPRadosSerializer, the IoCtx is held by (non-owning) pointer. */
+class LCRadosSerializer : public StoreLCSerializer {
+ librados::IoCtx* ioctx;
+ rados::cls::lock::Lock lock;
+
+public:
+ LCRadosSerializer(RadosStore* store, const std::string& oid, const std::string& lock_name, const std::string& cookie);
+
+ virtual int try_lock(const DoutPrefixProvider *dpp, utime_t dur, optional_yield y) override;
+ /* Release the lock on 'oid' (inherited from StoreLCSerializer). */
+ virtual int unlock() override {
+ return lock.unlock(ioctx, oid);
+ }
+};
+
+/* RADOS-backed lifecycle bookkeeping: entries and head markers are
+ * persisted via the store, with locking provided by LCRadosSerializer. */
+class RadosLifecycle : public StoreLifecycle {
+ RadosStore* store;
+
+public:
+ RadosLifecycle(RadosStore* _st) : store(_st) {}
+
+ using StoreLifecycle::get_entry;
+ virtual int get_entry(const std::string& oid, const std::string& marker, std::unique_ptr<LCEntry>* entry) override;
+ virtual int get_next_entry(const std::string& oid, const std::string& marker, std::unique_ptr<LCEntry>* entry) override;
+ virtual int set_entry(const std::string& oid, LCEntry& entry) override;
+ virtual int list_entries(const std::string& oid, const std::string& marker,
+ uint32_t max_entries,
+ std::vector<std::unique_ptr<LCEntry>>& entries) override;
+ virtual int rm_entry(const std::string& oid, LCEntry& entry) override;
+ virtual int get_head(const std::string& oid, std::unique_ptr<LCHead>* head) override;
+ virtual int put_head(const std::string& oid, LCHead& head) override;
+ virtual std::unique_ptr<LCSerializer> get_serializer(const std::string& lock_name,
+ const std::string& oid,
+ const std::string& cookie) override;
+};
+
+/* Bucket-notification implementation; holds the two-phase
+ * (reserve/commit) reservation state for an event. */
+class RadosNotification : public StoreNotification {
+ RadosStore* store;
+ /* XXX it feels incorrect to me that rgw::notify::reservation_t is
+ * currently RADOS-specific; instead, I think notification types such as
+ * reservation_t should be generally visible, whereas the internal
+ * notification behavior should be made portable (e.g., notification
+ * to non-RADOS message sinks) */
+ rgw::notify::reservation_t res;
+
+ public:
+ /* Request-driven variant: reservation details come from req_state. */
+ RadosNotification(const DoutPrefixProvider* _dpp, RadosStore* _store, Object* _obj, Object* _src_obj, req_state* _s, rgw::notify::EventType _type, optional_yield y, const std::string* object_name) :
+ StoreNotification(_obj, _src_obj, _type), store(_store), res(_dpp, _store, _s, _obj, _src_obj, object_name, y) { }
+
+ /* Explicit variant: bucket/user/request identifiers passed directly
+  * (used when no req_state is available). */
+ RadosNotification(const DoutPrefixProvider* _dpp, RadosStore* _store, Object* _obj, Object* _src_obj, rgw::notify::EventType _type, rgw::sal::Bucket* _bucket, std::string& _user_id, std::string& _user_tenant, std::string& _req_id, optional_yield y) :
+ StoreNotification(_obj, _src_obj, _type), store(_store), res(_dpp, _store, _obj, _src_obj, _bucket, _user_id, _user_tenant, _req_id, y) {}
+
+ ~RadosNotification() = default;
+
+ rgw::notify::reservation_t& get_reservation(void) {
+ return res;
+ }
+
+ virtual int publish_reserve(const DoutPrefixProvider *dpp, RGWObjTags* obj_tags = nullptr) override;
+ virtual int publish_commit(const DoutPrefixProvider* dpp, uint64_t size,
+ const ceph::real_time& mtime, const std::string& etag, const std::string& version) override;
+};
+
+/* Writer for atomic whole-object PUTs, delegating the data path to
+ * rgw::putobj::AtomicObjectProcessor. */
+class RadosAtomicWriter : public StoreWriter {
+protected:
+ rgw::sal::RadosStore* store;
+ /* NOTE: 'aio' must be declared before 'processor' — the processor is
+  * constructed from &*aio in the member-init list below. */
+ std::unique_ptr<Aio> aio;
+ RGWObjectCtx& obj_ctx;
+ rgw::putobj::AtomicObjectProcessor processor;
+
+public:
+ RadosAtomicWriter(const DoutPrefixProvider *dpp,
+ optional_yield y,
+ RGWBucketInfo& bucket_info,
+ RGWObjectCtx& obj_ctx,
+ const rgw_obj& obj,
+ RadosStore* _store, std::unique_ptr<Aio> _aio,
+ const rgw_user& owner,
+ const rgw_placement_rule *ptail_placement_rule,
+ uint64_t olh_epoch,
+ const std::string& unique_tag) :
+ StoreWriter(dpp, y),
+ store(_store),
+ aio(std::move(_aio)),
+ obj_ctx(obj_ctx),
+ processor(&*aio, store->getRados(), bucket_info,
+ ptail_placement_rule, owner, obj_ctx,
+ obj, olh_epoch, unique_tag,
+ dpp, y)
+ {}
+ ~RadosAtomicWriter() = default;
+
+ // prepare to start processing object data
+ virtual int prepare(optional_yield y) override;
+
+ // Process a bufferlist
+ virtual int process(bufferlist&& data, uint64_t offset) override;
+
+ // complete the operation and make its result visible to clients
+ virtual int complete(size_t accounted_size, const std::string& etag,
+ ceph::real_time *mtime, ceph::real_time set_mtime,
+ std::map<std::string, bufferlist>& attrs,
+ ceph::real_time delete_at,
+ const char *if_match, const char *if_nomatch,
+ const std::string *user_data,
+ rgw_zone_set *zones_trace, bool *canceled,
+ optional_yield y) override;
+};
+
+/* Writer for append operations at a given byte position, delegating the
+ * data path to rgw::putobj::AppendObjectProcessor. */
+class RadosAppendWriter : public StoreWriter {
+protected:
+ rgw::sal::RadosStore* store;
+ /* NOTE: 'aio' must be declared before 'processor' — the processor is
+  * constructed from &*aio in the member-init list below. */
+ std::unique_ptr<Aio> aio;
+ RGWObjectCtx& obj_ctx;
+ rgw::putobj::AppendObjectProcessor processor;
+
+public:
+ RadosAppendWriter(const DoutPrefixProvider *dpp,
+ optional_yield y,
+ RGWBucketInfo& bucket_info,
+ RGWObjectCtx& obj_ctx,
+ const rgw_obj& obj,
+ RadosStore* _store, std::unique_ptr<Aio> _aio,
+ const rgw_user& owner,
+ const rgw_placement_rule *ptail_placement_rule,
+ const std::string& unique_tag,
+ uint64_t position,
+ uint64_t *cur_accounted_size) :
+ StoreWriter(dpp, y),
+ store(_store),
+ aio(std::move(_aio)),
+ obj_ctx(obj_ctx),
+ processor(&*aio, store->getRados(), bucket_info,
+ ptail_placement_rule, owner, obj_ctx,
+ obj, unique_tag, position,
+ cur_accounted_size, dpp, y)
+ {}
+ ~RadosAppendWriter() = default;
+
+ // prepare to start processing object data
+ virtual int prepare(optional_yield y) override;
+
+ // Process a bufferlist
+ virtual int process(bufferlist&& data, uint64_t offset) override;
+
+ // complete the operation and make its result visible to clients
+ virtual int complete(size_t accounted_size, const std::string& etag,
+ ceph::real_time *mtime, ceph::real_time set_mtime,
+ std::map<std::string, bufferlist>& attrs,
+ ceph::real_time delete_at,
+ const char *if_match, const char *if_nomatch,
+ const std::string *user_data,
+ rgw_zone_set *zones_trace, bool *canceled,
+ optional_yield y) override;
+};
+
+/* Writer for a single multipart-upload part, delegating the data path to
+ * rgw::putobj::MultipartObjectProcessor. */
+class RadosMultipartWriter : public StoreWriter {
+protected:
+ rgw::sal::RadosStore* store;
+ /* NOTE: 'aio' must be declared before 'processor' — the processor is
+  * constructed from &*aio in the member-init list below. */
+ std::unique_ptr<Aio> aio;
+ RGWObjectCtx& obj_ctx;
+ rgw::putobj::MultipartObjectProcessor processor;
+
+public:
+ RadosMultipartWriter(const DoutPrefixProvider *dpp,
+ optional_yield y, const std::string& upload_id,
+ RGWBucketInfo& bucket_info,
+ RGWObjectCtx& obj_ctx,
+ const rgw_obj& obj,
+ RadosStore* _store, std::unique_ptr<Aio> _aio,
+ const rgw_user& owner,
+ const rgw_placement_rule *ptail_placement_rule,
+ uint64_t part_num, const std::string& part_num_str) :
+ StoreWriter(dpp, y),
+ store(_store),
+ aio(std::move(_aio)),
+ obj_ctx(obj_ctx),
+ processor(&*aio, store->getRados(), bucket_info,
+ ptail_placement_rule, owner, obj_ctx,
+ obj, upload_id,
+ part_num, part_num_str, dpp, y)
+ {}
+ ~RadosMultipartWriter() = default;
+
+ // prepare to start processing object data
+ virtual int prepare(optional_yield y) override;
+
+ // Process a bufferlist
+ virtual int process(bufferlist&& data, uint64_t offset) override;
+
+ // complete the operation and make its result visible to clients
+ virtual int complete(size_t accounted_size, const std::string& etag,
+ ceph::real_time *mtime, ceph::real_time set_mtime,
+ std::map<std::string, bufferlist>& attrs,
+ ceph::real_time delete_at,
+ const char *if_match, const char *if_nomatch,
+ const std::string *user_data,
+ rgw_zone_set *zones_trace, bool *canceled,
+ optional_yield y) override;
+};
+
+/* Lua script and package storage backed by RADOS objects in 'pool'. */
+class RadosLuaManager : public StoreLuaManager {
+ RadosStore* const store;
+ rgw_pool pool;
+
+public:
+ RadosLuaManager(RadosStore* _s);
+ virtual ~RadosLuaManager() = default;
+
+ /* 'override' added: these implement the LuaManager interface, and
+  * marking them keeps this class consistent with the rest of the header
+  * and catches any future base-signature drift at compile time. */
+ virtual int get_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, std::string& script) override;
+ virtual int put_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, const std::string& script) override;
+ virtual int del_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key) override;
+ virtual int add_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name) override;
+ virtual int remove_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name) override;
+ virtual int list_packages(const DoutPrefixProvider* dpp, optional_yield y, rgw::lua::packages_t& packages) override;
+};
+
+/* RADOS-backed OIDC provider record.  encode/decode delegate to the base
+ * class so WRITE_CLASS_ENCODER (declared at the end of this header) can
+ * serialize this type. */
+class RadosOIDCProvider : public RGWOIDCProvider {
+ RadosStore* store;
+public:
+ RadosOIDCProvider(RadosStore* _store) : store(_store) {}
+ ~RadosOIDCProvider() = default;
+
+ virtual int store_url(const DoutPrefixProvider *dpp, const std::string& url, bool exclusive, optional_yield y) override;
+ virtual int read_url(const DoutPrefixProvider *dpp, const std::string& url, const std::string& tenant) override;
+ virtual int delete_obj(const DoutPrefixProvider *dpp, optional_yield y) override;
+ void encode(bufferlist& bl) const {
+ RGWOIDCProvider::encode(bl);
+ }
+ void decode(bufferlist::const_iterator& bl) {
+ RGWOIDCProvider::decode(bl);
+ }
+};
+
+/* RADOS-backed IAM role.  Role metadata itself lives in the RGWRole
+ * base; this class supplies the RADOS persistence operations. */
+class RadosRole : public RGWRole {
+ RadosStore* store;
+public:
+ /* Construct a new role from its attributes (forwarded to RGWRole),
+  * or look one up by id or by already-loaded info. */
+ RadosRole(RadosStore* _store, std::string name,
+ std::string tenant,
+ std::string path,
+ std::string trust_policy,
+ std::string max_session_duration,
+ std::multimap<std::string,std::string> tags) : RGWRole(name, tenant, path, trust_policy, max_session_duration, tags), store(_store) {}
+ RadosRole(RadosStore* _store, std::string id) : RGWRole(id), store(_store) {}
+ RadosRole(RadosStore* _store, const RGWRoleInfo& info) : RGWRole(info), store(_store) {}
+ RadosRole(RadosStore* _store) : store(_store) {}
+ ~RadosRole() = default;
+
+ virtual int store_info(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y) override;
+ virtual int store_name(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y) override;
+ virtual int store_path(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y) override;
+ virtual int read_id(const DoutPrefixProvider *dpp, const std::string& role_name, const std::string& tenant, std::string& role_id, optional_yield y) override;
+ virtual int read_name(const DoutPrefixProvider *dpp, optional_yield y) override;
+ virtual int read_info(const DoutPrefixProvider *dpp, optional_yield y) override;
+ virtual int create(const DoutPrefixProvider *dpp, bool exclusive, const std::string& role_id, optional_yield y) override;
+ virtual int delete_obj(const DoutPrefixProvider *dpp, optional_yield y) override;
+};
+}} // namespace rgw::sal
+
+WRITE_CLASS_ENCODER(rgw::sal::RadosOIDCProvider)
diff --git a/src/rgw/driver/rados/rgw_service.cc b/src/rgw/driver/rados/rgw_service.cc
new file mode 100644
index 000000000..4fcb1ebde
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_service.cc
@@ -0,0 +1,476 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_service.h"
+
+#include "services/svc_finisher.h"
+#include "services/svc_bi_rados.h"
+#include "services/svc_bilog_rados.h"
+#include "services/svc_bucket_sobj.h"
+#include "services/svc_bucket_sync_sobj.h"
+#include "services/svc_cls.h"
+#include "services/svc_config_key_rados.h"
+#include "services/svc_mdlog.h"
+#include "services/svc_meta.h"
+#include "services/svc_meta_be.h"
+#include "services/svc_meta_be_sobj.h"
+#include "services/svc_meta_be_otp.h"
+#include "services/svc_notify.h"
+#include "services/svc_otp.h"
+#include "services/svc_rados.h"
+#include "services/svc_zone.h"
+#include "services/svc_zone_utils.h"
+#include "services/svc_quota.h"
+#include "services/svc_sync_modules.h"
+#include "services/svc_sys_obj.h"
+#include "services/svc_sys_obj_cache.h"
+#include "services/svc_sys_obj_core.h"
+#include "services/svc_user_rados.h"
+#include "services/svc_role_rados.h"
+
+#include "common/errno.h"
+
+#include "rgw_bucket.h"
+#include "rgw_datalog.h"
+#include "rgw_metadata.h"
+#include "rgw_otp.h"
+#include "rgw_user.h"
+#include "rgw_role.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
// Members are default-constructed (all null unique_ptrs); real wiring
// happens in init().
RGWServices_Def::RGWServices_Def() = default;
// Destructor tears services down in the same order shutdown() does;
// shutdown() itself is a no-op unless init() got far enough to set
// can_shutdown, and it guards against being run twice.
RGWServices_Def::~RGWServices_Def()
{
  shutdown();
}
+
+int RGWServices_Def::init(CephContext *cct,
+ bool have_cache,
+ bool raw,
+ bool run_sync,
+ optional_yield y,
+ const DoutPrefixProvider *dpp)
+{
+ finisher = std::make_unique<RGWSI_Finisher>(cct);
+ bucket_sobj = std::make_unique<RGWSI_Bucket_SObj>(cct);
+ bucket_sync_sobj = std::make_unique<RGWSI_Bucket_Sync_SObj>(cct);
+ bi_rados = std::make_unique<RGWSI_BucketIndex_RADOS>(cct);
+ bilog_rados = std::make_unique<RGWSI_BILog_RADOS>(cct);
+ cls = std::make_unique<RGWSI_Cls>(cct);
+ config_key_rados = std::make_unique<RGWSI_ConfigKey_RADOS>(cct);
+ datalog_rados = std::make_unique<RGWDataChangesLog>(cct);
+ mdlog = std::make_unique<RGWSI_MDLog>(cct, run_sync);
+ meta = std::make_unique<RGWSI_Meta>(cct);
+ meta_be_sobj = std::make_unique<RGWSI_MetaBackend_SObj>(cct);
+ meta_be_otp = std::make_unique<RGWSI_MetaBackend_OTP>(cct);
+ notify = std::make_unique<RGWSI_Notify>(cct);
+ otp = std::make_unique<RGWSI_OTP>(cct);
+ rados = std::make_unique<RGWSI_RADOS>(cct);
+ zone = std::make_unique<RGWSI_Zone>(cct);
+ zone_utils = std::make_unique<RGWSI_ZoneUtils>(cct);
+ quota = std::make_unique<RGWSI_Quota>(cct);
+ sync_modules = std::make_unique<RGWSI_SyncModules>(cct);
+ sysobj = std::make_unique<RGWSI_SysObj>(cct);
+ sysobj_core = std::make_unique<RGWSI_SysObj_Core>(cct);
+ user_rados = std::make_unique<RGWSI_User_RADOS>(cct);
+ role_rados = std::make_unique<RGWSI_Role_RADOS>(cct);
+
+ if (have_cache) {
+ sysobj_cache = std::make_unique<RGWSI_SysObj_Cache>(dpp, cct);
+ }
+
+ vector<RGWSI_MetaBackend *> meta_bes{meta_be_sobj.get(), meta_be_otp.get()};
+
+ finisher->init();
+ bi_rados->init(zone.get(), rados.get(), bilog_rados.get(), datalog_rados.get());
+ bilog_rados->init(bi_rados.get());
+ bucket_sobj->init(zone.get(), sysobj.get(), sysobj_cache.get(),
+ bi_rados.get(), meta.get(), meta_be_sobj.get(),
+ sync_modules.get(), bucket_sync_sobj.get());
+ bucket_sync_sobj->init(zone.get(),
+ sysobj.get(),
+ sysobj_cache.get(),
+ bucket_sobj.get());
+ cls->init(zone.get(), rados.get());
+ config_key_rados->init(rados.get());
+ mdlog->init(rados.get(), zone.get(), sysobj.get(), cls.get());
+ meta->init(sysobj.get(), mdlog.get(), meta_bes);
+ meta_be_sobj->init(sysobj.get(), mdlog.get());
+ meta_be_otp->init(sysobj.get(), mdlog.get(), cls.get());
+ notify->init(zone.get(), rados.get(), finisher.get());
+ otp->init(zone.get(), meta.get(), meta_be_otp.get());
+ rados->init();
+ zone->init(sysobj.get(), rados.get(), sync_modules.get(), bucket_sync_sobj.get());
+ zone_utils->init(rados.get(), zone.get());
+ quota->init(zone.get());
+ sync_modules->init(zone.get());
+ sysobj_core->core_init(rados.get(), zone.get());
+ if (have_cache) {
+ sysobj_cache->init(rados.get(), zone.get(), notify.get());
+ sysobj->init(rados.get(), sysobj_cache.get());
+ } else {
+ sysobj->init(rados.get(), sysobj_core.get());
+ }
+ user_rados->init(rados.get(), zone.get(), sysobj.get(), sysobj_cache.get(),
+ meta.get(), meta_be_sobj.get(), sync_modules.get());
+ role_rados->init(zone.get(), meta.get(), meta_be_sobj.get(), sysobj.get());
+
+ can_shutdown = true;
+
+ int r = finisher->start(y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start finisher service (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ if (!raw) {
+ r = notify->start(y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start notify service (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+ }
+
+ r = rados->start(y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start rados service (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ if (!raw) {
+ r = zone->start(y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start zone service (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ r = datalog_rados->start(dpp, &zone->get_zone(),
+ zone->get_zone_params(),
+ rados->get_rados_handle());
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start datalog_rados service (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ r = mdlog->start(y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start mdlog service (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ r = sync_modules->start(y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start sync modules service (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+ }
+
+ r = cls->start(y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start cls service (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ r = config_key_rados->start(y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start config_key service (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ r = zone_utils->start(y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start zone_utils service (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ r = quota->start(y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start quota service (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ r = sysobj_core->start(y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start sysobj_core service (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ if (have_cache) {
+ r = sysobj_cache->start(y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start sysobj_cache service (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+ }
+
+ r = sysobj->start(y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start sysobj service (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ if (!raw) {
+ r = meta_be_sobj->start(y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start meta_be_sobj service (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ r = meta->start(y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start meta service (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ r = bucket_sobj->start(y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start bucket service (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ r = bucket_sync_sobj->start(y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start bucket_sync service (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ r = user_rados->start(y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start user_rados service (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ r = otp->start(y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start otp service (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ r = role_rados->start(y, dpp);
+ if (r < 0) {
+ ldout(cct, 0) << "ERROR: failed to start role_rados service (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ }
+
+ /* cache or core services will be started by sysobj */
+
+ return 0;
+}
+
+void RGWServices_Def::shutdown()
+{
+ if (!can_shutdown) {
+ return;
+ }
+
+ if (has_shutdown) {
+ return;
+ }
+
+ role_rados->shutdown();
+ datalog_rados.reset();
+ user_rados->shutdown();
+ sync_modules->shutdown();
+ otp->shutdown();
+ notify->shutdown();
+ meta_be_otp->shutdown();
+ meta_be_sobj->shutdown();
+ meta->shutdown();
+ mdlog->shutdown();
+ config_key_rados->shutdown();
+ cls->shutdown();
+ bilog_rados->shutdown();
+ bi_rados->shutdown();
+ bucket_sync_sobj->shutdown();
+ bucket_sobj->shutdown();
+ finisher->shutdown();
+
+ sysobj->shutdown();
+ sysobj_core->shutdown();
+ notify->shutdown();
+ if (sysobj_cache) {
+ sysobj_cache->shutdown();
+ }
+ quota->shutdown();
+ zone_utils->shutdown();
+ zone->shutdown();
+ rados->shutdown();
+
+ has_shutdown = true;
+
+}
+
+
// Initialize the owned RGWServices_Def and publish raw, non-owning
// pointers to each service (plus interface-typed aliases such as
// bi/bucket/config_key) for convenient access by the rest of RGW.
// Returns the error from RGWServices_Def::init() on failure.
int RGWServices::do_init(CephContext *_cct, bool have_cache, bool raw, bool run_sync, optional_yield y, const DoutPrefixProvider *dpp)
{
  cct = _cct;

  int r = _svc.init(cct, have_cache, raw, run_sync, y, dpp);
  if (r < 0) {
    return r;
  }

  // Non-owning views into _svc; lifetime is tied to this object.
  finisher = _svc.finisher.get();
  bi_rados = _svc.bi_rados.get();
  bi = bi_rados;                       // interface-typed alias
  bilog_rados = _svc.bilog_rados.get();
  bucket_sobj = _svc.bucket_sobj.get();
  bucket = bucket_sobj;                // interface-typed alias
  bucket_sync_sobj = _svc.bucket_sync_sobj.get();
  bucket_sync = bucket_sync_sobj;      // interface-typed alias
  cls = _svc.cls.get();
  config_key_rados = _svc.config_key_rados.get();
  config_key = config_key_rados;       // interface-typed alias
  datalog_rados = _svc.datalog_rados.get();
  mdlog = _svc.mdlog.get();
  meta = _svc.meta.get();
  meta_be_sobj = _svc.meta_be_sobj.get();
  meta_be_otp = _svc.meta_be_otp.get();
  notify = _svc.notify.get();
  otp = _svc.otp.get();
  rados = _svc.rados.get();
  zone = _svc.zone.get();
  zone_utils = _svc.zone_utils.get();
  quota = _svc.quota.get();
  sync_modules = _svc.sync_modules.get();
  sysobj = _svc.sysobj.get();
  cache = _svc.sysobj_cache.get();     // may be null when have_cache is false
  core = _svc.sysobj_core.get();
  user = _svc.user_rados.get();
  role = _svc.role_rados.get();

  return 0;
}
+
+RGWServiceInstance::~RGWServiceInstance() {}
+
+int RGWServiceInstance::start(optional_yield y, const DoutPrefixProvider *dpp)
+{
+ if (start_state != StateInit) {
+ return 0;
+ }
+
+ start_state = StateStarting;; /* setting started prior to do_start() on purpose so that circular
+ references can call start() on each other */
+
+ int r = do_start(y, dpp);
+ if (r < 0) {
+ return r;
+ }
+
+ start_state = StateStarted;
+
+ return 0;
+}
+
+RGWCtlDef::RGWCtlDef() {}
+RGWCtlDef::~RGWCtlDef() {}
+RGWCtlDef::_meta::_meta() {}
+RGWCtlDef::_meta::~_meta() {}
+
+
+int RGWCtlDef::init(RGWServices& svc, rgw::sal::Driver* driver, const DoutPrefixProvider *dpp)
+{
+ meta.mgr.reset(new RGWMetadataManager(svc.meta));
+
+ meta.user.reset(RGWUserMetaHandlerAllocator::alloc(svc.user));
+
+ auto sync_module = svc.sync_modules->get_sync_module();
+ if (sync_module) {
+ meta.bucket.reset(sync_module->alloc_bucket_meta_handler());
+ meta.bucket_instance.reset(sync_module->alloc_bucket_instance_meta_handler(driver));
+ } else {
+ meta.bucket.reset(RGWBucketMetaHandlerAllocator::alloc());
+ meta.bucket_instance.reset(RGWBucketInstanceMetaHandlerAllocator::alloc(driver));
+ }
+
+ meta.otp.reset(RGWOTPMetaHandlerAllocator::alloc());
+ meta.role = std::make_unique<rgw::sal::RGWRoleMetadataHandler>(driver, svc.role);
+
+ user.reset(new RGWUserCtl(svc.zone, svc.user, (RGWUserMetadataHandler *)meta.user.get()));
+ bucket.reset(new RGWBucketCtl(svc.zone,
+ svc.bucket,
+ svc.bucket_sync,
+ svc.bi, svc.user));
+ otp.reset(new RGWOTPCtl(svc.zone, svc.otp));
+
+ RGWBucketMetadataHandlerBase *bucket_meta_handler = static_cast<RGWBucketMetadataHandlerBase *>(meta.bucket.get());
+ RGWBucketInstanceMetadataHandlerBase *bi_meta_handler = static_cast<RGWBucketInstanceMetadataHandlerBase *>(meta.bucket_instance.get());
+
+ bucket_meta_handler->init(svc.bucket, bucket.get());
+ bi_meta_handler->init(svc.zone, svc.bucket, svc.bi);
+
+ RGWOTPMetadataHandlerBase *otp_handler = static_cast<RGWOTPMetadataHandlerBase *>(meta.otp.get());
+ otp_handler->init(svc.zone, svc.meta_be_otp, svc.otp);
+
+ user->init(bucket.get());
+ bucket->init(user.get(),
+ (RGWBucketMetadataHandler *)bucket_meta_handler,
+ (RGWBucketInstanceMetadataHandler *)bi_meta_handler,
+ svc.datalog_rados,
+ dpp);
+
+ otp->init((RGWOTPMetadataHandler *)meta.otp.get());
+
+ return 0;
+}
+
+int RGWCtl::init(RGWServices *_svc, rgw::sal::Driver* driver, const DoutPrefixProvider *dpp)
+{
+ svc = _svc;
+ cct = svc->cct;
+
+ int r = _ctl.init(*svc, driver, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start init ctls (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ meta.mgr = _ctl.meta.mgr.get();
+ meta.user = _ctl.meta.user.get();
+ meta.bucket = _ctl.meta.bucket.get();
+ meta.bucket_instance = _ctl.meta.bucket_instance.get();
+ meta.otp = _ctl.meta.otp.get();
+ meta.role = _ctl.meta.role.get();
+
+ user = _ctl.user.get();
+ bucket = _ctl.bucket.get();
+ otp = _ctl.otp.get();
+
+ r = meta.user->attach(meta.mgr);
+ if (r < 0) {
+ ldout(cct, 0) << "ERROR: failed to start init meta.user ctl (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ r = meta.bucket->attach(meta.mgr);
+ if (r < 0) {
+ ldout(cct, 0) << "ERROR: failed to start init meta.bucket ctl (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ r = meta.bucket_instance->attach(meta.mgr);
+ if (r < 0) {
+ ldout(cct, 0) << "ERROR: failed to start init meta.bucket_instance ctl (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ r = meta.otp->attach(meta.mgr);
+ if (r < 0) {
+ ldout(cct, 0) << "ERROR: failed to start init otp ctl (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ r = meta.role->attach(meta.mgr);
+ if (r < 0) {
+ ldout(cct, 0) << "ERROR: failed to start init otp ctl (" << cpp_strerror(-r) << dendl;
+ return r;
+ }
+ return 0;
+}
+
diff --git a/src/rgw/driver/rados/rgw_service.h b/src/rgw/driver/rados/rgw_service.h
new file mode 100644
index 000000000..4c0b8d842
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_service.h
@@ -0,0 +1,215 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include <memory>
+
+#include "common/async/yield_context.h"
+
+#include "rgw_common.h"
+
+struct RGWServices_Def;
+
/* Base class for all RGW services. Subclasses are constructed, wired
 * together via their own init()-style methods, then started through
 * start(); RGWServices_Def drives that lifecycle (hence the friend). */
class RGWServiceInstance
{
  friend struct RGWServices_Def;

protected:
  CephContext *cct;  // non-owning

  // Lifecycle state machine driven by start(); StateStarting is set
  // before do_start() so mutually-dependent services can re-enter start().
  enum StartState {
    StateInit = 0,
    StateStarting = 1,
    StateStarted = 2,
  } start_state{StateInit};

  virtual void shutdown() {}
  // Subclass hook invoked once by start(); return negative errno on failure.
  virtual int do_start(optional_yield, const DoutPrefixProvider *dpp) {
    return 0;
  }
public:
  RGWServiceInstance(CephContext *_cct) : cct(_cct) {}
  virtual ~RGWServiceInstance();

  int start(optional_yield y, const DoutPrefixProvider *dpp);
  bool is_started() {
    return (start_state == StateStarted);
  }

  CephContext *ctx() {
    return cct;
  }
};
+
+class RGWSI_Finisher;
+class RGWSI_Bucket;
+class RGWSI_Bucket_SObj;
+class RGWSI_Bucket_Sync;
+class RGWSI_Bucket_Sync_SObj;
+class RGWSI_BucketIndex;
+class RGWSI_BucketIndex_RADOS;
+class RGWSI_BILog_RADOS;
+class RGWSI_Cls;
+class RGWSI_ConfigKey;
+class RGWSI_ConfigKey_RADOS;
+class RGWSI_MDLog;
+class RGWSI_Meta;
+class RGWSI_MetaBackend;
+class RGWSI_MetaBackend_SObj;
+class RGWSI_MetaBackend_OTP;
+class RGWSI_Notify;
+class RGWSI_OTP;
+class RGWSI_RADOS;
+class RGWSI_Zone;
+class RGWSI_ZoneUtils;
+class RGWSI_Quota;
+class RGWSI_SyncModules;
+class RGWSI_SysObj;
+class RGWSI_SysObj_Core;
+class RGWSI_SysObj_Cache;
+class RGWSI_User;
+class RGWSI_User_RADOS;
+class RGWDataChangesLog;
+class RGWSI_Role_RADOS;
+
/* Owns every concrete service instance. init() allocates, wires and starts
 * them; shutdown() (also run from the destructor) stops them once. */
struct RGWServices_Def
{
  bool can_shutdown{false};  // set once init() has wired everything
  bool has_shutdown{false};  // guards against double shutdown

  std::unique_ptr<RGWSI_Finisher> finisher;
  std::unique_ptr<RGWSI_Bucket_SObj> bucket_sobj;
  std::unique_ptr<RGWSI_Bucket_Sync_SObj> bucket_sync_sobj;
  std::unique_ptr<RGWSI_BucketIndex_RADOS> bi_rados;
  std::unique_ptr<RGWSI_BILog_RADOS> bilog_rados;
  std::unique_ptr<RGWSI_Cls> cls;
  std::unique_ptr<RGWSI_ConfigKey_RADOS> config_key_rados;
  std::unique_ptr<RGWSI_MDLog> mdlog;
  std::unique_ptr<RGWSI_Meta> meta;
  std::unique_ptr<RGWSI_MetaBackend_SObj> meta_be_sobj;
  std::unique_ptr<RGWSI_MetaBackend_OTP> meta_be_otp;
  std::unique_ptr<RGWSI_Notify> notify;
  std::unique_ptr<RGWSI_OTP> otp;
  std::unique_ptr<RGWSI_RADOS> rados;
  std::unique_ptr<RGWSI_Zone> zone;
  std::unique_ptr<RGWSI_ZoneUtils> zone_utils;
  std::unique_ptr<RGWSI_Quota> quota;
  std::unique_ptr<RGWSI_SyncModules> sync_modules;
  std::unique_ptr<RGWSI_SysObj> sysobj;
  std::unique_ptr<RGWSI_SysObj_Core> sysobj_core;
  std::unique_ptr<RGWSI_SysObj_Cache> sysobj_cache;  // only when have_cache
  std::unique_ptr<RGWSI_User_RADOS> user_rados;
  std::unique_ptr<RGWDataChangesLog> datalog_rados;
  std::unique_ptr<RGWSI_Role_RADOS> role_rados;

  RGWServices_Def();
  ~RGWServices_Def();

  int init(CephContext *cct, bool have_cache, bool raw_storage, bool run_sync, optional_yield y, const DoutPrefixProvider *dpp);
  void shutdown();
};
+
+
/* Facade over RGWServices_Def: owns the services via _svc and exposes
 * non-owning raw pointers (some through interface-typed aliases, e.g.
 * bi for bi_rados) that the rest of RGW accesses directly. */
struct RGWServices
{
  RGWServices_Def _svc;  // the actual owner of every service

  CephContext *cct;

  RGWSI_Finisher *finisher{nullptr};
  RGWSI_Bucket *bucket{nullptr};            // alias of bucket_sobj
  RGWSI_Bucket_SObj *bucket_sobj{nullptr};
  RGWSI_Bucket_Sync *bucket_sync{nullptr};  // alias of bucket_sync_sobj
  RGWSI_Bucket_Sync_SObj *bucket_sync_sobj{nullptr};
  RGWSI_BucketIndex *bi{nullptr};           // alias of bi_rados
  RGWSI_BucketIndex_RADOS *bi_rados{nullptr};
  RGWSI_BILog_RADOS *bilog_rados{nullptr};
  RGWSI_Cls *cls{nullptr};
  RGWSI_ConfigKey_RADOS *config_key_rados{nullptr};
  RGWSI_ConfigKey *config_key{nullptr};     // alias of config_key_rados
  RGWDataChangesLog *datalog_rados{nullptr};
  RGWSI_MDLog *mdlog{nullptr};
  RGWSI_Meta *meta{nullptr};
  RGWSI_MetaBackend *meta_be_sobj{nullptr};
  RGWSI_MetaBackend *meta_be_otp{nullptr};
  RGWSI_Notify *notify{nullptr};
  RGWSI_OTP *otp{nullptr};
  RGWSI_RADOS *rados{nullptr};
  RGWSI_Zone *zone{nullptr};
  RGWSI_ZoneUtils *zone_utils{nullptr};
  RGWSI_Quota *quota{nullptr};
  RGWSI_SyncModules *sync_modules{nullptr};
  RGWSI_SysObj *sysobj{nullptr};
  RGWSI_SysObj_Cache *cache{nullptr};       // null when caching disabled
  RGWSI_SysObj_Core *core{nullptr};
  RGWSI_User *user{nullptr};
  RGWSI_Role_RADOS *role{nullptr};

  int do_init(CephContext *cct, bool have_cache, bool raw_storage, bool run_sync, optional_yield y, const DoutPrefixProvider *dpp);

  // Normal (zone-aware) initialization.
  int init(CephContext *cct, bool have_cache, bool run_sync, optional_yield y, const DoutPrefixProvider *dpp) {
    return do_init(cct, have_cache, false, run_sync, y, dpp);
  }

  // Raw-storage initialization: skips zone/sync-dependent services.
  int init_raw(CephContext *cct, bool have_cache, optional_yield y, const DoutPrefixProvider *dpp) {
    return do_init(cct, have_cache, true, false, y, dpp);
  }
  void shutdown() {
    _svc.shutdown();
  }
};
+
+class RGWMetadataManager;
+class RGWMetadataHandler;
+class RGWUserCtl;
+class RGWBucketCtl;
+class RGWOTPCtl;
+
/* Owns the controller layer: metadata handlers plus the user/bucket/otp
 * controllers. init() allocates and cross-wires them; the out-of-line
 * special members exist because the unique_ptr element types are only
 * forward-declared here. */
struct RGWCtlDef {
  struct _meta {
    std::unique_ptr<RGWMetadataManager> mgr;
    std::unique_ptr<RGWMetadataHandler> bucket;
    std::unique_ptr<RGWMetadataHandler> bucket_instance;
    std::unique_ptr<RGWMetadataHandler> user;
    std::unique_ptr<RGWMetadataHandler> otp;
    std::unique_ptr<RGWMetadataHandler> role;

    _meta();
    ~_meta();
  } meta;

  std::unique_ptr<RGWUserCtl> user;
  std::unique_ptr<RGWBucketCtl> bucket;
  std::unique_ptr<RGWOTPCtl> otp;

  RGWCtlDef();
  ~RGWCtlDef();

  int init(RGWServices& svc, rgw::sal::Driver* driver, const DoutPrefixProvider *dpp);
};
+
/* Facade over RGWCtlDef, mirroring the RGWServices/RGWServices_Def split:
 * _ctl owns the controllers, while the raw pointers below are non-owning
 * views published by init() for direct access. */
struct RGWCtl {
  CephContext *cct{nullptr};
  RGWServices *svc{nullptr};  // non-owning

  RGWCtlDef _ctl;  // the actual owner

  struct _meta {
    RGWMetadataManager *mgr{nullptr};

    RGWMetadataHandler *bucket{nullptr};
    RGWMetadataHandler *bucket_instance{nullptr};
    RGWMetadataHandler *user{nullptr};
    RGWMetadataHandler *otp{nullptr};
    RGWMetadataHandler *role{nullptr};
  } meta;

  RGWUserCtl *user{nullptr};
  RGWBucketCtl *bucket{nullptr};
  RGWOTPCtl *otp{nullptr};

  int init(RGWServices *_svc, rgw::sal::Driver* driver, const DoutPrefixProvider *dpp);
};
diff --git a/src/rgw/driver/rados/rgw_sync.cc b/src/rgw/driver/rados/rgw_sync.cc
new file mode 100644
index 000000000..d0ec90796
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_sync.cc
@@ -0,0 +1,2568 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_sync.h"
+#include "rgw_rest_conn.h"
+#include "rgw_cr_rados.h"
+#include "rgw_cr_rest.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_mdlog.h"
+#include "services/svc_cls.h"
+
+#include <boost/asio/yield.hpp>
+
+#define dout_subsys ceph_subsys_rgw
+
+#undef dout_prefix
+#define dout_prefix (*_dout << "meta sync: ")
+
+using namespace std;
+
// Object names under which metadata-sync state is persisted (used with the
// zone's log pool — see shard_obj_name()/status_oid() below).
static string mdlog_sync_status_oid = "mdlog.sync-status";                 // global sync status
static string mdlog_sync_status_shard_prefix = "mdlog.sync-status.shard";  // per-shard: "<prefix>.<id>"
static string mdlog_sync_full_sync_index_prefix = "meta.full-sync.index";  // full-sync index objects
+
+RGWContinuousLeaseCR::~RGWContinuousLeaseCR() {}
+
// Precompute the per-shard error-log object names ("<prefix>.<shard>");
// log_error_cr() round-robins entries across these shards.
RGWSyncErrorLogger::RGWSyncErrorLogger(rgw::sal::RadosStore* _store, const string &oid_prefix, int _num_shards) : store(_store), num_shards(_num_shards) {
  for (int i = 0; i < num_shards; i++) {
    oids.push_back(get_shard_oid(oid_prefix, i));
  }
}
+string RGWSyncErrorLogger::get_shard_oid(const string& oid_prefix, int shard_id) {
+ char buf[oid_prefix.size() + 16];
+ snprintf(buf, sizeof(buf), "%s.%d", oid_prefix.c_str(), shard_id);
+ return string(buf);
+}
+
// Build (but do not run) a coroutine that appends one error record to the
// sync error log, distributing entries round-robin across the shard objects.
// Caller owns the returned coroutine.
RGWCoroutine *RGWSyncErrorLogger::log_error_cr(const DoutPrefixProvider *dpp, const string& source_zone, const string& section, const string& name, uint32_t error_code, const string& message) {
  cls_log_entry entry;

  rgw_sync_error_info info(source_zone, error_code, message);
  bufferlist bl;
  encode(info, bl);
  store->svc()->cls->timelog.prepare_entry(entry, real_clock::now(), section, name, bl);

  // counter is shared across callers; modulo spreads load over the shards.
  uint32_t shard_id = ++counter % num_shards;


  return new RGWRadosTimelogAddCR(dpp, store, oids[shard_id], entry);
}
+
+void RGWSyncBackoff::update_wait_time()
+{
+ if (cur_wait == 0) {
+ cur_wait = 1;
+ } else {
+ cur_wait = (cur_wait << 1);
+ }
+ if (cur_wait >= max_secs) {
+ cur_wait = max_secs;
+ }
+}
+
// Blocking variant: bump the wait time and sleep the calling thread.
void RGWSyncBackoff::backoff_sleep()
{
  update_wait_time();
  sleep(cur_wait);
}

// Coroutine variant: bump the wait time and suspend op instead of blocking.
void RGWSyncBackoff::backoff(RGWCoroutine *op)
{
  update_wait_time();
  op->wait(utime_t(cur_wait, 0));
}
+
// Stackless coroutine (reenter/yield are boost.asio coroutine macros, so
// statement order and yield points are load-bearing) that repeatedly runs
// the coroutine produced by alloc_cr() with exponential backoff until it
// succeeds, then runs an optional finisher coroutine.
int RGWBackoffControlCR::operate(const DoutPrefixProvider *dpp) {
  reenter(this) {
    // retry the operation until it succeeds
    while (true) {
      yield {
        // lock guards the shared cr pointer; get()/put() keep the child
        // coroutine refcounted while we hold it across the call
        std::lock_guard l{lock};
        cr = alloc_cr();
        cr->get();
        call(cr);
      }
      {
        std::lock_guard l{lock};
        cr->put();
        cr = NULL;
      }
      if (retcode >= 0) {
        break;  // success: fall through to the finisher
      }
      if (retcode != -EBUSY && retcode != -EAGAIN) {
        // unexpected error: EBUSY/EAGAIN are considered retryable
        ldout(cct, 0) << "ERROR: RGWBackoffControlCR called coroutine returned " << retcode << dendl;
        if (exit_on_error) {
          return set_cr_error(retcode);
        }
      }
      // reset_backoff appears to be a flag set externally to restart the
      // backoff sequence from scratch -- TODO confirm against the class decl
      if (reset_backoff) {
        backoff.reset();
      }
      yield backoff.backoff(this);
    }

    // run an optional finisher
    yield call(alloc_finisher_cr());
    if (retcode < 0) {
      ldout(cct, 0) << "ERROR: call to finisher_cr() failed: retcode=" << retcode << dendl;
      return set_cr_error(retcode);
    }
    return set_cr_done();
  }
  return 0;
}
+
// Parse the remote /admin/log?type=metadata response.
// Note the field-name quirk: the wire key is "num_objects" but it carries
// the mdlog shard count.
void rgw_mdlog_info::decode_json(JSONObj *obj) {
  JSONDecoder::decode_json("num_objects", num_shards, obj);
  JSONDecoder::decode_json("period", period, obj);
  JSONDecoder::decode_json("realm_epoch", realm_epoch, obj);
}
+
// Parse one mdlog entry from the remote listing; the wire timestamp is a
// utime_t and is converted to real_time for storage.
void rgw_mdlog_entry::decode_json(JSONObj *obj) {
  JSONDecoder::decode_json("id", id, obj);
  JSONDecoder::decode_json("section", section, obj);
  JSONDecoder::decode_json("name", name, obj);
  utime_t ut;
  JSONDecoder::decode_json("timestamp", ut, obj);
  timestamp = ut.to_real_time();
  JSONDecoder::decode_json("data", log_data, obj);
}
+
+void rgw_mdlog_shard_data::decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("marker", marker, obj);
+ JSONDecoder::decode_json("truncated", truncated, obj);
+ JSONDecoder::decode_json("entries", entries, obj);
+};
+
// Stackless coroutine that fans out child coroutines via spawn_next(),
// keeping at most max_concurrent in flight, and funnels each child's
// result through handle_result(). The first (filtered) failure is
// remembered in status; remaining children are still drained before
// the error is reported.
int RGWShardCollectCR::operate(const DoutPrefixProvider *dpp) {
  reenter(this) {
    // spawn phase: keep the window full while there is work left
    while (spawn_next()) {
      current_running++;

      if (current_running >= max_concurrent) {
        int child_ret;
        yield wait_for_child();
        if (collect_next(&child_ret)) {
          current_running--;
          child_ret = handle_result(child_ret);
          if (child_ret < 0) {
            status = child_ret;  // keep going; report after draining
          }
        }
      }
    }
    // drain phase: same collection logic for the remaining children
    while (current_running > 0) {
      int child_ret;
      yield wait_for_child();
      if (collect_next(&child_ret)) {
        current_running--;
        child_ret = handle_result(child_ret);
        if (child_ret < 0) {
          status = child_ret;
        }
      }
    }
    if (status < 0) {
      return set_cr_error(status);
    }
    return set_cr_done();
  }
  return 0;
}
+
// Fan-out coroutine that fetches RGWMetadataLogInfo for every mdlog shard
// of the given period from the master zone, up to READ_MDLOG_MAX_CONCURRENT
// requests in flight. Results land in *mdlog_info keyed by shard id.
class RGWReadRemoteMDLogInfoCR : public RGWShardCollectCR {
  RGWMetaSyncEnv *sync_env;

  const std::string& period;  // reference: caller's string must outlive the CR
  int num_shards;
  map<int, RGWMetadataLogInfo> *mdlog_info;  // out-param, owned by caller

  int shard_id;  // next shard to spawn (see spawn_next(), defined elsewhere)
#define READ_MDLOG_MAX_CONCURRENT 10

  int handle_result(int r) override {
    if (r == -ENOENT) { // ENOENT is not a fatal error
      return 0;
    }
    if (r < 0) {
      ldout(cct, 4) << "failed to fetch mdlog status: " << cpp_strerror(r) << dendl;
    }
    return r;
  }
public:
  RGWReadRemoteMDLogInfoCR(RGWMetaSyncEnv *_sync_env,
                     const std::string& period, int _num_shards,
                     map<int, RGWMetadataLogInfo> *_mdlog_info) : RGWShardCollectCR(_sync_env->cct, READ_MDLOG_MAX_CONCURRENT),
                                                                 sync_env(_sync_env),
                                                                 period(period), num_shards(_num_shards),
                                                                 mdlog_info(_mdlog_info), shard_id(0) {}
  bool spawn_next() override;
};
+
// Fan-out coroutine that lists mdlog entries from the master zone for a set
// of shards (shard id -> resume marker), collecting per-shard results into
// *result. Concurrency is capped at READ_MDLOG_MAX_CONCURRENT (the macro is
// re-defined here with an identical body, which the preprocessor permits).
class RGWListRemoteMDLogCR : public RGWShardCollectCR {
  RGWMetaSyncEnv *sync_env;

  const std::string& period;  // reference: caller's string must outlive the CR
  map<int, string> shards;    // shard id -> marker; swapped in from the caller
  int max_entries_per_shard;
  map<int, rgw_mdlog_shard_data> *result;  // out-param, owned by caller

  map<int, string>::iterator iter;  // next shard to spawn
#define READ_MDLOG_MAX_CONCURRENT 10

  int handle_result(int r) override {
    if (r == -ENOENT) { // ENOENT is not a fatal error
      return 0;
    }
    if (r < 0) {
      ldout(cct, 4) << "failed to list remote mdlog shard: " << cpp_strerror(r) << dendl;
    }
    return r;
  }
public:
  RGWListRemoteMDLogCR(RGWMetaSyncEnv *_sync_env,
                     const std::string& period, map<int, string>& _shards,
                     int _max_entries_per_shard,
                     map<int, rgw_mdlog_shard_data> *_result) : RGWShardCollectCR(_sync_env->cct, READ_MDLOG_MAX_CONCURRENT),
                                                                sync_env(_sync_env), period(period),
                                                                max_entries_per_shard(_max_entries_per_shard),
                                                                result(_result) {
    // NOTE: empties the caller's map (swap, not copy)
    shards.swap(_shards);
    iter = shards.begin();
  }
  bool spawn_next() override;
};
+
// Fetch the master zone's mdlog info (shard count, period, realm epoch)
// synchronously via GET /admin/log?type=metadata.
int RGWRemoteMetaLog::read_log_info(const DoutPrefixProvider *dpp, rgw_mdlog_info *log_info)
{
  rgw_http_param_pair pairs[] = { { "type", "metadata" },
                                  { NULL, NULL } };

  int ret = conn->get_json_resource(dpp, "/admin/log", pairs, null_yield, *log_info);
  if (ret < 0) {
    ldpp_dout(dpp, 0) << "ERROR: failed to fetch mdlog info" << dendl;
    return ret;
  }

  ldpp_dout(dpp, 20) << "remote mdlog, num_shards=" << log_info->num_shards << dendl;

  return 0;
}
+
// Fetch per-shard mdlog info from the master zone for the given period.
// No-op (returns 0) on the metadata master itself, since it has no remote
// master to sync from.
int RGWRemoteMetaLog::read_master_log_shards_info(const DoutPrefixProvider *dpp, const string &master_period, map<int, RGWMetadataLogInfo> *shards_info)
{
  if (store->svc()->zone->is_meta_master()) {
    return 0;
  }

  rgw_mdlog_info log_info;
  int ret = read_log_info(dpp, &log_info);  // need the shard count first
  if (ret < 0) {
    return ret;
  }

  return run(dpp, new RGWReadRemoteMDLogInfoCR(&sync_env, master_period, log_info.num_shards, shards_info));
}
+
// List the next entry (max_entries_per_shard = 1) after each shard's marker
// from the master zone's mdlog. No-op on the metadata master.
int RGWRemoteMetaLog::read_master_log_shards_next(const DoutPrefixProvider *dpp, const string& period, map<int, string> shard_markers, map<int, rgw_mdlog_shard_data> *result)
{
  if (store->svc()->zone->is_meta_master()) {
    return 0;
  }

  return run(dpp, new RGWListRemoteMDLogCR(&sync_env, period, shard_markers, 1, result));
}
+
// One-time setup: grab the master-zone connection, start the HTTP manager,
// create the sync error logger and the trace node, and populate sync_env.
int RGWRemoteMetaLog::init()
{
  conn = store->svc()->zone->get_master_conn();

  int ret = http_manager.start();
  if (ret < 0) {
    ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl;
    return ret;
  }

  // raw new: presumably released in this class's teardown path -- the
  // owner is outside this view, so confirm before touching ownership
  error_logger = new RGWSyncErrorLogger(store, RGW_SYNC_ERROR_LOG_SHARD_PREFIX, ERROR_LOGGER_SHARDS);

  init_sync_env(&sync_env);

  tn = sync_env.sync_tracer->add_node(sync_env.sync_tracer->root_node, "meta");

  return 0;
}
+
+#define CLONE_MAX_ENTRIES 100
+
+int RGWMetaSyncStatusManager::init(const DoutPrefixProvider *dpp)
+{
+ if (store->svc()->zone->is_meta_master()) {
+ return 0;
+ }
+
+ if (!store->svc()->zone->get_master_conn()) {
+ ldpp_dout(dpp, -1) << "no REST connection to master zone" << dendl;
+ return -EIO;
+ }
+
+ int r = rgw_init_ioctx(dpp, store->getRados()->get_rados_handle(), store->svc()->zone->get_zone_params().log_pool, ioctx, true);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: failed to open log pool (" << store->svc()->zone->get_zone_params().log_pool << " ret=" << r << dendl;
+ return r;
+ }
+
+ r = master_log.init();
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: failed to init remote log, r=" << r << dendl;
+ return r;
+ }
+
+ RGWMetaSyncEnv& sync_env = master_log.get_sync_env();
+
+ rgw_meta_sync_status sync_status;
+ r = read_sync_status(dpp, &sync_status);
+ if (r < 0 && r != -ENOENT) {
+ ldpp_dout(dpp, -1) << "ERROR: failed to read sync status, r=" << r << dendl;
+ return r;
+ }
+
+ int num_shards = sync_status.sync_info.num_shards;
+
+ for (int i = 0; i < num_shards; i++) {
+ shard_objs[i] = rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, sync_env.shard_obj_name(i));
+ }
+
+ std::unique_lock wl{ts_to_shard_lock};
+ for (int i = 0; i < num_shards; i++) {
+ clone_markers.push_back(string());
+ utime_shard ut;
+ ut.shard_id = i;
+ ts_to_shard[ut] = i;
+ }
+
+ return 0;
+}
+
// Populate the shared sync environment with non-owning pointers to the
// infrastructure every meta-sync coroutine needs; all arguments must
// outlive this environment.
void RGWMetaSyncEnv::init(const DoutPrefixProvider *_dpp, CephContext *_cct, rgw::sal::RadosStore* _store, RGWRESTConn *_conn,
                          RGWAsyncRadosProcessor *_async_rados, RGWHTTPManager *_http_manager,
                          RGWSyncErrorLogger *_error_logger, RGWSyncTraceManager *_sync_tracer) {
  dpp = _dpp;
  cct = _cct;
  store = _store;
  conn = _conn;
  async_rados = _async_rados;
  http_manager = _http_manager;
  error_logger = _error_logger;
  sync_tracer = _sync_tracer;
}
+
// Name of the object holding the global meta sync status.
string RGWMetaSyncEnv::status_oid()
{
  return mdlog_sync_status_oid;
}
+
+string RGWMetaSyncEnv::shard_obj_name(int shard_id)
+{
+ char buf[mdlog_sync_status_shard_prefix.size() + 16];
+ snprintf(buf, sizeof(buf), "%s.%d", mdlog_sync_status_shard_prefix.c_str(), shard_id);
+
+ return string(buf);
+}
+
// Async request that lists up to max_entries mdlog entries for one shard,
// starting from `marker`; results/updated marker/truncated flag are left in
// the public members for the calling coroutine to harvest.
class RGWAsyncReadMDLogEntries : public RGWAsyncRadosRequest {
  const DoutPrefixProvider *dpp;  // NOTE(review): unused here — _send_request
                                  // uses its own dpp parameter
  rgw::sal::RadosStore* store;    // NOTE(review): also unused in this class body
  RGWMetadataLog *mdlog;          // non-owning
  int shard_id;
  int max_entries;

protected:
  // Runs on the async-rados thread: standard init/list/complete listing
  // sequence; from/end times are left zeroed (no time filtering).
  int _send_request(const DoutPrefixProvider *dpp) override {
    real_time from_time;
    real_time end_time;

    void *handle;

    mdlog->init_list_entries(shard_id, from_time, end_time, marker, &handle);

    int ret = mdlog->list_entries(dpp, handle, max_entries, entries, &marker, &truncated);

    mdlog->complete_list_entries(handle);

    return ret;
  }
public:
  string marker;                // in: start position; out: resume position
  list<cls_log_entry> entries;  // out: listed entries
  bool truncated;               // out: more entries remain after marker

  RGWAsyncReadMDLogEntries(const DoutPrefixProvider *dpp, RGWCoroutine *caller, RGWAioCompletionNotifier *cn, rgw::sal::RadosStore* _store,
                           RGWMetadataLog* mdlog, int _shard_id,
                           std::string _marker, int _max_entries)
    : RGWAsyncRadosRequest(caller, cn), dpp(dpp), store(_store), mdlog(mdlog),
      shard_id(_shard_id), max_entries(_max_entries), marker(std::move(_marker)) {}
};
+
// Coroutine wrapper around RGWAsyncReadMDLogEntries: queues the async
// request in send_request() and moves its results into the caller-provided
// out-params (*pmarker, *entries, *truncated) in request_complete().
class RGWReadMDLogEntriesCR : public RGWSimpleCoroutine {
  RGWMetaSyncEnv *sync_env;
  RGWMetadataLog *const mdlog;  // non-owning
  int shard_id;
  string marker;                // snapshot of *pmarker taken at send time
  string *pmarker;              // in/out: caller's resume marker
  int max_entries;
  list<cls_log_entry> *entries; // out-param, owned by caller
  bool *truncated;              // out-param, owned by caller

  RGWAsyncReadMDLogEntries *req{nullptr};  // owned; finished in the dtor

public:
  RGWReadMDLogEntriesCR(RGWMetaSyncEnv *_sync_env, RGWMetadataLog* mdlog,
                        int _shard_id, string*_marker, int _max_entries,
                        list<cls_log_entry> *_entries, bool *_truncated)
    : RGWSimpleCoroutine(_sync_env->cct), sync_env(_sync_env), mdlog(mdlog),
      shard_id(_shard_id), pmarker(_marker), max_entries(_max_entries),
      entries(_entries), truncated(_truncated) {}

  ~RGWReadMDLogEntriesCR() override {
    if (req) {
      // release the async request if it is still outstanding
      req->finish();
    }
  }

  int send_request(const DoutPrefixProvider *dpp) override {
    marker = *pmarker;
    req = new RGWAsyncReadMDLogEntries(dpp, this, stack->create_completion_notifier(),
                                       sync_env->store, mdlog, shard_id, marker,
                                       max_entries);
    sync_env->async_rados->queue(req);
    return 0;
  }

  int request_complete() override {
    // hand the async request's results back through the out-params
    *pmarker = std::move(req->marker);
    *entries = std::move(req->entries);
    *truncated = req->truncated;
    return req->get_ret_status();
  }
};
+
+
+// Coroutine that fetches one mdlog shard's info (marker, timestamps) from
+// the master zone via GET /admin/log?type=metadata&info.
+class RGWReadRemoteMDLogShardInfoCR : public RGWCoroutine {
+  RGWMetaSyncEnv *env;
+  RGWRESTReadResource *http_op;   // owned between aio_read() and wait()/put()
+
+  const std::string& period;      // period id to query (caller-owned)
+  int shard_id;
+  RGWMetadataLogInfo *shard_info; // caller-owned output
+
+public:
+  RGWReadRemoteMDLogShardInfoCR(RGWMetaSyncEnv *env, const std::string& period,
+                                int _shard_id, RGWMetadataLogInfo *_shard_info)
+    : RGWCoroutine(env->store->ctx()), env(env), http_op(NULL),
+      period(period), shard_id(_shard_id), shard_info(_shard_info) {}
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    auto store = env->store;
+    RGWRESTConn *conn = store->svc()->zone->get_master_conn();
+    reenter(this) {
+      // first resume: issue the async HTTP read, then block on io
+      yield {
+        char buf[16];
+        snprintf(buf, sizeof(buf), "%d", shard_id);
+        rgw_http_param_pair pairs[] = { { "type" , "metadata" },
+                                        { "id", buf },
+                                        { "period", period.c_str() },
+                                        { "info" , NULL },
+                                        { NULL, NULL } };
+
+        string p = "/admin/log/";
+
+        http_op = new RGWRESTReadResource(conn, p, pairs, NULL,
+                                          env->http_manager);
+
+        init_new_io(http_op);
+
+        int ret = http_op->aio_read(dpp);
+        if (ret < 0) {
+          // NOTE(review): logs via env->dpp while the operate() parameter
+          // dpp is in scope — presumably equivalent prefixes; confirm
+          ldpp_dout(env->dpp, 0) << "ERROR: failed to read from " << p << dendl;
+          log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
+          http_op->put();
+          return set_cr_error(ret);
+        }
+
+        return io_block(0);
+      }
+      // second resume: collect the decoded response into *shard_info
+      yield {
+        int ret = http_op->wait(shard_info, null_yield);
+        http_op->put();
+        if (ret < 0) {
+          return set_cr_error(ret);
+        }
+        return set_cr_done();
+      }
+    }
+    return 0;
+  }
+};
+
+// Factory wrapper so callers outside this file can create a
+// RGWReadRemoteMDLogShardInfoCR without seeing its definition.
+RGWCoroutine* create_read_remote_mdlog_shard_info_cr(RGWMetaSyncEnv *env,
+                                                     const std::string& period,
+                                                     int shard_id,
+                                                     RGWMetadataLogInfo* info)
+{
+  return new RGWReadRemoteMDLogShardInfoCR(env, period, shard_id, info);
+}
+
+// Simple coroutine that lists entries of one remote mdlog shard via
+// GET /admin/log?type=metadata, decoding the response into *result.
+class RGWListRemoteMDLogShardCR : public RGWSimpleCoroutine {
+  RGWMetaSyncEnv *sync_env;
+  RGWRESTReadResource *http_op;   // owned between send_request() and request_complete()
+
+  const std::string& period;      // caller-owned period id
+  int shard_id;
+  string marker;
+  uint32_t max_entries;
+  rgw_mdlog_shard_data *result;   // caller-owned output
+
+public:
+  RGWListRemoteMDLogShardCR(RGWMetaSyncEnv *env, const std::string& period,
+                            int _shard_id, const string& _marker, uint32_t _max_entries,
+                            rgw_mdlog_shard_data *_result)
+    : RGWSimpleCoroutine(env->store->ctx()), sync_env(env), http_op(NULL),
+      period(period), shard_id(_shard_id), marker(_marker), max_entries(_max_entries), result(_result) {}
+
+  int send_request(const DoutPrefixProvider *dpp) override {
+    RGWRESTConn *conn = sync_env->conn;
+
+    char buf[32];
+    snprintf(buf, sizeof(buf), "%d", shard_id);
+
+    char max_entries_buf[32];
+    snprintf(max_entries_buf, sizeof(max_entries_buf), "%d", (int)max_entries);
+
+    // with an empty marker, use an empty param name so the pair is
+    // effectively omitted from the request
+    const char *marker_key = (marker.empty() ? "" : "marker");
+
+    rgw_http_param_pair pairs[] = { { "type", "metadata" },
+                                    { "id", buf },
+                                    { "period", period.c_str() },
+                                    { "max-entries", max_entries_buf },
+                                    { marker_key, marker.c_str() },
+                                    { NULL, NULL } };
+
+    string p = "/admin/log/";
+
+    http_op = new RGWRESTReadResource(conn, p, pairs, NULL, sync_env->http_manager);
+    init_new_io(http_op);
+
+    int ret = http_op->aio_read(dpp);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to read from " << p << dendl;
+      log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
+      http_op->put();
+      return ret;
+    }
+
+    return 0;
+  }
+
+  int request_complete() override {
+    int ret = http_op->wait(result, null_yield);
+    http_op->put();
+    // a missing shard (-ENOENT) is tolerated and reported as success
+    if (ret < 0 && ret != -ENOENT) {
+      ldpp_dout(sync_env->dpp, 0) << "ERROR: failed to list remote mdlog shard, ret=" << ret << dendl;
+      return ret;
+    }
+    return 0;
+  }
+};
+
+// Factory wrapper so callers outside this file can create a
+// RGWListRemoteMDLogShardCR without seeing its definition.
+RGWCoroutine* create_list_remote_mdlog_shard_cr(RGWMetaSyncEnv *env,
+                                                const std::string& period,
+                                                int shard_id,
+                                                const std::string& marker,
+                                                uint32_t max_entries,
+                                                rgw_mdlog_shard_data *result)
+{
+  return new RGWListRemoteMDLogShardCR(env, period, shard_id, marker,
+                                       max_entries, result);
+}
+
+// Dispatch one RGWReadRemoteMDLogShardInfoCR per shard of the period;
+// returns false once every shard has been spawned.
+bool RGWReadRemoteMDLogInfoCR::spawn_next() {
+  if (shard_id < num_shards) {
+    auto *info_out = &(*mdlog_info)[shard_id];
+    spawn(new RGWReadRemoteMDLogShardInfoCR(sync_env, period, shard_id, info_out), false);
+    ++shard_id;
+    return true;
+  }
+  return false;
+}
+
+// Dispatch one RGWListRemoteMDLogShardCR per requested (shard, marker)
+// pair; returns false once the shard map is exhausted.
+bool RGWListRemoteMDLogCR::spawn_next() {
+  if (iter == shards.end()) {
+    return false;
+  }
+
+  const auto next_shard = iter->first;
+  spawn(new RGWListRemoteMDLogShardCR(sync_env, period, next_shard,
+                                      iter->second, max_entries_per_shard,
+                                      &(*result)[next_shard]),
+        false);
+  ++iter;
+  return true;
+}
+
+// Coroutine that bootstraps metadata sync status: under an exclusive
+// lease on the status object it writes the initial rgw_meta_sync_info,
+// snapshots the remote mdlog shard positions into per-shard markers, and
+// finally flips the state to StateBuildingFullSyncMaps.
+class RGWInitSyncStatusCoroutine : public RGWCoroutine {
+  RGWMetaSyncEnv *sync_env;
+
+  rgw_meta_sync_info status;
+  vector<RGWMetadataLogInfo> shards_info;   // one remote shard snapshot per shard
+  boost::intrusive_ptr<RGWContinuousLeaseCR> lease_cr;
+  boost::intrusive_ptr<RGWCoroutinesStack> lease_stack;
+public:
+  RGWInitSyncStatusCoroutine(RGWMetaSyncEnv *_sync_env,
+                             const rgw_meta_sync_info &status)
+    : RGWCoroutine(_sync_env->store->ctx()), sync_env(_sync_env),
+      status(status), shards_info(status.num_shards),
+      lease_cr(nullptr), lease_stack(nullptr) {}
+
+  ~RGWInitSyncStatusCoroutine() override {
+    // stop the lease renewal loop if we are torn down early
+    if (lease_cr) {
+      lease_cr->abort();
+    }
+  }
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    int ret;
+    reenter(this) {
+      // phase 1: take an exclusive lease on the sync status object
+      yield {
+        set_status("acquiring sync lock");
+        uint32_t lock_duration = cct->_conf->rgw_sync_lease_period;
+        string lock_name = "sync_lock";
+        rgw::sal::RadosStore* store = sync_env->store;
+        lease_cr.reset(new RGWContinuousLeaseCR(sync_env->async_rados, store,
+                                                rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, sync_env->status_oid()),
+                                                lock_name, lock_duration, this, nullptr));
+        lease_stack.reset(spawn(lease_cr.get(), false));
+      }
+      while (!lease_cr->is_locked()) {
+        if (lease_cr->is_done()) {
+          ldpp_dout(dpp, 5) << "failed to take lease" << dendl;
+          set_status("lease lock failed, early abort");
+          return set_cr_error(lease_cr->get_ret_status());
+        }
+        // wait for the lease coroutine to wake us up
+        set_sleeping(true);
+        yield;
+      }
+      // phase 2: persist the initial sync info
+      yield {
+        set_status("writing sync status");
+        rgw::sal::RadosStore* store = sync_env->store;
+        call(new RGWSimpleRadosWriteCR<rgw_meta_sync_info>(dpp, store,
+                                                           rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, sync_env->status_oid()),
+                                                           status));
+      }
+
+      if (retcode < 0) {
+        set_status("failed to write sync status");
+        ldpp_dout(dpp, 0) << "ERROR: failed to write sync status, retcode=" << retcode << dendl;
+        yield lease_cr->go_down();
+        return set_cr_error(retcode);
+      }
+      /* fetch current position in logs */
+      // phase 3: snapshot each remote mdlog shard's current marker
+      set_status("fetching remote log position");
+      yield {
+        for (int i = 0; i < (int)status.num_shards; i++) {
+          spawn(new RGWReadRemoteMDLogShardInfoCR(sync_env, status.period, i,
+                                                  &shards_info[i]), false);
+        }
+      }
+
+      drain_all_but_stack(lease_stack.get()); /* the lease cr still needs to run */
+
+      // phase 4: write a per-shard marker object seeded from the snapshot
+      yield {
+        set_status("updating sync status");
+        for (int i = 0; i < (int)status.num_shards; i++) {
+          rgw_meta_sync_marker marker;
+          RGWMetadataLogInfo& info = shards_info[i];
+          marker.next_step_marker = info.marker;
+          marker.timestamp = info.last_update;
+          rgw::sal::RadosStore* store = sync_env->store;
+          spawn(new RGWSimpleRadosWriteCR<rgw_meta_sync_marker>(dpp,
+                                                                store,
+                                                                rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, sync_env->shard_obj_name(i)),
+                                                                marker), true);
+        }
+      }
+      // phase 5: advance the global state so full sync can start
+      yield {
+        set_status("changing sync state: build full sync maps");
+        status.state = rgw_meta_sync_info::StateBuildingFullSyncMaps;
+        rgw::sal::RadosStore* store = sync_env->store;
+        call(new RGWSimpleRadosWriteCR<rgw_meta_sync_info>(dpp, store,
+                                                           rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, sync_env->status_oid()),
+                                                           status));
+      }
+      // phase 6: release the lease and reap all spawned stacks
+      set_status("drop lock lease");
+      yield lease_cr->go_down();
+      while (collect(&ret, NULL)) {
+        if (ret < 0) {
+          return set_cr_error(ret);
+        }
+        yield;
+      }
+      drain_all();
+      return set_cr_done();
+    }
+    return 0;
+  }
+};
+
+// Shard-collect coroutine that reads every per-shard metadata sync marker
+// object into 'markers', at most MAX_CONCURRENT_SHARDS reads in flight.
+class RGWReadSyncStatusMarkersCR : public RGWShardCollectCR {
+  static constexpr int MAX_CONCURRENT_SHARDS = 16;
+
+  RGWMetaSyncEnv *env;
+  const int num_shards;
+  int shard_id{0};    // next shard to dispatch
+  map<uint32_t, rgw_meta_sync_marker>& markers;  // caller-owned output map
+
+  // Per-shard completion filter: a missing marker object is not fatal.
+  int handle_result(int r) override {
+    if (r == -ENOENT) { // ENOENT is not a fatal error
+      return 0;
+    }
+    if (r < 0) {
+      ldout(cct, 4) << "failed to read metadata sync markers: "
+          << cpp_strerror(r) << dendl;
+    }
+    return r;
+  }
+ public:
+  RGWReadSyncStatusMarkersCR(RGWMetaSyncEnv *env, int num_shards,
+                             map<uint32_t, rgw_meta_sync_marker>& markers)
+    : RGWShardCollectCR(env->cct, MAX_CONCURRENT_SHARDS),
+      env(env), num_shards(num_shards), markers(markers)
+  {}
+  bool spawn_next() override;
+};
+
+// Dispatch a read of the next shard's marker object; false once all
+// num_shards reads have been spawned.
+bool RGWReadSyncStatusMarkersCR::spawn_next()
+{
+  if (shard_id >= num_shards) {
+    return false;
+  }
+  using CR = RGWSimpleRadosReadCR<rgw_meta_sync_marker>;
+  const rgw_raw_obj marker_obj{env->store->svc()->zone->get_zone_params().log_pool,
+                               env->shard_obj_name(shard_id)};
+  spawn(new CR(env->dpp, env->store, marker_obj, &markers[shard_id]), false);
+  ++shard_id;
+  return true;
+}
+
+// Coroutine that loads the full metadata sync status: the global
+// rgw_meta_sync_info plus every per-shard marker, into *sync_status.
+class RGWReadSyncStatusCoroutine : public RGWCoroutine {
+  RGWMetaSyncEnv *sync_env;
+  rgw_meta_sync_status *sync_status;   // caller-owned output
+
+public:
+  RGWReadSyncStatusCoroutine(RGWMetaSyncEnv *_sync_env,
+                             rgw_meta_sync_status *_status)
+    : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), sync_status(_status)
+  {}
+  int operate(const DoutPrefixProvider *dpp) override;
+};
+
+// Two-step read: first the global sync info object (ENOENT is an error
+// here — status must have been initialized), then all shard markers.
+int RGWReadSyncStatusCoroutine::operate(const DoutPrefixProvider *dpp)
+{
+  reenter(this) {
+    // read sync info
+    using ReadInfoCR = RGWSimpleRadosReadCR<rgw_meta_sync_info>;
+    yield {
+      bool empty_on_enoent = false; // fail on ENOENT
+      rgw_raw_obj obj{sync_env->store->svc()->zone->get_zone_params().log_pool,
+                      sync_env->status_oid()};
+      call(new ReadInfoCR(dpp, sync_env->store, obj,
+                          &sync_status->sync_info, empty_on_enoent));
+    }
+    if (retcode < 0) {
+      ldpp_dout(dpp, 4) << "failed to read sync status info with "
+          << cpp_strerror(retcode) << dendl;
+      return set_cr_error(retcode);
+    }
+    // read shard markers
+    using ReadMarkersCR = RGWReadSyncStatusMarkersCR;
+    yield call(new ReadMarkersCR(sync_env, sync_status->sync_info.num_shards,
+                                 sync_status->sync_markers));
+    if (retcode < 0) {
+      ldpp_dout(dpp, 4) << "failed to read sync status markers with "
+          << cpp_strerror(retcode) << dendl;
+      return set_cr_error(retcode);
+    }
+    return set_cr_done();
+  }
+  return 0;
+}
+
+// Full-sync bootstrap coroutine: under a lease, lists every metadata
+// section on the master zone, pages through each section's keys, and
+// appends "section:key" entries into sharded omap index objects
+// (entries_index) that the per-shard full sync later consumes.  Finally
+// records the per-shard total entry counts in the shard markers.
+class RGWFetchAllMetaCR : public RGWCoroutine {
+  RGWMetaSyncEnv *sync_env;
+
+  int num_shards;
+
+
+  int ret_status;
+
+  list<string> sections;                   // metadata section names from the master
+  list<string>::iterator sections_iter;
+
+  // decoded response of one GET /admin/metadata/<section> page
+  struct meta_list_result {
+    list<string> keys;
+    string marker;        // continuation marker for the next page
+    uint64_t count{0};
+    bool truncated{false};
+
+    void decode_json(JSONObj *obj) {
+      JSONDecoder::decode_json("keys", keys, obj);
+      JSONDecoder::decode_json("marker", marker, obj);
+      JSONDecoder::decode_json("count", count, obj);
+      JSONDecoder::decode_json("truncated", truncated, obj);
+    }
+  } result;
+  list<string>::iterator iter;
+
+  std::unique_ptr<RGWShardedOmapCRManager> entries_index;
+
+  boost::intrusive_ptr<RGWContinuousLeaseCR> lease_cr;
+  boost::intrusive_ptr<RGWCoroutinesStack> lease_stack;
+  bool lost_lock;
+  bool failed;
+
+  string marker;
+
+  map<uint32_t, rgw_meta_sync_marker>& markers;   // caller-owned shard markers
+
+  RGWSyncTraceNodeRef tn;
+
+public:
+  RGWFetchAllMetaCR(RGWMetaSyncEnv *_sync_env, int _num_shards,
+                    map<uint32_t, rgw_meta_sync_marker>& _markers,
+                    RGWSyncTraceNodeRef& _tn_parent) : RGWCoroutine(_sync_env->cct), sync_env(_sync_env),
+                                                      num_shards(_num_shards),
+                                                      ret_status(0), lease_cr(nullptr), lease_stack(nullptr),
+                                                      lost_lock(false), failed(false), markers(_markers) {
+    tn = sync_env->sync_tracer->add_node(_tn_parent, "fetch_all_meta");
+  }
+
+  ~RGWFetchAllMetaCR() override {
+  }
+
+  // Move 'name' from all_sections to the back of 'sections', if present.
+  void append_section_from_set(set<string>& all_sections, const string& name) {
+    set<string>::iterator iter = all_sections.find(name);
+    if (iter != all_sections.end()) {
+      sections.emplace_back(std::move(*iter));
+      all_sections.erase(iter);
+    }
+  }
+  /*
+   * meta sync should go in the following order: user, bucket.instance, bucket
+   * then whatever other sections exist (if any)
+   */
+  void rearrange_sections() {
+    set<string> all_sections;
+    std::move(sections.begin(), sections.end(),
+              std::inserter(all_sections, all_sections.end()));
+    sections.clear();
+
+    append_section_from_set(all_sections, "user");
+    append_section_from_set(all_sections, "bucket.instance");
+    append_section_from_set(all_sections, "bucket");
+    append_section_from_set(all_sections, "roles");
+
+    // remaining sections follow in their (sorted) set order
+    std::move(all_sections.begin(), all_sections.end(),
+              std::back_inserter(sections));
+  }
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    RGWRESTConn *conn = sync_env->conn;
+
+    reenter(this) {
+      // phase 1: take the sync lease
+      yield {
+        set_status(string("acquiring lock (") + sync_env->status_oid() + ")");
+        uint32_t lock_duration = cct->_conf->rgw_sync_lease_period;
+        string lock_name = "sync_lock";
+        lease_cr.reset(new RGWContinuousLeaseCR(sync_env->async_rados,
+                                                sync_env->store,
+                                                rgw_raw_obj(sync_env->store->svc()->zone->get_zone_params().log_pool, sync_env->status_oid()),
+                                                lock_name, lock_duration, this, nullptr));
+        lease_stack.reset(spawn(lease_cr.get(), false));
+      }
+      while (!lease_cr->is_locked()) {
+        if (lease_cr->is_done()) {
+          ldpp_dout(dpp, 5) << "failed to take lease" << dendl;
+          set_status("lease lock failed, early abort");
+          return set_cr_error(lease_cr->get_ret_status());
+        }
+        set_sleeping(true);
+        yield;
+      }
+      // phase 2: create the sharded omap index writer
+      entries_index.reset(new RGWShardedOmapCRManager(sync_env->async_rados, sync_env->store, this, num_shards,
+                                                      sync_env->store->svc()->zone->get_zone_params().log_pool,
+                                                      mdlog_sync_full_sync_index_prefix));
+      // phase 3: discover the metadata sections
+      yield {
+        call(new RGWReadRESTResourceCR<list<string> >(cct, conn, sync_env->http_manager,
+                                                      "/admin/metadata", NULL, &sections));
+      }
+      if (get_ret_status() < 0) {
+        ldpp_dout(dpp, 0) << "ERROR: failed to fetch metadata sections" << dendl;
+        yield entries_index->finish();
+        yield lease_cr->go_down();
+        drain_all();
+        return set_cr_error(get_ret_status());
+      }
+      rearrange_sections();
+      sections_iter = sections.begin();
+      // phase 4: page through every section's keys
+      for (; sections_iter != sections.end(); ++sections_iter) {
+        do {
+          yield {
+#define META_FULL_SYNC_CHUNK_SIZE "1000"
+            string entrypoint = string("/admin/metadata/") + *sections_iter;
+            // NOTE(review): result.marker is not reset when moving to the
+            // next section — presumably the final (non-truncated) response
+            // of a section carries an empty marker; confirm
+            rgw_http_param_pair pairs[] = { { "max-entries", META_FULL_SYNC_CHUNK_SIZE },
+                                            { "marker", result.marker.c_str() },
+                                            { NULL, NULL } };
+            result.keys.clear();
+            call(new RGWReadRESTResourceCR<meta_list_result >(cct, conn, sync_env->http_manager,
+                                                              entrypoint, pairs, &result));
+          }
+          ret_status = get_ret_status();
+          if (ret_status == -ENOENT) {
+            // a section with no entries is not an error
+            set_retcode(0); /* reset coroutine status so that we don't return it */
+            ret_status = 0;
+          }
+          if (ret_status < 0) {
+            tn->log(0, SSTR("ERROR: failed to fetch metadata section: " << *sections_iter));
+            yield entries_index->finish();
+            yield lease_cr->go_down();
+            drain_all();
+            return set_cr_error(ret_status);
+          }
+          // append each listed key to its omap index shard
+          iter = result.keys.begin();
+          for (; iter != result.keys.end(); ++iter) {
+            if (!lease_cr->is_locked()) {
+              lost_lock = true;
+              tn->log(1, "lease is lost, abort");
+              break;
+            }
+            yield; // allow entries_index consumer to make progress
+
+            tn->log(20, SSTR("list metadata: section=" << *sections_iter << " key=" << *iter));
+            string s = *sections_iter + ":" + *iter;
+            int shard_id;
+            rgw::sal::RadosStore* store = sync_env->store;
+            int ret = store->ctl()->meta.mgr->get_shard_id(*sections_iter, *iter, &shard_id);
+            if (ret < 0) {
+              tn->log(0, SSTR("ERROR: could not determine shard id for " << *sections_iter << ":" << *iter));
+              ret_status = ret;
+              break;
+            }
+            if (!entries_index->append(s, shard_id)) {
+              break;
+            }
+          }
+        } while (result.truncated);
+      }
+      // phase 5: flush the index writers and record per-shard totals
+      yield {
+        if (!entries_index->finish()) {
+          failed = true;
+        }
+      }
+      if (!failed) {
+        for (map<uint32_t, rgw_meta_sync_marker>::iterator iter = markers.begin(); iter != markers.end(); ++iter) {
+          int shard_id = (int)iter->first;
+          rgw_meta_sync_marker& marker = iter->second;
+          marker.total_entries = entries_index->get_total_entries(shard_id);
+          spawn(new RGWSimpleRadosWriteCR<rgw_meta_sync_marker>(dpp, sync_env->store,
+                                                                rgw_raw_obj(sync_env->store->svc()->zone->get_zone_params().log_pool, sync_env->shard_obj_name(shard_id)),
+                                                                marker), true);
+        }
+      }
+
+      drain_all_but_stack(lease_stack.get()); /* the lease cr still needs to run */
+
+      // phase 6: release the lease and report the first failure seen
+      yield lease_cr->go_down();
+
+      int ret;
+      while (collect(&ret, NULL)) {
+        if (ret < 0) {
+          return set_cr_error(ret);
+        }
+        yield;
+      }
+      drain_all();
+      if (failed) {
+        yield return set_cr_error(-EIO);
+      }
+      if (lost_lock) {
+        yield return set_cr_error(-EBUSY);
+      }
+
+      if (ret_status < 0) {
+        yield return set_cr_error(ret_status);
+      }
+
+      yield return set_cr_done();
+    }
+    return 0;
+  }
+};
+
+// Name of one full-sync omap index shard object:
+// "<mdlog_sync_full_sync_index_prefix>.<shard_id>".
+static string full_sync_index_shard_oid(int shard_id)
+{
+  // std::string concatenation avoids snprintf() into a runtime-sized
+  // char array (a VLA, which is a non-standard extension in C++).
+  return mdlog_sync_full_sync_index_prefix + "." + std::to_string(shard_id);
+}
+
+// Coroutine that fetches one raw metadata entry from the master zone via
+// GET /admin/metadata/<section>/<key>, returning the response body in *pbl.
+class RGWReadRemoteMetadataCR : public RGWCoroutine {
+  RGWMetaSyncEnv *sync_env;
+
+  RGWRESTReadResource *http_op;   // owned between aio_read() and wait()/put()
+
+  string section;
+  string key;
+
+  bufferlist *pbl;   // caller-owned output buffer
+
+  RGWSyncTraceNodeRef tn;
+
+public:
+  RGWReadRemoteMetadataCR(RGWMetaSyncEnv *_sync_env,
+                          const string& _section, const string& _key, bufferlist *_pbl,
+                          const RGWSyncTraceNodeRef& _tn_parent) : RGWCoroutine(_sync_env->cct), sync_env(_sync_env),
+                                                                   http_op(NULL),
+                                                                   section(_section),
+                                                                   key(_key),
+                                                                   pbl(_pbl) {
+    tn = sync_env->sync_tracer->add_node(_tn_parent, "read_remote_meta",
+                                         section + ":" + key);
+  }
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    RGWRESTConn *conn = sync_env->conn;
+    reenter(this) {
+      // first resume: issue the async read, then block on io
+      yield {
+        string key_encode;
+        url_encode(key, key_encode);
+        // NOTE(review): the path uses the url-encoded key while the "key"
+        // query param passes the raw key — presumably the REST layer
+        // encodes query params itself; confirm
+        rgw_http_param_pair pairs[] = { { "key" , key.c_str()},
+                                        { NULL, NULL } };
+
+        string p = string("/admin/metadata/") + section + "/" + key_encode;
+
+        http_op = new RGWRESTReadResource(conn, p, pairs, NULL, sync_env->http_manager);
+
+        init_new_io(http_op);
+
+        int ret = http_op->aio_read(dpp);
+        if (ret < 0) {
+          ldpp_dout(dpp, 0) << "ERROR: failed to fetch mdlog data" << dendl;
+          log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
+          http_op->put();
+          return set_cr_error(ret);
+        }
+
+        return io_block(0);
+      }
+      // second resume: collect the raw response body
+      yield {
+        int ret = http_op->wait(pbl, null_yield);
+        http_op->put();
+        if (ret < 0) {
+          return set_cr_error(ret);
+        }
+        return set_cr_done();
+      }
+    }
+    return 0;
+  }
+};
+
+// Async request that applies ("puts") a fetched metadata entry into the
+// local metadata manager, executed on the async-rados thread pool.
+class RGWAsyncMetaStoreEntry : public RGWAsyncRadosRequest {
+  rgw::sal::RadosStore* store;
+  string raw_key;
+  bufferlist bl;
+  const DoutPrefixProvider *dpp;
+protected:
+  int _send_request(const DoutPrefixProvider *dpp) override {
+    // APPLY_ALWAYS: the entry overwrites whatever is stored locally
+    const int r = store->ctl()->meta.mgr->put(raw_key, bl, null_yield, dpp,
+                                              RGWMDLogSyncType::APPLY_ALWAYS, true);
+    if (r >= 0) {
+      return 0;
+    }
+    ldpp_dout(dpp, 0) << "ERROR: can't store key: " << raw_key << " ret=" << r << dendl;
+    return r;
+  }
+public:
+  RGWAsyncMetaStoreEntry(RGWCoroutine *caller, RGWAioCompletionNotifier *cn,
+                         rgw::sal::RadosStore* _store,
+                         const string& _raw_key,
+                         bufferlist& _bl,
+                         const DoutPrefixProvider *dpp)
+    : RGWAsyncRadosRequest(caller, cn), store(_store),
+      raw_key(_raw_key), bl(_bl), dpp(dpp) {}
+};
+
+
+// Coroutine wrapper that stores one metadata entry through
+// RGWAsyncMetaStoreEntry and reports its completion status.
+class RGWMetaStoreEntryCR : public RGWSimpleCoroutine {
+  RGWMetaSyncEnv *sync_env;
+  string raw_key;
+  bufferlist bl;
+
+  RGWAsyncMetaStoreEntry *req = nullptr;
+
+public:
+  RGWMetaStoreEntryCR(RGWMetaSyncEnv *_sync_env,
+                      const string& _raw_key,
+                      bufferlist& _bl)
+    : RGWSimpleCoroutine(_sync_env->cct), sync_env(_sync_env),
+      raw_key(_raw_key), bl(_bl) {
+  }
+
+  ~RGWMetaStoreEntryCR() override {
+    // drop our reference to the async request if still outstanding
+    if (req) {
+      req->finish();
+    }
+  }
+
+  int send_request(const DoutPrefixProvider *dpp) override {
+    req = new RGWAsyncMetaStoreEntry(this, stack->create_completion_notifier(),
+                                     sync_env->store, raw_key, bl, dpp);
+    sync_env->async_rados->queue(req);
+    return 0;
+  }
+
+  int request_complete() override {
+    return req->get_ret_status();
+  }
+};
+
+// Async request that removes one metadata entry from the local metadata
+// manager, executed on the async-rados thread pool.
+class RGWAsyncMetaRemoveEntry : public RGWAsyncRadosRequest {
+  rgw::sal::RadosStore* store;
+  string raw_key;
+  const DoutPrefixProvider *dpp;
+protected:
+  int _send_request(const DoutPrefixProvider *dpp) override {
+    const int r = store->ctl()->meta.mgr->remove(raw_key, null_yield, dpp);
+    if (r >= 0) {
+      return 0;
+    }
+    ldpp_dout(dpp, 0) << "ERROR: can't remove key: " << raw_key << " ret=" << r << dendl;
+    return r;
+  }
+public:
+  RGWAsyncMetaRemoveEntry(RGWCoroutine *caller, RGWAioCompletionNotifier *cn,
+                          rgw::sal::RadosStore* _store,
+                          const string& _raw_key, const DoutPrefixProvider *dpp)
+    : RGWAsyncRadosRequest(caller, cn), store(_store),
+      raw_key(_raw_key), dpp(dpp) {}
+};
+
+
+// Coroutine wrapper that removes one local metadata entry through
+// RGWAsyncMetaRemoveEntry; a missing entry counts as success.
+class RGWMetaRemoveEntryCR : public RGWSimpleCoroutine {
+  RGWMetaSyncEnv *sync_env;
+  string raw_key;
+
+  RGWAsyncMetaRemoveEntry *req = nullptr;
+
+public:
+  RGWMetaRemoveEntryCR(RGWMetaSyncEnv *_sync_env,
+                       const string& _raw_key)
+    : RGWSimpleCoroutine(_sync_env->cct), sync_env(_sync_env),
+      raw_key(_raw_key) {
+  }
+
+  ~RGWMetaRemoveEntryCR() override {
+    // drop our reference to the async request if still outstanding
+    if (req) {
+      req->finish();
+    }
+  }
+
+  int send_request(const DoutPrefixProvider *dpp) override {
+    req = new RGWAsyncMetaRemoveEntry(this, stack->create_completion_notifier(),
+                                      sync_env->store, raw_key, dpp);
+    sync_env->async_rados->queue(req);
+    return 0;
+  }
+
+  int request_complete() override {
+    // removal is idempotent: map -ENOENT to success
+    const int r = req->get_ret_status();
+    return (r == -ENOENT) ? 0 : r;
+  }
+};
+
+#define META_SYNC_UPDATE_MARKER_WINDOW 10
+
+
+// Run the pending 'cr' repeatedly: while a call is in flight another
+// caller may install a newer cr, which is then picked up on the next loop
+// iteration — so only the last submitted coroutine's effect "wins".
+int RGWLastCallerWinsCR::operate(const DoutPrefixProvider *dpp) {
+  RGWCoroutine *call_cr;
+  reenter(this) {
+    while (cr) {
+      // detach the pending cr before calling so a concurrent re-arm
+      // during the yield is not lost
+      call_cr = cr;
+      cr = nullptr;
+      yield call(call_cr);
+      /* cr might have been modified at this point */
+      if (retcode < 0) {
+        ldpp_dout(dpp, 0) << "ERROR: RGWLastCallerWinsCR() failed: retcode=" << retcode << dendl;
+        return set_cr_error(retcode);
+      }
+    }
+    return set_cr_done();
+  }
+  return 0;
+}
+
+// Marker tracker for one metadata sync shard: batches marker updates
+// (window of META_SYNC_UPDATE_MARKER_WINDOW completions) and persists the
+// shard's rgw_meta_sync_marker object when the window closes.
+class RGWMetaSyncShardMarkerTrack : public RGWSyncShardMarkerTrack<string, string> {
+  RGWMetaSyncEnv *sync_env;
+
+  string marker_oid;                 // per-shard status object name
+  rgw_meta_sync_marker sync_marker;  // local copy, written out on update
+
+  RGWSyncTraceNodeRef tn;
+
+public:
+  RGWMetaSyncShardMarkerTrack(RGWMetaSyncEnv *_sync_env,
+                              const string& _marker_oid,
+                              const rgw_meta_sync_marker& _marker,
+                              RGWSyncTraceNodeRef& _tn) : RGWSyncShardMarkerTrack(META_SYNC_UPDATE_MARKER_WINDOW),
+                                                          sync_env(_sync_env),
+                                                          marker_oid(_marker_oid),
+                                                          sync_marker(_marker),
+                                                          tn(_tn){}
+
+  // Build the coroutine that persists the advanced marker (and, when
+  // provided, full-sync position and timestamp) to the shard object.
+  RGWCoroutine *store_marker(const string& new_marker, uint64_t index_pos, const real_time& timestamp) override {
+    sync_marker.marker = new_marker;
+    if (index_pos > 0) {
+      sync_marker.pos = index_pos;
+    }
+
+    if (!real_clock::is_zero(timestamp)) {
+      sync_marker.timestamp = timestamp;
+    }
+
+    ldpp_dout(sync_env->dpp, 20) << __func__ << "(): updating marker marker_oid=" << marker_oid << " marker=" << new_marker << " realm_epoch=" << sync_marker.realm_epoch << dendl;
+    tn->log(20, SSTR("new marker=" << new_marker));
+    rgw::sal::RadosStore* store = sync_env->store;
+    return new RGWSimpleRadosWriteCR<rgw_meta_sync_marker>(sync_env->dpp, store,
+                                                           rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, marker_oid),
+                                                           sync_marker);
+  }
+
+  // Out-of-order completions are serialized by a last-caller-wins CR.
+  RGWOrderCallCR *allocate_order_control_cr() override {
+    return new RGWLastCallerWinsCR(sync_env->cct);
+  }
+};
+
+// Construct the per-entry sync coroutine; error injection is enabled when
+// rgw_sync_meta_inject_err_probability > 0 (testing aid).
+RGWMetaSyncSingleEntryCR::RGWMetaSyncSingleEntryCR(RGWMetaSyncEnv *_sync_env,
+                                                   const string& _raw_key, const string& _entry_marker,
+                                                   const RGWMDLogStatus& _op_status,
+                                                   RGWMetaSyncShardMarkerTrack *_marker_tracker, const RGWSyncTraceNodeRef& _tn_parent) : RGWCoroutine(_sync_env->cct),
+                                                                                                                                         sync_env(_sync_env),
+                                                                                                                                         raw_key(_raw_key), entry_marker(_entry_marker),
+                                                                                                                                         op_status(_op_status),
+                                                                                                                                         pos(0), sync_status(0),
+                                                                                                                                         marker_tracker(_marker_tracker), tries(0) {
+  error_injection = (sync_env->cct->_conf->rgw_sync_meta_inject_err_probability > 0);
+  tn = sync_env->sync_tracer->add_node(_tn_parent, "entry", raw_key);
+}
+
+// Sync one "section:key" entry: fetch it from the master (with transient
+// retries), then store it locally — or remove it locally if the master
+// returned ENOENT — and finally advance the shard marker.
+int RGWMetaSyncSingleEntryCR::operate(const DoutPrefixProvider *dpp) {
+  reenter(this) {
+#define NUM_TRANSIENT_ERROR_RETRIES 10
+
+    // optional fault injection for testing
+    if (error_injection &&
+        rand() % 10000 < cct->_conf->rgw_sync_meta_inject_err_probability * 10000.0) {
+      return set_cr_error(-EIO);
+    }
+
+    // a log entry for an operation that never completed on the master is
+    // skipped; just mark it finished in the marker tracker
+    if (op_status != MDLOG_STATUS_COMPLETE) {
+      tn->log(20, "skipping pending operation");
+      yield call(marker_tracker->finish(entry_marker));
+      if (retcode < 0) {
+        return set_cr_error(retcode);
+      }
+      return set_cr_done();
+    }
+    tn->set_flag(RGW_SNS_FLAG_ACTIVE);
+    // phase 1: fetch the remote entry, retrying transient failures
+    for (tries = 0; tries < NUM_TRANSIENT_ERROR_RETRIES; tries++) {
+      yield {
+        // raw_key has the form "<section>:<key>"
+        pos = raw_key.find(':');
+        section = raw_key.substr(0, pos);
+        key = raw_key.substr(pos + 1);
+        tn->log(10, SSTR("fetching remote metadata entry" << (tries == 0 ? "" : " (retry)")));
+        call(new RGWReadRemoteMetadataCR(sync_env, section, key, &md_bl, tn));
+      }
+
+      sync_status = retcode;
+
+      // ENOENT means the entry was deleted on the master: fall through to
+      // the removal path below
+      if (sync_status == -ENOENT) {
+        break;
+      }
+
+      if (sync_status < 0) {
+        if (tries < NUM_TRANSIENT_ERROR_RETRIES - 1) {
+          ldpp_dout(dpp, 20) << *this << ": failed to fetch remote metadata entry: " << section << ":" << key << ", will retry" << dendl;
+          continue;
+        }
+
+        // retries exhausted: record the failure in the error logger
+        tn->log(10, SSTR("failed to read remote metadata entry: section=" << section << " key=" << key << " status=" << sync_status));
+        log_error() << "failed to read remote metadata entry: section=" << section << " key=" << key << " status=" << sync_status << std::endl;
+        yield call(sync_env->error_logger->log_error_cr(dpp, sync_env->conn->get_remote_id(), section, key, -sync_status,
+                                                        string("failed to read remote metadata entry: ") + cpp_strerror(-sync_status)));
+        return set_cr_error(sync_status);
+      }
+
+      break;
+    }
+
+    retcode = 0;
+    // phase 2: apply locally (store, or remove on remote ENOENT), again
+    // with transient retries
+    for (tries = 0; tries < NUM_TRANSIENT_ERROR_RETRIES; tries++) {
+      if (sync_status != -ENOENT) {
+        tn->log(10, SSTR("storing local metadata entry: " << section << ":" << key));
+        yield call(new RGWMetaStoreEntryCR(sync_env, raw_key, md_bl));
+      } else {
+        tn->log(10, SSTR("removing local metadata entry:" << section << ":" << key));
+        yield call(new RGWMetaRemoveEntryCR(sync_env, raw_key));
+        if (retcode == -ENOENT) {
+          // already gone locally: treat as success
+          retcode = 0;
+          break;
+        }
+      }
+      if ((retcode < 0) && (tries < NUM_TRANSIENT_ERROR_RETRIES - 1)) {
+        ldpp_dout(dpp, 20) << *this << ": failed to store metadata entry: " << section << ":" << key << ", got retcode=" << retcode << ", will retry" << dendl;
+        continue;
+      }
+      break;
+    }
+
+    sync_status = retcode;
+
+    // phase 3: advance the shard marker past this entry
+    if (sync_status == 0 && marker_tracker) {
+      /* update marker */
+      yield call(marker_tracker->finish(entry_marker));
+      sync_status = retcode;
+    }
+    if (sync_status < 0) {
+      tn->log(10, SSTR("failed, status=" << sync_status));
+      return set_cr_error(sync_status);
+    }
+    tn->log(10, "success");
+    return set_cr_done();
+  }
+  return 0;
+}
+
+// Coroutine that clones one remote mdlog shard into the local mdlog:
+// reads the remote shard status, pages entries over REST and stores them
+// locally, advancing *new_marker as it goes.  The per-state logic lives
+// in the state_*() methods defined elsewhere in this file.
+class RGWCloneMetaLogCoroutine : public RGWCoroutine {
+  RGWMetaSyncEnv *sync_env;
+  RGWMetadataLog *mdlog;     // local destination log
+
+  const std::string& period; // period whose log is being cloned (caller-owned)
+  int shard_id;
+  string marker;             // position to resume cloning from
+  bool truncated = false;
+  string *new_marker;        // caller-owned: updated to the last cloned position
+
+  int max_entries = CLONE_MAX_ENTRIES;
+
+  RGWRESTReadResource *http_op = nullptr;
+  boost::intrusive_ptr<RGWMetadataLogInfoCompletion> completion;
+
+  RGWMetadataLogInfo shard_info;   // remote shard status
+  rgw_mdlog_shard_data data;       // one page of remote entries
+
+public:
+  RGWCloneMetaLogCoroutine(RGWMetaSyncEnv *_sync_env, RGWMetadataLog* mdlog,
+                           const std::string& period, int _id,
+                           const string& _marker, string *_new_marker)
+    : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), mdlog(mdlog),
+      period(period), shard_id(_id), marker(_marker), new_marker(_new_marker) {
+    if (new_marker) {
+      // start the output marker at the resume position
+      *new_marker = marker;
+    }
+  }
+  ~RGWCloneMetaLogCoroutine() override {
+    // release any in-flight http op / shard-info completion
+    if (http_op) {
+      http_op->put();
+    }
+    if (completion) {
+      completion->cancel();
+    }
+  }
+
+  int operate(const DoutPrefixProvider *dpp) override;
+
+  int state_init();
+  int state_read_shard_status();
+  int state_read_shard_status_complete();
+  int state_send_rest_request(const DoutPrefixProvider *dpp);
+  int state_receive_rest_response();
+  int state_store_mdlog_entries();
+  int state_store_mdlog_entries_complete();
+};
+
+class RGWMetaSyncShardCR : public RGWCoroutine {
+ RGWMetaSyncEnv *sync_env;
+
+ const rgw_pool& pool;
+ const std::string& period; //< currently syncing period id
+ const epoch_t realm_epoch; //< realm_epoch of period
+ RGWMetadataLog* mdlog; //< log of syncing period
+ uint32_t shard_id;
+ rgw_meta_sync_marker& sync_marker;
+ boost::optional<rgw_meta_sync_marker> temp_marker; //< for pending updates
+ string marker;
+ string max_marker;
+ const std::string& period_marker; //< max marker stored in next period
+
+ RGWRadosGetOmapKeysCR::ResultPtr omapkeys;
+ std::set<std::string> entries;
+ std::set<std::string>::iterator iter;
+
+ string oid;
+
+ RGWMetaSyncShardMarkerTrack *marker_tracker = nullptr;
+
+ list<cls_log_entry> log_entries;
+ list<cls_log_entry>::iterator log_iter;
+ bool truncated = false;
+
+ string mdlog_marker;
+ string raw_key;
+ rgw_mdlog_entry mdlog_entry;
+
+ ceph::mutex inc_lock = ceph::make_mutex("RGWMetaSyncShardCR::inc_lock");
+ ceph::condition_variable inc_cond;
+
+ boost::asio::coroutine incremental_cr;
+ boost::asio::coroutine full_cr;
+
+ boost::intrusive_ptr<RGWContinuousLeaseCR> lease_cr;
+ boost::intrusive_ptr<RGWCoroutinesStack> lease_stack;
+
+ bool lost_lock = false;
+
+ bool *reset_backoff;
+
+ // hold a reference to the cr stack while it's in the map
+ using StackRef = boost::intrusive_ptr<RGWCoroutinesStack>;
+ map<StackRef, string> stack_to_pos;
+ map<string, string> pos_to_prev;
+
+ bool can_adjust_marker = false;
+ bool done_with_period = false;
+
+ int total_entries = 0;
+
+ RGWSyncTraceNodeRef tn;
+public:
+ RGWMetaSyncShardCR(RGWMetaSyncEnv *_sync_env, const rgw_pool& _pool,
+ const std::string& period, epoch_t realm_epoch,
+ RGWMetadataLog* mdlog, uint32_t _shard_id,
+ rgw_meta_sync_marker& _marker,
+ const std::string& period_marker, bool *_reset_backoff,
+ RGWSyncTraceNodeRef& _tn)
+ : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), pool(_pool),
+ period(period), realm_epoch(realm_epoch), mdlog(mdlog),
+ shard_id(_shard_id), sync_marker(_marker),
+ period_marker(period_marker),
+ reset_backoff(_reset_backoff), tn(_tn) {
+ *reset_backoff = false;
+ }
+
+ ~RGWMetaSyncShardCR() override {
+ delete marker_tracker;
+ if (lease_cr) {
+ lease_cr->abort();
+ }
+ }
+
+ void set_marker_tracker(RGWMetaSyncShardMarkerTrack *mt) {
+ delete marker_tracker;
+ marker_tracker = mt;
+ }
+
+ int operate(const DoutPrefixProvider *dpp) override {
+ int r;
+ while (true) {
+ switch (sync_marker.state) {
+ case rgw_meta_sync_marker::FullSync:
+ r = full_sync();
+ if (r < 0) {
+ ldpp_dout(dpp, 10) << "sync: full_sync: shard_id=" << shard_id << " r=" << r << dendl;
+ return set_cr_error(r);
+ }
+ return 0;
+ case rgw_meta_sync_marker::IncrementalSync:
+ r = incremental_sync();
+ if (r < 0) {
+ ldpp_dout(dpp, 10) << "sync: incremental_sync: shard_id=" << shard_id << " r=" << r << dendl;
+ return set_cr_error(r);
+ }
+ return 0;
+ }
+ }
+ /* unreachable */
+ return 0;
+ }
+
+  // Reap completed child stacks (the spawned RGWMetaSyncSingleEntryCR's)
+  // and advance sync_marker.marker only as far as the oldest position
+  // that is still in flight. On any child error, stop advancing the
+  // marker (can_adjust_marker=false) so the control CR retries from the
+  // last good position.
+  void collect_children()
+  {
+    int child_ret;
+    RGWCoroutinesStack *child;
+    while (collect_next(&child_ret, &child)) {
+      auto iter = stack_to_pos.find(child);
+      if (iter == stack_to_pos.end()) {
+        /* some other stack that we don't care about */
+        continue;
+      }
+
+      string& pos = iter->second;
+
+      if (child_ret < 0) {
+        ldpp_dout(sync_env->dpp, 0) << *this << ": child operation stack=" << child << " entry=" << pos << " returned " << child_ret << dendl;
+        // on any error code from RGWMetaSyncSingleEntryCR, we do not advance
+        // the sync status marker past this entry, and set
+        // can_adjust_marker=false to exit out of RGWMetaSyncShardCR.
+        // RGWMetaSyncShardControlCR will rerun RGWMetaSyncShardCR from the
+        // previous marker and retry
+        can_adjust_marker = false;
+      }
+
+      map<string, string>::iterator prev_iter = pos_to_prev.find(pos);
+      ceph_assert(prev_iter != pos_to_prev.end());
+
+      // pos_to_prev maps each in-flight position to its predecessor; the
+      // persisted marker may only move up to the smallest pending position
+      if (pos_to_prev.size() == 1) {
+        if (can_adjust_marker) {
+          sync_marker.marker = pos;
+        }
+        pos_to_prev.erase(prev_iter);
+      } else {
+        ceph_assert(pos_to_prev.size() > 1);
+        pos_to_prev.erase(prev_iter);
+        prev_iter = pos_to_prev.begin();
+        if (can_adjust_marker) {
+          sync_marker.marker = prev_iter->second;
+        }
+      }
+
+      ldpp_dout(sync_env->dpp, 4) << *this << ": adjusting marker pos=" << sync_marker.marker << dendl;
+      stack_to_pos.erase(iter);
+    }
+  }
+
+  // Full sync: take the shard lease, then walk the shard's omap index of
+  // metadata keys (built by RGWFetchAllMetaCR) and sync every entry,
+  // bounding concurrency by rgw_meta_sync_spawn_window. On success the
+  // persisted marker is switched to IncrementalSync. Returns -EBUSY if
+  // the lease was lost, -EAGAIN if a child failed (marker not advanced).
+  int full_sync() {
+#define OMAP_GET_MAX_ENTRIES 100
+    int max_entries = OMAP_GET_MAX_ENTRIES;
+    reenter(&full_cr) {
+      set_status("full_sync");
+      tn->log(10, "start full sync");
+      oid = full_sync_index_shard_oid(shard_id);
+      can_adjust_marker = true;
+      /* grab lock */
+      yield {
+        uint32_t lock_duration = cct->_conf->rgw_sync_lease_period;
+        string lock_name = "sync_lock";
+        rgw::sal::RadosStore* store = sync_env->store;
+        lease_cr.reset(new RGWContinuousLeaseCR(sync_env->async_rados, store,
+                                                rgw_raw_obj(pool, sync_env->shard_obj_name(shard_id)),
+                                                lock_name, lock_duration, this, nullptr));
+        lease_stack.reset(spawn(lease_cr.get(), false));
+        lost_lock = false;
+      }
+      // sleep until the lease CR either acquires the lock or gives up
+      while (!lease_cr->is_locked()) {
+        if (lease_cr->is_done()) {
+          drain_all();
+          tn->log(5, "failed to take lease");
+          return lease_cr->get_ret_status();
+        }
+        set_sleeping(true);
+        yield;
+      }
+      tn->log(10, "took lease");
+
+      /* lock succeeded, a retry now should avoid previous backoff status */
+      *reset_backoff = true;
+
+      /* prepare marker tracker */
+      set_marker_tracker(new RGWMetaSyncShardMarkerTrack(sync_env,
+                                                         sync_env->shard_obj_name(shard_id),
+                                                         sync_marker, tn));
+
+      marker = sync_marker.marker;
+
+      total_entries = sync_marker.pos;
+
+      /* sync! */
+      do {
+        if (!lease_cr->is_locked()) {
+          tn->log(1, "lease is lost, abort");
+          lost_lock = true;
+          break;
+        }
+        omapkeys = std::make_shared<RGWRadosGetOmapKeysCR::Result>();
+        yield call(new RGWRadosGetOmapKeysCR(sync_env->store, rgw_raw_obj(pool, oid),
+                                             marker, max_entries, omapkeys));
+        if (retcode < 0) {
+          ldpp_dout(sync_env->dpp, 0) << "ERROR: " << __func__ << "(): RGWRadosGetOmapKeysCR() returned ret=" << retcode << dendl;
+          tn->log(0, SSTR("ERROR: failed to list omap keys, status=" << retcode));
+          yield lease_cr->go_down();
+          drain_all();
+          return retcode;
+        }
+        entries = std::move(omapkeys->entries);
+        tn->log(20, SSTR("retrieved " << entries.size() << " entries to sync"));
+        if (entries.size() > 0) {
+          tn->set_flag(RGW_SNS_FLAG_ACTIVE); /* actually have entries to sync */
+        }
+        iter = entries.begin();
+        for (; iter != entries.end(); ++iter) {
+          marker = *iter;
+          tn->log(20, SSTR("full sync: " << marker));
+          total_entries++;
+          if (!marker_tracker->start(marker, total_entries, real_time())) {
+            tn->log(0, SSTR("ERROR: cannot start syncing " << marker << ". Duplicate entry?"));
+          } else {
+            // fetch remote and write locally
+            yield {
+              RGWCoroutinesStack *stack = spawn(new RGWMetaSyncSingleEntryCR(sync_env, marker, marker, MDLOG_STATUS_COMPLETE, marker_tracker, tn), false);
+              // stack_to_pos holds a reference to the stack
+              stack_to_pos[stack] = marker;
+              pos_to_prev[marker] = marker;
+            }
+            // limit spawn window
+            while (num_spawned() > static_cast<size_t>(cct->_conf->rgw_meta_sync_spawn_window)) {
+              yield wait_for_child();
+              collect_children();
+            }
+          }
+        }
+        collect_children();
+      } while (omapkeys->more && can_adjust_marker);
+
+      tn->unset_flag(RGW_SNS_FLAG_ACTIVE); /* done spawning new entries */
+
+      // drain remaining children; the lease stack itself accounts for 1
+      while (num_spawned() > 1) {
+        yield wait_for_child();
+        collect_children();
+      }
+
+      if (!lost_lock) {
+        /* update marker to reflect we're done with full sync */
+        if (can_adjust_marker) {
+          // apply updates to a temporary marker, or operate() will send us
+          // to incremental_sync() after we yield
+          temp_marker = sync_marker;
+          temp_marker->state = rgw_meta_sync_marker::IncrementalSync;
+          temp_marker->marker = std::move(temp_marker->next_step_marker);
+          temp_marker->next_step_marker.clear();
+          temp_marker->realm_epoch = realm_epoch;
+          ldpp_dout(sync_env->dpp, 4) << *this << ": saving marker pos=" << temp_marker->marker << " realm_epoch=" << realm_epoch << dendl;
+
+          using WriteMarkerCR = RGWSimpleRadosWriteCR<rgw_meta_sync_marker>;
+          yield call(new WriteMarkerCR(sync_env->dpp, sync_env->store,
+                                       rgw_raw_obj(pool, sync_env->shard_obj_name(shard_id)),
+                                       *temp_marker));
+        }
+
+        if (retcode < 0) {
+          ldpp_dout(sync_env->dpp, 0) << "ERROR: failed to set sync marker: retcode=" << retcode << dendl;
+          yield lease_cr->go_down();
+          drain_all();
+          return retcode;
+        }
+        // clean up full sync index
+        yield {
+          auto oid = full_sync_index_shard_oid(shard_id);
+          call(new RGWRadosRemoveCR(sync_env->store, {pool, oid}));
+        }
+      }
+
+      /*
+       * NOTE(review): this point is reached whether or not the lock was
+       * lost; the lease is dropped and children drained in both cases,
+       * and lost_lock / can_adjust_marker decide the return value below
+       */
+
+      yield lease_cr->go_down();
+
+      lease_cr.reset();
+
+      drain_all();
+
+      if (!can_adjust_marker) {
+        return -EAGAIN;
+      }
+
+      if (lost_lock) {
+        return -EBUSY;
+      }
+
+      tn->log(10, "full sync complete");
+
+      // apply the sync marker update
+      ceph_assert(temp_marker);
+      sync_marker = std::move(*temp_marker);
+      temp_marker = boost::none;
+      // must not yield after this point!
+    }
+    return 0;
+  }
+
+
+  // Incremental sync: follow the remote metadata log for this shard,
+  // cloning remote mdlog entries locally (RGWCloneMetaLogCoroutine) and
+  // replaying them (RGWMetaSyncSingleEntryCR), up to period_marker if a
+  // newer period exists. Returns set_cr_done() when done with the current
+  // period, -EBUSY on lost lease, -EAGAIN when a child entry failed.
+  int incremental_sync() {
+    reenter(&incremental_cr) {
+      set_status("incremental_sync");
+      tn->log(10, "start incremental sync");
+      can_adjust_marker = true;
+      /* grab lock */
+      if (!lease_cr) { /* could have had a lease_cr lock from previous state */
+        yield {
+          uint32_t lock_duration = cct->_conf->rgw_sync_lease_period;
+          string lock_name = "sync_lock";
+          rgw::sal::RadosStore* store = sync_env->store;
+          lease_cr.reset( new RGWContinuousLeaseCR(sync_env->async_rados, store,
+                                                   rgw_raw_obj(pool, sync_env->shard_obj_name(shard_id)),
+                                                   lock_name, lock_duration, this, nullptr));
+          lease_stack.reset(spawn(lease_cr.get(), false));
+          lost_lock = false;
+        }
+        while (!lease_cr->is_locked()) {
+          if (lease_cr->is_done()) {
+            drain_all();
+            tn->log(5, "failed to take lease");
+            return lease_cr->get_ret_status();
+          }
+          set_sleeping(true);
+          yield;
+        }
+      }
+      tn->log(10, "took lease");
+      // if the period has advanced, we can't use the existing marker
+      if (sync_marker.realm_epoch < realm_epoch) {
+        ldpp_dout(sync_env->dpp, 4) << "clearing marker=" << sync_marker.marker
+            << " from old realm_epoch=" << sync_marker.realm_epoch
+            << " (now " << realm_epoch << ')' << dendl;
+        sync_marker.realm_epoch = realm_epoch;
+        sync_marker.marker.clear();
+      }
+      mdlog_marker = sync_marker.marker;
+      set_marker_tracker(new RGWMetaSyncShardMarkerTrack(sync_env,
+                                                         sync_env->shard_obj_name(shard_id),
+                                                         sync_marker, tn));
+
+      /*
+       * mdlog_marker: the remote sync marker positiion
+       * sync_marker: the local sync marker position
+       * max_marker: the max mdlog position that we fetched
+       * marker: the current position we try to sync
+       * period_marker: the last marker before the next period begins (optional)
+       */
+      marker = max_marker = sync_marker.marker;
+      /* inc sync */
+      do {
+        if (!lease_cr->is_locked()) {
+          lost_lock = true;
+          tn->log(1, "lease is lost, abort");
+          break;
+        }
+#define INCREMENTAL_MAX_ENTRIES 100
+        ldpp_dout(sync_env->dpp, 20) << __func__ << ":" << __LINE__ << ": shard_id=" << shard_id << " mdlog_marker=" << mdlog_marker << " sync_marker.marker=" << sync_marker.marker << " period_marker=" << period_marker << " truncated=" << truncated << dendl;
+        if (!period_marker.empty() && period_marker <= mdlog_marker) {
+          tn->log(10, SSTR("finished syncing current period: mdlog_marker=" << mdlog_marker << " sync_marker=" << sync_marker.marker << " period_marker=" << period_marker));
+          done_with_period = true;
+          break;
+        }
+        if (mdlog_marker <= max_marker || !truncated) {
+          /* we're at the tip, try to bring more entries */
+          ldpp_dout(sync_env->dpp, 20) << __func__ << ":" << __LINE__ << ": shard_id=" << shard_id << " syncing mdlog for shard_id=" << shard_id << dendl;
+          yield call(new RGWCloneMetaLogCoroutine(sync_env, mdlog,
+                                                  period, shard_id,
+                                                  mdlog_marker, &mdlog_marker));
+        }
+        if (retcode < 0) {
+          tn->log(10, SSTR(*this << ": failed to fetch more log entries, retcode=" << retcode));
+          yield lease_cr->go_down();
+          drain_all();
+          *reset_backoff = false; // back off and try again later
+          return retcode;
+        }
+        truncated = true;
+        *reset_backoff = true; /* if we got to this point, all systems function */
+        if (mdlog_marker > max_marker) {
+          tn->set_flag(RGW_SNS_FLAG_ACTIVE); /* actually have entries to sync */
+          tn->log(20, SSTR("mdlog_marker=" << mdlog_marker << " sync_marker=" << sync_marker.marker));
+          marker = max_marker;
+          yield call(new RGWReadMDLogEntriesCR(sync_env, mdlog, shard_id,
+                                               &max_marker, INCREMENTAL_MAX_ENTRIES,
+                                               &log_entries, &truncated));
+          if (retcode < 0) {
+            tn->log(10, SSTR("failed to list mdlog entries, retcode=" << retcode));
+            yield lease_cr->go_down();
+            drain_all();
+            *reset_backoff = false; // back off and try again later
+            return retcode;
+          }
+          for (log_iter = log_entries.begin(); log_iter != log_entries.end() && !done_with_period; ++log_iter) {
+            if (!period_marker.empty() && period_marker <= log_iter->id) {
+              done_with_period = true;
+              if (period_marker < log_iter->id) {
+                tn->log(10, SSTR("found key=" << log_iter->id
+                    << " past period_marker=" << period_marker));
+                break;
+              }
+              ldpp_dout(sync_env->dpp, 10) << "found key at period_marker=" << period_marker << dendl;
+              // sync this entry, then return control to RGWMetaSyncCR
+            }
+            if (!mdlog_entry.convert_from(*log_iter)) {
+              tn->log(0, SSTR("ERROR: failed to convert mdlog entry, shard_id=" << shard_id << " log_entry: " << log_iter->id << ":" << log_iter->section << ":" << log_iter->name << ":" << log_iter->timestamp << " ... skipping entry"));
+              continue;
+            }
+            tn->log(20, SSTR("log_entry: " << log_iter->id << ":" << log_iter->section << ":" << log_iter->name << ":" << log_iter->timestamp));
+            if (!marker_tracker->start(log_iter->id, 0, log_iter->timestamp.to_real_time())) {
+              ldpp_dout(sync_env->dpp, 0) << "ERROR: cannot start syncing " << log_iter->id << ". Duplicate entry?" << dendl;
+            } else {
+              raw_key = log_iter->section + ":" + log_iter->name;
+              yield {
+                RGWCoroutinesStack *stack = spawn(new RGWMetaSyncSingleEntryCR(sync_env, raw_key, log_iter->id, mdlog_entry.log_data.status, marker_tracker, tn), false);
+                ceph_assert(stack);
+                // stack_to_pos holds a reference to the stack
+                stack_to_pos[stack] = log_iter->id;
+                pos_to_prev[log_iter->id] = marker;
+              }
+              // limit spawn window
+              while (num_spawned() > static_cast<size_t>(cct->_conf->rgw_meta_sync_spawn_window)) {
+                yield wait_for_child();
+                collect_children();
+              }
+            }
+            marker = log_iter->id;
+          }
+        }
+        collect_children();
+        ldpp_dout(sync_env->dpp, 20) << __func__ << ":" << __LINE__ << ": shard_id=" << shard_id << " mdlog_marker=" << mdlog_marker << " max_marker=" << max_marker << " sync_marker.marker=" << sync_marker.marker << " period_marker=" << period_marker << dendl;
+        if (done_with_period) {
+          // return control to RGWMetaSyncCR and advance to the next period
+          tn->log(10, SSTR(*this << ": done with period"));
+          break;
+        }
+        if (mdlog_marker == max_marker && can_adjust_marker) {
+          // caught up with the log tail; poll again after an interval
+          tn->unset_flag(RGW_SNS_FLAG_ACTIVE);
+          yield wait(utime_t(cct->_conf->rgw_meta_sync_poll_interval, 0));
+        }
+      } while (can_adjust_marker);
+
+      tn->unset_flag(RGW_SNS_FLAG_ACTIVE);
+
+      // drain remaining children; the lease stack itself accounts for 1
+      while (num_spawned() > 1) {
+        yield wait_for_child();
+        collect_children();
+      }
+
+      yield lease_cr->go_down();
+
+      drain_all();
+
+      if (lost_lock) {
+        return -EBUSY;
+      }
+
+      if (!can_adjust_marker) {
+        return -EAGAIN;
+      }
+
+      return set_cr_done();
+    }
+    /* TODO */
+    return 0;
+  }
+};
+
+// Retry/backoff wrapper around RGWMetaSyncShardCR for a single shard:
+// RGWBackoffControlCR re-allocates the shard CR on failure (exit_on_error
+// is false, so all errors are retried) and re-reads the persisted shard
+// marker via alloc_finisher_cr() before each retry.
+class RGWMetaSyncShardControlCR : public RGWBackoffControlCR
+{
+  RGWMetaSyncEnv *sync_env;
+
+  const rgw_pool& pool;
+  const std::string& period;
+  epoch_t realm_epoch;
+  RGWMetadataLog* mdlog;
+  uint32_t shard_id;
+  rgw_meta_sync_marker sync_marker;
+  const std::string period_marker;
+
+  RGWSyncTraceNodeRef tn;
+
+  static constexpr bool exit_on_error = false; // retry on all errors
+public:
+  RGWMetaSyncShardControlCR(RGWMetaSyncEnv *_sync_env, const rgw_pool& _pool,
+                            const std::string& period, epoch_t realm_epoch,
+                            RGWMetadataLog* mdlog, uint32_t _shard_id,
+                            const rgw_meta_sync_marker& _marker,
+                            std::string&& period_marker,
+                            RGWSyncTraceNodeRef& _tn_parent)
+    : RGWBackoffControlCR(_sync_env->cct, exit_on_error), sync_env(_sync_env),
+      pool(_pool), period(period), realm_epoch(realm_epoch), mdlog(mdlog),
+      shard_id(_shard_id), sync_marker(_marker),
+      period_marker(std::move(period_marker)) {
+    tn = sync_env->sync_tracer->add_node(_tn_parent, "shard",
+                                         std::to_string(shard_id));
+  }
+
+  // Allocate the actual worker CR; called by the backoff framework on
+  // every (re)try.
+  RGWCoroutine *alloc_cr() override {
+    return new RGWMetaSyncShardCR(sync_env, pool, period, realm_epoch, mdlog,
+                                  shard_id, sync_marker, period_marker, backoff_ptr(), tn);
+  }
+
+  // Refresh sync_marker from the shard status object before a retry.
+  RGWCoroutine *alloc_finisher_cr() override {
+    rgw::sal::RadosStore* store = sync_env->store;
+    return new RGWSimpleRadosReadCR<rgw_meta_sync_marker>(sync_env->dpp, store,
+                                                          rgw_raw_obj(pool, sync_env->shard_obj_name(shard_id)),
+                                                          &sync_marker);
+  }
+};
+
+// Drives metadata sync across all shards, one period at a time: spawns a
+// RGWMetaSyncShardControlCR per shard, waits for them, then advances the
+// period cursor and persists the updated sync info. wakeup() lets the
+// RADOS notify path nudge an individual shard CR.
+class RGWMetaSyncCR : public RGWCoroutine {
+  RGWMetaSyncEnv *sync_env;
+  const rgw_pool& pool;
+  RGWPeriodHistory::Cursor cursor; //< sync position in period history
+  RGWPeriodHistory::Cursor next; //< next period in history
+  rgw_meta_sync_status sync_status;
+  RGWSyncTraceNodeRef tn;
+
+  std::mutex mutex; //< protect access to shard_crs
+
+  // TODO: it should be enough to hold a reference on the stack only, as calling
+  // RGWCoroutinesStack::wakeup() doesn't refer to the RGWCoroutine if it has
+  // already completed
+  using ControlCRRef = boost::intrusive_ptr<RGWMetaSyncShardControlCR>;
+  using StackRef = boost::intrusive_ptr<RGWCoroutinesStack>;
+  using RefPair = std::pair<ControlCRRef, StackRef>;
+  map<int, RefPair> shard_crs;
+  int ret{0};
+
+public:
+  RGWMetaSyncCR(RGWMetaSyncEnv *_sync_env, const RGWPeriodHistory::Cursor &cursor,
+                const rgw_meta_sync_status& _sync_status, RGWSyncTraceNodeRef& _tn)
+    : RGWCoroutine(_sync_env->cct), sync_env(_sync_env),
+      pool(sync_env->store->svc()->zone->get_zone_params().log_pool),
+      cursor(cursor), sync_status(_sync_status), tn(_tn) {}
+
+  ~RGWMetaSyncCR() {
+  }
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      // loop through one period at a time
+      tn->log(1, "start");
+      for (;;) {
+        // determine whether we're on the current period (no 'next'), or
+        // an older one whose successor bounds this pass via period_marker
+        if (cursor == sync_env->store->svc()->mdlog->get_period_history()->get_current()) {
+          next = RGWPeriodHistory::Cursor{};
+          if (cursor) {
+            ldpp_dout(dpp, 10) << "RGWMetaSyncCR on current period="
+                << cursor.get_period().get_id() << dendl;
+          } else {
+            ldpp_dout(dpp, 10) << "RGWMetaSyncCR with no period" << dendl;
+          }
+        } else {
+          next = cursor;
+          next.next();
+          ldpp_dout(dpp, 10) << "RGWMetaSyncCR on period="
+              << cursor.get_period().get_id() << ", next="
+              << next.get_period().get_id() << dendl;
+        }
+
+        yield {
+          // get the mdlog for the current period (may be empty)
+          auto& period_id = sync_status.sync_info.period;
+          auto realm_epoch = sync_status.sync_info.realm_epoch;
+          auto mdlog = sync_env->store->svc()->mdlog->get_log(period_id);
+
+          tn->log(1, SSTR("realm epoch=" << realm_epoch << " period id=" << period_id));
+
+          // prevent wakeup() from accessing shard_crs while we're spawning them
+          std::lock_guard<std::mutex> lock(mutex);
+
+          // sync this period on each shard
+          for (const auto& m : sync_status.sync_markers) {
+            uint32_t shard_id = m.first;
+            auto& marker = m.second;
+
+            std::string period_marker;
+            if (next) {
+              // read the maximum marker from the next period's sync status
+              period_marker = next.get_period().get_sync_status()[shard_id];
+              if (period_marker.empty()) {
+                // no metadata changes have occurred on this shard, skip it
+                ldpp_dout(dpp, 10) << "RGWMetaSyncCR: skipping shard " << shard_id
+                    << " with empty period marker" << dendl;
+                continue;
+              }
+            }
+
+            using ShardCR = RGWMetaSyncShardControlCR;
+            auto cr = new ShardCR(sync_env, pool, period_id, realm_epoch,
+                                  mdlog, shard_id, marker,
+                                  std::move(period_marker), tn);
+            auto stack = spawn(cr, false);
+            shard_crs[shard_id] = RefPair{cr, stack};
+          }
+        }
+        // wait for each shard to complete
+        while (ret == 0 && num_spawned() > 0) {
+          yield wait_for_child();
+          collect(&ret, nullptr);
+        }
+        drain_all();
+        {
+          // drop shard cr refs under lock
+          std::lock_guard<std::mutex> lock(mutex);
+          shard_crs.clear();
+        }
+        if (ret < 0) {
+          return set_cr_error(ret);
+        }
+        // advance to the next period
+        ceph_assert(next);
+        cursor = next;
+
+        // write the updated sync info
+        sync_status.sync_info.period = cursor.get_period().get_id();
+        sync_status.sync_info.realm_epoch = cursor.get_epoch();
+        yield call(new RGWSimpleRadosWriteCR<rgw_meta_sync_info>(dpp, sync_env->store,
+                                                                 rgw_raw_obj(pool, sync_env->status_oid()),
+                                                                 sync_status.sync_info));
+      }
+    }
+    return 0;
+  }
+
+  // Wake the control CR for one shard (no-op if it already finished or
+  // was never spawned). Safe to call from another thread; guarded by mutex.
+  void wakeup(int shard_id) {
+    std::lock_guard<std::mutex> lock(mutex);
+    auto iter = shard_crs.find(shard_id);
+    if (iter == shard_crs.end()) {
+      return;
+    }
+    iter->second.first->wakeup();
+  }
+};
+
+// Populate a sync environment with this log's store, connection and
+// helper services so coroutines can run against the remote master.
+void RGWRemoteMetaLog::init_sync_env(RGWMetaSyncEnv *env) {
+  env->dpp = dpp;
+  env->cct = store->ctx();
+  env->store = store;
+  env->conn = conn;
+  env->async_rados = async_rados;
+  env->http_manager = &http_manager;
+  env->error_logger = error_logger;
+  env->sync_tracer = store->getRados()->get_sync_tracer();
+}
+
+// Read the persisted metadata sync status. No-op (returns 0) on the
+// metadata master, which does not sync from anyone.
+int RGWRemoteMetaLog::read_sync_status(const DoutPrefixProvider *dpp, rgw_meta_sync_status *sync_status)
+{
+  if (store->svc()->zone->is_meta_master()) {
+    return 0;
+  }
+  // cannot run concurrently with run_sync(), so run in a separate manager
+  RGWCoroutinesManager crs(store->ctx(), store->getRados()->get_cr_registry());
+  RGWHTTPManager http_manager(store->ctx(), crs.get_completion_mgr());
+  int ret = http_manager.start();
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "failed in http_manager.start() ret=" << ret << dendl;
+    return ret;
+  }
+  // use a local copy of the env pointing at the private http manager
+  RGWMetaSyncEnv sync_env_local = sync_env;
+  sync_env_local.http_manager = &http_manager;
+  tn->log(20, "read sync status");
+  ret = crs.run(dpp, new RGWReadSyncStatusCoroutine(&sync_env_local, sync_status));
+  http_manager.stop();
+  return ret;
+}
+
+// Initialize metadata sync status from the master's mdlog info and the
+// current period (if any). No-op on the metadata master.
+int RGWRemoteMetaLog::init_sync_status(const DoutPrefixProvider *dpp)
+{
+  if (store->svc()->zone->is_meta_master()) {
+    return 0;
+  }
+
+  rgw_mdlog_info mdlog_info;
+  int r = read_log_info(dpp, &mdlog_info);
+  if (r < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: fail to fetch master log info (r=" << r << ")" << dendl;
+    return r;
+  }
+
+  rgw_meta_sync_info sync_info;
+  sync_info.num_shards = mdlog_info.num_shards;
+  auto cursor = store->svc()->mdlog->get_period_history()->get_current();
+  if (cursor) {
+    sync_info.period = cursor.get_period().get_id();
+    sync_info.realm_epoch = cursor.get_epoch();
+  }
+
+  return run(dpp, new RGWInitSyncStatusCoroutine(&sync_env, sync_info));
+}
+
+// Persist the given sync info to the status object in the log pool.
+int RGWRemoteMetaLog::store_sync_info(const DoutPrefixProvider *dpp, const rgw_meta_sync_info& sync_info)
+{
+  tn->log(20, "store sync info");
+  return run(dpp, new RGWSimpleRadosWriteCR<rgw_meta_sync_info>(dpp, store,
+                                                                rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, sync_env.status_oid()),
+                                                                sync_info));
+}
+
+// return a cursor to the period at our sync position
+static RGWPeriodHistory::Cursor get_period_at(const DoutPrefixProvider *dpp,
+ rgw::sal::RadosStore* store,
+ const rgw_meta_sync_info& info,
+ optional_yield y)
+{
+ if (info.period.empty()) {
+ // return an empty cursor with error=0
+ return RGWPeriodHistory::Cursor{};
+ }
+
+ // look for an existing period in our history
+ auto cursor = store->svc()->mdlog->get_period_history()->lookup(info.realm_epoch);
+ if (cursor) {
+ // verify that the period ids match
+ auto& existing = cursor.get_period().get_id();
+ if (existing != info.period) {
+ ldpp_dout(dpp, -1) << "ERROR: sync status period=" << info.period
+ << " does not match period=" << existing
+ << " in history at realm epoch=" << info.realm_epoch << dendl;
+ return RGWPeriodHistory::Cursor{-EEXIST};
+ }
+ return cursor;
+ }
+
+ // read the period from rados or pull it from the master
+ RGWPeriod period;
+ int r = store->svc()->mdlog->pull_period(dpp, info.period, period, y);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: failed to read period id "
+ << info.period << ": " << cpp_strerror(r) << dendl;
+ return RGWPeriodHistory::Cursor{r};
+ }
+ // attach the period to our history
+ cursor = store->svc()->mdlog->get_period_history()->attach(dpp, std::move(period), y);
+ if (!cursor) {
+ r = cursor.get_error();
+ ldpp_dout(dpp, -1) << "ERROR: failed to read period history back to "
+ << info.period << ": " << cpp_strerror(r) << dendl;
+ }
+ return cursor;
+}
+
+// Main metadata sync loop: waits for the master to be reachable, (re-)
+// initializes sync status when needed, builds full-sync maps, then runs
+// RGWMetaSyncCR until shutdown. No-op on the metadata master.
+int RGWRemoteMetaLog::run_sync(const DoutPrefixProvider *dpp, optional_yield y)
+{
+  if (store->svc()->zone->is_meta_master()) {
+    return 0;
+  }
+
+  int r = 0;
+
+  // get shard count and oldest log period from master
+  rgw_mdlog_info mdlog_info;
+  for (;;) {
+    if (going_down) {
+      ldpp_dout(dpp, 1) << __func__ << "(): going down" << dendl;
+      return 0;
+    }
+    r = read_log_info(dpp, &mdlog_info);
+    if (r == -EIO || r == -ENOENT) {
+      // keep retrying if master isn't alive or hasn't initialized the log
+      ldpp_dout(dpp, 10) << __func__ << "(): waiting for master.." << dendl;
+      backoff.backoff_sleep();
+      continue;
+    }
+    backoff.reset();
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << "ERROR: fail to fetch master log info (r=" << r << ")" << dendl;
+      return r;
+    }
+    break;
+  }
+
+  rgw_meta_sync_status sync_status;
+  do {
+    if (going_down) {
+      ldpp_dout(dpp, 1) << __func__ << "(): going down" << dendl;
+      return 0;
+    }
+    r = run(dpp, new RGWReadSyncStatusCoroutine(&sync_env, &sync_status));
+    if (r < 0 && r != -ENOENT) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to fetch sync status r=" << r << dendl;
+      return r;
+    }
+
+    if (!mdlog_info.period.empty()) {
+      // restart sync if the remote has a period, but:
+      // a) our status does not, or
+      // b) our sync period comes before the remote's oldest log period
+      if (sync_status.sync_info.period.empty() ||
+          sync_status.sync_info.realm_epoch < mdlog_info.realm_epoch) {
+        sync_status.sync_info.state = rgw_meta_sync_info::StateInit;
+        string reason;
+        if (sync_status.sync_info.period.empty()) {
+          reason = "period is empty";
+        } else {
+          reason = SSTR("sync_info realm epoch is behind: " << sync_status.sync_info.realm_epoch << " < " << mdlog_info.realm_epoch);
+        }
+        tn->log(1, "initialize sync (reason: " + reason + ")");
+        ldpp_dout(dpp, 1) << "epoch=" << sync_status.sync_info.realm_epoch
+           << " in sync status comes before remote's oldest mdlog epoch="
+           << mdlog_info.realm_epoch << ", restarting sync" << dendl;
+      }
+    }
+
+    if (sync_status.sync_info.state == rgw_meta_sync_info::StateInit) {
+      ldpp_dout(dpp, 20) << __func__ << "(): init" << dendl;
+      sync_status.sync_info.num_shards = mdlog_info.num_shards;
+      auto cursor = store->svc()->mdlog->get_period_history()->get_current();
+      if (cursor) {
+        // run full sync, then start incremental from the current period/epoch
+        sync_status.sync_info.period = cursor.get_period().get_id();
+        sync_status.sync_info.realm_epoch = cursor.get_epoch();
+      }
+      r = run(dpp, new RGWInitSyncStatusCoroutine(&sync_env, sync_status.sync_info));
+      if (r == -EBUSY) {
+        // another gateway holds the init lock; retry after backoff
+        backoff.backoff_sleep();
+        continue;
+      }
+      backoff.reset();
+      if (r < 0) {
+        ldpp_dout(dpp, 0) << "ERROR: failed to init sync status r=" << r << dendl;
+        return r;
+      }
+    }
+  } while (sync_status.sync_info.state == rgw_meta_sync_info::StateInit);
+
+  auto num_shards = sync_status.sync_info.num_shards;
+  if (num_shards != mdlog_info.num_shards) {
+    ldpp_dout(dpp, -1) << "ERROR: can't sync, mismatch between num shards, master num_shards=" << mdlog_info.num_shards << " local num_shards=" << num_shards << dendl;
+    return -EINVAL;
+  }
+
+  RGWPeriodHistory::Cursor cursor;
+  do {
+    r = run(dpp, new RGWReadSyncStatusCoroutine(&sync_env, &sync_status));
+    if (r < 0 && r != -ENOENT) {
+      tn->log(0, SSTR("ERROR: failed to fetch sync status r=" << r));
+      return r;
+    }
+
+    switch ((rgw_meta_sync_info::SyncState)sync_status.sync_info.state) {
+      case rgw_meta_sync_info::StateBuildingFullSyncMaps:
+        tn->log(20, "building full sync maps");
+        r = run(dpp, new RGWFetchAllMetaCR(&sync_env, num_shards, sync_status.sync_markers, tn));
+        if (r == -EBUSY || r == -EIO) {
+          backoff.backoff_sleep();
+          continue;
+        }
+        backoff.reset();
+        if (r < 0) {
+          tn->log(0, SSTR("ERROR: failed to fetch all metadata keys (r=" << r << ")"));
+          return r;
+        }
+
+        sync_status.sync_info.state = rgw_meta_sync_info::StateSync;
+        r = store_sync_info(dpp, sync_status.sync_info);
+        if (r < 0) {
+          tn->log(0, SSTR("ERROR: failed to update sync status (r=" << r << ")"));
+          return r;
+        }
+        /* fall through */
+      case rgw_meta_sync_info::StateSync:
+        tn->log(20, "sync");
+        // find our position in the period history (if any)
+        cursor = get_period_at(dpp, store, sync_status.sync_info, y);
+        r = cursor.get_error();
+        if (r < 0) {
+          return r;
+        }
+        meta_sync_cr = new RGWMetaSyncCR(&sync_env, cursor, sync_status, tn);
+        r = run(dpp, meta_sync_cr);
+        if (r < 0) {
+          tn->log(0, "ERROR: failed to fetch all metadata keys");
+          return r;
+        }
+        break;
+      default:
+        tn->log(0, "ERROR: bad sync state!");
+        return -EIO;
+    }
+  } while (!going_down);
+
+  return 0;
+}
+
+// Forward a shard wakeup to the running sync coroutine, if any.
+void RGWRemoteMetaLog::wakeup(int shard_id)
+{
+  if (meta_sync_cr) {
+    meta_sync_cr->wakeup(shard_id);
+  }
+}
+
+// State machine for cloning one shard of the master's metadata log:
+// read local shard position -> fetch remote entries over REST -> store
+// them locally, repeating while the remote listing is truncated. Each
+// state_*() returns a coroutine status (io_block/set_cr_error/0).
+int RGWCloneMetaLogCoroutine::operate(const DoutPrefixProvider *dpp)
+{
+  reenter(this) {
+    do {
+      yield {
+        ldpp_dout(dpp, 20) << __func__ << ": shard_id=" << shard_id << ": init request" << dendl;
+        return state_init();
+      }
+      yield {
+        ldpp_dout(dpp, 20) << __func__ << ": shard_id=" << shard_id << ": reading shard status" << dendl;
+        return state_read_shard_status();
+      }
+      yield {
+        ldpp_dout(dpp, 20) << __func__ << ": shard_id=" << shard_id << ": reading shard status complete" << dendl;
+        return state_read_shard_status_complete();
+      }
+      yield {
+        ldpp_dout(dpp, 20) << __func__ << ": shard_id=" << shard_id << ": sending rest request" << dendl;
+        return state_send_rest_request(dpp);
+      }
+      yield {
+        ldpp_dout(dpp, 20) << __func__ << ": shard_id=" << shard_id << ": receiving rest response" << dendl;
+        return state_receive_rest_response();
+      }
+      yield {
+        ldpp_dout(dpp, 20) << __func__ << ": shard_id=" << shard_id << ": storing mdlog entries" << dendl;
+        return state_store_mdlog_entries();
+      }
+    } while (truncated);
+    yield {
+      ldpp_dout(dpp, 20) << __func__ << ": shard_id=" << shard_id << ": storing mdlog entries complete" << dendl;
+      return state_store_mdlog_entries_complete();
+    }
+  }
+
+  return 0;
+}
+
+// Reset the response buffer before starting a new fetch cycle.
+int RGWCloneMetaLogCoroutine::state_init()
+{
+  data = {};
+
+  return 0;
+}
+
+// Kick off an async read of the local mdlog shard header; the completion
+// callback records marker/last_update and wakes the coroutine stack.
+// Returns io_block(0) to suspend until the callback fires.
+int RGWCloneMetaLogCoroutine::state_read_shard_status()
+{
+  const bool add_ref = false; // default constructs with refs=1
+
+  completion.reset(new RGWMetadataLogInfoCompletion(
+    [this](int ret, const cls_log_header& header) {
+      if (ret < 0) {
+        if (ret != -ENOENT) {
+          ldpp_dout(sync_env->dpp, 1) << "ERROR: failed to read mdlog info with "
+              << cpp_strerror(ret) << dendl;
+        }
+      } else {
+        shard_info.marker = header.max_marker;
+        shard_info.last_update = header.max_time.to_real_time();
+      }
+      // wake up parent stack
+      io_complete();
+    }), add_ref);
+
+  int ret = mdlog->get_info_async(sync_env->dpp, shard_id, completion.get());
+  if (ret < 0) {
+    ldpp_dout(sync_env->dpp, 0) << "ERROR: mdlog->get_info_async() returned ret=" << ret << dendl;
+    return set_cr_error(ret);
+  }
+
+  return io_block(0);
+}
+
+// Async shard-header read finished: release the completion and adopt the
+// local shard's max marker as our starting position for the remote fetch.
+int RGWCloneMetaLogCoroutine::state_read_shard_status_complete()
+{
+  completion.reset();
+
+  ldpp_dout(sync_env->dpp, 20) << "shard_id=" << shard_id << " marker=" << shard_info.marker << " last_update=" << shard_info.last_update << dendl;
+
+  marker = shard_info.marker;
+
+  return 0;
+}
+
+// Issue the async REST request to the master: GET /admin/log with
+// type=metadata for this shard/period, starting after 'marker'.
+// Returns io_block(0) to suspend until the HTTP op completes.
+int RGWCloneMetaLogCoroutine::state_send_rest_request(const DoutPrefixProvider *dpp)
+{
+  RGWRESTConn *conn = sync_env->conn;
+
+  char buf[32];
+  snprintf(buf, sizeof(buf), "%d", shard_id);
+
+  char max_entries_buf[32];
+  snprintf(max_entries_buf, sizeof(max_entries_buf), "%d", max_entries);
+
+  // an empty key makes the pair a no-op, so "marker" is only sent when set
+  const char *marker_key = (marker.empty() ? "" : "marker");
+
+  rgw_http_param_pair pairs[] = { { "type", "metadata" },
+                                  { "id", buf },
+                                  { "period", period.c_str() },
+                                  { "max-entries", max_entries_buf },
+                                  { marker_key, marker.c_str() },
+                                  { NULL, NULL } };
+
+  http_op = new RGWRESTReadResource(conn, "/admin/log", pairs, NULL, sync_env->http_manager);
+
+  init_new_io(http_op);
+
+  int ret = http_op->aio_read(dpp);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to fetch mdlog data" << dendl;
+    log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
+    http_op->put();
+    http_op = NULL;
+    return set_cr_error(ret);
+  }
+
+  return io_block(0);
+}
+
+// Collect the REST response into 'data'. A full page (size == max_entries)
+// marks the listing as truncated. With no entries we are done and report
+// the current marker; otherwise fall through (return 0) to store entries.
+int RGWCloneMetaLogCoroutine::state_receive_rest_response()
+{
+  int ret = http_op->wait(&data, null_yield);
+  if (ret < 0) {
+    error_stream << "http operation failed: " << http_op->to_str() << " status=" << http_op->get_http_status() << std::endl;
+    ldpp_dout(sync_env->dpp, 5) << "failed to wait for op, ret=" << ret << dendl;
+    http_op->put();
+    http_op = NULL;
+    return set_cr_error(ret);
+  }
+  http_op->put();
+  http_op = NULL;
+
+  ldpp_dout(sync_env->dpp, 20) << "remote mdlog, shard_id=" << shard_id << " num of shard entries: " << data.entries.size() << dendl;
+
+  truncated = ((int)data.entries.size() == max_entries);
+
+  if (data.entries.empty()) {
+    if (new_marker) {
+      *new_marker = marker;
+    }
+    return set_cr_done();
+  }
+
+  if (new_marker) {
+    *new_marker = data.entries.back().id;
+  }
+
+  return 0;
+}
+
+
+// Convert the fetched remote mdlog entries to local cls_log_entry format
+// and write them to the local shard asynchronously; 'marker' is advanced
+// to the last converted id. Returns io_block(0) until the AIO completes.
+int RGWCloneMetaLogCoroutine::state_store_mdlog_entries()
+{
+  list<cls_log_entry> dest_entries;
+
+  // idiomatic range-for instead of an explicit iterator loop
+  for (auto& entry : data.entries) {
+    ldpp_dout(sync_env->dpp, 20) << "entry: name=" << entry.name << dendl;
+
+    cls_log_entry dest_entry;
+    dest_entry.id = entry.id;
+    dest_entry.section = entry.section;
+    dest_entry.name = entry.name;
+    dest_entry.timestamp = utime_t(entry.timestamp);
+
+    encode(entry.log_data, dest_entry.data);
+
+    dest_entries.push_back(dest_entry);
+
+    marker = entry.id;
+  }
+
+  RGWAioCompletionNotifier *cn = stack->create_completion_notifier();
+
+  int ret = mdlog->store_entries_in_shard(sync_env->dpp, dest_entries, shard_id, cn->completion());
+  if (ret < 0) {
+    cn->put(); // drop our ref on the notifier; it will never fire
+    ldpp_dout(sync_env->dpp, 10) << "failed to store md log entries shard_id=" << shard_id << " ret=" << ret << dendl;
+    return set_cr_error(ret);
+  }
+  return io_block(0);
+}
+
+// Local store of mdlog entries finished; mark the coroutine done.
+int RGWCloneMetaLogCoroutine::state_store_mdlog_entries_complete()
+{
+  return set_cr_done();
+}
+
+// Decode sync info from JSON; an unrecognized "status" string leaves
+// 'state' unchanged from its previous value.
+void rgw_meta_sync_info::decode_json(JSONObj *obj)
+{
+  string s;
+  JSONDecoder::decode_json("status", s, obj);
+  if (s == "init") {
+    state = StateInit;
+  } else if (s == "building-full-sync-maps") {
+    state = StateBuildingFullSyncMaps;
+  } else if (s == "sync") {
+    state = StateSync;
+  }
+  JSONDecoder::decode_json("num_shards", num_shards, obj);
+  JSONDecoder::decode_json("period", period, obj);
+  JSONDecoder::decode_json("realm_epoch", realm_epoch, obj);
+}
+
+// Serialize sync info to JSON, mapping the numeric state to the same
+// status strings that decode_json() understands; anything unrecognized
+// is reported as "unknown".
+void rgw_meta_sync_info::dump(Formatter *f) const
+{
+  string s;
+  if (state == StateInit) {
+    s = "init";
+  } else if (state == StateBuildingFullSyncMaps) {
+    s = "building-full-sync-maps";
+  } else if (state == StateSync) {
+    s = "sync";
+  } else {
+    s = "unknown";
+  }
+  encode_json("status", s, f);
+  encode_json("num_shards", num_shards, f);
+  encode_json("period", period, f);
+  encode_json("realm_epoch", realm_epoch, f);
+}
+
+
+// Decode a shard sync marker from JSON. "state" arrives as an int and is
+// narrowed into the member; "timestamp" goes through utime_t before being
+// converted to real_time.
+void rgw_meta_sync_marker::decode_json(JSONObj *obj)
+{
+  int s;
+  JSONDecoder::decode_json("state", s, obj);
+  state = s;
+  JSONDecoder::decode_json("marker", marker, obj);
+  JSONDecoder::decode_json("next_step_marker", next_step_marker, obj);
+  JSONDecoder::decode_json("total_entries", total_entries, obj);
+  JSONDecoder::decode_json("pos", pos, obj);
+  utime_t ut;
+  JSONDecoder::decode_json("timestamp", ut, obj);
+  timestamp = ut.to_real_time();
+  JSONDecoder::decode_json("realm_epoch", realm_epoch, obj);
+}
+
+// Serialize a shard sync marker to JSON; field names mirror decode_json().
+void rgw_meta_sync_marker::dump(Formatter *f) const
+{
+  encode_json("state", (int)state, f);
+  encode_json("marker", marker, f);
+  encode_json("next_step_marker", next_step_marker, f);
+  encode_json("total_entries", total_entries, f);
+  encode_json("pos", pos, f);
+  encode_json("timestamp", utime_t(timestamp), f);
+  encode_json("realm_epoch", realm_epoch, f);
+}
+
+// Decode overall sync status: global info plus per-shard markers.
+void rgw_meta_sync_status::decode_json(JSONObj *obj)
+{
+  JSONDecoder::decode_json("info", sync_info, obj);
+  JSONDecoder::decode_json("markers", sync_markers, obj);
+}
+
+// Serialize overall sync status; mirrors decode_json().
+void rgw_meta_sync_status::dump(Formatter *f) const {
+  encode_json("info", sync_info, f);
+  encode_json("markers", sync_markers, f);
+}
+
+// Serialize one sync error-log record to JSON.
+void rgw_sync_error_info::dump(Formatter *f) const {
+  encode_json("source_zone", source_zone, f);
+  encode_json("error_code", error_code, f);
+  encode_json("message", message, f);
+}
+
diff --git a/src/rgw/driver/rados/rgw_sync.h b/src/rgw/driver/rados/rgw_sync.h
new file mode 100644
index 000000000..e6c255cc6
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_sync.h
@@ -0,0 +1,547 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <atomic>
+
+#include "include/stringify.h"
+
+#include "rgw_coroutine.h"
+#include "rgw_http_client.h"
+#include "rgw_metadata.h"
+#include "rgw_meta_sync_status.h"
+#include "rgw_sal.h"
+#include "rgw_sal_rados.h"
+#include "rgw_sync_trace.h"
+#include "rgw_mdlog.h"
+
+#define ERROR_LOGGER_SHARDS 32
+#define RGW_SYNC_ERROR_LOG_SHARD_PREFIX "sync.error-log"
+
+// Summary of a remote metadata log, filled in from a JSON "log info"
+// reply via decode_json().
+struct rgw_mdlog_info {
+  uint32_t num_shards;
+  std::string period; ///< period id of the master's oldest metadata log
+  epoch_t realm_epoch; ///< realm epoch of oldest metadata log
+
+  rgw_mdlog_info() : num_shards(0), realm_epoch(0) {}
+
+  void decode_json(JSONObj *obj);
+};
+
+
+// One metadata-log entry, either parsed from JSON (decode_json) or
+// converted from its cls_log on-disk form (convert_from).
+struct rgw_mdlog_entry {
+  std::string id;
+  std::string section;
+  std::string name;
+  ceph::real_time timestamp;
+  RGWMetadataLogData log_data;
+
+  void decode_json(JSONObj *obj);
+
+  // Fill this entry from a cls_log_entry; returns false if the embedded
+  // log_data payload fails to decode (entry should then be skipped).
+  bool convert_from(cls_log_entry& le) {
+    id = le.id;
+    section = le.section;
+    name = le.name;
+    timestamp = le.timestamp.to_real_time();
+    try {
+      auto iter = le.data.cbegin();
+      decode(log_data, iter);
+    } catch (buffer::error& err) {
+      return false;
+    }
+    return true;
+  }
+};
+
+// One page of entries from a remote mdlog shard listing.
+struct rgw_mdlog_shard_data {
+  std::string marker;      // resume marker for the next listing request
+  // default-initialize so the flag is never indeterminate if the JSON
+  // reply omits the "truncated" field (NSDMI keeps this an aggregate)
+  bool truncated = false;
+  std::vector<rgw_mdlog_entry> entries;
+
+  void decode_json(JSONObj *obj);
+};
+
+class RGWAsyncRadosProcessor;
+class RGWMetaSyncStatusManager;
+class RGWMetaSyncCR;
+class RGWRESTConn;
+class RGWSyncTraceManager;
+
+// Records sync errors into a set of sharded log objects (oids built
+// from oid_prefix by get_shard_oid()). The atomic counter is presumably
+// used to spread entries across shards — see the .cc for the policy.
+class RGWSyncErrorLogger {
+  rgw::sal::RadosStore* store;
+
+  std::vector<std::string> oids;   // precomputed shard object names
+  int num_shards;
+
+  std::atomic<int64_t> counter = { 0 };
+public:
+  RGWSyncErrorLogger(rgw::sal::RadosStore* _store, const std::string &oid_prefix, int _num_shards);
+  // Returns a coroutine that appends one error record; caller runs it.
+  RGWCoroutine *log_error_cr(const DoutPrefixProvider *dpp, const std::string& source_zone, const std::string& section, const std::string& name, uint32_t error_code, const std::string& message);
+
+  static std::string get_shard_oid(const std::string& oid_prefix, int shard_id);
+};
+
+// A single sync error-log record: originating zone, numeric error code
+// and human-readable message. Encodable (v1) for storage in the error
+// log objects; dump() provides the JSON form.
+struct rgw_sync_error_info {
+  std::string source_zone;
+  uint32_t error_code;
+  std::string message;
+
+  rgw_sync_error_info() : error_code(0) {}
+  rgw_sync_error_info(const std::string& _source_zone, uint32_t _error_code, const std::string& _message) : source_zone(_source_zone), error_code(_error_code), message(_message) {}
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(source_zone, bl);
+    encode(error_code, bl);
+    encode(message, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(source_zone, bl);
+    decode(error_code, bl);
+    decode(message, bl);
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(rgw_sync_error_info)
+
+#define DEFAULT_BACKOFF_MAX 30
+
+// Capped retry-backoff helper. cur_wait grows via update_wait_time()
+// (implementation in the .cc) up to max_secs; reset() starts over.
+class RGWSyncBackoff {
+  int cur_wait;   // current wait interval, seconds
+  int max_secs;   // upper bound on the wait interval
+
+  void update_wait_time();
+public:
+  explicit RGWSyncBackoff(int _max_secs = DEFAULT_BACKOFF_MAX) : cur_wait(0), max_secs(_max_secs) {}
+
+  // wait out the current interval synchronously
+  void backoff_sleep();
+  void reset() {
+    cur_wait = 0;
+  }
+
+  // coroutine-friendly variant: applies the backoff through 'op'
+  void backoff(RGWCoroutine *op);
+};
+
+// Coroutine wrapper that runs a child coroutine (from alloc_cr()) with
+// backoff between attempts; the retry policy lives in operate() in the
+// .cc. With exit_on_error set, a child failure terminates the loop.
+class RGWBackoffControlCR : public RGWCoroutine
+{
+  RGWCoroutine *cr;     // currently managed child (owned reference)
+  ceph::mutex lock;     // guards access to 'cr' — take via cr_lock()
+
+  RGWSyncBackoff backoff;
+  bool reset_backoff;   // child sets this (via backoff_ptr()) to reset
+
+  bool exit_on_error;
+
+protected:
+  // handed to the child so it can request a backoff reset
+  bool *backoff_ptr() {
+    return &reset_backoff;
+  }
+
+  ceph::mutex& cr_lock() {
+    return lock;
+  }
+
+  RGWCoroutine *get_cr() {
+    return cr;
+  }
+
+public:
+  RGWBackoffControlCR(CephContext *_cct, bool _exit_on_error)
+    : RGWCoroutine(_cct),
+      cr(nullptr),
+      lock(ceph::make_mutex("RGWBackoffControlCR::lock:" + stringify(this))),
+      reset_backoff(false), exit_on_error(_exit_on_error) {
+  }
+
+  ~RGWBackoffControlCR() override {
+    if (cr) {
+      cr->put();
+    }
+  }
+
+  // allocate the child coroutine to run under backoff control
+  virtual RGWCoroutine *alloc_cr() = 0;
+  // optional extra coroutine; default none — see operate() for usage
+  virtual RGWCoroutine *alloc_finisher_cr() { return NULL; }
+
+  int operate(const DoutPrefixProvider *dpp) override;
+};
+
+// Shared environment handed to every metadata-sync coroutine: store,
+// remote connection, async RADOS processor, HTTP manager, error logger
+// and sync tracer. Populated once via init().
+struct RGWMetaSyncEnv {
+  const DoutPrefixProvider *dpp;
+  CephContext *cct{nullptr};
+  rgw::sal::RadosStore* store{nullptr};
+  RGWRESTConn *conn{nullptr};
+  RGWAsyncRadosProcessor *async_rados{nullptr};
+  RGWHTTPManager *http_manager{nullptr};
+  RGWSyncErrorLogger *error_logger{nullptr};
+  RGWSyncTraceManager *sync_tracer{nullptr};
+
+  RGWMetaSyncEnv() {}
+
+  void init(const DoutPrefixProvider *_dpp, CephContext *_cct, rgw::sal::RadosStore* _store, RGWRESTConn *_conn,
+            RGWAsyncRadosProcessor *_async_rados, RGWHTTPManager *_http_manager,
+            RGWSyncErrorLogger *_error_logger, RGWSyncTraceManager *_sync_tracer);
+
+  // object name for a shard's sync-status marker (see .cc)
+  std::string shard_obj_name(int shard_id);
+  // object name for the global sync-status record (see .cc)
+  std::string status_oid();
+};
+
+// Drives metadata sync against a remote (master) metadata log: reads
+// remote log/shard info over REST, maintains sync status, and runs the
+// sync state machine (RGWMetaSyncCR). Owns its coroutine manager (base
+// class) and an HTTP manager for REST calls.
+class RGWRemoteMetaLog : public RGWCoroutinesManager {
+  const DoutPrefixProvider *dpp;
+  rgw::sal::RadosStore* store;
+  RGWRESTConn *conn;
+  RGWAsyncRadosProcessor *async_rados;
+
+  RGWHTTPManager http_manager;
+  RGWMetaSyncStatusManager *status_manager;
+  RGWSyncErrorLogger *error_logger{nullptr};
+  RGWSyncTraceManager *sync_tracer{nullptr};
+
+  RGWMetaSyncCR *meta_sync_cr{nullptr};
+
+  RGWSyncBackoff backoff;
+
+  RGWMetaSyncEnv sync_env;
+
+  void init_sync_env(RGWMetaSyncEnv *env);
+  int store_sync_info(const DoutPrefixProvider *dpp, const rgw_meta_sync_info& sync_info);
+
+  std::atomic<bool> going_down = { false };   // set during shutdown
+
+  RGWSyncTraceNodeRef tn;
+
+public:
+  RGWRemoteMetaLog(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* _store,
+                   RGWAsyncRadosProcessor *async_rados,
+                   RGWMetaSyncStatusManager *_sm)
+    : RGWCoroutinesManager(_store->ctx(), _store->getRados()->get_cr_registry()),
+      dpp(dpp), store(_store), conn(NULL), async_rados(async_rados),
+      http_manager(store->ctx(), completion_mgr),
+      status_manager(_sm) {}
+
+  virtual ~RGWRemoteMetaLog() override;
+
+  int init();
+  void finish();
+
+  // REST queries against the remote mdlog
+  int read_log_info(const DoutPrefixProvider *dpp, rgw_mdlog_info *log_info);
+  int read_master_log_shards_info(const DoutPrefixProvider *dpp, const std::string& master_period, std::map<int, RGWMetadataLogInfo> *shards_info);
+  int read_master_log_shards_next(const DoutPrefixProvider *dpp, const std::string& period, std::map<int, std::string> shard_markers, std::map<int, rgw_mdlog_shard_data> *result);
+  // local sync-status management and the sync loop itself
+  int read_sync_status(const DoutPrefixProvider *dpp, rgw_meta_sync_status *sync_status);
+  int init_sync_status(const DoutPrefixProvider *dpp);
+  int run_sync(const DoutPrefixProvider *dpp, optional_yield y);
+
+  void wakeup(int shard_id);
+
+  RGWMetaSyncEnv& get_sync_env() {
+    return sync_env;
+  }
+};
+
+// Public entry point for metadata sync: thin facade over an embedded
+// RGWRemoteMetaLog, forwarding status/info/run/wakeup requests to it.
+// Also serves as the DoutPrefixProvider for the sync machinery.
+class RGWMetaSyncStatusManager : public DoutPrefixProvider {
+  rgw::sal::RadosStore* store;
+  librados::IoCtx ioctx;
+
+  RGWRemoteMetaLog master_log;
+
+  std::map<int, rgw_raw_obj> shard_objs;   // per-shard status objects
+
+  // (timestamp, shard) ordering key — sorts by time, then shard id
+  struct utime_shard {
+    real_time ts;
+    int shard_id;
+
+    utime_shard() : shard_id(-1) {}
+
+    bool operator<(const utime_shard& rhs) const {
+      if (ts == rhs.ts) {
+        return shard_id < rhs.shard_id;
+      }
+      return ts < rhs.ts;
+    }
+  };
+
+  ceph::shared_mutex ts_to_shard_lock = ceph::make_shared_mutex("ts_to_shard_lock");
+  std::map<utime_shard, int> ts_to_shard;
+  std::vector<std::string> clone_markers;
+
+public:
+  RGWMetaSyncStatusManager(rgw::sal::RadosStore* _store, RGWAsyncRadosProcessor *async_rados)
+    : store(_store), master_log(this, store, async_rados, this)
+  {}
+
+  virtual ~RGWMetaSyncStatusManager() override;
+
+  int init(const DoutPrefixProvider *dpp);
+
+  // the following simply delegate to the embedded master_log
+  int read_sync_status(const DoutPrefixProvider *dpp, rgw_meta_sync_status *sync_status) {
+    return master_log.read_sync_status(dpp, sync_status);
+  }
+  int init_sync_status(const DoutPrefixProvider *dpp) { return master_log.init_sync_status(dpp); }
+  int read_log_info(const DoutPrefixProvider *dpp, rgw_mdlog_info *log_info) {
+    return master_log.read_log_info(dpp, log_info);
+  }
+  int read_master_log_shards_info(const DoutPrefixProvider *dpp, const std::string& master_period, std::map<int, RGWMetadataLogInfo> *shards_info) {
+    return master_log.read_master_log_shards_info(dpp, master_period, shards_info);
+  }
+  int read_master_log_shards_next(const DoutPrefixProvider *dpp, const std::string& period, std::map<int, std::string> shard_markers, std::map<int, rgw_mdlog_shard_data> *result) {
+    return master_log.read_master_log_shards_next(dpp, period, shard_markers, result);
+  }
+
+  // run the sync loop (blocks in the coroutine manager)
+  int run(const DoutPrefixProvider *dpp, optional_yield y) { return master_log.run_sync(dpp, y); }
+
+
+  // implements DoutPrefixProvider
+  CephContext *get_cct() const override { return store->ctx(); }
+  unsigned get_subsys() const override;
+  std::ostream& gen_prefix(std::ostream& out) const override;
+
+  void wakeup(int shard_id) { return master_log.wakeup(shard_id); }
+  void stop() {
+    master_log.finish();
+  }
+};
+
+// Interface for a coroutine that serializes calls to other coroutines:
+// concrete subclasses define how queued coroutines are ordered/replaced.
+class RGWOrderCallCR : public RGWCoroutine
+{
+public:
+  RGWOrderCallCR(CephContext *cct) : RGWCoroutine(cct) {}
+
+  // hand '_cr' over to be run under this controller's ordering policy
+  virtual void call_cr(RGWCoroutine *_cr) = 0;
+};
+
+// RGWOrderCallCR policy where each call_cr() replaces any previously
+// queued (not yet executed) coroutine — only the most recent caller's
+// coroutine survives, hence "last caller wins".
+class RGWLastCallerWinsCR : public RGWOrderCallCR
+{
+  RGWCoroutine *cr{nullptr};   // most recently queued coroutine (owned ref)
+
+public:
+  explicit RGWLastCallerWinsCR(CephContext *cct) : RGWOrderCallCR(cct) {}
+  // 'override' added: base hierarchy is polymorphic (virtual dtor)
+  ~RGWLastCallerWinsCR() override {
+    if (cr) {
+      cr->put();
+    }
+  }
+
+  int operate(const DoutPrefixProvider *dpp) override;
+
+  // drop any pending coroutine and queue '_cr' in its place
+  void call_cr(RGWCoroutine *_cr) override {
+    if (cr) {
+      cr->put();
+    }
+    cr = _cr;
+  }
+};
+
+// Tracks in-flight sync positions for one shard so the persisted marker
+// only ever advances past fully-completed work. Entries are start()ed,
+// finish()ed out of order, and flush() periodically stores the highest
+// marker with no incomplete entry before it. T is the marker type, K is
+// the retry-set key type.
+template <class T, class K>
+class RGWSyncShardMarkerTrack {
+  struct marker_entry {
+    uint64_t pos;
+    real_time timestamp;
+
+    marker_entry() : pos(0) {}
+    marker_entry(uint64_t _p, const real_time& _ts) : pos(_p), timestamp(_ts) {}
+  };
+  typename std::map<T, marker_entry> pending;       // started, not finished
+
+  std::map<T, marker_entry> finish_markers;         // finished, not yet flushed
+
+  int window_size;           // flush after this many completions
+  int updates_since_flush;
+
+  RGWOrderCallCR *order_cr{nullptr};   // serializes store_marker() calls
+
+protected:
+  typename std::set<K> need_retry_set;
+
+  // persist 'new_marker' (subclass-specific); returned CR is run by caller
+  virtual RGWCoroutine *store_marker(const T& new_marker, uint64_t index_pos, const real_time& timestamp) = 0;
+  virtual RGWOrderCallCR *allocate_order_control_cr() = 0;
+  virtual void handle_finish(const T& marker) { }
+
+public:
+  RGWSyncShardMarkerTrack(int _window_size) : window_size(_window_size), updates_since_flush(0) {}
+  virtual ~RGWSyncShardMarkerTrack() {
+    if (order_cr) {
+      order_cr->put();
+    }
+  }
+
+  // Register a marker as in-flight. Returns false if it is already
+  // pending (duplicate), in which case the caller must not process it.
+  bool start(const T& pos, int index_pos, const real_time& timestamp) {
+    if (pending.find(pos) != pending.end()) {
+      return false;
+    }
+    pending[pos] = marker_entry(index_pos, timestamp);
+    return true;
+  }
+
+  // Record a completed position directly, bypassing the pending set.
+  void try_update_high_marker(const T& pos, int index_pos, const real_time& timestamp) {
+    finish_markers[pos] = marker_entry(index_pos, timestamp);
+  }
+
+  // Mark 'pos' complete. May return a flush coroutine (run it if non-null)
+  // when the completed entry was the lowest pending one and the window
+  // is full or nothing is left pending.
+  RGWCoroutine *finish(const T& pos) {
+    if (pending.empty()) {
+      /* can happen, due to a bug that ended up with multiple objects with the same name and version
+       * -- which can happen when versioning is enabled an the version is 'null'.
+       */
+      return NULL;
+    }
+
+    typename std::map<T, marker_entry>::iterator iter = pending.begin();
+
+    bool is_first = (pos == iter->first);
+
+    typename std::map<T, marker_entry>::iterator pos_iter = pending.find(pos);
+    if (pos_iter == pending.end()) {
+      /* see pending.empty() comment */
+      return NULL;
+    }
+
+    finish_markers[pos] = pos_iter->second;
+
+    pending.erase(pos);
+
+    handle_finish(pos);
+
+    updates_since_flush++;
+
+    if (is_first && (updates_since_flush >= window_size || pending.empty())) {
+      return flush();
+    }
+    return NULL;
+  }
+
+  // Persist the highest safe marker: the greatest finished position that
+  // is still below the lowest pending one. Returns NULL if there is
+  // nothing safe to store yet.
+  RGWCoroutine *flush() {
+    if (finish_markers.empty()) {
+      return NULL;
+    }
+
+    typename std::map<T, marker_entry>::iterator i;
+
+    if (pending.empty()) {
+      i = finish_markers.end();
+    } else {
+      i = finish_markers.lower_bound(pending.begin()->first);
+    }
+    if (i == finish_markers.begin()) {
+      return NULL;
+    }
+    updates_since_flush = 0;
+
+    auto last = i;
+    --i;
+    const T& high_marker = i->first;
+    marker_entry& high_entry = i->second;
+    RGWCoroutine *cr = order(store_marker(high_marker, high_entry.pos, high_entry.timestamp));
+    finish_markers.erase(finish_markers.begin(), last);
+    return cr;
+  }
+
+  /*
+   * a key needs retry if it was processing when another marker that points
+   * to the same bucket shards arrives. Instead of processing it, we mark
+   * it as need_retry so that when we finish processing the original, we
+   * retry the processing on the same bucket shard, in case there are more
+   * entries to process. This closes a race that can happen.
+   */
+  bool need_retry(const K& key) {
+    return (need_retry_set.find(key) != need_retry_set.end());
+  }
+
+  void set_need_retry(const K& key) {
+    need_retry_set.insert(key);
+  }
+
+  void reset_need_retry(const K& key) {
+    need_retry_set.erase(key);
+  }
+
+  // Route 'cr' through the order-control coroutine so marker stores are
+  // serialized. Returns the controller if the caller must spawn it, or
+  // nullptr if an already-running controller absorbed the call.
+  RGWCoroutine *order(RGWCoroutine *cr) {
+    /* either returns a new RGWLastWriteWinsCR, or update existing one, in which case it returns
+     * nothing and the existing one will call the cr
+     */
+    if (order_cr && order_cr->is_done()) {
+      order_cr->put();
+      order_cr = nullptr;
+    }
+    if (!order_cr) {
+      order_cr = allocate_order_control_cr();
+      order_cr->get();
+      order_cr->call_cr(cr);
+      return order_cr;
+    }
+    order_cr->call_cr(cr);
+    return nullptr; /* don't call it a second time */
+  }
+};
+
+class RGWMetaSyncShardMarkerTrack;
+
+// Coroutine that syncs a single metadata-log entry identified by
+// raw_key (split into section/key — see operate() in the .cc), tracking
+// its completion through the shard's marker tracker.
+class RGWMetaSyncSingleEntryCR : public RGWCoroutine {
+  RGWMetaSyncEnv *sync_env;
+
+  std::string raw_key;        // combined "section:key" identifier
+  std::string entry_marker;   // position of this entry in the log
+  RGWMDLogStatus op_status;
+
+  ssize_t pos;                // split point within raw_key
+  std::string section;
+  std::string key;
+
+  int sync_status;
+
+  bufferlist md_bl;
+
+  RGWMetaSyncShardMarkerTrack *marker_tracker;
+
+  int tries;                  // retry counter — policy in operate()
+
+  bool error_injection;       // test hook, see .cc
+
+  RGWSyncTraceNodeRef tn;
+
+public:
+  RGWMetaSyncSingleEntryCR(RGWMetaSyncEnv *_sync_env,
+                           const std::string& _raw_key, const std::string& _entry_marker,
+                           const RGWMDLogStatus& _op_status,
+                           RGWMetaSyncShardMarkerTrack *_marker_tracker, const RGWSyncTraceNodeRef& _tn_parent);
+
+  int operate(const DoutPrefixProvider *dpp) override;
+};
+
+// Runs a subclass-provided stream of child coroutines (spawn_next())
+// with at most max_concurrent in flight, collecting per-child results
+// through handle_result().
+class RGWShardCollectCR : public RGWCoroutine {
+  int current_running = 0;
+ protected:
+  int max_concurrent;
+  int status = 0;   // last unignored child error, reported at the end
+
+  // called with the result of each child. error codes can be ignored by
+  // returning 0. if handle_result() returns a negative value, it's
+  // treated as an error and stored in 'status'. the last such error is
+  // reported to the caller with set_cr_error()
+  virtual int handle_result(int r) = 0;
+ public:
+  RGWShardCollectCR(CephContext *_cct, int _max_concurrent)
+    : RGWCoroutine(_cct), max_concurrent(_max_concurrent)
+  {}
+
+  // spawn the next child; return false when there is no more work
+  virtual bool spawn_next() = 0;
+  int operate(const DoutPrefixProvider *dpp) override;
+};
+
+// factory functions for meta sync coroutines needed in mdlog trimming
+
+RGWCoroutine* create_read_remote_mdlog_shard_info_cr(RGWMetaSyncEnv *env,
+ const std::string& period,
+ int shard_id,
+ RGWMetadataLogInfo* info);
+
+RGWCoroutine* create_list_remote_mdlog_shard_cr(RGWMetaSyncEnv *env,
+ const std::string& period,
+ int shard_id,
+ const std::string& marker,
+ uint32_t max_entries,
+ rgw_mdlog_shard_data *result);
+
diff --git a/src/rgw/driver/rados/rgw_sync_counters.cc b/src/rgw/driver/rados/rgw_sync_counters.cc
new file mode 100644
index 000000000..1d23d58dc
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_sync_counters.cc
@@ -0,0 +1,28 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "common/ceph_context.h"
+#include "rgw_sync_counters.h"
+
+namespace sync_counters {
+
+// Build a perf-counters instance (named 'name') for one sync source and
+// add it to the context's perf-counters collection. Counters use
+// PRIO_USEFUL so they are exported to ceph-mgr.
+PerfCountersRef build(CephContext *cct, const std::string& name)
+{
+  PerfCountersBuilder b(cct, name, l_first, l_last);
+
+  // share these counters with ceph-mgr
+  b.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
+
+  b.add_u64_avg(l_fetch, "fetch_bytes", "Number of object bytes replicated");
+  b.add_u64_counter(l_fetch_not_modified, "fetch_not_modified", "Number of objects already replicated");
+  b.add_u64_counter(l_fetch_err, "fetch_errors", "Number of object replication errors");
+
+  b.add_time_avg(l_poll, "poll_latency", "Average latency of replication log requests");
+  b.add_u64_counter(l_poll_err, "poll_errors", "Number of replication log request errors");
+
+  auto logger = PerfCountersRef{ b.create_perf_counters(), cct };
+  cct->get_perfcounters_collection()->add(logger.get());
+  return logger;
+}
+
+} // namespace sync_counters
diff --git a/src/rgw/driver/rados/rgw_sync_counters.h b/src/rgw/driver/rados/rgw_sync_counters.h
new file mode 100644
index 000000000..df3acc680
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_sync_counters.h
@@ -0,0 +1,25 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "common/perf_counters_collection.h"
+
+namespace sync_counters {
+
+enum {
+ l_first = 805000,
+
+ l_fetch,
+ l_fetch_not_modified,
+ l_fetch_err,
+
+ l_poll,
+ l_poll_err,
+
+ l_last,
+};
+
+PerfCountersRef build(CephContext *cct, const std::string& name);
+
+} // namespace sync_counters
diff --git a/src/rgw/driver/rados/rgw_sync_error_repo.cc b/src/rgw/driver/rados/rgw_sync_error_repo.cc
new file mode 100644
index 000000000..44305b60b
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_sync_error_repo.cc
@@ -0,0 +1,205 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#include "rgw_sync_error_repo.h"
+#include "rgw_coroutine.h"
+#include "rgw_sal.h"
+#include "services/svc_rados.h"
+#include "cls/cmpomap/client.h"
+
+namespace rgw::error_repo {
+
+// prefix for the binary encoding of keys. this particular value is not
+// valid as the first byte of a utf8 code point, so we use this to
+// differentiate the binary encoding from existing string keys for
+// backward-compatibility
+constexpr uint8_t binary_key_prefix = 0x80;
+
+// Internal binary key payload: bucket shard plus optional generation.
+struct key_type {
+  rgw_bucket_shard bs;
+  std::optional<uint64_t> gen;
+};
+
+// v1 encoding of key_type (bucket shard, then optional gen).
+void encode(const key_type& k, bufferlist& bl, uint64_t f=0)
+{
+  ENCODE_START(1, 1, bl);
+  encode(k.bs, bl);
+  encode(k.gen, bl);
+  ENCODE_FINISH(bl);
+}
+
+// inverse of encode() above
+void decode(key_type& k, bufferlist::const_iterator& bl)
+{
+  DECODE_START(1, bl);
+  decode(k.bs, bl);
+  decode(k.gen, bl);
+  DECODE_FINISH(bl);
+}
+
+// Binary-encode a bucket/shard/gen as an omap key string. The leading
+// binary_key_prefix byte distinguishes this format from legacy plain
+// string keys (it is not a valid first byte of a UTF-8 code point).
+std::string encode_key(const rgw_bucket_shard& bs,
+                       std::optional<uint64_t> gen)
+{
+  using ceph::encode;
+  const auto key = key_type{bs, gen};
+  bufferlist bl;
+  encode(binary_key_prefix, bl);
+  encode(key, bl);
+  return bl.to_str();
+}
+
+// Decode a key produced by encode_key(). Returns -EINVAL if the prefix
+// byte is missing (i.e. a legacy string key), -EIO on a short/corrupt
+// or over-long buffer, 0 on success.
+// NOTE(review): 'encoded' is taken by value — presumably so
+// static_from_string() can reference stable local storage; confirm
+// before changing to a reference.
+int decode_key(std::string encoded,
+               rgw_bucket_shard& bs,
+               std::optional<uint64_t>& gen)
+{
+  using ceph::decode;
+  key_type key;
+  const auto bl = bufferlist::static_from_string(encoded);
+  auto p = bl.cbegin();
+  try {
+    uint8_t prefix;
+    decode(prefix, p);
+    if (prefix != binary_key_prefix) {
+      return -EINVAL;
+    }
+    decode(key, p);
+  } catch (const buffer::error&) {
+    return -EIO;
+  }
+  if (!p.end()) {
+    return -EIO; // buffer contained unexpected bytes
+  }
+  bs = std::move(key.bs);
+  gen = key.gen;
+  return 0;
+}
+
+// Decode an omap value written by write() back into a timestamp.
+// An empty or undecodable buffer is treated as timestamp zero.
+ceph::real_time decode_value(const bufferlist& bl)
+{
+  uint64_t value;
+  try {
+    using ceph::decode;
+    decode(value, bl);
+  } catch (const buffer::error&) {
+    value = 0; // empty buffer = 0
+  }
+  return ceph::real_clock::zero() + ceph::timespan(value);
+}
+
+// Add to 'op' a conditional omap write: set key -> timestamp only if
+// the stored timestamp is older (missing keys compare as 0, so they
+// are always written).
+int write(librados::ObjectWriteOperation& op,
+          const std::string& key,
+          ceph::real_time timestamp)
+{
+  // overwrite the existing timestamp if value is greater
+  const uint64_t value = timestamp.time_since_epoch().count();
+  using namespace ::cls::cmpomap;
+  const bufferlist zero = u64_buffer(0); // compare against 0 for missing keys
+  return cmp_set_vals(op, Mode::U64, Op::GT, {{key, u64_buffer(value)}}, zero);
+}
+
+// Add to 'op' a conditional omap removal: delete the key only if the
+// given timestamp is >= the stored one (i.e. no newer error has been
+// recorded since).
+int remove(librados::ObjectWriteOperation& op,
+           const std::string& key,
+           ceph::real_time timestamp)
+{
+  // remove the omap key if value >= existing
+  const uint64_t value = timestamp.time_since_epoch().count();
+  using namespace ::cls::cmpomap;
+  return cmp_rm_keys(op, Mode::U64, Op::GTE, {{key, u64_buffer(value)}});
+}
+
+// Coroutine wrapper around write(): asynchronously records one error
+// key/timestamp in the error-repo object via aio_operate.
+class RGWErrorRepoWriteCR : public RGWSimpleCoroutine {
+  RGWSI_RADOS::Obj obj;
+  std::string key;
+  ceph::real_time timestamp;
+
+  boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+ public:
+  RGWErrorRepoWriteCR(RGWSI_RADOS* rados, const rgw_raw_obj& raw_obj,
+                      const std::string& key, ceph::real_time timestamp)
+    : RGWSimpleCoroutine(rados->ctx()),
+      obj(rados->obj(raw_obj)),
+      key(key), timestamp(timestamp)
+  {}
+
+  // issue the conditional omap write; returns <0 on setup failure
+  int send_request(const DoutPrefixProvider *dpp) override {
+    librados::ObjectWriteOperation op;
+    int r = write(op, key, timestamp);
+    if (r < 0) {
+      return r;
+    }
+    r = obj.open(dpp);
+    if (r < 0) {
+      return r;
+    }
+
+    cn = stack->create_completion_notifier();
+    return obj.aio_operate(cn->completion(), &op);
+  }
+
+  // result of the async operation
+  int request_complete() override {
+    return cn->completion()->get_return_value();
+  }
+};
+
+// Factory for the error-repo write coroutine (caller owns the result).
+RGWCoroutine* write_cr(RGWSI_RADOS* rados,
+                       const rgw_raw_obj& obj,
+                       const std::string& key,
+                       ceph::real_time timestamp)
+{
+  return new RGWErrorRepoWriteCR(rados, obj, key, timestamp);
+}
+
+
+// Coroutine wrapper around remove(): asynchronously clears one error
+// key from the error-repo object, unless a newer timestamp was stored.
+class RGWErrorRepoRemoveCR : public RGWSimpleCoroutine {
+  RGWSI_RADOS::Obj obj;
+  std::string key;
+  ceph::real_time timestamp;
+
+  boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+ public:
+  RGWErrorRepoRemoveCR(RGWSI_RADOS* rados, const rgw_raw_obj& raw_obj,
+                       const std::string& key, ceph::real_time timestamp)
+    : RGWSimpleCoroutine(rados->ctx()),
+      obj(rados->obj(raw_obj)),
+      key(key), timestamp(timestamp)
+  {}
+
+  // issue the conditional omap removal; returns <0 on setup failure
+  int send_request(const DoutPrefixProvider *dpp) override {
+    librados::ObjectWriteOperation op;
+    int r = remove(op, key, timestamp);
+    if (r < 0) {
+      return r;
+    }
+    r = obj.open(dpp);
+    if (r < 0) {
+      return r;
+    }
+
+    cn = stack->create_completion_notifier();
+    return obj.aio_operate(cn->completion(), &op);
+  }
+
+  // result of the async operation
+  int request_complete() override {
+    return cn->completion()->get_return_value();
+  }
+};
+
+// Factory for the error-repo remove coroutine (caller owns the result).
+RGWCoroutine* remove_cr(RGWSI_RADOS* rados,
+                        const rgw_raw_obj& obj,
+                        const std::string& key,
+                        ceph::real_time timestamp)
+{
+  return new RGWErrorRepoRemoveCR(rados, obj, key, timestamp);
+}
+
+} // namespace rgw::error_repo
diff --git a/src/rgw/driver/rados/rgw_sync_error_repo.h b/src/rgw/driver/rados/rgw_sync_error_repo.h
new file mode 100644
index 000000000..60525d281
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_sync_error_repo.h
@@ -0,0 +1,59 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#pragma once
+
+#include <optional>
+#include "include/rados/librados_fwd.hpp"
+#include "include/buffer_fwd.h"
+#include "common/ceph_time.h"
+
+class RGWSI_RADOS;
+class RGWCoroutine;
+struct rgw_raw_obj;
+struct rgw_bucket_shard;
+
+namespace rgw::error_repo {
+
+// binary-encode a bucket/shard/gen and return it as a string
+std::string encode_key(const rgw_bucket_shard& bs,
+ std::optional<uint64_t> gen);
+
+// try to decode a key. returns -EINVAL if not in binary format
+int decode_key(std::string encoded,
+ rgw_bucket_shard& bs,
+ std::optional<uint64_t>& gen);
+
+// decode a timestamp as a uint64_t for CMPXATTR_MODE_U64
+ceph::real_time decode_value(const ceph::bufferlist& bl);
+
+// write an omap key iff the given timestamp is newer
+int write(librados::ObjectWriteOperation& op,
+ const std::string& key,
+ ceph::real_time timestamp);
+RGWCoroutine* write_cr(RGWSI_RADOS* rados,
+ const rgw_raw_obj& obj,
+ const std::string& key,
+ ceph::real_time timestamp);
+
+// remove an omap key iff there isn't a newer timestamp
+int remove(librados::ObjectWriteOperation& op,
+ const std::string& key,
+ ceph::real_time timestamp);
+RGWCoroutine* remove_cr(RGWSI_RADOS* rados,
+ const rgw_raw_obj& obj,
+ const std::string& key,
+ ceph::real_time timestamp);
+
+} // namespace rgw::error_repo
diff --git a/src/rgw/driver/rados/rgw_sync_module.cc b/src/rgw/driver/rados/rgw_sync_module.cc
new file mode 100644
index 000000000..5a1e70be3
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_sync_module.cc
@@ -0,0 +1,87 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_common.h"
+#include "rgw_coroutine.h"
+#include "rgw_cr_rados.h"
+#include "rgw_sync_module.h"
+#include "rgw_data_sync.h"
+#include "rgw_bucket.h"
+
+#include "rgw_sync_module_log.h"
+#include "rgw_sync_module_es.h"
+#include "rgw_sync_module_aws.h"
+
+#include <boost/asio/yield.hpp>
+
+#define dout_subsys ceph_subsys_rgw
+
+// Default bucket metadata handler; modules override to customize.
+RGWMetadataHandler *RGWSyncModuleInstance::alloc_bucket_meta_handler()
+{
+  return RGWBucketMetaHandlerAllocator::alloc();
+}
+
+// Default bucket-instance metadata handler; modules override to customize.
+RGWBucketInstanceMetadataHandlerBase* RGWSyncModuleInstance::alloc_bucket_instance_meta_handler(rgw::sal::Driver* driver)
+{
+  return RGWBucketInstanceMetaHandlerAllocator::alloc(driver);
+}
+
+// Callback-coroutine base ctor: captures sync context and target object.
+RGWStatRemoteObjCBCR::RGWStatRemoteObjCBCR(RGWDataSyncCtx *_sc,
+                      rgw_bucket& _src_bucket, rgw_obj_key& _key) : RGWCoroutine(_sc->cct),
+                                                          sc(_sc), sync_env(_sc->env),
+                                                          src_bucket(_src_bucket), key(_key) {
+}
+
+// Driver-coroutine ctor: captures sync context and the object to stat.
+RGWCallStatRemoteObjCR::RGWCallStatRemoteObjCR(RGWDataSyncCtx *_sc,
+                     rgw_bucket& _src_bucket, rgw_obj_key& _key) : RGWCoroutine(_sc->cct),
+                                                         sc(_sc), sync_env(_sc->env),
+                                                         src_bucket(_src_bucket), key(_key) {
+}
+
+// Two-step coroutine: stat the remote object, then hand the result
+// (mtime/size/etag/attrs/headers) to the subclass-provided callback CR.
+int RGWCallStatRemoteObjCR::operate(const DoutPrefixProvider *dpp) {
+  reenter(this) {
+    yield {
+      // step 1: fetch remote object metadata from the source zone
+      call(new RGWStatRemoteObjCR(sync_env->async_rados, sync_env->driver,
+                                  sc->source_zone,
+                                  src_bucket, key, &mtime, &size, &etag, &attrs, &headers));
+    }
+    if (retcode < 0) {
+      ldpp_dout(dpp, 10) << "RGWStatRemoteObjCR() returned " << retcode << dendl;
+      return set_cr_error(retcode);
+    }
+    ldpp_dout(dpp, 20) << "stat of remote obj: z=" << sc->source_zone
+                       << " b=" << src_bucket << " k=" << key
+                       << " size=" << size << " mtime=" << mtime << dendl;
+    yield {
+      // step 2: run the callback with the collected result (a null
+      // callback means the subclass has nothing to do)
+      RGWStatRemoteObjCBCR *cb = allocate_callback();
+      if (cb) {
+        cb->set_result(mtime, size, etag, std::move(attrs), std::move(headers));
+        call(cb);
+      }
+    }
+    if (retcode < 0) {
+      ldpp_dout(dpp, 10) << "RGWStatRemoteObjCR() callback returned " << retcode << dendl;
+      return set_cr_error(retcode);
+    }
+    return set_cr_done();
+  }
+  return 0;
+}
+
+// Instantiate and register every built-in sync module with the manager.
+// The "rgw" module is marked as the default (used when no module name
+// is configured).
+void rgw_register_sync_modules(RGWSyncModulesManager *modules_manager)
+{
+  RGWSyncModuleRef default_module = std::make_shared<RGWDefaultSyncModule>();
+  modules_manager->register_module("rgw", default_module, true);
+
+  RGWSyncModuleRef archive_module = std::make_shared<RGWArchiveSyncModule>();
+  modules_manager->register_module("archive", archive_module);
+
+  RGWSyncModuleRef log_module = std::make_shared<RGWLogSyncModule>();
+  modules_manager->register_module("log", log_module);
+
+  RGWSyncModuleRef es_module = std::make_shared<RGWElasticSyncModule>();
+  modules_manager->register_module("elasticsearch", es_module);
+
+  RGWSyncModuleRef aws_module = std::make_shared<RGWAWSSyncModule>();
+  modules_manager->register_module("cloud", aws_module);
+}
diff --git a/src/rgw/driver/rados/rgw_sync_module.h b/src/rgw/driver/rados/rgw_sync_module.h
new file mode 100644
index 000000000..38abb3d1a
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_sync_module.h
@@ -0,0 +1,203 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_common.h"
+#include "rgw_coroutine.h"
+
+class RGWBucketInfo;
+class RGWRemoteDataLog;
+struct RGWDataSyncCtx;
+struct RGWDataSyncEnv;
+struct rgw_bucket_entry_owner;
+struct rgw_obj_key;
+struct rgw_bucket_sync_pipe;
+
+
+// Per-module hooks invoked by the data-sync machinery for each object
+// event; subclasses return coroutines that perform the module-specific
+// work (or nullptr where no work is needed).
+class RGWDataSyncModule {
+public:
+  RGWDataSyncModule() {}
+  virtual ~RGWDataSyncModule() {}
+
+  virtual void init(RGWDataSyncCtx *sync_env, uint64_t instance_id) {}
+
+  // one-time initialization work, run before syncing starts
+  virtual RGWCoroutine *init_sync(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc) {
+    return nullptr;
+  }
+
+  virtual RGWCoroutine *start_sync(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc) {
+    return nullptr;
+  }
+  // replicate one object version
+  virtual RGWCoroutine *sync_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc,
+                                    rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key,
+                                    std::optional<uint64_t> versioned_epoch,
+                                    const rgw_zone_set_entry& my_trace_entry,
+                                    rgw_zone_set *zones_trace) = 0;
+  // propagate an object removal
+  virtual RGWCoroutine *remove_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& bucket_info, rgw_obj_key& key, real_time& mtime,
+                                      bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) = 0;
+  // propagate a versioned-delete marker
+  virtual RGWCoroutine *create_delete_marker(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& bucket_info, rgw_obj_key& key, real_time& mtime,
+                                             rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) = 0;
+};
+
+class RGWRESTMgr;
+class RGWMetadataHandler;
+class RGWBucketInstanceMetadataHandlerBase;
+
+// A configured instance of a sync module: supplies the data handler and
+// optional REST/metadata customization points.
+class RGWSyncModuleInstance {
+public:
+  RGWSyncModuleInstance() {}
+  virtual ~RGWSyncModuleInstance() {}
+  virtual RGWDataSyncModule *get_data_handler() = 0;
+  // hook to wrap/replace the REST manager for a given dialect; the
+  // default passes the original through unchanged
+  virtual RGWRESTMgr *get_rest_filter(int dialect, RGWRESTMgr *orig) {
+    return orig;
+  }
+  virtual bool supports_user_writes() {
+    return false;
+  }
+  virtual RGWMetadataHandler *alloc_bucket_meta_handler();
+  virtual RGWBucketInstanceMetadataHandlerBase *alloc_bucket_instance_meta_handler(rgw::sal::Driver* driver);
+
+  // indication whether the sync module start with full sync (default behavior)
+  // incremental sync would follow anyway
+  virtual bool should_full_sync() const {
+    return true;
+  }
+};
+
+typedef std::shared_ptr<RGWSyncModuleInstance> RGWSyncModuleInstanceRef;
+
+class JSONFormattable;
+
+// Factory for sync module instances; one RGWSyncModule exists per
+// registered module type, instances are created per configuration.
+class RGWSyncModule {
+
+public:
+  RGWSyncModule() {}
+  virtual ~RGWSyncModule() {}
+
+  virtual bool supports_writes() {
+    return false;
+  }
+  virtual bool supports_data_export() = 0;
+  // build an instance from its JSON configuration
+  virtual int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) = 0;
+};
+
+typedef std::shared_ptr<RGWSyncModule> RGWSyncModuleRef;
+
+
+// Thread-safe registry of sync modules keyed by name; the empty-string
+// key aliases the default module.
+class RGWSyncModulesManager {
+  // mutable so const accessors (get_registered_module_names) can lock too
+  mutable ceph::mutex lock = ceph::make_mutex("RGWSyncModulesManager");
+
+  std::map<std::string, RGWSyncModuleRef> modules;
+public:
+  RGWSyncModulesManager() = default;
+
+  // Register 'module' under 'name'; with is_default it also becomes the
+  // module returned for an empty name.
+  void register_module(const std::string& name, RGWSyncModuleRef& module, bool is_default = false) {
+    std::lock_guard l{lock};
+    modules[name] = module;
+    if (is_default) {
+      modules[std::string()] = module;
+    }
+  }
+
+  // Look up a module by name; returns false if not registered. 'module'
+  // may be nullptr for a pure existence check.
+  bool get_module(const std::string& name, RGWSyncModuleRef *module) {
+    std::lock_guard l{lock};
+    auto iter = modules.find(name);
+    if (iter == modules.end()) {
+      return false;
+    }
+    if (module != nullptr) {
+      *module = iter->second;
+    }
+    return true;
+  }
+
+
+  // Whether the named module can act as a data-export source; unknown
+  // modules report false.
+  bool supports_data_export(const std::string& name) {
+    RGWSyncModuleRef module;
+    if (!get_module(name, &module)) {
+      return false;
+    }
+
+    return module->supports_data_export();
+  }
+
+  // Instantiate the named module from 'config'; -ENOENT if unknown.
+  int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const std::string& name, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) {
+    RGWSyncModuleRef module;
+    if (!get_module(name, &module)) {
+      return -ENOENT;
+    }
+
+    return module->create_instance(dpp, cct, config, instance);
+  }
+
+  // List registered module names (the default-alias empty key is
+  // skipped). Takes the lock: listing may race with register_module().
+  std::vector<std::string> get_registered_module_names() const {
+    std::lock_guard l{lock};
+    std::vector<std::string> names;
+    for (auto& i: modules) {
+      if (!i.first.empty()) {
+        names.push_back(i.first);
+      }
+    }
+    return names;
+  }
+};
+
+// Base class for the callback stage of RGWCallStatRemoteObjCR: the
+// driver fills in the remote object's stat result via set_result()
+// before running the subclass's operate().
+class RGWStatRemoteObjCBCR : public RGWCoroutine {
+protected:
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+
+  rgw_bucket src_bucket;
+  rgw_obj_key key;
+
+  // result of the remote stat, populated by set_result()
+  ceph::real_time mtime;
+  uint64_t size = 0;
+  std::string etag;
+  std::map<std::string, bufferlist> attrs;
+  std::map<std::string, std::string> headers;
+public:
+  RGWStatRemoteObjCBCR(RGWDataSyncCtx *_sc,
+                       rgw_bucket& _src_bucket, rgw_obj_key& _key);
+  ~RGWStatRemoteObjCBCR() override {}
+
+  // store the stat result; attrs/headers are moved in
+  void set_result(ceph::real_time& _mtime,
+                  uint64_t _size,
+                  const std::string& _etag,
+                  std::map<std::string, bufferlist>&& _attrs,
+                  std::map<std::string, std::string>&& _headers) {
+    mtime = _mtime;
+    size = _size;
+    etag = _etag;
+    attrs = std::move(_attrs);
+    headers = std::move(_headers);
+  }
+};
+
+// Driver coroutine: stats a remote object and forwards the result to
+// the callback CR produced by allocate_callback() (see operate() in
+// rgw_sync_module.cc). A null callback skips the second stage.
+class RGWCallStatRemoteObjCR : public RGWCoroutine {
+  // stat result, passed on to the callback
+  ceph::real_time mtime;
+  uint64_t size{0};
+  std::string etag;
+  std::map<std::string, bufferlist> attrs;
+  std::map<std::string, std::string> headers;
+
+protected:
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+
+  rgw_bucket src_bucket;
+  rgw_obj_key key;
+
+public:
+  RGWCallStatRemoteObjCR(RGWDataSyncCtx *_sc,
+                         rgw_bucket& _src_bucket, rgw_obj_key& _key);
+
+  ~RGWCallStatRemoteObjCR() override {}
+
+  int operate(const DoutPrefixProvider *dpp) override;
+
+  // subclasses return the CR that consumes the stat result
+  virtual RGWStatRemoteObjCBCR *allocate_callback() {
+    return nullptr;
+  }
+};
+
+void rgw_register_sync_modules(RGWSyncModulesManager *modules_manager);
diff --git a/src/rgw/driver/rados/rgw_sync_module_aws.cc b/src/rgw/driver/rados/rgw_sync_module_aws.cc
new file mode 100644
index 000000000..cefcd9dd1
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_sync_module_aws.cc
@@ -0,0 +1,1823 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "common/errno.h"
+
+#include "rgw_common.h"
+#include "rgw_coroutine.h"
+#include "rgw_sync_module.h"
+#include "rgw_data_sync.h"
+#include "rgw_sync_module_aws.h"
+#include "rgw_cr_rados.h"
+#include "rgw_rest_conn.h"
+#include "rgw_cr_rest.h"
+#include "rgw_acl.h"
+#include "rgw_zone.h"
+
+#include "services/svc_zone.h"
+
+#include <boost/asio/yield.hpp>
+
+#define dout_subsys ceph_subsys_rgw
+
+
+#define DEFAULT_MULTIPART_SYNC_PART_SIZE (32 * 1024 * 1024)
+
+using namespace std;
+
+// Fallback target path template, used when neither a profile nor the
+// "default" config section supplies "target_path".
+static string default_target_path = "rgw-${zonegroup}-${sid}/${bucket}";
+
+// Return the oid form of an object key: "<name>", or "<name>:<instance>"
+// for versioned objects (a null instance is omitted).
+static string get_key_oid(const rgw_obj_key& key)
+{
+  string oid = key.name;
+  if (!key.instance.empty() &&
+      !key.have_null_instance()) {
+    oid += string(":") + key.instance;
+  }
+  return oid;
+}
+
+// Map a local rgw_obj to the path used on the AWS endpoint:
+// "<bucket>/<key-oid>".
+static string obj_to_aws_path(const rgw_obj& obj)
+{
+  return obj.bucket.name + "/" + get_key_oid(obj.key);
+}
+
+/*
+
+ json configuration definition:
+
+ {
+ "connection": {
+ "access_key": <access>,
+ "secret": <secret>,
+ "endpoint": <endpoint>,
+ "host_style": <path | virtual>,
+ },
+ "acls": [ { "type": <id | email | uri>,
+ "source_id": <source_id>,
+ "dest_id": <dest_id> } ... ], # optional, acl mappings, no mappings if does not exist
+ "target_path": <target_path>, # override default
+
+
+ # anything below here is for non trivial configuration
+  # can be used in conjunction with the above
+
+ "default": {
+ "connection": {
+ "access_key": <access>,
+ "secret": <secret>,
+ "endpoint": <endpoint>,
+          "host_style": <path | virtual>,
+ },
+ "acls": [ # list of source uids and how they map into destination uids in the dest objects acls
+ {
+ "type" : <id | email | uri>, # optional, default is id
+ "source_id": <id>,
+ "dest_id": <id>
+ } ... ]
+ "target_path": "rgwx-${sid}/${bucket}" # how a bucket name is mapped to destination path,
+ # final object name will be target_path + "/" + obj
+ },
+ "connections": [
+ {
+ "id": <id>,
+ "access_key": <access>,
+ "secret": <secret>,
+ "endpoint": <endpoint>,
+ } ... ],
+ "acl_profiles": [
+ {
+ "id": <id>, # acl mappings
+ "acls": [ {
+ "type": <id | email | uri>,
+ "source_id": <id>,
+ "dest_id": <id>
+ } ... ]
+ }
+ ],
+ "profiles": [
+ {
+ "source_bucket": <source>, # can specify either specific bucket name (foo), or prefix (foo*)
+ "target_path": <dest>, # (override default)
+ "connection_id": <connection_id>, # optional, if empty references default connection
+ "acls_id": <mappings_id>, # optional, if empty references default mappings
+ } ... ],
+ }
+
+target path optional variables:
+
+(evaluated at init)
+sid: sync instance id, randomly generated by sync process on first sync initialization
+zonegroup: zonegroup name
+zonegroup_id: zonegroup id
+zone: zone name
+zone_id: zone id
+
+(evaluated when syncing)
+bucket: bucket name
+owner: bucket owner
+
+*/
+
+// A single ACL grantee translation rule: grants held by source_id (of
+// the given grantee type) are rewritten to dest_id on the destination.
+struct ACLMapping {
+  ACLGranteeTypeEnum type{ACL_TYPE_CANON_USER};
+  string source_id;
+  string dest_id;
+
+  ACLMapping() = default;
+
+  ACLMapping(ACLGranteeTypeEnum t,
+             const string& s,
+             const string& d) : type(t),
+                                source_id(s),
+                                dest_id(d) {}
+
+  // Parse one acl mapping entry; an absent or unknown "type" defaults
+  // to canonical user id.
+  void init(const JSONFormattable& config) {
+    const string& t = config["type"];
+
+    if (t == "email") {
+      type = ACL_TYPE_EMAIL_USER;
+    } else if (t == "uri") {
+      type = ACL_TYPE_GROUP;
+    } else {
+      type = ACL_TYPE_CANON_USER;
+    }
+
+    source_id = config["source_id"];
+    dest_id = config["dest_id"];
+  }
+
+  // Emit this mapping back in its JSON configuration form.
+  void dump_conf(CephContext *cct, JSONFormatter& jf) const {
+    Formatter::ObjectSection os(jf, "acl_mapping");
+    string s;
+    switch (type) {
+      case ACL_TYPE_EMAIL_USER:
+        s = "email";
+        break;
+      case ACL_TYPE_GROUP:
+        s = "uri";
+        break;
+      default:
+        s = "id";
+        break;
+    }
+    encode_json("type", s, &jf);
+    encode_json("source_id", source_id, &jf);
+    encode_json("dest_id", dest_id, &jf);
+  }
+};
+
+// Set of ACL translation rules, keyed by the source grantee id.
+struct ACLMappings {
+  map<string, ACLMapping> acl_mappings;
+
+  void init(const JSONFormattable& config) {
+    for (auto& c : config.array()) {
+      ACLMapping m;
+      m.init(c);
+
+      // map::emplace does not overwrite: a later entry with a duplicate
+      // source_id is silently ignored.
+      acl_mappings.emplace(std::make_pair(m.source_id, m));
+    }
+  }
+  void dump_conf(CephContext *cct, JSONFormatter& jf) const {
+    Formatter::ArraySection os(jf, "acls");
+
+    for (auto& i : acl_mappings) {
+      i.second.dump_conf(cct, jf);
+    }
+  }
+};
+
+// Named, shareable sets of ACL mappings ("acl_profiles" config array),
+// referenced from sync profiles via "acls_id".
+struct AWSSyncConfig_ACLProfiles {
+  map<string, std::shared_ptr<ACLMappings> > acl_profiles;
+
+  void init(const JSONFormattable& config) {
+    for (auto& c : config.array()) {
+      const string& profile_id = c["id"];
+
+      std::shared_ptr<ACLMappings> ap{new ACLMappings};
+      ap->init(c["acls"]);
+
+      acl_profiles[profile_id] = ap;
+    }
+  }
+  void dump_conf(CephContext *cct, JSONFormatter& jf) const {
+    Formatter::ArraySection section(jf, "acl_profiles");
+
+    for (auto& p : acl_profiles) {
+      Formatter::ObjectSection section(jf, "profile");
+      encode_json("id", p.first, &jf);
+      p.second->dump_conf(cct, jf);
+    }
+  }
+
+  // Copy the mappings for profile_id into *result; returns false when
+  // the id is unknown.
+  bool find(const string& profile_id, ACLMappings *result) const {
+    auto iter = acl_profiles.find(profile_id);
+    if (iter == acl_profiles.end()) {
+      return false;
+    }
+    *result = *iter->second;
+    return true;
+  }
+};
+
+// Remote endpoint/credentials configuration. The has_* flags record
+// which fields were explicitly present so AWSSyncConfig::init_profile()
+// can inherit the missing ones from the default connection.
+struct AWSSyncConfig_Connection {
+  string connection_id;
+  string endpoint;
+  RGWAccessKey key;
+  std::optional<string> region;
+  HostStyle host_style{PathStyle};
+
+  bool has_endpoint{false};
+  bool has_key{false};
+  bool has_host_style{false};
+
+  void init(const JSONFormattable& config) {
+    has_endpoint = config.exists("endpoint");
+    has_key = config.exists("access_key") || config.exists("secret");
+    has_host_style = config.exists("host_style");
+
+    connection_id = config["id"];
+    endpoint = config["endpoint"];
+
+    key = RGWAccessKey(config["access_key"], config["secret"]);
+
+    if (config.exists("region")) {
+      region = config["region"];
+    } else {
+      region.reset();
+    }
+
+    // Anything other than "virtual" (including absent) means path style.
+    string host_style_str = config["host_style"];
+    if (host_style_str != "virtual") {
+      host_style = PathStyle;
+    } else {
+      host_style = VirtualStyle;
+    }
+  }
+  void dump_conf(CephContext *cct, JSONFormatter& jf) const {
+    Formatter::ObjectSection section(jf, "connection");
+    encode_json("id", connection_id, &jf);
+    encode_json("endpoint", endpoint, &jf);
+    string s = (host_style == PathStyle ? "path" : "virtual");
+    encode_json("region", region, &jf);
+    encode_json("host_style", s, &jf);
+
+    {
+      Formatter::ObjectSection os(jf, "key");
+      encode_json("access_key", key.id, &jf);
+      // Never dump the real secret; mask it when one is set.
+      string secret = (key.key.empty() ? "" : "******");
+      encode_json("secret", secret, &jf);
+    }
+  }
+};
+
+// Parse an optional numeric config value into *pval. Leaves *pval
+// untouched when the key is absent; returns -EINVAL when the value is
+// present but not a valid integer.
+static int conf_to_uint64(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, const string& key, uint64_t *pval)
+{
+  string sval;
+  if (config.find(key, &sval)) {
+    string err;
+    uint64_t val = strict_strtoll(sval.c_str(), 10, &err);
+    if (!err.empty()) {
+      ldpp_dout(dpp, 0) << "ERROR: could not parse configurable value for cloud sync module: " << key << ": " << sval << dendl;
+      return -EINVAL;
+    }
+    *pval = val;
+  }
+  return 0;
+}
+
+// S3 multipart tuning: objects at or above multipart_sync_threshold are
+// synced via multipart upload using parts of multipart_min_part_size.
+struct AWSSyncConfig_S3 {
+  uint64_t multipart_sync_threshold{DEFAULT_MULTIPART_SYNC_PART_SIZE};
+  uint64_t multipart_min_part_size{DEFAULT_MULTIPART_SYNC_PART_SIZE};
+
+  int init(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config) {
+    int r = conf_to_uint64(dpp, cct, config, "multipart_sync_threshold", &multipart_sync_threshold);
+    if (r < 0) {
+      return r;
+    }
+
+    r = conf_to_uint64(dpp, cct, config, "multipart_min_part_size", &multipart_min_part_size);
+    if (r < 0) {
+      return r;
+    }
+// Silently clamp to S3's minimum allowed part size (5 MiB).
+#define MULTIPART_MIN_POSSIBLE_PART_SIZE (5 * 1024 * 1024)
+    if (multipart_min_part_size < MULTIPART_MIN_POSSIBLE_PART_SIZE) {
+      multipart_min_part_size = MULTIPART_MIN_POSSIBLE_PART_SIZE;
+    }
+    return 0;
+  }
+
+  void dump_conf(CephContext *cct, JSONFormatter& jf) const {
+    Formatter::ObjectSection section(jf, "s3");
+    encode_json("multipart_sync_threshold", multipart_sync_threshold, &jf);
+    encode_json("multipart_min_part_size", multipart_min_part_size, &jf);
+  }
+};
+
+// A sync profile: which source bucket(s) it applies to, where they map
+// on the destination, and which connection/acl mappings to use. Either
+// inline config ("connection"/"acls") or references by id may be given;
+// the references are resolved later by AWSSyncConfig::init_profile().
+struct AWSSyncConfig_Profile {
+  string source_bucket;
+  bool prefix{false};
+  string target_path;
+  string connection_id;
+  string acls_id;
+
+  std::shared_ptr<AWSSyncConfig_Connection> conn_conf;
+  std::shared_ptr<ACLMappings> acls;
+
+  std::shared_ptr<RGWRESTConn> conn;
+
+  void init(const JSONFormattable& config) {
+    source_bucket = config["source_bucket"];
+
+    // A trailing '*' turns source_bucket into a prefix match; the '*'
+    // itself is stripped.
+    prefix = (!source_bucket.empty() && source_bucket[source_bucket.size() - 1] == '*');
+
+    if (prefix) {
+      source_bucket = source_bucket.substr(0, source_bucket.size() - 1);
+    }
+
+    target_path = config["target_path"];
+    connection_id = config["connection_id"];
+    acls_id = config["acls_id"];
+
+    if (config.exists("connection")) {
+      conn_conf = make_shared<AWSSyncConfig_Connection>();
+      conn_conf->init(config["connection"]);
+    }
+
+    if (config.exists("acls")) {
+      acls = make_shared<ACLMappings>();
+      acls->init(config["acls"]);
+    }
+  }
+
+  // Dump in config form; the stripped '*' is re-appended for prefix
+  // profiles so the output round-trips.
+  void dump_conf(CephContext *cct, JSONFormatter& jf, const char *section = "config") const {
+    Formatter::ObjectSection config(jf, section);
+    string sb{source_bucket};
+    if (prefix) {
+      sb.append("*");
+    }
+    encode_json("source_bucket", sb, &jf);
+    encode_json("target_path", target_path, &jf);
+    encode_json("connection_id", connection_id, &jf);
+    encode_json("acls_id", acls_id, &jf);
+    if (conn_conf.get()) {
+      conn_conf->dump_conf(cct, jf);
+    }
+    if (acls.get()) {
+      acls->dump_conf(cct, jf);
+    }
+  }
+};
+
+// Replace every non-overlapping occurrence of `find` in `src` with
+// `replace`, writing the result to *dest (src and *dest may alias,
+// since the work is done on a local copy).
+static void find_and_replace(const string& src, const string& find, const string& replace, string *dest)
+{
+  string s = src;
+
+  size_t pos = s.find(find);
+  while (pos != string::npos) {
+    size_t next_ofs = pos + find.size();
+    s = s.substr(0, pos) + replace + s.substr(next_ofs);
+    pos = s.find(find, next_ofs);
+  }
+
+  *dest = s;
+}
+
+// Substitute all occurrences of the template variable "${param}" in src
+// with val, writing the result to *dest.
+static void apply_meta_param(const string& src, const string& param, const string& val, string *dest)
+{
+  string s = string("${") + param + "}";
+  find_and_replace(src, s, val, dest);
+}
+
+
+// Fully-parsed configuration of the AWS cloud-sync module: a default
+// profile, shared connection and acl-profile tables, explicit per-bucket
+// profiles, the root (catch-all) profile, and S3 multipart tuning.
+struct AWSSyncConfig {
+  AWSSyncConfig_Profile default_profile;
+  std::shared_ptr<AWSSyncConfig_Profile> root_profile;
+
+  map<string, std::shared_ptr<AWSSyncConfig_Connection> > connections;
+  AWSSyncConfig_ACLProfiles acl_profiles;
+
+  map<string, std::shared_ptr<AWSSyncConfig_Profile> > explicit_profiles;
+
+  AWSSyncConfig_S3 s3;
+
+  // Resolve a profile's connection_id/acls_id references against the
+  // shared tables and inherit missing connection fields, acls and
+  // target_path from default_profile. Returns -EINVAL on ambiguous or
+  // dangling references (or, when connection_must_exist, on a profile
+  // that ends up with no connection at all).
+  int init_profile(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& profile_conf, AWSSyncConfig_Profile& profile,
+                   bool connection_must_exist) {
+    if (!profile.connection_id.empty()) {
+      if (profile.conn_conf) {
+        ldpp_dout(dpp, 0) << "ERROR: ambiguous profile connection configuration, connection_id=" << profile.connection_id << dendl;
+        return -EINVAL;
+      }
+      if (connections.find(profile.connection_id) == connections.end()) {
+        ldpp_dout(dpp, 0) << "ERROR: profile configuration reference non-existent connection_id=" << profile.connection_id << dendl;
+        return -EINVAL;
+      }
+      profile.conn_conf = connections[profile.connection_id];
+    } else if (!profile.conn_conf) {
+      // No inline connection and no id: fall back to the default
+      // profile's connection, if one is registered.
+      profile.connection_id = default_profile.connection_id;
+      auto i = connections.find(profile.connection_id);
+      if (i != connections.end()) {
+        profile.conn_conf = i->second;
+      }
+    }
+
+    if (connection_must_exist && !profile.conn_conf) {
+      ldpp_dout(dpp, 0) << "ERROR: remote connection undefined for sync profile" << dendl;
+      return -EINVAL;
+    }
+
+    // Fill in any fields the profile's connection did not set explicitly
+    // from the default connection (per the has_* flags).
+    if (profile.conn_conf && default_profile.conn_conf) {
+      if (!profile.conn_conf->has_endpoint) {
+        profile.conn_conf->endpoint = default_profile.conn_conf->endpoint;
+      }
+      if (!profile.conn_conf->has_host_style) {
+        profile.conn_conf->host_style = default_profile.conn_conf->host_style;
+      }
+      if (!profile.conn_conf->has_key) {
+        profile.conn_conf->key = default_profile.conn_conf->key;
+      }
+    }
+
+    ACLMappings acl_mappings;
+
+    if (!profile.acls_id.empty()) {
+      if (!acl_profiles.find(profile.acls_id, &acl_mappings)) {
+        ldpp_dout(dpp, 0) << "ERROR: profile configuration reference non-existent acls id=" << profile.acls_id << dendl;
+        return -EINVAL;
+      }
+      profile.acls = acl_profiles.acl_profiles[profile.acls_id];
+    } else if (!profile.acls) {
+      if (default_profile.acls) {
+        profile.acls = default_profile.acls;
+        profile.acls_id = default_profile.acls_id;
+      }
+    }
+
+    // target_path falls back: profile -> default profile -> built-in.
+    if (profile.target_path.empty()) {
+      profile.target_path = default_profile.target_path;
+    }
+    if (profile.target_path.empty()) {
+      profile.target_path = default_target_path;
+    }
+
+    return 0;
+  }
+
+  // Build an explicit per-bucket profile from profile_conf, resolve it,
+  // and register it under its source bucket name. Optionally returns the
+  // new profile through *ptarget.
+  int init_target(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& profile_conf, std::shared_ptr<AWSSyncConfig_Profile> *ptarget) {
+    std::shared_ptr<AWSSyncConfig_Profile> profile;
+    profile.reset(new AWSSyncConfig_Profile);
+    profile->init(profile_conf);
+
+    int ret = init_profile(dpp, cct, profile_conf, *profile, true);
+    if (ret < 0) {
+      return ret;
+    }
+
+    auto& sb = profile->source_bucket;
+
+    // A duplicate source bucket overwrites the earlier entry (warn only).
+    if (explicit_profiles.find(sb) != explicit_profiles.end()) {
+      ldpp_dout(dpp, 0) << "WARNING: duplicate target configuration in sync module" << dendl;
+    }
+
+    explicit_profiles[sb] = profile;
+    if (ptarget) {
+      *ptarget = profile;
+    }
+    return 0;
+  }
+
+  // Look up the explicit profile matching the bucket name: either an
+  // exact match, or a prefix profile whose source_bucket is a prefix of
+  // the name. Returns false when no explicit profile applies.
+  bool do_find_profile(const rgw_bucket bucket, std::shared_ptr<AWSSyncConfig_Profile> *result) {
+    const string& name = bucket.name;
+    // upper_bound then step back: the candidate is the greatest key
+    // lexicographically <= name, which is the only possible prefix match.
+    auto iter = explicit_profiles.upper_bound(name);
+    if (iter == explicit_profiles.begin()) {
+      return false;
+    }
+
+    --iter;
+    if (iter->first.size() > name.size()) {
+      return false;
+    }
+    if (name.compare(0, iter->first.size(), iter->first) != 0) {
+      return false;
+    }
+
+    std::shared_ptr<AWSSyncConfig_Profile>& target = iter->second;
+
+    // Non-prefix profiles require an exact name match.
+    if (!target->prefix &&
+        name.size() != iter->first.size()) {
+      return false;
+    }
+
+    *result = target;
+    return true;
+  }
+
+  // Like do_find_profile(), but falls back to the root profile.
+  void find_profile(const rgw_bucket bucket, std::shared_ptr<AWSSyncConfig_Profile> *result) {
+    if (!do_find_profile(bucket, result)) {
+      *result = root_profile;
+    }
+  }
+
+  AWSSyncConfig() {}
+
+  // Parse the whole module config (see the JSON layout described above)
+  // and log the parsed representation at debug level 5.
+  int init(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config) {
+    auto& default_conf = config["default"];
+
+    if (config.exists("default")) {
+      default_profile.init(default_conf);
+      init_profile(dpp, cct, default_conf, default_profile, false);
+    }
+
+    for (auto& conn : config["connections"].array()) {
+      auto new_conn = conn;
+
+      std::shared_ptr<AWSSyncConfig_Connection> c{new AWSSyncConfig_Connection};
+      c->init(new_conn);
+
+      connections[new_conn["id"]] = c;
+    }
+
+    acl_profiles.init(config["acl_profiles"]);
+
+    int r = s3.init(dpp, cct, config["s3"]);
+    if (r < 0) {
+      return r;
+    }
+
+    auto new_root_conf = config;
+
+    r = init_target(dpp, cct, new_root_conf, &root_profile); /* the root profile config */
+    if (r < 0) {
+      return r;
+    }
+
+    for (auto target_conf : config["profiles"].array()) {
+      int r = init_target(dpp, cct, target_conf, nullptr);
+      if (r < 0) {
+        return r;
+      }
+    }
+
+    JSONFormatter jf(true);
+    dump_conf(cct, jf);
+    stringstream ss;
+    jf.flush(ss);
+
+    ldpp_dout(dpp, 5) << "sync module config (parsed representation):\n" << ss.str() << dendl;
+
+    return 0;
+  }
+
+  // Expand the init-time template variables (${sid}, ${zonegroup}, ...)
+  // in path into *dest.
+  // NOTE(review): every apply_meta_param() call re-reads `path`, so the
+  // substitutions only accumulate because both callers pass the same
+  // string object as `path` and `*dest` (they alias).
+  void expand_target(RGWDataSyncCtx *sc, const string& sid, const string& path, string *dest) {
+    apply_meta_param(path, "sid", sid, dest);
+
+    const RGWZoneGroup& zg = sc->env->svc->zone->get_zonegroup();
+    apply_meta_param(path, "zonegroup", zg.get_name(), dest);
+    apply_meta_param(path, "zonegroup_id", zg.get_id(), dest);
+
+    const RGWZone& zone = sc->env->svc->zone->get_zone();
+    apply_meta_param(path, "zone", zone.name, dest);
+    apply_meta_param(path, "zone_id", zone.id, dest);
+  }
+
+  // Expand the init-time variables in every profile's target_path
+  // in place (done once, when the sync instance id is known).
+  void update_config(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, const string& sid) {
+    expand_target(sc, sid, root_profile->target_path, &root_profile->target_path);
+    ldpp_dout(dpp, 20) << "updated target: (root) -> " << root_profile->target_path << dendl;
+    for (auto& t : explicit_profiles) {
+      expand_target(sc, sid, t.second->target_path, &t.second->target_path);
+      ldpp_dout(dpp, 20) << "updated target: " << t.first << " -> " << t.second->target_path << dendl;
+    }
+  }
+
+  // Dump the full parsed configuration (root profile, connections, acl
+  // profiles, explicit profiles) in JSON form.
+  void dump_conf(CephContext *cct, JSONFormatter& jf) const {
+    Formatter::ObjectSection config(jf, "config");
+    root_profile->dump_conf(cct, jf);
+    jf.open_array_section("connections");
+    for (auto c : connections) {
+      c.second->dump_conf(cct, jf);
+    }
+    jf.close_section();
+
+    acl_profiles.dump_conf(cct, jf);
+
+    { // targets
+      Formatter::ArraySection as(jf, "profiles");
+      for (auto& t : explicit_profiles) {
+        Formatter::ObjectSection target_section(jf, "profile");
+        encode_json("name", t.first, &jf);
+        t.second->dump_conf(cct, jf);
+      }
+    }
+  }
+
+  // Build the destination path for an object: the profile's target_path
+  // with ${bucket}/${owner} expanded, followed by "/<key-oid>". Tenanted
+  // buckets are flattened as "<tenant>-<bucket>".
+  string get_path(std::shared_ptr<AWSSyncConfig_Profile>& profile,
+                  const RGWBucketInfo& bucket_info,
+                  const rgw_obj_key& obj) {
+    string bucket_str;
+    string owner;
+    if (!bucket_info.owner.tenant.empty()) {
+      bucket_str = owner = bucket_info.owner.tenant + "-";
+      owner += bucket_info.owner.id;
+    }
+    bucket_str += bucket_info.bucket.name;
+
+    const string& path = profile->target_path;
+
+    string new_path;
+    apply_meta_param(path, "bucket", bucket_str, &new_path);
+    apply_meta_param(new_path, "owner", owner, &new_path);
+
+    new_path += string("/") + get_key_oid(obj);
+
+    return new_path;
+  }
+
+  // Split the expanded path at the first '/': the leading component is
+  // the destination bucket, the remainder is the destination object name.
+  void get_target(std::shared_ptr<AWSSyncConfig_Profile>& profile,
+                  const RGWBucketInfo& bucket_info,
+                  const rgw_obj_key& obj,
+                  string *bucket_name,
+                  string *obj_name) {
+    string path = get_path(profile, bucket_info, obj);
+    size_t pos = path.find('/');
+
+    *bucket_name = path.substr(0, pos);
+    *obj_name = path.substr(pos + 1);
+  }
+
+  // Expand target paths, then create the S3 REST connections for the
+  // root profile and for every explicit profile.
+  void init_conns(RGWDataSyncCtx *sc, const string& id) {
+    auto sync_env = sc->env;
+
+    update_config(sync_env->dpp, sc, id);
+
+    auto& root_conf = root_profile->conn_conf;
+
+    root_profile->conn.reset(new S3RESTConn(sc->cct,
+                                            id,
+                                            { root_conf->endpoint },
+                                            root_conf->key,
+                                            sync_env->svc->zone->get_zonegroup().get_id(),
+                                            root_conf->region,
+                                            root_conf->host_style));
+
+    for (auto i : explicit_profiles) {
+      auto& c = i.second;
+
+      c->conn.reset(new S3RESTConn(sc->cct,
+                                   id,
+                                   { c->conn_conf->endpoint },
+                                   c->conn_conf->key,
+                                   sync_env->svc->zone->get_zonegroup().get_id(),
+                                   c->conn_conf->region,
+                                   c->conn_conf->host_style));
+    }
+  }
+};
+
+
+// Per-sync-instance state: the parsed config plus the instance id
+// (hex-encoded), used to expand ${sid} and to name REST connections.
+struct AWSSyncInstanceEnv {
+  AWSSyncConfig conf;
+  string id;
+
+  explicit AWSSyncInstanceEnv(AWSSyncConfig& _conf) : conf(_conf) {}
+
+  void init(RGWDataSyncCtx *sc, uint64_t instance_id) {
+    char buf[32];
+    snprintf(buf, sizeof(buf), "%llx", (unsigned long long)instance_id);
+    id = buf;
+
+    conf.init_conns(sc, id);
+  }
+
+  // Find the profile governing `bucket`; always yields a profile since
+  // find_profile() falls back to the root profile.
+  void get_profile(const rgw_bucket& bucket, std::shared_ptr<AWSSyncConfig_Profile> *ptarget) {
+    conf.find_profile(bucket, ptarget);
+    ceph_assert(ptarget);
+  }
+};
+
+// Populate *info from a GET response: headers become attrs (except the
+// special RGWX_OBJECT_SIZE header, which sets content_len), and the
+// source ACL policy is decoded from the RGW_ATTR_ACL xattr if present.
+// Returns -EIO when the ACL attr exists but fails to decode.
+static int do_decode_rest_obj(const DoutPrefixProvider *dpp, CephContext *cct, map<string, bufferlist>& attrs, map<string, string>& headers, rgw_rest_obj *info)
+{
+  for (auto header : headers) {
+    const string& val = header.second;
+    if (header.first == "RGWX_OBJECT_SIZE") {
+      info->content_len = atoi(val.c_str());
+    } else {
+      info->attrs[header.first] = val;
+    }
+  }
+
+  info->acls.set_ctx(cct);
+  auto aiter = attrs.find(RGW_ATTR_ACL);
+  if (aiter != attrs.end()) {
+    bufferlist& bl = aiter->second;
+    auto bliter = bl.cbegin();
+    try {
+      info->acls.decode(bliter);
+    } catch (buffer::error& err) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to decode policy off attrs" << dendl;
+      return -EIO;
+    }
+  } else {
+    // Missing ACLs are tolerated; the object syncs with no mapped grants.
+    ldpp_dout(dpp, 0) << "WARNING: acl attrs not provided" << dendl;
+  }
+
+  return 0;
+}
+
+// Streaming-read side of the splice: issues a conditional GET on the
+// source object (guarded by mtime/etag/zone epoch so a changed object
+// is not synced) and decodes the response into a rgw_rest_obj.
+class RGWRESTStreamGetCRF : public RGWStreamReadHTTPResourceCRF
+{
+  RGWDataSyncCtx *sc;
+  RGWRESTConn *conn;
+  const rgw_obj& src_obj;
+  RGWRESTConn::get_obj_params req_params;
+
+  rgw_sync_aws_src_obj_properties src_properties;
+public:
+  RGWRESTStreamGetCRF(CephContext *_cct,
+                      RGWCoroutinesEnv *_env,
+                      RGWCoroutine *_caller,
+                      RGWDataSyncCtx *_sc,
+                      RGWRESTConn *_conn,
+                      const rgw_obj& _src_obj,
+                      const rgw_sync_aws_src_obj_properties& _src_properties) : RGWStreamReadHTTPResourceCRF(_cct, _env, _caller,
+                                                                                                             _sc->env->http_manager, _src_obj.key),
+                                                                                sc(_sc), conn(_conn), src_obj(_src_obj),
+                                                                                src_properties(_src_properties) {
+  }
+
+  // Build and register the GET request (not yet sent). A range is set
+  // when syncing a single multipart part. Returns a negative error code
+  // on failure to construct the request.
+  int init(const DoutPrefixProvider *dpp) override {
+    /* init input connection */
+
+
+    req_params.get_op = true;
+    req_params.prepend_metadata = true;
+
+    // Conditional-GET guards: fail if the source changed since stat.
+    req_params.unmod_ptr = &src_properties.mtime;
+    req_params.etag = src_properties.etag;
+    req_params.mod_zone_id = src_properties.zone_short_id;
+    req_params.mod_pg_ver = src_properties.pg_ver;
+
+    if (range.is_set) {
+      req_params.range_is_set = true;
+      req_params.range_start = range.ofs;
+      req_params.range_end = range.ofs + range.size - 1;
+    }
+
+    RGWRESTStreamRWRequest *in_req;
+    int ret = conn->get_obj(dpp, src_obj, req_params, false /* send */, &in_req);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): conn->get_obj() returned ret=" << ret << dendl;
+      return ret;
+    }
+
+    set_req(in_req);
+
+    return RGWStreamReadHTTPResourceCRF::init(dpp);
+  }
+
+  // Parse response headers plus the JSON extra-data blob (which carries
+  // the source xattrs, including the ACL) into rest_obj.
+  int decode_rest_obj(const DoutPrefixProvider *dpp, map<string, string>& headers, bufferlist& extra_data) override {
+    map<string, bufferlist> src_attrs;
+
+    ldpp_dout(dpp, 20) << __func__ << ":" << " headers=" << headers << " extra_data.length()=" << extra_data.length() << dendl;
+
+    if (extra_data.length() > 0) {
+      JSONParser jp;
+      if (!jp.parse(extra_data.c_str(), extra_data.length())) {
+        ldpp_dout(dpp, 0) << "ERROR: failed to parse response extra data. len=" << extra_data.length() << " data=" << extra_data.c_str() << dendl;
+        return -EIO;
+      }
+
+      JSONDecoder::decode_json("attrs", src_attrs, &jp);
+    }
+    return do_decode_rest_obj(dpp, sc->cct, src_attrs, headers, &rest_obj);
+  }
+
+  // Request the attrs blob alongside the object data.
+  bool need_extra_data() override {
+    return true;
+  }
+};
+
+// Source attribute headers forwarded verbatim to the destination
+// (see RGWAWSStreamPutCRF::keep_attr(), which also keeps X_AMZ_*).
+static std::set<string> keep_headers = { "CONTENT_TYPE",
+                                  "CONTENT_ENCODING",
+                                  "CONTENT_DISPOSITION",
+                                  "CONTENT_LANGUAGE" };
+
+// Streaming-write side of the splice: PUTs the object (or one multipart
+// part) to the destination connection, translating source ACL grants
+// into x-amz-grant-* headers and attaching rgwx-* provenance metadata.
+class RGWAWSStreamPutCRF : public RGWStreamWriteHTTPResourceCRF
+{
+  RGWDataSyncCtx *sc;
+  rgw_sync_aws_src_obj_properties src_properties;
+  std::shared_ptr<AWSSyncConfig_Profile> target;
+  const rgw_obj& dest_obj;
+  string etag;
+public:
+  RGWAWSStreamPutCRF(CephContext *_cct,
+                     RGWCoroutinesEnv *_env,
+                     RGWCoroutine *_caller,
+                     RGWDataSyncCtx *_sc,
+                     const rgw_sync_aws_src_obj_properties& _src_properties,
+                     std::shared_ptr<AWSSyncConfig_Profile>& _target,
+                     const rgw_obj& _dest_obj) : RGWStreamWriteHTTPResourceCRF(_cct, _env, _caller, _sc->env->http_manager),
+                                                 sc(_sc), src_properties(_src_properties), target(_target), dest_obj(_dest_obj) {
+  }
+
+  // Prepare the outbound PUT; for a multipart part, the uploadId and
+  // partNumber query params are attached.
+  int init() override {
+    /* init output connection */
+    RGWRESTStreamS3PutObj *out_req{nullptr};
+
+    if (multipart.is_multipart) {
+      char buf[32];
+      snprintf(buf, sizeof(buf), "%d", multipart.part_num);
+      rgw_http_param_pair params[] = { { "uploadId", multipart.upload_id.c_str() },
+                                       { "partNumber", buf },
+                                       { nullptr, nullptr } };
+      target->conn->put_obj_send_init(dest_obj, params, &out_req);
+    } else {
+      target->conn->put_obj_send_init(dest_obj, nullptr, &out_req);
+    }
+
+    set_req(out_req);
+
+    return RGWStreamWriteHTTPResourceCRF::init();
+  }
+
+  // Whether a source attr header is forwarded to the destination:
+  // the fixed keep_headers set plus anything starting with X_AMZ_.
+  static bool keep_attr(const string& h) {
+    return (keep_headers.find(h) != keep_headers.end() ||
+            boost::algorithm::starts_with(h, "X_AMZ_"));
+  }
+
+  // Compute the header set for the destination PUT: forwarded source
+  // attrs, x-amz-grant-* headers derived from the source ACL via the
+  // profile's acl mappings, and rgwx-* provenance metadata.
+  static void init_send_attrs(const DoutPrefixProvider *dpp,
+                              CephContext *cct,
+                              const rgw_rest_obj& rest_obj,
+                              const rgw_sync_aws_src_obj_properties& src_properties,
+                              const AWSSyncConfig_Profile *target,
+                              map<string, string> *attrs) {
+    auto& new_attrs = *attrs;
+
+    new_attrs.clear();
+
+    for (auto& hi : rest_obj.attrs) {
+      if (keep_attr(hi.first)) {
+        new_attrs.insert(hi);
+      }
+    }
+
+    auto acl = rest_obj.acls.get_acl();
+
+    map<int, vector<string> > access_map;
+
+    // Translate each source grant through the acl mappings; grantees
+    // with no mapping are dropped.
+    if (target->acls) {
+      for (auto& grant : acl.get_grant_map()) {
+        auto& orig_grantee = grant.first;
+        auto& perm = grant.second;
+
+        string grantee;
+
+        const auto& am = target->acls->acl_mappings;
+
+        auto iter = am.find(orig_grantee);
+        if (iter == am.end()) {
+          ldpp_dout(dpp, 20) << "acl_mappings: Could not find " << orig_grantee << " .. ignoring" << dendl;
+          continue;
+        }
+
+        grantee = iter->second.dest_id;
+
+        string type;
+
+        switch (iter->second.type) {
+          case ACL_TYPE_CANON_USER:
+            type = "id";
+            break;
+          case ACL_TYPE_EMAIL_USER:
+            type = "emailAddress";
+            break;
+          case ACL_TYPE_GROUP:
+            type = "uri";
+            break;
+          default:
+            continue;
+        }
+
+        string tv = type + "=" + grantee;
+
+        int flags = perm.get_permission().get_permissions();
+        // FULL_CONTROL is emitted as a single grant rather than being
+        // decomposed into its component permission bits.
+        if ((flags & RGW_PERM_FULL_CONTROL) == RGW_PERM_FULL_CONTROL) {
+          access_map[flags].push_back(tv);
+          continue;
+        }
+
+        for (int i = 1; i <= RGW_PERM_WRITE_ACP; i <<= 1) {
+          if (flags & i) {
+            access_map[i].push_back(tv);
+          }
+        }
+      }
+    }
+
+    // One x-amz-grant-<perm> header per permission, with all grantees
+    // joined by ", ".
+    for (auto aiter : access_map) {
+      int grant_type = aiter.first;
+
+      string header_str("x-amz-grant-");
+
+      switch (grant_type) {
+        case RGW_PERM_READ:
+          header_str.append("read");
+          break;
+        case RGW_PERM_WRITE:
+          header_str.append("write");
+          break;
+        case RGW_PERM_READ_ACP:
+          header_str.append("read-acp");
+          break;
+        case RGW_PERM_WRITE_ACP:
+          header_str.append("write-acp");
+          break;
+        case RGW_PERM_FULL_CONTROL:
+          header_str.append("full-control");
+          break;
+      }
+
+      string s;
+
+      for (auto viter : aiter.second) {
+        if (!s.empty()) {
+          s.append(", ");
+        }
+        s.append(viter);
+      }
+
+      ldpp_dout(dpp, 20) << "acl_mappings: set acl: " << header_str << "=" << s << dendl;
+
+      new_attrs[header_str] = s;
+    }
+
+    char buf[32];
+    // NOTE(review): cast is (long long) but the format is %llu; should
+    // arguably be (unsigned long long) to match — confirm.
+    snprintf(buf, sizeof(buf), "%llu", (long long)src_properties.versioned_epoch);
+    new_attrs["x-amz-meta-rgwx-versioned-epoch"] = buf;
+
+    utime_t ut(src_properties.mtime);
+    snprintf(buf, sizeof(buf), "%lld.%09lld",
+             (long long)ut.sec(),
+             (long long)ut.nsec());
+
+    new_attrs["x-amz-meta-rgwx-source-mtime"] = buf;
+    new_attrs["x-amz-meta-rgwx-source-etag"] = src_properties.etag;
+    new_attrs["x-amz-meta-rgwx-source-key"] = rest_obj.key.name;
+    if (!rest_obj.key.instance.empty()) {
+      new_attrs["x-amz-meta-rgwx-source-version-id"] = rest_obj.key.instance;
+    }
+  }
+
+  // Finalize and send the request headers. Multipart part uploads send
+  // no attrs (they were sent on the multipart init).
+  void send_ready(const DoutPrefixProvider *dpp, const rgw_rest_obj& rest_obj) override {
+    RGWRESTStreamS3PutObj *r = static_cast<RGWRESTStreamS3PutObj *>(req);
+
+    map<string, string> new_attrs;
+    if (!multipart.is_multipart) {
+      init_send_attrs(dpp, sc->cct, rest_obj, src_properties, target.get(), &new_attrs);
+    }
+
+    r->set_send_length(rest_obj.content_len);
+
+    RGWAccessControlPolicy policy;
+
+    r->send_ready(dpp, target->conn->get_key(), new_attrs, policy);
+  }
+
+  // Capture the response ETag (needed to complete a multipart upload).
+  void handle_headers(const map<string, string>& headers) {
+    for (auto h : headers) {
+      if (h.first == "ETAG") {
+        etag = h.second;
+      }
+    }
+  }
+
+  // Returns false until a response ETag has been received.
+  bool get_etag(string *petag) {
+    if (etag.empty()) {
+      return false;
+    }
+    *petag = etag;
+    return true;
+  }
+};
+
+
+// Sync a whole object to the cloud target with a single streamed
+// GET -> PUT splice (the non-multipart path).
+class RGWAWSStreamObjToCloudPlainCR : public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWRESTConn *source_conn;
+  std::shared_ptr<AWSSyncConfig_Profile> target;
+  const rgw_obj& src_obj;
+  const rgw_obj& dest_obj;
+
+  rgw_sync_aws_src_obj_properties src_properties;
+
+  std::shared_ptr<RGWStreamReadHTTPResourceCRF> in_crf;
+  std::shared_ptr<RGWStreamWriteHTTPResourceCRF> out_crf;
+
+public:
+  RGWAWSStreamObjToCloudPlainCR(RGWDataSyncCtx *_sc,
+                                RGWRESTConn *_source_conn,
+                                const rgw_obj& _src_obj,
+                                const rgw_sync_aws_src_obj_properties& _src_properties,
+                                std::shared_ptr<AWSSyncConfig_Profile> _target,
+                                const rgw_obj& _dest_obj) : RGWCoroutine(_sc->cct),
+                                                   sc(_sc),
+                                                   source_conn(_source_conn),
+                                                   target(_target),
+                                                   src_obj(_src_obj),
+                                                   dest_obj(_dest_obj),
+                                                   src_properties(_src_properties) {}
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      /* init input */
+      in_crf.reset(new RGWRESTStreamGetCRF(cct, get_env(), this, sc,
+                                           source_conn, src_obj,
+                                           src_properties));
+
+      /* init output */
+      out_crf.reset(new RGWAWSStreamPutCRF(cct, get_env(), this, sc,
+                                           src_properties, target, dest_obj));
+
+      // Pipe the GET body straight into the PUT.
+      yield call(new RGWStreamSpliceCR(cct, sc->env->http_manager, in_crf, out_crf));
+      if (retcode < 0) {
+        return set_cr_error(retcode);
+      }
+
+      return set_cr_done();
+    }
+
+    return 0;
+  }
+};
+
+// Sync one part of a multipart upload: ranged GET on the source object
+// spliced into an UploadPart PUT; reports the part's ETag via *petag.
+class RGWAWSStreamObjToCloudMultipartPartCR : public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWRESTConn *source_conn;
+  std::shared_ptr<AWSSyncConfig_Profile> target;
+  const rgw_obj& src_obj;
+  const rgw_obj& dest_obj;
+
+  rgw_sync_aws_src_obj_properties src_properties;
+
+  string upload_id;
+
+  rgw_sync_aws_multipart_part_info part_info;
+
+  std::shared_ptr<RGWStreamReadHTTPResourceCRF> in_crf;
+  std::shared_ptr<RGWStreamWriteHTTPResourceCRF> out_crf;
+
+  string *petag;
+
+public:
+  RGWAWSStreamObjToCloudMultipartPartCR(RGWDataSyncCtx *_sc,
+                                        RGWRESTConn *_source_conn,
+                                        const rgw_obj& _src_obj,
+                                        std::shared_ptr<AWSSyncConfig_Profile>& _target,
+                                        const rgw_obj& _dest_obj,
+                                        const rgw_sync_aws_src_obj_properties& _src_properties,
+                                        const string& _upload_id,
+                                        const rgw_sync_aws_multipart_part_info& _part_info,
+                                        string *_petag) : RGWCoroutine(_sc->cct),
+                                                          sc(_sc),
+                                                          source_conn(_source_conn),
+                                                          target(_target),
+                                                          src_obj(_src_obj),
+                                                          dest_obj(_dest_obj),
+                                                          src_properties(_src_properties),
+                                                          upload_id(_upload_id),
+                                                          part_info(_part_info),
+                                                          petag(_petag) {}
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      /* init input */
+      in_crf.reset(new RGWRESTStreamGetCRF(cct, get_env(), this, sc,
+                                           source_conn, src_obj,
+                                           src_properties));
+
+      // Only fetch this part's byte range from the source.
+      in_crf->set_range(part_info.ofs, part_info.size);
+
+      /* init output */
+      out_crf.reset(new RGWAWSStreamPutCRF(cct, get_env(), this, sc,
+                                           src_properties, target, dest_obj));
+
+      out_crf->set_multipart(upload_id, part_info.part_num, part_info.size);
+
+      yield call(new RGWStreamSpliceCR(cct, sc->env->http_manager, in_crf, out_crf));
+      if (retcode < 0) {
+        return set_cr_error(retcode);
+      }
+
+      // The part ETag is required later by CompleteMultipartUpload.
+      if (!(static_cast<RGWAWSStreamPutCRF *>(out_crf.get()))->get_etag(petag)) {
+        ldpp_dout(dpp, 0) << "ERROR: failed to get etag from PUT request" << dendl;
+        return set_cr_error(-EIO);
+      }
+
+      return set_cr_done();
+    }
+
+    return 0;
+  }
+};
+
+// Abort a multipart upload on the destination (DELETE with uploadId),
+// releasing any parts already stored there.
+class RGWAWSAbortMultipartCR : public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWRESTConn *dest_conn;
+  const rgw_obj& dest_obj;
+
+  string upload_id;
+
+public:
+  RGWAWSAbortMultipartCR(RGWDataSyncCtx *_sc,
+                         RGWRESTConn *_dest_conn,
+                         const rgw_obj& _dest_obj,
+                         const string& _upload_id) : RGWCoroutine(_sc->cct),
+                                                     sc(_sc),
+                                                     dest_conn(_dest_conn),
+                                                     dest_obj(_dest_obj),
+                                                     upload_id(_upload_id) {}
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+
+      yield {
+        rgw_http_param_pair params[] = { { "uploadId", upload_id.c_str() }, {nullptr, nullptr} };
+        bufferlist bl;
+        call(new RGWDeleteRESTResourceCR(sc->cct, dest_conn, sc->env->http_manager,
+                                         obj_to_aws_path(dest_obj), params));
+      }
+
+      if (retcode < 0) {
+        ldpp_dout(dpp, 0) << "ERROR: failed to abort multipart upload for dest object=" << dest_obj << " (retcode=" << retcode << ")" << dendl;
+        return set_cr_error(retcode);
+      }
+
+      return set_cr_done();
+    }
+
+    return 0;
+  }
+};
+
+// Initiate a multipart upload on the destination (POST ?uploads with
+// the object attrs) and parse the returned UploadId into *upload_id.
+class RGWAWSInitMultipartCR : public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWRESTConn *dest_conn;
+  const rgw_obj& dest_obj;
+
+  uint64_t obj_size;
+  map<string, string> attrs;
+
+  bufferlist out_bl;
+
+  string *upload_id;
+
+  // XML shape of AWS's InitiateMultipartUploadResult response.
+  struct InitMultipartResult {
+    string bucket;
+    string key;
+    string upload_id;
+
+    void decode_xml(XMLObj *obj) {
+      RGWXMLDecoder::decode_xml("Bucket", bucket, obj);
+      RGWXMLDecoder::decode_xml("Key", key, obj);
+      RGWXMLDecoder::decode_xml("UploadId", upload_id, obj);
+    }
+  } result;
+
+public:
+  RGWAWSInitMultipartCR(RGWDataSyncCtx *_sc,
+                        RGWRESTConn *_dest_conn,
+                        const rgw_obj& _dest_obj,
+                        uint64_t _obj_size,
+                        const map<string, string>& _attrs,
+                        string *_upload_id) : RGWCoroutine(_sc->cct),
+                                              sc(_sc),
+                                              dest_conn(_dest_conn),
+                                              dest_obj(_dest_obj),
+                                              obj_size(_obj_size),
+                                              attrs(_attrs),
+                                              upload_id(_upload_id) {}
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+
+      yield {
+        rgw_http_param_pair params[] = { { "uploads", nullptr }, {nullptr, nullptr} };
+        bufferlist bl;
+        call(new RGWPostRawRESTResourceCR <bufferlist> (sc->cct, dest_conn, sc->env->http_manager,
+                                                 obj_to_aws_path(dest_obj), params, &attrs, bl, &out_bl));
+      }
+
+      if (retcode < 0) {
+        ldpp_dout(dpp, 0) << "ERROR: failed to initialize multipart upload for dest object=" << dest_obj << dendl;
+        return set_cr_error(retcode);
+      }
+      {
+        /*
+         * If one of the following fails we cannot abort upload, as we cannot
+         * extract the upload id. If one of these fail it's very likely that that's
+         * the least of our problem.
+         */
+        RGWXMLDecoder::XMLParser parser;
+        if (!parser.init()) {
+          ldpp_dout(dpp, 0) << "ERROR: failed to initialize xml parser for parsing multipart init response from server" << dendl;
+          return set_cr_error(-EIO);
+        }
+
+        if (!parser.parse(out_bl.c_str(), out_bl.length(), 1)) {
+          string str(out_bl.c_str(), out_bl.length());
+          ldpp_dout(dpp, 5) << "ERROR: failed to parse xml: " << str << dendl;
+          return set_cr_error(-EIO);
+        }
+
+        try {
+          RGWXMLDecoder::decode_xml("InitiateMultipartUploadResult", result, &parser, true);
+        } catch (RGWXMLDecoder::err& err) {
+          string str(out_bl.c_str(), out_bl.length());
+          ldpp_dout(dpp, 5) << "ERROR: unexpected xml: " << str << dendl;
+          return set_cr_error(-EIO);
+        }
+      }
+
+      ldpp_dout(dpp, 20) << "init multipart result: bucket=" << result.bucket << " key=" << result.key << " upload_id=" << result.upload_id << dendl;
+
+      *upload_id = result.upload_id;
+
+      return set_cr_done();
+    }
+
+    return 0;
+  }
+};
+
+class RGWAWSCompleteMultipartCR : public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWRESTConn *dest_conn;
+  const rgw_obj& dest_obj;
+
+  bufferlist out_bl;   // raw XML response body
+
+  string upload_id;
+
+  // Request body: <CompleteMultipartUpload><Part>...</Part>...</...>
+  struct CompleteMultipartReq {
+    map<int, rgw_sync_aws_multipart_part_info> parts;
+
+    explicit CompleteMultipartReq(const map<int, rgw_sync_aws_multipart_part_info>& _parts) : parts(_parts) {}
+
+    void dump_xml(Formatter *f) const {
+      /* FIX: iterate by const reference; 'auto p' copied every map entry */
+      for (const auto& p : parts) {
+        f->open_object_section("Part");
+        encode_xml("PartNumber", p.first, f);
+        encode_xml("ETag", p.second.etag, f);
+        f->close_section();
+      };
+    }
+  } req_enc;
+
+  // XML shape of the S3 CompleteMultipartUploadResult response
+  struct CompleteMultipartResult {
+    string location;
+    string bucket;
+    string key;
+    string etag;
+
+    void decode_xml(XMLObj *obj) {
+      /* FIX: "Location" was decoded into 'bucket' (copy/paste), leaving
+       * 'location' always empty and clobbering 'bucket' */
+      RGWXMLDecoder::decode_xml("Location", location, obj);
+      RGWXMLDecoder::decode_xml("Bucket", bucket, obj);
+      RGWXMLDecoder::decode_xml("Key", key, obj);
+      RGWXMLDecoder::decode_xml("ETag", etag, obj);
+    }
+  } result;
+
+public:
+  // Coroutine issuing an S3 CompleteMultipartUpload (POST <obj>?uploadId=...)
+  // with the collected part numbers/etags, then parsing the XML response.
+  RGWAWSCompleteMultipartCR(RGWDataSyncCtx *_sc,
+                            RGWRESTConn *_dest_conn,
+                            const rgw_obj& _dest_obj,
+                            string _upload_id,
+                            const map<int, rgw_sync_aws_multipart_part_info>& _parts) : RGWCoroutine(_sc->cct),
+                                                sc(_sc),
+                                                dest_conn(_dest_conn),
+                                                dest_obj(_dest_obj),
+                                                upload_id(_upload_id),
+                                                req_enc(_parts) {}
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+
+      yield {
+        rgw_http_param_pair params[] = { { "uploadId", upload_id.c_str() }, {nullptr, nullptr} };
+        stringstream ss;
+        XMLFormatter formatter;
+
+        encode_xml("CompleteMultipartUpload", req_enc, &formatter);
+
+        formatter.flush(ss);
+
+        bufferlist bl;
+        bl.append(ss.str());
+
+        call(new RGWPostRawRESTResourceCR <bufferlist> (sc->cct, dest_conn, sc->env->http_manager,
+                                               obj_to_aws_path(dest_obj), params, nullptr, bl, &out_bl));
+      }
+
+      if (retcode < 0) {
+        /* FIX: message said "initialize" (copy/paste from the init CR) */
+        ldpp_dout(dpp, 0) << "ERROR: failed to complete multipart upload for dest object=" << dest_obj << dendl;
+        return set_cr_error(retcode);
+      }
+      {
+        /*
+         * Parse the CompleteMultipartUploadResult; failures here are
+         * reported as -EIO (the upload itself already completed remotely).
+         */
+        RGWXMLDecoder::XMLParser parser;
+        if (!parser.init()) {
+          ldpp_dout(dpp, 0) << "ERROR: failed to initialize xml parser for parsing multipart init response from server" << dendl;
+          return set_cr_error(-EIO);
+        }
+
+        if (!parser.parse(out_bl.c_str(), out_bl.length(), 1)) {
+          string str(out_bl.c_str(), out_bl.length());
+          ldpp_dout(dpp, 5) << "ERROR: failed to parse xml: " << str << dendl;
+          return set_cr_error(-EIO);
+        }
+
+        try {
+          RGWXMLDecoder::decode_xml("CompleteMultipartUploadResult", result, &parser, true);
+        } catch (RGWXMLDecoder::err& err) {
+          string str(out_bl.c_str(), out_bl.length());
+          ldpp_dout(dpp, 5) << "ERROR: unexpected xml: " << str << dendl;
+          return set_cr_error(-EIO);
+        }
+      }
+
+      ldpp_dout(dpp, 20) << "complete multipart result: location=" << result.location << " bucket=" << result.bucket << " key=" << result.key << " etag=" << result.etag << dendl;
+
+      return set_cr_done();
+    }
+
+    return 0;
+  }
+};
+
+
+class RGWAWSStreamAbortMultipartUploadCR : public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWRESTConn *dest_conn;
+  const rgw_obj& dest_obj;
+  const rgw_raw_obj status_obj;   // local rados object holding the upload checkpoint
+
+  string upload_id;
+
+public:
+
+  // Best-effort cleanup: abort the remote multipart upload, then remove the
+  // local sync-status object.  Errors in either step are logged and ignored;
+  // the coroutine always completes with set_cr_done().
+  RGWAWSStreamAbortMultipartUploadCR(RGWDataSyncCtx *_sc,
+                                     RGWRESTConn *_dest_conn,
+                                     const rgw_obj& _dest_obj,
+                                     const rgw_raw_obj& _status_obj,
+                                     const string& _upload_id) : RGWCoroutine(_sc->cct), sc(_sc),
+                                                                 dest_conn(_dest_conn),
+                                                                 dest_obj(_dest_obj),
+                                                                 status_obj(_status_obj),
+                                                                 upload_id(_upload_id) {}
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      yield call(new RGWAWSAbortMultipartCR(sc, dest_conn, dest_obj, upload_id));
+      if (retcode < 0) {
+        ldpp_dout(dpp, 0) << "ERROR: failed to abort multipart upload dest obj=" << dest_obj << " upload_id=" << upload_id << " retcode=" << retcode << dendl;
+        /* ignore error, best effort */
+      }
+      yield call(new RGWRadosRemoveCR(sc->env->driver, status_obj));
+      if (retcode < 0) {
+        ldpp_dout(dpp, 0) << "ERROR: failed to remove sync status obj obj=" << status_obj << " retcode=" << retcode << dendl;
+        /* ignore error, best effort */
+      }
+      return set_cr_done();
+    }
+
+    return 0;
+  }
+};
+
+class RGWAWSStreamObjToCloudMultipartCR : public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+  AWSSyncConfig& conf;
+  RGWRESTConn *source_conn;
+  std::shared_ptr<AWSSyncConfig_Profile> target;
+  const rgw_obj& src_obj;
+  const rgw_obj& dest_obj;
+
+  uint64_t obj_size;
+  /* FIX: removed unused member 'string src_etag' (never initialized or read) */
+  rgw_sync_aws_src_obj_properties src_properties;
+  rgw_rest_obj rest_obj;
+
+  rgw_sync_aws_multipart_upload_info status;  // resumable checkpoint state
+
+  map<string, string> new_attrs;
+
+  rgw_sync_aws_multipart_part_info *pcur_part_info{nullptr};
+
+  int ret_err{0};
+
+  rgw_raw_obj status_obj;   // rados object persisting 'status' between runs
+
+public:
+  /* Sync one (large) object to the cloud endpoint via S3 multipart upload.
+   * Progress is checkpointed in status_obj after each part so that an
+   * interrupted sync resumes from the last completed part; a source object
+   * that changed since the checkpoint restarts the upload from scratch. */
+  RGWAWSStreamObjToCloudMultipartCR(RGWDataSyncCtx *_sc,
+                                    rgw_bucket_sync_pipe& _sync_pipe,
+                                    AWSSyncConfig& _conf,
+                                    RGWRESTConn *_source_conn,
+                                    const rgw_obj& _src_obj,
+                                    std::shared_ptr<AWSSyncConfig_Profile>& _target,
+                                    const rgw_obj& _dest_obj,
+                                    uint64_t _obj_size,
+                                    const rgw_sync_aws_src_obj_properties& _src_properties,
+                                    const rgw_rest_obj& _rest_obj) : RGWCoroutine(_sc->cct),
+                                                                     sc(_sc),
+                                                                     sync_env(_sc->env),
+                                                                     conf(_conf),
+                                                                     source_conn(_source_conn),
+                                                                     target(_target),
+                                                                     src_obj(_src_obj),
+                                                                     dest_obj(_dest_obj),
+                                                                     obj_size(_obj_size),
+                                                                     src_properties(_src_properties),
+                                                                     rest_obj(_rest_obj),
+                                                                     status_obj(sync_env->svc->zone->get_zone_params().log_pool,
+                                                                                RGWBucketPipeSyncStatusManager::obj_status_oid(_sync_pipe, sc->source_zone, src_obj)) {
+  }
+
+
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      /* pick up checkpoint state from a previous (interrupted) run, if any */
+      yield call(new RGWSimpleRadosReadCR<rgw_sync_aws_multipart_upload_info>(
+              dpp, sync_env->driver, status_obj, &status, false));
+
+      if (retcode < 0 && retcode != -ENOENT) {
+        ldpp_dout(dpp, 0) << "ERROR: failed to read sync status of object " << src_obj << " retcode=" << retcode << dendl;
+        return retcode;
+      }
+
+      if (retcode >= 0) {
+        /* check here that mtime and size did not change */
+
+        if (status.src_properties.mtime != src_properties.mtime || status.obj_size != obj_size ||
+            status.src_properties.etag != src_properties.etag) {
+          /* source changed under us: abort the stale upload and restart */
+          yield call(new RGWAWSStreamAbortMultipartUploadCR(sc, target->conn.get(), dest_obj, status_obj, status.upload_id));
+          retcode = -ENOENT;
+        }
+      }
+
+      if (retcode == -ENOENT) {
+        RGWAWSStreamPutCRF::init_send_attrs(dpp, sc->cct, rest_obj, src_properties, target.get(), &new_attrs);
+
+        /* FIX: pass the current object size; on this path 'status' is either
+         * default-constructed (read returned ENOENT) or stale, so
+         * status.obj_size did not hold a meaningful value yet */
+        yield call(new RGWAWSInitMultipartCR(sc, target->conn.get(), dest_obj, obj_size, std::move(new_attrs), &status.upload_id));
+        if (retcode < 0) {
+          return set_cr_error(retcode);
+        }
+
+        status.obj_size = obj_size;
+        status.src_properties = src_properties;
+#define MULTIPART_MAX_PARTS 10000
+        uint64_t min_part_size = obj_size / MULTIPART_MAX_PARTS;
+        status.part_size = std::max(conf.s3.multipart_min_part_size, min_part_size);
+        status.num_parts = (obj_size + status.part_size - 1) / status.part_size;
+        status.cur_part = 1;
+      }
+
+      for (; (uint32_t)status.cur_part <= status.num_parts; ++status.cur_part) {
+        yield {
+          rgw_sync_aws_multipart_part_info& cur_part_info = status.parts[status.cur_part];
+          cur_part_info.part_num = status.cur_part;
+          cur_part_info.ofs = status.cur_ofs;
+          cur_part_info.size = std::min((uint64_t)status.part_size, status.obj_size - status.cur_ofs);
+
+          pcur_part_info = &cur_part_info;
+
+          status.cur_ofs += status.part_size;
+
+          call(new RGWAWSStreamObjToCloudMultipartPartCR(sc,
+                                                         source_conn, src_obj,
+                                                         target,
+                                                         dest_obj,
+                                                         status.src_properties,
+                                                         status.upload_id,
+                                                         cur_part_info,
+                                                         &cur_part_info.etag));
+        }
+
+        if (retcode < 0) {
+          ldpp_dout(dpp, 0) << "ERROR: failed to sync obj=" << src_obj << ", sync via multipart upload, upload_id=" << status.upload_id << " part number " << status.cur_part << " (error: " << cpp_strerror(-retcode) << ")" << dendl;
+          ret_err = retcode;
+          yield call(new RGWAWSStreamAbortMultipartUploadCR(sc, target->conn.get(), dest_obj, status_obj, status.upload_id));
+          return set_cr_error(ret_err);
+        }
+
+        /* checkpoint progress after each part; failure here is tolerated */
+        yield call(new RGWSimpleRadosWriteCR<rgw_sync_aws_multipart_upload_info>(dpp, sync_env->driver, status_obj, status));
+        if (retcode < 0) {
+          ldpp_dout(dpp, 0) << "ERROR: failed to store multipart upload state, retcode=" << retcode << dendl;
+          /* continue with upload anyway */
+        }
+        ldpp_dout(dpp, 20) << "sync of object=" << src_obj << " via multipart upload, finished sending part #" << status.cur_part << " etag=" << pcur_part_info->etag << dendl;
+      }
+
+      yield call(new RGWAWSCompleteMultipartCR(sc, target->conn.get(), dest_obj, status.upload_id, status.parts));
+      if (retcode < 0) {
+        ldpp_dout(dpp, 0) << "ERROR: failed to complete multipart upload of obj=" << src_obj << " (error: " << cpp_strerror(-retcode) << ")" << dendl;
+        ret_err = retcode;
+        yield call(new RGWAWSStreamAbortMultipartUploadCR(sc, target->conn.get(), dest_obj, status_obj, status.upload_id));
+        return set_cr_error(ret_err);
+      }
+
+      /* remove status obj */
+      yield call(new RGWRadosRemoveCR(sync_env->driver, status_obj));
+      if (retcode < 0) {
+        /* FIX: message said "failed to abort multipart upload" (copy/paste);
+         * this step removes the local sync status object */
+        ldpp_dout(dpp, 0) << "ERROR: failed to remove sync status obj for obj=" << src_obj << " upload_id=" << status.upload_id << " part number " << status.cur_part << " (" << cpp_strerror(-retcode) << ")" << dendl;
+        /* ignore error, best effort */
+      }
+      return set_cr_done();
+    }
+
+    return 0;
+  }
+};
+template <class T>
+int decode_attr(map<string, bufferlist>& attrs, const char *attr_name, T *result, T def_val)
+{
+  // Look up attr_name in the attr map and decode it into *result.
+  // A missing or empty attribute yields def_val (success); a malformed
+  // encoding yields -EIO.
+  auto it = attrs.find(attr_name);
+  if (it == attrs.end() || it->second.length() == 0) {
+    *result = def_val;
+    return 0;
+  }
+  auto p = it->second.cbegin();
+  try {
+    decode(*result, p);
+  } catch (buffer::error& err) {
+    return -EIO;
+  }
+  return 0;
+}
+
+// maybe use Fetch Remote Obj instead?
+class RGWAWSHandleRemoteObjCBCR: public RGWStatRemoteObjCBCR {
+ rgw_bucket_sync_pipe sync_pipe;
+ AWSSyncInstanceEnv& instance;
+
+ uint64_t versioned_epoch{0};
+
+ RGWRESTConn *source_conn{nullptr};
+ std::shared_ptr<AWSSyncConfig_Profile> target;
+ bufferlist res;
+ unordered_map <string, bool> bucket_created;
+ rgw_rest_obj rest_obj;
+ int ret{0};
+
+ uint32_t src_zone_short_id{0};
+ uint64_t src_pg_ver{0};
+
+ bufferlist out_bl;
+
+ struct CreateBucketResult {
+ string code;
+
+ void decode_xml(XMLObj *obj) {
+ RGWXMLDecoder::decode_xml("Code", code, obj);
+ }
+ } result;
+
+ rgw_obj src_obj;
+ rgw_obj dest_obj;
+
+public:
+ RGWAWSHandleRemoteObjCBCR(RGWDataSyncCtx *_sc,
+ rgw_bucket_sync_pipe& _sync_pipe,
+ rgw_obj_key& _key,
+ AWSSyncInstanceEnv& _instance,
+ uint64_t _versioned_epoch) : RGWStatRemoteObjCBCR(_sc, _sync_pipe.info.source_bs.bucket, _key),
+ sync_pipe(_sync_pipe),
+ instance(_instance), versioned_epoch(_versioned_epoch)
+ {}
+
+ ~RGWAWSHandleRemoteObjCBCR(){
+ }
+
+ int operate(const DoutPrefixProvider *dpp) override {
+ reenter(this) {
+ ret = decode_attr(attrs, RGW_ATTR_PG_VER, &src_pg_ver, (uint64_t)0);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to decode pg ver attr, ignoring" << dendl;
+ } else {
+ ret = decode_attr(attrs, RGW_ATTR_SOURCE_ZONE, &src_zone_short_id, (uint32_t)0);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to decode source zone short_id attr, ignoring" << dendl;
+ src_pg_ver = 0; /* all or nothing */
+ }
+ }
+ ldpp_dout(dpp, 4) << "AWS: download begin: z=" << sc->source_zone
+ << " b=" << src_bucket << " k=" << key << " size=" << size
+ << " mtime=" << mtime << " etag=" << etag
+ << " zone_short_id=" << src_zone_short_id << " pg_ver=" << src_pg_ver
+ << dendl;
+
+ source_conn = sync_env->svc->zone->get_zone_conn(sc->source_zone);
+ if (!source_conn) {
+ ldpp_dout(dpp, 0) << "ERROR: cannot find http connection to zone " << sc->source_zone << dendl;
+ return set_cr_error(-EINVAL);
+ }
+
+ instance.get_profile(sync_pipe.info.source_bs.bucket, &target);
+ instance.conf.get_target(target, sync_pipe.dest_bucket_info, key, &dest_obj.bucket.name, &dest_obj.key.name);
+
+ if (bucket_created.find(dest_obj.bucket.name) == bucket_created.end()){
+ yield {
+ ldpp_dout(dpp, 0) << "AWS: creating bucket " << dest_obj.bucket.name << dendl;
+ bufferlist bl;
+ call(new RGWPutRawRESTResourceCR <bufferlist> (sc->cct, target->conn.get(),
+ sync_env->http_manager,
+ dest_obj.bucket.name, nullptr, bl, &out_bl));
+ }
+ if (retcode < 0 ) {
+ RGWXMLDecoder::XMLParser parser;
+ if (!parser.init()) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to initialize xml parser for parsing multipart init response from server" << dendl;
+ return set_cr_error(retcode);
+ }
+
+ if (!parser.parse(out_bl.c_str(), out_bl.length(), 1)) {
+ string str(out_bl.c_str(), out_bl.length());
+ ldpp_dout(dpp, 5) << "ERROR: failed to parse xml: " << str << dendl;
+ return set_cr_error(retcode);
+ }
+
+ try {
+ RGWXMLDecoder::decode_xml("Error", result, &parser, true);
+ } catch (RGWXMLDecoder::err& err) {
+ string str(out_bl.c_str(), out_bl.length());
+ ldpp_dout(dpp, 5) << "ERROR: unexpected xml: " << str << dendl;
+ return set_cr_error(retcode);
+ }
+
+ if (result.code != "BucketAlreadyOwnedByYou") {
+ return set_cr_error(retcode);
+ }
+ }
+
+ bucket_created[dest_obj.bucket.name] = true;
+ }
+
+ yield {
+ src_obj.bucket = src_bucket;
+ src_obj.key = key;
+
+ /* init output */
+ rgw_sync_aws_src_obj_properties src_properties;
+ src_properties.mtime = mtime;
+ src_properties.etag = etag;
+ src_properties.zone_short_id = src_zone_short_id;
+ src_properties.pg_ver = src_pg_ver;
+ src_properties.versioned_epoch = versioned_epoch;
+
+ if (size < instance.conf.s3.multipart_sync_threshold) {
+ call(new RGWAWSStreamObjToCloudPlainCR(sc, source_conn, src_obj,
+ src_properties,
+ target,
+ dest_obj));
+ } else {
+ rgw_rest_obj rest_obj;
+ rest_obj.init(key);
+ if (do_decode_rest_obj(dpp, sc->cct, attrs, headers, &rest_obj)) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to decode rest obj out of headers=" << headers << ", attrs=" << attrs << dendl;
+ return set_cr_error(-EINVAL);
+ }
+ call(new RGWAWSStreamObjToCloudMultipartCR(sc, sync_pipe, instance.conf, source_conn, src_obj,
+ target, dest_obj, size, src_properties, rest_obj));
+ }
+ }
+ if (retcode < 0) {
+ return set_cr_error(retcode);
+ }
+
+ return set_cr_done();
+ }
+
+ return 0;
+ }
+};
+
+class RGWAWSHandleRemoteObjCR : public RGWCallStatRemoteObjCR {
+  rgw_bucket_sync_pipe sync_pipe;
+  AWSSyncInstanceEnv& instance;
+  uint64_t versioned_epoch;
+
+public:
+  // Stats the remote object, then dispatches to RGWAWSHandleRemoteObjCBCR
+  // which performs the actual upload to the cloud endpoint.
+  RGWAWSHandleRemoteObjCR(RGWDataSyncCtx *_sc,
+                          rgw_bucket_sync_pipe& _sync_pipe, rgw_obj_key& _key,
+                          AWSSyncInstanceEnv& _instance, uint64_t _versioned_epoch)
+    : RGWCallStatRemoteObjCR(_sc, _sync_pipe.info.source_bs.bucket, _key),
+      sync_pipe(_sync_pipe),
+      instance(_instance),
+      versioned_epoch(_versioned_epoch) {}
+
+  ~RGWAWSHandleRemoteObjCR() = default;
+
+  RGWStatRemoteObjCBCR *allocate_callback() override {
+    return new RGWAWSHandleRemoteObjCBCR(sc, sync_pipe, key, instance, versioned_epoch);
+  }
+};
+
+class RGWAWSRemoveRemoteObjCBCR : public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  std::shared_ptr<AWSSyncConfig_Profile> target;
+  rgw_bucket_sync_pipe sync_pipe;
+  rgw_obj_key key;
+  ceph::real_time mtime;
+  AWSSyncInstanceEnv& instance;
+  /* FIX: removed unused member 'int ret{0}' */
+public:
+  // Coroutine that deletes the mirrored object on the cloud endpoint when
+  // the source object has been removed.
+  RGWAWSRemoveRemoteObjCBCR(RGWDataSyncCtx *_sc,
+                            rgw_bucket_sync_pipe& _sync_pipe, rgw_obj_key& _key, const ceph::real_time& _mtime,
+                            AWSSyncInstanceEnv& _instance) : RGWCoroutine(_sc->cct), sc(_sc),
+                                                             sync_pipe(_sync_pipe), key(_key),
+                                                             mtime(_mtime), instance(_instance) {}
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      ldpp_dout(dpp, 0) << ": remove remote obj: z=" << sc->source_zone
+                        << " b=" <<sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime << dendl;
+      yield {
+        instance.get_profile(sync_pipe.info.source_bs.bucket, &target);
+        string path = instance.conf.get_path(target, sync_pipe.dest_bucket_info, key);
+        /* FIX: log message was missing the space after "at" */
+        ldpp_dout(dpp, 0) << "AWS: removing aws object at " << path << dendl;
+
+        call(new RGWDeleteRESTResourceCR(sc->cct, target->conn.get(),
+                                         sc->env->http_manager,
+                                         path, nullptr /* params */));
+      }
+      if (retcode < 0) {
+        return set_cr_error(retcode);
+      }
+      return set_cr_done();
+    }
+    return 0;
+  }
+
+};
+
+
+class RGWAWSDataSyncModule: public RGWDataSyncModule {
+  CephContext *cct;
+  AWSSyncInstanceEnv instance;
+public:
+  // Data-sync handler that mirrors bucket objects to an AWS-compatible
+  // endpoint: creates/updates go through RGWAWSHandleRemoteObjCR, removals
+  // through RGWAWSRemoveRemoteObjCBCR.  Delete markers are not mirrored.
+  RGWAWSDataSyncModule(CephContext *_cct, AWSSyncConfig& _conf) :
+    cct(_cct),
+    instance(_conf) {
+  }
+
+  void init(RGWDataSyncCtx *sc, uint64_t instance_id) override {
+    instance.init(sc, instance_id);
+  }
+
+  ~RGWAWSDataSyncModule() {}
+
+  RGWCoroutine *sync_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key,
+                            std::optional<uint64_t> versioned_epoch,
+                            const rgw_zone_set_entry& source_trace_entry,
+                            rgw_zone_set *zones_trace) override {
+    ldout(sc->cct, 0) << instance.id << ": sync_object: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " versioned_epoch=" << versioned_epoch.value_or(0) << dendl;
+    return new RGWAWSHandleRemoteObjCR(sc, sync_pipe, key, instance, versioned_epoch.value_or(0));
+  }
+  RGWCoroutine *remove_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime, bool versioned, uint64_t versioned_epoch,
+                              rgw_zone_set *zones_trace) override {
+    ldout(sc->cct, 0) <<"rm_object: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime << " versioned=" << versioned << " versioned_epoch=" << versioned_epoch << dendl;
+    return new RGWAWSRemoveRemoteObjCBCR(sc, sync_pipe, key, mtime, instance);
+  }
+  // Intentionally unimplemented: returning NULL skips delete-marker sync.
+  RGWCoroutine *create_delete_marker(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime,
+                                     rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch,
+                                     rgw_zone_set *zones_trace) override {
+    ldout(sc->cct, 0) <<"AWS Not implemented: create_delete_marker: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime
+                      << " versioned=" << versioned << " versioned_epoch=" << versioned_epoch << dendl;
+    return NULL;
+  }
+};
+
+class RGWAWSSyncModuleInstance : public RGWSyncModuleInstance {
+  RGWAWSDataSyncModule data_handler;
+
+public:
+  // Owns the AWS data-sync handler for one configured sync instance.
+  RGWAWSSyncModuleInstance(CephContext *cct, AWSSyncConfig& _conf)
+    : data_handler(cct, _conf) {}
+
+  RGWDataSyncModule *get_data_handler() override {
+    return &data_handler;
+  }
+};
+
+int RGWAWSSyncModule::create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance){
+  // Parse the module config and, on success, hand back a new instance.
+  AWSSyncConfig conf;
+  if (int r = conf.init(dpp, cct, config); r < 0) {
+    return r;
+  }
+  instance->reset(new RGWAWSSyncModuleInstance(cct, conf));
+  return 0;
+}
diff --git a/src/rgw/driver/rados/rgw_sync_module_aws.h b/src/rgw/driver/rados/rgw_sync_module_aws.h
new file mode 100644
index 000000000..92532ff00
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_sync_module_aws.h
@@ -0,0 +1,108 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_sync_module.h"
+
+struct rgw_sync_aws_multipart_part_info {
+  // State of a single part of a multipart cloud-sync upload.
+  // encode/decode define the on-disk wire format (v1); field order must
+  // not change.
+  int part_num{0};      // 1-based part number
+  uint64_t ofs{0};      // byte offset of the part within the object
+  uint64_t size{0};     // part size in bytes
+  std::string etag;     // etag returned by the remote endpoint for this part
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(part_num, bl);
+    encode(ofs, bl);
+    encode(size, bl);
+    encode(etag, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(part_num, bl);
+    decode(ofs, bl);
+    decode(size, bl);
+    decode(etag, bl);
+    DECODE_FINISH(bl);
+  }
+};
+WRITE_CLASS_ENCODER(rgw_sync_aws_multipart_part_info)
+
+struct rgw_sync_aws_src_obj_properties {
+  // Identity of the source object version being synced; compared against a
+  // stored checkpoint to detect that the source changed mid-upload.
+  // encode/decode define the on-disk wire format (v1); field order must
+  // not change.
+  ceph::real_time mtime;
+  std::string etag;
+  uint32_t zone_short_id{0};
+  uint64_t pg_ver{0};
+  uint64_t versioned_epoch{0};
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(mtime, bl);
+    encode(etag, bl);
+    encode(zone_short_id, bl);
+    encode(pg_ver, bl);
+    encode(versioned_epoch, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(mtime, bl);
+    decode(etag, bl);
+    decode(zone_short_id, bl);
+    decode(pg_ver, bl);
+    decode(versioned_epoch, bl);
+    DECODE_FINISH(bl);
+  }
+};
+WRITE_CLASS_ENCODER(rgw_sync_aws_src_obj_properties)
+
+struct rgw_sync_aws_multipart_upload_info {
+  // Persistent checkpoint state for one in-progress multipart cloud sync.
+  // encode/decode define the on-disk wire format (v1); field order must
+  // not change.
+  std::string upload_id;        // remote upload id
+  uint64_t obj_size{0};         // FIX: was uninitialized (read before first assignment)
+  rgw_sync_aws_src_obj_properties src_properties;  // source version identity
+  uint32_t part_size{0};
+  uint32_t num_parts{0};
+
+  int cur_part{0};              // next part to upload (1-based)
+  uint64_t cur_ofs{0};          // byte offset of the next part
+
+  std::map<int, rgw_sync_aws_multipart_part_info> parts;  // parts sent so far
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(upload_id, bl);
+    encode(obj_size, bl);
+    encode(src_properties, bl);
+    encode(part_size, bl);
+    encode(num_parts, bl);
+    encode(cur_part, bl);
+    encode(cur_ofs, bl);
+    encode(parts, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(upload_id, bl);
+    decode(obj_size, bl);
+    decode(src_properties, bl);
+    decode(part_size, bl);
+    decode(num_parts, bl);
+    decode(cur_part, bl);
+    decode(cur_ofs, bl);
+    decode(parts, bl);
+    DECODE_FINISH(bl);
+  }
+};
+WRITE_CLASS_ENCODER(rgw_sync_aws_multipart_upload_info)
+
+class RGWAWSSyncModule : public RGWSyncModule {
+ public:
+  RGWAWSSyncModule() = default;
+  // AWS sync is push-only: this zone cannot act as a data-sync source.
+  bool supports_data_export() override { return false; }
+  int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) override;
+};
diff --git a/src/rgw/driver/rados/rgw_sync_module_es.cc b/src/rgw/driver/rados/rgw_sync_module_es.cc
new file mode 100644
index 000000000..4e8eb6201
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_sync_module_es.cc
@@ -0,0 +1,962 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_b64.h"
+#include "rgw_common.h"
+#include "rgw_coroutine.h"
+#include "rgw_sync_module.h"
+#include "rgw_data_sync.h"
+#include "rgw_sync_module_es.h"
+#include "rgw_sync_module_es_rest.h"
+#include "rgw_rest_conn.h"
+#include "rgw_cr_rest.h"
+#include "rgw_op.h"
+#include "rgw_es_query.h"
+#include "rgw_zone.h"
+
+#include "services/svc_zone.h"
+
+#include "include/str_list.h"
+
+#include <boost/asio/yield.hpp>
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+/*
+ * allowlist utility. Config string is a list of entries, where an entry is either an item,
+ * a prefix, or a suffix. An item would be the name of the entity that we'd look up,
+ * a prefix would be a string ending with an asterisk, a suffix would be a string starting
+ * with an asterisk. For example:
+ *
+ * bucket1, bucket2, foo*, *bar
+ */
+class ItemList {
+  bool approve_all{false};
+
+  set<string> entries;
+  set<string> prefixes;
+  set<string> suffixes;
+
+  // Split a comma-separated config string into exact entries, prefixes
+  // ("foo*") and suffixes ("*bar").  A bare "*" approves everything.
+  void parse(const string& str) {
+    list<string> l;
+
+    get_str_list(str, ",", l);
+
+    for (auto& entry : l) {
+      entry = rgw_trim_whitespace(entry);
+      if (entry.empty()) {
+        continue;
+      }
+
+      if (entry == "*") {
+        approve_all = true;
+        return;
+      }
+
+      if (entry[0] == '*') {
+        suffixes.insert(entry.substr(1));
+        continue;
+      }
+
+      if (entry.back() == '*') {
+        prefixes.insert(entry.substr(0, entry.size() - 1));
+        continue;
+      }
+
+      entries.insert(entry);
+    }
+  }
+
+public:
+  ItemList() {}
+  void init(const string& str, bool def_val) {
+    if (str.empty()) {
+      approve_all = def_val;
+    } else {
+      parse(str);
+    }
+  }
+
+  // Return true if 'entry' is approved: exact match, or matches any
+  // configured prefix or suffix pattern.
+  bool exists(const string& entry) {
+    if (approve_all) {
+      return true;
+    }
+
+    if (entries.find(entry) != entries.end()) {
+      return true;
+    }
+
+    /* FIX: the previous code checked only the single lexicographic
+     * predecessor of 'entry' (upper_bound then --), which misses a matching
+     * prefix whenever a non-matching prefix sorts between it and 'entry'
+     * (e.g. prefixes {"foo", "fooa"} with entry "foobar").  Scan all
+     * prefixes, as the suffix check below already does. */
+    for (auto i = prefixes.begin(); i != prefixes.end(); ++i) {
+      if (boost::algorithm::starts_with(entry, *i)) {
+        return true;
+      }
+    }
+
+    for (auto i = suffixes.begin(); i != suffixes.end(); ++i) {
+      if (boost::algorithm::ends_with(entry, *i)) {
+        return true;
+      }
+    }
+
+    return false;
+  }
+};
+
+#define ES_NUM_SHARDS_MIN 5
+
+#define ES_NUM_SHARDS_DEFAULT 16
+#define ES_NUM_REPLICAS_DEFAULT 1
+
+using ESVersion = std::pair<int,int>;
+static constexpr ESVersion ES_V5{5,0};
+static constexpr ESVersion ES_V7{7,0};
+
+struct ESInfo {
+  // Identity/version information returned by "GET /" on the Elasticsearch
+  // endpoint; 'version' selects the API/mapping dialect used elsewhere.
+  std::string name;
+  std::string cluster_name;
+  std::string cluster_uuid;
+  ESVersion version;   // (major, minor)
+
+  void decode_json(JSONObj *obj);
+
+  // "major.minor" rendering, e.g. "7.10"
+  std::string get_version_str(){
+    return std::to_string(version.first) + "." + std::to_string(version.second);
+  }
+};
+
+// simple wrapper structure to wrap the es version nested type
+struct es_version_decoder {
+  ESVersion version;
+
+  // Parse "major.minor[...]" into 'version'.  Returns 0 on success,
+  // negative error code otherwise.
+  int parse_version(const std::string& s) {
+    int major = 0, minor = 0;
+    /* FIX: sscanf() returns the number of items assigned (>= 0) and is only
+     * negative on input failure, so the previous 'ret < 0' check accepted
+     * malformed strings and left major/minor uninitialized.  Require both
+     * fields to have matched. */
+    if (sscanf(s.c_str(), "%d.%d", &major, &minor) != 2) {
+      return -EINVAL;
+    }
+    version = std::make_pair(major, minor);
+    return 0;
+  }
+
+  void decode_json(JSONObj *obj) {
+    std::string s;
+    JSONDecoder::decode_json("number",s,obj);
+    if (parse_version(s) < 0)
+      throw JSONDecoder::err("Failed to parse ElasticVersion");
+  }
+};
+
+
+void ESInfo::decode_json(JSONObj *obj)
+{
+  // Decode the "GET /" response body; the nested "version" object is parsed
+  // through es_version_decoder into a (major, minor) pair.
+  JSONDecoder::decode_json("name", name, obj);
+  JSONDecoder::decode_json("cluster_name", cluster_name, obj);
+  JSONDecoder::decode_json("cluster_uuid", cluster_uuid, obj);
+  es_version_decoder esv;
+  JSONDecoder::decode_json("version", esv, obj);
+  version = std::move(esv.version);
+}
+
+struct ElasticConfig {
+  uint64_t sync_instance{0};
+  string id;
+  string index_path;
+  std::unique_ptr<RGWRESTConn> conn;
+  bool explicit_custom_meta{true};
+  string override_index_path;
+  ItemList index_buckets;
+  ItemList allow_owners;
+  uint32_t num_shards{0};
+  uint32_t num_replicas{0};
+  std::map <string,string> default_headers = {{ "Content-Type", "application/json" }};
+  ESInfo es_info;
+
+  /* Parse the sync-module JSON config: endpoint, optional basic-auth
+   * credentials, index sizing and the bucket/owner filter lists. */
+  void init(CephContext *cct, const JSONFormattable& config) {
+    string elastic_endpoint = config["endpoint"];
+    id = string("elastic:") + elastic_endpoint;
+    conn.reset(new RGWRESTConn(cct, (rgw::sal::Driver*)nullptr, id, { elastic_endpoint }, nullopt /* region */ ));
+    explicit_custom_meta = config["explicit_custom_meta"](true);
+    index_buckets.init(config["index_buckets_list"], true); /* approve all buckets by default */
+    allow_owners.init(config["approved_owners_list"], true); /* approve all bucket owners by default */
+    override_index_path = config["override_index_path"];
+    num_shards = config["num_shards"](ES_NUM_SHARDS_DEFAULT);
+    if (num_shards < ES_NUM_SHARDS_MIN) {
+      num_shards = ES_NUM_SHARDS_MIN;
+    }
+    num_replicas = config["num_replicas"](ES_NUM_REPLICAS_DEFAULT);
+    if (string user = config["username"], pw = config["password"];
+        !user.empty() && !pw.empty()) {
+      auto auth_string = user + ":" + pw;
+      default_headers.emplace("AUTHORIZATION", "Basic " + rgw::to_base64(auth_string));
+    }
+  }
+
+  /* Derive the index name from the realm name plus the sync instance id,
+   * unless an explicit override path was configured. */
+  void init_instance(const RGWRealm& realm, uint64_t instance_id) {
+    sync_instance = instance_id;
+
+    if (!override_index_path.empty()) {
+      index_path = override_index_path;
+      return;
+    }
+
+    char buf[32];
+    snprintf(buf, sizeof(buf), "-%08x", (uint32_t)(sync_instance & 0xFFFFFFFF));
+
+    index_path = "/rgw-" + realm.get_name() + buf;
+  }
+
+  string get_index_path() {
+    return index_path;
+  }
+
+  map<string, string>& get_request_headers() {
+    return default_headers;
+  }
+
+  /* Document path for an object: "<index>/<type>/<urlencoded doc id>".
+   * FIX: removed a stray ';' statement and deduplicated the doc-id
+   * expression between the two version branches. */
+  string get_obj_path(const RGWBucketInfo& bucket_info, const rgw_obj_key& key) {
+    string doc_id = url_encode(bucket_info.bucket.bucket_id + ":" + key.name + ":" +
+                               (key.instance.empty() ? "null" : key.instance));
+    if (es_info.version >= ES_V7) {
+      /* ES >= 7 removed mapping types; use the _doc endpoint */
+      return index_path + "/_doc/" + doc_id;
+    } else {
+      return index_path + "/object/" + doc_id;
+    }
+  }
+
+  /* Index the operation only if both the bucket and its owner pass the
+   * configured allow-lists. */
+  bool should_handle_operation(RGWBucketInfo& bucket_info) {
+    return index_buckets.exists(bucket_info.bucket.name) &&
+           allow_owners.exists(bucket_info.owner.to_str());
+  }
+};
+
+using ElasticConfigRef = std::shared_ptr<ElasticConfig>;
+
+// Map our ESType enum to the Elasticsearch mapping-type string.
+static const char *es_type_to_str(const ESType& t) {
+  switch (t) {
+  case ESType::String: return "string";
+  case ESType::Text: return "text";
+  case ESType::Keyword: return "keyword";
+  case ESType::Long: return "long";
+  case ESType::Integer: return "integer";
+  case ESType::Short: return "short";
+  case ESType::Byte: return "byte";
+  case ESType::Double: return "double";
+  case ESType::Float: return "float";
+  case ESType::Half_Float: return "half_float";
+  case ESType::Scaled_Float: return "scaled_float";
+  case ESType::Date: return "date";
+  case ESType::Boolean: return "boolean";
+  case ESType::Integer_Range: return "integer_range";
+  case ESType::Float_Range: return "float_range";
+  case ESType::Double_Range: return "double_range"; /* FIX: was "date_range" (copy/paste) */
+  case ESType::Date_Range: return "date_range";
+  case ESType::Geo_Point: return "geo_point";
+  case ESType::Ip: return "ip";
+  default:
+    return "<unknown>";
+  }
+}
+
+struct es_type_v2 {
+  // Field-type descriptor for the Elasticsearch 2.x mapping dialect, where
+  // "string" fields carry an explicit index=analyzed/not_analyzed attribute.
+  ESType estype;
+  const char *format{nullptr};       // optional "format" attribute (dates)
+  std::optional<bool> analyzed;      // tri-state: unset = use type default
+
+  es_type_v2(ESType et) : estype(et) {}
+
+  void dump(Formatter *f) const {
+    const char *type_str = es_type_to_str(estype);
+    encode_json("type", type_str, f);
+    if (format) {
+      encode_json("format", format, f);
+    }
+
+    auto is_analyzed = analyzed;
+
+    // string fields default to not_analyzed (exact match) unless the caller
+    // explicitly asked otherwise
+    if (estype == ESType::String &&
+        !is_analyzed) {
+      is_analyzed = false;
+    }
+
+    // an engaged optional (whether true or false) emits the "index" attribute
+    if (is_analyzed) {
+      encode_json("index", (is_analyzed.value() ? "analyzed" : "not_analyzed"), f);
+    }
+  }
+};
+
+struct es_type_v5 {
+  // Field-type descriptor for the Elasticsearch 5.x+ mapping dialect, where
+  // the 2.x "string" type split into "text" (analyzed) and "keyword".
+  ESType estype;
+  const char *format{nullptr};       // optional "format" attribute (dates)
+  std::optional<bool> analyzed;      // only consulted for String fields
+  std::optional<bool> index;
+
+  es_type_v5(ESType et) : estype(et) {}
+
+  void dump(Formatter *f) const {
+    // translate the legacy String type into text/keyword
+    ESType new_estype;
+    if (estype != ESType::String) {
+      new_estype = estype;
+    } else {
+      bool is_analyzed = analyzed.value_or(false);
+      new_estype = (is_analyzed ? ESType::Text : ESType::Keyword);
+      /* index = true; ... Not setting index=true, because that's the default,
+       * and dumping a boolean value *might* be a problem when backporting this
+       * because value might get quoted
+       */
+    }
+
+    const char *type_str = es_type_to_str(new_estype);
+    encode_json("type", type_str, f);
+    if (format) {
+      encode_json("format", format, f);
+    }
+    if (index) {
+      encode_json("index", index.value(), f);
+    }
+  }
+};
+
+template <class T>
+struct es_type : public T {
+  // Thin builder over a versioned field-type descriptor (es_type_v2/v5):
+  // enables chained configuration, e.g. est(ESType::Date).set_format(...).
+  es_type(T t) : T(t) {}
+
+  es_type& set_format(const char *fmt) {
+    T::format = fmt;
+    return *this;
+  }
+
+  es_type& set_analyzed(bool flag) {
+    T::analyzed = flag;
+    return *this;
+  }
+};
+
+template <class T>
+struct es_index_mappings {
+  // Emits the JSON mapping definition for the rgw object index, using the
+  // versioned descriptor T (es_type_v2 or es_type_v5) to render field types.
+  ESVersion es_version;
+  ESType string_type {ESType::String};
+
+  es_index_mappings(ESVersion esv):es_version(esv) {
+  }
+
+  es_type<T> est(ESType t) const {
+    return es_type<T>(t);
+  }
+
+  // Emit a nested name/value section for one class of custom user metadata
+  // (string/int/date), with an optional date format string.
+  void dump_custom(const char *section, ESType type, const char *format, Formatter *f) const {
+    f->open_object_section(section);
+    ::encode_json("type", "nested", f);
+    f->open_object_section("properties");
+    encode_json("name", est(string_type), f);
+    encode_json("value", est(type).set_format(format), f);
+    f->close_section(); // entry
+    f->close_section(); // custom-string
+  }
+
+  void dump(Formatter *f) const {
+    // older servers wrap the mapping in a (mapping-)type object section.
+    // NOTE(review): the guard is '<= ES_V7', i.e. exactly 7.0 is also
+    // wrapped — confirm against the supported ES version matrix
+    if (es_version <= ES_V7)
+      f->open_object_section("object");
+    f->open_object_section("properties");
+    encode_json("bucket", est(string_type), f);
+    encode_json("name", est(string_type), f);
+    encode_json("instance", est(string_type), f);
+    encode_json("versioned_epoch", est(ESType::Long), f);
+    f->open_object_section("meta");
+    f->open_object_section("properties");
+    encode_json("cache_control", est(string_type), f);
+    encode_json("content_disposition", est(string_type), f);
+    encode_json("content_encoding", est(string_type), f);
+    encode_json("content_language", est(string_type), f);
+    encode_json("content_type", est(string_type), f);
+    encode_json("storage_class", est(string_type), f);
+    encode_json("etag", est(string_type), f);
+    encode_json("expires", est(string_type), f);
+    encode_json("mtime", est(ESType::Date)
+                         .set_format("strict_date_optional_time||epoch_millis"), f);
+    encode_json("size", est(ESType::Long), f);
+    dump_custom("custom-string", string_type, nullptr, f);
+    dump_custom("custom-int", ESType::Long, nullptr, f);
+    dump_custom("custom-date", ESType::Date, "strict_date_optional_time||epoch_millis", f);
+    f->close_section(); // properties
+    f->close_section(); // meta
+    f->close_section(); // properties
+
+    if (es_version <= ES_V7)
+      f->close_section(); // object
+  }
+};
+
+// Index-level "settings" object of the ES create-index request.
+struct es_index_settings {
+  uint32_t num_replicas;
+  uint32_t num_shards;
+
+  es_index_settings(uint32_t _replicas, uint32_t _shards) : num_replicas(_replicas), num_shards(_shards) {}
+
+  void dump(Formatter *f) const {
+    encode_json("number_of_replicas", num_replicas, f);
+    encode_json("number_of_shards", num_shards, f);
+  }
+};
+
+// Abstract base so the version-specific es_index_config<T> instantiations
+// can be held behind one pointer and serialized polymorphically.
+struct es_index_config_base {
+  virtual ~es_index_config_base() {}
+  virtual void dump(Formatter *f) const = 0;
+};
+
+// Full index-creation request body: "settings" + "mappings".  T selects the
+// per-version field type descriptor (es_type_v2 or es_type_v5).
+template <class T>
+struct es_index_config : public es_index_config_base {
+  es_index_settings settings;
+  es_index_mappings<T> mappings;
+
+  es_index_config(es_index_settings& _s, ESVersion esv) : settings(_s), mappings(esv) {
+  }
+
+  // `override` makes the compiler verify this keeps matching the pure
+  // virtual es_index_config_base::dump() signature.
+  void dump(Formatter *f) const override {
+    encode_json("settings", settings, f);
+    encode_json("mappings", mappings, f);
+  }
+};
+
+// Returns true for internal RGW xattrs that must not be exposed through the
+// metadata search index.
+static bool is_sys_attr(const std::string& attr_name){
+  static constexpr std::initializer_list<const char*> rgw_sys_attrs =
+    {RGW_ATTR_PG_VER,
+     RGW_ATTR_SOURCE_ZONE,
+     RGW_ATTR_ID_TAG,
+     RGW_ATTR_TEMPURL_KEY1,
+     RGW_ATTR_TEMPURL_KEY2,
+     RGW_ATTR_UNIX1,
+     RGW_ATTR_UNIX_KEY1
+    };
+
+  for (const char *sys_attr : rgw_sys_attrs) {
+    if (attr_name == sys_attr) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Length of an xattr value, excluding a single trailing NUL terminator if
+// one is present (RGW stores many string attrs NUL-terminated).
+static size_t attr_len(const bufferlist& val)
+{
+  const size_t raw_len = val.length();
+  const bool has_trailing_nul = (raw_len > 0 && val[raw_len - 1] == '\0');
+  return has_trailing_nul ? raw_len - 1 : raw_len;
+}
+
+/* Everything known about one RGW object, serialized via dump() into the
+ * JSON document that gets PUT into the Elasticsearch index. */
+struct es_obj_metadata {
+  // BUGFIX: dpp was never set by the constructor but is read through
+  // ldpp_dout() in dump()'s error paths; default it to nullptr so the read
+  // is well-defined (ldpp_dout skips output on a null provider).
+  // TODO(review): plumb a real DoutPrefixProvider through the constructor.
+  const DoutPrefixProvider *dpp{nullptr};
+  CephContext *cct;
+  ElasticConfigRef es_conf;
+  RGWBucketInfo bucket_info;
+  rgw_obj_key key;
+  ceph::real_time mtime;
+  uint64_t size;
+  map<string, bufferlist> attrs;
+  uint64_t versioned_epoch;
+
+  // NOTE: _attrs is moved from; the caller's map is left empty.
+  es_obj_metadata(CephContext *_cct, ElasticConfigRef _es_conf, const RGWBucketInfo& _bucket_info,
+                  const rgw_obj_key& _key, ceph::real_time& _mtime, uint64_t _size,
+                  map<string, bufferlist>& _attrs, uint64_t _versioned_epoch) : cct(_cct), es_conf(_es_conf), bucket_info(_bucket_info), key(_key),
+                                                                               mtime(_mtime), size(_size), attrs(std::move(_attrs)), versioned_epoch(_versioned_epoch) {}
+
+  void dump(Formatter *f) const {
+    map<string, string> out_attrs;
+    map<string, string> custom_meta;
+    RGWAccessControlPolicy policy;
+    set<string> permissions;
+    RGWObjTags obj_tags;
+
+    // Iterate by value: bufferlist::c_str()/decoding below need a mutable
+    // copy since dump() is const.
+    for (auto i : attrs) {
+      const string& attr_name = i.first;
+      bufferlist& val = i.second;
+
+      if (!boost::algorithm::starts_with(attr_name, RGW_ATTR_PREFIX)) {
+        continue;
+      }
+
+      // user.rgw.x-amz-meta-* -> custom (searchable) metadata
+      if (boost::algorithm::starts_with(attr_name, RGW_ATTR_META_PREFIX)) {
+        custom_meta.emplace(attr_name.substr(sizeof(RGW_ATTR_META_PREFIX) - 1),
+                            string(val.c_str(), attr_len(val)));
+        continue;
+      }
+
+      // never index encryption material
+      if (boost::algorithm::starts_with(attr_name, RGW_ATTR_CRYPT_PREFIX)) {
+        continue;
+      }
+
+      if (boost::algorithm::starts_with(attr_name, RGW_ATTR_OLH_PREFIX)) {
+        // skip versioned object olh info
+        continue;
+      }
+
+      if (attr_name == RGW_ATTR_ACL) {
+        try {
+          auto i = val.cbegin();
+          decode(policy, i);
+        } catch (buffer::error& err) {
+          ldpp_dout(dpp, 0) << "ERROR: failed to decode acl for " << bucket_info.bucket << "/" << key << dendl;
+          continue;
+        }
+
+        const RGWAccessControlList& acl = policy.get_acl();
+
+        // owner plus every canonical user holding READ gets search access
+        permissions.insert(policy.get_owner().get_id().to_str());
+        for (auto acliter : acl.get_grant_map()) {
+          const ACLGrant& grant = acliter.second;
+          if (grant.get_type().get_type() == ACL_TYPE_CANON_USER &&
+              ((uint32_t)grant.get_permission().get_permissions() & RGW_PERM_READ) != 0) {
+            rgw_user user;
+            if (grant.get_id(user)) {
+              permissions.insert(user.to_str());
+            }
+          }
+        }
+      } else if (attr_name == RGW_ATTR_TAGS) {
+        try {
+          auto tags_bl = val.cbegin();
+          decode(obj_tags, tags_bl);
+        } catch (buffer::error& err) {
+          ldpp_dout(dpp, 0) << "ERROR: failed to decode obj tags for "
+                            << bucket_info.bucket << "/" << key << dendl;
+          continue;
+        }
+      } else if (attr_name == RGW_ATTR_COMPRESSION) {
+        RGWCompressionInfo cs_info;
+        try {
+          auto vals_bl = val.cbegin();
+          decode(cs_info, vals_bl);
+        } catch (buffer::error& err) {
+          ldpp_dout(dpp, 0) << "ERROR: failed to decode compression attr for "
+                            << bucket_info.bucket << "/" << key << dendl;
+          continue;
+        }
+        out_attrs.emplace("compression",std::move(cs_info.compression_type));
+      } else {
+        if (!is_sys_attr(attr_name)) {
+          out_attrs.emplace(attr_name.substr(sizeof(RGW_ATTR_PREFIX) - 1),
+                            std::string(val.c_str(), attr_len(val)));
+        }
+      }
+    }
+    ::encode_json("bucket", bucket_info.bucket.name, f);
+    ::encode_json("name", key.name, f);
+    string instance = key.instance;
+    if (instance.empty())
+      instance = "null";
+    ::encode_json("instance", instance, f);
+    ::encode_json("versioned_epoch", versioned_epoch, f);
+    ::encode_json("owner", policy.get_owner(), f);
+    ::encode_json("permissions", permissions, f);
+    f->open_object_section("meta");
+    ::encode_json("size", size, f);
+
+    string mtime_str;
+    rgw_to_iso8601(mtime, &mtime_str);
+    ::encode_json("mtime", mtime_str, f);
+    for (const auto& i : out_attrs) {
+      ::encode_json(i.first.c_str(), i.second, f);
+    }
+    map<string, string> custom_str;
+    map<string, string> custom_int;
+    map<string, string> custom_date;
+
+    // Split user metadata by the bucket's mdsearch config so each entry
+    // lands in the custom-* section with the matching ES type.
+    for (const auto& i : custom_meta) {
+      auto config = bucket_info.mdsearch_config.find(i.first);
+      if (config == bucket_info.mdsearch_config.end()) {
+        if (!es_conf->explicit_custom_meta) {
+          /* default custom meta is of type string */
+          custom_str[i.first] = i.second;
+        } else {
+          ldpp_dout(dpp, 20) << "custom meta entry key=" << i.first << " not found in bucket mdsearch config: " << bucket_info.mdsearch_config << dendl;
+        }
+        continue;
+      }
+      switch (config->second) {
+        case ESEntityTypeMap::ES_ENTITY_DATE:
+          custom_date[i.first] = i.second;
+          break;
+        case ESEntityTypeMap::ES_ENTITY_INT:
+          custom_int[i.first] = i.second;
+          break;
+        default:
+          custom_str[i.first] = i.second;
+      }
+    }
+
+    if (!custom_str.empty()) {
+      f->open_array_section("custom-string");
+      for (const auto& i : custom_str) {
+        f->open_object_section("entity");
+        ::encode_json("name", i.first.c_str(), f);
+        ::encode_json("value", i.second, f);
+        f->close_section();
+      }
+      f->close_section();
+    }
+    if (!custom_int.empty()) {
+      f->open_array_section("custom-int");
+      for (const auto& i : custom_int) {
+        f->open_object_section("entity");
+        ::encode_json("name", i.first.c_str(), f);
+        ::encode_json("value", i.second, f);
+        f->close_section();
+      }
+      f->close_section();
+    }
+    if (!custom_date.empty()) {
+      f->open_array_section("custom-date");
+      for (const auto& i : custom_date) {
+        /*
+         * try to explicitly parse date field, otherwise elasticsearch could reject the whole doc,
+         * which will end up with failed sync
+         */
+        real_time t;
+        int r = parse_time(i.second.c_str(), &t);
+        if (r < 0) {
+          ldpp_dout(dpp, 20) << __func__ << "(): failed to parse time (" << i.second << "), skipping encoding of custom date attribute" << dendl;
+          continue;
+        }
+
+        string time_str;
+        rgw_to_iso8601(t, &time_str);
+
+        f->open_object_section("entity");
+        ::encode_json("name", i.first.c_str(), f);
+        ::encode_json("value", time_str.c_str(), f);
+        f->close_section();
+      }
+      f->close_section();
+    }
+    f->close_section(); // meta
+    const auto& m = obj_tags.get_tags();
+    if (m.size() > 0){
+      f->open_array_section("tagging");
+      for (const auto &it : m) {
+        f->open_object_section("tag");
+        ::encode_json("key", it.first, f);
+        ::encode_json("value",it.second, f);
+        f->close_section();
+      }
+      f->close_section(); // tagging
+    }
+  }
+};
+
+// Coroutine: GET / on the ES endpoint and cache the server metadata
+// (notably the version) into conf->es_info.
+class RGWElasticGetESInfoCBCR : public RGWCoroutine {
+public:
+  RGWElasticGetESInfoCBCR(RGWDataSyncCtx *_sc,
+                          ElasticConfigRef _conf) : RGWCoroutine(_sc->cct),
+                                                    sc(_sc), sync_env(_sc->env),
+                                                    conf(_conf) {}
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      ldpp_dout(dpp, 5) << conf->id << ": get elasticsearch info for zone: " << sc->source_zone << dendl;
+      // The root endpoint ("/") returns cluster info including the version.
+      yield call(new RGWReadRESTResourceCR<ESInfo> (sync_env->cct,
+                                                    conf->conn.get(),
+                                                    sync_env->http_manager,
+                                                    "/", nullptr /*params*/,
+                                                    &(conf->default_headers),
+                                                    &(conf->es_info)));
+      if (retcode < 0) {
+        ldpp_dout(dpp, 5) << conf->id << ": get elasticsearch failed: " << retcode << dendl;
+        return set_cr_error(retcode);
+      }
+
+      ldpp_dout(dpp, 5) << conf->id << ": got elastic version=" << conf->es_info.get_version_str() << dendl;
+      return set_cr_done();
+    }
+    return 0;
+  }
+private:
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+  ElasticConfigRef conf;
+};
+
+// Coroutine: create the ES index (settings + mappings) for this zone.  An
+// "already exists" error from ES is treated as success so externally
+// pre-created indexes keep working.
+class RGWElasticPutIndexCBCR : public RGWCoroutine {
+public:
+  RGWElasticPutIndexCBCR(RGWDataSyncCtx *_sc,
+                         ElasticConfigRef _conf) : RGWCoroutine(_sc->cct),
+                                                   sc(_sc), sync_env(_sc->env),
+                                                   conf(_conf) {}
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      ldpp_dout(dpp, 5) << conf->id << ": put elasticsearch index for zone: " << sc->source_zone << dendl;
+
+      yield {
+        string path = conf->get_index_path();
+        es_index_settings settings(conf->num_replicas, conf->num_shards);
+        std::unique_ptr<es_index_config_base> index_conf;
+
+        // The mapping format changed in ES 5 (string -> text/keyword), so
+        // pick the descriptor matching the version detected earlier.
+        if (conf->es_info.version >= ES_V5) {
+          ldpp_dout(dpp, 0) << "elasticsearch: index mapping: version >= 5" << dendl;
+          index_conf.reset(new es_index_config<es_type_v5>(settings, conf->es_info.version));
+        } else {
+          ldpp_dout(dpp, 0) << "elasticsearch: index mapping: version < 5" << dendl;
+          index_conf.reset(new es_index_config<es_type_v2>(settings, conf->es_info.version));
+        }
+        call(new RGWPutRESTResourceCR<es_index_config_base, int, _err_response> (sc->cct,
+                                                                                conf->conn.get(),
+                                                                                sync_env->http_manager,
+                                                                                path, nullptr /*params*/,
+                                                                                &(conf->default_headers),
+                                                                                *index_conf, nullptr, &err_response));
+      }
+      if (retcode < 0) {
+
+        // ES renamed the error type across versions; accept either spelling.
+        if (err_response.error.type != "index_already_exists_exception" &&
+            err_response.error.type != "resource_already_exists_exception") {
+          ldpp_dout(dpp, 0) << "elasticsearch: failed to initialize index: response.type=" << err_response.error.type << " response.reason=" << err_response.error.reason << dendl;
+          return set_cr_error(retcode);
+        }
+
+        ldpp_dout(dpp, 0) << "elasticsearch: index already exists, assuming external initialization" << dendl;
+      }
+      return set_cr_done();
+    }
+    return 0;
+  }
+
+private:
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+  ElasticConfigRef conf;
+
+  // JSON shape of an ES error response body.
+  struct _err_response {
+    struct err_reason {
+      vector<err_reason> root_cause;
+      string type;
+      string reason;
+      string index;
+
+      void decode_json(JSONObj *obj) {
+        JSONDecoder::decode_json("root_cause", root_cause, obj);
+        JSONDecoder::decode_json("type", type, obj);
+        JSONDecoder::decode_json("reason", reason, obj);
+        JSONDecoder::decode_json("index", index, obj);
+      }
+    } error;
+
+    void decode_json(JSONObj *obj) {
+      JSONDecoder::decode_json("error", error, obj);
+    }
+  } err_response;
+};
+
+// Coroutine run at sync-module init: detect the ES version, then create the
+// index for this zone.
+class RGWElasticInitConfigCBCR : public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+  ElasticConfigRef conf;
+
+public:
+  RGWElasticInitConfigCBCR(RGWDataSyncCtx *_sc,
+                           ElasticConfigRef _conf) : RGWCoroutine(_sc->cct),
+                                                     sc(_sc), sync_env(_sc->env),
+                                                     conf(_conf) {}
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+
+      // version detection must complete first: it decides the mapping format
+      yield call(new RGWElasticGetESInfoCBCR(sc, conf));
+
+      if (retcode < 0) {
+        return set_cr_error(retcode);
+      }
+
+      yield call(new RGWElasticPutIndexCBCR(sc, conf));
+      if (retcode < 0) {
+        return set_cr_error(retcode);
+      }
+      return set_cr_done();
+    }
+    return 0;
+  }
+
+};
+
+// Callback coroutine invoked after a remote-object stat: builds the JSON
+// metadata document and PUTs it into the ES index.  size/mtime/attrs are
+// inherited from RGWStatRemoteObjCBCR and filled in by the stat.
+class RGWElasticHandleRemoteObjCBCR : public RGWStatRemoteObjCBCR {
+  rgw_bucket_sync_pipe sync_pipe;
+  ElasticConfigRef conf;
+  uint64_t versioned_epoch;
+public:
+  RGWElasticHandleRemoteObjCBCR(RGWDataSyncCtx *_sc,
+                                rgw_bucket_sync_pipe& _sync_pipe, rgw_obj_key& _key,
+                                ElasticConfigRef _conf, uint64_t _versioned_epoch) : RGWStatRemoteObjCBCR(_sc, _sync_pipe.info.source_bs.bucket, _key),
+                                                                                     sync_pipe(_sync_pipe), conf(_conf),
+                                                                                     versioned_epoch(_versioned_epoch) {}
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      ldpp_dout(dpp, 10) << ": stat of remote obj: z=" << sc->source_zone
+                         << " b=" << sync_pipe.info.source_bs.bucket << " k=" << key
+                         << " size=" << size << " mtime=" << mtime << dendl;
+
+      yield {
+        string path = conf->get_obj_path(sync_pipe.dest_bucket_info, key);
+        // NOTE(review): doc.dpp is not wired here, so decode errors inside
+        // es_obj_metadata::dump() may not be logged — confirm intent.
+        es_obj_metadata doc(sync_env->cct, conf, sync_pipe.dest_bucket_info, key, mtime, size, attrs, versioned_epoch);
+
+        call(new RGWPutRESTResourceCR<es_obj_metadata, int>(sync_env->cct, conf->conn.get(),
+                                                            sync_env->http_manager,
+                                                            path, nullptr /* params */,
+                                                            &(conf->default_headers),
+                                                            doc, nullptr /* result */));
+
+      }
+      if (retcode < 0) {
+        return set_cr_error(retcode);
+      }
+      return set_cr_done();
+    }
+    return 0;
+  }
+};
+
+// Wrapper coroutine: stats the remote object, then hands the result to
+// RGWElasticHandleRemoteObjCBCR (allocated via allocate_callback).
+class RGWElasticHandleRemoteObjCR : public RGWCallStatRemoteObjCR {
+  rgw_bucket_sync_pipe sync_pipe;
+  ElasticConfigRef conf;
+  uint64_t versioned_epoch;
+public:
+  RGWElasticHandleRemoteObjCR(RGWDataSyncCtx *_sc,
+                              rgw_bucket_sync_pipe& _sync_pipe, rgw_obj_key& _key,
+                              ElasticConfigRef _conf, uint64_t _versioned_epoch) : RGWCallStatRemoteObjCR(_sc, _sync_pipe.info.source_bs.bucket, _key),
+                                                                                   sync_pipe(_sync_pipe),
+                                                                                   conf(_conf), versioned_epoch(_versioned_epoch) {
+  }
+
+  ~RGWElasticHandleRemoteObjCR() override {}
+
+  RGWStatRemoteObjCBCR *allocate_callback() override {
+    return new RGWElasticHandleRemoteObjCBCR(sc, sync_pipe, key, conf, versioned_epoch);
+  }
+};
+
+// Coroutine: DELETE the object's document from the ES index when the object
+// is removed on the source zone.
+class RGWElasticRemoveRemoteObjCBCR : public RGWCoroutine {
+  RGWDataSyncCtx *sc;
+  RGWDataSyncEnv *sync_env;
+  rgw_bucket_sync_pipe sync_pipe;
+  rgw_obj_key key;
+  ceph::real_time mtime;
+  ElasticConfigRef conf;
+public:
+  RGWElasticRemoveRemoteObjCBCR(RGWDataSyncCtx *_sc,
+                                rgw_bucket_sync_pipe& _sync_pipe, rgw_obj_key& _key, const ceph::real_time& _mtime,
+                                ElasticConfigRef _conf) : RGWCoroutine(_sc->cct), sc(_sc), sync_env(_sc->env),
+                                                          sync_pipe(_sync_pipe), key(_key),
+                                                          mtime(_mtime), conf(_conf) {}
+  int operate(const DoutPrefixProvider *dpp) override {
+    reenter(this) {
+      ldpp_dout(dpp, 10) << ": remove remote obj: z=" << sc->source_zone
+                         << " b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime << dendl;
+      yield {
+        string path = conf->get_obj_path(sync_pipe.dest_bucket_info, key);
+
+        call(new RGWDeleteRESTResourceCR(sync_env->cct, conf->conn.get(),
+                                         sync_env->http_manager,
+                                         path, nullptr /* params */));
+      }
+      if (retcode < 0) {
+        return set_cr_error(retcode);
+      }
+      return set_cr_done();
+    }
+    return 0;
+  }
+
+};
+
+// Data-sync backend that mirrors object metadata into Elasticsearch instead
+// of copying object data.  Returns nullptr from the per-op factories when an
+// operation should be skipped.
+class RGWElasticDataSyncModule : public RGWDataSyncModule {
+  ElasticConfigRef conf;
+public:
+  RGWElasticDataSyncModule(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config) : conf(std::make_shared<ElasticConfig>()) {
+    conf->init(cct, config);
+  }
+  ~RGWElasticDataSyncModule() override {}
+
+  void init(RGWDataSyncCtx *sc, uint64_t instance_id) override {
+    conf->init_instance(sc->env->svc->zone->get_realm(), instance_id);
+  }
+
+  RGWCoroutine *init_sync(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc) override {
+    ldpp_dout(dpp, 5) << conf->id << ": init" << dendl;
+    return new RGWElasticInitConfigCBCR(sc, conf);
+  }
+
+  RGWCoroutine *start_sync(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc) override {
+    ldpp_dout(dpp, 5) << conf->id << ": start_sync" << dendl;
+    // try to get elastic search version
+    return new RGWElasticGetESInfoCBCR(sc, conf);
+  }
+
+  RGWCoroutine *sync_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, std::optional<uint64_t> versioned_epoch, const rgw_zone_set_entry& source_trace_entry, rgw_zone_set *zones_trace) override {
+    ldpp_dout(dpp, 10) << conf->id << ": sync_object: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " versioned_epoch=" << versioned_epoch.value_or(0) << dendl;
+    if (!conf->should_handle_operation(sync_pipe.dest_bucket_info)) {
+      ldpp_dout(dpp, 10) << conf->id << ": skipping operation (bucket not approved)" << dendl;
+      return nullptr;
+    }
+    return new RGWElasticHandleRemoteObjCR(sc, sync_pipe, key, conf, versioned_epoch.value_or(0));
+  }
+  RGWCoroutine *remove_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override {
+    /* versioned and versioned epoch params are useless in the elasticsearch backend case */
+    ldpp_dout(dpp, 10) << conf->id << ": rm_object: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime << " versioned=" << versioned << " versioned_epoch=" << versioned_epoch << dendl;
+    if (!conf->should_handle_operation(sync_pipe.dest_bucket_info)) {
+      ldpp_dout(dpp, 10) << conf->id << ": skipping operation (bucket not approved)" << dendl;
+      return nullptr;
+    }
+    return new RGWElasticRemoveRemoteObjCBCR(sc, sync_pipe, key, mtime, conf);
+  }
+  RGWCoroutine *create_delete_marker(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime,
+                                     rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override {
+    ldpp_dout(dpp, 10) << conf->id << ": create_delete_marker: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime
+                       << " versioned=" << versioned << " versioned_epoch=" << versioned_epoch << dendl;
+    ldpp_dout(dpp, 10) << conf->id << ": skipping operation (not handled)" << dendl;
+    // nullptr (was NULL) for consistency with the rest of the file
+    return nullptr;
+  }
+  RGWRESTConn *get_rest_conn() {
+    return conf->conn.get();
+  }
+
+  string get_index_path() {
+    return conf->get_index_path();
+  }
+
+  map<string, string>& get_request_headers() {
+    return conf->get_request_headers();
+  }
+};
+
+// Owns the data handler for the lifetime of the instance.
+RGWElasticSyncModuleInstance::RGWElasticSyncModuleInstance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config)
+{
+  // std::make_unique instead of a naked new (exception-safe, same behavior)
+  data_handler = std::make_unique<RGWElasticDataSyncModule>(dpp, cct, config);
+}
+
+// Thin forwarding accessors: the instance delegates everything to its
+// RGWElasticDataSyncModule data handler.
+RGWDataSyncModule *RGWElasticSyncModuleInstance::get_data_handler()
+{
+  return data_handler.get();
+}
+
+RGWRESTConn *RGWElasticSyncModuleInstance::get_rest_conn()
+{
+  return data_handler->get_rest_conn();
+}
+
+string RGWElasticSyncModuleInstance::get_index_path() {
+  return data_handler->get_index_path();
+}
+
+map<string, string>& RGWElasticSyncModuleInstance::get_request_headers() {
+  return data_handler->get_request_headers();
+}
+
+// Replace the S3 REST dispatcher with the metadata-search one; other
+// dialects keep their original manager.  Takes ownership of (and frees)
+// the manager it replaces.
+RGWRESTMgr *RGWElasticSyncModuleInstance::get_rest_filter(int dialect, RGWRESTMgr *orig) {
+  if (dialect != RGW_REST_S3) {
+    return orig;
+  }
+  delete orig;
+  return new RGWRESTMgr_MDSearch_S3();
+}
+
+// Factory entry point: build an ES sync-module instance from the zone's
+// tier config.  (Removed unused local `endpoint`; the config is parsed by
+// ElasticConfig::init inside the instance constructor.)
+int RGWElasticSyncModule::create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) {
+  instance->reset(new RGWElasticSyncModuleInstance(dpp, cct, config));
+  return 0;
+}
+
diff --git a/src/rgw/driver/rados/rgw_sync_module_es.h b/src/rgw/driver/rados/rgw_sync_module_es.h
new file mode 100644
index 000000000..c8c9fcc43
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_sync_module_es.h
@@ -0,0 +1,59 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_sync_module.h"
+
+// Field datatypes understood by the Elasticsearch mapping generator; names
+// are converted to ES wire strings by es_type_to_str().
+enum class ESType {
+  /* string datatypes */
+  String, /* Deprecated Since 5.X+ (replaced by Text/Keyword) */
+  Text,
+  Keyword,
+
+  /* Numeric Types */
+  Long, Integer, Short, Byte, Double, Float, Half_Float, Scaled_Float,
+
+  /* Date Type */
+  Date,
+
+  /* Boolean */
+  Boolean,
+
+  /* Binary; Must Be Base64 Encoded */
+  Binary,
+
+  /* Range Types */
+  Integer_Range, Float_Range, Long_Range, Double_Range, Date_Range,
+
+  /* A Few Specialized Types */
+  Geo_Point,
+  Ip
+};
+
+
+// Sync-module plugin entry: registered under "elasticsearch".  It only
+// indexes metadata, so it does not export object data.
+class RGWElasticSyncModule : public RGWSyncModule {
+public:
+  RGWElasticSyncModule() {}
+  bool supports_data_export() override {
+    return false;
+  }
+  int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) override;
+};
+
+class RGWElasticDataSyncModule;
+class RGWRESTConn;
+
+// Per-zone instance of the ES sync module; owns the data handler and
+// exposes its REST connection/index path to the metadata-search REST layer.
+class RGWElasticSyncModuleInstance : public RGWSyncModuleInstance {
+  std::unique_ptr<RGWElasticDataSyncModule> data_handler;
+public:
+  RGWElasticSyncModuleInstance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config);
+  RGWDataSyncModule *get_data_handler() override;
+  RGWRESTMgr *get_rest_filter(int dialect, RGWRESTMgr *orig) override;
+  RGWRESTConn *get_rest_conn();
+  std::string get_index_path();
+  std::map<std::string, std::string>& get_request_headers();
+  // users may still write objects to the zone; only metadata is mirrored
+  bool supports_user_writes() override {
+    return true;
+  }
+};
diff --git a/src/rgw/driver/rados/rgw_sync_module_es_rest.cc b/src/rgw/driver/rados/rgw_sync_module_es_rest.cc
new file mode 100644
index 000000000..db9d48adb
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_sync_module_es_rest.cc
@@ -0,0 +1,428 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_sync_module_es.h"
+#include "rgw_sync_module_es_rest.h"
+#include "rgw_es_query.h"
+#include "rgw_op.h"
+#include "rgw_rest.h"
+#include "rgw_rest_s3.h"
+#include "rgw_sal_rados.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+// One object document as stored in the ES index (the "_source" of a search
+// hit); mirrors the layout produced by es_obj_metadata::dump().
+struct es_index_obj_response {
+  string bucket;
+  rgw_obj_key key;
+  uint64_t versioned_epoch{0};
+  ACLOwner owner;
+  set<string> read_permissions;
+
+  struct {
+    uint64_t size{0};
+    ceph::real_time mtime;
+    string etag;
+    string content_type;
+    string storage_class;
+    map<string, string> custom_str;
+    map<string, int64_t> custom_int;
+    map<string, string> custom_date;
+
+    // One {name, value} entry of a custom-* array in the indexed document.
+    template <class T>
+    struct _custom_entry {
+      string name;
+      T value;
+      void decode_json(JSONObj *obj) {
+        JSONDecoder::decode_json("name", name, obj);
+        JSONDecoder::decode_json("value", value, obj);
+      }
+    };
+
+    void decode_json(JSONObj *obj) {
+      JSONDecoder::decode_json("size", size, obj);
+      string mtime_str;
+      JSONDecoder::decode_json("mtime", mtime_str, obj);
+      // NOTE(review): parse_time failure is ignored here, leaving mtime
+      // default-initialized — confirm that's acceptable.
+      parse_time(mtime_str.c_str(), &mtime);
+      JSONDecoder::decode_json("etag", etag, obj);
+      JSONDecoder::decode_json("content_type", content_type, obj);
+      JSONDecoder::decode_json("storage_class", storage_class, obj);
+      // custom-* arrays are flattened into name->value maps
+      list<_custom_entry<string> > str_entries;
+      JSONDecoder::decode_json("custom-string", str_entries, obj);
+      for (auto& e : str_entries) {
+        custom_str[e.name] = e.value;
+      }
+      list<_custom_entry<int64_t> > int_entries;
+      JSONDecoder::decode_json("custom-int", int_entries, obj);
+      for (auto& e : int_entries) {
+        custom_int[e.name] = e.value;
+      }
+      list<_custom_entry<string> > date_entries;
+      JSONDecoder::decode_json("custom-date", date_entries, obj);
+      for (auto& e : date_entries) {
+        custom_date[e.name] = e.value;
+      }
+    }
+  } meta;
+
+  void decode_json(JSONObj *obj) {
+    JSONDecoder::decode_json("bucket", bucket, obj);
+    JSONDecoder::decode_json("name", key.name, obj);
+    JSONDecoder::decode_json("instance", key.instance, obj);
+    JSONDecoder::decode_json("versioned_epoch", versioned_epoch, obj);
+    JSONDecoder::decode_json("permissions", read_permissions, obj);
+    JSONDecoder::decode_json("owner", owner, obj);
+    JSONDecoder::decode_json("meta", meta, obj);
+  }
+};
+
+// JSON shape of an ES _search response: shard bookkeeping plus the list of
+// hits, each carrying an es_index_obj_response as its "_source".
+struct es_search_response {
+  uint32_t took;
+  bool timed_out;
+  struct {
+    uint32_t total;
+    uint32_t successful;
+    uint32_t failed;
+    void decode_json(JSONObj *obj) {
+      JSONDecoder::decode_json("total", total, obj);
+      JSONDecoder::decode_json("successful", successful, obj);
+      JSONDecoder::decode_json("failed", failed, obj);
+    }
+  } shards;
+  struct obj_hit {
+    string index;
+    string type;
+    string id;
+    // double score
+    es_index_obj_response source;
+    void decode_json(JSONObj *obj) {
+      JSONDecoder::decode_json("_index", index, obj);
+      JSONDecoder::decode_json("_type", type, obj);
+      JSONDecoder::decode_json("_id", id, obj);
+      JSONDecoder::decode_json("_source", source, obj);
+    }
+  };
+  struct {
+    uint32_t total;
+    // double max_score;
+    list<obj_hit> hits;
+    void decode_json(JSONObj *obj) {
+      JSONDecoder::decode_json("total", total, obj);
+      // JSONDecoder::decode_json("max_score", max_score, obj);
+      JSONDecoder::decode_json("hits", hits, obj);
+    }
+  } hits;
+  void decode_json(JSONObj *obj) {
+    JSONDecoder::decode_json("took", took, obj);
+    JSONDecoder::decode_json("timed_out", timed_out, obj);
+    JSONDecoder::decode_json("_shards", shards, obj);
+    JSONDecoder::decode_json("hits", hits, obj);
+  }
+};
+
+// Base REST op implementing the metadata-search query against the ES
+// backend; dialect subclasses supply get_params()/send_response().
+class RGWMetadataSearchOp : public RGWOp {
+  RGWSyncModuleInstanceRef sync_module_ref;
+  RGWElasticSyncModuleInstance *es_module;
+protected:
+  string expression;    // raw query expression from the client
+  string custom_prefix; // dialect prefix for custom attrs (e.g. x-amz-meta-)
+#define MAX_KEYS_DEFAULT 100
+  uint64_t max_keys{MAX_KEYS_DEFAULT};
+  string marker_str;    // pagination: offset into the result set
+  uint64_t marker{0};
+  string next_marker;
+  bool is_truncated{false};
+  string err;
+
+  es_search_response response;
+
+public:
+  RGWMetadataSearchOp(const RGWSyncModuleInstanceRef& sync_module) : sync_module_ref(sync_module) {
+    // the registered sync module is known to be the elasticsearch one
+    es_module = static_cast<RGWElasticSyncModuleInstance *>(sync_module_ref.get());
+  }
+
+  int verify_permission(optional_yield) override {
+    // per-object permissions are enforced via the "permissions" condition
+    // added to the query in execute()
+    return 0;
+  }
+  virtual int get_params() = 0;
+  void pre_exec() override;
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "metadata_search"; }
+  virtual RGWOpType get_type() override { return RGW_OP_METADATA_SEARCH; }
+  virtual uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
+};
+
+// Standard pre-exec hook: emits the usual bucket/object op log prefix.
+void RGWMetadataSearchOp::pre_exec()
+{
+  rgw_bucket_object_pre_exec(s);
+}
+
+// Compile the client's search expression into an ES query, POST it to the
+// index's _search endpoint, and decode the response into `response`.
+void RGWMetadataSearchOp::execute(optional_yield y)
+{
+  op_ret = get_params();
+  if (op_ret < 0)
+    return;
+
+  // Mandatory conditions ANDed into the query: non-system users may only
+  // see objects they can read, and bucket-scoped requests are restricted
+  // to that bucket.
+  list<pair<string, string> > conds;
+
+  if (!s->user->get_info().system) {
+    conds.push_back(make_pair("permissions", s->user->get_id().to_str()));
+  }
+
+  if (!s->bucket_name.empty()) {
+    conds.push_back(make_pair("bucket", s->bucket_name));
+  }
+
+  ESQueryCompiler es_query(expression, &conds, custom_prefix);
+
+  // client-friendly field names mapped onto the indexed document fields
+  static map<string, string, ltstr_nocase> aliases = {
+    { "bucket", "bucket" }, /* forces lowercase */
+    { "name", "name" },
+    { "key", "name" },
+    { "instance", "instance" },
+    { "etag", "meta.etag" },
+    { "size", "meta.size" },
+    { "mtime", "meta.mtime" },
+    { "lastmodified", "meta.mtime" },
+    { "last_modified", "meta.mtime" },
+    { "contenttype", "meta.content_type" },
+    { "content_type", "meta.content_type" },
+    { "storageclass", "meta.storage_class" },
+    { "storage_class", "meta.storage_class" },
+  };
+  es_query.set_field_aliases(&aliases);
+
+  static map<string, ESEntityTypeMap::EntityType> generic_map = { {"bucket", ESEntityTypeMap::ES_ENTITY_STR},
+                                                                  {"name", ESEntityTypeMap::ES_ENTITY_STR},
+                                                                  {"instance", ESEntityTypeMap::ES_ENTITY_STR},
+                                                                  {"permissions", ESEntityTypeMap::ES_ENTITY_STR},
+                                                                  {"meta.etag", ESEntityTypeMap::ES_ENTITY_STR},
+                                                                  {"meta.content_type", ESEntityTypeMap::ES_ENTITY_STR},
+                                                                  {"meta.mtime", ESEntityTypeMap::ES_ENTITY_DATE},
+                                                                  {"meta.size", ESEntityTypeMap::ES_ENTITY_INT},
+                                                                  {"meta.storage_class", ESEntityTypeMap::ES_ENTITY_STR} };
+  ESEntityTypeMap gm(generic_map);
+  es_query.set_generic_type_map(&gm);
+
+  // fields clients may not query directly
+  static set<string> restricted_fields = { {"permissions"} };
+  es_query.set_restricted_fields(&restricted_fields);
+
+  // per-bucket custom metadata types come from the bucket mdsearch config
+  map<string, ESEntityTypeMap::EntityType> custom_map;
+  for (auto& i : s->bucket->get_info().mdsearch_config) {
+    custom_map[i.first] = (ESEntityTypeMap::EntityType)i.second;
+  }
+
+  ESEntityTypeMap em(custom_map);
+  es_query.set_custom_type_map(&em);
+
+  bool valid = es_query.compile(&err);
+  if (!valid) {
+    ldpp_dout(this, 10) << "invalid query, failed generating request json" << dendl;
+    op_ret = -EINVAL;
+    return;
+  }
+
+  JSONFormatter f;
+  encode_json("root", es_query, &f);
+
+  RGWRESTConn *conn = es_module->get_rest_conn();
+
+  bufferlist in;
+  bufferlist out;
+
+  stringstream ss;
+
+  f.flush(ss);
+  in.append(ss.str());
+
+  // pagination is expressed through ES "size" and "from" query params
+  string resource = es_module->get_index_path() + "/_search";
+  param_vec_t params;
+  static constexpr int BUFSIZE = 32;
+  char buf[BUFSIZE];
+  snprintf(buf, sizeof(buf), "%lld", (long long)max_keys);
+  params.push_back(param_pair_t("size", buf));
+  if (marker > 0) {
+    params.push_back(param_pair_t("from", marker_str.c_str()));
+  }
+  ldpp_dout(this, 20) << "sending request to elasticsearch, payload=" << string(in.c_str(), in.length()) << dendl;
+  auto& extra_headers = es_module->get_request_headers();
+  op_ret = conn->get_resource(s, resource, &params, &extra_headers,
+                              out, &in, nullptr, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 0) << "ERROR: failed to fetch resource (r=" << resource << ", ret=" << op_ret << ")" << dendl;
+    return;
+  }
+
+  ldpp_dout(this, 20) << "response: " << string(out.c_str(), out.length()) << dendl;
+
+  JSONParser jparser;
+  if (!jparser.parse(out.c_str(), out.length())) {
+    ldpp_dout(this, 0) << "ERROR: failed to parse elasticsearch response" << dendl;
+    op_ret = -EINVAL;
+    return;
+  }
+
+  try {
+    decode_json_obj(response, &jparser);
+  } catch (const JSONDecoder::err& e) {
+    ldpp_dout(this, 0) << "ERROR: failed to decode JSON input: " << e.what() << dendl;
+    op_ret = -EINVAL;
+    return;
+  }
+
+}
+
+// S3-dialect front end for metadata search: parses query/max-keys/marker
+// request args and renders the ES hits as a SearchMetadataResponse document.
+class RGWMetadataSearch_ObjStore_S3 : public RGWMetadataSearchOp {
+public:
+  explicit RGWMetadataSearch_ObjStore_S3(const RGWSyncModuleInstanceRef& _sync_module) : RGWMetadataSearchOp(_sync_module) {
+    custom_prefix = "x-amz-meta-";
+  }
+
+  int get_params() override {
+    expression = s->info.args.get("query");
+    bool exists;
+    string max_keys_str = s->info.args.get("max-keys", &exists);
+#define MAX_KEYS_MAX 10000
+    if (exists) {
+      string err; // local parse error; intentionally shadows the member
+      max_keys = strict_strtoll(max_keys_str.c_str(), 10, &err);
+      if (!err.empty()) {
+        return -EINVAL;
+      }
+      if (max_keys > MAX_KEYS_MAX) {
+        max_keys = MAX_KEYS_MAX;
+      }
+    }
+    marker_str = s->info.args.get("marker", &exists);
+    if (exists) {
+      string err;
+      marker = strict_strtoll(marker_str.c_str(), 10, &err);
+      if (!err.empty()) {
+        return -EINVAL;
+      }
+    }
+    // the next page starts right after the current window
+    uint64_t nm = marker + max_keys;
+    static constexpr int BUFSIZE = 32;
+    char buf[BUFSIZE];
+    snprintf(buf, sizeof(buf), "%lld", (long long)nm);
+    next_marker = buf;
+    return 0;
+  }
+  void send_response() override {
+    if (op_ret) {
+      s->err.message = err;
+      set_req_state_err(s, op_ret);
+    }
+    dump_errno(s);
+    end_header(s, this, "application/xml");
+
+    if (op_ret < 0) {
+      return;
+    }
+
+    // a full page implies there may be more results
+    is_truncated = (response.hits.hits.size() >= max_keys);
+
+    s->formatter->open_object_section("SearchMetadataResponse");
+    s->formatter->dump_string("Marker", marker_str);
+    s->formatter->dump_string("IsTruncated", (is_truncated ? "true" : "false"));
+    if (is_truncated) {
+      s->formatter->dump_string("NextMarker", next_marker);
+    }
+    if (s->format == RGWFormat::JSON) {
+      s->formatter->open_array_section("Objects");
+    }
+    for (auto& i : response.hits.hits) {
+      s->formatter->open_object_section("Contents");
+      es_index_obj_response& e = i.source;
+      s->formatter->dump_string("Bucket", e.bucket);
+      s->formatter->dump_string("Key", e.key.name);
+      string instance = (!e.key.instance.empty() ? e.key.instance : "null");
+      s->formatter->dump_string("Instance", instance.c_str());
+      s->formatter->dump_int("VersionedEpoch", e.versioned_epoch);
+      dump_time(s, "LastModified", e.meta.mtime);
+      s->formatter->dump_int("Size", e.meta.size);
+      s->formatter->dump_format("ETag", "\"%s\"", e.meta.etag.c_str());
+      s->formatter->dump_string("ContentType", e.meta.content_type.c_str());
+      s->formatter->dump_string("StorageClass", e.meta.storage_class.c_str());
+      dump_owner(s, e.owner.get_id(), e.owner.get_display_name());
+      s->formatter->open_array_section("CustomMetadata");
+      for (auto& m : e.meta.custom_str) {
+        s->formatter->open_object_section("Entry");
+        s->formatter->dump_string("Name", m.first.c_str());
+        s->formatter->dump_string("Value", m.second);
+        s->formatter->close_section();
+      }
+      for (auto& m : e.meta.custom_int) {
+        s->formatter->open_object_section("Entry");
+        s->formatter->dump_string("Name", m.first.c_str());
+        s->formatter->dump_int("Value", m.second);
+        s->formatter->close_section();
+      }
+      for (auto& m : e.meta.custom_date) {
+        s->formatter->open_object_section("Entry");
+        s->formatter->dump_string("Name", m.first.c_str());
+        s->formatter->dump_string("Value", m.second);
+        s->formatter->close_section();
+      }
+      s->formatter->close_section();
+      rgw_flush_formatter(s, s->formatter);
+      s->formatter->close_section();
+    } // removed stray ';' (empty statement) that followed this brace
+    if (s->format == RGWFormat::JSON) {
+      s->formatter->close_section();
+    }
+    s->formatter->close_section();
+    rgw_flush_formatter_and_reset(s, s->formatter);
+  }
+};
+
+// REST handler routing GET requests to the metadata-search ops; only GET is
+// supported (HEAD/POST return nullptr -> method not allowed).
+class RGWHandler_REST_MDSearch_S3 : public RGWHandler_REST_S3 {
+protected:
+  RGWOp *op_get() override {
+    // ?query=... -> run a metadata search
+    if (s->info.args.exists("query")) {
+      return new RGWMetadataSearch_ObjStore_S3(driver->get_sync_module());
+    }
+    // bucket-level ?mdsearch -> fetch the bucket's mdsearch config
+    if (!s->init_state.url_bucket.empty() &&
+        s->info.args.exists("mdsearch")) {
+      return new RGWGetBucketMetaSearch_ObjStore_S3;
+    }
+    return nullptr;
+  }
+  RGWOp *op_head() override {
+    return nullptr;
+  }
+  RGWOp *op_post() override {
+    return nullptr;
+  }
+public:
+  explicit RGWHandler_REST_MDSearch_S3(const rgw::auth::StrategyRegistry& auth_registry) : RGWHandler_REST_S3(auth_registry) {}
+  virtual ~RGWHandler_REST_MDSearch_S3() {}
+};
+
+
+// Allocate the metadata-search REST handler for a request.  Only service-
+// or bucket-level requests are handled; object-level requests are rejected.
+RGWHandler_REST* RGWRESTMgr_MDSearch_S3::get_handler(rgw::sal::Driver* driver,
+                                                     req_state* const s,
+                                                     const rgw::auth::StrategyRegistry& auth_registry,
+                                                     const std::string& frontend_prefix)
+{
+  int ret =
+    RGWHandler_REST_S3::init_from_header(driver, s,
+                                         RGWFormat::XML, true);
+  if (ret < 0) {
+    return nullptr;
+  }
+
+  // metadata search never targets a specific object
+  if (!s->object->empty()) {
+    return nullptr;
+  }
+
+  RGWHandler_REST *handler = new RGWHandler_REST_MDSearch_S3(auth_registry);
+
+  ldpp_dout(s, 20) << __func__ << " handler=" << typeid(*handler).name()
+                   << dendl;
+  return handler;
+}
+
diff --git a/src/rgw/driver/rados/rgw_sync_module_es_rest.h b/src/rgw/driver/rados/rgw_sync_module_es_rest.h
new file mode 100644
index 000000000..b18271a69
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_sync_module_es_rest.h
@@ -0,0 +1,18 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_rest.h"
+
+class RGWElasticSyncModuleInstance;
+
+// REST manager that routes S3 metadata-search requests (backed by the
+// elasticsearch sync module) to their handler.
+class RGWRESTMgr_MDSearch_S3 : public RGWRESTMgr {
+public:
+  explicit RGWRESTMgr_MDSearch_S3() {}
+
+  // Returns a handler for service/bucket-level mdsearch requests, or
+  // nullptr when the request cannot be parsed or names an object.
+  RGWHandler_REST *get_handler(rgw::sal::Driver* driver,
+			       req_state* s,
+			       const rgw::auth::StrategyRegistry& auth_registry,
+			       const std::string& frontend_prefix) override;
+};
diff --git a/src/rgw/driver/rados/rgw_sync_module_log.cc b/src/rgw/driver/rados/rgw_sync_module_log.cc
new file mode 100644
index 000000000..9666ecc4c
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_sync_module_log.cc
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_common.h"
+#include "rgw_coroutine.h"
+#include "rgw_cr_rados.h"
+#include "rgw_sync_module.h"
+#include "rgw_data_sync.h"
+#include "rgw_sync_module_log.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+// Callback coroutine for the "log" sync module: invoked once the remote
+// object stat completes, it only logs the result and finishes.
+class RGWLogStatRemoteObjCBCR : public RGWStatRemoteObjCBCR {
+public:
+  RGWLogStatRemoteObjCBCR(RGWDataSyncCtx *_sc,
+                      rgw_bucket& _src_bucket, rgw_obj_key& _key) : RGWStatRemoteObjCBCR(_sc, _src_bucket, _key) {}
+  int operate(const DoutPrefixProvider *dpp) override {
+    // size/mtime/attrs come from the base class, filled in by the stat
+    ldpp_dout(dpp, 0) << "SYNC_LOG: stat of remote obj: z=" << sc->source_zone
+                      << " b=" << src_bucket << " k=" << key << " size=" << size << " mtime=" << mtime
+                      << " attrs=" << attrs << dendl;
+    return set_cr_done();
+  }
+
+};
+
+// Stat-remote-object coroutine whose only callback is the logging CBCR
+// above; used by RGWLogDataSyncModule::sync_object().
+class RGWLogStatRemoteObjCR : public RGWCallStatRemoteObjCR {
+public:
+  RGWLogStatRemoteObjCR(RGWDataSyncCtx *_sc,
+                      rgw_bucket& _src_bucket, rgw_obj_key& _key) : RGWCallStatRemoteObjCR(_sc, _src_bucket, _key) {
+  }
+
+  ~RGWLogStatRemoteObjCR() override {}
+
+  // factory hook called by the base class to create the completion callback
+  RGWStatRemoteObjCBCR *allocate_callback() override {
+    return new RGWLogStatRemoteObjCBCR(sc, src_bucket, key);
+  }
+};
+
+// Data-sync handler of the "log" sync module: every sync event is only
+// logged (at level 0); nothing is written to the local zone.
+class RGWLogDataSyncModule : public RGWDataSyncModule {
+  string prefix;  // configured prefix prepended to every log line
+public:
+  explicit RGWLogDataSyncModule(const string& _prefix) : prefix(_prefix) {}
+
+  // Logs the event, then schedules a remote stat (also purely for logging).
+  RGWCoroutine *sync_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, std::optional<uint64_t> versioned_epoch, const rgw_zone_set_entry& source_trace_entry, rgw_zone_set *zones_trace) override {
+    ldpp_dout(dpp, 0) << prefix << ": SYNC_LOG: sync_object: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " versioned_epoch=" << versioned_epoch.value_or(0) << dendl;
+    return new RGWLogStatRemoteObjCR(sc, sync_pipe.info.source_bs.bucket, key);
+  }
+  // Removals are logged only; returning NULL schedules no coroutine.
+  RGWCoroutine *remove_object(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override {
+    ldpp_dout(dpp, 0) << prefix << ": SYNC_LOG: rm_object: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime << " versioned=" << versioned << " versioned_epoch=" << versioned_epoch << dendl;
+    return NULL;
+  }
+  // Delete markers are likewise logged only.
+  RGWCoroutine *create_delete_marker(const DoutPrefixProvider *dpp, RGWDataSyncCtx *sc, rgw_bucket_sync_pipe& sync_pipe, rgw_obj_key& key, real_time& mtime,
+                                    rgw_bucket_entry_owner& owner, bool versioned, uint64_t versioned_epoch, rgw_zone_set *zones_trace) override {
+    ldpp_dout(dpp, 0) << prefix << ": SYNC_LOG: create_delete_marker: b=" << sync_pipe.info.source_bs.bucket << " k=" << key << " mtime=" << mtime
+                     << " versioned=" << versioned << " versioned_epoch=" << versioned_epoch << dendl;
+    return NULL;
+  }
+};
+
+// Per-zone instance of the "log" sync module; owns the data handler and
+// forwards its configured prefix to it.
+class RGWLogSyncModuleInstance : public RGWSyncModuleInstance {
+  RGWLogDataSyncModule data_handler;
+public:
+  explicit RGWLogSyncModuleInstance(const string& prefix) : data_handler(prefix) {}
+  RGWDataSyncModule *get_data_handler() override {
+    return &data_handler;
+  }
+};
+
+// Instantiate the "log" sync module.  The only configuration knob is an
+// optional "prefix" string prepended to every log line.  Always succeeds.
+int RGWLogSyncModule::create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) {
+  const string log_prefix = config["prefix"];
+  *instance = RGWSyncModuleInstanceRef(new RGWLogSyncModuleInstance(log_prefix));
+  return 0;
+}
+
diff --git a/src/rgw/driver/rados/rgw_sync_module_log.h b/src/rgw/driver/rados/rgw_sync_module_log.h
new file mode 100644
index 000000000..ab475959d
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_sync_module_log.h
@@ -0,0 +1,15 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_sync_module.h"
+
+// Sync module that merely logs sync events; useful for debugging multisite
+// data sync.  It never produces data for other zones to pull.
+class RGWLogSyncModule : public RGWSyncModule {
+public:
+  RGWLogSyncModule() {}
+  bool supports_data_export() override {
+    return false;
+  }
+  // Reads the optional "prefix" setting from 'config'; always returns 0.
+  int create_instance(const DoutPrefixProvider *dpp, CephContext *cct, const JSONFormattable& config, RGWSyncModuleInstanceRef *instance) override;
+};
diff --git a/src/rgw/driver/rados/rgw_sync_trace.cc b/src/rgw/driver/rados/rgw_sync_trace.cc
new file mode 100644
index 000000000..b34683593
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_sync_trace.cc
@@ -0,0 +1,290 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#ifndef CEPH_RGW_SYNC_TRACE_H
+#define CEPH_RGW_SYNC_TRACE_H
+
+#include <regex>
+
+#include "common/debug.h"
+#include "common/ceph_json.h"
+
+#include "rgw_sync_trace.h"
+#include "rgw_rados.h"
+#include "rgw_worker.h"
+
+#define dout_context g_ceph_context
+
+static constexpr auto dout_subsys = ceph_subsys_rgw;
+
+using namespace std;
+
+
+// Construct a trace node under '_parent' (may be empty for a root).  The
+// display prefix is the parent's prefix followed by "type[id]:"; history
+// capacity comes from rgw_sync_trace_per_node_log_size.
+RGWSyncTraceNode::RGWSyncTraceNode(CephContext *_cct, uint64_t _handle,
+                 const RGWSyncTraceNodeRef& _parent,
+                 const string& _type, const string& _id) : cct(_cct),
+                                                           parent(_parent),
+                                                           type(_type),
+                                                           id(_id),
+                                                           handle(_handle),
+                                                           history(cct->_conf->rgw_sync_trace_per_node_log_size)
+{
+  // inherit the ancestors' prefix so log lines show the full path
+  if (parent.get()) {
+    prefix = parent->get_prefix();
+  }
+
+  if (!type.empty()) {
+    prefix += type;
+    if (!id.empty()) {
+      prefix += "[" + id + "]";
+    }
+    prefix += ":";
+  }
+}
+
+// Record 's' as the node's current status, append it to the bounded
+// history, and emit it to the log exactly once (on rgw_sync when that
+// subsystem gathers at 'level', otherwise on rgw).
+// NOTE(review): status/history are updated without taking the node's
+// 'lock' member -- confirm callers serialize access to each node.
+void RGWSyncTraceNode::log(int level, const string& s)
+{
+  status = s;
+  history.push_back(status);
+  /* dump output on either rgw_sync, or rgw -- but only once */
+  if (cct->_conf->subsys.should_gather(ceph_subsys_rgw_sync, level)) {
+    lsubdout(cct, rgw_sync,
+             ceph::dout::need_dynamic(level)) << "RGW-SYNC:" << to_str() << dendl;
+  } else {
+    lsubdout(cct, rgw,
+             ceph::dout::need_dynamic(level)) << "RGW-SYNC:" << to_str() << dendl;
+  }
+}
+
+
+// Background thread that periodically publishes the set of active sync
+// entities to the cluster service map.
+class RGWSyncTraceServiceMapThread : public RGWRadosThread {
+  RGWRados *store;
+  RGWSyncTraceManager *manager;
+
+  // wake-up period, converted from the (seconds) config option
+  uint64_t interval_msec() override {
+    return cct->_conf->rgw_sync_trace_servicemap_update_interval * 1000;
+  }
+public:
+  RGWSyncTraceServiceMapThread(RGWRados *_store, RGWSyncTraceManager *_manager)
+    : RGWRadosThread(_store, "sync-trace"), store(_store), manager(_manager) {}
+
+  int process(const DoutPrefixProvider *dpp) override;
+};
+
+// One tick of the service-map thread: publish the active sync entities
+// under the "current_sync" key.  Failures are logged but not propagated so
+// the thread keeps running.
+int RGWSyncTraceServiceMapThread::process(const DoutPrefixProvider *dpp)
+{
+  map<string, string> status;
+  status["current_sync"] = manager->get_active_names();
+  int ret = store->update_service_map(dpp, std::move(status));
+  if (ret < 0) {
+    ldout(store->ctx(), 0) << "ERROR: update_service_map() returned ret=" << ret << dendl;
+  }
+  return 0;
+}
+
+// Create and register a new trace node under 'parent'.  The returned ref
+// is an aliasing shared_ptr whose deleter moves the node to the completed
+// list (via finish_node()) instead of destroying it; the manager's 'nodes'
+// map keeps the owning reference alive.
+RGWSyncTraceNodeRef RGWSyncTraceManager::add_node(const RGWSyncTraceNodeRef& parent,
+                                                  const std::string& type,
+                                                  const std::string& id)
+{
+  shunique_lock wl(lock, ceph::acquire_unique);
+  auto handle = alloc_handle();
+  RGWSyncTraceNodeRef& ref = nodes[handle];
+  ref.reset(new RGWSyncTraceNode(cct, handle, parent, type, id));
+  // return a separate shared_ptr that calls finish() on the node instead of
+  // deleting it. the lambda capture holds a reference to the original 'ref'
+  auto deleter = [ref, this] (RGWSyncTraceNode *node) { finish_node(node); };
+  return {ref.get(), deleter};
+}
+
+// Return true when 'search_term' (an ECMAScript regex) matches this node's
+// prefix, its current status, or -- when 'search_history' is set -- any
+// entry in its status history.  An invalid regex logs a notice and matches
+// nothing.
+bool RGWSyncTraceNode::match(const string& search_term, bool search_history)
+{
+  try {
+    std::regex expr(search_term);
+    std::smatch m;
+
+    if (regex_search(prefix, m, expr)) {
+      return true;
+    }
+    if (regex_search(status, m, expr)) {
+      return true;
+    }
+    if (!search_history) {
+      return false;
+    }
+
+    // iterate by const reference: history stores std::strings and the old
+    // by-value loop copied every entry on each scan
+    for (const auto& h : history) {
+      if (regex_search(h, m, expr)) {
+        return true;
+      }
+    }
+  } catch (const std::regex_error& e) {
+    ldout(cct, 5) << "NOTICE: sync trace: bad expression: bad regex search term" << dendl;
+  }
+
+  return false;
+}
+
+// Start the background service-map publisher; must be paired with the
+// stop/delete performed in the destructor.
+void RGWSyncTraceManager::init(RGWRados *store)
+{
+  service_map_thread = new RGWSyncTraceServiceMapThread(store, this);
+  service_map_thread->start();
+}
+
+// Tear down: deregister the admin-socket commands, stop and reap the
+// service-map thread, then drop all live trace nodes.
+RGWSyncTraceManager::~RGWSyncTraceManager()
+{
+  cct->get_admin_socket()->unregister_commands(this);
+  service_map_thread->stop();
+  delete service_map_thread;
+
+  nodes.clear();
+}
+
+// Register the "sync trace ..." admin-socket commands.  Returns 0 on
+// success or the first registration error.
+int RGWSyncTraceManager::hook_to_admin_command()
+{
+  AdminSocket *admin_socket = cct->get_admin_socket();
+
+  admin_commands = { { "sync trace show name=search,type=CephString,req=false", "sync trace show [filter_str]: show current multisite tracing information" },
+                     { "sync trace history name=search,type=CephString,req=false", "sync trace history [filter_str]: show history of multisite tracing information" },
+                     { "sync trace active name=search,type=CephString,req=false", "show active multisite sync entities information" },
+                     { "sync trace active_short name=search,type=CephString,req=false", "show active multisite sync entities entries" } };
+  // iterate by const reference: each element is an array of three
+  // std::strings that the old by-value loop copied on every iteration
+  for (const auto& cmd : admin_commands) {
+    int r = admin_socket->register_command(cmd[0], this,
+                                           cmd[1]);
+    if (r < 0) {
+      lderr(cct) << "ERROR: fail to register admin socket command (r=" << r << ")" << dendl;
+      return r;
+    }
+  }
+  return 0;
+}
+
+// Serialize one trace node into 'f': its formatted status line, plus the
+// full status history when 'show_history' is set.
+static void dump_node(RGWSyncTraceNode *entry, bool show_history, Formatter *f)
+{
+  f->open_object_section("entry");
+  ::encode_json("status", entry->to_str(), f);
+  if (show_history) {
+    f->open_array_section("history");
+    // const reference: the history buffer stores std::strings and the old
+    // by-value loop copied each one
+    for (const auto& h : entry->get_history()) {
+      ::encode_json("entry", h, f);
+    }
+    f->close_section();
+  }
+  f->close_section();
+}
+
+// Render the resource names of all ACTIVE trace nodes as a JSON array
+// string; consumed by the service-map thread.
+string RGWSyncTraceManager::get_active_names()
+{
+  shunique_lock rl(lock, ceph::acquire_shared);
+
+  stringstream ss;
+  JSONFormatter f;
+
+  f.open_array_section("result");
+  // const reference avoids copying a pair<uint64_t, shared_ptr> (and its
+  // atomic refcount bump) for every node
+  for (const auto& n : nodes) {
+    auto& entry = n.second;
+
+    if (!entry->test_flags(RGW_SNS_FLAG_ACTIVE)) {
+      continue;
+    }
+    const string& name = entry->get_resource_name();
+    if (!name.empty()) {
+      ::encode_json("entry", name, &f);
+    }
+    f.flush(ss);
+  }
+  f.close_section();
+  f.flush(ss);
+
+  return ss.str();
+}
+
+// Admin-socket entry point for the "sync trace ..." commands.  Emits two
+// JSON arrays into 'f': "running" (live nodes) and "complete" (recently
+// finished ones), optionally filtered by the "search" regex and, for the
+// "active"/"active_short" variants, by the ACTIVE flag.
+int RGWSyncTraceManager::call(std::string_view command, const cmdmap_t& cmdmap,
+	 const bufferlist&,
+	 Formatter *f,
+	 std::ostream& ss,
+	 bufferlist& out) {
+
+  bool show_history = (command == "sync trace history");
+  bool show_short = (command == "sync trace active_short");
+  bool show_active = (command == "sync trace active") || show_short;
+
+  string search;
+
+  auto si = cmdmap.find("search");
+  if (si != cmdmap.end()) {
+    search = boost::get<string>(si->second);
+  }
+
+  shunique_lock rl(lock, ceph::acquire_shared);
+
+  f->open_object_section("result");
+  f->open_array_section("running");
+  // const reference: the old by-value loop copied a
+  // pair<uint64_t, shared_ptr> per node (atomic refcount churn)
+  for (const auto& n : nodes) {
+    auto& entry = n.second;
+
+    if (!search.empty() && !entry->match(search, show_history)) {
+      continue;
+    }
+    if (show_active && !entry->test_flags(RGW_SNS_FLAG_ACTIVE)) {
+      continue;
+    }
+    if (show_short) {
+      // short form lists only resource names
+      const string& name = entry->get_resource_name();
+      if (!name.empty()) {
+        ::encode_json("entry", name, f);
+      }
+    } else {
+      dump_node(entry.get(), show_history, f);
+    }
+    f->flush(out);
+  }
+  f->close_section();
+
+  f->open_array_section("complete");
+  for (const auto& entry : complete_nodes) {
+    if (!search.empty() && !entry->match(search, show_history)) {
+      continue;
+    }
+    if (show_active && !entry->test_flags(RGW_SNS_FLAG_ACTIVE)) {
+      continue;
+    }
+    dump_node(entry.get(), show_history, f);
+    f->flush(out);
+  }
+  f->close_section();
+
+  f->close_section();
+
+  return 0;
+}
+
+// Deleter hook for the refs handed out by add_node(): move the node from
+// the live map to the bounded 'complete_nodes' buffer.  Safe to call with
+// a node that was already finished (or nullptr).
+void RGWSyncTraceManager::finish_node(RGWSyncTraceNode *node)
+{
+  // null check moved before the lock; no need to take the exclusive lock
+  // just to return
+  if (!node) {
+    return;
+  }
+
+  RGWSyncTraceNodeRef old_node;
+
+  {
+    shunique_lock wl(lock, ceph::acquire_unique);
+    auto iter = nodes.find(node->handle);
+    if (iter == nodes.end()) {
+      /* not found, already finished */
+      return;
+    }
+
+    if (complete_nodes.full()) {
+      /* take a reference to the entry that is going to be evicted,
+       * can't let it get evicted under lock held, otherwise
+       * it's a deadlock as it will call finish_node()
+       */
+      old_node = complete_nodes.front();
+    }
+
+    complete_nodes.push_back(iter->second);
+    nodes.erase(iter);
+  }
+  // note: previous version had a stray ';' after the function body
+}
+
+#endif
+
diff --git a/src/rgw/driver/rados/rgw_sync_trace.h b/src/rgw/driver/rados/rgw_sync_trace.h
new file mode 100644
index 000000000..1fcc8bed8
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_sync_trace.h
@@ -0,0 +1,141 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <atomic>
+
+#include "common/ceph_mutex.h"
+#include "common/shunique_lock.h"
+#include "common/admin_socket.h"
+
+#include <set>
+#include <ostream>
+#include <string>
+#include <shared_mutex>
+#include <boost/circular_buffer.hpp>
+
+#define SSTR(o) ({ \
+ std::stringstream ss; \
+ ss << o; \
+ ss.str(); \
+})
+
+#define RGW_SNS_FLAG_ACTIVE 1
+#define RGW_SNS_FLAG_ERROR 2
+
+class RGWRados;
+class RGWSyncTraceManager;
+class RGWSyncTraceNode;
+class RGWSyncTraceServiceMapThread;
+
+using RGWSyncTraceNodeRef = std::shared_ptr<RGWSyncTraceNode>;
+
+// One node in the multisite sync trace tree.  Nodes are created only via
+// RGWSyncTraceManager::add_node(); each keeps its latest status string plus
+// a bounded history of previous statuses.
+class RGWSyncTraceNode final {
+  friend class RGWSyncTraceManager;
+
+  CephContext *cct;
+  RGWSyncTraceNodeRef parent;
+
+  uint16_t state{0};      // bitmask of RGW_SNS_FLAG_* values
+  std::string status;     // most recent status line
+
+  // NOTE(review): no method visible here takes this lock -- confirm
+  // whether per-node locking is actually needed
+  ceph::mutex lock = ceph::make_mutex("RGWSyncTraceNode::lock");
+
+  std::string type;
+  std::string id;
+
+  std::string prefix;     // "parent-prefix type[id]:" display path
+
+  std::string resource_name;
+
+  uint64_t handle;        // key into the manager's node map
+
+  boost::circular_buffer<std::string> history;
+
+  // private constructor, create with RGWSyncTraceManager::add_node()
+  RGWSyncTraceNode(CephContext *_cct, uint64_t _handle,
+                   const RGWSyncTraceNodeRef& _parent,
+                   const std::string& _type, const std::string& _id);
+
+ public:
+  void set_resource_name(const std::string& s) {
+    resource_name = s;
+  }
+
+  const std::string& get_resource_name() {
+    return resource_name;
+  }
+
+  void set_flag(uint16_t s) {
+    state |= s;
+  }
+  void unset_flag(uint16_t s) {
+    state &= ~s;
+  }
+  // true when ALL bits of 'f' are set
+  bool test_flags(uint16_t f) {
+    return (state & f) == f;
+  }
+  // record a new status and emit it to the log (see the .cc)
+  void log(int level, const std::string& s);
+
+  std::string to_str() {
+    return prefix + " " + status;
+  }
+
+  const std::string& get_prefix() {
+    return prefix;
+  }
+
+  // NOTE(review): as a member, this streams via 'node << os' rather than
+  // the conventional 'os << node' -- confirm this orientation is intended
+  std::ostream& operator<<(std::ostream& os) {
+    os << to_str();
+    return os;
+  }
+
+  boost::circular_buffer<std::string>& get_history() {
+    return history;
+  }
+
+  // regex match against prefix/status (and optionally history)
+  bool match(const std::string& search_term, bool search_history);
+};
+
+// Owner of all sync trace nodes.  Live nodes sit in 'nodes'; finished ones
+// migrate into the bounded 'complete_nodes' buffer.  Also serves the
+// "sync trace ..." admin-socket commands and feeds the service map.
+class RGWSyncTraceManager : public AdminSocketHook {
+  friend class RGWSyncTraceNode;
+
+  // guards 'nodes'/'complete_nodes'; shared for readers, unique for writers
+  mutable std::shared_timed_mutex lock;
+  using shunique_lock = ceph::shunique_lock<decltype(lock)>;
+
+  CephContext *cct;
+  RGWSyncTraceServiceMapThread *service_map_thread{nullptr};
+
+  std::map<uint64_t, RGWSyncTraceNodeRef> nodes;
+  boost::circular_buffer<RGWSyncTraceNodeRef> complete_nodes;
+
+  // monotonically increasing handle generator
+  std::atomic<uint64_t> count = { 0 };
+
+  // {command spec, help text, (unused)} triples kept for unregistration
+  std::list<std::array<std::string, 3> > admin_commands;
+
+  uint64_t alloc_handle() {
+    return ++count;
+  }
+  // deleter hook used by the refs handed out from add_node()
+  void finish_node(RGWSyncTraceNode *node);
+
+public:
+  RGWSyncTraceManager(CephContext *_cct, int max_lru) : cct(_cct), complete_nodes(max_lru) {}
+  ~RGWSyncTraceManager();
+
+  // starts the service-map publisher thread
+  void init(RGWRados *store);
+
+  // NOTE(review): never assigned in visible code; appears to always hold a
+  // null ref used as the parent for top-level nodes -- confirm
+  const RGWSyncTraceNodeRef root_node;
+
+  RGWSyncTraceNodeRef add_node(const RGWSyncTraceNodeRef& parent,
+                               const std::string& type,
+                               const std::string& id = "");
+
+  int hook_to_admin_command();
+  int call(std::string_view command, const cmdmap_t& cmdmap,
+	   const bufferlist&,
+	   Formatter *f,
+	   std::ostream& ss,
+	   bufferlist& out) override;
+  std::string get_active_names();
+};
diff --git a/src/rgw/driver/rados/rgw_tools.cc b/src/rgw/driver/rados/rgw_tools.cc
new file mode 100644
index 000000000..66651da5c
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_tools.cc
@@ -0,0 +1,437 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "common/errno.h"
+#include "librados/librados_asio.h"
+
+#include "include/stringify.h"
+
+#include "rgw_tools.h"
+#include "rgw_acl_s3.h"
+#include "rgw_aio_throttle.h"
+#include "rgw_compression.h"
+#include "common/BackTrace.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+#define READ_CHUNK_LEN (512 * 1024)
+
+using namespace std;
+
+// Open (and optionally create) the rados pool backing 'pool', attaching
+// 'ioctx' to it and selecting pool.ns as the namespace.
+//
+// When 'create' is set and the pool does not exist it is created, tagged
+// with the rgw application, and tuned: 'mostly_omap' raises
+// pg_autoscale_bias and recovery_priority, 'bulk' sets the pool's bulk
+// flag.  Tuning failures are logged at level 10 and otherwise ignored.
+// Returns 0 or a negative librados error code.
+int rgw_init_ioctx(const DoutPrefixProvider *dpp,
+                   librados::Rados *rados, const rgw_pool& pool,
+                   librados::IoCtx& ioctx, bool create,
+                   bool mostly_omap,
+                   bool bulk)
+{
+  int r = rados->ioctx_create(pool.name.c_str(), ioctx);
+  if (r == -ENOENT && create) {
+    r = rados->pool_create(pool.name.c_str());
+    if (r == -ERANGE) {
+      ldpp_dout(dpp, 0)
+        << __func__
+        << " ERROR: librados::Rados::pool_create returned " << cpp_strerror(-r)
+        << " (this can be due to a pool or placement group misconfiguration, e.g."
+        << " pg_num < pgp_num or mon_max_pg_per_osd exceeded)"
+        << dendl;
+    }
+    if (r < 0 && r != -EEXIST) {
+      return r;
+    }
+
+    r = rados->ioctx_create(pool.name.c_str(), ioctx);
+    if (r < 0) {
+      return r;
+    }
+
+    r = ioctx.application_enable(pg_pool_t::APPLICATION_NAME_RGW, false);
+    if (r < 0 && r != -EOPNOTSUPP) {
+      return r;
+    }
+
+    if (mostly_omap) {
+      // set pg_autoscale_bias
+      bufferlist inbl;
+      float bias = g_conf().get_val<double>("rgw_rados_pool_autoscale_bias");
+      int r = rados->mon_command(
+        "{\"prefix\": \"osd pool set\", \"pool\": \"" +
+        pool.name + "\", \"var\": \"pg_autoscale_bias\", \"val\": \"" +
+        stringify(bias) + "\"}",
+        inbl, NULL, NULL);
+      if (r < 0) {
+        ldpp_dout(dpp, 10) << __func__ << " warning: failed to set pg_autoscale_bias on "
+                 << pool.name << dendl;
+      }
+      // set recovery_priority
+      int p = g_conf().get_val<uint64_t>("rgw_rados_pool_recovery_priority");
+      // fix: the command previously omitted the "val" key, producing
+      // malformed JSON ('"var": "recovery_priority": "N"') that the mon
+      // could never accept
+      r = rados->mon_command(
+        "{\"prefix\": \"osd pool set\", \"pool\": \"" +
+        pool.name + "\", \"var\": \"recovery_priority\", \"val\": \"" +
+        stringify(p) + "\"}",
+        inbl, NULL, NULL);
+      if (r < 0) {
+        ldpp_dout(dpp, 10) << __func__ << " warning: failed to set recovery_priority on "
+                 << pool.name << dendl;
+      }
+    }
+    if (bulk) {
+      // set bulk
+      bufferlist inbl;
+      int r = rados->mon_command(
+        "{\"prefix\": \"osd pool set\", \"pool\": \"" +
+        pool.name + "\", \"var\": \"bulk\", \"val\": \"true\"}",
+        inbl, NULL, NULL);
+      if (r < 0) {
+        ldpp_dout(dpp, 10) << __func__ << " warning: failed to set 'bulk' on "
+                 << pool.name << dendl;
+      }
+    }
+  } else if (r < 0) {
+    return r;
+  }
+  if (!pool.ns.empty()) {
+    ioctx.set_namespace(pool.ns);
+  }
+  return 0;
+}
+
+// Sentinel passed to rgw_put_system_obj() to request that the write leave
+// existing attrs untouched; compared by pointer identity, never mutated.
+map<string, bufferlist>* no_change_attrs() {
+  static map<string, bufferlist> no_change;
+  return &no_change;
+}
+
+// Write a system object to 'pool'/'oid'.  'pattrs' semantics: nullptr
+// means write an empty attr set; the no_change_attrs() sentinel (matched
+// by pointer) means skip attrs entirely via write_data(); anything else is
+// written alongside the data.  Returns 0 or a negative error code.
+int rgw_put_system_obj(const DoutPrefixProvider *dpp, RGWSI_SysObj* svc_sysobj,
+                       const rgw_pool& pool, const string& oid, bufferlist& data, bool exclusive,
+                       RGWObjVersionTracker *objv_tracker, real_time set_mtime, optional_yield y, map<string, bufferlist> *pattrs)
+{
+  map<string,bufferlist> no_attrs;
+  if (!pattrs) {
+    pattrs = &no_attrs;
+  }
+
+  rgw_raw_obj obj(pool, oid);
+
+  auto sysobj = svc_sysobj->get_obj(obj);
+  int ret;
+
+  // pointer comparison against the sentinel, not a content comparison
+  if (pattrs != no_change_attrs()) {
+    ret = sysobj.wop()
+                .set_objv_tracker(objv_tracker)
+                .set_exclusive(exclusive)
+                .set_mtime(set_mtime)
+                .set_attrs(*pattrs)
+                .write(dpp, data, y);
+  } else {
+    // data-only write: existing attrs on the object are preserved
+    ret = sysobj.wop()
+                .set_objv_tracker(objv_tracker)
+                .set_exclusive(exclusive)
+                .set_mtime(set_mtime)
+                .write_data(dpp, data, y);
+  }
+
+  return ret;
+}
+
+// Stat a system object, optionally returning its mtime and attrs through
+// the out-params (each may be nullptr).  Returns 0 or a negative error.
+int rgw_stat_system_obj(const DoutPrefixProvider *dpp, RGWSI_SysObj* svc_sysobj,
+                        const rgw_pool& pool, const std::string& key,
+                        RGWObjVersionTracker *objv_tracker,
+                        real_time *pmtime, optional_yield y,
+                        std::map<std::string, bufferlist> *pattrs)
+{
+  rgw_raw_obj obj(pool, key);
+  auto sysobj = svc_sysobj->get_obj(obj);
+  return sysobj.rop()
+               .set_attrs(pattrs)
+               .set_last_mod(pmtime)
+               .stat(y, dpp);
+}
+
+
+// Read a system object into 'bl'.  Optional out-params (attrs, mtime,
+// version tracker, cache info) may be nullptr; 'refresh_version' forces a
+// re-read when the cached version is stale.  Returns 0 or a negative error.
+int rgw_get_system_obj(RGWSI_SysObj* svc_sysobj, const rgw_pool& pool, const string& key, bufferlist& bl,
+                       RGWObjVersionTracker *objv_tracker, real_time *pmtime, optional_yield y,
+                       const DoutPrefixProvider *dpp, map<string, bufferlist> *pattrs,
+                       rgw_cache_entry_info *cache_info,
+                       boost::optional<obj_version> refresh_version, bool raw_attrs)
+{
+  const rgw_raw_obj obj(pool, key);
+  auto sysobj = svc_sysobj->get_obj(obj);
+  auto rop = sysobj.rop();
+  return rop.set_attrs(pattrs)
+            .set_last_mod(pmtime)
+            .set_objv_tracker(objv_tracker)
+            .set_raw_attrs(raw_attrs)
+            .set_cache_info(cache_info)
+            .set_refresh_version(refresh_version)
+            .read(dpp, &bl, y);
+}
+
+// Remove a system object; 'objv_tracker' (may be nullptr) enforces a
+// versioned delete.  Returns 0 or a negative error code.
+int rgw_delete_system_obj(const DoutPrefixProvider *dpp,
+                          RGWSI_SysObj *sysobj_svc, const rgw_pool& pool, const string& oid,
+                          RGWObjVersionTracker *objv_tracker, optional_yield y)
+{
+  // (dropped an unused duplicate rgw_raw_obj local that was constructed here)
+  auto sysobj = sysobj_svc->get_obj(rgw_raw_obj{pool, oid});
+  return sysobj.wop()
+               .set_objv_tracker(objv_tracker)
+               .remove(dpp, y);
+}
+
+// Execute a librados read op.  With an optional_yield the coroutine is
+// suspended via async_operate(); without one the call blocks (and warns if
+// it blocks an asio thread).  'pbl' (may be nullptr) receives the result.
+// Returns 0 or a negative error code.
+int rgw_rados_operate(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid,
+                      librados::ObjectReadOperation *op, bufferlist* pbl,
+                      optional_yield y, int flags)
+{
+  // given a yield_context, call async_operate() to yield the coroutine instead
+  // of blocking
+  if (y) {
+    auto& context = y.get_io_context();
+    auto& yield = y.get_yield_context();
+    boost::system::error_code ec;
+    auto bl = librados::async_operate(
+      context, ioctx, oid, op, flags, yield[ec]);
+    if (pbl) {
+      *pbl = std::move(bl);
+    }
+    return -ec.value();
+  }
+  // work on asio threads should be asynchronous, so warn when they block
+  if (is_asio_thread) {
+    ldpp_dout(dpp, 20) << "WARNING: blocking librados call" << dendl;
+#ifdef _BACKTRACE_LOGGING
+    ldpp_dout(dpp, 20) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl;
+#endif
+  }
+  return ioctx.operate(oid, op, nullptr, flags);
+}
+
+// Execute a librados write op; same yield-vs-blocking behavior as the read
+// overload above.  Returns 0 or a negative error code.
+int rgw_rados_operate(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid,
+                      librados::ObjectWriteOperation *op, optional_yield y,
+                      int flags)
+{
+  // suspend the coroutine instead of blocking when a yield context exists
+  if (y) {
+    auto& context = y.get_io_context();
+    auto& yield = y.get_yield_context();
+    boost::system::error_code ec;
+    librados::async_operate(context, ioctx, oid, op, flags, yield[ec]);
+    return -ec.value();
+  }
+  // warn when a blocking call runs on an asio thread
+  if (is_asio_thread) {
+    ldpp_dout(dpp, 20) << "WARNING: blocking librados call" << dendl;
+#ifdef _BACKTRACE_LOGGING
+    ldpp_dout(dpp, 20) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl;
+#endif
+  }
+  return ioctx.operate(oid, op, flags);
+}
+
+// Send a watch/notify message on 'oid' and wait up to 'timeout_ms' for the
+// acks; 'pbl' (may be nullptr) receives the reply payload.  Yields when a
+// coroutine context is available, otherwise blocks (with a warning on asio
+// threads).  Returns 0 or a negative error code.
+int rgw_rados_notify(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid,
+                     bufferlist& bl, uint64_t timeout_ms, bufferlist* pbl,
+                     optional_yield y)
+{
+  if (y) {
+    auto& context = y.get_io_context();
+    auto& yield = y.get_yield_context();
+    boost::system::error_code ec;
+    auto reply = librados::async_notify(context, ioctx, oid,
+                                        bl, timeout_ms, yield[ec]);
+    if (pbl) {
+      *pbl = std::move(reply);
+    }
+    return -ec.value();
+  }
+  if (is_asio_thread) {
+    ldpp_dout(dpp, 20) << "WARNING: blocking librados call" << dendl;
+#ifdef _BACKTRACE_LOGGING
+    ldpp_dout(dpp, 20) << "BACKTRACE: " << __func__ << ": " << ClibBackTrace(0) << dendl;
+#endif
+  }
+  return ioctx.notify2(oid, bl, timeout_ms, pbl);
+}
+
+// Copy into '*attrset' every entry of the (sorted) input map whose key
+// starts with 'check_prefix'.  Matching keys sort contiguously, so walk
+// from lower_bound until the prefix stops matching.
+void rgw_filter_attrset(map<string, bufferlist>& unfiltered_attrset, const string& check_prefix,
+                        map<string, bufferlist> *attrset)
+{
+  attrset->clear();
+  auto it = unfiltered_attrset.lower_bound(check_prefix);
+  while (it != unfiltered_attrset.end() &&
+         boost::algorithm::starts_with(it->first, check_prefix)) {
+    (*attrset)[it->first] = it->second;
+    ++it;
+  }
+}
+
+// Thin convenience wrapper around a SAL driver; just records the driver.
+RGWDataAccess::RGWDataAccess(rgw::sal::Driver* _driver) : driver(_driver)
+{
+}
+
+
+// Shared tail of the init() overloads: decode the bucket ACL (when the
+// attr is present) into 'policy'.  A bucket without an ACL attr is
+// accepted as-is; a corrupt ACL yields -EIO.
+int RGWDataAccess::Bucket::finish_init()
+{
+  auto acl_it = attrs.find(RGW_ATTR_ACL);
+  if (acl_it == attrs.end()) {
+    return 0;
+  }
+
+  auto bliter = acl_it->second.cbegin();
+  try {
+    policy.decode(bliter);
+  } catch (buffer::error& err) {
+    return -EIO;
+  }
+
+  return 0;
+}
+
+// Initialize by looking the bucket up through the SAL driver, caching its
+// info, mtime and attrs, then decoding the ACL via finish_init().
+int RGWDataAccess::Bucket::init(const DoutPrefixProvider *dpp, optional_yield y)
+{
+  std::unique_ptr<rgw::sal::Bucket> b;
+  const int r = sd->driver->get_bucket(dpp, nullptr, tenant, name, &b, y);
+  if (r < 0) {
+    return r;
+  }
+
+  bucket_info = b->get_info();
+  mtime = b->get_modification_time();
+  attrs = b->get_attrs();
+
+  return finish_init();
+}
+
+// Initialize from already-fetched bucket info/attrs (no rados round trip),
+// then decode the ACL via finish_init().
+int RGWDataAccess::Bucket::init(const RGWBucketInfo& _bucket_info,
+                                const map<string, bufferlist>& _attrs)
+{
+  bucket_info = _bucket_info;
+  attrs = _attrs;
+
+  return finish_init();
+}
+
+// Hand out an Object handle for 'key' in this bucket; always succeeds
+// (the object need not exist yet).
+int RGWDataAccess::Bucket::get_object(const rgw_obj_key& key,
+                                      ObjectRef *obj) {
+  obj->reset(new Object(sd, shared_from_this(), key));
+  return 0;
+}
+
+// Upload 'data' as this object via an atomic writer: optionally compress
+// (per the placement rule), chunk the buffer through the filter chain,
+// compute an MD5 etag when none was supplied, attach ACL/etag attrs, and
+// complete the write.  Consumes 'data' (it is spliced empty).  Returns 0
+// or a negative error code.
+int RGWDataAccess::Object::put(bufferlist& data,
+                               map<string, bufferlist>& attrs,
+                               const DoutPrefixProvider *dpp,
+                               optional_yield y)
+{
+  rgw::sal::Driver* driver = sd->driver;
+  CephContext *cct = driver->ctx();
+
+  string tag;
+  append_rand_alpha(cct, tag, tag, 32);
+
+  RGWBucketInfo& bucket_info = bucket->bucket_info;
+
+  rgw::BlockingAioThrottle aio(driver->ctx()->_conf->rgw_put_obj_min_window_size);
+
+  std::unique_ptr<rgw::sal::Bucket> b;
+  // NOTE(review): get_bucket()'s return value is ignored here; a failure
+  // would surface later as a less obvious error -- confirm intended
+  driver->get_bucket(NULL, bucket_info, &b);
+  std::unique_ptr<rgw::sal::Object> obj = b->get_object(key);
+
+  auto& owner = bucket->policy.get_owner();
+
+  string req_id = driver->zone_unique_id(driver->get_new_req_id());
+
+  std::unique_ptr<rgw::sal::Writer> processor;
+  processor = driver->get_atomic_writer(dpp, y, obj.get(),
+                                        owner.get_id(),
+                                        nullptr, olh_epoch, req_id);
+
+  int ret = processor->prepare(y);
+  if (ret < 0)
+    return ret;
+
+  rgw::sal::DataProcessor *filter = processor.get();
+
+  CompressorRef plugin;
+  boost::optional<RGWPutObj_Compress> compressor;
+
+  // insert a compression filter ahead of the writer when the placement
+  // rule asks for it; a missing plugin downgrades to uncompressed
+  const auto& compression_type = driver->get_compression_type(bucket_info.placement_rule);
+  if (compression_type != "none") {
+    plugin = Compressor::create(driver->ctx(), compression_type);
+    if (!plugin) {
+      ldpp_dout(dpp, 1) << "Cannot load plugin for compression type "
+			<< compression_type << dendl;
+    } else {
+      compressor.emplace(driver->ctx(), plugin, filter);
+      filter = &*compressor;
+    }
+  }
+
+  off_t ofs = 0;
+  auto obj_size = data.length();
+
+  RGWMD5Etag etag_calc;
+
+  // feed the data through the filter chain in rgw_max_chunk_size pieces,
+  // folding each chunk into the etag as we go
+  do {
+    size_t read_len = std::min(data.length(), (unsigned int)cct->_conf->rgw_max_chunk_size);
+
+    bufferlist bl;
+
+    data.splice(0, read_len, &bl);
+    etag_calc.update(bl);
+
+    ret = filter->process(std::move(bl), ofs);
+    if (ret < 0)
+      return ret;
+
+    ofs += read_len;
+  } while (data.length() > 0);
+
+  // flush the chain with an empty buffer
+  ret = filter->process({}, ofs);
+  if (ret < 0) {
+    return ret;
+  }
+  bool has_etag_attr = false;
+  auto iter = attrs.find(RGW_ATTR_ETAG);
+  if (iter != attrs.end()) {
+    bufferlist& bl = iter->second;
+    etag = bl.to_str();
+    has_etag_attr = true;
+  }
+
+  // default private ACL unless the caller supplied one
+  if (!aclbl) {
+    RGWAccessControlPolicy_S3 policy(cct);
+
+    policy.create_canned(bucket->policy.get_owner(), bucket->policy.get_owner(), string()); /* default private policy */
+
+    policy.encode(aclbl.emplace());
+  }
+
+  if (etag.empty()) {
+    etag_calc.finish(&etag);
+  }
+
+  if (!has_etag_attr) {
+    bufferlist etagbl;
+    etagbl.append(etag);
+    attrs[RGW_ATTR_ETAG] = etagbl;
+  }
+  attrs[RGW_ATTR_ACL] = *aclbl;
+
+  string *puser_data = nullptr;
+  if (user_data) {
+    puser_data = &(*user_data);
+  }
+
+  return processor->complete(obj_size, etag,
+			     &mtime, mtime,
+			     attrs, delete_at,
+			     nullptr, nullptr,
+			     puser_data,
+			     nullptr, nullptr, y);
+}
+
+// Pre-encode an ACL to be attached on the next put(); overrides the
+// default private policy.
+void RGWDataAccess::Object::set_policy(const RGWAccessControlPolicy& policy)
+{
+  policy.encode(aclbl.emplace());
+}
+
+// Manually fire a librados AIO completion with result 'r', as if the OSD
+// had replied; used to fail/complete an operation locally.
+void rgw_complete_aio_completion(librados::AioCompletion* c, int r) {
+  auto pc = c->pc;
+  librados::CB_AioCompleteAndSafe cb(pc);
+  cb(r);
+}
diff --git a/src/rgw/driver/rados/rgw_tools.h b/src/rgw/driver/rados/rgw_tools.h
new file mode 100644
index 000000000..66600856d
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_tools.h
@@ -0,0 +1,276 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <string>
+#include <utility>
+
+#include "include/types.h"
+#include "include/ceph_hash.h"
+
+#include "common/ceph_time.h"
+
+#include "rgw_common.h"
+#include "rgw_sal_fwd.h"
+
+class RGWSI_SysObj;
+
+class RGWRados;
+struct RGWObjVersionTracker;
+class optional_yield;
+
+struct obj_version;
+
+
+/// open an IoCtx on the given pool; optionally create the pool first and
+/// tune it for omap-heavy or bulk workloads
+int rgw_init_ioctx(const DoutPrefixProvider *dpp,
+                   librados::Rados *rados, const rgw_pool& pool,
+                   librados::IoCtx& ioctx,
+                   bool create = false,
+                   bool mostly_omap = false,
+                   bool bulk = false);
+
+// shard id value meaning "not sharded"
+#define RGW_NO_SHARD -1
+
+// primes used by rgw_shards_mod() to spread hash values over shard counts
+#define RGW_SHARDS_PRIME_0 7877
+#define RGW_SHARDS_PRIME_1 65521
+
+extern const std::string MP_META_SUFFIX;
+
+// largest supported shard count (the larger hashing prime)
+inline int rgw_shards_max()
+{
+  return RGW_SHARDS_PRIME_1;
+}
+
+// only called by rgw_shard_id and rgw_bucket_shard_index
+static inline int rgw_shards_mod(unsigned hval, int max_shards)
+{
+  // Reduce through a prime at least as large as the shard count so hash
+  // values land evenly on [0, max_shards).
+  const unsigned prime =
+      (max_shards <= RGW_SHARDS_PRIME_0) ? RGW_SHARDS_PRIME_0 : RGW_SHARDS_PRIME_1;
+  return hval % prime % max_shards;
+}
+
+// used for logging and tagging
+inline int rgw_shard_id(const std::string& key, int max_shards)
+{
+  // Hash the key, then reduce the hash onto the shard range.
+  const unsigned hval = ceph_str_hash_linux(key.c_str(), key.size());
+  return rgw_shards_mod(hval, max_shards);
+}
+
+// build sharded object names from a prefix plus a key/section, or an
+// explicit shard id (see rgw_tools.cc for the exact formats)
+void rgw_shard_name(const std::string& prefix, unsigned max_shards, const std::string& key, std::string& name, int *shard_id);
+void rgw_shard_name(const std::string& prefix, unsigned max_shards, const std::string& section, const std::string& key, std::string& name);
+void rgw_shard_name(const std::string& prefix, unsigned shard_id, std::string& name);
+
+/// write a system object to pool/oid, optionally exclusive, with mtime/attrs
+int rgw_put_system_obj(const DoutPrefixProvider *dpp, RGWSI_SysObj* svc_sysobj,
+                       const rgw_pool& pool, const std::string& oid,
+                       bufferlist& data, bool exclusive,
+                       RGWObjVersionTracker *objv_tracker,
+                       real_time set_mtime, optional_yield y,
+                       std::map<std::string, bufferlist> *pattrs = nullptr);
+/// read a system object; optionally returns mtime/attrs and cache info
+int rgw_get_system_obj(RGWSI_SysObj* svc_sysobj, const rgw_pool& pool,
+                       const std::string& key, bufferlist& bl,
+                       RGWObjVersionTracker *objv_tracker, real_time *pmtime,
+                       optional_yield y, const DoutPrefixProvider *dpp,
+                       std::map<std::string, bufferlist> *pattrs = nullptr,
+                       rgw_cache_entry_info *cache_info = nullptr,
+                       boost::optional<obj_version> refresh_version = boost::none,
+                       bool raw_attrs=false);
+/// delete a system object
+int rgw_delete_system_obj(const DoutPrefixProvider *dpp,
+                          RGWSI_SysObj *sysobj_svc, const rgw_pool& pool, const std::string& oid,
+                          RGWObjVersionTracker *objv_tracker, optional_yield y);
+/// stat a system object (mtime/attrs) without reading its data
+int rgw_stat_system_obj(const DoutPrefixProvider *dpp, RGWSI_SysObj* svc_sysobj,
+                        const rgw_pool& pool, const std::string& key,
+                        RGWObjVersionTracker *objv_tracker,
+                        real_time *pmtime, optional_yield y,
+                        std::map<std::string, bufferlist> *pattrs = nullptr);
+
+const char *rgw_find_mime_by_ext(std::string& ext);
+
+// filter unfiltered_attrset by name prefix into *attrset
+// (see definition for exact key handling)
+void rgw_filter_attrset(std::map<std::string, bufferlist>& unfiltered_attrset, const std::string& check_prefix,
+                        std::map<std::string, bufferlist> *attrset);
+
+/// indicates whether the current thread is in boost::asio::io_context::run(),
+/// used to log warnings if synchronous librados calls are made
+extern thread_local bool is_asio_thread;
+
+/// perform the rados operation, using the yield context when given
+int rgw_rados_operate(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid,
+                      librados::ObjectReadOperation *op, bufferlist* pbl,
+                      optional_yield y, int flags = 0);
+int rgw_rados_operate(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid,
+                      librados::ObjectWriteOperation *op, optional_yield y,
+                      int flags = 0);
+/// send a watch/notify with the given timeout, optionally capturing replies
+int rgw_rados_notify(const DoutPrefixProvider *dpp, librados::IoCtx& ioctx, const std::string& oid,
+                     bufferlist& bl, uint64_t timeout_ms, bufferlist* pbl,
+                     optional_yield y);
+
+int rgw_tools_init(const DoutPrefixProvider *dpp, CephContext *cct);
+void rgw_tools_cleanup();
+
+/// Incremental ETag calculator over hash type H with digest size S;
+/// finish() renders the digest as a hex string.
+template<class H, size_t S>
+class RGWEtag
+{
+  H hash; // running digest state
+
+public:
+  RGWEtag() {
+    if constexpr (std::is_same_v<H, MD5>) {
+      // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
+      hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+    }
+  }
+
+  /// feed raw bytes into the digest
+  void update(const char *buf, size_t len) {
+    hash.Update((const unsigned char *)buf, len);
+  }
+
+  /// feed a bufferlist (no-op when empty)
+  void update(bufferlist& bl) {
+    if (bl.length() > 0) {
+      update(bl.c_str(), bl.length());
+    }
+  }
+
+  /// feed a string (no-op when empty)
+  void update(const std::string& s) {
+    if (!s.empty()) {
+      update(s.c_str(), s.size());
+    }
+  }
+  /// finalize the digest and write its hex representation to *etag
+  void finish(std::string *etag) {
+    char etag_buf[S];
+    char etag_buf_str[S * 2 + 16]; // hex chars plus slack for terminator
+
+    hash.Final((unsigned char *)etag_buf);
+    buf_to_hex((const unsigned char *)etag_buf, S,
+               etag_buf_str);
+
+    *etag = etag_buf_str;
+  }
+};
+
+using RGWMD5Etag = RGWEtag<MD5, CEPH_CRYPTO_MD5_DIGESTSIZE>;
+
+/// Helper for writing bucket objects through the SAL driver: resolve a
+/// Bucket handle, then stage and put() Objects within it.
+class RGWDataAccess
+{
+  rgw::sal::Driver* driver;
+
+public:
+  RGWDataAccess(rgw::sal::Driver* _driver);
+
+  class Object;
+  class Bucket;
+
+  using BucketRef = std::shared_ptr<Bucket>;
+  using ObjectRef = std::shared_ptr<Object>;
+
+  /// Cached bucket state (info, attrs, ACL policy) used to mint Objects.
+  class Bucket : public std::enable_shared_from_this<Bucket> {
+    friend class RGWDataAccess;
+    friend class Object;
+
+    RGWDataAccess *sd{nullptr};
+    RGWBucketInfo bucket_info;
+    std::string tenant;
+    std::string name;
+    std::string bucket_id;
+    ceph::real_time mtime;
+    std::map<std::string, bufferlist> attrs;
+
+    RGWAccessControlPolicy policy;
+    int finish_init();
+
+    Bucket(RGWDataAccess *_sd,
+           const std::string& _tenant,
+           const std::string& _name,
+           const std::string& _bucket_id) : sd(_sd),
+                                            tenant(_tenant),
+                                            name(_name),
+                                            bucket_id(_bucket_id) {}
+    Bucket(RGWDataAccess *_sd) : sd(_sd) {}
+    int init(const DoutPrefixProvider *dpp, optional_yield y);
+    int init(const RGWBucketInfo& _bucket_info, const std::map<std::string, bufferlist>& _attrs);
+  public:
+    /// Create an Object handle for the given key in this bucket.
+    int get_object(const rgw_obj_key& key,
+                   ObjectRef *obj);
+
+  };
+
+
+  /// Pending write of a single object; setters stage optional metadata
+  /// (mtime, etag, OLH epoch, expiration, user data, ACL) applied by put().
+  class Object {
+    RGWDataAccess *sd{nullptr};
+    BucketRef bucket;
+    rgw_obj_key key;
+
+    ceph::real_time mtime;
+    std::string etag;
+    uint64_t olh_epoch{0};
+    ceph::real_time delete_at;
+    std::optional<std::string> user_data;
+
+    std::optional<bufferlist> aclbl; //< encoded ACL, set via set_policy()
+
+    Object(RGWDataAccess *_sd,
+           BucketRef&& _bucket,
+           const rgw_obj_key& _key) : sd(_sd),
+                                      // move from the rvalue reference rather
+                                      // than copying the shared_ptr (the old
+                                      // copy caused needless refcount churn)
+                                      bucket(std::move(_bucket)),
+                                      key(_key) {}
+  public:
+    /// Write 'data' as the object content, completing the upload with the
+    /// staged metadata.
+    int put(bufferlist& data, std::map<std::string, bufferlist>& attrs, const DoutPrefixProvider *dpp, optional_yield y); /* might modify attrs */
+
+    void set_mtime(const ceph::real_time& _mtime) {
+      mtime = _mtime;
+    }
+
+    void set_etag(const std::string& _etag) {
+      etag = _etag;
+    }
+
+    void set_olh_epoch(uint64_t epoch) {
+      olh_epoch = epoch;
+    }
+
+    void set_delete_at(ceph::real_time _delete_at) {
+      delete_at = _delete_at;
+    }
+
+    void set_user_data(const std::string& _user_data) {
+      user_data = _user_data;
+    }
+
+    void set_policy(const RGWAccessControlPolicy& policy);
+
+    friend class Bucket;
+  };
+
+  /// Look up a bucket by tenant/name/id and initialize it from the store.
+  /// (name/bucket_id are taken by const reference to avoid copies, matching
+  /// the tenant parameter; callers are unaffected.)
+  int get_bucket(const DoutPrefixProvider *dpp,
+                 const std::string& tenant,
+                 const std::string& name,
+                 const std::string& bucket_id,
+                 BucketRef *bucket,
+                 optional_yield y) {
+    bucket->reset(new Bucket(this, tenant, name, bucket_id));
+    return (*bucket)->init(dpp, y);
+  }
+
+  /// Wrap already-loaded bucket info/attrs in a Bucket handle.
+  int get_bucket(const RGWBucketInfo& bucket_info,
+                 const std::map<std::string, bufferlist>& attrs,
+                 BucketRef *bucket) {
+    bucket->reset(new Bucket(this));
+    return (*bucket)->init(bucket_info, attrs);
+  }
+  friend class Bucket;
+  friend class Object;
+};
+
+using RGWDataAccessRef = std::shared_ptr<RGWDataAccess>;
+
+/// Complete an AioCompletion. To return error values or otherwise
+/// satisfy the caller. Useful for making complicated asynchronous
+/// calls and error handling.
+void rgw_complete_aio_completion(librados::AioCompletion* c, int r);
+
+/// This returns a static, non-NULL pointer, recognized only by
+/// rgw_put_system_obj(). When supplied instead of the attributes, the
+/// attributes will be unmodified.
+///
+/// (Currently providing nullptr will wipe all attributes.)
+
+std::map<std::string, ceph::buffer::list>* no_change_attrs();
diff --git a/src/rgw/driver/rados/rgw_trim_bilog.cc b/src/rgw/driver/rados/rgw_trim_bilog.cc
new file mode 100644
index 000000000..4e34abf51
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_trim_bilog.cc
@@ -0,0 +1,1445 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc
+ *
+ * Author: Casey Bodley <cbodley@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#include <mutex>
+#include <boost/circular_buffer.hpp>
+#include <boost/container/flat_map.hpp>
+
+#include "include/scope_guard.h"
+#include "common/bounded_key_counter.h"
+#include "common/errno.h"
+#include "rgw_trim_bilog.h"
+#include "rgw_cr_rados.h"
+#include "rgw_cr_rest.h"
+#include "rgw_cr_tools.h"
+#include "rgw_data_sync.h"
+#include "rgw_metadata.h"
+#include "rgw_sal.h"
+#include "rgw_zone.h"
+#include "rgw_sync.h"
+#include "rgw_bucket.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_meta.h"
+#include "services/svc_bilog_rados.h"
+
+#include <boost/asio/yield.hpp>
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+#undef dout_prefix
+#define dout_prefix (*_dout << "trim: ")
+
+using namespace std;
+
+using rgw::BucketTrimConfig;
+using BucketChangeCounter = BoundedKeyCounter<std::string, int>;
+
+// oid of the persistent bilog-trim status object
+const std::string rgw::BucketTrimStatus::oid = "bilog.trim";
+using rgw::BucketTrimStatus;
+
+
+// watch/notify api for gateways to coordinate about which buckets to trim
+enum TrimNotifyType {
+  NotifyTrimCounters = 0, //< request each gateway's bucket change counters
+  NotifyTrimComplete, //< a trim cycle finished; counters may be reset
+};
+WRITE_RAW_ENCODER(TrimNotifyType);
+
+/// polymorphic dispatch target for one TrimNotifyType
+struct TrimNotifyHandler {
+  virtual ~TrimNotifyHandler() = default;
+
+  /// decode a request from 'input', process it, encode a reply to 'output'
+  virtual void handle(bufferlist::const_iterator& input, bufferlist& output) = 0;
+};
+
+/// api to share the bucket trim counters between gateways in the same zone.
+/// each gateway will process different datalog shards, so the gateway that runs
+/// the trim process needs to accumulate their counters
+struct TrimCounters {
+  /// counter for a single bucket
+  struct BucketCounter {
+    std::string bucket; //< bucket instance metadata key
+    int count{0};
+
+    BucketCounter() = default;
+    BucketCounter(const std::string& bucket, int count)
+      : bucket(bucket), count(count) {}
+
+    void encode(bufferlist& bl) const;
+    void decode(bufferlist::const_iterator& p);
+  };
+  using Vector = std::vector<BucketCounter>;
+
+  /// request bucket trim counters from peer gateways
+  struct Request {
+    uint16_t max_buckets; //< maximum number of bucket counters to return
+
+    void encode(bufferlist& bl) const;
+    void decode(bufferlist::const_iterator& p);
+  };
+
+  /// return the current bucket trim counters
+  struct Response {
+    Vector bucket_counters;
+
+    void encode(bufferlist& bl) const;
+    void decode(bufferlist::const_iterator& p);
+  };
+
+  /// server interface to query the hottest buckets
+  struct Server {
+    virtual ~Server() = default;
+
+    /// fill 'counters' with up to 'count' of the most-changed buckets
+    virtual void get_bucket_counters(int count, Vector& counters) = 0;
+    /// clear all counters (invoked when a trim cycle completes)
+    virtual void reset_bucket_counters() = 0;
+  };
+
+  /// notify handler
+  class Handler : public TrimNotifyHandler {
+    Server *const server;
+  public:
+    explicit Handler(Server *server) : server(server) {}
+
+    void handle(bufferlist::const_iterator& input, bufferlist& output) override;
+  };
+};
+std::ostream& operator<<(std::ostream& out, const TrimCounters::BucketCounter& rhs)
+{
+  // rendered as "<bucket>:<count>" for log output
+  out << rhs.bucket << ":" << rhs.count;
+  return out;
+}
+
+void TrimCounters::BucketCounter::encode(bufferlist& bl) const
+{
+  using ceph::encode;
+  // no versioning to save space
+  encode(bucket, bl);
+  encode(count, bl);
+}
+void TrimCounters::BucketCounter::decode(bufferlist::const_iterator& p)
+{
+  using ceph::decode;
+  decode(bucket, p);
+  decode(count, p);
+}
+WRITE_CLASS_ENCODER(TrimCounters::BucketCounter);
+
+// Request/Response use the versioned envelope so the wire format can evolve
+void TrimCounters::Request::encode(bufferlist& bl) const
+{
+  ENCODE_START(1, 1, bl);
+  encode(max_buckets, bl);
+  ENCODE_FINISH(bl);
+}
+void TrimCounters::Request::decode(bufferlist::const_iterator& p)
+{
+  DECODE_START(1, p);
+  decode(max_buckets, p);
+  DECODE_FINISH(p);
+}
+WRITE_CLASS_ENCODER(TrimCounters::Request);
+
+void TrimCounters::Response::encode(bufferlist& bl) const
+{
+  ENCODE_START(1, 1, bl);
+  encode(bucket_counters, bl);
+  ENCODE_FINISH(bl);
+}
+void TrimCounters::Response::decode(bufferlist::const_iterator& p)
+{
+  DECODE_START(1, p);
+  decode(bucket_counters, p);
+  DECODE_FINISH(p);
+}
+WRITE_CLASS_ENCODER(TrimCounters::Response);
+
+/// handle NotifyTrimCounters: reply with up to the requested number of
+/// bucket counters, capped at 128 to bound the response size
+void TrimCounters::Handler::handle(bufferlist::const_iterator& input,
+                                   bufferlist& output)
+{
+  Request request;
+  decode(request, input);
+  auto count = std::min<uint16_t>(request.max_buckets, 128);
+
+  Response response;
+  server->get_bucket_counters(count, response.bucket_counters);
+  encode(response, output);
+}
+
+/// api to notify peer gateways that trim has completed and their bucket change
+/// counters can be reset
+struct TrimComplete {
+  /// empty payload: the notification itself carries the signal
+  struct Request {
+    void encode(bufferlist& bl) const;
+    void decode(bufferlist::const_iterator& p);
+  };
+  /// empty acknowledgement
+  struct Response {
+    void encode(bufferlist& bl) const;
+    void decode(bufferlist::const_iterator& p);
+  };
+
+  /// server interface to reset bucket counters
+  using Server = TrimCounters::Server;
+
+  /// notify handler
+  class Handler : public TrimNotifyHandler {
+    Server *const server;
+  public:
+    explicit Handler(Server *server) : server(server) {}
+
+    void handle(bufferlist::const_iterator& input, bufferlist& output) override;
+  };
+};
+
+// TrimComplete messages carry no fields; the versioned envelope alone
+// serves as signal and ack
+void TrimComplete::Request::encode(bufferlist& bl) const
+{
+  ENCODE_START(1, 1, bl);
+  ENCODE_FINISH(bl);
+}
+void TrimComplete::Request::decode(bufferlist::const_iterator& p)
+{
+  DECODE_START(1, p);
+  DECODE_FINISH(p);
+}
+WRITE_CLASS_ENCODER(TrimComplete::Request);
+
+void TrimComplete::Response::encode(bufferlist& bl) const
+{
+  ENCODE_START(1, 1, bl);
+  ENCODE_FINISH(bl);
+}
+void TrimComplete::Response::decode(bufferlist::const_iterator& p)
+{
+  DECODE_START(1, p);
+  DECODE_FINISH(p);
+}
+WRITE_CLASS_ENCODER(TrimComplete::Response);
+
+/// handle NotifyTrimComplete: reset the local change counters and ack
+void TrimComplete::Handler::handle(bufferlist::const_iterator& input,
+                                   bufferlist& output)
+{
+  Request request;
+  decode(request, input);
+
+  server->reset_bucket_counters();
+
+  Response response;
+  encode(response, output);
+}
+
+
+/// rados watcher for bucket trim notifications
+class BucketTrimWatcher : public librados::WatchCtx2 {
+  rgw::sal::RadosStore* const store;
+  const rgw_raw_obj& obj; //< control object that notifications target
+  rgw_rados_ref ref;
+  uint64_t handle{0}; //< watch handle; 0 when no watch is registered
+
+  using HandlerPtr = std::unique_ptr<TrimNotifyHandler>;
+  boost::container::flat_map<TrimNotifyType, HandlerPtr> handlers;
+
+ public:
+  BucketTrimWatcher(rgw::sal::RadosStore* store, const rgw_raw_obj& obj,
+                    TrimCounters::Server *counters)
+    : store(store), obj(obj) {
+    handlers.emplace(NotifyTrimCounters,
+                     std::make_unique<TrimCounters::Handler>(counters));
+    handlers.emplace(NotifyTrimComplete,
+                     std::make_unique<TrimComplete::Handler>(counters));
+  }
+
+  ~BucketTrimWatcher() {
+    stop();
+  }
+
+  /// establish the watch, creating the control object if it doesn't exist
+  int start(const DoutPrefixProvider *dpp) {
+    int r = store->getRados()->get_raw_obj_ref(dpp, obj, &ref);
+    if (r < 0) {
+      return r;
+    }
+
+    // register a watch on the realm's control object
+    r = ref.pool.ioctx().watch2(ref.obj.oid, &handle, this);
+    if (r == -ENOENT) {
+      constexpr bool exclusive = true;
+      r = ref.pool.ioctx().create(ref.obj.oid, exclusive);
+      if (r == -EEXIST || r == 0) {
+        // -EEXIST means another gateway created it first; watch it anyway
+        r = ref.pool.ioctx().watch2(ref.obj.oid, &handle, this);
+      }
+    }
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << "Failed to watch " << ref.obj
+          << " with " << cpp_strerror(-r) << dendl;
+      ref.pool.ioctx().close();
+      return r;
+    }
+
+    ldpp_dout(dpp, 10) << "Watching " << ref.obj.oid << dendl;
+    return 0;
+  }
+
+  /// tear down and re-register the watch (after a disconnect)
+  int restart() {
+    int r = ref.pool.ioctx().unwatch2(handle);
+    if (r < 0) {
+      lderr(store->ctx()) << "Failed to unwatch on " << ref.obj
+          << " with " << cpp_strerror(-r) << dendl;
+    }
+    r = ref.pool.ioctx().watch2(ref.obj.oid, &handle, this);
+    if (r < 0) {
+      lderr(store->ctx()) << "Failed to restart watch on " << ref.obj
+          << " with " << cpp_strerror(-r) << dendl;
+      ref.pool.ioctx().close();
+    }
+    return r;
+  }
+
+  /// drop the watch and release the ioctx, if a watch is registered
+  void stop() {
+    if (handle) {
+      ref.pool.ioctx().unwatch2(handle);
+      ref.pool.ioctx().close();
+    }
+  }
+
+  /// respond to bucket trim notifications
+  void handle_notify(uint64_t notify_id, uint64_t cookie,
+                     uint64_t notifier_id, bufferlist& bl) override {
+    if (cookie != handle) {
+      return;
+    }
+    bufferlist reply;
+    try {
+      auto p = bl.cbegin();
+      TrimNotifyType type;
+      decode(type, p);
+
+      // dispatch to the registered handler for this notification type
+      auto handler = handlers.find(type);
+      if (handler != handlers.end()) {
+        handler->second->handle(p, reply);
+      } else {
+        lderr(store->ctx()) << "no handler for notify type " << type << dendl;
+      }
+    } catch (const buffer::error& e) {
+      lderr(store->ctx()) << "Failed to decode notification: " << e.what() << dendl;
+    }
+    // always ack (possibly with an empty reply) so the notifier doesn't block
+    ref.pool.ioctx().notify_ack(ref.obj.oid, notify_id, cookie, reply);
+  }
+
+  /// reestablish the watch if it gets disconnected
+  void handle_error(uint64_t cookie, int err) override {
+    if (cookie != handle) {
+      return;
+    }
+    if (err == -ENOTCONN) {
+      ldout(store->ctx(), 4) << "Disconnected watch on " << ref.obj << dendl;
+      restart();
+    }
+  }
+};
+
+
+/// Interface to communicate with the trim manager about completed operations
+struct BucketTrimObserver {
+  virtual ~BucketTrimObserver() = default;
+
+  /// note that trimming completed for the given bucket instance
+  virtual void on_bucket_trimmed(std::string&& bucket_instance) = 0;
+  /// whether this bucket instance was trimmed recently (used to skip it)
+  virtual bool trimmed_recently(const std::string_view& bucket_instance) = 0;
+};
+
+/// trim each bilog shard to the given marker, while limiting the number of
+/// concurrent requests
+class BucketTrimShardCollectCR : public RGWShardCollectCR {
+  static constexpr int MAX_CONCURRENT_SHARDS = 16;
+  const DoutPrefixProvider *dpp;
+  rgw::sal::RadosStore* const store;
+  const RGWBucketInfo& bucket_info;
+  rgw::bucket_index_layout_generation generation;
+  const std::vector<std::string>& markers; //< shard markers to trim
+  size_t i{0}; //< index of current shard marker
+
+  // ENOENT from a shard is tolerated so missing shards don't abort the batch
+  int handle_result(int r) override {
+    if (r == -ENOENT) { // ENOENT is not a fatal error
+      return 0;
+    }
+    if (r < 0) {
+      ldout(cct, 4) << "failed to trim bilog shard: " << cpp_strerror(r) << dendl;
+    }
+    return r;
+  }
+ public:
+  BucketTrimShardCollectCR(const DoutPrefixProvider *dpp,
+                           rgw::sal::RadosStore* store, const RGWBucketInfo& bucket_info,
+                           const rgw::bucket_index_layout_generation& generation,
+                           const std::vector<std::string>& markers)
+    : RGWShardCollectCR(store->ctx(), MAX_CONCURRENT_SHARDS),
+      dpp(dpp), store(store), bucket_info(bucket_info),
+      generation(generation), markers(markers)
+  {}
+  bool spawn_next() override;
+};
+
+bool BucketTrimShardCollectCR::spawn_next()
+{
+  // advance to the next non-empty marker and spawn a trim for that shard
+  while (i < markers.size()) {
+    const auto& marker = markers[i];
+    const auto shard_id = i++; // capture the index, then advance
+
+    // skip empty markers
+    if (!marker.empty()) {
+      ldpp_dout(dpp, 10) << "trimming bilog shard " << shard_id
+          << " of " << bucket_info.bucket << " at marker " << marker << dendl;
+      spawn(new RGWRadosBILogTrimCR(dpp, store, bucket_info, shard_id,
+                                    generation, std::string{}, marker),
+            false);
+      return true;
+    }
+  }
+  return false;
+}
+
+/// Delete a BI generation, limiting the number of requests in flight.
+class BucketCleanIndexCollectCR : public RGWShardCollectCR {
+  static constexpr int MAX_CONCURRENT_SHARDS = 16;
+  const DoutPrefixProvider *dpp;
+  rgw::sal::RadosStore* const store;
+  const RGWBucketInfo& bucket_info;
+  rgw::bucket_index_layout_generation index;
+  uint32_t shard = 0; //< next shard whose index object gets removed
+  const uint32_t num_shards = rgw::num_shards(index);
+
+  int handle_result(int r) override {
+    if (r == -ENOENT) { // ENOENT is not a fatal error
+      return 0;
+    }
+    if (r < 0) {
+      ldout(cct, 4) << "clean index: " << cpp_strerror(r) << dendl;
+    }
+    return r;
+  }
+ public:
+  BucketCleanIndexCollectCR(const DoutPrefixProvider *dpp,
+                            rgw::sal::RadosStore* store,
+                            const RGWBucketInfo& bucket_info,
+                            rgw::bucket_index_layout_generation index)
+    : RGWShardCollectCR(store->ctx(), MAX_CONCURRENT_SHARDS),
+      dpp(dpp), store(store), bucket_info(bucket_info),
+      index(index)
+  {}
+  /// spawn removal of the next shard's index object
+  bool spawn_next() override {
+    if (shard < num_shards) {
+      RGWRados::BucketShard bs(store->getRados());
+      // NOTE(review): bs.init()'s return value is ignored here; confirm a
+      // failed init leaves bucket_obj in a state RGWRadosRemoveOidCR handles.
+      bs.init(dpp, bucket_info, index, shard);
+      spawn(new RGWRadosRemoveOidCR(store, std::move(bs.bucket_obj), nullptr),
+            false);
+      ++shard;
+      return true;
+    } else {
+      return false;
+    }
+  }
+};
+
+
+/// trim the bilog of all of the given bucket instance's shards
+class BucketTrimInstanceCR : public RGWCoroutine {
+  static constexpr auto MAX_RETRIES = 25u;
+  rgw::sal::RadosStore* const store;
+  RGWHTTPManager *const http;
+  BucketTrimObserver *const observer;
+  std::string bucket_instance;
+  rgw_bucket_get_sync_policy_params get_policy_params;
+  std::shared_ptr<rgw_bucket_get_sync_policy_result> source_policy;
+  rgw_bucket bucket;
+  const std::string& zone_id; //< my zone id
+  RGWBucketInfo _bucket_info;
+  const RGWBucketInfo *pbucket_info; //< pointer to bucket instance info to locate bucket indices
+  int child_ret = 0;
+  const DoutPrefixProvider *dpp;
+public:
+  /// one peer's sync status: its incremental generation plus shard markers
+  struct StatusShards {
+    uint64_t generation = 0;
+    std::vector<rgw_bucket_shard_sync_info> shards;
+  };
+private:
+  std::vector<StatusShards> peer_status; //< sync status for each peer
+  std::vector<std::string> min_markers; //< min marker per shard
+
+  /// The log generation to trim
+  rgw::bucket_log_layout_generation totrim;
+
+  /// Generation to be cleaned/New bucket info (if any)
+  std::optional<std::pair<RGWBucketInfo,
+                          rgw::bucket_log_layout_generation>> clean_info;
+  /// Maximum number of times to attempt to put bucket info
+  unsigned retries = 0;
+
+  /// pick the lowest generation any peer is still consuming (defaulting to
+  /// the bucket's newest when there are no peers) and resolve its layout
+  int take_min_generation() {
+    // Initialize the min_generation to the bucket's current
+    // generation, used in case we have no peers.
+    auto min_generation = pbucket_info->layout.logs.back().gen;
+
+    // Determine the minimum generation
+    if (auto m = std::min_element(peer_status.begin(),
+                                  peer_status.end(),
+                                  [](const StatusShards& l,
+                                     const StatusShards& r) {
+                                    return l.generation < r.generation;
+                                  }); m != peer_status.end()) {
+      min_generation = m->generation;
+    }
+
+    auto& logs = pbucket_info->layout.logs;
+    auto log = std::find_if(logs.begin(), logs.end(),
+                            rgw::matches_gen(min_generation));
+    if (log == logs.end()) {
+      ldpp_dout(dpp, 5) << __PRETTY_FUNCTION__ << ":" << __LINE__
+          << "ERROR: No log layout for min_generation="
+          << min_generation << dendl;
+      return -ENOENT;
+    }
+
+    totrim = *log;
+    return 0;
+  }
+
+  /// If there is a generation below the minimum, prepare to clean it up.
+  int maybe_remove_generation() {
+    if (clean_info)
+      return 0;
+
+
+    if (pbucket_info->layout.logs.front().gen < totrim.gen) {
+      clean_info = {*pbucket_info, {}};
+      auto log = clean_info->first.layout.logs.cbegin();
+      clean_info->second = *log;
+
+      // never erase the last remaining log generation
+      if (clean_info->first.layout.logs.size() == 1) {
+        ldpp_dout(dpp, -1)
+            << "Critical error! Attempt to remove only log generation! "
+            << "log.gen=" << log->gen << ", totrim.gen=" << totrim.gen
+            << dendl;
+        return -EIO;
+      }
+      clean_info->first.layout.logs.erase(log);
+    }
+    return 0;
+  }
+
+ public:
+  BucketTrimInstanceCR(rgw::sal::RadosStore* store, RGWHTTPManager *http,
+                       BucketTrimObserver *observer,
+                       const std::string& bucket_instance,
+                       const DoutPrefixProvider *dpp)
+    : RGWCoroutine(store->ctx()), store(store),
+      http(http), observer(observer),
+      bucket_instance(bucket_instance),
+      zone_id(store->svc()->zone->get_zone().id),
+      dpp(dpp) {
+    rgw_bucket_parse_bucket_key(cct, bucket_instance, &bucket, nullptr);
+    source_policy = make_shared<rgw_bucket_get_sync_policy_result>();
+  }
+
+  int operate(const DoutPrefixProvider *dpp) override;
+};
+
+namespace {
+/// populate the status with the minimum stable marker of each shard
+/// Fold each relevant peer's per-shard markers into *status, keeping the
+/// smallest marker seen for every shard. Peers that have already advanced
+/// past min_generation are skipped; all remaining peers must report the
+/// same shard count. Returns 0 on success, -EINVAL on a shard-count
+/// mismatch.
+int take_min_status(
+  CephContext *cct,
+  const uint64_t min_generation,
+  std::vector<BucketTrimInstanceCR::StatusShards>::const_iterator first,
+  std::vector<BucketTrimInstanceCR::StatusShards>::const_iterator last,
+  std::vector<std::string> *status) {
+  for (auto peer = first; peer != last; ++peer) {
+    // Peers on later generations don't get a say in the matter
+    if (peer->generation > min_generation) {
+      continue;
+    }
+    if (peer->shards.size() != status->size()) {
+      // all peers must agree on the number of shards
+      return -EINVAL;
+    }
+
+    auto m = status->begin();
+    for (const auto& shard : peer->shards) {
+      auto& marker = *m++;
+      // always take the first marker, or any later marker that's smaller
+      if (peer == first || marker > shard.inc_marker.position) {
+        // the source is const through the const_iterator, so copy it;
+        // the previous std::move here silently degraded to a copy anyway
+        // (clang-tidy performance-move-const-arg)
+        marker = shard.inc_marker.position;
+      }
+    }
+  }
+  return 0;
+}
+}
+
+// decode a peer's bucket-index sync status reply: prefer the v2 JSON
+// format (which carries the incremental generation); fall back to the
+// legacy shard-array format from older peers, implying generation 0
+template<>
+inline int parse_decode_json<BucketTrimInstanceCR::StatusShards>(
+  BucketTrimInstanceCR::StatusShards& s, bufferlist& bl)
+{
+  JSONParser p;
+  if (!p.parse(bl.c_str(), bl.length())) {
+    return -EINVAL;
+  }
+
+  try {
+    bilog_status_v2 v;
+    decode_json_obj(v, &p);
+    s.generation = v.sync_status.incremental_gen;
+    s.shards = std::move(v.inc_status);
+  } catch (JSONDecoder::err& e) {
+    try {
+      // Fall back if we're talking to an old node that can't give v2
+      // output.
+      s.generation = 0;
+      decode_json_obj(s.shards, &p);
+    } catch (JSONDecoder::err& e) {
+      return -EINVAL;
+    }
+  }
+  return 0;
+}
+
+int BucketTrimInstanceCR::operate(const DoutPrefixProvider *dpp)
+{
+  reenter(this) {
+    ldpp_dout(dpp, 4) << "starting trim on bucket=" << bucket_instance << dendl;
+
+    // fetch the bucket's sync policy handler (also yields its bucket info)
+    get_policy_params.zone = zone_id;
+    get_policy_params.bucket = bucket;
+    yield call(new RGWBucketGetSyncPolicyHandlerCR(store->svc()->rados->get_async_processor(),
+                                                   store,
+                                                   get_policy_params,
+                                                   source_policy,
+                                                   dpp));
+    if (retcode < 0) {
+      if (retcode != -ENOENT) {
+        ldpp_dout(dpp, 0) << "ERROR: failed to fetch policy handler for bucket=" << bucket << dendl;
+      }
+
+      return set_cr_error(retcode);
+    }
+
+    if (auto& opt_bucket_info = source_policy->policy_handler->get_bucket_info();
+        opt_bucket_info) {
+      pbucket_info = &(*opt_bucket_info);
+    } else {
+      /* this shouldn't really happen */
+      return set_cr_error(-ENOENT);
+    }
+
+    if (pbucket_info->layout.logs.empty()) {
+      return set_cr_done(); // no bilogs to trim
+    }
+
+    // query peers for sync status
+    set_status("fetching sync status from relevant peers");
+    yield {
+      const auto& all_dests = source_policy->policy_handler->get_all_dests();
+
+      // collect the distinct destination zones (skipping adjacent repeats)
+      vector<rgw_zone_id> zids;
+      rgw_zone_id last_zid;
+      for (auto& diter : all_dests) {
+        const auto& zid = diter.first;
+        if (zid == last_zid) {
+          continue;
+        }
+        last_zid = zid;
+        zids.push_back(zid);
+      }
+
+      peer_status.resize(zids.size());
+
+      auto& zone_conn_map = store->svc()->zone->get_zone_conn_map();
+
+      auto p = peer_status.begin();
+      for (auto& zid : zids) {
+        // query data sync status from each sync peer
+        rgw_http_param_pair params[] = {
+          { "type", "bucket-index" },
+          { "status", nullptr },
+          { "options", "merge" },
+          { "bucket", bucket_instance.c_str() }, /* equal to source-bucket when `options==merge` and source-bucket
+                                                    param is not provided */
+          { "source-zone", zone_id.c_str() },
+          { "version", "2" },
+          { nullptr, nullptr }
+        };
+
+        auto ziter = zone_conn_map.find(zid);
+        if (ziter == zone_conn_map.end()) {
+          ldpp_dout(dpp, 0) << "WARNING: no connection to zone " << zid << ", can't trim bucket: " << bucket << dendl;
+          return set_cr_error(-ECANCELED);
+        }
+
+        using StatusCR = RGWReadRESTResourceCR<StatusShards>;
+        spawn(new StatusCR(cct, ziter->second, http, "/admin/log/", params, &*p),
+              false);
+        ++p;
+      }
+    }
+    // wait for a response from each peer. all must respond to attempt trim
+    while (num_spawned()) {
+      yield wait_for_child();
+      collect(&child_ret, nullptr);
+      if (child_ret < 0) {
+        drain_all();
+        return set_cr_error(child_ret);
+      }
+    }
+
+    // Determine the minimum generation
+    retcode = take_min_generation();
+    if (retcode < 0) {
+      ldpp_dout(dpp, 4) << "failed to find minimum generation" << dendl;
+      return set_cr_error(retcode);
+    }
+    retcode = maybe_remove_generation();
+    if (retcode < 0) {
+      ldpp_dout(dpp, 4) << "error removing old generation from log: "
+          << cpp_strerror(retcode) << dendl;
+      return set_cr_error(retcode);
+    }
+
+    if (clean_info) {
+      // an older generation exists below the minimum: delete its index
+      // objects, then drop it from the bucket instance info
+      if (clean_info->second.layout.type != rgw::BucketLogType::InIndex) {
+        ldpp_dout(dpp, 0) << "Unable to convert log of unknown type "
+            << clean_info->second.layout.type
+            << " to rgw::bucket_index_layout_generation " << dendl;
+        return set_cr_error(-EINVAL);
+      }
+
+      yield call(new BucketCleanIndexCollectCR(dpp, store, clean_info->first,
+                                               clean_info->second.layout.in_index));
+      if (retcode < 0) {
+        ldpp_dout(dpp, 0) << "failed to remove previous generation: "
+            << cpp_strerror(retcode) << dendl;
+        return set_cr_error(retcode);
+      }
+      while (clean_info && retries < MAX_RETRIES) {
+        yield call(new RGWPutBucketInstanceInfoCR(
+                     store->svc()->rados->get_async_processor(),
+                     store, clean_info->first, false, {},
+                     no_change_attrs(), dpp));
+
+        // Raced, try again.
+        if (retcode == -ECANCELED) {
+          yield call(new RGWGetBucketInstanceInfoCR(
+                       store->svc()->rados->get_async_processor(),
+                       store, clean_info->first.bucket,
+                       &(clean_info->first), nullptr, dpp));
+          if (retcode < 0) {
+            ldpp_dout(dpp, 0) << "failed to get bucket info: "
+                << cpp_strerror(retcode) << dendl;
+            return set_cr_error(retcode);
+          }
+          if (clean_info->first.layout.logs.front().gen ==
+              clean_info->second.gen) {
+            clean_info->first.layout.logs.erase(
+              clean_info->first.layout.logs.begin());
+            ++retries;
+            continue;
+          }
+          // Raced, but someone else did what we needed to.
+          retcode = 0;
+        }
+
+        if (retcode < 0) {
+          ldpp_dout(dpp, 0) << "failed to put bucket info: "
+              << cpp_strerror(retcode) << dendl;
+          return set_cr_error(retcode);
+        }
+        clean_info = std::nullopt;
+      }
+    } else {
+      if (totrim.layout.type != rgw::BucketLogType::InIndex) {
+        ldpp_dout(dpp, 0) << "Unable to convert log of unknown type "
+            << totrim.layout.type
+            << " to rgw::bucket_index_layout_generation " << dendl;
+        return set_cr_error(-EINVAL);
+      }
+      // To avoid hammering the OSD too hard, either trim old
+      // generations OR trim the current one.
+
+      // determine the minimum marker for each shard
+
+      // initialize each shard with the maximum marker, which is only used when
+      // there are no peers syncing from us
+      min_markers.assign(std::max(1u, rgw::num_shards(totrim.layout.in_index)),
+                         RGWSyncLogTrimCR::max_marker);
+
+
+      retcode = take_min_status(cct, totrim.gen, peer_status.cbegin(),
+                                peer_status.cend(), &min_markers);
+      if (retcode < 0) {
+        ldpp_dout(dpp, 4) << "failed to correlate bucket sync status from peers" << dendl;
+        return set_cr_error(retcode);
+      }
+
+      // trim shards with a ShardCollectCR
+      ldpp_dout(dpp, 10) << "trimming bilogs for bucket=" << pbucket_info->bucket
+          << " markers=" << min_markers << ", shards=" << min_markers.size() << dendl;
+      set_status("trimming bilog shards");
+      yield call(new BucketTrimShardCollectCR(dpp, store, *pbucket_info, totrim.layout.in_index,
+                                              min_markers));
+      // ENODATA just means there were no keys to trim
+      if (retcode == -ENODATA) {
+        retcode = 0;
+      }
+      if (retcode < 0) {
+        ldpp_dout(dpp, 4) << "failed to trim bilog shards: "
+            << cpp_strerror(retcode) << dendl;
+        return set_cr_error(retcode);
+      }
+    }
+
+    observer->on_bucket_trimmed(std::move(bucket_instance));
+    return set_cr_done();
+  }
+  return 0;
+}
+
+/// trim each bucket instance while limiting the number of concurrent operations
+
+class BucketTrimInstanceCollectCR : public RGWShardCollectCR {
+  rgw::sal::RadosStore* const store;
+  RGWHTTPManager *const http;
+  BucketTrimObserver *const observer;
+  std::vector<std::string>::const_iterator bucket; //< next instance to trim
+  std::vector<std::string>::const_iterator end;
+  const DoutPrefixProvider *dpp;
+
+  // a missing bucket instance is not fatal to the batch
+  int handle_result(int r) override {
+    if (r == -ENOENT) { // ENOENT is not a fatal error
+      return 0;
+    }
+    if (r < 0) {
+      ldout(cct, 4) << "failed to trim bucket instance: " << cpp_strerror(r) << dendl;
+    }
+    return r;
+  }
+ public:
+  BucketTrimInstanceCollectCR(rgw::sal::RadosStore* store, RGWHTTPManager *http,
+                              BucketTrimObserver *observer,
+                              const std::vector<std::string>& buckets,
+                              int max_concurrent,
+                              const DoutPrefixProvider *dpp)
+    : RGWShardCollectCR(store->ctx(), max_concurrent),
+      store(store), http(http), observer(observer),
+      bucket(buckets.begin()), end(buckets.end()),
+      dpp(dpp)
+  {}
+  bool spawn_next() override;
+};
+
+/// Dispatch a trim coroutine for the next bucket instance, if any remain.
+/// Returns false once the instance list is exhausted.
+bool BucketTrimInstanceCollectCR::spawn_next()
+{
+  // nothing left to dispatch?
+  if (bucket == end) {
+    return false;
+  }
+  // start the next instance trim; the base class throttles concurrency
+  spawn(new BucketTrimInstanceCR(store, http, observer, *bucket++, dpp), false);
+  return true;
+}
+
+/// correlate the replies from each peer gateway into the given counter
+/// correlate the replies from each peer gateway into the given counter
+int accumulate_peer_counters(bufferlist& bl, BucketChangeCounter& counter)
+{
+  counter.clear();
+
+  try {
+    // a notify reply encodes the acked responses followed by the set of
+    // watchers that timed out
+    std::map<std::pair<uint64_t, uint64_t>, bufferlist> acks;
+    std::set<std::pair<uint64_t, uint64_t>> timeouts;
+    auto p = bl.cbegin();
+    decode(acks, p);
+    decode(timeouts, p);
+
+    // fold every peer's bucket counters into the shared counter
+    for (auto& [who, reply] : acks) {
+      auto q = reply.cbegin();
+      TrimCounters::Response response;
+      decode(response, q);
+      for (const auto& entry : response.bucket_counters) {
+        counter.insert(entry.bucket, entry.count);
+      }
+    }
+  } catch (const buffer::error& e) {
+    return -EIO; // malformed reply payload
+  }
+  return 0;
+}
+
+/// metadata callback has the signature bool(string&& key, string&& marker)
+using MetadataListCallback = std::function<bool(std::string&&, std::string&&)>;
+
+/// lists metadata keys, passing each to a callback until it returns false.
+/// on reaching the end, it will restart at the beginning and list up to the
+/// initial marker
+class AsyncMetadataList : public RGWAsyncRadosRequest {
+  CephContext *const cct;
+  RGWMetadataManager *const mgr;
+  const std::string section;      //< metadata section to list (e.g. bucket.instance)
+  const std::string start_marker; //< position to start at; listing wraps past the end
+  MetadataListCallback callback;  //< invoked per key until it returns false
+
+  /// runs on the async rados thread; performs the (possibly wrapping) listing
+  int _send_request(const DoutPrefixProvider *dpp) override;
+ public:
+  AsyncMetadataList(CephContext *cct, RGWCoroutine *caller,
+                    RGWAioCompletionNotifier *cn, RGWMetadataManager *mgr,
+                    const std::string& section, const std::string& start_marker,
+                    const MetadataListCallback& callback)
+    : RGWAsyncRadosRequest(caller, cn), cct(cct), mgr(mgr),
+      section(section), start_marker(start_marker), callback(callback)
+  {}
+};
+
+/// List metadata keys one at a time, feeding each key/marker pair to the
+/// callback until it returns false. If the listing began at a non-empty
+/// start_marker and reaches the end, restart from the beginning and continue
+/// up to (and including) the original marker, so every key is visited once.
+int AsyncMetadataList::_send_request(const DoutPrefixProvider *dpp)
+{
+  void* handle = nullptr;
+  std::list<std::string> keys;
+  bool truncated{false};
+  std::string marker;
+
+  // start a listing at the given marker
+  int r = mgr->list_keys_init(dpp, section, start_marker, &handle);
+  if (r == -EINVAL) {
+    // restart with empty marker below
+  } else if (r < 0) {
+    ldpp_dout(dpp, 10) << "failed to init metadata listing: "
+        << cpp_strerror(r) << dendl;
+    return r;
+  } else {
+    ldpp_dout(dpp, 20) << "starting metadata listing at " << start_marker << dendl;
+
+    // release the handle when scope exits
+    auto g = make_scope_guard([=, this] { mgr->list_keys_complete(handle); });
+
+    do {
+      // get the next key and marker (one at a time so each key gets a marker)
+      r = mgr->list_keys_next(dpp, handle, 1, keys, &truncated);
+      if (r < 0) {
+        ldpp_dout(dpp, 10) << "failed to list metadata: "
+            << cpp_strerror(r) << dendl;
+        return r;
+      }
+      marker = mgr->get_marker(handle);
+
+      if (!keys.empty()) {
+        ceph_assert(keys.size() == 1);
+        auto& key = keys.front();
+        // callback returning false means the caller has seen enough
+        if (!callback(std::move(key), std::move(marker))) {
+          return 0;
+        }
+      }
+    } while (truncated);
+
+    if (start_marker.empty()) {
+      // already listed all keys
+      return 0;
+    }
+  }
+
+  // restart the listing from the beginning (empty marker)
+  handle = nullptr;
+
+  r = mgr->list_keys_init(dpp, section, "", &handle);
+  if (r < 0) {
+    ldpp_dout(dpp, 10) << "failed to restart metadata listing: "
+        << cpp_strerror(r) << dendl;
+    return r;
+  }
+  ldpp_dout(dpp, 20) << "restarting metadata listing" << dendl;
+
+  // release the handle when scope exits
+  auto g = make_scope_guard([=, this] { mgr->list_keys_complete(handle); });
+  do {
+    // get the next key and marker
+    r = mgr->list_keys_next(dpp, handle, 1, keys, &truncated);
+    if (r < 0) {
+      ldpp_dout(dpp, 10) << "failed to list metadata: "
+          << cpp_strerror(r) << dendl;
+      return r;
+    }
+    marker = mgr->get_marker(handle);
+
+    if (!keys.empty()) {
+      ceph_assert(keys.size() == 1);
+      auto& key = keys.front();
+      // stop at original marker to avoid visiting keys twice
+      if (marker > start_marker) {
+        return 0;
+      }
+      if (!callback(std::move(key), std::move(marker))) {
+        return 0;
+      }
+    }
+  } while (truncated);
+
+  return 0;
+}
+
+/// coroutine wrapper for AsyncMetadataList
+/// coroutine wrapper for AsyncMetadataList
+/// NOTE(review): 'section' and 'start_marker' are stored as references — the
+/// caller's strings must outlive this coroutine (true for the uses below,
+/// where BucketTrimCR owns both).
+class MetadataListCR : public RGWSimpleCoroutine {
+  RGWAsyncRadosProcessor *const async_rados;
+  RGWMetadataManager *const mgr;
+  const std::string& section;
+  const std::string& start_marker;
+  MetadataListCallback callback;
+  RGWAsyncRadosRequest *req{nullptr};
+ public:
+  MetadataListCR(CephContext *cct, RGWAsyncRadosProcessor *async_rados,
+                 RGWMetadataManager *mgr, const std::string& section,
+                 const std::string& start_marker,
+                 const MetadataListCallback& callback)
+    : RGWSimpleCoroutine(cct), async_rados(async_rados), mgr(mgr),
+      section(section), start_marker(start_marker), callback(callback)
+  {}
+  ~MetadataListCR() override {
+    request_cleanup();
+  }
+
+  /// queue the async listing request on the rados processor
+  int send_request(const DoutPrefixProvider *dpp) override {
+    req = new AsyncMetadataList(cct, this, stack->create_completion_notifier(),
+                                mgr, section, start_marker, callback);
+    async_rados->queue(req);
+    return 0;
+  }
+  int request_complete() override {
+    return req->get_ret_status();
+  }
+  void request_cleanup() override {
+    if (req) {
+      req->finish();
+      req = nullptr;
+    }
+  }
+};
+
+/// One full pass of the bucket trim process: query peers for the hottest
+/// buckets, pad the selection with 'cold' buckets from the bucket.instance
+/// metadata listing, trim each selected instance, persist the new listing
+/// marker, and notify peers of completion.
+class BucketTrimCR : public RGWCoroutine {
+  rgw::sal::RadosStore* const store;
+  RGWHTTPManager *const http;
+  const BucketTrimConfig& config;
+  BucketTrimObserver *const observer;
+  const rgw_raw_obj& obj; //< status/notify object
+  ceph::mono_time start_time;
+  bufferlist notify_replies;
+  BucketChangeCounter counter;
+  std::vector<std::string> buckets; //< buckets selected for trim
+  BucketTrimStatus status;
+  RGWObjVersionTracker objv; //< version tracker for trim status object
+  std::string last_cold_marker; //< position for next trim marker
+  const DoutPrefixProvider *dpp;
+
+  static const std::string section; //< metadata section for bucket instances
+ public:
+  BucketTrimCR(rgw::sal::RadosStore* store, RGWHTTPManager *http,
+               const BucketTrimConfig& config, BucketTrimObserver *observer,
+               const rgw_raw_obj& obj, const DoutPrefixProvider *dpp)
+    : RGWCoroutine(store->ctx()), store(store), http(http), config(config),
+      observer(observer), obj(obj), counter(config.counter_size), dpp(dpp)
+  {}
+
+  int operate(const DoutPrefixProvider *dpp) override;
+};
+
+const std::string BucketTrimCR::section{"bucket.instance"};
+
+/// State machine for a single trim pass (boost::asio stackless coroutine).
+int BucketTrimCR::operate(const DoutPrefixProvider *dpp)
+{
+  reenter(this) {
+    start_time = ceph::mono_clock::now();
+
+    if (config.buckets_per_interval) {
+      // query watch/notify for hot buckets
+      ldpp_dout(dpp, 10) << "fetching active bucket counters" << dendl;
+      set_status("fetching active bucket counters");
+      yield {
+        // request the top bucket counters from each peer gateway
+        const TrimNotifyType type = NotifyTrimCounters;
+        TrimCounters::Request request{32};
+        bufferlist bl;
+        encode(type, bl);
+        encode(request, bl);
+        call(new RGWRadosNotifyCR(store, obj, bl, config.notify_timeout_ms,
+                                  &notify_replies));
+      }
+      if (retcode < 0) {
+        ldpp_dout(dpp, 10) << "failed to fetch peer bucket counters" << dendl;
+        return set_cr_error(retcode);
+      }
+
+      // select the hottest buckets for trim
+      retcode = accumulate_peer_counters(notify_replies, counter);
+      if (retcode < 0) {
+        ldout(cct, 4) << "failed to correlate peer bucket counters" << dendl;
+        return set_cr_error(retcode);
+      }
+      buckets.reserve(config.buckets_per_interval);
+
+      // reserve min_cold_buckets_per_interval slots for cold buckets so that
+      // infrequently-changed buckets still get trimmed eventually
+      const int max_count = config.buckets_per_interval -
+                            config.min_cold_buckets_per_interval;
+      counter.get_highest(max_count,
+        [this] (const std::string& bucket, int count) {
+          buckets.push_back(bucket);
+        });
+    }
+
+    if (buckets.size() < config.buckets_per_interval) {
+      // read BucketTrimStatus for marker position
+      set_status("reading trim status");
+      using ReadStatus = RGWSimpleRadosReadCR<BucketTrimStatus>;
+      yield call(new ReadStatus(dpp, store, obj, &status, true, &objv));
+      if (retcode < 0) {
+        ldpp_dout(dpp, 10) << "failed to read bilog trim status: "
+            << cpp_strerror(retcode) << dendl;
+        return set_cr_error(retcode);
+      }
+      if (status.marker == "MAX") {
+        status.marker.clear(); // restart at the beginning
+      }
+      ldpp_dout(dpp, 10) << "listing cold buckets from marker="
+          << status.marker << dendl;
+
+      set_status("listing cold buckets for trim");
+      yield {
+        // capture a reference so 'this' remains valid in the callback
+        auto ref = boost::intrusive_ptr<RGWCoroutine>{this};
+        // list cold buckets to consider for trim
+        auto cb = [this, ref] (std::string&& bucket, std::string&& marker) {
+          // filter out keys that we trimmed recently
+          if (observer->trimmed_recently(bucket)) {
+            return true;
+          }
+          // filter out active buckets that we've already selected
+          auto i = std::find(buckets.begin(), buckets.end(), bucket);
+          if (i != buckets.end()) {
+            return true;
+          }
+          buckets.emplace_back(std::move(bucket));
+          // remember the last cold bucket spawned to update the status marker
+          last_cold_marker = std::move(marker);
+          // return true if there's room for more
+          return buckets.size() < config.buckets_per_interval;
+        };
+
+        call(new MetadataListCR(cct, store->svc()->rados->get_async_processor(),
+                                store->ctl()->meta.mgr,
+                                section, status.marker, cb));
+      }
+      if (retcode < 0) {
+        ldout(cct, 4) << "failed to list bucket instance metadata: "
+            << cpp_strerror(retcode) << dendl;
+        return set_cr_error(retcode);
+      }
+    }
+
+    // trim bucket instances with limited concurrency
+    set_status("trimming buckets");
+    ldpp_dout(dpp, 4) << "collected " << buckets.size() << " buckets for trim" << dendl;
+    yield call(new BucketTrimInstanceCollectCR(store, http, observer, buckets,
+                                               config.concurrent_buckets, dpp));
+    // ignore errors from individual buckets
+
+    // write updated trim status
+    if (!last_cold_marker.empty() && status.marker != last_cold_marker) {
+      set_status("writing updated trim status");
+      status.marker = std::move(last_cold_marker);
+      ldpp_dout(dpp, 20) << "writing bucket trim marker=" << status.marker << dendl;
+      using WriteStatus = RGWSimpleRadosWriteCR<BucketTrimStatus>;
+      yield call(new WriteStatus(dpp, store, obj, status, &objv));
+      if (retcode < 0) {
+        ldpp_dout(dpp, 4) << "failed to write updated trim status: "
+            << cpp_strerror(retcode) << dendl;
+        return set_cr_error(retcode);
+      }
+    }
+
+    // notify peers that trim completed so they can reset their counters
+    set_status("trim completed");
+    yield {
+      const TrimNotifyType type = NotifyTrimComplete;
+      TrimComplete::Request request;
+      bufferlist bl;
+      encode(type, bl);
+      encode(request, bl);
+      call(new RGWRadosNotifyCR(store, obj, bl, config.notify_timeout_ms,
+                                nullptr));
+    }
+    if (retcode < 0) {
+      ldout(cct, 10) << "failed to notify peers of trim completion" << dendl;
+      return set_cr_error(retcode);
+    }
+
+    ldpp_dout(dpp, 4) << "bucket index log processing completed in "
+        << ceph::mono_clock::now() - start_time << dendl;
+    return set_cr_done();
+  }
+  return 0;
+}
+
+/// Periodically runs BucketTrimCR under a rados lock so only one gateway
+/// trims at a time.
+class BucketTrimPollCR : public RGWCoroutine {
+  rgw::sal::RadosStore* const store;
+  RGWHTTPManager *const http;
+  const BucketTrimConfig& config;
+  BucketTrimObserver *const observer;
+  const rgw_raw_obj& obj;
+  const std::string name{"trim"}; //< lock name
+  const std::string cookie;       //< random lock cookie for this gateway
+  const DoutPrefixProvider *dpp;
+
+ public:
+  BucketTrimPollCR(rgw::sal::RadosStore* store, RGWHTTPManager *http,
+                   const BucketTrimConfig& config,
+                   BucketTrimObserver *observer, const rgw_raw_obj& obj,
+                   const DoutPrefixProvider *dpp)
+    : RGWCoroutine(store->ctx()), store(store), http(http),
+      config(config), observer(observer), obj(obj),
+      cookie(RGWSimpleRadosLockCR::gen_random_cookie(cct)),
+      dpp(dpp) {}
+
+  int operate(const DoutPrefixProvider *dpp) override;
+};
+
+/// Endless loop: sleep for the trim interval, take the trim lock, run one
+/// trim pass. On success the lock is deliberately left to expire so peers
+/// don't repeat the work during this interval; on error it is released early.
+int BucketTrimPollCR::operate(const DoutPrefixProvider *dpp)
+{
+  reenter(this) {
+    for (;;) {
+      set_status("sleeping");
+      wait(utime_t{static_cast<time_t>(config.trim_interval_sec), 0});
+
+      // prevent others from trimming for our entire wait interval
+      set_status("acquiring trim lock");
+      yield call(new RGWSimpleRadosLockCR(store->svc()->rados->get_async_processor(), store,
+                                          obj, name, cookie,
+                                          config.trim_interval_sec));
+      if (retcode < 0) {
+        // another gateway holds the lock; try again next interval
+        ldout(cct, 4) << "failed to lock: " << cpp_strerror(retcode) << dendl;
+        continue;
+      }
+
+      set_status("trimming");
+      yield call(new BucketTrimCR(store, http, config, observer, obj, dpp));
+      if (retcode < 0) {
+        // on errors, unlock so other gateways can try
+        set_status("unlocking");
+        yield call(new RGWSimpleRadosUnlockCR(store->svc()->rados->get_async_processor(), store,
+                                              obj, name, cookie));
+      }
+    }
+  }
+  return 0;
+}
+
+/// tracks a bounded list of events with timestamps. old events can be expired,
+/// and recent events can be searched by key. expiration depends on events being
+/// inserted in temporal order
+template <typename T, typename Clock = ceph::coarse_mono_clock>
+class RecentEventList {
+ public:
+  using clock_type = Clock;
+  using time_point = typename clock_type::time_point;
+
+  /// bound the list to 'max_size' entries; entries older than 'max_duration'
+  /// become eligible for expire_old()
+  RecentEventList(size_t max_size, const ceph::timespan& max_duration)
+    : events(max_size), max_duration(max_duration)
+  {}
+
+  /// insert an event at the given point in time. this time must be at least as
+  /// recent as the last inserted event (expiration relies on temporal order)
+  void insert(T&& value, const time_point& now) {
+    // ceph_assert(events.empty() || now >= events.back().time)
+    events.push_back(Event{std::move(value), now});
+  }
+
+  /// performs a linear search for an event matching the given key, whose type
+  /// U can be any that provides operator==(U, T)
+  template <typename U>
+  bool lookup(const U& key) const {
+    auto matches = [&key] (const Event& event) { return key == event.value; };
+    for (const auto& event : events) {
+      if (matches(event)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /// remove events that are no longer recent compared to the given point in time
+  void expire_old(const time_point& now) {
+    const auto cutoff = now - max_duration;
+    while (!events.empty()) {
+      if (!(events.front().time < cutoff)) {
+        break; // oldest entry is still recent; the rest are newer
+      }
+      events.pop_front();
+    }
+  }
+
+ private:
+  /// a stored value together with its insertion time
+  struct Event {
+    T value;
+    time_point time;
+  };
+  boost::circular_buffer<Event> events;
+  const ceph::timespan max_duration;
+};
+
+namespace rgw {
+
+// read bucket trim configuration from ceph context
+void configure_bucket_trim(CephContext *cct, BucketTrimConfig& config)
+{
+  const auto& conf = cct->_conf;
+
+  config.trim_interval_sec =
+      conf.get_val<int64_t>("rgw_sync_log_trim_interval");
+  config.counter_size = 512; // fixed; not exposed as a config option
+  config.buckets_per_interval =
+      conf.get_val<int64_t>("rgw_sync_log_trim_max_buckets");
+  config.min_cold_buckets_per_interval =
+      conf.get_val<int64_t>("rgw_sync_log_trim_min_cold_buckets");
+  config.concurrent_buckets =
+      conf.get_val<int64_t>("rgw_sync_log_trim_concurrent_buckets");
+  config.notify_timeout_ms = 10000;           // fixed 10s notify timeout
+  config.recent_size = 128;                   // fixed recent-trim list bound
+  config.recent_duration = std::chrono::hours(2);
+}
+
+/// Shared state of the trim manager: the bucket change counter fed by data
+/// sync, the list of recently trimmed buckets, and the watch/notify server.
+/// All mutable state is guarded by 'mutex'.
+class BucketTrimManager::Impl : public TrimCounters::Server,
+                                public BucketTrimObserver {
+ public:
+  rgw::sal::RadosStore* const store;
+  const BucketTrimConfig config;
+
+  const rgw_raw_obj status_obj;
+
+  /// count frequency of bucket instance entries in the data changes log
+  BucketChangeCounter counter;
+
+  using RecentlyTrimmedBucketList = RecentEventList<std::string>;
+  using clock_type = RecentlyTrimmedBucketList::clock_type;
+  /// track recently trimmed buckets to focus trim activity elsewhere
+  RecentlyTrimmedBucketList trimmed;
+
+  /// serve the bucket trim watch/notify api
+  BucketTrimWatcher watcher;
+
+  /// protect data shared between data sync, trim, and watch/notify threads
+  std::mutex mutex;
+
+  Impl(rgw::sal::RadosStore* store, const BucketTrimConfig& config)
+    : store(store), config(config),
+      status_obj(store->svc()->zone->get_zone_params().log_pool, BucketTrimStatus::oid),
+      counter(config.counter_size),
+      trimmed(config.recent_size, config.recent_duration),
+      watcher(store, status_obj, this)
+  {}
+
+  /// TrimCounters::Server interface for watch/notify api: report up to 'count'
+  /// of the most-changed bucket instances
+  void get_bucket_counters(int count, TrimCounters::Vector& buckets) override {
+    buckets.reserve(count);
+    std::lock_guard<std::mutex> lock(mutex);
+    // 'changes' is the per-bucket change count; renamed to avoid shadowing
+    // the 'count' argument above
+    counter.get_highest(count, [&buckets] (const std::string& key, int changes) {
+      buckets.emplace_back(key, changes);
+    });
+    ldout(store->ctx(), 20) << "get_bucket_counters: " << buckets << dendl;
+  }
+
+  /// called once a trim pass completes; start counting from scratch
+  void reset_bucket_counters() override {
+    ldout(store->ctx(), 20) << "bucket trim completed" << dendl;
+    std::lock_guard<std::mutex> lock(mutex);
+    counter.clear();
+    trimmed.expire_old(clock_type::now());
+  }
+
+  /// BucketTrimObserver interface to remember successfully-trimmed buckets
+  void on_bucket_trimmed(std::string&& bucket_instance) override {
+    ldout(store->ctx(), 20) << "trimmed bucket instance " << bucket_instance << dendl;
+    std::lock_guard<std::mutex> lock(mutex);
+    trimmed.insert(std::move(bucket_instance), clock_type::now());
+  }
+
+  bool trimmed_recently(const std::string_view& bucket_instance) override {
+    std::lock_guard<std::mutex> lock(mutex);
+    return trimmed.lookup(bucket_instance);
+  }
+};
+
+BucketTrimManager::BucketTrimManager(rgw::sal::RadosStore* store,
+                                     const BucketTrimConfig& config)
+  : impl(new Impl(store, config))
+{
+}
+// out-of-line so ~unique_ptr<Impl> sees the complete Impl type
+BucketTrimManager::~BucketTrimManager() = default;
+
+/// start the watch/notify server for trim counters/completion
+int BucketTrimManager::init()
+{
+  return impl->watcher.start(this);
+}
+
+/// Count a change to the given bucket instance, unless it was trimmed so
+/// recently that counting it again would be wasted trim effort.
+void BucketTrimManager::on_bucket_changed(const std::string_view& bucket)
+{
+  std::lock_guard<std::mutex> lock(impl->mutex);
+  // filter recently trimmed bucket instances out of bucket change counter
+  if (!impl->trimmed.lookup(bucket)) {
+    impl->counter.insert(std::string(bucket));
+  }
+}
+
+/// polling coroutine: trims every config.trim_interval_sec under a lock
+RGWCoroutine* BucketTrimManager::create_bucket_trim_cr(RGWHTTPManager *http)
+{
+  return new BucketTrimPollCR(impl->store, http, impl->config,
+                              impl.get(), impl->status_obj, this);
+}
+
+RGWCoroutine* BucketTrimManager::create_admin_bucket_trim_cr(RGWHTTPManager *http)
+{
+  // return the trim coroutine without any polling
+  return new BucketTrimCR(impl->store, http, impl->config,
+                          impl.get(), impl->status_obj, this);
+}
+
+// DoutPrefixProvider interface
+CephContext* BucketTrimManager::get_cct() const
+{
+  return impl->store->ctx();
+}
+
+unsigned BucketTrimManager::get_subsys() const
+{
+  return dout_subsys;
+}
+
+std::ostream& BucketTrimManager::gen_prefix(std::ostream& out) const
+{
+  return out << "rgw bucket trim manager: ";
+}
+
+} // namespace rgw
+
+/// Trim one shard of the bucket index log for the log generation 'gen',
+/// removing entries in (start_marker, end_marker].
+/// Returns -ENOENT if the bucket has no log layout for that generation.
+int bilog_trim(const DoutPrefixProvider* p, rgw::sal::RadosStore* store,
+               RGWBucketInfo& bucket_info, uint64_t gen, int shard_id,
+               std::string_view start_marker, std::string_view end_marker)
+{
+  auto& logs = bucket_info.layout.logs;
+  auto log = std::find_if(logs.begin(), logs.end(), rgw::matches_gen(gen));
+  if (log == logs.end()) {
+    // add a separator so the message doesn't render as "...:123ERROR:"
+    ldpp_dout(p, 5) << __PRETTY_FUNCTION__ << ":" << __LINE__
+        << " ERROR: no log layout with gen=" << gen << dendl;
+    return -ENOENT;
+  }
+
+  // no need to copy the layout generation; log_trim() only reads it
+  const auto& log_layout = *log;
+
+  auto r = store->svc()->bilog_rados->log_trim(p, bucket_info, log_layout, shard_id, start_marker, end_marker);
+  if (r < 0) {
+    ldpp_dout(p, 5) << __PRETTY_FUNCTION__ << ":" << __LINE__
+        << " ERROR: bilog_rados->log_trim returned r=" << r << dendl;
+  }
+  return r;
+}
diff --git a/src/rgw/driver/rados/rgw_trim_bilog.h b/src/rgw/driver/rados/rgw_trim_bilog.h
new file mode 100644
index 000000000..6a11d2476
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_trim_bilog.h
@@ -0,0 +1,121 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Red Hat, Inc
+ *
+ * Author: Casey Bodley <cbodley@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#pragma once
+
+#include <memory>
+#include <string_view>
+
+#include "include/common_fwd.h"
+#include "include/encoding.h"
+#include "common/ceph_time.h"
+#include "common/dout.h"
+#include "rgw_common.h"
+
+class RGWCoroutine;
+class RGWHTTPManager;
+
+namespace rgw {
+
+namespace sal {
+ class RadosStore;
+}
+
+/// Interface to inform the trim process about which buckets are most active
+/// Interface to inform the trim process about which buckets are most active
+struct BucketChangeObserver {
+  virtual ~BucketChangeObserver() = default;
+
+  /// called by data sync for each bucket-instance entry seen in the changes log
+  virtual void on_bucket_changed(const std::string_view& bucket_instance) = 0;
+};
+
+/// Configuration for BucketTrimManager
+/// Configuration for BucketTrimManager; populated by configure_bucket_trim()
+struct BucketTrimConfig {
+  /// time interval in seconds between bucket trim attempts
+  uint32_t trim_interval_sec{0};
+  /// maximum number of buckets to track with BucketChangeObserver
+  size_t counter_size{0};
+  /// maximum number of buckets to process each trim interval
+  uint32_t buckets_per_interval{0};
+  /// minimum number of buckets to choose from the global bucket instance list
+  uint32_t min_cold_buckets_per_interval{0};
+  /// maximum number of buckets to process in parallel
+  uint32_t concurrent_buckets{0};
+  /// timeout in ms for bucket trim notify replies
+  uint64_t notify_timeout_ms{0};
+  /// maximum number of recently trimmed buckets to remember (should be small
+  /// enough for a linear search)
+  size_t recent_size{0};
+  /// maximum duration to consider a trim as 'recent' (should be some multiple
+  /// of the trim interval, at least)
+  ceph::timespan recent_duration{0};
+};
+
+/// fill out the BucketTrimConfig from the ceph context
+void configure_bucket_trim(CephContext *cct, BucketTrimConfig& config);
+
+/// Determines the buckets on which to focus trim activity, using two sources of
+/// input: the frequency of entries read from the data changes log, and a global
+/// listing of the bucket.instance metadata. This allows us to trim active
+/// buckets quickly, while also ensuring that all buckets will eventually trim
+class BucketTrimManager : public BucketChangeObserver, public DoutPrefixProvider {
+  class Impl;
+  std::unique_ptr<Impl> impl;
+ public:
+  BucketTrimManager(sal::RadosStore *store, const BucketTrimConfig& config);
+  ~BucketTrimManager();
+
+  /// start the trim watch/notify server; call before using the factories below
+  int init();
+
+  /// increment a counter for the given bucket instance
+  void on_bucket_changed(const std::string_view& bucket_instance) override;
+
+  /// create a coroutine to run the bucket trim process every trim interval
+  RGWCoroutine* create_bucket_trim_cr(RGWHTTPManager *http);
+
+  /// create a coroutine to trim buckets directly via radosgw-admin
+  RGWCoroutine* create_admin_bucket_trim_cr(RGWHTTPManager *http);
+
+  /// DoutPrefixProvider interface (all three are overrides; marked as such
+  /// for consistency and compile-time signature checking)
+  CephContext *get_cct() const override;
+  unsigned get_subsys() const override;
+  std::ostream& gen_prefix(std::ostream& out) const override;
+};
+
+/// provides persistent storage for the trim manager's current position in the
+/// list of bucket instance metadata
+/// provides persistent storage for the trim manager's current position in the
+/// list of bucket instance metadata
+struct BucketTrimStatus {
+  std::string marker; //< metadata key of current bucket instance
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(marker, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode(bufferlist::const_iterator& p) {
+    DECODE_START(1, p);
+    decode(marker, p);
+    DECODE_FINISH(p);
+  }
+
+  // rados object name where the status is stored
+  static const std::string oid;
+};
+
+} // namespace rgw
+
+WRITE_CLASS_ENCODER(rgw::BucketTrimStatus);
+
+int bilog_trim(const DoutPrefixProvider* p, rgw::sal::RadosStore* store,
+ RGWBucketInfo& bucket_info, uint64_t gen, int shard_id,
+ std::string_view start_marker, std::string_view end_marker);
diff --git a/src/rgw/driver/rados/rgw_trim_datalog.cc b/src/rgw/driver/rados/rgw_trim_datalog.cc
new file mode 100644
index 000000000..72a160039
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_trim_datalog.cc
@@ -0,0 +1,252 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <vector>
+#include <string>
+
+#include "common/errno.h"
+
+#include "rgw_trim_datalog.h"
+#include "rgw_cr_rados.h"
+#include "rgw_cr_rest.h"
+#include "rgw_datalog.h"
+#include "rgw_data_sync.h"
+#include "rgw_zone.h"
+#include "rgw_bucket.h"
+
+#include "services/svc_zone.h"
+
+#include <boost/asio/yield.hpp>
+
+#define dout_subsys ceph_subsys_rgw
+
+#undef dout_prefix
+#define dout_prefix (*_dout << "data trim: ")
+
+namespace {
+
+/// Trim one datalog shard up to the given marker, tracking the last fully
+/// trimmed marker per shard.
+class DatalogTrimImplCR : public RGWSimpleCoroutine {
+  const DoutPrefixProvider *dpp;
+  rgw::sal::RadosStore* store;
+  boost::intrusive_ptr<RGWAioCompletionNotifier> cn;
+  int shard;
+  std::string marker;            //< trim up to (not past) this marker
+  std::string* last_trim_marker; //< updated once the shard is fully trimmed
+
+ public:
+  DatalogTrimImplCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, int shard,
+                    const std::string& marker, std::string* last_trim_marker)
+    : RGWSimpleCoroutine(store->ctx()), dpp(dpp), store(store), shard(shard),
+      marker(marker), last_trim_marker(last_trim_marker) {
+    set_description() << "Datalog trim shard=" << shard
+        << " marker=" << marker;
+  }
+
+  int send_request(const DoutPrefixProvider *dpp) override {
+    set_status() << "sending request";
+    cn = stack->create_completion_notifier();
+    return store->svc()->datalog_rados->trim_entries(dpp, shard, marker,
+                                                     cn->completion());
+  }
+  int request_complete() override {
+    int r = cn->completion()->get_return_value();
+    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << "(): trim of shard=" << shard
+        << " marker=" << marker << " returned r=" << r << dendl;
+
+    set_status() << "request complete; ret=" << r;
+    if (r != -ENODATA) {
+      return r;
+    }
+    // ENODATA: nothing left to trim, so remember this marker as fully trimmed
+    // (unless it's the sentinel max marker, which is never a real position)
+    if (*last_trim_marker < marker &&
+        marker != store->svc()->datalog_rados->max_marker()) {
+      *last_trim_marker = marker;
+    }
+    return 0;
+  }
+};
+
+/// return the marker that it's safe to trim up to
+const std::string& get_stable_marker(const rgw_data_sync_marker& m)
+{
+  // during a full sync, entries before next_step_marker are already covered
+  // by the full sync and are safe to trim; otherwise use the incremental
+  // sync position itself
+  if (m.state == m.FullSync) {
+    return m.next_step_marker;
+  }
+  return m.marker;
+}
+
+/// populate the container starting with 'dest' with the minimum stable marker
+/// of each shard for all of the peers in [first, last)
+template <typename IterIn, typename IterOut>
+void take_min_markers(IterIn first, IterIn last, IterOut dest)
+{
+  // walk every peer's per-shard markers in parallel with the output range,
+  // keeping the smallest stable marker seen for each shard (the output must
+  // be pre-filled with the maximum marker by the caller)
+  for (auto peer = first; peer != last; ++peer) {
+    auto out = dest;
+    for (const auto& shard_entry : peer->sync_markers) {
+      const auto& stable = get_stable_marker(shard_entry.second);
+      if (stable < *out) {
+        *out = stable;
+      }
+      ++out;
+    }
+  }
+}
+
+} // anonymous namespace
+
+/// One datalog trim pass: gather sync status from every peer zone, compute the
+/// minimum safe marker per shard, and trim each shard up to that marker.
+class DataLogTrimCR : public RGWCoroutine {
+  using TrimCR = DatalogTrimImplCR;
+  const DoutPrefixProvider *dpp;
+  rgw::sal::RadosStore* store;
+  RGWHTTPManager *http;
+  const int num_shards;
+  const std::string& zone_id; //< my zone id
+  std::vector<rgw_data_sync_status> peer_status; //< sync status for each peer
+  std::vector<std::string> min_shard_markers; //< min marker per shard
+  std::vector<std::string>& last_trim; //< last trimmed marker per shard
+  int ret{0};
+
+ public:
+  DataLogTrimCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http,
+                int num_shards, std::vector<std::string>& last_trim)
+    : RGWCoroutine(store->ctx()), dpp(dpp), store(store), http(http),
+      num_shards(num_shards),
+      zone_id(store->svc()->zone->get_zone().id),
+      peer_status(store->svc()->zone->get_zone_data_notify_to_map().size()),
+      // start each shard at the max marker; take_min_markers() lowers it
+      min_shard_markers(num_shards,
+                        std::string(store->svc()->datalog_rados->max_marker())),
+      last_trim(last_trim)
+  {}
+
+  int operate(const DoutPrefixProvider *dpp) override;
+};
+
+/// State machine for one trim pass (boost::asio stackless coroutine).
+int DataLogTrimCR::operate(const DoutPrefixProvider *dpp)
+{
+  reenter(this) {
+    ldpp_dout(dpp, 10) << "fetching sync status for zone " << zone_id << dendl;
+    set_status("fetching sync status");
+    yield {
+      // query data sync status from each sync peer
+      rgw_http_param_pair params[] = {
+        { "type", "data" },
+        { "status", nullptr },
+        { "source-zone", zone_id.c_str() },
+        { nullptr, nullptr }
+      };
+
+      auto p = peer_status.begin();
+      for (auto& c : store->svc()->zone->get_zone_data_notify_to_map()) {
+        ldpp_dout(dpp, 20) << "query sync status from " << c.first << dendl;
+        using StatusCR = RGWReadRESTResourceCR<rgw_data_sync_status>;
+        spawn(new StatusCR(cct, c.second, http, "/admin/log/", params, &*p),
+              false);
+        ++p;
+      }
+    }
+
+    // must get a successful reply from all peers to consider trimming
+    ret = 0;
+    while (ret == 0 && num_spawned() > 0) {
+      yield wait_for_child();
+      collect_next(&ret);
+    }
+    drain_all();
+
+    if (ret < 0) {
+      ldpp_dout(dpp, 4) << "failed to fetch sync status from all peers" << dendl;
+      return set_cr_error(ret);
+    }
+
+    ldpp_dout(dpp, 10) << "trimming log shards" << dendl;
+    set_status("trimming log shards");
+    yield {
+      // determine the minimum marker for each shard
+      take_min_markers(peer_status.begin(), peer_status.end(),
+                       min_shard_markers.begin());
+
+      for (int i = 0; i < num_shards; i++) {
+        const auto& m = min_shard_markers[i];
+        // skip shards that are already trimmed up to the safe marker
+        if (m <= last_trim[i]) {
+          continue;
+        }
+        ldpp_dout(dpp, 10) << "trimming log shard " << i
+            << " at marker=" << m
+            << " last_trim=" << last_trim[i] << dendl;
+        spawn(new TrimCR(dpp, store, i, m, &last_trim[i]),
+              true);
+      }
+    }
+    return set_cr_done();
+  }
+  return 0;
+}
+
+/// factory for a single (non-polling) datalog trim pass, used by radosgw-admin
+RGWCoroutine* create_admin_data_log_trim_cr(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store,
+                                            RGWHTTPManager *http,
+                                            int num_shards,
+                                            std::vector<std::string>& markers)
+{
+  return new DataLogTrimCR(dpp, store, http, num_shards, markers);
+}
+
+/// Periodically runs DataLogTrimCR under a rados lock so only one gateway
+/// trims the datalog at a time.
+class DataLogTrimPollCR : public RGWCoroutine {
+  const DoutPrefixProvider *dpp;
+  rgw::sal::RadosStore* store;
+  RGWHTTPManager *http;
+  const int num_shards;
+  const utime_t interval; //< polling interval
+  const std::string lock_oid; //< use first data log shard for lock
+  const std::string lock_cookie;
+  std::vector<std::string> last_trim; //< last trimmed marker per shard
+
+ public:
+  DataLogTrimPollCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http,
+                    int num_shards, utime_t interval)
+    : RGWCoroutine(store->ctx()), dpp(dpp), store(store), http(http),
+      num_shards(num_shards), interval(interval),
+      lock_oid(store->svc()->datalog_rados->get_oid(0, 0)),
+      lock_cookie(RGWSimpleRadosLockCR::gen_random_cookie(cct)),
+      last_trim(num_shards)
+  {}
+
+  int operate(const DoutPrefixProvider *dpp) override;
+};
+
+/// Endless loop: sleep for the interval, take the 'data_trim' lock for the
+/// full interval, run one trim pass. The lock is intentionally never released
+/// so peers skip this interval instead of duplicating the trim.
+int DataLogTrimPollCR::operate(const DoutPrefixProvider *dpp)
+{
+  reenter(this) {
+    for (;;) {
+      set_status("sleeping");
+      wait(interval);
+
+      // request a 'data_trim' lock that covers the entire wait interval to
+      // prevent other gateways from attempting to trim for the duration
+      set_status("acquiring trim lock");
+      yield call(new RGWSimpleRadosLockCR(store->svc()->rados->get_async_processor(), store,
+                                          rgw_raw_obj(store->svc()->zone->get_zone_params().log_pool, lock_oid),
+                                          "data_trim", lock_cookie,
+                                          interval.sec()));
+      if (retcode < 0) {
+        // if the lock is already held, go back to sleep and try again later
+        ldpp_dout(dpp, 4) << "failed to lock " << lock_oid << ", trying again in "
+            << interval.sec() << "s" << dendl;
+        continue;
+      }
+
+      set_status("trimming");
+      yield call(new DataLogTrimCR(dpp, store, http, num_shards, last_trim));
+
+      // note that the lock is not released. this is intentional, as it avoids
+      // duplicating this work in other gateways
+    }
+  }
+  return 0;
+}
+
+/// factory for the periodic datalog trim coroutine
+RGWCoroutine* create_data_log_trim_cr(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store,
+                                      RGWHTTPManager *http,
+                                      int num_shards, utime_t interval)
+{
+  return new DataLogTrimPollCR(dpp, store, http, num_shards, interval);
+}
diff --git a/src/rgw/driver/rados/rgw_trim_datalog.h b/src/rgw/driver/rados/rgw_trim_datalog.h
new file mode 100644
index 000000000..9f5bf7252
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_trim_datalog.h
@@ -0,0 +1,28 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "common/dout.h"
+
+class RGWCoroutine;
+class RGWRados;
+class RGWHTTPManager;
+class utime_t;
+namespace rgw { namespace sal {
+ class RadosStore;
+} }
+
+// DataLogTrimCR factory function
+extern RGWCoroutine* create_data_log_trim_cr(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store,
+ RGWHTTPManager *http,
+ int num_shards, utime_t interval);
+
+// factory function for datalog trim via radosgw-admin
+RGWCoroutine* create_admin_data_log_trim_cr(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store,
+ RGWHTTPManager *http,
+ int num_shards,
+ std::vector<std::string>& markers);
diff --git a/src/rgw/driver/rados/rgw_trim_mdlog.cc b/src/rgw/driver/rados/rgw_trim_mdlog.cc
new file mode 100644
index 000000000..d8e19594a
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_trim_mdlog.cc
@@ -0,0 +1,795 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "common/errno.h"
+
+#include "rgw_trim_mdlog.h"
+#include "rgw_sync.h"
+#include "rgw_cr_rados.h"
+#include "rgw_cr_rest.h"
+#include "rgw_zone.h"
+#include "services/svc_zone.h"
+#include "services/svc_meta.h"
+#include "services/svc_mdlog.h"
+#include "services/svc_cls.h"
+
+#include <boost/asio/yield.hpp>
+
+#define dout_subsys ceph_subsys_rgw
+
+#undef dout_prefix
+#define dout_prefix (*_dout << "meta trim: ")
+
+/// purge all log shards for the given mdlog
+class PurgeLogShardsCR : public RGWShardCollectCR {
+ rgw::sal::RadosStore* const store;
+ const RGWMetadataLog* mdlog;
+ const int num_shards;
+ rgw_raw_obj obj;
+ int i{0};
+
+ static constexpr int max_concurrent = 16;
+
+ int handle_result(int r) override {
+ if (r == -ENOENT) { // ENOENT is not a fatal error
+ return 0;
+ }
+ if (r < 0) {
+ ldout(cct, 4) << "failed to remove mdlog shard: " << cpp_strerror(r) << dendl;
+ }
+ return r;
+ }
+ public:
+ PurgeLogShardsCR(rgw::sal::RadosStore* store, const RGWMetadataLog* mdlog,
+ const rgw_pool& pool, int num_shards)
+ : RGWShardCollectCR(store->ctx(), max_concurrent),
+ store(store), mdlog(mdlog), num_shards(num_shards), obj(pool, "")
+ {}
+
+ bool spawn_next() override {
+ if (i == num_shards) {
+ return false;
+ }
+ mdlog->get_shard_oid(i++, obj.oid);
+ spawn(new RGWRadosRemoveCR(store, obj), false);
+ return true;
+ }
+};
+
+using Cursor = RGWPeriodHistory::Cursor;
+
+/// purge mdlogs from the oldest up to (but not including) the given realm_epoch
+class PurgePeriodLogsCR : public RGWCoroutine {
+ struct Svc {
+ RGWSI_Zone *zone;
+ RGWSI_MDLog *mdlog;
+ } svc;
+ const DoutPrefixProvider *dpp;
+ rgw::sal::RadosStore* const store;
+ RGWMetadataManager *const metadata;
+ RGWObjVersionTracker objv;
+ Cursor cursor;
+ epoch_t realm_epoch;
+ epoch_t *last_trim_epoch; //< update last trim on success
+
+ public:
+ PurgePeriodLogsCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, epoch_t realm_epoch, epoch_t *last_trim)
+ : RGWCoroutine(store->ctx()), dpp(dpp), store(store), metadata(store->ctl()->meta.mgr),
+ realm_epoch(realm_epoch), last_trim_epoch(last_trim) {
+ svc.zone = store->svc()->zone;
+ svc.mdlog = store->svc()->mdlog;
+ }
+
+ int operate(const DoutPrefixProvider *dpp) override;
+};
+
+int PurgePeriodLogsCR::operate(const DoutPrefixProvider *dpp)
+{
+ reenter(this) {
+ // read our current oldest log period
+ yield call(svc.mdlog->read_oldest_log_period_cr(dpp, &cursor, &objv));
+ if (retcode < 0) {
+ return set_cr_error(retcode);
+ }
+ ceph_assert(cursor);
+ ldpp_dout(dpp, 20) << "oldest log realm_epoch=" << cursor.get_epoch()
+ << " period=" << cursor.get_period().get_id() << dendl;
+
+ // trim -up to- the given realm_epoch
+ while (cursor.get_epoch() < realm_epoch) {
+ ldpp_dout(dpp, 4) << "purging log shards for realm_epoch=" << cursor.get_epoch()
+ << " period=" << cursor.get_period().get_id() << dendl;
+ yield {
+ const auto mdlog = svc.mdlog->get_log(cursor.get_period().get_id());
+ const auto& pool = svc.zone->get_zone_params().log_pool;
+ auto num_shards = cct->_conf->rgw_md_log_max_shards;
+ call(new PurgeLogShardsCR(store, mdlog, pool, num_shards));
+ }
+ if (retcode < 0) {
+ ldpp_dout(dpp, 1) << "failed to remove log shards: "
+ << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+ ldpp_dout(dpp, 10) << "removed log shards for realm_epoch=" << cursor.get_epoch()
+ << " period=" << cursor.get_period().get_id() << dendl;
+
+ // update our mdlog history
+ yield call(svc.mdlog->trim_log_period_cr(dpp, cursor, &objv));
+ if (retcode == -ENOENT) {
+ // must have raced to update mdlog history. return success and allow the
+ // winner to continue purging
+ ldpp_dout(dpp, 10) << "already removed log shards for realm_epoch=" << cursor.get_epoch()
+ << " period=" << cursor.get_period().get_id() << dendl;
+ return set_cr_done();
+ } else if (retcode < 0) {
+ ldpp_dout(dpp, 1) << "failed to remove log shards for realm_epoch="
+ << cursor.get_epoch() << " period=" << cursor.get_period().get_id()
+ << " with: " << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+
+ if (*last_trim_epoch < cursor.get_epoch()) {
+ *last_trim_epoch = cursor.get_epoch();
+ }
+
+ ceph_assert(cursor.has_next()); // get_current() should always come after
+ cursor.next();
+ }
+ return set_cr_done();
+ }
+ return 0;
+}
+
+namespace {
+
+using connection_map = std::map<std::string, std::unique_ptr<RGWRESTConn>>;
+
+/// construct a RGWRESTConn for each zone in the realm
+template <typename Zonegroups>
+connection_map make_peer_connections(rgw::sal::RadosStore* store,
+ const Zonegroups& zonegroups)
+{
+ connection_map connections;
+ for (auto& g : zonegroups) {
+ for (auto& z : g.second.zones) {
+ std::unique_ptr<RGWRESTConn> conn{
+ new RGWRESTConn(store->ctx(), store, z.first.id, z.second.endpoints, g.second.api_name)};
+ connections.emplace(z.first.id, std::move(conn));
+ }
+ }
+ return connections;
+}
+
+/// return the marker that it's safe to trim up to
+const std::string& get_stable_marker(const rgw_meta_sync_marker& m)
+{
+ return m.state == m.FullSync ? m.next_step_marker : m.marker;
+}
+
+/// comparison operator for take_min_status()
+bool operator<(const rgw_meta_sync_marker& lhs, const rgw_meta_sync_marker& rhs)
+{
+ // sort by stable marker
+ return get_stable_marker(lhs) < get_stable_marker(rhs);
+}
+
+/// populate the status with the minimum stable marker of each shard for any
+/// peer whose realm_epoch matches the minimum realm_epoch in the input
+template <typename Iter>
+int take_min_status(CephContext *cct, Iter first, Iter last,
+ rgw_meta_sync_status *status)
+{
+ if (first == last) {
+ return -EINVAL;
+ }
+ const size_t num_shards = cct->_conf->rgw_md_log_max_shards;
+
+ status->sync_info.realm_epoch = std::numeric_limits<epoch_t>::max();
+ for (auto p = first; p != last; ++p) {
+ // validate peer's shard count
+ if (p->sync_markers.size() != num_shards) {
+ ldout(cct, 1) << "take_min_status got peer status with "
+ << p->sync_markers.size() << " shards, expected "
+ << num_shards << dendl;
+ return -EINVAL;
+ }
+ if (p->sync_info.realm_epoch < status->sync_info.realm_epoch) {
+ // earlier epoch, take its entire status
+ *status = std::move(*p);
+ } else if (p->sync_info.realm_epoch == status->sync_info.realm_epoch) {
+ // same epoch, take any earlier markers
+ auto m = status->sync_markers.begin();
+ for (auto& shard : p->sync_markers) {
+ if (shard.second < m->second) {
+ m->second = std::move(shard.second);
+ }
+ ++m;
+ }
+ }
+ }
+ return 0;
+}
+
+struct TrimEnv {
+ const DoutPrefixProvider *dpp;
+ rgw::sal::RadosStore* const store;
+ RGWHTTPManager *const http;
+ int num_shards;
+ const rgw_zone_id& zone;
+ Cursor current; //< cursor to current period
+ epoch_t last_trim_epoch{0}; //< epoch of last mdlog that was purged
+
+ TrimEnv(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http, int num_shards)
+ : dpp(dpp), store(store), http(http), num_shards(num_shards),
+ zone(store->svc()->zone->zone_id()),
+ current(store->svc()->mdlog->get_period_history()->get_current())
+ {}
+};
+
+struct MasterTrimEnv : public TrimEnv {
+ connection_map connections; //< peer connections
+ std::vector<rgw_meta_sync_status> peer_status; //< sync status for each peer
+ /// last trim marker for each shard, only applies to current period's mdlog
+ std::vector<std::string> last_trim_markers;
+
+ MasterTrimEnv(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http, int num_shards)
+ : TrimEnv(dpp, store, http, num_shards),
+ last_trim_markers(num_shards)
+ {
+ auto& period = current.get_period();
+ connections = make_peer_connections(store, period.get_map().zonegroups);
+ connections.erase(zone.id);
+ peer_status.resize(connections.size());
+ }
+};
+
+struct PeerTrimEnv : public TrimEnv {
+ /// last trim timestamp for each shard, only applies to current period's mdlog
+ std::vector<ceph::real_time> last_trim_timestamps;
+
+ PeerTrimEnv(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http, int num_shards)
+ : TrimEnv(dpp, store, http, num_shards),
+ last_trim_timestamps(num_shards)
+ {}
+
+ void set_num_shards(int num_shards) {
+ this->num_shards = num_shards;
+ last_trim_timestamps.resize(num_shards);
+ }
+};
+
+} // anonymous namespace
+
+
+/// spawn a trim cr for each shard that needs it, while limiting the number
+/// of concurrent shards
+class MetaMasterTrimShardCollectCR : public RGWShardCollectCR {
+ private:
+ static constexpr int MAX_CONCURRENT_SHARDS = 16;
+
+ MasterTrimEnv& env;
+ RGWMetadataLog *mdlog;
+ int shard_id{0};
+ std::string oid;
+ const rgw_meta_sync_status& sync_status;
+
+ int handle_result(int r) override {
+ if (r == -ENOENT) { // ENOENT is not a fatal error
+ return 0;
+ }
+ if (r < 0) {
+ ldout(cct, 4) << "failed to trim mdlog shard: " << cpp_strerror(r) << dendl;
+ }
+ return r;
+ }
+ public:
+ MetaMasterTrimShardCollectCR(MasterTrimEnv& env, RGWMetadataLog *mdlog,
+ const rgw_meta_sync_status& sync_status)
+ : RGWShardCollectCR(env.store->ctx(), MAX_CONCURRENT_SHARDS),
+ env(env), mdlog(mdlog), sync_status(sync_status)
+ {}
+
+ bool spawn_next() override;
+};
+
+bool MetaMasterTrimShardCollectCR::spawn_next()
+{
+ while (shard_id < env.num_shards) {
+ auto m = sync_status.sync_markers.find(shard_id);
+ if (m == sync_status.sync_markers.end()) {
+ shard_id++;
+ continue;
+ }
+ auto& stable = get_stable_marker(m->second);
+ auto& last_trim = env.last_trim_markers[shard_id];
+
+ if (stable <= last_trim) {
+ // already trimmed
+ ldpp_dout(env.dpp, 20) << "skipping log shard " << shard_id
+ << " at marker=" << stable
+ << " last_trim=" << last_trim
+ << " realm_epoch=" << sync_status.sync_info.realm_epoch << dendl;
+ shard_id++;
+ continue;
+ }
+
+ mdlog->get_shard_oid(shard_id, oid);
+
+ ldpp_dout(env.dpp, 10) << "trimming log shard " << shard_id
+ << " at marker=" << stable
+ << " last_trim=" << last_trim
+ << " realm_epoch=" << sync_status.sync_info.realm_epoch << dendl;
+ spawn(new RGWSyncLogTrimCR(env.dpp, env.store, oid, stable, &last_trim), false);
+ shard_id++;
+ return true;
+ }
+ return false;
+}
+
+/// spawn rest requests to read each peer's sync status
+class MetaMasterStatusCollectCR : public RGWShardCollectCR {
+ static constexpr int MAX_CONCURRENT_SHARDS = 16;
+
+ MasterTrimEnv& env;
+ connection_map::iterator c;
+ std::vector<rgw_meta_sync_status>::iterator s;
+
+ int handle_result(int r) override {
+ if (r == -ENOENT) { // ENOENT is not a fatal error
+ return 0;
+ }
+ if (r < 0) {
+ ldout(cct, 4) << "failed to fetch metadata sync status: "
+ << cpp_strerror(r) << dendl;
+ }
+ return r;
+ }
+ public:
+ explicit MetaMasterStatusCollectCR(MasterTrimEnv& env)
+ : RGWShardCollectCR(env.store->ctx(), MAX_CONCURRENT_SHARDS),
+ env(env), c(env.connections.begin()), s(env.peer_status.begin())
+ {}
+
+ bool spawn_next() override {
+ if (c == env.connections.end()) {
+ return false;
+ }
+ static rgw_http_param_pair params[] = {
+ { "type", "metadata" },
+ { "status", nullptr },
+ { nullptr, nullptr }
+ };
+
+ ldout(cct, 20) << "query sync status from " << c->first << dendl;
+ auto conn = c->second.get();
+ using StatusCR = RGWReadRESTResourceCR<rgw_meta_sync_status>;
+ spawn(new StatusCR(cct, conn, env.http, "/admin/log/", params, &*s),
+ false);
+ ++c;
+ ++s;
+ return true;
+ }
+};
+
+class MetaMasterTrimCR : public RGWCoroutine {
+ MasterTrimEnv& env;
+ rgw_meta_sync_status min_status; //< minimum sync status of all peers
+ int ret{0};
+
+ public:
+ explicit MetaMasterTrimCR(MasterTrimEnv& env)
+ : RGWCoroutine(env.store->ctx()), env(env)
+ {}
+
+ int operate(const DoutPrefixProvider *dpp) override;
+};
+
+int MetaMasterTrimCR::operate(const DoutPrefixProvider *dpp)
+{
+ reenter(this) {
+ // TODO: detect this and fail before we spawn the trim thread?
+ if (env.connections.empty()) {
+ ldpp_dout(dpp, 4) << "no peers, exiting" << dendl;
+ return set_cr_done();
+ }
+
+ ldpp_dout(dpp, 10) << "fetching sync status for zone " << env.zone << dendl;
+ // query mdlog sync status from peers
+ yield call(new MetaMasterStatusCollectCR(env));
+
+ // must get a successful reply from all peers to consider trimming
+ if (ret < 0) {
+ ldpp_dout(dpp, 4) << "failed to fetch sync status from all peers" << dendl;
+ return set_cr_error(ret);
+ }
+
+ // determine the minimum epoch and markers
+ ret = take_min_status(env.store->ctx(), env.peer_status.begin(),
+ env.peer_status.end(), &min_status);
+ if (ret < 0) {
+ ldpp_dout(dpp, 4) << "failed to calculate min sync status from peers" << dendl;
+ return set_cr_error(ret);
+ }
+ yield {
+ auto store = env.store;
+ auto epoch = min_status.sync_info.realm_epoch;
+ ldpp_dout(dpp, 4) << "realm epoch min=" << epoch
+ << " current=" << env.current.get_epoch()<< dendl;
+ if (epoch > env.last_trim_epoch + 1) {
+ // delete any prior mdlog periods
+ spawn(new PurgePeriodLogsCR(dpp, store, epoch, &env.last_trim_epoch), true);
+ } else {
+ ldpp_dout(dpp, 10) << "mdlogs already purged up to realm_epoch "
+ << env.last_trim_epoch << dendl;
+ }
+
+ // if realm_epoch == current, trim mdlog based on markers
+ if (epoch == env.current.get_epoch()) {
+ auto mdlog = store->svc()->mdlog->get_log(env.current.get_period().get_id());
+ spawn(new MetaMasterTrimShardCollectCR(env, mdlog, min_status), true);
+ }
+ }
+ // ignore any errors during purge/trim because we want to hold the lock open
+ return set_cr_done();
+ }
+ return 0;
+}
+
+
+/// read the first entry of the master's mdlog shard and trim to that position
+class MetaPeerTrimShardCR : public RGWCoroutine {
+ RGWMetaSyncEnv& env;
+ RGWMetadataLog *mdlog;
+ const std::string& period_id;
+ const int shard_id;
+ RGWMetadataLogInfo info;
+ ceph::real_time stable; //< safe timestamp to trim, according to master
+ ceph::real_time *last_trim; //< last trimmed timestamp, updated on trim
+ rgw_mdlog_shard_data result; //< result from master's mdlog listing
+
+ public:
+ MetaPeerTrimShardCR(RGWMetaSyncEnv& env, RGWMetadataLog *mdlog,
+ const std::string& period_id, int shard_id,
+ ceph::real_time *last_trim)
+ : RGWCoroutine(env.store->ctx()), env(env), mdlog(mdlog),
+ period_id(period_id), shard_id(shard_id), last_trim(last_trim)
+ {}
+
+ int operate(const DoutPrefixProvider *dpp) override;
+};
+
+int MetaPeerTrimShardCR::operate(const DoutPrefixProvider *dpp)
+{
+ reenter(this) {
+ // query master's first mdlog entry for this shard
+ yield call(create_list_remote_mdlog_shard_cr(&env, period_id, shard_id,
+ "", 1, &result));
+ if (retcode < 0) {
+ ldpp_dout(dpp, 5) << "failed to read first entry from master's mdlog shard "
+ << shard_id << " for period " << period_id
+ << ": " << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+ if (result.entries.empty()) {
+ // if there are no mdlog entries, we don't have a timestamp to compare. we
+ // can't just trim everything, because there could be racing updates since
+ // this empty reply. query the mdlog shard info to read its max timestamp,
+ // then retry the listing to make sure it's still empty before trimming to
+ // that
+ ldpp_dout(dpp, 10) << "empty master mdlog shard " << shard_id
+ << ", reading last timestamp from shard info" << dendl;
+ // read the mdlog shard info for the last timestamp
+ yield call(create_read_remote_mdlog_shard_info_cr(&env, period_id, shard_id, &info));
+ if (retcode < 0) {
+ ldpp_dout(dpp, 5) << "failed to read info from master's mdlog shard "
+ << shard_id << " for period " << period_id
+ << ": " << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+ if (ceph::real_clock::is_zero(info.last_update)) {
+ return set_cr_done(); // nothing to trim
+ }
+ ldpp_dout(dpp, 10) << "got mdlog shard info with last update="
+ << info.last_update << dendl;
+ // re-read the master's first mdlog entry to make sure it hasn't changed
+ yield call(create_list_remote_mdlog_shard_cr(&env, period_id, shard_id,
+ "", 1, &result));
+ if (retcode < 0) {
+ ldpp_dout(dpp, 5) << "failed to read first entry from master's mdlog shard "
+ << shard_id << " for period " << period_id
+ << ": " << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+ // if the mdlog is still empty, trim to max marker
+ if (result.entries.empty()) {
+ stable = info.last_update;
+ } else {
+ stable = result.entries.front().timestamp;
+
+ // can only trim -up to- master's first timestamp, so subtract a second.
+ // (this is why we use timestamps instead of markers for the peers)
+ stable -= std::chrono::seconds(1);
+ }
+ } else {
+ stable = result.entries.front().timestamp;
+ stable -= std::chrono::seconds(1);
+ }
+
+ if (stable <= *last_trim) {
+ ldpp_dout(dpp, 10) << "skipping log shard " << shard_id
+ << " at timestamp=" << stable
+ << " last_trim=" << *last_trim << dendl;
+ return set_cr_done();
+ }
+
+ ldpp_dout(dpp, 10) << "trimming log shard " << shard_id
+ << " at timestamp=" << stable
+ << " last_trim=" << *last_trim << dendl;
+ yield {
+ std::string oid;
+ mdlog->get_shard_oid(shard_id, oid);
+ call(new RGWRadosTimelogTrimCR(dpp, env.store, oid, real_time{}, stable, "", ""));
+ }
+ if (retcode < 0 && retcode != -ENODATA) {
+ ldpp_dout(dpp, 1) << "failed to trim mdlog shard " << shard_id
+ << ": " << cpp_strerror(retcode) << dendl;
+ return set_cr_error(retcode);
+ }
+ *last_trim = stable;
+ return set_cr_done();
+ }
+ return 0;
+}
+
+class MetaPeerTrimShardCollectCR : public RGWShardCollectCR {
+ static constexpr int MAX_CONCURRENT_SHARDS = 16;
+
+ PeerTrimEnv& env;
+ RGWMetadataLog *mdlog;
+ const std::string& period_id;
+ RGWMetaSyncEnv meta_env; //< for RGWListRemoteMDLogShardCR
+ int shard_id{0};
+
+ int handle_result(int r) override {
+ if (r == -ENOENT) { // ENOENT is not a fatal error
+ return 0;
+ }
+ if (r < 0) {
+ ldout(cct, 4) << "failed to trim mdlog shard: " << cpp_strerror(r) << dendl;
+ }
+ return r;
+ }
+ public:
+ MetaPeerTrimShardCollectCR(PeerTrimEnv& env, RGWMetadataLog *mdlog)
+ : RGWShardCollectCR(env.store->ctx(), MAX_CONCURRENT_SHARDS),
+ env(env), mdlog(mdlog), period_id(env.current.get_period().get_id())
+ {
+ meta_env.init(env.dpp, cct, env.store, env.store->svc()->zone->get_master_conn(),
+ env.store->svc()->rados->get_async_processor(), env.http, nullptr,
+ env.store->getRados()->get_sync_tracer());
+ }
+
+ bool spawn_next() override;
+};
+
+bool MetaPeerTrimShardCollectCR::spawn_next()
+{
+ if (shard_id >= env.num_shards) {
+ return false;
+ }
+ auto& last_trim = env.last_trim_timestamps[shard_id];
+ spawn(new MetaPeerTrimShardCR(meta_env, mdlog, period_id, shard_id, &last_trim),
+ false);
+ shard_id++;
+ return true;
+}
+
+class MetaPeerTrimCR : public RGWCoroutine {
+ PeerTrimEnv& env;
+ rgw_mdlog_info mdlog_info; //< master's mdlog info
+
+ public:
+ explicit MetaPeerTrimCR(PeerTrimEnv& env) : RGWCoroutine(env.store->ctx()), env(env) {}
+
+ int operate(const DoutPrefixProvider *dpp) override;
+};
+
+int MetaPeerTrimCR::operate(const DoutPrefixProvider *dpp)
+{
+ reenter(this) {
+ ldpp_dout(dpp, 10) << "fetching master mdlog info" << dendl;
+ yield {
+ // query mdlog_info from master for oldest_log_period
+ rgw_http_param_pair params[] = {
+ { "type", "metadata" },
+ { nullptr, nullptr }
+ };
+
+ using LogInfoCR = RGWReadRESTResourceCR<rgw_mdlog_info>;
+ call(new LogInfoCR(cct, env.store->svc()->zone->get_master_conn(), env.http,
+ "/admin/log/", params, &mdlog_info));
+ }
+ if (retcode < 0) {
+ ldpp_dout(dpp, 4) << "failed to read mdlog info from master" << dendl;
+ return set_cr_error(retcode);
+ }
+ // use master's shard count instead
+ env.set_num_shards(mdlog_info.num_shards);
+
+ if (mdlog_info.realm_epoch > env.last_trim_epoch + 1) {
+ // delete any prior mdlog periods
+ yield call(new PurgePeriodLogsCR(dpp, env.store, mdlog_info.realm_epoch,
+ &env.last_trim_epoch));
+ } else {
+ ldpp_dout(dpp, 10) << "mdlogs already purged through realm_epoch "
+ << env.last_trim_epoch << dendl;
+ }
+
+ // if realm_epoch == current, trim mdlog based on master's markers
+ if (mdlog_info.realm_epoch == env.current.get_epoch()) {
+ yield {
+ auto mdlog = env.store->svc()->mdlog->get_log(env.current.get_period().get_id());
+ call(new MetaPeerTrimShardCollectCR(env, mdlog));
+ // ignore any errors during purge/trim because we want to hold the lock open
+ }
+ }
+ return set_cr_done();
+ }
+ return 0;
+}
+
+class MetaTrimPollCR : public RGWCoroutine {
+ rgw::sal::RadosStore* const store;
+ const utime_t interval; //< polling interval
+ const rgw_raw_obj obj;
+ const std::string name{"meta_trim"}; //< lock name
+ const std::string cookie;
+
+ protected:
+ /// allocate the coroutine to run within the lease
+ virtual RGWCoroutine* alloc_cr() = 0;
+
+ public:
+ MetaTrimPollCR(rgw::sal::RadosStore* store, utime_t interval)
+ : RGWCoroutine(store->ctx()), store(store), interval(interval),
+ obj(store->svc()->zone->get_zone_params().log_pool, RGWMetadataLogHistory::oid),
+ cookie(RGWSimpleRadosLockCR::gen_random_cookie(cct))
+ {}
+
+ int operate(const DoutPrefixProvider *dpp) override;
+};
+
+int MetaTrimPollCR::operate(const DoutPrefixProvider *dpp)
+{
+ reenter(this) {
+ for (;;) {
+ set_status("sleeping");
+ wait(interval);
+
+ // prevent others from trimming for our entire wait interval
+ set_status("acquiring trim lock");
+ yield call(new RGWSimpleRadosLockCR(store->svc()->rados->get_async_processor(), store,
+ obj, name, cookie, interval.sec()));
+ if (retcode < 0) {
+ ldout(cct, 4) << "failed to lock: " << cpp_strerror(retcode) << dendl;
+ continue;
+ }
+
+ set_status("trimming");
+ yield call(alloc_cr());
+
+ if (retcode < 0) {
+ // on errors, unlock so other gateways can try
+ set_status("unlocking");
+ yield call(new RGWSimpleRadosUnlockCR(store->svc()->rados->get_async_processor(), store,
+ obj, name, cookie));
+ }
+ }
+ }
+ return 0;
+}
+
+class MetaMasterTrimPollCR : public MetaTrimPollCR {
+ MasterTrimEnv env; //< trim state to share between calls
+ RGWCoroutine* alloc_cr() override {
+ return new MetaMasterTrimCR(env);
+ }
+ public:
+ MetaMasterTrimPollCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http,
+ int num_shards, utime_t interval)
+ : MetaTrimPollCR(store, interval),
+ env(dpp, store, http, num_shards)
+ {}
+};
+
+class MetaPeerTrimPollCR : public MetaTrimPollCR {
+ PeerTrimEnv env; //< trim state to share between calls
+ RGWCoroutine* alloc_cr() override {
+ return new MetaPeerTrimCR(env);
+ }
+ public:
+ MetaPeerTrimPollCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http,
+ int num_shards, utime_t interval)
+ : MetaTrimPollCR(store, interval),
+ env(dpp, store, http, num_shards)
+ {}
+};
+
+namespace {
+bool sanity_check_endpoints(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store) {
+ bool retval = true;
+ auto current = store->svc()->mdlog->get_period_history()->get_current();
+ const auto& period = current.get_period();
+ for (const auto& [_, zonegroup] : period.get_map().zonegroups) {
+ if (zonegroup.endpoints.empty()) {
+ ldpp_dout(dpp, -1)
+ << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " WARNING: Cluster is misconfigured! "
+ << " Zonegroup " << zonegroup.get_name()
+ << " (" << zonegroup.get_id() << ") in Realm "
+ << period.get_realm_name() << " ( " << period.get_realm() << ") "
+ << " has no endpoints!" << dendl;
+ }
+ for (const auto& [_, zone] : zonegroup.zones) {
+ if (zone.endpoints.empty()) {
+ ldpp_dout(dpp, -1)
+ << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " ERROR: Cluster is misconfigured! "
+ << " Zone " << zone.name << " (" << zone.id << ") in Zonegroup "
+ << zonegroup.get_name() << " ( " << zonegroup.get_id()
+ << ") in Realm " << period.get_realm_name()
+ << " ( " << period.get_realm() << ") "
+ << " has no endpoints! Trimming is impossible." << dendl;
+ retval = false;
+ }
+ }
+ }
+ return retval;
+}
+}
+
+RGWCoroutine* create_meta_log_trim_cr(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http,
+ int num_shards, utime_t interval)
+{
+ if (!sanity_check_endpoints(dpp, store)) {
+ ldpp_dout(dpp, -1)
+ << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " ERROR: Cluster is misconfigured! Refusing to trim." << dendl;
+ return nullptr;
+ }
+ if (store->svc()->zone->is_meta_master()) {
+ return new MetaMasterTrimPollCR(dpp, store, http, num_shards, interval);
+ }
+ return new MetaPeerTrimPollCR(dpp, store, http, num_shards, interval);
+}
+
+
+struct MetaMasterAdminTrimCR : private MasterTrimEnv, public MetaMasterTrimCR {
+ MetaMasterAdminTrimCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http, int num_shards)
+ : MasterTrimEnv(dpp, store, http, num_shards),
+ MetaMasterTrimCR(*static_cast<MasterTrimEnv*>(this))
+ {}
+};
+
+struct MetaPeerAdminTrimCR : private PeerTrimEnv, public MetaPeerTrimCR {
+ MetaPeerAdminTrimCR(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store, RGWHTTPManager *http, int num_shards)
+ : PeerTrimEnv(dpp, store, http, num_shards),
+ MetaPeerTrimCR(*static_cast<PeerTrimEnv*>(this))
+ {}
+};
+
+RGWCoroutine* create_admin_meta_log_trim_cr(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* store,
+ RGWHTTPManager *http,
+ int num_shards)
+{
+ if (!sanity_check_endpoints(dpp, store)) {
+ ldpp_dout(dpp, -1)
+ << __PRETTY_FUNCTION__ << ":" << __LINE__
+ << " ERROR: Cluster is is misconfigured! Refusing to trim." << dendl;
+ return nullptr;
+ }
+ if (store->svc()->zone->is_meta_master()) {
+ return new MetaMasterAdminTrimCR(dpp, store, http, num_shards);
+ }
+ return new MetaPeerAdminTrimCR(dpp, store, http, num_shards);
+}
diff --git a/src/rgw/driver/rados/rgw_trim_mdlog.h b/src/rgw/driver/rados/rgw_trim_mdlog.h
new file mode 100644
index 000000000..1dba8612b
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_trim_mdlog.h
@@ -0,0 +1,25 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+class RGWCoroutine;
+class DoutPrefixProvider;
+class RGWRados;
+class RGWHTTPManager;
+class utime_t;
+namespace rgw { namespace sal {
+ class RadosStore;
+} }
+
+// MetaLogTrimCR factory function
+RGWCoroutine* create_meta_log_trim_cr(const DoutPrefixProvider *dpp,
+ rgw::sal::RadosStore* store,
+ RGWHTTPManager *http,
+ int num_shards, utime_t interval);
+
+// factory function for mdlog trim via radosgw-admin
+RGWCoroutine* create_admin_meta_log_trim_cr(const DoutPrefixProvider *dpp,
+ rgw::sal::RadosStore* store,
+ RGWHTTPManager *http,
+ int num_shards);
diff --git a/src/rgw/driver/rados/rgw_user.cc b/src/rgw/driver/rados/rgw_user.cc
new file mode 100644
index 000000000..51b38c082
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_user.cc
@@ -0,0 +1,2776 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "common/errno.h"
+
+#include "rgw_user.h"
+
+#include "rgw_bucket.h"
+#include "rgw_quota.h"
+
+#include "services/svc_user.h"
+#include "services/svc_meta.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+extern void op_type_to_str(uint32_t mask, char *buf, int len);
+
+static string key_type_to_str(int key_type) {
+ switch (key_type) {
+ case KEY_TYPE_SWIFT:
+ return "swift";
+ break;
+
+ default:
+ return "s3";
+ break;
+ }
+}
+
// Return true if 'c' is an RFC 3986 "unreserved" URL character:
// ALPHA / DIGIT / "-" / "." / "_" / "~".
static bool char_is_unreserved_url(char c)
{
  // cast to unsigned char: passing a negative (high-bit) char to
  // isalnum() is undefined behavior
  if (isalnum(static_cast<unsigned char>(c)))
    return true;

  switch (c) {
  case '-':
  case '.':
  case '_':
  case '~':
    return true;
  default:
    return false;
  }
}
+
+static bool validate_access_key(string& key)
+{
+ const char *p = key.c_str();
+ while (*p) {
+ if (!char_is_unreserved_url(*p))
+ return false;
+ p++;
+ }
+ return true;
+}
+
// Copy 'msg' into '*sink' when a sink was provided and the message is
// non-empty; otherwise leave the sink untouched.
static void set_err_msg(std::string *sink, std::string msg)
{
  if (sink == nullptr || msg.empty()) {
    return;
  }
  *sink = std::move(msg);
}
+
+/*
+ * Dump either the full user info or a subset to a formatter.
+ *
+ * NOTE: It is the caller's responsibility to ensure that the
+ * formatter is flushed at the correct time.
+ */
+
+// Dump the user's subusers ("<uid>:<name>" plus a permission string) as an
+// array section of 'f'.  Caller is responsible for flushing the formatter.
+static void dump_subusers_info(Formatter *f, RGWUserInfo &info)
+{
+ map<string, RGWSubUser>::iterator uiter;
+
+ f->open_array_section("subusers");
+ for (uiter = info.subusers.begin(); uiter != info.subusers.end(); ++uiter) {
+ RGWSubUser& u = uiter->second;
+ f->open_object_section("user");
+ string s;
+ info.user_id.to_str(s);
+ f->dump_format("id", "%s:%s", s.c_str(), u.name.c_str())
+ char buf[256];
+ rgw_perm_to_str(u.perm_mask, buf, sizeof(buf));
+ f->dump_string("permissions", buf);
+ f->close_section();
+ }
+ f->close_section();
+}
+
+// Dump all S3 access keys (subuser keys shown as "<uid>:<subuser>") with
+// their secrets as an array section of 'f'.
+static void dump_access_keys_info(Formatter *f, RGWUserInfo &info)
+{
+ map<string, RGWAccessKey>::iterator kiter;
+ f->open_array_section("keys");
+ for (kiter = info.access_keys.begin(); kiter != info.access_keys.end(); ++kiter) {
+ RGWAccessKey& k = kiter->second;
+ const char *sep = (k.subuser.empty() ? "" : ":");
+ const char *subuser = (k.subuser.empty() ? "" : k.subuser.c_str());
+ f->open_object_section("key");
+ string s;
+ info.user_id.to_str(s);
+ f->dump_format("user", "%s%s%s", s.c_str(), sep, subuser);
+ f->dump_string("access_key", k.id);
+ f->dump_string("secret_key", k.key);
+ f->close_section();
+ }
+ f->close_section();
+}
+
+// Dump all swift keys (owner shown as "<uid>:<subuser>") with their secrets
+// as an array section of 'f'.  Unlike S3 keys, no "access_key" is emitted.
+static void dump_swift_keys_info(Formatter *f, RGWUserInfo &info)
+{
+ map<string, RGWAccessKey>::iterator kiter;
+ f->open_array_section("swift_keys");
+ for (kiter = info.swift_keys.begin(); kiter != info.swift_keys.end(); ++kiter) {
+ RGWAccessKey& k = kiter->second;
+ const char *sep = (k.subuser.empty() ? "" : ":");
+ const char *subuser = (k.subuser.empty() ? "" : k.subuser.c_str());
+ f->open_object_section("key");
+ string s;
+ info.user_id.to_str(s);
+ f->dump_format("user", "%s%s%s", s.c_str(), sep, subuser);
+ f->dump_string("secret_key", k.key);
+ f->close_section();
+ }
+ f->close_section();
+}
+
+// Dump the full user record (identity, keys, caps, quotas, auth type and,
+// when provided, storage stats) as a "user_info" object section of 'f'.
+static void dump_user_info(Formatter *f, RGWUserInfo &info,
+ RGWStorageStats *stats = NULL)
+{
+ f->open_object_section("user_info");
+ encode_json("tenant", info.user_id.tenant, f);
+ encode_json("user_id", info.user_id.id, f);
+ encode_json("display_name", info.display_name, f);
+ encode_json("email", info.user_email, f);
+ encode_json("suspended", (int)info.suspended, f);
+ encode_json("max_buckets", (int)info.max_buckets, f);
+
+ dump_subusers_info(f, info);
+ dump_access_keys_info(f, info);
+ dump_swift_keys_info(f, info);
+
+ encode_json("caps", info.caps, f);
+
+ // op_mask is rendered as a human-readable permission string
+ char buf[256];
+ op_type_to_str(info.op_mask, buf, sizeof(buf));
+ encode_json("op_mask", (const char *)buf, f);
+ encode_json("system", (bool)info.system, f);
+ encode_json("admin", (bool)info.admin, f);
+ encode_json("default_placement", info.default_placement.name, f);
+ encode_json("default_storage_class", info.default_placement.storage_class, f);
+ encode_json("placement_tags", info.placement_tags, f);
+ encode_json("bucket_quota", info.quota.bucket_quota, f);
+ encode_json("user_quota", info.quota.user_quota, f);
+ encode_json("temp_url_keys", info.temp_url_keys, f);
+
+ // identity backend; unknown values fall back to "none"
+ string user_source_type;
+ switch ((RGWIdentityType)info.type) {
+ case TYPE_RGW:
+ user_source_type = "rgw";
+ break;
+ case TYPE_KEYSTONE:
+ user_source_type = "keystone";
+ break;
+ case TYPE_LDAP:
+ user_source_type = "ldap";
+ break;
+ case TYPE_NONE:
+ user_source_type = "none";
+ break;
+ default:
+ user_source_type = "none";
+ break;
+ }
+ encode_json("type", user_source_type, f);
+ encode_json("mfa_ids", info.mfa_ids, f);
+ if (stats) {
+ encode_json("stats", *stats, f);
+ }
+ f->close_section();
+}
+
+// Pre-flight checks for "user create": fail when the email/key/uid already
+// belongs to an existing user, when op_state is already populated, or when
+// no display name was supplied.  Returns 0 if creation may proceed.
+static int user_add_helper(RGWUserAdminOpState& op_state, std::string *err_msg)
+{
+ int ret = 0;
+ const rgw_user& uid = op_state.get_user_id();
+ std::string user_email = op_state.get_user_email();
+ std::string display_name = op_state.get_display_name();
+
+ // fail if the user exists already
+ if (op_state.has_existing_user()) {
+ // report the most specific reason (email vs. key vs. uid collision)
+ if (op_state.found_by_email) {
+ set_err_msg(err_msg, "email: " + user_email +
+ " is the email address of an existing user");
+ ret = -ERR_EMAIL_EXIST;
+ } else if (op_state.found_by_key) {
+ set_err_msg(err_msg, "duplicate key provided");
+ ret = -ERR_KEY_EXIST;
+ } else {
+ set_err_msg(err_msg, "user: " + uid.to_str() + " exists");
+ ret = -EEXIST;
+ }
+ return ret;
+ }
+
+ // fail if the user_info has already been populated
+ if (op_state.is_populated()) {
+ set_err_msg(err_msg, "cannot overwrite already populated user");
+ return -EEXIST;
+ }
+
+ // fail if the display name was not included
+ if (display_name.empty()) {
+ set_err_msg(err_msg, "no display name specified");
+ return -EINVAL;
+ }
+
+ return ret;
+}
+
+// Bind the key pool to its owning RGWUser.  A null user leaves both the
+// user and driver pointers unset; the pool is unusable until rebuilt.
+RGWAccessKeyPool::RGWAccessKeyPool(RGWUser* usr)
+{
+ if (!usr) {
+ return;
+ }
+
+ user = usr;
+
+ driver = user->get_driver();
+}
+
+// Cache pointers to the user's swift/s3 key maps out of op_state.  Key
+// operations are refused for an uninitialized op_state or the anon user.
+int RGWAccessKeyPool::init(RGWUserAdminOpState& op_state)
+{
+ if (!op_state.is_initialized()) {
+ keys_allowed = false;
+ return -EINVAL;
+ }
+
+ const rgw_user& uid = op_state.get_user_id();
+ if (uid.compare(RGW_USER_ANON_ID) == 0) {
+ keys_allowed = false;
+ return -EINVAL;
+ }
+
+ swift_keys = op_state.get_swift_keys();
+ access_keys = op_state.get_access_keys();
+
+ keys_allowed = true;
+
+ return 0;
+}
+
+// Start from the anonymous user as a placeholder until a real uid is set.
+RGWUserAdminOpState::RGWUserAdminOpState(rgw::sal::Driver* driver)
+{
+ user = driver->get_user(rgw_user(RGW_USER_ANON_ID));
+}
+
+// Set the target uid; an empty id is silently ignored.
+void RGWUserAdminOpState::set_user_id(const rgw_user& id)
+{
+ if (id.empty())
+ return;
+
+ user->get_info().user_id = id;
+}
+
+// Accept either "subuser" or "uid:subuser".  In the latter form the uid
+// part also (re)sets the target user id; when that uid carries no tenant,
+// the currently-set tenant is preserved.
+void RGWUserAdminOpState::set_subuser(std::string& _subuser)
+{
+ if (_subuser.empty())
+ return;
+
+ size_t pos = _subuser.find(":");
+ if (pos != string::npos) {
+ rgw_user tmp_id;
+ tmp_id.from_str(_subuser.substr(0, pos));
+ if (tmp_id.tenant.empty()) {
+ user->get_info().user_id.id = tmp_id.id;
+ } else {
+ user->get_info().user_id = tmp_id;
+ }
+ subuser = _subuser.substr(pos+1);
+ } else {
+ subuser = _subuser;
+ }
+
+ subuser_specified = true;
+}
+
+// Replace the cached user info wholesale.
+void RGWUserAdminOpState::set_user_info(RGWUserInfo& user_info)
+{
+ user->get_info() = user_info;
+}
+
+// Replace the cached object version tracker for the user's metadata object.
+void RGWUserAdminOpState::set_user_version_tracker(RGWObjVersionTracker& objv_tracker)
+{
+ user->get_version_tracker() = objv_tracker;
+}
+
+// Target uid (may still be the anonymous placeholder set by the ctor).
+const rgw_user& RGWUserAdminOpState::get_user_id()
+{
+ return user->get_id();
+}
+
+// Mutable reference to the cached user info.
+RGWUserInfo& RGWUserAdminOpState::get_user_info()
+{
+ return user->get_info();
+}
+
+// Live pointer into the cached user info's swift key map.
+map<std::string, RGWAccessKey>* RGWUserAdminOpState::get_swift_keys()
+{
+ return &user->get_info().swift_keys;
+}
+
+// Live pointer into the cached user info's S3 access key map.
+map<std::string, RGWAccessKey>* RGWUserAdminOpState::get_access_keys()
+{
+ return &user->get_info().access_keys;
+}
+
+// Live pointer into the cached user info's subuser map.
+map<std::string, RGWSubUser>* RGWUserAdminOpState::get_subusers()
+{
+ return &user->get_info().subusers;
+}
+
+// Live pointer to the cached user info's capability set.
+RGWUserCaps *RGWUserAdminOpState::get_caps_obj()
+{
+ return &user->get_info().caps;
+}
+
+std::string RGWUserAdminOpState::build_default_swift_kid()
+{
+ if (user->get_id().empty() || subuser.empty())
+ return "";
+
+ std::string kid;
+ user->get_id().to_str(kid);
+ kid.append(":");
+ kid.append(subuser);
+
+ return kid;
+}
+
+// Generate a subuser name "<uid><RANDOM>" using a RAND_SUBUSER_LEN
+// upper-alphanumeric suffix; stores it in 'subuser' and returns it.
+// Returns "" when no uid is set (or suffix generation yields nothing).
+std::string RGWUserAdminOpState::generate_subuser() {
+ if (user->get_id().empty())
+ return "";
+
+ std::string generated_subuser;
+ user->get_id().to_str(generated_subuser);
+ std::string rand_suffix;
+
+ int sub_buf_size = RAND_SUBUSER_LEN + 1;
+ char sub_buf[RAND_SUBUSER_LEN + 1];
+
+ gen_rand_alphanumeric_upper(g_ceph_context, sub_buf, sub_buf_size);
+
+ rand_suffix = sub_buf;
+ if (rand_suffix.empty())
+ return "";
+
+ generated_subuser.append(rand_suffix);
+ subuser = generated_subuser;
+
+ return generated_subuser;
+}
+
+/*
+ * Do a fairly exhaustive search for an existing key matching the parameters
+ * given. Also handles the case where no key type was specified and updates
+ * the operation state if needed.
+ */
+
+// Search the cached key maps for a key matching op_state.  When no key
+// type was specified, tries S3 by raw id, then swift by raw id, then swift
+// by the default "uid:subuser" id, updating op_state's key type and access
+// key to whatever matched.  Records the result via set_existing_key().
+bool RGWAccessKeyPool::check_existing_key(RGWUserAdminOpState& op_state)
+{
+ bool existing_key = false;
+
+ int key_type = op_state.get_key_type();
+ std::string kid = op_state.get_access_key();
+ std::map<std::string, RGWAccessKey>::iterator kiter;
+ std::string swift_kid = op_state.build_default_swift_kid();
+
+ RGWUserInfo dup_info;
+
+ if (kid.empty() && swift_kid.empty())
+ return false;
+
+ switch (key_type) {
+ case KEY_TYPE_SWIFT:
+ kiter = swift_keys->find(swift_kid);
+
+ existing_key = (kiter != swift_keys->end());
+ if (existing_key)
+ op_state.set_access_key(swift_kid);
+
+ break;
+ case KEY_TYPE_S3:
+ kiter = access_keys->find(kid);
+ existing_key = (kiter != access_keys->end());
+
+ break;
+ default:
+ // no key type given: probe S3 first, then swift
+ kiter = access_keys->find(kid);
+
+ existing_key = (kiter != access_keys->end());
+ if (existing_key) {
+ op_state.set_key_type(KEY_TYPE_S3);
+ break;
+ }
+
+ kiter = swift_keys->find(kid);
+
+ existing_key = (kiter != swift_keys->end());
+ if (existing_key) {
+ op_state.set_key_type(KEY_TYPE_SWIFT);
+ break;
+ }
+
+ // handle the case where the access key was not provided in user:key format
+ if (swift_kid.empty())
+ return false;
+
+ kiter = swift_keys->find(swift_kid);
+
+ existing_key = (kiter != swift_keys->end());
+ if (existing_key) {
+ op_state.set_access_key(swift_kid);
+ op_state.set_key_type(KEY_TYPE_SWIFT);
+ }
+ }
+
+ op_state.set_existing_key(existing_key);
+
+ return existing_key;
+}
+
+// Validate a key add/remove request: infer the key type when unset (swift
+// for subuser requests, s3 otherwise), require an access key for
+// non-generated s3 keys, and record whether the key already exists.
+int RGWAccessKeyPool::check_op(RGWUserAdminOpState& op_state,
+ std::string *err_msg)
+{
+ RGWUserInfo dup_info;
+
+ if (!op_state.is_populated()) {
+ set_err_msg(err_msg, "user info was not populated");
+ return -EINVAL;
+ }
+
+ if (!keys_allowed) {
+ set_err_msg(err_msg, "keys not allowed for this user");
+ return -EACCES;
+ }
+
+ int32_t key_type = op_state.get_key_type();
+
+ // if a key type wasn't specified
+ if (key_type < 0) {
+ if (op_state.has_subuser()) {
+ key_type = KEY_TYPE_SWIFT;
+ } else {
+ key_type = KEY_TYPE_S3;
+ }
+ }
+
+ op_state.set_key_type(key_type);
+
+ /* see if the access key was specified */
+ if (key_type == KEY_TYPE_S3 && !op_state.will_gen_access() &&
+ op_state.get_access_key().empty()) {
+ set_err_msg(err_msg, "empty access key");
+ return -ERR_INVALID_ACCESS_KEY;
+ }
+
+ // don't check for secret key because we may be doing a removal
+
+ if (check_existing_key(op_state)) {
+ op_state.set_access_key_exist();
+ }
+ return 0;
+}
+
+// Generate a new random key
+// Create a brand-new key for the user and insert it into the appropriate
+// key map (not yet persisted).  Honors the gen-access/gen-secret flags,
+// rejects ids that already exist anywhere in the RGW system, and ties the
+// key to the subuser when one is in play.
+int RGWAccessKeyPool::generate_key(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state,
+ optional_yield y, std::string *err_msg)
+{
+ std::string id;
+ std::string key;
+
+ std::pair<std::string, RGWAccessKey> key_pair;
+ RGWAccessKey new_key;
+ std::unique_ptr<rgw::sal::User> duplicate_check;
+
+ int key_type = op_state.get_key_type();
+ bool gen_access = op_state.will_gen_access();
+ bool gen_secret = op_state.will_gen_secret();
+
+ if (!keys_allowed) {
+ set_err_msg(err_msg, "access keys not allowed for this user");
+ return -EACCES;
+ }
+
+ if (op_state.has_existing_key()) {
+ set_err_msg(err_msg, "cannot create existing key");
+ return -ERR_KEY_EXIST;
+ }
+
+ if (!gen_access) {
+ id = op_state.get_access_key();
+ }
+
+ // a caller-supplied id must not collide with any user's existing key
+ if (!id.empty()) {
+ switch (key_type) {
+ case KEY_TYPE_SWIFT:
+ if (driver->get_user_by_swift(dpp, id, y, &duplicate_check) >= 0) {
+ set_err_msg(err_msg, "existing swift key in RGW system:" + id);
+ return -ERR_KEY_EXIST;
+ }
+ break;
+ case KEY_TYPE_S3:
+ if (driver->get_user_by_access_key(dpp, id, y, &duplicate_check) >= 0) {
+ set_err_msg(err_msg, "existing S3 key in RGW system:" + id);
+ return -ERR_KEY_EXIST;
+ }
+ }
+ }
+
+ //key's subuser
+ if (op_state.has_subuser()) {
+ //create user and subuser at the same time, user's s3 key should not be set this
+ if (!op_state.key_type_setbycontext || (key_type == KEY_TYPE_SWIFT)) {
+ new_key.subuser = op_state.get_subuser();
+ }
+ }
+
+ //Secret key
+ if (!gen_secret) {
+ if (op_state.get_secret_key().empty()) {
+ set_err_msg(err_msg, "empty secret key");
+ return -ERR_INVALID_SECRET_KEY;
+ }
+
+ key = op_state.get_secret_key();
+ } else {
+ char secret_key_buf[SECRET_KEY_LEN + 1];
+ gen_rand_alphanumeric_plain(g_ceph_context, secret_key_buf, sizeof(secret_key_buf));
+ key = secret_key_buf;
+ }
+
+ // Generate the access key; loop until the id is both URL-safe and unused
+ if (key_type == KEY_TYPE_S3 && gen_access) {
+ char public_id_buf[PUBLIC_ID_LEN + 1];
+
+ do {
+ int id_buf_size = sizeof(public_id_buf);
+ gen_rand_alphanumeric_upper(g_ceph_context, public_id_buf, id_buf_size);
+ id = public_id_buf;
+ if (!validate_access_key(id))
+ continue;
+
+ } while (!driver->get_user_by_access_key(dpp, id, y, &duplicate_check));
+ }
+
+ if (key_type == KEY_TYPE_SWIFT) {
+ id = op_state.build_default_swift_kid();
+ if (id.empty()) {
+ set_err_msg(err_msg, "empty swift access key");
+ return -ERR_INVALID_ACCESS_KEY;
+ }
+
+ // check that the access key doesn't exist
+ if (driver->get_user_by_swift(dpp, id, y, &duplicate_check) >= 0) {
+ set_err_msg(err_msg, "cannot create existing swift key");
+ return -ERR_KEY_EXIST;
+ }
+ }
+
+ // finally create the new key
+ new_key.id = id;
+ new_key.key = key;
+
+ key_pair.first = id;
+ key_pair.second = new_key;
+
+ if (key_type == KEY_TYPE_S3) {
+ access_keys->insert(key_pair);
+ } else if (key_type == KEY_TYPE_SWIFT) {
+ swift_keys->insert(key_pair);
+ }
+
+ return 0;
+}
+
+// modify an existing key
+// Replace the secret of an existing key (s3 by access key id, swift by the
+// default "uid:subuser" id), generating a new secret when requested.  The
+// updated key is written back into the cached map, not yet persisted.
+int RGWAccessKeyPool::modify_key(RGWUserAdminOpState& op_state, std::string *err_msg)
+{
+ std::string id;
+ std::string key = op_state.get_secret_key();
+ int key_type = op_state.get_key_type();
+
+ RGWAccessKey modify_key;
+
+ pair<string, RGWAccessKey> key_pair;
+ map<std::string, RGWAccessKey>::iterator kiter;
+
+ switch (key_type) {
+ case KEY_TYPE_S3:
+ id = op_state.get_access_key();
+ if (id.empty()) {
+ set_err_msg(err_msg, "no access key specified");
+ return -ERR_INVALID_ACCESS_KEY;
+ }
+ break;
+ case KEY_TYPE_SWIFT:
+ id = op_state.build_default_swift_kid();
+ if (id.empty()) {
+ set_err_msg(err_msg, "no subuser specified");
+ return -EINVAL;
+ }
+ break;
+ default:
+ set_err_msg(err_msg, "invalid key type");
+ return -ERR_INVALID_KEY_TYPE;
+ }
+
+ if (!op_state.has_existing_key()) {
+ set_err_msg(err_msg, "key does not exist");
+ return -ERR_INVALID_ACCESS_KEY;
+ }
+
+ key_pair.first = id;
+
+ // start from the current key entry (s3) or rebuild id/subuser (swift)
+ if (key_type == KEY_TYPE_SWIFT) {
+ modify_key.id = id;
+ modify_key.subuser = op_state.get_subuser();
+ } else if (key_type == KEY_TYPE_S3) {
+ kiter = access_keys->find(id);
+ if (kiter != access_keys->end()) {
+ modify_key = kiter->second;
+ }
+ }
+
+ if (op_state.will_gen_secret()) {
+ char secret_key_buf[SECRET_KEY_LEN + 1];
+ int key_buf_size = sizeof(secret_key_buf);
+ gen_rand_alphanumeric_plain(g_ceph_context, secret_key_buf, key_buf_size);
+ key = secret_key_buf;
+ }
+
+ if (key.empty()) {
+ set_err_msg(err_msg, "empty secret key");
+ return -ERR_INVALID_SECRET_KEY;
+ }
+
+ // update the access key with the new secret key
+ modify_key.key = key;
+
+ key_pair.second = modify_key;
+
+
+ if (key_type == KEY_TYPE_S3) {
+ (*access_keys)[id] = modify_key;
+ } else if (key_type == KEY_TYPE_SWIFT) {
+ (*swift_keys)[id] = modify_key;
+ }
+
+ return 0;
+}
+
+// Dispatch to generate_key() or modify_key() depending on whether the key
+// already exists, then persist the user unless the caller defers the save.
+int RGWAccessKeyPool::execute_add(const DoutPrefixProvider *dpp,
+ RGWUserAdminOpState& op_state,
+ std::string *err_msg, bool defer_user_update,
+ optional_yield y)
+{
+ int ret = 0;
+
+ std::string subprocess_msg;
+ int key_op = GENERATE_KEY;
+
+ // set the op
+ if (op_state.has_existing_key())
+ key_op = MODIFY_KEY;
+
+ switch (key_op) {
+ case GENERATE_KEY:
+ ret = generate_key(dpp, op_state, y, &subprocess_msg);
+ break;
+ case MODIFY_KEY:
+ ret = modify_key(op_state, &subprocess_msg);
+ break;
+ }
+
+ if (ret < 0) {
+ set_err_msg(err_msg, subprocess_msg);
+ return ret;
+ }
+
+ // store the updated info
+ if (!defer_user_update)
+ ret = user->update(dpp, op_state, err_msg, y);
+
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+// Public add-key entry point; saves the user immediately (no deferral).
+int RGWAccessKeyPool::add(const DoutPrefixProvider *dpp,
+ RGWUserAdminOpState& op_state, optional_yield y,
+ std::string *err_msg)
+{
+ return add(dpp, op_state, err_msg, false, y);
+}
+
+// Validate then add/modify a key; with defer_user_update the caller is
+// responsible for persisting the user afterwards.
+int RGWAccessKeyPool::add(const DoutPrefixProvider *dpp,
+ RGWUserAdminOpState& op_state, std::string *err_msg,
+ bool defer_user_update, optional_yield y)
+{
+ int ret;
+ std::string subprocess_msg;
+
+ ret = check_op(op_state, &subprocess_msg);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to parse request, " + subprocess_msg);
+ return ret;
+ }
+
+ ret = execute_add(dpp, op_state, &subprocess_msg, defer_user_update, y);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to add access key, " + subprocess_msg);
+ return ret;
+ }
+
+ return 0;
+}
+
+// Erase the key identified by op_state from the matching key map and, by
+// default, persist the updated user.  Fails when the key was never located
+// by check_existing_key() or the key type is unusable.
+int RGWAccessKeyPool::execute_remove(const DoutPrefixProvider *dpp,
+ RGWUserAdminOpState& op_state,
+ std::string *err_msg,
+ bool defer_user_update,
+ optional_yield y)
+{
+ int ret = 0;
+
+ int key_type = op_state.get_key_type();
+ std::string id = op_state.get_access_key();
+ map<std::string, RGWAccessKey>::iterator kiter;
+ map<std::string, RGWAccessKey> *keys_map;
+
+ if (!op_state.has_existing_key()) {
+ set_err_msg(err_msg, "unable to find access key, with key type: " +
+ key_type_to_str(key_type));
+ return -ERR_INVALID_ACCESS_KEY;
+ }
+
+ if (key_type == KEY_TYPE_S3) {
+ keys_map = access_keys;
+ } else if (key_type == KEY_TYPE_SWIFT) {
+ keys_map = swift_keys;
+ } else {
+ keys_map = NULL;
+ set_err_msg(err_msg, "invalid access key");
+ return -ERR_INVALID_ACCESS_KEY;
+ }
+
+ kiter = keys_map->find(id);
+ if (kiter == keys_map->end()) {
+ set_err_msg(err_msg, "key not found");
+ return -ERR_INVALID_ACCESS_KEY;
+ }
+
+ keys_map->erase(kiter);
+
+ if (!defer_user_update)
+ ret = user->update(dpp, op_state, err_msg, y);
+
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+// Public remove-key entry point; saves the user immediately (no deferral).
+int RGWAccessKeyPool::remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y,
+ std::string *err_msg)
+{
+ return remove(dpp, op_state, err_msg, false, y);
+}
+
+// Validate then remove a key; with defer_user_update the caller is
+// responsible for persisting the user afterwards.
+int RGWAccessKeyPool::remove(const DoutPrefixProvider *dpp,
+ RGWUserAdminOpState& op_state,
+ std::string *err_msg, bool defer_user_update,
+ optional_yield y)
+{
+ int ret;
+
+ std::string subprocess_msg;
+
+ ret = check_op(op_state, &subprocess_msg);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to parse request, " + subprocess_msg);
+ return ret;
+ }
+
+ ret = execute_remove(dpp, op_state, &subprocess_msg, defer_user_update, y);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to remove access key, " + subprocess_msg);
+ return ret;
+ }
+
+ return 0;
+}
+
+// remove all keys associated with a subuser
+int RGWAccessKeyPool::remove_subuser_keys(const DoutPrefixProvider *dpp,
+ RGWUserAdminOpState& op_state,
+ std::string *err_msg,
+ bool defer_user_update,
+ optional_yield y)
+{
+ int ret = 0;
+
+ if (!op_state.is_populated()) {
+ set_err_msg(err_msg, "user info was not populated");
+ return -EINVAL;
+ }
+
+ if (!op_state.has_subuser()) {
+ set_err_msg(err_msg, "no subuser specified");
+ return -EINVAL;
+ }
+
+ std::string swift_kid = op_state.build_default_swift_kid();
+ if (swift_kid.empty()) {
+ set_err_msg(err_msg, "empty swift access key");
+ return -EINVAL;
+ }
+
+ map<std::string, RGWAccessKey>::iterator kiter;
+ map<std::string, RGWAccessKey> *keys_map;
+
+ // a subuser can have at most one swift key
+ keys_map = swift_keys;
+ kiter = keys_map->find(swift_kid);
+ if (kiter != keys_map->end()) {
+ keys_map->erase(kiter);
+ }
+
+ // a subuser may have multiple s3 key pairs
+ std::string subuser_str = op_state.get_subuser();
+ keys_map = access_keys;
+ RGWUserInfo user_info = op_state.get_user_info();
+ auto user_kiter = user_info.access_keys.begin();
+ for (; user_kiter != user_info.access_keys.end(); ++user_kiter) {
+ if (user_kiter->second.subuser == subuser_str) {
+ kiter = keys_map->find(user_kiter->first);
+ if (kiter != keys_map->end()) {
+ keys_map->erase(kiter);
+ }
+ }
+ }
+
+ if (!defer_user_update)
+ ret = user->update(dpp, op_state, err_msg, y);
+
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+// Bind the subuser pool to its owning RGWUser; a null user leaves the pool
+// unusable (user/driver unset) until properly constructed.
+RGWSubUserPool::RGWSubUserPool(RGWUser *usr)
+{
+ if (!usr) {
+ return;
+ }
+
+ user = usr;
+
+ subusers_allowed = true;
+ driver = user->get_driver();
+}
+
+// Cache the subuser map pointer out of op_state.  Subuser operations are
+// refused for an uninitialized op_state or the anonymous user.
+int RGWSubUserPool::init(RGWUserAdminOpState& op_state)
+{
+ if (!op_state.is_initialized()) {
+ subusers_allowed = false;
+ return -EINVAL;
+ }
+
+ const rgw_user& uid = op_state.get_user_id();
+ if (uid.compare(RGW_USER_ANON_ID) == 0) {
+ subusers_allowed = false;
+ return -EACCES;
+ }
+
+ subuser_map = op_state.get_subusers();
+ if (subuser_map == NULL) {
+ subusers_allowed = false;
+ return -EINVAL;
+ }
+
+ subusers_allowed = true;
+
+ return 0;
+}
+
+bool RGWSubUserPool::exists(std::string subuser)
+{
+ if (subuser.empty())
+ return false;
+
+ if (!subuser_map)
+ return false;
+
+ if (subuser_map->count(subuser))
+ return true;
+
+ return false;
+}
+
+// Validate a subuser request: require a (possibly generated) name and a
+// valid permission mask, default the key type to swift, and record whether
+// the subuser already exists.
+int RGWSubUserPool::check_op(RGWUserAdminOpState& op_state,
+ std::string *err_msg)
+{
+ bool existing = false;
+ std::string subuser = op_state.get_subuser();
+
+ if (!op_state.is_populated()) {
+ set_err_msg(err_msg, "user info was not populated");
+ return -EINVAL;
+ }
+
+ if (!subusers_allowed) {
+ set_err_msg(err_msg, "subusers not allowed for this user");
+ return -EACCES;
+ }
+
+ if (subuser.empty() && !op_state.will_gen_subuser()) {
+ set_err_msg(err_msg, "empty subuser name");
+ return -EINVAL;
+ }
+
+ if (op_state.get_subuser_perm() == RGW_PERM_INVALID) {
+ set_err_msg(err_msg, "invalid subuser access");
+ return -EINVAL;
+ }
+
+ //set key type when it not set or set by context
+ if ((op_state.get_key_type() < 0) || op_state.key_type_setbycontext) {
+ op_state.set_key_type(KEY_TYPE_SWIFT);
+ op_state.key_type_setbycontext = true;
+ }
+
+ // check if the subuser exists
+ if (!subuser.empty())
+ existing = exists(subuser);
+
+ op_state.set_existing_subuser(existing);
+
+ return 0;
+}
+
+// Create the subuser (optionally creating its key first via the key pool,
+// with the user save deferred), insert it into the subuser map, then
+// persist the user unless deferred by the caller.
+int RGWSubUserPool::execute_add(const DoutPrefixProvider *dpp,
+ RGWUserAdminOpState& op_state,
+ std::string *err_msg, bool defer_user_update,
+ optional_yield y)
+{
+ int ret = 0;
+ std::string subprocess_msg;
+
+ RGWSubUser subuser;
+ std::pair<std::string, RGWSubUser> subuser_pair;
+ std::string subuser_str = op_state.get_subuser();
+
+ subuser_pair.first = subuser_str;
+
+ // assumes key should be created
+ if (op_state.has_key_op()) {
+ ret = user->keys.add(dpp, op_state, &subprocess_msg, true, y);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to create subuser key, " + subprocess_msg);
+ return ret;
+ }
+ }
+
+ // create the subuser
+ subuser.name = subuser_str;
+
+ if (op_state.has_subuser_perm())
+ subuser.perm_mask = op_state.get_subuser_perm();
+
+ // insert the subuser into user info
+ subuser_pair.second = subuser;
+ subuser_map->insert(subuser_pair);
+
+ // attempt to save the subuser
+ if (!defer_user_update)
+ ret = user->update(dpp, op_state, err_msg, y);
+
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+// Public add-subuser entry point; saves the user immediately (no deferral).
+int RGWSubUserPool::add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y,
+ std::string *err_msg)
+{
+ return add(dpp, op_state, err_msg, false, y);
+}
+
+// Validate then create a subuser, refusing to clobber an existing key and
+// enabling access/secret generation when none were supplied.  With
+// defer_user_update the caller persists the user afterwards.
+int RGWSubUserPool::add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_user_update, optional_yield y)
+{
+ std::string subprocess_msg;
+ int ret;
+ int32_t key_type = op_state.get_key_type();
+
+ ret = check_op(op_state, &subprocess_msg);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to parse request, " + subprocess_msg);
+ return ret;
+ }
+
+ if (op_state.get_access_key_exist()) {
+ set_err_msg(err_msg, "cannot create existing key");
+ return -ERR_KEY_EXIST;
+ }
+
+ if (key_type == KEY_TYPE_S3 && op_state.get_access_key().empty()) {
+ op_state.set_gen_access();
+ }
+
+ if (op_state.get_secret_key().empty()) {
+ op_state.set_gen_secret();
+ }
+
+ ret = execute_add(dpp, op_state, &subprocess_msg, defer_user_update, y);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to create subuser, " + subprocess_msg);
+ return ret;
+ }
+
+ return 0;
+}
+
+// Remove the subuser: purge its keys (best effort), erase it from the
+// subuser map, then persist the user unless deferred.
+int RGWSubUserPool::execute_remove(const DoutPrefixProvider *dpp,
+ RGWUserAdminOpState& op_state,
+ std::string *err_msg, bool defer_user_update,
+ optional_yield y)
+{
+ int ret = 0;
+ std::string subprocess_msg;
+
+ std::string subuser_str = op_state.get_subuser();
+
+ map<std::string, RGWSubUser>::iterator siter;
+ siter = subuser_map->find(subuser_str);
+ if (siter == subuser_map->end()){
+ set_err_msg(err_msg, "subuser not found: " + subuser_str);
+ return -ERR_NO_SUCH_SUBUSER;
+ }
+ // NOTE(review): redundant with the map lookup above -- both report the
+ // same "not found" condition
+ if (!op_state.has_existing_subuser()) {
+ set_err_msg(err_msg, "subuser not found: " + subuser_str);
+ return -ERR_NO_SUCH_SUBUSER;
+ }
+
+ // always purge all associate keys
+ // best effort: the return value is deliberately not checked
+ user->keys.remove_subuser_keys(dpp, op_state, &subprocess_msg, true, y);
+
+ // remove the subuser from the user info
+ subuser_map->erase(siter);
+
+ // attempt to save the subuser
+ if (!defer_user_update)
+ ret = user->update(dpp, op_state, err_msg, y);
+
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+// Public remove-subuser entry point; saves the user immediately.
+int RGWSubUserPool::remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y,
+ std::string *err_msg)
+{
+ return remove(dpp, op_state, err_msg, false, y);
+}
+
+// Validate then remove a subuser; with defer_user_update the caller is
+// responsible for persisting the user afterwards.
+int RGWSubUserPool::remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg,
+ bool defer_user_update, optional_yield y)
+{
+ std::string subprocess_msg;
+ int ret;
+
+ ret = check_op(op_state, &subprocess_msg);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to parse request, " + subprocess_msg);
+ return ret;
+ }
+
+ ret = execute_remove(dpp, op_state, &subprocess_msg, defer_user_update, y);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to remove subuser, " + subprocess_msg);
+ return ret;
+ }
+
+ return 0;
+}
+
+// Modify an existing subuser: optionally (re)create its key, update its
+// permission mask, re-insert it into the map, then persist the user unless
+// the caller defers the save.
+int RGWSubUserPool::execute_modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_user_update, optional_yield y)
+{
+ int ret = 0;
+ std::string subprocess_msg;
+ std::map<std::string, RGWSubUser>::iterator siter;
+ std::pair<std::string, RGWSubUser> subuser_pair;
+
+ std::string subuser_str = op_state.get_subuser();
+ RGWSubUser subuser;
+
+ if (!op_state.has_existing_subuser()) {
+ set_err_msg(err_msg, "subuser does not exist");
+ return -ERR_NO_SUCH_SUBUSER;
+ }
+
+ subuser_pair.first = subuser_str;
+
+ // has_existing_subuser() above guarantees the lookup succeeds
+ siter = subuser_map->find(subuser_str);
+ subuser = siter->second;
+
+ if (op_state.has_key_op()) {
+ ret = user->keys.add(dpp, op_state, &subprocess_msg, true, y);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to create subuser keys, " + subprocess_msg);
+ return ret;
+ }
+ }
+
+ if (op_state.has_subuser_perm())
+ subuser.perm_mask = op_state.get_subuser_perm();
+
+ subuser_pair.second = subuser;
+
+ subuser_map->erase(siter);
+ subuser_map->insert(subuser_pair);
+
+ // attempt to save the subuser
+ if (!defer_user_update)
+ ret = user->update(dpp, op_state, err_msg, y);
+
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+// Public modify-subuser entry point; saves the user immediately.
+int RGWSubUserPool::modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg)
+{
+ return RGWSubUserPool::modify(dpp, op_state, y, err_msg, false);
+}
+
+int RGWSubUserPool::modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg, bool defer_user_update)
+{
+ std::string subprocess_msg;
+ int ret;
+
+ RGWSubUser subuser;
+
+ ret = check_op(op_state, &subprocess_msg);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to parse request, " + subprocess_msg);
+ return ret;
+ }
+
+ ret = execute_modify(dpp, op_state, &subprocess_msg, defer_user_update, y);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to modify subuser, " + subprocess_msg);
+ return ret;
+ }
+
+ return 0;
+}
+
+// Bind the cap pool to its owning RGWUser; a null user leaves the pool
+// unusable until properly constructed.
+RGWUserCapPool::RGWUserCapPool(RGWUser *usr)
+{
+ if (!usr) {
+ return;
+ }
+ user = usr;
+ caps_allowed = true;
+}
+
+// Cache the caps object pointer out of op_state.  Cap operations are
+// refused for an uninitialized op_state or the anonymous user.
+int RGWUserCapPool::init(RGWUserAdminOpState& op_state)
+{
+ if (!op_state.is_initialized()) {
+ caps_allowed = false;
+ return -EINVAL;
+ }
+
+ const rgw_user& uid = op_state.get_user_id();
+ if (uid.compare(RGW_USER_ANON_ID) == 0) {
+ caps_allowed = false;
+ return -EACCES;
+ }
+
+ caps = op_state.get_caps_obj();
+ if (!caps) {
+ caps_allowed = false;
+ return -ERR_INVALID_CAP;
+ }
+
+ caps_allowed = true;
+
+ return 0;
+}
+
+// Public add-caps entry point; saves the user immediately (no deferral).
+int RGWUserCapPool::add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y,
+ std::string *err_msg)
+{
+ return add(dpp, op_state, err_msg, false, y);
+}
+
+// Parse the caps string from op_state and merge it into the user's caps,
+// then persist the user unless the caller defers the save.
+int RGWUserCapPool::add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg,
+ bool defer_save, optional_yield y)
+{
+ int ret = 0;
+ std::string caps_str = op_state.get_caps();
+
+ if (!op_state.is_populated()) {
+ set_err_msg(err_msg, "user info was not populated");
+ return -EINVAL;
+ }
+
+ if (!caps_allowed) {
+ set_err_msg(err_msg, "caps not allowed for this user");
+ return -EACCES;
+ }
+
+ if (caps_str.empty()) {
+ set_err_msg(err_msg, "empty user caps");
+ return -ERR_INVALID_CAP;
+ }
+
+ int r = caps->add_from_string(caps_str);
+ if (r < 0) {
+ set_err_msg(err_msg, "unable to add caps: " + caps_str);
+ return r;
+ }
+
+ if (!defer_save)
+ ret = user->update(dpp, op_state, err_msg, y);
+
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+// Public remove-caps entry point; saves the user immediately.
+int RGWUserCapPool::remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y,
+ std::string *err_msg)
+{
+ return remove(dpp, op_state, err_msg, false, y);
+}
+
+// Parse the caps string from op_state and strip it from the user's caps,
+// then persist the user unless the caller defers the save.
+int RGWUserCapPool::remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg,
+ bool defer_save, optional_yield y)
+{
+ int ret = 0;
+
+ std::string caps_str = op_state.get_caps();
+
+ if (!op_state.is_populated()) {
+ set_err_msg(err_msg, "user info was not populated");
+ return -EINVAL;
+ }
+
+ if (!caps_allowed) {
+ set_err_msg(err_msg, "caps not allowed for this user");
+ return -EACCES;
+ }
+
+ if (caps_str.empty()) {
+ set_err_msg(err_msg, "empty user caps");
+ return -ERR_INVALID_CAP;
+ }
+
+ int r = caps->remove_from_string(caps_str);
+ if (r < 0) {
+ set_err_msg(err_msg, "unable to remove caps: " + caps_str);
+ return r;
+ }
+
+ if (!defer_save)
+ ret = user->update(dpp, op_state, err_msg, y);
+
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+// Wire the cap/key/subuser pools to this user and reset to defaults.
+RGWUser::RGWUser() : caps(this), keys(this), subusers(this)
+{
+ init_default();
+}
+
+// Full initialization: reset to defaults, attach the storage driver, then
+// resolve the target user from op_state.
+int RGWUser::init(const DoutPrefixProvider *dpp, rgw::sal::Driver* _driver,
+ RGWUserAdminOpState& op_state, optional_yield y)
+{
+ init_default();
+ int ret = init_storage(_driver);
+ if (ret < 0)
+ return ret;
+
+ ret = init(dpp, op_state, y);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+// Reset to a pristine state: anonymous placeholder info, not populated.
+void RGWUser::init_default()
+{
+ // use anonymous user info as a placeholder
+ rgw_get_anon_user(old_info);
+ user_id = RGW_USER_ANON_ID;
+
+ clear_populated();
+}
+
+// Attach the storage driver and rebuild the helper pools so they point at
+// this (now driver-backed) RGWUser.  Fails on a null driver.
+int RGWUser::init_storage(rgw::sal::Driver* _driver)
+{
+ if (!_driver) {
+ return -EINVAL;
+ }
+
+ driver = _driver;
+
+ clear_populated();
+
+ /* API wrappers */
+ keys = RGWAccessKeyPool(this);
+ caps = RGWUserCapPool(this);
+ subusers = RGWSubUserPool(this);
+
+ return 0;
+}
+
+// Resolve the target user from op_state by uid, then (optionally) by
+// unique email, swift key, or S3 access key.  On a match, populate both
+// op_state and this object with the stored user info; finally initialize
+// the helper pools.
+int RGWUser::init(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y)
+{
+ bool found = false;
+ std::string swift_user;
+ user_id = op_state.get_user_id();
+ std::string user_email = op_state.get_user_email();
+ std::string access_key = op_state.get_access_key();
+ std::string subuser = op_state.get_subuser();
+
+ int key_type = op_state.get_key_type();
+ if (key_type == KEY_TYPE_SWIFT) {
+ // for swift, the "access key" field actually carries the swift user
+ swift_user = op_state.get_access_key();
+ access_key.clear();
+ }
+
+ std::unique_ptr<rgw::sal::User> user;
+
+ clear_populated();
+
+ // derive the uid from a "uid:subuser" style subuser when none was given
+ if (user_id.empty() && !subuser.empty()) {
+ size_t pos = subuser.find(':');
+ if (pos != string::npos) {
+ user_id = subuser.substr(0, pos);
+ op_state.set_user_id(user_id);
+ }
+ }
+
+ if (!user_id.empty() && (user_id.compare(RGW_USER_ANON_ID) != 0)) {
+ user = driver->get_user(user_id);
+ found = (user->load_user(dpp, y) >= 0);
+ op_state.found_by_uid = found;
+ }
+ // email lookup only makes sense when emails are enforced unique
+ if (driver->ctx()->_conf.get_val<bool>("rgw_user_unique_email")) {
+ if (!user_email.empty() && !found) {
+ found = (driver->get_user_by_email(dpp, user_email, y, &user) >= 0);
+ op_state.found_by_email = found;
+ }
+ }
+ if (!swift_user.empty() && !found) {
+ found = (driver->get_user_by_swift(dpp, swift_user, y, &user) >= 0);
+ op_state.found_by_key = found;
+ }
+ if (!access_key.empty() && !found) {
+ found = (driver->get_user_by_access_key(dpp, access_key, y, &user) >= 0);
+ op_state.found_by_key = found;
+ }
+
+ op_state.set_existing_user(found);
+ if (found) {
+ op_state.set_user_info(user->get_info());
+ op_state.set_populated();
+ op_state.objv = user->get_version_tracker();
+ op_state.set_user_version_tracker(user->get_version_tracker());
+
+ old_info = user->get_info();
+ set_populated();
+ }
+
+ if (user_id.empty()) {
+ // NOTE(review): 'user' can still be null here when no uid/email/key
+ // matched above -- confirm all callers supply at least one identifier
+ user_id = user->get_id();
+ }
+ op_state.set_initialized();
+
+ // this may have been called by a helper object
+ int ret = init_members(op_state);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+int RGWUser::init_members(RGWUserAdminOpState& op_state)
+{
+ int ret = 0;
+
+ ret = keys.init(op_state);
+ if (ret < 0)
+ return ret;
+
+ ret = subusers.init(op_state);
+ if (ret < 0)
+ return ret;
+
+ ret = caps.init(op_state);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+int RGWUser::update(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg,
+ optional_yield y)
+{
+ int ret;
+ std::string subprocess_msg;
+ rgw::sal::User* user = op_state.get_user();
+
+ if (!driver) {
+ set_err_msg(err_msg, "couldn't initialize storage");
+ return -EINVAL;
+ }
+
+ // if op_state.op_access_keys is not empty most recent keys have been fetched from master zone
+ if(!op_state.op_access_keys.empty()) {
+ auto user_access_keys = op_state.get_access_keys();
+ *(user_access_keys) = op_state.op_access_keys;
+ }
+
+ RGWUserInfo *pold_info = (is_populated() ? &old_info : nullptr);
+
+ ret = user->store_user(dpp, y, false, pold_info);
+ op_state.objv = user->get_version_tracker();
+ op_state.set_user_version_tracker(user->get_version_tracker());
+
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to store user info");
+ return ret;
+ }
+
+ old_info = user->get_info();
+ set_populated();
+
+ return 0;
+}
+
// Validate an admin operation before executing it: reject the anonymous
// user, reject a uid that conflicts with the already-populated state,
// validate the tenant name, and default the key type to S3 when unset.
int RGWUser::check_op(RGWUserAdminOpState& op_state, std::string *err_msg)
{
  int ret = 0;
  const rgw_user& uid = op_state.get_user_id();

  if (uid.compare(RGW_USER_ANON_ID) == 0) {
    set_err_msg(err_msg, "unable to perform operations on the anonymous user");
    return -EINVAL;
  }

  // once populated, this object is bound to one user; a mismatched uid
  // indicates the caller mixed up op_state objects
  if (is_populated() && user_id.compare(uid) != 0) {
    set_err_msg(err_msg, "user id mismatch, operation id: " + uid.to_str()
            + " does not match: " + user_id.to_str());

    return -EINVAL;
  }

  ret = rgw_validate_tenant_name(uid.tenant);
  if (ret) {
    set_err_msg(err_msg,
		"invalid tenant only alphanumeric and _ characters are allowed");
    return ret;
  }

  //set key type when it not set or set by context
  if ((op_state.get_key_type() < 0) || op_state.key_type_setbycontext) {
    op_state.set_key_type(KEY_TYPE_S3);
    op_state.key_type_setbycontext = true;
  }

  return 0;
}
+
+// update swift_keys with new user id
+static void rename_swift_keys(const rgw_user& user,
+ std::map<std::string, RGWAccessKey>& keys)
+{
+ std::string user_id;
+ user.to_str(user_id);
+
+ auto modify_keys = std::move(keys);
+ for ([[maybe_unused]] auto& [k, key] : modify_keys) {
+ std::string id = user_id + ":" + key.subuser;
+ key.id = id;
+ keys[id] = std::move(key);
+ }
+}
+
// Rename a user to op_state's new uid. The sequence is: write a stub user
// under the new uid (exclusively, unless overwrite was requested), re-ACL
// and chown every bucket (and its objects) to the new uid, then store the
// full user info under the new uid via update(). Tenancy cannot change.
int RGWUser::execute_rename(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, optional_yield y)
{
  int ret;
  bool populated = op_state.is_populated();

  if (!op_state.has_existing_user() && !populated) {
    set_err_msg(err_msg, "user not found");
    return -ENOENT;
  }

  // lazily load the source user if the caller didn't
  if (!populated) {
    ret = init(dpp, op_state, y);
    if (ret < 0) {
      set_err_msg(err_msg, "unable to retrieve user info");
      return ret;
    }
  }

  std::unique_ptr<rgw::sal::User> old_user = driver->get_user(op_state.get_user_info().user_id);
  std::unique_ptr<rgw::sal::User> new_user = driver->get_user(op_state.get_new_uid());
  // renaming across tenants is not supported
  if (old_user->get_tenant() != new_user->get_tenant()) {
    set_err_msg(err_msg, "users have to be under the same tenant namespace "
                + old_user->get_tenant() + " != " + new_user->get_tenant());
    return -EINVAL;
  }

  // create a stub user and write only the uid index and buckets object
  std::unique_ptr<rgw::sal::User> user;
  user = driver->get_user(new_user->get_id());

  const bool exclusive = !op_state.get_overwrite_new_user(); // overwrite if requested

  ret = user->store_user(dpp, y, exclusive);
  if (ret == -EEXIST) {
    set_err_msg(err_msg, "user name given by --new-uid already exists");
    return ret;
  }
  if (ret < 0) {
    set_err_msg(err_msg, "unable to store new user info");
    return ret;
  }

  // default ACL owned by the new uid but keeping the old display name
  RGWAccessControlPolicy policy_instance;
  policy_instance.create_default(new_user->get_id(), old_user->get_display_name());

  //unlink and link buckets to new user
  string marker;
  CephContext *cct = driver->ctx();
  size_t max_buckets = cct->_conf->rgw_list_buckets_max_chunk;
  rgw::sal::BucketList buckets;

  // paginate over all of the old user's buckets
  do {
    ret = old_user->list_buckets(dpp, marker, "", max_buckets, false, buckets, y);
    if (ret < 0) {
      set_err_msg(err_msg, "unable to list user buckets");
      return ret;
    }

    auto& m = buckets.get_buckets();

    for (auto it = m.begin(); it != m.end(); ++it) {
      auto& bucket = it->second;
      marker = it->first;

      ret = bucket->load_bucket(dpp, y);
      if (ret < 0) {
        set_err_msg(err_msg, "failed to fetch bucket info for bucket=" + bucket->get_name());
        return ret;
      }

      ret = bucket->set_acl(dpp, policy_instance, y);
      if (ret < 0) {
        set_err_msg(err_msg, "failed to set acl on bucket " + bucket->get_name());
        return ret;
      }

      // re-owns the bucket and every object inside it
      ret = rgw_chown_bucket_and_objects(driver, bucket.get(), new_user.get(),
                                         std::string(), nullptr, dpp, y);
      if (ret < 0) {
        set_err_msg(err_msg, "failed to run bucket chown" + cpp_strerror(-ret));
        return ret;
      }
    }

  } while (buckets.is_truncated());

  // update the 'stub user' with all of the other fields and rewrite all of the
  // associated index objects
  RGWUserInfo& user_info = op_state.get_user_info();
  user_info.user_id = new_user->get_id();
  op_state.objv = user->get_version_tracker();
  op_state.set_user_version_tracker(user->get_version_tracker());

  // swift key ids embed the uid, so they must be rewritten too
  rename_swift_keys(new_user->get_id(), user_info.swift_keys);

  return update(dpp, op_state, err_msg, y);
}
+
// Build a fresh RGWUserInfo from op_state (quotas, caps, keys, placement,
// ...), seed op_state with it, then persist via update(). Key/cap helper
// work is deferred so a single store writes the final record.
int RGWUser::execute_add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg,
			 optional_yield y)
{
  const rgw_user& uid = op_state.get_user_id();
  std::string user_email = op_state.get_user_email();
  std::string display_name = op_state.get_display_name();

  // set the user info
  RGWUserInfo user_info;
  user_id = uid;
  user_info.user_id = user_id;
  user_info.display_name = display_name;
  user_info.type = TYPE_RGW;

  if (!user_email.empty())
    user_info.user_email = user_email;

  CephContext *cct = driver->ctx();
  // fall back to the configured default max-buckets when not specified
  if (op_state.max_buckets_specified) {
    user_info.max_buckets = op_state.get_max_buckets();
  } else {
    user_info.max_buckets =
      cct->_conf.get_val<int64_t>("rgw_user_max_buckets");
  }

  user_info.suspended = op_state.get_suspension_status();
  user_info.admin = op_state.admin;
  user_info.system = op_state.system;

  if (op_state.op_mask_specified)
    user_info.op_mask = op_state.get_op_mask();

  if (op_state.has_bucket_quota()) {
    user_info.quota.bucket_quota = op_state.get_bucket_quota();
  } else {
    rgw_apply_default_bucket_quota(user_info.quota.bucket_quota, cct->_conf);
  }

  if (op_state.temp_url_key_specified) {
    map<int, string>::iterator iter;
    for (iter = op_state.temp_url_keys.begin();
         iter != op_state.temp_url_keys.end(); ++iter) {
      user_info.temp_url_keys[iter->first] = iter->second;
    }
  }

  if (op_state.has_user_quota()) {
    user_info.quota.user_quota = op_state.get_user_quota();
  } else {
    rgw_apply_default_user_quota(user_info.quota.user_quota, cct->_conf);
  }

  if (op_state.default_placement_specified) {
    user_info.default_placement = op_state.default_placement;
  }

  if (op_state.placement_tags_specified) {
    user_info.placement_tags = op_state.placement_tags;
  }

  // update the request
  op_state.set_user_info(user_info);
  op_state.set_populated();

  // update the helper objects
  int ret = init_members(op_state);
  if (ret < 0) {
    set_err_msg(err_msg, "unable to initialize user");
    return ret;
  }

  // see if we need to add an access key
  std::string subprocess_msg;
  bool defer_user_update = true;  // batch all changes into the final update()
  if (op_state.has_key_op()) {
    ret = keys.add(dpp, op_state, &subprocess_msg, defer_user_update, y);
    if (ret < 0) {
      set_err_msg(err_msg, "unable to create access key, " + subprocess_msg);
      return ret;
    }
  }

  // see if we need to add some caps
  if (op_state.has_caps_op()) {
    ret = caps.add(dpp, op_state, &subprocess_msg, defer_user_update, y);
    if (ret < 0) {
      set_err_msg(err_msg, "unable to add user capabilities, " + subprocess_msg);
      return ret;
    }
  }

  ret = update(dpp, op_state, err_msg, y);
  if (ret < 0)
    return ret;

  return 0;
}
+
+int RGWUser::add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg)
+{
+ std::string subprocess_msg;
+ int ret = user_add_helper(op_state, &subprocess_msg);
+ if (ret != 0) {
+ set_err_msg(err_msg, "unable to parse parameters, " + subprocess_msg);
+ return ret;
+ }
+
+ ret = check_op(op_state, &subprocess_msg);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to parse parameters, " + subprocess_msg);
+ return ret;
+ }
+
+ ret = execute_add(dpp, op_state, &subprocess_msg, y);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to create user, " + subprocess_msg);
+ return ret;
+ }
+
+ return 0;
+}
+
+int RGWUser::rename(RGWUserAdminOpState& op_state, optional_yield y, const DoutPrefixProvider *dpp, std::string *err_msg)
+{
+ std::string subprocess_msg;
+ int ret;
+
+ ret = check_op(op_state, &subprocess_msg);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to parse parameters, " + subprocess_msg);
+ return ret;
+ }
+
+ ret = execute_rename(dpp, op_state, &subprocess_msg, y);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to rename user, " + subprocess_msg);
+ return ret;
+ }
+
+ return 0;
+}
+
// Remove a user. If the user still owns buckets the call fails unless
// purge-data was requested, in which case every bucket (and its contents)
// is deleted first; finally the user record itself is removed.
int RGWUser::execute_remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, optional_yield y)
{
  int ret;

  bool purge_data = op_state.will_purge_data();
  rgw::sal::User* user = op_state.get_user();

  if (!op_state.has_existing_user()) {
    set_err_msg(err_msg, "user does not exist");
    return -ENOENT;
  }

  rgw::sal::BucketList buckets;
  string marker;
  CephContext *cct = driver->ctx();
  size_t max_buckets = cct->_conf->rgw_list_buckets_max_chunk;
  // paginate through the user's buckets, deleting as we go
  do {
    ret = user->list_buckets(dpp, marker, string(), max_buckets, false, buckets, y);
    if (ret < 0) {
      set_err_msg(err_msg, "unable to read user bucket info");
      return ret;
    }

    auto& m = buckets.get_buckets();
    // refuse to delete a user that still owns buckets unless purging
    if (!m.empty() && !purge_data) {
      set_err_msg(err_msg, "must specify purge data to remove user with buckets");
      return -EEXIST; // change to code that maps to 409: conflict
    }

    for (auto it = m.begin(); it != m.end(); ++it) {
      ret = it->second->remove_bucket(dpp, true, false, nullptr, y);
      if (ret < 0) {
        set_err_msg(err_msg, "unable to delete user data");
        return ret;
      }

      marker = it->first;
    }

  } while (buckets.is_truncated());

  ret = user->remove_user(dpp, y);
  if (ret < 0) {
    set_err_msg(err_msg, "unable to remove user from RADOS");
    return ret;
  }

  // the cached state no longer corresponds to a stored user
  op_state.clear_populated();
  clear_populated();

  return 0;
}
+
+int RGWUser::remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg)
+{
+ std::string subprocess_msg;
+ int ret;
+
+ ret = check_op(op_state, &subprocess_msg);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to parse parameters, " + subprocess_msg);
+ return ret;
+ }
+
+ ret = execute_remove(dpp, op_state, &subprocess_msg, y);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to remove user, " + subprocess_msg);
+ return ret;
+ }
+
+ return 0;
+}
+
// Apply the modifications described by op_state to an existing user:
// e-mail (with duplicate check), display name, quotas, caps, suspension
// (which also toggles every bucket's enabled flag), key operations, and
// assorted flags; finally persists everything via update().
int RGWUser::execute_modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, optional_yield y)
{
  bool populated = op_state.is_populated();
  int ret = 0;
  std::string subprocess_msg;
  std::string op_email = op_state.get_user_email();
  std::string display_name = op_state.get_display_name();

  RGWUserInfo user_info;
  std::unique_ptr<rgw::sal::User> duplicate_check;

  // ensure that the user info has been populated or is populate-able
  if (!op_state.has_existing_user() && !populated) {
    set_err_msg(err_msg, "user not found");
    return -ENOENT;
  }

  // if the user hasn't already been populated...attempt to
  if (!populated) {
    ret = init(dpp, op_state, y);
    if (ret < 0) {
      set_err_msg(err_msg, "unable to retrieve user info");
      return ret;
    }
  }

  // ensure that we can modify the user's attributes
  if (user_id.compare(RGW_USER_ANON_ID) == 0) {
    set_err_msg(err_msg, "unable to modify anonymous user's info");
    return -EACCES;
  }

  // start from the stored info and overlay the requested changes
  user_info = old_info;

  std::string old_email = old_info.user_email;
  if (!op_email.empty()) {
    // make sure we are not adding a duplicate email
    if (old_email != op_email) {
      ret = driver->get_user_by_email(dpp, op_email, y, &duplicate_check);
      if (ret >= 0 && duplicate_check->get_id().compare(user_id) != 0) {
        set_err_msg(err_msg, "cannot add duplicate email");
        return -ERR_EMAIL_EXIST;
      }
    }
    user_info.user_email = op_email;
  } else if (op_email.empty() && op_state.user_email_specified) {
    // an explicitly empty e-mail means "remove the e-mail index"
    ldpp_dout(dpp, 10) << "removing email index: " << user_info.user_email << dendl;
    /* will be physically removed later when calling update() */
    user_info.user_email.clear();
  }

  // update the remaining user info
  if (!display_name.empty())
    user_info.display_name = display_name;

  if (op_state.max_buckets_specified)
    user_info.max_buckets = op_state.get_max_buckets();

  if (op_state.admin_specified)
    user_info.admin = op_state.admin;

  if (op_state.system_specified)
    user_info.system = op_state.system;

  if (op_state.temp_url_key_specified) {
    map<int, string>::iterator iter;
    for (iter = op_state.temp_url_keys.begin();
         iter != op_state.temp_url_keys.end(); ++iter) {
      user_info.temp_url_keys[iter->first] = iter->second;
    }
  }

  if (op_state.op_mask_specified)
    user_info.op_mask = op_state.get_op_mask();

  if (op_state.has_bucket_quota())
    user_info.quota.bucket_quota = op_state.get_bucket_quota();

  if (op_state.has_user_quota())
    user_info.quota.user_quota = op_state.get_user_quota();

  if (op_state.has_suspension_op()) {
    __u8 suspended = op_state.get_suspension_status();
    user_info.suspended = suspended;

    rgw::sal::BucketList buckets;

    if (user_id.empty()) {
      set_err_msg(err_msg, "empty user id passed...aborting");
      return -EINVAL;
    }

    // suspension also disables (or re-enables) every bucket the user owns
    string marker;
    CephContext *cct = driver->ctx();
    size_t max_buckets = cct->_conf->rgw_list_buckets_max_chunk;
    std::unique_ptr<rgw::sal::User> user = driver->get_user(user_id);
    do {
      ret = user->list_buckets(dpp, marker, string(), max_buckets, false, buckets, y);
      if (ret < 0) {
        set_err_msg(err_msg, "could not get buckets for uid:  " + user_id.to_str());
        return ret;
      }

      auto& m = buckets.get_buckets();

      vector<rgw_bucket> bucket_names;
      for (auto iter = m.begin(); iter != m.end(); ++iter) {
        auto& bucket = iter->second;
        bucket_names.push_back(bucket->get_key());

        marker = iter->first;
      }

      ret = driver->set_buckets_enabled(dpp, bucket_names, !suspended);
      if (ret < 0) {
        set_err_msg(err_msg, "failed to modify bucket");
        return ret;
      }

    } while (buckets.is_truncated());
  }

  if (op_state.mfa_ids_specified) {
    user_info.mfa_ids = op_state.mfa_ids;
  }

  if (op_state.default_placement_specified) {
    user_info.default_placement = op_state.default_placement;
  }

  if (op_state.placement_tags_specified) {
    user_info.placement_tags = op_state.placement_tags;
  }

  op_state.set_user_info(user_info);

  // if we're supposed to modify keys, do so
  if (op_state.has_key_op()) {
    ret = keys.add(dpp, op_state, &subprocess_msg, true, y);
    if (ret < 0) {
      set_err_msg(err_msg, "unable to create or modify keys, " + subprocess_msg);
      return ret;
    }
  }

  ret = update(dpp, op_state, err_msg, y);
  if (ret < 0)
    return ret;

  return 0;
}
+
+int RGWUser::modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg)
+{
+ std::string subprocess_msg;
+ int ret;
+
+ ret = check_op(op_state, &subprocess_msg);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to parse parameters, " + subprocess_msg);
+ return ret;
+ }
+
+ ret = execute_modify(dpp, op_state, &subprocess_msg, y);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to modify user, " + subprocess_msg);
+ return ret;
+ }
+
+ return 0;
+}
+
+int RGWUser::info(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, RGWUserInfo& fetched_info,
+ optional_yield y, std::string *err_msg)
+{
+ int ret = init(dpp, op_state, y);
+ if (ret < 0) {
+ set_err_msg(err_msg, "unable to fetch user info");
+ return ret;
+ }
+
+ fetched_info = op_state.get_user_info();
+
+ return 0;
+}
+
+int RGWUser::info(RGWUserInfo& fetched_info, std::string *err_msg)
+{
+ if (!is_populated()) {
+ set_err_msg(err_msg, "no user info saved");
+ return -EINVAL;
+ }
+
+ fetched_info = old_info;
+
+ return 0;
+}
+
+int RGWUser::list(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher)
+{
+ Formatter *formatter = flusher.get_formatter();
+ void *handle = nullptr;
+ std::string metadata_key = "user";
+ if (op_state.max_entries > 1000) {
+ op_state.max_entries = 1000;
+ }
+
+ int ret = driver->meta_list_keys_init(dpp, metadata_key, op_state.marker, &handle);
+ if (ret < 0) {
+ return ret;
+ }
+
+ bool truncated = false;
+ uint64_t count = 0;
+ uint64_t left = 0;
+ flusher.start(0);
+
+ // open the result object section
+ formatter->open_object_section("result");
+
+ // open the user id list array section
+ formatter->open_array_section("keys");
+ do {
+ std::list<std::string> keys;
+ left = op_state.max_entries - count;
+ ret = driver->meta_list_keys_next(dpp, handle, left, keys, &truncated);
+ if (ret < 0 && ret != -ENOENT) {
+ return ret;
+ } if (ret != -ENOENT) {
+ for (std::list<std::string>::iterator iter = keys.begin(); iter != keys.end(); ++iter) {
+ formatter->dump_string("key", *iter);
+ ++count;
+ }
+ }
+ } while (truncated && left > 0);
+ // close user id list section
+ formatter->close_section();
+
+ formatter->dump_bool("truncated", truncated);
+ formatter->dump_int("count", count);
+ if (truncated) {
+ formatter->dump_string("marker", driver->meta_get_marker(handle));
+ }
+
+ // close result object section
+ formatter->close_section();
+
+ driver->meta_list_keys_complete(handle);
+
+ flusher.flush();
+ return 0;
+}
+
+int RGWUserAdminOp_User::list(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, RGWUserAdminOpState& op_state,
+ RGWFormatterFlusher& flusher)
+{
+ RGWUser user;
+
+ int ret = user.init_storage(driver);
+ if (ret < 0)
+ return ret;
+
+ ret = user.list(dpp, op_state, flusher);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
// Admin-op wrapper: fetch a user's info, optionally syncing and/or reading
// storage stats, and dump the result through the flusher.
int RGWUserAdminOp_User::info(const DoutPrefixProvider *dpp,
			      rgw::sal::Driver* driver, RGWUserAdminOpState& op_state,
			      RGWFormatterFlusher& flusher,
			      optional_yield y)
{
  RGWUserInfo info;
  RGWUser user;
  std::unique_ptr<rgw::sal::User> ruser;

  int ret = user.init(dpp, driver, op_state, y);
  if (ret < 0)
    return ret;

  if (!op_state.has_existing_user())
    return -ERR_NO_SUCH_USER;

  Formatter *formatter = flusher.get_formatter();

  ret = user.info(info, NULL);
  if (ret < 0)
    return ret;

  ruser = driver->get_user(info.user_id);

  // optionally force a stats sync before reading
  if (op_state.sync_stats) {
    ret = rgw_user_sync_all_stats(dpp, driver, ruser.get(), y);
    if (ret < 0) {
      return ret;
    }
  }

  RGWStorageStats stats;
  RGWStorageStats *arg_stats = NULL;
  if (op_state.fetch_stats) {
    // -ENOENT just means no stats recorded yet; not an error here
    int ret = ruser->read_stats(dpp, y, &stats);
    if (ret < 0 && ret != -ENOENT) {
      return ret;
    }

    arg_stats = &stats;
  }

  if (formatter) {
    flusher.start(0);

    dump_user_info(formatter, info, arg_stats);
    flusher.flush();
  }

  return 0;
}
+
// Admin-op wrapper: create a user from op_state and dump the resulting
// info; maps -EEXIST to the RGW-specific -ERR_USER_EXIST.
int RGWUserAdminOp_User::create(const DoutPrefixProvider *dpp,
				rgw::sal::Driver* driver,
				RGWUserAdminOpState& op_state,
				RGWFormatterFlusher& flusher, optional_yield y)
{
  RGWUserInfo info;
  RGWUser user;
  int ret = user.init(dpp, driver, op_state, y);
  if (ret < 0)
    return ret;

  Formatter *formatter = flusher.get_formatter();

  ret = user.add(dpp, op_state, y, NULL);
  if (ret < 0) {
    if (ret == -EEXIST)
      ret = -ERR_USER_EXIST;
    return ret;
  }

  ret = user.info(info, NULL);
  if (ret < 0)
    return ret;

  if (formatter) {
    flusher.start(0);

    dump_user_info(formatter, info);
    flusher.flush();
  }

  return 0;
}
+
// Admin-op wrapper: modify a user per op_state and dump the resulting
// info; maps -ENOENT to the RGW-specific -ERR_NO_SUCH_USER.
int RGWUserAdminOp_User::modify(const DoutPrefixProvider *dpp,
				rgw::sal::Driver* driver,
				RGWUserAdminOpState& op_state,
				RGWFormatterFlusher& flusher, optional_yield y)
{
  RGWUserInfo info;
  RGWUser user;
  int ret = user.init(dpp, driver, op_state, y);
  if (ret < 0)
    return ret;
  Formatter *formatter = flusher.get_formatter();

  ret = user.modify(dpp, op_state, y, NULL);
  if (ret < 0) {
    if (ret == -ENOENT)
      ret = -ERR_NO_SUCH_USER;
    return ret;
  }

  ret = user.info(info, NULL);
  if (ret < 0)
    return ret;

  if (formatter) {
    flusher.start(0);

    dump_user_info(formatter, info);
    flusher.flush();
  }

  return 0;
}
+
// Admin-op wrapper: remove a user; maps -ENOENT to -ERR_NO_SUCH_USER.
// Nothing is dumped to the flusher on success.
int RGWUserAdminOp_User::remove(const DoutPrefixProvider *dpp,
				rgw::sal::Driver* driver, RGWUserAdminOpState& op_state,
				RGWFormatterFlusher& flusher, optional_yield y)
{
  RGWUserInfo info;
  RGWUser user;
  int ret = user.init(dpp, driver, op_state, y);
  if (ret < 0)
    return ret;


  ret = user.remove(dpp, op_state, y, NULL);

  if (ret == -ENOENT)
    ret = -ERR_NO_SUCH_USER;
  return ret;
}
+
// Admin-op wrapper: create a subuser for an existing user and dump the
// user's subuser list afterwards.
int RGWUserAdminOp_Subuser::create(const DoutPrefixProvider *dpp,
				   rgw::sal::Driver* driver,
				   RGWUserAdminOpState& op_state,
				   RGWFormatterFlusher& flusher,
				   optional_yield y)
{
  RGWUserInfo info;
  RGWUser user;
  int ret = user.init(dpp, driver, op_state, y);
  if (ret < 0)
    return ret;

  if (!op_state.has_existing_user())
    return -ERR_NO_SUCH_USER;

  Formatter *formatter = flusher.get_formatter();

  ret = user.subusers.add(dpp, op_state, y, NULL);
  if (ret < 0)
    return ret;

  ret = user.info(info, NULL);
  if (ret < 0)
    return ret;

  if (formatter) {
    flusher.start(0);

    dump_subusers_info(formatter, info);
    flusher.flush();
  }

  return 0;
}
+
// Admin-op wrapper: modify a subuser of an existing user and dump the
// user's subuser list afterwards.
int RGWUserAdminOp_Subuser::modify(const DoutPrefixProvider *dpp,
				   rgw::sal::Driver* driver, RGWUserAdminOpState& op_state,
				   RGWFormatterFlusher& flusher, optional_yield y)
{
  RGWUserInfo info;
  RGWUser user;
  int ret = user.init(dpp, driver, op_state, y);
  if (ret < 0)
    return ret;

  if (!op_state.has_existing_user())
    return -ERR_NO_SUCH_USER;

  Formatter *formatter = flusher.get_formatter();

  ret = user.subusers.modify(dpp, op_state, y, NULL);
  if (ret < 0)
    return ret;

  ret = user.info(info, NULL);
  if (ret < 0)
    return ret;

  if (formatter) {
    flusher.start(0);

    dump_subusers_info(formatter, info);
    flusher.flush();
  }

  return 0;
}
+
// Admin-op wrapper: remove a subuser from an existing user. Nothing is
// dumped to the flusher on success.
int RGWUserAdminOp_Subuser::remove(const DoutPrefixProvider *dpp,
				   rgw::sal::Driver* driver,
				   RGWUserAdminOpState& op_state,
				   RGWFormatterFlusher& flusher,
				   optional_yield y)
{
  RGWUserInfo info;
  RGWUser user;
  int ret = user.init(dpp, driver, op_state, y);
  if (ret < 0)
    return ret;


  if (!op_state.has_existing_user())
    return -ERR_NO_SUCH_USER;

  ret = user.subusers.remove(dpp, op_state, y, NULL);
  if (ret < 0)
    return ret;

  return 0;
}
+
// Admin-op wrapper: add an access key (S3 or swift, per op_state's key
// type) to an existing user and dump the matching key list.
int RGWUserAdminOp_Key::create(const DoutPrefixProvider *dpp,
			       rgw::sal::Driver* driver, RGWUserAdminOpState& op_state,
			       RGWFormatterFlusher& flusher,
			       optional_yield y)
{
  RGWUserInfo info;
  RGWUser user;
  int ret = user.init(dpp, driver, op_state, y);
  if (ret < 0)
    return ret;

  if (!op_state.has_existing_user())
    return -ERR_NO_SUCH_USER;

  Formatter *formatter = flusher.get_formatter();

  ret = user.keys.add(dpp, op_state, y, NULL);
  if (ret < 0)
    return ret;

  ret = user.info(info, NULL);
  if (ret < 0)
    return ret;

  if (formatter) {
    flusher.start(0);

    // dump whichever key family was operated on
    int key_type = op_state.get_key_type();

    if (key_type == KEY_TYPE_SWIFT)
      dump_swift_keys_info(formatter, info);

    else if (key_type == KEY_TYPE_S3)
      dump_access_keys_info(formatter, info);

    flusher.flush();
  }

  return 0;
}
+
// Admin-op wrapper: remove an access key from an existing user. Nothing is
// dumped to the flusher on success.
int RGWUserAdminOp_Key::remove(const DoutPrefixProvider *dpp,
			       rgw::sal::Driver* driver,
			       RGWUserAdminOpState& op_state,
			       RGWFormatterFlusher& flusher,
			       optional_yield y)
{
  RGWUserInfo info;
  RGWUser user;
  int ret = user.init(dpp, driver, op_state, y);
  if (ret < 0)
    return ret;

  if (!op_state.has_existing_user())
    return -ERR_NO_SUCH_USER;


  ret = user.keys.remove(dpp, op_state, y, NULL);
  if (ret < 0)
    return ret;

  return 0;
}
+
// Admin-op wrapper: add caps to an existing user and dump the resulting
// cap set.
int RGWUserAdminOp_Caps::add(const DoutPrefixProvider *dpp,
			     rgw::sal::Driver* driver,
			     RGWUserAdminOpState& op_state,
			     RGWFormatterFlusher& flusher, optional_yield y)
{
  RGWUserInfo info;
  RGWUser user;
  int ret = user.init(dpp, driver, op_state, y);
  if (ret < 0)
    return ret;

  if (!op_state.has_existing_user())
    return -ERR_NO_SUCH_USER;

  Formatter *formatter = flusher.get_formatter();

  ret = user.caps.add(dpp, op_state, y, NULL);
  if (ret < 0)
    return ret;

  ret = user.info(info, NULL);
  if (ret < 0)
    return ret;

  if (formatter) {
    flusher.start(0);

    info.caps.dump(formatter);
    flusher.flush();
  }

  return 0;
}
+
+
// Admin-op wrapper: remove caps from an existing user and dump the
// remaining cap set.
int RGWUserAdminOp_Caps::remove(const DoutPrefixProvider *dpp,
				rgw::sal::Driver* driver,
				RGWUserAdminOpState& op_state,
				RGWFormatterFlusher& flusher, optional_yield y)
{
  RGWUserInfo info;
  RGWUser user;
  int ret = user.init(dpp, driver, op_state, y);
  if (ret < 0)
    return ret;

  if (!op_state.has_existing_user())
    return -ERR_NO_SUCH_USER;

  Formatter *formatter = flusher.get_formatter();

  ret = user.caps.remove(dpp, op_state, y, NULL);
  if (ret < 0)
    return ret;

  ret = user.info(info, NULL);
  if (ret < 0)
    return ret;

  if (formatter) {
    flusher.start(0);

    info.caps.dump(formatter);
    flusher.flush();
  }

  return 0;
}
+
// Metadata handler for the "user" metadata section: implements get/put/
// remove of user records through the RGWSI_User service, for use by the
// generic metadata (and multisite sync) machinery.
class RGWUserMetadataHandler : public RGWMetadataHandler_GenericMetaBE {
public:
  struct Svc {
    RGWSI_User *user{nullptr};
  } svc;

  RGWUserMetadataHandler(RGWSI_User *user_svc) {
    base_init(user_svc->ctx(), user_svc->get_be_handler());
    svc.user = user_svc;
  }

  ~RGWUserMetadataHandler() {}

  string get_type() override { return "user"; }

  // Read a user's full info + attrs and wrap it in an RGWUserMetadataObject
  // (caller owns the returned object).
  int do_get(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWMetadataObject **obj, optional_yield y, const DoutPrefixProvider *dpp) override {
    RGWUserCompleteInfo uci;
    RGWObjVersionTracker objv_tracker;
    real_time mtime;

    rgw_user user = RGWSI_User::user_from_meta_key(entry);

    int ret = svc.user->read_user_info(op->ctx(), user, &uci.info, &objv_tracker,
                                       &mtime, nullptr, &uci.attrs,
                                       y, dpp);
    if (ret < 0) {
      return ret;
    }

    RGWUserMetadataObject *mdo = new RGWUserMetadataObject(uci, objv_tracker.read_version, mtime);
    *obj = mdo;

    return 0;
  }

  // Decode a metadata object from JSON; returns nullptr on decode failure.
  RGWMetadataObject *get_meta_obj(JSONObj *jo, const obj_version& objv, const ceph::real_time& mtime) override {
    RGWUserCompleteInfo uci;

    try {
      decode_json_obj(uci, jo);
    } catch (JSONDecoder::err& e) {
      return nullptr;
    }

    return new RGWUserMetadataObject(uci, objv, mtime);
  }

  // Defined below; delegates to RGWMetadataHandlerPut_User.
  int do_put(RGWSI_MetaBackend_Handler::Op *op, string& entry,
             RGWMetadataObject *obj,
             RGWObjVersionTracker& objv_tracker,
             optional_yield y, const DoutPrefixProvider *dpp,
             RGWMDLogSyncType type, bool from_remote_zone) override;

  // Look the user up first (needed so index objects can be cleaned up),
  // then remove the record.
  int do_remove(RGWSI_MetaBackend_Handler::Op *op, string& entry, RGWObjVersionTracker& objv_tracker,
                optional_yield y, const DoutPrefixProvider *dpp) override {
    RGWUserInfo info;

    rgw_user user = RGWSI_User::user_from_meta_key(entry);

    int ret = svc.user->read_user_info(op->ctx(), user, &info, nullptr,
                                       nullptr, nullptr, nullptr,
                                       y, dpp);
    if (ret < 0) {
      return ret;
    }

    return svc.user->remove_user_info(op->ctx(), info, &objv_tracker,
                                      y, dpp);
  }
};
+
// Put-operation helper for user metadata: carries the decoded
// RGWUserMetadataObject through the generic SObj put flow and performs the
// actual store in put_checked().
class RGWMetadataHandlerPut_User : public RGWMetadataHandlerPut_SObj
{
  RGWUserMetadataHandler *uhandler;
  RGWUserMetadataObject *uobj;  // same object as `obj`, downcast once here
public:
  RGWMetadataHandlerPut_User(RGWUserMetadataHandler *_handler,
                             RGWSI_MetaBackend_Handler::Op *op, string& entry,
                             RGWMetadataObject *obj, RGWObjVersionTracker& objv_tracker,
                             optional_yield y,
                             RGWMDLogSyncType type, bool from_remote_zone) : RGWMetadataHandlerPut_SObj(_handler, op, entry, obj, objv_tracker, y, type, from_remote_zone),
                                                                            uhandler(_handler) {
    uobj = static_cast<RGWUserMetadataObject *>(obj);
  }

  int put_checked(const DoutPrefixProvider *dpp) override;
};
+
// Store a user metadata object by routing it through the generic put flow,
// which handles sync-type checks and version reconciliation.
int RGWUserMetadataHandler::do_put(RGWSI_MetaBackend_Handler::Op *op, string& entry,
                                   RGWMetadataObject *obj,
                                   RGWObjVersionTracker& objv_tracker,
                                   optional_yield y, const DoutPrefixProvider *dpp,
                                   RGWMDLogSyncType type, bool from_remote_zone)
{
  RGWMetadataHandlerPut_User put_op(this, op, entry, obj, objv_tracker, y, type, from_remote_zone);
  return do_put_operate(&put_op, dpp);
}
+
// Perform the actual user-info store once the generic put flow has
// validated the operation; passes the previous info (if any) so stale
// index objects can be reconciled. Returns STATUS_APPLIED on success.
int RGWMetadataHandlerPut_User::put_checked(const DoutPrefixProvider *dpp)
{
  RGWUserMetadataObject *orig_obj = static_cast<RGWUserMetadataObject *>(old_obj);
  RGWUserCompleteInfo& uci = uobj->get_uci();

  map<string, bufferlist> *pattrs{nullptr};
  if (uci.has_attrs) {
    pattrs = &uci.attrs;
  }

  // old info is only available when an object already existed
  RGWUserInfo *pold_info = (orig_obj ? &orig_obj->get_uci().info : nullptr);

  auto mtime = obj->get_mtime();

  int ret = uhandler->svc.user->store_user_info(op->ctx(), uci.info, pold_info,
                                                &objv_tracker, mtime,
                                                false, pattrs, y, dpp);
  if (ret < 0) {
    return ret;
  }

  return STATUS_APPLIED;
}
+
+
// Wire the user controller to its zone/user services and cache the
// metadata backend handler used by the call() helpers below.
RGWUserCtl::RGWUserCtl(RGWSI_Zone *zone_svc,
                       RGWSI_User *user_svc,
                       RGWUserMetadataHandler *_umhandler) : umhandler(_umhandler) {
  svc.zone = zone_svc;
  svc.user = user_svc;
  be_handler = umhandler->get_be_handler();
}
+
// Present a std::optional<T> as an always-valid pointer/reference: points
// at the optional's value when engaged, otherwise at a default-constructed
// T owned by this wrapper.
//
// NOTE: when the optional is engaged this aliases the caller's value, so
// the source optional must outlive this object.
template <class T>
class optional_default
{
  std::optional<T> def;  // holds the fallback value when the source is empty
  const T *p;
public:
  optional_default(const std::optional<T>& _o) {
    if (_o) {
      p = &(*_o);
    } else {
      // dropped the unused stored reference member; only the fallback and
      // the pointer are needed after construction
      def = T();
      p = &(*def);
    }
  }

  const T *operator->() const {
    return p;
  }

  const T& operator*() const {
    return *p;
  }
};
+
// Read a user's info by uid through the metadata backend handler;
// optional outputs (objv/mtime/cache/attrs) come from params.
int RGWUserCtl::get_info_by_uid(const DoutPrefixProvider *dpp,
                                const rgw_user& uid,
                                RGWUserInfo *info,
                                optional_yield y,
                                const GetParams& params)

{
  return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) {
    return svc.user->read_user_info(op->ctx(),
                                    uid,
                                    info,
                                    params.objv_tracker,
                                    params.mtime,
                                    params.cache_info,
                                    params.attrs,
                                    y,
                                    dpp);
  });
}
+
// Read a user's info via the e-mail index through the metadata backend
// handler.
int RGWUserCtl::get_info_by_email(const DoutPrefixProvider *dpp,
                                  const string& email,
                                  RGWUserInfo *info,
                                  optional_yield y,
                                  const GetParams& params)
{
  return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) {
    return svc.user->get_user_info_by_email(op->ctx(), email,
                                            info,
                                            params.objv_tracker,
                                            params.mtime,
                                            y,
                                            dpp);
  });
}
+
// Read a user's info via the swift-name index through the metadata backend
// handler.
int RGWUserCtl::get_info_by_swift(const DoutPrefixProvider *dpp,
                                  const string& swift_name,
                                  RGWUserInfo *info,
                                  optional_yield y,
                                  const GetParams& params)
{
  return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) {
    return svc.user->get_user_info_by_swift(op->ctx(), swift_name,
                                            info,
                                            params.objv_tracker,
                                            params.mtime,
                                            y,
                                            dpp);
  });
}
+
// Read a user's info via the access-key index through the metadata backend
// handler.
int RGWUserCtl::get_info_by_access_key(const DoutPrefixProvider *dpp,
                                       const string& access_key,
                                       RGWUserInfo *info,
                                       optional_yield y,
                                       const GetParams& params)
{
  return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) {
    return svc.user->get_user_info_by_access_key(op->ctx(), access_key,
                                                 info,
                                                 params.objv_tracker,
                                                 params.mtime,
                                                 y,
                                                 dpp);
  });
}
+
// Fetch only a user's xattrs (the full info read is discarded) by uid.
int RGWUserCtl::get_attrs_by_uid(const DoutPrefixProvider *dpp,
                                 const rgw_user& user_id,
                                 map<string, bufferlist> *pattrs,
                                 optional_yield y,
                                 RGWObjVersionTracker *objv_tracker)
{
  RGWUserInfo user_info;

  return get_info_by_uid(dpp, user_id, &user_info, y, RGWUserCtl::GetParams()
                         .set_attrs(pattrs)
                         .set_objv_tracker(objv_tracker));
}
+
// Store a user record (and its index objects) through the metadata backend
// handler; old info / objv / exclusivity come from params.
int RGWUserCtl::store_info(const DoutPrefixProvider *dpp,
                           const RGWUserInfo& info, optional_yield y,
                           const PutParams& params)
{
  string key = RGWSI_User::get_meta_key(info.user_id);

  return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) {
    return svc.user->store_user_info(op->ctx(), info,
                                     params.old_info,
                                     params.objv_tracker,
                                     params.mtime,
                                     params.exclusive,
                                     params.attrs,
                                     y,
                                     dpp);
  });
}
+
// Remove a user record (and its index objects) through the metadata
// backend handler.
int RGWUserCtl::remove_info(const DoutPrefixProvider *dpp,
                            const RGWUserInfo& info, optional_yield y,
                            const RemoveParams& params)

{
  string key = RGWSI_User::get_meta_key(info.user_id);

  return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) {
    return svc.user->remove_user_info(op->ctx(), info,
                                      params.objv_tracker,
                                      y, dpp);
  });
}
+
// List a user's buckets between marker and end_marker (max==0 falls back
// to default_max); optionally annotates each entry with bucket stats.
int RGWUserCtl::list_buckets(const DoutPrefixProvider *dpp,
                             const rgw_user& user,
                             const string& marker,
                             const string& end_marker,
                             uint64_t max,
                             bool need_stats,
                             RGWUserBuckets *buckets,
                             bool *is_truncated,
                             optional_yield y,
                             uint64_t default_max)
{
  if (!max) {
    max = default_max;
  }

  int ret = svc.user->list_buckets(dpp, user, marker, end_marker,
                                   max, buckets, is_truncated, y);
  if (ret < 0) {
    return ret;
  }
  if (need_stats) {
    map<string, RGWBucketEnt>& m = buckets->get_buckets();
    ret = ctl.bucket->read_buckets_stats(m, y, dpp);
    // -ENOENT from the stats read is tolerated (bucket may lack stats yet)
    if (ret < 0 && ret != -ENOENT) {
      ldpp_dout(dpp, 0) << "ERROR: could not get stats for buckets" << dendl;
      return ret;
    }
  }
  return 0;
}
+
// Read a user's aggregate storage stats (and optionally the last
// sync/update timestamps) through the metadata backend handler.
int RGWUserCtl::read_stats(const DoutPrefixProvider *dpp,
                           const rgw_user& user, RGWStorageStats *stats,
                           optional_yield y,
                           ceph::real_time *last_stats_sync,
                           ceph::real_time *last_stats_update)
{
  return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) {
    return svc.user->read_stats(dpp, op->ctx(), user, stats,
                                last_stats_sync, last_stats_update, y);
  });
}
+
// Factory for the user metadata handler; caller owns the returned object.
RGWMetadataHandler *RGWUserMetaHandlerAllocator::alloc(RGWSI_User *user_svc) {
  return new RGWUserMetadataHandler(user_svc);
}
+
// Dump this uid as a JSON field named "user".
void rgw_user::dump(Formatter *f) const
{
  ::encode_json("user", *this, f);
}
+
diff --git a/src/rgw/driver/rados/rgw_user.h b/src/rgw/driver/rados/rgw_user.h
new file mode 100644
index 000000000..ea05de806
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_user.h
@@ -0,0 +1,885 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <string>
+#include <boost/algorithm/string.hpp>
+#include "include/ceph_assert.h"
+
+#include "include/types.h"
+#include "rgw_common.h"
+#include "rgw_tools.h"
+
+#include "rgw_string.h"
+
+#include "common/Formatter.h"
+#include "rgw_formats.h"
+#include "rgw_metadata.h"
+#include "rgw_sal_fwd.h"
+
+#define RGW_USER_ANON_ID "anonymous"
+
+#define SECRET_KEY_LEN 40
+#define PUBLIC_ID_LEN 20
+#define RAND_SUBUSER_LEN 5
+
+#define XMLNS_AWS_S3 "http://s3.amazonaws.com/doc/2006-03-01/"
+
+class RGWUserCtl;
+class RGWBucketCtl;
+class RGWUserBuckets;
+
+class RGWGetUserStats_CB;
+
/**
 * A string wrapper that includes encode/decode functions
 * for easily accessing a UID in all forms
 */
struct RGWUID
{
  rgw_user user_id;
  // On-wire format is the plain string form of the user id.
  void encode(bufferlist& bl) const {
    std::string s;
    user_id.to_str(s);
    using ceph::encode;
    encode(s, bl);
  }
  // Decode the string and parse it back into the structured rgw_user.
  void decode(bufferlist::const_iterator& bl) {
    std::string s;
    using ceph::decode;
    decode(s, bl);
    user_id.from_str(s);
  }
};
WRITE_CLASS_ENCODER(RGWUID)
+
/** Entry for bucket metadata collection */
struct bucket_meta_entry {
  // NOTE(review): field meanings inferred from names (raw byte size,
  // size rounded up for accounting, creation time, object count) —
  // confirm against the code that populates buckets_usage_map.
  size_t size;
  size_t size_rounded;
  ceph::real_time creation_time;
  uint64_t count;
};
+
+extern int rgw_user_sync_all_stats(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, rgw::sal::User* user, optional_yield y);
+extern int rgw_user_get_all_buckets_stats(const DoutPrefixProvider *dpp,
+ rgw::sal::Driver* driver, rgw::sal::User* user,
+ std::map<std::string, bucket_meta_entry>& buckets_usage_map, optional_yield y);
+
+/**
+ * Get the anonymous (ie, unauthenticated) user info.
+ */
+extern void rgw_get_anon_user(RGWUserInfo& info);
+
+extern void rgw_perm_to_str(uint32_t mask, char *buf, int len);
+extern uint32_t rgw_str_to_perm(const char *str);
+
+extern int rgw_validate_tenant_name(const std::string& t);
+
// Which flavor of credential a key operation refers to.
enum ObjectKeyType {
  KEY_TYPE_SWIFT,
  KEY_TYPE_S3,
  KEY_TYPE_UNDEFINED
};
+
// Operations the access-key pool can perform on a key.
enum RGWKeyPoolOp {
  GENERATE_KEY,
  MODIFY_KEY
};
+
// Identifier kinds by which a user record can be referenced.
enum RGWUserId {
  RGW_USER_ID,
  RGW_SWIFT_USERNAME,
  RGW_USER_EMAIL,
  RGW_ACCESS_KEY,
};
+
/*
 * An RGWUser class along with supporting classes created
 * to support the creation of a RESTful administrative API
 */

/*
 * Mutable parameter/state bag for a single admin operation on a user.
 * The set_*() helpers record a value and flip the matching *_specified /
 * *_op flag so later stages can tell which fields the caller actually
 * provided versus which hold defaults.
 */
struct RGWUserAdminOpState {
  // user attributes
  std::unique_ptr<rgw::sal::User> user;
  std::string user_email;
  std::string display_name;
  rgw_user new_user_id;
  bool overwrite_new_user = false;
  int32_t max_buckets{RGW_DEFAULT_MAX_BUCKETS};
  __u8 suspended{0};
  __u8 admin{0};
  __u8 system{0};
  __u8 exclusive{0};
  __u8 fetch_stats{0};
  __u8 sync_stats{0};
  std::string caps;
  RGWObjVersionTracker objv;
  uint32_t op_mask{0};
  std::map<int, std::string> temp_url_keys;

  // subuser attributes
  std::string subuser;
  uint32_t perm_mask{RGW_PERM_NONE};

  // key_attributes
  std::string id; // access key
  std::string key; // secret key
  // access keys fetched for a user in the middle of an op
  std::map<std::string, RGWAccessKey> op_access_keys;
  int32_t key_type{-1}; // -1 = key type not specified (see type_specified)
  bool access_key_exist = false;

  std::set<std::string> mfa_ids;

  // operation attributes
  bool existing_user{false};
  bool existing_key{false};
  bool existing_subuser{false};
  bool existing_email{false};
  bool subuser_specified{false};
  bool gen_secret{false};
  bool gen_access{false};
  bool gen_subuser{false};
  bool id_specified{false};
  bool key_specified{false};
  bool type_specified{false};
  bool key_type_setbycontext{false}; // key type set by user or subuser context
  bool purge_data{false};
  bool purge_keys{false};
  bool display_name_specified{false};
  bool user_email_specified{false};
  bool max_buckets_specified{false};
  bool perm_specified{false};
  bool op_mask_specified{false};
  bool caps_specified{false};
  bool suspension_op{false};
  bool admin_specified{false};
  bool system_specified{false};
  bool key_op{false};
  bool temp_url_key_specified{false};
  bool found_by_uid{false};
  bool found_by_email{false};
  bool found_by_key{false};
  bool mfa_ids_specified{false};

  // req parameters
  bool populated{false};
  bool initialized{false};
  bool key_params_checked{false};
  bool subuser_params_checked{false};
  bool user_params_checked{false};

  bool bucket_quota_specified{false};
  bool user_quota_specified{false};
  bool bucket_ratelimit_specified{false};
  bool user_ratelimit_specified{false};

  RGWQuota quota;
  RGWRateLimitInfo user_ratelimit;
  RGWRateLimitInfo bucket_ratelimit;

  // req parameters for listing user
  std::string marker{""};
  uint32_t max_entries{1000};
  rgw_placement_rule default_placement; // user default placement
  bool default_placement_specified{false};

  std::list<std::string> placement_tags; // user default placement_tags
  bool placement_tags_specified{false};

  // Record an explicit access key; disables access-key generation.
  // Empty input is ignored.
  void set_access_key(const std::string& access_key) {
    if (access_key.empty())
      return;

    id = access_key;
    id_specified = true;
    gen_access = false;
    key_op = true;
  }

  // Record an explicit secret key; disables secret-key generation.
  // Empty input is ignored.
  void set_secret_key(const std::string& secret_key) {
    if (secret_key.empty())
      return;

    key = secret_key;
    key_specified = true;
    gen_secret = false;
    key_op = true;
  }

  void set_user_id(const rgw_user& id);

  // Target id for a rename; empty input is ignored.
  void set_new_user_id(const rgw_user& id) {
    if (id.empty())
      return;

    new_user_id = id;
  }
  void set_overwrite_new_user(bool b) {
    overwrite_new_user = b;
  }

  void set_user_email(std::string& email) {
   /* always lowercase email address */
    boost::algorithm::to_lower(email);
    user_email = email;
    user_email_specified = true;
  }

  void set_display_name(const std::string& name) {
    if (name.empty())
      return;

    display_name = name;
    display_name_specified = true;
  }

  void set_subuser(std::string& _subuser);

  void set_caps(const std::string& _caps) {
    if (_caps.empty())
      return;

    caps = _caps;
    caps_specified = true;
  }

  void set_perm(uint32_t perm) {
    perm_mask = perm;
    perm_specified = true;
  }

  void set_op_mask(uint32_t mask) {
    op_mask = mask;
    op_mask_specified = true;
  }

  void set_temp_url_key(const std::string& key, int index) {
    temp_url_keys[index] = key;
    temp_url_key_specified = true;
  }

  void set_key_type(int32_t type) {
    key_type = type;
    type_specified = true;
  }

  void set_access_key_exist() {
    access_key_exist = true;
  }

  void set_suspension(__u8 is_suspended) {
    suspended = is_suspended;
    suspension_op = true;
  }

  void set_admin(__u8 is_admin) {
    admin = is_admin;
    admin_specified = true;
  }

  void set_system(__u8 is_system) {
    system = is_system;
    system_specified = true;
  }

  void set_exclusive(__u8 is_exclusive) {
    exclusive = is_exclusive;
  }

  void set_fetch_stats(__u8 is_fetch_stats) {
    fetch_stats = is_fetch_stats;
  }

  void set_sync_stats(__u8 is_sync_stats) {
    sync_stats = is_sync_stats;
  }

  void set_user_info(RGWUserInfo& user_info);

  void set_user_version_tracker(RGWObjVersionTracker& objv_tracker);

  void set_max_buckets(int32_t mb) {
    max_buckets = mb;
    max_buckets_specified = true;
  }

  void set_gen_access() {
    gen_access = true;
    key_op = true;
  }

  void set_gen_secret() {
    gen_secret = true;
    key_op = true;
  }

  // Request generation of whichever key parts were not supplied explicitly.
  void set_generate_key() {
    if (id.empty())
      gen_access = true;
    if (key.empty())
      gen_secret = true;
    key_op = true;
  }

  void clear_generate_key() {
    gen_access = false;
    gen_secret = false;
  }

  void set_purge_keys() {
    purge_keys = true;
    key_op = true;
  }

  void set_bucket_quota(RGWQuotaInfo& quotas) {
    quota.bucket_quota = quotas;
    bucket_quota_specified = true;
  }

  void set_user_quota(RGWQuotaInfo& quotas) {
    quota.user_quota = quotas;
    user_quota_specified = true;
  }

  void set_bucket_ratelimit(RGWRateLimitInfo& ratelimit) {
    bucket_ratelimit = ratelimit;
    bucket_ratelimit_specified = true;
  }

  void set_user_ratelimit(RGWRateLimitInfo& ratelimit) {
    user_ratelimit = ratelimit;
    user_ratelimit_specified = true;
  }

  void set_mfa_ids(const std::set<std::string>& ids) {
    mfa_ids = ids;
    mfa_ids_specified = true;
  }

  void set_default_placement(const rgw_placement_rule& _placement) {
    default_placement = _placement;
    default_placement_specified = true;
  }

  void set_placement_tags(const std::list<std::string>& _tags) {
    placement_tags = _tags;
    placement_tags_specified = true;
  }

  // predicates over the *_specified / *_op flags set above
  bool is_populated() { return populated; }
  bool is_initialized() { return initialized; }
  bool has_existing_user() { return existing_user; }
  bool has_existing_key() { return existing_key; }
  bool has_existing_subuser() { return existing_subuser; }
  bool has_existing_email() { return existing_email; }
  bool has_subuser() { return subuser_specified; }
  bool has_key_op() { return key_op; }
  bool has_caps_op() { return caps_specified; }
  bool has_suspension_op() { return suspension_op; }
  bool has_subuser_perm() { return perm_specified; }
  bool has_op_mask() { return op_mask_specified; }
  bool will_gen_access() { return gen_access; }
  bool will_gen_secret() { return gen_secret; }
  bool will_gen_subuser() { return gen_subuser; }
  bool will_purge_keys() { return purge_keys; }
  bool will_purge_data() { return purge_data; }
  bool will_generate_subuser() { return gen_subuser; }
  bool has_bucket_quota() { return bucket_quota_specified; }
  bool has_user_quota() { return user_quota_specified; }
  void set_populated() { populated = true; }
  void clear_populated() { populated = false; }
  void set_initialized() { initialized = true; }
  void set_existing_user(bool flag) { existing_user = flag; }
  void set_existing_key(bool flag) { existing_key = flag; }
  void set_existing_subuser(bool flag) { existing_subuser = flag; }
  void set_existing_email(bool flag) { existing_email = flag; }
  void set_purge_data(bool flag) { purge_data = flag; }
  void set_generate_subuser(bool flag) { gen_subuser = flag; }
  __u8 get_suspension_status() { return suspended; }
  int32_t get_key_type() {return key_type; }
  bool get_access_key_exist() {return access_key_exist; }
  uint32_t get_subuser_perm() { return perm_mask; }
  int32_t get_max_buckets() { return max_buckets; }
  uint32_t get_op_mask() { return op_mask; }
  RGWQuotaInfo& get_bucket_quota() { return quota.bucket_quota; }
  RGWQuotaInfo& get_user_quota() { return quota.user_quota; }
  std::set<std::string>& get_mfa_ids() { return mfa_ids; }

  rgw::sal::User* get_user() { return user.get(); }
  const rgw_user& get_user_id();
  std::string get_subuser() { return subuser; }
  std::string get_access_key() { return id; }
  std::string get_secret_key() { return key; }
  std::string get_caps() { return caps; }
  std::string get_user_email() { return user_email; }
  std::string get_display_name() { return display_name; }
  rgw_user& get_new_uid() { return new_user_id; }
  bool get_overwrite_new_user() const { return overwrite_new_user; }
  std::map<int, std::string>& get_temp_url_keys() { return temp_url_keys; }

  RGWUserInfo& get_user_info();

  std::map<std::string, RGWAccessKey>* get_swift_keys();
  std::map<std::string, RGWAccessKey>* get_access_keys();
  std::map<std::string, RGWSubUser>* get_subusers();

  RGWUserCaps* get_caps_obj();

  std::string build_default_swift_kid();

  std::string generate_subuser();

  RGWUserAdminOpState(rgw::sal::Driver* driver);
};
+
+class RGWUser;
+
/*
 * Helper that manipulates a single user's access keys (the swift and
 * s3-style key maps) as part of an admin operation.
 */
class RGWAccessKeyPool
{
  RGWUser *user{nullptr};

  std::map<std::string, int, ltstr_nocase> key_type_map;
  rgw_user user_id;
  rgw::sal::Driver* driver{nullptr};

  // non-owning views into the user's key maps
  std::map<std::string, RGWAccessKey> *swift_keys{nullptr};
  std::map<std::string, RGWAccessKey> *access_keys{nullptr};

  // we don't want to allow keys for the anonymous user or a null user
  bool keys_allowed{false};

private:
  int create_key(RGWUserAdminOpState& op_state, std::string *err_msg = NULL);
  int generate_key(const DoutPrefixProvider *dpp,
                   RGWUserAdminOpState& op_state, optional_yield y,
                   std::string *err_msg = NULL);
  int modify_key(RGWUserAdminOpState& op_state, std::string *err_msg = NULL);

  int check_key_owner(RGWUserAdminOpState& op_state);
  bool check_existing_key(RGWUserAdminOpState& op_state);
  int check_op(RGWUserAdminOpState& op_state, std::string *err_msg = NULL);

  /* API Contract Fulfillment */
  int execute_add(const DoutPrefixProvider *dpp,
                  RGWUserAdminOpState& op_state, std::string *err_msg,
                  bool defer_save, optional_yield y);
  int execute_remove(const DoutPrefixProvider *dpp,
                     RGWUserAdminOpState& op_state, std::string *err_msg,
                     bool defer_save, optional_yield y);
  int remove_subuser_keys(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg,
                          bool defer_save, optional_yield y);

  // NOTE(review): defer_save presumably postpones persisting the user
  // record so the caller can batch changes — confirm in rgw_user.cc.
  int add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save,
          optional_yield y);
  int remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg,
             bool defer_save, optional_yield y);
public:
  explicit RGWAccessKeyPool(RGWUser* usr);

  int init(RGWUserAdminOpState& op_state);

  /* API Contracted Methods */
  int add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y,
          std::string *err_msg = NULL);
  int remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y,
             std::string *err_msg = NULL);

  friend class RGWUser;
  friend class RGWSubUserPool;
};
+
/*
 * Helper that manages a single user's subusers (add/remove/modify)
 * as part of an admin operation.
 */
class RGWSubUserPool
{
  RGWUser *user{nullptr};

  rgw_user user_id;
  rgw::sal::Driver* driver{nullptr};
  bool subusers_allowed{false};

  // non-owning view into the user's subuser map
  std::map<std::string, RGWSubUser> *subuser_map{nullptr};

private:
  int check_op(RGWUserAdminOpState& op_state, std::string *err_msg = NULL);

  /* API Contract Fulfillment */
  int execute_add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save, optional_yield y);
  int execute_remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save, optional_yield y);
  int execute_modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save, optional_yield y);

  int add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save,
          optional_yield y);
  int remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save, optional_yield y);
  int modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg, bool defer_save);
public:
  explicit RGWSubUserPool(RGWUser *user);

  // check whether the named subuser exists in the map
  bool exists(std::string subuser);
  int init(RGWUserAdminOpState& op_state);

  /* API contracted methods */
  int add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y,
          std::string *err_msg = NULL);
  int remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg = NULL);
  int modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg = NULL);

  friend class RGWUser;
};
+
/*
 * Helper that manages the admin capabilities (caps) attached to a user.
 */
class RGWUserCapPool
{
  RGWUserCaps *caps{nullptr}; // non-owning view into the user's caps
  bool caps_allowed{false};
  RGWUser *user{nullptr};

private:
  int add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save,
          optional_yield y);
  int remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, bool defer_save,
             optional_yield y);

public:
  explicit RGWUserCapPool(RGWUser *user);

  int init(RGWUserAdminOpState& op_state);

  /* API contracted methods */
  int add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y,
          std::string *err_msg = NULL);
  int remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg = NULL);

  friend class RGWUser;
};
+
/*
 * Admin-side aggregate view of a single RGW user: exposes
 * add/remove/modify/rename/info/list operations plus the caps,
 * access-key and subuser pools that operate on the same user.
 */
class RGWUser
{

private:
  RGWUserInfo old_info;
  rgw::sal::Driver* driver{nullptr};

  rgw_user user_id;
  // whether the user's info has been fetched/populated into this object
  bool info_stored{false};

  void set_populated() { info_stored = true; }
  void clear_populated() { info_stored = false; }
  bool is_populated() { return info_stored; }

  int check_op(RGWUserAdminOpState&  req, std::string *err_msg);
  int update(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, optional_yield y);

  void clear_members();
  void init_default();

  /* API Contract Fulfillment */
  int execute_add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg,
                  optional_yield y);
  int execute_remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state,
                    std::string *err_msg, optional_yield y);
  int execute_modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, optional_yield y);
  int execute_rename(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, std::string *err_msg, optional_yield y);

public:
  RGWUser();

  int init(const DoutPrefixProvider *dpp, rgw::sal::Driver* storage, RGWUserAdminOpState& op_state,
	   optional_yield y);

  int init_storage(rgw::sal::Driver* storage);
  int init(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y);
  int init_members(RGWUserAdminOpState& op_state);

  rgw::sal::Driver* get_driver() { return driver; }

  /* API Contracted Members */
  RGWUserCapPool caps;
  RGWAccessKeyPool keys;
  RGWSubUserPool subusers;

  /* API Contracted Methods */
  int add(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg = NULL);

  int remove(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg = NULL);

  int rename(RGWUserAdminOpState& op_state, optional_yield y, const DoutPrefixProvider *dpp, std::string *err_msg = NULL);

  /* remove an already populated RGWUser */
  int remove(std::string *err_msg = NULL);

  int modify(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, optional_yield y, std::string *err_msg = NULL);

  /* retrieve info from an existing user in the RGW system */
  int info(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, RGWUserInfo& fetched_info, optional_yield y,
           std::string *err_msg = NULL);

  /* info from an already populated RGWUser */
  int info (RGWUserInfo& fetched_info, std::string *err_msg = NULL);

  /* list the existing users */
  int list(const DoutPrefixProvider *dpp, RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher);

  friend class RGWAccessKeyPool;
  friend class RGWSubUserPool;
  friend class RGWUserCapPool;
};
+
+/* Wrappers for admin API functionality */
+
// Static entry points for whole-user admin operations; results are
// emitted through the provided formatter flusher.
class RGWUserAdminOp_User
{
public:
  static int list(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver,
		  RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher);

  static int info(const DoutPrefixProvider *dpp,
		  rgw::sal::Driver* driver,
                  RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher,
		  optional_yield y);

  static int create(const DoutPrefixProvider *dpp,
		    rgw::sal::Driver* driver,
		    RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher,
		    optional_yield y);

  static int modify(const DoutPrefixProvider *dpp,
		    rgw::sal::Driver* driver,
		    RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher, optional_yield y);

  static int remove(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver,
		    RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher, optional_yield y);
};
+
// Static entry points for subuser admin operations.
class RGWUserAdminOp_Subuser
{
public:
  static int create(const DoutPrefixProvider *dpp,
		    rgw::sal::Driver* driver,
		    RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher,
		    optional_yield y);

  static int modify(const DoutPrefixProvider *dpp,
		    rgw::sal::Driver* driver,
		    RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher,
		    optional_yield y);

  static int remove(const DoutPrefixProvider *dpp,
		    rgw::sal::Driver* driver,
		    RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher,
		    optional_yield y);
};
+
// Static entry points for access-key admin operations.
class RGWUserAdminOp_Key
{
public:
  static int create(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver,
		    RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher,
		    optional_yield y);

  static int remove(const DoutPrefixProvider *dpp,
		    rgw::sal::Driver* driver,
		    RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher,
		    optional_yield y);
};
+
// Static entry points for user-capability admin operations.
class RGWUserAdminOp_Caps
{
public:
  static int add(const DoutPrefixProvider *dpp,
		 rgw::sal::Driver* driver,
		 RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher,
		 optional_yield y);

  static int remove(const DoutPrefixProvider *dpp,
		    rgw::sal::Driver* driver,
		    RGWUserAdminOpState& op_state, RGWFormatterFlusher& flusher,
		    optional_yield y);
};
+
// User info bundled with its raw object attributes; JSON-(de)serializable
// as a single unit for the metadata subsystem.
struct RGWUserCompleteInfo {
  RGWUserInfo info;
  std::map<std::string, bufferlist> attrs;
  bool has_attrs{false}; // set from the JSONDecoder::decode_json("attrs", ...) result

  void dump(Formatter * const f) const {
    info.dump(f);
    encode_json("attrs", attrs, f);
  }

  void decode_json(JSONObj *obj) {
    decode_json_obj(info, obj);
    has_attrs = JSONDecoder::decode_json("attrs", attrs, obj);
  }
};
+
// Metadata-subsystem wrapper around RGWUserCompleteInfo, carrying the
// object version and modification time of the backing record.
class RGWUserMetadataObject : public RGWMetadataObject {
  RGWUserCompleteInfo uci;
public:
  RGWUserMetadataObject() {}
  RGWUserMetadataObject(const RGWUserCompleteInfo& _uci, const obj_version& v, real_time m)
      : uci(_uci) {
    objv = v;
    mtime = m;
  }

  void dump(Formatter *f) const override {
    uci.dump(f);
  }

  RGWUserCompleteInfo& get_uci() {
    return uci;
  }
};
+
+class RGWUserMetadataHandler;
+
/*
 * Controller bundling the zone/user services and the user metadata
 * backend handler: higher-level user info CRUD, bucket listing and
 * stats entry points.
 */
class RGWUserCtl
{
  struct Svc {
    RGWSI_Zone *zone{nullptr};
    RGWSI_User *user{nullptr};
  } svc;

  struct Ctl {
    RGWBucketCtl *bucket{nullptr};
  } ctl;

  RGWUserMetadataHandler *umhandler;
  RGWSI_MetaBackend_Handler *be_handler{nullptr};

public:
  RGWUserCtl(RGWSI_Zone *zone_svc,
             RGWSI_User *user_svc,
             RGWUserMetadataHandler *_umhandler);

  // second-phase init: wire in the bucket controller (circular dependency)
  void init(RGWBucketCtl *bucket_ctl) {
    ctl.bucket = bucket_ctl;
  }

  RGWBucketCtl *get_bucket_ctl() {
    return ctl.bucket;
  }

  // Optional outputs/options for the get_info_* calls; setters chain.
  struct GetParams {
    RGWObjVersionTracker *objv_tracker{nullptr};
    ceph::real_time *mtime{nullptr};
    rgw_cache_entry_info *cache_info{nullptr};
    std::map<std::string, bufferlist> *attrs{nullptr};

    GetParams() {}

    GetParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
      objv_tracker = _objv_tracker;
      return *this;
    }

    GetParams& set_mtime(ceph::real_time *_mtime) {
      mtime = _mtime;
      return *this;
    }

    GetParams& set_cache_info(rgw_cache_entry_info *_cache_info) {
      cache_info = _cache_info;
      return *this;
    }

    GetParams& set_attrs(std::map<std::string, bufferlist> *_attrs) {
      attrs = _attrs;
      return *this;
    }
  };

  // Options for store_info; setters chain.
  struct PutParams {
    RGWUserInfo *old_info{nullptr};
    RGWObjVersionTracker *objv_tracker{nullptr};
    ceph::real_time mtime;
    bool exclusive{false};
    std::map<std::string, bufferlist> *attrs{nullptr};

    PutParams() {}

    PutParams& set_old_info(RGWUserInfo *_info) {
      old_info = _info;
      return *this;
    }

    PutParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
      objv_tracker = _objv_tracker;
      return *this;
    }

    PutParams& set_mtime(const ceph::real_time& _mtime) {
      mtime = _mtime;
      return *this;
    }

    PutParams& set_exclusive(bool _exclusive) {
      exclusive = _exclusive;
      return *this;
    }

    PutParams& set_attrs(std::map<std::string, bufferlist> *_attrs) {
      attrs = _attrs;
      return *this;
    }
  };

  // Options for remove_info; setters chain.
  struct RemoveParams {
    RGWObjVersionTracker *objv_tracker{nullptr};

    RemoveParams() {}

    RemoveParams& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
      objv_tracker = _objv_tracker;
      return *this;
    }
  };

  int get_info_by_uid(const DoutPrefixProvider *dpp, 
                      const rgw_user& uid, RGWUserInfo *info,
                      optional_yield y, const GetParams& params = {});
  int get_info_by_email(const DoutPrefixProvider *dpp, 
                        const std::string& email, RGWUserInfo *info,
                        optional_yield y, const GetParams& params = {});
  int get_info_by_swift(const DoutPrefixProvider *dpp, 
                        const std::string& swift_name, RGWUserInfo *info,
                        optional_yield y, const GetParams& params = {});
  int get_info_by_access_key(const DoutPrefixProvider *dpp, 
                             const std::string& access_key, RGWUserInfo *info,
                             optional_yield y, const GetParams& params = {});

  int get_attrs_by_uid(const DoutPrefixProvider *dpp,
                       const rgw_user& user_id,
                       std::map<std::string, bufferlist> *attrs,
                       optional_yield y,
                       RGWObjVersionTracker *objv_tracker = nullptr);

  int store_info(const DoutPrefixProvider *dpp, 
                 const RGWUserInfo& info, optional_yield y,
                 const PutParams& params = {});
  int remove_info(const DoutPrefixProvider *dpp, 
                  const RGWUserInfo& info, optional_yield y,
                  const RemoveParams& params = {});

  int list_buckets(const DoutPrefixProvider *dpp, 
                   const rgw_user& user,
                   const std::string& marker,
                   const std::string& end_marker,
                   uint64_t max,
                   bool need_stats,
                   RGWUserBuckets *buckets,
                   bool *is_truncated,
		   optional_yield y,
                   uint64_t default_max = 1000);

  int read_stats(const DoutPrefixProvider *dpp, 
                 const rgw_user& user, RGWStorageStats *stats,
		 optional_yield y,
		 ceph::real_time *last_stats_sync = nullptr,     /* last time a full stats sync completed */
		 ceph::real_time *last_stats_update = nullptr);   /* last time a stats update was done */
};
+
// Factory for the user metadata handler; the returned pointer is owned
// by the caller.
class RGWUserMetaHandlerAllocator {
public:
  static RGWMetadataHandler *alloc(RGWSI_User *user_svc);
};
diff --git a/src/rgw/driver/rados/rgw_zone.cc b/src/rgw/driver/rados/rgw_zone.cc
new file mode 100644
index 000000000..ed09f24f6
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_zone.cc
@@ -0,0 +1,1288 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_zone.h"
+#include "rgw_realm_watcher.h"
+#include "rgw_sal_config.h"
+#include "rgw_sync.h"
+
+#include "services/svc_zone.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+using namespace rgw_zone_defaults;
+
// Out-of-line, empty destructor; defined here rather than in the header
// (NOTE(review): presumably to keep rgw_sync.h's dependencies out of other
// includers — confirm).
RGWMetaSyncStatusManager::~RGWMetaSyncStatusManager(){}
+
+#define FIRST_EPOCH 1
+
+struct RGWAccessKey;
+
+/// Generate a random uuid for realm/period/zonegroup/zone ids
+static std::string gen_random_uuid()
+{
+ uuid_d uuid;
+ uuid.generate_random();
+ return uuid.to_string();
+}
+
+void RGWDefaultZoneGroupInfo::dump(Formatter *f) const {
+ encode_json("default_zonegroup", default_zonegroup, f);
+}
+
+void RGWDefaultZoneGroupInfo::decode_json(JSONObj *obj) {
+
+ JSONDecoder::decode_json("default_zonegroup", default_zonegroup, obj);
+ /* backward compatability with region */
+ if (default_zonegroup.empty()) {
+ JSONDecoder::decode_json("default_region", default_zonegroup, obj);
+ }
+}
+
// Create the implicit "default" zonegroup (and its "default" zone) used by
// single-site deployments. Safe to race with other radosgws: -EEXIST from
// either create is recovered by re-reading the winner's object.
// With old_format=true the object keeps its id as its name (legacy layout).
int RGWZoneGroup::create_default(const DoutPrefixProvider *dpp, optional_yield y, bool old_format)
{
  name = default_zonegroup_name;
  api_name = default_zonegroup_name;
  is_master = true;

  // seed the zonegroup with a single "default-placement" target
  RGWZoneGroupPlacementTarget placement_target;
  placement_target.name = "default-placement";
  placement_targets[placement_target.name] = placement_target;
  default_placement.name = "default-placement";

  RGWZoneParams zone_params(default_zone_name);

  // init with a false last arg here — NOTE(review): presumably skips
  // reading the stored object since it may not exist yet; confirm against
  // RGWZoneParams::init
  int r = zone_params.init(dpp, cct, sysobj_svc, y, false);
  if (r < 0) {
    ldpp_dout(dpp, 0) << "create_default: error initializing zone params: " << cpp_strerror(-r) << dendl;
    return r;
  }

  r = zone_params.create_default(dpp, y);
  if (r < 0 && r != -EEXIST) {
    ldpp_dout(dpp, 0) << "create_default: error in create_default zone params: " << cpp_strerror(-r) << dendl;
    return r;
  } else if (r == -EEXIST) {
    // lost the creation race: clear our id and re-init to load the
    // existing default zone params by name
    ldpp_dout(dpp, 10) << "zone_params::create_default() returned -EEXIST, we raced with another default zone_params creation" << dendl;
    zone_params.clear_id();
    r = zone_params.init(dpp, cct, sysobj_svc, y);
    if (r < 0) {
      ldpp_dout(dpp, 0) << "create_default: error in init existing zone params: " << cpp_strerror(-r) << dendl;
      return r;
    }
    ldpp_dout(dpp, 20) << "zone_params::create_default() " << zone_params.get_name() << " id " << zone_params.get_id()
                       << dendl;
  }

  // register the default zone as this zonegroup's only zone and its master
  RGWZone& default_zone = zones[zone_params.get_id()];
  default_zone.name = zone_params.get_name();
  default_zone.id = zone_params.get_id();
  master_zone = default_zone.id;

  // initialize supported zone features
  default_zone.supported_features.insert(rgw::zone_features::supported.begin(),
                                         rgw::zone_features::supported.end());
  // enable default zonegroup features
  enabled_features.insert(rgw::zone_features::enabled.begin(),
                          rgw::zone_features::enabled.end());

  r = create(dpp, y);
  if (r < 0 && r != -EEXIST) {
    ldpp_dout(dpp, 0) << "error storing zone group info: " << cpp_strerror(-r) << dendl;
    return r;
  }

  if (r == -EEXIST) {
    // lost the zonegroup creation race: reload the existing object by name
    ldpp_dout(dpp, 10) << "create_default() returned -EEXIST, we raced with another zonegroup creation" << dendl;
    id.clear();
    r = init(dpp, cct, sysobj_svc, y);
    if (r < 0) {
      return r;
    }
  }

  if (old_format) {
    // legacy layout keys the object by id instead of by name
    name = id;
  }

  // recompute derived fields (master zone, per-zone log flags, placement
  // targets pulled from zone params)
  post_process_params(dpp, y);

  return 0;
}
+
+int RGWZoneGroup::equals(const string& other_zonegroup) const
+{
+ if (is_master && other_zonegroup.empty())
+ return true;
+
+ return (id == other_zonegroup);
+}
+
// Add a zone to this zonegroup, or modify it in place if already present.
// Pointer/optional parameters mean "leave unchanged when null/empty".
// Persists the zonegroup on success. Returns 0, -EEXIST on a duplicate
// zone name, -ENOENT for an unknown tier type, or -EINVAL when trying to
// disable a feature still enabled at the zonegroup level.
int RGWZoneGroup::add_zone(const DoutPrefixProvider *dpp,
                           const RGWZoneParams& zone_params, bool *is_master, bool *read_only,
                           const list<string>& endpoints, const string *ptier_type,
                           bool *psync_from_all, list<string>& sync_from, list<string>& sync_from_rm,
                           string *predirect_zone, std::optional<int> bucket_index_max_shards,
                           RGWSyncModulesManager *sync_mgr,
                           const rgw::zone_features::set& enable_features,
                           const rgw::zone_features::set& disable_features,
                           optional_yield y)
{
  auto& zone_id = zone_params.get_id();
  auto& zone_name = zone_params.get_name();

  // check for duplicate zone name on insert
  if (!zones.count(zone_id)) {
    for (const auto& zone : zones) {
      if (zone.second.name == zone_name) {
        ldpp_dout(dpp, 0) << "ERROR: found existing zone name " << zone_name
            << " (" << zone.first << ") in zonegroup " << get_name() << dendl;
        return -EEXIST;
      }
    }
  }

  if (is_master) {
    if (*is_master) {
      // promoting this zone; warn if it displaces a different master
      if (!master_zone.empty() && master_zone != zone_id) {
        ldpp_dout(dpp, 0) << "NOTICE: overriding master zone: " << master_zone << dendl;
      }
      master_zone = zone_id;
    } else if (master_zone == zone_id) {
      // explicit demotion of the current master
      master_zone.clear();
    }
  }

  // operator[] inserts a fresh RGWZone when the id is new
  RGWZone& zone = zones[zone_id];
  zone.name = zone_name;
  zone.id = zone_id;
  if (!endpoints.empty()) {
    zone.endpoints = endpoints;
  }
  if (read_only) {
    zone.read_only = *read_only;
  }
  if (ptier_type) {
    // validate the tier type against the registered sync modules
    zone.tier_type = *ptier_type;
    if (!sync_mgr->get_module(*ptier_type, nullptr)) {
      ldpp_dout(dpp, 0) << "ERROR: could not found sync module: " << *ptier_type
          << ",  valid sync modules: "
          << sync_mgr->get_registered_module_names()
          << dendl;
      return -ENOENT;
    }
  }

  if (psync_from_all) {
    zone.sync_from_all = *psync_from_all;
  }

  if (predirect_zone) {
    zone.redirect_zone = *predirect_zone;
  }

  if (bucket_index_max_shards) {
    zone.bucket_index_max_shards = *bucket_index_max_shards;
  }

  // apply sync-source additions, then removals
  for (auto add : sync_from) {
    zone.sync_from.insert(add);
  }

  for (auto rm : sync_from_rm) {
    zone.sync_from.erase(rm);
  }

  zone.supported_features.insert(enable_features.begin(),
                                 enable_features.end());

  for (const auto& feature : disable_features) {
    // a feature cannot be dropped from a zone while the zonegroup still
    // enables it
    if (enabled_features.contains(feature)) {
      lderr(cct) << "ERROR: Cannot disable zone feature \"" << feature
          << "\" until it's been disabled in zonegroup " << name << dendl;
      return -EINVAL;
    }
    auto i = zone.supported_features.find(feature);
    if (i == zone.supported_features.end()) {
      ldout(cct, 1) << "WARNING: zone feature \"" << feature
          << "\" was not enabled in zone " << zone.name << dendl;
      continue;
    }
    zone.supported_features.erase(i);
  }

  // recompute derived fields, then persist the zonegroup
  post_process_params(dpp, y);

  return update(dpp,y);
}
+
+
+int RGWZoneGroup::rename_zone(const DoutPrefixProvider *dpp,
+ const RGWZoneParams& zone_params,
+ optional_yield y)
+{
+ RGWZone& zone = zones[zone_params.get_id()];
+ zone.name = zone_params.get_name();
+
+ return update(dpp, y);
+}
+
// Recompute fields derived from zone membership: pick a master zone if
// none is set, toggle per-zone data-changelog flags (only needed with more
// than one zone), import placement targets referenced by each zone's
// params, and choose a default placement if missing. Best-effort: zones
// whose params can't be read are skipped with a warning.
void RGWZoneGroup::post_process_params(const DoutPrefixProvider *dpp, optional_yield y)
{
  bool log_data = zones.size() > 1;

  if (master_zone.empty()) {
    // arbitrarily promote the first zone (map order) as master
    auto iter = zones.begin();
    if (iter != zones.end()) {
      master_zone = iter->first;
    }
  }

  for (auto& item : zones) {
    RGWZone& zone = item.second;
    zone.log_data = log_data;

    RGWZoneParams zone_params(zone.id, zone.name);
    int ret = zone_params.init(dpp, cct, sysobj_svc, y);
    if (ret < 0) {
      ldpp_dout(dpp, 0) << "WARNING: could not read zone params for zone id=" << zone.id << " name=" << zone.name << dendl;
      continue;
    }

    // make sure every placement pool the zone uses has a matching
    // zonegroup placement target
    for (auto& pitem : zone_params.placement_pools) {
      const string& placement_name = pitem.first;
      if (placement_targets.find(placement_name) == placement_targets.end()) {
        RGWZoneGroupPlacementTarget placement_target;
        placement_target.name = placement_name;
        placement_targets[placement_name] = placement_target;
      }
    }
  }

  if (default_placement.empty() && !placement_targets.empty()) {
    // fall back to the first target with the standard storage class
    default_placement.init(placement_targets.begin()->first, RGW_STORAGE_CLASS_STANDARD);
  }
}
+
+int RGWZoneGroup::remove_zone(const DoutPrefixProvider *dpp, const std::string& zone_id, optional_yield y)
+{
+ auto iter = zones.find(zone_id);
+ if (iter == zones.end()) {
+ ldpp_dout(dpp, 0) << "zone id " << zone_id << " is not a part of zonegroup "
+ << name << dendl;
+ return -ENOENT;
+ }
+
+ zones.erase(iter);
+
+ post_process_params(dpp, y);
+
+ return update(dpp, y);
+}
+
+void RGWDefaultSystemMetaObjInfo::dump(Formatter *f) const {
+ encode_json("default_id", default_id, f);
+}
+
+void RGWDefaultSystemMetaObjInfo::decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("default_id", default_id, obj);
+}
+
// Rename this object: verify the new name is unused, store the updated
// info, write the new name->id mapping, and finally delete the old name
// object. The ordering matters — the new name is fully established before
// the old one is removed, so a crash mid-way leaves at most a stale extra
// name object rather than an unreachable one.
// Returns -EEXIST if the new name is already taken.
int RGWSystemMetaObj::rename(const DoutPrefixProvider *dpp, const string& new_name, optional_yield y)
{
  string new_id;
  int ret = read_id(dpp, new_name, new_id, y);
  if (!ret) {
    // name lookup succeeded -> the target name already exists
    return -EEXIST;
  }
  if (ret < 0 && ret != -ENOENT) {
    ldpp_dout(dpp, 0) << "Error read_id " << new_name << ": " << cpp_strerror(-ret) << dendl;
    return ret;
  }
  string old_name = name;
  name = new_name;
  ret = update(dpp, y);
  if (ret < 0) {
    ldpp_dout(dpp, 0) << "Error storing new obj info " << new_name << ": " << cpp_strerror(-ret) << dendl;
    return ret;
  }
  // exclusive=true: fail rather than overwrite a concurrent claim
  ret = store_name(dpp, true, y);
  if (ret < 0) {
    ldpp_dout(dpp, 0) << "Error storing new name " << new_name << ": " << cpp_strerror(-ret) << dendl;
    return ret;
  }
  /* delete old name */
  rgw_pool pool(get_pool(cct));
  string oid = get_names_oid_prefix() + old_name;
  rgw_raw_obj old_name_obj(pool, oid);
  auto sysobj = sysobj_svc->get_obj(old_name_obj);
  ret = sysobj.wop().remove(dpp, y);
  if (ret < 0) {
    ldpp_dout(dpp, 0) << "Error delete old obj name  " << old_name << ": " << cpp_strerror(-ret) << dendl;
    return ret;
  }

  return ret;
}
+
+int RGWSystemMetaObj::read(const DoutPrefixProvider *dpp, optional_yield y)
+{
+ int ret = read_id(dpp, name, id, y);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return read_info(dpp, id, y);
+}
+
+int RGWZoneParams::create_default(const DoutPrefixProvider *dpp, optional_yield y, bool old_format)
+{
+ name = default_zone_name;
+
+ int r = create(dpp, y);
+ if (r < 0) {
+ return r;
+ }
+
+ if (old_format) {
+ name = id;
+ }
+
+ return r;
+}
+
+const string& RGWZoneParams::get_compression_type(const rgw_placement_rule& placement_rule) const
+{
+ static const std::string NONE{"none"};
+ auto p = placement_pools.find(placement_rule.name);
+ if (p == placement_pools.end()) {
+ return NONE;
+ }
+ const auto& type = p->second.get_compression_type(placement_rule.get_storage_class());
+ return !type.empty() ? type : NONE;
+}
+
+// run an MD5 hash on the zone_id and return the first 32 bits
+static uint32_t gen_short_zone_id(const std::string zone_id)
+{
+ unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
+ MD5 hash;
+ // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
+ hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+ hash.Update((const unsigned char *)zone_id.c_str(), zone_id.size());
+ hash.Final(md5);
+
+ uint32_t short_id;
+ memcpy((char *)&short_id, md5, sizeof(short_id));
+ return std::max(short_id, 1u);
+}
+
// Insert or replace a zonegroup in this period map, keeping the
// master-zonegroup pointer, the by-api-name index, and the short-zone-id
// table consistent. Returns -EINVAL on a second master zonegroup and
// -EEXIST on a short-zone-id collision.
int RGWPeriodMap::update(const RGWZoneGroup& zonegroup, CephContext *cct)
{
  // refuse to register a second, different master zonegroup
  if (zonegroup.is_master_zonegroup() && (!master_zonegroup.empty() && zonegroup.get_id() != master_zonegroup)) {
    ldout(cct,0) << "Error updating periodmap, multiple master zonegroups configured "<< dendl;
    ldout(cct,0) << "master zonegroup: " << master_zonegroup << " and  " << zonegroup.get_id() <<dendl;
    return -EINVAL;
  }
  map<string, RGWZoneGroup>::iterator iter = zonegroups.find(zonegroup.get_id());
  if (iter != zonegroups.end()) {
    // drop the stale by-api entry before overwriting the zonegroup
    RGWZoneGroup& old_zonegroup = iter->second;
    if (!old_zonegroup.api_name.empty()) {
      zonegroups_by_api.erase(old_zonegroup.api_name);
    }
  }
  zonegroups[zonegroup.get_id()] = zonegroup;

  if (!zonegroup.api_name.empty()) {
    zonegroups_by_api[zonegroup.api_name] = zonegroup;
  }

  if (zonegroup.is_master_zonegroup()) {
    master_zonegroup = zonegroup.get_id();
  } else if (master_zonegroup == zonegroup.get_id()) {
    // this zonegroup was demoted; clear the master pointer
    master_zonegroup = "";
  }

  for (auto& i : zonegroup.zones) {
    auto& zone = i.second;
    if (short_zone_ids.find(zone.id) != short_zone_ids.end()) {
      continue;
    }
    // calculate the zone's short id
    uint32_t short_id = gen_short_zone_id(zone.id);

    // search for an existing zone with the same short id
    for (auto& s : short_zone_ids) {
      if (s.second == short_id) {
        ldout(cct, 0) << "New zone '" << zone.name << "' (" << zone.id
            << ") generates the same short_zone_id " << short_id
            << " as existing zone id " << s.first << dendl;
        return -EEXIST;
      }
    }

    short_zone_ids[zone.id] = short_id;
  }

  return 0;
}
+
+uint32_t RGWPeriodMap::get_zone_short_id(const string& zone_id) const
+{
+ auto i = short_zone_ids.find(zone_id);
+ if (i == short_zone_ids.end()) {
+ return 0;
+ }
+ return i->second;
+}
+
+bool RGWPeriodMap::find_zone_by_name(const string& zone_name,
+ RGWZoneGroup *zonegroup,
+ RGWZone *zone) const
+{
+ for (auto& iter : zonegroups) {
+ auto& zg = iter.second;
+ for (auto& ziter : zg.zones) {
+ auto& z = ziter.second;
+
+ if (z.name == zone_name) {
+ *zonegroup = zg;
+ *zone = z;
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+namespace rgw {
+
+int read_realm(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore,
+ std::string_view realm_id,
+ std::string_view realm_name,
+ RGWRealm& info,
+ std::unique_ptr<sal::RealmWriter>* writer)
+{
+ if (!realm_id.empty()) {
+ return cfgstore->read_realm_by_id(dpp, y, realm_id, info, writer);
+ }
+ if (!realm_name.empty()) {
+ return cfgstore->read_realm_by_name(dpp, y, realm_name, info, writer);
+ }
+ return cfgstore->read_default_realm(dpp, y, info, writer);
+}
+
// Create a realm in the config store. Generates an id when missing,
// validates or creates the realm's current period, links the period into
// the realm, and best-effort marks the realm as the cluster default.
// Returns -EINVAL when no name is given; create errors otherwise.
int create_realm(const DoutPrefixProvider* dpp, optional_yield y,
                 sal::ConfigStore* cfgstore, bool exclusive,
                 RGWRealm& info,
                 std::unique_ptr<sal::RealmWriter>* writer_out)
{
  if (info.name.empty()) {
    ldpp_dout(dpp, -1) << __func__ << " requires a realm name" << dendl;
    return -EINVAL;
  }
  if (info.id.empty()) {
    info.id = gen_random_uuid();
  }

  // if the realm already has a current_period, just make sure it exists
  std::optional<RGWPeriod> period;
  if (!info.current_period.empty()) {
    period.emplace();
    int r = cfgstore->read_period(dpp, y, info.current_period,
                                  std::nullopt, *period);
    if (r < 0) {
      ldpp_dout(dpp, -1) << __func__ << " failed to read realm's current_period="
          << info.current_period << " with " << cpp_strerror(r) << dendl;
      return r;
    }
  }

  // create the realm
  std::unique_ptr<sal::RealmWriter> writer;
  int r = cfgstore->create_realm(dpp, y, exclusive, info, &writer);
  if (r < 0) {
    return r;
  }

  if (!period) {
    // initialize and exclusive-create the initial period
    period.emplace();
    period->id = gen_random_uuid();
    period->period_map.id = period->id;
    period->epoch = FIRST_EPOCH;
    period->realm_id = info.id;
    period->realm_name = info.name;

    r = cfgstore->create_period(dpp, y, true, *period);
    if (r < 0) {
      ldpp_dout(dpp, -1) << __func__ << " failed to create the initial period id="
          << period->id << " for realm " << info.name
          << " with " << cpp_strerror(r) << dendl;
      return r;
    }
  }

  // update the realm's current_period
  r = realm_set_current_period(dpp, y, cfgstore, *writer, info, *period);
  if (r < 0) {
    return r;
  }

  // try to set as default. may race with another create, so pass exclusive=true
  // so we don't override an existing default
  r = set_default_realm(dpp, y, cfgstore, info, true);
  if (r < 0 && r != -EEXIST) {
    ldpp_dout(dpp, 0) << "WARNING: failed to set realm as default: "
        << cpp_strerror(r) << dendl;
  }

  if (writer_out) {
    *writer_out = std::move(writer);
  }
  return 0;
}
+
+int set_default_realm(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore, const RGWRealm& info,
+ bool exclusive)
+{
+ return cfgstore->write_default_realm_id(dpp, y, exclusive, info.id);
+}
+
// Advance the realm to the given period: validates the realm/period epoch
// relationship, writes the updated realm, then best-effort reflects the
// period's zonegroups and config locally. Returns -EINVAL when the period
// would move the realm backwards or sideways.
int realm_set_current_period(const DoutPrefixProvider* dpp, optional_yield y,
                             sal::ConfigStore* cfgstore,
                             sal::RealmWriter& writer, RGWRealm& realm,
                             const RGWPeriod& period)
{
  // update realm epoch to match the period's
  if (realm.epoch > period.realm_epoch) {
    // the period is older than our current state
    ldpp_dout(dpp, -1) << __func__ << " with old realm epoch "
        << period.realm_epoch << ", current epoch=" << realm.epoch << dendl;
    return -EINVAL;
  }
  if (realm.epoch == period.realm_epoch && realm.current_period != period.id) {
    // same epoch must mean the same period; otherwise the histories diverged
    ldpp_dout(dpp, -1) << __func__ << " with same realm epoch "
        << period.realm_epoch << ", but different period id "
        << period.id << " != " << realm.current_period << dendl;
    return -EINVAL;
  }

  realm.epoch = period.realm_epoch;
  realm.current_period = period.id;

  // update the realm object
  int r = writer.write(dpp, y, realm);
  if (r < 0) {
    ldpp_dout(dpp, -1) << __func__ << " failed to overwrite realm "
        << realm.name << " with " << cpp_strerror(r) << dendl;
    return r;
  }

  // reflect the zonegroup and period config
  (void) reflect_period(dpp, y, cfgstore, period);
  return 0;
}
+
// Mirror a period's contents into the local config store: overwrite the
// realm's period config, then (re)write every zonegroup in the period
// map. The period's master zonegroup is additionally offered as the
// default zonegroup (exclusive write, so an existing default wins).
int reflect_period(const DoutPrefixProvider* dpp, optional_yield y,
                   sal::ConfigStore* cfgstore, const RGWPeriod& info)
{
  // overwrite the local period config and zonegroup objects
  constexpr bool exclusive = false;

  int r = cfgstore->write_period_config(dpp, y, exclusive, info.realm_id,
                                        info.period_config);
  if (r < 0) {
    ldpp_dout(dpp, -1) << __func__ << " failed to store period config for realm id="
        << info.realm_id << " with " << cpp_strerror(r) << dendl;
    return r;
  }

  for (auto& [zonegroup_id, zonegroup] : info.period_map.zonegroups) {
    r = cfgstore->create_zonegroup(dpp, y, exclusive, zonegroup, nullptr);
    if (r < 0) {
      ldpp_dout(dpp, -1) << __func__ << " failed to store zonegroup id="
          << zonegroup_id << " with " << cpp_strerror(r) << dendl;
      return r;
    }
    if (zonegroup.is_master) {
      // set master as default if no default exists
      constexpr bool exclusive = true;
      r = set_default_zonegroup(dpp, y, cfgstore, zonegroup, exclusive);
      if (r == 0) {
        ldpp_dout(dpp, 1) << "Set the period's master zonegroup "
            << zonegroup.name << " as the default" << dendl;
      }
    }
  }
  return 0;
}
+
+std::string get_staging_period_id(std::string_view realm_id)
+{
+ return string_cat_reserve(realm_id, ":staging");
+}
+
+void fork_period(const DoutPrefixProvider* dpp, RGWPeriod& info)
+{
+ ldpp_dout(dpp, 20) << __func__ << " realm id=" << info.realm_id
+ << " period id=" << info.id << dendl;
+
+ info.predecessor_uuid = std::move(info.id);
+ info.id = get_staging_period_id(info.realm_id);
+ info.period_map.reset();
+ info.realm_epoch++;
+}
+
+int update_period(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore, RGWPeriod& info)
+{
+ // clear zone short ids of removed zones. period_map.update() will add the
+ // remaining zones back
+ info.period_map.short_zone_ids.clear();
+
+ // list all zonegroups in the realm
+ rgw::sal::ListResult<std::string> listing;
+ std::array<std::string, 1000> zonegroup_names; // list in pages of 1000
+ do {
+ int ret = cfgstore->list_zonegroup_names(dpp, y, listing.next,
+ zonegroup_names, listing);
+ if (ret < 0) {
+ std::cerr << "failed to list zonegroups: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ for (const auto& name : listing.entries) {
+ RGWZoneGroup zg;
+ ret = cfgstore->read_zonegroup_by_name(dpp, y, name, zg, nullptr);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "WARNING: failed to read zonegroup "
+ << name << ": " << cpp_strerror(-ret) << dendl;
+ continue;
+ }
+
+ if (zg.realm_id != info.realm_id) {
+ ldpp_dout(dpp, 20) << "skipping zonegroup " << zg.get_name()
+ << " with realm id " << zg.realm_id
+ << ", not on our realm " << info.realm_id << dendl;
+ continue;
+ }
+
+ if (zg.master_zone.empty()) {
+ ldpp_dout(dpp, 0) << "ERROR: zonegroup " << zg.get_name() << " should have a master zone " << dendl;
+ return -EINVAL;
+ }
+
+ if (zg.zones.find(zg.master_zone) == zg.zones.end()) {
+ ldpp_dout(dpp, 0) << "ERROR: zonegroup " << zg.get_name()
+ << " has a non existent master zone "<< dendl;
+ return -EINVAL;
+ }
+
+ if (zg.is_master_zonegroup()) {
+ info.master_zonegroup = zg.get_id();
+ info.master_zone = zg.master_zone;
+ }
+
+ ret = info.period_map.update(zg, dpp->get_cct());
+ if (ret < 0) {
+ return ret;
+ }
+ } // foreach name in listing.entries
+ } while (!listing.next.empty());
+
+ // read the realm's current period config
+ int ret = cfgstore->read_period_config(dpp, y, info.realm_id,
+ info.period_config);
+ if (ret < 0 && ret != -ENOENT) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to read period config: "
+ << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+
+ return 0;
+}
+
// Commit a staged period. Only the period's master zone may commit. Two
// paths: (1) the master zone changed — snapshot the metadata sync status,
// mint a brand-new period id at epoch 1, and make it the realm's current
// period; (2) same master — validate epochs and write the staged changes
// as the next epoch of the current period. Both paths best-effort notify
// other gateways of the new period.
int commit_period(const DoutPrefixProvider* dpp, optional_yield y,
                  sal::ConfigStore* cfgstore, sal::Driver* driver,
                  RGWRealm& realm, sal::RealmWriter& realm_writer,
                  const RGWPeriod& current_period,
                  RGWPeriod& info, std::ostream& error_stream,
                  bool force_if_stale)
{
  auto zone_svc = static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone; // XXX

  ldpp_dout(dpp, 20) << __func__ << " realm " << realm.id
      << " period " << current_period.id << dendl;
  // gateway must be in the master zone to commit
  if (info.master_zone != zone_svc->get_zone_params().id) {
    error_stream << "Cannot commit period on zone "
        << zone_svc->get_zone_params().id << ", it must be sent to "
        "the period's master zone " << info.master_zone << '.' << std::endl;
    return -EINVAL;
  }
  // period predecessor must match current period
  if (info.predecessor_uuid != current_period.id) {
    error_stream << "Period predecessor " << info.predecessor_uuid
        << " does not match current period " << current_period.id
        << ". Use 'period pull' to get the latest period from the master, "
        "reapply your changes, and try again." << std::endl;
    return -EINVAL;
  }
  // realm epoch must be 1 greater than current period
  if (info.realm_epoch != current_period.realm_epoch + 1) {
    error_stream << "Period's realm epoch " << info.realm_epoch
        << " does not come directly after current realm epoch "
        << current_period.realm_epoch << ". Use 'realm pull' to get the "
        "latest realm and period from the master zone, reapply your changes, "
        "and try again." << std::endl;
    return -EINVAL;
  }
  // did the master zone change?
  if (info.master_zone != current_period.master_zone) {
    // store the current metadata sync status in the period
    int r = info.update_sync_status(dpp, driver, current_period,
                                    error_stream, force_if_stale);
    if (r < 0) {
      ldpp_dout(dpp, 0) << "failed to update metadata sync status: "
          << cpp_strerror(-r) << dendl;
      return r;
    }
    // create an object with a new period id
    info.period_map.id = info.id = gen_random_uuid();
    info.epoch = FIRST_EPOCH;

    constexpr bool exclusive = true;
    r = cfgstore->create_period(dpp, y, exclusive, info);
    if (r < 0) {
      ldpp_dout(dpp, 0) << "failed to create new period: " << cpp_strerror(-r) << dendl;
      return r;
    }
    // set as current period
    r = realm_set_current_period(dpp, y, cfgstore, realm_writer, realm, info);
    if (r < 0) {
      ldpp_dout(dpp, 0) << "failed to update realm's current period: "
          << cpp_strerror(-r) << dendl;
      return r;
    }
    ldpp_dout(dpp, 4) << "Promoted to master zone and committed new period "
        << info.id << dendl;
    // notification failure is non-fatal; peers will catch up on their own
    (void) cfgstore->realm_notify_new_period(dpp, y, info);
    return 0;
  }
  // period must be based on current epoch
  if (info.epoch != current_period.epoch) {
    error_stream << "Period epoch " << info.epoch << " does not match "
        "predecessor epoch " << current_period.epoch << ". Use "
        "'period pull' to get the latest epoch from the master zone, "
        "reapply your changes, and try again." << std::endl;
    return -EINVAL;
  }
  // set period as next epoch
  info.id = current_period.id;
  info.epoch = current_period.epoch + 1;
  info.predecessor_uuid = current_period.predecessor_uuid;
  info.realm_epoch = current_period.realm_epoch;
  // write the period
  constexpr bool exclusive = true;
  int r = cfgstore->create_period(dpp, y, exclusive, info);
  if (r == -EEXIST) {
    // already have this epoch (or a more recent one)
    return 0;
  }
  if (r < 0) {
    ldpp_dout(dpp, 0) << "failed to store period: " << cpp_strerror(r) << dendl;
    return r;
  }
  // mirror the new period's zonegroups/config into the local store
  r = reflect_period(dpp, y, cfgstore, info);
  if (r < 0) {
    ldpp_dout(dpp, 0) << "failed to update local objects: " << cpp_strerror(r) << dendl;
    return r;
  }
  ldpp_dout(dpp, 4) << "Committed new epoch " << info.epoch
      << " for period " << info.id << dendl;
  (void) cfgstore->realm_notify_new_period(dpp, y, info);
  return 0;
}
+
+
+int read_zonegroup(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore,
+ std::string_view zonegroup_id,
+ std::string_view zonegroup_name,
+ RGWZoneGroup& info,
+ std::unique_ptr<sal::ZoneGroupWriter>* writer)
+{
+ if (!zonegroup_id.empty()) {
+ return cfgstore->read_zonegroup_by_id(dpp, y, zonegroup_id, info, writer);
+ }
+ if (!zonegroup_name.empty()) {
+ return cfgstore->read_zonegroup_by_name(dpp, y, zonegroup_name, info, writer);
+ }
+
+ std::string realm_id;
+ int r = cfgstore->read_default_realm_id(dpp, y, realm_id);
+ if (r == -ENOENT) {
+ return cfgstore->read_zonegroup_by_name(dpp, y, default_zonegroup_name,
+ info, writer);
+ }
+ if (r < 0) {
+ return r;
+ }
+ return cfgstore->read_default_zonegroup(dpp, y, realm_id, info, writer);
+}
+
+int create_zonegroup(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore, bool exclusive,
+ RGWZoneGroup& info)
+{
+ if (info.name.empty()) {
+ ldpp_dout(dpp, -1) << __func__ << " requires a zonegroup name" << dendl;
+ return -EINVAL;
+ }
+ if (info.id.empty()) {
+ info.id = gen_random_uuid();
+ }
+
+ // insert the default placement target if it doesn't exist
+ constexpr std::string_view default_placement_name = "default-placement";
+
+ RGWZoneGroupPlacementTarget placement_target;
+ placement_target.name = default_placement_name;
+
+ info.placement_targets.emplace(default_placement_name, placement_target);
+ if (info.default_placement.name.empty()) {
+ info.default_placement.name = default_placement_name;
+ }
+
+ int r = cfgstore->create_zonegroup(dpp, y, exclusive, info, nullptr);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "failed to create zonegroup with "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ // try to set as default. may race with another create, so pass exclusive=true
+ // so we don't override an existing default
+ r = set_default_zonegroup(dpp, y, cfgstore, info, true);
+ if (r < 0 && r != -EEXIST) {
+ ldpp_dout(dpp, 0) << "WARNING: failed to set zonegroup as default: "
+ << cpp_strerror(r) << dendl;
+ }
+
+ return 0;
+}
+
+int set_default_zonegroup(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore, const RGWZoneGroup& info,
+ bool exclusive)
+{
+ return cfgstore->write_default_zonegroup_id(
+ dpp, y, exclusive, info.realm_id, info.id);
+}
+
+int remove_zone_from_group(const DoutPrefixProvider* dpp,
+ RGWZoneGroup& zonegroup,
+ const rgw_zone_id& zone_id)
+{
+ auto z = zonegroup.zones.find(zone_id);
+ if (z == zonegroup.zones.end()) {
+ return -ENOENT;
+ }
+ zonegroup.zones.erase(z);
+
+ if (zonegroup.master_zone == zone_id) {
+ // choose a new master zone
+ auto m = zonegroup.zones.begin();
+ if (m != zonegroup.zones.end()) {
+ zonegroup.master_zone = m->first;
+ ldpp_dout(dpp, 0) << "NOTICE: promoted " << m->second.name
+ << " as new master_zone of zonegroup " << zonegroup.name << dendl;
+ } else {
+ ldpp_dout(dpp, 0) << "NOTICE: removed master_zone of zonegroup "
+ << zonegroup.name << dendl;
+ }
+ }
+
+ const bool log_data = zonegroup.zones.size() > 1;
+ for (auto& [id, zone] : zonegroup.zones) {
+ zone.log_data = log_data;
+ }
+
+ return 0;
+}
+
+// try to remove the given zone id from every zonegroup in the cluster
+static int remove_zone_from_groups(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ sal::ConfigStore* cfgstore,
+ const rgw_zone_id& zone_id)
+{
+ std::array<std::string, 128> zonegroup_names;
+ sal::ListResult<std::string> listing;
+ do {
+ int r = cfgstore->list_zonegroup_names(dpp, y, listing.next,
+ zonegroup_names, listing);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "failed to list zonegroups with "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ for (const auto& name : listing.entries) {
+ RGWZoneGroup zonegroup;
+ std::unique_ptr<sal::ZoneGroupWriter> writer;
+ r = cfgstore->read_zonegroup_by_name(dpp, y, name, zonegroup, &writer);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "WARNING: failed to load zonegroup " << name
+ << " with " << cpp_strerror(r) << dendl;
+ continue;
+ }
+
+ r = remove_zone_from_group(dpp, zonegroup, zone_id);
+ if (r < 0) {
+ continue;
+ }
+
+ // write the updated zonegroup
+ r = writer->write(dpp, y, zonegroup);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "WARNING: failed to write zonegroup " << name
+ << " with " << cpp_strerror(r) << dendl;
+ continue;
+ }
+ ldpp_dout(dpp, 0) << "Removed zone from zonegroup " << name << dendl;
+ }
+ } while (!listing.next.empty());
+
+ return 0;
+}
+
+
+int read_zone(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore,
+ std::string_view zone_id,
+ std::string_view zone_name,
+ RGWZoneParams& info,
+ std::unique_ptr<sal::ZoneWriter>* writer)
+{
+ if (!zone_id.empty()) {
+ return cfgstore->read_zone_by_id(dpp, y, zone_id, info, writer);
+ }
+ if (!zone_name.empty()) {
+ return cfgstore->read_zone_by_name(dpp, y, zone_name, info, writer);
+ }
+
+ std::string realm_id;
+ int r = cfgstore->read_default_realm_id(dpp, y, realm_id);
+ if (r == -ENOENT) {
+ return cfgstore->read_zone_by_name(dpp, y, default_zone_name, info, writer);
+ }
+ if (r < 0) {
+ return r;
+ }
+ return cfgstore->read_default_zone(dpp, y, realm_id, info, writer);
+}
+
+extern int get_zones_pool_set(const DoutPrefixProvider *dpp, optional_yield y,
+ rgw::sal::ConfigStore* cfgstore,
+ std::string_view my_zone_id,
+ std::set<rgw_pool>& pools);
+
+int create_zone(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore, bool exclusive,
+ RGWZoneParams& info, std::unique_ptr<sal::ZoneWriter>* writer)
+{
+ if (info.name.empty()) {
+ ldpp_dout(dpp, -1) << __func__ << " requires a zone name" << dendl;
+ return -EINVAL;
+ }
+ if (info.id.empty()) {
+ info.id = gen_random_uuid();
+ }
+
+ // add default placement with empty pool name
+ rgw_pool pool;
+ auto& placement = info.placement_pools["default-placement"];
+ placement.storage_classes.set_storage_class(
+ RGW_STORAGE_CLASS_STANDARD, &pool, nullptr);
+
+ // build a set of all pool names used by other zones
+ std::set<rgw_pool> pools;
+ int r = get_zones_pool_set(dpp, y, cfgstore, info.id, pools);
+ if (r < 0) {
+ return r;
+ }
+
+ // initialize pool names with the zone name prefix
+ r = init_zone_pool_names(dpp, y, pools, info);
+ if (r < 0) {
+ return r;
+ }
+
+ r = cfgstore->create_zone(dpp, y, exclusive, info, nullptr);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "failed to create zone with "
+ << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ // try to set as default. may race with another create, so pass exclusive=true
+ // so we don't override an existing default
+ r = set_default_zone(dpp, y, cfgstore, info, true);
+ if (r < 0 && r != -EEXIST) {
+ ldpp_dout(dpp, 0) << "WARNING: failed to set zone as default: "
+ << cpp_strerror(r) << dendl;
+ }
+
+ return 0;
+
+}
+
+int set_default_zone(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore, const RGWZoneParams& info,
+ bool exclusive)
+{
+ return cfgstore->write_default_zone_id(
+ dpp, y, exclusive, info.realm_id, info.id);
+}
+
+int delete_zone(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore, const RGWZoneParams& info,
+ sal::ZoneWriter& writer)
+{
+ // remove this zone from any zonegroups that contain it
+ int r = remove_zone_from_groups(dpp, y, cfgstore, info.id);
+ if (r < 0) {
+ return r;
+ }
+
+ return writer.remove(dpp, y);
+}
+
+} // namespace rgw
+
+static inline int conf_to_uint64(const JSONFormattable& config, const string& key, uint64_t *pval)
+{
+ string sval;
+ if (config.find(key, &sval)) {
+ string err;
+ uint64_t val = strict_strtoll(sval.c_str(), 10, &err);
+ if (!err.empty()) {
+ return -EINVAL;
+ }
+ *pval = val;
+ }
+ return 0;
+}
+
+// Apply tier settings from 'config'.
+// "retain_head_object" is treated as a boolean: the literal string
+// "true" enables it, any other value disables it. For "cloud-s3"
+// tiers all remaining keys are delegated to the S3 tier config.
+// NOTE(review): for any tier_type other than "cloud-s3" this returns
+// -1 even when retain_head_object was successfully applied -- confirm
+// callers expect that, or whether 0 was intended.
+int RGWZoneGroupPlacementTier::update_params(const JSONFormattable& config)
+{
+ int r = -1;
+
+ if (config.exists("retain_head_object")) {
+ string s = config["retain_head_object"];
+ if (s == "true") {
+ retain_head_object = true;
+ } else {
+ retain_head_object = false;
+ }
+ }
+
+ if (tier_type == "cloud-s3") {
+ r = t.s3.update_params(config);
+ }
+
+ return r;
+}
+
+// Reset tier settings named in 'config' back to their defaults.
+// Presence of a key (its value is ignored) selects it for clearing;
+// "cloud-s3" tiers additionally forward the config to the S3 tier's
+// clear_params. Always returns 0.
+int RGWZoneGroupPlacementTier::clear_params(const JSONFormattable& config)
+{
+ if (config.exists("retain_head_object")) {
+ retain_head_object = false;
+ }
+
+ if (tier_type == "cloud-s3") {
+ t.s3.clear_params(config);
+ }
+
+ return 0;
+}
+
+// Apply cloud-s3 tier settings from 'config'. Each recognized key is
+// copied into the corresponding member; keys that are absent leave the
+// current value unchanged. Always returns 0 (parse failures of the
+// multipart sizes fall back to defaults instead of erroring).
+int RGWZoneGroupPlacementTierS3::update_params(const JSONFormattable& config)
+{
+ int r = -1;
+
+ if (config.exists("endpoint")) {
+ endpoint = config["endpoint"];
+ }
+ if (config.exists("target_path")) {
+ target_path = config["target_path"];
+ }
+ if (config.exists("region")) {
+ region = config["region"];
+ }
+ if (config.exists("host_style")) {
+ string s;
+ s = config["host_style"];
+ // only the exact string "virtual" selects virtual-hosted style;
+ // anything else (including typos) falls back to path style
+ if (s != "virtual") {
+ host_style = PathStyle;
+ } else {
+ host_style = VirtualStyle;
+ }
+ }
+ if (config.exists("target_storage_class")) {
+ target_storage_class = config["target_storage_class"];
+ }
+ if (config.exists("access_key")) {
+ key.id = config["access_key"];
+ }
+ if (config.exists("secret")) {
+ key.key = config["secret"];
+ }
+ if (config.exists("multipart_sync_threshold")) {
+ r = conf_to_uint64(config, "multipart_sync_threshold", &multipart_sync_threshold);
+ if (r < 0) {
+ // unparsable value: silently revert to the built-in default
+ multipart_sync_threshold = DEFAULT_MULTIPART_SYNC_PART_SIZE;
+ }
+ }
+
+ if (config.exists("multipart_min_part_size")) {
+ r = conf_to_uint64(config, "multipart_min_part_size", &multipart_min_part_size);
+ if (r < 0) {
+ multipart_min_part_size = DEFAULT_MULTIPART_SYNC_PART_SIZE;
+ }
+ }
+
+ if (config.exists("acls")) {
+ // "acls" may be a single mapping object or an array of them;
+ // entries with an empty source_id are ignored
+ const JSONFormattable& cc = config["acls"];
+ if (cc.is_array()) {
+ for (auto& c : cc.array()) {
+ RGWTierACLMapping m;
+ m.init(c);
+ if (!m.source_id.empty()) {
+ acl_mappings[m.source_id] = m;
+ }
+ }
+ } else {
+ RGWTierACLMapping m;
+ m.init(cc);
+ if (!m.source_id.empty()) {
+ acl_mappings[m.source_id] = m;
+ }
+ }
+ }
+ return 0;
+}
+
+// Reset cloud-s3 tier settings named in 'config' to their defaults.
+// Key presence selects the field; string fields are cleared, the
+// multipart sizes revert to DEFAULT_MULTIPART_SYNC_PART_SIZE, and ACL
+// mappings listed under "acls" are erased by source_id. Returns 0.
+int RGWZoneGroupPlacementTierS3::clear_params(const JSONFormattable& config)
+{
+ if (config.exists("endpoint")) {
+ endpoint.clear();
+ }
+ if (config.exists("target_path")) {
+ target_path.clear();
+ }
+ if (config.exists("region")) {
+ region.clear();
+ }
+ if (config.exists("host_style")) {
+ /* default */
+ host_style = PathStyle;
+ }
+ if (config.exists("target_storage_class")) {
+ target_storage_class.clear();
+ }
+ if (config.exists("access_key")) {
+ key.id.clear();
+ }
+ if (config.exists("secret")) {
+ key.key.clear();
+ }
+ if (config.exists("multipart_sync_threshold")) {
+ multipart_sync_threshold = DEFAULT_MULTIPART_SYNC_PART_SIZE;
+ }
+ if (config.exists("multipart_min_part_size")) {
+ multipart_min_part_size = DEFAULT_MULTIPART_SYNC_PART_SIZE;
+ }
+ if (config.exists("acls")) {
+ // accepts a single mapping or an array; erase is a no-op for
+ // source_ids that are not currently mapped
+ const JSONFormattable& cc = config["acls"];
+ if (cc.is_array()) {
+ for (auto& c : cc.array()) {
+ RGWTierACLMapping m;
+ m.init(c);
+ acl_mappings.erase(m.source_id);
+ }
+ } else {
+ RGWTierACLMapping m;
+ m.init(cc);
+ acl_mappings.erase(m.source_id);
+ }
+ }
+ return 0;
+}
+
+// Produce sample instances for ceph-dencoder round-trip tests: one
+// populated object plus one default-constructed. Ownership of the
+// heap-allocated objects passes to the caller (test harness).
+void rgw_meta_sync_info::generate_test_instances(list<rgw_meta_sync_info*>& o)
+{
+ auto info = new rgw_meta_sync_info;
+ info->state = rgw_meta_sync_info::StateBuildingFullSyncMaps;
+ info->period = "periodid";
+ info->realm_epoch = 5;
+ o.push_back(info);
+ o.push_back(new rgw_meta_sync_info);
+}
+
+// Produce sample instances for ceph-dencoder round-trip tests: one
+// populated marker plus one default-constructed; caller owns them.
+void rgw_meta_sync_marker::generate_test_instances(list<rgw_meta_sync_marker*>& o)
+{
+ auto marker = new rgw_meta_sync_marker;
+ marker->state = rgw_meta_sync_marker::IncrementalSync;
+ marker->marker = "01234";
+ marker->realm_epoch = 5;
+ o.push_back(marker);
+ o.push_back(new rgw_meta_sync_marker);
+}
+
+// Dencoder test hook: a single default-constructed instance suffices.
+void rgw_meta_sync_status::generate_test_instances(list<rgw_meta_sync_status*>& o)
+{
+ o.push_back(new rgw_meta_sync_status);
+}
+
+// Dencoder test hook: two default-constructed instances (no fields
+// populated); caller owns the allocations.
+void RGWZoneParams::generate_test_instances(list<RGWZoneParams*> &o)
+{
+ o.push_back(new RGWZoneParams);
+ o.push_back(new RGWZoneParams);
+}
+
+// Dencoder test hook: two default-constructed instances; caller owns.
+void RGWPeriodLatestEpochInfo::generate_test_instances(list<RGWPeriodLatestEpochInfo*> &o)
+{
+ RGWPeriodLatestEpochInfo *z = new RGWPeriodLatestEpochInfo;
+ o.push_back(z);
+ o.push_back(new RGWPeriodLatestEpochInfo);
+}
+
+// Dencoder test hook: two default-constructed instances; caller owns.
+void RGWZoneGroup::generate_test_instances(list<RGWZoneGroup*>& o)
+{
+ RGWZoneGroup *r = new RGWZoneGroup;
+ o.push_back(r);
+ o.push_back(new RGWZoneGroup);
+}
+
+// JSON output: { "latest_epoch": <epoch> }.
+void RGWPeriodLatestEpochInfo::dump(Formatter *f) const {
+ encode_json("latest_epoch", epoch, f);
+}
+
+// JSON input: reads "latest_epoch"; mirror of dump().
+void RGWPeriodLatestEpochInfo::decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("latest_epoch", epoch, obj);
+}
+
+// JSON output: { "obj_id": <obj_id> }.
+void RGWNameToId::dump(Formatter *f) const {
+ encode_json("obj_id", obj_id, f);
+}
+
+// JSON input: reads "obj_id"; mirror of dump().
+void RGWNameToId::decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("obj_id", obj_id, obj);
+}
+
diff --git a/src/rgw/driver/rados/rgw_zone.h b/src/rgw/driver/rados/rgw_zone.h
new file mode 100644
index 000000000..2d69d5f1c
--- /dev/null
+++ b/src/rgw/driver/rados/rgw_zone.h
@@ -0,0 +1,943 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <ostream>
+#include "rgw_zone_types.h"
+#include "rgw_common.h"
+#include "rgw_sal_fwd.h"
+#include "rgw_sync_policy.h"
+
+
+class RGWSyncModulesManager;
+
+class RGWSI_SysObj;
+class RGWSI_Zone;
+
+// Common base for system metadata objects (realm, zonegroup, zone
+// params) that are addressed both by a unique id and a human-readable
+// name, and stored via RGWSI_SysObj. Subclasses supply the pool/oid
+// layout through the pure-virtual get_* accessors below.
+// NOTE(review): both access sections are 'public:'; the first grouping
+// (raw state + low-level read/store helpers) reads as if it were meant
+// to be protected -- confirm before tightening.
+class RGWSystemMetaObj {
+public:
+ std::string id;
+ std::string name;
+
+ // runtime context; not serialized, set via reinit_instance()/init()
+ CephContext *cct{nullptr};
+ RGWSI_SysObj *sysobj_svc{nullptr};
+ RGWSI_Zone *zone_svc{nullptr};
+
+ // low-level persistence helpers used by create()/read()/write()
+ int store_name(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y);
+ int store_info(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y);
+ int read_info(const DoutPrefixProvider *dpp, const std::string& obj_id, optional_yield y, bool old_format = false);
+ int read_id(const DoutPrefixProvider *dpp, const std::string& obj_name, std::string& obj_id, optional_yield y);
+ int read_default(const DoutPrefixProvider *dpp,
+ RGWDefaultSystemMetaObjInfo& default_info,
+ const std::string& oid,
+ optional_yield y);
+ /* read and use default id */
+ int use_default(const DoutPrefixProvider *dpp, optional_yield y, bool old_format = false);
+
+public:
+ RGWSystemMetaObj() {}
+ RGWSystemMetaObj(const std::string& _name): name(_name) {}
+ RGWSystemMetaObj(const std::string& _id, const std::string& _name) : id(_id), name(_name) {}
+ RGWSystemMetaObj(CephContext *_cct, RGWSI_SysObj *_sysobj_svc) {
+ reinit_instance(_cct, _sysobj_svc);
+ }
+ RGWSystemMetaObj(const std::string& _name, CephContext *_cct, RGWSI_SysObj *_sysobj_svc): name(_name) {
+ reinit_instance(_cct, _sysobj_svc);
+ }
+
+ const std::string& get_name() const { return name; }
+ const std::string& get_id() const { return id; }
+
+ void set_name(const std::string& _name) { name = _name;}
+ void set_id(const std::string& _id) { id = _id;}
+ void clear_id() { id.clear(); }
+
+ virtual ~RGWSystemMetaObj() {}
+
+ // versioned wire format (v1): id, then name
+ virtual void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(id, bl);
+ encode(name, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ virtual void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(id, bl);
+ decode(name, bl);
+ DECODE_FINISH(bl);
+ }
+
+ void reinit_instance(CephContext *_cct, RGWSI_SysObj *_sysobj_svc);
+ int init(const DoutPrefixProvider *dpp, CephContext *_cct, RGWSI_SysObj *_sysobj_svc,
+ optional_yield y,
+ bool setup_obj = true, bool old_format = false);
+ virtual int read_default_id(const DoutPrefixProvider *dpp, std::string& default_id, optional_yield y,
+ bool old_format = false);
+ virtual int set_as_default(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive = false);
+ int delete_default();
+ virtual int create(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive = true);
+ int delete_obj(const DoutPrefixProvider *dpp, optional_yield y, bool old_format = false);
+ int rename(const DoutPrefixProvider *dpp, const std::string& new_name, optional_yield y);
+ int update(const DoutPrefixProvider *dpp, optional_yield y) { return store_info(dpp, false, y);}
+ int update_name(const DoutPrefixProvider *dpp, optional_yield y) { return store_name(dpp, false, y);}
+ int read(const DoutPrefixProvider *dpp, optional_yield y);
+ int write(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y);
+
+ // storage layout hooks implemented by each concrete metadata type
+ virtual rgw_pool get_pool(CephContext *cct) const = 0;
+ virtual const std::string get_default_oid(bool old_format = false) const = 0;
+ virtual const std::string& get_names_oid_prefix() const = 0;
+ virtual const std::string& get_info_oid_prefix(bool old_format = false) const = 0;
+ virtual std::string get_predefined_id(CephContext *cct) const = 0;
+ virtual const std::string& get_predefined_name(CephContext *cct) const = 0;
+
+ void dump(Formatter *f) const;
+ void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(RGWSystemMetaObj)
+
+// Per-zone configuration: the set of rados pools a zone uses, its
+// placement targets, system credentials, and cloud-tier config.
+// Wire format is at v14; decode() carries compatibility branches that
+// synthesize values (pool names derived from name/log_pool) for
+// objects written by older versions.
+struct RGWZoneParams : RGWSystemMetaObj {
+ rgw_pool domain_root;
+ rgw_pool control_pool;
+ rgw_pool gc_pool;
+ rgw_pool lc_pool;
+ rgw_pool log_pool;
+ rgw_pool intent_log_pool;
+ rgw_pool usage_log_pool;
+ rgw_pool user_keys_pool;
+ rgw_pool user_email_pool;
+ rgw_pool user_swift_pool;
+ rgw_pool user_uid_pool;
+ rgw_pool roles_pool;
+ rgw_pool reshard_pool;
+ rgw_pool otp_pool;
+ rgw_pool oidc_pool;
+ rgw_pool notif_pool;
+
+ RGWAccessKey system_key;
+
+ // placement target name -> pool/storage-class layout for that target
+ std::map<std::string, RGWZonePlacementInfo> placement_pools;
+
+ std::string realm_id;
+
+ JSONFormattable tier_config;
+
+ RGWZoneParams() : RGWSystemMetaObj() {}
+ explicit RGWZoneParams(const std::string& name) : RGWSystemMetaObj(name){}
+ RGWZoneParams(const rgw_zone_id& id, const std::string& name) : RGWSystemMetaObj(id.id, name) {}
+ RGWZoneParams(const rgw_zone_id& id, const std::string& name, const std::string& _realm_id)
+ : RGWSystemMetaObj(id.id, name), realm_id(_realm_id) {}
+ virtual ~RGWZoneParams();
+
+ rgw_pool get_pool(CephContext *cct) const override;
+ const std::string get_default_oid(bool old_format = false) const override;
+ const std::string& get_names_oid_prefix() const override;
+ const std::string& get_info_oid_prefix(bool old_format = false) const override;
+ std::string get_predefined_id(CephContext *cct) const override;
+ const std::string& get_predefined_name(CephContext *cct) const override;
+
+ int init(const DoutPrefixProvider *dpp,
+ CephContext *_cct, RGWSI_SysObj *_sysobj_svc, optional_yield y,
+ bool setup_obj = true, bool old_format = false);
+ using RGWSystemMetaObj::init;
+ int read_default_id(const DoutPrefixProvider *dpp, std::string& default_id, optional_yield y, bool old_format = false) override;
+ int set_as_default(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive = false) override;
+ int create_default(const DoutPrefixProvider *dpp, optional_yield y, bool old_format = false);
+ int create(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive = true) override;
+ int fix_pool_names(const DoutPrefixProvider *dpp, optional_yield y);
+
+ const std::string& get_compression_type(const rgw_placement_rule& placement_rule) const;
+
+ void encode(bufferlist& bl) const override {
+ ENCODE_START(14, 1, bl);
+ encode(domain_root, bl);
+ encode(control_pool, bl);
+ encode(gc_pool, bl);
+ encode(log_pool, bl);
+ encode(intent_log_pool, bl);
+ encode(usage_log_pool, bl);
+ encode(user_keys_pool, bl);
+ encode(user_email_pool, bl);
+ encode(user_swift_pool, bl);
+ encode(user_uid_pool, bl);
+ RGWSystemMetaObj::encode(bl);
+ encode(system_key, bl);
+ encode(placement_pools, bl);
+ // retired fields are still written (empty) to keep the layout
+ // readable by older decoders
+ rgw_pool unused_metadata_heap;
+ encode(unused_metadata_heap, bl);
+ encode(realm_id, bl);
+ encode(lc_pool, bl);
+ std::map<std::string, std::string, ltstr_nocase> old_tier_config;
+ encode(old_tier_config, bl);
+ encode(roles_pool, bl);
+ encode(reshard_pool, bl);
+ encode(otp_pool, bl);
+ encode(tier_config, bl);
+ encode(oidc_pool, bl);
+ encode(notif_pool, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) override {
+ DECODE_START(14, bl);
+ decode(domain_root, bl);
+ decode(control_pool, bl);
+ decode(gc_pool, bl);
+ decode(log_pool, bl);
+ decode(intent_log_pool, bl);
+ decode(usage_log_pool, bl);
+ decode(user_keys_pool, bl);
+ decode(user_email_pool, bl);
+ decode(user_swift_pool, bl);
+ decode(user_uid_pool, bl);
+ if (struct_v >= 6) {
+ RGWSystemMetaObj::decode(bl);
+ } else if (struct_v >= 2) {
+ // pre-v6 zones had no separate id; the name doubled as the id
+ decode(name, bl);
+ id = name;
+ }
+ if (struct_v >= 3)
+ decode(system_key, bl);
+ if (struct_v >= 4)
+ decode(placement_pools, bl);
+ if (struct_v >= 5) {
+ rgw_pool unused_metadata_heap;
+ decode(unused_metadata_heap, bl);
+ }
+ if (struct_v >= 6) {
+ decode(realm_id, bl);
+ }
+ // from here on, each missing field gets a name derived from the
+ // zone/log pool so old zones keep working without migration
+ if (struct_v >= 7) {
+ decode(lc_pool, bl);
+ } else {
+ lc_pool = log_pool.name + ":lc";
+ }
+ std::map<std::string, std::string, ltstr_nocase> old_tier_config;
+ if (struct_v >= 8) {
+ decode(old_tier_config, bl);
+ }
+ if (struct_v >= 9) {
+ decode(roles_pool, bl);
+ } else {
+ roles_pool = name + ".rgw.meta:roles";
+ }
+ if (struct_v >= 10) {
+ decode(reshard_pool, bl);
+ } else {
+ reshard_pool = log_pool.name + ":reshard";
+ }
+ if (struct_v >= 11) {
+ ::decode(otp_pool, bl);
+ } else {
+ otp_pool = name + ".rgw.otp";
+ }
+ if (struct_v >= 12) {
+ ::decode(tier_config, bl);
+ } else {
+ // migrate the pre-v12 flat string map into the JSON config
+ for (auto& kv : old_tier_config) {
+ tier_config.set(kv.first, kv.second);
+ }
+ }
+ if (struct_v >= 13) {
+ ::decode(oidc_pool, bl);
+ } else {
+ oidc_pool = name + ".rgw.meta:oidc";
+ }
+ if (struct_v >= 14) {
+ decode(notif_pool, bl);
+ } else {
+ notif_pool = log_pool.name + ":notif";
+ }
+ DECODE_FINISH(bl);
+ }
+ void dump(Formatter *f) const;
+ void decode_json(JSONObj *obj);
+ static void generate_test_instances(std::list<RGWZoneParams*>& o);
+
+ // copy the placement info for 'placement_id' into *placement;
+ // false if the target is unknown
+ bool get_placement(const std::string& placement_id, RGWZonePlacementInfo *placement) const {
+ auto iter = placement_pools.find(placement_id);
+ if (iter == placement_pools.end()) {
+ return false;
+ }
+ *placement = iter->second;
+ return true;
+ }
+
+ /*
+ * return data pool of the head object
+ */
+ bool get_head_data_pool(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_pool* pool) const {
+ // an explicit per-bucket placement overrides the rule lookup
+ const rgw_data_placement_target& explicit_placement = obj.bucket.explicit_placement;
+ if (!explicit_placement.data_pool.empty()) {
+ if (!obj.in_extra_data) {
+ *pool = explicit_placement.data_pool;
+ } else {
+ *pool = explicit_placement.get_data_extra_pool();
+ }
+ return true;
+ }
+ if (placement_rule.empty()) {
+ return false;
+ }
+ auto iter = placement_pools.find(placement_rule.name);
+ if (iter == placement_pools.end()) {
+ return false;
+ }
+ if (!obj.in_extra_data) {
+ *pool = iter->second.get_data_pool(placement_rule.storage_class);
+ } else {
+ *pool = iter->second.get_data_extra_pool();
+ }
+ return true;
+ }
+
+ // true iff the rule names a known target AND an existing storage class
+ bool valid_placement(const rgw_placement_rule& rule) const {
+ auto iter = placement_pools.find(rule.name);
+ if (iter == placement_pools.end()) {
+ return false;
+ }
+ return iter->second.storage_class_exists(rule.storage_class);
+ }
+};
+WRITE_CLASS_ENCODER(RGWZoneParams)
+
+// A zonegroup (formerly "region"): a set of zones sharing a namespace,
+// plus its placement targets, API endpoints/hostnames, sync policy and
+// feature flags. Wire format v6; pre-v4 objects used the name as id.
+struct RGWZoneGroup : public RGWSystemMetaObj {
+ std::string api_name;
+ std::list<std::string> endpoints;
+ bool is_master = false;
+
+ rgw_zone_id master_zone;
+ std::map<rgw_zone_id, RGWZone> zones;
+
+ std::map<std::string, RGWZoneGroupPlacementTarget> placement_targets;
+ rgw_placement_rule default_placement;
+
+ std::list<std::string> hostnames;
+ std::list<std::string> hostnames_s3website;
+ // TODO: Maybe convert hostnames to a map<std::string,std::list<std::string>> for
+ // endpoint_type->hostnames
+/*
+20:05 < _robbat21irssi> maybe I do someting like: if (hostname_map.empty()) { populate all map keys from hostnames; };
+20:05 < _robbat21irssi> but that's a later compatability migration planning bit
+20:06 < yehudasa> more like if (!hostnames.empty()) {
+20:06 < yehudasa> for (std::list<std::string>::iterator iter = hostnames.begin(); iter != hostnames.end(); ++iter) {
+20:06 < yehudasa> hostname_map["s3"].append(iter->second);
+20:07 < yehudasa> hostname_map["s3website"].append(iter->second);
+20:07 < yehudasa> s/append/push_back/g
+20:08 < _robbat21irssi> inner loop over APIs
+20:08 < yehudasa> yeah, probably
+20:08 < _robbat21irssi> s3, s3website, swift, swith_auth, swift_website
+*/
+ // note: the two maps below are not part of the encoded wire format
+ std::map<std::string, std::list<std::string> > api_hostname_map;
+ std::map<std::string, std::list<std::string> > api_endpoints_map;
+
+ std::string realm_id;
+
+ rgw_sync_policy_info sync_policy;
+ rgw::zone_features::set enabled_features;
+
+ RGWZoneGroup(): is_master(false){}
+ RGWZoneGroup(const std::string &id, const std::string &name):RGWSystemMetaObj(id, name) {}
+ explicit RGWZoneGroup(const std::string &_name):RGWSystemMetaObj(_name) {}
+ RGWZoneGroup(const std::string &_name, bool _is_master, CephContext *cct, RGWSI_SysObj* sysobj_svc,
+ const std::string& _realm_id, const std::list<std::string>& _endpoints)
+ : RGWSystemMetaObj(_name, cct , sysobj_svc), endpoints(_endpoints), is_master(_is_master),
+ realm_id(_realm_id) {}
+ virtual ~RGWZoneGroup();
+
+ bool is_master_zonegroup() const { return is_master;}
+ // flip master status and re-derive dependent params immediately
+ void update_master(const DoutPrefixProvider *dpp, bool _is_master, optional_yield y) {
+ is_master = _is_master;
+ post_process_params(dpp, y);
+ }
+ void post_process_params(const DoutPrefixProvider *dpp, optional_yield y);
+
+ void encode(bufferlist& bl) const override {
+ ENCODE_START(6, 1, bl);
+ encode(name, bl);
+ encode(api_name, bl);
+ encode(is_master, bl);
+ encode(endpoints, bl);
+ encode(master_zone, bl);
+ encode(zones, bl);
+ encode(placement_targets, bl);
+ encode(default_placement, bl);
+ encode(hostnames, bl);
+ encode(hostnames_s3website, bl);
+ RGWSystemMetaObj::encode(bl);
+ encode(realm_id, bl);
+ encode(sync_policy, bl);
+ encode(enabled_features, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) override {
+ DECODE_START(6, bl);
+ decode(name, bl);
+ decode(api_name, bl);
+ decode(is_master, bl);
+ decode(endpoints, bl);
+ decode(master_zone, bl);
+ decode(zones, bl);
+ decode(placement_targets, bl);
+ decode(default_placement, bl);
+ if (struct_v >= 2) {
+ decode(hostnames, bl);
+ }
+ if (struct_v >= 3) {
+ decode(hostnames_s3website, bl);
+ }
+ if (struct_v >= 4) {
+ RGWSystemMetaObj::decode(bl);
+ decode(realm_id, bl);
+ } else {
+ // pre-v4 zonegroups had no separate id field
+ id = name;
+ }
+ if (struct_v >= 5) {
+ decode(sync_policy, bl);
+ }
+ if (struct_v >= 6) {
+ decode(enabled_features, bl);
+ }
+ DECODE_FINISH(bl);
+ }
+
+ int read_default_id(const DoutPrefixProvider *dpp, std::string& default_id, optional_yield y, bool old_format = false) override;
+ int set_as_default(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive = false) override;
+ int create_default(const DoutPrefixProvider *dpp, optional_yield y, bool old_format = false);
+ int equals(const std::string& other_zonegroup) const;
+ int add_zone(const DoutPrefixProvider *dpp,
+ const RGWZoneParams& zone_params, bool *is_master, bool *read_only,
+ const std::list<std::string>& endpoints, const std::string *ptier_type,
+ bool *psync_from_all, std::list<std::string>& sync_from,
+ std::list<std::string>& sync_from_rm, std::string *predirect_zone,
+ std::optional<int> bucket_index_max_shards, RGWSyncModulesManager *sync_mgr,
+ const rgw::zone_features::set& enable_features,
+ const rgw::zone_features::set& disable_features,
+ optional_yield y);
+ int remove_zone(const DoutPrefixProvider *dpp, const std::string& zone_id, optional_yield y);
+ int rename_zone(const DoutPrefixProvider *dpp, const RGWZoneParams& zone_params, optional_yield y);
+ rgw_pool get_pool(CephContext *cct) const override;
+ const std::string get_default_oid(bool old_region_format = false) const override;
+ const std::string& get_info_oid_prefix(bool old_region_format = false) const override;
+ const std::string& get_names_oid_prefix() const override;
+ std::string get_predefined_id(CephContext *cct) const override;
+ const std::string& get_predefined_name(CephContext *cct) const override;
+
+ void dump(Formatter *f) const;
+ void decode_json(JSONObj *obj);
+ static void generate_test_instances(std::list<RGWZoneGroup*>& o);
+
+ // true iff the named multisite feature is enabled on this zonegroup
+ bool supports(std::string_view feature) const {
+ return enabled_features.contains(feature);
+ }
+};
+WRITE_CLASS_ENCODER(RGWZoneGroup)
+
+// The zonegroup topology carried inside a period: all zonegroups
+// (indexed by id and again by api name), compact per-zone short ids,
+// and which zonegroup is master. reset() clears the topology but
+// deliberately keeps short_zone_ids stable across period updates.
+struct RGWPeriodMap
+{
+ std::string id;
+ std::map<std::string, RGWZoneGroup> zonegroups;
+ std::map<std::string, RGWZoneGroup> zonegroups_by_api;
+ std::map<std::string, uint32_t> short_zone_ids;
+
+ std::string master_zonegroup;
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& bl);
+
+ int update(const RGWZoneGroup& zonegroup, CephContext *cct);
+
+ void dump(Formatter *f) const;
+ void decode_json(JSONObj *obj);
+
+ void reset() {
+ zonegroups.clear();
+ zonegroups_by_api.clear();
+ master_zonegroup.clear();
+ }
+
+ uint32_t get_zone_short_id(const std::string& zone_id) const;
+
+ bool find_zone_by_id(const rgw_zone_id& zone_id,
+ RGWZoneGroup *zonegroup,
+ RGWZone *zone) const;
+ bool find_zone_by_name(const std::string& zone_id,
+ RGWZoneGroup *zonegroup,
+ RGWZone *zone) const;
+};
+WRITE_CLASS_ENCODER(RGWPeriodMap)
+
+// Cluster-wide quota and rate-limit settings associated with a period.
+// Wire format v2: v1 carried only the two quotas; the rate limits are
+// decoded only when struct_v >= 2 (older objects keep defaults).
+struct RGWPeriodConfig
+{
+ RGWQuota quota;
+ RGWRateLimitInfo user_ratelimit;
+ RGWRateLimitInfo bucket_ratelimit;
+ // rate limit unauthenticated user
+ RGWRateLimitInfo anon_ratelimit;
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(2, 1, bl);
+ encode(quota.bucket_quota, bl);
+ encode(quota.user_quota, bl);
+ encode(bucket_ratelimit, bl);
+ encode(user_ratelimit, bl);
+ encode(anon_ratelimit, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(2, bl);
+ decode(quota.bucket_quota, bl);
+ decode(quota.user_quota, bl);
+ if (struct_v >= 2) {
+ decode(bucket_ratelimit, bl);
+ decode(user_ratelimit, bl);
+ decode(anon_ratelimit, bl);
+ }
+ DECODE_FINISH(bl);
+ }
+
+ void dump(Formatter *f) const;
+ void decode_json(JSONObj *obj);
+
+ // the period config must be stored in a local object outside of the period,
+ // so that it can be used in a default configuration where no realm/period
+ // exists
+ int read(const DoutPrefixProvider *dpp, RGWSI_SysObj *sysobj_svc, const std::string& realm_id, optional_yield y);
+ int write(const DoutPrefixProvider *dpp, RGWSI_SysObj *sysobj_svc, const std::string& realm_id, optional_yield y);
+
+ static std::string get_oid(const std::string& realm_id);
+ static rgw_pool get_pool(CephContext *cct);
+};
+WRITE_CLASS_ENCODER(RGWPeriodConfig)
+
+class RGWRealm;
+class RGWPeriod;
+
+// A realm: the top-level multisite container. Tracks the id of its
+// current period and a realm epoch that is bumped for each new period,
+// and owns a control object used to notify zones of period changes.
+// NOTE(review): both access sections are 'public:'; create_control/
+// delete_control read as intended-internal helpers -- confirm before
+// tightening.
+class RGWRealm : public RGWSystemMetaObj
+{
+public:
+ std::string current_period;
+ epoch_t epoch{0}; ///< realm epoch, incremented for each new period
+
+ int create_control(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y);
+ int delete_control(const DoutPrefixProvider *dpp, optional_yield y);
+public:
+ RGWRealm() {}
+ RGWRealm(const std::string& _id, const std::string& _name = "") : RGWSystemMetaObj(_id, _name) {}
+ RGWRealm(CephContext *_cct, RGWSI_SysObj *_sysobj_svc): RGWSystemMetaObj(_cct, _sysobj_svc) {}
+ RGWRealm(const std::string& _name, CephContext *_cct, RGWSI_SysObj *_sysobj_svc): RGWSystemMetaObj(_name, _cct, _sysobj_svc){}
+ virtual ~RGWRealm() override;
+
+ // wire format v1: base fields, then current_period and epoch
+ void encode(bufferlist& bl) const override {
+ ENCODE_START(1, 1, bl);
+ RGWSystemMetaObj::encode(bl);
+ encode(current_period, bl);
+ encode(epoch, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) override {
+ DECODE_START(1, bl);
+ RGWSystemMetaObj::decode(bl);
+ decode(current_period, bl);
+ decode(epoch, bl);
+ DECODE_FINISH(bl);
+ }
+
+ int create(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive = true) override;
+ int delete_obj(const DoutPrefixProvider *dpp, optional_yield y);
+ rgw_pool get_pool(CephContext *cct) const override;
+ const std::string get_default_oid(bool old_format = false) const override;
+ const std::string& get_names_oid_prefix() const override;
+ const std::string& get_info_oid_prefix(bool old_format = false) const override;
+ std::string get_predefined_id(CephContext *cct) const override;
+ const std::string& get_predefined_name(CephContext *cct) const override;
+
+ using RGWSystemMetaObj::read_id; // expose as public for radosgw-admin
+
+ void dump(Formatter *f) const;
+ void decode_json(JSONObj *obj);
+ static void generate_test_instances(std::list<RGWRealm*>& o);
+
+ const std::string& get_current_period() const {
+ return current_period;
+ }
+ int set_current_period(const DoutPrefixProvider *dpp, RGWPeriod& period, optional_yield y);
+ void clear_current_period_and_epoch() {
+ current_period.clear();
+ epoch = 0;
+ }
+ epoch_t get_epoch() const { return epoch; }
+
+ std::string get_control_oid() const;
+ /// send a notify on the realm control object
+ int notify_zone(const DoutPrefixProvider *dpp, bufferlist& bl, optional_yield y);
+ /// notify the zone of a new period
+ int notify_new_period(const DoutPrefixProvider *dpp, const RGWPeriod& period, optional_yield y);
+
+ int find_zone(const DoutPrefixProvider *dpp,
+ const rgw_zone_id& zid,
+ RGWPeriod *pperiod,
+ RGWZoneGroup *pzonegroup,
+ bool *pfound,
+ optional_yield y) const;
+};
+WRITE_CLASS_ENCODER(RGWRealm)
+
+// Tiny sidecar object recording the highest known epoch for a period
+// id, so readers can locate the newest version of a period object.
+struct RGWPeriodLatestEpochInfo {
+ epoch_t epoch = 0;
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(epoch, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(epoch, bl);
+ DECODE_FINISH(bl);
+ }
+
+ void dump(Formatter *f) const;
+ void decode_json(JSONObj *obj);
+ static void generate_test_instances(std::list<RGWPeriodLatestEpochInfo*>& o);
+};
+WRITE_CLASS_ENCODER(RGWPeriodLatestEpochInfo)
+
+
+/*
+ * The RGWPeriod object contains the entire configuration of a
+ * RGWRealm, including its RGWZoneGroups and RGWZones. Consistency of
+ * this configuration is maintained across all zones by passing around
+ * the RGWPeriod object in its JSON representation.
+ *
+ * If a new configuration changes which zone is the metadata master
+ * zone (i.e., master zone of the master zonegroup), then a new
+ * RGWPeriod::id (a uuid) is generated, its RGWPeriod::realm_epoch is
+ * incremented, and the RGWRealm object is updated to reflect that new
+ * current_period id and epoch. If the configuration changes BUT which
+ * zone is the metadata master does NOT change, then only the
+ * RGWPeriod::epoch is incremented (and the RGWPeriod::id remains the
+ * same).
+ *
+ * When a new RGWPeriod is created with a new RGWPeriod::id (uuid), it
+ * is linked back to its predecessor RGWPeriod through the
+ * RGWPeriod::predecessor_uuid field, thus creating a "linked
+ * list"-like structure of RGWPeriods back to the cluster's creation.
+ */
+class RGWPeriod
+{
+public:
+ std::string id; //< a uuid
+ epoch_t epoch{0};
+ std::string predecessor_uuid;
+ std::vector<std::string> sync_status;
+ RGWPeriodMap period_map;
+ RGWPeriodConfig period_config;
+ std::string master_zonegroup;
+ rgw_zone_id master_zone;
+
+ std::string realm_id;
+ std::string realm_name;
+ epoch_t realm_epoch{1}; //< realm epoch when period was made current
+
+ CephContext *cct{nullptr};
+ RGWSI_SysObj *sysobj_svc{nullptr};
+
+ int read_info(const DoutPrefixProvider *dpp, optional_yield y);
+ int read_latest_epoch(const DoutPrefixProvider *dpp,
+ RGWPeriodLatestEpochInfo& epoch_info,
+ optional_yield y,
+ RGWObjVersionTracker *objv = nullptr);
+ int use_latest_epoch(const DoutPrefixProvider *dpp, optional_yield y);
+ int use_current_period();
+
+ const std::string get_period_oid() const;
+ const std::string get_period_oid_prefix() const;
+
+ // gather the metadata sync status for each shard; only for use on master zone
+ int update_sync_status(const DoutPrefixProvider *dpp,
+ rgw::sal::Driver* driver,
+ const RGWPeriod &current_period,
+ std::ostream& error_stream, bool force_if_stale);
+
+public:
+ RGWPeriod() {}
+
+ explicit RGWPeriod(const std::string& period_id, epoch_t _epoch = 0)
+ : id(period_id), epoch(_epoch) {}
+
+ const std::string& get_id() const { return id; }
+ epoch_t get_epoch() const { return epoch; }
+ epoch_t get_realm_epoch() const { return realm_epoch; }
+ const std::string& get_predecessor() const { return predecessor_uuid; }
+ const rgw_zone_id& get_master_zone() const { return master_zone; }
+ const std::string& get_master_zonegroup() const { return master_zonegroup; }
+ const std::string& get_realm() const { return realm_id; }
+ const std::string& get_realm_name() const { return realm_name; }
+ const RGWPeriodMap& get_map() const { return period_map; }
+ RGWPeriodConfig& get_config() { return period_config; }
+ const RGWPeriodConfig& get_config() const { return period_config; }
+ const std::vector<std::string>& get_sync_status() const { return sync_status; }
+ rgw_pool get_pool(CephContext *cct) const;
+ const std::string& get_latest_epoch_oid() const;
+ const std::string& get_info_oid_prefix() const;
+
+ void set_user_quota(RGWQuotaInfo& user_quota) {
+ period_config.quota.user_quota = user_quota;
+ }
+
+ void set_bucket_quota(RGWQuotaInfo& bucket_quota) {
+ period_config.quota.bucket_quota = bucket_quota;
+ }
+
+ void set_id(const std::string& _id) {
+ this->id = _id;
+ period_map.id = _id;
+ }
+ void set_epoch(epoch_t epoch) { this->epoch = epoch; }
+ void set_realm_epoch(epoch_t epoch) { realm_epoch = epoch; }
+
+ void set_predecessor(const std::string& predecessor)
+ {
+ predecessor_uuid = predecessor;
+ }
+
+ void set_realm_id(const std::string& _realm_id) {
+ realm_id = _realm_id;
+ }
+
+ int reflect(const DoutPrefixProvider *dpp, optional_yield y);
+
+ int get_zonegroup(RGWZoneGroup& zonegroup,
+ const std::string& zonegroup_id) const;
+
+ bool is_single_zonegroup() const
+ {
+ return (period_map.zonegroups.size() <= 1);
+ }
+
+ /*
+ returns true if there are several zone groups with a least one zone
+ */
+ bool is_multi_zonegroups_with_zones() const
+ {
+ int count = 0;
+ for (const auto& zg: period_map.zonegroups) {
+ if (zg.second.zones.size() > 0) {
+ if (count++ > 0) {
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+
+ bool find_zone(const DoutPrefixProvider *dpp,
+ const rgw_zone_id& zid,
+ RGWZoneGroup *pzonegroup,
+ optional_yield y) const;
+
+ int get_latest_epoch(const DoutPrefixProvider *dpp, epoch_t& epoch, optional_yield y);
+ int set_latest_epoch(const DoutPrefixProvider *dpp, optional_yield y,
+ epoch_t epoch, bool exclusive = false,
+ RGWObjVersionTracker *objv = nullptr);
+ // update latest_epoch if the given epoch is higher, else return -EEXIST
+ int update_latest_epoch(const DoutPrefixProvider *dpp, epoch_t epoch, optional_yield y);
+
+ int init(const DoutPrefixProvider *dpp, CephContext *_cct, RGWSI_SysObj *_sysobj_svc, const std::string &period_realm_id, optional_yield y,
+ const std::string &period_realm_name = "", bool setup_obj = true);
+ int init(const DoutPrefixProvider *dpp, CephContext *_cct, RGWSI_SysObj *_sysobj_svc, optional_yield y, bool setup_obj = true);
+
+ int create(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive = true);
+ int delete_obj(const DoutPrefixProvider *dpp, optional_yield y);
+ int store_info(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y);
+ int add_zonegroup(const DoutPrefixProvider *dpp, const RGWZoneGroup& zonegroup, optional_yield y);
+
+ void fork();
+ int update(const DoutPrefixProvider *dpp, optional_yield y);
+
+ // commit a staging period; only for use on master zone
+ int commit(const DoutPrefixProvider *dpp,
+ rgw::sal::Driver* driver,
+ RGWRealm& realm, const RGWPeriod &current_period,
+ std::ostream& error_stream, optional_yield y,
+ bool force_if_stale = false);
+
  // Serialize this period with Ceph's dencoder framework. The field order
  // below is part of the stored/wire format and must match decode() exactly;
  // bump the version arguments of ENCODE_START when adding fields.
  void encode(bufferlist& bl) const {
    ENCODE_START(1, 1, bl);
    encode(id, bl);
    encode(epoch, bl);
    encode(realm_epoch, bl);
    encode(predecessor_uuid, bl);
    encode(sync_status, bl);
    encode(period_map, bl);
    encode(master_zone, bl);
    encode(master_zonegroup, bl);
    encode(period_config, bl);
    encode(realm_id, bl);
    encode(realm_name, bl);
    ENCODE_FINISH(bl);
  }
+
  // Deserialize a period previously written by encode(); field order must
  // stay in lockstep with encode() or stored periods become unreadable.
  void decode(bufferlist::const_iterator& bl) {
    DECODE_START(1, bl);
    decode(id, bl);
    decode(epoch, bl);
    decode(realm_epoch, bl);
    decode(predecessor_uuid, bl);
    decode(sync_status, bl);
    decode(period_map, bl);
    decode(master_zone, bl);
    decode(master_zonegroup, bl);
    decode(period_config, bl);
    decode(realm_id, bl);
    decode(realm_name, bl);
    DECODE_FINISH(bl);
  }
+ void dump(Formatter *f) const;
+ void decode_json(JSONObj *obj);
+ static void generate_test_instances(std::list<RGWPeriod*>& o);
+
+ static std::string get_staging_id(const std::string& realm_id) {
+ return realm_id + ":staging";
+ }
+};
+WRITE_CLASS_ENCODER(RGWPeriod)
+
+namespace rgw {
+
+/// Look up a realm by its id. If no id is given, look it up by name.
+/// If no name is given, fall back to the cluster's default realm.
+int read_realm(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore,
+ std::string_view realm_id,
+ std::string_view realm_name,
+ RGWRealm& info,
+ std::unique_ptr<sal::RealmWriter>* writer = nullptr);
+
+/// Create a realm and its initial period. If the info.id is empty, a
+/// random uuid will be generated.
+int create_realm(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore, bool exclusive,
+ RGWRealm& info,
+ std::unique_ptr<sal::RealmWriter>* writer = nullptr);
+
+/// Set the given realm as the cluster's default realm.
+int set_default_realm(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore, const RGWRealm& info,
+ bool exclusive = false);
+
+/// Update the current_period of an existing realm.
+int realm_set_current_period(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore,
+ sal::RealmWriter& writer, RGWRealm& realm,
+ const RGWPeriod& period);
+
+/// Overwrite the local zonegroup and period config objects with the new
+/// configuration contained in the given period.
+int reflect_period(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore, const RGWPeriod& info);
+
+/// Return the staging period id for the given realm.
+std::string get_staging_period_id(std::string_view realm_id);
+
+/// Convert the given period into a separate staging period, where
+/// radosgw-admin can make changes to it without affecting the running
+/// configuration.
+void fork_period(const DoutPrefixProvider* dpp, RGWPeriod& info);
+
+/// Read all zonegroups in the period's realm and add them to the period.
+int update_period(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore, RGWPeriod& info);
+
+/// Validates the given 'staging' period and tries to commit it as the
+/// realm's new current period.
+int commit_period(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore, sal::Driver* driver,
+ RGWRealm& realm, sal::RealmWriter& realm_writer,
+ const RGWPeriod& current_period,
+ RGWPeriod& info, std::ostream& error_stream,
+ bool force_if_stale);
+
+
+/// Look up a zonegroup by its id. If no id is given, look it up by name.
+/// If no name is given, fall back to the cluster's default zonegroup.
+int read_zonegroup(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore,
+ std::string_view zonegroup_id,
+ std::string_view zonegroup_name,
+ RGWZoneGroup& info,
+ std::unique_ptr<sal::ZoneGroupWriter>* writer = nullptr);
+
+/// Initialize and create the given zonegroup. If the given info.id is empty,
+/// a random uuid will be generated. May fail with -EEXIST.
+int create_zonegroup(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore, bool exclusive,
+ RGWZoneGroup& info);
+
+/// Set the given zonegroup as its realm's default zonegroup.
+int set_default_zonegroup(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore, const RGWZoneGroup& info,
+ bool exclusive = false);
+
+/// Add a zone to the zonegroup, or update an existing zone entry.
+int add_zone_to_group(const DoutPrefixProvider* dpp,
+ RGWZoneGroup& zonegroup,
+ const RGWZoneParams& zone_params,
+ const bool *pis_master, const bool *pread_only,
+ const std::list<std::string>& endpoints,
+ const std::string *ptier_type,
+ const bool *psync_from_all,
+ const std::list<std::string>& sync_from,
+ const std::list<std::string>& sync_from_rm,
+ const std::string *predirect_zone,
+ std::optional<int> bucket_index_max_shards,
+ const rgw::zone_features::set& enable_features,
+ const rgw::zone_features::set& disable_features);
+
+/// Remove a zone by id from its zonegroup, promoting a new master zone if
+/// necessary.
+int remove_zone_from_group(const DoutPrefixProvider* dpp,
+ RGWZoneGroup& info,
+ const rgw_zone_id& zone_id);
+
+
+/// Look up a zone by its id. If no id is given, look it up by name. If no name
+/// is given, fall back to the realm's default zone.
+int read_zone(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore,
+ std::string_view zone_id,
+ std::string_view zone_name,
+ RGWZoneParams& info,
+ std::unique_ptr<sal::ZoneWriter>* writer = nullptr);
+
+/// Initialize and create a new zone. If the given info.id is empty, a random
+/// uuid will be generated. Pool names are initialized with the zone name as a
+/// prefix. If any pool names conflict with existing zones, a random suffix is
+/// added.
+int create_zone(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore, bool exclusive,
+ RGWZoneParams& info,
+ std::unique_ptr<sal::ZoneWriter>* writer = nullptr);
+
+/// Initialize the zone's pool names using the zone name as a prefix. If a pool
+/// name conflicts with an existing zone's pool, add a unique suffix.
+int init_zone_pool_names(const DoutPrefixProvider *dpp, optional_yield y,
+ const std::set<rgw_pool>& pools, RGWZoneParams& info);
+
+/// Set the given zone as its realm's default zone.
+int set_default_zone(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore, const RGWZoneParams& info,
+ bool exclusive = false);
+
+/// Delete an existing zone and remove it from any zonegroups that contain it.
+int delete_zone(const DoutPrefixProvider* dpp, optional_yield y,
+ sal::ConfigStore* cfgstore, const RGWZoneParams& info,
+ sal::ZoneWriter& writer);
+
+} // namespace rgw
diff --git a/src/rgw/jwt-cpp/base.h b/src/rgw/jwt-cpp/base.h
new file mode 100644
index 000000000..dfca7fc08
--- /dev/null
+++ b/src/rgw/jwt-cpp/base.h
@@ -0,0 +1,168 @@
+#pragma once
+#include <string>
+#include <array>
+
+namespace jwt {
+ namespace alphabet {
+ struct base64 {
+ static const std::array<char, 64>& data() {
+ static std::array<char, 64> data = {
+ {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
+ 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
+ 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
+ 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'}};
+ return data;
+ };
+ static const std::string& fill() {
+ static std::string fill = "=";
+ return fill;
+ }
+ };
+ struct base64url {
+ static const std::array<char, 64>& data() {
+ static std::array<char, 64> data = {
+ {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
+ 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
+ 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
+ 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'}};
+ return data;
+ };
+ static const std::string& fill() {
+ static std::string fill = "%3d";
+ return fill;
+ }
+ };
+ }
+
+ class base {
+ public:
+ template<typename T>
+ static std::string encode(const std::string& bin) {
+ return encode(bin, T::data(), T::fill());
+ }
+ template<typename T>
+ static std::string decode(const std::string& base) {
+ return decode(base, T::data(), T::fill());
+ }
+
+ private:
+ static std::string encode(const std::string& bin, const std::array<char, 64>& alphabet, const std::string& fill) {
+ size_t size = bin.size();
+ std::string res;
+
+ // clear incomplete bytes
+ size_t fast_size = size - size % 3;
+ for (size_t i = 0; i < fast_size;) {
+ uint32_t octet_a = (unsigned char)bin[i++];
+ uint32_t octet_b = (unsigned char)bin[i++];
+ uint32_t octet_c = (unsigned char)bin[i++];
+
+ uint32_t triple = (octet_a << 0x10) + (octet_b << 0x08) + octet_c;
+
+ res += alphabet[(triple >> 3 * 6) & 0x3F];
+ res += alphabet[(triple >> 2 * 6) & 0x3F];
+ res += alphabet[(triple >> 1 * 6) & 0x3F];
+ res += alphabet[(triple >> 0 * 6) & 0x3F];
+ }
+
+ if (fast_size == size)
+ return res;
+
+ size_t mod = size % 3;
+
+ uint32_t octet_a = fast_size < size ? (unsigned char)bin[fast_size++] : 0;
+ uint32_t octet_b = fast_size < size ? (unsigned char)bin[fast_size++] : 0;
+ uint32_t octet_c = fast_size < size ? (unsigned char)bin[fast_size++] : 0;
+
+ uint32_t triple = (octet_a << 0x10) + (octet_b << 0x08) + octet_c;
+
+ switch (mod) {
+ case 1:
+ res += alphabet[(triple >> 3 * 6) & 0x3F];
+ res += alphabet[(triple >> 2 * 6) & 0x3F];
+ res += fill;
+ res += fill;
+ break;
+ case 2:
+ res += alphabet[(triple >> 3 * 6) & 0x3F];
+ res += alphabet[(triple >> 2 * 6) & 0x3F];
+ res += alphabet[(triple >> 1 * 6) & 0x3F];
+ res += fill;
+ break;
+ default:
+ break;
+ }
+
+ return res;
+ }
+
+ static std::string decode(const std::string& base, const std::array<char, 64>& alphabet, const std::string& fill) {
+ size_t size = base.size();
+
+ size_t fill_cnt = 0;
+ while (size > fill.size()) {
+ if (base.substr(size - fill.size(), fill.size()) == fill) {
+ fill_cnt++;
+ size -= fill.size();
+ if(fill_cnt > 2)
+ throw std::runtime_error("Invalid input");
+ }
+ else break;
+ }
+
+ if ((size + fill_cnt) % 4 != 0)
+ throw std::runtime_error("Invalid input");
+
+ size_t out_size = size / 4 * 3;
+ std::string res;
+ res.reserve(out_size);
+
+ auto get_sextet = [&](size_t offset) {
+ for (size_t i = 0; i < alphabet.size(); i++) {
+ if (alphabet[i] == base[offset])
+ return i;
+ }
+ throw std::runtime_error("Invalid input");
+ };
+
+
+ size_t fast_size = size - size % 4;
+ for (size_t i = 0; i < fast_size;) {
+ uint32_t sextet_a = get_sextet(i++);
+ uint32_t sextet_b = get_sextet(i++);
+ uint32_t sextet_c = get_sextet(i++);
+ uint32_t sextet_d = get_sextet(i++);
+
+ uint32_t triple = (sextet_a << 3 * 6)
+ + (sextet_b << 2 * 6)
+ + (sextet_c << 1 * 6)
+ + (sextet_d << 0 * 6);
+
+ res += (triple >> 2 * 8) & 0xFF;
+ res += (triple >> 1 * 8) & 0xFF;
+ res += (triple >> 0 * 8) & 0xFF;
+ }
+
+ if (fill_cnt == 0)
+ return res;
+
+ uint32_t triple = (get_sextet(fast_size) << 3 * 6)
+ + (get_sextet(fast_size + 1) << 2 * 6);
+
+ switch (fill_cnt) {
+ case 1:
+ triple |= (get_sextet(fast_size + 2) << 1 * 6);
+ res += (triple >> 2 * 8) & 0xFF;
+ res += (triple >> 1 * 8) & 0xFF;
+ break;
+ case 2:
+ res += (triple >> 2 * 8) & 0xFF;
+ break;
+ default:
+ break;
+ }
+
+ return res;
+ }
+ };
+}
diff --git a/src/rgw/jwt-cpp/jwt.h b/src/rgw/jwt-cpp/jwt.h
new file mode 100644
index 000000000..b86fb57b0
--- /dev/null
+++ b/src/rgw/jwt-cpp/jwt.h
@@ -0,0 +1,1615 @@
+#pragma once
+#define PICOJSON_USE_INT64
+#include "picojson/picojson.h"
+#include "base.h"
+#include <set>
+#include <chrono>
+#include <unordered_map>
+#include <memory>
+#include <openssl/evp.h>
+#include <openssl/hmac.h>
+#include <openssl/pem.h>
+#include <openssl/ec.h>
+#include <openssl/err.h>
+
+//If openssl version less than 1.1
+#if OPENSSL_VERSION_NUMBER < 269484032
+#define OPENSSL10
+#endif
+
+#ifndef JWT_CLAIM_EXPLICIT
+#define JWT_CLAIM_EXPLICIT 1
+#endif
+
+namespace jwt {
+ using date = std::chrono::system_clock::time_point;
+
	/// Thrown when a signature cannot be matched against the data it signs.
	struct signature_verification_exception : public std::runtime_error {
		signature_verification_exception()
			: std::runtime_error("signature verification failed")
		{}
		explicit signature_verification_exception(const std::string& msg)
			: std::runtime_error(msg)
		{}
		explicit signature_verification_exception(const char* msg)
			: std::runtime_error(msg)
		{}
	};
	/// Thrown when producing a signature fails (e.g. an OpenSSL call errors).
	struct signature_generation_exception : public std::runtime_error {
		signature_generation_exception()
			: std::runtime_error("signature generation failed")
		{}
		explicit signature_generation_exception(const std::string& msg)
			: std::runtime_error(msg)
		{}
		explicit signature_generation_exception(const char* msg)
			: std::runtime_error(msg)
		{}
	};
	/// Thrown on RSA key loading/handling errors.
	struct rsa_exception : public std::runtime_error {
		explicit rsa_exception(const std::string& msg)
			: std::runtime_error(msg)
		{}
		explicit rsa_exception(const char* msg)
			: std::runtime_error(msg)
		{}
	};
	/// Thrown on ECDSA key loading/handling errors.
	struct ecdsa_exception : public std::runtime_error {
		explicit ecdsa_exception(const std::string& msg)
			: std::runtime_error(msg)
		{}
		explicit ecdsa_exception(const char* msg)
			: std::runtime_error(msg)
		{}
	};
	/// Thrown when a token fails verification; the supplied message is
	/// prefixed with "token verification failed: ".
	struct token_verification_exception : public std::runtime_error {
		token_verification_exception()
			: std::runtime_error("token verification failed")
		{}
		explicit token_verification_exception(const std::string& msg)
			: std::runtime_error("token verification failed: " + msg)
		{}
	};
+
	namespace helper {
		/**
		 * Extract the public key from a PEM X509 certificate.
		 * \param certstr Certificate in PEM format
		 * \param pw Password used to decrypt the certificate, if any
		 * \return Public key in PEM format
		 * \throws rsa_exception on any OpenSSL failure
		 */
		inline
		std::string extract_pubkey_from_cert(const std::string& certstr, const std::string& pw = "") {
			// TODO: Cannot find the exact version this change happened
#if OPENSSL_VERSION_NUMBER <= 0x1000114fL
			// older OpenSSL's BIO_new_mem_buf takes a non-const pointer
			std::unique_ptr<BIO, decltype(&BIO_free_all)> certbio(BIO_new_mem_buf(const_cast<char*>(certstr.data()), certstr.size()), BIO_free_all);
#else
			std::unique_ptr<BIO, decltype(&BIO_free_all)> certbio(BIO_new_mem_buf(certstr.data(), certstr.size()), BIO_free_all);
#endif
			std::unique_ptr<BIO, decltype(&BIO_free_all)> keybio(BIO_new(BIO_s_mem()), BIO_free_all);

			std::unique_ptr<X509, decltype(&X509_free)> cert(PEM_read_bio_X509(certbio.get(), nullptr, nullptr, const_cast<char*>(pw.c_str())), X509_free);
			if (!cert) throw rsa_exception("Error loading cert into memory");
			std::unique_ptr<EVP_PKEY, decltype(&EVP_PKEY_free)> key(X509_get_pubkey(cert.get()), EVP_PKEY_free);
			if(!key) throw rsa_exception("Error getting public key from certificate");
			if(!PEM_write_bio_PUBKEY(keybio.get(), key.get())) throw rsa_exception("Error writing public key data in PEM format");
			char* ptr = nullptr;
			auto len = BIO_get_mem_data(keybio.get(), &ptr);
			if(len <= 0 || ptr == nullptr) throw rsa_exception("Failed to convert pubkey to pem");
			std::string res(ptr, len);
			return res;
		}

		/**
		 * Load a public key from a PEM string. If the string is a certificate,
		 * the public key is first extracted from it.
		 * \param key Public key or certificate in PEM format
		 * \param password Password used to decrypt the key/certificate, if any
		 * \throws rsa_exception on any OpenSSL failure
		 */
		inline
		std::shared_ptr<EVP_PKEY> load_public_key_from_string(const std::string& key, const std::string& password = "") {
			std::unique_ptr<BIO, decltype(&BIO_free_all)> pubkey_bio(BIO_new(BIO_s_mem()), BIO_free_all);
			if(key.substr(0, 27) == "-----BEGIN CERTIFICATE-----") {
				auto epkey = helper::extract_pubkey_from_cert(key, password);
				if ((size_t)BIO_write(pubkey_bio.get(), epkey.data(), epkey.size()) != epkey.size())
					throw rsa_exception("failed to load public key: bio_write failed");
			} else {
				if ((size_t)BIO_write(pubkey_bio.get(), key.data(), key.size()) != key.size())
					throw rsa_exception("failed to load public key: bio_write failed");
			}

			std::shared_ptr<EVP_PKEY> pkey(PEM_read_bio_PUBKEY(pubkey_bio.get(), nullptr, nullptr, (void*)password.c_str()), EVP_PKEY_free);
			if (!pkey)
				throw rsa_exception("failed to load public key: PEM_read_bio_PUBKEY failed:" + std::string(ERR_error_string(ERR_get_error(), NULL)));
			return pkey;
		}

		/**
		 * Load a private key from a PEM string.
		 * \param key Private key in PEM format
		 * \param password Password used to decrypt the key, if any
		 * \throws rsa_exception on any OpenSSL failure
		 */
		inline
		std::shared_ptr<EVP_PKEY> load_private_key_from_string(const std::string& key, const std::string& password = "") {
			std::unique_ptr<BIO, decltype(&BIO_free_all)> privkey_bio(BIO_new(BIO_s_mem()), BIO_free_all);
			if ((size_t)BIO_write(privkey_bio.get(), key.data(), key.size()) != key.size())
				throw rsa_exception("failed to load private key: bio_write failed");
			std::shared_ptr<EVP_PKEY> pkey(PEM_read_bio_PrivateKey(privkey_bio.get(), nullptr, nullptr, const_cast<char*>(password.c_str())), EVP_PKEY_free);
			if (!pkey)
				throw rsa_exception("failed to load private key: PEM_read_bio_PrivateKey failed");
			return pkey;
		}
	}
+
+ namespace algorithm {
+ /**
+ * "none" algorithm.
+ *
+	 * Returns an empty signature and checks if the given signature is empty.
+ */
+ struct none {
+ /// Return an empty string
+ std::string sign(const std::string&) const {
+ return "";
+ }
+ /// Check if the given signature is empty. JWT's with "none" algorithm should not contain a signature.
+ void verify(const std::string&, const std::string& signature) const {
+ if (!signature.empty())
+ throw signature_verification_exception();
+ }
+ /// Get algorithm name
+ std::string name() const {
+ return "none";
+ }
+ };
+ /**
+ * Base class for HMAC family of algorithms
+ */
		struct hmacsha {
			/**
			 * Construct new hmac algorithm
			 * \param key Key to use for HMAC
			 * \param md Pointer to hash function
			 * \param name Name of the algorithm
			 */
			hmacsha(std::string key, const EVP_MD*(*md)(), const std::string& name)
				: secret(std::move(key)), md(md), alg_name(name)
			{}
			/**
			 * Sign jwt data
			 * \param data The data to sign
			 * \return HMAC signature for the given data
			 * \throws signature_generation_exception
			 */
			std::string sign(const std::string& data) const {
				std::string res;
				res.resize(EVP_MAX_MD_SIZE);
				unsigned int len = res.size();
				if (HMAC(md(), secret.data(), secret.size(), (const unsigned char*)data.data(), data.size(), (unsigned char*)res.data(), &len) == nullptr)
					throw signature_generation_exception();
				// HMAC reports the actual digest length; trim the buffer to it
				res.resize(len);
				return res;
			}
			/**
			 * Check if signature is valid
			 * \param data The data to check signature against
			 * \param signature Signature provided by the jwt
			 * \throws signature_verification_exception If the provided signature does not match
			 */
			void verify(const std::string& data, const std::string& signature) const {
				try {
					auto res = sign(data);
					// Compare the full overlapping range without early exit so a
					// mismatch position is not trivially observable via timing
					// (the length difference itself is still observable).
					bool matched = true;
					for (size_t i = 0; i < std::min<size_t>(res.size(), signature.size()); i++)
						if (res[i] != signature[i])
							matched = false;
					if (res.size() != signature.size())
						matched = false;
					if (!matched)
						throw signature_verification_exception();
				}
				catch (const signature_generation_exception&) {
					throw signature_verification_exception();
				}
			}
			/**
			 * Returns the algorithm name provided to the constructor
			 * \return Algorithm name
			 */
			std::string name() const {
				return alg_name;
			}
		private:
			/// HMAC secret
			const std::string secret;
			/// HMAC hash generator
			const EVP_MD*(*md)();
			/// Algorithm name
			const std::string alg_name;
		};
+ /**
+ * Base class for RSA family of algorithms
+ */
		struct rsa {
			/**
			 * Construct new rsa algorithm
			 * \param public_key RSA public key in PEM format
			 * \param private_key RSA private key or empty string if not available. If empty, signing will always fail.
			 * \param public_key_password Password to decrypt public key pem.
			 * \param private_key_password Password to decrypt private key pem.
			 * \param md Pointer to hash function
			 * \param name Name of the algorithm
			 */
			rsa(const std::string& public_key, const std::string& private_key, const std::string& public_key_password, const std::string& private_key_password, const EVP_MD*(*md)(), const std::string& name)
				: md(md), alg_name(name)
			{
				// A private key can also verify, so it takes precedence when given.
				if (!private_key.empty()) {
					pkey = helper::load_private_key_from_string(private_key, private_key_password);
				} else if(!public_key.empty()) {
					pkey = helper::load_public_key_from_string(public_key, public_key_password);
				} else
					throw rsa_exception("at least one of public or private key need to be present");
			}
			/**
			 * Sign jwt data
			 * \param data The data to sign
			 * \return RSA signature for the given data
			 * \throws signature_generation_exception
			 */
			std::string sign(const std::string& data) const {
#ifdef OPENSSL10
				std::unique_ptr<EVP_MD_CTX, decltype(&EVP_MD_CTX_destroy)> ctx(EVP_MD_CTX_create(), EVP_MD_CTX_destroy);
#else
				std::unique_ptr<EVP_MD_CTX, decltype(&EVP_MD_CTX_free)> ctx(EVP_MD_CTX_create(), EVP_MD_CTX_free);
#endif
				if (!ctx)
					throw signature_generation_exception("failed to create signature: could not create context");
				if (!EVP_SignInit(ctx.get(), md()))
					throw signature_generation_exception("failed to create signature: SignInit failed");

				std::string res;
				// EVP_PKEY_size() is an upper bound on the signature length
				res.resize(EVP_PKEY_size(pkey.get()));
				unsigned int len = 0;

				if (!EVP_SignUpdate(ctx.get(), data.data(), data.size()))
					throw signature_generation_exception();
				if (!EVP_SignFinal(ctx.get(), (unsigned char*)res.data(), &len, pkey.get()))
					throw signature_generation_exception();

				res.resize(len);
				return res;
			}
			/**
			 * Check if signature is valid
			 * \param data The data to check signature against
			 * \param signature Signature provided by the jwt
			 * \throws signature_verification_exception If the provided signature does not match
			 */
			void verify(const std::string& data, const std::string& signature) const {
#ifdef OPENSSL10
				std::unique_ptr<EVP_MD_CTX, decltype(&EVP_MD_CTX_destroy)> ctx(EVP_MD_CTX_create(), EVP_MD_CTX_destroy);
#else
				std::unique_ptr<EVP_MD_CTX, decltype(&EVP_MD_CTX_free)> ctx(EVP_MD_CTX_create(), EVP_MD_CTX_free);
#endif
				if (!ctx)
					throw signature_verification_exception("failed to verify signature: could not create context");
				if (!EVP_VerifyInit(ctx.get(), md()))
					throw signature_verification_exception("failed to verify signature: VerifyInit failed");
				if (!EVP_VerifyUpdate(ctx.get(), data.data(), data.size()))
					throw signature_verification_exception("failed to verify signature: VerifyUpdate failed");
				auto res = EVP_VerifyFinal(ctx.get(), (const unsigned char*)signature.data(), signature.size(), pkey.get());
				// EVP_VerifyFinal returns 1 for a valid signature; 0 and
				// negative values are both failures here
				if (res != 1)
					throw signature_verification_exception("evp verify final failed: " + std::to_string(res) + " " + ERR_error_string(ERR_get_error(), NULL));
			}
			/**
			 * Returns the algorithm name provided to the constructor
			 * \return Algorithm name
			 */
			std::string name() const {
				return alg_name;
			}
		private:
			/// OpenSSL structure containing converted keys
			std::shared_ptr<EVP_PKEY> pkey;
			/// Hash generator
			const EVP_MD*(*md)();
			/// Algorithm name
			const std::string alg_name;
		};
+ /**
+ * Base class for ECDSA family of algorithms
+ */
		struct ecdsa {
			/**
			 * Construct new ecdsa algorithm
			 * \param public_key ECDSA public key in PEM format
			 * \param private_key ECDSA private key or empty string if not available. If empty, signing will always fail.
			 * \param public_key_password Password to decrypt public key pem.
			 * \param private_key_password Password to decrypt private key pem.
			 * \param md Pointer to hash function
			 * \param name Name of the algorithm
			 * \param siglen Expected length of the raw (r||s) signature in bytes
			 */
			ecdsa(const std::string& public_key, const std::string& private_key, const std::string& public_key_password, const std::string& private_key_password, const EVP_MD*(*md)(), const std::string& name, size_t siglen)
				: md(md), alg_name(name), signature_length(siglen)
			{
				if (!public_key.empty()) {
					std::unique_ptr<BIO, decltype(&BIO_free_all)> pubkey_bio(BIO_new(BIO_s_mem()), BIO_free_all);
					if(public_key.substr(0, 27) == "-----BEGIN CERTIFICATE-----") {
						auto epkey = helper::extract_pubkey_from_cert(public_key, public_key_password);
						if ((size_t)BIO_write(pubkey_bio.get(), epkey.data(), epkey.size()) != epkey.size())
							throw ecdsa_exception("failed to load public key: bio_write failed");
					} else {
						if ((size_t)BIO_write(pubkey_bio.get(), public_key.data(), public_key.size()) != public_key.size())
							throw ecdsa_exception("failed to load public key: bio_write failed");
					}

					pkey.reset(PEM_read_bio_EC_PUBKEY(pubkey_bio.get(), nullptr, nullptr, (void*)public_key_password.c_str()), EC_KEY_free);
					if (!pkey)
						throw ecdsa_exception("failed to load public key: PEM_read_bio_EC_PUBKEY failed:" + std::string(ERR_error_string(ERR_get_error(), NULL)));
					// curve degree (bits) must match the expected signature width;
					// the second clause special-cases 521-bit keys with 132-byte
					// signatures (521 bits is not divisible by 8)
					size_t keysize = EC_GROUP_get_degree(EC_KEY_get0_group(pkey.get()));
					if(keysize != signature_length*4 && (signature_length != 132 || keysize != 521))
						throw ecdsa_exception("invalid key size");
				}

				if (!private_key.empty()) {
					std::unique_ptr<BIO, decltype(&BIO_free_all)> privkey_bio(BIO_new(BIO_s_mem()), BIO_free_all);
					if ((size_t)BIO_write(privkey_bio.get(), private_key.data(), private_key.size()) != private_key.size())
						throw rsa_exception("failed to load private key: bio_write failed");
					pkey.reset(PEM_read_bio_ECPrivateKey(privkey_bio.get(), nullptr, nullptr, const_cast<char*>(private_key_password.c_str())), EC_KEY_free);
					if (!pkey)
						throw rsa_exception("failed to load private key: PEM_read_bio_ECPrivateKey failed");
					size_t keysize = EC_GROUP_get_degree(EC_KEY_get0_group(pkey.get()));
					if(keysize != signature_length*4 && (signature_length != 132 || keysize != 521))
						throw ecdsa_exception("invalid key size");
				}
				if(!pkey)
					throw rsa_exception("at least one of public or private key need to be present");

				if(EC_KEY_check_key(pkey.get()) == 0)
					throw ecdsa_exception("failed to load key: key is invalid");
			}
			/**
			 * Sign jwt data
			 * \param data The data to sign
			 * \return ECDSA signature for the given data as fixed-width
			 *         big-endian r and s concatenated (not DER)
			 * \throws signature_generation_exception
			 */
			std::string sign(const std::string& data) const {
				const std::string hash = generate_hash(data);

				std::unique_ptr<ECDSA_SIG, decltype(&ECDSA_SIG_free)>
					sig(ECDSA_do_sign((const unsigned char*)hash.data(), hash.size(), pkey.get()), ECDSA_SIG_free);
				if(!sig)
					throw signature_generation_exception();
#ifdef OPENSSL10
				// OpenSSL 1.0 exposes the signature members directly
				auto rr = bn2raw(sig->r);
				auto rs = bn2raw(sig->s);
#else
				const BIGNUM *r;
				const BIGNUM *s;
				ECDSA_SIG_get0(sig.get(), &r, &s);
				auto rr = bn2raw(r);
				auto rs = bn2raw(s);
#endif
				// left-pad each component with zero bytes to half the fixed width
				if(rr.size() > signature_length/2 || rs.size() > signature_length/2)
					throw std::logic_error("bignum size exceeded expected length");
				while(rr.size() != signature_length/2) rr = '\0' + rr;
				while(rs.size() != signature_length/2) rs = '\0' + rs;
				return rr + rs;
			}

			/**
			 * Check if signature is valid
			 * \param data The data to check signature against
			 * \param signature Signature provided by the jwt (raw r||s form)
			 * \throws signature_verification_exception If the provided signature does not match
			 */
			void verify(const std::string& data, const std::string& signature) const {
				const std::string hash = generate_hash(data);
				// split the fixed-width signature back into its r and s halves
				auto r = raw2bn(signature.substr(0, signature.size() / 2));
				auto s = raw2bn(signature.substr(signature.size() / 2));

#ifdef OPENSSL10
				ECDSA_SIG sig;
				sig.r = r.get();
				sig.s = s.get();

				if(ECDSA_do_verify((const unsigned char*)hash.data(), hash.size(), &sig, pkey.get()) != 1)
					throw signature_verification_exception("Invalid signature");
#else
				std::unique_ptr<ECDSA_SIG, decltype(&ECDSA_SIG_free)> sig(ECDSA_SIG_new(), ECDSA_SIG_free);

				// ECDSA_SIG_set0 takes ownership of r and s, hence release()
				ECDSA_SIG_set0(sig.get(), r.release(), s.release());

				if(ECDSA_do_verify((const unsigned char*)hash.data(), hash.size(), sig.get(), pkey.get()) != 1)
					throw signature_verification_exception("Invalid signature");
#endif
			}
			/**
			 * Returns the algorithm name provided to the constructor
			 * \return Algorithm name
			 */
			std::string name() const {
				return alg_name;
			}
		private:
			/**
			 * Convert a OpenSSL BIGNUM to a std::string
			 * \param bn BIGNUM to convert
			 * \return bignum as string
			 */
#ifdef OPENSSL10
			static std::string bn2raw(BIGNUM* bn)
#else
			static std::string bn2raw(const BIGNUM* bn)
#endif
			{
				std::string res;
				res.resize(BN_num_bytes(bn));
				BN_bn2bin(bn, (unsigned char*)res.data());
				return res;
			}
			/**
			 * Convert an std::string to a OpenSSL BIGNUM
			 * \param raw String to convert
			 * \return BIGNUM representation
			 */
			static std::unique_ptr<BIGNUM, decltype(&BN_free)> raw2bn(const std::string& raw) {
				return std::unique_ptr<BIGNUM, decltype(&BN_free)>(BN_bin2bn((const unsigned char*)raw.data(), raw.size(), nullptr), BN_free);
			}

			/**
			 * Hash the provided data using the hash function specified in constructor
			 * \param data Data to hash
			 * \return Hash of data
			 */
			std::string generate_hash(const std::string& data) const {
#ifdef OPENSSL10
				std::unique_ptr<EVP_MD_CTX, decltype(&EVP_MD_CTX_destroy)> ctx(EVP_MD_CTX_create(), &EVP_MD_CTX_destroy);
#else
				std::unique_ptr<EVP_MD_CTX, decltype(&EVP_MD_CTX_free)> ctx(EVP_MD_CTX_new(), EVP_MD_CTX_free);
#endif
				if(EVP_DigestInit(ctx.get(), md()) == 0)
					throw signature_generation_exception("EVP_DigestInit failed");
				if(EVP_DigestUpdate(ctx.get(), data.data(), data.size()) == 0)
					throw signature_generation_exception("EVP_DigestUpdate failed");
				unsigned int len = 0;
				std::string res;
				res.resize(EVP_MD_CTX_size(ctx.get()));
				if(EVP_DigestFinal(ctx.get(), (unsigned char*)res.data(), &len) == 0)
					throw signature_generation_exception("EVP_DigestFinal failed");
				res.resize(len);
				return res;
			}

			/// OpenSSL struct containing keys
			std::shared_ptr<EC_KEY> pkey;
			/// Hash generator function
			const EVP_MD*(*md)();
			/// Algorithm name
			const std::string alg_name;
			/// Length of the resulting signature
			const size_t signature_length;
		};
+
+ /**
+ * Base class for PSS-RSA family of algorithms
+ */
+ struct pss {
+ /**
+ * Construct new pss algorithm
+ * \param public_key RSA public key in PEM format
+ * \param private_key RSA private key or empty string if not available. If empty, signing will always fail.
+ * \param public_key_password Password to decrypt public key pem.
+ * \param privat_key_password Password to decrypt private key pem.
+ * \param md Pointer to hash function
+ * \param name Name of the algorithm
+ */
+ pss(const std::string& public_key, const std::string& private_key, const std::string& public_key_password, const std::string& private_key_password, const EVP_MD*(*md)(), const std::string& name)
+ : md(md), alg_name(name)
+ {
+ if (!private_key.empty()) {
+ pkey = helper::load_private_key_from_string(private_key, private_key_password);
+ } else if(!public_key.empty()) {
+ pkey = helper::load_public_key_from_string(public_key, public_key_password);
+ } else
+ throw rsa_exception("at least one of public or private key need to be present");
+ }
+ /**
+ * Sign jwt data
+ * \param data The data to sign
+ * \return ECDSA signature for the given data
+ * \throws signature_generation_exception
+ */
+ std::string sign(const std::string& data) const {
+ auto hash = this->generate_hash(data);
+
+ std::unique_ptr<RSA, decltype(&RSA_free)> key(EVP_PKEY_get1_RSA(pkey.get()), RSA_free);
+ const int size = RSA_size(key.get());
+
+ std::string padded(size, 0x00);
+ if (!RSA_padding_add_PKCS1_PSS_mgf1(key.get(), (unsigned char*)padded.data(), (const unsigned char*)hash.data(), md(), md(), -1))
+ throw signature_generation_exception("failed to create signature: RSA_padding_add_PKCS1_PSS_mgf1 failed");
+
+ std::string res(size, 0x00);
+ if (RSA_private_encrypt(size, (const unsigned char*)padded.data(), (unsigned char*)res.data(), key.get(), RSA_NO_PADDING) < 0)
+ throw signature_generation_exception("failed to create signature: RSA_private_encrypt failed");
+ return res;
+ }
+ /**
+ * Check if signature is valid
+ * \param data The data to check signature against
+ * \param signature Signature provided by the jwt
+ * \throws signature_verification_exception If the provided signature does not match
+ */
+ void verify(const std::string& data, const std::string& signature) const {
+ auto hash = this->generate_hash(data);
+
+ std::unique_ptr<RSA, decltype(&RSA_free)> key(EVP_PKEY_get1_RSA(pkey.get()), RSA_free);
+ const int size = RSA_size(key.get());
+
+ std::string sig(size, 0x00);
+ if(!RSA_public_decrypt(signature.size(), (const unsigned char*)signature.data(), (unsigned char*)sig.data(), key.get(), RSA_NO_PADDING))
+ throw signature_verification_exception("Invalid signature");
+
+ if(!RSA_verify_PKCS1_PSS_mgf1(key.get(), (const unsigned char*)hash.data(), md(), md(), (const unsigned char*)sig.data(), -1))
+ throw signature_verification_exception("Invalid signature");
+ }
+ /**
+ * Returns the algorithm name provided to the constructor
+ * \return Algorithmname
+ */
+ std::string name() const {
+ return alg_name;
+ }
private:
	/**
	 * Hash the provided data using the digest selected in the constructor.
	 * \param data Data to hash
	 * \return Raw binary digest of data
	 * \throws signature_generation_exception if any EVP digest step fails
	 */
	std::string generate_hash(const std::string& data) const {
#ifdef OPENSSL10
		// OpenSSL 1.0 spells the context allocator/deleter differently.
		std::unique_ptr<EVP_MD_CTX, decltype(&EVP_MD_CTX_destroy)> ctx(EVP_MD_CTX_create(), &EVP_MD_CTX_destroy);
#else
		std::unique_ptr<EVP_MD_CTX, decltype(&EVP_MD_CTX_free)> ctx(EVP_MD_CTX_new(), EVP_MD_CTX_free);
#endif
		// All EVP_* calls return 1 on success, 0 on failure.
		if(EVP_DigestInit(ctx.get(), md()) == 0)
			throw signature_generation_exception("EVP_DigestInit failed");
		if(EVP_DigestUpdate(ctx.get(), data.data(), data.size()) == 0)
			throw signature_generation_exception("EVP_DigestUpdate failed");
		unsigned int len = 0;
		std::string res;
		// Reserve the digest size up front; EVP_DigestFinal reports the actual length in len.
		res.resize(EVP_MD_CTX_size(ctx.get()));
		if(EVP_DigestFinal(ctx.get(), (unsigned char*)res.data(), &len) == 0)
			throw signature_generation_exception("EVP_DigestFinal failed");
		res.resize(len);
		return res;
	}

	/// OpenSSL key container holding the RSA public and/or private key
	std::shared_ptr<EVP_PKEY> pkey;
	/// Returns the EVP digest (e.g. EVP_sha256) used for both hashing and MGF1
	const EVP_MD*(*md)();
	/// JWT "alg" header value reported by name() (e.g. "PS256")
	const std::string alg_name;
};
+
+ /**
+ * HS256 algorithm
+ */
+ struct hs256 : public hmacsha {
+ /**
+ * Construct new instance of algorithm
+ * \param key HMAC signing key
+ */
+ explicit hs256(std::string key)
+ : hmacsha(std::move(key), EVP_sha256, "HS256")
+ {}
+ };
+ /**
+ * HS384 algorithm
+ */
+ struct hs384 : public hmacsha {
+ /**
+ * Construct new instance of algorithm
+ * \param key HMAC signing key
+ */
+ explicit hs384(std::string key)
+ : hmacsha(std::move(key), EVP_sha384, "HS384")
+ {}
+ };
+ /**
+ * HS512 algorithm
+ */
+ struct hs512 : public hmacsha {
+ /**
+ * Construct new instance of algorithm
+ * \param key HMAC signing key
+ */
+ explicit hs512(std::string key)
+ : hmacsha(std::move(key), EVP_sha512, "HS512")
+ {}
+ };
+ /**
+ * RS256 algorithm
+ */
+ struct rs256 : public rsa {
+ /**
+ * Construct new instance of algorithm
+ * \param public_key RSA public key in PEM format
+ * \param private_key RSA private key or empty string if not available. If empty, signing will always fail.
+ * \param public_key_password Password to decrypt public key pem.
+ * \param privat_key_password Password to decrypt private key pem.
+ */
+ explicit rs256(const std::string& public_key, const std::string& private_key = "", const std::string& public_key_password = "", const std::string& private_key_password = "")
+ : rsa(public_key, private_key, public_key_password, private_key_password, EVP_sha256, "RS256")
+ {}
+ };
+ /**
+ * RS384 algorithm
+ */
+ struct rs384 : public rsa {
+ /**
+ * Construct new instance of algorithm
+ * \param public_key RSA public key in PEM format
+ * \param private_key RSA private key or empty string if not available. If empty, signing will always fail.
+ * \param public_key_password Password to decrypt public key pem.
+ * \param privat_key_password Password to decrypt private key pem.
+ */
+ explicit rs384(const std::string& public_key, const std::string& private_key = "", const std::string& public_key_password = "", const std::string& private_key_password = "")
+ : rsa(public_key, private_key, public_key_password, private_key_password, EVP_sha384, "RS384")
+ {}
+ };
+ /**
+ * RS512 algorithm
+ */
+ struct rs512 : public rsa {
+ /**
+ * Construct new instance of algorithm
+ * \param public_key RSA public key in PEM format
+ * \param private_key RSA private key or empty string if not available. If empty, signing will always fail.
+ * \param public_key_password Password to decrypt public key pem.
+ * \param privat_key_password Password to decrypt private key pem.
+ */
+ explicit rs512(const std::string& public_key, const std::string& private_key = "", const std::string& public_key_password = "", const std::string& private_key_password = "")
+ : rsa(public_key, private_key, public_key_password, private_key_password, EVP_sha512, "RS512")
+ {}
+ };
+ /**
+ * ES256 algorithm
+ */
+ struct es256 : public ecdsa {
+ /**
+ * Construct new instance of algorithm
+ * \param public_key ECDSA public key in PEM format
+ * \param private_key ECDSA private key or empty string if not available. If empty, signing will always fail.
+ * \param public_key_password Password to decrypt public key pem.
+ * \param privat_key_password Password to decrypt private key pem.
+ */
+ explicit es256(const std::string& public_key, const std::string& private_key = "", const std::string& public_key_password = "", const std::string& private_key_password = "")
+ : ecdsa(public_key, private_key, public_key_password, private_key_password, EVP_sha256, "ES256", 64)
+ {}
+ };
+ /**
+ * ES384 algorithm
+ */
+ struct es384 : public ecdsa {
+ /**
+ * Construct new instance of algorithm
+ * \param public_key ECDSA public key in PEM format
+ * \param private_key ECDSA private key or empty string if not available. If empty, signing will always fail.
+ * \param public_key_password Password to decrypt public key pem.
+ * \param privat_key_password Password to decrypt private key pem.
+ */
+ explicit es384(const std::string& public_key, const std::string& private_key = "", const std::string& public_key_password = "", const std::string& private_key_password = "")
+ : ecdsa(public_key, private_key, public_key_password, private_key_password, EVP_sha384, "ES384", 96)
+ {}
+ };
+ /**
+ * ES512 algorithm
+ */
+ struct es512 : public ecdsa {
+ /**
+ * Construct new instance of algorithm
+ * \param public_key ECDSA public key in PEM format
+ * \param private_key ECDSA private key or empty string if not available. If empty, signing will always fail.
+ * \param public_key_password Password to decrypt public key pem.
+ * \param privat_key_password Password to decrypt private key pem.
+ */
+ explicit es512(const std::string& public_key, const std::string& private_key = "", const std::string& public_key_password = "", const std::string& private_key_password = "")
+ : ecdsa(public_key, private_key, public_key_password, private_key_password, EVP_sha512, "ES512", 132)
+ {}
+ };
+
+ /**
+ * PS256 algorithm
+ */
+ struct ps256 : public pss {
+ /**
+ * Construct new instance of algorithm
+ * \param public_key RSA public key in PEM format
+ * \param private_key RSA private key or empty string if not available. If empty, signing will always fail.
+ * \param public_key_password Password to decrypt public key pem.
+ * \param privat_key_password Password to decrypt private key pem.
+ */
+ explicit ps256(const std::string& public_key, const std::string& private_key = "", const std::string& public_key_password = "", const std::string& private_key_password = "")
+ : pss(public_key, private_key, public_key_password, private_key_password, EVP_sha256, "PS256")
+ {}
+ };
+ /**
+ * PS384 algorithm
+ */
+ struct ps384 : public pss {
+ /**
+ * Construct new instance of algorithm
+ * \param public_key RSA public key in PEM format
+ * \param private_key RSA private key or empty string if not available. If empty, signing will always fail.
+ * \param public_key_password Password to decrypt public key pem.
+ * \param privat_key_password Password to decrypt private key pem.
+ */
+ explicit ps384(const std::string& public_key, const std::string& private_key = "", const std::string& public_key_password = "", const std::string& private_key_password = "")
+ : pss(public_key, private_key, public_key_password, private_key_password, EVP_sha384, "PS384")
+ {}
+ };
+ /**
+ * PS512 algorithm
+ */
+ struct ps512 : public pss {
+ /**
+ * Construct new instance of algorithm
+ * \param public_key RSA public key in PEM format
+ * \param private_key RSA private key or empty string if not available. If empty, signing will always fail.
+ * \param public_key_password Password to decrypt public key pem.
+ * \param privat_key_password Password to decrypt private key pem.
+ */
+ explicit ps512(const std::string& public_key, const std::string& private_key = "", const std::string& public_key_password = "", const std::string& private_key_password = "")
+ : pss(public_key, private_key, public_key_password, private_key_password, EVP_sha512, "PS512")
+ {}
+ };
+ }
+
/**
 * Convenience wrapper around a picojson JSON value with typed accessors.
 */
class claim {
	picojson::value val;
public:
	/// JSON value kinds a claim can hold
	enum class type {
		null,
		boolean,
		number,
		string,
		array,
		object,
		int64
	};

	/// Construct an empty (null) claim
	claim()
		: val()
	{}
#if JWT_CLAIM_EXPLICIT
	/// Construct a string claim
	explicit claim(std::string s)
		: val(std::move(s))
	{}
	/// Construct a numeric-date claim, stored as seconds since the unix epoch
	explicit claim(const date& s)
		: val(int64_t(std::chrono::system_clock::to_time_t(s)))
	{}
	/// Construct an array claim from a set of strings
	explicit claim(const std::set<std::string>& s)
		: val(picojson::array(s.cbegin(), s.cend()))
	{}
	/// Wrap an arbitrary picojson value
	explicit claim(const picojson::value& val)
		: val(val)
	{}
#else
	/// Construct a string claim
	claim(std::string s)
		: val(std::move(s))
	{}
	/// Construct a numeric-date claim, stored as seconds since the unix epoch
	claim(const date& s)
		: val(int64_t(std::chrono::system_clock::to_time_t(s)))
	{}
	/// Construct an array claim from a set of strings
	claim(const std::set<std::string>& s)
		: val(picojson::array(s.cbegin(), s.cend()))
	{}
	/// Wrap an arbitrary picojson value
	claim(const picojson::value& val)
		: val(val)
	{}
#endif

	/// Construct an array claim from an iterator range (elements must be
	/// convertible to picojson::value)
	template<typename Iterator>
	claim(Iterator start, Iterator end)
		: val(picojson::array())
	{
		auto& arr = val.get<picojson::array>();
		for(; start != end; start++) {
			arr.push_back(picojson::value(*start));
		}
	}

	/**
	 * Get wrapped json object
	 * \return Wrapped json object (by value)
	 */
	picojson::value to_json() const {
		return val;
	}

	/**
	 * Get type of contained object.
	 * Note the int64 check precedes the number check, so integral JSON numbers
	 * report type::int64 rather than type::number.
	 * \return Type
	 * \throws std::logic_error An internal error occurred
	 */
	type get_type() const {
		if (val.is<picojson::null>()) return type::null;
		else if (val.is<bool>()) return type::boolean;
		else if (val.is<int64_t>()) return type::int64;
		else if (val.is<double>()) return type::number;
		else if (val.is<std::string>()) return type::string;
		else if (val.is<picojson::array>()) return type::array;
		else if (val.is<picojson::object>()) return type::object;
		else throw std::logic_error("internal error");
	}

	/**
	 * Get the contained object as a string
	 * \return content as string
	 * \throws std::bad_cast Content was not a string
	 */
	const std::string& as_string() const {
		if (!val.is<std::string>())
			throw std::bad_cast();
		return val.get<std::string>();
	}
	/**
	 * Get the contained object as a date.
	 * Delegates to as_int(), so the value must be an integral number of epoch
	 * seconds; a floating-point date throws std::bad_cast.
	 * \return content as date
	 * \throws std::bad_cast Content was not a date
	 */
	date as_date() const {
		return std::chrono::system_clock::from_time_t(as_int());
	}
	/**
	 * Get the contained object as an array
	 * \return content as array
	 * \throws std::bad_cast Content was not an array
	 */
	const picojson::array& as_array() const {
		if (!val.is<picojson::array>())
			throw std::bad_cast();
		return val.get<picojson::array>();
	}
	/**
	 * Get the contained object as a set of strings.
	 * Every element of the underlying array must itself be a string.
	 * \return content as set of strings
	 * \throws std::bad_cast Content was not an array of strings
	 */
	const std::set<std::string> as_set() const {
		std::set<std::string> res;
		for(auto& e : as_array()) {
			if(!e.is<std::string>())
				throw std::bad_cast();
			res.insert(e.get<std::string>());
		}
		return res;
	}
	/**
	 * Get the contained object as an integer
	 * \return content as int
	 * \throws std::bad_cast Content was not an int
	 */
	int64_t as_int() const {
		if (!val.is<int64_t>())
			throw std::bad_cast();
		return val.get<int64_t>();
	}
	/**
	 * Get the contained object as a bool
	 * \return content as bool
	 * \throws std::bad_cast Content was not a bool
	 */
	bool as_bool() const {
		if (!val.is<bool>())
			throw std::bad_cast();
		return val.get<bool>();
	}
	/**
	 * Get the contained object as a number.
	 * NOTE(review): values that picojson holds as int64 do not satisfy
	 * is<double>() here and will throw — presumably callers use as_int()
	 * for integral values; confirm against picojson's INT64 configuration.
	 * \return content as double
	 * \throws std::bad_cast Content was not a number
	 */
	double as_number() const {
		if (!val.is<double>())
			throw std::bad_cast();
		return val.get<double>();
	}
	/**
	 * Get the contained object as an object
	 * \return content as object
	 * \throws std::bad_cast Content was not an object
	 */
	const picojson::object& as_object() const {
		if (!val.is<picojson::object>())
			throw std::bad_cast();
		return val.get<picojson::object>();
	}
};
+
/**
 * Base class that represents a token payload.
 * Contains convenience accessors for the registered claims of RFC 7519.
 */
class payload {
protected:
	// Raw claim storage, keyed by claim name; filled by decoded_jwt's parser.
	std::unordered_map<std::string, claim> payload_claims;
public:
	/**
	 * Check if issuer is present ("iss")
	 * \return true if present, false otherwise
	 */
	bool has_issuer() const noexcept { return has_payload_claim("iss"); }
	/**
	 * Check if subject is present ("sub")
	 * \return true if present, false otherwise
	 */
	bool has_subject() const noexcept { return has_payload_claim("sub"); }
	/**
	 * Check if audience is present ("aud")
	 * \return true if present, false otherwise
	 */
	bool has_audience() const noexcept { return has_payload_claim("aud"); }
	/**
	 * Check if expires is present ("exp")
	 * \return true if present, false otherwise
	 */
	bool has_expires_at() const noexcept { return has_payload_claim("exp"); }
	/**
	 * Check if not before is present ("nbf")
	 * \return true if present, false otherwise
	 */
	bool has_not_before() const noexcept { return has_payload_claim("nbf"); }
	/**
	 * Check if issued at is present ("iat")
	 * \return true if present, false otherwise
	 */
	bool has_issued_at() const noexcept { return has_payload_claim("iat"); }
	/**
	 * Check if token id is present ("jti")
	 * \return true if present, false otherwise
	 */
	bool has_id() const noexcept { return has_payload_claim("jti"); }
	/**
	 * Get issuer claim
	 * \return issuer as string
	 * \throws std::runtime_error If claim was not present
	 * \throws std::bad_cast Claim was present but not a string (should not happen in a valid token)
	 */
	const std::string& get_issuer() const { return get_payload_claim("iss").as_string(); }
	/**
	 * Get subject claim
	 * \return subject as string
	 * \throws std::runtime_error If claim was not present
	 * \throws std::bad_cast Claim was present but not a string (should not happen in a valid token)
	 */
	const std::string& get_subject() const { return get_payload_claim("sub").as_string(); }
	/**
	 * Get audience claim.
	 * Handles both encodings RFC 7519 allows: a single string (returned as a
	 * one-element set) or an array of strings.
	 * \return audience as a set of strings
	 * \throws std::runtime_error If claim was not present
	 * \throws std::bad_cast Claim was present but neither string nor string array
	 */
	std::set<std::string> get_audience() const {
		auto aud = get_payload_claim("aud");
		if(aud.get_type() == jwt::claim::type::string) return { aud.as_string()};
		else return aud.as_set();
	}
	/**
	 * Get expires claim
	 * \return expires as a date in utc
	 * \throws std::runtime_error If claim was not present
	 * \throws std::bad_cast Claim was present but not a date (should not happen in a valid token)
	 */
	const date get_expires_at() const { return get_payload_claim("exp").as_date(); }
	/**
	 * Get not valid before claim
	 * \return nbf date in utc
	 * \throws std::runtime_error If claim was not present
	 * \throws std::bad_cast Claim was present but not a date (should not happen in a valid token)
	 */
	const date get_not_before() const { return get_payload_claim("nbf").as_date(); }
	/**
	 * Get issued at claim
	 * \return issued at as date in utc
	 * \throws std::runtime_error If claim was not present
	 * \throws std::bad_cast Claim was present but not a date (should not happen in a valid token)
	 */
	const date get_issued_at() const { return get_payload_claim("iat").as_date(); }
	/**
	 * Get id claim
	 * \return id as string
	 * \throws std::runtime_error If claim was not present
	 * \throws std::bad_cast Claim was present but not a string (should not happen in a valid token)
	 */
	const std::string& get_id() const { return get_payload_claim("jti").as_string(); }
	/**
	 * Check if a payload claim is present
	 * \return true if claim was present, false otherwise
	 */
	bool has_payload_claim(const std::string& name) const noexcept { return payload_claims.count(name) != 0; }
	/**
	 * Get payload claim
	 * \return Requested claim
	 * \throws std::runtime_error If claim was not present
	 */
	const claim& get_payload_claim(const std::string& name) const {
		if (!has_payload_claim(name))
			throw std::runtime_error("claim not found");
		return payload_claims.at(name);
	}
	/**
	 * Get all payload claims
	 * \return map of claims (returned by value, i.e. a copy)
	 */
	std::unordered_map<std::string, claim> get_payload_claims() const { return payload_claims; }
};
+
/**
 * Base class that represents a token header.
 * Contains convenience accessors for the registered header parameters.
 */
class header {
protected:
	// Raw claim storage, keyed by claim name; filled by decoded_jwt's parser.
	std::unordered_map<std::string, claim> header_claims;
public:
	/**
	 * Check if algorithm is present ("alg")
	 * \return true if present, false otherwise
	 */
	bool has_algorithm() const noexcept { return has_header_claim("alg"); }
	/**
	 * Check if type is present ("typ")
	 * \return true if present, false otherwise
	 */
	bool has_type() const noexcept { return has_header_claim("typ"); }
	/**
	 * Check if content type is present ("cty")
	 * \return true if present, false otherwise
	 */
	bool has_content_type() const noexcept { return has_header_claim("cty"); }
	/**
	 * Check if key id is present ("kid")
	 * \return true if present, false otherwise
	 */
	bool has_key_id() const noexcept { return has_header_claim("kid"); }
	/**
	 * Get algorithm claim
	 * \return algorithm as string
	 * \throws std::runtime_error If claim was not present
	 * \throws std::bad_cast Claim was present but not a string (should not happen in a valid token)
	 */
	const std::string& get_algorithm() const { return get_header_claim("alg").as_string(); }
	/**
	 * Get type claim
	 * \return type as a string
	 * \throws std::runtime_error If claim was not present
	 * \throws std::bad_cast Claim was present but not a string (should not happen in a valid token)
	 */
	const std::string& get_type() const { return get_header_claim("typ").as_string(); }
	/**
	 * Get content type claim
	 * \return content type as string
	 * \throws std::runtime_error If claim was not present
	 * \throws std::bad_cast Claim was present but not a string (should not happen in a valid token)
	 */
	const std::string& get_content_type() const { return get_header_claim("cty").as_string(); }
	/**
	 * Get key id claim
	 * \return key id as string
	 * \throws std::runtime_error If claim was not present
	 * \throws std::bad_cast Claim was present but not a string (should not happen in a valid token)
	 */
	const std::string& get_key_id() const { return get_header_claim("kid").as_string(); }
	/**
	 * Check if a header claim is present
	 * \return true if claim was present, false otherwise
	 */
	bool has_header_claim(const std::string& name) const noexcept { return header_claims.count(name) != 0; }
	/**
	 * Get header claim
	 * \return Requested claim
	 * \throws std::runtime_error If claim was not present
	 */
	const claim& get_header_claim(const std::string& name) const {
		if (!has_header_claim(name))
			throw std::runtime_error("claim not found");
		return header_claims.at(name);
	}
	/**
	 * Get all header claims
	 * \return map of claims (returned by value, i.e. a copy)
	 */
	std::unordered_map<std::string, claim> get_header_claims() const { return header_claims; }
};
+
/**
 * Class containing all information about a decoded token.
 * NOTE(review): the members `header`/`payload` below deliberately shadow the
 * base classes jwt::header/jwt::payload inside this class scope.
 */
class decoded_jwt : public header, public payload {
protected:
	/// Unmodified token, as passed to constructor
	const std::string token;
	/// Header part decoded from base64
	std::string header;
	/// Unmodified header part in base64
	std::string header_base64;
	/// Payload part decoded from base64
	std::string payload;
	/// Unmodified payload part in base64
	std::string payload_base64;
	/// Signature part decoded from base64
	std::string signature;
	/// Unmodified signature part in base64
	std::string signature_base64;
public:
	/**
	 * Constructor
	 * Splits the token on the two '.' separators, re-adds base64 padding,
	 * decodes each part and parses header and payload JSON into claim maps.
	 * \param token The token to parse
	 * \throws std::invalid_argument Token is not in correct format
	 * \throws std::runtime_error Base64 decoding failed or invalid json
	 */
	explicit decoded_jwt(const std::string& token)
		: token(token)
	{
		auto hdr_end = token.find('.');
		if (hdr_end == std::string::npos)
			throw std::invalid_argument("invalid token supplied");
		auto payload_end = token.find('.', hdr_end + 1);
		if (payload_end == std::string::npos)
			throw std::invalid_argument("invalid token supplied");
		header = header_base64 = token.substr(0, hdr_end);
		payload = payload_base64 = token.substr(hdr_end + 1, payload_end - hdr_end - 1);
		signature = signature_base64 = token.substr(payload_end + 1);

		// Fix padding: JWT requires padding to get removed, so add fill
		// characters back until the length is a multiple of 4. Each case
		// intentionally falls through, appending one fill per step.
		auto fix_padding = [](std::string& str) {
			switch (str.size() % 4) {
			case 1:
				str += alphabet::base64url::fill();
#ifdef __has_cpp_attribute
#if __has_cpp_attribute(fallthrough)
				[[fallthrough]];
#endif
#endif
			case 2:
				str += alphabet::base64url::fill();
#ifdef __has_cpp_attribute
#if __has_cpp_attribute(fallthrough)
				[[fallthrough]];
#endif
#endif
			case 3:
				str += alphabet::base64url::fill();
#ifdef __has_cpp_attribute
#if __has_cpp_attribute(fallthrough)
				[[fallthrough]];
#endif
#endif
			default:
				break;
			}
		};
		fix_padding(header);
		fix_padding(payload);
		fix_padding(signature);

		header = base::decode<alphabet::base64url>(header);
		payload = base::decode<alphabet::base64url>(payload);
		signature = base::decode<alphabet::base64url>(signature);

		// Parse a JSON object string into a name -> claim map.
		auto parse_claims = [](const std::string& str) {
			std::unordered_map<std::string, claim> res;
			picojson::value val;
			// picojson::parse returns a non-empty error string on failure.
			if (!picojson::parse(val, str).empty())
				throw std::runtime_error("Invalid json");

			for (auto& e : val.get<picojson::object>()) { res.insert({ e.first, claim(e.second) }); }

			return res;
		};

		header_claims = parse_claims(header);
		payload_claims = parse_claims(payload);
	}

	/**
	 * Get token string, as passed to constructor
	 * \return token as passed to constructor
	 */
	const std::string& get_token() const noexcept { return token; }
	/**
	 * Get header part as json string
	 * \return header part after base64 decoding
	 */
	const std::string& get_header() const noexcept { return header; }
	/**
	 * Get payload part as json string
	 * \return payload part after base64 decoding
	 */
	const std::string& get_payload() const noexcept { return payload; }
	/**
	 * Get signature part (raw bytes)
	 * \return signature part after base64 decoding
	 */
	const std::string& get_signature() const noexcept { return signature; }
	/**
	 * Get header part as base64 string
	 * \return header part before base64 decoding
	 */
	const std::string& get_header_base64() const noexcept { return header_base64; }
	/**
	 * Get payload part as base64 string
	 * \return payload part before base64 decoding
	 */
	const std::string& get_payload_base64() const noexcept { return payload_base64; }
	/**
	 * Get signature part as base64 string
	 * \return signature part before base64 decoding
	 */
	const std::string& get_signature_base64() const noexcept { return signature_base64; }

};
+
+ /**
+ * Builder class to build and sign a new token
+ * Use jwt::create() to get an instance of this class.
+ */
+ class builder {
+ std::unordered_map<std::string, claim> header_claims;
+ std::unordered_map<std::string, claim> payload_claims;
+
+ builder() {}
+ friend builder create();
+ public:
+ /**
+ * Set a header claim.
+ * \param id Name of the claim
+ * \param c Claim to add
+ * \return *this to allow for method chaining
+ */
+ builder& set_header_claim(const std::string& id, claim c) { header_claims[id] = std::move(c); return *this; }
+ /**
+ * Set a payload claim.
+ * \param id Name of the claim
+ * \param c Claim to add
+ * \return *this to allow for method chaining
+ */
+ builder& set_payload_claim(const std::string& id, claim c) { payload_claims[id] = std::move(c); return *this; }
+ /**
+ * Set algorithm claim
+ * You normally don't need to do this, as the algorithm is automatically set if you don't change it.
+ * \param str Name of algorithm
+ * \return *this to allow for method chaining
+ */
+ builder& set_algorithm(const std::string& str) { return set_header_claim("alg", claim(str)); }
+ /**
+ * Set type claim
+ * \param str Type to set
+ * \return *this to allow for method chaining
+ */
+ builder& set_type(const std::string& str) { return set_header_claim("typ", claim(str)); }
+ /**
+ * Set content type claim
+ * \param str Type to set
+ * \return *this to allow for method chaining
+ */
+ builder& set_content_type(const std::string& str) { return set_header_claim("cty", claim(str)); }
+ /**
+ * Set key id claim
+ * \param str Key id to set
+ * \return *this to allow for method chaining
+ */
+ builder& set_key_id(const std::string& str) { return set_header_claim("kid", claim(str)); }
+ /**
+ * Set issuer claim
+ * \param str Issuer to set
+ * \return *this to allow for method chaining
+ */
+ builder& set_issuer(const std::string& str) { return set_payload_claim("iss", claim(str)); }
+ /**
+ * Set subject claim
+ * \param str Subject to set
+ * \return *this to allow for method chaining
+ */
+ builder& set_subject(const std::string& str) { return set_payload_claim("sub", claim(str)); }
+ /**
+ * Set audience claim
+ * \param l Audience set
+ * \return *this to allow for method chaining
+ */
+ builder& set_audience(const std::set<std::string>& l) { return set_payload_claim("aud", claim(l)); }
+ /**
+ * Set audience claim
+ * \param aud Single audience
+ * \return *this to allow for method chaining
+ */
+ builder& set_audience(const std::string& aud) { return set_payload_claim("aud", claim(aud)); }
+ /**
+ * Set expires at claim
+ * \param d Expires time
+ * \return *this to allow for method chaining
+ */
+ builder& set_expires_at(const date& d) { return set_payload_claim("exp", claim(d)); }
+ /**
+ * Set not before claim
+ * \param d First valid time
+ * \return *this to allow for method chaining
+ */
+ builder& set_not_before(const date& d) { return set_payload_claim("nbf", claim(d)); }
+ /**
+ * Set issued at claim
+ * \param d Issued at time, should be current time
+ * \return *this to allow for method chaining
+ */
+ builder& set_issued_at(const date& d) { return set_payload_claim("iat", claim(d)); }
+ /**
+ * Set id claim
+ * \param str ID to set
+ * \return *this to allow for method chaining
+ */
+ builder& set_id(const std::string& str) { return set_payload_claim("jti", claim(str)); }
+
+ /**
+ * Sign token and return result
+ * \param algo Instance of an algorithm to sign the token with
+ * \return Final token as a string
+ */
+ template<typename T>
+ std::string sign(const T& algo) const {
+ picojson::object obj_header;
+ obj_header["alg"] = picojson::value(algo.name());
+ for (auto& e : header_claims) {
+ obj_header[e.first] = e.second.to_json();
+ }
+ picojson::object obj_payload;
+ for (auto& e : payload_claims) {
+ obj_payload.insert({ e.first, e.second.to_json() });
+ }
+
+ auto encode = [](const std::string& data) {
+ auto base = base::encode<alphabet::base64url>(data);
+ auto pos = base.find(alphabet::base64url::fill());
+ base = base.substr(0, pos);
+ return base;
+ };
+
+ std::string header = encode(picojson::value(obj_header).serialize());
+ std::string payload = encode(picojson::value(obj_payload).serialize());
+
+ std::string token = header + "." + payload;
+
+ return token + "." + encode(algo.sign(token));
+ }
+ };
+
/**
 * Verifier class used to check if a decoded token contains all claims required by your
 * application and has a valid signature.
 * \tparam Clock type providing now() used for the exp/nbf/iat checks
 */
template<typename Clock>
class verifier {
	/// Type-erased interface over a concrete verification algorithm
	struct algo_base {
		virtual ~algo_base() {}
		virtual void verify(const std::string& data, const std::string& sig) = 0;
	};
	/// Adapter holding a concrete algorithm behind the algo_base interface
	template<typename T>
	struct algo : public algo_base {
		T alg;
		explicit algo(T a) : alg(a) {}
		virtual void verify(const std::string& data, const std::string& sig) override {
			alg.verify(data, sig);
		}
	};

	/// Required claims; for "exp"/"nbf"/"iat" the stored date encodes a
	/// per-claim leeway in seconds rather than an expected value.
	std::unordered_map<std::string, claim> claims;
	/// Leeway time (seconds) for exp, nbf and iat when no per-claim leeway is set
	size_t default_leeway = 0;
	/// Instance of clock type
	Clock clock;
	/// Supported algorithms, keyed by their "alg" name
	std::unordered_map<std::string, std::shared_ptr<algo_base>> algs;
public:
	/**
	 * Constructor for building a new verifier instance
	 * \param c Clock instance
	 */
	explicit verifier(Clock c) : clock(c) {}

	/**
	 * Set default leeway to use.
	 * \param leeway Default leeway to use if not specified otherwise
	 * \return *this to allow chaining
	 */
	verifier& leeway(size_t leeway) { default_leeway = leeway; return *this; }
	/**
	 * Set leeway for expires at.
	 * If not specified the default leeway will be used.
	 * Internally stored as a date claim holding `leeway` epoch seconds.
	 * \param leeway Set leeway to use for expires at.
	 * \return *this to allow chaining
	 */
	verifier& expires_at_leeway(size_t leeway) { return with_claim("exp", claim(std::chrono::system_clock::from_time_t(leeway))); }
	/**
	 * Set leeway for not before.
	 * If not specified the default leeway will be used.
	 * \param leeway Set leeway to use for not before.
	 * \return *this to allow chaining
	 */
	verifier& not_before_leeway(size_t leeway) { return with_claim("nbf", claim(std::chrono::system_clock::from_time_t(leeway))); }
	/**
	 * Set leeway for issued at.
	 * If not specified the default leeway will be used.
	 * \param leeway Set leeway to use for issued at.
	 * \return *this to allow chaining
	 */
	verifier& issued_at_leeway(size_t leeway) { return with_claim("iat", claim(std::chrono::system_clock::from_time_t(leeway))); }
	/**
	 * Set an issuer to check for.
	 * Check is case-sensitive.
	 * \param iss Issuer to check for.
	 * \return *this to allow chaining
	 */
	verifier& with_issuer(const std::string& iss) { return with_claim("iss", claim(iss)); }
	/**
	 * Set a subject to check for.
	 * Check is case-sensitive.
	 * \param sub Subject to check for.
	 * \return *this to allow chaining
	 */
	verifier& with_subject(const std::string& sub) { return with_claim("sub", claim(sub)); }
	/**
	 * Set an audience to check for.
	 * If any of the specified audiences is not present in the token the check fails.
	 * \param aud Audience to check for.
	 * \return *this to allow chaining
	 */
	verifier& with_audience(const std::set<std::string>& aud) { return with_claim("aud", claim(aud)); }
	/**
	 * Set an id to check for.
	 * Check is case-sensitive.
	 * \param id ID to check for.
	 * \return *this to allow chaining
	 */
	verifier& with_id(const std::string& id) { return with_claim("jti", claim(id)); }
	/**
	 * Specify a claim to check for.
	 * \param name Name of the claim to check for
	 * \param c Claim to check for
	 * \return *this to allow chaining
	 */
	verifier& with_claim(const std::string& name, claim c) { claims[name] = c; return *this; }

	/**
	 * Add an algorithm available for checking.
	 * \param alg Algorithm to allow
	 * \return *this to allow chaining
	 */
	template<typename Algorithm>
	verifier& allow_algorithm(Algorithm alg) {
		algs[alg.name()] = std::make_shared<algo<Algorithm>>(alg);
		return *this;
	}

	/**
	 * Verify the given token: signature first, then time-based claims, then
	 * every other required claim.
	 * \param jwt Token to check
	 * \throws token_verification_exception Verification failed
	 */
	void verify(const decoded_jwt& jwt) const {
		// The signing input is the untouched base64 header and payload.
		const std::string data = jwt.get_header_base64() + "." + jwt.get_payload_base64();
		const std::string sig = jwt.get_signature();
		const std::string& algo = jwt.get_algorithm();
		if (algs.count(algo) == 0)
			throw token_verification_exception("wrong algorithm");
		algs.at(algo)->verify(data, sig);

		// Compare one expected claim against the token, matching on type first.
		auto assert_claim_eq = [](const decoded_jwt& jwt, const std::string& key, const claim& c) {
			if (!jwt.has_payload_claim(key))
				throw token_verification_exception("decoded_jwt is missing " + key + " claim");
			auto& jc = jwt.get_payload_claim(key);
			if (jc.get_type() != c.get_type())
				throw token_verification_exception("claim " + key + " type mismatch");
			if (c.get_type() == claim::type::int64) {
				// int64 claims are compared as dates (epoch seconds).
				if (c.as_date() != jc.as_date())
					throw token_verification_exception("claim " + key + " does not match expected");
			}
			else if (c.get_type() == claim::type::array) {
				// std::set iteration is ordered, so element-wise comparison
				// of equal-sized sets is a set-equality check.
				auto s1 = c.as_set();
				auto s2 = jc.as_set();
				if (s1.size() != s2.size())
					throw token_verification_exception("claim " + key + " does not match expected");
				auto it1 = s1.cbegin();
				auto it2 = s2.cbegin();
				while (it1 != s1.cend() && it2 != s2.cend()) {
					if (*it1++ != *it2++)
						throw token_verification_exception("claim " + key + " does not match expected");
				}
			}
			else if (c.get_type() == claim::type::string) {
				if (c.as_string() != jc.as_string())
					throw token_verification_exception("claim " + key + " does not match expected");
			}
			else throw token_verification_exception("internal error");
		};

		auto time = clock.now();

		if (jwt.has_expires_at()) {
			// A registered "exp" claim holds the per-claim leeway (see expires_at_leeway).
			auto leeway = claims.count("exp") == 1 ? std::chrono::system_clock::to_time_t(claims.at("exp").as_date()) : default_leeway;
			auto exp = jwt.get_expires_at();
			if (time > exp + std::chrono::seconds(leeway))
				throw token_verification_exception("token expired");
		}
		if (jwt.has_issued_at()) {
			auto leeway = claims.count("iat") == 1 ? std::chrono::system_clock::to_time_t(claims.at("iat").as_date()) : default_leeway;
			auto iat = jwt.get_issued_at();
			// NOTE(review): message says "expired" although this rejects tokens issued in the future.
			if (time < iat - std::chrono::seconds(leeway))
				throw token_verification_exception("token expired");
		}
		if (jwt.has_not_before()) {
			auto leeway = claims.count("nbf") == 1 ? std::chrono::system_clock::to_time_t(claims.at("nbf").as_date()) : default_leeway;
			auto nbf = jwt.get_not_before();
			// NOTE(review): message says "expired" although this rejects not-yet-valid tokens.
			if (time < nbf - std::chrono::seconds(leeway))
				throw token_verification_exception("token expired");
		}
		for (auto& c : claims)
		{
			if (c.first == "exp" || c.first == "iat" || c.first == "nbf") {
				// Nothing to do here, already checked
			}
			else if (c.first == "aud") {
				// Every required audience must appear in the token's audience set.
				if (!jwt.has_audience())
					throw token_verification_exception("token doesn't contain the required audience");
				auto aud = jwt.get_audience();
				auto expected = c.second.as_set();
				for (auto& e : expected)
					if (aud.count(e) == 0)
						throw token_verification_exception("token doesn't contain the required audience");
			}
			else {
				assert_claim_eq(jwt, c.first, c.second);
			}
		}
	}
};
+
+ /**
+ * Create a verifier using the given clock
+ * \param c Clock instance to use
+ * \return verifier instance
+ */
+ template<typename Clock>
+ verifier<Clock> verify(Clock c) {
+ return verifier<Clock>(c);
+ }
+
/**
 * Default clock, backed by std::chrono::system_clock.
 */
struct default_clock {
	/// \return the current wall-clock time
	std::chrono::system_clock::time_point now() const {
		return std::chrono::system_clock::now();
	}
};
+
+ /**
+ * Create a verifier using the default clock
+ * \return verifier instance
+ */
+ inline
+ verifier<default_clock> verify() {
+ return verify<default_clock>({});
+ }
+
+ /**
+ * Return a builder instance to create a new token
+ */
+ inline
+ builder create() {
+ return builder();
+ }
+
+ /**
+ * Decode a token
+ * \param token Token to decode
+ * \return Decoded token
+ * \throws std::invalid_argument Token is not in correct format
+ * \throws std::runtime_error Base64 decoding failed or invalid json
+ */
+ inline
+ decoded_jwt decode(const std::string& token) {
+ return decoded_jwt(token);
+ }
+}
diff --git a/src/rgw/librgw.cc b/src/rgw/librgw.cc
new file mode 100644
index 000000000..bf6fc50d7
--- /dev/null
+++ b/src/rgw/librgw.cc
@@ -0,0 +1,89 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <sys/types.h>
+#include <string.h>
+#include <chrono>
+
+#include "include/rados/librgw.h"
+
+#include "include/str_list.h"
+#include "common/ceph_argparse.h"
+#include "common/ceph_context.h"
+#include "common/dout.h"
+
+#include "rgw_lib.h"
+
+#include <errno.h>
+#include <thread>
+#include <string>
+#include <mutex>
+
+#define dout_subsys ceph_subsys_rgw
+
+namespace rgw {
+
+bool global_stop = false;
+static std::mutex librgw_mtx;
+static RGWLib rgwlib;
+
+} // namespace rgw
+
+extern "C" {
+
+/* Create/attach a librgw handle.
+ *
+ * On the first call this splits the last argv element on whitespace,
+ * appends the pieces to the argument vector, and initializes the
+ * process-wide RGWLib instance (which creates g_ceph_context).
+ * Subsequent calls reuse the existing CephContext and just take an
+ * additional reference.
+ *
+ * rgw       [out] opaque handle (a referenced CephContext) on success
+ * argc/argv process-style arguments forwarded to RGWLib::init
+ * returns 0 on success, negative error code on failure
+ */
+int librgw_create(librgw_t* rgw, int argc, char **argv)
+{
+  using namespace rgw;
+
+  int rc = 0;
+
+  g_rgwlib = &rgwlib;
+
+  if (! g_ceph_context) {
+    // double-checked locking: only one thread performs initialization
+    std::lock_guard<std::mutex> lg(librgw_mtx);
+    if (! g_ceph_context) {
+      std::vector<std::string> spl_args;
+      // last non-0 argument will be split and consumed
+      if (argc > 1) {
+        const std::string spl_arg{argv[(--argc)]};
+        get_str_vec(spl_arg, " \t", spl_args);
+      }
+      auto args = argv_to_vec(argc, argv);
+      // append split args, if any (spl_args outlives the init call)
+      for (const auto& elt : spl_args) {
+        args.push_back(elt.c_str());
+      }
+      rc = rgwlib.init(args);
+    }
+  }
+
+  /* Only hand back a handle when a live context actually exists.
+   * Previously a second successful call returned the stale -EINVAL,
+   * and a failed init dereferenced a null g_ceph_context. */
+  if (rc == 0 && g_ceph_context) {
+    *rgw = g_ceph_context->get();
+    return 0;
+  }
+  return (rc != 0) ? rc : -EINVAL;
+}
+
+/* Release a handle obtained from librgw_create.
+ *
+ * Stops the RGWLib instance, then drops the CephContext reference
+ * taken in librgw_create; the context is destroyed when the last
+ * reference goes away.
+ */
+void librgw_shutdown(librgw_t rgw)
+{
+  using namespace rgw;
+
+  // the opaque handle is the referenced CephContext
+  CephContext* cct = static_cast<CephContext*>(rgw);
+  rgwlib.stop();
+
+  dout(1) << "final shutdown" << dendl;
+
+  cct->put();
+}
+
+} /* extern "C" */
diff --git a/src/rgw/picojson/picojson.h b/src/rgw/picojson/picojson.h
new file mode 100644
index 000000000..ceaeb5ba8
--- /dev/null
+++ b/src/rgw/picojson/picojson.h
@@ -0,0 +1,1177 @@
+/*
+ * Copyright 2009-2010 Cybozu Labs, Inc.
+ * Copyright 2011-2014 Kazuho Oku
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef picojson_h
+#define picojson_h
+
+#include <algorithm>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <cstddef>
+#include <iostream>
+#include <iterator>
+#include <limits>
+#include <map>
+#include <stdexcept>
+#include <string>
+#include <vector>
+#include <utility>
+
+// for isnan/isinf
+#if __cplusplus >= 201103L
+#include <cmath>
+#else
+extern "C" {
+#ifdef _MSC_VER
+#include <float.h>
+#elif defined(__INTEL_COMPILER)
+#include <mathimf.h>
+#else
+#include <math.h>
+#endif
+}
+#endif
+
+#ifndef PICOJSON_USE_RVALUE_REFERENCE
+#if (defined(__cpp_rvalue_references) && __cpp_rvalue_references >= 200610) || (defined(_MSC_VER) && _MSC_VER >= 1600)
+#define PICOJSON_USE_RVALUE_REFERENCE 1
+#else
+#define PICOJSON_USE_RVALUE_REFERENCE 0
+#endif
+#endif // PICOJSON_USE_RVALUE_REFERENCE
+
+#ifndef PICOJSON_NOEXCEPT
+#if PICOJSON_USE_RVALUE_REFERENCE
+#define PICOJSON_NOEXCEPT noexcept
+#else
+#define PICOJSON_NOEXCEPT throw()
+#endif
+#endif
+
+// experimental support for int64_t (see README.mkdn for detail)
+#ifdef PICOJSON_USE_INT64
+//#define __STDC_FORMAT_MACROS
+#include <cerrno>
+#if __cplusplus >= 201103L
+#include <cinttypes>
+#else
+extern "C" {
+#include <inttypes.h>
+}
+#endif
+#endif
+
+// to disable the use of localeconv(3), set PICOJSON_USE_LOCALE to 0
+#ifndef PICOJSON_USE_LOCALE
+#define PICOJSON_USE_LOCALE 1
+#endif
+#if PICOJSON_USE_LOCALE
+extern "C" {
+#include <locale.h>
+}
+#endif
+
+#ifndef PICOJSON_ASSERT
+#define PICOJSON_ASSERT(e) \
+ do { \
+ if (!(e)) \
+ throw std::runtime_error(#e); \
+ } while (0)
+#endif
+
+#ifdef _MSC_VER
+#define SNPRINTF _snprintf_s
+#pragma warning(push)
+#pragma warning(disable : 4244) // conversion from int to char
+#pragma warning(disable : 4127) // conditional expression is constant
+#pragma warning(disable : 4702) // unreachable code
+#else
+#define SNPRINTF snprintf
+#endif
+
+namespace picojson {
+
+enum {
+ null_type,
+ boolean_type,
+ number_type,
+ string_type,
+ array_type,
+ object_type
+#ifdef PICOJSON_USE_INT64
+ ,
+ int64_type
+#endif
+};
+
+enum { INDENT_WIDTH = 2 };
+
+struct null {};
+
+// Variant-style JSON value: holds one of null/bool/number/string/
+// array/object (plus int64 when PICOJSON_USE_INT64 is defined).
+// Heap payloads (string/array/object) are owned through raw pointers
+// in the union and released by clear()/the destructor.
+class value {
+public:
+  typedef std::vector<value> array;
+  typedef std::map<std::string, value> object;
+  // Tagged-union storage; the active member is selected by type_.
+  union _storage {
+    bool boolean_;
+    double number_;
+#ifdef PICOJSON_USE_INT64
+    int64_t int64_;
+#endif
+    std::string *string_;
+    array *array_;
+    object *object_;
+  };
+
+protected:
+  int type_;   // one of the anonymous enum tags (null_type, ...)
+  _storage u_; // payload; interpretation depends on type_
+
+public:
+  value();
+  // Tag-only constructor: builds a default payload of the given type.
+  // The unused bool parameter disambiguates from the numeric ctors.
+  value(int type, bool);
+  explicit value(bool b);
+#ifdef PICOJSON_USE_INT64
+  explicit value(int64_t i);
+#endif
+  explicit value(double n); // throws std::overflow_error on NaN/Inf
+  explicit value(const std::string &s);
+  explicit value(const array &a);
+  explicit value(const object &o);
+#if PICOJSON_USE_RVALUE_REFERENCE
+  explicit value(std::string &&s);
+  explicit value(array &&a);
+  explicit value(object &&o);
+#endif
+  explicit value(const char *s);
+  value(const char *s, size_t len);
+  ~value();
+  value(const value &x);
+  value &operator=(const value &x);
+#if PICOJSON_USE_RVALUE_REFERENCE
+  value(value &&x) PICOJSON_NOEXCEPT;
+  value &operator=(value &&x) PICOJSON_NOEXCEPT;
+#endif
+  void swap(value &x) PICOJSON_NOEXCEPT;
+  // Type query / typed access; get<T>() asserts the stored type matches.
+  template <typename T> bool is() const;
+  template <typename T> const T &get() const;
+  template <typename T> T &get();
+  template <typename T> void set(const T &);
+#if PICOJSON_USE_RVALUE_REFERENCE
+  template <typename T> void set(T &&);
+#endif
+  bool evaluate_as_boolean() const;
+  // Element access; out-of-range/missing keys yield a shared static null.
+  const value &get(const size_t idx) const;
+  const value &get(const std::string &key) const;
+  value &get(const size_t idx);
+  value &get(const std::string &key);
+
+  bool contains(const size_t idx) const;
+  bool contains(const std::string &key) const;
+  std::string to_str() const;
+  template <typename Iter> void serialize(Iter os, bool prettify = false) const;
+  std::string serialize(bool prettify = false) const;
+
+private:
+  template <typename T> value(const T *); // intentionally defined to block implicit conversion of pointer to bool
+  template <typename Iter> static void _indent(Iter os, int indent);
+  template <typename Iter> void _serialize(Iter os, int indent) const;
+  std::string _serialize(int indent) const;
+  void clear();
+};
+
+typedef value::array array;
+typedef value::object object;
+
+inline value::value() : type_(null_type), u_() {
+}
+
+inline value::value(int type, bool) : type_(type), u_() {
+ switch (type) {
+#define INIT(p, v) \
+ case p##type: \
+ u_.p = v; \
+ break
+ INIT(boolean_, false);
+ INIT(number_, 0.0);
+#ifdef PICOJSON_USE_INT64
+ INIT(int64_, 0);
+#endif
+ INIT(string_, new std::string());
+ INIT(array_, new array());
+ INIT(object_, new object());
+#undef INIT
+ default:
+ break;
+ }
+}
+
+inline value::value(bool b) : type_(boolean_type), u_() {
+ u_.boolean_ = b;
+}
+
+#ifdef PICOJSON_USE_INT64
+inline value::value(int64_t i) : type_(int64_type), u_() {
+ u_.int64_ = i;
+}
+#endif
+
+inline value::value(double n) : type_(number_type), u_() {
+ if (
+#ifdef _MSC_VER
+ !_finite(n)
+#elif __cplusplus >= 201103L
+ std::isnan(n) || std::isinf(n)
+#else
+ isnan(n) || isinf(n)
+#endif
+ ) {
+ throw std::overflow_error("");
+ }
+ u_.number_ = n;
+}
+
+inline value::value(const std::string &s) : type_(string_type), u_() {
+ u_.string_ = new std::string(s);
+}
+
+inline value::value(const array &a) : type_(array_type), u_() {
+ u_.array_ = new array(a);
+}
+
+inline value::value(const object &o) : type_(object_type), u_() {
+ u_.object_ = new object(o);
+}
+
+#if PICOJSON_USE_RVALUE_REFERENCE
+inline value::value(std::string &&s) : type_(string_type), u_() {
+ u_.string_ = new std::string(std::move(s));
+}
+
+inline value::value(array &&a) : type_(array_type), u_() {
+ u_.array_ = new array(std::move(a));
+}
+
+inline value::value(object &&o) : type_(object_type), u_() {
+ u_.object_ = new object(std::move(o));
+}
+#endif
+
+inline value::value(const char *s) : type_(string_type), u_() {
+ u_.string_ = new std::string(s);
+}
+
+inline value::value(const char *s, size_t len) : type_(string_type), u_() {
+ u_.string_ = new std::string(s, len);
+}
+
+// Release any heap-allocated payload (string/array/object).  Scalar
+// tags live directly in the union and need no cleanup.  Note that
+// type_ is left unchanged; callers reset it themselves.
+inline void value::clear() {
+  switch (type_) {
+#define DEINIT(p) \
+  case p##type: \
+    delete u_.p; \
+    break
+    DEINIT(string_);
+    DEINIT(array_);
+    DEINIT(object_);
+#undef DEINIT
+  default:
+    break;
+  }
+}
+
+// Destructor: delegates to clear() to free any owned heap storage.
+inline value::~value() {
+  clear();
+}
+
+inline value::value(const value &x) : type_(x.type_), u_() {
+ switch (type_) {
+#define INIT(p, v) \
+ case p##type: \
+ u_.p = v; \
+ break
+ INIT(string_, new std::string(*x.u_.string_));
+ INIT(array_, new array(*x.u_.array_));
+ INIT(object_, new object(*x.u_.object_));
+#undef INIT
+ default:
+ u_ = x.u_;
+ break;
+ }
+}
+
+// Copy assignment via copy-and-swap: strong exception safety and
+// correct self-assignment handling (the temporary owns the old state
+// and frees it on scope exit).
+inline value &value::operator=(const value &x) {
+  if (this != &x) {
+    value t(x);
+    swap(t);
+  }
+  return *this;
+}
+
+#if PICOJSON_USE_RVALUE_REFERENCE
+inline value::value(value &&x) PICOJSON_NOEXCEPT : type_(null_type), u_() {
+ swap(x);
+}
+inline value &value::operator=(value &&x) PICOJSON_NOEXCEPT {
+ swap(x);
+ return *this;
+}
+#endif
+inline void value::swap(value &x) PICOJSON_NOEXCEPT {
+ std::swap(type_, x.type_);
+ std::swap(u_, x.u_);
+}
+
+#define IS(ctype, jtype) \
+ template <> inline bool value::is<ctype>() const { \
+ return type_ == jtype##_type; \
+ }
+IS(null, null)
+IS(bool, boolean)
+#ifdef PICOJSON_USE_INT64
+IS(int64_t, int64)
+#endif
+IS(std::string, string)
+IS(array, array)
+IS(object, object)
+#undef IS
+template <> inline bool value::is<double>() const {
+ return type_ == number_type
+#ifdef PICOJSON_USE_INT64
+ || type_ == int64_type
+#endif
+ ;
+}
+
+#define GET(ctype, var) \
+ template <> inline const ctype &value::get<ctype>() const { \
+ PICOJSON_ASSERT("type mismatch! call is<type>() before get<type>()" && is<ctype>()); \
+ return var; \
+ } \
+ template <> inline ctype &value::get<ctype>() { \
+ PICOJSON_ASSERT("type mismatch! call is<type>() before get<type>()" && is<ctype>()); \
+ return var; \
+ }
+GET(bool, u_.boolean_)
+GET(std::string, *u_.string_)
+GET(array, *u_.array_)
+GET(object, *u_.object_)
+#ifdef PICOJSON_USE_INT64
+GET(double,
+ (type_ == int64_type && (const_cast<value *>(this)->type_ = number_type, (const_cast<value *>(this)->u_.number_ = u_.int64_)),
+ u_.number_))
+GET(int64_t, u_.int64_)
+#else
+GET(double, u_.number_)
+#endif
+#undef GET
+
+#define SET(ctype, jtype, setter) \
+ template <> inline void value::set<ctype>(const ctype &_val) { \
+ clear(); \
+ type_ = jtype##_type; \
+ setter \
+ }
+SET(bool, boolean, u_.boolean_ = _val;)
+SET(std::string, string, u_.string_ = new std::string(_val);)
+SET(array, array, u_.array_ = new array(_val);)
+SET(object, object, u_.object_ = new object(_val);)
+SET(double, number, u_.number_ = _val;)
+#ifdef PICOJSON_USE_INT64
+SET(int64_t, int64, u_.int64_ = _val;)
+#endif
+#undef SET
+
+#if PICOJSON_USE_RVALUE_REFERENCE
+#define MOVESET(ctype, jtype, setter) \
+ template <> inline void value::set<ctype>(ctype && _val) { \
+ clear(); \
+ type_ = jtype##_type; \
+ setter \
+ }
+MOVESET(std::string, string, u_.string_ = new std::string(std::move(_val));)
+MOVESET(array, array, u_.array_ = new array(std::move(_val));)
+MOVESET(object, object, u_.object_ = new object(std::move(_val));)
+#undef MOVESET
+#endif
+
+// JavaScript-style truthiness: null -> false, bool -> itself,
+// numbers -> nonzero, string -> non-empty, array/object -> always true.
+inline bool value::evaluate_as_boolean() const {
+  switch (type_) {
+  case null_type:
+    return false;
+  case boolean_type:
+    return u_.boolean_;
+  case number_type:
+    return u_.number_ != 0;
+#ifdef PICOJSON_USE_INT64
+  case int64_type:
+    return u_.int64_ != 0;
+#endif
+  case string_type:
+    return !u_.string_->empty();
+  default:
+    return true; // array_type / object_type
+  }
+}
+
+inline const value &value::get(const size_t idx) const {
+ static value s_null;
+ PICOJSON_ASSERT(is<array>());
+ return idx < u_.array_->size() ? (*u_.array_)[idx] : s_null;
+}
+
+inline value &value::get(const size_t idx) {
+ static value s_null;
+ PICOJSON_ASSERT(is<array>());
+ return idx < u_.array_->size() ? (*u_.array_)[idx] : s_null;
+}
+
+inline const value &value::get(const std::string &key) const {
+ static value s_null;
+ PICOJSON_ASSERT(is<object>());
+ object::const_iterator i = u_.object_->find(key);
+ return i != u_.object_->end() ? i->second : s_null;
+}
+
+inline value &value::get(const std::string &key) {
+ static value s_null;
+ PICOJSON_ASSERT(is<object>());
+ object::iterator i = u_.object_->find(key);
+ return i != u_.object_->end() ? i->second : s_null;
+}
+
+inline bool value::contains(const size_t idx) const {
+ PICOJSON_ASSERT(is<array>());
+ return idx < u_.array_->size();
+}
+
+inline bool value::contains(const std::string &key) const {
+ PICOJSON_ASSERT(is<object>());
+ object::const_iterator i = u_.object_->find(key);
+ return i != u_.object_->end();
+}
+
+// Render a scalar as text.  Doubles that are integral and inside the
+// 53-bit exactly-representable range print without a fraction ("%.f");
+// otherwise "%.17g" preserves round-trip precision.  When
+// PICOJSON_USE_LOCALE is set, a non-"." locale decimal point is
+// normalized back to "." so the output stays valid JSON.  Arrays and
+// objects are not serialized here (use serialize()); they yield the
+// placeholder strings "array"/"object".
+inline std::string value::to_str() const {
+  switch (type_) {
+  case null_type:
+    return "null";
+  case boolean_type:
+    return u_.boolean_ ? "true" : "false";
+#ifdef PICOJSON_USE_INT64
+  case int64_type: {
+    char buf[sizeof("-9223372036854775808")];
+    SNPRINTF(buf, sizeof(buf), "%" PRId64, u_.int64_);
+    return buf;
+  }
+#endif
+  case number_type: {
+    char buf[256];
+    double tmp;
+    SNPRINTF(buf, sizeof(buf), fabs(u_.number_) < (1ULL << 53) && modf(u_.number_, &tmp) == 0 ? "%.f" : "%.17g", u_.number_);
+#if PICOJSON_USE_LOCALE
+    char *decimal_point = localeconv()->decimal_point;
+    if (strcmp(decimal_point, ".") != 0) {
+      size_t decimal_point_len = strlen(decimal_point);
+      for (char *p = buf; *p != '\0'; ++p) {
+        if (strncmp(p, decimal_point, decimal_point_len) == 0) {
+          return std::string(buf, p) + "." + (p + decimal_point_len);
+        }
+      }
+    }
+#endif
+    return buf;
+  }
+  case string_type:
+    return *u_.string_;
+  case array_type:
+    return "array";
+  case object_type:
+    return "object";
+  default:
+    PICOJSON_ASSERT(0);
+#ifdef _MSC_VER
+    __assume(0);
+#endif
+  }
+  return std::string();
+}
+
+template <typename Iter> void copy(const std::string &s, Iter oi) {
+ std::copy(s.begin(), s.end(), oi);
+}
+
+template <typename Iter> struct serialize_str_char {
+ Iter oi;
+ void operator()(char c) {
+ switch (c) {
+#define MAP(val, sym) \
+ case val: \
+ copy(sym, oi); \
+ break
+ MAP('"', "\\\"");
+ MAP('\\', "\\\\");
+ MAP('/', "\\/");
+ MAP('\b', "\\b");
+ MAP('\f', "\\f");
+ MAP('\n', "\\n");
+ MAP('\r', "\\r");
+ MAP('\t', "\\t");
+#undef MAP
+ default:
+ if (static_cast<unsigned char>(c) < 0x20 || c == 0x7f) {
+ char buf[7];
+ SNPRINTF(buf, sizeof(buf), "\\u%04x", c & 0xff);
+ copy(buf, buf + 6, oi);
+ } else {
+ *oi++ = c;
+ }
+ break;
+ }
+ }
+};
+
+template <typename Iter> void serialize_str(const std::string &s, Iter oi) {
+ *oi++ = '"';
+ serialize_str_char<Iter> process_char = {oi};
+ std::for_each(s.begin(), s.end(), process_char);
+ *oi++ = '"';
+}
+
+template <typename Iter> void value::serialize(Iter oi, bool prettify) const {
+ return _serialize(oi, prettify ? 0 : -1);
+}
+
+inline std::string value::serialize(bool prettify) const {
+ return _serialize(prettify ? 0 : -1);
+}
+
+template <typename Iter> void value::_indent(Iter oi, int indent) {
+ *oi++ = '\n';
+ for (int i = 0; i < indent * INDENT_WIDTH; ++i) {
+ *oi++ = ' ';
+ }
+}
+
+template <typename Iter> void value::_serialize(Iter oi, int indent) const {
+ switch (type_) {
+ case string_type:
+ serialize_str(*u_.string_, oi);
+ break;
+ case array_type: {
+ *oi++ = '[';
+ if (indent != -1) {
+ ++indent;
+ }
+ for (array::const_iterator i = u_.array_->begin(); i != u_.array_->end(); ++i) {
+ if (i != u_.array_->begin()) {
+ *oi++ = ',';
+ }
+ if (indent != -1) {
+ _indent(oi, indent);
+ }
+ i->_serialize(oi, indent);
+ }
+ if (indent != -1) {
+ --indent;
+ if (!u_.array_->empty()) {
+ _indent(oi, indent);
+ }
+ }
+ *oi++ = ']';
+ break;
+ }
+ case object_type: {
+ *oi++ = '{';
+ if (indent != -1) {
+ ++indent;
+ }
+ for (object::const_iterator i = u_.object_->begin(); i != u_.object_->end(); ++i) {
+ if (i != u_.object_->begin()) {
+ *oi++ = ',';
+ }
+ if (indent != -1) {
+ _indent(oi, indent);
+ }
+ serialize_str(i->first, oi);
+ *oi++ = ':';
+ if (indent != -1) {
+ *oi++ = ' ';
+ }
+ i->second._serialize(oi, indent);
+ }
+ if (indent != -1) {
+ --indent;
+ if (!u_.object_->empty()) {
+ _indent(oi, indent);
+ }
+ }
+ *oi++ = '}';
+ break;
+ }
+ default:
+ copy(to_str(), oi);
+ break;
+ }
+ if (indent == 0) {
+ *oi++ = '\n';
+ }
+}
+
+inline std::string value::_serialize(int indent) const {
+ std::string s;
+ _serialize(std::back_inserter(s), indent);
+ return s;
+}
+
+template <typename Iter> class input {
+protected:
+ Iter cur_, end_;
+ bool consumed_;
+ int line_;
+
+public:
+ input(const Iter &first, const Iter &last) : cur_(first), end_(last), consumed_(false), line_(1) {
+ }
+ int getc() {
+ if (consumed_) {
+ if (*cur_ == '\n') {
+ ++line_;
+ }
+ ++cur_;
+ }
+ if (cur_ == end_) {
+ consumed_ = false;
+ return -1;
+ }
+ consumed_ = true;
+ return *cur_ & 0xff;
+ }
+ void ungetc() {
+ consumed_ = false;
+ }
+ Iter cur() const {
+ if (consumed_) {
+ input<Iter> *self = const_cast<input<Iter> *>(this);
+ self->consumed_ = false;
+ ++self->cur_;
+ }
+ return cur_;
+ }
+ int line() const {
+ return line_;
+ }
+ void skip_ws() {
+ while (1) {
+ int ch = getc();
+ if (!(ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r')) {
+ ungetc();
+ break;
+ }
+ }
+ }
+ bool picojson_expect(const int expected) {
+ skip_ws();
+ if (getc() != expected) {
+ ungetc();
+ return false;
+ }
+ return true;
+ }
+ bool match(const std::string &pattern) {
+ for (std::string::const_iterator pi(pattern.begin()); pi != pattern.end(); ++pi) {
+ if (getc() != *pi) {
+ ungetc();
+ return false;
+ }
+ }
+ return true;
+ }
+};
+
+// Read exactly four hex digits (the XXXX of a \uXXXX escape) and
+// return their value, or -1 on EOF / non-hex input (the offending
+// character is pushed back via ungetc).
+template <typename Iter> inline int _parse_quadhex(input<Iter> &in) {
+  int uni_ch = 0, hex;
+  for (int i = 0; i < 4; i++) {
+    if ((hex = in.getc()) == -1) {
+      return -1;
+    }
+    if ('0' <= hex && hex <= '9') {
+      hex -= '0';
+    } else if ('A' <= hex && hex <= 'F') {
+      hex -= 'A' - 0xa;
+    } else if ('a' <= hex && hex <= 'f') {
+      hex -= 'a' - 0xa;
+    } else {
+      in.ungetc();
+      return -1;
+    }
+    uni_ch = uni_ch * 16 + hex;
+  }
+  return uni_ch;
+}
+
+// Decode a \uXXXX escape (the leading "\u" was already consumed by
+// the caller) and append the code point to out encoded as UTF-8.
+// Handles UTF-16 surrogate pairs: a high surrogate must be followed
+// by another "\uXXXX" carrying a low surrogate, and a lone low
+// surrogate is rejected.
+template <typename String, typename Iter> inline bool _parse_codepoint(String &out, input<Iter> &in) {
+  int uni_ch;
+  if ((uni_ch = _parse_quadhex(in)) == -1) {
+    return false;
+  }
+  if (0xd800 <= uni_ch && uni_ch <= 0xdfff) {
+    if (0xdc00 <= uni_ch) {
+      // a second 16-bit of a surrogate pair appeared
+      return false;
+    }
+    // first 16-bit of surrogate pair, get the next one
+    if (in.getc() != '\\' || in.getc() != 'u') {
+      in.ungetc();
+      return false;
+    }
+    int second = _parse_quadhex(in);
+    if (!(0xdc00 <= second && second <= 0xdfff)) {
+      return false;
+    }
+    // combine the pair into a code point in [0x10000, 0x10ffff]
+    uni_ch = ((uni_ch - 0xd800) << 10) | ((second - 0xdc00) & 0x3ff);
+    uni_ch += 0x10000;
+  }
+  // UTF-8 encode: 1-4 bytes depending on code point magnitude
+  if (uni_ch < 0x80) {
+    out.push_back(static_cast<char>(uni_ch));
+  } else {
+    if (uni_ch < 0x800) {
+      out.push_back(static_cast<char>(0xc0 | (uni_ch >> 6)));
+    } else {
+      if (uni_ch < 0x10000) {
+        out.push_back(static_cast<char>(0xe0 | (uni_ch >> 12)));
+      } else {
+        out.push_back(static_cast<char>(0xf0 | (uni_ch >> 18)));
+        out.push_back(static_cast<char>(0x80 | ((uni_ch >> 12) & 0x3f)));
+      }
+      out.push_back(static_cast<char>(0x80 | ((uni_ch >> 6) & 0x3f)));
+    }
+    out.push_back(static_cast<char>(0x80 | (uni_ch & 0x3f)));
+  }
+  return true;
+}
+
+template <typename String, typename Iter> inline bool _parse_string(String &out, input<Iter> &in) {
+ while (1) {
+ int ch = in.getc();
+ if (ch < ' ') {
+ in.ungetc();
+ return false;
+ } else if (ch == '"') {
+ return true;
+ } else if (ch == '\\') {
+ if ((ch = in.getc()) == -1) {
+ return false;
+ }
+ switch (ch) {
+#define MAP(sym, val) \
+ case sym: \
+ out.push_back(val); \
+ break
+ MAP('"', '\"');
+ MAP('\\', '\\');
+ MAP('/', '/');
+ MAP('b', '\b');
+ MAP('f', '\f');
+ MAP('n', '\n');
+ MAP('r', '\r');
+ MAP('t', '\t');
+#undef MAP
+ case 'u':
+ if (!_parse_codepoint(out, in)) {
+ return false;
+ }
+ break;
+ default:
+ return false;
+ }
+ } else {
+ out.push_back(static_cast<char>(ch));
+ }
+ }
+ return false;
+}
+
+template <typename Context, typename Iter> inline bool _parse_array(Context &ctx, input<Iter> &in) {
+ if (!ctx.parse_array_start()) {
+ return false;
+ }
+ size_t idx = 0;
+ if (in.picojson_expect(']')) {
+ return ctx.parse_array_stop(idx);
+ }
+ do {
+ if (!ctx.parse_array_item(in, idx)) {
+ return false;
+ }
+ idx++;
+ } while (in.picojson_expect(','));
+ return in.picojson_expect(']') && ctx.parse_array_stop(idx);
+}
+
+template <typename Context, typename Iter> inline bool _parse_object(Context &ctx, input<Iter> &in) {
+ if (!ctx.parse_object_start()) {
+ return false;
+ }
+ if (in.picojson_expect('}')) {
+ return true;
+ }
+ do {
+ std::string key;
+ if (!in.picojson_expect('"') || !_parse_string(key, in) || !in.picojson_expect(':')) {
+ return false;
+ }
+ if (!ctx.parse_object_item(in, key)) {
+ return false;
+ }
+ } while (in.picojson_expect(','));
+ return in.picojson_expect('}');
+}
+
+// Collect the longest run of number-ish characters (digits, signs,
+// '.', 'e'/'E') into a string; numeric validation is left to the
+// caller (strtoimax/strtod in _parse).  With PICOJSON_USE_LOCALE,
+// '.' is replaced by the current locale's decimal point so strtod
+// parses it correctly under a non-C locale.
+template <typename Iter> inline std::string _parse_number(input<Iter> &in) {
+  std::string num_str;
+  while (1) {
+    int ch = in.getc();
+    if (('0' <= ch && ch <= '9') || ch == '+' || ch == '-' || ch == 'e' || ch == 'E') {
+      num_str.push_back(static_cast<char>(ch));
+    } else if (ch == '.') {
+#if PICOJSON_USE_LOCALE
+      num_str += localeconv()->decimal_point;
+#else
+      num_str.push_back('.');
+#endif
+    } else {
+      in.ungetc();
+      break;
+    }
+  }
+  return num_str;
+}
+
+template <typename Context, typename Iter> inline bool _parse(Context &ctx, input<Iter> &in) {
+ in.skip_ws();
+ int ch = in.getc();
+ switch (ch) {
+#define IS(ch, text, op) \
+ case ch: \
+ if (in.match(text) && op) { \
+ return true; \
+ } else { \
+ return false; \
+ }
+ IS('n', "ull", ctx.set_null());
+ IS('f', "alse", ctx.set_bool(false));
+ IS('t', "rue", ctx.set_bool(true));
+#undef IS
+ case '"':
+ return ctx.parse_string(in);
+ case '[':
+ return _parse_array(ctx, in);
+ case '{':
+ return _parse_object(ctx, in);
+ default:
+ if (('0' <= ch && ch <= '9') || ch == '-') {
+ double f;
+ char *endp;
+ in.ungetc();
+ std::string num_str(_parse_number(in));
+ if (num_str.empty()) {
+ return false;
+ }
+#ifdef PICOJSON_USE_INT64
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wtautological-type-limit-compare"
+ {
+ errno = 0;
+ intmax_t ival = strtoimax(num_str.c_str(), &endp, 10);
+ if (errno == 0 && std::numeric_limits<int64_t>::min() <= ival && ival <= std::numeric_limits<int64_t>::max() &&
+ endp == num_str.c_str() + num_str.size()) {
+ ctx.set_int64(ival);
+ return true;
+ }
+ }
+#pragma clang diagnostic pop
+#endif
+ f = strtod(num_str.c_str(), &endp);
+ if (endp == num_str.c_str() + num_str.size()) {
+ ctx.set_number(f);
+ return true;
+ }
+ return false;
+ }
+ break;
+ }
+ in.ungetc();
+ return false;
+}
+
+class deny_parse_context {
+public:
+ bool set_null() {
+ return false;
+ }
+ bool set_bool(bool) {
+ return false;
+ }
+#ifdef PICOJSON_USE_INT64
+ bool set_int64(int64_t) {
+ return false;
+ }
+#endif
+ bool set_number(double) {
+ return false;
+ }
+ template <typename Iter> bool parse_string(input<Iter> &) {
+ return false;
+ }
+ bool parse_array_start() {
+ return false;
+ }
+ template <typename Iter> bool parse_array_item(input<Iter> &, size_t) {
+ return false;
+ }
+ bool parse_array_stop(size_t) {
+ return false;
+ }
+ bool parse_object_start() {
+ return false;
+ }
+ template <typename Iter> bool parse_object_item(input<Iter> &, const std::string &) {
+ return false;
+ }
+};
+
+class default_parse_context {
+protected:
+ value *out_;
+
+public:
+ default_parse_context(value *out) : out_(out) {
+ }
+ bool set_null() {
+ *out_ = value();
+ return true;
+ }
+ bool set_bool(bool b) {
+ *out_ = value(b);
+ return true;
+ }
+#ifdef PICOJSON_USE_INT64
+ bool set_int64(int64_t i) {
+ *out_ = value(i);
+ return true;
+ }
+#endif
+ bool set_number(double f) {
+ *out_ = value(f);
+ return true;
+ }
+ template <typename Iter> bool parse_string(input<Iter> &in) {
+ *out_ = value(string_type, false);
+ return _parse_string(out_->get<std::string>(), in);
+ }
+ bool parse_array_start() {
+ *out_ = value(array_type, false);
+ return true;
+ }
+ template <typename Iter> bool parse_array_item(input<Iter> &in, size_t) {
+ array &a = out_->get<array>();
+ a.push_back(value());
+ default_parse_context ctx(&a.back());
+ return _parse(ctx, in);
+ }
+ bool parse_array_stop(size_t) {
+ return true;
+ }
+ bool parse_object_start() {
+ *out_ = value(object_type, false);
+ return true;
+ }
+ template <typename Iter> bool parse_object_item(input<Iter> &in, const std::string &key) {
+ object &o = out_->get<object>();
+ default_parse_context ctx(&o[key]);
+ return _parse(ctx, in);
+ }
+
+private:
+ default_parse_context(const default_parse_context &);
+ default_parse_context &operator=(const default_parse_context &);
+};
+
+class null_parse_context {
+public:
+ struct dummy_str {
+ void push_back(int) {
+ }
+ };
+
+public:
+ null_parse_context() {
+ }
+ bool set_null() {
+ return true;
+ }
+ bool set_bool(bool) {
+ return true;
+ }
+#ifdef PICOJSON_USE_INT64
+ bool set_int64(int64_t) {
+ return true;
+ }
+#endif
+ bool set_number(double) {
+ return true;
+ }
+ template <typename Iter> bool parse_string(input<Iter> &in) {
+ dummy_str s;
+ return _parse_string(s, in);
+ }
+ bool parse_array_start() {
+ return true;
+ }
+ template <typename Iter> bool parse_array_item(input<Iter> &in, size_t) {
+ return _parse(*this, in);
+ }
+ bool parse_array_stop(size_t) {
+ return true;
+ }
+ bool parse_object_start() {
+ return true;
+ }
+ template <typename Iter> bool parse_object_item(input<Iter> &in, const std::string &) {
+ return _parse(*this, in);
+ }
+
+private:
+ null_parse_context(const null_parse_context &);
+ null_parse_context &operator=(const null_parse_context &);
+};
+
+// obsolete, use the version below
+template <typename Iter> inline std::string parse(value &out, Iter &pos, const Iter &last) {
+ std::string err;
+ pos = parse(out, pos, last, &err);
+ return err;
+}
+
+template <typename Context, typename Iter> inline Iter _parse(Context &ctx, const Iter &first, const Iter &last, std::string *err) {
+ input<Iter> in(first, last);
+ if (!_parse(ctx, in) && err != NULL) {
+ char buf[64];
+ SNPRINTF(buf, sizeof(buf), "syntax error at line %d near: ", in.line());
+ *err = buf;
+ while (1) {
+ int ch = in.getc();
+ if (ch == -1 || ch == '\n') {
+ break;
+ } else if (ch >= ' ') {
+ err->push_back(static_cast<char>(ch));
+ }
+ }
+ }
+ return in.cur();
+}
+
+template <typename Iter> inline Iter parse(value &out, const Iter &first, const Iter &last, std::string *err) {
+ default_parse_context ctx(&out);
+ return _parse(ctx, first, last, err);
+}
+
+inline std::string parse(value &out, const std::string &s) {
+ std::string err;
+ parse(out, s.begin(), s.end(), &err);
+ return err;
+}
+
+inline std::string parse(value &out, std::istream &is) {
+ std::string err;
+ parse(out, std::istreambuf_iterator<char>(is.rdbuf()), std::istreambuf_iterator<char>(), &err);
+ return err;
+}
+
+template <typename T> struct last_error_t { static std::string s; };
+template <typename T> std::string last_error_t<T>::s;
+
+inline void set_last_error(const std::string &s) {
+ last_error_t<bool>::s = s;
+}
+
+inline const std::string &get_last_error() {
+ return last_error_t<bool>::s;
+}
+
+// Deep structural equality.  null compares equal only to null; other
+// values must hold the same type with an equal payload (arrays and
+// objects recurse element-wise via their containers' operator==).
+// Note: is<double>() is also true for int64 values, so with
+// PICOJSON_USE_INT64 enabled int64/double compare numerically.
+inline bool operator==(const value &x, const value &y) {
+  if (x.is<null>())
+    return y.is<null>();
+#define PICOJSON_CMP(type) \
+  if (x.is<type>()) \
+    return y.is<type>() && x.get<type>() == y.get<type>()
+  PICOJSON_CMP(bool);
+  PICOJSON_CMP(double);
+  PICOJSON_CMP(std::string);
+  PICOJSON_CMP(array);
+  PICOJSON_CMP(object);
+#undef PICOJSON_CMP
+  PICOJSON_ASSERT(0);
+#ifdef _MSC_VER
+  __assume(0);
+#endif
+  return false;
+}
+
+inline bool operator!=(const value &x, const value &y) {
+ return !(x == y);
+}
+}
+
+#if !PICOJSON_USE_RVALUE_REFERENCE
+namespace std {
+template <> inline void swap(picojson::value &x, picojson::value &y) {
+ x.swap(y);
+}
+}
+#endif
+
+inline std::istream &operator>>(std::istream &is, picojson::value &x) {
+ picojson::set_last_error(std::string());
+ const std::string err(picojson::parse(x, is));
+ if (!err.empty()) {
+ picojson::set_last_error(err);
+ is.setstate(std::ios::failbit);
+ }
+ return is;
+}
+
+inline std::ostream &operator<<(std::ostream &os, const picojson::value &x) {
+ x.serialize(std::ostream_iterator<char>(os));
+ return os;
+}
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+#endif
diff --git a/src/rgw/rgw-gap-list b/src/rgw/rgw-gap-list
new file mode 100755
index 000000000..5018cedd7
--- /dev/null
+++ b/src/rgw/rgw-gap-list
@@ -0,0 +1,456 @@
+#!/usr/bin/env bash
+
+# Last revision 2023-01-13
+
+# NOTE: This script is based on rgw-orphan-list but doing the
+# reverse calculation.
+
+# NOTE: The awk included in this script replaces the 'ceph-diff-sorted'
+# utility but duplicates its functionality. This was done to minimize
+# the number of times the massive data set must be iterated to complete
+# the task.
+
+# IMPORTANT: Affects order produced by 'sort'.
+export LC_ALL=C
+
+trap "exit 1" TERM
+TOP_PID=$$
+
+out_dir="$PWD"
+timestamp=$(date -u +%Y%m%d%H%M)
+lspools_err="${out_dir}/lspools-${timestamp}.error"
+rados_out="${out_dir}/rados-${timestamp}.intermediate"
+rados_err="${out_dir}/rados-${timestamp}.error"
+rgwadmin_out="${out_dir}/radosgw-admin-${timestamp}.intermediate"
+rgwadmin_err="${out_dir}/radosgw-admin-${timestamp}.error"
+gap_out="${out_dir}/gap-list-${timestamp}.gap"
+
+
+# field separator
+# contains ascii 0xFE, designed to be a character that won't appear
+# in normal output, can only be a single character due to use in the
+# sort command
+fs=$(echo -e "\xFE")
+
# Emit a timestamped, host-tagged status line on stdout.
log() {
  local stamp host
  stamp=$(date '+%F %T')
  host=$(hostname -s)
  echo "$stamp" "$host" "$1"
}
+
+#
+# checkReturn RETURNCODE MESSAGE TERMINATE
+# RETURNCODE - ( usually $? ) of previous command
+# MESSAGE - Message to print on non-zero return code
+# TERMINATE - non-empty == terminate the script on non-zero return code
+#
checkReturn() {
  local rc="$1" msg="$2" term="$3"

  # Fast path: nothing to report on success.
  if [ "$rc" -eq 0 ]; then
    return 0
  fi

  local error_addon=""
  if [ -n "$term" ]; then
    error_addon="; Terminating"
  fi
  log "ERROR: ${msg} failed: returned ${rc}${error_addon}"

  # When termination is requested, warn loudly on stderr and signal the
  # top-level shell (whose TERM trap exits), which also aborts callers
  # running in background jobs.
  if [ -n "$term" ]; then
    >&2 echo
    >&2 echo '***'
    >&2 echo '*** WARNING: The results are incomplete. Do not use! ***'
    >&2 echo '***'
    kill -s TERM $TOP_PID
  fi
}
+
# Interactively ask the operator for the data pool name(s) to search.
# Everything printed here goes to stderr so that stdout carries only the
# operator's answer, allowing the caller to capture it with
# pool="$(prompt_pool)".
prompt_pool() {
  # note: all prompts go to stderr so stdout contains just the result
  rados lspools >"$temp_file" 2>"$lspools_err"
  checkReturn $? "Listing pools" 1

  >&2 echo ""
  >&2 echo "Available pools:"
  >&2 sed 's/^/ /' "$temp_file" # list pools and indent
  >&2 echo ""
  >&2 echo "Which Rados Gateway Data pool do you want to search for gaps? "
  >&2 echo ""
  >&2 echo "NOTE: If your installation has multiple bucket data pools using "
  >&2 echo " bucket placement policies, please enter a space separated "
  >&2 echo " list of bucket data pools to enumerate."
  >&2 echo ""
  local mypool
  read mypool
  # deliberately unquoted: a multi-pool answer is re-emitted as a single
  # space-separated line
  echo $mypool
}
+
# Produce a sorted, de-duplicated listing of every RADOS object the bucket
# indexes expect to exist, leaving the result in $rgwadmin_out.  On
# failure, $rgwadmin_flag is created so the parent process can detect the
# error even when this function runs as a background job.
radosgw_radoslist() {
  log "Running 'radosgw-admin bucket radoslist'."
  rm -f "$rgwadmin_flag" &> /dev/null
  # $fs (0xFE) separates the fields of each record; field 1 is the rados
  # object name the awk comparison below keys on
  radosgw-admin bucket radoslist --rgw-obj-fs="$fs" >"$rgwadmin_out" 2>"$rgwadmin_err"
  RETVAL=$?
  if [ "$RETVAL" -ne 0 ] ;then
    touch "$rgwadmin_flag"
  fi
  checkReturn $RETVAL "radosgw-admin radoslist" 1
  log "Completed 'radosgw-admin bucket radoslist'."

  log "Sorting 'radosgw-admin bucket radoslist' output."
  # sort only on the first FS-delimited field (the rados object name);
  # -u keeps one record per object, -T keeps scratch files in temp dir
  sort -T ${temp_prefix} --field-separator="$fs" -k1,1 -u "$rgwadmin_out" > "$rgwadmin_temp"
  checkReturn $? "Sorting 'radosgw-admin bucket radoslist' output" 1
  log "Completed sorting 'radosgw-admin bucket radoslist'."

  log "Moving 'radosgw-admin bucket radoslist' output."
  mv -f "$rgwadmin_temp" "$rgwadmin_out"
  checkReturn $? "Moving 'radosgw-admin bucket radoslist' output" 1
  log "Completed moving 'radosgw-admin bucket radoslist' output."
}
+
# Produce a sorted, de-duplicated listing of the objects actually present
# in each configured data pool, accumulated into $rados_out.  On failure,
# $rados_flag is created so the parent can detect the error even when this
# function runs as a background job.
rados_ls() {
  log "Starting 'rados ls' function."
  rm -f "$rados_flag" &> /dev/null
  rm -f "$rados_out" &> /dev/null
  local mypool
  # $pool may hold several space-separated pool names
  for mypool in $pool; do
    log "Running 'rados ls' on pool ${mypool}."
    # append (>>): listings from all pools accumulate into one file
    # NOTE(review): stderr is truncated (2>) each iteration, so only the
    # last pool's errors survive in $rados_err -- confirm this is intended
    rados ls --pool="$mypool" >>"$rados_out" 2>"$rados_err"
    RETVAL=$?
    if [ "$RETVAL" -ne 0 ] ;then
      touch "$rados_flag"
    fi
    checkReturn $RETVAL "'rados ls' on pool ${mypool}" 1
    log "Completed 'rados ls' on pool ${mypool}."
  done
  # only sort when no listing failed (flag file absent)
  if [ ! -e "$rados_flag" ]; then
    log "Sorting 'rados ls' output(s)."
    # -T: keep sort's scratch files in the operator-chosen temp dir
    sort -T ${temp_prefix} -u "$rados_out" >"$rados_temp"
    checkReturn $? "Sorting 'rados ls' output(s)" 1

    log "Moving sorted output(s)."
    mv -f "$rados_temp" "$rados_out"
    checkReturn $? "Moving temp file to output file" 1
    log "Sorting 'rados ls' output(s) complete."
  fi
}
+
+usage() {
+ >&2 cat << EOF
+
+WARNING WARNING WARNING WARNING WARNING WARNING WARNING
+WARNING:
+WARNING: Command option format has changed. Please check closely.
+WARNING:
+WARNING WARNING WARNING WARNING WARNING WARNING WARNING
+
+Usage: $0 [-m] [-p <pool>] [-t <temp_dir>]
+
+Where:
+ -m Optionally, run the two listings in multiple threads.
+ --See NOTE below--
+
+ -p <pool> The RGW bucket data pool name, if omitted, pool name
+ will be prompted for during execution.
+ Multiple pools can be supplied as a space separated
+ double quoted list.
+
+ -t <temp_dir> Optionally, set the directory to use for temp space.
+ This may be required if /tmp is low on space.
+
+NOTE: This tool is currently considered to be EXPERIMENTAL.
+
+NOTE: False positives are possible. False positives would likely
+ appear as objects that were never deleted and are fully
+ intact. All results should therefore be verified.
+
+NOTE: Multithread listing may increase performance but may also increase
+ the risk of false positives when the cluster is undergoing
+ modifications during the listing processes. In addition to the
+ above, false positives might also include objects that were
+ intentionally deleted.
+
+EOF
+ exit 1
+}
+
# Option defaults; see usage() for the meaning of each flag.
multithread=0
error=0
temp_prefix="/tmp"
while getopts ":mp:t:" o; do
  case "${o}" in
    m)
      multithread=1
      ;;
    p)
      pool=${OPTARG}
      ;;
    t)
      if [ -d "${OPTARG}" ]; then
        temp_prefix=${OPTARG}
      else
        echo
        echo "ERROR: Temporary directory does not exist: ${OPTARG}"
        error=1
      fi
      ;;
    *)
      echo
      # With a leading ':' in the optstring, getopts signals an unknown
      # or incomplete option by setting o to '?' or ':' and placing the
      # offending option letter in OPTARG.  Print that letter -- the old
      # message printed "${o}", which is always the literal '?'.
      echo "ERROR: Unrecognized or incomplete argument: -${OPTARG}"
      error=1
      ;;
  esac
done
shift $((OPTIND-1))
+
+temp_file=${temp_prefix}/gap-tmp.$$
+rados_temp=${temp_prefix}/rados-tmp.$$
+rgwadmin_temp=${temp_prefix}/radosgw-admin-tmp.$$
+rados_flag=${temp_prefix}/rados-flag.$$
+rgwadmin_flag=${temp_prefix}/radosgw-admin-flag.$$
+incremental_grep_awk="${temp_prefix}/ig-${$}.awk"
+
+if [ $error -gt 0 ]; then
+ usage
+fi
+
+if [ -z "$pool" ]; then
+ pool="$(prompt_pool)"
+fi
+
+error=0
+rados ${CEPH_ARGS} lspools > ${temp_file}
+checkReturn $? "rados lspools" 1
+for mypool in $pool; do
+ if [ $(grep -c "^${mypool}$" "${temp_file}") -eq 0 ]; then
+ echo
+ echo "ERROR: Supplied pool does not exist: ${mypool}"
+ error=1
+ fi
+done
+
+if [ $error -gt 0 ]; then
+ exit 1
+fi
+
+log "Pool is \"$pool\"."
+log "Note: output files produced will be tagged with the current timestamp -- ${timestamp}."
+
# Run the two listings, either concurrently (-m) or sequentially, then
# bail out if either background task flagged a failure.
if [ $multithread -eq 1 ] ;then
  startsecs=$(date +%s)
  log "Starting multithread tasks..."
  rados_ls &
  radosgw_radoslist &
  jobs &> /dev/null # without this, the myjobs count always equals 1 (confused)
  myjobs=$(jobs | wc -l)
  while [ $myjobs -gt 0 ]; do
    # provide minutely status update
    if [ $(( ($(date +%s)-$startsecs) % 60 )) -eq 0 ]; then
      echo
      deltasecs=$(( $(date +%s)-$startsecs ))
      log "Waiting for listing tasks to complete. Running ${myjobs} tasks for ${deltasecs} seconds."
    fi
    sleep 1
    echo -n .
    # BUGFIX: these checks previously tested "$rgw_admin_flag", a variable
    # that is never defined (the flag file variable is $rgwadmin_flag, set
    # above), so a radosgw-admin failure was never noticed here.
    if [ -e "$rgwadmin_flag" ]; then
      exit 1
    fi
    if [ -e "$rados_flag" ]; then
      exit 2
    fi
    jobs &> /dev/null # without this, the myjobs count always equals 1 (confused)
    myjobs=$(jobs | wc -l)
  done
  echo
else
  rados_ls
  radosgw_radoslist
fi

# Final check after both listings have finished (covers the sequential
# path and any flag raised between the last poll and job completion).
if [ -e "$rgwadmin_flag" ]; then
  exit 1
fi

if [ -e "$rados_flag" ]; then
  exit 2
fi
+
+for myfile in $rados_out $rgwadmin_out; do
+ if [ ! -s "${myfile}" ]; then
+ log "ERROR: Empty file detected: ${myfile}"
+ log "ERROR: RESULTS ARE INCOMPLETE - DO NOT USE"
+ exit 1
+ fi
+done
+
+# Create an awk script in a file for parsing the two command outputs.
+log "Creating awk script for comparing outputs: ${incremental_grep_awk}"
+
# Write the comparison awk program to $incremental_grep_awk.  The heredoc
# delimiter is quoted so nothing inside is expanded by the shell.
cat <<"EOF" >$incremental_grep_awk
# This awk script is used by rgw-gap-list and will sequence through
# each line in $rados_out and $rgwadmin_out exactly once.
#
# During this iteration:
#  * The 1st column of $rgwadmin_out is compared to the line of
#    $rados_out.
#  * If they are equal, the next line of $rados_out is read in and the
#    next line of $rgwadmin_out is provided via normal awk iteration.
#  * If a value appears in $rgwadmin_out, but not $rados_out, this
#    indicates a possible deleted tail object and the accompanying
#    bucket / user object name is output, assuming it had not been
#    previously identified.
#    - A map of output bucket / user objects is maintained in memory
#  * If a value appears in $rados_out, but not in $rgwadmin_out, the
#    $rados_out file is iterated until the $rados_out line is equal
#    or > (alphabetically) the value from the $rgwadmin_out file.

function usage() {
  print "Example Usage:">>"/dev/stderr"
  print " # limit $fs to single char that will not appear in either output">>"/dev/stderr"
  print " # The below is Octal 376, or Hex 0xFE">>"/dev/stderr"
  print "">>"/dev/stderr"
  print " $ fs=$(echo -e \"\\0376\") ">>"/dev/stderr"
  print " $ rados ls -p default.rgw.buckets.data > rados_out.txt">>"/dev/stderr"
  print " $ radosgw-admin bucket radoslist --rgw-obj-fs=\"$fs\" \\">>"/dev/stderr"
  print " | sort --field-separator=\"$fs\" -k 1,1 > rgwadmin_out.txt">>"/dev/stderr"
  print " ">>"/dev/stderr"
  print " $ awk -F \"$fs\" \\">>"/dev/stderr"
  print " -v filetwo=rados_out.txt \\">>"/dev/stderr"
  print " -v map_out=MappedOutput.txt \\">>"/dev/stderr"
  print " -f ig_awk \\">>"/dev/stderr"
  print " rgwadmin_out.txt">>"/dev/stderr"
  print "">>"/dev/stderr"
  print " Result will be provided in the 'MappedOutput.txt' file in this">>"/dev/stderr"
  print " example. If you'd prefer the output to be sorted, you can run">>"/dev/stderr"
  print " $ sort MappedOutput.txt > SortedMappedOutput.txt">>"/dev/stderr"
  print "">>"/dev/stderr"
  print "">>"/dev/stderr"
  exit 1
}

# Shell out to date(1) once per call; returns "YYYY-MM-DD HH:MM:SS".
function get_date_time() {
  dtstr="date +%F\\ %T"
  dtstr | getline mydt
  close(dtstr)
  return mydt
}

# Periodic progress line on stderr: line counts and hits so far.
function status_out() {
  printf("%s % 17d\t% 17d\t% 12d\n",get_date_time(),f1_count,f2_count,lineoutCount)>>"/dev/stderr"
}

# Read the next line of filetwo into f2line/b[]; set f2_eof at EOF.
function advance_f2() {
  if ((getline f2line<filetwo) <= 0) {
    f2_eof=1
  } else {
    f2_count++
    bcount=split(f2line,b,FS)
  }
}

# Compare the current record's key ($1) to the pending filetwo key (b[1]).
# Returns 0 on match (filetwo advanced), 1 when $1 sorts first (missing
# from filetwo -> reported), 2 when filetwo must catch up.
function test_lines() {
  if ($1==b[1]) {
    advance_f2()
    return 0
  } else if ($1<b[1]) {
    line_out()
    return 1
  } else {
    return 2
  }
}

# Return 1 if the field contains a NUL character, else 0.
# BUGFIX(perf): split() is now called once instead of once per character.
function findnul(myfield,  i, a, cnt) {
  cnt = split(myfield,a,"")
  for(i=1;i<=cnt;i++) {
    if(ord[a[i]]==0) {
      return 1
    }
  }
  return 0
}

function line_out() {
  if(findnul($1)) {
    # If the RADOS object name has a NUL character, skip output
    return
  }
  # Note: Intentionally using $2 and $NF below
  # Use of $NF eliminates risk of exhausting input field count
  if ($2" "$NF!=lastline) {
    # Only output a given bucket/Obj combination once
    printf("Bucket: \"%s\" Object: \"%s\"\n", $2, $NF)>>map_out
    lastline=$2" "$NF
    lineoutCount++
  }
}

BEGIN {
  if(filetwo==""||map_out=="") {
    print "">>"/dev/stderr"
    print "">>"/dev/stderr"
    print "Missing parameter."
    print "">>"/dev/stderr"
    print "">>"/dev/stderr"
    usage()
  }
  status_delta=100000
  f1_count=0
  f2_count=0
  advance_f2()
  printf("%s File 1 Line Count\tFile 2 Line Count\tPotentially Impacted Objects\n",get_date_time())>>"/dev/stderr"
  for(n=0;n<256;n++) {
    ord[sprintf("%c",n)]=n
  }
}

{
  f1_count++
  if(f2_eof==0) {
    if(test_lines()==2) {
      # Advance filetwo until it catches up with the current key.
      # BUGFIX: stop on EOF of filetwo -- advance_f2() leaves b[] stale at
      # EOF, so without the f2_eof guard this loop spun forever whenever
      # the current line sorted after the last line of filetwo.
      while ($1>b[1] && f2_eof==0) {
        advance_f2()
      }
      if (f2_eof==0) {
        test_lines()
      } else {
        # filetwo is exhausted, so the current line is missing from it;
        # report it like the remaining lines below.
        line_out()
      }
    }
  } else {
    # If EOF hit, dump all remaining lines since they're missing
    # from filetwo
    line_out()
  }
  if((f1_count % status_delta)==0) {
    status_out()
  }
}

END {
  if(f1_count>0) {
    status_out()
  }
}

EOF
+
+
+log "Begin identifying potentially impacted user object names."
+
+echo -n > "$temp_file" # Ensure the file is empty
+awk -F "$fs" -v filetwo=$rados_out -v map_out=$temp_file -f $incremental_grep_awk $rgwadmin_out
+checkReturn $? "Identifying potentially impacted user object names" 1
+
+log "Begin sorting results."
+sort -T ${temp_prefix} "$temp_file" > "$gap_out"
+checkReturn $? "sorting results" 1
+rm -f "$temp_file"
+
+found=$(wc -l < "$gap_out")
+mydate=$(date +%F\ %T)
+
+log "Done."
+
+cat << EOF
+
+Found $found *possible* gaps.
+The results can be found in "${gap_out}".
+
+Intermediate files: "${rados_out}" and "${rgwadmin_out}".
+
+***
+*** WARNING: This is EXPERIMENTAL code and the results should be used
+*** with CAUTION and VERIFIED. Not everything listed is an
+*** actual gap. EXPECT false positives. Every result
+*** produced should be verified.
+***
+EOF
diff --git a/src/rgw/rgw-gap-list-comparator b/src/rgw/rgw-gap-list-comparator
new file mode 100755
index 000000000..c377fdaf8
--- /dev/null
+++ b/src/rgw/rgw-gap-list-comparator
@@ -0,0 +1,119 @@
+#!/usr/bin/awk -f
+
+#
+# Version 1
+#
+# This awk script takes two, similarly sorted lists and outputs
+# only the lines which exist in both lists. The script takes
+# three inputs:
+#
+# ./rgw-gap-list-comparator \
+# -v filetwo=gap-list-B.txt \
+# -v matchout=matched_lines.txt \
+# gap-list-A.txt
+#
+
+function usage() {
+ print "">>"/dev/stderr"
+ print "">>"/dev/stderr"
+ print "The idea behind the script is to eliminate false positive hits">>"/dev/stderr"
+ print "from the rgw-gap-list tool which are due to upload timing of new">>"/dev/stderr"
+ print "objects during the tool's execution. To use the tool properly,">>"/dev/stderr"
+ print "the following process should be followed:">>"/dev/stderr"
+ print "">>"/dev/stderr"
+ print "">>"/dev/stderr"
+ print " 1: Run the 'rgw-gap-list' tool twice">>"/dev/stderr"
+ print "">>"/dev/stderr"
+ print " 2: Sort the resulting map files:">>"/dev/stderr"
+ print " $ export LC_ALL=C">>"/dev/stderr"
+ print " $ sort gap-list-A.gap > gap-list-A.sorted.gap">>"/dev/stderr"
+ print " $ sort gap-list-B.gap > gap-list.B.sorted.gap">>"/dev/stderr"
+ print " -- Where the A / B in the gap-list file names are the date/time associated with each of the respective 'rgw-gap-list' outputs">>"/dev/stderr"
+ print "">>"/dev/stderr"
+ print " 3: Run the 'same_lines_only.awk' script over the two files:">>"/dev/stderr"
+ print " $ rm matched_lines.txt">>"/dev/stderr"
+ print " $ ./rgw-gap-list-comparator -v filetwo=gap-list-B.sorted.gap -v matchout=matched_lines.txt gap-list-A.sorted.gap">>"/dev/stderr"
+ print " -- Where the A / B in the gap-list file names are the date/time associated with each of the respective 'rgw-gap-list' outputs">>"/dev/stderr"
+ print "">>"/dev/stderr"
+ print " The resulting 'matched_lines.txt' will be a high confidence list of impacted objects with little to no false positives.">>"/dev/stderr"
+ print "">>"/dev/stderr"
+ print "">>"/dev/stderr"
+ exit 1
+}
+
+function advance_f2() {
+ if ((getline f2line<filetwo) <= 0) {
+ f2_eof=1
+ } else {
+ f2_count++
+ }
+}
+
# Compare the current line of file one ($0) against the pending line of
# file two (f2line); lines present in both files are appended to matchout.
# Returns: 0 on a match (file two advanced), 2 when $0 sorts after f2line
# (caller must advance file two), 1 when $0 sorts before f2line.
function test_lines() {
  if ($0 == f2line) {
    print $0 >> matchout
    lineoutcount++
    advance_f2()
    return 0
  }
  return ($0 > f2line) ? 2 : 1
}
+
+function status_out() {
+ printf("%s % 17d\t% 17d\t% 12d\n",get_date_time(),f1_count,f2_count,lineoutcount)>>"/dev/stderr"
+}
+
+function get_date_time() {
+ dtstr="date +%F\\ %T"
+ dtstr | getline mydt
+ close(dtstr)
+ return mydt
+}
+
+BEGIN {
+ if(filetwo==""||matchout=="") {
+ print "">>"/dev/stderr"
+ print "">>"/dev/stderr"
+ print "Missing parameter."
+ print "">>"/dev/stderr"
+ print "">>"/dev/stderr"
+ usage()
+ }
+
+ f1_count=0
+ f2_count=0
+ lineoutcount=0
+ f2_eof=0
+ statusevery=100000
+ advance_f2()
+ printf("%s File 1 Line Count\tFile 2 Line Count\tPotentially Impacted Objects\n",get_date_time())>>"/dev/stderr"
+ status_out()
+}
+
+
+{
+ f1_count++
+ if(f2_eof==0) {
+ if(test_lines()==2) {
+ while($0>f2line && f2_eof==0) {
+ advance_f2()
+ }
+ test_lines()
+ }
+ } else {
+ exit 0
+ }
+ if ((f1_count % statusevery)==0) {
+ status_out()
+ }
+}
+
+END {
+ if(f1_count>0) {
+ status_out()
+ }
+}
+
diff --git a/src/rgw/rgw-orphan-list b/src/rgw/rgw-orphan-list
new file mode 100755
index 000000000..c8856e8ee
--- /dev/null
+++ b/src/rgw/rgw-orphan-list
@@ -0,0 +1,278 @@
+#!/usr/bin/env bash
+
+# version 2023-01-11
+
+# IMPORTANT: affects order produced by 'sort' and 'ceph-diff-sorted'
+# relies on this ordering
+export LC_ALL=C
+
+# If your ceph.conf is not in /etc/ceph, then set CEPH_CONF="-c /path/to/ceph.conf"
+
+trap "exit 1" TERM
+TOP_PID=$$
+
+out_dir="."
+timestamp=$(date -u +%Y%m%d%H%M%S)
+lspools_err="${out_dir}/lspools-${timestamp}.error"
+rados_out="${out_dir}/rados-${timestamp}.intermediate"
+rados_odd="${out_dir}/rados-${timestamp}.issues"
+rados_err="${out_dir}/rados-${timestamp}.error"
+rgwadmin_out="${out_dir}/radosgw-admin-${timestamp}.intermediate"
+rgwadmin_err="${out_dir}/radosgw-admin-${timestamp}.error"
+delta_out="${out_dir}/orphan-list-${timestamp}.out"
+
+log() {
+ echo $(date +%F\ %T) $(hostname -s) "$1"
+}
+
# Print usage help to stderr, then terminate.
# BUGFIX: the help text previously named a nonexistent 'ceph-diff-tool'
# command; the command this script actually runs is 'ceph-diff-sorted'.
usage() {
  >&2 cat << EOF

Usage: $0 [-h] "<radospools>" [<temp_dir>]

Where:
  -h            This help output
  <radospools>  The RGW data pool name, if omitted, pool name will be
                prompted for during execution.
                If specifying multiple pools, please use space separated
                list and wrap the entire list in quotes.

  <temp_dir>    Optionally, set the directory to use for temp space.
                This may be required if /tmp is low on space.

NOTES:
  - This tool should be run on a node with the ceph-radosgw package
    installed. Specifically, it needs the 'ceph-diff-sorted' command from
    that package.

  - This tool is currently considered to be EXPERIMENTAL.

  - False positives are possible. False positives would likely
    appear as objects that were never deleted and are fully
    intact. All results should therefore be verified.

WARNING:
  - Indexless buckets will appear as 100% orphan objects.
  - Therefore, this tool MUST NOT be used in environments with indexless
    buckets.

EOF
  exit 1
}
+
+#
+# checkReturn RETURNCODE MESSAGE TERMINATE
+# RETURNCODE - ( usually $? ) of previous command
+# MESSAGE - Message to print on non-zero return code
+# TERMINATE - non-empty == terminate the script on non-zero return code
+#
+checkReturn() {
+ if [ $1 -ne 0 ]; then
+ error_addon=""
+ if [ ! -z "$3" ]; then
+ error_addon="; Terminating"
+ fi
+ log "ERROR: ${2} failed: returned ${1}${error_addon}"
+ if [ ! -z "$3" ]; then
+ >&2 echo
+ >&2 echo '***'
+ >&2 echo '*** WARNING: The results are incomplete. Do not use! ***'
+ >&2 echo '***'
+ kill -s TERM $TOP_PID
+ fi
+ fi
+}
+
+prompt_pool() {
+ # note: all prompts go to stderr so stdout contains just the result
+ >&2 echo "Available pools:"
+ rados ${CEPH_CONF} lspools >"$temp_file" 2>"$lspools_err"
+ checkReturn $? "Listing pools failed" 1
+
+ >&2 sed 's/^/ /' "$temp_file" # list pools and indent
+ >&2 printf "Which pool do you want to search for orphans (for multiple, use space-separated list)? "
+ local mypool
+ read mypool
+ echo $mypool
+}
+
+radosgw_radoslist() {
+ log "Running 'radosgw-admin bucket radoslist'."
+ rm -f "$rgwadmin_flag" &> /dev/null
+ radosgw-admin ${CEPH_CONF} bucket radoslist >"$rgwadmin_out" 2>"$rgwadmin_err"
+ RETVAL=$?
+ if [ "$RETVAL" -ne 0 ] ;then
+ touch "$rgwadmin_flag"
+ fi
+ checkReturn $RETVAL "radosgw-admin radoslist" 1
+ log "Completed 'radosgw-admin bucket radoslist'."
+
+ log "Sorting 'radosgw-admin bucket radoslist' output."
+ sort -T ${temp_prefix} -u "$rgwadmin_out" > "$rgwadmin_temp"
+ checkReturn $? "Sorting 'radosgw-admin bucket radoslist' output" 1
+ log "Completed sorting 'radosgw-admin bucket radoslist'."
+
+ log "Moving 'radosgw-admin bucket radoslist' output."
+ mv -f "$rgwadmin_temp" "$rgwadmin_out"
+ checkReturn $? "Moving 'radosgw-admin bucket radoslist' output" 1
+ log "Completed moving 'radosgw-admin bucket radoslist' output."
+}
+
+rados_ls() {
+ log "Starting 'rados ls' function."
+ rm -f "$rados_flag" &> /dev/null
+ rm -f "$rados_out" &> /dev/null
+ local mypool
+ for mypool in $pool; do
+ log "Running 'rados ls' on pool ${mypool}."
+ rados ${CEPH_CONF} ls --pool="$mypool" --all >>"$rados_out" 2>"$rados_err"
+ RETVAL=$?
+ if [ "$RETVAL" -ne 0 ] ;then
+ touch "$rados_flag"
+ fi
+ checkReturn $RETVAL "'rados ls' on pool ${mypool}" 1
+ log "Completed 'rados ls' on pool ${mypool}."
+ done
+ if [ ! -e "$rados_flag" ]; then
+ # NOTE: Each entry (line of output) of `rados ls --all` should be in
+ # one of four formats depending on whether or not an entry has a
+ # namespace and/or locator:
+ #
+ # <TAB>oid
+ # <TAB>oid<TAB>locator
+ # namespace<TAB>oid
+ # namespace<TAB>oid<TAB>locator
+ #
+ # Any occurrences of the 2nd, 3rd, or 4th (i.e., existence of
+ # namespace and/or locator) should cause the create of the "odd" file
+ # and an explanation in the output, and those entries will not be
+ # retained, and therefore they will not be called out as orphans. They
+ # will need special handling by the end-user as we do not expect
+ # namespaces or locators.
+
+ # check for namespaces -- any line that does not begin with a tab
+ # indicates a namespace; add those to "odd" file and set flag; note:
+ # this also picks up entries with namespace and locator
+ log "Checking for namespaces"
+ grep --text $'^[^\t]' "$rados_out" >"$rados_odd"
+ if [ "${PIPESTATUS[0]}" -eq 0 ] ;then
+ log "Namespaces found"
+ namespace_found=1
+ fi
+
+    # check for locators (w/o namespace); we identify them by skipping
+ # past the empty namespace (i.e., one TAB), skipping past the oid,
+ # then looking for a TAB; note we use egrep to get the '+' character
+ # and the $ in front of the ' allows the \t to be interpreted as a TAB
+ log "Checking for locators"
+ egrep --text $'^\t[[:graph:]]+\t' "$rados_out" >>"$rados_odd"
+ if [ "${PIPESTATUS[0]}" -eq 0 ] ;then
+ log "Locator found"
+ locator_found=1
+ fi
+
+ # extract the entries that are just oids (i.e., no namespace or
+ # locator) for further processing; only look at lines that begin with
+ # a TAB and do not contain a second TAB, and then grab everything
+ # after the initial TAB
+ log "Generating final 'rados ls' output (without namespaces or locators)"
+ grep --text $'^\t' "$rados_out" | grep --text -v $'^\t.*\t' | sed -E 's/^\t//' >"$temp_file"
+ mv -f "$temp_file" "$rados_out"
+
+ log "Sorting 'rados ls' output(s)."
+ sort -T ${temp_prefix} -u "$rados_out" >"$temp_file"
+ checkReturn $? "Sorting 'rados ls' output(s)" 1
+ log "Sorting 'rados ls' output(s) complete."
+
+ log "Moving sorted output(s)."
+ mv -f "$temp_file" "$rados_out"
+ checkReturn $? "Moving temp file to output file" 1
+ fi
+}
+
# Optional second argument: directory for temporary/scratch files
# (defaults to /tmp, which may be too small on large clusters).
temp_prefix="/tmp"
if [ ! -z "$2" ]; then
  if [ -d "$2" ]; then
    # (previously assigned twice -- once here and once unconditionally
    # after the if/else; the redundant second assignment was removed)
    temp_prefix="$2"
  else
    echo
    echo "ERROR: Provided temp directory does not exist: ${2}"
    usage
  fi
fi
temp_file=${temp_prefix}/temp.$$
rados_flag=${temp_prefix}/rados_flag.$$
rgwadmin_flag=${temp_prefix}/rgwadmin_flag.$$
rgwadmin_temp=${temp_prefix}/rgwadmin_temp.$$

# First argument: pool name(s); prompt interactively when absent.
if [ $# -eq 0 ] ;then
  pool="$(prompt_pool)"
else
  if [ "$1" == "-h" ]; then
    usage
  fi
  pool="$1"
fi

# Validate every supplied pool against the cluster's pool list.
error=0
rados ${CEPH_CONF} lspools > "$temp_file"
# Abort immediately if we cannot even list pools (matches rgw-gap-list,
# which checks this return code; previously a failure here went unnoticed
# and every pool was then reported as nonexistent).
checkReturn $? "rados lspools" 1
for mypool in $pool; do
  if [ $(grep -c "^${mypool}$" "${temp_file}") -eq 0 ]; then
    echo
    echo "ERROR: Supplied pool does not exist: ${mypool}"
    error=1
  fi
done
if [ $error -gt 0 ]; then
  echo "Terminating"
  exit 1
fi
+
+log "Pool is \"$pool\"."
+log "Note: output files produced will be tagged with the current timestamp -- ${timestamp}."
+
+rados_ls
+radosgw_radoslist
+
+#
+# Check for any empty output files
+#
+
+for myfile in $rados_out $rgwadmin_out; do
+ if [ ! -s "${myfile}" ]; then
+ log "ERROR: Empty file detected: ${myfile}"
+ log "ERROR: RESULTS ARE INCOMPLETE - DO NOT USE"
+ exit 1
+ fi
+done
+
+log "Computing delta..."
+ceph-diff-sorted "$rados_out" "$rgwadmin_out" | grep --text "^<" | sed 's/^< *//' >"$delta_out"
+# use PIPESTATUS to get at exit status of first process in above pipe;
+# 0 means same, 1 means different, >1 means error
+if [ "${PIPESTATUS[0]}" -gt 1 ] ;then
+ log "ERROR: ceph-diff-sorted failed with status: ${PIPESTATUS[0]}"
+ log "TERMINATING - Results are incomplete - DO NOT USE"
+ exit 1
+fi
+
+log "Computing results..."
+found=$(wc -l < "$delta_out")
+possible=$(wc -l < "$rados_out")
+percentage=0
+if [ $possible -ne 0 ] ;then
+ percentage=$(expr 100 \* $found / $possible)
+fi
+
+echo "$found potential orphans found out of a possible $possible (${percentage}%)."
+echo "The results can be found in '${delta_out}'."
+echo " Intermediate files are '${rados_out}' and '${rgwadmin_out}'."
+if [ -n "$namespace_found" -o -n "$locator_found" ] ;then
+ echo " Note: 'rados ls' found entries that might be in a namespace or might"
+ echo " have a locator; see '${rados_odd}' for those entries."
+fi
+echo "***"
+echo "*** WARNING: This is EXPERIMENTAL code and the results should be used"
+echo "*** only with CAUTION!"
+echo "***"
+echo "Done at $(date +%F\ %T)."
diff --git a/src/rgw/rgw-restore-bucket-index b/src/rgw/rgw-restore-bucket-index
new file mode 100755
index 000000000..056658119
--- /dev/null
+++ b/src/rgw/rgw-restore-bucket-index
@@ -0,0 +1,250 @@
+#!/usr/bin/env bash
+
+# version 2023-03-21
+
+# rgw-restore-bucket-index is an EXPERIMENTAL tool to use in case
+# bucket index entries for objects in the bucket are somehow lost. It
+# is expected to be needed and used rarely. A bucket name is provided
+# and the data pool for that bucket is scanned for all head objects
+# matching the bucket's marker. The rgw object name is then extracted
+# from the rados object name, and `radosgw-admin bucket reindex ...`
+# is used to add the bucket index entry.
+#
+# Because this script must process json objects, the `jq` tool must be
+# installed on the system.
+#
+# Usage: $0 [--proceed] <bucket-name> [data-pool-name]
+#
+# This tool is designed to be interactive, allowing the user to
+# examine the list of objects to be reindexed before
+# proceeding. However, if the "--proceed" option is provided, the
+# script will not prompt the user and simply proceed.
+
+trap "clean ; exit 1" TERM
+export TOP_PID=$$
+
+# IMPORTANT: affects order produced by 'sort' and 'ceph-diff-sorted'
+# relies on this ordering
+export LC_ALL=C
+
+export bkt_entry=/tmp/rgwrbi-bkt-entry.$$
+export bkt_inst=/tmp/rgwrbi-bkt-inst.$$
+export bkt_inst_new=/tmp/rgwrbi-bkt-inst-new.$$
+export obj_list=/tmp/rgwrbi-object-list.$$
+export zone_info=/tmp/rgwrbi-zone-info.$$
+export clean_temps=1
+
+# number of seconds for a bucket index pending op to be completed via
+# dir_suggest mechanism
+pending_op_secs=120
+
+#
+if which radosgw-admin > /dev/null ;then
+ :
+else
+ echo 'Error: must have command `radosgw-admin` installed and on $PATH for operation.'
+ exit 1
+fi
+
+# make sure jq is available
+if which jq > /dev/null ;then
+ :
+else
+ echo 'Error: must have command `jq` installed and on $PATH for json parsing.'
+ exit 1
+fi
+
+clean() {
+ if [ -n "$clean_temps" ] ;then
+ rm -f $bkt_entry $bkt_inst $bkt_inst_new $obj_list $zone_info
+ fi
+}
+
+super_exit() {
+ kill -s TERM $TOP_PID
+}
+
+usage() {
+ >&2 cat << EOF
+
+Usage: $0 [--proceed] <bucket-name> [data-pool-name]
+ NOTE: This tool is currently considered EXPERIMENTAL.
+ NOTE: If a data-pool-name is not supplied then it will be inferred from bucket and zone information.
+ NOTE: If --proceed is provided then user will not be prompted to proceed. Use with caution.
+EOF
+ super_exit
+}
+
+# strips the starting and ending double quotes from a string, so:
+# "dog" -> dog
+# "dog -> "dog
+# d"o"g -> d"o"g
+# "do"g" -> do"g
strip_quotes() {
  # A matched leading+trailing quote pair is removed in a single sed
  # substitution; any other input passes through unchanged (see the
  # examples above).  A herestring feeds the argument to sed.
  sed 's/^"\(.*\)"$/\1/' <<< "$1"
}
+
+# Determines the name of the data pool. Expects the optional
+# command-line argument to appear as $1 if there is one. The
+# command-line has the highest priority, then the "explicit_placement"
+# in the bucket instance data, and finally the "placement_rule" in the
+# bucket instance data.
# Determines the name of the data pool. Expects the optional
# command-line argument to appear as $1 if there is one. The
# command-line has the highest priority, then the "explicit_placement"
# in the bucket instance data, and finally the "placement_rule" in the
# bucket instance data.
# (Runs inside $(...) command substitution, so `exit 0` only ends the
# subshell; the pool name is this function's stdout.)
get_pool() {
  # command-line
  if [ -n "$1" ] ;then
    echo "$1"
    exit 0
  fi

  # explicit_placement
  # NOTE(review): if the explicit_placement.data_pool key were absent, jq
  # would print the literal string "null", which passes the -n test below;
  # this assumes bucket instance JSON always carries the key (possibly as
  # an empty string) -- confirm against radosgw-admin output.
  expl_pool=$(strip_quotes $(jq '.data.bucket_info.bucket.explicit_placement.data_pool' $bkt_inst))
  if [ -n "$expl_pool" ] ;then
    echo "$expl_pool"
    exit 0
  fi

  # placement_rule: "<placement>[/<storage-class>]"; class defaults to
  # STANDARD when the rule has no '/' component
  plmt_rule=$(strip_quotes $(jq '.data.bucket_info.placement_rule' $bkt_inst))
  plmt_pool=$(echo "$plmt_rule" | awk -F / '{print $1}')
  plmt_class=$(echo "$plmt_rule" | awk -F / '{print $2}')
  if [ -z "$plmt_class" ] ;then
    plmt_class=STANDARD
  fi

  # look the placement target up in the zone's placement_pools map
  radosgw-admin zone get >$zone_info 2>/dev/null
  pool=$(strip_quotes $(jq ".placement_pools [] | select(.key | contains(\"${plmt_pool}\")) .val .storage_classes.${plmt_class}.data_pool" $zone_info))

  if [ -z "$pool" ] ;then
    echo ERROR: unable to determine pool.
    super_exit
  fi
  echo "$pool"
}
+
# Optional leading flag: skip the interactive confirmation prompts.
# BUGFIX: "$1" is now quoted -- with the previous unquoted form, invoking
# the script with no arguments expanded to `[ == "--proceed" ]`, making
# the test builtin emit a "unary operator expected" error.
if [ "$1" == "--proceed" ] ;then
  echo "NOTICE: This tool is currently considered EXPERIMENTAL."
  proceed=1
  shift
fi

# expect 1 or 2 arguments
if [ $# -eq 0 -o $# -gt 2 ] ;then
  usage
fi
+
+bucket=$1
+
+# read bucket entry metadata
+radosgw-admin metadata get bucket:$bucket >$bkt_entry 2>/dev/null
+marker=$(strip_quotes $(jq ".data.bucket.marker" $bkt_entry))
+bucket_id=$(strip_quotes $(jq ".data.bucket.bucket_id" $bkt_entry))
+if [ -z "$marker" -o -z "$bucket_id" ] ;then
+ echo "ERROR: unable to read entry-point metadata for bucket \"$bucket\"."
+ clean
+ exit 1
+fi
+
+echo marker is $marker
+echo bucket_id is $bucket_id
+
+# read bucket instance metadata
+radosgw-admin metadata get bucket.instance:${bucket}:$bucket_id >$bkt_inst 2>/dev/null
+
+# handle versioned buckets
+bkt_flags=$(jq ".data.bucket_info.flags" $bkt_inst)
+if [ -z "$bkt_flags" ] ;then
+ echo "ERROR: unable to read instance metadata for bucket \"$bucket\"."
+ exit 1
+fi
+
+# mask bit indicating it's a versioned bucket
+is_versioned=$(( $bkt_flags & 2))
+if [ "$is_versioned" -ne 0 ] ;then
+ echo "Error: this bucket appears to be versioned, and this tool cannot work with versioned buckets."
+ clean
+ exit 1
+fi
+
+# examine number of bucket index shards
+num_shards=$(jq ".data.bucket_info.num_shards" $bkt_inst)
+echo number of bucket index shards is $num_shards
+
+# determine data pool
+pool=$(get_pool $2)
+echo data pool is $pool
+
+# search the data pool for all of the head objects that begin with the
+# marker that are not in namespaces (indicated by an extra underscore)
+# and then strip away all but the rgw object name
+( rados -p $pool ls | grep "^${marker}_[^_]" | sed "s/^${marker}_\(.*\)/\1/" >$obj_list ) 2>/dev/null
+
+# handle the case where the resulting object list file is empty
+if [ -s $obj_list ] ;then
+ :
+else
+ echo "NOTICE: No head objects for bucket \"$bucket\" were found in pool \"$pool\", so nothing was recovered."
+ clean
+ exit 0
+fi
+
+if [ -z "$proceed" ] ;then
+ # warn user and get permission to proceed
+ echo "NOTICE: This tool is currently considered EXPERIMENTAL."
+ echo "The list of objects that we will attempt to restore can be found in \"$obj_list\"."
+ echo "Please review the object names in that file (either below or in another window/terminal) before proceeding."
+ while true ; do
+ read -p "Type \"proceed!\" to proceed, \"view\" to view object list, or \"q\" to quit: " action
+ if [ "$action" == "q" ] ;then
+ echo "Exiting..."
+ clean
+ exit 0
+ elif [ "$action" == "view" ] ;then
+ echo "Viewing..."
+ less $obj_list
+ elif [ "$action" == "proceed!" ] ;then
+ echo "Proceeding..."
+ break
+ else
+ echo "Error: response \"$action\" is not understood."
+ fi
+ done
+fi
+
+# execute object rewrite on all of the head objects
+radosgw-admin object reindex --bucket=$bucket --objects-file=$obj_list 2>/dev/null
+reindex_done=$(date +%s)
+
+# note: large is 2^30
+export large=1073741824
+
+listcmd="radosgw-admin bucket list --bucket=$bucket --allow-unordered --max-entries=$large"
+
+if [ -n "$proceed" ] ;then
+ sleep $pending_op_secs
+ $listcmd >/dev/null 2>/dev/null
+else
+ echo "NOTICE: Bucket stats are currently incorrect. They can be restored with the following command after 2 minutes:"
+ echo " $listcmd"
+
+ while true ; do
+ read -p "Would you like to take the time to recalculate bucket stats now? [yes/no] " action
+ if [ "$action" == "no" ] ;then
+ break
+ elif [ "$action" == "yes" ] ;then
+ # make sure at least $pending_op_secs since reindex completed
+ now=$(date +%s)
+ sleep_time=$(expr $pending_op_secs - $now + $reindex_done)
+ if [ "$sleep_time" -gt 0 ] ;then
+ sleep $sleep_time
+ fi
+
+ $listcmd >/dev/null 2>/dev/null
+ break
+ else
+ echo "Error: response \"$action\" is not understood."
+ fi
+ done
+fi
+
+clean
+echo Done
diff --git a/src/rgw/rgw_acl.cc b/src/rgw/rgw_acl.cc
new file mode 100644
index 000000000..f32a73f26
--- /dev/null
+++ b/src/rgw/rgw_acl.cc
@@ -0,0 +1,442 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <string.h>
+
+#include <iostream>
+#include <map>
+
+#include "include/types.h"
+
+#include "common/Formatter.h"
+
+#include "rgw_acl.h"
+#include "rgw_acl_s3.h"
+#include "rgw_user.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+// Deep-equality operators for the ACL value types. Each operator!= is
+// defined in terms of the matching operator== so the two can never diverge.
+bool operator==(const ACLPermission& lhs, const ACLPermission& rhs) {
+  return lhs.flags == rhs.flags;
+}
+bool operator!=(const ACLPermission& lhs, const ACLPermission& rhs) {
+  return !(lhs == rhs);
+}
+
+bool operator==(const ACLGranteeType& lhs, const ACLGranteeType& rhs) {
+  return lhs.type == rhs.type;
+}
+bool operator!=(const ACLGranteeType& lhs, const ACLGranteeType& rhs) {
+  return lhs.type != rhs.type;
+}
+
+// Compares every stored field of a grant, not just the grantee identity.
+bool operator==(const ACLGrant& lhs, const ACLGrant& rhs) {
+  return lhs.type == rhs.type && lhs.id == rhs.id
+      && lhs.email == rhs.email && lhs.permission == rhs.permission
+      && lhs.name == rhs.name && lhs.group == rhs.group
+      && lhs.url_spec == rhs.url_spec;
+}
+bool operator!=(const ACLGrant& lhs, const ACLGrant& rhs) {
+  return !(lhs == rhs);
+}
+
+bool operator==(const ACLReferer& lhs, const ACLReferer& rhs) {
+  return lhs.url_spec == rhs.url_spec && lhs.perm == rhs.perm;
+}
+bool operator!=(const ACLReferer& lhs, const ACLReferer& rhs) {
+  return !(lhs == rhs);
+}
+
+// Compares all four internal indexes; grant_map alone would suffice for
+// logical equality only if the derived maps were guaranteed consistent.
+bool operator==(const RGWAccessControlList& lhs,
+                const RGWAccessControlList& rhs) {
+  return lhs.acl_user_map == rhs.acl_user_map
+      && lhs.acl_group_map == rhs.acl_group_map
+      && lhs.referer_list == rhs.referer_list
+      && lhs.grant_map == rhs.grant_map;
+}
+bool operator!=(const RGWAccessControlList& lhs,
+                const RGWAccessControlList& rhs) {
+  return !(lhs == rhs);
+}
+
+bool operator==(const ACLOwner& lhs, const ACLOwner& rhs) {
+  return lhs.id == rhs.id && lhs.display_name == rhs.display_name;
+}
+bool operator!=(const ACLOwner& lhs, const ACLOwner& rhs) {
+  return !(lhs == rhs);
+}
+
+bool operator==(const RGWAccessControlPolicy& lhs,
+                const RGWAccessControlPolicy& rhs) {
+  return lhs.acl == rhs.acl && lhs.owner == rhs.owner;
+}
+bool operator!=(const RGWAccessControlPolicy& lhs,
+                const RGWAccessControlPolicy& rhs) {
+  return !(lhs == rhs);
+}
+
+// Index a grant into the fast-lookup structures (referer_list,
+// acl_group_map, acl_user_map) according to its grantee type. This does
+// NOT insert into grant_map; callers use add_grant() for that.
+void RGWAccessControlList::_add_grant(ACLGrant *grant)
+{
+  ACLPermission& perm = grant->get_permission();
+  ACLGranteeType& type = grant->get_type();
+  switch (type.get_type()) {
+  case ACL_TYPE_REFERER:
+    referer_list.emplace_back(grant->get_referer(), perm.get_permissions());
+
+    /* We're specially handling the Swift's .r:* as the S3 API has a similar
+     * concept and thus we can have a small portion of compatibility here. */
+    if (grant->get_referer() == RGW_REFERER_WILDCARD) {
+      acl_group_map[ACL_GROUP_ALL_USERS] |= perm.get_permissions();
+    }
+    break;
+  case ACL_TYPE_GROUP:
+    acl_group_map[grant->get_group()] |= perm.get_permissions();
+    break;
+  default:
+    {
+      rgw_user id;
+      // get_id() only fails for group/referer grantees, which are handled
+      // above; log at level 0 but still index under the (empty) id.
+      if (!grant->get_id(id)) {
+        ldout(cct, 0) << "ERROR: grant->get_id() failed" << dendl;
+      }
+      acl_user_map[id.to_str()] |= perm.get_permissions();
+    }
+  }
+}
+
+// Store a copy of the grant keyed by grantee id, then index it into the
+// fast-lookup maps via _add_grant().
+void RGWAccessControlList::add_grant(ACLGrant *grant)
+{
+  rgw_user id;
+  grant->get_id(id); // note that this will return false for groups, but that's ok, we won't search groups
+  grant_map.insert(pair<string, ACLGrant>(id.to_str(), *grant));
+  _add_grant(grant);
+}
+
+// Remove every grant and the cached permission entry for the given
+// canonical user. Group and referer grants are unaffected.
+void RGWAccessControlList::remove_canon_user_grant(rgw_user& user_id)
+{
+  const std::string key = user_id.to_str();
+
+  // multimap::erase(key) removes all entries under this user and is a
+  // no-op when none exist, so no preliminary find()/equal_range() pair
+  // is needed; likewise for the plain map below.
+  grant_map.erase(key);
+  acl_user_map.erase(key);
+}
+
+// Return the subset of perm_mask that this identity is granted via the
+// per-user ACL entries (aclspec matching is delegated to the identity).
+uint32_t RGWAccessControlList::get_perm(const DoutPrefixProvider* dpp,
+                                        const rgw::auth::Identity& auth_identity,
+                                        const uint32_t perm_mask)
+{
+  ldpp_dout(dpp, 5) << "Searching permissions for identity=" << auth_identity
+                    << " mask=" << perm_mask << dendl;
+
+  return perm_mask & auth_identity.get_perms_from_aclspec(dpp, acl_user_map);
+}
+
+// Return the subset of perm_mask granted to the given ACL group, or 0
+// when the group has no entry in acl_group_map.
+uint32_t RGWAccessControlList::get_group_perm(const DoutPrefixProvider *dpp,
+                                              ACLGroupTypeEnum group,
+                                              const uint32_t perm_mask) const
+{
+  ldpp_dout(dpp, 5) << "Searching permissions for group=" << (int)group
+                    << " mask=" << perm_mask << dendl;
+
+  const auto iter = acl_group_map.find((uint32_t)group);
+  if (iter != acl_group_map.end()) {
+    ldpp_dout(dpp, 5) << "Found permission: " << iter->second << dendl;
+    return iter->second & perm_mask;
+  }
+  ldpp_dout(dpp, 5) << "Permissions for group not found" << dendl;
+  return 0;
+}
+
+// Transform current_perm according to Swift HTTP Referer ACLs: the LAST
+// matching referer entry wins, which is what lets later negative grants
+// override earlier positive ones.
+uint32_t RGWAccessControlList::get_referer_perm(const DoutPrefixProvider *dpp,
+                                                const uint32_t current_perm,
+                                                const std::string http_referer,
+                                                const uint32_t perm_mask)
+{
+  ldpp_dout(dpp, 5) << "Searching permissions for referer=" << http_referer
+                    << " mask=" << perm_mask << dendl;
+
+  /* This function is basically a transformation from current perm to
+   * a new one that takes into consideration the Swift's HTTP referer-
+   * based ACLs. We need to go through all items to respect negative
+   * grants. */
+  uint32_t referer_perm = current_perm;
+  for (const auto& r : referer_list) {
+    if (r.is_match(http_referer)) {
+      referer_perm = r.perm;
+    }
+  }
+
+  ldpp_dout(dpp, 5) << "Found referer permission=" << referer_perm << dendl;
+  return referer_perm & perm_mask;
+}
+
+// Compute the effective permissions (within perm_mask) for an identity:
+// explicit user grants first, then owner ACP rights, then — unless
+// ignore_public_acls — the public/authenticated group grants, and finally
+// the Swift referer ACLs. Each stage is skipped once the full mask has
+// already been satisfied.
+uint32_t RGWAccessControlPolicy::get_perm(const DoutPrefixProvider* dpp,
+                                          const rgw::auth::Identity& auth_identity,
+                                          const uint32_t perm_mask,
+                                          const char * const http_referer,
+                                          bool ignore_public_acls)
+{
+  ldpp_dout(dpp, 20) << "-- Getting permissions begin with perm_mask=" << perm_mask
+                     << dendl;
+
+  uint32_t perm = acl.get_perm(dpp, auth_identity, perm_mask);
+
+  // The owner can always read/write its own ACLs, regardless of grants.
+  if (auth_identity.is_owner_of(owner.get_id())) {
+    perm |= perm_mask & (RGW_PERM_READ_ACP | RGW_PERM_WRITE_ACP);
+  }
+
+  if (perm == perm_mask) {
+    return perm;
+  }
+
+  /* should we continue looking up? */
+  if (!ignore_public_acls && ((perm & perm_mask) != perm_mask)) {
+    perm |= acl.get_group_perm(dpp, ACL_GROUP_ALL_USERS, perm_mask);
+
+    if (false == auth_identity.is_owner_of(rgw_user(RGW_USER_ANON_ID))) {
+      /* this is not the anonymous user */
+      perm |= acl.get_group_perm(dpp, ACL_GROUP_AUTHENTICATED_USERS, perm_mask);
+    }
+  }
+
+  /* Should we continue looking up even deeper? */
+  if (nullptr != http_referer && (perm & perm_mask) != perm_mask) {
+    // NOTE: get_referer_perm replaces (not ORs) perm — referer ACLs can
+    // also revoke permissions via negative grants.
+    perm = acl.get_referer_perm(dpp, perm, http_referer, perm_mask);
+  }
+
+  ldpp_dout(dpp, 5) << "-- Getting permissions done for identity=" << auth_identity
+                    << ", owner=" << owner.get_id()
+                    << ", perm=" << perm << dendl;
+
+  return perm;
+}
+
+// Check whether the identity holds ALL bits of `perm` (after mapping the
+// Swift READ_OBJS/WRITE_OBJS bucket bits onto the equivalent S3 bits and
+// applying the caller-supplied user_perm_mask).
+bool RGWAccessControlPolicy::verify_permission(const DoutPrefixProvider* dpp,
+                                               const rgw::auth::Identity& auth_identity,
+                                               const uint32_t user_perm_mask,
+                                               const uint32_t perm,
+                                               const char * const http_referer,
+                                               bool ignore_public_acls)
+{
+  // Also probe the OBJS bits so Swift-style grants can satisfy the request.
+  uint32_t test_perm = perm | RGW_PERM_READ_OBJS | RGW_PERM_WRITE_OBJS;
+
+  uint32_t policy_perm = get_perm(dpp, auth_identity, test_perm, http_referer, ignore_public_acls);
+
+  /* the swift WRITE_OBJS perm is equivalent to the WRITE obj, just
+    convert those bits. Note that these bits will only be set on
+    buckets, so the swift READ permission on bucket will allow listing
+    the bucket content */
+  if (policy_perm & RGW_PERM_WRITE_OBJS) {
+    policy_perm |= (RGW_PERM_WRITE | RGW_PERM_WRITE_ACP);
+  }
+  if (policy_perm & RGW_PERM_READ_OBJS) {
+    policy_perm |= (RGW_PERM_READ | RGW_PERM_READ_ACP);
+  }
+
+  uint32_t acl_perm = policy_perm & perm & user_perm_mask;
+
+  ldpp_dout(dpp, 10) << " identity=" << auth_identity
+                     << " requested perm (type)=" << perm
+                     << ", policy perm=" << policy_perm
+                     << ", user_perm_mask=" << user_perm_mask
+                     << ", acl perm=" << acl_perm << dendl;
+
+  return (perm == acl_perm);
+}
+
+
+// A policy is "public" when either the AllUsers or AuthenticatedUsers
+// group holds any valid permission bit.
+bool RGWAccessControlPolicy::is_public(const DoutPrefixProvider *dpp) const
+{
+
+  static constexpr auto public_groups = {ACL_GROUP_ALL_USERS,
+					 ACL_GROUP_AUTHENTICATED_USERS};
+  return std::any_of(public_groups.begin(), public_groups.end(),
+                     [&, dpp](ACLGroupTypeEnum g) {
+                       auto p = acl.get_group_perm(dpp, g, RGW_PERM_FULL_CONTROL);
+                       return (p != RGW_PERM_NONE) && (p != RGW_PERM_INVALID);
+                     }
+  );
+
+}
+
+// Produce one populated and one default-constructed instance for the
+// encode/decode dencoder tests.
+void ACLPermission::generate_test_instances(list<ACLPermission*>& o)
+{
+  ACLPermission *p = new ACLPermission;
+  p->set_permissions(RGW_PERM_WRITE_ACP);
+  o.push_back(p);
+  o.push_back(new ACLPermission);
+}
+
+// Formatter output for the raw permission bitmask.
+void ACLPermission::dump(Formatter *f) const
+{
+  f->dump_int("flags", flags);
+}
+
+// Formatter output for the raw grantee-type enum value.
+void ACLGranteeType::dump(Formatter *f) const
+{
+  f->dump_unsigned("type", type);
+}
+
+// Dump every stored field of the grant; fields irrelevant to the grantee
+// type (e.g. email for a group grant) are emitted empty rather than omitted.
+void ACLGrant::dump(Formatter *f) const
+{
+  f->open_object_section("type");
+  type.dump(f);
+  f->close_section();
+
+  f->dump_string("id", id.to_str());
+  f->dump_string("email", email);
+
+  f->open_object_section("permission");
+  permission.dump(f);
+  f->close_section();
+
+  f->dump_string("name", name);
+  f->dump_int("group", (int)group);
+  f->dump_string("url_spec", url_spec);
+}
+
+// Produce a canonical-user grant, a group grant and a default-constructed
+// grant for the encode/decode dencoder tests.
+void ACLGrant::generate_test_instances(list<ACLGrant*>& o)
+{
+  rgw_user id("rgw");
+  string name, email;
+  name = "Mr. RGW";
+  email = "r@gw";
+
+  ACLGrant *g1 = new ACLGrant;
+  g1->set_canon(id, name, RGW_PERM_READ);
+  g1->email = email;
+  o.push_back(g1);
+
+  ACLGrant *g2 = new ACLGrant;
+  // bug fix: this previously called set_group() on g1, so g2 was pushed
+  // uninitialized and was indistinguishable from the default instance below.
+  g2->set_group(ACL_GROUP_AUTHENTICATED_USERS, RGW_PERM_WRITE);
+  o.push_back(g2);
+
+  o.push_back(new ACLGrant);
+}
+
+// Produce one canonical-user type and one default instance for the
+// encode/decode dencoder tests.
+void ACLGranteeType::generate_test_instances(list<ACLGranteeType*>& o)
+{
+  ACLGranteeType *t = new ACLGranteeType;
+  t->set(ACL_TYPE_CANON_USER);
+  o.push_back(t);
+  o.push_back(new ACLGranteeType);
+}
+
+// Produce one list populated with the ACLGrant test instances plus a
+// default-constructed list for the encode/decode dencoder tests.
+void RGWAccessControlList::generate_test_instances(list<RGWAccessControlList*>& o)
+{
+  RGWAccessControlList *acl = new RGWAccessControlList(NULL);
+
+  list<ACLGrant *> glist;
+  list<ACLGrant *>::iterator iter;
+
+  ACLGrant::generate_test_instances(glist);
+  for (iter = glist.begin(); iter != glist.end(); ++iter) {
+    ACLGrant *grant = *iter;
+    acl->add_grant(grant);
+
+    // add_grant() copies the grant into grant_map, so the heap instance
+    // can be freed immediately.
+    delete grant;
+  }
+  o.push_back(acl);
+  o.push_back(new RGWAccessControlList(NULL));
+}
+
+// Produce one populated and one default-constructed owner for the
+// encode/decode dencoder tests.
+void ACLOwner::generate_test_instances(list<ACLOwner*>& o)
+{
+  ACLOwner *owner = new ACLOwner;
+  owner->id = "rgw";
+  owner->display_name = "Mr. RGW";
+  o.push_back(owner);
+  o.push_back(new ACLOwner);
+}
+
+// Produce one policy per generated RGWAccessControlList test instance,
+// plus a default-constructed policy, for the encode/decode dencoder tests.
+void RGWAccessControlPolicy::generate_test_instances(list<RGWAccessControlPolicy*>& o)
+{
+  list<RGWAccessControlList *> acl_list;
+  // bug fix: the list must be populated BEFORE iterating it. The previous
+  // code only called generate_test_instances() inside a loop over the
+  // still-empty list, so the loop body never executed and only the
+  // default-constructed policy was ever emitted.
+  RGWAccessControlList::generate_test_instances(acl_list);
+
+  for (RGWAccessControlList *l : acl_list) {
+    RGWAccessControlPolicy *p = new RGWAccessControlPolicy(NULL);
+    p->acl = *l;
+
+    string name = "radosgw";
+    rgw_user id("rgw");
+    p->owner.set_name(name);
+    p->owner.set_id(id);
+
+    o.push_back(p);
+
+    delete l;
+  }
+
+  o.push_back(new RGWAccessControlPolicy(NULL));
+}
+
+// Dump the three internal indexes (per-user perms, per-group perms and
+// the full grant multimap) as arrays of {key, value} entry objects.
+void RGWAccessControlList::dump(Formatter *f) const
+{
+  f->open_array_section("acl_user_map");
+  for (const auto& [user, acl] : acl_user_map) {
+    f->open_object_section("entry");
+    f->dump_string("user", user);
+    f->dump_int("acl", acl);
+    f->close_section();
+  }
+  f->close_section();
+
+  f->open_array_section("acl_group_map");
+  for (const auto& [group, acl] : acl_group_map) {
+    f->open_object_section("entry");
+    f->dump_unsigned("group", group);
+    f->dump_int("acl", acl);
+    f->close_section();
+  }
+  f->close_section();
+
+  f->open_array_section("grant_map");
+  for (const auto& [grantee_id, grant] : grant_map) {
+    f->open_object_section("entry");
+    f->dump_string("id", grantee_id);
+    f->open_object_section("grant");
+    grant.dump(f);
+    f->close_section();
+    f->close_section();
+  }
+  f->close_section();
+}
+
+// JSON output of the owner: id is serialized via its string form.
+void ACLOwner::dump(Formatter *f) const
+{
+  encode_json("id", id.to_str(), f);
+  encode_json("display_name", display_name, f);
+}
+
+// Inverse of dump(): the id string is parsed back through rgw_user.
+void ACLOwner::decode_json(JSONObj *obj) {
+  string id_str;
+  JSONDecoder::decode_json("id", id_str, obj);
+  id.from_str(id_str);
+  JSONDecoder::decode_json("display_name", display_name, obj);
+}
+
+// JSON output of the whole policy: the embedded ACL plus the owner.
+void RGWAccessControlPolicy::dump(Formatter *f) const
+{
+  encode_json("acl", acl, f);
+  encode_json("owner", owner, f);
+}
+
+// Map an S3 group URI to its enum value; delegates to the S3 dialect
+// implementation for backward compatibility with old encodings.
+ACLGroupTypeEnum ACLGrant::uri_to_group(string& uri)
+{
+  // this is required for backward compatibility
+  return ACLGrant_S3::uri_to_group(uri);
+}
+
diff --git a/src/rgw/rgw_acl.h b/src/rgw/rgw_acl.h
new file mode 100644
index 000000000..c52050158
--- /dev/null
+++ b/src/rgw/rgw_acl.h
@@ -0,0 +1,414 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <string_view>
+#include <include/types.h>
+
+#include <boost/optional.hpp>
+#include <boost/algorithm/string/predicate.hpp>
+
+#include "common/debug.h"
+
+#include "rgw_basic_types.h" //includes rgw_acl_types.h
+
+// A single ACL grant: one grantee (canonical user, email user, group or
+// referer spec) paired with a permission bitmask. Encodable for storage
+// alongside the bucket/object metadata.
+class ACLGrant
+{
+protected:
+  ACLGranteeType type;
+  rgw_user id;                  // set for canonical-user grantees
+  std::string email;            // set for email grantees
+  mutable rgw_user email_id;    // lazily-parsed cache used by get_id()
+  ACLPermission permission;
+  std::string name;             // display name, informational only
+  ACLGroupTypeEnum group;       // set for group grantees
+  std::string url_spec;         // set for referer grantees
+
+public:
+  ACLGrant() : group(ACL_GROUP_NONE) {}
+  virtual ~ACLGrant() {}
+
+  /* there's an assumption here that email/uri/id encodings are
+     different and there can't be any overlap */
+  // Copy the grantee identity into _id; returns false for group/referer
+  // grantees, which have no user identity.
+  bool get_id(rgw_user& _id) const {
+    switch(type.get_type()) {
+    case ACL_TYPE_EMAIL_USER:
+      _id = email; // implies from_str() that parses the 't:u' syntax
+      return true;
+    case ACL_TYPE_GROUP:
+    case ACL_TYPE_REFERER:
+      return false;
+    default:
+      _id = id;
+      return true;
+    }
+  }
+
+  // Pointer variant of get_id(); returns nullptr for group/referer
+  // grantees. For email grantees the parsed id is cached in email_id.
+  const rgw_user* get_id() const {
+    switch(type.get_type()) {
+    case ACL_TYPE_EMAIL_USER:
+      email_id.from_str(email);
+      return &email_id;
+    case ACL_TYPE_GROUP:
+    case ACL_TYPE_REFERER:
+      return nullptr;
+    default:
+      return &id;
+    }
+  }
+
+  ACLGranteeType& get_type() { return type; }
+  const ACLGranteeType& get_type() const { return type; }
+  ACLPermission& get_permission() { return permission; }
+  const ACLPermission& get_permission() const { return permission; }
+  ACLGroupTypeEnum get_group() const { return group; }
+  const std::string& get_referer() const { return url_spec; }
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(5, 3, bl);
+    encode(type, bl);
+    std::string s;
+    id.to_str(s);
+    encode(s, bl);
+    // The uri slot is kept (empty) for wire compatibility with v1 encodings.
+    std::string uri;
+    encode(uri, bl);
+    encode(email, bl);
+    encode(permission, bl);
+    encode(name, bl);
+    __u32 g = (__u32)group;
+    encode(g, bl);
+    encode(url_spec, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START_LEGACY_COMPAT_LEN(5, 3, 3, bl);
+    decode(type, bl);
+    std::string s;
+    decode(s, bl);
+    id.from_str(s);
+    std::string uri;
+    decode(uri, bl);
+    decode(email, bl);
+    decode(permission, bl);
+    decode(name, bl);
+    if (struct_v > 1) {
+      __u32 g;
+      decode(g, bl);
+      group = (ACLGroupTypeEnum)g;
+    } else {
+      // v1 stored the group as a URI string instead of an enum.
+      group = uri_to_group(uri);
+    }
+    if (struct_v >= 5) {
+      decode(url_spec, bl);
+    } else {
+      url_spec.clear();
+    }
+    DECODE_FINISH(bl);
+  }
+  void dump(Formatter *f) const;
+  static void generate_test_instances(std::list<ACLGrant*>& o);
+
+  ACLGroupTypeEnum uri_to_group(std::string& uri);
+
+  // The set_* helpers below fully (re)initialize the grant for one
+  // grantee kind; fields belonging to other kinds are left untouched.
+  void set_canon(const rgw_user& _id, const std::string& _name, const uint32_t perm) {
+    type.set(ACL_TYPE_CANON_USER);
+    id = _id;
+    name = _name;
+    permission.set_permissions(perm);
+  }
+  void set_group(ACLGroupTypeEnum _group, const uint32_t perm) {
+    type.set(ACL_TYPE_GROUP);
+    group = _group;
+    permission.set_permissions(perm);
+  }
+  void set_referer(const std::string& _url_spec, const uint32_t perm) {
+    type.set(ACL_TYPE_REFERER);
+    url_spec = _url_spec;
+    permission.set_permissions(perm);
+  }
+
+  friend bool operator==(const ACLGrant& lhs, const ACLGrant& rhs);
+  friend bool operator!=(const ACLGrant& lhs, const ACLGrant& rhs);
+};
+WRITE_CLASS_ENCODER(ACLGrant)
+
+// A Swift referer-based grant: a URL spec ("*", exact host, or ".suffix")
+// and the permission bits it confers on requests whose HTTP Referer
+// header matches.
+struct ACLReferer {
+  std::string url_spec;
+  uint32_t perm;
+
+  ACLReferer() : perm(0) {}
+  ACLReferer(const std::string& url_spec,
+             const uint32_t perm)
+    : url_spec(url_spec),
+      perm(perm) {
+  }
+
+  // True when the Referer header's host component matches url_spec.
+  bool is_match(std::string_view http_referer) const {
+    const auto http_host = get_http_host(http_referer);
+    if (!http_host || http_host->length() < url_spec.length()) {
+      return false;
+    }
+
+    if ("*" == url_spec) {
+      return true;
+    }
+
+    if (http_host->compare(url_spec) == 0) {
+      return true;
+    }
+
+    if ('.' == url_spec[0]) {
+      /* Wildcard support: a referer matches when its host name ends with
+       * exactly the spec (e.g. spec ".example.com" matches
+       * "www.example.com"). */
+      return boost::algorithm::ends_with(http_host.value(), url_spec);
+    }
+
+    return false;
+  }
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(url_spec, bl);
+    encode(perm, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START_LEGACY_COMPAT_LEN(1, 1, 1, bl);
+    decode(url_spec, bl);
+    decode(perm, bl);
+    DECODE_FINISH(bl);
+  }
+  void dump(Formatter *f) const;
+
+  friend bool operator==(const ACLReferer& lhs, const ACLReferer& rhs);
+  friend bool operator!=(const ACLReferer& lhs, const ACLReferer& rhs);
+
+private:
+  // Extract the host component (no scheme, userinfo, port or path) from a
+  // URL; returns none for malformed inputs like "://host" or "host://".
+  boost::optional<std::string_view> get_http_host(const std::string_view url) const {
+    size_t pos = url.find("://");
+    if (pos == std::string_view::npos || boost::algorithm::starts_with(url, "://") ||
+        boost::algorithm::ends_with(url, "://") || boost::algorithm::ends_with(url, "@")) {
+      return boost::none;
+    }
+    std::string_view url_sub = url.substr(pos + strlen("://"));
+    pos = url_sub.find('@');
+    if (pos != std::string_view::npos) {
+      url_sub = url_sub.substr(pos + 1);
+    }
+    pos = url_sub.find_first_of("/:");
+    if (pos == std::string_view::npos) {
+      /* no port or path exists */
+      return url_sub;
+    }
+    return url_sub.substr(0, pos);
+  }
+};
+WRITE_CLASS_ENCODER(ACLReferer)
+
+namespace rgw {
+namespace auth {
+ class Identity;
+}
+}
+
+using ACLGrantMap = std::multimap<std::string, ACLGrant>;
+
+// The grant list of an ACL. grant_map is the authoritative store; the
+// user/group maps and referer list are derived indexes maintained by
+// add_grant()/_add_grant() for fast permission lookups.
+class RGWAccessControlList
+{
+protected:
+  CephContext *cct;
+  /* FIXME: in the future we should consider switching to uint32_t also
+   * in data structures. */
+  std::map<std::string, int> acl_user_map;
+  std::map<uint32_t, int> acl_group_map;
+  std::list<ACLReferer> referer_list;
+  ACLGrantMap grant_map;
+  void _add_grant(ACLGrant *grant);
+public:
+  explicit RGWAccessControlList(CephContext *_cct) : cct(_cct) {}
+  RGWAccessControlList() : cct(NULL) {}
+
+  void set_ctx(CephContext *ctx) {
+    cct = ctx;
+  }
+
+  virtual ~RGWAccessControlList() {}
+
+  uint32_t get_perm(const DoutPrefixProvider* dpp,
+                    const rgw::auth::Identity& auth_identity,
+                    uint32_t perm_mask);
+  uint32_t get_group_perm(const DoutPrefixProvider *dpp, ACLGroupTypeEnum group, uint32_t perm_mask) const;
+  uint32_t get_referer_perm(const DoutPrefixProvider *dpp, uint32_t current_perm,
+                            std::string http_referer,
+                            uint32_t perm_mask);
+  void encode(bufferlist& bl) const {
+    ENCODE_START(4, 3, bl);
+    // Legacy flag from v1; always true for newly written encodings.
+    bool maps_initialized = true;
+    encode(maps_initialized, bl);
+    encode(acl_user_map, bl);
+    encode(grant_map, bl);
+    encode(acl_group_map, bl);
+    encode(referer_list, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START_LEGACY_COMPAT_LEN(4, 3, 3, bl);
+    bool maps_initialized;
+    decode(maps_initialized, bl);
+    decode(acl_user_map, bl);
+    decode(grant_map, bl);
+    if (struct_v >= 2) {
+      decode(acl_group_map, bl);
+    } else if (!maps_initialized) {
+      // v1 encodings may lack the derived maps; rebuild them from
+      // grant_map.
+      ACLGrantMap::iterator iter;
+      for (iter = grant_map.begin(); iter != grant_map.end(); ++iter) {
+        ACLGrant& grant = iter->second;
+        _add_grant(&grant);
+      }
+    }
+    if (struct_v >= 4) {
+      decode(referer_list, bl);
+    }
+    DECODE_FINISH(bl);
+  }
+  void dump(Formatter *f) const;
+  static void generate_test_instances(std::list<RGWAccessControlList*>& o);
+
+  void add_grant(ACLGrant *grant);
+  void remove_canon_user_grant(rgw_user& user_id);
+
+  ACLGrantMap& get_grant_map() { return grant_map; }
+  const ACLGrantMap& get_grant_map() const { return grant_map; }
+
+  // Reset the list to a single FULL_CONTROL grant for the given owner.
+  void create_default(const rgw_user& id, std::string name) {
+    acl_user_map.clear();
+    acl_group_map.clear();
+    referer_list.clear();
+
+    ACLGrant grant;
+    grant.set_canon(id, name, RGW_PERM_FULL_CONTROL);
+    add_grant(&grant);
+  }
+
+  friend bool operator==(const RGWAccessControlList& lhs, const RGWAccessControlList& rhs);
+  friend bool operator!=(const RGWAccessControlList& lhs, const RGWAccessControlList& rhs);
+};
+WRITE_CLASS_ENCODER(RGWAccessControlList)
+
+// The owner of a bucket/object ACL: a user id plus a display name. The id
+// is serialized in its string form for both the binary and JSON codecs.
+class ACLOwner
+{
+protected:
+  rgw_user id;
+  std::string display_name;
+public:
+  ACLOwner() {}
+  ACLOwner(const rgw_user& _id) : id(_id) {}
+  ~ACLOwner() {}
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(3, 2, bl);
+    std::string s;
+    id.to_str(s);
+    encode(s, bl);
+    encode(display_name, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
+    std::string s;
+    decode(s, bl);
+    id.from_str(s);
+    decode(display_name, bl);
+    DECODE_FINISH(bl);
+  }
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+  static void generate_test_instances(std::list<ACLOwner*>& o);
+  void set_id(const rgw_user& _id) { id = _id; }
+  void set_name(const std::string& name) { display_name = name; }
+
+  rgw_user& get_id() { return id; }
+  const rgw_user& get_id() const { return id; }
+  std::string& get_display_name() { return display_name; }
+  const std::string& get_display_name() const { return display_name; }
+  friend bool operator==(const ACLOwner& lhs, const ACLOwner& rhs);
+  friend bool operator!=(const ACLOwner& lhs, const ACLOwner& rhs);
+};
+WRITE_CLASS_ENCODER(ACLOwner)
+
+// A full access-control policy: an owner plus an RGWAccessControlList.
+// This is the object stored in bucket/object attrs and consulted by the
+// permission-evaluation paths.
+class RGWAccessControlPolicy
+{
+protected:
+  CephContext *cct;
+  RGWAccessControlList acl;
+  ACLOwner owner;
+
+public:
+  explicit RGWAccessControlPolicy(CephContext *_cct) : cct(_cct), acl(_cct) {}
+  RGWAccessControlPolicy() : cct(NULL), acl(NULL) {}
+  virtual ~RGWAccessControlPolicy() {}
+
+  void set_ctx(CephContext *ctx) {
+    cct = ctx;
+    acl.set_ctx(ctx);
+  }
+
+  uint32_t get_perm(const DoutPrefixProvider* dpp,
+                    const rgw::auth::Identity& auth_identity,
+                    uint32_t perm_mask,
+                    const char * http_referer,
+                    bool ignore_public_acls=false);
+  bool verify_permission(const DoutPrefixProvider* dpp,
+                         const rgw::auth::Identity& auth_identity,
+                         uint32_t user_perm_mask,
+                         uint32_t perm,
+                         const char * http_referer = nullptr,
+                         bool ignore_public_acls=false);
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(2, 2, bl);
+    encode(owner, bl);
+    encode(acl, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+    decode(owner, bl);
+    decode(acl, bl);
+    DECODE_FINISH(bl);
+  }
+  void dump(Formatter *f) const;
+  static void generate_test_instances(std::list<RGWAccessControlPolicy*>& o);
+  // Decode only the leading owner field; the ACL that follows is skipped.
+  void decode_owner(bufferlist::const_iterator& bl) { // sometimes we only need that, should be faster
+    DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+    decode(owner, bl);
+    DECODE_FINISH(bl);
+  }
+
+  void set_owner(ACLOwner& o) { owner = o; }
+  ACLOwner& get_owner() {
+    return owner;
+  }
+
+  // Reset to an owner-only policy: id/name become the owner and receive a
+  // single FULL_CONTROL grant.
+  void create_default(const rgw_user& id, std::string& name) {
+    acl.create_default(id, name);
+    owner.set_id(id);
+    owner.set_name(name);
+  }
+  RGWAccessControlList& get_acl() {
+    return acl;
+  }
+  const RGWAccessControlList& get_acl() const {
+    return acl;
+  }
+
+  // Overridden by dialect subclasses (e.g. S3/Swift); the base class knows
+  // no named groups.
+  virtual bool compare_group_name(std::string& id, ACLGroupTypeEnum group) { return false; }
+  bool is_public(const DoutPrefixProvider *dpp) const;
+
+  friend bool operator==(const RGWAccessControlPolicy& lhs, const RGWAccessControlPolicy& rhs);
+  friend bool operator!=(const RGWAccessControlPolicy& lhs, const RGWAccessControlPolicy& rhs);
+};
+WRITE_CLASS_ENCODER(RGWAccessControlPolicy)
diff --git a/src/rgw/rgw_acl_s3.cc b/src/rgw/rgw_acl_s3.cc
new file mode 100644
index 000000000..9f71e3281
--- /dev/null
+++ b/src/rgw/rgw_acl_s3.cc
@@ -0,0 +1,643 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <string.h>
+
+#include <iostream>
+#include <map>
+
+#include "include/types.h"
+
+#include "rgw_acl_s3.h"
+#include "rgw_user.h"
+#include "rgw_sal.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+
+
+#define RGW_URI_ALL_USERS "http://acs.amazonaws.com/groups/global/AllUsers"
+#define RGW_URI_AUTH_USERS "http://acs.amazonaws.com/groups/global/AuthenticatedUsers"
+
+using namespace std;
+
+static string rgw_uri_all_users = RGW_URI_ALL_USERS;
+static string rgw_uri_auth_users = RGW_URI_AUTH_USERS;
+
+// Emit one <Permission> element per S3 permission bit; FULL_CONTROL is
+// collapsed into a single element when all its bits are present.
+void ACLPermission_S3::to_xml(ostream& out)
+{
+  if ((flags & RGW_PERM_FULL_CONTROL) == RGW_PERM_FULL_CONTROL) {
+    out << "<Permission>FULL_CONTROL</Permission>";
+  } else {
+    if (flags & RGW_PERM_READ)
+      out << "<Permission>READ</Permission>";
+    if (flags & RGW_PERM_WRITE)
+      out << "<Permission>WRITE</Permission>";
+    if (flags & RGW_PERM_READ_ACP)
+      out << "<Permission>READ_ACP</Permission>";
+    if (flags & RGW_PERM_WRITE_ACP)
+      out << "<Permission>WRITE_ACP</Permission>";
+  }
+}
+
+// XML parser callback: map the element's text (case-insensitive) onto the
+// permission bitmask. Returns false for unrecognized permission names.
+bool ACLPermission_S3::
+xml_end(const char *el)
+{
+  const char *s = data.c_str();
+  if (strcasecmp(s, "READ") == 0) {
+    flags |= RGW_PERM_READ;
+    return true;
+  } else if (strcasecmp(s, "WRITE") == 0) {
+    flags |= RGW_PERM_WRITE;
+    return true;
+  } else if (strcasecmp(s, "READ_ACP") == 0) {
+    flags |= RGW_PERM_READ_ACP;
+    return true;
+  } else if (strcasecmp(s, "WRITE_ACP") == 0) {
+    flags |= RGW_PERM_WRITE_ACP;
+    return true;
+  } else if (strcasecmp(s, "FULL_CONTROL") == 0) {
+    flags |= RGW_PERM_FULL_CONTROL;
+    return true;
+  }
+  return false;
+}
+
+
+// String <-> enum conversions for the S3 "xsi:type" grantee attribute.
+class ACLGranteeType_S3 {
+public:
+  static const char *to_string(ACLGranteeType& type) {
+    switch (type.get_type()) {
+    case ACL_TYPE_CANON_USER:
+      return "CanonicalUser";
+    case ACL_TYPE_EMAIL_USER:
+      return "AmazonCustomerByEmail";
+    case ACL_TYPE_GROUP:
+      return "Group";
+    default:
+      return "unknown";
+    }
+  }
+
+  // Unlike xml_end() handlers, the match here is case-SENSITIVE, per the
+  // S3 schema; anything unrecognized (including null) maps to UNKNOWN.
+  static void set(const char *s, ACLGranteeType& type) {
+    if (!s) {
+      type.set(ACL_TYPE_UNKNOWN);
+      return;
+    }
+    if (strcmp(s, "CanonicalUser") == 0)
+      type.set(ACL_TYPE_CANON_USER);
+    else if (strcmp(s, "AmazonCustomerByEmail") == 0)
+      type.set(ACL_TYPE_EMAIL_USER);
+    else if (strcmp(s, "Group") == 0)
+      type.set(ACL_TYPE_GROUP);
+    else
+      type.set(ACL_TYPE_UNKNOWN);
+  }
+};
+
+// Leaf XMLObj holders for the simple text elements of an S3 ACL document
+// (ID, URI, EmailAddress, DisplayName); parsing keeps the raw text in the
+// base class's data member.
+class ACLID_S3 : public XMLObj
+{
+public:
+  ACLID_S3() {}
+  ~ACLID_S3() override {}
+  string& to_str() { return data; }
+};
+
+class ACLURI_S3 : public XMLObj
+{
+public:
+  ACLURI_S3() {}
+  ~ACLURI_S3() override {}
+};
+
+class ACLEmail_S3 : public XMLObj
+{
+public:
+  ACLEmail_S3() {}
+  ~ACLEmail_S3() override {}
+};
+
+class ACLDisplayName_S3 : public XMLObj
+{
+public:
+  ACLDisplayName_S3() {}
+  ~ACLDisplayName_S3() override {}
+};
+
+// Parse an <Owner> element: <ID> is mandatory, <DisplayName> optional.
+bool ACLOwner_S3::xml_end(const char *el) {
+  ACLID_S3 *acl_id = static_cast<ACLID_S3 *>(find_first("ID"));
+  ACLID_S3 *acl_name = static_cast<ACLID_S3 *>(find_first("DisplayName"));
+
+  // ID is mandatory
+  if (!acl_id)
+    return false;
+  id = acl_id->get_data();
+
+  // DisplayName is optional
+  if (acl_name)
+    display_name = acl_name->get_data();
+  else
+    display_name = "";
+
+  return true;
+}
+
+// Serialize the owner as <Owner><ID>...</ID>[<DisplayName>...]</Owner>;
+// nothing is emitted when the id is empty.
+void ACLOwner_S3::to_xml(ostream& out) {
+  string s;
+  id.to_str(s);
+  if (s.empty())
+    return;
+  out << "<Owner>" << "<ID>" << s << "</ID>";
+  if (!display_name.empty())
+    out << "<DisplayName>" << display_name << "</DisplayName>";
+  out << "</Owner>";
+}
+
+// Parse a <Grant> element: requires a typed <Grantee> and a <Permission>,
+// then fills the identity field corresponding to the grantee type.
+// Returns false on any missing/unknown piece.
+bool ACLGrant_S3::xml_end(const char *el) {
+  ACLGrantee_S3 *acl_grantee;
+  ACLID_S3 *acl_id;
+  ACLURI_S3 *acl_uri;
+  ACLEmail_S3 *acl_email;
+  ACLPermission_S3 *acl_permission;
+  ACLDisplayName_S3 *acl_name;
+  string uri;
+
+  acl_grantee = static_cast<ACLGrantee_S3 *>(find_first("Grantee"));
+  if (!acl_grantee)
+    return false;
+  string type_str;
+  if (!acl_grantee->get_attr("xsi:type", type_str))
+    return false;
+  ACLGranteeType_S3::set(type_str.c_str(), type);
+
+  acl_permission = static_cast<ACLPermission_S3 *>(find_first("Permission"));
+  if (!acl_permission)
+    return false;
+
+  permission = *acl_permission;
+
+  // Clear all identity fields first; only the one matching the grantee
+  // type is repopulated below.
+  id.clear();
+  name.clear();
+  email.clear();
+
+  switch (type.get_type()) {
+  case ACL_TYPE_CANON_USER:
+    acl_id = static_cast<ACLID_S3 *>(acl_grantee->find_first("ID"));
+    if (!acl_id)
+      return false;
+    id = acl_id->to_str();
+    acl_name = static_cast<ACLDisplayName_S3 *>(acl_grantee->find_first("DisplayName"));
+    if (acl_name)
+      name = acl_name->get_data();
+    break;
+  case ACL_TYPE_GROUP:
+    acl_uri = static_cast<ACLURI_S3 *>(acl_grantee->find_first("URI"));
+    if (!acl_uri)
+      return false;
+    uri = acl_uri->get_data();
+    group = uri_to_group(uri);
+    break;
+  case ACL_TYPE_EMAIL_USER:
+    acl_email = static_cast<ACLEmail_S3 *>(acl_grantee->find_first("EmailAddress"));
+    if (!acl_email)
+      return false;
+    email = acl_email->get_data();
+    break;
+  default:
+    // unknown user type
+    return false;
+  };
+  return true;
+}
+
+// Serialize the grant as a <Grant> element; grants carrying no
+// S3-representable permission bits (e.g. pure Swift grants) are skipped.
+void ACLGrant_S3::to_xml(CephContext *cct, ostream& out) {
+  ACLPermission_S3& perm = static_cast<ACLPermission_S3 &>(permission);
+
+  /* only show s3 compatible permissions */
+  if (!(perm.get_permissions() & RGW_PERM_ALL_S3))
+    return;
+
+  string uri;
+
+  out << "<Grant>" <<
+         "<Grantee xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:type=\"" << ACLGranteeType_S3::to_string(type) << "\">";
+  switch (type.get_type()) {
+  case ACL_TYPE_CANON_USER:
+    out << "<ID>" << id << "</ID>";
+    if (name.size()) {
+      out << "<DisplayName>" << name << "</DisplayName>";
+    }
+    break;
+  case ACL_TYPE_EMAIL_USER:
+    out << "<EmailAddress>" << email << "</EmailAddress>";
+    break;
+  case ACL_TYPE_GROUP:
+    // A group that has no S3 URI mapping is logged and its <URI> omitted;
+    // the surrounding <Grantee> element is still closed below.
+    if (!group_to_uri(group, uri)) {
+      ldout(cct, 0) << "ERROR: group_to_uri failed with group=" << (int)group << dendl;
+      break;
+    }
+    out << "<URI>" << uri << "</URI>";
+    break;
+  default:
+    break;
+  }
+  out << "</Grantee>";
+  perm.to_xml(out);
+  out << "</Grant>";
+}
+
+// Map an ACL group enum to its canonical AWS group URI; returns false for
+// groups that have no S3 representation.
+bool ACLGrant_S3::group_to_uri(ACLGroupTypeEnum group, string& uri)
+{
+  switch (group) {
+  case ACL_GROUP_ALL_USERS:
+    uri = rgw_uri_all_users;
+    return true;
+  case ACL_GROUP_AUTHENTICATED_USERS:
+    uri = rgw_uri_auth_users;
+    return true;
+  default:
+    return false;
+  }
+}
+
+// Parse <AccessControlList>: register every child <Grant> with the list.
+bool RGWAccessControlList_S3::xml_end(const char *el) {
+  XMLObjIter iter = find("Grant");
+  ACLGrant_S3 *grant = static_cast<ACLGrant_S3 *>(iter.get_next());
+  while (grant) {
+    add_grant(grant);
+    grant = static_cast<ACLGrant_S3 *>(iter.get_next());
+  }
+  return true;
+}
+
+// Serialize every grant in grant_map inside <AccessControlList> tags.
+void RGWAccessControlList_S3::to_xml(ostream& out) {
+  multimap<string, ACLGrant>::iterator iter;
+  out << "<AccessControlList>";
+  for (iter = grant_map.begin(); iter != grant_map.end(); ++iter) {
+    ACLGrant_S3& grant = static_cast<ACLGrant_S3 &>(iter->second);
+    grant.to_xml(cct, out);
+  }
+  out << "</AccessControlList>";
+}
+
+// Table entry pairing an RGW permission bit with the canned-grant HTTP
+// header (e.g. x-amz-grant-read) that conveys it.
+struct s3_acl_header {
+  int rgw_perm;
+  const char *http_header;
+};
+
+// Fetch the raw value of the grant header for this permission from the
+// request environment, or NULL when absent.
+static const char *get_acl_header(const RGWEnv *env,
+                                  const struct s3_acl_header *perm)
+{
+  const char *header = perm->http_header;
+
+  return env->get(header, NULL);
+}
+
+// Parse a single grantee token of the form type="value" (emailAddress=,
+// id= or uri=) from an x-amz-grant-* header, resolve the user where
+// needed, and fill `grant`. Returns 0 or a negative errno (-EINVAL for
+// unknown types or unmapped group URIs).
+static int parse_grantee_str(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, string& grantee_str,
+        const struct s3_acl_header *perm, ACLGrant& grant)
+{
+  string id_type, id_val_quoted;
+  int rgw_perm = perm->rgw_perm;
+  int ret;
+
+  ret = parse_key_value(grantee_str, id_type, id_val_quoted);
+  if (ret < 0)
+    return ret;
+
+  string id_val = rgw_trim_quotes(id_val_quoted);
+
+  if (strcasecmp(id_type.c_str(), "emailAddress") == 0) {
+    // Email grantees are resolved to their canonical user up front, so
+    // the stored grant is always a canonical-user grant.
+    std::unique_ptr<rgw::sal::User> user;
+    ret = driver->get_user_by_email(dpp, id_val, null_yield, &user);
+    if (ret < 0)
+      return ret;
+
+    grant.set_canon(user->get_id(), user->get_display_name(), rgw_perm);
+  } else if (strcasecmp(id_type.c_str(), "id") == 0) {
+    std::unique_ptr<rgw::sal::User> user = driver->get_user(rgw_user(id_val));
+    ret = user->load_user(dpp, null_yield);
+    if (ret < 0)
+      return ret;
+
+    grant.set_canon(user->get_id(), user->get_display_name(), rgw_perm);
+  } else if (strcasecmp(id_type.c_str(), "uri") == 0) {
+    ACLGroupTypeEnum gid = grant.uri_to_group(id_val);
+    if (gid == ACL_GROUP_NONE)
+      return -EINVAL;
+
+    grant.set_group(gid, rgw_perm);
+  } else {
+    return -EINVAL;
+  }
+
+  return 0;
+}
+
+static int parse_acl_header(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver,
+ const RGWEnv *env, const struct s3_acl_header *perm,
+ std::list<ACLGrant>& _grants)
+{
+ std::list<string> grantees;
+ std::string hacl_str;
+
+ const char *hacl = get_acl_header(env, perm);
+ if (hacl == NULL)
+ return 0;
+
+ hacl_str = hacl;
+ get_str_list(hacl_str, ",", grantees);
+
+ for (list<string>::iterator it = grantees.begin(); it != grantees.end(); ++it) {
+ ACLGrant grant;
+ int ret = parse_grantee_str(dpp, driver, *it, perm, grant);
+ if (ret < 0)
+ return ret;
+
+ _grants.push_back(grant);
+ }
+
+ return 0;
+}
+
+int RGWAccessControlList_S3::create_canned(ACLOwner& owner, ACLOwner& bucket_owner, const string& canned_acl)
+{
+ acl_user_map.clear();
+ grant_map.clear();
+
+ ACLGrant owner_grant;
+
+ rgw_user bid = bucket_owner.get_id();
+ string bname = bucket_owner.get_display_name();
+
+ /* owner gets full control */
+ owner_grant.set_canon(owner.get_id(), owner.get_display_name(), RGW_PERM_FULL_CONTROL);
+ add_grant(&owner_grant);
+
+ if (canned_acl.size() == 0 || canned_acl.compare("private") == 0) {
+ return 0;
+ }
+
+ ACLGrant bucket_owner_grant;
+ ACLGrant group_grant;
+ if (canned_acl.compare("public-read") == 0) {
+ group_grant.set_group(ACL_GROUP_ALL_USERS, RGW_PERM_READ);
+ add_grant(&group_grant);
+ } else if (canned_acl.compare("public-read-write") == 0) {
+ group_grant.set_group(ACL_GROUP_ALL_USERS, RGW_PERM_READ);
+ add_grant(&group_grant);
+ group_grant.set_group(ACL_GROUP_ALL_USERS, RGW_PERM_WRITE);
+ add_grant(&group_grant);
+ } else if (canned_acl.compare("authenticated-read") == 0) {
+ group_grant.set_group(ACL_GROUP_AUTHENTICATED_USERS, RGW_PERM_READ);
+ add_grant(&group_grant);
+ } else if (canned_acl.compare("bucket-owner-read") == 0) {
+ bucket_owner_grant.set_canon(bid, bname, RGW_PERM_READ);
+ if (bid.compare(owner.get_id()) != 0)
+ add_grant(&bucket_owner_grant);
+ } else if (canned_acl.compare("bucket-owner-full-control") == 0) {
+ bucket_owner_grant.set_canon(bid, bname, RGW_PERM_FULL_CONTROL);
+ if (bid.compare(owner.get_id()) != 0)
+ add_grant(&bucket_owner_grant);
+ } else {
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int RGWAccessControlList_S3::create_from_grants(std::list<ACLGrant>& grants)
+{
+ if (grants.empty())
+ return -EINVAL;
+
+ acl_user_map.clear();
+ grant_map.clear();
+
+ for (std::list<ACLGrant>::iterator it = grants.begin(); it != grants.end(); ++it) {
+ ACLGrant g = *it;
+ add_grant(&g);
+ }
+
+ return 0;
+}
+
+bool RGWAccessControlPolicy_S3::xml_end(const char *el) {
+ RGWAccessControlList_S3 *s3acl =
+ static_cast<RGWAccessControlList_S3 *>(find_first("AccessControlList"));
+ if (!s3acl)
+ return false;
+
+ acl = *s3acl;
+
+ ACLOwner *owner_p = static_cast<ACLOwner_S3 *>(find_first("Owner"));
+ if (!owner_p)
+ return false;
+ owner = *owner_p;
+ return true;
+}
+
+void RGWAccessControlPolicy_S3::to_xml(ostream& out) {
+ out << "<AccessControlPolicy xmlns=\"" << XMLNS_AWS_S3 << "\">";
+ ACLOwner_S3& _owner = static_cast<ACLOwner_S3 &>(owner);
+ RGWAccessControlList_S3& _acl = static_cast<RGWAccessControlList_S3 &>(acl);
+ _owner.to_xml(out);
+ _acl.to_xml(out);
+ out << "</AccessControlPolicy>";
+}
+
// All S3 grant headers and the permission each grants. The zero
// rgw_perm entry is a sentinel that terminates iteration in
// create_from_headers().
static const s3_acl_header acl_header_perms[] = {
  {RGW_PERM_READ, "HTTP_X_AMZ_GRANT_READ"},
  {RGW_PERM_WRITE, "HTTP_X_AMZ_GRANT_WRITE"},
  {RGW_PERM_READ_ACP,"HTTP_X_AMZ_GRANT_READ_ACP"},
  {RGW_PERM_WRITE_ACP, "HTTP_X_AMZ_GRANT_WRITE_ACP"},
  {RGW_PERM_FULL_CONTROL, "HTTP_X_AMZ_GRANT_FULL_CONTROL"},
  {0, NULL}
};
+
+int RGWAccessControlPolicy_S3::create_from_headers(const DoutPrefixProvider *dpp,
+ rgw::sal::Driver* driver,
+ const RGWEnv *env, ACLOwner& _owner)
+{
+ std::list<ACLGrant> grants;
+ int r = 0;
+
+ for (const struct s3_acl_header *p = acl_header_perms; p->rgw_perm; p++) {
+ r = parse_acl_header(dpp, driver, env, p, grants);
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ RGWAccessControlList_S3& _acl = static_cast<RGWAccessControlList_S3 &>(acl);
+ r = _acl.create_from_grants(grants);
+
+ owner = _owner;
+
+ return r;
+}
+
+/*
+ can only be called on object that was parsed
+ */
/* Validate a freshly parsed policy and rebuild it into `dest` with all
 * grantees resolved against the user store.
 *
 * - rejects a parsed <Owner> that names anyone but `owner` (-EPERM)
 * - resolves e-mail grantees to canonical users (-ERR_UNRESOLVABLE_EMAIL
 *   when the address is unknown)
 * - verifies canonical-user grantees exist (-EINVAL otherwise)
 * - passes group grants through only when the group has a URI form
 * Referrer-type grants are silently dropped (grant_ok stays false).
 */
int RGWAccessControlPolicy_S3::rebuild(const DoutPrefixProvider *dpp,
                                       rgw::sal::Driver* driver, ACLOwner *owner,
                                       RGWAccessControlPolicy& dest, std::string &err_msg)
{
  if (!owner)
    return -EINVAL;

  // if the XML carried an <Owner>, it must match the real owner
  ACLOwner *requested_owner = static_cast<ACLOwner_S3 *>(find_first("Owner"));
  if (requested_owner) {
    rgw_user& requested_id = requested_owner->get_id();
    if (!requested_id.empty() && requested_id.compare(owner->get_id()) != 0)
      return -EPERM;
  }

  // the owner itself must resolve to an existing user
  std::unique_ptr<rgw::sal::User> user = driver->get_user(owner->get_id());
  if (user->load_user(dpp, null_yield) < 0) {
    ldpp_dout(dpp, 10) << "owner info does not exist" << dendl;
    err_msg = "Invalid id";
    return -EINVAL;
  }
  ACLOwner& dest_owner = dest.get_owner();
  dest_owner.set_id(owner->get_id());
  dest_owner.set_name(user->get_display_name());

  ldpp_dout(dpp, 20) << "owner id=" << owner->get_id() << dendl;
  ldpp_dout(dpp, 20) << "dest owner id=" << dest.get_owner().get_id() << dendl;

  RGWAccessControlList& dst_acl = dest.get_acl();

  multimap<string, ACLGrant>& grant_map = acl.get_grant_map();
  multimap<string, ACLGrant>::iterator iter;
  for (iter = grant_map.begin(); iter != grant_map.end(); ++iter) {
    ACLGrant& src_grant = iter->second;
    ACLGranteeType& type = src_grant.get_type();
    ACLGrant new_grant;
    bool grant_ok = false;
    rgw_user uid;
    RGWUserInfo grant_user;
    switch (type.get_type()) {
    case ACL_TYPE_EMAIL_USER:
      // resolves the e-mail to uid/grant_user, then deliberately falls
      // through into ACL_TYPE_CANON_USER to build the canonical grant
      {
        string email;
        rgw_user u;
        if (!src_grant.get_id(u)) {
          ldpp_dout(dpp, 0) << "ERROR: src_grant.get_id() failed" << dendl;
          return -EINVAL;
        }
        email = u.id;
        ldpp_dout(dpp, 10) << "grant user email=" << email << dendl;
        if (driver->get_user_by_email(dpp, email, null_yield, &user) < 0) {
          ldpp_dout(dpp, 10) << "grant user email not found or other error" << dendl;
          err_msg = "The e-mail address you provided does not match any account on record.";
          return -ERR_UNRESOLVABLE_EMAIL;
        }
        grant_user = user->get_info();
        uid = grant_user.user_id;
      }
    case ACL_TYPE_CANON_USER:
      {
        // only read the uid from the grant when we did NOT arrive here
        // via the e-mail fall-through above (which already set it)
        if (type.get_type() == ACL_TYPE_CANON_USER) {
          if (!src_grant.get_id(uid)) {
            ldpp_dout(dpp, 0) << "ERROR: src_grant.get_id() failed" << dendl;
            err_msg = "Invalid id";
            return -EINVAL;
          }
        }

        // look the user up unless the e-mail path already loaded it
        if (grant_user.user_id.empty()) {
          user = driver->get_user(uid);
          if (user->load_user(dpp, null_yield) < 0) {
            ldpp_dout(dpp, 10) << "grant user does not exist:" << uid << dendl;
            err_msg = "Invalid id";
            return -EINVAL;
          } else {
            grant_user = user->get_info();
          }
        }
        ACLPermission& perm = src_grant.get_permission();
        new_grant.set_canon(uid, grant_user.display_name, perm.get_permissions());
        grant_ok = true;
        rgw_user new_id;
        new_grant.get_id(new_id);
        ldpp_dout(dpp, 10) << "new grant: " << new_id << ":" << grant_user.display_name << dendl;
      }
      break;
    case ACL_TYPE_GROUP:
      {
        string uri;
        if (ACLGrant_S3::group_to_uri(src_grant.get_group(), uri)) {
          new_grant = src_grant;
          grant_ok = true;
          ldpp_dout(dpp, 10) << "new grant: " << uri << dendl;
        } else {
          ldpp_dout(dpp, 10) << "bad grant group:" << (int)src_grant.get_group() << dendl;
          err_msg = "Invalid group uri";
          return -EINVAL;
        }
      }
      // no break: falls into default, which only breaks (harmless)
    default:
      break;
    }
    if (grant_ok) {
      dst_acl.add_grant(&new_grant);
    }
  }

  return 0;
}
+
+bool RGWAccessControlPolicy_S3::compare_group_name(string& id, ACLGroupTypeEnum group)
+{
+ switch (group) {
+ case ACL_GROUP_ALL_USERS:
+ return (id.compare(RGW_USER_ANON_ID) == 0);
+ case ACL_GROUP_AUTHENTICATED_USERS:
+ return (id.compare(rgw_uri_auth_users) == 0);
+ default:
+ return id.empty();
+ }
+
+ // shouldn't get here
+ return false;
+}
+
+XMLObj *RGWACLXMLParser_S3::alloc_obj(const char *el)
+{
+ XMLObj * obj = NULL;
+ if (strcmp(el, "AccessControlPolicy") == 0) {
+ obj = new RGWAccessControlPolicy_S3(cct);
+ } else if (strcmp(el, "Owner") == 0) {
+ obj = new ACLOwner_S3();
+ } else if (strcmp(el, "AccessControlList") == 0) {
+ obj = new RGWAccessControlList_S3(cct);
+ } else if (strcmp(el, "ID") == 0) {
+ obj = new ACLID_S3();
+ } else if (strcmp(el, "DisplayName") == 0) {
+ obj = new ACLDisplayName_S3();
+ } else if (strcmp(el, "Grant") == 0) {
+ obj = new ACLGrant_S3();
+ } else if (strcmp(el, "Grantee") == 0) {
+ obj = new ACLGrantee_S3();
+ } else if (strcmp(el, "Permission") == 0) {
+ obj = new ACLPermission_S3();
+ } else if (strcmp(el, "URI") == 0) {
+ obj = new ACLURI_S3();
+ } else if (strcmp(el, "EmailAddress") == 0) {
+ obj = new ACLEmail_S3();
+ }
+
+ return obj;
+}
+
+ACLGroupTypeEnum ACLGrant_S3::uri_to_group(string& uri)
+{
+ if (uri.compare(rgw_uri_all_users) == 0)
+ return ACL_GROUP_ALL_USERS;
+ else if (uri.compare(rgw_uri_auth_users) == 0)
+ return ACL_GROUP_AUTHENTICATED_USERS;
+
+ return ACL_GROUP_NONE;
+}
+
diff --git a/src/rgw/rgw_acl_s3.h b/src/rgw/rgw_acl_s3.h
new file mode 100644
index 000000000..c234d722b
--- /dev/null
+++ b/src/rgw/rgw_acl_s3.h
@@ -0,0 +1,115 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <iosfwd>
+#include <include/types.h>
+
+#include "include/str_list.h"
+#include "rgw_xml.h"
+#include "rgw_acl.h"
+#include "rgw_sal_fwd.h"
+
+class RGWUserCtl;
+
/// S3 XML binding for ACLPermission: parses and serializes the
/// <Permission> element of a grant.
class ACLPermission_S3 : public ACLPermission, public XMLObj
{
public:
  ACLPermission_S3() {}
  virtual ~ACLPermission_S3() override {}

  bool xml_end(const char *el) override;
  void to_xml(std::ostream& out);
};
+
/// S3 XML binding for a <Grantee> element (container for ID / URI /
/// EmailAddress children).
class ACLGrantee_S3 : public ACLGrantee, public XMLObj
{
public:
  ACLGrantee_S3() {}
  virtual ~ACLGrantee_S3() override {}

  bool xml_start(const char *el, const char **attr);
};
+
+
/// S3 XML binding for a single <Grant> element, plus helpers mapping
/// between ACL group enums and their S3 grantee URIs.
class ACLGrant_S3 : public ACLGrant, public XMLObj
{
public:
  ACLGrant_S3() {}
  virtual ~ACLGrant_S3() override {}

  void to_xml(CephContext *cct, std::ostream& out);
  bool xml_end(const char *el) override;
  bool xml_start(const char *el, const char **attr);

  // URI <-> group translation; group_to_uri() returns false for groups
  // with no URI form, uri_to_group() returns ACL_GROUP_NONE when unknown
  static ACLGroupTypeEnum uri_to_group(std::string& uri);
  static bool group_to_uri(ACLGroupTypeEnum group, std::string& uri);
};
+
/// S3 XML binding for <AccessControlList>, plus builders that populate
/// the list from canned-ACL names or pre-parsed grant lists.
class RGWAccessControlList_S3 : public RGWAccessControlList, public XMLObj
{
public:
  explicit RGWAccessControlList_S3(CephContext *_cct) : RGWAccessControlList(_cct) {}
  virtual ~RGWAccessControlList_S3() override {}

  bool xml_end(const char *el) override;
  void to_xml(std::ostream& out);

  // rebuild the list from a canned ACL name; -EINVAL for unknown names
  int create_canned(ACLOwner& owner, ACLOwner& bucket_owner, const std::string& canned_acl);
  // rebuild the list from explicit grants; -EINVAL when empty
  int create_from_grants(std::list<ACLGrant>& grants);
};
+
/// S3 XML binding for the <Owner> element (ID + DisplayName).
class ACLOwner_S3 : public ACLOwner, public XMLObj
{
public:
  ACLOwner_S3() {}
  virtual ~ACLOwner_S3() override {}

  bool xml_end(const char *el) override;
  void to_xml(std::ostream& out);
};
+
+class RGWEnv;
+
/// S3 XML binding for a complete <AccessControlPolicy>, with builders
/// for canned ACLs and x-amz-grant-* request headers.
class RGWAccessControlPolicy_S3 : public RGWAccessControlPolicy, public XMLObj
{
public:
  explicit RGWAccessControlPolicy_S3(CephContext *_cct) : RGWAccessControlPolicy(_cct) {}
  virtual ~RGWAccessControlPolicy_S3() override {}

  bool xml_end(const char *el) override;

  void to_xml(std::ostream& out);
  // validate a parsed policy and resolve all grantees into `dest`
  int rebuild(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, ACLOwner *owner,
              RGWAccessControlPolicy& dest, std::string &err_msg);
  bool compare_group_name(std::string& id, ACLGroupTypeEnum group) override;

  // Build the policy from a canned ACL; an anonymous requester makes
  // the bucket owner the policy owner.
  // NOTE(review): "anonymous" is hard-coded here -- presumably the same
  // value as RGW_USER_ANON_ID; confirm before consolidating.
  virtual int create_canned(ACLOwner& _owner, ACLOwner& bucket_owner, const std::string& canned_acl) {
    RGWAccessControlList_S3& _acl = static_cast<RGWAccessControlList_S3 &>(acl);
    if (_owner.get_id() == rgw_user("anonymous")) {
      owner = bucket_owner;
    } else {
      owner = _owner;
    }
    int ret = _acl.create_canned(owner, bucket_owner, canned_acl);
    return ret;
  }
  int create_from_headers(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver,
                          const RGWEnv *env, ACLOwner& _owner);
};
+
+/**
+ * Interfaces with the webserver's XML handling code
+ * to parse it in a way that makes sense for the rgw.
+ */
class RGWACLXMLParser_S3 : public RGWXMLParser
{
  CephContext *cct; // used to construct the policy/list binding objects

  // factory: maps element names to their S3 binding classes
  XMLObj *alloc_obj(const char *el) override;
public:
  explicit RGWACLXMLParser_S3(CephContext *_cct) : cct(_cct) {}
};
diff --git a/src/rgw/rgw_acl_swift.cc b/src/rgw/rgw_acl_swift.cc
new file mode 100644
index 000000000..f1ca68d63
--- /dev/null
+++ b/src/rgw/rgw_acl_swift.cc
@@ -0,0 +1,438 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <string.h>
+
+#include <vector>
+
+#include <boost/algorithm/string/predicate.hpp>
+
+#include "common/ceph_json.h"
+#include "rgw_common.h"
+#include "rgw_user.h"
+#include "rgw_acl_swift.h"
+#include "rgw_sal.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+
+#define SWIFT_PERM_READ RGW_PERM_READ_OBJS
+#define SWIFT_PERM_WRITE RGW_PERM_WRITE_OBJS
+/* FIXME: do we really need separate RW? */
+#define SWIFT_PERM_RWRT (SWIFT_PERM_READ | SWIFT_PERM_WRITE)
+#define SWIFT_PERM_ADMIN RGW_PERM_FULL_CONTROL
+
+#define SWIFT_GROUP_ALL_USERS ".r:*"
+
+using namespace std;
+
/* Split a comma/space-separated uid list into individual entries.
 *
 * Runs of delimiters are collapsed and empty entries are dropped,
 * matching the strtok_r() semantics of the previous implementation,
 * but without strdup()/free() manual memory management (which also
 * removes the artificial ENOMEM failure path). Always returns 0;
 * parsed entries are appended to `uids` in order.
 */
static int parse_list(const char* uid_list,
                      std::vector<std::string>& uids) /* out */
{
  const std::string input(uid_list);
  size_t pos = 0;

  while (pos < input.size()) {
    // skip any run of delimiters, then take the next token
    const size_t start = input.find_first_not_of(" ,", pos);
    if (start == std::string::npos) {
      break;
    }
    const size_t end = input.find_first_of(" ,", start);
    uids.push_back(input.substr(start, end - start));
    if (end == std::string::npos) {
      break;
    }
    pos = end;
  }
  return 0;
}
+
/* Swift accepts several spellings of the referrer designator;
 * return true when `designator` is any one of them. */
static bool is_referrer(const std::string& designator)
{
  return designator == ".r"
      || designator == ".ref"
      || designator == ".referer"
      || designator == ".referrer";
}
+
+static bool uid_is_public(const string& uid)
+{
+ if (uid[0] != '.' || uid[1] != 'r')
+ return false;
+
+ int pos = uid.find(':');
+ if (pos < 0 || pos == (int)uid.size())
+ return false;
+
+ string sub = uid.substr(0, pos);
+ string after = uid.substr(pos + 1);
+
+ if (after.compare("*") != 0)
+ return false;
+
+ return is_referrer(sub);
+}
+
/* Convert one referrer designatee (the part after ".r:") into an ACL
 * grant, or boost::none when the spec is malformed.
 *
 * A leading '-' marks a negative (deny) entry, which is stored with a
 * zero permission; a leading '*' allows subdomain matching and is
 * stripped before storing. */
static boost::optional<ACLGrant> referrer_to_grant(std::string url_spec,
                                                   const uint32_t perm)
{
  /* This function takes url_spec as non-ref std::string because of the trim
   * operation that is essential to preserve compliance with Swift. It can't
   * be easily accomplished with std::string_view. */
  try {
    bool is_negative;
    ACLGrant grant;

    if ('-' == url_spec[0]) {
      url_spec = url_spec.substr(1);
      boost::algorithm::trim(url_spec);

      is_negative = true;
    } else {
      is_negative = false;
    }

    if (url_spec != RGW_REFERER_WILDCARD) {
      if ('*' == url_spec[0]) {
        url_spec = url_spec.substr(1);
        boost::algorithm::trim(url_spec);
      }

      // a bare "." or nothing left after stripping is not a valid host
      if (url_spec.empty() || url_spec == ".") {
        return boost::none;
      }
    } else {
      /* Please be aware we're specially handling the .r:* in _add_grant()
       * of RGWAccessControlList as the S3 API has a similar concept, and
       * thus we can have a small portion of compatibility. */
    }

    // negative entries carry no permission bits at all
    grant.set_referer(url_spec, is_negative ? 0 : perm);
    return grant;
  } catch (const std::out_of_range&) {
    // defensive: none of the substr calls above should throw for the
    // indices used, but keep parity with Swift's lenient parsing
    return boost::none;
  }
}
+
+static ACLGrant user_to_grant(const DoutPrefixProvider *dpp,
+ CephContext* const cct,
+ rgw::sal::Driver* driver,
+ const std::string& uid,
+ const uint32_t perm)
+{
+ RGWUserInfo grant_user;
+ ACLGrant grant;
+ std::unique_ptr<rgw::sal::User> user;
+
+ user = driver->get_user(rgw_user(uid));
+ if (user->load_user(dpp, null_yield) < 0) {
+ ldpp_dout(dpp, 10) << "grant user does not exist: " << uid << dendl;
+ /* skipping silently */
+ grant.set_canon(user->get_id(), std::string(), perm);
+ } else {
+ grant.set_canon(user->get_id(), user->get_display_name(), perm);
+ }
+
+ return grant;
+}
+
+int RGWAccessControlPolicy_SWIFT::add_grants(const DoutPrefixProvider *dpp,
+ rgw::sal::Driver* driver,
+ const std::vector<std::string>& uids,
+ const uint32_t perm)
+{
+ for (const auto& uid : uids) {
+ boost::optional<ACLGrant> grant;
+ ldpp_dout(dpp, 20) << "trying to add grant for ACL uid=" << uid << dendl;
+
+ /* Let's check whether the item has a separator potentially indicating
+ * a special meaning (like an HTTP referral-based grant). */
+ const size_t pos = uid.find(':');
+ if (std::string::npos == pos) {
+ /* No, it don't have -- we've got just a regular user identifier. */
+ grant = user_to_grant(dpp, cct, driver, uid, perm);
+ } else {
+ /* Yes, *potentially* an HTTP referral. */
+ auto designator = uid.substr(0, pos);
+ auto designatee = uid.substr(pos + 1);
+
+ /* Swift strips whitespaces at both beginning and end. */
+ boost::algorithm::trim(designator);
+ boost::algorithm::trim(designatee);
+
+ if (! boost::algorithm::starts_with(designator, ".")) {
+ grant = user_to_grant(dpp, cct, driver, uid, perm);
+ } else if ((perm & SWIFT_PERM_WRITE) == 0 && is_referrer(designator)) {
+ /* HTTP referrer-based ACLs aren't acceptable for writes. */
+ grant = referrer_to_grant(designatee, perm);
+ }
+ }
+
+ if (grant) {
+ acl.add_grant(&*grant);
+ } else {
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+
+int RGWAccessControlPolicy_SWIFT::create(const DoutPrefixProvider *dpp,
+ rgw::sal::Driver* driver,
+ const rgw_user& id,
+ const std::string& name,
+ const char* read_list,
+ const char* write_list,
+ uint32_t& rw_mask)
+{
+ acl.create_default(id, name);
+ owner.set_id(id);
+ owner.set_name(name);
+ rw_mask = 0;
+
+ if (read_list) {
+ std::vector<std::string> uids;
+ int r = parse_list(read_list, uids);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: parse_list for read returned r="
+ << r << dendl;
+ return r;
+ }
+
+ r = add_grants(dpp, driver, uids, SWIFT_PERM_READ);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: add_grants for read returned r="
+ << r << dendl;
+ return r;
+ }
+ rw_mask |= SWIFT_PERM_READ;
+ }
+ if (write_list) {
+ std::vector<std::string> uids;
+ int r = parse_list(write_list, uids);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: parse_list for write returned r="
+ << r << dendl;
+ return r;
+ }
+
+ r = add_grants(dpp, driver, uids, SWIFT_PERM_WRITE);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: add_grants for write returned r="
+ << r << dendl;
+ return r;
+ }
+ rw_mask |= SWIFT_PERM_WRITE;
+ }
+ return 0;
+}
+
/* Carry over grants from `old` for whichever read/write "half" the
 * current request did NOT set (see create()'s rw_mask output). */
void RGWAccessControlPolicy_SWIFT::filter_merge(uint32_t rw_mask,
                                                RGWAccessControlPolicy_SWIFT *old)
{
  /* rw_mask&SWIFT_PERM_READ => setting read acl,
   * rw_mask&SWIFT_PERM_WRITE => setting write acl
   * when bit is cleared, copy matching elements from old.
   */
  if (rw_mask == (SWIFT_PERM_READ|SWIFT_PERM_WRITE)) {
    // both halves were set by the request; nothing to merge
    return;
  }
  // flip the mask: it now selects the halves to copy FROM `old`
  rw_mask ^= (SWIFT_PERM_READ|SWIFT_PERM_WRITE);
  for (auto &iter: old->acl.get_grant_map()) {
    ACLGrant& grant = iter.second;
    uint32_t perm = grant.get_permission().get_permissions();
    rgw_user id;
    string url_spec;
    if (!grant.get_id(id)) {
      // not a user grant: all-users group or a referrer entry
      if (grant.get_group() != ACL_GROUP_ALL_USERS) {
        url_spec = grant.get_referer();
        if (url_spec.empty()) {
          continue;
        }
        if (perm == 0) {
          /* We need to carry also negative, HTTP referrer-based ACLs. */
          perm = SWIFT_PERM_READ;
        }
      }
    }
    if (perm & rw_mask) {
      acl.add_grant(&grant);
    }
  }
}
+
/* Serialize this policy back into Swift's X-Container-Read /
 * X-Container-Write header values (comma-separated entry lists). */
void RGWAccessControlPolicy_SWIFT::to_str(string& read, string& write)
{
  multimap<string, ACLGrant>& m = acl.get_grant_map();
  multimap<string, ACLGrant>::iterator iter;

  for (iter = m.begin(); iter != m.end(); ++iter) {
    ACLGrant& grant = iter->second;
    const uint32_t perm = grant.get_permission().get_permissions();
    rgw_user id;
    string url_spec;
    if (!grant.get_id(id)) {
      // not a user grant: all-users group or an HTTP-referrer entry
      if (grant.get_group() == ACL_GROUP_ALL_USERS) {
        id = SWIFT_GROUP_ALL_USERS;
      } else {
        url_spec = grant.get_referer();
        if (url_spec.empty()) {
          continue;
        }
        // negative (perm == 0) referrer grants render as ".r:-host"
        id = (perm != 0) ? ".r:" + url_spec : ".r:-" + url_spec;
      }
    }
    if (perm & SWIFT_PERM_READ) {
      if (!read.empty()) {
        read.append(",");
      }
      read.append(id.to_str());
    } else if (perm & SWIFT_PERM_WRITE) {
      if (!write.empty()) {
        write.append(",");
      }
      write.append(id.to_str());
    } else if (perm == 0 && !url_spec.empty()) {
      /* only X-Container-Read headers support referers */
      if (!read.empty()) {
        read.append(",");
      }
      read.append(id.to_str());
    }
  }
}
+
+void RGWAccessControlPolicy_SWIFTAcct::add_grants(const DoutPrefixProvider *dpp,
+ rgw::sal::Driver* driver,
+ const std::vector<std::string>& uids,
+ const uint32_t perm)
+{
+ for (const auto& uid : uids) {
+ ACLGrant grant;
+
+ if (uid_is_public(uid)) {
+ grant.set_group(ACL_GROUP_ALL_USERS, perm);
+ acl.add_grant(&grant);
+ } else {
+ std::unique_ptr<rgw::sal::User> user = driver->get_user(rgw_user(uid));
+
+ if (user->load_user(dpp, null_yield) < 0) {
+ ldpp_dout(dpp, 10) << "grant user does not exist:" << uid << dendl;
+ /* skipping silently */
+ grant.set_canon(user->get_id(), std::string(), perm);
+ acl.add_grant(&grant);
+ } else {
+ grant.set_canon(user->get_id(), user->get_display_name(), perm);
+ acl.add_grant(&grant);
+ }
+ }
+ }
+}
+
+bool RGWAccessControlPolicy_SWIFTAcct::create(const DoutPrefixProvider *dpp,
+ rgw::sal::Driver* driver,
+ const rgw_user& id,
+ const std::string& name,
+ const std::string& acl_str)
+{
+ acl.create_default(id, name);
+ owner.set_id(id);
+ owner.set_name(name);
+
+ JSONParser parser;
+
+ if (!parser.parse(acl_str.c_str(), acl_str.length())) {
+ ldpp_dout(dpp, 0) << "ERROR: JSONParser::parse returned error=" << dendl;
+ return false;
+ }
+
+ JSONObjIter iter = parser.find_first("admin");
+ if (!iter.end() && (*iter)->is_array()) {
+ std::vector<std::string> admin;
+ decode_json_obj(admin, *iter);
+ ldpp_dout(dpp, 0) << "admins: " << admin << dendl;
+
+ add_grants(dpp, driver, admin, SWIFT_PERM_ADMIN);
+ }
+
+ iter = parser.find_first("read-write");
+ if (!iter.end() && (*iter)->is_array()) {
+ std::vector<std::string> readwrite;
+ decode_json_obj(readwrite, *iter);
+ ldpp_dout(dpp, 0) << "read-write: " << readwrite << dendl;
+
+ add_grants(dpp, driver, readwrite, SWIFT_PERM_RWRT);
+ }
+
+ iter = parser.find_first("read-only");
+ if (!iter.end() && (*iter)->is_array()) {
+ std::vector<std::string> readonly;
+ decode_json_obj(readonly, *iter);
+ ldpp_dout(dpp, 0) << "read-only: " << readonly << dendl;
+
+ add_grants(dpp, driver, readonly, SWIFT_PERM_READ);
+ }
+
+ return true;
+}
+
/* Serialize the account ACL back to Swift's JSON form, or boost::none
 * when there is nothing to serialize (Swift expects no header rather
 * than an empty JSON object). */
boost::optional<std::string> RGWAccessControlPolicy_SWIFTAcct::to_str() const
{
  std::vector<std::string> admin;
  std::vector<std::string> readwrite;
  std::vector<std::string> readonly;

  /* Partition the grant map into three non-overlapping groups. */
  for (const auto& item : get_acl().get_grant_map()) {
    const ACLGrant& grant = item.second;
    const uint32_t perm = grant.get_permission().get_permissions();

    rgw_user id;
    if (!grant.get_id(id)) {
      // only the all-users group has a JSON representation (".r:*")
      if (grant.get_group() != ACL_GROUP_ALL_USERS) {
        continue;
      }
      id = SWIFT_GROUP_ALL_USERS;
    } else if (owner.get_id() == id) {
      // the owner's implicit grant is never listed
      continue;
    }

    // strongest matching category wins: admin > read-write > read-only
    if (SWIFT_PERM_ADMIN == (perm & SWIFT_PERM_ADMIN)) {
      admin.insert(admin.end(), id.to_str());
    } else if (SWIFT_PERM_RWRT == (perm & SWIFT_PERM_RWRT)) {
      readwrite.insert(readwrite.end(), id.to_str());
    } else if (SWIFT_PERM_READ == (perm & SWIFT_PERM_READ)) {
      readonly.insert(readonly.end(), id.to_str());
    } else {
      // FIXME: print a warning
    }
  }

  /* If there is no grant to serialize, let's exit earlier to not return
   * an empty JSON object which breaks the functional tests of Swift. */
  if (admin.empty() && readwrite.empty() && readonly.empty()) {
    return boost::none;
  }

  /* Serialize the groups. */
  JSONFormatter formatter;

  formatter.open_object_section("acl");
  if (!readonly.empty()) {
    encode_json("read-only", readonly, &formatter);
  }
  if (!readwrite.empty()) {
    encode_json("read-write", readwrite, &formatter);
  }
  if (!admin.empty()) {
    encode_json("admin", admin, &formatter);
  }
  formatter.close_section();

  std::ostringstream oss;
  formatter.flush(oss);

  return oss.str();
}
diff --git a/src/rgw/rgw_acl_swift.h b/src/rgw/rgw_acl_swift.h
new file mode 100644
index 000000000..4cb1e4b8f
--- /dev/null
+++ b/src/rgw/rgw_acl_swift.h
@@ -0,0 +1,58 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <map>
+#include <vector>
+#include <string>
+#include <include/types.h>
+
+#include <boost/optional.hpp>
+
+#include "rgw_acl.h"
+
+class RGWUserCtl;
+
/// Swift container-level ACL policy: translates between
/// X-Container-Read / X-Container-Write header values and grants.
class RGWAccessControlPolicy_SWIFT : public RGWAccessControlPolicy
{
  // turn parsed ACL entries (user ids or ".r:" referrer specs) into grants
  int add_grants(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver,
                 const std::vector<std::string>& uids,
                 uint32_t perm);

public:
  explicit RGWAccessControlPolicy_SWIFT(CephContext* const cct)
    : RGWAccessControlPolicy(cct) {
  }
  ~RGWAccessControlPolicy_SWIFT() override = default;

  // build from header values; rw_mask reports which halves were set
  int create(const DoutPrefixProvider *dpp,
             rgw::sal::Driver* driver,
             const rgw_user& id,
             const std::string& name,
             const char* read_list,
             const char* write_list,
             uint32_t& rw_mask);
  // copy from `policy` the read/write half that `mask` says was not set
  void filter_merge(uint32_t mask, RGWAccessControlPolicy_SWIFT *policy);
  // serialize back into header-value strings
  void to_str(std::string& read, std::string& write);
};
+
/// Swift account-level ACL policy: translates between the JSON
/// X-Account-Access-Control document and grants.
class RGWAccessControlPolicy_SWIFTAcct : public RGWAccessControlPolicy
{
public:
  explicit RGWAccessControlPolicy_SWIFTAcct(CephContext * const cct)
    : RGWAccessControlPolicy(cct) {
  }
  ~RGWAccessControlPolicy_SWIFTAcct() override {}

  // add one grant per id; ".r:*" becomes an all-users group grant
  void add_grants(const DoutPrefixProvider *dpp,
                  rgw::sal::Driver* driver,
                  const std::vector<std::string>& uids,
                  uint32_t perm);
  // parse the JSON ACL document; false only on a JSON parse error
  bool create(const DoutPrefixProvider *dpp,
              rgw::sal::Driver* driver,
              const rgw_user& id,
              const std::string& name,
              const std::string& acl_str);
  // serialize back to JSON; none when there is nothing to emit
  boost::optional<std::string> to_str() const;
};
diff --git a/src/rgw/rgw_acl_types.h b/src/rgw/rgw_acl_types.h
new file mode 100644
index 000000000..af256b1b5
--- /dev/null
+++ b/src/rgw/rgw_acl_types.h
@@ -0,0 +1,213 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+/* N.B., this header defines fundamental serialized types. Do not
+ * introduce changes or include files which can only be compiled in
+ * radosgw or OSD contexts (e.g., rgw_sal.h, rgw_common.h)
+ */
+
+#pragma once
+
+#include <string>
+#include <list>
+#include <fmt/format.h>
+
+#include "include/types.h"
+#include "common/Formatter.h"
+
+#define RGW_PERM_NONE 0x00
+#define RGW_PERM_READ 0x01
+#define RGW_PERM_WRITE 0x02
+#define RGW_PERM_READ_ACP 0x04
+#define RGW_PERM_WRITE_ACP 0x08
+#define RGW_PERM_READ_OBJS 0x10
+#define RGW_PERM_WRITE_OBJS 0x20
+#define RGW_PERM_FULL_CONTROL ( RGW_PERM_READ | RGW_PERM_WRITE | \
+ RGW_PERM_READ_ACP | RGW_PERM_WRITE_ACP )
+#define RGW_PERM_ALL_S3 RGW_PERM_FULL_CONTROL
+#define RGW_PERM_INVALID 0xFF00
+
+static constexpr char RGW_REFERER_WILDCARD[] = "*";
+
/* An S3-style credential pair (access key id + secret key), optionally
 * associated with a Swift subuser. Part of the encoded user record --
 * the encode/decode format must stay stable. */
struct RGWAccessKey {
  std::string id; // AccessKey
  std::string key; // SecretKey
  std::string subuser;

  RGWAccessKey() {}
  RGWAccessKey(std::string _id, std::string _key)
    : id(std::move(_id)), key(std::move(_key)) {}

  // encoding version 2 (legacy-compatible); do not change field order
  void encode(bufferlist& bl) const {
    ENCODE_START(2, 2, bl);
    encode(id, bl);
    encode(key, bl);
    encode(subuser, bl);
    ENCODE_FINISH(bl);
  }

  void decode(bufferlist::const_iterator& bl) {
    DECODE_START_LEGACY_COMPAT_LEN_32(2, 2, 2, bl);
    decode(id, bl);
    decode(key, bl);
    decode(subuser, bl);
    DECODE_FINISH(bl);
  }
  void dump(Formatter *f) const;
  void dump_plain(Formatter *f) const;
  void dump(Formatter *f, const std::string& user, bool swift) const;
  static void generate_test_instances(std::list<RGWAccessKey*>& o);

  void decode_json(JSONObj *obj);
  void decode_json(JSONObj *obj, bool swift);
};
+
/* A Swift subuser: a named sub-identity of an RGW user carrying its
 * own permission mask. Encoded within the user record. */
struct RGWSubUser {
  std::string name;
  uint32_t perm_mask; // RGW_PERM_* bits this subuser is allowed

  RGWSubUser() : perm_mask(0) {}
  // encoding version 2 (legacy-compatible); do not change field order
  void encode(bufferlist& bl) const {
    ENCODE_START(2, 2, bl);
    encode(name, bl);
    encode(perm_mask, bl);
    ENCODE_FINISH(bl);
  }

  void decode(bufferlist::const_iterator& bl) {
    DECODE_START_LEGACY_COMPAT_LEN_32(2, 2, 2, bl);
    decode(name, bl);
    decode(perm_mask, bl);
    DECODE_FINISH(bl);
  }
  void dump(Formatter *f) const;
  void dump(Formatter *f, const std::string& user) const;
  static void generate_test_instances(std::list<RGWSubUser*>& o);

  void decode_json(JSONObj *obj);
};
+
/* Administrative capabilities of a user: a map from capability type
 * (e.g. "users", "buckets") to a permission bitmask. */
class RGWUserCaps
{
  std::map<std::string, uint32_t> caps; // cap type -> permission bits

  int get_cap(const std::string& cap, std::string& type, uint32_t *perm);
  int add_cap(const std::string& cap);
  int remove_cap(const std::string& cap);
public:
  static int parse_cap_perm(const std::string& str, uint32_t *perm);
  // parse semicolon-separated "type=perm" capability strings
  int add_from_string(const std::string& str);
  int remove_from_string(const std::string& str);

  void encode(bufferlist& bl) const {
    ENCODE_START(1, 1, bl);
    encode(caps, bl);
    ENCODE_FINISH(bl);
  }
  void decode(bufferlist::const_iterator& bl) {
    DECODE_START(1, bl);
    decode(caps, bl);
    DECODE_FINISH(bl);
  }
  // 0 when the user holds `perm` for `cap`, negative errno otherwise
  int check_cap(const std::string& cap, uint32_t perm) const;
  bool is_valid_cap_type(const std::string& tp);
  void dump(Formatter *f) const;
  void dump(Formatter *f, const char *name) const;

  void decode_json(JSONObj *obj);
};
+
// How a grantee is identified in an ACL. The numeric values are part
// of the serialized format and must never be renumbered.
enum ACLGranteeTypeEnum {
/* numbers are encoded, should not change */
  ACL_TYPE_CANON_USER = 0, // canonical rgw user id
  ACL_TYPE_EMAIL_USER = 1, // grantee given by e-mail address
  ACL_TYPE_GROUP = 2,      // predefined group (see ACLGroupTypeEnum)
  ACL_TYPE_UNKNOWN = 3,
  ACL_TYPE_REFERER = 4,    // Swift HTTP-referrer grant
};
+
// Predefined grantee groups. Values are serialized; do not renumber.
enum ACLGroupTypeEnum {
/* numbers are encoded should not change */
  ACL_GROUP_NONE = 0,
  ACL_GROUP_ALL_USERS = 1,           // everyone, including anonymous
  ACL_GROUP_AUTHENTICATED_USERS = 2, // any authenticated user
};
+
/* A set of RGW_PERM_* bits granted by one ACL entry.
 * NOTE(review): `flags` is a signed int while the accessors use
 * uint32_t -- presumably historical; confirm before changing, since
 * the field is serialized. */
class ACLPermission
{
protected:
  int flags; // bitwise OR of RGW_PERM_* values
public:
  ACLPermission() : flags(0) {}
  ~ACLPermission() {}
  uint32_t get_permissions() const { return flags; }
  void set_permissions(uint32_t perm) { flags = perm; }

  // encoding version 2 (legacy-compatible)
  void encode(bufferlist& bl) const {
    ENCODE_START(2, 2, bl);
    encode(flags, bl);
    ENCODE_FINISH(bl);
  }
  void decode(bufferlist::const_iterator& bl) {
    DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
    decode(flags, bl);
    DECODE_FINISH(bl);
  }
  void dump(Formatter *f) const;
  static void generate_test_instances(std::list<ACLPermission*>& o);

  friend bool operator==(const ACLPermission& lhs, const ACLPermission& rhs);
  friend bool operator!=(const ACLPermission& lhs, const ACLPermission& rhs);
};
+
/* Wrapper around ACLGranteeTypeEnum; stored as __u32 because the value
 * is serialized. */
class ACLGranteeType
{
protected:
  __u32 type; // one of ACLGranteeTypeEnum
public:
  ACLGranteeType() : type(ACL_TYPE_UNKNOWN) {}
  virtual ~ACLGranteeType() {}
// virtual const char *to_string() = 0;
  ACLGranteeTypeEnum get_type() const { return (ACLGranteeTypeEnum)type; }
  void set(ACLGranteeTypeEnum t) { type = t; }
// virtual void set(const char *s) = 0;
  // encoding version 2 (legacy-compatible)
  void encode(bufferlist& bl) const {
    ENCODE_START(2, 2, bl);
    encode(type, bl);
    ENCODE_FINISH(bl);
  }
  void decode(bufferlist::const_iterator& bl) {
    DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
    decode(type, bl);
    DECODE_FINISH(bl);
  }
  void dump(Formatter *f) const;
  static void generate_test_instances(std::list<ACLGranteeType*>& o);

  friend bool operator==(const ACLGranteeType& lhs, const ACLGranteeType& rhs);
  friend bool operator!=(const ACLGranteeType& lhs, const ACLGranteeType& rhs);
};
+
/* Empty base class for grantee kinds; concrete grantee data lives in
 * the derived XML binding classes (see rgw_acl_s3.h). */
class ACLGrantee
{
public:
  ACLGrantee() {}
  ~ACLGrantee() {}
};
diff --git a/src/rgw/rgw_admin.cc b/src/rgw/rgw_admin.cc
new file mode 100644
index 000000000..73b0736b1
--- /dev/null
+++ b/src/rgw/rgw_admin.cc
@@ -0,0 +1,10799 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <errno.h>
+#include <iostream>
+#include <sstream>
+#include <string>
+
+#include <boost/asio.hpp>
+#include <boost/optional.hpp>
+
+extern "C" {
+#include <liboath/oath.h>
+}
+
+#include <fmt/format.h>
+
+#include "auth/Crypto.h"
+#include "compressor/Compressor.h"
+
+#include "common/armor.h"
+#include "common/ceph_json.h"
+#include "common/config.h"
+#include "common/ceph_argparse.h"
+#include "common/Formatter.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+#include "common/fault_injector.h"
+
+#include "include/util.h"
+
+#include "cls/rgw/cls_rgw_types.h"
+#include "cls/rgw/cls_rgw_client.h"
+
+#include "include/utime.h"
+#include "include/str_list.h"
+
+#include "rgw_user.h"
+#include "rgw_otp.h"
+#include "rgw_rados.h"
+#include "rgw_acl.h"
+#include "rgw_acl_s3.h"
+#include "rgw_datalog.h"
+#include "rgw_lc.h"
+#include "rgw_log.h"
+#include "rgw_formats.h"
+#include "rgw_usage.h"
+#include "rgw_orphan.h"
+#include "rgw_sync.h"
+#include "rgw_trim_bilog.h"
+#include "rgw_trim_datalog.h"
+#include "rgw_trim_mdlog.h"
+#include "rgw_data_sync.h"
+#include "rgw_rest_conn.h"
+#include "rgw_realm_watcher.h"
+#include "rgw_role.h"
+#include "rgw_reshard.h"
+#include "rgw_http_client_curl.h"
+#include "rgw_zone.h"
+#include "rgw_pubsub.h"
+#include "rgw_bucket_sync.h"
+#include "rgw_sync_checkpoint.h"
+#include "rgw_lua.h"
+#include "rgw_sal.h"
+#include "rgw_sal_config.h"
+
+#include "services/svc_sync_modules.h"
+#include "services/svc_cls.h"
+#include "services/svc_bilog_rados.h"
+#include "services/svc_mdlog.h"
+#include "services/svc_meta_be_otp.h"
+#include "services/svc_user.h"
+#include "services/svc_zone.h"
+
+#include "driver/rados/rgw_bucket.h"
+#include "driver/rados/rgw_sal_rados.h"
+
+#define dout_context g_ceph_context
+
+#define SECRET_KEY_LEN 40
+#define PUBLIC_ID_LEN 20
+
+using namespace std;
+
+// Global handle to the active SAL storage driver; assigned during startup
+// and used by all command handlers in this tool.
+static rgw::sal::Driver* driver = nullptr;
+static constexpr auto dout_subsys = ceph_subsys_rgw;
+
+
+/* Return a process-wide DoutPrefixProvider for log output from this tool:
+ * it exposes the global CephContext and the rgw subsystem, and adds no
+ * per-line prefix.  The provider is a function-local static, so a single
+ * instance is lazily created and shared by all callers. */
+static const DoutPrefixProvider* dpp() {
+ struct GlobalPrefix : public DoutPrefixProvider {
+ CephContext *get_cct() const override { return dout_context; }
+ unsigned get_subsys() const override { return dout_subsys; }
+ std::ostream& gen_prefix(std::ostream& out) const override { return out; }
+ };
+ static GlobalPrefix global_dpp;
+ return &global_dpp;
+}
+
+/* Evaluate condition `x`; on failure, print `msg` to stderr and return `err`
+ * from the enclosing function.  `x` must be parenthesized before negation so
+ * that compound conditions (e.g. `a && b`) are negated as a whole rather
+ * than expanding to `!a && b`; `err` is parenthesized for the same reason. */
+#define CHECK_TRUE(x, msg, err) \
+  do { \
+    if (!(x)) { \
+      cerr << msg << std::endl; \
+      return (err); \
+    } \
+  } while (0)
+
+/* Evaluate `x` (an expression yielding an int return code); if it is
+ * negative, print `msg` plus the decoded errno string to stderr and return
+ * the code from the enclosing function.  `x` is evaluated exactly once. */
+#define CHECK_SUCCESS(x, msg) \
+ do { \
+ int _x_val = (x); \
+ if (_x_val < 0) { \
+ cerr << msg << ": " << cpp_strerror(-_x_val) << std::endl; \
+ return _x_val; \
+ } \
+ } while (0)
+
+static inline int posix_errortrans(int r)
+{
+ switch(r) {
+ case ERR_NO_SUCH_BUCKET:
+ r = ENOENT;
+ break;
+ default:
+ break;
+ }
+ return r;
+}
+
+
+// Comma-separated list of valid lua script contexts, embedded in the help text.
+static const std::string LUA_CONTEXT_LIST("prerequest, postrequest, background, getdata, putdata");
+
+/* Print the full radosgw-admin command and option reference to stdout,
+ * followed by the generic ceph client usage (common --conf/--id options).
+ * Pure output: no state is read or modified. */
+void usage()
+{
+ cout << "usage: radosgw-admin <cmd> [options...]" << std::endl;
+ cout << "commands:\n";
+ cout << " user create create a new user\n" ;
+ cout << " user modify modify user\n";
+ cout << " user info get user info\n";
+ cout << " user rename rename user\n";
+ cout << " user rm remove user\n";
+ cout << " user suspend suspend a user\n";
+ cout << " user enable re-enable user after suspension\n";
+ cout << " user check check user info\n";
+ cout << " user stats show user stats as accounted by quota subsystem\n";
+ cout << " user list list users\n";
+ cout << " caps add add user capabilities\n";
+ cout << " caps rm remove user capabilities\n";
+ cout << " subuser create create a new subuser\n" ;
+ cout << " subuser modify modify subuser\n";
+ cout << " subuser rm remove subuser\n";
+ cout << " key create create access key\n";
+ cout << " key rm remove access key\n";
+ cout << " bucket list list buckets (specify --allow-unordered for\n";
+ cout << " faster, unsorted listing)\n";
+ cout << " bucket limit check show bucket sharding stats\n";
+ cout << " bucket link link bucket to specified user\n";
+ cout << " bucket unlink unlink bucket from specified user\n";
+ cout << " bucket stats returns bucket statistics\n";
+ cout << " bucket rm remove bucket\n";
+ cout << " bucket check check bucket index by verifying size and object count stats\n";
+ cout << " bucket check olh check for olh index entries and objects that are pending removal\n";
+ cout << " bucket check unlinked check for object versions that are not visible in a bucket listing \n";
+ cout << " bucket chown link bucket to specified user and update its object ACLs\n";
+ cout << " bucket reshard reshard bucket\n";
+ cout << " bucket rewrite rewrite all objects in the specified bucket\n";
+ cout << " bucket sync checkpoint poll a bucket's sync status until it catches up to its remote\n";
+ cout << " bucket sync disable disable bucket sync\n";
+ cout << " bucket sync enable enable bucket sync\n";
+ cout << " bucket radoslist list rados objects backing bucket's objects\n";
+ cout << " bi get retrieve bucket index object entries\n";
+ cout << " bi put store bucket index object entries\n";
+ cout << " bi list list raw bucket index entries\n";
+ cout << " bi purge purge bucket index entries\n";
+ cout << " object rm remove object\n";
+ cout << " object put put object\n";
+ cout << " object stat stat an object for its metadata\n";
+ cout << " object unlink unlink object from bucket index\n";
+ cout << " object rewrite rewrite the specified object\n";
+ cout << " object reindex reindex the object(s) indicated by --bucket and either --object or --objects-file\n";
+ cout << " objects expire run expired objects cleanup\n";
+ cout << " objects expire-stale list list stale expired objects (caused by reshard)\n";
+ cout << " objects expire-stale rm remove stale expired objects\n";
+ cout << " period rm remove a period\n";
+ cout << " period get get period info\n";
+ cout << " period get-current get current period info\n";
+ cout << " period pull pull a period\n";
+ cout << " period push push a period\n";
+ cout << " period list list all periods\n";
+ cout << " period update update the staging period\n";
+ cout << " period commit commit the staging period\n";
+ cout << " quota set set quota params\n";
+ cout << " quota enable enable quota\n";
+ cout << " quota disable disable quota\n";
+ cout << " ratelimit get get ratelimit params\n";
+ cout << " ratelimit set set ratelimit params\n";
+ cout << " ratelimit enable enable ratelimit\n";
+ cout << " ratelimit disable disable ratelimit\n";
+ cout << " global quota get view global quota params\n";
+ cout << " global quota set set global quota params\n";
+ cout << " global quota enable enable a global quota\n";
+ cout << " global quota disable disable a global quota\n";
+ cout << " global ratelimit get view global ratelimit params\n";
+ cout << " global ratelimit set set global ratelimit params\n";
+ cout << " global ratelimit enable enable a ratelimit quota\n";
+ cout << " global ratelimit disable disable a ratelimit quota\n";
+ cout << " realm create create a new realm\n";
+ cout << " realm rm remove a realm\n";
+ cout << " realm get show realm info\n";
+ cout << " realm get-default get default realm name\n";
+ cout << " realm list list realms\n";
+ cout << " realm list-periods list all realm periods\n";
+ cout << " realm rename rename a realm\n";
+ cout << " realm set set realm info (requires infile)\n";
+ cout << " realm default set realm as default\n";
+ cout << " realm pull pull a realm and its current period\n";
+ cout << " zonegroup add add a zone to a zonegroup\n";
+ cout << " zonegroup create create a new zone group info\n";
+ cout << " zonegroup default set default zone group\n";
+ cout << " zonegroup delete delete a zone group info\n";
+ cout << " zonegroup get show zone group info\n";
+ cout << " zonegroup modify modify an existing zonegroup\n";
+ cout << " zonegroup set set zone group info (requires infile)\n";
+ cout << " zonegroup rm remove a zone from a zonegroup\n";
+ cout << " zonegroup rename rename a zone group\n";
+ cout << " zonegroup list list all zone groups set on this cluster\n";
+ cout << " zonegroup placement list list zonegroup's placement targets\n";
+ cout << " zonegroup placement get get a placement target of a specific zonegroup\n";
+ cout << " zonegroup placement add add a placement target id to a zonegroup\n";
+ cout << " zonegroup placement modify modify a placement target of a specific zonegroup\n";
+ cout << " zonegroup placement rm remove a placement target from a zonegroup\n";
+ cout << " zonegroup placement default set a zonegroup's default placement target\n";
+ cout << " zone create create a new zone\n";
+ cout << " zone rm remove a zone\n";
+ cout << " zone get show zone cluster params\n";
+ cout << " zone modify modify an existing zone\n";
+ cout << " zone set set zone cluster params (requires infile)\n";
+ cout << " zone list list all zones set on this cluster\n";
+ cout << " zone rename rename a zone\n";
+ cout << " zone placement list list zone's placement targets\n";
+ cout << " zone placement get get a zone placement target\n";
+ cout << " zone placement add add a zone placement target\n";
+ cout << " zone placement modify modify a zone placement target\n";
+ cout << " zone placement rm remove a zone placement target\n";
+ cout << " metadata sync status get metadata sync status\n";
+ cout << " metadata sync init init metadata sync\n";
+ cout << " metadata sync run run metadata sync\n";
+ cout << " data sync status get data sync status of the specified source zone\n";
+ cout << " data sync init init data sync for the specified source zone\n";
+ cout << " data sync run run data sync for the specified source zone\n";
+ cout << " pool add add an existing pool for data placement\n";
+ cout << " pool rm remove an existing pool from data placement set\n";
+ cout << " pools list list placement active set\n";
+ cout << " policy read bucket/object policy\n";
+ cout << " log list list log objects\n";
+ cout << " log show dump a log from specific object or (bucket + date\n";
+ cout << " + bucket-id)\n";
+ cout << " (NOTE: required to specify formatting of date\n";
+ cout << " to \"YYYY-MM-DD-hh\")\n";
+ cout << " log rm remove log object\n";
+ cout << " usage show show usage (by user, by bucket, date range)\n";
+ cout << " usage trim trim usage (by user, by bucket, date range)\n";
+ cout << " usage clear reset all the usage stats for the cluster\n";
+ cout << " gc list dump expired garbage collection objects (specify\n";
+ cout << " --include-all to list all entries, including unexpired)\n";
+ cout << " gc process manually process garbage (specify\n";
+ cout << " --include-all to process all entries, including unexpired)\n";
+ cout << " lc list list all bucket lifecycle progress\n";
+ cout << " lc get get a lifecycle bucket configuration\n";
+ cout << " lc process manually process lifecycle\n";
+ cout << " lc reshard fix fix LC for a resharded bucket\n";
+ cout << " metadata get get metadata info\n";
+ cout << " metadata put put metadata info\n";
+ cout << " metadata rm remove metadata info\n";
+ cout << " metadata list list metadata info\n";
+ cout << " mdlog list list metadata log\n";
+ cout << " mdlog autotrim auto trim metadata log\n";
+ cout << " mdlog trim trim metadata log (use marker)\n";
+ cout << " mdlog status read metadata log status\n";
+ cout << " bilog list list bucket index log\n";
+ cout << " bilog trim trim bucket index log (use start-marker, end-marker)\n";
+ cout << " bilog status read bucket index log status\n";
+ cout << " bilog autotrim auto trim bucket index log\n";
+ cout << " datalog list list data log\n";
+ cout << " datalog trim trim data log\n";
+ cout << " datalog status read data log status\n";
+ cout << " datalog type change datalog type to --log_type={fifo,omap}\n";
+ cout << " orphans find deprecated -- init and run search for leaked rados objects (use job-id, pool)\n";
+ cout << " orphans finish deprecated -- clean up search for leaked rados objects\n";
+ cout << " orphans list-jobs deprecated -- list the current job-ids for orphans search\n";
+ cout << " * the three 'orphans' sub-commands are now deprecated; consider using the `rgw-orphan-list` tool\n";
+ cout << " role create create a AWS role for use with STS\n";
+ cout << " role delete remove a role\n";
+ cout << " role get get a role\n";
+ cout << " role list list roles with specified path prefix\n";
+ cout << " role-trust-policy modify modify the assume role policy of an existing role\n";
+ cout << " role-policy put add/update permission policy to role\n";
+ cout << " role-policy list list policies attached to a role\n";
+ cout << " role-policy get get the specified inline policy document embedded with the given role\n";
+ cout << " role-policy delete remove policy attached to a role\n";
+ cout << " role update update max_session_duration of a role\n";
+ cout << " reshard add schedule a resharding of a bucket\n";
+ cout << " reshard list list all bucket resharding or scheduled to be resharded\n";
+ cout << " reshard status read bucket resharding status\n";
+ cout << " reshard process process of scheduled reshard jobs\n";
+ cout << " reshard cancel cancel resharding a bucket\n";
+ cout << " reshard stale-instances list list stale-instances from bucket resharding\n";
+ cout << " reshard stale-instances delete cleanup stale-instances from bucket resharding\n";
+ cout << " sync error list list sync error\n";
+ cout << " sync error trim trim sync error\n";
+ cout << " mfa create create a new MFA TOTP token\n";
+ cout << " mfa list list MFA TOTP tokens\n";
+ cout << " mfa get show MFA TOTP token\n";
+ cout << " mfa remove delete MFA TOTP token\n";
+ cout << " mfa check check MFA TOTP token\n";
+ cout << " mfa resync re-sync MFA TOTP token\n";
+ cout << " topic list list bucket notifications topics\n";
+ cout << " topic get get a bucket notifications topic\n";
+ cout << " topic rm remove a bucket notifications topic\n";
+ cout << " script put upload a lua script to a context\n";
+ cout << " script get get the lua script of a context\n";
+ cout << " script rm remove the lua scripts of a context\n";
+ cout << " script-package add add a lua package to the scripts allowlist\n";
+ cout << " script-package rm remove a lua package from the scripts allowlist\n";
+ cout << " script-package list get the lua packages allowlist\n";
+ cout << " notification list list bucket notifications configuration\n";
+ cout << " notification get get a bucket notifications configuration\n";
+ cout << " notification rm remove a bucket notifications configuration\n";
+ // General options shared by most commands.
+ cout << "options:\n";
+ cout << " --tenant=<tenant> tenant name\n";
+ cout << " --user_ns=<namespace> namespace of user (oidc in case of users authenticated with oidc provider)\n";
+ cout << " --uid=<id> user id\n";
+ cout << " --new-uid=<id> new user id\n";
+ cout << " --subuser=<name> subuser name\n";
+ cout << " --access-key=<key> S3 access key\n";
+ cout << " --email=<email> user's email address\n";
+ cout << " --secret/--secret-key=<key>\n";
+ cout << " specify secret key\n";
+ cout << " --gen-access-key generate random access key (for S3)\n";
+ cout << " --gen-secret generate random secret key\n";
+ cout << " --key-type=<type> key type, options are: swift, s3\n";
+ cout << " --temp-url-key[-2]=<key> temp url key\n";
+ cout << " --access=<access> Set access permissions for sub-user, should be one\n";
+ cout << " of read, write, readwrite, full\n";
+ cout << " --display-name=<name> user's display name\n";
+ cout << " --max-buckets max number of buckets for a user\n";
+ cout << " --admin set the admin flag on the user\n";
+ cout << " --system set the system flag on the user\n";
+ cout << " --op-mask set the op mask on the user\n";
+ cout << " --bucket=<bucket> Specify the bucket name. Also used by the quota command.\n";
+ cout << " --pool=<pool> Specify the pool name. Also used to scan for leaked rados objects.\n";
+ cout << " --object=<object> object name\n";
+ cout << " --objects-file=<file> file containing a list of object names to process\n";
+ cout << " --object-version=<version> object version\n";
+ cout << " --date=<date> date in the format yyyy-mm-dd\n";
+ cout << " --start-date=<date> start date in the format yyyy-mm-dd\n";
+ cout << " --end-date=<date> end date in the format yyyy-mm-dd\n";
+ cout << " --bucket-id=<bucket-id> bucket id\n";
+ cout << " --bucket-new-name=<bucket>\n";
+ cout << " for bucket link: optional new name\n";
+ cout << " --shard-id=<shard-id> optional for: \n";
+ cout << " mdlog list\n";
+ cout << " data sync status\n";
+ cout << " required for: \n";
+ cout << " mdlog trim\n";
+ cout << " --gen=<gen-id> optional for: \n";
+ cout << " bilog list\n";
+ cout << " bilog trim\n";
+ cout << " bilog status\n";
+ cout << " --max-entries=<entries> max entries for listing operations\n";
+ cout << " --metadata-key=<key> key to retrieve metadata from with metadata get\n";
+ cout << " --remote=<remote> zone or zonegroup id of remote gateway\n";
+ cout << " --period=<id> period id\n";
+ cout << " --url=<url> url for pushing/pulling period/realm\n";
+ cout << " --epoch=<number> period epoch\n";
+ cout << " --commit commit the period during 'period update'\n";
+ cout << " --staging get staging period info\n";
+ cout << " --master set as master\n";
+ cout << " --master-zone=<id> master zone id\n";
+ cout << " --rgw-realm=<name> realm name\n";
+ cout << " --realm-id=<id> realm id\n";
+ cout << " --realm-new-name=<name> realm new name\n";
+ cout << " --rgw-zonegroup=<name> zonegroup name\n";
+ cout << " --zonegroup-id=<id> zonegroup id\n";
+ cout << " --zonegroup-new-name=<name>\n";
+ cout << " zonegroup new name\n";
+ cout << " --rgw-zone=<name> name of zone in which radosgw is running\n";
+ cout << " --zone-id=<id> zone id\n";
+ cout << " --zone-new-name=<name> zone new name\n";
+ cout << " --source-zone specify the source zone (for data sync)\n";
+ cout << " --default set entity (realm, zonegroup, zone) as default\n";
+ cout << " --read-only set zone as read-only (when adding to zonegroup)\n";
+ cout << " --redirect-zone specify zone id to redirect when response is 404 (not found)\n";
+ cout << " --placement-id placement id for zonegroup placement commands\n";
+ cout << " --storage-class storage class for zonegroup placement commands\n";
+ cout << " --tags=<list> list of tags for zonegroup placement add and modify commands\n";
+ cout << " --tags-add=<list> list of tags to add for zonegroup placement modify command\n";
+ cout << " --tags-rm=<list> list of tags to remove for zonegroup placement modify command\n";
+ cout << " --endpoints=<list> zone endpoints\n";
+ cout << " --index-pool=<pool> placement target index pool\n";
+ cout << " --data-pool=<pool> placement target data pool\n";
+ cout << " --data-extra-pool=<pool> placement target data extra (non-ec) pool\n";
+ cout << " --placement-index-type=<type>\n";
+ cout << " placement target index type (normal, indexless, or #id)\n";
+ cout << " --placement-inline-data=<true>\n";
+ cout << " set whether the placement target is configured to store a data\n";
+ cout << " chunk inline in head objects\n";
+ cout << " --compression=<type> placement target compression type (plugin name or empty/none)\n";
+ cout << " --tier-type=<type> zone tier type\n";
+ cout << " --tier-config=<k>=<v>[,...]\n";
+ cout << " set zone tier config keys, values\n";
+ cout << " --tier-config-rm=<k>[,...]\n";
+ cout << " unset zone tier config keys\n";
+ cout << " --sync-from-all[=false] set/reset whether zone syncs from all zonegroup peers\n";
+ cout << " --sync-from=[zone-name][,...]\n";
+ cout << " set list of zones to sync from\n";
+ cout << " --sync-from-rm=[zone-name][,...]\n";
+ cout << " remove zones from list of zones to sync from\n";
+ cout << " --bucket-index-max-shards override a zone/zonegroup's default bucket index shard count\n";
+ cout << " --fix besides checking bucket index, will also fix it\n";
+ cout << " --check-objects bucket check: rebuilds bucket index according to\n";
+ cout << " actual objects state\n";
+ cout << " --format=<format> specify output format for certain operations: xml,\n";
+ cout << " json\n";
+ cout << " --purge-data when specified, user removal will also purge all the\n";
+ cout << " user data\n";
+ cout << " --purge-keys when specified, subuser removal will also purge all the\n";
+ cout << " subuser keys\n";
+ cout << " --purge-objects remove a bucket's objects before deleting it\n";
+ cout << " (NOTE: required to delete a non-empty bucket)\n";
+ cout << " --sync-stats option to 'user stats', update user stats with current\n";
+ cout << " stats reported by user's buckets indexes\n";
+ cout << " --reset-stats option to 'user stats', reset stats in accordance with user buckets\n";
+ cout << " --show-config show configuration\n";
+ cout << " --show-log-entries=<flag> enable/disable dump of log entries on log show\n";
+ cout << " --show-log-sum=<flag> enable/disable dump of log summation on log show\n";
+ cout << " --skip-zero-entries log show only dumps entries that don't have zero value\n";
+ cout << " in one of the numeric field\n";
+ cout << " --infile=<file> file to read in when setting data\n";
+ cout << " --categories=<list> comma separated list of categories, used in usage show\n";
+ cout << " --caps=<caps> list of caps (e.g., \"usage=read, write; user=read\")\n";
+ cout << " --op-mask=<op-mask> permission of user's operations (e.g., \"read, write, delete, *\")\n";
+ cout << " --yes-i-really-mean-it required for certain operations\n";
+ cout << " --warnings-only when specified with bucket limit check, list\n";
+ cout << " only buckets nearing or over the current max\n";
+ cout << " objects per shard value\n";
+ cout << " --bypass-gc when specified with bucket deletion, triggers\n";
+ cout << " object deletions by not involving GC\n";
+ cout << " --inconsistent-index when specified with bucket deletion and bypass-gc set to true,\n";
+ cout << " ignores bucket index consistency\n";
+ cout << " --min-rewrite-size min object size for bucket rewrite (default 4M)\n";
+ cout << " --max-rewrite-size max object size for bucket rewrite (default ULLONG_MAX)\n";
+ cout << " --min-rewrite-stripe-size min stripe size for object rewrite (default 0)\n";
+ cout << " --trim-delay-ms time interval in msec to limit the frequency of sync error log entries trimming operations,\n";
+ cout << " the trimming process will sleep the specified msec for every 1000 entries trimmed\n";
+ cout << " --max-concurrent-ios maximum concurrent ios for bucket operations (default: 32)\n";
+ cout << " --enable-feature enable a zone/zonegroup feature\n";
+ cout << " --disable-feature disable a zone/zonegroup feature\n";
+ cout << "\n";
+ cout << "<date> := \"YYYY-MM-DD[ hh:mm:ss]\"\n";
+ // Topic-specific option sections follow.
+ cout << "\nQuota options:\n";
+ cout << " --max-objects specify max objects (negative value to disable)\n";
+ cout << " --max-size specify max size (in B/K/M/G/T, negative value to disable)\n";
+ cout << " --quota-scope scope of quota (bucket, user)\n";
+ cout << "\nRate limiting options:\n";
+ cout << " --max-read-ops specify max requests per minute for READ ops per RGW (GET and HEAD request methods), 0 means unlimited\n";
+ cout << " --max-read-bytes specify max bytes per minute for READ ops per RGW (GET and HEAD request methods), 0 means unlimited\n";
+ cout << " --max-write-ops specify max requests per minute for WRITE ops per RGW (Not GET or HEAD request methods), 0 means unlimited\n";
+ cout << " --max-write-bytes specify max bytes per minute for WRITE ops per RGW (Not GET or HEAD request methods), 0 means unlimited\n";
+ cout << " --ratelimit-scope scope of rate limiting: bucket, user, anonymous\n";
+ cout << " anonymous can be configured only with global rate limit\n";
+ cout << "\nOrphans search options:\n";
+ cout << " --num-shards num of shards to use for keeping the temporary scan info\n";
+ cout << " --orphan-stale-secs num of seconds to wait before declaring an object to be an orphan (default: 86400)\n";
+ cout << " --job-id set the job id (for orphans find)\n";
+ cout << " --detail detailed mode, log and stat head objects as well\n";
+ cout << "\nOrphans list-jobs options:\n";
+ cout << " --extra-info provide extra info in job list\n";
+ cout << "\nRole options:\n";
+ cout << " --role-name name of the role to create\n";
+ cout << " --path path to the role\n";
+ cout << " --assume-role-policy-doc the trust relationship policy document that grants an entity permission to assume the role\n";
+ cout << " --policy-name name of the policy document\n";
+ cout << " --policy-doc permission policy document\n";
+ cout << " --path-prefix path prefix for filtering roles\n";
+ cout << "\nMFA options:\n";
+ cout << " --totp-serial a string that represents the ID of a TOTP token\n";
+ cout << " --totp-seed the secret seed that is used to calculate the TOTP\n";
+ cout << " --totp-seconds the time resolution that is being used for TOTP generation\n";
+ cout << " --totp-window the number of TOTP tokens that are checked before and after the current token when validating token\n";
+ cout << " --totp-pin the valid value of a TOTP token at a certain time\n";
+ cout << "\nBucket notifications options:\n";
+ cout << " --topic bucket notifications topic name\n";
+ cout << " --notification-id bucket notifications id\n";
+ cout << "\nScript options:\n";
+ cout << " --context context in which the script runs. one of: "+LUA_CONTEXT_LIST+"\n";
+ cout << " --package name of the lua package that should be added/removed to/from the allowlist\n";
+ cout << " --allow-compilation package is allowed to compile C code as part of its installation\n";
+ cout << "\nBucket check olh/unlinked options:\n";
+ cout << " --min-age-hours minimum age of unlinked objects to consider for bucket check unlinked (default: 1)\n";
+ cout << " --dump-keys when specified, all keys identified as problematic are printed to stdout\n";
+ cout << " --hide-progress when specified, per-shard progress details are not printed to stderr\n";
+ cout << "\nradoslist options:\n";
+ cout << " --rgw-obj-fs the field separator that will separate the rados\n";
+ cout << " object name from the rgw object name;\n";
+ cout << " additionally rados objects for incomplete\n";
+ cout << " multipart uploads will not be output\n";
+ cout << "\n";
+ generic_client_usage();
+}
+
+
+/* Word-trie based matcher that resolves a sequence of command-line words
+ * (e.g. "bucket", "sync", "status") to an opaque command value (std::any,
+ * in practice an OPT enum).  Supports aliases (each alias set's first
+ * member is the canonical spelling) and a trailing "[*]" placeholder for
+ * one optional free-form argument. */
+class SimpleCmd {
+public:
+  /* One command definition: the space-separated command words plus the
+   * opaque value returned when those words match. */
+  struct Def {
+    string cmd;
+    std::any opt;
+  };
+
+  using Aliases = std::vector<std::set<string> >;
+  using Commands = std::vector<Def>;
+
+private:
+  /* Trie node: children keyed by normalized word, the un-normalized words
+   * expected at this level (for error messages), and the command value if
+   * a command ends here. */
+  struct Node {
+    map<string, Node> next;
+    set<string> expected; /* separate un-normalized list */
+    std::any opt;
+  };
+
+  Node cmd_root;
+  map<string, string> alias_map;
+
+  /* Return the canonical spelling of `s`, or `s` itself if it is not an
+   * alias. */
+  string normalize_alias(const string& s) const {
+    auto iter = alias_map.find(s);
+    if (iter == alias_map.end()) {
+      return s;
+    }
+
+    return iter->second;
+  }
+
+  /* Map every member of each alias set to the set's first element. */
+  void init_alias_map(Aliases& aliases) {
+    for (auto& alias_set : aliases) {
+      std::optional<string> first;
+
+      for (auto& alias : alias_set) {
+        if (!first) {
+          first = alias;
+        } else {
+          alias_map[alias] = *first;
+        }
+      }
+    }
+  }
+
+  /* Copy the words expected after `node` into *expected and pass `ret`
+   * through (convenience helper for find_command()'s error paths). */
+  bool gen_next_expected(Node *node, vector<string> *expected, bool ret) {
+    for (auto& next_cmd : node->expected) {
+      expected->push_back(next_cmd);
+    }
+    return ret;
+  }
+
+public:
+  SimpleCmd() {}
+
+  SimpleCmd(std::optional<Commands> cmds,
+            std::optional<Aliases> aliases) {
+    if (aliases) {
+      add_aliases(*aliases);
+    }
+
+    if (cmds) {
+      add_commands(*cmds);
+    }
+  }
+
+  void add_aliases(Aliases& aliases) {
+    init_alias_map(aliases);
+  }
+
+  /* Insert each command into the trie, one node per normalized word.
+   * A trailing "[*]" also registers a "*" child on its parent so that any
+   * word can match there, and marks the parent itself as a valid command. */
+  void add_commands(std::vector<Def>& cmds) {
+    for (auto& cmd : cmds) {
+      vector<string> words;
+      get_str_vec(cmd.cmd, " ", words);
+
+      auto node = &cmd_root;
+      for (auto& word : words) {
+        auto norm = normalize_alias(word);
+        auto parent = node;
+
+        node->expected.insert(word);
+
+        node = &node->next[norm];
+
+        if (norm == "[*]") { /* optional param at the end */
+          parent->next["*"] = *node; /* can be also looked up by '*' */
+          parent->opt = cmd.opt;
+        }
+      }
+
+      node->opt = cmd.opt;
+    }
+  }
+
+  /* Walk `args` through the trie.  Unmatched words at a "*" node are
+   * collected into *extra_args.  On success *opt_cmd receives the command
+   * value and true is returned; on failure *error is set, *expected lists
+   * the acceptable next words, and false is returned. */
+  template <class Container>
+  bool find_command(Container& args,
+                    std::any *opt_cmd,
+                    vector<string> *extra_args,
+                    string *error,
+                    vector<string> *expected) {
+    auto node = &cmd_root;
+
+    std::optional<std::any> found_opt;
+
+    for (auto& arg : args) {
+      string norm = normalize_alias(arg);
+      auto iter = node->next.find(norm);
+      if (iter == node->next.end()) {
+        iter = node->next.find("*");
+        if (iter == node->next.end()) {
+          *error = string("ERROR: Unrecognized argument: '") + arg + "'";
+          return gen_next_expected(node, expected, false);
+        }
+        extra_args->push_back(arg);
+        if (!found_opt) {
+          found_opt = node->opt;
+        }
+      }
+      node = &(iter->second);
+    }
+
+    *opt_cmd = found_opt.value_or(node->opt);
+
+    if (!opt_cmd->has_value()) {
+      *error = "ERROR: Unknown command";
+      return gen_next_expected(node, expected, false);
+    }
+
+    return true;
+  }
+};
+
+
+namespace rgw_admin {
+
+// Every radosgw-admin subcommand.  Values are resolved from command-line
+// words by SimpleCmd::find_command() via the all_cmds table defined below
+// in this file; NO_CMD means no command was recognized.
+enum class OPT {
+ NO_CMD,
+ USER_CREATE,
+ USER_INFO,
+ USER_MODIFY,
+ USER_RENAME,
+ USER_RM,
+ USER_SUSPEND,
+ USER_ENABLE,
+ USER_CHECK,
+ USER_STATS,
+ USER_LIST,
+ SUBUSER_CREATE,
+ SUBUSER_MODIFY,
+ SUBUSER_RM,
+ KEY_CREATE,
+ KEY_RM,
+ BUCKETS_LIST,
+ BUCKET_LIMIT_CHECK,
+ BUCKET_LINK,
+ BUCKET_UNLINK,
+ BUCKET_LAYOUT,
+ BUCKET_STATS,
+ BUCKET_CHECK,
+ BUCKET_CHECK_OLH,
+ BUCKET_CHECK_UNLINKED,
+ BUCKET_SYNC_CHECKPOINT,
+ BUCKET_SYNC_INFO,
+ BUCKET_SYNC_STATUS,
+ BUCKET_SYNC_MARKERS,
+ BUCKET_SYNC_INIT,
+ BUCKET_SYNC_RUN,
+ BUCKET_SYNC_DISABLE,
+ BUCKET_SYNC_ENABLE,
+ BUCKET_RM,
+ BUCKET_REWRITE,
+ BUCKET_RESHARD,
+ BUCKET_CHOWN,
+ BUCKET_RADOS_LIST,
+ BUCKET_SHARD_OBJECTS,
+ BUCKET_OBJECT_SHARD,
+ BUCKET_RESYNC_ENCRYPTED_MULTIPART,
+ POLICY,
+ POOL_ADD,
+ POOL_RM,
+ POOLS_LIST,
+ LOG_LIST,
+ LOG_SHOW,
+ LOG_RM,
+ USAGE_SHOW,
+ USAGE_TRIM,
+ USAGE_CLEAR,
+ OBJECT_PUT,
+ OBJECT_RM,
+ OBJECT_UNLINK,
+ OBJECT_STAT,
+ OBJECT_REWRITE,
+ OBJECT_REINDEX,
+ OBJECTS_EXPIRE,
+ OBJECTS_EXPIRE_STALE_LIST,
+ OBJECTS_EXPIRE_STALE_RM,
+ BI_GET,
+ BI_PUT,
+ BI_LIST,
+ BI_PURGE,
+ OLH_GET,
+ OLH_READLOG,
+ QUOTA_SET,
+ QUOTA_ENABLE,
+ QUOTA_DISABLE,
+ GC_LIST,
+ GC_PROCESS,
+ LC_LIST,
+ LC_GET,
+ LC_PROCESS,
+ LC_RESHARD_FIX,
+ ORPHANS_FIND,
+ ORPHANS_FINISH,
+ ORPHANS_LIST_JOBS,
+ RATELIMIT_GET,
+ RATELIMIT_SET,
+ RATELIMIT_ENABLE,
+ RATELIMIT_DISABLE,
+ ZONEGROUP_ADD,
+ ZONEGROUP_CREATE,
+ ZONEGROUP_DEFAULT,
+ ZONEGROUP_DELETE,
+ ZONEGROUP_GET,
+ ZONEGROUP_MODIFY,
+ ZONEGROUP_SET,
+ ZONEGROUP_LIST,
+ ZONEGROUP_REMOVE,
+ ZONEGROUP_RENAME,
+ ZONEGROUP_PLACEMENT_ADD,
+ ZONEGROUP_PLACEMENT_MODIFY,
+ ZONEGROUP_PLACEMENT_RM,
+ ZONEGROUP_PLACEMENT_LIST,
+ ZONEGROUP_PLACEMENT_GET,
+ ZONEGROUP_PLACEMENT_DEFAULT,
+ ZONE_CREATE,
+ ZONE_DELETE,
+ ZONE_GET,
+ ZONE_MODIFY,
+ ZONE_SET,
+ ZONE_LIST,
+ ZONE_RENAME,
+ ZONE_DEFAULT,
+ ZONE_PLACEMENT_ADD,
+ ZONE_PLACEMENT_MODIFY,
+ ZONE_PLACEMENT_RM,
+ ZONE_PLACEMENT_LIST,
+ ZONE_PLACEMENT_GET,
+ CAPS_ADD,
+ CAPS_RM,
+ METADATA_GET,
+ METADATA_PUT,
+ METADATA_RM,
+ METADATA_LIST,
+ METADATA_SYNC_STATUS,
+ METADATA_SYNC_INIT,
+ METADATA_SYNC_RUN,
+ MDLOG_LIST,
+ MDLOG_AUTOTRIM,
+ MDLOG_TRIM,
+ MDLOG_FETCH,
+ MDLOG_STATUS,
+ SYNC_ERROR_LIST,
+ SYNC_ERROR_TRIM,
+ SYNC_GROUP_CREATE,
+ SYNC_GROUP_MODIFY,
+ SYNC_GROUP_GET,
+ SYNC_GROUP_REMOVE,
+ SYNC_GROUP_FLOW_CREATE,
+ SYNC_GROUP_FLOW_REMOVE,
+ SYNC_GROUP_PIPE_CREATE,
+ SYNC_GROUP_PIPE_MODIFY,
+ SYNC_GROUP_PIPE_REMOVE,
+ SYNC_POLICY_GET,
+ BILOG_LIST,
+ BILOG_TRIM,
+ BILOG_STATUS,
+ BILOG_AUTOTRIM,
+ DATA_SYNC_STATUS,
+ DATA_SYNC_INIT,
+ DATA_SYNC_RUN,
+ DATALOG_LIST,
+ DATALOG_STATUS,
+ DATALOG_AUTOTRIM,
+ DATALOG_TRIM,
+ DATALOG_TYPE,
+ DATALOG_PRUNE,
+ REALM_CREATE,
+ REALM_DELETE,
+ REALM_GET,
+ REALM_GET_DEFAULT,
+ REALM_LIST,
+ REALM_LIST_PERIODS,
+ REALM_RENAME,
+ REALM_SET,
+ REALM_DEFAULT,
+ REALM_PULL,
+ PERIOD_DELETE,
+ PERIOD_GET,
+ PERIOD_GET_CURRENT,
+ PERIOD_PULL,
+ PERIOD_PUSH,
+ PERIOD_LIST,
+ PERIOD_UPDATE,
+ PERIOD_COMMIT,
+ GLOBAL_QUOTA_GET,
+ GLOBAL_QUOTA_SET,
+ GLOBAL_QUOTA_ENABLE,
+ GLOBAL_QUOTA_DISABLE,
+ GLOBAL_RATELIMIT_GET,
+ GLOBAL_RATELIMIT_SET,
+ GLOBAL_RATELIMIT_ENABLE,
+ GLOBAL_RATELIMIT_DISABLE,
+ SYNC_INFO,
+ SYNC_STATUS,
+ ROLE_CREATE,
+ ROLE_DELETE,
+ ROLE_GET,
+ ROLE_TRUST_POLICY_MODIFY,
+ ROLE_LIST,
+ ROLE_POLICY_PUT,
+ ROLE_POLICY_LIST,
+ ROLE_POLICY_GET,
+ ROLE_POLICY_DELETE,
+ ROLE_UPDATE,
+ RESHARD_ADD,
+ RESHARD_LIST,
+ RESHARD_STATUS,
+ RESHARD_PROCESS,
+ RESHARD_CANCEL,
+ MFA_CREATE,
+ MFA_REMOVE,
+ MFA_GET,
+ MFA_LIST,
+ MFA_CHECK,
+ MFA_RESYNC,
+ RESHARD_STALE_INSTANCES_LIST,
+ RESHARD_STALE_INSTANCES_DELETE,
+ PUBSUB_TOPIC_LIST,
+ PUBSUB_TOPIC_GET,
+ PUBSUB_TOPIC_RM,
+ PUBSUB_NOTIFICATION_LIST,
+ PUBSUB_NOTIFICATION_GET,
+ PUBSUB_NOTIFICATION_RM,
+ SCRIPT_PUT,
+ SCRIPT_GET,
+ SCRIPT_RM,
+ SCRIPT_PACKAGE_ADD,
+ SCRIPT_PACKAGE_RM,
+ SCRIPT_PACKAGE_LIST
+};
+
+}
+
+using namespace rgw_admin;
+
// Command table for the radosgw-admin command-line parser: maps each
// space-separated command phrase to its OPT enumerator. Several phrases
// intentionally map to the same OPT so that singular/plural and
// hyphenated/spaced spellings are accepted as synonyms (e.g. "bucket
// list" / "buckets list", "realm get default" / "realm get-default").
// NOTE(review): the "[*]" suffix on the metadata commands presumably
// marks a trailing free-form argument (the metadata key) — confirm
// against the SimpleCmd parser.
static SimpleCmd::Commands all_cmds = {
  { "user create", OPT::USER_CREATE },
  { "user info", OPT::USER_INFO },
  { "user modify", OPT::USER_MODIFY },
  { "user rename", OPT::USER_RENAME },
  { "user rm", OPT::USER_RM },
  { "user suspend", OPT::USER_SUSPEND },
  { "user enable", OPT::USER_ENABLE },
  { "user check", OPT::USER_CHECK },
  { "user stats", OPT::USER_STATS },
  { "user list", OPT::USER_LIST },
  { "subuser create", OPT::SUBUSER_CREATE },
  { "subuser modify", OPT::SUBUSER_MODIFY },
  { "subuser rm", OPT::SUBUSER_RM },
  { "key create", OPT::KEY_CREATE },
  { "key rm", OPT::KEY_RM },
  { "buckets list", OPT::BUCKETS_LIST },
  { "bucket list", OPT::BUCKETS_LIST },
  { "bucket limit check", OPT::BUCKET_LIMIT_CHECK },
  { "bucket link", OPT::BUCKET_LINK },
  { "bucket unlink", OPT::BUCKET_UNLINK },
  { "bucket layout", OPT::BUCKET_LAYOUT },
  { "bucket stats", OPT::BUCKET_STATS },
  { "bucket check", OPT::BUCKET_CHECK },
  { "bucket check olh", OPT::BUCKET_CHECK_OLH },
  { "bucket check unlinked", OPT::BUCKET_CHECK_UNLINKED },
  { "bucket sync checkpoint", OPT::BUCKET_SYNC_CHECKPOINT },
  { "bucket sync info", OPT::BUCKET_SYNC_INFO },
  { "bucket sync status", OPT::BUCKET_SYNC_STATUS },
  { "bucket sync markers", OPT::BUCKET_SYNC_MARKERS },
  { "bucket sync init", OPT::BUCKET_SYNC_INIT },
  { "bucket sync run", OPT::BUCKET_SYNC_RUN },
  { "bucket sync disable", OPT::BUCKET_SYNC_DISABLE },
  { "bucket sync enable", OPT::BUCKET_SYNC_ENABLE },
  { "bucket rm", OPT::BUCKET_RM },
  { "bucket rewrite", OPT::BUCKET_REWRITE },
  { "bucket reshard", OPT::BUCKET_RESHARD },
  { "bucket chown", OPT::BUCKET_CHOWN },
  { "bucket radoslist", OPT::BUCKET_RADOS_LIST },
  { "bucket rados list", OPT::BUCKET_RADOS_LIST },
  { "bucket shard objects", OPT::BUCKET_SHARD_OBJECTS },
  { "bucket shard object", OPT::BUCKET_SHARD_OBJECTS },
  { "bucket object shard", OPT::BUCKET_OBJECT_SHARD },
  { "bucket resync encrypted multipart", OPT::BUCKET_RESYNC_ENCRYPTED_MULTIPART },
  { "policy", OPT::POLICY },
  { "pool add", OPT::POOL_ADD },
  { "pool rm", OPT::POOL_RM },
  { "pool list", OPT::POOLS_LIST },
  { "pools list", OPT::POOLS_LIST },
  { "log list", OPT::LOG_LIST },
  { "log show", OPT::LOG_SHOW },
  { "log rm", OPT::LOG_RM },
  { "usage show", OPT::USAGE_SHOW },
  { "usage trim", OPT::USAGE_TRIM },
  { "usage clear", OPT::USAGE_CLEAR },
  { "object put", OPT::OBJECT_PUT },
  { "object rm", OPT::OBJECT_RM },
  { "object unlink", OPT::OBJECT_UNLINK },
  { "object stat", OPT::OBJECT_STAT },
  { "object rewrite", OPT::OBJECT_REWRITE },
  { "object reindex", OPT::OBJECT_REINDEX },
  { "objects expire", OPT::OBJECTS_EXPIRE },
  { "objects expire-stale list", OPT::OBJECTS_EXPIRE_STALE_LIST },
  { "objects expire-stale rm", OPT::OBJECTS_EXPIRE_STALE_RM },
  { "bi get", OPT::BI_GET },
  { "bi put", OPT::BI_PUT },
  { "bi list", OPT::BI_LIST },
  { "bi purge", OPT::BI_PURGE },
  { "olh get", OPT::OLH_GET },
  { "olh readlog", OPT::OLH_READLOG },
  { "quota set", OPT::QUOTA_SET },
  { "quota enable", OPT::QUOTA_ENABLE },
  { "quota disable", OPT::QUOTA_DISABLE },
  { "ratelimit get", OPT::RATELIMIT_GET },
  { "ratelimit set", OPT::RATELIMIT_SET },
  { "ratelimit enable", OPT::RATELIMIT_ENABLE },
  { "ratelimit disable", OPT::RATELIMIT_DISABLE },
  { "gc list", OPT::GC_LIST },
  { "gc process", OPT::GC_PROCESS },
  { "lc list", OPT::LC_LIST },
  { "lc get", OPT::LC_GET },
  { "lc process", OPT::LC_PROCESS },
  { "lc reshard fix", OPT::LC_RESHARD_FIX },
  { "orphans find", OPT::ORPHANS_FIND },
  { "orphans finish", OPT::ORPHANS_FINISH },
  { "orphans list jobs", OPT::ORPHANS_LIST_JOBS },
  { "orphans list-jobs", OPT::ORPHANS_LIST_JOBS },
  { "zonegroup add", OPT::ZONEGROUP_ADD },
  { "zonegroup create", OPT::ZONEGROUP_CREATE },
  { "zonegroup default", OPT::ZONEGROUP_DEFAULT },
  { "zonegroup delete", OPT::ZONEGROUP_DELETE },
  { "zonegroup get", OPT::ZONEGROUP_GET },
  { "zonegroup modify", OPT::ZONEGROUP_MODIFY },
  { "zonegroup set", OPT::ZONEGROUP_SET },
  { "zonegroup list", OPT::ZONEGROUP_LIST },
  { "zonegroups list", OPT::ZONEGROUP_LIST },
  { "zonegroup remove", OPT::ZONEGROUP_REMOVE },
  { "zonegroup remove zone", OPT::ZONEGROUP_REMOVE },
  { "zonegroup rename", OPT::ZONEGROUP_RENAME },
  { "zonegroup placement add", OPT::ZONEGROUP_PLACEMENT_ADD },
  { "zonegroup placement modify", OPT::ZONEGROUP_PLACEMENT_MODIFY },
  { "zonegroup placement rm", OPT::ZONEGROUP_PLACEMENT_RM },
  { "zonegroup placement list", OPT::ZONEGROUP_PLACEMENT_LIST },
  { "zonegroup placement get", OPT::ZONEGROUP_PLACEMENT_GET },
  { "zonegroup placement default", OPT::ZONEGROUP_PLACEMENT_DEFAULT },
  { "zone create", OPT::ZONE_CREATE },
  { "zone delete", OPT::ZONE_DELETE },
  { "zone get", OPT::ZONE_GET },
  { "zone modify", OPT::ZONE_MODIFY },
  { "zone set", OPT::ZONE_SET },
  { "zone list", OPT::ZONE_LIST },
  { "zones list", OPT::ZONE_LIST },
  { "zone rename", OPT::ZONE_RENAME },
  { "zone default", OPT::ZONE_DEFAULT },
  { "zone placement add", OPT::ZONE_PLACEMENT_ADD },
  { "zone placement modify", OPT::ZONE_PLACEMENT_MODIFY },
  { "zone placement rm", OPT::ZONE_PLACEMENT_RM },
  { "zone placement list", OPT::ZONE_PLACEMENT_LIST },
  { "zone placement get", OPT::ZONE_PLACEMENT_GET },
  { "caps add", OPT::CAPS_ADD },
  { "caps rm", OPT::CAPS_RM },
  { "metadata get [*]", OPT::METADATA_GET },
  { "metadata put [*]", OPT::METADATA_PUT },
  { "metadata rm [*]", OPT::METADATA_RM },
  { "metadata list [*]", OPT::METADATA_LIST },
  { "metadata sync status", OPT::METADATA_SYNC_STATUS },
  { "metadata sync init", OPT::METADATA_SYNC_INIT },
  { "metadata sync run", OPT::METADATA_SYNC_RUN },
  { "mdlog list", OPT::MDLOG_LIST },
  { "mdlog autotrim", OPT::MDLOG_AUTOTRIM },
  { "mdlog trim", OPT::MDLOG_TRIM },
  { "mdlog fetch", OPT::MDLOG_FETCH },
  { "mdlog status", OPT::MDLOG_STATUS },
  { "sync error list", OPT::SYNC_ERROR_LIST },
  { "sync error trim", OPT::SYNC_ERROR_TRIM },
  { "sync policy get", OPT::SYNC_POLICY_GET },
  { "sync group create", OPT::SYNC_GROUP_CREATE },
  { "sync group modify", OPT::SYNC_GROUP_MODIFY },
  { "sync group get", OPT::SYNC_GROUP_GET },
  { "sync group remove", OPT::SYNC_GROUP_REMOVE },
  { "sync group flow create", OPT::SYNC_GROUP_FLOW_CREATE },
  { "sync group flow remove", OPT::SYNC_GROUP_FLOW_REMOVE },
  { "sync group pipe create", OPT::SYNC_GROUP_PIPE_CREATE },
  { "sync group pipe modify", OPT::SYNC_GROUP_PIPE_MODIFY },
  { "sync group pipe remove", OPT::SYNC_GROUP_PIPE_REMOVE },
  { "bilog list", OPT::BILOG_LIST },
  { "bilog trim", OPT::BILOG_TRIM },
  { "bilog status", OPT::BILOG_STATUS },
  { "bilog autotrim", OPT::BILOG_AUTOTRIM },
  { "data sync status", OPT::DATA_SYNC_STATUS },
  { "data sync init", OPT::DATA_SYNC_INIT },
  { "data sync run", OPT::DATA_SYNC_RUN },
  { "datalog list", OPT::DATALOG_LIST },
  { "datalog status", OPT::DATALOG_STATUS },
  { "datalog autotrim", OPT::DATALOG_AUTOTRIM },
  { "datalog trim", OPT::DATALOG_TRIM },
  { "datalog type", OPT::DATALOG_TYPE },
  { "datalog prune", OPT::DATALOG_PRUNE },
  { "realm create", OPT::REALM_CREATE },
  { "realm rm", OPT::REALM_DELETE },
  { "realm get", OPT::REALM_GET },
  { "realm get default", OPT::REALM_GET_DEFAULT },
  { "realm get-default", OPT::REALM_GET_DEFAULT },
  { "realm list", OPT::REALM_LIST },
  { "realm list periods", OPT::REALM_LIST_PERIODS },
  { "realm list-periods", OPT::REALM_LIST_PERIODS },
  { "realm rename", OPT::REALM_RENAME },
  { "realm set", OPT::REALM_SET },
  { "realm default", OPT::REALM_DEFAULT },
  { "realm pull", OPT::REALM_PULL },
  { "period delete", OPT::PERIOD_DELETE },
  { "period get", OPT::PERIOD_GET },
  { "period get-current", OPT::PERIOD_GET_CURRENT },
  { "period get current", OPT::PERIOD_GET_CURRENT },
  { "period pull", OPT::PERIOD_PULL },
  { "period push", OPT::PERIOD_PUSH },
  { "period list", OPT::PERIOD_LIST },
  { "period update", OPT::PERIOD_UPDATE },
  { "period commit", OPT::PERIOD_COMMIT },
  { "global quota get", OPT::GLOBAL_QUOTA_GET },
  { "global quota set", OPT::GLOBAL_QUOTA_SET },
  { "global quota enable", OPT::GLOBAL_QUOTA_ENABLE },
  { "global quota disable", OPT::GLOBAL_QUOTA_DISABLE },
  { "global ratelimit get", OPT::GLOBAL_RATELIMIT_GET },
  { "global ratelimit set", OPT::GLOBAL_RATELIMIT_SET },
  { "global ratelimit enable", OPT::GLOBAL_RATELIMIT_ENABLE },
  { "global ratelimit disable", OPT::GLOBAL_RATELIMIT_DISABLE },
  { "sync info", OPT::SYNC_INFO },
  { "sync status", OPT::SYNC_STATUS },
  { "role create", OPT::ROLE_CREATE },
  { "role delete", OPT::ROLE_DELETE },
  { "role get", OPT::ROLE_GET },
  { "role-trust-policy modify", OPT::ROLE_TRUST_POLICY_MODIFY },
  { "role list", OPT::ROLE_LIST },
  { "role policy put", OPT::ROLE_POLICY_PUT },
  { "role-policy put", OPT::ROLE_POLICY_PUT },
  { "role policy list", OPT::ROLE_POLICY_LIST },
  { "role-policy list", OPT::ROLE_POLICY_LIST },
  { "role policy get", OPT::ROLE_POLICY_GET },
  { "role-policy get", OPT::ROLE_POLICY_GET },
  { "role policy delete", OPT::ROLE_POLICY_DELETE },
  { "role-policy delete", OPT::ROLE_POLICY_DELETE },
  { "role update", OPT::ROLE_UPDATE },
  { "reshard bucket", OPT::BUCKET_RESHARD },
  { "reshard add", OPT::RESHARD_ADD },
  { "reshard list", OPT::RESHARD_LIST },
  { "reshard status", OPT::RESHARD_STATUS },
  { "reshard process", OPT::RESHARD_PROCESS },
  { "reshard cancel", OPT::RESHARD_CANCEL },
  { "mfa create", OPT::MFA_CREATE },
  { "mfa remove", OPT::MFA_REMOVE },
  { "mfa get", OPT::MFA_GET },
  { "mfa list", OPT::MFA_LIST },
  { "mfa check", OPT::MFA_CHECK },
  { "mfa resync", OPT::MFA_RESYNC },
  { "reshard stale-instances list", OPT::RESHARD_STALE_INSTANCES_LIST },
  { "reshard stale list", OPT::RESHARD_STALE_INSTANCES_LIST },
  { "reshard stale-instances delete", OPT::RESHARD_STALE_INSTANCES_DELETE },
  { "reshard stale delete", OPT::RESHARD_STALE_INSTANCES_DELETE },
  { "topic list", OPT::PUBSUB_TOPIC_LIST },
  { "topic get", OPT::PUBSUB_TOPIC_GET },
  { "topic rm", OPT::PUBSUB_TOPIC_RM },
  { "notification list", OPT::PUBSUB_NOTIFICATION_LIST },
  { "notification get", OPT::PUBSUB_NOTIFICATION_GET },
  { "notification rm", OPT::PUBSUB_NOTIFICATION_RM },
  { "script put", OPT::SCRIPT_PUT },
  { "script get", OPT::SCRIPT_GET },
  { "script rm", OPT::SCRIPT_RM },
  { "script-package add", OPT::SCRIPT_PACKAGE_ADD },
  { "script-package rm", OPT::SCRIPT_PACKAGE_RM },
  { "script-package list", OPT::SCRIPT_PACKAGE_LIST },
};
+
// Verb synonyms for the command parser, pairing a long spelling with
// its short form (e.g. "bucket delete" / "bucket del").
// NOTE(review): whether the mapping is one-directional or either
// spelling is accepted depends on SimpleCmd::Aliases — confirm there.
static SimpleCmd::Aliases cmd_aliases = {
  { "delete", "del" },
  { "remove", "rm" },
  { "rename", "mv" },
};
+
+
+
+BIIndexType get_bi_index_type(const string& type_str) {
+ if (type_str == "plain")
+ return BIIndexType::Plain;
+ if (type_str == "instance")
+ return BIIndexType::Instance;
+ if (type_str == "olh")
+ return BIIndexType::OLH;
+
+ return BIIndexType::Invalid;
+}
+
+log_type get_log_type(const string& type_str) {
+ if (strcasecmp(type_str.c_str(), "fifo") == 0)
+ return log_type::fifo;
+ if (strcasecmp(type_str.c_str(), "omap") == 0)
+ return log_type::omap;
+
+ return static_cast<log_type>(0xff);
+}
+
+void dump_bi_entry(bufferlist& bl, BIIndexType index_type, Formatter *formatter)
+{
+ auto iter = bl.cbegin();
+ switch (index_type) {
+ case BIIndexType::Plain:
+ case BIIndexType::Instance:
+ {
+ rgw_bucket_dir_entry entry;
+ decode(entry, iter);
+ encode_json("entry", entry, formatter);
+ }
+ break;
+ case BIIndexType::OLH:
+ {
+ rgw_bucket_olh_entry entry;
+ decode(entry, iter);
+ encode_json("entry", entry, formatter);
+ }
+ break;
+ default:
+ ceph_abort();
+ break;
+ }
+}
+
+static void show_user_info(RGWUserInfo& info, Formatter *formatter)
+{
+ encode_json("user_info", info, formatter);
+ formatter->flush(cout);
+ cout << std::endl;
+}
+
+static void show_perm_policy(string perm_policy, Formatter* formatter)
+{
+ formatter->open_object_section("role");
+ formatter->dump_string("Permission policy", perm_policy);
+ formatter->close_section();
+ formatter->flush(cout);
+}
+
+static void show_policy_names(std::vector<string> policy_names, Formatter* formatter)
+{
+ formatter->open_array_section("PolicyNames");
+ for (const auto& it : policy_names) {
+ formatter->dump_string("policyname", it);
+ }
+ formatter->close_section();
+ formatter->flush(cout);
+}
+
+static void show_role_info(rgw::sal::RGWRole* role, Formatter* formatter)
+{
+ formatter->open_object_section("role");
+ role->dump(formatter);
+ formatter->close_section();
+ formatter->flush(cout);
+}
+
+static void show_roles_info(vector<std::unique_ptr<rgw::sal::RGWRole>>& roles, Formatter* formatter)
+{
+ formatter->open_array_section("Roles");
+ for (const auto& it : roles) {
+ formatter->open_object_section("role");
+ it->dump(formatter);
+ formatter->close_section();
+ }
+ formatter->close_section();
+ formatter->flush(cout);
+}
+
+static void show_reshard_status(
+ const list<cls_rgw_bucket_instance_entry>& status, Formatter *formatter)
+{
+ formatter->open_array_section("status");
+ for (const auto& entry : status) {
+ formatter->open_object_section("entry");
+ formatter->dump_string("reshard_status", to_string(entry.reshard_status));
+ formatter->close_section();
+ }
+ formatter->close_section();
+ formatter->flush(cout);
+}
+
+class StoreDestructor {
+ rgw::sal::Driver* driver;
+public:
+ explicit StoreDestructor(rgw::sal::Driver* _s) : driver(_s) {}
+ ~StoreDestructor() {
+ DriverManager::close_storage(driver);
+ rgw_http_client_cleanup();
+ }
+};
+
// Resolve bucket `b` through the global driver and store the resulting
// handle in *bucket. Returns 0 on success or a negative error code.
static int init_bucket(rgw::sal::User* user, const rgw_bucket& b,
                       std::unique_ptr<rgw::sal::Bucket>* bucket)
{
  return driver->get_bucket(dpp(), user, b, bucket, null_yield);
}
+
+static int init_bucket(rgw::sal::User* user,
+ const string& tenant_name,
+ const string& bucket_name,
+ const string& bucket_id,
+ std::unique_ptr<rgw::sal::Bucket>* bucket)
+{
+ rgw_bucket b{tenant_name, bucket_name, bucket_id};
+ return init_bucket(user, b, bucket);
+}
+
+static int read_input(const string& infile, bufferlist& bl)
+{
+ int fd = 0;
+ if (infile.size()) {
+ fd = open(infile.c_str(), O_RDONLY);
+ if (fd < 0) {
+ int err = -errno;
+ cerr << "error reading input file " << infile << std::endl;
+ return err;
+ }
+ }
+
+#define READ_CHUNK 8196
+ int r;
+ int err;
+
+ do {
+ char buf[READ_CHUNK];
+
+ r = safe_read(fd, buf, READ_CHUNK);
+ if (r < 0) {
+ err = -errno;
+ cerr << "error while reading input" << std::endl;
+ goto out;
+ }
+ bl.append(buf, r);
+ } while (r > 0);
+ err = 0;
+
+ out:
+ if (infile.size()) {
+ close(fd);
+ }
+ return err;
+}
+
+template <class T>
+static int read_decode_json(const string& infile, T& t)
+{
+ bufferlist bl;
+ int ret = read_input(infile, bl);
+ if (ret < 0) {
+ cerr << "ERROR: failed to read input: " << cpp_strerror(-ret) << std::endl;
+ return ret;
+ }
+ JSONParser p;
+ if (!p.parse(bl.c_str(), bl.length())) {
+ cout << "failed to parse JSON" << std::endl;
+ return -EINVAL;
+ }
+
+ try {
+ decode_json_obj(t, &p);
+ } catch (const JSONDecoder::err& e) {
+ cout << "failed to decode JSON input: " << e.what() << std::endl;
+ return -EINVAL;
+ }
+ return 0;
+}
+
+template <class T, class K>
+static int read_decode_json(const string& infile, T& t, K *k)
+{
+ bufferlist bl;
+ int ret = read_input(infile, bl);
+ if (ret < 0) {
+ cerr << "ERROR: failed to read input: " << cpp_strerror(-ret) << std::endl;
+ return ret;
+ }
+ JSONParser p;
+ if (!p.parse(bl.c_str(), bl.length())) {
+ cout << "failed to parse JSON" << std::endl;
+ return -EINVAL;
+ }
+
+ try {
+ t.decode_json(&p, k);
+ } catch (const JSONDecoder::err& e) {
+ cout << "failed to decode JSON input: " << e.what() << std::endl;
+ return -EINVAL;
+ }
+ return 0;
+}
+
+template <class T>
+static bool decode_dump(const char *field_name, bufferlist& bl, Formatter *f)
+{
+ T t;
+
+ auto iter = bl.cbegin();
+
+ try {
+ decode(t, iter);
+ } catch (buffer::error& err) {
+ return false;
+ }
+
+ encode_json(field_name, t, f);
+
+ return true;
+}
+
+static bool dump_string(const char *field_name, bufferlist& bl, Formatter *f)
+{
+ string val = bl.to_str();
+ f->dump_string(field_name, val.c_str() /* hide encoded null termination chars */);
+
+ return true;
+}
+
+bool set_ratelimit_info(RGWRateLimitInfo& ratelimit, OPT opt_cmd, int64_t max_read_ops, int64_t max_write_ops,
+ int64_t max_read_bytes, int64_t max_write_bytes,
+ bool have_max_read_ops, bool have_max_write_ops,
+ bool have_max_read_bytes, bool have_max_write_bytes)
+{
+ bool ratelimit_configured = true;
+ switch (opt_cmd) {
+ case OPT::RATELIMIT_ENABLE:
+ case OPT::GLOBAL_RATELIMIT_ENABLE:
+ ratelimit.enabled = true;
+ break;
+
+ case OPT::RATELIMIT_SET:
+ case OPT::GLOBAL_RATELIMIT_SET:
+ ratelimit_configured = false;
+ if (have_max_read_ops) {
+ if (max_read_ops >= 0) {
+ ratelimit.max_read_ops = max_read_ops;
+ ratelimit_configured = true;
+ }
+ }
+ if (have_max_write_ops) {
+ if (max_write_ops >= 0) {
+ ratelimit.max_write_ops = max_write_ops;
+ ratelimit_configured = true;
+ }
+ }
+ if (have_max_read_bytes) {
+ if (max_read_bytes >= 0) {
+ ratelimit.max_read_bytes = max_read_bytes;
+ ratelimit_configured = true;
+ }
+ }
+ if (have_max_write_bytes) {
+ if (max_write_bytes >= 0) {
+ ratelimit.max_write_bytes = max_write_bytes;
+ ratelimit_configured = true;
+ }
+ }
+ break;
+ case OPT::RATELIMIT_DISABLE:
+ case OPT::GLOBAL_RATELIMIT_DISABLE:
+ ratelimit.enabled = false;
+ break;
+ default:
+ break;
+ }
+ return ratelimit_configured;
+}
+
+void set_quota_info(RGWQuotaInfo& quota, OPT opt_cmd, int64_t max_size, int64_t max_objects,
+ bool have_max_size, bool have_max_objects)
+{
+ switch (opt_cmd) {
+ case OPT::QUOTA_ENABLE:
+ case OPT::GLOBAL_QUOTA_ENABLE:
+ quota.enabled = true;
+
+ // falling through on purpose
+
+ case OPT::QUOTA_SET:
+ case OPT::GLOBAL_QUOTA_SET:
+ if (have_max_objects) {
+ if (max_objects < 0) {
+ quota.max_objects = -1;
+ } else {
+ quota.max_objects = max_objects;
+ }
+ }
+ if (have_max_size) {
+ if (max_size < 0) {
+ quota.max_size = -1;
+ } else {
+ quota.max_size = rgw_rounded_kb(max_size) * 1024;
+ }
+ }
+ break;
+ case OPT::QUOTA_DISABLE:
+ case OPT::GLOBAL_QUOTA_DISABLE:
+ quota.enabled = false;
+ break;
+ default:
+ break;
+ }
+}
+
+int set_bucket_quota(rgw::sal::Driver* driver, OPT opt_cmd,
+ const string& tenant_name, const string& bucket_name,
+ int64_t max_size, int64_t max_objects,
+ bool have_max_size, bool have_max_objects)
+{
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+ int r = driver->get_bucket(dpp(), nullptr, tenant_name, bucket_name, &bucket, null_yield);
+ if (r < 0) {
+ cerr << "could not get bucket info for bucket=" << bucket_name << ": " << cpp_strerror(-r) << std::endl;
+ return -r;
+ }
+
+ set_quota_info(bucket->get_info().quota, opt_cmd, max_size, max_objects, have_max_size, have_max_objects);
+
+ r = bucket->put_info(dpp(), false, real_time());
+ if (r < 0) {
+ cerr << "ERROR: failed writing bucket instance info: " << cpp_strerror(-r) << std::endl;
+ return -r;
+ }
+ return 0;
+}
+
+int set_bucket_ratelimit(rgw::sal::Driver* driver, OPT opt_cmd,
+ const string& tenant_name, const string& bucket_name,
+ int64_t max_read_ops, int64_t max_write_ops,
+ int64_t max_read_bytes, int64_t max_write_bytes,
+ bool have_max_read_ops, bool have_max_write_ops,
+ bool have_max_read_bytes, bool have_max_write_bytes)
+{
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+ int r = driver->get_bucket(dpp(), nullptr, tenant_name, bucket_name, &bucket, null_yield);
+ if (r < 0) {
+ cerr << "could not get bucket info for bucket=" << bucket_name << ": " << cpp_strerror(-r) << std::endl;
+ return -r;
+ }
+ RGWRateLimitInfo ratelimit_info;
+ auto iter = bucket->get_attrs().find(RGW_ATTR_RATELIMIT);
+ if(iter != bucket->get_attrs().end()) {
+ try {
+ bufferlist& bl = iter->second;
+ auto biter = bl.cbegin();
+ decode(ratelimit_info, biter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp(), 0) << "ERROR: failed to decode rate limit" << dendl;
+ return -EIO;
+ }
+ }
+ bool ratelimit_configured = set_ratelimit_info(ratelimit_info, opt_cmd, max_read_ops, max_write_ops,
+ max_read_bytes, max_write_bytes,
+ have_max_read_ops, have_max_write_ops,
+ have_max_read_bytes, have_max_write_bytes);
+ if (!ratelimit_configured) {
+ ldpp_dout(dpp(), 0) << "ERROR: no rate limit values have been specified" << dendl;
+ return -EINVAL;
+ }
+ bufferlist bl;
+ ratelimit_info.encode(bl);
+ rgw::sal::Attrs attr;
+ attr[RGW_ATTR_RATELIMIT] = bl;
+ r = bucket->merge_and_store_attrs(dpp(), attr, null_yield);
+ if (r < 0) {
+ cerr << "ERROR: failed writing bucket instance info: " << cpp_strerror(-r) << std::endl;
+ return -r;
+ }
+ return 0;
+}
+
+int set_user_ratelimit(OPT opt_cmd, std::unique_ptr<rgw::sal::User>& user,
+ int64_t max_read_ops, int64_t max_write_ops,
+ int64_t max_read_bytes, int64_t max_write_bytes,
+ bool have_max_read_ops, bool have_max_write_ops,
+ bool have_max_read_bytes, bool have_max_write_bytes)
+{
+ RGWRateLimitInfo ratelimit_info;
+ user->load_user(dpp(), null_yield);
+ auto iter = user->get_attrs().find(RGW_ATTR_RATELIMIT);
+ if(iter != user->get_attrs().end()) {
+ try {
+ bufferlist& bl = iter->second;
+ auto biter = bl.cbegin();
+ decode(ratelimit_info, biter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp(), 0) << "ERROR: failed to decode rate limit" << dendl;
+ return -EIO;
+ }
+ }
+ bool ratelimit_configured = set_ratelimit_info(ratelimit_info, opt_cmd, max_read_ops, max_write_ops,
+ max_read_bytes, max_write_bytes,
+ have_max_read_ops, have_max_write_ops,
+ have_max_read_bytes, have_max_write_bytes);
+ if (!ratelimit_configured) {
+ ldpp_dout(dpp(), 0) << "ERROR: no rate limit values have been specified" << dendl;
+ return -EINVAL;
+ }
+ bufferlist bl;
+ ratelimit_info.encode(bl);
+ rgw::sal::Attrs attr;
+ attr[RGW_ATTR_RATELIMIT] = bl;
+ int r = user->merge_and_store_attrs(dpp(), attr, null_yield);
+ if (r < 0) {
+ cerr << "ERROR: failed writing user instance info: " << cpp_strerror(-r) << std::endl;
+ return -r;
+ }
+ return 0;
+}
+
+int show_user_ratelimit(std::unique_ptr<rgw::sal::User>& user, Formatter *formatter)
+{
+ RGWRateLimitInfo ratelimit_info;
+ user->load_user(dpp(), null_yield);
+ auto iter = user->get_attrs().find(RGW_ATTR_RATELIMIT);
+ if(iter != user->get_attrs().end()) {
+ try {
+ bufferlist& bl = iter->second;
+ auto biter = bl.cbegin();
+ decode(ratelimit_info, biter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp(), 0) << "ERROR: failed to decode rate limit" << dendl;
+ return -EIO;
+ }
+ }
+ formatter->open_object_section("user_ratelimit");
+ encode_json("user_ratelimit", ratelimit_info, formatter);
+ formatter->close_section();
+ formatter->flush(cout);
+ cout << std::endl;
+ return 0;
+}
+
+int show_bucket_ratelimit(rgw::sal::Driver* driver, const string& tenant_name,
+ const string& bucket_name, Formatter *formatter)
+{
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+ int r = driver->get_bucket(dpp(), nullptr, tenant_name, bucket_name, &bucket, null_yield);
+ if (r < 0) {
+ cerr << "could not get bucket info for bucket=" << bucket_name << ": " << cpp_strerror(-r) << std::endl;
+ return -r;
+ }
+ RGWRateLimitInfo ratelimit_info;
+ auto iter = bucket->get_attrs().find(RGW_ATTR_RATELIMIT);
+ if (iter != bucket->get_attrs().end()) {
+ try {
+ bufferlist& bl = iter->second;
+ auto biter = bl.cbegin();
+ decode(ratelimit_info, biter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp(), 0) << "ERROR: failed to decode rate limit" << dendl;
+ return -EIO;
+ }
+ }
+ formatter->open_object_section("bucket_ratelimit");
+ encode_json("bucket_ratelimit", ratelimit_info, formatter);
+ formatter->close_section();
+ formatter->flush(cout);
+ cout << std::endl;
+ return 0;
+}
+int set_user_bucket_quota(OPT opt_cmd, RGWUser& user, RGWUserAdminOpState& op_state, int64_t max_size, int64_t max_objects,
+ bool have_max_size, bool have_max_objects)
+{
+ RGWUserInfo& user_info = op_state.get_user_info();
+
+ set_quota_info(user_info.quota.bucket_quota, opt_cmd, max_size, max_objects, have_max_size, have_max_objects);
+
+ op_state.set_bucket_quota(user_info.quota.bucket_quota);
+
+ string err;
+ int r = user.modify(dpp(), op_state, null_yield, &err);
+ if (r < 0) {
+ cerr << "ERROR: failed updating user info: " << cpp_strerror(-r) << ": " << err << std::endl;
+ return -r;
+ }
+ return 0;
+}
+
+int set_user_quota(OPT opt_cmd, RGWUser& user, RGWUserAdminOpState& op_state, int64_t max_size, int64_t max_objects,
+ bool have_max_size, bool have_max_objects)
+{
+ RGWUserInfo& user_info = op_state.get_user_info();
+
+ set_quota_info(user_info.quota.user_quota, opt_cmd, max_size, max_objects, have_max_size, have_max_objects);
+
+ op_state.set_user_quota(user_info.quota.user_quota);
+
+ string err;
+ int r = user.modify(dpp(), op_state, null_yield, &err);
+ if (r < 0) {
+ cerr << "ERROR: failed updating user info: " << cpp_strerror(-r) << ": " << err << std::endl;
+ return -r;
+ }
+ return 0;
+}
+
+int check_min_obj_stripe_size(rgw::sal::Driver* driver, rgw::sal::Object* obj, uint64_t min_stripe_size, bool *need_rewrite)
+{
+ int ret = obj->get_obj_attrs(null_yield, dpp());
+ if (ret < 0) {
+ ldpp_dout(dpp(), -1) << "ERROR: failed to stat object, returned error: " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ map<string, bufferlist>::iterator iter;
+ iter = obj->get_attrs().find(RGW_ATTR_MANIFEST);
+ if (iter == obj->get_attrs().end()) {
+ *need_rewrite = (obj->get_obj_size() >= min_stripe_size);
+ return 0;
+ }
+
+ RGWObjManifest manifest;
+
+ try {
+ bufferlist& bl = iter->second;
+ auto biter = bl.cbegin();
+ decode(manifest, biter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp(), 0) << "ERROR: failed to decode manifest" << dendl;
+ return -EIO;
+ }
+
+ map<uint64_t, RGWObjManifestPart>& objs = manifest.get_explicit_objs();
+ map<uint64_t, RGWObjManifestPart>::iterator oiter;
+ for (oiter = objs.begin(); oiter != objs.end(); ++oiter) {
+ RGWObjManifestPart& part = oiter->second;
+
+ if (part.size >= min_stripe_size) {
+ *need_rewrite = true;
+ return 0;
+ }
+ }
+ *need_rewrite = false;
+
+ return 0;
+}
+
+
+int check_obj_locator_underscore(rgw::sal::Object* obj, bool fix, bool remove_bad, Formatter *f) {
+ f->open_object_section("object");
+ f->open_object_section("key");
+ f->dump_string("type", "head");
+ f->dump_string("name", obj->get_name());
+ f->dump_string("instance", obj->get_instance());
+ f->close_section();
+
+ string oid;
+ string locator;
+
+ get_obj_bucket_and_oid_loc(obj->get_obj(), oid, locator);
+
+ f->dump_string("oid", oid);
+ f->dump_string("locator", locator);
+
+ std::unique_ptr<rgw::sal::Object::ReadOp> read_op = obj->get_read_op();
+
+ int ret = read_op->prepare(null_yield, dpp());
+ bool needs_fixing = (ret == -ENOENT);
+
+ f->dump_bool("needs_fixing", needs_fixing);
+
+ string status = (needs_fixing ? "needs_fixing" : "ok");
+
+ if ((needs_fixing || remove_bad) && fix) {
+ ret = static_cast<rgw::sal::RadosStore*>(driver)->getRados()->fix_head_obj_locator(dpp(), obj->get_bucket()->get_info(), needs_fixing, remove_bad, obj->get_key());
+ if (ret < 0) {
+ cerr << "ERROR: fix_head_object_locator() returned ret=" << ret << std::endl;
+ goto done;
+ }
+ status = "fixed";
+ }
+
+done:
+ f->dump_string("status", status);
+
+ f->close_section();
+
+ return 0;
+}
+
+int check_obj_tail_locator_underscore(RGWBucketInfo& bucket_info, rgw_obj_key& key, bool fix, Formatter *f) {
+ f->open_object_section("object");
+ f->open_object_section("key");
+ f->dump_string("type", "tail");
+ f->dump_string("name", key.name);
+ f->dump_string("instance", key.instance);
+ f->close_section();
+
+ bool needs_fixing;
+ string status;
+
+ int ret = static_cast<rgw::sal::RadosStore*>(driver)->getRados()->fix_tail_obj_locator(dpp(), bucket_info, key, fix, &needs_fixing, null_yield);
+ if (ret < 0) {
+ cerr << "ERROR: fix_tail_object_locator_underscore() returned ret=" << ret << std::endl;
+ status = "failed";
+ } else {
+ status = (needs_fixing && !fix ? "needs_fixing" : "ok");
+ }
+
+ f->dump_bool("needs_fixing", needs_fixing);
+ f->dump_string("status", status);
+
+ f->close_section();
+
+ return 0;
+}
+
+int do_check_object_locator(const string& tenant_name, const string& bucket_name,
+ bool fix, bool remove_bad, Formatter *f)
+{
+ if (remove_bad && !fix) {
+ cerr << "ERROR: can't have remove_bad specified without fix" << std::endl;
+ return -EINVAL;
+ }
+
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+ string bucket_id;
+
+ f->open_object_section("bucket");
+ f->dump_string("bucket", bucket_name);
+ int ret = init_bucket(nullptr, tenant_name, bucket_name, bucket_id, &bucket);
+ if (ret < 0) {
+ cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
+ return ret;
+ }
+ int count = 0;
+
+ int max_entries = 1000;
+
+ string prefix;
+ string delim;
+ string marker;
+ vector<rgw_bucket_dir_entry> result;
+ string ns;
+
+ rgw::sal::Bucket::ListParams params;
+ rgw::sal::Bucket::ListResults results;
+
+ params.prefix = prefix;
+ params.delim = delim;
+ params.marker = rgw_obj_key(marker);
+ params.ns = ns;
+ params.enforce_ns = true;
+ params.list_versions = true;
+
+ f->open_array_section("check_objects");
+ do {
+ ret = bucket->list(dpp(), params, max_entries - count, results, null_yield);
+ if (ret < 0) {
+ cerr << "ERROR: driver->list_objects(): " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ count += results.objs.size();
+
+ for (vector<rgw_bucket_dir_entry>::iterator iter = results.objs.begin(); iter != results.objs.end(); ++iter) {
+ std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(iter->key);
+
+ if (obj->get_name()[0] == '_') {
+ ret = check_obj_locator_underscore(obj.get(), fix, remove_bad, f);
+
+ if (ret >= 0) {
+ ret = check_obj_tail_locator_underscore(bucket->get_info(), obj->get_key(), fix, f);
+ if (ret < 0) {
+ cerr << "ERROR: check_obj_tail_locator_underscore(): " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ }
+ }
+ }
+ f->flush(cout);
+ } while (results.is_truncated && count < max_entries);
+ f->close_section();
+ f->close_section();
+
+ f->flush(cout);
+
+ return 0;
+}
+
+/// search for a matching zone/zonegroup id and return a connection if found
+static boost::optional<RGWRESTConn> get_remote_conn(rgw::sal::RadosStore* driver,
+ const RGWZoneGroup& zonegroup,
+ const std::string& remote)
+{
+ boost::optional<RGWRESTConn> conn;
+ if (remote == zonegroup.get_id()) {
+ conn.emplace(driver->ctx(), driver, remote, zonegroup.endpoints, zonegroup.api_name);
+ } else {
+ for (const auto& z : zonegroup.zones) {
+ const auto& zone = z.second;
+ if (remote == zone.id) {
+ conn.emplace(driver->ctx(), driver, remote, zone.endpoints, zonegroup.api_name);
+ break;
+ }
+ }
+ }
+ return conn;
+}
+
+/// search each zonegroup for a connection
+static boost::optional<RGWRESTConn> get_remote_conn(rgw::sal::RadosStore* driver,
+ const RGWPeriodMap& period_map,
+ const std::string& remote)
+{
+ boost::optional<RGWRESTConn> conn;
+ for (const auto& zg : period_map.zonegroups) {
+ conn = get_remote_conn(driver, zg.second, remote);
+ if (conn) {
+ break;
+ }
+ }
+ return conn;
+}
+
// Cap on the body size accepted when forwarding admin requests to a
// remote gateway — we expect a very small response.
static constexpr size_t MAX_REST_RESPONSE = 128 * 1024;
+
+static int send_to_remote_gateway(RGWRESTConn* conn, req_info& info,
+ bufferlist& in_data, JSONParser& parser)
+{
+ if (!conn) {
+ return -EINVAL;
+ }
+
+ ceph::bufferlist response;
+ rgw_user user;
+ int ret = conn->forward(dpp(), user, info, nullptr, MAX_REST_RESPONSE, &in_data, &response, null_yield);
+
+ int parse_ret = parser.parse(response.c_str(), response.length());
+ if (parse_ret < 0) {
+ cerr << "failed to parse response" << std::endl;
+ return parse_ret;
+ }
+ return ret;
+}
+
+static int send_to_url(const string& url,
+ std::optional<string> opt_region,
+ const string& access,
+ const string& secret, req_info& info,
+ bufferlist& in_data, JSONParser& parser)
+{
+ if (access.empty() || secret.empty()) {
+ cerr << "An --access-key and --secret must be provided with --url." << std::endl;
+ return -EINVAL;
+ }
+ RGWAccessKey key;
+ key.id = access;
+ key.key = secret;
+
+ param_vec_t params;
+ RGWRESTSimpleRequest req(g_ceph_context, info.method, url, NULL, &params, opt_region);
+
+ bufferlist response;
+ int ret = req.forward_request(dpp(), key, info, MAX_REST_RESPONSE, &in_data, &response, null_yield);
+
+ int parse_ret = parser.parse(response.c_str(), response.length());
+ if (parse_ret < 0) {
+ cout << "failed to parse response" << std::endl;
+ return parse_ret;
+ }
+ return ret;
+}
+
+static int send_to_remote_or_url(RGWRESTConn *conn, const string& url,
+ std::optional<string> opt_region,
+ const string& access, const string& secret,
+ req_info& info, bufferlist& in_data,
+ JSONParser& parser)
+{
+ if (url.empty()) {
+ return send_to_remote_gateway(conn, info, in_data, parser);
+ }
+ return send_to_url(url, opt_region, access, secret, info, in_data, parser);
+}
+
/// Commit a staging period. If this zone is the period's master zone the
/// commit happens locally; otherwise the period is forwarded (via `remote`
/// connection or explicit `url`) to the master zone, and the committed
/// period it returns is stored and reflected locally.
/// Returns 0 on success, negative errno on failure.
static int commit_period(rgw::sal::ConfigStore* cfgstore,
                         RGWRealm& realm, rgw::sal::RealmWriter& realm_writer,
                         RGWPeriod& period, string remote, const string& url,
                         std::optional<string> opt_region,
                         const string& access, const string& secret,
                         bool force)
{
  // a period without a master zone cannot be committed anywhere
  auto& master_zone = period.get_master_zone().id;
  if (master_zone.empty()) {
    cerr << "cannot commit period: period does not have a master zone of a master zonegroup" << std::endl;
    return -EINVAL;
  }
  // are we the period's master zone?
  if (driver->get_zone()->get_id() == master_zone) {
    // read the current period
    RGWPeriod current_period;
    int ret = cfgstore->read_period(dpp(), null_yield, realm.current_period,
                                    std::nullopt, current_period);
    if (ret < 0) {
      cerr << "failed to load current period: " << cpp_strerror(ret) << std::endl;
      return ret;
    }
    // the master zone can commit locally
    ret = rgw::commit_period(dpp(), null_yield, cfgstore, driver,
                             realm, realm_writer, current_period,
                             period, cerr, force);
    if (ret < 0) {
      cerr << "failed to commit period: " << cpp_strerror(-ret) << std::endl;
    }
    return ret;
  }

  if (remote.empty() && url.empty()) {
    // use the new master zone's connection
    remote = master_zone;
    cerr << "Sending period to new master zone " << remote << std::endl;
  }
  boost::optional<RGWRESTConn> conn;
  RGWRESTConn *remote_conn = nullptr;
  if (!remote.empty()) {
    // resolve the remote zone/zonegroup id to a REST connection
    conn = get_remote_conn(static_cast<rgw::sal::RadosStore*>(driver), period.get_map(), remote);
    if (!conn) {
      cerr << "failed to find a zone or zonegroup for remote "
           << remote << std::endl;
      return -ENOENT;
    }
    remote_conn = &*conn;
  }

  // push period to the master with an empty period id
  period.set_id(string());

  RGWEnv env;
  req_info info(g_ceph_context, &env);
  info.method = "POST";
  info.request_uri = "/admin/realm/period";

  // json format into a bufferlist
  JSONFormatter jf(false);
  encode_json("period", period, &jf);
  bufferlist bl;
  jf.flush(bl);

  JSONParser p;
  int ret = send_to_remote_or_url(remote_conn, url, opt_region, access, secret, info, bl, p);
  if (ret < 0) {
    cerr << "request failed: " << cpp_strerror(-ret) << std::endl;

    // did we parse an error message?
    auto message = p.find_obj("Message");
    if (message) {
      cerr << "Reason: " << message->get_data() << std::endl;
    }
    return ret;
  }

  // decode the response and store it back locally
  try {
    decode_json_obj(period, &p);
  } catch (const JSONDecoder::err& e) {
    cout << "failed to decode JSON input: " << e.what() << std::endl;
    return -EINVAL;
  }
  if (period.get_id().empty()) {
    cerr << "Period commit got back an empty period id" << std::endl;
    return -EINVAL;
  }
  // the master zone gave us back the period that it committed, so it's
  // safe to save it as our latest epoch
  constexpr bool exclusive = false;
  ret = cfgstore->create_period(dpp(), null_yield, exclusive, period);
  if (ret < 0) {
    cerr << "Error storing committed period " << period.get_id() << ": "
         << cpp_strerror(ret) << std::endl;
    return ret;
  }
  ret = rgw::reflect_period(dpp(), null_yield, cfgstore, period);
  if (ret < 0) {
    cerr << "Error updating local objects: " << cpp_strerror(ret) << std::endl;
    return ret;
  }
  // best-effort notification; failure is deliberately ignored
  (void) cfgstore->realm_notify_new_period(dpp(), null_yield, period);
  return ret;
}
+
+static int update_period(rgw::sal::ConfigStore* cfgstore,
+ const string& realm_id, const string& realm_name,
+ const string& period_epoch, bool commit,
+ const string& remote, const string& url,
+ std::optional<string> opt_region,
+ const string& access, const string& secret,
+ Formatter *formatter, bool force)
+{
+ RGWRealm realm;
+ std::unique_ptr<rgw::sal::RealmWriter> realm_writer;
+ int ret = rgw::read_realm(dpp(), null_yield, cfgstore,
+ realm_id, realm_name,
+ realm, &realm_writer);
+ if (ret < 0) {
+ cerr << "failed to load realm " << cpp_strerror(-ret) << std::endl;
+ return ret;
+ }
+ std::optional<epoch_t> epoch;
+ if (!period_epoch.empty()) {
+ epoch = atoi(period_epoch.c_str());
+ }
+ RGWPeriod period;
+ ret = cfgstore->read_period(dpp(), null_yield, realm.current_period,
+ epoch, period);
+ if (ret < 0) {
+ cerr << "failed to load current period: " << cpp_strerror(-ret) << std::endl;
+ return ret;
+ }
+ // convert to the realm's staging period
+ rgw::fork_period(dpp(), period);
+ // update the staging period with all of the realm's zonegroups
+ ret = rgw::update_period(dpp(), null_yield, cfgstore, period);
+ if (ret < 0) {
+ return ret;
+ }
+
+ constexpr bool exclusive = false;
+ ret = cfgstore->create_period(dpp(), null_yield, exclusive, period);
+ if (ret < 0) {
+ cerr << "failed to driver period: " << cpp_strerror(-ret) << std::endl;
+ return ret;
+ }
+ if (commit) {
+ ret = commit_period(cfgstore, realm, *realm_writer, period, remote, url,
+ opt_region, access, secret, force);
+ if (ret < 0) {
+ cerr << "failed to commit period: " << cpp_strerror(-ret) << std::endl;
+ return ret;
+ }
+ }
+ encode_json("period", period, formatter);
+ formatter->flush(cout);
+ return 0;
+}
+
+static int init_bucket_for_sync(rgw::sal::User* user,
+ const string& tenant, const string& bucket_name,
+ const string& bucket_id,
+ std::unique_ptr<rgw::sal::Bucket>* bucket)
+{
+ int ret = init_bucket(user, tenant, bucket_name, bucket_id, bucket);
+ if (ret < 0) {
+ cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
+ return ret;
+ }
+
+ return 0;
+}
+
/// Pull a period from a remote gateway (via connection or explicit URL),
/// decode it into *period, and store it in the local config store.
/// Returns 0 on success (even if local storage fails — see note below),
/// negative errno on fetch/decode failure.
static int do_period_pull(rgw::sal::ConfigStore* cfgstore,
                          RGWRESTConn *remote_conn, const string& url,
                          std::optional<string> opt_region,
                          const string& access_key, const string& secret_key,
                          const string& realm_id, const string& realm_name,
                          const string& period_id, const string& period_epoch,
                          RGWPeriod *period)
{
  RGWEnv env;
  req_info info(g_ceph_context, &env);
  info.method = "GET";
  info.request_uri = "/admin/realm/period";

  // only forward the selectors the caller actually provided
  map<string, string> &params = info.args.get_params();
  if (!realm_id.empty())
    params["realm_id"] = realm_id;
  if (!realm_name.empty())
    params["realm_name"] = realm_name;
  if (!period_id.empty())
    params["period_id"] = period_id;
  if (!period_epoch.empty())
    params["epoch"] = period_epoch;

  bufferlist bl;
  JSONParser p;
  int ret = send_to_remote_or_url(remote_conn, url, opt_region, access_key, secret_key,
                                  info, bl, p);
  if (ret < 0) {
    cerr << "request failed: " << cpp_strerror(-ret) << std::endl;
    return ret;
  }
  try {
    decode_json_obj(*period, &p);
  } catch (const JSONDecoder::err& e) {
    cout << "failed to decode JSON input: " << e.what() << std::endl;
    return -EINVAL;
  }
  constexpr bool exclusive = false;
  ret = cfgstore->create_period(dpp(), null_yield, exclusive, *period);
  if (ret < 0) {
    cerr << "Error storing period " << period->get_id() << ": " << cpp_strerror(ret) << std::endl;
  }
  // NOTE(review): a local storage failure is reported but not propagated —
  // the pulled period is still returned to the caller. Confirm this
  // best-effort behavior is intended (commit_period treats the same
  // create_period failure as fatal).
  return 0;
}
+
// Move any accumulated text in ss onto the back of l, then reset ss to
// empty. A stream with no content adds nothing to the list.
void flush_ss(stringstream& ss, list<string>& l)
{
  auto pending = ss.str();
  if (!pending.empty()) {
    l.push_back(std::move(pending));
  }
  ss.str(string());
}
+
// Flush any pending text in ss into l, then optionally start the next
// line with a `tab`-wide indent. Returns ss so callers can stream the
// next line's content directly into it.
stringstream& push_ss(stringstream& ss, list<string>& l, int tab = 0)
{
  // flush: move the previous line (if any) into the list and reset
  if (!ss.str().empty()) {
    l.push_back(ss.str());
  }
  ss.str("");
  // pre-pad the new line when an indent was requested
  if (tab > 0) {
    ss << setw(tab) << "" << setw(1);
  }
  return ss;
}
+
/// Build a human-readable metadata sync status report into `status`,
/// one line per entry: overall state, full/incremental shard counts,
/// and how far behind the master this zone is.
static void get_md_sync_status(list<string>& status)
{
  RGWMetaSyncStatusManager sync(static_cast<rgw::sal::RadosStore*>(driver), static_cast<rgw::sal::RadosStore*>(driver)->svc()->rados->get_async_processor());

  int ret = sync.init(dpp());
  if (ret < 0) {
    status.push_back(string("failed to retrieve sync info: sync.init() failed: ") + cpp_strerror(-ret));
    return;
  }

  rgw_meta_sync_status sync_status;
  ret = sync.read_sync_status(dpp(), &sync_status);
  if (ret < 0) {
    status.push_back(string("failed to read sync status: ") + cpp_strerror(-ret));
    return;
  }

  // overall sync state
  string status_str;
  switch (sync_status.sync_info.state) {
    case rgw_meta_sync_info::StateInit:
      status_str = "init";
      break;
    case rgw_meta_sync_info::StateBuildingFullSyncMaps:
      status_str = "preparing for full sync";
      break;
    case rgw_meta_sync_info::StateSync:
      status_str = "syncing";
      break;
    default:
      status_str = "unknown";
  }

  status.push_back(status_str);

  uint64_t full_total = 0;
  uint64_t full_complete = 0;

  int num_full = 0;
  int num_inc = 0;
  int total_shards = 0;
  set<int> shards_behind_set;

  // tally per-shard markers: shards still in full sync count as behind
  for (auto marker_iter : sync_status.sync_markers) {
    full_total += marker_iter.second.total_entries;
    total_shards++;
    if (marker_iter.second.state == rgw_meta_sync_marker::SyncState::FullSync) {
      num_full++;
      full_complete += marker_iter.second.pos;
      int shard_id = marker_iter.first;
      shards_behind_set.insert(shard_id);
    } else {
      // shard finished full sync; count all of its entries as complete
      full_complete += marker_iter.second.total_entries;
    }
    if (marker_iter.second.state == rgw_meta_sync_marker::SyncState::IncrementalSync) {
      num_inc++;
    }
  }

  stringstream ss;
  push_ss(ss, status) << "full sync: " << num_full << "/" << total_shards << " shards";

  if (num_full > 0) {
    push_ss(ss, status) << "full sync: " << full_total - full_complete << " entries to sync";
  }

  push_ss(ss, status) << "incremental sync: " << num_inc << "/" << total_shards << " shards";

  map<int, RGWMetadataLogInfo> master_shards_info;
  string master_period = static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone->get_current_period_id();

  ret = sync.read_master_log_shards_info(dpp(), master_period, &master_shards_info);
  if (ret < 0) {
    status.push_back(string("failed to fetch master sync status: ") + cpp_strerror(-ret));
    return;
  }

  // compare our incremental markers with the master's log positions
  map<int, string> shards_behind;
  if (sync_status.sync_info.period != master_period) {
    status.push_back(string("master is on a different period: master_period=" +
                            master_period + " local_period=" + sync_status.sync_info.period));
  } else {
    for (auto local_iter : sync_status.sync_markers) {
      int shard_id = local_iter.first;
      auto iter = master_shards_info.find(shard_id);

      if (iter == master_shards_info.end()) {
        /* huh? */
        derr << "ERROR: could not find remote sync shard status for shard_id=" << shard_id << dendl;
        continue;
      }
      auto master_marker = iter->second.marker;
      // a shard is behind when the master's marker is ahead of ours
      if (local_iter.second.state == rgw_meta_sync_marker::SyncState::IncrementalSync &&
          master_marker > local_iter.second.marker) {
        shards_behind[shard_id] = local_iter.second.marker;
        shards_behind_set.insert(shard_id);
      }
    }
  }

  // fetch remote log entries to determine the oldest change
  std::optional<std::pair<int, ceph::real_time>> oldest;
  if (!shards_behind.empty()) {
    map<int, rgw_mdlog_shard_data> master_pos;
    ret = sync.read_master_log_shards_next(dpp(), sync_status.sync_info.period, shards_behind, &master_pos);
    if (ret < 0) {
      derr << "ERROR: failed to fetch master next positions (" << cpp_strerror(-ret) << ")" << dendl;
    } else {
      for (auto iter : master_pos) {
        rgw_mdlog_shard_data& shard_data = iter.second;

        if (shard_data.entries.empty()) {
          // there aren't any entries in this shard, so we're not really behind
          shards_behind.erase(iter.first);
          shards_behind_set.erase(iter.first);
        } else {
          // track the oldest (shard, timestamp) pair across behind shards
          rgw_mdlog_entry& entry = shard_data.entries.front();
          if (!oldest) {
            oldest.emplace(iter.first, entry.timestamp);
          } else if (!ceph::real_clock::is_zero(entry.timestamp) && entry.timestamp < oldest->second) {
            oldest.emplace(iter.first, entry.timestamp);
          }
        }
      }
    }
  }

  // shards not yet in incremental sync also count as behind
  int total_behind = shards_behind.size() + (sync_status.sync_info.num_shards - num_inc);
  if (total_behind == 0) {
    push_ss(ss, status) << "metadata is caught up with master";
  } else {
    push_ss(ss, status) << "metadata is behind on " << total_behind << " shards";
    push_ss(ss, status) << "behind shards: " << "[" << shards_behind_set << "]";
    if (oldest) {
      push_ss(ss, status) << "oldest incremental change not applied: "
          << oldest->second << " [" << oldest->first << ']';
    }
  }

  flush_ss(ss, status);
}
+
+static void get_data_sync_status(const rgw_zone_id& source_zone, list<string>& status, int tab)
+{
+ stringstream ss;
+
+ RGWZone *sz;
+
+ if (!(sz = static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone->find_zone(source_zone))) {
+ push_ss(ss, status, tab) << string("zone not found");
+ flush_ss(ss, status);
+ return;
+ }
+
+ if (!static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone->zone_syncs_from(*sz)) {
+ push_ss(ss, status, tab) << string("not syncing from zone");
+ flush_ss(ss, status);
+ return;
+ }
+ RGWDataSyncStatusManager sync(static_cast<rgw::sal::RadosStore*>(driver), static_cast<rgw::sal::RadosStore*>(driver)->svc()->rados->get_async_processor(), source_zone, nullptr);
+
+ int ret = sync.init(dpp());
+ if (ret < 0) {
+ push_ss(ss, status, tab) << string("failed to retrieve sync info: ") + cpp_strerror(-ret);
+ flush_ss(ss, status);
+ return;
+ }
+
+ rgw_data_sync_status sync_status;
+ ret = sync.read_sync_status(dpp(), &sync_status);
+ if (ret < 0 && ret != -ENOENT) {
+ push_ss(ss, status, tab) << string("failed read sync status: ") + cpp_strerror(-ret);
+ return;
+ }
+
+ set<int> recovering_shards;
+ ret = sync.read_recovering_shards(dpp(), sync_status.sync_info.num_shards, recovering_shards);
+ if (ret < 0 && ret != ENOENT) {
+ push_ss(ss, status, tab) << string("failed read recovering shards: ") + cpp_strerror(-ret);
+ return;
+ }
+
+ string status_str;
+ switch (sync_status.sync_info.state) {
+ case rgw_data_sync_info::StateInit:
+ status_str = "init";
+ break;
+ case rgw_data_sync_info::StateBuildingFullSyncMaps:
+ status_str = "preparing for full sync";
+ break;
+ case rgw_data_sync_info::StateSync:
+ status_str = "syncing";
+ break;
+ default:
+ status_str = "unknown";
+ }
+
+ push_ss(ss, status, tab) << status_str;
+
+ uint64_t full_total = 0;
+ uint64_t full_complete = 0;
+
+ int num_full = 0;
+ int num_inc = 0;
+ int total_shards = 0;
+ set<int> shards_behind_set;
+
+ for (auto marker_iter : sync_status.sync_markers) {
+ full_total += marker_iter.second.total_entries;
+ total_shards++;
+ if (marker_iter.second.state == rgw_data_sync_marker::SyncState::FullSync) {
+ num_full++;
+ full_complete += marker_iter.second.pos;
+ int shard_id = marker_iter.first;
+ shards_behind_set.insert(shard_id);
+ } else {
+ full_complete += marker_iter.second.total_entries;
+ }
+ if (marker_iter.second.state == rgw_data_sync_marker::SyncState::IncrementalSync) {
+ num_inc++;
+ }
+ }
+
+ push_ss(ss, status, tab) << "full sync: " << num_full << "/" << total_shards << " shards";
+
+ if (num_full > 0) {
+ push_ss(ss, status, tab) << "full sync: " << full_total - full_complete << " buckets to sync";
+ }
+
+ push_ss(ss, status, tab) << "incremental sync: " << num_inc << "/" << total_shards << " shards";
+
+ map<int, RGWDataChangesLogInfo> source_shards_info;
+
+ ret = sync.read_source_log_shards_info(dpp(), &source_shards_info);
+ if (ret < 0) {
+ push_ss(ss, status, tab) << string("failed to fetch source sync status: ") + cpp_strerror(-ret);
+ return;
+ }
+
+ map<int, string> shards_behind;
+
+ for (auto local_iter : sync_status.sync_markers) {
+ int shard_id = local_iter.first;
+ auto iter = source_shards_info.find(shard_id);
+
+ if (iter == source_shards_info.end()) {
+ /* huh? */
+ derr << "ERROR: could not find remote sync shard status for shard_id=" << shard_id << dendl;
+ continue;
+ }
+ auto master_marker = iter->second.marker;
+ if (local_iter.second.state == rgw_data_sync_marker::SyncState::IncrementalSync &&
+ master_marker > local_iter.second.marker) {
+ shards_behind[shard_id] = local_iter.second.marker;
+ shards_behind_set.insert(shard_id);
+ }
+ }
+
+ std::optional<std::pair<int, ceph::real_time>> oldest;
+ if (!shards_behind.empty()) {
+ map<int, rgw_datalog_shard_data> master_pos;
+ ret = sync.read_source_log_shards_next(dpp(), shards_behind, &master_pos);
+
+ if (ret < 0) {
+ derr << "ERROR: failed to fetch next positions (" << cpp_strerror(-ret) << ")" << dendl;
+ } else {
+ for (auto iter : master_pos) {
+ rgw_datalog_shard_data& shard_data = iter.second;
+ if (shard_data.entries.empty()) {
+ // there aren't any entries in this shard, so we're not really behind
+ shards_behind.erase(iter.first);
+ shards_behind_set.erase(iter.first);
+ } else {
+ rgw_datalog_entry& entry = shard_data.entries.front();
+ if (!oldest) {
+ oldest.emplace(iter.first, entry.timestamp);
+ } else if (!ceph::real_clock::is_zero(entry.timestamp) && entry.timestamp < oldest->second) {
+ oldest.emplace(iter.first, entry.timestamp);
+ }
+ }
+ }
+ }
+ }
+
+ int total_behind = shards_behind.size() + (sync_status.sync_info.num_shards - num_inc);
+ int total_recovering = recovering_shards.size();
+
+ if (total_behind == 0 && total_recovering == 0) {
+ push_ss(ss, status, tab) << "data is caught up with source";
+ } else if (total_behind > 0) {
+ push_ss(ss, status, tab) << "data is behind on " << total_behind << " shards";
+ push_ss(ss, status, tab) << "behind shards: " << "[" << shards_behind_set << "]" ;
+ if (oldest) {
+ push_ss(ss, status, tab) << "oldest incremental change not applied: "
+ << oldest->second << " [" << oldest->first << ']';
+ }
+ }
+
+ if (total_recovering > 0) {
+ push_ss(ss, status, tab) << total_recovering << " shards are recovering";
+ push_ss(ss, status, tab) << "recovering shards: " << "[" << recovering_shards << "]";
+ }
+
+ flush_ss(ss, status);
+}
+
// Print a two-column table to stdout: `header` appears (right-aligned to
// `width`) on the first row only; subsequent rows are blank-padded to the
// same width so the entries line up.
static void tab_dump(const string& header, int width, const list<string>& entries)
{
  string s = header;

  // iterate by const reference — the old `auto e` copied every string
  for (const auto& e : entries) {
    cout << std::setw(width) << s << std::setw(1) << " " << e << std::endl;
    s.clear();  // only the first row shows the header
  }
}
+
+// return features that are supported but not enabled
+static auto get_disabled_features(const rgw::zone_features::set& enabled) {
+ auto features = rgw::zone_features::set{rgw::zone_features::supported.begin(),
+ rgw::zone_features::supported.end()};
+ for (const auto& feature : enabled) {
+ features.erase(feature);
+ }
+ return features;
+}
+
+
/// Print a full multisite sync status report to stdout: realm/zonegroup/
/// zone identity, zonegroup features, metadata sync status against the
/// master, and data sync status for each connected source zone.
static void sync_status(Formatter *formatter)
{
  const rgw::sal::ZoneGroup& zonegroup = driver->get_zone()->get_zonegroup();
  rgw::sal::Zone* zone = driver->get_zone();

  int width = 15;  // label column width

  // identity header
  cout << std::setw(width) << "realm" << std::setw(1) << " " << zone->get_realm_id() << " (" << zone->get_realm_name() << ")" << std::endl;
  cout << std::setw(width) << "zonegroup" << std::setw(1) << " " << zonegroup.get_id() << " (" << zonegroup.get_name() << ")" << std::endl;
  cout << std::setw(width) << "zone" << std::setw(1) << " " << zone->get_id() << " (" << zone->get_name() << ")" << std::endl;
  cout << std::setw(width) << "current time" << std::setw(1) << " "
       << to_iso_8601(ceph::real_clock::now(), iso_8601_format::YMDhms) << std::endl;

  const auto& rzg =
    static_cast<const rgw::sal::RadosZoneGroup&>(zonegroup).get_group();

  cout << std::setw(width) << "zonegroup features enabled: " << rzg.enabled_features << std::endl;
  if (auto d = get_disabled_features(rzg.enabled_features); !d.empty()) {
    cout << std::setw(width) << "                   disabled: " << d << std::endl;
  }

  list<string> md_status;

  // the metadata master has nothing to sync from
  if (driver->is_meta_master()) {
    md_status.push_back("no sync (zone is master)");
  } else {
    get_md_sync_status(md_status);
  }

  tab_dump("metadata sync", width, md_status);

  list<string> data_status;

  auto& zone_conn_map = static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone->get_zone_conn_map();

  // one data-sync section per connected source zone
  for (auto iter : zone_conn_map) {
    const rgw_zone_id& source_id = iter.first;
    string source_str = "source: ";
    string s = source_str + source_id.id;
    std::unique_ptr<rgw::sal::Zone> sz;
    // append the zone's name when we can resolve the id
    if (driver->get_zone()->get_zonegroup().get_zone_by_id(source_id.id, &sz) == 0) {
      s += string(" (") + sz->get_name() + ")";
    }
    data_status.push_back(s);
    get_data_sync_status(source_id, data_status, source_str.size());
  }

  tab_dump("data sync", width, data_status);
}
+
// Stream manipulator for aligned two-column output: writes `header`
// right-aligned to width `w`, followed by a single space.
struct indented {
  int w; // indent width
  std::string_view header;
  indented(int w, std::string_view header = "") : w(w), header(header) {}
};
std::ostream& operator<<(std::ostream& out, const indented& h) {
  out << std::setw(h.w) << h.header;
  out << std::setw(1) << ' ';
  return out;
}
+
/// Report this bucket's sync status against one source zone to `out`:
/// full-sync progress, or per-shard incremental progress compared with the
/// remote bilog, with a fallback path for pre-upgrade per-shard status.
/// Returns 0 on success, negative errno on failure.
static int bucket_source_sync_status(const DoutPrefixProvider *dpp, rgw::sal::RadosStore* driver, const RGWZone& zone,
                                     const RGWZone& source, RGWRESTConn *conn,
                                     const RGWBucketInfo& bucket_info,
                                     rgw_sync_bucket_pipe pipe,
                                     int width, std::ostream& out)
{
  out << indented{width, "source zone"} << source.id << " (" << source.name << ")" << std::endl;

  // syncing from this zone?
  if (!driver->svc()->zone->zone_syncs_from(zone, source)) {
    out << indented{width} << "does not sync from zone\n";
    return 0;
  }

  if (!pipe.source.bucket) {
    ldpp_dout(dpp, -1) << __func__ << "(): missing source bucket" << dendl;
    return -EINVAL;
  }

  std::unique_ptr<rgw::sal::Bucket> source_bucket;
  int r = init_bucket(nullptr, *pipe.source.bucket, &source_bucket);
  if (r < 0) {
    ldpp_dout(dpp, -1) << "failed to read source bucket info: " << cpp_strerror(r) << dendl;
    return r;
  }

  out << indented{width, "source bucket"} << source_bucket->get_key() << std::endl;
  // pin the pipe endpoints to the concrete buckets before querying status
  pipe.source.bucket = source_bucket->get_key();

  pipe.dest.bucket = bucket_info.bucket;

  uint64_t gen = 0;
  std::vector<rgw_bucket_shard_sync_info> shard_status;

  // check for full sync status
  rgw_bucket_sync_status full_status;
  r = rgw_read_bucket_full_sync_status(dpp, driver, pipe, &full_status, null_yield);
  if (r >= 0) {
    // full status exists: Init/Stopped/Full states end the report here;
    // Incremental falls through to the per-shard query below
    if (full_status.state == BucketSyncState::Init) {
      out << indented{width} << "init: bucket sync has not started\n";
      return 0;
    }
    if (full_status.state == BucketSyncState::Stopped) {
      out << indented{width} << "stopped: bucket sync is disabled\n";
      return 0;
    }
    if (full_status.state == BucketSyncState::Full) {
      out << indented{width} << "full sync: " << full_status.full.count << " objects completed\n";
      return 0;
    }
    gen = full_status.incremental_gen;
    shard_status.resize(full_status.shards_done_with_gen.size());
  } else if (r == -ENOENT) {
    // no full status, but there may be per-shard status from before upgrade
    const auto& logs = source_bucket->get_info().layout.logs;
    if (logs.empty()) {
      out << indented{width} << "init: bucket sync has not started\n";
      return 0;
    }
    const auto& log = logs.front();
    if (log.gen > 0) {
      // this isn't the backward-compatible case, so we just haven't started yet
      out << indented{width} << "init: bucket sync has not started\n";
      return 0;
    }
    if (log.layout.type != rgw::BucketLogType::InIndex) {
      ldpp_dout(dpp, -1) << "unrecognized log layout type " << log.layout.type << dendl;
      return -EINVAL;
    }
    // use shard count from our log gen=0
    shard_status.resize(rgw::num_shards(log.layout.in_index));
  } else {
    lderr(driver->ctx()) << "failed to read bucket full sync status: " << cpp_strerror(r) << dendl;
    return r;
  }

  r = rgw_read_bucket_inc_sync_status(dpp, driver, pipe, gen, &shard_status);
  if (r < 0) {
    lderr(driver->ctx()) << "failed to read bucket incremental sync status: " << cpp_strerror(r) << dendl;
    return r;
  }

  const int total_shards = shard_status.size();

  out << indented{width} << "incremental sync on " << total_shards << " shards\n";

  // fetch the remote bilog markers to compare against our positions
  rgw_bucket_index_marker_info remote_info;
  BucketIndexShardsManager remote_markers;
  r = rgw_read_remote_bilog_info(dpp, conn, source_bucket->get_key(),
                                 remote_info, remote_markers, null_yield);
  if (r < 0) {
    ldpp_dout(dpp, -1) << "failed to read remote log: " << cpp_strerror(r) << dendl;
    return r;
  }

  std::set<int> shards_behind;
  for (const auto& r : remote_markers.get()) {
    auto shard_id = r.first;
    if (r.second.empty()) {
      continue; // empty bucket index shard
    }
    if (shard_id >= total_shards) {
      // unexpected shard id. we don't have status for it, so we're behind
      shards_behind.insert(shard_id);
      continue;
    }
    auto& m = shard_status[shard_id];
    // behind when our incremental position trails the remote marker
    const auto pos = BucketIndexShardsManager::get_shard_marker(m.inc_marker.position);
    if (pos < r.second) {
      shards_behind.insert(shard_id);
    }
  }
  if (!shards_behind.empty()) {
    out << indented{width} << "bucket is behind on " << shards_behind.size() << " shards\n";
    out << indented{width} << "behind shards: [" << shards_behind << "]\n" ;
  } else {
    out << indented{width} << "bucket is caught up with source\n";
  }
  return 0;
}
+
+void encode_json(const char *name, const RGWBucketSyncFlowManager::pipe_set& pset, Formatter *f)
+{
+ Formatter::ObjectSection top_section(*f, name);
+ Formatter::ArraySection as(*f, "entries");
+
+ for (auto& pipe_handler : pset) {
+ Formatter::ObjectSection hs(*f, "handler");
+ encode_json("source", pipe_handler.source, f);
+ encode_json("dest", pipe_handler.dest, f);
+ }
+}
+
+static std::vector<string> convert_bucket_set_to_str_vec(const std::set<rgw_bucket>& bs)
+{
+ std::vector<string> result;
+ result.reserve(bs.size());
+ for (auto& b : bs) {
+ result.push_back(b.get_key());
+ }
+ return result;
+}
+
+static void get_hint_entities(const std::set<rgw_zone_id>& zones, const std::set<rgw_bucket>& buckets,
+ std::set<rgw_sync_bucket_entity> *hint_entities)
+{
+ for (auto& zone_id : zones) {
+ for (auto& b : buckets) {
+ std::unique_ptr<rgw::sal::Bucket> hint_bucket;
+ int ret = init_bucket(nullptr, b, &hint_bucket);
+ if (ret < 0) {
+ ldpp_dout(dpp(), 20) << "could not init bucket info for hint bucket=" << b << " ... skipping" << dendl;
+ continue;
+ }
+
+ hint_entities->insert(rgw_sync_bucket_entity(zone_id, hint_bucket->get_key()));
+ }
+ }
+}
+
+static rgw_zone_id resolve_zone_id(const string& s)
+{
+ std::unique_ptr<rgw::sal::Zone> zone;
+ int ret = driver->get_zone()->get_zonegroup().get_zone_by_id(s, &zone);
+ if (ret < 0)
+ ret = driver->get_zone()->get_zonegroup().get_zone_by_name(s, &zone);
+ if (ret < 0)
+ return rgw_zone_id(s);
+
+ return rgw_zone_id(zone->get_id());
+}
+
+rgw_zone_id validate_zone_id(const rgw_zone_id& zone_id)
+{
+ return resolve_zone_id(zone_id.id);
+}
+
/// Dump the effective sync policy (sources, dests, hints, resolved hints)
/// for a zone — or, when `opt_bucket` is set, for that bucket within the
/// zone — as JSON on stdout. Returns 0 on success, negative errno on error.
static int sync_info(std::optional<rgw_zone_id> opt_target_zone, std::optional<rgw_bucket> opt_bucket, Formatter *formatter)
{
  rgw_zone_id zone_id = opt_target_zone.value_or(driver->get_zone()->get_id());

  auto zone_policy_handler = driver->get_zone()->get_sync_policy_handler();

  RGWBucketSyncPolicyHandlerRef bucket_handler;

  std::optional<rgw_bucket> eff_bucket = opt_bucket;

  auto handler = zone_policy_handler;

  // narrow the zone-level handler to a bucket-level one when requested
  if (eff_bucket) {
    std::unique_ptr<rgw::sal::Bucket> bucket;

    int ret = init_bucket(nullptr, *eff_bucket, &bucket);
    if (ret < 0 && ret != -ENOENT) {
      cerr << "ERROR: init_bucket failed: " << cpp_strerror(-ret) << std::endl;
      return ret;
    }

    if (ret >= 0) {
      rgw::sal::Attrs attrs = bucket->get_attrs();
      bucket_handler.reset(handler->alloc_child(bucket->get_info(), std::move(attrs)));
    } else {
      // bucket doesn't exist: build a handler for it anyway so the policy
      // that WOULD apply can be shown
      cerr << "WARNING: bucket not found, simulating result" << std::endl;
      bucket_handler.reset(handler->alloc_child(*eff_bucket, nullopt));
    }

    ret = bucket_handler->init(dpp(), null_yield);
    if (ret < 0) {
      cerr << "ERROR: failed to init bucket sync policy handler: " << cpp_strerror(-ret) << " (ret=" << ret << ")" << std::endl;
      return ret;
    }

    handler = bucket_handler;
  }

  std::set<rgw_sync_bucket_pipe> sources;
  std::set<rgw_sync_bucket_pipe> dests;

  handler->get_pipes(&sources, &dests, std::nullopt);

  auto source_hints_vec = convert_bucket_set_to_str_vec(handler->get_source_hints());
  auto target_hints_vec = convert_bucket_set_to_str_vec(handler->get_target_hints());

  std::set<rgw_sync_bucket_pipe> resolved_sources;
  std::set<rgw_sync_bucket_pipe> resolved_dests;

  rgw_sync_bucket_entity self_entity(zone_id, opt_bucket);

  set<rgw_zone_id> source_zones;
  set<rgw_zone_id> target_zones;

  zone_policy_handler->reflect(dpp(), nullptr, nullptr,
                               nullptr, nullptr,
                               &source_zones,
                               &target_zones,
                               false); /* relaxed: also get all zones that we allow to sync to/from */

  std::set<rgw_sync_bucket_entity> hint_entities;

  get_hint_entities(source_zones, handler->get_source_hints(), &hint_entities);
  get_hint_entities(target_zones, handler->get_target_hints(), &hint_entities);

  // resolve each hint entity through the remote bucket's own policy
  for (auto& hint_entity : hint_entities) {
    if (!hint_entity.zone ||
        !hint_entity.bucket) {
      continue; /* shouldn't really happen */
    }

    auto zid = validate_zone_id(*hint_entity.zone);
    auto& hint_bucket = *hint_entity.bucket;

    RGWBucketSyncPolicyHandlerRef hint_bucket_handler;
    int r = driver->get_sync_policy_handler(dpp(), zid, hint_bucket, &hint_bucket_handler, null_yield);
    if (r < 0) {
      ldpp_dout(dpp(), 20) << "could not get bucket sync policy handler for hint bucket=" << hint_bucket << " ... skipping" << dendl;
      continue;
    }

    hint_bucket_handler->get_pipes(&resolved_dests,
                                   &resolved_sources,
                                   self_entity); /* flipping resolved dests and sources as these are
                                                    relative to the remote entity */
  }

  {
    Formatter::ObjectSection os(*formatter, "result");
    encode_json("sources", sources, formatter);
    encode_json("dests", dests, formatter);
    {
      Formatter::ObjectSection hints_section(*formatter, "hints");
      encode_json("sources", source_hints_vec, formatter);
      encode_json("dests", target_hints_vec, formatter);
    }
    {
      Formatter::ObjectSection resolved_hints_section(*formatter, "resolved-hints-1");
      encode_json("sources", resolved_sources, formatter);
      encode_json("dests", resolved_dests, formatter);
    }
    {
      Formatter::ObjectSection resolved_hints_section(*formatter, "resolved-hints");
      encode_json("sources", handler->get_resolved_source_hints(), formatter);
      encode_json("dests", handler->get_resolved_dest_hints(), formatter);
    }
  }

  formatter->flush(cout);

  return 0;
}
+
/// Print a summary of a bucket's sync configuration to `out`: realm/
/// zonegroup/zone identity, then for each source zone the buckets it
/// syncs from. Returns 0 on success, negative errno on failure.
static int bucket_sync_info(rgw::sal::Driver* driver, const RGWBucketInfo& info,
                            std::ostream& out)
{
  const rgw::sal::ZoneGroup& zonegroup = driver->get_zone()->get_zonegroup();
  rgw::sal::Zone* zone = driver->get_zone();
  constexpr int width = 15;  // label column width

  out << indented{width, "realm"} << zone->get_realm_id() << " (" << zone->get_realm_name() << ")\n";
  out << indented{width, "zonegroup"} << zonegroup.get_id() << " (" << zonegroup.get_name() << ")\n";
  out << indented{width, "zone"} << zone->get_id() << " (" << zone->get_name() << ")\n";
  out << indented{width, "bucket"} << info.bucket << "\n\n";

  // nothing to report when the bucket doesn't import data from anywhere
  if (!static_cast<rgw::sal::RadosStore*>(driver)->ctl()->bucket->bucket_imports_data(info.bucket, null_yield, dpp())) {
    out << "Sync is disabled for bucket " << info.bucket.name << '\n';
    return 0;
  }

  RGWBucketSyncPolicyHandlerRef handler;

  int r = driver->get_sync_policy_handler(dpp(), std::nullopt, info.bucket, &handler, null_yield);
  if (r < 0) {
    ldpp_dout(dpp(), -1) << "ERROR: failed to get policy handler for bucket (" << info.bucket << "): r=" << r << ": " << cpp_strerror(-r) << dendl;
    return r;
  }

  auto& sources = handler->get_sources();

  // one section per source zone, listing the buckets synced from it
  for (auto& m : sources) {
    auto& zone = m.first;
    out << indented{width, "source zone"} << zone << std::endl;
    for (auto& pipe_handler : m.second) {
      out << indented{width, "bucket"} << *pipe_handler.source.bucket << std::endl;
    }
  }

  return 0;
}
+
// Print per-source-zone sync status for 'info.bucket'. When
// 'source_zone_id' is non-empty only that zone is examined; otherwise
// every zone in the zonegroup is considered. 'opt_source_bucket', if
// set, restricts the report to pipes whose source is that bucket.
// Returns 0 on success (including the "sync disabled" case) or a
// negative error code.
static int bucket_sync_status(rgw::sal::Driver* driver, const RGWBucketInfo& info,
                              const rgw_zone_id& source_zone_id,
			      std::optional<rgw_bucket>& opt_source_bucket,
			      std::ostream& out)
{
  const rgw::sal::ZoneGroup& zonegroup = driver->get_zone()->get_zonegroup();
  rgw::sal::Zone* zone = driver->get_zone();
  constexpr int width = 15;  // label column width used by 'indented'

  // Header: where we are (realm/zonegroup/zone), which bucket, and when.
  out << indented{width, "realm"} << zone->get_realm_id() << " (" << zone->get_realm_name() << ")\n";
  out << indented{width, "zonegroup"} << zonegroup.get_id() << " (" << zonegroup.get_name() << ")\n";
  out << indented{width, "zone"} << zone->get_id() << " (" << zone->get_name() << ")\n";
  out << indented{width, "bucket"} << info.bucket << "\n";
  out << indented{width, "current time"}
    << to_iso_8601(ceph::real_clock::now(), iso_8601_format::YMDhms) << "\n\n";


  // Nothing to report for buckets that do not import data.
  if (!static_cast<rgw::sal::RadosStore*>(driver)->ctl()->bucket->bucket_imports_data(info.bucket, null_yield, dpp())) {
    out << "Sync is disabled for bucket " << info.bucket.name << " or bucket has no sync sources" << std::endl;
    return 0;
  }

  RGWBucketSyncPolicyHandlerRef handler;

  int r = driver->get_sync_policy_handler(dpp(), std::nullopt, info.bucket, &handler, null_yield);
  if (r < 0) {
    ldpp_dout(dpp(), -1) << "ERROR: failed to get policy handler for bucket (" << info.bucket << "): r=" << r << ": " << cpp_strerror(-r) << dendl;
    return r;
  }

  auto sources = handler->get_all_sources();

  auto& zone_conn_map = static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone->get_zone_conn_map();
  set<rgw_zone_id> zone_ids;

  if (!source_zone_id.empty()) {
    // Single-zone request: validate the zone exists and is reachable.
    std::unique_ptr<rgw::sal::Zone> zone;
    int ret = driver->get_zone()->get_zonegroup().get_zone_by_id(source_zone_id.id, &zone);
    if (ret < 0) {
      ldpp_dout(dpp(), -1) << "Source zone not found in zonegroup "
          << zonegroup.get_name() << dendl;
      return -EINVAL;
    }
    auto c = zone_conn_map.find(source_zone_id);
    if (c == zone_conn_map.end()) {
      ldpp_dout(dpp(), -1) << "No connection to zone " << zone->get_name() << dendl;
      return -EINVAL;
    }
    zone_ids.insert(source_zone_id);
  } else {
    // No zone given: consider every zone in the zonegroup. A listing
    // failure silently yields an empty set (best-effort report).
    std::list<std::string> ids;
    int ret = driver->get_zone()->get_zonegroup().list_zones(ids);
    if (ret == 0) {
      for (const auto& entry : ids) {
	zone_ids.insert(entry);
      }
    }
  }

  for (auto& zone_id : zone_ids) {
    // Skip zones we cannot resolve or connect to (shouldn't normally
    // happen, since they came from the zonegroup / connection map).
    auto z = static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone->get_zonegroup().zones.find(zone_id.id);
    if (z == static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone->get_zonegroup().zones.end()) { /* should't happen */
      continue;
    }
    auto c = zone_conn_map.find(zone_id.id);
    if (c == zone_conn_map.end()) { /* should't happen */
      continue;
    }

    for (auto& entry : sources) {
      auto& pipe = entry.second;
      // Apply the optional source-bucket filter.
      if (opt_source_bucket &&
	  pipe.source.bucket != opt_source_bucket) {
	continue;
      }
      // Only report pipes whose source zone matches this zone.
      if (pipe.source.zone.value_or(rgw_zone_id()) == z->second.id) {
	bucket_source_sync_status(dpp(), static_cast<rgw::sal::RadosStore*>(driver), static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone->get_zone(), z->second,
				  c->second,
				  info, pipe,
				  width, out);
      }
    }
  }

  return 0;
}
+
+static void parse_tier_config_param(const string& s, map<string, string, ltstr_nocase>& out)
+{
+ int level = 0;
+ string cur_conf;
+ list<string> confs;
+ for (auto c : s) {
+ if (c == ',') {
+ if (level == 0) {
+ confs.push_back(cur_conf);
+ cur_conf.clear();
+ continue;
+ }
+ }
+ if (c == '{') {
+ ++level;
+ } else if (c == '}') {
+ --level;
+ }
+ cur_conf += c;
+ }
+ if (!cur_conf.empty()) {
+ confs.push_back(cur_conf);
+ }
+
+ for (auto c : confs) {
+ ssize_t pos = c.find("=");
+ if (pos < 0) {
+ out[c] = "";
+ } else {
+ out[c.substr(0, pos)] = c.substr(pos + 1);
+ }
+ }
+}
+
+static int check_pool_support_omap(const rgw_pool& pool)
+{
+ librados::IoCtx io_ctx;
+ int ret = static_cast<rgw::sal::RadosStore*>(driver)->getRados()->get_rados_handle()->ioctx_create(pool.to_str().c_str(), io_ctx);
+ if (ret < 0) {
+ // the pool may not exist at this moment, we have no way to check if it supports omap.
+ return 0;
+ }
+
+ ret = io_ctx.omap_clear("__omap_test_not_exist_oid__");
+ if (ret == -EOPNOTSUPP) {
+ io_ctx.close();
+ return ret;
+ }
+ io_ctx.close();
+ return 0;
+}
+
// Validate the parameters of a bucket reshard request and, on success,
// initialize '*bucket'. Requires a bucket name and an explicit
// --num-shards within [0, max]; shrinking (or keeping) the shard count
// additionally requires --yes-i-really-mean-it. Returns 0 on success or
// a negative error code, with the reason already printed to stderr.
int check_reshard_bucket_params(rgw::sal::Driver* driver,
				const string& bucket_name,
				const string& tenant,
				const string& bucket_id,
				bool num_shards_specified,
				int num_shards,
				int yes_i_really_mean_it,
				std::unique_ptr<rgw::sal::Bucket>* bucket)
{
  if (bucket_name.empty()) {
    cerr << "ERROR: bucket not specified" << std::endl;
    return -EINVAL;
  }

  if (!num_shards_specified) {
    cerr << "ERROR: --num-shards not specified" << std::endl;
    return -EINVAL;
  }

  // Upper bound comes from the RADOS backend configuration.
  if (num_shards > (int)static_cast<rgw::sal::RadosStore*>(driver)->getRados()->get_max_bucket_shards()) {
    cerr << "ERROR: num_shards too high, max value: " << static_cast<rgw::sal::RadosStore*>(driver)->getRados()->get_max_bucket_shards() << std::endl;
    return -EINVAL;
  }

  if (num_shards < 0) {
    cerr << "ERROR: num_shards must be non-negative integer" << std::endl;
    return -EINVAL;
  }

  int ret = init_bucket(nullptr, tenant, bucket_name, bucket_id, bucket);
  if (ret < 0) {
    cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
    return ret;
  }

  int num_source_shards = rgw::current_num_shards((*bucket)->get_info().layout);

  // Resharding down (or to the same count) is unusual enough that it
  // needs explicit confirmation.
  if (num_shards <= num_source_shards && !yes_i_really_mean_it) {
    cerr << "num shards is less or equal to current shards count" << std::endl
	 << "do you really mean it? (requires --yes-i-really-mean-it)" << std::endl;
    return -EINVAL;
  }
  return 0;
}
+
// Search for the clock offset at which the given pair of consecutive
// TOTP pins validates against the device seed. Offsets are scanned
// outward from 'now', alternating sign (+step, -step, +2*step, ...) up
// to MAX_TOTP_SKEW_HOURS. On success, stores the discovered offset
// (centered on the device window) in *pofs and returns 0; returns
// -ENOENT if no offset within the skew bound matches.
// NOTE(review): reads pins[0] and pins[1] — assumes the caller supplied
// at least two pins; confirm at call sites.
static int scan_totp(CephContext *cct, ceph::real_time& now, rados::cls::otp::otp_info_t& totp, vector<string>& pins,
		     time_t *pofs)
{
#define MAX_TOTP_SKEW_HOURS (24 * 7)
  time_t start_time = ceph::real_clock::to_time_t(now);
  time_t time_ofs = 0, time_ofs_abs = 0;
  time_t step_size = totp.step_size;
  if (step_size == 0) {
    // device did not configure a step size; use the OATH default
    step_size = OATH_TOTP_DEFAULT_TIME_STEP_SIZE;
  }
  uint32_t count = 0;
  int sign = 1;  // flips each iteration: probe future then past offsets

  uint32_t max_skew = MAX_TOTP_SKEW_HOURS * 3600;

  while (time_ofs_abs < max_skew) {
    // first pin must validate at the candidate offset...
    int rc = oath_totp_validate2(totp.seed_bin.c_str(), totp.seed_bin.length(),
                             start_time, 
                             step_size,
                             time_ofs,
                             1, /* window */
                             nullptr, /* otp pos */
                             pins[0].c_str());
    if (rc != OATH_INVALID_OTP) {
      // ...and the second pin at the immediately following time step
      // (smaller time_ofs moves time forward).
      rc = oath_totp_validate2(totp.seed_bin.c_str(), totp.seed_bin.length(),
                               start_time, 
                               step_size,
                               time_ofs - step_size, /* smaller time_ofs moves time forward */
                               1, /* window */
                               nullptr, /* otp pos */
                               pins[1].c_str());
      if (rc != OATH_INVALID_OTP) {
        // center the reported offset on the device's validation window
        *pofs = time_ofs - step_size + step_size * totp.window / 2;
        ldpp_dout(dpp(), 20) << "found at time=" << start_time - time_ofs << " time_ofs=" << time_ofs << dendl;
        return 0;
      }
    }
    sign = -sign;
    time_ofs_abs = (++count) * step_size;
    time_ofs = sign * time_ofs_abs;
  }

  return -ENOENT;
}
+
+static int trim_sync_error_log(int shard_id, const string& marker, int delay_ms)
+{
+ auto oid = RGWSyncErrorLogger::get_shard_oid(RGW_SYNC_ERROR_LOG_SHARD_PREFIX,
+ shard_id);
+ // call cls_log_trim() until it returns -ENODATA
+ for (;;) {
+ int ret = static_cast<rgw::sal::RadosStore*>(driver)->svc()->cls->timelog.trim(dpp(), oid, {}, {}, {}, marker, nullptr,
+ null_yield);
+ if (ret == -ENODATA) {
+ return 0;
+ }
+ if (ret < 0) {
+ return ret;
+ }
+ if (delay_ms) {
+ std::this_thread::sleep_for(std::chrono::milliseconds(delay_ms));
+ }
+ }
+ // unreachable
+}
+
// True when 'opt' selects the symmetrical sync-flow type; both
// command-line spellings are accepted.
static bool symmetrical_flow_opt(const string& opt)
{
  return opt == "symmetric" || opt == "symmetrical";
}
+
// True when 'opt' selects the directional sync-flow type; both
// command-line spellings are accepted.
static bool directional_flow_opt(const string& opt)
{
  return opt == "direction" || opt == "directional";
}
+
// Returns true only when 'opt' holds a value and 'extra_check' passes;
// used to validate that a required command-line option was supplied.
template <class T>
static bool require_opt(std::optional<T> opt, bool extra_check = true)
{
  return opt.has_value() && extra_check;
}
+
// Returns true only when 'opt' holds a non-empty value and
// 'extra_check' passes; like require_opt but also rejects empty
// strings/containers.
template <class T>
static bool require_non_empty_opt(std::optional<T> opt, bool extra_check = true)
{
  return opt.has_value() && !opt->empty() && extra_check;
}
+
+template <class T>
+static void show_result(T& obj,
+ Formatter *formatter,
+ ostream& os)
+{
+ encode_json("obj", obj, formatter);
+
+ formatter->flush(cout);
+}
+
+void init_optional_bucket(std::optional<rgw_bucket>& opt_bucket,
+ std::optional<string>& opt_tenant,
+ std::optional<string>& opt_bucket_name,
+ std::optional<string>& opt_bucket_id)
+{
+ if (opt_tenant || opt_bucket_name || opt_bucket_id) {
+ opt_bucket.emplace();
+ if (opt_tenant) {
+ opt_bucket->tenant = *opt_tenant;
+ }
+ if (opt_bucket_name) {
+ opt_bucket->name = *opt_bucket_name;
+ }
+ if (opt_bucket_id) {
+ opt_bucket->bucket_id = *opt_bucket_id;
+ }
+ }
+}
+
+class SyncPolicyContext
+{
+ rgw::sal::ConfigStore* cfgstore;
+ RGWZoneGroup zonegroup;
+ std::unique_ptr<rgw::sal::ZoneGroupWriter> zonegroup_writer;
+
+ std::optional<rgw_bucket> b;
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+
+ rgw_sync_policy_info *policy{nullptr};
+
+ std::optional<rgw_user> owner;
+
+public:
+ SyncPolicyContext(rgw::sal::ConfigStore* cfgstore,
+ std::optional<rgw_bucket> _bucket)
+ : cfgstore(cfgstore), b(std::move(_bucket)) {}
+
+ int init(const string& zonegroup_id, const string& zonegroup_name) {
+ int ret = rgw::read_zonegroup(dpp(), null_yield, cfgstore,
+ zonegroup_id, zonegroup_name,
+ zonegroup, &zonegroup_writer);
+ if (ret < 0) {
+ cerr << "failed to init zonegroup: " << cpp_strerror(-ret) << std::endl;
+ return ret;
+ }
+
+ if (!b) {
+ policy = &zonegroup.sync_policy;
+ return 0;
+ }
+
+ ret = init_bucket(nullptr, *b, &bucket);
+ if (ret < 0) {
+ cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
+ return ret;
+ }
+
+ owner = bucket->get_info().owner;
+
+ if (!bucket->get_info().sync_policy) {
+ rgw_sync_policy_info new_policy;
+ bucket->get_info().set_sync_policy(std::move(new_policy));
+ }
+
+ policy = &(*bucket->get_info().sync_policy);
+
+ return 0;
+ }
+
+ int write_policy() {
+ if (!b) {
+ int ret = zonegroup_writer->write(dpp(), null_yield, zonegroup);
+ if (ret < 0) {
+ cerr << "failed to update zonegroup: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ return 0;
+ }
+
+ int ret = bucket->put_info(dpp(), false, real_time());
+ if (ret < 0) {
+ cerr << "failed to driver bucket info: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ return 0;
+ }
+
+ rgw_sync_policy_info& get_policy() {
+ return *policy;
+ }
+
+ std::optional<rgw_user>& get_owner() {
+ return owner;
+ }
+};
+
+void resolve_zone_id_opt(std::optional<string>& zone_name, std::optional<rgw_zone_id>& zone_id)
+{
+ if (!zone_name || zone_id) {
+ return;
+ }
+ zone_id.emplace();
+ std::unique_ptr<rgw::sal::Zone> zone;
+ int ret = driver->get_zone()->get_zonegroup().get_zone_by_name(*zone_name, &zone);
+ if (ret < 0) {
+ cerr << "WARNING: cannot find source zone id for name=" << *zone_name << std::endl;
+ zone_id = rgw_zone_id(*zone_name);
+ } else {
+ zone_id->id = zone->get_id();
+ }
+}
+void resolve_zone_ids_opt(std::optional<vector<string> >& names, std::optional<vector<rgw_zone_id> >& ids)
+{
+ if (!names || ids) {
+ return;
+ }
+ ids.emplace();
+ for (auto& name : *names) {
+ rgw_zone_id zid;
+ std::unique_ptr<rgw::sal::Zone> zone;
+ int ret = driver->get_zone()->get_zonegroup().get_zone_by_name(name, &zone);
+ if (ret < 0) {
+ cerr << "WARNING: cannot find source zone id for name=" << name << std::endl;
+ zid = rgw_zone_id(name);
+ } else {
+ zid.id = zone->get_id();
+ }
+ ids->push_back(zid);
+ }
+}
+
+static vector<rgw_zone_id> zone_ids_from_str(const string& val)
+{
+ vector<rgw_zone_id> result;
+ vector<string> v;
+ get_str_vec(val, v);
+ for (auto& z : v) {
+ result.push_back(rgw_zone_id(z));
+ }
+ return result;
+}
+
// JSON formatter that pretty-prints rgw_zone_id values as their
// human-readable zone names, via a JSONEncodeFilter handler registered
// for the rgw_zone_id type.
class JSONFormatter_PrettyZone : public JSONFormatter {
  class Handler : public JSONEncodeFilter::Handler<rgw_zone_id> {
    // Encode a zone id as its zone name; falls back to the raw id
    // (with a warning on stderr) when the zone cannot be resolved.
    void encode_json(const char *name, const void *pval, ceph::Formatter *f) const override {
      auto zone_id = *(static_cast<const rgw_zone_id *>(pval));
      string zone_name;
      std::unique_ptr<rgw::sal::Zone> zone;
      if (driver->get_zone()->get_zonegroup().get_zone_by_id(zone_id.id, &zone) == 0) {
        zone_name = zone->get_name();
      } else {
        cerr << "WARNING: cannot find zone name for id=" << zone_id << std::endl;
        zone_name = zone_id.id;
      }

      ::encode_json(name, zone_name, f);
    }
  } zone_id_type_handler;

  JSONEncodeFilter encode_filter;
public:
  JSONFormatter_PrettyZone(bool pretty_format) : JSONFormatter(pretty_format) {
    // install the rgw_zone_id -> zone name translation
    encode_filter.register_type(&zone_id_type_handler);
  }

  // Expose the encode filter to encode_json() callers that ask for the
  // "JSONEncodeFilter" feature; all other features are unsupported.
  void *get_external_feature_handler(const std::string& feature) override {
    if (feature != "JSONEncodeFilter") {
      return nullptr;
    }
    return &encode_filter;
  }
};
+
+void init_realm_param(CephContext *cct, string& var, std::optional<string>& opt_var, const string& conf_name)
+{
+ var = cct->_conf.get_val<string>(conf_name);
+ if (!var.empty()) {
+ opt_var = var;
+ }
+}
+
+int main(int argc, const char **argv)
+{
+ auto args = argv_to_vec(argc, argv);
+ if (args.empty()) {
+ cerr << argv[0] << ": -h or --help for usage" << std::endl;
+ exit(1);
+ }
+ if (ceph_argparse_need_usage(args)) {
+ usage();
+ exit(0);
+ }
+
+ auto cct = rgw_global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_UTILITY, 0);
+
+ // for region -> zonegroup conversion (must happen before common_init_finish())
+ if (!g_conf()->rgw_region.empty() && g_conf()->rgw_zonegroup.empty()) {
+ g_conf().set_val_or_die("rgw_zonegroup", g_conf()->rgw_region.c_str());
+ }
+
+ rgw_user user_id_arg;
+ std::unique_ptr<rgw::sal::User> user;
+ string tenant;
+ string user_ns;
+ rgw_user new_user_id;
+ std::string access_key, secret_key, user_email, display_name;
+ std::string bucket_name, pool_name, object;
+ rgw_pool pool;
+ std::string date, subuser, access, format;
+ std::string start_date, end_date;
+ std::string key_type_str;
+ std::string period_id, period_epoch, remote, url;
+ std::optional<string> opt_region;
+ std::string master_zone;
+ std::string realm_name, realm_id, realm_new_name;
+ std::optional<string> opt_realm_name, opt_realm_id;
+ std::string zone_name, zone_id, zone_new_name;
+ std::optional<string> opt_zone_name, opt_zone_id;
+ std::string zonegroup_name, zonegroup_id, zonegroup_new_name;
+ std::optional<string> opt_zonegroup_name, opt_zonegroup_id;
+ std::string api_name;
+ std::string role_name, path, assume_role_doc, policy_name, perm_policy_doc, path_prefix, max_session_duration;
+ std::string redirect_zone;
+ bool redirect_zone_set = false;
+ list<string> endpoints;
+ int tmp_int;
+ int sync_from_all_specified = false;
+ bool sync_from_all = false;
+ list<string> sync_from;
+ list<string> sync_from_rm;
+ int is_master_int;
+ int set_default = 0;
+ bool is_master = false;
+ bool is_master_set = false;
+ int read_only_int;
+ bool read_only = false;
+ int is_read_only_set = false;
+ int commit = false;
+ int staging = false;
+ int key_type = KEY_TYPE_UNDEFINED;
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+ uint32_t perm_mask = 0;
+ RGWUserInfo info;
+ OPT opt_cmd = OPT::NO_CMD;
+ int gen_access_key = 0;
+ int gen_secret_key = 0;
+ bool set_perm = false;
+ bool set_temp_url_key = false;
+ map<int, string> temp_url_keys;
+ string bucket_id;
+ string new_bucket_name;
+ std::unique_ptr<Formatter> formatter;
+ std::unique_ptr<Formatter> zone_formatter;
+ int purge_data = false;
+ int pretty_format = false;
+ int show_log_entries = true;
+ int show_log_sum = true;
+ int skip_zero_entries = false; // log show
+ int purge_keys = false;
+ int yes_i_really_mean_it = false;
+ int delete_child_objects = false;
+ int fix = false;
+ int remove_bad = false;
+ int check_head_obj_locator = false;
+ int max_buckets = -1;
+ bool max_buckets_specified = false;
+ map<string, bool> categories;
+ string caps;
+ int check_objects = false;
+ RGWBucketAdminOpState bucket_op;
+ string infile;
+ string metadata_key;
+ RGWObjVersionTracker objv_tracker;
+ string marker;
+ string start_marker;
+ string end_marker;
+ int max_entries = -1;
+ bool max_entries_specified = false;
+ int admin = false;
+ bool admin_specified = false;
+ int system = false;
+ bool system_specified = false;
+ int shard_id = -1;
+ bool specified_shard_id = false;
+ string client_id;
+ string op_id;
+ string op_mask_str;
+ string quota_scope;
+ string ratelimit_scope;
+ std::string objects_file;
+ string object_version;
+ string placement_id;
+ std::optional<string> opt_storage_class;
+ list<string> tags;
+ list<string> tags_add;
+ list<string> tags_rm;
+ int placement_inline_data = true;
+ bool placement_inline_data_specified = false;
+
+ int64_t max_objects = -1;
+ int64_t max_size = -1;
+ int64_t max_read_ops = 0;
+ int64_t max_write_ops = 0;
+ int64_t max_read_bytes = 0;
+ int64_t max_write_bytes = 0;
+ bool have_max_objects = false;
+ bool have_max_size = false;
+ bool have_max_write_ops = false;
+ bool have_max_read_ops = false;
+ bool have_max_write_bytes = false;
+ bool have_max_read_bytes = false;
+ int include_all = false;
+ int allow_unordered = false;
+
+ int sync_stats = false;
+ int reset_stats = false;
+ int bypass_gc = false;
+ int warnings_only = false;
+ int inconsistent_index = false;
+
+ int verbose = false;
+
+ int extra_info = false;
+
+ uint64_t min_rewrite_size = 4 * 1024 * 1024;
+ uint64_t max_rewrite_size = ULLONG_MAX;
+ uint64_t min_rewrite_stripe_size = 0;
+
+ BIIndexType bi_index_type = BIIndexType::Plain;
+ std::optional<log_type> opt_log_type;
+
+ string job_id;
+ int num_shards = 0;
+ bool num_shards_specified = false;
+ std::optional<int> bucket_index_max_shards;
+
+ int max_concurrent_ios = 32;
+ ceph::timespan min_age = std::chrono::hours(1);
+ bool hide_progress = false;
+ bool dump_keys = false;
+ uint64_t orphan_stale_secs = (24 * 3600);
+ int detail = false;
+
+ std::string val;
+ std::ostringstream errs;
+ string err;
+
+ string source_zone_name;
+ rgw_zone_id source_zone; /* zone id */
+
+ string tier_type;
+ bool tier_type_specified = false;
+
+ map<string, string, ltstr_nocase> tier_config_add;
+ map<string, string, ltstr_nocase> tier_config_rm;
+
+ boost::optional<string> index_pool;
+ boost::optional<string> data_pool;
+ boost::optional<string> data_extra_pool;
+ rgw::BucketIndexType placement_index_type = rgw::BucketIndexType::Normal;
+ bool index_type_specified = false;
+
+ boost::optional<std::string> compression_type;
+
+ string totp_serial;
+ string totp_seed;
+ string totp_seed_type = "hex";
+ vector<string> totp_pin;
+ int totp_seconds = 0;
+ int totp_window = 0;
+ int trim_delay_ms = 0;
+
+ string topic_name;
+ string notification_id;
+ string sub_name;
+ string event_id;
+
+ std::optional<uint64_t> gen;
+ std::optional<std::string> str_script_ctx;
+ std::optional<std::string> script_package;
+ int allow_compilation = false;
+
+ std::optional<string> opt_group_id;
+ std::optional<string> opt_status;
+ std::optional<string> opt_flow_type;
+ std::optional<vector<string> > opt_zone_names;
+ std::optional<vector<rgw_zone_id> > opt_zone_ids;
+ std::optional<string> opt_flow_id;
+ std::optional<string> opt_source_zone_name;
+ std::optional<rgw_zone_id> opt_source_zone_id;
+ std::optional<string> opt_dest_zone_name;
+ std::optional<rgw_zone_id> opt_dest_zone_id;
+ std::optional<vector<string> > opt_source_zone_names;
+ std::optional<vector<rgw_zone_id> > opt_source_zone_ids;
+ std::optional<vector<string> > opt_dest_zone_names;
+ std::optional<vector<rgw_zone_id> > opt_dest_zone_ids;
+ std::optional<string> opt_pipe_id;
+ std::optional<rgw_bucket> opt_bucket;
+ std::optional<string> opt_tenant;
+ std::optional<string> opt_bucket_name;
+ std::optional<string> opt_bucket_id;
+ std::optional<rgw_bucket> opt_source_bucket;
+ std::optional<string> opt_source_tenant;
+ std::optional<string> opt_source_bucket_name;
+ std::optional<string> opt_source_bucket_id;
+ std::optional<rgw_bucket> opt_dest_bucket;
+ std::optional<string> opt_dest_tenant;
+ std::optional<string> opt_dest_bucket_name;
+ std::optional<string> opt_dest_bucket_id;
+ std::optional<string> opt_effective_zone_name;
+ std::optional<rgw_zone_id> opt_effective_zone_id;
+
+ std::optional<string> opt_prefix;
+ std::optional<string> opt_prefix_rm;
+
+ std::optional<int> opt_priority;
+ std::optional<string> opt_mode;
+ std::optional<rgw_user> opt_dest_owner;
+ ceph::timespan opt_retry_delay_ms = std::chrono::milliseconds(2000);
+ ceph::timespan opt_timeout_sec = std::chrono::seconds(60);
+
+ std::optional<std::string> inject_error_at;
+ std::optional<int> inject_error_code;
+ std::optional<std::string> inject_abort_at;
+ std::optional<std::string> inject_delay_at;
+ ceph::timespan inject_delay = std::chrono::milliseconds(2000);
+
+ rgw::zone_features::set enable_features;
+ rgw::zone_features::set disable_features;
+
+ SimpleCmd cmd(all_cmds, cmd_aliases);
+ bool raw_storage_op = false;
+
+ std::optional<std::string> rgw_obj_fs; // radoslist field separator
+
+ init_realm_param(cct.get(), realm_id, opt_realm_id, "rgw_realm_id");
+ init_realm_param(cct.get(), zonegroup_id, opt_zonegroup_id, "rgw_zonegroup_id");
+ init_realm_param(cct.get(), zone_id, opt_zone_id, "rgw_zone_id");
+
+ for (std::vector<const char*>::iterator i = args.begin(); i != args.end(); ) {
+ if (ceph_argparse_double_dash(args, i)) {
+ break;
+ } else if (ceph_argparse_witharg(args, i, &val, "-i", "--uid", (char*)NULL)) {
+ user_id_arg.from_str(val);
+ if (user_id_arg.empty()) {
+ cerr << "no value for uid" << std::endl;
+ exit(1);
+ }
+ } else if (ceph_argparse_witharg(args, i, &val, "--new-uid", (char*)NULL)) {
+ new_user_id.from_str(val);
+ } else if (ceph_argparse_witharg(args, i, &val, "--tenant", (char*)NULL)) {
+ tenant = val;
+ opt_tenant = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--user_ns", (char*)NULL)) {
+ user_ns = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--access-key", (char*)NULL)) {
+ access_key = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--subuser", (char*)NULL)) {
+ subuser = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--secret", "--secret-key", (char*)NULL)) {
+ secret_key = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "-e", "--email", (char*)NULL)) {
+ user_email = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "-n", "--display-name", (char*)NULL)) {
+ display_name = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "-b", "--bucket", (char*)NULL)) {
+ bucket_name = val;
+ opt_bucket_name = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "-p", "--pool", (char*)NULL)) {
+ pool_name = val;
+ pool = rgw_pool(pool_name);
+ } else if (ceph_argparse_witharg(args, i, &val, "-o", "--object", (char*)NULL)) {
+ object = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--objects-file", (char*)NULL)) {
+ objects_file = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--object-version", (char*)NULL)) {
+ object_version = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--client-id", (char*)NULL)) {
+ client_id = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--op-id", (char*)NULL)) {
+ op_id = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--op-mask", (char*)NULL)) {
+ op_mask_str = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--key-type", (char*)NULL)) {
+ key_type_str = val;
+ if (key_type_str.compare("swift") == 0) {
+ key_type = KEY_TYPE_SWIFT;
+ } else if (key_type_str.compare("s3") == 0) {
+ key_type = KEY_TYPE_S3;
+ } else {
+ cerr << "bad key type: " << key_type_str << std::endl;
+ exit(1);
+ }
+ } else if (ceph_argparse_witharg(args, i, &val, "--job-id", (char*)NULL)) {
+ job_id = val;
+ } else if (ceph_argparse_binary_flag(args, i, &gen_access_key, NULL, "--gen-access-key", (char*)NULL)) {
+ // do nothing
+ } else if (ceph_argparse_binary_flag(args, i, &gen_secret_key, NULL, "--gen-secret", (char*)NULL)) {
+ // do nothing
+ } else if (ceph_argparse_binary_flag(args, i, &show_log_entries, NULL, "--show-log-entries", (char*)NULL)) {
+ // do nothing
+ } else if (ceph_argparse_binary_flag(args, i, &show_log_sum, NULL, "--show-log-sum", (char*)NULL)) {
+ // do nothing
+ } else if (ceph_argparse_binary_flag(args, i, &skip_zero_entries, NULL, "--skip-zero-entries", (char*)NULL)) {
+ // do nothing
+ } else if (ceph_argparse_binary_flag(args, i, &admin, NULL, "--admin", (char*)NULL)) {
+ admin_specified = true;
+ } else if (ceph_argparse_binary_flag(args, i, &system, NULL, "--system", (char*)NULL)) {
+ system_specified = true;
+ } else if (ceph_argparse_binary_flag(args, i, &verbose, NULL, "--verbose", (char*)NULL)) {
+ // do nothing
+ } else if (ceph_argparse_binary_flag(args, i, &staging, NULL, "--staging", (char*)NULL)) {
+ // do nothing
+ } else if (ceph_argparse_binary_flag(args, i, &commit, NULL, "--commit", (char*)NULL)) {
+ // do nothing
+ } else if (ceph_argparse_witharg(args, i, &val, "--min-rewrite-size", (char*)NULL)) {
+ min_rewrite_size = (uint64_t)atoll(val.c_str());
+ } else if (ceph_argparse_witharg(args, i, &val, "--max-rewrite-size", (char*)NULL)) {
+ max_rewrite_size = (uint64_t)atoll(val.c_str());
+ } else if (ceph_argparse_witharg(args, i, &val, "--min-rewrite-stripe-size", (char*)NULL)) {
+ min_rewrite_stripe_size = (uint64_t)atoll(val.c_str());
+ } else if (ceph_argparse_witharg(args, i, &val, "--max-buckets", (char*)NULL)) {
+ max_buckets = (int)strict_strtol(val.c_str(), 10, &err);
+ if (!err.empty()) {
+ cerr << "ERROR: failed to parse max buckets: " << err << std::endl;
+ return EINVAL;
+ }
+ max_buckets_specified = true;
+ } else if (ceph_argparse_witharg(args, i, &val, "--max-entries", (char*)NULL)) {
+ max_entries = (int)strict_strtol(val.c_str(), 10, &err);
+ max_entries_specified = true;
+ if (!err.empty()) {
+ cerr << "ERROR: failed to parse max entries: " << err << std::endl;
+ return EINVAL;
+ }
+ } else if (ceph_argparse_witharg(args, i, &val, "--max-size", (char*)NULL)) {
+ max_size = strict_iec_cast<long long>(val, &err);
+ if (!err.empty()) {
+ cerr << "ERROR: failed to parse max size: " << err << std::endl;
+ return EINVAL;
+ }
+ have_max_size = true;
+ } else if (ceph_argparse_witharg(args, i, &val, "--max-objects", (char*)NULL)) {
+ max_objects = (int64_t)strict_strtoll(val.c_str(), 10, &err);
+ if (!err.empty()) {
+ cerr << "ERROR: failed to parse max objects: " << err << std::endl;
+ return EINVAL;
+ }
+ have_max_objects = true;
+ } else if (ceph_argparse_witharg(args, i, &val, "--max-read-ops", (char*)NULL)) {
+ max_read_ops = (int64_t)strict_strtoll(val.c_str(), 10, &err);
+ if (!err.empty()) {
+ cerr << "ERROR: failed to parse max read requests: " << err << std::endl;
+ return EINVAL;
+ }
+ have_max_read_ops = true;
+ } else if (ceph_argparse_witharg(args, i, &val, "--max-write-ops", (char*)NULL)) {
+ max_write_ops = (int64_t)strict_strtoll(val.c_str(), 10, &err);
+ if (!err.empty()) {
+ cerr << "ERROR: failed to parse max write requests: " << err << std::endl;
+ return EINVAL;
+ }
+ have_max_write_ops = true;
+ } else if (ceph_argparse_witharg(args, i, &val, "--max-read-bytes", (char*)NULL)) {
+ max_read_bytes = (int64_t)strict_strtoll(val.c_str(), 10, &err);
+ if (!err.empty()) {
+ cerr << "ERROR: failed to parse max read bytes: " << err << std::endl;
+ return EINVAL;
+ }
+ have_max_read_bytes = true;
+ } else if (ceph_argparse_witharg(args, i, &val, "--max-write-bytes", (char*)NULL)) {
+ max_write_bytes = (int64_t)strict_strtoll(val.c_str(), 10, &err);
+ if (!err.empty()) {
+ cerr << "ERROR: failed to parse max write bytes: " << err << std::endl;
+ return EINVAL;
+ }
+ have_max_write_bytes = true;
+ } else if (ceph_argparse_witharg(args, i, &val, "--date", "--time", (char*)NULL)) {
+ date = val;
+ if (end_date.empty())
+ end_date = date;
+ } else if (ceph_argparse_witharg(args, i, &val, "--start-date", "--start-time", (char*)NULL)) {
+ start_date = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--end-date", "--end-time", (char*)NULL)) {
+ end_date = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--num-shards", (char*)NULL)) {
+ num_shards = (int)strict_strtol(val.c_str(), 10, &err);
+ if (!err.empty()) {
+ cerr << "ERROR: failed to parse num shards: " << err << std::endl;
+ return EINVAL;
+ }
+ num_shards_specified = true;
+ } else if (ceph_argparse_witharg(args, i, &val, "--bucket-index-max-shards", (char*)NULL)) {
+ bucket_index_max_shards = (int)strict_strtol(val.c_str(), 10, &err);
+ if (!err.empty()) {
+ cerr << "ERROR: failed to parse bucket-index-max-shards: " << err << std::endl;
+ return EINVAL;
+ }
+ } else if (ceph_argparse_witharg(args, i, &val, "--max-concurrent-ios", (char*)NULL)) {
+ max_concurrent_ios = (int)strict_strtol(val.c_str(), 10, &err);
+ if (!err.empty()) {
+ cerr << "ERROR: failed to parse max concurrent ios: " << err << std::endl;
+ return EINVAL;
+ }
+ } else if (ceph_argparse_witharg(args, i, &val, "--min-age-hours", (char*)NULL)) {
+ min_age = std::chrono::hours(atoi(val.c_str()));
+ } else if (ceph_argparse_witharg(args, i, &val, "--orphan-stale-secs", (char*)NULL)) {
+ orphan_stale_secs = (uint64_t)strict_strtoll(val.c_str(), 10, &err);
+ if (!err.empty()) {
+ cerr << "ERROR: failed to parse orphan stale secs: " << err << std::endl;
+ return EINVAL;
+ }
+ } else if (ceph_argparse_witharg(args, i, &val, "--shard-id", (char*)NULL)) {
+ shard_id = (int)strict_strtol(val.c_str(), 10, &err);
+ if (!err.empty()) {
+ cerr << "ERROR: failed to parse shard id: " << err << std::endl;
+ return EINVAL;
+ }
+ specified_shard_id = true;
+ } else if (ceph_argparse_witharg(args, i, &val, "--gen", (char*)NULL)) {
+ gen = strict_strtoll(val.c_str(), 10, &err);
+ if (!err.empty()) {
+ cerr << "ERROR: failed to parse gen id: " << err << std::endl;
+ return EINVAL;
+ }
+ } else if (ceph_argparse_witharg(args, i, &val, "--access", (char*)NULL)) {
+ access = val;
+ perm_mask = rgw_str_to_perm(access.c_str());
+ set_perm = true;
+ } else if (ceph_argparse_witharg(args, i, &val, "--temp-url-key", (char*)NULL)) {
+ temp_url_keys[0] = val;
+ set_temp_url_key = true;
+ } else if (ceph_argparse_witharg(args, i, &val, "--temp-url-key2", "--temp-url-key-2", (char*)NULL)) {
+ temp_url_keys[1] = val;
+ set_temp_url_key = true;
+ } else if (ceph_argparse_witharg(args, i, &val, "--bucket-id", (char*)NULL)) {
+ bucket_id = val;
+ opt_bucket_id = val;
+ if (bucket_id.empty()) {
+ cerr << "no value for bucket-id" << std::endl;
+ exit(1);
+ }
+ } else if (ceph_argparse_witharg(args, i, &val, "--bucket-new-name", (char*)NULL)) {
+ new_bucket_name = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--format", (char*)NULL)) {
+ format = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--categories", (char*)NULL)) {
+ string cat_str = val;
+ list<string> cat_list;
+ list<string>::iterator iter;
+ get_str_list(cat_str, cat_list);
+ for (iter = cat_list.begin(); iter != cat_list.end(); ++iter) {
+ categories[*iter] = true;
+ }
+ } else if (ceph_argparse_binary_flag(args, i, &delete_child_objects, NULL, "--purge-objects", (char*)NULL)) {
+ // do nothing
+ } else if (ceph_argparse_binary_flag(args, i, &pretty_format, NULL, "--pretty-format", (char*)NULL)) {
+ // do nothing
+ } else if (ceph_argparse_binary_flag(args, i, &purge_data, NULL, "--purge-data", (char*)NULL)) {
+ delete_child_objects = purge_data;
+ } else if (ceph_argparse_binary_flag(args, i, &purge_keys, NULL, "--purge-keys", (char*)NULL)) {
+ // do nothing
+ } else if (ceph_argparse_binary_flag(args, i, &yes_i_really_mean_it, NULL, "--yes-i-really-mean-it", (char*)NULL)) {
+ // do nothing
+ } else if (ceph_argparse_binary_flag(args, i, &fix, NULL, "--fix", (char*)NULL)) {
+ // do nothing
+ } else if (ceph_argparse_binary_flag(args, i, &remove_bad, NULL, "--remove-bad", (char*)NULL)) {
+ // do nothing
+ } else if (ceph_argparse_binary_flag(args, i, &check_head_obj_locator, NULL, "--check-head-obj-locator", (char*)NULL)) {
+ // do nothing
+ } else if (ceph_argparse_binary_flag(args, i, &check_objects, NULL, "--check-objects", (char*)NULL)) {
+ // do nothing
+ } else if (ceph_argparse_binary_flag(args, i, &sync_stats, NULL, "--sync-stats", (char*)NULL)) {
+ // do nothing
+ } else if (ceph_argparse_binary_flag(args, i, &reset_stats, NULL, "--reset-stats", (char*)NULL)) {
+ // do nothing
+ } else if (ceph_argparse_binary_flag(args, i, &include_all, NULL, "--include-all", (char*)NULL)) {
+ // do nothing
+ } else if (ceph_argparse_binary_flag(args, i, &allow_unordered, NULL, "--allow-unordered", (char*)NULL)) {
+ // do nothing
+ } else if (ceph_argparse_binary_flag(args, i, &extra_info, NULL, "--extra-info", (char*)NULL)) {
+ // do nothing
+ } else if (ceph_argparse_binary_flag(args, i, &bypass_gc, NULL, "--bypass-gc", (char*)NULL)) {
+ // do nothing
+ } else if (ceph_argparse_binary_flag(args, i, &warnings_only, NULL, "--warnings-only", (char*)NULL)) {
+ // do nothing
+ } else if (ceph_argparse_binary_flag(args, i, &inconsistent_index, NULL, "--inconsistent-index", (char*)NULL)) {
+ // do nothing
+ } else if (ceph_argparse_flag(args, i, "--hide-progress", (char*)NULL)) {
+ hide_progress = true;
+ } else if (ceph_argparse_flag(args, i, "--dump-keys", (char*)NULL)) {
+ dump_keys = true;
+    } else if (ceph_argparse_binary_flag(args, i, &placement_inline_data, NULL, "--placement-inline-data", (char*)NULL)) {
+      // unlike the pure binary flags above, this arm also records that the
+      // option was given explicitly (so a later default can be distinguished
+      // from an explicit --placement-inline-data=false)
+      placement_inline_data_specified = true;
+ } else if (ceph_argparse_witharg(args, i, &val, "--caps", (char*)NULL)) {
+ caps = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--infile", (char*)NULL)) {
+ infile = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--metadata-key", (char*)NULL)) {
+ metadata_key = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--marker", (char*)NULL)) {
+ marker = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--start-marker", (char*)NULL)) {
+ start_marker = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--end-marker", (char*)NULL)) {
+ end_marker = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--quota-scope", (char*)NULL)) {
+ quota_scope = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--ratelimit-scope", (char*)NULL)) {
+ ratelimit_scope = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--index-type", (char*)NULL)) {
+ string index_type_str = val;
+ bi_index_type = get_bi_index_type(index_type_str);
+ if (bi_index_type == BIIndexType::Invalid) {
+ cerr << "ERROR: invalid bucket index entry type" << std::endl;
+ return EINVAL;
+ }
+ } else if (ceph_argparse_witharg(args, i, &val, "--log-type", (char*)NULL)) {
+ string log_type_str = val;
+ auto l = get_log_type(log_type_str);
+ if (l == static_cast<log_type>(0xff)) {
+ cerr << "ERROR: invalid log type" << std::endl;
+ return EINVAL;
+ }
+ opt_log_type = l;
+ } else if (ceph_argparse_binary_flag(args, i, &is_master_int, NULL, "--master", (char*)NULL)) {
+ is_master = (bool)is_master_int;
+ is_master_set = true;
+ } else if (ceph_argparse_binary_flag(args, i, &set_default, NULL, "--default", (char*)NULL)) {
+ /* do nothing */
+ } else if (ceph_argparse_witharg(args, i, &val, "--redirect-zone", (char*)NULL)) {
+ redirect_zone = val;
+ redirect_zone_set = true;
+ } else if (ceph_argparse_binary_flag(args, i, &read_only_int, NULL, "--read-only", (char*)NULL)) {
+ read_only = (bool)read_only_int;
+ is_read_only_set = true;
+ } else if (ceph_argparse_witharg(args, i, &val, "--master-zone", (char*)NULL)) {
+ master_zone = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--period", (char*)NULL)) {
+ period_id = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--epoch", (char*)NULL)) {
+ period_epoch = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--remote", (char*)NULL)) {
+ remote = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--url", (char*)NULL)) {
+ url = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--region", (char*)NULL)) {
+ opt_region = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--realm-id", (char*)NULL)) {
+ realm_id = val;
+ opt_realm_id = val;
+ g_conf().set_val("rgw_realm_id", val);
+ } else if (ceph_argparse_witharg(args, i, &val, "--realm-new-name", (char*)NULL)) {
+ realm_new_name = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--zonegroup-id", (char*)NULL)) {
+ zonegroup_id = val;
+ opt_zonegroup_id = val;
+ g_conf().set_val("rgw_zonegroup_id", val);
+ } else if (ceph_argparse_witharg(args, i, &val, "--zonegroup-new-name", (char*)NULL)) {
+ zonegroup_new_name = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--placement-id", (char*)NULL)) {
+ placement_id = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--storage-class", (char*)NULL)) {
+ opt_storage_class = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--tags", (char*)NULL)) {
+ get_str_list(val, ",", tags);
+ } else if (ceph_argparse_witharg(args, i, &val, "--tags-add", (char*)NULL)) {
+ get_str_list(val, ",", tags_add);
+ } else if (ceph_argparse_witharg(args, i, &val, "--tags-rm", (char*)NULL)) {
+ get_str_list(val, ",", tags_rm);
+ } else if (ceph_argparse_witharg(args, i, &val, "--api-name", (char*)NULL)) {
+ api_name = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--zone-id", (char*)NULL)) {
+ zone_id = val;
+ opt_zone_id = val;
+ g_conf().set_val("rgw_zone_id", val);
+ } else if (ceph_argparse_witharg(args, i, &val, "--zone-new-name", (char*)NULL)) {
+ zone_new_name = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--endpoints", (char*)NULL)) {
+ get_str_list(val, endpoints);
+ } else if (ceph_argparse_witharg(args, i, &val, "--sync-from", (char*)NULL)) {
+ get_str_list(val, sync_from);
+ } else if (ceph_argparse_witharg(args, i, &val, "--sync-from-rm", (char*)NULL)) {
+ get_str_list(val, sync_from_rm);
+ } else if (ceph_argparse_binary_flag(args, i, &tmp_int, NULL, "--sync-from-all", (char*)NULL)) {
+ sync_from_all = (bool)tmp_int;
+ sync_from_all_specified = true;
+ } else if (ceph_argparse_witharg(args, i, &val, "--source-zone", (char*)NULL)) {
+ source_zone_name = val;
+ opt_source_zone_name = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--source-zone-id", (char*)NULL)) {
+ opt_source_zone_id = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--dest-zone", (char*)NULL)) {
+ opt_dest_zone_name = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--dest-zone-id", (char*)NULL)) {
+ opt_dest_zone_id = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--tier-type", (char*)NULL)) {
+ tier_type = val;
+ tier_type_specified = true;
+ } else if (ceph_argparse_witharg(args, i, &val, "--tier-config", (char*)NULL)) {
+ parse_tier_config_param(val, tier_config_add);
+ } else if (ceph_argparse_witharg(args, i, &val, "--tier-config-rm", (char*)NULL)) {
+ parse_tier_config_param(val, tier_config_rm);
+ } else if (ceph_argparse_witharg(args, i, &val, "--index-pool", (char*)NULL)) {
+ index_pool = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--data-pool", (char*)NULL)) {
+ data_pool = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--data-extra-pool", (char*)NULL)) {
+ data_extra_pool = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--placement-index-type", (char*)NULL)) {
+ if (val == "normal") {
+ placement_index_type = rgw::BucketIndexType::Normal;
+ } else if (val == "indexless") {
+ placement_index_type = rgw::BucketIndexType::Indexless;
+ } else {
+ placement_index_type = (rgw::BucketIndexType)strict_strtol(val.c_str(), 10, &err);
+ if (!err.empty()) {
+ cerr << "ERROR: failed to parse index type index: " << err << std::endl;
+ return EINVAL;
+ }
+ }
+ index_type_specified = true;
+ } else if (ceph_argparse_witharg(args, i, &val, "--compression", (char*)NULL)) {
+ compression_type = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--role-name", (char*)NULL)) {
+ role_name = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--path", (char*)NULL)) {
+ path = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--assume-role-policy-doc", (char*)NULL)) {
+ assume_role_doc = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--policy-name", (char*)NULL)) {
+ policy_name = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--policy-doc", (char*)NULL)) {
+ perm_policy_doc = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--path-prefix", (char*)NULL)) {
+ path_prefix = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--max-session-duration", (char*)NULL)) {
+ max_session_duration = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--totp-serial", (char*)NULL)) {
+ totp_serial = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--totp-pin", (char*)NULL)) {
+ totp_pin.push_back(val);
+ } else if (ceph_argparse_witharg(args, i, &val, "--totp-seed", (char*)NULL)) {
+ totp_seed = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--totp-seed-type", (char*)NULL)) {
+ totp_seed_type = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--totp-seconds", (char*)NULL)) {
+ totp_seconds = atoi(val.c_str());
+ } else if (ceph_argparse_witharg(args, i, &val, "--totp-window", (char*)NULL)) {
+ totp_window = atoi(val.c_str());
+ } else if (ceph_argparse_witharg(args, i, &val, "--trim-delay-ms", (char*)NULL)) {
+ trim_delay_ms = atoi(val.c_str());
+ } else if (ceph_argparse_witharg(args, i, &val, "--topic", (char*)NULL)) {
+ topic_name = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--notification-id", (char*)NULL)) {
+ notification_id = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--subscription", (char*)NULL)) {
+ sub_name = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--event-id", (char*)NULL)) {
+ event_id = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--group-id", (char*)NULL)) {
+ opt_group_id = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--status", (char*)NULL)) {
+ opt_status = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--flow-type", (char*)NULL)) {
+ opt_flow_type = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--zones", "--zone-names", (char*)NULL)) {
+ vector<string> v;
+ get_str_vec(val, v);
+ opt_zone_names = std::move(v);
+ } else if (ceph_argparse_witharg(args, i, &val, "--zone-ids", (char*)NULL)) {
+ opt_zone_ids = zone_ids_from_str(val);
+ } else if (ceph_argparse_witharg(args, i, &val, "--source-zones", "--source-zone-names", (char*)NULL)) {
+ vector<string> v;
+ get_str_vec(val, v);
+ opt_source_zone_names = std::move(v);
+ } else if (ceph_argparse_witharg(args, i, &val, "--source-zone-ids", (char*)NULL)) {
+ opt_source_zone_ids = zone_ids_from_str(val);
+ } else if (ceph_argparse_witharg(args, i, &val, "--dest-zones", "--dest-zone-names", (char*)NULL)) {
+ vector<string> v;
+ get_str_vec(val, v);
+ opt_dest_zone_names = std::move(v);
+ } else if (ceph_argparse_witharg(args, i, &val, "--dest-zone-ids", (char*)NULL)) {
+ opt_dest_zone_ids = zone_ids_from_str(val);
+ } else if (ceph_argparse_witharg(args, i, &val, "--flow-id", (char*)NULL)) {
+ opt_flow_id = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--pipe-id", (char*)NULL)) {
+ opt_pipe_id = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--source-tenant", (char*)NULL)) {
+ opt_source_tenant = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--source-bucket", (char*)NULL)) {
+ opt_source_bucket_name = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--source-bucket-id", (char*)NULL)) {
+ opt_source_bucket_id = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--dest-tenant", (char*)NULL)) {
+ opt_dest_tenant = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--dest-bucket", (char*)NULL)) {
+ opt_dest_bucket_name = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--dest-bucket-id", (char*)NULL)) {
+ opt_dest_bucket_id = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--effective-zone-name", "--effective-zone", (char*)NULL)) {
+ opt_effective_zone_name = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--effective-zone-id", (char*)NULL)) {
+ opt_effective_zone_id = rgw_zone_id(val);
+ } else if (ceph_argparse_witharg(args, i, &val, "--prefix", (char*)NULL)) {
+ opt_prefix = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--prefix-rm", (char*)NULL)) {
+ opt_prefix_rm = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--priority", (char*)NULL)) {
+ opt_priority = atoi(val.c_str());
+ } else if (ceph_argparse_witharg(args, i, &val, "--mode", (char*)NULL)) {
+ opt_mode = val;
+    } else if (ceph_argparse_witharg(args, i, &val, "--dest-owner", (char*)NULL)) {
+      // a single optional<> assignment both engages the optional and stores
+      // the value; the old emplace(val) followed by operator=(val) did the
+      // identical work twice
+      opt_dest_owner = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--retry-delay-ms", (char*)NULL)) {
+ opt_retry_delay_ms = std::chrono::milliseconds(atoi(val.c_str()));
+ } else if (ceph_argparse_witharg(args, i, &val, "--timeout-sec", (char*)NULL)) {
+ opt_timeout_sec = std::chrono::seconds(atoi(val.c_str()));
+ } else if (ceph_argparse_witharg(args, i, &val, "--inject-error-at", (char*)NULL)) {
+ inject_error_at = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--inject-error-code", (char*)NULL)) {
+ inject_error_code = atoi(val.c_str());
+ } else if (ceph_argparse_witharg(args, i, &val, "--inject-abort-at", (char*)NULL)) {
+ inject_abort_at = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--inject-delay-at", (char*)NULL)) {
+ inject_delay_at = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--inject-delay-ms", (char*)NULL)) {
+ inject_delay = std::chrono::milliseconds(atoi(val.c_str()));
+ } else if (ceph_argparse_binary_flag(args, i, &detail, NULL, "--detail", (char*)NULL)) {
+ // do nothing
+ } else if (ceph_argparse_witharg(args, i, &val, "--context", (char*)NULL)) {
+ str_script_ctx = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--package", (char*)NULL)) {
+ script_package = val;
+ } else if (ceph_argparse_binary_flag(args, i, &allow_compilation, NULL, "--allow-compilation", (char*)NULL)) {
+ // do nothing
+ } else if (ceph_argparse_witharg(args, i, &val, "--rgw-obj-fs", (char*)NULL)) {
+ rgw_obj_fs = val;
+ } else if (ceph_argparse_witharg(args, i, &val, "--enable-feature", (char*)NULL)) {
+ if (!rgw::zone_features::supports(val)) {
+ std::cerr << "ERROR: Cannot enable unrecognized zone feature \"" << val << "\"" << std::endl;
+ return EINVAL;
+ }
+ enable_features.insert(val);
+ } else if (ceph_argparse_witharg(args, i, &val, "--disable-feature", (char*)NULL)) {
+ disable_features.insert(val);
+ } else if (strncmp(*i, "-", 1) == 0) {
+ cerr << "ERROR: invalid flag " << *i << std::endl;
+ return EINVAL;
+ } else {
+ ++i;
+ }
+ }
+
+ /* common_init_finish needs to be called after g_conf().set_val() */
+ common_init_finish(g_ceph_context);
+
+ std::unique_ptr<rgw::sal::ConfigStore> cfgstore;
+
+ if (args.empty()) {
+ usage();
+ exit(1);
+ }
+ else {
+ std::vector<string> extra_args;
+ std::vector<string> expected;
+
+ std::any _opt_cmd;
+
+ if (!cmd.find_command(args, &_opt_cmd, &extra_args, &err, &expected)) {
+ if (!expected.empty()) {
+ cerr << err << std::endl;
+ cerr << "Expected one of the following:" << std::endl;
+ for (auto& exp : expected) {
+ if (exp == "*" || exp == "[*]") {
+ continue;
+ }
+ cerr << " " << exp << std::endl;
+ }
+ } else {
+ cerr << "Command not found:";
+ for (auto& arg : args) {
+ cerr << " " << arg;
+ }
+ cerr << std::endl;
+ }
+ exit(1);
+ }
+
+ opt_cmd = std::any_cast<OPT>(_opt_cmd);
+
+ /* some commands may have an optional extra param */
+ if (!extra_args.empty()) {
+ switch (opt_cmd) {
+ case OPT::METADATA_GET:
+ case OPT::METADATA_PUT:
+ case OPT::METADATA_RM:
+ case OPT::METADATA_LIST:
+ metadata_key = extra_args[0];
+ break;
+ default:
+ break;
+ }
+ }
+
+ // not a raw op if 'period update' needs to commit to master
+ bool raw_period_update = opt_cmd == OPT::PERIOD_UPDATE && !commit;
+ // not a raw op if 'period pull' needs to read zone/period configuration
+ bool raw_period_pull = opt_cmd == OPT::PERIOD_PULL && !url.empty();
+
+ std::set<OPT> raw_storage_ops_list = {OPT::ZONEGROUP_ADD, OPT::ZONEGROUP_CREATE,
+ OPT::ZONEGROUP_DELETE,
+ OPT::ZONEGROUP_GET, OPT::ZONEGROUP_LIST,
+ OPT::ZONEGROUP_SET, OPT::ZONEGROUP_DEFAULT,
+ OPT::ZONEGROUP_RENAME, OPT::ZONEGROUP_MODIFY,
+ OPT::ZONEGROUP_REMOVE,
+ OPT::ZONEGROUP_PLACEMENT_ADD, OPT::ZONEGROUP_PLACEMENT_RM,
+ OPT::ZONEGROUP_PLACEMENT_MODIFY, OPT::ZONEGROUP_PLACEMENT_LIST,
+ OPT::ZONEGROUP_PLACEMENT_GET,
+ OPT::ZONEGROUP_PLACEMENT_DEFAULT,
+ OPT::ZONE_CREATE, OPT::ZONE_DELETE,
+ OPT::ZONE_GET, OPT::ZONE_SET, OPT::ZONE_RENAME,
+ OPT::ZONE_LIST, OPT::ZONE_MODIFY, OPT::ZONE_DEFAULT,
+ OPT::ZONE_PLACEMENT_ADD, OPT::ZONE_PLACEMENT_RM,
+ OPT::ZONE_PLACEMENT_MODIFY, OPT::ZONE_PLACEMENT_LIST,
+ OPT::ZONE_PLACEMENT_GET,
+ OPT::REALM_CREATE,
+ OPT::PERIOD_DELETE, OPT::PERIOD_GET,
+ OPT::PERIOD_GET_CURRENT, OPT::PERIOD_LIST,
+ OPT::GLOBAL_QUOTA_GET, OPT::GLOBAL_QUOTA_SET,
+ OPT::GLOBAL_QUOTA_ENABLE, OPT::GLOBAL_QUOTA_DISABLE,
+ OPT::GLOBAL_RATELIMIT_GET, OPT::GLOBAL_RATELIMIT_SET,
+ OPT::GLOBAL_RATELIMIT_ENABLE, OPT::GLOBAL_RATELIMIT_DISABLE,
+ OPT::REALM_DELETE, OPT::REALM_GET, OPT::REALM_LIST,
+ OPT::REALM_LIST_PERIODS,
+ OPT::REALM_GET_DEFAULT,
+ OPT::REALM_RENAME, OPT::REALM_SET,
+ OPT::REALM_DEFAULT, OPT::REALM_PULL};
+
+ std::set<OPT> readonly_ops_list = {
+ OPT::USER_INFO,
+ OPT::USER_STATS,
+ OPT::BUCKETS_LIST,
+ OPT::BUCKET_LIMIT_CHECK,
+ OPT::BUCKET_LAYOUT,
+ OPT::BUCKET_STATS,
+ OPT::BUCKET_SYNC_CHECKPOINT,
+ OPT::BUCKET_SYNC_INFO,
+ OPT::BUCKET_SYNC_STATUS,
+ OPT::BUCKET_SYNC_MARKERS,
+ OPT::BUCKET_SHARD_OBJECTS,
+ OPT::BUCKET_OBJECT_SHARD,
+ OPT::LOG_LIST,
+ OPT::LOG_SHOW,
+ OPT::USAGE_SHOW,
+ OPT::OBJECT_STAT,
+ OPT::BI_GET,
+ OPT::BI_LIST,
+ OPT::OLH_GET,
+ OPT::OLH_READLOG,
+ OPT::GC_LIST,
+ OPT::LC_LIST,
+ OPT::ORPHANS_LIST_JOBS,
+ OPT::ZONEGROUP_GET,
+ OPT::ZONEGROUP_LIST,
+ OPT::ZONEGROUP_PLACEMENT_LIST,
+ OPT::ZONEGROUP_PLACEMENT_GET,
+ OPT::ZONE_GET,
+ OPT::ZONE_LIST,
+ OPT::ZONE_PLACEMENT_LIST,
+ OPT::ZONE_PLACEMENT_GET,
+ OPT::METADATA_GET,
+ OPT::METADATA_LIST,
+ OPT::METADATA_SYNC_STATUS,
+ OPT::MDLOG_LIST,
+ OPT::MDLOG_STATUS,
+ OPT::SYNC_ERROR_LIST,
+ OPT::SYNC_GROUP_GET,
+ OPT::SYNC_POLICY_GET,
+ OPT::BILOG_LIST,
+ OPT::BILOG_STATUS,
+ OPT::DATA_SYNC_STATUS,
+ OPT::DATALOG_LIST,
+ OPT::DATALOG_STATUS,
+ OPT::REALM_GET,
+ OPT::REALM_GET_DEFAULT,
+ OPT::REALM_LIST,
+ OPT::REALM_LIST_PERIODS,
+ OPT::PERIOD_GET,
+ OPT::PERIOD_GET_CURRENT,
+ OPT::PERIOD_LIST,
+ OPT::GLOBAL_QUOTA_GET,
+ OPT::GLOBAL_RATELIMIT_GET,
+ OPT::SYNC_INFO,
+ OPT::SYNC_STATUS,
+ OPT::ROLE_GET,
+ OPT::ROLE_LIST,
+ OPT::ROLE_POLICY_LIST,
+ OPT::ROLE_POLICY_GET,
+ OPT::RESHARD_LIST,
+ OPT::RESHARD_STATUS,
+ OPT::PUBSUB_TOPIC_LIST,
+ OPT::PUBSUB_NOTIFICATION_LIST,
+ OPT::PUBSUB_TOPIC_GET,
+ OPT::PUBSUB_NOTIFICATION_GET,
+ OPT::SCRIPT_GET,
+ };
+
+ std::set<OPT> gc_ops_list = {
+ OPT::GC_LIST,
+ OPT::GC_PROCESS,
+ OPT::OBJECT_RM,
+ OPT::BUCKET_RM, // --purge-objects
+ OPT::USER_RM, // --purge-data
+ OPT::OBJECTS_EXPIRE,
+ OPT::OBJECTS_EXPIRE_STALE_RM,
+ OPT::LC_PROCESS,
+ OPT::BUCKET_SYNC_RUN,
+ OPT::DATA_SYNC_RUN,
+ OPT::BUCKET_REWRITE,
+ OPT::OBJECT_REWRITE
+ };
+
+ raw_storage_op = (raw_storage_ops_list.find(opt_cmd) != raw_storage_ops_list.end() ||
+ raw_period_update || raw_period_pull);
+ bool need_cache = readonly_ops_list.find(opt_cmd) == readonly_ops_list.end();
+ bool need_gc = (gc_ops_list.find(opt_cmd) != gc_ops_list.end()) && !bypass_gc;
+
+ DriverManager::Config cfg = DriverManager::get_config(true, g_ceph_context);
+
+ auto config_store_type = g_conf().get_val<std::string>("rgw_config_store");
+ cfgstore = DriverManager::create_config_store(dpp(), config_store_type);
+ if (!cfgstore) {
+ cerr << "couldn't init config storage provider" << std::endl;
+ return EIO;
+ }
+
+ if (raw_storage_op) {
+ driver = DriverManager::get_raw_storage(dpp(),
+ g_ceph_context,
+ cfg);
+ } else {
+ driver = DriverManager::get_storage(dpp(),
+ g_ceph_context,
+ cfg,
+ false,
+ false,
+ false,
+ false,
+ false,
+ need_cache && g_conf()->rgw_cache_enabled,
+ need_gc);
+ }
+ if (!driver) {
+ cerr << "couldn't init storage provider" << std::endl;
+ return EIO;
+ }
+
+ /* Needs to be after the driver is initialized. Note, user could be empty here. */
+ user = driver->get_user(user_id_arg);
+
+ init_optional_bucket(opt_bucket, opt_tenant,
+ opt_bucket_name, opt_bucket_id);
+ init_optional_bucket(opt_source_bucket, opt_source_tenant,
+ opt_source_bucket_name, opt_source_bucket_id);
+ init_optional_bucket(opt_dest_bucket, opt_dest_tenant,
+ opt_dest_bucket_name, opt_dest_bucket_id);
+
+ if (tenant.empty()) {
+ tenant = user->get_tenant();
+ } else {
+ if (rgw::sal::User::empty(user) && opt_cmd != OPT::ROLE_CREATE
+ && opt_cmd != OPT::ROLE_DELETE
+ && opt_cmd != OPT::ROLE_GET
+ && opt_cmd != OPT::ROLE_TRUST_POLICY_MODIFY
+ && opt_cmd != OPT::ROLE_LIST
+ && opt_cmd != OPT::ROLE_POLICY_PUT
+ && opt_cmd != OPT::ROLE_POLICY_LIST
+ && opt_cmd != OPT::ROLE_POLICY_GET
+ && opt_cmd != OPT::ROLE_POLICY_DELETE
+ && opt_cmd != OPT::ROLE_UPDATE
+ && opt_cmd != OPT::RESHARD_ADD
+ && opt_cmd != OPT::RESHARD_CANCEL
+ && opt_cmd != OPT::RESHARD_STATUS
+ && opt_cmd != OPT::PUBSUB_TOPIC_LIST
+ && opt_cmd != OPT::PUBSUB_NOTIFICATION_LIST
+ && opt_cmd != OPT::PUBSUB_TOPIC_GET
+ && opt_cmd != OPT::PUBSUB_NOTIFICATION_GET
+ && opt_cmd != OPT::PUBSUB_TOPIC_RM
+ && opt_cmd != OPT::PUBSUB_NOTIFICATION_RM) {
+ cerr << "ERROR: --tenant is set, but there's no user ID" << std::endl;
+ return EINVAL;
+ }
+ user->set_tenant(tenant);
+ }
+ if (user_ns.empty()) {
+ user_ns = user->get_id().ns;
+ } else {
+ user->set_ns(user_ns);
+ }
+
+ if (!new_user_id.empty() && !tenant.empty()) {
+ new_user_id.tenant = tenant;
+ }
+
+ /* check key parameter conflict */
+ if ((!access_key.empty()) && gen_access_key) {
+ cerr << "ERROR: key parameter conflict, --access-key & --gen-access-key" << std::endl;
+ return EINVAL;
+ }
+ if ((!secret_key.empty()) && gen_secret_key) {
+ cerr << "ERROR: key parameter conflict, --secret & --gen-secret" << std::endl;
+ return EINVAL;
+ }
+ }
+
+ // default to pretty json
+ if (format.empty()) {
+ format = "json";
+ pretty_format = true;
+ }
+
+  // Build the output formatter for the requested --format.
+  //
+  // Bug fix: the previous code called
+  //   make_unique<XMLFormatter>(new XMLFormatter(pretty_format))
+  // which forwards the raw pointer to XMLFormatter's `bool pretty`
+  // constructor parameter (any non-null pointer converts to true), so
+  // pretty_format was ignored AND the inner `new`-ed formatter leaked.
+  // make_unique forwards its arguments straight to T's constructor, so the
+  // flag must be passed directly.
+  if (format == "xml")
+    formatter = make_unique<XMLFormatter>(pretty_format);
+  else if (format == "json")
+    formatter = make_unique<JSONFormatter>(pretty_format);
+  else {
+    cerr << "unrecognized format: " << format << std::endl;
+    exit(1);
+  }
+
+ zone_formatter = std::make_unique<JSONFormatter_PrettyZone>(pretty_format);
+
+ realm_name = g_conf()->rgw_realm;
+ zone_name = g_conf()->rgw_zone;
+ zonegroup_name = g_conf()->rgw_zonegroup;
+
+ if (!realm_name.empty()) {
+ opt_realm_name = realm_name;
+ }
+
+ if (!zone_name.empty()) {
+ opt_zone_name = zone_name;
+ }
+
+ if (!zonegroup_name.empty()) {
+ opt_zonegroup_name = zonegroup_name;
+ }
+
+ RGWStreamFlusher stream_flusher(formatter.get(), cout);
+
+ RGWUserAdminOpState user_op(driver);
+ if (!user_email.empty()) {
+ user_op.user_email_specified=true;
+ }
+
+ if (!source_zone_name.empty()) {
+ std::unique_ptr<rgw::sal::Zone> zone;
+ if (driver->get_zone()->get_zonegroup().get_zone_by_name(source_zone_name, &zone) < 0) {
+ cerr << "WARNING: cannot find source zone id for name=" << source_zone_name << std::endl;
+ source_zone = source_zone_name;
+ } else {
+ source_zone.id = zone->get_id();
+ }
+ }
+
+ rgw_http_client_init(g_ceph_context);
+
+  // RAII guard: initialize libcurl here and guarantee
+  // rgw::curl::cleanup_curl() runs on every exit path out of this scope,
+  // including the many early `return`s in the command dispatch below.
+  struct rgw_curl_setup {
+    rgw_curl_setup() {
+      // boost::none: no explicit thread-pool/config override
+      rgw::curl::setup_curl(boost::none);
+    }
+    ~rgw_curl_setup() {
+      rgw::curl::cleanup_curl();
+    }
+  } curl_cleanup;
+
+ oath_init();
+
+ StoreDestructor store_destructor(driver);
+
+ if (raw_storage_op) {
+ switch (opt_cmd) {
+ case OPT::PERIOD_DELETE:
+ {
+ if (period_id.empty()) {
+ cerr << "missing period id" << std::endl;
+ return EINVAL;
+ }
+ int ret = cfgstore->delete_period(dpp(), null_yield, period_id);
+ if (ret < 0) {
+ cerr << "ERROR: couldn't delete period: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ }
+ break;
+ case OPT::PERIOD_GET:
+ {
+ std::optional<epoch_t> epoch;
+ if (!period_epoch.empty()) {
+ epoch = atoi(period_epoch.c_str());
+ }
+ if (staging) {
+ RGWRealm realm;
+ int ret = rgw::read_realm(dpp(), null_yield, cfgstore.get(),
+ realm_id, realm_name, realm);
+ if (ret < 0 ) {
+ cerr << "failed to load realm: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ realm_id = realm.get_id();
+ realm_name = realm.get_name();
+ period_id = RGWPeriod::get_staging_id(realm_id);
+ epoch = 1;
+ }
+ if (period_id.empty()) {
+ // use realm's current period
+ RGWRealm realm;
+ int ret = rgw::read_realm(dpp(), null_yield, cfgstore.get(),
+ realm_id, realm_name, realm);
+ if (ret < 0 ) {
+ cerr << "failed to load realm: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ period_id = realm.current_period;
+ }
+
+ RGWPeriod period;
+ int ret = cfgstore->read_period(dpp(), null_yield, period_id,
+ epoch, period);
+ if (ret < 0) {
+ cerr << "failed to load period: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ encode_json("period", period, formatter.get());
+ formatter->flush(cout);
+ }
+ break;
+ case OPT::PERIOD_GET_CURRENT:
+ {
+ RGWRealm realm;
+ int ret = rgw::read_realm(dpp(), null_yield, cfgstore.get(),
+ realm_id, realm_name, realm);
+ if (ret < 0) {
+ std::cerr << "failed to load realm: " << cpp_strerror(ret) << std::endl;
+ return -ret;
+ }
+
+ formatter->open_object_section("period_get_current");
+ encode_json("current_period", realm.current_period, formatter.get());
+ formatter->close_section();
+ formatter->flush(cout);
+ }
+ break;
+ case OPT::PERIOD_LIST:
+ {
+ Formatter::ObjectSection periods_list{*formatter, "periods_list"};
+ Formatter::ArraySection periods{*formatter, "periods"};
+ rgw::sal::ListResult<std::string> listing;
+ std::array<std::string, 1000> period_ids; // list in pages of 1000
+ do {
+ int ret = cfgstore->list_period_ids(dpp(), null_yield, listing.next,
+ period_ids, listing);
+ if (ret < 0) {
+ std::cerr << "failed to list periods: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ for (const auto& id : listing.entries) {
+ encode_json("id", id, formatter.get());
+ }
+ } while (!listing.next.empty());
+ } // close sections periods and periods_list
+ formatter->flush(cout);
+ break;
+ case OPT::PERIOD_UPDATE:
+ {
+ int ret = update_period(cfgstore.get(), realm_id, realm_name,
+ period_epoch, commit, remote, url,
+ opt_region, access_key, secret_key,
+ formatter.get(), yes_i_really_mean_it);
+ if (ret < 0) {
+ return -ret;
+ }
+ }
+ break;
+ case OPT::PERIOD_PULL:
+ {
+ boost::optional<RGWRESTConn> conn;
+ RGWRESTConn *remote_conn = nullptr;
+ if (url.empty()) {
+ // load current period for endpoints
+ RGWRealm realm;
+ int ret = rgw::read_realm(dpp(), null_yield, cfgstore.get(),
+ realm_id, realm_name, realm);
+ if (ret < 0 ) {
+ cerr << "failed to load realm: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ period_id = realm.current_period;
+
+ RGWPeriod current_period;
+ ret = cfgstore->read_period(dpp(), null_yield, period_id,
+ std::nullopt, current_period);
+ if (ret < 0) {
+ cerr << "failed to load current period: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ if (remote.empty()) {
+ // use realm master zone as remote
+ remote = current_period.get_master_zone().id;
+ }
+ conn = get_remote_conn(static_cast<rgw::sal::RadosStore*>(driver), current_period.get_map(), remote);
+ if (!conn) {
+ cerr << "failed to find a zone or zonegroup for remote "
+ << remote << std::endl;
+ return -ENOENT;
+ }
+ remote_conn = &*conn;
+ }
+
+ RGWPeriod period;
+ int ret = do_period_pull(cfgstore.get(), remote_conn, url,
+ opt_region, access_key, secret_key,
+ realm_id, realm_name, period_id, period_epoch,
+ &period);
+ if (ret < 0) {
+ cerr << "period pull failed: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ encode_json("period", period, formatter.get());
+ formatter->flush(cout);
+ }
+ break;
+ case OPT::GLOBAL_RATELIMIT_GET:
+ case OPT::GLOBAL_RATELIMIT_SET:
+ case OPT::GLOBAL_RATELIMIT_ENABLE:
+ case OPT::GLOBAL_RATELIMIT_DISABLE:
+ {
+ if (realm_id.empty()) {
+ if (!realm_name.empty()) {
+ // look up realm_id for the given realm_name
+ int ret = cfgstore->read_realm_id(dpp(), null_yield,
+ realm_name, realm_id);
+ if (ret < 0) {
+ cerr << "ERROR: failed to read realm for " << realm_name
+ << ": " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ } else {
+ // use default realm_id when none is given
+ int ret = cfgstore->read_default_realm_id(dpp(), null_yield,
+ realm_id);
+ if (ret < 0 && ret != -ENOENT) { // on ENOENT, use empty realm_id
+ cerr << "ERROR: failed to read default realm: "
+ << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ }
+ }
+
+ RGWPeriodConfig period_config;
+ int ret = cfgstore->read_period_config(dpp(), null_yield, realm_id,
+ period_config);
+ if (ret < 0 && ret != -ENOENT) {
+ cerr << "ERROR: failed to read period config: "
+ << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ bool ratelimit_configured = true;
+ formatter->open_object_section("period_config");
+ if (ratelimit_scope == "bucket") {
+ ratelimit_configured = set_ratelimit_info(period_config.bucket_ratelimit, opt_cmd,
+ max_read_ops, max_write_ops,
+ max_read_bytes, max_write_bytes,
+ have_max_read_ops, have_max_write_ops,
+ have_max_read_bytes, have_max_write_bytes);
+ encode_json("bucket_ratelimit", period_config.bucket_ratelimit, formatter.get());
+ } else if (ratelimit_scope == "user") {
+ ratelimit_configured = set_ratelimit_info(period_config.user_ratelimit, opt_cmd,
+ max_read_ops, max_write_ops,
+ max_read_bytes, max_write_bytes,
+ have_max_read_ops, have_max_write_ops,
+ have_max_read_bytes, have_max_write_bytes);
+ encode_json("user_ratelimit", period_config.user_ratelimit, formatter.get());
+ } else if (ratelimit_scope == "anonymous") {
+ ratelimit_configured = set_ratelimit_info(period_config.anon_ratelimit, opt_cmd,
+ max_read_ops, max_write_ops,
+ max_read_bytes, max_write_bytes,
+ have_max_read_ops, have_max_write_ops,
+ have_max_read_bytes, have_max_write_bytes);
+ encode_json("anonymous_ratelimit", period_config.anon_ratelimit, formatter.get());
+ } else if (ratelimit_scope.empty() && opt_cmd == OPT::GLOBAL_RATELIMIT_GET) {
+ // if no scope is given for GET, print both
+ encode_json("bucket_ratelimit", period_config.bucket_ratelimit, formatter.get());
+ encode_json("user_ratelimit", period_config.user_ratelimit, formatter.get());
+ encode_json("anonymous_ratelimit", period_config.anon_ratelimit, formatter.get());
+ } else {
+ cerr << "ERROR: invalid rate limit scope specification. Please specify "
+ "either --ratelimit-scope=bucket, or --ratelimit-scope=user or --ratelimit-scope=anonymous" << std::endl;
+ return EINVAL;
+ }
+ if (!ratelimit_configured) {
+ cerr << "ERROR: no rate limit values have been specified" << std::endl;
+ return EINVAL;
+ }
+
+ formatter->close_section();
+
+ if (opt_cmd != OPT::GLOBAL_RATELIMIT_GET) {
+ // write the modified period config
+ constexpr bool exclusive = false;
+ ret = cfgstore->write_period_config(dpp(), null_yield, exclusive,
+ realm_id, period_config);
+ if (ret < 0) {
+ cerr << "ERROR: failed to write period config: "
+ << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ if (!realm_id.empty()) {
+ cout << "Global ratelimit changes saved. Use 'period update' to apply "
+ "them to the staging period, and 'period commit' to commit the "
+ "new period." << std::endl;
+ } else {
+ cout << "Global ratelimit changes saved. They will take effect as "
+ "the gateways are restarted." << std::endl;
+ }
+ }
+
+ formatter->flush(cout);
+ }
+ break;
+ case OPT::GLOBAL_QUOTA_GET:
+ case OPT::GLOBAL_QUOTA_SET:
+ case OPT::GLOBAL_QUOTA_ENABLE:
+ case OPT::GLOBAL_QUOTA_DISABLE:
+ {
+ if (realm_id.empty()) {
+ if (!realm_name.empty()) {
+ // look up realm_id for the given realm_name
+ int ret = cfgstore->read_realm_id(dpp(), null_yield,
+ realm_name, realm_id);
+ if (ret < 0) {
+ cerr << "ERROR: failed to read realm for " << realm_name
+ << ": " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ } else {
+ // use default realm_id when none is given
+ int ret = cfgstore->read_default_realm_id(dpp(), null_yield,
+ realm_id);
+ if (ret < 0 && ret != -ENOENT) { // on ENOENT, use empty realm_id
+ cerr << "ERROR: failed to read default realm: "
+ << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ }
+ }
+
+ RGWPeriodConfig period_config;
+ int ret = cfgstore->read_period_config(dpp(), null_yield, realm_id,
+ period_config);
+ if (ret < 0 && ret != -ENOENT) {
+ cerr << "ERROR: failed to read period config: "
+ << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ formatter->open_object_section("period_config");
+ if (quota_scope == "bucket") {
+ set_quota_info(period_config.quota.bucket_quota, opt_cmd,
+ max_size, max_objects,
+ have_max_size, have_max_objects);
+ encode_json("bucket quota", period_config.quota.bucket_quota, formatter.get());
+ } else if (quota_scope == "user") {
+ set_quota_info(period_config.quota.user_quota, opt_cmd,
+ max_size, max_objects,
+ have_max_size, have_max_objects);
+ encode_json("user quota", period_config.quota.user_quota, formatter.get());
+ } else if (quota_scope.empty() && opt_cmd == OPT::GLOBAL_QUOTA_GET) {
+ // if no scope is given for GET, print both
+ encode_json("bucket quota", period_config.quota.bucket_quota, formatter.get());
+ encode_json("user quota", period_config.quota.user_quota, formatter.get());
+ } else {
+ cerr << "ERROR: invalid quota scope specification. Please specify "
+ "either --quota-scope=bucket, or --quota-scope=user" << std::endl;
+ return EINVAL;
+ }
+ formatter->close_section();
+
+ if (opt_cmd != OPT::GLOBAL_QUOTA_GET) {
+ // write the modified period config
+ constexpr bool exclusive = false;
+ ret = cfgstore->write_period_config(dpp(), null_yield, exclusive,
+ realm_id, period_config);
+ if (ret < 0) {
+ cerr << "ERROR: failed to write period config: "
+ << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ if (!realm_id.empty()) {
+ cout << "Global quota changes saved. Use 'period update' to apply "
+ "them to the staging period, and 'period commit' to commit the "
+ "new period." << std::endl;
+ } else {
+ cout << "Global quota changes saved. They will take effect as "
+ "the gateways are restarted." << std::endl;
+ }
+ }
+
+ formatter->flush(cout);
+ }
+ break;
+ case OPT::REALM_CREATE:
+ {
+ if (realm_name.empty()) {
+ cerr << "missing realm name" << std::endl;
+ return EINVAL;
+ }
+
+ RGWRealm realm;
+ realm.name = realm_name;
+
+ constexpr bool exclusive = true;
+ int ret = rgw::create_realm(dpp(), null_yield, cfgstore.get(),
+ exclusive, realm);
+ if (ret < 0) {
+ cerr << "ERROR: couldn't create realm " << realm_name << ": " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ if (set_default) {
+ ret = rgw::set_default_realm(dpp(), null_yield, cfgstore.get(), realm);
+ if (ret < 0) {
+ cerr << "failed to set realm " << realm_name << " as default: " << cpp_strerror(-ret) << std::endl;
+ }
+ }
+
+ encode_json("realm", realm, formatter.get());
+ formatter->flush(cout);
+ }
+ break;
+ case OPT::REALM_DELETE:
+ {
+ if (realm_id.empty() && realm_name.empty()) {
+ cerr << "missing realm name or id" << std::endl;
+ return EINVAL;
+ }
+ RGWRealm realm;
+ std::unique_ptr<rgw::sal::RealmWriter> writer;
+ int ret = rgw::read_realm(dpp(), null_yield, cfgstore.get(),
+ realm_id, realm_name, realm, &writer);
+ if (ret < 0) {
+ cerr << "failed to load realm: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ ret = writer->remove(dpp(), null_yield);
+ if (ret < 0) {
+ cerr << "failed to remove realm: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ }
+ break;
+ case OPT::REALM_GET:
+ {
+ RGWRealm realm;
+ int ret = rgw::read_realm(dpp(), null_yield, cfgstore.get(),
+ realm_id, realm_name, realm);
+ if (ret < 0) {
+ if (ret == -ENOENT && realm_name.empty() && realm_id.empty()) {
+ cerr << "missing realm name or id, or default realm not found" << std::endl;
+ } else {
+ cerr << "failed to load realm: " << cpp_strerror(-ret) << std::endl;
+ }
+ return -ret;
+ }
+ encode_json("realm", realm, formatter.get());
+ formatter->flush(cout);
+ }
+ break;
+ case OPT::REALM_GET_DEFAULT:
+ {
+ string default_id;
+ int ret = cfgstore->read_default_realm_id(dpp(), null_yield, default_id);
+ if (ret == -ENOENT) {
+ cout << "No default realm is set" << std::endl;
+ return -ret;
+ } else if (ret < 0) {
+ cerr << "Error reading default realm: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ cout << "default realm: " << default_id << std::endl;
+ }
+ break;
+ case OPT::REALM_LIST:
+ {
+ std::string default_id;
+ int ret = cfgstore->read_default_realm_id(dpp(), null_yield,
+ default_id);
+ if (ret < 0 && ret != -ENOENT) {
+ cerr << "could not determine default realm: " << cpp_strerror(-ret) << std::endl;
+ }
+
+ Formatter::ObjectSection realms_list{*formatter, "realms_list"};
+ encode_json("default_info", default_id, formatter.get());
+
+ Formatter::ArraySection realms{*formatter, "realms"};
+ rgw::sal::ListResult<std::string> listing;
+ std::array<std::string, 1000> names; // list in pages of 1000
+ do {
+ ret = cfgstore->list_realm_names(dpp(), null_yield, listing.next,
+ names, listing);
+ if (ret < 0) {
+ std::cerr << "failed to list realms: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ for (const auto& name : listing.entries) {
+ encode_json("name", name, formatter.get());
+ }
+ } while (!listing.next.empty());
+ } // close sections realms and realms_list
+ formatter->flush(cout);
+ break;
+ case OPT::REALM_LIST_PERIODS:
+ {
+ // use realm's current period
+ RGWRealm realm;
+ int ret = rgw::read_realm(dpp(), null_yield, cfgstore.get(),
+ realm_id, realm_name, realm);
+ if (ret < 0) {
+ cerr << "failed to load realm: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ period_id = realm.current_period;
+
+ Formatter::ObjectSection periods_list{*formatter, "realm_periods_list"};
+ encode_json("current_period", period_id, formatter.get());
+
+ Formatter::ArraySection periods{*formatter, "periods"};
+
+ while (!period_id.empty()) {
+ RGWPeriod period;
+ ret = cfgstore->read_period(dpp(), null_yield, period_id,
+ std::nullopt, period);
+ if (ret < 0) {
+ cerr << "failed to load period id " << period_id
+ << ": " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ encode_json("id", period_id, formatter.get());
+ period_id = period.predecessor_uuid;
+ }
+ } // close sections periods and realm_periods_list
+ formatter->flush(cout);
+ break;
+
+ case OPT::REALM_RENAME:
+ {
+ if (realm_new_name.empty()) {
+ cerr << "missing realm new name" << std::endl;
+ return EINVAL;
+ }
+ if (realm_name.empty() && realm_id.empty()) {
+ cerr << "missing realm name or id" << std::endl;
+ return EINVAL;
+ }
+
+ RGWRealm realm;
+ std::unique_ptr<rgw::sal::RealmWriter> writer;
+ int ret = rgw::read_realm(dpp(), null_yield, cfgstore.get(),
+ realm_id, realm_name, realm, &writer);
+ if (ret < 0) {
+ cerr << "failed to load realm: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ ret = writer->rename(dpp(), null_yield, realm, realm_new_name);
+ if (ret < 0) {
+ cerr << "rename failed: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ cout << "Realm name updated. Note that this change only applies to "
+ "the current cluster, so this command must be run separately "
+ "on each of the realm's other clusters." << std::endl;
+ }
+ break;
+ case OPT::REALM_SET:
+ {
+ if (realm_id.empty() && realm_name.empty()) {
+ cerr << "no realm name or id provided" << std::endl;
+ return EINVAL;
+ }
+ bool new_realm = false;
+ RGWRealm realm;
+ std::unique_ptr<rgw::sal::RealmWriter> writer;
+ int ret = rgw::read_realm(dpp(), null_yield, cfgstore.get(),
+ realm_id, realm_name, realm, &writer);
+ if (ret < 0 && ret != -ENOENT) {
+ cerr << "failed to init realm: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ } else if (ret == -ENOENT) {
+ new_realm = true;
+ }
+ ret = read_decode_json(infile, realm);
+ if (ret < 0) {
+ return 1;
+ }
+ if (!realm_name.empty() && realm.get_name() != realm_name) {
+ cerr << "mismatch between --rgw-realm " << realm_name << " and json input file name " <<
+ realm.get_name() << std::endl;
+ return EINVAL;
+ }
+ /* new realm */
+ if (new_realm) {
+ cout << "clearing period and epoch for new realm" << std::endl;
+ realm.clear_current_period_and_epoch();
+ constexpr bool exclusive = true;
+ ret = rgw::create_realm(dpp(), null_yield, cfgstore.get(),
+ exclusive, realm);
+ if (ret < 0) {
+ cerr << "ERROR: couldn't create new realm: " << cpp_strerror(-ret) << std::endl;
+ return 1;
+ }
+ } else {
+ ret = writer->write(dpp(), null_yield, realm);
+ if (ret < 0) {
+ cerr << "ERROR: couldn't driver realm info: " << cpp_strerror(-ret) << std::endl;
+ return 1;
+ }
+ }
+
+ if (set_default) {
+ ret = rgw::set_default_realm(dpp(), null_yield, cfgstore.get(), realm);
+ if (ret < 0) {
+ cerr << "failed to set realm " << realm_name << " as default: " << cpp_strerror(-ret) << std::endl;
+ }
+ }
+ encode_json("realm", realm, formatter.get());
+ formatter->flush(cout);
+ }
+ break;
+
+ case OPT::REALM_DEFAULT:
+ {
+ RGWRealm realm;
+ int ret = rgw::read_realm(dpp(), null_yield, cfgstore.get(),
+ realm_id, realm_name, realm);
+ if (ret < 0) {
+ cerr << "failed to load realm: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ ret = rgw::set_default_realm(dpp(), null_yield, cfgstore.get(), realm);
+ if (ret < 0) {
+ cerr << "failed to set realm as default: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ }
+ break;
+ case OPT::REALM_PULL:
+ {
+ if (url.empty()) {
+ cerr << "A --url must be provided." << std::endl;
+ return EINVAL;
+ }
+ RGWEnv env;
+ req_info info(g_ceph_context, &env);
+ info.method = "GET";
+ info.request_uri = "/admin/realm";
+
+ map<string, string> &params = info.args.get_params();
+ if (!realm_id.empty())
+ params["id"] = realm_id;
+ if (!realm_name.empty())
+ params["name"] = realm_name;
+
+ bufferlist bl;
+ JSONParser p;
+ int ret = send_to_url(url, opt_region, access_key, secret_key, info, bl, p);
+ if (ret < 0) {
+ cerr << "request failed: " << cpp_strerror(-ret) << std::endl;
+ if (ret == -EACCES) {
+ cerr << "If the realm has been changed on the master zone, the "
+ "master zone's gateway may need to be restarted to recognize "
+ "this user." << std::endl;
+ }
+ return -ret;
+ }
+ RGWRealm realm;
+ try {
+ decode_json_obj(realm, &p);
+ } catch (const JSONDecoder::err& e) {
+ cerr << "failed to decode JSON response: " << e.what() << std::endl;
+ return EINVAL;
+ }
+ RGWPeriod period;
+ auto& current_period = realm.get_current_period();
+ if (!current_period.empty()) {
+ // pull the latest epoch of the realm's current period
+ ret = do_period_pull(cfgstore.get(), nullptr, url, opt_region,
+ access_key, secret_key,
+ realm_id, realm_name, current_period, "",
+ &period);
+ if (ret < 0) {
+ cerr << "could not fetch period " << current_period << std::endl;
+ return -ret;
+ }
+ }
+ constexpr bool exclusive = false;
+ ret = rgw::create_realm(dpp(), null_yield, cfgstore.get(),
+ exclusive, realm);
+ if (ret < 0) {
+ cerr << "Error storing realm " << realm.get_id() << ": "
+ << cpp_strerror(ret) << std::endl;
+ return -ret;
+ }
+
+ if (set_default) {
+ ret = rgw::set_default_realm(dpp(), null_yield, cfgstore.get(), realm);
+ if (ret < 0) {
+ cerr << "failed to set realm " << realm_name << " as default: " << cpp_strerror(-ret) << std::endl;
+ }
+ }
+
+ encode_json("realm", realm, formatter.get());
+ formatter->flush(cout);
+ }
+ break;
+
+ case OPT::ZONEGROUP_ADD:
+ {
+ if (zonegroup_id.empty() && zonegroup_name.empty()) {
+ cerr << "no zonegroup name or id provided" << std::endl;
+ return EINVAL;
+ }
+
+ // load the zonegroup and zone params
+ RGWZoneGroup zonegroup;
+ std::unique_ptr<rgw::sal::ZoneGroupWriter> zonegroup_writer;
+ int ret = rgw::read_zonegroup(dpp(), null_yield, cfgstore.get(),
+ zonegroup_id, zonegroup_name,
+ zonegroup, &zonegroup_writer);
+ if (ret < 0) {
+ cerr << "failed to load zonegroup " << zonegroup_name << " id "
+ << zonegroup_id << ": " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ RGWZoneParams zone_params;
+ std::unique_ptr<rgw::sal::ZoneWriter> zone_writer;
+ ret = rgw::read_zone(dpp(), null_yield, cfgstore.get(),
+ zone_id, zone_name, zone_params, &zone_writer);
+ if (ret < 0) {
+ cerr << "unable to load zone: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ // update zone_params if necessary
+ bool need_zone_update = false;
+
+ if (zone_params.realm_id != zonegroup.realm_id) {
+ if (!zone_params.realm_id.empty()) {
+ cerr << "WARNING: overwriting zone realm_id=" << zone_params.realm_id
+ << " to match zonegroup realm_id=" << zonegroup.realm_id << std::endl;
+ }
+ zone_params.realm_id = zonegroup.realm_id;
+ need_zone_update = true;
+ }
+
+ for (auto a : tier_config_add) {
+ ret = zone_params.tier_config.set(a.first, a.second);
+ if (ret < 0) {
+ cerr << "ERROR: failed to set configurable: " << a << std::endl;
+ return EINVAL;
+ }
+ need_zone_update = true;
+ }
+
+ if (need_zone_update) {
+ ret = zone_writer->write(dpp(), null_yield, zone_params);
+ if (ret < 0) {
+ cerr << "failed to save zone info: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ }
+
+ const bool *pis_master = (is_master_set ? &is_master : nullptr);
+ const bool *pread_only = (is_read_only_set ? &read_only : nullptr);
+ const bool *psync_from_all = (sync_from_all_specified ? &sync_from_all : nullptr);
+ const string *predirect_zone = (redirect_zone_set ? &redirect_zone : nullptr);
+
+ // validate --tier-type if specified
+ const string *ptier_type = (tier_type_specified ? &tier_type : nullptr);
+ if (ptier_type) {
+ auto sync_mgr = static_cast<rgw::sal::RadosStore*>(driver)->svc()->sync_modules->get_manager();
+ if (!sync_mgr->get_module(*ptier_type, nullptr)) {
+ ldpp_dout(dpp(), -1) << "ERROR: could not find sync module: "
+ << *ptier_type << ", valid sync modules: "
+ << sync_mgr->get_registered_module_names() << dendl;
+ return EINVAL;
+ }
+ }
+
+ if (enable_features.empty()) { // enable all features by default
+ enable_features.insert(rgw::zone_features::supported.begin(),
+ rgw::zone_features::supported.end());
+ }
+
+ // add/update the public zone information stored in the zonegroup
+ ret = rgw::add_zone_to_group(dpp(), zonegroup, zone_params,
+ pis_master, pread_only, endpoints,
+ ptier_type, psync_from_all,
+ sync_from, sync_from_rm,
+ predirect_zone, bucket_index_max_shards,
+ enable_features, disable_features);
+ if (ret < 0) {
+ return -ret;
+ }
+
+ // write the updated zonegroup
+ ret = zonegroup_writer->write(dpp(), null_yield, zonegroup);
+ if (ret < 0) {
+ cerr << "failed to write updated zonegroup " << zonegroup.get_name()
+ << ": " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ encode_json("zonegroup", zonegroup, formatter.get());
+ formatter->flush(cout);
+ }
+ break;
+ case OPT::ZONEGROUP_CREATE:
+ {
+ if (zonegroup_name.empty()) {
+ cerr << "Missing zonegroup name" << std::endl;
+ return EINVAL;
+ }
+ RGWRealm realm;
+ int ret = rgw::read_realm(dpp(), null_yield, cfgstore.get(),
+ realm_id, realm_name, realm);
+ if (ret < 0) {
+ cerr << "failed to init realm: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ RGWZoneGroup zonegroup;
+ zonegroup.name = zonegroup_name;
+ zonegroup.is_master = is_master;
+ zonegroup.realm_id = realm.get_id();
+ zonegroup.endpoints = endpoints;
+ zonegroup.api_name = (api_name.empty() ? zonegroup_name : api_name);
+
+ zonegroup.enabled_features = enable_features;
+ if (zonegroup.enabled_features.empty()) { // enable features by default
+ zonegroup.enabled_features.insert(rgw::zone_features::enabled.begin(),
+ rgw::zone_features::enabled.end());
+ }
+ for (const auto& feature : disable_features) {
+ auto i = zonegroup.enabled_features.find(feature);
+ if (i == zonegroup.enabled_features.end()) {
+ ldout(cct, 1) << "WARNING: zone feature \"" << feature
+ << "\" was not enabled in zonegroup " << zonegroup_name << dendl;
+ continue;
+ }
+ zonegroup.enabled_features.erase(i);
+ }
+
+ constexpr bool exclusive = true;
+ ret = rgw::create_zonegroup(dpp(), null_yield, cfgstore.get(),
+ exclusive, zonegroup);
+ if (ret < 0) {
+ cerr << "failed to create zonegroup " << zonegroup_name << ": " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ if (set_default) {
+ ret = rgw::set_default_zonegroup(dpp(), null_yield, cfgstore.get(),
+ zonegroup);
+ if (ret < 0) {
+ cerr << "failed to set zonegroup " << zonegroup_name << " as default: " << cpp_strerror(-ret) << std::endl;
+ }
+ }
+
+ encode_json("zonegroup", zonegroup, formatter.get());
+ formatter->flush(cout);
+ }
+ break;
+ case OPT::ZONEGROUP_DEFAULT:
+ {
+ if (zonegroup_id.empty() && zonegroup_name.empty()) {
+ cerr << "no zonegroup name or id provided" << std::endl;
+ return EINVAL;
+ }
+
+ RGWZoneGroup zonegroup;
+ int ret = rgw::read_zonegroup(dpp(), null_yield, cfgstore.get(),
+ zonegroup_id, zonegroup_name,
+ zonegroup);
+ if (ret < 0) {
+ cerr << "failed to init zonegroup: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ ret = rgw::set_default_zonegroup(dpp(), null_yield, cfgstore.get(),
+ zonegroup);
+ if (ret < 0) {
+ cerr << "failed to set zonegroup as default: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ }
+ break;
+ case OPT::ZONEGROUP_DELETE:
+ {
+ if (zonegroup_id.empty() && zonegroup_name.empty()) {
+ cerr << "no zonegroup name or id provided" << std::endl;
+ return EINVAL;
+ }
+ RGWZoneGroup zonegroup;
+ std::unique_ptr<rgw::sal::ZoneGroupWriter> writer;
+ int ret = rgw::read_zonegroup(dpp(), null_yield, cfgstore.get(),
+ zonegroup_id, zonegroup_name,
+ zonegroup, &writer);
+ if (ret < 0) {
+ cerr << "failed to load zonegroup: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ ret = writer->remove(dpp(), null_yield);
+ if (ret < 0) {
+ cerr << "ERROR: couldn't delete zonegroup: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ }
+ break;
+ case OPT::ZONEGROUP_GET:
+ {
+ RGWZoneGroup zonegroup;
+ int ret = rgw::read_zonegroup(dpp(), null_yield, cfgstore.get(),
+ zonegroup_id, zonegroup_name, zonegroup);
+ if (ret < 0) {
+ cerr << "failed to load zonegroup: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ encode_json("zonegroup", zonegroup, formatter.get());
+ formatter->flush(cout);
+ }
+ break;
+ case OPT::ZONEGROUP_LIST:
+ {
+ RGWZoneGroup default_zonegroup;
+ int ret = rgw::read_zonegroup(dpp(), null_yield, cfgstore.get(),
+ {}, {}, default_zonegroup);
+ if (ret < 0 && ret != -ENOENT) {
+ cerr << "could not determine default zonegroup: " << cpp_strerror(-ret) << std::endl;
+ }
+
+ Formatter::ObjectSection zonegroups_list{*formatter, "zonegroups_list"};
+ encode_json("default_info", default_zonegroup.id, formatter.get());
+
+ Formatter::ArraySection zonegroups{*formatter, "zonegroups"};
+ rgw::sal::ListResult<std::string> listing;
+ std::array<std::string, 1000> names; // list in pages of 1000
+ do {
+ ret = cfgstore->list_zonegroup_names(dpp(), null_yield, listing.next,
+ names, listing);
+ if (ret < 0) {
+ std::cerr << "failed to list zonegroups: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ for (const auto& name : listing.entries) {
+ encode_json("name", name, formatter.get());
+ }
+ } while (!listing.next.empty());
+ } // close sections zonegroups and zonegroups_list
+ formatter->flush(cout);
+ break;
+ case OPT::ZONEGROUP_MODIFY:
+ {
+ RGWZoneGroup zonegroup;
+ std::unique_ptr<rgw::sal::ZoneGroupWriter> writer;
+ int ret = rgw::read_zonegroup(dpp(), null_yield, cfgstore.get(),
+ zonegroup_id, zonegroup_name,
+ zonegroup, &writer);
+ if (ret < 0) {
+ cerr << "failed to init zonegroup: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ bool need_update = false;
+
+ if (!master_zone.empty()) {
+ zonegroup.master_zone = master_zone;
+ need_update = true;
+ }
+
+ if (is_master_set) {
+ zonegroup.is_master = is_master;
+ need_update = true;
+ }
+
+ if (!endpoints.empty()) {
+ zonegroup.endpoints = endpoints;
+ need_update = true;
+ }
+
+ if (!api_name.empty()) {
+ zonegroup.api_name = api_name;
+ need_update = true;
+ }
+
+ if (!realm_id.empty()) {
+ zonegroup.realm_id = realm_id;
+ need_update = true;
+ } else if (!realm_name.empty()) {
+ // get realm id from name
+ ret = cfgstore->read_realm_id(dpp(), null_yield, realm_name,
+ zonegroup.realm_id);
+ if (ret < 0) {
+ cerr << "failed to find realm by name " << realm_name << std::endl;
+ return -ret;
+ }
+ need_update = true;
+ }
+
+ if (bucket_index_max_shards) {
+ for (auto& [name, zone] : zonegroup.zones) {
+ zone.bucket_index_max_shards = *bucket_index_max_shards;
+ }
+ need_update = true;
+ }
+
+ for (const auto& feature : enable_features) {
+ zonegroup.enabled_features.insert(feature);
+ need_update = true;
+ }
+ for (const auto& feature : disable_features) {
+ auto i = zonegroup.enabled_features.find(feature);
+ if (i == zonegroup.enabled_features.end()) {
+ ldout(cct, 1) << "WARNING: zone feature \"" << feature
+ << "\" was not enabled in zonegroup "
+ << zonegroup.get_name() << dendl;
+ continue;
+ }
+ zonegroup.enabled_features.erase(i);
+ need_update = true;
+ }
+
+ if (need_update) {
+ ret = writer->write(dpp(), null_yield, zonegroup);
+ if (ret < 0) {
+ cerr << "failed to update zonegroup: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ }
+
+ if (set_default) {
+ ret = rgw::set_default_zonegroup(dpp(), null_yield, cfgstore.get(),
+ zonegroup);
+ if (ret < 0) {
+ cerr << "failed to set zonegroup " << zonegroup_name << " as default: " << cpp_strerror(-ret) << std::endl;
+ }
+ }
+
+ encode_json("zonegroup", zonegroup, formatter.get());
+ formatter->flush(cout);
+ }
+ break;
+ case OPT::ZONEGROUP_SET:
+ {
+ RGWRealm realm;
+ int ret = rgw::read_realm(dpp(), null_yield, cfgstore.get(),
+ realm_id, realm_name, realm);
+ bool default_realm_not_exist = (ret == -ENOENT && realm_id.empty() && realm_name.empty());
+
+ if (ret < 0 && !default_realm_not_exist) {
+ cerr << "failed to init realm: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ RGWZoneGroup zonegroup;
+ ret = read_decode_json(infile, zonegroup);
+ if (ret < 0) {
+ return 1;
+ }
+ if (zonegroup.realm_id.empty() && !default_realm_not_exist) {
+ zonegroup.realm_id = realm.get_id();
+ }
+ // validate zonegroup features
+ for (const auto& feature : zonegroup.enabled_features) {
+ if (!rgw::zone_features::supports(feature)) {
+ std::cerr << "ERROR: Unrecognized zonegroup feature \""
+ << feature << "\"" << std::endl;
+ return EINVAL;
+ }
+ }
+ for (const auto& [name, zone] : zonegroup.zones) {
+ // validate zone features
+ for (const auto& feature : zone.supported_features) {
+ if (!rgw::zone_features::supports(feature)) {
+ std::cerr << "ERROR: Unrecognized zone feature \""
+ << feature << "\" in zone " << zone.name << std::endl;
+ return EINVAL;
+ }
+ }
+ // zone must support everything zonegroup does
+ for (const auto& feature : zonegroup.enabled_features) {
+ if (!zone.supports(feature)) {
+ std::cerr << "ERROR: Zone " << name << " does not support feature \""
+ << feature << "\" required by zonegroup" << std::endl;
+ return EINVAL;
+ }
+ }
+ }
+
+ // create/overwrite the zonegroup info
+ constexpr bool exclusive = false;
+ ret = rgw::create_zonegroup(dpp(), null_yield, cfgstore.get(),
+ exclusive, zonegroup);
+ if (ret < 0) {
+ cerr << "ERROR: couldn't create zonegroup info: " << cpp_strerror(-ret) << std::endl;
+ return 1;
+ }
+
+ if (set_default) {
+ ret = rgw::set_default_zonegroup(dpp(), null_yield, cfgstore.get(),
+ zonegroup);
+ if (ret < 0) {
+ cerr << "failed to set zonegroup " << zonegroup_name << " as default: " << cpp_strerror(-ret) << std::endl;
+ }
+ }
+
+ encode_json("zonegroup", zonegroup, formatter.get());
+ formatter->flush(cout);
+ }
+ break;
+ case OPT::ZONEGROUP_REMOVE:
+ {
+ RGWZoneGroup zonegroup;
+ std::unique_ptr<rgw::sal::ZoneGroupWriter> writer;
+ int ret = rgw::read_zonegroup(dpp(), null_yield, cfgstore.get(),
+ zonegroup_id, zonegroup_name,
+ zonegroup, &writer);
+ if (ret < 0) {
+ cerr << "failed to init zonegroup: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ if (zone_id.empty()) {
+ if (zone_name.empty()) {
+ cerr << "no --zone-id or --rgw-zone name provided" << std::endl;
+ return EINVAL;
+ }
+ // look up zone id by name
+ for (auto& z : zonegroup.zones) {
+ if (zone_name == z.second.name) {
+ zone_id = z.second.id;
+ break;
+ }
+ }
+ if (zone_id.empty()) {
+ cerr << "zone name " << zone_name << " not found in zonegroup "
+ << zonegroup.get_name() << std::endl;
+ return ENOENT;
+ }
+ }
+
+ ret = rgw::remove_zone_from_group(dpp(), zonegroup, zone_id);
+ if (ret < 0) {
+ cerr << "failed to remove zone: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ ret = writer->write(dpp(), null_yield, zonegroup);
+ if (ret < 0) {
+ cerr << "failed to write zonegroup: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ encode_json("zonegroup", zonegroup, formatter.get());
+ formatter->flush(cout);
+ }
+ break;
+ case OPT::ZONEGROUP_RENAME:
+ {
+ if (zonegroup_new_name.empty()) {
+ cerr << " missing zonegroup new name" << std::endl;
+ return EINVAL;
+ }
+ if (zonegroup_id.empty() && zonegroup_name.empty()) {
+ cerr << "no zonegroup name or id provided" << std::endl;
+ return EINVAL;
+ }
+ RGWZoneGroup zonegroup;
+ std::unique_ptr<rgw::sal::ZoneGroupWriter> writer;
+ int ret = rgw::read_zonegroup(dpp(), null_yield, cfgstore.get(),
+ zonegroup_id, zonegroup_name,
+ zonegroup, &writer);
+ if (ret < 0) {
+ cerr << "failed to load zonegroup: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ ret = writer->rename(dpp(), null_yield, zonegroup, zonegroup_new_name);
+ if (ret < 0) {
+ cerr << "failed to rename zonegroup: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ }
+ break;
+ case OPT::ZONEGROUP_PLACEMENT_LIST:
+ {
+ RGWZoneGroup zonegroup;
+ int ret = rgw::read_zonegroup(dpp(), null_yield, cfgstore.get(),
+ zonegroup_id, zonegroup_name, zonegroup);
+ if (ret < 0) {
+ cerr << "failed to load zonegroup: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ encode_json("placement_targets", zonegroup.placement_targets, formatter.get());
+ formatter->flush(cout);
+ }
+ break;
+ case OPT::ZONEGROUP_PLACEMENT_GET:
+ {
+ if (placement_id.empty()) {
+ cerr << "ERROR: --placement-id not specified" << std::endl;
+ return EINVAL;
+ }
+
+ RGWZoneGroup zonegroup;
+ int ret = rgw::read_zonegroup(dpp(), null_yield, cfgstore.get(),
+ zonegroup_id, zonegroup_name, zonegroup);
+ if (ret < 0) {
+ cerr << "failed to load zonegroup: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ auto p = zonegroup.placement_targets.find(placement_id);
+ if (p == zonegroup.placement_targets.end()) {
+ cerr << "failed to find a zonegroup placement target named '" << placement_id << "'" << std::endl;
+ return -ENOENT;
+ }
+ encode_json("placement_targets", p->second, formatter.get());
+ formatter->flush(cout);
+ }
+ break;
+ case OPT::ZONEGROUP_PLACEMENT_ADD:
+ case OPT::ZONEGROUP_PLACEMENT_MODIFY:
+ case OPT::ZONEGROUP_PLACEMENT_RM:
+ case OPT::ZONEGROUP_PLACEMENT_DEFAULT:
+ {
+ if (placement_id.empty()) {
+ cerr << "ERROR: --placement-id not specified" << std::endl;
+ return EINVAL;
+ }
+
+ rgw_placement_rule rule;
+ rule.from_str(placement_id);
+
+ if (!rule.storage_class.empty() && opt_storage_class &&
+ rule.storage_class != *opt_storage_class) {
+ cerr << "ERROR: provided contradicting storage class configuration" << std::endl;
+ return EINVAL;
+ } else if (rule.storage_class.empty()) {
+ rule.storage_class = opt_storage_class.value_or(string());
+ }
+
+ RGWZoneGroup zonegroup;
+ std::unique_ptr<rgw::sal::ZoneGroupWriter> writer;
+ int ret = rgw::read_zonegroup(dpp(), null_yield, cfgstore.get(),
+ zonegroup_id, zonegroup_name,
+ zonegroup, &writer);
+ if (ret < 0) {
+ cerr << "failed to init zonegroup: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ if (opt_cmd == OPT::ZONEGROUP_PLACEMENT_ADD ||
+ opt_cmd == OPT::ZONEGROUP_PLACEMENT_MODIFY) {
+ RGWZoneGroupPlacementTarget& target = zonegroup.placement_targets[placement_id];
+ if (!tags.empty()) {
+ target.tags.clear();
+ for (auto& t : tags) {
+ target.tags.insert(t);
+ }
+ }
+
+ target.name = placement_id;
+ for (auto& t : tags_rm) {
+ target.tags.erase(t);
+ }
+ for (auto& t : tags_add) {
+ target.tags.insert(t);
+ }
+ target.storage_classes.insert(rule.get_storage_class());
+
+ /* Tier options */
+ bool tier_class = false;
+ std::string storage_class = rule.get_storage_class();
+ RGWZoneGroupPlacementTier t{storage_class};
+ RGWZoneGroupPlacementTier *pt = &t;
+
+ auto ptiter = target.tier_targets.find(storage_class);
+ if (ptiter != target.tier_targets.end()) {
+ pt = &ptiter->second;
+ tier_class = true;
+ } else if (tier_type_specified) {
+ if (tier_type == "cloud-s3") {
+ /* we support only cloud-s3 tier-type for now.
+ * Once set cant be reset. */
+ tier_class = true;
+ pt->tier_type = tier_type;
+ pt->storage_class = storage_class;
+ } else {
+ cerr << "ERROR: Invalid tier-type specified" << std::endl;
+ return EINVAL;
+ }
+ }
+
+ if (tier_class) {
+ if (tier_config_add.size() > 0) {
+ JSONFormattable tconfig;
+ for (auto add : tier_config_add) {
+ int r = tconfig.set(add.first, add.second);
+ if (r < 0) {
+ cerr << "ERROR: failed to set configurable: " << add << std::endl;
+ return EINVAL;
+ }
+ }
+ int r = pt->update_params(tconfig);
+ if (r < 0) {
+ cerr << "ERROR: failed to update tier_config options"<< std::endl;
+ }
+ }
+ if (tier_config_rm.size() > 0) {
+ JSONFormattable tconfig;
+ for (auto add : tier_config_rm) {
+ int r = tconfig.set(add.first, add.second);
+ if (r < 0) {
+ cerr << "ERROR: failed to set configurable: " << add << std::endl;
+ return EINVAL;
+ }
+ }
+ int r = pt->clear_params(tconfig);
+ if (r < 0) {
+ cerr << "ERROR: failed to update tier_config options"<< std::endl;
+ }
+ }
+
+ target.tier_targets.emplace(std::make_pair(storage_class, *pt));
+ }
+
+ if (zonegroup.default_placement.empty()) {
+ zonegroup.default_placement.init(rule.name, RGW_STORAGE_CLASS_STANDARD);
+ }
+ } else if (opt_cmd == OPT::ZONEGROUP_PLACEMENT_RM) {
+ if (!opt_storage_class || opt_storage_class->empty()) {
+ zonegroup.placement_targets.erase(placement_id);
+ if (zonegroup.default_placement.name == placement_id) {
+ // clear default placement
+ zonegroup.default_placement.clear();
+ }
+ } else {
+ auto iter = zonegroup.placement_targets.find(placement_id);
+ if (iter != zonegroup.placement_targets.end()) {
+ RGWZoneGroupPlacementTarget& info = zonegroup.placement_targets[placement_id];
+ info.storage_classes.erase(*opt_storage_class);
+
+ if (zonegroup.default_placement == rule) {
+ // clear default storage class
+ zonegroup.default_placement.storage_class.clear();
+ }
+
+ auto ptiter = info.tier_targets.find(*opt_storage_class);
+ if (ptiter != info.tier_targets.end()) {
+ info.tier_targets.erase(ptiter);
+ }
+ }
+ }
+ } else if (opt_cmd == OPT::ZONEGROUP_PLACEMENT_DEFAULT) {
+ if (!zonegroup.placement_targets.count(placement_id)) {
+ cerr << "failed to find a zonegroup placement target named '"
+ << placement_id << "'" << std::endl;
+ return -ENOENT;
+ }
+ zonegroup.default_placement = rule;
+ }
+
+ ret = writer->write(dpp(), null_yield, zonegroup);
+ if (ret < 0) {
+ cerr << "failed to update zonegroup: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ encode_json("placement_targets", zonegroup.placement_targets, formatter.get());
+ formatter->flush(cout);
+ }
+ break;
  case OPT::ZONE_CREATE:
    {
      // 'zone create' requires an explicit zone name
      if (zone_name.empty()) {
        cerr << "zone name not provided" << std::endl;
        return EINVAL;
      }

      RGWZoneGroup zonegroup;
      std::unique_ptr<rgw::sal::ZoneGroupWriter> zonegroup_writer;
      /* if the user didn't provide zonegroup info , create stand alone zone */
      if (!zonegroup_id.empty() || !zonegroup_name.empty()) {
        int ret = rgw::read_zonegroup(dpp(), null_yield, cfgstore.get(),
                                      zonegroup_id, zonegroup_name,
                                      zonegroup, &zonegroup_writer);
        if (ret < 0) {
          cerr << "failed to load zonegroup " << zonegroup_name << ": " << cpp_strerror(-ret) << std::endl;
          return -ret;
        }
        // inherit the zonegroup's realm unless one was given on the command line
        if (realm_id.empty() && realm_name.empty()) {
          realm_id = zonegroup.realm_id;
        }
      }

      // create the local zone params
      RGWZoneParams zone_params;
      zone_params.id = zone_id;
      zone_params.name = zone_name;

      zone_params.system_key.id = access_key;
      zone_params.system_key.key = secret_key;
      zone_params.realm_id = realm_id;
      // apply any --tier-config-add key/value pairs
      for (const auto& a : tier_config_add) {
        int r = zone_params.tier_config.set(a.first, a.second);
        if (r < 0) {
          cerr << "ERROR: failed to set configurable: " << a << std::endl;
          return EINVAL;
        }
      }

      if (zone_params.realm_id.empty()) {
        // fall back to the realm lookup; ENOENT is tolerated and simply
        // leaves realm_id empty (stand-alone zone)
        RGWRealm realm;
        int ret = rgw::read_realm(dpp(), null_yield, cfgstore.get(),
                                  realm_id, realm_name, realm);
        if (ret < 0 && ret != -ENOENT) {
          cerr << "failed to load realm: " << cpp_strerror(-ret) << std::endl;
          return -ret;
        }
        zone_params.realm_id = realm.id;
        cerr << "NOTICE: set zone's realm_id=" << realm.id << std::endl;
      }

      // exclusive: fail if a zone with this name already exists
      constexpr bool exclusive = true;
      int ret = rgw::create_zone(dpp(), null_yield, cfgstore.get(),
                                 exclusive, zone_params);
      if (ret < 0) {
        cerr << "failed to create zone " << zone_name << ": " << cpp_strerror(-ret) << std::endl;
        return -ret;
      }

      // if a zonegroup was loaded above, register the new zone in it
      if (zonegroup_writer) {
        // only forward the flags the user explicitly set on the command line
        const bool *pis_master = (is_master_set ? &is_master : nullptr);
        const bool *pread_only = (is_read_only_set ? &read_only : nullptr);
        const bool *psync_from_all = (sync_from_all_specified ? &sync_from_all : nullptr);
        const string *predirect_zone = (redirect_zone_set ? &redirect_zone : nullptr);

        // validate --tier-type if specified
        const string *ptier_type = (tier_type_specified ? &tier_type : nullptr);
        if (ptier_type) {
          auto sync_mgr = static_cast<rgw::sal::RadosStore*>(driver)->svc()->sync_modules->get_manager();
          if (!sync_mgr->get_module(*ptier_type, nullptr)) {
            ldpp_dout(dpp(), -1) << "ERROR: could not find sync module: "
                << *ptier_type << ", valid sync modules: "
                << sync_mgr->get_registered_module_names() << dendl;
            return EINVAL;
          }
        }

        if (enable_features.empty()) { // enable all features by default
          enable_features.insert(rgw::zone_features::supported.begin(),
                                 rgw::zone_features::supported.end());
        }

        // add/update the public zone information stored in the zonegroup
        ret = rgw::add_zone_to_group(dpp(), zonegroup, zone_params,
                                     pis_master, pread_only, endpoints,
                                     ptier_type, psync_from_all,
                                     sync_from, sync_from_rm,
                                     predirect_zone, bucket_index_max_shards,
                                     enable_features, disable_features);
        if (ret < 0) {
          return -ret;
        }

        // write the updated zonegroup
        ret = zonegroup_writer->write(dpp(), null_yield, zonegroup);
        if (ret < 0) {
          cerr << "failed to add zone " << zone_name << " to zonegroup " << zonegroup.get_name()
              << ": " << cpp_strerror(-ret) << std::endl;
          return -ret;
        }
      }

      if (set_default) {
        // best-effort: failure to set the default zone is reported but not fatal
        ret = rgw::set_default_zone(dpp(), null_yield, cfgstore.get(),
                                    zone_params);
        if (ret < 0) {
          cerr << "failed to set zone " << zone_name << " as default: " << cpp_strerror(-ret) << std::endl;
        }
      }

      encode_json("zone", zone_params, formatter.get());
      formatter->flush(cout);
    }
    break;
  case OPT::ZONE_DEFAULT:
    {
      // requires an existing zone, identified by name or id
      if (zone_id.empty() && zone_name.empty()) {
        cerr << "no zone name or id provided" << std::endl;
        return EINVAL;
      }
      RGWZoneParams zone_params;
      int ret = rgw::read_zone(dpp(), null_yield, cfgstore.get(),
                               zone_id, zone_name, zone_params);
      if (ret < 0) {
        cerr << "unable to load zone: " << cpp_strerror(-ret) << std::endl;
        return -ret;
      }

      // mark the loaded zone as the default zone in the config store
      ret = rgw::set_default_zone(dpp(), null_yield, cfgstore.get(),
                                  zone_params);
      if (ret < 0) {
        cerr << "failed to set zone as default: " << cpp_strerror(-ret) << std::endl;
        return -ret;
      }
    }
    break;
  case OPT::ZONE_DELETE:
    {
      // requires an existing zone, identified by name or id
      if (zone_id.empty() && zone_name.empty()) {
        cerr << "no zone name or id provided" << std::endl;
        return EINVAL;
      }
      RGWZoneParams zone_params;
      std::unique_ptr<rgw::sal::ZoneWriter> writer;
      int ret = rgw::read_zone(dpp(), null_yield, cfgstore.get(),
                               zone_id, zone_name, zone_params, &writer);
      if (ret < 0) {
        cerr << "failed to load zone: " << cpp_strerror(-ret) << std::endl;
        return -ret;
      }

      // delete through the writer obtained from the matching read
      ret = rgw::delete_zone(dpp(), null_yield, cfgstore.get(),
                             zone_params, *writer);
      if (ret < 0) {
        cerr << "failed to delete zone " << zone_params.get_name()
            << ": " << cpp_strerror(-ret) << std::endl;
        return -ret;
      }
    }
    break;
  case OPT::ZONE_GET:
    {
      // load the zone (or the default zone when no name/id is given)
      // and print it as json
      RGWZoneParams zone_params;
      int ret = rgw::read_zone(dpp(), null_yield, cfgstore.get(),
                               zone_id, zone_name, zone_params);
      if (ret < 0) {
        cerr << "failed to load zone: " << cpp_strerror(-ret) << std::endl;
        return -ret;
      }
      encode_json("zone", zone_params, formatter.get());
      formatter->flush(cout);
    }
    break;
  case OPT::ZONE_SET:
    {
      RGWZoneParams zone;
      std::unique_ptr<rgw::sal::ZoneWriter> writer;
      int ret = rgw::read_zone(dpp(), null_yield, cfgstore.get(),
                               zone_id, zone_name, zone, &writer);
      // ENOENT is tolerated: 'zone set' may create the zone from the json
      // NOTE(review): other branches pass -ret to cpp_strerror here;
      // presumably cpp_strerror normalizes the sign — confirm
      if (ret < 0 && ret != -ENOENT) {
        cerr << "failed to load zone: " << cpp_strerror(ret) << std::endl;
        return -ret;
      }

      string orig_id = zone.get_id();

      // overwrite the in-memory zone with the json supplied via --infile;
      // the helper is assumed to report its own parse errors
      ret = read_decode_json(infile, zone);
      if (ret < 0) {
        return 1;
      }

      if (zone.realm_id.empty()) {
        // json carried no realm: fall back to realm lookup (ENOENT tolerated)
        RGWRealm realm;
        ret = rgw::read_realm(dpp(), null_yield, cfgstore.get(),
                              realm_id, realm_name, realm);
        if (ret < 0 && ret != -ENOENT) {
          cerr << "failed to load realm: " << cpp_strerror(-ret) << std::endl;
          return -ret;
        }
        zone.realm_id = realm.get_id();
        cerr << "NOTICE: set zone's realm_id=" << zone.realm_id << std::endl;
      }

      // --rgw-zone (if given) must agree with the name inside the json
      if (!zone_name.empty() && !zone.get_name().empty() && zone.get_name() != zone_name) {
        cerr << "Error: zone name " << zone_name << " is different than the zone name " << zone.get_name() << " in the provided json " << std::endl;
        return EINVAL;
      }

      if (zone.get_name().empty()) {
        zone.set_name(zone_name);
        if (zone.get_name().empty()) {
          cerr << "no zone name specified" << std::endl;
          return EINVAL;
        }
      }

      zone_name = zone.get_name();

      // keep the pre-existing id when the json carries none
      if (zone.get_id().empty()) {
        zone.set_id(orig_id);
      }

      // non-exclusive: overwrites an existing zone of the same name
      constexpr bool exclusive = false;
      ret = rgw::create_zone(dpp(), null_yield, cfgstore.get(),
                             exclusive, zone);
      if (ret < 0) {
        cerr << "ERROR: couldn't create zone: " << cpp_strerror(-ret) << std::endl;
        return -ret;
      }

      if (set_default) {
        // best-effort: failure is reported but not fatal
        ret = rgw::set_default_zone(dpp(), null_yield, cfgstore.get(), zone);
        if (ret < 0) {
          cerr << "failed to set zone " << zone_name << " as default: " << cpp_strerror(-ret) << std::endl;
        }
      }

      encode_json("zone", zone, formatter.get());
      formatter->flush(cout);
    }
    break;
  case OPT::ZONE_LIST:
    {
      // best-effort lookup of the default zone for the 'default_info' field;
      // a missing default is not an error for listing
      RGWZoneParams default_zone_params;
      int ret = rgw::read_zone(dpp(), null_yield, cfgstore.get(),
                               {}, {}, default_zone_params);
      if (ret < 0 && ret != -ENOENT) {
        cerr << "could not determine default zone: " << cpp_strerror(-ret) << std::endl;
      }

      // RAII formatter sections; both close when this scope ends
      Formatter::ObjectSection zones_list{*formatter, "zones_list"};
      encode_json("default_info", default_zone_params.id, formatter.get());

      Formatter::ArraySection zones{*formatter, "zones"};
      rgw::sal::ListResult<std::string> listing;
      std::array<std::string, 1000> names; // list in pages of 1000
      do {
        // listing.next carries the pagination cursor between iterations
        ret = cfgstore->list_zone_names(dpp(), null_yield, listing.next,
                                        names, listing);
        if (ret < 0) {
          std::cerr << "failed to list zones: " << cpp_strerror(-ret) << std::endl;
          return -ret;
        }
        for (const auto& name : listing.entries) {
          encode_json("name", name, formatter.get());
        }
      } while (!listing.next.empty());
    } // close sections zones and zones_list
    formatter->flush(cout);
    break;
  case OPT::ZONE_MODIFY:
    {
      RGWZoneParams zone_params;
      std::unique_ptr<rgw::sal::ZoneWriter> zone_writer;
      int ret = rgw::read_zone(dpp(), null_yield, cfgstore.get(),
                               zone_id, zone_name, zone_params, &zone_writer);
      if (ret < 0) {
        cerr << "failed to load zone: " << cpp_strerror(-ret) << std::endl;
        return -ret;
      }

      // track whether any zone-params field actually changed, so the zone
      // object is only rewritten when necessary
      bool need_zone_update = false;
      if (!access_key.empty()) {
        zone_params.system_key.id = access_key;
        need_zone_update = true;
      }

      if (!secret_key.empty()) {
        zone_params.system_key.key = secret_key;
        need_zone_update = true;
      }

      if (!realm_id.empty()) {
        zone_params.realm_id = realm_id;
        need_zone_update = true;
      } else if (!realm_name.empty()) {
        // get realm id from name
        ret = cfgstore->read_realm_id(dpp(), null_yield,
                                      realm_name, zone_params.realm_id);
        if (ret < 0) {
          cerr << "failed to find realm by name " << realm_name << std::endl;
          return -ret;
        }
        need_zone_update = true;
      }

      // apply --tier-config-add before --tier-config-rm
      for (const auto& add : tier_config_add) {
        ret = zone_params.tier_config.set(add.first, add.second);
        if (ret < 0) {
          cerr << "ERROR: failed to set configurable: " << add << std::endl;
          return EINVAL;
        }
        need_zone_update = true;
      }

      for (const auto& rm : tier_config_rm) {
        if (!rm.first.empty()) { /* otherwise will remove the entire config */
          zone_params.tier_config.erase(rm.first);
          need_zone_update = true;
        }
      }

      if (need_zone_update) {
        ret = zone_writer->write(dpp(), null_yield, zone_params);
        if (ret < 0) {
          cerr << "failed to save zone info: " << cpp_strerror(-ret) << std::endl;
          return -ret;
        }
      }

      // the zone's public entry in its zonegroup is updated unconditionally
      RGWZoneGroup zonegroup;
      std::unique_ptr<rgw::sal::ZoneGroupWriter> zonegroup_writer;
      ret = rgw::read_zonegroup(dpp(), null_yield, cfgstore.get(),
                                zonegroup_id, zonegroup_name,
                                zonegroup, &zonegroup_writer);
      if (ret < 0) {
        cerr << "failed to load zonegroup: " << cpp_strerror(-ret) << std::endl;
        return -ret;
      }

      // only forward the flags the user explicitly set on the command line
      const bool *pis_master = (is_master_set ? &is_master : nullptr);
      const bool *pread_only = (is_read_only_set ? &read_only : nullptr);
      const bool *psync_from_all = (sync_from_all_specified ? &sync_from_all : nullptr);
      const string *predirect_zone = (redirect_zone_set ? &redirect_zone : nullptr);

      // validate --tier-type if specified
      const string *ptier_type = (tier_type_specified ? &tier_type : nullptr);
      if (ptier_type) {
        auto sync_mgr = static_cast<rgw::sal::RadosStore*>(driver)->svc()->sync_modules->get_manager();
        if (!sync_mgr->get_module(*ptier_type, nullptr)) {
          ldpp_dout(dpp(), -1) << "ERROR: could not find sync module: "
              << *ptier_type << ", valid sync modules: "
              << sync_mgr->get_registered_module_names() << dendl;
          return EINVAL;
        }
      }

      if (enable_features.empty()) { // enable all features by default
        enable_features.insert(rgw::zone_features::supported.begin(),
                               rgw::zone_features::supported.end());
      }

      // add/update the public zone information stored in the zonegroup
      ret = rgw::add_zone_to_group(dpp(), zonegroup, zone_params,
                                   pis_master, pread_only, endpoints,
                                   ptier_type, psync_from_all,
                                   sync_from, sync_from_rm,
                                   predirect_zone, bucket_index_max_shards,
                                   enable_features, disable_features);
      if (ret < 0) {
        return -ret;
      }

      // write the updated zonegroup
      ret = zonegroup_writer->write(dpp(), null_yield, zonegroup);
      if (ret < 0) {
        cerr << "failed to update zonegroup: " << cpp_strerror(-ret) << std::endl;
        return -ret;
      }

      if (set_default) {
        // best-effort: failure to set the default zone is reported but not fatal
        ret = rgw::set_default_zone(dpp(), null_yield, cfgstore.get(),
                                    zone_params);
        if (ret < 0) {
          cerr << "failed to set zone " << zone_name << " as default: " << cpp_strerror(-ret) << std::endl;
        }
      }

      encode_json("zone", zone_params, formatter.get());
      formatter->flush(cout);
    }
    break;
  case OPT::ZONE_RENAME:
    {
      if (zone_new_name.empty()) {
        cerr << " missing zone new name" << std::endl;
        return EINVAL;
      }
      if (zone_id.empty() && zone_name.empty()) {
        cerr << "no zone name or id provided" << std::endl;
        return EINVAL;
      }

      RGWZoneParams zone_params;
      std::unique_ptr<rgw::sal::ZoneWriter> zone_writer;
      int ret = rgw::read_zone(dpp(), null_yield, cfgstore.get(),
                               zone_id, zone_name, zone_params, &zone_writer);
      if (ret < 0) {
        cerr << "failed to load zone: " << cpp_strerror(-ret) << std::endl;
        return -ret;
      }

      ret = zone_writer->rename(dpp(), null_yield, zone_params, zone_new_name);
      if (ret < 0) {
        cerr << "failed to rename zone " << zone_name << " to " << zone_new_name << ": " << cpp_strerror(-ret)
            << std::endl;
        return -ret;
      }

      // propagate the new name into the zonegroup's copy of the zone;
      // failure to load the zonegroup is a warning only, since the zone
      // itself has already been renamed successfully
      RGWZoneGroup zonegroup;
      std::unique_ptr<rgw::sal::ZoneGroupWriter> zonegroup_writer;
      ret = rgw::read_zonegroup(dpp(), null_yield, cfgstore.get(),
                                zonegroup_id, zonegroup_name,
                                zonegroup, &zonegroup_writer);
      if (ret < 0) {
        cerr << "WARNING: failed to load zonegroup " << zonegroup_name << std::endl;
        return EXIT_SUCCESS;
      }

      auto z = zonegroup.zones.find(zone_params.id);
      if (z == zonegroup.zones.end()) {
        // the zone is not a member of this zonegroup; nothing more to do
        return EXIT_SUCCESS;
      }
      z->second.name = zone_params.name;

      ret = zonegroup_writer->write(dpp(), null_yield, zonegroup);
      if (ret < 0) {
        cerr << "Error in zonegroup rename for " << zone_name << ": " << cpp_strerror(-ret) << std::endl;
        return -ret;
      }
    }
    break;
+ case OPT::ZONE_PLACEMENT_ADD:
+ case OPT::ZONE_PLACEMENT_MODIFY:
+ case OPT::ZONE_PLACEMENT_RM:
+ {
+ if (placement_id.empty()) {
+ cerr << "ERROR: --placement-id not specified" << std::endl;
+ return EINVAL;
+ }
+ // validate compression type
+ if (compression_type && *compression_type != "random"
+ && !Compressor::get_comp_alg_type(*compression_type)) {
+ std::cerr << "Unrecognized compression type" << std::endl;
+ return EINVAL;
+ }
+
+ RGWZoneParams zone;
+ std::unique_ptr<rgw::sal::ZoneWriter> writer;
+ int ret = rgw::read_zone(dpp(), null_yield, cfgstore.get(),
+ zone_id, zone_name, zone, &writer);
+ if (ret < 0) {
+ cerr << "failed to init zone: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ if (opt_cmd == OPT::ZONE_PLACEMENT_ADD ||
+ opt_cmd == OPT::ZONE_PLACEMENT_MODIFY) {
+ RGWZoneGroup zonegroup;
+ ret = rgw::read_zonegroup(dpp(), null_yield, cfgstore.get(),
+ zonegroup_id, zonegroup_name, zonegroup);
+ if (ret < 0) {
+ cerr << "failed to init zonegroup: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ auto ptiter = zonegroup.placement_targets.find(placement_id);
+ if (ptiter == zonegroup.placement_targets.end()) {
+ cerr << "ERROR: placement id '" << placement_id << "' is not configured in zonegroup placement targets" << std::endl;
+ return EINVAL;
+ }
+
+ string storage_class = rgw_placement_rule::get_canonical_storage_class(opt_storage_class.value_or(string()));
+ if (ptiter->second.storage_classes.find(storage_class) == ptiter->second.storage_classes.end()) {
+ cerr << "ERROR: storage class '" << storage_class << "' is not defined in zonegroup '" << placement_id << "' placement target" << std::endl;
+ return EINVAL;
+ }
+ if (ptiter->second.tier_targets.find(storage_class) != ptiter->second.tier_targets.end()) {
+ cerr << "ERROR: storage class '" << storage_class << "' is of tier type in zonegroup '" << placement_id << "' placement target" << std::endl;
+ return EINVAL;
+ }
+
+ RGWZonePlacementInfo& info = zone.placement_pools[placement_id];
+
+ string opt_index_pool = index_pool.value_or(string());
+ string opt_data_pool = data_pool.value_or(string());
+
+ if (!opt_index_pool.empty()) {
+ info.index_pool = opt_index_pool;
+ }
+
+ if (info.index_pool.empty()) {
+ cerr << "ERROR: index pool not configured, need to specify --index-pool" << std::endl;
+ return EINVAL;
+ }
+
+ if (opt_data_pool.empty()) {
+ const RGWZoneStorageClass *porig_sc{nullptr};
+ if (info.storage_classes.find(storage_class, &porig_sc)) {
+ if (porig_sc->data_pool) {
+ opt_data_pool = porig_sc->data_pool->to_str();
+ }
+ }
+ if (opt_data_pool.empty()) {
+ cerr << "ERROR: data pool not configured, need to specify --data-pool" << std::endl;
+ return EINVAL;
+ }
+ }
+
+ rgw_pool dp = opt_data_pool;
+ info.storage_classes.set_storage_class(storage_class, &dp, compression_type.get_ptr());
+
+ if (data_extra_pool) {
+ info.data_extra_pool = *data_extra_pool;
+ }
+ if (index_type_specified) {
+ info.index_type = placement_index_type;
+ }
+ if (placement_inline_data_specified) {
+ info.inline_data = placement_inline_data;
+ }
+
+ ret = check_pool_support_omap(info.get_data_extra_pool());
+ if (ret < 0) {
+ cerr << "ERROR: the data extra (non-ec) pool '" << info.get_data_extra_pool()
+ << "' does not support omap" << std::endl;
+ return ret;
+ }
+ } else if (opt_cmd == OPT::ZONE_PLACEMENT_RM) {
+ if (!opt_storage_class ||
+ opt_storage_class->empty()) {
+ zone.placement_pools.erase(placement_id);
+ } else {
+ auto iter = zone.placement_pools.find(placement_id);
+ if (iter != zone.placement_pools.end()) {
+ RGWZonePlacementInfo& info = zone.placement_pools[placement_id];
+ info.storage_classes.remove_storage_class(*opt_storage_class);
+ }
+ }
+ }
+
+ ret = writer->write(dpp(), null_yield, zone);
+ if (ret < 0) {
+ cerr << "failed to save zone info: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ encode_json("zone", zone, formatter.get());
+ formatter->flush(cout);
+ }
+ break;
+ case OPT::ZONE_PLACEMENT_LIST:
+ {
+ RGWZoneParams zone;
+ int ret = rgw::read_zone(dpp(), null_yield, cfgstore.get(),
+ zone_id, zone_name, zone);
+ if (ret < 0) {
+ cerr << "unable to initialize zone: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ encode_json("placement_pools", zone.placement_pools, formatter.get());
+ formatter->flush(cout);
+ }
+ break;
+ case OPT::ZONE_PLACEMENT_GET:
+ {
+ if (placement_id.empty()) {
+ cerr << "ERROR: --placement-id not specified" << std::endl;
+ return EINVAL;
+ }
+
+ RGWZoneParams zone;
+ int ret = rgw::read_zone(dpp(), null_yield, cfgstore.get(),
+ zone_id, zone_name, zone);
+ if (ret < 0) {
+ cerr << "unable to initialize zone: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ auto p = zone.placement_pools.find(placement_id);
+ if (p == zone.placement_pools.end()) {
+ cerr << "ERROR: zone placement target '" << placement_id << "' not found" << std::endl;
+ return ENOENT;
+ }
+ encode_json("placement_pools", p->second, formatter.get());
+ formatter->flush(cout);
+ }
  default:
    break;
  }
  // all zone/zonegroup/placement commands were handled above
  return 0;
  }
+
  // translate any zone names given on the command line into zone ids
  resolve_zone_id_opt(opt_effective_zone_name, opt_effective_zone_id);
  resolve_zone_id_opt(opt_source_zone_name, opt_source_zone_id);
  resolve_zone_id_opt(opt_dest_zone_name, opt_dest_zone_id);
  resolve_zone_ids_opt(opt_zone_names, opt_zone_ids);
  resolve_zone_ids_opt(opt_source_zone_names, opt_source_zone_ids);
  resolve_zone_ids_opt(opt_dest_zone_names, opt_dest_zone_ids);

  // metadata-mutating commands are expected to run on the metadata master
  // zone; --yes-i-really-mean-it bypasses the guard below
  bool non_master_cmd = (!driver->is_meta_master() && !yes_i_really_mean_it);
  std::set<OPT> non_master_ops_list = {OPT::USER_CREATE, OPT::USER_RM,
                                       OPT::USER_MODIFY, OPT::USER_ENABLE,
                                       OPT::USER_SUSPEND, OPT::SUBUSER_CREATE,
                                       OPT::SUBUSER_MODIFY, OPT::SUBUSER_RM,
                                       OPT::BUCKET_LINK, OPT::BUCKET_UNLINK,
                                       OPT::BUCKET_RM,
                                       OPT::BUCKET_CHOWN, OPT::METADATA_PUT,
                                       OPT::METADATA_RM, OPT::MFA_CREATE,
                                       OPT::MFA_REMOVE, OPT::MFA_RESYNC,
                                       OPT::CAPS_ADD, OPT::CAPS_RM,
                                       OPT::ROLE_CREATE, OPT::ROLE_DELETE,
                                       OPT::ROLE_POLICY_PUT, OPT::ROLE_POLICY_DELETE};

  bool print_warning_message = (non_master_ops_list.find(opt_cmd) != non_master_ops_list.end() &&
                                non_master_cmd);

  if (print_warning_message) {
    cerr << "Please run the command on master zone. Performing this operation on non-master zone leads to inconsistent metadata between zones" << std::endl;
    cerr << "Are you sure you want to go ahead? (requires --yes-i-really-mean-it)" << std::endl;
    return EINVAL;
  }

  // copy command-line options into the user operation state; only options
  // that were actually supplied are forwarded
  if (!rgw::sal::User::empty(user)) {
    user_op.set_user_id(user->get_id());
    bucket_op.set_user_id(user->get_id());
  }

  if (!display_name.empty())
    user_op.set_display_name(display_name);

  if (!user_email.empty())
    user_op.set_user_email(user_email);

  if (!rgw::sal::User::empty(user)) {
    user_op.set_new_user_id(new_user_id);
  }

  if (!access_key.empty())
    user_op.set_access_key(access_key);

  if (!secret_key.empty())
    user_op.set_secret_key(secret_key);

  if (!subuser.empty())
    user_op.set_subuser(subuser);

  if (!caps.empty())
    user_op.set_caps(caps);

  user_op.set_purge_data(purge_data);

  if (purge_keys)
    user_op.set_purge_keys();

  if (gen_access_key)
    user_op.set_generate_key();

  if (gen_secret_key)
    user_op.set_gen_secret(); // assume that a key pair should be created

  if (max_buckets_specified)
    user_op.set_max_buckets(max_buckets);

  if (admin_specified)
    user_op.set_admin(admin);

  if (system_specified)
    user_op.set_system(system);

  if (set_perm)
    user_op.set_perm(perm_mask);

  if (set_temp_url_key) {
    // forward every --temp-url-key index/value pair
    map<int, string>::iterator iter = temp_url_keys.begin();
    for (; iter != temp_url_keys.end(); ++iter) {
      user_op.set_temp_url_key(iter->second, iter->first);
    }
  }

  if (!op_mask_str.empty()) {
    uint32_t op_mask;
    int ret = rgw_parse_op_type_list(op_mask_str, &op_mask);
    if (ret < 0) {
      cerr << "failed to parse op_mask: " << cpp_strerror(-ret) << std::endl;
      return -ret;
    }

    user_op.set_op_mask(op_mask);
  }

  if (key_type != KEY_TYPE_UNDEFINED)
    user_op.set_key_type(key_type);

  // set suspension operation parameters
  if (opt_cmd == OPT::USER_ENABLE)
    user_op.set_suspension(false);
  else if (opt_cmd == OPT::USER_SUSPEND)
    user_op.set_suspension(true);

  // validate --placement-id/--storage-class against the driver before use
  if (!placement_id.empty()) {
    rgw_placement_rule target_rule;
    target_rule.name = placement_id;
    target_rule.storage_class = opt_storage_class.value_or("");
    if (!driver->valid_placement(target_rule)) {
      cerr << "NOTICE: invalid dest placement: " << target_rule.to_str() << std::endl;
      return EINVAL;
    }
    user_op.set_default_placement(target_rule);
  }

  if (!tags.empty()) {
    user_op.set_placement_tags(tags);
  }

  // RGWUser to use for user operations
  RGWUser ruser;
  int ret = 0;
  // only initialize when some user-identifying option was actually given
  if (!(rgw::sal::User::empty(user) && access_key.empty()) || !subuser.empty()) {
    ret = ruser.init(dpp(), driver, user_op, null_yield);
    if (ret < 0) {
      cerr << "user.init failed: " << cpp_strerror(-ret) << std::endl;
      return -ret;
    }
  }

  /* populate bucket operation */
  bucket_op.set_bucket_name(bucket_name);
  bucket_op.set_object(object);
  bucket_op.set_check_objects(check_objects);
  bucket_op.set_delete_children(delete_child_objects);
  bucket_op.set_fix_index(fix);
  bucket_op.set_max_aio(max_concurrent_ios);
  bucket_op.set_min_age(min_age);
  bucket_op.set_dump_keys(dump_keys);
  bucket_op.set_hide_progress(hide_progress);

  // required to gather errors from operations
  std::string err_msg;

  // some commands (e.g. user rm) have nothing to print afterwards
  bool output_user_info = true;
+
  // user / subuser / caps / key administration
  switch (opt_cmd) {
  case OPT::USER_INFO:
    // lookup only; requires some way to identify the user
    if (rgw::sal::User::empty(user) && access_key.empty()) {
      cerr << "ERROR: --uid or --access-key required" << std::endl;
      return EINVAL;
    }
    break;
  case OPT::USER_CREATE:
    if (!user_op.has_existing_user()) {
      user_op.set_generate_key(); // generate a new key by default
    }
    ret = ruser.add(dpp(), user_op, null_yield, &err_msg);
    if (ret < 0) {
      cerr << "could not create user: " << err_msg << std::endl;
      // map tenant-name validation failures onto plain EINVAL
      if (ret == -ERR_INVALID_TENANT_NAME)
        ret = -EINVAL;

      return -ret;
    }
    if (!subuser.empty()) {
      // optionally create an initial subuser in the same invocation
      ret = ruser.subusers.add(dpp(),user_op, null_yield, &err_msg);
      if (ret < 0) {
        cerr << "could not create subuser: " << err_msg << std::endl;
        return -ret;
      }
    }
    break;
  case OPT::USER_RM:
    ret = ruser.remove(dpp(), user_op, null_yield, &err_msg);
    if (ret < 0) {
      cerr << "could not remove user: " << err_msg << std::endl;
      return -ret;
    }

    // nothing sensible to print for a removed user
    output_user_info = false;
    break;
  case OPT::USER_RENAME:
    if (yes_i_really_mean_it) {
      // allow clobbering an existing user of the target name
      user_op.set_overwrite_new_user(true);
    }
    ret = ruser.rename(user_op, null_yield, dpp(), &err_msg);
    if (ret < 0) {
      if (ret == -EEXIST) {
        err_msg += ". to overwrite this user, add --yes-i-really-mean-it";
      }
      cerr << "could not rename user: " << err_msg << std::endl;
      return -ret;
    }

    break;
  case OPT::USER_ENABLE:
  case OPT::USER_SUSPEND:
  case OPT::USER_MODIFY:
    // enable/suspend were encoded into user_op.set_suspension() above
    ret = ruser.modify(dpp(), user_op, null_yield, &err_msg);
    if (ret < 0) {
      cerr << "could not modify user: " << err_msg << std::endl;
      return -ret;
    }

    break;
  case OPT::SUBUSER_CREATE:
    ret = ruser.subusers.add(dpp(), user_op, null_yield, &err_msg);
    if (ret < 0) {
      cerr << "could not create subuser: " << err_msg << std::endl;
      return -ret;
    }

    break;
  case OPT::SUBUSER_MODIFY:
    ret = ruser.subusers.modify(dpp(), user_op, null_yield, &err_msg);
    if (ret < 0) {
      cerr << "could not modify subuser: " << err_msg << std::endl;
      return -ret;
    }

    break;
  case OPT::SUBUSER_RM:
    ret = ruser.subusers.remove(dpp(), user_op, null_yield, &err_msg);
    if (ret < 0) {
      cerr << "could not remove subuser: " << err_msg << std::endl;
      return -ret;
    }

    break;
  case OPT::CAPS_ADD:
    ret = ruser.caps.add(dpp(), user_op, null_yield, &err_msg);
    if (ret < 0) {
      cerr << "could not add caps: " << err_msg << std::endl;
      return -ret;
    }

    break;
  case OPT::CAPS_RM:
    ret = ruser.caps.remove(dpp(), user_op, null_yield, &err_msg);
    if (ret < 0) {
      cerr << "could not remove caps: " << err_msg << std::endl;
      return -ret;
    }

    break;
  case OPT::KEY_CREATE:
    ret = ruser.keys.add(dpp(), user_op, null_yield, &err_msg);
    if (ret < 0) {
      cerr << "could not create key: " << err_msg << std::endl;
      return -ret;
    }

    break;
  case OPT::KEY_RM:
    ret = ruser.keys.remove(dpp(), user_op, null_yield, &err_msg);
    if (ret < 0) {
      cerr << "could not remove key: " << err_msg << std::endl;
      return -ret;
    }
    break;
  case OPT::PERIOD_PUSH:
    {
      // build a POST /admin/realm/period request carrying the local period
      RGWEnv env;
      req_info info(g_ceph_context, &env);
      info.method = "POST";
      info.request_uri = "/admin/realm/period";

      map<string, string> &params = info.args.get_params();
      if (!realm_id.empty())
        params["realm_id"] = realm_id;
      if (!realm_name.empty())
        params["realm_name"] = realm_name;
      if (!period_id.empty())
        params["period_id"] = period_id;
      if (!period_epoch.empty())
        params["epoch"] = period_epoch;

      // load the period
      RGWPeriod period;
      int ret = cfgstore->read_period(dpp(), null_yield, period_id,
                                      std::nullopt, period);
      if (ret < 0) {
        cerr << "failed to load period: " << cpp_strerror(-ret) << std::endl;
        return -ret;
      }
      // json format into a bufferlist
      JSONFormatter jf(false);
      encode_json("period", period, &jf);
      bufferlist bl;
      jf.flush(bl);

      // send to the remote endpoint (or explicit --url); response is parsed
      // into p but otherwise unused
      JSONParser p;
      ret = send_to_remote_or_url(nullptr, url, opt_region,
                                  access_key, secret_key,
                                  info, bl, p);
      if (ret < 0) {
        cerr << "request failed: " << cpp_strerror(-ret) << std::endl;
        return -ret;
      }
    }
    return 0;
  case OPT::PERIOD_UPDATE:
    {
      // helper performs the whole update (and optional --commit)
      int ret = update_period(cfgstore.get(), realm_id, realm_name,
                              period_epoch, commit, remote, url,
                              opt_region, access_key, secret_key,
                              formatter.get(), yes_i_really_mean_it);
      if (ret < 0) {
        return -ret;
      }
    }
    return 0;
  case OPT::PERIOD_COMMIT:
    {
      // read realm and staging period
      RGWRealm realm;
      std::unique_ptr<rgw::sal::RealmWriter> realm_writer;
      int ret = rgw::read_realm(dpp(), null_yield, cfgstore.get(),
                                realm_id, realm_name,
                                realm, &realm_writer);
      if (ret < 0) {
        cerr << "Error initializing realm: " << cpp_strerror(-ret) << std::endl;
        return -ret;
      }
      // the staging period of a realm is read at epoch 1
      period_id = rgw::get_staging_period_id(realm.id);
      epoch_t epoch = 1;

      RGWPeriod period;
      ret = cfgstore->read_period(dpp(), null_yield, period_id, epoch, period);
      if (ret < 0) {
        cerr << "failed to load period: " << cpp_strerror(-ret) << std::endl;
        return -ret;
      }
      ret = commit_period(cfgstore.get(), realm, *realm_writer, period,
                          remote, url, opt_region, access_key, secret_key,
                          yes_i_really_mean_it);
      if (ret < 0) {
        cerr << "failed to commit period: " << cpp_strerror(-ret) << std::endl;
        return -ret;
      }

      encode_json("period", period, formatter.get());
      formatter->flush(cout);
    }
    return 0;
  case OPT::ROLE_CREATE:
    {
      if (role_name.empty()) {
        cerr << "ERROR: role name is empty" << std::endl;
        return -EINVAL;
      }

      if (assume_role_doc.empty()) {
        cerr << "ERROR: assume role policy document is empty" << std::endl;
        return -EINVAL;
      }
      // parse only to validate the trust policy; the Policy object itself
      // is discarded and the raw document string is stored on the role
      bufferlist bl = bufferlist::static_from_string(assume_role_doc);
      try {
        const rgw::IAM::Policy p(
          g_ceph_context, tenant, bl,
          g_ceph_context->_conf.get_val<bool>(
            "rgw_policy_reject_invalid_principals"));
      } catch (rgw::IAM::PolicyParseException& e) {
        cerr << "failed to parse policy: " << e.what() << std::endl;
        return -EINVAL;
      }
      std::unique_ptr<rgw::sal::RGWRole> role = driver->get_role(role_name, tenant, path, assume_role_doc);
      ret = role->create(dpp(), true, "", null_yield);
      if (ret < 0) {
        return -ret;
      }
      show_role_info(role.get(), formatter.get());
      return 0;
    }
  case OPT::ROLE_DELETE:
    {
      if (role_name.empty()) {
        cerr << "ERROR: empty role name" << std::endl;
        return -EINVAL;
      }
      std::unique_ptr<rgw::sal::RGWRole> role = driver->get_role(role_name, tenant);
      ret = role->delete_obj(dpp(), null_yield);
      if (ret < 0) {
        return -ret;
      }
      cout << "role: " << role_name << " successfully deleted" << std::endl;
      return 0;
    }
  case OPT::ROLE_GET:
    {
      if (role_name.empty()) {
        cerr << "ERROR: empty role name" << std::endl;
        return -EINVAL;
      }
      std::unique_ptr<rgw::sal::RGWRole> role = driver->get_role(role_name, tenant);
      ret = role->get(dpp(), null_yield);
      if (ret < 0) {
        return -ret;
      }
      show_role_info(role.get(), formatter.get());
      return 0;
    }
  case OPT::ROLE_TRUST_POLICY_MODIFY:
    {
      if (role_name.empty()) {
        cerr << "ERROR: role name is empty" << std::endl;
        return -EINVAL;
      }

      if (assume_role_doc.empty()) {
        cerr << "ERROR: assume role policy document is empty" << std::endl;
        return -EINVAL;
      }

      // validate the replacement trust policy before applying it
      bufferlist bl = bufferlist::static_from_string(assume_role_doc);
      try {
        const rgw::IAM::Policy p(g_ceph_context, tenant, bl,
                                 g_ceph_context->_conf.get_val<bool>(
                                   "rgw_policy_reject_invalid_principals"));
      } catch (rgw::IAM::PolicyParseException& e) {
        cerr << "failed to parse policy: " << e.what() << std::endl;
        return -EINVAL;
      }

      std::unique_ptr<rgw::sal::RGWRole> role = driver->get_role(role_name, tenant);
      ret = role->get(dpp(), null_yield);
      if (ret < 0) {
        return -ret;
      }
      role->update_trust_policy(assume_role_doc);
      ret = role->update(dpp(), null_yield);
      if (ret < 0) {
        return -ret;
      }
      cout << "Assume role policy document updated successfully for role: " << role_name << std::endl;
      return 0;
    }
  case OPT::ROLE_LIST:
    {
      // list roles under --path-prefix (empty prefix lists all)
      vector<std::unique_ptr<rgw::sal::RGWRole>> result;
      ret = driver->get_roles(dpp(), null_yield, path_prefix, tenant, result);
      if (ret < 0) {
        return -ret;
      }
      show_roles_info(result, formatter.get());
      return 0;
    }
  case OPT::ROLE_POLICY_PUT:
    {
      if (role_name.empty()) {
        cerr << "role name is empty" << std::endl;
        return -EINVAL;
      }

      if (policy_name.empty()) {
        cerr << "policy name is empty" << std::endl;
        return -EINVAL;
      }

      if (perm_policy_doc.empty() && infile.empty()) {
        cerr << "permission policy document is empty" << std::endl;
        return -EINVAL;
      }

      // policy document comes either from --infile or from the inline option
      bufferlist bl;
      if (!infile.empty()) {
        int ret = read_input(infile, bl);
        if (ret < 0) {
          cerr << "ERROR: failed to read input policy document: " << cpp_strerror(-ret) << std::endl;
          return -ret;
        }
        perm_policy_doc = bl.to_str();
      } else {
        bl = bufferlist::static_from_string(perm_policy_doc);
      }
      // validate before attaching; the parsed object is discarded
      try {
        const rgw::IAM::Policy p(g_ceph_context, tenant, bl,
                                 g_ceph_context->_conf.get_val<bool>(
                                   "rgw_policy_reject_invalid_principals"));
      } catch (rgw::IAM::PolicyParseException& e) {
        cerr << "failed to parse perm policy: " << e.what() << std::endl;
        return -EINVAL;
      }

      std::unique_ptr<rgw::sal::RGWRole> role = driver->get_role(role_name, tenant);
      ret = role->get(dpp(), null_yield);
      if (ret < 0) {
        return -ret;
      }
      role->set_perm_policy(policy_name, perm_policy_doc);
      ret = role->update(dpp(), null_yield);
      if (ret < 0) {
        return -ret;
      }
      cout << "Permission policy attached successfully" << std::endl;
      return 0;
    }
  case OPT::ROLE_POLICY_LIST:
    {
      if (role_name.empty()) {
        cerr << "ERROR: Role name is empty" << std::endl;
        return -EINVAL;
      }
      std::unique_ptr<rgw::sal::RGWRole> role = driver->get_role(role_name, tenant);
      ret = role->get(dpp(), null_yield);
      if (ret < 0) {
        return -ret;
      }
      std::vector<string> policy_names = role->get_role_policy_names();
      show_policy_names(policy_names, formatter.get());
      return 0;
    }
  case OPT::ROLE_POLICY_GET:
    {
      if (role_name.empty()) {
        cerr << "ERROR: role name is empty" << std::endl;
        return -EINVAL;
      }

      if (policy_name.empty()) {
        cerr << "ERROR: policy name is empty" << std::endl;
        return -EINVAL;
      }
      std::unique_ptr<rgw::sal::RGWRole> role = driver->get_role(role_name, tenant);
      int ret = role->get(dpp(), null_yield);
      if (ret < 0) {
        return -ret;
      }
      string perm_policy;
      ret = role->get_role_policy(dpp(), policy_name, perm_policy);
      if (ret < 0) {
        return -ret;
      }
      show_perm_policy(perm_policy, formatter.get());
      return 0;
    }
  case OPT::ROLE_POLICY_DELETE:
    {
      if (role_name.empty()) {
        cerr << "ERROR: role name is empty" << std::endl;
        return -EINVAL;
      }

      if (policy_name.empty()) {
        cerr << "ERROR: policy name is empty" << std::endl;
        return -EINVAL;
      }
      std::unique_ptr<rgw::sal::RGWRole> role = driver->get_role(role_name, tenant);
      ret = role->get(dpp(), null_yield);
      if (ret < 0) {
        return -ret;
      }
      // delete the in-memory policy first, then persist the updated role
      ret = role->delete_policy(dpp(), policy_name);
      if (ret < 0) {
        return -ret;
      }
      ret = role->update(dpp(), null_yield);
      if (ret < 0) {
        return -ret;
      }
      cout << "Policy: " << policy_name << " successfully deleted for role: "
           << role_name << std::endl;
      return 0;
    }
+ case OPT::ROLE_UPDATE:
+ {
+ if (role_name.empty()) {
+ cerr << "ERROR: role name is empty" << std::endl;
+ return -EINVAL;
+ }
+
+ std::unique_ptr<rgw::sal::RGWRole> role = driver->get_role(role_name, tenant);
+ ret = role->get(dpp(), null_yield);
+ if (ret < 0) {
+ return -ret;
+ }
+ if (!role->validate_max_session_duration(dpp())) {
+ ret = -EINVAL;
+ return ret;
+ }
+ role->update_max_session_duration(max_session_duration);
+ ret = role->update(dpp(), null_yield);
+ if (ret < 0) {
+ return -ret;
+ }
+ cout << "Max session duration updated successfully for role: " << role_name << std::endl;
+ return 0;
+ }
+ default:
+ output_user_info = false;
+ }
+
+ // output the result of a user operation
+ if (output_user_info) {
+ ret = ruser.info(info, &err_msg);
+ if (ret < 0) {
+ cerr << "could not fetch user info: " << err_msg << std::endl;
+ return -ret;
+ }
+ show_user_info(info, formatter.get());
+ }
+
+ if (opt_cmd == OPT::POLICY) {
+ if (format == "xml") {
+ int ret = RGWBucketAdminOp::dump_s3_policy(driver, bucket_op, cout, dpp());
+ if (ret < 0) {
+ cerr << "ERROR: failed to get policy: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ } else {
+ int ret = RGWBucketAdminOp::get_policy(driver, bucket_op, stream_flusher, dpp());
+ if (ret < 0) {
+ cerr << "ERROR: failed to get policy: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ }
+ }
+
+  if (opt_cmd == OPT::BUCKET_LIMIT_CHECK) {
+    void *handle;
+    std::list<std::string> user_ids;
+    metadata_key = "user";
+    int max = 1000;
+
+    // initialized so the do/while condition below never reads an
+    // indeterminate value if the first meta_list_keys_next() call fails
+    bool truncated = false;
+
+    if (!rgw::sal::User::empty(user)) {
+      user_ids.push_back(user->get_id().id);
+      ret =
+	RGWBucketAdminOp::limit_check(driver, bucket_op, user_ids, stream_flusher,
+				      null_yield, dpp(), warnings_only);
+    } else {
+      /* list users in groups of max-keys, then perform user-bucket
+       * limit-check on each group */
+      ret = driver->meta_list_keys_init(dpp(), metadata_key, string(), &handle);
+      if (ret < 0) {
+	cerr << "ERROR: buckets limit check can't get user metadata_key: "
+	     << cpp_strerror(-ret) << std::endl;
+	return -ret;
+      }
+
+      do {
+	ret = driver->meta_list_keys_next(dpp(), handle, max, user_ids,
+					  &truncated);
+	if (ret < 0 && ret != -ENOENT) {
+	  cerr << "ERROR: buckets limit check lists_keys_next(): "
+	       << cpp_strerror(-ret) << std::endl;
+	  break;
+	} else {
+	  /* ok, do the limit checks for this group */
+	  ret =
+	    RGWBucketAdminOp::limit_check(driver, bucket_op, user_ids, stream_flusher,
+					  null_yield, dpp(), warnings_only);
+	  if (ret < 0)
+	    break;
+	}
+	user_ids.clear();
+      } while (truncated);
+      driver->meta_list_keys_complete(handle);
+    }
+    return -ret;
+  } /* OPT::BUCKET_LIMIT_CHECK */
+
+ if (opt_cmd == OPT::BUCKETS_LIST) {
+ if (bucket_name.empty()) {
+ if (!rgw::sal::User::empty(user)) {
+ if (!user_op.has_existing_user()) {
+ cerr << "ERROR: could not find user: " << user << std::endl;
+ return -ENOENT;
+ }
+ }
+ RGWBucketAdminOp::info(driver, bucket_op, stream_flusher, null_yield, dpp());
+ } else {
+ int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket);
+ if (ret < 0) {
+ cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ formatter->open_array_section("entries");
+
+ int count = 0;
+
+ static constexpr int MAX_PAGINATE_SIZE = 10000;
+ static constexpr int DEFAULT_MAX_ENTRIES = 1000;
+
+ if (max_entries < 0) {
+ max_entries = DEFAULT_MAX_ENTRIES;
+ }
+ const int paginate_size = std::min(max_entries, MAX_PAGINATE_SIZE);
+
+ string prefix;
+ string delim;
+ string ns;
+
+ rgw::sal::Bucket::ListParams params;
+ rgw::sal::Bucket::ListResults results;
+
+ params.prefix = prefix;
+ params.delim = delim;
+ params.marker = rgw_obj_key(marker);
+ params.ns = ns;
+ params.enforce_ns = false;
+ params.list_versions = true;
+ params.allow_unordered = bool(allow_unordered);
+
+    do {
+      // page through the bucket index, never asking for more than the
+      // remaining entry budget in a single list() call
+      const int remaining = max_entries - count;
+      ret = bucket->list(dpp(), params, std::min(remaining, paginate_size), results,
+			 null_yield);
+      if (ret < 0) {
+        cerr << "ERROR: driver->list_objects(): " << cpp_strerror(-ret) << std::endl;
+        return -ret;
+      }
+      // debug message fixed: "sizie" -> "size", and separator added before
+      // is_truncated so the fields no longer run together in the log
+      ldpp_dout(dpp(), 20) << "INFO: " << __func__ <<
+	": list() returned without error; results.objs.size()=" <<
+	results.objs.size() << ", results.is_truncated=" << results.is_truncated << ", marker=" <<
+	params.marker << dendl;
+
+      count += results.objs.size();
+
+      for (const auto& entry : results.objs) {
+        encode_json("entry", entry, formatter.get());
+      }
+      formatter->flush(cout);
+    } while (results.is_truncated && count < max_entries);
+ ldpp_dout(dpp(), 20) << "INFO: " << __func__ << ": done" << dendl;
+
+ formatter->close_section();
+ formatter->flush(cout);
+ } /* have bucket_name */
+ } /* OPT::BUCKETS_LIST */
+
+ if (opt_cmd == OPT::BUCKET_RADOS_LIST) {
+ RGWRadosList lister(static_cast<rgw::sal::RadosStore*>(driver),
+ max_concurrent_ios, orphan_stale_secs, tenant);
+ if (rgw_obj_fs) {
+ lister.set_field_separator(*rgw_obj_fs);
+ }
+
+ if (bucket_name.empty()) {
+ // yes_i_really_mean_it means continue with listing even if
+ // there are indexless buckets
+ ret = lister.run(dpp(), yes_i_really_mean_it);
+ } else {
+ ret = lister.run(dpp(), bucket_name);
+ }
+
+ if (ret < 0) {
+ std::cerr <<
+ "ERROR: bucket radoslist failed to finish before " <<
+ "encountering error: " << cpp_strerror(-ret) << std::endl;
+ std::cerr << "************************************"
+ "************************************" << std::endl;
+ std::cerr << "WARNING: THE RESULTS ARE NOT RELIABLE AND SHOULD NOT " <<
+ "BE USED IN DELETING ORPHANS" << std::endl;
+ std::cerr << "************************************"
+ "************************************" << std::endl;
+ return -ret;
+ }
+ }
+
+ if (opt_cmd == OPT::BUCKET_LAYOUT) {
+ if (bucket_name.empty()) {
+ cerr << "ERROR: bucket not specified" << std::endl;
+ return EINVAL;
+ }
+ int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket);
+ if (ret < 0) {
+ return -ret;
+ }
+ const auto& bucket_info = bucket->get_info();
+ formatter->open_object_section("layout");
+ encode_json("layout", bucket_info.layout, formatter.get());
+ formatter->close_section();
+ formatter->flush(cout);
+ }
+
+ if (opt_cmd == OPT::BUCKET_STATS) {
+ if (bucket_name.empty() && !bucket_id.empty()) {
+ rgw_bucket bucket;
+ if (!rgw_find_bucket_by_id(dpp(), driver->ctx(), driver, marker, bucket_id, &bucket)) {
+ cerr << "failure: no such bucket id" << std::endl;
+ return -ENOENT;
+ }
+ bucket_op.set_tenant(bucket.tenant);
+ bucket_op.set_bucket_name(bucket.name);
+ }
+ bucket_op.set_fetch_stats(true);
+
+ int r = RGWBucketAdminOp::info(driver, bucket_op, stream_flusher, null_yield, dpp());
+ if (r < 0) {
+ cerr << "failure: " << cpp_strerror(-r) << ": " << err << std::endl;
+ return posix_errortrans(-r);
+ }
+ }
+
+ if (opt_cmd == OPT::BUCKET_LINK) {
+ bucket_op.set_bucket_id(bucket_id);
+ bucket_op.set_new_bucket_name(new_bucket_name);
+ string err;
+ int r = RGWBucketAdminOp::link(driver, bucket_op, dpp(), &err);
+ if (r < 0) {
+ cerr << "failure: " << cpp_strerror(-r) << ": " << err << std::endl;
+ return -r;
+ }
+ }
+
+ if (opt_cmd == OPT::BUCKET_UNLINK) {
+ int r = RGWBucketAdminOp::unlink(driver, bucket_op, dpp());
+ if (r < 0) {
+ cerr << "failure: " << cpp_strerror(-r) << std::endl;
+ return -r;
+ }
+ }
+
+ if (opt_cmd == OPT::BUCKET_SHARD_OBJECTS) {
+ const auto prefix = opt_prefix ? *opt_prefix : "obj"s;
+ if (!num_shards_specified) {
+ cerr << "ERROR: num-shards must be specified."
+ << std::endl;
+ return EINVAL;
+ }
+
+ if (specified_shard_id) {
+ if (shard_id >= num_shards) {
+ cerr << "ERROR: shard-id must be less than num-shards."
+ << std::endl;
+ return EINVAL;
+ }
+ std::string obj;
+ uint64_t ctr = 0;
+ int shard;
+ do {
+ obj = fmt::format("{}{:0>20}", prefix, ctr);
+ shard = RGWSI_BucketIndex_RADOS::bucket_shard_index(obj, num_shards);
+ ++ctr;
+ } while (shard != shard_id);
+
+ formatter->open_object_section("shard_obj");
+ encode_json("obj", obj, formatter.get());
+ formatter->close_section();
+ formatter->flush(cout);
+ } else {
+ std::vector<std::string> objs(num_shards);
+ for (uint64_t ctr = 0, shardsleft = num_shards; shardsleft > 0; ++ctr) {
+ auto key = fmt::format("{}{:0>20}", prefix, ctr);
+ auto shard = RGWSI_BucketIndex_RADOS::bucket_shard_index(key, num_shards);
+ if (objs[shard].empty()) {
+ objs[shard] = std::move(key);
+ --shardsleft;
+ }
+ }
+
+ formatter->open_object_section("shard_objs");
+ encode_json("objs", objs, formatter.get());
+ formatter->close_section();
+ formatter->flush(cout);
+ }
+ }
+
+ if (opt_cmd == OPT::BUCKET_OBJECT_SHARD) {
+ if (!num_shards_specified || object.empty()) {
+ cerr << "ERROR: num-shards and object must be specified."
+ << std::endl;
+ return EINVAL;
+ }
+ auto shard = RGWSI_BucketIndex_RADOS::bucket_shard_index(object, num_shards);
+ formatter->open_object_section("obj_shard");
+ encode_json("shard", shard, formatter.get());
+ formatter->close_section();
+ formatter->flush(cout);
+ }
+
+ if (opt_cmd == OPT::BUCKET_RESYNC_ENCRYPTED_MULTIPART) {
+ // repair logic for replication of encrypted multipart uploads:
+ // https://tracker.ceph.com/issues/46062
+ if (bucket_name.empty()) {
+ cerr << "ERROR: bucket not specified" << std::endl;
+ return EINVAL;
+ }
+ int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket);
+ if (ret < 0) {
+ return -ret;
+ }
+
+ auto rados_driver = dynamic_cast<rgw::sal::RadosStore*>(driver);
+ if (!rados_driver) {
+ cerr << "ERROR: this command can only work when the cluster "
+ "has a RADOS backing store." << std::endl;
+ return EPERM;
+ }
+
+ // fail if recovery wouldn't generate replication log entries
+ if (!rados_driver->svc()->zone->need_to_log_data() && !yes_i_really_mean_it) {
+ cerr << "This command is only necessary for replicated buckets." << std::endl;
+ cerr << "do you really mean it? (requires --yes-i-really-mean-it)" << std::endl;
+ return EPERM;
+ }
+
+ formatter->open_object_section("modified");
+ encode_json("bucket", bucket->get_name(), formatter.get());
+ encode_json("bucket_id", bucket->get_bucket_id(), formatter.get());
+
+ ret = rados_driver->getRados()->bucket_resync_encrypted_multipart(
+ dpp(), null_yield, rados_driver, bucket->get_info(),
+ marker, stream_flusher);
+ if (ret < 0) {
+ return -ret;
+ }
+ formatter->close_section();
+ formatter->flush(cout);
+ return 0;
+ }
+
+ if (opt_cmd == OPT::BUCKET_CHOWN) {
+ if (bucket_name.empty()) {
+ cerr << "ERROR: bucket name not specified" << std::endl;
+ return EINVAL;
+ }
+
+ bucket_op.set_bucket_name(bucket_name);
+ bucket_op.set_new_bucket_name(new_bucket_name);
+ string err;
+
+ int r = RGWBucketAdminOp::chown(driver, bucket_op, marker, dpp(), &err);
+ if (r < 0) {
+ cerr << "failure: " << cpp_strerror(-r) << ": " << err << std::endl;
+ return -r;
+ }
+ }
+
+ if (opt_cmd == OPT::LOG_LIST) {
+ // filter by date?
+ if (date.size() && date.size() != 10) {
+ cerr << "bad date format for '" << date << "', expect YYYY-MM-DD" << std::endl;
+ return EINVAL;
+ }
+
+ formatter->reset();
+ formatter->open_array_section("logs");
+ RGWAccessHandle h;
+ int r = static_cast<rgw::sal::RadosStore*>(driver)->getRados()->log_list_init(dpp(), date, &h);
+ if (r == -ENOENT) {
+ // no logs.
+ } else {
+ if (r < 0) {
+ cerr << "log list: error " << r << std::endl;
+ return -r;
+ }
+ while (true) {
+ string name;
+ int r = static_cast<rgw::sal::RadosStore*>(driver)->getRados()->log_list_next(h, &name);
+ if (r == -ENOENT)
+ break;
+ if (r < 0) {
+ cerr << "log list: error " << r << std::endl;
+ return -r;
+ }
+ formatter->dump_string("object", name);
+ }
+ }
+ formatter->close_section();
+ formatter->flush(cout);
+ cout << std::endl;
+ }
+
+ if (opt_cmd == OPT::LOG_SHOW || opt_cmd == OPT::LOG_RM) {
+ if (object.empty() && (date.empty() || bucket_name.empty() || bucket_id.empty())) {
+ cerr << "specify an object or a date, bucket and bucket-id" << std::endl;
+ exit(1);
+ }
+
+ string oid;
+ if (!object.empty()) {
+ oid = object;
+ } else {
+ oid = date;
+ oid += "-";
+ oid += bucket_id;
+ oid += "-";
+ oid += bucket_name;
+ }
+
+ if (opt_cmd == OPT::LOG_SHOW) {
+ RGWAccessHandle h;
+
+ int r = static_cast<rgw::sal::RadosStore*>(driver)->getRados()->log_show_init(dpp(), oid, &h);
+ if (r < 0) {
+ cerr << "error opening log " << oid << ": " << cpp_strerror(-r) << std::endl;
+ return -r;
+ }
+
+ formatter->reset();
+ formatter->open_object_section("log");
+
+ struct rgw_log_entry entry;
+
+ // peek at first entry to get bucket metadata
+ r = static_cast<rgw::sal::RadosStore*>(driver)->getRados()->log_show_next(dpp(), h, &entry);
+ if (r < 0) {
+ cerr << "error reading log " << oid << ": " << cpp_strerror(-r) << std::endl;
+ return -r;
+ }
+ formatter->dump_string("bucket_id", entry.bucket_id);
+ formatter->dump_string("bucket_owner", entry.bucket_owner.to_str());
+ formatter->dump_string("bucket", entry.bucket);
+
+ uint64_t agg_time = 0;
+ uint64_t agg_bytes_sent = 0;
+ uint64_t agg_bytes_received = 0;
+ uint64_t total_entries = 0;
+
+ if (show_log_entries)
+ formatter->open_array_section("log_entries");
+
+ do {
+ using namespace std::chrono;
+ uint64_t total_time = duration_cast<milliseconds>(entry.total_time).count();
+
+ agg_time += total_time;
+ agg_bytes_sent += entry.bytes_sent;
+ agg_bytes_received += entry.bytes_received;
+ total_entries++;
+
+ if (skip_zero_entries && entry.bytes_sent == 0 &&
+ entry.bytes_received == 0)
+ goto next;
+
+ if (show_log_entries) {
+
+ rgw_format_ops_log_entry(entry, formatter.get());
+ formatter->flush(cout);
+ }
+next:
+ r = static_cast<rgw::sal::RadosStore*>(driver)->getRados()->log_show_next(dpp(), h, &entry);
+ } while (r > 0);
+
+ if (r < 0) {
+ cerr << "error reading log " << oid << ": " << cpp_strerror(-r) << std::endl;
+ return -r;
+ }
+ if (show_log_entries)
+ formatter->close_section();
+
+ if (show_log_sum) {
+ formatter->open_object_section("log_sum");
+ formatter->dump_int("bytes_sent", agg_bytes_sent);
+ formatter->dump_int("bytes_received", agg_bytes_received);
+ formatter->dump_int("total_time", agg_time);
+ formatter->dump_int("total_entries", total_entries);
+ formatter->close_section();
+ }
+ formatter->close_section();
+ formatter->flush(cout);
+ cout << std::endl;
+ }
+ if (opt_cmd == OPT::LOG_RM) {
+ int r = static_cast<rgw::sal::RadosStore*>(driver)->getRados()->log_remove(dpp(), oid);
+ if (r < 0) {
+ cerr << "error removing log " << oid << ": " << cpp_strerror(-r) << std::endl;
+ return -r;
+ }
+ }
+ }
+
+ if (opt_cmd == OPT::POOL_ADD) {
+ if (pool_name.empty()) {
+ cerr << "need to specify pool to add!" << std::endl;
+ exit(1);
+ }
+
+ int ret = static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone->add_bucket_placement(dpp(), pool, null_yield);
+ if (ret < 0)
+ cerr << "failed to add bucket placement: " << cpp_strerror(-ret) << std::endl;
+ }
+
+ if (opt_cmd == OPT::POOL_RM) {
+ if (pool_name.empty()) {
+ cerr << "need to specify pool to remove!" << std::endl;
+ exit(1);
+ }
+
+ int ret = static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone->remove_bucket_placement(dpp(), pool, null_yield);
+ if (ret < 0)
+ cerr << "failed to remove bucket placement: " << cpp_strerror(-ret) << std::endl;
+ }
+
+ if (opt_cmd == OPT::POOLS_LIST) {
+ set<rgw_pool> pools;
+ int ret = static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone->list_placement_set(dpp(), pools, null_yield);
+ if (ret < 0) {
+ cerr << "could not list placement set: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ formatter->reset();
+ formatter->open_array_section("pools");
+ for (auto siter = pools.begin(); siter != pools.end(); ++siter) {
+ formatter->open_object_section("pool");
+ formatter->dump_string("name", siter->to_str());
+ formatter->close_section();
+ }
+ formatter->close_section();
+ formatter->flush(cout);
+ cout << std::endl;
+ }
+
+ if (opt_cmd == OPT::USAGE_SHOW) {
+ uint64_t start_epoch = 0;
+ uint64_t end_epoch = (uint64_t)-1;
+
+ int ret;
+
+ if (!start_date.empty()) {
+ ret = utime_t::parse_date(start_date, &start_epoch, NULL);
+ if (ret < 0) {
+ cerr << "ERROR: failed to parse start date" << std::endl;
+ return 1;
+ }
+ }
+ if (!end_date.empty()) {
+ ret = utime_t::parse_date(end_date, &end_epoch, NULL);
+ if (ret < 0) {
+ cerr << "ERROR: failed to parse end date" << std::endl;
+ return 1;
+ }
+ }
+
+
+ if (!bucket_name.empty()) {
+ int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket);
+ if (ret < 0) {
+ cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ }
+ ret = RGWUsage::show(dpp(), driver, user.get(), bucket.get(), start_epoch,
+ end_epoch, show_log_entries, show_log_sum, &categories,
+ stream_flusher);
+ if (ret < 0) {
+ cerr << "ERROR: failed to show usage" << std::endl;
+ return 1;
+ }
+ }
+
+ if (opt_cmd == OPT::USAGE_TRIM) {
+ if (rgw::sal::User::empty(user) && bucket_name.empty() &&
+ start_date.empty() && end_date.empty() && !yes_i_really_mean_it) {
+ cerr << "usage trim without user/date/bucket specified will remove *all* users data" << std::endl;
+ cerr << "do you really mean it? (requires --yes-i-really-mean-it)" << std::endl;
+ return 1;
+ }
+ int ret;
+ uint64_t start_epoch = 0;
+ uint64_t end_epoch = (uint64_t)-1;
+
+
+ if (!start_date.empty()) {
+ ret = utime_t::parse_date(start_date, &start_epoch, NULL);
+ if (ret < 0) {
+ cerr << "ERROR: failed to parse start date" << std::endl;
+ return 1;
+ }
+ }
+
+ if (!end_date.empty()) {
+ ret = utime_t::parse_date(end_date, &end_epoch, NULL);
+ if (ret < 0) {
+ cerr << "ERROR: failed to parse end date" << std::endl;
+ return 1;
+ }
+ }
+
+ if (!bucket_name.empty()) {
+ int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket);
+ if (ret < 0) {
+ cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ }
+ ret = RGWUsage::trim(dpp(), driver, user.get(), bucket.get(), start_epoch, end_epoch);
+ if (ret < 0) {
+ cerr << "ERROR: read_usage() returned ret=" << ret << std::endl;
+ return 1;
+ }
+ }
+
+ if (opt_cmd == OPT::USAGE_CLEAR) {
+ if (!yes_i_really_mean_it) {
+ cerr << "usage clear would remove *all* users usage data for all time" << std::endl;
+ cerr << "do you really mean it? (requires --yes-i-really-mean-it)" << std::endl;
+ return 1;
+ }
+
+ ret = RGWUsage::clear(dpp(), driver);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+
+ if (opt_cmd == OPT::OLH_GET || opt_cmd == OPT::OLH_READLOG) {
+ if (bucket_name.empty()) {
+ cerr << "ERROR: bucket not specified" << std::endl;
+ return EINVAL;
+ }
+ if (object.empty()) {
+ cerr << "ERROR: object not specified" << std::endl;
+ return EINVAL;
+ }
+ }
+
+ if (opt_cmd == OPT::OLH_GET) {
+ int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket);
+ if (ret < 0) {
+ cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ RGWOLHInfo olh;
+ rgw_obj obj(bucket->get_key(), object);
+ ret = static_cast<rgw::sal::RadosStore*>(driver)->getRados()->get_olh(dpp(), bucket->get_info(), obj, &olh);
+ if (ret < 0) {
+ cerr << "ERROR: failed reading olh: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ encode_json("olh", olh, formatter.get());
+ formatter->flush(cout);
+ }
+
+ if (opt_cmd == OPT::OLH_READLOG) {
+ int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket);
+ if (ret < 0) {
+ cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ map<uint64_t, vector<rgw_bucket_olh_log_entry> > log;
+ bool is_truncated;
+
+ std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(object);
+
+ RGWObjState *state;
+
+ ret = obj->get_obj_state(dpp(), &state, null_yield);
+ if (ret < 0) {
+ return -ret;
+ }
+
+ ret = static_cast<rgw::sal::RadosStore*>(driver)->getRados()->bucket_index_read_olh_log(dpp(), bucket->get_info(), *state, obj->get_obj(), 0, &log, &is_truncated);
+ if (ret < 0) {
+ cerr << "ERROR: failed reading olh: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ formatter->open_object_section("result");
+ encode_json("is_truncated", is_truncated, formatter.get());
+ encode_json("log", log, formatter.get());
+ formatter->close_section();
+ formatter->flush(cout);
+ }
+
+ if (opt_cmd == OPT::BI_GET) {
+ if (bucket_name.empty()) {
+ cerr << "ERROR: bucket name not specified" << std::endl;
+ return EINVAL;
+ }
+ if (object.empty()) {
+ cerr << "ERROR: object not specified" << std::endl;
+ return EINVAL;
+ }
+ int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket);
+ if (ret < 0) {
+ cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ rgw_obj obj(bucket->get_key(), object);
+ if (!object_version.empty()) {
+ obj.key.set_instance(object_version);
+ }
+
+ rgw_cls_bi_entry entry;
+ ret = static_cast<rgw::sal::RadosStore*>(driver)->getRados()->bi_get(dpp(), bucket->get_info(), obj, bi_index_type, &entry);
+ if (ret < 0) {
+ cerr << "ERROR: bi_get(): " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ encode_json("entry", entry, formatter.get());
+ formatter->flush(cout);
+ }
+
+ if (opt_cmd == OPT::BI_PUT) {
+ if (bucket_name.empty()) {
+ cerr << "ERROR: bucket name not specified" << std::endl;
+ return EINVAL;
+ }
+ int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket);
+ if (ret < 0) {
+ cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ rgw_cls_bi_entry entry;
+ cls_rgw_obj_key key;
+ ret = read_decode_json(infile, entry, &key);
+ if (ret < 0) {
+ return 1;
+ }
+
+ rgw_obj obj(bucket->get_key(), key);
+
+ ret = static_cast<rgw::sal::RadosStore*>(driver)->getRados()->bi_put(dpp(), bucket->get_key(), obj, entry);
+ if (ret < 0) {
+ cerr << "ERROR: bi_put(): " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ }
+
+ if (opt_cmd == OPT::BI_LIST) {
+ if (bucket_name.empty()) {
+ cerr << "ERROR: bucket name not specified" << std::endl;
+ return EINVAL;
+ }
+
+ int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket);
+ if (ret < 0) {
+ cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ std::list<rgw_cls_bi_entry> entries;
+ bool is_truncated;
+ const auto& index = bucket->get_info().layout.current_index;
+ const int max_shards = rgw::num_shards(index);
+ if (max_entries < 0) {
+ max_entries = 1000;
+ }
+
+ ldpp_dout(dpp(), 20) << "INFO: " << __func__ << ": max_entries=" << max_entries <<
+ ", index=" << index << ", max_shards=" << max_shards << dendl;
+
+ formatter->open_array_section("entries");
+
+ int i = (specified_shard_id ? shard_id : 0);
+ for (; i < max_shards; i++) {
+ ldpp_dout(dpp(), 20) << "INFO: " << __func__ << ": starting shard=" << i << dendl;
+
+ RGWRados::BucketShard bs(static_cast<rgw::sal::RadosStore*>(driver)->getRados());
+ int ret = bs.init(dpp(), bucket->get_info(), index, i);
+ marker.clear();
+
+ if (ret < 0) {
+ cerr << "ERROR: bs.init(bucket=" << bucket << ", shard=" << i << "): " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ do {
+ entries.clear();
+ // if object is specified, we use that as a filter to only retrieve some some entries
+ ret = static_cast<rgw::sal::RadosStore*>(driver)->getRados()->bi_list(bs, object, marker, max_entries, &entries, &is_truncated);
+ if (ret < 0) {
+ cerr << "ERROR: bi_list(): " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ ldpp_dout(dpp(), 20) << "INFO: " << __func__ <<
+ ": bi_list() returned without error; entries.size()=" <<
+ entries.size() << ", is_truncated=" << is_truncated <<
+ ", marker=" << marker << dendl;
+
+ for (const auto& entry : entries) {
+ encode_json("entry", entry, formatter.get());
+ marker = entry.idx;
+ }
+ formatter->flush(cout);
+ } while (is_truncated);
+
+ formatter->flush(cout);
+
+ if (specified_shard_id) {
+ break;
+ }
+ }
+ ldpp_dout(dpp(), 20) << "INFO: " << __func__ << ": done" << dendl;
+
+ formatter->close_section();
+ formatter->flush(cout);
+ }
+
+ if (opt_cmd == OPT::BI_PURGE) {
+ if (bucket_name.empty()) {
+ cerr << "ERROR: bucket name not specified" << std::endl;
+ return EINVAL;
+ }
+ int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket);
+ if (ret < 0) {
+ cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ std::unique_ptr<rgw::sal::Bucket> cur_bucket;
+ ret = init_bucket(user.get(), tenant, bucket_name, string(), &cur_bucket);
+ if (ret == -ENOENT) {
+ // no bucket entrypoint
+ } else if (ret < 0) {
+ cerr << "ERROR: could not init current bucket info for bucket_name=" << bucket_name << ": " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ } else if (cur_bucket->get_bucket_id() == bucket->get_bucket_id() &&
+ !yes_i_really_mean_it) {
+ cerr << "specified bucket instance points to a current bucket instance" << std::endl;
+ cerr << "do you really mean it? (requires --yes-i-really-mean-it)" << std::endl;
+ return EINVAL;
+ }
+
+ const auto& index = bucket->get_info().layout.current_index;
+ if (index.layout.type == rgw::BucketIndexType::Indexless) {
+ cerr << "ERROR: indexless bucket has no index to purge" << std::endl;
+ return EINVAL;
+ }
+
+ const int max_shards = rgw::num_shards(index);
+ for (int i = 0; i < max_shards; i++) {
+ RGWRados::BucketShard bs(static_cast<rgw::sal::RadosStore*>(driver)->getRados());
+ int ret = bs.init(dpp(), bucket->get_info(), index, i);
+ if (ret < 0) {
+ cerr << "ERROR: bs.init(bucket=" << bucket << ", shard=" << i << "): " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ ret = static_cast<rgw::sal::RadosStore*>(driver)->getRados()->bi_remove(dpp(), bs);
+ if (ret < 0) {
+ cerr << "ERROR: failed to remove bucket index object: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ }
+ }
+
+  if (opt_cmd == OPT::OBJECT_PUT) {
+    if (bucket_name.empty()) {
+      cerr << "ERROR: bucket not specified" << std::endl;
+      return EINVAL;
+    }
+    if (object.empty()) {
+      cerr << "ERROR: object not specified" << std::endl;
+      return EINVAL;
+    }
+
+    RGWDataAccess data_access(driver);
+    rgw_obj_key key(object, object_version);
+
+    RGWDataAccess::BucketRef b;
+    RGWDataAccess::ObjectRef obj;
+
+    int ret = data_access.get_bucket(dpp(), tenant, bucket_name, bucket_id, &b, null_yield);
+    if (ret < 0) {
+      cerr << "ERROR: failed to init bucket: " << cpp_strerror(-ret) << std::endl;
+      return -ret;
+    }
+
+    ret = b->get_object(key, &obj);
+    if (ret < 0) {
+      cerr << "ERROR: failed to get object: " << cpp_strerror(-ret) << std::endl;
+      return -ret;
+    }
+
+    bufferlist bl;
+    ret = read_input(infile, bl);
+    if (ret < 0) {
+      cerr << "ERROR: failed to read input: " << cpp_strerror(-ret) << std::endl;
+      // abort instead of uploading an empty/partial buffer
+      return -ret;
+    }
+
+    map<string, bufferlist> attrs;
+    ret = obj->put(bl, attrs, dpp(), null_yield);
+    if (ret < 0) {
+      cerr << "ERROR: put object returned error: " << cpp_strerror(-ret) << std::endl;
+      // propagate the failure; previously fell through and exited with status 0
+      return -ret;
+    }
+  }
+
+ if (opt_cmd == OPT::OBJECT_RM) {
+ int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket);
+ if (ret < 0) {
+ cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ rgw_obj_key key(object, object_version);
+ ret = rgw_remove_object(dpp(), driver, bucket.get(), key);
+
+ if (ret < 0) {
+ cerr << "ERROR: object remove returned: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ }
+
+ if (opt_cmd == OPT::OBJECT_REWRITE) {
+ if (bucket_name.empty()) {
+ cerr << "ERROR: bucket not specified" << std::endl;
+ return EINVAL;
+ }
+ if (object.empty()) {
+ cerr << "ERROR: object not specified" << std::endl;
+ return EINVAL;
+ }
+
+ int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket);
+ if (ret < 0) {
+ cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(object);
+ obj->set_instance(object_version);
+ bool need_rewrite = true;
+ if (min_rewrite_stripe_size > 0) {
+ ret = check_min_obj_stripe_size(driver, obj.get(), min_rewrite_stripe_size, &need_rewrite);
+ if (ret < 0) {
+ ldpp_dout(dpp(), 0) << "WARNING: check_min_obj_stripe_size failed, r=" << ret << dendl;
+ }
+ }
+ if (need_rewrite) {
+ RGWRados* store = static_cast<rgw::sal::RadosStore*>(driver)->getRados();
+ ret = store->rewrite_obj(bucket->get_info(), obj->get_obj(), dpp(), null_yield);
+ if (ret < 0) {
+ cerr << "ERROR: object rewrite returned: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ } else {
+ ldpp_dout(dpp(), 20) << "skipped object" << dendl;
+ }
+ } // OPT::OBJECT_REWRITE
+
+ if (opt_cmd == OPT::OBJECT_REINDEX) {
+ if (bucket_name.empty()) {
+ cerr << "ERROR: --bucket not specified." << std::endl;
+ return EINVAL;
+ }
+ if (object.empty() && objects_file.empty()) {
+ cerr << "ERROR: neither --object nor --objects-file specified." << std::endl;
+ return EINVAL;
+ } else if (!object.empty() && !objects_file.empty()) {
+ cerr << "ERROR: both --object and --objects-file specified and only one is allowed." << std::endl;
+ return EINVAL;
+ } else if (!objects_file.empty() && !object_version.empty()) {
+ cerr << "ERROR: cannot specify --object_version when --objects-file specified." << std::endl;
+ return EINVAL;
+ }
+
+ int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket);
+ if (ret < 0) {
+ cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) <<
+ "." << std::endl;
+ return -ret;
+ }
+
+ rgw::sal::RadosStore* rados_store = dynamic_cast<rgw::sal::RadosStore*>(driver);
+ if (!rados_store) {
+ cerr <<
+ "ERROR: this command can only work when the cluster has a RADOS backing store." <<
+ std::endl;
+ return EPERM;
+ }
+ RGWRados* store = rados_store->getRados();
+
+ auto process = [&](const std::string& p_object, const std::string& p_object_version) -> int {
+ std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(p_object);
+ obj->set_instance(p_object_version);
+ ret = store->reindex_obj(bucket->get_info(), obj->get_obj(), dpp(), null_yield);
+ if (ret < 0) {
+ return ret;
+ }
+ return 0;
+ };
+
+ if (!object.empty()) {
+ ret = process(object, object_version);
+ if (ret < 0) {
+ return -ret;
+ }
+ } else {
+ std::ifstream file;
+ file.open(objects_file);
+ if (!file.is_open()) {
+ std::cerr << "ERROR: unable to open objects-file \"" <<
+ objects_file << "\"." << std::endl;
+ return ENOENT;
+ }
+
+ std::string obj_name;
+ const std::string empty_version;
+ while (std::getline(file, obj_name)) {
+ ret = process(obj_name, empty_version);
+ if (ret < 0) {
+ std::cerr << "ERROR: while processing \"" << obj_name <<
+ "\", received " << cpp_strerror(-ret) << "." << std::endl;
+ if (!yes_i_really_mean_it) {
+ std::cerr <<
+ "NOTE: with *caution* you can use --yes-i-really-mean-it to push through errors and continue processing." <<
+ std::endl;
+ return -ret;
+ }
+ }
+ } // while
+ }
+ } // OPT::OBJECT_REINDEX
+
+ if (opt_cmd == OPT::OBJECTS_EXPIRE) {
+ if (!static_cast<rgw::sal::RadosStore*>(driver)->getRados()->process_expire_objects(dpp())) {
+ cerr << "ERROR: process_expire_objects() processing returned error." << std::endl;
+ return 1;
+ }
+ }
+
+ if (opt_cmd == OPT::OBJECTS_EXPIRE_STALE_LIST) {
+ ret = RGWBucketAdminOp::fix_obj_expiry(driver, bucket_op, stream_flusher, dpp(), true);
+ if (ret < 0) {
+ cerr << "ERROR: listing returned " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ }
+
+ if (opt_cmd == OPT::OBJECTS_EXPIRE_STALE_RM) {
+ ret = RGWBucketAdminOp::fix_obj_expiry(driver, bucket_op, stream_flusher, dpp(), false);
+ if (ret < 0) {
+ cerr << "ERROR: removing returned " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ }
+
+ if (opt_cmd == OPT::BUCKET_REWRITE) {
+ if (bucket_name.empty()) {
+ cerr << "ERROR: bucket not specified" << std::endl;
+ return EINVAL;
+ }
+
+ int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket);
+ if (ret < 0) {
+ cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ uint64_t start_epoch = 0;
+ uint64_t end_epoch = 0;
+
+ if (!end_date.empty()) {
+ int ret = utime_t::parse_date(end_date, &end_epoch, NULL);
+ if (ret < 0) {
+ cerr << "ERROR: failed to parse end date" << std::endl;
+ return EINVAL;
+ }
+ }
+ if (!start_date.empty()) {
+ int ret = utime_t::parse_date(start_date, &start_epoch, NULL);
+ if (ret < 0) {
+ cerr << "ERROR: failed to parse start date" << std::endl;
+ return EINVAL;
+ }
+ }
+
+ bool is_truncated = true;
+ bool cls_filtered = true;
+
+ rgw_obj_index_key marker;
+ string empty_prefix;
+ string empty_delimiter;
+
+ formatter->open_object_section("result");
+ formatter->dump_string("bucket", bucket_name);
+ formatter->open_array_section("objects");
+
+ constexpr uint32_t NUM_ENTRIES = 1000;
+ uint16_t expansion_factor = 1;
+ while (is_truncated) {
+ RGWRados::ent_map_t result;
+ result.reserve(NUM_ENTRIES);
+
+ const auto& current_index = bucket->get_info().layout.current_index;
+ int r = static_cast<rgw::sal::RadosStore*>(driver)->getRados()->cls_bucket_list_ordered(
+ dpp(), bucket->get_info(), current_index, RGW_NO_SHARD,
+ marker, empty_prefix, empty_delimiter,
+ NUM_ENTRIES, true, expansion_factor,
+ result, &is_truncated, &cls_filtered, &marker,
+ null_yield,
+ rgw_bucket_object_check_filter);
+ if (r < 0 && r != -ENOENT) {
+ cerr << "ERROR: failed operation r=" << r << std::endl;
+ } else if (r == -ENOENT) {
+ break;
+ }
+
+ if (result.size() < NUM_ENTRIES / 8) {
+ ++expansion_factor;
+ } else if (result.size() > NUM_ENTRIES * 7 / 8 &&
+ expansion_factor > 1) {
+ --expansion_factor;
+ }
+
+ for (auto iter = result.begin(); iter != result.end(); ++iter) {
+ rgw_obj_key key = iter->second.key;
+ rgw_bucket_dir_entry& entry = iter->second;
+
+ formatter->open_object_section("object");
+ formatter->dump_string("name", key.name);
+ formatter->dump_string("instance", key.instance);
+ formatter->dump_int("size", entry.meta.size);
+ utime_t ut(entry.meta.mtime);
+ ut.gmtime(formatter->dump_stream("mtime"));
+
+ if ((entry.meta.size < min_rewrite_size) ||
+ (entry.meta.size > max_rewrite_size) ||
+ (start_epoch > 0 && start_epoch > (uint64_t)ut.sec()) ||
+ (end_epoch > 0 && end_epoch < (uint64_t)ut.sec())) {
+ formatter->dump_string("status", "Skipped");
+ } else {
+ std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(key);
+
+ bool need_rewrite = true;
+ if (min_rewrite_stripe_size > 0) {
+ r = check_min_obj_stripe_size(driver, obj.get(), min_rewrite_stripe_size, &need_rewrite);
+ if (r < 0) {
+ ldpp_dout(dpp(), 0) << "WARNING: check_min_obj_stripe_size failed, r=" << r << dendl;
+ }
+ }
+ if (!need_rewrite) {
+ formatter->dump_string("status", "Skipped");
+ } else {
+ RGWRados* store = static_cast<rgw::sal::RadosStore*>(driver)->getRados();
+ r = store->rewrite_obj(bucket->get_info(), obj->get_obj(), dpp(), null_yield);
+ if (r == 0) {
+ formatter->dump_string("status", "Success");
+ } else {
+ formatter->dump_string("status", cpp_strerror(-r));
+ }
+ }
+ }
+ formatter->dump_int("flags", entry.flags);
+
+ formatter->close_section();
+ formatter->flush(cout);
+ }
+ }
+ formatter->close_section();
+ formatter->close_section();
+ formatter->flush(cout);
+ }
+
+ if (opt_cmd == OPT::BUCKET_RESHARD) {
+ int ret = check_reshard_bucket_params(driver,
+ bucket_name,
+ tenant,
+ bucket_id,
+ num_shards_specified,
+ num_shards,
+ yes_i_really_mean_it,
+ &bucket);
+ if (ret < 0) {
+ return ret;
+ }
+
+ auto zone_svc = static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone;
+ if (!zone_svc->can_reshard()) {
+ const auto& zonegroup = zone_svc->get_zonegroup();
+ std::cerr << "The zonegroup '" << zonegroup.get_name() << "' does not "
+ "have the resharding feature enabled." << std::endl;
+ return ENOTSUP;
+ }
+ if (!RGWBucketReshard::can_reshard(bucket->get_info(), zone_svc) &&
+ !yes_i_really_mean_it) {
+ std::cerr << "Bucket '" << bucket->get_name() << "' already has too many "
+ "log generations (" << bucket->get_info().layout.logs.size() << ") "
+ "from previous reshards that peer zones haven't finished syncing. "
+ "Resharding is not recommended until the old generations sync, but "
+ "you can force a reshard with --yes-i-really-mean-it." << std::endl;
+ return EINVAL;
+ }
+
+ RGWBucketReshard br(static_cast<rgw::sal::RadosStore*>(driver),
+ bucket->get_info(), bucket->get_attrs(),
+ nullptr /* no callback */);
+
+#define DEFAULT_RESHARD_MAX_ENTRIES 1000
+ if (max_entries < 1) {
+ max_entries = DEFAULT_RESHARD_MAX_ENTRIES;
+ }
+
+ ReshardFaultInjector fault;
+ if (inject_error_at) {
+ const int code = -inject_error_code.value_or(EIO);
+ fault.inject(*inject_error_at, InjectError{code, dpp()});
+ } else if (inject_abort_at) {
+ fault.inject(*inject_abort_at, InjectAbort{});
+ } else if (inject_delay_at) {
+ fault.inject(*inject_delay_at, InjectDelay{inject_delay, dpp()});
+ }
+ ret = br.execute(num_shards, fault, max_entries, dpp(),
+ verbose, &cout, formatter.get());
+ return -ret;
+ }
+
+ if (opt_cmd == OPT::RESHARD_ADD) {
+ int ret = check_reshard_bucket_params(driver,
+ bucket_name,
+ tenant,
+ bucket_id,
+ num_shards_specified,
+ num_shards,
+ yes_i_really_mean_it,
+ &bucket);
+ if (ret < 0) {
+ return ret;
+ }
+
+ int num_source_shards = rgw::current_num_shards(bucket->get_info().layout);
+
+ RGWReshard reshard(static_cast<rgw::sal::RadosStore*>(driver), dpp());
+ cls_rgw_reshard_entry entry;
+ entry.time = real_clock::now();
+ entry.tenant = tenant;
+ entry.bucket_name = bucket_name;
+ entry.bucket_id = bucket->get_info().bucket.bucket_id;
+ entry.old_num_shards = num_source_shards;
+ entry.new_num_shards = num_shards;
+
+ return reshard.add(dpp(), entry);
+ }
+
+ if (opt_cmd == OPT::RESHARD_LIST) {
+ int ret;
+ int count = 0;
+ if (max_entries < 0) {
+ max_entries = 1000;
+ }
+
+ int num_logshards =
+ driver->ctx()->_conf.get_val<uint64_t>("rgw_reshard_num_logs");
+
+ RGWReshard reshard(static_cast<rgw::sal::RadosStore*>(driver), dpp());
+
+ formatter->open_array_section("reshard");
+ for (int i = 0; i < num_logshards; i++) {
+ bool is_truncated = true;
+ std::string marker;
+ do {
+ std::list<cls_rgw_reshard_entry> entries;
+ ret = reshard.list(dpp(), i, marker, max_entries - count, entries, &is_truncated);
+ if (ret < 0) {
+ cerr << "Error listing resharding buckets: " << cpp_strerror(-ret) << std::endl;
+ return ret;
+ }
+ for (const auto& entry : entries) {
+ encode_json("entry", entry, formatter.get());
+ }
+ if (is_truncated) {
+ entries.crbegin()->get_key(&marker); // last entry's key becomes marker
+ }
+ count += entries.size();
+ formatter->flush(cout);
+ } while (is_truncated && count < max_entries);
+
+ if (count >= max_entries) {
+ break;
+ }
+ }
+
+ formatter->close_section();
+ formatter->flush(cout);
+
+ return 0;
+ }
+
+ if (opt_cmd == OPT::RESHARD_STATUS) {
+ if (bucket_name.empty()) {
+ cerr << "ERROR: bucket not specified" << std::endl;
+ return EINVAL;
+ }
+
+ ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket);
+ if (ret < 0) {
+ cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ RGWBucketReshard br(static_cast<rgw::sal::RadosStore*>(driver),
+ bucket->get_info(), bucket->get_attrs(),
+ nullptr /* no callback */);
+ list<cls_rgw_bucket_instance_entry> status;
+ int r = br.get_status(dpp(), &status);
+ if (r < 0) {
+ cerr << "ERROR: could not get resharding status for bucket " <<
+ bucket_name << std::endl;
+ return -r;
+ }
+
+ show_reshard_status(status, formatter.get());
+ }
+
+ if (opt_cmd == OPT::RESHARD_PROCESS) {
+ RGWReshard reshard(static_cast<rgw::sal::RadosStore*>(driver), true, &cout);
+
+ int ret = reshard.process_all_logshards(dpp());
+ if (ret < 0) {
+ cerr << "ERROR: failed to process reshard logs, error=" << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ }
+
+ if (opt_cmd == OPT::RESHARD_CANCEL) {
+ if (bucket_name.empty()) {
+ cerr << "ERROR: bucket not specified" << std::endl;
+ return EINVAL;
+ }
+
+ bool bucket_initable = true;
+ ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket);
+ if (ret < 0) {
+ if (yes_i_really_mean_it) {
+ bucket_initable = false;
+ } else {
+ cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) <<
+ "; if you want to cancel the reshard request nonetheless, please "
+ "use the --yes-i-really-mean-it option" << std::endl;
+ return -ret;
+ }
+ }
+
+ bool resharding_underway = true;
+
+ if (bucket_initable) {
+ // we did not encounter an error, so let's work with the bucket
+ RGWBucketReshard br(static_cast<rgw::sal::RadosStore*>(driver),
+ bucket->get_info(), bucket->get_attrs(),
+ nullptr /* no callback */);
+ int ret = br.cancel(dpp());
+ if (ret < 0) {
+ if (ret == -EBUSY) {
+ cerr << "There is ongoing resharding, please retry after " <<
+ driver->ctx()->_conf.get_val<uint64_t>("rgw_reshard_bucket_lock_duration") <<
+ " seconds." << std::endl;
+ return -ret;
+ } else if (ret == -EINVAL) {
+ resharding_underway = false;
+ // we can continue and try to unschedule
+ } else {
+ cerr << "Error cancelling bucket \"" << bucket_name <<
+ "\" resharding: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ }
+ }
+
+ RGWReshard reshard(static_cast<rgw::sal::RadosStore*>(driver), dpp());
+
+ cls_rgw_reshard_entry entry;
+ entry.tenant = tenant;
+ entry.bucket_name = bucket_name;
+
+ ret = reshard.remove(dpp(), entry);
+ if (ret == -ENOENT) {
+ if (!resharding_underway) {
+ cerr << "Error, bucket \"" << bucket_name <<
+ "\" is neither undergoing resharding nor scheduled to undergo "
+ "resharding." << std::endl;
+ return EINVAL;
+ } else {
+ // we cancelled underway resharding above, so we're good
+ return 0;
+ }
+ } else if (ret < 0) {
+ cerr << "Error in updating reshard log with bucket \"" <<
+ bucket_name << "\": " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ } // OPT_RESHARD_CANCEL
+
+ if (opt_cmd == OPT::OBJECT_UNLINK) {
+ int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket);
+ if (ret < 0) {
+ cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ list<rgw_obj_index_key> oid_list;
+ rgw_obj_key key(object, object_version);
+ rgw_obj_index_key index_key;
+ key.get_index_key(&index_key);
+ oid_list.push_back(index_key);
+
+ // note: under rados this removes directly from rados index objects
+ ret = bucket->remove_objs_from_index(dpp(), oid_list);
+ if (ret < 0) {
+ cerr << "ERROR: remove_obj_from_index() returned error: " << cpp_strerror(-ret) << std::endl;
+ return 1;
+ }
+ }
+
+ if (opt_cmd == OPT::OBJECT_STAT) {
+ int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket);
+ if (ret < 0) {
+ cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(object);
+ obj->set_instance(object_version);
+
+ ret = obj->get_obj_attrs(null_yield, dpp());
+ if (ret < 0) {
+ cerr << "ERROR: failed to stat object, returned error: " << cpp_strerror(-ret) << std::endl;
+ return 1;
+ }
+ formatter->open_object_section("object_metadata");
+ formatter->dump_string("name", object);
+ formatter->dump_unsigned("size", obj->get_obj_size());
+
+ map<string, bufferlist>::iterator iter;
+ map<string, bufferlist> other_attrs;
+ for (iter = obj->get_attrs().begin(); iter != obj->get_attrs().end(); ++iter) {
+ bufferlist& bl = iter->second;
+ bool handled = false;
+ if (iter->first == RGW_ATTR_MANIFEST) {
+ handled = decode_dump<RGWObjManifest>("manifest", bl, formatter.get());
+ } else if (iter->first == RGW_ATTR_ACL) {
+ handled = decode_dump<RGWAccessControlPolicy>("policy", bl, formatter.get());
+ } else if (iter->first == RGW_ATTR_ID_TAG) {
+ handled = dump_string("tag", bl, formatter.get());
+ } else if (iter->first == RGW_ATTR_ETAG) {
+ handled = dump_string("etag", bl, formatter.get());
+ } else if (iter->first == RGW_ATTR_COMPRESSION) {
+ handled = decode_dump<RGWCompressionInfo>("compression", bl, formatter.get());
+ } else if (iter->first == RGW_ATTR_DELETE_AT) {
+ handled = decode_dump<utime_t>("delete_at", bl, formatter.get());
+ }
+
+ if (!handled)
+ other_attrs[iter->first] = bl;
+ }
+
+ formatter->open_object_section("attrs");
+ for (iter = other_attrs.begin(); iter != other_attrs.end(); ++iter) {
+ dump_string(iter->first.c_str(), iter->second, formatter.get());
+ }
+ formatter->close_section();
+ formatter->close_section();
+ formatter->flush(cout);
+ }
+
+ if (opt_cmd == OPT::BUCKET_CHECK) {
+ if (check_head_obj_locator) {
+ if (bucket_name.empty()) {
+ cerr << "ERROR: need to specify bucket name" << std::endl;
+ return EINVAL;
+ }
+ do_check_object_locator(tenant, bucket_name, fix, remove_bad, formatter.get());
+ } else {
+ RGWBucketAdminOp::check_index(driver, bucket_op, stream_flusher, null_yield, dpp());
+ }
+ }
+
+ if (opt_cmd == OPT::BUCKET_CHECK_OLH) {
+ rgw::sal::RadosStore* store = dynamic_cast<rgw::sal::RadosStore*>(driver);
+ if (!store) {
+ cerr <<
+ "WARNING: this command is only relevant when the cluster has a RADOS backing store." <<
+ std::endl;
+ return 0;
+ }
+ RGWBucketAdminOp::check_index_olh(store, bucket_op, stream_flusher, dpp());
+ }
+
+ if (opt_cmd == OPT::BUCKET_CHECK_UNLINKED) {
+ rgw::sal::RadosStore* store = dynamic_cast<rgw::sal::RadosStore*>(driver);
+ if (!store) {
+ cerr <<
+ "WARNING: this command is only relevant when the cluster has a RADOS backing store." <<
+ std::endl;
+ return 0;
+ }
+ RGWBucketAdminOp::check_index_unlinked(store, bucket_op, stream_flusher, dpp());
+ }
+
+ if (opt_cmd == OPT::BUCKET_RM) {
+ if (!inconsistent_index) {
+ RGWBucketAdminOp::remove_bucket(driver, bucket_op, null_yield, dpp(), bypass_gc, true);
+ } else {
+ if (!yes_i_really_mean_it) {
+ cerr << "using --inconsistent_index can corrupt the bucket index " << std::endl
+ << "do you really mean it? (requires --yes-i-really-mean-it)" << std::endl;
+ return 1;
+ }
+ RGWBucketAdminOp::remove_bucket(driver, bucket_op, null_yield, dpp(), bypass_gc, false);
+ }
+ }
+
+ if (opt_cmd == OPT::GC_LIST) {
+ int index = 0;
+ bool truncated;
+ bool processing_queue = false;
+ formatter->open_array_section("entries");
+
+ do {
+ list<cls_rgw_gc_obj_info> result;
+ int ret = static_cast<rgw::sal::RadosStore*>(driver)->getRados()->list_gc_objs(&index, marker, 1000, !include_all, result, &truncated, processing_queue);
+ if (ret < 0) {
+ cerr << "ERROR: failed to list objs: " << cpp_strerror(-ret) << std::endl;
+ return 1;
+ }
+
+
+ list<cls_rgw_gc_obj_info>::iterator iter;
+ for (iter = result.begin(); iter != result.end(); ++iter) {
+ cls_rgw_gc_obj_info& info = *iter;
+ formatter->open_object_section("chain_info");
+ formatter->dump_string("tag", info.tag);
+ formatter->dump_stream("time") << info.time;
+ formatter->open_array_section("objs");
+ list<cls_rgw_obj>::iterator liter;
+ cls_rgw_obj_chain& chain = info.chain;
+ for (liter = chain.objs.begin(); liter != chain.objs.end(); ++liter) {
+ cls_rgw_obj& obj = *liter;
+ encode_json("obj", obj, formatter.get());
+ }
+ formatter->close_section(); // objs
+ formatter->close_section(); // obj_chain
+ formatter->flush(cout);
+ }
+ } while (truncated);
+ formatter->close_section();
+ formatter->flush(cout);
+ }
+
+ if (opt_cmd == OPT::GC_PROCESS) {
+ int ret = static_cast<rgw::sal::RadosStore*>(driver)->getRados()->process_gc(!include_all);
+ if (ret < 0) {
+ cerr << "ERROR: gc processing returned error: " << cpp_strerror(-ret) << std::endl;
+ return 1;
+ }
+ }
+
+ if (opt_cmd == OPT::LC_LIST) {
+ formatter->open_array_section("lifecycle_list");
+ vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>> bucket_lc_map;
+ string marker;
+ int index{0};
+#define MAX_LC_LIST_ENTRIES 100
+ if (max_entries < 0) {
+ max_entries = MAX_LC_LIST_ENTRIES;
+ }
+ do {
+ int ret = static_cast<rgw::sal::RadosStore*>(driver)->getRados()->list_lc_progress(marker, max_entries,
+ bucket_lc_map, index);
+ if (ret < 0) {
+ cerr << "ERROR: failed to list objs: " << cpp_strerror(-ret)
+ << std::endl;
+ return 1;
+ }
+ for (const auto& entry : bucket_lc_map) {
+ formatter->open_object_section("bucket_lc_info");
+ formatter->dump_string("bucket", entry->get_bucket());
+ formatter->dump_string("shard", entry->get_oid());
+ char exp_buf[100];
+ time_t t{time_t(entry->get_start_time())};
+ if (std::strftime(
+ exp_buf, sizeof(exp_buf),
+ "%a, %d %b %Y %T %Z", std::gmtime(&t))) {
+ formatter->dump_string("started", exp_buf);
+ }
+ string lc_status = LC_STATUS[entry->get_status()];
+ formatter->dump_string("status", lc_status);
+ formatter->close_section(); // objs
+ formatter->flush(cout);
+ }
+ } while (!bucket_lc_map.empty());
+
+ formatter->close_section(); //lifecycle list
+ formatter->flush(cout);
+ }
+
+
+ if (opt_cmd == OPT::LC_GET) {
+ if (bucket_name.empty()) {
+ cerr << "ERROR: bucket not specified" << std::endl;
+ return EINVAL;
+ }
+
+ RGWLifecycleConfiguration config;
+ ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket);
+ if (ret < 0) {
+ cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ auto aiter = bucket->get_attrs().find(RGW_ATTR_LC);
+ if (aiter == bucket->get_attrs().end()) {
+ return -ENOENT;
+ }
+
+ bufferlist::const_iterator iter{&aiter->second};
+ try {
+ config.decode(iter);
+ } catch (const buffer::error& e) {
+ cerr << "ERROR: decode life cycle config failed" << std::endl;
+ return -EIO;
+ }
+
+ encode_json("result", config, formatter.get());
+ formatter->flush(cout);
+ }
+
+ if (opt_cmd == OPT::LC_PROCESS) {
+ if ((! bucket_name.empty()) ||
+ (! bucket_id.empty())) {
+ int ret = init_bucket(nullptr, tenant, bucket_name, bucket_id, &bucket);
+ if (ret < 0) {
+ cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret)
+ << std::endl;
+ return ret;
+ }
+ }
+
+ int ret =
+ static_cast<rgw::sal::RadosStore*>(driver)->getRados()->process_lc(bucket);
+ if (ret < 0) {
+ cerr << "ERROR: lc processing returned error: " << cpp_strerror(-ret) << std::endl;
+ return 1;
+ }
+ }
+
+ if (opt_cmd == OPT::LC_RESHARD_FIX) {
+ ret = RGWBucketAdminOp::fix_lc_shards(driver, bucket_op, stream_flusher, dpp());
+ if (ret < 0) {
+ cerr << "ERROR: fixing lc shards: " << cpp_strerror(-ret) << std::endl;
+ }
+
+ }
+
+ if (opt_cmd == OPT::ORPHANS_FIND) {
+ if (!yes_i_really_mean_it) {
+ cerr << "this command is now deprecated; please consider using the rgw-orphan-list tool; "
+ << "accidental removal of active objects cannot be reversed; "
+ << "do you really mean it? (requires --yes-i-really-mean-it)"
+ << std::endl;
+ return EINVAL;
+ } else {
+ cerr << "IMPORTANT: this command is now deprecated; please consider using the rgw-orphan-list tool"
+ << std::endl;
+ }
+
+ RGWOrphanSearch search(static_cast<rgw::sal::RadosStore*>(driver), max_concurrent_ios, orphan_stale_secs);
+
+ if (job_id.empty()) {
+ cerr << "ERROR: --job-id not specified" << std::endl;
+ return EINVAL;
+ }
+ if (pool_name.empty()) {
+ cerr << "ERROR: --pool not specified" << std::endl;
+ return EINVAL;
+ }
+
+ RGWOrphanSearchInfo info;
+
+ info.pool = pool;
+ info.job_name = job_id;
+ info.num_shards = num_shards;
+
+ int ret = search.init(dpp(), job_id, &info, detail);
+ if (ret < 0) {
+ cerr << "could not init search, ret=" << ret << std::endl;
+ return -ret;
+ }
+ ret = search.run(dpp());
+ if (ret < 0) {
+ return -ret;
+ }
+ }
+
+ if (opt_cmd == OPT::ORPHANS_FINISH) {
+ if (!yes_i_really_mean_it) {
+ cerr << "this command is now deprecated; please consider using the rgw-orphan-list tool; "
+ << "accidental removal of active objects cannot be reversed; "
+ << "do you really mean it? (requires --yes-i-really-mean-it)"
+ << std::endl;
+ return EINVAL;
+ } else {
+ cerr << "IMPORTANT: this command is now deprecated; please consider using the rgw-orphan-list tool"
+ << std::endl;
+ }
+
+ RGWOrphanSearch search(static_cast<rgw::sal::RadosStore*>(driver), max_concurrent_ios, orphan_stale_secs);
+
+ if (job_id.empty()) {
+ cerr << "ERROR: --job-id not specified" << std::endl;
+ return EINVAL;
+ }
+ int ret = search.init(dpp(), job_id, NULL);
+ if (ret < 0) {
+ if (ret == -ENOENT) {
+ cerr << "job not found" << std::endl;
+ }
+ return -ret;
+ }
+ ret = search.finish();
+ if (ret < 0) {
+ return -ret;
+ }
+ }
+
+ if (opt_cmd == OPT::ORPHANS_LIST_JOBS){
+ if (!yes_i_really_mean_it) {
+ cerr << "this command is now deprecated; please consider using the rgw-orphan-list tool; "
+ << "do you really mean it? (requires --yes-i-really-mean-it)"
+ << std::endl;
+ return EINVAL;
+ } else {
+ cerr << "IMPORTANT: this command is now deprecated; please consider using the rgw-orphan-list tool"
+ << std::endl;
+ }
+
+ RGWOrphanStore orphan_store(static_cast<rgw::sal::RadosStore*>(driver));
+ int ret = orphan_store.init(dpp());
+ if (ret < 0){
+ cerr << "connection to cluster failed!" << std::endl;
+ return -ret;
+ }
+
+ map <string,RGWOrphanSearchState> m;
+ ret = orphan_store.list_jobs(m);
+ if (ret < 0) {
+ cerr << "job list failed" << std::endl;
+ return -ret;
+ }
+ formatter->open_array_section("entries");
+ for (const auto &it: m){
+ if (!extra_info){
+ formatter->dump_string("job-id",it.first);
+ } else {
+ encode_json("orphan_search_state", it.second, formatter.get());
+ }
+ }
+ formatter->close_section();
+ formatter->flush(cout);
+ }
+
+ if (opt_cmd == OPT::USER_CHECK) {
+ check_bad_user_bucket_mapping(driver, *user.get(), fix, null_yield, dpp());
+ }
+
+ if (opt_cmd == OPT::USER_STATS) {
+ if (rgw::sal::User::empty(user)) {
+ cerr << "ERROR: uid not specified" << std::endl;
+ return EINVAL;
+ }
+ if (reset_stats) {
+ if (!bucket_name.empty()) {
+ cerr << "ERROR: --reset-stats does not work on buckets and "
+ "bucket specified" << std::endl;
+ return EINVAL;
+ }
+ if (sync_stats) {
+ cerr << "ERROR: sync-stats includes the reset-stats functionality, "
+ "so at most one of the two should be specified" << std::endl;
+ return EINVAL;
+ }
+ ret = static_cast<rgw::sal::RadosStore*>(driver)->svc()->user->reset_bucket_stats(dpp(), user->get_id(), null_yield);
+ if (ret < 0) {
+ cerr << "ERROR: could not reset user stats: " << cpp_strerror(-ret) <<
+ std::endl;
+ return -ret;
+ }
+ }
+
+ if (sync_stats) {
+ if (!bucket_name.empty()) {
+ int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket);
+ if (ret < 0) {
+ cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ ret = bucket->sync_user_stats(dpp(), null_yield);
+ if (ret < 0) {
+ cerr << "ERROR: could not sync bucket stats: " <<
+ cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ } else {
+ int ret = rgw_user_sync_all_stats(dpp(), driver, user.get(), null_yield);
+ if (ret < 0) {
+ cerr << "ERROR: could not sync user stats: " <<
+ cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ }
+ }
+
+ constexpr bool omit_utilized_stats = false;
+ RGWStorageStats stats(omit_utilized_stats);
+ ceph::real_time last_stats_sync;
+ ceph::real_time last_stats_update;
+ int ret = user->read_stats(dpp(), null_yield, &stats, &last_stats_sync, &last_stats_update);
+ if (ret < 0) {
+ if (ret == -ENOENT) { /* in case of ENOENT */
+ cerr << "User has not been initialized or user does not exist" << std::endl;
+ } else {
+ cerr << "ERROR: can't read user: " << cpp_strerror(ret) << std::endl;
+ }
+ return -ret;
+ }
+
+
+ {
+ Formatter::ObjectSection os(*formatter, "result");
+ encode_json("stats", stats, formatter.get());
+ utime_t last_sync_ut(last_stats_sync);
+ encode_json("last_stats_sync", last_sync_ut, formatter.get());
+ utime_t last_update_ut(last_stats_update);
+ encode_json("last_stats_update", last_update_ut, formatter.get());
+ }
+ formatter->flush(cout);
+ }
+
+ if (opt_cmd == OPT::METADATA_GET) {
+ int ret = static_cast<rgw::sal::RadosStore*>(driver)->ctl()->meta.mgr->get(metadata_key, formatter.get(), null_yield, dpp());
+ if (ret < 0) {
+ cerr << "ERROR: can't get key: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ formatter->flush(cout);
+ }
+
+ if (opt_cmd == OPT::METADATA_PUT) {
+ bufferlist bl;
+ int ret = read_input(infile, bl);
+ if (ret < 0) {
+ cerr << "ERROR: failed to read input: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ ret = static_cast<rgw::sal::RadosStore*>(driver)->ctl()->meta.mgr->put(metadata_key, bl, null_yield, dpp(), RGWMDLogSyncType::APPLY_ALWAYS, false);
+ if (ret < 0) {
+ cerr << "ERROR: can't put key: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ }
+
+ if (opt_cmd == OPT::METADATA_RM) {
+ int ret = static_cast<rgw::sal::RadosStore*>(driver)->ctl()->meta.mgr->remove(metadata_key, null_yield, dpp());
+ if (ret < 0) {
+ cerr << "ERROR: can't remove key: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ }
+
+ if (opt_cmd == OPT::METADATA_LIST || opt_cmd == OPT::USER_LIST) {
+ if (opt_cmd == OPT::USER_LIST) {
+ metadata_key = "user";
+ }
+ void *handle;
+ int max = 1000;
+ int ret = driver->meta_list_keys_init(dpp(), metadata_key, marker, &handle);
+ if (ret < 0) {
+ cerr << "ERROR: can't get key: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ bool truncated;
+ uint64_t count = 0;
+
+ if (max_entries_specified) {
+ formatter->open_object_section("result");
+ }
+ formatter->open_array_section("keys");
+
+ uint64_t left;
+ do {
+ list<string> keys;
+ left = (max_entries_specified ? max_entries - count : max);
+ ret = driver->meta_list_keys_next(dpp(), handle, left, keys, &truncated);
+ if (ret < 0 && ret != -ENOENT) {
+ cerr << "ERROR: lists_keys_next(): " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ } if (ret != -ENOENT) {
+ for (list<string>::iterator iter = keys.begin(); iter != keys.end(); ++iter) {
+ formatter->dump_string("key", *iter);
+ ++count;
+ }
+ formatter->flush(cout);
+ }
+ } while (truncated && left > 0);
+
+ formatter->close_section();
+
+ if (max_entries_specified) {
+ encode_json("truncated", truncated, formatter.get());
+ encode_json("count", count, formatter.get());
+ if (truncated) {
+ encode_json("marker", driver->meta_get_marker(handle), formatter.get());
+ }
+ formatter->close_section();
+ }
+ formatter->flush(cout);
+
+ driver->meta_list_keys_complete(handle);
+ }
+
+ if (opt_cmd == OPT::MDLOG_LIST) {
+ if (!start_date.empty()) {
+ std::cerr << "start-date not allowed." << std::endl;
+ return -EINVAL;
+ }
+ if (!end_date.empty()) {
+ std::cerr << "end-date not allowed." << std::endl;
+ return -EINVAL;
+ }
+ if (!end_marker.empty()) {
+ std::cerr << "end-marker not allowed." << std::endl;
+ return -EINVAL;
+ }
+ if (!start_marker.empty()) {
+ if (marker.empty()) {
+ marker = start_marker;
+ } else {
+ std::cerr << "start-marker and marker not both allowed." << std::endl;
+ return -EINVAL;
+ }
+ }
+
+ int i = (specified_shard_id ? shard_id : 0);
+
+ if (period_id.empty()) {
+ // use realm's current period
+ RGWRealm realm;
+ int ret = rgw::read_realm(dpp(), null_yield, cfgstore.get(),
+ realm_id, realm_name, realm);
+ if (ret < 0 ) {
+ cerr << "failed to load realm: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ period_id = realm.current_period;
+ std::cerr << "No --period given, using current period="
+ << period_id << std::endl;
+ }
+ RGWMetadataLog *meta_log = static_cast<rgw::sal::RadosStore*>(driver)->svc()->mdlog->get_log(period_id);
+
+ formatter->open_array_section("entries");
+ for (; i < g_ceph_context->_conf->rgw_md_log_max_shards; i++) {
+ void *handle;
+ list<cls_log_entry> entries;
+
+ meta_log->init_list_entries(i, {}, {}, marker, &handle);
+ bool truncated;
+ do {
+ int ret = meta_log->list_entries(dpp(), handle, 1000, entries, NULL, &truncated);
+ if (ret < 0) {
+ cerr << "ERROR: meta_log->list_entries(): " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ for (list<cls_log_entry>::iterator iter = entries.begin(); iter != entries.end(); ++iter) {
+ cls_log_entry& entry = *iter;
+ static_cast<rgw::sal::RadosStore*>(driver)->ctl()->meta.mgr->dump_log_entry(entry, formatter.get());
+ }
+ formatter->flush(cout);
+ } while (truncated);
+
+ meta_log->complete_list_entries(handle);
+
+ if (specified_shard_id)
+ break;
+ }
+
+
+ formatter->close_section();
+ formatter->flush(cout);
+ }
+
+ if (opt_cmd == OPT::MDLOG_STATUS) {
+ int i = (specified_shard_id ? shard_id : 0);
+
+ if (period_id.empty()) {
+ // use realm's current period
+ RGWRealm realm;
+ int ret = rgw::read_realm(dpp(), null_yield, cfgstore.get(),
+ realm_id, realm_name, realm);
+ if (ret < 0 ) {
+ cerr << "failed to load realm: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ period_id = realm.current_period;
+ std::cerr << "No --period given, using current period="
+ << period_id << std::endl;
+ }
+ RGWMetadataLog *meta_log = static_cast<rgw::sal::RadosStore*>(driver)->svc()->mdlog->get_log(period_id);
+
+ formatter->open_array_section("entries");
+
+ for (; i < g_ceph_context->_conf->rgw_md_log_max_shards; i++) {
+ RGWMetadataLogInfo info;
+ meta_log->get_info(dpp(), i, &info);
+
+ ::encode_json("info", info, formatter.get());
+
+ if (specified_shard_id)
+ break;
+ }
+
+
+ formatter->close_section();
+ formatter->flush(cout);
+ }
+
+  // mdlog autotrim: run the automated metadata-log trim coroutine across all shards.
+  if (opt_cmd == OPT::MDLOG_AUTOTRIM) {
+    // need a full history for purging old mdlog periods
+    static_cast<rgw::sal::RadosStore*>(driver)->svc()->mdlog->init_oldest_log_period(null_yield, dpp());
+
+    RGWCoroutinesManager crs(driver->ctx(), driver->get_cr_registry());
+    RGWHTTPManager http(driver->ctx(), crs.get_completion_mgr());
+    int ret = http.start();
+    if (ret < 0) {
+      cerr << "failed to initialize http client with " << cpp_strerror(ret) << std::endl;
+      return -ret;
+    }
+
+    auto num_shards = g_conf()->rgw_md_log_max_shards;
+    auto mltcr = create_admin_meta_log_trim_cr(
+      dpp(), static_cast<rgw::sal::RadosStore*>(driver), &http, num_shards);
+    if (!mltcr) {
+      cerr << "Cluster misconfigured! Unable to trim." << std::endl;
+      return -EIO;
+    }
+    ret = crs.run(dpp(), mltcr);
+    if (ret < 0) {
+      cerr << "automated mdlog trim failed with " << cpp_strerror(ret) << std::endl;
+      return -ret;
+    }
+  }
+
+  // mdlog trim: remove entries up to --marker on a single shard of one period's log.
+  if (opt_cmd == OPT::MDLOG_TRIM) {
+    // date/marker-range options are rejected; only trim-to-marker is supported
+    if (!start_date.empty()) {
+      std::cerr << "start-date not allowed." << std::endl;
+      return -EINVAL;
+    }
+    if (!end_date.empty()) {
+      std::cerr << "end-date not allowed." << std::endl;
+      return -EINVAL;
+    }
+    if (!start_marker.empty()) {
+      std::cerr << "start-marker not allowed." << std::endl;
+      return -EINVAL;
+    }
+    if (!end_marker.empty()) {
+      // --end-marker is accepted as a legacy alias for --marker
+      if (marker.empty()) {
+        marker = end_marker;
+      } else {
+        std::cerr << "end-marker and marker not both allowed." << std::endl;
+        return -EINVAL;
+      }
+    }
+
+    if (!specified_shard_id) {
+      cerr << "ERROR: shard-id must be specified for trim operation" << std::endl;
+      return EINVAL;
+    }
+
+    if (marker.empty()) {
+      cerr << "ERROR: marker must be specified for trim operation" << std::endl;
+      return EINVAL;
+    }
+
+    if (period_id.empty()) {
+      std::cerr << "missing --period argument" << std::endl;
+      return EINVAL;
+    }
+    RGWMetadataLog *meta_log = static_cast<rgw::sal::RadosStore*>(driver)->svc()->mdlog->get_log(period_id);
+
+    // trim until -ENODATA
+    do {
+      ret = meta_log->trim(dpp(), shard_id, {}, {}, {}, marker);
+    } while (ret == 0);
+    if (ret < 0 && ret != -ENODATA) {
+      cerr << "ERROR: meta_log->trim(): " << cpp_strerror(-ret) << std::endl;
+      return -ret;
+    }
+  }
+
+  // sync info: print sync policy info for the effective zone / optional bucket.
+  if (opt_cmd == OPT::SYNC_INFO) {
+    sync_info(opt_effective_zone_id, opt_bucket, zone_formatter.get());
+  }
+
+  // sync status: print the overall multisite sync status summary.
+  if (opt_cmd == OPT::SYNC_STATUS) {
+    sync_status(formatter.get());
+  }
+
+  if (opt_cmd == OPT::METADATA_SYNC_STATUS) {
+    // Report metadata sync status: dump the persisted sync state plus an
+    // aggregate full-sync progress summary computed across all shards.
+    RGWMetaSyncStatusManager sync(static_cast<rgw::sal::RadosStore*>(driver), static_cast<rgw::sal::RadosStore*>(driver)->svc()->rados->get_async_processor());
+
+    int ret = sync.init(dpp());
+    if (ret < 0) {
+      cerr << "ERROR: sync.init() returned ret=" << ret << std::endl;
+      return -ret;
+    }
+
+    rgw_meta_sync_status sync_status;
+    ret = sync.read_sync_status(dpp(), &sync_status);
+    if (ret < 0) {
+      cerr << "ERROR: sync.read_sync_status() returned ret=" << ret << std::endl;
+      return -ret;
+    }
+
+    formatter->open_object_section("summary");
+    encode_json("sync_status", sync_status, formatter.get());
+
+    uint64_t full_total = 0;
+    uint64_t full_complete = 0;
+
+    // iterate by const reference: the markers are non-trivial map values and
+    // the old by-value loop copied every pair on each iteration
+    for (const auto& marker_iter : sync_status.sync_markers) {
+      full_total += marker_iter.second.total_entries;
+      if (marker_iter.second.state == rgw_meta_sync_marker::SyncState::FullSync) {
+        // shard still in full sync: progress is the position reached so far
+        full_complete += marker_iter.second.pos;
+      } else {
+        // shard finished full sync: all of its entries count as complete
+        full_complete += marker_iter.second.total_entries;
+      }
+    }
+
+    formatter->open_object_section("full_sync");
+    encode_json("total", full_total, formatter.get());
+    encode_json("complete", full_complete, formatter.get());
+    formatter->close_section();
+    formatter->dump_string("current_time",
+                           to_iso_8601(ceph::real_clock::now(),
+                                       iso_8601_format::YMDhms));
+    formatter->close_section();
+
+    formatter->flush(cout);
+
+  }
+
+  // metadata sync init: reset the metadata sync status back to its initial state.
+  if (opt_cmd == OPT::METADATA_SYNC_INIT) {
+    RGWMetaSyncStatusManager sync(static_cast<rgw::sal::RadosStore*>(driver), static_cast<rgw::sal::RadosStore*>(driver)->svc()->rados->get_async_processor());
+
+    int ret = sync.init(dpp());
+    if (ret < 0) {
+      cerr << "ERROR: sync.init() returned ret=" << ret << std::endl;
+      return -ret;
+    }
+    ret = sync.init_sync_status(dpp());
+    if (ret < 0) {
+      cerr << "ERROR: sync.init_sync_status() returned ret=" << ret << std::endl;
+      return -ret;
+    }
+  }
+
+
+  // metadata sync run: execute a metadata sync pass in the foreground.
+  if (opt_cmd == OPT::METADATA_SYNC_RUN) {
+    RGWMetaSyncStatusManager sync(static_cast<rgw::sal::RadosStore*>(driver), static_cast<rgw::sal::RadosStore*>(driver)->svc()->rados->get_async_processor());
+
+    int ret = sync.init(dpp());
+    if (ret < 0) {
+      cerr << "ERROR: sync.init() returned ret=" << ret << std::endl;
+      return -ret;
+    }
+
+    ret = sync.run(dpp(), null_yield);
+    if (ret < 0) {
+      cerr << "ERROR: sync.run() returned ret=" << ret << std::endl;
+      return -ret;
+    }
+  }
+
+  if (opt_cmd == OPT::DATA_SYNC_STATUS) {
+    // Report data sync status from the given source zone: either one shard's
+    // detailed status (--shard-id) or the aggregate summary of all shards.
+    if (source_zone.empty()) {
+      cerr << "ERROR: source zone not specified" << std::endl;
+      return EINVAL;
+    }
+    RGWDataSyncStatusManager sync(static_cast<rgw::sal::RadosStore*>(driver), static_cast<rgw::sal::RadosStore*>(driver)->svc()->rados->get_async_processor(), source_zone, nullptr);
+
+    int ret = sync.init(dpp());
+    if (ret < 0) {
+      cerr << "ERROR: sync.init() returned ret=" << ret << std::endl;
+      return -ret;
+    }
+
+    rgw_data_sync_status sync_status;
+    if (specified_shard_id) {
+      set<string> pending_buckets;
+      set<string> recovering_buckets;
+      rgw_data_sync_marker sync_marker;
+      // report at most 20 pending/recovering buckets unless --max-entries given
+      ret = sync.read_shard_status(dpp(), shard_id, pending_buckets, recovering_buckets, &sync_marker,
+                                   max_entries_specified ? max_entries : 20);
+      // -ENOENT just means the shard has no status object yet; report it empty
+      if (ret < 0 && ret != -ENOENT) {
+        cerr << "ERROR: sync.read_shard_status() returned ret=" << ret << std::endl;
+        return -ret;
+      }
+      formatter->open_object_section("summary");
+      encode_json("shard_id", shard_id, formatter.get());
+      encode_json("marker", sync_marker, formatter.get());
+      encode_json("pending_buckets", pending_buckets, formatter.get());
+      encode_json("recovering_buckets", recovering_buckets, formatter.get());
+      formatter->dump_string("current_time",
+                             to_iso_8601(ceph::real_clock::now(),
+                                         iso_8601_format::YMDhms));
+      formatter->close_section();
+      formatter->flush(cout);
+    } else {
+      ret = sync.read_sync_status(dpp(), &sync_status);
+      if (ret < 0 && ret != -ENOENT) {
+        cerr << "ERROR: sync.read_sync_status() returned ret=" << ret << std::endl;
+        return -ret;
+      }
+
+      formatter->open_object_section("summary");
+      encode_json("sync_status", sync_status, formatter.get());
+
+      uint64_t full_total = 0;
+      uint64_t full_complete = 0;
+
+      // iterate by const reference: the old by-value loop copied each map pair
+      for (const auto& marker_iter : sync_status.sync_markers) {
+        full_total += marker_iter.second.total_entries;
+        // NOTE(review): this compares a data-sync marker state against the
+        // rgw_meta_sync_marker enum; the enumerator values appear to match,
+        // but confirm against rgw_data_sync_marker::SyncState
+        if (marker_iter.second.state == rgw_meta_sync_marker::SyncState::FullSync) {
+          full_complete += marker_iter.second.pos;
+        } else {
+          full_complete += marker_iter.second.total_entries;
+        }
+      }
+
+      formatter->open_object_section("full_sync");
+      encode_json("total", full_total, formatter.get());
+      encode_json("complete", full_complete, formatter.get());
+      formatter->close_section();
+      formatter->dump_string("current_time",
+                             to_iso_8601(ceph::real_clock::now(),
+                                         iso_8601_format::YMDhms));
+      formatter->close_section();
+
+      formatter->flush(cout);
+    }
+  }
+
+  // data sync init: reset data sync status for the given source zone.
+  if (opt_cmd == OPT::DATA_SYNC_INIT) {
+    if (source_zone.empty()) {
+      cerr << "ERROR: source zone not specified" << std::endl;
+      return EINVAL;
+    }
+
+    RGWDataSyncStatusManager sync(static_cast<rgw::sal::RadosStore*>(driver), static_cast<rgw::sal::RadosStore*>(driver)->svc()->rados->get_async_processor(), source_zone, nullptr);
+
+    int ret = sync.init(dpp());
+    if (ret < 0) {
+      cerr << "ERROR: sync.init() returned ret=" << ret << std::endl;
+      return -ret;
+    }
+
+    ret = sync.init_sync_status(dpp());
+    if (ret < 0) {
+      cerr << "ERROR: sync.init_sync_status() returned ret=" << ret << std::endl;
+      return -ret;
+    }
+  }
+
+  // data sync run: execute a data sync pass from the source zone in the foreground.
+  if (opt_cmd == OPT::DATA_SYNC_RUN) {
+    if (source_zone.empty()) {
+      cerr << "ERROR: source zone not specified" << std::endl;
+      return EINVAL;
+    }
+
+    RGWSyncModuleInstanceRef sync_module;
+    // instantiate the zone's configured sync module from its tier type/config
+    int ret = static_cast<rgw::sal::RadosStore*>(driver)->svc()->sync_modules->get_manager()->create_instance(dpp(), g_ceph_context, static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone->get_zone().tier_type,
+                                                                                                             static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone->get_zone_params().tier_config, &sync_module);
+    if (ret < 0) {
+      ldpp_dout(dpp(), -1) << "ERROR: failed to init sync module instance, ret=" << ret << dendl;
+      return ret;
+    }
+
+    RGWDataSyncStatusManager sync(static_cast<rgw::sal::RadosStore*>(driver), static_cast<rgw::sal::RadosStore*>(driver)->svc()->rados->get_async_processor(), source_zone, nullptr, sync_module);
+
+    ret = sync.init(dpp());
+    if (ret < 0) {
+      cerr << "ERROR: sync.init() returned ret=" << ret << std::endl;
+      return -ret;
+    }
+
+    ret = sync.run(dpp());
+    if (ret < 0) {
+      cerr << "ERROR: sync.run() returned ret=" << ret << std::endl;
+      return -ret;
+    }
+  }
+
+  // bucket sync init: reset sync status for one bucket (optionally one source bucket).
+  if (opt_cmd == OPT::BUCKET_SYNC_INIT) {
+    if (source_zone.empty()) {
+      cerr << "ERROR: source zone not specified" << std::endl;
+      return EINVAL;
+    }
+    if (bucket_name.empty()) {
+      cerr << "ERROR: bucket not specified" << std::endl;
+      return EINVAL;
+    }
+    int ret = init_bucket_for_sync(user.get(), tenant, bucket_name, bucket_id, &bucket);
+    if (ret < 0) {
+      return -ret;
+    }
+    auto opt_sb = opt_source_bucket;
+    // resolve the source bucket's instance id when only tenant/name were given
+    if (opt_sb && opt_sb->bucket_id.empty()) {
+      string sbid;
+      std::unique_ptr<rgw::sal::Bucket> sbuck;
+      int ret = init_bucket_for_sync(user.get(), opt_sb->tenant, opt_sb->name, sbid, &sbuck);
+      if (ret < 0) {
+        return -ret;
+      }
+      opt_sb = sbuck->get_key();
+    }
+
+    auto sync = RGWBucketPipeSyncStatusManager::construct(
+      dpp(), static_cast<rgw::sal::RadosStore*>(driver), source_zone, opt_sb,
+      bucket->get_key(), extra_info ? &std::cout : nullptr);
+
+    if (!sync) {
+      cerr << "ERROR: sync.init() returned error=" << sync.error() << std::endl;
+      return -sync.error();
+    }
+    ret = (*sync)->init_sync_status(dpp());
+    if (ret < 0) {
+      cerr << "ERROR: sync.init_sync_status() returned ret=" << ret << std::endl;
+      return -ret;
+    }
+  }
+
+  // bucket sync checkpoint: wait until bucket sync catches up with the source.
+  if (opt_cmd == OPT::BUCKET_SYNC_CHECKPOINT) {
+    std::optional<rgw_zone_id> opt_source_zone;
+    if (!source_zone.empty()) {
+      opt_source_zone = source_zone;
+    }
+    if (bucket_name.empty()) {
+      cerr << "ERROR: bucket not specified" << std::endl;
+      return EINVAL;
+    }
+    int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket);
+    if (ret < 0) {
+      return -ret;
+    }
+
+    // nothing to wait for if sync is disabled on this bucket
+    if (!static_cast<rgw::sal::RadosStore*>(driver)->ctl()->bucket->bucket_imports_data(bucket->get_key(), null_yield, dpp())) {
+      std::cout << "Sync is disabled for bucket " << bucket_name << std::endl;
+      return 0;
+    }
+
+    RGWBucketSyncPolicyHandlerRef handler;
+    ret = driver->get_sync_policy_handler(dpp(), std::nullopt, bucket->get_key(), &handler, null_yield);
+    if (ret < 0) {
+      std::cerr << "ERROR: failed to get policy handler for bucket ("
+                << bucket << "): r=" << ret << ": " << cpp_strerror(-ret) << std::endl;
+      return -ret;
+    }
+
+    // poll with --retry-delay-ms until caught up or --timeout-sec expires
+    auto timeout_at = ceph::coarse_mono_clock::now() + opt_timeout_sec;
+    ret = rgw_bucket_sync_checkpoint(dpp(), static_cast<rgw::sal::RadosStore*>(driver), *handler, bucket->get_info(),
+                                     opt_source_zone, opt_source_bucket,
+                                     opt_retry_delay_ms, timeout_at);
+    if (ret < 0) {
+      ldpp_dout(dpp(), -1) << "bucket sync checkpoint failed: " << cpp_strerror(ret) << dendl;
+      return -ret;
+    }
+  }
+
+  if ((opt_cmd == OPT::BUCKET_SYNC_DISABLE) || (opt_cmd == OPT::BUCKET_SYNC_ENABLE)) {
+    // Toggle sync on or off for one bucket via the bucket admin op.
+    if (bucket_name.empty()) {
+      cerr << "ERROR: bucket not specified" << std::endl;
+      return EINVAL;
+    }
+    // derive the desired state directly from the command instead of branching
+    const bool enable_sync = (opt_cmd == OPT::BUCKET_SYNC_ENABLE);
+    bucket_op.set_sync_bucket(enable_sync);
+    bucket_op.set_tenant(tenant);
+    string err_msg;
+    ret = RGWBucketAdminOp::sync_bucket(driver, bucket_op, dpp(), &err_msg);
+    if (ret < 0) {
+      cerr << err_msg << std::endl;
+      return -ret;
+    }
+  }
+
+  // bucket sync info: print the sync policy/pipe information for one bucket.
+  if (opt_cmd == OPT::BUCKET_SYNC_INFO) {
+    if (bucket_name.empty()) {
+      cerr << "ERROR: bucket not specified" << std::endl;
+      return EINVAL;
+    }
+    int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket);
+    if (ret < 0) {
+      return -ret;
+    }
+    bucket_sync_info(driver, bucket->get_info(), std::cout);
+  }
+
+  // bucket sync status: print per-source sync status for one bucket.
+  if (opt_cmd == OPT::BUCKET_SYNC_STATUS) {
+    if (bucket_name.empty()) {
+      cerr << "ERROR: bucket not specified" << std::endl;
+      return EINVAL;
+    }
+    int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket);
+    if (ret < 0) {
+      return -ret;
+    }
+    bucket_sync_status(driver, bucket->get_info(), source_zone, opt_source_bucket, std::cout);
+  }
+
+  // bucket sync markers: dump the raw per-shard sync markers for one bucket.
+  if (opt_cmd == OPT::BUCKET_SYNC_MARKERS) {
+    if (source_zone.empty()) {
+      cerr << "ERROR: source zone not specified" << std::endl;
+      return EINVAL;
+    }
+    if (bucket_name.empty()) {
+      cerr << "ERROR: bucket not specified" << std::endl;
+      return EINVAL;
+    }
+    int ret = init_bucket_for_sync(user.get(), tenant, bucket_name, bucket_id, &bucket);
+    if (ret < 0) {
+      return -ret;
+    }
+    auto sync = RGWBucketPipeSyncStatusManager::construct(
+      dpp(), static_cast<rgw::sal::RadosStore*>(driver), source_zone,
+      opt_source_bucket, bucket->get_key(), nullptr);
+
+    if (!sync) {
+      cerr << "ERROR: sync.init() returned error=" << sync.error() << std::endl;
+      return -sync.error();
+    }
+
+    auto sync_status = (*sync)->read_sync_status(dpp());
+    if (!sync_status) {
+      cerr << "ERROR: sync.read_sync_status() returned error="
+           << sync_status.error() << std::endl;
+      return -sync_status.error();
+    }
+
+    encode_json("sync_status", *sync_status, formatter.get());
+    formatter->flush(cout);
+  }
+
+  // bucket sync run: execute bucket sync for one bucket in the foreground.
+  if (opt_cmd == OPT::BUCKET_SYNC_RUN) {
+    if (source_zone.empty()) {
+      cerr << "ERROR: source zone not specified" << std::endl;
+      return EINVAL;
+    }
+    if (bucket_name.empty()) {
+      cerr << "ERROR: bucket not specified" << std::endl;
+      return EINVAL;
+    }
+    int ret = init_bucket_for_sync(user.get(), tenant, bucket_name, bucket_id, &bucket);
+    if (ret < 0) {
+      return -ret;
+    }
+    auto sync = RGWBucketPipeSyncStatusManager::construct(
+      dpp(), static_cast<rgw::sal::RadosStore*>(driver), source_zone,
+      opt_source_bucket, bucket->get_key(), extra_info ? &std::cout : nullptr);
+
+    if (!sync) {
+      cerr << "ERROR: sync.init() returned error=" << sync.error() << std::endl;
+      return -sync.error();
+    }
+
+    ret = (*sync)->run(dpp());
+    if (ret < 0) {
+      cerr << "ERROR: sync.run() returned ret=" << ret << std::endl;
+      return -ret;
+    }
+  }
+
+  // bilog list: page through a bucket's index log entries, advancing 'marker'.
+  if (opt_cmd == OPT::BILOG_LIST) {
+    if (bucket_name.empty()) {
+      cerr << "ERROR: bucket not specified" << std::endl;
+      return EINVAL;
+    }
+    int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket);
+    if (ret < 0) {
+      cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
+      return -ret;
+    }
+    formatter->open_array_section("entries");
+    bool truncated;
+    int count = 0;
+    if (max_entries < 0)
+      max_entries = 1000;
+
+    // default to the newest log generation; --gen selects a specific one
+    const auto& logs = bucket->get_info().layout.logs;
+    auto log_layout = std::reference_wrapper{logs.back()};
+    if (gen) {
+      auto i = std::find_if(logs.begin(), logs.end(), rgw::matches_gen(*gen));
+      if (i == logs.end()) {
+        cerr << "ERROR: no log layout with gen=" << *gen << std::endl;
+        return ENOENT;
+      }
+      log_layout = *i;
+    }
+
+    do {
+      list<rgw_bi_log_entry> entries;
+      ret = static_cast<rgw::sal::RadosStore*>(driver)->svc()->bilog_rados->log_list(dpp(), bucket->get_info(), log_layout, shard_id, marker, max_entries - count, entries, &truncated);
+      if (ret < 0) {
+        cerr << "ERROR: list_bi_log_entries(): " << cpp_strerror(-ret) << std::endl;
+        return -ret;
+      }
+
+      count += entries.size();
+
+      for (list<rgw_bi_log_entry>::iterator iter = entries.begin(); iter != entries.end(); ++iter) {
+        rgw_bi_log_entry& entry = *iter;
+        encode_json("entry", entry, formatter.get());
+
+        // remember the last entry id so the next page resumes after it
+        marker = entry.id;
+      }
+      formatter->flush(cout);
+    } while (truncated && count < max_entries);
+
+    formatter->close_section();
+    formatter->flush(cout);
+  }
+
+  // sync error list: dump decoded sync-error log entries, per shard.
+  if (opt_cmd == OPT::SYNC_ERROR_LIST) {
+    if (max_entries < 0) {
+      max_entries = 1000;
+    }
+    if (!start_date.empty()) {
+      std::cerr << "start-date not allowed." << std::endl;
+      return -EINVAL;
+    }
+    if (!end_date.empty()) {
+      std::cerr << "end-date not allowed." << std::endl;
+      return -EINVAL;
+    }
+    if (!end_marker.empty()) {
+      std::cerr << "end-marker not allowed." << std::endl;
+      return -EINVAL;
+    }
+    if (!start_marker.empty()) {
+      // --start-marker is accepted as a legacy alias for --marker
+      if (marker.empty()) {
+        marker = start_marker;
+      } else {
+        std::cerr << "start-marker and marker not both allowed." << std::endl;
+        return -EINVAL;
+      }
+    }
+
+    bool truncated;
+
+    // start at --shard-id if given, otherwise scan all shards from 0
+    if (shard_id < 0) {
+      shard_id = 0;
+    }
+
+    formatter->open_array_section("entries");
+
+    for (; shard_id < ERROR_LOGGER_SHARDS; ++shard_id) {
+      formatter->open_object_section("shard");
+      encode_json("shard_id", shard_id, formatter.get());
+      formatter->open_array_section("entries");
+
+      int count = 0;
+      string oid = RGWSyncErrorLogger::get_shard_oid(RGW_SYNC_ERROR_LOG_SHARD_PREFIX, shard_id);
+
+      do {
+        list<cls_log_entry> entries;
+        ret = static_cast<rgw::sal::RadosStore*>(driver)->svc()->cls->timelog.list(dpp(), oid, {}, {}, max_entries - count, entries, marker, &marker, &truncated,
+                                                                                   null_yield);
+        // a missing shard object simply means no errors logged for it
+        if (ret == -ENOENT) {
+          break;
+        }
+        if (ret < 0) {
+          cerr << "ERROR: svc.cls->timelog.list(): " << cpp_strerror(-ret) << std::endl;
+          return -ret;
+        }
+
+        count += entries.size();
+
+        for (auto& cls_entry : entries) {
+          rgw_sync_error_info log_entry;
+
+          auto iter = cls_entry.data.cbegin();
+          try {
+            decode(log_entry, iter);
+          } catch (buffer::error& err) {
+            // skip undecodable entries rather than aborting the listing
+            cerr << "ERROR: failed to decode log entry" << std::endl;
+            continue;
+          }
+          formatter->open_object_section("entry");
+          encode_json("id", cls_entry.id, formatter.get());
+          encode_json("section", cls_entry.section, formatter.get());
+          encode_json("name", cls_entry.name, formatter.get());
+          encode_json("timestamp", cls_entry.timestamp, formatter.get());
+          encode_json("info", log_entry, formatter.get());
+          formatter->close_section();
+          formatter->flush(cout);
+        }
+      } while (truncated && count < max_entries);
+
+      formatter->close_section();
+      formatter->close_section();
+
+      if (specified_shard_id) {
+        break;
+      }
+    }
+
+    formatter->close_section();
+    formatter->flush(cout);
+  }
+
+  if (opt_cmd == OPT::SYNC_ERROR_TRIM) {
+    // Trim the sync error log up to --marker, on one shard (--shard-id) or on
+    // all ERROR_LOGGER_SHARDS shards.
+    if (!start_date.empty()) {
+      std::cerr << "start-date not allowed." << std::endl;
+      return -EINVAL;
+    }
+    if (!end_date.empty()) {
+      std::cerr << "end-date not allowed." << std::endl;
+      return -EINVAL;
+    }
+    if (!start_marker.empty()) {
+      // fix: previously misreported as "end-date not allowed."
+      std::cerr << "start-marker not allowed." << std::endl;
+      return -EINVAL;
+    }
+    if (!end_marker.empty()) {
+      // fix: previously misreported as "end-date not allowed."
+      std::cerr << "end-marker not allowed." << std::endl;
+      return -EINVAL;
+    }
+
+    // start at --shard-id if given, otherwise trim all shards from 0
+    if (shard_id < 0) {
+      shard_id = 0;
+    }
+
+    for (; shard_id < ERROR_LOGGER_SHARDS; ++shard_id) {
+      ret = trim_sync_error_log(shard_id, marker, trim_delay_ms);
+      if (ret < 0) {
+        cerr << "ERROR: sync error trim: " << cpp_strerror(-ret) << std::endl;
+        return -ret;
+      }
+      if (specified_shard_id) {
+        break;
+      }
+    }
+  }
+
+  // sync group create/modify: add or update a sync policy group and its status.
+  if (opt_cmd == OPT::SYNC_GROUP_CREATE ||
+      opt_cmd == OPT::SYNC_GROUP_MODIFY) {
+    CHECK_TRUE(require_non_empty_opt(opt_group_id), "ERROR: --group-id not specified", EINVAL);
+    CHECK_TRUE(require_opt(opt_status), "ERROR: --status is not specified (options: forbidden, allowed, enabled)", EINVAL);
+
+    SyncPolicyContext sync_policy_ctx(cfgstore.get(), opt_bucket);
+    ret = sync_policy_ctx.init(zonegroup_id, zonegroup_name);
+    if (ret < 0) {
+      return -ret;
+    }
+    auto& sync_policy = sync_policy_ctx.get_policy();
+
+    // modify requires that the group already exist; create will insert it below
+    if (opt_cmd == OPT::SYNC_GROUP_MODIFY) {
+      auto iter = sync_policy.groups.find(*opt_group_id);
+      if (iter == sync_policy.groups.end()) {
+        cerr << "ERROR: could not find group '" << *opt_group_id << "'" << std::endl;
+        return ENOENT;
+      }
+    }
+
+    // operator[] inserts the group if absent (create) or fetches it (modify)
+    auto& group = sync_policy.groups[*opt_group_id];
+    group.id = *opt_group_id;
+
+    if (opt_status) {
+      if (!group.set_status(*opt_status)) {
+        cerr << "ERROR: unrecognized status (options: forbidden, allowed, enabled)" << std::endl;
+        return EINVAL;
+      }
+    }
+
+    ret = sync_policy_ctx.write_policy();
+    if (ret < 0) {
+      return -ret;
+    }
+
+    show_result(sync_policy, zone_formatter.get(), cout);
+  }
+
+  // sync group get: print one group by id, or all groups when no id is given.
+  if (opt_cmd == OPT::SYNC_GROUP_GET) {
+    SyncPolicyContext sync_policy_ctx(cfgstore.get(), opt_bucket);
+    ret = sync_policy_ctx.init(zonegroup_id, zonegroup_name);
+    if (ret < 0) {
+      return -ret;
+    }
+    auto& sync_policy = sync_policy_ctx.get_policy();
+
+    auto& groups = sync_policy.groups;
+
+    if (!opt_group_id) {
+      show_result(groups, zone_formatter.get(), cout);
+    } else {
+      auto iter = sync_policy.groups.find(*opt_group_id);
+      if (iter == sync_policy.groups.end()) {
+        cerr << "ERROR: could not find group '" << *opt_group_id << "'" << std::endl;
+        return ENOENT;
+      }
+
+      show_result(iter->second, zone_formatter.get(), cout);
+    }
+  }
+
+  // sync group remove: delete a group from the policy and persist the change.
+  if (opt_cmd == OPT::SYNC_GROUP_REMOVE) {
+    CHECK_TRUE(require_non_empty_opt(opt_group_id), "ERROR: --group-id not specified", EINVAL);
+
+    SyncPolicyContext sync_policy_ctx(cfgstore.get(), opt_bucket);
+    ret = sync_policy_ctx.init(zonegroup_id, zonegroup_name);
+    if (ret < 0) {
+      return -ret;
+    }
+    auto& sync_policy = sync_policy_ctx.get_policy();
+
+    sync_policy.groups.erase(*opt_group_id);
+
+    ret = sync_policy_ctx.write_policy();
+    if (ret < 0) {
+      return -ret;
+    }
+
+    {
+      Formatter::ObjectSection os(*zone_formatter.get(), "result");
+      encode_json("sync_policy", sync_policy, zone_formatter.get());
+    }
+
+    zone_formatter->flush(cout);
+  }
+
+  // sync group flow create: add a symmetrical or directional data-flow rule.
+  if (opt_cmd == OPT::SYNC_GROUP_FLOW_CREATE) {
+    CHECK_TRUE(require_non_empty_opt(opt_group_id), "ERROR: --group-id not specified", EINVAL);
+    CHECK_TRUE(require_non_empty_opt(opt_flow_id), "ERROR: --flow-id not specified", EINVAL);
+    CHECK_TRUE(require_opt(opt_flow_type),
+               "ERROR: --flow-type not specified (options: symmetrical, directional)", EINVAL);
+    CHECK_TRUE((symmetrical_flow_opt(*opt_flow_type) ||
+                directional_flow_opt(*opt_flow_type)),
+               "ERROR: --flow-type invalid (options: symmetrical, directional)", EINVAL);
+
+    SyncPolicyContext sync_policy_ctx(cfgstore.get(), opt_bucket);
+    ret = sync_policy_ctx.init(zonegroup_id, zonegroup_name);
+    if (ret < 0) {
+      return -ret;
+    }
+    auto& sync_policy = sync_policy_ctx.get_policy();
+
+    auto iter = sync_policy.groups.find(*opt_group_id);
+    if (iter == sync_policy.groups.end()) {
+      cerr << "ERROR: could not find group '" << *opt_group_id << "'" << std::endl;
+      return ENOENT;
+    }
+
+    auto& group = iter->second;
+
+    if (symmetrical_flow_opt(*opt_flow_type)) {
+      // symmetrical: all listed zones sync with each other
+      CHECK_TRUE(require_non_empty_opt(opt_zone_ids), "ERROR: --zones not provided for symmetrical flow, or is empty", EINVAL);
+
+      rgw_sync_symmetric_group *flow_group;
+
+      group.data_flow.find_or_create_symmetrical(*opt_flow_id, &flow_group);
+
+      for (auto& z : *opt_zone_ids) {
+        flow_group->zones.insert(z);
+      }
+    } else { /* directional */
+      // directional: data flows only from --source-zone to --dest-zone
+      CHECK_TRUE(require_non_empty_opt(opt_source_zone_id), "ERROR: --source-zone not provided for directional flow rule, or is empty", EINVAL);
+      CHECK_TRUE(require_non_empty_opt(opt_dest_zone_id), "ERROR: --dest-zone not provided for directional flow rule, or is empty", EINVAL);
+
+      rgw_sync_directional_rule *flow_rule;
+
+      group.data_flow.find_or_create_directional(*opt_source_zone_id, *opt_dest_zone_id, &flow_rule);
+    }
+
+    ret = sync_policy_ctx.write_policy();
+    if (ret < 0) {
+      return -ret;
+    }
+
+    show_result(sync_policy, zone_formatter.get(), cout);
+  }
+
+  // sync group flow remove: delete a data-flow rule (or zones from one).
+  if (opt_cmd == OPT::SYNC_GROUP_FLOW_REMOVE) {
+    CHECK_TRUE(require_non_empty_opt(opt_group_id), "ERROR: --group-id not specified", EINVAL);
+    CHECK_TRUE(require_non_empty_opt(opt_flow_id), "ERROR: --flow-id not specified", EINVAL);
+    CHECK_TRUE(require_opt(opt_flow_type),
+               "ERROR: --flow-type not specified (options: symmetrical, directional)", EINVAL);
+    CHECK_TRUE((symmetrical_flow_opt(*opt_flow_type) ||
+                directional_flow_opt(*opt_flow_type)),
+               "ERROR: --flow-type invalid (options: symmetrical, directional)", EINVAL);
+
+    SyncPolicyContext sync_policy_ctx(cfgstore.get(), opt_bucket);
+    ret = sync_policy_ctx.init(zonegroup_id, zonegroup_name);
+    if (ret < 0) {
+      return -ret;
+    }
+    auto& sync_policy = sync_policy_ctx.get_policy();
+
+    auto iter = sync_policy.groups.find(*opt_group_id);
+    if (iter == sync_policy.groups.end()) {
+      cerr << "ERROR: could not find group '" << *opt_group_id << "'" << std::endl;
+      return ENOENT;
+    }
+
+    auto& group = iter->second;
+
+    if (symmetrical_flow_opt(*opt_flow_type)) {
+      group.data_flow.remove_symmetrical(*opt_flow_id, opt_zone_ids);
+    } else { /* directional */
+      CHECK_TRUE(require_non_empty_opt(opt_source_zone_id), "ERROR: --source-zone not provided for directional flow rule, or is empty", EINVAL);
+      CHECK_TRUE(require_non_empty_opt(opt_dest_zone_id), "ERROR: --dest-zone not provided for directional flow rule, or is empty", EINVAL);
+
+      group.data_flow.remove_directional(*opt_source_zone_id, *opt_dest_zone_id);
+    }
+
+    ret = sync_policy_ctx.write_policy();
+    if (ret < 0) {
+      return -ret;
+    }
+
+    show_result(sync_policy, zone_formatter.get(), cout);
+  }
+
+  // sync group pipe create/modify: define or update a bucket sync pipe in a group.
+  if (opt_cmd == OPT::SYNC_GROUP_PIPE_CREATE ||
+      opt_cmd == OPT::SYNC_GROUP_PIPE_MODIFY) {
+    CHECK_TRUE(require_non_empty_opt(opt_group_id), "ERROR: --group-id not specified", EINVAL);
+    CHECK_TRUE(require_non_empty_opt(opt_pipe_id), "ERROR: --pipe-id not specified", EINVAL);
+    if (opt_cmd == OPT::SYNC_GROUP_PIPE_CREATE) {
+      CHECK_TRUE(require_non_empty_opt(opt_source_zone_ids), "ERROR: --source-zones not provided or is empty; should be list of zones or '*'", EINVAL);
+      CHECK_TRUE(require_non_empty_opt(opt_dest_zone_ids), "ERROR: --dest-zones not provided or is empty; should be list of zones or '*'", EINVAL);
+    }
+
+    SyncPolicyContext sync_policy_ctx(cfgstore.get(), opt_bucket);
+    ret = sync_policy_ctx.init(zonegroup_id, zonegroup_name);
+    if (ret < 0) {
+      return -ret;
+    }
+    auto& sync_policy = sync_policy_ctx.get_policy();
+
+    auto iter = sync_policy.groups.find(*opt_group_id);
+    if (iter == sync_policy.groups.end()) {
+      cerr << "ERROR: could not find group '" << *opt_group_id << "'" << std::endl;
+      return ENOENT;
+    }
+
+    auto& group = iter->second;
+
+    rgw_sync_bucket_pipes *pipe;
+
+    // create inserts the pipe if missing; modify requires that it exist
+    if (opt_cmd == OPT::SYNC_GROUP_PIPE_CREATE) {
+      group.find_pipe(*opt_pipe_id, true, &pipe);
+    } else {
+      if (!group.find_pipe(*opt_pipe_id, false, &pipe)) {
+        cerr << "ERROR: could not find pipe '" << *opt_pipe_id << "'" << std::endl;
+        return ENOENT;
+      }
+    }
+
+    // apply source/dest zone and bucket selectors from the command line
+    if (opt_source_zone_ids) {
+      pipe->source.add_zones(*opt_source_zone_ids);
+    }
+    pipe->source.set_bucket(opt_source_tenant,
+                            opt_source_bucket_name,
+                            opt_source_bucket_id);
+    if (opt_dest_zone_ids) {
+      pipe->dest.add_zones(*opt_dest_zone_ids);
+    }
+    pipe->dest.set_bucket(opt_dest_tenant,
+                          opt_dest_bucket_name,
+                          opt_dest_bucket_id);
+
+    // per-pipe parameters: object filter, dest overrides, priority, mode
+    pipe->params.source.filter.set_prefix(opt_prefix, !!opt_prefix_rm);
+    pipe->params.source.filter.set_tags(tags_add, tags_rm);
+    if (opt_dest_owner) {
+      pipe->params.dest.set_owner(*opt_dest_owner);
+    }
+    if (opt_storage_class) {
+      pipe->params.dest.set_storage_class(*opt_storage_class);
+    }
+    if (opt_priority) {
+      pipe->params.priority = *opt_priority;
+    }
+    if (opt_mode) {
+      if (*opt_mode == "system") {
+        pipe->params.mode = rgw_sync_pipe_params::MODE_SYSTEM;
+      } else if (*opt_mode == "user") {
+        pipe->params.mode = rgw_sync_pipe_params::MODE_USER;
+      } else {
+        cerr << "ERROR: bad mode value: should be one of the following: system, user" << std::endl;
+        return EINVAL;
+      }
+    }
+
+    // pipe user: explicit --uid wins; otherwise default to the policy owner
+    if (!rgw::sal::User::empty(user)) {
+      pipe->params.user = user->get_id();
+    } else if (pipe->params.user.empty()) {
+      auto owner = sync_policy_ctx.get_owner();
+      if (owner) {
+        pipe->params.user = *owner;
+      }
+    }
+
+    ret = sync_policy_ctx.write_policy();
+    if (ret < 0) {
+      return -ret;
+    }
+
+    show_result(sync_policy, zone_formatter.get(), cout);
+  }
+
+  // sync group pipe remove: drop zones/buckets from a pipe, or the whole pipe.
+  if (opt_cmd == OPT::SYNC_GROUP_PIPE_REMOVE) {
+    CHECK_TRUE(require_non_empty_opt(opt_group_id), "ERROR: --group-id not specified", EINVAL);
+    CHECK_TRUE(require_non_empty_opt(opt_pipe_id), "ERROR: --pipe-id not specified", EINVAL);
+
+    SyncPolicyContext sync_policy_ctx(cfgstore.get(), opt_bucket);
+    ret = sync_policy_ctx.init(zonegroup_id, zonegroup_name);
+    if (ret < 0) {
+      return -ret;
+    }
+    auto& sync_policy = sync_policy_ctx.get_policy();
+
+    auto iter = sync_policy.groups.find(*opt_group_id);
+    if (iter == sync_policy.groups.end()) {
+      cerr << "ERROR: could not find group '" << *opt_group_id << "'" << std::endl;
+      return ENOENT;
+    }
+
+    auto& group = iter->second;
+
+    rgw_sync_bucket_pipes *pipe;
+
+    if (!group.find_pipe(*opt_pipe_id, false, &pipe)) {
+      cerr << "ERROR: could not find pipe '" << *opt_pipe_id << "'" << std::endl;
+      return ENOENT;
+    }
+
+    if (opt_source_zone_ids) {
+      pipe->source.remove_zones(*opt_source_zone_ids);
+    }
+
+    pipe->source.remove_bucket(opt_source_tenant,
+                               opt_source_bucket_name,
+                               opt_source_bucket_id);
+    if (opt_dest_zone_ids) {
+      pipe->dest.remove_zones(*opt_dest_zone_ids);
+    }
+    pipe->dest.remove_bucket(opt_dest_tenant,
+                             opt_dest_bucket_name,
+                             opt_dest_bucket_id);
+
+    // no selector options at all means: remove the entire pipe from the group
+    if (!(opt_source_zone_ids ||
+          opt_source_tenant ||
+          opt_source_bucket ||
+          opt_source_bucket_id ||
+          opt_dest_zone_ids ||
+          opt_dest_tenant ||
+          opt_dest_bucket ||
+          opt_dest_bucket_id)) {
+      group.remove_pipe(*opt_pipe_id);
+    }
+
+    ret = sync_policy_ctx.write_policy();
+    if (ret < 0) {
+      return -ret;
+    }
+
+    show_result(sync_policy, zone_formatter.get(), cout);
+  }
+
+  // sync policy get: print the effective zonegroup/bucket sync policy.
+  if (opt_cmd == OPT::SYNC_POLICY_GET) {
+    SyncPolicyContext sync_policy_ctx(cfgstore.get(), opt_bucket);
+    ret = sync_policy_ctx.init(zonegroup_id, zonegroup_name);
+    if (ret < 0) {
+      return -ret;
+    }
+    auto& sync_policy = sync_policy_ctx.get_policy();
+
+    show_result(sync_policy, zone_formatter.get(), cout);
+  }
+
+  // bilog trim: remove bucket index log entries for one generation/shard range.
+  if (opt_cmd == OPT::BILOG_TRIM) {
+    if (bucket_name.empty()) {
+      cerr << "ERROR: bucket not specified" << std::endl;
+      return EINVAL;
+    }
+    int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket);
+    if (ret < 0) {
+      cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
+      return -ret;
+    }
+
+    // default to generation 0 when --gen was not supplied
+    if (!gen) {
+      gen = 0;
+    }
+    ret = bilog_trim(dpp(), static_cast<rgw::sal::RadosStore*>(driver),
+                     bucket->get_info(), *gen,
+                     shard_id, start_marker, end_marker);
+    if (ret < 0) {
+      cerr << "ERROR: trim_bi_log_entries(): " << cpp_strerror(-ret) << std::endl;
+      return -ret;
+    }
+  }
+
+  // bilog status: print per-shard max markers for a bucket's index log.
+  if (opt_cmd == OPT::BILOG_STATUS) {
+    if (bucket_name.empty()) {
+      cerr << "ERROR: bucket not specified" << std::endl;
+      return EINVAL;
+    }
+    int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket);
+    if (ret < 0) {
+      cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
+      return -ret;
+    }
+    map<int, string> markers;
+    // default to the newest log generation; --gen selects a specific one
+    const auto& logs = bucket->get_info().layout.logs;
+    auto log_layout = std::reference_wrapper{logs.back()};
+    if (gen) {
+      auto i = std::find_if(logs.begin(), logs.end(), rgw::matches_gen(*gen));
+      if (i == logs.end()) {
+        cerr << "ERROR: no log layout with gen=" << *gen << std::endl;
+        return ENOENT;
+      }
+      log_layout = *i;
+    }
+
+    ret = static_cast<rgw::sal::RadosStore*>(driver)->svc()->bilog_rados->get_log_status(dpp(), bucket->get_info(), log_layout, shard_id,
+                                                                                        &markers, null_yield);
+    if (ret < 0) {
+      cerr << "ERROR: get_bi_log_status(): " << cpp_strerror(-ret) << std::endl;
+      return -ret;
+    }
+    formatter->open_object_section("entries");
+    encode_json("markers", markers, formatter.get());
+    formatter->dump_string("current_time",
+                           to_iso_8601(ceph::real_clock::now(),
+                                       iso_8601_format::YMDhms));
+    formatter->close_section();
+    formatter->flush(cout);
+  }
+
+  // bilog autotrim: run the automated bucket index log trim coroutine.
+  if (opt_cmd == OPT::BILOG_AUTOTRIM) {
+    RGWCoroutinesManager crs(driver->ctx(), driver->get_cr_registry());
+    RGWHTTPManager http(driver->ctx(), crs.get_completion_mgr());
+    int ret = http.start();
+    if (ret < 0) {
+      cerr << "failed to initialize http client with " << cpp_strerror(ret) << std::endl;
+      return -ret;
+    }
+
+    rgw::BucketTrimConfig config;
+    configure_bucket_trim(driver->ctx(), config);
+
+    rgw::BucketTrimManager trim(static_cast<rgw::sal::RadosStore*>(driver), config);
+    ret = trim.init();
+    if (ret < 0) {
+      cerr << "trim manager init failed with " << cpp_strerror(ret) << std::endl;
+      return -ret;
+    }
+    ret = crs.run(dpp(), trim.create_admin_bucket_trim_cr(&http));
+    if (ret < 0) {
+      cerr << "automated bilog trim failed with " << cpp_strerror(ret) << std::endl;
+      return -ret;
+    }
+  }
+
+  // datalog list: page through data changes log entries (one shard or all).
+  if (opt_cmd == OPT::DATALOG_LIST) {
+    formatter->open_array_section("entries");
+    bool truncated;
+    int count = 0;
+    if (max_entries < 0)
+      max_entries = 1000;
+    // date/end-marker options are rejected; only marker-based paging is supported
+    if (!start_date.empty()) {
+      std::cerr << "start-date not allowed." << std::endl;
+      return -EINVAL;
+    }
+    if (!end_date.empty()) {
+      std::cerr << "end-date not allowed." << std::endl;
+      return -EINVAL;
+    }
+    if (!end_marker.empty()) {
+      std::cerr << "end-marker not allowed." << std::endl;
+      return -EINVAL;
+    }
+    if (!start_marker.empty()) {
+      // --start-marker is accepted as a legacy alias for --marker
+      if (marker.empty()) {
+        marker = start_marker;
+      } else {
+        std::cerr << "start-marker and marker not both allowed." << std::endl;
+        return -EINVAL;
+      }
+    }
+
+    auto datalog_svc = static_cast<rgw::sal::RadosStore*>(driver)->svc()->datalog_rados;
+    RGWDataChangesLog::LogMarker log_marker;
+
+    do {
+      std::vector<rgw_data_change_log_entry> entries;
+      // single-shard listing uses the string marker; all-shard listing uses
+      // the LogMarker cursor that spans shards
+      if (specified_shard_id) {
+        ret = datalog_svc->list_entries(dpp(), shard_id, max_entries - count,
+                                        entries, marker,
+                                        &marker, &truncated,
+                                        null_yield);
+      } else {
+        ret = datalog_svc->list_entries(dpp(), max_entries - count, entries,
+                                        log_marker, &truncated, null_yield);
+      }
+      if (ret < 0) {
+        cerr << "ERROR: datalog_svc->list_entries(): " << cpp_strerror(-ret) << std::endl;
+        return -ret;
+      }
+
+      count += entries.size();
+
+      for (const auto& entry : entries) {
+        if (!extra_info) {
+          encode_json("entry", entry.entry, formatter.get());
+        } else {
+          encode_json("entry", entry, formatter.get());
+        }
+      }
+      formatter.get()->flush(cout);
+    } while (truncated && count < max_entries);
+
+    formatter->close_section();
+    formatter->flush(cout);
+  }
+
+ if (opt_cmd == OPT::DATALOG_STATUS) {
+ int i = (specified_shard_id ? shard_id : 0);
+
+ formatter->open_array_section("entries");
+ for (; i < g_ceph_context->_conf->rgw_data_log_num_shards; i++) {
+ list<cls_log_entry> entries;
+
+ RGWDataChangesLogInfo info;
+ static_cast<rgw::sal::RadosStore*>(driver)->svc()->
+ datalog_rados->get_info(dpp(), i, &info, null_yield);
+
+ ::encode_json("info", info, formatter.get());
+
+ if (specified_shard_id)
+ break;
+ }
+
+ formatter->close_section();
+ formatter->flush(cout);
+ }
+
+ if (opt_cmd == OPT::DATALOG_AUTOTRIM) {
+ RGWCoroutinesManager crs(driver->ctx(), driver->get_cr_registry());
+ RGWHTTPManager http(driver->ctx(), crs.get_completion_mgr());
+ int ret = http.start();
+ if (ret < 0) {
+ cerr << "failed to initialize http client with " << cpp_strerror(ret) << std::endl;
+ return -ret;
+ }
+
+ auto num_shards = g_conf()->rgw_data_log_num_shards;
+ std::vector<std::string> markers(num_shards);
+ ret = crs.run(dpp(), create_admin_data_log_trim_cr(dpp(), static_cast<rgw::sal::RadosStore*>(driver), &http, num_shards, markers));
+ if (ret < 0) {
+ cerr << "automated datalog trim failed with " << cpp_strerror(ret) << std::endl;
+ return -ret;
+ }
+ }
+
+ if (opt_cmd == OPT::DATALOG_TRIM) {
+ if (!start_date.empty()) {
+ std::cerr << "start-date not allowed." << std::endl;
+ return -EINVAL;
+ }
+ if (!end_date.empty()) {
+ std::cerr << "end-date not allowed." << std::endl;
+ return -EINVAL;
+ }
+ if (!start_marker.empty()) {
+ std::cerr << "start-marker not allowed." << std::endl;
+ return -EINVAL;
+ }
+ if (!end_marker.empty()) {
+ if (marker.empty()) {
+ marker = end_marker;
+ } else {
+ std::cerr << "end-marker and marker not both allowed." << std::endl;
+ return -EINVAL;
+ }
+ }
+
+ if (!specified_shard_id) {
+ cerr << "ERROR: requires a --shard-id" << std::endl;
+ return EINVAL;
+ }
+
+ if (marker.empty()) {
+ cerr << "ERROR: requires a --marker" << std::endl;
+ return EINVAL;
+ }
+
+ auto datalog = static_cast<rgw::sal::RadosStore*>(driver)->svc()->datalog_rados;
+ ret = datalog->trim_entries(dpp(), shard_id, marker, null_yield);
+
+ if (ret < 0 && ret != -ENODATA) {
+ cerr << "ERROR: trim_entries(): " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ }
+
+ if (opt_cmd == OPT::DATALOG_TYPE) {
+ if (!opt_log_type) {
+ std::cerr << "log-type not specified." << std::endl;
+ return -EINVAL;
+ }
+ auto datalog = static_cast<rgw::sal::RadosStore*>(driver)->svc()->datalog_rados;
+ ret = datalog->change_format(dpp(), *opt_log_type, null_yield);
+ if (ret < 0) {
+ cerr << "ERROR: change_format(): " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ }
+
+ if (opt_cmd == OPT::DATALOG_PRUNE) {
+ auto datalog = static_cast<rgw::sal::RadosStore*>(driver)->svc()->datalog_rados;
+ std::optional<uint64_t> through;
+ ret = datalog->trim_generations(dpp(), through, null_yield);
+
+ if (ret < 0) {
+ cerr << "ERROR: trim_generations(): " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ if (through) {
+ std::cout << "Pruned " << *through << " empty generations." << std::endl;
+ } else {
+ std::cout << "No empty generations." << std::endl;
+ }
+ }
+
+ bool quota_op = (opt_cmd == OPT::QUOTA_SET || opt_cmd == OPT::QUOTA_ENABLE || opt_cmd == OPT::QUOTA_DISABLE);
+
+ if (quota_op) {
+ if (bucket_name.empty() && rgw::sal::User::empty(user)) {
+ cerr << "ERROR: bucket name or uid is required for quota operation" << std::endl;
+ return EINVAL;
+ }
+
+ if (!bucket_name.empty()) {
+ if (!quota_scope.empty() && quota_scope != "bucket") {
+ cerr << "ERROR: invalid quota scope specification." << std::endl;
+ return EINVAL;
+ }
+ set_bucket_quota(driver, opt_cmd, tenant, bucket_name,
+ max_size, max_objects, have_max_size, have_max_objects);
+ } else if (!rgw::sal::User::empty(user)) {
+ if (quota_scope == "bucket") {
+ return set_user_bucket_quota(opt_cmd, ruser, user_op, max_size, max_objects, have_max_size, have_max_objects);
+ } else if (quota_scope == "user") {
+ return set_user_quota(opt_cmd, ruser, user_op, max_size, max_objects, have_max_size, have_max_objects);
+ } else {
+ cerr << "ERROR: invalid quota scope specification. Please specify either --quota-scope=bucket, or --quota-scope=user" << std::endl;
+ return EINVAL;
+ }
+ }
+ }
+
+ bool ratelimit_op_set = (opt_cmd == OPT::RATELIMIT_SET || opt_cmd == OPT::RATELIMIT_ENABLE || opt_cmd == OPT::RATELIMIT_DISABLE);
+ bool ratelimit_op_get = opt_cmd == OPT::RATELIMIT_GET;
+ if (ratelimit_op_set) {
+ if (bucket_name.empty() && rgw::sal::User::empty(user)) {
+ cerr << "ERROR: bucket name or uid is required for ratelimit operation" << std::endl;
+ return EINVAL;
+ }
+
+ if (!bucket_name.empty()) {
+ if (!ratelimit_scope.empty() && ratelimit_scope != "bucket") {
+ cerr << "ERROR: invalid ratelimit scope specification. (bucket scope is not bucket but bucket has been specified)" << std::endl;
+ return EINVAL;
+ }
+ return set_bucket_ratelimit(driver, opt_cmd, tenant, bucket_name,
+ max_read_ops, max_write_ops,
+ max_read_bytes, max_write_bytes,
+ have_max_read_ops, have_max_write_ops,
+ have_max_read_bytes, have_max_write_bytes);
+ } else if (!rgw::sal::User::empty(user)) {
+ } if (ratelimit_scope == "user") {
+ return set_user_ratelimit(opt_cmd, user, max_read_ops, max_write_ops,
+ max_read_bytes, max_write_bytes,
+ have_max_read_ops, have_max_write_ops,
+ have_max_read_bytes, have_max_write_bytes);
+ } else {
+ cerr << "ERROR: invalid ratelimit scope specification. Please specify either --ratelimit-scope=bucket, or --ratelimit-scope=user" << std::endl;
+ return EINVAL;
+ }
+ }
+
+ if (ratelimit_op_get) {
+ if (bucket_name.empty() && rgw::sal::User::empty(user)) {
+ cerr << "ERROR: bucket name or uid is required for ratelimit operation" << std::endl;
+ return EINVAL;
+ }
+
+ if (!bucket_name.empty()) {
+ if (!ratelimit_scope.empty() && ratelimit_scope != "bucket") {
+ cerr << "ERROR: invalid ratelimit scope specification. (bucket scope is not bucket but bucket has been specified)" << std::endl;
+ return EINVAL;
+ }
+ return show_bucket_ratelimit(driver, tenant, bucket_name, formatter.get());
+ } else if (!rgw::sal::User::empty(user)) {
+ } if (ratelimit_scope == "user") {
+ return show_user_ratelimit(user, formatter.get());
+ } else {
+ cerr << "ERROR: invalid ratelimit scope specification. Please specify either --ratelimit-scope=bucket, or --ratelimit-scope=user" << std::endl;
+ return EINVAL;
+ }
+ }
+
+ if (opt_cmd == OPT::MFA_CREATE) {
+ rados::cls::otp::otp_info_t config;
+
+ if (rgw::sal::User::empty(user)) {
+ cerr << "ERROR: user id was not provided (via --uid)" << std::endl;
+ return EINVAL;
+ }
+
+ if (totp_serial.empty()) {
+ cerr << "ERROR: TOTP device serial number was not provided (via --totp-serial)" << std::endl;
+ return EINVAL;
+ }
+
+ if (totp_seed.empty()) {
+ cerr << "ERROR: TOTP device seed was not provided (via --totp-seed)" << std::endl;
+ return EINVAL;
+ }
+
+
+ rados::cls::otp::SeedType seed_type;
+ if (totp_seed_type == "hex") {
+ seed_type = rados::cls::otp::OTP_SEED_HEX;
+ } else if (totp_seed_type == "base32") {
+ seed_type = rados::cls::otp::OTP_SEED_BASE32;
+ } else {
+ cerr << "ERROR: invalid seed type: " << totp_seed_type << std::endl;
+ return EINVAL;
+ }
+
+ config.id = totp_serial;
+ config.seed = totp_seed;
+ config.seed_type = seed_type;
+
+ if (totp_seconds > 0) {
+ config.step_size = totp_seconds;
+ }
+
+ if (totp_window > 0) {
+ config.window = totp_window;
+ }
+
+ real_time mtime = real_clock::now();
+ string oid = static_cast<rgw::sal::RadosStore*>(driver)->svc()->cls->mfa.get_mfa_oid(user->get_id());
+
+ int ret = static_cast<rgw::sal::RadosStore*>(driver)->ctl()->meta.mgr->mutate(RGWSI_MetaBackend_OTP::get_meta_key(user->get_id()),
+ mtime, &objv_tracker,
+ null_yield, dpp(),
+ MDLOG_STATUS_WRITE,
+ [&] {
+ return static_cast<rgw::sal::RadosStore*>(driver)->svc()->cls->mfa.create_mfa(dpp(), user->get_id(), config, &objv_tracker, mtime, null_yield);
+ });
+ if (ret < 0) {
+ cerr << "MFA creation failed, error: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ RGWUserInfo& user_info = user_op.get_user_info();
+ user_info.mfa_ids.insert(totp_serial);
+ user_op.set_mfa_ids(user_info.mfa_ids);
+ string err;
+ ret = ruser.modify(dpp(), user_op, null_yield, &err);
+ if (ret < 0) {
+ cerr << "ERROR: failed storing user info, error: " << err << std::endl;
+ return -ret;
+ }
+ }
+
+ if (opt_cmd == OPT::MFA_REMOVE) {
+ if (rgw::sal::User::empty(user)) {
+ cerr << "ERROR: user id was not provided (via --uid)" << std::endl;
+ return EINVAL;
+ }
+
+ if (totp_serial.empty()) {
+ cerr << "ERROR: TOTP device serial number was not provided (via --totp-serial)" << std::endl;
+ return EINVAL;
+ }
+
+ real_time mtime = real_clock::now();
+
+ int ret = static_cast<rgw::sal::RadosStore*>(driver)->ctl()->meta.mgr->mutate(RGWSI_MetaBackend_OTP::get_meta_key(user->get_id()),
+ mtime, &objv_tracker,
+ null_yield, dpp(),
+ MDLOG_STATUS_WRITE,
+ [&] {
+ return static_cast<rgw::sal::RadosStore*>(driver)->svc()->cls->mfa.remove_mfa(dpp(), user->get_id(), totp_serial, &objv_tracker, mtime, null_yield);
+ });
+ if (ret < 0) {
+ cerr << "MFA removal failed, error: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ RGWUserInfo& user_info = user_op.get_user_info();
+ user_info.mfa_ids.erase(totp_serial);
+ user_op.set_mfa_ids(user_info.mfa_ids);
+ string err;
+ ret = ruser.modify(dpp(), user_op, null_yield, &err);
+ if (ret < 0) {
+ cerr << "ERROR: failed storing user info, error: " << err << std::endl;
+ return -ret;
+ }
+ }
+
+ if (opt_cmd == OPT::MFA_GET) {
+ if (rgw::sal::User::empty(user)) {
+ cerr << "ERROR: user id was not provided (via --uid)" << std::endl;
+ return EINVAL;
+ }
+
+ if (totp_serial.empty()) {
+ cerr << "ERROR: TOTP device serial number was not provided (via --totp-serial)" << std::endl;
+ return EINVAL;
+ }
+
+ rados::cls::otp::otp_info_t result;
+ int ret = static_cast<rgw::sal::RadosStore*>(driver)->svc()->cls->mfa.get_mfa(dpp(), user->get_id(), totp_serial, &result, null_yield);
+ if (ret < 0) {
+ if (ret == -ENOENT || ret == -ENODATA) {
+ cerr << "MFA serial id not found" << std::endl;
+ } else {
+ cerr << "MFA retrieval failed, error: " << cpp_strerror(-ret) << std::endl;
+ }
+ return -ret;
+ }
+ formatter->open_object_section("result");
+ encode_json("entry", result, formatter.get());
+ formatter->close_section();
+ formatter->flush(cout);
+ }
+
+ if (opt_cmd == OPT::MFA_LIST) {
+ if (rgw::sal::User::empty(user)) {
+ cerr << "ERROR: user id was not provided (via --uid)" << std::endl;
+ return EINVAL;
+ }
+
+ list<rados::cls::otp::otp_info_t> result;
+ int ret = static_cast<rgw::sal::RadosStore*>(driver)->svc()->cls->mfa.list_mfa(dpp(), user->get_id(), &result, null_yield);
+ if (ret < 0 && ret != -ENOENT) {
+ cerr << "MFA listing failed, error: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ formatter->open_object_section("result");
+ encode_json("entries", result, formatter.get());
+ formatter->close_section();
+ formatter->flush(cout);
+ }
+
+ if (opt_cmd == OPT::MFA_CHECK) {
+ if (rgw::sal::User::empty(user)) {
+ cerr << "ERROR: user id was not provided (via --uid)" << std::endl;
+ return EINVAL;
+ }
+
+ if (totp_serial.empty()) {
+ cerr << "ERROR: TOTP device serial number was not provided (via --totp-serial)" << std::endl;
+ return EINVAL;
+ }
+
+ if (totp_pin.empty()) {
+ cerr << "ERROR: TOTP device serial number was not provided (via --totp-pin)" << std::endl;
+ return EINVAL;
+ }
+
+ list<rados::cls::otp::otp_info_t> result;
+ int ret = static_cast<rgw::sal::RadosStore*>(driver)->svc()->cls->mfa.check_mfa(dpp(), user->get_id(), totp_serial, totp_pin.front(), null_yield);
+ if (ret < 0) {
+ cerr << "MFA check failed, error: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ cout << "ok" << std::endl;
+ }
+
+ if (opt_cmd == OPT::MFA_RESYNC) {
+ if (rgw::sal::User::empty(user)) {
+ cerr << "ERROR: user id was not provided (via --uid)" << std::endl;
+ return EINVAL;
+ }
+
+ if (totp_serial.empty()) {
+ cerr << "ERROR: TOTP device serial number was not provided (via --totp-serial)" << std::endl;
+ return EINVAL;
+ }
+
+ if (totp_pin.size() != 2) {
+ cerr << "ERROR: missing two --totp-pin params (--totp-pin=<first> --totp-pin=<second>)" << std::endl;
+ return EINVAL;
+ }
+
+ rados::cls::otp::otp_info_t config;
+ int ret = static_cast<rgw::sal::RadosStore*>(driver)->svc()->cls->mfa.get_mfa(dpp(), user->get_id(), totp_serial, &config, null_yield);
+ if (ret < 0) {
+ if (ret == -ENOENT || ret == -ENODATA) {
+ cerr << "MFA serial id not found" << std::endl;
+ } else {
+ cerr << "MFA retrieval failed, error: " << cpp_strerror(-ret) << std::endl;
+ }
+ return -ret;
+ }
+
+ ceph::real_time now;
+
+ ret = static_cast<rgw::sal::RadosStore*>(driver)->svc()->cls->mfa.otp_get_current_time(dpp(), user->get_id(), &now, null_yield);
+ if (ret < 0) {
+ cerr << "ERROR: failed to fetch current time from osd: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ time_t time_ofs;
+
+ ret = scan_totp(driver->ctx(), now, config, totp_pin, &time_ofs);
+ if (ret < 0) {
+ if (ret == -ENOENT) {
+ cerr << "failed to resync, TOTP values not found in range" << std::endl;
+ } else {
+ cerr << "ERROR: failed to scan for TOTP values: " << cpp_strerror(-ret) << std::endl;
+ }
+ return -ret;
+ }
+
+ config.time_ofs = time_ofs;
+
+ /* now update the backend */
+ real_time mtime = real_clock::now();
+
+ ret = static_cast<rgw::sal::RadosStore*>(driver)->ctl()->meta.mgr->mutate(RGWSI_MetaBackend_OTP::get_meta_key(user->get_id()),
+ mtime, &objv_tracker,
+ null_yield, dpp(),
+ MDLOG_STATUS_WRITE,
+ [&] {
+ return static_cast<rgw::sal::RadosStore*>(driver)->svc()->cls->mfa.create_mfa(dpp(), user->get_id(), config, &objv_tracker, mtime, null_yield);
+ });
+ if (ret < 0) {
+ cerr << "MFA update failed, error: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ }
+
+ if (opt_cmd == OPT::RESHARD_STALE_INSTANCES_LIST) {
+ if (!static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone->can_reshard() && !yes_i_really_mean_it) {
+ cerr << "Resharding disabled in a multisite env, stale instances unlikely from resharding" << std::endl;
+ cerr << "These instances may not be safe to delete." << std::endl;
+ cerr << "Use --yes-i-really-mean-it to force displaying these instances." << std::endl;
+ return EINVAL;
+ }
+
+ ret = RGWBucketAdminOp::list_stale_instances(driver, bucket_op, stream_flusher, dpp());
+ if (ret < 0) {
+ cerr << "ERROR: listing stale instances" << cpp_strerror(-ret) << std::endl;
+ }
+ }
+
+ if (opt_cmd == OPT::RESHARD_STALE_INSTANCES_DELETE) {
+ if (!static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone->can_reshard()) {
+ cerr << "Resharding disabled in a multisite env. Stale instances are not safe to be deleted." << std::endl;
+ return EINVAL;
+ }
+
+ ret = RGWBucketAdminOp::clear_stale_instances(driver, bucket_op, stream_flusher, dpp());
+ if (ret < 0) {
+ cerr << "ERROR: deleting stale instances" << cpp_strerror(-ret) << std::endl;
+ }
+ }
+
+ if (opt_cmd == OPT::PUBSUB_NOTIFICATION_LIST) {
+ if (bucket_name.empty()) {
+ cerr << "ERROR: bucket name was not provided (via --bucket)" << std::endl;
+ return EINVAL;
+ }
+
+ RGWPubSub ps(driver, tenant);
+
+ rgw_pubsub_bucket_topics result;
+ int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket);
+ if (ret < 0) {
+ cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ const RGWPubSub::Bucket b(ps, bucket.get());
+ ret = b.get_topics(dpp(), result, null_yield);
+ if (ret < 0 && ret != -ENOENT) {
+ cerr << "ERROR: could not get topics: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ encode_json("result", result, formatter.get());
+ formatter->flush(cout);
+ }
+
+ if (opt_cmd == OPT::PUBSUB_TOPIC_LIST) {
+ RGWPubSub ps(driver, tenant);
+
+ rgw_pubsub_topics result;
+ int ret = ps.get_topics(dpp(), result, null_yield);
+ if (ret < 0 && ret != -ENOENT) {
+ cerr << "ERROR: could not get topics: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ encode_json("result", result, formatter.get());
+ formatter->flush(cout);
+ }
+
+ if (opt_cmd == OPT::PUBSUB_TOPIC_GET) {
+ if (topic_name.empty()) {
+ cerr << "ERROR: topic name was not provided (via --topic)" << std::endl;
+ return EINVAL;
+ }
+
+ RGWPubSub ps(driver, tenant);
+
+ rgw_pubsub_topic topic;
+ ret = ps.get_topic(dpp(), topic_name, topic, null_yield);
+ if (ret < 0) {
+ cerr << "ERROR: could not get topic: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ encode_json("topic", topic, formatter.get());
+ formatter->flush(cout);
+ }
+
+ if (opt_cmd == OPT::PUBSUB_NOTIFICATION_GET) {
+ if (notification_id.empty()) {
+ cerr << "ERROR: notification-id was not provided (via --notification-id)" << std::endl;
+ return EINVAL;
+ }
+ if (bucket_name.empty()) {
+ cerr << "ERROR: bucket name was not provided (via --bucket)" << std::endl;
+ return EINVAL;
+ }
+
+ int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket);
+ if (ret < 0) {
+ cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ RGWPubSub ps(driver, tenant);
+
+ rgw_pubsub_bucket_topics bucket_topics;
+ const RGWPubSub::Bucket b(ps, bucket.get());
+ ret = b.get_topics(dpp(), bucket_topics, null_yield);
+ if (ret < 0 && ret != -ENOENT) {
+ cerr << "ERROR: could not get bucket notifications: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ rgw_pubsub_topic_filter bucket_topic;
+ ret = b.get_notification_by_id(dpp(), notification_id, bucket_topic, null_yield);
+ if (ret < 0) {
+ cerr << "ERROR: could not get notification: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ encode_json("notification", bucket_topic, formatter.get());
+ formatter->flush(cout);
+ }
+
+ if (opt_cmd == OPT::PUBSUB_TOPIC_RM) {
+ if (topic_name.empty()) {
+ cerr << "ERROR: topic name was not provided (via --topic)" << std::endl;
+ return EINVAL;
+ }
+
+ ret = rgw::notify::remove_persistent_topic(dpp(), static_cast<rgw::sal::RadosStore*>(driver)->getRados()->get_notif_pool_ctx(), topic_name, null_yield);
+ if (ret < 0) {
+ cerr << "ERROR: could not remove persistent topic: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ RGWPubSub ps(driver, tenant);
+
+ ret = ps.remove_topic(dpp(), topic_name, null_yield);
+ if (ret < 0) {
+ cerr << "ERROR: could not remove topic: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+ }
+
+ if (opt_cmd == OPT::PUBSUB_NOTIFICATION_RM) {
+ if (bucket_name.empty()) {
+ cerr << "ERROR: bucket name was not provided (via --bucket)" << std::endl;
+ return EINVAL;
+ }
+
+ int ret = init_bucket(user.get(), tenant, bucket_name, bucket_id, &bucket);
+ if (ret < 0) {
+ cerr << "ERROR: could not init bucket: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ RGWPubSub ps(driver, tenant);
+
+ rgw_pubsub_bucket_topics bucket_topics;
+ const RGWPubSub::Bucket b(ps, bucket.get());
+ ret = b.get_topics(dpp(), bucket_topics, null_yield);
+ if (ret < 0 && ret != -ENOENT) {
+ cerr << "ERROR: could not get bucket notifications: " << cpp_strerror(-ret) << std::endl;
+ return -ret;
+ }
+
+ rgw_pubsub_topic_filter bucket_topic;
+ if(notification_id.empty()) {
+ ret = b.remove_notifications(dpp(), null_yield);
+ } else {
+ ret = b.remove_notification_by_id(dpp(), notification_id, null_yield);
+ }
+ }
+
+ if (opt_cmd == OPT::SCRIPT_PUT) {
+ if (!str_script_ctx) {
+ cerr << "ERROR: context was not provided (via --context)" << std::endl;
+ return EINVAL;
+ }
+ if (infile.empty()) {
+ cerr << "ERROR: infile was not provided (via --infile)" << std::endl;
+ return EINVAL;
+ }
+ bufferlist bl;
+ auto rc = read_input(infile, bl);
+ if (rc < 0) {
+ cerr << "ERROR: failed to read script: '" << infile << "'. error: " << rc << std::endl;
+ return -rc;
+ }
+ const std::string script = bl.to_str();
+ std::string err_msg;
+ if (!rgw::lua::verify(script, err_msg)) {
+ cerr << "ERROR: script: '" << infile << "' has error: " << std::endl << err_msg << std::endl;
+ return EINVAL;
+ }
+ const rgw::lua::context script_ctx = rgw::lua::to_context(*str_script_ctx);
+ if (script_ctx == rgw::lua::context::none) {
+ cerr << "ERROR: invalid script context: " << *str_script_ctx << ". must be one of: " << LUA_CONTEXT_LIST << std::endl;
+ return EINVAL;
+ }
+ if (script_ctx == rgw::lua::context::background && !tenant.empty()) {
+ cerr << "ERROR: cannot specify tenant in background context" << std::endl;
+ return EINVAL;
+ }
+ auto lua_manager = driver->get_lua_manager();
+ rc = rgw::lua::write_script(dpp(), lua_manager.get(), tenant, null_yield, script_ctx, script);
+ if (rc < 0) {
+ cerr << "ERROR: failed to put script. error: " << rc << std::endl;
+ return -rc;
+ }
+ }
+
+ if (opt_cmd == OPT::SCRIPT_GET) {
+ if (!str_script_ctx) {
+ cerr << "ERROR: context was not provided (via --context)" << std::endl;
+ return EINVAL;
+ }
+ const rgw::lua::context script_ctx = rgw::lua::to_context(*str_script_ctx);
+ if (script_ctx == rgw::lua::context::none) {
+ cerr << "ERROR: invalid script context: " << *str_script_ctx << ". must be one of: " << LUA_CONTEXT_LIST << std::endl;
+ return EINVAL;
+ }
+ auto lua_manager = driver->get_lua_manager();
+ std::string script;
+ const auto rc = rgw::lua::read_script(dpp(), lua_manager.get(), tenant, null_yield, script_ctx, script);
+ if (rc == -ENOENT) {
+ std::cout << "no script exists for context: " << *str_script_ctx <<
+ (tenant.empty() ? "" : (" in tenant: " + tenant)) << std::endl;
+ } else if (rc < 0) {
+ cerr << "ERROR: failed to read script. error: " << rc << std::endl;
+ return -rc;
+ } else {
+ std::cout << script << std::endl;
+ }
+ }
+
+ if (opt_cmd == OPT::SCRIPT_RM) {
+ if (!str_script_ctx) {
+ cerr << "ERROR: context was not provided (via --context)" << std::endl;
+ return EINVAL;
+ }
+ const rgw::lua::context script_ctx = rgw::lua::to_context(*str_script_ctx);
+ if (script_ctx == rgw::lua::context::none) {
+ cerr << "ERROR: invalid script context: " << *str_script_ctx << ". must be one of: " << LUA_CONTEXT_LIST << std::endl;
+ return EINVAL;
+ }
+ auto lua_manager = driver->get_lua_manager();
+ const auto rc = rgw::lua::delete_script(dpp(), lua_manager.get(), tenant, null_yield, script_ctx);
+ if (rc < 0) {
+ cerr << "ERROR: failed to remove script. error: " << rc << std::endl;
+ return -rc;
+ }
+ }
+
+ if (opt_cmd == OPT::SCRIPT_PACKAGE_ADD) {
+#ifdef WITH_RADOSGW_LUA_PACKAGES
+ if (!script_package) {
+ cerr << "ERROR: lua package name was not provided (via --package)" << std::endl;
+ return EINVAL;
+ }
+ const auto rc = rgw::lua::add_package(dpp(), driver, null_yield, *script_package, bool(allow_compilation));
+ if (rc < 0) {
+ cerr << "ERROR: failed to add lua package: " << script_package << " .error: " << rc << std::endl;
+ return -rc;
+ }
+#else
+ cerr << "ERROR: adding lua packages is not permitted" << std::endl;
+ return EPERM;
+#endif
+ }
+
+ if (opt_cmd == OPT::SCRIPT_PACKAGE_RM) {
+#ifdef WITH_RADOSGW_LUA_PACKAGES
+ if (!script_package) {
+ cerr << "ERROR: lua package name was not provided (via --package)" << std::endl;
+ return EINVAL;
+ }
+ const auto rc = rgw::lua::remove_package(dpp(), driver, null_yield, *script_package);
+ if (rc == -ENOENT) {
+ cerr << "WARNING: package " << script_package << " did not exists or already removed" << std::endl;
+ return 0;
+ }
+ if (rc < 0) {
+ cerr << "ERROR: failed to remove lua package: " << script_package << " .error: " << rc << std::endl;
+ return -rc;
+ }
+#else
+ cerr << "ERROR: removing lua packages in not permitted" << std::endl;
+ return EPERM;
+#endif
+ }
+
+ if (opt_cmd == OPT::SCRIPT_PACKAGE_LIST) {
+#ifdef WITH_RADOSGW_LUA_PACKAGES
+ rgw::lua::packages_t packages;
+ const auto rc = rgw::lua::list_packages(dpp(), driver, null_yield, packages);
+ if (rc == -ENOENT) {
+ std::cout << "no lua packages in allowlist" << std::endl;
+ } else if (rc < 0) {
+ cerr << "ERROR: failed to read lua packages allowlist. error: " << rc << std::endl;
+ return rc;
+ } else {
+ for (const auto& package : packages) {
+ std::cout << package << std::endl;
+ }
+ }
+#else
+ cerr << "ERROR: listing lua packages in not permitted" << std::endl;
+ return EPERM;
+#endif
+ }
+
+ return 0;
+}
+
diff --git a/src/rgw/rgw_aio.cc b/src/rgw/rgw_aio.cc
new file mode 100644
index 000000000..4fba513b8
--- /dev/null
+++ b/src/rgw/rgw_aio.cc
@@ -0,0 +1,138 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <type_traits>
+#include "include/rados/librados.hpp"
+#include "librados/librados_asio.h"
+
+#include "rgw_aio.h"
+#include "rgw_d3n_cacherequest.h"
+
+namespace rgw {
+
+namespace {
+
+// forward declaration: registered as the librados completion callback below
+void cb(librados::completion_t, void* arg);
+
+// Per-operation bookkeeping for the blocking (non-coroutine) librados path.
+// Instances are placement-constructed into AioResult::user_data (see the
+// static_asserts in cb() for the size/triviality requirements), so no heap
+// allocation is needed per operation.
+struct state {
+  Aio* aio;                   // owning throttle; notified via put() on completion
+  librados::AioCompletion* c; // rados completion; released in cb() or on submit error
+
+  // the AioResult's address is passed as the callback argument so cb() can
+  // recover both the result slot and this state object (stored inside it)
+  state(Aio* aio, AioResult& r)
+    : aio(aio),
+      c(librados::Rados::aio_create_completion(&r, &cb)) {}
+};
+
+// librados completion callback: recovers the state object embedded in
+// AioResult::user_data, records the operation's return value, releases the
+// completion, and returns the result to the throttle via Aio::put().
+void cb(librados::completion_t, void* arg) {
+  // state must fit in the user_data buffer and must not need a destructor,
+  // since it is placement-constructed there and never explicitly destroyed
+  static_assert(sizeof(AioResult::user_data) >= sizeof(state));
+  static_assert(std::is_trivially_destructible_v<state>);
+  auto& r = *(static_cast<AioResult*>(arg));
+  auto s = reinterpret_cast<state*>(&r.user_data);
+  r.result = s->c->get_return_value();
+  s->c->release();
+  s->aio->put(r); // hand the finished result back; may wake a waiter
+}
+
+// Wrap a librados ObjectRead/WriteOperation in an Aio::OpFunc for the
+// blocking (no optional_yield) path. The returned closure owns the op and,
+// when invoked, submits it via AioCompletion with cb() as the callback.
+template <typename Op>
+Aio::OpFunc aio_abstract(Op&& op) {
+  return [op = std::move(op)] (Aio* aio, AioResult& r) mutable {
+    // reads need the extra bufferlist out-param; decided at compile time
+    constexpr bool read = std::is_same_v<std::decay_t<Op>, librados::ObjectReadOperation>;
+    // placement-construct the completion state into r.user_data (no alloc)
+    auto s = new (&r.user_data) state(aio, r);
+    if constexpr (read) {
+      r.result = r.obj.aio_operate(s->c, &op, &r.data);
+    } else {
+      r.result = r.obj.aio_operate(s->c, &op);
+    }
+    if (r.result < 0) {
+      // submission itself failed: cb() will never fire, so release the
+      // completion and report the error to the throttle directly
+      s->c->release();
+      aio->put(r);
+    }
+  };
+}
+
+// Completion handler for the asio/coroutine path (librados::async_operate).
+// Converts the boost error_code into a negative errno in r.result and hands
+// the result back to the throttle. Two call operators let it serve both
+// write (no data) and read (data) completions.
+struct Handler {
+  Aio* throttle = nullptr;
+  AioResult& r;
+  // write callback
+  void operator()(boost::system::error_code ec) const {
+    r.result = -ec.value();
+    throttle->put(r);
+  }
+  // read callback
+  void operator()(boost::system::error_code ec, bufferlist bl) const {
+    r.result = -ec.value();
+    r.data = std::move(bl); // deliver the read payload to the caller
+    throttle->put(r);
+  }
+};
+
+// Wrap an object operation in an Aio::OpFunc for the coroutine path. The
+// async_completion is used only to obtain the executor associated with the
+// caller's yield_context; the operation itself completes via Handler.
+template <typename Op>
+Aio::OpFunc aio_abstract(Op&& op, boost::asio::io_context& context,
+                         yield_context yield) {
+  return [op = std::move(op), &context, yield] (Aio* aio, AioResult& r) mutable {
+    // arrange for the completion Handler to run on the yield_context's strand
+    // executor so it can safely call back into Aio without locking
+    using namespace boost::asio;
+    async_completion<yield_context, void()> init(yield);
+    auto ex = get_associated_executor(init.completion_handler);
+
+    auto& ref = r.obj.get_ref();
+    // fire-and-forget submit; Handler{aio, r} reports the result on 'ex'
+    librados::async_operate(context, ref.pool.ioctx(), ref.obj.oid, &op, 0,
+                            bind_executor(ex, Handler{aio, r}));
+  };
+}
+
+
+// Build an OpFunc that reads [read_ofs, read_ofs+read_len) from a D3N local
+// cache file instead of RADOS. Requires a coroutine context: the assert
+// fires if called without optional_yield engaged.
+Aio::OpFunc d3n_cache_aio_abstract(const DoutPrefixProvider *dpp, optional_yield y, off_t read_ofs, off_t read_len, std::string& cache_location) {
+  return [dpp, y, read_ofs, read_len, cache_location] (Aio* aio, AioResult& r) mutable {
+    // d3n data cache requires yield context (rgw_beast_enable_async=true)
+    ceph_assert(y);
+    auto& ref = r.obj.get_ref();
+    // NOTE(review): the request object is released into the libaio read;
+    // presumably D3nL1CacheRequest manages its own lifetime — confirm in
+    // rgw_d3n_cacherequest.h
+    auto c = std::make_unique<D3nL1CacheRequest>();
+    lsubdout(g_ceph_context, rgw_datacache, 20) << "D3nDataCache: d3n_cache_aio_abstract(): libaio Read From Cache, oid=" << ref.obj.oid << dendl;
+    c->file_aio_read_abstract(dpp, y.get_io_context(), y.get_yield_context(), cache_location, read_ofs, read_len, aio, r);
+  };
+}
+
+
+// Dispatcher: choose the coroutine overload when a yield context is
+// available, otherwise the blocking AioCompletion overload. The
+// static_asserts enforce that callers hand over ownership of a mutable
+// rvalue derived from librados::ObjectOperation.
+template <typename Op>
+Aio::OpFunc aio_abstract(Op&& op, optional_yield y) {
+  static_assert(std::is_base_of_v<librados::ObjectOperation, std::decay_t<Op>>);
+  static_assert(!std::is_lvalue_reference_v<Op>);
+  static_assert(!std::is_const_v<Op>);
+  if (y) {
+    return aio_abstract(std::forward<Op>(op), y.get_io_context(),
+                        y.get_yield_context());
+  }
+  return aio_abstract(std::forward<Op>(op));
+}
+
+} // anonymous namespace
+
+// Public factories: thin wrappers that forward to the anonymous-namespace
+// aio_abstract() overloads above, keeping the template machinery out of the
+// header.
+Aio::OpFunc Aio::librados_op(librados::ObjectReadOperation&& op,
+                             optional_yield y) {
+  return aio_abstract(std::move(op), y);
+}
+Aio::OpFunc Aio::librados_op(librados::ObjectWriteOperation&& op,
+                             optional_yield y) {
+  return aio_abstract(std::move(op), y);
+}
+
+// D3N cache read variant; see d3n_cache_aio_abstract() for constraints
+Aio::OpFunc Aio::d3n_cache_op(const DoutPrefixProvider *dpp, optional_yield y,
+                              off_t read_ofs, off_t read_len, std::string& cache_location) {
+  return d3n_cache_aio_abstract(dpp, y, read_ofs, read_len, cache_location);
+}
+
+} // namespace rgw
diff --git a/src/rgw/rgw_aio.h b/src/rgw/rgw_aio.h
new file mode 100644
index 000000000..a2c539c17
--- /dev/null
+++ b/src/rgw/rgw_aio.h
@@ -0,0 +1,104 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <type_traits>
+
+#include <boost/intrusive/list.hpp>
+#include "include/rados/librados_fwd.hpp"
+#include "common/async/yield_context.h"
+
+#include "services/svc_rados.h" // cant forward declare RGWSI_RADOS::Obj
+
+#include "rgw_common.h"
+
+#include "include/function2.hpp"
+
+struct D3nGetObjData;
+
+namespace rgw {
+
+// Result record for one async rados operation. Non-copyable/non-movable
+// because completion callbacks hold its address until the op finishes.
+struct AioResult {
+  RGWSI_RADOS::Obj obj;
+  uint64_t id = 0; // id allows caller to associate a result with its request
+  bufferlist data; // result buffer for reads
+  int result = 0;  // 0 on success, negative errno on failure
+  // raw scratch storage (three pointers wide) used by the completion
+  // implementations in rgw_aio.cc to placement-construct per-op state
+  std::aligned_storage_t<3 * sizeof(void*)> user_data;
+
+  AioResult() = default;
+  AioResult(const AioResult&) = delete;
+  AioResult& operator =(const AioResult&) = delete;
+  AioResult(AioResult&&) = delete;
+  AioResult& operator =(AioResult&&) = delete;
+};
+// AioResult that can live on an intrusive list; virtual dtor so OwningList
+// can delete derived entries through this base
+struct AioResultEntry : AioResult, boost::intrusive::list_base_hook<> {
+  virtual ~AioResultEntry() {}
+};
+// a list of polymorphic entries that frees them on destruction
+// (boost::intrusive lists do not own their elements by default, so this
+// wrapper deletes whatever is still linked when the list goes away)
+template <typename T, typename ...Args>
+struct OwningList : boost::intrusive::list<T, Args...> {
+  OwningList() = default;
+  ~OwningList() { this->clear_and_dispose(std::default_delete<T>{}); }
+  OwningList(OwningList&&) = default;
+  OwningList& operator=(OwningList&&) = default;
+  OwningList(const OwningList&) = delete;
+  OwningList& operator=(const OwningList&) = delete;
+};
+using AioResultList = OwningList<AioResultEntry>;
+
+// returns the first error code or 0 if all succeeded
+// (scan order is list order, i.e. completion order)
+inline int check_for_errors(const AioResultList& results) {
+  for (auto& e : results) {
+    if (e.result < 0) {
+      return e.result;
+    }
+  }
+  return 0;
+}
+
+// interface to submit async librados operations and wait on their completions.
+// each call returns a list of results from prior completions
+class Aio {
+ public:
+  // move-only callable that submits one operation against (Aio*, AioResult&);
+  // built by the librados_op()/d3n_cache_op() factories below
+  using OpFunc = fu2::unique_function<void(Aio*, AioResult&) &&>;
+
+  virtual ~Aio() {}
+
+  // submit an operation of the given throttle cost; may block until capacity
+  // is available. returns any completions that finished in the meantime
+  virtual AioResultList get(const RGWSI_RADOS::Obj& obj,
+                            OpFunc&& f,
+                            uint64_t cost, uint64_t id) = 0;
+  // called by completion handlers to report a finished result
+  virtual void put(AioResult& r) = 0;
+
+  // poll for any ready completions without waiting
+  virtual AioResultList poll() = 0;
+
+  // return any ready completions. if there are none, wait for the next
+  virtual AioResultList wait() = 0;
+
+  // wait for all outstanding completions and return their results
+  virtual AioResultList drain() = 0;
+
+  // factory functions for the OpFuncs accepted by get()
+  static OpFunc librados_op(librados::ObjectReadOperation&& op,
+                            optional_yield y);
+  static OpFunc librados_op(librados::ObjectWriteOperation&& op,
+                            optional_yield y);
+  static OpFunc d3n_cache_op(const DoutPrefixProvider *dpp, optional_yield y,
+                             off_t read_ofs, off_t read_len, std::string& location);
+};
+
+} // namespace rgw
diff --git a/src/rgw/rgw_aio_throttle.cc b/src/rgw/rgw_aio_throttle.cc
new file mode 100644
index 000000000..8ada6db34
--- /dev/null
+++ b/src/rgw/rgw_aio_throttle.cc
@@ -0,0 +1,202 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/rados/librados.hpp"
+
+#include "rgw_aio_throttle.h"
+
+namespace rgw {
+
+bool Throttle::waiter_ready() const
+{
+ switch (waiter) {
+ case Wait::Available: return is_available();
+ case Wait::Completion: return has_completion();
+ case Wait::Drained: return is_drained();
+ default: return false;
+ }
+}
+
AioResultList BlockingAioThrottle::get(const RGWSI_RADOS::Obj& obj,
                                       OpFunc&& f,
                                       uint64_t cost, uint64_t id)
{
  // heap-allocate the entry; ownership transfers to either the 'pending'
  // or 'completed' intrusive list before returning (see p.release())
  auto p = std::make_unique<Pending>();
  p->obj = obj;
  p->id = id;
  p->cost = cost;

  std::unique_lock lock{mutex};
  if (cost > window) {
    // a request larger than the whole window could never be satisfied,
    // so fail it immediately instead of blocking forever
    p->result = -EDEADLK; // would never succeed
    completed.push_back(*p);
  } else {
    // wait for the write size to become available
    pending_size += p->cost;
    if (!is_available()) {
      ceph_assert(waiter == Wait::None); // only one waiter at a time
      waiter = Wait::Available;
      cond.wait(lock, [this] { return is_available(); });
      waiter = Wait::None;
    }

    // register the pending write and attach a completion
    p->parent = this;
    pending.push_back(*p);
    // issue the operation without holding the lock: its completion may
    // invoke put() on another thread, which locks the mutex itself
    lock.unlock();
    std::move(f)(this, *static_cast<AioResult*>(p.get()));
    lock.lock();
  }
  // the intrusive list now owns the entry
  p.release();
  return std::move(completed);
}
+
+void BlockingAioThrottle::put(AioResult& r)
+{
+ auto& p = static_cast<Pending&>(r);
+ std::scoped_lock lock{mutex};
+
+ // move from pending to completed
+ pending.erase(pending.iterator_to(p));
+ completed.push_back(p);
+
+ pending_size -= p.cost;
+
+ if (waiter_ready()) {
+ cond.notify_one();
+ }
+}
+
+AioResultList BlockingAioThrottle::poll()
+{
+ std::unique_lock lock{mutex};
+ return std::move(completed);
+}
+
+AioResultList BlockingAioThrottle::wait()
+{
+ std::unique_lock lock{mutex};
+ if (completed.empty() && !pending.empty()) {
+ ceph_assert(waiter == Wait::None);
+ waiter = Wait::Completion;
+ cond.wait(lock, [this] { return has_completion(); });
+ waiter = Wait::None;
+ }
+ return std::move(completed);
+}
+
+AioResultList BlockingAioThrottle::drain()
+{
+ std::unique_lock lock{mutex};
+ if (!pending.empty()) {
+ ceph_assert(waiter == Wait::None);
+ waiter = Wait::Drained;
+ cond.wait(lock, [this] { return is_drained(); });
+ waiter = Wait::None;
+ }
+ return std::move(completed);
+}
+
// suspend the calling coroutine until put() posts to 'completion'.
// the token is the stored yield_context, so get() returns only after
// the throttle condition is signalled
template <typename CompletionToken>
auto YieldingAioThrottle::async_wait(CompletionToken&& token)
{
  using boost::asio::async_completion;
  using Signature = void(boost::system::error_code);
  // bind the token to a completion handler and stash it in 'completion';
  // posting that completion later resumes the coroutine
  async_completion<CompletionToken, Signature> init(token);
  completion = Completion::create(context.get_executor(),
                                  std::move(init.completion_handler));
  // blocks (yields) until the handler is invoked
  return init.result.get();
}
+
AioResultList YieldingAioThrottle::get(const RGWSI_RADOS::Obj& obj,
                                       OpFunc&& f,
                                       uint64_t cost, uint64_t id)
{
  // heap-allocate the entry; ownership transfers to either the 'pending'
  // or 'completed' intrusive list before returning (see p.release())
  auto p = std::make_unique<Pending>();
  p->obj = obj;
  p->id = id;
  p->cost = cost;

  if (cost > window) {
    // could never fit in the window; fail immediately instead of waiting
    p->result = -EDEADLK; // would never succeed
    completed.push_back(*p);
  } else {
    // wait for the write size to become available
    pending_size += p->cost;
    if (!is_available()) {
      ceph_assert(waiter == Wait::None); // only one waiter at a time
      ceph_assert(!completion);

      boost::system::error_code ec;
      waiter = Wait::Available;
      // suspends this coroutine; put() resumes it once capacity frees up
      // (put() also resets 'waiter' back to Wait::None)
      async_wait(yield[ec]);
    }

    // register the pending write and initiate the operation
    pending.push_back(*p);
    std::move(f)(this, *static_cast<AioResult*>(p.get()));
  }
  // the intrusive list now owns the entry
  p.release();
  return std::move(completed);
}
+
void YieldingAioThrottle::put(AioResult& r)
{
  // completions are always Pending entries created by get()
  auto& p = static_cast<Pending&>(r);

  // move from pending to completed
  pending.erase(pending.iterator_to(p));
  completed.push_back(p);

  pending_size -= p.cost;

  // if the waiter's condition is now satisfied, resume its coroutine by
  // posting the stored completion on the io_context's executor
  if (waiter_ready()) {
    ceph_assert(completion);
    ceph::async::post(std::move(completion), boost::system::error_code{});
    waiter = Wait::None;
  }
}
+
+AioResultList YieldingAioThrottle::poll()
+{
+ return std::move(completed);
+}
+
+AioResultList YieldingAioThrottle::wait()
+{
+ if (!has_completion() && !pending.empty()) {
+ ceph_assert(waiter == Wait::None);
+ ceph_assert(!completion);
+
+ boost::system::error_code ec;
+ waiter = Wait::Completion;
+ async_wait(yield[ec]);
+ }
+ return std::move(completed);
+}
+
+AioResultList YieldingAioThrottle::drain()
+{
+ if (!is_drained()) {
+ ceph_assert(waiter == Wait::None);
+ ceph_assert(!completion);
+
+ boost::system::error_code ec;
+ waiter = Wait::Drained;
+ async_wait(yield[ec]);
+ }
+ return std::move(completed);
+}
+} // namespace rgw
diff --git a/src/rgw/rgw_aio_throttle.h b/src/rgw/rgw_aio_throttle.h
new file mode 100644
index 000000000..30ae93cd6
--- /dev/null
+++ b/src/rgw/rgw_aio_throttle.h
@@ -0,0 +1,133 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "include/rados/librados_fwd.hpp"
+#include <memory>
+#include "common/ceph_mutex.h"
+#include "common/async/completion.h"
+#include "common/async/yield_context.h"
+#include "services/svc_rados.h"
+#include "rgw_aio.h"
+
+namespace rgw {
+
+class Throttle {
+ protected:
+ const uint64_t window;
+ uint64_t pending_size = 0;
+
+ AioResultList pending;
+ AioResultList completed;
+
+ bool is_available() const { return pending_size <= window; }
+ bool has_completion() const { return !completed.empty(); }
+ bool is_drained() const { return pending.empty(); }
+
+ enum class Wait { None, Available, Completion, Drained };
+ Wait waiter = Wait::None;
+
+ bool waiter_ready() const;
+
+ public:
+ Throttle(uint64_t window) : window(window) {}
+
+ virtual ~Throttle() {
+ // must drain before destructing
+ ceph_assert(pending.empty());
+ ceph_assert(completed.empty());
+ }
+};
+
+// a throttle for aio operations. all public functions must be called from
+// the same thread
+class BlockingAioThrottle final : public Aio, private Throttle {
+ ceph::mutex mutex = ceph::make_mutex("AioThrottle");
+ ceph::condition_variable cond;
+
+ struct Pending : AioResultEntry {
+ BlockingAioThrottle *parent = nullptr;
+ uint64_t cost = 0;
+ librados::AioCompletion *completion = nullptr;
+ };
+ public:
+ BlockingAioThrottle(uint64_t window) : Throttle(window) {}
+
+ virtual ~BlockingAioThrottle() override {};
+
+ AioResultList get(const RGWSI_RADOS::Obj& obj, OpFunc&& f,
+ uint64_t cost, uint64_t id) override final;
+
+ void put(AioResult& r) override final;
+
+ AioResultList poll() override final;
+
+ AioResultList wait() override final;
+
+ AioResultList drain() override final;
+};
+
+// a throttle that yields the coroutine instead of blocking. all public
+// functions must be called within the coroutine strand
+class YieldingAioThrottle final : public Aio, private Throttle {
+ boost::asio::io_context& context;
+ yield_context yield;
+ struct Handler;
+
+ // completion callback associated with the waiter
+ using Completion = ceph::async::Completion<void(boost::system::error_code)>;
+ std::unique_ptr<Completion> completion;
+
+ template <typename CompletionToken>
+ auto async_wait(CompletionToken&& token);
+
+ struct Pending : AioResultEntry { uint64_t cost = 0; };
+
+ public:
+ YieldingAioThrottle(uint64_t window, boost::asio::io_context& context,
+ yield_context yield)
+ : Throttle(window), context(context), yield(yield)
+ {}
+
+ virtual ~YieldingAioThrottle() override {};
+
+ AioResultList get(const RGWSI_RADOS::Obj& obj, OpFunc&& f,
+ uint64_t cost, uint64_t id) override final;
+
+ void put(AioResult& r) override final;
+
+ AioResultList poll() override final;
+
+ AioResultList wait() override final;
+
+ AioResultList drain() override final;
+};
+
+// return a smart pointer to Aio
+inline auto make_throttle(uint64_t window_size, optional_yield y)
+{
+ std::unique_ptr<Aio> aio;
+ if (y) {
+ aio = std::make_unique<YieldingAioThrottle>(window_size,
+ y.get_io_context(),
+ y.get_yield_context());
+ } else {
+ aio = std::make_unique<BlockingAioThrottle>(window_size);
+ }
+ return aio;
+}
+
+} // namespace rgw
diff --git a/src/rgw/rgw_amqp.cc b/src/rgw/rgw_amqp.cc
new file mode 100644
index 000000000..3014edd1d
--- /dev/null
+++ b/src/rgw/rgw_amqp.cc
@@ -0,0 +1,1051 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_amqp.h"
+#include <amqp.h>
+#include <amqp_ssl_socket.h>
+#include <amqp_tcp_socket.h>
+#include <amqp_framing.h>
+#include "include/ceph_assert.h"
+#include <sstream>
+#include <cstring>
+#include <unordered_map>
+#include <string>
+#include <vector>
+#include <thread>
+#include <atomic>
+#include <mutex>
+#include <boost/lockfree/queue.hpp>
+#include <boost/functional/hash.hpp>
+#include "common/dout.h"
+#include <openssl/ssl.h>
+
+#define dout_subsys ceph_subsys_rgw
+
+// TODO investigation, not necessarily issues:
+// (1) in case of single threaded writer context use spsc_queue
+// (2) support multiple channels
+// (3) check performance of emptying queue to local list, and go over the list and publish
+// (4) use std::shared_mutex (c++17) or equivalent for the connections lock
+
+namespace rgw::amqp {
+
+// RGW AMQP status codes for publishing
+static const int RGW_AMQP_STATUS_BROKER_NACK = -0x1001;
+static const int RGW_AMQP_STATUS_CONNECTION_CLOSED = -0x1002;
+static const int RGW_AMQP_STATUS_QUEUE_FULL = -0x1003;
+static const int RGW_AMQP_STATUS_MAX_INFLIGHT = -0x1004;
+static const int RGW_AMQP_STATUS_MANAGER_STOPPED = -0x1005;
+// RGW AMQP status code for connection opening
+static const int RGW_AMQP_STATUS_CONN_ALLOC_FAILED = -0x2001;
+static const int RGW_AMQP_STATUS_SOCKET_ALLOC_FAILED = -0x2002;
+static const int RGW_AMQP_STATUS_SOCKET_OPEN_FAILED = -0x2003;
+static const int RGW_AMQP_STATUS_LOGIN_FAILED = -0x2004;
+static const int RGW_AMQP_STATUS_CHANNEL_OPEN_FAILED = -0x2005;
+static const int RGW_AMQP_STATUS_VERIFY_EXCHANGE_FAILED = -0x2006;
+static const int RGW_AMQP_STATUS_Q_DECLARE_FAILED = -0x2007;
+static const int RGW_AMQP_STATUS_CONFIRM_DECLARE_FAILED = -0x2008;
+static const int RGW_AMQP_STATUS_CONSUME_DECLARE_FAILED = -0x2009;
+static const int RGW_AMQP_STATUS_SOCKET_CACERT_FAILED = -0x2010;
+
+static const int RGW_AMQP_RESPONSE_SOCKET_ERROR = -0x3008;
+static const int RGW_AMQP_NO_REPLY_CODE = 0x0;
+
+// the amqp_connection_info struct does not hold any memory and just points to the URL string
+// so, strings are copied into connection_id_t
// deep-copy the fields used as the map key, since 'info' only points
// into the parsed URL buffer
connection_id_t::connection_id_t(const amqp_connection_info& info, const std::string& _exchange)
  : host(info.host), port(info.port), vhost(info.vhost), exchange(_exchange), ssl(info.ssl) {}
+
+// equality operator and hasher functor are needed
+// so that connection_id_t could be used as key in unordered_map
+bool operator==(const connection_id_t& lhs, const connection_id_t& rhs) {
+ return lhs.host == rhs.host && lhs.port == rhs.port &&
+ lhs.vhost == rhs.vhost && lhs.exchange == rhs.exchange;
+}
+
struct connection_id_hasher {
  std::size_t operator()(const connection_id_t& k) const {
    // combine exactly the fields that operator== compares, so equal keys
    // always hash alike (ssl is excluded from both)
    std::size_t h = 0;
    boost::hash_combine(h, k.host);
    boost::hash_combine(h, k.port);
    boost::hash_combine(h, k.vhost);
    boost::hash_combine(h, k.exchange);
    return h;
  }
};
+
+std::string to_string(const connection_id_t& id) {
+ return fmt::format("{}://{}:{}{}?exchange={}",
+ id.ssl ? "amqps" : "amqp",
+ id.host, id.port, id.vhost, id.exchange);
+}
+
+// automatically cleans amqp state when gets out of scope
+class ConnectionCleaner {
+ private:
+ amqp_connection_state_t state;
+ public:
+ ConnectionCleaner(amqp_connection_state_t _state) : state(_state) {}
+ ~ConnectionCleaner() {
+ if (state) {
+ amqp_destroy_connection(state);
+ }
+ }
+ // call reset() if cleanup is not needed anymore
+ void reset() {
+ state = nullptr;
+ }
+};
+
+// struct for holding the callback and its tag in the callback list
+struct reply_callback_with_tag_t {
+ uint64_t tag;
+ reply_callback_t cb;
+
+ reply_callback_with_tag_t(uint64_t _tag, reply_callback_t _cb) : tag(_tag), cb(_cb) {}
+
+ bool operator==(uint64_t rhs) {
+ return tag == rhs;
+ }
+};
+
+typedef std::vector<reply_callback_with_tag_t> CallbackList;
+
+// struct for holding the connection state object as well as the exchange
struct connection_t {
  CephContext* cct = nullptr;
  amqp_connection_state_t state = nullptr;
  // broker-allocated queue on which publish confirmations are received
  amqp_bytes_t reply_to_queue = amqp_empty_bytes;
  // next delivery tag to hand out; tags match broker acks to callbacks
  uint64_t delivery_tag = 1;
  int status = AMQP_STATUS_OK;
  int reply_type = AMQP_RESPONSE_NORMAL;
  int reply_code = RGW_AMQP_NO_REPLY_CODE;
  // callbacks awaiting a broker ack/nack, in delivery-tag order
  CallbackList callbacks;
  ceph::coarse_real_clock::time_point next_reconnect = ceph::coarse_real_clock::now();
  bool mandatory = false;
  const bool use_ssl = false;
  std::string user;
  std::string password;
  bool verify_ssl = true;
  boost::optional<std::string> ca_location;
  // updated on every publish; used for idle-connection cleanup
  utime_t timestamp = ceph_clock_now();

  connection_t(CephContext* _cct, const amqp_connection_info& info, bool _verify_ssl, boost::optional<const std::string&> _ca_location) :
    cct(_cct), use_ssl(info.ssl), user(info.user), password(info.password), verify_ssl(_verify_ssl), ca_location(_ca_location) {}

  // cleanup of all internal connection resource
  // the object can still remain, and internal connection
  // resources created again on successful reconnection
  void destroy(int s) {
    status = s;
    ConnectionCleaner clean_state(state);
    state = nullptr;
    amqp_bytes_free(reply_to_queue);
    reply_to_queue = amqp_empty_bytes;
    // fire all remaining callbacks with the failure status
    std::for_each(callbacks.begin(), callbacks.end(), [this](auto& cb_tag) {
        cb_tag.cb(status);
        ldout(cct, 20) << "AMQP destroy: invoking callback with tag=" << cb_tag.tag << dendl;
      });
    callbacks.clear();
    delivery_tag = 1;
  }

  // a connection is usable iff its amqp state object exists
  bool is_ok() const {
    return (state != nullptr);
  }

  // dtor also destroys the internals
  ~connection_t() {
    destroy(RGW_AMQP_STATUS_CONNECTION_CLOSED);
  }
};
+
+// convert connection info to string
+std::string to_string(const amqp_connection_info& info) {
+ std::stringstream ss;
+ ss << "connection info:" <<
+ "\nHost: " << info.host <<
+ "\nPort: " << info.port <<
+ "\nUser: " << info.user <<
+ "\nPassword: " << info.password <<
+ "\nvhost: " << info.vhost <<
+ "\nSSL support: " << info.ssl << std::endl;
+ return ss.str();
+}
+
+// convert reply to error code
+int reply_to_code(const amqp_rpc_reply_t& reply) {
+ switch (reply.reply_type) {
+ case AMQP_RESPONSE_NONE:
+ case AMQP_RESPONSE_NORMAL:
+ return RGW_AMQP_NO_REPLY_CODE;
+ case AMQP_RESPONSE_LIBRARY_EXCEPTION:
+ return reply.library_error;
+ case AMQP_RESPONSE_SERVER_EXCEPTION:
+ if (reply.reply.decoded) {
+ const amqp_connection_close_t* m = (amqp_connection_close_t*)reply.reply.decoded;
+ return m->reply_code;
+ }
+ return reply.reply.id;
+ }
+ return RGW_AMQP_NO_REPLY_CODE;
+}
+
+// convert reply to string
std::string to_string(const amqp_rpc_reply_t& reply) {
  std::stringstream ss;
  switch (reply.reply_type) {
    case AMQP_RESPONSE_NORMAL:
      return "";
    case AMQP_RESPONSE_NONE:
      return "missing RPC reply type";
    case AMQP_RESPONSE_LIBRARY_EXCEPTION:
      return amqp_error_string2(reply.library_error);
    case AMQP_RESPONSE_SERVER_EXCEPTION:
      {
        // prefix with the method class that carried the error
        switch (reply.reply.id) {
          case AMQP_CONNECTION_CLOSE_METHOD:
            ss << "server connection error: ";
            break;
          case AMQP_CHANNEL_CLOSE_METHOD:
            ss << "server channel error: ";
            break;
          default:
            ss << "server unknown error: ";
            break;
        }
        // NOTE(review): the decoded payload is read as
        // amqp_connection_close_t even for channel errors — presumably
        // the fields used here overlap; confirm against amqp_framing.h
        if (reply.reply.decoded) {
          amqp_connection_close_t* m = (amqp_connection_close_t*)reply.reply.decoded;
          ss << m->reply_code << " text: " << std::string((char*)m->reply_text.bytes, m->reply_text.len);
        }
        return ss.str();
      }
    default:
      ss << "unknown error, method id: " << reply.reply.id;
      return ss.str();
  }
}
+
+// convert status enum to string
std::string to_string(amqp_status_enum s) {
  // 1:1 mapping from librabbitmq status codes to their symbolic names,
  // used for log messages
  switch (s) {
    case AMQP_STATUS_OK:
      return "AMQP_STATUS_OK";
    case AMQP_STATUS_NO_MEMORY:
      return "AMQP_STATUS_NO_MEMORY";
    case AMQP_STATUS_BAD_AMQP_DATA:
      return "AMQP_STATUS_BAD_AMQP_DATA";
    case AMQP_STATUS_UNKNOWN_CLASS:
      return "AMQP_STATUS_UNKNOWN_CLASS";
    case AMQP_STATUS_UNKNOWN_METHOD:
      return "AMQP_STATUS_UNKNOWN_METHOD";
    case AMQP_STATUS_HOSTNAME_RESOLUTION_FAILED:
      return "AMQP_STATUS_HOSTNAME_RESOLUTION_FAILED";
    case AMQP_STATUS_INCOMPATIBLE_AMQP_VERSION:
      return "AMQP_STATUS_INCOMPATIBLE_AMQP_VERSION";
    case AMQP_STATUS_CONNECTION_CLOSED:
      return "AMQP_STATUS_CONNECTION_CLOSED";
    case AMQP_STATUS_BAD_URL:
      return "AMQP_STATUS_BAD_URL";
    case AMQP_STATUS_SOCKET_ERROR:
      return "AMQP_STATUS_SOCKET_ERROR";
    case AMQP_STATUS_INVALID_PARAMETER:
      return "AMQP_STATUS_INVALID_PARAMETER";
    case AMQP_STATUS_TABLE_TOO_BIG:
      return "AMQP_STATUS_TABLE_TOO_BIG";
    case AMQP_STATUS_WRONG_METHOD:
      return "AMQP_STATUS_WRONG_METHOD";
    case AMQP_STATUS_TIMEOUT:
      return "AMQP_STATUS_TIMEOUT";
    case AMQP_STATUS_TIMER_FAILURE:
      return "AMQP_STATUS_TIMER_FAILURE";
    case AMQP_STATUS_HEARTBEAT_TIMEOUT:
      return "AMQP_STATUS_HEARTBEAT_TIMEOUT";
    case AMQP_STATUS_UNEXPECTED_STATE:
      return "AMQP_STATUS_UNEXPECTED_STATE";
    case AMQP_STATUS_SOCKET_CLOSED:
      return "AMQP_STATUS_SOCKET_CLOSED";
    case AMQP_STATUS_SOCKET_INUSE:
      return "AMQP_STATUS_SOCKET_INUSE";
    case AMQP_STATUS_BROKER_UNSUPPORTED_SASL_METHOD:
      return "AMQP_STATUS_BROKER_UNSUPPORTED_SASL_METHOD";
// some codes only exist in newer librabbitmq releases
#if AMQP_VERSION >= AMQP_VERSION_CODE(0, 8, 0, 0)
    case AMQP_STATUS_UNSUPPORTED:
      return "AMQP_STATUS_UNSUPPORTED";
#endif
    case _AMQP_STATUS_NEXT_VALUE:
      return "AMQP_STATUS_INTERNAL";
    case AMQP_STATUS_TCP_ERROR:
      return "AMQP_STATUS_TCP_ERROR";
    case AMQP_STATUS_TCP_SOCKETLIB_INIT_ERROR:
      return "AMQP_STATUS_TCP_SOCKETLIB_INIT_ERROR";
    case _AMQP_STATUS_TCP_NEXT_VALUE:
      return "AMQP_STATUS_INTERNAL";
    case AMQP_STATUS_SSL_ERROR:
      return "AMQP_STATUS_SSL_ERROR";
    case AMQP_STATUS_SSL_HOSTNAME_VERIFY_FAILED:
      return "AMQP_STATUS_SSL_HOSTNAME_VERIFY_FAILED";
    case AMQP_STATUS_SSL_PEER_VERIFY_FAILED:
      return "AMQP_STATUS_SSL_PEER_VERIFY_FAILED";
    case AMQP_STATUS_SSL_CONNECTION_FAILED:
      return "AMQP_STATUS_SSL_CONNECTION_FAILED";
    case _AMQP_STATUS_SSL_NEXT_VALUE:
      return "AMQP_STATUS_INTERNAL";
#if AMQP_VERSION >= AMQP_VERSION_CODE(0, 11, 0, 0)
    case AMQP_STATUS_SSL_SET_ENGINE_FAILED:
      return "AMQP_STATUS_SSL_SET_ENGINE_FAILED";
#endif
    default:
      return "AMQP_STATUS_UNKNOWN";
  }
}
+
+// TODO: add status_to_string on the connection object to print full status
+
+// convert int status to string - including RGW specific values
std::string status_to_string(int s) {
  // RGW-specific codes first; anything else is assumed to be a
  // librabbitmq status and delegated to to_string(amqp_status_enum)
  switch (s) {
    case RGW_AMQP_STATUS_BROKER_NACK:
      return "RGW_AMQP_STATUS_BROKER_NACK";
    case RGW_AMQP_STATUS_CONNECTION_CLOSED:
      return "RGW_AMQP_STATUS_CONNECTION_CLOSED";
    case RGW_AMQP_STATUS_QUEUE_FULL:
      return "RGW_AMQP_STATUS_QUEUE_FULL";
    case RGW_AMQP_STATUS_MAX_INFLIGHT:
      return "RGW_AMQP_STATUS_MAX_INFLIGHT";
    case RGW_AMQP_STATUS_MANAGER_STOPPED:
      return "RGW_AMQP_STATUS_MANAGER_STOPPED";
    case RGW_AMQP_STATUS_CONN_ALLOC_FAILED:
      return "RGW_AMQP_STATUS_CONN_ALLOC_FAILED";
    case RGW_AMQP_STATUS_SOCKET_ALLOC_FAILED:
      return "RGW_AMQP_STATUS_SOCKET_ALLOC_FAILED";
    case RGW_AMQP_STATUS_SOCKET_OPEN_FAILED:
      return "RGW_AMQP_STATUS_SOCKET_OPEN_FAILED";
    case RGW_AMQP_STATUS_LOGIN_FAILED:
      return "RGW_AMQP_STATUS_LOGIN_FAILED";
    case RGW_AMQP_STATUS_CHANNEL_OPEN_FAILED:
      return "RGW_AMQP_STATUS_CHANNEL_OPEN_FAILED";
    case RGW_AMQP_STATUS_VERIFY_EXCHANGE_FAILED:
      return "RGW_AMQP_STATUS_VERIFY_EXCHANGE_FAILED";
    case RGW_AMQP_STATUS_Q_DECLARE_FAILED:
      return "RGW_AMQP_STATUS_Q_DECLARE_FAILED";
    case RGW_AMQP_STATUS_CONFIRM_DECLARE_FAILED:
      return "RGW_AMQP_STATUS_CONFIRM_DECLARE_FAILED";
    case RGW_AMQP_STATUS_CONSUME_DECLARE_FAILED:
      return "RGW_AMQP_STATUS_CONSUME_DECLARE_FAILED";
    case RGW_AMQP_STATUS_SOCKET_CACERT_FAILED:
      return "RGW_AMQP_STATUS_SOCKET_CACERT_FAILED";
  }
  return to_string((amqp_status_enum)s);
}
+
+// check the result from calls and return if error (=null)
+#define RETURN_ON_ERROR(C, S, OK) \
+ if (!OK) { \
+ C->status = S; \
+ return false; \
+ }
+
+// in case of RPC calls, getting the RPC reply and return if an error is detected
+#define RETURN_ON_REPLY_ERROR(C, ST, S) { \
+ const auto reply = amqp_get_rpc_reply(ST); \
+ if (reply.reply_type != AMQP_RESPONSE_NORMAL) { \
+ C->status = S; \
+ C->reply_type = reply.reply_type; \
+ C->reply_code = reply_to_code(reply); \
+ return false; \
+ } \
+ }
+
+static const amqp_channel_t CHANNEL_ID = 1;
+static const amqp_channel_t CONFIRMING_CHANNEL_ID = 2;
+
+// utility function to create a connection, when the connection object already exists
+bool new_state(connection_t* conn, const connection_id_t& conn_id) {
+ // state must be null at this point
+ ceph_assert(!conn->state);
+ // reset all status codes
+ conn->status = AMQP_STATUS_OK;
+ conn->reply_type = AMQP_RESPONSE_NORMAL;
+ conn->reply_code = RGW_AMQP_NO_REPLY_CODE;
+
+ auto state = amqp_new_connection();
+ if (!state) {
+ conn->status = RGW_AMQP_STATUS_CONN_ALLOC_FAILED;
+ return false;
+ }
+ // make sure that the connection state is cleaned up in case of error
+ ConnectionCleaner state_guard(state);
+
+ // create and open socket
+ amqp_socket_t *socket = nullptr;
+ if (conn->use_ssl) {
+ socket = amqp_ssl_socket_new(state);
+#if AMQP_VERSION >= AMQP_VERSION_CODE(0, 10, 0, 1)
+ SSL_CTX* ssl_ctx = reinterpret_cast<SSL_CTX*>(amqp_ssl_socket_get_context(socket));
+#else
+ // taken from https://github.com/alanxz/rabbitmq-c/pull/560
+ struct hack {
+ const struct amqp_socket_class_t *klass;
+ SSL_CTX *ctx;
+ };
+
+ struct hack *h = reinterpret_cast<struct hack*>(socket);
+ SSL_CTX* ssl_ctx = h->ctx;
+#endif
+ // ensure system CA certificates get loaded
+ SSL_CTX_set_default_verify_paths(ssl_ctx);
+ }
+ else {
+ socket = amqp_tcp_socket_new(state);
+ }
+
+ if (!socket) {
+ conn->status = RGW_AMQP_STATUS_SOCKET_ALLOC_FAILED;
+ return false;
+ }
+ if (conn->use_ssl) {
+ if (!conn->verify_ssl) {
+ amqp_ssl_socket_set_verify_peer(socket, 0);
+ amqp_ssl_socket_set_verify_hostname(socket, 0);
+ }
+ if (conn->ca_location.has_value()) {
+ const auto s = amqp_ssl_socket_set_cacert(socket, conn->ca_location.get().c_str());
+ if (s != AMQP_STATUS_OK) {
+ conn->status = RGW_AMQP_STATUS_SOCKET_CACERT_FAILED;
+ conn->reply_code = s;
+ return false;
+ }
+ }
+ }
+ const auto s = amqp_socket_open(socket, conn_id.host.c_str(), conn_id.port);
+ if (s < 0) {
+ conn->status = RGW_AMQP_STATUS_SOCKET_OPEN_FAILED;
+ conn->reply_type = RGW_AMQP_RESPONSE_SOCKET_ERROR;
+ conn->reply_code = s;
+ return false;
+ }
+
+ // login to broker
+ const auto reply = amqp_login(state,
+ conn_id.vhost.c_str(),
+ AMQP_DEFAULT_MAX_CHANNELS,
+ AMQP_DEFAULT_FRAME_SIZE,
+ 0, // no heartbeat TODO: add conf
+ AMQP_SASL_METHOD_PLAIN, // TODO: add other types of security
+ conn->user.c_str(),
+ conn->password.c_str());
+ if (reply.reply_type != AMQP_RESPONSE_NORMAL) {
+ conn->status = RGW_AMQP_STATUS_LOGIN_FAILED;
+ conn->reply_type = reply.reply_type;
+ conn->reply_code = reply_to_code(reply);
+ return false;
+ }
+
+ // open channels
+ {
+ const auto ok = amqp_channel_open(state, CHANNEL_ID);
+ RETURN_ON_ERROR(conn, RGW_AMQP_STATUS_CHANNEL_OPEN_FAILED, ok);
+ RETURN_ON_REPLY_ERROR(conn, state, RGW_AMQP_STATUS_CHANNEL_OPEN_FAILED);
+ }
+ {
+ const auto ok = amqp_channel_open(state, CONFIRMING_CHANNEL_ID);
+ RETURN_ON_ERROR(conn, RGW_AMQP_STATUS_CHANNEL_OPEN_FAILED, ok);
+ RETURN_ON_REPLY_ERROR(conn, state, RGW_AMQP_STATUS_CHANNEL_OPEN_FAILED);
+ }
+ {
+ const auto ok = amqp_confirm_select(state, CONFIRMING_CHANNEL_ID);
+ RETURN_ON_ERROR(conn, RGW_AMQP_STATUS_CONFIRM_DECLARE_FAILED, ok);
+ RETURN_ON_REPLY_ERROR(conn, state, RGW_AMQP_STATUS_CONFIRM_DECLARE_FAILED);
+ }
+
+ // verify that the topic exchange is there
+ // TODO: make this step optional
+ {
+ const auto ok = amqp_exchange_declare(state,
+ CHANNEL_ID,
+ amqp_cstring_bytes(conn_id.exchange.c_str()),
+ amqp_cstring_bytes("topic"),
+ 1, // passive - exchange must already exist on broker
+ 1, // durable
+ 0, // dont auto-delete
+ 0, // not internal
+ amqp_empty_table);
+ RETURN_ON_ERROR(conn, RGW_AMQP_STATUS_VERIFY_EXCHANGE_FAILED, ok);
+ RETURN_ON_REPLY_ERROR(conn, state, RGW_AMQP_STATUS_VERIFY_EXCHANGE_FAILED);
+ }
+ {
+ // create queue for confirmations
+ const auto queue_ok = amqp_queue_declare(state,
+ CHANNEL_ID, // use the regular channel for this call
+ amqp_empty_bytes, // let broker allocate queue name
+ 0, // not passive - create the queue
+ 0, // not durable
+ 1, // exclusive
+ 1, // auto-delete
+ amqp_empty_table // not args TODO add args from conf: TTL, max length etc.
+ );
+ RETURN_ON_ERROR(conn, RGW_AMQP_STATUS_Q_DECLARE_FAILED, queue_ok);
+ RETURN_ON_REPLY_ERROR(conn, state, RGW_AMQP_STATUS_Q_DECLARE_FAILED);
+
+ // define consumption for connection
+ const auto consume_ok = amqp_basic_consume(state,
+ CONFIRMING_CHANNEL_ID,
+ queue_ok->queue,
+ amqp_empty_bytes, // broker will generate consumer tag
+ 1, // messages sent from client are never routed back
+ 1, // client does not ack thr acks
+ 1, // exclusive access to queue
+ amqp_empty_table // no parameters
+ );
+
+ RETURN_ON_ERROR(conn, RGW_AMQP_STATUS_CONSUME_DECLARE_FAILED, consume_ok);
+ RETURN_ON_REPLY_ERROR(conn, state, RGW_AMQP_STATUS_CONSUME_DECLARE_FAILED);
+ // broker generated consumer_tag could be used to cancel sending of n/acks from broker - not needed
+
+ state_guard.reset();
+ conn->state = state;
+ conn->reply_to_queue = amqp_bytes_malloc_dup(queue_ok->queue);
+ }
+ return true;
+}
+
+/// struct used for holding messages in the message queue
struct message_wrapper_t {
  connection_id_t conn_id; // which connection/exchange to publish on
  std::string topic;       // routing key for the publish
  std::string message;     // payload
  reply_callback_t cb;     // may be null: fire-and-forget publish

  message_wrapper_t(const connection_id_t& _conn_id,
                    const std::string& _topic,
                    const std::string& _message,
                    reply_callback_t _cb) : conn_id(_conn_id), topic(_topic), message(_message), cb(_cb) {}
};
+
+using connection_t_ptr = std::unique_ptr<connection_t>;
+
+typedef std::unordered_map<connection_id_t, connection_t_ptr, connection_id_hasher> ConnectionList;
+typedef boost::lockfree::queue<message_wrapper_t*, boost::lockfree::fixed_sized<true>> MessageQueue;
+
+// macros used inside a loop where an iterator is either incremented or erased
+#define INCREMENT_AND_CONTINUE(IT) \
+ ++IT; \
+ continue;
+
+#define ERASE_AND_CONTINUE(IT,CONTAINER) \
+ IT=CONTAINER.erase(IT); \
+ --connection_count; \
+ continue;
+
+class Manager {
+public:
+ const size_t max_connections;
+ const size_t max_inflight;
+ const size_t max_queue;
+ const size_t max_idle_time;
+private:
+ std::atomic<size_t> connection_count;
+ std::atomic<bool> stopped;
+ struct timeval read_timeout;
+ ConnectionList connections;
+ MessageQueue messages;
+ std::atomic<size_t> queued;
+ std::atomic<size_t> dequeued;
+ CephContext* const cct;
+ mutable std::mutex connections_lock;
+ const ceph::coarse_real_clock::duration idle_time;
+ const ceph::coarse_real_clock::duration reconnect_time;
+ std::thread runner;
+
  // publish one queued message over its connection. takes ownership of
  // the heap-allocated wrapper (queued by the producer side) and frees
  // it on return. invoked only from the manager's thread
  void publish_internal(message_wrapper_t* message) {
    const std::unique_ptr<message_wrapper_t> msg_owner(message);
    const auto& conn_id = message->conn_id;
    auto conn_it = connections.find(conn_id);
    if (conn_it == connections.end()) {
      ldout(cct, 1) << "AMQP publish: connection '" << to_string(conn_id) << "' not found" << dendl;
      if (message->cb) {
        message->cb(RGW_AMQP_STATUS_CONNECTION_CLOSED);
      }
      return;
    }

    auto& conn = conn_it->second;

    // refresh the idle-cleanup timestamp
    conn->timestamp = ceph_clock_now();

    if (!conn->is_ok()) {
      // connection had an issue while message was in the queue
      ldout(cct, 1) << "AMQP publish: connection '" << to_string(conn_id) << "' is closed" << dendl;
      if (message->cb) {
        message->cb(RGW_AMQP_STATUS_CONNECTION_CLOSED);
      }
      return;
    }

    if (message->cb == nullptr) {
      // fire-and-forget: publish on the regular channel, no confirmation
      const auto rc = amqp_basic_publish(conn->state,
        CHANNEL_ID,
        amqp_cstring_bytes(conn_id.exchange.c_str()),
        amqp_cstring_bytes(message->topic.c_str()),
        0, // does not have to be routable
        0, // not immediate
        nullptr, // no properties needed
        amqp_cstring_bytes(message->message.c_str()));
      if (rc == AMQP_STATUS_OK) {
        ldout(cct, 20) << "AMQP publish (no callback): OK" << dendl;
        return;
      }
      ldout(cct, 1) << "AMQP publish (no callback): failed with error " << status_to_string(rc) << dendl;
      // an error occurred, close connection
      // it will be retried by the main loop
      conn->destroy(rc);
      return;
    }

    // confirmed publish: route acks back via the reply-to queue
    amqp_basic_properties_t props;
    props._flags =
      AMQP_BASIC_DELIVERY_MODE_FLAG |
      AMQP_BASIC_REPLY_TO_FLAG;
    props.delivery_mode = 2; // persistent delivery TODO take from conf
    props.reply_to = conn->reply_to_queue;

    const auto rc = amqp_basic_publish(conn->state,
      CONFIRMING_CHANNEL_ID,
      amqp_cstring_bytes(conn_id.exchange.c_str()),
      amqp_cstring_bytes(message->topic.c_str()),
      conn->mandatory,
      0, // not immediate
      &props,
      amqp_cstring_bytes(message->message.c_str()));

    if (rc == AMQP_STATUS_OK) {
      auto const q_len = conn->callbacks.size();
      if (q_len < max_inflight) {
        ldout(cct, 20) << "AMQP publish (with callback, tag=" << conn->delivery_tag << "): OK. Queue has: " << q_len << " callbacks" << dendl;
        // remember the callback under the delivery tag so the broker's
        // ack/nack (read by the run loop) can resolve it
        conn->callbacks.emplace_back(conn->delivery_tag++, message->cb);
      } else {
        // immediately invoke callback with error
        ldout(cct, 1) << "AMQP publish (with callback): failed with error: callback queue full" << dendl;
        message->cb(RGW_AMQP_STATUS_MAX_INFLIGHT);
      }
    } else {
      // an error occurred, close connection
      // it will be retried by the main loop
      ldout(cct, 1) << "AMQP publish (with callback): failed with error: " << status_to_string(rc) << dendl;
      conn->destroy(rc);
      // immediately invoke callback with error
      message->cb(rc);
    }
  }
+
+ // the managers thread:
+ // (1) empty the queue of messages to be published
+ // (2) loop over all connections and read acks
+ // (3) manages deleted connections
+ // (4) TODO reconnect on connection errors
+ // (5) TODO cleanup timedout callbacks
+ void run() noexcept {
+ amqp_frame_t frame;
+ while (!stopped) {
+
+ // publish all messages in the queue
+ const auto count = messages.consume_all(std::bind(&Manager::publish_internal, this, std::placeholders::_1));
+ dequeued += count;
+ ConnectionList::iterator conn_it;
+ ConnectionList::const_iterator end_it;
+ {
+ // thread safe access to the connection list
+ // once the iterators are fetched they are guaranteed to remain valid
+ std::lock_guard lock(connections_lock);
+ conn_it = connections.begin();
+ end_it = connections.end();
+ }
+ auto incoming_message = false;
+ // loop over all connections to read acks
+ for (;conn_it != end_it;) {
+
+ const auto& conn_id = conn_it->first;
+ auto& conn = conn_it->second;
+
+ // drop connections that have been idle longer than max_idle_time
+ if(conn->timestamp.sec() + max_idle_time < ceph_clock_now()) {
+ ldout(cct, 20) << "AMQP run: Time for deleting a connection due to idle behaviour: " << ceph_clock_now() << dendl;
+ ERASE_AND_CONTINUE(conn_it, connections);
+ }
+
+ // try to reconnect the connection if it has an error
+ if (!conn->is_ok()) {
+ const auto now = ceph::coarse_real_clock::now();
+ if (now >= conn->next_reconnect) {
+ // pointers are used temporarily inside the amqp_connection_info object
+ // as read-only values, hence the assignment, and const_cast are safe here
+ ldout(cct, 20) << "AMQP run: retry connection" << dendl;
+ if (!new_state(conn.get(), conn_id)) {
+ ldout(cct, 10) << "AMQP run: connection '" << to_string(conn_id) << "' retry failed. error: " <<
+ status_to_string(conn->status) << " (" << conn->reply_code << ")" << dendl;
+ // TODO: add error counter for failed retries
+ // TODO: add exponential backoff for retries
+ conn->next_reconnect = now + reconnect_time;
+ } else {
+ ldout(cct, 10) << "AMQP run: connection '" << to_string(conn_id) << "' retry successfull" << dendl;
+ }
+ }
+ INCREMENT_AND_CONTINUE(conn_it);
+ }
+
+ const auto rc = amqp_simple_wait_frame_noblock(conn->state, &frame, &read_timeout);
+
+ if (rc == AMQP_STATUS_TIMEOUT) {
+ // TODO mark connection as idle
+ INCREMENT_AND_CONTINUE(conn_it);
+ }
+
+ // this is just to prevent spinning idle, does not indicate that a message
+ // was successfully processed or not
+ incoming_message = true;
+
+ // check if error occurred that require reopening the connection
+ if (rc != AMQP_STATUS_OK) {
+ // an error occurred, close connection
+ // it will be retied by the main loop
+ ldout(cct, 1) << "AMQP run: connection read error: " << status_to_string(rc) << dendl;
+ conn->destroy(rc);
+ INCREMENT_AND_CONTINUE(conn_it);
+ }
+
+ if (frame.frame_type != AMQP_FRAME_METHOD) {
+ ldout(cct, 10) << "AMQP run: ignoring non n/ack messages. frame type: "
+ << unsigned(frame.frame_type) << dendl;
+ // handler is for publish confirmation only - handle only method frames
+ INCREMENT_AND_CONTINUE(conn_it);
+ }
+
+ uint64_t tag;
+ bool multiple;
+ int result;
+
+ switch (frame.payload.method.id) {
+ case AMQP_BASIC_ACK_METHOD:
+ {
+ result = AMQP_STATUS_OK;
+ const auto ack = (amqp_basic_ack_t*)frame.payload.method.decoded;
+ ceph_assert(ack);
+ tag = ack->delivery_tag;
+ multiple = ack->multiple;
+ break;
+ }
+ case AMQP_BASIC_NACK_METHOD:
+ {
+ result = RGW_AMQP_STATUS_BROKER_NACK;
+ const auto nack = (amqp_basic_nack_t*)frame.payload.method.decoded;
+ ceph_assert(nack);
+ tag = nack->delivery_tag;
+ multiple = nack->multiple;
+ break;
+ }
+ case AMQP_BASIC_REJECT_METHOD:
+ {
+ result = RGW_AMQP_STATUS_BROKER_NACK;
+ const auto reject = (amqp_basic_reject_t*)frame.payload.method.decoded;
+ tag = reject->delivery_tag;
+ multiple = false;
+ break;
+ }
+ case AMQP_CONNECTION_CLOSE_METHOD:
+ // TODO on channel close, no need to reopen the connection
+ case AMQP_CHANNEL_CLOSE_METHOD:
+ {
+ // other side closed the connection, no need to continue
+ // NOTE(review): 'rc' is AMQP_STATUS_OK here, so the connection is marked
+ // destroyed with an OK status - confirm this is the intended status code
+ ldout(cct, 10) << "AMQP run: connection was closed by broker" << dendl;
+ conn->destroy(rc);
+ INCREMENT_AND_CONTINUE(conn_it);
+ }
+ case AMQP_BASIC_RETURN_METHOD:
+ // message was not delivered, returned to sender
+ ldout(cct, 10) << "AMQP run: message was not routable" << dendl;
+ INCREMENT_AND_CONTINUE(conn_it);
+ break;
+ default:
+ // unexpected method
+ ldout(cct, 10) << "AMQP run: unexpected message" << dendl;
+ INCREMENT_AND_CONTINUE(conn_it);
+ }
+
+ const auto tag_it = std::find(conn->callbacks.begin(), conn->callbacks.end(), tag);
+ if (tag_it != conn->callbacks.end()) {
+ if (multiple) {
+ // n/ack all up to (and including) the tag
+ ldout(cct, 20) << "AMQP run: multiple n/acks received with tag=" << tag << " and result=" << result << dendl;
+ auto it = conn->callbacks.begin();
+ // fix: compare 'it' against end() *before* dereferencing it - the previous
+ // condition order (it->tag <= tag && it != end()) dereferenced the end
+ // iterator once every callback up to 'tag' was erased (undefined behavior)
+ while (it != conn->callbacks.end() && it->tag <= tag) {
+ ldout(cct, 20) << "AMQP run: invoking callback with tag=" << it->tag << dendl;
+ it->cb(result);
+ it = conn->callbacks.erase(it);
+ }
+ } else {
+ // n/ack a specific tag
+ ldout(cct, 20) << "AMQP run: n/ack received, invoking callback with tag=" << tag << " and result=" << result << dendl;
+ tag_it->cb(result);
+ conn->callbacks.erase(tag_it);
+ }
+ } else {
+ ldout(cct, 10) << "AMQP run: unsolicited n/ack received with tag=" << tag << dendl;
+ }
+ // just increment the iterator
+ ++conn_it;
+ }
+ // if no messages were received or published, sleep for 100ms
+ if (count == 0 && !incoming_message) {
+ std::this_thread::sleep_for(idle_time);
+ }
+ }
+ }
+
+ // dtor-time cleanup functor: reclaims a message that was still waiting
+ // in the lockfree queue when the manager shut down
+ static void delete_message(const message_wrapper_t* msg) {
+ delete msg;
+ }
+
+public:
+ // create the manager and immediately start its worker thread;
+ // the connection/inflight/queue limits are fixed for the manager's lifetime
+ Manager(size_t _max_connections,
+ size_t _max_inflight,
+ size_t _max_queue,
+ long _usec_timeout,
+ unsigned reconnect_time_ms,
+ unsigned idle_time_ms,
+ CephContext* _cct) :
+ max_connections(_max_connections),
+ max_inflight(_max_inflight),
+ max_queue(_max_queue),
+ max_idle_time(30),
+ connection_count(0),
+ stopped(false),
+ read_timeout{0, _usec_timeout},
+ connections(_max_connections),
+ messages(max_queue),
+ queued(0),
+ dequeued(0),
+ cct(_cct),
+ idle_time(std::chrono::milliseconds(idle_time_ms)),
+ reconnect_time(std::chrono::milliseconds(reconnect_time_ms)),
+ // NOTE(review): initializing 'runner' starts run() before the constructor
+ // body below executes; run() may briefly observe 'connections' before
+ // max_load_factor is applied - confirm this is benign
+ runner(&Manager::run, this) {
+ // The hashmap has "max connections" as the initial number of buckets,
+ // and allows for 10 collisions per bucket before rehash.
+ // This is to prevent rehashing so that iterators are not invalidated
+ // when a new connection is added.
+ connections.max_load_factor(10.0);
+ // give the runner thread a name for easier debugging
+ const auto rc = ceph_pthread_setname(runner.native_handle(), "amqp_manager");
+ ceph_assert(rc==0);
+ }
+
+ // non copyable
+ Manager(const Manager&) = delete;
+ const Manager& operator=(const Manager&) = delete;
+
+ // stop the main thread
+ // NOTE(review): assumes 'stopped' is an atomic (or otherwise synchronized)
+ // flag polled by run() - confirm its declaration (not visible here)
+ void stop() {
+ stopped = true;
+ }
+
+ // connect to a broker, or reuse an existing connection if already connected
+ // on success 'id' is set to the connection's key and true is returned;
+ // if AMQP state creation fails the connection is still registered and the
+ // run() thread keeps retrying it
+ // NOTE(review): 'mandatory_delivery' is not referenced in this function body -
+ // presumably consumed at publish time; verify
+ bool connect(connection_id_t& id, const std::string& url, const std::string& exchange, bool mandatory_delivery, bool verify_ssl,
+ boost::optional<const std::string&> ca_location) {
+ if (stopped) {
+ ldout(cct, 1) << "AMQP connect: manager is stopped" << dendl;
+ return false;
+ }
+
+ amqp_connection_info info;
+ // cache the URL so that parsing could happen in-place
+ std::vector<char> url_cache(url.c_str(), url.c_str()+url.size()+1);
+ const auto retcode = amqp_parse_url(url_cache.data(), &info);
+ if (AMQP_STATUS_OK != retcode) {
+ ldout(cct, 1) << "AMQP connect: URL parsing failed. error: " << retcode << dendl;
+ return false;
+ }
+ connection_id_t tmp_id(info, exchange);
+
+ // lock serializes lookup/insert against the run() thread
+ std::lock_guard lock(connections_lock);
+ const auto it = connections.find(tmp_id);
+ if (it != connections.end()) {
+ // connection found - return even if non-ok
+ ldout(cct, 20) << "AMQP connect: connection found" << dendl;
+ id = it->first;
+ return true;
+ }
+
+ // connection not found, creating a new one
+ if (connection_count >= max_connections) {
+ ldout(cct, 1) << "AMQP connect: max connections exceeded" << dendl;
+ return false;
+ }
+ // if error occurred during creation the creation will be retried in the main thread
+ ++connection_count;
+ auto conn = connections.emplace(tmp_id, std::make_unique<connection_t>(cct, info, verify_ssl, ca_location)).first->second.get();
+ ldout(cct, 10) << "AMQP connect: new connection is created. Total connections: " << connection_count << dendl;
+ if (!new_state(conn, tmp_id)) {
+ ldout(cct, 1) << "AMQP connect: new connection '" << to_string(tmp_id) << "' is created. but state creation failed (will retry). error: " <<
+ status_to_string(conn->status) << " (" << conn->reply_code << ")" << dendl;
+ }
+ id = std::move(tmp_id);
+ return true;
+ }
+
+ // TODO publish with confirm is needed in "none" case as well, cb should be invoked publish is ok (no ack)
+ // queue a fire-and-forget message (no broker confirmation callback)
+ int publish(const connection_id_t& conn_id,
+ const std::string& topic,
+ const std::string& message) {
+ if (stopped) {
+ ldout(cct, 1) << "AMQP publish: manager is not running" << dendl;
+ return RGW_AMQP_STATUS_MANAGER_STOPPED;
+ }
+ auto wrapper = std::make_unique<message_wrapper_t>(conn_id, topic, message, nullptr);
+ if (!messages.push(wrapper.get())) {
+ ldout(cct, 1) << "AMQP publish: queue is full" << dendl;
+ return RGW_AMQP_STATUS_QUEUE_FULL;
+ }
+ // the queue now owns the raw pointer - release it from the unique_ptr
+ std::ignore = wrapper.release();
+ ++queued;
+ return AMQP_STATUS_OK;
+ }
+
+ // queue a message for publishing; 'cb' is invoked asynchronously once the
+ // broker acks/nacks it (or on error)
+ int publish_with_confirm(const connection_id_t& conn_id,
+ const std::string& topic,
+ const std::string& message,
+ reply_callback_t cb) {
+ if (stopped) {
+ ldout(cct, 1) << "AMQP publish_with_confirm: manager is not running" << dendl;
+ return RGW_AMQP_STATUS_MANAGER_STOPPED;
+ }
+ auto wrapper = std::make_unique<message_wrapper_t>(conn_id, topic, message, cb);
+ if (!messages.push(wrapper.get())) {
+ ldout(cct, 1) << "AMQP publish_with_confirm: queue is full" << dendl;
+ return RGW_AMQP_STATUS_QUEUE_FULL;
+ }
+ // ownership of the raw pointer moved to the queue
+ std::ignore = wrapper.release();
+ ++queued;
+ return AMQP_STATUS_OK;
+ }
+
+ // dtor wait for thread to stop
+ // then connection are cleaned-up
+ ~Manager() {
+ stopped = true;
+ runner.join();
+ // reclaim messages that were queued but never published, to avoid leaks
+ messages.consume_all(delete_message);
+ }
+
+ // get the number of connections
+ // NOTE(review): 'connection_count' is read without taking connections_lock -
+ // presumably an atomic; confirm its declaration (not visible here)
+ size_t get_connection_count() const {
+ return connection_count;
+ }
+
+ // get the number of in-flight messages: the total count of callbacks still
+ // waiting for a broker ack/nack across all connections
+ size_t get_inflight() const {
+ size_t total = 0;
+ std::lock_guard lock(connections_lock);
+ for (const auto& [id, conn] : connections) {
+ // concurrent access to the callback vector is safe without locking
+ total += conn->callbacks.size();
+ }
+ return total;
+ }
+
+ // running counter of the queued messages
+ // (monotonically increasing in the code visible here - never reset)
+ size_t get_queued() const {
+ return queued;
+ }
+
+ // running counter of the dequeued messages
+ // (incremented by run() as it drains the publish queue)
+ size_t get_dequeued() const {
+ return dequeued;
+ }
+};
+
+// singleton manager
+// note that the manager itself is not a singleton, and multiple instances may co-exist
+// TODO make the pointer atomic in allocation and deallocation to avoid race conditions
+static Manager* s_manager = nullptr;
+
+// default limits used until configuration is taken from CephContext
+// (see the TODO in init() below)
+static const size_t MAX_CONNECTIONS_DEFAULT = 256;
+static const size_t MAX_INFLIGHT_DEFAULT = 8192;
+static const size_t MAX_QUEUE_DEFAULT = 8192;
+static const long READ_TIMEOUT_USEC = 100;
+static const unsigned IDLE_TIME_MS = 100;
+static const unsigned RECONNECT_TIME_MS = 100;
+
+// create the singleton manager; returns false if it already exists
+bool init(CephContext* cct) {
+ if (s_manager) {
+ return false;
+ }
+ // TODO: take conf from CephContext
+ // note: the Manager ctor takes reconnect_time_ms *before* idle_time_ms -
+ // the constants were previously passed in the opposite order, which was
+ // harmless only because both defaults happen to be equal (100ms)
+ s_manager = new Manager(MAX_CONNECTIONS_DEFAULT, MAX_INFLIGHT_DEFAULT, MAX_QUEUE_DEFAULT,
+ READ_TIMEOUT_USEC, RECONNECT_TIME_MS, IDLE_TIME_MS, cct);
+ return true;
+}
+
+// destroy the singleton manager (its dtor joins the worker thread);
+// safe to call when init() was never invoked (delete of nullptr is a no-op)
+void shutdown() {
+ delete s_manager;
+ s_manager = nullptr;
+}
+
+// API wrapper: delegate to the singleton manager; false when not initialized
+bool connect(connection_id_t& conn_id, const std::string& url, const std::string& exchange, bool mandatory_delivery, bool verify_ssl,
+ boost::optional<const std::string&> ca_location) {
+ return s_manager && s_manager->connect(conn_id, url, exchange, mandatory_delivery, verify_ssl, ca_location);
+}
+
+// API wrapper: publish via the singleton manager, or report it as stopped
+int publish(const connection_id_t& conn_id,
+ const std::string& topic,
+ const std::string& message) {
+ return s_manager ? s_manager->publish(conn_id, topic, message) : RGW_AMQP_STATUS_MANAGER_STOPPED;
+}
+
+// API wrapper: publish with broker confirmation via the singleton manager
+int publish_with_confirm(const connection_id_t& conn_id,
+ const std::string& topic,
+ const std::string& message,
+ reply_callback_t cb) {
+ return s_manager ? s_manager->publish_with_confirm(conn_id, topic, message, cb) : RGW_AMQP_STATUS_MANAGER_STOPPED;
+}
+
+// number of connections held by the singleton manager (0 when uninitialized)
+size_t get_connection_count() {
+ return s_manager ? s_manager->get_connection_count() : 0;
+}
+
+// number of in-flight messages (0 when the manager is uninitialized)
+size_t get_inflight() {
+ return s_manager ? s_manager->get_inflight() : 0;
+}
+
+// running counter of queued messages (0 when the manager is uninitialized)
+size_t get_queued() {
+ return s_manager ? s_manager->get_queued() : 0;
+}
+
+// running counter of dequeued messages (0 when the manager is uninitialized)
+size_t get_dequeued() {
+ return s_manager ? s_manager->get_dequeued() : 0;
+}
+
+// configured connection limit; falls back to the default when uninitialized
+size_t get_max_connections() {
+ return s_manager ? s_manager->max_connections : MAX_CONNECTIONS_DEFAULT;
+}
+
+// configured in-flight limit; falls back to the default when uninitialized
+size_t get_max_inflight() {
+ return s_manager ? s_manager->max_inflight : MAX_INFLIGHT_DEFAULT;
+}
+
+// configured queue capacity; falls back to the default when uninitialized
+size_t get_max_queue() {
+ return s_manager ? s_manager->max_queue : MAX_QUEUE_DEFAULT;
+}
+
+} // namespace amqp
+
diff --git a/src/rgw/rgw_amqp.h b/src/rgw/rgw_amqp.h
new file mode 100644
index 000000000..c363f4d74
--- /dev/null
+++ b/src/rgw/rgw_amqp.h
@@ -0,0 +1,82 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <string>
+#include <functional>
+#include <boost/optional.hpp>
+
+#include "include/common_fwd.h"
+
+struct amqp_connection_info;
+
+namespace rgw::amqp {
+
+// the reply callback is expected to get an integer parameter
+// indicating the result, and not to return anything
+typedef std::function<void(int)> reply_callback_t;
+
+// initialize the amqp manager
+bool init(CephContext* cct);
+
+// shutdown the amqp manager
+void shutdown();
+
+// key class for the connection list
+// uniquely identifies a broker connection by endpoint parameters + exchange
+struct connection_id_t {
+ std::string host; // broker host (from the parsed amqp URL)
+ int port; // broker port
+ std::string vhost; // amqp virtual host
+ std::string exchange; // exchange name used on this connection
+ bool ssl; // whether the connection uses TLS (amqps)
+ connection_id_t() = default;
+ // construct from a parsed amqp URL plus the exchange name
+ connection_id_t(const amqp_connection_info& info, const std::string& _exchange);
+};
+
+std::string to_string(const connection_id_t& id);
+
+// connect to an amqp endpoint
+bool connect(connection_id_t& conn_id, const std::string& url, const std::string& exchange, bool mandatory_delivery, bool verify_ssl,
+ boost::optional<const std::string&> ca_location);
+
+// publish a message over a connection that was already created
+int publish(const connection_id_t& conn_id,
+ const std::string& topic,
+ const std::string& message);
+
+// publish a message over a connection that was already created
+// and pass a callback that will be invoked (async) when broker confirms
+// receiving the message
+int publish_with_confirm(const connection_id_t& conn_id,
+ const std::string& topic,
+ const std::string& message,
+ reply_callback_t cb);
+
+// convert the integer status returned from the "publish" function to a string
+std::string status_to_string(int s);
+
+// number of connections
+size_t get_connection_count();
+
+// return the number of messages that were sent
+// to broker, but were not yet acked/nacked/timedout
+size_t get_inflight();
+
+// running counter of successfully queued messages
+size_t get_queued();
+
+// running counter of dequeued messages
+size_t get_dequeued();
+
+// number of maximum allowed connections
+size_t get_max_connections();
+
+// number of maximum allowed inflight messages
+size_t get_max_inflight();
+
+// maximum number of messages in the queue
+size_t get_max_queue();
+
+}
+
diff --git a/src/rgw/rgw_appmain.cc b/src/rgw/rgw_appmain.cc
new file mode 100644
index 000000000..361f622b9
--- /dev/null
+++ b/src/rgw/rgw_appmain.cc
@@ -0,0 +1,605 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <boost/intrusive/list.hpp>
+#include "global/global_init.h"
+#include "global/signal_handler.h"
+#include "common/config.h"
+#include "common/errno.h"
+#include "common/Timer.h"
+#include "common/TracepointProvider.h"
+#include "common/openssl_opts_handler.h"
+#include "common/numa.h"
+#include "include/compat.h"
+#include "include/str_list.h"
+#include "include/stringify.h"
+#include "rgw_main.h"
+#include "rgw_common.h"
+#include "rgw_sal_rados.h"
+#include "rgw_period_pusher.h"
+#include "rgw_realm_reloader.h"
+#include "rgw_rest.h"
+#include "rgw_rest_s3.h"
+#include "rgw_rest_swift.h"
+#include "rgw_rest_admin.h"
+#include "rgw_rest_info.h"
+#include "rgw_rest_usage.h"
+#include "rgw_rest_bucket.h"
+#include "rgw_rest_metadata.h"
+#include "rgw_rest_log.h"
+#include "rgw_rest_config.h"
+#include "rgw_rest_realm.h"
+#include "rgw_rest_ratelimit.h"
+#include "rgw_swift_auth.h"
+#include "rgw_log.h"
+#include "rgw_lib.h"
+#include "rgw_frontend.h"
+#include "rgw_lib_frontend.h"
+#include "rgw_tools.h"
+#include "rgw_resolve.h"
+#include "rgw_process.h"
+#include "rgw_frontend.h"
+#include "rgw_http_client_curl.h"
+#include "rgw_kmip_client.h"
+#include "rgw_kmip_client_impl.h"
+#include "rgw_perf_counters.h"
+#include "rgw_signal.h"
+#ifdef WITH_RADOSGW_AMQP_ENDPOINT
+#include "rgw_amqp.h"
+#endif
+#ifdef WITH_RADOSGW_KAFKA_ENDPOINT
+#include "rgw_kafka.h"
+#endif
+#ifdef WITH_ARROW_FLIGHT
+#include "rgw_flight_frontend.h"
+#endif
+#include "rgw_asio_frontend.h"
+#include "rgw_dmclock_scheduler_ctx.h"
+#include "rgw_lua.h"
+#ifdef WITH_RADOSGW_DBSTORE
+#include "rgw_sal_dbstore.h"
+#endif
+#include "rgw_lua_background.h"
+#include "services/svc_zone.h"
+
+#ifdef HAVE_SYS_PRCTL_H
+#include <sys/prctl.h>
+#endif
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+namespace {
+ TracepointProvider::Traits rgw_op_tracepoint_traits(
+ "librgw_op_tp.so", "rgw_op_tracing");
+ TracepointProvider::Traits rgw_rados_tracepoint_traits(
+ "librgw_rados_tp.so", "rgw_rados_tracing");
+}
+
+// definition of the static ops-log file sink member (assigned in init_opslog)
+OpsLogFile* rgw::AppMain::ops_log_file;
+
+void rgw::AppMain::init_frontends1(bool nfs)
+{
+ // phase 1 of frontend setup: parse the configured frontend list
+ // ("rgw_frontends", or "rgw_nfs_frontends" when running as NFS),
+ // build an RGWFrontendConfig per entry, and apply legacy option
+ // conversions (region root pools, region -> zonegroup) that must
+ // happen before common_init_finish()
+ this->nfs = nfs;
+ std::string fe_key = (nfs) ? "rgw_nfs_frontends" : "rgw_frontends";
+ std::vector<std::string> frontends;
+ std::string rgw_frontends_str = g_conf().get_val<string>(fe_key);
+ g_conf().early_expand_meta(rgw_frontends_str, &cerr);
+ get_str_vec(rgw_frontends_str, ",", frontends);
+
+ /* default frontends */
+ if (nfs) {
+ const auto is_rgw_nfs = [](const auto& s){return s == "rgw-nfs";};
+ if (std::find_if(frontends.begin(), frontends.end(), is_rgw_nfs) == frontends.end()) {
+ frontends.push_back("rgw-nfs");
+ }
+ } else {
+ if (frontends.empty()) {
+ frontends.push_back("beast");
+ }
+ }
+
+ for (auto &f : frontends) {
+ if (f.find("beast") != string::npos) {
+ have_http_frontend = true;
+ if (f.find("port") != string::npos) {
+ // check for the most common ws problems
+ if ((f.find("port=") == string::npos) ||
+ (f.find("port= ") != string::npos)) {
+ derr <<
+ R"(WARNING: radosgw frontend config found unexpected spacing around 'port'
+ (ensure frontend port parameter has the form 'port=80' with no spaces
+ before or after '='))"
+ << dendl;
+ }
+ }
+ } else {
+ if (f.find("civetweb") != string::npos) {
+ have_http_frontend = true;
+ }
+ } /* fe !beast */
+
+ RGWFrontendConfig *config = new RGWFrontendConfig(f);
+ int r = config->init();
+ if (r < 0) {
+ delete config;
+ cerr << "ERROR: failed to init config: " << f << std::endl;
+ continue;
+ }
+
+ fe_configs.push_back(config);
+ fe_map.insert(
+ pair<string, RGWFrontendConfig *>(config->get_framework(), config));
+ } /* for each frontend */
+
+ // maintain existing region root pool for new multisite objects
+ if (!g_conf()->rgw_region_root_pool.empty()) {
+ const char *root_pool = g_conf()->rgw_region_root_pool.c_str();
+ if (g_conf()->rgw_zonegroup_root_pool.empty()) {
+ g_conf().set_val_or_die("rgw_zonegroup_root_pool", root_pool);
+ }
+ if (g_conf()->rgw_period_root_pool.empty()) {
+ g_conf().set_val_or_die("rgw_period_root_pool", root_pool);
+ }
+ if (g_conf()->rgw_realm_root_pool.empty()) {
+ g_conf().set_val_or_die("rgw_realm_root_pool", root_pool);
+ }
+ }
+
+ // for region -> zonegroup conversion (must happen before
+ // common_init_finish())
+ if (!g_conf()->rgw_region.empty() && g_conf()->rgw_zonegroup.empty()) {
+ g_conf().set_val_or_die("rgw_zonegroup", g_conf()->rgw_region.c_str());
+ }
+
+ // one-time openssl engine initialization
+ ceph::crypto::init_openssl_engine_once();
+} /* init_frontends1 */
+
+void rgw::AppMain::init_numa()
+{
+ // pin all threads to the CPUs of the configured NUMA node
+ // ("rgw_numa_node"); a no-op when running as NFS or when unset (< 0)
+ if (nfs) {
+ return;
+ }
+
+ int numa_node = g_conf().get_val<int64_t>("rgw_numa_node");
+ size_t numa_cpu_set_size = 0;
+ cpu_set_t numa_cpu_set;
+
+ if (numa_node >= 0) {
+ int r = get_numa_node_cpu_set(numa_node, &numa_cpu_set_size, &numa_cpu_set);
+ if (r < 0) {
+ // could not resolve the node's CPU set; disable affinity
+ dout(1) << __func__ << " unable to determine rgw numa node " << numa_node
+ << " CPUs" << dendl;
+ numa_node = -1;
+ } else {
+ r = set_cpu_affinity_all_threads(numa_cpu_set_size, &numa_cpu_set);
+ if (r < 0) {
+ derr << __func__ << " failed to set numa affinity: " << cpp_strerror(r)
+ << dendl;
+ }
+ }
+ } else {
+ dout(1) << __func__ << " not setting numa affinity" << dendl;
+ }
+} /* init_numa */
+
+void rgw::AppMain::init_storage()
+{
+ // instantiate the SAL storage driver; each background thread group
+ // (gc/lc/quota/sync) is enabled by its rgw_enable_* option, and when
+ // running as NFS additionally gated by the corresponding rgw_nfs_run_*
+ auto run_gc =
+ (g_conf()->rgw_enable_gc_threads &&
+ ((!nfs) || (nfs && g_conf()->rgw_nfs_run_gc_threads)));
+
+ auto run_lc =
+ (g_conf()->rgw_enable_lc_threads &&
+ ((!nfs) || (nfs && g_conf()->rgw_nfs_run_lc_threads)));
+
+ auto run_quota =
+ (g_conf()->rgw_enable_quota_threads &&
+ ((!nfs) || (nfs && g_conf()->rgw_nfs_run_quota_threads)));
+
+ auto run_sync =
+ (g_conf()->rgw_run_sync_thread &&
+ ((!nfs) || (nfs && g_conf()->rgw_nfs_run_sync_thread)));
+
+ DriverManager::Config cfg = DriverManager::get_config(false, g_ceph_context);
+ env.driver = DriverManager::get_storage(dpp, dpp->get_cct(),
+ cfg,
+ run_gc,
+ run_lc,
+ run_quota,
+ run_sync,
+ g_conf().get_val<bool>("rgw_dynamic_resharding"),
+ g_conf()->rgw_cache_enabled);
+
+} /* init_storage */
+
+void rgw::AppMain::init_perfcounters()
+{
+ // register the rgw perf counters; return value deliberately ignored
+ (void) rgw_perf_start(dpp->get_cct());
+} /* init_perfcounters */
+
+void rgw::AppMain::init_http_clients()
+{
+ // set up outbound client infrastructure: DNS resolver, curl, the generic
+ // HTTP client, and the KMIP client
+ rgw_init_resolver();
+ rgw::curl::setup_curl(fe_map);
+ rgw_http_client_init(dpp->get_cct());
+ // NOTE(review): RGWKMIPManagerImpl is heap-allocated and handed over by
+ // reference - ownership presumably transfers to the kmip client layer
+ // (released via rgw_kmip_client_cleanup in shutdown); verify
+ rgw_kmip_client_init(*new RGWKMIPManagerImpl(dpp->get_cct()));
+} /* init_http_clients */
+
+void rgw::AppMain::cond_init_apis()
+{
+ // register REST API managers (S3, Swift, admin, ...) according to
+ // "rgw_enable_apis" - only when an HTTP frontend is configured
+ rgw_rest_init(g_ceph_context, env.driver->get_zone()->get_zonegroup());
+
+ if (have_http_frontend) {
+ std::vector<std::string> apis;
+ get_str_vec(g_conf()->rgw_enable_apis, apis);
+
+ std::map<std::string, bool> apis_map;
+ for (auto &api : apis) {
+ apis_map[api] = true;
+ }
+
+ /* warn about insecure keystone secret config options */
+ if (!(g_ceph_context->_conf->rgw_keystone_admin_token.empty() ||
+ g_ceph_context->_conf->rgw_keystone_admin_password.empty())) {
+ dout(0)
+ << "WARNING: rgw_keystone_admin_token and "
+ "rgw_keystone_admin_password should be avoided as they can "
+ "expose secrets. Prefer the new rgw_keystone_admin_token_path "
+ "and rgw_keystone_admin_password_path options, which read their "
+ "secrets from files."
+ << dendl;
+ }
+
+ // S3 website mode is a specialization of S3
+ const bool s3website_enabled = apis_map.count("s3website") > 0;
+ const bool sts_enabled = apis_map.count("sts") > 0;
+ const bool iam_enabled = apis_map.count("iam") > 0;
+ const bool pubsub_enabled =
+ apis_map.count("pubsub") > 0 || apis_map.count("notifications") > 0;
+ // Swift API entrypoint could placed in the root instead of S3
+ const bool swift_at_root = g_conf()->rgw_swift_url_prefix == "/";
+ if (apis_map.count("s3") > 0 || s3website_enabled) {
+ // S3 (and its sub-APIs) own the default manager unless Swift took the root
+ if (!swift_at_root) {
+ rest.register_default_mgr(set_logging(
+ rest_filter(env.driver, RGW_REST_S3,
+ new RGWRESTMgr_S3(s3website_enabled, sts_enabled,
+ iam_enabled, pubsub_enabled))));
+ } else {
+ derr << "Cannot have the S3 or S3 Website enabled together with "
+ << "Swift API placed in the root of hierarchy" << dendl;
+ }
+ }
+
+ if (apis_map.count("swift") > 0) {
+ RGWRESTMgr_SWIFT* const swift_resource = new RGWRESTMgr_SWIFT;
+
+ if (! g_conf()->rgw_cross_domain_policy.empty()) {
+ swift_resource->register_resource("crossdomain.xml",
+ set_logging(new RGWRESTMgr_SWIFT_CrossDomain));
+ }
+
+ swift_resource->register_resource("healthcheck",
+ set_logging(new RGWRESTMgr_SWIFT_HealthCheck));
+
+ swift_resource->register_resource("info",
+ set_logging(new RGWRESTMgr_SWIFT_Info));
+
+ if (! swift_at_root) {
+ rest.register_resource(g_conf()->rgw_swift_url_prefix,
+ set_logging(rest_filter(env.driver, RGW_REST_SWIFT,
+ swift_resource)));
+ } else {
+ if (env.driver->get_zone()->get_zonegroup().get_zone_count() > 1) {
+ derr << "Placing Swift API in the root of URL hierarchy while running"
+ << " multi-site configuration requires another instance of RadosGW"
+ << " with S3 API enabled!" << dendl;
+ }
+
+ rest.register_default_mgr(set_logging(swift_resource));
+ }
+ }
+
+ if (apis_map.count("swift_auth") > 0) {
+ rest.register_resource(g_conf()->rgw_swift_auth_entry,
+ set_logging(new RGWRESTMgr_SWIFT_Auth));
+ }
+
+ if (apis_map.count("admin") > 0) {
+ RGWRESTMgr_Admin *admin_resource = new RGWRESTMgr_Admin;
+ admin_resource->register_resource("info", new RGWRESTMgr_Info);
+ admin_resource->register_resource("usage", new RGWRESTMgr_Usage);
+ /* Register driver-specific admin APIs */
+ env.driver->register_admin_apis(admin_resource);
+ rest.register_resource(g_conf()->rgw_admin_entry, admin_resource);
+ }
+ } /* have_http_frontend */
+} /* init_apis */
+
+void rgw::AppMain::init_ldap()
+{
+ // construct the LDAP auth helper from the rgw_ldap_* options,
+ // then initialize it and bind to the directory server
+ CephContext* cct = env.driver->ctx();
+ const string &ldap_uri = cct->_conf->rgw_ldap_uri;
+ const string &ldap_binddn = cct->_conf->rgw_ldap_binddn;
+ const string &ldap_searchdn = cct->_conf->rgw_ldap_searchdn;
+ const string &ldap_searchfilter = cct->_conf->rgw_ldap_searchfilter;
+ const string &ldap_dnattr = cct->_conf->rgw_ldap_dnattr;
+ // bind password may come from a file (see parse_rgw_ldap_bindpw)
+ std::string ldap_bindpw = parse_rgw_ldap_bindpw(cct);
+
+ ldh.reset(new rgw::LDAPHelper(ldap_uri, ldap_binddn,
+ ldap_bindpw.c_str(), ldap_searchdn, ldap_searchfilter, ldap_dnattr));
+ ldh->init();
+ ldh->bind();
+} /* init_ldap */
+
+void rgw::AppMain::init_opslog()
+{
+ // build the ops-log fan-out: a manifold that forwards each entry to every
+ // configured sink (unix socket, file, and always the rados-backed sink)
+ rgw_log_usage_init(dpp->get_cct(), env.driver);
+
+ OpsLogManifold *olog_manifold = new OpsLogManifold();
+ if (!g_conf()->rgw_ops_log_socket_path.empty()) {
+ OpsLogSocket *olog_socket =
+ new OpsLogSocket(g_ceph_context, g_conf()->rgw_ops_log_data_backlog);
+ olog_socket->init(g_conf()->rgw_ops_log_socket_path);
+ olog_manifold->add_sink(olog_socket);
+ }
+ if (!g_conf()->rgw_ops_log_file_path.empty()) {
+ // kept in the static member so it outlives this function
+ ops_log_file =
+ new OpsLogFile(g_ceph_context, g_conf()->rgw_ops_log_file_path,
+ g_conf()->rgw_ops_log_data_backlog);
+ ops_log_file->start();
+ olog_manifold->add_sink(ops_log_file);
+ }
+ olog_manifold->add_sink(new OpsLogRados(env.driver));
+ olog = olog_manifold;
+} /* init_opslog */
+
+int rgw::AppMain::init_frontends2(RGWLib* rgwlib)
+{
+ // phase 2 of frontend setup: apply per-framework defaults, build the
+ // RGWProcessEnv (auth registry, rate limiter, scheduler), instantiate and
+ // start each frontend, register with the service map, and (rados only)
+ // wire up the realm watcher/reloader
+ int r{0};
+ vector<string> frontends_def;
+ std::string frontend_defs_str =
+ g_conf().get_val<string>("rgw_frontend_defaults");
+ get_str_vec(frontend_defs_str, ",", frontends_def);
+
+ service_map_meta["pid"] = stringify(getpid());
+
+ std::map<std::string, std::unique_ptr<RGWFrontendConfig> > fe_def_map;
+ for (auto& f : frontends_def) {
+ RGWFrontendConfig *config = new RGWFrontendConfig(f);
+ int r = config->init();
+ if (r < 0) {
+ delete config;
+ cerr << "ERROR: failed to init default config: " << f << std::endl;
+ continue;
+ }
+ fe_def_map[config->get_framework()].reset(config);
+ }
+
+ /* Initialize the registry of auth strategies which will coordinate
+ * the dynamic reconfiguration. */
+ implicit_tenant_context.reset(new rgw::auth::ImplicitTenants{g_conf()});
+ g_conf().add_observer(implicit_tenant_context.get());
+
+ /* allocate a mime table (you'd never guess that from the name) */
+ rgw_tools_init(dpp, dpp->get_cct());
+
+ /* Header custom behavior */
+ rest.register_x_headers(g_conf()->rgw_log_http_headers);
+
+ sched_ctx.reset(new rgw::dmclock::SchedulerCtx{dpp->get_cct()});
+ ratelimiter.reset(new ActiveRateLimiter{dpp->get_cct()});
+ ratelimiter->start();
+
+ // initialize RGWProcessEnv
+ env.rest = &rest;
+ env.olog = olog;
+ env.auth_registry = rgw::auth::StrategyRegistry::create(
+ dpp->get_cct(), *implicit_tenant_context, env.driver);
+ env.ratelimiting = ratelimiter.get();
+
+ int fe_count = 0;
+ for (multimap<string, RGWFrontendConfig *>::iterator fiter = fe_map.begin();
+ fiter != fe_map.end(); ++fiter, ++fe_count) {
+ RGWFrontendConfig *config = fiter->second;
+ string framework = config->get_framework();
+
+ // overlay the framework's default config (if any) under the user config
+ auto def_iter = fe_def_map.find(framework);
+ if (def_iter != fe_def_map.end()) {
+ config->set_default_config(*def_iter->second);
+ }
+
+ RGWFrontend* fe = nullptr;
+
+ if (framework == "loadgen") {
+ fe = new RGWLoadGenFrontend(env, config);
+ }
+ else if (framework == "beast") {
+ fe = new RGWAsioFrontend(env, config, *sched_ctx);
+ }
+ else if (framework == "rgw-nfs") {
+ fe = new RGWLibFrontend(env, config);
+ if (rgwlib) {
+ rgwlib->set_fe(static_cast<RGWLibFrontend*>(fe));
+ }
+ }
+ else if (framework == "arrow_flight") {
+#ifdef WITH_ARROW_FLIGHT
+ int port;
+ config->get_val("port", 8077, &port);
+ fe = new rgw::flight::FlightFrontend(env, config, port);
+#else
+ derr << "WARNING: arrow_flight frontend requested, but not included in build; skipping" << dendl;
+ continue;
+#endif
+ }
+
+ service_map_meta["frontend_type#" + stringify(fe_count)] = framework;
+ service_map_meta["frontend_config#" + stringify(fe_count)] = config->get_config();
+
+ if (! fe) {
+ dout(0) << "WARNING: skipping unknown framework: " << framework << dendl;
+ continue;
+ }
+
+ dout(0) << "starting handler: " << fiter->first << dendl;
+ int r = fe->init();
+ if (r < 0) {
+ derr << "ERROR: failed initializing frontend" << dendl;
+ return -r;
+ }
+ r = fe->run();
+ if (r < 0) {
+ derr << "ERROR: failed run" << dendl;
+ return -r;
+ }
+
+ fes.push_back(fe);
+ }
+
+ std::string daemon_type = (nfs) ? "rgw-nfs" : "rgw";
+ r = env.driver->register_to_service_map(dpp, daemon_type, service_map_meta);
+ if (r < 0) {
+ derr << "ERROR: failed to register to service map: " << cpp_strerror(-r) << dendl;
+ /* ignore error */
+ }
+
+ if (env.driver->get_name() == "rados") {
+ // add a watcher to respond to realm configuration changes
+ pusher = std::make_unique<RGWPeriodPusher>(dpp, env.driver, null_yield);
+ fe_pauser = std::make_unique<RGWFrontendPauser>(fes, pusher.get());
+ rgw_pauser = std::make_unique<RGWPauser>();
+ rgw_pauser->add_pauser(fe_pauser.get());
+ if (env.lua.background) {
+ rgw_pauser->add_pauser(env.lua.background);
+ }
+ reloader = std::make_unique<RGWRealmReloader>(
+ env, *implicit_tenant_context, service_map_meta, rgw_pauser.get());
+ realm_watcher = std::make_unique<RGWRealmWatcher>(dpp, g_ceph_context,
+ static_cast<rgw::sal::RadosStore*>(env.driver)->svc()->zone->get_realm());
+ realm_watcher->add_watcher(RGWRealmNotify::Reload, *reloader);
+ realm_watcher->add_watcher(RGWRealmNotify::ZonesNeedPeriod, *pusher.get());
+ }
+
+ return r;
+} /* init_frontends2 */
+
+void rgw::AppMain::init_tracepoints()
+{
+ // load the LTTng tracepoint providers and start the rgw tracer
+ TracepointProvider::initialize<rgw_rados_tracepoint_traits>(dpp->get_cct());
+ TracepointProvider::initialize<rgw_op_tracepoint_traits>(dpp->get_cct());
+ tracing::rgw::tracer.init("rgw");
+} /* init_tracepoints() */
+
+void rgw::AppMain::init_notification_endpoints()
+{
+ // start the bucket-notification endpoint managers that were compiled in;
+ // failures are logged but do not abort startup
+#ifdef WITH_RADOSGW_AMQP_ENDPOINT
+ if (!rgw::amqp::init(dpp->get_cct())) {
+ derr << "ERROR: failed to initialize AMQP manager" << dendl;
+ }
+#endif
+#ifdef WITH_RADOSGW_KAFKA_ENDPOINT
+ if (!rgw::kafka::init(dpp->get_cct())) {
+ derr << "ERROR: failed to initialize Kafka manager" << dendl;
+ }
+#endif
+} /* init_notification_endpoints */
+
+void rgw::AppMain::init_lua()
+{
+ // set up the lua scripting environment: per-daemon luarocks path,
+ // optional package installation, the lua manager, and (rados only)
+ // the background lua worker
+ rgw::sal::Driver* driver = env.driver;
+ int r{0};
+ std::string path = g_conf().get_val<std::string>("rgw_luarocks_location");
+ if (!path.empty()) {
+ // suffix the path with the daemon name so instances do not collide
+ path += "/" + g_conf()->name.to_str();
+ }
+ env.lua.luarocks_path = path;
+
+#ifdef WITH_RADOSGW_LUA_PACKAGES
+ rgw::lua::packages_t failed_packages;
+ std::string output;
+ r = rgw::lua::install_packages(dpp, driver, null_yield, path,
+ failed_packages, output);
+ if (r < 0) {
+ dout(1) << "WARNING: failed to install lua packages from allowlist"
+ << dendl;
+ }
+ if (!output.empty()) {
+ dout(10) << "INFO: lua packages installation output: \n" << output << dendl;
+ }
+ for (const auto &p : failed_packages) {
+ dout(5) << "WARNING: failed to install lua package: " << p
+ << " from allowlist" << dendl;
+ }
+#endif
+
+ env.lua.manager = env.driver->get_lua_manager();
+
+ if (driver->get_name() == "rados") { /* Supported for only RadosStore */
+ lua_background = std::make_unique<
+ rgw::lua::Background>(driver, dpp->get_cct(), path);
+ lua_background->start();
+ env.lua.background = lua_background.get();
+ }
+} /* init_lua */
+
+void rgw::AppMain::shutdown(std::function<void(void)> finalize_async_signals)
+{
+ // orderly teardown, roughly the reverse of the init_* sequence:
+ // realm reloader -> frontends -> log/auth helpers -> storage ->
+ // client/endpoint libraries -> perf counters / rate limiter
+ if (env.driver->get_name() == "rados") {
+ reloader.reset(); // stop the realm reloader
+ }
+
+ // stop all frontends first, then join and free them
+ for (auto& fe : fes) {
+ fe->stop();
+ }
+
+ for (auto& fe : fes) {
+ fe->join();
+ delete fe;
+ }
+
+ for (auto& fec : fe_configs) {
+ delete fec;
+ }
+
+ ldh.reset(nullptr); // deletes
+ finalize_async_signals(); // callback
+ rgw_log_usage_finalize();
+
+ delete olog;
+
+ if (lua_background) {
+ lua_background->shutdown();
+ }
+
+ DriverManager::close_storage(env.driver);
+
+ rgw_tools_cleanup();
+ rgw_shutdown_resolver();
+ rgw_http_client_cleanup();
+ rgw_kmip_client_cleanup();
+ rgw::curl::cleanup_curl();
+ g_conf().remove_observer(implicit_tenant_context.get());
+ implicit_tenant_context.reset(); // deletes
+#ifdef WITH_RADOSGW_AMQP_ENDPOINT
+ rgw::amqp::shutdown();
+#endif
+#ifdef WITH_RADOSGW_KAFKA_ENDPOINT
+ rgw::kafka::shutdown();
+#endif
+ rgw_perf_stop(g_ceph_context);
+ ratelimiter.reset(); // deletes--ensure this happens before we destruct
+} /* AppMain::shutdown */
diff --git a/src/rgw/rgw_arn.cc b/src/rgw/rgw_arn.cc
new file mode 100644
index 000000000..fddc3d769
--- /dev/null
+++ b/src/rgw/rgw_arn.cc
@@ -0,0 +1,387 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_arn.h"
+#include "rgw_common.h"
+#include <regex>
+
+using namespace std;
+
+namespace rgw {
+
+namespace {
+boost::optional<Partition> to_partition(const smatch::value_type& p,
+ bool wildcards) {
+ if (p == "aws") {
+ return Partition::aws;
+ } else if (p == "aws-cn") {
+ return Partition::aws_cn;
+ } else if (p == "aws-us-gov") {
+ return Partition::aws_us_gov;
+ } else if (p == "*" && wildcards) {
+ return Partition::wildcard;
+ } else {
+ return boost::none;
+ }
+
+ ceph_abort();
+}
+
+boost::optional<Service> to_service(const smatch::value_type& s,
+ bool wildcards) {
+ static const unordered_map<string, Service> services = {
+ { "acm", Service::acm },
+ { "apigateway", Service::apigateway },
+ { "appstream", Service::appstream },
+ { "artifact", Service::artifact },
+ { "autoscaling", Service::autoscaling },
+ { "aws-marketplace", Service::aws_marketplace },
+ { "aws-marketplace-management",
+ Service::aws_marketplace_management },
+ { "aws-portal", Service::aws_portal },
+ { "cloudformation", Service::cloudformation },
+ { "cloudfront", Service::cloudfront },
+ { "cloudhsm", Service::cloudhsm },
+ { "cloudsearch", Service::cloudsearch },
+ { "cloudtrail", Service::cloudtrail },
+ { "cloudwatch", Service::cloudwatch },
+ { "codebuild", Service::codebuild },
+ { "codecommit", Service::codecommit },
+ { "codedeploy", Service::codedeploy },
+ { "codepipeline", Service::codepipeline },
+ { "cognito-identity", Service::cognito_identity },
+ { "cognito-idp", Service::cognito_idp },
+ { "cognito-sync", Service::cognito_sync },
+ { "config", Service::config },
+ { "datapipeline", Service::datapipeline },
+ { "devicefarm", Service::devicefarm },
+ { "directconnect", Service::directconnect },
+ { "dms", Service::dms },
+ { "ds", Service::ds },
+ { "dynamodb", Service::dynamodb },
+ { "ec2", Service::ec2 },
+ { "ecr", Service::ecr },
+ { "ecs", Service::ecs },
+ { "elasticache", Service::elasticache },
+ { "elasticbeanstalk", Service::elasticbeanstalk },
+ { "elasticfilesystem", Service::elasticfilesystem },
+ { "elasticloadbalancing", Service::elasticloadbalancing },
+ { "elasticmapreduce", Service::elasticmapreduce },
+ { "elastictranscoder", Service::elastictranscoder },
+ { "es", Service::es },
+ { "events", Service::events },
+ { "firehose", Service::firehose },
+ { "gamelift", Service::gamelift },
+ { "glacier", Service::glacier },
+ { "health", Service::health },
+ { "iam", Service::iam },
+ { "importexport", Service::importexport },
+ { "inspector", Service::inspector },
+ { "iot", Service::iot },
+ { "kinesis", Service::kinesis },
+ { "kinesisanalytics", Service::kinesisanalytics },
+ { "kms", Service::kms },
+ { "lambda", Service::lambda },
+ { "lightsail", Service::lightsail },
+ { "logs", Service::logs },
+ { "machinelearning", Service::machinelearning },
+ { "mobileanalytics", Service::mobileanalytics },
+ { "mobilehub", Service::mobilehub },
+ { "opsworks", Service::opsworks },
+ { "opsworks-cm", Service::opsworks_cm },
+ { "polly", Service::polly },
+ { "rds", Service::rds },
+ { "redshift", Service::redshift },
+ { "route53", Service::route53 },
+ { "route53domains", Service::route53domains },
+ { "s3", Service::s3 },
+ { "sdb", Service::sdb },
+ { "servicecatalog", Service::servicecatalog },
+ { "ses", Service::ses },
+ { "sns", Service::sns },
+ { "sqs", Service::sqs },
+ { "ssm", Service::ssm },
+ { "states", Service::states },
+ { "storagegateway", Service::storagegateway },
+ { "sts", Service::sts },
+ { "support", Service::support },
+ { "swf", Service::swf },
+ { "trustedadvisor", Service::trustedadvisor },
+ { "waf", Service::waf },
+ { "workmail", Service::workmail },
+ { "workspaces", Service::workspaces }};
+
+ if (wildcards && s == "*") {
+ return Service::wildcard;
+ }
+
+ auto i = services.find(s);
+ if (i == services.end()) {
+ return boost::none;
+ } else {
+ return i->second;
+ }
+}
+}
+ARN::ARN(const rgw_obj& o)
+ : partition(Partition::aws),
+ service(Service::s3),
+ region(),
+ account(o.bucket.tenant),
+ resource(o.bucket.name)
+{
+ resource.push_back('/');
+ resource.append(o.key.name);
+}
+
// Build an s3 bucket ARN: arn:aws:s3::<tenant>:<bucket>
// (region is always left empty for s3 ARNs)
ARN::ARN(const rgw_bucket& b)
  : partition(Partition::aws),
    service(Service::s3),
    region(),
    account(b.tenant),
    resource(b.name) { }
+
+ARN::ARN(const rgw_bucket& b, const std::string& o)
+ : partition(Partition::aws),
+ service(Service::s3),
+ region(),
+ account(b.tenant),
+ resource(b.name) {
+ resource.push_back('/');
+ resource.append(o);
+}
+
// Build an IAM ARN: arn:aws:iam::<tenant>:<type>/<resource_name>.
// When 'has_path' is true no '/' is inserted — presumably the caller's
// resource_name already begins with its path; confirm against callers.
ARN::ARN(const std::string& resource_name, const std::string& type, const std::string& tenant, bool has_path)
  : partition(Partition::aws),
    service(Service::iam),
    region(),
    account(tenant),
    resource(type) {
  if (! has_path)
    resource.push_back('/');
  resource.append(resource_name);
}
+
+boost::optional<ARN> ARN::parse(const std::string& s, bool wildcards) {
+ static const std::regex rx_wild("arn:([^:]*):([^:]*):([^:]*):([^:]*):([^:]*)",
+ std::regex_constants::ECMAScript |
+ std::regex_constants::optimize);
+ static const std::regex rx_no_wild(
+ "arn:([^:*]*):([^:*]*):([^:*]*):([^:*]*):(.*)",
+ std::regex_constants::ECMAScript |
+ std::regex_constants::optimize);
+
+ smatch match;
+
+ if ((s == "*") && wildcards) {
+ return ARN(Partition::wildcard, Service::wildcard, "*", "*", "*");
+ } else if (regex_match(s, match, wildcards ? rx_wild : rx_no_wild) &&
+ match.size() == 6) {
+ if (auto p = to_partition(match[1], wildcards)) {
+ if (auto s = to_service(match[2], wildcards)) {
+ return ARN(*p, *s, match[3], match[4], match[5]);
+ }
+ }
+ }
+ return boost::none;
+}
+
+std::string ARN::to_string() const {
+ std::string s{"arn:"};
+
+ if (partition == Partition::aws) {
+ s.append("aws:");
+ } else if (partition == Partition::aws_cn) {
+ s.append("aws-cn:");
+ } else if (partition == Partition::aws_us_gov) {
+ s.append("aws-us-gov:");
+ } else {
+ s.append("*:");
+ }
+
+ static const std::unordered_map<Service, string> services = {
+ { Service::acm, "acm" },
+ { Service::apigateway, "apigateway" },
+ { Service::appstream, "appstream" },
+ { Service::artifact, "artifact" },
+ { Service::autoscaling, "autoscaling" },
+ { Service::aws_marketplace, "aws-marketplace" },
+ { Service::aws_marketplace_management, "aws-marketplace-management" },
+ { Service::aws_portal, "aws-portal" },
+ { Service::cloudformation, "cloudformation" },
+ { Service::cloudfront, "cloudfront" },
+ { Service::cloudhsm, "cloudhsm" },
+ { Service::cloudsearch, "cloudsearch" },
+ { Service::cloudtrail, "cloudtrail" },
+ { Service::cloudwatch, "cloudwatch" },
+ { Service::codebuild, "codebuild" },
+ { Service::codecommit, "codecommit" },
+ { Service::codedeploy, "codedeploy" },
+ { Service::codepipeline, "codepipeline" },
+ { Service::cognito_identity, "cognito-identity" },
+ { Service::cognito_idp, "cognito-idp" },
+ { Service::cognito_sync, "cognito-sync" },
+ { Service::config, "config" },
+ { Service::datapipeline, "datapipeline" },
+ { Service::devicefarm, "devicefarm" },
+ { Service::directconnect, "directconnect" },
+ { Service::dms, "dms" },
+ { Service::ds, "ds" },
+ { Service::dynamodb, "dynamodb" },
+ { Service::ec2, "ec2" },
+ { Service::ecr, "ecr" },
+ { Service::ecs, "ecs" },
+ { Service::elasticache, "elasticache" },
+ { Service::elasticbeanstalk, "elasticbeanstalk" },
+ { Service::elasticfilesystem, "elasticfilesystem" },
+ { Service::elasticloadbalancing, "elasticloadbalancing" },
+ { Service::elasticmapreduce, "elasticmapreduce" },
+ { Service::elastictranscoder, "elastictranscoder" },
+ { Service::es, "es" },
+ { Service::events, "events" },
+ { Service::firehose, "firehose" },
+ { Service::gamelift, "gamelift" },
+ { Service::glacier, "glacier" },
+ { Service::health, "health" },
+ { Service::iam, "iam" },
+ { Service::importexport, "importexport" },
+ { Service::inspector, "inspector" },
+ { Service::iot, "iot" },
+ { Service::kinesis, "kinesis" },
+ { Service::kinesisanalytics, "kinesisanalytics" },
+ { Service::kms, "kms" },
+ { Service::lambda, "lambda" },
+ { Service::lightsail, "lightsail" },
+ { Service::logs, "logs" },
+ { Service::machinelearning, "machinelearning" },
+ { Service::mobileanalytics, "mobileanalytics" },
+ { Service::mobilehub, "mobilehub" },
+ { Service::opsworks, "opsworks" },
+ { Service::opsworks_cm, "opsworks-cm" },
+ { Service::polly, "polly" },
+ { Service::rds, "rds" },
+ { Service::redshift, "redshift" },
+ { Service::route53, "route53" },
+ { Service::route53domains, "route53domains" },
+ { Service::s3, "s3" },
+ { Service::sdb, "sdb" },
+ { Service::servicecatalog, "servicecatalog" },
+ { Service::ses, "ses" },
+ { Service::sns, "sns" },
+ { Service::sqs, "sqs" },
+ { Service::ssm, "ssm" },
+ { Service::states, "states" },
+ { Service::storagegateway, "storagegateway" },
+ { Service::sts, "sts" },
+ { Service::support, "support" },
+ { Service::swf, "swf" },
+ { Service::trustedadvisor, "trustedadvisor" },
+ { Service::waf, "waf" },
+ { Service::workmail, "workmail" },
+ { Service::workspaces, "workspaces" }};
+
+ auto i = services.find(service);
+ if (i != services.end()) {
+ s.append(i->second);
+ } else {
+ s.push_back('*');
+ }
+ s.push_back(':');
+
+ s.append(region);
+ s.push_back(':');
+
+ s.append(account);
+ s.push_back(':');
+
+ s.append(resource);
+
+ return s;
+}
+
+bool operator ==(const ARN& l, const ARN& r) {
+ return ((l.partition == r.partition) &&
+ (l.service == r.service) &&
+ (l.region == r.region) &&
+ (l.account == r.account) &&
+ (l.resource == r.resource));
+}
+bool operator <(const ARN& l, const ARN& r) {
+ return ((l.partition < r.partition) ||
+ (l.service < r.service) ||
+ (l.region < r.region) ||
+ (l.account < r.account) ||
+ (l.resource < r.resource));
+}
+
+// The candidate is not allowed to have wildcards. The only way to
+// do that sanely would be to use unification rather than matching.
+bool ARN::match(const ARN& candidate) const {
+ if ((candidate.partition == Partition::wildcard) ||
+ (partition != candidate.partition && partition
+ != Partition::wildcard)) {
+ return false;
+ }
+
+ if ((candidate.service == Service::wildcard) ||
+ (service != candidate.service && service != Service::wildcard)) {
+ return false;
+ }
+
+ if (!match_policy(region, candidate.region, MATCH_POLICY_ARN)) {
+ return false;
+ }
+
+ if (!match_policy(account, candidate.account, MATCH_POLICY_ARN)) {
+ return false;
+ }
+
+ if (!match_policy(resource, candidate.resource, MATCH_POLICY_RESOURCE)) {
+ return false;
+ }
+
+ return true;
+}
+
// Parse the resource portion of an ARN into (type, resource, qualifier).
// The three segments may be separated by ':' or '/', per the format list
// in rgw_arn.h. Returns boost::none if the input does not match, or if a
// resource type is present and is the wildcard '*'.
boost::optional<ARNResource> ARNResource::parse(const std::string& s) {
  static const std::regex rx("^([^:/]*)[:/]?([^:/]*)?[:/]?(.*)$",
			     std::regex_constants::ECMAScript |
			     std::regex_constants::optimize);
  std::smatch match;
  if (!regex_match(s, match, rx)) {
    return boost::none;
  }
  if (match[2].str().empty() && match[3].str().empty()) {
    // only resource exist
    return rgw::ARNResource("", match[1], "");
  }

  // resource type also exist, and cannot be wildcard
  if (match[1] != std::string(wildcard)) {
    // resource type cannot be wildcard
    return rgw::ARNResource(match[1], match[2], match[3]);
  }

  return boost::none;
}
+
+std::string ARNResource::to_string() const {
+ std::string s;
+
+ if (!resource_type.empty()) {
+ s.append(resource_type);
+ s.push_back(':');
+
+ s.append(resource);
+ s.push_back(':');
+
+ s.append(qualifier);
+ } else {
+ s.append(resource);
+ }
+
+ return s;
+}
+
+}
+
diff --git a/src/rgw/rgw_arn.h b/src/rgw/rgw_arn.h
new file mode 100644
index 000000000..406a9f429
--- /dev/null
+++ b/src/rgw/rgw_arn.h
@@ -0,0 +1,121 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+#include <string>
+#include <boost/optional.hpp>
+
+class rgw_obj;
+class rgw_bucket;
+
+namespace rgw {
+
// ARN partition namespace: the first component after 'arn:'
enum struct Partition {
  aws, aws_cn, aws_us_gov, wildcard
  // If we wanted our own ARNs for principal type unique to us
  // (maybe to integrate better with Swift) or for anything else we
  // provide that doesn't map onto S3, we could add an 'rgw'
  // partition type.
};
+
// AWS service namespaces that may appear as the second ARN component;
// Service::wildcard stands for '*' in a policy pattern
enum struct Service {
  apigateway, appstream, artifact, autoscaling, aws_portal, acm,
  cloudformation, cloudfront, cloudhsm, cloudsearch, cloudtrail,
  cloudwatch, events, logs, codebuild, codecommit, codedeploy,
  codepipeline, cognito_idp, cognito_identity, cognito_sync,
  config, datapipeline, dms, devicefarm, directconnect,
  ds, dynamodb, ec2, ecr, ecs, ssm, elasticbeanstalk, elasticfilesystem,
  elasticloadbalancing, elasticmapreduce, elastictranscoder, elasticache,
  es, gamelift, glacier, health, iam, importexport, inspector, iot,
  kms, kinesisanalytics, firehose, kinesis, lambda, lightsail,
  machinelearning, aws_marketplace, aws_marketplace_management,
  mobileanalytics, mobilehub, opsworks, opsworks_cm, polly,
  redshift, rds, route53, route53domains, sts, servicecatalog,
  ses, sns, sqs, s3, swf, sdb, states, storagegateway, support,
  trustedadvisor, waf, workmail, workspaces, wildcard
};
+
/* valid format:
 * 'arn:partition:service:region:account-id:resource'
 * The 'resource' part can be further broken down via ARNResource
*/
struct ARN {
  Partition partition;
  Service service;
  std::string region;
  // Once we refit tenant, we should probably use that instead of a
  // string.
  std::string account;
  std::string resource;

  // default-constructed ARN has wildcard partition/service
  ARN()
    : partition(Partition::wildcard), service(Service::wildcard) {}
  ARN(Partition partition, Service service, std::string region,
      std::string account, std::string resource)
    : partition(partition), service(service), region(std::move(region)),
      account(std::move(account)), resource(std::move(resource)) {}
  // s3 object ARN: arn:aws:s3::<tenant>:<bucket>/<key>
  ARN(const rgw_obj& o);
  // s3 bucket ARN: arn:aws:s3::<tenant>:<bucket>
  ARN(const rgw_bucket& b);
  // s3 object ARN from bucket + object-name string
  ARN(const rgw_bucket& b, const std::string& o);
  // iam ARN of the form '<type>/<resource_name>' under the given tenant
  ARN(const std::string& resource_name, const std::string& type, const std::string& tenant, bool has_path=false);

  // parse from string; boost::none on malformed input. 'wildcard'
  // permits '*' within fields (policy patterns).
  static boost::optional<ARN> parse(const std::string& s,
				    bool wildcard = false);
  std::string to_string() const;

  // `this` is the pattern
  bool match(const ARN& candidate) const;
};
+
// free-function adaptor so unqualified to_string(arn) works via ADL
inline std::string to_string(const ARN& a) {
  return a.to_string();
}

// stream insertion in terms of to_string()
inline std::ostream& operator <<(std::ostream& m, const ARN& a) {
  return m << to_string(a);
}
+
+bool operator ==(const ARN& l, const ARN& r);
+bool operator <(const ARN& l, const ARN& r);
+
+/* valid formats (only resource part):
+ * 'resource'
+ * 'resourcetype/resource'
+ * 'resourcetype/resource/qualifier'
+ * 'resourcetype/resource:qualifier'
+ * 'resourcetype:resource'
+ * 'resourcetype:resource:qualifier'
+ * Note that 'resourceType' cannot be wildcard
+*/
+struct ARNResource {
+ constexpr static const char* const wildcard = "*";
+ std::string resource_type;
+ std::string resource;
+ std::string qualifier;
+
+ ARNResource() : resource_type(""), resource(wildcard), qualifier("") {}
+
+ ARNResource(const std::string& _resource_type, const std::string& _resource, const std::string& _qualifier) :
+ resource_type(std::move(_resource_type)), resource(std::move(_resource)), qualifier(std::move(_qualifier)) {}
+
+ static boost::optional<ARNResource> parse(const std::string& s);
+
+ std::string to_string() const;
+};
+
// free-function adaptor mirroring to_string(const ARN&)
inline std::string to_string(const ARNResource& r) {
  return r.to_string();
}
+
+} // namespace rgw
+
// allow rgw::Service to be used as a key in std::unordered_map/set
namespace std {
template<>
struct hash<::rgw::Service> {
  size_t operator()(const ::rgw::Service& s) const noexcept {
    // Invoke a default-constructed hash object for int.
    return hash<int>()(static_cast<int>(s));
  }
};
} // namespace std
+
diff --git a/src/rgw/rgw_asio_client.cc b/src/rgw/rgw_asio_client.cc
new file mode 100644
index 000000000..a0ec0bf5c
--- /dev/null
+++ b/src/rgw/rgw_asio_client.cc
@@ -0,0 +1,192 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <boost/algorithm/string/predicate.hpp>
+#include <boost/asio/write.hpp>
+
+#include "rgw_asio_client.h"
+#include "rgw_perf_counters.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace rgw::asio;
+
// Bind to the caller-owned request parser and record connection
// endpoints; txbuf buffers response output back through this object.
ClientIO::ClientIO(parser_type& parser, bool is_ssl,
                   const endpoint_type& local_endpoint,
                   const endpoint_type& remote_endpoint)
  : parser(parser), is_ssl(is_ssl),
    local_endpoint(local_endpoint),
    remote_endpoint(remote_endpoint),
    txbuf(*this)
{
}

ClientIO::~ClientIO() = default;
+
+int ClientIO::init_env(CephContext *cct)
+{
+ env.init(cct);
+
+ perfcounter->inc(l_rgw_qlen);
+ perfcounter->inc(l_rgw_qactive);
+
+ const auto& request = parser.get();
+ const auto& headers = request;
+ for (auto header = headers.begin(); header != headers.end(); ++header) {
+ const auto& field = header->name(); // enum type for known headers
+ const auto& name = header->name_string();
+ const auto& value = header->value();
+
+ if (field == beast::http::field::content_length) {
+ env.set("CONTENT_LENGTH", value.to_string());
+ continue;
+ }
+ if (field == beast::http::field::content_type) {
+ env.set("CONTENT_TYPE", value.to_string());
+ continue;
+ }
+
+ static const std::string_view HTTP_{"HTTP_"};
+
+ char buf[name.size() + HTTP_.size() + 1];
+ auto dest = std::copy(std::begin(HTTP_), std::end(HTTP_), buf);
+ for (auto src = name.begin(); src != name.end(); ++src, ++dest) {
+ if (*src == '-') {
+ *dest = '_';
+ } else if (*src == '_') {
+ *dest = '-';
+ } else {
+ *dest = std::toupper(*src);
+ }
+ }
+ *dest = '\0';
+
+ env.set(buf, value.to_string());
+ }
+
+ int major = request.version() / 10;
+ int minor = request.version() % 10;
+ env.set("HTTP_VERSION", std::to_string(major) + '.' + std::to_string(minor));
+
+ env.set("REQUEST_METHOD", request.method_string().to_string());
+
+ // split uri from query
+ auto uri = request.target();
+ auto pos = uri.find('?');
+ if (pos != uri.npos) {
+ auto query = uri.substr(pos + 1);
+ env.set("QUERY_STRING", query.to_string());
+ uri = uri.substr(0, pos);
+ }
+ env.set("SCRIPT_URI", uri.to_string());
+
+ env.set("REQUEST_URI", request.target().to_string());
+
+ char port_buf[16];
+ snprintf(port_buf, sizeof(port_buf), "%d", local_endpoint.port());
+ env.set("SERVER_PORT", port_buf);
+ if (is_ssl) {
+ env.set("SERVER_PORT_SECURE", port_buf);
+ }
+ env.set("REMOTE_ADDR", remote_endpoint.address().to_string());
+ // TODO: set REMOTE_USER if authenticated
+ return 0;
+}
+
// Request finished: drop it from both queue-depth perf counters.
// Returns 0 (no additional bytes were sent here).
size_t ClientIO::complete_request()
{
  perfcounter->inc(l_rgw_qlen, -1);
  perfcounter->inc(l_rgw_qactive, -1);
  return 0;
}
+
// Push any buffered response output through to the underlying stream.
void ClientIO::flush()
{
  txbuf.pubsync();
}
+
+size_t ClientIO::send_status(int status, const char* status_name)
+{
+ static constexpr size_t STATUS_BUF_SIZE = 128;
+
+ char statusbuf[STATUS_BUF_SIZE];
+ const auto statuslen = snprintf(statusbuf, sizeof(statusbuf),
+ "HTTP/1.1 %d %s\r\n", status, status_name);
+
+ return txbuf.sputn(statusbuf, statuslen);
+}
+
+size_t ClientIO::send_100_continue()
+{
+ const char HTTTP_100_CONTINUE[] = "HTTP/1.1 100 CONTINUE\r\n\r\n";
+ const size_t sent = txbuf.sputn(HTTTP_100_CONTINUE,
+ sizeof(HTTTP_100_CONTINUE) - 1);
+ flush();
+ sent100continue = true;
+ return sent;
+}
+
static constexpr size_t TIME_BUF_SIZE = 128;

// Format an HTTP 'Date: ...\r\n' header line for the current UTC time
// into 'timestr'. Returns the number of bytes written, 0 on failure.
static size_t dump_date_header(char (&timestr)[TIME_BUF_SIZE])
{
  const time_t now = time(nullptr);
  struct tm t;
  if (gmtime_r(&now, &t) == nullptr) {
    return 0;
  }
  return strftime(timestr, sizeof(timestr),
                  "Date: %a, %d %b %Y %H:%M:%S %Z\r\n", &t);
}
+
+size_t ClientIO::complete_header()
+{
+ size_t sent = 0;
+
+ char timestr[TIME_BUF_SIZE];
+ if (dump_date_header(timestr)) {
+ sent += txbuf.sputn(timestr, strlen(timestr));
+ }
+
+ if (parser.keep_alive()) {
+ constexpr char CONN_KEEP_ALIVE[] = "Connection: Keep-Alive\r\n";
+ sent += txbuf.sputn(CONN_KEEP_ALIVE, sizeof(CONN_KEEP_ALIVE) - 1);
+ } else {
+ constexpr char CONN_KEEP_CLOSE[] = "Connection: close\r\n";
+ sent += txbuf.sputn(CONN_KEEP_CLOSE, sizeof(CONN_KEEP_CLOSE) - 1);
+ }
+
+ constexpr char HEADER_END[] = "\r\n";
+ sent += txbuf.sputn(HEADER_END, sizeof(HEADER_END) - 1);
+
+ flush();
+ return sent;
+}
+
+size_t ClientIO::send_header(const std::string_view& name,
+ const std::string_view& value)
+{
+ static constexpr char HEADER_SEP[] = ": ";
+ static constexpr char HEADER_END[] = "\r\n";
+
+ size_t sent = 0;
+
+ sent += txbuf.sputn(name.data(), name.length());
+ sent += txbuf.sputn(HEADER_SEP, sizeof(HEADER_SEP) - 1);
+ sent += txbuf.sputn(value.data(), value.length());
+ sent += txbuf.sputn(HEADER_END, sizeof(HEADER_END) - 1);
+
+ return sent;
+}
+
+size_t ClientIO::send_content_length(uint64_t len)
+{
+ static constexpr size_t CONLEN_BUF_SIZE = 128;
+
+ char sizebuf[CONLEN_BUF_SIZE];
+ const auto sizelen = snprintf(sizebuf, sizeof(sizebuf),
+ "Content-Length: %" PRIu64 "\r\n", len);
+
+ return txbuf.sputn(sizebuf, sizelen);
+}
diff --git a/src/rgw/rgw_asio_client.h b/src/rgw/rgw_asio_client.h
new file mode 100644
index 000000000..e2ab943dd
--- /dev/null
+++ b/src/rgw/rgw_asio_client.h
@@ -0,0 +1,62 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <boost/asio/ip/tcp.hpp>
+#include <boost/beast/core.hpp>
+#include <boost/beast/http.hpp>
+#include "include/ceph_assert.h"
+
+#include "rgw_client_io.h"
+
+namespace rgw {
+namespace asio {
+
+namespace beast = boost::beast;
+using parser_type = beast::http::request_parser<beast::http::buffer_body>;
+
// Adapts a beast HTTP connection to rgw's RestfulClient interface:
// init_env() translates parsed headers into CGI-style environment
// variables, and response output is buffered through txbuf.
class ClientIO : public io::RestfulClient,
                 public io::BuffererSink {
 protected:
  parser_type& parser;  // caller-owned incremental request parser
 private:
  const bool is_ssl;
  using endpoint_type = boost::asio::ip::tcp::endpoint;
  endpoint_type local_endpoint;
  endpoint_type remote_endpoint;

  // CGI-style request environment, filled by init_env()
  RGWEnv env;

  // buffers response output; flushed via pubsync() in flush()
  rgw::io::StaticOutputBufferer<> txbuf;
  // set once a '100 Continue' interim response has been sent
  bool sent100continue = false;

 public:
  ClientIO(parser_type& parser, bool is_ssl,
           const endpoint_type& local_endpoint,
           const endpoint_type& remote_endpoint);
  ~ClientIO() override;

  int init_env(CephContext *cct) override;
  size_t complete_request() override;
  void flush() override;
  size_t send_status(int status, const char *status_name) override;
  size_t send_100_continue() override;
  size_t send_header(const std::string_view& name,
                     const std::string_view& value) override;
  size_t send_content_length(uint64_t len) override;
  size_t complete_header() override;

  size_t send_body(const char* buf, size_t len) override {
    return write_data(buf, len);
  }

  RGWEnv& get_env() noexcept override {
    return env;
  }

  bool sent_100_continue() const { return sent100continue; }
};
+
+} // namespace asio
+} // namespace rgw
diff --git a/src/rgw/rgw_asio_frontend.cc b/src/rgw/rgw_asio_frontend.cc
new file mode 100644
index 000000000..633a29633
--- /dev/null
+++ b/src/rgw/rgw_asio_frontend.cc
@@ -0,0 +1,1199 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <atomic>
+#include <ctime>
+#include <thread>
+#include <vector>
+
+#include <boost/asio.hpp>
+#include <boost/intrusive/list.hpp>
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+
+#include <boost/context/protected_fixedsize_stack.hpp>
+#include <spawn/spawn.hpp>
+
+#include "common/async/shared_mutex.h"
+#include "common/errno.h"
+#include "common/strtol.h"
+
+#include "rgw_asio_client.h"
+#include "rgw_asio_frontend.h"
+
+#ifdef WITH_RADOSGW_BEAST_OPENSSL
+#include <boost/asio/ssl.hpp>
+#endif
+
+#include "common/split.h"
+
+#include "services/svc_config_key.h"
+#include "services/svc_zone.h"
+
+#include "rgw_zone.h"
+
+#include "rgw_asio_frontend_timer.h"
+#include "rgw_dmclock_async_scheduler.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+namespace {
+
+using tcp = boost::asio::ip::tcp;
+namespace http = boost::beast::http;
+#ifdef WITH_RADOSGW_BEAST_OPENSSL
+namespace ssl = boost::asio::ssl;
+#endif
+
+struct Connection;
+
+// use explicit executor types instead of the type-erased boost::asio::executor
+using executor_type = boost::asio::io_context::executor_type;
+
+using tcp_socket = boost::asio::basic_stream_socket<tcp, executor_type>;
+using tcp_stream = boost::beast::basic_stream<tcp, executor_type>;
+
+using timeout_timer = rgw::basic_timeout_timer<ceph::coarse_mono_clock,
+ executor_type, Connection>;
+
+static constexpr size_t parse_buffer_size = 65536;
+using parse_buffer = boost::beast::flat_static_buffer<parse_buffer_size>;
+
// use mmap/mprotect to allocate 512k coroutine stacks
auto make_stack_allocator() {
  // protected_fixedsize_stack adds guard pages so a stack overflow
  // faults instead of silently corrupting adjacent memory
  return boost::context::protected_fixedsize_stack{512*1024};
}
+
+using namespace std;
+
// ClientIO implementation over a beast stream (plain tcp or ssl). Reads
// the request body on demand and writes response data, arming 'timeout'
// around each async operation and recording the first fatal stream
// error so the connection loop can stop cleanly.
template <typename Stream>
class StreamIO : public rgw::asio::ClientIO {
  CephContext* const cct;
  Stream& stream;
  timeout_timer& timeout;
  yield_context yield;   // coroutine context for the async_* calls
  parse_buffer& buffer;  // connection-scoped read buffer
  boost::system::error_code fatal_ec;  // first fatal connection error
 public:
  StreamIO(CephContext *cct, Stream& stream, timeout_timer& timeout,
           rgw::asio::parser_type& parser, yield_context yield,
           parse_buffer& buffer, bool is_ssl,
           const tcp::endpoint& local_endpoint,
           const tcp::endpoint& remote_endpoint)
      : ClientIO(parser, is_ssl, local_endpoint, remote_endpoint),
        cct(cct), stream(stream), timeout(timeout), yield(yield),
        buffer(buffer)
  {}

  boost::system::error_code get_fatal_error_code() const { return fatal_ec; }

  // Write 'len' bytes to the stream. On error, remembers the code in
  // fatal_ec and throws rgw::io::Exception.
  size_t write_data(const char* buf, size_t len) override {
    boost::system::error_code ec;
    timeout.start();
    auto bytes = boost::asio::async_write(stream, boost::asio::buffer(buf, len),
                                          yield[ec]);
    timeout.cancel();
    if (ec) {
      ldout(cct, 4) << "write_data failed: " << ec.message() << dendl;
      if (ec == boost::asio::error::broken_pipe) {
        boost::system::error_code ec_ignored;
        stream.lowest_layer().shutdown(tcp_socket::shutdown_both, ec_ignored);
      }
      if (!fatal_ec) {
        fatal_ec = ec;
      }
      throw rgw::io::Exception(ec.value(), std::system_category());
    }
    return bytes;
  }

  // Read up to 'max' bytes of request body into 'buf'; returns the
  // number of bytes the parser produced.
  size_t recv_body(char* buf, size_t max) override {
    auto& message = parser.get();
    auto& body_remaining = message.body();
    body_remaining.data = buf;
    body_remaining.size = max;

    while (body_remaining.size && !parser.is_done()) {
      boost::system::error_code ec;
      timeout.start();
      http::async_read_some(stream, buffer, parser, yield[ec]);
      timeout.cancel();
      if (ec == http::error::need_buffer) {
        // the parser filled the caller's buffer; return what we have
        break;
      }
      if (ec) {
        ldout(cct, 4) << "failed to read body: " << ec.message() << dendl;
        if (!fatal_ec) {
          fatal_ec = ec;
        }
        throw rgw::io::Exception(ec.value(), std::system_category());
      }
    }
    return max - body_remaining.size;
  }
};
+
// output the http version as a string, ie 'HTTP/1.1'
// (beast encodes the version as major*10 + minor)
struct http_version {
  unsigned major_ver;
  unsigned minor_ver;
  explicit http_version(unsigned version)
    : major_ver(version / 10), minor_ver(version % 10) {}
};
std::ostream& operator<<(std::ostream& out, const http_version& v) {
  out << "HTTP/" << v.major_ver << '.' << v.minor_ver;
  return out;
}
+
// log an http header value or '-' if it's missing
struct log_header {
  const http::fields& fields;
  http::field field;
  std::string_view quote;  // optional surrounding quote, e.g. "\"" for referer
  log_header(const http::fields& fields, http::field field,
             std::string_view quote = "")
    : fields(fields), field(field), quote(quote) {}
};
std::ostream& operator<<(std::ostream& out, const log_header& h) {
  auto p = h.fields.find(h.field);
  if (p == h.fields.end()) {
    return out << '-';
  }
  return out << h.quote << p->value() << h.quote;
}
+
+// log fractional seconds in milliseconds
+struct log_ms_remainder {
+ ceph::coarse_real_time t;
+ log_ms_remainder(ceph::coarse_real_time t) : t(t) {}
+};
+std::ostream& operator<<(std::ostream& out, const log_ms_remainder& m) {
+ using namespace std::chrono;
+ return out << std::setfill('0') << std::setw(3)
+ << duration_cast<milliseconds>(m.t.time_since_epoch()).count() % 1000;
+}
+
+// log time in apache format: day/month/year:hour:minute:second zone
+struct log_apache_time {
+ ceph::coarse_real_time t;
+ log_apache_time(ceph::coarse_real_time t) : t(t) {}
+};
+std::ostream& operator<<(std::ostream& out, const log_apache_time& a) {
+ const auto t = ceph::coarse_real_clock::to_time_t(a.t);
+ const auto local = std::localtime(&t);
+ return out << std::put_time(local, "%d/%b/%Y:%T.") << log_ms_remainder{a.t}
+ << std::put_time(local, " %z");
+};
+
+using SharedMutex = ceph::async::SharedMutex<boost::asio::io_context::executor_type>;
+
// Coroutine servicing one client connection: parse and process HTTP
// requests in a keep-alive loop until eof or error. Fatal stream errors
// are reported back through 'ec'.
template <typename Stream>
void handle_connection(boost::asio::io_context& context,
                       RGWProcessEnv& env, Stream& stream,
                       timeout_timer& timeout, size_t header_limit,
                       parse_buffer& buffer, bool is_ssl,
                       SharedMutex& pause_mutex,
                       rgw::dmclock::Scheduler *scheduler,
                       const std::string& uri_prefix,
                       boost::system::error_code& ec,
                       yield_context yield)
{
  // don't impose a limit on the body, since we read it in pieces
  static constexpr size_t body_limit = std::numeric_limits<size_t>::max();

  auto cct = env.driver->ctx();

  // read messages from the stream until eof
  for (;;) {
    // configure the parser
    rgw::asio::parser_type parser;
    parser.header_limit(header_limit);
    parser.body_limit(body_limit);
    timeout.start();
    // parse the header
    http::async_read_header(stream, buffer, parser, yield[ec]);
    timeout.cancel();
    // a clean disconnect between requests is expected; log quietly
    if (ec == boost::asio::error::connection_reset ||
        ec == boost::asio::error::bad_descriptor ||
        ec == boost::asio::error::operation_aborted ||
#ifdef WITH_RADOSGW_BEAST_OPENSSL
        ec == ssl::error::stream_truncated ||
#endif
        ec == http::error::end_of_stream) {
      ldout(cct, 20) << "failed to read header: " << ec.message() << dendl;
      return;
    }
    auto& message = parser.get();
    if (ec) {
      // malformed header: answer 400 Bad Request and close
      ldout(cct, 1) << "failed to read header: " << ec.message() << dendl;
      http::response<http::empty_body> response;
      response.result(http::status::bad_request);
      response.version(message.version() == 10 ? 10 : 11);
      response.prepare_payload();
      timeout.start();
      http::async_write(stream, response, yield[ec]);
      timeout.cancel();
      if (ec) {
        ldout(cct, 5) << "failed to write response: " << ec.message() << dendl;
      }
      ldout(cct, 1) << "====== req done http_status=400 ======" << dendl;
      return;
    }

    bool expect_continue = (message[http::field::expect] == "100-continue");

    {
      // hold a shared lock while processing so pause() can drain requests
      auto lock = pause_mutex.async_lock_shared(yield[ec]);
      if (ec == boost::asio::error::operation_aborted) {
        return;
      } else if (ec) {
        ldout(cct, 1) << "failed to lock: " << ec.message() << dendl;
        return;
      }

      // process the request
      RGWRequest req{env.driver->get_new_req_id()};

      auto& socket = stream.lowest_layer();
      const auto& remote_endpoint = socket.remote_endpoint(ec);
      if (ec) {
        ldout(cct, 1) << "failed to connect client: " << ec.message() << dendl;
        return;
      }
      const auto& local_endpoint = socket.local_endpoint(ec);
      if (ec) {
        ldout(cct, 1) << "failed to connect client: " << ec.message() << dendl;
        return;
      }

      StreamIO real_client{cct, stream, timeout, parser, yield, buffer,
                           is_ssl, local_endpoint, remote_endpoint};

      // stack the output filters (reordering/buffering/chunking/conlen)
      // on top of the raw client io
      auto real_client_io = rgw::io::add_reordering(
                              rgw::io::add_buffering(cct,
                                rgw::io::add_chunking(
                                  rgw::io::add_conlen_controlling(
                                    &real_client))));
      RGWRestfulIO client(cct, &real_client_io);
      // optionally let process_request() suspend on this coroutine
      optional_yield y = null_yield;
      if (cct->_conf->rgw_beast_enable_async) {
        y = optional_yield{context, yield};
      }
      int http_ret = 0;
      string user = "-";
      const auto started = ceph::coarse_real_clock::now();
      ceph::coarse_real_clock::duration latency{};
      process_request(env, &req, uri_prefix, &client, y,
                      scheduler, &user, &latency, &http_ret);

      if (cct->_conf->subsys.should_gather(ceph_subsys_rgw_access, 1)) {
        // access log line elements begin per Apache Combined Log Format with additions following
        lsubdout(cct, rgw_access, 1) << "beast: " << std::hex << &req << std::dec << ": "
            << remote_endpoint.address() << " - " << user << " [" << log_apache_time{started} << "] \""
            << message.method_string() << ' ' << message.target() << ' '
            << http_version{message.version()} << "\" " << http_ret << ' '
            << client.get_bytes_sent() + client.get_bytes_received() << ' '
            << log_header{message, http::field::referer, "\""} << ' '
            << log_header{message, http::field::user_agent, "\""} << ' '
            << log_header{message, http::field::range} << " latency="
            << latency << dendl;
      }

      // process_request() can't distinguish between connection errors and
      // http/s3 errors, so check StreamIO for fatal connection errors
      ec = real_client.get_fatal_error_code();
      if (ec) {
        return;
      }

      if (real_client.sent_100_continue()) {
        expect_continue = false;
      }
    }

    if (!parser.keep_alive()) {
      return;
    }

    // if we failed before reading the entire message, discard any remaining
    // bytes before reading the next
    while (!expect_continue && !parser.is_done()) {
      static std::array<char, 1024> discard_buffer;

      auto& body = parser.get().body();
      body.size = discard_buffer.size();
      body.data = discard_buffer.data();

      timeout.start();
      http::async_read_some(stream, buffer, parser, yield[ec]);
      timeout.cancel();
      if (ec == http::error::need_buffer) {
        continue;
      }
      if (ec == boost::asio::error::connection_reset) {
        return;
      }
      if (ec) {
        ldout(cct, 5) << "failed to discard unread message: "
            << ec.message() << dendl;
        return;
      }
    }
  }
}
+
+// timeout support requires that connections are reference-counted, because the
+// timeout_handler can outlive the coroutine
// timeout support requires that connections are reference-counted, because the
// timeout_handler can outlive the coroutine
struct Connection : boost::intrusive::list_base_hook<>,
                    boost::intrusive_ref_counter<Connection>
{
  tcp_socket socket;   // the accepted tcp connection
  parse_buffer buffer; // read buffer shared across requests on this connection

  explicit Connection(tcp_socket&& socket) noexcept
    : socket(std::move(socket)) {}

  // close the underlying socket, reporting (not throwing) any error via ec
  void close(boost::system::error_code& ec) {
    socket.close(ec);
  }

  // accessor used by timeout_handler to cancel/shut down the socket on expiry
  tcp_socket& get_socket() { return socket; }
};
+
+class ConnectionList {
+ using List = boost::intrusive::list<Connection>;
+ List connections;
+ std::mutex mutex;
+
+ void remove(Connection& c) {
+ std::lock_guard lock{mutex};
+ if (c.is_linked()) {
+ connections.erase(List::s_iterator_to(c));
+ }
+ }
+ public:
+ class Guard {
+ ConnectionList *list;
+ Connection *conn;
+ public:
+ Guard(ConnectionList *list, Connection *conn) : list(list), conn(conn) {}
+ ~Guard() { list->remove(*conn); }
+ };
+ [[nodiscard]] Guard add(Connection& conn) {
+ std::lock_guard lock{mutex};
+ connections.push_back(conn);
+ return Guard{this, &conn};
+ }
+ void close(boost::system::error_code& ec) {
+ std::lock_guard lock{mutex};
+ for (auto& conn : connections) {
+ conn.socket.close(ec);
+ }
+ connections.clear();
+ }
+};
+
+namespace dmc = rgw::dmclock;
// Beast/Asio HTTP(S) frontend: owns the io_context, the listening sockets,
// the worker threads that run the event loop, and the request scheduler
class AsioFrontend {
  RGWProcessEnv& env;       // process-wide state (driver etc.) shared with request processing
  RGWFrontendConfig* conf;  // parsed frontend configuration key/value map
  boost::asio::io_context context; // event loop executed by the worker threads
  std::string uri_prefix;   // optional "prefix" config value passed to process_request()
  // per-operation timeout; overridden by the "request_timeout_ms" config in init()
  ceph::timespan request_timeout = std::chrono::milliseconds(REQUEST_TIMEOUT);
  size_t header_limit = 16384; // http header size limit; "max_header_size" config
#ifdef WITH_RADOSGW_BEAST_OPENSSL
  boost::optional<ssl::context> ssl_context; // only initialized when ssl is configured
  // fetch a "config://" value from the cluster config-key store into pbl
  int get_config_key_val(string name,
                         const string& type,
                         bufferlist *pbl);
  int ssl_set_private_key(const string& name, bool is_ssl_cert);
  int ssl_set_certificate_chain(const string& name);
  int init_ssl();
#endif
  SharedMutex pause_mutex;  // held shared by in-flight requests; exclusive while paused
  std::unique_ptr<rgw::dmclock::Scheduler> scheduler;

  // one Listener per configured port/endpoint
  struct Listener {
    tcp::endpoint endpoint;
    tcp::acceptor acceptor;
    tcp_socket socket; // receives the next accepted connection
    bool use_ssl = false;
    bool use_nodelay = false;

    explicit Listener(boost::asio::io_context& context)
      : acceptor(context), socket(context) {}
  };
  std::vector<Listener> listeners;

  ConnectionList connections; // tracked so stop() can close them all

  // work guard to keep run() threads busy while listeners are paused
  using Executor = boost::asio::io_context::executor_type;
  std::optional<boost::asio::executor_work_guard<Executor>> work;

  std::vector<std::thread> threads;
  std::atomic<bool> going_down{false};

  CephContext* ctx() const { return env.driver->ctx(); }
  std::optional<dmc::ClientCounters> client_counters;
  std::unique_ptr<dmc::ClientConfig> client_config;
  // completion handler for async_accept(); spawns a coroutine and re-arms the listener
  void accept(Listener& listener, boost::system::error_code ec);

 public:
  AsioFrontend(RGWProcessEnv& env, RGWFrontendConfig* conf,
               dmc::SchedulerCtx& sched_ctx)
    : env(env), conf(conf), pause_mutex(context.get_executor())
  {
    // choose the request scheduler; an invalid type falls back to the throttler
    auto sched_t = dmc::get_scheduler_t(ctx());
    switch(sched_t){
      case dmc::scheduler_t::dmclock:
        scheduler.reset(new dmc::AsyncScheduler(ctx(),
                                                context,
                                                std::ref(sched_ctx.get_dmc_client_counters()),
                                                sched_ctx.get_dmc_client_config(),
                                                *sched_ctx.get_dmc_client_config(),
                                                dmc::AtLimit::Reject));
        break;
      case dmc::scheduler_t::none:
        lderr(ctx()) << "Got invalid scheduler type for beast, defaulting to throttler" << dendl;
        [[fallthrough]];
      case dmc::scheduler_t::throttler:
        scheduler.reset(new dmc::SimpleThrottler(ctx()));
        // last case of the switch, so no break is required here

    }
  }

  int init();    // parse config, bind + listen, drop privileges
  int run();     // spawn the worker threads
  void stop();   // close listeners and all connections
  void join();   // wait for the worker threads to exit
  void pause();  // cancel accepts and drain in-flight requests
  void unpause();
};
+
+unsigned short parse_port(const char *input, boost::system::error_code& ec)
+{
+ char *end = nullptr;
+ auto port = std::strtoul(input, &end, 10);
+ if (port > std::numeric_limits<unsigned short>::max()) {
+ ec.assign(ERANGE, boost::system::system_category());
+ } else if (port == 0 && end == input) {
+ ec.assign(EINVAL, boost::system::system_category());
+ }
+ return port;
+}
+
+tcp::endpoint parse_endpoint(boost::asio::string_view input,
+ unsigned short default_port,
+ boost::system::error_code& ec)
+{
+ tcp::endpoint endpoint;
+
+ if (input.empty()) {
+ ec = boost::asio::error::invalid_argument;
+ return endpoint;
+ }
+
+ if (input[0] == '[') { // ipv6
+ const size_t addr_begin = 1;
+ const size_t addr_end = input.find(']');
+ if (addr_end == input.npos) { // no matching ]
+ ec = boost::asio::error::invalid_argument;
+ return endpoint;
+ }
+ if (addr_end + 1 < input.size()) {
+ // :port must must follow [ipv6]
+ if (input[addr_end + 1] != ':') {
+ ec = boost::asio::error::invalid_argument;
+ return endpoint;
+ } else {
+ auto port_str = input.substr(addr_end + 2);
+ endpoint.port(parse_port(port_str.data(), ec));
+ }
+ } else {
+ endpoint.port(default_port);
+ }
+ auto addr = input.substr(addr_begin, addr_end - addr_begin);
+ endpoint.address(boost::asio::ip::make_address_v6(addr, ec));
+ } else { // ipv4
+ auto colon = input.find(':');
+ if (colon != input.npos) {
+ auto port_str = input.substr(colon + 1);
+ endpoint.port(parse_port(port_str.data(), ec));
+ if (ec) {
+ return endpoint;
+ }
+ } else {
+ endpoint.port(default_port);
+ }
+ auto addr = input.substr(0, colon);
+ endpoint.address(boost::asio::ip::make_address_v4(addr, ec));
+ }
+ return endpoint;
+}
+
// Drop root privileges to the uid/gid configured on the CephContext, if any.
// The gid must be changed before the uid, since setgid() requires privilege.
// Returns 0 on success or a negative errno.
// NOTE(review): supplementary groups are not dropped here (no setgroups()
// call) — confirm whether that is intentional.
static int drop_privileges(CephContext *ctx)
{
  uid_t uid = ctx->get_set_uid();
  gid_t gid = ctx->get_set_gid();
  std::string uid_string = ctx->get_set_uid_string();
  std::string gid_string = ctx->get_set_gid_string();
  // a value of 0 means "not configured"; skip the corresponding call
  if (gid && setgid(gid) != 0) {
    int err = errno;
    ldout(ctx, -1) << "unable to setgid " << gid << ": " << cpp_strerror(err) << dendl;
    return -err;
  }
  if (uid && setuid(uid) != 0) {
    int err = errno;
    ldout(ctx, -1) << "unable to setuid " << uid << ": " << cpp_strerror(err) << dendl;
    return -err;
  }
  // only logged when both were changed
  if (uid && gid) {
    ldout(ctx, 0) << "set uid:gid to " << uid << ":" << gid
                  << " (" << uid_string << ":" << gid_string << ")" << dendl;
  }
  return 0;
}
+
// Parse the frontend configuration, create and bind all listening sockets,
// start the first async_accept on each, then drop privileges. Returns 0 on
// success or a negative error value; startup fails if no socket could bind.
int AsioFrontend::init()
{
  boost::system::error_code ec;
  auto& config = conf->get_config_map();

  // optional uri prefix, forwarded to process_request()
  if (auto i = config.find("prefix"); i != config.end()) {
    uri_prefix = i->second;
  }

  // global request timeout, in milliseconds; invalid values keep the default
  auto timeout = config.find("request_timeout_ms");
  if (timeout != config.end()) {
    auto timeout_number = ceph::parse<uint64_t>(timeout->second);
    if (timeout_number) {
      request_timeout = std::chrono::milliseconds(*timeout_number);
    } else {
      lderr(ctx()) << "WARNING: invalid value for request_timeout_ms: "
          << timeout->second << " setting it to the default value: "
          << REQUEST_TIMEOUT << dendl;
    }
  }

  // http header size limit, capped at the fixed parse buffer size
  auto max_header_size = config.find("max_header_size");
  if (max_header_size != config.end()) {
    auto limit = ceph::parse<uint64_t>(max_header_size->second);
    if (!limit) {
      lderr(ctx()) << "WARNING: invalid value for max_header_size: "
          << max_header_size->second << ", using the default value: "
          << header_limit << dendl;
    } else if (*limit > parse_buffer_size) { // can't exceed parse buffer size
      header_limit = parse_buffer_size;
      lderr(ctx()) << "WARNING: max_header_size " << max_header_size->second
          << " capped at maximum value " << header_limit << dendl;
    } else {
      header_limit = *limit;
    }
  }

#ifdef WITH_RADOSGW_BEAST_OPENSSL
  // set up the ssl context and any ssl_port/ssl_endpoint listeners
  int r = init_ssl();
  if (r < 0) {
    return r;
  }
#endif

  // parse endpoints: each bare "port" gets both a v4 and a v6 listener
  auto ports = config.equal_range("port");
  for (auto i = ports.first; i != ports.second; ++i) {
    auto port = parse_port(i->second.c_str(), ec);
    if (ec) {
      lderr(ctx()) << "failed to parse port=" << i->second << dendl;
      return -ec.value();
    }
    listeners.emplace_back(context);
    listeners.back().endpoint.port(port);

    listeners.emplace_back(context);
    listeners.back().endpoint = tcp::endpoint(tcp::v6(), port);
  }

  // "endpoint" entries carry an explicit address, default port 80
  auto endpoints = config.equal_range("endpoint");
  for (auto i = endpoints.first; i != endpoints.second; ++i) {
    auto endpoint = parse_endpoint(i->second, 80, ec);
    if (ec) {
      lderr(ctx()) << "failed to parse endpoint=" << i->second << dendl;
      return -ec.value();
    }
    listeners.emplace_back(context);
    listeners.back().endpoint = endpoint;
  }
  // parse tcp nodelay; applies to every listener configured above
  auto nodelay = config.find("tcp_nodelay");
  if (nodelay != config.end()) {
    for (auto& l : listeners) {
      l.use_nodelay = (nodelay->second == "1");
    }
  }


  bool socket_bound = false;
  // start listeners
  for (auto& l : listeners) {
    l.acceptor.open(l.endpoint.protocol(), ec);
    if (ec) {
      // e.g. a v6 listener on a host without ipv6: warn and keep going
      if (ec == boost::asio::error::address_family_not_supported) {
        ldout(ctx(), 0) << "WARNING: cannot open socket for endpoint=" << l.endpoint
            << ", " << ec.message() << dendl;
        continue;
      }

      lderr(ctx()) << "failed to open socket: " << ec.message() << dendl;
      return -ec.value();
    }

    // v6 sockets must not also grab the v4 port (a separate v4 listener exists)
    if (l.endpoint.protocol() == tcp::v6()) {
      l.acceptor.set_option(boost::asio::ip::v6_only(true), ec);
      if (ec) {
        lderr(ctx()) << "failed to set v6_only socket option: "
            << ec.message() << dendl;
        return -ec.value();
      }
    }

    l.acceptor.set_option(tcp::acceptor::reuse_address(true));
    l.acceptor.bind(l.endpoint, ec);
    if (ec) {
      lderr(ctx()) << "failed to bind address " << l.endpoint
          << ": " << ec.message() << dendl;
      return -ec.value();
    }

    // optional listen backlog override; invalid values fall back to the default
    auto it = config.find("max_connection_backlog");
    auto max_connection_backlog = boost::asio::socket_base::max_listen_connections;
    if (it != config.end()) {
      string err;
      max_connection_backlog = strict_strtol(it->second.c_str(), 10, &err);
      if (!err.empty()) {
        ldout(ctx(), 0) << "WARNING: invalid value for max_connection_backlog=" << it->second << dendl;
        max_connection_backlog = boost::asio::socket_base::max_listen_connections;
      }
    }
    l.acceptor.listen(max_connection_backlog);
    // arm the first accept; accept() re-arms itself from then on
    l.acceptor.async_accept(l.socket,
                            [this, &l] (boost::system::error_code ec) {
                              accept(l, ec);
                            });

    ldout(ctx(), 4) << "frontend listening on " << l.endpoint << dendl;
    socket_bound = true;
  }
  if (!socket_bound) {
    lderr(ctx()) << "Unable to listen at any endpoints" << dendl;
    return -EINVAL;
  }

  return drop_privileges(ctx());
}
+
+#ifdef WITH_RADOSGW_BEAST_OPENSSL
+
// values starting with this prefix are looked up in the cluster's
// config-key store instead of being treated as file paths
static string config_val_prefix = "config://";
+
+namespace {
+
// Expands $var / ${var} placeholders (realm, zone, zonegroup names and ids)
// in ssl certificate/key config values, using metadata from the zone.
class ExpandMetaVar {
  map<string, string> meta_map; // placeholder name -> replacement value

public:
  ExpandMetaVar(rgw::sal::Zone* zone_svc) {
    meta_map["realm"] = zone_svc->get_realm_name();
    meta_map["realm_id"] = zone_svc->get_realm_id();
    meta_map["zonegroup"] = zone_svc->get_zonegroup().get_name();
    meta_map["zonegroup_id"] = zone_svc->get_zonegroup().get_id();
    meta_map["zone"] = zone_svc->get_name();
    meta_map["zone_id"] = zone_svc->get_id();
  }

  // return 'in' with all known placeholders substituted
  string process_str(const string& in);
};
+
// Substitute each $var or ${var} occurrence in 'in' with its value from
// meta_map; unrecognized placeholders are copied through verbatim.
// Variable names may contain only lowercase letters and underscore.
string ExpandMetaVar::process_str(const string& in)
{
  if (meta_map.empty()) {
    return in;
  }

  auto pos = in.find('$');
  if (pos == std::string::npos) {
    return in; // fast path: nothing to expand
  }

  string out;
  decltype(pos) last_pos = 0; // start of the text not yet copied to 'out'

  while (pos != std::string::npos) {
    // copy the literal text preceding this '$'
    if (pos > last_pos) {
      out += in.substr(last_pos, pos - last_pos);
    }

    string var;
    const char *valid_chars = "abcdefghijklmnopqrstuvwxyz_";

    size_t endpos = 0;
    // note: in[pos+1] is safe even when '$' is the last character,
    // since std::string::operator[] at size() returns '\0'
    if (in[pos+1] == '{') {
      // ...${foo_bar}... — 'var' stays empty if the closing '}' is missing,
      // so the whole span is copied through unchanged below
      endpos = in.find_first_not_of(valid_chars, pos + 2);
      if (endpos != std::string::npos &&
          in[endpos] == '}') {
        var = in.substr(pos + 2, endpos - pos - 2);
        endpos++; // include the '}' in the consumed span
      }
    } else {
      // ...$foo...
      endpos = in.find_first_not_of(valid_chars, pos + 1);
      if (endpos != std::string::npos)
        var = in.substr(pos + 1, endpos - pos - 1);
      else
        var = in.substr(pos + 1); // placeholder runs to end of string
    }
    // the raw source text of the placeholder, used when the name is unknown
    string var_source = in.substr(pos, endpos - pos);
    last_pos = endpos;

    auto iter = meta_map.find(var);
    if (iter != meta_map.end()) {
      out += iter->second;
    } else {
      out += var_source;
    }
    pos = in.find('$', last_pos);
  }
  // copy any trailing literal text (last_pos is npos if the final
  // placeholder consumed the rest of the string)
  if (last_pos != std::string::npos) {
    out += in.substr(last_pos);
  }

  return out;
}
+
+} /* anonymous namespace */
+
+int AsioFrontend::get_config_key_val(string name,
+ const string& type,
+ bufferlist *pbl)
+{
+ if (name.empty()) {
+ lderr(ctx()) << "bad " << type << " config value" << dendl;
+ return -EINVAL;
+ }
+
+ int r = env.driver->get_config_key_val(name, pbl);
+ if (r < 0) {
+ lderr(ctx()) << type << " was not found: " << name << dendl;
+ return r;
+ }
+ return 0;
+}
+
+int AsioFrontend::ssl_set_private_key(const string& name, bool is_ssl_certificate)
+{
+ boost::system::error_code ec;
+
+ if (!boost::algorithm::starts_with(name, config_val_prefix)) {
+ ssl_context->use_private_key_file(name, ssl::context::pem, ec);
+ } else {
+ bufferlist bl;
+ int r = get_config_key_val(name.substr(config_val_prefix.size()),
+ "ssl_private_key",
+ &bl);
+ if (r < 0) {
+ return r;
+ }
+ ssl_context->use_private_key(boost::asio::buffer(bl.c_str(), bl.length()),
+ ssl::context::pem, ec);
+ }
+
+ if (ec) {
+ if (!is_ssl_certificate) {
+ lderr(ctx()) << "failed to add ssl_private_key=" << name
+ << ": " << ec.message() << dendl;
+ } else {
+ lderr(ctx()) << "failed to use ssl_certificate=" << name
+ << " as a private key: " << ec.message() << dendl;
+ }
+ return -ec.value();
+ }
+
+ return 0;
+}
+
+int AsioFrontend::ssl_set_certificate_chain(const string& name)
+{
+ boost::system::error_code ec;
+
+ if (!boost::algorithm::starts_with(name, config_val_prefix)) {
+ ssl_context->use_certificate_chain_file(name, ec);
+ } else {
+ bufferlist bl;
+ int r = get_config_key_val(name.substr(config_val_prefix.size()),
+ "ssl_certificate",
+ &bl);
+ if (r < 0) {
+ return r;
+ }
+ ssl_context->use_certificate_chain(boost::asio::buffer(bl.c_str(), bl.length()),
+ ec);
+ }
+
+ if (ec) {
+ lderr(ctx()) << "failed to use ssl_certificate=" << name
+ << ": " << ec.message() << dendl;
+ return -ec.value();
+ }
+
+ return 0;
+}
+
// Configure TLS: create the ssl context, apply ssl_options/ssl_ciphers,
// load the certificate and private key (expanding $zone-style placeholders),
// and register listeners for every ssl_port/ssl_endpoint. Returns 0 on
// success, a negative error value on misconfiguration.
int AsioFrontend::init_ssl()
{
  boost::system::error_code ec;
  auto& config = conf->get_config_map();

  // ssl configuration
  std::optional<string> cert = conf->get_val("ssl_certificate");
  if (cert) {
    // only initialize the ssl context if it's going to be used
    ssl_context = boost::in_place(ssl::context::tls);
  }

  std::optional<string> key = conf->get_val("ssl_private_key");
  bool have_cert = false;

  // a private key without a certificate is a configuration error
  if (key && !cert) {
    lderr(ctx()) << "no ssl_certificate configured for ssl_private_key" << dendl;
    return -EINVAL;
  }

  std::optional<string> options = conf->get_val("ssl_options");
  if (options) {
    if (!cert) {
      lderr(ctx()) << "no ssl_certificate configured for ssl_options" << dendl;
      return -EINVAL;
    }
  } else if (cert) {
    // default: disable everything older than TLS 1.2
    options = "no_sslv2:no_sslv3:no_tlsv1:no_tlsv1_1";
  }

  // map each colon-separated option name to its asio ssl::context flag;
  // unknown names are logged and skipped, not treated as errors
  if (options) {
    for (auto &option : ceph::split(*options, ":")) {
      if (option == "default_workarounds") {
        ssl_context->set_options(ssl::context::default_workarounds);
      } else if (option == "no_compression") {
        ssl_context->set_options(ssl::context::no_compression);
      } else if (option == "no_sslv2") {
        ssl_context->set_options(ssl::context::no_sslv2);
      } else if (option == "no_sslv3") {
        ssl_context->set_options(ssl::context::no_sslv3);
      } else if (option == "no_tlsv1") {
        ssl_context->set_options(ssl::context::no_tlsv1);
      } else if (option == "no_tlsv1_1") {
        ssl_context->set_options(ssl::context::no_tlsv1_1);
      } else if (option == "no_tlsv1_2") {
        ssl_context->set_options(ssl::context::no_tlsv1_2);
      } else if (option == "single_dh_use") {
        ssl_context->set_options(ssl::context::single_dh_use);
      } else {
        lderr(ctx()) << "ignoring unknown ssl option '" << option << "'" << dendl;
      }
    }
  }

  std::optional<string> ciphers = conf->get_val("ssl_ciphers");
  if (ciphers) {
    if (!cert) {
      lderr(ctx()) << "no ssl_certificate configured for ssl_ciphers" << dendl;
      return -EINVAL;
    }

    // apply the cipher list directly to the underlying openssl context
    int r = SSL_CTX_set_cipher_list(ssl_context->native_handle(),
                                    ciphers->c_str());
    if (r == 0) {
      lderr(ctx()) << "no cipher could be selected from ssl_ciphers: "
          << *ciphers << dendl;
      return -EINVAL;
    }
  }

  auto ports = config.equal_range("ssl_port");
  auto endpoints = config.equal_range("ssl_endpoint");

  /*
   * don't try to config certificate if frontend isn't configured for ssl
   */
  if (ports.first == ports.second &&
      endpoints.first == endpoints.second) {
    return 0;
  }

  bool key_is_cert = false;

  if (cert) {
    // with no separate key, the certificate file is expected to contain both
    if (!key) {
      key = cert;
      key_is_cert = true;
    }

    // expand $realm/$zone/... placeholders in the configured paths/keys
    ExpandMetaVar emv(env.driver->get_zone());

    cert = emv.process_str(*cert);
    key = emv.process_str(*key);

    int r = ssl_set_private_key(*key, key_is_cert);
    bool have_private_key = (r >= 0);
    if (r < 0) {
      // fall back to looking for the key inside the certificate file
      if (!key_is_cert) {
        r = ssl_set_private_key(*cert, true);
        have_private_key = (r >= 0);
      }
    }

    if (have_private_key) {
      int r = ssl_set_certificate_chain(*cert);
      have_cert = (r >= 0);
    }
  }

  // parse ssl endpoints; each bare "ssl_port" gets both a v4 and v6 listener
  for (auto i = ports.first; i != ports.second; ++i) {
    if (!have_cert) {
      lderr(ctx()) << "no ssl_certificate configured for ssl_port" << dendl;
      return -EINVAL;
    }
    auto port = parse_port(i->second.c_str(), ec);
    if (ec) {
      lderr(ctx()) << "failed to parse ssl_port=" << i->second << dendl;
      return -ec.value();
    }
    listeners.emplace_back(context);
    listeners.back().endpoint.port(port);
    listeners.back().use_ssl = true;

    listeners.emplace_back(context);
    listeners.back().endpoint = tcp::endpoint(tcp::v6(), port);
    listeners.back().use_ssl = true;
  }

  // "ssl_endpoint" entries carry an explicit address, default port 443
  for (auto i = endpoints.first; i != endpoints.second; ++i) {
    if (!have_cert) {
      lderr(ctx()) << "no ssl_certificate configured for ssl_endpoint" << dendl;
      return -EINVAL;
    }
    auto endpoint = parse_endpoint(i->second, 443, ec);
    if (ec) {
      lderr(ctx()) << "failed to parse ssl_endpoint=" << i->second << dendl;
      return -ec.value();
    }
    listeners.emplace_back(context);
    listeners.back().endpoint = endpoint;
    listeners.back().use_ssl = true;
  }
  return 0;
}
+#endif // WITH_RADOSGW_BEAST_OPENSSL
+
// Completion handler for async_accept(): take ownership of the accepted
// socket, immediately re-arm the listener, then spawn a coroutine (ssl or
// plain, depending on the listener) to serve the connection.
void AsioFrontend::accept(Listener& l, boost::system::error_code ec)
{
  if (!l.acceptor.is_open()) {
    return; // listener was closed by stop()
  } else if (ec == boost::asio::error::operation_aborted) {
    return; // accept canceled by pause()
  } else if (ec) {
    ldout(ctx(), 1) << "accept failed: " << ec.message() << dendl;
    return;
  }
  auto stream = std::move(l.socket);
  // best effort: ec from set_option is intentionally not checked
  stream.set_option(tcp::no_delay(l.use_nodelay), ec);
  // re-arm the listener before serving this connection
  l.acceptor.async_accept(l.socket,
                          [this, &l] (boost::system::error_code ec) {
                            accept(l, ec);
                          });

  // spawn a coroutine to handle the connection
#ifdef WITH_RADOSGW_BEAST_OPENSSL
  if (l.use_ssl) {
    spawn::spawn(context,
      [this, s=std::move(stream)] (yield_context yield) mutable {
        // Connection is ref-counted so the timeout handler can outlive us
        auto conn = boost::intrusive_ptr{new Connection(std::move(s))};
        auto c = connections.add(*conn);
        // wrap the tcp stream in an ssl stream
        boost::asio::ssl::stream<tcp_socket&> stream{conn->socket, *ssl_context};
        auto timeout = timeout_timer{context.get_executor(), request_timeout, conn};
        // do ssl handshake
        boost::system::error_code ec;
        timeout.start();
        auto bytes = stream.async_handshake(ssl::stream_base::server,
                                            conn->buffer.data(), yield[ec]);
        timeout.cancel();
        if (ec) {
          ldout(ctx(), 1) << "ssl handshake failed: " << ec.message() << dendl;
          return;
        }
        // drop the handshake bytes that were consumed from the read buffer
        conn->buffer.consume(bytes);
        handle_connection(context, env, stream, timeout, header_limit,
                          conn->buffer, true, pause_mutex, scheduler.get(),
                          uri_prefix, ec, yield);
        if (!ec) {
          // ssl shutdown (ignoring errors)
          stream.async_shutdown(yield[ec]);
        }
        conn->socket.shutdown(tcp::socket::shutdown_both, ec);
      }, make_stack_allocator());
  } else {
#else
  // without openssl support, every connection takes the plain-tcp path
  {
#endif // WITH_RADOSGW_BEAST_OPENSSL
    spawn::spawn(context,
      [this, s=std::move(stream)] (yield_context yield) mutable {
        auto conn = boost::intrusive_ptr{new Connection(std::move(s))};
        auto c = connections.add(*conn);
        auto timeout = timeout_timer{context.get_executor(), request_timeout, conn};
        boost::system::error_code ec;
        handle_connection(context, env, conn->socket, timeout, header_limit,
                          conn->buffer, false, pause_mutex, scheduler.get(),
                          uri_prefix, ec, yield);
        conn->socket.shutdown(tcp_socket::shutdown_both, ec);
      }, make_stack_allocator());
  }
}
+
+int AsioFrontend::run()
+{
+ auto cct = ctx();
+ const int thread_count = cct->_conf->rgw_thread_pool_size;
+ threads.reserve(thread_count);
+
+ ldout(cct, 4) << "frontend spawning " << thread_count << " threads" << dendl;
+
+ // the worker threads call io_context::run(), which will return when there's
+ // no work left. hold a work guard to keep these threads going until join()
+ work.emplace(boost::asio::make_work_guard(context));
+
+ for (int i = 0; i < thread_count; i++) {
+ threads.emplace_back([this]() noexcept {
+ // request warnings on synchronous librados calls in this thread
+ is_asio_thread = true;
+ // Have uncaught exceptions kill the process and give a
+ // stacktrace, not be swallowed.
+ context.run();
+ });
+ }
+ return 0;
+}
+
+void AsioFrontend::stop()
+{
+ ldout(ctx(), 4) << "frontend initiating shutdown..." << dendl;
+
+ going_down = true;
+
+ boost::system::error_code ec;
+ // close all listeners
+ for (auto& listener : listeners) {
+ listener.acceptor.close(ec);
+ }
+ // close all connections
+ connections.close(ec);
+ pause_mutex.cancel();
+}
+
+void AsioFrontend::join()
+{
+ if (!going_down) {
+ stop();
+ }
+ work.reset();
+
+ ldout(ctx(), 4) << "frontend joining threads..." << dendl;
+ for (auto& thread : threads) {
+ thread.join();
+ }
+ ldout(ctx(), 4) << "frontend done" << dendl;
+}
+
+void AsioFrontend::pause()
+{
+ ldout(ctx(), 4) << "frontend pausing connections..." << dendl;
+
+ // cancel pending calls to accept(), but don't close the sockets
+ boost::system::error_code ec;
+ for (auto& l : listeners) {
+ l.acceptor.cancel(ec);
+ }
+
+ // pause and wait for outstanding requests to complete
+ pause_mutex.lock(ec);
+
+ if (ec) {
+ ldout(ctx(), 1) << "frontend failed to pause: " << ec.message() << dendl;
+ } else {
+ ldout(ctx(), 4) << "frontend paused" << dendl;
+ }
+}
+
+void AsioFrontend::unpause()
+{
+ // unpause to unblock connections
+ pause_mutex.unlock();
+
+ // start accepting connections again
+ for (auto& l : listeners) {
+ l.acceptor.async_accept(l.socket,
+ [this, &l] (boost::system::error_code ec) {
+ accept(l, ec);
+ });
+ }
+
+ ldout(ctx(), 4) << "frontend unpaused" << dendl;
+}
+
+} // anonymous namespace
+
// pimpl: keeps the AsioFrontend definition (and its boost/asio headers)
// out of rgw_asio_frontend.h
class RGWAsioFrontend::Impl : public AsioFrontend {
 public:
  Impl(RGWProcessEnv& env, RGWFrontendConfig* conf,
       rgw::dmclock::SchedulerCtx& sched_ctx)
    : AsioFrontend(env, conf, sched_ctx) {}
};
+
// RGWAsioFrontend member definitions: thin forwarders to the pimpl.
RGWAsioFrontend::RGWAsioFrontend(RGWProcessEnv& env,
                                 RGWFrontendConfig* conf,
                                 rgw::dmclock::SchedulerCtx& sched_ctx)
  : impl(new Impl(env, conf, sched_ctx))
{
}

// defaulted here, where Impl is a complete type
RGWAsioFrontend::~RGWAsioFrontend() = default;

int RGWAsioFrontend::init()
{
  return impl->init();
}

int RGWAsioFrontend::run()
{
  return impl->run();
}

void RGWAsioFrontend::stop()
{
  impl->stop();
}

void RGWAsioFrontend::join()
{
  impl->join();
}

// pause/unpause are used when reloading configuration
void RGWAsioFrontend::pause_for_new_config()
{
  impl->pause();
}

void RGWAsioFrontend::unpause_with_new_config()
{
  impl->unpause();
}
diff --git a/src/rgw/rgw_asio_frontend.h b/src/rgw/rgw_asio_frontend.h
new file mode 100644
index 000000000..2de6f337a
--- /dev/null
+++ b/src/rgw/rgw_asio_frontend.h
@@ -0,0 +1,25 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <memory>
+#include "rgw_frontend.h"
+#define REQUEST_TIMEOUT 65000
+
// Public handle for the Beast/Asio HTTP(S) frontend; all work is delegated
// to the pimpl so implementation headers stay out of this file.
class RGWAsioFrontend : public RGWFrontend {
  class Impl; // defined in rgw_asio_frontend.cc
  std::unique_ptr<Impl> impl;
public:
  RGWAsioFrontend(RGWProcessEnv& env, RGWFrontendConfig* conf,
                  rgw::dmclock::SchedulerCtx& sched_ctx);
  ~RGWAsioFrontend() override;

  int init() override;
  int run() override;
  void stop() override;
  void join() override;

  // used when reloading configuration at runtime
  void pause_for_new_config() override;
  void unpause_with_new_config() override;
};
diff --git a/src/rgw/rgw_asio_frontend_timer.h b/src/rgw/rgw_asio_frontend_timer.h
new file mode 100644
index 000000000..bc58790d6
--- /dev/null
+++ b/src/rgw/rgw_asio_frontend_timer.h
@@ -0,0 +1,66 @@
+#pragma once
+
+#include <boost/asio/basic_waitable_timer.hpp>
+#include <boost/intrusive_ptr.hpp>
+
+#include "common/ceph_time.h"
+
+namespace rgw {
+
+// a WaitHandler that closes a stream if the timeout expires
+template <typename Stream>
+struct timeout_handler {
+ // this handler may outlive the timer/stream, so we need to hold a reference
+ // to keep the stream alive
+ boost::intrusive_ptr<Stream> stream;
+
+ explicit timeout_handler(boost::intrusive_ptr<Stream> stream) noexcept
+ : stream(std::move(stream)) {}
+
+ void operator()(boost::system::error_code ec) {
+ if (!ec) { // wait was not canceled
+ boost::system::error_code ec_ignored;
+ stream->get_socket().cancel();
+ stream->get_socket().shutdown(boost::asio::ip::tcp::socket::shutdown_both, ec_ignored);
+ }
+ }
+};
+
+// a timeout timer for stream operations
+template <typename Clock, typename Executor, typename Stream>
+class basic_timeout_timer {
+ public:
+ using clock_type = Clock;
+ using duration = typename clock_type::duration;
+ using executor_type = Executor;
+
+ explicit basic_timeout_timer(const executor_type& ex, duration dur,
+ boost::intrusive_ptr<Stream> stream)
+ : timer(ex), dur(dur), stream(std::move(stream))
+ {}
+
+ basic_timeout_timer(const basic_timeout_timer&) = delete;
+ basic_timeout_timer& operator=(const basic_timeout_timer&) = delete;
+
+ void start() {
+ if (dur.count() > 0) {
+ timer.expires_after(dur);
+ timer.async_wait(timeout_handler{stream});
+ }
+ }
+
+ void cancel() {
+ if (dur.count() > 0) {
+ timer.cancel();
+ }
+ }
+
+ private:
+ using Timer = boost::asio::basic_waitable_timer<clock_type,
+ boost::asio::wait_traits<clock_type>, executor_type>;
+ Timer timer;
+ duration dur;
+ boost::intrusive_ptr<Stream> stream;
+};
+
+} // namespace rgw
diff --git a/src/rgw/rgw_auth.cc b/src/rgw/rgw_auth.cc
new file mode 100644
index 000000000..2c61b8361
--- /dev/null
+++ b/src/rgw/rgw_auth.cc
@@ -0,0 +1,934 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <array>
+#include <string>
+
+#include "rgw_common.h"
+#include "rgw_auth.h"
+#include "rgw_quota.h"
+#include "rgw_user.h"
+#include "rgw_http_client.h"
+#include "rgw_keystone.h"
+#include "rgw_sal.h"
+#include "rgw_log.h"
+
+#include "include/str_list.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+namespace rgw {
+namespace auth {
+
+// Wrap legacy (pre-auth-framework) identity data in a rgw::auth::Identity so
+// old call sites can interoperate with the new authentication infrastructure.
+// The returned applier answers ACL/ownership queries purely from the captured
+// rgw_user, perm_mask, is_admin and type values; acct name and subuser are
+// always reported empty.
+std::unique_ptr<rgw::auth::Identity>
+transform_old_authinfo(CephContext* const cct,
+                       const rgw_user& auth_id,
+                       const int perm_mask,
+                       const bool is_admin,
+                       const uint32_t type)
+{
+  /* This class is not intended for public use. Should be removed altogether
+   * with this function after moving all our APIs to the new authentication
+   * infrastructure. */
+  class DummyIdentityApplier : public rgw::auth::Identity {
+    CephContext* const cct;
+
+    /* For this particular case it's OK to use rgw_user structure to convey
+     * the identity info as this was the policy for doing that before the
+     * new auth. */
+    const rgw_user id;
+    const int perm_mask;
+    const bool is_admin;
+    const uint32_t type;
+  public:
+    DummyIdentityApplier(CephContext* const cct,
+                         const rgw_user& auth_id,
+                         const int perm_mask,
+                         const bool is_admin,
+                         const uint32_t type)
+      : cct(cct),
+        id(auth_id),
+        perm_mask(perm_mask),
+        is_admin(is_admin),
+        type(type) {
+    }
+
+    uint32_t get_perms_from_aclspec(const DoutPrefixProvider* dpp, const aclspec_t& aclspec) const override {
+      return rgw_perms_from_aclspec_default_strategy(id, aclspec, dpp);
+    }
+
+    // admin status is a property of the identity, independent of acct_id
+    bool is_admin_of(const rgw_user& acct_id) const override {
+      return is_admin;
+    }
+
+    bool is_owner_of(const rgw_user& acct_id) const override {
+      return id == acct_id;
+    }
+
+    // matches wildcard principals, same-tenant principals, and the exact
+    // tenant+id user principal
+    bool is_identity(const idset_t& ids) const override {
+      for (auto& p : ids) {
+        if (p.is_wildcard()) {
+          return true;
+        } else if (p.is_tenant() && p.get_tenant() == id.tenant) {
+          return true;
+        } else if (p.is_user() &&
+                   (p.get_tenant() == id.tenant) &&
+                   (p.get_id() == id.id)) {
+          return true;
+        }
+      }
+      return false;
+    }
+
+    uint32_t get_perm_mask() const override {
+      return perm_mask;
+    }
+
+    uint32_t get_identity_type() const override {
+      return type;
+    }
+
+    // legacy identities carry no account name
+    string get_acct_name() const override {
+      return {};
+    }
+
+    // legacy identities carry no subuser
+    string get_subuser() const override {
+      return {};
+    }
+
+    void to_str(std::ostream& out) const override {
+      out << "RGWDummyIdentityApplier(auth_id=" << id
+          << ", perm_mask=" << perm_mask
+          << ", is_admin=" << is_admin << ")";
+    }
+  };
+
+  return std::unique_ptr<rgw::auth::Identity>(
+        new DummyIdentityApplier(cct,
+                                 auth_id,
+                                 perm_mask,
+                                 is_admin,
+                                 type));
+}
+
+// Convenience overload: pull the legacy identity fields straight out of the
+// request state and delegate to the five-argument form above.
+std::unique_ptr<rgw::auth::Identity>
+transform_old_authinfo(const req_state* const s)
+{
+  return transform_old_authinfo(s->cct,
+                                s->user->get_id(),
+                                s->perm_mask,
+  /* System user has admin permissions by default - it's supposed to pass
+   * through any security check. */
+                                s->system_request,
+                                s->user->get_type());
+}
+
+} /* namespace auth */
+} /* namespace rgw */
+
+
+// Default ACL-matching strategy: look the user's string form up directly in
+// the aclspec and return the permission bits granted there, or 0 when the
+// user has no entry.
+uint32_t rgw_perms_from_aclspec_default_strategy(
+  const rgw_user& uid,
+  const rgw::auth::Identity::aclspec_t& aclspec,
+  const DoutPrefixProvider *dpp)
+{
+  ldpp_dout(dpp, 5) << "Searching permissions for uid=" << uid <<  dendl;
+
+  const auto iter = aclspec.find(uid.to_str());
+  if (std::end(aclspec) != iter) {
+    ldpp_dout(dpp, 5) << "Found permission: " << iter->second << dendl;
+    return iter->second;
+  }
+
+  ldpp_dout(dpp, 5) << "Permissions for user not found" << dendl;
+  return 0;
+}
+
+
+// Build the canonical "tenant:id" spec string used for identity matching.
+static inline const std::string make_spec_item(const std::string& tenant,
+                                               const std::string& id)
+{
+  return tenant + ":" + id;
+}
+
+
+// Decide how the strategy reacts when an engine REJECTED the request.
+// Returns {try_next_engine, result_so_far}. A rejection is terminal for
+// every policy; only the reported result differs (FALLBACK keeps the
+// accumulated strategy result instead of the engine's rejection).
+static inline std::pair<bool, rgw::auth::Engine::result_t>
+strategy_handle_rejected(rgw::auth::Engine::result_t&& engine_result,
+                         const rgw::auth::Strategy::Control policy,
+                         rgw::auth::Engine::result_t&& strategy_result)
+{
+  using Control = rgw::auth::Strategy::Control;
+  switch (policy) {
+    case Control::REQUISITE:
+      /* Don't try next. */
+      return std::make_pair(false, std::move(engine_result));
+
+    case Control::SUFFICIENT:
+      /* Don't try next. */
+      return std::make_pair(false, std::move(engine_result));
+
+    case Control::FALLBACK:
+      /* Don't try next. */
+      return std::make_pair(false, std::move(strategy_result));
+
+    default:
+      /* Huh, memory corruption? */
+      ceph_abort();
+  }
+}
+
+// Decide how the strategy reacts when an engine DENIED the request.
+// Returns {try_next_engine, result_so_far}. REQUISITE stops immediately with
+// the denial; SUFFICIENT and FALLBACK continue to the next engine, carrying
+// the engine's denial or the accumulated result respectively.
+static inline std::pair<bool, rgw::auth::Engine::result_t>
+strategy_handle_denied(rgw::auth::Engine::result_t&& engine_result,
+                       const rgw::auth::Strategy::Control policy,
+                       rgw::auth::Engine::result_t&& strategy_result)
+{
+  using Control = rgw::auth::Strategy::Control;
+  switch (policy) {
+    case Control::REQUISITE:
+      /* Don't try next. */
+      return std::make_pair(false, std::move(engine_result));
+
+    case Control::SUFFICIENT:
+      /* Just try next. */
+      return std::make_pair(true, std::move(engine_result));
+
+    case Control::FALLBACK:
+      return std::make_pair(true, std::move(strategy_result));
+
+    default:
+      /* Huh, memory corruption? */
+      ceph_abort();
+  }
+}
+
+// Decide how the strategy reacts when an engine GRANTED the request.
+// Returns {try_next_engine, result_so_far}. SUFFICIENT and FALLBACK accept
+// the grant and stop; REQUISITE keeps the grant but still requires the
+// remaining engines to pass.
+static inline std::pair<bool, rgw::auth::Engine::result_t>
+strategy_handle_granted(rgw::auth::Engine::result_t&& engine_result,
+                        const rgw::auth::Strategy::Control policy,
+                        rgw::auth::Engine::result_t&& strategy_result)
+{
+  using Control = rgw::auth::Strategy::Control;
+  switch (policy) {
+    case Control::REQUISITE:
+      /* Try next. */
+      return std::make_pair(true, std::move(engine_result));
+
+    case Control::SUFFICIENT:
+      /* Don't try next. */
+      return std::make_pair(false, std::move(engine_result));
+
+    case Control::FALLBACK:
+      /* Don't try next. */
+      return std::make_pair(false, std::move(engine_result));
+
+    default:
+      /* Huh, memory corruption? */
+      ceph_abort();
+  }
+}
+
+// Run the engines of this strategy in registration order. Each engine's
+// outcome (REJECTED/DENIED/GRANTED) is combined with its Control policy via
+// the strategy_handle_* helpers above, which decide whether to continue and
+// what the running result becomes. Starts from (and may fall back to) a
+// plain deny. An engine throwing an int is treated as a deny with that code.
+rgw::auth::Engine::result_t
+rgw::auth::Strategy::authenticate(const DoutPrefixProvider* dpp, const req_state* const s, optional_yield y) const
+{
+  result_t strategy_result = result_t::deny();
+
+  for (const stack_item_t& kv : auth_stack) {
+    const rgw::auth::Engine& engine = kv.first;
+    const auto& policy = kv.second;
+
+    ldpp_dout(dpp, 20) << get_name() << ": trying " << engine.get_name() << dendl;
+
+    result_t engine_result = result_t::deny();
+    try {
+      engine_result = engine.authenticate(dpp, s, y);
+    } catch (const int err) {
+      engine_result = result_t::deny(err);
+    }
+
+    bool try_next = true;
+    switch (engine_result.get_status()) {
+      case result_t::Status::REJECTED: {
+        ldpp_dout(dpp, 20) << engine.get_name() << " rejected with reason="
+                      << engine_result.get_reason() << dendl;
+
+        std::tie(try_next, strategy_result) = \
+          strategy_handle_rejected(std::move(engine_result), policy,
+                                   std::move(strategy_result));
+        break;
+      }
+      case result_t::Status::DENIED: {
+        ldpp_dout(dpp, 20) << engine.get_name() << " denied with reason="
+                      << engine_result.get_reason() << dendl;
+
+        std::tie(try_next, strategy_result) = \
+          strategy_handle_denied(std::move(engine_result), policy,
+                                 std::move(strategy_result));
+        break;
+      }
+      case result_t::Status::GRANTED: {
+        ldpp_dout(dpp, 20) << engine.get_name() << " granted access" << dendl;
+
+        std::tie(try_next, strategy_result) = \
+          strategy_handle_granted(std::move(engine_result), policy,
+                                  std::move(strategy_result));
+        break;
+      }
+      default: {
+        // result_t has no other states; reaching this means corruption
+        ceph_abort();
+      }
+    }
+
+    if (! try_next) {
+      break;
+    }
+  }
+
+  return strategy_result;
+}
+
+// Authenticate the request with the given strategy and, on success, install
+// the resulting identity into req_state: load account info, set perm_mask,
+// let the applier/completer adjust the request, then hand ownership of both
+// to s->auth. Returns 0 on success or a negative error code; never throws
+// (all exceptions are translated here).
+int
+rgw::auth::Strategy::apply(const DoutPrefixProvider *dpp, const rgw::auth::Strategy& auth_strategy,
+                           req_state* const s, optional_yield y) noexcept
+{
+  try {
+    auto result = auth_strategy.authenticate(dpp, s, y);
+    if (result.get_status() != decltype(result)::Status::GRANTED) {
+      /* Access denied is acknowledged by returning a std::unique_ptr with
+       * nullptr inside. */
+      ldpp_dout(dpp, 5) << "Failed the auth strategy, reason="
+                    << result.get_reason() << dendl;
+      return result.get_reason();
+    }
+
+    try {
+      rgw::auth::IdentityApplier::aplptr_t applier = result.get_applier();
+      rgw::auth::Completer::cmplptr_t completer = result.get_completer();
+
+      /* Account used by a given RGWOp is decoupled from identity employed
+       * in the authorization phase (RGWOp::verify_permissions). */
+      applier->load_acct_info(dpp, s->user->get_info());
+      s->perm_mask = applier->get_perm_mask();
+
+      /* This is the single place where we pass req_state as a pointer
+       * to non-const and thus its modification is allowed. In the time
+       * of writing only RGWTempURLEngine needed that feature. */
+      applier->modify_request_state(dpp, s);
+      if (completer) {
+        completer->modify_request_state(dpp, s);
+      }
+
+      s->auth.identity = std::move(applier);
+      s->auth.completer = std::move(completer);
+
+      return 0;
+    } catch (const int err) {
+      ldpp_dout(dpp, 5) << "applier throwed err=" << err << dendl;
+      return err;
+    } catch (const std::exception& e) {
+      ldpp_dout(dpp, 5) << "applier throwed unexpected err: " << e.what()
+                    << dendl;
+      return -EPERM;
+    }
+  } catch (const int err) {
+    ldpp_dout(dpp, 5) << "auth engine throwed err=" << err << dendl;
+    return err;
+  } catch (const std::exception& e) {
+    ldpp_dout(dpp, 5) << "auth engine throwed unexpected err: " << e.what()
+                  << dendl;
+  }
+
+  /* We never should be here. */
+  return -EPERM;
+}
+
+// Append an engine (by reference -- the engine must outlive the strategy)
+// together with its Control policy to the evaluation stack.
+void
+rgw::auth::Strategy::add_engine(const Control ctrl_flag,
+                                const Engine& engine) noexcept
+{
+  auth_stack.push_back(std::make_pair(std::cref(engine), ctrl_flag));
+}
+
+// Human-readable identity description for logging.
+void rgw::auth::WebIdentityApplier::to_str(std::ostream& out) const
+{
+  out << "rgw::auth::WebIdentityApplier(sub =" << sub
+      << ", user_name=" << user_name
+      << ", provider_id =" << iss << ")";
+}
+
+// Return the token issuer (iss) with its URL scheme prefix stripped; used as
+// the key prefix for IAM condition matching.
+string rgw::auth::WebIdentityApplier::get_idp_url() const
+{
+  string idp_url = this->iss;
+  idp_url = url_remove_prefix(idp_url);
+  return idp_url;
+}
+
+// Create and persist a new shadow user of type TYPE_WEB for a federated
+// (web identity) principal, applying the configured default bucket limit and
+// quotas. Throws the negative error code if storing the user fails.
+void rgw::auth::WebIdentityApplier::create_account(const DoutPrefixProvider* dpp,
+                                              const rgw_user& acct_user,
+                                              const string& display_name,
+                                              RGWUserInfo& user_info) const      /* out */
+{
+  std::unique_ptr<rgw::sal::User> user = driver->get_user(acct_user);
+  user->get_info().display_name = display_name;
+  user->get_info().type = TYPE_WEB;
+  user->get_info().max_buckets =
+    cct->_conf.get_val<int64_t>("rgw_user_max_buckets");
+  rgw_apply_default_bucket_quota(user->get_info().quota.bucket_quota, cct->_conf);
+  rgw_apply_default_user_quota(user->get_info().quota.user_quota, cct->_conf);
+
+  int ret = user->store_user(dpp, null_yield, true);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to store new user info: user="
+                  << user << " ret=" << ret << dendl;
+    throw ret;
+  }
+  user_info = user->get_info();
+}
+
+// Resolve the account for a federated user. Lookup order:
+//  1. the shadow user in the "oidc" namespace,
+//  2. a legacy user with the same id outside that namespace,
+//  3. otherwise create the account -- in the "oidc" namespace if the id has
+//     no pre-existing buckets, else in the default namespace.
+void rgw::auth::WebIdentityApplier::load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const {
+  rgw_user federated_user;
+  federated_user.id = this->sub;
+  federated_user.tenant = role_tenant;
+  federated_user.ns = "oidc";
+
+  std::unique_ptr<rgw::sal::User> user = driver->get_user(federated_user);
+
+  //Check in oidc namespace
+  if (user->load_user(dpp, null_yield) >= 0) {
+    /* Succeeded. */
+    user_info = user->get_info();
+    return;
+  }
+
+  user->clear_ns();
+  //Check for old users which wouldn't have been created in oidc namespace
+  if (user->load_user(dpp, null_yield) >= 0) {
+    /* Succeeded. */
+    user_info = user->get_info();
+    return;
+  }
+
+  //Check if user_id.buckets already exists, may have been from the time, when shadow users didnt exist
+  RGWStorageStats stats;
+  int ret = user->read_stats(dpp, null_yield, &stats);
+  // NOTE(review): on a hard stats-read error we return without populating
+  // user_info -- the caller sees whatever was there before; confirm intended
+  if (ret < 0 && ret != -ENOENT) {
+    ldpp_dout(dpp, 0) << "ERROR: reading stats for the user returned error " << ret << dendl;
+    return;
+  }
+  if (ret == -ENOENT) { /* in case of ENOENT, which means user doesnt have buckets */
+    //In this case user will be created in oidc namespace
+    ldpp_dout(dpp, 5) << "NOTICE: incoming user has no buckets " << federated_user << dendl;
+    federated_user.ns = "oidc";
+  } else {
+    //User already has buckets associated, hence wont be created in oidc namespace.
+    ldpp_dout(dpp, 5) << "NOTICE: incoming user already has buckets associated " << federated_user << ", won't be created in oidc namespace"<< dendl;
+    federated_user.ns = "";
+  }
+
+  ldpp_dout(dpp, 0) << "NOTICE: couldn't map oidc federated user " << federated_user << dendl;
+  create_account(dpp, federated_user, this->user_name, user_info);
+}
+
+// Propagate web-identity token data into the request: expose token fields as
+// request args, token claims as "<idp_url>:<claim>" env entries for IAM
+// condition evaluation, and session/role tags as aws:PrincipalTag,
+// aws:RequestTag, aws:TagKeys and iam:ResourceTag entries.
+void rgw::auth::WebIdentityApplier::modify_request_state(const DoutPrefixProvider *dpp, req_state* s) const
+{
+  s->info.args.append("sub", this->sub);
+  s->info.args.append("aud", this->aud);
+  s->info.args.append("provider_id", this->iss);
+  s->info.args.append("client_id", this->client_id);
+
+  string condition;
+  string idp_url = get_idp_url();
+  for (auto& claim : token_claims) {
+    // "aud" additionally gets the historical ":app_id" alias
+    if (claim.first == "aud") {
+      condition.clear();
+      condition = idp_url + ":app_id";
+      s->env.emplace(condition, claim.second);
+    }
+    condition.clear();
+    condition = idp_url + ":" + claim.first;
+    s->env.emplace(condition, claim.second);
+  }
+
+  if (principal_tags) {
+    constexpr size_t KEY_SIZE = 128, VAL_SIZE = 256;
+    std::set<std::pair<string, string>> p_tags = principal_tags.get();
+    for (auto& it : p_tags) {
+      string key = it.first;
+      string val = it.second;
+      // tags in the reserved aws: namespace or over the size limits are
+      // skipped rather than rejected
+      if (key.find("aws:") == 0 || val.find("aws:") == 0) {
+        ldpp_dout(dpp, 0) << "ERROR: Tag/Value can't start with aws:, hence skipping it" << dendl;
+        continue;
+      }
+      if (key.size() > KEY_SIZE || val.size() > VAL_SIZE) {
+        ldpp_dout(dpp, 0) << "ERROR: Invalid tag/value size, hence skipping it" << dendl;
+        continue;
+      }
+      std::string p_key = "aws:PrincipalTag/";
+      p_key.append(key);
+      s->principal_tags.emplace_back(std::make_pair(p_key, val));
+      ldpp_dout(dpp, 10) << "Principal Tag Key: " << p_key << " Value: " << val << dendl;
+
+      std::string e_key = "aws:RequestTag/";
+      e_key.append(key);
+      s->env.emplace(e_key, val);
+      ldpp_dout(dpp, 10) << "RGW Env Tag Key: " << e_key << " Value: " << val << dendl;
+
+      s->env.emplace("aws:TagKeys", key);
+      ldpp_dout(dpp, 10) << "aws:TagKeys: " << key << dendl;
+
+      // hard cap of 50 principal tags per request
+      if (s->principal_tags.size() == 50) {
+        ldpp_dout(dpp, 0) << "ERROR: Number of tag/value pairs exceeding 50, hence skipping the rest" << dendl;
+        break;
+      }
+    }
+  }
+
+  if (role_tags) {
+    for (auto& it : role_tags.get()) {
+      std::string p_key = "aws:PrincipalTag/";
+      p_key.append(it.first);
+      s->principal_tags.emplace_back(std::make_pair(p_key, it.second));
+      ldpp_dout(dpp, 10) << "Principal Tag Key: " << p_key << " Value: " << it.second << dendl;
+
+      std::string e_key = "iam:ResourceTag/";
+      e_key.append(it.first);
+      s->env.emplace(e_key, it.second);
+      ldpp_dout(dpp, 10) << "RGW Env Tag Key: " << e_key << " Value: " << it.second << dendl;
+    }
+  }
+}
+
+// A web identity matches only a single OIDC-provider principal whose URL
+// equals this token's issuer; principal sets with more than one entry never
+// match.
+bool rgw::auth::WebIdentityApplier::is_identity(const idset_t& ids) const
+{
+  if (ids.size() > 1) {
+    return false;
+  }
+
+  for (auto id : ids) {
+    string idp_url = get_idp_url();
+    if (id.is_oidc_provider() && id.get_idp_url() == idp_url) {
+      return true;
+    }
+  }
+  return false;
+}
+
+const std::string rgw::auth::RemoteApplier::AuthInfo::NO_SUBUSER;
+const std::string rgw::auth::RemoteApplier::AuthInfo::NO_ACCESS_KEY;
+
+/* rgw::auth::RemoteAuthApplier */
+// Merge permissions from every ACL entry that can refer to this remote
+// identity: the plain account user, its implicit-tenant variant (tenant ==
+// id) when the account has no tenant, and any extra matching strategy the
+// auth engine supplied.
+uint32_t rgw::auth::RemoteApplier::get_perms_from_aclspec(const DoutPrefixProvider* dpp, const aclspec_t& aclspec) const
+{
+  uint32_t perm = 0;
+
+  /* For backward compatibility with ACLOwner. */
+  perm |= rgw_perms_from_aclspec_default_strategy(info.acct_user,
+                                                  aclspec, dpp);
+
+  /* We also need to cover cases where rgw_keystone_implicit_tenants
+   * was enabled. */
+  if (info.acct_user.tenant.empty()) {
+    const rgw_user tenanted_acct_user(info.acct_user.id, info.acct_user.id);
+
+    perm |= rgw_perms_from_aclspec_default_strategy(tenanted_acct_user,
+                                                    aclspec, dpp);
+  }
+
+  /* Now it's a time for invoking additional strategy that was supplied by
+   * a specific auth engine. */
+  if (extra_acl_strategy) {
+    perm |= extra_acl_strategy(aclspec);
+  }
+
+  ldpp_dout(dpp, 20) << "from ACL got perm=" << perm << dendl;
+  return perm;
+}
+
+// Admin status comes from the remote auth info, independent of uid.
+bool rgw::auth::RemoteApplier::is_admin_of(const rgw_user& uid) const
+{
+  return info.is_admin;
+}
+
+// Ownership check that also accepts the implicit-tenant spelling
+// (tenant == id) of a tenant-less account user.
+bool rgw::auth::RemoteApplier::is_owner_of(const rgw_user& uid) const
+{
+  if (info.acct_user.tenant.empty()) {
+    const rgw_user tenanted_acct_user(info.acct_user.id, info.acct_user.id);
+
+    if (tenanted_acct_user == uid) {
+      return true;
+    }
+  }
+
+  return info.acct_user == uid;
+}
+
+// Match wildcard, tenant and user principals against the remote account,
+// treating a tenant-less account as implicitly tenanted by its own id.
+bool rgw::auth::RemoteApplier::is_identity(const idset_t& ids) const {
+  for (auto& id : ids) {
+    if (id.is_wildcard()) {
+      return true;
+
+      // We also need to cover cases where rgw_keystone_implicit_tenants
+      // was enabled.
+    } else if (id.is_tenant() &&
+	       (info.acct_user.tenant.empty() ?
+		info.acct_user.id :
+		info.acct_user.tenant) == id.get_tenant()) {
+      return true;
+    } else if (id.is_user() &&
+	       info.acct_user.id == id.get_id() &&
+	       (info.acct_user.tenant.empty() ?
+		info.acct_user.id :
+		info.acct_user.tenant) == id.get_tenant()) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Human-readable identity description for logging.
+void rgw::auth::RemoteApplier::to_str(std::ostream& out) const
+{
+  out << "rgw::auth::RemoteApplier(acct_user=" << info.acct_user
+      << ", acct_name=" << info.acct_name
+      << ", perm_mask=" << info.perm_mask
+      << ", is_admin=" << info.is_admin << ")";
+}
+
+// Parse rgw_keystone_implicit_tenants into the cached bitmask 'saved'.
+// "both"/"true"/"1" enable it for s3 and swift, "none"/"false"/"0" disable,
+// "s3" or "swift" enable one protocol only ("split mode"); any other value
+// (including empty) yields IMPLICIT_TENANTS_BAD.
+void rgw::auth::ImplicitTenants::recompute_value(const ConfigProxy& c)
+{
+  std::string s = c.get_val<std::string>("rgw_keystone_implicit_tenants");
+  int v = 0;
+  if (boost::iequals(s, "both")
+      || boost::iequals(s, "true")
+      || boost::iequals(s, "1")) {
+    v = IMPLICIT_TENANTS_S3|IMPLICIT_TENANTS_SWIFT;
+  } else if (boost::iequals(s, "0")
+	     || boost::iequals(s, "none")
+	     || boost::iequals(s, "false")) {
+    v = 0;
+  } else if (boost::iequals(s, "s3")) {
+    v = IMPLICIT_TENANTS_S3;
+  } else if (boost::iequals(s, "swift")) {
+    v = IMPLICIT_TENANTS_SWIFT;
+  } else {	/* "" (and anything else) */
+    v = IMPLICIT_TENANTS_BAD;
+    // assert(0);
+  }
+  saved = v;
+}
+
+// md_config_obs_t interface: the single config key we observe.
+const char **rgw::auth::ImplicitTenants::get_tracked_conf_keys() const
+{
+  static const char *keys[] = {
+    "rgw_keystone_implicit_tenants",
+  nullptr };
+  return keys;
+}
+
+// md_config_obs_t interface: re-parse the setting when it changes at runtime.
+void rgw::auth::ImplicitTenants::handle_conf_change(const ConfigProxy& c,
+	const std::set <std::string> &changed)
+{
+  if (changed.count("rgw_keystone_implicit_tenants")) {
+    recompute_value(c);
+  }
+}
+
+// Create and persist a local account for a remotely-authenticated user
+// (keystone/ldap). With implicit_tenant, a tenant-less id gets its own id as
+// tenant. Applies default bucket limit and quotas; throws the negative error
+// code if storing fails. Note user_info is populated before the store, so on
+// throw it still holds the attempted values.
+void rgw::auth::RemoteApplier::create_account(const DoutPrefixProvider* dpp,
+                                              const rgw_user& acct_user,
+                                              bool implicit_tenant,
+                                              RGWUserInfo& user_info) const      /* out */
+{
+  rgw_user new_acct_user = acct_user;
+
+  /* An upper layer may enforce creating new accounts within their own
+   * tenants. */
+  if (new_acct_user.tenant.empty() && implicit_tenant) {
+    new_acct_user.tenant = new_acct_user.id;
+  }
+
+  std::unique_ptr<rgw::sal::User> user = driver->get_user(new_acct_user);
+  user->get_info().display_name = info.acct_name;
+  if (info.acct_type) {
+    //ldap/keystone for s3 users
+    user->get_info().type = info.acct_type;
+  }
+  user->get_info().max_buckets =
+    cct->_conf.get_val<int64_t>("rgw_user_max_buckets");
+  rgw_apply_default_bucket_quota(user->get_info().quota.bucket_quota, cct->_conf);
+  rgw_apply_default_user_quota(user->get_info().quota.user_quota, cct->_conf);
+  user_info = user->get_info();
+
+  int ret = user->store_user(dpp, null_yield, true);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to store new user info: user="
+                  << user << " ret=" << ret << dendl;
+    throw ret;
+  }
+}
+
+// Record the credentials used for this request in the ops log entry.
+void rgw::auth::RemoteApplier::write_ops_log_entry(rgw_log_entry& entry) const
+{
+  entry.access_key_id = info.access_key_id;
+  entry.subuser = info.subuser;
+}
+
+/* TODO(rzarzynski): we need to handle display_name changes. */
+// Resolve the local account for a remotely-authenticated identity, honoring
+// the implicit-tenant configuration (see the long comment in the body), and
+// create the account if no existing user matches.
+void rgw::auth::RemoteApplier::load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const      /* out */
+{
+  /* It's supposed that RGWRemoteAuthApplier tries to load account info
+   * that belongs to the authenticated identity. Another policy may be
+   * applied by using a RGWThirdPartyAccountAuthApplier decorator. */
+  const rgw_user& acct_user = info.acct_user;
+  auto implicit_value = implicit_tenant_context.get_value();
+  bool implicit_tenant = implicit_value.implicit_tenants_for_(implicit_tenant_bit);
+  bool split_mode = implicit_value.is_split_mode();
+  std::unique_ptr<rgw::sal::User> user;
+
+  /* Normally, empty "tenant" field of acct_user means the authenticated
+   * identity has the legacy, global tenant. However, due to inclusion
+   * of multi-tenancy, we got some special compatibility kludge for remote
+   * backends like Keystone.
+   * If the global tenant is the requested one, we try the same tenant as
+   * the user name first. If that RGWUserInfo exists, we use it. This way,
+   * migrated OpenStack users can get their namespaced containers and nobody's
+   * the wiser.
+   * If that fails, we look up in the requested (possibly empty) tenant.
+   * If that fails too, we create the account within the global or separated
+   * namespace depending on rgw_keystone_implicit_tenants.
+   * For compatibility with previous versions of ceph, it is possible
+   * to enable implicit_tenants for only s3 or only swift.
+   * in this mode ("split_mode"), we must constrain the id lookups to
+   * only use the identifier space that would be used if the id were
+   * to be created. */
+
+  if (split_mode && !implicit_tenant)
+	;	/* suppress lookup for id used by "other" protocol */
+  else if (acct_user.tenant.empty()) {
+    const rgw_user tenanted_uid(acct_user.id, acct_user.id);
+    user = driver->get_user(tenanted_uid);
+
+    if (user->load_user(dpp, null_yield) >= 0) {
+      /* Succeeded. */
+      user_info = user->get_info();
+      return;
+    }
+  }
+
+  user = driver->get_user(acct_user);
+
+  if (split_mode && implicit_tenant)
+	;	/* suppress lookup for id used by "other" protocol */
+  else if (user->load_user(dpp, null_yield) >= 0) {
+    /* Succeeded. */
+    user_info = user->get_info();
+    return;
+  }
+
+  ldpp_dout(dpp, 0) << "NOTICE: couldn't map swift user " << acct_user << dendl;
+  create_account(dpp, acct_user, implicit_tenant, user_info);
+
+  /* Succeeded if we are here (create_account() hasn't thrown). */
+}
+
+/* rgw::auth::LocalApplier */
+/* static declaration */
+const std::string rgw::auth::LocalApplier::NO_SUBUSER;
+const std::string rgw::auth::LocalApplier::NO_ACCESS_KEY;
+
+// Local identities use the plain uid lookup; no implicit-tenant variants.
+uint32_t rgw::auth::LocalApplier::get_perms_from_aclspec(const DoutPrefixProvider* dpp, const aclspec_t& aclspec) const
+{
+  return rgw_perms_from_aclspec_default_strategy(user_info.user_id, aclspec, dpp);
+}
+
+// Admin and system users are admins of every account; uid is ignored.
+bool rgw::auth::LocalApplier::is_admin_of(const rgw_user& uid) const
+{
+  return user_info.admin || user_info.system;
+}
+
+bool rgw::auth::LocalApplier::is_owner_of(const rgw_user& uid) const
+{
+  return uid == user_info.user_id;
+}
+
+// Match wildcard, tenant and user principals against the local user.
+// Within the right tenant, a user principal matches the exact id, the
+// "<id>:*" subuser wildcard, or "<id>:<subuser>" when a subuser is in play.
+bool rgw::auth::LocalApplier::is_identity(const idset_t& ids) const {
+  for (auto& id : ids) {
+    if (id.is_wildcard()) {
+      return true;
+    } else if (id.is_tenant() &&
+	       id.get_tenant() == user_info.user_id.tenant) {
+      return true;
+    } else if (id.is_user() &&
+	       (id.get_tenant() == user_info.user_id.tenant)) {
+      if (id.get_id() == user_info.user_id.id) {
+        return true;
+      }
+      std::string wildcard_subuser = user_info.user_id.id;
+      wildcard_subuser.append(":*");
+      if (wildcard_subuser == id.get_id()) {
+        return true;
+      } else if (subuser != NO_SUBUSER) {
+        std::string user = user_info.user_id.id;
+        user.append(":");
+        user.append(subuser);
+        if (user == id.get_id()) {
+          return true;
+        }
+      }
+    }
+  }
+  return false;
+}
+
+// Human-readable identity description for logging.
+void rgw::auth::LocalApplier::to_str(std::ostream& out) const {
+  out << "rgw::auth::LocalApplier(acct_user=" << user_info.user_id
+      << ", acct_name=" << user_info.display_name
+      << ", subuser=" << subuser
+      << ", perm_mask=" << get_perm_mask()
+      << ", is_admin=" << static_cast<bool>(user_info.admin) << ")";
+}
+
+// Permission mask for the effective credential: a named subuser yields its
+// configured perm_mask (or none if the subuser is unknown); the account
+// itself retains full control for backward compatibility.
+uint32_t rgw::auth::LocalApplier::get_perm_mask(const std::string& subuser_name,
+                                                const RGWUserInfo &uinfo) const
+{
+  if (! subuser_name.empty() && subuser_name != NO_SUBUSER) {
+    const auto iter = uinfo.subusers.find(subuser_name);
+
+    if (iter != std::end(uinfo.subusers)) {
+      return iter->second.perm_mask;
+    } else {
+      /* Subuser specified but not found. */
+      return RGW_PERM_NONE;
+    }
+  } else {
+    /* Due to backward compatibility. */
+    return RGW_PERM_FULL_CONTROL;
+  }
+}
+
+void rgw::auth::LocalApplier::load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const /* out */
+{
+  /* Load the account that belongs to the authenticated identity. An extra call
+   * to RADOS may be safely skipped in this case. */
+  user_info = this->user_info;
+}
+
+// Record the credentials used for this request in the ops log entry.
+void rgw::auth::LocalApplier::write_ops_log_entry(rgw_log_entry& entry) const
+{
+  entry.access_key_id = access_key_id;
+  entry.subuser = subuser;
+}
+
+// Human-readable identity description for logging.
+void rgw::auth::RoleApplier::to_str(std::ostream& out) const {
+  out << "rgw::auth::RoleApplier(role name =" << role.name;
+  for (auto& policy: role.role_policies) {
+    out << ", role policy =" << policy;
+  }
+  out << ", token policy =" << token_attrs.token_policy;
+  out << ")";
+}
+
+// Match wildcard, role ("name" in "tenant"), assumed-role
+// ("name/session" in "tenant") and plain user principals; for the latter the
+// user id is compared in its namespaced "ns$id" form when a namespace is set.
+bool rgw::auth::RoleApplier::is_identity(const idset_t& ids) const {
+  for (auto& p : ids) {
+    if (p.is_wildcard()) {
+      return true;
+    } else if (p.is_role()) {
+      string name = p.get_id();
+      string tenant = p.get_tenant();
+      if (name == role.name && tenant == role.tenant) {
+        return true;
+      }
+    } else if (p.is_assumed_role()) {
+      string tenant = p.get_tenant();
+      string role_session = role.name + "/" + token_attrs.role_session_name; //role/role-session
+      if (role.tenant == tenant && role_session == p.get_role_session()) {
+        return true;
+      }
+    } else {
+      string id = p.get_id();
+      string tenant = p.get_tenant();
+      string oidc_id;
+      if (token_attrs.user_id.ns.empty()) {
+        oidc_id = token_attrs.user_id.id;
+      } else {
+        oidc_id = token_attrs.user_id.ns + "$" + token_attrs.user_id.id;
+      }
+      if (oidc_id == id && token_attrs.user_id.tenant == tenant) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+// For assumed roles only the user id is loaded; no local account lookup.
+void rgw::auth::RoleApplier::load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const /* out */
+{
+  /* Load the user id */
+  user_info.user_id = this->token_attrs.user_id;
+}
+
+// Install the assumed role's context into the request: parse the role's
+// managed policies and the (optional) session policy into s->iam_user_policies
+// / s->session_policies, expose aws:userid and aws:TokenIssueTime in the IAM
+// env, propagate session principal tags, and record STS token claims.
+void rgw::auth::RoleApplier::modify_request_state(const DoutPrefixProvider *dpp, req_state* s) const
+{
+  for (auto it: role.role_policies) {
+    try {
+      bufferlist bl = bufferlist::static_from_string(it);
+      const rgw::IAM::Policy p(s->cct, role.tenant, bl, false);
+      s->iam_user_policies.push_back(std::move(p));
+    } catch (rgw::IAM::PolicyParseException& e) {
+      //Control shouldn't reach here as the policy has already been
+      //verified earlier
+      ldpp_dout(dpp, 20) << "failed to parse role policy: " << e.what() << dendl;
+    }
+  }
+
+  if (!this->token_attrs.token_policy.empty()) {
+    try {
+      string policy = this->token_attrs.token_policy;
+      bufferlist bl = bufferlist::static_from_string(policy);
+      const rgw::IAM::Policy p(s->cct, role.tenant, bl, false);
+      s->session_policies.push_back(std::move(p));
+    } catch (rgw::IAM::PolicyParseException& e) {
+      //Control shouldn't reach here as the policy has already been
+      //verified earlier
+      ldpp_dout(dpp, 20) << "failed to parse token policy: " << e.what() << dendl;
+    }
+  }
+
+  string condition = "aws:userid";
+  string value = role.id + ":" + token_attrs.role_session_name;
+  s->env.emplace(condition, value);
+
+  s->env.emplace("aws:TokenIssueTime", token_attrs.token_issued_at);
+
+  for (auto& m : token_attrs.principal_tags) {
+    s->env.emplace(m.first, m.second);
+    ldpp_dout(dpp, 10) << "Principal Tag Key: " << m.first << " Value: " << m.second << dendl;
+    std::size_t pos = m.first.find('/');
+    // tag key follows the "aws:PrincipalTag/" style prefix
+    string key = m.first.substr(pos + 1);
+    s->env.emplace("aws:TagKeys", key);
+    ldpp_dout(dpp, 10) << "aws:TagKeys: " << key << dendl;
+  }
+
+  s->token_claims.emplace_back("sts");
+  s->token_claims.emplace_back("role_name:" + role.tenant + "$" + role.name);
+  s->token_claims.emplace_back("role_session:" + token_attrs.role_session_name);
+  for (auto& it : token_attrs.token_claims) {
+    s->token_claims.emplace_back(it);
+  }
+}
+
+// Grant anonymous access when applicable: build the anonymous RGWUserInfo
+// and wrap it in a LocalApplier with no subuser and no access key. Denies
+// with -EPERM when is_applicable() says the request carries credentials.
+rgw::auth::Engine::result_t
+rgw::auth::AnonymousEngine::authenticate(const DoutPrefixProvider* dpp, const req_state* const s, optional_yield y) const
+{
+  if (! is_applicable(s)) {
+    return result_t::deny(-EPERM);
+  } else {
+    RGWUserInfo user_info;
+    rgw_get_anon_user(user_info);
+
+    auto apl = \
+      apl_factory->create_apl_local(cct, s, user_info,
+                                    rgw::auth::LocalApplier::NO_SUBUSER,
+                                    std::nullopt, rgw::auth::LocalApplier::NO_ACCESS_KEY);
+    return result_t::grant(std::move(apl));
+  }
+}
diff --git a/src/rgw/rgw_auth.h b/src/rgw/rgw_auth.h
new file mode 100644
index 000000000..82e0d0c97
--- /dev/null
+++ b/src/rgw/rgw_auth.h
@@ -0,0 +1,791 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <functional>
+#include <optional>
+#include <ostream>
+#include <type_traits>
+#include <system_error>
+#include <utility>
+
+#include "rgw_common.h"
+#include "rgw_web_idp.h"
+
+#define RGW_USER_ANON_ID "anonymous"
+
+class RGWCtl;
+struct rgw_log_entry;
+struct req_state;
+
+namespace rgw {
+namespace auth {
+
+using Exception = std::system_error;
+
+
+/* Load information about identity that will be used by RGWOp to authorize
+ * any operation that comes from an authenticated user. */
+class Identity {
+public:
+  /* ACL grantee spec: maps a grantee identifier to its permission bits. */
+  typedef std::map<std::string, int> aclspec_t;
+  using idset_t = boost::container::flat_set<Principal>;
+
+  virtual ~Identity() = default;
+
+  /* Translate the ACL provided in @aclspec into concrete permission set that
+   * can be used during the authorization phase (RGWOp::verify_permission).
+   * On error throws rgw::auth::Exception storing the reason.
+   *
+   * NOTE: an implementation is responsible for giving the real semantic to
+   * the items in @aclspec. That is, their meaning may depend on particular
+   * applier that is being used. */
+  virtual uint32_t get_perms_from_aclspec(const DoutPrefixProvider* dpp, const aclspec_t& aclspec) const = 0;
+
+  /* Verify whether a given identity *can be treated as* an admin of rgw_user
+   * (account in Swift's terminology) specified in @uid. On error throws
+   * rgw::auth::Exception storing the reason. */
+  virtual bool is_admin_of(const rgw_user& uid) const = 0;
+
+  /* Verify whether a given identity *is* the owner of the rgw_user (account
+   * in the Swift's terminology) specified in @uid. On internal error throws
+   * rgw::auth::Exception storing the reason. */
+  virtual bool is_owner_of(const rgw_user& uid) const = 0;
+
+  /* Return the permission mask that is used to narrow down the set of
+   * operations allowed for a given identity. This method reflects the idea
+   * of subuser tied to RGWUserInfo. On error throws rgw::auth::Exception
+   * with the reason. */
+  virtual uint32_t get_perm_mask() const = 0;
+
+  virtual bool is_anonymous() const {
+    /* If the identity owns the anonymous account (rgw_user), it's considered
+     * the anonymous identity. On error throws rgw::auth::Exception storing
+     * the reason. */
+    return is_owner_of(rgw_user(RGW_USER_ANON_ID));
+  }
+
+  /* Write a human-readable description of this identity to @out. */
+  virtual void to_str(std::ostream& out) const = 0;
+
+  /* Verify whether a given identity corresponds to an identity in the
+     provided set */
+  virtual bool is_identity(const idset_t& ids) const = 0;
+
+  /* Identity Type: RGW/ LDAP/ Keystone */
+  virtual uint32_t get_identity_type() const = 0;
+
+  /* Name of Account */
+  virtual std::string get_acct_name() const = 0;
+
+  /* Subuser of Account */
+  virtual std::string get_subuser() const = 0;
+
+  /* Tenant of the assumed role; empty for non-role identities (see
+   * RoleApplier::get_role_tenant for the overriding case). */
+  virtual std::string get_role_tenant() const { return ""; }
+
+  /* write any auth-specific fields that are safe to expose in the ops log */
+  virtual void write_ops_log_entry(rgw_log_entry& entry) const {};
+};
+
+/* Stream insertion: delegates to Identity::to_str(). */
+inline std::ostream& operator<<(std::ostream& out,
+                                const rgw::auth::Identity& id) {
+  id.to_str(out);
+  return out;
+}
+
+
+std::unique_ptr<rgw::auth::Identity>
+transform_old_authinfo(CephContext* const cct,
+ const rgw_user& auth_id,
+ const int perm_mask,
+ const bool is_admin,
+ const uint32_t type);
+std::unique_ptr<Identity> transform_old_authinfo(const req_state* const s);
+
+
+/* Interface for classes applying changes to request state/RADOS store
+ * imposed by a particular rgw::auth::Engine.
+ *
+ * In contrast to rgw::auth::Engine, implementations of this interface
+ * are allowed to handle req_state or RGWUserCtl in the read-write manner.
+ *
+ * It's expected that most (if not all) of implementations will also
+ * conform to rgw::auth::Identity interface to provide authorization
+ * policy (ACLs, account's ownership and entitlement). */
+class IdentityApplier : public Identity {
+public:
+  /* Appliers are single-owner; Engine::AuthResult transfers this pointer. */
+  typedef std::unique_ptr<IdentityApplier> aplptr_t;
+
+  virtual ~IdentityApplier() {};
+
+  /* Fill provided RGWUserInfo with information about the account that
+   * RGWOp will operate on. Errors are handled solely through exceptions.
+   *
+   * XXX: be aware that the "account" term refers to rgw_user. The naming
+   * is legacy. */
+  virtual void load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const = 0; /* out */
+
+  /* Apply any changes to request state. This method will be most useful for
+   * TempURL of Swift API. The default implementation is a no-op. */
+  virtual void modify_request_state(const DoutPrefixProvider* dpp, req_state* s) const {} /* in/out */
+};
+
+
+/* Interface class for completing the two-step authentication process.
+ * Completer provides the second step - the complete() method that should
+ * be called after Engine::authenticate() but before *committing* results
+ * of an RGWOp (or sending a response in the case of non-mutating ops).
+ *
+ * The motivation driving the interface is to address those authentication
+ * schemas that require message integrity verification *without* in-memory
+ * data buffering. Typical examples are AWS Auth v4 and the auth mechanism
+ * of browser uploads facilities both in S3 and Swift APIs (see RGWPostObj).
+ * The workflow of request from the authentication point-of-view does look
+ * like following one:
+ * A. authenticate (Engine::authenticate),
+ * B. authorize (see RGWOp::verify_permissions),
+ * C. execute-prepare (init potential data modifications),
+ * D. authenticate-complete - (Completer::complete),
+ * E. execute-commit - commit the modifications from point C. */
+class Completer {
+public:
+  /* It's expected that Completers would tend to implement many interfaces
+   * and be used not only in req_state::auth::completer. Ref counting their
+   * instances would be helpful. */
+  typedef std::shared_ptr<Completer> cmplptr_t;
+
+  virtual ~Completer() = default;
+
+  /* Complete the authentication process. Return boolean indicating whether
+   * the completion succeeded. On error throws rgw::auth::Exception storing
+   * the reason. */
+  virtual bool complete() = 0;
+
+  /* Apply any changes to request state. The initial use case was injecting
+   * the AWSv4 filter over rgw::io::RestfulClient in req_state. Unlike
+   * IdentityApplier::modify_request_state, this member is non-const. */
+  virtual void modify_request_state(const DoutPrefixProvider* dpp, req_state* s) = 0; /* in/out */
+};
+
+
+/* Interface class for authentication backends (auth engines) in RadosGW.
+ *
+ * An engine is supposed only to authenticate (not authorize!) requests
+ * basing on their req_state and - if access has been granted - provide
+ * an upper layer with:
+ * - rgw::auth::IdentityApplier to commit all changes to the request state as
+ *   well as to the RADOS store (creating an account, synchronizing
+ *   user-related information with external databases and so on).
+ * - rgw::auth::Completer (optionally) to finish the authentication
+ *   of the request. Typical use case is verifying message integrity
+ *   in AWS Auth v4 and browser uploads (RGWPostObj).
+ *
+ * Both of them are supposed to be wrapped in Engine::AuthResult.
+ *
+ * The authentication process consists of two steps:
+ * - Engine::authenticate() which should be called before *initiating*
+ *   any modifications to RADOS store that are related to an operation
+ *   a client wants to perform (RGWOp::execute).
+ * - Completer::complete() supposed to be called, if completer has been
+ *   returned, after the authenticate() step but before *committing*
+ *   those modifications or sending a response (RGWOp::complete).
+ *
+ * An engine outlives both Applier and Completer. It's intended to live
+ * since RadosGW's initialization and handle multiple requests till
+ * a reconfiguration.
+ *
+ * Auth engine MUST NOT make any changes to req_state nor RADOS store.
+ * This is solely an Applier's responsibility!
+ *
+ * Separation between authentication and global state modification has
+ * been introduced because many auth engines are orthogonal to appliers
+ * and thus they can be decoupled. Additional motivation is to clearly
+ * distinguish all portions of code modifying data structures. */
+class Engine {
+public:
+  virtual ~Engine() = default;
+
+  class AuthResult {
+    /* Tag type disambiguating the rejecting constructor from the denying
+     * one - both would otherwise take a single int. */
+    struct rejection_mark_t {};
+    bool is_rejected = false;
+    int reason = 0;
+
+    std::pair<IdentityApplier::aplptr_t, Completer::cmplptr_t> result_pair;
+
+    explicit AuthResult(const int reason)
+      : reason(reason) {
+    }
+
+    AuthResult(rejection_mark_t&&, const int reason)
+      : is_rejected(true),
+        reason(reason) {
+    }
+
+    /* Allow only the reasonable combinations - returning just Completer
+     * without accompanying IdentityApplier is strictly prohibited! */
+    explicit AuthResult(IdentityApplier::aplptr_t&& applier)
+      : result_pair(std::move(applier), nullptr) {
+    }
+
+    AuthResult(IdentityApplier::aplptr_t&& applier,
+               Completer::cmplptr_t&& completer)
+      : result_pair(std::move(applier), std::move(completer)) {
+    }
+
+  public:
+    enum class Status {
+      /* Engine doesn't grant the access but also doesn't reject it. */
+      DENIED,
+
+      /* Engine successfully authenticated requester. */
+      GRANTED,
+
+      /* Engine strictly indicates that a request should be rejected
+       * without trying any further engine. */
+      REJECTED
+    };
+
+    Status get_status() const {
+      if (is_rejected) {
+        return Status::REJECTED;
+      } else if (! result_pair.first) {
+        return Status::DENIED;
+      } else {
+        return Status::GRANTED;
+      }
+    }
+
+    /* Errno-style code; meaningful for DENIED and REJECTED results. */
+    int get_reason() const {
+      return reason;
+    }
+
+    /* NOTE: both getters move ownership out of the result, so each may be
+     * usefully called only once. */
+    IdentityApplier::aplptr_t get_applier() {
+      return std::move(result_pair.first);
+    }
+
+    Completer::cmplptr_t&& get_completer() {
+      return std::move(result_pair.second);
+    }
+
+    static AuthResult reject(const int reason = -EACCES) {
+      return AuthResult(rejection_mark_t(), reason);
+    }
+
+    static AuthResult deny(const int reason = -EACCES) {
+      return AuthResult(reason);
+    }
+
+    static AuthResult grant(IdentityApplier::aplptr_t&& applier) {
+      return AuthResult(std::move(applier));
+    }
+
+    static AuthResult grant(IdentityApplier::aplptr_t&& applier,
+                            Completer::cmplptr_t&& completer) {
+      return AuthResult(std::move(applier), std::move(completer));
+    }
+  };
+
+  using result_t = AuthResult;
+
+  /* Get name of the auth engine. */
+  virtual const char* get_name() const noexcept = 0;
+
+  /* Throwing method for identity verification. When the check is positive
+   * an implementation should return Engine::result_t containing:
+   * - a non-null pointer to an object conforming the Applier interface.
+   *   Otherwise, the authentication is treated as failed.
+   * - a (potentially null) pointer to an object conforming the Completer
+   *   interface.
+   *
+   * On error throws rgw::auth::Exception containing the reason. */
+  virtual result_t authenticate(const DoutPrefixProvider* dpp, const req_state* s, optional_yield y) const = 0;
+};
+
+
+/* Interface for extracting a token basing from data carried by req_state. */
+class TokenExtractor {
+public:
+  virtual ~TokenExtractor() = default;
+  /* Return the auth token carried by request @s (empty if none found). */
+  virtual std::string get_token(const req_state* s) const = 0;
+};
+
+
+/* Abstract class for stacking sub-engines to expose them as a single
+ * Engine. It is responsible for ordering its sub-engines and managing
+ * fall-backs between them. Derivatee is supposed to encapsulate engine
+ * instances and add them using the add_engine() method in the order it
+ * wants to be tried during the call to authenticate().
+ *
+ * Each new Strategy should be exposed to StrategyRegistry for handling
+ * the dynamic reconfiguration. */
+class Strategy : public Engine {
+public:
+  /* Specifiers controlling what happens when an associated engine fails.
+   * The names and semantic has been borrowed mostly from libpam. */
+  enum class Control {
+    /* Failure of an engine injected with the REQUISITE specifier aborts
+     * the strategy's authentication process immediately. No other engine
+     * will be tried. */
+    REQUISITE,
+
+    /* Success of an engine injected with the SUFFICIENT specifier ends
+     * strategy's authentication process successfully. However, denying
+     * doesn't abort it -- there will be fall-back to following engine
+     * if the one that failed wasn't the last one. */
+    SUFFICIENT,
+
+    /* Like SUFFICIENT with the exception that on failure the reason code
+     * is not overridden. Instead, it's taken directly from the last tried
+     * non-FALLBACK engine. If there was no previous non-FALLBACK engine
+     * in a Strategy, then the result_t::deny(reason = -EACCES) is used. */
+    FALLBACK,
+  };
+
+  /* Try the stacked engines in registration order, honouring each entry's
+   * Control specifier (see above). */
+  Engine::result_t authenticate(const DoutPrefixProvider* dpp, const req_state* s, optional_yield y) const override final;
+
+  bool is_empty() const {
+    return auth_stack.empty();
+  }
+
+  /* Run @auth_strategy against @s and apply the resulting identity to the
+   * request state. Returns an errno-style code; never throws. */
+  static int apply(const DoutPrefixProvider* dpp, const Strategy& auth_strategy, req_state* s, optional_yield y) noexcept;
+
+private:
+  /* Using the reference wrapper here to explicitly point out we are not
+   * interested in storing nulls while preserving the dynamic polymorphism. */
+  using stack_item_t = std::pair<std::reference_wrapper<const Engine>,
+                                 Control>;
+  std::vector<stack_item_t> auth_stack;
+
+protected:
+  /* Append @engine with @ctrl_flag; order of calls fixes the try order. */
+  void add_engine(Control ctrl_flag, const Engine& engine) noexcept;
+};
+
+
+/* A class aggregating the knowledge about all Strategies in RadosGW. It is
+ * responsible for handling the dynamic reconfiguration on e.g. realm update.
+ * The definition is in rgw/rgw_auth_registry.h,
+ *
+ * Each new Strategy should be exposed to it. */
+class StrategyRegistry;
+
+/* Applier for identities established via a web-identity (OIDC) token;
+ * caches the well-known claims extracted from @token_claims. */
+class WebIdentityApplier : public IdentityApplier {
+  /* Claims cached out of token_claims by the constructor. */
+  std::string sub;
+  std::string iss;
+  std::string aud;
+  std::string client_id;
+  std::string user_name;
+protected:
+  CephContext* const cct;
+  rgw::sal::Driver* driver;
+  std::string role_session;
+  std::string role_tenant;
+  std::unordered_multimap<std::string, std::string> token_claims;
+  boost::optional<std::multimap<std::string,std::string>> role_tags;
+  boost::optional<std::set<std::pair<std::string, std::string>>> principal_tags;
+
+  std::string get_idp_url() const;
+
+  /* Create the local shadow account for the federated user. */
+  void create_account(const DoutPrefixProvider* dpp,
+                      const rgw_user& acct_user,
+                      const std::string& display_name,
+                      RGWUserInfo& user_info) const; /* out */
+public:
+  WebIdentityApplier( CephContext* const cct,
+                      rgw::sal::Driver* driver,
+                      const std::string& role_session,
+                      const std::string& role_tenant,
+                      const std::unordered_multimap<std::string, std::string>& token_claims,
+                      boost::optional<std::multimap<std::string,std::string>> role_tags,
+                      boost::optional<std::set<std::pair<std::string, std::string>>> principal_tags)
+    : cct(cct),
+      driver(driver),
+      role_session(role_session),
+      role_tenant(role_tenant),
+      token_claims(token_claims),
+      role_tags(role_tags),
+      principal_tags(principal_tags) {
+    const auto& sub = token_claims.find("sub");
+    if(sub != token_claims.end()) {
+      this->sub = sub->second;
+    }
+
+    const auto& iss = token_claims.find("iss");
+    if(iss != token_claims.end()) {
+      this->iss = iss->second;
+    }
+
+    const auto& aud = token_claims.find("aud");
+    if(aud != token_claims.end()) {
+      this->aud = aud->second;
+    }
+
+    /* Prefer an explicit "client_id" claim; fall back to the OIDC "azp"
+     * (authorized party) claim when it is absent. */
+    const auto& client_id = token_claims.find("client_id");
+    if(client_id != token_claims.end()) {
+      this->client_id = client_id->second;
+    } else {
+      const auto& azp = token_claims.find("azp");
+      if (azp != token_claims.end()) {
+        this->client_id = azp->second;
+      }
+    }
+
+    /* Prefer "username"; fall back to "given_username". */
+    const auto& user_name = token_claims.find("username");
+    if(user_name != token_claims.end()) {
+      this->user_name = user_name->second;
+    } else {
+      const auto& given_username = token_claims.find("given_username");
+      if (given_username != token_claims.end()) {
+        this->user_name = given_username->second;
+      }
+    }
+  }
+
+  void modify_request_state(const DoutPrefixProvider *dpp, req_state* s) const override;
+
+  uint32_t get_perms_from_aclspec(const DoutPrefixProvider* dpp, const aclspec_t& aclspec) const override {
+    return RGW_PERM_NONE;
+  }
+
+  bool is_admin_of(const rgw_user& uid) const override {
+    return false;
+  }
+
+  /* A web identity owns only the user derived from its "sub" claim, scoped
+   * to the role's tenant within the "oidc" namespace. */
+  bool is_owner_of(const rgw_user& uid) const override {
+    if (uid.id == this->sub && uid.tenant == role_tenant && uid.ns == "oidc") {
+      return true;
+    }
+    return false;
+  }
+
+  uint32_t get_perm_mask() const override {
+    return RGW_PERM_NONE;
+  }
+
+  void to_str(std::ostream& out) const override;
+
+  bool is_identity(const idset_t& ids) const override;
+
+  void load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const override;
+
+  uint32_t get_identity_type() const override {
+    return TYPE_WEB;
+  }
+
+  std::string get_acct_name() const override {
+    return this->user_name;
+  }
+
+  std::string get_subuser() const override {
+    return {};
+  }
+
+  struct Factory {
+    virtual ~Factory() {}
+
+    virtual aplptr_t create_apl_web_identity( CephContext* cct,
+                                              const req_state* s,
+                                              const std::string& role_session,
+                                              const std::string& role_tenant,
+                                              const std::unordered_multimap<std::string, std::string>& token,
+                                              boost::optional<std::multimap<std::string, std::string>>,
+                                              boost::optional<std::set<std::pair<std::string, std::string>>> principal_tags) const = 0;
+  };
+};
+
+/* Config observer caching, as a bitmask, which protocols (Swift and/or S3)
+ * should place newly created users in their own implicit tenant. */
+class ImplicitTenants: public md_config_obs_t {
+public:
+  enum implicit_tenant_flag_bits {IMPLICIT_TENANTS_SWIFT=1,
+    IMPLICIT_TENANTS_S3=2, IMPLICIT_TENANTS_BAD = -1, };
+private:
+  /* Cached bitmask; refreshed by recompute_value() on config changes. */
+  int saved;
+  void recompute_value(const ConfigProxy& );
+  class ImplicitTenantValue {
+    friend class ImplicitTenants;
+    int v;
+    ImplicitTenantValue(int v) : v(v) {};
+  public:
+    /* True when exactly one of the Swift/S3 bits is set (v is 1 or 2). */
+    bool inline is_split_mode()
+    {
+      assert(v != IMPLICIT_TENANTS_BAD);
+      return v == IMPLICIT_TENANTS_SWIFT || v == IMPLICIT_TENANTS_S3;
+    }
+    /* True when implicit tenants are enabled for the protocol in @bit. */
+    bool inline implicit_tenants_for_(const implicit_tenant_flag_bits bit)
+    {
+      assert(v != IMPLICIT_TENANTS_BAD);
+      return static_cast<bool>(v&bit);
+    }
+  };
+public:
+  ImplicitTenants(const ConfigProxy& c) { recompute_value(c);}
+  /* Snapshot of the current setting, safe to query repeatedly. */
+  ImplicitTenantValue get_value() const {
+    return ImplicitTenantValue(saved);
+  }
+private:
+  /* md_config_obs_t interface: re-read the option when it changes. */
+  const char** get_tracked_conf_keys() const override;
+  void handle_conf_change(const ConfigProxy& conf,
+                          const std::set <std::string> &changed) override;
+};
+
+std::tuple<bool,bool> implicit_tenants_enabled_for_swift(CephContext * const cct);
+std::tuple<bool,bool> implicit_tenants_enabled_for_s3(CephContext * const cct);
+
+/* rgw::auth::RemoteApplier targets those authentication engines which don't
+ * need to ask the RADOS store while performing the auth process. Instead,
+ * they obtain credentials from an external source like Keystone or LDAP.
+ *
+ * As the authenticated user may not have an account yet, RGWRemoteAuthApplier
+ * must be able to create it basing on data passed by an auth engine. Those
+ * data will be used to fill RGWUserInfo structure. */
+class RemoteApplier : public IdentityApplier {
+public:
+  /* Immutable account/credential data handed over by a remote auth engine. */
+  class AuthInfo {
+    friend class RemoteApplier;
+  protected:
+    const rgw_user acct_user;
+    const std::string acct_name;
+    const uint32_t perm_mask;
+    const bool is_admin;
+    const uint32_t acct_type;
+    const std::string access_key_id;
+    const std::string subuser;
+
+  public:
+    enum class acct_privilege_t {
+      IS_ADMIN_ACCT,
+      IS_PLAIN_ACCT
+    };
+
+    /* Sentinels meaning "no subuser" / "no access key". */
+    static const std::string NO_SUBUSER;
+    static const std::string NO_ACCESS_KEY;
+
+    AuthInfo(const rgw_user& acct_user,
+             const std::string& acct_name,
+             const uint32_t perm_mask,
+             const acct_privilege_t level,
+             const std::string access_key_id,
+             const std::string subuser,
+             const uint32_t acct_type=TYPE_NONE)
+      : acct_user(acct_user),
+        acct_name(acct_name),
+        perm_mask(perm_mask),
+        is_admin(acct_privilege_t::IS_ADMIN_ACCT == level),
+        acct_type(acct_type),
+        access_key_id(access_key_id),
+        subuser(subuser) {
+    }
+  };
+
+  using aclspec_t = rgw::auth::Identity::aclspec_t;
+  typedef std::function<uint32_t(const aclspec_t&)> acl_strategy_t;
+
+protected:
+  CephContext* const cct;
+
+  /* Read-write is intentional here due to RGWUserInfo creation process. */
+  rgw::sal::Driver* driver;
+
+  /* Supplemental strategy for extracting permissions from ACLs. Its results
+   * will be combined (ORed) with a default strategy that is responsible for
+   * handling backward compatibility. */
+  const acl_strategy_t extra_acl_strategy;
+
+  const AuthInfo info;
+  const rgw::auth::ImplicitTenants& implicit_tenant_context;
+  const rgw::auth::ImplicitTenants::implicit_tenant_flag_bits implicit_tenant_bit;
+
+  /* Create a local account for the remotely-authenticated user, placing it
+   * in an implicit (per-user) tenant when @implicit_tenant is set. */
+  virtual void create_account(const DoutPrefixProvider* dpp,
+                              const rgw_user& acct_user,
+                              bool implicit_tenant,
+                              RGWUserInfo& user_info) const; /* out */
+
+public:
+  RemoteApplier(CephContext* const cct,
+                rgw::sal::Driver* driver,
+                acl_strategy_t&& extra_acl_strategy,
+                const AuthInfo& info,
+                const rgw::auth::ImplicitTenants& implicit_tenant_context,
+                rgw::auth::ImplicitTenants::implicit_tenant_flag_bits implicit_tenant_bit)
+    : cct(cct),
+      driver(driver),
+      extra_acl_strategy(std::move(extra_acl_strategy)),
+      info(info),
+      implicit_tenant_context(implicit_tenant_context),
+      implicit_tenant_bit(implicit_tenant_bit) {
+  }
+
+  uint32_t get_perms_from_aclspec(const DoutPrefixProvider* dpp, const aclspec_t& aclspec) const override;
+  bool is_admin_of(const rgw_user& uid) const override;
+  bool is_owner_of(const rgw_user& uid) const override;
+  bool is_identity(const idset_t& ids) const override;
+
+  uint32_t get_perm_mask() const override { return info.perm_mask; }
+  void to_str(std::ostream& out) const override;
+  void load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const override; /* out */
+  void write_ops_log_entry(rgw_log_entry& entry) const override;
+  uint32_t get_identity_type() const override { return info.acct_type; }
+  std::string get_acct_name() const override { return info.acct_name; }
+  std::string get_subuser() const override { return {}; }
+
+  struct Factory {
+    virtual ~Factory() {}
+    /* Providing r-value reference here is required intentionally. Callee is
+     * thus disallowed to handle std::function in a way that could inhibit
+     * the move behaviour (like forgetting about std::moving a l-value). */
+    virtual aplptr_t create_apl_remote(CephContext* cct,
+                                       const req_state* s,
+                                       acl_strategy_t&& extra_acl_strategy,
+                                       const AuthInfo &info) const = 0;
+  };
+};
+
+
+/* rgw::auth::LocalApplier targets those auth engines that base on the data
+ * enclosed in the RGWUserInfo control structure. As a side effect of doing
+ * the authentication process, they must have it loaded. Leveraging this is
+ * a way to avoid unnecessary calls to underlying RADOS store. */
+class LocalApplier : public IdentityApplier {
+  using aclspec_t = rgw::auth::Identity::aclspec_t;
+
+protected:
+  const RGWUserInfo user_info;
+  const std::string subuser;
+  /* RGW_PERM_INVALID acts as a sentinel meaning "not overridden"; the mask
+   * is then derived from the subuser (see get_perm_mask() below). */
+  uint32_t perm_mask;
+  const std::string access_key_id;
+
+  /* Compute the permission mask of @subuser_name within @uinfo. */
+  uint32_t get_perm_mask(const std::string& subuser_name,
+                         const RGWUserInfo &uinfo) const;
+
+public:
+  /* Sentinels meaning "no subuser" / "no access key". */
+  static const std::string NO_SUBUSER;
+  static const std::string NO_ACCESS_KEY;
+
+  LocalApplier(CephContext* const cct,
+               const RGWUserInfo& user_info,
+               std::string subuser,
+               const std::optional<uint32_t>& perm_mask,
+               const std::string access_key_id)
+    : user_info(user_info),
+      subuser(std::move(subuser)),
+      perm_mask(perm_mask.value_or(RGW_PERM_INVALID)),
+      access_key_id(access_key_id) {
+  }
+
+
+  uint32_t get_perms_from_aclspec(const DoutPrefixProvider* dpp, const aclspec_t& aclspec) const override;
+  bool is_admin_of(const rgw_user& uid) const override;
+  bool is_owner_of(const rgw_user& uid) const override;
+  bool is_identity(const idset_t& ids) const override;
+  uint32_t get_perm_mask() const override {
+    if (this->perm_mask == RGW_PERM_INVALID) {
+      return get_perm_mask(subuser, user_info);
+    } else {
+      return this->perm_mask;
+    }
+  }
+  void to_str(std::ostream& out) const override;
+  void load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const override; /* out */
+  uint32_t get_identity_type() const override { return TYPE_RGW; }
+  std::string get_acct_name() const override { return {}; }
+  std::string get_subuser() const override { return subuser; }
+  void write_ops_log_entry(rgw_log_entry& entry) const override;
+
+  struct Factory {
+    virtual ~Factory() {}
+    virtual aplptr_t create_apl_local(CephContext* cct,
+                                      const req_state* s,
+                                      const RGWUserInfo& user_info,
+                                      const std::string& subuser,
+                                      const std::optional<uint32_t>& perm_mask,
+                                      const std::string& access_key_id) const = 0;
+  };
+};
+
+/* Applier for identities that assumed an IAM role (STS); carries the role
+ * definition plus the attributes embedded in the session token. */
+class RoleApplier : public IdentityApplier {
+public:
+  struct Role {
+    std::string id;
+    std::string name;
+    std::string tenant;
+    std::vector<std::string> role_policies;
+  };
+  /* Attributes carried inside the STS session token. */
+  struct TokenAttrs {
+    rgw_user user_id;
+    std::string token_policy;
+    std::string role_session_name;
+    std::vector<std::string> token_claims;
+    std::string token_issued_at;
+    std::vector<std::pair<std::string, std::string>> principal_tags;
+  };
+protected:
+  Role role;
+  TokenAttrs token_attrs;
+
+public:
+
+  RoleApplier(CephContext* const cct,
+              const Role& role,
+              const TokenAttrs& token_attrs)
+    : role(role),
+      token_attrs(token_attrs) {}
+
+  uint32_t get_perms_from_aclspec(const DoutPrefixProvider* dpp, const aclspec_t& aclspec) const override {
+    return 0;
+  }
+  bool is_admin_of(const rgw_user& uid) const override {
+    return false;
+  }
+  /* Owner iff @uid matches the token's user in id, tenant and namespace. */
+  bool is_owner_of(const rgw_user& uid) const override {
+    return (this->token_attrs.user_id.id == uid.id && this->token_attrs.user_id.tenant == uid.tenant && this->token_attrs.user_id.ns == uid.ns);
+  }
+  bool is_identity(const idset_t& ids) const override;
+  uint32_t get_perm_mask() const override {
+    return RGW_PERM_NONE;
+  }
+  void to_str(std::ostream& out) const override;
+  void load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const override; /* out */
+  uint32_t get_identity_type() const override { return TYPE_ROLE; }
+  std::string get_acct_name() const override { return {}; }
+  std::string get_subuser() const override { return {}; }
+  void modify_request_state(const DoutPrefixProvider* dpp, req_state* s) const override;
+  std::string get_role_tenant() const override { return role.tenant; }
+
+  struct Factory {
+    virtual ~Factory() {}
+    virtual aplptr_t create_apl_role( CephContext* cct,
+                                      const req_state* s,
+                                      const rgw::auth::RoleApplier::Role& role,
+                                      const rgw::auth::RoleApplier::TokenAttrs& token_attrs) const = 0;
+  };
+};
+
+/* The anonymous abstract engine. */
+class AnonymousEngine : public Engine {
+  CephContext* const cct;
+  const rgw::auth::LocalApplier::Factory* const apl_factory;
+
+public:
+  AnonymousEngine(CephContext* const cct,
+                  const rgw::auth::LocalApplier::Factory* const apl_factory)
+    : cct(cct),
+      apl_factory(apl_factory) {
+  }
+
+  const char* get_name() const noexcept override {
+    return "rgw::auth::AnonymousEngine";
+  }
+
+  Engine::result_t authenticate(const DoutPrefixProvider* dpp, const req_state* s, optional_yield y) const override final;
+
+protected:
+  /* Subclasses may restrict which requests are eligible for anonymous
+   * access; the default accepts every request. */
+  virtual bool is_applicable(const req_state*) const noexcept {
+    return true;
+  }
+};
+
+} /* namespace auth */
+} /* namespace rgw */
+
+
+uint32_t rgw_perms_from_aclspec_default_strategy(
+ const rgw_user& uid,
+ const rgw::auth::Identity::aclspec_t& aclspec,
+ const DoutPrefixProvider *dpp);
diff --git a/src/rgw/rgw_auth_filters.h b/src/rgw/rgw_auth_filters.h
new file mode 100644
index 000000000..9e3818bef
--- /dev/null
+++ b/src/rgw/rgw_auth_filters.h
@@ -0,0 +1,302 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <type_traits>
+
+#include <boost/logic/tribool.hpp>
+#include <boost/optional.hpp>
+
+#include "rgw_service.h"
+#include "rgw_common.h"
+#include "rgw_auth.h"
+#include "rgw_user.h"
+
+namespace rgw {
+namespace auth {
+
+/* Abstract decorator over any implementation of rgw::auth::IdentityApplier
+ * which could be provided both as a pointer-to-object or the object itself. */
+template <typename DecorateeT>
+class DecoratedApplier : public rgw::auth::IdentityApplier {
+  typedef typename std::remove_pointer<DecorateeT>::type DerefedDecorateeT;
+
+  static_assert(std::is_base_of<rgw::auth::IdentityApplier,
+                                DerefedDecorateeT>::value,
+                "DecorateeT must be a subclass of rgw::auth::IdentityApplier");
+
+  DecorateeT decoratee;
+
+  /* There is an indirection layer over accessing decoratee to share the same
+   * code base between dynamic and static decorators. The difference is about
+   * what we store internally: pointer to a decorated object versus the whole
+   * object itself. Googling for "SFINAE" can help to understand the code. */
+  template <typename T = void,
+            typename std::enable_if<
+              std::is_pointer<DecorateeT>::value, T>::type* = nullptr>
+  DerefedDecorateeT& get_decoratee() {
+    return *decoratee;
+  }
+
+  template <typename T = void,
+            typename std::enable_if<
+              ! std::is_pointer<DecorateeT>::value, T>::type* = nullptr>
+  DerefedDecorateeT& get_decoratee() {
+    return decoratee;
+  }
+
+  template <typename T = void,
+            typename std::enable_if<
+              std::is_pointer<DecorateeT>::value, T>::type* = nullptr>
+  const DerefedDecorateeT& get_decoratee() const {
+    return *decoratee;
+  }
+
+  template <typename T = void,
+            typename std::enable_if<
+              ! std::is_pointer<DecorateeT>::value, T>::type* = nullptr>
+  const DerefedDecorateeT& get_decoratee() const {
+    return decoratee;
+  }
+
+public:
+  explicit DecoratedApplier(DecorateeT&& decoratee)
+    : decoratee(std::forward<DecorateeT>(decoratee)) {
+  }
+
+  /* Every operation below simply forwards to the decoratee; concrete
+   * decorators override only the members they want to change. */
+  uint32_t get_perms_from_aclspec(const DoutPrefixProvider* dpp, const aclspec_t& aclspec) const override {
+    return get_decoratee().get_perms_from_aclspec(dpp, aclspec);
+  }
+
+  bool is_admin_of(const rgw_user& uid) const override {
+    return get_decoratee().is_admin_of(uid);
+  }
+
+  bool is_owner_of(const rgw_user& uid) const override {
+    return get_decoratee().is_owner_of(uid);
+  }
+
+  bool is_anonymous() const override {
+    return get_decoratee().is_anonymous();
+  }
+
+  uint32_t get_perm_mask() const override {
+    return get_decoratee().get_perm_mask();
+  }
+
+  uint32_t get_identity_type() const override {
+    return get_decoratee().get_identity_type();
+  }
+
+  std::string get_acct_name() const override {
+    return get_decoratee().get_acct_name();
+  }
+
+  std::string get_subuser() const override {
+    return get_decoratee().get_subuser();
+  }
+
+  bool is_identity(
+    const boost::container::flat_set<Principal>& ids) const override {
+    return get_decoratee().is_identity(ids);
+  }
+
+  void to_str(std::ostream& out) const override {
+    get_decoratee().to_str(out);
+  }
+
+  std::string get_role_tenant() const override { /* in/out */
+    return get_decoratee().get_role_tenant();
+  }
+
+  void load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const override { /* out */
+    return get_decoratee().load_acct_info(dpp, user_info);
+  }
+
+  void modify_request_state(const DoutPrefixProvider* dpp, req_state * s) const override { /* in/out */
+    return get_decoratee().modify_request_state(dpp, s);
+  }
+
+  void write_ops_log_entry(rgw_log_entry& entry) const override {
+    return get_decoratee().write_ops_log_entry(entry);
+  }
+};
+
+
+/* Decorator allowing a request to operate on an account other than the one
+ * owned by the authenticated identity (cross-tenant access); the override
+ * is validated in load_acct_info below. */
+template <typename T>
+class ThirdPartyAccountApplier : public DecoratedApplier<T> {
+  rgw::sal::Driver* driver;
+  const rgw_user acct_user_override;
+
+public:
+  /* A value representing situations where there is no requested account
+   * override. In other words, acct_user_override will be equal to this
+   * constant where the request isn't a cross-tenant one. */
+  static const rgw_user UNKNOWN_ACCT;
+
+  template <typename U>
+  ThirdPartyAccountApplier(rgw::sal::Driver* driver,
+                           const rgw_user &acct_user_override,
+                           U&& decoratee)
+    : DecoratedApplier<T>(std::move(decoratee)),
+      driver(driver),
+      acct_user_override(acct_user_override) {
+  }
+
+  void to_str(std::ostream& out) const override;
+  void load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const override; /* out */
+};
+
+/* static declaration: UNKNOWN_ACCT will be an empty rgw_user that is a result
+ * of the default construction. */
+template <typename T>
+const rgw_user ThirdPartyAccountApplier<T>::UNKNOWN_ACCT;
+
+template <typename T>
+void ThirdPartyAccountApplier<T>::to_str(std::ostream& out) const
+{
+  /* Describe this decorator, then chain into the decorated applier. */
+  out << "rgw::auth::ThirdPartyAccountApplier(" + acct_user_override.to_str() + ")"
+      << " -> ";
+  DecoratedApplier<T>::to_str(out);
+}
+
/* Resolve the account the request should act upon.
 * dpp       - logging prefix provider for nested calls.
 * user_info - [out] filled with the resolved account's info.
 * Throws a negative errno (int) when the override account can't be loaded. */
template <typename T>
void ThirdPartyAccountApplier<T>::load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const
{
  if (UNKNOWN_ACCT == acct_user_override) {
    /* There is no override specified by the upper layer. This means that we'll
     * load the account owned by the authenticated identity (aka auth_user). */
    DecoratedApplier<T>::load_acct_info(dpp, user_info);
  } else if (DecoratedApplier<T>::is_owner_of(acct_user_override)) {
    /* The override has been specified but the account belongs to the authenticated
     * identity. We may safely forward the call to a next stage. */
    DecoratedApplier<T>::load_acct_info(dpp, user_info);
  } else if (this->is_anonymous()) {
    /* If the user was authed by the anonymous engine then scope the ANON user
     * to the correct tenant */
    if (acct_user_override.tenant.empty())
      user_info.user_id = rgw_user(acct_user_override.id, RGW_USER_ANON_ID);
    else
      user_info.user_id = rgw_user(acct_user_override.tenant, RGW_USER_ANON_ID);
  } else {
    /* Compatibility mechanism for multi-tenancy. For more details refer to
     * load_acct_info method of rgw::auth::RemoteApplier. */
    std::unique_ptr<rgw::sal::User> user;

    if (acct_user_override.tenant.empty()) {
      /* Tenant-less override: first try the convention where the tenant name
       * equals the user id. */
      const rgw_user tenanted_uid(acct_user_override.id, acct_user_override.id);
      user = driver->get_user(tenanted_uid);

      if (user->load_user(dpp, null_yield) >= 0) {
        user_info = user->get_info();
        /* Succeeded. */
        return;
      }
    }

    /* Fall back to looking the override up verbatim. */
    user = driver->get_user(acct_user_override);
    const int ret = user->load_user(dpp, null_yield);
    if (ret < 0) {
      /* We aren't trying to recover from ENOENT here. It's supposed that creating
       * someone else's account isn't a thing we want to support in this filter. */
      if (ret == -ENOENT) {
        throw -EACCES;
      } else {
        throw ret;
      }
    }
    user_info = user->get_info();
  }
}
+
+template <typename T> static inline
+ThirdPartyAccountApplier<T> add_3rdparty(rgw::sal::Driver* driver,
+ const rgw_user &acct_user_override,
+ T&& t) {
+ return ThirdPartyAccountApplier<T>(driver, acct_user_override,
+ std::forward<T>(t));
+}
+
+
/* Decorator handling "system requests": when the loaded account carries the
 * system flag, an effective uid may be substituted via the
 * RGW_SYS_PARAM_PREFIX "uid" query argument, and the request gets flagged
 * as a system request in modify_request_state(). */
template <typename T>
class SysReqApplier : public DecoratedApplier<T> {
  CephContext* const cct;
  rgw::sal::Driver* driver;
  const RGWHTTPArgs& args;
  mutable boost::tribool is_system;  // lazily resolved by load_acct_info()

public:
  template <typename U>
  SysReqApplier(CephContext* const cct,
                rgw::sal::Driver* driver,
                const req_state* const s,
                U&& decoratee)
    : DecoratedApplier<T>(std::forward<T>(decoratee)),
      /* NOTE(review): forwards with std::forward<T> although the parameter is
       * declared U&& — fine while add_sysreq() instantiates with U == T;
       * re-check if other call sites appear. */
      cct(cct),
      driver(driver),
      args(s->info.args),
      is_system(boost::logic::indeterminate) {
  }

  void to_str(std::ostream& out) const override;
  void load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const override; /* out */
  void modify_request_state(const DoutPrefixProvider* dpp, req_state* s) const override; /* in/out */
};
+
+template <typename T>
+void SysReqApplier<T>::to_str(std::ostream& out) const
+{
+ out << "rgw::auth::SysReqApplier" << " -> ";
+ DecoratedApplier<T>::to_str(out);
+}
+
/* Load the account info; for system accounts, optionally substitute the
 * effective user named in the "uid" system query parameter.
 * Throws -EACCES if the effective user cannot be loaded. */
template <typename T>
void SysReqApplier<T>::load_acct_info(const DoutPrefixProvider* dpp, RGWUserInfo& user_info) const
{
  DecoratedApplier<T>::load_acct_info(dpp, user_info);
  is_system = user_info.system;  // cache the flag for modify_request_state()

  if (is_system) {
    //ldpp_dout(dpp, 20) << "system request" << dendl;

    rgw_user effective_uid(args.sys_get(RGW_SYS_PARAM_PREFIX "uid"));
    if (! effective_uid.empty()) {
      /* We aren't writing directly to user_info for consistency and security
       * reasons. rgw_get_user_info_by_uid doesn't trigger the operator=() but
       * calls ::decode instead. */
      std::unique_ptr<rgw::sal::User> user = driver->get_user(effective_uid);
      if (user->load_user(dpp, null_yield) < 0) {
        //ldpp_dout(dpp, 0) << "User lookup failed!" << dendl;
        throw -EACCES;
      }
      user_info = user->get_info();
    }
  }
}
+
/* Flag the request as a system request when the account is a system one.
 * is_system may still be indeterminate if load_acct_info() hasn't run yet;
 * in that case resolve it first using a throw-away RGWUserInfo. */
template <typename T>
void SysReqApplier<T>::modify_request_state(const DoutPrefixProvider* dpp, req_state* const s) const
{
  if (boost::logic::indeterminate(is_system)) {
    RGWUserInfo unused_info;
    load_acct_info(dpp, unused_info);
  }

  if (is_system) {
    s->info.args.set_system();
    s->system_request = true;
  }
  DecoratedApplier<T>::modify_request_state(dpp, s);
}
+
+template <typename T> static inline
+SysReqApplier<T> add_sysreq(CephContext* const cct,
+ rgw::sal::Driver* driver,
+ const req_state* const s,
+ T&& t) {
+ return SysReqApplier<T>(cct, driver, s, std::forward<T>(t));
+}
+
+} /* namespace auth */
+} /* namespace rgw */
diff --git a/src/rgw/rgw_auth_keystone.cc b/src/rgw/rgw_auth_keystone.cc
new file mode 100644
index 000000000..81588d50c
--- /dev/null
+++ b/src/rgw/rgw_auth_keystone.cc
@@ -0,0 +1,767 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <string>
+#include <vector>
+
+#include <errno.h>
+#include <fnmatch.h>
+
+#include "rgw_b64.h"
+
+#include "common/errno.h"
+#include "common/ceph_json.h"
+#include "include/types.h"
+#include "include/str_list.h"
+
+#include "rgw_common.h"
+#include "rgw_keystone.h"
+#include "rgw_auth_keystone.h"
+#include "rgw_rest_s3.h"
+#include "rgw_auth_s3.h"
+
+#include "common/ceph_crypto.h"
+#include "common/Cond.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+namespace rgw {
+namespace auth {
+namespace keystone {
+
+bool
+TokenEngine::is_applicable(const std::string& token) const noexcept
+{
+ return ! token.empty() && ! cct->_conf->rgw_keystone_url.empty();
+}
+
/* Validate `token` against Keystone and return the parsed envelope.
 * Returns boost::none when Keystone explicitly rejects it (401/404);
 * throws a negative errno on configuration, transport or parse errors. */
boost::optional<TokenEngine::token_envelope_t>
TokenEngine::get_from_keystone(const DoutPrefixProvider* dpp, const std::string& token, bool allow_expired) const
{
  /* Unfortunately, we can't use the short form of "using" here. It's because
   * we're aliasing a class' member, not namespace. */
  using RGWValidateKeystoneToken = \
    rgw::keystone::Service::RGWValidateKeystoneToken;

  /* The container for plain response obtained from Keystone. It will be
   * parsed token_envelope_t::parse method. */
  ceph::bufferlist token_body_bl;
  RGWValidateKeystoneToken validate(cct, "GET", "", &token_body_bl);

  std::string url = config.get_endpoint_url();
  if (url.empty()) {
    throw -EINVAL;
  }

  const auto keystone_version = config.get_api_version();
  if (keystone_version == rgw::keystone::ApiVersion::VER_2) {
    url.append("v2.0/tokens/" + token);
  } else if (keystone_version == rgw::keystone::ApiVersion::VER_3) {
    url.append("v3/auth/tokens");

    if (allow_expired) {
      /* v3 only: ask Keystone to validate even an already-expired token. */
      url.append("?allow_expired=1");
    }

    validate.append_header("X-Subject-Token", token);
  }

  std::string admin_token;
  if (rgw::keystone::Service::get_admin_token(dpp, cct, token_cache, config,
                                              admin_token) < 0) {
    throw -EINVAL;
  }

  validate.append_header("X-Auth-Token", admin_token);
  validate.set_send_length(0);

  validate.set_url(url);

  int ret = validate.process(null_yield);

  /* NULL terminate for debug output. */
  token_body_bl.append(static_cast<char>(0));

  /* Detect Keystone rejection earlier than during the token parsing.
   * Although failure at the parsing phase doesn't impose a threat,
   * this allows to return proper error code (EACCES instead of EINVAL
   * or similar) and thus improves logging. */
  if (validate.get_http_status() ==
      /* Most likely: wrong admin credentials or admin token. */
      RGWValidateKeystoneToken::HTTP_STATUS_UNAUTHORIZED ||
      validate.get_http_status() ==
      /* Most likely: non-existent token supplied by the client. */
      RGWValidateKeystoneToken::HTTP_STATUS_NOTFOUND) {
    ldpp_dout(dpp, 5) << "Failed keystone auth from " << url << " with "
                      << validate.get_http_status() << dendl;
    return boost::none;
  }
  // throw any other http or connection errors
  if (ret < 0) {
    throw ret;
  }

  ldpp_dout(dpp, 20) << "received response status=" << validate.get_http_status()
                     << ", body=" << token_body_bl.c_str() << dendl;

  TokenEngine::token_envelope_t token_body;
  ret = token_body.parse(dpp, cct, token, token_body_bl, config.get_api_version());
  if (ret < 0) {
    throw ret;
  }

  return token_body;
}
+
+TokenEngine::auth_info_t
+TokenEngine::get_creds_info(const TokenEngine::token_envelope_t& token,
+ const std::vector<std::string>& admin_roles
+ ) const noexcept
+{
+ using acct_privilege_t = rgw::auth::RemoteApplier::AuthInfo::acct_privilege_t;
+
+ /* Check whether the user has an admin status. */
+ acct_privilege_t level = acct_privilege_t::IS_PLAIN_ACCT;
+ for (const auto& admin_role : admin_roles) {
+ if (token.has_role(admin_role)) {
+ level = acct_privilege_t::IS_ADMIN_ACCT;
+ break;
+ }
+ }
+
+ return auth_info_t {
+ /* Suggested account name for the authenticated user. */
+ rgw_user(token.get_project_id()),
+ /* User's display name (aka real name). */
+ token.get_project_name(),
+ /* Keystone doesn't support RGW's subuser concept, so we cannot cut down
+ * the access rights through the perm_mask. At least at this layer. */
+ RGW_PERM_FULL_CONTROL,
+ level,
+ rgw::auth::RemoteApplier::AuthInfo::NO_ACCESS_KEY,
+ rgw::auth::RemoteApplier::AuthInfo::NO_SUBUSER,
+ TYPE_KEYSTONE
+};
+}
+
/* Join a tenant and an id into the "tenant:id" form used in ACL specs.
 * Note: the return type used to be const-qualified, which pessimizes
 * callers by inhibiting move construction (readability-const-return-type). */
static inline std::string
make_spec_item(const std::string& tenant, const std::string& id)
{
  return tenant + ":" + id;
}
+
/* Build the ACL-matching strategy for a validated token: the returned
 * callable ORs together the permission bits of every aclspec entry that
 * matches any "tenant:user" combination of this token's uuids/names,
 * including the '*' wildcard forms. */
TokenEngine::acl_strategy_t
TokenEngine::get_acl_strategy(const TokenEngine::token_envelope_t& token) const
{
  /* The primary identity is constructed upon UUIDs. */
  const auto& tenant_uuid = token.get_project_id();
  const auto& user_uuid = token.get_user_id();

  /* For Keystone v2 an alias may be also used. */
  const auto& tenant_name = token.get_project_name();
  const auto& user_name = token.get_user_name();

  /* Construct all possible combinations including Swift's wildcards. */
  const std::array<std::string, 6> allowed_items = {
    make_spec_item(tenant_uuid, user_uuid),
    make_spec_item(tenant_name, user_name),

    /* Wildcards. */
    make_spec_item(tenant_uuid, "*"),
    make_spec_item(tenant_name, "*"),
    make_spec_item("*", user_uuid),
    make_spec_item("*", user_name),
  };

  /* Lambda will obtain a copy of (not a reference to!) allowed_items. */
  return [allowed_items](const rgw::auth::Identity::aclspec_t& aclspec) {
    uint32_t perm = 0;

    /* Accumulate the permission bits of every matching spec entry. */
    for (const auto& allowed_item : allowed_items) {
      const auto iter = aclspec.find(allowed_item);

      if (std::end(aclspec) != iter) {
        perm |= iter->second;
      }
    }

    return perm;
  };
}
+
/* Authenticate a Keystone token, optionally accompanied by a service token.
 * Flow: consult the token cache; otherwise validate the service token (from
 * cache or Keystone) to decide whether an expired subject token may be
 * accepted; then validate the subject token with Keystone, check expiry and
 * required roles, cache it, and grant a remote applier.
 * Returns grant/deny; never throws to the caller on plain rejection. */
TokenEngine::result_t
TokenEngine::authenticate(const DoutPrefixProvider* dpp,
                          const std::string& token,
                          const std::string& service_token,
                          const req_state* const s) const
{
  bool allow_expired = false;
  boost::optional<TokenEngine::token_envelope_t> t;

  /* This will be initialized on the first call to this method. In C++11 it's
   * also thread-safe. */
  static const struct RolesCacher {
    explicit RolesCacher(CephContext* const cct) {
      get_str_vec(cct->_conf->rgw_keystone_accepted_roles, plain);
      get_str_vec(cct->_conf->rgw_keystone_accepted_admin_roles, admin);

      /* Let's suppose that having an admin role implies also a regular one. */
      plain.insert(std::end(plain), std::begin(admin), std::end(admin));
    }

    std::vector<std::string> plain;
    std::vector<std::string> admin;
  } roles(cct);

  static const struct ServiceTokenRolesCacher {
    explicit ServiceTokenRolesCacher(CephContext* const cct) {
      get_str_vec(cct->_conf->rgw_keystone_service_token_accepted_roles, plain);
    }

    std::vector<std::string> plain;
  } service_token_roles(cct);

  if (! is_applicable(token)) {
    return result_t::deny();
  }

  /* Token ID is a legacy of supporting the service-side validation
   * of PKI/PKIz token type which are already-removed-in-OpenStack.
   * The idea was to bury in cache only a short hash instead of few
   * kilobytes. RadosGW doesn't do the local validation anymore. */
  const auto& token_id = rgw_get_token_id(token);
  ldpp_dout(dpp, 20) << "token_id=" << token_id << dendl;

  /* Check cache first. */
  t = token_cache.find(token_id);
  if (t) {
    ldpp_dout(dpp, 20) << "cached token.project.id=" << t->get_project_id()
                       << dendl;
    auto apl = apl_factory->create_apl_remote(cct, s, get_acl_strategy(*t),
                                              get_creds_info(*t, roles.admin));
    return result_t::grant(std::move(apl));
  }

  /* We have a service token and a token so we verify the service
   * token and if it's invalid the request is invalid. If it's valid
   * we allow an expired token to be used when doing lookup in Keystone.
   * We never get to this if the token is in the cache. */
  if (g_conf()->rgw_keystone_service_token_enabled && ! service_token.empty()) {
    boost::optional<TokenEngine::token_envelope_t> st;

    const auto& service_token_id = rgw_get_token_id(service_token);
    ldpp_dout(dpp, 20) << "service_token_id=" << service_token_id << dendl;

    /* Check cache for service token first. */
    st = token_cache.find_service(service_token_id);
    if (st) {
      ldpp_dout(dpp, 20) << "cached service_token.project.id=" << st->get_project_id()
                         << dendl;

      /* We found the service token in the cache so we allow using an expired
       * token for this request. */
      allow_expired = true;
      ldpp_dout(dpp, 20) << "allowing expired tokens because service_token_id="
                         << service_token_id
                         << " was found in cache" << dendl;
    } else {
      /* Service token was not found in cache. Go to Keystone for validating
       * the token. The allow_expired here must always be false. */
      ceph_assert(allow_expired == false);
      st = get_from_keystone(dpp, service_token, allow_expired);

      if (! st) {
        return result_t::deny(-EACCES);
      }

      /* Verify expiration of service token. */
      if (st->expired()) {
        ldpp_dout(dpp, 0) << "got expired service token: " << st->get_project_name()
                          << ":" << st->get_user_name()
                          << " expired " << st->get_expires() << dendl;
        return result_t::deny(-EPERM);
      }

      /* Check for necessary roles for service token. */
      for (const auto& role : service_token_roles.plain) {
        if (st->has_role(role) == true) {
          /* Service token is valid so we allow using an expired token for
           * this request. */
          ldpp_dout(dpp, 20) << "allowing expired tokens because service_token_id="
                             << service_token_id
                             << " is valid, role: "
                             << role << dendl;
          allow_expired = true;
          token_cache.add_service(service_token_id, *st);
          break;
        }
      }

      if (!allow_expired) {
        ldpp_dout(dpp, 0) << "service token user does not hold a matching role; required roles: "
                          << g_conf()->rgw_keystone_service_token_accepted_roles << dendl;
        return result_t::deny(-EPERM);
      }
    }
  }

  /* Token not in cache. Go to the Keystone for validation. This happens even
   * for the legacy PKI/PKIz token types. That's it, after the PKI/PKIz
   * RadosGW-side validation has been removed, we always ask Keystone. */
  t = get_from_keystone(dpp, token, allow_expired);

  if (! t) {
    return result_t::deny(-EACCES);
  }

  /* Verify expiration. An expired token is tolerated only when backed by a
   * valid service token (allow_expired set above). */
  if (t->expired()) {
    if (allow_expired) {
      ldpp_dout(dpp, 20) << "allowing expired token: " << t->get_project_name()
                         << ":" << t->get_user_name()
                         << " expired: " << t->get_expires()
                         << " because of valid service token" << dendl;
    } else {
      ldpp_dout(dpp, 0) << "got expired token: " << t->get_project_name()
                        << ":" << t->get_user_name()
                        << " expired: " << t->get_expires() << dendl;
      return result_t::deny(-EPERM);
    }
  }

  /* Check for necessary roles. */
  for (const auto& role : roles.plain) {
    if (t->has_role(role) == true) {
      /* If this token was an allowed expired token because we got a
       * service token we need to update the expiration before we cache it. */
      if (allow_expired) {
        time_t now = ceph_clock_now().sec();
        time_t new_expires = now + g_conf()->rgw_keystone_expired_token_cache_expiration;
        ldpp_dout(dpp, 20) << "updating expiration of allowed expired token"
                           << " from old " << t->get_expires() << " to now " << now << " + "
                           << g_conf()->rgw_keystone_expired_token_cache_expiration
                           << " secs = "
                           << new_expires << dendl;
        t->set_expires(new_expires);
      }
      ldpp_dout(dpp, 0) << "validated token: " << t->get_project_name()
                        << ":" << t->get_user_name()
                        << " expires: " << t->get_expires() << dendl;
      token_cache.add(token_id, *t);
      auto apl = apl_factory->create_apl_remote(cct, s, get_acl_strategy(*t),
                                                get_creds_info(*t, roles.admin));
      return result_t::grant(std::move(apl));
    }
  }

  ldpp_dout(dpp, 0) << "user does not hold a matching role; required roles: "
                    << g_conf()->rgw_keystone_accepted_roles << dendl;

  return result_t::deny(-EPERM);
}
+
+
+/*
+ * Try to validate S3 auth against keystone s3token interface
+ */
+std::pair<boost::optional<rgw::keystone::TokenEnvelope>, int>
+EC2Engine::get_from_keystone(const DoutPrefixProvider* dpp, const std::string_view& access_key_id,
+ const std::string& string_to_sign,
+ const std::string_view& signature) const
+{
+ /* prepare keystone url */
+ std::string keystone_url = config.get_endpoint_url();
+ if (keystone_url.empty()) {
+ throw -EINVAL;
+ }
+
+ const auto api_version = config.get_api_version();
+ if (api_version == rgw::keystone::ApiVersion::VER_3) {
+ keystone_url.append("v3/s3tokens");
+ } else {
+ keystone_url.append("v2.0/s3tokens");
+ }
+
+ /* get authentication token for Keystone. */
+ std::string admin_token;
+ int ret = rgw::keystone::Service::get_admin_token(dpp, cct, token_cache, config,
+ admin_token);
+ if (ret < 0) {
+ ldpp_dout(dpp, 2) << "s3 keystone: cannot get token for keystone access"
+ << dendl;
+ throw ret;
+ }
+
+ using RGWValidateKeystoneToken
+ = rgw::keystone::Service::RGWValidateKeystoneToken;
+
+ /* The container for plain response obtained from Keystone. It will be
+ * parsed token_envelope_t::parse method. */
+ ceph::bufferlist token_body_bl;
+ RGWValidateKeystoneToken validate(cct, "POST", keystone_url, &token_body_bl);
+
+ /* set required headers for keystone request */
+ validate.append_header("X-Auth-Token", admin_token);
+ validate.append_header("Content-Type", "application/json");
+
+ /* check if we want to verify keystone's ssl certs */
+ validate.set_verify_ssl(cct->_conf->rgw_keystone_verify_ssl);
+
+ /* create json credentials request body */
+ JSONFormatter credentials(false);
+ credentials.open_object_section("");
+ credentials.open_object_section("credentials");
+ credentials.dump_string("access", sview2cstr(access_key_id).data());
+ credentials.dump_string("token", rgw::to_base64(string_to_sign));
+ credentials.dump_string("signature", sview2cstr(signature).data());
+ credentials.close_section();
+ credentials.close_section();
+
+ std::stringstream os;
+ credentials.flush(os);
+ validate.set_post_data(os.str());
+ validate.set_send_length(os.str().length());
+
+ /* send request */
+ ret = validate.process(null_yield);
+
+ /* if the supplied signature is wrong, we will get 401 from Keystone */
+ if (validate.get_http_status() ==
+ decltype(validate)::HTTP_STATUS_UNAUTHORIZED) {
+ return std::make_pair(boost::none, -ERR_SIGNATURE_NO_MATCH);
+ } else if (validate.get_http_status() ==
+ decltype(validate)::HTTP_STATUS_NOTFOUND) {
+ return std::make_pair(boost::none, -ERR_INVALID_ACCESS_KEY);
+ }
+ // throw any other http or connection errors
+ if (ret < 0) {
+ ldpp_dout(dpp, 2) << "s3 keystone: token validation ERROR: "
+ << token_body_bl.c_str() << dendl;
+ throw ret;
+ }
+
+ /* now parse response */
+ rgw::keystone::TokenEnvelope token_envelope;
+ ret = token_envelope.parse(dpp, cct, std::string(), token_body_bl, api_version);
+ if (ret < 0) {
+ ldpp_dout(dpp, 2) << "s3 keystone: token parsing failed, ret=0" << ret
+ << dendl;
+ throw ret;
+ }
+
+ return std::make_pair(std::move(token_envelope), 0);
+}
+
/* Fetch the EC2 secret for (user_id, access_key_id) from Keystone via
 * GET /users/{USER_ID}/credentials/OS-EC2/{ACCESS_KEY_ID}. The response is
 * JSON whose "credential" object contains the "secret" entry.
 * Returns {secret, 0} on success or {none, negative errno} on failure —
 * unlike get_from_keystone(), this method reports errors via the return
 * value rather than throwing. */
std::pair<boost::optional<std::string>, int> EC2Engine::get_secret_from_keystone(const DoutPrefixProvider* dpp,
                                                                                 const std::string& user_id,
                                                                                 const std::string_view& access_key_id) const
{
  /* Fetch from /users/{USER_ID}/credentials/OS-EC2/{ACCESS_KEY_ID} */
  /* Should return json with response key "credential" which contains entry "secret"*/

  /* prepare keystone url */
  std::string keystone_url = config.get_endpoint_url();
  if (keystone_url.empty()) {
    return make_pair(boost::none, -EINVAL);
  }

  const auto api_version = config.get_api_version();
  if (api_version == rgw::keystone::ApiVersion::VER_3) {
    keystone_url.append("v3/");
  } else {
    keystone_url.append("v2.0/");
  }
  keystone_url.append("users/");
  keystone_url.append(user_id);
  keystone_url.append("/credentials/OS-EC2/");
  keystone_url.append(std::string(access_key_id));

  /* get authentication token for Keystone. */
  std::string admin_token;
  int ret = rgw::keystone::Service::get_admin_token(dpp, cct, token_cache, config,
                                                    admin_token);
  if (ret < 0) {
    ldpp_dout(dpp, 2) << "s3 keystone: cannot get token for keystone access"
                      << dendl;
    return make_pair(boost::none, ret);
  }

  using RGWGetAccessSecret
    = rgw::keystone::Service::RGWKeystoneHTTPTransceiver;

  /* The container for plain response obtained from Keystone.*/
  ceph::bufferlist token_body_bl;
  RGWGetAccessSecret secret(cct, "GET", keystone_url, &token_body_bl);

  /* set required headers for keystone request */
  secret.append_header("X-Auth-Token", admin_token);

  /* check if we want to verify keystone's ssl certs */
  secret.set_verify_ssl(cct->_conf->rgw_keystone_verify_ssl);

  /* send request */
  ret = secret.process(null_yield);

  /* if the supplied access key isn't found, we will get 404 from Keystone */
  if (secret.get_http_status() ==
          decltype(secret)::HTTP_STATUS_NOTFOUND) {
    return make_pair(boost::none, -ERR_INVALID_ACCESS_KEY);
  }
  // return any other http or connection errors
  if (ret < 0) {
    ldpp_dout(dpp, 2) << "s3 keystone: secret fetching error: "
                      << token_body_bl.c_str() << dendl;
    return make_pair(boost::none, ret);
  }

  /* now parse response */

  JSONParser parser;
  if (! parser.parse(token_body_bl.c_str(), token_body_bl.length())) {
    ldpp_dout(dpp, 0) << "Keystone credential parse error: malformed json" << dendl;
    return make_pair(boost::none, -EINVAL);
  }

  JSONObjIter credential_iter = parser.find_first("credential");
  std::string secret_string;

  try {
    if (!credential_iter.end()) {
      /* "secret" is mandatory here (last arg true) — absence throws. */
      JSONDecoder::decode_json("secret", secret_string, *credential_iter, true);
    } else {
      ldpp_dout(dpp, 0) << "Keystone credential not present in return from server" << dendl;
      return make_pair(boost::none, -EINVAL);
    }
  } catch (const JSONDecoder::err& err) {
    ldpp_dout(dpp, 0) << "Keystone credential parse error: " << err.what() << dendl;
    return make_pair(boost::none, -EINVAL);
  }

  return make_pair(secret_string, 0);
}
+
+/*
+ * Try to get a token for S3 authentication, using a secret cache if available
+ */
/* Obtain a (token, secret) pair for S3 authentication, consulting the local
 * secret cache first and falling back to Keystone on a miss. A cache hit is
 * only accepted when the cached secret actually reproduces the client's
 * signature; on fallback, a freshly fetched pair is inserted into the cache.
 * failure_reason carries the error of the last Keystone call on failure. */
auto EC2Engine::get_access_token(const DoutPrefixProvider* dpp,
                                 const std::string_view& access_key_id,
                                 const std::string& string_to_sign,
                                 const std::string_view& signature,
                                 const signature_factory_t& signature_factory) const
  -> access_token_result
{
  using server_signature_t = VersionAbstractor::server_signature_t;
  boost::optional<rgw::keystone::TokenEnvelope> token;
  boost::optional<std::string> secret;
  int failure_reason;

  /* Get a token from the cache if one has already been stored */
  boost::optional<boost::tuple<rgw::keystone::TokenEnvelope, std::string>>
    t = secret_cache.find(std::string(access_key_id));

  /* Check that credentials can correctly be used to sign data */
  if (t) {
    /* Re-sign the payload with the cached secret and compare. */
    std::string sig(signature);
    server_signature_t server_signature = signature_factory(cct, t->get<1>(), string_to_sign);
    if (sig.compare(server_signature) == 0) {
      return {t->get<0>(), t->get<1>(), 0};
    } else {
      ldpp_dout(dpp, 0) << "Secret string does not correctly sign payload, cache miss" << dendl;
    }
  } else {
    ldpp_dout(dpp, 0) << "No stored secret string, cache miss" << dendl;
  }

  /* No cached token, token expired, or secret invalid: fall back to keystone */
  std::tie(token, failure_reason) = get_from_keystone(dpp, access_key_id, string_to_sign, signature);

  if (token) {
    /* Fetch secret from keystone for the access_key_id */
    std::tie(secret, failure_reason) =
      get_secret_from_keystone(dpp, token->get_user_id(), access_key_id);

    if (secret) {
      /* Add token, secret pair to cache, and set timeout */
      secret_cache.add(std::string(access_key_id), *token, *secret);
    }
  }

  return {token, secret, failure_reason};
}
+
/* No extra ACL strategy for EC2-authenticated identities — returning an
 * empty strategy defers to the default aclspec matching downstream. */
EC2Engine::acl_strategy_t
EC2Engine::get_acl_strategy(const EC2Engine::token_envelope_t&) const
{
  /* This is based on the assumption that the default acl strategy in
   * get_perms_from_aclspec, will take care. Extra acl spec is not required. */
  return nullptr;
}
+
+EC2Engine::auth_info_t
+EC2Engine::get_creds_info(const EC2Engine::token_envelope_t& token,
+ const std::vector<std::string>& admin_roles,
+ const std::string& access_key_id
+ ) const noexcept
+{
+ using acct_privilege_t = \
+ rgw::auth::RemoteApplier::AuthInfo::acct_privilege_t;
+
+ /* Check whether the user has an admin status. */
+ acct_privilege_t level = acct_privilege_t::IS_PLAIN_ACCT;
+ for (const auto& admin_role : admin_roles) {
+ if (token.has_role(admin_role)) {
+ level = acct_privilege_t::IS_ADMIN_ACCT;
+ break;
+ }
+ }
+
+ return auth_info_t {
+ /* Suggested account name for the authenticated user. */
+ rgw_user(token.get_project_id()),
+ /* User's display name (aka real name). */
+ token.get_project_name(),
+ /* Keystone doesn't support RGW's subuser concept, so we cannot cut down
+ * the access rights through the perm_mask. At least at this layer. */
+ RGW_PERM_FULL_CONTROL,
+ level,
+ access_key_id,
+ rgw::auth::RemoteApplier::AuthInfo::NO_SUBUSER,
+ TYPE_KEYSTONE
+ };
+}
+
/* Authenticate an S3 request via Keystone: resolve (token, secret) through
 * get_access_token(), verify token expiry and required roles, then grant a
 * remote applier plus a completer built from the secret key.
 * A signature mismatch on a known access key is a hard reject (no fallback
 * to other engines); other failures are soft denies. */
rgw::auth::Engine::result_t EC2Engine::authenticate(
  const DoutPrefixProvider* dpp,
  const std::string_view& access_key_id,
  const std::string_view& signature,
  const std::string_view& session_token,
  const string_to_sign_t& string_to_sign,
  const signature_factory_t& signature_factory,
  const completer_factory_t& completer_factory,
  /* Passthorugh only! */
  const req_state* s,
  optional_yield y) const
{
  /* This will be initialized on the first call to this method. In C++11 it's
   * also thread-safe. */
  static const struct RolesCacher {
    explicit RolesCacher(CephContext* const cct) {
      get_str_vec(cct->_conf->rgw_keystone_accepted_roles, plain);
      get_str_vec(cct->_conf->rgw_keystone_accepted_admin_roles, admin);

      /* Let's suppose that having an admin role implies also a regular one. */
      plain.insert(std::end(plain), std::begin(admin), std::end(admin));
    }

    std::vector<std::string> plain;
    std::vector<std::string> admin;
  } accepted_roles(cct);

  auto [t, secret_key, failure_reason] =
    get_access_token(dpp, access_key_id, string_to_sign, signature, signature_factory);
  if (! t) {
    if (failure_reason == -ERR_SIGNATURE_NO_MATCH) {
      // we looked up a secret but it didn't generate the same signature as
      // the client. since we found this access key in keystone, we should
      // reject the request instead of trying other engines
      return result_t::reject(failure_reason);
    }
    return result_t::deny(failure_reason);
  }

  /* Verify expiration. */
  if (t->expired()) {
    ldpp_dout(dpp, 0) << "got expired token: " << t->get_project_name()
                      << ":" << t->get_user_name()
                      << " expired: " << t->get_expires() << dendl;
    return result_t::deny();
  }

  /* check if we have a valid role */
  bool found = false;
  for (const auto& role : accepted_roles.plain) {
    if (t->has_role(role) == true) {
      found = true;
      break;
    }
  }

  if (! found) {
    ldpp_dout(dpp, 5) << "s3 keystone: user does not hold a matching role;"
                         " required roles: "
                      << cct->_conf->rgw_keystone_accepted_roles << dendl;
    return result_t::deny();
  } else {
    /* everything seems fine, continue with this user */
    ldpp_dout(dpp, 5) << "s3 keystone: validated token: " << t->get_project_name()
                      << ":" << t->get_user_name()
                      << " expires: " << t->get_expires() << dendl;

    auto apl = apl_factory->create_apl_remote(cct, s, get_acl_strategy(*t),
                                              get_creds_info(*t, accepted_roles.admin, std::string(access_key_id)));
    return result_t::grant(std::move(apl), completer_factory(secret_key));
  }
}
+
+bool SecretCache::find(const std::string& token_id,
+ SecretCache::token_envelope_t& token,
+ std::string &secret)
+{
+ std::lock_guard<std::mutex> l(lock);
+
+ map<std::string, secret_entry>::iterator iter = secrets.find(token_id);
+ if (iter == secrets.end()) {
+ return false;
+ }
+
+ secret_entry& entry = iter->second;
+ secrets_lru.erase(entry.lru_iter);
+
+ const utime_t now = ceph_clock_now();
+ if (entry.token.expired() || now > entry.expires) {
+ secrets.erase(iter);
+ return false;
+ }
+ token = entry.token;
+ secret = entry.secret;
+
+ secrets_lru.push_front(token_id);
+ entry.lru_iter = secrets_lru.begin();
+
+ return true;
+}
+
+void SecretCache::add(const std::string& token_id,
+ const SecretCache::token_envelope_t& token,
+ const std::string& secret)
+{
+ std::lock_guard<std::mutex> l(lock);
+
+ map<string, secret_entry>::iterator iter = secrets.find(token_id);
+ if (iter != secrets.end()) {
+ secret_entry& e = iter->second;
+ secrets_lru.erase(e.lru_iter);
+ }
+
+ const utime_t now = ceph_clock_now();
+ secrets_lru.push_front(token_id);
+ secret_entry& entry = secrets[token_id];
+ entry.token = token;
+ entry.secret = secret;
+ entry.expires = now + s3_token_expiry_length;
+ entry.lru_iter = secrets_lru.begin();
+
+ while (secrets_lru.size() > max) {
+ list<string>::reverse_iterator riter = secrets_lru.rbegin();
+ iter = secrets.find(*riter);
+ assert(iter != secrets.end());
+ secrets.erase(iter);
+ secrets_lru.pop_back();
+ }
+}
+
+}; /* namespace keystone */
+}; /* namespace auth */
+}; /* namespace rgw */
diff --git a/src/rgw/rgw_auth_keystone.h b/src/rgw/rgw_auth_keystone.h
new file mode 100644
index 000000000..9d79bc878
--- /dev/null
+++ b/src/rgw/rgw_auth_keystone.h
@@ -0,0 +1,202 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <string_view>
+#include <utility>
+#include <boost/optional.hpp>
+
+#include "rgw_auth.h"
+#include "rgw_rest_s3.h"
+#include "rgw_common.h"
+#include "rgw_keystone.h"
+
+namespace rgw {
+namespace auth {
+namespace keystone {
+
+/* Dedicated namespace for Keystone-related auth engines. We need it because
+ * Keystone offers three different authentication mechanisms (token, EC2 and
+ * regular user/pass). RadosGW actually does support the first two. */
+
+/* Auth engine that validates a Keystone token (plus an optional service
+ * token) against the configured Keystone endpoint, using 'token_cache' to
+ * avoid re-validating recently seen tokens. On success it produces a
+ * RemoteApplier via 'apl_factory'. All extractor/factory pointers are
+ * non-owning and must outlive the engine. */
+class TokenEngine : public rgw::auth::Engine {
+ CephContext* const cct;
+
+ using acl_strategy_t = rgw::auth::RemoteApplier::acl_strategy_t;
+ using auth_info_t = rgw::auth::RemoteApplier::AuthInfo;
+ using result_t = rgw::auth::Engine::result_t;
+ using token_envelope_t = rgw::keystone::TokenEnvelope;
+
+ /* Pull the subject token / service token out of the request. */
+ const rgw::auth::TokenExtractor* const auth_token_extractor;
+ const rgw::auth::TokenExtractor* const service_token_extractor;
+ const rgw::auth::RemoteApplier::Factory* const apl_factory;
+ rgw::keystone::Config& config;
+ rgw::keystone::TokenCache& token_cache;
+
+ /* Helper methods. */
+ bool is_applicable(const std::string& token) const noexcept;
+
+ /* Ask Keystone to validate 'token'; boost::none when validation fails.
+ * 'allow_expired' permits accepting an already-expired token. */
+ boost::optional<token_envelope_t>
+ get_from_keystone(const DoutPrefixProvider* dpp, const std::string& token, bool allow_expired) const;
+
+ acl_strategy_t get_acl_strategy(const token_envelope_t& token) const;
+ auth_info_t get_creds_info(const token_envelope_t& token,
+ const std::vector<std::string>& admin_roles
+ ) const noexcept;
+ result_t authenticate(const DoutPrefixProvider* dpp,
+ const std::string& token,
+ const std::string& service_token,
+ const req_state* s) const;
+
+public:
+ TokenEngine(CephContext* const cct,
+ const rgw::auth::TokenExtractor* const auth_token_extractor,
+ const rgw::auth::TokenExtractor* const service_token_extractor,
+ const rgw::auth::RemoteApplier::Factory* const apl_factory,
+ rgw::keystone::Config& config,
+ rgw::keystone::TokenCache& token_cache)
+ : cct(cct),
+ auth_token_extractor(auth_token_extractor),
+ service_token_extractor(service_token_extractor),
+ apl_factory(apl_factory),
+ config(config),
+ token_cache(token_cache) {
+ }
+
+ const char* get_name() const noexcept override {
+ return "rgw::auth::keystone::TokenEngine";
+ }
+
+ /* Engine interface: extract both tokens from the request and delegate to
+ * the private authenticate() overload. */
+ result_t authenticate(const DoutPrefixProvider* dpp, const req_state* const s,
+ optional_yield y) const override {
+ return authenticate(dpp, auth_token_extractor->get_token(s), service_token_extractor->get_token(s), s);
+ }
+}; /* class TokenEngine */
+
+/* Process-wide LRU cache mapping an EC2 access-key id to the Keystone token
+ * envelope and S3 secret key retrieved for it, so the EC2Engine does not
+ * have to round-trip to Keystone on every request. Singleton; guarded by an
+ * internal mutex. */
+class SecretCache {
+ using token_envelope_t = rgw::keystone::TokenEnvelope;
+
+ struct secret_entry {
+ token_envelope_t token;
+ std::string secret;
+ utime_t expires; // local cache TTL, independent of the token's own expiry
+ std::list<std::string>::iterator lru_iter; // back-pointer into secrets_lru
+ };
+
+ const boost::intrusive_ptr<CephContext> cct;
+
+ /* Map and list are kept in lock-step: every map entry owns one LRU node. */
+ std::map<std::string, secret_entry> secrets;
+ std::list<std::string> secrets_lru;
+
+ std::mutex lock;
+
+ /* Capacity bound, taken from rgw_keystone_token_cache_size. */
+ const size_t max;
+
+ /* Fixed 300-second local TTL applied to every entry on add(). */
+ const utime_t s3_token_expiry_length;
+
+ SecretCache()
+ : cct(g_ceph_context),
+ lock(),
+ max(cct->_conf->rgw_keystone_token_cache_size),
+ s3_token_expiry_length(300, 0) {
+ }
+
+ ~SecretCache() {}
+
+public:
+ SecretCache(const SecretCache&) = delete;
+ void operator=(const SecretCache&) = delete;
+
+ static SecretCache& get_instance() {
+ /* In C++11 this is thread safe. */
+ static SecretCache instance;
+ return instance;
+ }
+
+ bool find(const std::string& token_id, token_envelope_t& token, std::string& secret);
+ /* Convenience wrapper returning both values as an optional tuple. */
+ boost::optional<boost::tuple<token_envelope_t, std::string>> find(const std::string& token_id) {
+ token_envelope_t token_envlp;
+ std::string secret;
+ if (find(token_id, token_envlp, secret)) {
+ return boost::make_tuple(token_envlp, secret);
+ }
+ return boost::none;
+ }
+ void add(const std::string& token_id, const token_envelope_t& token, const std::string& secret);
+}; /* class SecretCache */
+
+/* AWS-style (EC2 credentials) auth engine backed by Keystone: verifies the
+ * request signature by asking Keystone for the secret matching the access
+ * key id, caching results in 'secret_cache'. Derives the signature handling
+ * machinery from rgw::auth::s3::AWSEngine. */
+class EC2Engine : public rgw::auth::s3::AWSEngine {
+ using acl_strategy_t = rgw::auth::RemoteApplier::acl_strategy_t;
+ using auth_info_t = rgw::auth::RemoteApplier::AuthInfo;
+ using result_t = rgw::auth::Engine::result_t;
+ using token_envelope_t = rgw::keystone::TokenEnvelope;
+
+ const rgw::auth::RemoteApplier::Factory* const apl_factory;
+ rgw::keystone::Config& config;
+ rgw::keystone::TokenCache& token_cache;
+ rgw::auth::keystone::SecretCache& secret_cache;
+
+ /* Helper methods. */
+ acl_strategy_t get_acl_strategy(const token_envelope_t& token) const;
+ auth_info_t get_creds_info(const token_envelope_t& token,
+ const std::vector<std::string>& admin_roles,
+ const std::string& access_key_id
+ ) const noexcept;
+ /* Ask Keystone to verify the EC2 signature; returns the token envelope
+ * (or boost::none) plus the failure errno for diagnostics. */
+ std::pair<boost::optional<token_envelope_t>, int>
+ get_from_keystone(const DoutPrefixProvider* dpp,
+ const std::string_view& access_key_id,
+ const std::string& string_to_sign,
+ const std::string_view& signature) const;
+
+ /* Aggregated outcome of the token/secret lookup. */
+ struct access_token_result {
+ boost::optional<token_envelope_t> token;
+ boost::optional<std::string> secret_key;
+ int failure_reason = 0;
+ };
+ access_token_result
+ get_access_token(const DoutPrefixProvider* dpp,
+ const std::string_view& access_key_id,
+ const std::string& string_to_sign,
+ const std::string_view& signature,
+ const signature_factory_t& signature_factory) const;
+ result_t authenticate(const DoutPrefixProvider* dpp,
+ const std::string_view& access_key_id,
+ const std::string_view& signature,
+ const std::string_view& session_token,
+ const string_to_sign_t& string_to_sign,
+ const signature_factory_t& signature_factory,
+ const completer_factory_t& completer_factory,
+ const req_state* s,
+ optional_yield y) const override;
+ std::pair<boost::optional<std::string>, int> get_secret_from_keystone(const DoutPrefixProvider* dpp,
+ const std::string& user_id,
+ const std::string_view& access_key_id) const;
+public:
+ EC2Engine(CephContext* const cct,
+ const rgw::auth::s3::AWSEngine::VersionAbstractor* const ver_abstractor,
+ const rgw::auth::RemoteApplier::Factory* const apl_factory,
+ rgw::keystone::Config& config,
+ /* The token cache is used ONLY for the retrieving admin token.
+ * Due to the architecture of AWS Auth S3 credentials cannot be
+ * cached at all. */
+ rgw::keystone::TokenCache& token_cache,
+ rgw::auth::keystone::SecretCache& secret_cache)
+ : AWSEngine(cct, *ver_abstractor),
+ apl_factory(apl_factory),
+ config(config),
+ token_cache(token_cache),
+ secret_cache(secret_cache) {
+ }
+
+ using AWSEngine::authenticate;
+
+ const char* get_name() const noexcept override {
+ return "rgw::auth::keystone::EC2Engine";
+ }
+
+}; /* class EC2Engine */
+
+}; /* namespace keystone */
+}; /* namespace auth */
+}; /* namespace rgw */
diff --git a/src/rgw/rgw_auth_registry.h b/src/rgw/rgw_auth_registry.h
new file mode 100644
index 000000000..b9d239aec
--- /dev/null
+++ b/src/rgw/rgw_auth_registry.h
@@ -0,0 +1,97 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <ostream>
+#include <type_traits>
+#include <utility>
+
+#include "rgw_auth.h"
+#include "rgw_auth_s3.h"
+#include "rgw_swift_auth.h"
+#include "rgw_rest_sts.h"
+
+namespace rgw {
+namespace auth {
+
+/* A class aggregating the knowledge about all Strategies in RadosGW. It is
+ * responsible for handling the dynamic reconfiguration on e.g. realm update. */
+class StrategyRegistry {
+ /* Alias for the AWS auth strategy parameterized on its version
+ * abstractor and on whether anonymous access is allowed. */
+ template <class AbstractorT,
+ bool AllowAnonAccessT = false>
+ using s3_strategy_t = \
+ rgw::auth::s3::AWSAuthStrategy<AbstractorT, AllowAnonAccessT>;
+
+ /* Main S3 strategy: try the standard abstractor first (SUFFICIENT), and
+ * fall back to the boto2-compatibility abstractor on failure. */
+ struct s3_main_strategy_t : public Strategy {
+ using s3_main_strategy_plain_t = \
+ s3_strategy_t<rgw::auth::s3::AWSGeneralAbstractor, true>;
+ using s3_main_strategy_boto2_t = \
+ s3_strategy_t<rgw::auth::s3::AWSGeneralBoto2Abstractor>;
+
+ s3_main_strategy_plain_t s3_main_strategy_plain;
+ s3_main_strategy_boto2_t s3_main_strategy_boto2;
+
+ s3_main_strategy_t(CephContext* const cct,
+ const ImplicitTenants& implicit_tenant_context,
+ rgw::sal::Driver* driver)
+ : s3_main_strategy_plain(cct, implicit_tenant_context, driver),
+ s3_main_strategy_boto2(cct, implicit_tenant_context, driver) {
+ add_engine(Strategy::Control::SUFFICIENT, s3_main_strategy_plain);
+ add_engine(Strategy::Control::FALLBACK, s3_main_strategy_boto2);
+ }
+
+ const char* get_name() const noexcept override {
+ return "rgw::auth::StrategyRegistry::s3_main_strategy_t";
+ }
+ } s3_main_strategy;
+
+ /* Strategy for S3 browser-based (HTML form POST) uploads. */
+ using s3_post_strategy_t = \
+ s3_strategy_t<rgw::auth::s3::AWSBrowserUploadAbstractor>;
+ s3_post_strategy_t s3_post_strategy;
+
+ rgw::auth::swift::DefaultStrategy swift_strategy;
+
+ rgw::auth::sts::DefaultStrategy sts_strategy;
+
+public:
+ StrategyRegistry(CephContext* const cct,
+ const ImplicitTenants& implicit_tenant_context,
+ rgw::sal::Driver* driver)
+ : s3_main_strategy(cct, implicit_tenant_context, driver),
+ s3_post_strategy(cct, implicit_tenant_context, driver),
+ swift_strategy(cct, implicit_tenant_context, driver),
+ sts_strategy(cct, implicit_tenant_context, driver) {
+ }
+
+ /* Accessors hand out const references; the registry owns the strategies. */
+ const s3_main_strategy_t& get_s3_main() const {
+ return s3_main_strategy;
+ }
+
+ const s3_post_strategy_t& get_s3_post() const {
+ return s3_post_strategy;
+ }
+
+ const rgw::auth::swift::DefaultStrategy& get_swift() const {
+ return swift_strategy;
+ }
+
+ const rgw::auth::sts::DefaultStrategy& get_sts() const {
+ return sts_strategy;
+ }
+
+ /* Factory used on startup / realm reconfiguration. */
+ static std::unique_ptr<StrategyRegistry>
+ create(CephContext* const cct,
+ const ImplicitTenants& implicit_tenant_context,
+ rgw::sal::Driver* driver) {
+ return std::make_unique<StrategyRegistry>(cct, implicit_tenant_context, driver);
+ }
+};
+
+} /* namespace auth */
+} /* namespace rgw */
+
+using rgw_auth_registry_t = rgw::auth::StrategyRegistry;
+using rgw_auth_registry_ptr_t = std::unique_ptr<rgw_auth_registry_t>;
diff --git a/src/rgw/rgw_auth_s3.cc b/src/rgw/rgw_auth_s3.cc
new file mode 100644
index 000000000..0797f8184
--- /dev/null
+++ b/src/rgw/rgw_auth_s3.cc
@@ -0,0 +1,1355 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <algorithm>
+#include <map>
+#include <iterator>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "common/armor.h"
+#include "common/utf8.h"
+#include "rgw_rest_s3.h"
+#include "rgw_auth_s3.h"
+#include "rgw_common.h"
+#include "rgw_client_io.h"
+#include "rgw_rest.h"
+#include "rgw_crypt_sanitize.h"
+
+#include <boost/container/small_vector.hpp>
+#include <boost/algorithm/string.hpp>
+#include <boost/algorithm/string/trim_all.hpp>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+static const auto signed_subresources = {
+ "acl",
+ "cors",
+ "delete",
+ "encryption",
+ "lifecycle",
+ "location",
+ "logging",
+ "notification",
+ "partNumber",
+ "policy",
+ "policyStatus",
+ "publicAccessBlock",
+ "requestPayment",
+ "response-cache-control",
+ "response-content-disposition",
+ "response-content-encoding",
+ "response-content-language",
+ "response-content-type",
+ "response-expires",
+ "tagging",
+ "torrent",
+ "uploadId",
+ "uploads",
+ "versionId",
+ "versioning",
+ "versions",
+ "website",
+ "object-lock"
+};
+
+/*
+ * Render the already-normalized x-amz-* headers as "name:value\n" lines for
+ * the AWS v2 canonical string. 'meta_map' is expected to arrive sorted and
+ * lowercased by the request-parsing code, so no further normalization is
+ * done here.
+ */
+
+static std::string
+get_canon_amz_hdr(const meta_map_t& meta_map)
+{
+ std::string dest;
+
+ for (const auto& kv : meta_map) {
+ dest.append(kv.first);
+ dest.append(":");
+ dest.append(kv.second);
+ dest.append("\n");
+ }
+
+ return dest;
+}
+
+/*
+ * Build the AWS v2 "CanonicalizedResource": the request URI followed by the
+ * signed sub-resources (from the fixed signed_subresources list, which is
+ * iterated in its declaration order) joined as a query string.
+ */
+static std::string
+get_canon_resource(const DoutPrefixProvider *dpp, const char* const request_uri,
+ const std::map<std::string, std::string>& sub_resources)
+{
+ std::string dest;
+
+ if (request_uri) {
+ dest.append(request_uri);
+ }
+
+ // Append only sub-resources that AWS defines as signature-relevant;
+ // anything else present on the request is deliberately ignored.
+ bool initial = true;
+ for (const auto& subresource : signed_subresources) {
+ const auto iter = sub_resources.find(subresource);
+ if (iter == std::end(sub_resources)) {
+ continue;
+ }
+
+ if (initial) {
+ dest.append("?");
+ initial = false;
+ } else {
+ dest.append("&");
+ }
+
+ dest.append(iter->first);
+ // Value-less sub-resources (e.g. "?acl") are emitted without '='.
+ if (! iter->second.empty()) {
+ dest.append("=");
+ dest.append(iter->second);
+ }
+ }
+
+ ldpp_dout(dpp, 10) << "get_canon_resource(): dest=" << dest << dendl;
+ return dest;
+}
+
+/*
+ * Assemble the AWS v2 string-to-sign:
+ *   METHOD \n CONTENT-MD5 \n CONTENT-TYPE \n DATE \n
+ *   CanonicalizedAmzHeaders CanonicalizedResource
+ * Any NULL header argument contributes an empty field (the '\n' is still
+ * emitted). The result is written to 'dest_str'.
+ */
+void rgw_create_s3_canonical_header(
+ const DoutPrefixProvider *dpp,
+ const char* const method,
+ const char* const content_md5,
+ const char* const content_type,
+ const char* const date,
+ const meta_map_t& meta_map,
+ const meta_map_t& qs_map,
+ const char* const request_uri,
+ const std::map<std::string, std::string>& sub_resources,
+ std::string& dest_str)
+{
+ std::string dest;
+
+ if (method) {
+ dest = method;
+ }
+ dest.append("\n");
+
+ if (content_md5) {
+ dest.append(content_md5);
+ }
+ dest.append("\n");
+
+ if (content_type) {
+ dest.append(content_type);
+ }
+ dest.append("\n");
+
+ if (date) {
+ dest.append(date);
+ }
+ dest.append("\n");
+
+ // Header-borne and query-string-borne x-amz-* entries both count.
+ dest.append(get_canon_amz_hdr(meta_map));
+ dest.append(get_canon_amz_hdr(qs_map));
+ dest.append(get_canon_resource(dpp, request_uri, sub_resources));
+
+ dest_str = dest;
+}
+
+// True if 'c' may appear in a base64-encoded Content-MD5 header value
+// (alphanumerics, '+', '/', '=' padding, plus whitespace).
+static inline bool is_base64_for_content_md5(unsigned char c) {
+ return (isalnum(c) || isspace(c) || (c == '+') || (c == '/') || (c == '='));
+}
+
+// For v2 query-string (presigned) requests: collect x-amz-meta-* and the
+// x-amz-security-token parameters from the URL into 'qs_map', lowercased,
+// so they participate in the canonical string like real headers would.
+static inline void get_v2_qs_map(const req_info& info,
+ meta_map_t& qs_map) {
+ // const_cast: get_params() lacks a const overload; the map is not modified.
+ const auto& params = const_cast<RGWHTTPArgs&>(info.args).get_params();
+ for (const auto& elt : params) {
+ std::string k = boost::algorithm::to_lower_copy(elt.first);
+ if (k.find("x-amz-meta-") == /* offset */ 0) {
+ rgw_add_amz_meta_header(qs_map, k, elt.second);
+ }
+ if (k == "x-amz-security-token") {
+ qs_map[k] = elt.second;
+ }
+ }
+}
+
+/*
+ * Build the AWS v2 string-to-sign directly from a parsed request.
+ * 'qsr' selects query-string (presigned URL) mode, where "Expires" stands
+ * in for the Date field. Returns false on malformed Content-MD5 or on a
+ * missing/unparseable date; on success 'dest' holds the canonical string
+ * and, when requested, '*header_time' the parsed request time.
+ */
+bool rgw_create_s3_canonical_header(const DoutPrefixProvider *dpp,
+ const req_info& info,
+ utime_t* const header_time,
+ std::string& dest,
+ const bool qsr)
+{
+ // Reject Content-MD5 values containing non-base64 bytes up front.
+ const char* const content_md5 = info.env->get("HTTP_CONTENT_MD5");
+ if (content_md5) {
+ for (const char *p = content_md5; *p; p++) {
+ if (!is_base64_for_content_md5(*p)) {
+ ldpp_dout(dpp, 0) << "NOTICE: bad content-md5 provided (not base64),"
+ << " aborting request p=" << *p << " " << (int)*p << dendl;
+ return false;
+ }
+ }
+ }
+
+ const char *content_type = info.env->get("CONTENT_TYPE");
+
+ std::string date;
+ meta_map_t qs_map;
+
+ if (qsr) {
+ get_v2_qs_map(info, qs_map); // handle qs metadata
+ date = info.args.get("Expires");
+ } else {
+ const char *str = info.env->get("HTTP_X_AMZ_DATE");
+ const char *req_date = str;
+ // Note: 'date' is only populated from the Date header fallback. When
+ // x-amz-date is present the Date line of the canonical string stays
+ // empty, since x-amz-date is already covered by the amz headers.
+ if (str == NULL) {
+ req_date = info.env->get("HTTP_DATE");
+ if (!req_date) {
+ ldpp_dout(dpp, 0) << "NOTICE: missing date for auth header" << dendl;
+ return false;
+ }
+ date = req_date;
+ }
+
+ if (header_time) {
+ struct tm t;
+ uint32_t ns = 0;
+ // Accept both RFC 2616 and ISO 8601 date formats.
+ if (!parse_rfc2616(req_date, &t) && !parse_iso8601(req_date, &t, &ns, false)) {
+ ldpp_dout(dpp, 0) << "NOTICE: failed to parse date <" << req_date << "> for auth header" << dendl;
+ return false;
+ }
+ if (t.tm_year < 70) {
+ ldpp_dout(dpp, 0) << "NOTICE: bad date (predates epoch): " << req_date << dendl;
+ return false;
+ }
+ *header_time = utime_t(internal_timegm(&t), 0);
+ *header_time -= t.tm_gmtoff;
+ }
+ }
+
+ const auto& meta_map = info.x_meta_map;
+ const auto& sub_resources = info.args.get_sub_resources();
+
+ // Prefer the effective (rewritten) URI when one was computed.
+ std::string request_uri;
+ if (info.effective_uri.empty()) {
+ request_uri = info.request_uri;
+ } else {
+ request_uri = info.effective_uri;
+ }
+
+ rgw_create_s3_canonical_header(dpp, info.method, content_md5, content_type,
+ date.c_str(), meta_map, qs_map,
+ request_uri.c_str(), sub_resources, dest);
+ return true;
+}
+
+
+namespace rgw::auth::s3 {
+
+// Check the request timestamp 't' against the local clock; returns false
+// when the absolute skew exceeds RGW_AUTH_GRACE (in either direction).
+bool is_time_skew_ok(time_t t)
+{
+ auto req_tp = ceph::coarse_real_clock::from_time_t(t);
+ auto cur_tp = ceph::coarse_real_clock::now();
+
+ if (std::chrono::abs(cur_tp - req_tp) > RGW_AUTH_GRACE) {
+ dout(10) << "NOTICE: request time skew too big." << dendl;
+ using ceph::operator<<;
+ dout(10) << "req_tp=" << req_tp << ", cur_tp=" << cur_tp << dendl;
+ return false;
+ }
+
+ return true;
+}
+
+// Extract the SigV4 fields from a presigned-URL (query-string) request and
+// enforce X-Amz-Expires (1s..7 days) against the request date. Returns 0 on
+// success, -EPERM on any missing/invalid/expired parameter. Output
+// string_views alias storage owned by 'info.args'.
+static inline int parse_v4_query_string(const req_info& info, /* in */
+ std::string_view& credential, /* out */
+ std::string_view& signedheaders, /* out */
+ std::string_view& signature, /* out */
+ std::string_view& date, /* out */
+ std::string_view& sessiontoken) /* out */
+{
+ /* auth ships with req params ... */
+
+ /* look for required params */
+ credential = info.args.get("x-amz-credential");
+ if (credential.size() == 0) {
+ return -EPERM;
+ }
+
+ date = info.args.get("x-amz-date");
+ struct tm date_t;
+ if (!parse_iso8601(sview2cstr(date).data(), &date_t, nullptr, false)) {
+ return -EPERM;
+ }
+
+ std::string_view expires = info.args.get("x-amz-expires");
+ if (expires.empty()) {
+ return -EPERM;
+ }
+ /* X-Amz-Expires provides the time period, in seconds, for which
+ the generated presigned URL is valid. The minimum value
+ you can set is 1, and the maximum is 604800 (seven days) */
+ time_t exp = atoll(expires.data());
+ if ((exp < 1) || (exp > 7*24*60*60)) {
+ dout(10) << "NOTICE: exp out of range, exp = " << exp << dendl;
+ return -EPERM;
+ }
+ /* handle expiration in epoch time */
+ uint64_t req_sec = (uint64_t)internal_timegm(&date_t);
+ uint64_t now = ceph_clock_now();
+ if (now >= req_sec + exp) {
+ dout(10) << "NOTICE: now = " << now << ", req_sec = " << req_sec << ", exp = " << exp << dendl;
+ return -EPERM;
+ }
+
+ signedheaders = info.args.get("x-amz-signedheaders");
+ if (signedheaders.size() == 0) {
+ return -EPERM;
+ }
+
+ signature = info.args.get("x-amz-signature");
+ if (signature.size() == 0) {
+ return -EPERM;
+ }
+
+ // Session token is optional, but if present it must be non-empty.
+ if (info.args.exists("x-amz-security-token")) {
+ sessiontoken = info.args.get("x-amz-security-token");
+ if (sessiontoken.size() == 0) {
+ return -EPERM;
+ }
+ }
+
+ return 0;
+}
+
+// Tokenizer step: starting at 'pos', skip any leading delimiter characters
+// and set 'token' to the next delimiter-free run within 's'. Advances 'pos'
+// past the token (and one trailing delimiter, if any). Returns false when
+// only delimiters remain. The view 'token' aliases 's'.
+static bool get_next_token(const std::string_view& s,
+ size_t& pos,
+ const char* const delims,
+ std::string_view& token)
+{
+ const size_t start = s.find_first_not_of(delims, pos);
+ if (start == std::string_view::npos) {
+ pos = s.size();
+ return false;
+ }
+
+ size_t end = s.find_first_of(delims, start);
+ if (end != std::string_view::npos)
+ pos = end + 1;
+ else {
+ pos = end = s.size();
+ }
+
+ token = s.substr(start, end - start);
+ return true;
+}
+
+// Split 'str' on any of 'delims' into a small_vector of non-empty views
+// (all views alias 'str'). ExpectedStrNum sizes the inline storage only;
+// more tokens simply spill to the heap.
+template<std::size_t ExpectedStrNum>
+boost::container::small_vector<std::string_view, ExpectedStrNum>
+get_str_vec(const std::string_view& str, const char* const delims)
+{
+ boost::container::small_vector<std::string_view, ExpectedStrNum> str_vec;
+
+ size_t pos = 0;
+ std::string_view token;
+ while (pos < str.size()) {
+ if (get_next_token(str, pos, delims, token)) {
+ if (token.size() > 0) {
+ str_vec.push_back(token);
+ }
+ }
+ }
+
+ return str_vec;
+}
+
+// Convenience overload using the default delimiter set ";,= \t".
+template<std::size_t ExpectedStrNum>
+boost::container::small_vector<std::string_view, ExpectedStrNum>
+get_str_vec(const std::string_view& str)
+{
+ const char delims[] = ";,= \t";
+ return get_str_vec<ExpectedStrNum>(str, delims);
+}
+
+// Parse the SigV4 Authorization header ("AWS4-HMAC-SHA256 Credential=...,
+// SignedHeaders=..., Signature=...") plus the x-amz-date / Date header and
+// optional x-amz-security-token. Returns 0 on success, -EINVAL on a
+// malformed header, -EACCES on a bad date, -ERR_REQUEST_TIME_SKEWED on
+// excessive clock skew. Output views alias the request environment.
+static inline int parse_v4_auth_header(const req_info& info, /* in */
+ std::string_view& credential, /* out */
+ std::string_view& signedheaders, /* out */
+ std::string_view& signature, /* out */
+ std::string_view& date, /* out */
+ std::string_view& sessiontoken, /* out */
+ const DoutPrefixProvider *dpp)
+{
+ std::string_view input(info.env->get("HTTP_AUTHORIZATION", ""));
+ try {
+ // Strip the leading "AWS4-HMAC-SHA256 " algorithm tag.
+ input = input.substr(::strlen(AWS4_HMAC_SHA256_STR) + 1);
+ } catch (std::out_of_range&) {
+ /* We should never ever run into this situation as the presence of
+ * AWS4_HMAC_SHA256_STR had been verified earlier. */
+ ldpp_dout(dpp, 10) << "credentials string is too short" << dendl;
+ return -EINVAL;
+ }
+
+ // Split the remainder on commas into Key=Value pairs.
+ std::map<std::string_view, std::string_view> kv;
+ for (const auto& s : get_str_vec<4>(input, ",")) {
+ const auto parsed_pair = parse_key_value(s);
+ if (parsed_pair) {
+ kv[parsed_pair->first] = parsed_pair->second;
+ } else {
+ ldpp_dout(dpp, 10) << "NOTICE: failed to parse auth header (s=" << s << ")"
+ << dendl;
+ return -EINVAL;
+ }
+ }
+
+ static const std::array<std::string_view, 3> required_keys = {
+ "Credential",
+ "SignedHeaders",
+ "Signature"
+ };
+
+ /* Ensure that the presigned required keys are really there. */
+ for (const auto& k : required_keys) {
+ if (kv.find(k) == std::end(kv)) {
+ ldpp_dout(dpp, 10) << "NOTICE: auth header missing key: " << k << dendl;
+ return -EINVAL;
+ }
+ }
+
+ credential = kv["Credential"];
+ signedheaders = kv["SignedHeaders"];
+ signature = kv["Signature"];
+
+ /* sig hex str */
+ ldpp_dout(dpp, 10) << "v4 signature format = " << signature << dendl;
+
+ /* ------------------------- handle x-amz-date header */
+
+ /* grab date */
+
+ const char *d = info.env->get("HTTP_X_AMZ_DATE");
+
+ struct tm t;
+ if (unlikely(d == NULL)) {
+ // Fall back to the standard Date header when x-amz-date is absent.
+ d = info.env->get("HTTP_DATE");
+ }
+ if (!d || !parse_iso8601(d, &t, NULL, false)) {
+ ldpp_dout(dpp, 10) << "error reading date via http_x_amz_date and http_date" << dendl;
+ return -EACCES;
+ }
+ date = d;
+
+ if (!is_time_skew_ok(internal_timegm(&t))) {
+ return -ERR_REQUEST_TIME_SKEWED;
+ }
+
+ auto token = info.env->get_optional("HTTP_X_AMZ_SECURITY_TOKEN");
+ if (token) {
+ sessiontoken = *token;
+ }
+
+ return 0;
+}
+
+// True for operation types that are serviced by RGW but are not S3 object
+// operations (STS, IAM role/policy/OIDC management, pubsub topics); these
+// get different canonicalization treatment (e.g. PayloadHash is skipped).
+bool is_non_s3_op(RGWOpType op_type)
+{
+ if (op_type == RGW_STS_GET_SESSION_TOKEN ||
+ op_type == RGW_STS_ASSUME_ROLE ||
+ op_type == RGW_STS_ASSUME_ROLE_WEB_IDENTITY ||
+ op_type == RGW_OP_CREATE_ROLE ||
+ op_type == RGW_OP_DELETE_ROLE ||
+ op_type == RGW_OP_GET_ROLE ||
+ op_type == RGW_OP_MODIFY_ROLE_TRUST_POLICY ||
+ op_type == RGW_OP_LIST_ROLES ||
+ op_type == RGW_OP_PUT_ROLE_POLICY ||
+ op_type == RGW_OP_GET_ROLE_POLICY ||
+ op_type == RGW_OP_LIST_ROLE_POLICIES ||
+ op_type == RGW_OP_DELETE_ROLE_POLICY ||
+ op_type == RGW_OP_PUT_USER_POLICY ||
+ op_type == RGW_OP_GET_USER_POLICY ||
+ op_type == RGW_OP_LIST_USER_POLICIES ||
+ op_type == RGW_OP_DELETE_USER_POLICY ||
+ op_type == RGW_OP_CREATE_OIDC_PROVIDER ||
+ op_type == RGW_OP_DELETE_OIDC_PROVIDER ||
+ op_type == RGW_OP_GET_OIDC_PROVIDER ||
+ op_type == RGW_OP_LIST_OIDC_PROVIDERS ||
+ op_type == RGW_OP_PUBSUB_TOPIC_CREATE ||
+ op_type == RGW_OP_PUBSUB_TOPICS_LIST ||
+ op_type == RGW_OP_PUBSUB_TOPIC_GET ||
+ op_type == RGW_OP_PUBSUB_TOPIC_DELETE ||
+ op_type == RGW_OP_TAG_ROLE ||
+ op_type == RGW_OP_LIST_ROLE_TAGS ||
+ op_type == RGW_OP_UNTAG_ROLE ||
+ op_type == RGW_OP_UPDATE_ROLE) {
+ return true;
+ }
+ return false;
+}
+
+// Front door for SigV4 credential parsing: dispatch to the query-string or
+// Authorization-header parser ('using_qs'), then validate the Credential
+// field shape ("access_key/YYYYMMDD/region/service/aws4_request") and split
+// it into access key id and credential scope. Returns 0 or a negative errno.
+int parse_v4_credentials(const req_info& info, /* in */
+ std::string_view& access_key_id, /* out */
+ std::string_view& credential_scope, /* out */
+ std::string_view& signedheaders, /* out */
+ std::string_view& signature, /* out */
+ std::string_view& date, /* out */
+ std::string_view& session_token, /* out */
+ const bool using_qs, /* in */
+ const DoutPrefixProvider *dpp)
+{
+ std::string_view credential;
+ int ret;
+ if (using_qs) {
+ ret = parse_v4_query_string(info, credential, signedheaders,
+ signature, date, session_token);
+ } else {
+ ret = parse_v4_auth_header(info, credential, signedheaders,
+ signature, date, session_token, dpp);
+ }
+
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* access_key/YYYYMMDD/region/service/aws4_request */
+ ldpp_dout(dpp, 10) << "v4 credential format = " << credential << dendl;
+
+ // Exactly five slash-separated components are required.
+ if (std::count(credential.begin(), credential.end(), '/') != 4) {
+ return -EINVAL;
+ }
+
+ /* credential must end with 'aws4_request' */
+ if (credential.find("aws4_request") == std::string::npos) {
+ return -EINVAL;
+ }
+
+ /* grab access key id */
+ const size_t pos = credential.find("/");
+ access_key_id = credential.substr(0, pos);
+ ldpp_dout(dpp, 10) << "access key id = " << access_key_id << dendl;
+
+ /* grab credential scope */
+ credential_scope = credential.substr(pos + 1);
+ ldpp_dout(dpp, 10) << "credential scope = " << credential_scope << dendl;
+
+ return 0;
+}
+
+// Build the SigV4 credential scope "YYYYMMDD/region/service/aws4_request"
+// from a timestamp (interpreted as UTC), region and service name.
+string gen_v4_scope(const ceph::real_time& timestamp,
+ const string& region,
+ const string& service)
+{
+
+ auto sec = real_clock::to_time_t(timestamp);
+
+ struct tm bt;
+ gmtime_r(&sec, &bt);
+
+ // struct tm stores year-1900 and a zero-based month; undo both.
+ auto year = 1900 + bt.tm_year;
+ auto mon = bt.tm_mon + 1;
+ auto day = bt.tm_mday;
+
+ return fmt::format(FMT_STRING("{:d}{:02d}{:02d}/{:s}/{:s}/aws4_request"),
+ year, mon, day, region, service);
+}
+
+// Produce the SigV4 canonical query string from the raw request params:
+// '+' is normalized to "%20", keys/values are re-encoded, entries are
+// sorted by (map order), and for presigned requests X-Amz-Signature itself
+// is excluded. Returns "" when there are no parameters.
+std::string get_v4_canonical_qs(const req_info& info, const bool using_qs)
+{
+ const std::string *params = &info.request_params;
+ std::string copy_params;
+ if (params->empty()) {
+ /* Optimize the typical flow. */
+ return std::string();
+ }
+ // Normalize '+' (form-encoded space) to "%20" before re-encoding; copy
+ // only when the substitution is actually needed.
+ if (params->find_first_of('+') != std::string::npos) {
+ copy_params = *params;
+ boost::replace_all(copy_params, "+", "%20");
+ params = &copy_params;
+ }
+
+ /* Handle case when query string exists. Step 3 described in: http://docs.
+ * aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html */
+ std::map<std::string, std::string> canonical_qs_map;
+ for (const auto& s : get_str_vec<5>(*params, "&")) {
+ std::string_view key, val;
+ const auto parsed_pair = parse_key_value(s);
+ if (parsed_pair) {
+ std::tie(key, val) = *parsed_pair;
+ } else {
+ /* Handling a parameter without any value (even the empty one). That's
+ * it, we've encountered something like "this_param&other_param=val"
+ * which is used by S3 for subresources. */
+ key = s;
+ }
+
+ if (using_qs && boost::iequals(key, "X-Amz-Signature")) {
+ /* Preserving the original behaviour of get_v4_canonical_qs() here. */
+ continue;
+ }
+
+ // while awsv4 specs ask for all slashes to be encoded, s3 itself is relaxed
+ // in its implementation allowing non-url-encoded slashes to be present in
+ // presigned urls for instance
+ canonical_qs_map[aws4_uri_recode(key, true)] = aws4_uri_recode(val, true);
+ }
+
+ /* Thanks to the early exit we have the guarantee that canonical_qs_map has
+ * at least one element. */
+ auto iter = std::begin(canonical_qs_map);
+ std::string canonical_qs;
+ canonical_qs.append(iter->first)
+ .append("=", ::strlen("="))
+ .append(iter->second);
+
+ for (iter++; iter != std::end(canonical_qs_map); iter++) {
+ canonical_qs.append("&", ::strlen("&"))
+ .append(iter->first)
+ .append("=", ::strlen("="))
+ .append(iter->second);
+ }
+
+ return canonical_qs;
+}
+
+// Merge the parameters of 'm' into '*result' with both key and value
+// aws4-uri-re-encoded. Empty keys are dropped, and for non-S3 ops the
+// internal "PayloadHash" pseudo-parameter is excluded from signing.
+static void add_v4_canonical_params_from_map(const map<string, string>& m,
+ std::map<string, string> *result,
+ bool is_non_s3_op)
+{
+ for (auto& entry : m) {
+ const auto& key = entry.first;
+ if (key.empty() || (is_non_s3_op && key == "PayloadHash")) {
+ continue;
+ }
+
+ (*result)[aws4_uri_recode(key, true)] = aws4_uri_recode(entry.second, true);
+ }
+}
+
+// Server-side variant of canonical query-string generation used when RGW
+// itself signs a request: combines user params and system params (already
+// parsed into maps) into the sorted, re-encoded "k=v&k=v" form.
+std::string gen_v4_canonical_qs(const req_info& info, bool is_non_s3_op)
+{
+ std::map<std::string, std::string> canonical_qs_map;
+
+ add_v4_canonical_params_from_map(info.args.get_params(), &canonical_qs_map, is_non_s3_op);
+ add_v4_canonical_params_from_map(info.args.get_sys_params(), &canonical_qs_map, false);
+
+ if (canonical_qs_map.empty()) {
+ return string();
+ }
+
+ /* Thanks to the early exit we have the guarantee that canonical_qs_map has
+ * at least one element. */
+ auto iter = std::begin(canonical_qs_map);
+ std::string canonical_qs;
+ canonical_qs.append(iter->first)
+ .append("=", ::strlen("="))
+ .append(iter->second);
+
+ for (iter++; iter != std::end(canonical_qs_map); iter++) {
+ canonical_qs.append("&", ::strlen("&"))
+ .append(iter->first)
+ .append("=", ::strlen("="))
+ .append(iter->second);
+ }
+
+ return canonical_qs;
+}
+
+// Return the HTTP method to use in the canonical request. For CORS
+// preflights (OPTIONS) the signature covers the intended method from
+// Access-Control-Request-Method instead; an invalid or absent value there
+// throws -EINVAL.
+std::string get_v4_canonical_method(const req_state* s)
+{
+ /* If this is a OPTIONS request we need to compute the v4 signature for the
+ * intended HTTP method and not the OPTIONS request itself. */
+ if (s->op_type == RGW_OP_OPTIONS_CORS) {
+ const char *cors_method = s->info.env->get("HTTP_ACCESS_CONTROL_REQUEST_METHOD");
+
+ if (cors_method) {
+ /* Validate request method passed in access-control-request-method is valid. */
+ auto cors_flags = get_cors_method_flags(cors_method);
+ if (!cors_flags) {
+ ldpp_dout(s, 1) << "invalid access-control-request-method header = "
+ << cors_method << dendl;
+ throw -EINVAL;
+ }
+
+ ldpp_dout(s, 10) << "canonical req method = " << cors_method
+ << ", due to access-control-request-method header" << dendl;
+ return cors_method;
+ } else {
+ ldpp_dout(s, 1) << "invalid http options req missing "
+ << "access-control-request-method header" << dendl;
+ throw -EINVAL;
+ }
+ }
+
+ return s->info.method;
+}
+
+// Build the SigV4 canonical headers block ("name:value\n"...) for exactly
+// the headers listed in 'signedheaders'. Headers absent from the request
+// environment are skipped; a non-base64 Content-MD5 aborts the request
+// (boost::none). 'force_boto2_compat' reproduces boto2's Host:port quirk
+// for presigned URLs on non-default ports.
+boost::optional<std::string>
+get_v4_canonical_headers(const req_info& info,
+ const std::string_view& signedheaders,
+ const bool using_qs,
+ const bool force_boto2_compat)
+{
+ std::map<std::string_view, std::string> canonical_hdrs_map;
+ for (const auto& token : get_str_vec<5>(signedheaders, ";")) {
+ /* TODO(rzarzynski): we'd like to switch to sstring here but it should
+ * get push_back() and reserve() first. */
+ // Map the wire header name to its CGI-environment key: '-' <-> '_'
+ // swapped and uppercased, with an HTTP_ prefix.
+ std::string token_env = "HTTP_";
+ token_env.reserve(token.length() + std::strlen("HTTP_") + 1);
+
+ std::transform(std::begin(token), std::end(token),
+ std::back_inserter(token_env), [](const int c) {
+ return c == '-' ? '_' : c == '_' ? '-' : std::toupper(c);
+ });
+
+ // Content-Length / Content-Type live in the environment without the
+ // HTTP_ prefix (CGI convention).
+ if (token_env == "HTTP_CONTENT_LENGTH") {
+ token_env = "CONTENT_LENGTH";
+ } else if (token_env == "HTTP_CONTENT_TYPE") {
+ token_env = "CONTENT_TYPE";
+ }
+ const char* const t = info.env->get(token_env.c_str());
+ if (!t) {
+ dout(10) << "warning env var not available " << token_env.c_str() << dendl;
+ continue;
+ }
+
+ std::string token_value(t);
+ if (token_env == "HTTP_CONTENT_MD5" &&
+ !std::all_of(std::begin(token_value), std::end(token_value),
+ is_base64_for_content_md5)) {
+ dout(0) << "NOTICE: bad content-md5 provided (not base64)"
+ << ", aborting request" << dendl;
+ return boost::none;
+ }
+
+ // boto2 appends ":port" to Host for non-default ports; mirror that so
+ // its presigned URLs verify.
+ if (force_boto2_compat && using_qs && token == "host") {
+ std::string_view port = info.env->get("SERVER_PORT", "");
+ std::string_view secure_port = info.env->get("SERVER_PORT_SECURE", "");
+
+ if (!secure_port.empty()) {
+ if (secure_port != "443")
+ token_value.append(":", std::strlen(":"))
+ .append(secure_port.data(), secure_port.length());
+ } else if (!port.empty()) {
+ if (port != "80")
+ token_value.append(":", std::strlen(":"))
+ .append(port.data(), port.length());
+ }
+ }
+
+ canonical_hdrs_map[token] = rgw_trim_whitespace(token_value);
+ }
+
+ // std::map iteration yields the required lexicographic header order.
+ std::string canonical_hdrs;
+ for (const auto& header : canonical_hdrs_map) {
+ const std::string_view& name = header.first;
+ std::string value = header.second;
+ boost::trim_all<std::string>(value);
+
+ canonical_hdrs.append(name.data(), name.length())
+ .append(":", std::strlen(":"))
+ .append(value)
+ .append("\n", std::strlen("\n"));
+ }
+ return canonical_hdrs;
+}
+
+// Convert a CGI-environment header name (e.g. "HTTP_X_AMZ_DATE") to its
+// canonical lowercase wire form ("x-amz-date") and store the trimmed value
+// in '*canonical_hdrs_map'. Content-Length/Type get their special-cased
+// (non-HTTP_-prefixed) environment names.
+static void handle_header(const string& header, const string& val,
+ std::map<std::string, std::string> *canonical_hdrs_map)
+{
+ /* TODO(rzarzynski): we'd like to switch to sstring here but it should
+ * get push_back() and reserve() first. */
+
+ std::string token;
+ token.reserve(header.length());
+
+ if (header == "HTTP_CONTENT_LENGTH") {
+ token = "content-length";
+ } else if (header == "HTTP_CONTENT_TYPE") {
+ token = "content-type";
+ } else {
+ auto start = std::begin(header);
+ if (boost::algorithm::starts_with(header, "HTTP_")) {
+ start += 5; /* len("HTTP_") */
+ }
+
+ std::transform(start, std::end(header),
+ std::back_inserter(token), [](const int c) {
+ return c == '_' ? '-' : std::tolower(c);
+ });
+ }
+
+ (*canonical_hdrs_map)[token] = rgw_trim_whitespace(val);
+}
+
+// Server-side variant: build the canonical headers block from the whole
+// request environment plus 'extra_headers', and emit the matching
+// semicolon-joined SignedHeaders list into '*signed_hdrs' (both in
+// lexicographic order, courtesy of std::map).
+std::string gen_v4_canonical_headers(const req_info& info,
+ const map<string, string>& extra_headers,
+ string *signed_hdrs)
+{
+ std::map<std::string, std::string> canonical_hdrs_map;
+ for (auto& entry : info.env->get_map()) {
+ handle_header(entry.first, entry.second, &canonical_hdrs_map);
+ }
+ // extra_headers win on key collision (inserted last).
+ for (auto& entry : extra_headers) {
+ handle_header(entry.first, entry.second, &canonical_hdrs_map);
+ }
+
+ std::string canonical_hdrs;
+ signed_hdrs->clear();
+ for (const auto& header : canonical_hdrs_map) {
+ const auto& name = header.first;
+ std::string value = header.second;
+ boost::trim_all<std::string>(value);
+
+ if (!signed_hdrs->empty()) {
+ signed_hdrs->append(";");
+ }
+ signed_hdrs->append(name);
+
+ canonical_hdrs.append(name.data(), name.length())
+ .append(":", std::strlen(":"))
+ .append(value)
+ .append("\n", std::strlen("\n"));
+ }
+
+ return canonical_hdrs;
+}
+
+/*
+ * create canonical request for signature version 4
+ *
+ * http://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html
+ */
+sha256_digest_t
+get_v4_canon_req_hash(CephContext* cct,
+ const std::string_view& http_verb,
+ const std::string& canonical_uri,
+ const std::string& canonical_qs,
+ const std::string& canonical_hdrs,
+ const std::string_view& signed_hdrs,
+ const std::string_view& request_payload_hash,
+ const DoutPrefixProvider *dpp)
+{
+ ldpp_dout(dpp, 10) << "payload request hash = " << request_payload_hash << dendl;
+
+ const auto canonical_req = string_join_reserve("\n",
+ http_verb,
+ canonical_uri,
+ canonical_qs,
+ canonical_hdrs,
+ signed_hdrs,
+ request_payload_hash);
+
+ const auto canonical_req_hash = calc_hash_sha256(canonical_req);
+
+ using sanitize = rgw::crypt_sanitize::log_content;
+ ldpp_dout(dpp, 10) << "canonical request = " << sanitize{canonical_req} << dendl;
+ ldpp_dout(dpp, 10) << "canonical request hash = "
+ << canonical_req_hash << dendl;
+
+ return canonical_req_hash;
+}
+
+/*
+ * create string to sign for signature version 4
+ *
+ * http://docs.aws.amazon.com/general/latest/gr/sigv4-create-string-to-sign.html
+ */
+AWSEngine::VersionAbstractor::string_to_sign_t
+get_v4_string_to_sign(CephContext* const cct,
+ const std::string_view& algorithm,
+ const std::string_view& request_date,
+ const std::string_view& credential_scope,
+ const sha256_digest_t& canonreq_hash,
+ const DoutPrefixProvider *dpp)
+{
+ const auto hexed_cr_hash = canonreq_hash.to_str();
+ const std::string_view hexed_cr_hash_str(hexed_cr_hash);
+
+ const auto string_to_sign = string_join_reserve("\n",
+ algorithm,
+ request_date,
+ credential_scope,
+ hexed_cr_hash_str);
+
+ ldpp_dout(dpp, 10) << "string to sign = "
+ << rgw::crypt_sanitize::log_content{string_to_sign}
+ << dendl;
+
+ return string_to_sign;
+}
+
+
/* Split an AWSv4 credential scope ("date/region/service/aws4_request") into
 * its first three '/'-separated components. The trailing terminator (and
 * anything after the third field) is ignored. */
static inline std::tuple<std::string_view, /* date */
                         std::string_view, /* region */
                         std::string_view> /* service */
parse_cred_scope(std::string_view credential_scope)
{
  /* Chop the next '/'-terminated field off the front of credential_scope. */
  const auto take_field = [&credential_scope]() {
    const size_t slash = credential_scope.find('/');
    const auto field = credential_scope.substr(0, slash);
    /* npos + 1 wraps to 0, matching the original substr(pos + 1) trick. */
    credential_scope.remove_prefix(slash + 1);
    return field;
  };

  const auto date_cs = take_field();
  const auto region_cs = take_field();
  const auto service_cs = credential_scope.substr(0, credential_scope.find('/'));

  return std::make_tuple(date_cs, region_cs, service_cs);
}
+
+static inline std::vector<unsigned char>
+transform_secret_key(const std::string_view& secret_access_key)
+{
+ /* TODO(rzarzynski): switch to constexpr when C++14 becomes available. */
+ static const std::initializer_list<unsigned char> AWS4 { 'A', 'W', 'S', '4' };
+
+ /* boost::container::small_vector might be used here if someone wants to
+ * optimize out even more dynamic allocations. */
+ std::vector<unsigned char> secret_key_utf8;
+ secret_key_utf8.reserve(AWS4.size() + secret_access_key.size());
+ secret_key_utf8.assign(AWS4);
+
+ for (const auto c : secret_access_key) {
+ std::array<unsigned char, MAX_UTF8_SZ> buf;
+ const size_t n = encode_utf8(c, buf.data());
+ secret_key_utf8.insert(std::end(secret_key_utf8),
+ std::begin(buf), std::begin(buf) + n);
+ }
+
+ return secret_key_utf8;
+}
+
+/*
+ * calculate the SigningKey of AWS auth version 4
+ */
+static sha256_digest_t
+get_v4_signing_key(CephContext* const cct,
+ const std::string_view& credential_scope,
+ const std::string_view& secret_access_key,
+ const DoutPrefixProvider *dpp)
+{
+ std::string_view date, region, service;
+ std::tie(date, region, service) = parse_cred_scope(credential_scope);
+
+ const auto utfed_sec_key = transform_secret_key(secret_access_key);
+ const auto date_k = calc_hmac_sha256(utfed_sec_key, date);
+ const auto region_k = calc_hmac_sha256(date_k, region);
+ const auto service_k = calc_hmac_sha256(region_k, service);
+
+ /* aws4_request */
+ const auto signing_key = calc_hmac_sha256(service_k,
+ std::string_view("aws4_request"));
+
+ ldpp_dout(dpp, 10) << "date_k = " << date_k << dendl;
+ ldpp_dout(dpp, 10) << "region_k = " << region_k << dendl;
+ ldpp_dout(dpp, 10) << "service_k = " << service_k << dendl;
+ ldpp_dout(dpp, 10) << "signing_k = " << signing_key << dendl;
+
+ return signing_key;
+}
+
+/*
+ * calculate the AWS signature version 4
+ *
+ * http://docs.aws.amazon.com/general/latest/gr/sigv4-calculate-signature.html
+ *
+ * srv_signature_t is an alias over Ceph's basic_sstring. We're using
+ * it to keep everything within the stack boundaries instead of doing
+ * dynamic allocations.
+ */
/*
 * calculate the AWS signature version 4
 *
 * http://docs.aws.amazon.com/general/latest/gr/sigv4-calculate-signature.html
 *
 * srv_signature_t is an alias over Ceph's basic_sstring. We're using
 * it to keep everything within the stack boundaries instead of doing
 * dynamic allocations.
 */
AWSEngine::VersionAbstractor::server_signature_t
get_v4_signature(const std::string_view& credential_scope,
                 CephContext* const cct,
                 const std::string_view& secret_key,
                 const AWSEngine::VersionAbstractor::string_to_sign_t& string_to_sign,
                 const DoutPrefixProvider *dpp)
{
  /* Derive the per-scope signing key from the secret, then HMAC the
   * StringToSign with it. */
  auto signing_key = get_v4_signing_key(cct, credential_scope, secret_key, dpp);

  /* The server-side generated digest for comparison. */
  const auto digest = calc_hmac_sha256(signing_key, string_to_sign);

  /* TODO(rzarzynski): I would love to see our sstring having reserve() and
   * the non-const data() variant like C++17's std::string. */
  using srv_signature_t = AWSEngine::VersionAbstractor::server_signature_t;
  /* initialized_later() allocates storage of exactly 2 * digest size so
   * buf_to_hex() can write the hex form in place. */
  srv_signature_t signature(srv_signature_t::initialized_later(),
                            digest.SIZE * 2);
  buf_to_hex(digest.v, digest.SIZE, signature.begin());

  ldpp_dout(dpp, 10) << "generated signature = " << signature << dendl;

  return signature;
}
+
+AWSEngine::VersionAbstractor::server_signature_t
+get_v2_signature(CephContext* const cct,
+ const std::string& secret_key,
+ const AWSEngine::VersionAbstractor::string_to_sign_t& string_to_sign)
+{
+ if (secret_key.empty()) {
+ throw -EINVAL;
+ }
+
+ const auto digest = calc_hmac_sha1(secret_key, string_to_sign);
+
+ /* 64 is really enough */;
+ char buf[64];
+ const int ret = ceph_armor(std::begin(buf),
+ std::begin(buf) + 64,
+ reinterpret_cast<const char *>(digest.v),
+ reinterpret_cast<const char *>(digest.v + digest.SIZE));
+ if (ret < 0) {
+ ldout(cct, 10) << "ceph_armor failed" << dendl;
+ throw ret;
+ } else {
+ buf[ret] = '\0';
+ using srv_signature_t = AWSEngine::VersionAbstractor::server_signature_t;
+ return srv_signature_t(buf, ret);
+ }
+}
+
/* True when stream_pos lies at or past the end of the data described by this
 * chunk's metadata — i.e. the stream has advanced into the next chunk and a
 * fresh metadata block must be parsed. */
bool AWSv4ComplMulti::ChunkMeta::is_new_chunk_in_stream(size_t stream_pos) const
{
  return stream_pos >= (data_offset_in_stream + data_length);
}
+
/* Number of payload bytes of this chunk remaining at stream_pos.
 * NOTE(review): when stream_pos is already past the chunk's end the full
 * data_length is returned — per the in-line comment those bytes are assumed
 * to sit in parsing_buf; confirm against recv_chunk()'s bookkeeping. */
size_t AWSv4ComplMulti::ChunkMeta::get_data_size(size_t stream_pos) const
{
  if (stream_pos > (data_offset_in_stream + data_length)) {
    /* Data in parsing_buf. */
    return data_length;
  } else {
    return data_offset_in_stream + data_length - stream_pos;
  }
}
+
+
+/* AWSv4 completers begin. */
+std::pair<AWSv4ComplMulti::ChunkMeta, size_t /* consumed */>
+AWSv4ComplMulti::ChunkMeta::create_next(CephContext* const cct,
+ ChunkMeta&& old,
+ const char* const metabuf,
+ const size_t metabuf_len)
+{
+ std::string_view metastr(metabuf, metabuf_len);
+
+ const size_t semicolon_pos = metastr.find(";");
+ if (semicolon_pos == std::string_view::npos) {
+ ldout(cct, 20) << "AWSv4ComplMulti cannot find the ';' separator"
+ << dendl;
+ throw rgw::io::Exception(EINVAL, std::system_category());
+ }
+
+ char* data_field_end;
+ /* strtoull ignores the "\r\n" sequence after each non-first chunk. */
+ const size_t data_length = std::strtoull(metabuf, &data_field_end, 16);
+ if (data_length == 0 && data_field_end == metabuf) {
+ ldout(cct, 20) << "AWSv4ComplMulti: cannot parse the data size"
+ << dendl;
+ throw rgw::io::Exception(EINVAL, std::system_category());
+ }
+
+ /* Parse the chunk_signature=... part. */
+ const auto signature_part = metastr.substr(semicolon_pos + 1);
+ const size_t eq_sign_pos = signature_part.find("=");
+ if (eq_sign_pos == std::string_view::npos) {
+ ldout(cct, 20) << "AWSv4ComplMulti: cannot find the '=' separator"
+ << dendl;
+ throw rgw::io::Exception(EINVAL, std::system_category());
+ }
+
+ /* OK, we have at least the beginning of a signature. */
+ const size_t data_sep_pos = signature_part.find("\r\n");
+ if (data_sep_pos == std::string_view::npos) {
+ ldout(cct, 20) << "AWSv4ComplMulti: no new line at signature end"
+ << dendl;
+ throw rgw::io::Exception(EINVAL, std::system_category());
+ }
+
+ const auto signature = \
+ signature_part.substr(eq_sign_pos + 1, data_sep_pos - 1 - eq_sign_pos);
+ if (signature.length() != SIG_SIZE) {
+ ldout(cct, 20) << "AWSv4ComplMulti: signature.length() != 64"
+ << dendl;
+ throw rgw::io::Exception(EINVAL, std::system_category());
+ }
+
+ const size_t data_starts_in_stream = \
+ + semicolon_pos + strlen(";") + data_sep_pos + strlen("\r\n")
+ + old.data_offset_in_stream + old.data_length;
+
+ ldout(cct, 20) << "parsed new chunk; signature=" << signature
+ << ", data_length=" << data_length
+ << ", data_starts_in_stream=" << data_starts_in_stream
+ << dendl;
+
+ return std::make_pair(ChunkMeta(data_starts_in_stream,
+ data_length,
+ signature),
+ semicolon_pos + 83);
+}
+
/* Compute the expected signature of the just-finished chunk:
 * HMAC-SHA256(signing_key,
 *             "AWS4-HMAC-SHA256-PAYLOAD" \n date \n scope \n
 *             previous-chunk-signature \n hash("") \n chunk-payload-hash).
 * Each chunk is thus cryptographically chained to its predecessor. */
std::string
AWSv4ComplMulti::calc_chunk_signature(const std::string& payload_hash) const
{
  const auto string_to_sign = string_join_reserve("\n",
    AWS4_HMAC_SHA256_PAYLOAD_STR,
    date,
    credential_scope,
    prev_chunk_signature,
    AWS4_EMPTY_PAYLOAD_HASH,
    payload_hash);

  ldout(cct, 20) << "AWSv4ComplMulti: string_to_sign=\n" << string_to_sign
                 << dendl;

  /* new chunk signature */
  const auto sig = calc_hmac_sha256(signing_key, string_to_sign);
  /* FIXME(rzarzynski): std::string here is really unnecessary. */
  return sig.to_str();
}
+
+
+bool AWSv4ComplMulti::is_signature_mismatched()
+{
+ /* The validity of previous chunk can be verified only after getting meta-
+ * data of the next one. */
+ const auto payload_hash = calc_hash_sha256_restart_stream(&sha256_hash);
+ const auto calc_signature = calc_chunk_signature(payload_hash);
+
+ if (chunk_meta.get_signature() != calc_signature) {
+ ldout(cct, 20) << "AWSv4ComplMulti: ERROR: chunk signature mismatch"
+ << dendl;
+ ldout(cct, 20) << "AWSv4ComplMulti: declared signature="
+ << chunk_meta.get_signature() << dendl;
+ ldout(cct, 20) << "AWSv4ComplMulti: calculated signature="
+ << calc_signature << dendl;
+
+ return true;
+ } else {
+ prev_chunk_signature = chunk_meta.get_signature();
+ return false;
+ }
+}
+
/* Pull at most buf_max bytes of *decoded* chunk payload from the underlying
 * stream into buf, transparently consuming the aws-chunked framing
 * ("<hex-len>;chunk-signature=<sig>\r\n") and verifying each completed
 * chunk's signature. Sets eof when the transport reports end of stream.
 * Returns the number of payload bytes written to buf. */
size_t AWSv4ComplMulti::recv_chunk(char* const buf, const size_t buf_max, bool& eof)
{
  /* Buffer stores only parsed stream. Raw values reflect the stream
   * we're getting from a client. */
  size_t buf_pos = 0;

  if (chunk_meta.is_new_chunk_in_stream(stream_pos)) {
    /* Verify signature of the previous chunk. We aren't doing that for new
     * one as the procedure requires calculation of payload hash. This code
     * won't be triggered for the last, zero-length chunk. Instead, is will
     * be checked in the complete() method. */
    if (stream_pos >= ChunkMeta::META_MAX_SIZE && is_signature_mismatched()) {
      throw rgw::io::Exception(ERR_SIGNATURE_NO_MATCH, std::system_category());
    }

    /* We don't have metadata for this range. This means a new chunk, so we
     * need to parse a fresh portion of the stream. Let's start. */
    size_t to_extract = parsing_buf.capacity() - parsing_buf.size();
    do {
      /* Grow parsing_buf by the requested amount, read into the new tail,
       * then shrink back so size always equals the bytes actually held. */
      const size_t orig_size = parsing_buf.size();
      parsing_buf.resize(parsing_buf.size() + to_extract);
      const size_t received = io_base_t::recv_body(parsing_buf.data() + orig_size,
                                                   to_extract);
      parsing_buf.resize(parsing_buf.size() - (to_extract - received));
      if (received == 0) {
        eof = true;
        break;
      }

      stream_pos += received;
      to_extract -= received;
    } while (to_extract > 0);

    size_t consumed;
    std::tie(chunk_meta, consumed) = \
      ChunkMeta::create_next(cct, std::move(chunk_meta),
                             parsing_buf.data(), parsing_buf.size());

    /* We can drop the bytes consumed during metadata parsing. The remainder
     * can be chunk's data plus possibly beginning of next chunks' metadata. */
    parsing_buf.erase(std::begin(parsing_buf),
                      std::begin(parsing_buf) + consumed);
  }

  /* Position in the raw stream at which the not-yet-consumed bytes held in
   * parsing_buf begin. */
  size_t stream_pos_was = stream_pos - parsing_buf.size();

  size_t to_extract = \
    std::min(chunk_meta.get_data_size(stream_pos_was), buf_max);
  dout(30) << "AWSv4ComplMulti: stream_pos_was=" << stream_pos_was << ", to_extract=" << to_extract << dendl;

  /* It's quite probable we have a couple of real data bytes stored together
   * with meta-data in the parsing_buf. We need to extract them and move to
   * the final buffer. This is a trade-off between frontend's read overhead
   * and memcpy. */
  if (to_extract > 0 && parsing_buf.size() > 0) {
    const auto data_len = std::min(to_extract, parsing_buf.size());
    const auto data_end_iter = std::begin(parsing_buf) + data_len;
    dout(30) << "AWSv4ComplMulti: to_extract=" << to_extract << ", data_len=" << data_len << dendl;

    std::copy(std::begin(parsing_buf), data_end_iter, buf);
    parsing_buf.erase(std::begin(parsing_buf), data_end_iter);

    /* Feed the running payload hash used for this chunk's signature. */
    calc_hash_sha256_update_stream(sha256_hash, buf, data_len);

    to_extract -= data_len;
    buf_pos += data_len;
  }

  /* Now we can do the bulk read directly from RestfulClient without any extra
   * buffering. */
  while (to_extract > 0) {
    const size_t received = io_base_t::recv_body(buf + buf_pos, to_extract);
    dout(30) << "AWSv4ComplMulti: to_extract=" << to_extract << ", received=" << received << dendl;

    if (received == 0) {
      eof = true;
      break;
    }

    calc_hash_sha256_update_stream(sha256_hash, buf + buf_pos, received);

    buf_pos += received;
    stream_pos += received;
    to_extract -= received;
  }

  dout(20) << "AWSv4ComplMulti: filled=" << buf_pos << dendl;
  return buf_pos;
}
+
+size_t AWSv4ComplMulti::recv_body(char* const buf, const size_t buf_max)
+{
+ bool eof = false;
+ size_t total = 0;
+
+ while (total < buf_max && !eof) {
+ const size_t received = recv_chunk(buf + total, buf_max - total, eof);
+ total += received;
+ }
+ dout(20) << "AWSv4ComplMulti: received=" << total << dendl;
+ return total;
+}
+
/* Replace the on-the-wire (chunk-framed) length with the client-declared
 * decoded payload length and splice this completer into the request's I/O
 * stack so downstream readers see decoded payload only. Throws -EINVAL when
 * the mandatory x-amz-decoded-content-length header is absent or invalid. */
void AWSv4ComplMulti::modify_request_state(const DoutPrefixProvider* dpp, req_state* const s_rw)
{
  const char* const decoded_length = \
    s_rw->info.env->get("HTTP_X_AMZ_DECODED_CONTENT_LENGTH");

  if (!decoded_length) {
    throw -EINVAL;
  } else {
    s_rw->length = decoded_length;
    s_rw->content_length = parse_content_length(decoded_length);

    if (s_rw->content_length < 0) {
      ldpp_dout(dpp, 10) << "negative AWSv4's content length, aborting" << dendl;
      throw -EINVAL;
    }
  }

  /* Install the filter over rgw::io::RestfulClient. */
  AWS_AUTHv4_IO(s_rw)->add_filter(
    std::static_pointer_cast<io_base_t>(shared_from_this()));
}
+
+bool AWSv4ComplMulti::complete()
+{
+ /* Now it's time to verify the signature of the last, zero-length chunk. */
+ if (is_signature_mismatched()) {
+ ldout(cct, 10) << "ERROR: signature of last chunk does not match"
+ << dendl;
+ return false;
+ } else {
+ return true;
+ }
+}
+
/* Factory for the streamed-upload completer. Derives the per-scope signing
 * key up front (it is constant for the whole upload) and hands it to the
 * completer. Throws -ERR_NOT_IMPLEMENTED when no secret key is available. */
rgw::auth::Completer::cmplptr_t
AWSv4ComplMulti::create(const req_state* const s,
                        std::string_view date,
                        std::string_view credential_scope,
                        std::string_view seed_signature,
                        const boost::optional<std::string>& secret_key)
{
  if (!secret_key) {
    /* Some external authorizers (like Keystone) aren't fully compliant with
     * AWSv4. They do not provide the secret_key which is necessary to handle
     * the streamed upload. */
    throw -ERR_NOT_IMPLEMENTED;
  }

  const auto signing_key = \
    rgw::auth::s3::get_v4_signing_key(s->cct, credential_scope, *secret_key, s);

  return std::make_shared<AWSv4ComplMulti>(s,
                                           std::move(date),
                                           std::move(credential_scope),
                                           std::move(seed_signature),
                                           signing_key);
}
+
+size_t AWSv4ComplSingle::recv_body(char* const buf, const size_t max)
+{
+ const auto received = io_base_t::recv_body(buf, max);
+ calc_hash_sha256_update_stream(sha256_hash, buf, received);
+
+ return received;
+}
+
/* Splice this completer into the request's I/O stack; unlike the chunked
 * variant there is no length rewriting to do here. */
void AWSv4ComplSingle::modify_request_state(const DoutPrefixProvider* dpp, req_state* const s_rw)
{
  /* Install the filter over rgw::io::RestfulClient. */
  AWS_AUTHv4_IO(s_rw)->add_filter(
    std::static_pointer_cast<io_base_t>(shared_from_this()));
}
+
+bool AWSv4ComplSingle::complete()
+{
+ /* The completer is only for the cases where signed payload has been
+ * requested. It won't be used, for instance, during the query string-based
+ * authentication. */
+ const auto payload_hash = calc_hash_sha256_close_stream(&sha256_hash);
+
+ /* Validate x-amz-sha256 */
+ if (payload_hash.compare(expected_request_payload_hash) == 0) {
+ return true;
+ } else {
+ ldout(cct, 10) << "ERROR: x-amz-content-sha256 does not match"
+ << dendl;
+ ldout(cct, 10) << "ERROR: grab_aws4_sha256_hash()="
+ << payload_hash << dendl;
+ ldout(cct, 10) << "ERROR: expected_request_payload_hash="
+ << expected_request_payload_hash << dendl;
+ return false;
+ }
+}
+
/* Defined here (not inline in the header) because get_v4_exp_payload_hash()
 * is local to this translation unit. Captures the client-declared payload
 * hash and opens the SHA-256 stream that recv_body() keeps feeding. */
AWSv4ComplSingle::AWSv4ComplSingle(const req_state* const s)
  : io_base_t(nullptr),
    cct(s->cct),
    expected_request_payload_hash(get_v4_exp_payload_hash(s->info)),
    sha256_hash(calc_hash_sha256_open_stream()) {
}
+
/* Factory used by AWSEngine. The secret key is irrelevant for this
 * completer — it only re-hashes the payload — hence the unnamed parameter. */
rgw::auth::Completer::cmplptr_t
AWSv4ComplSingle::create(const req_state* const s,
                         const boost::optional<std::string>&)
{
  return std::make_shared<AWSv4ComplSingle>(s);
}
+
+} // namespace rgw::auth::s3
diff --git a/src/rgw/rgw_auth_s3.h b/src/rgw/rgw_auth_s3.h
new file mode 100644
index 000000000..c03dfad82
--- /dev/null
+++ b/src/rgw/rgw_auth_s3.h
@@ -0,0 +1,649 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <array>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <tuple>
+
+#include <boost/algorithm/string.hpp>
+#include <boost/container/static_vector.hpp>
+
+#include "common/sstring.hh"
+#include "rgw_common.h"
+#include "rgw_rest_s3.h"
+#include "rgw_auth.h"
+#include "rgw_auth_filters.h"
+#include "rgw_auth_keystone.h"
+
+
+namespace rgw {
+namespace auth {
+namespace s3 {
+
+static constexpr auto RGW_AUTH_GRACE = std::chrono::minutes{15};
+
+// returns true if the request time is within RGW_AUTH_GRACE of the current time
+bool is_time_skew_ok(time_t t);
+
/* Auth strategy for STS-issued (temporary) credentials. Registers the STS
 * engine when rgw_s3_auth_use_sts is enabled, and implements the three
 * applier factories the engine may call back into, depending on whether the
 * token resolves to a local user, a remote identity or an assumed role. */
class STSAuthStrategy : public rgw::auth::Strategy,
                        public rgw::auth::RemoteApplier::Factory,
                        public rgw::auth::LocalApplier::Factory,
                        public rgw::auth::RoleApplier::Factory {
  typedef rgw::auth::IdentityApplier::aplptr_t aplptr_t;
  rgw::sal::Driver* driver;
  const rgw::auth::ImplicitTenants& implicit_tenant_context;

  STSEngine sts_engine;

  /* Applier for a remotely-authenticated identity, wrapped with the
   * system-request decorator. */
  aplptr_t create_apl_remote(CephContext* const cct,
                             const req_state* const s,
                             rgw::auth::RemoteApplier::acl_strategy_t&& acl_alg,
                             const rgw::auth::RemoteApplier::AuthInfo &info) const override {
    auto apl = rgw::auth::add_sysreq(cct, driver, s,
      rgw::auth::RemoteApplier(cct, driver, std::move(acl_alg), info,
                               implicit_tenant_context,
                               rgw::auth::ImplicitTenants::IMPLICIT_TENANTS_S3));
    return aplptr_t(new decltype(apl)(std::move(apl)));
  }

  /* Applier for a locally-stored RGW user. */
  aplptr_t create_apl_local(CephContext* const cct,
                            const req_state* const s,
                            const RGWUserInfo& user_info,
                            const std::string& subuser,
                            const std::optional<uint32_t>& perm_mask,
                            const std::string& access_key_id) const override {
    auto apl = rgw::auth::add_sysreq(cct, driver, s,
      rgw::auth::LocalApplier(cct, user_info, subuser, perm_mask, access_key_id));
    return aplptr_t(new decltype(apl)(std::move(apl)));
  }

  /* Applier for an assumed IAM role carried by the STS token. */
  aplptr_t create_apl_role(CephContext* const cct,
                           const req_state* const s,
                           const rgw::auth::RoleApplier::Role& role,
                           const rgw::auth::RoleApplier::TokenAttrs& token_attrs) const override {
    auto apl = rgw::auth::add_sysreq(cct, driver, s,
      rgw::auth::RoleApplier(cct, role, token_attrs));
    return aplptr_t(new decltype(apl)(std::move(apl)));
  }

public:
  STSAuthStrategy(CephContext* const cct,
                  rgw::sal::Driver* driver,
                  const rgw::auth::ImplicitTenants& implicit_tenant_context,
                  AWSEngine::VersionAbstractor* const ver_abstractor)
    : driver(driver),
      implicit_tenant_context(implicit_tenant_context),
      sts_engine(cct, driver, *ver_abstractor,
                 static_cast<rgw::auth::LocalApplier::Factory*>(this),
                 static_cast<rgw::auth::RemoteApplier::Factory*>(this),
                 static_cast<rgw::auth::RoleApplier::Factory*>(this)) {
    /* The engine participates only when STS auth is enabled in config. */
    if (cct->_conf->rgw_s3_auth_use_sts) {
      add_engine(Control::SUFFICIENT, sts_engine);
    }
  }

  const char* get_name() const noexcept override {
    return "rgw::auth::s3::STSAuthStrategy";
  }
};
+
/* Auth strategy grouping the engines that delegate credential checking to an
 * external service: Keystone EC2 (when configured) and LDAP (when valid).
 * Both produce remote identities, hence the single RemoteApplier factory. */
class ExternalAuthStrategy : public rgw::auth::Strategy,
                             public rgw::auth::RemoteApplier::Factory {
  typedef rgw::auth::IdentityApplier::aplptr_t aplptr_t;
  rgw::sal::Driver* driver;
  const rgw::auth::ImplicitTenants& implicit_tenant_context;

  using keystone_config_t = rgw::keystone::CephCtxConfig;
  using keystone_cache_t = rgw::keystone::TokenCache;
  using secret_cache_t = rgw::auth::keystone::SecretCache;
  using EC2Engine = rgw::auth::keystone::EC2Engine;

  /* Optional: only constructed when Keystone is configured (see ctor). */
  boost::optional <EC2Engine> keystone_engine;
  LDAPEngine ldap_engine;

  aplptr_t create_apl_remote(CephContext* const cct,
                             const req_state* const s,
                             rgw::auth::RemoteApplier::acl_strategy_t&& acl_alg,
                             const rgw::auth::RemoteApplier::AuthInfo &info) const override {
    auto apl = rgw::auth::add_sysreq(cct, driver, s,
      rgw::auth::RemoteApplier(cct, driver, std::move(acl_alg), info,
                               implicit_tenant_context,
                               rgw::auth::ImplicitTenants::IMPLICIT_TENANTS_S3));
    /* TODO(rzarzynski): replace with static_ptr. */
    return aplptr_t(new decltype(apl)(std::move(apl)));
  }

public:
  ExternalAuthStrategy(CephContext* const cct,
                       rgw::sal::Driver* driver,
                       const rgw::auth::ImplicitTenants& implicit_tenant_context,
                       AWSEngine::VersionAbstractor* const ver_abstractor)
    : driver(driver),
      implicit_tenant_context(implicit_tenant_context),
      ldap_engine(cct, driver, *ver_abstractor,
                  static_cast<rgw::auth::RemoteApplier::Factory*>(this)) {

    /* Keystone requires both the feature flag and a configured endpoint. */
    if (cct->_conf->rgw_s3_auth_use_keystone &&
        ! cct->_conf->rgw_keystone_url.empty()) {

      keystone_engine.emplace(cct, ver_abstractor,
                              static_cast<rgw::auth::RemoteApplier::Factory*>(this),
                              keystone_config_t::get_instance(),
                              keystone_cache_t::get_instance<keystone_config_t>(),
                              secret_cache_t::get_instance());
      add_engine(Control::SUFFICIENT, *keystone_engine);

    }

    if (ldap_engine.valid()) {
      add_engine(Control::SUFFICIENT, ldap_engine);
    }
  }

  const char* get_name() const noexcept override {
    return "rgw::auth::s3::AWSv2ExternalAuthStrategy";
  }
};
+
+
/* Top-level S3 auth strategy. Combines the anonymous, STS, external
 * (Keystone/LDAP) and local (RADOS) engines, registering them in the order
 * given by the rgw_s3_auth_order config option. AbstractorT selects the
 * signature-version handling; AllowAnonAccessT enables the anonymous engine. */
template <class AbstractorT,
          bool AllowAnonAccessT = false>
class AWSAuthStrategy : public rgw::auth::Strategy,
                        public rgw::auth::LocalApplier::Factory {
  typedef rgw::auth::IdentityApplier::aplptr_t aplptr_t;

  static_assert(std::is_base_of<rgw::auth::s3::AWSEngine::VersionAbstractor,
                                AbstractorT>::value,
                "AbstractorT must be a subclass of rgw::auth::s3::VersionAbstractor");

  rgw::sal::Driver* driver;
  AbstractorT ver_abstractor;

  S3AnonymousEngine anonymous_engine;
  ExternalAuthStrategy external_engines;
  STSAuthStrategy sts_engine;
  LocalEngine local_engine;

  /* Applier for a locally-stored RGW user, wrapped with the system-request
   * decorator. */
  aplptr_t create_apl_local(CephContext* const cct,
                            const req_state* const s,
                            const RGWUserInfo& user_info,
                            const std::string& subuser,
                            const std::optional<uint32_t>& perm_mask,
                            const std::string& access_key_id) const override {
    auto apl = rgw::auth::add_sysreq(cct, driver, s,
      rgw::auth::LocalApplier(cct, user_info, subuser, perm_mask, access_key_id));
    /* TODO(rzarzynski): replace with static_ptr. */
    return aplptr_t(new decltype(apl)(std::move(apl)));
  }

public:
  using engine_map_t = std::map <std::string, std::reference_wrapper<const Engine>>;

  /* Register the available engines in the configured order. */
  void add_engines(const std::vector <std::string>& auth_order,
                   engine_map_t eng_map)
  {
    auto ctrl_flag = Control::SUFFICIENT;
    for (const auto &eng : auth_order) {
      // fallback to the last engine, in case of multiple engines, since ctrl
      // flag is sufficient for others, error from earlier engine is returned
      if (&eng == &auth_order.back() && eng_map.size() > 1) {
        ctrl_flag = Control::FALLBACK;
      }
      if (const auto kv = eng_map.find(eng);
          kv != eng_map.end()) {
        add_engine(ctrl_flag, kv->second);
      }
    }
  }

  /* Parse rgw_s3_auth_order into a list of engine names; any unrecognized
   * entry causes a silent fall back to the default "sts, external, local". */
  auto parse_auth_order(CephContext* const cct)
  {
    std::vector <std::string> result;

    const std::set <std::string_view> allowed_auth = { "sts", "external", "local" };
    std::vector <std::string> default_order = { "sts", "external", "local" };
    // supplied strings may contain a space, so let's bypass that
    boost::split(result, cct->_conf->rgw_s3_auth_order,
                 boost::is_any_of(", "), boost::token_compress_on);

    // NOTE(review): allowed_auth is captured by copy; capture by reference
    // would avoid copying the set — confirm intent.
    if (std::any_of(result.begin(), result.end(),
                    [allowed_auth](std::string_view s)
                    { return allowed_auth.find(s) == allowed_auth.end();})){
      return default_order;
    }
    return result;
  }

  AWSAuthStrategy(CephContext* const cct,
                  const rgw::auth::ImplicitTenants& implicit_tenant_context,
                  rgw::sal::Driver* driver)
    : driver(driver),
      ver_abstractor(cct),
      anonymous_engine(cct,
                       static_cast<rgw::auth::LocalApplier::Factory*>(this)),
      external_engines(cct, driver, implicit_tenant_context, &ver_abstractor),
      sts_engine(cct, driver, implicit_tenant_context, &ver_abstractor),
      local_engine(cct, driver, ver_abstractor,
                   static_cast<rgw::auth::LocalApplier::Factory*>(this)) {
    /* The anonymous auth. */
    if (AllowAnonAccessT) {
      add_engine(Control::SUFFICIENT, anonymous_engine);
    }

    auto auth_order = parse_auth_order(cct);
    engine_map_t engine_map;

    /* STS Auth*/
    if (! sts_engine.is_empty()) {
      engine_map.insert(std::make_pair("sts", std::cref(sts_engine)));
    }

    /* The external auth. */
    if (! external_engines.is_empty()) {
      engine_map.insert(std::make_pair("external", std::cref(external_engines)));
    }
    /* The local auth. */
    if (cct->_conf->rgw_s3_auth_use_rados) {
      engine_map.insert(std::make_pair("local", std::cref(local_engine)));
    }

    add_engines(auth_order, engine_map);
  }

  const char* get_name() const noexcept override {
    return "rgw::auth::s3::AWSAuthStrategy";
  }
};
+
+
/* Completer for STREAMING-AWS4-HMAC-SHA256-PAYLOAD (aws-chunked) uploads:
 * decorates the RestfulClient so recv_body() strips the per-chunk framing
 * while verifying every chunk's signature against the seed signature chain.
 *
 * NOTE(review): date and credential_scope are stored as string_views; the
 * factory passes views presumably referring into the long-lived req_state —
 * confirm that the referenced storage outlives this completer. */
class AWSv4ComplMulti : public rgw::auth::Completer,
                        public rgw::io::DecoratedRestfulClient<rgw::io::RestfulClient*>,
                        public std::enable_shared_from_this<AWSv4ComplMulti> {
  using io_base_t = rgw::io::DecoratedRestfulClient<rgw::io::RestfulClient*>;
  using signing_key_t = sha256_digest_t;

  CephContext* const cct;

  const std::string_view date;
  const std::string_view credential_scope;
  const signing_key_t signing_key;

  /* Parsed metadata ("<hex-len>;chunk-signature=<sig>\r\n") of one chunk of
   * the aws-chunked stream. */
  class ChunkMeta {
    size_t data_offset_in_stream = 0;
    size_t data_length = 0;
    std::string signature;

    ChunkMeta(const size_t data_starts_in_stream,
              const size_t data_length,
              const std::string_view signature)
      : data_offset_in_stream(data_starts_in_stream),
        data_length(data_length),
        signature(std::string(signature)) {
    }

    /* Seed-only constructor for the virtual "zeroth" chunk. */
    explicit ChunkMeta(const std::string_view& signature)
      : signature(std::string(signature)) {
    }

  public:
    static constexpr size_t SIG_SIZE = 64;

    /* Let's suppose the data length fields can't exceed uint64_t. */
    static constexpr size_t META_MAX_SIZE = \
      sarrlen("\r\nffffffffffffffff;chunk-signature=") + SIG_SIZE + sarrlen("\r\n");

    /* The metadata size of for the last, empty chunk. */
    static constexpr size_t META_MIN_SIZE = \
      sarrlen("0;chunk-signature=") + SIG_SIZE + sarrlen("\r\n");

    /* Detect whether a given stream_pos fits in boundaries of a chunk. */
    bool is_new_chunk_in_stream(size_t stream_pos) const;

    /* Get the remaining data size. */
    size_t get_data_size(size_t stream_pos) const;

    const std::string& get_signature() const {
      return signature;
    }

    /* Factory: create an object representing metadata of first, initial chunk
     * in a stream. */
    static ChunkMeta create_first(const std::string_view& seed_signature) {
      return ChunkMeta(seed_signature);
    }

    /* Factory: parse a block of META_MAX_SIZE bytes and creates an object
     * representing non-first chunk in a stream. As the process is sequential
     * and depends on the previous chunk, caller must pass it. */
    static std::pair<ChunkMeta, size_t> create_next(CephContext* cct,
                                                    ChunkMeta&& prev,
                                                    const char* metabuf,
                                                    size_t metabuf_len);
  } chunk_meta;

  size_t stream_pos;
  /* Stack-allocated scratch buffer: large enough for one metadata block. */
  boost::container::static_vector<char, ChunkMeta::META_MAX_SIZE> parsing_buf;
  ceph::crypto::SHA256* sha256_hash;
  std::string prev_chunk_signature;

  bool is_signature_mismatched();
  std::string calc_chunk_signature(const std::string& payload_hash) const;
  size_t recv_chunk(char* buf, size_t max, bool& eof);

public:
  /* We need the constructor to be public because of the std::make_shared that
   * is employed by the create() method. */
  AWSv4ComplMulti(const req_state* const s,
                  std::string_view date,
                  std::string_view credential_scope,
                  std::string_view seed_signature,
                  const signing_key_t& signing_key)
    : io_base_t(nullptr),
      cct(s->cct),
      /* std::move on a string_view is effectively a copy of the view. */
      date(std::move(date)),
      credential_scope(std::move(credential_scope)),
      signing_key(signing_key),

      /* The evolving state. */
      chunk_meta(ChunkMeta::create_first(seed_signature)),
      stream_pos(0),
      sha256_hash(calc_hash_sha256_open_stream()),
      prev_chunk_signature(std::move(seed_signature)) {
  }

  ~AWSv4ComplMulti() {
    /* Close the hash stream if complete() never consumed it. */
    if (sha256_hash) {
      calc_hash_sha256_close_stream(&sha256_hash);
    }
  }

  /* rgw::io::DecoratedRestfulClient. */
  size_t recv_body(char* buf, size_t max) override;

  /* rgw::auth::Completer. */
  void modify_request_state(const DoutPrefixProvider* dpp, req_state* s_rw) override;
  bool complete() override;

  /* Factories. */
  static cmplptr_t create(const req_state* s,
                          std::string_view date,
                          std::string_view credential_scope,
                          std::string_view seed_signature,
                          const boost::optional<std::string>& secret_key);

};
+
/* Completer for single-shot (non-chunked) SigV4 uploads: hashes the payload
 * as it flows through recv_body() and, in complete(), compares the digest
 * with the x-amz-content-sha256 value the client signed. */
class AWSv4ComplSingle : public rgw::auth::Completer,
                         public rgw::io::DecoratedRestfulClient<rgw::io::RestfulClient*>,
                         public std::enable_shared_from_this<AWSv4ComplSingle> {
  using io_base_t = rgw::io::DecoratedRestfulClient<rgw::io::RestfulClient*>;

  CephContext* const cct;
  const char* const expected_request_payload_hash;
  ceph::crypto::SHA256* sha256_hash = nullptr;

public:
  /* Defined in rgw_auth_s3.cc because of get_v4_exp_payload_hash(). We need
   * the constructor to be public because of the std::make_shared employed by
   * the create() method. */
  explicit AWSv4ComplSingle(const req_state* const s);

  ~AWSv4ComplSingle() {
    /* Close the hash stream if complete() never consumed it. */
    if (sha256_hash) {
      calc_hash_sha256_close_stream(&sha256_hash);
    }
  }

  /* rgw::io::DecoratedRestfulClient. */
  size_t recv_body(char* buf, size_t max) override;

  /* rgw::auth::Completer. */
  void modify_request_state(const DoutPrefixProvider* dpp, req_state* s_rw) override;
  bool complete() override;

  /* Factories. */
  static cmplptr_t create(const req_state* s,
                          const boost::optional<std::string>&);

};
+
+} /* namespace s3 */
+} /* namespace auth */
+} /* namespace rgw */
+
+void rgw_create_s3_canonical_header(
+ const DoutPrefixProvider *dpp,
+ const char *method,
+ const char *content_md5,
+ const char *content_type,
+ const char *date,
+ const meta_map_t& meta_map,
+ const meta_map_t& qs_map,
+ const char *request_uri,
+ const std::map<std::string, std::string>& sub_resources,
+ std::string& dest_str);
+bool rgw_create_s3_canonical_header(const DoutPrefixProvider *dpp,
+ const req_info& info,
+ utime_t *header_time, /* out */
+ std::string& dest, /* out */
+ bool qsr);
+static inline std::tuple<bool, std::string, utime_t>
+rgw_create_s3_canonical_header(const DoutPrefixProvider *dpp, const req_info& info, const bool qsr) {
+ std::string dest;
+ utime_t header_time;
+
+ const bool ok = rgw_create_s3_canonical_header(dpp, info, &header_time, dest, qsr);
+ return std::make_tuple(ok, dest, header_time);
+}
+
+namespace rgw {
+namespace auth {
+namespace s3 {
+
/* Algorithm tag used in the Authorization header and the string-to-sign. */
static constexpr char AWS4_HMAC_SHA256_STR[] = "AWS4-HMAC-SHA256";
/* Per-chunk string-to-sign tag for streaming (chunked) SigV4 uploads. */
static constexpr char AWS4_HMAC_SHA256_PAYLOAD_STR[] = "AWS4-HMAC-SHA256-PAYLOAD";

/* Hex SHA-256 of the empty string -- expected hash of a zero-length body. */
static constexpr char AWS4_EMPTY_PAYLOAD_HASH[] = \
  "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855";

/* Sentinel the client sends instead of a hash (presigned-URL requests). */
static constexpr char AWS4_UNSIGNED_PAYLOAD_HASH[] = "UNSIGNED-PAYLOAD";

/* Sentinel declaring a chunked payload with per-chunk signatures. */
static constexpr char AWS4_STREAMING_PAYLOAD_HASH[] = \
  "STREAMING-AWS4-HMAC-SHA256-PAYLOAD";
+
+bool is_non_s3_op(RGWOpType op_type);
+
+int parse_v4_credentials(const req_info& info, /* in */
+ std::string_view& access_key_id, /* out */
+ std::string_view& credential_scope, /* out */
+ std::string_view& signedheaders, /* out */
+ std::string_view& signature, /* out */
+ std::string_view& date, /* out */
+ std::string_view& session_token, /* out */
+ const bool using_qs, /* in */
+ const DoutPrefixProvider *dpp); /* in */
+
+string gen_v4_scope(const ceph::real_time& timestamp,
+ const string& region,
+ const string& service);
+
/* Decide whether a character must be percent-encoded under AWS SigV4 URI
 * rules: unreserved characters (alnum plus "-_.~") pass through, and '/' is
 * kept verbatim unless encode_slash is requested. */
static inline bool char_needs_aws4_escaping(const char c, bool encode_slash)
{
  const bool unreserved =
      (c >= 'a' && c <= 'z') ||
      (c >= 'A' && c <= 'Z') ||
      (c >= '0' && c <= '9') ||
      c == '-' || c == '_' || c == '.' || c == '~';

  if (unreserved) {
    return false;
  }
  if (c == '/') {
    return encode_slash;
  }
  return true;
}
+
+static inline std::string aws4_uri_encode(const std::string& src, bool encode_slash)
+{
+ std::string result;
+
+ for (const std::string::value_type c : src) {
+ if (char_needs_aws4_escaping(c, encode_slash)) {
+ rgw_uri_escape_char(c, result);
+ } else {
+ result.push_back(c);
+ }
+ }
+
+ return result;
+}
+
+static inline std::string aws4_uri_recode(const std::string_view& src, bool encode_slash)
+{
+ std::string decoded = url_decode(src);
+ return aws4_uri_encode(decoded, encode_slash);
+}
+
+static inline std::string get_v4_canonical_uri(const req_info& info) {
+ /* The code should normalize according to RFC 3986 but S3 does NOT do path
+ * normalization that SigV4 typically does. This code follows the same
+ * approach that boto library. See auth.py:canonical_uri(...). */
+
+ std::string canonical_uri = aws4_uri_recode(info.request_uri_aws4, false);
+
+ if (canonical_uri.empty()) {
+ canonical_uri = "/";
+ } else {
+ boost::replace_all(canonical_uri, "+", "%20");
+ }
+
+ return canonical_uri;
+}
+
+static inline std::string gen_v4_canonical_uri(const req_info& info) {
+ /* The code should normalize according to RFC 3986 but S3 does NOT do path
+ * normalization that SigV4 typically does. This code follows the same
+ * approach that boto library. See auth.py:canonical_uri(...). */
+
+ std::string canonical_uri = aws4_uri_recode(info.request_uri, false);
+
+ if (canonical_uri.empty()) {
+ canonical_uri = "/";
+ } else {
+ boost::replace_all(canonical_uri, "+", "%20");
+ }
+
+ return canonical_uri;
+}
+
+static inline const string calc_v4_payload_hash(const string& payload)
+{
+ ceph::crypto::SHA256* sha256_hash = calc_hash_sha256_open_stream();
+ calc_hash_sha256_update_stream(sha256_hash, payload.c_str(), payload.length());
+ const auto payload_hash = calc_hash_sha256_close_stream(&sha256_hash);
+ return payload_hash;
+}
+
+static inline const char* get_v4_exp_payload_hash(const req_info& info)
+{
+ /* In AWSv4 the hash of real, transferred payload IS NOT necessary to form
+ * a Canonical Request, and thus verify a Signature. x-amz-content-sha256
+ * header lets get the information very early -- before seeing first byte
+ * of HTTP body. As a consequence, we can decouple Signature verification
+ * from payload's fingerprint check. */
+ const char *expected_request_payload_hash = \
+ info.env->get("HTTP_X_AMZ_CONTENT_SHA256");
+
+ if (!expected_request_payload_hash) {
+ /* An HTTP client MUST send x-amz-content-sha256. The single exception
+ * is the case of using the Query Parameters where "UNSIGNED-PAYLOAD"
+ * literals are used for crafting Canonical Request:
+ *
+ * You don't include a payload hash in the Canonical Request, because
+ * when you create a presigned URL, you don't know the payload content
+ * because the URL is used to upload an arbitrary payload. Instead, you
+ * use a constant string UNSIGNED-PAYLOAD. */
+ expected_request_payload_hash = AWS4_UNSIGNED_PAYLOAD_HASH;
+ }
+
+ return expected_request_payload_hash;
+}
+
/* True when the client signed with the "UNSIGNED-PAYLOAD" sentinel (presigned
 * URLs), meaning the body fingerprint must not be verified. */
static inline bool is_v4_payload_unsigned(const char* const exp_payload_hash)
{
  return boost::equals(exp_payload_hash, AWS4_UNSIGNED_PAYLOAD_HASH);
}
+
/* True when the request carries no message body at all. */
static inline bool is_v4_payload_empty(const req_state* const s)
{
  /* from rfc2616 - 4.3 Message Body
   *
   * "The presence of a message-body in a request is signaled by the inclusion
   * of a Content-Length or Transfer-Encoding header field in the request's
   * message-headers." */
  return s->content_length == 0 &&
         s->info.env->get("HTTP_TRANSFER_ENCODING") == nullptr;
}
+
/* True when the client declared a chunked upload with per-chunk signatures
 * (STREAMING-AWS4-HMAC-SHA256-PAYLOAD). */
static inline bool is_v4_payload_streamed(const char* const exp_payload_hash)
{
  return boost::equals(exp_payload_hash, AWS4_STREAMING_PAYLOAD_HASH);
}
+
+std::string get_v4_canonical_qs(const req_info& info, bool using_qs);
+
+std::string gen_v4_canonical_qs(const req_info& info, bool is_non_s3_op);
+
+std::string get_v4_canonical_method(const req_state* s);
+
+boost::optional<std::string>
+get_v4_canonical_headers(const req_info& info,
+ const std::string_view& signedheaders,
+ bool using_qs,
+ bool force_boto2_compat);
+
+std::string gen_v4_canonical_headers(const req_info& info,
+ const std::map<std::string, std::string>& extra_headers,
+ string *signed_hdrs);
+
+extern sha256_digest_t
+get_v4_canon_req_hash(CephContext* cct,
+ const std::string_view& http_verb,
+ const std::string& canonical_uri,
+ const std::string& canonical_qs,
+ const std::string& canonical_hdrs,
+ const std::string_view& signed_hdrs,
+ const std::string_view& request_payload_hash,
+ const DoutPrefixProvider *dpp);
+
+AWSEngine::VersionAbstractor::string_to_sign_t
+get_v4_string_to_sign(CephContext* cct,
+ const std::string_view& algorithm,
+ const std::string_view& request_date,
+ const std::string_view& credential_scope,
+ const sha256_digest_t& canonreq_hash,
+ const DoutPrefixProvider *dpp);
+
+extern AWSEngine::VersionAbstractor::server_signature_t
+get_v4_signature(const std::string_view& credential_scope,
+ CephContext* const cct,
+ const std::string_view& secret_key,
+ const AWSEngine::VersionAbstractor::string_to_sign_t& string_to_sign,
+ const DoutPrefixProvider *dpp);
+
+extern AWSEngine::VersionAbstractor::server_signature_t
+get_v2_signature(CephContext*,
+ const std::string& secret_key,
+ const AWSEngine::VersionAbstractor::string_to_sign_t& string_to_sign);
+} /* namespace s3 */
+} /* namespace auth */
+} /* namespace rgw */
diff --git a/src/rgw/rgw_b64.h b/src/rgw/rgw_b64.h
new file mode 100644
index 000000000..2948f6f31
--- /dev/null
+++ b/src/rgw/rgw_b64.h
@@ -0,0 +1,84 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <boost/archive/iterators/base64_from_binary.hpp>
+#include <boost/archive/iterators/binary_from_base64.hpp>
+#include <boost/archive/iterators/insert_linebreaks.hpp>
+#include <boost/archive/iterators/transform_width.hpp>
+#include <boost/archive/iterators/remove_whitespace.hpp>
+#include <limits>
+#include <string>
+#include <string_view>
+
+namespace rgw {
+
+ /*
+ * A header-only Base64 encoder built on boost::archive. The
+ * formula is based on a class proposed for inclusion in boost in
+ * 2011 by Denis Shevchenko (abandoned), updated slightly
+ * (e.g., uses std::string_view).
+ *
+ * Also, wrap_width added as template argument, based on
+ * feedback from Marcus.
+ */
+
/// Base64-encode sview; wrap_width inserts MIME-style linebreaks (default:
/// effectively no wrapping). Returns the '='-padded encoding.
template<int wrap_width = std::numeric_limits<int>::max()>
inline std::string to_base64(std::string_view sview)
{
  using namespace boost::archive::iterators;

  // round the input length up to the next multiple of 3; the difference
  // (0..2) is the number of '=' padding characters required at the end
  auto psize = sview.size();
  while ((psize % 3) != 0) {
    ++psize;
  }

  /* RFC 2045 requires linebreaks to be present in the output
   * sequence every at-most 76 characters (MIME-compliance),
   * but we could likely omit it. */
  typedef
    insert_linebreaks<
      base64_from_binary<
        transform_width<
          std::string_view::const_iterator
          ,6,8>
      >
      ,wrap_width
    > b64_iter;

  std::string outstr(b64_iter(sview.data()),
                     b64_iter(sview.data() + sview.size()));

  // append '=' so the output corresponds to an input length that is a
  // multiple of 3 (standard base64 padding)
  for (size_t ix = 0; ix < (psize-sview.size()); ++ix)
    outstr.push_back('=');

  return outstr;
}
+
+ inline std::string from_base64(std::string_view sview)
+ {
+ using namespace boost::archive::iterators;
+ if (sview.empty())
+ return std::string();
+ /* MIME-compliant input will have line-breaks, so we have to
+ * filter WS */
+ typedef
+ transform_width<
+ binary_from_base64<
+ remove_whitespace<
+ std::string_view::const_iterator>>
+ ,8,6
+ > b64_iter;
+
+ while (sview.back() == '=')
+ sview.remove_suffix(1);
+
+ std::string outstr(b64_iter(sview.data()),
+ b64_iter(sview.data() + sview.size()));
+
+ return outstr;
+ }
+} /* namespace */
diff --git a/src/rgw/rgw_basic_types.cc b/src/rgw/rgw_basic_types.cc
new file mode 100644
index 000000000..5a09c017f
--- /dev/null
+++ b/src/rgw/rgw_basic_types.cc
@@ -0,0 +1,180 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <iostream>
+#include <sstream>
+#include <string>
+
+#include "cls/user/cls_user_types.h"
+
+#include "rgw_basic_types.h"
+#include "rgw_bucket.h"
+#include "rgw_xml.h"
+
+#include "common/ceph_json.h"
+#include "common/Formatter.h"
+#include "cls/user/cls_user_types.h"
+#include "cls/rgw/cls_rgw_types.h"
+
+using std::ostream;
+using std::string;
+using std::stringstream;
+
+using namespace std;
+
/* JSON -> rgw_user: parses the flat string form via rgw_user::from_str(). */
void decode_json_obj(rgw_user& val, JSONObj *obj)
{
  val.from_str(obj->get_data());
}

/* rgw_user -> JSON: emitted as the flat string produced by to_str(). */
void encode_json(const char *name, const rgw_user& val, Formatter *f)
{
  f->dump_string(name, val.to_str());
}

/* rgw_user -> XML: same flat string representation as the JSON encoder. */
void encode_xml(const char *name, const rgw_user& val, Formatter *f)
{
  encode_xml(name, val.to_str(), f);
}
+
/* Build an rgw_bucket from its cls_user_bucket counterpart. cls_user_bucket
 * carries no tenant, so it is taken from the owning user. */
rgw_bucket::rgw_bucket(const rgw_user& u, const cls_user_bucket& b) :
  tenant(u.tenant),
  name(b.name),
  marker(b.marker),
  bucket_id(b.bucket_id),
  explicit_placement(b.explicit_placement.data_pool,
                     b.explicit_placement.data_extra_pool,
                     b.explicit_placement.index_pool)
{
}
+
/* Inverse of the cls_user_bucket constructor above: copy this bucket's
 * fields back into the cls representation (tenant is not representable). */
void rgw_bucket::convert(cls_user_bucket *b) const
{
  b->name = name;
  b->marker = marker;
  b->bucket_id = bucket_id;
  b->explicit_placement.data_pool = explicit_placement.data_pool.to_str();
  b->explicit_placement.data_extra_pool = explicit_placement.data_extra_pool.to_str();
  b->explicit_placement.index_pool = explicit_placement.index_pool.to_str();
}
+
+std::string rgw_bucket::get_key(char tenant_delim, char id_delim, size_t reserve) const
+{
+ const size_t max_len = tenant.size() + sizeof(tenant_delim) +
+ name.size() + sizeof(id_delim) + bucket_id.size() + reserve;
+
+ std::string key;
+ key.reserve(max_len);
+ if (!tenant.empty() && tenant_delim) {
+ key.append(tenant);
+ key.append(1, tenant_delim);
+ }
+ key.append(name);
+ if (!bucket_id.empty() && id_delim) {
+ key.append(1, id_delim);
+ key.append(bucket_id);
+ }
+ return key;
+}
+
/* Produce sample instances for the encoding round-trip tests: one fully
 * populated bucket and one default-constructed bucket. Ownership of the raw
 * pointers passes to the caller, per the generate_test_instances convention. */
void rgw_bucket::generate_test_instances(list<rgw_bucket*>& o)
{
  rgw_bucket *b = new rgw_bucket;
  init_bucket(b, "tenant", "name", "pool", ".index_pool", "marker", "123");
  o.push_back(b);
  o.push_back(new rgw_bucket);
}
+
+std::string rgw_bucket_shard::get_key(char tenant_delim, char id_delim,
+ char shard_delim, size_t reserve) const
+{
+ static constexpr size_t shard_len{12}; // ":4294967295\0"
+ auto key = bucket.get_key(tenant_delim, id_delim, reserve + shard_len);
+ if (shard_id >= 0 && shard_delim) {
+ key.append(1, shard_delim);
+ key.append(std::to_string(shard_id));
+ }
+ return key;
+}
+
/* Serialize rgw_bucket_shard. The field order (bucket, then shard_id) is part
 * of the on-wire format and must match the decoder. */
void encode(const rgw_bucket_shard& b, bufferlist& bl, uint64_t f)
{
  encode(b.bucket, bl, f);
  encode(b.shard_id, bl, f);
}

/* Deserialize rgw_bucket_shard; field order must mirror the encoder. */
void decode(rgw_bucket_shard& b, bufferlist::const_iterator& bl)
{
  decode(b.bucket, bl);
  decode(b.shard_id, bl);
}
+
/* rgw_zone_id <-> JSON: the id is (de)serialized as a bare string. */
void encode_json_impl(const char *name, const rgw_zone_id& zid, Formatter *f)
{
  encode_json(name, zid.id, f);
}

void decode_json_obj(rgw_zone_id& zid, JSONObj *obj)
{
  decode_json_obj(zid.id, obj);
}
+
/* Sample instances for encoding tests: one tenanted user and one default.
 * Raw-pointer ownership passes to the caller (test harness convention). */
void rgw_user::generate_test_instances(list<rgw_user*>& o)
{
  rgw_user *u = new rgw_user("tenant", "user");

  o.push_back(u);
  o.push_back(new rgw_user);
}
+
/* JSON dump; keep the field set in sync with decode_json() below. */
void rgw_data_placement_target::dump(Formatter *f) const
{
  encode_json("data_pool", data_pool, f);
  encode_json("data_extra_pool", data_extra_pool, f);
  encode_json("index_pool", index_pool, f);
}

void rgw_data_placement_target::decode_json(JSONObj *obj) {
  JSONDecoder::decode_json("data_pool", data_pool, obj);
  JSONDecoder::decode_json("data_extra_pool", data_extra_pool, obj);
  JSONDecoder::decode_json("index_pool", index_pool, obj);
}
+
/* JSON dump of rgw_bucket; keep in sync with decode_json() below. */
void rgw_bucket::dump(Formatter *f) const
{
  encode_json("name", name, f);
  encode_json("marker", marker, f);
  encode_json("bucket_id", bucket_id, f);
  encode_json("tenant", tenant, f);
  encode_json("explicit_placement", explicit_placement, f);
}

void rgw_bucket::decode_json(JSONObj *obj) {
  JSONDecoder::decode_json("name", name, obj);
  JSONDecoder::decode_json("marker", marker, obj);
  JSONDecoder::decode_json("bucket_id", bucket_id, obj);
  JSONDecoder::decode_json("tenant", tenant, obj);
  JSONDecoder::decode_json("explicit_placement", explicit_placement, obj);
  if (explicit_placement.data_pool.empty()) {
    /* decoding old format: placement pools used to live at the top level
     * instead of under "explicit_placement" */
    JSONDecoder::decode_json("pool", explicit_placement.data_pool, obj);
    JSONDecoder::decode_json("data_extra_pool", explicit_placement.data_extra_pool, obj);
    JSONDecoder::decode_json("index_pool", explicit_placement.index_pool, obj);
  }
}
+
namespace rgw {
namespace auth {
/* Render a Principal in ARN-like form for logs and policy handling.
 * NOTE(review): "arn:aws:iam:" with a single trailing colon differs from
 * AWS's "arn:aws:iam::" (empty region slot). Presumably deliberate/legacy
 * here -- confirm against policy-matching code before changing. */
ostream& operator <<(ostream& m, const Principal& p) {
  if (p.is_wildcard()) {
    return m << "*";
  }

  m << "arn:aws:iam:" << p.get_tenant() << ":";
  if (p.is_tenant()) {
    return m << "root";
  }
  return m << (p.is_user() ? "user/" : "role/") << p.get_id();
}
}
}
diff --git a/src/rgw/rgw_basic_types.h b/src/rgw/rgw_basic_types.h
new file mode 100644
index 000000000..25d70bdbf
--- /dev/null
+++ b/src/rgw/rgw_basic_types.h
@@ -0,0 +1,291 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+/* N.B., this header defines fundamental serialized types. Do not
+ * introduce changes or include files which can only be compiled in
+ * radosgw or OSD contexts (e.g., rgw_sal.h, rgw_common.h)
+ */
+
+#pragma once
+
+#include <string>
+#include <fmt/format.h>
+
+#include "include/types.h"
+#include "rgw_compression_types.h"
+#include "rgw_pool_types.h"
+#include "rgw_acl_types.h"
+#include "rgw_zone_types.h"
+#include "rgw_user_types.h"
+#include "rgw_bucket_types.h"
+#include "rgw_obj_types.h"
+#include "rgw_obj_manifest.h"
+
+#include "common/Formatter.h"
+
+class JSONObj;
+class cls_user_bucket;
+
+enum RGWIntentEvent {
+ DEL_OBJ = 0,
+ DEL_DIR = 1,
+};
+
+/** Store error returns for output at a different point in the program */
/** Store error returns for output at a different point in the program */
struct rgw_err {
  rgw_err();
  void clear();
  bool is_clear() const;
  bool is_err() const;
  friend std::ostream& operator<<(std::ostream& oss, const rgw_err &err);

  int http_ret;            // HTTP status code to send to the client
  int ret;                 // internal (errno-style) return code
  std::string err_code;    // presumably the S3/Swift error code token -- see users for exact semantics
  std::string message;     // human-readable detail accompanying the error
}; /* rgw_err */
+
/* Thin, strongly-typed wrapper around a zone id string. Encodes as a bare
 * string (no versioning envelope) for backward compatibility. */
struct rgw_zone_id {
  std::string id;

  rgw_zone_id() {}
  rgw_zone_id(const std::string& _id) : id(_id) {}
  rgw_zone_id(std::string&& _id) : id(std::move(_id)) {}

  void encode(ceph::buffer::list& bl) const {
    /* backward compatibility, not using ENCODE_{START,END} macros */
    ceph::encode(id, bl);
  }

  void decode(ceph::buffer::list::const_iterator& bl) {
    /* backward compatibility, not using DECODE_{START,END} macros */
    ceph::decode(id, bl);
  }

  void clear() {
    id.clear();
  }

  /* Comparisons simply delegate to the underlying string. */
  bool operator==(const std::string& _id) const {
    return (id == _id);
  }
  bool operator==(const rgw_zone_id& zid) const {
    return (id == zid.id);
  }
  bool operator!=(const rgw_zone_id& zid) const {
    return (id != zid.id);
  }
  bool operator<(const rgw_zone_id& zid) const {
    return (id < zid.id);
  }
  bool operator>(const rgw_zone_id& zid) const {
    return (id > zid.id);
  }

  bool empty() const {
    return id.empty();
  }
};
WRITE_CLASS_ENCODER(rgw_zone_id)
+
+inline std::ostream& operator<<(std::ostream& os, const rgw_zone_id& zid) {
+ os << zid.id;
+ return os;
+}
+
+struct obj_version;
+struct rgw_placement_rule;
+struct RGWAccessKey;
+class RGWUserCaps;
+
+extern void encode_json(const char *name, const obj_version& v, Formatter *f);
+extern void encode_json(const char *name, const RGWUserCaps& val, Formatter *f);
+extern void encode_json(const char *name, const rgw_pool& pool, Formatter *f);
+extern void encode_json(const char *name, const rgw_placement_rule& r, Formatter *f);
+extern void encode_json_impl(const char *name, const rgw_zone_id& zid, ceph::Formatter *f);
+extern void encode_json_plain(const char *name, const RGWAccessKey& val, Formatter *f);
+
+extern void decode_json_obj(obj_version& v, JSONObj *obj);
+extern void decode_json_obj(rgw_zone_id& zid, JSONObj *obj);
+extern void decode_json_obj(rgw_pool& pool, JSONObj *obj);
+extern void decode_json_obj(rgw_placement_rule& v, JSONObj *obj);
+
+// Represents an identity. This is more wide-ranging than a
+// 'User'. Its purposes is to be matched against by an
+// IdentityApplier. The internal representation will doubtless change as
+// more types are added. We may want to expose the type enum and make
+// the member public so people can switch/case on it.
+
+namespace rgw {
+namespace auth {
class Principal {
  /* Discriminator for the kind of identity this Principal names. */
  enum types { User, Role, Tenant, Wildcard, OidcProvider, AssumedRole };
  types t;
  /* (tenant, id) pair; id doubles as role name / role-session for the
   * role-flavoured types (see accessors below). */
  rgw_user u;
  /* Only meaningful for OidcProvider principals. */
  std::string idp_url;

  /* Constructors are private; use the named factories below. */
  explicit Principal(types t)
    : t(t) {}

  Principal(types t, std::string&& n, std::string i)
    : t(t), u(std::move(n), std::move(i)) {}

  Principal(std::string&& idp_url)
    : t(OidcProvider), idp_url(std::move(idp_url)) {}

public:

  static Principal wildcard() {
    return Principal(Wildcard);
  }

  static Principal user(std::string&& t, std::string&& u) {
    return Principal(User, std::move(t), std::move(u));
  }

  static Principal role(std::string&& t, std::string&& u) {
    return Principal(Role, std::move(t), std::move(u));
  }

  static Principal tenant(std::string&& t) {
    return Principal(Tenant, std::move(t), {});
  }

  static Principal oidc_provider(std::string&& idp_url) {
    return Principal(std::move(idp_url));
  }

  static Principal assumed_role(std::string&& t, std::string&& u) {
    return Principal(AssumedRole, std::move(t), std::move(u));
  }

  bool is_wildcard() const {
    return t == Wildcard;
  }

  bool is_user() const {
    return t == User;
  }

  bool is_role() const {
    return t == Role;
  }

  bool is_tenant() const {
    return t == Tenant;
  }

  bool is_oidc_provider() const {
    return t == OidcProvider;
  }

  bool is_assumed_role() const {
    return t == AssumedRole;
  }

  const std::string& get_tenant() const {
    return u.tenant;
  }

  const std::string& get_id() const {
    return u.id;
  }

  const std::string& get_idp_url() const {
    return idp_url;
  }

  /* For AssumedRole/Role principals the id slot stores the session / role
   * name, hence these aliases of get_id(). */
  const std::string& get_role_session() const {
    return u.id;
  }

  const std::string& get_role() const {
    return u.id;
  }

  /* Equality/ordering compare the type tag first, then the (tenant, id)
   * pair; idp_url is intentionally not part of the comparison. */
  bool operator ==(const Principal& o) const {
    return (t == o.t) && (u == o.u);
  }

  bool operator <(const Principal& o) const {
    return (t < o.t) || ((t == o.t) && (u < o.u));
  }
};
+
+std::ostream& operator <<(std::ostream& m, const Principal& p);
+}
+}
+
+class JSONObj;
+
+void decode_json_obj(rgw_user& val, JSONObj *obj);
+void encode_json(const char *name, const rgw_user& val, ceph::Formatter *f);
+void encode_xml(const char *name, const rgw_user& val, ceph::Formatter *f);
+
/* Stream an rgw_user via its flat to_str(s) string form. */
inline std::ostream& operator<<(std::ostream& out, const rgw_user &u) {
  std::string s;
  u.to_str(s);
  return out << s;
}
+
/* Per-part metadata for a multipart upload. Encoded at version 5 with compat
 * back to version 2; later fields are gated on struct_v in decode(). */
struct RGWUploadPartInfo {
  uint32_t num;                  // part number
  uint64_t size;                 // stored (possibly compressed) size
  uint64_t accounted_size{0};    // logical size charged to the user (v4+)
  std::string etag;
  ceph::real_time modified;
  RGWObjManifest manifest;       // v3+
  RGWCompressionInfo cs_info;    // v4+

  // Previous part obj prefixes. Recorded here for later cleanup.
  std::set<std::string> past_prefixes;   // v5+

  RGWUploadPartInfo() : num(0), size(0) {}

  void encode(bufferlist& bl) const {
    /* Field order below is the wire format -- never reorder. */
    ENCODE_START(5, 2, bl);
    encode(num, bl);
    encode(size, bl);
    encode(etag, bl);
    encode(modified, bl);
    encode(manifest, bl);
    encode(cs_info, bl);
    encode(accounted_size, bl);
    encode(past_prefixes, bl);
    ENCODE_FINISH(bl);
  }
  void decode(bufferlist::const_iterator& bl) {
    DECODE_START_LEGACY_COMPAT_LEN(5, 2, 2, bl);
    decode(num, bl);
    decode(size, bl);
    decode(etag, bl);
    decode(modified, bl);
    if (struct_v >= 3)
      decode(manifest, bl);
    if (struct_v >= 4) {
      decode(cs_info, bl);
      decode(accounted_size, bl);
    } else {
      /* pre-compression format: logical and stored size coincide */
      accounted_size = size;
    }
    if (struct_v >= 5) {
      decode(past_prefixes, bl);
    }
    DECODE_FINISH(bl);
  }
  void dump(Formatter *f) const;
  static void generate_test_instances(std::list<RGWUploadPartInfo*>& o);
};
WRITE_CLASS_ENCODER(RGWUploadPartInfo)
diff --git a/src/rgw/rgw_bucket.cc b/src/rgw/rgw_bucket.cc
new file mode 100644
index 000000000..852469b7e
--- /dev/null
+++ b/src/rgw/rgw_bucket.cc
@@ -0,0 +1,186 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_bucket.h"
+
+#include "common/errno.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+// stolen from src/cls/version/cls_version.cc
+#define VERSION_ATTR "ceph.objclass.version"
+
+using namespace std;
+
/* Store msg into *sink if the caller supplied one; empty messages and a null
 * sink are silently ignored. */
static void set_err_msg(std::string *sink, std::string msg)
{
  if (!sink || msg.empty()) {
    return;
  }
  *sink = std::move(msg);
}
+
/* Populate all fields of *b from C strings (used mainly by tests). Note that
 * data_extra_pool is intentionally(?) left untouched -- only the data and
 * index pools are set; confirm if an extra-pool argument is ever needed. */
void init_bucket(rgw_bucket *b, const char *t, const char *n, const char *dp, const char *ip, const char *m, const char *id)
{
  b->tenant = t;
  b->name = n;
  b->marker = m;
  b->bucket_id = id;
  b->explicit_placement.data_pool = rgw_pool(dp);
  b->explicit_placement.index_pool = rgw_pool(ip);
}
+
// parse key in format: [tenant/]name:instance[:shard_id]
// Fills *bucket (tenant, name, bucket_id) and, if requested, *shard_id
// (-1 when no shard suffix is present). Returns 0 or -EINVAL on a
// malformed shard id. cct may be null (no logging).
int rgw_bucket_parse_bucket_key(CephContext *cct, const string& key,
                                rgw_bucket *bucket, int *shard_id)
{
  std::string_view name{key};
  std::string_view instance;

  // split tenant/name
  auto pos = name.find('/');
  if (pos != string::npos) {
    auto tenant = name.substr(0, pos);
    bucket->tenant.assign(tenant.begin(), tenant.end());
    name = name.substr(pos + 1);
  } else {
    bucket->tenant.clear();
  }

  // split name:instance
  pos = name.find(':');
  if (pos != string::npos) {
    instance = name.substr(pos + 1);
    name = name.substr(0, pos);
  }
  bucket->name.assign(name.begin(), name.end());

  // split instance:shard
  pos = instance.find(':');
  if (pos == string::npos) {
    bucket->bucket_id.assign(instance.begin(), instance.end());
    if (shard_id) {
      *shard_id = -1;
    }
    return 0;
  }

  // parse shard id
  auto shard = instance.substr(pos + 1);
  string err;
  auto id = strict_strtol(shard.data(), 10, &err);
  if (!err.empty()) {
    if (cct) {
      // NOTE(review): instance.data() points into the original key buffer,
      // so this logs the remainder of the key, not just the instance --
      // presumably acceptable for a log line.
      ldout(cct, 0) << "ERROR: failed to parse bucket shard '"
          << instance.data() << "': " << err << dendl;
    }
    return -EINVAL;
  }

  if (shard_id) {
    *shard_id = id;
  }
  instance = instance.substr(0, pos);
  bucket->bucket_id.assign(instance.begin(), instance.end());
  return 0;
}
+
+/*
+ * Note that this is not a reversal of parse_bucket(). That one deals
+ * with the syntax we need in metadata and such. This one deals with
+ * the representation in RADOS pools. We chose '/' because it's not
+ * acceptable in bucket names and thus qualified buckets cannot conflict
+ * with the legacy or S3 buckets.
+ */
/* Build the "[tenant/]bucket" entry name used in RADOS pools ('/' is illegal
 * in bucket names, so qualified entries can never collide with legacy ones).
 * An empty bucket name yields an empty entry. */
std::string rgw_make_bucket_entry_name(const std::string& tenant_name,
                                       const std::string& bucket_name) {
  if (bucket_name.empty()) {
    return std::string();
  }
  if (tenant_name.empty()) {
    return bucket_name;
  }
  return tenant_name + "/" + bucket_name;
}
+
+/*
+ * Tenants are separated from buckets in URLs by a colon in S3.
+ * This function is not to be used on Swift URLs, not even for COPY arguments.
+ */
+int rgw_parse_url_bucket(const string &bucket, const string& auth_tenant,
+ string &tenant_name, string &bucket_name) {
+
+ int pos = bucket.find(':');
+ if (pos >= 0) {
+ /*
+ * N.B.: We allow ":bucket" syntax with explicit empty tenant in order
+ * to refer to the legacy tenant, in case users in new named tenants
+ * want to access old global buckets.
+ */
+ tenant_name = bucket.substr(0, pos);
+ bucket_name = bucket.substr(pos + 1);
+ if (bucket_name.empty()) {
+ return -ERR_INVALID_BUCKET_NAME;
+ }
+ } else {
+ tenant_name = auth_tenant;
+ bucket_name = bucket;
+ }
+ return 0;
+}
+
/* Change ownership of a bucket and then of every object (all versions) in it,
 * resuming from `marker`. Returns the last listing/chown error, or 0. */
int rgw_chown_bucket_and_objects(rgw::sal::Driver* driver, rgw::sal::Bucket* bucket,
                                 rgw::sal::User* new_user,
                                 const std::string& marker, std::string *err_msg,
                                 const DoutPrefixProvider *dpp, optional_yield y)
{
  /* Chown on the bucket */
  int ret = bucket->chown(dpp, *new_user, y);
  if (ret < 0) {
    /* NOTE(review): a bucket-level failure is reported via err_msg but
     * processing continues, and `ret` is overwritten by the loop below, so
     * the error code can be swallowed. Presumably deliberate best-effort
     * behaviour -- confirm. */
    set_err_msg(err_msg, "Failed to change object ownership: " + cpp_strerror(-ret));
  }

  /* Now chown on all the objects in the bucket */
  map<string, bool> common_prefixes;

  rgw::sal::Bucket::ListParams params;
  rgw::sal::Bucket::ListResults results;

  params.list_versions = true;
  params.allow_unordered = true;   // no ordering needed; cheaper listing
  params.marker = marker;

  int count = 0;
  int max_entries = 1000;          // listing page size

  //Loop through objects and update object acls to point to bucket owner

  do {
    results.objs.clear();
    ret = bucket->list(dpp, params, max_entries, results, y);
    if (ret < 0) {
      ldpp_dout(dpp, 0) << "ERROR: list objects failed: " << cpp_strerror(-ret) << dendl;
      return ret;
    }

    /* advance the pagination cursor for the next iteration */
    params.marker = results.next_marker;
    count += results.objs.size();

    for (const auto& obj : results.objs) {
      std::unique_ptr<rgw::sal::Object> r_obj = bucket->get_object(obj.key);

      ret = r_obj->chown(*new_user, dpp, y);
      if (ret < 0) {
        ldpp_dout(dpp, 0) << "ERROR: chown failed on " << r_obj << " :" << cpp_strerror(-ret) << dendl;
        return ret;
      }
    }
    /* progress report for the (admin CLI) caller.
     * NOTE(review): `bucket` is a pointer -- unless an operator<< overload
     * for Bucket* exists elsewhere, this prints an address; verify. */
    cerr << count << " objects processed in " << bucket
        << ". Next marker " << params.marker.name << std::endl;
  } while(results.is_truncated);

  return ret;
}
+
diff --git a/src/rgw/rgw_bucket.h b/src/rgw/rgw_bucket.h
new file mode 100644
index 000000000..e62b46898
--- /dev/null
+++ b/src/rgw/rgw_bucket.h
@@ -0,0 +1,36 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <string>
+#include <memory>
+#include <variant>
+
+#include <boost/container/flat_map.hpp>
+#include <boost/container/flat_set.hpp>
+
+#include "include/types.h"
+#include "rgw_common.h"
+#include "rgw_sal.h"
+
+extern void init_bucket(rgw_bucket *b, const char *t, const char *n, const char *dp, const char *ip, const char *m, const char *id);
+
+extern int rgw_bucket_parse_bucket_key(CephContext *cct, const std::string& key,
+ rgw_bucket* bucket, int *shard_id);
+
+extern std::string rgw_make_bucket_entry_name(const std::string& tenant_name,
+ const std::string& bucket_name);
+
+[[nodiscard]] int rgw_parse_url_bucket(const std::string& bucket,
+ const std::string& auth_tenant,
+ std::string &tenant_name,
+ std::string &bucket_name);
+
+extern int rgw_chown_bucket_and_objects(rgw::sal::Driver* driver,
+ rgw::sal::Bucket* bucket,
+ rgw::sal::User* new_user,
+ const std::string& marker,
+ std::string *err_msg,
+ const DoutPrefixProvider *dpp,
+ optional_yield y);
diff --git a/src/rgw/rgw_bucket_encryption.cc b/src/rgw/rgw_bucket_encryption.cc
new file mode 100644
index 000000000..f029709db
--- /dev/null
+++ b/src/rgw/rgw_bucket_encryption.cc
@@ -0,0 +1,49 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+//
+#include "rgw_bucket_encryption.h"
+#include "rgw_xml.h"
+#include "common/ceph_json.h"
+
/* Parse the AWS <ApplyServerSideEncryptionByDefault> element; both children
 * are optional (last argument false = not mandatory). */
void ApplyServerSideEncryptionByDefault::decode_xml(XMLObj *obj) {
  RGWXMLDecoder::decode_xml("KMSMasterKeyID", kmsMasterKeyID, obj, false);
  RGWXMLDecoder::decode_xml("SSEAlgorithm", sseAlgorithm, obj, false);
}
+
+void ApplyServerSideEncryptionByDefault::dump_xml(Formatter *f) const {
+ encode_xml("SSEAlgorithm", sseAlgorithm, f);
+ if (kmsMasterKeyID != "") {
+ encode_xml("KMSMasterKeyID", kmsMasterKeyID, f);
+ }
+}
+
/* Parse a <Rule> body: the default-encryption element and the optional
 * BucketKeyEnabled flag (both non-mandatory). */
void ServerSideEncryptionConfiguration::decode_xml(XMLObj *obj) {
  RGWXMLDecoder::decode_xml("ApplyServerSideEncryptionByDefault", applyServerSideEncryptionByDefault, obj, false);
  RGWXMLDecoder::decode_xml("BucketKeyEnabled", bucketKeyEnabled, obj, false);
}

/* Emit the rule; BucketKeyEnabled is only written when set (true). */
void ServerSideEncryptionConfiguration::dump_xml(Formatter *f) const {
  encode_xml("ApplyServerSideEncryptionByDefault", applyServerSideEncryptionByDefault, f);
  if (bucketKeyEnabled) {
    encode_xml("BucketKeyEnabled", true, f);
  }
}
+
/* Parse the top-level configuration; remember whether a <Rule> was present
 * so dump_xml()/dump() can reproduce an empty configuration faithfully. */
void RGWBucketEncryptionConfig::decode_xml(XMLObj *obj) {
  rule_exist = RGWXMLDecoder::decode_xml("Rule", rule, obj);
}

void RGWBucketEncryptionConfig::dump_xml(Formatter *f) const {
  if (rule_exist) {
    encode_xml("Rule", rule, f);
  }
}

/* JSON dump (admin/debug output); rule details only when a rule exists. */
void RGWBucketEncryptionConfig::dump(Formatter *f) const {
  encode_json("rule_exist", has_rule(), f);
  if (has_rule()) {
    encode_json("sse_algorithm", sse_algorithm(), f);
    encode_json("kms_master_key_id", kms_master_key_id(), f);
    encode_json("bucket_key_enabled", bucket_key_enabled(), f);
  }
}
diff --git a/src/rgw/rgw_bucket_encryption.h b/src/rgw/rgw_bucket_encryption.h
new file mode 100644
index 000000000..ba567bc71
--- /dev/null
+++ b/src/rgw/rgw_bucket_encryption.h
@@ -0,0 +1,142 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+#include <include/types.h>
+
+class XMLObj;
+
+// Default-encryption settings of a bucket-encryption rule: the SSE
+// algorithm (e.g. AES256 or aws:kms) and, for KMS, the master key id.
+class ApplyServerSideEncryptionByDefault
+{
+  std::string kmsMasterKeyID;
+  std::string sseAlgorithm;
+
+public:
+  ApplyServerSideEncryptionByDefault() {};
+  ApplyServerSideEncryptionByDefault(const std::string &algorithm,
+                                     const std::string &key_id)
+                                     : kmsMasterKeyID(key_id), sseAlgorithm(algorithm) {};
+
+  // empty unless a KMS key was configured
+  const std::string& kms_master_key_id() const {
+    return kmsMasterKeyID;
+  }
+
+  const std::string& sse_algorithm() const {
+    return sseAlgorithm;
+  }
+
+  // wire format v1: key id then algorithm — do not reorder
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(kmsMasterKeyID, bl);
+    encode(sseAlgorithm, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(kmsMasterKeyID, bl);
+    decode(sseAlgorithm, bl);
+    DECODE_FINISH(bl);
+  }
+
+  void decode_xml(XMLObj *obj);
+  void dump_xml(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(ApplyServerSideEncryptionByDefault)
+
+// One encryption rule: the default-encryption settings plus the
+// BucketKeyEnabled flag (S3 bucket-key optimization for KMS).
+class ServerSideEncryptionConfiguration
+{
+protected:
+  ApplyServerSideEncryptionByDefault applyServerSideEncryptionByDefault;
+  bool bucketKeyEnabled;
+
+public:
+  ServerSideEncryptionConfiguration(): bucketKeyEnabled(false) {};
+  ServerSideEncryptionConfiguration(const std::string &algorithm,
+                                    const std::string &keyid="", bool enabled = false)
+                                    : applyServerSideEncryptionByDefault(algorithm, keyid),
+                                      bucketKeyEnabled(enabled) {}
+
+  // convenience accessors forwarding to the nested default-encryption object
+  const std::string& kms_master_key_id() const {
+    return applyServerSideEncryptionByDefault.kms_master_key_id();
+  }
+
+  const std::string& sse_algorithm() const {
+    return applyServerSideEncryptionByDefault.sse_algorithm();
+  }
+
+  bool bucket_key_enabled() const {
+    return bucketKeyEnabled;
+  }
+
+  // wire format v1: nested default settings, then the flag
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(applyServerSideEncryptionByDefault, bl);
+    encode(bucketKeyEnabled, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(applyServerSideEncryptionByDefault, bl);
+    decode(bucketKeyEnabled, bl);
+    DECODE_FINISH(bl);
+  }
+
+  void decode_xml(XMLObj *obj);
+  void dump_xml(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(ServerSideEncryptionConfiguration)
+
+// Per-bucket encryption configuration: at most one rule, with a flag
+// recording whether a rule is present at all.
+class RGWBucketEncryptionConfig
+{
+protected:
+  bool rule_exist;
+  ServerSideEncryptionConfiguration rule;
+
+public:
+  RGWBucketEncryptionConfig(): rule_exist(false) {}
+  RGWBucketEncryptionConfig(const std::string &algorithm,
+                            const std::string &keyid = "", bool enabled = false)
+                            : rule_exist(true), rule(algorithm, keyid, enabled) {}
+
+  // accessors return the rule's values even when no rule exists (then they
+  // yield the rule's defaults); callers should check has_rule() first
+  const std::string& kms_master_key_id() const {
+    return rule.kms_master_key_id();
+  }
+
+  const std::string& sse_algorithm() const {
+    return rule.sse_algorithm();
+  }
+
+  bool bucket_key_enabled() const {
+    return rule.bucket_key_enabled();
+  }
+
+  bool has_rule() const {
+    return rule_exist;
+  }
+
+  // wire format v1: the rule is only encoded when it exists, so the flag
+  // must be read before conditionally decoding the rule
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(rule_exist, bl);
+    if (rule_exist) {
+      encode(rule, bl);
+    }
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(rule_exist, bl);
+    if (rule_exist) {
+      decode(rule, bl);
+    }
+    DECODE_FINISH(bl);
+  }
+
+  void decode_xml(XMLObj *obj);
+  void dump_xml(Formatter *f) const;
+  void dump(Formatter *f) const;
+  static void generate_test_instances(std::list<RGWBucketEncryptionConfig*>& o);
+};
+WRITE_CLASS_ENCODER(RGWBucketEncryptionConfig)
diff --git a/src/rgw/rgw_bucket_layout.cc b/src/rgw/rgw_bucket_layout.cc
new file mode 100644
index 000000000..499e8f0cd
--- /dev/null
+++ b/src/rgw/rgw_bucket_layout.cc
@@ -0,0 +1,380 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <boost/algorithm/string.hpp>
+#include "rgw_bucket_layout.h"
+
+namespace rgw {
+
+// BucketIndexType
+// Human-readable name for a BucketIndexType value.
+std::string_view to_string(const BucketIndexType& t)
+{
+  if (t == BucketIndexType::Normal) {
+    return "Normal";
+  }
+  if (t == BucketIndexType::Indexless) {
+    return "Indexless";
+  }
+  return "Unknown";
+}
+// Case-insensitive parse; returns false (leaving t untouched) on no match.
+bool parse(std::string_view str, BucketIndexType& t)
+{
+  if (boost::iequals(str, "Indexless")) {
+    t = BucketIndexType::Indexless;
+    return true;
+  } else if (boost::iequals(str, "Normal")) {
+    t = BucketIndexType::Normal;
+    return true;
+  }
+  return false;
+}
+// JSON encodes the enum by its string name.
+void encode_json_impl(const char *name, const BucketIndexType& t, ceph::Formatter *f)
+{
+  const auto text = to_string(t);
+  encode_json(name, text, f);
+}
+void decode_json_obj(BucketIndexType& t, JSONObj *obj)
+{
+  std::string text;
+  decode_json_obj(text, obj);
+  parse(text, t);  // unparsable input leaves t unchanged
+}
+
+// BucketHashType
+// Human-readable name for a BucketHashType value.
+std::string_view to_string(const BucketHashType& t)
+{
+  if (t == BucketHashType::Mod) {
+    return "Mod";
+  }
+  return "Unknown";
+}
+// Case-insensitive parse; returns false (leaving t untouched) on no match.
+bool parse(std::string_view str, BucketHashType& t)
+{
+  const bool matched = boost::iequals(str, "Mod");
+  if (matched) {
+    t = BucketHashType::Mod;
+  }
+  return matched;
+}
+// JSON encodes the enum by its string name.
+void encode_json_impl(const char *name, const BucketHashType& t, ceph::Formatter *f)
+{
+  const auto text = to_string(t);
+  encode_json(name, text, f);
+}
+void decode_json_obj(BucketHashType& t, JSONObj *obj)
+{
+  std::string text;
+  decode_json_obj(text, obj);
+  parse(text, t);  // unparsable input leaves t unchanged
+}
+
+// bucket_index_normal_layout
+// wire format v1: shard count then hash type — order is a compat contract
+void encode(const bucket_index_normal_layout& l, bufferlist& bl, uint64_t f)
+{
+  ENCODE_START(1, 1, bl);
+  encode(l.num_shards, bl);
+  encode(l.hash_type, bl);
+  ENCODE_FINISH(bl);
+}
+void decode(bucket_index_normal_layout& l, bufferlist::const_iterator& bl)
+{
+  DECODE_START(1, bl);
+  decode(l.num_shards, bl);
+  decode(l.hash_type, bl);
+  DECODE_FINISH(bl);
+}
+// JSON: nested object with num_shards and hash_type
+void encode_json_impl(const char *name, const bucket_index_normal_layout& l, ceph::Formatter *f)
+{
+  f->open_object_section(name);
+  encode_json("num_shards", l.num_shards, f);
+  encode_json("hash_type", l.hash_type, f);
+  f->close_section();
+}
+void decode_json_obj(bucket_index_normal_layout& l, JSONObj *obj)
+{
+  JSONDecoder::decode_json("num_shards", l.num_shards, obj);
+  JSONDecoder::decode_json("hash_type", l.hash_type, obj);
+}
+
+// bucket_index_layout
+// wire format v1: the type tag, then type-specific payload (Normal only;
+// Indexless has no payload)
+void encode(const bucket_index_layout& l, bufferlist& bl, uint64_t f)
+{
+  ENCODE_START(1, 1, bl);
+  encode(l.type, bl);
+  switch (l.type) {
+  case BucketIndexType::Normal:
+    encode(l.normal, bl);
+    break;
+  case BucketIndexType::Indexless:
+    break;
+  }
+  ENCODE_FINISH(bl);
+}
+void decode(bucket_index_layout& l, bufferlist::const_iterator& bl)
+{
+  DECODE_START(1, bl);
+  decode(l.type, bl);
+  switch (l.type) {
+  case BucketIndexType::Normal:
+    decode(l.normal, bl);
+    break;
+  case BucketIndexType::Indexless:
+    break;
+  }
+  DECODE_FINISH(bl);
+}
+// JSON: "normal" is emitted unconditionally, even for Indexless layouts
+void encode_json_impl(const char *name, const bucket_index_layout& l, ceph::Formatter *f)
+{
+  f->open_object_section(name);
+  encode_json("type", l.type, f);
+  encode_json("normal", l.normal, f);
+  f->close_section();
+}
+void decode_json_obj(bucket_index_layout& l, JSONObj *obj)
+{
+  JSONDecoder::decode_json("type", l.type, obj);
+  JSONDecoder::decode_json("normal", l.normal, obj);
+}
+
+// bucket_index_layout_generation
+// wire format v1: generation number then the layout it describes
+void encode(const bucket_index_layout_generation& l, bufferlist& bl, uint64_t f)
+{
+  ENCODE_START(1, 1, bl);
+  encode(l.gen, bl);
+  encode(l.layout, bl);
+  ENCODE_FINISH(bl);
+}
+void decode(bucket_index_layout_generation& l, bufferlist::const_iterator& bl)
+{
+  DECODE_START(1, bl);
+  decode(l.gen, bl);
+  decode(l.layout, bl);
+  DECODE_FINISH(bl);
+}
+// JSON: nested object { gen, layout }
+void encode_json_impl(const char *name, const bucket_index_layout_generation& l, ceph::Formatter *f)
+{
+  f->open_object_section(name);
+  encode_json("gen", l.gen, f);
+  encode_json("layout", l.layout, f);
+  f->close_section();
+}
+void decode_json_obj(bucket_index_layout_generation& l, JSONObj *obj)
+{
+  JSONDecoder::decode_json("gen", l.gen, obj);
+  JSONDecoder::decode_json("layout", l.layout, obj);
+}
+
+// BucketLogType
+// Human-readable name for a BucketLogType value.
+std::string_view to_string(const BucketLogType& t)
+{
+  if (t == BucketLogType::InIndex) {
+    return "InIndex";
+  }
+  return "Unknown";
+}
+// Case-insensitive parse; returns false (leaving t untouched) on no match.
+bool parse(std::string_view str, BucketLogType& t)
+{
+  const bool matched = boost::iequals(str, "InIndex");
+  if (matched) {
+    t = BucketLogType::InIndex;
+  }
+  return matched;
+}
+// JSON encodes the enum by its string name.
+void encode_json_impl(const char *name, const BucketLogType& t, ceph::Formatter *f)
+{
+  const auto text = to_string(t);
+  encode_json(name, text, f);
+}
+void decode_json_obj(BucketLogType& t, JSONObj *obj)
+{
+  std::string text;
+  decode_json_obj(text, obj);
+  parse(text, t);  // unparsable input leaves t unchanged
+}
+
+// bucket_index_log_layout
+// wire format v1: the index generation this log lives in, then its layout
+void encode(const bucket_index_log_layout& l, bufferlist& bl, uint64_t f)
+{
+  ENCODE_START(1, 1, bl);
+  encode(l.gen, bl);
+  encode(l.layout, bl);
+  ENCODE_FINISH(bl);
+}
+void decode(bucket_index_log_layout& l, bufferlist::const_iterator& bl)
+{
+  DECODE_START(1, bl);
+  decode(l.gen, bl);
+  decode(l.layout, bl);
+  DECODE_FINISH(bl);
+}
+void encode_json_impl(const char *name, const bucket_index_log_layout& l, ceph::Formatter *f)
+{
+  f->open_object_section(name);
+  encode_json("gen", l.gen, f);
+  encode_json("layout", l.layout, f);
+  f->close_section();
+}
+void decode_json_obj(bucket_index_log_layout& l, JSONObj *obj)
+{
+  JSONDecoder::decode_json("gen", l.gen, obj);
+  JSONDecoder::decode_json("layout", l.layout, obj);
+}
+
+// bucket_log_layout
+// wire format v1: type tag then type-specific payload (InIndex only)
+void encode(const bucket_log_layout& l, bufferlist& bl, uint64_t f)
+{
+  ENCODE_START(1, 1, bl);
+  encode(l.type, bl);
+  switch (l.type) {
+  case BucketLogType::InIndex:
+    encode(l.in_index, bl);
+    break;
+  }
+  ENCODE_FINISH(bl);
+}
+void decode(bucket_log_layout& l, bufferlist::const_iterator& bl)
+{
+  DECODE_START(1, bl);
+  decode(l.type, bl);
+  switch (l.type) {
+  case BucketLogType::InIndex:
+    decode(l.in_index, bl);
+    break;
+  }
+  DECODE_FINISH(bl);
+}
+// JSON: "in_index" is emitted only for InIndex, but decoded unconditionally
+// (decode_json leaves the member defaulted when the key is absent)
+void encode_json_impl(const char *name, const bucket_log_layout& l, ceph::Formatter *f)
+{
+  f->open_object_section(name);
+  encode_json("type", l.type, f);
+  if (l.type == BucketLogType::InIndex) {
+    encode_json("in_index", l.in_index, f);
+  }
+  f->close_section();
+}
+void decode_json_obj(bucket_log_layout& l, JSONObj *obj)
+{
+  JSONDecoder::decode_json("type", l.type, obj);
+  JSONDecoder::decode_json("in_index", l.in_index, obj);
+}
+
+// bucket_log_layout_generation
+// wire format v1: generation number then the log layout
+void encode(const bucket_log_layout_generation& l, bufferlist& bl, uint64_t f)
+{
+  ENCODE_START(1, 1, bl);
+  encode(l.gen, bl);
+  encode(l.layout, bl);
+  ENCODE_FINISH(bl);
+}
+void decode(bucket_log_layout_generation& l, bufferlist::const_iterator& bl)
+{
+  DECODE_START(1, bl);
+  decode(l.gen, bl);
+  decode(l.layout, bl);
+  DECODE_FINISH(bl);
+}
+void encode_json_impl(const char *name, const bucket_log_layout_generation& l, ceph::Formatter *f)
+{
+  f->open_object_section(name);
+  encode_json("gen", l.gen, f);
+  encode_json("layout", l.layout, f);
+  f->close_section();
+}
+void decode_json_obj(bucket_log_layout_generation& l, JSONObj *obj)
+{
+  JSONDecoder::decode_json("gen", l.gen, obj);
+  JSONDecoder::decode_json("layout", l.layout, obj);
+}
+
+// BucketReshardState
+// Human-readable name for a BucketReshardState value.
+std::string_view to_string(const BucketReshardState& s)
+{
+  if (s == BucketReshardState::None) {
+    return "None";
+  }
+  if (s == BucketReshardState::InProgress) {
+    return "InProgress";
+  }
+  return "Unknown";
+}
+// Case-insensitive parse; returns false (leaving s untouched) on no match.
+bool parse(std::string_view str, BucketReshardState& s)
+{
+  if (boost::iequals(str, "InProgress")) {
+    s = BucketReshardState::InProgress;
+    return true;
+  } else if (boost::iequals(str, "None")) {
+    s = BucketReshardState::None;
+    return true;
+  }
+  return false;
+}
+// JSON encodes the enum by its string name.
+void encode_json_impl(const char *name, const BucketReshardState& s, ceph::Formatter *f)
+{
+  const auto text = to_string(s);
+  encode_json(name, text, f);
+}
+void decode_json_obj(BucketReshardState& s, JSONObj *obj)
+{
+  std::string text;
+  decode_json_obj(text, obj);
+  parse(text, s);  // unparsable input leaves s unchanged
+}
+
+
+// BucketLayout
+// wire format: v2 added the 'logs' vector (compat still 1)
+void encode(const BucketLayout& l, bufferlist& bl, uint64_t f)
+{
+  ENCODE_START(2, 1, bl);
+  encode(l.resharding, bl);
+  encode(l.current_index, bl);
+  encode(l.target_index, bl);
+  encode(l.logs, bl);
+  ENCODE_FINISH(bl);
+}
+void decode(BucketLayout& l, bufferlist::const_iterator& bl)
+{
+  DECODE_START(2, bl);
+  decode(l.resharding, bl);
+  decode(l.current_index, bl);
+  decode(l.target_index, bl);
+  if (struct_v < 2) {
+    // upgrade path: v1 had no log layouts on the wire
+    l.logs.clear();
+    // initialize the log layout to match the current index layout
+    if (l.current_index.layout.type == BucketIndexType::Normal) {
+      l.logs.push_back(log_layout_from_index(0, l.current_index));
+    }
+  } else {
+    decode(l.logs, bl);
+  }
+  DECODE_FINISH(bl);
+}
+// JSON: target_index is omitted when not resharding; logs dumped as an array
+void encode_json_impl(const char *name, const BucketLayout& l, ceph::Formatter *f)
+{
+  f->open_object_section(name);
+  encode_json("resharding", l.resharding, f);
+  encode_json("current_index", l.current_index, f);
+  if (l.target_index) {
+    encode_json("target_index", *l.target_index, f);
+  }
+  f->open_array_section("logs");
+  for (const auto& log : l.logs) {
+    encode_json("log", log, f);
+  }
+  f->close_section(); // logs[]
+  f->close_section();
+}
+void decode_json_obj(BucketLayout& l, JSONObj *obj)
+{
+  JSONDecoder::decode_json("resharding", l.resharding, obj);
+  JSONDecoder::decode_json("current_index", l.current_index, obj);
+  JSONDecoder::decode_json("target_index", l.target_index, obj);
+  JSONDecoder::decode_json("logs", l.logs, obj);
+}
+
+} // namespace rgw
diff --git a/src/rgw/rgw_bucket_layout.h b/src/rgw/rgw_bucket_layout.h
new file mode 100644
index 000000000..40aafd4dd
--- /dev/null
+++ b/src/rgw/rgw_bucket_layout.h
@@ -0,0 +1,282 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+/* N.B., this header defines fundamental serialized types. Do not
+ * introduce changes or include files which can only be compiled in
+ * radosgw or OSD contexts (e.g., rgw_sal.h, rgw_common.h)
+ */
+
+#pragma once
+
+#include <optional>
+#include <string>
+#include "include/encoding.h"
+#include "common/ceph_json.h"
+
+namespace rgw {
+
+// How (or whether) a bucket's index is stored.
+enum class BucketIndexType : uint8_t {
+  Normal, // normal hash-based sharded index layout
+  Indexless, // no bucket index, so listing is unsupported
+};
+
+std::string_view to_string(const BucketIndexType& t);
+bool parse(std::string_view str, BucketIndexType& t);
+void encode_json_impl(const char *name, const BucketIndexType& t, ceph::Formatter *f);
+void decode_json_obj(BucketIndexType& t, JSONObj *obj);
+
+inline std::ostream& operator<<(std::ostream& out, const BucketIndexType& t)
+{
+  return out << to_string(t);
+}
+
+// How object names are mapped to index shards.
+enum class BucketHashType : uint8_t {
+  Mod, // rjenkins hash of object name, modulo num_shards
+};
+
+std::string_view to_string(const BucketHashType& t);
+bool parse(std::string_view str, BucketHashType& t);
+void encode_json_impl(const char *name, const BucketHashType& t, ceph::Formatter *f);
+void decode_json_obj(BucketHashType& t, JSONObj *obj);
+
+// Shard count and hashing scheme of a Normal (sharded) bucket index.
+struct bucket_index_normal_layout {
+  uint32_t num_shards = 1;
+
+  BucketHashType hash_type = BucketHashType::Mod;
+
+  friend std::ostream& operator<<(std::ostream& out, const bucket_index_normal_layout& l) {
+    out << "num_shards=" << l.num_shards << ", hash_type=" << to_string(l.hash_type);
+    return out;
+  }
+};
+
+inline bool operator==(const bucket_index_normal_layout& l,
+                       const bucket_index_normal_layout& r) {
+  return l.num_shards == r.num_shards
+      && l.hash_type == r.hash_type;
+}
+inline bool operator!=(const bucket_index_normal_layout& l,
+                       const bucket_index_normal_layout& r) {
+  return !(l == r);
+}
+
+void encode(const bucket_index_normal_layout& l, bufferlist& bl, uint64_t f=0);
+void decode(bucket_index_normal_layout& l, bufferlist::const_iterator& bl);
+void encode_json_impl(const char *name, const bucket_index_normal_layout& l, ceph::Formatter *f);
+void decode_json_obj(bucket_index_normal_layout& l, JSONObj *obj);
+
+// Tagged union of index layout types; 'normal' is only meaningful when
+// type == BucketIndexType::Normal.
+struct bucket_index_layout {
+  BucketIndexType type = BucketIndexType::Normal;
+
+  // TODO: variant of layout types?
+  bucket_index_normal_layout normal;
+
+  friend std::ostream& operator<<(std::ostream& out, const bucket_index_layout& l) {
+    out << "type=" << to_string(l.type) << ", normal=" << l.normal;
+    return out;
+  }
+};
+
+inline bool operator==(const bucket_index_layout& l,
+                       const bucket_index_layout& r) {
+  return l.type == r.type && l.normal == r.normal;
+}
+inline bool operator!=(const bucket_index_layout& l,
+                       const bucket_index_layout& r) {
+  return !(l == r);
+}
+
+void encode(const bucket_index_layout& l, bufferlist& bl, uint64_t f=0);
+void decode(bucket_index_layout& l, bufferlist::const_iterator& bl);
+void encode_json_impl(const char *name, const bucket_index_layout& l, ceph::Formatter *f);
+void decode_json_obj(bucket_index_layout& l, JSONObj *obj);
+
+// An index layout together with its generation number; resharding creates
+// a new generation.
+struct bucket_index_layout_generation {
+  uint64_t gen = 0;
+  bucket_index_layout layout;
+
+  friend std::ostream& operator<<(std::ostream& out, const bucket_index_layout_generation& g) {
+    out << "gen=" << g.gen;
+    return out;
+  }
+};
+
+inline bool operator==(const bucket_index_layout_generation& l,
+                       const bucket_index_layout_generation& r) {
+  return l.gen == r.gen && l.layout == r.layout;
+}
+inline bool operator!=(const bucket_index_layout_generation& l,
+                       const bucket_index_layout_generation& r) {
+  return !(l == r);
+}
+
+void encode(const bucket_index_layout_generation& l, bufferlist& bl, uint64_t f=0);
+void decode(bucket_index_layout_generation& l, bufferlist::const_iterator& bl);
+void encode_json_impl(const char *name, const bucket_index_layout_generation& l, ceph::Formatter *f);
+void decode_json_obj(bucket_index_layout_generation& l, JSONObj *obj);
+
+
+// Where a bucket's change log is stored.
+enum class BucketLogType : uint8_t {
+  // colocated with bucket index, so the log layout matches the index layout
+  InIndex,
+};
+
+std::string_view to_string(const BucketLogType& t);
+bool parse(std::string_view str, BucketLogType& t);
+void encode_json_impl(const char *name, const BucketLogType& t, ceph::Formatter *f);
+void decode_json_obj(BucketLogType& t, JSONObj *obj);
+
+inline std::ostream& operator<<(std::ostream& out, const BucketLogType &log_type)
+{
+  switch (log_type) {
+    case BucketLogType::InIndex:
+      return out << "InIndex";
+    default:
+      return out << "Unknown";
+  }
+}
+
+// Log layout for InIndex logs: mirrors the index generation/layout the log
+// is colocated with; implicitly convertible back to that index generation.
+struct bucket_index_log_layout {
+  uint64_t gen = 0;
+  bucket_index_normal_layout layout;
+  operator bucket_index_layout_generation() const {
+    bucket_index_layout_generation bilg;
+    bilg.gen = gen;
+    bilg.layout.type = BucketIndexType::Normal;
+    bilg.layout.normal = layout;
+    return bilg;
+  }
+};
+
+void encode(const bucket_index_log_layout& l, bufferlist& bl, uint64_t f=0);
+void decode(bucket_index_log_layout& l, bufferlist::const_iterator& bl);
+void encode_json_impl(const char *name, const bucket_index_log_layout& l, ceph::Formatter *f);
+void decode_json_obj(bucket_index_log_layout& l, JSONObj *obj);
+
+// Tagged union of log layout types; 'in_index' is only meaningful when
+// type == BucketLogType::InIndex.
+struct bucket_log_layout {
+  BucketLogType type = BucketLogType::InIndex;
+
+  bucket_index_log_layout in_index;
+
+  friend std::ostream& operator<<(std::ostream& out, const bucket_log_layout& l) {
+    out << "type=" << to_string(l.type);
+    return out;
+  }
+};
+
+void encode(const bucket_log_layout& l, bufferlist& bl, uint64_t f=0);
+void decode(bucket_log_layout& l, bufferlist::const_iterator& bl);
+void encode_json_impl(const char *name, const bucket_log_layout& l, ceph::Formatter *f);
+void decode_json_obj(bucket_log_layout& l, JSONObj *obj);
+
+// A log layout together with its generation number.
+struct bucket_log_layout_generation {
+  uint64_t gen = 0;
+  bucket_log_layout layout;
+
+  friend std::ostream& operator<<(std::ostream& out, const bucket_log_layout_generation& g) {
+    out << "gen=" << g.gen << ", layout=[ " << g.layout << " ]";
+    return out;
+  }
+};
+
+void encode(const bucket_log_layout_generation& l, bufferlist& bl, uint64_t f=0);
+void decode(bucket_log_layout_generation& l, bufferlist::const_iterator& bl);
+void encode_json_impl(const char *name, const bucket_log_layout_generation& l, ceph::Formatter *f);
+void decode_json_obj(bucket_log_layout_generation& l, JSONObj *obj);
+
+// return a log layout that shares its layout with the index
+inline bucket_log_layout_generation log_layout_from_index(
+    uint64_t gen, const bucket_index_layout_generation& index)
+{
+  return {gen, {BucketLogType::InIndex, {index.gen, index.layout.normal}}};
+}
+
+// predicate for finding a log generation by number, e.g. with std::find_if
+inline auto matches_gen(uint64_t gen)
+{
+  return [gen] (const bucket_log_layout_generation& l) { return l.gen == gen; };
+}
+
+// inverse of log_layout_from_index; asserts the log is InIndex
+inline bucket_index_layout_generation log_to_index_layout(const bucket_log_layout_generation& log_layout)
+{
+  ceph_assert(log_layout.layout.type == BucketLogType::InIndex);
+  bucket_index_layout_generation index;
+  index.gen = log_layout.layout.in_index.gen;
+  index.layout.normal = log_layout.layout.in_index.layout;
+  return index;
+}
+
+// Whether a reshard of this bucket is currently underway.
+enum class BucketReshardState : uint8_t {
+  None,
+  InProgress,
+};
+std::string_view to_string(const BucketReshardState& s);
+bool parse(std::string_view str, BucketReshardState& s);
+void encode_json_impl(const char *name, const BucketReshardState& s, ceph::Formatter *f);
+void decode_json_obj(BucketReshardState& s, JSONObj *obj);
+
+// describes the layout of bucket index objects
+struct BucketLayout {
+  BucketReshardState resharding = BucketReshardState::None;
+
+  // current bucket index layout
+  bucket_index_layout_generation current_index;
+
+  // target index layout of a resharding operation
+  std::optional<bucket_index_layout_generation> target_index;
+
+  // history of untrimmed bucket log layout generations, with the current
+  // generation at the back()
+  std::vector<bucket_log_layout_generation> logs;
+
+  friend std::ostream& operator<<(std::ostream& out, const BucketLayout& l) {
+    std::stringstream ss;
+    if (l.target_index) {
+      ss << *l.target_index;
+    } else {
+      ss << "none";
+    }
+    out << "resharding=" << to_string(l.resharding) <<
+      ", current_index=[" << l.current_index << "], target_index=[" <<
+      ss.str() << "], logs.size()=" << l.logs.size();
+
+    return out;
+  }
+};
+
+void encode(const BucketLayout& l, bufferlist& bl, uint64_t f=0);
+void decode(BucketLayout& l, bufferlist::const_iterator& bl);
+void encode_json_impl(const char *name, const BucketLayout& l, ceph::Formatter *f);
+void decode_json_obj(BucketLayout& l, JSONObj *obj);
+
+
+// Effective shard count of an index layout.
+inline uint32_t num_shards(const bucket_index_normal_layout& index) {
+  // old buckets used num_shards=0 to mean 1
+  return index.num_shards > 0 ? index.num_shards : 1;
+}
+// Asserts the layout is Normal; Indexless layouts have no shards.
+inline uint32_t num_shards(const bucket_index_layout& index) {
+  ceph_assert(index.type == BucketIndexType::Normal);
+  return num_shards(index.normal);
+}
+inline uint32_t num_shards(const bucket_index_layout_generation& index) {
+  return num_shards(index.layout);
+}
+inline uint32_t current_num_shards(const BucketLayout& layout) {
+  return num_shards(layout.current_index);
+}
+inline bool is_layout_indexless(const bucket_index_layout_generation& layout) {
+  return layout.layout.type == BucketIndexType::Indexless;
+}
+
+} // namespace rgw
diff --git a/src/rgw/rgw_bucket_sync_cache.h b/src/rgw/rgw_bucket_sync_cache.h
new file mode 100644
index 000000000..064fdce48
--- /dev/null
+++ b/src/rgw/rgw_bucket_sync_cache.h
@@ -0,0 +1,116 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ */
+
+#pragma once
+
+#include <boost/smart_ptr/intrusive_ref_counter.hpp>
+#include "common/intrusive_lru.h"
+#include "rgw_data_sync.h"
+
+namespace rgw::bucket_sync {
+
+// per bucket-shard state cached by DataSyncShardCR
+struct State {
+  // the source bucket shard to sync
+  std::pair<rgw_bucket_shard, std::optional<uint64_t>> key;
+  // current sync obligation being processed by DataSyncSingleEntry
+  std::optional<rgw_data_sync_obligation> obligation;
+  // incremented with each new obligation
+  uint32_t counter = 0;
+  // highest timestamp applied by all sources
+  ceph::real_time progress_timestamp;
+
+  // construct from a pre-built (shard, gen) key, or from its two parts
+  State(const std::pair<rgw_bucket_shard, std::optional<uint64_t>>& key ) noexcept
+    : key(key) {}
+  State(const rgw_bucket_shard& shard, std::optional<uint64_t> gen) noexcept
+    : key(shard, gen) {}
+};
+
+struct Entry;
+struct EntryToKey;
+class Handle;
+
+// lru keyed by (bucket shard, optional generation)
+using lru_config = ceph::common::intrusive_lru_config<
+    std::pair<rgw_bucket_shard, std::optional<uint64_t>>, Entry, EntryToKey>;
+
+// a recyclable cache entry
+struct Entry : State, ceph::common::intrusive_lru_base<lru_config> {
+  using State::State;
+};
+
+// key-extraction functor required by intrusive_lru_config
+struct EntryToKey {
+  using type = std::pair<rgw_bucket_shard, std::optional<uint64_t>>;
+  const type& operator()(const Entry& e) { return e.key; }
+};
+
+// use a non-atomic reference count since these aren't shared across threads
+template <typename T>
+using thread_unsafe_ref_counter = boost::intrusive_ref_counter<
+    T, boost::thread_unsafe_counter>;
+
+// a state cache for entries within a single datalog shard
+class Cache : public thread_unsafe_ref_counter<Cache> {
+  ceph::common::intrusive_lru<lru_config> cache;
+ protected:
+  // protected ctor to enforce the use of factory function create()
+  explicit Cache(size_t target_size) {
+    cache.set_target_size(target_size);
+  }
+ public:
+  // heap-allocate a Cache managed by boost::intrusive_ptr refcounting
+  static boost::intrusive_ptr<Cache> create(size_t target_size) {
+    return new Cache(target_size);
+  }
+
+  // find or create a cache entry for the given key, and return a Handle that
+  // keeps it lru-pinned until destruction
+  Handle get(const rgw_bucket_shard& shard, std::optional<uint64_t> gen);
+};
+
+// a State handle that keeps the Cache referenced
+class Handle {
+  boost::intrusive_ptr<Cache> cache;
+  boost::intrusive_ptr<Entry> entry;
+ public:
+  Handle() noexcept = default;
+  ~Handle() = default;
+  Handle(boost::intrusive_ptr<Cache> cache,
+         boost::intrusive_ptr<Entry> entry) noexcept
+    : cache(std::move(cache)), entry(std::move(entry)) {}
+  Handle(Handle&&) = default;
+  Handle(const Handle&) = default;
+  // assignment is ordered entry-before-cache on purpose: the incoming
+  // entry must be held before the old cache reference can be dropped
+  Handle& operator=(Handle&& o) noexcept {
+    // move the entry first so that its cache stays referenced over destruction
+    entry = std::move(o.entry);
+    cache = std::move(o.cache);
+    return *this;
+  }
+  Handle& operator=(const Handle& o) noexcept {
+    // copy the entry first so that its cache stays referenced over destruction
+    entry = o.entry;
+    cache = o.cache;
+    return *this;
+  }
+
+  // smart-pointer-like access to the pinned State; bool tests for emptiness
+  explicit operator bool() const noexcept { return static_cast<bool>(entry); }
+  State& operator*() const noexcept { return *entry; }
+  State* operator->() const noexcept { return entry.get(); }
+};
+
+// look up or create the entry, lru-pinning it for the Handle's lifetime
+inline Handle Cache::get(const rgw_bucket_shard& shard, std::optional<uint64_t> gen)
+{
+  auto result = cache.get_or_create({ shard, gen });
+  return {this, std::move(result.first)};
+}
+
+} // namespace rgw::bucket_sync
diff --git a/src/rgw/rgw_bucket_types.h b/src/rgw/rgw_bucket_types.h
new file mode 100644
index 000000000..61acc58bb
--- /dev/null
+++ b/src/rgw/rgw_bucket_types.h
@@ -0,0 +1,233 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+/* N.B., this header defines fundamental serialized types. Do not
+ * include files which can only be compiled in radosgw or OSD
+ * contexts (e.g., rgw_sal.h, rgw_common.h) */
+
+#pragma once
+
+#include <fmt/format.h>
+
+#include "rgw_pool_types.h"
+#include "rgw_user_types.h"
+#include "rgw_placement_types.h"
+
+#include "common/dout.h"
+#include "common/Formatter.h"
+
+struct cls_user_bucket;
+
+// Lightweight (tenant, name, instance-id) key identifying a bucket;
+// bucket_id may be left empty to refer to any instance of the bucket.
+struct rgw_bucket_key {
+  std::string tenant;
+  std::string name;
+  std::string bucket_id;
+
+  rgw_bucket_key(const std::string& _tenant,
+                 const std::string& _name,
+                 const std::string& _bucket_id) : tenant(_tenant),
+                                                  name(_name),
+                                                  bucket_id(_bucket_id) {}
+  rgw_bucket_key(const std::string& _tenant,
+                 const std::string& _name) : tenant(_tenant),
+                                             name(_name) {}
+};
+
+// Fundamental serialized bucket identity: tenant/name plus the instance id,
+// the original listing marker, and (for very old buckets) explicit pool
+// placement that predates placement rules.
+struct rgw_bucket {
+  std::string tenant;
+  std::string name;
+  std::string marker;
+  std::string bucket_id;
+  rgw_data_placement_target explicit_placement;
+
+  rgw_bucket() { }
+  // cppcheck-suppress noExplicitConstructor
+  explicit rgw_bucket(const rgw_user& u, const cls_user_bucket& b);
+
+  rgw_bucket(const std::string& _tenant,
+             const std::string& _name,
+             const std::string& _bucket_id) : tenant(_tenant),
+                                              name(_name),
+                                              bucket_id(_bucket_id) {}
+  rgw_bucket(const rgw_bucket_key& bk) : tenant(bk.tenant),
+                                         name(bk.name),
+                                         bucket_id(bk.bucket_id) {}
+  rgw_bucket(const rgw_bucket&) = default;
+  rgw_bucket(rgw_bucket&&) = default;
+
+  // loose equality: an empty bucket_id on either side acts as a wildcard
+  bool match(const rgw_bucket& b) const {
+    return (tenant == b.tenant &&
+            name == b.name &&
+            (bucket_id == b.bucket_id ||
+             bucket_id.empty() ||
+             b.bucket_id.empty()));
+  }
+
+  void convert(cls_user_bucket *b) const;
+
+  // wire format v10: explicit placement pools moved behind an "is explicit"
+  // bool so non-legacy buckets don't pay for the three pool fields
+  void encode(ceph::buffer::list& bl) const {
+    ENCODE_START(10, 10, bl);
+    encode(name, bl);
+    encode(marker, bl);
+    encode(bucket_id, bl);
+    encode(tenant, bl);
+    bool encode_explicit = !explicit_placement.data_pool.empty();
+    encode(encode_explicit, bl);
+    if (encode_explicit) {
+      encode(explicit_placement.data_pool, bl);
+      encode(explicit_placement.data_extra_pool, bl);
+      encode(explicit_placement.index_pool, bl);
+    }
+    ENCODE_FINISH(bl);
+  }
+
+  // decodes every historical version back to v3 legacy-compat encodings
+  void decode(ceph::buffer::list::const_iterator& bl) {
+    DECODE_START_LEGACY_COMPAT_LEN(10, 3, 3, bl);
+    decode(name, bl);
+    if (struct_v < 10) {
+      // pre-v10: data pool name was encoded inline after the bucket name
+      decode(explicit_placement.data_pool.name, bl);
+    }
+    if (struct_v >= 2) {
+      decode(marker, bl);
+      if (struct_v <= 3) {
+        // v2/v3 stored a numeric instance id; render it as a string
+        uint64_t id;
+        decode(id, bl);
+        char buf[16];
+        snprintf(buf, sizeof(buf), "%" PRIu64, id);
+        bucket_id = buf;
+      } else {
+        decode(bucket_id, bl);
+      }
+    }
+    if (struct_v < 10) {
+      if (struct_v >= 5) {
+        decode(explicit_placement.index_pool.name, bl);
+      } else {
+        // pre-v5 had no separate index pool; it shared the data pool
+        explicit_placement.index_pool = explicit_placement.data_pool;
+      }
+      if (struct_v >= 7) {
+        decode(explicit_placement.data_extra_pool.name, bl);
+      }
+    }
+    if (struct_v >= 8) {
+      decode(tenant, bl);
+    }
+    if (struct_v >= 10) {
+      bool decode_explicit = !explicit_placement.data_pool.empty();
+      decode(decode_explicit, bl);
+      if (decode_explicit) {
+        decode(explicit_placement.data_pool, bl);
+        decode(explicit_placement.data_extra_pool, bl);
+        decode(explicit_placement.index_pool, bl);
+      }
+    }
+    DECODE_FINISH(bl);
+  }
+
+  void update_bucket_id(const std::string& new_bucket_id) {
+    bucket_id = new_bucket_id;
+  }
+
+  // format a key for the bucket/instance. pass delim=0 to skip a field
+  std::string get_key(char tenant_delim = '/',
+                      char id_delim = ':',
+                      size_t reserve = 0) const;
+
+  const rgw_pool& get_data_extra_pool() const {
+    return explicit_placement.get_data_extra_pool();
+  }
+
+  void dump(ceph::Formatter *f) const;
+  void decode_json(JSONObj *obj);
+  static void generate_test_instances(std::list<rgw_bucket*>& o);
+
+  // NOTE(review): user-declared copy-assign suppresses the implicit move
+  // assignment, so move-assignments fall back to copies — confirm intended
+  rgw_bucket& operator=(const rgw_bucket&) = default;
+
+  // strict ordering by (tenant, name, bucket_id); note this is stricter
+  // than match(), which treats an empty bucket_id as a wildcard
+  bool operator<(const rgw_bucket& b) const {
+    if (tenant < b.tenant) {
+      return true;
+    } else if (tenant > b.tenant) {
+      return false;
+    }
+
+    if (name < b.name) {
+      return true;
+    } else if (name > b.name) {
+      return false;
+    }
+
+    return (bucket_id < b.bucket_id);
+  }
+
+  bool operator==(const rgw_bucket& b) const {
+    return (tenant == b.tenant) && (name == b.name) && \
+           (bucket_id == b.bucket_id);
+  }
+  bool operator!=(const rgw_bucket& b) const {
+    return (tenant != b.tenant) || (name != b.name) ||
+           (bucket_id != b.bucket_id);
+  }
+};
+WRITE_CLASS_ENCODER(rgw_bucket)
+
+// Debug/log formatting: "tenant:name[bucket_id]".
+// (Fixed: the original emitted a stray trailing ')' with no matching '('.)
+inline std::ostream& operator<<(std::ostream& out, const rgw_bucket &b) {
+  out << b.tenant << ":" << b.name << "[" << b.bucket_id << "]";
+  return out;
+}
+
+// A bucket together with the placement rule it was created under.
+struct rgw_bucket_placement {
+  rgw_placement_rule placement_rule;
+  rgw_bucket bucket;
+
+  void dump(Formatter *f) const;
+}; /* rgw_bucket_placement */
+
+// A single shard of a bucket's index; shard_id of -1 denotes "unsharded".
+struct rgw_bucket_shard {
+  rgw_bucket bucket;
+  int shard_id;
+
+  rgw_bucket_shard() : shard_id(-1) {}
+  rgw_bucket_shard(const rgw_bucket& _b, int _sid) : bucket(_b), shard_id(_sid) {}
+
+  std::string get_key(char tenant_delim = '/', char id_delim = ':',
+                      char shard_delim = ':',
+                      size_t reserve = 0) const;
+
+  // order by bucket first, then shard id
+  bool operator<(const rgw_bucket_shard& b) const {
+    if (bucket < b.bucket) {
+      return true;
+    }
+    if (b.bucket < bucket) {
+      return false;
+    }
+    return shard_id < b.shard_id;
+  }
+
+  bool operator==(const rgw_bucket_shard& b) const {
+    return (bucket == b.bucket &&
+            shard_id == b.shard_id);
+  }
+}; /* rgw_bucket_shard */
+
+void encode(const rgw_bucket_shard& b, bufferlist& bl, uint64_t f=0);
+void decode(rgw_bucket_shard& b, bufferlist::const_iterator& bl);
+
+// Debug/log formatting: the bucket, with ":<shard>" appended only for
+// positive shard ids (shard 0 and unsharded -1 both print the bucket alone).
+inline std::ostream& operator<<(std::ostream& out, const rgw_bucket_shard& bs) {
+  if (bs.shard_id <= 0) {
+    return out << bs.bucket;
+  }
+
+  return out << bs.bucket << ":" << bs.shard_id;
+}
diff --git a/src/rgw/rgw_cache.cc b/src/rgw/rgw_cache.cc
new file mode 100644
index 000000000..dd7a826cd
--- /dev/null
+++ b/src/rgw/rgw_cache.cc
@@ -0,0 +1,419 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_cache.h"
+#include "rgw_perf_counters.h"
+
+#include <algorithm>
+#include <errno.h>
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+// Look up 'name' in the cache.
+//
+// Returns:
+//   0        hit; the cached entry is copied into 'info'
+//   -ENOENT  miss, cache disabled, entry expired, or cached flags don't
+//            cover every bit in 'mask'
+//   -ENODATA hit on a cached negative entry (object known to be absent)
+//
+// Normally runs under the shared lock; it drops that and takes the exclusive
+// lock only to expire an entry or promote it in the LRU, re-validating the
+// lookup after each upgrade since another thread may have raced in between.
+int ObjectCache::get(const DoutPrefixProvider *dpp, const string& name, ObjectCacheInfo& info, uint32_t mask, rgw_cache_entry_info *cache_info)
+{
+
+  std::shared_lock rl{lock};
+  std::unique_lock wl{lock, std::defer_lock}; // may be promoted to write lock
+  if (!enabled) {
+    return -ENOENT;
+  }
+  auto iter = cache_map.find(name);
+  if (iter == cache_map.end()) {
+    ldpp_dout(dpp, 10) << "cache get: name=" << name << " : miss" << dendl;
+    if (perfcounter) {
+      perfcounter->inc(l_rgw_cache_miss);
+    }
+    return -ENOENT;
+  }
+
+  // time-based expiry (expiry.count() == 0 disables it)
+  if (expiry.count() &&
+      (ceph::coarse_mono_clock::now() - iter->second.info.time_added) > expiry) {
+    ldpp_dout(dpp, 10) << "cache get: name=" << name << " : expiry miss" << dendl;
+    rl.unlock();
+    wl.lock(); // write lock for expiration
+    // check that wasn't already removed by other thread
+    iter = cache_map.find(name);
+    if (iter != cache_map.end()) {
+      // drop chained (secondary) cache entries that reference this one
+      for (auto &kv : iter->second.chained_entries)
+        kv.first->invalidate(kv.second);
+      remove_lru(name, iter->second.lru_iter);
+      cache_map.erase(iter);
+    }
+    if (perfcounter) {
+      perfcounter->inc(l_rgw_cache_miss);
+    }
+    return -ENOENT;
+  }
+
+  ObjectCacheEntry *entry = &iter->second;
+
+  // promote at most once per 'lru_window' accesses to limit write-lock churn
+  if (lru_counter - entry->lru_promotion_ts > lru_window) {
+    ldpp_dout(dpp, 20) << "cache get: touching lru, lru_counter=" << lru_counter
+                       << " promotion_ts=" << entry->lru_promotion_ts << dendl;
+    rl.unlock();
+    wl.lock(); // write lock for touch_lru()
+    /* need to redo this because entry might have dropped off the cache */
+    iter = cache_map.find(name);
+    if (iter == cache_map.end()) {
+      ldpp_dout(dpp, 10) << "lost race! cache get: name=" << name << " : miss" << dendl;
+      if(perfcounter) perfcounter->inc(l_rgw_cache_miss);
+      return -ENOENT;
+    }
+
+    entry = &iter->second;
+    /* check again, we might have lost a race here */
+    if (lru_counter - entry->lru_promotion_ts > lru_window) {
+      touch_lru(dpp, name, *entry, iter->second.lru_iter);
+    }
+  }
+
+  ObjectCacheInfo& src = iter->second.info;
+  // a cached -ENOENT status is a negative entry: the object is known absent
+  if(src.status == -ENOENT) {
+    ldpp_dout(dpp, 10) << "cache get: name=" << name << " : hit (negative entry)" << dendl;
+    if (perfcounter) perfcounter->inc(l_rgw_cache_hit);
+    return -ENODATA;
+  }
+  // the entry must carry every field class the caller asked for in 'mask'
+  if ((src.flags & mask) != mask) {
+    ldpp_dout(dpp, 10) << "cache get: name=" << name << " : type miss (requested=0x"
+                       << std::hex << mask << ", cached=0x" << src.flags
+                       << std::dec << ")" << dendl;
+    if(perfcounter) perfcounter->inc(l_rgw_cache_miss);
+    return -ENOENT;
+  }
+  ldpp_dout(dpp, 10) << "cache get: name=" << name << " : hit (requested=0x"
+                     << std::hex << mask << ", cached=0x" << src.flags
+                     << std::dec << ")" << dendl;
+
+  info = src;
+  if (cache_info) {
+    // hand back a locator + generation so chain_cache_entry() can later
+    // verify the entry hasn't been replaced in the meantime
+    cache_info->cache_locator = name;
+    cache_info->gen = entry->gen;
+  }
+  if(perfcounter) perfcounter->inc(l_rgw_cache_hit);
+
+  return 0;
+}
+
+// Register 'chained_entry' in its chained cache and link it to every primary
+// cache entry listed in 'cache_info_entries'. Returns false without side
+// effects if any referenced entry has vanished or was replaced (generation
+// mismatch) since the caller captured its rgw_cache_entry_info. Validation
+// and linking run under the exclusive lock, so they are atomic with respect
+// to concurrent invalidation.
+bool ObjectCache::chain_cache_entry(const DoutPrefixProvider *dpp,
+                                    std::initializer_list<rgw_cache_entry_info*> cache_info_entries,
+                                    RGWChainedCache::Entry *chained_entry)
+{
+  std::unique_lock l{lock};
+
+  if (!enabled) {
+    return false;
+  }
+
+  std::vector<ObjectCacheEntry*> entries;
+  entries.reserve(cache_info_entries.size());
+  /* first verify that all entries are still valid */
+  for (auto cache_info : cache_info_entries) {
+    ldpp_dout(dpp, 10) << "chain_cache_entry: cache_locator="
+                       << cache_info->cache_locator << dendl;
+    auto iter = cache_map.find(cache_info->cache_locator);
+    if (iter == cache_map.end()) {
+      ldpp_dout(dpp, 20) << "chain_cache_entry: couldn't find cache locator" << dendl;
+      return false;
+    }
+
+    auto entry = &iter->second;
+
+    // generation mismatch means the entry was overwritten since the caller
+    // obtained its cache_info; linking to it would be stale
+    if (entry->gen != cache_info->gen) {
+      ldpp_dout(dpp, 20) << "chain_cache_entry: entry.gen (" << entry->gen
+                         << ") != cache_info.gen (" << cache_info->gen << ")"
+                         << dendl;
+      return false;
+    }
+    entries.push_back(entry);
+  }
+
+
+  // all entries validated: store the data in the chained cache, then record
+  // the back-links used for later invalidation
+  chained_entry->cache->chain_cb(chained_entry->key, chained_entry->data);
+
+  for (auto entry : entries) {
+    entry->chained_entries.push_back(make_pair(chained_entry->cache,
+                                               chained_entry->key));
+  }
+
+  return true;
+}
+
+// Insert or overwrite the cache entry for 'name'. 'info.flags' declares
+// which fields of 'info' are valid; valid fields are merged into the
+// existing entry. A negative 'info.status' stores a negative entry and
+// wipes any previously cached content.
+void ObjectCache::put(const DoutPrefixProvider *dpp, const string& name, ObjectCacheInfo& info, rgw_cache_entry_info *cache_info)
+{
+  std::unique_lock l{lock};
+
+  if (!enabled) {
+    return;
+  }
+
+  ldpp_dout(dpp, 10) << "cache put: name=" << name << " info.flags=0x"
+                     << std::hex << info.flags << std::dec << dendl;
+
+  // find-or-create; 'inserted' tells us whether the entry is brand new
+  auto [iter, inserted] = cache_map.emplace(name, ObjectCacheEntry{});
+  ObjectCacheEntry& entry = iter->second;
+  entry.info.time_added = ceph::coarse_mono_clock::now();
+  if (inserted) {
+    entry.lru_iter = lru.end();
+  }
+  ObjectCacheInfo& target = entry.info;
+
+  // unlink chained-cache entries tied to the previous generation
+  invalidate_lru(entry);
+
+  entry.chained_entries.clear();
+  // bump the generation so stale rgw_cache_entry_info handles are rejected
+  entry.gen++;
+
+  touch_lru(dpp, name, entry, entry.lru_iter);
+
+  target.status = info.status;
+
+  if (info.status < 0) {
+    // negative entry: remember the failure, drop all cached content
+    target.flags = 0;
+    target.xattrs.clear();
+    target.data.clear();
+    return;
+  }
+
+  if (cache_info) {
+    cache_info->cache_locator = name;
+    cache_info->gen = entry.gen;
+  }
+
+  // put() must include the latest version if we're going to keep caching it
+  target.flags &= ~CACHE_FLAG_OBJV;
+
+  target.flags |= info.flags;
+
+  if (info.flags & CACHE_FLAG_META)
+    target.meta = info.meta;
+  else if (!(info.flags & CACHE_FLAG_MODIFY_XATTRS))
+    target.flags &= ~CACHE_FLAG_META; // non-meta change should reset meta
+
+  if (info.flags & CACHE_FLAG_XATTRS) {
+    // full xattr replacement
+    target.xattrs = info.xattrs;
+    map<string, bufferlist>::iterator iter;
+    for (iter = target.xattrs.begin(); iter != target.xattrs.end(); ++iter) {
+      ldpp_dout(dpp, 10) << "updating xattr: name=" << iter->first << " bl.length()=" << iter->second.length() << dendl;
+    }
+  } else if (info.flags & CACHE_FLAG_MODIFY_XATTRS) {
+    // incremental update: first erase rm_xattrs, then overlay xattrs
+    map<string, bufferlist>::iterator iter;
+    for (iter = info.rm_xattrs.begin(); iter != info.rm_xattrs.end(); ++iter) {
+      ldpp_dout(dpp, 10) << "removing xattr: name=" << iter->first << dendl;
+      target.xattrs.erase(iter->first);
+    }
+    for (iter = info.xattrs.begin(); iter != info.xattrs.end(); ++iter) {
+      ldpp_dout(dpp, 10) << "appending xattr: name=" << iter->first << " bl.length()=" << iter->second.length() << dendl;
+      target.xattrs[iter->first] = iter->second;
+    }
+  }
+
+  if (info.flags & CACHE_FLAG_DATA)
+    target.data = info.data;
+
+  if (info.flags & CACHE_FLAG_OBJV)
+    target.version = info.version;
+}
+
+// WARNING: This function /must not/ be modified to cache a
+// negative lookup. It must only invalidate.
+//
+// Removes 'name' from the cache (and drops its chained-cache entries).
+// Returns true if something was removed, false otherwise.
+bool ObjectCache::invalidate_remove(const DoutPrefixProvider *dpp, const string& name)
+{
+  std::unique_lock l{lock};
+
+  if (!enabled) {
+    return false;
+  }
+
+  auto it = cache_map.find(name);
+  if (it == cache_map.end()) {
+    return false;
+  }
+
+  ldpp_dout(dpp, 10) << "removing " << name << " from cache" << dendl;
+
+  // drop every chained-cache entry linked to this one
+  for (auto& [chained, key] : it->second.chained_entries) {
+    chained->invalidate(key);
+  }
+
+  remove_lru(name, it->second.lru_iter);
+  cache_map.erase(it);
+  return true;
+}
+
+// Move 'name' to the most-recently-used end of the LRU list (appending it
+// if absent), updating 'lru_iter' to its new position and recording the
+// promotion in entry.lru_promotion_ts. Also shrinks the LRU down to
+// rgw_cache_lru_size by evicting from the cold end -- except the entry
+// being touched, which is never evicted here. Caller must hold the
+// exclusive lock.
+void ObjectCache::touch_lru(const DoutPrefixProvider *dpp, const string& name, ObjectCacheEntry& entry,
+                            std::list<string>::iterator& lru_iter)
+{
+  while (lru_size > (size_t)cct->_conf->rgw_cache_lru_size) {
+    auto iter = lru.begin();
+    if ((*iter).compare(name) == 0) {
+      /*
+       * if the entry we're touching happens to be at the lru end, don't remove it,
+       * lru shrinking can wait for next time
+       */
+      break;
+    }
+    auto map_iter = cache_map.find(*iter);
+    ldout(cct, 10) << "removing entry: name=" << *iter << " from cache LRU" << dendl;
+    if (map_iter != cache_map.end()) {
+      // renamed from 'entry', which shadowed the function parameter above
+      ObjectCacheEntry& victim = map_iter->second;
+      invalidate_lru(victim);
+      cache_map.erase(map_iter);
+    }
+    lru.pop_front();
+    lru_size--;
+  }
+
+  if (lru_iter == lru.end()) {
+    // not on the LRU yet: append, then point lru_iter at the new tail
+    lru.push_back(name);
+    lru_size++;
+    --lru_iter;
+    ldpp_dout(dpp, 10) << "adding " << name << " to cache LRU end" << dendl;
+  } else {
+    // already present: splice to the MRU end
+    ldpp_dout(dpp, 10) << "moving " << name << " to cache LRU end" << dendl;
+    lru.erase(lru_iter);
+    lru.push_back(name);
+    lru_iter = lru.end();
+    --lru_iter;
+  }
+
+  lru_counter++;
+  entry.lru_promotion_ts = lru_counter;
+}
+
+// Erase the entry's node from the LRU list and reset the caller's iterator
+// to end(), marking the entry as no longer on the list. No-op if the
+// iterator is already end(). Caller must hold the exclusive lock.
+void ObjectCache::remove_lru(const string& name,
+                             std::list<string>::iterator& lru_iter)
+{
+  if (lru_iter != lru.end()) {
+    lru.erase(lru_iter);
+    --lru_size;
+    lru_iter = lru.end();
+  }
+}
+
+// Invalidate every chained-cache entry linked to 'entry'. Caller must hold
+// the exclusive lock.
+void ObjectCache::invalidate_lru(ObjectCacheEntry& entry)
+{
+  // range-for over the (chained cache, key) pairs, matching the style of
+  // invalidate_remove(); replaces the hand-written iterator loop
+  for (auto& [chained_cache, key] : entry.chained_entries) {
+    chained_cache->invalidate(key);
+  }
+}
+
+// Enable or disable the cache. Disabling also drops all current contents,
+// so stale data can't be served after a later re-enable.
+void ObjectCache::set_enabled(bool status)
+{
+  std::unique_lock l{lock};
+
+  enabled = status;
+  if (!status) {
+    do_invalidate_all();
+  }
+}
+
+// Drop everything from this cache and all chained caches, under the
+// exclusive lock.
+void ObjectCache::invalidate_all()
+{
+  std::unique_lock l{lock};
+
+  do_invalidate_all();
+}
+
+// Clear the map, the LRU list and its bookkeeping, then propagate the
+// invalidation to every registered chained cache. Caller must hold the
+// exclusive lock.
+void ObjectCache::do_invalidate_all()
+{
+  cache_map.clear();
+  lru.clear();
+
+  lru_size = 0;
+  lru_counter = 0;
+  // NOTE(review): this also zeroes lru_window, which set_ctx() derived from
+  // rgw_cache_lru_size; until set_ctx() runs again every get() will take the
+  // write lock to touch the LRU -- confirm this reset is intentional.
+  lru_window = 0;
+
+  for (auto& cache : chained_cache) {
+    cache->invalidate_all();
+  }
+}
+
+// Register a chained cache so it receives invalidations and can be linked
+// from entries via chain_cache_entry().
+void ObjectCache::chain_cache(RGWChainedCache *cache) {
+  std::unique_lock l{lock};
+  chained_cache.push_back(cache);
+}
+
+// Unregister a previously chained cache, notifying it via unregistered().
+// No-op if the cache was never registered.
+void ObjectCache::unchain_cache(RGWChainedCache *cache) {
+  std::unique_lock l{lock};
+
+  // std::find replaces the hand-rolled linear scan
+  auto iter = std::find(chained_cache.begin(), chained_cache.end(), cache);
+  if (iter != chained_cache.end()) {
+    chained_cache.erase(iter);
+    cache->unregistered();
+  }
+}
+
+// Destructor: notify every still-registered chained cache that this cache
+// is going away. No locking here -- by destruction time no other thread may
+// legally be using this object.
+ObjectCache::~ObjectCache()
+{
+  for (auto cache : chained_cache) {
+    cache->unregistered();
+  }
+}
+
+// Produce one populated and one default-constructed instance for the
+// encode/decode round-trip tests; ownership passes to the caller.
+void ObjectMetaInfo::generate_test_instances(list<ObjectMetaInfo*>& o)
+{
+  auto populated = new ObjectMetaInfo;
+  populated->size = 1024 * 1024;
+  o.push_back(populated);
+  o.push_back(new ObjectMetaInfo);
+}
+
+// Emit size and mtime as JSON; mtime is wrapped in utime_t for formatting.
+void ObjectMetaInfo::dump(Formatter *f) const
+{
+  encode_json("size", size, f);
+  encode_json("mtime", utime_t(mtime), f);
+}
+
+// Produce one fully-populated and one default instance for the
+// encode/decode round-trip tests; ownership passes to the caller.
+void ObjectCacheInfo::generate_test_instances(list<ObjectCacheInfo*>& o)
+{
+  using ceph::encode;
+  auto populated = new ObjectCacheInfo;
+  populated->status = 0;
+  populated->flags = CACHE_FLAG_MODIFY_XATTRS;
+  bufferlist data, data2;
+  {
+    const string s = "this is a string";
+    const string s2 = "this is a another string";
+    encode(s, data);
+    encode(s2, data2);
+  }
+  populated->data = data;
+  populated->xattrs["x1"] = data;
+  populated->xattrs["x2"] = data2;
+  populated->rm_xattrs["r2"] = data2;
+  populated->rm_xattrs["r3"] = data;
+  populated->meta.size = 512 * 1024;
+  o.push_back(populated);
+  o.push_back(new ObjectCacheInfo);
+}
+
+// JSON-dump every cached field; the xattr maps are emitted as
+// name/value/length triples via encode_json_map().
+void ObjectCacheInfo::dump(Formatter *f) const
+{
+  encode_json("status", status, f);
+  encode_json("flags", flags, f);
+  encode_json("data", data, f);
+  encode_json_map("xattrs", "name", "value", "length", xattrs, f);
+  encode_json_map("rm_xattrs", "name", "value", "length", rm_xattrs, f);
+  encode_json("meta", meta, f);
+}
+
+// A single default-constructed instance; ownership passes to the caller.
+void RGWCacheNotifyInfo::generate_test_instances(list<RGWCacheNotifyInfo*>& o)
+{
+  o.push_back(new RGWCacheNotifyInfo);
+}
+
+// JSON-dump all notification fields.
+void RGWCacheNotifyInfo::dump(Formatter *f) const
+{
+  encode_json("op", op, f);
+  encode_json("obj", obj, f);
+  encode_json("obj_info", obj_info, f);
+  encode_json("ofs", ofs, f);
+  encode_json("ns", ns, f);
+}
+
diff --git a/src/rgw/rgw_cache.h b/src/rgw/rgw_cache.h
new file mode 100644
index 000000000..e70beb064
--- /dev/null
+++ b/src/rgw/rgw_cache.h
@@ -0,0 +1,222 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <string>
+#include <map>
+#include <unordered_map>
+#include "include/types.h"
+#include "include/utime.h"
+#include "include/ceph_assert.h"
+#include "common/ceph_mutex.h"
+
+#include "cls/version/cls_version_types.h"
+#include "rgw_common.h"
+
+// Cache notification operations carried in RGWCacheNotifyInfo::op.
+enum {
+  UPDATE_OBJ,
+  INVALIDATE_OBJ,
+};
+
+// Bit flags recording which fields of an ObjectCacheInfo are populated.
+#define CACHE_FLAG_DATA 0x01 // 'data' holds object bytes
+#define CACHE_FLAG_XATTRS 0x02 // 'xattrs' holds a full xattr set
+#define CACHE_FLAG_META 0x04 // 'meta' (size/mtime) is valid
+#define CACHE_FLAG_MODIFY_XATTRS 0x08 // xattrs/rm_xattrs are a delta to apply
+#define CACHE_FLAG_OBJV 0x10 // 'version' is valid
+
+// Size/mtime metadata cached alongside an object.
+struct ObjectMetaInfo {
+  uint64_t size;
+  real_time mtime;
+
+  ObjectMetaInfo() : size(0) {}
+
+  // versioned on-wire encoding -- do not reorder fields
+  void encode(bufferlist& bl) const {
+    ENCODE_START(2, 2, bl);
+    encode(size, bl);
+    encode(mtime, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+    decode(size, bl);
+    decode(mtime, bl);
+    DECODE_FINISH(bl);
+  }
+  void dump(Formatter *f) const;
+  static void generate_test_instances(std::list<ObjectMetaInfo*>& o);
+};
+WRITE_CLASS_ENCODER(ObjectMetaInfo)
+
+// The payload stored per cache entry: raw data, xattrs, metadata and
+// version, with 'flags' (CACHE_FLAG_*) recording which fields are valid.
+struct ObjectCacheInfo {
+  int status = 0; // 0 for valid data; a negative errno caches a failed lookup
+  uint32_t flags = 0; // CACHE_FLAG_* bits for the populated fields
+  uint64_t epoch = 0;
+  bufferlist data;
+  std::map<std::string, bufferlist> xattrs;
+  std::map<std::string, bufferlist> rm_xattrs; // xattrs to erase (MODIFY_XATTRS)
+  ObjectMetaInfo meta;
+  obj_version version = {};
+  ceph::coarse_mono_time time_added; // local insertion time for expiry; not encoded
+
+  ObjectCacheInfo() = default;
+
+  // versioned on-wire encoding -- do not reorder fields
+  void encode(bufferlist& bl) const {
+    ENCODE_START(5, 3, bl);
+    encode(status, bl);
+    encode(flags, bl);
+    encode(data, bl);
+    encode(xattrs, bl);
+    encode(meta, bl);
+    encode(rm_xattrs, bl);
+    encode(epoch, bl);
+    encode(version, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START_LEGACY_COMPAT_LEN(5, 3, 3, bl);
+    decode(status, bl);
+    decode(flags, bl);
+    decode(data, bl);
+    decode(xattrs, bl);
+    decode(meta, bl);
+    // fields added in later encoding versions
+    if (struct_v >= 2)
+      decode(rm_xattrs, bl);
+    if (struct_v >= 4)
+      decode(epoch, bl);
+    if (struct_v >= 5)
+      decode(version, bl);
+    DECODE_FINISH(bl);
+  }
+  void dump(Formatter *f) const;
+  static void generate_test_instances(std::list<ObjectCacheInfo*>& o);
+};
+WRITE_CLASS_ENCODER(ObjectCacheInfo)
+
+// Describes a cache update/invalidation event for a raw object; 'op' is one
+// of UPDATE_OBJ / INVALIDATE_OBJ.
+struct RGWCacheNotifyInfo {
+  uint32_t op; // UPDATE_OBJ or INVALIDATE_OBJ
+  rgw_raw_obj obj;
+  ObjectCacheInfo obj_info;
+  off_t ofs;
+  std::string ns;
+
+  RGWCacheNotifyInfo() : op(0), ofs(0) {}
+
+  // versioned on-wire encoding -- do not reorder fields
+  void encode(bufferlist& obl) const {
+    ENCODE_START(2, 2, obl);
+    encode(op, obl);
+    encode(obj, obl);
+    encode(obj_info, obl);
+    encode(ofs, obl);
+    encode(ns, obl);
+    ENCODE_FINISH(obl);
+  }
+  void decode(bufferlist::const_iterator& ibl) {
+    DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, ibl);
+    decode(op, ibl);
+    decode(obj, ibl);
+    decode(obj_info, ibl);
+    decode(ofs, ibl);
+    decode(ns, ibl);
+    DECODE_FINISH(ibl);
+  }
+  void dump(Formatter *f) const;
+  static void generate_test_instances(std::list<RGWCacheNotifyInfo*>& o);
+};
+WRITE_CLASS_ENCODER(RGWCacheNotifyInfo)
+// Human-readable rendering for logs.
+inline std::ostream& operator <<(std::ostream& m, const RGWCacheNotifyInfo& cni) {
+  // add the ": " separators that were missing after "ofs" and "ns"
+  return m << "[op: " << cni.op << ", obj: " << cni.obj
+           << ", ofs: " << cni.ofs << ", ns: " << cni.ns << "]";
+}
+
+
+/* Interface for secondary caches whose contents are derived from entries in
+ * the primary ObjectCache and must be invalidated in lock-step with it. */
+class RGWChainedCache {
+public:
+  virtual ~RGWChainedCache() {}
+  // store 'data' under 'key' in the chained cache
+  virtual void chain_cb(const std::string& key, void *data) = 0;
+  virtual void invalidate(const std::string& key) = 0;
+  virtual void invalidate_all() = 0;
+  // called when this cache is detached from the ObjectCache
+  virtual void unregistered() {}
+
+  struct Entry {
+    RGWChainedCache *cache;
+    // NOTE: 'key' is a reference, not a copy -- the referenced string must
+    // outlive this Entry (entries are used transiently by chain_cache_entry)
+    const std::string& key;
+    void *data;
+
+    Entry(RGWChainedCache *_c, const std::string& _k, void *_d) : cache(_c), key(_k), data(_d) {}
+  };
+};
+
+
+// A single entry in ObjectCache: the cached payload plus LRU bookkeeping and
+// back-links to dependent chained-cache entries.
+struct ObjectCacheEntry {
+  ObjectCacheInfo info;
+  std::list<std::string>::iterator lru_iter; // position in the LRU, or end()
+  uint64_t lru_promotion_ts = 0; // lru_counter value at the last LRU touch
+  uint64_t gen = 0; // bumped on overwrite; validates rgw_cache_entry_info
+  std::vector<std::pair<RGWChainedCache *, std::string> > chained_entries;
+
+  // default member initializers replace the hand-written constructor,
+  // matching the style already used by ObjectCacheInfo above
+  ObjectCacheEntry() = default;
+};
+
+// In-memory, LRU-bounded metadata/data cache with optional time-based
+// expiry, shared/exclusive locking, and support for chained caches.
+class ObjectCache {
+  std::unordered_map<std::string, ObjectCacheEntry> cache_map;
+  std::list<std::string> lru; // front = coldest, back = most recently used
+  unsigned long lru_size; // element count of 'lru'
+  unsigned long lru_counter; // monotonic access counter for promotions
+  unsigned long lru_window; // min accesses between promotions of one entry
+  ceph::shared_mutex lock = ceph::make_shared_mutex("ObjectCache");
+  CephContext *cct;
+
+  std::vector<RGWChainedCache *> chained_cache;
+
+  bool enabled;
+  ceph::timespan expiry; // entry TTL; zero disables expiration in get()
+
+  void touch_lru(const DoutPrefixProvider *dpp, const std::string& name, ObjectCacheEntry& entry,
+                 std::list<std::string>::iterator& lru_iter);
+  void remove_lru(const std::string& name, std::list<std::string>::iterator& lru_iter);
+  void invalidate_lru(ObjectCacheEntry& entry);
+
+  void do_invalidate_all();
+
+public:
+  ObjectCache() : lru_size(0), lru_counter(0), lru_window(0), cct(NULL), enabled(false) { }
+  ~ObjectCache();
+  int get(const DoutPrefixProvider *dpp, const std::string& name, ObjectCacheInfo& bl, uint32_t mask, rgw_cache_entry_info *cache_info);
+  // convenience overload: nullopt on any failure, no mask filtering
+  std::optional<ObjectCacheInfo> get(const DoutPrefixProvider *dpp, const std::string& name) {
+    std::optional<ObjectCacheInfo> info{std::in_place};
+    auto r = get(dpp, name, *info, 0, nullptr);
+    return r < 0 ? std::nullopt : info;
+  }
+
+  // Invoke f(name, entry) on every non-expired entry, under the shared lock.
+  // NOTE(review): when expiry is zero the condition below is false for every
+  // entry and f is never called -- confirm that is intentional.
+  template<typename F>
+  void for_each(const F& f) {
+    std::shared_lock l{lock};
+    if (enabled) {
+      auto now = ceph::coarse_mono_clock::now();
+      for (const auto& [name, entry] : cache_map) {
+        if (expiry.count() && (now - entry.info.time_added) < expiry) {
+          f(name, entry);
+        }
+      }
+    }
+  }
+
+  void put(const DoutPrefixProvider *dpp, const std::string& name, ObjectCacheInfo& bl, rgw_cache_entry_info *cache_info);
+  bool invalidate_remove(const DoutPrefixProvider *dpp, const std::string& name);
+  // inject the CephContext and derive LRU/expiry tuning from its config
+  void set_ctx(CephContext *_cct) {
+    cct = _cct;
+    lru_window = cct->_conf->rgw_cache_lru_size / 2;
+    expiry = std::chrono::seconds(cct->_conf.get_val<uint64_t>(
+            "rgw_cache_expiry_interval"));
+  }
+  bool chain_cache_entry(const DoutPrefixProvider *dpp,
+                         std::initializer_list<rgw_cache_entry_info*> cache_info_entries,
+                         RGWChainedCache::Entry *chained_entry);
+
+  void set_enabled(bool status);
+
+  void chain_cache(RGWChainedCache *cache);
+  void unchain_cache(RGWChainedCache *cache);
+  void invalidate_all();
+};
diff --git a/src/rgw/rgw_client_io.cc b/src/rgw/rgw_client_io.cc
new file mode 100644
index 000000000..ed0925093
--- /dev/null
+++ b/src/rgw/rgw_client_io.cc
@@ -0,0 +1,34 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+
+#include "rgw_client_io.h"
+#include "rgw_crypt.h"
+#include "rgw_crypt_sanitize.h"
+#define dout_subsys ceph_subsys_rgw
+
+namespace rgw {
+namespace io {
+
+// Initialize the client: build the request environment via init_env() and,
+// at rgw debug level >= 20, log each (sanitized) environment variable.
+// Returns 0 on success or init_env()'s error code.
+[[nodiscard]] int BasicClient::init(CephContext *cct) {
+  const int ret = init_env(cct);
+  if (ret != 0) {
+    return ret;
+  }
+
+  if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
+    for (const auto& [key, value] : get_env().get_map()) {
+      // crypt_sanitize::env redacts sensitive values before logging
+      rgw::crypt_sanitize::env sanitized{key, value};
+      ldout(cct, 20) << key << "=" << (sanitized) << dendl;
+    }
+  }
+  return 0;
+}
+
+} /* namespace io */
+} /* namespace rgw */
diff --git a/src/rgw/rgw_client_io.h b/src/rgw/rgw_client_io.h
new file mode 100644
index 000000000..aedfe4500
--- /dev/null
+++ b/src/rgw/rgw_client_io.h
@@ -0,0 +1,435 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <exception>
+#include <string>
+#include <string_view>
+#include <streambuf>
+#include <istream>
+#include <stdlib.h>
+#include <system_error>
+
+#include "include/types.h"
+#include "rgw_common.h"
+
+
+class RGWRestfulIO;
+
+namespace rgw {
+namespace io {
+
+using Exception = std::system_error;
+
+/* The minimal and simplest subset of methods that a client of RadosGW can be
+ * interacted with. */
+class BasicClient {
+protected:
+  /* Populate the request environment from the transport. Returns 0 on
+   * success; a non-zero error code aborts init(). */
+  virtual int init_env(CephContext *cct) = 0;
+
+public:
+  virtual ~BasicClient() = default;
+
+  /* Initialize the BasicClient and inject CephContext. */
+  int init(CephContext *cct);
+
+  /* Return the RGWEnv describing the environment that a given request lives in.
+   * The method does not throw exceptions. */
+  virtual RGWEnv& get_env() noexcept = 0;
+
+  /* Complete request.
+   * On success returns number of bytes generated for a direct client of RadosGW.
+   * On failure throws rgw::io::Exception containing errno. */
+  virtual size_t complete_request() = 0;
+}; /* rgw::io::Client */
+
+
+/* Byte-accounting interface: tracks data sent to / received from the direct
+ * client while accounting is enabled. */
+class Accounter {
+public:
+  virtual ~Accounter() = default;
+
+  /* Enable or disable the accounting of both sent and received data. Changing
+   * the state does not affect the counters. */
+  virtual void set_account(bool enabled) = 0;
+
+  /* Return number of bytes sent to a direct client of RadosGW (direct means
+   * eg. a web server instance in the case of using FastCGI front-end) when
+   * the accounting was enabled. */
+  virtual uint64_t get_bytes_sent() const = 0;
+
+  /* Return number of bytes received from a direct client of RadosGW (direct
+   * means eg. a web server instance in the case of using FastCGI front-end)
+   * when the accounting was enabled. */
+  virtual uint64_t get_bytes_received() const = 0;
+}; /* rgw::io::Accounter */
+
+
+/* Interface abstracting restful interactions with clients, usually through
+ * the HTTP protocol. The methods participating in the response generation
+ * process should be called in the specific order:
+ * 1. send_100_continue() - at most once,
+ * 2. send_status() - exactly once,
+ * 3. Any of:
+ *    a. send_header(),
+ *    b. send_content_length() XOR send_chunked_transfer_encoding()
+ *       Please note that exactly one of these two methods must be called,
+ *       and no more than once.
+ * 4. complete_header() - exactly once,
+ * 5. send_body()
+ * 6. complete_request() - exactly once.
+ * There are no restrictions on flush() - it may be called in any moment.
+ *
+ * Receiving data from a client isn't a subject to any further call order
+ * restrictions besides those imposed by BasicClient. That is, get_env()
+ * and recv_body can be mixed. */
+class RestfulClient : public BasicClient {
+  template<typename T> friend class DecoratedRestfulClient;
+
+public:
+  /* Generate the 100 Continue message.
+   * On success returns number of bytes generated for a direct client of RadosGW.
+   * On failure throws rgw::io::Exception containing errno. */
+  virtual size_t send_100_continue() = 0;
+
+  /* Generate the response's status part taking the HTTP status code as @status
+   * and its name pointed in @status_name.
+   * On success returns number of bytes generated for a direct client of RadosGW.
+   * On failure throws rgw::io::Exception containing errno. */
+  virtual size_t send_status(int status, const char *status_name) = 0;
+
+  /* Generate header. On success returns number of bytes generated for a direct
+   * client of RadosGW. On failure throws rgw::io::Exception containing errno.
+   *
+   * std::string_view is being used because of length it internally carries. */
+  virtual size_t send_header(const std::string_view& name,
+                             const std::string_view& value) = 0;
+
+  /* Inform a client about a content length. Takes number of bytes as @len.
+   * On success returns number of bytes generated for a direct client of RadosGW.
+   * On failure throws rgw::io::Exception containing errno.
+   *
+   * CALL LIMITATIONS:
+   *  - The method must be called EXACTLY ONCE.
+   *  - The method is interchangeable with send_chunked_transfer_encoding(). */
+  virtual size_t send_content_length(uint64_t len) = 0;
+
+  /* Inform a client that the chunked transfer encoding will be used.
+   * On success returns number of bytes generated for a direct client of RadosGW.
+   * On failure throws rgw::io::Exception containing errno.
+   *
+   * CALL LIMITATIONS:
+   *  - The method must be called EXACTLY ONCE.
+   *  - The method is interchangeable with send_content_length(). */
+  virtual size_t send_chunked_transfer_encoding() {
+    /* This is a null implementation. We don't send anything here, even the HTTP
+     * header. The intended behaviour should be provided through a decorator or
+     * directly by a given front-end. */
+    return 0;
+  }
+
+  /* Generate completion (the CRLF sequence separating headers and body in
+   * the case of HTTP) of headers. On success returns number of generated bytes
+   * for a direct client of RadosGW. On failure throws rgw::io::Exception with
+   * errno. */
+  virtual size_t complete_header() = 0;
+
+  /* Receive no more than @max bytes from a request's body and store it in
+   * buffer pointed by @buf. On success returns number of bytes received from
+   * a direct client of RadosGW that has been stored in @buf. On failure throws
+   * rgw::io::Exception containing errno. */
+  virtual size_t recv_body(char* buf, size_t max) = 0;
+
+  /* Generate a part of response's body by taking exactly @len bytes from
+   * the buffer pointed by @buf. On success returns number of generated bytes
+   * of response's body. On failure throws rgw::io::Exception. */
+  virtual size_t send_body(const char* buf, size_t len) = 0;
+
+  /* Flushes all already generated data to a direct client of RadosGW.
+   * On failure throws rgw::io::Exception containing errno. */
+  virtual void flush() = 0;
+} /* rgw::io::RestfulClient */;
+
+
+/* Abstract decorator over any implementation of rgw::io::RestfulClient
+ * which could be provided both as a pointer-to-object or the object itself.
+ * Every RestfulClient method simply forwards to the decoratee; concrete
+ * filters override the calls they want to intercept. */
+template <typename DecorateeT>
+class DecoratedRestfulClient : public RestfulClient {
+  template<typename T> friend class DecoratedRestfulClient;
+  friend RGWRestfulIO;
+
+  typedef typename std::remove_pointer<DecorateeT>::type DerefedDecorateeT;
+
+  static_assert(std::is_base_of<RestfulClient, DerefedDecorateeT>::value,
+                "DecorateeT must be a subclass of rgw::io::RestfulClient");
+
+  DecorateeT decoratee;
+
+  /* There is an indirection layer over accessing decoratee to share the same
+   * code base between dynamic and static decorators. The difference is about
+   * what we store internally: pointer to a decorated object versus the whole
+   * object itself. */
+  // overload selected when DecorateeT is stored by value (static decorator)
+  template <typename T = void,
+            typename std::enable_if<
+    ! std::is_pointer<DecorateeT>::value, T>::type* = nullptr>
+  DerefedDecorateeT& get_decoratee() {
+    return decoratee;
+  }
+
+protected:
+  // overload selected when DecorateeT is a pointer (dynamic decorator)
+  template <typename T = void,
+            typename std::enable_if<
+    std::is_pointer<DecorateeT>::value, T>::type* = nullptr>
+  DerefedDecorateeT& get_decoratee() {
+    return *decoratee;
+  }
+
+  /* Dynamic decorators (those storing a pointer instead of the decorated
+   * object itself) can be reconfigured on-the-fly. HOWEVER: there are no
+   * facilities for orchestrating such changes. Callers must take care of
+   * atomicity and thread-safety. */
+  template <typename T = void,
+            typename std::enable_if<
+    std::is_pointer<DecorateeT>::value, T>::type* = nullptr>
+  void set_decoratee(DerefedDecorateeT& new_dec) {
+    decoratee = &new_dec;
+  }
+
+  int init_env(CephContext *cct) override {
+    return get_decoratee().init_env(cct);
+  }
+
+public:
+  explicit DecoratedRestfulClient(DecorateeT&& decoratee)
+    : decoratee(std::forward<DecorateeT>(decoratee)) {
+  }
+
+  size_t send_status(const int status,
+                     const char* const status_name) override {
+    return get_decoratee().send_status(status, status_name);
+  }
+
+  size_t send_100_continue() override {
+    return get_decoratee().send_100_continue();
+  }
+
+  size_t send_header(const std::string_view& name,
+                     const std::string_view& value) override {
+    return get_decoratee().send_header(name, value);
+  }
+
+  size_t send_content_length(const uint64_t len) override {
+    return get_decoratee().send_content_length(len);
+  }
+
+  size_t send_chunked_transfer_encoding() override {
+    return get_decoratee().send_chunked_transfer_encoding();
+  }
+
+  size_t complete_header() override {
+    return get_decoratee().complete_header();
+  }
+
+  size_t recv_body(char* const buf, const size_t max) override {
+    return get_decoratee().recv_body(buf, max);
+  }
+
+  size_t send_body(const char* const buf,
+                   const size_t len) override {
+    return get_decoratee().send_body(buf, len);
+  }
+
+  void flush() override {
+    return get_decoratee().flush();
+  }
+
+  RGWEnv& get_env() noexcept override {
+    return get_decoratee().get_env();
+  }
+
+  size_t complete_request() override {
+    return get_decoratee().complete_request();
+  }
+} /* rgw::io::DecoratedRestfulClient */;
+
+
+/* Interface that should be provided by a front-end class wanting to use
+ * the low-level buffering offered by i.e. StaticOutputBufferer. */
+class BuffererSink {
+public:
+  virtual ~BuffererSink() = default;
+
+  /* Send exactly @len bytes from the memory location pointed by @buf.
+   * On success returns @len. On failure throws rgw::io::Exception --
+   * implementations must not return a short write. */
+  virtual size_t write_data(const char *buf, size_t len) = 0;
+};
+
+/* Utility class providing RestfulClient's implementations with facilities
+ * for low-level buffering without relying on dynamic memory allocations.
+ * The buffer is carried entirely on stack. This narrows down applicability
+ * to these situations where buffers are relatively small. This perfectly
+ * fits the needs of composing an HTTP header. Without that a front-end
+ * might need to issue a lot of small IO operations leading to increased
+ * overhead on syscalls and fragmentation of a message if the Nagle's
+ * algorithm won't be able to form a single TCP segment (usually when
+ * running on extremely fast network interfaces like the loopback). */
+template <size_t BufferSizeV = 4096>
+class StaticOutputBufferer : public std::streambuf {
+  static_assert(BufferSizeV >= sizeof(std::streambuf::char_type),
+                "Buffer size must be bigger than a single char_type.");
+
+  using std::streambuf::int_type;
+
+  int_type overflow(const int_type c) override {
+    /* overflow() may legally be called with eof() just to flush; the
+     * previous code unconditionally stored 'c', writing eof() into the
+     * buffer. Flush and report success/failure instead. */
+    if (traits_type::eq_int_type(c, traits_type::eof())) {
+      return sync() == 0 ? traits_type::not_eof(c)
+                         : traits_type::eof();
+    }
+
+    /* One slot past epptr() is reserved by the constructor, so storing the
+     * overflowing character is always safe here. */
+    *pptr() = traits_type::to_char_type(c);
+    pbump(sizeof(std::streambuf::char_type));
+
+    if (! sync()) {
+      /* No error, the buffer has been successfully synchronized. */
+      return c;
+    } else {
+      return std::streambuf::traits_type::eof();
+    }
+  }
+
+  int sync() override {
+    const auto len = static_cast<size_t>(std::streambuf::pptr() -
+                                         std::streambuf::pbase());
+    /* Negate through a signed type: negating size_t directly would wrap to
+     * a huge unsigned value before the implicit conversion to pbump()'s int
+     * parameter. */
+    std::streambuf::pbump(-static_cast<int>(len));
+    sink.write_data(std::streambuf::pbase(), len);
+    /* Always return success here. In case of failure write_data() will throw
+     * rgw::io::Exception. */
+    return 0;
+  }
+
+  BuffererSink& sink;
+  std::streambuf::char_type buffer[BufferSizeV];
+
+public:
+  explicit StaticOutputBufferer(BuffererSink& sink)
+    : sink(sink) {
+    /* Reserve one char past epptr() so overflow() can always store the
+     * character that triggered it before flushing. */
+    constexpr size_t len = sizeof(buffer) - sizeof(std::streambuf::char_type);
+    std::streambuf::setp(buffer, buffer + len);
+  }
+};
+
+} /* namespace io */
+} /* namespace rgw */
+
+
+/* We're doing this nasty thing only because of extensive usage of templates
+ * to implement the static decorator pattern. C++ templates de facto enforce
+ * mixing interfaces with implementation. Additionally, those classes derive
+ * from RGWRestfulIO defined here. I believe that including in the middle of
+ * file is still better than polluting it directly. */
+#include "rgw_client_io_filters.h"
+
+
+/* RGWRestfulIO: high level interface to interact with RESTful clients. What
+ * differentiates it from rgw::io::RestfulClient is providing more specific APIs
+ * like rgw::io::Accounter or the AWS Auth v4 stuff implemented by filters
+ * while hiding the pipelined architecture from clients.
+ *
+ * rgw::io::Accounter came in as a part of rgw::io::AccountingFilter. */
+class RGWRestfulIO : public rgw::io::AccountingFilter<rgw::io::RestfulClient*> {
+  // owns the added filters; the decoration chain itself is linked through
+  // set_decoratee() pointers in add_filter()
+  std::vector<std::shared_ptr<DecoratedRestfulClient>> filters;
+
+public:
+  ~RGWRestfulIO() override = default;
+
+  RGWRestfulIO(CephContext *_cx, rgw::io::RestfulClient* engine)
+    : AccountingFilter<rgw::io::RestfulClient*>(_cx, std::move(engine)) {
+  }
+
+  // Insert 'new_filter' into the chain: it wraps the current decoratee and
+  // becomes the new innermost layer below this object.
+  void add_filter(std::shared_ptr<DecoratedRestfulClient> new_filter) {
+    new_filter->set_decoratee(this->get_decoratee());
+    this->set_decoratee(*new_filter);
+    filters.emplace_back(std::move(new_filter));
+  }
+}; /* RGWRestfulIO */
+
+
+/* Type conversions to work around lack of req_state type hierarchy matching
+ * (e.g.) REST backends (may be replaced w/dynamic typed req_state). */
+static inline rgw::io::RestfulClient* RESTFUL_IO(req_state* s) {
+  // the dynamic_cast is assertion-only; the returned cast is static
+  ceph_assert(dynamic_cast<rgw::io::RestfulClient*>(s->cio) != nullptr);
+
+  return static_cast<rgw::io::RestfulClient*>(s->cio);
+}
+
+static inline rgw::io::Accounter* ACCOUNTING_IO(req_state* s) {
+  // cross-cast to the Accounter interface requires dynamic_cast
+  auto ptr = dynamic_cast<rgw::io::Accounter*>(s->cio);
+  ceph_assert(ptr != nullptr);
+
+  return ptr;
+}
+
+static inline RGWRestfulIO* AWS_AUTHv4_IO(const req_state* const s) {
+  // the dynamic_cast is assertion-only; the returned cast is static
+  ceph_assert(dynamic_cast<RGWRestfulIO*>(s->cio) != nullptr);
+
+  return static_cast<RGWRestfulIO*>(s->cio);
+}
+
+
+// std::streambuf adapter over RGWRestfulIO::recv_body(), allowing the
+// request body to be consumed through a std::istream. Keeps 'putback_size'
+// bytes of already-read data for unget()-style putback.
+class RGWClientIOStreamBuf : public std::streambuf {
+protected:
+  RGWRestfulIO &rio;
+  size_t const window_size;  // max bytes requested per recv_body() call
+  size_t const putback_size; // bytes preserved for putback
+  std::vector<char> buffer;  // window_size + putback_size bytes
+
+public:
+  RGWClientIOStreamBuf(RGWRestfulIO &rio, size_t ws, size_t ps = 1)
+    : rio(rio),
+      window_size(ws),
+      putback_size(ps),
+      buffer(ws + ps)
+  {
+    // empty get area: first read triggers underflow()
+    setg(nullptr, nullptr, nullptr);
+  }
+
+  std::streambuf::int_type underflow() override {
+    if (gptr() < egptr()) {
+      return traits_type::to_int_type(*gptr());
+    }
+
+    char * const base = buffer.data();
+    char * start;
+
+    if (nullptr != eback()) {
+      /* We need to skip moving bytes on first underflow. In such case
+       * there is simply no previous data we should preserve for unget()
+       * or something similar. */
+      std::memmove(base, egptr() - putback_size, putback_size);
+      start = base + putback_size;
+    } else {
+      start = base;
+    }
+
+    size_t read_len = 0;
+    try {
+      // NOTE(review): new data is received into 'base', which overwrites the
+      // putback bytes just moved there, yet gptr() is set to 'start' below
+      // (base + putback_size) -- confirm this offset handling is intended.
+      read_len = rio.recv_body(base, window_size);
+    } catch (rgw::io::Exception&) {
+      // any transport error is surfaced to the istream as end-of-stream
+      return traits_type::eof();
+    }
+    if (0 == read_len) {
+      return traits_type::eof();
+    }
+
+    setg(base, start, start + read_len);
+
+    return traits_type::to_int_type(*gptr());
+  }
+};
+
+// std::istream over a request body, backed by RGWClientIOStreamBuf.
+class RGWClientIOStream : private RGWClientIOStreamBuf, public std::istream {
+/* Inheritance from RGWClientIOStreamBuf is a kind of shadow, undirect
+ * form of composition here. We cannot do that explicitly because istream
+ * ctor is being called prior to construction of any member of this class. */
+
+public:
+  // window of 1 byte with 2 putback bytes: at most one byte is requested
+  // from recv_body() per underflow
+  explicit RGWClientIOStream(RGWRestfulIO &s)
+    : RGWClientIOStreamBuf(s, 1, 2),
+      std::istream(static_cast<RGWClientIOStreamBuf *>(this)) {
+  }
+};
diff --git a/src/rgw/rgw_client_io_filters.h b/src/rgw/rgw_client_io_filters.h
new file mode 100644
index 000000000..55d405e1b
--- /dev/null
+++ b/src/rgw/rgw_client_io_filters.h
@@ -0,0 +1,454 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <type_traits>
+
+#include <boost/optional.hpp>
+
+#include "rgw_common.h"
+#include "rgw_client_io.h"
+
+namespace rgw {
+namespace io {
+
/* Decorator that counts the bytes flowing through a RESTful client IO
 * object. Counting is disabled until set_account(true); while enabled,
 * every send_*() adds to total_sent and recv_body() adds to
 * total_received. The totals are exposed via the Accounter interface. */
template <typename T>
class AccountingFilter : public DecoratedRestfulClient<T>,
                         public Accounter {
  bool enabled;            // accumulate only while true (see set_account)
  uint64_t total_sent;     // bytes written to the client so far
  uint64_t total_received; // request-body bytes read so far
  CephContext *cct;

public:
  template <typename U>
  AccountingFilter(CephContext *cct, U&& decoratee)
    : DecoratedRestfulClient<T>(std::forward<U>(decoratee)),
      enabled(false),
      total_sent(0),
      total_received(0), cct(cct) {
  }

  /* Each override below forwards to the decoratee, logs at level 30, and
   * accumulates the reported byte count when accounting is enabled. */
  size_t send_status(const int status,
                     const char* const status_name) override {
    const auto sent = DecoratedRestfulClient<T>::send_status(status,
                                                             status_name);
    lsubdout(cct, rgw, 30) << "AccountingFilter::send_status: e="
        << (enabled ? "1" : "0") << ", sent=" << sent << ", total="
        << total_sent << dendl;
    if (enabled) {
      total_sent += sent;
    }
    return sent;
  }

  size_t send_100_continue() override {
    const auto sent = DecoratedRestfulClient<T>::send_100_continue();
    lsubdout(cct, rgw, 30) << "AccountingFilter::send_100_continue: e="
        << (enabled ? "1" : "0") << ", sent=" << sent << ", total="
        << total_sent << dendl;
    if (enabled) {
      total_sent += sent;
    }
    return sent;
  }

  size_t send_header(const std::string_view& name,
                     const std::string_view& value) override {
    const auto sent = DecoratedRestfulClient<T>::send_header(name, value);
    lsubdout(cct, rgw, 30) << "AccountingFilter::send_header: e="
        << (enabled ? "1" : "0") << ", sent=" << sent << ", total="
        << total_sent << dendl;
    if (enabled) {
      total_sent += sent;
    }
    return sent;
  }

  size_t send_content_length(const uint64_t len) override {
    const auto sent = DecoratedRestfulClient<T>::send_content_length(len);
    lsubdout(cct, rgw, 30) << "AccountingFilter::send_content_length: e="
        << (enabled ? "1" : "0") << ", sent=" << sent << ", total="
        << total_sent << dendl;
    if (enabled) {
      total_sent += sent;
    }
    return sent;
  }

  size_t send_chunked_transfer_encoding() override {
    const auto sent = DecoratedRestfulClient<T>::send_chunked_transfer_encoding();
    lsubdout(cct, rgw, 30) << "AccountingFilter::send_chunked_transfer_encoding: e="
        << (enabled ? "1" : "0") << ", sent=" << sent << ", total="
        << total_sent << dendl;
    if (enabled) {
      total_sent += sent;
    }
    return sent;
  }

  size_t complete_header() override {
    const auto sent = DecoratedRestfulClient<T>::complete_header();
    lsubdout(cct, rgw, 30) << "AccountingFilter::complete_header: e="
        << (enabled ? "1" : "0") << ", sent=" << sent << ", total="
        << total_sent << dendl;
    if (enabled) {
      total_sent += sent;
    }
    return sent;
  }

  size_t recv_body(char* buf, size_t max) override {
    const auto received = DecoratedRestfulClient<T>::recv_body(buf, max);
    lsubdout(cct, rgw, 30) << "AccountingFilter::recv_body: e="
        << (enabled ? "1" : "0") << ", received=" << received << dendl;
    if (enabled) {
      total_received += received;
    }
    return received;
  }

  size_t send_body(const char* const buf,
                   const size_t len) override {
    const auto sent = DecoratedRestfulClient<T>::send_body(buf, len);
    lsubdout(cct, rgw, 30) << "AccountingFilter::send_body: e="
        << (enabled ? "1" : "0") << ", sent=" << sent << ", total="
        << total_sent << dendl;
    if (enabled) {
      total_sent += sent;
    }
    return sent;
  }

  size_t complete_request() override {
    const auto sent = DecoratedRestfulClient<T>::complete_request();
    lsubdout(cct, rgw, 30) << "AccountingFilter::complete_request: e="
        << (enabled ? "1" : "0") << ", sent=" << sent << ", total="
        << total_sent << dendl;
    if (enabled) {
      total_sent += sent;
    }
    return sent;
  }

  uint64_t get_bytes_sent() const override {
    return total_sent;
  }

  uint64_t get_bytes_received() const override {
    return total_received;
  }

  /* Toggle accounting; bytes moved while disabled are never counted. */
  void set_account(bool enabled) override {
    this->enabled = enabled;
    lsubdout(cct, rgw, 30) << "AccountingFilter::set_account: e="
        << (enabled ? "1" : "0") << dendl;
  }
};
+
+
/* Filter for in-memory buffering incoming data and calculating the content
 * length header if it isn't present. The body is withheld in `data` until
 * complete_request(), at which point the length is known and sent. */
template <typename T>
class BufferingFilter : public DecoratedRestfulClient<T> {
  template<typename Td> friend class DecoratedRestfulClient;
protected:
  ceph::bufferlist data;    // body bytes deferred until complete_request()

  bool has_content_length;  // a length (or chunked TE) was already emitted
  bool buffer_data;         // true while send_body() must buffer, not forward
  CephContext *cct;

public:
  template <typename U>
  BufferingFilter(CephContext *cct, U&& decoratee)
    : DecoratedRestfulClient<T>(std::forward<U>(decoratee)),
      has_content_length(false),
      buffer_data(false), cct(cct) {
  }

  size_t send_content_length(const uint64_t len) override;
  size_t send_chunked_transfer_encoding() override;
  size_t complete_header() override;
  size_t send_body(const char* buf, size_t len) override;
  size_t complete_request() override;
};
+
template <typename T>
size_t BufferingFilter<T>::send_body(const char* const buf,
                                     const size_t len)
{
  if (buffer_data) {
    /* Headers are not finalized (no Content-Length known yet): stash the
     * payload and report 0 bytes sent; it is flushed in complete_request(). */
    data.append(buf, len);

    lsubdout(cct, rgw, 30) << "BufferingFilter<T>::send_body: defer count = "
        << len << dendl;
    return 0;
  }

  return DecoratedRestfulClient<T>::send_body(buf, len);
}

template <typename T>
size_t BufferingFilter<T>::send_content_length(const uint64_t len)
{
  /* The caller supplied a length, so no buffering will be needed. */
  has_content_length = true;
  return DecoratedRestfulClient<T>::send_content_length(len);
}

template <typename T>
size_t BufferingFilter<T>::send_chunked_transfer_encoding()
{
  /* Chunked transfer encoding also removes the need to buffer. */
  has_content_length = true;
  return DecoratedRestfulClient<T>::send_chunked_transfer_encoding();
}

template <typename T>
size_t BufferingFilter<T>::complete_header()
{
  if (! has_content_length) {
    /* We will dump everything in complete_request(). */
    buffer_data = true;
    lsubdout(cct, rgw, 30) << "BufferingFilter<T>::complete_header: has_content_length="
        << (has_content_length ? "1" : "0") << dendl;
    return 0;
  }

  return DecoratedRestfulClient<T>::complete_header();
}

template <typename T>
size_t BufferingFilter<T>::complete_request()
{
  size_t sent = 0;

  if (! has_content_length) {
    /* It is not correct to count these bytes here,
     * because they can only be part of the header.
     * Therefore force count to 0.
     */
    sent += DecoratedRestfulClient<T>::send_content_length(data.length());
    sent += DecoratedRestfulClient<T>::complete_header();
    lsubdout(cct, rgw, 30) <<
      "BufferingFilter::complete_request: !has_content_length: IGNORE: sent="
        << sent << dendl;
    sent = 0;
  }

  if (buffer_data) {
    /* We are sending each buffer separately to avoid extra memory shuffling
     * that would occur on data.c_str() to provide a continuous memory area. */
    for (const auto& ptr : data.buffers()) {
      sent += DecoratedRestfulClient<T>::send_body(ptr.c_str(),
                                                   ptr.length());
    }
    data.clear();
    buffer_data = false;
    lsubdout(cct, rgw, 30) << "BufferingFilter::complete_request: buffer_data: sent="
        << sent << dendl;
  }

  return sent + DecoratedRestfulClient<T>::complete_request();
}

/* Convenience factory: wrap t in a BufferingFilter, deducing T. */
template <typename T> static inline
BufferingFilter<T> add_buffering(
CephContext *cct,
T&& t) {
  return BufferingFilter<T>(cct, std::forward<T>(t));
}
+
+
/* Decorator that applies HTTP/1.1 chunked transfer encoding to the body
 * once send_chunked_transfer_encoding() has been called; before that it is
 * a transparent pass-through. */
template <typename T>
class ChunkingFilter : public DecoratedRestfulClient<T> {
  template<typename Td> friend class DecoratedRestfulClient;
protected:
  bool chunking_enabled;  // set once the chunked TE header was emitted

public:
  template <typename U>
  explicit ChunkingFilter(U&& decoratee)
    : DecoratedRestfulClient<T>(std::forward<U>(decoratee)),
      chunking_enabled(false) {
  }

  size_t send_chunked_transfer_encoding() override {
    chunking_enabled = true;
    return DecoratedRestfulClient<T>::send_header("Transfer-Encoding",
                                                  "chunked");
  }

  size_t send_body(const char* buf,
                   const size_t len) override {
    if (! chunking_enabled) {
      return DecoratedRestfulClient<T>::send_body(buf, len);
    } else {
      static constexpr char HEADER_END[] = "\r\n";
      /* https://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.6.1 */
      // TODO: we have no support for sending chunked-encoding
      // extensions/trailing headers.
      /* Chunk framing: "<hex size>\r\n" + payload + "\r\n". */
      char chunk_size[32];
      const auto chunk_size_len = snprintf(chunk_size, sizeof(chunk_size),
                                           "%zx\r\n", len);
      size_t sent = 0;

      sent += DecoratedRestfulClient<T>::send_body(chunk_size, chunk_size_len);
      sent += DecoratedRestfulClient<T>::send_body(buf, len);
      sent += DecoratedRestfulClient<T>::send_body(HEADER_END,
                                                   sizeof(HEADER_END) - 1);
      return sent;
    }
  }

  size_t complete_request() override {
    size_t sent = 0;

    if (chunking_enabled) {
      /* Terminating zero-length chunk, no trailers. */
      static constexpr char CHUNKED_RESP_END[] = "0\r\n\r\n";
      sent += DecoratedRestfulClient<T>::send_body(CHUNKED_RESP_END,
                                                   sizeof(CHUNKED_RESP_END) - 1);
    }

    return sent + DecoratedRestfulClient<T>::complete_request();
  }
};

/* Convenience factory: wrap t in a ChunkingFilter, deducing T. */
template <typename T> static inline
ChunkingFilter<T> add_chunking(T&& t) {
  return ChunkingFilter<T>(std::forward<T>(t));
}
+
+
/* Class that controls and inhibits the process of sending Content-Length HTTP
 * header where RFC 7230 requests so. The cases worth our attention are 204 No
 * Content as well as 304 Not Modified. */
template <typename T>
class ConLenControllingFilter : public DecoratedRestfulClient<T> {
protected:
  enum class ContentLengthAction {
    FORWARD,  // pass Content-Length through
    INHIBIT,  // swallow it (204/304 per RFC 7230)
    UNKNOWN   // send_status() not seen yet
  } action;

public:
  template <typename U>
  explicit ConLenControllingFilter(U&& decoratee)
    : DecoratedRestfulClient<T>(std::forward<U>(decoratee)),
      action(ContentLengthAction::UNKNOWN) {
  }

  size_t send_status(const int status,
                     const char* const status_name) override {
    /* Decide, from the status code, whether a later Content-Length header
     * must be suppressed; rgw_print_prohibited_content_length overrides. */
    if ((204 == status || 304 == status) &&
        ! g_conf()->rgw_print_prohibited_content_length) {
      action = ContentLengthAction::INHIBIT;
    } else {
      action = ContentLengthAction::FORWARD;
    }

    return DecoratedRestfulClient<T>::send_status(status, status_name);
  }

  size_t send_content_length(const uint64_t len) override {
    switch(action) {
    case ContentLengthAction::FORWARD:
      return DecoratedRestfulClient<T>::send_content_length(len);
    case ContentLengthAction::INHIBIT:
      return 0;
    case ContentLengthAction::UNKNOWN:
    default:
      /* NOTE(review): send_content_length() before send_status(); the
       * negative errno is returned through a size_t and wraps to a huge
       * value — confirm callers check for this. */
      return -EINVAL;
    }
  }
};

/* Convenience factory: wrap t in a ConLenControllingFilter, deducing T. */
template <typename T> static inline
ConLenControllingFilter<T> add_conlen_controlling(T&& t) {
  return ConLenControllingFilter<T>(std::forward<T>(t));
}
+
+
/* Filter that rectifies the wrong behaviour of some clients of the RGWRestfulIO
 * interface. Should be removed after fixing those clients. Headers (and a
 * Content-Length) sent before send_status() are queued and replayed in the
 * proper order at complete_header(). */
template <typename T>
class ReorderingFilter : public DecoratedRestfulClient<T> {
protected:
  enum class ReorderState {
    RGW_EARLY_HEADERS,  /* Got headers sent before calling send_status. */
    RGW_STATUS_SEEN,    /* Status has been seen. */
    RGW_DATA            /* Header has been completed. */
  } phase;

  boost::optional<uint64_t> content_length;  // deferred Content-Length, if any

  std::vector<std::pair<std::string, std::string>> headers;  // deferred headers

  size_t send_header(const std::string_view& name,
                     const std::string_view& value) override {
    switch (phase) {
    case ReorderState::RGW_EARLY_HEADERS:
    case ReorderState::RGW_STATUS_SEEN:
      /* Too early to emit: queue until complete_header(). */
      headers.emplace_back(std::make_pair(std::string(name.data(), name.size()),
                                          std::string(value.data(), value.size())));
      return 0;
    case ReorderState::RGW_DATA:
      return DecoratedRestfulClient<T>::send_header(name, value);
    }

    /* NOTE(review): unreachable with the enum above; -EIO returned through
     * a size_t would wrap — kept as defensive fallthrough. */
    return -EIO;
  }

public:
  template <typename U>
  explicit ReorderingFilter(U&& decoratee)
    : DecoratedRestfulClient<T>(std::forward<U>(decoratee)),
      phase(ReorderState::RGW_EARLY_HEADERS) {
  }

  size_t send_status(const int status,
                     const char* const status_name) override {
    phase = ReorderState::RGW_STATUS_SEEN;

    return DecoratedRestfulClient<T>::send_status(status, status_name);
  }

  size_t send_content_length(const uint64_t len) override {
    if (ReorderState::RGW_EARLY_HEADERS == phase) {
      /* Oh great, someone tries to send content length before status. */
      content_length = len;
      return 0;
    } else {
      return DecoratedRestfulClient<T>::send_content_length(len);
    }
  }

  size_t complete_header() override {
    size_t sent = 0;

    /* Change state in order to immediately send everything we get. */
    phase = ReorderState::RGW_DATA;

    /* Sent content length if necessary. */
    if (content_length) {
      sent += DecoratedRestfulClient<T>::send_content_length(*content_length);
    }

    /* Header data in buffers are already counted. */
    for (const auto& kv : headers) {
      sent += DecoratedRestfulClient<T>::send_header(kv.first, kv.second);
    }
    headers.clear();

    return sent + DecoratedRestfulClient<T>::complete_header();
  }
};

/* Convenience factory: wrap t in a ReorderingFilter, deducing T. */
template <typename T> static inline
ReorderingFilter<T> add_reordering(T&& t) {
  return ReorderingFilter<T>(std::forward<T>(t));
}
+
+} /* namespace io */
+} /* namespace rgw */
diff --git a/src/rgw/rgw_common.cc b/src/rgw/rgw_common.cc
new file mode 100644
index 000000000..f5d7912ea
--- /dev/null
+++ b/src/rgw/rgw_common.cc
@@ -0,0 +1,3075 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <errno.h>
+#include <vector>
+#include <algorithm>
+#include <string>
+#include <boost/tokenizer.hpp>
+
+#include "json_spirit/json_spirit.h"
+#include "common/ceph_json.h"
+#include "common/Formatter.h"
+
+#include "rgw_op.h"
+#include "rgw_common.h"
+#include "rgw_acl.h"
+#include "rgw_string.h"
+#include "rgw_http_errors.h"
+#include "rgw_arn.h"
+#include "rgw_data_sync.h"
+
+#include "global/global_init.h"
+#include "common/ceph_crypto.h"
+#include "common/armor.h"
+#include "common/errno.h"
+#include "common/Clock.h"
+#include "common/convenience.h"
+#include "common/strtol.h"
+#include "include/str_list.h"
+#include "rgw_crypt_sanitize.h"
+#include "rgw_bucket_sync.h"
+#include "rgw_sync_policy.h"
+
+#include "services/svc_zone.h"
+
+#include <sstream>
+
+#define dout_context g_ceph_context
+
+static constexpr auto dout_subsys = ceph_subsys_rgw;
+
+using rgw::ARN;
+using rgw::IAM::Effect;
+using rgw::IAM::op_to_perm;
+using rgw::IAM::Policy;
+
/* Sentinel shard count marking a "blind" bucket (no bucket index). */
const uint32_t RGWBucketInfo::NUM_SHARDS_BLIND_BUCKET(UINT32_MAX);

/* Mapping from internal errno / ERR_* codes to S3 HTTP status + error code
 * strings. Consulted last (after Swift/STS/IAM) by set_req_state_err(). */
rgw_http_errors rgw_http_s3_errors({
    { 0, {200, "" }},
    { STATUS_CREATED, {201, "Created" }},
    { STATUS_ACCEPTED, {202, "Accepted" }},
    { STATUS_NO_CONTENT, {204, "NoContent" }},
    { STATUS_PARTIAL_CONTENT, {206, "" }},
    { ERR_PERMANENT_REDIRECT, {301, "PermanentRedirect" }},
    { ERR_WEBSITE_REDIRECT, {301, "WebsiteRedirect" }},
    { STATUS_REDIRECT, {303, "" }},
    { ERR_NOT_MODIFIED, {304, "NotModified" }},
    { EINVAL, {400, "InvalidArgument" }},
    { ERR_INVALID_REQUEST, {400, "InvalidRequest" }},
    { ERR_INVALID_DIGEST, {400, "InvalidDigest" }},
    { ERR_BAD_DIGEST, {400, "BadDigest" }},
    { ERR_INVALID_LOCATION_CONSTRAINT, {400, "InvalidLocationConstraint" }},
    { ERR_ZONEGROUP_DEFAULT_PLACEMENT_MISCONFIGURATION, {400, "ZonegroupDefaultPlacementMisconfiguration" }},
    { ERR_INVALID_BUCKET_NAME, {400, "InvalidBucketName" }},
    { ERR_INVALID_OBJECT_NAME, {400, "InvalidObjectName" }},
    { ERR_UNRESOLVABLE_EMAIL, {400, "UnresolvableGrantByEmailAddress" }},
    { ERR_INVALID_PART, {400, "InvalidPart" }},
    { ERR_INVALID_PART_ORDER, {400, "InvalidPartOrder" }},
    { ERR_REQUEST_TIMEOUT, {400, "RequestTimeout" }},
    { ERR_TOO_LARGE, {400, "EntityTooLarge" }},
    { ERR_TOO_SMALL, {400, "EntityTooSmall" }},
    { ERR_TOO_MANY_BUCKETS, {400, "TooManyBuckets" }},
    { ERR_MALFORMED_XML, {400, "MalformedXML" }},
    { ERR_AMZ_CONTENT_SHA256_MISMATCH, {400, "XAmzContentSHA256Mismatch" }},
    { ERR_MALFORMED_DOC, {400, "MalformedPolicyDocument"}},
    { ERR_INVALID_TAG, {400, "InvalidTag"}},
    { ERR_MALFORMED_ACL_ERROR, {400, "MalformedACLError" }},
    { ERR_INVALID_CORS_RULES_ERROR, {400, "InvalidRequest" }},
    { ERR_INVALID_WEBSITE_ROUTING_RULES_ERROR, {400, "InvalidRequest" }},
    { ERR_INVALID_ENCRYPTION_ALGORITHM, {400, "InvalidEncryptionAlgorithmError" }},
    { ERR_INVALID_RETENTION_PERIOD,{400, "InvalidRetentionPeriod"}},
    { ERR_LIMIT_EXCEEDED, {400, "LimitExceeded" }},
    { ERR_LENGTH_REQUIRED, {411, "MissingContentLength" }},
    { EACCES, {403, "AccessDenied" }},
    { EPERM, {403, "AccessDenied" }},
    { ERR_SIGNATURE_NO_MATCH, {403, "SignatureDoesNotMatch" }},
    { ERR_INVALID_ACCESS_KEY, {403, "InvalidAccessKeyId" }},
    { ERR_USER_SUSPENDED, {403, "UserSuspended" }},
    { ERR_REQUEST_TIME_SKEWED, {403, "RequestTimeTooSkewed" }},
    { ERR_QUOTA_EXCEEDED, {403, "QuotaExceeded" }},
    { ERR_MFA_REQUIRED, {403, "AccessDenied" }},
    { ENOENT, {404, "NoSuchKey" }},
    { ERR_NO_SUCH_BUCKET, {404, "NoSuchBucket" }},
    { ERR_NO_SUCH_WEBSITE_CONFIGURATION, {404, "NoSuchWebsiteConfiguration" }},
    { ERR_NO_SUCH_UPLOAD, {404, "NoSuchUpload" }},
    { ERR_NOT_FOUND, {404, "Not Found"}},
    { ERR_NO_SUCH_LC, {404, "NoSuchLifecycleConfiguration"}},
    { ERR_NO_SUCH_BUCKET_POLICY, {404, "NoSuchBucketPolicy"}},
    { ERR_NO_SUCH_USER, {404, "NoSuchUser"}},
    { ERR_NO_ROLE_FOUND, {404, "NoSuchEntity"}},
    { ERR_NO_CORS_FOUND, {404, "NoSuchCORSConfiguration"}},
    { ERR_NO_SUCH_SUBUSER, {404, "NoSuchSubUser"}},
    { ERR_NO_SUCH_ENTITY, {404, "NoSuchEntity"}},
    { ERR_NO_SUCH_CORS_CONFIGURATION, {404, "NoSuchCORSConfiguration"}},
    { ERR_NO_SUCH_OBJECT_LOCK_CONFIGURATION, {404, "ObjectLockConfigurationNotFoundError"}},
    { ERR_METHOD_NOT_ALLOWED, {405, "MethodNotAllowed" }},
    { ETIMEDOUT, {408, "RequestTimeout" }},
    { EEXIST, {409, "BucketAlreadyExists" }},
    { ERR_USER_EXIST, {409, "UserAlreadyExists" }},
    { ERR_EMAIL_EXIST, {409, "EmailExists" }},
    { ERR_KEY_EXIST, {409, "KeyExists"}},
    { ERR_TAG_CONFLICT, {409, "OperationAborted"}},
    { ERR_POSITION_NOT_EQUAL_TO_LENGTH, {409, "PositionNotEqualToLength"}},
    { ERR_OBJECT_NOT_APPENDABLE, {409, "ObjectNotAppendable"}},
    { ERR_INVALID_BUCKET_STATE, {409, "InvalidBucketState"}},
    { ERR_INVALID_OBJECT_STATE, {403, "InvalidObjectState"}},
    { ERR_INVALID_SECRET_KEY, {400, "InvalidSecretKey"}},
    { ERR_INVALID_KEY_TYPE, {400, "InvalidKeyType"}},
    { ERR_INVALID_CAP, {400, "InvalidCapability"}},
    { ERR_INVALID_TENANT_NAME, {400, "InvalidTenantName" }},
    { ENOTEMPTY, {409, "BucketNotEmpty" }},
    { ERR_PRECONDITION_FAILED, {412, "PreconditionFailed" }},
    { ERANGE, {416, "InvalidRange" }},
    { ERR_UNPROCESSABLE_ENTITY, {422, "UnprocessableEntity" }},
    { ERR_LOCKED, {423, "Locked" }},
    { ERR_INTERNAL_ERROR, {500, "InternalError" }},
    { ERR_NOT_IMPLEMENTED, {501, "NotImplemented" }},
    { ERR_SERVICE_UNAVAILABLE, {503, "ServiceUnavailable"}},
    { ERR_RATE_LIMITED, {503, "SlowDown"}},
    { ERR_ZERO_IN_URL, {400, "InvalidRequest" }},
    { ERR_NO_SUCH_TAG_SET, {404, "NoSuchTagSet"}},
    { ERR_NO_SUCH_BUCKET_ENCRYPTION_CONFIGURATION, {404, "ServerSideEncryptionConfigurationNotFoundError"}},
});
+
/* Swift-protocol error table; consulted first when RGW_REST_SWIFT is set. */
rgw_http_errors rgw_http_swift_errors({
    { EACCES, {403, "AccessDenied" }},
    { EPERM, {401, "AccessDenied" }},
    { ENAMETOOLONG, {400, "Metadata name too long" }},
    { ERR_USER_SUSPENDED, {401, "UserSuspended" }},
    { ERR_INVALID_UTF8, {412, "Invalid UTF8" }},
    { ERR_BAD_URL, {412, "Bad URL" }},
    { ERR_NOT_SLO_MANIFEST, {400, "Not an SLO manifest" }},
    { ERR_QUOTA_EXCEEDED, {413, "QuotaExceeded" }},
    { ENOTEMPTY, {409, "There was a conflict when trying "
                  "to complete your request." }},
    /* FIXME(rzarzynski): we need to find a way to apply Swift's error handling
     * procedures also for ERR_ZERO_IN_URL. This make a problem as the validation
     * is performed very early, even before setting the req_state::proto_flags. */
    { ERR_ZERO_IN_URL, {412, "Invalid UTF8 or contains NULL"}},
    { ERR_RATE_LIMITED, {498, "Rate Limited"}},
});

/* STS-protocol error table; consulted when RGW_REST_STS is set. */
rgw_http_errors rgw_http_sts_errors({
    { ERR_PACKED_POLICY_TOO_LARGE, {400, "PackedPolicyTooLarge" }},
    { ERR_INVALID_IDENTITY_TOKEN, {400, "InvalidIdentityToken" }},
});

/* IAM-protocol error table; consulted when RGW_REST_IAM is set. */
rgw_http_errors rgw_http_iam_errors({
    { EINVAL, {400, "InvalidInput" }},
    { ENOENT, {404, "NoSuchEntity"}},
    { ERR_ROLE_EXISTS, {409, "EntityAlreadyExists"}},
    { ERR_DELETE_CONFLICT, {409, "DeleteConflict"}},
    { EEXIST, {409, "EntityAlreadyExists"}},
    { ERR_INTERNAL_ERROR, {500, "ServiceFailure" }},
});
+
+using namespace std;
+using namespace ceph::crypto;
+
/* True on threads driven by the boost::asio frontend; lets shared code pick
 * the async (coroutine) path instead of blocking calls. */
thread_local bool is_asio_thread = false;

/* Default rgw_err starts in the cleared (success) state. */
rgw_err::
rgw_err()
{
  clear();
}

/* Reset to success: HTTP 200, ret 0, no error code string. */
void rgw_err::
clear()
{
  http_ret = 200;
  ret = 0;
  err_code.clear();
}

/* True only for the pristine 200 state set by clear(). */
bool rgw_err::
is_clear() const
{
  return (http_ret == 200);
}

/* True for any status outside the 2xx/3xx success range. */
bool rgw_err::
is_err() const
{
  return !(http_ret >= 200 && http_ret <= 399);
}
+
// The requestURI transferred from the frontend can be abs_path or absoluteURI
// If it is absoluteURI, we should adjust it to abs_path for the following
// S3 authorization and some other processes depending on the requestURI
// The absoluteURI can start with "http://", "https://", "ws://" or "wss://"
static std::string get_abs_path(const std::string& request_uri) {
  static const std::string ABS_PREFIXS[] = {"http://", "https://", "ws://", "wss://"};
  /* Iterate over the table itself rather than a hard-coded count so adding
   * a scheme cannot desynchronize the loop; std::string::compare avoids the
   * boost dependency. */
  bool is_abs = false;
  for (const auto& prefix : ABS_PREFIXS) {
    if (request_uri.compare(0, prefix.size(), prefix) == 0) {
      is_abs = true;
      break;
    }
  }
  if (!is_abs) { // it is not a valid absolute uri
    return request_uri;
  }
  /* The path begins at the first '/' after "scheme://". */
  const size_t scheme_end = request_uri.find("://") + 3;
  const size_t beg_pos = request_uri.find('/', scheme_end);
  if (beg_pos == std::string::npos) {
    /* No path component: hand the URI back unchanged. */
    return request_uri;
  }
  return request_uri.substr(beg_pos);
}
+
/* Build request metadata from the frontend environment: HTTP method,
 * script/request URIs (normalized to abs_path), query string and host. */
req_info::req_info(CephContext *cct, const class RGWEnv *env) : env(env) {
  method = env->get("REQUEST_METHOD", "");
  script_uri = env->get("SCRIPT_URI", cct->_conf->rgw_script_uri.c_str());
  request_uri = env->get("REQUEST_URI", cct->_conf->rgw_request_uri.c_str());
  if (request_uri[0] != '/') {
    /* Looks like an absoluteURI; strip scheme://authority. */
    request_uri = get_abs_path(request_uri);
  }
  /* Split an inline query string off the URI; otherwise take QUERY_STRING. */
  auto pos = request_uri.find('?');
  if (pos != string::npos) {
    request_params = request_uri.substr(pos + 1);
    request_uri = request_uri.substr(0, pos);
  } else {
    request_params = env->get("QUERY_STRING", "");
  }
  host = env->get("HTTP_HOST", "");

  // strip off any trailing :port from host (added by CrossFTP and maybe others)
  size_t colon_offset = host.find_last_of(':');
  if (colon_offset != string::npos) {
    bool all_digits = true;
    for (unsigned i = colon_offset + 1; i < host.size(); ++i) {
      if (!isdigit(host[i])) {
        all_digits = false;
        break;
      }
    }
    if (all_digits) {
      host.resize(colon_offset);
    }
  }
}
+
/* Rebuild this req_info from src, e.g. when a request is re-dispatched.
 * The effective URI (if any) becomes the request URI, and the per-request
 * x-amz-date header is dropped so it is not replayed. */
void req_info::rebuild_from(req_info& src)
{
  method = src.method;
  script_uri = src.script_uri;
  args = src.args;
  if (src.effective_uri.empty()) {
    request_uri = src.request_uri;
  } else {
    request_uri = src.effective_uri;
  }
  effective_uri.clear();
  host = src.host;

  x_meta_map = src.x_meta_map;
  x_meta_map.erase("x-amz-date");
}
+
+
/* Per-request state: snapshots the logging toggles from the frontend env
 * and stamps the request start time. */
req_state::req_state(CephContext* _cct, const RGWProcessEnv& penv,
                     RGWEnv* e, uint64_t id)
  : cct(_cct), penv(penv), info(_cct, e), id(id)
{
  enable_ops_log = e->get_enable_ops_log();
  enable_usage_log = e->get_enable_usage_log();
  defer_to_bucket_acls = e->get_defer_to_bucket_acls();

  time = Clock::now();
}

req_state::~req_state() {
  /* formatter is owned by req_state; other pointers are borrowed. */
  delete formatter;
}

/* Log-line prefix: "req <id> <elapsed>s " with elapsed at 3 decimals;
 * stream precision/format flags are restored afterwards. */
std::ostream& req_state::gen_prefix(std::ostream& out) const
{
  auto p = out.precision();
  return out << "req " << id << ' '
      << std::setprecision(3) << std::fixed << time_elapsed() // '0.123s'
      << std::setprecision(p) << std::defaultfloat << ' ';
}
+
+bool search_err(rgw_http_errors& errs, int err_no, int& http_ret, string& code)
+{
+ auto r = errs.find(err_no);
+ if (r != errs.end()) {
+ http_ret = r->second.first;
+ code = r->second.second;
+ return true;
+ }
+ return false;
+}
+
/* Translate an errno / ERR_* code into an HTTP status + error-code string,
 * consulting the protocol-specific table first (Swift, STS, IAM) and the
 * S3 table as the fallback; unknown codes become a 500 "UnknownError". */
void set_req_state_err(struct rgw_err& err, /* out */
                       int err_no,          /* in  */
                       const int prot_flags) /* in */
{
  /* Accept either sign convention for the incoming code. */
  if (err_no < 0)
    err_no = -err_no;

  err.ret = -err_no;

  if (prot_flags & RGW_REST_SWIFT) {
    if (search_err(rgw_http_swift_errors, err_no, err.http_ret, err.err_code))
      return;
  }

  if (prot_flags & RGW_REST_STS) {
    if (search_err(rgw_http_sts_errors, err_no, err.http_ret, err.err_code))
      return;
  }

  if (prot_flags & RGW_REST_IAM) {
    if (search_err(rgw_http_iam_errors, err_no, err.http_ret, err.err_code))
      return;
  }

  //Default to searching in s3 errors
  if (search_err(rgw_http_s3_errors, err_no, err.http_ret, err.err_code))
    return;
  dout(0) << "WARNING: set_req_state_err err_no=" << err_no
      << " resorting to 500" << dendl;

  err.http_ret = 500;
  err.err_code = "UnknownError";
}

/* Variant that also attaches a human-readable message to the request. */
void set_req_state_err(req_state* s, int err_no, const string& err_msg)
{
  if (s) {
    set_req_state_err(s, err_no);
    if (s->prot_flags & RGW_REST_SWIFT && !err_msg.empty()) {
      /* TODO(rzarzynski): there never ever should be a check like this one.
       * It's here only for the sake of the patch's backportability. Further
       * commits will move the logic to a per-RGWHandler replacement of
       * the end_header() function. Alternativaly, we might consider making
       * that just for the dump(). Please take a look on @cbodley's comments
       * in PR #10690 (https://github.com/ceph/ceph/pull/10690). */
      s->err.err_code = err_msg;
    } else {
      s->err.message = err_msg;
    }
  }
}

/* Null-safe convenience overload using the request's protocol flags. */
void set_req_state_err(req_state* s, int err_no)
{
  if (s) {
    set_req_state_err(s->err, err_no, s->prot_flags);
  }
}
+
/* Emit the request's error state through its formatter; the enclosing
 * "Error" section is omitted for HTML output. */
void dump(req_state* s)
{
  if (s->format != RGWFormat::HTML)
    s->formatter->open_object_section("Error");
  if (!s->err.err_code.empty())
    s->formatter->dump_string("Code", s->err.err_code);
  s->formatter->dump_string("Message", s->err.message);
  if (!s->bucket_name.empty()) // TODO: connect to expose_bucket
    s->formatter->dump_string("BucketName", s->bucket_name);
  if (!s->trans_id.empty()) // TODO: connect to expose_bucket or another toggle
    s->formatter->dump_string("RequestId", s->trans_id);
  s->formatter->dump_string("HostId", s->host_id);
  if (s->format != RGWFormat::HTML)
    s->formatter->close_section();
}
+
/* A C string paired with its compile-time length. */
struct str_len {
  const char *str;
  int len;
};

#define STR_LEN_ENTRY(s) { s, sizeof(s) - 1 }

/* CGI header-name prefixes that mark metadata headers; the first entry
 * ("HTTP_X_AMZ_") is also the canonical prefix used for normalization in
 * req_info::init_meta_info(). NULL-terminated. */
struct str_len meta_prefixes[] = { STR_LEN_ENTRY("HTTP_X_AMZ_"),
                                   STR_LEN_ENTRY("HTTP_X_GOOG_"),
                                   STR_LEN_ENTRY("HTTP_X_DHO_"),
                                   STR_LEN_ENTRY("HTTP_X_RGW_"),
                                   STR_LEN_ENTRY("HTTP_X_OBJECT_"),
                                   STR_LEN_ENTRY("HTTP_X_CONTAINER_"),
                                   STR_LEN_ENTRY("HTTP_X_ACCOUNT_"),
                                   {NULL, 0} };
+
/* Scan the CGI environment for metadata headers (see meta_prefixes),
 * normalize each name to the lowercase "x-amz-..." form, and collect them
 * in x_meta_map (duplicates are merged as comma-separated lists). SSE
 * related headers are additionally mirrored into crypt_attribute_map. */
void req_info::init_meta_info(const DoutPrefixProvider *dpp, bool *found_bad_meta)
{
  x_meta_map.clear();
  crypt_attribute_map.clear();

  for (const auto& kv: env->get_map()) {
    const char *prefix;
    const string& header_name = kv.first;
    const string& val = kv.second;
    for (int prefix_num = 0; (prefix = meta_prefixes[prefix_num].str) != NULL; prefix_num++) {
      int len = meta_prefixes[prefix_num].len;
      const char *p = header_name.c_str();
      if (strncmp(p, prefix, len) == 0) {
        ldpp_dout(dpp, 10) << "meta>> " << p << dendl;
        const char *name = p+len; /* skip the prefix */
        int name_len = header_name.size() - len;

        /* NOTE(review): strncmp(name, "META_", name_len) only matches when
         * the remainder is exactly "META_" (the comparison includes the
         * literal's terminator) — confirm this is the intended check. */
        if (found_bad_meta && strncmp(name, "META_", name_len) == 0)
          *found_bad_meta = true;

        /* VLA sized for "X_AMZ_" + remainder; snprintf rewrites the header
         * under the canonical prefix (meta_prefixes[0] minus "HTTP_"). */
        char name_low[meta_prefixes[0].len + name_len + 1];
        snprintf(name_low, meta_prefixes[0].len - 5 + name_len + 1, "%s%s", meta_prefixes[0].str + 5 /* skip HTTP_ */, name); // normalize meta prefix
        int j;
        /* Lowercase and swap '_' <-> '-' (CGI names use '_' where the wire
         * header had '-'). */
        for (j = 0; name_low[j]; j++) {
          if (name_low[j] == '_')
            name_low[j] = '-';
          else if (name_low[j] == '-')
            name_low[j] = '_';
          else
            name_low[j] = tolower(name_low[j]);
        }
        name_low[j] = 0;

        auto it = x_meta_map.find(name_low);
        if (it != x_meta_map.end()) {
          /* Duplicate header: append as a comma-separated list. */
          string old = it->second;
          boost::algorithm::trim_right(old);
          old.append(",");
          old.append(val);
          x_meta_map[name_low] = old;
        } else {
          x_meta_map[name_low] = val;
        }
        /* NOTE(review): compares only the first 20 chars, i.e. any header
         * starting with "x-amz-server-side-en" — presumably an intentional
         * prefix match for the SSE header family; confirm. */
        if (strncmp(name_low, "x-amz-server-side-encryption", 20) == 0) {
          crypt_attribute_map[name_low] = val;
        }
      }
    }
  }
  for (const auto& kv: x_meta_map) {
    ldpp_dout(dpp, 10) << "x>> " << kv.first << ":" << rgw::crypt_sanitize::x_meta_map{kv.first, kv.second} << dendl;
  }
}
+
/* Debug representation of an rgw_err: status code plus error-code string. */
std::ostream& operator<<(std::ostream& oss, const rgw_err &err)
{
  oss << "rgw_err(http_ret=" << err.http_ret << ", err_code='" << err.err_code << "') ";
  return oss;
}
+
+void rgw_add_amz_meta_header(
+ meta_map_t& x_meta_map,
+ const std::string& k,
+ const std::string& v)
+{
+ auto it = x_meta_map.find(k);
+ if (it != x_meta_map.end()) {
+ std::string old = it->second;
+ boost::algorithm::trim_right(old);
+ old.append(",");
+ old.append(v);
+ x_meta_map[k] = old;
+ } else {
+ x_meta_map[k] = v;
+ }
+}
+
/* Set, append-to, or leave untouched an x-amz metadata entry according to
 * action a (DISCARD keeps any existing value, APPEND merges as a comma
 * list, OVERWRITE replaces). Returns true when k was already present. */
bool rgw_set_amz_meta_header(
  meta_map_t& x_meta_map,
  const std::string& k,
  const std::string& v,
  rgw_set_action_if_set a)
{
  auto it { x_meta_map.find(k) };
  bool r { it != x_meta_map.end() };
  switch(a) {
  default:
    /* Unknown action: assert in debug builds, then behave like DISCARD. */
    ceph_assert(a == 0);
  case DISCARD:
    break;
  case APPEND:
    if (r) {
      /* Key already present: merge as a comma-separated list. */
      std::string old { it->second };
      boost::algorithm::trim_right(old);
      old.append(",");
      old.append(v);
      x_meta_map[k] = old;
      break;
    }
    /* fall through */
  case OVERWRITE:
    x_meta_map[k] = v;
  }
  return r;
}
+
/* Strip a surrounding pair of double quotes from s, tolerating trailing
 * spaces after the closing quote. Strings that are not properly quoted are
 * returned unchanged. */
std::string rgw_string_unquote(const std::string& s)
{
  if (s.size() < 2 || s.front() != '"') {
    return s;
  }

  /* Walk back over trailing spaces, but never past the first two chars. */
  size_t end = s.size();
  while (end > 2 && s[end - 1] == ' ') {
    --end;
  }

  if (s[end - 1] != '"') {
    return s;
  }

  return s.substr(1, end - 2);
}
+
/* Return true when s is non-null and contains only whitespace up to its
 * terminator; used to validate that strptime() consumed a full date. */
static bool check_str_end(const char *s)
{
  if (!s)
    return false;

  while (*s) {
    /* Cast to unsigned char: passing a negative plain char to isspace()
     * is undefined behavior. */
    if (!isspace(static_cast<unsigned char>(*s)))
      return false;
    s++;
  }
  return true;
}
+
/* Return true when s (the strptime() remainder) is, after optional leading
 * whitespace, a timezone designator beginning with "GMT" or "UTC". */
static bool check_gmt_end(const char *s)
{
  if (!s || !*s)
    return false;

  /* Cast to unsigned char: passing a negative plain char to isspace()
   * is undefined behavior. */
  while (isspace(static_cast<unsigned char>(*s))) {
    ++s;
  }

  /* check for correct timezone */
  if ((strncmp(s, "GMT", 3) != 0) &&
      (strncmp(s, "UTC", 3) != 0)) {
    return false;
  }

  return true;
}
+
/* RFC 850 date, e.g. "Sunday, 06-Nov-94 08:49:37 GMT". */
static bool parse_rfc850(const char *s, struct tm *t)
{
  // FIPS zeroization audit 20191115: this memset is not security related.
  memset(t, 0, sizeof(*t));
  return check_gmt_end(strptime(s, "%A, %d-%b-%y %H:%M:%S ", t));
}

/* ANSI C asctime() date, e.g. "Sun Nov  6 08:49:37 1994" (no timezone). */
static bool parse_asctime(const char *s, struct tm *t)
{
  // FIPS zeroization audit 20191115: this memset is not security related.
  memset(t, 0, sizeof(*t));
  return check_str_end(strptime(s, "%a %b %d %H:%M:%S %Y", t));
}

/* RFC 1123 date, e.g. "Sun, 06 Nov 1994 08:49:37 GMT". */
static bool parse_rfc1123(const char *s, struct tm *t)
{
  // FIPS zeroization audit 20191115: this memset is not security related.
  memset(t, 0, sizeof(*t));
  return check_gmt_end(strptime(s, "%a, %d %b %Y %H:%M:%S ", t));
}

/* RFC 1123 with a numeric timezone offset (%z) instead of "GMT". */
static bool parse_rfc1123_alt(const char *s, struct tm *t)
{
  // FIPS zeroization audit 20191115: this memset is not security related.
  memset(t, 0, sizeof(*t));
  return check_str_end(strptime(s, "%a, %d %b %Y %H:%M:%S %z", t));
}

/* Accept any of the date formats HTTP/1.1 (RFC 2616 sec. 3.3) allows. */
bool parse_rfc2616(const char *s, struct tm *t)
{
  return parse_rfc850(s, t) || parse_asctime(s, t) || parse_rfc1123(s, t) || parse_rfc1123_alt(s,t);
}
+
/* Parse an ISO 8601 timestamp into *t, with optional fractional seconds
 * returned as nanoseconds via *pns. extended_format accepts
 * "YYYY-MM-DDTHH:MM:SS" (or with a space); otherwise "YYYYMMDDTHHMMSS".
 * A trailing ".<fraction>Z" (or bare "Z") is allowed. */
bool parse_iso8601(const char *s, struct tm *t, uint32_t *pns, bool extended_format)
{
  // FIPS zeroization audit 20191115: this memset is not security related.
  memset(t, 0, sizeof(*t));
  const char *p;

  if (!s)
    s = "";

  if (extended_format) {
    p = strptime(s, "%Y-%m-%dT%T", t);
    if (!p) {
      p = strptime(s, "%Y-%m-%d %T", t);
    }
  } else {
    p = strptime(s, "%Y%m%dT%H%M%S", t);
  }
  if (!p) {
    dout(0) << "parse_iso8601 failed" << dendl;
    return false;
  }
  /* Whatever strptime left over must be empty, "Z", or ".<digits>Z". */
  const std::string_view str = rgw_trim_whitespace(std::string_view(p));
  int len = str.size();

  if (len == 0 || (len == 1 && str[0] == 'Z'))
    return true;

  if (str[0] != '.' ||
      str[len - 1] != 'Z')
    return false;

  uint32_t ms;
  std::string_view nsstr = str.substr(1, len - 2);
  int r = stringtoul(std::string(nsstr), &ms);
  if (r < 0)
    return false;

  if (!pns) {
    return true;
  }

  /* NOTE(review): ms was parsed from the untruncated fraction, but the
   * multiplier below uses the (possibly truncated) digit count — fractions
   * longer than 9 digits would scale incorrectly; confirm inputs are
   * bounded upstream. */
  if (nsstr.size() > 9) {
    nsstr = nsstr.substr(0, 9);
  }

  /* Scale the fraction to nanoseconds by its digit count:
   * 1 digit = 100ms steps, ..., 9 digits = 1ns steps. */
  uint64_t mul_table[] = { 0,
    100000000LL,
    10000000LL,
    1000000LL,
    100000LL,
    10000LL,
    1000LL,
    100LL,
    10LL,
    1 };


  *pns = ms * mul_table[nsstr.size()];

  return true;
}
+
/* Split in_str at the first occurrence of delim into a trimmed key/value
 * pair. Returns 0 on success, -EINVAL when delim is null or absent. */
int parse_key_value(string& in_str, const char *delim, string& key, string& val)
{
  if (delim == NULL)
    return -EINVAL;

  auto pos = in_str.find(delim);
  if (pos == string::npos)
    return -EINVAL;

  key = rgw_trim_whitespace(in_str.substr(0, pos));
  val = rgw_trim_whitespace(in_str.substr(pos + 1));

  return 0;
}

/* Convenience overload with "=" as the delimiter. */
int parse_key_value(string& in_str, string& key, string& val)
{
  return parse_key_value(in_str, "=", key,val);
}

/* Non-allocating variant: returns trimmed views into in_str, or boost::none
 * when delim is absent. The views share in_str's lifetime. */
boost::optional<std::pair<std::string_view, std::string_view>>
parse_key_value(const std::string_view& in_str,
                const std::string_view& delim)
{
  const size_t pos = in_str.find(delim);
  if (pos == std::string_view::npos) {
    return boost::none;
  }

  const auto key = rgw_trim_whitespace(in_str.substr(0, pos));
  const auto val = rgw_trim_whitespace(in_str.substr(pos + 1));

  return std::make_pair(key, val);
}

/* Convenience overload with "=" as the delimiter. */
boost::optional<std::pair<std::string_view, std::string_view>>
parse_key_value(const std::string_view& in_str)
{
  return parse_key_value(in_str, "=");
}
+
+int parse_time(const char *time_str, real_time *time)
+{
+ struct tm tm;
+ uint32_t ns = 0;
+
+ if (!parse_rfc2616(time_str, &tm) && !parse_iso8601(time_str, &tm, &ns)) {
+ return -EINVAL;
+ }
+
+ time_t sec = internal_timegm(&tm);
+ *time = utime_t(sec, ns).to_real_time();
+
+ return 0;
+}
+
+#define TIME_BUF_SIZE 128
+
+void rgw_to_iso8601(const real_time& t, char *dest, int buf_size)
+{
+ utime_t ut(t);
+
+ char buf[TIME_BUF_SIZE];
+ struct tm result;
+ time_t epoch = ut.sec();
+ struct tm *tmp = gmtime_r(&epoch, &result);
+ if (tmp == NULL)
+ return;
+
+ if (strftime(buf, sizeof(buf), "%Y-%m-%dT%T", tmp) == 0)
+ return;
+
+ snprintf(dest, buf_size, "%s.%03dZ", buf, (int)(ut.usec() / 1000));
+}
+
+void rgw_to_iso8601(const real_time& t, string *dest)
+{
+ char buf[TIME_BUF_SIZE];
+ rgw_to_iso8601(t, buf, sizeof(buf));
+ *dest = buf;
+}
+
+
+string rgw_to_asctime(const utime_t& t)
+{
+ stringstream s;
+ t.asctime(s);
+ return s.str();
+}
+
/*
 * calculate the sha1 value of a given msg and key
 */
void calc_hmac_sha1(const char *key, int key_len,
                    const char *msg, int msg_len, char *dest)
/* destination should be CEPH_CRYPTO_HMACSHA1_DIGESTSIZE bytes long */
{
  // one-shot HMAC-SHA1; dest receives the raw (binary) digest
  HMACSHA1 hmac((const unsigned char *)key, key_len);
  hmac.Update((const unsigned char *)msg, msg_len);
  hmac.Final((unsigned char *)dest);
}
+
+/*
+ * calculate the sha256 value of a given msg and key
+ */
+void calc_hmac_sha256(const char *key, int key_len,
+ const char *msg, int msg_len, char *dest)
+{
+ char hash_sha256[CEPH_CRYPTO_HMACSHA256_DIGESTSIZE];
+
+ HMACSHA256 hmac((const unsigned char *)key, key_len);
+ hmac.Update((const unsigned char *)msg, msg_len);
+ hmac.Final((unsigned char *)hash_sha256);
+
+ memcpy(dest, hash_sha256, CEPH_CRYPTO_HMACSHA256_DIGESTSIZE);
+}
+
+using ceph::crypto::SHA256;
+
/*
 * calculate the sha256 hash value of a given msg
 */
sha256_digest_t calc_hash_sha256(const std::string_view& msg)
{
  sha256_digest_t hash;

  // one-shot (non-keyed) SHA-256 over the whole buffer
  SHA256 hasher;
  hasher.Update(reinterpret_cast<const unsigned char*>(msg.data()), msg.size());
  hasher.Final(hash.v);

  return hash;
}
+
// Begin a streaming SHA-256 computation. The caller owns the returned
// object; it is released by calc_hash_sha256_close_stream().
SHA256* calc_hash_sha256_open_stream()
{
  return new SHA256;
}
+
// Feed 'len' bytes of 'msg' into an open streaming SHA-256 computation.
void calc_hash_sha256_update_stream(SHA256 *hash, const char *msg, int len)
{
  hash->Update((const unsigned char *)msg, len);
}
+
// Finalize a streaming SHA-256 computation and return the digest as a
// hex string. Deletes the hash object and nulls the caller's pointer;
// a null *phash yields the digest of the empty input.
string calc_hash_sha256_close_stream(SHA256 **phash)
{
  SHA256 *hash = *phash;
  if (!hash) {
    hash = calc_hash_sha256_open_stream();
  }
  // NOTE(review): buffer sized with the HMAC digest-size constant --
  // assumes CEPH_CRYPTO_HMACSHA256_DIGESTSIZE == CEPH_CRYPTO_SHA256_DIGESTSIZE
  char hash_sha256[CEPH_CRYPTO_HMACSHA256_DIGESTSIZE];

  hash->Final((unsigned char *)hash_sha256);

  char hex_str[(CEPH_CRYPTO_SHA256_DIGESTSIZE * 2) + 1];
  buf_to_hex((unsigned char *)hash_sha256, CEPH_CRYPTO_SHA256_DIGESTSIZE, hex_str);

  delete hash;
  *phash = NULL;

  return std::string(hex_str);
}
+
+std::string calc_hash_sha256_restart_stream(SHA256 **phash)
+{
+ const auto hash = calc_hash_sha256_close_stream(phash);
+ *phash = calc_hash_sha256_open_stream();
+
+ return hash;
+}
+
+int NameVal::parse()
+{
+ auto delim_pos = str.find('=');
+ int ret = 0;
+
+ if (delim_pos == string::npos) {
+ name = str;
+ val = "";
+ ret = 1;
+ } else {
+ name = str.substr(0, delim_pos);
+ val = str.substr(delim_pos + 1);
+ }
+
+ return ret;
+}
+
/*
 * Parse the stored query string ("?a=b&c=d...") into the argument maps.
 * Each '&'-separated token is url-decoded and split on '='; parameter
 * names containing "X-Amz-" are lowercased (except '-') for AWS
 * signature compatibility. Always returns 0.
 */
int RGWHTTPArgs::parse(const DoutPrefixProvider *dpp)
{
  int pos = 0;
  bool end = false;

  if (str.empty())
    return 0;

  if (str[pos] == '?')
    pos++;

  while (!end) {
    // relies on string::npos converting to a negative int, which makes
    // fpos < pos signal the final (or only) token
    int fpos = str.find('&', pos);
    if (fpos < pos) {
      end = true;
      fpos = str.size();
    }
    std::string nameval = url_decode(str.substr(pos, fpos - pos), true);
    NameVal nv(std::move(nameval));
    int ret = nv.parse();
    if (ret >= 0) {
      string& name = nv.get_name();
      if (name.find("X-Amz-") != string::npos) {
        std::for_each(name.begin(),
                      name.end(),
                      [](char &c){
                        if (c != '-') {
                          c = ::tolower(static_cast<unsigned char>(c));
                        }
                      });
      }
      string& val = nv.get_val();
      ldpp_dout(dpp, 10) << "name: " << name << " val: " << val << dendl;
      append(name, val);
    }

    pos = fpos + 1;
  }

  return 0;
}
+
+void RGWHTTPArgs::remove(const string& name)
+{
+ auto val_iter = val_map.find(name);
+ if (val_iter != std::end(val_map)) {
+ val_map.erase(val_iter);
+ }
+
+ auto sys_val_iter = sys_val_map.find(name);
+ if (sys_val_iter != std::end(sys_val_map)) {
+ sys_val_map.erase(sys_val_iter);
+ }
+
+ auto subres_iter = sub_resources.find(name);
+ if (subres_iter != std::end(sub_resources)) {
+ sub_resources.erase(subres_iter);
+ }
+}
+
/*
 * Record a parsed query parameter. Values land in val_map, or in
 * sys_val_map when the name starts with RGW_SYS_PARAM_PREFIX. Names
 * that denote S3/Swift sub-resources, response modifiers, or admin
 * sub-resources are additionally tracked in sub_resources.
 */
void RGWHTTPArgs::append(const string& name, const string& val)
{
  if (name.compare(0, sizeof(RGW_SYS_PARAM_PREFIX) - 1, RGW_SYS_PARAM_PREFIX) == 0) {
    sys_val_map[name] = val;
  } else {
    val_map[name] = val;
  }

// when sub_resources exclusive by object are added, please remember to update obj_sub_resource in RGWHTTPArgs::exist_obj_excl_sub_resource().
  if ((name.compare("acl") == 0) ||
      (name.compare("cors") == 0) ||
      (name.compare("notification") == 0) ||
      (name.compare("location") == 0) ||
      (name.compare("logging") == 0) ||
      (name.compare("usage") == 0) ||
      (name.compare("lifecycle") == 0) ||
      (name.compare("delete") == 0) ||
      (name.compare("uploads") == 0) ||
      (name.compare("partNumber") == 0) ||
      (name.compare("uploadId") == 0) ||
      (name.compare("versionId") == 0) ||
      (name.compare("start-date") == 0) ||
      (name.compare("end-date") == 0) ||
      (name.compare("versions") == 0) ||
      (name.compare("versioning") == 0) ||
      (name.compare("website") == 0) ||
      (name.compare("requestPayment") == 0) ||
      (name.compare("torrent") == 0) ||
      (name.compare("tagging") == 0) ||
      (name.compare("append") == 0) ||
      (name.compare("position") == 0) ||
      (name.compare("policyStatus") == 0) ||
      (name.compare("publicAccessBlock") == 0)) {
    sub_resources[name] = val;
  } else if (name[0] == 'r') { // root of all evil
    // S3 response-header overrides; their presence flags the request as
    // carrying a response modifier
    if ((name.compare("response-content-type") == 0) ||
        (name.compare("response-content-language") == 0) ||
        (name.compare("response-expires") == 0) ||
        (name.compare("response-cache-control") == 0) ||
        (name.compare("response-content-disposition") == 0) ||
        (name.compare("response-content-encoding") == 0)) {
      sub_resources[name] = val;
      has_resp_modifier = true;
    }
  } else if ((name.compare("subuser") == 0) ||
             (name.compare("key") == 0) ||
             (name.compare("caps") == 0) ||
             (name.compare("index") == 0) ||
             (name.compare("policy") == 0) ||
             (name.compare("quota") == 0) ||
             (name.compare("list") == 0) ||
             (name.compare("object") == 0) ||
             (name.compare("sync") == 0)) {
    // admin sub-resources: only the first one seen is recorded
    if (!admin_subresource_added) {
      sub_resources[name] = "";
      admin_subresource_added = true;
    }
  }
}
+
+const string& RGWHTTPArgs::get(const string& name, bool *exists) const
+{
+ auto iter = val_map.find(name);
+ bool e = (iter != std::end(val_map));
+ if (exists)
+ *exists = e;
+ if (e)
+ return iter->second;
+ return empty_str;
+}
+
+boost::optional<const std::string&>
+RGWHTTPArgs::get_optional(const std::string& name) const
+{
+ bool exists;
+ const std::string& value = get(name, &exists);
+ if (exists) {
+ return value;
+ } else {
+ return boost::none;
+ }
+}
+
+int RGWHTTPArgs::get_bool(const string& name, bool *val, bool *exists) const
+{
+ map<string, string>::const_iterator iter;
+ iter = val_map.find(name);
+ bool e = (iter != val_map.end());
+ if (exists)
+ *exists = e;
+
+ if (e) {
+ const char *s = iter->second.c_str();
+
+ if (strcasecmp(s, "false") == 0) {
+ *val = false;
+ } else if (strcasecmp(s, "true") == 0) {
+ *val = true;
+ } else {
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+int RGWHTTPArgs::get_bool(const char *name, bool *val, bool *exists) const
+{
+ string s(name);
+ return get_bool(s, val, exists);
+}
+
+void RGWHTTPArgs::get_bool(const char *name, bool *val, bool def_val) const
+{
+ bool exists = false;
+ if ((get_bool(name, val, &exists) < 0) ||
+ !exists) {
+ *val = def_val;
+ }
+}
+
+int RGWHTTPArgs::get_int(const char *name, int *val, int def_val) const
+{
+ bool exists = false;
+ string val_str;
+ val_str = get(name, &exists);
+ if (!exists) {
+ *val = def_val;
+ return 0;
+ }
+
+ string err;
+
+ *val = (int)strict_strtol(val_str.c_str(), 10, &err);
+ if (!err.empty()) {
+ *val = def_val;
+ return -EINVAL;
+ }
+ return 0;
+}
+
+string RGWHTTPArgs::sys_get(const string& name, bool * const exists) const
+{
+ const auto iter = sys_val_map.find(name);
+ const bool e = (iter != sys_val_map.end());
+
+ if (exists) {
+ *exists = e;
+ }
+
+ return e ? iter->second : string();
+}
+
+bool rgw_transport_is_secure(CephContext *cct, const RGWEnv& env)
+{
+ const auto& m = env.get_map();
+ // frontend connected with ssl
+ if (m.count("SERVER_PORT_SECURE")) {
+ return true;
+ }
+ // ignore proxy headers unless explicitly enabled
+ if (!cct->_conf->rgw_trust_forwarded_https) {
+ return false;
+ }
+ // https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Forwarded
+ // Forwarded: by=<identifier>; for=<identifier>; host=<host>; proto=<http|https>
+ auto i = m.find("HTTP_FORWARDED");
+ if (i != m.end() && i->second.find("proto=https") != std::string::npos) {
+ return true;
+ }
+ // https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-Proto
+ i = m.find("HTTP_X_FORWARDED_PROTO");
+ if (i != m.end() && i->second == "https") {
+ return true;
+ }
+ return false;
+}
+
+
namespace {

// Adapter that exposes a req_state through the generic perm_state_base
// interface consumed by the permission-check helpers below.
struct perm_state_from_req_state : public perm_state_base {
  req_state * const s;
  perm_state_from_req_state(req_state * const _s)
    : perm_state_base(_s->cct,
                      _s->env,
                      _s->auth.identity.get(),
                      _s->bucket.get() ? _s->bucket->get_info() : RGWBucketInfo(),
                      _s->perm_mask,
                      _s->defer_to_bucket_acls,
                      _s->bucket_access_conf),
      s(_s) {}

  // Tri-state result: true when the requester accepted the transfer
  // charges ("requester"), false when neither the header nor the query
  // arg is present, nullopt when a value is present but not "requester".
  std::optional<bool> get_request_payer() const override {
    const char *request_payer = s->info.env->get("HTTP_X_AMZ_REQUEST_PAYER");
    if (!request_payer) {
      bool exists;
      request_payer = s->info.args.get("x-amz-request-payer", &exists).c_str();
      if (!exists) {
        return false;
      }
    }

    if (strcasecmp(request_payer, "requester") == 0) {
      return true;
    }

    return std::nullopt;
  }

  const char *get_referer() const override {
    return s->info.env->get("HTTP_REFERER");
  }
};

// Evaluate 'policy' when present; a missing policy yields Effect::Pass
// so callers can fall through to other authorization sources.
Effect eval_or_pass(const DoutPrefixProvider* dpp,
                    const boost::optional<Policy>& policy,
                    const rgw::IAM::Environment& env,
                    boost::optional<const rgw::auth::Identity&> id,
                    const uint64_t op,
                    const ARN& resource,
                    boost::optional<rgw::IAM::PolicyPrincipal&> princ_type=boost::none) {
  if (!policy)
    return Effect::Pass;
  else
    return policy->eval(env, id, op, resource, princ_type);
}

}
+
// Evaluate a set of identity or session policies against (op, arn).
// Deny short-circuits immediately; an Allow from any policy is sticky
// across later Pass results, so the overall result is Allow when at
// least one policy allows and none denies, otherwise Pass.
Effect eval_identity_or_session_policies(const DoutPrefixProvider* dpp,
                                         const vector<Policy>& policies,
                                         const rgw::IAM::Environment& env,
                                         const uint64_t op,
                                         const ARN& arn) {
  auto policy_res = Effect::Pass, prev_res = Effect::Pass;
  for (auto& policy : policies) {
    if (policy_res = eval_or_pass(dpp, policy, env, boost::none, op, arn); policy_res == Effect::Deny)
      return policy_res;
    else if (policy_res == Effect::Allow)
      prev_res = Effect::Allow;
    else if (policy_res == Effect::Pass && prev_res == Effect::Allow)
      policy_res = Effect::Allow;
  }
  return policy_res;
}
+
/*
 * Decide whether the requester may perform 'op' on 'res'.
 * Evaluation order: identity policies (Deny wins); then, when session
 * policies are present, the intersection of identity and session
 * allows; otherwise the user ACL, unless the op mandates an explicit
 * policy.
 */
bool verify_user_permission(const DoutPrefixProvider* dpp,
                            perm_state_base * const s,
                            RGWAccessControlPolicy * const user_acl,
                            const vector<rgw::IAM::Policy>& user_policies,
                            const vector<rgw::IAM::Policy>& session_policies,
                            const rgw::ARN& res,
                            const uint64_t op,
                            bool mandatory_policy)
{
  auto identity_policy_res = eval_identity_or_session_policies(dpp, user_policies, s->env, op, res);
  if (identity_policy_res == Effect::Deny) {
    return false;
  }

  if (! session_policies.empty()) {
    auto session_policy_res = eval_identity_or_session_policies(dpp, session_policies, s->env, op, res);
    if (session_policy_res == Effect::Deny) {
      return false;
    }
    //Intersection of identity policies and session policies
    if (identity_policy_res == Effect::Allow && session_policy_res == Effect::Allow) {
      return true;
    }
    return false;
  }

  if (identity_policy_res == Effect::Allow) {
    return true;
  }

  if (mandatory_policy) {
    // no policies, and policy is mandatory
    ldpp_dout(dpp, 20) << "no policies for a policy mandatory op " << op << dendl;
    return false;
  }

  auto perm = op_to_perm(op);

  return verify_user_permission_no_policy(dpp, s, user_acl, perm);
}
+
// ACL-only user permission check (no IAM policy evaluation).
// Role identities are always refused here; a missing user ACL is
// treated as permitted since S3 has no account ACLs.
bool verify_user_permission_no_policy(const DoutPrefixProvider* dpp,
                                      struct perm_state_base * const s,
                                      RGWAccessControlPolicy * const user_acl,
                                      const int perm)
{
  if (s->identity->get_identity_type() == TYPE_ROLE)
    return false;

  /* S3 doesn't support account ACLs. */
  if (!user_acl)
    return true;

  // the requested permission must be fully covered by the request's mask
  if ((perm & (int)s->perm_mask) != perm)
    return false;

  return user_acl->verify_permission(dpp, *s->identity, perm, perm);
}
+
+bool verify_user_permission(const DoutPrefixProvider* dpp,
+ req_state * const s,
+ const rgw::ARN& res,
+ const uint64_t op,
+ bool mandatory_policy)
+{
+ perm_state_from_req_state ps(s);
+ return verify_user_permission(dpp, &ps, s->user_acl.get(), s->iam_user_policies, s->session_policies, res, op, mandatory_policy);
+}
+
+bool verify_user_permission_no_policy(const DoutPrefixProvider* dpp,
+ req_state * const s,
+ const int perm)
+{
+ perm_state_from_req_state ps(s);
+ return verify_user_permission_no_policy(dpp, &ps, s->user_acl.get(), perm);
+}
+
// Gate for requester-pays buckets: allow when the bucket is not
// requester-pays, when the requester owns the bucket, or when an
// authenticated requester explicitly accepted the charges. Anonymous
// requests and missing/invalid x-amz-request-payer values are refused.
bool verify_requester_payer_permission(struct perm_state_base *s)
{
  if (!s->bucket_info.requester_pays)
    return true;

  if (s->identity->is_owner_of(s->bucket_info.owner))
    return true;

  if (s->identity->is_anonymous()) {
    return false;
  }

  // tri-state: engaged means the header/arg decided; disengaged means
  // it was present but not "requester" (treated as refusal below)
  auto request_payer = s->get_request_payer();
  if (request_payer) {
    return *request_payer;
  }

  return false;
}
+
/*
 * Full bucket-level authorization for 'op': requester-pays gate,
 * identity policies, the bucket policy, session-policy intersection
 * (when session policies were presented), and finally the bucket/user
 * ACLs when no policy produced a decision.
 */
bool verify_bucket_permission(const DoutPrefixProvider* dpp,
                              struct perm_state_base * const s,
                              const rgw_bucket& bucket,
                              RGWAccessControlPolicy * const user_acl,
                              RGWAccessControlPolicy * const bucket_acl,
                              const boost::optional<Policy>& bucket_policy,
                              const vector<Policy>& identity_policies,
                              const vector<Policy>& session_policies,
                              const uint64_t op)
{
  if (!verify_requester_payer_permission(s))
    return false;

  auto identity_policy_res = eval_identity_or_session_policies(dpp, identity_policies, s->env, op, ARN(bucket));
  if (identity_policy_res == Effect::Deny)
    return false;

  rgw::IAM::PolicyPrincipal princ_type = rgw::IAM::PolicyPrincipal::Other;
  if (bucket_policy) {
    ldpp_dout(dpp, 16) << __func__ << ": policy: " << bucket_policy.get()
                       << "resource: " << ARN(bucket) << dendl;
  }
  auto r = eval_or_pass(dpp, bucket_policy, s->env, *s->identity,
                        op, ARN(bucket), princ_type);
  if (r == Effect::Deny)
    return false;

  //Take into account session policies, if the identity making a request is a role
  if (!session_policies.empty()) {
    auto session_policy_res = eval_identity_or_session_policies(dpp, session_policies, s->env, op, ARN(bucket));
    if (session_policy_res == Effect::Deny) {
      return false;
    }
    if (princ_type == rgw::IAM::PolicyPrincipal::Role) {
      //Intersection of session policy and identity policy plus intersection of session policy and bucket policy
      if ((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) ||
          (session_policy_res == Effect::Allow && r == Effect::Allow))
        return true;
    } else if (princ_type == rgw::IAM::PolicyPrincipal::Session) {
      //Intersection of session policy and identity policy plus bucket policy
      if ((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) || r == Effect::Allow)
        return true;
    } else if (princ_type == rgw::IAM::PolicyPrincipal::Other) {// there was no match in the bucket policy
      if (session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow)
        return true;
    }
    return false;
  }

  if (r == Effect::Allow || identity_policy_res == Effect::Allow)
    // It looks like S3 ACLs only GRANT permissions rather than
    // denying them, so this should be safe.
    return true;

  // no policy decision: fall back to the ACLs
  const auto perm = op_to_perm(op);

  return verify_bucket_permission_no_policy(dpp, s, user_acl, bucket_acl, perm);
}
+
+bool verify_bucket_permission(const DoutPrefixProvider* dpp,
+ req_state * const s,
+ const rgw_bucket& bucket,
+ RGWAccessControlPolicy * const user_acl,
+ RGWAccessControlPolicy * const bucket_acl,
+ const boost::optional<Policy>& bucket_policy,
+ const vector<Policy>& user_policies,
+ const vector<Policy>& session_policies,
+ const uint64_t op)
+{
+ perm_state_from_req_state ps(s);
+ return verify_bucket_permission(dpp, &ps, bucket,
+ user_acl, bucket_acl,
+ bucket_policy, user_policies,
+ session_policies, op);
+}
+
// ACL-only bucket permission check: consult the bucket ACL first
// (honoring the request's perm_mask and the public-access-block
// configuration), then fall back to the user ACL.
bool verify_bucket_permission_no_policy(const DoutPrefixProvider* dpp, struct perm_state_base * const s,
                                        RGWAccessControlPolicy * const user_acl,
                                        RGWAccessControlPolicy * const bucket_acl,
                                        const int perm)
{
  if (!bucket_acl)
    return false;

  // the requested permission must be fully covered by the request's mask
  if ((perm & (int)s->perm_mask) != perm)
    return false;

  if (bucket_acl->verify_permission(dpp, *s->identity, perm, perm,
                                    s->get_referer(),
                                    s->bucket_access_conf &&
                                    s->bucket_access_conf->ignore_public_acls()))
    return true;

  if (!user_acl)
    return false;

  return user_acl->verify_permission(dpp, *s->identity, perm, perm);
}
+
+bool verify_bucket_permission_no_policy(const DoutPrefixProvider* dpp, req_state * const s,
+ RGWAccessControlPolicy * const user_acl,
+ RGWAccessControlPolicy * const bucket_acl,
+ const int perm)
+{
+ perm_state_from_req_state ps(s);
+ return verify_bucket_permission_no_policy(dpp,
+ &ps,
+ user_acl,
+ bucket_acl,
+ perm);
+}
+
+bool verify_bucket_permission_no_policy(const DoutPrefixProvider* dpp, req_state * const s, const int perm)
+{
+ perm_state_from_req_state ps(s);
+
+ if (!verify_requester_payer_permission(&ps))
+ return false;
+
+ return verify_bucket_permission_no_policy(dpp,
+ &ps,
+ s->user_acl.get(),
+ s->bucket_acl.get(),
+ perm);
+}
+
+bool verify_bucket_permission(const DoutPrefixProvider* dpp, req_state * const s, const uint64_t op)
+{
+ if (rgw::sal::Bucket::empty(s->bucket)) {
+ // request is missing a bucket name
+ return false;
+ }
+
+ perm_state_from_req_state ps(s);
+
+ return verify_bucket_permission(dpp,
+ &ps,
+ s->bucket->get_key(),
+ s->user_acl.get(),
+ s->bucket_acl.get(),
+ s->iam_policy,
+ s->iam_user_policies,
+ s->session_policies,
+ op);
+}
+
// Authorize anyone permitted by the bucket policy, identity policies, session policies and the bucket owner
// unless explicitly denied by the policy.

// Returns 0 when authorized, -EACCES otherwise.
int verify_bucket_owner_or_policy(req_state* const s,
                                  const uint64_t op)
{
  auto identity_policy_res = eval_identity_or_session_policies(s, s->iam_user_policies, s->env, op, ARN(s->bucket->get_key()));
  if (identity_policy_res == Effect::Deny) {
    return -EACCES;
  }

  rgw::IAM::PolicyPrincipal princ_type = rgw::IAM::PolicyPrincipal::Other;
  auto e = eval_or_pass(s, s->iam_policy,
                        s->env, *s->auth.identity,
                        op, ARN(s->bucket->get_key()), princ_type);
  if (e == Effect::Deny) {
    return -EACCES;
  }

  if (!s->session_policies.empty()) {
    auto session_policy_res = eval_identity_or_session_policies(s, s->session_policies, s->env, op,
                                                                ARN(s->bucket->get_key()));
    if (session_policy_res == Effect::Deny) {
      return -EACCES;
    }
    if (princ_type == rgw::IAM::PolicyPrincipal::Role) {
      //Intersection of session policy and identity policy plus intersection of session policy and bucket policy
      if ((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) ||
          (session_policy_res == Effect::Allow && e == Effect::Allow))
        return 0;
    } else if (princ_type == rgw::IAM::PolicyPrincipal::Session) {
      //Intersection of session policy and identity policy plus bucket policy
      if ((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) || e == Effect::Allow)
        return 0;
    } else if (princ_type == rgw::IAM::PolicyPrincipal::Other) {// there was no match in the bucket policy
      if (session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow)
        return 0;
    }
    return -EACCES;
  }

  // no session policies: allow on an explicit policy Allow, or -- when
  // every policy passed -- when the requester owns the bucket
  if (e == Effect::Allow ||
      identity_policy_res == Effect::Allow ||
      (e == Effect::Pass &&
       identity_policy_res == Effect::Pass &&
       s->auth.identity->is_owner_of(s->bucket_owner.get_id()))) {
    return 0;
  } else {
    return -EACCES;
  }
}
+
+
+static inline bool check_deferred_bucket_perms(const DoutPrefixProvider* dpp,
+ struct perm_state_base * const s,
+ const rgw_bucket& bucket,
+ RGWAccessControlPolicy * const user_acl,
+ RGWAccessControlPolicy * const bucket_acl,
+ const boost::optional<Policy>& bucket_policy,
+ const vector<Policy>& identity_policies,
+ const vector<Policy>& session_policies,
+ const uint8_t deferred_check,
+ const uint64_t op)
+{
+ return (s->defer_to_bucket_acls == deferred_check \
+ && verify_bucket_permission(dpp, s, bucket, user_acl, bucket_acl, bucket_policy, identity_policies, session_policies,op));
+}
+
+static inline bool check_deferred_bucket_only_acl(const DoutPrefixProvider* dpp,
+ struct perm_state_base * const s,
+ RGWAccessControlPolicy * const user_acl,
+ RGWAccessControlPolicy * const bucket_acl,
+ const uint8_t deferred_check,
+ const int perm)
+{
+ return (s->defer_to_bucket_acls == deferred_check \
+ && verify_bucket_permission_no_policy(dpp, s, user_acl, bucket_acl, perm));
+}
+
/*
 * Full object-level authorization for 'op': requester-pays gate,
 * identity policies, bucket policy (evaluated against the object ARN),
 * session-policy intersection, deferred bucket-ACL modes, the object
 * ACL, and finally -- when Swift ACL enforcement is enabled -- the
 * bucket/user ACLs mapped onto Swift read/write permissions.
 */
bool verify_object_permission(const DoutPrefixProvider* dpp, struct perm_state_base * const s,
                              const rgw_obj& obj,
                              RGWAccessControlPolicy * const user_acl,
                              RGWAccessControlPolicy * const bucket_acl,
                              RGWAccessControlPolicy * const object_acl,
                              const boost::optional<Policy>& bucket_policy,
                              const vector<Policy>& identity_policies,
                              const vector<Policy>& session_policies,
                              const uint64_t op)
{
  if (!verify_requester_payer_permission(s))
    return false;

  auto identity_policy_res = eval_identity_or_session_policies(dpp, identity_policies, s->env, op, ARN(obj));
  if (identity_policy_res == Effect::Deny)
    return false;

  rgw::IAM::PolicyPrincipal princ_type = rgw::IAM::PolicyPrincipal::Other;
  auto r = eval_or_pass(dpp, bucket_policy, s->env, *s->identity, op, ARN(obj), princ_type);
  if (r == Effect::Deny)
    return false;

  if (!session_policies.empty()) {
    auto session_policy_res = eval_identity_or_session_policies(dpp, session_policies, s->env, op, ARN(obj));
    if (session_policy_res == Effect::Deny) {
      return false;
    }
    if (princ_type == rgw::IAM::PolicyPrincipal::Role) {
      //Intersection of session policy and identity policy plus intersection of session policy and bucket policy
      if ((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) ||
          (session_policy_res == Effect::Allow && r == Effect::Allow))
        return true;
    } else if (princ_type == rgw::IAM::PolicyPrincipal::Session) {
      //Intersection of session policy and identity policy plus bucket policy
      if ((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) || r == Effect::Allow)
        return true;
    } else if (princ_type == rgw::IAM::PolicyPrincipal::Other) {// there was no match in the bucket policy
      if (session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow)
        return true;
    }
    return false;
  }

  if (r == Effect::Allow || identity_policy_res == Effect::Allow)
    // It looks like S3 ACLs only GRANT permissions rather than
    // denying them, so this should be safe.
    return true;

  // no policy decision: fall back to ACL evaluation
  const auto perm = op_to_perm(op);

  if (check_deferred_bucket_perms(dpp, s, obj.bucket, user_acl, bucket_acl, bucket_policy,
                                  identity_policies, session_policies, RGW_DEFER_TO_BUCKET_ACLS_RECURSE, op) ||
      check_deferred_bucket_perms(dpp, s, obj.bucket, user_acl, bucket_acl, bucket_policy,
                                  identity_policies, session_policies, RGW_DEFER_TO_BUCKET_ACLS_FULL_CONTROL, rgw::IAM::s3All)) {
    return true;
  }

  if (!object_acl) {
    return false;
  }

  bool ret = object_acl->verify_permission(dpp, *s->identity, s->perm_mask, perm,
                                           nullptr, /* http_referrer */
                                           s->bucket_access_conf &&
                                           s->bucket_access_conf->ignore_public_acls());
  if (ret) {
    return true;
  }

  if (!s->cct->_conf->rgw_enforce_swift_acls)
    return ret;

  if ((perm & (int)s->perm_mask) != perm)
    return false;

  // map the requested S3 permission bits onto Swift object permissions
  int swift_perm = 0;
  if (perm & (RGW_PERM_READ | RGW_PERM_READ_ACP))
    swift_perm |= RGW_PERM_READ_OBJS;
  if (perm & RGW_PERM_WRITE)
    swift_perm |= RGW_PERM_WRITE_OBJS;

  if (!swift_perm)
    return false;

  /* we already verified the user mask above, so we pass swift_perm as the mask here,
     otherwise the mask might not cover the swift permissions bits */
  if (bucket_acl->verify_permission(dpp, *s->identity, swift_perm, swift_perm,
                                    s->get_referer()))
    return true;

  if (!user_acl)
    return false;

  return user_acl->verify_permission(dpp, *s->identity, swift_perm, swift_perm);
}
+
+bool verify_object_permission(const DoutPrefixProvider* dpp, req_state * const s,
+ const rgw_obj& obj,
+ RGWAccessControlPolicy * const user_acl,
+ RGWAccessControlPolicy * const bucket_acl,
+ RGWAccessControlPolicy * const object_acl,
+ const boost::optional<Policy>& bucket_policy,
+ const vector<Policy>& identity_policies,
+ const vector<Policy>& session_policies,
+ const uint64_t op)
+{
+ perm_state_from_req_state ps(s);
+ return verify_object_permission(dpp, &ps, obj,
+ user_acl, bucket_acl,
+ object_acl, bucket_policy,
+ identity_policies, session_policies, op);
+}
+
/*
 * ACL-only object authorization (no IAM policies): deferred bucket-ACL
 * modes first, then the object ACL, then -- when Swift ACL enforcement
 * is enabled -- the bucket/user ACLs mapped onto Swift permissions.
 */
bool verify_object_permission_no_policy(const DoutPrefixProvider* dpp,
                                        struct perm_state_base * const s,
                                        RGWAccessControlPolicy * const user_acl,
                                        RGWAccessControlPolicy * const bucket_acl,
                                        RGWAccessControlPolicy * const object_acl,
                                        const int perm)
{
  if (check_deferred_bucket_only_acl(dpp, s, user_acl, bucket_acl, RGW_DEFER_TO_BUCKET_ACLS_RECURSE, perm) ||
      check_deferred_bucket_only_acl(dpp, s, user_acl, bucket_acl, RGW_DEFER_TO_BUCKET_ACLS_FULL_CONTROL, RGW_PERM_FULL_CONTROL)) {
    return true;
  }

  if (!object_acl) {
    return false;
  }

  bool ret = object_acl->verify_permission(dpp, *s->identity, s->perm_mask, perm,
                                           nullptr, /* http referrer */
                                           s->bucket_access_conf &&
                                           s->bucket_access_conf->ignore_public_acls());
  if (ret) {
    return true;
  }

  if (!s->cct->_conf->rgw_enforce_swift_acls)
    return ret;

  if ((perm & (int)s->perm_mask) != perm)
    return false;

  // map the requested S3 permission bits onto Swift object permissions
  int swift_perm = 0;
  if (perm & (RGW_PERM_READ | RGW_PERM_READ_ACP))
    swift_perm |= RGW_PERM_READ_OBJS;
  if (perm & RGW_PERM_WRITE)
    swift_perm |= RGW_PERM_WRITE_OBJS;

  if (!swift_perm)
    return false;

  /* we already verified the user mask above, so we pass swift_perm as the mask here,
     otherwise the mask might not cover the swift permissions bits */
  if (bucket_acl->verify_permission(dpp, *s->identity, swift_perm, swift_perm,
                                    s->get_referer()))
    return true;

  if (!user_acl)
    return false;

  return user_acl->verify_permission(dpp, *s->identity, swift_perm, swift_perm);
}
+
+bool verify_object_permission_no_policy(const DoutPrefixProvider* dpp, req_state *s, int perm)
+{
+ perm_state_from_req_state ps(s);
+
+ if (!verify_requester_payer_permission(&ps))
+ return false;
+
+ return verify_object_permission_no_policy(dpp,
+ &ps,
+ s->user_acl.get(),
+ s->bucket_acl.get(),
+ s->object_acl.get(),
+ perm);
+}
+
+bool verify_object_permission(const DoutPrefixProvider* dpp, req_state *s, uint64_t op)
+{
+ perm_state_from_req_state ps(s);
+
+ return verify_object_permission(dpp,
+ &ps,
+ rgw_obj(s->bucket->get_key(), s->object->get_key()),
+ s->user_acl.get(),
+ s->bucket_acl.get(),
+ s->object_acl.get(),
+ s->iam_policy,
+ s->iam_user_policies,
+ s->session_policies,
+ op);
+}
+
+
/*
 * Enforce S3 object-lock on a delete/overwrite attempt.
 * Returns 0 when allowed; -EACCES when an unexpired retention period
 * blocks it (GOVERNANCE mode may be bypassed only when both bypass
 * flags are set) or a legal hold is enabled; -EIO when a lock
 * attribute fails to decode.
 */
int verify_object_lock(const DoutPrefixProvider* dpp, const rgw::sal::Attrs& attrs, const bool bypass_perm, const bool bypass_governance_mode) {
  auto aiter = attrs.find(RGW_ATTR_OBJECT_RETENTION);
  if (aiter != attrs.end()) {
    RGWObjectRetention obj_retention;
    try {
      decode(obj_retention, aiter->second);
    } catch (buffer::error& err) {
      ldpp_dout(dpp, 0) << "ERROR: failed to decode RGWObjectRetention" << dendl;
      return -EIO;
    }
    // retention still in force?
    if (ceph::real_clock::to_time_t(obj_retention.get_retain_until_date()) > ceph_clock_now()) {
      // only GOVERNANCE mode with both bypass flags set may proceed
      if (obj_retention.get_mode().compare("GOVERNANCE") != 0 || !bypass_perm || !bypass_governance_mode) {
        return -EACCES;
      }
    }
  }
  aiter = attrs.find(RGW_ATTR_OBJECT_LEGAL_HOLD);
  if (aiter != attrs.end()) {
    RGWObjectLegalHold obj_legal_hold;
    try {
      decode(obj_legal_hold, aiter->second);
    } catch (buffer::error& err) {
      ldpp_dout(dpp, 0) << "ERROR: failed to decode RGWObjectLegalHold" << dendl;
      return -EIO;
    }
    // a legal hold cannot be bypassed
    if (obj_legal_hold.is_enabled()) {
      return -EACCES;
    }
  }

  return 0;
}
+
+
/*
 * 256-entry lookup table mapping ASCII hex digits to their numeric
 * values; every other byte maps to (char)-1.
 */
class HexTable
{
  char table[256];

public:
  HexTable() {
    // FIPS zeroization audit 20191115: this memset is not security related.
    memset(table, -1, sizeof(table));
    int i;
    for (i = '0'; i <= '9'; i++)
      table[i] = i - '0';
    for (i = 'A'; i <= 'F'; i++)
      table[i] = i - 'A' + 0xa;
    for (i = 'a'; i <= 'f'; i++)
      table[i] = i - 'a' + 0xa;
  }

  // Map a hex digit to its value, or (char)-1 for non-hex input.
  // Index through unsigned char: the previous (int)c cast produced a
  // negative index -- an out-of-bounds read -- for high-bit input bytes
  // on platforms where plain char is signed.
  char to_num(char c) {
    return table[static_cast<unsigned char>(c)];
  }
};
+
// Convert a hex digit to its value via a lazily-constructed lookup
// table; non-hex input yields (char)-1. NOTE(review): callers detect
// failure with "< 0", which assumes plain char is signed -- confirm
// behavior on platforms where char is unsigned.
static char hex_to_num(char c)
{
  static HexTable hex_table;
  return hex_table.to_num(c);
}
+
/*
 * Percent-decode src_str. While in query context (in_query true, or
 * after a '?' is seen) '+' decodes to a space. Returns the empty
 * string when a malformed %XX escape is encountered; a truncated
 * trailing escape is silently dropped.
 */
std::string url_decode(const std::string_view& src_str, bool in_query)
{
  // Decode one hex digit, -1 for non-hex input. Using int (not char)
  // keeps the "invalid" sentinel negative even on platforms where plain
  // char is unsigned, where the old char-based check could never fire
  // and malformed escapes decoded to garbage instead of being rejected.
  const auto hex_val = [](char c) -> int {
    if (c >= '0' && c <= '9') return c - '0';
    if (c >= 'A' && c <= 'F') return c - 'A' + 0xa;
    if (c >= 'a' && c <= 'f') return c - 'a' + 0xa;
    return -1;
  };

  std::string dest_str;
  dest_str.reserve(src_str.length() + 1);

  for (auto src = std::begin(src_str); src != std::end(src_str); ++src) {
    if (*src != '%') {
      if (!in_query || *src != '+') {
        if (*src == '?') {
          in_query = true;
        }
        dest_str.push_back(*src);
      } else {
        dest_str.push_back(' ');
      }
    } else {
      /* need at least "%XX" (3 characters) remaining */
      if (std::distance(src, std::end(src_str)) < 3) {
        break;
      }

      src++;
      const int c1 = hex_val(*src++);
      const int c2 = hex_val(*src);
      if (c1 < 0 || c2 < 0) {
        // malformed escape: reject the whole input
        return std::string();
      } else {
        dest_str.push_back(char(c1 << 4 | c2));
      }
    }
  }

  return dest_str;
}
+
// Append the "%XX" (uppercase hex) escape for byte c to dst.
void rgw_uri_escape_char(char c, string& dst)
{
  static const char hex_digits[] = "0123456789ABCDEF";
  const unsigned char byte = static_cast<unsigned char>(c);
  dst.push_back('%');
  dst.push_back(hex_digits[byte >> 4]);
  dst.push_back(hex_digits[byte & 0x0F]);
}
+
// True when c must be percent-encoded in a URI: control characters,
// space, non-ASCII bytes, and the reserved/unsafe punctuation set.
static bool char_needs_url_encoding(char c)
{
  // controls, space, DEL and anything with the high bit set (signed
  // chars compare <= 0x20 for high-bit bytes)
  if (c <= 0x20 || c >= 0x7f)
    return true;

  static const std::string_view specials = "\"#%&+,/:;<>=?@[]\\^`{}";
  return specials.find(c) != std::string_view::npos;
}
+
+void url_encode(const string& src, string& dst, bool encode_slash)
+{
+ const char *p = src.c_str();
+ for (unsigned i = 0; i < src.size(); i++, p++) {
+ if ((!encode_slash && *p == 0x2F) || !char_needs_url_encoding(*p)) {
+ dst.append(p, 1);
+ }else {
+ rgw_uri_escape_char(*p, dst);
+ }
+ }
+}
+
+std::string url_encode(const std::string& src, bool encode_slash)
+{
+ std::string dst;
+ url_encode(src, dst, encode_slash);
+
+ return dst;
+}
+
// Strip the first occurrence of one known URL prefix ("http://",
// "https://" or "www.", tried in that order) from url.
std::string url_remove_prefix(const std::string& url)
{
  static const struct { const char *text; size_t len; } prefixes[] = {
    { "http://", 7 },
    { "https://", 8 },
    { "www.", 4 },
  };

  std::string result = url;
  for (const auto& prefix : prefixes) {
    const auto pos = result.find(prefix.text);
    if (pos != std::string::npos) {
      result.erase(pos, prefix.len);
      break;
    }
  }
  return result;
}
+
/*
 * Return a copy of src with leading and trailing whitespace removed.
 *
 * Fix: isspace() is now fed the character cast to unsigned char —
 * passing a plain (possibly negative) char, e.g. a UTF-8 byte, is
 * undefined behavior per the C standard.
 */
std::string rgw_trim_whitespace(const std::string& src)
{
  if (src.empty()) {
    return std::string();
  }

  int start = 0;
  for (; start != (int)src.size(); start++) {
    if (!isspace(static_cast<unsigned char>(src[start])))
      break;
  }

  int end = src.size() - 1;
  if (end < start) {
    return std::string();  // all-whitespace input
  }

  for (; end > start; end--) {
    if (!isspace(static_cast<unsigned char>(src[end])))
      break;
  }

  return src.substr(start, end - start + 1);
}
+
/*
 * string_view overload: shrink the view until neither end is
 * whitespace; no allocation.
 *
 * Fix: std::isspace() is given an unsigned char — passing a negative
 * char is undefined behavior.
 */
std::string_view rgw_trim_whitespace(const std::string_view& src)
{
  std::string_view res = src;

  while (res.size() > 0 &&
         std::isspace(static_cast<unsigned char>(res.front()))) {
    res.remove_prefix(1);
  }
  while (res.size() > 0 &&
         std::isspace(static_cast<unsigned char>(res.back()))) {
    res.remove_suffix(1);
  }
  return res;
}
+
+string rgw_trim_quotes(const string& val)
+{
+ string s = rgw_trim_whitespace(val);
+ if (s.size() < 2)
+ return s;
+
+ int start = 0;
+ int end = s.size() - 1;
+ int quotes_count = 0;
+
+ if (s[start] == '"') {
+ start++;
+ quotes_count++;
+ }
+ if (s[end] == '"') {
+ end--;
+ quotes_count++;
+ }
+ if (quotes_count == 2) {
+ return s.substr(start, end - start + 1);
+ }
+ return s;
+}
+
/* Name → capability-bit table used when parsing user cap permission
 * strings ("read", "write", or "*" for everything); NULL-terminated. */
static struct rgw_name_to_flag cap_names[] = { {"*", RGW_CAP_ALL},
                  {"read", RGW_CAP_READ},
                  {"write", RGW_CAP_WRITE},
                  {NULL, 0} };

/* Split str with get_str_list() and OR together the bits of every
 * element found in `mapping`. Unrecognized names are silently ignored;
 * always returns 0. */
static int rgw_parse_list_of_flags(struct rgw_name_to_flag *mapping,
                                   const string& str, uint32_t *perm)
{
  list<string> strs;
  get_str_list(str, strs);
  list<string>::iterator iter;
  uint32_t v = 0;
  for (iter = strs.begin(); iter != strs.end(); ++iter) {
    string& s = *iter;
    for (int i = 0; mapping[i].type_name; i++) {
      if (s.compare(mapping[i].type_name) == 0)
        v |= mapping[i].flag;
    }
  }

  *perm = v;
  return 0;
}
+
// Parse a cap permission list ("read", "write", "*", separated per
// get_str_list) into RGW_CAP_* bits.
int RGWUserCaps::parse_cap_perm(const string& str, uint32_t *perm)
{
  return rgw_parse_list_of_flags(cap_names, str, perm);
}

// Split a "type=perm" cap spec into its type name and permission bits.
// NOTE(review): when the spec has no '=', `type` is left untouched (the
// caller's incoming value is what gets validated) and, because pos is
// then -1, the whole spec is parsed as the permission string —
// presumably callers always pass "type=perm"; confirm before relying on
// a bare-type form.
int RGWUserCaps::get_cap(const string& cap, string& type, uint32_t *pperm)
{
  int pos = cap.find('=');
  if (pos >= 0) {
    type = rgw_trim_whitespace(cap.substr(0, pos));
  }

  if (!is_valid_cap_type(type))
    return -ERR_INVALID_CAP;

  string cap_perm;
  uint32_t perm = 0;
  if (pos < (int)cap.size() - 1) {
    cap_perm = cap.substr(pos + 1);
    int r = RGWUserCaps::parse_cap_perm(cap_perm, &perm);
    if (r < 0)
      return r;
  }

  *pperm = perm;

  return 0;
}
+
+int RGWUserCaps::add_cap(const string& cap)
+{
+ uint32_t perm;
+ string type;
+
+ int r = get_cap(cap, type, &perm);
+ if (r < 0)
+ return r;
+
+ caps[type] |= perm;
+
+ return 0;
+}
+
+int RGWUserCaps::remove_cap(const string& cap)
+{
+ uint32_t perm;
+ string type;
+
+ int r = get_cap(cap, type, &perm);
+ if (r < 0)
+ return r;
+
+ map<string, uint32_t>::iterator iter = caps.find(type);
+ if (iter == caps.end())
+ return 0;
+
+ uint32_t& old_perm = iter->second;
+ old_perm &= ~perm;
+ if (!old_perm)
+ caps.erase(iter);
+
+ return 0;
+}
+
+int RGWUserCaps::add_from_string(const string& str)
+{
+ int start = 0;
+ do {
+ auto end = str.find(';', start);
+ if (end == string::npos)
+ end = str.size();
+
+ int r = add_cap(str.substr(start, end - start));
+ if (r < 0)
+ return r;
+
+ start = end + 1;
+ } while (start < (int)str.size());
+
+ return 0;
+}
+
+int RGWUserCaps::remove_from_string(const string& str)
+{
+ int start = 0;
+ do {
+ auto end = str.find(';', start);
+ if (end == string::npos)
+ end = str.size();
+
+ int r = remove_cap(str.substr(start, end - start));
+ if (r < 0)
+ return r;
+
+ start = end + 1;
+ } while (start < (int)str.size());
+
+ return 0;
+}
+
+void RGWUserCaps::dump(Formatter *f) const
+{
+ dump(f, "caps");
+}
+
+void RGWUserCaps::dump(Formatter *f, const char *name) const
+{
+ f->open_array_section(name);
+ map<string, uint32_t>::const_iterator iter;
+ for (iter = caps.begin(); iter != caps.end(); ++iter)
+ {
+ f->open_object_section("cap");
+ f->dump_string("type", iter->first);
+ uint32_t perm = iter->second;
+ string perm_str;
+ for (int i=0; cap_names[i].type_name; i++) {
+ if ((perm & cap_names[i].flag) == cap_names[i].flag) {
+ if (perm_str.size())
+ perm_str.append(", ");
+
+ perm_str.append(cap_names[i].type_name);
+ perm &= ~cap_names[i].flag;
+ }
+ }
+ if (perm_str.empty())
+ perm_str = "<none>";
+
+ f->dump_string("perm", perm_str);
+ f->close_section();
+ }
+
+ f->close_section();
+}
+
// Transient helper for decoding a single {"type": ..., "perm": ...}
// entry of the caps JSON array (see RGWUserCaps::decode_json below).
struct RGWUserCap {
  string type;   // cap type name, e.g. "buckets"
  uint32_t perm; // RGW_CAP_* bits parsed from the "perm" string

  void decode_json(JSONObj *obj) {
    JSONDecoder::decode_json("type", type, obj);
    string perm_str;
    JSONDecoder::decode_json("perm", perm_str, obj);
    if (RGWUserCaps::parse_cap_perm(perm_str, &perm) < 0) {
      throw JSONDecoder::err("failed to parse permissions");
    }
  }
};
+
+void RGWUserCaps::decode_json(JSONObj *obj)
+{
+ list<RGWUserCap> caps_list;
+ decode_json_obj(caps_list, obj);
+
+ list<RGWUserCap>::iterator iter;
+ for (iter = caps_list.begin(); iter != caps_list.end(); ++iter) {
+ RGWUserCap& cap = *iter;
+ caps[cap.type] = cap.perm;
+ }
+}
+
+int RGWUserCaps::check_cap(const string& cap, uint32_t perm) const
+{
+ auto iter = caps.find(cap);
+
+ if ((iter == caps.end()) ||
+ (iter->second & perm) != perm) {
+ return -EPERM;
+ }
+
+ return 0;
+}
+
+bool RGWUserCaps::is_valid_cap_type(const string& tp)
+{
+ static const char *cap_type[] = { "user",
+ "users",
+ "buckets",
+ "metadata",
+ "info",
+ "usage",
+ "zone",
+ "bilog",
+ "mdlog",
+ "datalog",
+ "roles",
+ "user-policy",
+ "amz-cache",
+ "oidc-provider",
+ "ratelimit"};
+
+ for (unsigned int i = 0; i < sizeof(cap_type) / sizeof(char *); ++i) {
+ if (tp.compare(cap_type[i]) == 0) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
// Parse "name[:ns]" (with '\\'-escaped colons) into pool name and
// namespace via rgw_unescape_str().
void rgw_pool::from_str(const string& s)
{
  size_t pos = rgw_unescape_str(s, 0, '\\', ':', &name);
  if (pos != string::npos) {
    pos = rgw_unescape_str(s, pos, '\\', ':', &ns);
    /* ignore return; if pos != string::npos it means that we had a colon
     * in the middle of ns that wasn't escaped, we're going to stop there
     */
  }
}

// Inverse of from_str(): emit "name[:ns]" with colons escaped so the
// round trip is lossless.
string rgw_pool::to_str() const
{
  string esc_name;
  rgw_escape_str(name, '\\', ':', &esc_name);
  if (ns.empty()) {
    return esc_name;
  }
  string esc_ns;
  rgw_escape_str(ns, '\\', ':', &esc_ns);
  return esc_name + ":" + esc_ns;
}

// Build this raw obj from an encoded rgw_obj: decode the old struct,
// then derive oid/locator and take its explicit data pool.
void rgw_raw_obj::decode_from_rgw_obj(bufferlist::const_iterator& bl)
{
  using ceph::decode;
  rgw_obj old_obj;
  decode(old_obj, bl);

  get_obj_bucket_and_oid_loc(old_obj, oid, loc);
  pool = old_obj.get_explicit_data_pool();
}
+
/* Name → operation-type bit table ("read"/"write"/"delete", "*" for
 * all); NULL-terminated, consumed by rgw_parse_list_of_flags(). */
static struct rgw_name_to_flag op_type_mapping[] = { {"*", RGW_OP_TYPE_ALL},
                  {"read", RGW_OP_TYPE_READ},
                  {"write", RGW_OP_TYPE_WRITE},
                  {"delete", RGW_OP_TYPE_DELETE},
                  {NULL, 0} };


// Parse a list of op-type names into RGW_OP_TYPE_* bits.
int rgw_parse_op_type_list(const string& str, uint32_t *perm)
{
  return rgw_parse_list_of_flags(op_type_mapping, str, perm);
}
+
// Match an IAM policy pattern against an input string. Action and ARN
// matches are case-insensitive. Unless matching a resource or plain
// string, ':' is treated as a segment separator: each colon-delimited
// segment of the pattern must match the corresponding input segment,
// and both sides must have the same number of segments.
bool match_policy(std::string_view pattern, std::string_view input,
                  uint32_t flag)
{
  const uint32_t flag2 = flag & (MATCH_POLICY_ACTION|MATCH_POLICY_ARN) ?
      MATCH_CASE_INSENSITIVE : 0;
  const bool colonblocks = !(flag & (MATCH_POLICY_RESOURCE |
				     MATCH_POLICY_STRING));

  const auto npos = std::string_view::npos;
  std::string_view::size_type last_pos_input = 0, last_pos_pattern = 0;
  while (true) {
    auto cur_pos_input = colonblocks ? input.find(":", last_pos_input) : npos;
    auto cur_pos_pattern =
      colonblocks ? pattern.find(":", last_pos_pattern) : npos;

    // NOTE(review): substr's second argument is a count, but the
    // absolute colon position is passed here, so each "segment" view
    // can extend past the next ':'. Both sides are sliced the same way,
    // and constructed cases appear to agree with per-segment matching,
    // but verify against upstream before tightening this.
    auto substr_input = input.substr(last_pos_input, cur_pos_input);
    auto substr_pattern = pattern.substr(last_pos_pattern, cur_pos_pattern);

    if (!match_wildcards(substr_pattern, substr_input, flag2))
      return false;

    // both exhausted → match; only one exhausted → segment-count mismatch
    if (cur_pos_pattern == npos)
      return cur_pos_input == npos;
    if (cur_pos_input == npos)
      return false;

    last_pos_pattern = cur_pos_pattern + 1;
    last_pos_input = cur_pos_input + 1;
  }
}
+
+/*
+ * make attrs look-like-this
+ * converts underscores to dashes
+ */
+string lowercase_dash_http_attr(const string& orig)
+{
+ const char *s = orig.c_str();
+ char buf[orig.size() + 1];
+ buf[orig.size()] = '\0';
+
+ for (size_t i = 0; i < orig.size(); ++i, ++s) {
+ switch (*s) {
+ case '_':
+ buf[i] = '-';
+ break;
+ default:
+ buf[i] = tolower(*s);
+ }
+ }
+ return string(buf);
+}
+
+/*
+ * make attrs Look-Like-This
+ * converts underscores to dashes
+ */
+string camelcase_dash_http_attr(const string& orig)
+{
+ const char *s = orig.c_str();
+ char buf[orig.size() + 1];
+ buf[orig.size()] = '\0';
+
+ bool last_sep = true;
+
+ for (size_t i = 0; i < orig.size(); ++i, ++s) {
+ switch (*s) {
+ case '_':
+ case '-':
+ buf[i] = '-';
+ last_sep = true;
+ break;
+ default:
+ if (last_sep) {
+ buf[i] = toupper(*s);
+ } else {
+ buf[i] = tolower(*s);
+ }
+ last_sep = false;
+ }
+ }
+ return string(buf);
+}
+
// Construction/destruction do no work beyond the members' own
// defaults; defined out-of-line in this TU.
RGWBucketInfo::RGWBucketInfo()
{
}

RGWBucketInfo::~RGWBucketInfo()
{
}
+
// Serialize bucket info (encoding version 23, compat 4). Field order
// and the conditional fields must stay in lockstep with decode() below.
void RGWBucketInfo::encode(bufferlist& bl) const {
  ENCODE_START(23, 4, bl);
  encode(bucket, bl);
  encode(owner.id, bl);  // historically the owner was a bare id string
  encode(flags, bl);
  encode(zonegroup, bl);
  // legacy second-resolution creation time (v6); superseded by the
  // full-resolution encode further down (v17)
  uint64_t ct = real_clock::to_time_t(creation_time);
  encode(ct, bl);
  encode(placement_rule, bl);
  encode(has_instance_obj, bl);
  encode(quota, bl);
  encode(requester_pays, bl);
  encode(owner.tenant, bl);
  encode(has_website, bl);
  if (has_website) {  // conditional: decoder checks has_website
    encode(website_conf, bl);
  }
  encode(swift_versioning, bl);
  if (swift_versioning) {  // conditional: decoder checks swift_versioning
    encode(swift_ver_location, bl);
  }
  encode(creation_time, bl);
  encode(mdsearch_config, bl);
  encode(reshard_status, bl);
  encode(new_bucket_instance_id, bl);
  if (obj_lock_enabled()) {  // conditional: decoder checks the same flag bit
    encode(obj_lock, bl);
  }
  bool has_sync_policy = !empty_sync_policy();
  encode(has_sync_policy, bl);
  if (has_sync_policy) {
    encode(*sync_policy, bl);
  }
  encode(layout, bl);
  encode(owner.ns, bl);  // v23: owner namespace
  ENCODE_FINISH(bl);
}
+
// Deserialize bucket info, accepting every historical encoding from
// v2 through v23. Each struct_v guard marks the version that added the
// field; see encode() for the current field order.
void RGWBucketInfo::decode(bufferlist::const_iterator& bl) {
  DECODE_START_LEGACY_COMPAT_LEN_32(23, 4, 4, bl);
  decode(bucket, bl);
  if (struct_v >= 2) {
    string s;
    decode(s, bl);
    owner.from_str(s);  // v2+: owner stored as a flat string
  }
  if (struct_v >= 3)
    decode(flags, bl);
  if (struct_v >= 5)
    decode(zonegroup, bl);
  if (struct_v >= 6) {
    uint64_t ct;
    decode(ct, bl);
    if (struct_v < 17)  // superseded by the full-resolution time at v17
      creation_time = ceph::real_clock::from_time_t((time_t)ct);
  }
  if (struct_v >= 7)
    decode(placement_rule, bl);
  if (struct_v >= 8)
    decode(has_instance_obj, bl);
  if (struct_v >= 9)
    decode(quota, bl);
  // index shard/hash/type fields moved into the `layout` struct at v22
  static constexpr uint8_t new_layout_v = 22;
  if (struct_v >= 10 && struct_v < new_layout_v)
    decode(layout.current_index.layout.normal.num_shards, bl);
  if (struct_v >= 11 && struct_v < new_layout_v)
    decode(layout.current_index.layout.normal.hash_type, bl);
  if (struct_v >= 12)
    decode(requester_pays, bl);
  if (struct_v >= 13)
    decode(owner.tenant, bl);
  if (struct_v >= 14) {
    decode(has_website, bl);
    if (has_website) {
      decode(website_conf, bl);
    } else {
      website_conf = RGWBucketWebsiteConf();  // reset to defaults
    }
  }
  if (struct_v >= 15 && struct_v < new_layout_v) {
    uint32_t it;
    decode(it, bl);
    layout.current_index.layout.type = (rgw::BucketIndexType)it;
  } else {
    layout.current_index.layout.type = rgw::BucketIndexType::Normal;
  }
  swift_versioning = false;
  swift_ver_location.clear();
  if (struct_v >= 16) {
    decode(swift_versioning, bl);
    if (swift_versioning) {
      decode(swift_ver_location, bl);
    }
  }
  if (struct_v >= 17) {
    decode(creation_time, bl);
  }
  if (struct_v >= 18) {
    decode(mdsearch_config, bl);
  }
  if (struct_v >= 19) {
    decode(reshard_status, bl);
    decode(new_bucket_instance_id, bl);
  }
  if (struct_v >= 20 && obj_lock_enabled()) {  // matches conditional encode
    decode(obj_lock, bl);
  }
  if (struct_v >= 21) {
    decode(sync_policy, bl);
  }
  if (struct_v >= 22) {
    decode(layout, bl);
  }
  if (struct_v >= 23) {
    decode(owner.ns, bl);
  }

  if (layout.logs.empty() &&
      layout.current_index.layout.type == rgw::BucketIndexType::Normal) {
    // pre-v22 encodings carry no log layout; synthesize one from the
    // current index so round-trips behave consistently
    layout.logs.push_back(rgw::log_layout_from_index(0, layout.current_index));
  }
  DECODE_FINISH(bl);
}
+
+void RGWBucketInfo::set_sync_policy(rgw_sync_policy_info&& policy)
+{
+ sync_policy = std::move(policy);
+}
+
+bool RGWBucketInfo::empty_sync_policy() const
+{
+ if (!sync_policy) {
+ return true;
+ }
+
+ return sync_policy->empty();
+}
+
/* Local forward declarations for the JSON helpers below. */
struct rgw_pool;
struct rgw_placement_rule;
class RGWUserCaps;

// Decode an rgw_pool from its JSON string form ("name[:ns]").
void decode_json_obj(rgw_pool& pool, JSONObj *obj)
{
  string s;
  decode_json_obj(s, obj);
  pool = rgw_pool(s);
}

// Encode a placement rule as its string form (via to_str()).
void encode_json(const char *name, const rgw_placement_rule& r, Formatter *f)
{
  encode_json(name, r.to_str(), f);
}

// Encode a pool as its escaped "name[:ns]" string form.
void encode_json(const char *name, const rgw_pool& pool, Formatter *f)
{
  f->dump_string(name, pool.to_str());
}

// Encode user caps as an array under the given section name.
void encode_json(const char *name, const RGWUserCaps& val, Formatter *f)
{
  val.dump(f, name);
}
+
// Produce one populated instance plus a default-constructed one for
// encode/decode round-trip tests.
void RGWBucketEnt::generate_test_instances(list<RGWBucketEnt*>& o)
{
  RGWBucketEnt *e = new RGWBucketEnt;
  init_bucket(&e->bucket, "tenant", "bucket", "pool", ".index_pool", "marker", "10");
  e->size = 1024;
  e->size_rounded = 4096;
  e->count = 1;
  o.push_back(e);
  o.push_back(new RGWBucketEnt);
}

// JSON dump of a bucket listing entry.
void RGWBucketEnt::dump(Formatter *f) const
{
  encode_json("bucket", bucket, f);
  encode_json("size", size, f);
  encode_json("size_rounded", size_rounded, f);
  utime_t ut(creation_time);
  encode_json("mtime", ut, f); /* mtime / creation time discrepancy needed for backward compatibility */
  encode_json("count", count, f);
  encode_json("placement_rule", placement_rule.to_str(), f);
}

// Test instances for rgw_obj: one real object plus a default one.
void rgw_obj::generate_test_instances(list<rgw_obj*>& o)
{
  rgw_bucket b;
  init_bucket(&b, "tenant", "bucket", "pool", ".index_pool", "marker", "10");
  rgw_obj *obj = new rgw_obj(b, "object");
  o.push_back(obj);
  o.push_back(new rgw_obj);
}

// JSON dump of a bucket + placement-rule pair.
void rgw_bucket_placement::dump(Formatter *f) const
{
  encode_json("bucket", bucket, f);
  encode_json("placement_rule", placement_rule, f);
}
+
// Test instances for RGWBucketInfo.
void RGWBucketInfo::generate_test_instances(list<RGWBucketInfo*>& o)
{
  // Since things without a log will have one synthesized on decode,
  // ensure the things we attempt to encode will have one added so we
  // round-trip properly.
  auto gen_layout = [](rgw::BucketLayout& l) {
    l.current_index.gen = 0;
    l.current_index.layout.normal.hash_type = rgw::BucketHashType::Mod;
    l.current_index.layout.type = rgw::BucketIndexType::Normal;
    l.current_index.layout.normal.num_shards = 11;
    l.logs.push_back(log_layout_from_index(
                       l.current_index.gen,
                       l.current_index));
  };


  RGWBucketInfo *i = new RGWBucketInfo;
  init_bucket(&i->bucket, "tenant", "bucket", "pool", ".index_pool", "marker", "10");
  i->owner = "owner";
  i->flags = BUCKET_SUSPENDED;
  gen_layout(i->layout);
  o.push_back(i);
  i = new RGWBucketInfo;
  gen_layout(i->layout);
  o.push_back(i);
}

// JSON dump of bucket info; optional sections (website conf, sync
// policy) are only emitted when present.
void RGWBucketInfo::dump(Formatter *f) const
{
  encode_json("bucket", bucket, f);
  utime_t ut(creation_time);
  encode_json("creation_time", ut, f);
  encode_json("owner", owner.to_str(), f);
  encode_json("flags", flags, f);
  encode_json("zonegroup", zonegroup, f);
  encode_json("placement_rule", placement_rule, f);
  encode_json("has_instance_obj", has_instance_obj, f);
  encode_json("quota", quota, f);
  encode_json("num_shards", layout.current_index.layout.normal.num_shards, f);
  encode_json("bi_shard_hash_type", (uint32_t)layout.current_index.layout.normal.hash_type, f);
  encode_json("requester_pays", requester_pays, f);
  encode_json("has_website", has_website, f);
  if (has_website) {
    encode_json("website_conf", website_conf, f);
  }
  encode_json("swift_versioning", swift_versioning, f);
  encode_json("swift_ver_location", swift_ver_location, f);
  encode_json("index_type", (uint32_t)layout.current_index.layout.type, f);
  encode_json("mdsearch_config", mdsearch_config, f);
  encode_json("reshard_status", (int)reshard_status, f);
  encode_json("new_bucket_instance_id", new_bucket_instance_id, f);
  if (!empty_sync_policy()) {
    encode_json("sync_policy", *sync_policy, f);
  }
}

// Inverse of dump(); must accept everything dump() emits.
void RGWBucketInfo::decode_json(JSONObj *obj) {
  JSONDecoder::decode_json("bucket", bucket, obj);
  utime_t ut;
  JSONDecoder::decode_json("creation_time", ut, obj);
  creation_time = ut.to_real_time();
  JSONDecoder::decode_json("owner", owner, obj);
  JSONDecoder::decode_json("flags", flags, obj);
  JSONDecoder::decode_json("zonegroup", zonegroup, obj);
  /* backward compatibility with region */
  if (zonegroup.empty()) {
    JSONDecoder::decode_json("region", zonegroup, obj);
  }
  string pr;
  JSONDecoder::decode_json("placement_rule", pr, obj);
  placement_rule.from_str(pr);
  JSONDecoder::decode_json("has_instance_obj", has_instance_obj, obj);
  JSONDecoder::decode_json("quota", quota, obj);
  JSONDecoder::decode_json("num_shards", layout.current_index.layout.normal.num_shards, obj);
  uint32_t hash_type;
  JSONDecoder::decode_json("bi_shard_hash_type", hash_type, obj);
  layout.current_index.layout.normal.hash_type = static_cast<rgw::BucketHashType>(hash_type);
  JSONDecoder::decode_json("requester_pays", requester_pays, obj);
  JSONDecoder::decode_json("has_website", has_website, obj);
  if (has_website) {
    JSONDecoder::decode_json("website_conf", website_conf, obj);
  }
  JSONDecoder::decode_json("swift_versioning", swift_versioning, obj);
  JSONDecoder::decode_json("swift_ver_location", swift_ver_location, obj);
  uint32_t it;
  JSONDecoder::decode_json("index_type", it, obj);
  layout.current_index.layout.type = (rgw::BucketIndexType)it;
  JSONDecoder::decode_json("mdsearch_config", mdsearch_config, obj);
  int rs;
  JSONDecoder::decode_json("reshard_status", rs, obj);
  reshard_status = (cls_rgw_reshard_status)rs;

  rgw_sync_policy_info sp;
  JSONDecoder::decode_json("sync_policy", sp, obj);
  if (!sp.empty()) {  // only install a policy when one was present
    set_sync_policy(std::move(sp));
  }
}
+
// Test instances for RGWUserInfo: one populated user plus a default.
void RGWUserInfo::generate_test_instances(list<RGWUserInfo*>& o)
{
  RGWUserInfo *i = new RGWUserInfo;
  i->user_id = "user_id";
  i->display_name = "display_name";
  i->user_email = "user@email";
  RGWAccessKey k1, k2;
  k1.id = "id1";
  k1.key = "key1";
  k2.id = "id2";
  k2.subuser = "subuser";
  RGWSubUser u;
  u.name = "id2";
  u.perm_mask = 0x1;
  i->access_keys[k1.id] = k1;
  i->swift_keys[k2.id] = k2;
  i->subusers[u.name] = u;
  o.push_back(i);

  o.push_back(new RGWUserInfo);
}

// encode_json_map() callbacks used by RGWUserInfo::dump(): each entry
// is rendered with the owning user id (passed through `parent`).
static void user_info_dump_subuser(const char *name, const RGWSubUser& subuser, Formatter *f, void *parent)
{
  RGWUserInfo *info = static_cast<RGWUserInfo *>(parent);
  subuser.dump(f, info->user_id.to_str());
}

static void user_info_dump_key(const char *name, const RGWAccessKey& key, Formatter *f, void *parent)
{
  RGWUserInfo *info = static_cast<RGWUserInfo *>(parent);
  key.dump(f, info->user_id.to_str(), false);
}

static void user_info_dump_swift_key(const char *name, const RGWAccessKey& key, Formatter *f, void *parent)
{
  RGWUserInfo *info = static_cast<RGWUserInfo *>(parent);
  key.dump(f, info->user_id.to_str(), true);
}

// JSONDecoder map callbacks: decode one entry and insert it keyed by
// its own id/name.
static void decode_access_keys(map<string, RGWAccessKey>& m, JSONObj *o)
{
  RGWAccessKey k;
  k.decode_json(o);
  m[k.id] = k;
}

static void decode_swift_keys(map<string, RGWAccessKey>& m, JSONObj *o)
{
  RGWAccessKey k;
  k.decode_json(o, true);
  m[k.id] = k;
}

static void decode_subusers(map<string, RGWSubUser>& m, JSONObj *o)
{
  RGWSubUser u;
  u.decode_json(o);
  m[u.name] = u;
}


// Generic bitmask → printable-name descriptor, used by mask_to_str().
struct rgw_flags_desc {
  uint32_t mask;
  const char *str;
};

/* ACL permission names, matched greedily top-down: composite masks
 * (full-control, read-write) come before their component bits.
 * Zero-mask entry terminates the table. */
static struct rgw_flags_desc rgw_perms[] = {
  { RGW_PERM_FULL_CONTROL, "full-control" },
  { RGW_PERM_READ | RGW_PERM_WRITE, "read-write" },
  { RGW_PERM_READ, "read" },
  { RGW_PERM_WRITE, "write" },
  { RGW_PERM_READ_ACP, "read-acp" },
  { RGW_PERM_WRITE_ACP, "write-acp" },
  { 0, NULL }
};
+
+void rgw_perm_to_str(uint32_t mask, char *buf, int len)
+{
+ const char *sep = "";
+ int pos = 0;
+ if (!mask) {
+ snprintf(buf, len, "<none>");
+ return;
+ }
+ while (mask) {
+ uint32_t orig_mask = mask;
+ for (int i = 0; rgw_perms[i].mask; i++) {
+ struct rgw_flags_desc *desc = &rgw_perms[i];
+ if ((mask & desc->mask) == desc->mask) {
+ pos += snprintf(buf + pos, len - pos, "%s%s", sep, desc->str);
+ if (pos == len)
+ return;
+ sep = ", ";
+ mask &= ~desc->mask;
+ if (!mask)
+ return;
+ }
+ }
+ if (mask == orig_mask) // no change
+ break;
+ }
+}
+
+uint32_t rgw_str_to_perm(const char *str)
+{
+ if (strcasecmp(str, "") == 0)
+ return RGW_PERM_NONE;
+ else if (strcasecmp(str, "read") == 0)
+ return RGW_PERM_READ;
+ else if (strcasecmp(str, "write") == 0)
+ return RGW_PERM_WRITE;
+ else if (strcasecmp(str, "readwrite") == 0)
+ return RGW_PERM_READ | RGW_PERM_WRITE;
+ else if (strcasecmp(str, "full") == 0)
+ return RGW_PERM_FULL_CONTROL;
+
+ return RGW_PERM_INVALID;
+}
+
/*
 * Render the bits of mask into buf as a comma-separated list of the
 * names in mask_list (a table of {mask, str} entries terminated by a
 * zero mask); writes "<none>" when mask is 0. Bits matched by no entry
 * are dropped via the orig_mask guard.
 *
 * Fix: truncation was only detected when pos landed exactly on len;
 * snprintf returns the would-be length, so pos could exceed len, making
 * len - pos negative — converted to a huge size_t on the next call and
 * overrunning buf.
 */
template <class T>
static void mask_to_str(T *mask_list, uint32_t mask, char *buf, int len)
{
  const char *sep = "";
  int pos = 0;
  if (!mask) {
    snprintf(buf, len, "<none>");
    return;
  }
  while (mask) {
    uint32_t orig_mask = mask;
    for (int i = 0; mask_list[i].mask; i++) {
      T *desc = &mask_list[i];
      if ((mask & desc->mask) == desc->mask) {
        pos += snprintf(buf + pos, len - pos, "%s%s", sep, desc->str);
        if (pos >= len) /* truncated: snprintf returns the would-be length */
          return;
        sep = ", ";
        mask &= ~desc->mask;
        if (!mask)
          return;
      }
    }
    if (mask == orig_mask) // no change: remaining bits match no entry
      break;
  }
}
+
// Render ACL permission bits via the generic mask_to_str() helper.
static void perm_to_str(uint32_t mask, char *buf, int len)
{
  return mask_to_str(rgw_perms, mask, buf, len);
}

/* Operation-type bit → name table; zero-mask entry terminates. */
static struct rgw_flags_desc op_type_flags[] = {
  { RGW_OP_TYPE_READ, "read" },
  { RGW_OP_TYPE_WRITE, "write" },
  { RGW_OP_TYPE_DELETE, "delete" },
  { 0, NULL }
};

// Render op-type bits ("read", "write", "delete") into buf.
void op_type_to_str(uint32_t mask, char *buf, int len)
{
  return mask_to_str(op_type_flags, mask, buf, len);
}
+
// JSON decode of a rate-limit configuration entry.
void RGWRateLimitInfo::decode_json(JSONObj *obj)
{
  JSONDecoder::decode_json("max_read_ops", max_read_ops, obj);
  JSONDecoder::decode_json("max_write_ops", max_write_ops, obj);
  JSONDecoder::decode_json("max_read_bytes", max_read_bytes, obj);
  JSONDecoder::decode_json("max_write_bytes", max_write_bytes, obj);
  JSONDecoder::decode_json("enabled", enabled, obj);
}

// JSON dump of a rate-limit configuration entry.
void RGWRateLimitInfo::dump(Formatter *f) const
{
  f->dump_int("max_read_ops", max_read_ops);
  f->dump_int("max_write_ops", max_write_ops);
  f->dump_int("max_read_bytes", max_read_bytes);
  f->dump_int("max_write_bytes", max_write_bytes);
  f->dump_bool("enabled", enabled);
}

// Dump the full user record as JSON (admin API format). Subusers and
// keys go through encode_json_map() callbacks so each entry can be
// qualified with the owning user id.
void RGWUserInfo::dump(Formatter *f) const
{

  encode_json("user_id", user_id.to_str(), f);
  encode_json("display_name", display_name, f);
  encode_json("email", user_email, f);
  encode_json("suspended", (int)suspended, f);
  encode_json("max_buckets", (int)max_buckets, f);

  encode_json_map("subusers", NULL, "subuser", NULL, user_info_dump_subuser,(void *)this, subusers, f);
  encode_json_map("keys", NULL, "key", NULL, user_info_dump_key,(void *)this, access_keys, f);
  encode_json_map("swift_keys", NULL, "key", NULL, user_info_dump_swift_key,(void *)this, swift_keys, f);

  encode_json("caps", caps, f);

  char buf[256];
  op_type_to_str(op_mask, buf, sizeof(buf));
  encode_json("op_mask", (const char *)buf, f);

  if (system) { /* no need to show it for every user */
    encode_json("system", (bool)system, f);
  }
  if (admin) {
    encode_json("admin", (bool)admin, f);
  }
  encode_json("default_placement", default_placement.name, f);
  encode_json("default_storage_class", default_placement.storage_class, f);
  encode_json("placement_tags", placement_tags, f);
  encode_json("bucket_quota", quota.bucket_quota, f);
  encode_json("user_quota", quota.user_quota, f);
  encode_json("temp_url_keys", temp_url_keys, f);

  // identity-backend type as a string; unknown values fall back to "none"
  string user_source_type;
  switch ((RGWIdentityType)type) {
  case TYPE_RGW:
    user_source_type = "rgw";
    break;
  case TYPE_KEYSTONE:
    user_source_type = "keystone";
    break;
  case TYPE_LDAP:
    user_source_type = "ldap";
    break;
  case TYPE_NONE:
    user_source_type = "none";
    break;
  default:
    user_source_type = "none";
    break;
  }
  encode_json("type", user_source_type, f);
  encode_json("mfa_ids", mfa_ids, f);
}
+
// Inverse of RGWUserInfo::dump(); "user_id" is the only mandatory
// field. Unrecognized "type" strings leave `type` unchanged.
void RGWUserInfo::decode_json(JSONObj *obj)
{
  string uid;

  JSONDecoder::decode_json("user_id", uid, obj, true);
  user_id.from_str(uid);

  JSONDecoder::decode_json("display_name", display_name, obj);
  JSONDecoder::decode_json("email", user_email, obj);
  bool susp = false;
  JSONDecoder::decode_json("suspended", susp, obj);
  suspended = (__u8)susp;
  JSONDecoder::decode_json("max_buckets", max_buckets, obj);

  JSONDecoder::decode_json("keys", access_keys, decode_access_keys, obj);
  JSONDecoder::decode_json("swift_keys", swift_keys, decode_swift_keys, obj);
  JSONDecoder::decode_json("subusers", subusers, decode_subusers, obj);

  JSONDecoder::decode_json("caps", caps, obj);

  string mask_str;
  JSONDecoder::decode_json("op_mask", mask_str, obj);
  rgw_parse_op_type_list(mask_str, &op_mask);

  bool sys = false;
  JSONDecoder::decode_json("system", sys, obj);
  system = (__u8)sys;
  bool ad = false;
  JSONDecoder::decode_json("admin", ad, obj);
  admin = (__u8)ad;
  JSONDecoder::decode_json("default_placement", default_placement.name, obj);
  JSONDecoder::decode_json("default_storage_class", default_placement.storage_class, obj);
  JSONDecoder::decode_json("placement_tags", placement_tags, obj);
  JSONDecoder::decode_json("bucket_quota", quota.bucket_quota, obj);
  JSONDecoder::decode_json("user_quota", quota.user_quota, obj);
  JSONDecoder::decode_json("temp_url_keys", temp_url_keys, obj);

  string user_source_type;
  JSONDecoder::decode_json("type", user_source_type, obj);
  if (user_source_type == "rgw") {
    type = TYPE_RGW;
  } else if (user_source_type == "keystone") {
    type = TYPE_KEYSTONE;
  } else if (user_source_type == "ldap") {
    type = TYPE_LDAP;
  } else if (user_source_type == "none") {
    type = TYPE_NONE;
  }
  JSONDecoder::decode_json("mfa_ids", mfa_ids, obj);
}


// Test instances for RGWSubUser.
void RGWSubUser::generate_test_instances(list<RGWSubUser*>& o)
{
  RGWSubUser *u = new RGWSubUser;
  u->name = "name";
  u->perm_mask = 0xf;
  o.push_back(u);
  o.push_back(new RGWSubUser);
}

// JSON dump of a subuser without a user-id prefix.
void RGWSubUser::dump(Formatter *f) const
{
  encode_json("id", name, f);
  char buf[256];
  perm_to_str(perm_mask, buf, sizeof(buf));
  encode_json("permissions", (const char *)buf, f);
}
+
+void RGWSubUser::dump(Formatter *f, const string& user) const
+{
+ string s = user;
+ s.append(":");
+ s.append(name);
+ encode_json("id", s, f);
+ char buf[256];
+ perm_to_str(perm_mask, buf, sizeof(buf));
+ encode_json("permissions", (const char *)buf, f);
+}
+
+uint32_t str_to_perm(const string& s)
+{
+ if (s.compare("read") == 0)
+ return RGW_PERM_READ;
+ else if (s.compare("write") == 0)
+ return RGW_PERM_WRITE;
+ else if (s.compare("read-write") == 0)
+ return RGW_PERM_READ | RGW_PERM_WRITE;
+ else if (s.compare("full-control") == 0)
+ return RGW_PERM_FULL_CONTROL;
+ return 0;
+}
+
+void RGWSubUser::decode_json(JSONObj *obj)
+{
+ string uid;
+ JSONDecoder::decode_json("id", uid, obj);
+ int pos = uid.find(':');
+ if (pos >= 0)
+ name = uid.substr(pos + 1);
+ string perm_str;
+ JSONDecoder::decode_json("permissions", perm_str, obj);
+ perm_mask = str_to_perm(perm_str);
+}
+
// Test instances for RGWAccessKey.
void RGWAccessKey::generate_test_instances(list<RGWAccessKey*>& o)
{
  RGWAccessKey *k = new RGWAccessKey;
  k->id = "id";
  k->key = "key";
  k->subuser = "subuser";
  o.push_back(k);
  o.push_back(new RGWAccessKey);
}

// Plain JSON dump: access key id, secret, and subuser.
void RGWAccessKey::dump(Formatter *f) const
{
  encode_json("access_key", id, f);
  encode_json("secret_key", key, f);
  encode_json("subuser", subuser, f);
}

// Dump without the subuser field.
void RGWAccessKey::dump_plain(Formatter *f) const
{
  encode_json("access_key", id, f);
  encode_json("secret_key", key, f);
}

// Dump qualified with the owning user ("user[:subuser]"); swift keys
// omit the access_key field.
void RGWAccessKey::dump(Formatter *f, const string& user, bool swift) const
{
  string u = user;
  if (!subuser.empty()) {
    u.append(":");
    u.append(subuser);
  }
  encode_json("user", u, f);
  if (!swift) {
    encode_json("access_key", id, f);
  }
  encode_json("secret_key", key, f);
}
+
+void RGWAccessKey::decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("access_key", id, obj, true);
+ JSONDecoder::decode_json("secret_key", key, obj, true);
+ if (!JSONDecoder::decode_json("subuser", subuser, obj)) {
+ string user;
+ JSONDecoder::decode_json("user", user, obj);
+ int pos = user.find(':');
+ if (pos >= 0) {
+ subuser = user.substr(pos + 1);
+ }
+ }
+}
+
+void RGWAccessKey::decode_json(JSONObj *obj, bool swift) {
+ if (!swift) {
+ decode_json(obj);
+ return;
+ }
+
+ if (!JSONDecoder::decode_json("subuser", subuser, obj)) {
+ JSONDecoder::decode_json("user", id, obj, true);
+ int pos = id.find(':');
+ if (pos >= 0) {
+ subuser = id.substr(pos + 1);
+ }
+ }
+ JSONDecoder::decode_json("secret_key", key, obj, true);
+}
+
// JSON dump of storage statistics; the *_utilized fields are only
// emitted when dump_utilized is set.
void RGWStorageStats::dump(Formatter *f) const
{
  encode_json("size", size, f);
  encode_json("size_actual", size_rounded, f);
  if (dump_utilized) {
    encode_json("size_utilized", size_utilized, f);
  }
  encode_json("size_kb", rgw_rounded_kb(size), f);
  encode_json("size_kb_actual", rgw_rounded_kb(size_rounded), f);
  if (dump_utilized) {
    encode_json("size_kb_utilized", rgw_rounded_kb(size_utilized), f);
  }
  encode_json("num_objects", num_objects, f);
}

// JSON round-trip helpers for object keys (name, version instance,
// namespace).
void rgw_obj_key::dump(Formatter *f) const
{
  encode_json("name", name, f);
  encode_json("instance", instance, f);
  encode_json("ns", ns, f);
}

void rgw_obj_key::decode_json(JSONObj *obj)
{
  JSONDecoder::decode_json("name", name, obj);
  JSONDecoder::decode_json("instance", instance, obj);
  JSONDecoder::decode_json("ns", ns, obj);
}

// JSON round-trip helpers for raw objects (pool, oid, locator).
void rgw_raw_obj::dump(Formatter *f) const
{
  encode_json("pool", pool, f);
  encode_json("oid", oid, f);
  encode_json("loc", loc, f);
}

void rgw_raw_obj::decode_json(JSONObj *obj) {
  JSONDecoder::decode_json("pool", pool, obj);
  JSONDecoder::decode_json("oid", oid, obj);
  JSONDecoder::decode_json("loc", loc, obj);
}

// JSON dump of a bucket-scoped object reference.
void rgw_obj::dump(Formatter *f) const
{
  encode_json("bucket", bucket, f);
  encode_json("key", key, f);
}
+
+int rgw_bucket_parse_bucket_instance(const string& bucket_instance, string *bucket_name, string *bucket_id, int *shard_id)
+{
+ auto pos = bucket_instance.rfind(':');
+ if (pos == string::npos) {
+ return -EINVAL;
+ }
+
+ string first = bucket_instance.substr(0, pos);
+ string second = bucket_instance.substr(pos + 1);
+
+ pos = first.find(':');
+
+ if (pos == string::npos) {
+ *shard_id = -1;
+ *bucket_name = first;
+ *bucket_id = second;
+ return 0;
+ }
+
+ *bucket_name = first.substr(0, pos);
+ *bucket_id = first.substr(pos + 1);
+
+ string err;
+ *shard_id = strict_strtol(second.c_str(), 10, &err);
+ if (!err.empty()) {
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
// Ceph global initialization for RGW processes. Config files are
// parsed first (global_pre_init) so the configured backend store is
// known; backends that don't talk to the Ceph monitors
// (dbstore/motr/daos) get CINIT_FLAG_NO_MON_CONFIG added before the
// final global_init() call.
boost::intrusive_ptr<CephContext>
rgw_global_init(const std::map<std::string,std::string> *defaults,
		    std::vector < const char* >& args,
		    uint32_t module_type, code_environment_t code_env,
		    int flags)
{
  // Load the config from the files, but not the mon
  global_pre_init(defaults, args, module_type, code_env, flags);

  // Get the store backend
  const auto& config_store = g_conf().get_val<std::string>("rgw_backend_store");

  if ((config_store == "dbstore") ||
      (config_store == "motr") ||
      (config_store == "daos")) {
    // These stores don't use the mon
    flags |= CINIT_FLAG_NO_MON_CONFIG;
  }

  // Finish global init, indicating we already ran pre-init
  return global_init(defaults, args, module_type, code_env, flags, false);
}
+
// Start a fresh write version: version number 1 plus a random
// alphanumeric tag of TAG_LEN characters.
void RGWObjVersionTracker::generate_new_write_ver(CephContext *cct)
{
  write_version.ver = 1;
#define TAG_LEN 24  // tag length; NOTE(review): macro leaks past this function

  write_version.tag.clear();
  append_rand_alpha(cct, write_version.tag, write_version.tag, TAG_LEN);
}
+
diff --git a/src/rgw/rgw_common.h b/src/rgw/rgw_common.h
new file mode 100644
index 000000000..648b2e087
--- /dev/null
+++ b/src/rgw/rgw_common.h
@@ -0,0 +1,1842 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2009 Sage Weil <sage@newdream.net>
+ * Copyright (C) 2015 Yehuda Sadeh <yehuda@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <array>
+#include <string_view>
+#include <atomic>
+#include <unordered_map>
+
+#include <fmt/format.h>
+
+#include "common/ceph_crypto.h"
+#include "common/random_string.h"
+#include "rgw_acl.h"
+#include "rgw_bucket_layout.h"
+#include "rgw_cors.h"
+#include "rgw_basic_types.h"
+#include "rgw_iam_policy.h"
+#include "rgw_quota_types.h"
+#include "rgw_string.h"
+#include "common/async/yield_context.h"
+#include "rgw_website.h"
+#include "rgw_object_lock.h"
+#include "rgw_tag.h"
+#include "rgw_op_type.h"
+#include "rgw_sync_policy.h"
+#include "cls/version/cls_version_types.h"
+#include "cls/user/cls_user_types.h"
+#include "cls/rgw/cls_rgw_types.h"
+#include "include/rados/librados.hpp"
+#include "rgw_public_access.h"
+#include "common/tracer.h"
+#include "rgw_sal_fwd.h"
+
+namespace ceph {
+ class Formatter;
+}
+
+namespace rgw::sal {
+ using Attrs = std::map<std::string, ceph::buffer::list>;
+}
+
+namespace rgw::lua {
+ class Background;
+}
+
+struct RGWProcessEnv;
+
+using ceph::crypto::MD5;
+
+#define RGW_ATTR_PREFIX "user.rgw."
+
+#define RGW_HTTP_RGWX_ATTR_PREFIX "RGWX_ATTR_"
+#define RGW_HTTP_RGWX_ATTR_PREFIX_OUT "Rgwx-Attr-"
+
+#define RGW_AMZ_PREFIX "x-amz-"
+#define RGW_AMZ_META_PREFIX RGW_AMZ_PREFIX "meta-"
+#define RGW_AMZ_WEBSITE_REDIRECT_LOCATION RGW_AMZ_PREFIX "website-redirect-location"
+#define RGW_AMZ_TAG_COUNT RGW_AMZ_PREFIX "tagging-count"
+
+#define RGW_SYS_PARAM_PREFIX "rgwx-"
+
+#define RGW_ATTR_ACL RGW_ATTR_PREFIX "acl"
+#define RGW_ATTR_RATELIMIT RGW_ATTR_PREFIX "ratelimit"
+#define RGW_ATTR_LC RGW_ATTR_PREFIX "lc"
+#define RGW_ATTR_CORS RGW_ATTR_PREFIX "cors"
+#define RGW_ATTR_ETAG RGW_ATTR_PREFIX "etag"
+#define RGW_ATTR_BUCKETS RGW_ATTR_PREFIX "buckets"
+#define RGW_ATTR_META_PREFIX RGW_ATTR_PREFIX RGW_AMZ_META_PREFIX
+#define RGW_ATTR_CONTENT_TYPE RGW_ATTR_PREFIX "content_type"
+#define RGW_ATTR_CACHE_CONTROL RGW_ATTR_PREFIX "cache_control"
+#define RGW_ATTR_CONTENT_DISP RGW_ATTR_PREFIX "content_disposition"
+#define RGW_ATTR_CONTENT_ENC RGW_ATTR_PREFIX "content_encoding"
+#define RGW_ATTR_CONTENT_LANG RGW_ATTR_PREFIX "content_language"
+#define RGW_ATTR_EXPIRES RGW_ATTR_PREFIX "expires"
+#define RGW_ATTR_DELETE_AT RGW_ATTR_PREFIX "delete_at"
+#define RGW_ATTR_ID_TAG RGW_ATTR_PREFIX "idtag"
+#define RGW_ATTR_TAIL_TAG RGW_ATTR_PREFIX "tail_tag"
+#define RGW_ATTR_SHADOW_OBJ RGW_ATTR_PREFIX "shadow_name"
+#define RGW_ATTR_MANIFEST RGW_ATTR_PREFIX "manifest"
+#define RGW_ATTR_USER_MANIFEST RGW_ATTR_PREFIX "user_manifest"
+#define RGW_ATTR_AMZ_WEBSITE_REDIRECT_LOCATION RGW_ATTR_PREFIX RGW_AMZ_WEBSITE_REDIRECT_LOCATION
+#define RGW_ATTR_SLO_MANIFEST RGW_ATTR_PREFIX "slo_manifest"
+/* Information whether an object is SLO or not must be exposed to
+ * user through custom HTTP header named X-Static-Large-Object. */
+#define RGW_ATTR_SLO_UINDICATOR RGW_ATTR_META_PREFIX "static-large-object"
+#define RGW_ATTR_X_ROBOTS_TAG RGW_ATTR_PREFIX "x-robots-tag"
+#define RGW_ATTR_STORAGE_CLASS RGW_ATTR_PREFIX "storage_class"
+
+/* S3 Object Lock*/
+#define RGW_ATTR_OBJECT_LOCK RGW_ATTR_PREFIX "object-lock"
+#define RGW_ATTR_OBJECT_RETENTION RGW_ATTR_PREFIX "object-retention"
+#define RGW_ATTR_OBJECT_LEGAL_HOLD RGW_ATTR_PREFIX "object-legal-hold"
+
+
+#define RGW_ATTR_PG_VER RGW_ATTR_PREFIX "pg_ver"
+#define RGW_ATTR_SOURCE_ZONE RGW_ATTR_PREFIX "source_zone"
+#define RGW_ATTR_TAGS RGW_ATTR_PREFIX RGW_AMZ_PREFIX "tagging"
+
+#define RGW_ATTR_TEMPURL_KEY1 RGW_ATTR_META_PREFIX "temp-url-key"
+#define RGW_ATTR_TEMPURL_KEY2 RGW_ATTR_META_PREFIX "temp-url-key-2"
+
+/* Account/container quota of the Swift API. */
+#define RGW_ATTR_QUOTA_NOBJS RGW_ATTR_META_PREFIX "quota-count"
+#define RGW_ATTR_QUOTA_MSIZE RGW_ATTR_META_PREFIX "quota-bytes"
+
+/* Static Web Site of Swift API. */
+#define RGW_ATTR_WEB_INDEX RGW_ATTR_META_PREFIX "web-index"
+#define RGW_ATTR_WEB_ERROR RGW_ATTR_META_PREFIX "web-error"
+#define RGW_ATTR_WEB_LISTINGS RGW_ATTR_META_PREFIX "web-listings"
+#define RGW_ATTR_WEB_LIST_CSS RGW_ATTR_META_PREFIX "web-listings-css"
+#define RGW_ATTR_SUBDIR_MARKER RGW_ATTR_META_PREFIX "web-directory-type"
+
+#define RGW_ATTR_OLH_PREFIX RGW_ATTR_PREFIX "olh."
+
+#define RGW_ATTR_OLH_INFO RGW_ATTR_OLH_PREFIX "info"
+#define RGW_ATTR_OLH_VER RGW_ATTR_OLH_PREFIX "ver"
+#define RGW_ATTR_OLH_ID_TAG RGW_ATTR_OLH_PREFIX "idtag"
+#define RGW_ATTR_OLH_PENDING_PREFIX RGW_ATTR_OLH_PREFIX "pending."
+
+#define RGW_ATTR_COMPRESSION RGW_ATTR_PREFIX "compression"
+
+#define RGW_ATTR_APPEND_PART_NUM RGW_ATTR_PREFIX "append_part_num"
+
+/* Attrs to store cloudtier config information. These are used internally
+ * for the replication of cloudtiered objects but not stored as xattrs in
+ * the head object. */
+#define RGW_ATTR_CLOUD_TIER_TYPE RGW_ATTR_PREFIX "cloud_tier_type"
+#define RGW_ATTR_CLOUD_TIER_CONFIG RGW_ATTR_PREFIX "cloud_tier_config"
+
+#define RGW_ATTR_OBJ_REPLICATION_STATUS RGW_ATTR_PREFIX "amz-replication-status"
+#define RGW_ATTR_OBJ_REPLICATION_TRACE RGW_ATTR_PREFIX "replication-trace"
+
+/* IAM Policy */
+#define RGW_ATTR_IAM_POLICY RGW_ATTR_PREFIX "iam-policy"
+#define RGW_ATTR_USER_POLICY RGW_ATTR_PREFIX "user-policy"
+#define RGW_ATTR_PUBLIC_ACCESS RGW_ATTR_PREFIX "public-access"
+
+/* RGW File Attributes */
+#define RGW_ATTR_UNIX_KEY1 RGW_ATTR_PREFIX "unix-key1"
+#define RGW_ATTR_UNIX1 RGW_ATTR_PREFIX "unix1"
+
+#define RGW_ATTR_CRYPT_PREFIX RGW_ATTR_PREFIX "crypt."
+#define RGW_ATTR_CRYPT_MODE RGW_ATTR_CRYPT_PREFIX "mode"
+#define RGW_ATTR_CRYPT_KEYMD5 RGW_ATTR_CRYPT_PREFIX "keymd5"
+#define RGW_ATTR_CRYPT_KEYID RGW_ATTR_CRYPT_PREFIX "keyid"
+#define RGW_ATTR_CRYPT_KEYSEL RGW_ATTR_CRYPT_PREFIX "keysel"
+#define RGW_ATTR_CRYPT_CONTEXT RGW_ATTR_CRYPT_PREFIX "context"
+#define RGW_ATTR_CRYPT_DATAKEY RGW_ATTR_CRYPT_PREFIX "datakey"
+#define RGW_ATTR_CRYPT_PARTS RGW_ATTR_CRYPT_PREFIX "part-lengths"
+
+/* SSE-S3 Encryption Attributes */
+#define RGW_ATTR_BUCKET_ENCRYPTION_PREFIX RGW_ATTR_PREFIX "sse-s3."
+#define RGW_ATTR_BUCKET_ENCRYPTION_POLICY RGW_ATTR_BUCKET_ENCRYPTION_PREFIX "policy"
+#define RGW_ATTR_BUCKET_ENCRYPTION_KEY_ID RGW_ATTR_BUCKET_ENCRYPTION_PREFIX "key-id"
+
+#define RGW_ATTR_TRACE RGW_ATTR_PREFIX "trace"
+
// Response/body formats RGW can render. BAD_FORMAT marks a format the
// client requested that we did not recognize.
enum class RGWFormat : int8_t {
  BAD_FORMAT = -1,
  PLAIN = 0,
  XML,
  JSON,
  HTML,
};

// Map an RGWFormat to the Content-Type string used when emitting a
// response in that format. Returns the sentinel "invalid format" (not a
// real MIME type) for BAD_FORMAT or any unknown value.
static inline const char* to_mime_type(const RGWFormat f)
{
  // Note: no `break` needed after `return` — the old ones were
  // unreachable dead code.
  switch (f) {
  case RGWFormat::XML:
    return "application/xml";
  case RGWFormat::JSON:
    return "application/json";
  case RGWFormat::HTML:
    return "text/html";
  case RGWFormat::PLAIN:
    return "text/plain";
  default:
    return "invalid format";
  }
}
+
+#define RGW_CAP_READ 0x1
+#define RGW_CAP_WRITE 0x2
+#define RGW_CAP_ALL (RGW_CAP_READ | RGW_CAP_WRITE)
+
+#define RGW_REST_SWIFT 0x1
+#define RGW_REST_SWIFT_AUTH 0x2
+#define RGW_REST_S3 0x4
+#define RGW_REST_WEBSITE 0x8
+#define RGW_REST_STS 0x10
+#define RGW_REST_IAM 0x20
+#define RGW_REST_SNS 0x30
+
+#define RGW_SUSPENDED_USER_AUID (uint64_t)-2
+
+#define RGW_OP_TYPE_READ 0x01
+#define RGW_OP_TYPE_WRITE 0x02
+#define RGW_OP_TYPE_DELETE 0x04
+
+#define RGW_OP_TYPE_MODIFY (RGW_OP_TYPE_WRITE | RGW_OP_TYPE_DELETE)
+#define RGW_OP_TYPE_ALL (RGW_OP_TYPE_READ | RGW_OP_TYPE_WRITE | RGW_OP_TYPE_DELETE)
+
+#define RGW_DEFAULT_MAX_BUCKETS 1000
+
+#define RGW_DEFER_TO_BUCKET_ACLS_RECURSE 1
+#define RGW_DEFER_TO_BUCKET_ACLS_FULL_CONTROL 2
+
+#define STATUS_CREATED 1900
+#define STATUS_ACCEPTED 1901
+#define STATUS_NO_CONTENT 1902
+#define STATUS_PARTIAL_CONTENT 1903
+#define STATUS_REDIRECT 1904
+#define STATUS_NO_APPLY 1905
+#define STATUS_APPLIED 1906
+
+#define ERR_INVALID_BUCKET_NAME 2000
+#define ERR_INVALID_OBJECT_NAME 2001
+#define ERR_NO_SUCH_BUCKET 2002
+#define ERR_METHOD_NOT_ALLOWED 2003
+#define ERR_INVALID_DIGEST 2004
+#define ERR_BAD_DIGEST 2005
+#define ERR_UNRESOLVABLE_EMAIL 2006
+#define ERR_INVALID_PART 2007
+#define ERR_INVALID_PART_ORDER 2008
+#define ERR_NO_SUCH_UPLOAD 2009
+#define ERR_REQUEST_TIMEOUT 2010
+#define ERR_LENGTH_REQUIRED 2011
+#define ERR_REQUEST_TIME_SKEWED 2012
+#define ERR_BUCKET_EXISTS 2013
+#define ERR_BAD_URL 2014
+#define ERR_PRECONDITION_FAILED 2015
+#define ERR_NOT_MODIFIED 2016
+#define ERR_INVALID_UTF8 2017
+#define ERR_UNPROCESSABLE_ENTITY 2018
+#define ERR_TOO_LARGE 2019
+#define ERR_TOO_MANY_BUCKETS 2020
+#define ERR_INVALID_REQUEST 2021
+#define ERR_TOO_SMALL 2022
+#define ERR_NOT_FOUND 2023
+#define ERR_PERMANENT_REDIRECT 2024
+#define ERR_LOCKED 2025
+#define ERR_QUOTA_EXCEEDED 2026
+#define ERR_SIGNATURE_NO_MATCH 2027
+#define ERR_INVALID_ACCESS_KEY 2028
+#define ERR_MALFORMED_XML 2029
+#define ERR_USER_EXIST 2030
+#define ERR_NOT_SLO_MANIFEST 2031
+#define ERR_EMAIL_EXIST 2032
+#define ERR_KEY_EXIST 2033
+#define ERR_INVALID_SECRET_KEY 2034
+#define ERR_INVALID_KEY_TYPE 2035
+#define ERR_INVALID_CAP 2036
+#define ERR_INVALID_TENANT_NAME 2037
+#define ERR_WEBSITE_REDIRECT 2038
+#define ERR_NO_SUCH_WEBSITE_CONFIGURATION 2039
+#define ERR_AMZ_CONTENT_SHA256_MISMATCH 2040
+#define ERR_NO_SUCH_LC 2041
+#define ERR_NO_SUCH_USER 2042
+#define ERR_NO_SUCH_SUBUSER 2043
+#define ERR_MFA_REQUIRED 2044
+#define ERR_NO_SUCH_CORS_CONFIGURATION 2045
+#define ERR_NO_SUCH_OBJECT_LOCK_CONFIGURATION 2046
+#define ERR_INVALID_RETENTION_PERIOD 2047
+#define ERR_NO_SUCH_BUCKET_ENCRYPTION_CONFIGURATION 2048
+#define ERR_USER_SUSPENDED 2100
+#define ERR_INTERNAL_ERROR 2200
+#define ERR_NOT_IMPLEMENTED 2201
+#define ERR_SERVICE_UNAVAILABLE 2202
+#define ERR_ROLE_EXISTS 2203
+#define ERR_MALFORMED_DOC 2204
+#define ERR_NO_ROLE_FOUND 2205
+#define ERR_DELETE_CONFLICT 2206
+#define ERR_NO_SUCH_BUCKET_POLICY 2207
+#define ERR_INVALID_LOCATION_CONSTRAINT 2208
+#define ERR_TAG_CONFLICT 2209
+#define ERR_INVALID_TAG 2210
+#define ERR_ZERO_IN_URL 2211
+#define ERR_MALFORMED_ACL_ERROR 2212
+#define ERR_ZONEGROUP_DEFAULT_PLACEMENT_MISCONFIGURATION 2213
+#define ERR_INVALID_ENCRYPTION_ALGORITHM 2214
+#define ERR_INVALID_CORS_RULES_ERROR 2215
+#define ERR_NO_CORS_FOUND 2216
+#define ERR_INVALID_WEBSITE_ROUTING_RULES_ERROR 2217
+#define ERR_RATE_LIMITED 2218
+#define ERR_POSITION_NOT_EQUAL_TO_LENGTH 2219
+#define ERR_OBJECT_NOT_APPENDABLE 2220
+#define ERR_INVALID_BUCKET_STATE 2221
+#define ERR_INVALID_OBJECT_STATE 2222
+
+#define ERR_BUSY_RESHARDING 2300
+#define ERR_NO_SUCH_ENTITY 2301
+#define ERR_LIMIT_EXCEEDED 2302
+
+// STS Errors
+#define ERR_PACKED_POLICY_TOO_LARGE 2400
+#define ERR_INVALID_IDENTITY_TOKEN 2401
+
+#define ERR_NO_SUCH_TAG_SET 2402
+
+#ifndef UINT32_MAX
+#define UINT32_MAX (0xffffffffu)
+#endif
+
+typedef void *RGWAccessHandle;
+
/* Helper class used for RGWHTTPArgs parsing: holds one raw
 * query-string token and splits it into a name and a value. */
class NameVal
{
  const std::string str; // raw token, as received
  std::string name;      // filled in by parse()
  std::string val;       // filled in by parse()
 public:
  explicit NameVal(const std::string& nv) : str(nv) {}

  // Parse `str` into name/val; defined in rgw_common.cc.
  int parse();

  std::string& get_name() { return name; }
  std::string& get_val() { return val; }
};
+
+/** Stores the XML arguments associated with the HTTP request in req_state*/
+class RGWHTTPArgs {
+ std::string str, empty_str;
+ std::map<std::string, std::string> val_map;
+ std::map<std::string, std::string> sys_val_map;
+ std::map<std::string, std::string> sub_resources;
+ bool has_resp_modifier = false;
+ bool admin_subresource_added = false;
+ public:
+ RGWHTTPArgs() = default;
+ explicit RGWHTTPArgs(const std::string& s, const DoutPrefixProvider *dpp) {
+ set(s);
+ parse(dpp);
+ }
+
+ /** Set the arguments; as received */
+ void set(const std::string& s) {
+ has_resp_modifier = false;
+ val_map.clear();
+ sub_resources.clear();
+ str = s;
+ }
+ /** parse the received arguments */
+ int parse(const DoutPrefixProvider *dpp);
+ void append(const std::string& name, const std::string& val);
+ void remove(const std::string& name);
+ /** Get the value for a specific argument parameter */
+ const std::string& get(const std::string& name, bool *exists = NULL) const;
+ boost::optional<const std::string&>
+ get_optional(const std::string& name) const;
+ int get_bool(const std::string& name, bool *val, bool *exists) const;
+ int get_bool(const char *name, bool *val, bool *exists) const;
+ void get_bool(const char *name, bool *val, bool def_val) const;
+ int get_int(const char *name, int *val, int def_val) const;
+
+ /** Get the value for specific system argument parameter */
+ std::string sys_get(const std::string& name, bool *exists = nullptr) const;
+
+ /** see if a parameter is contained in this RGWHTTPArgs */
+ bool exists(const char *name) const {
+ return (val_map.find(name) != std::end(val_map));
+ }
+ bool sub_resource_exists(const char *name) const {
+ return (sub_resources.find(name) != std::end(sub_resources));
+ }
+ bool exist_obj_excl_sub_resource() const {
+ const char* const obj_sub_resource[] = {"append", "torrent", "uploadId",
+ "partNumber", "versionId"};
+ for (unsigned i = 0; i != std::size(obj_sub_resource); i++) {
+ if (sub_resource_exists(obj_sub_resource[i])) return true;
+ }
+ return false;
+ }
+
+ std::map<std::string, std::string>& get_params() {
+ return val_map;
+ }
+ const std::map<std::string, std::string>& get_params() const {
+ return val_map;
+ }
+ std::map<std::string, std::string>& get_sys_params() {
+ return sys_val_map;
+ }
+ const std::map<std::string, std::string>& get_sys_params() const {
+ return sys_val_map;
+ }
+ const std::map<std::string, std::string>& get_sub_resources() const {
+ return sub_resources;
+ }
+ unsigned get_num_params() const {
+ return val_map.size();
+ }
+ bool has_response_modifier() const {
+ return has_resp_modifier;
+ }
+ void set_system() { /* make all system params visible */
+ std::map<std::string, std::string>::iterator iter;
+ for (iter = sys_val_map.begin(); iter != sys_val_map.end(); ++iter) {
+ val_map[iter->first] = iter->second;
+ }
+ }
+ const std::string& get_str() {
+ return str;
+ }
+}; // RGWHTTPArgs
+
+const char *rgw_conf_get(const std::map<std::string, std::string, ltstr_nocase>& conf_map, const char *name, const char *def_val);
+boost::optional<const std::string&> rgw_conf_get_optional(const std::map<std::string, std::string, ltstr_nocase>& conf_map, const std::string& name);
+int rgw_conf_get_int(const std::map<std::string, std::string, ltstr_nocase>& conf_map, const char *name, int def_val);
+bool rgw_conf_get_bool(const std::map<std::string, std::string, ltstr_nocase>& conf_map, const char *name, bool def_val);
+
+class RGWEnv;
+
+class RGWConf {
+ friend class RGWEnv;
+ int enable_ops_log;
+ int enable_usage_log;
+ uint8_t defer_to_bucket_acls;
+ void init(CephContext *cct);
+public:
+ RGWConf()
+ : enable_ops_log(1),
+ enable_usage_log(1),
+ defer_to_bucket_acls(0) {
+ }
+};
+
// The request environment (CGI-style variables) with case-insensitive
// keys (ltstr_nocase), plus the cached RGWConf settings.
class RGWEnv {
  std::map<std::string, std::string, ltstr_nocase> env_map;
  RGWConf conf;
public:
  void init(CephContext *cct);
  void init(CephContext *cct, char **envp);
  void set(std::string name, std::string val);
  // Look up a variable; returns def_val when absent.
  const char *get(const char *name, const char *def_val = nullptr) const;
  boost::optional<const std::string&>
  get_optional(const std::string& name) const;
  int get_int(const char *name, int def_val = 0) const;
  bool get_bool(const char *name, bool def_val = 0);
  size_t get_size(const char *name, size_t def_val = 0) const;
  bool exists(const char *name) const;
  bool exists_prefix(const char *prefix) const;
  void remove(const char *name);
  const std::map<std::string, std::string, ltstr_nocase>& get_map() const { return env_map; }
  int get_enable_ops_log() const {
    return conf.enable_ops_log;
  }

  int get_enable_usage_log() const {
    return conf.enable_usage_log;
  }

  int get_defer_to_bucket_acls() const {
    return conf.defer_to_bucket_acls;
  }
};
+
+// return true if the connection is secure. this either means that the
+// connection arrived via ssl, or was forwarded as https by a trusted proxy
+bool rgw_transport_is_secure(CephContext *cct, const RGWEnv& env);
+
// HTTP method of the current request (OP_COPY is RGW's internal
// representation of a copy request; OP_UNKNOWN for anything else).
enum http_op {
  OP_GET,
  OP_PUT,
  OP_DELETE,
  OP_HEAD,
  OP_POST,
  OP_COPY,
  OP_OPTIONS,
  OP_UNKNOWN,
};
+
+class RGWAccessControlPolicy;
+class JSONObj;
+
+void encode_json(const char *name, const obj_version& v, Formatter *f);
+void encode_json(const char *name, const RGWUserCaps& val, Formatter *f);
+
+void decode_json_obj(obj_version& v, JSONObj *obj);
+
// Kind of identity a request was authenticated as. The numeric values
// are persisted (see RGWUserInfo::type), so do not renumber.
enum RGWIdentityType
{
  TYPE_NONE=0,
  TYPE_RGW=1,
  TYPE_KEYSTONE=2,
  TYPE_LDAP=3,
  TYPE_ROLE=4,
  TYPE_WEB=5,
};
+
+void encode_json(const char *name, const rgw_placement_rule& val, ceph::Formatter *f);
+void decode_json_obj(rgw_placement_rule& v, JSONObj *obj);
+
+inline std::ostream& operator<<(std::ostream& out, const rgw_placement_rule& rule) {
+ return out << rule.to_str();
+}
+
+class RateLimiter;
// Rate-limit configuration (per user or per bucket); enforcement lives
// elsewhere (see RateLimiter). `enabled` gates the whole limit.
struct RGWRateLimitInfo {
  int64_t max_write_ops;
  int64_t max_read_ops;
  int64_t max_write_bytes;
  int64_t max_read_bytes;
  bool enabled = false;
  RGWRateLimitInfo()
    : max_write_ops(0), max_read_ops(0), max_write_bytes(0), max_read_bytes(0) {}

  // NOTE: the field order below is the stored/wire format (v1) — do not
  // reorder without bumping the encoding version.
  void encode(bufferlist& bl) const {
    ENCODE_START(1, 1, bl);
    encode(max_write_ops, bl);
    encode(max_read_ops, bl);
    encode(max_write_bytes, bl);
    encode(max_read_bytes, bl);
    encode(enabled, bl);
    ENCODE_FINISH(bl);
  }
  void decode(bufferlist::const_iterator& bl) {
    DECODE_START(1, bl);
    decode(max_write_ops,bl);
    decode(max_read_ops, bl);
    decode(max_write_bytes,bl);
    decode(max_read_bytes, bl);
    decode(enabled, bl);
    DECODE_FINISH(bl);
  }

  void dump(Formatter *f) const;

  void decode_json(JSONObj *obj);

};
+WRITE_CLASS_ENCODER(RGWRateLimitInfo)
+
// A RADOS Gateway user account: identity, credentials, quota and
// capability settings. Uses a versioned encoding (currently v22,
// compat v9) that still decodes records written by very old releases;
// see decode() for the per-version handling.
struct RGWUserInfo
{
  rgw_user user_id;
  std::string display_name;
  std::string user_email;
  std::map<std::string, RGWAccessKey> access_keys; // S3 keys, keyed by access key id
  std::map<std::string, RGWAccessKey> swift_keys;
  std::map<std::string, RGWSubUser> subusers;
  __u8 suspended;
  int32_t max_buckets;
  uint32_t op_mask;     // allowed op types (RGW_OP_TYPE_* bits)
  RGWUserCaps caps;
  __u8 admin;
  __u8 system;
  rgw_placement_rule default_placement;
  std::list<std::string> placement_tags;
  std::map<int, std::string> temp_url_keys;
  RGWQuota quota;
  uint32_t type;        // RGWIdentityType value
  std::set<std::string> mfa_ids;

  RGWUserInfo()
    : suspended(0),
      max_buckets(RGW_DEFAULT_MAX_BUCKETS),
      op_mask(RGW_OP_TYPE_ALL),
      admin(0),
      system(0),
      type(TYPE_NONE) {
  }

  // Return the stored S3 key for the given access key id, or nullptr if
  // the user has no such key.
  RGWAccessKey* get_key(const std::string& access_key) {
    if (access_keys.empty())
      return nullptr;

    auto k = access_keys.find(access_key);
    if (k == access_keys.end())
      return nullptr;
    else
      return &(k->second);
  }

  // NOTE: the order and the legacy duplicate fields below (first
  // access/secret key, first swift key, time_t ctime) are part of the
  // stored format — do not reorder or remove them.
  void encode(bufferlist& bl) const {
    ENCODE_START(22, 9, bl);
    encode((uint64_t)0, bl); // old auid
    std::string access_key;
    std::string secret_key;
    if (!access_keys.empty()) {
      std::map<std::string, RGWAccessKey>::const_iterator iter = access_keys.begin();
      const RGWAccessKey& k = iter->second;
      access_key = k.id;
      secret_key = k.key;
    }
    encode(access_key, bl);
    encode(secret_key, bl);
    encode(display_name, bl);
    encode(user_email, bl);
    std::string swift_name;
    std::string swift_key;
    if (!swift_keys.empty()) {
      std::map<std::string, RGWAccessKey>::const_iterator iter = swift_keys.begin();
      const RGWAccessKey& k = iter->second;
      swift_name = k.id;
      swift_key = k.key;
    }
    encode(swift_name, bl);
    encode(swift_key, bl);
    encode(user_id.id, bl);
    encode(access_keys, bl);
    encode(subusers, bl);
    encode(suspended, bl);
    encode(swift_keys, bl);
    encode(max_buckets, bl);
    encode(caps, bl);
    encode(op_mask, bl);
    encode(system, bl);
    encode(default_placement, bl);
    encode(placement_tags, bl);
    encode(quota.bucket_quota, bl);
    encode(temp_url_keys, bl);
    encode(quota.user_quota, bl);
    encode(user_id.tenant, bl);
    encode(admin, bl);
    encode(type, bl);
    encode(mfa_ids, bl);
    {
      std::string assumed_role_arn; // removed
      encode(assumed_role_arn, bl);
    }
    encode(user_id.ns, bl);
    ENCODE_FINISH(bl);
  }
  // Decode any version from v2 upward, defaulting fields that the
  // on-disk version predates.
  void decode(bufferlist::const_iterator& bl) {
    DECODE_START_LEGACY_COMPAT_LEN_32(22, 9, 9, bl);
    if (struct_v >= 2) {
      uint64_t old_auid;
      decode(old_auid, bl);
    }
    std::string access_key;
    std::string secret_key;
    decode(access_key, bl);
    decode(secret_key, bl);
    if (struct_v < 6) {
      // pre-v6 blobs stored a single key pair rather than the map
      RGWAccessKey k;
      k.id = access_key;
      k.key = secret_key;
      access_keys[access_key] = k;
    }
    decode(display_name, bl);
    decode(user_email, bl);
    /* We populate swift_keys map later nowadays, but we have to decode. */
    std::string swift_name;
    std::string swift_key;
    if (struct_v >= 3) decode(swift_name, bl);
    if (struct_v >= 4) decode(swift_key, bl);
    if (struct_v >= 5)
      decode(user_id.id, bl);
    else
      user_id.id = access_key;
    if (struct_v >= 6) {
      decode(access_keys, bl);
      decode(subusers, bl);
    }
    suspended = 0;
    if (struct_v >= 7) {
      decode(suspended, bl);
    }
    if (struct_v >= 8) {
      decode(swift_keys, bl);
    }
    if (struct_v >= 10) {
      decode(max_buckets, bl);
    } else {
      max_buckets = RGW_DEFAULT_MAX_BUCKETS;
    }
    if (struct_v >= 11) {
      decode(caps, bl);
    }
    if (struct_v >= 12) {
      decode(op_mask, bl);
    } else {
      op_mask = RGW_OP_TYPE_ALL;
    }
    if (struct_v >= 13) {
      decode(system, bl);
      decode(default_placement, bl);
      decode(placement_tags, bl); /* tags of allowed placement rules */
    }
    if (struct_v >= 14) {
      decode(quota.bucket_quota, bl);
    }
    if (struct_v >= 15) {
      decode(temp_url_keys, bl);
    }
    if (struct_v >= 16) {
      decode(quota.user_quota, bl);
    }
    if (struct_v >= 17) {
      decode(user_id.tenant, bl);
    } else {
      user_id.tenant.clear();
    }
    if (struct_v >= 18) {
      decode(admin, bl);
    }
    if (struct_v >= 19) {
      decode(type, bl);
    }
    if (struct_v >= 20) {
      decode(mfa_ids, bl);
    }
    if (struct_v >= 21) {
      std::string assumed_role_arn; // removed
      decode(assumed_role_arn, bl);
    }
    if (struct_v >= 22) {
      decode(user_id.ns, bl);
    } else {
      user_id.ns.clear();
    }
    DECODE_FINISH(bl);
  }
  void dump(Formatter *f) const;
  static void generate_test_instances(std::list<RGWUserInfo*>& o);

  void decode_json(JSONObj *obj);
};
+WRITE_CLASS_ENCODER(RGWUserInfo)
+
+/// `RGWObjVersionTracker`
+/// ======================
+///
+/// What and why is this?
+/// ---------------------
+///
+/// This is a wrapper around `cls_version` functionality. If two RGWs
+/// (or two non-synchronized threads in the same RGW) are accessing
+/// the same object, they may race and overwrite each other's work.
+///
+/// This class solves this issue by tracking and recording an object's
+/// version in the extended attributes. Operations are failed with
+/// ECANCELED if the version is not what we expect.
+///
+/// How to Use It
+/// -------------
+///
+/// When preparing a read operation, call `prepare_op_for_read`.
+/// For a write, call `prepare_op_for_write` when preparing the
+/// operation, and `apply_write` after it succeeds.
+///
+/// Adhere to the following guidelines:
+///
+/// - Each RGWObjVersionTracker should be used with only one object.
+///
+/// - If you receive `ECANCELED`, throw away whatever you were doing
+/// based on the content of the versioned object, re-read, and
+/// restart as appropriate.
+///
+/// - If one code path uses RGWObjVersionTracker, then they all
+/// should. In a situation where a writer should unconditionally
+/// overwrite an object, call `generate_new_write_ver` on a default
+/// constructed `RGWObjVersionTracker`.
+///
+/// - If we have a version from a previous read, we will check against
+/// it and fail the read if it doesn't match. Thus, if we want to
+/// re-read a new version of the object, call `clear()` on the
+/// `RGWObjVersionTracker`.
+///
+/// - This type is not thread-safe. Every thread must have its own
+/// instance.
+///
+struct RGWObjVersionTracker {
+ obj_version read_version; //< The version read from an object. If
+ // set, this value is used to check the
+ // stored version.
+ obj_version write_version; //< Set the object to this version on
+ // write, if set.
+
+ /// Pointer to the read version.
+ obj_version* version_for_read() {
+ return &read_version;
+ }
+
+ /// If we have a write version, return a pointer to it. Otherwise
+ /// return null. This is used in `prepare_op_for_write` to treat the
+ /// `write_version` as effectively an `option` type.
+ obj_version* version_for_write() {
+ if (write_version.ver == 0)
+ return nullptr;
+
+ return &write_version;
+ }
+
+ /// If read_version is non-empty, return a pointer to it, otherwise
+ /// null. This is used internally by `prepare_op_for_read` and
+ /// `prepare_op_for_write` to treat the `read_version` as
+ /// effectively an `option` type.
+ obj_version* version_for_check() {
+ if (read_version.ver == 0)
+ return nullptr;
+
+ return &read_version;
+ }
+
+ /// This function is to be called on any read operation. If we have
+ /// a non-empty `read_version`, assert on the OSD that the object
+ /// has the same version. Also reads the version into `read_version`.
+ ///
+ /// This function is defined in `rgw_rados.cc` rather than `rgw_common.cc`.
+ void prepare_op_for_read(librados::ObjectReadOperation* op);
+
+ /// This function is to be called on any write operation. If we have
+ /// a non-empty read operation, assert on the OSD that the object
+ /// has the same version. If we have a non-empty `write_version`,
+ /// set the object to it. Otherwise increment the version on the OSD.
+ ///
+ /// This function is defined in `rgw_rados.cc` rather than
+ /// `rgw_common.cc`.
+ void prepare_op_for_write(librados::ObjectWriteOperation* op);
+
+ /// This function is to be called after the completion of any write
+ /// operation on which `prepare_op_for_write` was called. If we did
+ /// not set the write version explicitly, it increments
+ /// `read_version`. If we did, it sets `read_version` to
+ /// `write_version`. In either case, it clears `write_version`.
+ ///
+ /// RADOS write operations, at least those not using the relatively
+ /// new RETURNVEC flag, cannot return more information than an error
+ /// code. Thus, write operations can't simply fill in the read
+ /// version the way read operations can, so prepare_op_for_write`
+ /// instructs the OSD to increment the object as stored in RADOS and
+ /// `apply_write` increments our `read_version` in RAM.
+ ///
+ /// This function is defined in `rgw_rados.cc` rather than
+ /// `rgw_common.cc`.
+ void apply_write();
+
+ /// Clear `read_version` and `write_version`, making the instance
+ /// identical to a default-constructed instance.
+ void clear() {
+ read_version = obj_version();
+ write_version = obj_version();
+ }
+
+ /// Set `write_version` to a new, unique version.
+ ///
+ /// An `obj_version` contains an opaque, random tag and a
+ /// sequence. If the tags of two `obj_version`s don't match, the
+ /// versions are unordered and unequal. This function creates a
+ /// version with a new tag, ensuring that any other process
+ /// operating on the object will receive `ECANCELED` and will know
+ /// to re-read the object and restart whatever it was doing.
+ void generate_new_write_ver(CephContext* cct);
+};
+
+inline std::ostream& operator<<(std::ostream& out, const obj_version &v)
+{
+ out << v.tag << ":" << v.ver;
+ return out;
+}
+
+inline std::ostream& operator<<(std::ostream& out, const RGWObjVersionTracker &ot)
+{
+ out << "{r=" << ot.read_version << ",w=" << ot.write_version << "}";
+ return out;
+}
+
// Bit flags kept in RGWBucketInfo::flags (persisted — do not renumber).
enum RGWBucketFlags {
  BUCKET_SUSPENDED = 0x1,
  BUCKET_VERSIONED = 0x2,
  BUCKET_VERSIONS_SUSPENDED = 0x4,
  BUCKET_DATASYNC_DISABLED = 0X8,
  BUCKET_MFA_ENABLED = 0X10,
  BUCKET_OBJ_LOCK_ENABLED = 0X20,
};
+
+class RGWSI_Zone;
+
// Full metadata for a bucket instance: identity, owner, placement,
// index layout, quota, versioning/lock/website settings and resharding
// state. encode/decode/dump are defined in rgw_common.cc.
struct RGWBucketInfo {
  rgw_bucket bucket;
  rgw_user owner;
  uint32_t flags{0};            // RGWBucketFlags bits
  std::string zonegroup;
  ceph::real_time creation_time;
  rgw_placement_rule placement_rule;
  bool has_instance_obj{false};
  RGWObjVersionTracker objv_tracker; /* we don't need to serialize this, for runtime tracking */
  RGWQuotaInfo quota;

  // layout of bucket index objects
  rgw::BucketLayout layout;

  // Represents the shard number for blind bucket.
  const static uint32_t NUM_SHARDS_BLIND_BUCKET;

  bool requester_pays{false};

  bool has_website{false};
  RGWBucketWebsiteConf website_conf;

  bool swift_versioning{false};
  std::string swift_ver_location;

  std::map<std::string, uint32_t> mdsearch_config;

  // resharding
  cls_rgw_reshard_status reshard_status{cls_rgw_reshard_status::NOT_RESHARDING};
  std::string new_bucket_instance_id;

  RGWObjectLock obj_lock;

  std::optional<rgw_sync_policy_info> sync_policy;

  void encode(bufferlist& bl) const;
  void decode(bufferlist::const_iterator& bl);

  void dump(Formatter *f) const;
  static void generate_test_instances(std::list<RGWBucketInfo*>& o);

  void decode_json(JSONObj *obj);

  // Flag accessors. Note versioning_enabled() is true only when
  // versioning is on and NOT suspended.
  bool versioned() const { return (flags & BUCKET_VERSIONED) != 0; }
  int versioning_status() const { return flags & (BUCKET_VERSIONED | BUCKET_VERSIONS_SUSPENDED | BUCKET_MFA_ENABLED); }
  bool versioning_enabled() const { return (versioning_status() & (BUCKET_VERSIONED | BUCKET_VERSIONS_SUSPENDED)) == BUCKET_VERSIONED; }
  bool mfa_enabled() const { return (versioning_status() & BUCKET_MFA_ENABLED) != 0; }
  bool datasync_flag_enabled() const { return (flags & BUCKET_DATASYNC_DISABLED) == 0; }
  bool obj_lock_enabled() const { return (flags & BUCKET_OBJ_LOCK_ENABLED) != 0; }

  bool has_swift_versioning() const {
    /* A bucket may be versioned through one mechanism only. */
    return swift_versioning && !versioned();
  }

  void set_sync_policy(rgw_sync_policy_info&& policy);

  bool empty_sync_policy() const;

  bool is_indexless() const {
    return rgw::is_layout_indexless(layout.current_index);
  }
  const rgw::bucket_index_layout_generation& get_current_index() const {
    return layout.current_index;
  }
  rgw::bucket_index_layout_generation& get_current_index() {
    return layout.current_index;
  }

  RGWBucketInfo();
  ~RGWBucketInfo();
};
+WRITE_CLASS_ENCODER(RGWBucketInfo)
+
// Lightweight bucket entry point (name -> instance indirection).
// Pre-v8 blobs embedded the whole RGWBucketInfo; decode() detects that
// case and reparses the original buffer into old_bucket_info.
struct RGWBucketEntryPoint
{
  rgw_bucket bucket;
  rgw_user owner;
  ceph::real_time creation_time;
  bool linked;

  bool has_bucket_info;          // true only for pre-v8 decoded blobs
  RGWBucketInfo old_bucket_info; // valid when has_bucket_info is set

  RGWBucketEntryPoint() : linked(false), has_bucket_info(false) {}

  void encode(bufferlist& bl) const {
    ENCODE_START(10, 8, bl);
    encode(bucket, bl);
    encode(owner.id, bl);
    encode(linked, bl);
    // legacy time_t creation time, kept for compat with pre-v10 decoders
    uint64_t ctime = (uint64_t)real_clock::to_time_t(creation_time);
    encode(ctime, bl);
    encode(owner, bl);
    encode(creation_time, bl);
    ENCODE_FINISH(bl);
  }
  void decode(bufferlist::const_iterator& bl) {
    auto orig_iter = bl;
    DECODE_START_LEGACY_COMPAT_LEN_32(10, 4, 4, bl);
    if (struct_v < 8) {
      /* ouch, old entry, contains the bucket info itself */
      old_bucket_info.decode(orig_iter);
      has_bucket_info = true;
      return;
    }
    has_bucket_info = false;
    decode(bucket, bl);
    decode(owner.id, bl);
    decode(linked, bl);
    uint64_t ctime;
    decode(ctime, bl);
    if (struct_v < 10) {
      creation_time = real_clock::from_time_t((time_t)ctime);
    }
    if (struct_v >= 9) {
      decode(owner, bl);
    }
    if (struct_v >= 10) {
      decode(creation_time, bl);
    }
    DECODE_FINISH(bl);
  }

  void dump(Formatter *f) const;
  void decode_json(JSONObj *obj);
  static void generate_test_instances(std::list<RGWBucketEntryPoint*>& o);
};
+WRITE_CLASS_ENCODER(RGWBucketEntryPoint)
+
/* Aggregated usage counters for one storage category. */
struct RGWStorageStats
{
  RGWObjCategory category;
  uint64_t size;             // bytes stored
  uint64_t size_rounded;     // bytes after rounding up to allocation units
  uint64_t num_objects;
  uint64_t size_utilized{0}; //< size after compression, encryption
  bool dump_utilized;        // whether dump should include utilized values

  RGWStorageStats(bool _dump_utilized=true)
    : category(RGWObjCategory::None),
      size(0),
      size_rounded(0),
      num_objects(0),
      dump_utilized(_dump_utilized)
  {}

  void dump(Formatter *f) const;
}; // RGWStorageStats
+
+class RGWEnv;
+
+/* Namespaced forward declarations. */
+namespace rgw {
+ namespace auth {
+ namespace s3 {
+ class AWSBrowserUploadAbstractor;
+ class STSEngine;
+ }
+ class Completer;
+ }
+ namespace io {
+ class BasicClient;
+ }
+}
+
+using meta_map_t = boost::container::flat_map <std::string, std::string>;
+
/* Parsed per-request HTTP metadata: URIs, host, method and the header-derived
 * attribute maps. Filled from the RGWEnv of the incoming request. */
struct req_info {
  const RGWEnv *env;              // CGI-style environment of the request
  RGWHTTPArgs args;               // parsed query-string arguments
  meta_map_t x_meta_map;          // x-amz-meta-* headers (lowercase keys, per rgw_add_amz_meta_header)
  meta_map_t crypt_attribute_map; // encryption-related attributes

  std::string host;
  const char *method;             // HTTP verb
  std::string script_uri;
  std::string request_uri;
  std::string request_uri_aws4;   // URI variant used for AWSv4 signing
  std::string effective_uri;
  std::string request_params;
  std::string domain;
  std::string storage_class;

  req_info(CephContext *cct, const RGWEnv *env);
  void rebuild_from(req_info& src);
  void init_meta_info(const DoutPrefixProvider *dpp, bool *found_bad_meta);
};
+
+struct req_init_state {
+ /* Keeps [[tenant]:]bucket until we parse the token. */
+ std::string url_bucket;
+ std::string src_bucket;
+};
+
+#include "rgw_auth.h"
+
+class RGWObjectCtx;
+
+/** Store all the state necessary to complete and respond to an HTTP request*/
+struct req_state : DoutPrefixProvider {
+ CephContext *cct;
+ const RGWProcessEnv& penv;
+ rgw::io::BasicClient *cio{nullptr};
+ http_op op{OP_UNKNOWN};
+ RGWOpType op_type{};
+ std::shared_ptr<RateLimiter> ratelimit_data;
+ RGWRateLimitInfo user_ratelimit;
+ RGWRateLimitInfo bucket_ratelimit;
+ std::string ratelimit_bucket_marker;
+ std::string ratelimit_user_name;
+ bool content_started{false};
+ RGWFormat format{RGWFormat::PLAIN};
+ ceph::Formatter *formatter{nullptr};
+ std::string decoded_uri;
+ std::string relative_uri;
+ const char *length{nullptr};
+ int64_t content_length{0};
+ std::map<std::string, std::string> generic_attrs;
+ rgw_err err;
+ bool expect_cont{false};
+ uint64_t obj_size{0};
+ bool enable_ops_log;
+ bool enable_usage_log;
+ uint8_t defer_to_bucket_acls;
+ uint32_t perm_mask{0};
+
+ /* Set once when url_bucket is parsed and not violated thereafter. */
+ std::string account_name;
+
+ std::string bucket_tenant;
+ std::string bucket_name;
+
+ /* bucket is only created in rgw_build_bucket_policies() and should never be
+ * overwritten */
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+ std::unique_ptr<rgw::sal::Object> object;
+ std::string src_tenant_name;
+ std::string src_bucket_name;
+ std::unique_ptr<rgw::sal::Object> src_object;
+ ACLOwner bucket_owner;
+ ACLOwner owner;
+
+ std::string zonegroup_name;
+ std::string zonegroup_endpoint;
+ std::string bucket_instance_id;
+ int bucket_instance_shard_id{-1};
+ std::string redirect_zone_endpoint;
+
+ std::string redirect;
+
+ real_time bucket_mtime;
+ std::map<std::string, ceph::bufferlist> bucket_attrs;
+ bool bucket_exists{false};
+ rgw_placement_rule dest_placement;
+
+ bool has_bad_meta{false};
+
+ std::unique_ptr<rgw::sal::User> user;
+
+ struct {
+ /* TODO(rzarzynski): switch out to the static_ptr for both members. */
+
+ /* Object having the knowledge about an authenticated identity and allowing
+ * to apply it during the authorization phase (verify_permission() methods
+ * of a given RGWOp). Thus, it bounds authentication and authorization steps
+ * through a well-defined interface. For more details, see rgw_auth.h. */
+ std::unique_ptr<rgw::auth::Identity> identity;
+
+ std::shared_ptr<rgw::auth::Completer> completer;
+
+ /* A container for credentials of the S3's browser upload. It's necessary
+ * because: 1) the ::authenticate() method of auth engines and strategies
+ * take req_state only; 2) auth strategies live much longer than RGWOps -
+ * there is no way to pass additional data dependencies through ctors. */
+ class {
+ /* Writer. */
+ friend class RGWPostObj_ObjStore_S3;
+ /* Reader. */
+ friend class rgw::auth::s3::AWSBrowserUploadAbstractor;
+ friend class rgw::auth::s3::STSEngine;
+
+ std::string access_key;
+ std::string signature;
+ std::string x_amz_algorithm;
+ std::string x_amz_credential;
+ std::string x_amz_date;
+ std::string x_amz_security_token;
+ ceph::bufferlist encoded_policy;
+ } s3_postobj_creds;
+ } auth;
+
+ std::unique_ptr<RGWAccessControlPolicy> user_acl;
+ std::unique_ptr<RGWAccessControlPolicy> bucket_acl;
+ std::unique_ptr<RGWAccessControlPolicy> object_acl;
+
+ rgw::IAM::Environment env;
+ boost::optional<rgw::IAM::Policy> iam_policy;
+ boost::optional<PublicAccessBlockConfiguration> bucket_access_conf;
+ std::vector<rgw::IAM::Policy> iam_user_policies;
+
+ /* Is the request made by an user marked as a system one?
+ * Being system user means we also have the admin status. */
+ bool system_request{false};
+
+ std::string canned_acl;
+ bool has_acl_header{false};
+ bool local_source{false}; /* source is local */
+
+ int prot_flags{0};
+
+ /* Content-Disposition override for TempURL of Swift API. */
+ struct {
+ std::string override;
+ std::string fallback;
+ } content_disp;
+
+ std::string host_id;
+
+ req_info info;
+ req_init_state init_state;
+
+ using Clock = ceph::coarse_real_clock;
+ Clock::time_point time;
+
+ Clock::duration time_elapsed() const { return Clock::now() - time; }
+
+ std::string dialect;
+ std::string req_id;
+ std::string trans_id;
+ uint64_t id;
+
+ RGWObjTags tagset;
+
+ bool mfa_verified{false};
+
+ /// optional coroutine context
+ optional_yield yield{null_yield};
+
+ //token claims from STS token for ops log (can be used for Keystone token also)
+ std::vector<std::string> token_claims;
+
+ std::vector<rgw::IAM::Policy> session_policies;
+
+ jspan trace;
+ bool trace_enabled = false;
+
+ //Principal tags that come in as part of AssumeRoleWithWebIdentity
+ std::vector<std::pair<std::string, std::string>> principal_tags;
+
+ req_state(CephContext* _cct, const RGWProcessEnv& penv, RGWEnv* e, uint64_t id);
+ ~req_state();
+
+
+ void set_user(std::unique_ptr<rgw::sal::User>& u) { user.swap(u); }
+ bool is_err() const { return err.is_err(); }
+
+ // implements DoutPrefixProvider
+ std::ostream& gen_prefix(std::ostream& out) const override;
+ CephContext* get_cct() const override { return cct; }
+ unsigned get_subsys() const override { return ceph_subsys_rgw; }
+};
+
+void set_req_state_err(req_state*, int);
+void set_req_state_err(req_state*, int, const std::string&);
+void set_req_state_err(struct rgw_err&, int, const int);
+void dump(req_state*);
+
/** Store basic data on bucket */
struct RGWBucketEnt {
  rgw_bucket bucket;
  size_t size;           // bytes stored in the bucket
  size_t size_rounded;   // bytes after rounding up to allocation units
  ceph::real_time creation_time;
  uint64_t count;        // number of objects

  /* The placement_rule is necessary to calculate per-storage-policy statics
   * of the Swift API. Although the info available in RGWBucketInfo, we need
   * to duplicate it here to not affect the performance of buckets listing. */
  rgw_placement_rule placement_rule;

  RGWBucketEnt()
    : size(0),
      size_rounded(0),
      count(0) {
  }
  RGWBucketEnt(const RGWBucketEnt&) = default;
  RGWBucketEnt(RGWBucketEnt&&) = default;
  explicit RGWBucketEnt(const rgw_user& u, cls_user_bucket_entry&& e)
    : bucket(u, std::move(e.bucket)),
      size(e.size),
      size_rounded(e.size_rounded),
      creation_time(e.creation_time),
      count(e.count) {
  }

  RGWBucketEnt& operator=(const RGWBucketEnt&) = default;

  // Export this entry into its cls_user representation.
  void convert(cls_user_bucket_entry *b) const {
    bucket.convert(&b->bucket);
    b->size = size;
    b->size_rounded = size_rounded;
    b->creation_time = creation_time;
    b->count = count;
  }

  void encode(bufferlist& bl) const {
    ENCODE_START(7, 5, bl);
    uint64_t s = size;
    // second-resolution time, kept for pre-v6 decoders
    __u32 mt = ceph::real_clock::to_time_t(creation_time);
    std::string empty_str; // originally had the bucket name here, but we encode bucket later
    encode(empty_str, bl);
    encode(s, bl);
    encode(mt, bl);
    encode(count, bl);
    encode(bucket, bl);
    s = size_rounded;
    encode(s, bl);
    encode(creation_time, bl); // v6: full-precision creation time
    encode(placement_rule, bl); // v7
    ENCODE_FINISH(bl);
  }
  void decode(bufferlist::const_iterator& bl) {
    DECODE_START_LEGACY_COMPAT_LEN(7, 5, 5, bl);
    __u32 mt;
    uint64_t s;
    std::string empty_str; // backward compatibility
    decode(empty_str, bl);
    decode(s, bl);
    decode(mt, bl);
    size = s;
    if (struct_v < 6) {
      // v6+ re-decodes creation_time below at full precision
      creation_time = ceph::real_clock::from_time_t(mt);
    }
    if (struct_v >= 2)
      decode(count, bl);
    if (struct_v >= 3)
      decode(bucket, bl);
    if (struct_v >= 4)
      decode(s, bl);
    size_rounded = s;
    if (struct_v >= 6)
      decode(creation_time, bl);
    if (struct_v >= 7)
      decode(placement_rule, bl);
    DECODE_FINISH(bl);
  }
  void dump(Formatter *f) const;
  static void generate_test_instances(std::list<RGWBucketEnt*>& o);
};
WRITE_CLASS_ENCODER(RGWBucketEnt)
+
/* Identifies one cached metadata entry: its locator within the cache plus a
 * generation counter used to detect stale entries. */
struct rgw_cache_entry_info {
  std::string cache_locator;  // where the entry lives in the cache
  uint64_t gen{0};            // generation of the cached entry

  rgw_cache_entry_info() {}
};
+
// Render an object reference as "<bucket-name>:<oid>", mainly for logging.
inline std::ostream& operator<<(std::ostream& out, const rgw_obj &o) {
  return out << o.bucket.name << ":" << o.get_oid();
}
+
/* Persistent metadata of an in-progress multipart upload; currently just the
 * destination placement chosen when the upload was initiated. */
struct multipart_upload_info
{
  rgw_placement_rule dest_placement;

  void encode(bufferlist& bl) const {
    ENCODE_START(1, 1, bl);
    encode(dest_placement, bl);
    ENCODE_FINISH(bl);
  }

  void decode(bufferlist::const_iterator& bl) {
    DECODE_START(1, bl);
    decode(dest_placement, bl);
    DECODE_FINISH(bl);
  }
};
WRITE_CLASS_ENCODER(multipart_upload_info)
+
/* Render `len` bytes of `buf` as lowercase hex into `str`.
 * `str` must provide room for 2*len + 1 chars (including the NUL). */
static inline void buf_to_hex(const unsigned char* const buf,
                              const size_t len,
                              char* const str)
{
  str[0] = '\0';
  for (size_t i = 0; i < len; i++) {
    // snprintf bounds the write (two hex digits + NUL), unlike sprintf
    ::snprintf(&str[i*2], 3, "%02x", static_cast<int>(buf[i]));
  }
}
+
/* Hex-encode a fixed-size byte array; returns a NUL-terminated array of
 * 2*N lowercase hex characters. */
template<size_t N> static inline std::array<char, N * 2 + 1>
buf_to_hex(const std::array<unsigned char, N>& buf)
{
  static_assert(N > 0, "The input array must be at least one element long");

  constexpr char digits[] = "0123456789abcdef";
  std::array<char, N * 2 + 1> hex_dest;
  for (size_t i = 0; i < N; ++i) {
    hex_dest[2 * i]     = digits[buf[i] >> 4];
    hex_dest[2 * i + 1] = digits[buf[i] & 0x0f];
  }
  hex_dest[N * 2] = '\0';
  return hex_dest;
}
+
/* Map an ASCII hex digit to its value in [0, 15]; -EINVAL otherwise. */
static inline int hexdigit(char c)
{
  if (c >= '0' && c <= '9')
    return c - '0';
  const int u = toupper(static_cast<unsigned char>(c));
  return (u >= 'A' && u <= 'F') ? u - 'A' + 0xa : -EINVAL;
}
+
/* Decode a NUL-terminated hex string into `buf` (at most `len` bytes).
 * Returns the number of bytes written, or -EINVAL on a non-hex character,
 * an odd number of digits, or when `buf` would overflow. */
static inline int hex_to_buf(const char *hex, char *buf, int len)
{
  // local copy of hexdigit() so this routine reads top to bottom
  auto nibble = [](char c) -> int {
    if (c >= '0' && c <= '9')
      return c - '0';
    c = toupper(c);
    if (c >= 'A' && c <= 'F')
      return c - 'A' + 0xa;
    return -EINVAL;
  };

  int i = 0;
  const char *p = hex;
  while (*p) {
    if (i >= len)
      return -EINVAL;
    const int hi = nibble(*p++);
    if (hi < 0)
      return hi;
    buf[i] = hi << 4;
    if (!*p)
      return -EINVAL;  // odd number of hex digits
    const int lo = nibble(*p++);
    if (lo < 0)
      return lo;
    buf[i++] += lo;
  }
  return i;
}
+
/* Parse a boolean-ish string. Returns def_val when s is null; otherwise 1
 * for "true"/"on"/"yes"/"1" (case-insensitive) and 0 for anything else. */
static inline int rgw_str_to_bool(const char *s, int def_val)
{
  if (!s)
    return def_val;

  static const char *truthy[] = { "true", "on", "yes", "1" };
  for (const char *t : truthy) {
    if (strcasecmp(s, t) == 0)
      return 1;
  }
  return 0;
}
+
+static inline void append_rand_alpha(CephContext *cct, const std::string& src, std::string& dest, int len)
+{
+ dest = src;
+ char buf[len + 1];
+ gen_rand_alphanumeric(cct, buf, len);
+ dest.append("_");
+ dest.append(buf);
+}
+
/* Bytes rounded up to whole KiB. */
static inline uint64_t rgw_rounded_kb(uint64_t bytes)
{
  constexpr uint64_t kib = 1024;
  return (bytes + kib - 1) / kib;
}

/* Bytes rounded up to the 4 KiB allocation granularity. */
static inline uint64_t rgw_rounded_objsize(uint64_t bytes)
{
  constexpr uint64_t chunk = 4096;
  return (bytes + chunk - 1) / chunk * chunk;
}

/* Same 4 KiB rounding, expressed in KiB. */
static inline uint64_t rgw_rounded_objsize_kb(uint64_t bytes)
{
  return rgw_rounded_objsize(bytes) / 1024;
}
+
+/* implement combining step, S3 header canonicalization; k is a
+ * valid header and in lc form */
+void rgw_add_amz_meta_header(
+ meta_map_t& x_meta_map,
+ const std::string& k,
+ const std::string& v);
+
+enum rgw_set_action_if_set {
+ DISCARD=0, OVERWRITE, APPEND
+};
+
+bool rgw_set_amz_meta_header(
+ meta_map_t& x_meta_map,
+ const std::string& k,
+ const std::string& v, rgw_set_action_if_set f);
+
+extern std::string rgw_string_unquote(const std::string& s);
+extern void parse_csv_string(const std::string& ival, std::vector<std::string>& ovals);
+extern int parse_key_value(std::string& in_str, std::string& key, std::string& val);
+extern int parse_key_value(std::string& in_str, const char *delim, std::string& key, std::string& val);
+
+extern boost::optional<std::pair<std::string_view,std::string_view>>
+parse_key_value(const std::string_view& in_str,
+ const std::string_view& delim);
+extern boost::optional<std::pair<std::string_view,std::string_view>>
+parse_key_value(const std::string_view& in_str);
+
+struct rgw_name_to_flag {
+ const char *type_name;
+ uint32_t flag;
+};
+
+/** time parsing */
+extern int parse_time(const char *time_str, real_time *time);
+extern bool parse_rfc2616(const char *s, struct tm *t);
+extern bool parse_iso8601(const char *s, struct tm *t, uint32_t *pns = NULL, bool extended_format = true);
+extern std::string rgw_trim_whitespace(const std::string& src);
+extern std::string_view rgw_trim_whitespace(const std::string_view& src);
+extern std::string rgw_trim_quotes(const std::string& val);
+
+extern void rgw_to_iso8601(const real_time& t, char *dest, int buf_size);
+extern void rgw_to_iso8601(const real_time& t, std::string *dest);
+extern std::string rgw_to_asctime(const utime_t& t);
+
+struct perm_state_base {
+ CephContext *cct;
+ const rgw::IAM::Environment& env;
+ rgw::auth::Identity *identity;
+ const RGWBucketInfo bucket_info;
+ int perm_mask;
+ bool defer_to_bucket_acls;
+ boost::optional<PublicAccessBlockConfiguration> bucket_access_conf;
+
+ perm_state_base(CephContext *_cct,
+ const rgw::IAM::Environment& _env,
+ rgw::auth::Identity *_identity,
+ const RGWBucketInfo& _bucket_info,
+ int _perm_mask,
+ bool _defer_to_bucket_acls,
+ boost::optional<PublicAccessBlockConfiguration> _bucket_acess_conf = boost::none) :
+ cct(_cct),
+ env(_env),
+ identity(_identity),
+ bucket_info(_bucket_info),
+ perm_mask(_perm_mask),
+ defer_to_bucket_acls(_defer_to_bucket_acls),
+ bucket_access_conf(_bucket_acess_conf)
+ {}
+
+ virtual ~perm_state_base() {}
+
+ virtual const char *get_referer() const = 0;
+ virtual std::optional<bool> get_request_payer() const = 0; /*
+ * empty state means that request_payer param was not passed in
+ */
+
+};
+
/* Concrete perm_state_base that carries the per-request Referer header and
 * requestPayment flag. */
struct perm_state : public perm_state_base {
  const char *referer;
  bool request_payer;

  perm_state(CephContext *_cct,
             const rgw::IAM::Environment& _env,
             rgw::auth::Identity *_identity,
             const RGWBucketInfo& _bucket_info,
             int _perm_mask,
             bool _defer_to_bucket_acls,
             const char *_referer,
             bool _request_payer) : perm_state_base(_cct,
                                                    _env,
                                                    _identity,
                                                    _bucket_info,
                                                    _perm_mask,
                                                    _defer_to_bucket_acls),
                                    referer(_referer),
                                    request_payer(_request_payer) {}

  const char *get_referer() const override {
    return referer;
  }

  std::optional<bool> get_request_payer() const override {
    return request_payer;
  }
};
+
+/** Check if the req_state's user has the necessary permissions
+ * to do the requested action */
+bool verify_bucket_permission_no_policy(
+ const DoutPrefixProvider* dpp,
+ struct perm_state_base * const s,
+ RGWAccessControlPolicy * const user_acl,
+ RGWAccessControlPolicy * const bucket_acl,
+ const int perm);
+
+bool verify_user_permission_no_policy(const DoutPrefixProvider* dpp,
+ struct perm_state_base * const s,
+ RGWAccessControlPolicy * const user_acl,
+ const int perm);
+
+bool verify_object_permission_no_policy(const DoutPrefixProvider* dpp,
+ struct perm_state_base * const s,
+ RGWAccessControlPolicy * const user_acl,
+ RGWAccessControlPolicy * const bucket_acl,
+ RGWAccessControlPolicy * const object_acl,
+ const int perm);
+
+/** Check if the req_state's user has the necessary permissions
+ * to do the requested action */
+rgw::IAM::Effect eval_identity_or_session_policies(const DoutPrefixProvider* dpp,
+ const std::vector<rgw::IAM::Policy>& user_policies,
+ const rgw::IAM::Environment& env,
+ const uint64_t op,
+ const rgw::ARN& arn);
+bool verify_user_permission(const DoutPrefixProvider* dpp,
+ req_state * const s,
+ RGWAccessControlPolicy * const user_acl,
+ const std::vector<rgw::IAM::Policy>& user_policies,
+ const std::vector<rgw::IAM::Policy>& session_policies,
+ const rgw::ARN& res,
+ const uint64_t op,
+ bool mandatory_policy=true);
+bool verify_user_permission_no_policy(const DoutPrefixProvider* dpp,
+ req_state * const s,
+ RGWAccessControlPolicy * const user_acl,
+ const int perm);
+bool verify_user_permission(const DoutPrefixProvider* dpp,
+ req_state * const s,
+ const rgw::ARN& res,
+ const uint64_t op,
+ bool mandatory_policy=true);
+bool verify_user_permission_no_policy(const DoutPrefixProvider* dpp,
+ req_state * const s,
+ int perm);
+bool verify_bucket_permission(
+ const DoutPrefixProvider* dpp,
+ req_state * const s,
+ const rgw_bucket& bucket,
+ RGWAccessControlPolicy * const user_acl,
+ RGWAccessControlPolicy * const bucket_acl,
+ const boost::optional<rgw::IAM::Policy>& bucket_policy,
+ const std::vector<rgw::IAM::Policy>& identity_policies,
+ const std::vector<rgw::IAM::Policy>& session_policies,
+ const uint64_t op);
+bool verify_bucket_permission(const DoutPrefixProvider* dpp, req_state * const s, const uint64_t op);
+bool verify_bucket_permission_no_policy(
+ const DoutPrefixProvider* dpp,
+ req_state * const s,
+ RGWAccessControlPolicy * const user_acl,
+ RGWAccessControlPolicy * const bucket_acl,
+ const int perm);
+bool verify_bucket_permission_no_policy(const DoutPrefixProvider* dpp,
+ req_state * const s,
+ const int perm);
+int verify_bucket_owner_or_policy(req_state* const s,
+ const uint64_t op);
+extern bool verify_object_permission(
+ const DoutPrefixProvider* dpp,
+ req_state * const s,
+ const rgw_obj& obj,
+ RGWAccessControlPolicy * const user_acl,
+ RGWAccessControlPolicy * const bucket_acl,
+ RGWAccessControlPolicy * const object_acl,
+ const boost::optional<rgw::IAM::Policy>& bucket_policy,
+ const std::vector<rgw::IAM::Policy>& identity_policies,
+ const std::vector<rgw::IAM::Policy>& session_policies,
+ const uint64_t op);
+extern bool verify_object_permission(const DoutPrefixProvider* dpp, req_state *s, uint64_t op);
+extern bool verify_object_permission_no_policy(
+ const DoutPrefixProvider* dpp,
+ req_state * const s,
+ RGWAccessControlPolicy * const user_acl,
+ RGWAccessControlPolicy * const bucket_acl,
+ RGWAccessControlPolicy * const object_acl,
+ int perm);
+extern bool verify_object_permission_no_policy(const DoutPrefixProvider* dpp, req_state *s,
+ int perm);
+extern int verify_object_lock(
+ const DoutPrefixProvider* dpp,
+ const rgw::sal::Attrs& attrs,
+ const bool bypass_perm,
+ const bool bypass_governance_mode);
+
+/** Convert an input URL into a sane object name
+ * by converting %-escaped std::strings into characters, etc*/
+extern void rgw_uri_escape_char(char c, std::string& dst);
+extern std::string url_decode(const std::string_view& src_str,
+ bool in_query = false);
+extern void url_encode(const std::string& src, std::string& dst,
+ bool encode_slash = true);
+extern std::string url_encode(const std::string& src, bool encode_slash = true);
+extern std::string url_remove_prefix(const std::string& url); // Removes hhtp, https and www from url
+/* destination should be CEPH_CRYPTO_HMACSHA1_DIGESTSIZE bytes long */
+extern void calc_hmac_sha1(const char *key, int key_len,
+ const char *msg, int msg_len, char *dest);
+
+static inline sha1_digest_t
+calc_hmac_sha1(const std::string_view& key, const std::string_view& msg) {
+ sha1_digest_t dest;
+ calc_hmac_sha1(key.data(), key.size(), msg.data(), msg.size(),
+ reinterpret_cast<char*>(dest.v));
+ return dest;
+}
+
+/* destination should be CEPH_CRYPTO_HMACSHA256_DIGESTSIZE bytes long */
+extern void calc_hmac_sha256(const char *key, int key_len,
+ const char *msg, int msg_len,
+ char *dest);
+
+static inline sha256_digest_t
+calc_hmac_sha256(const char *key, const int key_len,
+ const char *msg, const int msg_len) {
+ sha256_digest_t dest;
+ calc_hmac_sha256(key, key_len, msg, msg_len,
+ reinterpret_cast<char*>(dest.v));
+ return dest;
+}
+
+static inline sha256_digest_t
+calc_hmac_sha256(const std::string_view& key, const std::string_view& msg) {
+ sha256_digest_t dest;
+ calc_hmac_sha256(key.data(), key.size(),
+ msg.data(), msg.size(),
+ reinterpret_cast<char*>(dest.v));
+ return dest;
+}
+
+static inline sha256_digest_t
+calc_hmac_sha256(const sha256_digest_t &key,
+ const std::string_view& msg) {
+ sha256_digest_t dest;
+ calc_hmac_sha256(reinterpret_cast<const char*>(key.v), sha256_digest_t::SIZE,
+ msg.data(), msg.size(),
+ reinterpret_cast<char*>(dest.v));
+ return dest;
+}
+
+static inline sha256_digest_t
+calc_hmac_sha256(const std::vector<unsigned char>& key,
+ const std::string_view& msg) {
+ sha256_digest_t dest;
+ calc_hmac_sha256(reinterpret_cast<const char*>(key.data()), key.size(),
+ msg.data(), msg.size(),
+ reinterpret_cast<char*>(dest.v));
+ return dest;
+}
+
+template<size_t KeyLenN>
+static inline sha256_digest_t
+calc_hmac_sha256(const std::array<unsigned char, KeyLenN>& key,
+ const std::string_view& msg) {
+ sha256_digest_t dest;
+ calc_hmac_sha256(reinterpret_cast<const char*>(key.data()), key.size(),
+ msg.data(), msg.size(),
+ reinterpret_cast<char*>(dest.v));
+ return dest;
+}
+
+extern sha256_digest_t calc_hash_sha256(const std::string_view& msg);
+
+extern ceph::crypto::SHA256* calc_hash_sha256_open_stream();
+extern void calc_hash_sha256_update_stream(ceph::crypto::SHA256* hash,
+ const char* msg,
+ int len);
+extern std::string calc_hash_sha256_close_stream(ceph::crypto::SHA256** phash);
+extern std::string calc_hash_sha256_restart_stream(ceph::crypto::SHA256** phash);
+
+extern int rgw_parse_op_type_list(const std::string& str, uint32_t *perm);
+
+static constexpr uint32_t MATCH_POLICY_ACTION = 0x01;
+static constexpr uint32_t MATCH_POLICY_RESOURCE = 0x02;
+static constexpr uint32_t MATCH_POLICY_ARN = 0x04;
+static constexpr uint32_t MATCH_POLICY_STRING = 0x08;
+
+extern bool match_policy(std::string_view pattern, std::string_view input,
+ uint32_t flag);
+
+extern std::string camelcase_dash_http_attr(const std::string& orig);
+extern std::string lowercase_dash_http_attr(const std::string& orig);
+
+void rgw_setup_saved_curl_handles();
+void rgw_release_all_curl_handles();
+
/* Copy s into *dest, prefixing every occurrence of esc_char or special_char
 * with esc_char (inverse of rgw_unescape_str()). */
static inline void rgw_escape_str(const std::string& s, char esc_char,
                                  char special_char, std::string *dest)
{
  // build into a std::string instead of the original `char dest_buf[s.size()*2+1]`
  // VLA, which is non-standard C++ and can overflow the stack for large input
  std::string out;
  out.reserve(s.size() * 2);

  for (const char c : s) {
    if (c == esc_char || c == special_char) {
      out.push_back(esc_char);
    }
    out.push_back(c);
  }
  *dest = std::move(out);
}
+
/* Unescape s starting at ofs into *dest, stopping at the first unescaped
 * special_char. Returns the index just past that separator, or
 * std::string::npos (as ssize_t, i.e. -1) when the string ends first. */
static inline ssize_t rgw_unescape_str(const std::string& s, ssize_t ofs,
                                       char esc_char, char special_char,
                                       std::string *dest)
{
  // build into a std::string instead of the original `char dest_buf[s.size()+1]`
  // VLA, which is non-standard C++ and can overflow the stack for large input
  std::string out;
  out.reserve(s.size());
  bool esc = false;

  for (size_t i = ofs; i < s.size(); i++) {
    const char c = s[i];
    if (!esc && c == esc_char) {
      esc = true;
      continue;
    }
    if (!esc && c == special_char) {
      *dest = std::move(out);
      return (ssize_t)i + 1;
    }
    out.push_back(c);
    esc = false;
  }
  *dest = std::move(out);
  return std::string::npos;
}
+
// Copy a bufferlist into a std::string, stripping any trailing NUL bytes
// (stored values may carry terminating NULs that should not be part of
// the resulting string).
static inline std::string rgw_bl_str(ceph::buffer::list& raw)
{
  size_t len = raw.length();
  std::string s(raw.c_str(), len);
  // trim trailing NULs one at a time
  while (len && !s[len - 1]) {
    --len;
    s.resize(len);
  }
  return s;
}
+
// Decode a T from the start of a bufferlist.
// Returns 0 on success, -EIO when the payload cannot be decoded.
template <typename T>
int decode_bl(bufferlist& bl, T& t)
{
  auto iter = bl.cbegin();
  try {
    decode(t, iter);
  } catch (buffer::error& err) {
    return -EIO;
  }
  return 0;
}
+
+extern int rgw_bucket_parse_bucket_instance(const std::string& bucket_instance, std::string *bucket_name, std::string *bucket_id, int *shard_id);
+
+boost::intrusive_ptr<CephContext>
+rgw_global_init(const std::map<std::string,std::string> *defaults,
+ std::vector < const char* >& args,
+ uint32_t module_type, code_environment_t code_env,
+ int flags);
diff --git a/src/rgw/rgw_compression.cc b/src/rgw/rgw_compression.cc
new file mode 100644
index 000000000..8306e766a
--- /dev/null
+++ b/src/rgw/rgw_compression.cc
@@ -0,0 +1,236 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_compression.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
// Decode RGWCompressionInfo from the raw compression xattr payload and
// report whether the object data needs decompression on read.
// Returns 0 on success; -EIO when the blob cannot be decoded or carries
// no compression blocks.
int rgw_compression_info_from_attr(const bufferlist& attr,
                                   bool& need_decompress,
                                   RGWCompressionInfo& cs_info)
{
  auto bliter = attr.cbegin();
  try {
    decode(cs_info, bliter);
  } catch (buffer::error& err) {
    return -EIO;
  }
  if (cs_info.blocks.size() == 0) {
    // a valid compression attr always describes at least one block
    return -EIO;
  }
  if (cs_info.compression_type != "none")
    need_decompress = true;
  else
    need_decompress = false;
  return 0;
}
+
+int rgw_compression_info_from_attrset(const map<string, bufferlist>& attrs,
+ bool& need_decompress,
+ RGWCompressionInfo& cs_info)
+{
+ auto value = attrs.find(RGW_ATTR_COMPRESSION);
+ if (value == attrs.end()) {
+ need_decompress = false;
+ return 0;
+ }
+ return rgw_compression_info_from_attr(value->second, need_decompress, cs_info);
+}
+
//------------RGWPutObj_Compress---------------

// Compress one chunk of an object write and forward it downstream.
// Compression is all-or-nothing per object: if the first chunk fails to
// compress the whole object is stored uncompressed; if a later chunk fails
// after earlier ones were stored compressed, the write aborts with -EIO.
int RGWPutObj_Compress::process(bufferlist&& in, uint64_t logical_offset)
{
  bufferlist out;
  compressed_ofs = logical_offset;

  if (in.length() > 0) {
    // compression stuff
    if ((logical_offset > 0 && compressed) || // if previous part was compressed
        (logical_offset == 0)) { // or it's the first part
      ldout(cct, 10) << "Compression for rgw is enabled, compress part " << in.length() << dendl;
      int cr = compressor->compress(in, out, compressor_message);
      if (cr < 0) {
        if (logical_offset > 0) {
          // earlier parts already went out compressed - can't switch now
          lderr(cct) << "Compression failed with exit code " << cr
                     << " for next part, compression process failed" << dendl;
          return -EIO;
        }
        compressed = false;
        ldout(cct, 5) << "Compression failed with exit code " << cr
                      << " for first part, storing uncompressed" << dendl;
        out = std::move(in);
      } else {
        compressed = true;

        // record the mapping from this chunk's logical (uncompressed) offset
        // to its offset and length in the compressed stream
        compression_block newbl;
        size_t bs = blocks.size();
        newbl.old_ofs = logical_offset;
        newbl.new_ofs = bs > 0 ? blocks[bs-1].len + blocks[bs-1].new_ofs : 0;
        newbl.len = out.length();
        blocks.push_back(newbl);

        compressed_ofs = newbl.new_ofs;
      }
    } else {
      // previous part was stored uncompressed; keep the rest that way too
      compressed = false;
      out = std::move(in);
    }
    // end of compression stuff
  } else {
    // zero-length (flush) chunk: forward it at the end of the compressed stream
    size_t bs = blocks.size();
    compressed_ofs = bs > 0 ? blocks[bs-1].len + blocks[bs-1].new_ofs : logical_offset;
  }

  return Pipe::process(std::move(out), compressed_ofs);
}
+
//----------------RGWGetObj_Decompress---------------------
// Instantiates the compressor plugin named in cs_info. A load failure is
// only logged here; handle_data() re-checks and returns -EIO if the
// compressor is still unavailable.
RGWGetObj_Decompress::RGWGetObj_Decompress(CephContext* cct_,
                                           RGWCompressionInfo* cs_info_,
                                           bool partial_content_,
                                           RGWGetObj_Filter* next): RGWGetObj_Filter(next),
                                                                    cct(cct_),
                                                                    cs_info(cs_info_),
                                                                    partial_content(partial_content_),
                                                                    q_ofs(0),
                                                                    q_len(0),
                                                                    cur_ofs(0)
{
  compressor = Compressor::create(cct, cs_info->compression_type);
  if (!compressor.get())
    lderr(cct) << "Cannot load compressor of type " << cs_info->compression_type << dendl;
}
+
// Consume a chunk of the compressed stream, decompress every complete
// compression block it contains, and forward plain data downstream.
// State across calls: `waiting` buffers the tail of a block that arrived
// incomplete, `cur_ofs` tracks our position in the compressed stream, and
// q_ofs/q_len (set by fixup_range) trim the decompressed output to the
// range the client actually requested.
int RGWGetObj_Decompress::handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len)
{
  ldout(cct, 10) << "Compression for rgw is enabled, decompress part "
                 << "bl_ofs=" << bl_ofs << ", bl_len=" << bl_len << dendl;

  if (!compressor.get()) {
    // if compressor isn't available - error, because cannot return decompressed data?
    lderr(cct) << "Cannot load compressor of type " << cs_info->compression_type << dendl;
    return -EIO;
  }
  bufferlist out_bl, in_bl, temp_in_bl;
  bl.begin(bl_ofs).copy(bl_len, temp_in_bl);
  bl_ofs = 0;
  int r = 0;
  // prepend any leftover bytes of an incomplete block from the previous call
  if (waiting.length() != 0) {
    in_bl.append(waiting);
    in_bl.append(temp_in_bl);
    waiting.clear();
  } else {
    in_bl = std::move(temp_in_bl);
  }
  bl_len = in_bl.length();

  auto iter_in_bl = in_bl.cbegin();
  while (first_block <= last_block) {
    bufferlist tmp;
    off_t ofs_in_bl = first_block->new_ofs - cur_ofs;
    if (ofs_in_bl + (off_t)first_block->len > bl_len) {
      // not complete block, put it to waiting
      unsigned tail = bl_len - ofs_in_bl;
      if (iter_in_bl.get_off() != ofs_in_bl) {
        iter_in_bl.seek(ofs_in_bl);
      }
      iter_in_bl.copy(tail, waiting);
      // back cur_ofs up so the block offset math works on the next call
      cur_ofs -= tail;
      break;
    }
    if (iter_in_bl.get_off() != ofs_in_bl) {
      iter_in_bl.seek(ofs_in_bl);
    }
    iter_in_bl.copy(first_block->len, tmp);
    int cr = compressor->decompress(tmp, out_bl, cs_info->compressor_message);
    if (cr < 0) {
      lderr(cct) << "Decompression failed with exit code " << cr << dendl;
      return cr;
    }
    ++first_block;
    // flush decompressed data downstream in rgw_max_chunk_size pieces,
    // skipping the q_ofs prefix and never exceeding the q_len budget
    while (out_bl.length() - q_ofs >=
           static_cast<off_t>(cct->_conf->rgw_max_chunk_size)) {
      off_t ch_len = std::min<off_t>(cct->_conf->rgw_max_chunk_size, q_len);
      q_len -= ch_len;
      r = next->handle_data(out_bl, q_ofs, ch_len);
      if (r < 0) {
        lsubdout(cct, rgw, 0) << "handle_data failed with exit code " << r << dendl;
        return r;
      }
      out_bl.splice(0, q_ofs + ch_len);
      q_ofs = 0;
    }
  }

  cur_ofs += bl_len;
  // forward whatever decompressed data remains below the chunk threshold
  off_t ch_len = std::min<off_t>(out_bl.length() - q_ofs, q_len);
  if (ch_len > 0) {
    r = next->handle_data(out_bl, q_ofs, ch_len);
    if (r < 0) {
      lsubdout(cct, rgw, 0) << "handle_data failed with exit code " << r << dendl;
      return r;
    }
    out_bl.splice(0, q_ofs + ch_len);
    q_len -= ch_len;
    q_ofs = 0;
  }
  return r;
}
+
// Translate the client's requested range over the decompressed data into a
// range over the compressed stream, selecting the compression blocks that
// cover it. Also records q_ofs (offset of the requested start within the
// first block's decompressed output) and q_len (requested decompressed
// length) so handle_data() can trim what it sends downstream.
int RGWGetObj_Decompress::fixup_range(off_t& ofs, off_t& end)
{
  if (partial_content) {
    // if user set range, we need to calculate it in decompressed data
    first_block = cs_info->blocks.begin(); last_block = cs_info->blocks.begin();
    if (cs_info->blocks.size() > 1) {
      vector<compression_block>::iterator fb, lb;
      // not bad to use auto for lambda, I think
      auto cmp_u = [] (off_t ofs, const compression_block& e) { return (uint64_t)ofs < e.old_ofs; };
      auto cmp_l = [] (const compression_block& e, off_t ofs) { return e.old_ofs <= (uint64_t)ofs; };
      // binary-search the block table by uncompressed (old_ofs) offsets
      fb = upper_bound(cs_info->blocks.begin()+1,
                       cs_info->blocks.end(),
                       ofs,
                       cmp_u);
      first_block = fb - 1;
      lb = lower_bound(fb,
                       cs_info->blocks.end(),
                       end,
                       cmp_l);
      last_block = lb - 1;
    }
  } else {
    // whole-object read: all blocks are in play
    first_block = cs_info->blocks.begin(); last_block = cs_info->blocks.end() - 1;
  }

  q_ofs = ofs - first_block->old_ofs;
  q_len = end + 1 - ofs;

  // rewrite the range in compressed-stream coordinates for the next filter
  ofs = first_block->new_ofs;
  end = last_block->new_ofs + last_block->len - 1;

  cur_ofs = ofs;
  waiting.clear();

  return next->fixup_range(ofs, end);
}
+
// Formatter output of one block-mapping entry (offsets and length).
void compression_block::dump(Formatter *f) const
{
  f->dump_unsigned("old_ofs", old_ofs);
  f->dump_unsigned("new_ofs", new_ofs);
  f->dump_unsigned("len", len);
}
+
// Formatter output of the compression metadata; compressor_message is only
// emitted when present.
void RGWCompressionInfo::dump(Formatter *f) const
{
  f->dump_string("compression_type", compression_type);
  f->dump_unsigned("orig_size", orig_size);
  if (compressor_message) {
    f->dump_int("compressor_message", *compressor_message);
  }
  ::encode_json("blocks", blocks, f);
}
+
diff --git a/src/rgw/rgw_compression.h b/src/rgw/rgw_compression.h
new file mode 100644
index 000000000..84250bfe4
--- /dev/null
+++ b/src/rgw/rgw_compression.h
@@ -0,0 +1,62 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <vector>
+
+#include "compressor/Compressor.h"
+#include "rgw_putobj.h"
+#include "rgw_op.h"
+#include "rgw_compression_types.h"
+
+int rgw_compression_info_from_attr(const bufferlist& attr,
+ bool& need_decompress,
+ RGWCompressionInfo& cs_info);
+int rgw_compression_info_from_attrset(const std::map<std::string, bufferlist>& attrs,
+ bool& need_decompress,
+ RGWCompressionInfo& cs_info);
+
// GetObj filter that transparently decompresses object data on read.
// fixup_range() must run first to translate the client's (decompressed)
// range into compressed offsets; handle_data() then decompresses block by
// block and forwards plain data to `next`.
class RGWGetObj_Decompress : public RGWGetObj_Filter
{
  CephContext* cct;
  CompressorRef compressor;
  RGWCompressionInfo* cs_info;
  bool partial_content;  // true when serving a ranged request
  std::vector<compression_block>::iterator first_block, last_block;  // blocks covering the requested range
  off_t q_ofs, q_len;    // trim window within the decompressed output
  uint64_t cur_ofs;      // current offset in the compressed stream
  bufferlist waiting;    // tail of a compressed block that arrived incomplete
public:
  RGWGetObj_Decompress(CephContext* cct_,
                       RGWCompressionInfo* cs_info_,
                       bool partial_content_,
                       RGWGetObj_Filter* next);
  virtual ~RGWGetObj_Decompress() override {}

  int handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) override;
  int fixup_range(off_t& ofs, off_t& end) override;

};
+
// PutObj pipe stage that compresses data on write, building the
// block map (uncompressed->compressed offsets) consumed by
// RGWGetObj_Decompress on read.
class RGWPutObj_Compress : public rgw::putobj::Pipe
{
  CephContext* cct;
  bool compressed{false};  // did at least the first chunk compress successfully?
  CompressorRef compressor;
  std::optional<int32_t> compressor_message;  // plugin-specific hint produced by compress()
  std::vector<compression_block> blocks;      // uncompressed->compressed block map
  uint64_t compressed_ofs{0};                 // write offset in the compressed stream
public:
  RGWPutObj_Compress(CephContext* cct_, CompressorRef compressor,
                     rgw::sal::DataProcessor *next)
    : Pipe(next), cct(cct_), compressor(compressor) {}
  virtual ~RGWPutObj_Compress() override {};

  int process(bufferlist&& data, uint64_t logical_offset) override;

  bool is_compressed() { return compressed; }
  std::vector<compression_block>& get_compression_blocks() { return blocks; }
  std::optional<int32_t> get_compressor_message() { return compressor_message; }

}; /* RGWPutObj_Compress */
diff --git a/src/rgw/rgw_compression_types.h b/src/rgw/rgw_compression_types.h
new file mode 100644
index 000000000..efc002efb
--- /dev/null
+++ b/src/rgw/rgw_compression_types.h
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+// One entry of the compression block map: a contiguous chunk of the
+// original object and where its compressed form lives.
+struct compression_block {
+  uint64_t old_ofs; // offset in the original (uncompressed) data
+  uint64_t new_ofs; // offset in the compressed stream
+  uint64_t len;     // compressed length of this block
+
+  // Wire format v1; field order is part of the on-disk encoding and
+  // must not change.
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(old_ofs, bl);
+    encode(new_ofs, bl);
+    encode(len, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(old_ofs, bl);
+    decode(new_ofs, bl);
+    decode(len, bl);
+    DECODE_FINISH(bl);
+  }
+  void dump(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(compression_block)
+
+// Compression metadata stored with an object: algorithm name, original
+// (uncompressed) size, the block map, and an optional
+// compressor-specific message (added in encoding v2).
+struct RGWCompressionInfo {
+  std::string compression_type;
+  uint64_t orig_size;
+  std::optional<int32_t> compressor_message;
+  std::vector<compression_block> blocks;
+
+  RGWCompressionInfo() : compression_type("none"), orig_size(0) {}
+  // The hand-written copy constructor was exactly the memberwise
+  // default; let the compiler generate it (Rule of Zero). Declaring it
+  // defaulted (rather than deleting the declaration) keeps the same
+  // special-member set as before.
+  RGWCompressionInfo(const RGWCompressionInfo& cs_info) = default;
+
+  // Wire format v2 (compat v1); field order must not change.
+  void encode(bufferlist& bl) const {
+    ENCODE_START(2, 1, bl);
+    encode(compression_type, bl);
+    encode(orig_size, bl);
+    encode(compressor_message, bl);
+    encode(blocks, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(2, bl);
+    decode(compression_type, bl);
+    decode(orig_size, bl);
+    if (struct_v >= 2) {
+      // compressor_message did not exist in v1 encodings
+      decode(compressor_message, bl);
+    }
+    decode(blocks, bl);
+    DECODE_FINISH(bl);
+  }
+  void dump(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(RGWCompressionInfo)
+
diff --git a/src/rgw/rgw_coroutine.cc b/src/rgw/rgw_coroutine.cc
new file mode 100644
index 000000000..a9c9c38e3
--- /dev/null
+++ b/src/rgw/rgw_coroutine.cc
@@ -0,0 +1,1130 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "include/Context.h"
+#include "common/ceph_json.h"
+#include "rgw_coroutine.h"
+
+// re-include our assert to clobber the system one; fix dout:
+#include "include/ceph_assert.h"
+
+#include <boost/asio/yield.hpp>
+
+#define dout_subsys ceph_subsys_rgw
+#define dout_context g_ceph_context
+
+using namespace std;
+
+// Timer callback context used by wait_interval(): when the timer
+// fires, wake the waiter registered under 'opaque'.
+class RGWCompletionManager::WaitContext : public Context {
+  RGWCompletionManager *manager;
+  void *opaque;
+public:
+  WaitContext(RGWCompletionManager *_cm, void *_opaque) : manager(_cm), opaque(_opaque) {}
+  void finish(int r) override {
+    manager->_wakeup(opaque);
+  }
+};
+
+RGWCompletionManager::RGWCompletionManager(CephContext *_cct) : cct(_cct),
+                                                                timer(cct, lock)
+{
+  timer.init();
+}
+
+RGWCompletionManager::~RGWCompletionManager()
+{
+  // stop all pending interval waits before the timer goes away
+  std::lock_guard l{lock};
+  timer.cancel_all_events();
+  timer.shutdown();
+}
+
+// Thread-safe entry point: queue a completion for io_id.
+void RGWCompletionManager::complete(RGWAioCompletionNotifier *cn, const rgw_io_id& io_id, void *user_info)
+{
+  std::lock_guard l{lock};
+  _complete(cn, io_id, user_info);
+}
+
+// Track a notifier so go_down() can unregister it if it never fires.
+void RGWCompletionManager::register_completion_notifier(RGWAioCompletionNotifier *cn)
+{
+  std::lock_guard l{lock};
+  if (cn) {
+    cns.insert(cn);
+  }
+}
+
+void RGWCompletionManager::unregister_completion_notifier(RGWAioCompletionNotifier *cn)
+{
+  std::lock_guard l{lock};
+  if (cn) {
+    cns.erase(cn);
+  }
+}
+
+/* Queue a completion for io_id and wake any get_next() waiter.
+ * Caller must hold 'lock'. The notifier (if any) is dropped from the
+ * registered set so go_down() no longer needs to unregister it. */
+void RGWCompletionManager::_complete(RGWAioCompletionNotifier *cn, const rgw_io_id& io_id, void *user_info)
+{
+  if (cn) {
+    cns.erase(cn);
+  }
+
+  if (complete_reqs_set.find(io_id) != complete_reqs_set.end()) {
+    /* already have completion for this io_id, don't allow multiple completions for it */
+    /* NOTE(review): nothing in this file ever inserts into
+     * complete_reqs_set (get_next()/try_get_next() only erase from it),
+     * so this dedup check can never fire as written. Confirm whether an
+     * insert is missing here -- but note a naive insert would coalesce
+     * timer wakeups, which all share the io_id {0, -1} produced by
+     * _wakeup(). */
+    return;
+  }
+  complete_reqs.push_back(io_completion{io_id, user_info});
+  cond.notify_all();
+}
+
+// Block until a completion is available and pop it into *io.
+// Returns -ECANCELED once go_down() was called and the queue is empty.
+int RGWCompletionManager::get_next(io_completion *io)
+{
+  std::unique_lock l{lock};
+  while (complete_reqs.empty()) {
+    if (going_down) {
+      return -ECANCELED;
+    }
+    cond.wait(l);
+  }
+  *io = complete_reqs.front();
+  // allow a future completion with the same io_id to be queued again
+  complete_reqs_set.erase(io->io_id);
+  complete_reqs.pop_front();
+  return 0;
+}
+
+// Non-blocking variant of get_next(); returns false when no completion
+// is pending.
+bool RGWCompletionManager::try_get_next(io_completion *io)
+{
+  std::lock_guard l{lock};
+  if (complete_reqs.empty()) {
+    return false;
+  }
+  *io = complete_reqs.front();
+  complete_reqs_set.erase(io->io_id);
+  complete_reqs.pop_front();
+  return true;
+}
+
+// Begin shutdown: detach all outstanding notifiers and wake every
+// get_next() waiter so they observe going_down.
+void RGWCompletionManager::go_down()
+{
+  std::lock_guard l{lock};
+  for (auto cn : cns) {
+    cn->unregister();
+  }
+  going_down = true;
+  cond.notify_all();
+}
+
+// Arrange for user_info to be completed after 'interval'; 'opaque' is
+// the wakeup key (a given key may have only one pending wait).
+void RGWCompletionManager::wait_interval(void *opaque, const utime_t& interval, void *user_info)
+{
+  std::lock_guard l{lock};
+  ceph_assert(waiters.find(opaque) == waiters.end());
+  waiters[opaque] = user_info;
+  timer.add_event_after(interval, new WaitContext(this, opaque));
+}
+
+// Complete a pending interval wait early (or when its timer fires).
+void RGWCompletionManager::wakeup(void *opaque)
+{
+  std::lock_guard l{lock};
+  _wakeup(opaque);
+}
+
+// Lock must be held. No-op if the waiter was already woken.
+void RGWCompletionManager::_wakeup(void *opaque)
+{
+  map<void *, void *>::iterator iter = waiters.find(opaque);
+  if (iter != waiters.end()) {
+    void *user_id = iter->second;
+    waiters.erase(iter);
+    _complete(NULL, rgw_io_id{0, -1} /* no IO id */, user_id);
+  }
+}
+
+RGWCoroutine::~RGWCoroutine() {
+  // drop the references we hold on stacks we spawned
+  for (auto stack : spawned.entries) {
+    stack->put();
+  }
+}
+
+// Attach a new IO provider to our stack so its completions route back
+// to this coroutine.
+void RGWCoroutine::init_new_io(RGWIOProvider *io_provider)
+{
+  ceph_assert(stack); // if there's no stack, io_provider won't be initialized
+  stack->init_new_io(io_provider);
+}
+
+void RGWCoroutine::set_io_blocked(bool flag) {
+  if (stack) {
+    stack->set_io_blocked(flag);
+  }
+}
+
+void RGWCoroutine::set_sleeping(bool flag) {
+  if (stack) {
+    stack->set_sleeping(flag);
+  }
+}
+
+int RGWCoroutine::io_block(int ret, int64_t io_id) {
+  // block on all channels of the given IO id
+  return io_block(ret, rgw_io_id{io_id, -1});
+}
+
+// Suspend until the given IO completes; returns 0 (no block needed)
+// when the completion has already been delivered, otherwise 'ret'.
+int RGWCoroutine::io_block(int ret, const rgw_io_id& io_id) {
+  if (!stack) {
+    return 0;
+  }
+  if (stack->consume_io_finish(io_id)) {
+    // completion raced ahead of us; don't block
+    return 0;
+  }
+  set_io_blocked(true);
+  stack->set_io_blocked_id(io_id);
+  return ret;
+}
+
+void RGWCoroutine::io_complete(const rgw_io_id& io_id) {
+  if (stack) {
+    stack->io_complete(io_id);
+  }
+}
+
+void RGWCoroutine::StatusItem::dump(Formatter *f) const {
+  ::encode_json("timestamp", timestamp, f);
+  ::encode_json("status", status, f);
+}
+
+// Rotate the current status line into bounded history and return the
+// (cleared) stream for the caller to write the new status into.
+stringstream& RGWCoroutine::Status::set_status()
+{
+  std::unique_lock l{lock};
+  string s = status.str();
+  status.str(string());
+  // a zero timestamp means no status was ever set; nothing to archive
+  if (!timestamp.is_zero()) {
+    history.push_back(StatusItem(timestamp, s));
+  }
+  if (history.size() > (size_t)max_history) {
+    history.pop_front();
+  }
+  timestamp = ceph_clock_now();
+
+  return status;
+}
+
+RGWCoroutinesManager::~RGWCoroutinesManager() {
+  stop();
+  completion_mgr->put();
+  if (cr_registry) {
+    cr_registry->remove(this);
+  }
+}
+
+// Monotonically increasing IO id, unique within this manager.
+int64_t RGWCoroutinesManager::get_next_io_id()
+{
+  return (int64_t)++max_io_id;
+}
+
+// Monotonically increasing stack id, unique within this manager.
+uint64_t RGWCoroutinesManager::get_next_stack_id() {
+  return (uint64_t)++max_stack_id;
+}
+
+RGWCoroutinesStack::RGWCoroutinesStack(CephContext *_cct, RGWCoroutinesManager *_ops_mgr, RGWCoroutine *start) : cct(_cct), ops_mgr(_ops_mgr),
+                                                                                                                 done_flag(false), error_flag(false), blocked_flag(false),
+                                                                                                                 sleep_flag(false), interval_wait_flag(false), is_scheduled(false), is_waiting_for_child(false),
+                                                                                                                 retcode(0), run_count(0),
+                                                                                                                 env(NULL), parent(NULL)
+{
+  id = ops_mgr->get_next_stack_id();
+  // optionally seed the stack with an initial coroutine
+  if (start) {
+    ops.push_back(start);
+  }
+  pos = ops.begin();
+}
+
+RGWCoroutinesStack::~RGWCoroutinesStack()
+{
+  // release the coroutines still on the stack...
+  for (auto op : ops) {
+    op->put();
+  }
+
+  // ...and any spawned child stacks that were never collected
+  for (auto stack : spawned.entries) {
+    stack->put();
+  }
+}
+
+// Run the coroutine at the top of this stack for one step. When the op
+// finishes, unwind it (propagating its retcode to the caller op) and
+// return unwind()'s result; otherwise return 0.
+int RGWCoroutinesStack::operate(const DoutPrefixProvider *dpp, RGWCoroutinesEnv *_env)
+{
+  env = _env;
+  RGWCoroutine *op = *pos;
+  op->stack = this;
+  ldpp_dout(dpp, 20) << *op << ": operate()" << dendl;
+  int r = op->operate_wrapper(dpp);
+  if (r < 0) {
+    ldpp_dout(dpp, 20) << *op << ": operate() returned r=" << r << dendl;
+  }
+
+  error_flag = op->is_error();
+
+  if (op->is_done()) {
+    int op_retcode = r;
+    r = unwind(op_retcode);
+    op->put();
+    done_flag = (pos == ops.end());
+    // a finished stack can no longer be considered blocked
+    blocked_flag &= !done_flag;
+    if (done_flag) {
+      retcode = op_retcode;
+    }
+    return r;
+  }
+
+  /* should r ever be negative at this point? */
+  ceph_assert(r >= 0);
+
+  return 0;
+}
+
+string RGWCoroutinesStack::error_str()
+{
+ if (pos != ops.end()) {
+ return (*pos)->error_str();
+ }
+ return string();
+}
+
+// Push next_op on top of the stack and advance 'pos' to run it next.
+// A null op is a no-op.
+void RGWCoroutinesStack::call(RGWCoroutine *next_op) {
+  if (!next_op) {
+    return;
+  }
+  ops.push_back(next_op);
+  if (pos != ops.end()) {
+    ++pos;
+  } else {
+    // stack was empty (or fully unwound): start from the beginning
+    pos = ops.begin();
+  }
+}
+
+// Queue this stack for execution (takes the manager lock).
+void RGWCoroutinesStack::schedule()
+{
+  env->manager->schedule(env, this);
+}
+
+// Lock-free variant; caller must already hold the manager lock.
+void RGWCoroutinesStack::_schedule()
+{
+  env->manager->_schedule(env, this);
+}
+
+// Start 'op' on a fresh child stack. The child is recorded under
+// source_op's spawned list (or this stack's, when source_op is null)
+// so it can later be collect()ed. When 'wait' is set, this stack
+// blocks until the child finishes.
+RGWCoroutinesStack *RGWCoroutinesStack::spawn(RGWCoroutine *source_op, RGWCoroutine *op, bool wait)
+{
+  if (!op) {
+    return NULL;
+  }
+
+  rgw_spawned_stacks *s = (source_op ? &source_op->spawned : &spawned);
+
+  RGWCoroutinesStack *stack = env->manager->allocate_stack();
+  s->add_pending(stack);
+  stack->parent = this;
+
+  stack->get(); /* we'll need to collect the stack */
+  stack->call(op);
+
+  env->manager->schedule(env, stack);
+
+  if (wait) {
+    set_blocked_by(stack);
+  }
+
+  return stack;
+}
+
+RGWCoroutinesStack *RGWCoroutinesStack::spawn(RGWCoroutine *op, bool wait)
+{
+  return spawn(NULL, op, wait);
+}
+
+// Put this stack to sleep for 'interval'; it is marked io_blocked +
+// interval_wait so the scheduler doesn't count it as a real IO.
+int RGWCoroutinesStack::wait(const utime_t& interval)
+{
+  RGWCompletionManager *completion_mgr = env->manager->get_completion_mgr();
+  completion_mgr->wait_interval((void *)this, interval, (void *)this);
+  set_io_blocked(true);
+  set_interval_wait(true);
+  return 0;
+}
+
+// Cut an interval wait short.
+void RGWCoroutinesStack::wakeup()
+{
+  RGWCompletionManager *completion_mgr = env->manager->get_completion_mgr();
+  completion_mgr->wakeup((void *)this);
+}
+
+// Deliver an IO completion addressed to this stack.
+void RGWCoroutinesStack::io_complete(const rgw_io_id& io_id)
+{
+  RGWCompletionManager *completion_mgr = env->manager->get_completion_mgr();
+  completion_mgr->complete(nullptr, io_id, (void *)this);
+}
+
+/* Pop the finished top coroutine, handing its return code and any
+ * stacks it spawned to the coroutine that called it. When the bottom
+ * of the stack is reached the stack is emptied and retcode is
+ * returned; otherwise returns 0. */
+int RGWCoroutinesStack::unwind(int retcode)
+{
+  rgw_spawned_stacks *src_spawned = &(*pos)->spawned;
+
+  if (pos == ops.begin()) {
+    ldout(cct, 15) << "stack " << (void *)this << " end" << dendl;
+    spawned.inherit(src_spawned);
+    ops.clear();
+    pos = ops.end();
+    return retcode;
+  }
+
+  --pos;
+  ops.pop_back();
+  RGWCoroutine *op = *pos;
+  op->set_retcode(retcode);
+  op->spawned.inherit(src_spawned);
+  return 0;
+}
+
+// Abort every coroutine on this stack with -ECANCELED and drop our
+// self reference.
+void RGWCoroutinesStack::cancel()
+{
+  while (!ops.empty()) {
+    RGWCoroutine *op = *pos;
+    unwind(-ECANCELED);
+    op->put();
+  }
+  put();
+}
+
+/* Reap finished child stacks spawned by 'op' (or by this stack when op
+ * is null), releasing their references. *ret receives the first
+ * negative child status encountered; on error the remaining children
+ * are kept for a later pass. Returns true if needs to be called again. */
+bool RGWCoroutinesStack::collect(RGWCoroutine *op, int *ret, RGWCoroutinesStack *skip_stack, uint64_t *stack_id) /* returns true if needs to be called again */
+{
+  bool need_retry = false;
+  rgw_spawned_stacks *s = (op ? &op->spawned : &spawned);
+  *ret = 0;
+  // survivors: stacks still running, plus the explicitly skipped one
+  vector<RGWCoroutinesStack *> new_list;
+
+  for (vector<RGWCoroutinesStack *>::iterator iter = s->entries.begin(); iter != s->entries.end(); ++iter) {
+    RGWCoroutinesStack *stack = *iter;
+    if (stack == skip_stack || !stack->is_done()) {
+      new_list.push_back(stack);
+      if (!stack->is_done()) {
+        ldout(cct, 20) << "collect(): s=" << (void *)this << " stack=" << (void *)stack << " is still running" << dendl;
+      } else if (stack == skip_stack) {
+        ldout(cct, 20) << "collect(): s=" << (void *)this << " stack=" << (void *)stack << " explicitly skipping stack" << dendl;
+      }
+      continue;
+    }
+    if (stack_id) {
+      *stack_id = stack->get_id();
+    }
+    int r = stack->get_ret_status();
+    stack->put();
+    if (r < 0) {
+      *ret = r;
+      ldout(cct, 20) << "collect(): s=" << (void *)this << " stack=" << (void *)stack << " encountered error (r=" << r << "), skipping next stacks" << dendl;
+      // keep the not-yet-visited stacks so a retry can reap them
+      new_list.insert(new_list.end(), ++iter, s->entries.end());
+      need_retry = (iter != s->entries.end());
+      break;
+    }
+
+    ldout(cct, 20) << "collect(): s=" << (void *)this << " stack=" << (void *)stack << " is complete" << dendl;
+  }
+
+  s->entries.swap(new_list);
+  return need_retry;
+}
+
+/* Reap at most one finished child stack. *ret receives its (negative)
+ * status if it failed; *collected_stack (optional) receives the stack
+ * pointer. Returns true if found a stack to collect. */
+bool RGWCoroutinesStack::collect_next(RGWCoroutine *op, int *ret, RGWCoroutinesStack **collected_stack) /* returns true if found a stack to collect */
+{
+  rgw_spawned_stacks *s = (op ? &op->spawned : &spawned);
+  *ret = 0;
+
+  if (collected_stack) {
+    *collected_stack = NULL;
+  }
+
+  for (vector<RGWCoroutinesStack *>::iterator iter = s->entries.begin(); iter != s->entries.end(); ++iter) {
+    RGWCoroutinesStack *stack = *iter;
+    if (!stack->is_done()) {
+      continue;
+    }
+    int r = stack->get_ret_status();
+    if (r < 0) {
+      *ret = r;
+    }
+
+    if (collected_stack) {
+      *collected_stack = stack;
+    }
+    stack->put();
+
+    s->entries.erase(iter);
+    return true;
+  }
+
+  return false;
+}
+
+// Convenience overload: collect children spawned directly by this stack.
+bool RGWCoroutinesStack::collect(int *ret, RGWCoroutinesStack *skip_stack, uint64_t *stack_id) /* returns true if needs to be called again */
+{
+  return collect(NULL, ret, skip_stack, stack_id);
+}
+
+// librados AIO callback trampoline: forwards to the notifier object.
+static void _aio_completion_notifier_cb(librados::completion_t cb, void *arg)
+{
+  (static_cast<RGWAioCompletionNotifier *>(arg))->cb();
+}
+
+RGWAioCompletionNotifier::RGWAioCompletionNotifier(RGWCompletionManager *_mgr, const rgw_io_id& _io_id, void *_user_data) : completion_mgr(_mgr),
+                                                                                                                           io_id(_io_id),
+                                                                                                                           user_data(_user_data), registered(true) {
+  c = librados::Rados::aio_create_completion(this, _aio_completion_notifier_cb);
+}
+
+// Allocate a notifier tied to this stack via the manager.
+RGWAioCompletionNotifier *RGWCoroutinesStack::create_completion_notifier()
+{
+  return ops_mgr->create_completion_notifier(this);
+}
+
+RGWCompletionManager *RGWCoroutinesStack::get_completion_mgr()
+{
+  return ops_mgr->get_completion_mgr();
+}
+
+// Detach one stack that is blocked on this one and hand it back via
+// *s. Returns false when nothing is waiting on us.
+bool RGWCoroutinesStack::unblock_stack(RGWCoroutinesStack **s)
+{
+  if (blocking_stacks.empty()) {
+    return false;
+  }
+
+  auto it = blocking_stacks.begin();
+  RGWCoroutinesStack *blocked = *it;
+  blocking_stacks.erase(it);
+  blocked->blocked_by_stack.erase(this);
+  *s = blocked;
+
+  return true;
+}
+
+void RGWCoroutinesManager::report_error(RGWCoroutinesStack *op)
+{
+ if (!op) {
+ return;
+ }
+ string err = op->error_str();
+ if (err.empty()) {
+ return;
+ }
+ lderr(cct) << "ERROR: failed operation: " << op->error_str() << dendl;
+}
+
+// JSON-dump this stack (identity, run count, and its ops) for the
+// admin-socket coroutine report.
+void RGWCoroutinesStack::dump(Formatter *f) const {
+  stringstream ss;
+  ss << (void *)this;
+  ::encode_json("stack", ss.str(), f);
+  ::encode_json("run_count", run_count, f);
+  f->open_array_section("ops");
+  for (auto& i : ops) {
+    encode_json("op", *i, f);
+  }
+  f->close_section();
+}
+
+// Bind an IO provider to this stack and assign it an IO id.
+void RGWCoroutinesStack::init_new_io(RGWIOProvider *io_provider)
+{
+  io_provider->set_io_user_info((void *)this);
+  io_provider->assign_io(env->manager->get_io_id_provider());
+}
+
+/* Check whether the completion io_id unblocks this stack. If not (the
+ * stack is waiting on a different id/channel), remember the finished
+ * channels so a later io_block() on them returns immediately. */
+bool RGWCoroutinesStack::try_io_unblock(const rgw_io_id& io_id)
+{
+  if (!can_io_unblock(io_id)) {
+    auto p = io_finish_ids.emplace(io_id.id, io_id);
+    auto& iter = p.first;
+    bool inserted = p.second;
+    if (!inserted) { /* could not insert, entry already existed, add channel to completion mask */
+      iter->second.channels |= io_id.channels;
+    }
+    return false;
+  }
+
+  return true;
+}
+
+/* Consume a previously stored completion for io_id. Returns true when
+ * one of the requested channels had already finished; clears the
+ * matched channels (and the entry once its mask is empty). */
+bool RGWCoroutinesStack::consume_io_finish(const rgw_io_id& io_id)
+{
+  auto iter = io_finish_ids.find(io_id.id);
+  if (iter == io_finish_ids.end()) {
+    return false;
+  }
+  int finish_mask = iter->second.channels;
+  bool found = (finish_mask & io_id.channels) != 0;
+
+  finish_mask &= ~(finish_mask & io_id.channels);
+
+  if (finish_mask == 0) {
+    io_finish_ids.erase(iter);
+  }
+  return found;
+}
+
+
+/* Fold one IO completion into the scheduler state: if it really
+ * unblocks its stack, put the stack back on the scheduled list (or
+ * reap it if it already finished). Caller holds 'lock'. */
+void RGWCoroutinesManager::handle_unblocked_stack(set<RGWCoroutinesStack *>& context_stacks, list<RGWCoroutinesStack *>& scheduled_stacks,
+                                                  RGWCompletionManager::io_completion& io, int *blocked_count, int *interval_wait_count)
+{
+  ceph_assert(ceph_mutex_is_wlocked(lock));
+  RGWCoroutinesStack *stack = static_cast<RGWCoroutinesStack *>(io.user_info);
+  if (context_stacks.find(stack) == context_stacks.end()) {
+    // stack is not part of this run context (or was already reaped)
+    return;
+  }
+  if (!stack->try_io_unblock(io.io_id)) {
+    // not the IO the stack is blocked on; it was recorded for later
+    return;
+  }
+  if (stack->is_io_blocked()) {
+    --(*blocked_count);
+    stack->set_io_blocked(false);
+    if (stack->is_interval_waiting()) {
+      --(*interval_wait_count);
+    }
+  }
+  stack->set_interval_wait(false);
+  if (!stack->is_done()) {
+    if (!stack->is_scheduled) {
+      scheduled_stacks.push_back(stack);
+      stack->set_is_scheduled(true);
+    }
+  } else {
+    context_stacks.erase(stack);
+    stack->put();
+  }
+}
+
+// Thread-safe entry point for scheduling a stack.
+void RGWCoroutinesManager::schedule(RGWCoroutinesEnv *env, RGWCoroutinesStack *stack)
+{
+  std::unique_lock wl{lock};
+  _schedule(env, stack);
+}
+
+// Lock must be held. Registers the stack with its run context and
+// queues it unless it is already queued.
+void RGWCoroutinesManager::_schedule(RGWCoroutinesEnv *env, RGWCoroutinesStack *stack)
+{
+  ceph_assert(ceph_mutex_is_wlocked(lock));
+  if (!stack->is_scheduled) {
+    env->scheduled_stacks->push_back(stack);
+    stack->set_is_scheduled(true);
+  }
+  set<RGWCoroutinesStack *>& context_stacks = run_contexts[env->run_context];
+  context_stacks.insert(stack);
+}
+
+void RGWCoroutinesManager::set_sleeping(RGWCoroutine *cr, bool flag)
+{
+  cr->set_sleeping(flag);
+}
+
+void RGWCoroutinesManager::io_complete(RGWCoroutine *cr, const rgw_io_id& io_id)
+{
+  cr->io_complete(io_id);
+}
+
+/* Scheduler main loop: drive a set of coroutine stacks to completion.
+ * Repeatedly pops the next scheduled stack, runs it one step, then
+ * folds in pending IO completions; blocks on the completion manager
+ * when nothing is runnable. 'lock' is held throughout except around
+ * stack->operate() and blocking get_next() calls. Returns 0, the last
+ * error from operate()/get_next(), or -ECANCELED on shutdown. */
+int RGWCoroutinesManager::run(const DoutPrefixProvider *dpp, list<RGWCoroutinesStack *>& stacks)
+{
+  int ret = 0;
+  int blocked_count = 0;
+  int interval_wait_count = 0;
+  bool canceled = false; // set on going_down
+  RGWCoroutinesEnv env;
+  bool op_not_blocked;
+
+  uint64_t run_context = ++run_context_count;
+
+  lock.lock();
+  set<RGWCoroutinesStack *>& context_stacks = run_contexts[run_context];
+  list<RGWCoroutinesStack *> scheduled_stacks;
+  for (auto& st : stacks) {
+    context_stacks.insert(st);
+    scheduled_stacks.push_back(st);
+    st->set_is_scheduled(true);
+  }
+  env.run_context = run_context;
+  env.manager = this;
+  env.scheduled_stacks = &scheduled_stacks;
+
+  for (list<RGWCoroutinesStack *>::iterator iter = scheduled_stacks.begin(); iter != scheduled_stacks.end() && !going_down;) {
+    RGWCompletionManager::io_completion io;
+    RGWCoroutinesStack *stack = *iter;
+    ++iter;
+    scheduled_stacks.pop_front();
+
+    if (context_stacks.find(stack) == context_stacks.end()) {
+      /* stack was probably scheduled more than once due to IO, but has since completed */
+      goto next;
+    }
+    env.stack = stack;
+
+    // operate() may block on rados etc.; never hold the lock across it
+    lock.unlock();
+
+    ret = stack->operate(dpp, &env);
+
+    lock.lock();
+
+    stack->set_is_scheduled(false);
+    if (ret < 0) {
+      ldpp_dout(dpp, 20) << "stack->operate() returned ret=" << ret << dendl;
+    }
+
+    if (stack->is_error()) {
+      report_error(stack);
+    }
+
+    op_not_blocked = false;
+
+    if (stack->is_io_blocked()) {
+      ldout(cct, 20) << __func__ << ":" << " stack=" << (void *)stack << " is io blocked" << dendl;
+      if (stack->is_interval_waiting()) {
+        interval_wait_count++;
+      }
+      blocked_count++;
+    } else if (stack->is_blocked()) {
+      /* do nothing, we'll re-add the stack when the blocking stack is done,
+       * or when we're awakened
+       */
+      ldout(cct, 20) << __func__ << ":" << " stack=" << (void *)stack << " is_blocked_by_stack()=" << stack->is_blocked_by_stack()
+                     << " is_sleeping=" << stack->is_sleeping() << " waiting_for_child()=" << stack->waiting_for_child() << dendl;
+    } else if (stack->is_done()) {
+      ldout(cct, 20) << __func__ << ":" << " stack=" << (void *)stack << " is done" << dendl;
+      RGWCoroutinesStack *s;
+      // wake every stack that was waiting on the one that just finished
+      while (stack->unblock_stack(&s)) {
+        if (!s->is_blocked_by_stack() && !s->is_done()) {
+          if (s->is_io_blocked()) {
+            if (stack->is_interval_waiting()) {
+              interval_wait_count++;
+            }
+            blocked_count++;
+          } else {
+            s->_schedule();
+          }
+        }
+      }
+      if (stack->parent && stack->parent->waiting_for_child()) {
+        stack->parent->set_wait_for_child(false);
+        stack->parent->_schedule();
+      }
+      context_stacks.erase(stack);
+      stack->put();
+      stack = NULL;
+    } else {
+      op_not_blocked = true;
+      stack->run_count++;
+      stack->_schedule();
+    }
+
+    if (!op_not_blocked && stack) {
+      stack->run_count = 0;
+    }
+
+    // drain completions that arrived while we were running the stack
+    while (completion_mgr->try_get_next(&io)) {
+      handle_unblocked_stack(context_stacks, scheduled_stacks, io, &blocked_count, &interval_wait_count);
+    }
+
+    /*
+     * only account blocked operations that are not in interval_wait, these are stacks that
+     * were put on a wait without any real IO operations. While we mark these as io_blocked,
+     * these aren't really waiting for IOs
+     */
+    while (blocked_count - interval_wait_count >= ops_window) {
+      lock.unlock();
+      ret = completion_mgr->get_next(&io);
+      lock.lock();
+      if (ret < 0) {
+        ldout(cct, 5) << "completion_mgr.get_next() returned ret=" << ret << dendl;
+      }
+      handle_unblocked_stack(context_stacks, scheduled_stacks, io, &blocked_count, &interval_wait_count);
+    }
+
+next:
+    // nothing runnable: block on completions until a stack unblocks
+    while (scheduled_stacks.empty() && blocked_count > 0) {
+      lock.unlock();
+      ret = completion_mgr->get_next(&io);
+      lock.lock();
+      if (ret < 0) {
+        ldout(cct, 5) << "completion_mgr.get_next() returned ret=" << ret << dendl;
+      }
+      if (going_down) {
+        ldout(cct, 5) << __func__ << "(): was stopped, exiting" << dendl;
+        ret = -ECANCELED;
+        canceled = true;
+        break;
+      }
+      handle_unblocked_stack(context_stacks, scheduled_stacks, io, &blocked_count, &interval_wait_count);
+      iter = scheduled_stacks.begin();
+    }
+    if (canceled) {
+      break;
+    }
+
+    if (iter == scheduled_stacks.end()) {
+      iter = scheduled_stacks.begin();
+    }
+  }
+
+  if (!context_stacks.empty() && !going_down) {
+    // stacks remain but nothing can ever wake them: report the deadlock
+    JSONFormatter formatter(true);
+    formatter.open_array_section("context_stacks");
+    for (auto& s : context_stacks) {
+      ::encode_json("entry", *s, &formatter);
+    }
+    formatter.close_section();
+    lderr(cct) << __func__ << "(): ERROR: deadlock detected, dumping remaining coroutines:\n";
+    formatter.flush(*_dout);
+    *_dout << dendl;
+    ceph_assert(context_stacks.empty() || going_down); // assert on deadlock
+  }
+
+  // on cancellation/shutdown, abort whatever is left in this context
+  for (auto stack : context_stacks) {
+    ldout(cct, 20) << "clearing stack on run() exit: stack=" << (void *)stack << " nref=" << stack->get_nref() << dendl;
+    stack->cancel();
+  }
+  run_contexts.erase(run_context);
+  lock.unlock();
+
+  return ret;
+}
+
+// Convenience wrapper: run a single coroutine on a fresh stack and
+// return its final status (or the scheduler error).
+int RGWCoroutinesManager::run(const DoutPrefixProvider *dpp, RGWCoroutine *op)
+{
+  if (!op) {
+    return 0;
+  }
+  list<RGWCoroutinesStack *> stacks;
+  RGWCoroutinesStack *stack = allocate_stack();
+  op->get(); // keep op alive so we can read its status after run()
+  stack->call(op);
+
+  stacks.push_back(stack);
+
+  int r = run(dpp, stacks);
+  if (r < 0) {
+    ldpp_dout(dpp, 20) << "run(stacks) returned r=" << r << dendl;
+  } else {
+    r = op->get_ret_status();
+  }
+  op->put();
+
+  return r;
+}
+
+// Allocate an AIO notifier with a fresh IO id, addressed to 'stack',
+// and register it for shutdown cleanup.
+RGWAioCompletionNotifier *RGWCoroutinesManager::create_completion_notifier(RGWCoroutinesStack *stack)
+{
+  rgw_io_id io_id{get_next_io_id(), -1};
+  RGWAioCompletionNotifier *cn = new RGWAioCompletionNotifier(completion_mgr, io_id, (void *)stack);
+  completion_mgr->register_completion_notifier(cn);
+  return cn;
+}
+
+// JSON-dump every run context and its live stacks (admin socket).
+void RGWCoroutinesManager::dump(Formatter *f) const {
+  std::shared_lock rl{lock};
+
+  f->open_array_section("run_contexts");
+  for (auto& i : run_contexts) {
+    f->open_object_section("context");
+    ::encode_json("id", i.first, f);
+    f->open_array_section("entries");
+    for (auto& s : i.second) {
+      ::encode_json("entry", *s, f);
+    }
+    f->close_section();
+    f->close_section();
+  }
+  f->close_section();
+}
+
+// New, empty, refcounted stack bound to this manager.
+RGWCoroutinesStack *RGWCoroutinesManager::allocate_stack() {
+  return new RGWCoroutinesStack(cct, this);
+}
+
+// Identifier for this manager: the explicitly assigned id when one was
+// set, otherwise the manager's address rendered as a string.
+string RGWCoroutinesManager::get_id()
+{
+  if (id.empty()) {
+    stringstream addr;
+    addr << (void *)this;
+    return addr.str();
+  }
+  return id;
+}
+
+// Track a manager; the registry takes a ref on first insertion.
+void RGWCoroutinesManagerRegistry::add(RGWCoroutinesManager *mgr)
+{
+  std::unique_lock wl{lock};
+  if (managers.find(mgr) == managers.end()) {
+    managers.insert(mgr);
+    get();
+  }
+}
+
+// Untrack a manager; drops the ref taken by add().
+void RGWCoroutinesManagerRegistry::remove(RGWCoroutinesManager *mgr)
+{
+  std::unique_lock wl{lock};
+  if (managers.find(mgr) != managers.end()) {
+    managers.erase(mgr);
+    put();
+  }
+}
+
+RGWCoroutinesManagerRegistry::~RGWCoroutinesManagerRegistry()
+{
+  AdminSocket *admin_socket = cct->get_admin_socket();
+  if (!admin_command.empty()) {
+    admin_socket->unregister_commands(this);
+  }
+}
+
+// (Re)register the admin-socket command that dumps coroutine state;
+// any previously hooked command is unregistered first.
+int RGWCoroutinesManagerRegistry::hook_to_admin_command(const string& command)
+{
+  AdminSocket *admin_socket = cct->get_admin_socket();
+  if (!admin_command.empty()) {
+    admin_socket->unregister_commands(this);
+  }
+  admin_command = command;
+  int r = admin_socket->register_command(admin_command, this,
+                                         "dump current coroutines stack state");
+  if (r < 0) {
+    lderr(cct) << "ERROR: fail to register admin socket command (r=" << r << ")" << dendl;
+    return r;
+  }
+  return 0;
+}
+
+// Admin-socket handler: emit all managers' state as JSON.
+int RGWCoroutinesManagerRegistry::call(std::string_view command,
+                                       const cmdmap_t& cmdmap,
+                                       const bufferlist&,
+                                       Formatter *f,
+                                       std::ostream& ss,
+                                       bufferlist& out) {
+  std::shared_lock rl{lock};
+  ::encode_json("cr_managers", *this, f);
+  return 0;
+}
+
+void RGWCoroutinesManagerRegistry::dump(Formatter *f) const {
+  f->open_array_section("coroutine_managers");
+  for (auto m : managers) {
+    ::encode_json("entry", *m, f);
+  }
+  f->close_section();
+}
+
+// Call a nested coroutine on our stack; a null op just sets retcode=0
+// so the caller's post-call check behaves as if it succeeded.
+void RGWCoroutine::call(RGWCoroutine *op)
+{
+  if (op) {
+    stack->call(op);
+  } else {
+    // the call()er expects this to set a retcode
+    retcode = 0;
+  }
+}
+
+// Spawn 'op' on a child stack, recorded under this coroutine.
+RGWCoroutinesStack *RGWCoroutine::spawn(RGWCoroutine *op, bool wait)
+{
+  return stack->spawn(this, op, wait);
+}
+
+bool RGWCoroutine::collect(int *ret, RGWCoroutinesStack *skip_stack, uint64_t *stack_id) /* returns true if needs to be called again */
+{
+  return stack->collect(this, ret, skip_stack, stack_id);
+}
+
+bool RGWCoroutine::collect_next(int *ret, RGWCoroutinesStack **collected_stack) /* returns true if found a stack to collect */
+{
+  return stack->collect_next(this, ret, collected_stack);
+}
+
+// Sleep the whole stack for 'interval'.
+int RGWCoroutine::wait(const utime_t& interval)
+{
+  return stack->wait(interval);
+}
+
+void RGWCoroutine::wait_for_child()
+{
+  /* should only wait for child if there is a child that is not done yet, and no complete children */
+  if (spawned.entries.empty()) {
+    return;
+  }
+  // if any child already finished, don't block: collect() can make
+  // progress immediately
+  for (vector<RGWCoroutinesStack *>::iterator iter = spawned.entries.begin(); iter != spawned.entries.end(); ++iter) {
+    if ((*iter)->is_done()) {
+      return;
+    }
+  }
+  stack->set_wait_for_child(true);
+}
+
+// Default human-readable name: the (mangled) dynamic type name.
+string RGWCoroutine::to_str() const
+{
+  return typeid(*this).name();
+}
+
+ostream& operator<<(ostream& out, const RGWCoroutine& cr)
+{
+  out << "cr:s=" << (void *)cr.get_stack() << ":op=" << (void *)&cr << ":" << typeid(cr).name();
+  return out;
+}
+
+/* Stackless-coroutine helper (boost reenter/yield): repeatedly wait
+ * for and collect spawned children until at most num_cr_left remain
+ * (skip_stack, if set, is never collected and counts as one left).
+ * 'cb', if provided, is invoked per collected stack with its id and
+ * return code. Returns true once draining is complete. */
+bool RGWCoroutine::drain_children(int num_cr_left,
+                                  RGWCoroutinesStack *skip_stack,
+                                  std::optional<std::function<void(uint64_t stack_id, int ret)> > cb)
+{
+  bool done = false;
+  ceph_assert(num_cr_left >= 0);
+  if (num_cr_left == 0 && skip_stack) {
+    num_cr_left = 1;
+  }
+  reenter(&drain_status.cr) {
+    while (num_spawned() > (size_t)num_cr_left) {
+      yield wait_for_child();
+      int ret;
+      uint64_t stack_id;
+      bool again = false;
+      do {
+        again = collect(&ret, skip_stack, &stack_id);
+        if (ret < 0) {
+          ldout(cct, 10) << "collect() returned ret=" << ret << dendl;
+          /* we should have reported this error */
+          log_error() << "ERROR: collect() returned error (ret=" << ret << ")";
+        }
+        if (cb) {
+          (*cb)(stack_id, ret);
+        }
+      } while (again);
+    }
+    done = true;
+  }
+  return done;
+}
+
+/* Variant whose callback may abort early: a negative return from 'cb'
+ * records drain_status.ret, suppresses further callbacks, and forces a
+ * full drain (num_cr_left = 0). Returns true once draining finished. */
+bool RGWCoroutine::drain_children(int num_cr_left,
+                                  std::optional<std::function<int(uint64_t stack_id, int ret)> > cb)
+{
+  bool done = false;
+  ceph_assert(num_cr_left >= 0);
+
+  reenter(&drain_status.cr) {
+    while (num_spawned() > (size_t)num_cr_left) {
+      yield wait_for_child();
+      int ret;
+      uint64_t stack_id;
+      bool again = false;
+      do {
+        again = collect(&ret, nullptr, &stack_id);
+        if (ret < 0) {
+          ldout(cct, 10) << "collect() returned ret=" << ret << dendl;
+          /* we should have reported this error */
+          log_error() << "ERROR: collect() returned error (ret=" << ret << ")";
+        }
+        if (cb && !drain_status.should_exit) {
+          int r = (*cb)(stack_id, ret);
+          if (r < 0) {
+            drain_status.ret = r;
+            drain_status.should_exit = true;
+            num_cr_left = 0; /* need to drain all */
+          }
+        }
+      } while (again);
+    }
+    done = true;
+  }
+  return done;
+}
+
+// Wake our stack out of an interval wait.
+void RGWCoroutine::wakeup()
+{
+  if (stack) {
+    stack->wakeup();
+  }
+}
+
+RGWCoroutinesEnv *RGWCoroutine::get_env() const
+{
+  return stack->get_env();
+}
+
+// JSON-dump this coroutine: description, type, spawned child stacks,
+// and status history (admin socket report).
+void RGWCoroutine::dump(Formatter *f) const {
+  if (!description.str().empty()) {
+    encode_json("description", description.str(), f);
+  }
+  encode_json("type", to_str(), f);
+  if (!spawned.entries.empty()) {
+    f->open_array_section("spawned");
+    for (auto& i : spawned.entries) {
+      char buf[32];
+      snprintf(buf, sizeof(buf), "%p", (void *)i);
+      encode_json("stack", string(buf), f);
+    }
+    f->close_section();
+  }
+  if (!status.history.empty()) {
+    encode_json("history", status.history, f);
+  }
+
+  if (!status.status.str().empty()) {
+    f->open_object_section("status");
+    encode_json("status", status.status.str(), f);
+    encode_json("timestamp", status.timestamp, f);
+    f->close_section();
+  }
+}
+
+RGWSimpleCoroutine::~RGWSimpleCoroutine()
+{
+  // make sure request_cleanup() runs exactly once
+  if (!called_cleanup) {
+    request_cleanup();
+  }
+}
+
+void RGWSimpleCoroutine::call_cleanup()
+{
+  called_cleanup = true;
+  request_cleanup();
+}
+
+// Fixed four-phase state machine: init -> send -> wait for request
+// completion -> finish, then drain spawned children and clean up.
+int RGWSimpleCoroutine::operate(const DoutPrefixProvider *dpp)
+{
+  int ret = 0;
+  reenter(this) {
+    yield return state_init();
+    yield return state_send_request(dpp);
+    yield return state_request_complete();
+    yield return state_all_complete();
+    drain_all();
+    call_cleanup();
+    return set_state(RGWCoroutine_Done, ret);
+  }
+  return 0;
+}
+
+// Each state delegates to the subclass hook; any failure cleans up and
+// transitions the coroutine to the error state.
+int RGWSimpleCoroutine::state_init()
+{
+  int ret = init();
+  if (ret < 0) {
+    call_cleanup();
+    return set_state(RGWCoroutine_Error, ret);
+  }
+  return 0;
+}
+
+int RGWSimpleCoroutine::state_send_request(const DoutPrefixProvider *dpp)
+{
+  int ret = send_request(dpp);
+  if (ret < 0) {
+    call_cleanup();
+    return set_state(RGWCoroutine_Error, ret);
+  }
+  // suspend until the request's IO completion arrives
+  return io_block(0);
+}
+
+int RGWSimpleCoroutine::state_request_complete()
+{
+  int ret = request_complete();
+  if (ret < 0) {
+    call_cleanup();
+    return set_state(RGWCoroutine_Error, ret);
+  }
+  return 0;
+}
+
+int RGWSimpleCoroutine::state_all_complete()
+{
+  int ret = finish();
+  if (ret < 0) {
+    call_cleanup();
+    return set_state(RGWCoroutine_Error, ret);
+  }
+  return 0;
+}
+
+
diff --git a/src/rgw/rgw_coroutine.h b/src/rgw/rgw_coroutine.h
new file mode 100644
index 000000000..eb3216640
--- /dev/null
+++ b/src/rgw/rgw_coroutine.h
@@ -0,0 +1,722 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#ifdef _ASSERT_H
+#define NEED_ASSERT_H
+#pragma push_macro("_ASSERT_H")
+#endif
+
+#include <boost/asio.hpp>
+#include <boost/intrusive_ptr.hpp>
+
+#ifdef NEED_ASSERT_H
+#pragma pop_macro("_ASSERT_H")
+#endif
+
+#include "include/utime.h"
+#include "common/RefCountedObj.h"
+#include "common/debug.h"
+#include "common/Timer.h"
+#include "common/admin_socket.h"
+
+#include "rgw_common.h"
+#include "rgw_http_client_types.h"
+
+#include <boost/asio/coroutine.hpp>
+
+#include <atomic>
+
+#define RGW_ASYNC_OPS_MGR_WINDOW 100
+
+class RGWCoroutinesStack;
+class RGWCoroutinesManager;
+class RGWAioCompletionNotifier;
+
+class RGWCompletionManager : public RefCountedObject {
+ friend class RGWCoroutinesManager;
+
+ CephContext *cct;
+
+ struct io_completion {
+ rgw_io_id io_id;
+ void *user_info;
+ };
+ std::list<io_completion> complete_reqs;
+ std::set<rgw_io_id> complete_reqs_set;
+ using NotifierRef = boost::intrusive_ptr<RGWAioCompletionNotifier>;
+ std::set<NotifierRef> cns;
+
+ ceph::mutex lock = ceph::make_mutex("RGWCompletionManager::lock");
+ ceph::condition_variable cond;
+
+ SafeTimer timer;
+
+ std::atomic<bool> going_down = { false };
+
+ std::map<void *, void *> waiters;
+
+ class WaitContext;
+
+protected:
+ void _wakeup(void *opaque);
+ void _complete(RGWAioCompletionNotifier *cn, const rgw_io_id& io_id, void *user_info);
+public:
+ explicit RGWCompletionManager(CephContext *_cct);
+ virtual ~RGWCompletionManager() override;
+
+ void complete(RGWAioCompletionNotifier *cn, const rgw_io_id& io_id, void *user_info);
+ int get_next(io_completion *io);
+ bool try_get_next(io_completion *io);
+
+ void go_down();
+
+ /*
+ * wait for the given interval, then complete user_info
+ */
+ void wait_interval(void *opaque, const utime_t& interval, void *user_info);
+ void wakeup(void *opaque);
+
+ void register_completion_notifier(RGWAioCompletionNotifier *cn);
+ void unregister_completion_notifier(RGWAioCompletionNotifier *cn);
+};
+
+/* a single use librados aio completion notifier that hooks into the RGWCompletionManager */
+class RGWAioCompletionNotifier : public RefCountedObject {
+ librados::AioCompletion *c;
+ RGWCompletionManager *completion_mgr;
+ rgw_io_id io_id;
+ void *user_data;
+ ceph::mutex lock = ceph::make_mutex("RGWAioCompletionNotifier");
+ bool registered;
+
+public:
+ RGWAioCompletionNotifier(RGWCompletionManager *_mgr, const rgw_io_id& _io_id, void *_user_data);
+ virtual ~RGWAioCompletionNotifier() override {
+ c->release();
+ lock.lock();
+ bool need_unregister = registered;
+ if (registered) {
+ completion_mgr->get();
+ }
+ registered = false;
+ lock.unlock();
+ if (need_unregister) {
+ completion_mgr->unregister_completion_notifier(this);
+ completion_mgr->put();
+ }
+ }
+
+ librados::AioCompletion *completion() {
+ return c;
+ }
+
+ void unregister() {
+ std::lock_guard l{lock};
+ if (!registered) {
+ return;
+ }
+ registered = false;
+ }
+
+ void cb() {
+ lock.lock();
+ if (!registered) {
+ lock.unlock();
+ put();
+ return;
+ }
+ completion_mgr->get();
+ registered = false;
+ lock.unlock();
+ completion_mgr->complete(this, io_id, user_data);
+ completion_mgr->put();
+ put();
+ }
+};
+
+// completion notifier with opaque payload (ie a reference-counted pointer)
+template <typename T>
+class RGWAioCompletionNotifierWith : public RGWAioCompletionNotifier {
+ T value;
+public:
+ RGWAioCompletionNotifierWith(RGWCompletionManager *mgr,
+ const rgw_io_id& io_id, void *user_data,
+ T value)
+ : RGWAioCompletionNotifier(mgr, io_id, user_data), value(std::move(value))
+ {}
+};
+
+struct RGWCoroutinesEnv {
+ uint64_t run_context;
+ RGWCoroutinesManager *manager;
+ std::list<RGWCoroutinesStack *> *scheduled_stacks;
+ RGWCoroutinesStack *stack;
+
+ RGWCoroutinesEnv() : run_context(0), manager(NULL), scheduled_stacks(NULL), stack(NULL) {}
+};
+
+enum RGWCoroutineState {
+ RGWCoroutine_Error = -2,
+ RGWCoroutine_Done = -1,
+ RGWCoroutine_Run = 0,
+};
+
+struct rgw_spawned_stacks {
+ std::vector<RGWCoroutinesStack *> entries;
+
+ rgw_spawned_stacks() {}
+
+ void add_pending(RGWCoroutinesStack *s) {
+ entries.push_back(s);
+ }
+
+ void inherit(rgw_spawned_stacks *source) {
+ for (auto* entry : source->entries) {
+ add_pending(entry);
+ }
+ source->entries.clear();
+ }
+};
+
+
+
+class RGWCoroutine : public RefCountedObject, public boost::asio::coroutine {
+ friend class RGWCoroutinesStack;
+
+ struct StatusItem {
+ utime_t timestamp;
+ std::string status;
+
+ StatusItem(utime_t& t, const std::string& s) : timestamp(t), status(s) {}
+
+ void dump(Formatter *f) const;
+ };
+
+#define MAX_COROUTINE_HISTORY 10
+
+ struct Status {
+ CephContext *cct;
+ ceph::shared_mutex lock =
+ ceph::make_shared_mutex("RGWCoroutine::Status::lock");
+ int max_history;
+
+ utime_t timestamp;
+ std::stringstream status;
+
+ explicit Status(CephContext *_cct) : cct(_cct), max_history(MAX_COROUTINE_HISTORY) {}
+
+ std::deque<StatusItem> history;
+
+ std::stringstream& set_status();
+ } status;
+
+ std::stringstream description;
+
+protected:
+ bool _yield_ret;
+
+ struct {
+ boost::asio::coroutine cr;
+ bool should_exit{false};
+ int ret{0};
+
+ void init() {
+ cr = boost::asio::coroutine();
+ should_exit = false;
+ ret = 0;
+ }
+ } drain_status;
+
+ CephContext *cct;
+
+ RGWCoroutinesStack *stack;
+ int retcode;
+ int state;
+
+ rgw_spawned_stacks spawned;
+
+ std::stringstream error_stream;
+
+ int set_state(int s, int ret = 0) {
+ retcode = ret;
+ state = s;
+ return ret;
+ }
+ int set_cr_error(int ret) {
+ return set_state(RGWCoroutine_Error, ret);
+ }
+ int set_cr_done() {
+ return set_state(RGWCoroutine_Done, 0);
+ }
+ void set_io_blocked(bool flag);
+
+ void reset_description() {
+ description.str(std::string());
+ }
+
+ std::stringstream& set_description() {
+ return description;
+ }
+ std::stringstream& set_status() {
+ return status.set_status();
+ }
+
+ std::stringstream& set_status(const std::string& s) {
+ std::stringstream& status = set_status();
+ status << s;
+ return status;
+ }
+
+ virtual int operate_wrapper(const DoutPrefixProvider *dpp) {
+ return operate(dpp);
+ }
+public:
+ RGWCoroutine(CephContext *_cct) : status(_cct), _yield_ret(false), cct(_cct), stack(NULL), retcode(0), state(RGWCoroutine_Run) {}
+ virtual ~RGWCoroutine() override;
+
+ virtual int operate(const DoutPrefixProvider *dpp) = 0;
+
+ bool is_done() { return (state == RGWCoroutine_Done || state == RGWCoroutine_Error); }
+ bool is_error() { return (state == RGWCoroutine_Error); }
+
+ std::stringstream& log_error() { return error_stream; }
+ std::string error_str() {
+ return error_stream.str();
+ }
+
+ void set_retcode(int r) {
+ retcode = r;
+ }
+
+ int get_ret_status() {
+ return retcode;
+ }
+
+ void call(RGWCoroutine *op); /* call at the same stack we're in */
+ RGWCoroutinesStack *spawn(RGWCoroutine *op, bool wait); /* execute on a different stack */
+ bool collect(int *ret, RGWCoroutinesStack *skip_stack, uint64_t *stack_id = nullptr); /* returns true if needs to be called again */
+ bool collect_next(int *ret, RGWCoroutinesStack **collected_stack = NULL); /* returns true if found a stack to collect */
+
+ int wait(const utime_t& interval);
+ bool drain_children(int num_cr_left,
+ RGWCoroutinesStack *skip_stack = nullptr,
+ std::optional<std::function<void(uint64_t stack_id, int ret)> > cb = std::nullopt); /* returns true if needed to be called again,
+ cb will be called on every
+ completion. */
+ bool drain_children(int num_cr_left,
+ std::optional<std::function<int(uint64_t stack_id, int ret)> > cb); /* returns true if needed to be called again,
+ cb will be called on every completion, can filter errors.
+ A negative return value from cb means that current cr
+ will need to exit */
+ void wakeup();
+ void set_sleeping(bool flag); /* put in sleep, or wakeup from sleep */
+
+ size_t num_spawned() {
+ return spawned.entries.size();
+ }
+
+ void wait_for_child();
+
+ virtual std::string to_str() const;
+
+ RGWCoroutinesStack *get_stack() const {
+ return stack;
+ }
+
+ RGWCoroutinesEnv *get_env() const;
+
+ void dump(Formatter *f) const;
+
+ void init_new_io(RGWIOProvider *io_provider); /* only links the default io id */
+
+ int io_block(int ret = 0) {
+ return io_block(ret, -1);
+ }
+ int io_block(int ret, int64_t io_id);
+ int io_block(int ret, const rgw_io_id& io_id);
+ void io_complete() {
+ io_complete(rgw_io_id{});
+ }
+ void io_complete(const rgw_io_id& io_id);
+};
+
+std::ostream& operator<<(std::ostream& out, const RGWCoroutine& cr);
+
+#define yield_until_true(x) \
+do { \
+ do { \
+ yield _yield_ret = x; \
+ } while (!_yield_ret); \
+ _yield_ret = false; \
+} while (0)
+
+#define drain_all() \
+ drain_status.init(); \
+ yield_until_true(drain_children(0))
+
+#define drain_all_but(n) \
+ drain_status.init(); \
+ yield_until_true(drain_children(n))
+
+#define drain_all_but_stack(stack) \
+ drain_status.init(); \
+ yield_until_true(drain_children(1, stack))
+
+#define drain_all_but_stack_cb(stack, cb) \
+ drain_status.init(); \
+ yield_until_true(drain_children(1, stack, cb))
+
+#define drain_with_cb(n, cb) \
+ drain_status.init(); \
+ yield_until_true(drain_children(n, cb)); \
+ if (drain_status.should_exit) { \
+ return set_cr_error(drain_status.ret); \
+ }
+
+#define drain_all_cb(cb) \
+ drain_with_cb(0, cb)
+
+#define yield_spawn_window(cr, n, cb) \
+ do { \
+ spawn(cr, false); \
+ drain_with_cb(n, cb); /* this is guaranteed to yield */ \
+ } while (0)
+
+
+
+template <class T>
+class RGWConsumerCR : public RGWCoroutine {
+ std::list<T> product;
+
+public:
+ explicit RGWConsumerCR(CephContext *_cct) : RGWCoroutine(_cct) {}
+
+ bool has_product() {
+ return !product.empty();
+ }
+
+ void wait_for_product() {
+ if (!has_product()) {
+ set_sleeping(true);
+ }
+ }
+
+ bool consume(T *p) {
+ if (product.empty()) {
+ return false;
+ }
+ *p = product.front();
+ product.pop_front();
+ return true;
+ }
+
+ void receive(const T& p, bool wakeup = true);
+ void receive(std::list<T>& l, bool wakeup = true);
+};
+
+class RGWCoroutinesStack : public RefCountedObject {
+ friend class RGWCoroutine;
+ friend class RGWCoroutinesManager;
+
+ CephContext *cct;
+
+ int64_t id{-1};
+
+ RGWCoroutinesManager *ops_mgr;
+
+ std::list<RGWCoroutine *> ops;
+ std::list<RGWCoroutine *>::iterator pos;
+
+ rgw_spawned_stacks spawned;
+
+ std::set<RGWCoroutinesStack *> blocked_by_stack;
+ std::set<RGWCoroutinesStack *> blocking_stacks;
+
+ std::map<int64_t, rgw_io_id> io_finish_ids;
+ rgw_io_id io_blocked_id;
+
+ bool done_flag;
+ bool error_flag;
+ bool blocked_flag;
+ bool sleep_flag;
+ bool interval_wait_flag;
+
+ bool is_scheduled;
+
+ bool is_waiting_for_child;
+
+ int retcode;
+
+ uint64_t run_count;
+
+protected:
+ RGWCoroutinesEnv *env;
+ RGWCoroutinesStack *parent;
+
+ RGWCoroutinesStack *spawn(RGWCoroutine *source_op, RGWCoroutine *next_op, bool wait);
+ bool collect(RGWCoroutine *op, int *ret, RGWCoroutinesStack *skip_stack, uint64_t *stack_id); /* returns true if needs to be called again */
+ bool collect_next(RGWCoroutine *op, int *ret, RGWCoroutinesStack **collected_stack); /* returns true if found a stack to collect */
+public:
+ RGWCoroutinesStack(CephContext *_cct, RGWCoroutinesManager *_ops_mgr, RGWCoroutine *start = NULL);
+ virtual ~RGWCoroutinesStack() override;
+
+ int64_t get_id() const {
+ return id;
+ }
+
+ int operate(const DoutPrefixProvider *dpp, RGWCoroutinesEnv *env);
+
+ bool is_done() {
+ return done_flag;
+ }
+ bool is_error() {
+ return error_flag;
+ }
+ bool is_blocked_by_stack() {
+ return !blocked_by_stack.empty();
+ }
+ void set_io_blocked(bool flag) {
+ blocked_flag = flag;
+ }
+ void set_io_blocked_id(const rgw_io_id& io_id) {
+ io_blocked_id = io_id;
+ }
+ bool is_io_blocked() {
+ return blocked_flag && !done_flag;
+ }
+ bool can_io_unblock(const rgw_io_id& io_id) {
+ return ((io_blocked_id.id < 0) ||
+ io_blocked_id.intersects(io_id));
+ }
+ bool try_io_unblock(const rgw_io_id& io_id);
+ bool consume_io_finish(const rgw_io_id& io_id);
+ void set_interval_wait(bool flag) {
+ interval_wait_flag = flag;
+ }
+ bool is_interval_waiting() {
+ return interval_wait_flag;
+ }
+ void set_sleeping(bool flag) {
+ bool wakeup = sleep_flag & !flag;
+ sleep_flag = flag;
+ if (wakeup) {
+ schedule();
+ }
+ }
+ bool is_sleeping() {
+ return sleep_flag;
+ }
+ void set_is_scheduled(bool flag) {
+ is_scheduled = flag;
+ }
+
+ bool is_blocked() {
+ return is_blocked_by_stack() || is_sleeping() ||
+ is_io_blocked() || waiting_for_child() ;
+ }
+
+ void schedule();
+ void _schedule();
+
+ int get_ret_status() {
+ return retcode;
+ }
+
+ std::string error_str();
+
+ void call(RGWCoroutine *next_op);
+ RGWCoroutinesStack *spawn(RGWCoroutine *next_op, bool wait);
+ int unwind(int retcode);
+
+ int wait(const utime_t& interval);
+ void wakeup();
+ void io_complete() {
+ io_complete(rgw_io_id{});
+ }
+ void io_complete(const rgw_io_id& io_id);
+
+ bool collect(int *ret, RGWCoroutinesStack *skip_stack, uint64_t *stack_id); /* returns true if needs to be called again */
+
+ void cancel();
+
+ RGWAioCompletionNotifier *create_completion_notifier();
+ template <typename T>
+ RGWAioCompletionNotifier *create_completion_notifier(T value);
+ RGWCompletionManager *get_completion_mgr();
+
+ void set_blocked_by(RGWCoroutinesStack *s) {
+ blocked_by_stack.insert(s);
+ s->blocking_stacks.insert(this);
+ }
+
+ void set_wait_for_child(bool flag) {
+ is_waiting_for_child = flag;
+ }
+
+ bool waiting_for_child() {
+ return is_waiting_for_child;
+ }
+
+ bool unblock_stack(RGWCoroutinesStack **s);
+
+ RGWCoroutinesEnv *get_env() const { return env; }
+
+ void dump(Formatter *f) const;
+
+ void init_new_io(RGWIOProvider *io_provider);
+};
+
+template <class T>
+void RGWConsumerCR<T>::receive(std::list<T>& l, bool wakeup)
+{
+ product.splice(product.end(), l);
+ if (wakeup) {
+ set_sleeping(false);
+ }
+}
+
+
+template <class T>
+void RGWConsumerCR<T>::receive(const T& p, bool wakeup)
+{
+ product.push_back(p);
+ if (wakeup) {
+ set_sleeping(false);
+ }
+}
+
+class RGWCoroutinesManagerRegistry : public RefCountedObject, public AdminSocketHook {
+ CephContext *cct;
+
+ std::set<RGWCoroutinesManager *> managers;
+ ceph::shared_mutex lock =
+ ceph::make_shared_mutex("RGWCoroutinesRegistry::lock");
+
+ std::string admin_command;
+
+public:
+ explicit RGWCoroutinesManagerRegistry(CephContext *_cct) : cct(_cct) {}
+ virtual ~RGWCoroutinesManagerRegistry() override;
+
+ void add(RGWCoroutinesManager *mgr);
+ void remove(RGWCoroutinesManager *mgr);
+
+ int hook_to_admin_command(const std::string& command);
+ int call(std::string_view command, const cmdmap_t& cmdmap,
+ const bufferlist&,
+ Formatter *f,
+ std::ostream& ss,
+ bufferlist& out) override;
+
+ void dump(Formatter *f) const;
+};
+
+class RGWCoroutinesManager {
+ CephContext *cct;
+ std::atomic<bool> going_down = { false };
+
+ std::atomic<int64_t> run_context_count = { 0 };
+ std::map<uint64_t, std::set<RGWCoroutinesStack *> > run_contexts;
+
+ std::atomic<int64_t> max_io_id = { 0 };
+ std::atomic<uint64_t> max_stack_id = { 0 };
+
+ mutable ceph::shared_mutex lock =
+ ceph::make_shared_mutex("RGWCoroutinesManager::lock");
+
+ RGWIOIDProvider io_id_provider;
+
+ void handle_unblocked_stack(std::set<RGWCoroutinesStack *>& context_stacks, std::list<RGWCoroutinesStack *>& scheduled_stacks,
+ RGWCompletionManager::io_completion& io, int *waiting_count, int *interval_wait_count);
+protected:
+ RGWCompletionManager *completion_mgr;
+ RGWCoroutinesManagerRegistry *cr_registry;
+
+ int ops_window;
+
+ std::string id;
+
+ void put_completion_notifier(RGWAioCompletionNotifier *cn);
+public:
+ RGWCoroutinesManager(CephContext *_cct, RGWCoroutinesManagerRegistry *_cr_registry) : cct(_cct),
+ cr_registry(_cr_registry), ops_window(RGW_ASYNC_OPS_MGR_WINDOW) {
+ completion_mgr = new RGWCompletionManager(cct);
+ if (cr_registry) {
+ cr_registry->add(this);
+ }
+ }
+ virtual ~RGWCoroutinesManager();
+
+ int run(const DoutPrefixProvider *dpp, std::list<RGWCoroutinesStack *>& ops);
+ int run(const DoutPrefixProvider *dpp, RGWCoroutine *op);
+ void stop() {
+ bool expected = false;
+ if (going_down.compare_exchange_strong(expected, true)) {
+ completion_mgr->go_down();
+ }
+ }
+
+ virtual void report_error(RGWCoroutinesStack *op);
+
+ RGWAioCompletionNotifier *create_completion_notifier(RGWCoroutinesStack *stack);
+ template <typename T>
+ RGWAioCompletionNotifier *create_completion_notifier(RGWCoroutinesStack *stack, T value);
+ RGWCompletionManager *get_completion_mgr() { return completion_mgr; }
+
+ void schedule(RGWCoroutinesEnv *env, RGWCoroutinesStack *stack);
+ void _schedule(RGWCoroutinesEnv *env, RGWCoroutinesStack *stack);
+ RGWCoroutinesStack *allocate_stack();
+
+ int64_t get_next_io_id();
+ uint64_t get_next_stack_id();
+
+ void set_sleeping(RGWCoroutine *cr, bool flag);
+ void io_complete(RGWCoroutine *cr, const rgw_io_id& io_id);
+
+ virtual std::string get_id();
+ void dump(Formatter *f) const;
+
+ RGWIOIDProvider& get_io_id_provider() {
+ return io_id_provider;
+ }
+};
+
+template <typename T>
+RGWAioCompletionNotifier *RGWCoroutinesManager::create_completion_notifier(RGWCoroutinesStack *stack, T value)
+{
+ rgw_io_id io_id{get_next_io_id(), -1};
+ RGWAioCompletionNotifier *cn = new RGWAioCompletionNotifierWith<T>(completion_mgr, io_id, (void *)stack, std::move(value));
+ completion_mgr->register_completion_notifier(cn);
+ return cn;
+}
+
+template <typename T>
+RGWAioCompletionNotifier *RGWCoroutinesStack::create_completion_notifier(T value)
+{
+ return ops_mgr->create_completion_notifier(this, std::move(value));
+}
+
+class RGWSimpleCoroutine : public RGWCoroutine {
+ bool called_cleanup;
+
+ int operate(const DoutPrefixProvider *dpp) override;
+
+ int state_init();
+ int state_send_request(const DoutPrefixProvider *dpp);
+ int state_request_complete();
+ int state_all_complete();
+
+ void call_cleanup();
+
+public:
+ RGWSimpleCoroutine(CephContext *_cct) : RGWCoroutine(_cct), called_cleanup(false) {}
+ virtual ~RGWSimpleCoroutine() override;
+
+ virtual int init() { return 0; }
+ virtual int send_request(const DoutPrefixProvider *dpp) = 0;
+ virtual int request_complete() = 0;
+ virtual int finish() { return 0; }
+ virtual void request_cleanup() {}
+};
diff --git a/src/rgw/rgw_cors.cc b/src/rgw/rgw_cors.cc
new file mode 100644
index 000000000..83ba079b2
--- /dev/null
+++ b/src/rgw/rgw_cors.cc
@@ -0,0 +1,193 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 eNovance SAS <licensing@enovance.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <string.h>
+
+#include <iostream>
+#include <map>
+
+#include <boost/algorithm/string.hpp>
+
+#include "include/types.h"
+#include "common/debug.h"
+#include "include/str_list.h"
+#include "common/Formatter.h"
+
+#include "rgw_cors.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+void RGWCORSRule::dump_origins() {
+ unsigned num_origins = allowed_origins.size();
+ dout(10) << "Allowed origins : " << num_origins << dendl;
+ for(auto& origin : allowed_origins) {
+ dout(10) << origin << "," << dendl;
+ }
+}
+
+void RGWCORSRule::erase_origin_if_present(string& origin, bool *rule_empty) {
+ set<string>::iterator it = allowed_origins.find(origin);
+ if (!rule_empty)
+ return;
+ *rule_empty = false;
+ if (it != allowed_origins.end()) {
+ dout(10) << "Found origin " << origin << ", set size:" <<
+ allowed_origins.size() << dendl;
+ allowed_origins.erase(it);
+ *rule_empty = (allowed_origins.empty());
+ }
+}
+
+/*
+ * make attrs look-like-this
+ * does not convert underscores or dashes
+ *
+ * Per CORS specification, section 3:
+ * ===
+ * "Converting a string to ASCII lowercase" means replacing all characters in the
+ * range U+0041 LATIN CAPITAL LETTER A to U+005A LATIN CAPITAL LETTER Z with
+ * the corresponding characters in the range U+0061 LATIN SMALL LETTER A to
+ * U+007A LATIN SMALL LETTER Z).
+ * ===
+ *
+ * @todo When UTF-8 is allowed in HTTP headers, this function will need to change
+ */
+string lowercase_http_attr(const string& orig)
+{
+ const char *s = orig.c_str();
+ char buf[orig.size() + 1];
+ buf[orig.size()] = '\0';
+
+ for (size_t i = 0; i < orig.size(); ++i, ++s) {
+ buf[i] = tolower(*s);
+ }
+ return string(buf);
+}
+
+
+static bool is_string_in_set(set<string>& s, string h) {
+ if ((s.find("*") != s.end()) ||
+ (s.find(h) != s.end())) {
+ return true;
+ }
+ /* The header can be Content-*-type, or Content-* */
+ for(set<string>::iterator it = s.begin();
+ it != s.end(); ++it) {
+ size_t off;
+ if ((off = (*it).find("*"))!=string::npos) {
+ list<string> ssplit;
+ unsigned flen = 0;
+
+ get_str_list((*it), "* \t", ssplit);
+ if (off != 0) {
+ string sl = ssplit.front();
+ flen = sl.length();
+ dout(10) << "Finding " << sl << ", in " << h << ", at offset 0" << dendl;
+ if (!boost::algorithm::starts_with(h,sl))
+ continue;
+ ssplit.pop_front();
+ }
+ if (off != ((*it).length() - 1)) {
+ string sl = ssplit.front();
+ dout(10) << "Finding " << sl << ", in " << h
+ << ", at offset not less than " << flen << dendl;
+ if (h.size() < sl.size() ||
+ h.compare((h.size() - sl.size()), sl.size(), sl) != 0)
+ continue;
+ ssplit.pop_front();
+ }
+ if (!ssplit.empty())
+ continue;
+ return true;
+ }
+ }
+ return false;
+}
+
+bool RGWCORSRule::has_wildcard_origin() {
+ if (allowed_origins.find("*") != allowed_origins.end())
+ return true;
+
+ return false;
+}
+
+bool RGWCORSRule::is_origin_present(const char *o) {
+ string origin = o;
+ return is_string_in_set(allowed_origins, origin);
+}
+
+bool RGWCORSRule::is_header_allowed(const char *h, size_t len) {
+ string hdr(h, len);
+ if(lowercase_allowed_hdrs.empty()) {
+ set<string>::iterator iter;
+ for (iter = allowed_hdrs.begin(); iter != allowed_hdrs.end(); ++iter) {
+ lowercase_allowed_hdrs.insert(lowercase_http_attr(*iter));
+ }
+ }
+ return is_string_in_set(lowercase_allowed_hdrs, lowercase_http_attr(hdr));
+}
+
+void RGWCORSRule::format_exp_headers(string& s) {
+ s = "";
+ for (const auto& header : exposable_hdrs) {
+ if (s.length() > 0)
+ s.append(",");
+ // these values are sent to clients in a 'Access-Control-Expose-Headers'
+ // response header, so we escape '\n' to avoid header injection
+ boost::replace_all_copy(std::back_inserter(s), header, "\n", "\\n");
+ }
+}
+
+RGWCORSRule * RGWCORSConfiguration::host_name_rule(const char *origin) {
+ for(list<RGWCORSRule>::iterator it_r = rules.begin();
+ it_r != rules.end(); ++it_r) {
+ RGWCORSRule& r = (*it_r);
+ if (r.is_origin_present(origin))
+ return &r;
+ }
+ return NULL;
+}
+
+void RGWCORSConfiguration::erase_host_name_rule(string& origin) {
+ bool rule_empty;
+ unsigned loop = 0;
+ /*Erase the host name from that rule*/
+ dout(10) << "Num of rules : " << rules.size() << dendl;
+ for(list<RGWCORSRule>::iterator it_r = rules.begin();
+ it_r != rules.end(); ++it_r, loop++) {
+ RGWCORSRule& r = (*it_r);
+ r.erase_origin_if_present(origin, &rule_empty);
+ dout(10) << "Origin:" << origin << ", rule num:"
+ << loop << ", emptying now:" << rule_empty << dendl;
+ if (rule_empty) {
+ rules.erase(it_r);
+ break;
+ }
+ }
+}
+
+void RGWCORSConfiguration::dump() {
+ unsigned loop = 1;
+ unsigned num_rules = rules.size();
+ dout(10) << "Number of rules: " << num_rules << dendl;
+ for(list<RGWCORSRule>::iterator it = rules.begin();
+ it!= rules.end(); ++it, loop++) {
+ dout(10) << " <<<<<<< Rule " << loop << " >>>>>>> " << dendl;
+ (*it).dump_origins();
+ }
+}
diff --git a/src/rgw/rgw_cors.h b/src/rgw/rgw_cors.h
new file mode 100644
index 000000000..c7a2ed5bd
--- /dev/null
+++ b/src/rgw/rgw_cors.h
@@ -0,0 +1,146 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 eNovance SAS <licensing@enovance.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <include/types.h>
+
+#define RGW_CORS_GET 0x1
+#define RGW_CORS_PUT 0x2
+#define RGW_CORS_HEAD 0x4
+#define RGW_CORS_POST 0x8
+#define RGW_CORS_DELETE 0x10
+#define RGW_CORS_COPY 0x20
+#define RGW_CORS_ALL (RGW_CORS_GET | \
+ RGW_CORS_PUT | \
+ RGW_CORS_HEAD | \
+ RGW_CORS_POST | \
+ RGW_CORS_DELETE | \
+ RGW_CORS_COPY)
+
+#define CORS_MAX_AGE_INVALID ((uint32_t)-1)
+
+class RGWCORSRule
+{
+protected:
+ uint32_t max_age;
+ uint8_t allowed_methods;
+ std::string id;
+ std::set<std::string> allowed_hdrs; /* If you change this, you need to discard lowercase_allowed_hdrs */
+ std::set<std::string> lowercase_allowed_hdrs; /* Not built until needed in RGWCORSRule::is_header_allowed */
+ std::set<std::string> allowed_origins;
+ std::list<std::string> exposable_hdrs;
+
+public:
+ RGWCORSRule() : max_age(CORS_MAX_AGE_INVALID),allowed_methods(0) {}
+ RGWCORSRule(std::set<std::string>& o, std::set<std::string>& h,
+ std::list<std::string>& e, uint8_t f, uint32_t a)
+ :max_age(a),
+ allowed_methods(f),
+ allowed_hdrs(h),
+ allowed_origins(o),
+ exposable_hdrs(e) {}
+ virtual ~RGWCORSRule() {}
+
+ std::string& get_id() { return id; }
+ uint32_t get_max_age() { return max_age; }
+ uint8_t get_allowed_methods() { return allowed_methods; }
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(max_age, bl);
+ encode(allowed_methods, bl);
+ encode(id, bl);
+ encode(allowed_hdrs, bl);
+ encode(allowed_origins, bl);
+ encode(exposable_hdrs, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(max_age, bl);
+ decode(allowed_methods, bl);
+ decode(id, bl);
+ decode(allowed_hdrs, bl);
+ decode(allowed_origins, bl);
+ decode(exposable_hdrs, bl);
+ DECODE_FINISH(bl);
+ }
+ bool has_wildcard_origin();
+ bool is_origin_present(const char *o);
+ void format_exp_headers(std::string& s);
+ void erase_origin_if_present(std::string& origin, bool *rule_empty);
+ void dump_origins();
+ void dump(Formatter *f) const;
+ bool is_header_allowed(const char *hdr, size_t len);
+};
+WRITE_CLASS_ENCODER(RGWCORSRule)
+
+class RGWCORSConfiguration
+{
+ protected:
+ std::list<RGWCORSRule> rules;
+ public:
+ RGWCORSConfiguration() {}
+ ~RGWCORSConfiguration() {}
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(rules, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(rules, bl);
+ DECODE_FINISH(bl);
+ }
+ void dump(Formatter *f) const;
+ std::list<RGWCORSRule>& get_rules() {
+ return rules;
+ }
+ bool is_empty() {
+ return rules.empty();
+ }
+ void get_origins_list(const char *origin, std::list<std::string>& origins);
+ RGWCORSRule * host_name_rule(const char *origin);
+ void erase_host_name_rule(std::string& origin);
+ void dump();
+ void stack_rule(RGWCORSRule& r) {
+ rules.push_front(r);
+ }
+};
+WRITE_CLASS_ENCODER(RGWCORSConfiguration)
+
+static inline int validate_name_string(std::string_view o) {
+ if (o.length() == 0)
+ return -1;
+ if (o.find_first_of("*") != o.find_last_of("*"))
+ return -1;
+ return 0;
+}
+
+static inline uint8_t get_cors_method_flags(const char *req_meth) {
+ uint8_t flags = 0;
+
+ if (strcmp(req_meth, "GET") == 0) flags = RGW_CORS_GET;
+ else if (strcmp(req_meth, "POST") == 0) flags = RGW_CORS_POST;
+ else if (strcmp(req_meth, "PUT") == 0) flags = RGW_CORS_PUT;
+ else if (strcmp(req_meth, "DELETE") == 0) flags = RGW_CORS_DELETE;
+ else if (strcmp(req_meth, "HEAD") == 0) flags = RGW_CORS_HEAD;
+
+ return flags;
+}
diff --git a/src/rgw/rgw_cors_s3.cc b/src/rgw/rgw_cors_s3.cc
new file mode 100644
index 000000000..ba68487e2
--- /dev/null
+++ b/src/rgw/rgw_cors_s3.cc
@@ -0,0 +1,246 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 eNovance SAS <licensing@enovance.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <string.h>
+#include <limits.h>
+
+#include <iostream>
+#include <map>
+
+#include "include/types.h"
+
+#include "rgw_cors_s3.h"
+#include "rgw_user.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+void RGWCORSRule_S3::to_xml(XMLFormatter& f) {
+
+ f.open_object_section("CORSRule");
+ /*ID if present*/
+ if (id.length() > 0) {
+ f.dump_string("ID", id);
+ }
+ /*AllowedMethods*/
+ if (allowed_methods & RGW_CORS_GET)
+ f.dump_string("AllowedMethod", "GET");
+ if (allowed_methods & RGW_CORS_PUT)
+ f.dump_string("AllowedMethod", "PUT");
+ if (allowed_methods & RGW_CORS_DELETE)
+ f.dump_string("AllowedMethod", "DELETE");
+ if (allowed_methods & RGW_CORS_HEAD)
+ f.dump_string("AllowedMethod", "HEAD");
+ if (allowed_methods & RGW_CORS_POST)
+ f.dump_string("AllowedMethod", "POST");
+ if (allowed_methods & RGW_CORS_COPY)
+ f.dump_string("AllowedMethod", "COPY");
+ /*AllowedOrigins*/
+ for(set<string>::iterator it = allowed_origins.begin();
+ it != allowed_origins.end();
+ ++it) {
+ string host = *it;
+ f.dump_string("AllowedOrigin", host);
+ }
+ /*AllowedHeader*/
+ for(set<string>::iterator it = allowed_hdrs.begin();
+ it != allowed_hdrs.end(); ++it) {
+ f.dump_string("AllowedHeader", *it);
+ }
+ /*MaxAgeSeconds*/
+ if (max_age != CORS_MAX_AGE_INVALID) {
+ f.dump_unsigned("MaxAgeSeconds", max_age);
+ }
+ /*ExposeHeader*/
+ for(list<string>::iterator it = exposable_hdrs.begin();
+ it != exposable_hdrs.end(); ++it) {
+ f.dump_string("ExposeHeader", *it);
+ }
+ f.close_section();
+}
+
+bool RGWCORSRule_S3::xml_end(const char *el) {
+ XMLObjIter iter = find("AllowedMethod");
+ XMLObj *obj;
+ /*Check all the AllowedMethod entries*/
+ obj = iter.get_next();
+ if (obj) {
+ for( ; obj; obj = iter.get_next()) {
+ const char *s = obj->get_data().c_str();
+ ldpp_dout(dpp, 10) << "RGWCORSRule::xml_end, el : " << el << ", data : " << s << dendl;
+ if (strcasecmp(s, "GET") == 0) {
+ allowed_methods |= RGW_CORS_GET;
+ } else if (strcasecmp(s, "POST") == 0) {
+ allowed_methods |= RGW_CORS_POST;
+ } else if (strcasecmp(s, "DELETE") == 0) {
+ allowed_methods |= RGW_CORS_DELETE;
+ } else if (strcasecmp(s, "HEAD") == 0) {
+ allowed_methods |= RGW_CORS_HEAD;
+ } else if (strcasecmp(s, "PUT") == 0) {
+ allowed_methods |= RGW_CORS_PUT;
+ } else if (strcasecmp(s, "COPY") == 0) {
+ allowed_methods |= RGW_CORS_COPY;
+ } else {
+ return false;
+ }
+ }
+ }
+ /*Check the ID's length; it must be at most 255 characters*/
+ XMLObj *xml_id = find_first("ID");
+ if (xml_id != NULL) {
+ string data = xml_id->get_data();
+ if (data.length() > 255) {
+ ldpp_dout(dpp, 0) << "RGWCORSRule has id of length greater than 255" << dendl;
+ return false;
+ }
+ ldpp_dout(dpp, 10) << "RGWCORRule id : " << data << dendl;
+ id = data;
+ }
+ /*Check if there is at least one AllowedOrigin*/
+ iter = find("AllowedOrigin");
+ if (!(obj = iter.get_next())) {
+ ldpp_dout(dpp, 0) << "RGWCORSRule does not have even one AllowedOrigin" << dendl;
+ return false;
+ }
+ for( ; obj; obj = iter.get_next()) {
+ ldpp_dout(dpp, 10) << "RGWCORSRule - origin : " << obj->get_data() << dendl;
+ /*Just take the hostname*/
+ string host = obj->get_data();
+ if (validate_name_string(host) != 0)
+ return false;
+ allowed_origins.insert(allowed_origins.end(), host);
+ }
+ /*Check and update max_age*/
+ iter = find("MaxAgeSeconds");
+ if ((obj = iter.get_next())) {
+ char *end = NULL;
+
+ unsigned long long ull = strtoull(obj->get_data().c_str(), &end, 10);
+ if (*end != '\0') {
+ ldpp_dout(dpp, 0) << "RGWCORSRule's MaxAgeSeconds " << obj->get_data() << " is an invalid integer" << dendl;
+ return false;
+ }
+ if (ull >= 0x100000000ull) {
+ max_age = CORS_MAX_AGE_INVALID;
+ } else {
+ max_age = (uint32_t)ull;
+ }
+ ldpp_dout(dpp, 10) << "RGWCORSRule : max_age : " << max_age << dendl;
+ }
+ /*Check and update ExposeHeader*/
+ iter = find("ExposeHeader");
+ if ((obj = iter.get_next())) {
+ for(; obj; obj = iter.get_next()) {
+ ldpp_dout(dpp, 10) << "RGWCORSRule - exp_hdr : " << obj->get_data() << dendl;
+ exposable_hdrs.push_back(obj->get_data());
+ }
+ }
+ /*Check and update AllowedHeader*/
+ iter = find("AllowedHeader");
+ if ((obj = iter.get_next())) {
+ for(; obj; obj = iter.get_next()) {
+ ldpp_dout(dpp, 10) << "RGWCORSRule - allowed_hdr : " << obj->get_data() << dendl;
+ string s = obj->get_data();
+ if (validate_name_string(s) != 0)
+ return false;
+ allowed_hdrs.insert(allowed_hdrs.end(), s);
+ }
+ }
+ return true;
+}
+
+void RGWCORSConfiguration_S3::to_xml(ostream& out) {
+ XMLFormatter f;
+ f.open_object_section_in_ns("CORSConfiguration", XMLNS_AWS_S3);
+ for(list<RGWCORSRule>::iterator it = rules.begin();
+ it != rules.end(); ++it) {
+ (static_cast<RGWCORSRule_S3 &>(*it)).to_xml(f);
+ }
+ f.close_section();
+ f.flush(out);
+}
+
+bool RGWCORSConfiguration_S3::xml_end(const char *el) {
+ XMLObjIter iter = find("CORSRule");
+ RGWCORSRule_S3 *obj;
+ if (!(obj = static_cast<RGWCORSRule_S3 *>(iter.get_next()))) {
+ ldpp_dout(dpp, 0) << "CORSConfiguration should have atleast one CORSRule" << dendl;
+ return false;
+ }
+ for(; obj; obj = static_cast<RGWCORSRule_S3 *>(iter.get_next())) {
+ rules.push_back(*obj);
+ }
+ return true;
+}
+
+class CORSRuleID_S3 : public XMLObj {
+ public:
+ CORSRuleID_S3() {}
+ ~CORSRuleID_S3() override {}
+};
+
+class CORSRuleAllowedOrigin_S3 : public XMLObj {
+ public:
+ CORSRuleAllowedOrigin_S3() {}
+ ~CORSRuleAllowedOrigin_S3() override {}
+};
+
+class CORSRuleAllowedMethod_S3 : public XMLObj {
+ public:
+ CORSRuleAllowedMethod_S3() {}
+ ~CORSRuleAllowedMethod_S3() override {}
+};
+
+class CORSRuleAllowedHeader_S3 : public XMLObj {
+ public:
+ CORSRuleAllowedHeader_S3() {}
+ ~CORSRuleAllowedHeader_S3() override {}
+};
+
+class CORSRuleMaxAgeSeconds_S3 : public XMLObj {
+ public:
+ CORSRuleMaxAgeSeconds_S3() {}
+ ~CORSRuleMaxAgeSeconds_S3() override {}
+};
+
+class CORSRuleExposeHeader_S3 : public XMLObj {
+ public:
+ CORSRuleExposeHeader_S3() {}
+ ~CORSRuleExposeHeader_S3() override {}
+};
+
+XMLObj *RGWCORSXMLParser_S3::alloc_obj(const char *el) {
+ if (strcmp(el, "CORSConfiguration") == 0) {
+ return new RGWCORSConfiguration_S3(dpp);
+ } else if (strcmp(el, "CORSRule") == 0) {
+ return new RGWCORSRule_S3(dpp);
+ } else if (strcmp(el, "ID") == 0) {
+ return new CORSRuleID_S3;
+ } else if (strcmp(el, "AllowedOrigin") == 0) {
+ return new CORSRuleAllowedOrigin_S3;
+ } else if (strcmp(el, "AllowedMethod") == 0) {
+ return new CORSRuleAllowedMethod_S3;
+ } else if (strcmp(el, "AllowedHeader") == 0) {
+ return new CORSRuleAllowedHeader_S3;
+ } else if (strcmp(el, "MaxAgeSeconds") == 0) {
+ return new CORSRuleMaxAgeSeconds_S3;
+ } else if (strcmp(el, "ExposeHeader") == 0) {
+ return new CORSRuleExposeHeader_S3;
+ }
+ return NULL;
+}
+
diff --git a/src/rgw/rgw_cors_s3.h b/src/rgw/rgw_cors_s3.h
new file mode 100644
index 000000000..8d92a3c5f
--- /dev/null
+++ b/src/rgw/rgw_cors_s3.h
@@ -0,0 +1,58 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 eNovance SAS <licensing@enovance.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <iosfwd>
+
+#include <include/types.h>
+#include <common/Formatter.h>
+#include <common/dout.h>
+#include "rgw_xml.h"
+#include "rgw_cors.h"
+
+// One S3 <CORSRule> element: rule state from RGWCORSRule plus XML parse-tree
+// behavior from XMLObj.  xml_end() fills the rule from child elements;
+// to_xml() serializes it back out.
+class RGWCORSRule_S3 : public RGWCORSRule, public XMLObj
+{
+  const DoutPrefixProvider *dpp; // log prefix context; not owned
+  public:
+  RGWCORSRule_S3(const DoutPrefixProvider *dpp) : dpp(dpp) {}
+  ~RGWCORSRule_S3() override {}
+
+  bool xml_end(const char *el) override;
+  void to_xml(XMLFormatter& f);
+};
+
+// Whole S3 <CORSConfiguration> document: xml_end() collects the CORSRule
+// children (at least one is required) and to_xml() emits the document.
+class RGWCORSConfiguration_S3 : public RGWCORSConfiguration, public XMLObj
+{
+  const DoutPrefixProvider *dpp; // log prefix context; not owned
+  public:
+  RGWCORSConfiguration_S3(const DoutPrefixProvider *dpp) : dpp(dpp) {}
+  ~RGWCORSConfiguration_S3() override {}
+
+  bool xml_end(const char *el) override;
+  void to_xml(std::ostream& out);
+};
+
+// XML parser specialized for S3 CORS documents; alloc_obj() maps element
+// names to the node types above.
+class RGWCORSXMLParser_S3 : public RGWXMLParser
+{
+  const DoutPrefixProvider *dpp;
+  CephContext *cct;
+
+  XMLObj *alloc_obj(const char *el) override;
+public:
+  explicit RGWCORSXMLParser_S3(const DoutPrefixProvider *_dpp, CephContext *_cct) : dpp(_dpp), cct(_cct) {}
+};
diff --git a/src/rgw/rgw_cors_swift.h b/src/rgw/rgw_cors_swift.h
new file mode 100644
index 000000000..f5a1b14a0
--- /dev/null
+++ b/src/rgw/rgw_cors_swift.h
@@ -0,0 +1,83 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 eNovance SAS <licensing@enovance.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <vector>
+#include <include/types.h>
+#include <include/str_list.h>
+
+#include "rgw_cors.h"
+
+// Swift CORS configuration: built from delimiter-separated container header
+// values rather than an XML document.
+class RGWCORSConfiguration_SWIFT : public RGWCORSConfiguration
+{
+  public:
+    RGWCORSConfiguration_SWIFT() {}
+    ~RGWCORSConfiguration_SWIFT() {}
+    // Build one CORS rule from the given ";,= \t"-separated strings and stack
+    // it onto the configuration.  allow_origins is mandatory; the rest are
+    // optional.  Returns 0 on success, -EINVAL when origins/headers are
+    // missing or fail name validation.
+    int create_update(const char *allow_origins, const char *allow_headers,
+                      const char *expose_headers, const char *max_age) {
+    std::set<std::string> o, h;
+    std::list<std::string> e;
+    unsigned long a = CORS_MAX_AGE_INVALID;
+    uint8_t flags = RGW_CORS_ALL; // Swift rules allow all methods
+
+    // Every origin token must validate; one bad token rejects the request.
+    int nr_invalid_names = 0;
+    auto add_host = [&nr_invalid_names, &o] (auto host) {
+      if (validate_name_string(host) == 0) {
+        o.emplace(std::string{host});
+      } else {
+        nr_invalid_names++;
+      }
+    };
+    for_each_substr(allow_origins, ";,= \t", add_host);
+    if (o.empty() || nr_invalid_names > 0) {
+      return -EINVAL;
+    }
+
+    if (allow_headers) {
+      int nr_invalid_headers = 0;
+      auto add_header = [&nr_invalid_headers, &h] (auto allow_header) {
+        if (validate_name_string(allow_header) == 0) {
+          h.emplace(std::string{allow_header});
+        } else {
+          nr_invalid_headers++;
+        }
+      };
+      for_each_substr(allow_headers, ";,= \t", add_header);
+      if (h.empty() || nr_invalid_headers > 0) {
+        return -EINVAL;
+      }
+    }
+
+    // Expose headers are taken verbatim (no validation), preserving order.
+    if (expose_headers) {
+      for_each_substr(expose_headers, ";,= \t",
+                      [&e] (auto expose_header) {
+                        e.emplace_back(std::string(expose_header));
+                      });
+    }
+    if (max_age) {
+      char *end = NULL;
+      a = strtoul(max_age, &end, 10);
+      // Treat overflow (strtoul returns ULONG_MAX) and non-numeric input
+      // (no characters consumed, strtoul returns 0 with end == max_age) as
+      // "no valid max age" instead of silently storing a bogus value.
+      if (a == ULONG_MAX || end == max_age)
+        a = CORS_MAX_AGE_INVALID;
+    }
+
+    RGWCORSRule rule(o, h, e, flags, a);
+    stack_rule(rule);
+    return 0;
+  }
+};
diff --git a/src/rgw/rgw_cr_rest.cc b/src/rgw/rgw_cr_rest.cc
new file mode 100644
index 000000000..04920a155
--- /dev/null
+++ b/src/rgw/rgw_cr_rest.cc
@@ -0,0 +1,351 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_cr_rest.h"
+
+#include "rgw_coroutine.h"
+
+// re-include our assert to clobber the system one; fix dout:
+#include "include/ceph_assert.h"
+
+#include <boost/asio/yield.hpp>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+// Wire this callback up as the request's receive sink; io_id names the
+// (read|control) channel used later to wake the owning coroutine.
+RGWCRHTTPGetDataCB::RGWCRHTTPGetDataCB(RGWCoroutinesEnv *_env, RGWCoroutine *_cr, RGWHTTPStreamRWRequest *_req) : env(_env), cr(_cr), req(_req) {
+  io_id = req->get_io_id(RGWHTTPClient::HTTPCLIENT_IO_READ |RGWHTTPClient::HTTPCLIENT_IO_CONTROL);
+  req->set_in_cb(this);
+}
+
+#define GET_DATA_WINDOW_SIZE 2 * 1024 * 1024
+
+// Receive-path callback: peels off the expected extra-data prefix, buffers
+// the rest, wakes the coroutine once a window's worth is buffered, and asks
+// the HTTP client to pause once two windows are pending (flow control).
+int RGWCRHTTPGetDataCB::handle_data(bufferlist& bl, bool *pause) {
+  // NOTE(review): data/notified are read outside `lock` here and below;
+  // presumably only the HTTP client thread mutates them concurrently with
+  // claim_data() — confirm against RGWHTTPManager's threading model.
+  if (data.length() < GET_DATA_WINDOW_SIZE / 2) {
+    notified = false;
+  }
+
+  {
+    uint64_t bl_len = bl.length();
+
+    std::lock_guard l{lock};
+
+    // Route the first extra_data_len bytes into extra_data before exposing
+    // any payload bytes to the reader.
+    if (!got_all_extra_data) {
+      uint64_t max = extra_data_len - extra_data.length();
+      if (max > bl_len) {
+        max = bl_len;
+      }
+      bl.splice(0, max, &extra_data);
+      bl_len -= max;
+      got_all_extra_data = extra_data.length() == extra_data_len;
+    }
+
+    data.append(bl);
+  }
+
+  uint64_t data_len = data.length();
+  if (data_len >= GET_DATA_WINDOW_SIZE && !notified) {
+    notified = true; // notify at most once per window refill
+    env->manager->io_complete(cr, io_id);
+  }
+  if (data_len >= 2 * GET_DATA_WINDOW_SIZE) {
+    *pause = true;
+    paused = true;
+  }
+  return 0;
+}
+
+// Move up to `max` buffered bytes into *dest; once the backlog has drained
+// back to a single window, resume a paused request.
+void RGWCRHTTPGetDataCB::claim_data(bufferlist *dest, uint64_t max) {
+  bool need_to_unpause = false;
+
+  {
+    std::lock_guard l{lock};
+
+    if (data.length() == 0) {
+      return;
+    }
+
+    if (data.length() < max) {
+      max = data.length();
+    }
+
+    data.splice(0, max, dest);
+    need_to_unpause = (paused && data.length() <= GET_DATA_WINDOW_SIZE);
+  }
+
+  // Unpause outside the lock.
+  if (need_to_unpause) {
+    req->unpause_receive();
+  }
+}
+
+// Cancel any in-flight request and wait for it to settle before deleting it,
+// so the HTTP manager cannot call back into a destroyed object.
+RGWStreamReadHTTPResourceCRF::~RGWStreamReadHTTPResourceCRF()
+{
+  if (req) {
+    req->cancel();
+    req->wait(null_yield);
+    delete req;
+  }
+}
+
+// Register the request as a new io on the caller's stack, install the data
+// callback, and start the asynchronous transfer.
+int RGWStreamReadHTTPResourceCRF::init(const DoutPrefixProvider *dpp)
+{
+  env->stack->init_new_io(req);
+
+  in_cb.emplace(env, caller, req);
+
+  int r = req->send(http_manager);
+  if (r < 0) {
+    return r;
+  }
+
+  return 0;
+}
+
+// Register the outbound request as a new io, hook up write-drain
+// notifications (flow control for write()), and start sending.
+int RGWStreamWriteHTTPResourceCRF::send()
+{
+  env->stack->init_new_io(req);
+
+  req->set_write_drain_cb(&write_drain_notify_cb);
+
+  int r = req->send(http_manager);
+  if (r < 0) {
+    return r;
+  }
+
+  return 0;
+}
+
+// True once read() has observed response headers.
+bool RGWStreamReadHTTPResourceCRF::has_attrs()
+{
+  return got_attrs;
+}
+
+// Copy the response headers into *attrs.
+void RGWStreamReadHTTPResourceCRF::get_attrs(std::map<string, string> *attrs)
+{
+  req->get_out_headers(attrs);
+}
+
+// Default implementation: mirror every response header into rest_obj.attrs.
+// (Each `header` pair is copied by value per iteration; harmless for
+// header-sized maps.)
+int RGWStreamReadHTTPResourceCRF::decode_rest_obj(const DoutPrefixProvider *dpp, map<string, string>& headers, bufferlist& extra_data) {
+  /* basic generic implementation */
+  for (auto header : headers) {
+    const string& val = header.second;
+
+    rest_obj.attrs[header.first] = val;
+  }
+
+  return 0;
+}
+
+// Reentrant read step (boost::asio stackless coroutine: re-entered by the
+// caller until data is produced or the request completes).  Yields while the
+// callback has nothing buffered, decodes any prepended extra data on first
+// arrival, then hands up to max_size buffered bytes to *out.  *io_pending
+// reports whether we are still waiting on io when control returns.
+int RGWStreamReadHTTPResourceCRF::read(const DoutPrefixProvider *dpp, bufferlist *out, uint64_t max_size, bool *io_pending)
+{
+  reenter(&read_state) {
+    io_read_mask = req->get_io_id(RGWHTTPClient::HTTPCLIENT_IO_READ | RGWHTTPClient::HTTPCLIENT_IO_CONTROL);
+    while (!req->is_done() ||
+           in_cb->has_data()) {
+      *io_pending = true;
+      if (!in_cb->has_data()) {
+        yield caller->io_block(0, io_read_mask);
+      }
+      got_attrs = true;
+      if (need_extra_data() && !got_extra_data) {
+        if (!in_cb->has_all_extra_data()) {
+          continue; // wait for the full extra-data prefix before decoding
+        }
+        extra_data.claim_append(in_cb->get_extra_data());
+        map<string, string> attrs;
+        req->get_out_headers(&attrs);
+        int ret = decode_rest_obj(dpp, attrs, extra_data);
+        if (ret < 0) {
+          ldout(cct, 0) << "ERROR: " << __func__ << " decode_rest_obj() returned ret=" << ret << dendl;
+          return ret;
+        }
+        got_extra_data = true;
+      }
+      *io_pending = false;
+      in_cb->claim_data(out, max_size);
+      if (out->length() == 0) {
+        /* this may happen if we just read the prepended extra_data and didn't have any data
+         * after. In that case, retry reading, so that caller doesn't assume it's EOF.
+         */
+        continue;
+      }
+      if (!req->is_done() || out->length() >= max_size) {
+        yield;
+      }
+    }
+  }
+  return 0;
+}
+
+// True once the underlying HTTP request has completed.
+bool RGWStreamReadHTTPResourceCRF::is_done()
+{
+  return req->is_done();
+}
+
+// Mirror of the read-side destructor: cancel, wait, then free the request so
+// no callback can land on a destroyed object.
+RGWStreamWriteHTTPResourceCRF::~RGWStreamWriteHTTPResourceCRF()
+{
+  if (req) {
+    req->cancel();
+    req->wait(null_yield);
+    delete req;
+  }
+}
+
+// Copy the source object's content length and attributes onto the outbound
+// request before its body is streamed.
+void RGWStreamWriteHTTPResourceCRF::send_ready(const DoutPrefixProvider *dpp, const rgw_rest_obj& rest_obj)
+{
+  req->set_send_length(rest_obj.content_len);
+  for (auto h : rest_obj.attrs) {
+    req->append_header(h.first, h.second);
+  }
+}
+
+#define PENDING_WRITES_WINDOW (1 * 1024 * 1024)
+
+// Called as queued writes drain.  If a writer is blocked and the backlog has
+// dropped below half the window, wake it (exactly once per block).
+void RGWStreamWriteHTTPResourceCRF::write_drain_notify(uint64_t pending_size)
+{
+  lock_guard l(blocked_lock);
+  if (is_blocked && (pending_size < PENDING_WRITES_WINDOW / 2)) {
+    env->manager->io_complete(caller, req->get_io_id(RGWHTTPClient::HTTPCLIENT_IO_WRITE | RGWHTTPClient::HTTPCLIENT_IO_CONTROL));
+    is_blocked = false;
+  }
+}
+
+// Trampoline from the HTTP client's drain callback into the owning CRF.
+void RGWStreamWriteHTTPResourceCRF::WriteDrainNotify::notify(uint64_t pending_size)
+{
+  crf->write_drain_notify(pending_size);
+}
+
+// Reentrant write step: if the request already has a full window of unsent
+// bytes, mark ourselves blocked and yield until write_drain_notify() signals
+// completion; otherwise queue `data` for sending.
+int RGWStreamWriteHTTPResourceCRF::write(bufferlist& data, bool *io_pending)
+{
+  reenter(&write_state) {
+    while (!req->is_done()) {
+      *io_pending = false;
+      if (req->get_pending_send_size() >= PENDING_WRITES_WINDOW) {
+        *io_pending = true;
+        {
+          lock_guard l(blocked_lock);
+          is_blocked = true;
+
+          /* it's ok to unlock here, even if io_complete() arrives before io_block(), it'll wakeup
+           * correctly */
+        }
+        yield caller->io_block(0, req->get_io_id(RGWHTTPClient::HTTPCLIENT_IO_WRITE | RGWHTTPClient::HTTPCLIENT_IO_CONTROL));
+      }
+      yield req->add_send_data(data);
+    }
+    return req->get_status();
+  }
+  return 0;
+}
+
+// Reentrant: finish the write side, then keep yielding until the request
+// completes; finally surface the response headers and the request retcode.
+// *need_retry tells the caller whether to re-enter.
+int RGWStreamWriteHTTPResourceCRF::drain_writes(bool *need_retry)
+{
+  reenter(&drain_state) {
+    *need_retry = true;
+    yield req->finish_write();
+    *need_retry = !req->is_done();
+    while (!req->is_done()) {
+      yield caller->io_block(0, req->get_io_id(RGWHTTPClient::HTTPCLIENT_IO_CONTROL));
+      *need_retry = !req->is_done();
+    }
+
+    map<string, string> headers;
+    req->get_out_headers(&headers);
+    handle_headers(headers);
+
+    return req->get_req_retcode();
+  }
+  return 0;
+}
+
+// Splice coroutine: pumps data from a streaming-read CRF into a
+// streaming-write CRF (see operate() below).
+RGWStreamSpliceCR::RGWStreamSpliceCR(CephContext *_cct, RGWHTTPManager *_mgr,
+                                     shared_ptr<RGWStreamReadHTTPResourceCRF>& _in_crf,
+                                     shared_ptr<RGWStreamWriteHTTPResourceCRF>& _out_crf) : RGWCoroutine(_cct), cct(_cct), http_manager(_mgr),
+                                                                                           in_crf(_in_crf), out_crf(_out_crf) {}
+RGWStreamSpliceCR::~RGWStreamSpliceCR() { }
+
+// Main splice loop: read up to 4 MiB at a time from in_crf; once the source's
+// attrs are known, initialize out_crf and forward them; write each chunk;
+// finally drain pending writes.  read/write are reentrant, hence the
+// need_retry inner loops.
+int RGWStreamSpliceCR::operate(const DoutPrefixProvider *dpp) {
+  reenter(this) {
+    {
+      int ret = in_crf->init(dpp);
+      if (ret < 0) {
+        return set_cr_error(ret);
+      }
+    }
+
+    do {
+
+      bl.clear();
+
+      do {
+        yield {
+          ret = in_crf->read(dpp, &bl, 4 * 1024 * 1024, &need_retry);
+          if (ret < 0) {
+            return set_cr_error(ret);
+          }
+        }
+
+        if (retcode < 0) {
+          // NOTE(review): this passes `ret` (>= 0 on this path) rather than
+          // `retcode` to set_cr_error() — the actual error code appears to be
+          // dropped; confirm intent upstream.
+          ldout(cct, 20) << __func__ << ": in_crf->read() retcode=" << retcode << dendl;
+          return set_cr_error(ret);
+        }
+      } while (need_retry);
+
+      ldout(cct, 20) << "read " << bl.length() << " bytes" << dendl;
+
+      if (!in_crf->has_attrs()) {
+        assert (bl.length() == 0);
+        continue;
+      }
+
+      if (!sent_attrs) {
+        int ret = out_crf->init();
+        if (ret < 0) {
+          return set_cr_error(ret);
+        }
+        out_crf->send_ready(dpp, in_crf->get_rest_obj());
+        ret = out_crf->send();
+        if (ret < 0) {
+          return set_cr_error(ret);
+        }
+        sent_attrs = true;
+      }
+
+      if (bl.length() == 0 && in_crf->is_done()) {
+        break;
+      }
+
+      total_read += bl.length();
+
+      do {
+        yield {
+          ldout(cct, 20) << "writing " << bl.length() << " bytes" << dendl;
+          ret = out_crf->write(bl, &need_retry);
+          if (ret < 0) {
+            return set_cr_error(ret);
+          }
+        }
+
+        if (retcode < 0) {
+          // NOTE(review): same `ret` vs `retcode` concern as above.
+          ldout(cct, 20) << __func__ << ": out_crf->write() retcode=" << retcode << dendl;
+          return set_cr_error(ret);
+        }
+      } while (need_retry);
+    } while (true);
+
+    do {
+      yield {
+        int ret = out_crf->drain_writes(&need_retry);
+        if (ret < 0) {
+          return set_cr_error(ret);
+        }
+      }
+    } while (need_retry);
+
+    return set_cr_done();
+  }
+  return 0;
+}
+
diff --git a/src/rgw/rgw_cr_rest.h b/src/rgw/rgw_cr_rest.h
new file mode 100644
index 000000000..ba47c3dd6
--- /dev/null
+++ b/src/rgw/rgw_cr_rest.h
@@ -0,0 +1,590 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <boost/intrusive_ptr.hpp>
+#include <mutex>
+#include "include/ceph_assert.h" // boost header clobbers our assert.h
+
+#include "rgw_coroutine.h"
+#include "rgw_rest_conn.h"
+
+
+// Plain-data description of a REST-visible object: key, content length,
+// response attributes (plus user custom attrs) and its ACLs.
+struct rgw_rest_obj {
+  rgw_obj_key key;
+  uint64_t content_len; // NOTE(review): not default-initialized; callers set it
+  std::map<std::string, std::string> attrs;
+  std::map<std::string, std::string> custom_attrs;
+  RGWAccessControlPolicy acls;
+
+  void init(const rgw_obj_key& _key) {
+    key = _key;
+  }
+};
+
+// Coroutine issuing a single HTTP GET against a REST connection; the raw
+// response body lands in *result (may be absent for ctors that don't take
+// one).  send_request() starts the async read; request_complete() waits and
+// reports errors.
+class RGWReadRawRESTResourceCR : public RGWSimpleCoroutine {
+  bufferlist *result;
+ protected:
+  RGWRESTConn *conn;
+  RGWHTTPManager *http_manager;
+  std::string path;
+  param_vec_t params;
+  param_vec_t extra_headers;
+public:
+  boost::intrusive_ptr<RGWRESTReadResource> http_op;
+  RGWReadRawRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn,
+                           RGWHTTPManager *_http_manager, const std::string& _path,
+                           rgw_http_param_pair *params, bufferlist *_result)
+    : RGWSimpleCoroutine(_cct), result(_result), conn(_conn), http_manager(_http_manager),
+      path(_path), params(make_param_list(params))
+  {}
+
+  RGWReadRawRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn,
+                           RGWHTTPManager *_http_manager, const std::string& _path,
+                           rgw_http_param_pair *params)
+    : RGWSimpleCoroutine(_cct), conn(_conn), http_manager(_http_manager),
+      path(_path), params(make_param_list(params))
+  {}
+
+  RGWReadRawRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn,
+                           RGWHTTPManager *_http_manager, const std::string& _path,
+                           rgw_http_param_pair *params, param_vec_t &hdrs)
+    : RGWSimpleCoroutine(_cct), conn(_conn), http_manager(_http_manager),
+      path(_path), params(make_param_list(params)),
+      extra_headers(hdrs)
+  {}
+
+  RGWReadRawRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn,
+                           RGWHTTPManager *_http_manager, const std::string& _path,
+                           rgw_http_param_pair *params,
+                           std::map <std::string, std::string> *hdrs)
+    : RGWSimpleCoroutine(_cct), conn(_conn), http_manager(_http_manager),
+      path(_path), params(make_param_list(params)),
+      extra_headers(make_param_list(hdrs))
+  {}
+
+
+  ~RGWReadRawRESTResourceCR() override {
+    request_cleanup();
+  }
+
+  // Create the read resource, register it as a new io, and start the async
+  // GET.  On success the ref is kept in http_op for request_complete().
+  int send_request(const DoutPrefixProvider *dpp) override {
+    auto op = boost::intrusive_ptr<RGWRESTReadResource>(
+        new RGWRESTReadResource(conn, path, params, &extra_headers, http_manager));
+
+    init_new_io(op.get());
+
+    int ret = op->aio_read(dpp);
+    if (ret < 0) {
+      log_error() << "failed to send http operation: " << op->to_str()
+          << " ret=" << ret << std::endl;
+      op->put();
+      return ret;
+    }
+    std::swap(http_op, op); // store reference in http_op on success
+    return 0;
+  }
+
+
+
+  // Overridable so typed subclasses can decode into their own result type.
+  virtual int wait_result() {
+    return http_op->wait(result, null_yield);
+  }
+
+  int request_complete() override {
+    int ret;
+
+    ret = wait_result();
+
+    auto op = std::move(http_op); // release ref on return
+    if (ret < 0) {
+      error_stream << "http operation failed: " << op->to_str()
+          << " status=" << op->get_http_status() << std::endl;
+      op->put();
+      return ret;
+    }
+    op->put();
+    return 0;
+  }
+
+  void request_cleanup() override {
+    if (http_op) {
+      http_op->put();
+      http_op = NULL;
+    }
+  }
+
+};
+
+
+// GET variant that deserializes the response into *result (type T) via the
+// typed RGWRESTReadResource::wait overload, instead of returning raw bytes.
+template <class T>
+class RGWReadRESTResourceCR : public RGWReadRawRESTResourceCR {
+  T *result;
+ public:
+  RGWReadRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn,
+                        RGWHTTPManager *_http_manager, const std::string& _path,
+                        rgw_http_param_pair *params, T *_result)
+    : RGWReadRawRESTResourceCR(_cct, _conn, _http_manager, _path, params), result(_result)
+  {}
+
+  RGWReadRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn,
+                        RGWHTTPManager *_http_manager, const std::string& _path,
+                        rgw_http_param_pair *params,
+                        std::map <std::string, std::string> *hdrs,
+                        T *_result)
+    : RGWReadRawRESTResourceCR(_cct, _conn, _http_manager, _path, params, hdrs), result(_result)
+  {}
+
+  int wait_result() override {
+    return http_op->wait(result, null_yield);
+  }
+
+};
+
+// Coroutine issuing a single body-carrying HTTP request (method chosen by the
+// caller).  Optionally decodes the success payload into *result (T) and the
+// error payload into *err_result (E); with neither set, the response body is
+// read and discarded.
+template <class T, class E = int>
+class RGWSendRawRESTResourceCR: public RGWSimpleCoroutine {
+ protected:
+  RGWRESTConn *conn;
+  RGWHTTPManager *http_manager;
+  std::string method;
+  std::string path;
+  param_vec_t params;
+  param_vec_t headers;
+  std::map<std::string, std::string> *attrs;
+  T *result;
+  E *err_result;
+  bufferlist input_bl;
+  bool send_content_length=false;
+  boost::intrusive_ptr<RGWRESTSendResource> http_op;
+
+  public:
+  RGWSendRawRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn,
+                           RGWHTTPManager *_http_manager,
+                           const std::string& _method, const std::string& _path,
+                           rgw_http_param_pair *_params,
+                           std::map<std::string, std::string> *_attrs,
+                           bufferlist& _input, T *_result,
+                           bool _send_content_length,
+                           E *_err_result = nullptr)
+    : RGWSimpleCoroutine(_cct), conn(_conn), http_manager(_http_manager),
+      method(_method), path(_path), params(make_param_list(_params)),
+      headers(make_param_list(_attrs)), attrs(_attrs),
+      result(_result), err_result(_err_result),
+      input_bl(_input), send_content_length(_send_content_length) {}
+
+  // Body-less variant: subclasses (e.g. JSON-encoding ones) fill input_bl.
+  RGWSendRawRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn,
+                           RGWHTTPManager *_http_manager,
+                           const std::string& _method, const std::string& _path,
+                           rgw_http_param_pair *_params, std::map<std::string, std::string> *_attrs,
+                           T *_result, E *_err_result = nullptr)
+    : RGWSimpleCoroutine(_cct), conn(_conn), http_manager(_http_manager),
+      method(_method), path(_path), params(make_param_list(_params)), headers(make_param_list(_attrs)), attrs(_attrs), result(_result),
+      err_result(_err_result) {}
+
+  ~RGWSendRawRESTResourceCR() override {
+    request_cleanup();
+  }
+
+  // Create the send resource, register it as a new io, and start the async
+  // transfer of input_bl.
+  int send_request(const DoutPrefixProvider *dpp) override {
+    auto op = boost::intrusive_ptr<RGWRESTSendResource>(
+        new RGWRESTSendResource(conn, method, path, params, &headers, http_manager));
+
+    init_new_io(op.get());
+
+    int ret = op->aio_send(dpp, input_bl);
+    if (ret < 0) {
+      ldpp_subdout(dpp, rgw, 0) << "ERROR: failed to send request" << dendl;
+      op->put();
+      return ret;
+    }
+    std::swap(http_op, op); // store reference in http_op on success
+    return 0;
+  }
+
+  int request_complete() override {
+    int ret;
+    if (result || err_result) {
+      ret = http_op->wait(result, null_yield, err_result);
+    } else {
+      bufferlist bl;
+      ret = http_op->wait(&bl, null_yield);
+    }
+    auto op = std::move(http_op); // release ref on return
+    if (ret < 0) {
+      error_stream << "http operation failed: " << op->to_str()
+                   << " status=" << op->get_http_status() << std::endl;
+      lsubdout(cct, rgw, 5) << "failed to wait for op, ret=" << ret
+                            << ": " << op->to_str() << dendl;
+      op->put();
+      return ret;
+    }
+    op->put();
+    return 0;
+  }
+
+  void request_cleanup() override {
+    if (http_op) {
+      http_op->put();
+      http_op = NULL;
+    }
+  }
+};
+
+// Body-carrying request whose input S is JSON-encoded (under a "data"
+// envelope) into the request body at construction time.
+template <class S, class T, class E = int>
+class RGWSendRESTResourceCR : public RGWSendRawRESTResourceCR<T, E> {
+ public:
+  RGWSendRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn,
+                        RGWHTTPManager *_http_manager,
+                        const std::string& _method, const std::string& _path,
+                        rgw_http_param_pair *_params, std::map<std::string, std::string> *_attrs,
+                        S& _input, T *_result, E *_err_result = nullptr)
+    : RGWSendRawRESTResourceCR<T, E>(_cct, _conn, _http_manager, _method, _path, _params, _attrs, _result, _err_result) {
+
+    JSONFormatter jf;
+    encode_json("data", _input, &jf);
+    std::stringstream ss;
+    jf.flush(ss);
+    //bufferlist bl;
+    this->input_bl.append(ss.str());
+  }
+
+};
+
+// POST convenience wrapper over RGWSendRESTResourceCR (JSON-encoded body).
+template <class S, class T, class E = int>
+class RGWPostRESTResourceCR : public RGWSendRESTResourceCR<S, T, E> {
+public:
+  RGWPostRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn,
+                        RGWHTTPManager *_http_manager,
+                        const std::string& _path,
+                        rgw_http_param_pair *_params, S& _input,
+                        T *_result, E *_err_result = nullptr)
+    : RGWSendRESTResourceCR<S, T, E>(_cct, _conn, _http_manager,
+                                     "POST", _path,
+                                     _params, nullptr, _input,
+                                     _result, _err_result) {}
+};
+
+// PUT of a raw buffer; always sends Content-Length (send_content_length=true).
+template <class T, class E = int>
+class RGWPutRawRESTResourceCR: public RGWSendRawRESTResourceCR <T, E> {
+ public:
+  RGWPutRawRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn,
+                          RGWHTTPManager *_http_manager,
+                          const std::string& _path,
+                          rgw_http_param_pair *_params, bufferlist& _input,
+                          T *_result, E *_err_result = nullptr)
+    : RGWSendRawRESTResourceCR<T, E>(_cct, _conn, _http_manager, "PUT", _path,
+                                     _params, nullptr, _input, _result, true, _err_result) {}
+
+};
+
+// POST of a raw buffer with caller-supplied headers; always sends
+// Content-Length.
+template <class T, class E = int>
+class RGWPostRawRESTResourceCR: public RGWSendRawRESTResourceCR <T, E> {
+ public:
+  RGWPostRawRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn,
+                           RGWHTTPManager *_http_manager,
+                           const std::string& _path,
+                           rgw_http_param_pair *_params,
+                           std::map<std::string, std::string> * _attrs,
+                           bufferlist& _input,
+                           T *_result, E *_err_result = nullptr)
+    : RGWSendRawRESTResourceCR<T, E>(_cct, _conn, _http_manager, "POST", _path,
+                                     _params, _attrs, _input, _result, true, _err_result) {}
+
+};
+
+
+// PUT with JSON-encoded body; the second ctor also takes extra headers.
+template <class S, class T, class E = int>
+class RGWPutRESTResourceCR : public RGWSendRESTResourceCR<S, T, E> {
+public:
+  RGWPutRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn,
+                       RGWHTTPManager *_http_manager,
+                       const std::string& _path,
+                       rgw_http_param_pair *_params, S& _input,
+                       T *_result, E *_err_result = nullptr)
+    : RGWSendRESTResourceCR<S, T, E>(_cct, _conn, _http_manager,
+                                     "PUT", _path,
+                                     _params, nullptr, _input,
+                                     _result, _err_result) {}
+
+  RGWPutRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn,
+                       RGWHTTPManager *_http_manager,
+                       const std::string& _path,
+                       rgw_http_param_pair *_params,
+                       std::map<std::string, std::string> *_attrs,
+                       S& _input, T *_result, E *_err_result = nullptr)
+    : RGWSendRESTResourceCR<S, T, E>(_cct, _conn, _http_manager,
+                                     "PUT", _path,
+                                     _params, _attrs, _input,
+                                     _result, _err_result) {}
+
+};
+
+// Coroutine issuing a single HTTP DELETE; the response body is read and
+// discarded, only the status matters.
+class RGWDeleteRESTResourceCR : public RGWSimpleCoroutine {
+  RGWRESTConn *conn;
+  RGWHTTPManager *http_manager;
+  std::string path;
+  param_vec_t params;
+
+  boost::intrusive_ptr<RGWRESTDeleteResource> http_op;
+
+public:
+  RGWDeleteRESTResourceCR(CephContext *_cct, RGWRESTConn *_conn,
+                          RGWHTTPManager *_http_manager,
+                          const std::string& _path,
+                          rgw_http_param_pair *_params)
+    : RGWSimpleCoroutine(_cct), conn(_conn), http_manager(_http_manager),
+      path(_path), params(make_param_list(_params))
+  {}
+
+  ~RGWDeleteRESTResourceCR() override {
+    request_cleanup();
+  }
+
+  // Start the async DELETE (empty body); keep the ref in http_op on success.
+  int send_request(const DoutPrefixProvider *dpp) override {
+    auto op = boost::intrusive_ptr<RGWRESTDeleteResource>(
+        new RGWRESTDeleteResource(conn, path, params, nullptr, http_manager));
+
+    init_new_io(op.get());
+
+    bufferlist bl;
+
+    int ret = op->aio_send(dpp, bl);
+    if (ret < 0) {
+      ldpp_subdout(dpp, rgw, 0) << "ERROR: failed to send DELETE request" << dendl;
+      op->put();
+      return ret;
+    }
+    std::swap(http_op, op); // store reference in http_op on success
+    return 0;
+  }
+
+  int request_complete() override {
+    int ret;
+    bufferlist bl;
+    ret = http_op->wait(&bl, null_yield);
+    auto op = std::move(http_op); // release ref on return
+    if (ret < 0) {
+      error_stream << "http operation failed: " << op->to_str()
+                   << " status=" << op->get_http_status() << std::endl;
+      lsubdout(cct, rgw, 5) << "failed to wait for op, ret=" << ret
+                            << ": " << op->to_str() << dendl;
+      op->put();
+      return ret;
+    }
+    op->put();
+    return 0;
+  }
+
+  void request_cleanup() override {
+    if (http_op) {
+      http_op->put();
+      http_op = NULL;
+    }
+  }
+};
+
+// Bridges RGWHTTPStreamRWRequest's receive callback to a coroutine: buffers
+// incoming bytes (separating a fixed-size extra-data prefix), wakes the
+// coroutine when a window of data is ready, and pauses/unpauses the request
+// for flow control.  Implementation in rgw_cr_rest.cc.
+class RGWCRHTTPGetDataCB : public RGWHTTPStreamRWRequest::ReceiveCB {
+  ceph::mutex lock = ceph::make_mutex("RGWCRHTTPGetDataCB");
+  RGWCoroutinesEnv *env;
+  RGWCoroutine *cr;
+  RGWHTTPStreamRWRequest *req;
+  rgw_io_id io_id; // read|control channel used to wake `cr`
+  bufferlist data;       // buffered payload, guarded by `lock`
+  bufferlist extra_data; // prefix routed aside until complete
+  bool got_all_extra_data{false};
+  bool paused{false};
+  bool notified{false};
+public:
+  RGWCRHTTPGetDataCB(RGWCoroutinesEnv *_env, RGWCoroutine *_cr, RGWHTTPStreamRWRequest *_req);
+
+  int handle_data(bufferlist& bl, bool *pause) override;
+
+  void claim_data(bufferlist *dest, uint64_t max);
+
+  bufferlist& get_extra_data() {
+    return extra_data;
+  }
+
+  bool has_data() {
+    return (data.length() > 0);
+  }
+
+  bool has_all_extra_data() {
+    return got_all_extra_data;
+  }
+};
+
+
+// Abstract resumable streaming-read "coroutine frame": init() starts the
+// transfer, then read() is re-entered (it is reentrant, driven by read_state)
+// until data/EOF; decode_rest_obj()/get_attrs() expose source metadata.
+class RGWStreamReadResourceCRF {
+protected:
+  boost::asio::coroutine read_state;
+
+public:
+  virtual int init(const DoutPrefixProvider *dpp) = 0;
+  virtual int read(const DoutPrefixProvider *dpp, bufferlist *data, uint64_t max, bool *need_retry) = 0; /* reentrant */
+  virtual int decode_rest_obj(const DoutPrefixProvider *dpp, std::map<std::string, std::string>& headers, bufferlist& extra_data) = 0;
+  virtual bool has_attrs() = 0;
+  virtual void get_attrs(std::map<std::string, std::string> *attrs) = 0;
+  virtual ~RGWStreamReadResourceCRF() = default;
+};
+
+// Abstract resumable streaming-write frame: send_ready()/send() set up the
+// transfer, write() streams chunks (reentrant, driven by write_state), and
+// drain_writes() (reentrant, drain_state) flushes and finishes it.
+class RGWStreamWriteResourceCRF {
+protected:
+  boost::asio::coroutine write_state;
+  boost::asio::coroutine drain_state;
+
+public:
+  virtual int init() = 0;
+  virtual void send_ready(const DoutPrefixProvider *dpp, const rgw_rest_obj& rest_obj) = 0;
+  virtual int send() = 0;
+  virtual int write(bufferlist& data, bool *need_retry) = 0; /* reentrant */
+  virtual int drain_writes(bool *need_retry) = 0; /* reentrant */
+
+  virtual ~RGWStreamWriteResourceCRF() = default;
+};
+
+// HTTP implementation of the streaming read frame: owns an
+// RGWHTTPStreamRWRequest (set via set_req(), deleted in the dtor) and an
+// RGWCRHTTPGetDataCB that buffers its response body.
+class RGWStreamReadHTTPResourceCRF : public RGWStreamReadResourceCRF {
+  CephContext *cct;
+  RGWCoroutinesEnv *env;
+  RGWCoroutine *caller;
+  RGWHTTPManager *http_manager;
+
+  RGWHTTPStreamRWRequest *req{nullptr}; // owned once set (freed in dtor)
+
+  std::optional<RGWCRHTTPGetDataCB> in_cb;
+
+  bufferlist extra_data;
+
+  bool got_attrs{false};
+  bool got_extra_data{false};
+
+  rgw_io_id io_read_mask;
+
+protected:
+  rgw_rest_obj rest_obj;
+
+  struct range_info {
+    bool is_set{false};
+    uint64_t ofs;
+    uint64_t size;
+  } range;
+
+  ceph::real_time mtime;
+  std::string etag;
+
+public:
+  RGWStreamReadHTTPResourceCRF(CephContext *_cct,
+                               RGWCoroutinesEnv *_env,
+                               RGWCoroutine *_caller,
+                               RGWHTTPManager *_http_manager,
+                               const rgw_obj_key& _src_key) : cct(_cct),
+                                                              env(_env),
+                                                              caller(_caller),
+                                                              http_manager(_http_manager) {
+    rest_obj.init(_src_key);
+  }
+  ~RGWStreamReadHTTPResourceCRF();
+
+  int init(const DoutPrefixProvider *dpp) override;
+  int read(const DoutPrefixProvider *dpp, bufferlist *data, uint64_t max, bool *need_retry) override; /* reentrant */
+  int decode_rest_obj(const DoutPrefixProvider *dpp, std::map<std::string, std::string>& headers, bufferlist& extra_data) override;
+  bool has_attrs() override;
+  void get_attrs(std::map<std::string, std::string> *attrs) override;
+  bool is_done();
+  // Subclasses return true if the response carries a prefix that must be
+  // decoded before payload is surfaced (see read()).
+  virtual bool need_extra_data() { return false; }
+
+  void set_req(RGWHTTPStreamRWRequest *r) {
+    req = r;
+  }
+
+  rgw_rest_obj& get_rest_obj() {
+    return rest_obj;
+  }
+
+  void set_range(uint64_t ofs, uint64_t size) {
+    range.is_set = true;
+    range.ofs = ofs;
+    range.size = size;
+  }
+};
+
+// HTTP implementation of the streaming write frame: owns an outbound
+// RGWHTTPStreamRWRequest (set via set_req(), freed in dtor) and blocks the
+// writer coroutine when the request's pending-send backlog exceeds a window,
+// waking it from WriteDrainNotify as data drains.
+class RGWStreamWriteHTTPResourceCRF : public RGWStreamWriteResourceCRF {
+protected:
+  RGWCoroutinesEnv *env;
+  RGWCoroutine *caller;
+  RGWHTTPManager *http_manager;
+
+  using lock_guard = std::lock_guard<std::mutex>;
+
+  std::mutex blocked_lock;
+  bool is_blocked; // NOTE(review): not initialized here; set before first use in write()
+
+  RGWHTTPStreamRWRequest *req{nullptr};
+
+  struct multipart_info {
+    bool is_multipart{false};
+    std::string upload_id;
+    int part_num{0};
+    uint64_t part_size;
+  } multipart;
+
+  // Trampoline delivering the client's drain notification to this CRF.
+  class WriteDrainNotify : public RGWWriteDrainCB {
+    RGWStreamWriteHTTPResourceCRF *crf;
+  public:
+    explicit WriteDrainNotify(RGWStreamWriteHTTPResourceCRF *_crf) : crf(_crf) {}
+    void notify(uint64_t pending_size) override;
+  } write_drain_notify_cb;
+
+public:
+  RGWStreamWriteHTTPResourceCRF(CephContext *_cct,
+                                RGWCoroutinesEnv *_env,
+                                RGWCoroutine *_caller,
+                                RGWHTTPManager *_http_manager) : env(_env),
+                                                                 caller(_caller),
+                                                                 http_manager(_http_manager),
+                                                                 write_drain_notify_cb(this) {}
+  virtual ~RGWStreamWriteHTTPResourceCRF();
+
+  int init() override {
+    return 0;
+  }
+  void send_ready(const DoutPrefixProvider *dpp, const rgw_rest_obj& rest_obj) override;
+  int send() override;
+  int write(bufferlist& data, bool *need_retry) override; /* reentrant */
+  void write_drain_notify(uint64_t pending_size);
+  int drain_writes(bool *need_retry) override; /* reentrant */
+
+  // Hook for subclasses to inspect the final response headers.
+  virtual void handle_headers(const std::map<std::string, std::string>& headers) {}
+
+  void set_req(RGWHTTPStreamRWRequest *r) {
+    req = r;
+  }
+
+  void set_multipart(const std::string& upload_id, int part_num, uint64_t part_size) {
+    multipart.is_multipart = true;
+    multipart.upload_id = upload_id;
+    multipart.part_num = part_num;
+    multipart.part_size = part_size;
+  }
+};
+
+// Coroutine that splices a streaming read frame into a streaming write frame
+// (read 4 MiB chunks, forward attrs, write, drain).  See operate() in
+// rgw_cr_rest.cc.
+class RGWStreamSpliceCR : public RGWCoroutine {
+  CephContext *cct;
+  RGWHTTPManager *http_manager;
+  std::string url;
+  std::shared_ptr<RGWStreamReadHTTPResourceCRF> in_crf;
+  std::shared_ptr<RGWStreamWriteHTTPResourceCRF> out_crf;
+  bufferlist bl;          // current chunk in flight
+  bool need_retry{false}; // reentrancy flag for read/write/drain
+  bool sent_attrs{false};
+  uint64_t total_read{0};
+  int ret{0};
+public:
+  RGWStreamSpliceCR(CephContext *_cct, RGWHTTPManager *_mgr,
+                    std::shared_ptr<RGWStreamReadHTTPResourceCRF>& _in_crf,
+                    std::shared_ptr<RGWStreamWriteHTTPResourceCRF>& _out_crf);
+  ~RGWStreamSpliceCR();
+
+  int operate(const DoutPrefixProvider *dpp) override;
+};
diff --git a/src/rgw/rgw_crypt.cc b/src/rgw/rgw_crypt.cc
new file mode 100644
index 000000000..69b1b8bc6
--- /dev/null
+++ b/src/rgw/rgw_crypt.cc
@@ -0,0 +1,1537 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/**
+ * Crypto filters for Put/Post/Get operations.
+ */
+
+#include <string_view>
+
+#include <rgw/rgw_op.h>
+#include <rgw/rgw_crypt.h>
+#include <auth/Crypto.h>
+#include <rgw/rgw_b64.h>
+#include <rgw/rgw_rest_s3.h>
+#include "include/ceph_assert.h"
+#include "crypto/crypto_accel.h"
+#include "crypto/crypto_plugin.h"
+#include "rgw/rgw_kms.h"
+#include "rapidjson/document.h"
+#include "rapidjson/writer.h"
+#include "rapidjson/error/error.h"
+#include "rapidjson/error/en.h"
+#include <unicode/normalizer2.h> // libicu
+
+#include <openssl/evp.h>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+using namespace rgw;
+
+// Helper that owns an ICU NFC normalizer and provides (a) a comparison of
+// two JSON members by their NFC/UTF-32 canonical order and (b) in-place NFC
+// normalization of a rapidjson string value.  Used to build a canonical,
+// deterministic form of the SSE-KMS encryption context.
+template<typename M>
+class canonical_char_sorter {
+private:
+  const DoutPrefixProvider *dpp;
+  const icu::Normalizer2* normalizer;  // borrowed ICU singleton; nullptr on init failure
+  CephContext *cct;
+public:
+  canonical_char_sorter(const DoutPrefixProvider *dpp, CephContext *cct) : dpp(dpp), cct(cct) {
+    UErrorCode status = U_ZERO_ERROR;
+    // getNFCInstance returns a process-wide singleton; do not delete it.
+    normalizer = icu::Normalizer2::getNFCInstance(status);
+    if (U_FAILURE(status)) {
+      ldpp_dout(this->dpp, -1) << "ERROR: can't get nfc instance, error = " << status << dendl;
+      normalizer = 0;  // make_string_canonical() checks for this and fails cleanly
+    }
+  }
+  bool compare_helper (const M *, const M *);
+  bool make_string_canonical(rapidjson::Value &,
+                             rapidjson::Document::AllocatorType&);
+};
+
+// Compare two JSON members by the UTF-32 code-point order of their names.
+// Each name is decoded from UTF-8 and expanded to UTF-32 so that the
+// comparison is by code point, not by UTF-8 byte.  Returns true when a's
+// name sorts before b's.
+//
+// Fix: the previous implementation obtained the destination pointers from
+// std::u32string::c_str() and cast away constness to write through them --
+// modifying a string through c_str() is undefined behavior.  Use the
+// non-const data() accessor (C++17) instead; behavior is otherwise
+// unchanged.
+template<typename M>
+bool
+canonical_char_sorter<M>::compare_helper (const M*a, const M*b)
+{
+  UErrorCode status = U_ZERO_ERROR;
+  const std::string as{a->name.GetString(), a->name.GetStringLength()},
+        bs{b->name.GetString(), b->name.GetStringLength()};
+  icu::UnicodeString aw{icu::UnicodeString::fromUTF8(as)}, bw{icu::UnicodeString::fromUTF8(bs)};
+  int32_t afl{aw.countChar32()}, bfl{bw.countChar32()};
+  std::u32string af, bf;
+  af.resize(afl); bf.resize(bfl);
+  // UChar32 is int32_t; char32_t has the same size/representation, so the
+  // pointer reinterpretation below matches ICU's expected buffer type.
+  aw.toUTF32(reinterpret_cast<int32_t*>(af.data()), afl, status);
+  bw.toUTF32(reinterpret_cast<int32_t*>(bf.data()), bfl, status);
+  // NOTE(review): status is not checked here, as in the original; a failed
+  // conversion degrades to comparing whatever was written.
+  bool r{af < bf};
+  return r;
+}
+
+// Replace the rapidjson string value v, in place, with its NFC-normalized
+// form (allocated from a).  Returns false when the normalizer was not
+// available at construction time or ICU reports a conversion error.
+template<typename M>
+bool
+canonical_char_sorter<M>::make_string_canonical (rapidjson::Value &v, rapidjson::Document::AllocatorType&a)
+{
+  UErrorCode status = U_ZERO_ERROR;
+  const std::string as{v.GetString(), v.GetStringLength()};
+
+  if (!normalizer)
+    return false;
+  const icu::UnicodeString aw{icu::UnicodeString::fromUTF8(as)};
+  icu::UnicodeString an{normalizer->normalize(aw, status)};
+  if (U_FAILURE(status)) {
+    ldpp_dout(this->dpp, 5) << "conversion error; code=" << status <<
+      " on string " << as << dendl;
+    return false;
+  }
+  std::string ans;
+  an.toUTF8String(ans);
+  // SetString with an allocator copies the bytes, so ans may go out of scope.
+  v.SetString(ans.c_str(), ans.length(), a);
+  return true;
+}
+
+// Shorthand for a member (name/value pair) of a rapidjson document using the
+// default UTF-8 encoding and memory-pool allocator.
+typedef
+rapidjson::GenericMember<rapidjson::UTF8<>, rapidjson::MemoryPoolAllocator<> >
+MyMember;
+
+// Recursively serialize d through writer, emitting object members sorted by
+// canonical (NFC, UTF-32 code point) name order so that equal contexts
+// always serialize to identical bytes.  Returns false as soon as any writer
+// call or recursive step fails; the goto Done paths deliberately skip the
+// matching EndObject/EndArray since the output is already invalid.
+template<typename H>
+bool
+sort_and_write(rapidjson::Value &d, H &writer, canonical_char_sorter<MyMember>& ccs)
+{
+  bool r;
+  switch(d.GetType()) {
+  case rapidjson::kObjectType: {
+    // Adapter so std::sort can use the sorter's member comparison.
+    struct comparer {
+      canonical_char_sorter<MyMember> &r;
+      comparer(canonical_char_sorter<MyMember> &r) : r(r) {};
+      bool operator()(const MyMember*a, const MyMember*b) {
+        return r.compare_helper(a,b);
+      }
+    } cmp_functor{ccs};
+    if (!(r = writer.StartObject()))
+      break;
+    // Sort pointers to the members rather than the members themselves.
+    std::vector<MyMember*> q;
+    for (auto &m: d.GetObject())
+      q.push_back(&m);
+    std::sort(q.begin(), q.end(), cmp_functor);
+    for (auto m: q) {
+      assert(m->name.IsString());
+      if (!(r = writer.Key(m->name.GetString(), m->name.GetStringLength())))
+        goto Done;
+      if (!(r = sort_and_write(m->value, writer, ccs)))
+        goto Done;
+    }
+    r = writer.EndObject();
+    break; }
+  case rapidjson::kArrayType:
+    if (!(r = writer.StartArray()))
+      break;
+    for (auto &v: d.GetArray()) {
+      if (!(r = sort_and_write(v, writer, ccs)))
+        goto Done;
+    }
+    r = writer.EndArray();
+    break;
+  default:
+    // Scalars (string/number/bool/null) serialize directly.
+    r = d.Accept(writer);
+    break;
+  }
+Done:
+  return r;
+}
+
+// Flags for make_everything_canonical(): number_ok permits raw JSON numbers
+// in the context (disabled by default; see the note at the bottom of the
+// file about number canonicalization).
+enum struct mec_option {
+empty = 0, number_ok = 1
+};
+
+// Result of make_everything_canonical(): conversion = a string failed NFC
+// normalization; number = a raw number was found while number_ok is unset.
+enum struct mec_error {
+success = 0, conversion, number
+};
+
+// Recursively NFC-normalize every string in d -- both member names and
+// string values -- using ccs.  Rejects raw JSON numbers unless
+// mec_option::number_ok is set in f, because floating point numbers have no
+// portable canonical text form.  Returns mec_error::success or the first
+// error encountered.
+mec_error
+make_everything_canonical(rapidjson::Value &d, rapidjson::Document::AllocatorType&a, canonical_char_sorter<MyMember>& ccs, mec_option f = mec_option::empty )
+{
+  mec_error r;
+  switch(d.GetType()) {
+  case rapidjson::kObjectType:
+    for (auto &m: d.GetObject()) {
+      assert(m.name.IsString());
+      // Normalize the key itself, then recurse into the value.
+      if (!ccs.make_string_canonical(m.name, a)) {
+        r = mec_error::conversion;
+        goto Error;
+      }
+      if ((r = make_everything_canonical(m.value, a, ccs, f)) != mec_error::success)
+        goto Error;
+    }
+    break;
+  case rapidjson::kArrayType:
+    for (auto &v: d.GetArray()) {
+      if ((r = make_everything_canonical(v, a, ccs, f)) != mec_error::success)
+        goto Error;
+    }
+    break;
+  case rapidjson::kStringType:
+    if (!ccs.make_string_canonical(d, a)) {
+      r = mec_error::conversion;
+      goto Error;
+    }
+    break;
+  case rapidjson::kNumberType:
+    if (static_cast<int>(f) & static_cast<int>(mec_option::number_ok))
+      break;
+    r = mec_error::number;
+    goto Error;
+  default:
+    // bool / null need no canonicalization.
+    break;
+  }
+  r = mec_error::success;
+Error:
+  return r;
+}
+
+// Ensure the KMS encryption context d carries the object's ARN under the
+// well-known "aws:s3:arn" key.  A caller-supplied ARN is left untouched.
+// Returns false when d is not a JSON object.
+bool
+add_object_to_context(rgw_obj &obj, rapidjson::Document &d)
+{
+  static const char aws_s3_arn[] = "aws:s3:arn";
+
+  // The context must be a JSON object before we can add members.
+  if (!d.IsObject())
+    return false;
+  // An explicit ARN provided by the caller wins; nothing to do.
+  if (d.HasMember(aws_s3_arn))
+    return true;
+
+  const std::string arn_str{ ARN{obj}.to_string() };
+  rapidjson::Document::AllocatorType &alloc = d.GetAllocator();
+  rapidjson::Value key, value;
+  // Both strings are copied into the document's allocator, so the locals
+  // may safely go out of scope.
+  key.SetString(aws_s3_arn, sizeof(aws_s3_arn) - 1, alloc);
+  value.SetString(arn_str.c_str(), arn_str.length(), alloc);
+  d.AddMember(key, value, alloc);
+  return true;
+}
+
+// Return the request user's tenant, or -- for non-multitenant users whose
+// tenant is empty -- the bare user id.  Used as the bucket-tenant component
+// when building the object ARN.
+static inline const std::string &
+get_tenant_or_id(req_state *s)
+{
+  const std::string &tenant = s->user->get_tenant();
+  return tenant.empty() ? s->user->get_id().id : tenant;
+}
+
+// Decode the caller-provided base64 SSE-KMS encryption context, inject the
+// object ARN, canonicalize every string (NFC) and serialize with sorted
+// keys, then re-encode to base64 into cooked_context.  Any malformed input
+// yields -ERR_INVALID_REQUEST with a message in s->err; returns 0 on
+// success.
+int
+make_canonical_context(req_state *s,
+                       std::string_view &context,
+                       std::string &cooked_context)
+{
+  rapidjson::Document d;
+  bool b = false;
+mec_option options {
+//mec_option::number_ok : SEE BOTTOM OF FILE
+mec_option::empty };
+  rgw_obj obj;
+  std::ostringstream oss;
+  canonical_char_sorter<MyMember> ccs{s, s->cct};
+
+  // Identify the object being encrypted; used to build its ARN.
+  obj.bucket.tenant = get_tenant_or_id(s);
+  obj.bucket.name = s->bucket->get_name();
+  obj.key.name = s->object->get_name();
+  std::string iline;
+  rapidjson::Document::AllocatorType &allocator { d.GetAllocator() };
+
+  try {
+    iline = rgw::from_base64(context);
+  } catch (const std::exception& e) {
+    oss << "bad context: " << e.what();
+    s->err.message = oss.str();
+    return -ERR_INVALID_REQUEST;
+  }
+  rapidjson::StringStream isw(iline.c_str());
+  // An absent context canonicalizes to an empty JSON object.
+  if (!iline.length())
+    d.SetObject();
+// else if (qflag) SEE BOTTOM OF FILE
+//    d.ParseStream<rapidjson::kParseNumbersAsStringsFlag>(isw);
+  else
+    d.ParseStream<rapidjson::kParseFullPrecisionFlag>(isw);
+  // Reject trailing garbage after the JSON value.
+  if (isw.Tell() != iline.length()) {
+    oss << "bad context: did not consume all of input: @ "
+        << isw.Tell();
+    s->err.message = oss.str();
+    return -ERR_INVALID_REQUEST;
+  }
+  if (d.HasParseError()) {
+    oss << "bad context: parse error: @ " << d.GetErrorOffset()
+        << " " << rapidjson::GetParseError_En(d.GetParseError());
+    s->err.message = oss.str();
+    return -ERR_INVALID_REQUEST;
+  }
+  rapidjson::StringBuffer buf;
+  rapidjson::Writer<rapidjson::StringBuffer> writer(buf);
+  if (!add_object_to_context(obj, d)) {
+    ldpp_dout(s, -1) << "ERROR: can't add default value to context" << dendl;
+    s->err.message = "context: internal error adding defaults";
+    return -ERR_INVALID_REQUEST;
+  }
+  b = make_everything_canonical(d, allocator, ccs, options) == mec_error::success;
+  if (!b) {
+    ldpp_dout(s, -1) << "ERROR: can't make canonical json <"
+                     << context << ">" << dendl;
+    s->err.message = "context: can't make canonical";
+    return -ERR_INVALID_REQUEST;
+  }
+  b = sort_and_write(d, writer, ccs);
+  if (!b) {
+    ldpp_dout(s, 5) << "format error <" << context
+                    << ">: partial.results=" << buf.GetString() << dendl;
+    s->err.message = "unable to reformat json";
+    return -ERR_INVALID_REQUEST;
+  }
+  cooked_context = rgw::to_base64(buf.GetString());
+  return 0;
+}
+
+
+// Load and instantiate the crypto accelerator plugin configured via
+// plugin_crypto_accelerator.  Returns nullptr when the plugin cannot be
+// loaded or its factory reports an error.
+CryptoAccelRef get_crypto_accel(const DoutPrefixProvider* dpp, CephContext *cct)
+{
+  const string accel_type = cct->_conf->plugin_crypto_accelerator;
+  auto *plugin = dynamic_cast<CryptoPlugin*>(
+      cct->get_plugin_registry()->get_with_load("crypto", accel_type));
+  if (plugin == nullptr) {
+    ldpp_dout(dpp, -1) << __func__ << " cannot load crypto accelerator of type " << accel_type << dendl;
+    return nullptr;
+  }
+
+  CryptoAccelRef impl = nullptr;
+  stringstream err_desc;
+  const int err = plugin->factory(&impl, &err_desc);
+  if (err) {
+    // On factory failure impl stays null; the caller treats that as
+    // "no accelerator available" and falls back to the EVP path.
+    ldpp_dout(dpp, -1) << __func__ << " factory return error " << err <<
+      " with description: " << err_desc.str() << dendl;
+  }
+  return impl;
+}
+
+
+// One-shot symmetric transform via OpenSSL EVP with padding disabled.
+// Encrypts or decrypts exactly `size` bytes from `in` to `out` using the
+// given cipher `type`, `key` and `iv`.  KeySizeV/IvSizeV are checked against
+// the cipher at runtime; IvSizeV == 0 selects IV-less ciphers such as
+// AES-256-ECB.  Because padding is off, `size` must be a multiple of the
+// cipher block size; returns false on any EVP failure.
+template <std::size_t KeySizeV, std::size_t IvSizeV>
+static inline
+bool evp_sym_transform(const DoutPrefixProvider* dpp,
+                       CephContext* const cct,
+                       const EVP_CIPHER* const type,
+                       unsigned char* const out,
+                       const unsigned char* const in,
+                       const size_t size,
+                       const unsigned char* const iv,
+                       const unsigned char* const key,
+                       const bool encrypt)
+{
+  // RAII for the cipher context; freed on every exit path.
+  using pctx_t = \
+    std::unique_ptr<EVP_CIPHER_CTX, decltype(&::EVP_CIPHER_CTX_free)>;
+  pctx_t pctx{ EVP_CIPHER_CTX_new(), EVP_CIPHER_CTX_free };
+
+  if (!pctx) {
+    return false;
+  }
+
+  // Stage 1: select the cipher so its parameters can be validated before
+  // the key/IV are supplied.
+  if (1 != EVP_CipherInit_ex(pctx.get(), type, nullptr,
+                             nullptr, nullptr, encrypt)) {
+    ldpp_dout(dpp, 5) << "EVP: failed to 1st initialization stage" << dendl;
+    return false;
+  }
+
+  // we want to support ciphers that don't use IV at all like AES-256-ECB
+  if constexpr (static_cast<bool>(IvSizeV)) {
+    ceph_assert(EVP_CIPHER_CTX_iv_length(pctx.get()) == IvSizeV);
+    ceph_assert(EVP_CIPHER_CTX_block_size(pctx.get()) == IvSizeV);
+  }
+  ceph_assert(EVP_CIPHER_CTX_key_length(pctx.get()) == KeySizeV);
+
+  // Stage 2: install key and IV.
+  if (1 != EVP_CipherInit_ex(pctx.get(), nullptr, nullptr, key, iv, encrypt)) {
+    ldpp_dout(dpp, 5) << "EVP: failed to 2nd initialization stage" << dendl;
+    return false;
+  }
+
+  // disable padding
+  if (1 != EVP_CIPHER_CTX_set_padding(pctx.get(), 0)) {
+    ldpp_dout(dpp, 5) << "EVP: cannot disable PKCS padding" << dendl;
+    return false;
+  }
+
+  // operate!
+  int written = 0;
+  // EVP_CipherUpdate takes an int length; guard against size_t truncation.
+  ceph_assert(size <= static_cast<size_t>(std::numeric_limits<int>::max()));
+  if (1 != EVP_CipherUpdate(pctx.get(), out, &written, in, size)) {
+    ldpp_dout(dpp, 5) << "EVP: EVP_CipherUpdate failed" << dendl;
+    return false;
+  }
+
+  int finally_written = 0;
+  static_assert(sizeof(*out) == 1);
+  if (1 != EVP_CipherFinal_ex(pctx.get(), out + written, &finally_written)) {
+    ldpp_dout(dpp, 5) << "EVP: EVP_CipherFinal_ex failed" << dendl;
+    return false;
+  }
+
+  // padding is disabled so EVP_CipherFinal_ex should not append anything
+  ceph_assert(finally_written == 0);
+  return (written + finally_written) == static_cast<int>(size);
+}
+
+
+/**
+ * Encryption in CBC mode. Chunked to 4K blocks. Offset is used as IV for each 4K block.
+ *
+ *
+ *
+ * A. Encryption
+ * 1. Input is split to 4K chunks + remainder in one, smaller chunk
+ * 2. Each full chunk is encrypted separately with CBC chained mode, with initial IV derived from offset
+ * 3. Last chunk is 16*m + n.
+ * 4. 16*m bytes are encrypted with CBC chained mode, with initial IV derived from offset
+ * 5. Last n bytes are xor-ed with pattern obtained by CBC encryption of
+ * last encrypted 16 byte block [16m-16, 16m-1] with IV = {0}.
+ * 6. (Special case) If m == 0 then last n bytes are xor-ed with pattern
+ * obtained by CBC encryption of {0} with IV derived from offset
+ *
+ * B. Decryption
+ * 1. Input is split to 4K chunks + remainder in one, smaller chunk
+ * 2. Each full chunk is decrypted separately with CBC chained mode, with initial IV derived from offset
+ * 3. Last chunk is 16*m + n.
+ * 4. 16*m bytes are decrypted with CBC chained mode, with initial IV derived from offset
+ * 5. Last n bytes are xor-ed with pattern obtained by CBC ENCRYPTION of
+ * last (still encrypted) 16 byte block [16m-16, 16m-1] with IV = {0}
+ * 6. (Special case) If m == 0 then last n bytes are xor-ed with pattern
+ * obtained by CBC ENCRYPTION of {0} with IV derived from offset
+ */
+class AES_256_CBC : public BlockCrypt {
+public:
+  static const size_t AES_256_KEYSIZE = 256 / 8;
+  static const size_t AES_256_IVSIZE = 128 / 8;
+  static const size_t CHUNK_SIZE = 4096;   // independent-IV unit; see scheme above
+  const DoutPrefixProvider* dpp;
+private:
+  static const uint8_t IV[AES_256_IVSIZE];  // constant base IV; per-block IV = IV + block index
+  CephContext* cct;
+  uint8_t key[AES_256_KEYSIZE];             // raw AES key; zeroized in dtor
+public:
+  explicit AES_256_CBC(const DoutPrefixProvider* dpp, CephContext* cct): dpp(dpp), cct(cct) {
+  }
+  ~AES_256_CBC() {
+    // Scrub key material before the storage is reused.
+    ::ceph::crypto::zeroize_for_security(key, AES_256_KEYSIZE);
+  }
+  // Copy in the AES key; rejects any length other than 32 bytes.
+  bool set_key(const uint8_t* _key, size_t key_size) {
+    if (key_size != AES_256_KEYSIZE) {
+      return false;
+    }
+    memcpy(key, _key, AES_256_KEYSIZE);
+    return true;
+  }
+  size_t get_block_size() {
+    return CHUNK_SIZE;
+  }
+
+  // Single CBC pass over `size` bytes with an explicit IV (software EVP path).
+  bool cbc_transform(unsigned char* out,
+                     const unsigned char* in,
+                     const size_t size,
+                     const unsigned char (&iv)[AES_256_IVSIZE],
+                     const unsigned char (&key)[AES_256_KEYSIZE],
+                     bool encrypt)
+  {
+    return evp_sym_transform<AES_256_KEYSIZE, AES_256_IVSIZE>(
+      dpp, cct, EVP_aes_256_cbc(), out, in, size, iv, key, encrypt);
+  }
+
+  // Chunked CBC pass: the input is processed in CHUNK_SIZE pieces, each with
+  // an IV derived from its absolute stream offset, optionally through a
+  // hardware accelerator.  A failed accelerator lookup is remembered
+  // process-wide so it is attempted only once.
+  bool cbc_transform(unsigned char* out,
+                     const unsigned char* in,
+                     size_t size,
+                     off_t stream_offset,
+                     const unsigned char (&key)[AES_256_KEYSIZE],
+                     bool encrypt)
+  {
+    static std::atomic<bool> failed_to_get_crypto(false);
+    CryptoAccelRef crypto_accel;
+    if (! failed_to_get_crypto.load())
+    {
+      crypto_accel = get_crypto_accel(this->dpp, cct);
+      if (!crypto_accel)
+        failed_to_get_crypto = true;
+    }
+    bool result = true;
+    unsigned char iv[AES_256_IVSIZE];
+    for (size_t offset = 0; result && (offset < size); offset += CHUNK_SIZE) {
+      size_t process_size = offset + CHUNK_SIZE <= size ? CHUNK_SIZE : size - offset;
+      prepare_iv(iv, stream_offset + offset);
+      if (crypto_accel != nullptr) {
+        if (encrypt) {
+          result = crypto_accel->cbc_encrypt(out + offset, in + offset,
+                                             process_size, iv, key);
+        } else {
+          result = crypto_accel->cbc_decrypt(out + offset, in + offset,
+                                             process_size, iv, key);
+        }
+      } else {
+        result = cbc_transform(
+          out + offset, in + offset, process_size,
+          iv, key, encrypt);
+      }
+    }
+    return result;
+  }
+
+
+  // Encrypt `size` bytes of input starting at in_ofs into output, where
+  // stream_offset is the absolute object offset of the first byte (steps
+  // A.1-A.6 of the scheme documented above this class).
+  bool encrypt(bufferlist& input,
+               off_t in_ofs,
+               size_t size,
+               bufferlist& output,
+               off_t stream_offset)
+  {
+    bool result = false;
+    // Split into a 16-byte-aligned bulk and an unaligned tail.
+    size_t aligned_size = size / AES_256_IVSIZE * AES_256_IVSIZE;
+    size_t unaligned_rest_size = size - aligned_size;
+    output.clear();
+    buffer::ptr buf(aligned_size + AES_256_IVSIZE);
+    unsigned char* buf_raw = reinterpret_cast<unsigned char*>(buf.c_str());
+    const unsigned char* input_raw = reinterpret_cast<const unsigned char*>(input.c_str());
+
+    /* encrypt main bulk of data */
+    result = cbc_transform(buf_raw,
+                           input_raw + in_ofs,
+                           aligned_size,
+                           stream_offset, key, true);
+    if (result && (unaligned_rest_size > 0)) {
+      /* remainder to encrypt */
+      if (aligned_size % CHUNK_SIZE > 0) {
+        /* use last chunk for unaligned part */
+        // Keystream = CBC-encrypt of the last ciphertext block (step A.5).
+        unsigned char iv[AES_256_IVSIZE] = {0};
+        result = cbc_transform(buf_raw + aligned_size,
+                               buf_raw + aligned_size - AES_256_IVSIZE,
+                               AES_256_IVSIZE,
+                               iv, key, true);
+      } else {
+        /* 0 full blocks in current chunk, use IV as base for unaligned part */
+        // Keystream = CBC-encrypt of the offset-derived IV (step A.6).
+        unsigned char iv[AES_256_IVSIZE] = {0};
+        unsigned char data[AES_256_IVSIZE];
+        prepare_iv(data, stream_offset + aligned_size);
+        result = cbc_transform(buf_raw + aligned_size,
+                               data,
+                               AES_256_IVSIZE,
+                               iv, key, true);
+      }
+      if (result) {
+        // XOR the tail plaintext with the generated keystream.
+        for(size_t i = aligned_size; i < size; i++) {
+          *(buf_raw + i) ^= *(input_raw + in_ofs + i);
+        }
+      }
+    }
+    if (result) {
+      ldpp_dout(this->dpp, 25) << "Encrypted " << size << " bytes"<< dendl;
+      buf.set_length(size);
+      output.append(buf);
+    } else {
+      ldpp_dout(this->dpp, 5) << "Failed to encrypt" << dendl;
+    }
+    return result;
+  }
+
+
+  // Decrypt `size` bytes of input starting at in_ofs into output (steps
+  // B.1-B.6 above).  Note the tail keystream is produced by ENCRYPTING the
+  // last still-encrypted block, exactly mirroring how encrypt() built it.
+  bool decrypt(bufferlist& input,
+               off_t in_ofs,
+               size_t size,
+               bufferlist& output,
+               off_t stream_offset)
+  {
+    bool result = false;
+    size_t aligned_size = size / AES_256_IVSIZE * AES_256_IVSIZE;
+    size_t unaligned_rest_size = size - aligned_size;
+    output.clear();
+    buffer::ptr buf(aligned_size + AES_256_IVSIZE);
+    unsigned char* buf_raw = reinterpret_cast<unsigned char*>(buf.c_str());
+    unsigned char* input_raw = reinterpret_cast<unsigned char*>(input.c_str());
+
+    /* decrypt main bulk of data */
+    result = cbc_transform(buf_raw,
+                           input_raw + in_ofs,
+                           aligned_size,
+                           stream_offset, key, false);
+    if (result && unaligned_rest_size > 0) {
+      /* remainder to decrypt */
+      if (aligned_size % CHUNK_SIZE > 0) {
+        /*use last chunk for unaligned part*/
+        // encrypt=true is intentional here (step B.5): the keystream comes
+        // from CBC-encrypting the last ciphertext block.
+        unsigned char iv[AES_256_IVSIZE] = {0};
+        result = cbc_transform(buf_raw + aligned_size,
+                               input_raw + in_ofs + aligned_size - AES_256_IVSIZE,
+                               AES_256_IVSIZE,
+                               iv, key, true);
+      } else {
+        /* 0 full blocks in current chunk, use IV as base for unaligned part */
+        unsigned char iv[AES_256_IVSIZE] = {0};
+        unsigned char data[AES_256_IVSIZE];
+        prepare_iv(data, stream_offset + aligned_size);
+        result = cbc_transform(buf_raw + aligned_size,
+                               data,
+                               AES_256_IVSIZE,
+                               iv, key, true);
+      }
+      if (result) {
+        for(size_t i = aligned_size; i < size; i++) {
+          *(buf_raw + i) ^= *(input_raw + in_ofs + i);
+        }
+      }
+    }
+    if (result) {
+      ldpp_dout(this->dpp, 25) << "Decrypted " << size << " bytes"<< dendl;
+      buf.set_length(size);
+      output.append(buf);
+    } else {
+      ldpp_dout(this->dpp, 5) << "Failed to decrypt" << dendl;
+    }
+    return result;
+  }
+
+
+  // Derive the IV for the 16-byte block containing `offset`: the static base
+  // IV plus the block index, added big-endian byte by byte with carry.
+  void prepare_iv(unsigned char (&iv)[AES_256_IVSIZE], off_t offset) {
+    off_t index = offset / AES_256_IVSIZE;
+    off_t i = AES_256_IVSIZE - 1;
+    unsigned int val;
+    unsigned int carry = 0;
+    while (i>=0) {
+      val = (index & 0xff) + IV[i] + carry;
+      iv[i] = val;
+      carry = val >> 8;
+      index = index >> 8;
+      i--;
+    }
+  }
+};
+
+
+// Factory for an AES-256-CBC BlockCrypt over the given key.
+//
+// Fix: the original ignored `len` and unconditionally copied
+// AES_256_KEYSIZE bytes from `key`, which reads out of bounds when the
+// caller supplies a shorter buffer.  Validate the length through set_key()
+// and return nullptr on mismatch; callers passing a correctly sized key are
+// unaffected.
+std::unique_ptr<BlockCrypt> AES_256_CBC_create(const DoutPrefixProvider* dpp, CephContext* cct, const uint8_t* key, size_t len)
+{
+  auto cbc = std::unique_ptr<AES_256_CBC>(new AES_256_CBC(dpp, cct));
+  if (!cbc->set_key(key, len)) {
+    ldpp_dout(dpp, 5) << "Key size must be 256 bits long" << dendl;
+    return nullptr;
+  }
+  return cbc;
+}
+
+
+// Constant base IV; prepare_iv() adds the 16-byte block index to this value
+// to derive the per-block IV.
+const uint8_t AES_256_CBC::IV[AES_256_CBC::AES_256_IVSIZE] =
+  { 'a', 'e', 's', '2', '5', '6', 'i', 'v', '_', 'c', 't', 'r', '1', '3', '3', '7' };
+
+
+// One-shot AES-256-ECB encryption of data_size bytes (used for key
+// wrapping/derivation, not object data).  Rejects any key that is not
+// exactly 256 bits; returns false on rejection or EVP failure.
+bool AES_256_ECB_encrypt(const DoutPrefixProvider* dpp,
+                         CephContext* cct,
+                         const uint8_t* key,
+                         size_t key_size,
+                         const uint8_t* data_in,
+                         uint8_t* data_out,
+                         size_t data_size)
+{
+  // Guard clause instead of the original if/else: fail fast on a bad key.
+  if (key_size != AES_256_KEYSIZE) {
+    ldpp_dout(dpp, 5) << "Key size must be 256 bits long" << dendl;
+    return false;
+  }
+  return evp_sym_transform<AES_256_KEYSIZE, 0 /* no IV in ECB */>(
+    dpp, cct, EVP_aes_256_ecb(), data_out, data_in, data_size,
+    nullptr /* no IV in ECB */, key, true /* encrypt */);
+}
+
+
+// GET-path filter that decrypts data as it flows to `next`.  `parts_len`
+// holds the plaintext size of each multipart part (empty for single-part
++// objects); the filter's chunking granularity comes from the BlockCrypt.
+RGWGetObj_BlockDecrypt::RGWGetObj_BlockDecrypt(const DoutPrefixProvider *dpp,
+                                               CephContext* cct,
+                                               RGWGetObj_Filter* next,
+                                               std::unique_ptr<BlockCrypt> crypt,
+                                               std::vector<size_t> parts_len)
+    :
+    RGWGetObj_Filter(next),
+    dpp(dpp),
+    cct(cct),
+    crypt(std::move(crypt)),
+    enc_begin_skip(0),
+    ofs(0),
+    end(0),
+    cache(),
+    parts_len(std::move(parts_len))
+{
+  // Cache the crypt's chunk size; all range fixups align to it.
+  block_size = this->crypt->get_block_size();
+}
+
+// Nothing to release explicitly; members clean up via RAII.
+RGWGetObj_BlockDecrypt::~RGWGetObj_BlockDecrypt() {
+}
+
+// Decode an RGWObjManifest and append the byte length of every multipart
+// part to parts_len.  A new part begins whenever the iterator is at stripe
+// 0; stripe sizes within a part are summed.  Returns 0, or -EIO when the
+// manifest cannot be decoded; an empty manifest leaves parts_len untouched.
+int RGWGetObj_BlockDecrypt::read_manifest_parts(const DoutPrefixProvider *dpp,
+                                                const bufferlist& manifest_bl,
+                                                std::vector<size_t>& parts_len)
+{
+  RGWObjManifest manifest;
+  if (manifest_bl.length()) {
+    auto miter = manifest_bl.cbegin();
+    try {
+      decode(manifest, miter);
+    } catch (buffer::error& err) {
+      ldpp_dout(dpp, 0) << "ERROR: couldn't decode manifest" << dendl;
+      return -EIO;
+    }
+    RGWObjManifest::obj_iterator mi;
+    for (mi = manifest.obj_begin(dpp); mi != manifest.obj_end(dpp); ++mi) {
+      // Stripe 0 marks the start of a new part.
+      if (mi.get_cur_stripe() == 0) {
+        parts_len.push_back(0);
+      }
+      parts_len.back() += mi.get_stripe_size();
+    }
+    for (size_t i = 0; i<parts_len.size(); i++) {
+      ldpp_dout(dpp, 20) << "Manifest part " << i << ", size=" << parts_len[i] << dendl;
+    }
+  }
+  return 0;
+}
+
+// Widen the requested byte range [bl_ofs, bl_end] so that it covers whole
+// crypto blocks (and, for multipart objects, respects part boundaries since
+// each part was encrypted with stream offsets restarting at 0).  Records the
+// originally requested range in ofs/end/enc_begin_skip so handle_data() can
+// trim the decrypted output back down.  block_size is a power of two.
+int RGWGetObj_BlockDecrypt::fixup_range(off_t& bl_ofs, off_t& bl_end) {
+  off_t inp_ofs = bl_ofs;
+  off_t inp_end = bl_end;
+  if (parts_len.size() > 0) {
+    off_t in_ofs = bl_ofs;
+    off_t in_end = bl_end;
+
+    // Locate the part containing the range start; in_ofs becomes the offset
+    // within part i.
+    size_t i = 0;
+    while (i<parts_len.size() && (in_ofs >= (off_t)parts_len[i])) {
+      in_ofs -= parts_len[i];
+      i++;
+    }
+    //in_ofs is inside block i
+    size_t j = 0;
+    while (j<(parts_len.size() - 1) && (in_end >= (off_t)parts_len[j])) {
+      in_end -= parts_len[j];
+      j++;
+    }
+    //in_end is inside part j, OR j is the last part
+
+    // Round the end up to a block boundary, clamped to the end of part j.
+    size_t rounded_end = ( in_end & ~(block_size - 1) ) + (block_size - 1);
+    if (rounded_end > parts_len[j]) {
+      rounded_end = parts_len[j] - 1;
+    }
+
+    enc_begin_skip = in_ofs & (block_size - 1);
+    ofs = bl_ofs - enc_begin_skip;
+    end = bl_end;
+    bl_end += rounded_end - in_end;
+    bl_ofs = std::min(bl_ofs - enc_begin_skip, bl_end);
+  }
+  else
+  {
+    // Single-part object: simply round the range outward to block bounds.
+    enc_begin_skip = bl_ofs & (block_size - 1);
+    ofs = bl_ofs & ~(block_size - 1);
+    end = bl_end;
+    bl_ofs = bl_ofs & ~(block_size - 1);
+    bl_end = ( bl_end & ~(block_size - 1) ) + (block_size - 1);
+  }
+  ldpp_dout(this->dpp, 20) << "fixup_range [" << inp_ofs << "," << inp_end
+                           << "] => [" << bl_ofs << "," << bl_end << "]" << dendl;
+  return 0;
+}
+
+// Decrypt the first `size` bytes of `in` (at stream offset part_ofs within
+// the current part), forward the slice the client actually asked for to the
+// next filter, and drop the consumed bytes from `in`.  enc_begin_skip /
+// ofs / end trim the over-read introduced by fixup_range().
+int RGWGetObj_BlockDecrypt::process(bufferlist& in, size_t part_ofs, size_t size)
+{
+  bufferlist data;
+  if (!crypt->decrypt(in, 0, size, data, part_ofs)) {
+    return -ERR_INTERNAL_ERROR;
+  }
+  // Clip to the originally requested [ofs, end] window.
+  off_t send_size = size - enc_begin_skip;
+  if (ofs + enc_begin_skip + send_size > end + 1) {
+    send_size = end + 1 - ofs - enc_begin_skip;
+  }
+  int res = next->handle_data(data, enc_begin_skip, send_size);
+  enc_begin_skip = 0;  // only the first flush skips a partial block
+  ofs += size;
+  in.splice(0, size);  // discard what we just consumed
+  return res;
+}
+
+// Accumulate incoming ciphertext in `cache`, then decrypt and forward any
+// data that completes a multipart part (any alignment) or a whole number of
+// crypto blocks.  The unaligned remainder stays cached for the next call or
+// for flush().
+int RGWGetObj_BlockDecrypt::handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) {
+  ldpp_dout(this->dpp, 25) << "Decrypt " << bl_len << " bytes" << dendl;
+  bl.begin(bl_ofs).copy(bl_len, cache);
+
+  int res = 0;
+  // Translate the absolute offset `ofs` into an offset within the current
+  // part; each part was encrypted with stream offsets starting at 0.
+  size_t part_ofs = ofs;
+  for (size_t part : parts_len) {
+    if (part_ofs >= part) {
+      part_ofs -= part;
+    } else if (part_ofs + cache.length() >= part) {
+      // flush data up to part boundaries, aligned or not
+      res = process(cache, part_ofs, part - part_ofs);
+      if (res < 0) {
+        return res;
+      }
+      part_ofs = 0;
+    } else {
+      break;
+    }
+  }
+  // write up to block boundaries, aligned only
+  off_t aligned_size = cache.length() & ~(block_size - 1);
+  if (aligned_size > 0) {
+    res = process(cache, part_ofs, aligned_size);
+  }
+  return res;
+}
+
+/**
+ * flush remainder of data to output
+ *
+ * Called at end of stream: same part-boundary walk as handle_data(), but the
+ * final remainder is decrypted even when it is not block-aligned.
+ */
+int RGWGetObj_BlockDecrypt::flush() {
+  ldpp_dout(this->dpp, 25) << "Decrypt flushing " << cache.length() << " bytes" << dendl;
+  int res = 0;
+  size_t part_ofs = ofs;
+  for (size_t part : parts_len) {
+    if (part_ofs >= part) {
+      part_ofs -= part;
+    } else if (part_ofs + cache.length() >= part) {
+      // flush data up to part boundaries, aligned or not
+      res = process(cache, part_ofs, part - part_ofs);
+      if (res < 0) {
+        return res;
+      }
+      part_ofs = 0;
+    } else {
+      break;
+    }
+  }
+  // flush up to block boundaries, aligned or not
+  if (cache.length() > 0) {
+    res = process(cache, part_ofs, cache.length());
+  }
+  return res;
+}
+
+// PUT-path pipe stage that encrypts data before passing it downstream.
+// Chunking granularity (block_size) comes from the supplied BlockCrypt.
+RGWPutObj_BlockEncrypt::RGWPutObj_BlockEncrypt(const DoutPrefixProvider *dpp,
+                                               CephContext* cct,
+                                               rgw::sal::DataProcessor *next,
+                                               std::unique_ptr<BlockCrypt> crypt)
+  : Pipe(next),
+    dpp(dpp),
+    cct(cct),
+    crypt(std::move(crypt)),
+    block_size(this->crypt->get_block_size())
+{
+}
+
+// Buffer plaintext in `cache`, encrypting and forwarding whole crypto
+// blocks.  A zero-length input is the end-of-stream signal: the remaining
+// (possibly unaligned) tail is encrypted and a 0-sized call is propagated
+// downstream.
+int RGWPutObj_BlockEncrypt::process(bufferlist&& data, uint64_t logical_offset)
+{
+  ldpp_dout(this->dpp, 25) << "Encrypt " << data.length() << " bytes" << dendl;
+
+  // adjust logical offset to beginning of cached data
+  ceph_assert(logical_offset >= cache.length());
+  logical_offset -= cache.length();
+
+  const bool flush = (data.length() == 0);
+  cache.claim_append(data);
+
+  // Normally only whole blocks are processed; on flush, everything goes.
+  uint64_t proc_size = cache.length() & ~(block_size - 1);
+  if (flush) {
+    proc_size = cache.length();
+  }
+  if (proc_size > 0) {
+    bufferlist in, out;
+    cache.splice(0, proc_size, &in);
+    if (!crypt->encrypt(in, 0, proc_size, out, logical_offset)) {
+      return -ERR_INTERNAL_ERROR;
+    }
+    int r = Pipe::process(std::move(out), logical_offset);
+    logical_offset += proc_size;
+    if (r < 0)
+      return r;
+  }
+
+  if (flush) {
+    /*replicate 0-sized handle_data*/
+    return Pipe::process({}, logical_offset);
+  }
+  return 0;
+}
+
+
+// Produce AES_256_KEYSIZE bytes of cryptographically random data, used as a
+// per-object selector when deriving the actual data key from a KMS key.
+std::string create_random_key_selector(CephContext * const cct) {
+  char selector[AES_256_KEYSIZE];
+  cct->random()->get_bytes(selector, sizeof(selector));
+  return std::string(selector, sizeof(selector));
+}
+
+// Indices into crypt_options[]; one per SSE-related request header/POST
+// field.  X_AMZ_SERVER_SIDE_ENCRYPTION_LAST is a count sentinel checked by
+// a static_assert in CryptAttributes::get().
+typedef enum {
+  X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM=0,
+  X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY,
+  X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5,
+  X_AMZ_SERVER_SIDE_ENCRYPTION,
+  X_AMZ_SERVER_SIDE_ENCRYPTION_AWS_KMS_KEY_ID,
+  X_AMZ_SERVER_SIDE_ENCRYPTION_CONTEXT,
+  X_AMZ_SERVER_SIDE_ENCRYPTION_LAST
+} crypt_option_e;
+// Lookup key used for each option in the request's crypt attribute map.
+struct crypt_option_names {
+  const std::string post_part_name;
+};
+
+// Table parallel to crypt_option_e; order must match the enum.
+static const crypt_option_names crypt_options[] = {
+  { "x-amz-server-side-encryption-customer-algorithm"},
+  { "x-amz-server-side-encryption-customer-key"},
+  { "x-amz-server-side-encryption-customer-key-md5"},
+  { "x-amz-server-side-encryption"},
+  { "x-amz-server-side-encryption-aws-kms-key-id"},
+  { "x-amz-server-side-encryption-context"},
+};
+
+// Thin view over the request's crypt attribute map, keyed by
+// crypt_option_e.  Returned views reference the map's storage and stay
+// valid only as long as the req_state does.
+struct CryptAttributes {
+  meta_map_t &x_meta_map;
+
+  CryptAttributes(req_state *s)
+    : x_meta_map(s->info.crypt_attribute_map) {
+  }
+
+  // Return the attribute's value, or an empty view when absent.
+  std::string_view get(crypt_option_e option)
+  {
+    static_assert(
+      X_AMZ_SERVER_SIDE_ENCRYPTION_LAST == sizeof(crypt_options)/sizeof(*crypt_options),
+      "Missing items in crypt_options");
+    auto hdr { x_meta_map.find(crypt_options[option].post_part_name) };
+    if (hdr != x_meta_map.end()) {
+      return std::string_view(hdr->second);
+    } else {
+      return std::string_view();
+    }
+  }
+};
+
+// Return the bucket's stored SSE-S3 key id attribute, or an empty string
+// when the bucket has none.  Early versions stored a trailing NUL byte with
+// the attribute; strip it so comparisons against fresh key ids work.
+std::string fetch_bucket_key_id(req_state *s)
+{
+  auto it = s->bucket_attrs.find(RGW_ATTR_BUCKET_ENCRYPTION_KEY_ID);
+  if (it == s->bucket_attrs.end())
+    return std::string();
+  std::string key_id{ it->second.to_str() };
+  if (!key_id.empty() && key_id.back() == '\0')
+    key_id.pop_back();
+  return key_id;
+}
+
+// Sentinel (U+FFFD) returned when a key template cannot be expanded.
+const std::string cant_expand_key{ "\uFFFD" };
+// Expand %-escapes in the configured SSE-S3 key template:
+//   %%         -> literal '%'
+//   %bucket_id -> the bucket's marker
+//   %owner_id  -> the bucket owner's id
+// Any unknown escape yields cant_expand_key so the caller can reject the
+// template.
+//
+// Fix: a template ending in a lone '%' made the original read t[i+1] one
+// past the end of the string_view (undefined behavior).  Guard the
+// lookahead and return cant_expand_key for that malformed case; expansion
+// of well-formed templates is unchanged.
+std::string expand_key_name(req_state *s, const std::string_view&t)
+{
+  std::string r;
+  size_t i, j;
+  for (i = 0;;) {
+    i = t.find('%', (j = i));
+    // Copy the literal run [j, i) preceding the next '%' (or the tail).
+    if (i != j) {
+      if (i == std::string_view::npos)
+        r.append( t.substr(j) );
+      else
+        r.append( t.substr(j, i-j) );
+    }
+    if (i == std::string_view::npos) {
+      break;
+    }
+    if (i + 1 >= t.size()) {
+      // Unterminated '%' at end of template: previously out-of-bounds.
+      return cant_expand_key;
+    }
+    if (t[i+1] == '%') {
+      r.append("%");
+      i += 2;
+      continue;
+    }
+    if (t.compare(i+1, 9, "bucket_id") == 0) {
+      r.append(s->bucket->get_marker());
+      i += 10;
+      continue;
+    }
+    if (t.compare(i+1, 8, "owner_id") == 0) {
+      r.append(s->bucket->get_info().owner.id);
+      i += 9;
+      continue;
+    }
+    return cant_expand_key;
+  }
+  return r;
+}
+
+// Resolve (and if necessary create) the bucket's SSE-S3 key-encryption-key
+// id.  Expands the configured key template, compares it with the id stored
+// on the bucket, and when they differ creates the key in the KMS backend and
+// persists the new id as a bucket attribute, retrying the attr write a
+// bounded number of times when racing with concurrent bucket updates
+// (-ECANCELED).  On success key_id holds the usable KEK id.
+static int get_sse_s3_bucket_key(req_state *s,
+                                 std::string &key_id)
+{
+  int res;
+  std::string saved_key;
+
+  key_id = expand_key_name(s, s->cct->_conf->rgw_crypt_sse_s3_key_template);
+
+  if (key_id == cant_expand_key) {
+    ldpp_dout(s, 5) << "ERROR: unable to expand key_id " <<
+      s->cct->_conf->rgw_crypt_sse_s3_key_template << " on bucket" << dendl;
+    s->err.message = "Server side error - unable to expand key_id";
+    return -EINVAL;
+  }
+
+  saved_key = fetch_bucket_key_id(s);
+  if (saved_key != "") {
+    // NOTE(review): this logs the freshly expanded key_id, not saved_key --
+    // confirm that is the intended value to report here.
+    ldpp_dout(s, 5) << "Found KEK ID: " << key_id << dendl;
+  }
+  if (saved_key != key_id) {
+    // Template changed (or first use): create the key, then persist its id.
+    res = create_sse_s3_bucket_key(s, s->cct, key_id);
+    if (res != 0) {
+      return res;
+    }
+    bufferlist key_id_bl;
+    key_id_bl.append(key_id.c_str(), key_id.length());
+    // Retry the attribute merge on -ECANCELED (concurrent bucket update),
+    // refreshing the bucket info between attempts.
+    for (int count = 0; count < 15; ++count) {
+      rgw::sal::Attrs attrs = s->bucket->get_attrs();
+      attrs[RGW_ATTR_BUCKET_ENCRYPTION_KEY_ID] = key_id_bl;
+      res = s->bucket->merge_and_store_attrs(s, attrs, s->yield);
+      if (res != -ECANCELED) {
+        break;
+      }
+      res = s->bucket->try_refresh_info(s, nullptr);
+      if (res != 0) {
+        break;
+      }
+    }
+    if (res != 0) {
+      ldpp_dout(s, 5) << "ERROR: unable to save new key_id on bucket" << dendl;
+      s->err.message = "Server side error - unable to save key_id";
+      return res;
+    }
+  }
+  return 0;
+}
+
+int rgw_s3_prepare_encrypt(req_state* s,
+ std::map<std::string, ceph::bufferlist>& attrs,
+ std::unique_ptr<BlockCrypt>* block_crypt,
+ std::map<std::string, std::string>& crypt_http_responses)
+{
+ int res = 0;
+ CryptAttributes crypt_attributes { s };
+ crypt_http_responses.clear();
+
+ {
+ std::string_view req_sse_ca =
+ crypt_attributes.get(X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM);
+ if (! req_sse_ca.empty()) {
+ if (req_sse_ca != "AES256") {
+ ldpp_dout(s, 5) << "ERROR: Invalid value for header "
+ << "x-amz-server-side-encryption-customer-algorithm"
+ << dendl;
+ s->err.message = "The requested encryption algorithm is not valid, must be AES256.";
+ return -ERR_INVALID_ENCRYPTION_ALGORITHM;
+ }
+ if (s->cct->_conf->rgw_crypt_require_ssl &&
+ !rgw_transport_is_secure(s->cct, *s->info.env)) {
+ ldpp_dout(s, 5) << "ERROR: Insecure request, rgw_crypt_require_ssl is set" << dendl;
+ return -ERR_INVALID_REQUEST;
+ }
+
+ std::string key_bin;
+ try {
+ key_bin = from_base64(
+ crypt_attributes.get(X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY) );
+ } catch (...) {
+ ldpp_dout(s, 5) << "ERROR: rgw_s3_prepare_encrypt invalid encryption "
+ << "key which contains character that is not base64 encoded."
+ << dendl;
+ s->err.message = "Requests specifying Server Side Encryption with Customer "
+ "provided keys must provide an appropriate secret key.";
+ return -EINVAL;
+ }
+
+ if (key_bin.size() != AES_256_CBC::AES_256_KEYSIZE) {
+ ldpp_dout(s, 5) << "ERROR: invalid encryption key size" << dendl;
+ s->err.message = "Requests specifying Server Side Encryption with Customer "
+ "provided keys must provide an appropriate secret key.";
+ return -EINVAL;
+ }
+
+ std::string_view keymd5 =
+ crypt_attributes.get(X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5);
+
+ std::string keymd5_bin;
+ try {
+ keymd5_bin = from_base64(keymd5);
+ } catch (...) {
+ ldpp_dout(s, 5) << "ERROR: rgw_s3_prepare_encrypt invalid encryption key "
+ << "md5 which contains character that is not base64 encoded."
+ << dendl;
+ s->err.message = "Requests specifying Server Side Encryption with Customer "
+ "provided keys must provide an appropriate secret key md5.";
+ return -EINVAL;
+ }
+
+ if (keymd5_bin.size() != CEPH_CRYPTO_MD5_DIGESTSIZE) {
+ ldpp_dout(s, 5) << "ERROR: Invalid key md5 size" << dendl;
+ s->err.message = "Requests specifying Server Side Encryption with Customer "
+ "provided keys must provide an appropriate secret key md5.";
+ return -EINVAL;
+ }
+
+ MD5 key_hash;
+ // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
+ key_hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+ unsigned char key_hash_res[CEPH_CRYPTO_MD5_DIGESTSIZE];
+ key_hash.Update(reinterpret_cast<const unsigned char*>(key_bin.c_str()), key_bin.size());
+ key_hash.Final(key_hash_res);
+
+ if (memcmp(key_hash_res, keymd5_bin.c_str(), CEPH_CRYPTO_MD5_DIGESTSIZE) != 0) {
+ ldpp_dout(s, 5) << "ERROR: Invalid key md5 hash" << dendl;
+ s->err.message = "The calculated MD5 hash of the key did not match the hash that was provided.";
+ return -EINVAL;
+ }
+
+ set_attr(attrs, RGW_ATTR_CRYPT_MODE, "SSE-C-AES256");
+ set_attr(attrs, RGW_ATTR_CRYPT_KEYMD5, keymd5_bin);
+
+ if (block_crypt) {
+ auto aes = std::unique_ptr<AES_256_CBC>(new AES_256_CBC(s, s->cct));
+ aes->set_key(reinterpret_cast<const uint8_t*>(key_bin.c_str()), AES_256_KEYSIZE);
+ *block_crypt = std::move(aes);
+ }
+
+ crypt_http_responses["x-amz-server-side-encryption-customer-algorithm"] = "AES256";
+ crypt_http_responses["x-amz-server-side-encryption-customer-key-MD5"] = std::string(keymd5);
+ return 0;
+ } else {
+ std::string_view customer_key =
+ crypt_attributes.get(X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY);
+ if (!customer_key.empty()) {
+ ldpp_dout(s, 5) << "ERROR: SSE-C encryption request is missing the header "
+ << "x-amz-server-side-encryption-customer-algorithm"
+ << dendl;
+ s->err.message = "Requests specifying Server Side Encryption with Customer "
+ "provided keys must provide a valid encryption algorithm.";
+ return -EINVAL;
+ }
+
+ std::string_view customer_key_md5 =
+ crypt_attributes.get(X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5);
+ if (!customer_key_md5.empty()) {
+ ldpp_dout(s, 5) << "ERROR: SSE-C encryption request is missing the header "
+ << "x-amz-server-side-encryption-customer-algorithm"
+ << dendl;
+ s->err.message = "Requests specifying Server Side Encryption with Customer "
+ "provided keys must provide a valid encryption algorithm.";
+ return -EINVAL;
+ }
+ }
+
+ /* AMAZON server side encryption with KMS (key management service) */
+ std::string_view req_sse =
+ crypt_attributes.get(X_AMZ_SERVER_SIDE_ENCRYPTION);
+ if (! req_sse.empty()) {
+
+ if (s->cct->_conf->rgw_crypt_require_ssl &&
+ !rgw_transport_is_secure(s->cct, *s->info.env)) {
+ ldpp_dout(s, 5) << "ERROR: insecure request, rgw_crypt_require_ssl is set" << dendl;
+ return -ERR_INVALID_REQUEST;
+ }
+
+ if (req_sse == "aws:kms") {
+ std::string_view context =
+ crypt_attributes.get(X_AMZ_SERVER_SIDE_ENCRYPTION_CONTEXT);
+ std::string cooked_context;
+ if ((res = make_canonical_context(s, context, cooked_context)))
+ return res;
+ std::string_view key_id =
+ crypt_attributes.get(X_AMZ_SERVER_SIDE_ENCRYPTION_AWS_KMS_KEY_ID);
+ if (key_id.empty()) {
+ ldpp_dout(s, 5) << "ERROR: not provide a valid key id" << dendl;
+ s->err.message = "Server Side Encryption with KMS managed key requires "
+ "HTTP header x-amz-server-side-encryption-aws-kms-key-id";
+ return -EINVAL;
+ }
+ /* try to retrieve actual key */
+ std::string key_selector = create_random_key_selector(s->cct);
+ set_attr(attrs, RGW_ATTR_CRYPT_MODE, "SSE-KMS");
+ set_attr(attrs, RGW_ATTR_CRYPT_KEYID, key_id);
+ set_attr(attrs, RGW_ATTR_CRYPT_KEYSEL, key_selector);
+ set_attr(attrs, RGW_ATTR_CRYPT_CONTEXT, cooked_context);
+ std::string actual_key;
+ res = make_actual_key_from_kms(s, s->cct, attrs, actual_key);
+ if (res != 0) {
+ ldpp_dout(s, 5) << "ERROR: failed to retrieve actual key from key_id: " << key_id << dendl;
+ s->err.message = "Failed to retrieve the actual key, kms-keyid: " + std::string(key_id);
+ return res;
+ }
+ if (actual_key.size() != AES_256_KEYSIZE) {
+ ldpp_dout(s, 5) << "ERROR: key obtained from key_id:" <<
+ key_id << " is not 256 bit size" << dendl;
+ s->err.message = "KMS provided an invalid key for the given kms-keyid.";
+ return -EINVAL;
+ }
+
+ if (block_crypt) {
+ auto aes = std::unique_ptr<AES_256_CBC>(new AES_256_CBC(s, s->cct));
+ aes->set_key(reinterpret_cast<const uint8_t*>(actual_key.c_str()), AES_256_KEYSIZE);
+ *block_crypt = std::move(aes);
+ }
+ ::ceph::crypto::zeroize_for_security(actual_key.data(), actual_key.length());
+
+ crypt_http_responses["x-amz-server-side-encryption"] = "aws:kms";
+ crypt_http_responses["x-amz-server-side-encryption-aws-kms-key-id"] = std::string(key_id);
+ crypt_http_responses["x-amz-server-side-encryption-context"] = std::move(cooked_context);
+ return 0;
+ } else if (req_sse != "AES256") {
+ ldpp_dout(s, 5) << "ERROR: Invalid value for header x-amz-server-side-encryption"
+ << dendl;
+ s->err.message = "Server Side Encryption with KMS managed key requires "
+ "HTTP header x-amz-server-side-encryption : aws:kms or AES256";
+ return -EINVAL;
+ }
+
+ if (s->cct->_conf->rgw_crypt_sse_s3_backend != "vault") {
+ s->err.message = "Request specifies Server Side Encryption "
+ "but server configuration does not support this.";
+ return -EINVAL;
+ }
+
+ ldpp_dout(s, 5) << "RGW_ATTR_BUCKET_ENCRYPTION ALGO: "
+ << req_sse << dendl;
+ std::string_view context = "";
+ std::string cooked_context;
+ if ((res = make_canonical_context(s, context, cooked_context)))
+ return res;
+
+ std::string key_id;
+ res = get_sse_s3_bucket_key(s, key_id);
+ if (res != 0) {
+ return res;
+ }
+ std::string key_selector = create_random_key_selector(s->cct);
+
+ set_attr(attrs, RGW_ATTR_CRYPT_KEYSEL, key_selector);
+ set_attr(attrs, RGW_ATTR_CRYPT_CONTEXT, cooked_context);
+ set_attr(attrs, RGW_ATTR_CRYPT_MODE, "AES256");
+ set_attr(attrs, RGW_ATTR_CRYPT_KEYID, key_id);
+ std::string actual_key;
+ res = make_actual_key_from_sse_s3(s, s->cct, attrs, actual_key);
+ if (res != 0) {
+ ldpp_dout(s, 5) << "ERROR: failed to retrieve actual key from key_id: " << key_id << dendl;
+ s->err.message = "Failed to retrieve the actual key";
+ return res;
+ }
+ if (actual_key.size() != AES_256_KEYSIZE) {
+ ldpp_dout(s, 5) << "ERROR: key obtained from key_id:" <<
+ key_id << " is not 256 bit size" << dendl;
+ s->err.message = "SSE-S3 provided an invalid key for the given keyid.";
+ return -EINVAL;
+ }
+
+ if (block_crypt) {
+ auto aes = std::unique_ptr<AES_256_CBC>(new AES_256_CBC(s, s->cct));
+ aes->set_key(reinterpret_cast<const uint8_t*>(actual_key.c_str()), AES_256_KEYSIZE);
+ *block_crypt = std::move(aes);
+ }
+ ::ceph::crypto::zeroize_for_security(actual_key.data(), actual_key.length());
+
+ crypt_http_responses["x-amz-server-side-encryption"] = "AES256";
+
+ return 0;
+ } else if (s->cct->_conf->rgw_crypt_default_encryption_key != "") {
+ std::string master_encryption_key;
+ try {
+ master_encryption_key = from_base64(s->cct->_conf->rgw_crypt_default_encryption_key);
+ } catch (...) {
+ ldpp_dout(s, 5) << "ERROR: rgw_s3_prepare_encrypt invalid default encryption key "
+ << "which contains character that is not base64 encoded."
+ << dendl;
+ s->err.message = "Requests specifying Server Side Encryption with Customer "
+ "provided keys must provide an appropriate secret key.";
+ return -EINVAL;
+ }
+
+ if (master_encryption_key.size() != 256 / 8) {
+ ldpp_dout(s, 0) << "ERROR: failed to decode 'rgw crypt default encryption key' to 256 bit string" << dendl;
+ /* not an error to return; missing encryption does not inhibit processing */
+ return 0;
+ }
+
+ set_attr(attrs, RGW_ATTR_CRYPT_MODE, "RGW-AUTO");
+ std::string key_selector = create_random_key_selector(s->cct);
+ set_attr(attrs, RGW_ATTR_CRYPT_KEYSEL, key_selector);
+
+ uint8_t actual_key[AES_256_KEYSIZE];
+ if (AES_256_ECB_encrypt(s, s->cct,
+ reinterpret_cast<const uint8_t*>(master_encryption_key.c_str()), AES_256_KEYSIZE,
+ reinterpret_cast<const uint8_t*>(key_selector.c_str()),
+ actual_key, AES_256_KEYSIZE) != true) {
+ ::ceph::crypto::zeroize_for_security(actual_key, sizeof(actual_key));
+ return -EIO;
+ }
+ if (block_crypt) {
+ auto aes = std::unique_ptr<AES_256_CBC>(new AES_256_CBC(s, s->cct));
+ aes->set_key(reinterpret_cast<const uint8_t*>(actual_key), AES_256_KEYSIZE);
+ *block_crypt = std::move(aes);
+ }
+ ::ceph::crypto::zeroize_for_security(actual_key, sizeof(actual_key));
+ return 0;
+ }
+ }
+ return 0;
+}
+
+
+int rgw_s3_prepare_decrypt(req_state* s,
+ map<string, bufferlist>& attrs,
+ std::unique_ptr<BlockCrypt>* block_crypt,
+ std::map<std::string, std::string>& crypt_http_responses)
+{
+ int res = 0;
+ std::string stored_mode = get_str_attribute(attrs, RGW_ATTR_CRYPT_MODE);
+ ldpp_dout(s, 15) << "Encryption mode: " << stored_mode << dendl;
+
+ const char *req_sse = s->info.env->get("HTTP_X_AMZ_SERVER_SIDE_ENCRYPTION", NULL);
+ if (nullptr != req_sse && (s->op == OP_GET || s->op == OP_HEAD)) {
+ return -ERR_INVALID_REQUEST;
+ }
+
+ if (stored_mode == "SSE-C-AES256") {
+ if (s->cct->_conf->rgw_crypt_require_ssl &&
+ !rgw_transport_is_secure(s->cct, *s->info.env)) {
+ ldpp_dout(s, 5) << "ERROR: Insecure request, rgw_crypt_require_ssl is set" << dendl;
+ return -ERR_INVALID_REQUEST;
+ }
+ const char *req_cust_alg =
+ s->info.env->get("HTTP_X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM", NULL);
+
+ if (nullptr == req_cust_alg) {
+ ldpp_dout(s, 5) << "ERROR: Request for SSE-C encrypted object missing "
+ << "x-amz-server-side-encryption-customer-algorithm"
+ << dendl;
+ s->err.message = "Requests specifying Server Side Encryption with Customer "
+ "provided keys must provide a valid encryption algorithm.";
+ return -EINVAL;
+ } else if (strcmp(req_cust_alg, "AES256") != 0) {
+ ldpp_dout(s, 5) << "ERROR: The requested encryption algorithm is not valid, must be AES256." << dendl;
+ s->err.message = "The requested encryption algorithm is not valid, must be AES256.";
+ return -ERR_INVALID_ENCRYPTION_ALGORITHM;
+ }
+
+ std::string key_bin;
+ try {
+ key_bin = from_base64(s->info.env->get("HTTP_X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY", ""));
+ } catch (...) {
+ ldpp_dout(s, 5) << "ERROR: rgw_s3_prepare_decrypt invalid encryption key "
+ << "which contains character that is not base64 encoded."
+ << dendl;
+ s->err.message = "Requests specifying Server Side Encryption with Customer "
+ "provided keys must provide an appropriate secret key.";
+ return -EINVAL;
+ }
+
+ if (key_bin.size() != AES_256_CBC::AES_256_KEYSIZE) {
+ ldpp_dout(s, 5) << "ERROR: Invalid encryption key size" << dendl;
+ s->err.message = "Requests specifying Server Side Encryption with Customer "
+ "provided keys must provide an appropriate secret key.";
+ return -EINVAL;
+ }
+
+ std::string keymd5 =
+ s->info.env->get("HTTP_X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5", "");
+ std::string keymd5_bin;
+ try {
+ keymd5_bin = from_base64(keymd5);
+ } catch (...) {
+ ldpp_dout(s, 5) << "ERROR: rgw_s3_prepare_decrypt invalid encryption key md5 "
+ << "which contains character that is not base64 encoded."
+ << dendl;
+ s->err.message = "Requests specifying Server Side Encryption with Customer "
+ "provided keys must provide an appropriate secret key md5.";
+ return -EINVAL;
+ }
+
+
+ if (keymd5_bin.size() != CEPH_CRYPTO_MD5_DIGESTSIZE) {
+ ldpp_dout(s, 5) << "ERROR: Invalid key md5 size " << dendl;
+ s->err.message = "Requests specifying Server Side Encryption with Customer "
+ "provided keys must provide an appropriate secret key md5.";
+ return -EINVAL;
+ }
+
+ MD5 key_hash;
+ // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
+ key_hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+ uint8_t key_hash_res[CEPH_CRYPTO_MD5_DIGESTSIZE];
+ key_hash.Update(reinterpret_cast<const unsigned char*>(key_bin.c_str()), key_bin.size());
+ key_hash.Final(key_hash_res);
+
+ if ((memcmp(key_hash_res, keymd5_bin.c_str(), CEPH_CRYPTO_MD5_DIGESTSIZE) != 0) ||
+ (get_str_attribute(attrs, RGW_ATTR_CRYPT_KEYMD5) != keymd5_bin)) {
+ s->err.message = "The calculated MD5 hash of the key did not match the hash that was provided.";
+ return -EINVAL;
+ }
+ auto aes = std::unique_ptr<AES_256_CBC>(new AES_256_CBC(s, s->cct));
+ aes->set_key(reinterpret_cast<const uint8_t*>(key_bin.c_str()), AES_256_CBC::AES_256_KEYSIZE);
+ if (block_crypt) *block_crypt = std::move(aes);
+
+ crypt_http_responses["x-amz-server-side-encryption-customer-algorithm"] = "AES256";
+ crypt_http_responses["x-amz-server-side-encryption-customer-key-MD5"] = keymd5;
+ return 0;
+ }
+
+ if (stored_mode == "SSE-KMS") {
+ if (s->cct->_conf->rgw_crypt_require_ssl &&
+ !rgw_transport_is_secure(s->cct, *s->info.env)) {
+ ldpp_dout(s, 5) << "ERROR: Insecure request, rgw_crypt_require_ssl is set" << dendl;
+ return -ERR_INVALID_REQUEST;
+ }
+ /* try to retrieve actual key */
+ std::string key_id = get_str_attribute(attrs, RGW_ATTR_CRYPT_KEYID);
+ std::string actual_key;
+ res = reconstitute_actual_key_from_kms(s, s->cct, attrs, actual_key);
+ if (res != 0) {
+ ldpp_dout(s, 10) << "ERROR: failed to retrieve actual key from key_id: " << key_id << dendl;
+ s->err.message = "Failed to retrieve the actual key, kms-keyid: " + key_id;
+ return res;
+ }
+ if (actual_key.size() != AES_256_KEYSIZE) {
+ ldpp_dout(s, 0) << "ERROR: key obtained from key_id:" <<
+ key_id << " is not 256 bit size" << dendl;
+ s->err.message = "KMS provided an invalid key for the given kms-keyid.";
+ return -EINVAL;
+ }
+
+ auto aes = std::unique_ptr<AES_256_CBC>(new AES_256_CBC(s, s->cct));
+ aes->set_key(reinterpret_cast<const uint8_t*>(actual_key.c_str()), AES_256_KEYSIZE);
+ actual_key.replace(0, actual_key.length(), actual_key.length(), '\000');
+ if (block_crypt) *block_crypt = std::move(aes);
+
+ crypt_http_responses["x-amz-server-side-encryption"] = "aws:kms";
+ crypt_http_responses["x-amz-server-side-encryption-aws-kms-key-id"] = key_id;
+ return 0;
+ }
+
+ if (stored_mode == "RGW-AUTO") {
+ std::string master_encryption_key;
+ try {
+ master_encryption_key = from_base64(std::string(s->cct->_conf->rgw_crypt_default_encryption_key));
+ } catch (...) {
+ ldpp_dout(s, 5) << "ERROR: rgw_s3_prepare_decrypt invalid default encryption key "
+ << "which contains character that is not base64 encoded."
+ << dendl;
+ s->err.message = "The default encryption key is not valid base64.";
+ return -EINVAL;
+ }
+
+ if (master_encryption_key.size() != 256 / 8) {
+ ldpp_dout(s, 0) << "ERROR: failed to decode 'rgw crypt default encryption key' to 256 bit string" << dendl;
+ return -EIO;
+ }
+ std::string attr_key_selector = get_str_attribute(attrs, RGW_ATTR_CRYPT_KEYSEL);
+ if (attr_key_selector.size() != AES_256_CBC::AES_256_KEYSIZE) {
+ ldpp_dout(s, 0) << "ERROR: missing or invalid " RGW_ATTR_CRYPT_KEYSEL << dendl;
+ return -EIO;
+ }
+ uint8_t actual_key[AES_256_KEYSIZE];
+ if (AES_256_ECB_encrypt(s, s->cct,
+ reinterpret_cast<const uint8_t*>(master_encryption_key.c_str()),
+ AES_256_KEYSIZE,
+ reinterpret_cast<const uint8_t*>(attr_key_selector.c_str()),
+ actual_key, AES_256_KEYSIZE) != true) {
+ ::ceph::crypto::zeroize_for_security(actual_key, sizeof(actual_key));
+ return -EIO;
+ }
+ auto aes = std::unique_ptr<AES_256_CBC>(new AES_256_CBC(s, s->cct));
+ aes->set_key(actual_key, AES_256_KEYSIZE);
+ ::ceph::crypto::zeroize_for_security(actual_key, sizeof(actual_key));
+ if (block_crypt) *block_crypt = std::move(aes);
+ return 0;
+ }
+
+ /* SSE-S3 */
+ if (stored_mode == "AES256") {
+ if (s->cct->_conf->rgw_crypt_require_ssl &&
+ !rgw_transport_is_secure(s->cct, *s->info.env)) {
+ ldpp_dout(s, 5) << "ERROR: Insecure request, rgw_crypt_require_ssl is set" << dendl;
+ return -ERR_INVALID_REQUEST;
+ }
+ /* try to retrieve actual key */
+ std::string key_id = get_str_attribute(attrs, RGW_ATTR_CRYPT_KEYID);
+ std::string actual_key;
+ res = reconstitute_actual_key_from_sse_s3(s, s->cct, attrs, actual_key);
+ if (res != 0) {
+ ldpp_dout(s, 10) << "ERROR: failed to retrieve actual key" << dendl;
+ s->err.message = "Failed to retrieve the actual key";
+ return res;
+ }
+ if (actual_key.size() != AES_256_KEYSIZE) {
+ ldpp_dout(s, 0) << "ERROR: key obtained " <<
+ "is not 256 bit size" << dendl;
+ s->err.message = "SSE-S3 provided an invalid key for the given keyid.";
+ return -EINVAL;
+ }
+
+ auto aes = std::unique_ptr<AES_256_CBC>(new AES_256_CBC(s, s->cct));
+ aes->set_key(reinterpret_cast<const uint8_t*>(actual_key.c_str()), AES_256_KEYSIZE);
+ actual_key.replace(0, actual_key.length(), actual_key.length(), '\000');
+ if (block_crypt) *block_crypt = std::move(aes);
+
+ crypt_http_responses["x-amz-server-side-encryption"] = "AES256";
+ return 0;
+ }
+
+
+ /*no decryption*/
+ return 0;
+}
+
+int rgw_remove_sse_s3_bucket_key(req_state *s)
+{
+ int res;
+ auto key_id { expand_key_name(s, s->cct->_conf->rgw_crypt_sse_s3_key_template) };
+ auto saved_key { fetch_bucket_key_id(s) };
+ size_t i;
+
+ if (key_id == cant_expand_key) {
+ ldpp_dout(s, 5) << "ERROR: unable to expand key_id " <<
+ s->cct->_conf->rgw_crypt_sse_s3_key_template << " on bucket" << dendl;
+ s->err.message = "Server side error - unable to expand key_id";
+ return -EINVAL;
+ }
+
+ if (saved_key == "") {
+ return 0;
+ } else if (saved_key != key_id) {
+ ldpp_dout(s, 5) << "Found but will not delete strange KEK ID: " << saved_key << dendl;
+ return 0;
+ }
+ i = s->cct->_conf->rgw_crypt_sse_s3_key_template.find("%bucket_id");
+ if (i == std::string_view::npos) {
+ ldpp_dout(s, 5) << "Kept valid KEK ID: " << saved_key << dendl;
+ return 0;
+ }
+ ldpp_dout(s, 5) << "Removing valid KEK ID: " << saved_key << dendl;
+ res = remove_sse_s3_bucket_key(s, s->cct, saved_key);
+ if (res != 0) {
+ ldpp_dout(s, 0) << "ERROR: Unable to remove KEK ID: " << saved_key << " got " << res << dendl;
+ }
+ return res;
+}
+
+/*********************************************************************
+* "BOTTOM OF FILE"
+* I've left some commented out lines above. They are there for
+* a reason, which I will explain. The "canonical" json constructed
+* by the code above as a crypto context must take a json object and
+* turn it into a unique deterministic fixed form. For most json
+* types this is easy. The hardest problem that is handled above is
+* dealing with Unicode strings; they must be turned into
+* NFC form and sorted in a fixed order. Numbers, however,
+* are another story. Json makes no distinction between integers
+* and floating point, and both types have their problems.
+* Integers can overflow, so very large numbers are a problem.
+* Floating point is even worse; not all floating point numbers
+* can be represented accurately in c++ data types, and there
+* are many quirks regarding how overflow, underflow, and loss
+* of significance are handled.
+*
+* In this version of the code, I took the simplest answer, I
+* reject all numbers altogether. This is not ideal, but it's
+* the only choice that is guaranteed to be future compatible.
+* AWS S3 does not guarantee to support numbers at all; but it
+* actually converts all numbers into strings right off.
+* This has the interesting property that 7 and 007 are different,
+* but that 007 and "007" are the same. I would rather
+* treat numbers as a string of digits and have logic
+* to produce the "most compact" equivalent form. This can
+* fix all the overflow/underflow problems, but it requires
+* fixing the json parser part, and I put that problem off.
+*
+* The commented code above indicates places in this code that
+* will need to be revised depending on future work in this area.
+* Removing those comments makes that work harder.
+* February 25, 2021
+*********************************************************************/
diff --git a/src/rgw/rgw_crypt.h b/src/rgw/rgw_crypt.h
new file mode 100644
index 000000000..d8f561eca
--- /dev/null
+++ b/src/rgw/rgw_crypt.h
@@ -0,0 +1,174 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/**
+ * Crypto filters for Put/Post/Get operations.
+ */
+
+#pragma once
+
+#include <string_view>
+
+#include <rgw/rgw_op.h>
+#include <rgw/rgw_rest.h>
+#include <rgw/rgw_rest_s3.h>
+#include "rgw_putobj.h"
+
/**
 * \brief Interface for block encryption methods
 *
 * Encrypts and decrypts data.
 * Operations are performed in the context of a larger stream being divided into blocks.
 * Each block can be processed independently, but only as a whole.
 * A partial block cannot be properly processed.
 * Each request must start on a block-aligned offset.
 * Each request should have a length that is a multiple of the block size.
 * A request with unaligned length is only acceptable for the last part of the stream.
 */
class BlockCrypt {
public:
  BlockCrypt(){};
  virtual ~BlockCrypt(){};

  /**
   * Determines size of encryption block.
   * This is usually a multiple of the key size.
   * It determines the size of chunks that should be passed to \ref encrypt and \ref decrypt.
   */
  virtual size_t get_block_size() = 0;

  /**
   * Encrypts data.
   * Argument \ref stream_offset shows where in the generalized stream the chunk is located.
   * Input for encryption is the \ref input buffer, with relevant data in range <in_ofs, in_ofs+size).
   * \ref input and \ref output may not be the same buffer.
   *
   * \params
   * input - source buffer of data
   * in_ofs - offset of chunk inside input
   * size - size of chunk, must be chunk-aligned unless last part is processed
   * output - destination buffer to encrypt to
   * stream_offset - location of <in_ofs,in_ofs+size) chunk in data stream, must be chunk-aligned
   * \return true iff successfully encrypted
   */
  virtual bool encrypt(bufferlist& input,
                       off_t in_ofs,
                       size_t size,
                       bufferlist& output,
                       off_t stream_offset) = 0;

  /**
   * Decrypts data.
   * Argument \ref stream_offset shows where in the generalized stream the chunk is located.
   * Input for decryption is the \ref input buffer, with relevant data in range <in_ofs, in_ofs+size).
   * \ref input and \ref output may not be the same buffer.
   *
   * \params
   * input - source buffer of data
   * in_ofs - offset of chunk inside input
   * size - size of chunk, must be chunk-aligned unless last part is processed
   * output - destination buffer to decrypt to
   * stream_offset - location of <in_ofs,in_ofs+size) chunk in data stream, must be chunk-aligned
   * \return true iff successfully decrypted
   */
  virtual bool decrypt(bufferlist& input,
                       off_t in_ofs,
                       size_t size,
                       bufferlist& output,
                       off_t stream_offset) = 0;
};
+
/// AES-256 key length in bytes (32).
static const size_t AES_256_KEYSIZE = 256 / 8;

/// One-shot AES-256-ECB encryption of \p data_in into \p data_out.
/// Used in rgw_crypt.cc to derive a per-object key by encrypting the stored
/// key selector with the master encryption key.
/// \return true on success, false on failure.
bool AES_256_ECB_encrypt(const DoutPrefixProvider* dpp,
                         CephContext* cct,
                         const uint8_t* key,
                         size_t key_size,
                         const uint8_t* data_in,
                         uint8_t* data_out,
                         size_t data_size);
+
/**
 * GetObj filter that decrypts object data as it flows through the filter
 * chain, using an already-configured BlockCrypt. Multipart objects are
 * handled via \ref parts_len, the per-part sizes parsed from the manifest
 * (each part is presumed to have been encrypted independently — see the
 * matching encrypt path in rgw_crypt.cc).
 */
class RGWGetObj_BlockDecrypt : public RGWGetObj_Filter {
  const DoutPrefixProvider *dpp;
  CephContext* cct;
  std::unique_ptr<BlockCrypt> crypt; /**< already configured stateless BlockCrypt
                                      * for operations when enough data is accumulated */
  off_t enc_begin_skip; /**< amount of data to skip from beginning of received data */
  off_t ofs; /**< stream offset of data we expect to show up next through \ref handle_data */
  off_t end; /**< stream offset of last byte that is requested */
  bufferlist cache; /**< stores extra data that could not (yet) be processed by BlockCrypt */
  size_t block_size; /**< snapshot of \ref BlockCrypt.get_block_size() */
  std::vector<size_t> parts_len; /**< size of parts of multipart object, parsed from manifest */

  /// Decrypt a run of buffered ciphertext located at \p part_ofs (see .cc).
  int process(bufferlist& cipher, size_t part_ofs, size_t size);

public:
  RGWGetObj_BlockDecrypt(const DoutPrefixProvider *dpp,
                         CephContext* cct,
                         RGWGetObj_Filter* next,
                         std::unique_ptr<BlockCrypt> crypt,
                         std::vector<size_t> parts_len);
  virtual ~RGWGetObj_BlockDecrypt();

  /// Widen the requested byte range to block/part boundaries so whole
  /// encryption blocks can be fetched and decrypted.
  virtual int fixup_range(off_t& bl_ofs,
                          off_t& bl_end) override;
  virtual int handle_data(bufferlist& bl,
                          off_t bl_ofs,
                          off_t bl_len) override;
  /// Flush any data still buffered in \ref cache at end of stream.
  virtual int flush() override;

  /// Parse per-part sizes out of a multipart manifest into \p parts_len.
  static int read_manifest_parts(const DoutPrefixProvider *dpp,
                                 const bufferlist& manifest_bl,
                                 std::vector<size_t>& parts_len);
}; /* RGWGetObj_BlockDecrypt */
+
+
/**
 * PutObj pipe stage that encrypts data on its way to the next
 * DataProcessor. Input that does not yet fill a whole encryption block is
 * held in \ref cache until enough data accumulates.
 */
class RGWPutObj_BlockEncrypt : public rgw::putobj::Pipe
{
  const DoutPrefixProvider *dpp;
  CephContext* cct;
  std::unique_ptr<BlockCrypt> crypt; /**< already configured stateless BlockCrypt
                                      * for operations when enough data is accumulated */
  bufferlist cache; /**< stores extra data that could not (yet) be processed by BlockCrypt */
  const size_t block_size; /**< snapshot of \ref BlockCrypt.get_block_size() */
public:
  RGWPutObj_BlockEncrypt(const DoutPrefixProvider *dpp,
                         CephContext* cct,
                         rgw::sal::DataProcessor *next,
                         std::unique_ptr<BlockCrypt> crypt);

  /// Encrypt \p data (buffering partial blocks) and forward to the next stage.
  int process(bufferlist&& data, uint64_t logical_offset) override;
}; /* RGWPutObj_BlockEncrypt */
+
+
/// Inspect the upload request's SSE headers (SSE-C / SSE-KMS / SSE-S3 /
/// default key), record crypto metadata in \p attrs, and return a configured
/// cipher via \p block_crypt plus response headers to echo back.
int rgw_s3_prepare_encrypt(req_state* s,
                           std::map<std::string, ceph::bufferlist>& attrs,
                           std::unique_ptr<BlockCrypt>* block_crypt,
                           std::map<std::string,
                                    std::string>& crypt_http_responses);

/// Counterpart of rgw_s3_prepare_encrypt for reads: use the stored crypto
/// attrs (RGW_ATTR_CRYPT_*) to reconstitute the key and configure decryption.
int rgw_s3_prepare_decrypt(req_state* s,
                           std::map<std::string, ceph::bufferlist>& attrs,
                           std::unique_ptr<BlockCrypt>* block_crypt,
                           std::map<std::string,
                                    std::string>& crypt_http_responses);
+
+static inline void set_attr(std::map<std::string, bufferlist>& attrs,
+ const char* key,
+ std::string_view value)
+{
+ bufferlist bl;
+ bl.append(value.data(), value.size());
+ attrs[key] = std::move(bl);
+}
+
+static inline std::string get_str_attribute(std::map<std::string, bufferlist>& attrs,
+ const char *name)
+{
+ auto iter = attrs.find(name);
+ if (iter == attrs.end()) {
+ return {};
+ }
+ return iter->second.to_str();
+}
+
/// Delete the bucket's SSE-S3 key-encryption-key when it matches the
/// configured per-bucket key template; returns 0 when nothing needs removal.
int rgw_remove_sse_s3_bucket_key(req_state *s);
diff --git a/src/rgw/rgw_crypt_sanitize.cc b/src/rgw/rgw_crypt_sanitize.cc
new file mode 100644
index 000000000..05aec6d3b
--- /dev/null
+++ b/src/rgw/rgw_crypt_sanitize.cc
@@ -0,0 +1,88 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * rgw_crypt_sanitize.cc
+ *
+ * Created on: Mar 3, 2017
+ * Author: adam
+ */
+
+#include "rgw_common.h"
+#include "rgw_crypt_sanitize.h"
+#include "boost/algorithm/string/predicate.hpp"
+
+namespace rgw {
+namespace crypt_sanitize {
// Header/field names whose values must never reach the logs, plus the
// replacement text emitted in their place.
const char* HTTP_X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY = "HTTP_X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY";
const char* x_amz_server_side_encryption_customer_key = "x-amz-server-side-encryption-customer-key";
const char* dollar_x_amz_server_side_encryption_customer_key = "$x-amz-server-side-encryption-customer-key";
const char* suppression_message = "=suppressed due to key presence=";
+
+std::ostream& operator<<(std::ostream& out, const env& e) {
+ if (g_ceph_context->_conf->rgw_crypt_suppress_logs) {
+ if (boost::algorithm::iequals(
+ e.name,
+ HTTP_X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY))
+ {
+ out << suppression_message;
+ return out;
+ }
+ if (boost::algorithm::iequals(e.name, "QUERY_STRING") &&
+ boost::algorithm::ifind_first(
+ e.value,
+ x_amz_server_side_encryption_customer_key))
+ {
+ out << suppression_message;
+ return out;
+ }
+ }
+ out << e.value;
+ return out;
+}
+
+std::ostream& operator<<(std::ostream& out, const x_meta_map& x) {
+ if (g_ceph_context->_conf->rgw_crypt_suppress_logs &&
+ boost::algorithm::iequals(x.name, x_amz_server_side_encryption_customer_key))
+ {
+ out << suppression_message;
+ return out;
+ }
+ out << x.value;
+ return out;
+}
+
+std::ostream& operator<<(std::ostream& out, const s3_policy& x) {
+ if (g_ceph_context->_conf->rgw_crypt_suppress_logs &&
+ boost::algorithm::iequals(x.name, dollar_x_amz_server_side_encryption_customer_key))
+ {
+ out << suppression_message;
+ return out;
+ }
+ out << x.value;
+ return out;
+}
+
+std::ostream& operator<<(std::ostream& out, const auth& x) {
+ if (g_ceph_context->_conf->rgw_crypt_suppress_logs &&
+ x.s->info.env->get(HTTP_X_AMZ_SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY, nullptr) != nullptr)
+ {
+ out << suppression_message;
+ return out;
+ }
+ out << x.value;
+ return out;
+}
+
+std::ostream& operator<<(std::ostream& out, const log_content& x) {
+ if (g_ceph_context->_conf->rgw_crypt_suppress_logs &&
+ boost::algorithm::ifind_first(x.buf, x_amz_server_side_encryption_customer_key)) {
+ out << suppression_message;
+ return out;
+ }
+ out << x.buf;
+ return out;
+}
+
+}
+}
diff --git a/src/rgw/rgw_crypt_sanitize.h b/src/rgw/rgw_crypt_sanitize.h
new file mode 100644
index 000000000..aa0261fc2
--- /dev/null
+++ b/src/rgw/rgw_crypt_sanitize.h
@@ -0,0 +1,68 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <string_view>
+#include "rgw_common.h"
+
namespace rgw {
namespace crypt_sanitize {

/*
 * Temporary wrapper for printing an environment variable; output is
 * suppressed when the variable may contain the SSE-C secret key.
 */
struct env {
  std::string_view name;
  std::string_view value;

  env(std::string_view name, std::string_view value)
    : name(name), value(value) {}
};

/*
 * Temporary wrapper for printing an aws meta attribute; output is
 * suppressed when the attribute may contain the SSE-C secret key.
 */
struct x_meta_map {
  std::string_view name;
  std::string_view value;
  x_meta_map(std::string_view name, std::string_view value)
    : name(name), value(value) {}
};

/*
 * Temporary wrapper for printing an s3_policy calculation variable; output
 * is suppressed when the variable may contain the SSE-C secret key.
 */
struct s3_policy {
  std::string_view name;
  std::string_view value;
  s3_policy(std::string_view name, std::string_view value)
    : name(name), value(value) {}
};

/*
 * Temporary wrapper for printing an auth string; output is suppressed when
 * the request carries the SSE-C secret key header.
 */
struct auth {
  const req_state* const s;
  std::string_view value;
  auth(const req_state* const s, std::string_view value)
    : s(s), value(value) {}
};

/*
 * Temporary wrapper for printing a log line originating from the frontend
 * (historically civetweb) that may contain the SSE-C secret key.
 */
struct log_content {
  const std::string_view buf;
  explicit log_content(const std::string_view buf)
    : buf(buf) {}
};

std::ostream& operator<<(std::ostream& out, const env& e);
std::ostream& operator<<(std::ostream& out, const x_meta_map& x);
std::ostream& operator<<(std::ostream& out, const s3_policy& x);
std::ostream& operator<<(std::ostream& out, const auth& x);
std::ostream& operator<<(std::ostream& out, const log_content& x);
}
}
diff --git a/src/rgw/rgw_d3n_cacherequest.h b/src/rgw/rgw_d3n_cacherequest.h
new file mode 100644
index 000000000..edc70247f
--- /dev/null
+++ b/src/rgw/rgw_d3n_cacherequest.h
@@ -0,0 +1,145 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <fcntl.h>
+#include <stdlib.h>
+#include <aio.h>
+
+#include "include/rados/librados.hpp"
+#include "include/Context.h"
+#include "common/async/completion.h"
+
+#include <errno.h>
+#include "common/error_code.h"
+#include "common/errno.h"
+
+#include "rgw_aio.h"
+#include "rgw_cache.h"
+
+
// Per-GetObj scratch state for D3N cache reads; d3n_lock presumably guards
// shared access across concurrent cache operations (usage lives outside
// this file — confirm against callers).
struct D3nGetObjData {
  std::mutex d3n_lock;
};
+
/*
 * Asynchronous read of a cached object from the local (L1) D3N disk cache,
 * implemented on POSIX aio with a SIGEV_THREAD completion callback that is
 * bridged into a ceph::async::Completion. Ownership of the completion is
 * handed to the aio callback once aio_read() is successfully submitted.
 */
struct D3nL1CacheRequest {
  ~D3nL1CacheRequest() {
    lsubdout(g_ceph_context, rgw_datacache, 30) << "D3nDataCache: " << __func__ << "(): Read From Cache, complete" << dendl;
  }

  // unique_ptr with custom deleter for struct aiocb: closes the cache file
  // descriptor (if one was opened) before freeing the control block.
  struct libaio_aiocb_deleter {
    void operator()(struct aiocb* c) {
      if(c->aio_fildes > 0) {
        if( ::close(c->aio_fildes) != 0) {
          lsubdout(g_ceph_context, rgw_datacache, 2) << "D3nDataCache: " << __func__ << "(): Error - can't close file, errno=" << -errno << dendl;
        }
      }
      delete c;
    }
  };

  using unique_aio_cb_ptr = std::unique_ptr<struct aiocb, libaio_aiocb_deleter>;

  // State carried through one async file read: the destination bufferlist
  // and the aio control block that owns the open fd.
  struct AsyncFileReadOp {
    bufferlist result;
    unique_aio_cb_ptr aio_cb;
    using Signature = void(boost::system::error_code, bufferlist);
    using Completion = ceph::async::Completion<Signature, AsyncFileReadOp>;

    // Open the cache file and fill in the aiocb (buffer, offset, length and
    // SIGEV_THREAD notification). \p arg is the raw Completion pointer that
    // the aio callback will reclaim. Returns 0 or -errno.
    int init_async_read(const DoutPrefixProvider *dpp, const std::string& location, off_t read_ofs, off_t read_len, void* arg) {
      ldpp_dout(dpp, 20) << "D3nDataCache: " << __func__ << "(): location=" << location << dendl;
      aio_cb.reset(new struct aiocb);
      memset(aio_cb.get(), 0, sizeof(struct aiocb));
      aio_cb->aio_fildes = TEMP_FAILURE_RETRY(::open(location.c_str(), O_RDONLY|O_CLOEXEC|O_BINARY));
      if(aio_cb->aio_fildes < 0) {
        int err = errno;
        ldpp_dout(dpp, 1) << "ERROR: D3nDataCache: " << __func__ << "(): can't open " << location << " : " << cpp_strerror(err) << dendl;
        return -err;
      }
      // Apply the configured fadvise hint (e.g. DONTNEED) to the cache file.
      if (g_conf()->rgw_d3n_l1_fadvise != POSIX_FADV_NORMAL)
        posix_fadvise(aio_cb->aio_fildes, 0, 0, g_conf()->rgw_d3n_l1_fadvise);

      // The read lands directly in a bufferptr that already belongs to
      // `result`, avoiding a copy after completion.
      bufferptr bp(read_len);
      aio_cb->aio_buf = bp.c_str();
      result.append(std::move(bp));

      aio_cb->aio_nbytes = read_len;
      aio_cb->aio_offset = read_ofs;
      aio_cb->aio_sigevent.sigev_notify = SIGEV_THREAD;
      aio_cb->aio_sigevent.sigev_notify_function = libaio_cb_aio_dispatch;
      aio_cb->aio_sigevent.sigev_notify_attributes = nullptr;
      aio_cb->aio_sigevent.sigev_value.sival_ptr = arg;

      return 0;
    }

    // aio completion thread entry point: reclaim ownership of the
    // Completion from sival_ptr, translate the aio status into an
    // error_code, and dispatch the user handler with the read data.
    static void libaio_cb_aio_dispatch(sigval sigval) {
      lsubdout(g_ceph_context, rgw_datacache, 20) << "D3nDataCache: " << __func__ << "()" << dendl;
      auto p = std::unique_ptr<Completion>{static_cast<Completion*>(sigval.sival_ptr)};
      auto op = std::move(p->user_data);
      const int ret = -aio_error(op.aio_cb.get());
      boost::system::error_code ec;
      if (ret < 0) {
        ec.assign(-ret, boost::system::system_category());
      }

      ceph::async::dispatch(std::move(p), ec, std::move(op.result));
    }

    template <typename Executor1, typename CompletionHandler>
    static auto create(const Executor1& ex1, CompletionHandler&& handler) {
      auto p = Completion::create(ex1, std::move(handler));
      return p;
    }
  };

  // Initiate an async read of [read_ofs, read_ofs+read_len) from the cache
  // file at \p location; the completion token is invoked with
  // (error_code, bufferlist) once the aio finishes.
  template <typename ExecutionContext, typename CompletionToken>
  auto async_read(const DoutPrefixProvider *dpp, ExecutionContext& ctx, const std::string& location,
                  off_t read_ofs, off_t read_len, CompletionToken&& token) {
    using Op = AsyncFileReadOp;
    using Signature = typename Op::Signature;
    boost::asio::async_completion<CompletionToken, Signature> init(token);
    auto p = Op::create(ctx.get_executor(), init.completion_handler);
    auto& op = p->user_data;

    ldpp_dout(dpp, 20) << "D3nDataCache: " << __func__ << "(): location=" << location << dendl;
    int ret = op.init_async_read(dpp, location, read_ofs, read_len, p.get());
    if(0 == ret) {
      ret = ::aio_read(op.aio_cb.get());
    }
    ldpp_dout(dpp, 20) << "D3nDataCache: " << __func__ << "(): ::aio_read(), ret=" << ret << dendl;
    if(ret < 0) {
      // Setup or submission failed: complete immediately with the error.
      auto ec = boost::system::error_code{-ret, boost::system::system_category()};
      ceph::async::post(std::move(p), ec, bufferlist{});
    } else {
      // Submission succeeded: ownership of the Completion passes to the aio
      // callback (libaio_cb_aio_dispatch), so release it here.
      (void)p.release();
    }
    return init.result.get();
  }

  // Completion handler that feeds the read result back into the rgw::Aio
  // throttle machinery.
  struct d3n_libaio_handler {
    rgw::Aio* throttle = nullptr;
    rgw::AioResult& r;
    // read callback
    void operator()(boost::system::error_code ec, bufferlist bl) const {
      r.result = -ec.value();
      r.data = std::move(bl);
      throttle->put(r);
    }
  };

  // Entry point used by the cache layer: derive the on-disk cache path from
  // the object ref and kick off async_read with a throttle-aware handler.
  void file_aio_read_abstract(const DoutPrefixProvider *dpp, boost::asio::io_context& context, yield_context yield,
                              std::string& cache_location, off_t read_ofs, off_t read_len,
                              rgw::Aio* aio, rgw::AioResult& r) {
    using namespace boost::asio;
    async_completion<yield_context, void()> init(yield);
    auto ex = get_associated_executor(init.completion_handler);

    auto& ref = r.obj.get_ref();
    ldpp_dout(dpp, 20) << "D3nDataCache: " << __func__ << "(): oid=" << ref.obj.oid << dendl;
    async_read(dpp, context, cache_location+"/"+url_encode(ref.obj.oid, true), read_ofs, read_len, bind_executor(ex, d3n_libaio_handler{aio, r}));
  }

};
diff --git a/src/rgw/rgw_dencoder.cc b/src/rgw/rgw_dencoder.cc
new file mode 100644
index 000000000..2475b45ed
--- /dev/null
+++ b/src/rgw/rgw_dencoder.cc
@@ -0,0 +1,41 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_common.h"
+#include "rgw_rados.h"
+#include "rgw_zone.h"
+#include "rgw_log.h"
+#include "rgw_acl.h"
+#include "rgw_acl_s3.h"
+#include "rgw_cache.h"
+#include "rgw_meta_sync_status.h"
+#include "rgw_data_sync.h"
+#include "rgw_multi.h"
+#include "rgw_bucket_encryption.h"
+
+#include "common/Formatter.h"
+
+using namespace std;
+
+static string shadow_ns = RGW_OBJ_NS_SHADOW;
+
+// Produce sample obj_version instances for dencoder round-trip tests:
+// one populated instance followed by a default-constructed one.
+void obj_version::generate_test_instances(list<obj_version*>& o)
+{
+  auto *populated = new obj_version;
+  populated->ver = 5;
+  populated->tag = "tag";
+  o.push_back(populated);
+
+  o.push_back(new obj_version);
+}
+
+// Produce sample RGWBucketEncryptionConfig instances for dencoder tests:
+// SSE-KMS with a key id and bucket-key enabled, plain AES256, and a
+// default-constructed config.
+void RGWBucketEncryptionConfig::generate_test_instances(std::list<RGWBucketEncryptionConfig*>& o)
+{
+  o.push_back(new RGWBucketEncryptionConfig("aws:kms", "some:key", true));
+  o.push_back(new RGWBucketEncryptionConfig("AES256"));
+  o.push_back(new RGWBucketEncryptionConfig);
+}
diff --git a/src/rgw/rgw_dmclock.h b/src/rgw/rgw_dmclock.h
new file mode 100644
index 000000000..6fad9cc18
--- /dev/null
+++ b/src/rgw/rgw_dmclock.h
@@ -0,0 +1,52 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ * Copyright (C) 2019 SUSE LLC
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "dmclock/src/dmclock_server.h"
+
+namespace rgw::dmclock {
+// TODO: implement read vs write
+/// Classes of client traffic that the schedulers arbitrate between.
+enum class client_id {
+  admin, ///< /admin apis
+  auth, ///< swift auth, sts
+  data, ///< PutObj, GetObj
+  metadata, ///< bucket operations, object metadata
+  count ///< number of classes; also used as the global-stats counter slot
+};
+
+// TODO move these to dmclock/types or so in submodule
+using crimson::dmclock::Cost;
+using crimson::dmclock::ClientInfo;
+
+/// Which request scheduler is selected via rgw_scheduler_type.
+enum class scheduler_t {
+  none, ///< no scheduler configured
+  throttler, ///< simple concurrent-request throttler
+  dmclock ///< dmclock priority-queue scheduler
+};
+
+// Map the rgw_scheduler_type config string onto a scheduler_t value.
+// Unrecognized values fall back to scheduler_t::none.
+inline scheduler_t get_scheduler_t(CephContext* const cct)
+{
+  const auto scheduler_type = cct->_conf.get_val<std::string>("rgw_scheduler_type");
+  if (scheduler_type == "dmclock") {
+    return scheduler_t::dmclock;
+  }
+  if (scheduler_type == "throttler") {
+    return scheduler_t::throttler;
+  }
+  return scheduler_t::none;
+}
+
+} // namespace rgw::dmclock
diff --git a/src/rgw/rgw_dmclock_async_scheduler.cc b/src/rgw/rgw_dmclock_async_scheduler.cc
new file mode 100644
index 000000000..28738e9f3
--- /dev/null
+++ b/src/rgw/rgw_dmclock_async_scheduler.cc
@@ -0,0 +1,183 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "common/async/completion.h"
+#include "rgw_dmclock_async_scheduler.h"
+#include "rgw_dmclock_scheduler.h"
+
+namespace rgw::dmclock {
+
+AsyncScheduler::~AsyncScheduler()
+{
+  // abort all queued requests and stop the timer before members go away
+  cancel();
+  // we only registered as a config observer when a wrapped observer was
+  // supplied in the constructor; unregister symmetrically
+  if (observer) {
+    cct->_conf.remove_observer(this);
+  }
+}
+
+const char** AsyncScheduler::get_tracked_conf_keys() const
+{
+  // defer to the wrapped observer's key list when one is present.
+  // NOTE(review): in that case "rgw_max_concurrent_requests" is only tracked
+  // if the wrapped observer also lists it -- confirm that is intended
+  if (observer) {
+    return observer->get_tracked_conf_keys();
+  }
+  static const char* keys[] = { "rgw_max_concurrent_requests", nullptr };
+  return keys;
+}
+
+void AsyncScheduler::handle_conf_change(const ConfigProxy& conf,
+                                        const std::set<std::string>& changed)
+{
+  // forward the change to the wrapped observer first
+  if (observer) {
+    observer->handle_conf_change(conf, changed);
+  }
+  if (changed.count("rgw_max_concurrent_requests")) {
+    auto new_max = conf.get_val<int64_t>("rgw_max_concurrent_requests");
+    // a non-positive setting disables the throttle
+    max_requests = new_max > 0 ? new_max : std::numeric_limits<int64_t>::max();
+  }
+  // refresh dmclock client parameters and try to admit queued requests
+  queue.update_client_infos();
+  schedule(crimson::dmclock::TimeZero);
+}
+
+// Coroutine entry point: issue async_request() and suspend the caller
+// until dmclock admits or rejects the request. Returns 0 on success or a
+// negative errno (-EAGAIN when rejected at the limit).
+int AsyncScheduler::schedule_request_impl(const client_id& client,
+                                          const ReqParams& params,
+                                          const Time& time, const Cost& cost,
+                                          optional_yield yield_ctx)
+{
+  // only callable from a coroutine; there is no synchronous fallback here
+  ceph_assert(yield_ctx);
+
+  auto &yield = yield_ctx.get_yield_context();
+  boost::system::error_code ec;
+  async_request(client, params, time, cost, yield[ec]);
+
+  if (ec){
+    if (ec == boost::system::errc::resource_unavailable_try_again)
+      return -EAGAIN;
+    else
+      return -ec.value(); // map to negative errno
+  }
+
+  return 0;
+}
+
+// Return a throttle unit granted by async_request() and try to admit the
+// next queued request.
+void AsyncScheduler::request_complete()
+{
+  --outstanding_requests;
+  if(auto c = counters(client_id::count)){ // global scheduler stats slot
+    c->inc(throttle_counters::l_outstanding, -1);
+  }
+  schedule(crimson::dmclock::TimeZero);
+}
+
+// Cancel all queued requests: each handler is dispatched with
+// operation_aborted, the timer is stopped, and the batched per-client
+// cancel counters are flushed.
+void AsyncScheduler::cancel()
+{
+  ClientSums sums;
+
+  queue.remove_by_req_filter([&] (RequestRef&& request) {
+    inc(sums, request->client, request->cost);
+    // the queue holds a unique_ptr<Request>; recover the derived Completion
+    // before dispatching its handler
+    auto c = static_cast<Completion*>(request.release());
+    Completion::dispatch(std::unique_ptr<Completion>{c},
+                         boost::asio::error::operation_aborted,
+                         PhaseType::priority);
+    return true; // remove every request
+  });
+  timer.cancel();
+
+  for (size_t i = 0; i < client_count; i++) {
+    if (auto c = counters(static_cast<client_id>(i))) {
+      on_cancel(c, sums[i]);
+    }
+  }
+}
+
+// Cancel the queued requests of a single client, dispatching their handlers
+// with operation_aborted, then try to admit remaining requests.
+void AsyncScheduler::cancel(const client_id& client)
+{
+  ClientSum sum;
+
+  queue.remove_by_client(client, false, [&] (RequestRef&& request) {
+    sum.count++;
+    sum.cost += request->cost;
+    // recover the derived Completion from the queue's unique_ptr<Request>
+    auto c = static_cast<Completion*>(request.release());
+    Completion::dispatch(std::unique_ptr<Completion>{c},
+                         boost::asio::error::operation_aborted,
+                         PhaseType::priority);
+  });
+  if (auto c = counters(client)) {
+    on_cancel(c, sum);
+  }
+  schedule(crimson::dmclock::TimeZero);
+}
+
+// (Re)arm the single timer; passing TimeZero expires it immediately so
+// process() runs on the executor as soon as possible.
+void AsyncScheduler::schedule(const Time& time)
+{
+  timer.expires_at(Clock::from_double(time));
+  timer.async_wait([this] (boost::system::error_code ec) {
+    // process requests unless the wait was canceled. note that a canceled
+    // wait may execute after this AsyncScheduler destructs
+    if (ec != boost::asio::error::operation_aborted) {
+      process(get_time());
+    }
+  });
+}
+
+// Drain ready requests from the dmclock queue and admit them, up to the
+// rgw_max_concurrent_requests throttle.
+void AsyncScheduler::process(const Time& now)
+{
+  // must run in the executor. we should only invoke completion handlers if the
+  // executor is running
+  assert(get_executor().running_in_this_thread());
+
+  ClientSums rsums, psums;
+
+  while (outstanding_requests < max_requests) {
+    auto pull = queue.pull_request(now);
+
+    if (pull.is_none()) {
+      // no pending requests, cancel the timer
+      timer.cancel();
+      break;
+    }
+    if (pull.is_future()) {
+      // update the timer based on the future time
+      schedule(pull.getTime());
+      break;
+    }
+    // the request is admitted; it now holds a throttle unit, returned later
+    // via request_complete()
+    ++outstanding_requests;
+    if(auto c = counters(client_id::count)){
+      c->inc(throttle_counters::l_outstanding);
+    }
+
+    // complete the request
+    auto& r = pull.get_retn();
+    auto client = r.client;
+    auto phase = r.phase;
+    auto started = r.request->started;
+    auto cost = r.request->cost;
+    auto c = static_cast<Completion*>(r.request.release());
+    Completion::post(std::unique_ptr<Completion>{c},
+                     boost::system::error_code{}, phase);
+
+    // record the wait latency and batch up count/cost updates per client
+    if (auto c = counters(client)) {
+      auto lat = Clock::from_double(now) - Clock::from_double(started);
+      if (phase == PhaseType::reservation) {
+        inc(rsums, client, cost);
+        c->tinc(queue_counters::l_res_latency, lat);
+      } else {
+        inc(psums, client, cost);
+        c->tinc(queue_counters::l_prio_latency, lat);
+      }
+    }
+  }
+
+  if (outstanding_requests >= max_requests) {
+    // hit the throttle; processing resumes from request_complete()
+    if(auto c = counters(client_id::count)){
+      c->inc(throttle_counters::l_throttle);
+    }
+  }
+
+  // flush the batched per-client counter updates
+  for (size_t i = 0; i < client_count; i++) {
+    if (auto c = counters(static_cast<client_id>(i))) {
+      on_process(c, rsums[i], psums[i]);
+    }
+  }
+}
+
+} // namespace rgw::dmclock
diff --git a/src/rgw/rgw_dmclock_async_scheduler.h b/src/rgw/rgw_dmclock_async_scheduler.h
new file mode 100644
index 000000000..7bde75870
--- /dev/null
+++ b/src/rgw/rgw_dmclock_async_scheduler.h
@@ -0,0 +1,217 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "common/async/completion.h"
+
+#include <boost/asio.hpp>
+#include "rgw_dmclock_scheduler.h"
+#include "rgw_dmclock_scheduler_ctx.h"
+
+namespace rgw::dmclock {
+ namespace async = ceph::async;
+
+/*
+ * A dmclock request scheduling service for use with boost::asio.
+ *
+ * An asynchronous dmclock priority queue, where scheduled requests complete
+ * on a boost::asio executor.
+ */
+class AsyncScheduler : public md_config_obs_t, public Scheduler {
+ public:
+  template <typename ...Args> // args forwarded to PullPriorityQueue ctor
+  AsyncScheduler(CephContext *cct, boost::asio::io_context& context,
+                 GetClientCounters&& counters, md_config_obs_t *observer,
+                 Args&& ...args);
+  ~AsyncScheduler();
+
+  using executor_type = boost::asio::io_context::executor_type;
+
+  /// return the default executor for async_request() callbacks
+  executor_type get_executor() noexcept {
+    return timer.get_executor();
+  }
+
+  /// submit an async request for dmclock scheduling. the given completion
+  /// handler will be invoked with (error_code, PhaseType) when the request
+  /// is ready or canceled. on success, this grants a throttle unit that must
+  /// be returned with a call to request_complete()
+  template <typename CompletionToken>
+  auto async_request(const client_id& client, const ReqParams& params,
+                     const Time& time, Cost cost, CompletionToken&& token);
+
+  /// returns a throttle unit granted by async_request()
+  void request_complete() override;
+
+  /// cancel all queued requests, invoking their completion handlers with an
+  /// operation_aborted error and default-constructed result
+  void cancel();
+
+  /// cancel all queued requests for a given client, invoking their completion
+  /// handler with an operation_aborted error and default-constructed result
+  void cancel(const client_id& client);
+
+  const char** get_tracked_conf_keys() const override;
+  void handle_conf_change(const ConfigProxy& conf,
+                          const std::set<std::string>& changed) override;
+
+ private:
+  int schedule_request_impl(const client_id& client, const ReqParams& params,
+                            const Time& time, const Cost& cost,
+                            optional_yield yield_ctx) override;
+
+  static constexpr bool IsDelayed = false;
+  using Queue = crimson::dmclock::PullPriorityQueue<client_id, Request, IsDelayed>;
+  using RequestRef = typename Queue::RequestRef;
+  Queue queue; ///< dmclock priority queue
+
+  using Signature = void(boost::system::error_code, PhaseType);
+  using Completion = async::Completion<Signature, async::AsBase<Request>>;
+
+  using Clock = ceph::coarse_real_clock;
+  using Timer = boost::asio::basic_waitable_timer<Clock,
+                boost::asio::wait_traits<Clock>, executor_type>;
+  Timer timer; ///< timer for the next scheduled request
+
+  CephContext *const cct;
+  md_config_obs_t *const observer; ///< observer to update ClientInfoFunc
+  GetClientCounters counters; ///< provides per-client perf counters
+
+  /// max request throttle
+  std::atomic<int64_t> max_requests;
+  std::atomic<int64_t> outstanding_requests = 0;
+
+  /// set a timer to process the next request
+  void schedule(const Time& time);
+
+  /// process ready requests, then schedule the next pending request
+  void process(const Time& now);
+};
+
+
+// Forward extra args to the PullPriorityQueue ctor; the timer (and thus all
+// completion handlers) runs on the given io_context.
+template <typename ...Args>
+AsyncScheduler::AsyncScheduler(CephContext *cct, boost::asio::io_context& context,
+                               GetClientCounters&& counters,
+                               md_config_obs_t *observer, Args&& ...args)
+  : queue(std::forward<Args>(args)...),
+    timer(context), cct(cct), observer(observer),
+    counters(std::move(counters)),
+    max_requests(cct->_conf.get_val<int64_t>("rgw_max_concurrent_requests"))
+{
+  // a non-positive setting disables the throttle
+  if (max_requests <= 0) {
+    max_requests = std::numeric_limits<int64_t>::max();
+  }
+  // register for config updates only when wrapping another observer
+  if (observer) {
+    cct->_conf.add_observer(this);
+  }
+}
+
+// Queue a request with dmclock. On success the completion handler fires
+// from process() once the request is admitted; on failure the error is
+// posted immediately to the handler's executor.
+template <typename CompletionToken>
+auto AsyncScheduler::async_request(const client_id& client,
+                                   const ReqParams& params,
+                                   const Time& time, Cost cost,
+                                   CompletionToken&& token)
+{
+  boost::asio::async_completion<CompletionToken, Signature> init(token);
+
+  auto ex1 = get_executor();
+  auto& handler = init.completion_handler;
+
+  // allocate the Request and add it to the queue
+  auto completion = Completion::create(ex1, std::move(handler),
+                                       Request{client, time, cost});
+  // cast to unique_ptr<Request>
+  auto req = RequestRef{std::move(completion)};
+  int r = queue.add_request(std::move(req), client, params, time, cost);
+  if (r == 0) {
+    // schedule an immediate call to process() on the executor
+    schedule(crimson::dmclock::TimeZero);
+    if (auto c = counters(client)) {
+      c->inc(queue_counters::l_qlen);
+      c->inc(queue_counters::l_cost, cost);
+    }
+  } else {
+    // post the error code
+    boost::system::error_code ec(r, boost::system::system_category());
+    // cast back to Completion
+    auto completion = static_cast<Completion*>(req.release());
+    async::post(std::unique_ptr<Completion>{completion},
+                ec, PhaseType::priority);
+    if (auto c = counters(client)) {
+      c->inc(queue_counters::l_limit);
+      c->inc(queue_counters::l_limit_cost, cost);
+    }
+  }
+
+  return init.result.get();
+}
+
+// A trivial scheduler that only enforces rgw_max_concurrent_requests and
+// does no prioritization between client classes.
+class SimpleThrottler : public md_config_obs_t, public dmclock::Scheduler {
+public:
+  SimpleThrottler(CephContext *cct) :
+    cct(cct),
+    max_requests(cct->_conf.get_val<int64_t>("rgw_max_concurrent_requests")),
+    counters(cct, "simple-throttler")
+  {
+    // a non-positive setting disables the throttle
+    if (max_requests <= 0) {
+      max_requests = std::numeric_limits<int64_t>::max();
+    }
+    cct->_conf.add_observer(this);
+  }
+
+  // unregister the config observer added in the constructor; without this
+  // the conf system keeps a dangling pointer to the destroyed throttler
+  // (AsyncScheduler removes its observer in its destructor for the same
+  // reason)
+  ~SimpleThrottler() {
+    cct->_conf.remove_observer(this);
+  }
+
+  const char** get_tracked_conf_keys() const override {
+    static const char* keys[] = { "rgw_max_concurrent_requests", nullptr };
+    return keys;
+  }
+
+  void handle_conf_change(const ConfigProxy& conf,
+                          const std::set<std::string>& changed) override
+  {
+    if (changed.count("rgw_max_concurrent_requests")) {
+      auto new_max = conf.get_val<int64_t>("rgw_max_concurrent_requests");
+      max_requests = new_max > 0 ? new_max : std::numeric_limits<int64_t>::max();
+    }
+  }
+
+  // return a throttle unit granted by schedule_request_impl(); the base
+  // class's SchedulerCompleter calls this once per schedule_request()
+  void request_complete() override {
+    --outstanding_requests;
+    if (auto c = counters();
+        c != nullptr) {
+      c->inc(throttle_counters::l_outstanding, -1);
+    }
+  }
+
+private:
+  // admit the request unless the concurrency cap is already reached. the
+  // counter is incremented even on rejection; the matching decrement comes
+  // from request_complete(), which always runs via the returned completer
+  int schedule_request_impl(const client_id&, const ReqParams&,
+                            const Time&, const Cost&,
+                            optional_yield) override {
+    if (outstanding_requests++ >= max_requests) {
+      if (auto c = counters();
+          c != nullptr) {
+        c->inc(throttle_counters::l_outstanding);
+        c->inc(throttle_counters::l_throttle);
+      }
+      return -EAGAIN;
+    }
+
+    return 0;
+  }
+
+  CephContext *const cct; ///< kept for observer removal in the destructor
+  std::atomic<int64_t> max_requests;
+  std::atomic<int64_t> outstanding_requests = 0;
+  ThrottleCounters counters;
+};
+
+} // namespace rgw::dmclock
diff --git a/src/rgw/rgw_dmclock_scheduler.h b/src/rgw/rgw_dmclock_scheduler.h
new file mode 100644
index 000000000..655e12bef
--- /dev/null
+++ b/src/rgw/rgw_dmclock_scheduler.h
@@ -0,0 +1,86 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ * (C) 2019 SUSE LLC
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "common/ceph_time.h"
+#include "common/ceph_context.h"
+#include "common/config.h"
+#include "common/async/yield_context.h"
+#include "rgw_dmclock.h"
+
+namespace rgw::dmclock {
+
+using crimson::dmclock::ReqParams;
+using crimson::dmclock::PhaseType;
+using crimson::dmclock::AtLimit;
+using crimson::dmclock::Time;
+using crimson::dmclock::get_time;
+
+/// function to provide client counters
+using GetClientCounters = std::function<PerfCounters*(client_id)>;
+
+/// data carried with every queued request
+struct Request {
+  client_id client;
+  Time started; ///< enqueue time, used for the latency perf counters
+  Cost cost;
+};
+
+/// state of a blocking SyncRequest, advanced under its req_mtx
+enum class ReqState {
+  Wait,
+  Ready,
+  Cancelled
+};
+
+/// A minimal scope guard: invokes the stored callable (if any) on
+/// destruction. Move-only, so ownership of the completion transfers with
+/// the move (as used with std::function, a moved-from callable is empty
+/// and will not fire).
+template <typename F>
+class Completer {
+public:
+  Completer(F &&f): f(std::move(f)) {}
+  // Default constructor is needed as we need to create an empty completer
+  // that'll be move assigned later in process request
+  Completer() = default;
+  ~Completer() {
+    if (f) {
+      f();
+    }
+  }
+  // non-copyable: the callable must fire at most once
+  Completer(const Completer&) = delete;
+  Completer& operator=(const Completer&) = delete;
+  Completer(Completer&& other) = default;
+  Completer& operator=(Completer&& other) = default;
+private:
+  F f;
+};
+
+using SchedulerCompleter = Completer<std::function<void()>>;
+
+/// Base class for request schedulers. schedule_request() admits or rejects
+/// a request and returns a completer that calls request_complete() when it
+/// goes out of scope in the caller.
+class Scheduler {
+public:
+  auto schedule_request(const client_id& client, const ReqParams& params,
+                        const Time& time, const Cost& cost,
+                        optional_yield yield)
+  {
+    int r = schedule_request_impl(client,params,time,cost,yield);
+    // note: the completer is returned even when r is an error, so
+    // request_complete() runs exactly once per schedule_request() call
+    return std::make_pair(r,SchedulerCompleter(std::bind(&Scheduler::request_complete,this)));
+  }
+  virtual void request_complete() {};
+
+  virtual ~Scheduler() {};
+private:
+  virtual int schedule_request_impl(const client_id&, const ReqParams&,
+                                    const Time&, const Cost&,
+                                    optional_yield) = 0;
+};
+
+} // namespace rgw::dmclock
diff --git a/src/rgw/rgw_dmclock_scheduler_ctx.cc b/src/rgw/rgw_dmclock_scheduler_ctx.cc
new file mode 100644
index 000000000..cc1170eb1
--- /dev/null
+++ b/src/rgw/rgw_dmclock_scheduler_ctx.cc
@@ -0,0 +1,178 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ * (C) 2019 SUSE Linux LLC
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#include "rgw_dmclock_scheduler_ctx.h"
+
+namespace rgw::dmclock {
+
+// load the initial per-client dmclock parameters from config
+ClientConfig::ClientConfig(CephContext *cct)
+{
+  update(cct->_conf);
+}
+
+// return the dmclock ClientInfo (res/wgt/lim) for the given client class
+ClientInfo* ClientConfig::operator()(client_id client)
+{
+  return &clients[static_cast<size_t>(client)];
+}
+
+// all config options whose change should rebuild the ClientInfo table
+const char** ClientConfig::get_tracked_conf_keys() const
+{
+  static const char* keys[] = {
+    "rgw_dmclock_admin_res",
+    "rgw_dmclock_admin_wgt",
+    "rgw_dmclock_admin_lim",
+    "rgw_dmclock_auth_res",
+    "rgw_dmclock_auth_wgt",
+    "rgw_dmclock_auth_lim",
+    "rgw_dmclock_data_res",
+    "rgw_dmclock_data_wgt",
+    "rgw_dmclock_data_lim",
+    "rgw_dmclock_metadata_res",
+    "rgw_dmclock_metadata_wgt",
+    "rgw_dmclock_metadata_lim",
+    "rgw_max_concurrent_requests",
+    nullptr
+  };
+  return keys;
+}
+
+// Rebuild the ClientInfo table from config. The static_asserts pin the
+// client_id enumerator order to the vector index used by operator().
+void ClientConfig::update(const ConfigProxy& conf)
+{
+  clients.clear();
+  static_assert(0 == static_cast<int>(client_id::admin));
+  clients.emplace_back(conf.get_val<double>("rgw_dmclock_admin_res"),
+                       conf.get_val<double>("rgw_dmclock_admin_wgt"),
+                       conf.get_val<double>("rgw_dmclock_admin_lim"));
+  static_assert(1 == static_cast<int>(client_id::auth));
+  clients.emplace_back(conf.get_val<double>("rgw_dmclock_auth_res"),
+                       conf.get_val<double>("rgw_dmclock_auth_wgt"),
+                       conf.get_val<double>("rgw_dmclock_auth_lim"));
+  static_assert(2 == static_cast<int>(client_id::data));
+  clients.emplace_back(conf.get_val<double>("rgw_dmclock_data_res"),
+                       conf.get_val<double>("rgw_dmclock_data_wgt"),
+                       conf.get_val<double>("rgw_dmclock_data_lim"));
+  static_assert(3 == static_cast<int>(client_id::metadata));
+  clients.emplace_back(conf.get_val<double>("rgw_dmclock_metadata_res"),
+                       conf.get_val<double>("rgw_dmclock_metadata_wgt"),
+                       conf.get_val<double>("rgw_dmclock_metadata_lim"));
+}
+
+// any tracked key change rebuilds the whole table
+void ClientConfig::handle_conf_change(const ConfigProxy& conf,
+                                      const std::set<std::string>& changed)
+{
+  update(conf);
+}
+
+// Build one perf-counter set per client class, plus a final slot
+// (client_id::count) holding the global throttle/scheduler stats.
+ClientCounters::ClientCounters(CephContext *cct)
+{
+  clients[static_cast<size_t>(client_id::admin)] =
+      queue_counters::build(cct, "dmclock-admin");
+  clients[static_cast<size_t>(client_id::auth)] =
+      queue_counters::build(cct, "dmclock-auth");
+  clients[static_cast<size_t>(client_id::data)] =
+      queue_counters::build(cct, "dmclock-data");
+  clients[static_cast<size_t>(client_id::metadata)] =
+      queue_counters::build(cct, "dmclock-metadata");
+  clients[static_cast<size_t>(client_id::count)] =
+      throttle_counters::build(cct, "dmclock-scheduler");
+}
+
+// Accumulate one request of the given cost into the per-client sums.
+void inc(ClientSums& sums, client_id client, Cost cost)
+{
+  auto& entry = sums[static_cast<size_t>(client)];
+  ++entry.count;
+  entry.cost += cost;
+}
+
+// apply a batch of canceled requests to a client's perf counters
+void on_cancel(PerfCounters *c, const ClientSum& sum)
+{
+  if (sum.count) {
+    c->dec(queue_counters::l_qlen, sum.count);
+    c->inc(queue_counters::l_cancel, sum.count);
+  }
+  if (sum.cost) {
+    c->dec(queue_counters::l_cost, sum.cost);
+    c->inc(queue_counters::l_cancel_cost, sum.cost);
+  }
+}
+
+// apply a batch of served requests (reservation rsum, priority psum) to a
+// client's perf counters
+void on_process(PerfCounters* c, const ClientSum& rsum, const ClientSum& psum)
+{
+  if (rsum.count) {
+    c->inc(queue_counters::l_res, rsum.count);
+  }
+  if (rsum.cost) {
+    c->inc(queue_counters::l_res_cost, rsum.cost);
+  }
+  if (psum.count) {
+    c->inc(queue_counters::l_prio, psum.count);
+  }
+  if (psum.cost) {
+    c->inc(queue_counters::l_prio_cost, psum.cost);
+  }
+  // served requests leave the queue: shrink qlen/cost by the combined totals
+  if (rsum.count + psum.count) {
+    c->dec(queue_counters::l_qlen, rsum.count + psum.count);
+  }
+  if (rsum.cost + psum.cost) {
+    c->dec(queue_counters::l_cost, rsum.cost + psum.cost);
+  }
+}
+} // namespace rgw::dmclock
+
+namespace queue_counters {
+
+// Create and register the per-client queue counters; returns an empty ref
+// (disabling the stats) when throttler_perf_counter is off.
+PerfCountersRef build(CephContext *cct, const std::string& name)
+{
+  if (!cct->_conf->throttler_perf_counter) {
+    return {};
+  }
+
+  PerfCountersBuilder b(cct, name, l_first, l_last);
+  b.add_u64(l_qlen, "qlen", "Queue size");
+  b.add_u64(l_cost, "cost", "Cost of queued requests");
+  b.add_u64_counter(l_res, "res", "Requests satisfied by reservation");
+  b.add_u64_counter(l_res_cost, "res_cost", "Cost satisfied by reservation");
+  b.add_u64_counter(l_prio, "prio", "Requests satisfied by priority");
+  b.add_u64_counter(l_prio_cost, "prio_cost", "Cost satisfied by priority");
+  b.add_u64_counter(l_limit, "limit", "Requests rejected by limit");
+  b.add_u64_counter(l_limit_cost, "limit_cost", "Cost rejected by limit");
+  b.add_u64_counter(l_cancel, "cancel", "Cancels");
+  b.add_u64_counter(l_cancel_cost, "cancel_cost", "Canceled cost");
+  b.add_time_avg(l_res_latency, "res latency", "Reservation latency");
+  b.add_time_avg(l_prio_latency, "prio latency", "Priority latency");
+
+  // register with the collection; the returned ref owns the counters
+  auto logger = PerfCountersRef{ b.create_perf_counters(), cct };
+  cct->get_perfcounters_collection()->add(logger.get());
+  return logger;
+}
+
+} // namespace queue_counters
+
+namespace throttle_counters {
+
+// Create and register the global throttle counters; returns an empty ref
+// (disabling the stats) when throttler_perf_counter is off.
+PerfCountersRef build(CephContext *cct, const std::string& name)
+{
+  if (!cct->_conf->throttler_perf_counter) {
+    return {};
+  }
+
+  PerfCountersBuilder b(cct, name, l_first, l_last);
+  b.add_u64(l_throttle, "throttle", "Requests throttled");
+  b.add_u64(l_outstanding, "outstanding", "Outstanding Requests");
+
+  // register with the collection; the returned ref owns the counters
+  auto logger = PerfCountersRef{ b.create_perf_counters(), cct };
+  cct->get_perfcounters_collection()->add(logger.get());
+  return logger;
+}
+
+} // namespace throttle_counters
diff --git a/src/rgw/rgw_dmclock_scheduler_ctx.h b/src/rgw/rgw_dmclock_scheduler_ctx.h
new file mode 100644
index 000000000..f27b81c26
--- /dev/null
+++ b/src/rgw/rgw_dmclock_scheduler_ctx.h
@@ -0,0 +1,119 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "common/perf_counters.h"
+#include "common/ceph_context.h"
+#include "common/config.h"
+#include "rgw_dmclock.h"
+
+// perf-counter indices for the per-client dmclock queue stats; l_first and
+// l_last bound the range handed to PerfCountersBuilder
+namespace queue_counters {
+
+  enum {
+    l_first = 427150,
+    l_qlen,
+    l_cost,
+    l_res,
+    l_res_cost,
+    l_prio,
+    l_prio_cost,
+    l_limit,
+    l_limit_cost,
+    l_cancel,
+    l_cancel_cost,
+    l_res_latency,
+    l_prio_latency,
+    l_last,
+  };
+
+  PerfCountersRef build(CephContext *cct, const std::string& name);
+
+} // namespace queue_counters
+
+// perf-counter indices for the global request-throttle stats
+namespace throttle_counters {
+  enum {
+    l_first = 437219,
+    l_throttle,
+    l_outstanding,
+    l_last
+  };
+
+  PerfCountersRef build(CephContext *cct, const std::string& name);
+} // namespace throttle_counters
+
+namespace rgw::dmclock {
+
+// the last client counter would be for global scheduler stats
+static constexpr auto counter_size = static_cast<size_t>(client_id::count) + 1;
+/// array of per-client counters to serve as GetClientCounters
+class ClientCounters {
+  std::array<PerfCountersRef, counter_size> clients;
+ public:
+  ClientCounters(CephContext *cct);
+
+  // may return nullptr when perf counters are disabled (build() returned
+  // an empty ref)
+  PerfCounters* operator()(client_id client) const {
+    return clients[static_cast<size_t>(client)].get();
+  }
+};
+
+/// owns the global throttle perf counters for one scheduler instance
+class ThrottleCounters {
+  PerfCountersRef counters;
+public:
+  ThrottleCounters(CephContext* const cct,const std::string& name):
+    counters(throttle_counters::build(cct, name)) {}
+
+  // may return nullptr when perf counters are disabled
+  PerfCounters* operator()() const {
+    return counters.get();
+  }
+};
+
+
+/// running count/cost totals used to batch perf-counter updates
+struct ClientSum {
+  uint64_t count{0};
+  Cost cost{0};
+};
+
+constexpr auto client_count = static_cast<size_t>(client_id::count);
+using ClientSums = std::array<ClientSum, client_count>;
+
+void inc(ClientSums& sums, client_id client, Cost cost);
+void on_cancel(PerfCounters *c, const ClientSum& sum);
+void on_process(PerfCounters* c, const ClientSum& rsum, const ClientSum& psum);
+
+
+// Config observer that maintains the per-client-class dmclock parameters
+// (reservation/weight/limit) read from the rgw_dmclock_* options.
+class ClientConfig : public md_config_obs_t {
+  std::vector<ClientInfo> clients; // indexed by client_id
+
+  void update(const ConfigProxy &conf);
+
+public:
+  ClientConfig(CephContext *cct);
+
+  /// return the dmclock parameters for a client class
+  ClientInfo* operator()(client_id client);
+
+  const char** get_tracked_conf_keys() const override;
+  void handle_conf_change(const ConfigProxy& conf,
+                          const std::set<std::string>& changed) override;
+};
+
+// Owns the dmclock client config and counters when the dmclock scheduler is
+// selected; both members stay empty for the other scheduler types.
+class SchedulerCtx {
+public:
+  SchedulerCtx(CephContext* const cct) : sched_t(get_scheduler_t(cct))
+  {
+    if(sched_t == scheduler_t::dmclock) {
+      dmc_client_config = std::make_shared<ClientConfig>(cct);
+      // we don't have a move only cref std::function yet
+      dmc_client_counters = std::make_optional<ClientCounters>(cct);
+    }
+  }
+  // We need to construct a std::function from a NonCopyable object
+  // NOTE(review): throws std::bad_optional_access when the scheduler type
+  // is not dmclock -- callers must check the type first
+  ClientCounters& get_dmc_client_counters() { return dmc_client_counters.value(); }
+  ClientConfig* const get_dmc_client_config() const { return dmc_client_config.get(); }
+private:
+  scheduler_t sched_t;
+  std::shared_ptr<ClientConfig> dmc_client_config {nullptr};
+  std::optional<ClientCounters> dmc_client_counters {std::nullopt};
+};
+
+} // namespace rgw::dmclock
diff --git a/src/rgw/rgw_dmclock_sync_scheduler.cc b/src/rgw/rgw_dmclock_sync_scheduler.cc
new file mode 100644
index 000000000..06857202f
--- /dev/null
+++ b/src/rgw/rgw_dmclock_sync_scheduler.cc
@@ -0,0 +1,117 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_dmclock_scheduler.h"
+#include "rgw_dmclock_sync_scheduler.h"
+#include "rgw_dmclock_scheduler_ctx.h"
+
+namespace rgw::dmclock {
+
+SyncScheduler::~SyncScheduler()
+{
+  // wake and cancel any threads still blocked in add_request()
+  cancel();
+}
+
+// Queue a request and block the calling thread until dmclock serves it
+// (ReqState::Ready) or it is cancelled (ReqState::Cancelled).
+int SyncScheduler::add_request(const client_id& client, const ReqParams& params,
+                               const Time& time, Cost cost)
+{
+  // the queued SyncRequest holds references to these stack objects; that is
+  // safe because we do not return until the callback or a cancel fires
+  std::mutex req_mtx;
+  std::condition_variable req_cv;
+  ReqState rstate {ReqState::Wait};
+  auto req = SyncRequest{client, time, cost, req_mtx, req_cv, rstate, counters};
+  int r = queue.add_request_time(req, client, params, time, cost);
+  if (r == 0) {
+    if (auto c = counters(client)) {
+      c->inc(queue_counters::l_qlen);
+      c->inc(queue_counters::l_cost, cost);
+    }
+    // kick the push queue so pending requests (possibly this one) get served
+    queue.request_completed();
+    // Perform a blocking wait until the request callback is called
+    {
+      std::unique_lock lock{req_mtx};
+      req_cv.wait(lock, [&rstate] {return rstate != ReqState::Wait;});
+    }
+    if (rstate == ReqState::Cancelled) {
+      //FIXME: decide on error code for cancelled request
+      r = -ECONNABORTED;
+    }
+  } else {
+    // the queue rejected the request; record it against the limit counters
+    if (auto c = counters(client)) {
+      c->inc(queue_counters::l_limit);
+      c->inc(queue_counters::l_limit_cost, cost);
+    }
+  }
+  return r;
+}
+
+// Queue callback: mark the request ready, wake the thread blocked in
+// add_request(), and record latency/served perf counters.
+void SyncScheduler::handle_request_cb(const client_id &c,
+                                      std::unique_ptr<SyncRequest> req,
+                                      PhaseType phase, Cost cost)
+{
+  // state changes under the waiter's mutex so the cv wakeup isn't lost
+  { std::lock_guard<std::mutex> lg(req->req_mtx);
+    req->req_state = ReqState::Ready;
+    req->req_cv.notify_one();
+  }
+
+  if (auto ctr = req->counters(c)) {
+    auto lat = Clock::from_double(get_time()) - Clock::from_double(req->started);
+    if (phase == PhaseType::reservation){
+      ctr->tinc(queue_counters::l_res_latency, lat);
+      ctr->inc(queue_counters::l_res);
+      if (cost) ctr->inc(queue_counters::l_res_cost);
+    } else if (phase == PhaseType::priority){
+      ctr->tinc(queue_counters::l_prio_latency, lat);
+      ctr->inc(queue_counters::l_prio);
+      if (cost) ctr->inc(queue_counters::l_prio_cost);
+    }
+    ctr->dec(queue_counters::l_qlen);
+    if (cost) ctr->dec(queue_counters::l_cost);
+  }
+}
+
+
+// Cancel the queued requests of one client, marking each Cancelled and
+// waking its blocked caller.
+void SyncScheduler::cancel(const client_id& client)
+{
+  ClientSum sum;
+
+  queue.remove_by_client(client, false, [&](RequestRef&& request)
+    {
+      sum.count++;
+      sum.cost += request->cost;
+      {
+        std::lock_guard <std::mutex> lg(request->req_mtx);
+        request->req_state = ReqState::Cancelled;
+        request->req_cv.notify_one();
+      }
+    });
+  if (auto c = counters(client)) {
+    on_cancel(c, sum);
+  }
+
+  // let the queue serve whatever remains
+  queue.request_completed();
+}
+
+// Cancel every queued request, marking each Cancelled and waking its
+// blocked caller, then flush the batched cancel counters.
+void SyncScheduler::cancel()
+{
+  ClientSums sums;
+
+  queue.remove_by_req_filter([&](RequestRef&& request) -> bool
+    {
+      inc(sums, request->client, request->cost);
+      {
+        std::lock_guard<std::mutex> lg(request->req_mtx);
+        request->req_state = ReqState::Cancelled;
+        request->req_cv.notify_one();
+      }
+      return true; // remove every request
+    });
+
+  for (size_t i = 0; i < client_count; i++) {
+    if (auto c = counters(static_cast<client_id>(i))) {
+      on_cancel(c, sums[i]);
+    }
+  }
+}
+
+} // namespace rgw::dmclock
diff --git a/src/rgw/rgw_dmclock_sync_scheduler.h b/src/rgw/rgw_dmclock_sync_scheduler.h
new file mode 100644
index 000000000..740234965
--- /dev/null
+++ b/src/rgw/rgw_dmclock_sync_scheduler.h
@@ -0,0 +1,77 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 SUSE Linux Gmbh
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "rgw_dmclock_scheduler.h"
+#include "rgw_dmclock_scheduler_ctx.h"
+
+namespace rgw::dmclock {
+// For a blocking SyncRequest we hold a reference to a cv and the caller must
+// ensure the lifetime
+struct SyncRequest : public Request {
+  std::mutex& req_mtx; // guards req_state/req_cv; lives on the caller's stack
+  std::condition_variable& req_cv; // wakes the thread blocked in add_request()
+  ReqState& req_state;
+  GetClientCounters& counters;
+  explicit SyncRequest(client_id _id, Time started, Cost cost,
+                       std::mutex& mtx, std::condition_variable& _cv,
+                       ReqState& _state, GetClientCounters& counters):
+    Request{_id, started, cost}, req_mtx(mtx), req_cv(_cv), req_state(_state), counters(counters) {};
+};
+
+// A scheduler that blocks the calling thread inside add_request() until the
+// dmclock push queue serves the request (see SyncRequest for the wakeup
+// plumbing).
+class SyncScheduler: public Scheduler {
+public:
+  template <typename ...Args>
+  SyncScheduler(CephContext *cct, GetClientCounters&& counters,
+                Args&& ...args);
+  ~SyncScheduler();
+
+  // submit a blocking request for dmclock scheduling, this function waits until
+  // the request is ready.
+  int add_request(const client_id& client, const ReqParams& params,
+                  const Time& time, Cost cost);
+
+  /// cancel all queued requests, waking their blocked callers
+  void cancel();
+
+  /// cancel all queued requests for a given client
+  void cancel(const client_id& client);
+
+  /// queue callback: wakes the blocked caller and updates perf counters
+  static void handle_request_cb(const client_id& c, std::unique_ptr<SyncRequest> req,
+                                PhaseType phase, Cost cost);
+private:
+  int schedule_request_impl(const client_id& client, const ReqParams& params,
+                            const Time& time, const Cost& cost,
+                            optional_yield _y [[maybe_unused]]) override
+  {
+    return add_request(client, params, time, cost);
+  }
+
+  static constexpr bool IsDelayed = false;
+  using Queue = crimson::dmclock::PushPriorityQueue<client_id, SyncRequest, IsDelayed>;
+  using RequestRef = typename Queue::RequestRef;
+  using Clock = ceph::coarse_real_clock;
+
+  Queue queue;
+  CephContext const *cct;
+  GetClientCounters counters; ///< provides per-client perf counters
+};
+
+// extra args are forwarded to the PushPriorityQueue constructor
+template <typename ...Args>
+SyncScheduler::SyncScheduler(CephContext *cct, GetClientCounters&& counters,
+                             Args&& ...args):
+  queue(std::forward<Args>(args)...), cct(cct), counters(std::move(counters))
+{}
+
+} // namespace rgw::dmclock
diff --git a/src/rgw/rgw_env.cc b/src/rgw/rgw_env.cc
new file mode 100644
index 000000000..d528f0e6d
--- /dev/null
+++ b/src/rgw/rgw_env.cc
@@ -0,0 +1,158 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_common.h"
+#include "rgw_log.h"
+
+#include <string>
+#include <map>
+#include "include/ceph_assert.h"
+#include "rgw_crypt_sanitize.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
// Apply the ceph configuration (ops/usage-log flags etc.) to this environment.
void RGWEnv::init(CephContext *cct)
{
  conf.init(cct);
}
+
// Set (or overwrite) environment variable 'name' to 'val'.
void RGWEnv::set(std::string name, std::string val)
{
  env_map[std::move(name)] = std::move(val);
}
+
+void RGWEnv::init(CephContext *cct, char **envp)
+{
+ const char *p;
+
+ env_map.clear();
+
+ for (int i=0; (p = envp[i]); ++i) {
+ string s(p);
+ int pos = s.find('=');
+ if (pos <= 0) // should never be 0
+ continue;
+ string name = s.substr(0, pos);
+ string val = s.substr(pos + 1);
+ env_map[name] = val;
+ }
+
+ init(cct);
+}
+
+const char *rgw_conf_get(const map<string, string, ltstr_nocase>& conf_map, const char *name, const char *def_val)
+{
+ auto iter = conf_map.find(name);
+ if (iter == conf_map.end())
+ return def_val;
+
+ return iter->second.c_str();
+}
+
// Look up 'name' in the (case-insensitively keyed) conf map; returns a
// reference to the stored value, or boost::none when absent. The reference
// is only valid while the map entry exists.
boost::optional<const std::string&> rgw_conf_get_optional(const map<string, string, ltstr_nocase>& conf_map, const std::string& name)
{
  auto iter = conf_map.find(name);
  if (iter == conf_map.end())
    return boost::none;

  return boost::optional<const std::string&>(iter->second);
}
+
// Value of env var 'name', or def_val when unset (may be nullptr if
// def_val is nullptr).
const char *RGWEnv::get(const char *name, const char *def_val) const
{
  return rgw_conf_get(env_map, name, def_val);
}
+
// Value of env var 'name' as an optional reference; boost::none when unset.
boost::optional<const std::string&>
RGWEnv::get_optional(const std::string& name) const
{
  return rgw_conf_get_optional(env_map, name);
}
+
+int rgw_conf_get_int(const map<string, string, ltstr_nocase>& conf_map, const char *name, int def_val)
+{
+ auto iter = conf_map.find(name);
+ if (iter == conf_map.end())
+ return def_val;
+
+ const char *s = iter->second.c_str();
+ return atoi(s);
+}
+
// Value of env var 'name' parsed as an int (atoi semantics), or def_val.
int RGWEnv::get_int(const char *name, int def_val) const
{
  return rgw_conf_get_int(env_map, name, def_val);
}
+
+bool rgw_conf_get_bool(const map<string, string, ltstr_nocase>& conf_map, const char *name, bool def_val)
+{
+ auto iter = conf_map.find(name);
+ if (iter == conf_map.end())
+ return def_val;
+
+ const char *s = iter->second.c_str();
+ return rgw_str_to_bool(s, def_val);
+}
+
// Value of env var 'name' interpreted as a boolean, or def_val.
// NOTE(review): unlike the other getters this one is not const; presumably
// an oversight, but the definition must match the header's declaration.
bool RGWEnv::get_bool(const char *name, bool def_val)
{
  return rgw_conf_get_bool(env_map, name, def_val);
}
+
+size_t RGWEnv::get_size(const char *name, size_t def_val) const
+{
+ const auto iter = env_map.find(name);
+ if (iter == env_map.end())
+ return def_val;
+
+ size_t sz;
+ try{
+ sz = stoull(iter->second);
+ } catch(...){
+ /* it is very unlikely that we'll ever encounter out_of_range, but let's
+ return the default eitherway */
+ sz = def_val;
+ }
+
+ return sz;
+}
+
+bool RGWEnv::exists(const char *name) const
+{
+ return env_map.find(name)!= env_map.end();
+}
+
+bool RGWEnv::exists_prefix(const char *prefix) const
+{
+ if (env_map.empty() || prefix == NULL)
+ return false;
+
+ const auto iter = env_map.lower_bound(prefix);
+ if (iter == env_map.end())
+ return false;
+
+ return (strncmp(iter->first.c_str(), prefix, strlen(prefix)) == 0);
+}
+
+void RGWEnv::remove(const char *name)
+{
+ map<string, string, ltstr_nocase>::iterator iter = env_map.find(name);
+ if (iter != env_map.end())
+ env_map.erase(iter);
+}
+
// Cache the rgw config values this request environment consults.
void RGWConf::init(CephContext *cct)
{
  enable_ops_log = cct->_conf->rgw_enable_ops_log;
  enable_usage_log = cct->_conf->rgw_enable_usage_log;

  // map the rgw_defer_to_bucket_acls config string onto its flag value;
  // any other setting (including empty) leaves deferral disabled
  defer_to_bucket_acls = 0; // default
  if (cct->_conf->rgw_defer_to_bucket_acls == "recurse") {
    defer_to_bucket_acls = RGW_DEFER_TO_BUCKET_ACLS_RECURSE;
  } else if (cct->_conf->rgw_defer_to_bucket_acls == "full_control") {
    defer_to_bucket_acls = RGW_DEFER_TO_BUCKET_ACLS_FULL_CONTROL;
  }
}
diff --git a/src/rgw/rgw_es_main.cc b/src/rgw/rgw_es_main.cc
new file mode 100644
index 000000000..6cfbc9352
--- /dev/null
+++ b/src/rgw/rgw_es_main.cc
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <list>
+#include <string>
+#include <iostream>
+
+#include "global/global_init.h"
+#include "global/global_context.h"
+
+#include "common/ceph_argparse.h"
+#include "common/ceph_json.h"
+#include "rgw_es_query.h"
+
+using namespace std;
+
// Standalone tool: compile an infix metadata-search expression (argv[1], or
// a built-in sample) into the Elasticsearch JSON query rgw would issue, and
// print it to stdout. Returns EINVAL when the expression does not compile.
int main(int argc, char *argv[])
{
  auto args = argv_to_vec(argc, argv);

  auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
			 CODE_ENVIRONMENT_UTILITY, 0);

  common_init_finish(g_ceph_context);

  string expr;

  if (argc > 1) {
    expr = argv[1];
  } else {
    expr = "age >= 30";
  }

  // same custom-attribute prefix rgw uses for S3 user metadata
  ESQueryCompiler es_query(expr, nullptr, "x-amz-meta-");

  // friendly aliases for the canonical ES field names
  map<string, string, ltstr_nocase> aliases = { { "key", "name" },
                                                { "etag", "meta.etag" },
                                                { "size", "meta.size" },
                                                { "mtime", "meta.mtime" },
                                                { "lastmodified", "meta.mtime" },
                                                { "contenttype", "meta.contenttype" },
  };
  es_query.set_field_aliases(&aliases);

  // value types for the well-known (generic) fields
  map<string, ESEntityTypeMap::EntityType> generic_map = { {"bucket", ESEntityTypeMap::ES_ENTITY_STR},
                                                           {"name", ESEntityTypeMap::ES_ENTITY_STR},
                                                           {"instance", ESEntityTypeMap::ES_ENTITY_STR},
                                                           {"meta.etag", ESEntityTypeMap::ES_ENTITY_STR},
                                                           {"meta.contenttype", ESEntityTypeMap::ES_ENTITY_STR},
                                                           {"meta.mtime", ESEntityTypeMap::ES_ENTITY_DATE},
                                                           {"meta.size", ESEntityTypeMap::ES_ENTITY_INT} };
  ESEntityTypeMap gm(generic_map);
  es_query.set_generic_type_map(&gm);

  // value types for custom (x-amz-meta-*) fields
  map<string, ESEntityTypeMap::EntityType> custom_map = { {"str", ESEntityTypeMap::ES_ENTITY_STR},
                                                          {"int", ESEntityTypeMap::ES_ENTITY_INT},
                                                          {"date", ESEntityTypeMap::ES_ENTITY_DATE} };
  ESEntityTypeMap em(custom_map);
  es_query.set_custom_type_map(&em);

  string err;

  bool valid = es_query.compile(&err);
  if (!valid) {
    cout << "failed to compile query: " << err << std::endl;
    return EINVAL;
  }

  // print the resulting ES query JSON
  JSONFormatter f;
  encode_json("root", es_query, &f);

  f.flush(cout);

  return 0;
}
+
diff --git a/src/rgw/rgw_es_query.cc b/src/rgw/rgw_es_query.cc
new file mode 100644
index 000000000..16105d599
--- /dev/null
+++ b/src/rgw/rgw_es_query.cc
@@ -0,0 +1,696 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <list>
+#include <map>
+#include <string>
+#include <iostream>
+#include <boost/algorithm/string.hpp>
+
+#include "common/ceph_json.h"
+#include "rgw_common.h"
+#include "rgw_es_query.h"
+
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
// Move the first element of 'l' into *s and drop it from the list; returns
// false (leaving *s untouched) when the list is empty.
bool pop_front(std::list<std::string>& l, std::string *s)
{
  if (l.empty()) {
    return false;
  }
  *s = l.front();
  l.pop_front();
  return true;
}
+
// Infix operator precedence table: larger values bind tighter
// ("or" < "and" < comparisons). Consumed by infix_to_prefix().
map<string, int> operator_map = {
  { "or", 1 },
  { "and", 2 },
  { "<", 3 },
  { "<=", 3 },
  { "==", 3 },
  { "!=", 3 },
  { ">=", 3 },
  { ">", 3 },
};
+
+bool is_operator(const string& s)
+{
+ return (operator_map.find(s) != operator_map.end());
+}
+
+int operand_value(const string& op)
+{
+ auto i = operator_map.find(op);
+ if (i == operator_map.end()) {
+ return 0;
+ }
+
+ return i->second;
+}
+
// Compare operator precedence: positive when op1 binds tighter than op2,
// zero when equal, negative when looser.
int check_precedence(const string& op1, const string& op2)
{
  return operand_value(op1) - operand_value(op2);
}
+
/*
 * Convert an infix token stream into prefix (Polish) order using the classic
 * two-stack shunting algorithm, pushing output onto the front of the operand
 * stack. Returns false on unbalanced brackets or a malformed expression; on
 * success *out receives the prefix token list. Note: 'source' is consumed
 * (a sentinel ")" is appended).
 */
static bool infix_to_prefix(list<string>& source, list<string> *out)
{
  list<string> operator_stack;
  list<string> operand_stack;

  // wrap the whole expression in a virtual bracket pair so the final ")"
  // flushes every pending operator
  operator_stack.push_front("(");
  source.push_back(")");

  for (string& entity : source) {
    if (entity == "(") {
      operator_stack.push_front(entity);
    } else if (entity == ")") {
      // pop operators down to (and including) the matching "("
      string popped_operator;
      if (!pop_front(operator_stack, &popped_operator)) {
        return false;
      }

      while (popped_operator != "(") {
        operand_stack.push_front(popped_operator);
        if (!pop_front(operator_stack, &popped_operator)) {
          return false;
        }
      }

    } else if (is_operator(entity)) {
      // move operators of higher-or-equal precedence to the operand stack
      // before pushing the new operator
      string popped_operator;
      if (!pop_front(operator_stack, &popped_operator)) {
        return false;
      }

      int precedence = check_precedence(popped_operator, entity);

      while (precedence >= 0) {
        operand_stack.push_front(popped_operator);
        if (!pop_front(operator_stack, &popped_operator)) {
          return false;
        }
        precedence = check_precedence(popped_operator, entity);
      }

      operator_stack.push_front(popped_operator);
      operator_stack.push_front(entity);
    } else {
      // plain operand (field name, value, ...)
      operand_stack.push_front(entity);
    }

  }

  // a balanced expression consumes the sentinel "(" as well
  if (!operator_stack.empty()) {
    return false;
  }

  out->swap(operand_stack);
  return true;
}
+
// Abstract node of the compiled query tree. init() consumes tokens from the
// prefix stack and stores the node to use (which may differ from 'this',
// e.g. when wrapped in a nested query) in *pnode; dump() renders the node
// as the equivalent Elasticsearch query JSON.
class ESQueryNode {
protected:
  ESQueryCompiler *compiler;  // non-owning back-pointer to the compiler
public:
  ESQueryNode(ESQueryCompiler *_compiler) : compiler(_compiler) {}
  virtual ~ESQueryNode() {}

  virtual bool init(ESQueryStack *s, ESQueryNode **pnode, string *perr) = 0;

  virtual void dump(Formatter *f) const = 0;
};
+
+static bool alloc_node(ESQueryCompiler *compiler, ESQueryStack *s, ESQueryNode **pnode, string *perr);
+
// Boolean combinator node ("and"/"or"): owns its two operand subtrees and
// renders as an ES bool query ("must" for and, "should" for or).
class ESQueryNode_Bool : public ESQueryNode {
  string op;
  ESQueryNode *first{nullptr};   // owned; deleted in the destructor
  ESQueryNode *second{nullptr};  // owned; deleted in the destructor
public:
  explicit ESQueryNode_Bool(ESQueryCompiler *compiler) : ESQueryNode(compiler) {}
  ESQueryNode_Bool(ESQueryCompiler *compiler, const string& _op, ESQueryNode *_first, ESQueryNode *_second) :ESQueryNode(compiler), op(_op), first(_first), second(_second) {}
  // pop the operator, then recursively build both operand subtrees from the
  // prefix token stack; on success *pnode is this node
  bool init(ESQueryStack *s, ESQueryNode **pnode, string *perr) override {
    bool valid = s->pop(&op);
    if (!valid) {
      *perr = "incorrect expression";
      return false;
    }
    valid = alloc_node(compiler, s, &first, perr) &&
      alloc_node(compiler, s, &second, perr);
    if (!valid) {
      return false;
    }
    *pnode = this;
    return true;
  }
  virtual ~ESQueryNode_Bool() {
    delete first;
    delete second;
  }

  void dump(Formatter *f) const override {
    f->open_object_section("bool");
    // "and" maps to ES "must", anything else (i.e. "or") to "should"
    const char *section = (op == "and" ? "must" : "should");
    f->open_array_section(section);
    encode_json("entry", *first, f);
    encode_json("entry", *second, f);
    f->close_section();
    f->close_section();
  }

};
+
// Interface for a typed comparison value: parse it from the raw token
// string and emit it as JSON under a given field name.
class ESQueryNodeLeafVal {
public:
  ESQueryNodeLeafVal() = default;
  virtual ~ESQueryNodeLeafVal() {}

  virtual bool init(const string& str_val, string *perr) = 0;
  virtual void encode_json(const string& field, Formatter *f) const = 0;
};
+
// String-typed leaf value (the default when no type mapping applies).
class ESQueryNodeLeafVal_Str : public ESQueryNodeLeafVal {
  string val;
public:
  ESQueryNodeLeafVal_Str() {}
  bool init(const string& str_val, string *perr) override {
    val = str_val;  // strings need no parsing, so this cannot fail
    return true;
  }
  void encode_json(const string& field, Formatter *f) const override {
    ::encode_json(field.c_str(), val.c_str(), f);
  }
};
+
// Integer-typed leaf value; parsed with strict base-10 semantics so that
// trailing garbage is rejected (unlike atoi).
class ESQueryNodeLeafVal_Int : public ESQueryNodeLeafVal {
  int64_t val{0};
public:
  ESQueryNodeLeafVal_Int() {}
  bool init(const string& str_val, string *perr) override {
    string err;
    val = strict_strtoll(str_val.c_str(), 10, &err);
    if (!err.empty()) {
      *perr = string("failed to parse integer: ") + err;
      return false;
    }
    return true;
  }
  void encode_json(const string& field, Formatter *f) const override {
    ::encode_json(field.c_str(), val, f);
  }
};
+
// Date-typed leaf value; parsed via parse_time() and re-emitted in ISO-8601
// form, which is what the ES date fields expect.
class ESQueryNodeLeafVal_Date : public ESQueryNodeLeafVal {
  ceph::real_time val;
public:
  ESQueryNodeLeafVal_Date() {}
  bool init(const string& str_val, string *perr) override {
    if (parse_time(str_val.c_str(), &val) < 0) {
      *perr = string("failed to parse date: ") + str_val;
      return false;
    }
    return true;
  }
  void encode_json(const string& field, Formatter *f) const override {
    string s;
    rgw_to_iso8601(val, &s);
    ::encode_json(field.c_str(), s, f);
  }
};
+
// Common base for comparison nodes: holds the operator, the (unaliased)
// field name and the raw value token, plus the typed leaf value built from
// it. For custom-metadata fields the node gets wrapped in a nested query
// node (see handle_nested), which then becomes the effective root.
class ESQueryNode_Op : public ESQueryNode {
protected:
  string op;
  string field;
  string str_val;
  ESQueryNodeLeafVal *val{nullptr};  // owned; created by val_from_str()
  ESEntityTypeMap::EntityType entity_type{ESEntityTypeMap::ES_ENTITY_NONE};
  bool allow_restricted{false};      // permit querying restricted fields

  // materialize str_val as a leaf value of the mapped entity type
  // (defaults to string when no mapping was found)
  bool val_from_str(string *perr) {
    switch (entity_type) {
      case ESEntityTypeMap::ES_ENTITY_DATE:
	val = new ESQueryNodeLeafVal_Date;
	break;
      case ESEntityTypeMap::ES_ENTITY_INT:
	val = new ESQueryNodeLeafVal_Int;
	break;
      default:
	val = new ESQueryNodeLeafVal_Str;
    }
    return val->init(str_val, perr);
  }
  // resolve field aliases, wrap custom-metadata fields in a nested node,
  // then parse the value; *pnode receives the effective root for this
  // comparison (this node, or the nested wrapper)
  bool do_init(ESQueryNode **pnode, string *perr) {
    field = compiler->unalias_field(field);
    ESQueryNode *effective_node;
    if (!handle_nested(&effective_node, perr)) {
      return false;
    }
    if (!val_from_str(perr)) {
      return false;
    }
    *pnode = effective_node;
    return true;
  }

public:
  ESQueryNode_Op(ESQueryCompiler *compiler) : ESQueryNode(compiler) {}
  ~ESQueryNode_Op() {
    delete val;
  }
  // pop "<op> <val> <field>" off the prefix stack, then initialize
  virtual bool init(ESQueryStack *s, ESQueryNode **pnode, string *perr) override {
    bool valid = s->pop(&op) &&
      s->pop(&str_val) &&
      s->pop(&field);
    if (!valid) {
      *perr = "invalid expression";
      return false;
    }
    return do_init(pnode, perr);
  }
  bool handle_nested(ESQueryNode **pnode, string *perr);

  void set_allow_restricted(bool allow) {
    allow_restricted = allow;
  }

  virtual void dump(Formatter *f) const override = 0;
};
+
// Equality comparison; renders as an ES "term" query. The field/value
// constructor is used for the pre-seeded conditions the compiler prepends,
// in which case init() is invoked with a null stack.
class ESQueryNode_Op_Equal : public ESQueryNode_Op {
public:
  explicit ESQueryNode_Op_Equal(ESQueryCompiler *compiler) : ESQueryNode_Op(compiler) {}
  ESQueryNode_Op_Equal(ESQueryCompiler *compiler, const string& f, const string& v) : ESQueryNode_Op(compiler) {
    op = "==";
    field = f;
    str_val = v;
  }

  // 'op' is already set when constructed from field/value, so only pop
  // tokens from the stack when default-constructed
  bool init(ESQueryStack *s, ESQueryNode **pnode, string *perr) override {
    if (op.empty()) {
      return ESQueryNode_Op::init(s, pnode, perr);
    }
    return do_init(pnode, perr);
  }

  virtual void dump(Formatter *f) const override {
    f->open_object_section("term");
    val->encode_json(field, f);
    f->close_section();
  }
};
+
// Inequality comparison; renders as an ES bool/must_not wrapping a "term"
// query. Mirrors ESQueryNode_Op_Equal's two construction paths.
class ESQueryNode_Op_NotEqual : public ESQueryNode_Op {
public:
  explicit ESQueryNode_Op_NotEqual(ESQueryCompiler *compiler) : ESQueryNode_Op(compiler) {}
  ESQueryNode_Op_NotEqual(ESQueryCompiler *compiler, const string& f, const string& v) : ESQueryNode_Op(compiler) {
    op = "!=";
    field = f;
    str_val = v;
  }

  // 'op' is already set when constructed from field/value, so only pop
  // tokens from the stack when default-constructed
  bool init(ESQueryStack *s, ESQueryNode **pnode, string *perr) override {
    if (op.empty()) {
      return ESQueryNode_Op::init(s, pnode, perr);
    }
    return do_init(pnode, perr);
  }

  virtual void dump(Formatter *f) const override {
    f->open_object_section("bool");
    f->open_object_section("must_not");
    f->open_object_section("term");
    val->encode_json(field, f);
    f->close_section();
    f->close_section();
    f->close_section();
  }
};
+
// Range comparison (<, <=, >=, >); range_str is the ES range keyword
// ("lt"/"lte"/"gte"/"gt") chosen by alloc_node().
class ESQueryNode_Op_Range : public ESQueryNode_Op {
  string range_str;
public:
  ESQueryNode_Op_Range(ESQueryCompiler *compiler, const string& rs) : ESQueryNode_Op(compiler), range_str(rs) {}

  virtual void dump(Formatter *f) const override {
    f->open_object_section("range");
    f->open_object_section(field.c_str());
    val->encode_json(range_str, f);
    f->close_section();
    f->close_section();
  }
};
+
// Abstract parent for nested custom-metadata query nodes; exposes the leaf
// field name ("meta.custom-<type>.value") the wrapped comparison must use.
class ESQueryNode_Op_Nested_Parent : public ESQueryNode_Op {
public:
  ESQueryNode_Op_Nested_Parent(ESQueryCompiler *compiler) : ESQueryNode_Op(compiler) {}

  virtual string get_custom_leaf_field_name() = 0;
};
+
// Nested query over the typed custom-metadata array ("meta.custom-<type>"):
// matches entries whose .name equals 'name' AND whose .value satisfies the
// wrapped comparison node 'next' (owned). T selects the value type
// (string/int64_t/ceph::real_time) via the type_str() specializations.
template <class T>
class ESQueryNode_Op_Nested : public ESQueryNode_Op_Nested_Parent {
  string name;
  ESQueryNode *next;  // owned wrapped comparison; deleted in the destructor
public:
  ESQueryNode_Op_Nested(ESQueryCompiler *compiler, const string& _name, ESQueryNode *_next) : ESQueryNode_Op_Nested_Parent(compiler),
                                                                                              name(_name), next(_next) {}
  ~ESQueryNode_Op_Nested() {
    delete next;
  }

  virtual void dump(Formatter *f) const override {
    f->open_object_section("nested");
    string s = string("meta.custom-") + type_str();
    encode_json("path", s.c_str(), f);
    f->open_object_section("query");
    f->open_object_section("bool");
    f->open_array_section("must");
    // first must-entry: match on the custom attribute's name
    f->open_object_section("entry");
    f->open_object_section("match");
    string n = s + ".name";
    encode_json(n.c_str(), name.c_str(), f);
    f->close_section();
    f->close_section();
    // second must-entry: the wrapped comparison on the attribute's value
    encode_json("entry", *next, f);
    f->close_section();
    f->close_section();
    f->close_section();
    f->close_section();
  }

  string type_str() const;
  string get_custom_leaf_field_name() override {
    return string("meta.custom-") + type_str() + ".value";
  }
};
+
// Per-type suffix used to form the custom-metadata path
// ("meta.custom-string" / "meta.custom-int" / "meta.custom-date").
template<>
string ESQueryNode_Op_Nested<string>::type_str() const {
  return "string";
}

template<>
string ESQueryNode_Op_Nested<int64_t>::type_str() const {
  return "int";
}

template<>
string ESQueryNode_Op_Nested<ceph::real_time>::type_str() const {
  return "date";
}
+
// Decide whether this comparison targets a generic field or a custom
// metadata field. Generic fields: validate against the generic type map and
// the restricted-field list; *pnode stays this node. Custom fields (those
// starting with the custom prefix, e.g. "x-amz-meta-"): strip the prefix,
// look up the value type, and wrap this node in a typed
// ESQueryNode_Op_Nested which takes ownership of it and becomes the new
// root; this node's 'field' is redirected to the nested leaf field.
bool ESQueryNode_Op::handle_nested(ESQueryNode **pnode, string *perr)
{
  string field_name = field;
  const string& custom_prefix = compiler->get_custom_prefix();
  if (!boost::algorithm::starts_with(field_name, custom_prefix)) {
    *pnode = this;
    auto m = compiler->get_generic_type_map();
    if (m) {
      // reject unknown fields, and restricted ones unless explicitly allowed
      bool found = m->find(field_name, &entity_type) &&
        (allow_restricted || !compiler->is_restricted(field_name));
      if (!found) {
        *perr = string("unexpected generic field '") + field_name + "'";
      }
      return found;
    }
    *perr = "query parser does not support generic types";
    return false;
  }

  field_name = field_name.substr(custom_prefix.size());
  auto m = compiler->get_custom_type_map();
  if (m) {
    m->find(field_name, &entity_type);
    /* ignoring returned bool, for now just treat it as string */
  }

  // wrap this node in a nested query matching the custom attribute's type
  ESQueryNode_Op_Nested_Parent *new_node;
  switch (entity_type) {
    case ESEntityTypeMap::ES_ENTITY_INT:
      new_node = new ESQueryNode_Op_Nested<int64_t>(compiler, field_name, this);
      break;
    case ESEntityTypeMap::ES_ENTITY_DATE:
      new_node = new ESQueryNode_Op_Nested<ceph::real_time>(compiler, field_name, this);
      break;
    default:
      new_node = new ESQueryNode_Op_Nested<string>(compiler, field_name, this);
  }

  field = new_node->get_custom_leaf_field_name();
  *pnode = new_node;

  return true;
}
+
// True for the two boolean connectives the query language supports.
static bool is_bool_op(const std::string& str)
{
  return str == "or" || str == "and";
}
+
// Node factory: peek at the next prefix token and construct the matching
// node type (bool combinator, equality, inequality, or range), then let the
// node initialize itself from the stack. On failure the partially-built
// node is destroyed and *perr describes the problem.
static bool alloc_node(ESQueryCompiler *compiler, ESQueryStack *s, ESQueryNode **pnode, string *perr)
{
  string op;
  bool valid = s->peek(&op);
  if (!valid) {
    *perr = "incorrect expression";
    return false;
  }

  ESQueryNode *node;

  if (is_bool_op(op)) {
    node = new ESQueryNode_Bool(compiler);
  } else if (op == "==") {
    node = new ESQueryNode_Op_Equal(compiler);
  } else if (op == "!=") {
    node = new ESQueryNode_Op_NotEqual(compiler);
  } else {
    // remaining operators map onto ES range keywords
    static map<string, string> range_op_map = {
      { "<", "lt"},
      { "<=", "lte"},
      { ">=", "gte"},
      { ">", "gt"},
    };

    auto iter = range_op_map.find(op);
    if (iter == range_op_map.end()) {
      *perr = string("invalid operator: ") + op;
      return false;
    }

    node = new ESQueryNode_Op_Range(compiler, iter->second);
  }

  if (!node->init(s, pnode, perr)) {
    delete node;
    return false;
  }
  return true;
}
+
+
// A key (field name) may contain any ASCII character except the HTTP
// header-field separators and whitespace listed below.
bool is_key_char(char c)
{
  static constexpr std::string_view separators = "()<>!@,;:\\\"/[]?={} \t";
  if (separators.find(c) != std::string_view::npos) {
    return false;
  }
  return (isascii(c) > 0);
}
+
// Characters that may appear in a comparison-operator token.
static bool is_op_char(char c)
{
  return c == '!' || c == '<' || c == '=' || c == '>';
}
+
// A value token runs until whitespace or a closing bracket.
static bool is_val_char(char c)
{
  return !isspace(c) && (c != ')');
}
+
+void ESInfixQueryParser::skip_whitespace(const char *str, int size, int& pos) {
+ while (pos < size && isspace(str[pos])) {
+ ++pos;
+ }
+}
+
+bool ESInfixQueryParser::get_next_token(bool (*filter)(char)) {
+ skip_whitespace(str, size, pos);
+ int token_start = pos;
+ while (pos < size && filter(str[pos])) {
+ ++pos;
+ }
+ if (pos == token_start) {
+ return false;
+ }
+ string token = string(str + token_start, pos - token_start);
+ args.push_back(token);
+ return true;
+}
+
+bool ESInfixQueryParser::parse_condition() {
+ /*
+ * condition: <key> <operator> <val>
+ *
+ * whereas key: needs to conform to http header field restrictions
+ * operator: one of the following: < <= == != >= >
+ * val: ascii, terminated by either space or ')' (or end of string)
+ */
+
+ /* parse key */
+ bool valid = get_next_token(is_key_char) &&
+ get_next_token(is_op_char) &&
+ get_next_token(is_val_char);
+
+ if (!valid) {
+ return false;
+ }
+
+ return true;
+}
+
// Try to consume an "and"/"or" connective at the current position, pushing
// it as a token on success.
// NOTE(review): only a prefix is checked — input such as "andx" or "order"
// would also match here; presumably the subsequent condition parse rejects
// such expressions, but confirm before relying on it.
bool ESInfixQueryParser::parse_and_or() {
  skip_whitespace(str, size, pos);
  if (pos + 3 <= size && strncmp(str + pos, "and", 3) == 0) {
    pos += 3;
    args.push_back("and");
    return true;
  }

  if (pos + 2 <= size && strncmp(str + pos, "or", 2) == 0) {
    pos += 2;
    args.push_back("or");
    return true;
  }

  return false;
}
+
+bool ESInfixQueryParser::parse_specific_char(const char *pchar) {
+ skip_whitespace(str, size, pos);
+ if (pos >= size) {
+ return false;
+ }
+ if (str[pos] != *pchar) {
+ return false;
+ }
+
+ args.push_back(pchar);
+ ++pos;
+ return true;
+}
+
// Consume a '(' token if present.
bool ESInfixQueryParser::parse_open_bracket() {
  return parse_specific_char("(");
}

// Consume a ')' token if present.
bool ESInfixQueryParser::parse_close_bracket() {
  return parse_specific_char(")");
}
+
bool ESInfixQueryParser::parse(list<string> *result) {
  /*
   * expression: [(]<condition>[[and/or]<condition>][)][and/or]...
   *
   * brackets and connectives are optional at each step; only a malformed
   * condition aborts the parse. On success the token list is moved into
   * *result.
   */

  while (pos < size) {
    parse_open_bracket();
    if (!parse_condition()) {
      return false;
    }
    parse_close_bracket();
    parse_and_or();
  }

  result->swap(args);

  return true;
}
+
// Turn the infix token list into a compiled node tree rooted at query_root.
// Fails when the expression cannot be rearranged into prefix form, or when
// tokens remain after the tree is built (i.e. trailing garbage).
bool ESQueryCompiler::convert(list<string>& infix, string *perr) {
  list<string> prefix;
  if (!infix_to_prefix(infix, &prefix)) {
    *perr = "invalid query";
    return false;
  }
  stack.assign(prefix);
  if (!alloc_node(this, &stack, &query_root, perr)) {
    return false;
  }
  if (!stack.done()) {
    *perr = "invalid query";
    return false;
  }
  return true;
}
+
// Destroy the compiled tree; nodes recursively delete their children.
ESQueryCompiler::~ESQueryCompiler() {
  delete query_root;
}
+
// Parse and compile the query expression, then AND in the pre-seeded
// equality conditions supplied at construction (e.g. bucket filters). Each
// condition becomes a restricted-field-capable equality node wrapping the
// current root in a new boolean "and" node.
bool ESQueryCompiler::compile(string *perr) {
  list<string> infix;
  if (!parser.parse(&infix)) {
    *perr = "failed to parse query";
    return false;
  }

  if (!convert(infix, perr)) {
    return false;
  }

  for (auto& c : eq_conds) {
    ESQueryNode_Op_Equal *eq_node = new ESQueryNode_Op_Equal(this, c.first, c.second);
    eq_node->set_allow_restricted(true); /* can access restricted fields */
    ESQueryNode *effective_node;
    if (!eq_node->init(nullptr, &effective_node, perr)) {
      // init with preset field/value never touches the (null) stack
      delete eq_node;
      return false;
    }
    query_root = new ESQueryNode_Bool(this, "and", effective_node, query_root);
  }

  return true;
}
+
// Emit the compiled query as JSON under a "query" key.
// NOTE(review): assumes compile() already succeeded — query_root is nullptr
// before that and would be dereferenced here.
void ESQueryCompiler::dump(Formatter *f) const {
  encode_json("query", *query_root, f);
}
+
diff --git a/src/rgw/rgw_es_query.h b/src/rgw/rgw_es_query.h
new file mode 100644
index 000000000..f96e06f75
--- /dev/null
+++ b/src/rgw/rgw_es_query.h
@@ -0,0 +1,164 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_string.h"
+
// A forward-only token stream over a list of strings. The stack takes
// ownership of the source list (it is swapped in) and exposes peek/pop of
// the front token plus an end-of-input test.
class ESQueryStack {
  std::list<std::string> l;
  std::list<std::string>::iterator iter;

public:
  explicit ESQueryStack(std::list<std::string>& src) {
    assign(src);
  }

  // start out empty; initializing iter keeps done() well-defined even
  // before assign() is called (it was previously left uninitialized)
  ESQueryStack() : iter(l.begin()) {}

  // take over src's contents (src is left empty) and rewind
  void assign(std::list<std::string>& src) {
    l.swap(src);
    iter = l.begin();
  }

  // copy the current token into *dest without consuming it;
  // false at end of input
  bool peek(std::string *dest) {
    if (done()) {
      return false;
    }
    *dest = *iter;
    return true;
  }

  // copy the current token into *dest and advance past it;
  // false at end of input
  bool pop(std::string *dest) {
    bool valid = peek(dest);
    if (!valid) {
      return false;
    }
    ++iter;
    return true;
  }

  bool done() {
    return (iter == l.end());
  }
};
+
// Tokenizer for the infix metadata-search language: splits a query such as
// "name == foo and size >= 10" into a flat token list that ESQueryCompiler
// then compiles into an ES query tree.
class ESInfixQueryParser {
  std::string query;            // owned copy of the query text
  int size;                     // length of 'query' (narrowed from size_t)
  const char *str;              // points into 'query'
  int pos{0};                   // current scan position
  std::list<std::string> args;  // tokens accumulated so far

  void skip_whitespace(const char *str, int size, int& pos);
  bool get_next_token(bool (*filter)(char));

  bool parse_condition();
  bool parse_and_or();
  bool parse_specific_char(const char *pchar);
  bool parse_open_bracket();
  bool parse_close_bracket();

public:
  explicit ESInfixQueryParser(const std::string& _query) : query(_query), size(query.size()), str(query.c_str()) {}
  bool parse(std::list<std::string> *result);
};
+
+class ESQueryNode;
+
// Maps entity (field) names to their ES value type; used by the query
// compiler to decide how to encode a comparison literal for a given field.
struct ESEntityTypeMap {
  enum EntityType {
    ES_ENTITY_NONE = 0,
    ES_ENTITY_STR = 1,
    ES_ENTITY_INT = 2,
    ES_ENTITY_DATE = 3,
  };

  std::map<std::string, EntityType> m;

  explicit ESEntityTypeMap(std::map<std::string, EntityType>& _m) : m(_m) {}

  // Look up 'entity': on a hit store its type in *ptype and return true;
  // on a miss *ptype is set to ES_ENTITY_NONE and false is returned.
  bool find(const std::string& entity, EntityType *ptype) {
    auto it = m.find(entity);
    if (it == m.end()) {
      *ptype = ES_ENTITY_NONE;
      return false;
    }
    *ptype = it->second;
    return true;
  }
};
+
// Compiles an infix metadata-search expression into a tree of ESQueryNodes
// that serializes (via dump) to an Elasticsearch query. Configuration hooks
// (type maps, aliases, restricted fields) are non-owning pointers supplied
// by the caller and must outlive compile()/dump().
class ESQueryCompiler {
  ESInfixQueryParser parser;
  ESQueryStack stack;
  ESQueryNode *query_root{nullptr};  // owned; built by compile()

  std::string custom_prefix;  // prefix marking custom metadata fields

  bool convert(std::list<std::string>& infix, std::string *perr);

  // equality conditions ANDed onto the user query by compile()
  std::list<std::pair<std::string, std::string> > eq_conds;

  ESEntityTypeMap *generic_type_map{nullptr};  // non-owning
  ESEntityTypeMap *custom_type_map{nullptr};   // non-owning

  std::map<std::string, std::string, ltstr_nocase> *field_aliases = nullptr;  // non-owning
  std::set<std::string> *restricted_fields = nullptr;                         // non-owning

public:
  // prepend_eq_conds, when non-null, is drained into eq_conds
  ESQueryCompiler(const std::string& query,
                  std::list<std::pair<std::string, std::string> > *prepend_eq_conds,
                  const std::string& _custom_prefix)
    : parser(query), custom_prefix(_custom_prefix) {
    if (prepend_eq_conds) {
      eq_conds = std::move(*prepend_eq_conds);
    }
  }
  ~ESQueryCompiler();

  bool compile(std::string *perr);
  void dump(Formatter *f) const;

  void set_generic_type_map(ESEntityTypeMap *entity_map) {
    generic_type_map = entity_map;
  }

  ESEntityTypeMap *get_generic_type_map() {
    return generic_type_map;
  }
  const std::string& get_custom_prefix() { return custom_prefix; }

  void set_custom_type_map(ESEntityTypeMap *entity_map) {
    custom_type_map = entity_map;
  }

  ESEntityTypeMap *get_custom_type_map() {
    return custom_type_map;
  }

  void set_field_aliases(std::map<std::string, std::string, ltstr_nocase> *fa) {
    field_aliases = fa;
  }

  // translate a user-facing alias to its canonical ES field name; returns
  // the input unchanged when no alias table is set or no alias matches
  std::string unalias_field(const std::string& field) {
    if (!field_aliases) {
      return field;
    }
    auto i = field_aliases->find(field);
    if (i == field_aliases->end()) {
      return field;
    }

    return i->second;
  }

  void set_restricted_fields(std::set<std::string> *rf) {
    restricted_fields = rf;
  }

  // true when 'f' may only be queried by nodes with allow_restricted set
  bool is_restricted(const std::string& f) {
    return (restricted_fields && restricted_fields->find(f) != restricted_fields->end());
  }
};
diff --git a/src/rgw/rgw_file.cc b/src/rgw/rgw_file.cc
new file mode 100644
index 000000000..ee32170a1
--- /dev/null
+++ b/src/rgw/rgw_file.cc
@@ -0,0 +1,2787 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "include/compat.h"
+#include "include/rados/rgw_file.h"
+
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include "rgw_lib.h"
+#include "rgw_resolve.h"
+#include "rgw_op.h"
+#include "rgw_rest.h"
+#include "rgw_acl.h"
+#include "rgw_acl_s3.h"
+#include "rgw_frontend.h"
+#include "rgw_request.h"
+#include "rgw_process.h"
+#include "rgw_rest_user.h"
+#include "rgw_rest_s3.h"
+#include "rgw_os_lib.h"
+#include "rgw_auth_s3.h"
+#include "rgw_user.h"
+#include "rgw_bucket.h"
+#include "rgw_zone.h"
+#include "rgw_file.h"
+#include "rgw_lib_frontend.h"
+#include "rgw_perf_counters.h"
+#include "common/errno.h"
+
+#include "services/svc_zone.h"
+
+#include <atomic>
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+using namespace rgw;
+
+namespace rgw {
+
+ /* name of the synthetic filesystem root handle */
+ const string RGWFileHandle::root_name = "/";
+
+ /* monotonic id source for RGWLibFS instances */
+ std::atomic<uint32_t> RGWLibFS::fs_inst_counter;
+
+ /* seconds between write-completion sweeps */
+ uint32_t RGWLibFS::write_completion_interval_s = 10;
+
+ /* shared timer, constructed in the suspended state */
+ ceph::timer<ceph::mono_clock> RGWLibFS::write_timer{
+ ceph::construct_suspended};
+
+ /* Validate a bucket (root-level directory) name against strict S3
+ * naming rules.  Returns 0 when valid, -ENAMETOOLONG when the name
+ * exceeds 255 bytes, otherwise -EINVAL. */
+ inline int valid_fs_bucket_name(const string& name) {
+ const int err = valid_s3_bucket_name(name, false /* relaxed */);
+ if (err == 0) {
+ return 0;
+ }
+ return (name.size() > 255) ? -ENAMETOOLONG : -EINVAL;
+ }
+
+ /* Validate an object (file/leaf) name against S3 object naming rules.
+ * Returns 0 when valid, -ENAMETOOLONG when the name exceeds 1024
+ * bytes, otherwise -EINVAL. */
+ inline int valid_fs_object_name(const string& name) {
+ const int err = valid_s3_object_name(name);
+ if (err == 0) {
+ return 0;
+ }
+ return (name.size() > 1024) ? -ENAMETOOLONG : -EINVAL;
+ }
+
+ /* hash functor over rgw_xattrstr keys (XXH64 over the raw bytes with a
+ * fixed seed) for the rgw_exposed_attrs unordered_set */
+ class XattrHash
+ {
+ public:
+ std::size_t operator()(const rgw_xattrstr& att) const noexcept {
+ return XXH64(att.val, att.len, 5882300);
+ }
+ };
+
+ /* equality functor over rgw_xattrstr keys: equal iff the lengths match
+ * and the first len bytes compare equal (strncmp bound) */
+ class XattrEqual
+ {
+ public:
+ bool operator()(const rgw_xattrstr& lhs, const rgw_xattrstr& rhs) const {
+ if (lhs.len != rhs.len) {
+ return false;
+ }
+ return (strncmp(lhs.val, rhs.val, lhs.len) == 0);
+ }
+ };
+
+ /* well-known attributes: keys listed here are surfaced to clients
+ * verbatim rather than being treated as prefixed user metadata
+ * (currently only the ETag) */
+ static const std::unordered_set<
+ rgw_xattrstr, XattrHash, XattrEqual> rgw_exposed_attrs = {
+ rgw_xattrstr{const_cast<char*>(RGW_ATTR_ETAG), sizeof(RGW_ATTR_ETAG)-1}
+ };
+
+ /* true iff k is a well-known attribute exposed under its rgw name */
+ static inline bool is_exposed_attr(const rgw_xattrstr& k) {
+ return (rgw_exposed_attrs.find(k) != rgw_exposed_attrs.end());
+ }
+
+ /* Stat bucket `path` under `parent` and, if it exists, create/find its
+ * cached handle.  On a match the handle's times and stored Unix attrs
+ * are restored; if FLAG_LOCKED is not passed the handle mutex is taken
+ * locally for the update.  Returns {nullptr, 0} when the bucket does
+ * not exist or the request fails. */
+ LookupFHResult RGWLibFS::stat_bucket(RGWFileHandle* parent, const char *path,
+ RGWLibFS::BucketStats& bs,
+ uint32_t flags)
+ {
+ LookupFHResult fhr{nullptr, 0};
+ std::string bucket_name{path};
+ RGWStatBucketRequest req(cct, user->clone(), bucket_name, bs);
+
+ int rc = g_rgwlib->get_fe()->execute_req(&req);
+ if ((rc == 0) &&
+ (req.get_ret() == 0) &&
+ (req.matched())) {
+ /* propagate caller's lock state into the lookup */
+ fhr = lookup_fh(parent, path,
+ (flags & RGWFileHandle::FLAG_LOCKED)|
+ RGWFileHandle::FLAG_CREATE|
+ RGWFileHandle::FLAG_BUCKET);
+ if (get<0>(fhr)) {
+ RGWFileHandle* rgw_fh = get<0>(fhr);
+ if (! (flags & RGWFileHandle::FLAG_LOCKED)) {
+ rgw_fh->mtx.lock();
+ }
+ rgw_fh->set_times(req.get_ctime());
+ /* restore attributes */
+ auto ux_key = req.get_attr(RGW_ATTR_UNIX_KEY1);
+ auto ux_attrs = req.get_attr(RGW_ATTR_UNIX1);
+ if (ux_key && ux_attrs) {
+ DecodeAttrsResult dar = rgw_fh->decode_attrs(ux_key, ux_attrs);
+ if (get<0>(dar) || get<1>(dar)) {
+ /* attrs were stale/partial; push refreshed attrs back out */
+ update_fh(rgw_fh);
+ }
+ }
+ if (! (flags & RGWFileHandle::FLAG_LOCKED)) {
+ rgw_fh->mtx.unlock();
+ }
+ }
+ }
+ return fhr;
+ }
+
+ /* Synthesize a minimal handle for <parent>/<path> from caller-supplied
+ * type and optional stat fields, without any round-trip to the
+ * cluster.  Only size and mtime are honored from st, per st_mask. */
+ LookupFHResult RGWLibFS::fake_leaf(RGWFileHandle* parent,
+ const char *path,
+ enum rgw_fh_type type,
+ struct stat *st, uint32_t st_mask,
+ uint32_t flags)
+ {
+ using std::get;
+
+ flags |= RGWFileHandle::FLAG_CREATE;
+ if (type == RGW_FS_TYPE_DIRECTORY) {
+ flags |= RGWFileHandle::FLAG_DIRECTORY;
+ }
+
+ LookupFHResult fhr = lookup_fh(parent, path, flags);
+ RGWFileHandle* leaf_fh = get<0>(fhr);
+ if (leaf_fh && st) {
+ lock_guard fh_guard(leaf_fh->mtx);
+ if (st_mask & RGW_SETATTR_SIZE) {
+ leaf_fh->set_size(st->st_size);
+ }
+ if (st_mask & RGW_SETATTR_MTIME) {
+ leaf_fh->set_times(st->st_mtim);
+ }
+ }
+ return fhr;
+ } /* RGWLibFS::fake_leaf */
+
+ /* Resolve a leaf name under `parent` by probing, in order:
+ * (0) the plain object "<name>" (skipped when a directory is hinted),
+ * (1) the directory marker "<name>/" (skipped when a file is hinted),
+ * (2) a prefix listing (RGWStatLeafRequest) for names that exist only
+ * as path segments of deeper objects.
+ * The first hit wins (goto done).  Returns {nullptr, 0} on no match. */
+ LookupFHResult RGWLibFS::stat_leaf(RGWFileHandle* parent,
+ const char *path,
+ enum rgw_fh_type type,
+ uint32_t flags)
+ {
+ /* find either-of <object_name>, <object_name/>, only one of
+ * which should exist; atomicity? */
+ using std::get;
+
+ LookupFHResult fhr{nullptr, 0};
+
+ /* XXX the need for two round-trip operations to identify file or
+ * directory leaf objects is unecessary--the current proposed
+ * mechanism to avoid this is to store leaf object names with an
+ * object locator w/o trailing slash */
+
+ std::string obj_path = parent->format_child_name(path, false);
+
+ for (auto ix : { 0, 1, 2 }) {
+ switch (ix) {
+ case 0:
+ {
+ /* probe the plain object form */
+ /* type hint */
+ if (type == RGW_FS_TYPE_DIRECTORY)
+ continue;
+
+ RGWStatObjRequest req(cct, user->clone(),
+ parent->bucket_name(), obj_path,
+ RGWStatObjRequest::FLAG_NONE);
+ int rc = g_rgwlib->get_fe()->execute_req(&req);
+ if ((rc == 0) &&
+ (req.get_ret() == 0)) {
+ fhr = lookup_fh(parent, path, RGWFileHandle::FLAG_CREATE);
+ if (get<0>(fhr)) {
+ RGWFileHandle* rgw_fh = get<0>(fhr);
+ lock_guard guard(rgw_fh->mtx);
+ rgw_fh->set_size(req.get_size());
+ rgw_fh->set_times(req.get_mtime());
+ /* restore attributes */
+ auto ux_key = req.get_attr(RGW_ATTR_UNIX_KEY1);
+ auto ux_attrs = req.get_attr(RGW_ATTR_UNIX1);
+ rgw_fh->set_etag(*(req.get_attr(RGW_ATTR_ETAG)));
+ rgw_fh->set_acls(*(req.get_attr(RGW_ATTR_ACL)));
+ /* skip the decode/update when called from a readdir
+ * callback (FLAG_IN_CB) to avoid nested requests */
+ if (!(flags & RGWFileHandle::FLAG_IN_CB) &&
+ ux_key && ux_attrs) {
+ DecodeAttrsResult dar = rgw_fh->decode_attrs(ux_key, ux_attrs);
+ if (get<0>(dar) || get<1>(dar)) {
+ update_fh(rgw_fh);
+ }
+ }
+ }
+ goto done;
+ }
+ }
+ break;
+ case 1:
+ {
+ /* try dir form */
+ /* type hint */
+ if (type == RGW_FS_TYPE_FILE)
+ continue;
+
+ obj_path += "/";
+ RGWStatObjRequest req(cct, user->clone(),
+ parent->bucket_name(), obj_path,
+ RGWStatObjRequest::FLAG_NONE);
+ int rc = g_rgwlib->get_fe()->execute_req(&req);
+ if ((rc == 0) &&
+ (req.get_ret() == 0)) {
+ fhr = lookup_fh(parent, path, RGWFileHandle::FLAG_DIRECTORY);
+ if (get<0>(fhr)) {
+ RGWFileHandle* rgw_fh = get<0>(fhr);
+ lock_guard guard(rgw_fh->mtx);
+ rgw_fh->set_size(req.get_size());
+ rgw_fh->set_times(req.get_mtime());
+ /* restore attributes */
+ auto ux_key = req.get_attr(RGW_ATTR_UNIX_KEY1);
+ auto ux_attrs = req.get_attr(RGW_ATTR_UNIX1);
+ rgw_fh->set_etag(*(req.get_attr(RGW_ATTR_ETAG)));
+ rgw_fh->set_acls(*(req.get_attr(RGW_ATTR_ACL)));
+ if (!(flags & RGWFileHandle::FLAG_IN_CB) &&
+ ux_key && ux_attrs) {
+ DecodeAttrsResult dar = rgw_fh->decode_attrs(ux_key, ux_attrs);
+ if (get<0>(dar) || get<1>(dar)) {
+ update_fh(rgw_fh);
+ }
+ }
+ }
+ goto done;
+ }
+ }
+ break;
+ case 2:
+ {
+ /* last resort: prefix listing to detect implied entries */
+ std::string object_name{path};
+ RGWStatLeafRequest req(cct, user->clone(),
+ parent, object_name);
+ int rc = g_rgwlib->get_fe()->execute_req(&req);
+ if ((rc == 0) &&
+ (req.get_ret() == 0)) {
+ if (req.matched) {
+ /* we need rgw object's key name equal to file name, if
+ * not return NULL */
+ if ((flags & RGWFileHandle::FLAG_EXACT_MATCH) &&
+ !req.exact_matched) {
+ lsubdout(get_context(), rgw, 15)
+ << __func__
+ << ": stat leaf not exact match file name = "
+ << path << dendl;
+ goto done;
+ }
+ fhr = lookup_fh(parent, path,
+ RGWFileHandle::FLAG_CREATE|
+ ((req.is_dir) ?
+ RGWFileHandle::FLAG_DIRECTORY :
+ RGWFileHandle::FLAG_NONE));
+ /* XXX we don't have an object--in general, there need not
+ * be one (just a path segment in some other object). In
+ * actual leaf an object exists, but we'd need another round
+ * trip to get attrs */
+ if (get<0>(fhr)) {
+ /* for now use the parent object's mtime */
+ RGWFileHandle* rgw_fh = get<0>(fhr);
+ lock_guard guard(rgw_fh->mtx);
+ rgw_fh->set_mtime(parent->get_mtime());
+ }
+ }
+ }
+ }
+ break;
+ default:
+ /* not reached */
+ break;
+ }
+ }
+ done:
+ return fhr;
+ } /* RGWLibFS::stat_leaf */
+
+ /* Read up to `length` bytes at `offset` from a regular-file object
+ * into `buffer`.  On success updates the handle's atime under its
+ * mutex and stores the transferred byte count in *bytes_read.
+ * Returns 0 or a negative errno. */
+ int RGWLibFS::read(RGWFileHandle* rgw_fh, uint64_t offset, size_t length,
+ size_t* bytes_read, void* buffer, uint32_t flags)
+ {
+ if (! rgw_fh->is_file()) {
+ return -EINVAL;
+ }
+ if (rgw_fh->deleted()) {
+ return -ESTALE;
+ }
+
+ RGWReadRequest req(get_context(), user->clone(), rgw_fh, offset, length, buffer);
+
+ int ret = g_rgwlib->get_fe()->execute_req(&req);
+ if (ret == 0) {
+ ret = req.get_ret();
+ if (ret == 0) {
+ lock_guard guard(rgw_fh->mtx);
+ rgw_fh->set_atime(real_clock::to_timespec(real_clock::now()));
+ *bytes_read = req.nread;
+ }
+ }
+
+ return ret;
+ }
+
+ /* Read the target of a symbolic-link object into `buffer`; mirrors
+ * RGWLibFS::read but requires a symlink handle.  On success updates
+ * atime and stores the transferred byte count in *bytes_read.
+ *
+ * Fix: the original wrote "lock_guard(rgw_fh->mtx);", which (via CTAD)
+ * constructs an unnamed temporary that locks and immediately unlocks
+ * the mutex, leaving the atime/nread updates unprotected.  Use a named
+ * guard held for the whole update, exactly as read() does. */
+ int RGWLibFS::readlink(RGWFileHandle* rgw_fh, uint64_t offset, size_t length,
+ size_t* bytes_read, void* buffer, uint32_t flags)
+ {
+ if (! rgw_fh->is_link())
+ return -EINVAL;
+
+ if (rgw_fh->deleted())
+ return -ESTALE;
+
+ RGWReadRequest req(get_context(), user->clone(), rgw_fh, offset, length, buffer);
+
+ int rc = g_rgwlib->get_fe()->execute_req(&req);
+ if ((rc == 0) &&
+ ((rc = req.get_ret()) == 0)) {
+ lock_guard guard(rgw_fh->mtx); /* hold for the whole update */
+ rgw_fh->set_atime(real_clock::to_timespec(real_clock::now()));
+ *bytes_read = req.nread;
+ }
+
+ return rc;
+ }
+
+ /* Remove a bucket or leaf object named `name` under (or, with
+ * FLAG_UNLINK_THIS, identified by) rgw_fh.  Lock protocol: on entry
+ * with FLAG_UNLINK_THIS the handle is already locked and referenced by
+ * the caller; otherwise the target handle is looked up with FLAG_LOCK.
+ * On exit the target's lock and reference are released. */
+ int RGWLibFS::unlink(RGWFileHandle* rgw_fh, const char* name, uint32_t flags)
+ {
+ int rc = 0;
+ BucketStats bs;
+ RGWFileHandle* parent = nullptr;
+ RGWFileHandle* bkt_fh = nullptr;
+
+ if (unlikely(flags & RGWFileHandle::FLAG_UNLINK_THIS)) {
+ /* LOCKED */
+ parent = rgw_fh->get_parent();
+ } else {
+ /* atomicity */
+ parent = rgw_fh;
+ LookupFHResult fhr = lookup_fh(parent, name, RGWFileHandle::FLAG_LOCK);
+ rgw_fh = get<0>(fhr);
+ /* LOCKED */
+ }
+
+ if (parent->is_root()) {
+ /* a bucket may have an object storing Unix attributes, check
+ * for and delete it */
+ LookupFHResult fhr;
+ fhr = stat_bucket(parent, name, bs, (rgw_fh) ?
+ RGWFileHandle::FLAG_LOCKED :
+ RGWFileHandle::FLAG_NONE);
+ bkt_fh = get<0>(fhr);
+ if (unlikely(! bkt_fh)) {
+ /* implies !rgw_fh, so also !LOCKED */
+ return -ENOENT;
+ }
+
+ /* refuse to remove a non-empty bucket (1 entry allowed for the
+ * uxattrs marker object) */
+ if (bs.num_entries > 1) {
+ unref(bkt_fh); /* return stat_bucket ref */
+ if (likely(!! rgw_fh)) { /* return lock and ref from
+ * lookup_fh (or caller in the
+ * special case of
+ * RGWFileHandle::FLAG_UNLINK_THIS) */
+ rgw_fh->mtx.unlock();
+ unref(rgw_fh);
+ }
+ return -ENOTEMPTY;
+ } else {
+ /* delete object w/key "<bucket>/" (uxattrs), if any */
+ string oname{"/"};
+ RGWDeleteObjRequest req(cct, user->clone(), bkt_fh->bucket_name(), oname);
+ rc = g_rgwlib->get_fe()->execute_req(&req);
+ /* don't care if ENOENT */
+ unref(bkt_fh);
+ }
+
+ string bname{name};
+ RGWDeleteBucketRequest req(cct, user->clone(), bname);
+ rc = g_rgwlib->get_fe()->execute_req(&req);
+ if (! rc) {
+ rc = req.get_ret();
+ }
+ } else {
+ /*
+ * leaf object
+ */
+ if (! rgw_fh) {
+ /* XXX for now, peform a hard lookup to deduce the type of
+ * object to be deleted ("foo" vs. "foo/")--also, ensures
+ * atomicity at this endpoint */
+ struct rgw_file_handle *fh;
+ rc = rgw_lookup(get_fs(), parent->get_fh(), name, &fh,
+ nullptr /* st */, 0 /* mask */,
+ RGW_LOOKUP_FLAG_NONE);
+ if (!! rc)
+ return rc;
+
+ /* rgw_fh ref+ */
+ rgw_fh = get_rgwfh(fh);
+ rgw_fh->mtx.lock(); /* LOCKED */
+ }
+
+ std::string oname = rgw_fh->relative_object_name();
+ if (rgw_fh->is_dir()) {
+ /* for the duration of our cache timer, trust positive
+ * child cache */
+ if (rgw_fh->has_children()) {
+ rgw_fh->mtx.unlock();
+ unref(rgw_fh);
+ return(-ENOTEMPTY);
+ }
+ oname += "/";
+ }
+ RGWDeleteObjRequest req(cct, user->clone(), parent->bucket_name(), oname);
+ rc = g_rgwlib->get_fe()->execute_req(&req);
+ if (! rc) {
+ rc = req.get_ret();
+ }
+ }
+
+ /* ENOENT when raced with other s3 gateway */
+ if (! rc || rc == -ENOENT) {
+ /* evict the handle from the cache; it is gone upstream */
+ rgw_fh->flags |= RGWFileHandle::FLAG_DELETED;
+ fh_cache.remove(rgw_fh->fh.fh_hk.object, rgw_fh,
+ RGWFileHandle::FHCache::FLAG_LOCK);
+ }
+
+ if (! rc) {
+ /* successful removal changes the parent directory */
+ real_time t = real_clock::now();
+ parent->set_mtime(real_clock::to_timespec(t));
+ parent->set_ctime(real_clock::to_timespec(t));
+ }
+
+ rgw_fh->mtx.unlock();
+ unref(rgw_fh);
+
+ return rc;
+ } /* RGWLibFS::unlink */
+
+ /* Rename src_fh/_src_name to dst_fh/_dst_name as copy-then-delete.
+ * Directories and open files are refused with -EPERM.  The source
+ * handle is looked up with FLAG_LOCK; the lock/ref are released either
+ * by unlink() (step 1, FLAG_UNLINK_THIS) or at the unlock label. */
+ int RGWLibFS::rename(RGWFileHandle* src_fh, RGWFileHandle* dst_fh,
+ const char *_src_name, const char *_dst_name)
+
+ {
+ /* XXX initial implementation: try-copy, and delete if copy
+ * succeeds */
+ int rc = -EINVAL;
+ real_time t;
+
+ std::string src_name{_src_name};
+ std::string dst_name{_dst_name};
+
+ /* atomicity */
+ LookupFHResult fhr = lookup_fh(src_fh, _src_name, RGWFileHandle::FLAG_LOCK);
+ RGWFileHandle* rgw_fh = get<0>(fhr);
+
+ /* should not happen */
+ if (! rgw_fh) {
+ ldout(get_context(), 0) << __func__
+ << " BUG no such src renaming path="
+ << src_name
+ << dendl;
+ goto out;
+ }
+
+ /* forbid renaming of directories (unreasonable at scale) */
+ if (rgw_fh->is_dir()) {
+ ldout(get_context(), 12) << __func__
+ << " rejecting attempt to rename directory path="
+ << rgw_fh->full_object_name()
+ << dendl;
+ rc = -EPERM;
+ goto unlock;
+ }
+
+ /* forbid renaming open files (violates intent, for now) */
+ if (rgw_fh->is_open()) {
+ ldout(get_context(), 12) << __func__
+ << " rejecting attempt to rename open file path="
+ << rgw_fh->full_object_name()
+ << dendl;
+ rc = -EPERM;
+ goto unlock;
+ }
+
+ t = real_clock::now();
+
+ for (int ix : {0, 1}) {
+ switch (ix) {
+ case 0:
+ {
+ /* step 0: server-side copy src -> dst */
+ RGWCopyObjRequest req(cct, user->clone(), src_fh, dst_fh, src_name, dst_name);
+ /* NOTE(review): this inner rc shadows the function-scope rc;
+ * on copy failure we goto unlock with the outer rc still
+ * -EINVAL, so the specific error is logged but not returned --
+ * confirm whether propagating it is intended */
+ int rc = g_rgwlib->get_fe()->execute_req(&req);
+ if ((rc != 0) ||
+ ((rc = req.get_ret()) != 0)) {
+ ldout(get_context(), 1)
+ << __func__
+ << " rename step 0 failed src="
+ << src_fh->full_object_name() << " " << src_name
+ << " dst=" << dst_fh->full_object_name()
+ << " " << dst_name
+ << "rc " << rc
+ << dendl;
+ goto unlock;
+ }
+ ldout(get_context(), 12)
+ << __func__
+ << " rename step 0 success src="
+ << src_fh->full_object_name() << " " << src_name
+ << " dst=" << dst_fh->full_object_name()
+ << " " << dst_name
+ << " rc " << rc
+ << dendl;
+ /* update dst change id */
+ dst_fh->set_times(t);
+ }
+ break;
+ case 1:
+ {
+ /* step 1: remove the source; unlink consumes rgw_fh's lock
+ * and reference (FLAG_UNLINK_THIS) */
+ rc = this->unlink(rgw_fh /* LOCKED */, _src_name,
+ RGWFileHandle::FLAG_UNLINK_THIS);
+ /* !LOCKED, -ref */
+ if (! rc) {
+ ldout(get_context(), 12)
+ << __func__
+ << " rename step 1 success src="
+ << src_fh->full_object_name() << " " << src_name
+ << " dst=" << dst_fh->full_object_name()
+ << " " << dst_name
+ << " rc " << rc
+ << dendl;
+ /* update src change id */
+ src_fh->set_times(t);
+ } else {
+ ldout(get_context(), 1)
+ << __func__
+ << " rename step 1 failed src="
+ << src_fh->full_object_name() << " " << src_name
+ << " dst=" << dst_fh->full_object_name()
+ << " " << dst_name
+ << " rc " << rc
+ << dendl;
+ }
+ }
+ goto out;
+ default:
+ ceph_abort();
+ } /* switch */
+ } /* ix */
+ unlock:
+ rgw_fh->mtx.unlock(); /* !LOCKED */
+ unref(rgw_fh); /* -ref */
+
+ out:
+ return rc;
+ } /* RGWLibFS::rename */
+
+ /* Create a directory under `parent`: a bucket when parent is the
+ * root, otherwise a zero-length "name/" marker object.  Unix attrs
+ * are encoded and stored with the new entity.  Returns the new
+ * (unlocked) handle and 0, or {nullptr, -errno}. */
+ MkObjResult RGWLibFS::mkdir(RGWFileHandle* parent, const char *name,
+ struct stat *st, uint32_t mask, uint32_t flags)
+ {
+ int rc, rc2;
+ rgw_file_handle *lfh;
+
+ /* existence check first: any successful lookup means EEXIST */
+ rc = rgw_lookup(get_fs(), parent->get_fh(), name, &lfh,
+ nullptr /* st */, 0 /* mask */,
+ RGW_LOOKUP_FLAG_NONE);
+ if (! rc) {
+ /* conflict! */
+ rc = rgw_fh_rele(get_fs(), lfh, RGW_FH_RELE_FLAG_NONE);
+ // ignore return code
+ return MkObjResult{nullptr, -EEXIST};
+ }
+
+ MkObjResult mkr{nullptr, -EINVAL};
+ LookupFHResult fhr;
+ RGWFileHandle* rgw_fh = nullptr;
+ buffer::list ux_key, ux_attrs;
+
+ /* create the cache handle up front, locked */
+ fhr = lookup_fh(parent, name,
+ RGWFileHandle::FLAG_CREATE|
+ RGWFileHandle::FLAG_DIRECTORY|
+ RGWFileHandle::FLAG_LOCK);
+ rgw_fh = get<0>(fhr);
+ if (rgw_fh) {
+ rgw_fh->create_stat(st, mask);
+ rgw_fh->set_times(real_clock::now());
+ /* save attrs */
+ rgw_fh->encode_attrs(ux_key, ux_attrs);
+ if (st)
+ rgw_fh->stat(st, RGWFileHandle::FLAG_LOCKED);
+ get<0>(mkr) = rgw_fh;
+ } else {
+ get<1>(mkr) = -EIO;
+ return mkr;
+ }
+
+ if (parent->is_root()) {
+ /* bucket */
+ string bname{name};
+ /* enforce S3 name restrictions */
+ rc = valid_fs_bucket_name(bname);
+ if (rc != 0) {
+ /* roll back the speculative cache entry */
+ rgw_fh->flags |= RGWFileHandle::FLAG_DELETED;
+ fh_cache.remove(rgw_fh->fh.fh_hk.object, rgw_fh,
+ RGWFileHandle::FHCache::FLAG_LOCK);
+ rgw_fh->mtx.unlock();
+ unref(rgw_fh);
+ get<0>(mkr) = nullptr;
+ get<1>(mkr) = rc;
+ return mkr;
+ }
+
+ RGWCreateBucketRequest req(get_context(), user->clone(), bname);
+
+ /* save attrs */
+ req.emplace_attr(RGW_ATTR_UNIX_KEY1, std::move(ux_key));
+ req.emplace_attr(RGW_ATTR_UNIX1, std::move(ux_attrs));
+
+ rc = g_rgwlib->get_fe()->execute_req(&req);
+ rc2 = req.get_ret();
+ } else {
+ /* create an object representing the directory */
+ buffer::list bl;
+ string dir_name = parent->format_child_name(name, true);
+
+ /* need valid S3 name (characters, length <= 1024, etc) */
+ rc = valid_fs_object_name(dir_name);
+ if (rc != 0) {
+ /* roll back the speculative cache entry */
+ rgw_fh->flags |= RGWFileHandle::FLAG_DELETED;
+ fh_cache.remove(rgw_fh->fh.fh_hk.object, rgw_fh,
+ RGWFileHandle::FHCache::FLAG_LOCK);
+ rgw_fh->mtx.unlock();
+ unref(rgw_fh);
+ get<0>(mkr) = nullptr;
+ get<1>(mkr) = rc;
+ return mkr;
+ }
+
+ RGWPutObjRequest req(get_context(), user->clone(), parent->bucket_name(), dir_name, bl);
+
+ /* save attrs */
+ req.emplace_attr(RGW_ATTR_UNIX_KEY1, std::move(ux_key));
+ req.emplace_attr(RGW_ATTR_UNIX1, std::move(ux_attrs));
+
+ rc = g_rgwlib->get_fe()->execute_req(&req);
+ rc2 = req.get_ret();
+ }
+
+ if (! ((rc == 0) &&
+ (rc2 == 0))) {
+ /* op failed */
+ rgw_fh->flags |= RGWFileHandle::FLAG_DELETED;
+ rgw_fh->mtx.unlock(); /* !LOCKED */
+ unref(rgw_fh);
+ get<0>(mkr) = nullptr;
+ /* fixup rc */
+ if (!rc)
+ rc = rc2;
+ } else {
+ /* success changes the parent directory's times */
+ real_time t = real_clock::now();
+ parent->set_mtime(real_clock::to_timespec(t));
+ parent->set_ctime(real_clock::to_timespec(t));
+ rgw_fh->mtx.unlock(); /* !LOCKED */
+ }
+
+ get<1>(mkr) = rc;
+
+ return mkr;
+ } /* RGWLibFS::mkdir */
+
+ /* Create an empty regular-file object `name` under `parent`.  Fails
+ * with -EEXIST when a lookup already resolves the name.  On success
+ * returns the new (unlocked) handle and 0; otherwise {nullptr, -errno}. */
+ MkObjResult RGWLibFS::create(RGWFileHandle* parent, const char *name,
+ struct stat *st, uint32_t mask, uint32_t flags)
+ {
+ int rc, rc2;
+
+ using std::get;
+
+ rgw_file_handle *lfh;
+ rc = rgw_lookup(get_fs(), parent->get_fh(), name, &lfh,
+ nullptr /* st */, 0 /* mask */,
+ RGW_LOOKUP_FLAG_NONE);
+ if (! rc) {
+ /* conflict! */
+ rc = rgw_fh_rele(get_fs(), lfh, RGW_FH_RELE_FLAG_NONE);
+ // ignore return code
+ return MkObjResult{nullptr, -EEXIST};
+ }
+
+ /* expand and check name */
+ std::string obj_name = parent->format_child_name(name, false);
+ rc = valid_fs_object_name(obj_name);
+ if (rc != 0) {
+ return MkObjResult{nullptr, rc};
+ }
+
+ /* create it */
+ buffer::list bl;
+ RGWPutObjRequest req(cct, user->clone(), parent->bucket_name(), obj_name, bl);
+ MkObjResult mkr{nullptr, -EINVAL};
+
+ rc = g_rgwlib->get_fe()->execute_req(&req);
+ rc2 = req.get_ret();
+
+ if ((rc == 0) &&
+ (rc2 == 0)) {
+ /* XXX atomicity */
+ LookupFHResult fhr = lookup_fh(parent, name, RGWFileHandle::FLAG_CREATE |
+ RGWFileHandle::FLAG_LOCK);
+ RGWFileHandle* rgw_fh = get<0>(fhr);
+ if (rgw_fh) {
+ if (get<1>(fhr) & RGWFileHandle::FLAG_CREATE) {
+ /* fill in stat data */
+ real_time t = real_clock::now();
+ rgw_fh->create_stat(st, mask);
+ rgw_fh->set_times(t);
+
+ parent->set_mtime(real_clock::to_timespec(t));
+ parent->set_ctime(real_clock::to_timespec(t));
+ }
+ if (st)
+ (void) rgw_fh->stat(st, RGWFileHandle::FLAG_LOCKED);
+
+ /* cache etag/acl returned by the put */
+ rgw_fh->set_etag(*(req.get_attr(RGW_ATTR_ETAG)));
+ rgw_fh->set_acls(*(req.get_attr(RGW_ATTR_ACL)));
+
+ get<0>(mkr) = rgw_fh;
+ rgw_fh->file_ondisk_version = 0; // initial version
+ rgw_fh->mtx.unlock();
+ } else
+ rc = -EIO;
+ }
+
+ get<1>(mkr) = rc;
+
+ /* a failing op status (e.g. quota exceeded) also fails the create */
+ if(rc2 < 0)
+ get<1>(mkr) = rc2;
+
+ return mkr;
+ } /* RGWLibFS::create */
+
+ /* Create a symbolic link `name` under `parent` whose object body holds
+ * link_path.  Fails with -EEXIST when the name already resolves.  On
+ * success returns the new (unlocked) handle and 0; otherwise
+ * {nullptr, -errno}. */
+ MkObjResult RGWLibFS::symlink(RGWFileHandle* parent, const char *name,
+ const char* link_path, struct stat *st, uint32_t mask, uint32_t flags)
+ {
+ int rc, rc2;
+
+ using std::get;
+
+ rgw_file_handle *lfh;
+ rc = rgw_lookup(get_fs(), parent->get_fh(), name, &lfh,
+ nullptr /* st */, 0 /* mask */,
+ RGW_LOOKUP_FLAG_NONE);
+ if (! rc) {
+ /* conflict! */
+ rc = rgw_fh_rele(get_fs(), lfh, RGW_FH_RELE_FLAG_NONE);
+ // ignore return code
+ return MkObjResult{nullptr, -EEXIST};
+ }
+
+ MkObjResult mkr{nullptr, -EINVAL};
+ LookupFHResult fhr;
+ RGWFileHandle* rgw_fh = nullptr;
+ buffer::list ux_key, ux_attrs;
+
+ /* create the cache handle up front, locked */
+ fhr = lookup_fh(parent, name,
+ RGWFileHandle::FLAG_CREATE|
+ RGWFileHandle::FLAG_SYMBOLIC_LINK|
+ RGWFileHandle::FLAG_LOCK);
+ rgw_fh = get<0>(fhr);
+ if (rgw_fh) {
+ rgw_fh->create_stat(st, mask);
+ rgw_fh->set_times(real_clock::now());
+ /* save attrs */
+ rgw_fh->encode_attrs(ux_key, ux_attrs);
+ if (st)
+ /* NOTE(review): mkdir passes RGWFileHandle::FLAG_LOCKED here
+ * since the handle mutex is held; verify calling stat(st)
+ * without it is intentional */
+ rgw_fh->stat(st);
+ get<0>(mkr) = rgw_fh;
+ } else {
+ get<1>(mkr) = -EIO;
+ return mkr;
+ }
+
+ /* need valid S3 name (characters, length <= 1024, etc) */
+ rc = valid_fs_object_name(name);
+ if (rc != 0) {
+ /* roll back the speculative cache entry */
+ rgw_fh->flags |= RGWFileHandle::FLAG_DELETED;
+ fh_cache.remove(rgw_fh->fh.fh_hk.object, rgw_fh,
+ RGWFileHandle::FHCache::FLAG_LOCK);
+ rgw_fh->mtx.unlock();
+ unref(rgw_fh);
+ get<0>(mkr) = nullptr;
+ get<1>(mkr) = rc;
+ return mkr;
+ }
+
+ string obj_name = std::string(name);
+ /* create an object whose body stores the symlink target */
+ buffer::list bl;
+
+ /* XXXX */
+#if 0
+ bl.push_back(
+ buffer::create_static(len, static_cast<char*>(buffer)));
+#else
+
+ bl.push_back(
+ buffer::copy(link_path, strlen(link_path)));
+#endif
+
+ RGWPutObjRequest req(get_context(), user->clone(), parent->bucket_name(), obj_name, bl);
+
+ /* save attrs */
+ req.emplace_attr(RGW_ATTR_UNIX_KEY1, std::move(ux_key));
+ req.emplace_attr(RGW_ATTR_UNIX1, std::move(ux_attrs));
+
+ rc = g_rgwlib->get_fe()->execute_req(&req);
+ rc2 = req.get_ret();
+ if (! ((rc == 0) &&
+ (rc2 == 0))) {
+ /* op failed */
+ rgw_fh->flags |= RGWFileHandle::FLAG_DELETED;
+ rgw_fh->mtx.unlock(); /* !LOCKED */
+ unref(rgw_fh);
+ get<0>(mkr) = nullptr;
+ /* fixup rc */
+ if (!rc)
+ rc = rc2;
+ } else {
+ /* success changes the parent directory's times */
+ real_time t = real_clock::now();
+ parent->set_mtime(real_clock::to_timespec(t));
+ parent->set_ctime(real_clock::to_timespec(t));
+ rgw_fh->mtx.unlock(); /* !LOCKED */
+ }
+
+ get<1>(mkr) = rc;
+
+ return mkr;
+ } /* RGWLibFS::symlink */
+
+ /* Fill *st from the handle's cached attributes.  A regular file whose
+ * handle was marked deleted is refused with -ESTALE. */
+ int RGWLibFS::getattr(RGWFileHandle* rgw_fh, struct stat* st)
+ {
+ if ((rgw_fh->fh.fh_type == RGW_FS_TYPE_FILE) &&
+ rgw_fh->deleted()) {
+ return -ESTALE;
+ }
+ /* if rgw_fh is a directory, mtime will be advanced */
+ return rgw_fh->stat(st);
+ } /* RGWLibFS::getattr */
+
+ /* Apply stat fields selected by `mask` to the object backing rgw_fh,
+ * re-encoding and storing the Unix attrs (plus preserved etag/acl).
+ * If the backing object is missing (-ENOENT), a placeholder object is
+ * materialized carrying the attrs.  Holds the handle mutex throughout. */
+ int RGWLibFS::setattr(RGWFileHandle* rgw_fh, struct stat* st, uint32_t mask,
+ uint32_t flags)
+ {
+ int rc, rc2;
+ buffer::list ux_key, ux_attrs;
+ buffer::list etag = rgw_fh->get_etag();
+ buffer::list acls = rgw_fh->get_acls();
+
+ lock_guard guard(rgw_fh->mtx);
+
+ switch(rgw_fh->fh.fh_type) {
+ case RGW_FS_TYPE_FILE:
+ {
+ if (rgw_fh->deleted())
+ return -ESTALE;
+ }
+ break;
+ default:
+ break;
+ };
+
+ string obj_name{rgw_fh->relative_object_name()};
+
+ /* non-bucket directories are stored as "name/" marker objects */
+ if (rgw_fh->is_dir() &&
+ (likely(! rgw_fh->is_bucket()))) {
+ obj_name += "/";
+ }
+
+ RGWSetAttrsRequest req(cct, user->clone(), rgw_fh->bucket_name(), obj_name);
+
+ rgw_fh->create_stat(st, mask);
+ rgw_fh->encode_attrs(ux_key, ux_attrs);
+
+ /* save attrs */
+ req.emplace_attr(RGW_ATTR_UNIX_KEY1, std::move(ux_key));
+ req.emplace_attr(RGW_ATTR_UNIX1, std::move(ux_attrs));
+ req.emplace_attr(RGW_ATTR_ETAG, std::move(etag));
+ req.emplace_attr(RGW_ATTR_ACL, std::move(acls));
+
+ rc = g_rgwlib->get_fe()->execute_req(&req);
+ rc2 = req.get_ret();
+
+ if (rc == -ENOENT) {
+ /* special case: materialize placeholder dir */
+ buffer::list bl;
+ RGWPutObjRequest req(get_context(), user->clone(), rgw_fh->bucket_name(), obj_name, bl);
+
+ rgw_fh->encode_attrs(ux_key, ux_attrs); /* because std::moved */
+
+ /* save attrs */
+ req.emplace_attr(RGW_ATTR_UNIX_KEY1, std::move(ux_key));
+ req.emplace_attr(RGW_ATTR_UNIX1, std::move(ux_attrs));
+
+ rc = g_rgwlib->get_fe()->execute_req(&req);
+ rc2 = req.get_ret();
+ }
+
+ if ((rc != 0) || (rc2 != 0)) {
+ return -EIO;
+ }
+
+ rgw_fh->set_ctime(real_clock::to_timespec(real_clock::now()));
+
+ return 0;
+ } /* RGWLibFS::setattr */
+
+ /* Build the stored xattr key by prepending RGW_ATTR_META_PREFIX to the
+ * user-visible name (single reservation, appended in place). */
+ static inline std::string prefix_xattr_keystr(const rgw_xattrstr& key) {
+ std::string keystr;
+ keystr.reserve(sizeof(RGW_ATTR_META_PREFIX) + key.len);
+ keystr.append(RGW_ATTR_META_PREFIX);
+ keystr.append(key.val, key.len);
+ return keystr;
+ }
+
+ /* Strip the user-meta prefix from a stored xattr key.  Returns an
+ * empty view when the key does not contain the prefix; when the prefix
+ * occurs but not at position 0 the key is returned unchanged.  The
+ * returned view aliases key's storage -- key must outlive it. */
+ static inline std::string_view unprefix_xattr_keystr(const std::string& key)
+ {
+ std::string_view svk{key};
+ auto pos = svk.find(RGW_ATTR_META_PREFIX);
+ if (pos == std::string_view::npos) {
+ return std::string_view{""};
+ } else if (pos == 0) {
+ /* -1: sizeof includes the NUL terminator */
+ svk.remove_prefix(sizeof(RGW_ATTR_META_PREFIX)-1);
+ }
+ return svk;
+ }
+
+ /* Fetch the requested xattrs of rgw_fh's backing object and deliver
+ * each present value through cb (one rgw_xattrlist of length 1 per
+ * attr).  Exposed attrs (e.g. etag) pass through under their rgw
+ * names; all others are stored/looked up with the user-meta prefix.
+ * Not supported on buckets or the root.  Returns 0 or -EIO/-EINVAL. */
+ int RGWLibFS::getxattrs(RGWFileHandle* rgw_fh, rgw_xattrlist *attrs,
+ rgw_getxattr_cb cb, void *cb_arg,
+ uint32_t flags)
+ {
+ /* cannot store on fs_root, should not on buckets? */
+ if ((rgw_fh->is_bucket()) ||
+ (rgw_fh->is_root())) {
+ return -EINVAL;
+ }
+
+ int rc, rc2, rc3;
+ string obj_name{rgw_fh->relative_object_name2()};
+
+ RGWGetAttrsRequest req(cct, user->clone(), rgw_fh->bucket_name(), obj_name);
+
+ for (uint32_t ix = 0; ix < attrs->xattr_cnt; ++ix) {
+ auto& xattr = attrs->xattrs[ix];
+
+ /* pass exposed attr keys as given, else prefix */
+ std::string k = is_exposed_attr(xattr.key)
+ ? std::string{xattr.key.val, xattr.key.len}
+ : prefix_xattr_keystr(xattr.key);
+
+ req.emplace_key(std::move(k));
+ }
+
+ /* debug: dump the key set being requested */
+ if (ldlog_p1(get_context(), ceph_subsys_rgw, 15)) {
+ lsubdout(get_context(), rgw, 15)
+ << __func__
+ << " get keys for: "
+ << rgw_fh->object_name()
+ << " keys:"
+ << dendl;
+ for (const auto& attr: req.get_attrs()) {
+ lsubdout(get_context(), rgw, 15)
+ << "\tkey: " << attr.first << dendl;
+ }
+ }
+
+ rc = g_rgwlib->get_fe()->execute_req(&req);
+ rc2 = req.get_ret();
+ rc3 = ((rc == 0) && (rc2 == 0)) ? 0 : -EIO;
+
+ /* call back w/xattr data */
+ if (rc3 == 0) {
+ const auto& attrs = req.get_attrs();
+ for (const auto& attr : attrs) {
+
+ /* absent optional => attr not present on the object */
+ if (!attr.second.has_value())
+ continue;
+
+ const auto& k = attr.first;
+ const auto& v = attr.second.value();
+
+ /* return exposed attr keys as given, else unprefix --
+ * yes, we could have memoized the exposed check, but
+ * to be efficient it would need to be saved with
+ * RGWGetAttrs::attrs, I think */
+ std::string_view svk =
+ is_exposed_attr(rgw_xattrstr{const_cast<char*>(k.c_str()),
+ uint32_t(k.length())})
+ ? k
+ : unprefix_xattr_keystr(k);
+
+ /* skip entries not matching prefix */
+ if (svk.empty())
+ continue;
+
+ rgw_xattrstr xattr_k = { const_cast<char*>(svk.data()),
+ uint32_t(svk.length())};
+ rgw_xattrstr xattr_v =
+ {const_cast<char*>(const_cast<buffer::list&>(v).c_str()),
+ uint32_t(v.length())};
+ rgw_xattr xattr = { xattr_k, xattr_v };
+ rgw_xattrlist xattrlist = { &xattr, 1 };
+
+ cb(&xattrlist, cb_arg, RGW_GETXATTR_FLAG_NONE);
+ }
+ }
+
+ return rc3;
+ } /* RGWLibFS::getxattrs */
+
+ /* Enumerate xattr names of rgw_fh's backing object, delivering each
+ * (value-less) name through cb; cb may return RGW_LSXATTR_FLAG_STOP to
+ * end the enumeration.  Exposed attrs keep their rgw names, user-meta
+ * keys are unprefixed.  Not supported on buckets or the root. */
+ int RGWLibFS::lsxattrs(
+ RGWFileHandle* rgw_fh, rgw_xattrstr *filter_prefix, rgw_getxattr_cb cb,
+ void *cb_arg, uint32_t flags)
+ {
+ /* cannot store on fs_root, should not on buckets? */
+ if ((rgw_fh->is_bucket()) ||
+ (rgw_fh->is_root())) {
+ return -EINVAL;
+ }
+
+ int rc, rc2, rc3;
+ string obj_name{rgw_fh->relative_object_name2()};
+
+ RGWGetAttrsRequest req(cct, user->clone(), rgw_fh->bucket_name(), obj_name);
+
+ rc = g_rgwlib->get_fe()->execute_req(&req);
+ rc2 = req.get_ret();
+ rc3 = ((rc == 0) && (rc2 == 0)) ? 0 : -EIO;
+
+ /* call back w/xattr data--check for eof */
+ if (rc3 == 0) {
+ const auto& keys = req.get_attrs();
+ for (const auto& k : keys) {
+
+ /* return exposed attr keys as given, else unprefix */
+ std::string_view svk =
+ is_exposed_attr(rgw_xattrstr{const_cast<char*>(k.first.c_str()),
+ uint32_t(k.first.length())})
+ ? k.first
+ : unprefix_xattr_keystr(k.first);
+
+ /* skip entries not matching prefix */
+ if (svk.empty())
+ continue;
+
+ /* names only: the value field is deliberately empty */
+ rgw_xattrstr xattr_k = { const_cast<char*>(svk.data()),
+ uint32_t(svk.length())};
+ rgw_xattrstr xattr_v = { nullptr, 0 };
+ rgw_xattr xattr = { xattr_k, xattr_v };
+ rgw_xattrlist xattrlist = { &xattr, 1 };
+
+ auto cbr = cb(&xattrlist, cb_arg, RGW_LSXATTR_FLAG_NONE);
+ if (cbr & RGW_LSXATTR_FLAG_STOP)
+ break;
+ }
+ }
+
+ return rc3;
+ } /* RGWLibFS::lsxattrs */
+
+ /* Store the given xattrs on rgw_fh's backing object under the
+ * user-meta prefix.  Empty keys and keys colliding with exposed attrs
+ * are silently skipped; an all-skipped request returns -EINVAL.  Not
+ * supported on buckets or the root. */
+ int RGWLibFS::setxattrs(RGWFileHandle* rgw_fh, rgw_xattrlist *attrs,
+ uint32_t flags)
+ {
+ /* cannot store on fs_root, should not on buckets? */
+ if ((rgw_fh->is_bucket()) ||
+ (rgw_fh->is_root())) {
+ return -EINVAL;
+ }
+
+ int rc, rc2;
+ string obj_name{rgw_fh->relative_object_name2()};
+
+ RGWSetAttrsRequest req(cct, user->clone(), rgw_fh->bucket_name(), obj_name);
+
+ for (uint32_t ix = 0; ix < attrs->xattr_cnt; ++ix) {
+ auto& xattr = attrs->xattrs[ix];
+ buffer::list attr_bl;
+ /* don't allow storing at RGW_ATTR_META_PREFIX */
+ if (! (xattr.key.len > 0))
+ continue;
+
+ /* reject lexical match with any exposed attr */
+ if (is_exposed_attr(xattr.key))
+ continue;
+
+ string k = prefix_xattr_keystr(xattr.key);
+ attr_bl.append(xattr.val.val, xattr.val.len);
+ req.emplace_attr(k.c_str(), std::move(attr_bl));
+ }
+
+ /* don't send null requests */
+ if (! (req.get_attrs().size() > 0)) {
+ return -EINVAL;
+ }
+
+ rc = g_rgwlib->get_fe()->execute_req(&req);
+ rc2 = req.get_ret();
+
+ return (((rc == 0) && (rc2 == 0)) ? 0 : -EIO);
+
+ } /* RGWLibFS::setxattrs */
+
+ /* Remove the named xattrs (stored under the user-meta prefix) from
+ * rgw_fh's backing object.  Empty keys are skipped; an all-skipped
+ * request returns -EINVAL.  Not supported on buckets or the root. */
+ int RGWLibFS::rmxattrs(RGWFileHandle* rgw_fh, rgw_xattrlist* attrs,
+ uint32_t flags)
+ {
+ /* cannot store on fs_root, should not on buckets? */
+ if ((rgw_fh->is_bucket()) ||
+ (rgw_fh->is_root())) {
+ return -EINVAL;
+ }
+
+ int rc, rc2;
+ string obj_name{rgw_fh->relative_object_name2()};
+
+ RGWRMAttrsRequest req(cct, user->clone(), rgw_fh->bucket_name(), obj_name);
+
+ for (uint32_t ix = 0; ix < attrs->xattr_cnt; ++ix) {
+ auto& xattr = attrs->xattrs[ix];
+ /* don't allow storing at RGW_ATTR_META_PREFIX */
+ if (! (xattr.key.len > 0)) {
+ continue;
+ }
+ string k = prefix_xattr_keystr(xattr.key);
+ req.emplace_key(std::move(k));
+ }
+
+ /* don't send null requests */
+ if (! (req.get_attrs().size() > 0)) {
+ return -EINVAL;
+ }
+
+ rc = g_rgwlib->get_fe()->execute_req(&req);
+ rc2 = req.get_ret();
+
+ return (((rc == 0) && (rc2 == 0)) ? 0 : -EIO);
+
+ } /* RGWLibFS::rmxattrs */
+
+ /* called with rgw_fh->mtx held */
+ /* Re-encode the handle's Unix attrs and push them to the backing
+ * object (used after decode_attrs reports stale/old-format attrs).
+ * Failures are logged and otherwise ignored. */
+ void RGWLibFS::update_fh(RGWFileHandle *rgw_fh)
+ {
+ int rc, rc2;
+ string obj_name{rgw_fh->relative_object_name()};
+ buffer::list ux_key, ux_attrs;
+
+ /* non-bucket directories are stored as "name/" marker objects */
+ if (rgw_fh->is_dir() &&
+ (likely(! rgw_fh->is_bucket()))) {
+ obj_name += "/";
+ }
+
+ lsubdout(get_context(), rgw, 17)
+ << __func__
+ << " update old versioned fh : " << obj_name
+ << dendl;
+
+ RGWSetAttrsRequest req(cct, user->clone(), rgw_fh->bucket_name(), obj_name);
+
+ rgw_fh->encode_attrs(ux_key, ux_attrs, false);
+
+ req.emplace_attr(RGW_ATTR_UNIX_KEY1, std::move(ux_key));
+ req.emplace_attr(RGW_ATTR_UNIX1, std::move(ux_attrs));
+
+ rc = g_rgwlib->get_fe()->execute_req(&req);
+ rc2 = req.get_ret();
+
+ if ((rc != 0) || (rc2 != 0)) {
+ lsubdout(get_context(), rgw, 17)
+ << __func__
+ << " update fh failed : " << obj_name
+ << dendl;
+ }
+ } /* RGWLibFS::update_fh */
+
+ /* Shut down this filesystem instance: mark it closed, drain the
+ * handle cache (dropping each cached handle's reference), unregister
+ * from the frontend process, and release the instance's own ref. */
+ void RGWLibFS::close()
+ {
+ state.flags |= FLAG_CLOSED;
+
+ /* drain functor: logs and unrefs each evicted handle */
+ class ObjUnref
+ {
+ RGWLibFS* fs;
+ public:
+ explicit ObjUnref(RGWLibFS* _fs) : fs(_fs) {}
+ void operator()(RGWFileHandle* fh) const {
+ lsubdout(fs->get_context(), rgw, 5)
+ << __PRETTY_FUNCTION__
+ << fh->name
+ << " before ObjUnref refs=" << fh->get_refcnt()
+ << dendl;
+ fs->unref(fh);
+ }
+ };
+
+ /* force cache drain, forces objects to evict */
+ fh_cache.drain(ObjUnref(this),
+ RGWFileHandle::FHCache::FLAG_LOCK);
+ g_rgwlib->get_fe()->get_process()->unregister_fs(this);
+ rele();
+ } /* RGWLibFS::close */
+
+ /* debug formatter for fh_key */
+ inline std::ostream& operator<<(std::ostream &os, fh_key const &fhk) {
+ return os << "<fh_key: bucket=" << fhk.fh_hk.bucket
+ << "; object=" << fhk.fh_hk.object << ">";
+ }
+
+ /* debug formatter for struct timespec */
+ inline std::ostream& operator<<(std::ostream &os, struct timespec const &ts) {
+ return os << "<timespec: tv_sec=" << ts.tv_sec
+ << "; tv_nsec=" << ts.tv_nsec << ">";
+ }
+
+ std::ostream& operator<<(std::ostream &os, RGWLibFS::event const &ev) {
+ os << "<event:";
+ switch (ev.t) {
+ case RGWLibFS::event::type::READDIR:
+ os << "type=READDIR;";
+ break;
+ default:
+ os << "type=UNKNOWN;";
+ break;
+ };
+ os << "fid=" << ev.fhk.fh_hk.bucket << ":" << ev.fhk.fh_hk.object
+ << ";ts=" << ev.ts << ">";
+ return os;
+ }
+
+ void RGWLibFS::gc()
+ {
+ using std::get;
+ using directory = RGWFileHandle::directory;
+
+ /* dirent invalidate timeout--basically, the upper-bound on
+ * inconsistency with the S3 namespace */
+ auto expire_s
+ = get_context()->_conf->rgw_nfs_namespace_expire_secs;
+
+ /* max events to gc in one cycle */
+ uint32_t max_ev = get_context()->_conf->rgw_nfs_max_gc;
+
+ struct timespec now, expire_ts;
+ event_vector ve;
+ bool stop = false;
+ std::deque<event> &events = state.events;
+
+ do {
+ (void) clock_gettime(CLOCK_MONOTONIC_COARSE, &now);
+ lsubdout(get_context(), rgw, 15)
+ << "GC: top of expire loop"
+ << " now=" << now
+ << " expire_s=" << expire_s
+ << dendl;
+ {
+ lock_guard guard(state.mtx); /* LOCKED */
+ lsubdout(get_context(), rgw, 15)
+ << "GC: processing"
+ << " count=" << events.size()
+ << " events"
+ << dendl;
+ /* just return if no events */
+ if (events.empty()) {
+ return;
+ }
+ uint32_t _max_ev =
+ (events.size() < 500) ? max_ev : (events.size() / 4);
+ for (uint32_t ix = 0; (ix < _max_ev) && (events.size() > 0); ++ix) {
+ event& ev = events.front();
+ expire_ts = ev.ts;
+ expire_ts.tv_sec += expire_s;
+ if (expire_ts > now) {
+ stop = true;
+ break;
+ }
+ ve.push_back(ev);
+ events.pop_front();
+ }
+ } /* anon */
+ /* !LOCKED */
+ for (auto& ev : ve) {
+ lsubdout(get_context(), rgw, 15)
+ << "try-expire ev: " << ev << dendl;
+ if (likely(ev.t == event::type::READDIR)) {
+ RGWFileHandle* rgw_fh = lookup_handle(ev.fhk.fh_hk);
+ lsubdout(get_context(), rgw, 15)
+ << "ev rgw_fh: " << rgw_fh << dendl;
+ if (rgw_fh) {
+ RGWFileHandle::directory* d;
+ if (unlikely(! rgw_fh->is_dir())) {
+ lsubdout(get_context(), rgw, 0)
+ << __func__
+ << " BUG non-directory found with READDIR event "
+ << "(" << rgw_fh->bucket_name() << ","
+ << rgw_fh->object_name() << ")"
+ << dendl;
+ goto rele;
+ }
+ /* maybe clear state */
+ d = get<directory>(&rgw_fh->variant_type);
+ if (d) {
+ struct timespec ev_ts = ev.ts;
+ lock_guard guard(rgw_fh->mtx);
+ struct timespec d_last_readdir = d->last_readdir;
+ if (unlikely(ev_ts < d_last_readdir)) {
+ /* readdir cycle in progress, don't invalidate */
+ lsubdout(get_context(), rgw, 15)
+ << "GC: delay expiration for "
+ << rgw_fh->object_name()
+ << " ev.ts=" << ev_ts
+ << " last_readdir=" << d_last_readdir
+ << dendl;
+ continue;
+ } else {
+ lsubdout(get_context(), rgw, 15)
+ << "GC: expiring "
+ << rgw_fh->object_name()
+ << dendl;
+ rgw_fh->clear_state();
+ rgw_fh->invalidate();
+ }
+ }
+ rele:
+ unref(rgw_fh);
+ } /* rgw_fh */
+ } /* event::type::READDIR */
+ } /* ev */
+ ve.clear();
+ } while (! (stop || shutdown));
+ } /* RGWLibFS::gc */
+
+ std::ostream& operator<<(std::ostream &os,
+ RGWFileHandle const &rgw_fh)
+ {
+ const auto& fhk = rgw_fh.get_key();
+ const auto& fh = const_cast<RGWFileHandle&>(rgw_fh).get_fh();
+ os << "<RGWFileHandle:";
+ os << "addr=" << &rgw_fh << ";";
+ switch (fh->fh_type) {
+ case RGW_FS_TYPE_DIRECTORY:
+ os << "type=DIRECTORY;";
+ break;
+ case RGW_FS_TYPE_FILE:
+ os << "type=FILE;";
+ break;
+ default:
+ os << "type=UNKNOWN;";
+ break;
+ };
+ os << "fid=" << fhk.fh_hk.bucket << ":" << fhk.fh_hk.object << ";";
+ os << "name=" << rgw_fh.object_name() << ";";
+ os << "refcnt=" << rgw_fh.get_refcnt() << ";";
+ os << ">";
+ return os;
+ }
+
/* Destructor: detach from the handle cache (if still linked) and
 * drop the reference this handle holds on its parent. */
RGWFileHandle::~RGWFileHandle() {
  /* !recycle case, handle may STILL be in handle table, BUT
   * the partition lock is not held in this path */
  if (fh_hook.is_linked()) {
    fs->fh_cache.remove(fh.fh_hk.object, this, FHCache::FLAG_LOCK);
  }
  /* cond-unref parent */
  if (parent && (! parent->is_mount())) {
    /* safe because if parent->unref causes its deletion,
     * there are a) by refcnt, no other objects/paths pointing
     * to it and b) by the semantics of valid iteration of
     * fh_lru (observed, e.g., by cohort_lru<T,...>::drain())
     * no unsafe iterators reaching it either--n.b., this constraint
     * is binding on code which may in future attempt to e.g.,
     * cause the eviction of objects in LRU order */
    (void) get_fs()->unref(parent);
  }
}
+
+ fh_key RGWFileHandle::make_fhk(const std::string& name)
+ {
+ std::string tenant = get_fs()->get_user()->user_id.to_str();
+ if (depth == 0) {
+ /* S3 bucket -- assert mount-at-bucket case reaches here */
+ return fh_key(name, name, tenant);
+ } else {
+ std::string key_name = make_key_name(name.c_str());
+ return fh_key(fhk.fh_hk.bucket, key_name.c_str(), tenant);
+ }
+ }
+
/* Serialize this handle's key into ux_key1 and its unix attributes
 * into ux_attrs1.  For files/symlinks, maintains file_ondisk_version:
 * a negative value is normalized to 0 before encoding, and the counter
 * is bumped after encoding iff inc_ov, so the encoded blob carries the
 * pre-increment value. */
void RGWFileHandle::encode_attrs(ceph::buffer::list& ux_key1,
                                 ceph::buffer::list& ux_attrs1,
                                 bool inc_ov)
{
  using ceph::encode;
  fh_key fhk(this->fh.fh_hk);
  encode(fhk, ux_key1);
  /* only regular files and symlinks carry an on-disk version */
  bool need_ondisk_version =
    (fh.fh_type == RGW_FS_TYPE_FILE ||
     fh.fh_type == RGW_FS_TYPE_SYMBOLIC_LINK);
  if (need_ondisk_version &&
      file_ondisk_version < 0) {
    /* normalize unset (-1) version before it is encoded */
    file_ondisk_version = 0;
  }
  encode(*this, ux_attrs1);
  if (need_ondisk_version && inc_ov) {
    /* bump after encoding: the stored blob has the old value */
    file_ondisk_version++;
  }
} /* RGWFileHandle::encode_attrs */
+
/* Deserialize key and unix attributes previously written by
 * encode_attrs().  Returns a pair of flags: <0> key decoded,
 * <1> attrs are an old (version < 2) format that should be
 * re-written by the caller.  For files/symlinks, attributes are
 * applied only when the decoded file_ondisk_version is newer than
 * the in-memory one (last-writer-wins across gateways). */
DecodeAttrsResult RGWFileHandle::decode_attrs(const ceph::buffer::list* ux_key1,
                                              const ceph::buffer::list* ux_attrs1)
{
  using ceph::decode;
  DecodeAttrsResult dar { false, false };
  fh_key fhk;
  auto bl_iter_key1 = ux_key1->cbegin();
  decode(fhk, bl_iter_key1);
  get<0>(dar) = true;

  // decode to a temporary file handle which may not be
  // copied to the current file handle if its file_ondisk_version
  // is not newer
  RGWFileHandle tmp_fh(fs);
  tmp_fh.fh.fh_type = fh.fh_type;
  auto bl_iter_unix1 = ux_attrs1->cbegin();
  decode(tmp_fh, bl_iter_unix1);

  fh.fh_type = tmp_fh.fh.fh_type;
  // for file handles that represent files and whose file_ondisk_version
  // is newer, no updates are need, otherwise, go updating the current
  // file handle
  if (!((fh.fh_type == RGW_FS_TYPE_FILE ||
         fh.fh_type == RGW_FS_TYPE_SYMBOLIC_LINK) &&
        file_ondisk_version >= tmp_fh.file_ondisk_version)) {
    // make sure the following "encode" always encode a greater version
    file_ondisk_version = tmp_fh.file_ondisk_version + 1;
    state.dev = tmp_fh.state.dev;
    state.size = tmp_fh.state.size;
    state.nlink = tmp_fh.state.nlink;
    state.owner_uid = tmp_fh.state.owner_uid;
    state.owner_gid = tmp_fh.state.owner_gid;
    state.unix_mode = tmp_fh.state.unix_mode;
    state.ctime = tmp_fh.state.ctime;
    state.mtime = tmp_fh.state.mtime;
    state.atime = tmp_fh.state.atime;
    state.version = tmp_fh.state.version;
  }

  /* pre-v2 attribute blobs should be refreshed by the caller */
  if (this->state.version < 2) {
    get<1>(dar) = true;
  }

  return dar;
} /* RGWFileHandle::decode_attrs */
+
/* LRU reclaim hook: decide whether this handle may be recycled to
 * back a new object being created by `newobj_fac`.  Returns true iff
 * the factory targets the same cache partition, after unlinking this
 * handle from the cache. */
bool RGWFileHandle::reclaim(const cohort::lru::ObjectFactory* newobj_fac) {
  lsubdout(fs->get_context(), rgw, 17)
    << __func__ << " " << *this
    << dendl;
  auto factory = dynamic_cast<const RGWFileHandle::Factory*>(newobj_fac);
  if (factory == nullptr) {
    return false;
  }
  /* make sure the reclaiming object is in the same partition as the
   * new-object factory; only then can we recycle this object and
   * replace it with the new object */
  if (!fs->fh_cache.is_same_partition(factory->fhk.fh_hk.object, fh.fh_hk.object)) {
    return false;
  }
  /* in the non-delete case, handle may still be in handle table */
  if (fh_hook.is_linked()) {
    /* in this case, we are being called from a context which holds
     * the partition lock */
    fs->fh_cache.remove(fh.fh_hk.object, this, FHCache::FLAG_NONE);
  }
  return true;
} /* RGWFileHandle::reclaim */
+
+ bool RGWFileHandle::has_children() const
+ {
+ if (unlikely(! is_dir()))
+ return false;
+
+ RGWRMdirCheck req(fs->get_context(),
+ g_rgwlib->get_driver()->get_user(fs->get_user()->user_id),
+ this);
+ int rc = g_rgwlib->get_fe()->execute_req(&req);
+ if (! rc) {
+ return req.valid && req.has_children;
+ }
+
+ return false;
+ }
+
+ std::ostream& operator<<(std::ostream &os,
+ RGWFileHandle::readdir_offset const &offset)
+ {
+ using boost::get;
+ if (unlikely(!! get<uint64_t*>(&offset))) {
+ uint64_t* ioff = get<uint64_t*>(offset);
+ os << *ioff;
+ }
+ else
+ os << get<const char*>(offset);
+ return os;
+ }
+
/* Enumerate this directory, invoking rcb per entry.  `offset` is either
 * a numeric cookie (uint64_t*) or a continuation marker name
 * (const char*); *eof is set when enumeration is complete.  The root
 * handle lists buckets; other directories list objects.  Queues a
 * READDIR gc event so cached state eventually expires. */
int RGWFileHandle::readdir(rgw_readdir_cb rcb, void *cb_arg,
                           readdir_offset offset,
                           bool *eof, uint32_t flags)
{
  using event = RGWLibFS::event;
  using boost::get;
  int rc = 0;
  struct timespec now;
  CephContext* cct = fs->get_context();

  lsubdout(cct, rgw, 10)
    << __func__ << " readdir called on "
    << object_name()
    << dendl;

  directory* d = get<directory>(&variant_type);
  if (d) {
    /* record cycle start so gc can detect an in-progress readdir */
    (void) clock_gettime(CLOCK_MONOTONIC_COARSE, &now); /* !LOCKED */
    lock_guard guard(mtx);
    d->last_readdir = now;
  }

  /* an "initial" enumeration (null marker / zero cookie) resets the
   * link count below */
  bool initial_off;
  char* mk{nullptr};

  if (likely(!! get<const char*>(&offset))) {
    mk = const_cast<char*>(get<const char*>(offset));
    initial_off = !mk;
  } else {
    initial_off = (*get<uint64_t*>(offset) == 0);
  }

  if (is_root()) {
    /* root: children are buckets */
    RGWListBucketsRequest req(cct, g_rgwlib->get_driver()->get_user(fs->get_user()->user_id),
                              this, rcb, cb_arg, offset);
    rc = g_rgwlib->get_fe()->execute_req(&req);
    if (! rc) {
      (void) clock_gettime(CLOCK_MONOTONIC_COARSE, &now); /* !LOCKED */
      lock_guard guard(mtx);
      state.atime = now;
      if (initial_off)
        set_nlink(2); /* '.' and parent's entry */
      inc_nlink(req.d_count);
      *eof = req.eof();
    }
  } else {
    /* ordinary directory: children are objects/common prefixes */
    RGWReaddirRequest req(cct, g_rgwlib->get_driver()->get_user(fs->get_user()->user_id),
                          this, rcb, cb_arg, offset);
    rc = g_rgwlib->get_fe()->execute_req(&req);
    if (! rc) {
      (void) clock_gettime(CLOCK_MONOTONIC_COARSE, &now); /* !LOCKED */
      lock_guard guard(mtx);
      state.atime = now;
      if (initial_off)
        set_nlink(2);
      inc_nlink(req.d_count);
      *eof = req.eof();
    }
  }

  /* schedule expiration of this directory's cached namespace view */
  event ev(event::type::READDIR, get_key(), state.atime);
  lock_guard sguard(fs->state.mtx);
  fs->state.push_event(ev);

  lsubdout(fs->get_context(), rgw, 15)
    << __func__
    << " final link count=" << state.nlink
    << dendl;

  return rc;
} /* RGWFileHandle::readdir */
+
/* Append data to this file as part of a (single, sequential) write
 * transaction.  The first write must start at offset 0 and creates the
 * transaction (f->write_req); later writes continue it.  Overlapping
 * prefixes of already-written data are trimmed.  Returns 0 on success,
 * -EISDIR for directories, -ESTALE on a deleted object, -EIO on
 * non-sequential writes or transaction failure. */
int RGWFileHandle::write(uint64_t off, size_t len, size_t *bytes_written,
                         void *buffer)
{
  using std::get;
  using WriteCompletion = RGWLibFS::WriteCompletion;

  lock_guard guard(mtx);

  int rc = 0;

  file* f = get<file>(&variant_type);
  if (! f)
    return -EISDIR;

  if (deleted()) {
    lsubdout(fs->get_context(), rgw, 5)
      << __func__
      << " write attempted on deleted object "
      << this->object_name()
      << dendl;
    /* zap write transaction, if any */
    if (f->write_req) {
      delete f->write_req;
      f->write_req = nullptr;
    }
    return -ESTALE;
  }

  if (! f->write_req) {
    /* guard--we do not support (e.g., COW-backed) partial writes */
    if (off != 0) {
      lsubdout(fs->get_context(), rgw, 5)
        << __func__
        << " " << object_name()
        << " non-0 initial write position " << off
        << " (mounting with -o sync required)"
        << dendl;
      return -EIO;
    }

    const RGWProcessEnv& penv = g_rgwlib->get_fe()->get_process()->get_env();

    /* start a new write transaction */
    std::string object_name = relative_object_name();
    f->write_req =
      new RGWWriteRequest(g_rgwlib->get_driver(), penv,
                          g_rgwlib->get_driver()->get_user(fs->get_user()->user_id),
                          this, bucket_name(), object_name);
    rc = g_rgwlib->get_fe()->start_req(f->write_req);
    if (rc < 0) {
      lsubdout(fs->get_context(), rgw, 5)
        << __func__
        << this->object_name()
        << " write start failed " << off
        << " (" << rc << ")"
        << dendl;
      /* zap failed write transaction */
      delete f->write_req;
      f->write_req = nullptr;
      return -EIO;
    } else {
      if (stateless_open()) {
        /* start write timer: auto-complete the transaction if the
         * (stateless NFS3) client never closes */
        f->write_req->timer_id =
          RGWLibFS::write_timer.add_event(
            std::chrono::seconds(RGWLibFS::write_completion_interval_s),
            WriteCompletion(*this));
      }
    }
  }

  /* trim any prefix of the buffer that precedes the transaction's
   * current position (retransmitted data) */
  int overlap = 0;
  if ((static_cast<off_t>(off) < f->write_req->real_ofs) &&
      ((f->write_req->real_ofs - off) <= len)) {
    overlap = f->write_req->real_ofs - off;
    off = f->write_req->real_ofs;
    buffer = static_cast<char*>(buffer) + overlap;
    len -= overlap;
  }

  buffer::list bl;
  /* XXXX copy the caller's buffer; the zero-copy (static buffer)
   * variant below is disabled */
#if 0
  bl.push_back(
    buffer::create_static(len, static_cast<char*>(buffer)));
#else
  bl.push_back(
    buffer::copy(static_cast<char*>(buffer), len));
#endif

  f->write_req->put_data(off, bl);
  rc = f->write_req->exec_continue();

  if (rc == 0) {
    size_t min_size = off + len;
    if (min_size > get_size())
      set_size(min_size);
    if (stateless_open()) {
      /* bump write timer */
      RGWLibFS::write_timer.adjust_event(
        f->write_req->timer_id, std::chrono::seconds(10));
    }
  } else {
    /* continuation failed (e.g., non-contiguous write position) */
    lsubdout(fs->get_context(), rgw, 5)
      << __func__
      << object_name()
      << " failed write at position " << off
      << " (fails write transaction) "
      << dendl;
    /* zap failed write transaction */
    delete f->write_req;
    f->write_req = nullptr;
    rc = -EIO;
  }

  /* the trimmed overlap still counts as "written" for the caller */
  *bytes_written = (rc == 0) ? (len + overlap) : 0;
  return rc;
} /* RGWFileHandle::write */
+
+ int RGWFileHandle::write_finish(uint32_t flags)
+ {
+ unique_lock guard{mtx, std::defer_lock};
+ int rc = 0;
+
+ if (! (flags & FLAG_LOCKED)) {
+ guard.lock();
+ }
+
+ file* f = get<file>(&variant_type);
+ if (f && (f->write_req)) {
+ lsubdout(fs->get_context(), rgw, 10)
+ << __func__
+ << " finishing write trans on " << object_name()
+ << dendl;
+ rc = g_rgwlib->get_fe()->finish_req(f->write_req);
+ if (! rc) {
+ rc = f->write_req->get_ret();
+ }
+ delete f->write_req;
+ f->write_req = nullptr;
+ }
+
+ return rc;
+ } /* RGWFileHandle::write_finish */
+
+ int RGWFileHandle::close()
+ {
+ lock_guard guard(mtx);
+
+ int rc = write_finish(FLAG_LOCKED);
+
+ flags &= ~FLAG_OPEN;
+ flags &= ~FLAG_STATELESS_OPEN;
+
+ return rc;
+ } /* RGWFileHandle::close */
+
/* file-state destructor: release any still-pending write transaction
 * (deleting nullptr is a no-op) */
RGWFileHandle::file::~file()
{
  delete write_req;
}
+
+ void RGWFileHandle::clear_state()
+ {
+ directory* d = get<directory>(&variant_type);
+ if (d) {
+ state.nlink = 2;
+ d->last_marker = rgw_obj_key{};
+ }
+ }
+
/* Fast-forward mtime (and ctime) on a directory so NFS clients observe
 * a changed "change attribute".  Pass FLAG_LOCKED when mtx is already
 * held. */
void RGWFileHandle::advance_mtime(uint32_t flags) {
  /* intended for use on directories, fast-forward mtime so as to
   * ensure a new, higher value for the change attribute */
  unique_lock uniq(mtx, std::defer_lock);
  if (likely(! (flags & RGWFileHandle::FLAG_LOCKED))) {
    uniq.lock();
  }

  /* advance mtime only if stored mtime is older than the
   * configured namespace expiration */
  auto now = real_clock::now();
  auto cmptime = state.mtime;
  cmptime.tv_sec +=
    fs->get_context()->_conf->rgw_nfs_namespace_expire_secs;
  if (cmptime < real_clock::to_timespec(now)) {
    /* sets ctime as well as mtime, to avoid masking updates should
     * ctime inexplicably hold a higher value */
    set_times(now);
  }
}
+
+ void RGWFileHandle::invalidate() {
+ RGWLibFS *fs = get_fs();
+ if (fs->invalidate_cb) {
+ fs->invalidate_cb(fs->invalidate_arg, get_key().fh_hk);
+ }
+ }
+
/* Begin a streaming write transaction: validate the target, resolve
 * versioning, create the atomic-write processor, and set up the
 * (optional) compression filter chain.  Returns 0 on success or a
 * negative error (op_ret). */
int RGWWriteRequest::exec_start() {
  req_state* state = get_state();

  /* Object needs a bucket from this point */
  state->object->set_bucket(state->bucket.get());

  auto compression_type =
    get_driver()->get_compression_type(state->bucket->get_placement_rule());

  /* not obviously supportable */
  ceph_assert(! dlo_manifest);
  ceph_assert(! slo_info);

  perfcounter->inc(l_rgw_put);
  op_ret = -EINVAL;

  if (state->object->empty()) {
    ldout(state->cct, 0) << __func__ << " called on empty object" << dendl;
    goto done;
  }

  op_ret = get_params(null_yield);
  if (op_ret < 0)
    goto done;

  op_ret = get_system_versioning_params(state, &olh_epoch, &version_id);
  if (op_ret < 0) {
    goto done;
  }

  /* user-supplied MD5 check skipped (not supplied) */
  /* early quota check skipped--we don't have size yet */
  /* skipping user-supplied etag--we might have one in future, but
   * like data it and other attrs would arrive after open */

  aio.emplace(state->cct->_conf->rgw_put_obj_min_window_size);

  /* versioned buckets: use the supplied instance or mint a fresh one */
  if (state->bucket->versioning_enabled()) {
    if (!version_id.empty()) {
      state->object->set_instance(version_id);
    } else {
      state->object->gen_rand_obj_instance_name();
      version_id = state->object->get_instance();
    }
  }
  processor = get_driver()->get_atomic_writer(this, state->yield, state->object.get(),
                                              state->bucket_owner.get_id(),
                                              &state->dest_placement, 0, state->req_id);

  op_ret = processor->prepare(state->yield);
  if (op_ret < 0) {
    ldout(state->cct, 20) << "processor->prepare() returned ret=" << op_ret
                          << dendl;
    goto done;
  }
  /* data flows through `filter`; with compression enabled the chain is
   * compressor -> processor, otherwise just the processor */
  filter = &*processor;
  if (compression_type != "none") {
    plugin = Compressor::create(state->cct, compression_type);
    if (! plugin) {
      /* non-fatal: fall back to uncompressed writes */
      ldout(state->cct, 1) << "Cannot load plugin for rgw_compression_type "
                           << compression_type << dendl;
    } else {
      compressor.emplace(state->cct, plugin, filter);
      filter = &*compressor;
    }
  }

done:
  return op_ret;
} /* exec_start */
+
/* Feed the next buffered chunk (this->data at this->ofs) through the
 * filter chain.  Fails with -EIO on out-of-order chunks or quota
 * violation; empty chunks are a no-op. */
int RGWWriteRequest::exec_continue()
{
  req_state* state = get_state();
  op_ret = 0;

  /* check guards (e.g., contig write) */
  if (eio) {
    ldout(state->cct, 5)
      << " chunks arrived in wrong order"
      << " (mounting with -o sync required)"
      << dendl;
    return -EIO;
  }

  op_ret = state->bucket->check_quota(this, quota, real_ofs, null_yield, true);
  /* max_size exceed */
  if (op_ret < 0)
    return -EIO;

  size_t len = data.length();
  if (! len)
    return 0;

  /* running MD5 over the plaintext; compression happens inside filter */
  hash.Update((const unsigned char *)data.c_str(), data.length());
  op_ret = filter->process(std::move(data), ofs);
  if (op_ret < 0) {
    goto done;
  }
  bytes_written += len;

done:
  return op_ret;
} /* exec_continue */
+
/* Complete the write transaction: flush the filter chain, re-check
 * quota against the final size, compute the MD5 etag, assemble object
 * attributes (ACL, unix attrs, compression info, user metadata), and
 * commit via processor->complete().  On failure the handle's cached
 * size/times are rolled back to their pre-write values. */
int RGWWriteRequest::exec_finish()
{
  buffer::list bl, aclbl, ux_key, ux_attrs;
  map<string, string>::iterator iter;
  char calc_md5[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
  unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE];
  req_state* state = get_state();

  /* snapshot prior attrs so they can be reverted on failure */
  size_t osize = rgw_fh->get_size();
  struct timespec octime = rgw_fh->get_ctime();
  struct timespec omtime = rgw_fh->get_mtime();
  real_time appx_t = real_clock::now();

  state->obj_size = bytes_written;
  perfcounter->inc(l_rgw_put_b, state->obj_size);

  // flush data in filters
  op_ret = filter->process({}, state->obj_size);
  if (op_ret < 0) {
    goto done;
  }

  op_ret = state->bucket->check_quota(this, quota, state->obj_size, null_yield, true);
  /* max_size exceed */
  if (op_ret < 0) {
    goto done;
  }

  hash.Final(m);

  /* record compression metadata when the compressor actually ran */
  if (compressor && compressor->is_compressed()) {
    bufferlist tmp;
    RGWCompressionInfo cs_info;
    cs_info.compression_type = plugin->get_type_name();
    cs_info.orig_size = state->obj_size;
    cs_info.blocks = std::move(compressor->get_compression_blocks());
    encode(cs_info, tmp);
    attrs[RGW_ATTR_COMPRESSION] = tmp;
    ldpp_dout(this, 20) << "storing " << RGW_ATTR_COMPRESSION
                        << " with type=" << cs_info.compression_type
                        << ", orig_size=" << cs_info.orig_size
                        << ", blocks=" << cs_info.blocks.size() << dendl;
  }

  /* etag is the hex MD5 of the plaintext */
  buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, calc_md5);
  etag = calc_md5;

  bl.append(etag.c_str(), etag.size() + 1);
  emplace_attr(RGW_ATTR_ETAG, std::move(bl));

  policy.encode(aclbl);
  emplace_attr(RGW_ATTR_ACL, std::move(aclbl));

  /* unix attrs */
  rgw_fh->set_mtime(real_clock::to_timespec(appx_t));
  rgw_fh->set_ctime(real_clock::to_timespec(appx_t));
  rgw_fh->set_size(bytes_written);
  rgw_fh->encode_attrs(ux_key, ux_attrs);

  emplace_attr(RGW_ATTR_UNIX_KEY1, std::move(ux_key));
  emplace_attr(RGW_ATTR_UNIX1, std::move(ux_attrs));

  /* copy generic (x-amz-meta style) attributes */
  for (iter = state->generic_attrs.begin(); iter != state->generic_attrs.end();
       ++iter) {
    buffer::list& attrbl = attrs[iter->first];
    const string& val = iter->second;
    attrbl.append(val.c_str(), val.size() + 1);
  }

  op_ret = rgw_get_request_metadata(this, state->cct, state->info, attrs);
  if (op_ret < 0) {
    goto done;
  }
  encode_delete_at_attr(delete_at, attrs);

  /* Add a custom metadata to expose the information whether an object
   * is an SLO or not. Appending the attribute must be performed AFTER
   * processing any input from user in order to prohibit overwriting.
   * NOTE(review): exec_start() asserts !slo_info, so this branch looks
   * defensive/unreachable here -- confirm before removing. */
  if (unlikely(!! slo_info)) {
    buffer::list slo_userindicator_bl;
    using ceph::encode;
    encode("True", slo_userindicator_bl);
    emplace_attr(RGW_ATTR_SLO_UINDICATOR, std::move(slo_userindicator_bl));
  }

  op_ret = processor->complete(state->obj_size, etag, &mtime, real_time(), attrs,
                               (delete_at ? *delete_at : real_time()),
                               if_match, if_nomatch, nullptr, nullptr, nullptr,
                               state->yield);
  if (op_ret != 0) {
    /* revert attr updates */
    rgw_fh->set_mtime(omtime);
    rgw_fh->set_ctime(octime);
    rgw_fh->set_size(osize);
  }

done:
  perfcounter->tinc(l_rgw_put_lat, state->time_elapsed());
  return op_ret;
} /* exec_finish */
+
+} /* namespace rgw */
+
+/* librgw */
+extern "C" {
+
+void rgwfile_version(int *major, int *minor, int *extra)
+{
+ if (major)
+ *major = LIBRGW_FILE_VER_MAJOR;
+ if (minor)
+ *minor = LIBRGW_FILE_VER_MINOR;
+ if (extra)
+ *extra = LIBRGW_FILE_VER_EXTRA;
+}
+
+/*
+ attach rgw namespace
+*/
+ int rgw_mount(librgw_t rgw, const char *uid, const char *acc_key,
+ const char *sec_key, struct rgw_fs **rgw_fs,
+ uint32_t flags)
+{
+ int rc = 0;
+
+ /* stash access data for "mount" */
+ RGWLibFS* new_fs = new RGWLibFS(static_cast<CephContext*>(rgw), uid, acc_key,
+ sec_key, "/");
+ ceph_assert(new_fs);
+
+ const DoutPrefix dp(g_rgwlib->get_driver()->ctx(), dout_subsys, "rgw mount: ");
+ rc = new_fs->authorize(&dp, g_rgwlib->get_driver());
+ if (rc != 0) {
+ delete new_fs;
+ return -EINVAL;
+ }
+
+ /* register fs for shared gc */
+ g_rgwlib->get_fe()->get_process()->register_fs(new_fs);
+
+ struct rgw_fs *fs = new_fs->get_fs();
+ fs->rgw = rgw;
+
+ /* XXX we no longer assume "/" is unique, but we aren't tracking the
+ * roots atm */
+
+ *rgw_fs = fs;
+
+ return 0;
+}
+
+int rgw_mount2(librgw_t rgw, const char *uid, const char *acc_key,
+ const char *sec_key, const char *root, struct rgw_fs **rgw_fs,
+ uint32_t flags)
+{
+ int rc = 0;
+
+ /* if the config has no value for path/root, choose "/" */
+ RGWLibFS* new_fs{nullptr};
+ if(root &&
+ (!strcmp(root, ""))) {
+ /* stash access data for "mount" */
+ new_fs = new RGWLibFS(
+ static_cast<CephContext*>(rgw), uid, acc_key, sec_key, "/");
+ }
+ else {
+ /* stash access data for "mount" */
+ new_fs = new RGWLibFS(
+ static_cast<CephContext*>(rgw), uid, acc_key, sec_key, root);
+ }
+
+ ceph_assert(new_fs); /* should we be using ceph_assert? */
+
+ const DoutPrefix dp(g_rgwlib->get_driver()->ctx(), dout_subsys, "rgw mount2: ");
+ rc = new_fs->authorize(&dp, g_rgwlib->get_driver());
+ if (rc != 0) {
+ delete new_fs;
+ return -EINVAL;
+ }
+
+ /* register fs for shared gc */
+ g_rgwlib->get_fe()->get_process()->register_fs(new_fs);
+
+ struct rgw_fs *fs = new_fs->get_fs();
+ fs->rgw = rgw;
+
+ /* XXX we no longer assume "/" is unique, but we aren't tracking the
+ * roots atm */
+
+ *rgw_fs = fs;
+
+ return 0;
+}
+
+/*
+ register invalidate callbacks
+*/
+int rgw_register_invalidate(struct rgw_fs *rgw_fs, rgw_fh_callback_t cb,
+ void *arg, uint32_t flags)
+
+{
+ RGWLibFS *fs = static_cast<RGWLibFS*>(rgw_fs->fs_private);
+ return fs->register_invalidate(cb, arg, flags);
+}
+
+/*
+ detach rgw namespace
+*/
+int rgw_umount(struct rgw_fs *rgw_fs, uint32_t flags)
+{
+ RGWLibFS *fs = static_cast<RGWLibFS*>(rgw_fs->fs_private);
+ fs->close();
+ return 0;
+}
+
+/*
+ get filesystem attributes
+*/
/* Fill vfs_st from cluster-wide usage totals, presented as 1 MiB
 * blocks. */
int rgw_statfs(struct rgw_fs *rgw_fs,
               struct rgw_file_handle *parent_fh,
               struct rgw_statvfs *vfs_st, uint32_t flags)
{
  RGWLibFS *fs = static_cast<RGWLibFS*>(rgw_fs->fs_private);
  struct rados_cluster_stat_t stats;

  /* query cluster usage through the request framework */
  RGWGetClusterStatReq req(fs->get_context(),
                           g_rgwlib->get_driver()->get_user(fs->get_user()->user_id),
                           stats);
  int rc = g_rgwlib->get_fe()->execute_req(&req);
  if (rc < 0) {
    lderr(fs->get_context()) << "ERROR: getting total cluster usage"
                             << cpp_strerror(-rc) << dendl;
    return rc;
  }

  //Set block size to 1M.
  constexpr uint32_t CEPH_BLOCK_SHIFT = 20;
  vfs_st->f_bsize = 1 << CEPH_BLOCK_SHIFT;
  vfs_st->f_frsize = 1 << CEPH_BLOCK_SHIFT;
  /* stats counts are in KiB; >> (20 - 10) converts KiB to 1 MiB blocks */
  vfs_st->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
  vfs_st->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
  vfs_st->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
  vfs_st->f_files = stats.num_objects;
  /* no meaningful free-inode count for object storage */
  vfs_st->f_ffree = -1;
  vfs_st->f_fsid[0] = fs->get_fsid();
  vfs_st->f_fsid[1] = fs->get_fsid();
  vfs_st->f_flag = 0;
  vfs_st->f_namemax = 4096;
  return 0;
}
+
+/*
+ generic create -- create an empty regular file
+*/
+int rgw_create(struct rgw_fs *rgw_fs, struct rgw_file_handle *parent_fh,
+ const char *name, struct stat *st, uint32_t mask,
+ struct rgw_file_handle **fh, uint32_t posix_flags,
+ uint32_t flags)
+{
+ using std::get;
+
+ RGWLibFS *fs = static_cast<RGWLibFS*>(rgw_fs->fs_private);
+ RGWFileHandle* parent = get_rgwfh(parent_fh);
+
+ if ((! parent) ||
+ (parent->is_root()) ||
+ (parent->is_file())) {
+ /* bad parent */
+ return -EINVAL;
+ }
+
+ MkObjResult fhr = fs->create(parent, name, st, mask, flags);
+ RGWFileHandle *nfh = get<0>(fhr); // nullptr if !success
+
+ if (nfh)
+ *fh = nfh->get_fh();
+
+ return get<1>(fhr);
+} /* rgw_create */
+
+/*
+ create a symbolic link
+ */
+int rgw_symlink(struct rgw_fs *rgw_fs, struct rgw_file_handle *parent_fh,
+ const char *name, const char *link_path, struct stat *st, uint32_t mask,
+ struct rgw_file_handle **fh, uint32_t posix_flags,
+ uint32_t flags)
+{
+ using std::get;
+
+ RGWLibFS *fs = static_cast<RGWLibFS*>(rgw_fs->fs_private);
+ RGWFileHandle* parent = get_rgwfh(parent_fh);
+
+ if ((! parent) ||
+ (parent->is_root()) ||
+ (parent->is_file())) {
+ /* bad parent */
+ return -EINVAL;
+ }
+
+ MkObjResult fhr = fs->symlink(parent, name, link_path, st, mask, flags);
+ RGWFileHandle *nfh = get<0>(fhr); // nullptr if !success
+
+ if (nfh)
+ *fh = nfh->get_fh();
+
+ return get<1>(fhr);
+} /* rgw_symlink */
+
+/*
+ create a new directory
+*/
+int rgw_mkdir(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *parent_fh,
+ const char *name, struct stat *st, uint32_t mask,
+ struct rgw_file_handle **fh, uint32_t flags)
+{
+ using std::get;
+
+ RGWLibFS *fs = static_cast<RGWLibFS*>(rgw_fs->fs_private);
+ RGWFileHandle* parent = get_rgwfh(parent_fh);
+
+ if (! parent) {
+ /* bad parent */
+ return -EINVAL;
+ }
+
+ MkObjResult fhr = fs->mkdir(parent, name, st, mask, flags);
+ RGWFileHandle *nfh = get<0>(fhr); // nullptr if !success
+
+ if (nfh)
+ *fh = nfh->get_fh();
+
+ return get<1>(fhr);
+} /* rgw_mkdir */
+
+/*
+ rename object
+*/
+int rgw_rename(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *src, const char* src_name,
+ struct rgw_file_handle *dst, const char* dst_name,
+ uint32_t flags)
+{
+ RGWLibFS *fs = static_cast<RGWLibFS*>(rgw_fs->fs_private);
+
+ RGWFileHandle* src_fh = get_rgwfh(src);
+ RGWFileHandle* dst_fh = get_rgwfh(dst);
+
+ return fs->rename(src_fh, dst_fh, src_name, dst_name);
+}
+
+/*
+ remove file or directory
+*/
+int rgw_unlink(struct rgw_fs *rgw_fs, struct rgw_file_handle *parent_fh,
+ const char *name, uint32_t flags)
+{
+ RGWLibFS *fs = static_cast<RGWLibFS*>(rgw_fs->fs_private);
+ RGWFileHandle* parent = get_rgwfh(parent_fh);
+
+ return fs->unlink(parent, name);
+}
+
+/*
+ lookup object by name (POSIX style)
+*/
/* Resolve `path` relative to parent_fh, returning a (referenced) file
 * handle in *fh.  Handles the special names ".." and "/", bucket
 * lookups under the root, cached/"fake" attribute fast paths, and
 * optional creation (RGW_LOOKUP_FLAG_CREATE).  Returns 0 or -EINVAL /
 * -ENOENT. */
int rgw_lookup(struct rgw_fs *rgw_fs,
               struct rgw_file_handle *parent_fh, const char* path,
               struct rgw_file_handle **fh,
               struct stat *st, uint32_t mask, uint32_t flags)
{
  //CephContext* cct = static_cast<CephContext*>(rgw_fs->rgw);
  RGWLibFS *fs = static_cast<RGWLibFS*>(rgw_fs->fs_private);

  RGWFileHandle* parent = get_rgwfh(parent_fh);
  if ((! parent) ||
      (! parent->is_dir())) {
    /* bad parent */
    return -EINVAL;
  }

  RGWFileHandle* rgw_fh;
  LookupFHResult fhr;

  if (parent->is_root()) {
    /* special: parent lookup--note lack of ref()! */
    if (unlikely((strcmp(path, "..") == 0) ||
                 (strcmp(path, "/") == 0))) {
      rgw_fh = parent;
    } else {
      /* children of the root are buckets */
      RGWLibFS::BucketStats bstat;
      fhr = fs->stat_bucket(parent, path, bstat, RGWFileHandle::FLAG_NONE);
      rgw_fh = get<0>(fhr);
      if (! rgw_fh)
        return -ENOENT;
    }
  } else {
    /* special: after readdir--note extra ref()! */
    if (unlikely((strcmp(path, "..") == 0))) {
      rgw_fh = parent;
      lsubdout(fs->get_context(), rgw, 17)
        << __func__ << " BANG"<< *rgw_fh
        << dendl;
      fs->ref(rgw_fh);
    } else {
      enum rgw_fh_type fh_type = fh_type_of(flags);

      /* in-callback lookups skip the exact-match requirement */
      uint32_t sl_flags = (flags & RGW_LOOKUP_FLAG_RCB)
        ? RGWFileHandle::FLAG_IN_CB
        : RGWFileHandle::FLAG_EXACT_MATCH;

      bool fast_attrs= fs->get_context()->_conf->rgw_nfs_s3_fast_attrs;

      if ((flags & RGW_LOOKUP_FLAG_RCB) && fast_attrs) {
        /* FAKE STAT--this should mean, interpolate special
         * owner, group, and perms masks */
        fhr = fs->fake_leaf(parent, path, fh_type, st, mask, sl_flags);
      } else {
        if ((fh_type == RGW_FS_TYPE_DIRECTORY) && fast_attrs) {
          /* trust cached dir, if present */
          fhr = fs->lookup_fh(parent, path, RGWFileHandle::FLAG_DIRECTORY);
          if (get<0>(fhr)) {
            rgw_fh = get<0>(fhr);
            goto done;
          }
        }
        fhr = fs->stat_leaf(parent, path, fh_type, sl_flags);
      }
      if (! get<0>(fhr)) {
        /* not found: optionally create an in-memory handle */
        if (! (flags & RGW_LOOKUP_FLAG_CREATE))
          return -ENOENT;
        else
          fhr = fs->lookup_fh(parent, path, RGWFileHandle::FLAG_CREATE);
      }
      rgw_fh = get<0>(fhr);
    }
  } /* !root */

done:
  struct rgw_file_handle *rfh = rgw_fh->get_fh();
  *fh = rfh;

  return 0;
} /* rgw_lookup */
+
+/*
+ lookup object by handle (NFS style)
+*/
+int rgw_lookup_handle(struct rgw_fs *rgw_fs, struct rgw_fh_hk *fh_hk,
+ struct rgw_file_handle **fh, uint32_t flags)
+{
+ RGWLibFS *fs = static_cast<RGWLibFS*>(rgw_fs->fs_private);
+
+ RGWFileHandle* rgw_fh = fs->lookup_handle(*fh_hk);
+ if (! rgw_fh) {
+ /* not found */
+ return -ENOENT;
+ }
+
+ struct rgw_file_handle *rfh = rgw_fh->get_fh();
+ *fh = rfh;
+
+ return 0;
+}
+
+/*
+ * release file handle
+ */
+int rgw_fh_rele(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh,
+ uint32_t flags)
+{
+ RGWLibFS *fs = static_cast<RGWLibFS*>(rgw_fs->fs_private);
+ RGWFileHandle* rgw_fh = get_rgwfh(fh);
+
+ lsubdout(fs->get_context(), rgw, 17)
+ << __func__ << " " << *rgw_fh
+ << dendl;
+
+ fs->unref(rgw_fh);
+ return 0;
+}
+
+/*
+ get unix attributes for object
+*/
+int rgw_getattr(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *fh, struct stat *st, uint32_t flags)
+{
+ RGWLibFS *fs = static_cast<RGWLibFS*>(rgw_fs->fs_private);
+ RGWFileHandle* rgw_fh = get_rgwfh(fh);
+
+ return fs->getattr(rgw_fh, st);
+}
+
+/*
+ set unix attributes for object
+*/
+int rgw_setattr(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *fh, struct stat *st,
+ uint32_t mask, uint32_t flags)
+{
+ RGWLibFS *fs = static_cast<RGWLibFS*>(rgw_fs->fs_private);
+ RGWFileHandle* rgw_fh = get_rgwfh(fh);
+
+ return fs->setattr(rgw_fh, st, mask, flags);
+}
+
+/*
+ truncate file
+*/
/* Truncate is intentionally a no-op that reports success: object
 * storage writes are whole-object transactions, so there is nothing
 * to truncate server-side. */
int rgw_truncate(struct rgw_fs *rgw_fs,
                 struct rgw_file_handle *fh, uint64_t size, uint32_t flags)
{
  return 0;
}
+
+/*
+ open file
+*/
/* Open a regular file handle; directories are rejected with -EISDIR.
 * The actual open-state bookkeeping is in RGWFileHandle::open(). */
int rgw_open(struct rgw_fs *rgw_fs,
             struct rgw_file_handle *fh, uint32_t posix_flags, uint32_t flags)
{
  RGWFileHandle* rgw_fh = get_rgwfh(fh);

  /* XXX
   * need to track specific opens--at least read opens and
   * a write open; we need to know when a write open is returned,
   * that closes a write transaction
   *
   * for now, we will support single-open only, it's preferable to
   * anything we can otherwise do without access to the NFS state
   */
  if (! rgw_fh->is_file())
    return -EISDIR;

  return rgw_fh->open(flags);
}
+
+/*
+ close file
+*/
+int rgw_close(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *fh, uint32_t flags)
+{
+ RGWLibFS *fs = static_cast<RGWLibFS*>(rgw_fs->fs_private);
+ RGWFileHandle* rgw_fh = get_rgwfh(fh);
+ int rc = rgw_fh->close(/* XXX */);
+
+ if (flags & RGW_CLOSE_FLAG_RELE)
+ fs->unref(rgw_fh);
+
+ return rc;
+}
+
/* Enumerate a directory by numeric offset cookie.  Optionally emits
 * synthetic "." and ".." entries (at their NFS-defined cookies 1 and 2)
 * before delegating to RGWFileHandle::readdir. */
int rgw_readdir(struct rgw_fs *rgw_fs,
                struct rgw_file_handle *parent_fh, uint64_t *offset,
                rgw_readdir_cb rcb, void *cb_arg, bool *eof,
                uint32_t flags)
{
  RGWFileHandle* parent = get_rgwfh(parent_fh);
  if (! parent) {
    /* bad parent */
    return -EINVAL;
  }

  lsubdout(parent->get_fs()->get_context(), rgw, 15)
    << __func__
    << " offset=" << *offset
    << dendl;

  /* only the initial call of a cycle emits the dot entries */
  if ((*offset == 0) &&
      (flags & RGW_READDIR_FLAG_DOTDOT)) {
    /* send '.' and '..' with their NFS-defined offsets */
    rcb(".", cb_arg, 1, nullptr, 0, RGW_LOOKUP_FLAG_DIR);
    rcb("..", cb_arg, 2, nullptr, 0, RGW_LOOKUP_FLAG_DIR);
  }

  int rc = parent->readdir(rcb, cb_arg, offset, eof, flags);
  return rc;
} /* rgw_readdir */
+
+/* enumeration continuing from name */
+int rgw_readdir2(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *parent_fh, const char *name,
+ rgw_readdir_cb rcb, void *cb_arg, bool *eof,
+ uint32_t flags)
+{
+ RGWFileHandle* parent = get_rgwfh(parent_fh);
+ if (! parent) {
+ /* bad parent */
+ return -EINVAL;
+ }
+
+ lsubdout(parent->get_fs()->get_context(), rgw, 15)
+ << __func__
+ << " offset=" << ((name) ? name : "(nil)")
+ << dendl;
+
+ if ((! name) &&
+ (flags & RGW_READDIR_FLAG_DOTDOT)) {
+ /* send '.' and '..' with their NFS-defined offsets */
+ rcb(".", cb_arg, 1, nullptr, 0, RGW_LOOKUP_FLAG_DIR);
+ rcb("..", cb_arg, 2, nullptr, 0, RGW_LOOKUP_FLAG_DIR);
+ }
+
+ int rc = parent->readdir(rcb, cb_arg, name, eof, flags);
+ return rc;
+} /* rgw_readdir2 */
+
+/* project offset of dirent name */
+int rgw_dirent_offset(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *parent_fh,
+ const char *name, int64_t *offset,
+ uint32_t flags)
+{
+ RGWFileHandle* parent = get_rgwfh(parent_fh);
+ if ((! parent)) {
+ /* bad parent */
+ return -EINVAL;
+ }
+ std::string sname{name};
+ int rc = parent->offset_of(sname, offset, flags);
+ return rc;
+}
+
+/*
+ read data from file
+*/
+int rgw_read(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *fh, uint64_t offset,
+ size_t length, size_t *bytes_read, void *buffer,
+ uint32_t flags)
+{
+ RGWLibFS *fs = static_cast<RGWLibFS*>(rgw_fs->fs_private);
+ RGWFileHandle* rgw_fh = get_rgwfh(fh);
+
+ return fs->read(rgw_fh, offset, length, bytes_read, buffer, flags);
+}
+
+/*
+ read symbolic link
+*/
+int rgw_readlink(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *fh, uint64_t offset,
+ size_t length, size_t *bytes_read, void *buffer,
+ uint32_t flags)
+{
+ RGWLibFS *fs = static_cast<RGWLibFS*>(rgw_fs->fs_private);
+ RGWFileHandle* rgw_fh = get_rgwfh(fh);
+
+ return fs->readlink(rgw_fh, offset, length, bytes_read, buffer, flags);
+}
+
+/*
+ write data to file
+*/
+int rgw_write(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *fh, uint64_t offset,
+ size_t length, size_t *bytes_written, void *buffer,
+ uint32_t flags)
+{
+ RGWFileHandle* rgw_fh = get_rgwfh(fh);
+ int rc;
+
+ *bytes_written = 0;
+
+ if (! rgw_fh->is_file())
+ return -EISDIR;
+
+ if (! rgw_fh->is_open()) {
+ if (flags & RGW_OPEN_FLAG_V3) {
+ rc = rgw_fh->open(flags);
+ if (!! rc)
+ return rc;
+ } else
+ return -EPERM;
+ }
+
+ rc = rgw_fh->write(offset, length, bytes_written, buffer);
+
+ return rc;
+}
+
+/*
+ read data from file (vector)
+*/
+class RGWReadV
+{
+ buffer::list bl;
+ struct rgw_vio* vio;
+
+public:
+ RGWReadV(buffer::list& _bl, rgw_vio* _vio) : vio(_vio) {
+ bl = std::move(_bl);
+ }
+
+ struct rgw_vio* get_vio() { return vio; }
+
+ const auto& buffers() { return bl.buffers(); }
+
+ unsigned /* XXX */ length() { return bl.length(); }
+
+};
+
/* Release a vectored-read result: explicitly run the RGWReadV
 * destructor, then free the raw storage with ::operator delete --
 * this pairs with rgw_readv, which obtains the storage via
 * ::operator new and placement-constructs the object into it. */
void rgw_readv_rele(struct rgw_uio *uio, uint32_t flags)
{
  RGWReadV* rdv = static_cast<RGWReadV*>(uio->uio_p1);
  rdv->~RGWReadV();
  ::operator delete(rdv);
}
+
+int rgw_readv(struct rgw_fs *rgw_fs,
+ struct rgw_file_handle *fh, rgw_uio *uio, uint32_t flags)
+{
+#if 0 /* XXX */
+ CephContext* cct = static_cast<CephContext*>(rgw_fs->rgw);
+ RGWLibFS *fs = static_cast<RGWLibFS*>(rgw_fs->fs_private);
+ RGWFileHandle* rgw_fh = get_rgwfh(fh);
+
+ if (! rgw_fh->is_file())
+ return -EINVAL;
+
+ int rc = 0;
+
+ buffer::list bl;
+ RGWGetObjRequest req(cct, fs->get_user(), rgw_fh->bucket_name(),
+ rgw_fh->object_name(), uio->uio_offset, uio->uio_resid,
+ bl);
+ req.do_hexdump = false;
+
+ rc = g_rgwlib->get_fe()->execute_req(&req);
+
+ if (! rc) {
+ RGWReadV* rdv = static_cast<RGWReadV*>(
+ ::operator new(sizeof(RGWReadV) +
+ (bl.buffers().size() * sizeof(struct rgw_vio))));
+
+ (void) new (rdv)
+ RGWReadV(bl, reinterpret_cast<rgw_vio*>(rdv+sizeof(RGWReadV)));
+
+ uio->uio_p1 = rdv;
+ uio->uio_cnt = rdv->buffers().size();
+ uio->uio_resid = rdv->length();
+ uio->uio_vio = rdv->get_vio();
+ uio->uio_rele = rgw_readv_rele;
+
+ int ix = 0;
+ auto& buffers = rdv->buffers();
+ for (auto& bp : buffers) {
+ rgw_vio *vio = &(uio->uio_vio[ix]);
+ vio->vio_base = const_cast<char*>(bp.c_str());
+ vio->vio_len = bp.length();
+ vio->vio_u1 = nullptr;
+ vio->vio_p1 = nullptr;
+ ++ix;
+ }
+ }
+
+ return rc;
+#else
+ return 0;
+#endif
+}
+
+/*
+ write data to file (vector)
+*/
+int rgw_writev(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh,
+ rgw_uio *uio, uint32_t flags)
+{
+
+ // not supported - rest of function is ignored
+ return -ENOTSUP;
+
+ CephContext* cct = static_cast<CephContext*>(rgw_fs->rgw);
+ RGWLibFS *fs = static_cast<RGWLibFS*>(rgw_fs->fs_private);
+ RGWFileHandle* rgw_fh = get_rgwfh(fh);
+
+ if (! rgw_fh->is_file())
+ return -EINVAL;
+
+ buffer::list bl;
+ for (unsigned int ix = 0; ix < uio->uio_cnt; ++ix) {
+ rgw_vio *vio = &(uio->uio_vio[ix]);
+ bl.push_back(
+ buffer::create_static(vio->vio_len,
+ static_cast<char*>(vio->vio_base)));
+ }
+
+ std::string oname = rgw_fh->relative_object_name();
+ RGWPutObjRequest req(cct, g_rgwlib->get_driver()->get_user(fs->get_user()->user_id),
+ rgw_fh->bucket_name(), oname, bl);
+
+ int rc = g_rgwlib->get_fe()->execute_req(&req);
+
+ /* XXX update size (in request) */
+
+ return rc;
+}
+
/*
  sync written data
*/
/* NOTE: no-op; reports success unconditionally.  TODO(review):
 * confirm durability is provided by the write/commit/close path so
 * that fsync can safely be a no-op here. */
int rgw_fsync(struct rgw_fs *rgw_fs, struct rgw_file_handle *handle,
	      uint32_t flags)
{
  return 0;
}
+
+int rgw_commit(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh,
+ uint64_t offset, uint64_t length, uint32_t flags)
+{
+ RGWFileHandle* rgw_fh = get_rgwfh(fh);
+
+ return rgw_fh->commit(offset, length, RGWFileHandle::FLAG_NONE);
+}
+
+/*
+ extended attributes
+ */
+
+int rgw_getxattrs(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh,
+ rgw_xattrlist *attrs, rgw_getxattr_cb cb, void *cb_arg,
+ uint32_t flags)
+{
+ RGWLibFS *fs = static_cast<RGWLibFS*>(rgw_fs->fs_private);
+ RGWFileHandle* rgw_fh = get_rgwfh(fh);
+
+ return fs->getxattrs(rgw_fh, attrs, cb, cb_arg, flags);
+}
+
+int rgw_lsxattrs(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh,
+ rgw_xattrstr *filter_prefix /* ignored */,
+ rgw_getxattr_cb cb, void *cb_arg, uint32_t flags)
+{
+ RGWLibFS *fs = static_cast<RGWLibFS*>(rgw_fs->fs_private);
+ RGWFileHandle* rgw_fh = get_rgwfh(fh);
+
+ return fs->lsxattrs(rgw_fh, filter_prefix, cb, cb_arg, flags);
+}
+
+int rgw_setxattrs(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh,
+ rgw_xattrlist *attrs, uint32_t flags)
+{
+ RGWLibFS *fs = static_cast<RGWLibFS*>(rgw_fs->fs_private);
+ RGWFileHandle* rgw_fh = get_rgwfh(fh);
+
+ return fs->setxattrs(rgw_fh, attrs, flags);
+}
+
+int rgw_rmxattrs(struct rgw_fs *rgw_fs, struct rgw_file_handle *fh,
+ rgw_xattrlist *attrs, uint32_t flags)
+{
+ RGWLibFS *fs = static_cast<RGWLibFS*>(rgw_fs->fs_private);
+ RGWFileHandle* rgw_fh = get_rgwfh(fh);
+
+ return fs->rmxattrs(rgw_fh, attrs, flags);
+}
+
+} /* extern "C" */
diff --git a/src/rgw/rgw_file.h b/src/rgw/rgw_file.h
new file mode 100644
index 000000000..65ec3dd15
--- /dev/null
+++ b/src/rgw/rgw_file.h
@@ -0,0 +1,2857 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "include/rados/rgw_file.h"
+
+/* internal header */
+#include <string.h>
+#include <string_view>
+#include <sys/stat.h>
+#include <stdint.h>
+
+#include <atomic>
+#include <chrono>
+#include <thread>
+#include <mutex>
+#include <vector>
+#include <deque>
+#include <algorithm>
+#include <functional>
+#include <boost/intrusive_ptr.hpp>
+#include <boost/range/adaptor/reversed.hpp>
+#include <boost/container/flat_map.hpp>
+#include <boost/variant.hpp>
+#include <boost/optional.hpp>
+#include "xxhash.h"
+#include "include/buffer.h"
+#include "common/cohort_lru.h"
+#include "common/ceph_timer.h"
+#include "rgw_common.h"
+#include "rgw_user.h"
+#include "rgw_lib.h"
+#include "rgw_ldap.h"
+#include "rgw_token.h"
+#include "rgw_putobj_processor.h"
+#include "rgw_aio_throttle.h"
+#include "rgw_compression.h"
+
+
+/* XXX
+ * ASSERT_H somehow not defined after all the above (which bring
+ * in common/debug.h [e.g., dout])
+ */
+#include "include/ceph_assert.h"
+
+
+#define RGW_RWXMODE (S_IRWXU | S_IRWXG | S_IRWXO)
+
+#define RGW_RWMODE (RGW_RWXMODE & \
+ ~(S_IXUSR | S_IXGRP | S_IXOTH))
+
+
+namespace rgw {
+
  /* swallow a value, e.g. to silence unused-result warnings */
  template <typename T>
  static inline void ignore(T &&) {}
+
+
+ namespace bi = boost::intrusive;
+
+ class RGWLibFS;
+ class RGWFileHandle;
+ class RGWWriteRequest;
+
+ inline bool operator <(const struct timespec& lhs,
+ const struct timespec& rhs) {
+ if (lhs.tv_sec == rhs.tv_sec)
+ return lhs.tv_nsec < rhs.tv_nsec;
+ else
+ return lhs.tv_sec < rhs.tv_sec;
+ }
+
+ inline bool operator ==(const struct timespec& lhs,
+ const struct timespec& rhs) {
+ return ((lhs.tv_sec == rhs.tv_sec) &&
+ (lhs.tv_nsec == rhs.tv_nsec));
+ }
+
+ /*
+ * XXX
+ * The current 64-bit, non-cryptographic hash used here is intended
+ * for prototyping only.
+ *
+ * However, the invariant being prototyped is that objects be
+ * identifiable by their hash components alone. We believe this can
+ * be legitimately implemented using 128-hash values for bucket and
+ * object components, together with a cluster-resident cryptographic
+ * key. Since an MD5 or SHA-1 key is 128 bits and the (fast),
+ * non-cryptographic CityHash128 hash algorithm takes a 128-bit seed,
+ * speculatively we could use that for the final hash computations.
+ */
  /* Cache key for a filesystem object: a pair of 64-bit hashes
   * (bucket, object) plus an attribute-encoding version.  Only the
   * two hash components participate in comparison (see the
   * operators below). */
  struct fh_key
  {
    rgw_fh_hk fh_hk {};
    uint32_t version;

    /* fixed XXH64 seed so hashes are stable across processes */
    static constexpr uint64_t seed = 8675309;

    fh_key() : version(0) {}

    /* wrap an already-computed hash pair */
    fh_key(const rgw_fh_hk& _hk)
      : fh_hk(_hk), version(0) {
      // nothing
    }

    /* use caller-supplied bucket and object hashes directly */
    fh_key(const uint64_t bk, const uint64_t ok)
      : version(0) {
      fh_hk.bucket = bk;
      fh_hk.object = ok;
    }

    /* bucket hash supplied; object hash computed over "tenant:object" */
    fh_key(const uint64_t bk, const char *_o, const std::string& _t)
      : version(0) {
      fh_hk.bucket = bk;
      std::string to = _t + ":" + _o;
      fh_hk.object = XXH64(to.c_str(), to.length(), seed);
    }

    /* both hashes computed, each over a tenant-qualified name */
    fh_key(const std::string& _b, const std::string& _o,
	   const std::string& _t /* tenant */)
      : version(0) {
      std::string tb = _t + ":" + _b;
      std::string to = _t + ":" + _o;
      fh_hk.bucket = XXH64(tb.c_str(), tb.length(), seed);
      fh_hk.object = XXH64(to.c_str(), to.length(), seed);
    }

    /* wire format v2: bucket hash, object hash, then a literal 2 in
     * the slot 'decode' reads as 'version' */
    void encode(buffer::list& bl) const {
      ENCODE_START(2, 1, bl);
      encode(fh_hk.bucket, bl);
      encode(fh_hk.object, bl);
      encode((uint32_t)2, bl);
      ENCODE_FINISH(bl);
    }

    void decode(bufferlist::const_iterator& bl) {
      DECODE_START(2, bl);
      decode(fh_hk.bucket, bl);
      decode(fh_hk.object, bl);
      /* v1 encodings carry no version field */
      if (struct_v >= 2) {
	decode(version, bl);
      }
      DECODE_FINISH(bl);
    }

    friend std::ostream& operator<<(std::ostream &os, fh_key const &fhk);

  }; /* fh_key */
+
+ WRITE_CLASS_ENCODER(fh_key);
+
+ inline bool operator<(const fh_key& lhs, const fh_key& rhs)
+ {
+ return ((lhs.fh_hk.bucket < rhs.fh_hk.bucket) ||
+ ((lhs.fh_hk.bucket == rhs.fh_hk.bucket) &&
+ (lhs.fh_hk.object < rhs.fh_hk.object)));
+ }
+
  /* defined in terms of operator< with the arguments swapped */
  inline bool operator>(const fh_key& lhs, const fh_key& rhs)
  {
    return (rhs < lhs);
  }
+
+ inline bool operator==(const fh_key& lhs, const fh_key& rhs)
+ {
+ return ((lhs.fh_hk.bucket == rhs.fh_hk.bucket) &&
+ (lhs.fh_hk.object == rhs.fh_hk.object));
+ }
+
  /* negation of operator== */
  inline bool operator!=(const fh_key& lhs, const fh_key& rhs)
  {
    return !(lhs == rhs);
  }
+
+ inline bool operator<=(const fh_key& lhs, const fh_key& rhs)
+ {
+ return (lhs < rhs) || (lhs == rhs);
+ }
+
+ using boost::variant;
+ using boost::container::flat_map;
+
+ typedef std::tuple<bool, bool> DecodeAttrsResult;
+
  /* In-memory handle for one filesystem object (root, bucket,
   * directory, file, or symlink), cached in RGWLibFS's FHCache and
   * LRU.  Wraps the public struct rgw_file_handle (whose fh_private
   * points back to this object; see get_rgwfh) and caches Unix
   * attributes in 'state'. */
  class RGWFileHandle : public cohort::lru::Object
  {
    struct rgw_file_handle fh; /* client-visible handle; fh_private == this */
    std::mutex mtx;

    RGWLibFS* fs;              /* owning filesystem instance */
    RGWFileHandle* bucket;     /* enclosing bucket handle (nullptr for root) */
    RGWFileHandle* parent;     /* parent directory handle */
    std::atomic_int64_t file_ondisk_version; // version of unix attrs, file only
    /* const */ std::string name; /* XXX file or bucket name */
    /* const */ fh_key fhk;    /* hash key locating this handle in the cache */

    using lock_guard = std::lock_guard<std::mutex>;
    using unique_lock = std::unique_lock<std::mutex>;

    /* TODO: keeping just the last marker is sufficient for
     * nfs-ganesha 2.4.5; in the near future, nfs-ganesha will
     * be able to hint the name of the next dirent required,
     * from which we can directly synthesize a RADOS marker.
     * using marker_cache_t = flat_map<uint64_t, rgw_obj_key>;
     */

    /* cached Unix attributes for this object */
    struct State {
      uint64_t dev;
      uint64_t size;
      uint64_t nlink;
      uint32_t owner_uid; /* XXX need Unix attr */
      uint32_t owner_gid; /* XXX need Unix attr */
      mode_t unix_mode;
      struct timespec ctime;
      struct timespec mtime;
      struct timespec atime;
      uint32_t version;
      State() : dev(0), size(0), nlink(1), owner_uid(0), owner_gid(0), unix_mode(0),
		ctime{0,0}, mtime{0,0}, atime{0,0}, version(0) {}
    } state;

    /* variant payload when this handle is a file: the in-progress
     * write request, if any */
    struct file {
      RGWWriteRequest* write_req;
      file() : write_req(nullptr) {}
      ~file();
    };

    /* variant payload when this handle is a directory: last
     * enumeration marker and timestamp */
    struct directory {

      static constexpr uint32_t FLAG_NONE = 0x0000;

      uint32_t flags;
      rgw_obj_key last_marker;
      struct timespec last_readdir;

      directory() : flags(FLAG_NONE), last_readdir{0,0} {}
    };

    void clear_state();
    void advance_mtime(uint32_t flags = FLAG_NONE);

    boost::variant<file, directory> variant_type;

    uint16_t depth;  /* path depth below the fs root */
    uint32_t flags;

    ceph::buffer::list etag;
    ceph::buffer::list acls;

  public:
    const static std::string root_name;

    static constexpr uint16_t MAX_DEPTH = 256;

    static constexpr uint32_t FLAG_NONE = 0x0000;
    static constexpr uint32_t FLAG_OPEN = 0x0001;
    static constexpr uint32_t FLAG_ROOT = 0x0002;
    static constexpr uint32_t FLAG_CREATE = 0x0004;
    static constexpr uint32_t FLAG_CREATING = 0x0008;
    /* NOTE(review): 0x0009 is not a single bit -- it aliases
     * FLAG_OPEN|FLAG_CREATING; confirm the overlap is intended */
    static constexpr uint32_t FLAG_SYMBOLIC_LINK = 0x0009;
    static constexpr uint32_t FLAG_DIRECTORY = 0x0010;
    static constexpr uint32_t FLAG_BUCKET = 0x0020;
    static constexpr uint32_t FLAG_LOCK = 0x0040;
    static constexpr uint32_t FLAG_DELETED = 0x0080;
    static constexpr uint32_t FLAG_UNLINK_THIS = 0x0100;
    static constexpr uint32_t FLAG_LOCKED = 0x0200;
    static constexpr uint32_t FLAG_STATELESS_OPEN = 0x0400;
    static constexpr uint32_t FLAG_EXACT_MATCH = 0x0800;
    static constexpr uint32_t FLAG_MOUNT = 0x1000;
    static constexpr uint32_t FLAG_IN_CB = 0x2000;

/* strip CREATE/LOCK bits from a flags word */
#define CREATE_FLAGS(x) \
  ((x) & ~(RGWFileHandle::FLAG_CREATE|RGWFileHandle::FLAG_LOCK))

    /* setattr bits handled via the readdir callback path */
    static constexpr uint32_t RCB_MASK = \
      RGW_SETATTR_MTIME|RGW_SETATTR_CTIME|RGW_SETATTR_ATIME|RGW_SETATTR_SIZE;

    friend class RGWLibFS;

  private:
    /* root-handle constructor, used only by RGWLibFS */
    explicit RGWFileHandle(RGWLibFS* _fs)
      : fs(_fs), bucket(nullptr), parent(nullptr), file_ondisk_version(-1),
	variant_type{directory()}, depth(0), flags(FLAG_NONE)
    {
      fh.fh_hk.bucket = 0;
      fh.fh_hk.object = 0;
      /* root */
      fh.fh_type = RGW_FS_TYPE_DIRECTORY;
      variant_type = directory();
      /* stat */
      state.unix_mode = RGW_RWXMODE|S_IFDIR;
      /* pointer to self */
      fh.fh_private = this;
    }

    /* derive a stable device id from the uid string */
    uint64_t init_fsid(std::string& uid) {
      return XXH64(uid.c_str(), uid.length(), fh_key::seed);
    }

    /* finish setting up the root handle: hash keys, fsid, and
     * root-vs-bucket-mount flags */
    void init_rootfs(std::string& fsid, const std::string& object_name,
		     bool is_bucket) {
      /* fh_key */
      fh.fh_hk.bucket = XXH64(fsid.c_str(), fsid.length(), fh_key::seed);
      fh.fh_hk.object = XXH64(object_name.c_str(), object_name.length(),
			      fh_key::seed);
      fhk = fh.fh_hk;
      name = object_name;

      state.dev = init_fsid(fsid);

      if (is_bucket) {
	flags |= RGWFileHandle::FLAG_BUCKET | RGWFileHandle::FLAG_MOUNT;
	bucket = this;
	depth = 1;
      } else {
	flags |= RGWFileHandle::FLAG_ROOT | RGWFileHandle::FLAG_MOUNT;
      }
    }

    /* serialize cached Unix attrs; wire format v3 appends
     * file_ondisk_version after a literal 2 in the 'version' slot */
    void encode(buffer::list& bl) const {
      ENCODE_START(3, 1, bl);
      encode(uint32_t(fh.fh_type), bl);
      encode(state.dev, bl);
      encode(state.size, bl);
      encode(state.nlink, bl);
      encode(state.owner_uid, bl);
      encode(state.owner_gid, bl);
      encode(state.unix_mode, bl);
      for (const auto& t : { state.ctime, state.mtime, state.atime }) {
	encode(real_clock::from_timespec(t), bl);
      }
      encode((uint32_t)2, bl);
      encode(file_ondisk_version.load(), bl);
      ENCODE_FINISH(bl);
    }

    //XXX: RGWFileHandle::decode method can only be called from
    //  RGWFileHandle::decode_attrs, otherwise the file_ondisk_version
    //  fied would be contaminated
    void decode(bufferlist::const_iterator& bl) {
      DECODE_START(3, bl);
      uint32_t fh_type;
      decode(fh_type, bl);
      /* only a stored symlink type may override the in-memory type */
      if ((fh.fh_type != fh_type) &&
	  (fh_type == RGW_FS_TYPE_SYMBOLIC_LINK))
	fh.fh_type = RGW_FS_TYPE_SYMBOLIC_LINK;
      decode(state.dev, bl);
      decode(state.size, bl);
      decode(state.nlink, bl);
      decode(state.owner_uid, bl);
      decode(state.owner_gid, bl);
      decode(state.unix_mode, bl);
      ceph::real_time enc_time;
      for (auto t : { &(state.ctime), &(state.mtime), &(state.atime) }) {
	decode(enc_time, bl);
	*t = real_clock::to_timespec(enc_time);
      }
      if (struct_v >= 2) {
	decode(state.version, bl);
      }
      if (struct_v >= 3) {
	int64_t fov;
	decode(fov, bl);
	file_ondisk_version = fov;
      }
      DECODE_FINISH(bl);
    }

    friend void encode(const RGWFileHandle& c, ::ceph::buffer::list &bl, uint64_t features);
    friend void decode(RGWFileHandle &c, ::ceph::bufferlist::const_iterator &p);
  public:
    /* general constructor; NOTE: moves from _name even though it is
     * taken by lvalue reference, and derives type/mode/depth from
     * _parent and _flags */
    RGWFileHandle(RGWLibFS* _fs, RGWFileHandle* _parent,
		  const fh_key& _fhk, std::string& _name, uint32_t _flags)
      : fs(_fs), bucket(nullptr), parent(_parent), file_ondisk_version(-1),
	name(std::move(_name)), fhk(_fhk), flags(_flags) {

      if (parent->is_root()) {
	/* direct children of root are buckets */
	fh.fh_type = RGW_FS_TYPE_DIRECTORY;
	variant_type = directory();
	flags |= FLAG_BUCKET;
      } else {
	bucket = parent->is_bucket() ? parent
	  : parent->bucket;
	if (flags & FLAG_DIRECTORY) {
	  fh.fh_type = RGW_FS_TYPE_DIRECTORY;
	  variant_type = directory();
	} else if(flags & FLAG_SYMBOLIC_LINK) {
	  fh.fh_type = RGW_FS_TYPE_SYMBOLIC_LINK;
	  variant_type = file();
	} else {
	  fh.fh_type = RGW_FS_TYPE_FILE;
	  variant_type = file();
	}
      }

      depth = parent->depth + 1;

      /* save constant fhk */
      fh.fh_hk = fhk.fh_hk; /* XXX redundant in fh_hk */

      /* inherits parent's fsid */
      state.dev = parent->state.dev;

      switch (fh.fh_type) {
      case RGW_FS_TYPE_DIRECTORY:
	state.unix_mode = RGW_RWXMODE|S_IFDIR;
	/* virtual directories are always invalid */
	advance_mtime();
	break;
      case RGW_FS_TYPE_FILE:
	state.unix_mode = RGW_RWMODE|S_IFREG;
	break;
      case RGW_FS_TYPE_SYMBOLIC_LINK:
	state.unix_mode = RGW_RWMODE|S_IFLNK;
	break;
      default:
	break;
      }

      /* pointer to self */
      fh.fh_private = this;
    }

    const std::string& get_name() const {
      return name;
    }

    const fh_key& get_key() const {
      return fhk;
    }

    /* nullptr when this handle is not a directory */
    directory* get_directory() {
      return boost::get<directory>(&variant_type);
    }

    size_t get_size() const { return state.size; }

    /* short type tag for logging */
    const char* stype() {
      return is_dir() ? "DIR" : "FILE";
    }

    uint16_t get_depth() const { return depth; }

    struct rgw_file_handle* get_fh() { return &fh; }

    RGWLibFS* get_fs() { return fs; }

    RGWFileHandle* get_parent() { return parent; }

    uint32_t get_owner_uid() const { return state.owner_uid; }
    uint32_t get_owner_gid() const { return state.owner_gid; }

    struct timespec get_ctime() const { return state.ctime; }
    struct timespec get_mtime() const { return state.mtime; }

    const ceph::buffer::list& get_etag() const { return etag; }
    const ceph::buffer::list& get_acls() const { return acls; }

    /* apply the masked fields of *st to the cached attributes; the
     * file-type bits of unix_mode always follow fh_type */
    void create_stat(struct stat* st, uint32_t mask) {
      if (mask & RGW_SETATTR_UID)
	state.owner_uid = st->st_uid;

      if (mask & RGW_SETATTR_GID)
	state.owner_gid = st->st_gid;

      if (mask & RGW_SETATTR_MODE)  {
	switch (fh.fh_type) {
	case RGW_FS_TYPE_DIRECTORY:
	  state.unix_mode = st->st_mode|S_IFDIR;
	  break;
	case RGW_FS_TYPE_FILE:
	  state.unix_mode = st->st_mode|S_IFREG;
	  break;
	case RGW_FS_TYPE_SYMBOLIC_LINK:
	  state.unix_mode = st->st_mode|S_IFLNK;
	  break;
	default:
	  break;
	}
      }

      if (mask & RGW_SETATTR_ATIME)
	state.atime = st->st_atim;

      if (mask & RGW_SETATTR_MTIME) {
	/* directory mtime is managed via advance_mtime */
	if (fh.fh_type != RGW_FS_TYPE_DIRECTORY)
	  state.mtime = st->st_mtim;
      }

      if (mask & RGW_SETATTR_CTIME)
	state.ctime = st->st_ctim;
    }

    /* fill *st from the cached attributes; always returns 0 */
    int stat(struct stat* st, uint32_t flags = FLAG_NONE) {
      /* partial Unix attrs */
      /* FIPS zeroization audit 20191115: this memset is not security
       * related. */
      memset(st, 0, sizeof(struct stat));
      st->st_dev = state.dev;
      st->st_ino = fh.fh_hk.object; // XXX

      st->st_uid = state.owner_uid;
      st->st_gid = state.owner_gid;

      st->st_mode = state.unix_mode;

      switch (fh.fh_type) {
      case RGW_FS_TYPE_DIRECTORY:
	/* virtual directories are always invalid */
	advance_mtime(flags);
	st->st_nlink = state.nlink;
	break;
      case RGW_FS_TYPE_FILE:
	st->st_nlink = 1;
	st->st_blksize = 4096;
	st->st_size = state.size;
	st->st_blocks = (state.size) / 512;
	break;
      case RGW_FS_TYPE_SYMBOLIC_LINK:
	st->st_nlink = 1;
	st->st_blksize = 4096;
	st->st_size = state.size;
	st->st_blocks = (state.size) / 512;
	break;
      default:
	break;
      }

#ifdef HAVE_STAT_ST_MTIMESPEC_TV_NSEC
      st->st_atimespec = state.atime;
      st->st_mtimespec = state.mtime;
      st->st_ctimespec = state.ctime;
#else
      st->st_atim = state.atime;
      st->st_mtim = state.mtime;
      st->st_ctim = state.ctime;
#endif

      return 0;
    }

    /* name of the bucket containing this object (root_name for the
     * root handle itself) */
    const std::string& bucket_name() const {
      if (is_root())
	return root_name;
      if (is_bucket())
	return name;
      return bucket->object_name();
    }

    const std::string& object_name() const { return name; }

    /* join ancestor names from the root (or bucket, when omitted)
     * down to this object into a '/'-separated path */
    std::string full_object_name(bool omit_bucket = false) const {
      std::string path;
      std::vector<const std::string*> segments;
      int reserve = 0;
      const RGWFileHandle* tfh = this;
      while (tfh && !tfh->is_root() && !(tfh->is_bucket() && omit_bucket)) {
	segments.push_back(&tfh->object_name());
	reserve += (1 + tfh->object_name().length());
	tfh = tfh->parent;
      }
      int pos = 1;
      path.reserve(reserve);
      for (auto& s : boost::adaptors::reverse(segments)) {
	if (pos > 1) {
	  path += "/";
	} else {
	  if (!omit_bucket &&
	      ((path.length() == 0) || (path.front() != '/')))
	    path += "/";
	}
	path += *s;
	++pos;
      }
      return path;
    }

    /* path relative to the bucket */
    inline std::string relative_object_name() const {
      return full_object_name(true /* omit_bucket */);
    }

    /* as relative_object_name, with a trailing '/' for directories */
    inline std::string relative_object_name2() {
      std::string rname = full_object_name(true /* omit_bucket */);
      if (is_dir()) {
	rname += "/";
      }
      return rname;
    }

    /* object key for a named child of this directory */
    inline std::string format_child_name(const std::string& cbasename,
					 bool is_dir) const {
      std::string child_name{relative_object_name()};
      if ((child_name.size() > 0) &&
	  (child_name.back() != '/'))
	child_name += "/";
      child_name += cbasename;
      if (is_dir)
	child_name += "/";
      return child_name;
    }

    /* full key (including bucket path) for a named child */
    inline std::string make_key_name(const char *name) const {
      std::string key_name{full_object_name()};
      if (key_name.length() > 0)
	key_name += "/";
      key_name += name;
      return key_name;
    }

    fh_key make_fhk(const std::string& name);

    /* record the latest enumeration marker for this directory;
     * off/obj_type are currently unused (only the last marker is
     * retained -- see the TODO near the top of the class) */
    void add_marker(uint64_t off, const rgw_obj_key& marker,
		    uint8_t obj_type) {
      using std::get;
      directory* d = get<directory>(&variant_type);
      if (d) {
	unique_lock guard(mtx);
	d->last_marker = marker;
      }
    }

    /* last recorded marker, for any nonzero offset; nullptr at the
     * start of a listing or for non-directories */
    const rgw_obj_key* find_marker(uint64_t off) const {
      using std::get;
      if (off > 0) {
	const directory* d = get<directory>(&variant_type);
	if (d ) {
	  return &d->last_marker;
	}
      }
      return nullptr;
    }

    /* synthesize a stable dirent offset as the XXH64 of the name */
    int offset_of(const std::string& name, int64_t *offset, uint32_t flags) {
      if (unlikely(! is_dir())) {
	return -EINVAL;
      }
      *offset = XXH64(name.c_str(), name.length(), fh_key::seed);
      return 0;
    }

    bool is_open() const { return flags & FLAG_OPEN; }
    bool is_root() const { return flags & FLAG_ROOT; }
    bool is_mount() const { return flags & FLAG_MOUNT; }
    bool is_bucket() const { return flags & FLAG_BUCKET; }
    bool is_object() const { return !is_bucket(); }
    bool is_file() const { return (fh.fh_type == RGW_FS_TYPE_FILE); }
    bool is_dir() const { return (fh.fh_type == RGW_FS_TYPE_DIRECTORY); }
    bool is_link() const { return (fh.fh_type == RGW_FS_TYPE_SYMBOLIC_LINK); }
    bool creating() const { return flags & FLAG_CREATING; }
    bool deleted() const { return flags & FLAG_DELETED; }
    bool stateless_open() const { return flags & FLAG_STATELESS_OPEN; }
    bool has_children() const;

    /* mark the handle open; only a single open is permitted (-EPERM
     * if already open).  RGW_OPEN_FLAG_V3 marks a stateless (NFS3)
     * open. */
    int open(uint32_t gsh_flags) {
      lock_guard guard(mtx);
      if (! is_open()) {
	if (gsh_flags & RGW_OPEN_FLAG_V3) {
	  flags |= FLAG_STATELESS_OPEN;
	}
	flags |= FLAG_OPEN;
	return 0;
      }
      return -EPERM;
    }

    /* enumeration position: numeric offset (readdir) or dirent name
     * (readdir2) */
    typedef boost::variant<uint64_t*, const char*> readdir_offset;

    int readdir(rgw_readdir_cb rcb, void *cb_arg, readdir_offset offset,
		bool *eof, uint32_t flags);

    int write(uint64_t off, size_t len, size_t *nbytes, void *buffer);

    int commit(uint64_t offset, uint64_t length, uint32_t flags) {
      /* NFS3 and NFSv4 COMMIT implementation
       * the current atomic update strategy doesn't actually permit
       * clients to read-stable until either CLOSE (NFSv4+) or the
       * expiration of the active write timer (NFS3).  In the
       * interim, the client may send an arbitrary number of COMMIT
       * operations which must return a success result */
      return 0;
    }

    int write_finish(uint32_t flags = FLAG_NONE);
    int close();

    void open_for_create() {
      lock_guard guard(mtx);
      flags |= FLAG_CREATING;
    }

    void clear_creating() {
      lock_guard guard(mtx);
      flags &= ~FLAG_CREATING;
    }

    void inc_nlink(const uint64_t n) {
      state.nlink += n;
    }

    void set_nlink(const uint64_t n) {
      state.nlink = n;
    }

    void set_size(const size_t size) {
      state.size = size;
    }

    /* set ctime, and mirror it into mtime and atime */
    void set_times(const struct timespec &ts) {
      state.ctime = ts;
      state.mtime = state.ctime;
      state.atime = state.ctime;
    }

    void set_times(real_time t) {
      set_times(real_clock::to_timespec(t));
    }

    void set_ctime(const struct timespec &ts) {
      state.ctime = ts;
    }

    void set_mtime(const struct timespec &ts) {
      state.mtime = ts;
    }

    void set_atime(const struct timespec &ts) {
      state.atime = ts;
    }

    void set_etag(const ceph::buffer::list& _etag ) {
      etag = _etag;
    }

    void set_acls(const ceph::buffer::list& _acls ) {
      acls = _acls;
    }

    void encode_attrs(ceph::buffer::list& ux_key1,
		      ceph::buffer::list& ux_attrs1,
		      bool inc_ov = true);

    DecodeAttrsResult decode_attrs(const ceph::buffer::list* ux_key1,
				   const ceph::buffer::list* ux_attrs1);

    void invalidate();

    bool reclaim(const cohort::lru::ObjectFactory* newobj_fac) override;

    typedef cohort::lru::LRU<std::mutex> FhLRU;

    /* strict ordering for the FHCache tree, keyed by fh_key */
    struct FhLT
    {
      // for internal ordering
      bool operator()(const RGWFileHandle& lhs, const RGWFileHandle& rhs) const
	{ return (lhs.get_key() < rhs.get_key()); }

      // for external search by fh_key
      bool operator()(const fh_key& k, const RGWFileHandle& fh) const
	{ return k < fh.get_key(); }

      bool operator()(const RGWFileHandle& fh, const fh_key& k) const
	{ return fh.get_key() < k; }
    };

    /* equality for the FHCache tree, keyed by fh_key */
    struct FhEQ
    {
      bool operator()(const RGWFileHandle& lhs, const RGWFileHandle& rhs) const
	{ return (lhs.get_key() == rhs.get_key()); }

      bool operator()(const fh_key& k, const RGWFileHandle& fh) const
	{ return k == fh.get_key(); }

      bool operator()(const RGWFileHandle& fh, const fh_key& k) const
	{ return fh.get_key() == k; }
    };

    typedef bi::link_mode<bi::safe_link> link_mode; /* XXX normal */
#if defined(FHCACHE_AVL)
    typedef bi::avl_set_member_hook<link_mode> tree_hook_type;
#else
    /* RBT */
    typedef bi::set_member_hook<link_mode> tree_hook_type;
#endif
    tree_hook_type fh_hook;

    typedef bi::member_hook<
      RGWFileHandle, tree_hook_type, &RGWFileHandle::fh_hook> FhHook;

#if defined(FHCACHE_AVL)
    /* NOTE(review): this branch names the type FHTree, but FHCache
     * below references FhTree -- confirm the AVL configuration still
     * builds */
    typedef bi::avltree<RGWFileHandle, bi::compare<FhLT>, FhHook> FHTree;
#else
    typedef bi::rbtree<RGWFileHandle, bi::compare<FhLT>, FhHook> FhTree;
#endif
    typedef cohort::lru::TreeX<RGWFileHandle, FhTree, FhLT, FhEQ, fh_key,
			       std::mutex> FHCache;

    ~RGWFileHandle() override;

    friend std::ostream& operator<<(std::ostream &os,
				    RGWFileHandle const &rgw_fh);

    /* LRU factory: constructs (or placement-reconstructs, on
     * recycle) handles from captured constructor arguments */
    class Factory : public cohort::lru::ObjectFactory
    {
    public:
      RGWLibFS* fs;
      RGWFileHandle* parent;
      const fh_key& fhk;
      std::string& name;
      uint32_t flags;

      Factory() = delete;

      Factory(RGWLibFS* _fs, RGWFileHandle* _parent,
	      const fh_key& _fhk, std::string& _name, uint32_t _flags)
	: fs(_fs), parent(_parent), fhk(_fhk), name(_name),
	  flags(_flags) {}

      void recycle (cohort::lru::Object* o) override {
	/* re-use an existing object */
	o->~Object(); // call lru::Object virtual dtor
	// placement new!
	new (o) RGWFileHandle(fs, parent, fhk, name, flags);
      }

      cohort::lru::Object* alloc() override {
	return new RGWFileHandle(fs, parent, fhk, name, flags);
      }
    }; /* Factory */

  }; /* RGWFileHandle */
+
+ WRITE_CLASS_ENCODER(RGWFileHandle);
+
  /* recover the RGWFileHandle stashed in a public rgw_file_handle's
   * fh_private */
  inline RGWFileHandle* get_rgwfh(struct rgw_file_handle* fh) {
    return static_cast<RGWFileHandle*>(fh->fh_private);
  }
+
+ inline enum rgw_fh_type fh_type_of(uint32_t flags) {
+ enum rgw_fh_type fh_type;
+ switch(flags & RGW_LOOKUP_TYPE_FLAGS)
+ {
+ case RGW_LOOKUP_FLAG_DIR:
+ fh_type = RGW_FS_TYPE_DIRECTORY;
+ break;
+ case RGW_LOOKUP_FLAG_FILE:
+ fh_type = RGW_FS_TYPE_FILE;
+ break;
+ default:
+ fh_type = RGW_FS_TYPE_NIL;
+ };
+ return fh_type;
+ }
+
+ typedef std::tuple<RGWFileHandle*, uint32_t> LookupFHResult;
+ typedef std::tuple<RGWFileHandle*, int> MkObjResult;
+
+ class RGWLibFS
+ {
+ CephContext* cct;
+ struct rgw_fs fs{};
+ RGWFileHandle root_fh;
+ rgw_fh_callback_t invalidate_cb;
+ void *invalidate_arg;
+ bool shutdown;
+
+ mutable std::atomic<uint64_t> refcnt;
+
+ RGWFileHandle::FHCache fh_cache;
+ RGWFileHandle::FhLRU fh_lru;
+
+ std::string uid; // should match user.user_id, iiuc
+
+ std::unique_ptr<rgw::sal::User> user;
+ RGWAccessKey key; // XXXX acc_key
+
+ static std::atomic<uint32_t> fs_inst_counter;
+
+ static uint32_t write_completion_interval_s;
+
+ using lock_guard = std::lock_guard<std::mutex>;
+ using unique_lock = std::unique_lock<std::mutex>;
+
+ struct event
+ {
+ enum class type : uint8_t { READDIR } ;
+ type t;
+ const fh_key fhk;
+ struct timespec ts;
+ event(type t, const fh_key& k, const struct timespec& ts)
+ : t(t), fhk(k), ts(ts) {}
+ };
+
+ friend std::ostream& operator<<(std::ostream &os,
+ RGWLibFS::event const &ev);
+
+ using event_vector = /* boost::small_vector<event, 16> */
+ std::vector<event>;
+
+ struct WriteCompletion
+ {
+ RGWFileHandle& rgw_fh;
+
+ explicit WriteCompletion(RGWFileHandle& _fh) : rgw_fh(_fh) {
+ rgw_fh.get_fs()->ref(&rgw_fh);
+ }
+
+ void operator()() {
+ rgw_fh.close(); /* will finish in-progress write */
+ rgw_fh.get_fs()->unref(&rgw_fh);
+ }
+ };
+
+ static ceph::timer<ceph::mono_clock> write_timer;
+
+ struct State {
+ std::mutex mtx;
+ std::atomic<uint32_t> flags;
+ std::deque<event> events;
+
+ State() : flags(0) {}
+
+ void push_event(const event& ev) {
+ events.push_back(ev);
+ }
+ } state;
+
+ uint32_t new_inst() {
+ return ++fs_inst_counter;
+ }
+
+ friend class RGWFileHandle;
+ friend class RGWLibProcess;
+
+ public:
+
+ static constexpr uint32_t FLAG_NONE = 0x0000;
+ static constexpr uint32_t FLAG_CLOSED = 0x0001;
+
+ struct BucketStats {
+ size_t size;
+ size_t size_rounded;
+ real_time creation_time;
+ uint64_t num_entries;
+ };
+
+ /* Construct a library filesystem instance for one mount.
+ * _uid names the mount owner (stored in this->uid);
+ * _user_id/_key form the S3 access-key pair later checked by
+ * authorize(). A root path of NULL or "/" mounts the whole
+ * namespace; any other path mounts a single bucket subtree. */
+ RGWLibFS(CephContext* _cct, const char *_uid, const char *_user_id,
+ const char* _key, const char *root)
+ : cct(_cct), root_fh(this), invalidate_cb(nullptr),
+ invalidate_arg(nullptr), shutdown(false), refcnt(1),
+ fh_cache(cct->_conf->rgw_nfs_fhcache_partitions,
+ cct->_conf->rgw_nfs_fhcache_size),
+ fh_lru(cct->_conf->rgw_nfs_lru_lanes,
+ cct->_conf->rgw_nfs_lru_lane_hiwat),
+ uid(_uid), key(_user_id, _key) {
+
+ if (!root || !strcmp(root, "/")) {
+ root_fh.init_rootfs(uid, RGWFileHandle::root_name, false);
+ } else {
+ root_fh.init_rootfs(uid, root, true);
+ }
+
+ /* pointer to self */
+ fs.fs_private = this;
+
+ /* expose public root fh */
+ fs.root_fh = root_fh.get_fh();
+
+ /* bump the global instance counter (return value unused here) */
+ new_inst();
+ }
+
+ /* boost::intrusive_ptr support: relaxed increment is sufficient
+ * for taking a reference */
+ friend void intrusive_ptr_add_ref(const RGWLibFS* fs) {
+ fs->refcnt.fetch_add(1, std::memory_order_relaxed);
+ }
+
+ /* NOTE(review): fetch_sub returns the PRIOR value, so deletion
+ * fires on the release that drives the count below zero (prior
+ * value 0), not the canonical "== 1" idiom -- i.e. the count is
+ * biased by one relative to the ctor's refcnt(1). This looks
+ * deliberate (one "free" release); confirm against callers
+ * before changing. The release/acquire fence pairing publishes
+ * all prior writes to the deleting thread. */
+ friend void intrusive_ptr_release(const RGWLibFS* fs) {
+ if (fs->refcnt.fetch_sub(1, std::memory_order_release) == 0) {
+ std::atomic_thread_fence(std::memory_order_acquire);
+ delete fs;
+ }
+ }
+
+ /* take an fs reference and return self (convenience wrapper over
+ * the intrusive_ptr hook) */
+ RGWLibFS* ref() {
+ intrusive_ptr_add_ref(this);
+ return this;
+ }
+
+ /* drop an fs reference (may delete this) */
+ inline void rele() {
+ intrusive_ptr_release(this);
+ }
+
+ /* request shutdown; read by worker paths via the shutdown flag */
+ void stop() { shutdown = true; }
+
+ /* evict a handle: remove it from the FH cache (dropping the
+ * cache's sentinel ref) and then drop the caller's LRU ref */
+ void release_evict(RGWFileHandle* fh) {
+ /* remove from cache, releases sentinel ref */
+ fh_cache.remove(fh->fh.fh_hk.object, fh,
+ RGWFileHandle::FHCache::FLAG_LOCK);
+ /* release call-path ref */
+ (void) fh_lru.unref(fh, cohort::lru::FLAG_NONE);
+ }
+
+ /* Authenticate the mount credentials. First tries a local RGW
+ * user lookup by access-key id; on a hit the secret must match
+ * and the user must not be suspended. On a miss, falls back to
+ * external (LDAP) auth using the base64 RGWToken packed into the
+ * access-key id, storing a local user record on first use.
+ * Returns 0 on success, negative error code otherwise. */
+ int authorize(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver) {
+ int ret = driver->get_user_by_access_key(dpp, key.id, null_yield, &user);
+ if (ret == 0) {
+ RGWAccessKey* k = user->get_info().get_key(key.id);
+ if (!k || (k->key != key.key))
+ return -EINVAL;
+ if (user->get_info().suspended)
+ return -ERR_USER_SUSPENDED;
+ } else {
+ /* try external authenticators (ldap for now) */
+ rgw::LDAPHelper* ldh = g_rgwlib->get_ldh(); /* !nullptr */
+ RGWToken token;
+ /* boost filters and/or string_ref may throw on invalid input */
+ try {
+ token = rgw::from_base64(key.id);
+ } catch(...) {
+ token = std::string("");
+ }
+ if (token.valid() && (ldh->auth(token.id, token.key) == 0)) {
+ /* try to store the user if it doesn't already exist */
+ if (user->load_user(dpp, null_yield) < 0) {
+ /* inner ret intentionally shadows: store failure is logged
+ * but non-fatal (best-effort user creation) */
+ int ret = user->store_user(dpp, null_yield, true);
+ if (ret < 0) {
+ lsubdout(get_context(), rgw, 10)
+ << "NOTICE: failed to store new user's info: ret=" << ret
+ << dendl;
+ }
+ }
+ /* fix: report success for externally-authenticated users --
+ * previously the original lookup error was still returned
+ * here, so a successful LDAP auth could never mount */
+ ret = 0;
+ } /* auth success */
+ }
+ return ret;
+ } /* authorize */
+
+ /* Register the (single) invalidate upcall for this fs; a later
+ * registration simply replaces the earlier one. The flags
+ * argument is currently ignored. Always succeeds. */
+ int register_invalidate(rgw_fh_callback_t cb, void *arg, uint32_t flags) {
+ invalidate_arg = arg;
+ invalidate_cb = cb;
+ return 0;
+ }
+
+ /* Find an existing RGWFileHandle by key. On a hit an initial LRU
+ * ref is taken; if that fails (object being recycled) the probe
+ * retries. Returns {fh|nullptr, flags}; the handle comes back
+ * with its mutex held iff FLAG_LOCK was passed. FLAG_LOCKED
+ * asserts the caller already holds fh->mtx. */
+ LookupFHResult lookup_fh(const fh_key& fhk,
+ const uint32_t flags = RGWFileHandle::FLAG_NONE) {
+ using std::get;
+
+ // cast int32_t(RGWFileHandle::FLAG_NONE) due to strictness of Clang
+ // the cast transfers a lvalue into a rvalue in the ctor
+ // check the commit message for the full details
+ LookupFHResult fhr { nullptr, uint32_t(RGWFileHandle::FLAG_NONE) };
+
+ RGWFileHandle::FHCache::Latch lat;
+ bool fh_locked = flags & RGWFileHandle::FLAG_LOCKED;
+
+ retry:
+ RGWFileHandle* fh =
+ fh_cache.find_latch(fhk.fh_hk.object /* partition selector*/,
+ fhk /* key */, lat /* serializer */,
+ RGWFileHandle::FHCache::FLAG_LOCK);
+ /* LATCHED */
+ if (fh) {
+ if (likely(! fh_locked))
+ fh->mtx.lock(); // XXX !RAII because may-return-LOCKED
+ /* need initial ref from LRU (fast path) */
+ if (! fh_lru.ref(fh, cohort::lru::FLAG_INITIAL)) {
+ /* ref failed: drop latch and handle lock, then re-probe */
+ lat.lock->unlock();
+ if (likely(! fh_locked))
+ fh->mtx.unlock();
+ goto retry; /* !LATCHED */
+ }
+ /* LATCHED, LOCKED */
+ if (! (flags & RGWFileHandle::FLAG_LOCK))
+ fh->mtx.unlock(); /* ! LOCKED */
+ }
+ lat.lock->unlock(); /* !LATCHED */
+ get<0>(fhr) = fh;
+ if (fh) {
+ lsubdout(get_context(), rgw, 17)
+ << __func__ << " 1 " << *fh
+ << dendl;
+ }
+ return fhr;
+ } /* lookup_fh(const fh_key&) */
+
+ /* Find or create an RGWFileHandle for <parent, name>. Returns
+ * {fh|nullptr, out_flags}; FLAG_CREATE is set in out_flags when a
+ * new handle was inserted. A handle marked DELETED causes a short
+ * sleep and retry (awaiting its eviction). Locking contract is
+ * the same as the fh_key overload above. */
+ LookupFHResult lookup_fh(RGWFileHandle* parent, const char *name,
+ const uint32_t flags = RGWFileHandle::FLAG_NONE) {
+ using std::get;
+
+ // cast int32_t(RGWFileHandle::FLAG_NONE) due to strictness of Clang
+ // the cast transfers a lvalue into a rvalue in the ctor
+ // check the commit message for the full details
+ LookupFHResult fhr { nullptr, uint32_t(RGWFileHandle::FLAG_NONE) };
+
+ /* mount is stale? */
+ if (state.flags & FLAG_CLOSED)
+ return fhr;
+
+ RGWFileHandle::FHCache::Latch lat;
+ bool fh_locked = flags & RGWFileHandle::FLAG_LOCKED;
+
+ std::string obj_name{name};
+ std::string key_name{parent->make_key_name(name)};
+ fh_key fhk = parent->make_fhk(obj_name);
+
+ lsubdout(get_context(), rgw, 10)
+ << __func__ << " called on "
+ << parent->object_name() << " for " << key_name
+ << " (" << obj_name << ")"
+ << " -> " << fhk
+ << dendl;
+
+ retry:
+ RGWFileHandle* fh =
+ fh_cache.find_latch(fhk.fh_hk.object /* partition selector*/,
+ fhk /* key */, lat /* serializer */,
+ RGWFileHandle::FHCache::FLAG_LOCK);
+ /* LATCHED */
+ if (fh) {
+ if (likely(! fh_locked))
+ fh->mtx.lock(); // XXX !RAII because may-return-LOCKED
+ if (fh->flags & RGWFileHandle::FLAG_DELETED) {
+ /* for now, delay briefly and retry */
+ lat.lock->unlock();
+ if (likely(! fh_locked))
+ fh->mtx.unlock();
+ std::this_thread::sleep_for(std::chrono::milliseconds(20));
+ goto retry; /* !LATCHED */
+ }
+ /* need initial ref from LRU (fast path) */
+ if (! fh_lru.ref(fh, cohort::lru::FLAG_INITIAL)) {
+ lat.lock->unlock();
+ if (likely(! fh_locked))
+ fh->mtx.unlock();
+ goto retry; /* !LATCHED */
+ }
+ /* LATCHED, LOCKED */
+ if (! (flags & RGWFileHandle::FLAG_LOCK))
+ if (likely(! fh_locked))
+ fh->mtx.unlock(); /* ! LOCKED */
+ } else {
+ /* make or re-use handle */
+ RGWFileHandle::Factory prototype(this, parent, fhk,
+ obj_name, CREATE_FLAGS(flags));
+ uint32_t iflags{cohort::lru::FLAG_INITIAL};
+ fh = static_cast<RGWFileHandle*>(
+ fh_lru.insert(&prototype,
+ cohort::lru::Edge::MRU,
+ iflags));
+ if (fh) {
+ /* lock fh (LATCHED) */
+ if (flags & RGWFileHandle::FLAG_LOCK)
+ fh->mtx.lock();
+ if (likely(! (iflags & cohort::lru::FLAG_RECYCLE))) {
+ /* inserts at cached insert iterator, releasing latch */
+ fh_cache.insert_latched(
+ fh, lat, RGWFileHandle::FHCache::FLAG_UNLOCK);
+ } else {
+ /* recycle step invalidates Latch */
+ fh_cache.insert(
+ fhk.fh_hk.object, fh, RGWFileHandle::FHCache::FLAG_NONE);
+ lat.lock->unlock(); /* !LATCHED */
+ }
+ get<1>(fhr) |= RGWFileHandle::FLAG_CREATE;
+ /* ref parent (non-initial ref cannot fail on valid object) */
+ if (! parent->is_mount()) {
+ (void) fh_lru.ref(parent, cohort::lru::FLAG_NONE);
+ }
+ goto out; /* !LATCHED */
+ } else {
+ /* LRU insert failed; re-probe */
+ lat.lock->unlock();
+ goto retry; /* !LATCHED */
+ }
+ }
+ lat.lock->unlock(); /* !LATCHED */
+ out:
+ get<0>(fhr) = fh;
+ if (fh) {
+ lsubdout(get_context(), rgw, 17)
+ << __func__ << " 2 " << *fh
+ << dendl;
+ }
+ return fhr;
+ } /* lookup_fh(RGWFileHandle*, const char *, const uint32_t) */
+
+ /* drop a call-path LRU ref; no-op for the mount root, which is
+ * not LRU-managed */
+ inline void unref(RGWFileHandle* fh) {
+ if (likely(! fh->is_mount())) {
+ (void) fh_lru.unref(fh, cohort::lru::FLAG_NONE);
+ }
+ }
+
+ /* take a non-initial LRU ref (cannot fail on a valid object);
+ * no-op for the mount root */
+ inline RGWFileHandle* ref(RGWFileHandle* fh) {
+ if (likely(! fh->is_mount())) {
+ fh_lru.ref(fh, cohort::lru::FLAG_NONE);
+ }
+ return fh;
+ }
+
+ int getattr(RGWFileHandle* rgw_fh, struct stat* st);
+
+ int setattr(RGWFileHandle* rgw_fh, struct stat* st, uint32_t mask,
+ uint32_t flags);
+
+ int getxattrs(RGWFileHandle* rgw_fh, rgw_xattrlist* attrs,
+ rgw_getxattr_cb cb, void *cb_arg, uint32_t flags);
+
+ int lsxattrs(RGWFileHandle* rgw_fh, rgw_xattrstr *filter_prefix,
+ rgw_getxattr_cb cb, void *cb_arg, uint32_t flags);
+
+ int setxattrs(RGWFileHandle* rgw_fh, rgw_xattrlist* attrs, uint32_t flags);
+
+ int rmxattrs(RGWFileHandle* rgw_fh, rgw_xattrlist* attrs, uint32_t flags);
+
+ void update_fh(RGWFileHandle *rgw_fh);
+
+ LookupFHResult stat_bucket(RGWFileHandle* parent, const char *path,
+ RGWLibFS::BucketStats& bs,
+ uint32_t flags);
+
+ LookupFHResult fake_leaf(RGWFileHandle* parent, const char *path,
+ enum rgw_fh_type type = RGW_FS_TYPE_NIL,
+ struct stat *st = nullptr, uint32_t mask = 0,
+ uint32_t flags = RGWFileHandle::FLAG_NONE);
+
+ LookupFHResult stat_leaf(RGWFileHandle* parent, const char *path,
+ enum rgw_fh_type type = RGW_FS_TYPE_NIL,
+ uint32_t flags = RGWFileHandle::FLAG_NONE);
+
+ int read(RGWFileHandle* rgw_fh, uint64_t offset, size_t length,
+ size_t* bytes_read, void* buffer, uint32_t flags);
+
+ int readlink(RGWFileHandle* rgw_fh, uint64_t offset, size_t length,
+ size_t* bytes_read, void* buffer, uint32_t flags);
+
+ int rename(RGWFileHandle* old_fh, RGWFileHandle* new_fh,
+ const char *old_name, const char *new_name);
+
+ MkObjResult create(RGWFileHandle* parent, const char *name, struct stat *st,
+ uint32_t mask, uint32_t flags);
+
+ MkObjResult symlink(RGWFileHandle* parent, const char *name,
+ const char *link_path, struct stat *st, uint32_t mask, uint32_t flags);
+
+ MkObjResult mkdir(RGWFileHandle* parent, const char *name, struct stat *st,
+ uint32_t mask, uint32_t flags);
+
+ int unlink(RGWFileHandle* rgw_fh, const char *name,
+ uint32_t flags = FLAG_NONE);
+
+ /* Find an existing RGWFileHandle by its packed wire handle key
+ * (does NOT create). Takes an initial LRU ref on success and
+ * returns the handle unlocked; returns nullptr when the fs is
+ * closed or the handle is unknown. The mount root is resolved
+ * specially since it does not live in the FH cache. */
+ RGWFileHandle* lookup_handle(struct rgw_fh_hk fh_hk) {
+
+ if (state.flags & FLAG_CLOSED)
+ return nullptr;
+
+ RGWFileHandle::FHCache::Latch lat;
+ fh_key fhk(fh_hk);
+
+ retry:
+ RGWFileHandle* fh =
+ fh_cache.find_latch(fhk.fh_hk.object /* partition selector*/,
+ fhk /* key */, lat /* serializer */,
+ RGWFileHandle::FHCache::FLAG_LOCK);
+ /* LATCHED */
+ if (! fh) {
+ if (unlikely(fhk == root_fh.fh.fh_hk)) {
+ /* lookup for root of this fs */
+ fh = &root_fh;
+ goto out;
+ }
+ lsubdout(get_context(), rgw, 0)
+ << __func__ << " handle lookup failed " << fhk
+ << dendl;
+ goto out;
+ }
+ fh->mtx.lock();
+ if (fh->flags & RGWFileHandle::FLAG_DELETED) {
+ /* for now, delay briefly and retry */
+ lat.lock->unlock();
+ fh->mtx.unlock(); /* !LOCKED */
+ std::this_thread::sleep_for(std::chrono::milliseconds(20));
+ goto retry; /* !LATCHED */
+ }
+ if (! fh_lru.ref(fh, cohort::lru::FLAG_INITIAL)) {
+ lat.lock->unlock();
+ fh->mtx.unlock();
+ goto retry; /* !LATCHED */
+ }
+ /* LATCHED */
+ fh->mtx.unlock(); /* !LOCKED */
+ out:
+ lat.lock->unlock(); /* !LATCHED */
+
+ /* special case: lookup root_fh */
+ /* NOTE(review): appears redundant -- the !fh branch above already
+ * resolved fhk == root_fh.fh.fh_hk before reaching out: --
+ * confirm before removing */
+ if (! fh) {
+ if (unlikely(fh_hk == root_fh.fh.fh_hk)) {
+ fh = &root_fh;
+ }
+ }
+
+ return fh;
+ }
+
+ CephContext* get_context() {
+ return cct;
+ }
+
+ /* public librgw fs descriptor (fs_private points back at this) */
+ struct rgw_fs* get_fs() { return &fs; }
+
+ /* root (mount) handle */
+ RGWFileHandle& get_fh() { return root_fh; }
+
+ /* fs id is the device number stamped on the root handle */
+ uint64_t get_fsid() { return root_fh.state.dev; }
+
+ /* NOTE(review): dereferences user unconditionally; only valid
+ * after authorize()/update_user() populated it -- confirm callers */
+ RGWUserInfo* get_user() { return &user->get_info(); }
+
+ /* best-effort refresh of the cached user record (result ignored) */
+ void update_user(const DoutPrefixProvider *dpp) {
+ (void) g_rgwlib->get_driver()->get_user_by_access_key(dpp, key.id, null_yield, &user);
+ }
+
+ void close();
+ void gc();
+ }; /* RGWLibFS */
+
+/* Compose the request path "/<bucket>/<object>" with a single
+ * up-front allocation sized exactly for the result. */
+static inline std::string make_uri(const std::string& bucket_name,
+ const std::string& object_name) {
+ std::string uri;
+ uri.reserve(bucket_name.length() + object_name.length() + 2);
+ uri.push_back('/');
+ uri.append(bucket_name);
+ uri.push_back('/');
+ uri.append(object_name);
+ return uri;
+}
+
+/*
+ read directory content (buckets)
+*/
+
+/* Library-dialect op: lists the mounting user's buckets and hands
+ * each one to the caller's readdir callback as a directory entry. */
+class RGWListBucketsRequest : public RGWLibRequest,
+ public RGWListBuckets /* RGWOp */
+{
+public:
+ RGWFileHandle* rgw_fh; // directory (mount root) being listed
+ RGWFileHandle::readdir_offset offset; // variant: uint64_t* cookie or const char* name marker
+ void* cb_arg; // opaque argument forwarded to rcb
+ rgw_readdir_cb rcb; // per-dirent upcall
+ uint64_t* ioff; // non-null when offset carried a cookie pointer
+ size_t ix; // dirents delivered so far
+ uint32_t d_count; // markers recorded in the traversal cache
+ bool rcb_eof; // caller forced early stop in readdir cycle
+
+ RGWListBucketsRequest(CephContext* _cct, std::unique_ptr<rgw::sal::User> _user,
+ RGWFileHandle* _rgw_fh, rgw_readdir_cb _rcb,
+ void* _cb_arg, RGWFileHandle::readdir_offset& _offset)
+ : RGWLibRequest(_cct, std::move(_user)), rgw_fh(_rgw_fh), offset(_offset),
+ cb_arg(_cb_arg), rcb(_rcb), ioff(nullptr), ix(0), d_count(0),
+ rcb_eof(false) {
+
+ using boost::get;
+
+ /* resume point: a uint64_t* offset is an NFS cookie resolved
+ * through the handle's marker cache; a const char* offset is a
+ * literal name marker */
+ if (unlikely(!! get<uint64_t*>(&offset))) {
+ ioff = get<uint64_t*>(offset);
+ const auto& mk = rgw_fh->find_marker(*ioff);
+ if (mk) {
+ marker = mk->name;
+ }
+ } else {
+ const char* mk = get<const char*>(offset);
+ if (mk) {
+ marker = mk;
+ }
+ }
+ op = this;
+ }
+
+ bool only_bucket() override { return false; }
+
+ int op_init() override {
+ // assign driver, s, and dialect_handler
+ // framework promises to call op_init after parent init
+ RGWOp::init(RGWHandler::driver, get_state(), this);
+ op = this; // assign self as op: REQUIRED
+ return 0;
+ }
+
+ /* synthesize a GET / request (service listing) */
+ int header_init() override {
+ req_state* state = get_state();
+ state->info.method = "GET";
+ state->op = OP_GET;
+
+ /* XXX derp derp derp */
+ state->relative_uri = "/";
+ state->info.request_uri = "/"; // XXX
+ state->info.effective_uri = "/";
+ state->info.request_params = "";
+ state->info.domain = ""; /* XXX ? */
+
+ return 0;
+ }
+
+ int get_params(optional_yield) override {
+ limit = -1; /* no limit */
+ return 0;
+ }
+
+ void send_response_begin(bool has_buckets) override {
+ sent_data = true;
+ }
+
+ /* deliver one dirent per bucket; stops early (rcb_eof) as soon as
+ * the caller's callback refuses more entries */
+ void send_response_data(rgw::sal::BucketList& buckets) override {
+ if (!sent_data)
+ return;
+ auto& m = buckets.get_buckets();
+ for (const auto& iter : m) {
+ std::string_view marker{iter.first};
+ auto& ent = iter.second;
+ if (! this->operator()(ent->get_name(), marker)) {
+ /* caller cannot accept more */
+ lsubdout(cct, rgw, 5) << "ListBuckets rcb failed"
+ << " dirent=" << ent->get_name()
+ << " call count=" << ix
+ << dendl;
+ rcb_eof = true;
+ return;
+ }
+ ++ix;
+ }
+ } /* send_response_data */
+
+ void send_response_end() override {
+ // do nothing
+ }
+
+ /* emit one dirent: derive a stable NFS cookie by hashing the
+ * bucket name, record it in the handle's marker cache, then invoke
+ * the caller's callback (returns false/0 to stop the listing) */
+ int operator()(const std::string_view& name,
+ const std::string_view& marker) {
+ uint64_t off = XXH64(name.data(), name.length(), fh_key::seed);
+ if (!! ioff) {
+ *ioff = off;
+ }
+ /* update traversal cache */
+ rgw_fh->add_marker(off, rgw_obj_key{marker.data(), ""},
+ RGW_FS_TYPE_DIRECTORY);
+ ++d_count;
+ return rcb(name.data(), cb_arg, off, nullptr, 0, RGW_LOOKUP_FLAG_DIR);
+ }
+
+ /* listing complete: RGW did not truncate and the caller did not
+ * stop us early */
+ bool eof() {
+ using boost::get;
+
+ if (unlikely(cct->_conf->subsys.should_gather(ceph_subsys_rgw, 15))) {
+ bool is_offset =
+ unlikely(! get<const char*>(&offset)) ||
+ !! get<const char*>(offset);
+ lsubdout(cct, rgw, 15) << "READDIR offset: " <<
+ ((is_offset) ? offset : "(nil)")
+ << " is_truncated: " << is_truncated
+ << dendl;
+ }
+ return !is_truncated && !rcb_eof;
+ }
+
+}; /* RGWListBucketsRequest */
+
+/*
+ read directory content (bucket objects)
+*/
+
+/* Library-dialect op: lists one bucket "directory" (a prefix with
+ * '/' delimiter) and delivers files (objects) and subdirectories
+ * (common prefixes) to the caller's readdir callback, merged in
+ * name order. */
+class RGWReaddirRequest : public RGWLibRequest,
+ public RGWListBucket /* RGWOp */
+{
+public:
+ RGWFileHandle* rgw_fh; // directory handle being listed
+ RGWFileHandle::readdir_offset offset; // variant: uint64_t* cookie or const char* name marker
+ void* cb_arg; // opaque argument forwarded to rcb
+ rgw_readdir_cb rcb; // per-dirent upcall
+ uint64_t* ioff; // non-null when offset carried a cookie pointer
+ size_t ix; // dirents delivered so far
+ uint32_t d_count; // markers recorded in the traversal cache
+ bool rcb_eof; // caller forced early stop in readdir cycle
+
+ RGWReaddirRequest(CephContext* _cct, std::unique_ptr<rgw::sal::User> _user,
+ RGWFileHandle* _rgw_fh, rgw_readdir_cb _rcb,
+ void* _cb_arg, RGWFileHandle::readdir_offset& _offset)
+ : RGWLibRequest(_cct, std::move(_user)), rgw_fh(_rgw_fh), offset(_offset),
+ cb_arg(_cb_arg), rcb(_rcb), ioff(nullptr), ix(0), d_count(0),
+ rcb_eof(false) {
+
+ using boost::get;
+
+ /* resume point: a cookie is resolved through the marker cache; a
+ * name marker is re-qualified with the directory's own prefix */
+ if (unlikely(!! get<uint64_t*>(&offset))) {
+ ioff = get<uint64_t*>(offset);
+ const auto& mk = rgw_fh->find_marker(*ioff);
+ if (mk) {
+ marker = *mk;
+ }
+ } else {
+ const char* mk = get<const char*>(offset);
+ if (mk) {
+ std::string tmark{rgw_fh->relative_object_name()};
+ if (tmark.length() > 0)
+ tmark += "/";
+ tmark += mk;
+ marker = rgw_obj_key{std::move(tmark), "", ""};
+ }
+ }
+
+ default_max = 1000; // XXX was being omitted
+ op = this;
+ }
+
+ bool only_bucket() override { return true; }
+
+ int op_init() override {
+ // assign driver, s, and dialect_handler
+ // framework promises to call op_init after parent init
+ RGWOp::init(RGWHandler::driver, get_state(), this);
+ op = this; // assign self as op: REQUIRED
+ return 0;
+ }
+
+ /* synthesize GET /<bucket>/ with this directory's prefix and a
+ * '/' delimiter, so RGW folds children into common prefixes */
+ int header_init() override {
+ req_state* state = get_state();
+ state->info.method = "GET";
+ state->op = OP_GET;
+
+ /* XXX derp derp derp */
+ std::string uri = "/" + rgw_fh->bucket_name() + "/";
+ state->relative_uri = uri;
+ state->info.request_uri = uri; // XXX
+ state->info.effective_uri = uri;
+ state->info.request_params = "";
+ state->info.domain = ""; /* XXX ? */
+
+ prefix = rgw_fh->relative_object_name();
+ if (prefix.length() > 0)
+ prefix += "/";
+ delimiter = '/';
+
+ return 0;
+ }
+
+ /* emit one dirent: hash the short name into an NFS cookie, cache
+ * the resume marker, and hand name + synthesized stat (c/m/atime
+ * and size from the bucket index entry) to the caller's callback */
+ int operator()(const std::string_view name, const rgw_obj_key& marker,
+ const ceph::real_time& t, const uint64_t fsz, uint8_t type) {
+
+ assert(name.length() > 0); // all cases handled in callers
+
+ /* hash offset of name in parent (short name) for NFS readdir cookie */
+ uint64_t off = XXH64(name.data(), name.length(), fh_key::seed);
+ if (unlikely(!! ioff)) {
+ *ioff = off;
+ }
+
+ /* update traversal cache */
+ rgw_fh->add_marker(off, marker, type);
+ ++d_count;
+
+ /* set c/mtime and size from bucket index entry */
+ struct stat st = {};
+#ifdef HAVE_STAT_ST_MTIMESPEC_TV_NSEC
+ st.st_atimespec = ceph::real_clock::to_timespec(t);
+ st.st_mtimespec = st.st_atimespec;
+ st.st_ctimespec = st.st_atimespec;
+#else
+ st.st_atim = ceph::real_clock::to_timespec(t);
+ st.st_mtim = st.st_atim;
+ st.st_ctim = st.st_atim;
+#endif
+ st.st_size = fsz;
+
+ return rcb(name.data(), cb_arg, off, &st, RGWFileHandle::RCB_MASK,
+ (type == RGW_FS_TYPE_DIRECTORY) ?
+ RGW_LOOKUP_FLAG_DIR :
+ RGW_LOOKUP_FLAG_FILE);
+ }
+
+ int get_params(optional_yield) override {
+ max = default_max;
+ return 0;
+ }
+
+ /* merge-walk objs and common_prefixes in name order, emitting one
+ * dirent per entry until both are exhausted or the caller stops */
+ void send_response() override {
+ req_state* state = get_state();
+ auto cnow = real_clock::now();
+
+ /* enumerate objs and common_prefixes in parallel,
+ * avoiding increment on and end iterator, which is
+ * undefined */
+
+ /* cursor over the two sorted result sequences; exposes the
+ * "short" (post-last-'/') name of the current entry on each
+ * side so the caller can merge them in order */
+ class DirIterator
+ {
+ std::vector<rgw_bucket_dir_entry>& objs;
+ std::vector<rgw_bucket_dir_entry>::iterator obj_iter;
+
+ std::map<std::string, bool>& common_prefixes;
+ std::map<string, bool>::iterator cp_iter;
+
+ boost::optional<std::string_view> obj_sref;
+ boost::optional<std::string_view> cp_sref;
+ bool _skip_cp;
+
+ public:
+
+ DirIterator(std::vector<rgw_bucket_dir_entry>& objs,
+ std::map<string, bool>& common_prefixes)
+ : objs(objs), common_prefixes(common_prefixes), _skip_cp(false)
+ {
+ obj_iter = objs.begin();
+ parse_obj();
+ cp_iter = common_prefixes.begin();
+ parse_cp();
+ }
+
+ bool is_obj() {
+ return (obj_iter != objs.end());
+ }
+
+ bool is_cp(){
+ return (cp_iter != common_prefixes.end());
+ }
+
+ bool eof() {
+ return ((!is_obj()) && (!is_cp()));
+ }
+
+ /* compute the short name of the current object entry */
+ void parse_obj() {
+ if (is_obj()) {
+ std::string_view sref{obj_iter->key.name};
+ size_t last_del = sref.find_last_of('/');
+ if (last_del != string::npos)
+ sref.remove_prefix(last_del+1);
+ obj_sref = sref;
+ }
+ } /* parse_obj */
+
+ void next_obj() {
+ ++obj_iter;
+ parse_obj();
+ }
+
+ /* compute the short name of the current common prefix; strips a
+ * trailing '/' in place so callers can use a c-string later */
+ void parse_cp() {
+ if (is_cp()) {
+ /* leading-/ skip case */
+ if (cp_iter->first == "/") {
+ _skip_cp = true;
+ return;
+ } else
+ _skip_cp = false;
+
+ /* it's safest to modify the element in place--a suffix-modifying
+ * string_ref operation is problematic since ULP rgw_file callers
+ * will ultimately need a c-string */
+ if (cp_iter->first.back() == '/')
+ const_cast<std::string&>(cp_iter->first).pop_back();
+
+ std::string_view sref{cp_iter->first};
+ size_t last_del = sref.find_last_of('/');
+ if (last_del != string::npos)
+ sref.remove_prefix(last_del+1);
+ cp_sref = sref;
+ } /* is_cp */
+ } /* parse_cp */
+
+ void next_cp() {
+ ++cp_iter;
+ parse_cp();
+ }
+
+ bool skip_cp() {
+ return _skip_cp;
+ }
+
+ /* true when the next entry (in name order) is an object */
+ bool entry_is_obj() {
+ return (is_obj() &&
+ ((! is_cp()) ||
+ (obj_sref.get() < cp_sref.get())));
+ }
+
+ std::string_view get_obj_sref() {
+ return obj_sref.get();
+ }
+
+ std::string_view get_cp_sref() {
+ return cp_sref.get();
+ }
+
+ std::vector<rgw_bucket_dir_entry>::iterator& get_obj_iter() {
+ return obj_iter;
+ }
+
+ std::map<string, bool>::iterator& get_cp_iter() {
+ return cp_iter;
+ }
+
+ }; /* DirIterator */
+
+ DirIterator di{objs, common_prefixes};
+
+ for (;;) {
+
+ if (di.eof()) {
+ break; // done
+ }
+
+ /* assert: one of is_obj() || is_cp() holds */
+ if (di.entry_is_obj()) {
+ auto sref = di.get_obj_sref();
+ if (sref.empty()) {
+ /* recursive list of a leaf dir (iirc), do nothing */
+ } else {
+ /* send a file entry */
+ auto obj_entry = *(di.get_obj_iter());
+
+ lsubdout(cct, rgw, 15) << "RGWReaddirRequest "
+ << __func__ << " "
+ << "list uri=" << state->relative_uri << " "
+ << " prefix=" << prefix << " "
+ << " obj path=" << obj_entry.key.name
+ << " (" << sref << ")" << ""
+ << " mtime="
+ << real_clock::to_time_t(obj_entry.meta.mtime)
+ << " size=" << obj_entry.meta.accounted_size
+ << dendl;
+
+ if (! this->operator()(sref, next_marker, obj_entry.meta.mtime,
+ obj_entry.meta.accounted_size,
+ RGW_FS_TYPE_FILE)) {
+ /* caller cannot accept more */
+ lsubdout(cct, rgw, 5) << "readdir rcb caller signalled stop"
+ << " dirent=" << sref.data()
+ << " call count=" << ix
+ << dendl;
+ rcb_eof = true;
+ return;
+ }
+ }
+ di.next_obj(); // and advance object
+ } else {
+ /* send a dir entry */
+ if (! di.skip_cp()) {
+ auto sref = di.get_cp_sref();
+
+ lsubdout(cct, rgw, 15) << "RGWReaddirRequest "
+ << __func__ << " "
+ << "list uri=" << state->relative_uri << " "
+ << " prefix=" << prefix << " "
+ << " cpref=" << sref
+ << dendl;
+
+ if (sref.empty()) {
+ /* null path segment--could be created in S3 but has no NFS
+ * interpretation */
+ } else {
+ if (! this->operator()(sref, next_marker, cnow, 0,
+ RGW_FS_TYPE_DIRECTORY)) {
+ /* caller cannot accept more */
+ lsubdout(cct, rgw, 5) << "readdir rcb caller signalled stop"
+ << " dirent=" << sref.data()
+ << " call count=" << ix
+ << dendl;
+ rcb_eof = true;
+ return;
+ }
+ }
+ }
+ di.next_cp(); // and advance common_prefixes
+ } /* ! di.entry_is_obj() */
+ } /* for (;;) */
+ }
+
+ virtual void send_versioned_response() {
+ send_response();
+ }
+
+ /* listing complete: RGW did not truncate and the caller did not
+ * stop us early */
+ bool eof() {
+ using boost::get;
+
+ if (unlikely(cct->_conf->subsys.should_gather(ceph_subsys_rgw, 15))) {
+ bool is_offset =
+ unlikely(! get<const char*>(&offset)) ||
+ !! get<const char*>(offset);
+ lsubdout(cct, rgw, 15) << "READDIR offset: " <<
+ ((is_offset) ? offset : "(nil)")
+ << " next marker: " << next_marker
+ << " is_truncated: " << is_truncated
+ << dendl;
+ }
+ return !is_truncated && !rcb_eof;
+ }
+
+}; /* RGWReaddirRequest */
+
+/*
+ dir has-children predicate (bucket objects)
+*/
+
+/* Library-dialect op: probes whether a directory (bucket prefix)
+ * has any children, for rmdir's not-empty check. Lists at most two
+ * keys under the prefix; `valid` records that the listing ran. */
+class RGWRMdirCheck : public RGWLibRequest,
+ public RGWListBucket /* RGWOp */
+{
+public:
+ const RGWFileHandle* rgw_fh;
+ bool valid; // listing executed
+ bool has_children; // at least one child object/prefix exists
+
+ RGWRMdirCheck (CephContext* _cct, std::unique_ptr<rgw::sal::User> _user,
+ const RGWFileHandle* _rgw_fh)
+ : RGWLibRequest(_cct, std::move(_user)), rgw_fh(_rgw_fh), valid(false),
+ has_children(false) {
+ default_max = 2; // two keys suffice: the dir marker plus any child
+ op = this;
+ }
+
+ bool only_bucket() override { return true; }
+
+ int op_init() override {
+ // assign driver, s, and dialect_handler
+ // framework promises to call op_init after parent init
+ RGWOp::init(RGWHandler::driver, get_state(), this);
+ op = this; // assign self as op: REQUIRED
+ return 0;
+ }
+
+ /* synthesize GET /<bucket>/ scoped to this directory's prefix */
+ int header_init() override {
+ req_state* state = get_state();
+ state->info.method = "GET";
+ state->op = OP_GET;
+
+ std::string uri = "/" + rgw_fh->bucket_name() + "/";
+ state->relative_uri = uri;
+ state->info.request_uri = uri;
+ state->info.effective_uri = uri;
+ state->info.request_params = "";
+ state->info.domain = ""; /* XXX ? */
+
+ prefix = rgw_fh->relative_object_name();
+ if (prefix.length() > 0)
+ prefix += "/";
+ delimiter = '/';
+
+ return 0;
+ }
+
+ int get_params(optional_yield) override {
+ max = default_max;
+ return 0;
+ }
+
+ void send_response() override {
+ valid = true;
+ /* children exist if more than one key came back, or the single
+ * key is not the directory's own marker object */
+ if ((objs.size() > 1) ||
+ (! objs.empty() &&
+ (objs.front().key.name != prefix))) {
+ has_children = true;
+ return;
+ }
+ for (auto& iter : common_prefixes) {
+ /* readdir never produces a name for this case */
+ if (iter.first == "/")
+ continue;
+ has_children = true;
+ break;
+ }
+ }
+
+ virtual void send_versioned_response() {
+ send_response();
+ }
+
+}; /* RGWRMdirCheck */
+
+/*
+ create bucket
+*/
+
+/* Drives RGWCreateBucket through the library (non-HTTP) request
+ * path as a synthesized PUT on the bucket. */
+class RGWCreateBucketRequest : public RGWLibRequest,
+ public RGWCreateBucket /* RGWOp */
+{
+public:
+ const std::string& bucket_name;
+
+ RGWCreateBucketRequest(CephContext* _cct, std::unique_ptr<rgw::sal::User> _user,
+ std::string& _bname)
+ : RGWLibRequest(_cct, std::move(_user)), bucket_name(_bname) {
+ op = this;
+ }
+
+ bool only_bucket() override { return false; }
+
+ int read_permissions(RGWOp* op_obj, optional_yield) override {
+ /* we ARE a 'create bucket' request (cf. rgw_rest.cc, ll. 1305-6) */
+ return 0;
+ }
+
+ int op_init() override {
+ /* wire driver, req_state and dialect handler into the op; the
+ * framework guarantees op_init runs after parent init */
+ RGWOp::init(RGWHandler::driver, get_state(), this);
+ op = this; // must self-assign as op
+ return 0;
+ }
+
+ int header_init() override {
+ req_state* s = get_state();
+ s->info.method = "PUT";
+ s->op = OP_PUT;
+
+ /* fake up the request URI triplet from the bucket name */
+ std::string uri("/");
+ uri += bucket_name;
+ s->relative_uri = uri;
+ s->info.request_uri = uri;
+ s->info.effective_uri = uri;
+ s->info.request_params = "";
+ s->info.domain = "";
+
+ return 0;
+ }
+
+ int get_params(optional_yield) override {
+ /* no request headers exist here, so synthesize a canned ACL */
+ req_state* s = get_state();
+ RGWAccessControlPolicy_S3 canned_policy(s->cct);
+ int rc = canned_policy.create_canned(s->owner, s->bucket_owner, s->canned_acl);
+ policy = canned_policy;
+ return rc;
+ }
+
+ void send_response() override {
+ /* TODO: something (maybe) */
+ }
+}; /* RGWCreateBucketRequest */
+
+/*
+ delete bucket
+*/
+
+/* Drives RGWDeleteBucket through the library (non-HTTP) request
+ * path as a synthesized DELETE on the bucket. */
+class RGWDeleteBucketRequest : public RGWLibRequest,
+ public RGWDeleteBucket /* RGWOp */
+{
+public:
+ const std::string& bucket_name;
+
+ RGWDeleteBucketRequest(CephContext* _cct, std::unique_ptr<rgw::sal::User> _user,
+ std::string& _bname)
+ : RGWLibRequest(_cct, std::move(_user)), bucket_name(_bname) {
+ op = this;
+ }
+
+ bool only_bucket() override { return true; }
+
+ int op_init() override {
+ /* wire driver, req_state and dialect handler into the op; the
+ * framework guarantees op_init runs after parent init */
+ RGWOp::init(RGWHandler::driver, get_state(), this);
+ op = this; // must self-assign as op
+ return 0;
+ }
+
+ int header_init() override {
+ req_state* s = get_state();
+ s->info.method = "DELETE";
+ s->op = OP_DELETE;
+
+ /* fake up the request URI triplet from the bucket name */
+ std::string uri("/");
+ uri += bucket_name;
+ s->relative_uri = uri;
+ s->info.request_uri = uri;
+ s->info.effective_uri = uri;
+ s->info.request_params = "";
+ s->info.domain = "";
+
+ return 0;
+ }
+
+ void send_response() override {}
+
+}; /* RGWDeleteBucketRequest */
+
+/*
+ put object
+*/
+/* Library-dialect op: writes one whole object from a caller-supplied
+ * buffer::list via RGWPutObj (synthesized PUT). NOTE: get_data()
+ * moves the caller's buffer -- bl is empty after execution. */
+class RGWPutObjRequest : public RGWLibRequest,
+ public RGWPutObj /* RGWOp */
+{
+public:
+ const std::string& bucket_name;
+ const std::string& obj_name;
+ buffer::list& bl; /* XXX */
+ size_t bytes_written; // total bytes handed to RGWPutObj
+
+ RGWPutObjRequest(CephContext* _cct, std::unique_ptr<rgw::sal::User> _user,
+ const std::string& _bname, const std::string& _oname,
+ buffer::list& _bl)
+ : RGWLibRequest(_cct, std::move(_user)), bucket_name(_bname), obj_name(_oname),
+ bl(_bl), bytes_written(0) {
+ op = this;
+ }
+
+ bool only_bucket() override { return true; }
+
+ int op_init() override {
+ // assign driver, s, and dialect_handler
+ // framework promises to call op_init after parent init
+ RGWOp::init(RGWHandler::driver, get_state(), this);
+ op = this; // assign self as op: REQUIRED
+
+ /* reject names S3 would not accept */
+ int rc = valid_s3_object_name(obj_name);
+ if (rc != 0)
+ return rc;
+
+ return 0;
+ }
+
+ int header_init() override {
+
+ req_state* state = get_state();
+ state->info.method = "PUT";
+ state->op = OP_PUT;
+
+ /* XXX derp derp derp */
+ std::string uri = make_uri(bucket_name, obj_name);
+ state->relative_uri = uri;
+ state->info.request_uri = uri; // XXX
+ state->info.effective_uri = uri;
+ state->info.request_params = "";
+ state->info.domain = ""; /* XXX ? */
+
+ /* XXX required in RGWOp::execute() */
+ state->content_length = bl.length();
+
+ return 0;
+ }
+
+ int get_params(optional_yield) override {
+ req_state* state = get_state();
+ RGWAccessControlPolicy_S3 s3policy(state->cct);
+ /* we don't have (any) headers, so just create canned ACLs */
+ int ret = s3policy.create_canned(state->owner, state->bucket_owner, state->canned_acl);
+ policy = s3policy;
+ return ret;
+ }
+
+ /* single-shot data source: moves the entire caller buffer out and
+ * returns its length (0 on subsequent calls, signalling EOF) */
+ int get_data(buffer::list& _bl) override {
+ /* XXX for now, use sharing semantics */
+ _bl = std::move(bl);
+ uint32_t len = _bl.length();
+ bytes_written += len;
+ return len;
+ }
+
+ void send_response() override {}
+
+ /* enforce the configured single-PUT size cap */
+ int verify_params() override {
+ if (bl.length() > cct->_conf->rgw_max_put_size)
+ return -ERR_TOO_LARGE;
+ return 0;
+ }
+
+ /* fetch a response attribute by name, or nullptr if absent */
+ buffer::list* get_attr(const std::string& k) {
+ auto iter = attrs.find(k);
+ return (iter != attrs.end()) ? &(iter->second) : nullptr;
+ }
+
+}; /* RGWPutObjRequest */
+
+/*
+ get object
+*/
+
+/* Library-dialect op: reads [off, off+len) of one object via
+ * RGWGetObj directly into a caller-supplied buffer (ulp_buffer). */
+class RGWReadRequest : public RGWLibRequest,
+ public RGWGetObj /* RGWOp */
+{
+public:
+ RGWFileHandle* rgw_fh;
+ void *ulp_buffer; // caller's destination buffer
+ size_t nread; // bytes copied so far
+ size_t read_resid; /* initialize to len, <= sizeof(ulp_buffer) */
+ bool do_hexdump = false;
+
+ RGWReadRequest(CephContext* _cct, std::unique_ptr<rgw::sal::User> _user,
+ RGWFileHandle* _rgw_fh, uint64_t off, uint64_t len,
+ void *_ulp_buffer)
+ : RGWLibRequest(_cct, std::move(_user)), rgw_fh(_rgw_fh), ulp_buffer(_ulp_buffer),
+ nread(0), read_resid(len) {
+ op = this;
+
+ /* fixup RGWGetObj (already know range parameters) */
+ RGWGetObj::range_parsed = true;
+ RGWGetObj::get_data = true; // XXX
+ RGWGetObj::partial_content = true;
+ RGWGetObj::ofs = off;
+ RGWGetObj::end = off + len;
+ }
+
+ bool only_bucket() override { return false; }
+
+ int op_init() override {
+ // assign driver, s, and dialect_handler
+ // framework promises to call op_init after parent init
+ RGWOp::init(RGWHandler::driver, get_state(), this);
+ op = this; // assign self as op: REQUIRED
+ return 0;
+ }
+
+ /* synthesize GET /<bucket>/<object> */
+ int header_init() override {
+
+ req_state* state = get_state();
+ state->info.method = "GET";
+ state->op = OP_GET;
+
+ /* XXX derp derp derp */
+ state->relative_uri = make_uri(rgw_fh->bucket_name(),
+ rgw_fh->relative_object_name());
+ state->info.request_uri = state->relative_uri; // XXX
+ state->info.effective_uri = state->relative_uri;
+ state->info.request_params = "";
+ state->info.domain = ""; /* XXX ? */
+
+ return 0;
+ }
+
+ int get_params(optional_yield) override {
+ return 0;
+ }
+
+ /* copy each delivered buffer segment into ulp_buffer, honoring
+ * the initial intra-list offset bl_off and never writing more
+ * than the residual capacity read_resid */
+ int send_response_data(ceph::buffer::list& bl, off_t bl_off,
+ off_t bl_len) override {
+ size_t bytes;
+ for (auto& bp : bl.buffers()) {
+ /* if for some reason bl_off indicates the start-of-data is not at
+ * the current buffer::ptr, skip it and account */
+ if (bl_off > bp.length()) {
+ bl_off -= bp.length();
+ continue;
+ }
+ /* read no more than read_resid */
+ bytes = std::min(read_resid, size_t(bp.length()-bl_off));
+ memcpy(static_cast<char*>(ulp_buffer)+nread, bp.c_str()+bl_off, bytes);
+ read_resid -= bytes; /* reduce read_resid by bytes read */
+ nread += bytes;
+ bl_off = 0;
+ /* stop if we have no residual ulp_buffer */
+ if (! read_resid)
+ break;
+ }
+ return 0;
+ }
+
+ int send_response_data_error(optional_yield) override {
+ /* S3 implementation just sends nothing--there is no side effect
+ * to simulate here */
+ return 0;
+ }
+
+ bool prefetch_data() override { return false; }
+
+}; /* RGWReadRequest */
+
+/*
+ delete object
+*/
+
+/* Drives RGWDeleteObj through the library (non-HTTP) request path
+ * as a synthesized DELETE on a single object. */
+class RGWDeleteObjRequest : public RGWLibRequest,
+ public RGWDeleteObj /* RGWOp */
+{
+public:
+ const std::string& bucket_name;
+ const std::string& obj_name;
+
+ RGWDeleteObjRequest(CephContext* _cct, std::unique_ptr<rgw::sal::User> _user,
+ const std::string& _bname, const std::string& _oname)
+ : RGWLibRequest(_cct, std::move(_user)), bucket_name(_bname), obj_name(_oname) {
+ op = this;
+ }
+
+ bool only_bucket() override { return true; }
+
+ int op_init() override {
+ /* wire driver, req_state and dialect handler into the op; the
+ * framework guarantees op_init runs after parent init */
+ RGWOp::init(RGWHandler::driver, get_state(), this);
+ op = this; // must self-assign as op
+ return 0;
+ }
+
+ int header_init() override {
+ req_state* s = get_state();
+ s->info.method = "DELETE";
+ s->op = OP_DELETE;
+
+ /* fake up the request URI triplet from bucket + object */
+ const std::string uri = make_uri(bucket_name, obj_name);
+ s->relative_uri = uri;
+ s->info.request_uri = uri;
+ s->info.effective_uri = uri;
+ s->info.request_params = "";
+ s->info.domain = "";
+
+ return 0;
+ }
+
+ void send_response() override {}
+
+}; /* RGWDeleteObjRequest */
+
+/* Librgw request that stats an object by driving RGWGetObj with
+ * get_data=false: execute() runs the GET machinery to populate size,
+ * mtime, and attrs, but send_response_data() discards any payload. */
+class RGWStatObjRequest : public RGWLibRequest,
+ public RGWGetObj /* RGWOp */
+{
+public:
+ const std::string& bucket_name;
+ const std::string& obj_name;
+ uint64_t _size; // captured from req_state::obj_size after execute()
+ uint32_t flags;
+
+ static constexpr uint32_t FLAG_NONE = 0x000;
+
+ RGWStatObjRequest(CephContext* _cct, std::unique_ptr<rgw::sal::User> _user,
+ const std::string& _bname, const std::string& _oname,
+ uint32_t _flags)
+ : RGWLibRequest(_cct, std::move(_user)), bucket_name(_bname), obj_name(_oname),
+ _size(0), flags(_flags) {
+ op = this;
+
+ /* fixup RGWGetObj (already know range parameters) */
+ RGWGetObj::range_parsed = true;
+ RGWGetObj::get_data = false; // XXX
+ RGWGetObj::partial_content = true;
+ RGWGetObj::ofs = 0;
+ RGWGetObj::end = UINT64_MAX;
+ }
+
+ const char* name() const override { return "stat_obj"; }
+ RGWOpType get_type() override { return RGW_OP_STAT_OBJ; }
+
+ real_time get_mtime() const {
+ return lastmod;
+ }
+
+ /* attributes */
+ uint64_t get_size() { return _size; }
+ real_time ctime() { return mod_time; } // XXX
+ real_time mtime() { return mod_time; }
+ std::map<string, bufferlist>& get_attrs() { return attrs; }
+
+ // look up a single xattr; nullptr if absent
+ buffer::list* get_attr(const std::string& k) {
+ auto iter = attrs.find(k);
+ return (iter != attrs.end()) ? &(iter->second) : nullptr;
+ }
+
+ bool only_bucket() override { return false; }
+
+ int op_init() override {
+ // assign driver, s, and dialect_handler
+ // framework promises to call op_init after parent init
+ RGWOp::init(RGWHandler::driver, get_state(), this);
+ op = this; // assign self as op: REQUIRED
+ return 0;
+ }
+
+ // Fabricate the request-line state an HTTP front end would have set.
+ int header_init() override {
+
+ req_state* state = get_state();
+ state->info.method = "GET";
+ state->op = OP_GET;
+
+ /* XXX derp derp derp */
+ state->relative_uri = make_uri(bucket_name, obj_name);
+ state->info.request_uri = state->relative_uri; // XXX
+ state->info.effective_uri = state->relative_uri;
+ state->info.request_params = "";
+ state->info.domain = ""; /* XXX ? */
+
+ return 0;
+ }
+
+ int get_params(optional_yield) override {
+ return 0;
+ }
+
+ // stat-only: payload (if any) is intentionally dropped
+ int send_response_data(ceph::buffer::list& _bl, off_t s_off,
+ off_t e_off) override {
+ /* NOP */
+ /* XXX save attrs? */
+ return 0;
+ }
+
+ int send_response_data_error(optional_yield) override {
+ /* NOP */
+ return 0;
+ }
+
+ // run the GET, then capture the object size for get_size()
+ void execute(optional_yield y) override {
+ RGWGetObj::execute(y);
+ _size = get_state()->obj_size;
+ }
+
+}; /* RGWStatObjRequest */
+
+/* Librgw request that stats a bucket; on success send_response()
+ * copies size/count/creation-time into the caller's BucketStats and
+ * steals the bucket xattrs into this->attrs. */
+class RGWStatBucketRequest : public RGWLibRequest,
+ public RGWStatBucket /* RGWOp */
+{
+public:
+ std::string uri;
+ std::map<std::string, buffer::list> attrs;
+ RGWLibFS::BucketStats& bs; // caller-owned output record
+
+ RGWStatBucketRequest(CephContext* _cct, std::unique_ptr<rgw::sal::User> _user,
+ const std::string& _path,
+ RGWLibFS::BucketStats& _stats)
+ : RGWLibRequest(_cct, std::move(_user)), bs(_stats) {
+ uri = "/" + _path;
+ op = this;
+ }
+
+ // look up a single bucket xattr; nullptr if absent
+ buffer::list* get_attr(const std::string& k) {
+ auto iter = attrs.find(k);
+ return (iter != attrs.end()) ? &(iter->second) : nullptr;
+ }
+
+ real_time get_ctime() const {
+ return bucket->get_creation_time();
+ }
+
+ bool only_bucket() override { return false; }
+
+ int op_init() override {
+ // assign driver, s, and dialect_handler
+ // framework promises to call op_init after parent init
+ RGWOp::init(RGWHandler::driver, get_state(), this);
+ op = this; // assign self as op: REQUIRED
+ return 0;
+ }
+
+ // Fabricate the request-line state an HTTP front end would have set.
+ int header_init() override {
+
+ req_state* state = get_state();
+ state->info.method = "GET";
+ state->op = OP_GET;
+
+ /* XXX derp derp derp */
+ state->relative_uri = uri;
+ state->info.request_uri = uri; // XXX
+ state->info.effective_uri = uri;
+ state->info.request_params = "";
+ state->info.domain = ""; /* XXX ? */
+
+ return 0;
+ }
+
+ // NOTE(review): unlike sibling classes this is not the
+ // get_params(optional_yield) override — confirm the base signature
+ // this is meant to match.
+ virtual int get_params() {
+ return 0;
+ }
+
+ void send_response() override {
+ // NOTE(review): assigns through get_creation_time(); presumably it
+ // returns a mutable reference — verify against rgw::sal::Bucket.
+ bucket->get_creation_time() = get_state()->bucket->get_info().creation_time;
+ bs.size = bucket->get_size();
+ bs.size_rounded = bucket->get_size_rounded();
+ bs.creation_time = bucket->get_creation_time();
+ bs.num_entries = bucket->get_count();
+ // take ownership of the xattr map without copying
+ std::swap(attrs, get_state()->bucket_attrs);
+ }
+
+ // true iff the op resolved a (named) bucket
+ bool matched() {
+ return (bucket->get_name().length() > 0);
+ }
+
+}; /* RGWStatBucketRequest */
+
+/* Librgw request that probes whether `path` under rgw_fh exists as an
+ * object ("foo") and/or a common prefix ("foo/"), by issuing a bucket
+ * listing with prefix=<parent>/<path> and delimiter='/'. Results are
+ * reported through the matched/exact_matched/is_dir flags. */
+class RGWStatLeafRequest : public RGWLibRequest,
+ public RGWListBucket /* RGWOp */
+{
+public:
+ RGWFileHandle* rgw_fh; // parent directory handle
+ std::string path; // leaf name being probed
+ bool matched; // some object/prefix under the probe prefix exists
+ bool is_dir; // exact match was a common prefix (directory)
+ bool exact_matched; // exact object or prefix match found
+
+ RGWStatLeafRequest(CephContext* _cct, std::unique_ptr<rgw::sal::User> _user,
+ RGWFileHandle* _rgw_fh, const std::string& _path)
+ : RGWLibRequest(_cct, std::move(_user)), rgw_fh(_rgw_fh), path(_path),
+ matched(false), is_dir(false), exact_matched(false) {
+ default_max = 1000; // logical max {"foo", "foo/"}
+ op = this;
+ }
+
+ bool only_bucket() override { return true; }
+
+ int op_init() override {
+ // assign driver, s, and dialect_handler
+ // framework promises to call op_init after parent init
+ RGWOp::init(RGWHandler::driver, get_state(), this);
+ op = this; // assign self as op: REQUIRED
+ return 0;
+ }
+
+ // Fabricate the request-line state and the listing prefix/delimiter.
+ int header_init() override {
+
+ req_state* state = get_state();
+ state->info.method = "GET";
+ state->op = OP_GET;
+
+ /* XXX derp derp derp */
+ std::string uri = "/" + rgw_fh->bucket_name() + "/";
+ state->relative_uri = uri;
+ state->info.request_uri = uri; // XXX
+ state->info.effective_uri = uri;
+ state->info.request_params = "";
+ state->info.domain = ""; /* XXX ? */
+
+ // prefix = <parent object path>/<leaf name>
+ prefix = rgw_fh->relative_object_name();
+ if (prefix.length() > 0)
+ prefix += "/";
+ prefix += path;
+ delimiter = '/';
+
+ return 0;
+ }
+
+ int get_params(optional_yield) override {
+ max = default_max;
+ return 0;
+ }
+
+ // Scan listing results, setting the match flags; returns early on the
+ // first exact match.
+ void send_response() override {
+ req_state* state = get_state();
+ // try objects
+ for (const auto& iter : objs) {
+ auto& name = iter.key.name;
+ lsubdout(cct, rgw, 15) << "RGWStatLeafRequest "
+ << __func__ << " "
+ << "list uri=" << state->relative_uri << " "
+ << " prefix=" << prefix << " "
+ << " obj path=" << name << ""
+ << " target = " << path << ""
+ << dendl;
+ /* XXX is there a missing match-dir case (trailing '/')? */
+ matched = true;
+ if (name == path) {
+ exact_matched = true;
+ return;
+ }
+ }
+ // try prefixes
+ for (auto& iter : common_prefixes) {
+ auto& name = iter.first;
+ lsubdout(cct, rgw, 15) << "RGWStatLeafRequest "
+ << __func__ << " "
+ << "list uri=" << state->relative_uri << " "
+ << " prefix=" << prefix << " "
+ << " pref path=" << name << " (not chomped)"
+ << " target = " << path << ""
+ << dendl;
+ matched = true;
+ /* match-dir case (trailing '/') */
+ if (name == prefix + "/") {
+ exact_matched = true;
+ is_dir = true;
+ return;
+ }
+ }
+ }
+
+ virtual void send_versioned_response() {
+ send_response();
+ }
+}; /* RGWStatLeafRequest */
+
+/*
+ put object
+*/
+
+/* Librgw continued (multi-step) PUT: the caller pushes sequential
+ * chunks via put_data(), and the exec_start/exec_continue/exec_finish
+ * methods (defined elsewhere) drive the RGWPutObj pipeline. Writes
+ * must arrive in offset order; a gap sets `eio`. */
+class RGWWriteRequest : public RGWLibContinuedReq,
+ public RGWPutObj /* RGWOp */
+{
+public:
+ const std::string& bucket_name;
+ const std::string& obj_name;
+ RGWFileHandle* rgw_fh;
+ std::optional<rgw::BlockingAioThrottle> aio;
+ std::unique_ptr<rgw::sal::Writer> processor;
+ rgw::sal::DataProcessor* filter; // head of the put pipeline (non-owning)
+ boost::optional<RGWPutObj_Compress> compressor;
+ CompressorRef plugin;
+ buffer::list data; // current chunk, handed off by get_data()
+ uint64_t timer_id;
+ MD5 hash; // running ETag digest
+ off_t real_ofs; // expected offset of the next chunk
+ size_t bytes_written;
+ bool eio; // set when an out-of-order offset is seen
+
+ RGWWriteRequest(rgw::sal::Driver* driver, const RGWProcessEnv& penv,
+ std::unique_ptr<rgw::sal::User> _user,
+ RGWFileHandle* _fh, const std::string& _bname,
+ const std::string& _oname)
+ : RGWLibContinuedReq(driver->ctx(), penv, std::move(_user)),
+ bucket_name(_bname), obj_name(_oname),
+ rgw_fh(_fh), filter(nullptr), timer_id(0), real_ofs(0),
+ bytes_written(0), eio(false) {
+
+ // in ctr this is not a virtual call
+ // invoking this class's header_init()
+ (void) RGWWriteRequest::header_init();
+ op = this;
+ // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
+ hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+ }
+
+ bool only_bucket() override { return true; }
+
+ int op_init() override {
+ // assign driver, s, and dialect_handler
+ // framework promises to call op_init after parent init
+ RGWOp::init(RGWHandler::driver, get_state(), this);
+ op = this; // assign self as op: REQUIRED
+ return 0;
+ }
+
+ // Fabricate the request-line state an HTTP front end would have set.
+ int header_init() override {
+
+ req_state* state = get_state();
+ state->info.method = "PUT";
+ state->op = OP_PUT;
+
+ /* XXX derp derp derp */
+ std::string uri = make_uri(bucket_name, obj_name);
+ state->relative_uri = uri;
+ state->info.request_uri = uri; // XXX
+ state->info.effective_uri = uri;
+ state->info.request_params = "";
+ state->info.domain = ""; /* XXX ? */
+
+ return 0;
+ }
+
+ int get_params(optional_yield) override {
+ req_state* state = get_state();
+ RGWAccessControlPolicy_S3 s3policy(state->cct);
+ /* we don't have (any) headers, so just create canned ACLs */
+ int ret = s3policy.create_canned(state->owner, state->bucket_owner, state->canned_acl);
+ policy = s3policy;
+ return ret;
+ }
+
+ // Hand the buffered chunk to the put pipeline (move, not copy) and
+ // account the bytes; returns the chunk length.
+ int get_data(buffer::list& _bl) override {
+ /* XXX for now, use sharing semantics */
+ uint32_t len = data.length();
+ _bl = std::move(data);
+ bytes_written += len;
+ return len;
+ }
+
+ // Stage the caller's chunk; flags EIO if `off` is not the expected
+ // next offset (no reordering support).
+ void put_data(off_t off, buffer::list& _bl) {
+ if (off != real_ofs) {
+ eio = true;
+ }
+ data = std::move(_bl);
+ real_ofs += data.length();
+ ofs = off; /* consumed in exec_continue() */
+ }
+
+ int exec_start() override;
+ int exec_continue() override;
+ int exec_finish() override;
+
+ void send_response() override {}
+
+ int verify_params() override {
+ return 0;
+ }
+}; /* RGWWriteRequest */
+
+/*
+ copy object
+*/
+/* Librgw server-side copy of src_parent/src_name to
+ * dst_parent/dst_name, used to implement rename. Also stamps the
+ * destination with a fresh unix-key xattr and merges attributes
+ * (ATTRSMOD_MERGE). */
+class RGWCopyObjRequest : public RGWLibRequest,
+ public RGWCopyObj /* RGWOp */
+{
+public:
+ RGWFileHandle* src_parent;
+ RGWFileHandle* dst_parent;
+ const std::string& src_name;
+ const std::string& dst_name;
+
+ RGWCopyObjRequest(CephContext* _cct, std::unique_ptr<rgw::sal::User> _user,
+ RGWFileHandle* _src_parent, RGWFileHandle* _dst_parent,
+ const std::string& _src_name, const std::string& _dst_name)
+ : RGWLibRequest(_cct, std::move(_user)), src_parent(_src_parent),
+ dst_parent(_dst_parent), src_name(_src_name), dst_name(_dst_name) {
+ /* all requests have this */
+ op = this;
+
+ /* allow this request to replace selected attrs */
+ attrs_mod = rgw::sal::ATTRSMOD_MERGE;
+ }
+
+ bool only_bucket() override { return true; }
+
+ int op_init() override {
+ // assign driver, s, and dialect_handler
+ // framework promises to call op_init after parent init
+ RGWOp::init(RGWHandler::driver, get_state(), this);
+ op = this; // assign self as op: REQUIRED
+
+ return 0;
+ }
+
+ // Set source/destination bucket+object state and the destination's
+ // RGW_ATTR_UNIX_KEY1 xattr; rejects invalid destination names.
+ int header_init() override {
+
+ req_state* state = get_state();
+ state->info.method = "PUT"; // XXX check
+ state->op = OP_PUT;
+
+ state->src_bucket_name = src_parent->bucket_name();
+ state->bucket_name = dst_parent->bucket_name();
+
+ std::string dest_obj_name = dst_parent->format_child_name(dst_name, false);
+
+ int rc = valid_s3_object_name(dest_obj_name);
+ if (rc != 0)
+ return rc;
+
+ state->object = RGWHandler::driver->get_object(rgw_obj_key(dest_obj_name));
+
+ /* XXX and fixup key attr (could optimize w/string ref and
+ * dest_obj_name) */
+ buffer::list ux_key;
+ fh_key fhk = dst_parent->make_fhk(dst_name);
+ rgw::encode(fhk, ux_key);
+ emplace_attr(RGW_ATTR_UNIX_KEY1, std::move(ux_key));
+
+#if 0 /* XXX needed? */
+ state->relative_uri = uri;
+ state->info.request_uri = uri; // XXX
+ state->info.effective_uri = uri;
+ state->info.request_params = "";
+ state->info.domain = ""; /* XXX ? */
+#endif
+
+ return 0;
+ }
+
+ int get_params(optional_yield) override {
+ req_state* s = get_state();
+ RGWAccessControlPolicy_S3 s3policy(s->cct);
+ /* we don't have (any) headers, so just create canned ACLs */
+ int ret = s3policy.create_canned(s->owner, s->bucket_owner, s->canned_acl);
+ dest_policy = s3policy;
+ /* src_object required before RGWCopyObj::verify_permissions() */
+ rgw_obj_key k = rgw_obj_key(src_name);
+ s->src_object = s->bucket->get_object(k);
+ s->object = s->src_object->clone(); // needed to avoid trap at rgw_op.cc:5150
+ return ret;
+ }
+
+ void send_response() override {}
+ void send_partial_response(off_t ofs) override {}
+
+}; /* RGWCopyObjRequest */
+
+/* Librgw request that fetches object xattrs; results are exposed via
+ * get_attrs(). NOTE(review): this class uses bare `virtual` where the
+ * sibling request classes use `override` — and get_params() here lacks
+ * the optional_yield parameter — so these may not actually override
+ * the base methods; confirm against RGWGetAttrs. */
+class RGWGetAttrsRequest : public RGWLibRequest,
+ public RGWGetAttrs /* RGWOp */
+{
+public:
+ const std::string& bucket_name;
+ const std::string& obj_name;
+
+ RGWGetAttrsRequest(CephContext* _cct,
+ std::unique_ptr<rgw::sal::User> _user,
+ const std::string& _bname, const std::string& _oname)
+ : RGWLibRequest(_cct, std::move(_user)), RGWGetAttrs(),
+ bucket_name(_bname), obj_name(_oname) {
+ op = this;
+ }
+
+ const flat_map<std::string, std::optional<buffer::list>>& get_attrs() {
+ return attrs;
+ }
+
+ virtual bool only_bucket() { return false; }
+
+ virtual int op_init() {
+ // assign driver, s, and dialect_handler
+ // framework promises to call op_init after parent init
+ RGWOp::init(RGWHandler::driver, get_state(), this);
+ op = this; // assign self as op: REQUIRED
+ return 0;
+ }
+
+ // Fabricate the request-line state an HTTP front end would have set.
+ virtual int header_init() {
+
+ req_state* s = get_state();
+ s->info.method = "GET";
+ s->op = OP_GET;
+
+ std::string uri = make_uri(bucket_name, obj_name);
+ s->relative_uri = uri;
+ s->info.request_uri = uri;
+ s->info.effective_uri = uri;
+ s->info.request_params = "";
+ s->info.domain = ""; /* XXX ? */
+
+ return 0;
+ }
+
+ virtual int get_params() {
+ return 0;
+ }
+
+ virtual void send_response() {}
+
+}; /* RGWGetAttrsRequest */
+
+/* Librgw request that sets object xattrs (PUT); the attrs map is
+ * inherited from RGWSetAttrs and exposed via get_attrs(). */
+class RGWSetAttrsRequest : public RGWLibRequest,
+ public RGWSetAttrs /* RGWOp */
+{
+public:
+ const std::string& bucket_name;
+ const std::string& obj_name;
+
+ RGWSetAttrsRequest(CephContext* _cct, std::unique_ptr<rgw::sal::User> _user,
+ const std::string& _bname, const std::string& _oname)
+ : RGWLibRequest(_cct, std::move(_user)), bucket_name(_bname), obj_name(_oname) {
+ op = this;
+ }
+
+ const std::map<std::string, buffer::list>& get_attrs() {
+ return attrs;
+ }
+
+ bool only_bucket() override { return false; }
+
+ int op_init() override {
+ // assign driver, s, and dialect_handler
+ // framework promises to call op_init after parent init
+ RGWOp::init(RGWHandler::driver, get_state(), this);
+ op = this; // assign self as op: REQUIRED
+ return 0;
+ }
+
+ // Fabricate the request-line state an HTTP front end would have set.
+ int header_init() override {
+
+ req_state* state = get_state();
+ state->info.method = "PUT";
+ state->op = OP_PUT;
+
+ /* XXX derp derp derp */
+ std::string uri = make_uri(bucket_name, obj_name);
+ state->relative_uri = uri;
+ state->info.request_uri = uri; // XXX
+ state->info.effective_uri = uri;
+ state->info.request_params = "";
+ state->info.domain = ""; /* XXX ? */
+
+ return 0;
+ }
+
+ int get_params(optional_yield) override {
+ return 0;
+ }
+
+ void send_response() override {}
+
+}; /* RGWSetAttrsRequest */
+
+/* Librgw request that removes object xattrs (DELETE). Shares the
+ * bare-`virtual` style caveat of RGWGetAttrsRequest. */
+class RGWRMAttrsRequest : public RGWLibRequest,
+ public RGWRMAttrs /* RGWOp */
+{
+public:
+ const std::string& bucket_name;
+ const std::string& obj_name;
+
+ RGWRMAttrsRequest(CephContext* _cct,
+ std::unique_ptr<rgw::sal::User> _user,
+ const std::string& _bname, const std::string& _oname)
+ : RGWLibRequest(_cct, std::move(_user)), RGWRMAttrs(),
+ bucket_name(_bname), obj_name(_oname) {
+ op = this;
+ }
+
+ const rgw::sal::Attrs& get_attrs() {
+ return attrs;
+ }
+
+ virtual bool only_bucket() { return false; }
+
+ virtual int op_init() {
+ // assign driver, s, and dialect_handler
+ // framework promises to call op_init after parent init
+ RGWOp::init(RGWHandler::driver, get_state(), this);
+ op = this; // assign self as op: REQUIRED
+ return 0;
+ }
+
+ // Fabricate the request-line state an HTTP front end would have set.
+ virtual int header_init() {
+
+ req_state* s = get_state();
+ s->info.method = "DELETE";
+ // NOTE(review): method is DELETE but op is OP_PUT — looks like a
+ // copy/paste slip (cf. RGWDeleteObjRequest, which uses OP_DELETE);
+ // confirm whether permission mapping depends on this before fixing.
+ s->op = OP_PUT;
+
+ std::string uri = make_uri(bucket_name, obj_name);
+ s->relative_uri = uri;
+ s->info.request_uri = uri;
+ s->info.effective_uri = uri;
+ s->info.request_params = "";
+ s->info.domain = ""; /* XXX ? */
+
+ return 0;
+ }
+
+ virtual int get_params() {
+ return 0;
+ }
+
+ virtual void send_response() {}
+
+}; /* RGWRMAttrsRequest */
+
+/*
+ * Send request to get the rados cluster stats
+ */
+/* Librgw request that fetches RADOS cluster utilization; on success
+ * send_response() copies the kb/kb_avail/kb_used/num_objects figures
+ * into the caller-owned stats_req. */
+class RGWGetClusterStatReq : public RGWLibRequest,
+ public RGWGetClusterStat {
+public:
+ struct rados_cluster_stat_t& stats_req; // caller-owned output
+ RGWGetClusterStatReq(CephContext* _cct, std::unique_ptr<rgw::sal::User> _user,
+ rados_cluster_stat_t& _stats):
+ RGWLibRequest(_cct, std::move(_user)), stats_req(_stats){
+ op = this;
+ }
+
+ int op_init() override {
+ // assign driver, s, and dialect_handler
+ // framework promises to call op_init after parent init
+ RGWOp::init(RGWHandler::driver, get_state(), this);
+ op = this; // assign self as op: REQUIRED
+ return 0;
+ }
+
+ // No URI needed: the op does not address a bucket or object.
+ int header_init() override {
+ req_state* state = get_state();
+ state->info.method = "GET";
+ state->op = OP_GET;
+ return 0;
+ }
+
+ int get_params(optional_yield) override { return 0; }
+ bool only_bucket() override { return false; }
+ void send_response() override {
+ stats_req.kb = stats_op.kb;
+ stats_req.kb_avail = stats_op.kb_avail;
+ stats_req.kb_used = stats_op.kb_used;
+ stats_req.num_objects = stats_op.num_objects;
+ }
+}; /* RGWGetClusterStatReq */
+
+
+} /* namespace rgw */
diff --git a/src/rgw/rgw_flight.cc b/src/rgw/rgw_flight.cc
new file mode 100644
index 000000000..f37d934b3
--- /dev/null
+++ b/src/rgw/rgw_flight.cc
@@ -0,0 +1,724 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright 2023 IBM
+ *
+ * See file COPYING for licensing information.
+ */
+
+#include <iostream>
+#include <fstream>
+#include <mutex>
+#include <map>
+#include <algorithm>
+
+#include "arrow/type.h"
+#include "arrow/buffer.h"
+#include "arrow/util/string_view.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/ipc/reader.h"
+#include "arrow/table.h"
+
+#include "arrow/flight/server.h"
+
+#include "parquet/arrow/reader.h"
+
+#include "common/dout.h"
+#include "rgw_op.h"
+
+#include "rgw_flight.h"
+#include "rgw_flight_frontend.h"
+
+
+namespace rgw::flight {
+
+// Ticket and FlightKey
+
+std::atomic<FlightKey> next_flight_key = null_flight_key;
+
+// Encode a FlightKey as an Arrow Flight Ticket whose payload is the
+// decimal string form of the key (inverse of TicketToFlightKey).
+flt::Ticket FlightKeyToTicket(const FlightKey& key) {
+  flt::Ticket ticket;
+  ticket.ticket = std::to_string(key);
+  return ticket;
+}
+
+// Decode a Ticket's payload back into a FlightKey; returns
+// Status::Invalid if the payload is not an unsigned integer or is out
+// of range (the two exceptions std::stoul can throw).
+// NOTE(review): Status::Invalid concatenates its arguments rather than
+// printf-formatting them, so the "%s" likely appears literally in the
+// message — confirm against the Arrow Status API.
+arw::Result<FlightKey> TicketToFlightKey(const flt::Ticket& t) {
+ try {
+ return (FlightKey) std::stoul(t.ticket);
+ } catch (std::invalid_argument const& ex) {
+ return arw::Status::Invalid(
+ "could not convert Ticket containing \"%s\" into a Flight Key",
+ t.ticket);
+ } catch (const std::out_of_range& ex) {
+ return arw::Status::Invalid(
+ "could not convert Ticket containing \"%s\" into a Flight Key due to range",
+ t.ticket);
+ }
+}
+
+// FlightData
+
+// Construct a FlightData record for one published object; the key is
+// drawn from the global next_flight_key counter (atomic pre-increment,
+// so keys are unique across threads).
+FlightData::FlightData(const std::string& _uri,
+ const std::string& _tenant_name,
+ const std::string& _bucket_name,
+ const rgw_obj_key& _object_key,
+ uint64_t _num_records,
+ uint64_t _obj_size,
+ std::shared_ptr<arw::Schema>& _schema,
+ std::shared_ptr<const arw::KeyValueMetadata>& _kv_metadata,
+ rgw_user _user_id) :
+ key(++next_flight_key),
+ /* expires(coarse_real_clock::now() + lifespan), */
+ uri(_uri),
+ tenant_name(_tenant_name),
+ bucket_name(_bucket_name),
+ object_key(_object_key),
+ num_records(_num_records),
+ obj_size(_obj_size),
+ schema(_schema),
+ kv_metadata(_kv_metadata),
+ user_id(_user_id)
+{ }
+
+/**** FlightStore ****/
+
+// Abstract flight registry; holds only the logging prefix. Concrete
+// storage is provided by subclasses (e.g. MemoryFlightStore).
+FlightStore::FlightStore(const DoutPrefix& _dp) :
+ dp(_dp)
+{ }
+
+FlightStore::~FlightStore() { }
+
+/**** MemoryFlightStore ****/
+
+// In-memory FlightStore backed by a mutex-protected map (see members
+// `map` and `mtx` used by the methods below).
+MemoryFlightStore::MemoryFlightStore(const DoutPrefix& _dp) :
+ FlightStore(_dp)
+{ }
+
+MemoryFlightStore::~MemoryFlightStore() { }
+
+// Insert a FlightData (keyed by its own key) and return that key.
+// Asserts on duplicate keys. Reading through the iterator after the
+// lock is released is safe only because map nodes are stable and
+// nothing here erases them concurrently.
+FlightKey MemoryFlightStore::add_flight(FlightData&& flight) {
+ std::pair<decltype(map)::iterator,bool> result;
+ {
+ const std::lock_guard lock(mtx);
+ result = map.insert( {flight.key, std::move(flight)} );
+ }
+ ceph_assertf(result.second,
+ "unable to add FlightData to MemoryFlightStore"); // temporary until error handling
+
+ return result.first->second.key;
+}
+
+// Look up a flight by key; returns a copy of the FlightData, or
+// Status::KeyError when the key is unknown.
+arw::Result<FlightData> MemoryFlightStore::get_flight(const FlightKey& key) const {
+ const std::lock_guard lock(mtx);
+ auto i = map.find(key);
+ if (i == map.cend()) {
+ return arw::Status::KeyError("could not find Flight with Key %" PRIu32,
+ key);
+ } else {
+ return i->second;
+ }
+}
+
+// returns either the next FlightData or, if at the end, an empty optional
+// Return a copy of the first flight whose key is strictly greater than
+// `key` (map is ordered), or an empty optional at the end; used by
+// ListFlights to iterate without holding the lock across calls.
+std::optional<FlightData> MemoryFlightStore::after_key(const FlightKey& key) const {
+ std::optional<FlightData> result;
+ {
+ const std::lock_guard lock(mtx);
+ auto i = map.upper_bound(key);
+ if (i != map.end()) {
+ result = i->second;
+ }
+ }
+ return result;
+}
+
+// TODO: unimplemented stub — does not remove anything; always
+// reports success.
+int MemoryFlightStore::remove_flight(const FlightKey& key) {
+ return 0;
+}
+
+// TODO: unimplemented stub — expiration (see the commented-out
+// `expires` member in FlightData's ctor) is not enforced yet.
+int MemoryFlightStore::expire_flights() {
+ return 0;
+}
+
+/**** FlightServer ****/
+
+// Arrow Flight server bound to the RGW process environment; caches the
+// SAL driver pointer from env and borrows (does not own) flight_store.
+FlightServer::FlightServer(RGWProcessEnv& _env,
+ FlightStore* _flight_store,
+ const DoutPrefix& _dp) :
+ env(_env),
+ driver(env.driver),
+ dp(_dp),
+ flight_store(_flight_store)
+{ }
+
+FlightServer::~FlightServer()
+{ }
+
+
+// Serve ListFlights by returning a lazy FlightListing that walks the
+// FlightStore in key order via after_key(); `criteria` is ignored.
+arw::Status FlightServer::ListFlights(const flt::ServerCallContext& context,
+ const flt::Criteria* criteria,
+ std::unique_ptr<flt::FlightListing>* listings) {
+
+ // function local class to implement FlightListing interface
+ class RGWFlightListing : public flt::FlightListing {
+
+ FlightStore* flight_store;
+ FlightKey previous_key; // cursor: last key handed out
+
+ public:
+
+ RGWFlightListing(FlightStore* flight_store) :
+ flight_store(flight_store),
+ previous_key(null_flight_key)
+ { }
+
+ // Produce the next FlightInfo, or set *info to nullptr at the end.
+ arw::Status Next(std::unique_ptr<flt::FlightInfo>* info) {
+ std::optional<FlightData> fd = flight_store->after_key(previous_key);
+ if (fd) {
+ previous_key = fd->key;
+ // descriptor path mirrors the object's full identity
+ auto descriptor =
+ flt::FlightDescriptor::Path(
+ { fd->tenant_name, fd->bucket_name, fd->object_key.name, fd->object_key.instance, fd->object_key.ns });
+ flt::FlightEndpoint endpoint;
+ endpoint.ticket = FlightKeyToTicket(fd->key);
+ std::vector<flt::FlightEndpoint> endpoints { endpoint };
+
+ ARROW_ASSIGN_OR_RAISE(flt::FlightInfo info_obj,
+ flt::FlightInfo::Make(*fd->schema, descriptor, endpoints, fd->num_records, fd->obj_size));
+ *info = std::make_unique<flt::FlightInfo>(std::move(info_obj));
+ return arw::Status::OK();
+ } else {
+ *info = nullptr;
+ return arw::Status::OK();
+ }
+ }
+ }; // class RGWFlightListing
+
+ *listings = std::make_unique<RGWFlightListing>(flight_store);
+ return arw::Status::OK();
+} // FlightServer::ListFlights
+
+
+// TODO: unimplemented stub — returns OK without filling in *info.
+arw::Status FlightServer::GetFlightInfo(const flt::ServerCallContext &context,
+ const flt::FlightDescriptor &request,
+ std::unique_ptr<flt::FlightInfo> *info) {
+ return arw::Status::OK();
+} // FlightServer::GetFlightInfo
+
+
+// TODO: unimplemented stub — returns OK without filling in *schema.
+arw::Status FlightServer::GetSchema(const flt::ServerCallContext &context,
+ const flt::FlightDescriptor &request,
+ std::unique_ptr<flt::SchemaResult> *schema) {
+ return arw::Status::OK();
+} // FlightServer::GetSchema
+
+// A Buffer that owns its memory and frees it when the Buffer is
+// destroyed
+class OwnedBuffer : public arw::Buffer {
+
+ uint8_t* buffer; // heap array owned by this object; freed in dtor
+
+protected:
+
+ // protected: construct via make() so allocation failure surfaces as
+ // an arrow Status instead of an exception
+ OwnedBuffer(uint8_t* _buffer, int64_t _size) :
+ Buffer(_buffer, _size),
+ buffer(_buffer)
+ { }
+
+public:
+
+ ~OwnedBuffer() override {
+ delete[] buffer;
+ }
+
+ // Allocate an owned buffer of `size` bytes; returns
+ // Status::OutOfMemory when the allocation fails (nothrow new).
+ static arw::Result<std::shared_ptr<OwnedBuffer>> make(int64_t size) {
+ uint8_t* buffer = new (std::nothrow) uint8_t[size];
+ if (!buffer) {
+ // fixed message typo: "allocated" -> "allocate"
+ return arw::Status::OutOfMemory("could not allocate buffer of size %" PRId64, size);
+ }
+
+ OwnedBuffer* ptr = new OwnedBuffer(buffer, size);
+ std::shared_ptr<OwnedBuffer> result;
+ result.reset(ptr);
+ return result;
+ }
+
+ // shrink the reported size when what's read in is less than capacity
+ void set_size(int64_t size) {
+ size_ = size;
+ }
+
+ // pointer that can be used to write into buffer
+ uint8_t* writeable_data() {
+ return buffer;
+ }
+}; // class OwnedBuffer
+
+#if 0 // remove classes used for testing and incrementally building
+
+// make local to DoGet eventually
+/* Dead code (inside the surrounding #if 0): file-backed InputStream
+ * used while bootstrapping DoGet; reads a hard-coded local parquet
+ * file. Kept for reference only. */
+class LocalInputStream : public arw::io::InputStream {
+
+ std::iostream::pos_type position;
+ std::fstream file;
+ std::shared_ptr<const arw::KeyValueMetadata> kv_metadata;
+ const DoutPrefix dp;
+
+public:
+
+ LocalInputStream(std::shared_ptr<const arw::KeyValueMetadata> _kv_metadata,
+ const DoutPrefix _dp) :
+ kv_metadata(_kv_metadata),
+ dp(_dp)
+ {}
+
+ arw::Status Open() {
+ file.open("/tmp/green_tripdata_2022-04.parquet", std::ios::in);
+ if (!file.good()) {
+ return arw::Status::IOError("unable to open file");
+ }
+
+ INFO << "file opened successfully" << dendl;
+ position = file.tellg();
+ return arw::Status::OK();
+ }
+
+ arw::Status Close() override {
+ file.close();
+ INFO << "file closed" << dendl;
+ return arw::Status::OK();
+ }
+
+ arw::Result<int64_t> Tell() const override {
+ if (position < 0) {
+ return arw::Status::IOError(
+ "could not query file implementaiton with tellg");
+ } else {
+ return int64_t(position);
+ }
+ }
+
+ // NOTE(review): logic looks inverted — this returns true while the
+ // file IS open; should presumably be !file.is_open().
+ bool closed() const override {
+ return file.is_open();
+ }
+
+ // Read up to nbytes into `out`; returns the byte count or IOError.
+ // NOTE(review): reinterpret_cast of an integral value to
+ // std::streamsize is dubious — static_cast is the appropriate cast.
+ arw::Result<int64_t> Read(int64_t nbytes, void* out) override {
+ INFO << "entered: asking for " << nbytes << " bytes" << dendl;
+ if (file.read(reinterpret_cast<char*>(out),
+ reinterpret_cast<std::streamsize>(nbytes))) {
+ const std::streamsize bytes_read = file.gcount();
+ INFO << "Point A: read bytes " << bytes_read << dendl;
+ position = file.tellg();
+ return bytes_read;
+ } else {
+ ERROR << "unable to read from file" << dendl;
+ return arw::Status::IOError("unable to read from offset %" PRId64,
+ int64_t(position));
+ }
+ }
+
+ // Read up to nbytes into a freshly allocated OwnedBuffer; the short-
+ // read/EOF branch returns the buffer without trimming (set_size call
+ // is commented out).
+ arw::Result<std::shared_ptr<arw::Buffer>> Read(int64_t nbytes) override {
+ INFO << "entered: " << ": asking for " << nbytes << " bytes" << dendl;
+
+ std::shared_ptr<OwnedBuffer> buffer;
+ ARROW_ASSIGN_OR_RAISE(buffer, OwnedBuffer::make(nbytes));
+
+ if (file.read(reinterpret_cast<char*>(buffer->writeable_data()),
+ reinterpret_cast<std::streamsize>(nbytes))) {
+ const auto bytes_read = file.gcount();
+ INFO << "Point B: read bytes " << bytes_read << dendl;
+ // buffer->set_size(bytes_read);
+ position = file.tellg();
+ return buffer;
+ } else if (file.rdstate() & std::ifstream::failbit &&
+ file.rdstate() & std::ifstream::eofbit) {
+ const auto bytes_read = file.gcount();
+ INFO << "3 read bytes " << bytes_read << " and reached EOF" << dendl;
+ // buffer->set_size(bytes_read);
+ position = file.tellg();
+ return buffer;
+ } else {
+ ERROR << "unable to read from file" << dendl;
+ return arw::Status::IOError("unable to read from offset %ld", position);
+ }
+ }
+
+ arw::Result<arw::util::string_view> Peek(int64_t nbytes) override {
+ INFO << "called, not implemented" << dendl;
+ return arw::Status::NotImplemented("peek not currently allowed");
+ }
+
+ bool supports_zero_copy() const override {
+ return false;
+ }
+
+ arw::Result<std::shared_ptr<const arw::KeyValueMetadata>> ReadMetadata() override {
+ INFO << "called" << dendl;
+ return kv_metadata;
+ }
+}; // class LocalInputStream
+
+/* Dead code (inside the surrounding #if 0): file-backed
+ * RandomAccessFile used while bootstrapping; reads a hard-coded local
+ * parquet file and reports the size recorded in flight_data. Kept for
+ * reference only; RandomAccessObject is the live equivalent. */
+class LocalRandomAccessFile : public arw::io::RandomAccessFile {
+
+ FlightData flight_data;
+ const DoutPrefix dp;
+
+ std::iostream::pos_type position;
+ std::fstream file;
+
+public:
+ LocalRandomAccessFile(const FlightData& _flight_data, const DoutPrefix _dp) :
+ flight_data(_flight_data),
+ dp(_dp)
+ { }
+
+ // implement InputStream
+
+ arw::Status Open() {
+ file.open("/tmp/green_tripdata_2022-04.parquet", std::ios::in);
+ if (!file.good()) {
+ return arw::Status::IOError("unable to open file");
+ }
+
+ INFO << "file opened successfully" << dendl;
+ position = file.tellg();
+ return arw::Status::OK();
+ }
+
+ arw::Status Close() override {
+ file.close();
+ INFO << "file closed" << dendl;
+ return arw::Status::OK();
+ }
+
+ arw::Result<int64_t> Tell() const override {
+ if (position < 0) {
+ return arw::Status::IOError(
+ "could not query file implementaiton with tellg");
+ } else {
+ return int64_t(position);
+ }
+ }
+
+ // NOTE(review): logic looks inverted — returns true while the file
+ // IS open; should presumably be !file.is_open() (same bug as
+ // LocalInputStream above).
+ bool closed() const override {
+ return file.is_open();
+ }
+
+ // NOTE(review): reinterpret_cast of an integral value to
+ // std::streamsize is dubious — static_cast is the appropriate cast.
+ arw::Result<int64_t> Read(int64_t nbytes, void* out) override {
+ INFO << "entered: asking for " << nbytes << " bytes" << dendl;
+ if (file.read(reinterpret_cast<char*>(out),
+ reinterpret_cast<std::streamsize>(nbytes))) {
+ const std::streamsize bytes_read = file.gcount();
+ INFO << "Point A: read bytes " << bytes_read << dendl;
+ position = file.tellg();
+ return bytes_read;
+ } else {
+ ERROR << "unable to read from file" << dendl;
+ return arw::Status::IOError("unable to read from offset %" PRId64,
+ int64_t(position));
+ }
+ }
+
+ // Buffer-allocating read; short-read/EOF branch returns the buffer
+ // without trimming (set_size call is commented out).
+ arw::Result<std::shared_ptr<arw::Buffer>> Read(int64_t nbytes) override {
+ INFO << "entered: asking for " << nbytes << " bytes" << dendl;
+
+ std::shared_ptr<OwnedBuffer> buffer;
+ ARROW_ASSIGN_OR_RAISE(buffer, OwnedBuffer::make(nbytes));
+
+ if (file.read(reinterpret_cast<char*>(buffer->writeable_data()),
+ reinterpret_cast<std::streamsize>(nbytes))) {
+ const auto bytes_read = file.gcount();
+ INFO << "Point B: read bytes " << bytes_read << dendl;
+ // buffer->set_size(bytes_read);
+ position = file.tellg();
+ return buffer;
+ } else if (file.rdstate() & std::ifstream::failbit &&
+ file.rdstate() & std::ifstream::eofbit) {
+ const auto bytes_read = file.gcount();
+ INFO << "3 read bytes " << bytes_read << " and reached EOF" << dendl;
+ // buffer->set_size(bytes_read);
+ position = file.tellg();
+ return buffer;
+ } else {
+ ERROR << "unable to read from file" << dendl;
+ return arw::Status::IOError("unable to read from offset %ld", position);
+ }
+ }
+
+ bool supports_zero_copy() const override {
+ return false;
+ }
+
+ // implement Seekable
+
+ // size comes from the flight metadata, not the local file
+ arw::Result<int64_t> GetSize() override {
+ return flight_data.obj_size;
+ }
+
+ // Peek = read into an owned view, then seek back to where we were.
+ arw::Result<arw::util::string_view> Peek(int64_t nbytes) override {
+ std::iostream::pos_type here = file.tellg();
+ if (here == -1) {
+ return arw::Status::IOError(
+ "unable to determine current position ahead of peek");
+ }
+
+ ARROW_ASSIGN_OR_RAISE(OwningStringView result,
+ OwningStringView::make(nbytes));
+
+ // read
+ ARROW_ASSIGN_OR_RAISE(int64_t bytes_read,
+ Read(nbytes, (void*) result.writeable_data()));
+ (void) bytes_read; // silence unused variable warnings
+
+ // return offset to original
+ ARROW_RETURN_NOT_OK(Seek(here));
+
+ return result;
+ }
+
+ arw::Result<std::shared_ptr<const arw::KeyValueMetadata>> ReadMetadata() {
+ return flight_data.kv_metadata;
+ }
+
+ arw::Future<std::shared_ptr<const arw::KeyValueMetadata>> ReadMetadataAsync(
+ const arw::io::IOContext& io_context) override {
+ return arw::Future<std::shared_ptr<const arw::KeyValueMetadata>>::MakeFinished(ReadMetadata());
+ }
+
+ // implement Seekable interface
+
+ arw::Status Seek(int64_t position) {
+ file.seekg(position);
+ if (file.fail()) {
+ return arw::Status::IOError(
+ "error encountered during seek to %" PRId64, position);
+ } else {
+ return arw::Status::OK();
+ }
+ }
+}; // class LocalRandomAccessFile
+#endif
+
+// Adapts a stored RGW object to Arrow's RandomAccessFile interface so
+// the parquet reader can open/seek/read it like a local file.
+class RandomAccessObject : public arw::io::RandomAccessFile {
+
+  FlightData flight_data;
+  const DoutPrefix dp;
+
+  // current logical read offset; -1 means "not opened or in error"
+  int64_t position;
+  bool is_closed;
+  std::unique_ptr<rgw::sal::Object::ReadOp> op;
+
+public:
+
+  RandomAccessObject(const FlightData& _flight_data,
+                     std::unique_ptr<rgw::sal::Object>& obj,
+                     const DoutPrefix _dp) :
+    flight_data(_flight_data),
+    dp(_dp),
+    position(-1),
+    is_closed(false)
+  {
+    op = obj->get_read_op();
+  }
+
+  // prepares the underlying read op; must succeed before Read/Peek
+  arw::Status Open() {
+    int ret = op->prepare(null_yield, &dp);
+    if (ret < 0) {
+      return arw::Status::IOError(
+        "unable to prepare object with error %d", ret);
+    }
+    INFO << "file opened successfully" << dendl;
+    position = 0;
+    return arw::Status::OK();
+  }
+
+  // implement InputStream
+
+  arw::Status Close() override {
+    position = -1;
+    is_closed = true;
+    // releases the ReadOp
+    (void) op.reset();
+    INFO << "object closed" << dendl;
+    return arw::Status::OK();
+  }
+
+  // returns the current offset, or IOError if in the bad state
+  arw::Result<int64_t> Tell() const override {
+    if (position < 0) {
+      return arw::Status::IOError("could not determine position");
+    } else {
+      return position;
+    }
+  }
+
+  bool closed() const override {
+    return is_closed;
+  }
+
+  // Reads up to nbytes at the current offset into out, advances the
+  // offset, and returns the number of bytes actually read.
+  arw::Result<int64_t> Read(int64_t nbytes, void* out) override {
+    INFO << "entered: asking for " << nbytes << " bytes" << dendl;
+
+    if (position < 0) {
+      ERROR << "error, position indicated error" << dendl;
+      return arw::Status::IOError("object read op is in bad state");
+    }
+
+    // note: read function reads through end_position inclusive
+    int64_t end_position = position + nbytes - 1;
+
+    bufferlist bl;
+
+    // NOTE(review): for nbytes == 0 this requests the range
+    // [position, position - 1]; confirm ReadOp::read treats that as a
+    // zero-length read rather than an error
+    const int64_t bytes_read =
+      op->read(position, end_position, bl, null_yield, &dp);
+    if (bytes_read < 0) {
+      const int64_t former_position = position;
+      position = -1;
+      ERROR << "read operation returned " << bytes_read << dendl;
+      return arw::Status::IOError(
+        "unable to read object at position %" PRId64 ", error code: %" PRId64,
+        former_position,
+        bytes_read);
+    }
+
+    // TODO: see if there's a way to get rid of this copy, perhaps
+    // updating rgw::sal::read_op
+    bl.cbegin().copy(bytes_read, reinterpret_cast<char*>(out));
+
+    position += bytes_read;
+
+    if (nbytes != bytes_read) {
+      INFO << "partial read: nbytes=" << nbytes <<
+        ", bytes_read=" << bytes_read << dendl;
+    }
+    INFO << bytes_read << " bytes read" << dendl;
+    return bytes_read;
+  }
+
+  // allocating variant; buffer size is set to the bytes actually read
+  arw::Result<std::shared_ptr<arw::Buffer>> Read(int64_t nbytes) override {
+    INFO << "entered: asking for " << nbytes << " bytes" << dendl;
+
+    std::shared_ptr<OwnedBuffer> buffer;
+    ARROW_ASSIGN_OR_RAISE(buffer, OwnedBuffer::make(nbytes));
+
+    ARROW_ASSIGN_OR_RAISE(const int64_t bytes_read,
+                          Read(nbytes, buffer->writeable_data()));
+    buffer->set_size(bytes_read);
+
+    return buffer;
+  }
+
+  bool supports_zero_copy() const override {
+    return false;
+  }
+
+  // implement Seekable
+
+  // object size as recorded when the flight was registered
+  arw::Result<int64_t> GetSize() override {
+    INFO << "entered: " << flight_data.obj_size << " returned" << dendl;
+    return flight_data.obj_size;
+  }
+
+  // Reads ahead without moving the offset.
+  // NOTE(review): the returned arw::util::string_view refers to memory
+  // owned by the local OwningStringView, which is destroyed when this
+  // function returns, so the view handed back in the Result appears to
+  // dangle -- confirm intended lifetime with the Arrow Peek contract
+  arw::Result<arw::util::string_view> Peek(int64_t nbytes) override {
+    INFO << "entered: " << nbytes << " bytes" << dendl;
+
+    int64_t saved_position = position;
+
+    ARROW_ASSIGN_OR_RAISE(OwningStringView buffer,
+                          OwningStringView::make(nbytes));
+
+    ARROW_ASSIGN_OR_RAISE(const int64_t bytes_read,
+                          Read(nbytes, (void*) buffer.writeable_data()));
+
+    // restore position for a peek
+    position = saved_position;
+
+    if (bytes_read < nbytes) {
+      // create new OwningStringView with moved buffer
+      return OwningStringView::shrink(std::move(buffer), bytes_read);
+    } else {
+      return buffer;
+    }
+  }
+
+  // metadata captured when the flight was registered
+  // NOTE(review): unlike the Async variant below this is not marked
+  // override; confirm whether the base class declares a virtual
+  // synchronous ReadMetadata
+  arw::Result<std::shared_ptr<const arw::KeyValueMetadata>> ReadMetadata() {
+    return flight_data.kv_metadata;
+  }
+
+  arw::Future<std::shared_ptr<const arw::KeyValueMetadata>> ReadMetadataAsync(
+    const arw::io::IOContext& io_context) override {
+    return arw::Future<std::shared_ptr<const arw::KeyValueMetadata>>::MakeFinished(ReadMetadata());
+  }
+
+  // implement Seekable interface
+
+  // sets the offset; note there is no bounds check against obj_size
+  arw::Status Seek(int64_t new_position) {
+    INFO << "entered: position: " << new_position << dendl;
+    if (position < 0) {
+      ERROR << "error, position indicated error" << dendl;
+      return arw::Status::IOError("object read op is in bad state");
+    } else {
+      position = new_position;
+      return arw::Status::OK();
+    }
+  }
+}; // class RandomAccessObject
+
+// Serves a registered flight: looks the ticket up in the flight
+// store, opens the referenced RGW object, parses it as parquet, and
+// streams its record batches back to the client.
+arw::Status FlightServer::DoGet(const flt::ServerCallContext &context,
+                                const flt::Ticket &request,
+                                std::unique_ptr<flt::FlightDataStream> *stream) {
+  int ret;
+
+  ARROW_ASSIGN_OR_RAISE(FlightKey key, TicketToFlightKey(request));
+  ARROW_ASSIGN_OR_RAISE(FlightData fd, get_flight_store()->get_flight(key));
+
+  std::unique_ptr<rgw::sal::User> user = driver->get_user(fd.user_id);
+  if (user->empty()) {
+    INFO << "user is empty" << dendl;
+  } else {
+    // TODO: test what happens if user is not loaded
+    ret = user->load_user(&dp, null_yield);
+    if (ret < 0) {
+      ERROR << "load_user returned " << ret << dendl;
+      // TODO return something
+    }
+    INFO << "user is " << user->get_display_name() << dendl;
+  }
+
+  std::unique_ptr<rgw::sal::Bucket> bucket;
+
+  // NOTE(review): on failure execution falls through with bucket still
+  // null, so the get_object call below would dereference it; the TODO
+  // should become an early error return
+  ret = driver->get_bucket(&dp, &(*user), fd.tenant_name, fd.bucket_name,
+                           &bucket, null_yield);
+  if (ret < 0) {
+    ERROR << "get_bucket returned " << ret << dendl;
+    // TODO return something
+  }
+
+  std::unique_ptr<rgw::sal::Object> object = bucket->get_object(fd.object_key);
+
+  auto input = std::make_shared<RandomAccessObject>(fd, object, dp);
+  ARROW_RETURN_NOT_OK(input->Open());
+
+  std::unique_ptr<parquet::arrow::FileReader> reader;
+  ARROW_RETURN_NOT_OK(parquet::arrow::OpenFile(input,
+                                               arw::default_memory_pool(),
+                                               &reader));
+
+  // the entire table is materialized in memory before streaming
+  std::shared_ptr<arrow::Table> table;
+  ARROW_RETURN_NOT_OK(reader->ReadTable(&table));
+
+  std::vector<std::shared_ptr<arw::RecordBatch>> batches;
+  arw::TableBatchReader batch_reader(*table);
+  ARROW_RETURN_NOT_OK(batch_reader.ReadAll(&batches));
+
+  ARROW_ASSIGN_OR_RAISE(auto owning_reader,
+                        arw::RecordBatchReader::Make(
+                          std::move(batches), table->schema()));
+  *stream = std::unique_ptr<flt::FlightDataStream>(
+    new flt::RecordBatchStream(owning_reader));
+
+  return arw::Status::OK();
+} // flightServer::DoGet
+
+} // namespace rgw::flight
diff --git a/src/rgw/rgw_flight.h b/src/rgw/rgw_flight.h
new file mode 100644
index 000000000..bb0a987d0
--- /dev/null
+++ b/src/rgw/rgw_flight.h
@@ -0,0 +1,221 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright 2023 IBM
+ *
+ * See file COPYING for licensing information.
+ */
+
+#pragma once
+
+#include <map>
+#include <mutex>
+#include <atomic>
+
+#include "include/common_fwd.h"
+#include "common/ceph_context.h"
+#include "common/Thread.h"
+#include "common/ceph_time.h"
+#include "rgw_frontend.h"
+#include "arrow/type.h"
+#include "arrow/flight/server.h"
+#include "arrow/util/string_view.h"
+
+#include "rgw_flight_frontend.h"
+
+
+#define INFO_F(dp) ldpp_dout(&dp, 20) << "INFO: " << __func__ << ": "
+#define STATUS_F(dp) ldpp_dout(&dp, 10) << "STATUS: " << __func__ << ": "
+#define WARN_F(dp) ldpp_dout(&dp, 0) << "WARNING: " << __func__ << ": "
+#define ERROR_F(dp) ldpp_dout(&dp, 0) << "ERROR: " << __func__ << ": "
+
+#define INFO INFO_F(dp)
+#define STATUS STATUS_F(dp)
+#define WARN WARN_F(dp)
+#define ERROR ERROR_F(dp)
+
+
+namespace arw = arrow;
+namespace flt = arrow::flight;
+
+
+struct req_state;
+
+namespace rgw::flight {
+
+// how long a registered flight remains valid
+// NOTE(review): presumably consumed by FlightStore::expire_flights --
+// confirm in the implementation
+static const coarse_real_clock::duration lifespan = std::chrono::hours(1);
+
+// Describes one parquet object that has been registered as a
+// retrievable "flight" (see FlightServer::DoGet).
+struct FlightData {
+  FlightKey key;
+  // coarse_real_clock::time_point expires;
+  std::string uri;
+  std::string tenant_name;
+  std::string bucket_name;
+  rgw_obj_key object_key;
+  // NB: what about object's namespace and instance?
+  uint64_t num_records;
+  uint64_t obj_size;
+  std::shared_ptr<arw::Schema> schema;
+  std::shared_ptr<const arw::KeyValueMetadata> kv_metadata;
+
+  rgw_user user_id; // TODO: this should be removed when we do
+                    // proper flight authentication
+
+  FlightData(const std::string& _uri,
+             const std::string& _tenant_name,
+             const std::string& _bucket_name,
+             const rgw_obj_key& _object_key,
+             uint64_t _num_records,
+             uint64_t _obj_size,
+             std::shared_ptr<arw::Schema>& _schema,
+             std::shared_ptr<const arw::KeyValueMetadata>& _kv_metadata,
+             rgw_user _user_id);
+};
+
+// stores flights that have been created and helps expire them
+class FlightStore {
+
+protected:
+
+  const DoutPrefix& dp;
+
+public:
+
+  FlightStore(const DoutPrefix& dp);
+  virtual ~FlightStore();
+  // registers a flight and returns its key
+  virtual FlightKey add_flight(FlightData&& flight) = 0;
+
+  // TODO consider returning const shared pointers to FlightData in
+  // the following two functions
+  virtual arw::Result<FlightData> get_flight(const FlightKey& key) const = 0;
+  // NOTE(review): presumably returns the next flight after the given
+  // key, if any -- confirm against the implementation
+  virtual std::optional<FlightData> after_key(const FlightKey& key) const = 0;
+
+  virtual int remove_flight(const FlightKey& key) = 0;
+  virtual int expire_flights() = 0;
+};
+
+// in-memory FlightStore backed by a mutex-protected std::map
+class MemoryFlightStore : public FlightStore {
+  std::map<FlightKey, FlightData> map;
+  mutable std::mutex mtx; // for map
+
+public:
+
+  MemoryFlightStore(const DoutPrefix& dp);
+  virtual ~MemoryFlightStore();
+  FlightKey add_flight(FlightData&& flight) override;
+  arw::Result<FlightData> get_flight(const FlightKey& key) const override;
+  std::optional<FlightData> after_key(const FlightKey& key) const override;
+  int remove_flight(const FlightKey& key) override;
+  int expire_flights() override;
+};
+
+// Arrow Flight RPC server embedded in radosgw; serves flights that
+// were registered in the FlightStore.
+class FlightServer : public flt::FlightServerBase {
+
+  using Data1 = std::vector<std::shared_ptr<arw::RecordBatch>>;
+
+  RGWProcessEnv& env;
+  rgw::sal::Driver* driver;
+  const DoutPrefix& dp;
+  FlightStore* flight_store; // not owned here; freed by the frontend
+
+  // NOTE(review): appears unused by the code visible in this file --
+  // confirm before relying on it
+  std::map<std::string, Data1> data;
+
+public:
+
+  static constexpr int default_port = 8077;
+
+  FlightServer(RGWProcessEnv& env,
+               FlightStore* flight_store,
+               const DoutPrefix& dp);
+  ~FlightServer() override;
+
+  FlightStore* get_flight_store() {
+    return flight_store;
+  }
+
+  arw::Status ListFlights(const flt::ServerCallContext& context,
+                          const flt::Criteria* criteria,
+                          std::unique_ptr<flt::FlightListing>* listings) override;
+
+  arw::Status GetFlightInfo(const flt::ServerCallContext &context,
+                            const flt::FlightDescriptor &request,
+                            std::unique_ptr<flt::FlightInfo> *info) override;
+
+  arw::Status GetSchema(const flt::ServerCallContext &context,
+                        const flt::FlightDescriptor &request,
+                        std::unique_ptr<flt::SchemaResult> *schema) override;
+
+  arw::Status DoGet(const flt::ServerCallContext &context,
+                    const flt::Ticket &request,
+                    std::unique_ptr<flt::FlightDataStream> *stream) override;
+}; // class FlightServer
+
+// An arw::util::string_view that owns its underlying byte buffer and
+// releases it with delete[] on destruction; movable, not copyable.
+class OwningStringView : public arw::util::string_view {
+
+  uint8_t* buffer;   // owned allocation; nullptr once moved from
+  int64_t capacity;  // bytes allocated
+  int64_t consumed;  // bytes exposed through the view
+
+  OwningStringView(uint8_t* _buffer, int64_t _size) :
+    arw::util::string_view((const char*) _buffer, _size),
+    buffer(_buffer),
+    capacity(_size),
+    consumed(_size)
+  { }
+
+  // Steals from's buffer but exposes only the first new_size bytes.
+  // fix: also initialize the string_view base, which the previous
+  // version left default-constructed (an empty view over no data).
+  OwningStringView(OwningStringView&& from, int64_t new_size) :
+    arw::util::string_view((const char*) from.buffer, new_size),
+    buffer(nullptr),
+    capacity(from.capacity),
+    consumed(new_size)
+  {
+    // should be impossible due to static function check
+    ceph_assertf(consumed <= capacity, "new size cannot exceed capacity");
+
+    std::swap(buffer, from.buffer);
+    from.capacity = 0;
+    from.consumed = 0;
+  }
+
+public:
+
+  // fix: the previously defaulted move operations copied the raw
+  // buffer pointer, leaving source and destination both owning it and
+  // causing a double delete[]; these transfer ownership and null out
+  // the source instead
+  OwningStringView(OwningStringView&& from) noexcept :
+    arw::util::string_view(from),
+    buffer(from.buffer),
+    capacity(from.capacity),
+    consumed(from.consumed)
+  {
+    from.buffer = nullptr;
+    from.capacity = 0;
+    from.consumed = 0;
+  }
+
+  OwningStringView& operator=(OwningStringView&& from) noexcept {
+    if (this != &from) {
+      delete[] buffer;
+      arw::util::string_view::operator=(from);
+      buffer = from.buffer;
+      capacity = from.capacity;
+      consumed = from.consumed;
+      from.buffer = nullptr;
+      from.capacity = 0;
+      from.consumed = 0;
+    }
+    return *this;
+  }
+
+  uint8_t* writeable_data() {
+    return buffer;
+  }
+
+  ~OwningStringView() {
+    // delete[] on nullptr (moved-from state) is a no-op
+    delete[] buffer;
+  }
+
+  // Allocates size bytes. fix: plain new[] throws std::bad_alloc and
+  // never returns null, so the OutOfMemory branch was dead; use the
+  // nothrow form so allocation failure is reported as a Status.
+  static arw::Result<OwningStringView> make(int64_t size) {
+    uint8_t* buffer = new (std::nothrow) uint8_t[size];
+    if (!buffer) {
+      return arw::Status::OutOfMemory("could not allocated buffer of size %" PRId64, size);
+    }
+    return OwningStringView(buffer, size);
+  }
+
+  // returns a view over the same buffer truncated to new_size,
+  // consuming the argument
+  static arw::Result<OwningStringView> shrink(OwningStringView&& from,
+                                              int64_t new_size) {
+    if (new_size > from.capacity) {
+      return arw::Status::Invalid("new size cannot exceed capacity");
+    } else {
+      return OwningStringView(std::move(from), new_size);
+    }
+  }
+
+};
+
+// GLOBAL
+
+flt::Ticket FlightKeyToTicket(const FlightKey& key);
+arw::Status TicketToFlightKey(const flt::Ticket& t, FlightKey& key);
+
+} // namespace rgw::flight
diff --git a/src/rgw/rgw_flight_frontend.cc b/src/rgw/rgw_flight_frontend.cc
new file mode 100644
index 000000000..c29703fe5
--- /dev/null
+++ b/src/rgw/rgw_flight_frontend.cc
@@ -0,0 +1,246 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright 2023 IBM
+ *
+ * See file COPYING for licensing information.
+ */
+
+#include <cstdio>
+#include <filesystem>
+#include <sstream>
+
+#include "arrow/type.h"
+#include "arrow/flight/server.h"
+#include "arrow/io/file.h"
+
+#include "parquet/arrow/reader.h"
+#include "parquet/arrow/schema.h"
+#include "parquet/stream_reader.h"
+
+#include "rgw_flight_frontend.h"
+#include "rgw_flight.h"
+
+
+// logging
+constexpr unsigned dout_subsys = ceph_subsys_rgw_flight;
+constexpr const char* dout_prefix_str = "rgw arrow_flight: ";
+
+
+namespace rgw::flight {
+
+const FlightKey null_flight_key = 0;
+
+// Creates the flight store and flight server and publishes them in
+// the process environment; the server does not serve until run().
+FlightFrontend::FlightFrontend(RGWProcessEnv& _env,
+                               RGWFrontendConfig* _config,
+                               int _port) :
+  env(_env),
+  config(_config),
+  port(_port),
+  dp(env.driver->ctx(), dout_subsys, dout_prefix_str)
+{
+  env.flight_store = new MemoryFlightStore(dp);
+  env.flight_server = new FlightServer(env, env.flight_store, dp);
+  // NOTE(review): logged before Init()/Serve() have run; "constructed"
+  // would be more accurate than "started"
+  INFO << "flight server started" << dendl;
+}
+
+// Tears down the server before the store, since the server holds a
+// pointer to the store.
+FlightFrontend::~FlightFrontend() {
+  delete env.flight_server;
+  env.flight_server = nullptr;
+
+  delete env.flight_store;
+  env.flight_store = nullptr;
+
+  INFO << "flight server shut down" << dendl;
+}
+
+// Binds the flight server to grpc+tcp://localhost:<port>; any failure
+// is reported as -EINVAL.
+int FlightFrontend::init() {
+  if (port <= 0) {
+    port = FlightServer::default_port;
+  }
+  const std::string url =
+    std::string("grpc+tcp://localhost:") + std::to_string(port);
+  flt::Location location;
+  arw::Status s = flt::Location::Parse(url, &location);
+  if (!s.ok()) {
+    ERROR << "couldn't parse url=" << url << ", status=" << s << dendl;
+    return -EINVAL;
+  }
+
+  flt::FlightServerOptions options(location);
+  options.verify_client = false;
+  s = env.flight_server->Init(options);
+  if (!s.ok()) {
+    ERROR << "couldn't init flight server; status=" << s << dendl;
+    return -EINVAL;
+  }
+
+  INFO << "FlightServer inited; will use port " << port << dendl;
+  return 0;
+}
+
+// Starts FlightServer::Serve in its own named thread; returns the
+// negated errno if the thread could not be started.
+int FlightFrontend::run() {
+  try {
+    flight_thread = make_named_thread(server_thread_name,
+                                      &FlightServer::Serve,
+                                      env.flight_server);
+
+    INFO << "FlightServer thread started, id=" <<
+      flight_thread.get_id() <<
+      ", joinable=" << flight_thread.joinable() << dendl;
+    return 0;
+  } catch (std::system_error& e) {
+    ERROR << "FlightServer thread failed to start" << dendl;
+    return -e.code().value();
+  }
+}
+
+// stops serving; run()'s thread is reaped separately in join()
+void FlightFrontend::stop() {
+  env.flight_server->Shutdown();
+  env.flight_server->Wait();
+  INFO << "FlightServer shut down" << dendl;
+}
+
+void FlightFrontend::join() {
+  flight_thread.join();
+  INFO << "FlightServer thread joined" << dendl;
+}
+
+void FlightFrontend::pause_for_new_config() {
+  // ignore since config changes won't alter flight_server
+}
+
+void FlightFrontend::unpause_with_new_config() {
+  // ignore since config changes won't alter flight_server
+}
+
+/* ************************************************************ */
+
+// Captures a GET-object data stream into a temp file so the parquet
+// footer can be parsed and the object registered as a flight.
+FlightGetObj_Filter::FlightGetObj_Filter(const req_state* request,
+                                         RGWGetObj_Filter* next) :
+  RGWGetObj_Filter(next),
+  penv(request->penv),
+  dp(request->cct->get(), dout_subsys, dout_prefix_str),
+  current_offset(0),
+  expected_size(request->obj_size),
+  uri(request->decoded_uri),
+  tenant_name(request->bucket->get_tenant()),
+  bucket_name(request->bucket->get_name()),
+  object_key(request->object->get_key()),
+  // note: what about object namespace and instance?
+  schema_status(arrow::StatusCode::Cancelled,
+                "schema determination incomplete"),
+  user_id(request->user->get_id())
+{
+#warning "TODO: fix use of tmpnam"
+  char name[L_tmpnam];
+  const char* namep = std::tmpnam(name);
+  if (!namep) {
+    // NOTE(review): empty branch -- a null return would flow into the
+    // std::string assignment below (undefined behavior); needs real
+    // error handling along with the tmpnam replacement
+  }
+  temp_file_name = namep;
+
+  temp_file.open(temp_file_name);
+}
+
+// Removes the temp file and reports the schema-determination outcome.
+FlightGetObj_Filter::~FlightGetObj_Filter() {
+  if (temp_file.is_open()) {
+    temp_file.close();
+  }
+  std::error_code error;
+  std::filesystem::remove(temp_file_name, error);
+  if (error) {
+    ERROR << "FlightGetObj_Filter got error when removing temp file; "
+      "error=" << error.value() <<
+      ", temp_file_name=" << temp_file_name << dendl;
+  } else {
+    INFO << "parquet/arrow schema determination status: " <<
+      schema_status << dendl;
+  }
+}
+
+// Tees each chunk of the GET stream into the temp file; once the full
+// object has been seen, parses the parquet metadata and registers the
+// flight. The data is always forwarded to the next filter.
+int FlightGetObj_Filter::handle_data(bufferlist& bl,
+                                     off_t bl_ofs, off_t bl_len) {
+  INFO << "flight handling data from offset " <<
+    current_offset << " (" << bl_ofs << ") of size " << bl_len << dendl;
+
+  current_offset += bl_len;
+
+  if (temp_file.is_open()) {
+    bl.write_stream(temp_file);
+
+    if (current_offset >= expected_size) {
+      INFO << "data read is completed, current_offset=" <<
+        current_offset << ", expected_size=" << expected_size << dendl;
+      temp_file.close();
+
+      std::shared_ptr<const arw::KeyValueMetadata> kv_metadata;
+      std::shared_ptr<arw::Schema> aw_schema;
+      int64_t num_rows = 0;
+
+      // parse the parquet footer of the completed temp file to recover
+      // the row count, key/value metadata, and arrow schema
+      auto process_metadata = [&aw_schema, &num_rows, &kv_metadata, this]() -> arrow::Status {
+        ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::io::ReadableFile> file,
+                              arrow::io::ReadableFile::Open(temp_file_name));
+        const std::shared_ptr<parquet::FileMetaData> metadata = parquet::ReadMetaData(file);
+
+        file->Close();
+
+        num_rows = metadata->num_rows();
+        kv_metadata = metadata->key_value_metadata();
+        const parquet::SchemaDescriptor* pq_schema = metadata->schema();
+        ARROW_RETURN_NOT_OK(parquet::arrow::FromParquetSchema(pq_schema, &aw_schema));
+
+        return arrow::Status::OK();
+      };
+
+      schema_status = process_metadata();
+      if (!schema_status.ok()) {
+        ERROR << "reading metadata to access schema, error=" << schema_status << dendl;
+      } else {
+        // INFO << "arrow_schema=" << *aw_schema << dendl;
+        FlightStore* store = penv.flight_store;
+        auto key =
+          store->add_flight(FlightData(uri, tenant_name, bucket_name,
+                                       object_key, num_rows,
+                                       expected_size, aw_schema,
+                                       kv_metadata, user_id));
+        (void) key; // suppress unused variable warning
+      }
+    } // if last block
+  } // if file opened
+
+  // chain to next filter in stream
+  int ret = RGWGetObj_Filter::handle_data(bl, bl_ofs, bl_len);
+
+  return ret;
+}
+
+#if 0
+// Disabled scratch snippets kept for reference while developing the
+// parquet metadata handling; never compiled (inside #if 0).
+void code_snippets() {
+  INFO << "num_columns:" << md->num_columns() <<
+    " num_schema_elements:" << md->num_schema_elements() <<
+    " num_rows:" << md->num_rows() <<
+    " num_row_groups:" << md->num_row_groups() << dendl;
+
+
+  INFO << "file schema: name=" << schema1->name() << ", ToString:" << schema1->ToString() << ", num_columns=" << schema1->num_columns() << dendl;
+  for (int c = 0; c < schema1->num_columns(); ++c) {
+    const parquet::ColumnDescriptor* cd = schema1->Column(c);
+    // const parquet::ConvertedType::type t = cd->converted_type;
+    const std::shared_ptr<const parquet::LogicalType> lt = cd->logical_type();
+    INFO << "column " << c << ": name=" << cd->name() << ", ToString=" << cd->ToString() << ", logical_type=" << lt->ToString() << dendl;
+  }
+
+  INFO << "There are " << md->num_rows() << " rows and " << md->num_row_groups() << " row groups" << dendl;
+  for (int rg = 0; rg < md->num_row_groups(); ++rg) {
+    INFO << "Row Group " << rg << dendl;
+    auto rg_md = md->RowGroup(rg);
+    auto schema2 = rg_md->schema();
+  }
+}
+#endif
+
+} // namespace rgw::flight
diff --git a/src/rgw/rgw_flight_frontend.h b/src/rgw/rgw_flight_frontend.h
new file mode 100644
index 000000000..dfc470a3b
--- /dev/null
+++ b/src/rgw/rgw_flight_frontend.h
@@ -0,0 +1,86 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright 2023 IBM
+ *
+ * See file COPYING for licensing information.
+ */
+
+#pragma once
+
+#include "include/common_fwd.h"
+#include "common/Thread.h"
+#include "rgw_frontend.h"
+#include "rgw_op.h"
+
+#include "arrow/status.h"
+
+
+namespace rgw::flight {
+
+using FlightKey = uint32_t;
+extern const FlightKey null_flight_key;
+
+class FlightServer;
+
+// RGWFrontend that runs an Arrow Flight server in its own thread.
+class FlightFrontend : public RGWFrontend {
+
+  static constexpr std::string_view server_thread_name =
+    "Arrow Flight Server thread";
+
+  RGWProcessEnv& env;
+  std::thread flight_thread;
+  RGWFrontendConfig* config;
+  int port;
+
+  const DoutPrefix dp;
+
+public:
+
+  // port <= 0 means let server decide; typically 8077
+  FlightFrontend(RGWProcessEnv& env,
+                 RGWFrontendConfig* config,
+                 int port = -1);
+  ~FlightFrontend() override;
+  int init() override;
+  int run() override;
+  void stop() override;
+  void join() override;
+
+  // no-ops; config changes do not affect the flight server
+  void pause_for_new_config() override;
+  void unpause_with_new_config() override;
+}; // class FlightFrontend
+
+// GET-object filter that tees the payload into a temp file and, when
+// the object is complete, registers it as a flight.
+class FlightGetObj_Filter : public RGWGetObj_Filter {
+
+  const RGWProcessEnv& penv;
+  const DoutPrefix dp;
+  FlightKey key;
+  uint64_t current_offset; // bytes of the object seen so far
+  uint64_t expected_size;  // total object size from the request
+  std::string uri;
+  std::string tenant_name;
+  std::string bucket_name;
+  rgw_obj_key object_key;
+  std::string temp_file_name;
+  std::ofstream temp_file;
+  arrow::Status schema_status; // outcome of parquet schema parsing
+  rgw_user user_id; // TODO: this should be removed when we do
+                    // proper flight authentication
+
+public:
+
+  FlightGetObj_Filter(const req_state* request, RGWGetObj_Filter* next);
+  ~FlightGetObj_Filter();
+
+  int handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) override;
+#if 0
+  // this would allow the range to be modified if necessary;
+  int fixup_range(off_t& ofs, off_t& end) override;
+#endif
+};
+
+} // namespace rgw::flight
diff --git a/src/rgw/rgw_formats.cc b/src/rgw/rgw_formats.cc
new file mode 100644
index 000000000..7ff312802
--- /dev/null
+++ b/src/rgw/rgw_formats.cc
@@ -0,0 +1,381 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <boost/format.hpp>
+
+#include "common/escape.h"
+#include "common/Formatter.h"
+#include "rgw/rgw_common.h"
+#include "rgw/rgw_formats.h"
+#include "rgw/rgw_rest.h"
+
+#define LARGE_SIZE 8192
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+// use_kv selects "key: value" output instead of bare values
+RGWFormatter_Plain::RGWFormatter_Plain(const bool ukv)
+  : use_kv(ukv)
+{
+}
+
+RGWFormatter_Plain::~RGWFormatter_Plain()
+{
+  free(buf); // buf is malloc/realloc-managed in write_data
+}
+
+// writes the accumulated output buffer to os, then resets it
+void RGWFormatter_Plain::flush(ostream& os)
+{
+  if (!buf)
+    return;
+
+  if (len) {
+    os << buf;
+    os.flush();
+  }
+
+  reset_buf();
+}
+
+// releases the output buffer and zeroes the length bookkeeping
+void RGWFormatter_Plain::reset_buf()
+{
+  free(buf);
+  buf = NULL;
+  len = 0;
+  max_len = 0;
+}
+
+// full reset: buffer plus section-nesting state
+void RGWFormatter_Plain::reset()
+{
+  reset_buf();
+  stack.clear();
+  min_stack_level = 0;
+}
+
+// Opens an array section; in kv mode a nested array inside an object
+// first emits a "name: " line via dump_format.
+void RGWFormatter_Plain::open_array_section(std::string_view name)
+{
+  struct plain_stack_entry new_entry;
+  new_entry.is_array = true;
+  new_entry.size = 0;
+
+  if (use_kv && min_stack_level > 0 && !stack.empty()) {
+    struct plain_stack_entry& entry = stack.back();
+
+    if (!entry.is_array)
+      dump_format(name, "");
+  }
+
+  stack.push_back(new_entry);
+}
+
+// namespace variant: the ns is simply appended to the section name
+void RGWFormatter_Plain::open_array_section_in_ns(std::string_view name, const char *ns)
+{
+  ostringstream oss;
+  oss << name << " " << ns;
+  open_array_section(oss.str().c_str());
+}
+
+// Opens an object section; in kv mode nested sections emit a header.
+void RGWFormatter_Plain::open_object_section(std::string_view name)
+{
+  struct plain_stack_entry new_entry;
+  new_entry.is_array = false;
+  new_entry.size = 0;
+
+  if (use_kv && min_stack_level > 0)
+    dump_format(name, "");
+
+  stack.push_back(new_entry);
+}
+
+void RGWFormatter_Plain::open_object_section_in_ns(std::string_view name,
+                                                   const char *ns)
+{
+  ostringstream oss;
+  oss << name << " " << ns;
+  open_object_section(oss.str().c_str());
+}
+
+void RGWFormatter_Plain::close_section()
+{
+  stack.pop_back();
+}
+
+// the plain format has no real null representation
+void RGWFormatter_Plain::dump_null(std::string_view name)
+{
+  dump_value_int(name, "null"); /* I feel a little bad about this. */
+}
+
+void RGWFormatter_Plain::dump_unsigned(std::string_view name, uint64_t u)
+{
+  dump_value_int(name, "%" PRIu64, u);
+}
+
+void RGWFormatter_Plain::dump_int(std::string_view name, int64_t u)
+{
+  dump_value_int(name, "%" PRId64, u);
+}
+
+void RGWFormatter_Plain::dump_float(std::string_view name, double d)
+{
+  dump_value_int(name, "%f", d);
+}
+
+// NOTE(review): "%.*s" expects an int precision but s.size() is a
+// size_t; passing it through varargs is a type mismatch on some ABIs
+// -- consider casting to int
+void RGWFormatter_Plain::dump_string(std::string_view name, std::string_view s)
+{
+  dump_format(name, "%.*s", s.size(), s.data());
+}
+
+// unimplemented for the plain formatter; callers must not use it
+std::ostream& RGWFormatter_Plain::dump_stream(std::string_view name)
+{
+  // TODO: implement this!
+  ceph_abort();
+}
+
+// Formats a value; output is suppressed unless this is the first item
+// at the shallowest dumped section level, or kv mode is on. The value
+// is rendered into a fixed LARGE_SIZE buffer, so oversized values are
+// silently truncated. ns and quoted are ignored in the plain format.
+void RGWFormatter_Plain::dump_format_va(std::string_view name, const char *ns, bool quoted, const char *fmt, va_list ap)
+{
+  char buf[LARGE_SIZE];
+
+  struct plain_stack_entry& entry = stack.back();
+
+  // remember the shallowest level at which anything was dumped
+  if (!min_stack_level)
+    min_stack_level = stack.size();
+
+  bool should_print = ((stack.size() == min_stack_level && !entry.size) || use_kv);
+
+  entry.size++;
+
+  if (!should_print)
+    return;
+
+  vsnprintf(buf, LARGE_SIZE, fmt, ap);
+
+  // separator: comma between kv array items, newline otherwise,
+  // nothing before the very first write
+  const char *eol;
+  if (wrote_something) {
+    if (use_kv && entry.is_array && entry.size > 1)
+      eol = ", ";
+    else
+      eol = "\n";
+  } else
+    eol = "";
+  wrote_something = true;
+
+  if (use_kv && !entry.is_array)
+    write_data("%s%.*s: %s", eol, name.size(), name.data(), buf);
+  else
+    write_data("%s%s", eol, buf);
+}
+
+int RGWFormatter_Plain::get_len() const
+{
+  // don't include null termination in length
+  return (len ? len - 1 : 0);
+}
+
+void RGWFormatter_Plain::write_raw_data(const char *data)
+{
+  write_data("%s", data);
+}
+
+// printf-style append to the growable output buffer. First formats
+// into a stack buffer, growing a heap copy as needed (per glibc
+// vsnprintf return-value semantics), then appends to buf, overwriting
+// the previous NUL terminator so output stays one contiguous C string.
+void RGWFormatter_Plain::write_data(const char *fmt, ...)
+{
+#define LARGE_ENOUGH_LEN 128
+  int n, size = LARGE_ENOUGH_LEN;
+  char s[size + 8];
+  char *p, *np;
+  bool p_on_stack;
+  va_list ap;
+  int pos;
+
+  p = s;
+  p_on_stack = true;
+
+  while (1) {
+    va_start(ap, fmt);
+    n = vsnprintf(p, size, fmt, ap);
+    va_end(ap);
+
+    if (n > -1 && n < size)
+      goto done;
+    /* Else try again with more space. */
+    if (n > -1) /* glibc 2.1 */
+      size = n+1; /* precisely what is needed */
+    else /* glibc 2.0 */
+      size *= 2; /* twice the old size */
+    if (p_on_stack)
+      np = (char *)malloc(size + 8);
+    else
+      np = (char *)realloc(p, size + 8);
+    if (!np)
+      goto done_free;
+    p = np;
+    p_on_stack = false;
+  }
+done:
+#define LARGE_ENOUGH_BUF 4096
+  if (!buf) {
+    max_len = std::max(LARGE_ENOUGH_BUF, size);
+    buf = (char *)malloc(max_len);
+    if (!buf) {
+      cerr << "ERROR: RGWFormatter_Plain::write_data: failed allocating " << max_len << " bytes" << std::endl;
+      goto done_free;
+    }
+  }
+
+  // grow the accumulated buffer if the new piece won't fit
+  if (len + size > max_len) {
+    max_len = len + size + LARGE_ENOUGH_BUF;
+    void *_realloc = NULL;
+    if ((_realloc = realloc(buf, max_len)) == NULL) {
+      cerr << "ERROR: RGWFormatter_Plain::write_data: failed allocating " << max_len << " bytes" << std::endl;
+      goto done_free;
+    } else {
+      buf = (char *)_realloc;
+    }
+  }
+
+  pos = len;
+  if (len)
+    pos--; // squash null termination
+  strcpy(buf + pos, p);
+  len = pos + strlen(p) + 1;
+done_free:
+  if (!p_on_stack)
+    free(p);
+}
+
+// Like dump_format_va but used by the typed dump_* helpers above;
+// note the near-duplicate print logic (array items here are always
+// newline-separated, never comma-separated).
+void RGWFormatter_Plain::dump_value_int(std::string_view name, const char *fmt, ...)
+{
+  char buf[LARGE_SIZE];
+  va_list ap;
+
+  if (!min_stack_level)
+    min_stack_level = stack.size();
+
+  struct plain_stack_entry& entry = stack.back();
+  bool should_print = ((stack.size() == min_stack_level && !entry.size) || use_kv);
+
+  entry.size++;
+
+  if (!should_print)
+    return;
+
+  va_start(ap, fmt);
+  vsnprintf(buf, LARGE_SIZE, fmt, ap);
+  va_end(ap);
+
+  const char *eol;
+  if (wrote_something) {
+    eol = "\n";
+  } else
+    eol = "";
+  wrote_something = true;
+
+  if (use_kv && !entry.is_array)
+    write_data("%s%.*s: %s", eol, name.size(), name.data(), buf);
+  else
+    write_data("%s%s", eol, buf);
+
+}
+
+
+/* An utility class that serves as a mean to access the protected static
+ * methods of XMLFormatter. */
+class HTMLHelper : public XMLFormatter {
+public:
+  // XML-escapes a string for embedding in the listing HTML
+  // NOTE(review): if escape_xml_attr_len counts the terminating NUL,
+  // the returned std::string carries an embedded trailing '\0' --
+  // confirm its semantics and trim if so
+  static std::string escape(const std::string& unescaped_str) {
+    int len = escape_xml_attr_len(unescaped_str.c_str());
+    std::string escaped(len, 0);
+    escape_xml_attr(unescaped_str.c_str(), escaped.data());
+    return escaped;
+  }
+};
+
+// Emits the HTML head and listing-table header for a Swift static
+// website directory listing; a non-empty css_path selects an external
+// stylesheet over the inline default styles.
+void RGWSwiftWebsiteListingFormatter::generate_header(
+  const std::string& dir_path,
+  const std::string& css_path)
+{
+  ss << R"(<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 )"
+     << R"(Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">)";
+
+  ss << "<html><head><title>Listing of " << xml_stream_escaper(dir_path)
+     << "</title>";
+
+  if (! css_path.empty()) {
+    ss << boost::format(R"(<link rel="stylesheet" type="text/css" href="%s" />)")
+       % url_encode(css_path);
+  } else {
+    ss << R"(<style type="text/css">)"
+       << R"(h1 {font-size: 1em; font-weight: bold;})"
+       << R"(th {text-align: left; padding: 0px 1em 0px 1em;})"
+       << R"(td {padding: 0px 1em 0px 1em;})"
+       << R"(a {text-decoration: none;})"
+       << R"(</style>)";
+  }
+
+  ss << "</head><body>";
+
+  ss << R"(<h1 id="title">Listing of )" << xml_stream_escaper(dir_path) << "</h1>"
+     << R"(<table id="listing">)"
+     << R"(<tr id="heading">)"
+     << R"(<th class="colname">Name</th>)"
+     << R"(<th class="colsize">Size</th>)"
+     << R"(<th class="coldate">Date</th>)"
+     << R"(</tr>)";
+
+  // link back to the parent listing unless we're at the root prefix
+  if (! prefix.empty()) {
+    ss << R"(<tr id="parent" class="item">)"
+       << R"(<td class="colname"><a href="../">../</a></td>)"
+       << R"(<td class="colsize">&nbsp;</td>)"
+       << R"(<td class="coldate">&nbsp;</td>)"
+       << R"(</tr>)";
+  }
+}
+
+void RGWSwiftWebsiteListingFormatter::generate_footer()
+{
+  ss << R"(</table></body></html>)";
+}
+
+// strips the common listing prefix from an entry name; assumes
+// item_name begins with prefix
+std::string RGWSwiftWebsiteListingFormatter::format_name(
+  const std::string& item_name) const
+{
+  return item_name.substr(prefix.length());
+}
+
+// emits one table row for an object: escaped/encoded name, size, mtime
+void RGWSwiftWebsiteListingFormatter::dump_object(const rgw_bucket_dir_entry& objent)
+{
+  const auto name = format_name(objent.key.name);
+  ss << boost::format(R"(<tr class="item %s">)")
+     % "default"
+     << boost::format(R"(<td class="colname"><a href="%s">%s</a></td>)")
+     % url_encode(name)
+     % HTMLHelper::escape(name)
+     << boost::format(R"(<td class="colsize">%lld</td>)") % objent.meta.size
+     << boost::format(R"(<td class="coldate">%s</td>)")
+     % dump_time_to_str(objent.meta.mtime)
+     << R"(</tr>)";
+}
+
+// emits one table row for a pseudo-directory entry
+void RGWSwiftWebsiteListingFormatter::dump_subdir(const std::string& name)
+{
+  const auto fname = format_name(name);
+  ss << R"(<tr class="item subdir">)"
+     << boost::format(R"(<td class="colname"><a href="%s">%s</a></td>)")
+     % url_encode(fname)
+     % HTMLHelper::escape(fname)
+     << R"(<td class="colsize">&nbsp;</td>)"
+     << R"(<td class="coldate">&nbsp;</td>)"
+     << R"(</tr>)";
+}
diff --git a/src/rgw/rgw_formats.h b/src/rgw/rgw_formats.h
new file mode 100644
index 000000000..e645d3ec2
--- /dev/null
+++ b/src/rgw/rgw_formats.h
@@ -0,0 +1,134 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "common/Formatter.h"
+
+#include <list>
+#include <stdint.h>
+#include <string>
+#include <ostream>
+
+struct plain_stack_entry {
+ int size;
+ bool is_array;
+};
+
+/* FIXME: this class is mis-named.
+ * FIXME: This was a hack to send certain swift messages.
+ * There is a much better way to do this.
+ */
+class RGWFormatter_Plain : public Formatter {
+ void reset_buf();
+public:
+ explicit RGWFormatter_Plain(bool use_kv = false);
+ ~RGWFormatter_Plain() override;
+
+ void set_status(int status, const char* status_name) override {};
+ void output_header() override {};
+ void output_footer() override {};
+ void enable_line_break() override {};
+ void flush(std::ostream& os) override;
+ void reset() override;
+
+ void open_array_section(std::string_view name) override;
+ void open_array_section_in_ns(std::string_view name, const char *ns) override;
+ void open_object_section(std::string_view name) override;
+ void open_object_section_in_ns(std::string_view name, const char *ns) override;
+ void close_section() override;
+ void dump_null(std::string_view name) override;
+ void dump_unsigned(std::string_view name, uint64_t u) override;
+ void dump_int(std::string_view name, int64_t u) override;
+ void dump_float(std::string_view name, double d) override;
+ void dump_string(std::string_view name, std::string_view s) override;
+ std::ostream& dump_stream(std::string_view name) override;
+ void dump_format_va(std::string_view name, const char *ns, bool quoted, const char *fmt, va_list ap) override;
+ int get_len() const override;
+ void write_raw_data(const char *data) override;
+
+private:
+ void write_data(const char *fmt, ...);
+ void dump_value_int(std::string_view name, const char *fmt, ...);
+
+ char *buf = nullptr;
+ int len = 0;
+ int max_len = 0;
+
+ std::list<struct plain_stack_entry> stack;
+ size_t min_stack_level = 0;
+ bool use_kv;
+ bool wrote_something = 0;
+};
+
+
/* This is a presentation layer. No logic inside, please. */
// Renders a Swift static-website directory listing as an HTML table.
// The referenced output stream must outlive this formatter.
class RGWSwiftWebsiteListingFormatter {
  std::ostream& ss;
  const std::string prefix;  // common prefix stripped from displayed names
protected:
  // name of an entry relative to `prefix` (see rgw_formats.cc)
  std::string format_name(const std::string& item_name) const;
public:
  RGWSwiftWebsiteListingFormatter(std::ostream& ss,
                                  std::string prefix)
    : ss(ss),
      prefix(std::move(prefix)) {
  }

  /* The supplied css_path can be empty. In such situation a default,
   * embedded style sheet will be generated. */
  void generate_header(const std::string& dir_path,
                       const std::string& css_path);
  void generate_footer();
  // emit a table row for a regular object
  void dump_object(const rgw_bucket_dir_entry& objent);
  // emit a table row for a pseudo-directory entry
  void dump_subdir(const std::string& name);
};
+
+
// Couples a Formatter with a "when to emit" policy: do_start() runs once
// before the first output, do_flush() pushes buffered output to its sink.
class RGWFormatterFlusher {
protected:
  Formatter *formatter;  // not owned
  bool flushed;
  bool started;
  virtual void do_flush() = 0;
  virtual void do_start(int ret) {}
  void set_formatter(Formatter *f) {
    formatter = f;
  }
public:
  explicit RGWFormatterFlusher(Formatter *f) : formatter(f), flushed(false), started(false) {}
  virtual ~RGWFormatterFlusher() {}

  void flush() {
    do_flush();
    flushed = true;
  }

  // do_start() is invoked at most once, on the first call
  virtual void start(int client_ret) {
    if (!started)
      do_start(client_ret);
    started = true;
  }

  Formatter *get_formatter() { return formatter; }
  bool did_flush() { return flushed; }
  bool did_start() { return started; }
};

// Flusher that writes the formatter's buffered output to a std::ostream.
class RGWStreamFlusher : public RGWFormatterFlusher {
  std::ostream& os;
protected:
  void do_flush() override {
    formatter->flush(os);
  }
public:
  RGWStreamFlusher(Formatter *f, std::ostream& _os) : RGWFormatterFlusher(f), os(_os) {}
};

// Flusher that discards output; useful where a flusher is required but no
// output is wanted (note it holds a null formatter).
class RGWNullFlusher : public RGWFormatterFlusher {
protected:
  void do_flush() override {
  }
public:
  RGWNullFlusher() : RGWFormatterFlusher(nullptr) {}
};
diff --git a/src/rgw/rgw_frontend.cc b/src/rgw/rgw_frontend.cc
new file mode 100644
index 000000000..ea5cbbafe
--- /dev/null
+++ b/src/rgw/rgw_frontend.cc
@@ -0,0 +1,105 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <signal.h>
+
+#include "rgw_frontend.h"
+#include "include/str_list.h"
+
+#include "include/ceph_assert.h"
+
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+int RGWFrontendConfig::parse_config(const string& config,
+ std::multimap<string, string>& config_map)
+{
+ for (auto& entry : get_str_vec(config, " ")) {
+ string key;
+ string val;
+
+ if (framework.empty()) {
+ framework = entry;
+ dout(0) << "framework: " << framework << dendl;
+ continue;
+ }
+
+ ssize_t pos = entry.find('=');
+ if (pos < 0) {
+ dout(0) << "framework conf key: " << entry << dendl;
+ config_map.emplace(std::move(entry), "");
+ continue;
+ }
+
+ int ret = parse_key_value(entry, key, val);
+ if (ret < 0) {
+ cerr << "ERROR: can't parse " << entry << std::endl;
+ return ret;
+ }
+
+ dout(0) << "framework conf key: " << key << ", val: " << val << dendl;
+ config_map.emplace(std::move(key), std::move(val));
+ }
+
+ return 0;
+}
+
// Copy defaults from def_conf for keys this config does not define.
// NOTE(review): config_map is a multimap -- for a key with several default
// values only the first iterated value is copied, since the first emplace
// makes the subsequent find() succeed.
void RGWFrontendConfig::set_default_config(RGWFrontendConfig& def_conf)
{
  const auto& def_conf_map = def_conf.get_config_map();

  for (auto& entry : def_conf_map) {
    if (config_map.find(entry.first) == config_map.end()) {
      config_map.emplace(entry.first, entry.second);
    }
  }
}

// Return the value for `key`, or nullopt when the key is absent.
// For multi-valued keys the first entry in map iteration order is returned.
std::optional<string> RGWFrontendConfig::get_val(const std::string& key)
{
  auto iter = config_map.find(key);
  if (iter == config_map.end()) {
    return std::nullopt;
  }

  return iter->second;
}

// Look up `key`: on a miss store def_val in *out and return false,
// otherwise store the configured value and return true.
bool RGWFrontendConfig::get_val(const string& key, const string& def_val,
                                string *out)
{
  auto iter = config_map.find(key);
  if (iter == config_map.end()) {
    *out = def_val;
    return false;
  }

  *out = iter->second;
  return true;
}
+
+bool RGWFrontendConfig::get_val(const string& key, int def_val, int *out)
+{
+ string str;
+ bool found = get_val(key, "", &str);
+ if (!found) {
+ *out = def_val;
+ return false;
+ }
+ string err;
+ *out = strict_strtol(str.c_str(), 10, &err);
+ if (!err.empty()) {
+ cerr << "error parsing int: " << str << ": " << err << std::endl;
+ return -EINVAL;
+ }
+ return 0;
+}
+
// Shut the frontend down: close the listening fd so no new requests are
// accepted, then interrupt the control thread with SIGUSR1.
void RGWProcessFrontend::stop()
{
  pprocess->close_fd();
  thread->kill(SIGUSR1);
}
diff --git a/src/rgw/rgw_frontend.h b/src/rgw/rgw_frontend.h
new file mode 100644
index 000000000..ca1a8cba1
--- /dev/null
+++ b/src/rgw/rgw_frontend.h
@@ -0,0 +1,211 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "common/RWLock.h"
+
+#include "rgw_request.h"
+#include "rgw_process.h"
+#include "rgw_process_env.h"
+#include "rgw_realm_reloader.h"
+
+#include "rgw_auth_registry.h"
+#include "rgw_sal_rados.h"
+
+#define dout_context g_ceph_context
+
+namespace rgw::dmclock {
+ class SyncScheduler;
+ class ClientConfig;
+ class SchedulerCtx;
+}
+
// Parsed representation of one "rgw frontends" config entry, e.g.
// "beast port=8000". The first token selects the framework; the remaining
// key[=value] tokens are kept in a multimap.
class RGWFrontendConfig {
  std::string config;     // raw config string as supplied
  std::multimap<std::string, std::string> config_map;
  std::string framework;  // first token of the config string

  int parse_config(const std::string& config,
                   std::multimap<std::string, std::string>& config_map);

public:
  explicit RGWFrontendConfig(const std::string& config)
    : config(config) {
  }

  // parse the raw string; must be called before any get_val()
  int init() {
    const int ret = parse_config(config, config_map);
    return ret < 0 ? ret : 0;
  }

  // copy missing keys from def_conf (see rgw_frontend.cc for multimap caveats)
  void set_default_config(RGWFrontendConfig& def_conf);

  std::optional<std::string> get_val(const std::string& key);

  bool get_val(const std::string& key,
               const std::string& def_val,
               std::string* out);
  bool get_val(const std::string& key, int def_val, int *out);

  // convenience overload: returns the value, or def_val on a miss
  std::string get_val(const std::string& key,
                      const std::string& def_val) {
    std::string out;
    get_val(key, def_val, &out);
    return out;
  }

  const std::string& get_config() {
    return config;
  }

  std::multimap<std::string, std::string>& get_config_map() {
    return config_map;
  }

  std::string get_framework() const {
    return framework;
  }
};
+
// Abstract interface for an RGW frontend. Lifecycle:
// init() -> run() -> stop() -> join(); pause_for_new_config() /
// unpause_with_new_config() bracket realm reconfiguration.
class RGWFrontend {
public:
  virtual ~RGWFrontend() {}

  virtual int init() = 0;

  virtual int run() = 0;
  virtual void stop() = 0;
  virtual void join() = 0;

  virtual void pause_for_new_config() = 0;
  virtual void unpause_with_new_config() = 0;
};


// Frontend backed by an RGWProcess. Owns the process object (created by a
// subclass's init()) and the control thread (created in run()).
class RGWProcessFrontend : public RGWFrontend {
protected:
  RGWFrontendConfig* conf;          // not owned
  RGWProcess* pprocess;             // owned; set by subclass init()
  RGWProcessEnv& env;
  RGWProcessControlThread* thread;  // owned; created in run()

public:
  RGWProcessFrontend(RGWProcessEnv& pe, RGWFrontendConfig* _conf)
    : conf(_conf), pprocess(nullptr), env(pe), thread(nullptr) {
  }

  ~RGWProcessFrontend() override {
    delete thread;
    delete pprocess;
  }

  int run() override {
    ceph_assert(pprocess); /* should have initialized by init() */
    thread = new RGWProcessControlThread(pprocess);
    thread->create("rgw_frontend");
    return 0;
  }

  void stop() override;

  // NOTE(review): join() dereferences `thread`; callers must have called
  // run() first.
  void join() override {
    thread->join();
  }

  void pause_for_new_config() override {
    pprocess->pause();
  }

  void unpause_with_new_config() override {
    pprocess->unpause_with_new_config();
  }
}; /* RGWProcessFrontend */
+
// Load-generator frontend: drives synthetic S3 requests using a configured
// user's access key. Also serves as its own dout prefix provider.
class RGWLoadGenFrontend : public RGWProcessFrontend, public DoutPrefixProvider {
public:
  RGWLoadGenFrontend(RGWProcessEnv& pe, RGWFrontendConfig *_conf)
    : RGWProcessFrontend(pe, _conf) {}

  // DoutPrefixProvider interface.
  // NOTE(review): these appear to override DoutPrefixProvider virtuals but
  // lack the `override` keyword -- confirm against the base declaration.
  CephContext *get_cct() const {
    return env.driver->ctx();
  }

  unsigned get_subsys() const
  {
    return ceph_subsys_rgw;
  }

  std::ostream& gen_prefix(std::ostream& out) const
  {
    return out << "rgw loadgen frontend: ";
  }

  // Create the loadgen process and look up the configured user's first S3
  // access key. Fails with -EINVAL if "uid" is missing or the user has no
  // keys; propagates load_user() errors.
  int init() override {
    int num_threads;
    conf->get_val("num_threads", g_conf()->rgw_thread_pool_size, &num_threads);
    std::string uri_prefix;
    conf->get_val("prefix", "", &uri_prefix);

    RGWLoadGenProcess *pp = new RGWLoadGenProcess(
      g_ceph_context, env, num_threads, std::move(uri_prefix), conf);

    // ownership transfers to ~RGWProcessFrontend via pprocess
    pprocess = pp;

    std::string uid_str;
    conf->get_val("uid", "", &uid_str);
    if (uid_str.empty()) {
      derr << "ERROR: uid param must be specified for loadgen frontend"
           << dendl;
      return -EINVAL;
    }

    rgw_user uid(uid_str);
    std::unique_ptr<rgw::sal::User> user = env.driver->get_user(uid);

    int ret = user->load_user(this, null_yield);
    if (ret < 0) {
      derr << "ERROR: failed reading user info: uid=" << uid << " ret="
           << ret << dendl;
      return ret;
    }

    auto aiter = user->get_info().access_keys.begin();
    if (aiter == user->get_info().access_keys.end()) {
      derr << "ERROR: user has no S3 access keys set" << dendl;
      return -EINVAL;
    }

    pp->set_access_key(aiter->second);

    return 0;
  }
}; /* RGWLoadGenFrontend */
+
+// FrontendPauser implementation for RGWRealmReloader
+class RGWFrontendPauser : public RGWRealmReloader::Pauser {
+ std::vector<RGWFrontend*> &frontends;
+ RGWRealmReloader::Pauser* pauser;
+
+ public:
+ RGWFrontendPauser(std::vector<RGWFrontend*> &frontends,
+ RGWRealmReloader::Pauser* pauser = nullptr)
+ : frontends(frontends), pauser(pauser) {}
+
+ void pause() override {
+ for (auto frontend : frontends)
+ frontend->pause_for_new_config();
+ if (pauser)
+ pauser->pause();
+ }
+ void resume(rgw::sal::Driver* driver) override {
+ for (auto frontend : frontends)
+ frontend->unpause_with_new_config();
+ if (pauser)
+ pauser->resume(driver);
+ }
+};
diff --git a/src/rgw/rgw_gc_log.h b/src/rgw/rgw_gc_log.h
new file mode 100644
index 000000000..a37672617
--- /dev/null
+++ b/src/rgw/rgw_gc_log.h
@@ -0,0 +1,28 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "include/rados/librados.hpp"
+#include "cls/rgw/cls_rgw_types.h"
+
+
+// initialize the cls_rgw_gc queue
+void gc_log_init2(librados::ObjectWriteOperation& op,
+ uint64_t max_size, uint64_t max_deferred);
+
+// enqueue a gc entry to omap with cls_rgw
+void gc_log_enqueue1(librados::ObjectWriteOperation& op,
+ uint32_t expiration, cls_rgw_gc_obj_info& info);
+
+// enqueue a gc entry to the cls_rgw_gc queue
+void gc_log_enqueue2(librados::ObjectWriteOperation& op,
+ uint32_t expiration, const cls_rgw_gc_obj_info& info);
+
+// defer a gc entry in omap with cls_rgw
+void gc_log_defer1(librados::ObjectWriteOperation& op,
+ uint32_t expiration, const cls_rgw_gc_obj_info& info);
+
+// defer a gc entry in the cls_rgw_gc queue
+void gc_log_defer2(librados::ObjectWriteOperation& op,
+ uint32_t expiration, const cls_rgw_gc_obj_info& info);
diff --git a/src/rgw/rgw_http_client.cc b/src/rgw/rgw_http_client.cc
new file mode 100644
index 000000000..255db71a5
--- /dev/null
+++ b/src/rgw/rgw_http_client.cc
@@ -0,0 +1,1223 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "include/compat.h"
+#include "common/errno.h"
+
+
+#include <curl/curl.h>
+#include <curl/easy.h>
+#include <curl/multi.h>
+
+#include "rgw_common.h"
+#include "rgw_http_client.h"
+#include "rgw_http_errors.h"
+#include "common/async/completion.h"
+#include "common/RefCountedObj.h"
+
+#include "rgw_coroutine.h"
+#include "rgw_tools.h"
+
+#include <atomic>
+#include <string_view>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+RGWHTTPManager *rgw_http_manager;
+
+struct RGWCurlHandle;
+
+static void do_curl_easy_cleanup(RGWCurlHandle *curl_handle);
+
// Per-request state shared between an RGWHTTPClient and the RGWHTTPManager
// event loop. Ref-counted because both sides hold it; `lock` guards the
// mutable fields touched from curl callbacks and waiters.
struct rgw_http_req_data : public RefCountedObject {
  RGWCurlHandle *curl_handle{nullptr};
  curl_slist *h{nullptr};           // request header list; freed in finish()
  uint64_t id;                      // manager-assigned id (register_request)
  int ret{0};
  std::atomic<bool> done = { false };
  RGWHTTPClient *client{nullptr};
  rgw_io_id control_io_id;
  void *user_info{nullptr};
  bool registered{false};           // still tracked by a manager
  RGWHTTPManager *mgr{nullptr};
  char error_buf[CURL_ERROR_SIZE];  // filled by curl via CURLOPT_ERRORBUFFER
  bool write_paused{false};
  bool read_paused{false};

  optional<int> user_ret;           // error reported by a client callback

  ceph::mutex lock = ceph::make_mutex("rgw_http_req_data::lock");
  ceph::condition_variable cond;

  using Signature = void(boost::system::error_code);
  using Completion = ceph::async::Completion<Signature>;
  std::unique_ptr<Completion> completion;  // armed when a coroutine waits

  rgw_http_req_data() : id(-1) {
    // FIPS zeroization audit 20191115: this memset is not security related.
    memset(error_buf, 0, sizeof(error_buf));
  }

  // Arm an asio completion handler that finish() will post to.
  template <typename ExecutionContext, typename CompletionToken>
  auto async_wait(ExecutionContext& ctx, CompletionToken&& token) {
    boost::asio::async_completion<CompletionToken, Signature> init(token);
    auto& handler = init.completion_handler;
    {
      std::unique_lock l{lock};
      completion = Completion::create(ctx.get_executor(), std::move(handler));
    }
    return init.result.get();
  }

  // Wait until finish() is called and return the result. With an
  // optional_yield this suspends the coroutine; otherwise it blocks on the
  // condition variable (warning if that happens on an asio thread).
  int wait(optional_yield y) {
    if (done) {
      return ret;
    }
    if (y) {
      auto& context = y.get_io_context();
      auto& yield = y.get_yield_context();
      boost::system::error_code ec;
      async_wait(context, yield[ec]);
      return -ec.value();
    }
    // work on asio threads should be asynchronous, so warn when they block
    if (is_asio_thread) {
      dout(20) << "WARNING: blocking http request" << dendl;
    }
    std::unique_lock l{lock};
    cond.wait(l, [this]{return done==true;});
    return ret;
  }

  void set_state(int bitmask);

  // Record the final result, release curl resources, and wake the waiter:
  // post to the asio completion if one is armed, else notify the condvar.
  void finish(int r, long http_status = -1) {
    std::lock_guard l{lock};
    if (http_status != -1) {
      if (client) {
        client->set_http_status(http_status);
      }
    }
    ret = r;
    if (curl_handle)
      do_curl_easy_cleanup(curl_handle);

    if (h)
      curl_slist_free_all(h);

    curl_handle = NULL;
    h = NULL;
    done = true;
    if (completion) {
      boost::system::error_code ec(-ret, boost::system::system_category());
      Completion::post(std::move(completion), ec);
    } else {
      cond.notify_all();
    }
  }

  bool is_done() {
    return done;
  }

  int get_retcode() {
    std::lock_guard l{lock};
    return ret;
  }

  RGWHTTPManager *get_manager() {
    std::lock_guard l{lock};
    return mgr;
  }

  CURL *get_easy_handle() const;
};
+
// Thin wrapper pairing a CURL easy handle with the bookkeeping used by the
// RGWCurlHandles reuse cache. operator*() exposes the raw handle.
struct RGWCurlHandle {
  int uses;
  mono_time lastuse;  // when the handle was last returned to the cache
  CURL* h;

  explicit RGWCurlHandle(CURL* h) : uses(0), h(h) {};
  CURL* operator*() {
    return this->h;
  }
};

// Pause or resume curl transfer directions for this request (bitmask is a
// CURLPAUSE_* combination); errors are only logged.
void rgw_http_req_data::set_state(int bitmask) {
  /* no need to lock here, moreover curl_easy_pause() might trigger
   * the data receive callback :/
   */
  CURLcode rc = curl_easy_pause(**curl_handle, bitmask);
  if (rc != CURLE_OK) {
    dout(0) << "ERROR: curl_easy_pause() returned rc=" << rc << dendl;
  }
}
+
#define MAXIDLE 5  // seconds a cached curl handle may stay unused
// Cache of reusable CURL easy handles plus a background cleaner thread
// (entry()) that destroys handles idle longer than MAXIDLE seconds.
class RGWCurlHandles : public Thread {
public:
  ceph::mutex cleaner_lock = ceph::make_mutex("RGWCurlHandles::cleaner_lock");
  std::vector<RGWCurlHandle*> saved_curl;  // front = most recently released
  int cleaner_shutdown;                    // used as a bool flag
  ceph::condition_variable cleaner_cond;

  RGWCurlHandles() :
    cleaner_shutdown{0} {
  }

  RGWCurlHandle* get_curl_handle();
  void release_curl_handle_now(RGWCurlHandle* curl);
  void release_curl_handle(RGWCurlHandle* curl);
  void flush_curl_handles();
  void* entry();
  void stop();
};
+
+RGWCurlHandle* RGWCurlHandles::get_curl_handle() {
+ RGWCurlHandle* curl = 0;
+ CURL* h;
+ {
+ std::lock_guard lock{cleaner_lock};
+ if (!saved_curl.empty()) {
+ curl = *saved_curl.begin();
+ saved_curl.erase(saved_curl.begin());
+ }
+ }
+ if (curl) {
+ } else if ((h = curl_easy_init())) {
+ curl = new RGWCurlHandle{h};
+ } else {
+ // curl = 0;
+ }
+ return curl;
+}
+
// Destroy a handle immediately, bypassing the reuse cache.
void RGWCurlHandles::release_curl_handle_now(RGWCurlHandle* curl)
{
  curl_easy_cleanup(**curl);
  delete curl;
}

// Return a handle to the cache for reuse; once shutdown has begun it is
// destroyed instead.
// NOTE(review): cleaner_shutdown is read before taking cleaner_lock, so a
// release racing with stop() may still cache the handle -- it is then
// drained by entry()/flush_curl_handles().
void RGWCurlHandles::release_curl_handle(RGWCurlHandle* curl)
{
  if (cleaner_shutdown) {
    release_curl_handle_now(curl);
  } else {
    curl_easy_reset(**curl);
    std::lock_guard lock{cleaner_lock};
    curl->lastuse = mono_clock::now();
    saved_curl.insert(saved_curl.begin(), 1, curl);
  }
}
+
// Cleaner thread body: wake every MAXIDLE seconds (or on stop()) and
// destroy cached handles that have been idle too long; after shutdown the
// loop drains every remaining handle and exits.
void* RGWCurlHandles::entry()
{
  RGWCurlHandle* curl;
  std::unique_lock lock{cleaner_lock};

  for (;;) {
    if (cleaner_shutdown) {
      if (saved_curl.empty())
        break;
    } else {
      cleaner_cond.wait_for(lock, std::chrono::seconds(MAXIDLE));
    }
    mono_time now = mono_clock::now();
    // the oldest handles sit at the back of the vector
    while (!saved_curl.empty()) {
      auto cend = saved_curl.end();
      --cend;
      curl = *cend;
      if (!cleaner_shutdown && now - curl->lastuse < std::chrono::seconds(MAXIDLE))
        break;
      saved_curl.erase(cend);
      release_curl_handle_now(curl);
    }
  }
  return nullptr;
}
+
// Signal the cleaner thread to shut down and wake it up.
void RGWCurlHandles::stop()
{
  std::lock_guard lock{cleaner_lock};
  cleaner_shutdown = 1;
  cleaner_cond.notify_all();
}

// Stop and join the cleaner thread, then verify every cached handle was
// destroyed (the cleaner drains the cache on shutdown).
void RGWCurlHandles::flush_curl_handles()
{
  stop();
  join();
  if (!saved_curl.empty()) {
    dout(0) << "ERROR: " << __func__ << " failed final cleanup" << dendl;
  }
  saved_curl.shrink_to_fit();
}
+
// Raw CURL easy handle backing this request (curl_handle must be set).
CURL *rgw_http_req_data::get_easy_handle() const
{
  return **curl_handle;
}
+
// process-wide curl handle cache, created by rgw_setup_saved_curl_handles()
static RGWCurlHandles *handles;

// Acquire a (possibly cached) curl easy handle from the global cache.
static RGWCurlHandle *do_curl_easy_init()
{
  return handles->get_curl_handle();
}

// Release a handle back to the global cache.
static void do_curl_easy_cleanup(RGWCurlHandle *curl_handle)
{
  handles->release_curl_handle(curl_handle);
}
+
// XXX make this part of the token cache? (but that's swift-only;
// and this especially needs to integrates with s3...)

// Create the global curl-handle cache and start its cleaner thread.
void rgw_setup_saved_curl_handles()
{
  handles = new RGWCurlHandles();
  handles->create("rgw_curl");
}

// Tear down the global cache; call only after all requests have finished.
void rgw_release_all_curl_handles()
{
  handles->flush_curl_handles();
  delete handles;
}
+
// Lazily assign this provider a unique io id (0 means "not yet assigned").
// NOTE(review): io_type is currently unused here.
void RGWIOProvider::assign_io(RGWIOIDProvider& io_id_provider, int io_type)
{
  if (id == 0) {
    id = io_id_provider.get_next();
  }
}
+
// Construct a client for a single METHOD/URL pair; init() parses the URL
// into protocol/host/resource prefix. SSL verification follows the
// rgw_verify_ssl config option.
RGWHTTPClient::RGWHTTPClient(CephContext *cct,
                             const string& _method,
                             const string& _url)
  : NoDoutPrefix(cct, dout_subsys),
    has_send_len(false),
    http_status(HTTP_STATUS_NOSTATUS),
    req_data(nullptr),
    verify_ssl(cct->_conf->rgw_verify_ssl),
    cct(cct),
    method(_method),
    url(_url) {
  init();
}

// Log prefix: identify the client by method and url.
std::ostream& RGWHTTPClient::gen_prefix(std::ostream& out) const
{
  out << "http_client[" << method << "/" << url << "]";
  return out;
}
+
// Split `url` into protocol, host and resource_prefix. Without a "://"
// separator the whole string is treated as the host; a non-empty resource
// prefix is normalized to end with '/'.
void RGWHTTPClient::init()
{
  auto pos = url.find("://");
  if (pos == string::npos) {
    host = url;
    return;
  }

  protocol = url.substr(0, pos);

  pos += 3;  // skip over "://"

  auto host_end_pos = url.find("/", pos);
  if (host_end_pos == string::npos) {
    host = url.substr(pos);
    return;
  }

  host = url.substr(pos, host_end_pos - pos);
  resource_prefix = url.substr(host_end_pos + 1);
  if (resource_prefix.size() > 0 && resource_prefix[resource_prefix.size() - 1] != '/') {
    resource_prefix.append("/");
  }
}
+
/*
 * the following set of callbacks will be called either on RGWHTTPManager::process(),
 * or via the RGWHTTPManager async processing.
 */
// CURLOPT_HEADERFUNCTION: forward one response-header line to the client.
// Returning a value other than `len` (here CURLE_WRITE_ERROR) aborts the
// transfer; an already-unregistered request just swallows the data.
size_t RGWHTTPClient::receive_http_header(void * const ptr,
                                          const size_t size,
                                          const size_t nmemb,
                                          void * const _info)
{
  rgw_http_req_data *req_data = static_cast<rgw_http_req_data *>(_info);
  size_t len = size * nmemb;

  std::lock_guard l{req_data->lock};

  if (!req_data->registered) {
    return len;
  }

  int ret = req_data->client->receive_header(ptr, size * nmemb);
  if (ret < 0) {
    dout(5) << "WARNING: client->receive_header() returned ret=" << ret << dendl;
    req_data->user_ret = ret;
    return CURLE_WRITE_ERROR;
  }

  return len;
}

// CURLOPT_WRITEFUNCTION: deliver body data to the client. When the client
// asks to pause, the whole chunk length is remembered in receive_pause_skip
// so the bytes curl re-delivers after resume are skipped, and
// CURL_WRITEFUNC_PAUSE is returned.
size_t RGWHTTPClient::receive_http_data(void * const ptr,
                                        const size_t size,
                                        const size_t nmemb,
                                        void * const _info)
{
  rgw_http_req_data *req_data = static_cast<rgw_http_req_data *>(_info);
  size_t len = size * nmemb;

  bool pause = false;

  RGWHTTPClient *client;

  {
    std::lock_guard l{req_data->lock};
    if (!req_data->registered) {
      return len;
    }

    client = req_data->client;
  }

  // bytes already handed to the client before a pause; skip the replay
  size_t& skip_bytes = client->receive_pause_skip;

  if (skip_bytes >= len) {
    skip_bytes -= len;
    return len;
  }

  int ret = client->receive_data((char *)ptr + skip_bytes, len - skip_bytes, &pause);
  if (ret < 0) {
    dout(5) << "WARNING: client->receive_data() returned ret=" << ret << dendl;
    req_data->user_ret = ret;
    return CURLE_WRITE_ERROR;
  }

  if (pause) {
    dout(20) << "RGWHTTPClient::receive_http_data(): pause" << dendl;
    skip_bytes = len;
    std::lock_guard l{req_data->lock};
    req_data->read_paused = true;
    return CURL_WRITEFUNC_PAUSE;
  }

  skip_bytes = 0;

  return len;
}

// CURLOPT_READFUNCTION: pull upload data from the client. Returning 0 on
// an unregistered request signals EOF; CURL_READFUNC_PAUSE suspends the
// upload until the manager resumes it.
size_t RGWHTTPClient::send_http_data(void * const ptr,
                                     const size_t size,
                                     const size_t nmemb,
                                     void * const _info)
{
  rgw_http_req_data *req_data = static_cast<rgw_http_req_data *>(_info);

  RGWHTTPClient *client;

  {
    std::lock_guard l{req_data->lock};

    if (!req_data->registered) {
      return 0;
    }

    client = req_data->client;
  }

  bool pause = false;

  int ret = client->send_data(ptr, size * nmemb, &pause);
  if (ret < 0) {
    dout(5) << "WARNING: client->send_data() returned ret=" << ret << dendl;
    req_data->user_ret = ret;
    return CURLE_READ_ERROR;
  }

  if (ret == 0 &&
      pause) {
    std::lock_guard l{req_data->lock};
    req_data->write_paused = true;
    return CURL_READFUNC_PAUSE;
  }

  return ret;
}
+
// Expose the per-request lock so callers can serialize with curl callbacks.
ceph::mutex& RGWHTTPClient::get_req_lock()
{
  return req_data->lock;
}

// Ask the manager to pause/resume the send (upload) side of this request.
// No-op if the state already matches. Caller must hold req_data->lock.
void RGWHTTPClient::_set_write_paused(bool pause)
{
  ceph_assert(ceph_mutex_is_locked(req_data->lock));

  RGWHTTPManager *mgr = req_data->mgr;
  if (pause == req_data->write_paused) {
    return;
  }
  if (pause) {
    mgr->set_request_state(this, SET_WRITE_PAUSED);
  } else {
    mgr->set_request_state(this, SET_WRITE_RESUME);
  }
}

// Ask the manager to pause/resume the receive (download) side.
// No-op if the state already matches. Caller must hold req_data->lock.
void RGWHTTPClient::_set_read_paused(bool pause)
{
  ceph_assert(ceph_mutex_is_locked(req_data->lock));

  RGWHTTPManager *mgr = req_data->mgr;
  if (pause == req_data->read_paused) {
    return;
  }
  if (pause) {
    mgr->set_request_state(this, SET_READ_PAUSED);
  } else {
    mgr->set_request_state(this, SET_READ_RESUME);
  }
}
+
+static curl_slist *headers_to_slist(param_vec_t& headers)
+{
+ curl_slist *h = NULL;
+
+ param_vec_t::iterator iter;
+ for (iter = headers.begin(); iter != headers.end(); ++iter) {
+ pair<string, string>& p = *iter;
+ string val = p.first;
+
+ if (strncmp(val.c_str(), "HTTP_", 5) == 0) {
+ val = val.substr(5);
+ }
+
+ /* we need to convert all underscores into dashes as some web servers forbid them
+ * in the http header field names
+ */
+ for (size_t i = 0; i < val.size(); i++) {
+ if (val[i] == '_') {
+ val[i] = '-';
+ }
+ }
+
+ val = camelcase_dash_http_attr(val);
+
+ // curl won't send headers with empty values unless it ends with a ; instead
+ if (p.second.empty()) {
+ val.append(1, ';');
+ } else {
+ val.append(": ");
+ val.append(p.second);
+ }
+ h = curl_slist_append(h, val.c_str());
+ }
+
+ return h;
+}
+
// Whether the HTTP method implies a request body we must upload (drives
// CURLOPT_UPLOAD). Comparison is case-sensitive, matching the canonical
// upper-case method names used throughout RGW.
// Fix: qualify std::string instead of relying on the file-wide
// `using namespace std;`.
static bool is_upload_request(const std::string& method)
{
  return method == "POST" || method == "PUT";
}
+
/*
 * process a single simple one off request
 */
// Synchronously submit this request and wait for its completion.
int RGWHTTPClient::process(optional_yield y)
{
  return RGWHTTP::process(this, y);
}
+
+string RGWHTTPClient::to_str()
+{
+ string method_str = (method.empty() ? "<no-method>" : method);
+ string url_str = (url.empty() ? "<no-url>" : url);
+ return method_str + " " + url_str;
+}
+
+int RGWHTTPClient::get_req_retcode()
+{
+ if (!req_data) {
+ return -EINVAL;
+ }
+
+ return req_data->get_retcode();
+}
+
/*
 * init request, will be used later with RGWHTTPManager
 */
// Bind this client to fresh per-request state and configure the curl easy
// handle: callbacks, header list, timeouts, TLS options and upload mode.
int RGWHTTPClient::init_request(rgw_http_req_data *_req_data)
{
  ceph_assert(!req_data);
  _req_data->get();  // hold a ref for the lifetime of this client
  req_data = _req_data;

  req_data->curl_handle = do_curl_easy_init();

  CURL *easy_handle = req_data->get_easy_handle();

  dout(20) << "sending request to " << url << dendl;

  curl_slist *h = headers_to_slist(headers);

  req_data->h = h;

  curl_easy_setopt(easy_handle, CURLOPT_CUSTOMREQUEST, method.c_str());
  curl_easy_setopt(easy_handle, CURLOPT_URL, url.c_str());
  curl_easy_setopt(easy_handle, CURLOPT_NOPROGRESS, 1L);
  curl_easy_setopt(easy_handle, CURLOPT_NOSIGNAL, 1L);
  curl_easy_setopt(easy_handle, CURLOPT_HEADERFUNCTION, receive_http_header);
  curl_easy_setopt(easy_handle, CURLOPT_WRITEHEADER, (void *)req_data);
  curl_easy_setopt(easy_handle, CURLOPT_WRITEFUNCTION, receive_http_data);
  curl_easy_setopt(easy_handle, CURLOPT_WRITEDATA, (void *)req_data);
  curl_easy_setopt(easy_handle, CURLOPT_ERRORBUFFER, (void *)req_data->error_buf);
  curl_easy_setopt(easy_handle, CURLOPT_LOW_SPEED_TIME, cct->_conf->rgw_curl_low_speed_time);
  curl_easy_setopt(easy_handle, CURLOPT_LOW_SPEED_LIMIT, cct->_conf->rgw_curl_low_speed_limit);
  curl_easy_setopt(easy_handle, CURLOPT_TCP_KEEPALIVE, cct->_conf->rgw_curl_tcp_keepalive);
  curl_easy_setopt(easy_handle, CURLOPT_READFUNCTION, send_http_data);
  curl_easy_setopt(easy_handle, CURLOPT_READDATA, (void *)req_data);
  curl_easy_setopt(easy_handle, CURLOPT_BUFFERSIZE, cct->_conf->rgw_curl_buffersize);
  // switch curl into upload mode when we have (or expect) a request body
  if (send_data_hint || is_upload_request(method)) {
    curl_easy_setopt(easy_handle, CURLOPT_UPLOAD, 1L);
  }
  if (has_send_len) {
    // TODO: prevent overflow by using curl_off_t
    // and: CURLOPT_INFILESIZE_LARGE, CURLOPT_POSTFIELDSIZE_LARGE
    const long size = send_len;
    curl_easy_setopt(easy_handle, CURLOPT_INFILESIZE, size);
    if (method == "POST") {
      curl_easy_setopt(easy_handle, CURLOPT_POSTFIELDSIZE, size);
      // TODO: set to size smaller than 1MB should prevent the "Expect" field
      // from being sent. So explicit removal is not needed
      h = curl_slist_append(h, "Expect:");
    }
  }

  if (method == "HEAD") {
    curl_easy_setopt(easy_handle, CURLOPT_NOBODY, 1L);
  }

  if (h) {
    curl_easy_setopt(easy_handle, CURLOPT_HTTPHEADER, (void *)h);
  }
  if (!verify_ssl) {
    curl_easy_setopt(easy_handle, CURLOPT_SSL_VERIFYPEER, 0L);
    curl_easy_setopt(easy_handle, CURLOPT_SSL_VERIFYHOST, 0L);
    dout(20) << "ssl verification is set to off" << dendl;
  } else {
    if (!ca_path.empty()) {
      curl_easy_setopt(easy_handle, CURLOPT_CAINFO, ca_path.c_str());
      dout(20) << "using custom ca cert "<< ca_path.c_str() << " for ssl" << dendl;
    }
    if (!client_cert.empty()) {
      if (!client_key.empty()) {
	curl_easy_setopt(easy_handle, CURLOPT_SSLCERT, client_cert.c_str());
	curl_easy_setopt(easy_handle, CURLOPT_SSLKEY, client_key.c_str());
	dout(20) << "using custom client cert " << client_cert.c_str()
	  << " and private key " << client_key.c_str() << dendl;
      } else {
	dout(5) << "private key is missing for client certificate" << dendl;
      }
    }
  }
  curl_easy_setopt(easy_handle, CURLOPT_PRIVATE, (void *)req_data);
  curl_easy_setopt(easy_handle, CURLOPT_TIMEOUT, req_timeout);

  return 0;
}
+
// Has the request completed (successfully or not)?
bool RGWHTTPClient::is_done()
{
  return req_data->is_done();
}

/*
 * wait for async request to complete
 */
int RGWHTTPClient::wait(optional_yield y)
{
  return req_data->wait(y);
}
+
// Withdraw this request from its manager, if it is still registered.
void RGWHTTPClient::cancel()
{
  if (req_data) {
    RGWHTTPManager *http_manager = req_data->mgr;
    if (http_manager) {
      http_manager->remove_request(this);
    }
  }
}

// Cancel any in-flight request and drop our reference on the shared
// request state.
RGWHTTPClient::~RGWHTTPClient()
{
  cancel();
  if (req_data) {
    req_data->put();
  }
}
+
+
// Collect the values of response headers named in `relevant_headers`.
// Always returns 0 (keep the transfer going); malformed or irrelevant
// lines are silently skipped.
int RGWHTTPHeadersCollector::receive_header(void * const ptr, const size_t len)
{
  const std::string_view header_line(static_cast<const char *>(ptr), len);

  /* We're tokening the line that way due to backward compatibility. */
  const size_t sep_loc = header_line.find_first_of(" \t:");

  if (std::string_view::npos == sep_loc) {
    /* Wrongly formatted header? Just skip it. */
    return 0;
  }

  header_name_t name(header_line.substr(0, sep_loc));
  if (0 == relevant_headers.count(name)) {
    /* Not interested in this particular header. */
    return 0;
  }

  const auto value_part = header_line.substr(sep_loc + 1);

  /* Skip spaces and tabs after the separator. */
  const size_t val_loc_s = value_part.find_first_not_of(' ');
  // trim the trailing CRLF by stopping at the first '\r' or '\n'
  const size_t val_loc_e = value_part.find_first_of("\r\n");

  if (std::string_view::npos == val_loc_s ||
      std::string_view::npos == val_loc_e) {
    /* Empty value case. */
    found_headers.emplace(name, header_value_t());
  } else {
    found_headers.emplace(name, header_value_t(
        value_part.substr(val_loc_s, val_loc_e - val_loc_s)));
  }

  return 0;
}
+
// Feed the POST payload to curl in chunks; returns the number of bytes
// copied, 0 once the payload is exhausted (curl treats that as EOF).
// NOTE(review): length_to_copy is an int receiving a size_t min() --
// fine for realistic payload sizes but would narrow above INT_MAX.
int RGWHTTPTransceiver::send_data(void* ptr, size_t len, bool* pause)
{
  int length_to_copy = 0;
  if (post_data_index < post_data.length()) {
    length_to_copy = min(post_data.length() - post_data_index, len);
    memcpy(ptr, post_data.data() + post_data_index, length_to_copy);
    post_data_index += length_to_copy;
  }
  return length_to_copy;
}
+
+
// Drain the self-pipe used to wake curl_multi_wait(); the fd is
// non-blocking, so EAGAIN just means the pipe is already empty.
static int clear_signal(int fd)
{
  // since we're in non-blocking mode, we can try to read a lot more than
  // one signal from signal_thread() to avoid later wakeups
  std::array<char, 256> buf{};
  int ret = ::read(fd, (void *)buf.data(), buf.size());
  if (ret < 0) {
    ret = -errno;
    return ret == -EAGAIN ? 0 : ret; // clear EAGAIN
  }
  return 0;
}

// Wait for activity on curl's sockets or the wakeup pipe (signal_fd),
// clearing the pipe when it fired. Returns 0 or a negative error.
static int do_curl_wait(CephContext *cct, CURLM *handle, int signal_fd)
{
  int num_fds;
  struct curl_waitfd wait_fd;

  wait_fd.fd = signal_fd;
  wait_fd.events = CURL_WAIT_POLLIN;
  wait_fd.revents = 0;

  int ret = curl_multi_wait(handle, &wait_fd, 1, cct->_conf->rgw_curl_wait_timeout_ms, &num_fds);
  if (ret) {
    ldout(cct, 0) << "ERROR: curl_multi_wait() returned " << ret << dendl;
    return -EIO;
  }

  if (wait_fd.revents > 0) {
    ret = clear_signal(signal_fd);
    if (ret < 0) {
      ldout(cct, 0) << "ERROR: " << __func__ << "(): read() returned " << ret << dendl;
      return ret;
    }
  }
  return 0;
}
+
// Thread trampoline: run the manager's event loop.
void *RGWHTTPManager::ReqsThread::entry()
{
  manager->reqs_thread_entry();
  return NULL;
}
+
/*
 * RGWHTTPManager has two modes of operation: threaded and non-threaded.
 */
// Construct with an owned curl multi handle; the wakeup pipe fds start
// unset (-1) until the threaded mode is started.
RGWHTTPManager::RGWHTTPManager(CephContext *_cct, RGWCompletionManager *_cm) : cct(_cct),
                                                completion_mgr(_cm)
{
  multi_handle = (void *)curl_multi_init();
  thread_pipe[0] = -1;
  thread_pipe[1] = -1;
}

// Stop the event loop (if running) and release the curl multi handle.
RGWHTTPManager::~RGWHTTPManager() {
  stop();
  if (multi_handle)
    curl_multi_cleanup((CURLM *)multi_handle);
}
+
// Track a new request in the reqs table, assigning it the next id.
void RGWHTTPManager::register_request(rgw_http_req_data *req_data)
{
  std::unique_lock rl{reqs_lock};
  req_data->id = num_reqs;
  req_data->registered = true;
  reqs[num_reqs] = req_data;
  num_reqs++;
  ldout(cct, 20) << __func__ << " mgr=" << this << " req_data->id=" << req_data->id << ", curl_handle=" << req_data->curl_handle << dendl;
}

// Queue a request for removal by the event loop. Takes an extra ref that
// is owned by the unregistered_reqs list. Returns false if the request is
// not (or no longer) registered.
bool RGWHTTPManager::unregister_request(rgw_http_req_data *req_data)
{
  std::unique_lock rl{reqs_lock};
  if (!req_data->registered) {
    return false;
  }
  req_data->get();
  req_data->registered = false;
  unregistered_reqs.push_back(req_data);
  ldout(cct, 20) << __func__ << " mgr=" << this << " req_data->id=" << req_data->id << ", curl_handle=" << req_data->curl_handle << dendl;
  return true;
}
+
+void RGWHTTPManager::complete_request(rgw_http_req_data *req_data)
+{
+ std::unique_lock rl{reqs_lock};
+ _complete_request(req_data);
+}
+
// Remove the request from the active map, detach it from this manager and
// notify the completion manager. Caller must hold reqs_lock exclusively.
void RGWHTTPManager::_complete_request(rgw_http_req_data *req_data)
{
  map<uint64_t, rgw_http_req_data *>::iterator iter = reqs.find(req_data->id);
  if (iter != reqs.end()) {
    reqs.erase(iter);
  }
  {
    // publish mgr=nullptr under the request's own lock so other threads
    // inspecting req_data->mgr see a consistent value
    std::lock_guard l{req_data->lock};
    req_data->mgr = nullptr;
  }
  if (completion_mgr) {
    completion_mgr->complete(NULL, req_data->control_io_id, req_data->user_info);
  }

  // drop the manager's reference (the one created by add_request())
  req_data->put();
}
+
// Record the final result and HTTP status on the request, then complete
// it (takes reqs_lock internally via complete_request()).
void RGWHTTPManager::finish_request(rgw_http_req_data *req_data, int ret, long http_status)
{
  req_data->finish(ret, http_status);
  complete_request(req_data);
}
+
// Same as finish_request(), for callers that already hold reqs_lock.
void RGWHTTPManager::_finish_request(rgw_http_req_data *req_data, int ret)
{
  req_data->finish(ret);
  _complete_request(req_data);
}
+
// Apply a queued pause/resume bitmask to a request's easy handle.
// Runs on the worker thread only (curl handles are not thread-safe);
// see manage_pending_requests().
void RGWHTTPManager::_set_req_state(set_state& ss)
{
  ss.req->set_state(ss.bitmask);
}
+/*
+ * hook request to the curl multi handle
+ */
+int RGWHTTPManager::link_request(rgw_http_req_data *req_data)
+{
+ ldout(cct, 20) << __func__ << " req_data=" << req_data << " req_data->id=" << req_data->id << ", curl_handle=" << req_data->curl_handle << dendl;
+ CURLMcode mstatus = curl_multi_add_handle((CURLM *)multi_handle, req_data->get_easy_handle());
+ if (mstatus) {
+ dout(0) << "ERROR: failed on curl_multi_add_handle, status=" << mstatus << dendl;
+ return -EIO;
+ }
+ return 0;
+}
+
/*
 * unhook request from the curl multi handle, and finish request if it wasn't finished yet as
 * there will be no more processing on this request
 */
// Caller must hold reqs_lock exclusively.
void RGWHTTPManager::_unlink_request(rgw_http_req_data *req_data)
{
  if (req_data->curl_handle) {
    curl_multi_remove_handle((CURLM *)multi_handle, req_data->get_easy_handle());
  }
  if (!req_data->is_done()) {
    // nobody will drive this transfer again; fail it now so waiters wake up
    _finish_request(req_data, -ECANCELED);
  }
}
+
+void RGWHTTPManager::unlink_request(rgw_http_req_data *req_data)
+{
+ std::unique_lock wl{reqs_lock};
+ _unlink_request(req_data);
+}
+
// Worker-thread housekeeping: apply queued pause/resume changes, unlink
// requests queued by unregister_request(), and hook newly registered
// requests into the curl multi handle.
void RGWHTTPManager::manage_pending_requests()
{
  // cheap pre-check under the shared lock; bail if there is nothing to do
  reqs_lock.lock_shared();
  if (max_threaded_req == num_reqs &&
      unregistered_reqs.empty() &&
      reqs_change_state.empty()) {
    reqs_lock.unlock_shared();
    return;
  }
  reqs_lock.unlock_shared();

  // state may change between the unlock above and this lock; that is fine,
  // everything below is re-read under the exclusive lock
  std::unique_lock wl{reqs_lock};

  if (!reqs_change_state.empty()) {
    for (auto siter : reqs_change_state) {
      _set_req_state(siter);
    }
    reqs_change_state.clear();
  }

  if (!unregistered_reqs.empty()) {
    for (auto& r : unregistered_reqs) {
      _unlink_request(r);
      r->put(); // drop the ref taken by unregister_request()
    }

    unregistered_reqs.clear();
  }

  // link every request registered since the last pass
  // (ids are sequential, so everything from max_threaded_req onward is new)
  map<uint64_t, rgw_http_req_data *>::iterator iter = reqs.find(max_threaded_req);

  list<std::pair<rgw_http_req_data *, int> > remove_reqs;

  for (; iter != reqs.end(); ++iter) {
    rgw_http_req_data *req_data = iter->second;
    int r = link_request(req_data);
    if (r < 0) {
      ldout(cct, 0) << "ERROR: failed to link http request" << dendl;
      remove_reqs.push_back(std::make_pair(iter->second, r));
    } else {
      max_threaded_req = iter->first + 1;
    }
  }

  // fail the requests that could not be linked
  for (auto piter : remove_reqs) {
    rgw_http_req_data *req_data = piter.first;
    int r = piter.second;

    _finish_request(req_data, r);
  }
}
+
// Submit a client's request to this manager. In non-threaded mode the
// request is linked into the curl multi handle immediately; in threaded
// mode it is registered and the worker thread is signalled to link it.
// The newly created req_data carries the manager's reference, released in
// _complete_request(). Returns 0 on success, negative error otherwise.
int RGWHTTPManager::add_request(RGWHTTPClient *client)
{
  rgw_http_req_data *req_data = new rgw_http_req_data;

  int ret = client->init_request(req_data);
  if (ret < 0) {
    req_data->put();
    req_data = NULL;
    return ret;
  }

  req_data->mgr = this;
  req_data->client = client;
  req_data->control_io_id = client->get_io_id(RGWHTTPClient::HTTPCLIENT_IO_CONTROL);
  req_data->user_info = client->get_io_user_info();

  register_request(req_data);

  if (!is_started) {
    // non-threaded mode: hook into the multi handle right away
    ret = link_request(req_data);
    if (ret < 0) {
      req_data->put();
      req_data = NULL;
    }
    return ret;
  }
  // threaded mode: wake the worker so it links the request
  ret = signal_thread();
  if (ret < 0) {
    finish_request(req_data, ret);
  }

  return ret;
}
+
+int RGWHTTPManager::remove_request(RGWHTTPClient *client)
+{
+ rgw_http_req_data *req_data = client->get_req_data();
+
+ if (!is_started) {
+ unlink_request(req_data);
+ return 0;
+ }
+ if (!unregister_request(req_data)) {
+ return 0;
+ }
+ int ret = signal_thread();
+ if (ret < 0) {
+ return ret;
+ }
+
+ return 0;
+}
+
// Request a pause/resume of the read and/or write side of an in-flight
// request. Must be called with req_data->lock held, and only in threaded
// mode; the actual curl_easy_pause() is deferred to the worker thread
// (via _set_req_state) because curl handles are not thread-safe.
// Returns 0 on success (including no-op transitions), negative on error.
int RGWHTTPManager::set_request_state(RGWHTTPClient *client, RGWHTTPRequestSetState state)
{
  rgw_http_req_data *req_data = client->get_req_data();

  ceph_assert(ceph_mutex_is_locked(req_data->lock));

  /* can only do that if threaded */
  if (!is_started) {
    return -EINVAL;
  }

  bool suggested_wr_paused = req_data->write_paused;
  bool suggested_rd_paused = req_data->read_paused;

  switch (state) {
    case SET_WRITE_PAUSED:
      suggested_wr_paused = true;
      break;
    case SET_WRITE_RESUME:
      suggested_wr_paused = false;
      break;
    case SET_READ_PAUSED:
      suggested_rd_paused = true;
      break;
    case SET_READ_RESUME:
      suggested_rd_paused = false;
      break;
    default:
      /* shouldn't really be here */
      return -EIO;
  }

  // already in the requested state: don't queue a redundant change
  if (suggested_wr_paused == req_data->write_paused &&
      suggested_rd_paused == req_data->read_paused) {
    return 0;
  }

  req_data->write_paused = suggested_wr_paused;
  req_data->read_paused = suggested_rd_paused;

  // translate the paused flags into a curl_easy_pause() bitmask
  int bitmask = CURLPAUSE_CONT;

  if (req_data->write_paused) {
    bitmask |= CURLPAUSE_SEND;
  }

  if (req_data->read_paused) {
    bitmask |= CURLPAUSE_RECV;
  }

  // queue the change and wake the worker thread to apply it
  reqs_change_state.push_back(set_state(req_data, bitmask));
  int ret = signal_thread();
  if (ret < 0) {
    return ret;
  }

  return 0;
}
+
+int RGWHTTPManager::start()
+{
+ if (pipe_cloexec(thread_pipe, 0) < 0) {
+ int e = errno;
+ ldout(cct, 0) << "ERROR: pipe(): " << cpp_strerror(e) << dendl;
+ return -e;
+ }
+
+ // enable non-blocking reads
+ if (::fcntl(thread_pipe[0], F_SETFL, O_NONBLOCK) < 0) {
+ int e = errno;
+ ldout(cct, 0) << "ERROR: fcntl(): " << cpp_strerror(e) << dendl;
+ TEMP_FAILURE_RETRY(::close(thread_pipe[0]));
+ TEMP_FAILURE_RETRY(::close(thread_pipe[1]));
+ return -e;
+ }
+
+ is_started = true;
+ reqs_thread = new ReqsThread(this);
+ reqs_thread->create("http_manager");
+ return 0;
+}
+
// Shut the manager down (idempotent). In threaded mode: flag the worker
// loop to exit, wake it, join it, then close the wakeup pipe.
void RGWHTTPManager::stop()
{
  if (is_stopped) {
    return;
  }

  is_stopped = true;

  if (is_started) {
    going_down = true;
    signal_thread(); // wake the worker so it notices going_down
    reqs_thread->join();
    delete reqs_thread;
    TEMP_FAILURE_RETRY(::close(thread_pipe[1]));
    TEMP_FAILURE_RETRY(::close(thread_pipe[0]));
  }
}
+
+int RGWHTTPManager::signal_thread()
+{
+ uint32_t buf = 0;
+ int ret = write(thread_pipe[1], (void *)&buf, sizeof(buf));
+ if (ret < 0) {
+ ret = -errno;
+ ldout(cct, 0) << "ERROR: " << __func__ << ": write() returned ret=" << ret << dendl;
+ return ret;
+ }
+ return 0;
+}
+
+void *RGWHTTPManager::reqs_thread_entry()
+{
+ int still_running;
+ int mstatus;
+
+ ldout(cct, 20) << __func__ << ": start" << dendl;
+
+ while (!going_down) {
+ int ret = do_curl_wait(cct, (CURLM *)multi_handle, thread_pipe[0]);
+ if (ret < 0) {
+ dout(0) << "ERROR: do_curl_wait() returned: " << ret << dendl;
+ return NULL;
+ }
+
+ manage_pending_requests();
+
+ mstatus = curl_multi_perform((CURLM *)multi_handle, &still_running);
+ switch (mstatus) {
+ case CURLM_OK:
+ case CURLM_CALL_MULTI_PERFORM:
+ break;
+ default:
+ dout(10) << "curl_multi_perform returned: " << mstatus << dendl;
+ break;
+ }
+ int msgs_left;
+ CURLMsg *msg;
+ while ((msg = curl_multi_info_read((CURLM *)multi_handle, &msgs_left))) {
+ if (msg->msg == CURLMSG_DONE) {
+ int result = msg->data.result;
+ CURL *e = msg->easy_handle;
+ rgw_http_req_data *req_data;
+ curl_easy_getinfo(e, CURLINFO_PRIVATE, (void **)&req_data);
+ curl_multi_remove_handle((CURLM *)multi_handle, e);
+
+ long http_status;
+ int status;
+ if (!req_data->user_ret) {
+ curl_easy_getinfo(e, CURLINFO_RESPONSE_CODE, (void **)&http_status);
+
+ status = rgw_http_error_to_errno(http_status);
+ if (result != CURLE_OK && status == 0) {
+ dout(0) << "ERROR: curl error: " << curl_easy_strerror((CURLcode)result) << ", maybe network unstable" << dendl;
+ status = -EAGAIN;
+ }
+ } else {
+ status = *req_data->user_ret;
+ rgw_err err;
+ set_req_state_err(err, status, 0);
+ http_status = err.http_ret;
+ }
+ int id = req_data->id;
+ finish_request(req_data, status, http_status);
+ switch (result) {
+ case CURLE_OK:
+ break;
+ case CURLE_OPERATION_TIMEDOUT:
+ dout(0) << "WARNING: curl operation timed out, network average transfer speed less than "
+ << cct->_conf->rgw_curl_low_speed_limit << " Bytes per second during " << cct->_conf->rgw_curl_low_speed_time << " seconds." << dendl;
+ default:
+ dout(20) << "ERROR: msg->data.result=" << result << " req_data->id=" << id << " http_status=" << http_status << dendl;
+ dout(20) << "ERROR: curl error: " << curl_easy_strerror((CURLcode)result) << " req_data->error_buf=" << req_data->error_buf << dendl;
+ break;
+ }
+ }
+ }
+ }
+
+
+ std::unique_lock rl{reqs_lock};
+ for (auto r : unregistered_reqs) {
+ _unlink_request(r);
+ }
+
+ unregistered_reqs.clear();
+
+ auto all_reqs = std::move(reqs);
+ for (auto iter : all_reqs) {
+ _unlink_request(iter.second);
+ }
+
+ reqs.clear();
+
+ if (completion_mgr) {
+ completion_mgr->go_down();
+ }
+
+ return 0;
+}
+
// Process-wide init: bring up libcurl and start the global threaded
// RGWHTTPManager used by RGWHTTP::send()/process().
void rgw_http_client_init(CephContext *cct)
{
  curl_global_init(CURL_GLOBAL_ALL);
  rgw_http_manager = new RGWHTTPManager(cct);
  rgw_http_manager->start();
}
+
// Process-wide teardown counterpart of rgw_http_client_init().
void rgw_http_client_cleanup()
{
  rgw_http_manager->stop();
  delete rgw_http_manager;
  curl_global_cleanup();
}
+
+
+int RGWHTTP::send(RGWHTTPClient *req) {
+ if (!req) {
+ return 0;
+ }
+ int r = rgw_http_manager->add_request(req);
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+int RGWHTTP::process(RGWHTTPClient *req, optional_yield y) {
+ if (!req) {
+ return 0;
+ }
+ int r = send(req);
+ if (r < 0) {
+ return r;
+ }
+
+ return req->wait(y);
+}
+
diff --git a/src/rgw/rgw_http_client.h b/src/rgw/rgw_http_client.h
new file mode 100644
index 000000000..dbd705a18
--- /dev/null
+++ b/src/rgw/rgw_http_client.h
@@ -0,0 +1,348 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "common/async/yield_context.h"
+#include "common/Cond.h"
+#include "rgw_common.h"
+#include "rgw_string.h"
+#include "rgw_http_client_types.h"
+
+#include <atomic>
+
+using param_pair_t = std::pair<std::string, std::string>;
+using param_vec_t = std::vector<param_pair_t>;
+
+void rgw_http_client_init(CephContext *cct);
+void rgw_http_client_cleanup();
+
+struct rgw_http_req_data;
+class RGWHTTPManager;
+
// Asynchronous HTTP client backed by libcurl and driven by an
// RGWHTTPManager. An instance represents a single request; subclasses
// override receive_header()/receive_data()/send_data() to stream the
// request and response bodies. Completion is observed via wait()/process().
class RGWHTTPClient : public RGWIOProvider,
                      public NoDoutPrefix
{
  friend class RGWHTTPManager;

  bufferlist send_bl;             // buffered request body
  bufferlist::iterator send_iter; // current read position within send_bl
  bool has_send_len;              // set once set_send_length() was called
  long http_status;               // response status; see get_http_status()
  bool send_data_hint{false};
  size_t receive_pause_skip{0}; /* how many bytes to skip next time receive_data is called
                                   due to being paused */

  void *user_info{nullptr};       // opaque caller cookie (RGWIOProvider)

  // per-request state shared with the manager (ref-counted)
  rgw_http_req_data *req_data;

  bool verify_ssl; // Do not validate self signed certificates, default to false

  std::string ca_path;

  std::string client_cert;

  std::string client_key;

  std::atomic<unsigned> stopped { 0 };


protected:
  CephContext *cct;

  std::string method;
  std::string url;

  std::string protocol;
  std::string host;
  std::string resource_prefix;

  size_t send_len{0};

  param_vec_t headers;

  // request timeout in seconds; 0 means no timeout (see set_req_timeout())
  long req_timeout{0L};

  void init();

  RGWHTTPManager *get_manager();

  int init_request(rgw_http_req_data *req_data);

  // Streaming hooks; default implementations discard/produce nothing.
  // Return negative to abort the transfer.
  virtual int receive_header(void *ptr, size_t len) {
    return 0;
  }
  virtual int receive_data(void *ptr, size_t len, bool *pause) {
    return 0;
  }

  virtual int send_data(void *ptr, size_t len, bool *pause=nullptr) {
    return 0;
  }

  /* Callbacks for libcurl. */
  static size_t receive_http_header(void *ptr,
                                    size_t size,
                                    size_t nmemb,
                                    void *_info);

  static size_t receive_http_data(void *ptr,
                                  size_t size,
                                  size_t nmemb,
                                  void *_info);

  static size_t send_http_data(void *ptr,
                               size_t size,
                               size_t nmemb,
                               void *_info);

  ceph::mutex& get_req_lock();

  /* needs to be called under req_lock() */
  void _set_write_paused(bool pause);
  void _set_read_paused(bool pause);
public:
  static const long HTTP_STATUS_NOSTATUS = 0;
  static const long HTTP_STATUS_UNAUTHORIZED = 401;
  static const long HTTP_STATUS_NOTFOUND = 404;

  // io channel bits used with rgw_io_id (see RGWIOProvider::get_io_id())
  static constexpr int HTTPCLIENT_IO_READ = 0x1;
  static constexpr int HTTPCLIENT_IO_WRITE = 0x2;
  static constexpr int HTTPCLIENT_IO_CONTROL = 0x4;

  virtual ~RGWHTTPClient();
  explicit RGWHTTPClient(CephContext *cct,
                         const std::string& _method,
                         const std::string& _url);

  std::ostream& gen_prefix(std::ostream& out) const override;


  void append_header(const std::string& name, const std::string& val) {
    headers.push_back(std::pair<std::string, std::string>(name, val));
  }

  void set_send_length(size_t len) {
    send_len = len;
    has_send_len = true;
  }

  void set_send_data_hint(bool hint) {
    send_data_hint = hint;
  }

  long get_http_status() const {
    return http_status;
  }

  void set_http_status(long _http_status) {
    http_status = _http_status;
  }

  void set_verify_ssl(bool flag) {
    verify_ssl = flag;
  }

  // set request timeout in seconds
  // zero (default) mean that request will never timeout
  void set_req_timeout(long timeout) {
    req_timeout = timeout;
  }

  int process(optional_yield y);

  int wait(optional_yield y);
  void cancel();
  bool is_done();

  rgw_http_req_data *get_req_data() { return req_data; }

  std::string to_str();

  int get_req_retcode();

  void set_url(const std::string& _url) {
    url = _url;
  }

  void set_method(const std::string& _method) {
    method = _method;
  }

  void set_io_user_info(void *_user_info) override {
    user_info = _user_info;
  }

  void *get_io_user_info() override {
    return user_info;
  }

  void set_ca_path(const std::string& _ca_path) {
    ca_path = _ca_path;
  }

  void set_client_cert(const std::string& _client_cert) {
    client_cert = _client_cert;
  }

  void set_client_key(const std::string& _client_key) {
    client_key = _client_key;
  }
};
+
+
// RGWHTTPClient that additionally captures a caller-chosen set of response
// headers (matched case-insensitively) for inspection after completion.
class RGWHTTPHeadersCollector : public RGWHTTPClient {
public:
  typedef std::string header_name_t;
  typedef std::string header_value_t;
  typedef std::set<header_name_t, ltstr_nocase> header_spec_t;

  RGWHTTPHeadersCollector(CephContext * const cct,
                          const std::string& method,
                          const std::string& url,
                          const header_spec_t &relevant_headers)
    : RGWHTTPClient(cct, method, url),
      relevant_headers(relevant_headers) {
  }

  // returns a copy of every captured header
  std::map<header_name_t, header_value_t, ltstr_nocase> get_headers() const {
    return found_headers;
  }

  /* Throws std::out_of_range */
  const header_value_t& get_header_value(const header_name_t& name) const {
    return found_headers.at(name);
  }

protected:
  int receive_header(void *ptr, size_t len) override;

private:
  const std::set<header_name_t, ltstr_nocase> relevant_headers;
  std::map<header_name_t, header_value_t, ltstr_nocase> found_headers;
};
+
+
// Headers-collecting client that also sends a fixed request body
// (post_data) and accumulates the response body into *read_bl.
class RGWHTTPTransceiver : public RGWHTTPHeadersCollector {
  bufferlist * const read_bl; // response body sink, owned by the caller
  std::string post_data;
  size_t post_data_index;     // send_data() read offset into post_data

public:
  RGWHTTPTransceiver(CephContext * const cct,
                     const std::string& method,
                     const std::string& url,
                     bufferlist * const read_bl,
                     const header_spec_t intercept_headers = {})
    : RGWHTTPHeadersCollector(cct, method, url, intercept_headers),
      read_bl(read_bl),
      post_data_index(0) {
  }

  RGWHTTPTransceiver(CephContext * const cct,
                     const std::string& method,
                     const std::string& url,
                     bufferlist * const read_bl,
                     const bool verify_ssl,
                     const header_spec_t intercept_headers = {})
    : RGWHTTPHeadersCollector(cct, method, url, intercept_headers),
      read_bl(read_bl),
      post_data_index(0) {
    set_verify_ssl(verify_ssl);
  }

  void set_post_data(const std::string& _post_data) {
    this->post_data = _post_data;
  }

protected:
  int send_data(void* ptr, size_t len, bool *pause=nullptr) override;

  int receive_data(void *ptr, size_t len, bool *pause) override {
    read_bl->append((char *)ptr, len);
    return 0;
  }
};
+
+typedef RGWHTTPTransceiver RGWPostHTTPData;
+
+
+class RGWCompletionManager;
+
// Pause/resume transitions accepted by RGWHTTPManager::set_request_state().
enum RGWHTTPRequestSetState {
  SET_NOP = 0,
  SET_WRITE_PAUSED = 1,
  SET_WRITE_RESUME = 2,
  SET_READ_PAUSED = 3,
  SET_READ_RESUME = 4,
};
+
// Drives a set of RGWHTTPClient requests over one libcurl multi handle,
// either inline (non-threaded) or on a dedicated worker thread started by
// start(). Public mutators queue work under reqs_lock and wake the worker
// through a pipe (signal_thread()); all curl manipulation happens on the
// worker thread.
class RGWHTTPManager {
  // a queued pause/resume change, applied on the worker thread
  struct set_state {
    rgw_http_req_data *req;
    int bitmask;

    set_state(rgw_http_req_data *_req, int _bitmask) : req(_req), bitmask(_bitmask) {}
  };
  CephContext *cct;
  RGWCompletionManager *completion_mgr;
  void *multi_handle;  // CURLM*, kept opaque to avoid curl headers here
  bool is_started = false;
  std::atomic<unsigned> going_down { 0 };
  std::atomic<unsigned> is_stopped { 0 };

  // guards reqs, unregistered_reqs, reqs_change_state and the counters
  ceph::shared_mutex reqs_lock = ceph::make_shared_mutex("RGWHTTPManager::reqs_lock");
  std::map<uint64_t, rgw_http_req_data *> reqs;     // active, keyed by id
  std::list<rgw_http_req_data *> unregistered_reqs; // waiting to be unlinked
  std::list<set_state> reqs_change_state;           // pending pause/resume
  std::map<uint64_t, rgw_http_req_data *> complete_reqs;
  int64_t num_reqs = 0;          // next request id to assign
  int64_t max_threaded_req = 0;  // first id not yet linked into curl
  int thread_pipe[2];            // wakeup pipe for the worker thread

  void register_request(rgw_http_req_data *req_data);
  void complete_request(rgw_http_req_data *req_data);
  void _complete_request(rgw_http_req_data *req_data);
  bool unregister_request(rgw_http_req_data *req_data);
  void _unlink_request(rgw_http_req_data *req_data);
  void unlink_request(rgw_http_req_data *req_data);
  void finish_request(rgw_http_req_data *req_data, int r, long http_status = -1);
  void _finish_request(rgw_http_req_data *req_data, int r);
  void _set_req_state(set_state& ss);
  int link_request(rgw_http_req_data *req_data);

  void manage_pending_requests();

  // worker thread wrapper; entry() calls reqs_thread_entry()
  class ReqsThread : public Thread {
    RGWHTTPManager *manager;

  public:
    explicit ReqsThread(RGWHTTPManager *_m) : manager(_m) {}
    void *entry() override;
  };

  ReqsThread *reqs_thread = nullptr;

  void *reqs_thread_entry();

  int signal_thread();

public:
  RGWHTTPManager(CephContext *_cct, RGWCompletionManager *completion_mgr = NULL);
  ~RGWHTTPManager();

  int start();
  void stop();

  int add_request(RGWHTTPClient *client);
  int remove_request(RGWHTTPClient *client);
  int set_request_state(RGWHTTPClient *client, RGWHTTPRequestSetState state);
};
+
// Thin static facade over the global RGWHTTPManager.
class RGWHTTP
{
public:
  static int send(RGWHTTPClient *req);
  static int process(RGWHTTPClient *req, optional_yield y);
};
diff --git a/src/rgw/rgw_http_client_curl.cc b/src/rgw/rgw_http_client_curl.cc
new file mode 100644
index 000000000..2477cfceb
--- /dev/null
+++ b/src/rgw/rgw_http_client_curl.cc
@@ -0,0 +1,112 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_http_client_curl.h"
+#include <mutex>
+#include <vector>
+#include <curl/curl.h>
+
+#include "rgw_common.h"
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+#ifdef WITH_CURL_OPENSSL
+#include <openssl/crypto.h>
+#endif
+
+#if defined(WITH_CURL_OPENSSL) && OPENSSL_API_COMPAT < 0x10100000L
+namespace openssl {
+
+class RGWSSLSetup
+{
+ std::vector <std::mutex> locks;
+public:
+ explicit RGWSSLSetup(int n) : locks (n){}
+
+ void set_lock(int id){
+ try {
+ locks.at(id).lock();
+ } catch (std::out_of_range& e) {
+ dout(0) << __func__ << " failed to set locks" << dendl;
+ }
+ }
+
+ void clear_lock(int id){
+ try {
+ locks.at(id).unlock();
+ } catch (std::out_of_range& e) {
+ dout(0) << __func__ << " failed to unlock" << dendl;
+ }
+ }
+};
+
+
// OpenSSL (pre-1.1) locking callback. The mutex table is a function-local
// static so it is sized with CRYPTO_num_locks() on first use and
// constructed thread-safely.
void rgw_ssl_locking_callback(int mode, int id, const char *file, int line)
{
  static RGWSSLSetup locks(CRYPTO_num_locks());
  if (mode & CRYPTO_LOCK)
    locks.set_lock(id);
  else
    locks.clear_lock(id);
}
+
// OpenSSL (pre-1.1) thread-id callback: identify the calling thread by its
// pthread id.
unsigned long rgw_ssl_thread_id_callback()
{
  const pthread_t self = pthread_self();
  return (unsigned long)self;
}
+
// Register the thread-id and locking callbacks required for thread-safe
// use of OpenSSL < 1.1 (newer OpenSSL ignores these hooks).
void init_ssl(){
  CRYPTO_set_id_callback((unsigned long (*) ()) rgw_ssl_thread_id_callback);
  CRYPTO_set_locking_callback(rgw_ssl_locking_callback);
}
+
+} /* namespace openssl */
+#endif // WITH_CURL_OPENSSL
+
+
+namespace rgw {
+namespace curl {
+
+#if defined(WITH_CURL_OPENSSL) && OPENSSL_API_COMPAT < 0x10100000L
// Install the process-wide OpenSSL callbacks (pre-1.1 OpenSSL builds only).
void init_ssl() {
  ::openssl::init_ssl();
}
+
+bool fe_inits_ssl(boost::optional <const fe_map_t&> m, long& curl_global_flags){
+ if (m) {
+ for (const auto& kv: *m){
+ if (kv.first == "beast"){
+ std::string cert;
+ kv.second->get_val("ssl_certificate","", &cert);
+ if (!cert.empty()){
+ /* TODO this flag is no op for curl > 7.57 */
+ curl_global_flags &= ~CURL_GLOBAL_SSL;
+ return true;
+ }
+ }
+ }
+ }
+ return false;
+}
+#endif // WITH_CURL_OPENSSL
+
// ensures curl_global_init() runs exactly once per process
std::once_flag curl_init_flag;

// Process-wide libcurl setup. If no frontend initializes SSL itself,
// install the (pre-1.1) OpenSSL callbacks first, then initialize curl and
// prime the saved easy-handle cache.
void setup_curl(boost::optional<const fe_map_t&> m) {
  long curl_global_flags = CURL_GLOBAL_ALL;

  #if defined(WITH_CURL_OPENSSL) && OPENSSL_API_COMPAT < 0x10100000L
  if (!fe_inits_ssl(m, curl_global_flags))
    init_ssl();
  #endif

  std::call_once(curl_init_flag, curl_global_init, curl_global_flags);
  rgw_setup_saved_curl_handles();
}
+
// Teardown counterpart of setup_curl(): release cached easy handles, then
// shut libcurl down globally.
void cleanup_curl() {
  rgw_release_all_curl_handles();
  curl_global_cleanup();
}
+
+} /* namespace curl */
+} /* namespace rgw */
diff --git a/src/rgw/rgw_http_client_curl.h b/src/rgw/rgw_http_client_curl.h
new file mode 100644
index 000000000..a28826b0d
--- /dev/null
+++ b/src/rgw/rgw_http_client_curl.h
@@ -0,0 +1,29 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 SUSE Linux GmBH
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <map>
+#include <boost/optional.hpp>
+#include "rgw_frontend.h"
+
namespace rgw {
namespace curl {
// frontend name -> frontend config multimap (a frontend may appear twice)
using fe_map_t = std::multimap <std::string, RGWFrontendConfig *>;

// One-time process-wide libcurl (and, where needed, OpenSSL) setup/teardown.
void setup_curl(boost::optional<const fe_map_t&> m);
void cleanup_curl();
}
}
diff --git a/src/rgw/rgw_http_client_types.h b/src/rgw/rgw_http_client_types.h
new file mode 100644
index 000000000..84e6ed678
--- /dev/null
+++ b/src/rgw/rgw_http_client_types.h
@@ -0,0 +1,69 @@
+
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#pragma once
+
+#include <atomic>
+
// Identifies an I/O event source: a provider id plus a bitmask of channels
// (see RGWHTTPClient::HTTPCLIENT_IO_*).
struct rgw_io_id {
  int64_t id{0};
  int channels{0};

  rgw_io_id() {}
  rgw_io_id(int64_t _id, int _channels) : id(_id), channels(_channels) {}

  // True when both ids match and either side carries any channel bit.
  // NOTE(review): despite the name this tests (channels | rhs.channels),
  // i.e. "union non-empty", not (channels & rhs.channels); confirm whether
  // callers depend on that before tightening it to a real intersection.
  bool intersects(const rgw_io_id& rhs) const {
    return (id == rhs.id && ((channels | rhs.channels) != 0));
  }

  // Strict weak ordering by id, then channels, for use as a map/set key.
  bool operator<(const rgw_io_id& rhs) const {
    if (id < rhs.id) {
      return true;
    }
    return (id == rhs.id &&
            channels < rhs.channels);
  }
};
+
// Hands out monotonically increasing io ids, starting at 1; safe to call
// from multiple threads concurrently.
class RGWIOIDProvider
{
  std::atomic<int64_t> last_id = {0};

public:
  RGWIOIDProvider() {}

  // Returns the next unused id (first call yields 1).
  int64_t get_next() {
    return ++last_id;
  }
};
+
// Base for objects that participate in async I/O notification. Holds the
// id assigned via assign_io() and builds rgw_io_id keys from it.
class RGWIOProvider
{
  int64_t id{-1}; // -1 until assign_io() runs

public:
  RGWIOProvider() {}
  virtual ~RGWIOProvider() = default;

  // assigns this provider's id from io_id_provider (defined out of line)
  void assign_io(RGWIOIDProvider& io_id_provider, int io_type = -1);
  rgw_io_id get_io_id(int io_type) {
    return rgw_io_id{id, io_type};
  }

  // opaque per-request cookie handed back on completion
  virtual void set_io_user_info(void *_user_info) = 0;
  virtual void *get_io_user_info() = 0;
};
+
diff --git a/src/rgw/rgw_http_errors.h b/src/rgw/rgw_http_errors.h
new file mode 100644
index 000000000..5e052819e
--- /dev/null
+++ b/src/rgw/rgw_http_errors.h
@@ -0,0 +1,44 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_common.h"
+
// Table type mapping an internal error code to (http status, error name).
typedef const std::map<int,const std::pair<int, const char*>> rgw_http_errors;

// Per-API error tables; declared here, defined out of line.
extern rgw_http_errors rgw_http_s3_errors;

extern rgw_http_errors rgw_http_swift_errors;

extern rgw_http_errors rgw_http_sts_errors;

extern rgw_http_errors rgw_http_iam_errors;
+
// Map an HTTP status code to a negative errno-style error; any 2xx maps
// to 0 and codes without a specific mapping collapse to -EIO.
static inline int rgw_http_error_to_errno(int http_err)
{
  if (http_err >= 200 && http_err <= 299)
    return 0;
  switch (http_err) {
    case 304:
      return -ERR_NOT_MODIFIED;
    case 400:
      return -EINVAL;
    case 401:
      return -EPERM;
    case 403:
      return -EACCES;
    case 404:
      return -ENOENT;
    case 405:
      return -ERR_METHOD_NOT_ALLOWED;
    case 409:
      return -ENOTEMPTY;
    case 503:
      return -EBUSY;
    default:
      return -EIO;
  }

  return 0; /* unreachable */
}
diff --git a/src/rgw/rgw_iam_policy.cc b/src/rgw/rgw_iam_policy.cc
new file mode 100644
index 000000000..35aeb15fc
--- /dev/null
+++ b/src/rgw/rgw_iam_policy.cc
@@ -0,0 +1,1663 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+
+#include <cstring>
+#include <iostream>
+#include <regex>
+#include <sstream>
+#include <stack>
+#include <utility>
+
+#include <arpa/inet.h>
+
+#include <experimental/iterator>
+
+#include "rapidjson/reader.h"
+
+#include "include/expected.hpp"
+
+#include "rgw_auth.h"
+#include "rgw_iam_policy.h"
+
+
+namespace {
+constexpr int dout_subsys = ceph_subsys_rgw;
+}
+
+using std::dec;
+using std::hex;
+using std::int64_t;
+using std::size_t;
+using std::string;
+using std::stringstream;
+using std::ostream;
+using std::uint16_t;
+using std::uint64_t;
+
+using boost::container::flat_set;
+using std::regex;
+using std::regex_match;
+using std::smatch;
+
+using rapidjson::BaseReaderHandler;
+using rapidjson::UTF8;
+using rapidjson::SizeType;
+using rapidjson::Reader;
+using rapidjson::kParseCommentsFlag;
+using rapidjson::kParseNumbersAsStringsFlag;
+using rapidjson::StringStream;
+
+using rgw::auth::Principal;
+
+namespace rgw {
+namespace IAM {
+#include "rgw_iam_policy_keywords.frag.cc"
+
+struct actpair {
+ const char* name;
+ const uint64_t bit;
+};
+
+
+
+static const actpair actpairs[] =
+{{ "s3:AbortMultipartUpload", s3AbortMultipartUpload },
+ { "s3:CreateBucket", s3CreateBucket },
+ { "s3:DeleteBucketPolicy", s3DeleteBucketPolicy },
+ { "s3:DeleteBucket", s3DeleteBucket },
+ { "s3:DeleteBucketWebsite", s3DeleteBucketWebsite },
+ { "s3:DeleteObject", s3DeleteObject },
+ { "s3:DeleteObjectVersion", s3DeleteObjectVersion },
+ { "s3:DeleteObjectTagging", s3DeleteObjectTagging },
+ { "s3:DeleteObjectVersionTagging", s3DeleteObjectVersionTagging },
+ { "s3:DeleteBucketPublicAccessBlock", s3DeleteBucketPublicAccessBlock},
+ { "s3:DeletePublicAccessBlock", s3DeletePublicAccessBlock},
+ { "s3:DeleteReplicationConfiguration", s3DeleteReplicationConfiguration },
+ { "s3:GetAccelerateConfiguration", s3GetAccelerateConfiguration },
+ { "s3:GetBucketAcl", s3GetBucketAcl },
+ { "s3:GetBucketCORS", s3GetBucketCORS },
+ { "s3:GetBucketEncryption", s3GetBucketEncryption },
+ { "s3:GetBucketLocation", s3GetBucketLocation },
+ { "s3:GetBucketLogging", s3GetBucketLogging },
+ { "s3:GetBucketNotification", s3GetBucketNotification },
+ { "s3:GetBucketPolicy", s3GetBucketPolicy },
+ { "s3:GetBucketPolicyStatus", s3GetBucketPolicyStatus },
+ { "s3:GetBucketPublicAccessBlock", s3GetBucketPublicAccessBlock },
+ { "s3:GetBucketRequestPayment", s3GetBucketRequestPayment },
+ { "s3:GetBucketTagging", s3GetBucketTagging },
+ { "s3:GetBucketVersioning", s3GetBucketVersioning },
+ { "s3:GetBucketWebsite", s3GetBucketWebsite },
+ { "s3:GetLifecycleConfiguration", s3GetLifecycleConfiguration },
+ { "s3:GetBucketObjectLockConfiguration", s3GetBucketObjectLockConfiguration },
+ { "s3:GetPublicAccessBlock", s3GetPublicAccessBlock },
+ { "s3:GetObjectAcl", s3GetObjectAcl },
+ { "s3:GetObject", s3GetObject },
+ { "s3:GetObjectTorrent", s3GetObjectTorrent },
+ { "s3:GetObjectVersionAcl", s3GetObjectVersionAcl },
+ { "s3:GetObjectVersion", s3GetObjectVersion },
+ { "s3:GetObjectVersionTorrent", s3GetObjectVersionTorrent },
+ { "s3:GetObjectTagging", s3GetObjectTagging },
+ { "s3:GetObjectVersionTagging", s3GetObjectVersionTagging},
+ { "s3:GetObjectRetention", s3GetObjectRetention},
+ { "s3:GetObjectLegalHold", s3GetObjectLegalHold},
+ { "s3:GetReplicationConfiguration", s3GetReplicationConfiguration },
+ { "s3:ListAllMyBuckets", s3ListAllMyBuckets },
+ { "s3:ListBucketMultipartUploads", s3ListBucketMultipartUploads },
+ { "s3:ListBucket", s3ListBucket },
+ { "s3:ListBucketVersions", s3ListBucketVersions },
+ { "s3:ListMultipartUploadParts", s3ListMultipartUploadParts },
+ { "s3:PutAccelerateConfiguration", s3PutAccelerateConfiguration },
+ { "s3:PutBucketAcl", s3PutBucketAcl },
+ { "s3:PutBucketCORS", s3PutBucketCORS },
+ { "s3:PutBucketEncryption", s3PutBucketEncryption },
+ { "s3:PutBucketLogging", s3PutBucketLogging },
+ { "s3:PutBucketNotification", s3PutBucketNotification },
+ { "s3:PutBucketPolicy", s3PutBucketPolicy },
+ { "s3:PutBucketRequestPayment", s3PutBucketRequestPayment },
+ { "s3:PutBucketTagging", s3PutBucketTagging },
+ { "s3:PutBucketVersioning", s3PutBucketVersioning },
+ { "s3:PutBucketWebsite", s3PutBucketWebsite },
+ { "s3:PutLifecycleConfiguration", s3PutLifecycleConfiguration },
+ { "s3:PutBucketObjectLockConfiguration", s3PutBucketObjectLockConfiguration },
+ { "s3:PutObjectAcl", s3PutObjectAcl },
+ { "s3:PutObject", s3PutObject },
+ { "s3:PutObjectVersionAcl", s3PutObjectVersionAcl },
+ { "s3:PutObjectTagging", s3PutObjectTagging },
+ { "s3:PutObjectVersionTagging", s3PutObjectVersionTagging },
+ { "s3:PutObjectRetention", s3PutObjectRetention },
+ { "s3:PutObjectLegalHold", s3PutObjectLegalHold },
+ { "s3:BypassGovernanceRetention", s3BypassGovernanceRetention },
+ { "s3:PutBucketPublicAccessBlock", s3PutBucketPublicAccessBlock },
+ { "s3:PutPublicAccessBlock", s3PutPublicAccessBlock },
+ { "s3:PutReplicationConfiguration", s3PutReplicationConfiguration },
+ { "s3:RestoreObject", s3RestoreObject },
+ { "iam:PutUserPolicy", iamPutUserPolicy },
+ { "iam:GetUserPolicy", iamGetUserPolicy },
+ { "iam:DeleteUserPolicy", iamDeleteUserPolicy },
+ { "iam:ListUserPolicies", iamListUserPolicies },
+ { "iam:CreateRole", iamCreateRole},
+ { "iam:DeleteRole", iamDeleteRole},
+ { "iam:GetRole", iamGetRole},
+ { "iam:ModifyRoleTrustPolicy", iamModifyRoleTrustPolicy},
+ { "iam:ListRoles", iamListRoles},
+ { "iam:PutRolePolicy", iamPutRolePolicy},
+ { "iam:GetRolePolicy", iamGetRolePolicy},
+ { "iam:ListRolePolicies", iamListRolePolicies},
+ { "iam:DeleteRolePolicy", iamDeleteRolePolicy},
+ { "iam:CreateOIDCProvider", iamCreateOIDCProvider},
+ { "iam:DeleteOIDCProvider", iamDeleteOIDCProvider},
+ { "iam:GetOIDCProvider", iamGetOIDCProvider},
+ { "iam:ListOIDCProviders", iamListOIDCProviders},
+ { "iam:TagRole", iamTagRole},
+ { "iam:ListRoleTags", iamListRoleTags},
+ { "iam:UntagRole", iamUntagRole},
+ { "iam:UpdateRole", iamUpdateRole},
+ { "sts:AssumeRole", stsAssumeRole},
+ { "sts:AssumeRoleWithWebIdentity", stsAssumeRoleWithWebIdentity},
+ { "sts:GetSessionToken", stsGetSessionToken},
+ { "sts:TagSession", stsTagSession},
+};
+
+struct PolicyParser;
+
+const Keyword top[1]{{"<Top>", TokenKind::pseudo, TokenID::Top, 0, false,
+ false}};
+const Keyword cond_key[1]{{"<Condition Key>", TokenKind::cond_key,
+ TokenID::CondKey, 0, true, false}};
+
+struct ParseState {
+ PolicyParser* pp;
+ const Keyword* w;
+
+ bool arraying = false;
+ bool objecting = false;
+ bool cond_ifexists = false;
+
+ void reset();
+
+ void annotate(std::string&& a);
+
+ boost::optional<Principal> parse_principal(string&& s, string* errmsg);
+
+ ParseState(PolicyParser* pp, const Keyword* w)
+ : pp(pp), w(w) {}
+
+ bool obj_start();
+
+ bool obj_end();
+
+ bool array_start() {
+ if (w->arrayable && !arraying) {
+ arraying = true;
+ return true;
+ }
+ annotate(fmt::format("`{}` does not take array.",
+ w->name));
+ return false;
+ }
+
+ bool array_end();
+
+ bool key(const char* s, size_t l);
+ bool do_string(CephContext* cct, const char* s, size_t l);
+ bool number(const char* str, size_t l);
+};
+
+// If this confuses you, look up the Curiously Recurring Template Pattern
+struct PolicyParser : public BaseReaderHandler<UTF8<>, PolicyParser> {
+ keyword_hash tokens;
+ std::vector<ParseState> s;
+ CephContext* cct;
+ const string& tenant;
+ Policy& policy;
+ uint32_t v = 0;
+
+ const bool reject_invalid_principals;
+
+ uint32_t seen = 0;
+
+ std::string annotation{"No error?"};
+
+ uint32_t dex(TokenID in) const {
+ switch (in) {
+ case TokenID::Version:
+ return 0x1;
+ case TokenID::Id:
+ return 0x2;
+ case TokenID::Statement:
+ return 0x4;
+ case TokenID::Sid:
+ return 0x8;
+ case TokenID::Effect:
+ return 0x10;
+ case TokenID::Principal:
+ return 0x20;
+ case TokenID::NotPrincipal:
+ return 0x40;
+ case TokenID::Action:
+ return 0x80;
+ case TokenID::NotAction:
+ return 0x100;
+ case TokenID::Resource:
+ return 0x200;
+ case TokenID::NotResource:
+ return 0x400;
+ case TokenID::Condition:
+ return 0x800;
+ case TokenID::AWS:
+ return 0x1000;
+ case TokenID::Federated:
+ return 0x2000;
+ case TokenID::Service:
+ return 0x4000;
+ case TokenID::CanonicalUser:
+ return 0x8000;
+ default:
+ ceph_abort();
+ }
+ }
+ bool test(TokenID in) {
+ return seen & dex(in);
+ }
+ void set(TokenID in) {
+ seen |= dex(in);
+ if (dex(in) & (dex(TokenID::Sid) | dex(TokenID::Effect) |
+ dex(TokenID::Principal) | dex(TokenID::NotPrincipal) |
+ dex(TokenID::Action) | dex(TokenID::NotAction) |
+ dex(TokenID::Resource) | dex(TokenID::NotResource) |
+ dex(TokenID::Condition) | dex(TokenID::AWS) |
+ dex(TokenID::Federated) | dex(TokenID::Service) |
+ dex(TokenID::CanonicalUser))) {
+ v |= dex(in);
+ }
+ }
+ void set(std::initializer_list<TokenID> l) {
+ for (auto in : l) {
+ seen |= dex(in);
+ if (dex(in) & (dex(TokenID::Sid) | dex(TokenID::Effect) |
+ dex(TokenID::Principal) | dex(TokenID::NotPrincipal) |
+ dex(TokenID::Action) | dex(TokenID::NotAction) |
+ dex(TokenID::Resource) | dex(TokenID::NotResource) |
+ dex(TokenID::Condition) | dex(TokenID::AWS) |
+ dex(TokenID::Federated) | dex(TokenID::Service) |
+ dex(TokenID::CanonicalUser))) {
+ v |= dex(in);
+ }
+ }
+ }
+ void reset(TokenID in) {
+ seen &= ~dex(in);
+ if (dex(in) & (dex(TokenID::Sid) | dex(TokenID::Effect) |
+ dex(TokenID::Principal) | dex(TokenID::NotPrincipal) |
+ dex(TokenID::Action) | dex(TokenID::NotAction) |
+ dex(TokenID::Resource) | dex(TokenID::NotResource) |
+ dex(TokenID::Condition) | dex(TokenID::AWS) |
+ dex(TokenID::Federated) | dex(TokenID::Service) |
+ dex(TokenID::CanonicalUser))) {
+ v &= ~dex(in);
+ }
+ }
+ void reset(std::initializer_list<TokenID> l) {
+ for (auto in : l) {
+ seen &= ~dex(in);
+ if (dex(in) & (dex(TokenID::Sid) | dex(TokenID::Effect) |
+ dex(TokenID::Principal) | dex(TokenID::NotPrincipal) |
+ dex(TokenID::Action) | dex(TokenID::NotAction) |
+ dex(TokenID::Resource) | dex(TokenID::NotResource) |
+ dex(TokenID::Condition) | dex(TokenID::AWS) |
+ dex(TokenID::Federated) | dex(TokenID::Service) |
+ dex(TokenID::CanonicalUser))) {
+ v &= ~dex(in);
+ }
+ }
+ }
+ void reset(uint32_t& v) {
+ seen &= ~v;
+ v = 0;
+ }
+
+ PolicyParser(CephContext* cct, const string& tenant, Policy& policy,
+ bool reject_invalid_principals)
+ : cct(cct), tenant(tenant), policy(policy),
+ reject_invalid_principals(reject_invalid_principals) {}
+ PolicyParser(const PolicyParser& policy) = delete;
+
+ bool StartObject() {
+ if (s.empty()) {
+ s.push_back({this, top});
+ s.back().objecting = true;
+ return true;
+ }
+
+ return s.back().obj_start();
+ }
+ bool EndObject(SizeType memberCount) {
+ if (s.empty()) {
+ annotation = "Attempt to end unopened object at top level.";
+ return false;
+ }
+ return s.back().obj_end();
+ }
+ bool Key(const char* str, SizeType length, bool copy) {
+ if (s.empty()) {
+ annotation = "Key not allowed at top level.";
+ return false;
+ }
+ return s.back().key(str, length);
+ }
+
+ bool String(const char* str, SizeType length, bool copy) {
+ if (s.empty()) {
+ annotation = "String not allowed at top level.";
+ return false;
+ }
+ return s.back().do_string(cct, str, length);
+ }
+ bool RawNumber(const char* str, SizeType length, bool copy) {
+ if (s.empty()) {
+ annotation = "Number not allowed at top level.";
+ return false;
+ }
+
+ return s.back().number(str, length);
+ }
+ bool StartArray() {
+ if (s.empty()) {
+ annotation = "Array not allowed at top level.";
+ return false;
+ }
+
+ return s.back().array_start();
+ }
+ bool EndArray(SizeType) {
+ if (s.empty()) {
+ return false;
+ }
+
+ return s.back().array_end();
+ }
+
+ bool Default() {
+ return false;
+ }
+};
+
+
+// I really despise this misfeature of C++.
+//
+void ParseState::annotate(std::string&& a) {
+ pp->annotation = std::move(a);
+}
+
+bool ParseState::obj_end() {
+ if (objecting) {
+ objecting = false;
+ if (!arraying) {
+ pp->s.pop_back();
+ } else {
+ reset();
+ }
+ return true;
+ }
+ annotate(
+ fmt::format("Attempt to end unopened object on keyword `{}`.",
+ w->name));
+ return false;
+}
+
+bool ParseState::key(const char* s, size_t l) {
+ auto token_len = l;
+ bool ifexists = false;
+ if (w->id == TokenID::Condition && w->kind == TokenKind::statement) {
+ static constexpr char IfExists[] = "IfExists";
+ if (boost::algorithm::ends_with(std::string_view{s, l}, IfExists)) {
+ ifexists = true;
+ token_len -= sizeof(IfExists)-1;
+ }
+ }
+ auto k = pp->tokens.lookup(s, token_len);
+
+ if (!k) {
+ if (w->kind == TokenKind::cond_op) {
+ auto id = w->id;
+ auto& t = pp->policy.statements.back();
+ auto c_ife = cond_ifexists;
+ pp->s.emplace_back(pp, cond_key);
+ t.conditions.emplace_back(id, s, l, c_ife);
+ return true;
+ } else {
+ annotate(fmt::format("Unknown key `{}`.", std::string_view{s, token_len}));
+ return false;
+ }
+ }
+
+ // If the token we're going with belongs within the condition at the
+ // top of the stack and we haven't already encountered it, push it
+ // on the stack
+ // Top
+ if ((((w->id == TokenID::Top) && (k->kind == TokenKind::top)) ||
+ // Statement
+ ((w->id == TokenID::Statement) && (k->kind == TokenKind::statement)) ||
+
+ /// Principal
+ ((w->id == TokenID::Principal || w->id == TokenID::NotPrincipal) &&
+ (k->kind == TokenKind::princ_type))) &&
+
+ // Check that it hasn't been encountered. Note that this
+ // conjoins with the run of disjunctions above.
+ !pp->test(k->id)) {
+ pp->set(k->id);
+ pp->s.emplace_back(pp, k);
+ return true;
+ } else if ((w->id == TokenID::Condition) &&
+ (k->kind == TokenKind::cond_op)) {
+ pp->s.emplace_back(pp, k);
+ pp->s.back().cond_ifexists = ifexists;
+ return true;
+ }
+ annotate(fmt::format("Token `{}` is not allowed in the context of `{}`.",
+ k->name, w->name));
+ return false;
+}
+
+// I should just rewrite a few helper functions to use iterators,
+// which will make all of this ever so much nicer.
+boost::optional<Principal> ParseState::parse_principal(string&& s,
+ string* errmsg) {
+ if ((w->id == TokenID::AWS) && (s == "*")) {
+ // Wildcard!
+ return Principal::wildcard();
+ } else if (w->id == TokenID::CanonicalUser) {
+ // Do nothing for now.
+ if (errmsg)
+ *errmsg = "RGW does not support canonical users.";
+ return boost::none;
+ } else if (w->id == TokenID::AWS || w->id == TokenID::Federated) {
+ // AWS and Federated ARNs
+ if (auto a = ARN::parse(s)) {
+ if (a->resource == "root") {
+ return Principal::tenant(std::move(a->account));
+ }
+
+ static const char rx_str[] = "([^/]*)/(.*)";
+ static const regex rx(rx_str, sizeof(rx_str) - 1,
+ std::regex_constants::ECMAScript |
+ std::regex_constants::optimize);
+ smatch match;
+ if (regex_match(a->resource, match, rx) && match.size() == 3) {
+ if (match[1] == "user") {
+ return Principal::user(std::move(a->account),
+ match[2]);
+ }
+
+ if (match[1] == "role") {
+ return Principal::role(std::move(a->account),
+ match[2]);
+ }
+
+ if (match[1] == "oidc-provider") {
+ return Principal::oidc_provider(std::move(match[2]));
+ }
+ if (match[1] == "assumed-role") {
+ return Principal::assumed_role(std::move(a->account), match[2]);
+ }
+ }
+ } else if (std::none_of(s.begin(), s.end(),
+ [](const char& c) {
+ return (c == ':') || (c == '/');
+ })) {
+ // Since tenants are simply prefixes, there's no really good
+ // way to see if one exists or not. So we return the thing and
+ // let them try to match against it.
+ return Principal::tenant(std::move(s));
+ }
+ if (errmsg)
+ *errmsg =
+ fmt::format(
+ "`{}` is not a supported AWS or Federated ARN. Supported ARNs are "
+ "forms like: "
+ "`arn:aws:iam::tenant:root` or a bare tenant name for a tenant, "
+ "`arn:aws:iam::tenant:role/role-name` for a role, "
+ "`arn:aws:sts::tenant:assumed-role/role-name/role-session-name` "
+ "for an assumed role, "
+ "`arn:aws:iam::tenant:user/user-name` for a user, "
+ "`arn:aws:iam::tenant:oidc-provider/idp-url` for OIDC.", s);
+ }
+
+ if (errmsg)
+ *errmsg = fmt::format("RGW does not support principals of type `{}`.",
+ w->name);
+ return boost::none;
+}
+
+bool ParseState::do_string(CephContext* cct, const char* s, size_t l) {
+ auto k = pp->tokens.lookup(s, l);
+ Policy& p = pp->policy;
+ bool is_action = false;
+ bool is_validaction = false;
+ Statement* t = p.statements.empty() ? nullptr : &(p.statements.back());
+
+ // Top level!
+ if (w->id == TokenID::Version) {
+ if (k && k->kind == TokenKind::version_key) {
+ p.version = static_cast<Version>(k->specific);
+ } else {
+ annotate(
+ fmt::format("`{}` is not a valid version. Valid versions are "
+ "`2008-10-17` and `2012-10-17`.",
+ std::string_view{s, l}));
+
+ return false;
+ }
+ } else if (w->id == TokenID::Id) {
+ p.id = string(s, l);
+
+ // Statement
+
+ } else if (w->id == TokenID::Sid) {
+ t->sid.emplace(s, l);
+ } else if (w->id == TokenID::Effect) {
+ if (k && k->kind == TokenKind::effect_key) {
+ t->effect = static_cast<Effect>(k->specific);
+ } else {
+ annotate(fmt::format("`{}` is not a valid effect.",
+ std::string_view{s, l}));
+ return false;
+ }
+ } else if (w->id == TokenID::Principal && s && *s == '*') {
+ t->princ.emplace(Principal::wildcard());
+ } else if (w->id == TokenID::NotPrincipal && s && *s == '*') {
+ t->noprinc.emplace(Principal::wildcard());
+ } else if ((w->id == TokenID::Action) ||
+ (w->id == TokenID::NotAction)) {
+ is_action = true;
+ if (*s == '*') {
+ is_validaction = true;
+ (w->id == TokenID::Action ?
+ t->action = allValue : t->notaction = allValue);
+ } else {
+ for (auto& p : actpairs) {
+ if (match_policy({s, l}, p.name, MATCH_POLICY_ACTION)) {
+ is_validaction = true;
+ (w->id == TokenID::Action ? t->action[p.bit] = 1 : t->notaction[p.bit] = 1);
+ }
+ if ((t->action & s3AllValue) == s3AllValue) {
+ t->action[s3All] = 1;
+ }
+ if ((t->notaction & s3AllValue) == s3AllValue) {
+ t->notaction[s3All] = 1;
+ }
+ if ((t->action & iamAllValue) == iamAllValue) {
+ t->action[iamAll] = 1;
+ }
+ if ((t->notaction & iamAllValue) == iamAllValue) {
+ t->notaction[iamAll] = 1;
+ }
+ if ((t->action & stsAllValue) == stsAllValue) {
+ t->action[stsAll] = 1;
+ }
+ if ((t->notaction & stsAllValue) == stsAllValue) {
+ t->notaction[stsAll] = 1;
+ }
+ }
+ }
+ } else if (w->id == TokenID::Resource || w->id == TokenID::NotResource) {
+ auto a = ARN::parse({s, l}, true);
+ if (!a) {
+ annotate(
+ fmt::format("`{}` is not a valid ARN. Resource ARNs should have a "
+ "format like `arn:aws:s3::tenant:resource' or "
+ "`arn:aws:s3:::resource`.",
+ std::string_view{s, l}));
+ return false;
+ }
+ // You can't specify resources for someone ELSE'S account.
+ if (a->account.empty() || a->account == pp->tenant ||
+ a->account == "*") {
+ if (a->account.empty() || a->account == "*")
+ a->account = pp->tenant;
+ (w->id == TokenID::Resource ? t->resource : t->notresource)
+ .emplace(std::move(*a));
+ } else {
+ annotate(fmt::format("Policy owned by tenant `{}` cannot grant access to "
+ "resource owned by tenant `{}`.",
+ pp->tenant, a->account));
+ return false;
+ }
+ } else if (w->kind == TokenKind::cond_key) {
+ auto& t = pp->policy.statements.back();
+ if (l > 0 && *s == '$') {
+ if (l >= 2 && *(s+1) == '{') {
+ if (l > 0 && *(s+l-1) == '}') {
+ t.conditions.back().isruntime = true;
+ } else {
+ annotate(fmt::format("Invalid interpolation `{}`.",
+ std::string_view{s, l}));
+ return false;
+ }
+ } else {
+ annotate(fmt::format("Invalid interpolation `{}`.",
+ std::string_view{s, l}));
+ return false;
+ }
+ }
+ t.conditions.back().vals.emplace_back(s, l);
+
+ // Principals
+
+ } else if (w->kind == TokenKind::princ_type) {
+ if (pp->s.size() <= 1) {
+ annotate(fmt::format("Principal isn't allowed at top level."));
+ return false;
+ }
+ auto& pri = pp->s[pp->s.size() - 2].w->id == TokenID::Principal ?
+ t->princ : t->noprinc;
+
+ string errmsg;
+ if (auto o = parse_principal({s, l}, &errmsg)) {
+ pri.emplace(std::move(*o));
+ } else if (pp->reject_invalid_principals) {
+ annotate(std::move(errmsg));
+ return false;
+ } else {
+ ldout(cct, 0) << "Ignored principal `" << std::string_view{s, l} << "`: "
+ << errmsg << dendl;
+ }
+ } else {
+ // Failure
+ annotate(fmt::format("`{}` is not valid in the context of `{}`.",
+ std::string_view{s, l}, w->name));
+ return false;
+ }
+
+ if (!arraying) {
+ pp->s.pop_back();
+ }
+
+ if (is_action && !is_validaction) {
+ annotate(fmt::format("`{}` is not a valid action.",
+ std::string_view{s, l}));
+ return false;
+ }
+
+ return true;
+}
+
+bool ParseState::number(const char* s, size_t l) {
+ // Top level!
+ if (w->kind == TokenKind::cond_key) {
+ auto& t = pp->policy.statements.back();
+ t.conditions.back().vals.emplace_back(s, l);
+ } else {
+ // Failure
+ annotate("Numbers are not allowed outside condition arguments.");
+ return false;
+ }
+
+ if (!arraying) {
+ pp->s.pop_back();
+ }
+
+ return true;
+}
+
+void ParseState::reset() {
+ pp->reset(pp->v);
+}
+
+bool ParseState::obj_start() {
+ if (w->objectable && !objecting) {
+ objecting = true;
+ if (w->id == TokenID::Statement) {
+ pp->policy.statements.emplace_back();
+ }
+
+ return true;
+ }
+
+ annotate(fmt::format("The {} keyword cannot introduce an object.",
+ w->name));
+
+ return false;
+}
+
+
+bool ParseState::array_end() {
+ if (arraying && !objecting) {
+ pp->s.pop_back();
+ return true;
+ }
+
+ annotate("Attempt to close unopened array.");
+ return false;
+}
+
+ostream& operator <<(ostream& m, const MaskedIP& ip) {
+ // I have a theory about why std::bitset is the way it is.
+ if (ip.v6) {
+ for (int i = 7; i >= 0; --i) {
+ uint16_t hextet = 0;
+ for (int j = 15; j >= 0; --j) {
+ hextet |= (ip.addr[(i * 16) + j] << j);
+ }
+ m << hex << (unsigned int) hextet;
+ if (i != 0) {
+ m << ":";
+ }
+ }
+ } else {
+ // It involves Satan.
+ for (int i = 3; i >= 0; --i) {
+ uint8_t b = 0;
+ for (int j = 7; j >= 0; --j) {
+ b |= (ip.addr[(i * 8) + j] << j);
+ }
+ m << (unsigned int) b;
+ if (i != 0) {
+ m << ".";
+ }
+ }
+ }
+ m << "/" << dec << ip.prefix;
+ // It would explain a lot
+ return m;
+}
+
+bool Condition::eval(const Environment& env) const {
+ std::vector<std::string> runtime_vals;
+ auto i = env.find(key);
+ if (op == TokenID::Null) {
+ return i == env.end() ? true : false;
+ }
+
+ if (i == env.end()) {
+ if (op == TokenID::ForAllValuesStringEquals ||
+ op == TokenID::ForAllValuesStringEqualsIgnoreCase ||
+ op == TokenID::ForAllValuesStringLike) {
+ return true;
+ } else {
+ return ifexists;
+ }
+ }
+
+ if (isruntime) {
+ string k = vals.back();
+ k.erase(0,2); //erase $, {
+ k.erase(k.length() - 1, 1); //erase }
+ const auto& it = env.equal_range(k);
+ for (auto itr = it.first; itr != it.second; itr++) {
+ runtime_vals.emplace_back(itr->second);
+ }
+ }
+ const auto& s = i->second;
+
+ const auto& itr = env.equal_range(key);
+
+ switch (op) {
+ // String!
+ case TokenID::ForAnyValueStringEquals:
+ case TokenID::StringEquals:
+ return orrible(std::equal_to<std::string>(), itr, isruntime? runtime_vals : vals);
+
+ case TokenID::StringNotEquals:
+ return orrible(std::not_fn(std::equal_to<std::string>()),
+ itr, isruntime? runtime_vals : vals);
+
+ case TokenID::ForAnyValueStringEqualsIgnoreCase:
+ case TokenID::StringEqualsIgnoreCase:
+ return orrible(ci_equal_to(), itr, isruntime? runtime_vals : vals);
+
+ case TokenID::StringNotEqualsIgnoreCase:
+ return orrible(std::not_fn(ci_equal_to()), itr, isruntime? runtime_vals : vals);
+
+ case TokenID::ForAnyValueStringLike:
+ case TokenID::StringLike:
+ return orrible(string_like(), itr, isruntime? runtime_vals : vals);
+
+ case TokenID::StringNotLike:
+ return orrible(std::not_fn(string_like()), itr, isruntime? runtime_vals : vals);
+
+ case TokenID::ForAllValuesStringEquals:
+ return andible(std::equal_to<std::string>(), itr, isruntime? runtime_vals : vals);
+
+ case TokenID::ForAllValuesStringLike:
+ return andible(string_like(), itr, isruntime? runtime_vals : vals);
+
+ case TokenID::ForAllValuesStringEqualsIgnoreCase:
+ return andible(ci_equal_to(), itr, isruntime? runtime_vals : vals);
+
+ // Numeric
+ case TokenID::NumericEquals:
+ return shortible(std::equal_to<double>(), as_number, s, vals);
+
+ case TokenID::NumericNotEquals:
+ return shortible(std::not_fn(std::equal_to<double>()),
+ as_number, s, vals);
+
+
+ case TokenID::NumericLessThan:
+ return shortible(std::less<double>(), as_number, s, vals);
+
+
+ case TokenID::NumericLessThanEquals:
+ return shortible(std::less_equal<double>(), as_number, s, vals);
+
+ case TokenID::NumericGreaterThan:
+ return shortible(std::greater<double>(), as_number, s, vals);
+
+ case TokenID::NumericGreaterThanEquals:
+ return shortible(std::greater_equal<double>(), as_number, s, vals);
+
+ // Date!
+ case TokenID::DateEquals:
+ return shortible(std::equal_to<ceph::real_time>(), as_date, s, vals);
+
+ case TokenID::DateNotEquals:
+ return shortible(std::not_fn(std::equal_to<ceph::real_time>()),
+ as_date, s, vals);
+
+ case TokenID::DateLessThan:
+ return shortible(std::less<ceph::real_time>(), as_date, s, vals);
+
+
+ case TokenID::DateLessThanEquals:
+ return shortible(std::less_equal<ceph::real_time>(), as_date, s, vals);
+
+ case TokenID::DateGreaterThan:
+ return shortible(std::greater<ceph::real_time>(), as_date, s, vals);
+
+ case TokenID::DateGreaterThanEquals:
+ return shortible(std::greater_equal<ceph::real_time>(), as_date, s,
+ vals);
+
+ // Bool!
+ case TokenID::Bool:
+ return shortible(std::equal_to<bool>(), as_bool, s, vals);
+
+ // Binary!
+ case TokenID::BinaryEquals:
+ return shortible(std::equal_to<ceph::bufferlist>(), as_binary, s,
+ vals);
+
+ // IP Address!
+ case TokenID::IpAddress:
+ return shortible(std::equal_to<MaskedIP>(), as_network, s, vals);
+
+ case TokenID::NotIpAddress:
+ {
+ auto xc = as_network(s);
+ if (!xc) {
+ return false;
+ }
+
+ for (const string& d : vals) {
+ auto xd = as_network(d);
+ if (!xd) {
+ continue;
+ }
+
+ if (xc == xd) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+#if 0
+ // Amazon Resource Names! (Does S3 need this?)
+ TokenID::ArnEquals, TokenID::ArnNotEquals, TokenID::ArnLike,
+ TokenID::ArnNotLike,
+#endif
+
+ default:
+ return false;
+ }
+}
+
+boost::optional<MaskedIP> Condition::as_network(const string& s) {
+ MaskedIP m;
+ if (s.empty()) {
+ return boost::none;
+ }
+
+ m.v6 = (s.find(':') == string::npos) ? false : true;
+
+ auto slash = s.find('/');
+ if (slash == string::npos) {
+ m.prefix = m.v6 ? 128 : 32;
+ } else {
+ char* end = 0;
+ m.prefix = strtoul(s.data() + slash + 1, &end, 10);
+ if (*end != 0 || (m.v6 && m.prefix > 128) ||
+ (!m.v6 && m.prefix > 32)) {
+ return boost::none;
+ }
+ }
+
+ string t;
+ auto p = &s;
+
+ if (slash != string::npos) {
+ t.assign(s, 0, slash);
+ p = &t;
+ }
+
+ if (m.v6) {
+ struct in6_addr a;
+ if (inet_pton(AF_INET6, p->c_str(), static_cast<void*>(&a)) != 1) {
+ return boost::none;
+ }
+
+ m.addr |= Address(a.s6_addr[15]) << 0;
+ m.addr |= Address(a.s6_addr[14]) << 8;
+ m.addr |= Address(a.s6_addr[13]) << 16;
+ m.addr |= Address(a.s6_addr[12]) << 24;
+ m.addr |= Address(a.s6_addr[11]) << 32;
+ m.addr |= Address(a.s6_addr[10]) << 40;
+ m.addr |= Address(a.s6_addr[9]) << 48;
+ m.addr |= Address(a.s6_addr[8]) << 56;
+ m.addr |= Address(a.s6_addr[7]) << 64;
+ m.addr |= Address(a.s6_addr[6]) << 72;
+ m.addr |= Address(a.s6_addr[5]) << 80;
+ m.addr |= Address(a.s6_addr[4]) << 88;
+ m.addr |= Address(a.s6_addr[3]) << 96;
+ m.addr |= Address(a.s6_addr[2]) << 104;
+ m.addr |= Address(a.s6_addr[1]) << 112;
+ m.addr |= Address(a.s6_addr[0]) << 120;
+ } else {
+ struct in_addr a;
+ if (inet_pton(AF_INET, p->c_str(), static_cast<void*>(&a)) != 1) {
+ return boost::none;
+ }
+
+ m.addr = ntohl(a.s_addr);
+ }
+
+ return m;
+}
+
+namespace {
+const char* condop_string(const TokenID t) {
+ switch (t) {
+ case TokenID::StringEquals:
+ return "StringEquals";
+
+ case TokenID::StringNotEquals:
+ return "StringNotEquals";
+
+ case TokenID::StringEqualsIgnoreCase:
+ return "StringEqualsIgnoreCase";
+
+ case TokenID::StringNotEqualsIgnoreCase:
+ return "StringNotEqualsIgnoreCase";
+
+ case TokenID::StringLike:
+ return "StringLike";
+
+ case TokenID::StringNotLike:
+ return "StringNotLike";
+
+ // Numeric!
+ case TokenID::NumericEquals:
+ return "NumericEquals";
+
+ case TokenID::NumericNotEquals:
+ return "NumericNotEquals";
+
+ case TokenID::NumericLessThan:
+ return "NumericLessThan";
+
+ case TokenID::NumericLessThanEquals:
+ return "NumericLessThanEquals";
+
+ case TokenID::NumericGreaterThan:
+ return "NumericGreaterThan";
+
+ case TokenID::NumericGreaterThanEquals:
+ return "NumericGreaterThanEquals";
+
+ case TokenID::DateEquals:
+ return "DateEquals";
+
+ case TokenID::DateNotEquals:
+ return "DateNotEquals";
+
+ case TokenID::DateLessThan:
+ return "DateLessThan";
+
+ case TokenID::DateLessThanEquals:
+ return "DateLessThanEquals";
+
+ case TokenID::DateGreaterThan:
+ return "DateGreaterThan";
+
+ case TokenID::DateGreaterThanEquals:
+ return "DateGreaterThanEquals";
+
+ case TokenID::Bool:
+ return "Bool";
+
+ case TokenID::BinaryEquals:
+ return "BinaryEquals";
+
+ case TokenID::IpAddress:
+ return "IpAddress";
+
+ case TokenID::NotIpAddress:
+ return "NotIpAddress";
+
+ case TokenID::ArnEquals:
+ return "ArnEquals";
+
+ case TokenID::ArnNotEquals:
+ return "ArnNotEquals";
+
+ case TokenID::ArnLike:
+ return "ArnLike";
+
+ case TokenID::ArnNotLike:
+ return "ArnNotLike";
+
+ case TokenID::Null:
+ return "Null";
+
+ default:
+ return "InvalidConditionOperator";
+ }
+}
+
+template<typename Iterator>
+ostream& print_array(ostream& m, Iterator begin, Iterator end) {
+ if (begin == end) {
+ m << "[]";
+ } else {
+ m << "[ ";
+ std::copy(begin, end, std::experimental::make_ostream_joiner(m, ", "));
+ m << " ]";
+ }
+ return m;
+}
+
+template<typename Iterator>
+ostream& print_dict(ostream& m, Iterator begin, Iterator end) {
+ m << "{ ";
+ std::copy(begin, end, std::experimental::make_ostream_joiner(m, ", "));
+ m << " }";
+ return m;
+}
+
+}
+
+ostream& operator <<(ostream& m, const Condition& c) {
+ m << condop_string(c.op);
+ if (c.ifexists) {
+ m << "IfExists";
+ }
+ m << ": { " << c.key;
+ print_array(m, c.vals.cbegin(), c.vals.cend());
+ return m << " }";
+}
+
+Effect Statement::eval(const Environment& e,
+ boost::optional<const rgw::auth::Identity&> ida,
+ uint64_t act, boost::optional<const ARN&> res, boost::optional<PolicyPrincipal&> princ_type) const {
+
+ if (eval_principal(e, ida, princ_type) == Effect::Deny) {
+ return Effect::Pass;
+ }
+
+ if (res && resource.empty() && notresource.empty()) {
+ return Effect::Pass;
+ }
+ if (!res && (!resource.empty() || !notresource.empty())) {
+ return Effect::Pass;
+ }
+ if (!resource.empty() && res) {
+ if (!std::any_of(resource.begin(), resource.end(),
+ [&res](const ARN& pattern) {
+ return pattern.match(*res);
+ })) {
+ return Effect::Pass;
+ }
+ } else if (!notresource.empty() && res) {
+ if (std::any_of(notresource.begin(), notresource.end(),
+ [&res](const ARN& pattern) {
+ return pattern.match(*res);
+ })) {
+ return Effect::Pass;
+ }
+ }
+
+ if (!(action[act] == 1) || (notaction[act] == 1)) {
+ return Effect::Pass;
+ }
+
+ if (std::all_of(conditions.begin(),
+ conditions.end(),
+ [&e](const Condition& c) { return c.eval(e);})) {
+ return effect;
+ }
+
+ return Effect::Pass;
+}
+
+Effect Statement::eval_principal(const Environment& e,
+ boost::optional<const rgw::auth::Identity&> ida, boost::optional<PolicyPrincipal&> princ_type) const {
+ if (princ_type) {
+ *princ_type = PolicyPrincipal::Other;
+ }
+ if (ida) {
+ if (princ.empty() && noprinc.empty()) {
+ return Effect::Deny;
+ }
+ if (ida->get_identity_type() != TYPE_ROLE && !princ.empty() && !ida->is_identity(princ)) {
+ return Effect::Deny;
+ }
+ if (ida->get_identity_type() == TYPE_ROLE && !princ.empty()) {
+ bool princ_matched = false;
+ for (auto p : princ) { // Check each principal to determine the type of the one that has matched
+ boost::container::flat_set<Principal> id;
+ id.insert(p);
+ if (ida->is_identity(id)) {
+ if (p.is_assumed_role() || p.is_user()) {
+ if (princ_type) *princ_type = PolicyPrincipal::Session;
+ } else {
+ if (princ_type) *princ_type = PolicyPrincipal::Role;
+ }
+ princ_matched = true;
+ }
+ }
+ if (!princ_matched) {
+ return Effect::Deny;
+ }
+ } else if (!noprinc.empty() && ida->is_identity(noprinc)) {
+ return Effect::Deny;
+ }
+ }
+ return Effect::Allow;
+}
+
+Effect Statement::eval_conditions(const Environment& e) const {
+ if (std::all_of(conditions.begin(),
+ conditions.end(),
+ [&e](const Condition& c) { return c.eval(e);})) {
+ return Effect::Allow;
+ }
+ return Effect::Deny;
+}
+
+namespace {
+const char* action_bit_string(uint64_t action) {
+ switch (action) {
+ case s3GetObject:
+ return "s3:GetObject";
+
+ case s3GetObjectVersion:
+ return "s3:GetObjectVersion";
+
+ case s3PutObject:
+ return "s3:PutObject";
+
+ case s3GetObjectAcl:
+ return "s3:GetObjectAcl";
+
+ case s3GetObjectVersionAcl:
+ return "s3:GetObjectVersionAcl";
+
+ case s3PutObjectAcl:
+ return "s3:PutObjectAcl";
+
+ case s3PutObjectVersionAcl:
+ return "s3:PutObjectVersionAcl";
+
+ case s3DeleteObject:
+ return "s3:DeleteObject";
+
+ case s3DeleteObjectVersion:
+ return "s3:DeleteObjectVersion";
+
+ case s3ListMultipartUploadParts:
+ return "s3:ListMultipartUploadParts";
+
+ case s3AbortMultipartUpload:
+ return "s3:AbortMultipartUpload";
+
+ case s3GetObjectTorrent:
+ return "s3:GetObjectTorrent";
+
+ case s3GetObjectVersionTorrent:
+ return "s3:GetObjectVersionTorrent";
+
+ case s3RestoreObject:
+ return "s3:RestoreObject";
+
+ case s3CreateBucket:
+ return "s3:CreateBucket";
+
+ case s3DeleteBucket:
+ return "s3:DeleteBucket";
+
+ case s3ListBucket:
+ return "s3:ListBucket";
+
+ case s3ListBucketVersions:
+ return "s3:ListBucketVersions";
+ case s3ListAllMyBuckets:
+ return "s3:ListAllMyBuckets";
+
+ case s3ListBucketMultipartUploads:
+ return "s3:ListBucketMultipartUploads";
+
+ case s3GetAccelerateConfiguration:
+ return "s3:GetAccelerateConfiguration";
+
+ case s3PutAccelerateConfiguration:
+ return "s3:PutAccelerateConfiguration";
+
+ case s3GetBucketAcl:
+ return "s3:GetBucketAcl";
+
+ case s3PutBucketAcl:
+ return "s3:PutBucketAcl";
+
+ case s3GetBucketCORS:
+ return "s3:GetBucketCORS";
+
+ case s3PutBucketCORS:
+ return "s3:PutBucketCORS";
+
+ case s3GetBucketEncryption:
+ return "s3:GetBucketEncryption";
+
+ case s3PutBucketEncryption:
+ return "s3:PutBucketEncryption";
+
+ case s3GetBucketVersioning:
+ return "s3:GetBucketVersioning";
+
+ case s3PutBucketVersioning:
+ return "s3:PutBucketVersioning";
+
+ case s3GetBucketRequestPayment:
+ return "s3:GetBucketRequestPayment";
+
+ case s3PutBucketRequestPayment:
+ return "s3:PutBucketRequestPayment";
+
+ case s3GetBucketLocation:
+ return "s3:GetBucketLocation";
+
+ case s3GetBucketPolicy:
+ return "s3:GetBucketPolicy";
+
+ case s3DeleteBucketPolicy:
+ return "s3:DeleteBucketPolicy";
+
+ case s3PutBucketPolicy:
+ return "s3:PutBucketPolicy";
+
+ case s3GetBucketNotification:
+ return "s3:GetBucketNotification";
+
+ case s3PutBucketNotification:
+ return "s3:PutBucketNotification";
+
+ case s3GetBucketLogging:
+ return "s3:GetBucketLogging";
+
+ case s3PutBucketLogging:
+ return "s3:PutBucketLogging";
+
+ case s3GetBucketTagging:
+ return "s3:GetBucketTagging";
+
+ case s3PutBucketTagging:
+ return "s3:PutBucketTagging";
+
+ case s3GetBucketWebsite:
+ return "s3:GetBucketWebsite";
+
+ case s3PutBucketWebsite:
+ return "s3:PutBucketWebsite";
+
+ case s3DeleteBucketWebsite:
+ return "s3:DeleteBucketWebsite";
+
+ case s3GetLifecycleConfiguration:
+ return "s3:GetLifecycleConfiguration";
+
+ case s3PutLifecycleConfiguration:
+ return "s3:PutLifecycleConfiguration";
+
+ case s3PutReplicationConfiguration:
+ return "s3:PutReplicationConfiguration";
+
+ case s3GetReplicationConfiguration:
+ return "s3:GetReplicationConfiguration";
+
+ case s3DeleteReplicationConfiguration:
+ return "s3:DeleteReplicationConfiguration";
+
+ case s3PutObjectTagging:
+ return "s3:PutObjectTagging";
+
+ case s3PutObjectVersionTagging:
+ return "s3:PutObjectVersionTagging";
+
+ case s3GetObjectTagging:
+ return "s3:GetObjectTagging";
+
+ case s3GetObjectVersionTagging:
+ return "s3:GetObjectVersionTagging";
+
+ case s3DeleteObjectTagging:
+ return "s3:DeleteObjectTagging";
+
+ case s3DeleteObjectVersionTagging:
+ return "s3:DeleteObjectVersionTagging";
+
+ case s3PutBucketObjectLockConfiguration:
+ return "s3:PutBucketObjectLockConfiguration";
+
+ case s3GetBucketObjectLockConfiguration:
+ return "s3:GetBucketObjectLockConfiguration";
+
+ case s3PutObjectRetention:
+ return "s3:PutObjectRetention";
+
+ case s3GetObjectRetention:
+ return "s3:GetObjectRetention";
+
+ case s3PutObjectLegalHold:
+ return "s3:PutObjectLegalHold";
+
+ case s3GetObjectLegalHold:
+ return "s3:GetObjectLegalHold";
+
+ case s3BypassGovernanceRetention:
+ return "s3:BypassGovernanceRetention";
+
+ case iamPutUserPolicy:
+ return "iam:PutUserPolicy";
+
+ case iamGetUserPolicy:
+ return "iam:GetUserPolicy";
+
+ case iamListUserPolicies:
+ return "iam:ListUserPolicies";
+
+ case iamDeleteUserPolicy:
+ return "iam:DeleteUserPolicy";
+
+ case iamCreateRole:
+ return "iam:CreateRole";
+
+ case iamDeleteRole:
+ return "iam:DeleteRole";
+
+ case iamGetRole:
+ return "iam:GetRole";
+
+ case iamModifyRoleTrustPolicy:
+ return "iam:ModifyRoleTrustPolicy";
+
+ case iamListRoles:
+ return "iam:ListRoles";
+
+ case iamPutRolePolicy:
+ return "iam:PutRolePolicy";
+
+ case iamGetRolePolicy:
+ return "iam:GetRolePolicy";
+
+ case iamListRolePolicies:
+ return "iam:ListRolePolicies";
+
+ case iamDeleteRolePolicy:
+ return "iam:DeleteRolePolicy";
+
+ case iamCreateOIDCProvider:
+ return "iam:CreateOIDCProvider";
+
+ case iamDeleteOIDCProvider:
+ return "iam:DeleteOIDCProvider";
+
+ case iamGetOIDCProvider:
+ return "iam:GetOIDCProvider";
+
+ case iamListOIDCProviders:
+ return "iam:ListOIDCProviders";
+
+ case iamTagRole:
+ return "iam:TagRole";
+
+ case iamListRoleTags:
+ return "iam:ListRoleTags";
+
+ case iamUntagRole:
+ return "iam:UntagRole";
+
+ case iamUpdateRole:
+ return "iam:UpdateRole";
+
+ case stsAssumeRole:
+ return "sts:AssumeRole";
+
+ case stsAssumeRoleWithWebIdentity:
+ return "sts:AssumeRoleWithWebIdentity";
+
+ case stsGetSessionToken:
+ return "sts:GetSessionToken";
+
+ case stsTagSession:
+ return "sts:TagSession";
+ }
+ return "s3Invalid";
+}
+
+ostream& print_actions(ostream& m, const Action_t a) {
+ bool begun = false;
+ m << "[ ";
+ for (auto i = 0U; i < allCount; ++i) {
+ if (a[i] == 1) {
+ if (begun) {
+ m << ", ";
+ } else {
+ begun = true;
+ }
+ m << action_bit_string(i);
+ }
+ }
+ if (begun) {
+ m << " ]";
+ } else {
+ m << "]";
+ }
+ return m;
+}
+}
+
+ostream& operator <<(ostream& m, const Statement& s) {
+ m << "{ ";
+ if (s.sid) {
+ m << "Sid: " << *s.sid << ", ";
+ }
+ if (!s.princ.empty()) {
+ m << "Principal: ";
+ print_dict(m, s.princ.cbegin(), s.princ.cend());
+ m << ", ";
+ }
+ if (!s.noprinc.empty()) {
+ m << "NotPrincipal: ";
+ print_dict(m, s.noprinc.cbegin(), s.noprinc.cend());
+ m << ", ";
+ }
+
+ m << "Effect: " <<
+ (s.effect == Effect::Allow ?
+ (const char*) "Allow" :
+ (const char*) "Deny");
+
+ if (s.action.any() || s.notaction.any() || !s.resource.empty() ||
+ !s.notresource.empty() || !s.conditions.empty()) {
+ m << ", ";
+ }
+
+ if (s.action.any()) {
+ m << "Action: ";
+ print_actions(m, s.action);
+
+ if (s.notaction.any() || !s.resource.empty() ||
+ !s.notresource.empty() || !s.conditions.empty()) {
+ m << ", ";
+ }
+ }
+
+ if (s.notaction.any()) {
+ m << "NotAction: ";
+ print_actions(m, s.notaction);
+
+ if (!s.resource.empty() || !s.notresource.empty() ||
+ !s.conditions.empty()) {
+ m << ", ";
+ }
+ }
+
+ if (!s.resource.empty()) {
+ m << "Resource: ";
+ print_array(m, s.resource.cbegin(), s.resource.cend());
+
+ if (!s.notresource.empty() || !s.conditions.empty()) {
+ m << ", ";
+ }
+ }
+
+ if (!s.notresource.empty()) {
+ m << "NotResource: ";
+ print_array(m, s.notresource.cbegin(), s.notresource.cend());
+
+ if (!s.conditions.empty()) {
+ m << ", ";
+ }
+ }
+
+ if (!s.conditions.empty()) {
+ m << "Condition: ";
+ print_dict(m, s.conditions.cbegin(), s.conditions.cend());
+ }
+
+ return m << " }";
+}
+
// Construct a Policy by parsing the JSON policy document in _text.
// The parse keeps numbers as strings and tolerates comments; the
// PolicyParser SAX handler fills in this object as tokens arrive.
// reject_invalid_principals is forwarded to the parser — presumably it
// makes unknown principals a parse failure; confirm in PolicyParser.
// Throws PolicyParseException on any parse failure.
Policy::Policy(CephContext* cct, const string& tenant,
	       const bufferlist& _text,
	       bool reject_invalid_principals)
  : text(_text.to_str()) {
  StringStream ss(text.data());
  PolicyParser pp(cct, tenant, *this, reject_invalid_principals);
  auto pr = Reader{}.Parse<kParseNumbersAsStringsFlag |
			   kParseCommentsFlag>(ss, pp);
  if (!pr) {
    throw PolicyParseException(pr, pp.annotation);
  }
}
+
+Effect Policy::eval(const Environment& e,
+ boost::optional<const rgw::auth::Identity&> ida,
+ std::uint64_t action, boost::optional<const ARN&> resource,
+ boost::optional<PolicyPrincipal&> princ_type) const {
+ auto allowed = false;
+ for (auto& s : statements) {
+ auto g = s.eval(e, ida, action, resource, princ_type);
+ if (g == Effect::Deny) {
+ return g;
+ } else if (g == Effect::Allow) {
+ allowed = true;
+ }
+ }
+ return allowed ? Effect::Allow : Effect::Pass;
+}
+
+Effect Policy::eval_principal(const Environment& e,
+ boost::optional<const rgw::auth::Identity&> ida, boost::optional<PolicyPrincipal&> princ_type) const {
+ auto allowed = false;
+ for (auto& s : statements) {
+ auto g = s.eval_principal(e, ida, princ_type);
+ if (g == Effect::Deny) {
+ return g;
+ } else if (g == Effect::Allow) {
+ allowed = true;
+ }
+ }
+ return allowed ? Effect::Allow : Effect::Deny;
+}
+
+Effect Policy::eval_conditions(const Environment& e) const {
+ auto allowed = false;
+ for (auto& s : statements) {
+ auto g = s.eval_conditions(e);
+ if (g == Effect::Deny) {
+ return g;
+ } else if (g == Effect::Allow) {
+ allowed = true;
+ }
+ }
+ return allowed ? Effect::Allow : Effect::Deny;
+}
+
+ostream& operator <<(ostream& m, const Policy& p) {
+ m << "{ Version: "
+ << (p.version == Version::v2008_10_17 ? "2008-10-17" : "2012-10-17");
+
+ if (p.id || !p.statements.empty()) {
+ m << ", ";
+ }
+
+ if (p.id) {
+ m << "Id: " << *p.id;
+ if (!p.statements.empty()) {
+ m << ", ";
+ }
+ }
+
+ if (!p.statements.empty()) {
+ m << "Statements: ";
+ print_array(m, p.statements.cbegin(), p.statements.cend());
+ m << ", ";
+ }
+ return m << " }";
+}
+
// Representative request environment for an anonymous caller, used by
// IsPublicStatement below to probe whether a statement's conditions
// would admit the general public.
static const Environment iam_all_env = {
  {"aws:SourceIp","1.1.1.1"},
  {"aws:UserId","anonymous"},
  {"s3:x-amz-server-side-encryption-aws-kms-key-id","secret"}
};
+
// Predicate deciding whether a single statement grants "public" access:
// an Allow statement with a wildcard Principal whose conditions still
// pass under the representative anonymous environment, or an Allow
// statement whose NotPrincipal contains no wildcard (excluding only
// fixed identities still leaves everyone else allowed).
struct IsPublicStatement
{
  bool operator() (const Statement &s) const {
    if (s.effect == Effect::Allow) {
      for (const auto& p : s.princ) {
	if (p.is_wildcard()) {
	  // Conditions are evaluated against iam_all_env so that
	  // condition-restricted statements are not flagged as public.
	  return s.eval_conditions(iam_all_env) == Effect::Allow;
	}
      }
      // no princ should not contain fixed values
      return std::none_of(s.noprinc.begin(), s.noprinc.end(), [](const rgw::auth::Principal& p) {
	return p.is_wildcard();
      });
    }
    return false;
  }
};
+
+
+bool is_public(const Policy& p)
+{
+ return std::any_of(p.statements.begin(), p.statements.end(), IsPublicStatement());
+}
+
+} // namespace IAM
+} // namespace rgw
diff --git a/src/rgw/rgw_iam_policy.h b/src/rgw/rgw_iam_policy.h
new file mode 100644
index 000000000..c0a7e51b5
--- /dev/null
+++ b/src/rgw/rgw_iam_policy.h
@@ -0,0 +1,579 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <bitset>
+#include <chrono>
+#include <cstdint>
+#include <iostream>
+#include <string>
+#include <string_view>
+
+#include <boost/algorithm/string/predicate.hpp>
+#include <boost/container/flat_map.hpp>
+#include <boost/container/flat_set.hpp>
+#include <boost/optional.hpp>
+#include <boost/thread/shared_mutex.hpp>
+#include <boost/variant.hpp>
+
+#include <fmt/format.h>
+
+#include "common/ceph_time.h"
+#include "common/iso_8601.h"
+
+#include "rapidjson/error/error.h"
+#include "rapidjson/error/en.h"
+
+#include "rgw_acl.h"
+#include "rgw_basic_types.h"
+#include "rgw_iam_policy_keywords.h"
+#include "rgw_string.h"
+#include "rgw_arn.h"
+
+namespace rgw {
+namespace auth {
+class Identity;
+}
+}
+
+namespace rgw {
+namespace IAM {
+
+static constexpr std::uint64_t s3GetObject = 0;
+static constexpr std::uint64_t s3GetObjectVersion = 1;
+static constexpr std::uint64_t s3PutObject = 2;
+static constexpr std::uint64_t s3GetObjectAcl = 3;
+static constexpr std::uint64_t s3GetObjectVersionAcl = 4;
+static constexpr std::uint64_t s3PutObjectAcl = 5;
+static constexpr std::uint64_t s3PutObjectVersionAcl = 6;
+static constexpr std::uint64_t s3DeleteObject = 7;
+static constexpr std::uint64_t s3DeleteObjectVersion = 8;
+static constexpr std::uint64_t s3ListMultipartUploadParts = 9;
+static constexpr std::uint64_t s3AbortMultipartUpload = 10;
+static constexpr std::uint64_t s3GetObjectTorrent = 11;
+static constexpr std::uint64_t s3GetObjectVersionTorrent = 12;
+static constexpr std::uint64_t s3RestoreObject = 13;
+static constexpr std::uint64_t s3CreateBucket = 14;
+static constexpr std::uint64_t s3DeleteBucket = 15;
+static constexpr std::uint64_t s3ListBucket = 16;
+static constexpr std::uint64_t s3ListBucketVersions = 17;
+static constexpr std::uint64_t s3ListAllMyBuckets = 18;
+static constexpr std::uint64_t s3ListBucketMultipartUploads = 19;
+static constexpr std::uint64_t s3GetAccelerateConfiguration = 20;
+static constexpr std::uint64_t s3PutAccelerateConfiguration = 21;
+static constexpr std::uint64_t s3GetBucketAcl = 22;
+static constexpr std::uint64_t s3PutBucketAcl = 23;
+static constexpr std::uint64_t s3GetBucketCORS = 24;
+static constexpr std::uint64_t s3PutBucketCORS = 25;
+static constexpr std::uint64_t s3GetBucketVersioning = 26;
+static constexpr std::uint64_t s3PutBucketVersioning = 27;
+static constexpr std::uint64_t s3GetBucketRequestPayment = 28;
+static constexpr std::uint64_t s3PutBucketRequestPayment = 29;
+static constexpr std::uint64_t s3GetBucketLocation = 30;
+static constexpr std::uint64_t s3GetBucketPolicy = 31;
+static constexpr std::uint64_t s3DeleteBucketPolicy = 32;
+static constexpr std::uint64_t s3PutBucketPolicy = 33;
+static constexpr std::uint64_t s3GetBucketNotification = 34;
+static constexpr std::uint64_t s3PutBucketNotification = 35;
+static constexpr std::uint64_t s3GetBucketLogging = 36;
+static constexpr std::uint64_t s3PutBucketLogging = 37;
+static constexpr std::uint64_t s3GetBucketTagging = 38;
+static constexpr std::uint64_t s3PutBucketTagging = 39;
+static constexpr std::uint64_t s3GetBucketWebsite = 40;
+static constexpr std::uint64_t s3PutBucketWebsite = 41;
+static constexpr std::uint64_t s3DeleteBucketWebsite = 42;
+static constexpr std::uint64_t s3GetLifecycleConfiguration = 43;
+static constexpr std::uint64_t s3PutLifecycleConfiguration = 44;
+static constexpr std::uint64_t s3PutReplicationConfiguration = 45;
+static constexpr std::uint64_t s3GetReplicationConfiguration = 46;
+static constexpr std::uint64_t s3DeleteReplicationConfiguration = 47;
+static constexpr std::uint64_t s3GetObjectTagging = 48;
+static constexpr std::uint64_t s3PutObjectTagging = 49;
+static constexpr std::uint64_t s3DeleteObjectTagging = 50;
+static constexpr std::uint64_t s3GetObjectVersionTagging = 51;
+static constexpr std::uint64_t s3PutObjectVersionTagging = 52;
+static constexpr std::uint64_t s3DeleteObjectVersionTagging = 53;
+static constexpr std::uint64_t s3PutBucketObjectLockConfiguration = 54;
+static constexpr std::uint64_t s3GetBucketObjectLockConfiguration = 55;
+static constexpr std::uint64_t s3PutObjectRetention = 56;
+static constexpr std::uint64_t s3GetObjectRetention = 57;
+static constexpr std::uint64_t s3PutObjectLegalHold = 58;
+static constexpr std::uint64_t s3GetObjectLegalHold = 59;
+static constexpr std::uint64_t s3BypassGovernanceRetention = 60;
+static constexpr std::uint64_t s3GetBucketPolicyStatus = 61;
+static constexpr std::uint64_t s3PutPublicAccessBlock = 62;
+static constexpr std::uint64_t s3GetPublicAccessBlock = 63;
+static constexpr std::uint64_t s3DeletePublicAccessBlock = 64;
+static constexpr std::uint64_t s3GetBucketPublicAccessBlock = 65;
+static constexpr std::uint64_t s3PutBucketPublicAccessBlock = 66;
+static constexpr std::uint64_t s3DeleteBucketPublicAccessBlock = 67;
+static constexpr std::uint64_t s3GetBucketEncryption = 68;
+static constexpr std::uint64_t s3PutBucketEncryption = 69;
+static constexpr std::uint64_t s3All = 70;
+
+static constexpr std::uint64_t iamPutUserPolicy = s3All + 1;
+static constexpr std::uint64_t iamGetUserPolicy = s3All + 2;
+static constexpr std::uint64_t iamDeleteUserPolicy = s3All + 3;
+static constexpr std::uint64_t iamListUserPolicies = s3All + 4;
+static constexpr std::uint64_t iamCreateRole = s3All + 5;
+static constexpr std::uint64_t iamDeleteRole = s3All + 6;
+static constexpr std::uint64_t iamModifyRoleTrustPolicy = s3All + 7;
+static constexpr std::uint64_t iamGetRole = s3All + 8;
+static constexpr std::uint64_t iamListRoles = s3All + 9;
+static constexpr std::uint64_t iamPutRolePolicy = s3All + 10;
+static constexpr std::uint64_t iamGetRolePolicy = s3All + 11;
+static constexpr std::uint64_t iamListRolePolicies = s3All + 12;
+static constexpr std::uint64_t iamDeleteRolePolicy = s3All + 13;
+static constexpr std::uint64_t iamCreateOIDCProvider = s3All + 14;
+static constexpr std::uint64_t iamDeleteOIDCProvider = s3All + 15;
+static constexpr std::uint64_t iamGetOIDCProvider = s3All + 16;
+static constexpr std::uint64_t iamListOIDCProviders = s3All + 17;
+static constexpr std::uint64_t iamTagRole = s3All + 18;
+static constexpr std::uint64_t iamListRoleTags = s3All + 19;
+static constexpr std::uint64_t iamUntagRole = s3All + 20;
+static constexpr std::uint64_t iamUpdateRole = s3All + 21;
+static constexpr std::uint64_t iamAll = s3All + 22;
+
+static constexpr std::uint64_t stsAssumeRole = iamAll + 1;
+static constexpr std::uint64_t stsAssumeRoleWithWebIdentity = iamAll + 2;
+static constexpr std::uint64_t stsGetSessionToken = iamAll + 3;
+static constexpr std::uint64_t stsTagSession = iamAll + 4;
+static constexpr std::uint64_t stsAll = iamAll + 5;
+
+static constexpr std::uint64_t s3Count = s3All;
+static constexpr std::uint64_t allCount = stsAll + 1;
+
+using Action_t = std::bitset<allCount>;
+using NotAction_t = Action_t;
+
// Build a bitset with the low `s` bits set.  bitset's shift/logic
// operators are not constexpr before C++23, so the mask is assembled
// recursively from 63-bit chunks that fit in an unsigned long long.
template <size_t N>
constexpr std::bitset<N> make_bitmask(size_t s) {
  if (s < 64) {
    return std::bitset<N>((1ULL << s) - 1);
  }
  return std::bitset<N>((1ULL << 63) - 1) | (make_bitmask<N>(s - 63) << 63);
}
+
// Build a bitset with the contiguous bit range [start, end) set: a
// mask of (end - start) ones shifted up to position `start`.
template <size_t N>
constexpr std::bitset<N> set_cont_bits(size_t start, size_t end)
{
  return (make_bitmask<N>(end - start)) << start;
}
+
// Precomputed masks over the action bitset: no actions, every s3
// action, every iam action, every sts action, and all actions.
static const Action_t None(0);
static const Action_t s3AllValue = set_cont_bits<allCount>(0,s3All);
static const Action_t iamAllValue = set_cont_bits<allCount>(s3All+1,iamAll);
static const Action_t stsAllValue = set_cont_bits<allCount>(iamAll+1,stsAll);
static const Action_t allValue = set_cont_bits<allCount>(0,allCount);
+
+namespace {
+// Please update the table in doc/radosgw/s3/authentication.rst if you
+// modify this function.
+inline int op_to_perm(std::uint64_t op) {
+ switch (op) {
+ case s3GetObject:
+ case s3GetObjectTorrent:
+ case s3GetObjectVersion:
+ case s3GetObjectVersionTorrent:
+ case s3GetObjectTagging:
+ case s3GetObjectVersionTagging:
+ case s3GetObjectRetention:
+ case s3GetObjectLegalHold:
+ case s3ListAllMyBuckets:
+ case s3ListBucket:
+ case s3ListBucketMultipartUploads:
+ case s3ListBucketVersions:
+ case s3ListMultipartUploadParts:
+ return RGW_PERM_READ;
+
+ case s3AbortMultipartUpload:
+ case s3CreateBucket:
+ case s3DeleteBucket:
+ case s3DeleteObject:
+ case s3DeleteObjectVersion:
+ case s3PutObject:
+ case s3PutObjectTagging:
+ case s3PutObjectVersionTagging:
+ case s3DeleteObjectTagging:
+ case s3DeleteObjectVersionTagging:
+ case s3RestoreObject:
+ case s3PutObjectRetention:
+ case s3PutObjectLegalHold:
+ case s3BypassGovernanceRetention:
+ return RGW_PERM_WRITE;
+
+ case s3GetAccelerateConfiguration:
+ case s3GetBucketAcl:
+ case s3GetBucketCORS:
+ case s3GetBucketEncryption:
+ case s3GetBucketLocation:
+ case s3GetBucketLogging:
+ case s3GetBucketNotification:
+ case s3GetBucketPolicy:
+ case s3GetBucketPolicyStatus:
+ case s3GetBucketRequestPayment:
+ case s3GetBucketTagging:
+ case s3GetBucketVersioning:
+ case s3GetBucketWebsite:
+ case s3GetLifecycleConfiguration:
+ case s3GetObjectAcl:
+ case s3GetObjectVersionAcl:
+ case s3GetReplicationConfiguration:
+ case s3GetBucketObjectLockConfiguration:
+ case s3GetBucketPublicAccessBlock:
+ return RGW_PERM_READ_ACP;
+
+ case s3DeleteBucketPolicy:
+ case s3DeleteBucketWebsite:
+ case s3DeleteReplicationConfiguration:
+ case s3PutAccelerateConfiguration:
+ case s3PutBucketAcl:
+ case s3PutBucketCORS:
+ case s3PutBucketEncryption:
+ case s3PutBucketLogging:
+ case s3PutBucketNotification:
+ case s3PutBucketPolicy:
+ case s3PutBucketRequestPayment:
+ case s3PutBucketTagging:
+ case s3PutBucketVersioning:
+ case s3PutBucketWebsite:
+ case s3PutLifecycleConfiguration:
+ case s3PutObjectAcl:
+ case s3PutObjectVersionAcl:
+ case s3PutReplicationConfiguration:
+ case s3PutBucketObjectLockConfiguration:
+ case s3PutBucketPublicAccessBlock:
+ return RGW_PERM_WRITE_ACP;
+
+ case s3All:
+ return RGW_PERM_FULL_CONTROL;
+ }
+ return RGW_PERM_INVALID;
+}
+}
+
// Kind of principal that matched a statement; reported by
// Statement::eval_principal via its princ_type out-parameter.
enum class PolicyPrincipal {
  Role,    // matched a role principal
  Session, // matched an assumed-role session or user principal
  Other    // default: nothing specific recorded
};
+
// Request context: multimap from condition key (e.g. "aws:SourceIp")
// to the value(s) it takes for this request.
using Environment = std::unordered_multimap<std::string, std::string>;

using Address = std::bitset<128>;
// A CIDR-style network: an address plus prefix length.  v4 addresses
// share the 128-bit field with v6 == false.
struct MaskedIP {
  bool v6;
  Address addr;
  // Since we're mapping IPv6 to IPv4 addresses, we may want to
  // consider making the prefix always be in terms of a v6 address
  // and just use the v6 bit to rewrite it as a v4 prefix for
  // output.
  unsigned int prefix;
};
+
+std::ostream& operator <<(std::ostream& m, const MaskedIP& ip);
+
// Two masked addresses compare equal when they agree on all network
// bits covered by the shorter of the two prefixes: each side is
// shifted down past its host bits (128 or 32 minus prefix) before
// comparing.
inline bool operator ==(const MaskedIP& l, const MaskedIP& r) {
  auto shift = std::max((l.v6 ? 128 : 32) - ((int) l.prefix),
			(r.v6 ? 128 : 32) - ((int) r.prefix));
  ceph_assert(shift >= 0);
  return (l.addr >> shift) == (r.addr >> shift);
}
+
+struct Condition {
+ TokenID op;
+ // Originally I was going to use a perfect hash table, but Marcus
+ // says keys are to be added at run-time not compile time.
+
+ // In future development, use symbol internment.
+ std::string key;
+ bool ifexists = false;
+ bool isruntime = false; //Is evaluated during run-time
+ // Much to my annoyance there is no actual way to do this in a
+ // typed way that is compatible with AWS. I know this because I've
+ // seen examples where the same value is used as a string in one
+ // context and a date in another.
+ std::vector<std::string> vals;
+
+ Condition() = default;
+ Condition(TokenID op, const char* s, std::size_t len, bool ifexists)
+ : op(op), key(s, len), ifexists(ifexists) {}
+
+ bool eval(const Environment& e) const;
+
+ static boost::optional<double> as_number(const std::string& s) {
+ std::size_t p = 0;
+
+ try {
+ double d = std::stod(s, &p);
+ if (p < s.length()) {
+ return boost::none;
+ }
+
+ return d;
+ } catch (const std::logic_error& e) {
+ return boost::none;
+ }
+ }
+
+ static boost::optional<ceph::real_time> as_date(const std::string& s) {
+ std::size_t p = 0;
+
+ try {
+ double d = std::stod(s, &p);
+ if (p == s.length()) {
+ return ceph::real_time(
+ std::chrono::seconds(static_cast<uint64_t>(d)) +
+ std::chrono::nanoseconds(
+ static_cast<uint64_t>((d - static_cast<uint64_t>(d))
+ * 1000000000)));
+ }
+
+ return from_iso_8601(std::string_view(s), false);
+ } catch (const std::logic_error& e) {
+ return boost::none;
+ }
+ }
+
+ static boost::optional<bool> as_bool(const std::string& s) {
+ std::size_t p = 0;
+
+ if (s.empty() || boost::iequals(s, "false")) {
+ return false;
+ }
+
+ try {
+ double d = std::stod(s, &p);
+ if (p == s.length()) {
+ return !((d == +0.0) || (d == -0.0) || std::isnan(d));
+ }
+ } catch (const std::logic_error& e) {
+ // Fallthrough
+ }
+
+ return true;
+ }
+
+ static boost::optional<ceph::bufferlist> as_binary(const std::string& s) {
+ // In a just world
+ ceph::bufferlist base64;
+ // I could populate a bufferlist
+ base64.push_back(buffer::create_static(
+ s.length(),
+ const_cast<char*>(s.data()))); // Yuck
+ // From a base64 encoded std::string.
+ ceph::bufferlist bin;
+
+ try {
+ bin.decode_base64(base64);
+ } catch (const ceph::buffer::malformed_input& e) {
+ return boost::none;
+ }
+ return bin;
+ }
+
+ static boost::optional<MaskedIP> as_network(const std::string& s);
+
+
+ struct ci_equal_to {
+ bool operator ()(const std::string& s1,
+ const std::string& s2) const {
+ return boost::iequals(s1, s2);
+ }
+ };
+
+ struct string_like {
+ bool operator ()(const std::string& input,
+ const std::string& pattern) const {
+ return match_wildcards(pattern, input, 0);
+ }
+ };
+
+ struct ci_starts_with {
+ bool operator()(const std::string& s1,
+ const std::string& s2) const {
+ return boost::istarts_with(s1, s2);
+ }
+ };
+
+ using unordered_multimap_it_pair = std::pair <std::unordered_multimap<std::string,std::string>::const_iterator, std::unordered_multimap<std::string,std::string>::const_iterator>;
+
+ template<typename F>
+ static bool andible(F&& f, const unordered_multimap_it_pair& it,
+ const std::vector<std::string>& v) {
+ for (auto itr = it.first; itr != it.second; itr++) {
+ bool matched = false;
+ for (const auto& d : v) {
+ if (std::forward<F>(f)(itr->second, d)) {
+ matched = true;
+ }
+ }
+ if (!matched)
+ return false;
+ }
+ return true;
+ }
+
+ template<typename F>
+ static bool orrible(F&& f, const unordered_multimap_it_pair& it,
+ const std::vector<std::string>& v) {
+ for (auto itr = it.first; itr != it.second; itr++) {
+ for (const auto& d : v) {
+ if (std::forward<F>(f)(itr->second, d)) {
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+
+ template<typename F, typename X>
+ static bool shortible(F&& f, X& x, const std::string& c,
+ const std::vector<std::string>& v) {
+ auto xc = std::forward<X>(x)(c);
+ if (!xc) {
+ return false;
+ }
+
+ for (const auto& d : v) {
+ auto xd = std::forward<X>(x)(d);
+ if (!xd) {
+ continue;
+ }
+
+ if (std::forward<F>(f)(*xc, *xd)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ template <typename F>
+ bool has_key_p(const std::string& _key, F p) const {
+ return p(key, _key);
+ }
+
+ template <typename F>
+ bool has_val_p(const std::string& _val, F p) const {
+ for (auto val : vals) {
+ if (p(val, _val))
+ return true;
+ }
+ return false;
+ }
+};
+
+std::ostream& operator <<(std::ostream& m, const Condition& c);
+
// One statement of an IAM policy: who (principals), what (action
// bits), on which resources (ARN patterns), under which conditions,
// and with what effect.
struct Statement {
  boost::optional<std::string> sid = boost::none;

  // Principal / NotPrincipal clauses.
  boost::container::flat_set<rgw::auth::Principal> princ;
  boost::container::flat_set<rgw::auth::Principal> noprinc;

  // Every statement MUST provide an effect. I just initialize it to
  // deny as defensive programming.
  Effect effect = Effect::Deny;

  // Action / NotAction: one bit per known operation.
  Action_t action = 0;
  NotAction_t notaction = 0;

  // Resource / NotResource ARN patterns.
  boost::container::flat_set<ARN> resource;
  boost::container::flat_set<ARN> notresource;

  std::vector<Condition> conditions;

  // Full evaluation against a request; returns this statement's effect
  // on a complete match, Effect::Pass otherwise.
  Effect eval(const Environment& e,
	      boost::optional<const rgw::auth::Identity&> ida,
	      std::uint64_t action, boost::optional<const ARN&> resource, boost::optional<PolicyPrincipal&> princ_type=boost::none) const;

  // Evaluate only the Principal/NotPrincipal clause (Allow/Deny).
  Effect eval_principal(const Environment& e,
	      boost::optional<const rgw::auth::Identity&> ida, boost::optional<PolicyPrincipal&> princ_type=boost::none) const;

  // Evaluate only the Condition clause (Allow/Deny).
  Effect eval_conditions(const Environment& e) const;
};
+
+std::ostream& operator <<(std::ostream& m, const Statement& s);
+
// Thrown when a policy document fails to parse; carries the rapidjson
// parse result plus a human-readable message including the byte offset.
struct PolicyParseException : public std::exception {
  rapidjson::ParseResult pr;
  std::string msg;

  explicit PolicyParseException(const rapidjson::ParseResult pr,
				const std::string& annotation)
    : pr(pr),
      // kParseErrorTermination means our SAX handler aborted the parse;
      // in that case the handler's annotation is the real error text.
      msg(fmt::format("At character offset {}, {}",
		      pr.Offset(),
		      (pr.Code() == rapidjson::kParseErrorTermination ?
		       annotation :
		       rapidjson::GetParseError_En(pr.Code())))) {}

  const char* what() const noexcept override {
    return msg.c_str();
  }
};
+
// A parsed IAM policy document: its original text, version, optional
// Id, and the list of statements to evaluate.
struct Policy {
  std::string text;
  Version version = Version::v2008_10_17;
  boost::optional<std::string> id = boost::none;

  std::vector<Statement> statements;

  // reject_invalid_principals should be set to
  // `cct->_conf.get_val<bool>("rgw_policy_reject_invalid_principals")`
  // when executing operations that *set* a bucket policy, but should
  // be false when reading a stored bucket policy so as not to break
  // backwards configuration.
  Policy(CephContext* cct, const std::string& tenant,
	 const bufferlist& text,
	 bool reject_invalid_principals);

  // Evaluate every statement: Deny wins, else Allow if any statement
  // allowed, else Pass.
  Effect eval(const Environment& e,
	      boost::optional<const rgw::auth::Identity&> ida,
	      std::uint64_t action, boost::optional<const ARN&> resource, boost::optional<PolicyPrincipal&> princ_type=boost::none) const;

  // As eval(), but consulting only Principal clauses; default is Deny.
  Effect eval_principal(const Environment& e,
	      boost::optional<const rgw::auth::Identity&> ida, boost::optional<PolicyPrincipal&> princ_type=boost::none) const;

  // As eval(), but consulting only Condition clauses; default is Deny.
  Effect eval_conditions(const Environment& e) const;

  // True if any statement has a condition whose key satisfies p
  // against `conditional`.
  template <typename F>
  bool has_conditional(const std::string& conditional, F p) const {
    for (const auto&s: statements){
      if (std::any_of(s.conditions.begin(), s.conditions.end(),
		      [&](const Condition& c) { return c.has_key_p(conditional, p);}))
	return true;
    }
    return false;
  }

  // True if any statement has a condition whose *value* satisfies p
  // against `conditional`.
  template <typename F>
  bool has_conditional_value(const std::string& conditional, F p) const {
    for (const auto&s: statements){
      if (std::any_of(s.conditions.begin(), s.conditions.end(),
		      [&](const Condition& c) { return c.has_val_p(conditional, p);}))
	return true;
    }
    return false;
  }

  // Exact (case-insensitive) condition-key lookup.
  bool has_conditional(const std::string& c) const {
    return has_conditional(c, Condition::ci_equal_to());
  }

  // Prefix (case-insensitive) condition-key lookup.
  bool has_partial_conditional(const std::string& c) const {
    return has_conditional(c, Condition::ci_starts_with());
  }

  // Example: ${s3:ResourceTag}
  bool has_partial_conditional_value(const std::string& c) const {
    return has_conditional_value(c, Condition::ci_starts_with());
  }
};
+
+std::ostream& operator <<(std::ostream& m, const Policy& p);
+bool is_public(const Policy& p);
+
+}
+}
diff --git a/src/rgw/rgw_iam_policy_keywords.gperf b/src/rgw/rgw_iam_policy_keywords.gperf
new file mode 100644
index 000000000..af73dd130
--- /dev/null
+++ b/src/rgw/rgw_iam_policy_keywords.gperf
@@ -0,0 +1,136 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+%language=C++
+%compare-strncmp
+%define class-name keyword_hash
+%define lookup-function-name lookup
+%struct-type
+struct Keyword {
+ const char* name;
+ TokenKind kind;
+ TokenID id;
+ uint64_t specific;
+ bool arrayable;
+ bool objectable;
+};
+%%
+# Top-level
+#
+Version, TokenKind::top, TokenID::Version, 0, false, false
+Id, TokenKind::top, TokenID::Id, 0, false, false
+Statement, TokenKind::top, TokenID::Statement, 0, true, true
+#
+# Statement level
+#
+Sid, TokenKind::statement, TokenID::Sid, 0, false, false
+Effect, TokenKind::statement, TokenID::Effect, 0, false, false
+Principal, TokenKind::statement, TokenID::Principal, 0, false, true
+NotPrincipal, TokenKind::statement, TokenID::NotPrincipal, 0, true, true
+Action, TokenKind::statement, TokenID::Action, 0, true, false
+NotAction, TokenKind::statement, TokenID::NotAction, 0, true, false
+Resource, TokenKind::statement, TokenID::Resource, 0, true, false
+NotResource, TokenKind::statement, TokenID::NotResource, 0, true, false
+Condition, TokenKind::statement, TokenID::Condition, 0, true, true
+#
+# Condition operators
+#
+# String
+StringEquals, TokenKind::cond_op, TokenID::StringEquals, (uint64_t) Type::string, true, true
+StringNotEquals, TokenKind::cond_op, TokenID::StringNotEquals, (uint64_t) Type::string, true, true
+StringEqualsIgnoreCase, TokenKind::cond_op, TokenID::StringEqualsIgnoreCase, (uint64_t) Type::string, true, true
+StringNotEqualsIgnoreCase, TokenKind::cond_op, TokenID::StringNotEqualsIgnoreCase, (uint64_t) Type::string, true, true
+StringLike, TokenKind::cond_op, TokenID::StringLike, (uint64_t) Type::string, true, true,
+StringNotLike, TokenKind::cond_op, TokenID::StringNotLike, (uint64_t) Type::string, true, true
+ForAllValues:StringEquals, TokenKind::cond_op, TokenID::ForAllValuesStringEquals, (uint64_t) Type::string, true, true
+ForAnyValue:StringEquals, TokenKind::cond_op, TokenID::ForAnyValueStringEquals, (uint64_t) Type::string, true, true
+ForAllValues:StringLike, TokenKind::cond_op, TokenID::ForAllValuesStringLike, (uint64_t) Type::string, true, true
+ForAnyValue:StringLike, TokenKind::cond_op, TokenID::ForAnyValueStringLike, (uint64_t) Type::string, true, true
+ForAllValues:StringEqualsIgnoreCase, TokenKind::cond_op, TokenID::ForAllValuesStringEqualsIgnoreCase, (uint64_t) Type::string, true, true
+ForAnyValue:StringEqualsIgnoreCase, TokenKind::cond_op, TokenID::ForAnyValueStringEqualsIgnoreCase, (uint64_t) Type::string, true, true
+# Numeric
+NumericEquals, TokenKind::cond_op, TokenID::NumericEquals, (uint64_t) Type::number, true, true
+NumericNotEquals, TokenKind::cond_op, TokenID::NumericNotEquals, (uint64_t) Type::number, true, true
+NumericLessThan, TokenKind::cond_op, TokenID::NumericLessThan, (uint64_t) Type::number, true, true
+NumericLessThanEquals, TokenKind::cond_op, TokenID::NumericLessThanEquals, (uint64_t) Type::number, true, true
+NumericGreaterThan, TokenKind::cond_op, TokenID::NumericGreaterThan, (uint64_t) Type::number, true, true
+NumericGreaterThanEquals, TokenKind::cond_op, TokenID::NumericGreaterThanEquals, (uint64_t) Type::number, true, true
+# Date
+DateEquals, TokenKind::cond_op, TokenID::DateEquals, (uint64_t) Type::date, true, true
+DateNotEquals, TokenKind::cond_op, TokenID::DateNotEquals, (uint64_t) Type::date, true, true
+DateLessThan, TokenKind::cond_op, TokenID::DateLessThan, (uint64_t) Type::date, true, true
+DateLessThanEquals, TokenKind::cond_op, TokenID::DateLessThanEquals, (uint64_t) Type::date, true, true
+DateGreaterThan, TokenKind::cond_op, TokenID::DateGreaterThan, (uint64_t) Type::date, true, true
+DateGreaterThanEquals, TokenKind::cond_op, TokenID::DateGreaterThanEquals, (uint64_t) Type::date, true, true
+# Bool
+Bool, TokenKind::cond_op, TokenID::Bool, (uint64_t) Type::boolean, true, true
+# Binary
+BinaryEquals, TokenKind::cond_op, TokenID::BinaryEquals, (uint64_t) Type::binary, true, true
+# IP Address
+IpAddress, TokenKind::cond_op, TokenID::IpAddress, (uint64_t) Type::ipaddr, true, true
+NotIpAddress, TokenKind::cond_op, TokenID::NotIpAddress, (uint64_t) Type::ipaddr, true, true
+# Amazon Resource Names
+ArnEquals, TokenKind::cond_op, TokenID::ArnEquals, (uint64_t) Type::arn, true, true
+ArnNotEquals, TokenKind::cond_op, TokenID::ArnNotEquals, (uint64_t) Type::arn, true, true
+ArnLike, TokenKind::cond_op, TokenID::ArnLike, (uint64_t) Type::arn, true, true
+ArnNotLike, TokenKind::cond_op, TokenID::ArnNotLike, (uint64_t) Type::arn, true, true
+# Null
+Null, TokenKind::cond_op, TokenID::Null, (uint64_t) Type::null, true, true
+#
+# Condition keys
+#
+# AWS
+#aws:CurrentTime, TokenKind::cond_key, TokenID::awsCurrentTime, (uint64_t) Type::date, true, false
+#aws:EpochTime, TokenKind::cond_key, TokenID::awsEpochTime, (uint64_t) Type::date, true, false
+#aws:TokenIssueTime, TokenKind::cond_key, TokenID::awsTokenIssueTime, (uint64_t) Type::date, true, false
+#aws:MultiFactorAuthPresent, TokenKind::cond_key, TokenID::awsMultiFactorAuthPresent, (uint64_t) Type::boolean, true, false
+#aws:MultiFactorAuthAge, TokenKind::cond_key, TokenID::awsMultiFactorAuthAge, (uint64_t) Type::number, true, false
+#aws:PrincipalType, TokenKind::cond_key, TokenID::awsPrincipalType, (uint64_t) Type::string, true, false
+#aws:Referer, TokenKind::cond_key, TokenID::awsReferer, (uint64_t) Type::string, true, false
+#aws:SecureTransport, TokenKind::cond_key, TokenID::awsSecureTransport, (uint64_t) Type::boolean, true, false
+#aws:SourceArn, TokenKind::cond_key, TokenID::awsSourceArn, (uint64_t) Type::arn, true, false
+#aws:SourceIp, TokenKind::cond_key, TokenID::awsSourceIp, (uint64_t) Type::ipaddr, true, false
+#aws:SourceVpc, TokenKind::cond_key, TokenID::awsSourceVpc, (uint64_t) Type::string, true, false
+#aws:SourceVpce, TokenKind::cond_key, TokenID::awsSourceVpce, (uint64_t) Type::string, true, false
+#aws:UserAgent, TokenKind::cond_key, TokenID::awsUserAgent, (uint64_t) Type::string, true, false
+#aws:userid, TokenKind::cond_key, TokenID::awsuserid, (uint64_t) Type::string, true, false
+#aws:username, TokenKind::cond_key, TokenID::awsusername, (uint64_t) Type::string, true, false
+# S3
+#s3:x-amz-acl, TokenKind::cond_key, TokenID::s3x_amz_acl, (uint64_t) Type::string, true, false
+#s3:x-amz-grant-read, TokenKind::cond_key, TokenID::s3x_amz_grant_permission, (uint64_t) Type::boolean, true, false
+#s3:x-amz-grant-write, TokenKind::cond_key, TokenID::s3x_amz_grant_permission, (uint64_t) Type::boolean, true, false
+#s3:x-amz-grant-read-acp, TokenKind::cond_key, TokenID::s3x_amz_grant_permission, (uint64_t) Type::boolean, true, false
+#s3:x-amz-grant-write-acp, TokenKind::cond_key, TokenID::s3x_amz_grant_permission, (uint64_t) Type::boolean, true, false
+#s3:x-amz-grant-full-control, TokenKind::cond_key, TokenID::s3x_amz_grant_permission, (uint64_t) Type::boolean, true, false
+#s3:x-amz-copy-source, TokenKind::cond_key, TokenID::s3x_amz_copy_source, (uint64_t) Type::string, true, false
+#s3:x-amz-server-side-encryption, TokenKind::cond_key, TokenID::s3x_amz_server_side_encryption, (uint64_t) Type::boolean, true, false
+#s3:x-amz-server-side-encryption-aws-kms-key-id, TokenKind::cond_key, TokenID::s3x_amz_server_side_encryption_aws_kms_key_id, (uint64_t) Type::arn, true, false
+#s3:x-amz-metadata-directive, TokenKind::cond_key, TokenID::s3x_amz_metadata_directive, (uint64_t) Type::string, true, false
+#s3:x-amz-storage-class, TokenKind::cond_key, TokenID::s3x_amz_storage_class, (uint64_t) Type::string, true, false
+#s3:VersionId, TokenKind::cond_key, TokenID::s3VersionId, (uint64_t) Type::string, true, false
+#s3:LocationConstraint, TokenKind::cond_key, TokenID::s3LocationConstraint, (uint64_t) Type::string, true, false
+#s3:prefix, TokenKind::cond_key, TokenID::s3prefix, (uint64_t) Type::string, true, false
+#s3:delimiter, TokenKind::cond_key, TokenID::s3delimiter, (uint64_t) Type::string, true, false
+#s3:max-keys, TokenKind::cond_key, TokenID::s3max_keys, (uint64_t) Type::number, true, false
+#s3:signatureversion, TokenKind::cond_key, TokenID::s3signatureversion, (uint64_t) Type::string, true, false
+#s3:authType, TokenKind::cond_key, TokenID::s3authType, (uint64_t) Type::string, true, false
+#s3:signatureAge, TokenKind::cond_key, TokenID::s3signatureAge, (uint64_t) Type::number, true, false
+#s3:x-amz-content-sha256, TokenKind::cond_key, TokenID::s3x_amz_content_sha256, (uint64_t) Type::string, true, false
+# STS
+#sts:authentication, TokenKind::cond_key, TokenID::stsauthentication, (uint64_t) Type::boolean, true, false
+#
+# Version Keywords
+#
+2008-10-17, TokenKind::version_key, TokenID::v2008_10_17, (uint64_t) Version::v2008_10_17, false, false
+2012-10-17, TokenKind::version_key, TokenID::v2012_10_17, (uint64_t) Version::v2012_10_17, false, false
+#
+# Effect Keywords
+#
+Allow, TokenKind::effect_key, TokenID::Allow, (uint64_t) Effect::Allow, false, false
+Deny, TokenKind::effect_key, TokenID::Deny, (uint64_t) Effect::Deny, false, false
+#
+# Principal types
+#
+AWS, TokenKind::princ_type, TokenID::AWS, 0, true, false
+Federated, TokenKind::princ_type, TokenID::Federated, 0, true, false
+Service, TokenKind::princ_type, TokenID::Service, 0, true, false
+CanonicalUser, TokenKind::princ_type, TokenID::CanonicalUser, 0, true, false
diff --git a/src/rgw/rgw_iam_policy_keywords.h b/src/rgw/rgw_iam_policy_keywords.h
new file mode 100644
index 000000000..8130ace45
--- /dev/null
+++ b/src/rgw/rgw_iam_policy_keywords.h
@@ -0,0 +1,139 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+namespace rgw {
+namespace IAM {
+
+enum class TokenKind {
+ pseudo, top, statement, cond_op, cond_key, version_key, effect_key,
+ princ_type
+};
+
+enum class TokenID {
+ /// Pseudo-token
+ Top,
+
+ /// Top-level tokens
+ Version, Id, Statement,
+
+ /// Statement level tokens
+ Sid, Effect, Principal, NotPrincipal, Action, NotAction,
+ Resource, NotResource, Condition,
+
+ /// Condition Operators!
+ /// Any of these, except Null, can have an IfExists variant.
+
+ // String!
+ StringEquals, StringNotEquals, StringEqualsIgnoreCase,
+ StringNotEqualsIgnoreCase, StringLike, StringNotLike,
+ ForAllValuesStringEquals, ForAnyValueStringEquals,
+ ForAllValuesStringLike, ForAnyValueStringLike,
+ ForAllValuesStringEqualsIgnoreCase, ForAnyValueStringEqualsIgnoreCase,
+
+ // Numeric!
+ NumericEquals, NumericNotEquals, NumericLessThan, NumericLessThanEquals,
+ NumericGreaterThan, NumericGreaterThanEquals,
+
+ // Date!
+ DateEquals, DateNotEquals, DateLessThan, DateLessThanEquals,
+ DateGreaterThan, DateGreaterThanEquals,
+
+ // Bool!
+ Bool,
+
+ // Binary!
+ BinaryEquals,
+
+ // IP Address!
+ IpAddress, NotIpAddress,
+
+ // Amazon Resource Names! (Does S3 need this?)
+ ArnEquals, ArnNotEquals, ArnLike, ArnNotLike,
+
+ // Null!
+ Null,
+
+#if 0 // Keys are done at runtime now
+
+ /// Condition Keys!
+ awsCurrentTime,
+ awsEpochTime,
+ awsTokenIssueTime,
+ awsMultiFactorAuthPresent,
+ awsMultiFactorAuthAge,
+ awsPrincipalType,
+ awsReferer,
+ awsSecureTransport,
+ awsSourceArn,
+ awsSourceIp,
+ awsSourceVpc,
+ awsSourceVpce,
+ awsUserAgent,
+ awsuserid,
+ awsusername,
+ s3x_amz_acl,
+ s3x_amz_grant_permission,
+ s3x_amz_copy_source,
+ s3x_amz_server_side_encryption,
+ s3x_amz_server_side_encryption_aws_kms_key_id,
+ s3x_amz_metadata_directive,
+ s3x_amz_storage_class,
+ s3VersionId,
+ s3LocationConstraint,
+ s3prefix,
+ s3delimiter,
+ s3max_keys,
+ s3signatureversion,
+ s3authType,
+ s3signatureAge,
+ s3x_amz_content_sha256,
+#else
+ CondKey,
+#endif
+
+ ///
+ /// Versions!
+ ///
+ v2008_10_17,
+ v2012_10_17,
+
+ ///
+ /// Effects!
+ ///
+ Allow,
+ Deny,
+
+ /// Principal Types!
+ AWS,
+ Federated,
+ Service,
+ CanonicalUser
+};
+
+
+enum class Version {
+ v2008_10_17,
+ v2012_10_17
+};
+
+
+enum class Effect {
+ Allow,
+ Deny,
+ Pass
+};
+
+enum class Type {
+ string,
+ number,
+ date,
+ boolean,
+ binary,
+ ipaddr,
+ arn,
+ null
+};
+}
+}
diff --git a/src/rgw/rgw_jsonparser.cc b/src/rgw/rgw_jsonparser.cc
new file mode 100644
index 000000000..6541630b2
--- /dev/null
+++ b/src/rgw/rgw_jsonparser.cc
@@ -0,0 +1,133 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <errno.h>
+#include <string.h>
+
+#include <iostream>
+#include <map>
+
+#include "include/types.h"
+
+#include "common/Formatter.h"
+#include "common/ceph_json.h"
+
+#include "rgw_common.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+void dump_array(JSONObj *obj)
+{
+
+ JSONObjIter iter = obj->find_first();
+
+ for (; !iter.end(); ++iter) {
+ JSONObj *o = *iter;
+ cout << "data=" << o->get_data() << std::endl;
+ }
+
+}
+
+struct Key {
+ string user;
+ string access_key;
+ string secret_key;
+
+ void decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("user", user, obj);
+ JSONDecoder::decode_json("access_key", access_key, obj);
+ JSONDecoder::decode_json("secret_key", secret_key, obj);
+ }
+};
+
+struct UserInfo {
+ string uid;
+ string display_name;
+ int max_buckets;
+ list<Key> keys;
+
+ void decode_json(JSONObj *obj) {
+ JSONDecoder::decode_json("user_id", uid, obj);
+ JSONDecoder::decode_json("display_name", display_name, obj);
+ JSONDecoder::decode_json("max_buckets", max_buckets, obj);
+ JSONDecoder::decode_json("keys", keys, obj);
+ }
+};
+
+
+int main(int argc, char **argv) {
+ JSONParser parser;
+
+ char buf[1024];
+ bufferlist bl;
+
+ for (;;) {
+ int done;
+ int len;
+
+ len = fread(buf, 1, sizeof(buf), stdin);
+ if (ferror(stdin)) {
+ cerr << "read error" << std::endl;
+ exit(-1);
+ }
+ done = feof(stdin);
+
+ bool ret = parser.parse(buf, len);
+ if (!ret)
+ cerr << "parse error" << std::endl;
+
+ if (done) {
+ bl.append(buf, len);
+ break;
+ }
+ }
+
+ JSONObjIter iter = parser.find_first();
+
+ for (; !iter.end(); ++iter) {
+ JSONObj *obj = *iter;
+ cout << "is_object=" << obj->is_object() << std::endl;
+ cout << "is_array=" << obj->is_array() << std::endl;
+ cout << "name=" << obj->get_name() << std::endl;
+ cout << "data=" << obj->get_data() << std::endl;
+ }
+
+ iter = parser.find_first("conditions");
+ if (!iter.end()) {
+ JSONObj *obj = *iter;
+
+ JSONObjIter iter2 = obj->find_first();
+ for (; !iter2.end(); ++iter2) {
+ JSONObj *child = *iter2;
+ cout << "is_object=" << child->is_object() << std::endl;
+ cout << "is_array=" << child->is_array() << std::endl;
+ if (child->is_array()) {
+ dump_array(child);
+ }
+ cout << "name=" << child->get_name() <<std::endl;
+ cout << "data=" << child->get_data() <<std::endl;
+ }
+ }
+
+ RGWUserInfo ui;
+
+ try {
+ ui.decode_json(&parser);
+ } catch (const JSONDecoder::err& e) {
+ cout << "failed to decode JSON input: " << e.what() << std::endl;
+ exit(1);
+ }
+
+ JSONFormatter formatter(true);
+
+ formatter.open_object_section("user_info");
+ ui.dump(&formatter);
+ formatter.close_section();
+
+ formatter.flush(std::cout);
+
+ std::cout << std::endl;
+}
+
diff --git a/src/rgw/rgw_kafka.cc b/src/rgw/rgw_kafka.cc
new file mode 100644
index 000000000..642787a38
--- /dev/null
+++ b/src/rgw/rgw_kafka.cc
@@ -0,0 +1,742 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_kafka.h"
+#include "rgw_url.h"
+#include <librdkafka/rdkafka.h>
+#include "include/ceph_assert.h"
+#include <sstream>
+#include <cstring>
+#include <unordered_map>
+#include <string>
+#include <vector>
+#include <thread>
+#include <atomic>
+#include <mutex>
+#include <boost/lockfree/queue.hpp>
+#include "common/dout.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+// TODO investigation, not necessarily issues:
+// (1) in case of single threaded writer context use spsc_queue
+// (2) check performance of emptying queue to local list, and go over the list and publish
+// (3) use std::shared_mutex (c++17) or equivalent for the connections lock
+
+// comparison operator between topic pointer and name
+bool operator==(const rd_kafka_topic_t* rkt, const std::string& name) {
+ return name == std::string_view(rd_kafka_topic_name(rkt));
+}
+
+namespace rgw::kafka {
+
+// status codes for publishing
+static const int STATUS_CONNECTION_CLOSED = -0x1002;
+static const int STATUS_QUEUE_FULL = -0x1003;
+static const int STATUS_MAX_INFLIGHT = -0x1004;
+static const int STATUS_MANAGER_STOPPED = -0x1005;
+static const int STATUS_CONNECTION_IDLE = -0x1006;
+// status code for connection opening
+static const int STATUS_CONF_ALLOC_FAILED = -0x2001;
+static const int STATUS_CONF_REPLCACE = -0x2002;
+
+static const int STATUS_OK = 0x0;
+
+// struct for holding the callback and its tag in the callback list
+struct reply_callback_with_tag_t {
+ uint64_t tag;
+ reply_callback_t cb;
+
+ reply_callback_with_tag_t(uint64_t _tag, reply_callback_t _cb) : tag(_tag), cb(_cb) {}
+
+ bool operator==(uint64_t rhs) {
+ return tag == rhs;
+ }
+};
+
+typedef std::vector<reply_callback_with_tag_t> CallbackList;
+
+// struct for holding the connection state object as well as list of topics
+// it is used inside an intrusive ref counted pointer (boost::intrusive_ptr)
+// since references to deleted objects may still exist in the calling code
+struct connection_t {
+ rd_kafka_t* producer = nullptr;
+ rd_kafka_conf_t* temp_conf = nullptr;
+ std::vector<rd_kafka_topic_t*> topics;
+ uint64_t delivery_tag = 1;
+ int status = STATUS_OK;
+ CephContext* const cct;
+ CallbackList callbacks;
+ const std::string broker;
+ const bool use_ssl;
+  const bool verify_ssl; // TODO currently ignored, not supported in librdkafka v0.11.6
+ const boost::optional<std::string> ca_location;
+ const std::string user;
+ const std::string password;
+ const boost::optional<std::string> mechanism;
+ utime_t timestamp = ceph_clock_now();
+
+  // cleanup of all internal connection resources
+ // the object can still remain, and internal connection
+ // resources created again on successful reconnection
+ void destroy(int s) {
+ status = s;
+ // destroy temporary conf (if connection was never established)
+ if (temp_conf) {
+ rd_kafka_conf_destroy(temp_conf);
+ return;
+ }
+ if (!is_ok()) {
+ // no producer, nothing to destroy
+ return;
+ }
+ // wait for all remaining acks/nacks
+ rd_kafka_flush(producer, 5*1000 /* wait for max 5 seconds */);
+ // destroy all topics
+ std::for_each(topics.begin(), topics.end(), [](auto topic) {rd_kafka_topic_destroy(topic);});
+ // destroy producer
+ rd_kafka_destroy(producer);
+ producer = nullptr;
+ // fire all remaining callbacks (if not fired by rd_kafka_flush)
+ std::for_each(callbacks.begin(), callbacks.end(), [this](auto& cb_tag) {
+ cb_tag.cb(status);
+ ldout(cct, 20) << "Kafka destroy: invoking callback with tag=" << cb_tag.tag <<
+ " for: " << broker << dendl;
+ });
+ callbacks.clear();
+ delivery_tag = 1;
+ ldout(cct, 20) << "Kafka destroy: complete for: " << broker << dendl;
+ }
+
+ bool is_ok() const {
+ return (producer != nullptr);
+ }
+
+ // ctor for setting immutable values
+ connection_t(CephContext* _cct, const std::string& _broker, bool _use_ssl, bool _verify_ssl,
+ const boost::optional<const std::string&>& _ca_location,
+ const std::string& _user, const std::string& _password, const boost::optional<const std::string&>& _mechanism) :
+ cct(_cct), broker(_broker), use_ssl(_use_ssl), verify_ssl(_verify_ssl), ca_location(_ca_location), user(_user), password(_password), mechanism(_mechanism) {}
+
+ // dtor also destroys the internals
+ ~connection_t() {
+ destroy(status);
+ }
+};
+
+// convert int status to string - including RGW specific values
+std::string status_to_string(int s) {
+ switch (s) {
+ case STATUS_OK:
+ return "STATUS_OK";
+ case STATUS_CONNECTION_CLOSED:
+ return "RGW_KAFKA_STATUS_CONNECTION_CLOSED";
+ case STATUS_QUEUE_FULL:
+ return "RGW_KAFKA_STATUS_QUEUE_FULL";
+ case STATUS_MAX_INFLIGHT:
+ return "RGW_KAFKA_STATUS_MAX_INFLIGHT";
+ case STATUS_MANAGER_STOPPED:
+ return "RGW_KAFKA_STATUS_MANAGER_STOPPED";
+ case STATUS_CONF_ALLOC_FAILED:
+ return "RGW_KAFKA_STATUS_CONF_ALLOC_FAILED";
+ case STATUS_CONF_REPLCACE:
+ return "RGW_KAFKA_STATUS_CONF_REPLCACE";
+ case STATUS_CONNECTION_IDLE:
+ return "RGW_KAFKA_STATUS_CONNECTION_IDLE";
+ }
+ return std::string(rd_kafka_err2str((rd_kafka_resp_err_t)s));
+}
+
+void message_callback(rd_kafka_t* rk, const rd_kafka_message_t* rkmessage, void* opaque) {
+ ceph_assert(opaque);
+
+ const auto conn = reinterpret_cast<connection_t*>(opaque);
+ const auto result = rkmessage->err;
+
+ if (!rkmessage->_private) {
+ ldout(conn->cct, 20) << "Kafka run: n/ack received, (no callback) with result=" << result << dendl;
+ return;
+ }
+
+ const auto tag = reinterpret_cast<uint64_t*>(rkmessage->_private);
+ const auto& callbacks_end = conn->callbacks.end();
+ const auto& callbacks_begin = conn->callbacks.begin();
+ const auto tag_it = std::find(callbacks_begin, callbacks_end, *tag);
+ if (tag_it != callbacks_end) {
+ ldout(conn->cct, 20) << "Kafka run: n/ack received, invoking callback with tag=" <<
+ *tag << " and result=" << rd_kafka_err2str(result) << dendl;
+ tag_it->cb(result);
+ conn->callbacks.erase(tag_it);
+ } else {
+ // TODO add counter for acks with no callback
+ ldout(conn->cct, 10) << "Kafka run: unsolicited n/ack received with tag=" <<
+ *tag << dendl;
+ }
+ delete tag;
+ // rkmessage is destroyed automatically by librdkafka
+}
+
+void log_callback(const rd_kafka_t* rk, int level, const char *fac, const char *buf) {
+ ceph_assert(rd_kafka_opaque(rk));
+
+ const auto conn = reinterpret_cast<connection_t*>(rd_kafka_opaque(rk));
+ if (level <= 3)
+ ldout(conn->cct, 1) << "RDKAFKA-" << level << "-" << fac << ": " << rd_kafka_name(rk) << ": " << buf << dendl;
+ else if (level <= 5)
+ ldout(conn->cct, 2) << "RDKAFKA-" << level << "-" << fac << ": " << rd_kafka_name(rk) << ": " << buf << dendl;
+ else if (level <= 6)
+ ldout(conn->cct, 10) << "RDKAFKA-" << level << "-" << fac << ": " << rd_kafka_name(rk) << ": " << buf << dendl;
+ else
+ ldout(conn->cct, 20) << "RDKAFKA-" << level << "-" << fac << ": " << rd_kafka_name(rk) << ": " << buf << dendl;
+}
+
+void poll_err_callback(rd_kafka_t *rk, int err, const char *reason, void *opaque) {
+ const auto conn = reinterpret_cast<connection_t*>(rd_kafka_opaque(rk));
+ ldout(conn->cct, 10) << "Kafka run: poll error(" << err << "): " << reason << dendl;
+}
+
+using connection_t_ptr = std::unique_ptr<connection_t>;
+
+// utility function to create a producer, when the connection object already exists
+bool new_producer(connection_t* conn) {
+ // reset all status codes
+ conn->status = STATUS_OK;
+ char errstr[512] = {0};
+
+ conn->temp_conf = rd_kafka_conf_new();
+ if (!conn->temp_conf) {
+ conn->status = STATUS_CONF_ALLOC_FAILED;
+ return false;
+ }
+
+  // get list of brokers based on the bootstrap broker
+ if (rd_kafka_conf_set(conn->temp_conf, "bootstrap.servers", conn->broker.c_str(), errstr, sizeof(errstr)) != RD_KAFKA_CONF_OK) goto conf_error;
+
+ if (conn->use_ssl) {
+ if (!conn->user.empty()) {
+ // use SSL+SASL
+ if (rd_kafka_conf_set(conn->temp_conf, "security.protocol", "SASL_SSL", errstr, sizeof(errstr)) != RD_KAFKA_CONF_OK ||
+ rd_kafka_conf_set(conn->temp_conf, "sasl.username", conn->user.c_str(), errstr, sizeof(errstr)) != RD_KAFKA_CONF_OK ||
+ rd_kafka_conf_set(conn->temp_conf, "sasl.password", conn->password.c_str(), errstr, sizeof(errstr)) != RD_KAFKA_CONF_OK) goto conf_error;
+ ldout(conn->cct, 20) << "Kafka connect: successfully configured SSL+SASL security" << dendl;
+
+ if (conn->mechanism) {
+ if (rd_kafka_conf_set(conn->temp_conf, "sasl.mechanism", conn->mechanism->c_str(), errstr, sizeof(errstr)) != RD_KAFKA_CONF_OK) goto conf_error;
+ ldout(conn->cct, 20) << "Kafka connect: successfully configured SASL mechanism" << dendl;
+ } else {
+ if (rd_kafka_conf_set(conn->temp_conf, "sasl.mechanism", "PLAIN", errstr, sizeof(errstr)) != RD_KAFKA_CONF_OK) goto conf_error;
+ ldout(conn->cct, 20) << "Kafka connect: using default SASL mechanism" << dendl;
+ }
+
+ } else {
+ // use only SSL
+ if (rd_kafka_conf_set(conn->temp_conf, "security.protocol", "SSL", errstr, sizeof(errstr)) != RD_KAFKA_CONF_OK) goto conf_error;
+ ldout(conn->cct, 20) << "Kafka connect: successfully configured SSL security" << dendl;
+ }
+ if (conn->ca_location) {
+ if (rd_kafka_conf_set(conn->temp_conf, "ssl.ca.location", conn->ca_location->c_str(), errstr, sizeof(errstr)) != RD_KAFKA_CONF_OK) goto conf_error;
+ ldout(conn->cct, 20) << "Kafka connect: successfully configured CA location" << dendl;
+ } else {
+ ldout(conn->cct, 20) << "Kafka connect: using default CA location" << dendl;
+ }
+ // Note: when librdkafka.1.0 is available the following line could be uncommented instead of the callback setting call
+ // if (rd_kafka_conf_set(conn->temp_conf, "enable.ssl.certificate.verification", "0", errstr, sizeof(errstr)) != RD_KAFKA_CONF_OK) goto conf_error;
+
+ ldout(conn->cct, 20) << "Kafka connect: successfully configured security" << dendl;
+ } else if (!conn->user.empty()) {
+ // use SASL+PLAINTEXT
+ if (rd_kafka_conf_set(conn->temp_conf, "security.protocol", "SASL_PLAINTEXT", errstr, sizeof(errstr)) != RD_KAFKA_CONF_OK ||
+ rd_kafka_conf_set(conn->temp_conf, "sasl.username", conn->user.c_str(), errstr, sizeof(errstr)) != RD_KAFKA_CONF_OK ||
+ rd_kafka_conf_set(conn->temp_conf, "sasl.password", conn->password.c_str(), errstr, sizeof(errstr)) != RD_KAFKA_CONF_OK) goto conf_error;
+ ldout(conn->cct, 20) << "Kafka connect: successfully configured SASL_PLAINTEXT" << dendl;
+
+ if (conn->mechanism) {
+ if (rd_kafka_conf_set(conn->temp_conf, "sasl.mechanism", conn->mechanism->c_str(), errstr, sizeof(errstr)) != RD_KAFKA_CONF_OK) goto conf_error;
+ ldout(conn->cct, 20) << "Kafka connect: successfully configured SASL mechanism" << dendl;
+ } else {
+ if (rd_kafka_conf_set(conn->temp_conf, "sasl.mechanism", "PLAIN", errstr, sizeof(errstr)) != RD_KAFKA_CONF_OK) goto conf_error;
+ ldout(conn->cct, 20) << "Kafka connect: using default SASL mechanism" << dendl;
+ }
+ }
+
+ // set the global callback for delivery success/fail
+ rd_kafka_conf_set_dr_msg_cb(conn->temp_conf, message_callback);
+
+ // set the global opaque pointer to be the connection itself
+ rd_kafka_conf_set_opaque(conn->temp_conf, conn);
+
+ // redirect kafka logs to RGW
+ rd_kafka_conf_set_log_cb(conn->temp_conf, log_callback);
+ // define poll callback to allow reconnect
+ rd_kafka_conf_set_error_cb(conn->temp_conf, poll_err_callback);
+ // create the producer
+ if (conn->producer) {
+ ldout(conn->cct, 5) << "Kafka connect: producer already exists. detroying the existing before creating a new one" << dendl;
+ conn->destroy(STATUS_CONF_REPLCACE);
+ }
+ conn->producer = rd_kafka_new(RD_KAFKA_PRODUCER, conn->temp_conf, errstr, sizeof(errstr));
+ if (!conn->producer) {
+ conn->status = rd_kafka_last_error();
+ ldout(conn->cct, 1) << "Kafka connect: failed to create producer: " << errstr << dendl;
+ return false;
+ }
+ ldout(conn->cct, 20) << "Kafka connect: successfully created new producer" << dendl;
+ {
+ // set log level of producer
+ const auto log_level = conn->cct->_conf->subsys.get_log_level(ceph_subsys_rgw);
+ if (log_level <= 1)
+ rd_kafka_set_log_level(conn->producer, 3);
+ else if (log_level <= 2)
+ rd_kafka_set_log_level(conn->producer, 5);
+ else if (log_level <= 10)
+ rd_kafka_set_log_level(conn->producer, 6);
+ else
+ rd_kafka_set_log_level(conn->producer, 7);
+ }
+
+ // conf ownership passed to producer
+ conn->temp_conf = nullptr;
+ return true;
+
+conf_error:
+ conn->status = rd_kafka_last_error();
+ ldout(conn->cct, 1) << "Kafka connect: configuration failed: " << errstr << dendl;
+ return false;
+}
+
+// struct used for holding messages in the message queue
+struct message_wrapper_t {
+ std::string conn_name;
+ std::string topic;
+ std::string message;
+ const reply_callback_t cb;
+
+ message_wrapper_t(const std::string& _conn_name,
+ const std::string& _topic,
+ const std::string& _message,
+ reply_callback_t _cb) : conn_name(_conn_name), topic(_topic), message(_message), cb(_cb) {}
+};
+
+typedef std::unordered_map<std::string, connection_t_ptr> ConnectionList;
+typedef boost::lockfree::queue<message_wrapper_t*, boost::lockfree::fixed_sized<true>> MessageQueue;
+
+class Manager {
+public:
+ const size_t max_connections;
+ const size_t max_inflight;
+ const size_t max_queue;
+ const size_t max_idle_time;
+private:
+ std::atomic<size_t> connection_count;
+ bool stopped;
+ int read_timeout_ms;
+ ConnectionList connections;
+ MessageQueue messages;
+ std::atomic<size_t> queued;
+ std::atomic<size_t> dequeued;
+ CephContext* const cct;
+ mutable std::mutex connections_lock;
+ std::thread runner;
+
+ // TODO use rd_kafka_produce_batch for better performance
+ void publish_internal(message_wrapper_t* message) {
+ const std::unique_ptr<message_wrapper_t> msg_deleter(message);
+ const auto conn_it = connections.find(message->conn_name);
+ if (conn_it == connections.end()) {
+ ldout(cct, 1) << "Kafka publish: connection was deleted while message was in the queue. error: " << STATUS_CONNECTION_CLOSED << dendl;
+ if (message->cb) {
+ message->cb(STATUS_CONNECTION_CLOSED);
+ }
+ return;
+ }
+ auto& conn = conn_it->second;
+
+ conn->timestamp = ceph_clock_now();
+
+ if (!conn->is_ok()) {
+ // connection had an issue while message was in the queue
+ // TODO add error stats
+ ldout(conn->cct, 1) << "Kafka publish: producer was closed while message was in the queue. error: " << status_to_string(conn->status) << dendl;
+ if (message->cb) {
+ message->cb(conn->status);
+ }
+ return;
+ }
+
+ // create a new topic unless it was already created
+ auto topic_it = std::find(conn->topics.begin(), conn->topics.end(), message->topic);
+ rd_kafka_topic_t* topic = nullptr;
+ if (topic_it == conn->topics.end()) {
+ topic = rd_kafka_topic_new(conn->producer, message->topic.c_str(), nullptr);
+ if (!topic) {
+ const auto err = rd_kafka_last_error();
+ ldout(conn->cct, 1) << "Kafka publish: failed to create topic: " << message->topic << " error: " << status_to_string(err) << dendl;
+ if (message->cb) {
+ message->cb(err);
+ }
+ conn->destroy(err);
+ return;
+ }
+ // TODO use the topics list as an LRU cache
+ conn->topics.push_back(topic);
+ ldout(conn->cct, 20) << "Kafka publish: successfully created topic: " << message->topic << dendl;
+ } else {
+ topic = *topic_it;
+ ldout(conn->cct, 20) << "Kafka publish: reused existing topic: " << message->topic << dendl;
+ }
+
+ const auto tag = (message->cb == nullptr ? nullptr : new uint64_t(conn->delivery_tag++));
+ const auto rc = rd_kafka_produce(
+ topic,
+ // TODO: non builtin partitioning
+ RD_KAFKA_PARTITION_UA,
+ // make a copy of the payload
+ // so it is safe to pass the pointer from the string
+ RD_KAFKA_MSG_F_COPY,
+ message->message.data(),
+ message->message.length(),
+ // optional key and its length
+ nullptr,
+ 0,
+ // opaque data: tag, used in the global callback
+ // in order to invoke the real callback
+ // null if no callback exists
+ tag);
+ if (rc == -1) {
+ const auto err = rd_kafka_last_error();
+ ldout(conn->cct, 10) << "Kafka publish: failed to produce: " << rd_kafka_err2str(err) << dendl;
+      // TODO: don't error on full queue, and don't destroy connection, retry instead
+      // immediately invoke callback on error if needed
+ if (message->cb) {
+ message->cb(err);
+ }
+ conn->destroy(err);
+ delete tag;
+ return;
+ }
+
+ if (tag) {
+ auto const q_len = conn->callbacks.size();
+ if (q_len < max_inflight) {
+ ldout(conn->cct, 20) << "Kafka publish (with callback, tag=" << *tag << "): OK. Queue has: " << q_len << " callbacks" << dendl;
+ conn->callbacks.emplace_back(*tag, message->cb);
+ } else {
+ // immediately invoke callback with error - this is not a connection error
+ ldout(conn->cct, 1) << "Kafka publish (with callback): failed with error: callback queue full" << dendl;
+ message->cb(STATUS_MAX_INFLIGHT);
+ // tag will be deleted when the global callback is invoked
+ }
+ } else {
+ ldout(conn->cct, 20) << "Kafka publish (no callback): OK" << dendl;
+ }
+ }
+
+ // the managers thread:
+ // (1) empty the queue of messages to be published
+ // (2) loop over all connections and read acks
+ // (3) manages deleted connections
+ // (4) TODO reconnect on connection errors
+ // (5) TODO cleanup timedout callbacks
+ void run() noexcept {
+ while (!stopped) {
+
+ // publish all messages in the queue
+ auto reply_count = 0U;
+ const auto send_count = messages.consume_all(std::bind(&Manager::publish_internal, this, std::placeholders::_1));
+ dequeued += send_count;
+ ConnectionList::iterator conn_it;
+ ConnectionList::const_iterator end_it;
+ {
+ // thread safe access to the connection list
+ // once the iterators are fetched they are guaranteed to remain valid
+ std::lock_guard lock(connections_lock);
+ conn_it = connections.begin();
+ end_it = connections.end();
+ }
+ // loop over all connections to read acks
+ for (;conn_it != end_it;) {
+
+ auto& conn = conn_it->second;
+
+ // Checking the connection idlesness
+ if(conn->timestamp.sec() + max_idle_time < ceph_clock_now()) {
+ ldout(conn->cct, 20) << "kafka run: deleting a connection due to idle behaviour: " << ceph_clock_now() << dendl;
+ std::lock_guard lock(connections_lock);
+ conn->destroy(STATUS_CONNECTION_IDLE);
+ conn_it = connections.erase(conn_it);
+ --connection_count; \
+ continue;
+ }
+
+ // try to reconnect the connection if it has an error
+ if (!conn->is_ok()) {
+ ldout(conn->cct, 10) << "Kafka run: connection status is: " << status_to_string(conn->status) << dendl;
+ const auto& broker = conn_it->first;
+ ldout(conn->cct, 20) << "Kafka run: retry connection" << dendl;
+ if (new_producer(conn.get()) == false) {
+ ldout(conn->cct, 10) << "Kafka run: connection (" << broker << ") retry failed" << dendl;
+ // TODO: add error counter for failed retries
+ // TODO: add exponential backoff for retries
+ } else {
+ ldout(conn->cct, 10) << "Kafka run: connection (" << broker << ") retry successfull" << dendl;
+ }
+ ++conn_it;
+ continue;
+ }
+
+ reply_count += rd_kafka_poll(conn->producer, read_timeout_ms);
+
+ // just increment the iterator
+ ++conn_it;
+ }
+ // if no messages were received or published
+ // across all connection, sleep for 100ms
+ if (send_count == 0 && reply_count == 0) {
+ std::this_thread::sleep_for(std::chrono::milliseconds(100));
+ }
+ }
+ }
+
+ // used in the dtor for message cleanup
+ static void delete_message(const message_wrapper_t* message) {
+ delete message;
+ }
+
public:
  // construct the manager and immediately start the runner thread.
  // note: member initialization actually follows the member declaration
  // order of the class, not the order written in this initializer list
  Manager(size_t _max_connections,
      size_t _max_inflight,
      size_t _max_queue,
      int _read_timeout_ms,
      CephContext* _cct) :
    max_connections(_max_connections),
    max_inflight(_max_inflight),
    max_queue(_max_queue),
    max_idle_time(30),
    connection_count(0),
    stopped(false),
    read_timeout_ms(_read_timeout_ms),
    connections(_max_connections),
    messages(max_queue),
    queued(0),
    dequeued(0),
    cct(_cct),
    runner(&Manager::run, this) {
    // The hashmap has "max connections" as the initial number of buckets,
    // and allows for 10 collisions per bucket before rehash.
    // This is to prevent rehashing so that iterators are not invalidated
    // when a new connection is added.
    connections.max_load_factor(10.0);
    // give the runner thread a name for easier debugging
    const auto rc = ceph_pthread_setname(runner.native_handle(), "kafka_manager");
    ceph_assert(rc==0);
  }
+
  // non copyable (owns a thread and lock-protected state)
  Manager(const Manager&) = delete;
  const Manager& operator=(const Manager&) = delete;

  // stop the main thread
  // note: only raises the flag checked by the runner loop; the dtor is
  // the one that joins the thread
  void stop() {
    stopped = true;
  }
+
+ // connect to a broker, or reuse an existing connection if already connected
+ bool connect(std::string& broker,
+ const std::string& url,
+ bool use_ssl,
+ bool verify_ssl,
+ boost::optional<const std::string&> ca_location,
+ boost::optional<const std::string&> mechanism) {
+ if (stopped) {
+ ldout(cct, 1) << "Kafka connect: manager is stopped" << dendl;
+ return false;
+ }
+
+ std::string user;
+ std::string password;
+ if (!parse_url_authority(url, broker, user, password)) {
+ // TODO: increment counter
+ ldout(cct, 1) << "Kafka connect: URL parsing failed" << dendl;
+ return false;
+ }
+
+ // this should be validated by the regex in parse_url()
+ ceph_assert(user.empty() == password.empty());
+
+ if (!user.empty() && !use_ssl && !g_conf().get_val<bool>("rgw_allow_notification_secrets_in_cleartext")) {
+ ldout(cct, 1) << "Kafka connect: user/password are only allowed over secure connection" << dendl;
+ return false;
+ }
+
+ std::lock_guard lock(connections_lock);
+ const auto it = connections.find(broker);
+ // note that ssl vs. non-ssl connection to the same host are two separate conenctions
+ if (it != connections.end()) {
+ // connection found - return even if non-ok
+ ldout(cct, 20) << "Kafka connect: connection found" << dendl;
+ return it->second.get();
+ }
+
+ // connection not found, creating a new one
+ if (connection_count >= max_connections) {
+ // TODO: increment counter
+ ldout(cct, 1) << "Kafka connect: max connections exceeded" << dendl;
+ return false;
+ }
+ // create_connection must always return a connection object
+ // even if error occurred during creation.
+ // in such a case the creation will be retried in the main thread
+ ++connection_count;
+ ldout(cct, 10) << "Kafka connect: new connection is created. Total connections: " << connection_count << dendl;
+ auto conn = connections.emplace(broker, std::make_unique<connection_t>(cct, broker, use_ssl, verify_ssl, ca_location, user, password, mechanism)).first->second.get();
+ if (!new_producer(conn)) {
+ ldout(cct, 10) << "Kafka connect: new connection is created. But producer creation failed. will retry" << dendl;
+ }
+ return true;
+ }
+
+ // TODO publish with confirm is needed in "none" case as well, cb should be invoked publish is ok (no ack)
+ int publish(const std::string& conn_name,
+ const std::string& topic,
+ const std::string& message) {
+ if (stopped) {
+ return STATUS_MANAGER_STOPPED;
+ }
+ if (messages.push(new message_wrapper_t(conn_name, topic, message, nullptr))) {
+ ++queued;
+ return STATUS_OK;
+ }
+ return STATUS_QUEUE_FULL;
+ }
+
+ int publish_with_confirm(const std::string& conn_name,
+ const std::string& topic,
+ const std::string& message,
+ reply_callback_t cb) {
+ if (stopped) {
+ return STATUS_MANAGER_STOPPED;
+ }
+ if (messages.push(new message_wrapper_t(conn_name, topic, message, cb))) {
+ ++queued;
+ return STATUS_OK;
+ }
+ return STATUS_QUEUE_FULL;
+ }
+
  // dtor signals the runner thread to stop, waits for it to exit,
  // then drains and frees any messages still pending in the queue.
  // connections are cleaned-up afterwards by their own destructors
  ~Manager() {
    stopped = true;
    runner.join();
    messages.consume_all(delete_message);
  }
+
+ // get the number of connections
+ size_t get_connection_count() const {
+ return connection_count;
+ }
+
+ // get the number of in-flight messages
+ size_t get_inflight() const {
+ size_t sum = 0;
+ std::lock_guard lock(connections_lock);
+ std::for_each(connections.begin(), connections.end(), [&sum](auto& conn_pair) {
+ sum += conn_pair.second->callbacks.size();
+ });
+ return sum;
+ }
+
+ // running counter of the queued messages
+ size_t get_queued() const {
+ return queued;
+ }
+
+ // running counter of the dequeued messages
+ size_t get_dequeued() const {
+ return dequeued;
+ }
+};
+
+// singleton manager
+// note that the manager itself is not a singleton, and multiple instances may co-exist
+// TODO make the pointer atomic in allocation and deallocation to avoid race conditions
+static Manager* s_manager = nullptr;
+
+static const size_t MAX_CONNECTIONS_DEFAULT = 256;
+static const size_t MAX_INFLIGHT_DEFAULT = 8192;
+static const size_t MAX_QUEUE_DEFAULT = 8192;
+static const int READ_TIMEOUT_MS_DEFAULT = 500;
+
// create the singleton manager; returns false if it already exists
bool init(CephContext* cct) {
  if (s_manager != nullptr) {
    return false;
  }
  // TODO: take conf from CephContext
  s_manager = new Manager(MAX_CONNECTIONS_DEFAULT, MAX_INFLIGHT_DEFAULT,
      MAX_QUEUE_DEFAULT, READ_TIMEOUT_MS_DEFAULT, cct);
  return true;
}

// destroy the singleton manager (joins its thread via the dtor)
void shutdown() {
  delete s_manager;
  s_manager = nullptr;
}

// forward to the manager; all wrappers below degrade gracefully
// to a failure/default value when the manager does not exist
bool connect(std::string& broker, const std::string& url, bool use_ssl, bool verify_ssl,
        boost::optional<const std::string&> ca_location,
        boost::optional<const std::string&> mechanism) {
  return s_manager ?
      s_manager->connect(broker, url, use_ssl, verify_ssl, ca_location, mechanism) : false;
}

int publish(const std::string& conn_name,
    const std::string& topic,
    const std::string& message) {
  return s_manager ?
      s_manager->publish(conn_name, topic, message) : STATUS_MANAGER_STOPPED;
}

int publish_with_confirm(const std::string& conn_name,
    const std::string& topic,
    const std::string& message,
    reply_callback_t cb) {
  return s_manager ?
      s_manager->publish_with_confirm(conn_name, topic, message, cb) : STATUS_MANAGER_STOPPED;
}

size_t get_connection_count() {
  return s_manager ? s_manager->get_connection_count() : 0;
}

size_t get_inflight() {
  return s_manager ? s_manager->get_inflight() : 0;
}

size_t get_queued() {
  return s_manager ? s_manager->get_queued() : 0;
}

size_t get_dequeued() {
  return s_manager ? s_manager->get_dequeued() : 0;
}

size_t get_max_connections() {
  return s_manager ? s_manager->max_connections : MAX_CONNECTIONS_DEFAULT;
}

size_t get_max_inflight() {
  return s_manager ? s_manager->max_inflight : MAX_INFLIGHT_DEFAULT;
}

size_t get_max_queue() {
  return s_manager ? s_manager->max_queue : MAX_QUEUE_DEFAULT;
}
+
+} // namespace kafka
+
diff --git a/src/rgw/rgw_kafka.h b/src/rgw/rgw_kafka.h
new file mode 100644
index 000000000..813fda329
--- /dev/null
+++ b/src/rgw/rgw_kafka.h
@@ -0,0 +1,66 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab ft=cpp

// public interface of the RGW kafka manager: a singleton that owns the
// broker connections and publishes bucket-notification messages

#pragma once

#include <string>
#include <functional>
#include <boost/optional.hpp>

#include "include/common_fwd.h"

namespace rgw::kafka {

// the reply callback is expected to get an integer parameter
// indicating the result, and not to return anything
typedef std::function<void(int)> reply_callback_t;

// initialize the kafka manager
// returns false if it was already initialized
bool init(CephContext* cct);

// shutdown the kafka manager
void shutdown();

// connect to a kafka endpoint
// "broker" is an output parameter, set from the parsed URL
bool connect(std::string& broker, const std::string& url, bool use_ssl, bool verify_ssl, boost::optional<const std::string&> ca_location, boost::optional<const std::string&> mechanism);

// publish a message over a connection that was already created
int publish(const std::string& conn_name,
    const std::string& topic,
    const std::string& message);

// publish a message over a connection that was already created
// and pass a callback that will be invoked (async) when broker confirms
// receiving the message
int publish_with_confirm(const std::string& conn_name,
    const std::string& topic,
    const std::string& message,
    reply_callback_t cb);

// convert the integer status returned from the "publish" function to a string
std::string status_to_string(int s);

// number of connections
size_t get_connection_count();

// return the number of messages that were sent
// to broker, but were not yet acked/nacked/timedout
size_t get_inflight();

// running counter of successfully queued messages
size_t get_queued();

// running counter of dequeued messages
size_t get_dequeued();

// number of maximum allowed connections
size_t get_max_connections();

// number of maximum allowed inflight messages
size_t get_max_inflight();

// maximum number of messages in the queue
size_t get_max_queue();

} // namespace rgw::kafka
+
diff --git a/src/rgw/rgw_keystone.cc b/src/rgw/rgw_keystone.cc
new file mode 100644
index 000000000..2df417bd0
--- /dev/null
+++ b/src/rgw/rgw_keystone.cc
@@ -0,0 +1,684 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <errno.h>
+#include <fnmatch.h>
+
+#include <boost/algorithm/string/predicate.hpp>
+#include <boost/algorithm/string.hpp>
+#include <fstream>
+
+#include "common/errno.h"
+#include "common/ceph_json.h"
+#include "include/types.h"
+#include "include/str_list.h"
+
+#include "rgw_common.h"
+#include "rgw_keystone.h"
+#include "common/armor.h"
+#include "common/Cond.h"
+#include "rgw_perf_counters.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+#define PKI_ANS1_PREFIX "MII"
+
+using namespace std;
+
+bool rgw_is_pki_token(const string& token)
+{
+ return token.compare(0, sizeof(PKI_ANS1_PREFIX) - 1, PKI_ANS1_PREFIX) == 0;
+}
+
// derive a cache/lookup id from a keystone token: short (UUID-style)
// tokens are used verbatim, while long PKI tokens are replaced by the
// hex encoding of their MD5 digest
void rgw_get_token_id(const string& token, string& token_id)
{
  if (!rgw_is_pki_token(token)) {
    token_id = token;
    return;
  }

  unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE];

  MD5 hash;
  // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
  hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
  hash.Update((const unsigned char *)token.c_str(), token.size());
  hash.Final(m);

  char calc_md5[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
  buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, calc_md5);
  token_id = calc_md5;
}
+
+
+namespace rgw {
+namespace keystone {
+
// map the configured keystone API version to the enum,
// falling back to v2 (with an error log) on unrecognized values
ApiVersion CephCtxConfig::get_api_version() const noexcept
{
  switch (g_ceph_context->_conf->rgw_keystone_api_version) {
  case 3:
    return ApiVersion::VER_3;
  case 2:
    return ApiVersion::VER_2;
  default:
    dout(0) << "ERROR: wrong Keystone API version: "
            << g_ceph_context->_conf->rgw_keystone_api_version
            << "; falling back to v2" << dendl;
    return ApiVersion::VER_2;
  }
}
+
// return the configured keystone endpoint URL, normalised to end with '/'.
// NOTE: the value is captured into function-local statics on the first
// call, so later changes to rgw_keystone_url are not picked up
std::string CephCtxConfig::get_endpoint_url() const noexcept
{
  static const std::string url = g_ceph_context->_conf->rgw_keystone_url;

  if (url.empty() || boost::algorithm::ends_with(url, "/")) {
    return url;
  } else {
    static const std::string url_normalised = url + '/';
    return url_normalised;
  }
}
+
+/* secrets */
+const std::string CephCtxConfig::empty{""};
+
+static inline std::string read_secret(const std::string& file_path)
+{
+ using namespace std;
+
+ constexpr int16_t size{1024};
+ char buf[size];
+ string s;
+
+ s.reserve(size);
+ ifstream ifs(file_path, ios::in | ios::binary);
+ if (ifs) {
+ while (true) {
+ auto sbuf = ifs.rdbuf();
+ auto len = sbuf->sgetn(buf, size);
+ if (!len)
+ break;
+ s.append(buf, len);
+ }
+ boost::algorithm::trim(s);
+ if (s.back() == '\n')
+ s.pop_back();
+ }
+ return s;
+}
+
+std::string CephCtxConfig::get_admin_token() const noexcept
+{
+ auto& atv = g_ceph_context->_conf->rgw_keystone_admin_token_path;
+ if (!atv.empty()) {
+ return read_secret(atv);
+ } else {
+ auto& atv = g_ceph_context->_conf->rgw_keystone_admin_token;
+ if (!atv.empty()) {
+ return atv;
+ }
+ }
+ return empty;
+}
+
+std::string CephCtxConfig::get_admin_password() const noexcept {
+ auto& apv = g_ceph_context->_conf->rgw_keystone_admin_password_path;
+ if (!apv.empty()) {
+ return read_secret(apv);
+ } else {
+ auto& apv = g_ceph_context->_conf->rgw_keystone_admin_password;
+ if (!apv.empty()) {
+ return apv;
+ }
+ }
+ return empty;
+}
+
// obtain an admin token: use the statically configured shared secret if
// present, then the token cache, and only then issue a fresh request to
// keystone. returns 0 on success, negative error code otherwise
int Service::get_admin_token(const DoutPrefixProvider *dpp,
                             CephContext* const cct,
                             TokenCache& token_cache,
                             const Config& config,
                             std::string& token)
{
  /* Let's check whether someone uses the deprecated "admin token" feature
   * based on a shared secret from keystone.conf file. */
  const auto& admin_token = config.get_admin_token();
  if (! admin_token.empty()) {
    token = std::string(admin_token.data(), admin_token.length());
    return 0;
  }

  TokenEnvelope t;

  /* Try cache first before calling Keystone for a new admin token. */
  if (token_cache.find_admin(t)) {
    ldpp_dout(dpp, 20) << "found cached admin token" << dendl;
    token = t.token.id;
    return 0;
  }

  /* Call Keystone now. */
  const auto ret = issue_admin_token_request(dpp, cct, config, t);
  if (! ret) {
    token_cache.add_admin(t);
    token = t.token.id;
  }

  return ret;
}
+
// request a fresh admin token from keystone using password credentials,
// serialized per the configured API version (v2 or v3).
// fills "t" on success and returns 0; negative error code otherwise
int Service::issue_admin_token_request(const DoutPrefixProvider *dpp,
                                       CephContext* const cct,
                                       const Config& config,
                                       TokenEnvelope& t)
{
  std::string token_url = config.get_endpoint_url();
  if (token_url.empty()) {
    return -EINVAL;
  }

  bufferlist token_bl;
  RGWGetKeystoneAdminToken token_req(cct, "POST", "", &token_bl);
  token_req.append_header("Content-Type", "application/json");
  JSONFormatter jf;

  // serialize the credentials and pick the version-specific endpoint path
  const auto keystone_version = config.get_api_version();
  if (keystone_version == ApiVersion::VER_2) {
    AdminTokenRequestVer2 req_serializer(config);
    req_serializer.dump(&jf);

    std::stringstream ss;
    jf.flush(ss);
    token_req.set_post_data(ss.str());
    token_req.set_send_length(ss.str().length());
    token_url.append("v2.0/tokens");

  } else if (keystone_version == ApiVersion::VER_3) {
    AdminTokenRequestVer3 req_serializer(config);
    req_serializer.dump(&jf);

    std::stringstream ss;
    jf.flush(ss);
    token_req.set_post_data(ss.str());
    token_req.set_send_length(ss.str().length());
    token_url.append("v3/auth/tokens");
  } else {
    return -ENOTSUP;
  }

  token_req.set_url(token_url);

  // synchronous HTTP round-trip (null_yield)
  const int ret = token_req.process(null_yield);
  if (ret < 0) {
    return ret;
  }

  /* Detect rejection earlier than during the token parsing step. */
  if (token_req.get_http_status() ==
          RGWGetKeystoneAdminToken::HTTP_STATUS_UNAUTHORIZED) {
    return -EACCES;
  }

  if (t.parse(dpp, cct, token_req.get_subject_token(), token_bl,
              keystone_version) != 0) {
    return -EINVAL;
  }

  return 0;
}
+
// obtain a keystone token for talking to barbican, consulting the cache
// first; the request serialization mirrors the admin-token path but uses
// the rgw_keystone_barbican_* credentials
int Service::get_keystone_barbican_token(const DoutPrefixProvider *dpp,
                                         CephContext * const cct,
                                         std::string& token)
{
  using keystone_config_t = rgw::keystone::CephCtxConfig;
  using keystone_cache_t = rgw::keystone::TokenCache;

  auto& config = keystone_config_t::get_instance();
  auto& token_cache = keystone_cache_t::get_instance<keystone_config_t>();

  std::string token_url = config.get_endpoint_url();
  if (token_url.empty()) {
    return -EINVAL;
  }

  rgw::keystone::TokenEnvelope t;

  /* Try cache first. */
  if (token_cache.find_barbican(t)) {
    ldpp_dout(dpp, 20) << "found cached barbican token" << dendl;
    token = t.token.id;
    return 0;
  }

  bufferlist token_bl;
  RGWKeystoneHTTPTransceiver token_req(cct, "POST", "", &token_bl);
  token_req.append_header("Content-Type", "application/json");
  JSONFormatter jf;

  // serialize the credentials and pick the version-specific endpoint path
  const auto keystone_version = config.get_api_version();
  if (keystone_version == ApiVersion::VER_2) {
    rgw::keystone::BarbicanTokenRequestVer2 req_serializer(cct);
    req_serializer.dump(&jf);

    std::stringstream ss;
    jf.flush(ss);
    token_req.set_post_data(ss.str());
    token_req.set_send_length(ss.str().length());
    token_url.append("v2.0/tokens");

  } else if (keystone_version == ApiVersion::VER_3) {
    BarbicanTokenRequestVer3 req_serializer(cct);
    req_serializer.dump(&jf);

    std::stringstream ss;
    jf.flush(ss);
    token_req.set_post_data(ss.str());
    token_req.set_send_length(ss.str().length());
    token_url.append("v3/auth/tokens");
  } else {
    return -ENOTSUP;
  }

  token_req.set_url(token_url);

  ldpp_dout(dpp, 20) << "Requesting secret from barbican url=" << token_url << dendl;
  const int ret = token_req.process(null_yield);
  if (ret < 0) {
    ldpp_dout(dpp, 20) << "Barbican process error:" << token_bl.c_str() << dendl;
    return ret;
  }

  /* Detect rejection earlier than during the token parsing step. */
  if (token_req.get_http_status() ==
          RGWKeystoneHTTPTransceiver::HTTP_STATUS_UNAUTHORIZED) {
    return -EACCES;
  }

  if (t.parse(dpp, cct, token_req.get_subject_token(), token_bl,
              keystone_version) != 0) {
    return -EINVAL;
  }

  token_cache.add_barbican(t);
  token = t.token.id;
  return 0;
}
+
+
+bool TokenEnvelope::has_role(const std::string& r) const
+{
+ list<Role>::const_iterator iter;
+ for (iter = roles.cbegin(); iter != roles.cend(); ++iter) {
+ if (fnmatch(r.c_str(), ((*iter).name.c_str()), 0) == 0) {
+ return true;
+ }
+ }
+ return false;
+}
+
// parse a keystone token response body according to the expected API
// version, with cross-version fallback (some middleware answers in the
// other format). returns 0 on success, -EINVAL on malformed input,
// -ENOTSUP for an unknown version
int TokenEnvelope::parse(const DoutPrefixProvider *dpp,
                         CephContext* const cct,
                         const std::string& token_str,
                         ceph::bufferlist& bl,
                         const ApiVersion version)
{
  JSONParser parser;
  if (! parser.parse(bl.c_str(), bl.length())) {
    ldpp_dout(dpp, 0) << "Keystone token parse error: malformed json" << dendl;
    return -EINVAL;
  }

  // v3 responses carry a top-level "token" object, v2 an "access" object
  JSONObjIter token_iter = parser.find_first("token");
  JSONObjIter access_iter = parser.find_first("access");

  try {
    if (version == rgw::keystone::ApiVersion::VER_2) {
      if (! access_iter.end()) {
        decode_v2(*access_iter);
      } else if (! token_iter.end()) {
        /* TokenEnvelope structure doesn't follow Identity API v2, so let's
         * fallback to v3. Otherwise we can assume it's wrongly formatted.
         * The whole mechanism is a workaround for s3_token middleware that
         * speaks in v2 disregarding the promise to go with v3. */
        decode_v3(*token_iter);

        /* Identity v3 conveys the token information not as a part of JSON but
         * in the X-Subject-Token HTTP header we're getting from caller. */
        token.id = token_str;
      } else {
        return -EINVAL;
      }
    } else if (version == rgw::keystone::ApiVersion::VER_3) {
      if (! token_iter.end()) {
        decode_v3(*token_iter);
        /* v3 succeeded. We have to fill token.id from external input as it
         * isn't a part of the JSON response anymore. It has been moved
         * to X-Subject-Token HTTP header instead. */
        token.id = token_str;
      } else if (! access_iter.end()) {
        /* If the token cannot be parsed according to V3, try V2. */
        decode_v2(*access_iter);
      } else {
        return -EINVAL;
      }
    } else {
      return -ENOTSUP;
    }
  } catch (const JSONDecoder::err& err) {
    ldpp_dout(dpp, 0) << "Keystone token parse error: " << err.what() << dendl;
    return -EINVAL;
  }

  return 0;
}
+
// look up an end-user token by id
bool TokenCache::find(const std::string& token_id,
                      rgw::keystone::TokenEnvelope& token)
{
  std::lock_guard l{lock};
  return find_locked(token_id, token, tokens, tokens_lru);
}

// look up a service token by id (kept in a separate map/LRU)
bool TokenCache::find_service(const std::string& token_id,
                              rgw::keystone::TokenEnvelope& token)
{
  std::lock_guard l{lock};
  return find_locked(token_id, token, service_tokens, service_tokens_lru);
}

// shared lookup helper; caller must hold "lock".
// on success the entry is moved to the front of the LRU list;
// expired entries are erased and reported as not found
bool TokenCache::find_locked(const std::string& token_id, rgw::keystone::TokenEnvelope& token,
                             std::map<std::string, token_entry>& tokens, std::list<std::string>& tokens_lru)
{
  ceph_assert(ceph_mutex_is_locked_by_me(lock));
  map<string, token_entry>::iterator iter = tokens.find(token_id);
  if (iter == tokens.end()) {
    if (perfcounter) perfcounter->inc(l_rgw_keystone_token_cache_miss);
    return false;
  }

  token_entry& entry = iter->second;
  tokens_lru.erase(entry.lru_iter);

  if (entry.token.expired()) {
    tokens.erase(iter);
    // NOTE(review): an expired entry still bumps the "hit" counter here,
    // presumably because the id was found - confirm this is intentional
    if (perfcounter) perfcounter->inc(l_rgw_keystone_token_cache_hit);
    return false;
  }
  token = entry.token;

  // refresh the LRU position of the entry
  tokens_lru.push_front(token_id);
  entry.lru_iter = tokens_lru.begin();

  if (perfcounter) perfcounter->inc(l_rgw_keystone_token_cache_hit);

  return true;
}

// look up the cached admin token (stored in the regular token map)
bool TokenCache::find_admin(rgw::keystone::TokenEnvelope& token)
{
  std::lock_guard l{lock};

  return find_locked(admin_token_id, token, tokens, tokens_lru);
}

// look up the cached barbican token (stored in the regular token map)
bool TokenCache::find_barbican(rgw::keystone::TokenEnvelope& token)
{
  std::lock_guard l{lock};

  return find_locked(barbican_token_id, token, tokens, tokens_lru);
}
+
// insert/refresh an end-user token
void TokenCache::add(const std::string& token_id,
                     const rgw::keystone::TokenEnvelope& token)
{
  std::lock_guard l{lock};
  add_locked(token_id, token, tokens, tokens_lru);
}

// insert/refresh a service token (separate map/LRU)
void TokenCache::add_service(const std::string& token_id,
                             const rgw::keystone::TokenEnvelope& token)
{
  std::lock_guard l{lock};
  add_locked(token_id, token, service_tokens, service_tokens_lru);
}

// shared insert helper; caller must hold "lock".
// moves the entry to the front of the LRU list, then evicts from the
// LRU tail while the cache holds more than "max" entries
void TokenCache::add_locked(const std::string& token_id, const rgw::keystone::TokenEnvelope& token,
                            std::map<std::string, token_entry>& tokens, std::list<std::string>& tokens_lru)
{
  ceph_assert(ceph_mutex_is_locked_by_me(lock));
  map<string, token_entry>::iterator iter = tokens.find(token_id);
  if (iter != tokens.end()) {
    // drop the stale LRU position of an already-cached token
    token_entry& e = iter->second;
    tokens_lru.erase(e.lru_iter);
  }

  tokens_lru.push_front(token_id);
  token_entry& entry = tokens[token_id];
  entry.token = token;
  entry.lru_iter = tokens_lru.begin();

  while (tokens_lru.size() > max) {
    list<string>::reverse_iterator riter = tokens_lru.rbegin();
    iter = tokens.find(*riter);
    ceph_assert(iter != tokens.end());
    tokens.erase(iter);
    tokens_lru.pop_back();
  }
}

// cache the admin token, keyed by the digest/id from rgw_get_token_id
void TokenCache::add_admin(const rgw::keystone::TokenEnvelope& token)
{
  std::lock_guard l{lock};

  rgw_get_token_id(token.token.id, admin_token_id);
  add_locked(admin_token_id, token, tokens, tokens_lru);
}

// cache the barbican token, keyed by the digest/id from rgw_get_token_id
void TokenCache::add_barbican(const rgw::keystone::TokenEnvelope& token)
{
  std::lock_guard l{lock};

  rgw_get_token_id(token.token.id, barbican_token_id);
  add_locked(barbican_token_id, token, tokens, tokens_lru);
}
+
// remove a (revoked) token from the cache, if present
void TokenCache::invalidate(const DoutPrefixProvider *dpp, const std::string& token_id)
{
  std::lock_guard l{lock};
  map<string, token_entry>::iterator iter = tokens.find(token_id);
  if (iter == tokens.end())
    return;

  ldpp_dout(dpp, 20) << "invalidating revoked token id=" << token_id << dendl;
  token_entry& e = iter->second;
  tokens_lru.erase(e.lru_iter);
  tokens.erase(iter);
}

// true once shutdown has been requested
bool TokenCache::going_down() const
{
  return down_flag;
}
+
+}; /* namespace keystone */
+}; /* namespace rgw */
+
// decode the v2 token object; throws JSONDecoder::err on missing
// mandatory fields or an unparsable expiration date
void rgw::keystone::TokenEnvelope::Token::decode_json(JSONObj *obj)
{
  string expires_iso8601;
  struct tm t;

  JSONDecoder::decode_json("id", id, obj, true);
  JSONDecoder::decode_json("tenant", tenant_v2, obj, true);
  JSONDecoder::decode_json("expires", expires_iso8601, obj, true);

  if (parse_iso8601(expires_iso8601.c_str(), &t)) {
    expires = internal_timegm(&t);
  } else {
    expires = 0;
    throw JSONDecoder::err("Failed to parse ISO8601 expiration date from Keystone response.");
  }
}
+
// the decoders below map keystone JSON sub-objects onto TokenEnvelope
// members; the trailing "true" argument presumably marks a field as
// mandatory (decode_json throws when it is absent) - see JSONDecoder

void rgw::keystone::TokenEnvelope::Role::decode_json(JSONObj *obj)
{
  JSONDecoder::decode_json("id", id, obj);
  JSONDecoder::decode_json("name", name, obj, true);
}

void rgw::keystone::TokenEnvelope::Domain::decode_json(JSONObj *obj)
{
  JSONDecoder::decode_json("id", id, obj, true);
  JSONDecoder::decode_json("name", name, obj, true);
}

void rgw::keystone::TokenEnvelope::Project::decode_json(JSONObj *obj)
{
  JSONDecoder::decode_json("id", id, obj, true);
  JSONDecoder::decode_json("name", name, obj, true);
  JSONDecoder::decode_json("domain", domain, obj);
}

void rgw::keystone::TokenEnvelope::User::decode_json(JSONObj *obj)
{
  JSONDecoder::decode_json("id", id, obj, true);
  JSONDecoder::decode_json("name", name, obj, true);
  JSONDecoder::decode_json("domain", domain, obj);
  JSONDecoder::decode_json("roles", roles_v2, obj);
}
+
// decode the "token" object of an Identity v3 response.
// the token id itself is NOT part of the JSON - it arrives in the
// X-Subject-Token HTTP header and is filled in by the caller
void rgw::keystone::TokenEnvelope::decode_v3(JSONObj* const root_obj)
{
  std::string expires_iso8601;

  JSONDecoder::decode_json("user", user, root_obj, true);
  JSONDecoder::decode_json("expires_at", expires_iso8601, root_obj, true);
  JSONDecoder::decode_json("roles", roles, root_obj, true);
  JSONDecoder::decode_json("project", project, root_obj, true);

  struct tm t;
  if (parse_iso8601(expires_iso8601.c_str(), &t)) {
    token.expires = internal_timegm(&t);
  } else {
    token.expires = 0;
    // NOTE(review): the two adjacent literals concatenate without a
    // space ("...datefrom Keystone...") - runtime string left unchanged
    throw JSONDecoder::err("Failed to parse ISO8601 expiration date"
                           "from Keystone response.");
  }
}

// decode the "access" object of an Identity v2 response and map the
// v2-specific fields onto the common (v3-shaped) members
void rgw::keystone::TokenEnvelope::decode_v2(JSONObj* const root_obj)
{
  JSONDecoder::decode_json("user", user, root_obj, true);
  JSONDecoder::decode_json("token", token, root_obj, true);

  roles = user.roles_v2;
  project = token.tenant_v2;
}
+
/* Local helper converting a string_view into an owning std::string.
 * Kept as a distinct utility to avoid clashing with the std::to_string
 * overload for string_ref shipped by older (pre-bundled) Boost releases;
 * remove once the bundled Boost makes it unnecessary. */
static inline std::string to_string(const std::string_view& s)
{
  return std::string{s};
}
+
// serialize a v2 admin token request:
// {"auth": {"passwordCredentials": {"username","password"}, "tenantName"}}
void rgw::keystone::AdminTokenRequestVer2::dump(Formatter* const f) const
{
  f->open_object_section("token_request");
  f->open_object_section("auth");
  f->open_object_section("passwordCredentials");
  encode_json("username", ::to_string(conf.get_admin_user()), f);
  encode_json("password", ::to_string(conf.get_admin_password()), f);
  f->close_section();            // passwordCredentials
  encode_json("tenantName", ::to_string(conf.get_admin_tenant()), f);
  f->close_section();            // auth
  f->close_section();            // token_request
}
+
// serialize a v3 admin token request:
// {"auth": {"identity": {"methods": ["password"],
//                        "password": {"user": {...}}},
//           "scope": {"project": {...}}}}
void rgw::keystone::AdminTokenRequestVer3::dump(Formatter* const f) const
{
  f->open_object_section("token_request");
  f->open_object_section("auth");
  f->open_object_section("identity");
  f->open_array_section("methods");
  f->dump_string("", "password");
  f->close_section();            // methods
  f->open_object_section("password");
  f->open_object_section("user");
  f->open_object_section("domain");
  encode_json("name", ::to_string(conf.get_admin_domain()), f);
  f->close_section();            // domain
  encode_json("name", ::to_string(conf.get_admin_user()), f);
  encode_json("password", ::to_string(conf.get_admin_password()), f);
  f->close_section();            // user
  f->close_section();            // password
  f->close_section();            // identity
  f->open_object_section("scope");
  f->open_object_section("project");
  if (! conf.get_admin_project().empty()) {
    encode_json("name", ::to_string(conf.get_admin_project()), f);
  } else {
    // fall back to the tenant name when no project is configured
    encode_json("name", ::to_string(conf.get_admin_tenant()), f);
  }
  f->open_object_section("domain");
  encode_json("name", ::to_string(conf.get_admin_domain()), f);
  f->close_section();            // domain
  f->close_section();            // project
  f->close_section();            // scope
  f->close_section();            // auth
  f->close_section();            // token_request
}
+
// serialize a v2 barbican token request from the
// rgw_keystone_barbican_* configuration values
void rgw::keystone::BarbicanTokenRequestVer2::dump(Formatter* const f) const
{
  f->open_object_section("token_request");
  f->open_object_section("auth");
  f->open_object_section("passwordCredentials");
  encode_json("username", cct->_conf->rgw_keystone_barbican_user, f);
  encode_json("password", cct->_conf->rgw_keystone_barbican_password, f);
  f->close_section();            // passwordCredentials
  encode_json("tenantName", cct->_conf->rgw_keystone_barbican_tenant, f);
  f->close_section();            // auth
  f->close_section();            // token_request
}
+
// serialize a v3 barbican token request from the
// rgw_keystone_barbican_* configuration values
void rgw::keystone::BarbicanTokenRequestVer3::dump(Formatter* const f) const
{
  f->open_object_section("token_request");
  f->open_object_section("auth");
  f->open_object_section("identity");
  f->open_array_section("methods");
  f->dump_string("", "password");
  f->close_section();            // methods
  f->open_object_section("password");
  f->open_object_section("user");
  f->open_object_section("domain");
  encode_json("name", cct->_conf->rgw_keystone_barbican_domain, f);
  f->close_section();            // domain
  encode_json("name", cct->_conf->rgw_keystone_barbican_user, f);
  encode_json("password", cct->_conf->rgw_keystone_barbican_password, f);
  f->close_section();            // user
  f->close_section();            // password
  f->close_section();            // identity
  f->open_object_section("scope");
  f->open_object_section("project");
  if (!cct->_conf->rgw_keystone_barbican_project.empty()) {
    encode_json("name", cct->_conf->rgw_keystone_barbican_project, f);
  } else {
    // fall back to the tenant name when no project is configured
    encode_json("name", cct->_conf->rgw_keystone_barbican_tenant, f);
  }
  f->open_object_section("domain");
  encode_json("name", cct->_conf->rgw_keystone_barbican_domain, f);
  f->close_section();            // domain
  f->close_section();            // project
  f->close_section();            // scope
  f->close_section();            // auth
  f->close_section();            // token_request
}
+
+
diff --git a/src/rgw/rgw_keystone.h b/src/rgw/rgw_keystone.h
new file mode 100644
index 000000000..0ba882782
--- /dev/null
+++ b/src/rgw/rgw_keystone.h
@@ -0,0 +1,333 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <atomic>
+#include <string_view>
+#include <type_traits>
+#include <utility>
+
+#include <boost/optional.hpp>
+
+#include "rgw_common.h"
+#include "rgw_http_client.h"
+#include "common/ceph_mutex.h"
+#include "global/global_init.h"
+
+
+// Returns true when the token looks like a PKI/PKIZ token (implemented in
+// the .cc file).
+bool rgw_is_pki_token(const std::string& token);
+// Derive a cache key ("token id") from a raw token (implemented in the .cc
+// file; for large PKI tokens this is presumably a digest -- see the
+// definition for the exact scheme).
+void rgw_get_token_id(const std::string& token, std::string& token_id);
+// Convenience by-value wrapper around the out-parameter overload above.
+static inline std::string rgw_get_token_id(const std::string& token)
+{
+ std::string token_id;
+ rgw_get_token_id(token, token_id);
+
+ return token_id;
+}
+
+namespace rgw {
+namespace keystone {
+
+// Keystone Identity API versions supported by RGW (v2.0 and v3).
+enum class ApiVersion {
+ VER_2,
+ VER_3
+};
+
+
+// Abstract source of Keystone connection/credential settings.  Concrete
+// implementations (e.g. CephCtxConfig below) pull the values from their
+// backing store; not constructible or deletable through this interface.
+class Config {
+protected:
+ Config() = default;
+ virtual ~Config() = default;
+
+public:
+ virtual std::string get_endpoint_url() const noexcept = 0;
+ virtual ApiVersion get_api_version() const noexcept = 0;
+
+ // Static admin token, if configured; used instead of user/password auth.
+ virtual std::string get_admin_token() const noexcept = 0;
+ virtual std::string_view get_admin_user() const noexcept = 0;
+ virtual std::string get_admin_password() const noexcept = 0;
+ // v2 scoping (tenant) vs. v3 scoping (project + domain).
+ virtual std::string_view get_admin_tenant() const noexcept = 0;
+ virtual std::string_view get_admin_project() const noexcept = 0;
+ virtual std::string_view get_admin_domain() const noexcept = 0;
+};
+
+// Config implementation backed by the global CephContext configuration
+// (rgw_keystone_admin_* options).  Meyers-singleton; the returned
+// string_views alias the live config strings, so they remain valid only as
+// long as the corresponding option is not rewritten.
+class CephCtxConfig : public Config {
+protected:
+ CephCtxConfig() = default;
+ virtual ~CephCtxConfig() = default;
+
+ // Shared empty-string fallback (defined in the .cc file).
+ const static std::string empty;
+
+public:
+ static CephCtxConfig& get_instance() {
+ static CephCtxConfig instance;
+ return instance;
+ }
+
+ std::string get_endpoint_url() const noexcept override;
+ ApiVersion get_api_version() const noexcept override;
+
+ std::string get_admin_token() const noexcept override;
+
+ std::string_view get_admin_user() const noexcept override {
+ return g_ceph_context->_conf->rgw_keystone_admin_user;
+ }
+
+ std::string get_admin_password() const noexcept override;
+
+ std::string_view get_admin_tenant() const noexcept override {
+ return g_ceph_context->_conf->rgw_keystone_admin_tenant;
+ }
+
+ std::string_view get_admin_project() const noexcept override {
+ return g_ceph_context->_conf->rgw_keystone_admin_project;
+ }
+
+ std::string_view get_admin_domain() const noexcept override {
+ return g_ceph_context->_conf->rgw_keystone_admin_domain;
+ }
+};
+
+
+class TokenEnvelope;
+class TokenCache;
+
+// Static helpers for talking to the Keystone service (admin-token
+// acquisition and Barbican token retrieval).
+class Service {
+public:
+ // HTTP transceiver preconfigured for Keystone: honors
+ // rgw_keystone_verify_ssl and captures the X-Subject-Token response
+ // header (which carries the issued token in the v3 API).
+ class RGWKeystoneHTTPTransceiver : public RGWHTTPTransceiver {
+ public:
+ RGWKeystoneHTTPTransceiver(CephContext * const cct,
+ const std::string& method,
+ const std::string& url,
+ bufferlist * const token_body_bl)
+ : RGWHTTPTransceiver(cct, method, url, token_body_bl,
+ cct->_conf->rgw_keystone_verify_ssl,
+ { "X-Subject-Token" }) {
+ }
+
+ // Returns the captured X-Subject-Token header, or a reference to a
+ // static empty value when the header was absent.
+ const header_value_t& get_subject_token() const {
+ try {
+ return get_header_value("X-Subject-Token");
+ } catch (std::out_of_range&) {
+ static header_value_t empty_val;
+ return empty_val;
+ }
+ }
+ };
+
+ typedef RGWKeystoneHTTPTransceiver RGWValidateKeystoneToken;
+ typedef RGWKeystoneHTTPTransceiver RGWGetKeystoneAdminToken;
+
+ // Fetch (or reuse from cache) an admin token for Keystone operations.
+ static int get_admin_token(const DoutPrefixProvider *dpp,
+ CephContext* const cct,
+ TokenCache& token_cache,
+ const Config& config,
+ std::string& token);
+ // Always issue a fresh admin-token request (bypasses the cache).
+ static int issue_admin_token_request(const DoutPrefixProvider *dpp,
+ CephContext* const cct,
+ const Config& config,
+ TokenEnvelope& token);
+ // Obtain a token suitable for authenticating against Barbican.
+ static int get_keystone_barbican_token(const DoutPrefixProvider *dpp,
+ CephContext * const cct,
+ std::string& token);
+};
+
+
+// Parsed Keystone token payload, shared between the v2 and v3 response
+// formats.  The *_v2 members hold data that only exists in v2 responses;
+// decode_v2()/decode_v3() normalize both formats into the common fields.
+class TokenEnvelope {
+public:
+ class Domain {
+ public:
+ std::string id;
+ std::string name;
+ void decode_json(JSONObj *obj);
+ };
+ class Project {
+ public:
+ Domain domain;
+ std::string id;
+ std::string name;
+ void decode_json(JSONObj *obj);
+ };
+
+ class Token {
+ public:
+ Token() : expires(0) { }
+ std::string id;
+ time_t expires; // absolute expiry (seconds since epoch)
+ Project tenant_v2; // v2 responses carry the tenant inside the token
+ void decode_json(JSONObj *obj);
+ };
+
+ class Role {
+ public:
+ std::string id;
+ std::string name;
+ void decode_json(JSONObj *obj);
+ };
+
+ class User {
+ public:
+ std::string id;
+ std::string name;
+ Domain domain;
+ std::list<Role> roles_v2; // v2 responses nest roles under the user
+ void decode_json(JSONObj *obj);
+ };
+
+ Token token;
+ Project project;
+ User user;
+ std::list<Role> roles;
+
+ void decode_v3(JSONObj* obj);
+ void decode_v2(JSONObj* obj);
+
+public:
+ /* We really need the default ctor because of the internals of TokenCache. */
+ TokenEnvelope() = default;
+
+ void set_expires(time_t expires) { token.expires = expires; }
+ time_t get_expires() const { return token.expires; }
+ const std::string& get_domain_id() const {return project.domain.id;};
+ const std::string& get_domain_name() const {return project.domain.name;};
+ const std::string& get_project_id() const {return project.id;};
+ const std::string& get_project_name() const {return project.name;};
+ const std::string& get_user_id() const {return user.id;};
+ const std::string& get_user_name() const {return user.name;};
+ bool has_role(const std::string& r) const;
+ // True when the wall clock has reached the token's expiry; the
+ // sign-safe comparison avoids surprises between uint64_t and time_t.
+ bool expired() const {
+ const uint64_t now = ceph_clock_now().sec();
+ return std::cmp_greater_equal(now, get_expires());
+ }
+ // Parse a raw Keystone response body into this envelope according to
+ // the given API version.
+ int parse(const DoutPrefixProvider *dpp, CephContext* cct,
+ const std::string& token_str,
+ ceph::buffer::list& bl /* in */,
+ ApiVersion version);
+};
+
+
+// LRU cache of validated Keystone tokens, keyed by token id.  Separate
+// maps (and LRU lists) are kept for user tokens and service tokens; both
+// are bounded by rgw_keystone_token_cache_size.  All public operations
+// take the internal mutex; one singleton instance exists per Config type.
+class TokenCache {
+ struct token_entry {
+ TokenEnvelope token;
+ std::list<std::string>::iterator lru_iter; // position in the LRU list
+ };
+
+ std::atomic<bool> down_flag = { false };
+ const boost::intrusive_ptr<CephContext> cct;
+
+ std::string admin_token_id;
+ std::string barbican_token_id;
+ std::map<std::string, token_entry> tokens;
+ std::map<std::string, token_entry> service_tokens;
+ std::list<std::string> tokens_lru;
+ std::list<std::string> service_tokens_lru;
+
+ ceph::mutex lock = ceph::make_mutex("rgw::keystone::TokenCache");
+
+ const size_t max; // capacity of each token map
+
+ // NOTE(review): the config parameter is currently unused; sizing comes
+ // straight from the global CephContext.
+ explicit TokenCache(const rgw::keystone::Config& config)
+ : cct(g_ceph_context),
+ max(cct->_conf->rgw_keystone_token_cache_size) {
+ }
+
+ ~TokenCache() {
+ down_flag = true;
+ }
+
+public:
+ TokenCache(const TokenCache&) = delete;
+ void operator=(const TokenCache&) = delete;
+
+ // Per-ConfigT singleton accessor.
+ template<class ConfigT>
+ static TokenCache& get_instance() {
+ static_assert(std::is_base_of<rgw::keystone::Config, ConfigT>::value,
+ "ConfigT must be a subclass of rgw::keystone::Config");
+
+ /* In C++11 this is thread safe. */
+ static TokenCache instance(ConfigT::get_instance());
+ return instance;
+ }
+
+ bool find(const std::string& token_id, TokenEnvelope& token);
+ bool find_service(const std::string& token_id, TokenEnvelope& token);
+ boost::optional<TokenEnvelope> find(const std::string& token_id) {
+ TokenEnvelope token_envlp;
+ if (find(token_id, token_envlp)) {
+ return token_envlp;
+ }
+ return boost::none;
+ }
+ boost::optional<TokenEnvelope> find_service(const std::string& token_id) {
+ TokenEnvelope token_envlp;
+ if (find_service(token_id, token_envlp)) {
+ return token_envlp;
+ }
+ return boost::none;
+ }
+ bool find_admin(TokenEnvelope& token);
+ bool find_barbican(TokenEnvelope& token);
+ void add(const std::string& token_id, const TokenEnvelope& token);
+ void add_service(const std::string& token_id, const TokenEnvelope& token);
+ void add_admin(const TokenEnvelope& token);
+ void add_barbican(const TokenEnvelope& token);
+ void invalidate(const DoutPrefixProvider *dpp, const std::string& token_id);
+ bool going_down() const;
+private:
+ // Shared implementations; callers pick the user- or service-token maps.
+ void add_locked(const std::string& token_id, const TokenEnvelope& token,
+ std::map<std::string, token_entry>& tokens, std::list<std::string>& tokens_lru);
+ bool find_locked(const std::string& token_id, TokenEnvelope& token,
+ std::map<std::string, token_entry>& tokens, std::list<std::string>& tokens_lru);
+};
+
+
+// Strategy objects that serialize a token-request JSON body via dump();
+// one subclass per (admin/barbican) x (Keystone v2/v3) combination.
+class AdminTokenRequest {
+public:
+ virtual ~AdminTokenRequest() = default;
+ virtual void dump(Formatter* f) const = 0;
+};
+
+// Admin token request in v2 format, fed from a keystone::Config.
+class AdminTokenRequestVer2 : public AdminTokenRequest {
+ const Config& conf;
+
+public:
+ explicit AdminTokenRequestVer2(const Config& conf)
+ : conf(conf) {
+ }
+ void dump(Formatter *f) const override;
+};
+
+// Admin token request in v3 format, fed from a keystone::Config.
+class AdminTokenRequestVer3 : public AdminTokenRequest {
+ const Config& conf;
+
+public:
+ explicit AdminTokenRequestVer3(const Config& conf)
+ : conf(conf) {
+ }
+ void dump(Formatter *f) const override;
+};
+
+// Barbican token requests read their credentials straight from the
+// CephContext config rather than a keystone::Config.
+class BarbicanTokenRequestVer2 : public AdminTokenRequest {
+ CephContext *cct;
+
+public:
+ explicit BarbicanTokenRequestVer2(CephContext * const _cct)
+ : cct(_cct) {
+ }
+ void dump(Formatter *f) const override;
+};
+
+class BarbicanTokenRequestVer3 : public AdminTokenRequest {
+ CephContext *cct;
+
+public:
+ explicit BarbicanTokenRequestVer3(CephContext * const _cct)
+ : cct(_cct) {
+ }
+ void dump(Formatter *f) const override;
+};
+
+
+}; /* namespace keystone */
+}; /* namespace rgw */
diff --git a/src/rgw/rgw_kmip_client.cc b/src/rgw/rgw_kmip_client.cc
new file mode 100644
index 000000000..e801972ea
--- /dev/null
+++ b/src/rgw/rgw_kmip_client.cc
@@ -0,0 +1,82 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "common/Thread.h"
+#include "include/compat.h"
+#include "common/errno.h"
+#include "rgw_common.h"
+#include "rgw_kmip_client.h"
+
+#include <atomic>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+RGWKMIPManager *rgw_kmip_manager;
+
+// Block until the worker thread marks this request done, then return its
+// result code (0 on success, negative errno on failure).
+//
+// Fixes two defects in the original: (1) 'if (!done) cond.wait(l);' would
+// return after a spurious wakeup even though the request had not finished;
+// the predicate overload loops until 'done' is really set.  (2) the old
+// unlocked fast-path read of the non-atomic 'done' flag was a data race;
+// all reads now happen under 'lock'.
+int
+RGWKMIPTransceiver::wait(optional_yield y)
+{
+ std::unique_lock l{lock};
+ cond.wait(l, [this] { return done; });
+ if (ret) {
+ lderr(cct) << "kmip process failed, " << ret << dendl;
+ }
+ return ret;
+}
+
+// Enqueue this request with the global KMIP manager; returns 0 on success
+// or a negative error.  NOTE(review): assumes rgw_kmip_client_init() has
+// run -- rgw_kmip_manager is dereferenced unconditionally here.
+int
+RGWKMIPTransceiver::send()
+{
+ int r = rgw_kmip_manager->add_request(this);
+ if (r < 0) {
+ lderr(cct) << "kmip send failed, " << r << dendl;
+ }
+ return r;
+}
+
+// Submit the request and, when the enqueue succeeded, block until the
+// worker thread has completed it.  Returns the enqueue error or the
+// request's final result code.
+int
+RGWKMIPTransceiver::process(optional_yield y)
+{
+ const int r = send();
+ return (r < 0) ? r : wait(y);
+}
+
+// Release all worker-produced output buffers.  Key material is zeroed
+// before being freed so secrets do not linger in freed heap memory.
+RGWKMIPTransceiver::~RGWKMIPTransceiver()
+{
+ int i;
+ if (out)
+ free(out);
+ out = nullptr;
+ if (outlist->strings) {
+ // Each entry was individually malloc'd by the worker.
+ for (i = 0; i < outlist->string_count; ++i) {
+ free(outlist->strings[i]);
+ }
+ free(outlist->strings);
+ outlist->strings = 0;
+ }
+ if (outkey->data) {
+ // Scrub the symmetric key before freeing.
+ ::ceph::crypto::zeroize_for_security(outkey->data, outkey->keylen);
+ free(outkey->data);
+ outkey->data = 0;
+ }
+}
+
+// Install the process-wide KMIP manager and start its worker thread.
+// The caller retains allocation responsibility until
+// rgw_kmip_client_cleanup() deletes it.
+void
+rgw_kmip_client_init(RGWKMIPManager &m)
+{
+ rgw_kmip_manager = &m;
+ rgw_kmip_manager->start();
+}
+
+// Tear down the process-wide KMIP manager.  Unlike the original, this is
+// safe to call when rgw_kmip_client_init() was never invoked (no null
+// dereference), and it clears the global pointer so no dangling pointer
+// survives the delete.
+void
+rgw_kmip_client_cleanup()
+{
+ if (!rgw_kmip_manager)
+ return;
+ rgw_kmip_manager->stop();
+ delete rgw_kmip_manager;
+ rgw_kmip_manager = nullptr;
+}
diff --git a/src/rgw/rgw_kmip_client.h b/src/rgw/rgw_kmip_client.h
new file mode 100644
index 000000000..299292113
--- /dev/null
+++ b/src/rgw/rgw_kmip_client.h
@@ -0,0 +1,65 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+class RGWKMIPManager;
+
+// One KMIP operation in flight.  The caller fills in the inputs, calls
+// process(), and reads the out* members afterwards; the worker thread
+// fills the outputs, sets 'done' and signals 'cond'.  The destructor
+// frees all output buffers.
+class RGWKMIPTransceiver {
+public:
+ enum kmip_operation {
+ CREATE,
+ LOCATE,
+ GET,
+ GET_ATTRIBUTES,
+ GET_ATTRIBUTE_LIST,
+ DESTROY
+ };
+ CephContext *cct;
+ kmip_operation operation;
+ char *name = 0; // optional KMIP Name attribute (input)
+ char *unique_id = 0; // optional unique identifier (input)
+ // output - must free
+ char *out = 0; // unique_id, several
+ struct { // unique_ids, locate
+ char **strings;
+ int string_count;
+ } outlist[1] = {{0, 0}};
+ struct { // key, get
+ unsigned char *data;
+ int keylen;
+ } outkey[1] = {0, 0};
+ // end must free
+ int ret; // result code; valid once 'done' is set
+ bool done; // guarded by 'lock', signalled via 'cond'
+ ceph::mutex lock = ceph::make_mutex("rgw_kmip_req::lock");
+ ceph::condition_variable cond;
+
+ int wait(optional_yield y);
+ // ret starts at -EDOM as a "not yet processed" sentinel.
+ RGWKMIPTransceiver(CephContext * const cct,
+ kmip_operation operation)
+ : cct(cct),
+ operation(operation),
+ ret(-EDOM),
+ done(false)
+ {}
+ ~RGWKMIPTransceiver();
+
+ int send();
+ int process(optional_yield y);
+};
+
+// Abstract manager that owns the KMIP worker machinery; concrete
+// implementation lives in rgw_kmip_client_impl.h.
+class RGWKMIPManager {
+protected:
+ CephContext *cct;
+ bool is_started = false;
+ RGWKMIPManager(CephContext *cct) : cct(cct) {};
+public:
+ virtual ~RGWKMIPManager() { };
+ virtual int start() = 0;
+ virtual void stop() = 0;
+ // Queue a request for asynchronous processing by the worker.
+ virtual int add_request(RGWKMIPTransceiver*) = 0;
+};
+
+void rgw_kmip_client_init(RGWKMIPManager &);
+void rgw_kmip_client_cleanup();
diff --git a/src/rgw/rgw_kmip_client_impl.cc b/src/rgw/rgw_kmip_client_impl.cc
new file mode 100644
index 000000000..0824273e6
--- /dev/null
+++ b/src/rgw/rgw_kmip_client_impl.cc
@@ -0,0 +1,728 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <boost/intrusive/list.hpp>
+#include <atomic>
+#include <mutex>
+#include <string.h>
+
+#include "include/compat.h"
+#include "common/errno.h"
+#include "rgw_common.h"
+#include "rgw_kmip_client.h"
+#include "rgw_kmip_client_impl.h"
+
+#include <openssl/err.h>
+#include <openssl/ssl.h>
+extern "C" {
+#include "kmip.h"
+#include "kmip_bio.h"
+#include "kmip_memset.h"
+};
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+static enum kmip_version protocol_version = KMIP_1_0;
+
+// A pooled KMIP connection: TLS transport (SSL_CTX/SSL/BIO), libkmip
+// context, encode buffer, and the credential scratch structures that the
+// kmip context points into.  Freed via kmip_free_handle_stuff().
+struct RGWKmipHandle {
+ int uses;
+ mono_time lastuse; // for idle-expiry by the cleaner thread
+ SSL_CTX *ctx;
+ SSL *ssl;
+ BIO *bio;
+ KMIP kmip_ctx[1];
+ TextString textstrings[2]; // backing storage for username/password
+ UsernamePasswordCredential upc[1];
+ Credential credential[1];
+ int need_to_free_kmip; // kmip_ctx was kmip_init()ed
+ size_t buffer_blocks, buffer_block_size, buffer_total_size;
+ uint8 *encoding; // request encode buffer (kmip allocator)
+
+ explicit RGWKmipHandle() :
+ uses(0), ctx(0), ssl(0), bio(0),
+ need_to_free_kmip(0),
+ encoding(0) {
+ memset(kmip_ctx, 0, sizeof kmip_ctx);
+ memset(textstrings, 0, sizeof textstrings);
+ memset(upc, 0, sizeof upc);
+ memset(credential, 0, sizeof credential);
+ };
+};
+
+// Worker thread that drains RGWKMIPManagerImpl's request queue; the body
+// is in entry() below.
+struct RGWKmipWorker: public Thread {
+ RGWKMIPManagerImpl &m;
+ RGWKmipWorker(RGWKMIPManagerImpl& m) : m(m) {}
+ void *entry() override;
+ // Wake the worker (new request queued or shutdown requested).
+ void signal() {
+ std::lock_guard l{m.lock};
+ m.cond.notify_all();
+ }
+};
+
+// Release everything a RGWKmipHandle owns (encode buffer, kmip context,
+// TLS objects) -- but not the handle struct itself; callers delete it.
+// BIO_free_all also frees the SSL object attached to the bio chain.
+static void
+kmip_free_handle_stuff(RGWKmipHandle *kmip)
+{
+ if (kmip->encoding) {
+ kmip_free_buffer(kmip->kmip_ctx,
+ kmip->encoding,
+ kmip->buffer_total_size);
+ kmip_set_buffer(kmip->kmip_ctx, NULL, 0);
+ }
+ if (kmip->need_to_free_kmip)
+ kmip_destroy(kmip->kmip_ctx);
+ if (kmip->bio)
+ BIO_free_all(kmip->bio);
+ if (kmip->ctx)
+ SSL_CTX_free(kmip->ctx);
+}
+
+// Fluent builder collecting connection parameters for build().  Empty
+// config strings are treated as "unset" (left as null pointers).  The
+// stored pointers alias the caller's strings, so the builder must not
+// outlive the std::string arguments passed to the setters.
+class RGWKmipHandleBuilder {
+private:
+ CephContext *cct;
+ const char *clientcert = 0;
+ const char *clientkey = 0;
+ const char *capath = 0;
+ const char *host = 0;
+ const char *portstring = 0;
+ const char *username = 0;
+ const char *password = 0;
+public:
+ RGWKmipHandleBuilder(CephContext *cct) : cct(cct) {};
+ RGWKmipHandleBuilder& set_clientcert(const std::string &v) {
+ const char *s = v.c_str();
+ if (*s) {
+ clientcert = s;
+ }
+ return *this;
+ }
+ RGWKmipHandleBuilder& set_clientkey(const std::string &v) {
+ const char *s = v.c_str();
+ if (*s) {
+ clientkey = s;
+ }
+ return *this;
+ }
+ RGWKmipHandleBuilder& set_capath(const std::string &v) {
+ const char *s = v.c_str();
+ if (*s) {
+ capath = s;
+ }
+ return *this;
+ }
+ RGWKmipHandleBuilder& set_host(const char *v) {
+ host = v;
+ return *this;
+ }
+ RGWKmipHandleBuilder& set_portstring(const char *v) {
+ portstring = v;
+ return *this;
+ }
+ RGWKmipHandleBuilder& set_username(const std::string &v) {
+ const char *s = v.c_str();
+ if (*s) {
+ username = s;
+ }
+ return *this;
+ }
+ RGWKmipHandleBuilder& set_password(const std::string& v) {
+ const char *s = v.c_str();
+ if (*s) {
+ password = s;
+ }
+ return *this;
+ }
+ RGWKmipHandle *build() const;
+};
+
+// ERR_print_errors_cb callback: forward one OpenSSL error line to the
+// Ceph error log; 'u' is the CephContext smuggled through the void*.
+static int
+kmip_write_an_error_helper(const char *s, size_t l, void *u) {
+ CephContext *cct = (CephContext *)u;
+ std::string_view es(s, l);
+ lderr(cct) << es << dendl;
+ return l;
+}
+
+// Drain the OpenSSL error queue into the Ceph log (analogue of
+// ERR_print_errors_fp, but targeting lderr).
+void
+ERR_print_errors_ceph(CephContext *cct)
+{
+ ERR_print_errors_cb(kmip_write_an_error_helper, cct);
+}
+
+// Construct a fully connected RGWKmipHandle: TLS context + connected BIO,
+// initialized libkmip context with an encode buffer, and (optionally) a
+// username/password credential.  Returns nullptr on any failure; the
+// goto-Done pattern funnels every error path through a single cleanup.
+// NOTE(review): SSL_CTX_new() is not checked for NULL before use --
+// confirm whether a failed allocation can reach the calls below.
+RGWKmipHandle *
+RGWKmipHandleBuilder::build() const
+{
+ int failed = 1;
+ RGWKmipHandle *r = new RGWKmipHandle();
+ TextString *up = 0;
+ size_t ns;
+
+ r->ctx = SSL_CTX_new(TLS_client_method());
+
+ // Each option is applied only when configured; the "if (!x) ; else if"
+ // shape keeps the error handling on the else branch.
+ if (!clientcert)
+ ;
+ else if (SSL_CTX_use_certificate_file(r->ctx, clientcert, SSL_FILETYPE_PEM) != 1) {
+ lderr(cct) << "ERROR: can't load client cert from "
+ << clientcert << dendl;
+ ERR_print_errors_ceph(cct);
+ goto Done;
+ }
+
+ if (!clientkey)
+ ;
+ else if (SSL_CTX_use_PrivateKey_file(r->ctx, clientkey,
+ SSL_FILETYPE_PEM) != 1) {
+ lderr(cct) << "ERROR: can't load client key from "
+ << clientkey << dendl;
+ ERR_print_errors_ceph(cct);
+ goto Done;
+ }
+
+ if (!capath)
+ ;
+ else if (SSL_CTX_load_verify_locations(r->ctx, capath, NULL) != 1) {
+ lderr(cct) << "ERROR: can't load cacert from "
+ << capath << dendl;
+ ERR_print_errors_ceph(cct);
+ goto Done;
+ }
+ r->bio = BIO_new_ssl_connect(r->ctx);
+ if (!r->bio) {
+ lderr(cct) << "BIO_new_ssl_connect failed" << dendl;
+ goto Done;
+ }
+ BIO_get_ssl(r->bio, &r->ssl);
+ SSL_set_mode(r->ssl, SSL_MODE_AUTO_RETRY);
+
+ BIO_set_conn_hostname(r->bio, host);
+ BIO_set_conn_port(r->bio, portstring);
+ if (BIO_do_connect(r->bio) != 1) {
+ lderr(cct) << "BIO_do_connect failed to " << host
+ << ":" << portstring << dendl;
+ ERR_print_errors_ceph(cct);
+ goto Done;
+ }
+
+ // setup kmip
+
+ kmip_init(r->kmip_ctx, NULL, 0, protocol_version);
+ r->need_to_free_kmip = 1;
+ r->buffer_blocks = 1;
+ r->buffer_block_size = 1024;
+ // Allocate through the kmip context's allocator so the buffer can be
+ // freed/reallocated by libkmip later.
+ r->encoding = static_cast<uint8*>(r->kmip_ctx->calloc_func(
+ r->kmip_ctx->state, r->buffer_blocks, r->buffer_block_size));
+ if (!r->encoding) {
+ lderr(cct) << "kmip buffer alloc failed: "
+ << r->buffer_blocks <<
+ " * " << r->buffer_block_size << dendl;
+ goto Done;
+ }
+ ns = r->buffer_blocks * r->buffer_block_size;
+ kmip_set_buffer(r->kmip_ctx, r->encoding, ns);
+ r->buffer_total_size = ns;
+
+ // Register username/password credentials with the kmip context; 'up'
+ // walks the handle's textstrings backing array.
+ up = r->textstrings;
+ if (username) {
+ memset(r->upc, 0, sizeof *r->upc);
+ up->value = (char *) username;
+ up->size = strlen(username);
+ r->upc->username = up++;
+ if (password) {
+ up->value = (char *) password;
+ up->size = strlen(password);
+ r->upc->password = up++;
+ }
+ r->credential->credential_type = KMIP_CRED_USERNAME_AND_PASSWORD;
+ r->credential->credential_value = r->upc;
+ int i = kmip_add_credential(r->kmip_ctx, r->credential);
+ if (i != KMIP_OK) {
+ fprintf(stderr,"failed to add credential to kmip\n");
+ goto Done;
+ }
+ }
+
+ failed = 0;
+Done:
+ // On failure, unwind everything partially built and return nullptr.
+ if (!failed)
+ ;
+ else if (!r)
+ ;
+ else {
+ kmip_free_handle_stuff(r);
+ delete r;
+ r = 0;
+ }
+ return r;
+}
+
+// Pool of idle KMIP connections plus a cleaner thread (this object IS the
+// thread) that expires handles idle for more than MAXIDLE seconds.
+// 'saved_kmip' is ordered most-recently-released first and is guarded by
+// cleaner_lock.
+struct RGWKmipHandles : public Thread {
+ CephContext *cct;
+ ceph::mutex cleaner_lock = ceph::make_mutex("RGWKmipHandles::cleaner_lock");
+ std::vector<RGWKmipHandle*> saved_kmip;
+ int cleaner_shutdown;
+ bool cleaner_active = false;
+ ceph::condition_variable cleaner_cond;
+ RGWKmipHandles(CephContext *cct) :
+ cct(cct), cleaner_shutdown{0} {
+ }
+ RGWKmipHandle* get_kmip_handle();
+ void release_kmip_handle_now(RGWKmipHandle* kmip);
+ void release_kmip_handle(RGWKmipHandle* kmip);
+ void flush_kmip_handles();
+ int do_one_entry(RGWKMIPTransceiver &element);
+ void* entry();
+ void start();
+ void stop();
+};
+
+// Take a pooled handle if one is available, otherwise build a new
+// connection from the rgw_crypt_kmip_* config.  The configured address is
+// "host[:port]"; port defaults to 5696 (standard KMIP port).  Returns
+// nullptr when connecting fails.
+RGWKmipHandle*
+RGWKmipHandles::get_kmip_handle()
+{
+ RGWKmipHandle* kmip = 0;
+ const char *hostaddr = cct->_conf->rgw_crypt_kmip_addr.c_str();
+ {
+ std::lock_guard lock{cleaner_lock};
+ if (!saved_kmip.empty()) {
+ kmip = *saved_kmip.begin();
+ saved_kmip.erase(saved_kmip.begin());
+ }
+ }
+ if (!kmip && hostaddr) {
+ // Split "host:port" in a mutable copy; the builder copies nothing,
+ // so hosttemp must stay alive until build() returns.
+ char *hosttemp = strdup(hostaddr);
+ char *port = strchr(hosttemp, ':');
+ if (port)
+ *port++ = 0;
+ kmip = RGWKmipHandleBuilder{cct}
+ .set_clientcert(cct->_conf->rgw_crypt_kmip_client_cert)
+ .set_clientkey(cct->_conf->rgw_crypt_kmip_client_key)
+ .set_capath(cct->_conf->rgw_crypt_kmip_ca_path)
+ .set_host(hosttemp)
+ .set_portstring(port ? port : "5696")
+ .set_username(cct->_conf->rgw_crypt_kmip_username)
+ .set_password(cct->_conf->rgw_crypt_kmip_password)
+ .build();
+ free(hosttemp);
+ }
+ return kmip;
+}
+
+// Destroy a handle immediately (close TLS, free kmip context, delete).
+void
+RGWKmipHandles::release_kmip_handle_now(RGWKmipHandle* kmip)
+{
+ kmip_free_handle_stuff(kmip);
+ delete kmip;
+}
+
+#define MAXIDLE 5
+// Return a handle to the pool for reuse (front of the MRU vector), or
+// destroy it outright when we are already shutting down.
+void
+RGWKmipHandles::release_kmip_handle(RGWKmipHandle* kmip)
+{
+ if (cleaner_shutdown) {
+ release_kmip_handle_now(kmip);
+ } else {
+ std::lock_guard lock{cleaner_lock};
+ kmip->lastuse = mono_clock::now();
+ saved_kmip.insert(saved_kmip.begin(), 1, kmip);
+ }
+}
+
+// Cleaner thread body: every MAXIDLE seconds (or when signalled), expire
+// pooled handles that have been idle longer than MAXIDLE, oldest (tail of
+// the MRU vector) first.  On shutdown, drain the pool completely before
+// exiting.
+void*
+RGWKmipHandles::entry()
+{
+ RGWKmipHandle* kmip;
+ std::unique_lock lock{cleaner_lock};
+
+ for (;;) {
+ if (cleaner_shutdown) {
+ if (saved_kmip.empty())
+ break;
+ } else {
+ cleaner_cond.wait_for(lock, std::chrono::seconds(MAXIDLE));
+ }
+ mono_time now = mono_clock::now();
+ while (!saved_kmip.empty()) {
+ auto cend = saved_kmip.end();
+ --cend;
+ kmip = *cend;
+ // Tail entry is the least recently released; stop as soon as one
+ // is still fresh (unless shutting down, which expires everything).
+ if (!cleaner_shutdown && now - kmip->lastuse
+ < std::chrono::seconds(MAXIDLE))
+ break;
+ saved_kmip.erase(cend);
+ release_kmip_handle_now(kmip);
+ }
+ }
+ return nullptr;
+}
+
+// Start the cleaner thread once; idempotent under cleaner_lock.
+void
+RGWKmipHandles::start()
+{
+ std::lock_guard lock{cleaner_lock};
+ if (!cleaner_active) {
+ cleaner_active = true;
+ this->create("KMIPcleaner"); // len<16!!!
+ }
+}
+
+// Signal the cleaner to shut down and join it.  The lock is dropped
+// before join() so the thread can reacquire cleaner_lock while draining.
+// NOTE(review): cleaner_active is reset after the unlock, outside the
+// mutex -- benign if stop()/start() are never raced, but verify callers.
+void
+RGWKmipHandles::stop()
+{
+ std::unique_lock lock{cleaner_lock};
+ cleaner_shutdown = 1;
+ cleaner_cond.notify_all();
+ if (cleaner_active) {
+ lock.unlock();
+ this->join();
+ cleaner_active = false;
+ }
+}
+
+// Final teardown: stop the cleaner thread (which drains the pool on its
+// way out) and verify nothing is left.  The original code called join()
+// again after stop() -- but stop() already joins the thread when it is
+// active, so that second join() was a double pthread_join (undefined
+// behavior); it is removed here.
+void
+RGWKmipHandles::flush_kmip_handles()
+{
+ stop();
+ if (!saved_kmip.empty()) {
+ ldout(cct, 0) << "ERROR: " << __func__ << " failed final cleanup" << dendl;
+ }
+ saved_kmip.shrink_to_fit();
+}
+
+// Spawn the request-processing worker thread; refuses to start twice.
+int
+RGWKMIPManagerImpl::start()
+{
+ if (worker) {
+ lderr(cct) << "kmip worker already started" << dendl;
+ return -1;
+ }
+ worker = new RGWKmipWorker(*this);
+ worker->create("kmip worker");
+ return 0;
+}
+
+// Stop and reap the worker thread.  NOTE(review): going_down is written
+// here without holding 'lock' while the worker reads it under the lock --
+// confirm this is acceptable or move the store under the mutex.
+void
+RGWKMIPManagerImpl::stop()
+{
+ going_down = true;
+ if (worker) {
+ worker->signal();
+ worker->join();
+ delete worker;
+ worker = 0;
+ }
+}
+
+// Queue a request for the worker thread and wake it.  The Request wrapper
+// is heap-allocated and linked into the intrusive list; ownership passes
+// to the worker (NOTE(review): verify the consumer deletes each Request
+// after servicing it, otherwise this allocation leaks).
+int
+RGWKMIPManagerImpl::add_request(RGWKMIPTransceiver *req)
+{
+ std::unique_lock l{lock};
+ if (going_down)
+ return -ECANCELED;
+ requests.push_back(*new Request{*req});
+ // Drop the lock before signalling so the worker can grab it promptly.
+ l.unlock();
+ if (worker)
+ worker->signal();
+ return 0;
+}
+
+// Execute one KMIP operation end to end: grab a connection, build the
+// request message (attributes, header, batch item), encode it -- growing
+// the encode buffer as needed -- send it over the TLS BIO, decode the
+// response and copy the results into 'element'.  On return, element.done
+// is set, element.cond is signalled and the connection is returned to the
+// pool.  element.lock is held for the whole call, so the notify happens
+// under the same mutex the waiter uses.
+int
+RGWKmipHandles::do_one_entry(RGWKMIPTransceiver &element)
+{
+ auto h = get_kmip_handle();
+ std::unique_lock l{element.lock};
+ Attribute a[8], *ap;
+ TextString nvalue[1], uvalue[1];
+ Name nattr[1];
+ enum cryptographic_algorithm alg = KMIP_CRYPTOALG_AES;
+ int32 length = 256;
+ int32 mask = KMIP_CRYPTOMASK_ENCRYPT | KMIP_CRYPTOMASK_DECRYPT;
+ size_t ns;
+ ProtocolVersion pv[1];
+ RequestHeader rh[1];
+ RequestMessage rm[1];
+ Authentication auth[1];
+ ResponseMessage resp_m[1];
+ int i;
+ union {
+ CreateRequestPayload create_req[1];
+ LocateRequestPayload locate_req[1];
+ GetRequestPayload get_req[1];
+ GetAttributeListRequestPayload lsattrs_req[1];
+ GetAttributesRequestPayload getattrs_req[1];
+ } u[1];
+ RequestBatchItem rbi[1];
+ TemplateAttribute ta[1];
+ const char *what = "?";
+ int need_to_free_response = 0;
+ char *response = NULL;
+ int response_size = 0;
+ enum result_status rs;
+ ResponseBatchItem *req;
+
+ if (!h) {
+ element.ret = -ERR_SERVICE_UNAVAILABLE;
+ return element.ret;
+ }
+ // Phase 1: collect request attributes; 'ap' walks the fixed array 'a'.
+ memset(a, 0, sizeof *a);
+ for (i = 0; i < (int)(sizeof a/sizeof *a); ++i)
+ kmip_init_attribute(a+i);
+ ap = a;
+ switch(element.operation) {
+ case RGWKMIPTransceiver::CREATE:
+ // CREATE always asks for an AES-256 encrypt/decrypt key.
+ ap->type = KMIP_ATTR_CRYPTOGRAPHIC_ALGORITHM;
+ ap->value = &alg;
+ ++ap;
+ ap->type = KMIP_ATTR_CRYPTOGRAPHIC_LENGTH;
+ ap->value = &length;
+ ++ap;
+ ap->type = KMIP_ATTR_CRYPTOGRAPHIC_USAGE_MASK;
+ ap->value = &mask;
+ ++ap;
+ break;
+ default:
+ break;
+ }
+ if (element.name) {
+ memset(nvalue, 0, sizeof *nvalue);
+ nvalue->value = element.name;
+ nvalue->size = strlen(element.name);
+ memset(nattr, 0, sizeof *nattr);
+ nattr->value = nvalue;
+ nattr->type = KMIP_NAME_UNINTERPRETED_TEXT_STRING;
+ ap->type = KMIP_ATTR_NAME;
+ ap->value = nattr;
+ ++ap;
+ }
+ if (element.unique_id) {
+ memset(uvalue, 0, sizeof *uvalue);
+ uvalue->value = element.unique_id;
+ uvalue->size = strlen(element.unique_id);
+ }
+ // Phase 2: assemble header, batch item and per-operation payload.
+ memset(pv, 0, sizeof *pv);
+ memset(rh, 0, sizeof *rh);
+ memset(rm, 0, sizeof *rm);
+ memset(auth, 0, sizeof *auth);
+ memset(resp_m, 0, sizeof *resp_m);
+ kmip_init_protocol_version(pv, h->kmip_ctx->version);
+ kmip_init_request_header(rh);
+ rh->protocol_version = pv;
+ rh->maximum_response_size = h->kmip_ctx->max_message_size;
+ rh->time_stamp = time(NULL);
+ rh->batch_count = 1;
+ memset(rbi, 0, sizeof *rbi);
+ kmip_init_request_batch_item(rbi);
+ memset(u, 0, sizeof *u);
+ rbi->request_payload = u;
+ switch(element.operation) {
+ case RGWKMIPTransceiver::CREATE:
+ memset(ta, 0, sizeof *ta);
+ ta->attributes = a;
+ ta->attribute_count = ap-a;
+ u->create_req->object_type = KMIP_OBJTYPE_SYMMETRIC_KEY;
+ u->create_req->template_attribute = ta;
+ rbi->operation = KMIP_OP_CREATE;
+ what = "create";
+ break;
+ case RGWKMIPTransceiver::GET:
+ if (element.unique_id)
+ u->get_req->unique_identifier = uvalue;
+ rbi->operation = KMIP_OP_GET;
+ what = "get";
+ break;
+ case RGWKMIPTransceiver::LOCATE:
+ if (ap > a) {
+ u->locate_req->attributes = a;
+ u->locate_req->attribute_count = ap - a;
+ }
+ rbi->operation = KMIP_OP_LOCATE;
+ what = "locate";
+ break;
+ case RGWKMIPTransceiver::GET_ATTRIBUTES:
+ case RGWKMIPTransceiver::GET_ATTRIBUTE_LIST:
+ case RGWKMIPTransceiver::DESTROY:
+ default:
+ // Request construction for these operations is not implemented yet.
+ lderr(cct) << "Missing operation logic op=" << element.operation << dendl;
+ element.ret = -EINVAL;
+ goto Done;
+ }
+ rm->request_header = rh;
+ rm->batch_items = rbi;
+ rm->batch_count = 1;
+ // Attach the credential registered on the handle, if any.
+ if (h->kmip_ctx->credential_list) {
+ LinkedListItem *item = h->kmip_ctx->credential_list->head;
+ if (item) {
+ auth->credential = (Credential *)item->data;
+ rh->authentication = auth;
+ }
+ }
+ // Phase 3: encode, growing the buffer one block at a time until the
+ // message fits.
+ for (;;) {
+ i = kmip_encode_request_message(h->kmip_ctx, rm);
+ if (i != KMIP_ERROR_BUFFER_FULL) break;
+ h->kmip_ctx->free_func(h->kmip_ctx->state, h->encoding);
+ h->encoding = 0;
+ ++h->buffer_blocks;
+ h->encoding = static_cast<uint8*>(h->kmip_ctx->calloc_func(h->kmip_ctx->state, h->buffer_blocks, h->buffer_block_size));
+ if (!h->encoding) {
+ lderr(cct) << "kmip buffer alloc failed: "
+ << h->buffer_blocks
+ << " * " << h->buffer_block_size << dendl;
+ element.ret = -ENOMEM;
+ goto Done;
+ }
+ ns = h->buffer_blocks * h->buffer_block_size;
+ kmip_set_buffer(h->kmip_ctx, h->encoding, ns);
+ h->buffer_total_size = ns;
+ }
+ if (i != KMIP_OK) {
+ lderr(cct) << " Failed to encode " << what
+ << " request; err=" << i
+ << " ctx error message " << h->kmip_ctx->error_message
+ << dendl;
+ element.ret = -EINVAL;
+ goto Done;
+ }
+ // Phase 4: send over the TLS BIO and read the raw response.
+ i = kmip_bio_send_request_encoding(h->kmip_ctx, h->bio,
+ (char*)h->encoding,
+ h->kmip_ctx->index - h->kmip_ctx->buffer,
+ &response, &response_size);
+ if (i < 0) {
+ lderr(cct) << "Problem sending request to " << what << " " << i << " context error message " << h->kmip_ctx->error_message << dendl;
+ element.ret = -EINVAL;
+ goto Done;
+ }
+ // Phase 5: decode and validate the response (single batch item whose
+ // operation must echo the request).
+ kmip_free_buffer(h->kmip_ctx, h->encoding,
+ h->buffer_total_size);
+ h->encoding = 0;
+ kmip_set_buffer(h->kmip_ctx, response, response_size);
+ need_to_free_response = 1;
+ i = kmip_decode_response_message(h->kmip_ctx, resp_m);
+ if (i != KMIP_OK) {
+ lderr(cct) << "Failed to decode " << what << " " << i << " context error message " << h->kmip_ctx->error_message << dendl;
+ element.ret = -EINVAL;
+ goto Done;
+ }
+ if (resp_m->batch_count != 1) {
+ lderr(cct) << "Failed; weird response count doing " << what << " " << resp_m->batch_count << dendl;
+ element.ret = -EINVAL;
+ goto Done;
+ }
+ req = resp_m->batch_items;
+ rs = req->result_status;
+ if (rs != KMIP_STATUS_SUCCESS) {
+ lderr(cct) << "Failed; result status not success " << rs << dendl;
+ element.ret = -EINVAL;
+ goto Done;
+ }
+ if (req->operation != rbi->operation) {
+ lderr(cct) << "Failed; response operation mismatch, got " << req->operation << " expected " << rbi->operation << dendl;
+ element.ret = -EINVAL;
+ goto Done;
+ }
+ // Phase 6: copy results out into malloc'd buffers owned by 'element'
+ // (freed by ~RGWKMIPTransceiver).
+ switch(req->operation)
+ {
+ case KMIP_OP_CREATE: {
+ CreateResponsePayload *pld = (CreateResponsePayload *)req->response_payload;
+ element.out = static_cast<char *>(malloc(pld->unique_identifier->size+1));
+ memcpy(element.out, pld->unique_identifier->value, pld->unique_identifier->size);
+ element.out[pld->unique_identifier->size] = 0;
+ } break;
+ case KMIP_OP_LOCATE: {
+ LocateResponsePayload *pld = (LocateResponsePayload *)req->response_payload;
+ // Build a null-terminated array of C strings for the caller.
+ char **list = static_cast<char **>(malloc(sizeof (char*) * (1 + pld->unique_identifiers_count)));
+ for (i = 0; i < pld->unique_identifiers_count; ++i) {
+ list[i] = static_cast<char *>(malloc(pld->unique_identifiers[i].size+1));
+ memcpy(list[i], pld->unique_identifiers[i].value, pld->unique_identifiers[i].size);
+ list[i][pld->unique_identifiers[i].size] = 0;
+ }
+ list[i] = 0;
+ element.outlist->strings = list;
+ element.outlist->string_count = pld->unique_identifiers_count;
+ } break;
+ case KMIP_OP_GET: {
+ GetResponsePayload *pld = (GetResponsePayload *)req->response_payload;
+ element.out = static_cast<char *>(malloc(pld->unique_identifier->size+1));
+ memcpy(element.out, pld->unique_identifier->value, pld->unique_identifier->size);
+ element.out[pld->unique_identifier->size] = 0;
+ // Only raw symmetric keys are supported; copy the key bytes out.
+ if (pld->object_type != KMIP_OBJTYPE_SYMMETRIC_KEY) {
+ lderr(cct) << "get: expected symmetric key got " << pld->object_type << dendl;
+ element.ret = -EINVAL;
+ goto Done;
+ }
+ KeyBlock *kp = static_cast<SymmetricKey *>(pld->object)->key_block;
+ ByteString *bp;
+ if (kp->key_format_type != KMIP_KEYFORMAT_RAW) {
+ lderr(cct) << "get: expected raw key fromat got " << kp->key_format_type << dendl;
+ element.ret = -EINVAL;
+ goto Done;
+ }
+ KeyValue *kv = static_cast<KeyValue *>(kp->key_value);
+ bp = static_cast<ByteString*>(kv->key_material);
+ element.outkey->data = static_cast<unsigned char *>(malloc(bp->size));
+ element.outkey->keylen = bp->size;
+ memcpy(element.outkey->data, bp->value, bp->size);
+ } break;
+ case KMIP_OP_GET_ATTRIBUTES: {
+ GetAttributesResponsePayload *pld = (GetAttributesResponsePayload *)req->response_payload;
+ element.out = static_cast<char *>(malloc(pld->unique_identifier->size+1));
+ memcpy(element.out, pld->unique_identifier->value, pld->unique_identifier->size);
+ element.out[pld->unique_identifier->size] = 0;
+ } break;
+ case KMIP_OP_GET_ATTRIBUTE_LIST: {
+ GetAttributeListResponsePayload *pld = (GetAttributeListResponsePayload *)req->response_payload;
+ element.out = static_cast<char *>(malloc(pld->unique_identifier->size+1));
+ memcpy(element.out, pld->unique_identifier->value, pld->unique_identifier->size);
+ element.out[pld->unique_identifier->size] = 0;
+ } break;
+ case KMIP_OP_DESTROY: {
+ DestroyResponsePayload *pld = (DestroyResponsePayload *)req->response_payload;
+ element.out = static_cast<char *>(malloc(pld->unique_identifier->size+1));
+ memcpy(element.out, pld->unique_identifier->value, pld->unique_identifier->size);
+ element.out[pld->unique_identifier->size] = 0;
+ } break;
+ default:
+ lderr(cct) << "Missing response logic op=" << element.operation << dendl;
+ element.ret = -EINVAL;
+ goto Done;
+ }
+ element.ret = 0;
+Done:
+ // Single exit: free the decoded response, wake the waiter (element.lock
+ // is still held), and return the connection to the pool.
+ if (need_to_free_response)
+ kmip_free_response_message(h->kmip_ctx, resp_m);
+ element.done = true;
+ element.cond.notify_all();
+ release_kmip_handle(h);
+ return element.ret;
+}
+
+// Worker thread body: drain the manager's queue, servicing one request at
+// a time with the manager lock dropped, then fail any stragglers on
+// shutdown.
+//
+// Fixes a memory leak in the original: 'auto element = *iter;' copied the
+// heap-allocated Request (created by add_request with 'new') and never
+// deleted it, leaking one Request per KMIP operation.  We now keep a
+// pointer to the node and delete it once it has been unlinked and served.
+// The shutdown path also sets ret/done under the request's own lock so
+// the hand-off to wait() is properly synchronized.
+void *
+RGWKmipWorker::entry()
+{
+ std::unique_lock entry_lock{m.lock};
+ ldout(m.cct, 10) << __func__ << " start" << dendl;
+ RGWKmipHandles handles{m.cct};
+ handles.start();
+ while (!m.going_down) {
+ if (m.requests.empty()) {
+ m.cond.wait_for(entry_lock, std::chrono::seconds(MAXIDLE));
+ continue;
+ }
+ auto iter = m.requests.begin();
+ auto *element = &*iter;
+ m.requests.erase(iter);
+ entry_lock.unlock();
+ (void) handles.do_one_entry(element->details);
+ delete element; // ownership came from add_request's 'new'
+ entry_lock.lock();
+ }
+ // Shutdown: fail everything still queued so waiters wake up.
+ while (!m.requests.empty()) {
+ auto iter = m.requests.begin();
+ auto *element = &*iter;
+ m.requests.erase(iter);
+ {
+ std::lock_guard g{element->details.lock};
+ element->details.ret = -666;
+ element->details.done = true;
+ }
+ element->details.cond.notify_all();
+ delete element;
+ }
+ handles.stop();
+ ldout(m.cct, 10) << __func__ << " finish" << dendl;
+ return nullptr;
+}
diff --git a/src/rgw/rgw_kmip_client_impl.h b/src/rgw/rgw_kmip_client_impl.h
new file mode 100644
index 000000000..d36903a4b
--- /dev/null
+++ b/src/rgw/rgw_kmip_client_impl.h
@@ -0,0 +1,27 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
struct RGWKmipWorker;
// Queues KMIP transceiver requests for a single background
// RGWKmipWorker thread; lock/cond protect the intrusive request
// list and the going_down flag.
class RGWKMIPManagerImpl: public RGWKMIPManager {
protected:
  ceph::mutex lock = ceph::make_mutex("RGWKMIPManager");
  ceph::condition_variable cond;

  // Links a caller-owned transceiver into the intrusive queue; the
  // worker fills in 'details' and signals its condition variable.
  struct Request : boost::intrusive::list_base_hook<> {
    boost::intrusive::list_member_hook<> req_hook;
    RGWKMIPTransceiver &details;
    Request(RGWKMIPTransceiver &details) : details(details) {}
  };
  boost::intrusive::list<Request, boost::intrusive::member_hook< Request,
    boost::intrusive::list_member_hook<>, &Request::req_hook>> requests;
  bool going_down = false;
  RGWKmipWorker *worker = 0;
public:
  RGWKMIPManagerImpl(CephContext *cct) : RGWKMIPManager(cct) {};
  int add_request(RGWKMIPTransceiver *);
  int start();
  void stop();
  friend RGWKmipWorker;
};
diff --git a/src/rgw/rgw_kms.cc b/src/rgw/rgw_kms.cc
new file mode 100644
index 000000000..936580276
--- /dev/null
+++ b/src/rgw/rgw_kms.cc
@@ -0,0 +1,1279 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/**
+ * Server-side encryption integrations with Key Management Systems (SSE-KMS)
+ */
+
+#include <sys/stat.h>
+#include "include/str_map.h"
+#include "common/safe_io.h"
+#include "rgw/rgw_crypt.h"
+#include "rgw/rgw_keystone.h"
+#include "rgw/rgw_b64.h"
+#include "rgw/rgw_kms.h"
+#include "rgw/rgw_kmip_client.h"
+#include <rapidjson/allocators.h>
+#include <rapidjson/document.h>
+#include <rapidjson/writer.h>
+#include "rapidjson/error/error.h"
+#include "rapidjson/error/en.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+using namespace rgw;
+
+#ifndef FORTEST_VIRTUAL
+#define FORTEST_VIRTUAL /**/
+#endif
+
+/**
+ * Memory pool for use with rapidjson. This version
+ * carefully zeros out all memory before returning it to
+ * the system.
+ */
+#define ALIGNTYPE double
+#define MINCHUNKSIZE 4096
class ZeroPoolAllocator {
private:
  // singly linked list of malloc'd chunks; allocations are carved
  // from the unused tail (data + left) of the head chunk
  struct element {
    struct element *next;
    int size;		// payload bytes in data[]
    char data[4];	// flexible payload (over-allocated by malloc)
  } *b;
  size_t left;		// bytes still unused in the head chunk
public:
  // rapidjson allocator contract: Free() is a no-op; pool memory
  // lives until this allocator is destroyed (and is zeroed then)
  static const bool kNeedFree { false };
  ZeroPoolAllocator(){
    b = 0;
    left = 0;
  }
  ~ZeroPoolAllocator(){
    // zero every chunk before releasing it, so parsed secrets do not
    // linger in freed heap memory
    // NOTE(review): a plain memset just before free() may be elided
    // by the optimizer; zeroize_for_security (used elsewhere in this
    // file) would be safer -- confirm
    element *p;
    while ((p = b)) {
      b = p->next;
      memset(p->data, 0, p->size);
      free(p);
    }
  }
  void * Malloc(size_t size) {
    void *r;
    if (!size) return 0;
    // round request up to ALIGNTYPE alignment
    size = (size + sizeof(ALIGNTYPE)-1)&(-sizeof(ALIGNTYPE));
    if (size > left) {
      // current chunk exhausted: push a new one (at least MINCHUNKSIZE)
      size_t ns { size };
      if (ns < MINCHUNKSIZE) ns = MINCHUNKSIZE;
      element *nw { (element *) malloc(sizeof *b + ns) };
      if (!nw) {
//	std::cerr << "out of memory" << std::endl;
	return 0;
      }
      left = ns - sizeof *b;
      nw->size = ns;
      nw->next = b;
      b = nw;
    }
    left -= size;
    r = static_cast<void*>(b->data + left);
    return r;
  }
  // NOTE(review): Realloc hands out plain malloc'd memory that is
  // never zeroed nor freed (kNeedFree is false and Free asserts);
  // presumably only reached for non-secret scratch data via the
  // document's stack allocator -- confirm against rapidjson usage
  void* Realloc(void* p, size_t old, size_t nw) {
    void *r = nullptr;
    if (nw) r = malloc(nw);
    if (nw > old) nw = old;
    if (r && old) memcpy(r, p, nw);
    return r;
  }
  static void Free(void *p) {
    ceph_assert(0 == "Free should not be called");
  }
private:
  //! Copy constructor is not permitted.
  ZeroPoolAllocator(const ZeroPoolAllocator& rhs) /* = delete */;
  //! Copy assignment operator is not permitted.
  ZeroPoolAllocator& operator=(const ZeroPoolAllocator& rhs) /* = delete */;
};
+
// rapidjson document/value aliases whose element storage comes from
// the zeroing pool above (the document's stack allocator stays the
// default CrtAllocator)
typedef rapidjson::GenericDocument<rapidjson::UTF8<>,
               ZeroPoolAllocator,
               rapidjson::CrtAllocator
      > ZeroPoolDocument;
typedef rapidjson::GenericValue<rapidjson::UTF8<>, ZeroPoolAllocator> ZeroPoolValue;
+
+/**
+ * Construct a full URL string by concatenating a "base" URL with another path,
+ * ensuring there is one and only one forward slash between them. If path is
+ * empty, the URL is not changed.
+ */
/**
 * Append 'path' to the "base" URL in 'url', guaranteeing exactly one
 * forward slash at the join point.  An empty path leaves url untouched.
 */
static void concat_url(std::string &url, std::string path) {
  if (path.empty()) {
    return;
  }
  const bool base_slash = !url.empty() && url.back() == '/';
  const bool path_slash = path.front() == '/';
  if (base_slash && path_slash) {
    url.erase(url.size() - 1);	// drop one of the two slashes
  } else if (!base_slash && !path_slash) {
    url += '/';			// supply the missing slash
  }
  url += path;
}
+
+/**
+ * Determine if a string (url) ends with a given suffix.
+ * Must deal with (ignore) trailing slashes.
+ */
/**
 * True when 'hay' ends with 'needle', ignoring any trailing '/'
 * characters on 'hay' beyond the needle's length.
 */
static bool string_ends_maybe_slash(std::string_view hay,
	std::string_view needle)
{
  size_t len = hay.size();
  const size_t want = needle.size();
  if (len < want) {
    return false;
  }
  // trim trailing slashes, but never shorter than the needle itself
  while (len > want && hay[len - 1] == '/') {
    --len;
  }
  return hay.substr(len - want, want) == needle;
}
+
/**
 * Add one name/value member to a rapidjson object 'd' using the given
 * allocator.  Overloads accept string or bool values, with the name
 * supplied as std::string or C string.
 */
template<typename E, typename A = ZeroPoolAllocator>
static inline void
add_name_val_to_obj(std::string &n, std::string &v, rapidjson::GenericValue<E,A> &d,
	A &allocator)
{
  rapidjson::GenericValue<E,A> name, val;
  name.SetString(n.c_str(), n.length(), allocator);
  val.SetString(v.c_str(), v.length(), allocator);
  d.AddMember(name, val, allocator);
}

// bool-valued variant
template<typename E, typename A = ZeroPoolAllocator>
static inline void
add_name_val_to_obj(std::string &n, bool v, rapidjson::GenericValue<E,A> &d,
	A &allocator)
{
  rapidjson::GenericValue<E,A> name, val;
  name.SetString(n.c_str(), n.length(), allocator);
  val.SetBool(v);
  d.AddMember(name, val, allocator);
}

// C-string-name convenience wrappers
template<typename E, typename A = ZeroPoolAllocator>
static inline void
add_name_val_to_obj(const char *n, std::string &v, rapidjson::GenericValue<E,A> &d,
	A &allocator)
{
  std::string ns{n, strlen(n) };
  add_name_val_to_obj(ns, v, d, allocator);
}

template<typename E, typename A = ZeroPoolAllocator>
static inline void
add_name_val_to_obj(const char *n, bool v, rapidjson::GenericValue<E,A> &d,
	A &allocator)
{
  std::string ns{n, strlen(n) };
  add_name_val_to_obj(ns, v, d, allocator);
}
+
// name=value parameters parsed out of the secret-engine config string
typedef std::map<std::string, std::string> EngineParmMap;


/**
 * Abstraction over the two parallel sets of SSE configurables
 * (SSE-KMS vs SSE-S3); each accessor yields the corresponding
 * rgw_crypt_* option for its flavor.  See KMSContext and
 * SseS3Context later in this file.
 */
class SSEContext {
protected:
  virtual ~SSEContext(){};
public:
  virtual const std::string & backend() = 0;
  virtual const std::string & addr() = 0;
  virtual const std::string & auth() = 0;
  virtual const std::string & k_namespace() = 0;
  virtual const std::string & prefix() = 0;
  virtual const std::string & secret_engine() = 0;
  virtual const std::string & ssl_cacert() = 0;
  virtual const std::string & ssl_clientcert() = 0;
  virtual const std::string & ssl_clientkey() = 0;
  virtual const std::string & token_file() = 0;
  // NOTE(review): 'const' on a by-value bool return is meaningless,
  // but the overrides repeat it, so it is kept for signature parity
  virtual const bool verify_ssl() = 0;
};
+
+class VaultSecretEngine: public SecretEngine {
+
+protected:
+ CephContext *cct;
+ SSEContext & kctx;
+
+ int load_token_from_file(const DoutPrefixProvider *dpp, std::string *vault_token)
+ {
+
+ int res = 0;
+ std::string token_file = kctx.token_file();
+ if (token_file.empty()) {
+ ldpp_dout(dpp, 0) << "ERROR: Vault token file not set in rgw_crypt_vault_token_file" << dendl;
+ return -EINVAL;
+ }
+ ldpp_dout(dpp, 20) << "Vault token file: " << token_file << dendl;
+
+ struct stat token_st;
+ if (stat(token_file.c_str(), &token_st) != 0) {
+ ldpp_dout(dpp, 0) << "ERROR: Vault token file '" << token_file << "' not found " << dendl;
+ return -ENOENT;
+ }
+
+ if (token_st.st_mode & (S_IRWXG | S_IRWXO)) {
+ ldpp_dout(dpp, 0) << "ERROR: Vault token file '" << token_file << "' permissions are "
+ << "too open, it must not be accessible by other users" << dendl;
+ return -EACCES;
+ }
+
+ char buf[2048];
+ res = safe_read_file("", token_file.c_str(), buf, sizeof(buf));
+ if (res < 0) {
+ if (-EACCES == res) {
+ ldpp_dout(dpp, 0) << "ERROR: Permission denied reading Vault token file" << dendl;
+ } else {
+ ldpp_dout(dpp, 0) << "ERROR: Failed to read Vault token file with error " << res << dendl;
+ }
+ return res;
+ }
+ // drop trailing newlines
+ while (res && isspace(buf[res-1])) {
+ --res;
+ }
+ vault_token->assign(std::string{buf, static_cast<size_t>(res)});
+ memset(buf, 0, sizeof(buf));
+ ::ceph::crypto::zeroize_for_security(buf, sizeof(buf));
+ return res;
+ }
+
+ FORTEST_VIRTUAL
+ int send_request(const DoutPrefixProvider *dpp, const char *method, std::string_view infix,
+ std::string_view key_id,
+ const std::string& postdata,
+ bufferlist &secret_bl)
+ {
+ int res;
+ string vault_token = "";
+ if (RGW_SSE_KMS_VAULT_AUTH_TOKEN == kctx.auth()){
+ ldpp_dout(dpp, 0) << "Loading Vault Token from filesystem" << dendl;
+ res = load_token_from_file(dpp, &vault_token);
+ if (res < 0){
+ return res;
+ }
+ }
+
+ std::string secret_url = kctx.addr();
+ if (secret_url.empty()) {
+ ldpp_dout(dpp, 0) << "ERROR: Vault address not set in rgw_crypt_vault_addr" << dendl;
+ return -EINVAL;
+ }
+
+ concat_url(secret_url, kctx.prefix());
+ concat_url(secret_url, std::string(infix));
+ concat_url(secret_url, std::string(key_id));
+
+ RGWHTTPTransceiver secret_req(cct, method, secret_url, &secret_bl);
+
+ if (postdata.length()) {
+ secret_req.set_post_data(postdata);
+ secret_req.set_send_length(postdata.length());
+ }
+
+ secret_req.append_header("X-Vault-Token", vault_token);
+ if (!vault_token.empty()){
+ secret_req.append_header("X-Vault-Token", vault_token);
+ vault_token.replace(0, vault_token.length(), vault_token.length(), '\000');
+ }
+
+ string vault_namespace = kctx.k_namespace();
+ if (!vault_namespace.empty()){
+ ldpp_dout(dpp, 20) << "Vault Namespace: " << vault_namespace << dendl;
+ secret_req.append_header("X-Vault-Namespace", vault_namespace);
+ }
+
+ secret_req.set_verify_ssl(kctx.verify_ssl());
+
+ if (!kctx.ssl_cacert().empty()) {
+ secret_req.set_ca_path(kctx.ssl_cacert());
+ }
+
+ if (!kctx.ssl_clientcert().empty()) {
+ secret_req.set_client_cert(kctx.ssl_clientcert());
+ }
+ if (!kctx.ssl_clientkey().empty()) {
+ secret_req.set_client_key(kctx.ssl_clientkey());
+ }
+
+ res = secret_req.process(null_yield);
+
+ // map 401 to EACCES instead of EPERM
+ if (secret_req.get_http_status() ==
+ RGWHTTPTransceiver::HTTP_STATUS_UNAUTHORIZED) {
+ ldpp_dout(dpp, 0) << "ERROR: Vault request failed authorization" << dendl;
+ return -EACCES;
+ }
+ if (res < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: Request to Vault failed with error " << res << dendl;
+ return res;
+ }
+
+ ldpp_dout(dpp, 20) << "Request to Vault returned " << res << " and HTTP status "
+ << secret_req.get_http_status() << dendl;
+
+ return res;
+ }
+
+ int send_request(const DoutPrefixProvider *dpp, std::string_view key_id, bufferlist &secret_bl)
+ {
+ return send_request(dpp, "GET", "", key_id, string{}, secret_bl);
+ }
+
+ int decode_secret(const DoutPrefixProvider *dpp, std::string encoded, std::string& actual_key){
+ try {
+ actual_key = from_base64(encoded);
+ } catch (std::exception&) {
+ ldpp_dout(dpp, 0) << "ERROR: Failed to base64 decode key retrieved from Vault" << dendl;
+ return -EINVAL;
+ }
+ memset(encoded.data(), 0, encoded.length());
+ return 0;
+ }
+
+public:
+
+ VaultSecretEngine(CephContext *_c, SSEContext & _k) : cct(_c), kctx(_k) {
+ }
+};
+
/**
 * Vault "transit" secrets engine.  Two generations of behavior,
 * selected by 'compat':
 *  - old (COMPAT_ONLY_OLD): keys are fetched back through
 *    export/encryption-key/<name>/<version>
 *  - new (COMPAT_NEW_ONLY): per-object data keys are created via
 *    /datakey/plaintext/<name> and unwrapped via /decrypt/<name>
 * COMPAT_OLD_AND_NEW accepts both.  Without an explicit compat=
 * parameter, the mode is inferred from the configured prefix.
 */
class TransitSecretEngine: public VaultSecretEngine {
public:
  int compat;
  static const int COMPAT_NEW_ONLY = 0;
  static const int COMPAT_OLD_AND_NEW = 1;
  static const int COMPAT_ONLY_OLD = 2;
  static const int COMPAT_UNSET = -1;

private:
  EngineParmMap parms;

  /* Extract a trailing all-digit path component ("mykey/1" -> "1").
   * Returns 0 and fills 'version' on success, -1 otherwise. */
  int get_key_version(std::string_view key_id, string& version)
  {
    size_t pos = 0;

    pos = key_id.rfind("/");
    if (pos != std::string_view::npos){
      std::string_view token = key_id.substr(pos+1, key_id.length()-pos);
      if (!token.empty() && token.find_first_not_of("0123456789") == std::string_view::npos){
	version.assign(std::string(token));
	return 0;
      }
    }
    return -1;
  }

public:
  TransitSecretEngine(CephContext *cct, SSEContext & kctx, EngineParmMap parms): VaultSecretEngine(cct, kctx), parms(parms) {
    compat = COMPAT_UNSET;
    // only the "compat" parameter is understood; anything else is
    // logged and ignored
    for (auto& e: parms) {
      if (e.first == "compat") {
	if (e.second.empty()) {
	  compat = COMPAT_OLD_AND_NEW;
	} else {
	  size_t ep;

	  compat = std::stoi(e.second, &ep);
	  if (ep != e.second.length()) {
	    lderr(cct) << "warning: vault transit secrets engine : compat="
	      << e.second << " trailing junk? (ignored)" << dendl;
	  }
	}
	continue;
      }
      lderr(cct) << "ERROR: vault transit secrets engine : parameter "
	<< e.first << "=" << e.second << " ignored" << dendl;
    }
    // no explicit setting: an export-style prefix implies old behavior
    if (compat == COMPAT_UNSET) {
      std::string_view v { kctx.prefix() };
      if (string_ends_maybe_slash(v,"/export/encryption-key")) {
	compat = COMPAT_ONLY_OLD;
      } else {
	compat = COMPAT_NEW_ONLY;
      }
    }
  }

  /* Old-style retrieval: key_id must end in /<version>; GET the
   * exported key and walk .data.keys.<version> in the response. */
  int get_key(const DoutPrefixProvider *dpp, std::string_view key_id, std::string& actual_key)
  {
    ZeroPoolDocument d;
    ZeroPoolValue *v;
    string version;
    bufferlist secret_bl;

    if (get_key_version(key_id, version) < 0){
      ldpp_dout(dpp, 20) << "Missing or invalid key version" << dendl;
      return -EINVAL;
    }

    // under COMPAT_ONLY_OLD the prefix already contains the export path
    int res = send_request(dpp, "GET", compat == COMPAT_ONLY_OLD ? "" : "/export/encryption-key",
	key_id, string{}, secret_bl);
    if (res < 0) {
      return res;
    }

    ldpp_dout(dpp, 20) << "Parse response into JSON Object" << dendl;

    secret_bl.append('\0');
    rapidjson::StringStream isw(secret_bl.c_str());
    d.ParseStream<>(isw);

    if (d.HasParseError()) {
      ldpp_dout(dpp, 0) << "ERROR: Failed to parse JSON response from Vault: "
	<< rapidjson::GetParseError_En(d.GetParseError()) << dendl;
      return -EINVAL;
    }
    secret_bl.zero();	// raw response held key material; scrub it

    // walk .data.keys.<version>
    const char *elements[] = {"data", "keys", version.c_str()};
    v = &d;
    for (auto &elem: elements) {
      if (!v->IsObject()) {
	v = nullptr;
	break;
      }
      auto endr { v->MemberEnd() };
      auto itr { v->FindMember(elem) };
      if (itr == endr) {
	v = nullptr;
	break;
      }
      v = &itr->value;
    }
    if (!v || !v->IsString()) {
      ldpp_dout(dpp, 0) << "ERROR: Key not found in JSON response from Vault using Transit Engine" << dendl;
      return -EINVAL;
    }
    return decode_secret(dpp, v->GetString(), actual_key);
  }

  /* New-style creation: ask Vault for a fresh data key; the wrapped
   * ciphertext is stored in attrs (RGW_ATTR_CRYPT_DATAKEY) and the
   * plaintext is decoded into actual_key. */
  int make_actual_key(const DoutPrefixProvider *dpp, map<string, bufferlist>& attrs, std::string& actual_key)
  {
    std::string key_id = get_str_attribute(attrs, RGW_ATTR_CRYPT_KEYID);
    if (compat == COMPAT_ONLY_OLD) return get_key(dpp, key_id, actual_key);
    if (key_id.find("/") != std::string::npos) {
      ldpp_dout(dpp, 0) << "sorry, can't allow / in keyid" << dendl;
      return -EINVAL;
    }
/*
	data: {context }
	post to prefix + /datakey/plaintext/ + key_id
	jq: .data.plaintext -> key
	jq: .data.ciphertext -> (to-be) named attribute
	return decode_secret(json_obj, actual_key)
*/
    std::string context = get_str_attribute(attrs, RGW_ATTR_CRYPT_CONTEXT);
    ZeroPoolDocument d { rapidjson::kObjectType };
    auto &allocator { d.GetAllocator() };
    bufferlist secret_bl;

    add_name_val_to_obj("context", context, d, allocator);
    rapidjson::StringBuffer buf;
    rapidjson::Writer<rapidjson::StringBuffer> writer(buf);
    if (!d.Accept(writer)) {
      ldpp_dout(dpp, 0) << "ERROR: can't make json for vault" << dendl;
      return -EINVAL;
    }
    std::string post_data { buf.GetString() };

    int res = send_request(dpp, "POST", "/datakey/plaintext/", key_id,
	post_data, secret_bl);
    if (res < 0) {
      return res;
    }

    ldpp_dout(dpp, 20) << "Parse response into JSON Object" << dendl;

    secret_bl.append('\0');
    rapidjson::StringStream isw(secret_bl.c_str());
    d.SetNull();	// reuse the request document for the response
    d.ParseStream<>(isw);

    if (d.HasParseError()) {
      ldpp_dout(dpp, 0) << "ERROR: Failed to parse JSON response from Vault: "
	<< rapidjson::GetParseError_En(d.GetParseError()) << dendl;
      return -EINVAL;
    }
    secret_bl.zero();

    if (!d.IsObject()) {
      ldpp_dout(dpp, 0) << "ERROR: response from Vault is not an object" << dendl;
      return -EINVAL;
    }
    {
      auto data_itr { d.FindMember("data") };
      if (data_itr == d.MemberEnd()) {
	ldpp_dout(dpp, 0) << "ERROR: no .data in response from Vault" << dendl;
	return -EINVAL;
      }
      auto ciphertext_itr { data_itr->value.FindMember("ciphertext") };
      auto plaintext_itr { data_itr->value.FindMember("plaintext") };
      if (ciphertext_itr == data_itr->value.MemberEnd()) {
	ldpp_dout(dpp, 0) << "ERROR: no .data.ciphertext in response from Vault" << dendl;
	return -EINVAL;
      }
      if (plaintext_itr == data_itr->value.MemberEnd()) {
	ldpp_dout(dpp, 0) << "ERROR: no .data.plaintext in response from Vault" << dendl;
	return -EINVAL;
      }
      auto &ciphertext_v { ciphertext_itr->value };
      auto &plaintext_v { plaintext_itr->value };
      if (!ciphertext_v.IsString()) {
	ldpp_dout(dpp, 0) << "ERROR: .data.ciphertext not a string in response from Vault" << dendl;
	return -EINVAL;
      }
      if (!plaintext_v.IsString()) {
	ldpp_dout(dpp, 0) << "ERROR: .data.plaintext not a string in response from Vault" << dendl;
	return -EINVAL;
      }
      // persist the wrapped key so the object can be decrypted later
      set_attr(attrs, RGW_ATTR_CRYPT_DATAKEY, ciphertext_v.GetString());
      return decode_secret(dpp, plaintext_v.GetString(), actual_key);
    }
  }

  /* New-style read path: unwrap the stored ciphertext attribute via
   * /decrypt/<key_id> and decode .data.plaintext into actual_key.
   * Falls back to get_key for old-style (versioned) key ids. */
  int reconstitute_actual_key(const DoutPrefixProvider *dpp, map<string, bufferlist>& attrs, std::string& actual_key)
  {
    std::string key_id = get_str_attribute(attrs, RGW_ATTR_CRYPT_KEYID);
    std::string wrapped_key = get_str_attribute(attrs, RGW_ATTR_CRYPT_DATAKEY);
    if (compat == COMPAT_ONLY_OLD || key_id.rfind("/") != std::string::npos) {
      return get_key(dpp, key_id, actual_key);
    }
/*
	.data.ciphertext <- (to-be) named attribute
	data: {context ciphertext}
	post to prefix + /decrypt/ + key_id
	jq: .data.plaintext
	return decode_secret(json_obj, actual_key)
*/
    std::string context = get_str_attribute(attrs, RGW_ATTR_CRYPT_CONTEXT);
    ZeroPoolDocument d { rapidjson::kObjectType };
    auto &allocator { d.GetAllocator() };
    bufferlist secret_bl;

    add_name_val_to_obj("context", context, d, allocator);
    add_name_val_to_obj("ciphertext", wrapped_key, d, allocator);
    rapidjson::StringBuffer buf;
    rapidjson::Writer<rapidjson::StringBuffer> writer(buf);
    if (!d.Accept(writer)) {
      ldpp_dout(dpp, 0) << "ERROR: can't make json for vault" << dendl;
      return -EINVAL;
    }
    std::string post_data { buf.GetString() };

    int res = send_request(dpp, "POST", "/decrypt/", key_id,
	post_data, secret_bl);
    if (res < 0) {
      return res;
    }

    ldpp_dout(dpp, 20) << "Parse response into JSON Object" << dendl;

    secret_bl.append('\0');
    rapidjson::StringStream isw(secret_bl.c_str());
    d.SetNull();	// reuse the request document for the response
    d.ParseStream<>(isw);

    if (d.HasParseError()) {
      ldpp_dout(dpp, 0) << "ERROR: Failed to parse JSON response from Vault: "
	<< rapidjson::GetParseError_En(d.GetParseError()) << dendl;
      return -EINVAL;
    }
    secret_bl.zero();

    if (!d.IsObject()) {
      ldpp_dout(dpp, 0) << "ERROR: response from Vault is not an object" << dendl;
      return -EINVAL;
    }
    {
      auto data_itr { d.FindMember("data") };
      if (data_itr == d.MemberEnd()) {
	ldpp_dout(dpp, 0) << "ERROR: no .data in response from Vault" << dendl;
	return -EINVAL;
      }
      auto plaintext_itr { data_itr->value.FindMember("plaintext") };
      if (plaintext_itr == data_itr->value.MemberEnd()) {
	ldpp_dout(dpp, 0) << "ERROR: no .data.plaintext in response from Vault" << dendl;
	return -EINVAL;
      }
      auto &plaintext_v { plaintext_itr->value };
      if (!plaintext_v.IsString()) {
	ldpp_dout(dpp, 0) << "ERROR: .data.plaintext not a string in response from Vault" << dendl;
	return -EINVAL;
      }
      return decode_secret(dpp, plaintext_v.GetString(), actual_key);
    }
  }

  /* Create a named transit key (used for SSE-S3 bucket keys):
   * POST {"type":"chacha20-poly1305","derived":true} to /keys/<name>.
   * A non-empty response is logged but not treated as fatal. */
  int create_bucket_key(const DoutPrefixProvider *dpp, const std::string& key_name)
  {
/*
	.data.ciphertext <- (to-be) named attribute
	data: {"type": "chacha20-poly1305", "derived": true}
	post to prefix + key_name
	empty output.
*/
    ZeroPoolDocument d { rapidjson::kObjectType };
    auto &allocator { d.GetAllocator() };
    bufferlist dummy_bl;
    std::string chacha20_poly1305 { "chacha20-poly1305" };

    add_name_val_to_obj("type", chacha20_poly1305, d, allocator);
    add_name_val_to_obj("derived", true, d, allocator);
    rapidjson::StringBuffer buf;
    rapidjson::Writer<rapidjson::StringBuffer> writer(buf);
    if (!d.Accept(writer)) {
      ldpp_dout(dpp, 0) << "ERROR: can't make json for vault" << dendl;
      return -EINVAL;
    }
    std::string post_data { buf.GetString() };

    int res = send_request(dpp, "POST", "/keys/", key_name,
	post_data, dummy_bl);
    if (res < 0) {
      return res;
    }
    if (dummy_bl.length() != 0) {
      ldpp_dout(dpp, 0) << "ERROR: unexpected response from Vault making a key: "
	<< dummy_bl
	<< dendl;
    }
    return 0;
  }

  /* Delete a named transit key: first enable deletion via the key's
   * /config endpoint, then DELETE the key itself.  Unexpected
   * response bodies are treated as errors. */
  int delete_bucket_key(const DoutPrefixProvider *dpp, const std::string& key_name)
  {
/*
	/keys/<keyname>/config
	data: {"deletion_allowed": true}
	post to prefix + key_name
	empty output.
*/
    ZeroPoolDocument d { rapidjson::kObjectType };
    auto &allocator { d.GetAllocator() };
    bufferlist dummy_bl;
    std::ostringstream path_temp;
    path_temp << "/keys/";
    path_temp << key_name;
    std::string delete_path { path_temp.str() };
    path_temp << "/config";
    std::string config_path { path_temp.str() };

    add_name_val_to_obj("deletion_allowed", true, d, allocator);
    rapidjson::StringBuffer buf;
    rapidjson::Writer<rapidjson::StringBuffer> writer(buf);
    if (!d.Accept(writer)) {
      ldpp_dout(dpp, 0) << "ERROR: can't make json for vault" << dendl;
      return -EINVAL;
    }
    std::string post_data { buf.GetString() };

    int res = send_request(dpp, "POST", "", config_path,
	post_data, dummy_bl);
    if (res < 0) {
      return res;
    }
    if (dummy_bl.length() != 0) {
      ldpp_dout(dpp, 0) << "ERROR: unexpected response from Vault marking key to delete: "
	<< dummy_bl
	<< dendl;
      return -EINVAL;
    }

    res = send_request(dpp, "DELETE", "", delete_path,
	string{}, dummy_bl);
    if (res < 0) {
      return res;
    }
    if (dummy_bl.length() != 0) {
      ldpp_dout(dpp, 0) << "ERROR: unexpected response from Vault deleting key: "
	<< dummy_bl
	<< dendl;
      return -EINVAL;
    }
    return 0;
  }
};
+
/**
 * Vault "kv" secrets engine: the key lives at .data.data.key in the
 * secret returned by a plain GET of the key id.  Takes no engine
 * parameters.
 */
class KvSecretEngine: public VaultSecretEngine {

public:

  KvSecretEngine(CephContext *cct, SSEContext & kctx, EngineParmMap parms): VaultSecretEngine(cct, kctx){
    if (!parms.empty()) {
      lderr(cct) << "ERROR: vault kv secrets engine takes no parameters (ignoring them)" << dendl;
    }
  }

  virtual ~KvSecretEngine(){}

  int get_key(const DoutPrefixProvider *dpp, std::string_view key_id, std::string& actual_key){
    ZeroPoolDocument d;
    ZeroPoolValue *v;
    bufferlist secret_bl;

    int res = send_request(dpp, key_id, secret_bl);
    if (res < 0) {
      return res;
    }

    ldpp_dout(dpp, 20) << "Parse response into JSON Object" << dendl;

    secret_bl.append('\0');
    rapidjson::StringStream isw(secret_bl.c_str());
    d.ParseStream<>(isw);

    if (d.HasParseError()) {
      ldpp_dout(dpp, 0) << "ERROR: Failed to parse JSON response from Vault: "
	<< rapidjson::GetParseError_En(d.GetParseError()) << dendl;
      return -EINVAL;
    }
    secret_bl.zero();	// raw response held key material; scrub it

    // walk .data.data.key (kv-v2 wraps the secret in a second "data")
    static const char *elements[] = {"data", "data", "key"};
    v = &d;
    for (auto &elem: elements) {
      if (!v->IsObject()) {
	v = nullptr;
	break;
      }
      auto endr { v->MemberEnd() };
      auto itr { v->FindMember(elem) };
      if (itr == endr) {
	v = nullptr;
	break;
      }
      v = &itr->value;
    }
    if (!v || !v->IsString()) {
      ldpp_dout(dpp, 0) << "ERROR: Key not found in JSON response from Vault using KV Engine" << dendl;
      return -EINVAL;
    }
    return decode_secret(dpp, v->GetString(), actual_key);
  }

};
+
class KmipSecretEngine;
/**
 * Fluent helper used by KmipSecretEngine::get_key: maps a customer
 * key id to a KMIP key name, locates its unique identifier, then
 * fetches the key bytes.  A failing step latches failed/ret and the
 * final step short-circuits on it.
 */
class KmipGetTheKey {
private:
  CephContext *cct;
  std::string work;	// pipeline state: key name, then unique id
  bool failed = false;
  int ret;
protected:
  KmipGetTheKey(CephContext *cct) : cct(cct) {}
  KmipGetTheKey& keyid_to_keyname(std::string_view key_id);
  KmipGetTheKey& get_uniqueid_for_keyname();
  int get_key_for_uniqueid(std::string &);
  friend KmipSecretEngine;
};
+
+KmipGetTheKey&
+KmipGetTheKey::keyid_to_keyname(std::string_view key_id)
+{
+ work = cct->_conf->rgw_crypt_kmip_kms_key_template;
+ std::string keyword = "$keyid";
+ std::string replacement = std::string(key_id);
+ size_t pos = 0;
+ if (work.length() == 0) {
+ work = std::move(replacement);
+ } else {
+ while (pos < work.length()) {
+ pos = work.find(keyword, pos);
+ if (pos == std::string::npos) break;
+ work.replace(pos, keyword.length(), replacement);
+ pos += key_id.length();
+ }
+ }
+ return *this;
+}
+
/* KMIP LOCATE for the key name currently in 'work'.  Exactly one
 * match is required; on success 'work' becomes the key's unique
 * identifier, otherwise failed/ret record the error for the chain. */
KmipGetTheKey&
KmipGetTheKey::get_uniqueid_for_keyname()
{
  RGWKMIPTransceiver secret_req(cct, RGWKMIPTransceiver::LOCATE);

  secret_req.name = work.data();
  ret = secret_req.process(null_yield);
  if (ret < 0) {
    failed = true;
  } else if (!secret_req.outlist->string_count) {
    // no such key
    ret = -ENOENT;
    lderr(cct) << "error: locate returned no results for "
      << secret_req.name << dendl;
    failed = true;
  } else if (secret_req.outlist->string_count != 1) {
    // ambiguous: refuse to guess among multiple matches
    ret = -EINVAL;
    lderr(cct) << "error: locate found "
      << secret_req.outlist->string_count
      << " results for " << secret_req.name << dendl;
    failed = true;
  } else {
    work = std::string(secret_req.outlist->strings[0]);
  }
  return *this;
}
+
/* KMIP GET of the key whose unique id is in 'work'; copies the raw
 * key bytes into actual_key.  Returns the latched error immediately
 * if an earlier pipeline step failed. */
int
KmipGetTheKey::get_key_for_uniqueid(std::string& actual_key)
{
  if (failed) return ret;
  RGWKMIPTransceiver secret_req(cct, RGWKMIPTransceiver::GET);
  secret_req.unique_id = work.data();
  ret = secret_req.process(null_yield);
  if (ret < 0) {
    failed = true;
  } else {
    actual_key = std::string((char*)(secret_req.outkey->data),
		secret_req.outkey->keylen);
  }
  return ret;
}
+
+class KmipSecretEngine: public SecretEngine {
+
+protected:
+ CephContext *cct;
+
+public:
+
+ KmipSecretEngine(CephContext *cct) {
+ this->cct = cct;
+ }
+
+ int get_key(const DoutPrefixProvider *dpp, std::string_view key_id, std::string& actual_key)
+ {
+ int r;
+ r = KmipGetTheKey{cct}
+ .keyid_to_keyname(key_id)
+ .get_uniqueid_for_keyname()
+ .get_key_for_uniqueid(actual_key);
+ return r;
+ }
+};
+
/*
 * "local" SSE-KMS backend: look up the master key for key_id in the
 * rgw_crypt_s3_kms_encryption_keys conf map and derive the per-object
 * key by AES-256-ECB-encrypting key_selector with it.
 */
static int get_actual_key_from_conf(const DoutPrefixProvider* dpp,
                                    CephContext *cct,
                                    std::string_view key_id,
                                    std::string_view key_selector,
                                    std::string& actual_key)
{
  int res = 0;

  // parsed once and cached for the process lifetime: later changes to
  // the conf value are not picked up without a restart
  static map<string,string> str_map = get_str_map(
      cct->_conf->rgw_crypt_s3_kms_encryption_keys);

  map<string, string>::iterator it = str_map.find(std::string(key_id));
  if (it == str_map.end())
    return -EINVAL;

  std::string master_key;
  try {
    master_key = from_base64((*it).second);
  } catch (std::exception&) {
    ldpp_dout(dpp, 5) << "ERROR: get_actual_key_from_conf invalid encryption key id "
                      << "which contains character that is not base64 encoded."
                      << dendl;
    return -EINVAL;
  }

  if (master_key.length() == AES_256_KEYSIZE) {
    uint8_t _actual_key[AES_256_KEYSIZE];
    if (AES_256_ECB_encrypt(dpp, cct,
        reinterpret_cast<const uint8_t*>(master_key.c_str()), AES_256_KEYSIZE,
        reinterpret_cast<const uint8_t*>(key_selector.data()),
        _actual_key, AES_256_KEYSIZE)) {
      actual_key = std::string((char*)&_actual_key[0], AES_256_KEYSIZE);
    } else {
      res = -EIO;
    }
    // scrub the stack copy of the derived key
    ::ceph::crypto::zeroize_for_security(_actual_key, sizeof(_actual_key));
    // NOTE(review): master_key still holds the decoded master secret
    // and is not scrubbed before destruction -- confirm acceptable
  } else {
    ldpp_dout(dpp, 20) << "Wrong size for key=" << key_id << dendl;
    res = -EIO;
  }

  return res;
}
+
/*
 * GET the secret payload for key_id from Barbican at
 * rgw_barbican_url/v1/secrets/<key_id> using the supplied Keystone
 * token.  Success requires a 2xx status AND exactly AES_256_KEYSIZE
 * bytes of payload; HTTP 401 maps to -EACCES.
 */
static int request_key_from_barbican(const DoutPrefixProvider *dpp,
                                     CephContext *cct,
                                     std::string_view key_id,
                                     const std::string& barbican_token,
                                     std::string& actual_key) {
  int res;

  std::string secret_url = cct->_conf->rgw_barbican_url;
  if (secret_url.empty()) {
    ldpp_dout(dpp, 0) << "ERROR: conf rgw_barbican_url is not set" << dendl;
    return -EINVAL;
  }
  concat_url(secret_url, "/v1/secrets/");
  concat_url(secret_url, std::string(key_id));

  bufferlist secret_bl;
  RGWHTTPTransceiver secret_req(cct, "GET", secret_url, &secret_bl);
  secret_req.append_header("Accept", "application/octet-stream");
  secret_req.append_header("X-Auth-Token", barbican_token);

  res = secret_req.process(null_yield);
  // map 401 to EACCES instead of EPERM
  if (secret_req.get_http_status() ==
      RGWHTTPTransceiver::HTTP_STATUS_UNAUTHORIZED) {
    return -EACCES;
  }
  if (res < 0) {
    return res;
  }

  if (secret_req.get_http_status() >=200 &&
      secret_req.get_http_status() < 300 &&
      secret_bl.length() == AES_256_KEYSIZE) {
    actual_key.assign(secret_bl.c_str(), secret_bl.length());
    secret_bl.zero();	// scrub key material from the response buffer
  } else {
    res = -EACCES;
  }
  return res;
}
+
+static int get_actual_key_from_barbican(const DoutPrefixProvider *dpp,
+ CephContext *cct,
+ std::string_view key_id,
+ std::string& actual_key)
+{
+ int res = 0;
+ std::string token;
+
+ if (rgw::keystone::Service::get_keystone_barbican_token(dpp, cct, token) < 0) {
+ ldpp_dout(dpp, 5) << "Failed to retrieve token for Barbican" << dendl;
+ return -EINVAL;
+ }
+
+ res = request_key_from_barbican(dpp, cct, key_id, token, actual_key);
+ if (res != 0) {
+ ldpp_dout(dpp, 5) << "Failed to retrieve secret from Barbican:" << key_id << dendl;
+ }
+ return res;
+}
+
+
/*
 * Split a secret-engine config string ("<engine> [parm[=value]]...")
 * into the engine name (returned) and its parameter map.  Bare words
 * become parameters with empty values.
 */
std::string config_to_engine_and_parms(CephContext *cct,
                                       const char* which,
                                       std::string& secret_engine_str,
                                       EngineParmMap& secret_engine_parms)
{
  std::ostringstream oss;
  std::vector<std::string> secret_engine_v;
  std::string secret_engine;

  get_str_vec(secret_engine_str, " ", secret_engine_v);

  // NOTE(review): metavariable expansion happens after the split
  // above and its output is only used for the diagnostic below --
  // confirm this ordering is intentional
  cct->_conf.early_expand_meta(secret_engine_str, &oss);
  auto meta_errors {oss.str()};
  if (meta_errors.length()) {
    meta_errors.erase(meta_errors.find_last_not_of("\n")+1);
    lderr(cct) << "ERROR: while expanding " << which << ": "
	<< meta_errors << dendl;
  }
  // first word is the engine name; the rest are parameters
  for (auto& e: secret_engine_v) {
    if (!secret_engine.length()) {
      secret_engine = std::move(e);
      continue;
    }
    auto p { e.find('=') };
    if (p == std::string::npos) {
      secret_engine_parms.emplace(std::move(e), "");
      continue;
    }
    std::string key{ e.substr(0,p) };
    std::string val{ e.substr(p+1) };
    secret_engine_parms.emplace(std::move(key), std::move(val));
  }
  return secret_engine;
}
+
+
+static int get_actual_key_from_vault(const DoutPrefixProvider *dpp,
+ CephContext *cct,
+ SSEContext & kctx,
+ map<string, bufferlist>& attrs,
+ std::string& actual_key, bool make_it)
+{
+ std::string secret_engine_str = kctx.secret_engine();
+ EngineParmMap secret_engine_parms;
+ auto secret_engine { config_to_engine_and_parms(
+ cct, "rgw_crypt_vault_secret_engine",
+ secret_engine_str, secret_engine_parms) };
+ ldpp_dout(dpp, 20) << "Vault authentication method: " << kctx.auth() << dendl;
+ ldpp_dout(dpp, 20) << "Vault Secrets Engine: " << secret_engine << dendl;
+
+ if (RGW_SSE_KMS_VAULT_SE_KV == secret_engine){
+ std::string key_id = get_str_attribute(attrs, RGW_ATTR_CRYPT_KEYID);
+ KvSecretEngine engine(cct, kctx, std::move(secret_engine_parms));
+ return engine.get_key(dpp, key_id, actual_key);
+ }
+ else if (RGW_SSE_KMS_VAULT_SE_TRANSIT == secret_engine){
+ TransitSecretEngine engine(cct, kctx, std::move(secret_engine_parms));
+ std::string key_id = get_str_attribute(attrs, RGW_ATTR_CRYPT_KEYID);
+ return make_it
+ ? engine.make_actual_key(dpp, attrs, actual_key)
+ : engine.reconstitute_actual_key(dpp, attrs, actual_key);
+ }
+ else {
+ ldpp_dout(dpp, 0) << "Missing or invalid secret engine" << dendl;
+ return -EINVAL;
+ }
+}
+
+
/* Upload path: forwards to get_actual_key_from_vault with
 * make_it=true (may create and store a new wrapped data key). */
static int make_actual_key_from_vault(const DoutPrefixProvider *dpp,
                                     CephContext *cct,
                                     SSEContext & kctx,
                                     map<string, bufferlist>& attrs,
                                     std::string& actual_key)
{
  return get_actual_key_from_vault(dpp, cct, kctx, attrs, actual_key, true);
}
+
+
/* Read path: forwards to get_actual_key_from_vault with
 * make_it=false (unwrap the key stored with the object). */
static int reconstitute_actual_key_from_vault(const DoutPrefixProvider *dpp,
                                     CephContext *cct,
                                     SSEContext & kctx,
                                     map<string, bufferlist>& attrs,
                                     std::string& actual_key)
{
  return get_actual_key_from_vault(dpp, cct, kctx, attrs, actual_key, false);
}
+
+
/* KMIP backend dispatch.  Only one engine flavor exists today, so
 * the condition below is effectively constant; the else branch is a
 * placeholder for future engines. */
static int get_actual_key_from_kmip(const DoutPrefixProvider *dpp,
                                    CephContext *cct,
                                    std::string_view key_id,
                                    std::string& actual_key)
{
  std::string secret_engine = RGW_SSE_KMS_KMIP_SE_KV;

  if (RGW_SSE_KMS_KMIP_SE_KV == secret_engine){
    KmipSecretEngine engine(cct);
    return engine.get_key(dpp, key_id, actual_key);
  }
  else{
    ldpp_dout(dpp, 0) << "Missing or invalid secret engine" << dendl;
    return -EINVAL;
  }
}
/**
 * SSEContext flavor for SSE-KMS: each accessor maps straight to one
 * of the rgw_crypt_* (vault) configurables.
 */
class KMSContext : public SSEContext {
  CephContext *cct;
public:
  KMSContext(CephContext*_cct) : cct{_cct} {};
  ~KMSContext() override {};
  const std::string & backend() override {
    return cct->_conf->rgw_crypt_s3_kms_backend;
  };
  const std::string & addr() override {
    return cct->_conf->rgw_crypt_vault_addr;
  };
  const std::string & auth() override {
    return cct->_conf->rgw_crypt_vault_auth;
  };
  const std::string & k_namespace() override {
    return cct->_conf->rgw_crypt_vault_namespace;
  };
  const std::string & prefix() override {
    return cct->_conf->rgw_crypt_vault_prefix;
  };
  const std::string & secret_engine() override {
    return cct->_conf->rgw_crypt_vault_secret_engine;
  };
  const std::string & ssl_cacert() override {
    return cct->_conf->rgw_crypt_vault_ssl_cacert;
  };
  const std::string & ssl_clientcert() override {
    return cct->_conf->rgw_crypt_vault_ssl_clientcert;
  };
  const std::string & ssl_clientkey() override {
    return cct->_conf->rgw_crypt_vault_ssl_clientkey;
  };
  const std::string & token_file() override {
    return cct->_conf->rgw_crypt_vault_token_file;
  };
  const bool verify_ssl() override {
    return cct->_conf->rgw_crypt_vault_verify_ssl;
  };
};
+
+// SSEContext backed by the rgw_crypt_sse_s3_vault_* configuration
+// options; used for SSE-S3 (rgw-managed keys kept in Vault).
+class SseS3Context : public SSEContext {
+  CephContext *cct;
+public:
+  static const std::string sse_s3_secret_engine;
+  SseS3Context(CephContext*_cct) : cct{_cct} {};
+  // 'override' added for consistency with KMSContext and so the
+  // compiler verifies the base destructor is virtual
+  ~SseS3Context() override {};
+  const std::string & backend() override {
+    return cct->_conf->rgw_crypt_sse_s3_backend;
+  };
+  const std::string & addr() override {
+    return cct->_conf->rgw_crypt_sse_s3_vault_addr;
+  };
+  const std::string & auth() override {
+    return cct->_conf->rgw_crypt_sse_s3_vault_auth;
+  };
+  const std::string & k_namespace() override {
+    return cct->_conf->rgw_crypt_sse_s3_vault_namespace;
+  };
+  const std::string & prefix() override {
+    return cct->_conf->rgw_crypt_sse_s3_vault_prefix;
+  };
+  const std::string & secret_engine() override {
+    return cct->_conf->rgw_crypt_sse_s3_vault_secret_engine;
+  };
+  const std::string & ssl_cacert() override {
+    return cct->_conf->rgw_crypt_sse_s3_vault_ssl_cacert;
+  };
+  const std::string & ssl_clientcert() override {
+    return cct->_conf->rgw_crypt_sse_s3_vault_ssl_clientcert;
+  };
+  const std::string & ssl_clientkey() override {
+    return cct->_conf->rgw_crypt_sse_s3_vault_ssl_clientkey;
+  };
+  const std::string & token_file() override {
+    return cct->_conf->rgw_crypt_sse_s3_vault_token_file;
+  };
+  const bool verify_ssl() override {
+    return cct->_conf->rgw_crypt_sse_s3_vault_verify_ssl;
+  };
+};
+
+// Recover (reconstitute) the data key previously used to encrypt an
+// object, dispatching on the configured SSE-KMS backend
+// (rgw_crypt_s3_kms_backend): barbican, vault, kmip, or testing.
+int reconstitute_actual_key_from_kms(const DoutPrefixProvider *dpp, CephContext *cct,
+                            map<string, bufferlist>& attrs,
+                            std::string& actual_key)
+{
+  KMSContext kctx { cct };
+  const std::string& backend = kctx.backend();
+  std::string key_id = get_str_attribute(attrs, RGW_ATTR_CRYPT_KEYID);
+
+  ldpp_dout(dpp, 20) << "Getting KMS encryption key for key " << key_id << dendl;
+  ldpp_dout(dpp, 20) << "SSE-KMS backend is " << backend << dendl;
+
+  if (backend == RGW_SSE_KMS_BACKEND_BARBICAN) {
+    return get_actual_key_from_barbican(dpp, cct, key_id, actual_key);
+  } else if (backend == RGW_SSE_KMS_BACKEND_VAULT) {
+    return reconstitute_actual_key_from_vault(dpp, cct, kctx, attrs, actual_key);
+  } else if (backend == RGW_SSE_KMS_BACKEND_KMIP) {
+    return get_actual_key_from_kmip(dpp, cct, key_id, actual_key);
+  } else if (backend == RGW_SSE_KMS_BACKEND_TESTING) {
+    // testing backend reads the key from configuration, optionally
+    // selected by the object's key-selector attribute
+    std::string key_selector = get_str_attribute(attrs, RGW_ATTR_CRYPT_KEYSEL);
+    return get_actual_key_from_conf(dpp, cct, key_id, key_selector, actual_key);
+  }
+
+  ldpp_dout(dpp, 0) << "ERROR: Invalid rgw_crypt_s3_kms_backend: " << backend << dendl;
+  return -EINVAL;
+}
+
+// Create a new data key for object encryption. Only the Vault backend
+// can mint keys server-side; every other backend takes the
+// reconstitute path, which resolves the already-configured key.
+int make_actual_key_from_kms(const DoutPrefixProvider *dpp, CephContext *cct,
+                             map<string, bufferlist>& attrs,
+                             std::string& actual_key)
+{
+  KMSContext kctx { cct };
+  if (kctx.backend() == RGW_SSE_KMS_BACKEND_VAULT) {
+    return make_actual_key_from_vault(dpp, cct, kctx, attrs, actual_key);
+  }
+  return reconstitute_actual_key_from_kms(dpp, cct, attrs, actual_key);
+}
+
+// Recover the SSE-S3 data key for an object. Only the Vault backend
+// is supported for SSE-S3 (rgw_crypt_sse_s3_backend).
+//
+// \param attrs object attributes; RGW_ATTR_CRYPT_KEYID names the key
+// \param actual_key [out] receives the data-encryption key
+// \return 0 on success, negative error code on failure
+int reconstitute_actual_key_from_sse_s3(const DoutPrefixProvider *dpp,
+                            CephContext *cct,
+                            map<string, bufferlist>& attrs,
+                            std::string& actual_key)
+{
+  std::string key_id = get_str_attribute(attrs, RGW_ATTR_CRYPT_KEYID);
+  SseS3Context kctx { cct };
+  const std::string &kms_backend { kctx.backend() };
+
+  ldpp_dout(dpp, 20) << "Getting SSE-S3 encryption key for key " << key_id << dendl;
+  // fixed: this is the SSE-S3 backend, not SSE-KMS -- the previous
+  // message made the two code paths indistinguishable in logs
+  ldpp_dout(dpp, 20) << "SSE-S3 backend is " << kms_backend << dendl;
+
+  if (RGW_SSE_KMS_BACKEND_VAULT == kms_backend) {
+    return reconstitute_actual_key_from_vault(dpp, cct, kctx, attrs, actual_key);
+  }
+
+  ldpp_dout(dpp, 0) << "ERROR: Invalid rgw_crypt_sse_s3_backend: " << kms_backend << dendl;
+  return -EINVAL;
+}
+
+// Mint a new SSE-S3 data key via Vault; any other configured backend
+// is rejected with -EINVAL.
+//
+// \param attrs object attributes passed through to the Vault engine
+// \param actual_key [out] receives the newly created key
+// \return 0 on success, negative error code on failure
+int make_actual_key_from_sse_s3(const DoutPrefixProvider *dpp,
+                            CephContext *cct,
+                            map<string, bufferlist>& attrs,
+                            std::string& actual_key)
+{
+  SseS3Context kctx { cct };
+  // bind by reference: backend() returns the config string itself, so
+  // copying it (as before) was unnecessary
+  const std::string& kms_backend { kctx.backend() };
+  if (RGW_SSE_KMS_BACKEND_VAULT != kms_backend) {
+    ldpp_dout(dpp, 0) << "ERROR: Unsupported rgw_crypt_sse_s3_backend: " << kms_backend << dendl;
+    return -EINVAL;
+  }
+  return make_actual_key_from_vault(dpp, cct, kctx, attrs, actual_key);
+}
+
+
+// Create the per-bucket SSE-S3 key in Vault's transit secret engine.
+// Requires the Vault backend and the "transit" secret engine; any
+// other configuration fails with -EINVAL.
+//
+// \param bucket_key name of the bucket key to create
+// \return 0 on success, negative error code on failure
+int create_sse_s3_bucket_key(const DoutPrefixProvider *dpp,
+                             CephContext *cct,
+                             const std::string& bucket_key)
+{
+  SseS3Context kctx { cct };
+
+  // bind by reference: no need to copy the config string
+  const std::string& kms_backend { kctx.backend() };
+  if (RGW_SSE_KMS_BACKEND_VAULT != kms_backend) {
+    ldpp_dout(dpp, 0) << "ERROR: Unsupported rgw_crypt_sse_s3_backend: " << kms_backend << dendl;
+    return -EINVAL;
+  }
+
+  std::string secret_engine_str = kctx.secret_engine();
+  EngineParmMap secret_engine_parms;
+  auto secret_engine { config_to_engine_and_parms(
+    cct, "rgw_crypt_sse_s3_vault_secret_engine",
+    secret_engine_str, secret_engine_parms) };
+  if (RGW_SSE_KMS_VAULT_SE_TRANSIT == secret_engine){
+    TransitSecretEngine engine(cct, kctx, std::move(secret_engine_parms));
+    return engine.create_bucket_key(dpp, bucket_key);
+  }
+  else {
+    ldpp_dout(dpp, 0) << "Missing or invalid secret engine" << dendl;
+    return -EINVAL;
+  }
+}
+
+// Delete the per-bucket SSE-S3 key from Vault's transit secret engine.
+// NOTE(review): unlike create_sse_s3_bucket_key(), this does not check
+// that the configured backend is vault before proceeding -- confirm
+// whether that asymmetry is intentional.
+int remove_sse_s3_bucket_key(const DoutPrefixProvider *dpp,
+                             CephContext *cct,
+                             const std::string& bucket_key)
+{
+  SseS3Context kctx { cct };
+  EngineParmMap secret_engine_parms;
+  std::string secret_engine_str = kctx.secret_engine();
+  auto secret_engine = config_to_engine_and_parms(
+    cct, "rgw_crypt_sse_s3_vault_secret_engine",
+    secret_engine_str, secret_engine_parms);
+  if (secret_engine != RGW_SSE_KMS_VAULT_SE_TRANSIT) {
+    ldpp_dout(dpp, 0) << "Missing or invalid secret engine" << dendl;
+    return -EINVAL;
+  }
+  TransitSecretEngine engine(cct, kctx, std::move(secret_engine_parms));
+  return engine.delete_bucket_key(dpp, bucket_key);
+}
diff --git a/src/rgw/rgw_kms.h b/src/rgw/rgw_kms.h
new file mode 100644
index 000000000..f8e8655f2
--- /dev/null
+++ b/src/rgw/rgw_kms.h
@@ -0,0 +1,64 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/**
+ * Server-side encryption integrations with Key Management Systems (SSE-KMS)
+ */
+
+#pragma once
+
+#include <string>
+
+// Backend selector values for rgw_crypt_s3_kms_backend /
+// rgw_crypt_sse_s3_backend. Declared 'inline' (C++17) so every
+// translation unit shares a single definition instead of each getting
+// its own 'static' copy with separate dynamic initialization.
+inline const std::string RGW_SSE_KMS_BACKEND_TESTING = "testing";
+inline const std::string RGW_SSE_KMS_BACKEND_BARBICAN = "barbican";
+inline const std::string RGW_SSE_KMS_BACKEND_VAULT = "vault";
+inline const std::string RGW_SSE_KMS_BACKEND_KMIP = "kmip";
+
+// Vault authentication methods (rgw_crypt_vault_auth)
+inline const std::string RGW_SSE_KMS_VAULT_AUTH_TOKEN = "token";
+inline const std::string RGW_SSE_KMS_VAULT_AUTH_AGENT = "agent";
+
+// Vault secret engines (rgw_crypt_vault_secret_engine)
+inline const std::string RGW_SSE_KMS_VAULT_SE_TRANSIT = "transit";
+inline const std::string RGW_SSE_KMS_VAULT_SE_KV = "kv";
+
+// KMIP secret engines
+inline const std::string RGW_SSE_KMS_KMIP_SE_KV = "kv";
+
+/**
+ * Retrieves the actual server-side encryption key from a KMS system given a
+ * key ID. Currently supported KMS systems are OpenStack Barbican and HashiCorp
+ * Vault, but keys can also be retrieved from Ceph configuration file (if
+ * kms is set to 'local').
+ *
+ * \param dpp logging prefix provider
+ * \param cct Ceph context, used to read the rgw_crypt_* configuration
+ * \param attrs object attributes; RGW_ATTR_CRYPT_KEYID selects the key
+ * \param actual_key [out] receives the resolved data-encryption key
+ * \return 0 on success, negative error code on failure
+ */
+// make_* creates a fresh key (Vault only); reconstitute_* recovers the
+// key that was used when the object was written.
+int make_actual_key_from_kms(const DoutPrefixProvider *dpp, CephContext *cct,
+                             std::map<std::string, bufferlist>& attrs,
+                             std::string& actual_key);
+int reconstitute_actual_key_from_kms(const DoutPrefixProvider *dpp, CephContext *cct,
+                                     std::map<std::string, bufferlist>& attrs,
+                                     std::string& actual_key);
+// SSE-S3 variants: same contract, but configured via the
+// rgw_crypt_sse_s3_* options rather than the SSE-KMS family.
+int make_actual_key_from_sse_s3(const DoutPrefixProvider *dpp, CephContext *cct,
+                             std::map<std::string, bufferlist>& attrs,
+                             std::string& actual_key);
+int reconstitute_actual_key_from_sse_s3(const DoutPrefixProvider *dpp, CephContext *cct,
+                                     std::map<std::string, bufferlist>& attrs,
+                                     std::string& actual_key);
+
+// Create/remove the per-bucket SSE-S3 key (Vault transit engine).
+int create_sse_s3_bucket_key(const DoutPrefixProvider *dpp, CephContext *cct,
+                             const std::string& actual_key);
+
+int remove_sse_s3_bucket_key(const DoutPrefixProvider *dpp, CephContext *cct,
+                             const std::string& actual_key);
+
+/**
+ * SecretEngine Interface
+ * Defining interface here such that we can use both a real implementation
+ * of this interface, and a mock implementation in tests.
+**/
+class SecretEngine {
+
+public:
+  /// Fetch the key named by key_id into actual_key.
+  /// \return 0 on success, negative error code on failure.
+  virtual int get_key(const DoutPrefixProvider *dpp, std::string_view key_id, std::string& actual_key) = 0;
+  // defaulted virtual destructor: safe deletion through a base pointer
+  virtual ~SecretEngine() = default;
+};
diff --git a/src/rgw/rgw_lc.cc b/src/rgw/rgw_lc.cc
new file mode 100644
index 000000000..7f4a79501
--- /dev/null
+++ b/src/rgw/rgw_lc.cc
@@ -0,0 +1,2869 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <string.h>
+#include <iostream>
+#include <map>
+#include <algorithm>
+#include <tuple>
+#include <functional>
+
+#include <boost/algorithm/string/split.hpp>
+#include <boost/algorithm/string.hpp>
+#include <boost/algorithm/string/predicate.hpp>
+#include <boost/variant.hpp>
+
+#include "include/scope_guard.h"
+#include "include/function2.hpp"
+#include "common/Formatter.h"
+#include "common/containers.h"
+#include "common/split.h"
+#include <common/errno.h>
+#include "include/random.h"
+#include "cls/lock/cls_lock_client.h"
+#include "rgw_perf_counters.h"
+#include "rgw_common.h"
+#include "rgw_bucket.h"
+#include "rgw_lc.h"
+#include "rgw_zone.h"
+#include "rgw_string.h"
+#include "rgw_multi.h"
+#include "rgw_sal.h"
+#include "rgw_lc_tier.h"
+#include "rgw_notify.h"
+
+#include "fmt/format.h"
+
+#include "services/svc_sys_obj.h"
+#include "services/svc_zone.h"
+#include "services/svc_tier_rados.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+// Human-readable names for lifecycle entry status codes; indexed by
+// the status value stored in a rgw::sal::Lifecycle::LCEntry (see the
+// operator<< for LCEntry below).
+const char* LC_STATUS[] = {
+      "UNINITIAL",
+      "PROCESSING",
+      "FAILED",
+      "COMPLETE"
+};
+
+using namespace librados;
+
+// Validate this lifecycle rule: id length, at least one configured
+// action, well-formed expirations, and no mixing of day-based and
+// date-based times across the rule's expiration and transitions.
+bool LCRule::valid() const
+{
+  if (id.length() > MAX_ID_LEN) {
+    return false;
+  }
+  // a rule must carry at least one action
+  if (expiration.empty() && noncur_expiration.empty() &&
+      mp_expiration.empty() && !dm_expiration &&
+      transitions.empty() && noncur_transitions.empty()) {
+    return false;
+  }
+  if (!expiration.valid() || !noncur_expiration.valid() ||
+      !mp_expiration.valid()) {
+    return false;
+  }
+  if (!transitions.empty()) {
+    // days and dates may not be mixed; the flags accumulate across the
+    // expiration and every transition in declaration order
+    bool using_days = expiration.has_days();
+    bool using_date = expiration.has_date();
+    for (const auto& [sclass, transition] : transitions) {
+      if (!transition.valid()) {
+        return false;
+      }
+      using_days = using_days || transition.has_days();
+      using_date = using_date || transition.has_date();
+      if (using_days && using_date) {
+        return false;
+      }
+    }
+  }
+  for (const auto& [sclass, transition] : noncur_transitions) {
+    if (!transition.valid()) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Populate this rule as an enabled, days-based expiration rule.
+// \param _id      rule identifier
+// \param _prefix  object key prefix the rule applies to
+// \param num_days expiration age in days
+void LCRule::init_simple_days_rule(std::string_view _id,
+				   std::string_view _prefix, int num_days)
+{
+  id = _id;
+  prefix = _prefix;
+  // std::to_string replaces the hand-rolled snprintf into a fixed
+  // stack buffer
+  expiration.set_days(std::to_string(num_days));
+  set_enabled(true);
+}
+
+// Insert a rule into the id-keyed rule map without validation (see
+// check_and_add_rule for the validating variant).
+void RGWLifecycleConfiguration::add_rule(const LCRule& rule)
+{
+  // note that this will return false for groups, but that's ok, we
+  // won't search groups
+  const auto& id = rule.get_id();
+  rule_map.emplace(id, rule);
+}
+
+// Expand a rule into the internal lc_op representation and file it in
+// prefix_map under the rule's effective prefix. Always returns true.
+bool RGWLifecycleConfiguration::_add_rule(const LCRule& rule)
+{
+  lc_op op(rule.get_id());
+  op.status = rule.is_enabled();
+  if (rule.get_expiration().has_days()) {
+    op.expiration = rule.get_expiration().get_days();
+  }
+  if (rule.get_expiration().has_date()) {
+    op.expiration_date = ceph::from_iso_8601(rule.get_expiration().get_date());
+  }
+  if (rule.get_noncur_expiration().has_days()) {
+    op.noncur_expiration = rule.get_noncur_expiration().get_days();
+  }
+  if (rule.get_mp_expiration().has_days()) {
+    op.mp_expiration = rule.get_mp_expiration().get_days();
+  }
+  op.dm_expiration = rule.get_dm_expiration();
+  // current-version transitions: days- or date-based, keyed by the
+  // (canonicalized) target storage class
+  for (const auto &elem : rule.get_transitions()) {
+    transition_action action;
+    if (elem.second.has_days()) {
+      action.days = elem.second.get_days();
+    } else {
+      action.date = ceph::from_iso_8601(elem.second.get_date());
+    }
+    action.storage_class
+      = rgw_placement_rule::get_canonical_storage_class(elem.first);
+    op.transitions.emplace(elem.first, std::move(action));
+  }
+  // noncurrent-version transitions record both days and date
+  for (const auto &elem : rule.get_noncur_transitions()) {
+    transition_action action;
+    action.days = elem.second.get_days();
+    action.date = ceph::from_iso_8601(elem.second.get_date());
+    action.storage_class
+      = rgw_placement_rule::get_canonical_storage_class(elem.first);
+    op.noncur_transitions.emplace(elem.first, std::move(action));
+  }
+  // the filter prefix (if present) takes precedence over the legacy
+  // top-level rule prefix
+  std::string prefix;
+  if (rule.get_filter().has_prefix()){
+    prefix = rule.get_filter().get_prefix();
+  } else {
+    prefix = rule.get_prefix();
+  }
+  if (rule.get_filter().has_tags()){
+    op.obj_tags = rule.get_filter().get_tags();
+  }
+  op.rule_flags = rule.get_filter().get_flags();
+  prefix_map.emplace(std::move(prefix), std::move(op));
+  return true;
+}
+
+// Validate a rule, then add it to both rule_map and prefix_map.
+// Returns -EINVAL for an invalid rule or duplicate id, and
+// -ERR_INVALID_REQUEST for unsupported tag-filter combinations.
+int RGWLifecycleConfiguration::check_and_add_rule(const LCRule& rule)
+{
+  if (!rule.valid()) {
+    return -EINVAL;
+  }
+  const auto& id = rule.get_id();
+  if (rule_map.count(id) > 0) { // duplicate ids are rejected
+    return -EINVAL;
+  }
+  // tag filters cannot be combined with delete-marker or multipart
+  // expiration
+  if (rule.get_filter().has_tags() && (rule.get_dm_expiration() ||
+                                       !rule.get_mp_expiration().empty())) {
+    return -ERR_INVALID_REQUEST;
+  }
+  rule_map.emplace(id, rule);
+
+  if (!_add_rule(rule)) {
+    return -ERR_INVALID_REQUEST;
+  }
+  return 0;
+}
+
+// Return true if two expanded rules share an action of the same class:
+// expiration, noncurrent expiration, multipart expiration, or a
+// transition targeting a common storage class. Note the else-if chain:
+// each class is only considered when no earlier class applied to BOTH
+// ops simultaneously.
+bool RGWLifecycleConfiguration::has_same_action(const lc_op& first,
+						const lc_op& second) {
+  if ((first.expiration > 0 || first.expiration_date != boost::none) &&
+    (second.expiration > 0 || second.expiration_date != boost::none)) {
+    return true;
+  } else if (first.noncur_expiration > 0 && second.noncur_expiration > 0) {
+    return true;
+  } else if (first.mp_expiration > 0 && second.mp_expiration > 0) {
+    return true;
+  } else if (!first.transitions.empty() && !second.transitions.empty()) {
+    // overlap when any storage class appears in both transition maps
+    for (auto &elem : first.transitions) {
+      if (second.transitions.find(elem.first) != second.transitions.end()) {
+        return true;
+      }
+    }
+  } else if (!first.noncur_transitions.empty() &&
+	     !second.noncur_transitions.empty()) {
+    for (auto &elem : first.noncur_transitions) {
+      if (second.noncur_transitions.find(elem.first) !=
+	  second.noncur_transitions.end()) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+/* Formerly, this method checked for duplicate rules using an invalid
+ * method (prefix uniqueness). */
+// Intentionally permissive: per-rule validation happens in
+// check_and_add_rule(); there is no cross-rule constraint left here.
+bool RGWLifecycleConfiguration::valid()
+{
+  return true;
+}
+
+// Lifecycle worker thread body: until shutdown, run a full lifecycle
+// pass whenever the start time falls inside the configured work
+// window, then sleep until the next scheduled start.
+void *RGWLC::LCWorker::entry() {
+  do {
+    std::unique_ptr<rgw::sal::Bucket> all_buckets; // empty restriction
+    utime_t start = ceph_clock_now();
+    if (should_work(start)) {
+      ldpp_dout(dpp, 2) << "life cycle: start" << dendl;
+      int r = lc->process(this, all_buckets, false /* once */);
+      if (r < 0) {
+        ldpp_dout(dpp, 0) << "ERROR: do life cycle process() returned error r="
+			  << r << dendl;
+      }
+      ldpp_dout(dpp, 2) << "life cycle: stop" << dendl;
+      cloud_targets.clear(); // clear cloud targets
+    }
+    if (lc->going_down())
+      break;
+
+    // compute and wait out the delay to the next scheduled run; the
+    // condvar lets shutdown interrupt the sleep early
+    utime_t end = ceph_clock_now();
+    int secs = schedule_next_start_time(start, end);
+    utime_t next;
+    next.set_from_double(end + secs);
+
+    ldpp_dout(dpp, 5) << "schedule life cycle next start time: "
+		      << rgw_to_asctime(next) << dendl;
+
+    std::unique_lock l{lock};
+    cond.wait_for(l, std::chrono::seconds(secs));
+  } while (!lc->going_down());
+
+  return NULL;
+}
+
+// Bind the LC subsystem to a driver and precompute the per-shard
+// lifecycle object names ("<prefix>.<i>", capped at HASH_PRIME shards)
+// plus a random cookie used for lock ownership.
+void RGWLC::initialize(CephContext *_cct, rgw::sal::Driver* _driver) {
+  cct = _cct;
+  driver = _driver;
+  sal_lc = driver->get_lifecycle();
+  max_objs = cct->_conf->rgw_lc_max_objs;
+  if (max_objs > HASH_PRIME)
+    max_objs = HASH_PRIME;
+
+  // raw array owned by this class; released in finalize()
+  obj_names = new string[max_objs];
+
+  for (int i = 0; i < max_objs; i++) {
+    obj_names[i] = lc_oid_prefix;
+    char buf[32];
+    snprintf(buf, 32, ".%d", i);
+    obj_names[i].append(buf);
+  }
+
+#define COOKIE_LEN 16
+  char cookie_buf[COOKIE_LEN + 1];
+  gen_rand_alphanumeric(cct, cookie_buf, sizeof(cookie_buf) - 1);
+  cookie = cookie_buf;
+}
+
+// Release the shard-name array allocated in initialize().
+void RGWLC::finalize()
+{
+  delete[] obj_names;
+}
+
+// Debug formatting for an LC entry:
+// "<ent: bucket=...; start_time=...; status=...>"
+static inline std::ostream& operator<<(std::ostream &os, rgw::sal::Lifecycle::LCEntry& ent) {
+  return os << "<ent: bucket=" << ent.get_bucket()
+            << "; start_time="
+            << rgw_to_asctime(utime_t(time_t(ent.get_start_time()), 0))
+            << "; status=" << LC_STATUS[ent.get_status()]
+            << ">";
+}
+
+// Decide whether an object whose mtime is 'mtime' has aged past 'days'
+// days. With rgw_lc_debug_interval > 0, each interval-second counts as
+// one "day" and 'now' is not rounded, which lets tests exercise LC
+// quickly. Optionally reports the computed expiration timestamp.
+static bool obj_has_expired(const DoutPrefixProvider *dpp, CephContext *cct, ceph::real_time mtime, int days,
+			    ceph::real_time *expire_time = nullptr)
+{
+  double timediff, cmp;
+  utime_t base_time;
+  if (cct->_conf->rgw_lc_debug_interval <= 0) {
+    /* Normal case, run properly */
+    cmp = double(days)*24*60*60;
+    base_time = ceph_clock_now().round_to_day();
+  } else {
+    /* We're in debug mode; Treat each rgw_lc_debug_interval seconds as a day */
+    cmp = double(days)*cct->_conf->rgw_lc_debug_interval;
+    base_time = ceph_clock_now();
+  }
+  auto tt_mtime = ceph::real_clock::to_time_t(mtime);
+  timediff = base_time - tt_mtime;
+
+  if (expire_time) {
+    *expire_time = mtime + make_timespan(cmp);
+  }
+
+  ldpp_dout(dpp, 20) << __func__
+		     << "(): mtime=" << mtime << " days=" << days
+		     << " base_time=" << base_time << " timediff=" << timediff
+		     << " cmp=" << cmp
+		     << " is_expired=" << (timediff >= cmp)
+		     << dendl;
+
+  return (timediff >= cmp);
+}
+
+// Return true iff lifecycle may act on this object under S3 Object
+// Lock: the bucket has no lock enabled, the object no longer exists,
+// or neither an unexpired retention period nor an enabled legal hold
+// applies. Undecodable lock attributes fail closed (return false).
+static bool pass_object_lock_check(rgw::sal::Driver* driver, rgw::sal::Object* obj, const DoutPrefixProvider *dpp)
+{
+  if (!obj->get_bucket()->get_info().obj_lock_enabled()) {
+    return true;
+  }
+  std::unique_ptr<rgw::sal::Object::ReadOp> read_op = obj->get_read_op();
+  int ret = read_op->prepare(null_yield, dpp);
+  if (ret < 0) {
+    // a vanished object trivially passes; any other error blocks
+    return ret == -ENOENT;
+  }
+
+  auto& attrset = obj->get_attrs();
+  auto iter = attrset.find(RGW_ATTR_OBJECT_RETENTION);
+  if (iter != attrset.end()) {
+    RGWObjectRetention retention;
+    try {
+      decode(retention, iter->second);
+    } catch (buffer::error& err) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to decode RGWObjectRetention"
+			<< dendl;
+      return false;
+    }
+    if (ceph::real_clock::to_time_t(retention.get_retain_until_date()) >
+	ceph_clock_now()) {
+      return false;
+    }
+  }
+  iter = attrset.find(RGW_ATTR_OBJECT_LEGAL_HOLD);
+  if (iter != attrset.end()) {
+    RGWObjectLegalHold obj_legal_hold;
+    try {
+      decode(obj_legal_hold, iter->second);
+    } catch (buffer::error& err) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to decode RGWObjectLegalHold"
+			<< dendl;
+      return false;
+    }
+    if (obj_legal_hold.is_enabled()) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Paginated, unordered lister over a bucket (optionally restricted to
+// a prefix), used by the lifecycle worker. Fetches 1000 entries per
+// round and sleeps rgw_lc_thread_delay ms between rounds as a
+// throttle. Protocol: set_prefix() -> init() -> repeat
+// { get_obj(); ...; next(); }.
+class LCObjsLister {
+  rgw::sal::Driver* driver;
+  rgw::sal::Bucket* bucket;
+  rgw::sal::Bucket::ListParams list_params;
+  rgw::sal::Bucket::ListResults list_results;
+  string prefix;
+  vector<rgw_bucket_dir_entry>::iterator obj_iter;
+  rgw_bucket_dir_entry pre_obj; // last entry consumed via next(); used as paging marker
+  int64_t delay_ms;
+
+public:
+  LCObjsLister(rgw::sal::Driver* _driver, rgw::sal::Bucket* _bucket) :
+      driver(_driver), bucket(_bucket) {
+    // include versions on versioned buckets; unordered listing is
+    // sufficient for LC and cheaper
+    list_params.list_versions = bucket->versioned();
+    list_params.allow_unordered = true;
+    delay_ms = driver->ctx()->_conf.get_val<int64_t>("rgw_lc_thread_delay");
+  }
+
+  void set_prefix(const string& p) {
+    prefix = p;
+    list_params.prefix = prefix;
+  }
+
+  int init(const DoutPrefixProvider *dpp) {
+    return fetch(dpp);
+  }
+
+  // Fetch the next page (up to 1000 entries) and reset the cursor.
+  int fetch(const DoutPrefixProvider *dpp) {
+    int ret = bucket->list(dpp, list_params, 1000, list_results, null_yield);
+    if (ret < 0) {
+      return ret;
+    }
+
+    obj_iter = list_results.objs.begin();
+
+    return 0;
+  }
+
+  void delay() {
+    std::this_thread::sleep_for(std::chrono::milliseconds(delay_ms));
+  }
+
+  // Yield the current entry, transparently fetching the next page when
+  // the current one is exhausted; fetch_barrier runs before a refetch
+  // (e.g. to drain queued work). Returns false when fully done.
+  bool get_obj(const DoutPrefixProvider *dpp, rgw_bucket_dir_entry **obj,
+	       std::function<void(void)> fetch_barrier
+	       = []() { /* nada */}) {
+    if (obj_iter == list_results.objs.end()) {
+      if (!list_results.is_truncated) {
+        delay();
+        return false;
+      } else {
+        fetch_barrier();
+        // resume listing after the last entry we handed out
+        list_params.marker = pre_obj.key;
+        int ret = fetch(dpp);
+        if (ret < 0) {
+          ldpp_dout(dpp, 0) << "ERROR: list_op returned ret=" << ret
+			    << dendl;
+          return false;
+        }
+      }
+      delay();
+    }
+    /* returning address of entry in objs */
+    *obj = &(*obj_iter);
+    return obj_iter != list_results.objs.end();
+  }
+
+  rgw_bucket_dir_entry get_prev_obj() {
+    return pre_obj;
+  }
+
+  void next() {
+    pre_obj = *obj_iter;
+    ++obj_iter;
+  }
+
+  // Peek at the following entry's key, if one exists in this page.
+  boost::optional<std::string> next_key_name() {
+    if (obj_iter == list_results.objs.end() ||
+	(obj_iter + 1) == list_results.objs.end()) {
+      /* this should have been called after get_obj() was called, so this should
+       * only happen if is_truncated is false */
+      return boost::none;
+    }
+
+    return ((obj_iter + 1)->key.name);
+  }
+
+}; /* LCObjsLister */
+
+// Everything a lifecycle op needs to run against one bucket: the
+// expanded rule (lc_op), the driver, the owning worker, the target
+// bucket, and the shared object lister.
+struct op_env {
+
+  using LCWorker = RGWLC::LCWorker;
+
+  lc_op op;
+  rgw::sal::Driver* driver;
+  LCWorker* worker;
+  rgw::sal::Bucket* bucket;
+  LCObjsLister& ol;
+
+  op_env(lc_op& _op, rgw::sal::Driver* _driver, LCWorker* _worker,
+	 rgw::sal::Bucket* _bucket, LCObjsLister& _ol)
+    : op(_op), driver(_driver), worker(_worker), bucket(_bucket),
+      ol(_ol) {}
+}; /* op_env */
+
+class LCRuleOp;
+class WorkQ;
+
+// Per-object execution context for one rule application: the dir
+// entry being processed, its resolved sal Object, and cached
+// references into the op_env. Constructed per work item.
+struct lc_op_ctx {
+  CephContext *cct;
+  op_env env;
+  rgw_bucket_dir_entry o;
+  boost::optional<std::string> next_key_name;
+  ceph::real_time effective_mtime;
+
+  rgw::sal::Driver* driver;
+  rgw::sal::Bucket* bucket;
+  lc_op& op; // ok--refers to expanded env.op
+  LCObjsLister& ol;
+
+  std::unique_ptr<rgw::sal::Object> obj;
+  RGWObjectCtx rctx;
+  const DoutPrefixProvider *dpp;
+  WorkQ* wq;
+
+  std::unique_ptr<rgw::sal::PlacementTier> tier;
+
+  lc_op_ctx(op_env& env, rgw_bucket_dir_entry& o,
+	    boost::optional<std::string> next_key_name,
+	    ceph::real_time effective_mtime,
+	    const DoutPrefixProvider *dpp, WorkQ* wq)
+    : cct(env.driver->ctx()), env(env), o(o), next_key_name(next_key_name),
+      effective_mtime(effective_mtime),
+      driver(env.driver), bucket(env.bucket), op(env.op), ol(env.ol),
+      rctx(env.driver), dpp(dpp), wq(wq)
+    {
+      obj = bucket->get_object(o.key);
+    }
+
+  // True when the sibling (next) listing entry has the same key name,
+  // i.e. this entry is a noncurrent version of the same object.
+  bool next_has_same_name(const std::string& key_name) {
+    return (next_key_name && key_name.compare(
+	      boost::get<std::string>(next_key_name)) == 0);
+  }
+
+}; /* lc_op_ctx */
+
+
+static std::string lc_id = "rgw lifecycle";
+static std::string lc_req_id = "0";
+
+// Delete an expired object (or create a delete marker when
+// remove_indeed is false), publishing a bucket notification of
+// event_type around the delete. Reserves the notification before
+// deleting so a failed reservation defers the delete.
+// \return 0 on success, negative error code on failure
+static int remove_expired_obj(
+  const DoutPrefixProvider *dpp, lc_op_ctx& oc, bool remove_indeed,
+  rgw::notify::EventType event_type)
+{
+  auto& driver = oc.driver;
+  auto& bucket_info = oc.bucket->get_info();
+  auto& o = oc.o;
+  auto obj_key = o.key;
+  auto& meta = o.meta;
+  int ret;
+  std::string version_id;
+  std::unique_ptr<rgw::sal::Notification> notify;
+
+  if (!remove_indeed) {
+    // versioned-bucket expiration: drop the instance so the delete
+    // creates a delete marker instead of removing a version
+    obj_key.instance.clear();
+  } else if (obj_key.instance.empty()) {
+    obj_key.instance = "null";
+  }
+
+  std::unique_ptr<rgw::sal::Bucket> bucket;
+  std::unique_ptr<rgw::sal::Object> obj;
+
+  ret = driver->get_bucket(nullptr, bucket_info, &bucket);
+  if (ret < 0) {
+    return ret;
+  }
+
+  // XXXX currently, rgw::sal::Bucket.owner is always null here
+  std::unique_ptr<rgw::sal::User> user;
+  if (! bucket->get_owner()) {
+    auto& bucket_info = bucket->get_info();
+    user = driver->get_user(bucket_info.owner);
+    // forgive me, lord
+    if (user) {
+      bucket->set_owner(user.get());
+    }
+  }
+
+  obj = bucket->get_object(obj_key);
+
+  RGWObjState* obj_state{nullptr};
+  ret = obj->get_obj_state(dpp, &obj_state, null_yield, true);
+  if (ret < 0) {
+    return ret;
+  }
+
+  std::unique_ptr<rgw::sal::Object::DeleteOp> del_op
+    = obj->get_delete_op();
+  del_op->params.versioning_status
+    = obj->get_bucket()->get_info().versioning_status();
+  del_op->params.obj_owner.set_id(rgw_user {meta.owner});
+  del_op->params.obj_owner.set_name(meta.owner_display_name);
+  del_op->params.bucket_owner.set_id(bucket_info.owner);
+  del_op->params.unmod_since = meta.mtime;
+  del_op->params.marker_version_id = version_id;
+
+  // notification supported only for RADOS driver for now
+  notify = driver->get_notification(dpp, obj.get(), nullptr, event_type,
+				    bucket.get(), lc_id,
+				    const_cast<std::string&>(oc.bucket->get_tenant()),
+				    lc_req_id, null_yield);
+
+  ret = notify->publish_reserve(dpp, nullptr);
+  if (ret < 0) {
+    ldpp_dout(dpp, 1)
+      << "ERROR: notify reservation failed, deferring delete of object k="
+      << o.key
+      << dendl;
+    return ret;
+  }
+  ret = del_op->delete_obj(dpp, null_yield);
+  if (ret < 0) {
+    // fixed: this branch reports the delete failure (the notification
+    // reservation already succeeded above); the old message blamed
+    // notification publishing
+    ldpp_dout(dpp, 1) <<
+      "ERROR: delete_obj failed, with error: " << ret << dendl;
+  } else {
+    // send request to notification manager
+    (void) notify->publish_commit(dpp, obj_state->size,
+				  ceph::real_clock::now(),
+				  obj_state->attrset[RGW_ATTR_ETAG].to_str(),
+				  version_id);
+  }
+
+  return ret;
+
+} /* remove_expired_obj */
+
+// Action stage of a lifecycle rule. check() reports applicability and
+// the action's expiration time; process() executes it. Default
+// implementations match nothing / do nothing.
+class LCOpAction {
+public:
+  virtual ~LCOpAction() {}
+
+  virtual bool check(lc_op_ctx& oc, ceph::real_time *exp_time, const DoutPrefixProvider *dpp) {
+    return false;
+  }
+
+  /* called after check(). Check should tell us whether this action
+   * is applicable. If there are multiple actions, we'll end up executing
+   * the latest applicable action
+   * For example:
+   *  one action after 10 days, another after 20, third after 40.
+   *  After 10 days, the latest applicable action would be the first one,
+   *  after 20 days it will be the second one. After 21 days it will still be the
+   *  second one. So check() should return true for the second action at that point,
+   *  but should_process() if the action has already been applied. In object removal
+   *  it doesn't matter, but in object transition it does.
+   */
+  virtual bool should_process() {
+    return true;
+  }
+
+  virtual int process(lc_op_ctx& oc) {
+    return 0;
+  }
+
+  friend class LCOpRule;
+}; /* LCOpAction */
+
+// Filter stage of a lifecycle rule: check() decides whether an object
+// is subject to the rule at all (default: no match).
+class LCOpFilter {
+public:
+  virtual ~LCOpFilter() {}
+  virtual bool check(const DoutPrefixProvider *dpp, lc_op_ctx& oc) {
+    return false;
+  }
+}; /* LCOpFilter */
+
+// A compiled lifecycle rule: the ordered filters and actions built
+// (via build()) from an op_env, applied per object through process().
+class LCOpRule {
+  friend class LCOpAction;
+
+  op_env env;
+  boost::optional<std::string> next_key_name;
+  ceph::real_time effective_mtime;
+
+  std::vector<shared_ptr<LCOpFilter> > filters; // n.b., sharing ovhd
+  std::vector<shared_ptr<LCOpAction> > actions;
+
+public:
+  LCOpRule(op_env& _env) : env(_env) {}
+
+  boost::optional<std::string> get_next_key_name() {
+    return next_key_name;
+  }
+
+  std::vector<shared_ptr<LCOpAction>>& get_actions() {
+    return actions;
+  }
+
+  void build();
+  void update();
+  int process(rgw_bucket_dir_entry& o, const DoutPrefixProvider *dpp,
+	      WorkQ* wq);
+}; /* LCOpRule */
+
+// A unit of work for a WorkQ: a shutdown sentinel (void*), an
+// out-of-line object delete, an uncompleted-MPU expiration, or a bare
+// bucket dir entry.
+using WorkItem =
+  boost::variant<void*,
+		 /* out-of-line delete */
+		 std::tuple<LCOpRule, rgw_bucket_dir_entry>,
+		 /* uncompleted MPU expiration */
+		 std::tuple<lc_op, rgw_bucket_dir_entry>,
+		 rgw_bucket_dir_entry>;
+
+// One lifecycle worker queue + its consumer thread. Items are
+// enqueued (round-robin, by WorkPool) and consumed by this thread,
+// which applies the work function installed via setf(). The queue is
+// bounded by qmax with condition-variable backpressure; drain()
+// blocks the caller until the queue empties.
+class WorkQ : public Thread
+{
+public:
+  using unique_lock = std::unique_lock<std::mutex>;
+  using work_f = std::function<void(RGWLC::LCWorker*, WorkQ*, WorkItem&)>;
+  using dequeue_result = boost::variant<void*, WorkItem>;
+
+  static constexpr uint32_t FLAG_NONE = 0x0000;
+  static constexpr uint32_t FLAG_EWAIT_SYNC = 0x0001;  // producer waiting for space
+  static constexpr uint32_t FLAG_DWAIT_SYNC = 0x0002;  // consumer waiting for items
+  static constexpr uint32_t FLAG_EDRAIN_SYNC = 0x0004; // drain() in progress
+
+private:
+  const work_f bsf = [](RGWLC::LCWorker* wk, WorkQ* wq, WorkItem& wi) {}; // no-op default
+  RGWLC::LCWorker* wk;
+  uint32_t qmax;
+  int ix;
+  std::mutex mtx;
+  std::condition_variable cv;
+  uint32_t flags;
+  vector<WorkItem> items;
+  work_f f;
+
+public:
+  WorkQ(RGWLC::LCWorker* wk, uint32_t ix, uint32_t qmax)
+    : wk(wk), qmax(qmax), ix(ix), flags(FLAG_NONE), f(bsf)
+    {
+      create(thr_name().c_str());
+    }
+
+  std::string thr_name() {
+    return std::string{"wp_thrd: "}
+      + std::to_string(wk->ix) + ", " + std::to_string(ix);
+  }
+
+  void setf(work_f _f) {
+    f = _f;
+  }
+
+  // Producer side: wait (re-checking shutdown every 200ms) while the
+  // queue is over qmax, then append and wake a waiting consumer.
+  void enqueue(WorkItem&& item) {
+    unique_lock uniq(mtx);
+    while ((!wk->get_lc()->going_down()) &&
+	   (items.size() > qmax)) {
+      flags |= FLAG_EWAIT_SYNC;
+      cv.wait_for(uniq, 200ms);
+    }
+    // fixed: move the rvalue-reference parameter instead of copying it
+    items.push_back(std::move(item));
+    if (flags & FLAG_DWAIT_SYNC) {
+      flags &= ~FLAG_DWAIT_SYNC;
+      cv.notify_one();
+    }
+  }
+
+  // Block until the consumer observes an empty queue (LIFO order means
+  // the flag is cleared only when no work remains).
+  void drain() {
+    unique_lock uniq(mtx);
+    flags |= FLAG_EDRAIN_SYNC;
+    while (flags & FLAG_EDRAIN_SYNC) {
+      cv.wait_for(uniq, 200ms);
+    }
+  }
+
+private:
+  dequeue_result dequeue() {
+    unique_lock uniq(mtx);
+    while ((!wk->get_lc()->going_down()) &&
+	   (items.size() == 0)) {
+      /* clear drain state, as we are NOT doing work and qlen==0 */
+      if (flags & FLAG_EDRAIN_SYNC) {
+	flags &= ~FLAG_EDRAIN_SYNC;
+      }
+      flags |= FLAG_DWAIT_SYNC;
+      cv.wait_for(uniq, 200ms);
+    }
+    if (items.size() > 0) {
+      // fixed: move the item out of the vector rather than copy it
+      auto item = std::move(items.back());
+      items.pop_back();
+      if (flags & FLAG_EWAIT_SYNC) {
+	flags &= ~FLAG_EWAIT_SYNC;
+	cv.notify_one();
+      }
+      return {item};
+    }
+    return nullptr;
+  }
+
+  // Consumer thread body: pull items and run the work function until
+  // shutdown (a void* result signals going-down).
+  void* entry() override {
+    while (!wk->get_lc()->going_down()) {
+      auto item = dequeue();
+      if (item.which() == 0) {
+	/* going down */
+	break;
+      }
+      f(wk, this, boost::get<WorkItem>(item));
+    }
+    return nullptr;
+  }
+}; /* WorkQ */
+
+// Fixed-size pool of WorkQ threads owned by one LCWorker. Work items
+// are distributed round-robin across the queues; drain() waits for
+// all queues, and the destructor joins every thread.
+class RGWLC::WorkPool
+{
+  using TVector = ceph::containers::tiny_vector<WorkQ, 3>;
+  TVector wqs;
+  uint64_t ix; // round-robin cursor for enqueue()
+
+public:
+  WorkPool(RGWLC::LCWorker* wk, uint16_t n_threads, uint32_t qmax)
+    : wqs(TVector{
+	n_threads,
+	[&](const size_t ix, auto emplacer) {
+	  emplacer.emplace(wk, ix, qmax);
+	}}),
+      ix(0)
+    {}
+
+  ~WorkPool() {
+    for (auto& wq : wqs) {
+      wq.join();
+    }
+  }
+
+  // Install the same work function on every queue.
+  void setf(WorkQ::work_f _f) {
+    for (auto& wq : wqs) {
+      wq.setf(_f);
+    }
+  }
+
+  void enqueue(WorkItem item) {
+    const auto tix = ix;
+    ix = (ix+1) % wqs.size();
+    (wqs[tix]).enqueue(std::move(item));
+  }
+
+  void drain() {
+    for (auto& wq : wqs) {
+      wq.drain();
+    }
+  }
+}; /* WorkPool */
+
+// Construct one lifecycle worker with its own work pool, sized by
+// rgw_lc_max_wp_worker threads and a fixed per-queue depth of 512.
+RGWLC::LCWorker::LCWorker(const DoutPrefixProvider* dpp, CephContext *cct,
+			  RGWLC *lc, int ix)
+  : dpp(dpp), cct(cct), lc(lc), ix(ix)
+{
+  auto wpw = cct->_conf.get_val<int64_t>("rgw_lc_max_wp_worker");
+  workpool = new WorkPool(this, wpw, 512);
+}
+
+// True when the scheduled interval budget (stop_at) has elapsed.
+// A "once" (manually triggered) run ignores the budget entirely.
+static inline bool worker_should_stop(time_t stop_at, bool once)
+{
+  return !once && stop_at < time(nullptr);
+}
+
+// For each prefix whose rule enables multipart expiration, list the
+// bucket's multipart-meta namespace and enqueue each upload for
+// abort if it has outlived rule.mp_expiration days. Work is executed
+// on the worker's pool; respects the stop_at interval budget.
+int RGWLC::handle_multipart_expiration(rgw::sal::Bucket* target,
+				       const multimap<string, lc_op>& prefix_map,
+				       LCWorker* worker, time_t stop_at, bool once)
+{
+  MultipartMetaFilter mp_filter;
+  int ret;
+  rgw::sal::Bucket::ListParams params;
+  rgw::sal::Bucket::ListResults results;
+  auto delay_ms = cct->_conf.get_val<int64_t>("rgw_lc_thread_delay");
+  params.list_versions = false;
+  /* lifecycle processing does not depend on total order, so can
+   * take advantage of unordered listing optimizations--such as
+   * operating on one shard at a time */
+  params.allow_unordered = true;
+  params.ns = RGW_OBJ_NS_MULTIPART;
+  params.access_list_filter = &mp_filter;
+
+  // per-item work function: abort the upload when expired
+  auto pf = [&](RGWLC::LCWorker* wk, WorkQ* wq, WorkItem& wi) {
+    auto wt = boost::get<std::tuple<lc_op, rgw_bucket_dir_entry>>(wi);
+    auto& [rule, obj] = wt;
+    if (obj_has_expired(this, cct, obj.meta.mtime, rule.mp_expiration)) {
+      rgw_obj_key key(obj.key);
+      std::unique_ptr<rgw::sal::MultipartUpload> mpu = target->get_multipart_upload(key.name);
+      int ret = mpu->abort(this, cct);
+      if (ret == 0) {
+        if (perfcounter) {
+          perfcounter->inc(l_rgw_lc_abort_mpu, 1);
+        }
+      } else {
+	// already-gone uploads are logged at a lower level
+	if (ret == -ERR_NO_SUCH_UPLOAD) {
+	  ldpp_dout(wk->get_lc(), 5)
+	    << "ERROR: abort_multipart_upload failed, ret=" << ret
+	    << ", thread:" << wq->thr_name()
+	    << ", meta:" << obj.key
+	    << dendl;
+	} else {
+	  ldpp_dout(wk->get_lc(), 0)
+	    << "ERROR: abort_multipart_upload failed, ret=" << ret
+	    << ", thread:" << wq->thr_name()
+	    << ", meta:" << obj.key
+	    << dendl;
+	}
+      } /* abort failed */
+    } /* expired */
+  };
+
+  worker->workpool->setf(pf);
+
+  for (auto prefix_iter = prefix_map.begin(); prefix_iter != prefix_map.end();
+       ++prefix_iter) {
+
+    if (worker_should_stop(stop_at, once)) {
+      ldpp_dout(this, 5) << __func__ << " interval budget EXPIRED worker "
+			 << worker->ix
+			 << dendl;
+      return 0;
+    }
+
+    if (!prefix_iter->second.status || prefix_iter->second.mp_expiration <= 0) {
+      continue;
+    }
+    params.prefix = prefix_iter->first;
+    do {
+      // NOTE(review): offset is re-declared (reset to 0) on every
+      // page, so the %100 budget check below keys off the in-page
+      // position -- confirm that is the intent
+      auto offset = 0;
+      results.objs.clear();
+      ret = target->list(this, params, 1000, results, null_yield);
+      if (ret < 0) {
+	  if (ret == (-ENOENT))
+	    return 0;
+        ldpp_dout(this, 0) << "ERROR: driver->list_objects():" <<dendl;
+        return ret;
+      }
+
+      for (auto obj_iter = results.objs.begin(); obj_iter != results.objs.end(); ++obj_iter, ++offset) {
+	std::tuple<lc_op, rgw_bucket_dir_entry> t1 =
+	  {prefix_iter->second, *obj_iter};
+	worker->workpool->enqueue(WorkItem{t1});
+	if (going_down()) {
+	  return 0;
+	}
+      } /* for objs */
+
+      // re-check the interval budget periodically
+      if ((offset % 100) == 0) {
+	if (worker_should_stop(stop_at, once)) {
+	  ldpp_dout(this, 5) << __func__ << " interval budget EXPIRED worker "
+			     << worker->ix
+			     << dendl;
+	  return 0;
+	}
+      }
+
+      std::this_thread::sleep_for(std::chrono::milliseconds(delay_ms));
+    } while(results.is_truncated);
+  } /* for prefix_map */
+
+  worker->workpool->drain();
+  return 0;
+} /* RGWLC::handle_multipart_expiration */
+
/* Read the RGW_ATTR_TAGS xattr of obj into tags_bl via a read op.
 * Returns the attr-read result (<0 on error; callers treat -ENODATA
 * as "object has no tags"). */
static int read_obj_tags(const DoutPrefixProvider *dpp, rgw::sal::Object* obj, bufferlist& tags_bl)
{
  std::unique_ptr<rgw::sal::Object::ReadOp> rop = obj->get_read_op();

  return rop->get_attr(dpp, RGW_ATTR_TAGS, tags_bl, null_yield);
}
+
/* A rule is actionable iff it is enabled and specifies at least one
 * action: expiration (by days or date), noncurrent expiration,
 * delete-marker expiration, or a (non)current transition. */
static bool is_valid_op(const lc_op& op)
{
  return (op.status &&
          (op.expiration > 0
           || op.expiration_date != boost::none
           || op.noncur_expiration > 0
           || op.dm_expiration
           || !op.transitions.empty()
           || !op.noncur_transitions.empty()));
}
+
/* Gate rule execution by zone tier: rules flagged ArchiveZone run only
 * in "archive"-tier zones, all other rules run only elsewhere. */
static bool zone_check(const lc_op& op, rgw::sal::Zone* zone)
{

  if (zone->get_tier_type() == "archive") {
    return (op.rule_flags & uint32_t(LCFlagType::ArchiveZone));
  } else {
    return (! (op.rule_flags & uint32_t(LCFlagType::ArchiveZone)));
  }
}
+
/* Return true iff every tag in the rule's tag filter appears on the
 * object with an equal value (the object may carry extra tags).
 * Counts object tags that match a rule tag by key and value, then
 * compares the match count to the rule's tag count. */
static inline bool has_all_tags(const lc_op& rule_action,
                                const RGWObjTags& object_tags)
{
  if(! rule_action.obj_tags)
    return false;
  /* quick reject: object cannot satisfy a filter larger than its tag set */
  if(object_tags.count() < rule_action.obj_tags->count())
    return false;
  size_t tag_count = 0;
  for (const auto& tag : object_tags.get_tags()) {
    const auto& rule_tags = rule_action.obj_tags->get_tags();
    const auto& iter = rule_tags.find(tag.first);
    if(iter == rule_tags.end())
      continue;
    if(iter->second == tag.second)
    {
      tag_count++;
    }
  /* all tags in the rule appear in obj tags */
  }
  return tag_count == rule_action.obj_tags->count();
}
+
/* Evaluate the rule's tag filter against the object's stored tags.
 * Sets *skip=false when the object satisfies the filter (or the rule
 * has none).  Returns 0 in the normal case -- including read failures,
 * which simply leave *skip=true -- and -EIO only if the stored TagSet
 * cannot be decoded. */
static int check_tags(const DoutPrefixProvider *dpp, lc_op_ctx& oc, bool *skip)
{
  auto& op = oc.op;

  if (op.obj_tags != boost::none) {
    /* default to skipping until the filter is proven satisfied */
    *skip = true;

    bufferlist tags_bl;
    int ret = read_obj_tags(dpp, oc.obj.get(), tags_bl);
    if (ret < 0) {
      /* -ENODATA just means the object has no tags; not worth logging */
      if (ret != -ENODATA) {
        ldpp_dout(oc.dpp, 5) << "ERROR: read_obj_tags returned r="
                             << ret << " " << oc.wq->thr_name() << dendl;
      }
      return 0;
    }
    RGWObjTags dest_obj_tags;
    try {
      auto iter = tags_bl.cbegin();
      dest_obj_tags.decode(iter);
    } catch (buffer::error& err) {
      ldpp_dout(oc.dpp,0) << "ERROR: caught buffer::error, couldn't decode TagSet "
                          << oc.wq->thr_name() << dendl;
      return -EIO;
    }

    if (! has_all_tags(op, dest_obj_tags)) {
      ldpp_dout(oc.dpp, 20) << __func__ << "() skipping obj " << oc.obj
                            << " as tags do not match in rule: "
                            << op.id << " "
                            << oc.wq->thr_name() << dendl;
      return 0;
    }
  }
  *skip = false;
  return 0;
}
+
/* Filter that admits an object only if it satisfies the rule's tag
 * filter.  Delete markers pass unconditionally (they carry no tags to
 * read); otherwise the decision is delegated to check_tags(). */
class LCOpFilter_Tags : public LCOpFilter {
public:
  bool check(const DoutPrefixProvider *dpp, lc_op_ctx& oc) override {
    auto& o = oc.o;

    if (o.is_delete_marker()) {
      return true;
    }

    bool skip;

    int ret = check_tags(dpp, oc, &skip);
    if (ret < 0) {
      if (ret == -ENOENT) {
        return false;
      }
      ldpp_dout(oc.dpp, 0) << "ERROR: check_tags on obj=" << oc.obj
                           << " returned ret=" << ret << " "
                           << oc.wq->thr_name() << dendl;
      return false;
    }

    return !skip;
  };
};
+
/* Expiration action for the current version of an object.  check()
 * decides eligibility from the bucket-index entry alone; process()
 * removes the object (or orphaned delete marker) and bumps the
 * matching perf counter. */
class LCOpAction_CurrentExpiration : public LCOpAction {
public:
  LCOpAction_CurrentExpiration(op_env& env) {}

  bool check(lc_op_ctx& oc, ceph::real_time *exp_time, const DoutPrefixProvider *dpp) override {
    auto& o = oc.o;
    if (!o.is_current()) {
      ldpp_dout(dpp, 20) << __func__ << "(): key=" << o.key
                         << ": not current, skipping "
                         << oc.wq->thr_name() << dendl;
      return false;
    }
    if (o.is_delete_marker()) {
      /* a current delete marker is only removable when it is orphaned,
       * i.e. the next listing entry is NOT another version of the same
       * object name */
      if (oc.next_key_name) {
        std::string nkn = *oc.next_key_name;
        if (oc.next_has_same_name(o.key.name)) {
          ldpp_dout(dpp, 7) << __func__ << "(): dm-check SAME: key=" << o.key
                            << " next_key_name: %%" << nkn << "%% "
                            << oc.wq->thr_name() << dendl;
          return false;
        } else {
          ldpp_dout(dpp, 7) << __func__ << "(): dm-check DELE: key=" << o.key
                            << " next_key_name: %%" << nkn << "%% "
                            << oc.wq->thr_name() << dendl;
          *exp_time = real_clock::now();
          return true;
        }
      }
      return false;
    }

    auto& mtime = o.meta.mtime;
    bool is_expired;
    auto& op = oc.op;
    if (op.expiration <= 0) {
      /* no day-based expiration: fall back to the rule's fixed date,
       * if any */
      if (op.expiration_date == boost::none) {
        ldpp_dout(dpp, 20) << __func__ << "(): key=" << o.key
                           << ": no expiration set in rule, skipping "
                           << oc.wq->thr_name() << dendl;
        return false;
      }
      is_expired = ceph_clock_now() >=
        ceph::real_clock::to_time_t(*op.expiration_date);
      *exp_time = *op.expiration_date;
    } else {
      is_expired = obj_has_expired(dpp, oc.cct, mtime, op.expiration, exp_time);
    }

    ldpp_dout(dpp, 20) << __func__ << "(): key=" << o.key << ": is_expired="
                       << (int)is_expired << " "
                       << oc.wq->thr_name() << dendl;
    return is_expired;
  }

  int process(lc_op_ctx& oc) {
    auto& o = oc.o;
    int r;
    if (o.is_delete_marker()) {
      r = remove_expired_obj(oc.dpp, oc, true,
                             rgw::notify::ObjectExpirationDeleteMarker);
      if (r < 0) {
        ldpp_dout(oc.dpp, 0) << "ERROR: current is-dm remove_expired_obj "
                             << oc.bucket << ":" << o.key
                             << " " << cpp_strerror(r) << " "
                             << oc.wq->thr_name() << dendl;
        return r;
      }
      ldpp_dout(oc.dpp, 2) << "DELETED: current is-dm "
                           << oc.bucket << ":" << o.key
                           << " " << oc.wq->thr_name() << dendl;
    } else {
      /* ! o.is_delete_marker() */
      /* on an unversioned bucket this is a full delete; on a versioned
       * bucket remove_expired_obj(..., false, ...) lays down a delete
       * marker instead */
      r = remove_expired_obj(oc.dpp, oc, !oc.bucket->versioned(),
                             rgw::notify::ObjectExpirationCurrent);
      if (r < 0) {
        ldpp_dout(oc.dpp, 0) << "ERROR: remove_expired_obj "
                             << oc.bucket << ":" << o.key
                             << " " << cpp_strerror(r) << " "
                             << oc.wq->thr_name() << dendl;
        return r;
      }
      if (perfcounter) {
        perfcounter->inc(l_rgw_lc_expire_current, 1);
      }
      ldpp_dout(oc.dpp, 2) << "DELETED:" << oc.bucket << ":" << o.key
                           << " " << oc.wq->thr_name() << dendl;
    }
    return 0;
  }
};
+
/* Expiration action for noncurrent object versions.  Eligibility is
 * measured from oc.effective_mtime (the mtime of the preceding listing
 * entry, cached by LCOpRule::update()) and is additionally subject to
 * an object-lock check. */
class LCOpAction_NonCurrentExpiration : public LCOpAction {
protected:
public:
  LCOpAction_NonCurrentExpiration(op_env& env)
  {}

  bool check(lc_op_ctx& oc, ceph::real_time *exp_time, const DoutPrefixProvider *dpp) override {
    auto& o = oc.o;
    if (o.is_current()) {
      ldpp_dout(dpp, 20) << __func__ << "(): key=" << o.key
                         << ": current version, skipping "
                         << oc.wq->thr_name() << dendl;
      return false;
    }

    int expiration = oc.op.noncur_expiration;
    bool is_expired = obj_has_expired(dpp, oc.cct, oc.effective_mtime, expiration,
                                      exp_time);

    ldpp_dout(dpp, 20) << __func__ << "(): key=" << o.key << ": is_expired="
                       << is_expired << " "
                       << oc.wq->thr_name() << dendl;

    /* a locked (retention/legal-hold) version must not be expired */
    return is_expired &&
      pass_object_lock_check(oc.driver, oc.obj.get(), dpp);
  }

  int process(lc_op_ctx& oc) {
    auto& o = oc.o;
    int r = remove_expired_obj(oc.dpp, oc, true,
                               rgw::notify::ObjectExpirationNoncurrent);
    if (r < 0) {
      ldpp_dout(oc.dpp, 0) << "ERROR: remove_expired_obj (non-current expiration) "
                           << oc.bucket << ":" << o.key
                           << " " << cpp_strerror(r)
                           << " " << oc.wq->thr_name() << dendl;
      return r;
    }
    if (perfcounter) {
      perfcounter->inc(l_rgw_lc_expire_noncurrent, 1);
    }
    ldpp_dout(oc.dpp, 2) << "DELETED:" << oc.bucket << ":" << o.key
                         << " (non-current expiration) "
                         << oc.wq->thr_name() << dendl;
    return 0;
  }
};
+
/* ExpiredObjectDeleteMarker action: removes a delete marker that has
 * become the only remaining record of an object (i.e. the next listing
 * entry is a different object name). */
class LCOpAction_DMExpiration : public LCOpAction {
public:
  LCOpAction_DMExpiration(op_env& env) {}

  bool check(lc_op_ctx& oc, ceph::real_time *exp_time, const DoutPrefixProvider *dpp) override {
    auto& o = oc.o;
    if (!o.is_delete_marker()) {
      ldpp_dout(dpp, 20) << __func__ << "(): key=" << o.key
                         << ": not a delete marker, skipping "
                         << oc.wq->thr_name() << dendl;
      return false;
    }
    if (oc.next_has_same_name(o.key.name)) {
      /* other versions of this object still exist; marker not orphaned */
      ldpp_dout(dpp, 20) << __func__ << "(): key=" << o.key
                         << ": next is same object, skipping "
                         << oc.wq->thr_name() << dendl;
      return false;
    }

    /* orphaned markers expire immediately */
    *exp_time = real_clock::now();

    return true;
  }

  int process(lc_op_ctx& oc) {
    auto& o = oc.o;
    int r = remove_expired_obj(oc.dpp, oc, true,
                               rgw::notify::ObjectExpirationDeleteMarker);
    if (r < 0) {
      ldpp_dout(oc.dpp, 0) << "ERROR: remove_expired_obj (delete marker expiration) "
                           << oc.bucket << ":" << o.key
                           << " " << cpp_strerror(r)
                           << " " << oc.wq->thr_name()
                           << dendl;
      return r;
    }
    if (perfcounter) {
      perfcounter->inc(l_rgw_lc_expire_dm, 1);
    }
    ldpp_dout(oc.dpp, 2) << "DELETED:" << oc.bucket << ":" << o.key
                         << " (delete marker expiration) "
                         << oc.wq->thr_name() << dendl;
    return 0;
  }
};
+
/* Base class for storage-class transition actions.  check() decides
 * time eligibility (by days or fixed date) and records, in
 * need_to_process, whether the object is not already in the target
 * storage class.  process() either transitions the object to a
 * "cloud-s3" tier (possibly deleting the local copy) or performs an
 * in-cluster placement transition.  Subclasses select current vs
 * noncurrent versions and the effective mtime. */
class LCOpAction_Transition : public LCOpAction {
  const transition_action& transition;
  bool need_to_process{false};

protected:
  virtual bool check_current_state(bool is_current) = 0;
  virtual ceph::real_time get_effective_mtime(lc_op_ctx& oc) = 0;
public:
  LCOpAction_Transition(const transition_action& _transition)
    : transition(_transition) {}

  bool check(lc_op_ctx& oc, ceph::real_time *exp_time, const DoutPrefixProvider *dpp) override {
    auto& o = oc.o;

    /* delete markers have no data to transition */
    if (o.is_delete_marker()) {
      return false;
    }

    if (!check_current_state(o.is_current())) {
      return false;
    }

    auto mtime = get_effective_mtime(oc);
    bool is_expired;
    if (transition.days < 0) {
      if (transition.date == boost::none) {
        ldpp_dout(dpp, 20) << __func__ << "(): key=" << o.key
                           << ": no transition day/date set in rule, skipping "
                           << oc.wq->thr_name() << dendl;
        return false;
      }
      is_expired = ceph_clock_now() >=
        ceph::real_clock::to_time_t(*transition.date);
      *exp_time = *transition.date;
    } else {
      is_expired = obj_has_expired(dpp, oc.cct, mtime, transition.days, exp_time);
    }

    ldpp_dout(oc.dpp, 20) << __func__ << "(): key=" << o.key << ": is_expired="
                          << is_expired << " "
                          << oc.wq->thr_name() << dendl;

    /* only worth processing if the object is not already stored in the
     * rule's target storage class */
    need_to_process =
      (rgw_placement_rule::get_canonical_storage_class(o.meta.storage_class) !=
       transition.storage_class);

    return is_expired;
  }

  bool should_process() override {
    return need_to_process;
  }

  /* Remove the local object after a cloud transition: on a versioned
   * bucket the current version gets a delete marker, otherwise the
   * version is removed outright. */
  int delete_tier_obj(lc_op_ctx& oc) {
    int ret = 0;

    /* If bucket is versioned, create delete_marker for current version
     */
    if (oc.bucket->versioned() && oc.o.is_current() && !oc.o.is_delete_marker()) {
      ret = remove_expired_obj(oc.dpp, oc, false, rgw::notify::ObjectExpiration);
      ldpp_dout(oc.dpp, 20) << "delete_tier_obj Object(key:" << oc.o.key << ") current & not delete_marker" << " versioned_epoch: " << oc.o.versioned_epoch << "flags: " << oc.o.flags << dendl;
    } else {
      ret = remove_expired_obj(oc.dpp, oc, true, rgw::notify::ObjectExpiration);
      ldpp_dout(oc.dpp, 20) << "delete_tier_obj Object(key:" << oc.o.key << ") not current " << "versioned_epoch: " << oc.o.versioned_epoch << "flags: " << oc.o.flags << dendl;
    }
    return ret;
  }

  /* Copy the object to the rule's cloud tier, then delete the local
   * copy unless the tier is configured to retain a head object. */
  int transition_obj_to_cloud(lc_op_ctx& oc) {
    /* If CurrentVersion object, remove it & create delete marker */
    bool delete_object = (!oc.tier->retain_head_object() ||
                          (oc.o.is_current() && oc.bucket->versioned()));

    int ret = oc.obj->transition_to_cloud(oc.bucket, oc.tier.get(), oc.o,
                                          oc.env.worker->get_cloud_targets(), oc.cct,
                                          !delete_object, oc.dpp, null_yield);
    if (ret < 0) {
      return ret;
    }

    if (delete_object) {
      ret = delete_tier_obj(oc);
      if (ret < 0) {
        ldpp_dout(oc.dpp, 0) << "ERROR: Deleting tier object(" << oc.o.key << ") failed ret=" << ret << dendl;
        return ret;
      }
    }

    return 0;
  }

  int process(lc_op_ctx& oc) {
    auto& o = oc.o;
    int r;

    if (oc.o.meta.category == RGWObjCategory::CloudTiered) {
      /* Skip objects which are already cloud tiered. */
      ldpp_dout(oc.dpp, 30) << "Object(key:" << oc.o.key << ") is already cloud tiered to cloud-s3 tier: " << oc.o.meta.storage_class << dendl;
      return 0;
    }

    std::string tier_type = "";
    rgw::sal::ZoneGroup& zonegroup = oc.driver->get_zone()->get_zonegroup();

    /* target placement = bucket's placement with the rule's target
     * storage class substituted */
    rgw_placement_rule target_placement;
    target_placement.inherit_from(oc.bucket->get_placement_rule());
    target_placement.storage_class = transition.storage_class;

    r = zonegroup.get_placement_tier(target_placement, &oc.tier);

    if (!r && oc.tier->get_tier_type() == "cloud-s3") {
      ldpp_dout(oc.dpp, 30) << "Found cloud s3 tier: " << target_placement.storage_class << dendl;
      /* NOTE(review): the object-lock check is applied only to
       * non-current versions here -- confirm current versions are
       * intended to bypass it on the cloud-s3 path */
      if (!oc.o.is_current() &&
          !pass_object_lock_check(oc.driver, oc.obj.get(), oc.dpp)) {
        /* Skip objects which has object lock enabled. */
        ldpp_dout(oc.dpp, 10) << "Object(key:" << oc.o.key << ") is locked. Skipping transition to cloud-s3 tier: " << target_placement.storage_class << dendl;
        return 0;
      }

      r = transition_obj_to_cloud(oc);
      if (r < 0) {
        ldpp_dout(oc.dpp, 0) << "ERROR: failed to transition obj(key:" << oc.o.key << ") to cloud (r=" << r << ")"
                             << dendl;
        return r;
      }
    } else {
      /* in-cluster transition: validate and change placement */
      if (!oc.driver->valid_placement(target_placement)) {
        ldpp_dout(oc.dpp, 0) << "ERROR: non existent dest placement: "
                             << target_placement
                             << " bucket="<< oc.bucket
                             << " rule_id=" << oc.op.id
                             << " " << oc.wq->thr_name() << dendl;
        return -EINVAL;
      }

      int r = oc.obj->transition(oc.bucket, target_placement, o.meta.mtime,
                                 o.versioned_epoch, oc.dpp, null_yield);
      if (r < 0) {
        ldpp_dout(oc.dpp, 0) << "ERROR: failed to transition obj "
                             << oc.bucket << ":" << o.key
                             << " -> " << transition.storage_class
                             << " " << cpp_strerror(r)
                             << " " << oc.wq->thr_name() << dendl;
        return r;
      }
    }
    ldpp_dout(oc.dpp, 2) << "TRANSITIONED:" << oc.bucket
                         << ":" << o.key << " -> "
                         << transition.storage_class
                         << " " << oc.wq->thr_name() << dendl;
    return 0;
  }
};
+
/* Transition action for current versions: eligibility uses the
 * entry's own mtime; success bumps l_rgw_lc_transition_current. */
class LCOpAction_CurrentTransition : public LCOpAction_Transition {
protected:
  bool check_current_state(bool is_current) override {
    return is_current;
  }

  ceph::real_time get_effective_mtime(lc_op_ctx& oc) override {
    return oc.o.meta.mtime;
  }
public:
  LCOpAction_CurrentTransition(const transition_action& _transition)
    : LCOpAction_Transition(_transition) {}
  int process(lc_op_ctx& oc) {
    int r = LCOpAction_Transition::process(oc);
    if (r == 0) {
      if (perfcounter) {
        perfcounter->inc(l_rgw_lc_transition_current, 1);
      }
    }
    return r;
  }
};
+
/* Transition action for noncurrent versions: eligibility uses
 * oc.effective_mtime (cached by LCOpRule::update()); success bumps
 * l_rgw_lc_transition_noncurrent. */
class LCOpAction_NonCurrentTransition : public LCOpAction_Transition {
protected:
  bool check_current_state(bool is_current) override {
    return !is_current;
  }

  ceph::real_time get_effective_mtime(lc_op_ctx& oc) override {
    return oc.effective_mtime;
  }
public:
  LCOpAction_NonCurrentTransition(op_env& env,
                                  const transition_action& _transition)
    : LCOpAction_Transition(_transition)
  {}
  int process(lc_op_ctx& oc) {
    int r = LCOpAction_Transition::process(oc);
    if (r == 0) {
      if (perfcounter) {
        perfcounter->inc(l_rgw_lc_transition_noncurrent, 1);
      }
    }
    return r;
  }
};
+
/* Instantiate the filter and action objects implied by this rule's
 * lc_op: always the tag filter, plus one action per configured
 * expiration/transition clause. */
void LCOpRule::build()
{
  filters.emplace_back(new LCOpFilter_Tags);

  auto& op = env.op;

  if (op.expiration > 0 ||
      op.expiration_date != boost::none) {
    actions.emplace_back(new LCOpAction_CurrentExpiration(env));
  }

  if (op.dm_expiration) {
    actions.emplace_back(new LCOpAction_DMExpiration(env));
  }

  if (op.noncur_expiration > 0) {
    actions.emplace_back(new LCOpAction_NonCurrentExpiration(env));
  }

  /* one transition action per target storage class */
  for (auto& iter : op.transitions) {
    actions.emplace_back(new LCOpAction_CurrentTransition(iter.second));
  }

  for (auto& iter : op.noncur_transitions) {
    actions.emplace_back(new LCOpAction_NonCurrentTransition(env, iter.second));
  }
}
+
/* Refresh per-entry listing context before processing the next entry:
 * the name of the next key (for orphaned-delete-marker checks) and the
 * previous entry's mtime (used as effective_mtime for noncurrent
 * actions). */
void LCOpRule::update()
{
  next_key_name = env.ol.next_key_name();
  effective_mtime = env.ol.get_prev_obj().meta.mtime;
}
+
/* Apply this rule to one bucket-index entry.  Runs every action's
 * check() and selects the action with the latest expiration time;
 * filters run after the action checks (see comment below) and can veto
 * processing.  Returns 0 when nothing applies or on success, <0 on a
 * processing error. */
int LCOpRule::process(rgw_bucket_dir_entry& o,
                      const DoutPrefixProvider *dpp,
                      WorkQ* wq)
{
  lc_op_ctx ctx(env, o, next_key_name, effective_mtime, dpp, wq);
  shared_ptr<LCOpAction> *selected = nullptr; // n.b., req'd by sharing
  real_time exp;

  for (auto& a : actions) {
    real_time action_exp;

    if (a->check(ctx, &action_exp, dpp)) {
      /* keep the action with the greatest expiration time */
      if (action_exp > exp) {
        exp = action_exp;
        selected = &a;
      }
    }
  }

  if (selected &&
      (*selected)->should_process()) {

    /*
     * Calling filter checks after action checks because
     * all action checks (as they are implemented now) do
     * not access the objects themselves, but return result
     * from info from bucket index listing. The current tags filter
     * check does access the objects, so we avoid unnecessary rados calls
     * having filters check later in the process.
     */

    bool cont = false;
    for (auto& f : filters) {
      if (f->check(dpp, ctx)) {
        cont = true;
        break;
      }
    }

    if (!cont) {
      ldpp_dout(dpp, 20) << __func__ << "(): key=" << o.key
                         << ": no rule match, skipping "
                         << wq->thr_name() << dendl;
      return 0;
    }

    int r = (*selected)->process(ctx);
    if (r < 0) {
      ldpp_dout(dpp, 0) << "ERROR: remove_expired_obj "
                        << env.bucket << ":" << o.key
                        << " " << cpp_strerror(r)
                        << " " << wq->thr_name() << dendl;
      return r;
    }
    ldpp_dout(dpp, 20) << "processed:" << env.bucket << ":"
                       << o.key << " " << wq->thr_name() << dendl;
  }

  return 0;

}
+
/* Run lifecycle processing for one bucket.  shard_id is formatted
 * "tenant:bucket_name:bucket_marker" (see get_bucket_lc_key()).
 * Loads the bucket and its RGW_ATTR_LC configuration, then for each
 * actionable prefix rule lists the bucket and dispatches each entry to
 * the worker pool for rule processing; finally handles multipart
 * expiration.  Returns -ENOENT for a stale entry (bucket marker
 * mismatch) so the caller can remove it. */
int RGWLC::bucket_lc_process(string& shard_id, LCWorker* worker,
                             time_t stop_at, bool once)
{
  RGWLifecycleConfiguration config(cct);
  std::unique_ptr<rgw::sal::Bucket> bucket;
  string no_ns, list_versions;
  vector<rgw_bucket_dir_entry> objs;
  vector<std::string> result;
  /* NOTE(review): result size is not checked after split; assumes
   * shard_id always carries both ':' separators -- confirm producers */
  boost::split(result, shard_id, boost::is_any_of(":"));
  string bucket_tenant = result[0];
  string bucket_name = result[1];
  string bucket_marker = result[2];

  ldpp_dout(this, 5) << "RGWLC::bucket_lc_process ENTER " << bucket_name << dendl;
  if (unlikely(cct->_conf->rgwlc_skip_bucket_step)) {
    /* test/debug hook: skip all per-bucket work */
    return 0;
  }

  int ret = driver->get_bucket(this, nullptr, bucket_tenant, bucket_name, &bucket, null_yield);
  if (ret < 0) {
    ldpp_dout(this, 0) << "LC:get_bucket for " << bucket_name
                       << " failed" << dendl;
    return ret;
  }

  ret = bucket->load_bucket(this, null_yield);
  if (ret < 0) {
    ldpp_dout(this, 0) << "LC:load_bucket for " << bucket_name
                       << " failed" << dendl;
    return ret;
  }

  /* ensure queued work is drained on every exit path */
  auto stack_guard = make_scope_guard(
    [&worker]
    {
      worker->workpool->drain();
    }
  );

  if (bucket->get_marker() != bucket_marker) {
    /* the bucket was deleted and recreated since this entry was made */
    ldpp_dout(this, 1) << "LC: deleting stale entry found for bucket="
                       << bucket_tenant << ":" << bucket_name
                       << " cur_marker=" << bucket->get_marker()
                       << " orig_marker=" << bucket_marker << dendl;
    return -ENOENT;
  }

  map<string, bufferlist>::iterator aiter
    = bucket->get_attrs().find(RGW_ATTR_LC);
  if (aiter == bucket->get_attrs().end()) {
    ldpp_dout(this, 0) << "WARNING: bucket_attrs.find(RGW_ATTR_LC) failed for "
                       << bucket_name << " (terminates bucket_lc_process(...))"
                       << dendl;
    return 0;
  }

  bufferlist::const_iterator iter{&aiter->second};
  try {
    config.decode(iter);
  } catch (const buffer::error& e) {
    ldpp_dout(this, 0) << __func__ << "() decode life cycle config failed"
                       << dendl;
    return -1;
  }

  /* fetch information for zone checks */
  rgw::sal::Zone* zone = driver->get_zone();

  /* work-pool callback: apply one rule to one listed entry */
  auto pf = [](RGWLC::LCWorker* wk, WorkQ* wq, WorkItem& wi) {
    auto wt =
      boost::get<std::tuple<LCOpRule, rgw_bucket_dir_entry>>(wi);
    auto& [op_rule, o] = wt;

    ldpp_dout(wk->get_lc(), 20)
      << __func__ << "(): key=" << o.key << wq->thr_name()
      << dendl;
    int ret = op_rule.process(o, wk->dpp, wq);
    if (ret < 0) {
      ldpp_dout(wk->get_lc(), 20)
        << "ERROR: orule.process() returned ret=" << ret
        << "thread:" << wq->thr_name()
        << dendl;
    }
  };
  worker->workpool->setf(pf);

  multimap<string, lc_op>& prefix_map = config.get_prefix_map();
  ldpp_dout(this, 10) << __func__ << "() prefix_map size="
                      << prefix_map.size()
                      << dendl;

  rgw_obj_key pre_marker;
  rgw_obj_key next_marker;
  for(auto prefix_iter = prefix_map.begin(); prefix_iter != prefix_map.end();
      ++prefix_iter) {

    if (worker_should_stop(stop_at, once)) {
      ldpp_dout(this, 5) << __func__ << " interval budget EXPIRED worker "
                         << worker->ix
                         << dendl;
      return 0;
    }

    auto& op = prefix_iter->second;
    if (!is_valid_op(op)) {
      continue;
    }
    ldpp_dout(this, 20) << __func__ << "(): prefix=" << prefix_iter->first
                        << dendl;
    /* NOTE(review): when this prefix extends the previous one, the
     * listing restarts from the previous prefix's saved marker --
     * confirm the pre_marker/next_marker carryover intent */
    if (prefix_iter != prefix_map.begin() &&
        (prefix_iter->first.compare(0, prev(prefix_iter)->first.length(),
                                    prev(prefix_iter)->first) == 0)) {
      next_marker = pre_marker;
    } else {
      pre_marker = next_marker;
    }

    LCObjsLister ol(driver, bucket.get());
    ol.set_prefix(prefix_iter->first);

    if (! zone_check(op, zone)) {
      ldpp_dout(this, 7) << "LC rule not executable in " << zone->get_tier_type()
                         << " zone, skipping" << dendl;
      continue;
    }

    ret = ol.init(this);
    if (ret < 0) {
      if (ret == (-ENOENT))
        return 0;
      ldpp_dout(this, 0) << "ERROR: driver->list_objects():" << dendl;
      return ret;
    }

    op_env oenv(op, driver, worker, bucket.get(), ol);
    LCOpRule orule(oenv);
    orule.build(); // why can't ctor do it?
    rgw_bucket_dir_entry* o{nullptr};
    for (auto offset = 0; ol.get_obj(this, &o /* , fetch_barrier */); ++offset, ol.next()) {
      orule.update();
      std::tuple<LCOpRule, rgw_bucket_dir_entry> t1 = {orule, *o};
      worker->workpool->enqueue(WorkItem{t1});
      /* re-check the time budget every 100 entries */
      if ((offset % 100) == 0) {
        if (worker_should_stop(stop_at, once)) {
          ldpp_dout(this, 5) << __func__ << " interval budget EXPIRED worker "
                             << worker->ix
                             << dendl;
          return 0;
        }
      }
    }
    worker->workpool->drain();
  }

  ret = handle_multipart_expiration(bucket.get(), prefix_map, worker, stop_at, once);
  return ret;
}
+
/* Linearly increasing retry helper: wait_backoff() polls a barrier
 * predicate up to max_retries times, sleeping sleep_ms * 2 * retries
 * between attempts.  NOTE(review): with retries starting at 0 the
 * first sleep is 0 ms, i.e. the first retry is immediate -- confirm
 * that is intended. */
class SimpleBackoff
{
  const int max_retries;
  std::chrono::milliseconds sleep_ms;
  int retries{0};
public:
  SimpleBackoff(int max_retries, std::chrono::milliseconds initial_sleep_ms)
    : max_retries(max_retries), sleep_ms(initial_sleep_ms)
  {}
  SimpleBackoff(const SimpleBackoff&) = delete;
  SimpleBackoff& operator=(const SimpleBackoff&) = delete;

  int get_retries() const {
    return retries;
  }

  void reset() {
    retries = 0;
  }

  /* Returns true as soon as barrier() returns true; false after
   * max_retries failed attempts.  Resets the retry count on entry. */
  bool wait_backoff(const fu2::unique_function<bool(void) const>& barrier) {
    reset();
    while (retries < max_retries) {
      auto r = barrier();
      if (r) {
        return r;
      }
      std::this_thread::sleep_for(sleep_ms * 2 * retries++);
    }
    return false;
  }
};
+
/* Epilogue for a per-bucket lifecycle run: re-acquires the shard lock
 * (retrying every 5s while another processor holds it), then records
 * the run's outcome in the shard -- removing the entry when the bucket
 * no longer exists (result == -ENOENT), otherwise marking it
 * lc_failed/lc_complete.  Always returns 0 once the lock was held. */
int RGWLC::bucket_lc_post(int index, int max_lock_sec,
                          rgw::sal::Lifecycle::LCEntry& entry, int& result,
                          LCWorker* worker)
{
  utime_t lock_duration(cct->_conf->rgw_lc_lock_max_time, 0);

  std::unique_ptr<rgw::sal::LCSerializer> lock =
    sal_lc->get_serializer(lc_index_lock_name, obj_names[index], cookie);

  ldpp_dout(this, 5) << "RGWLC::bucket_lc_post(): POST " << entry
                     << " index: " << index << " worker ix: " << worker->ix
                     << dendl;

  do {
    int ret = lock->try_lock(this, lock_duration, null_yield);
    if (ret == -EBUSY || ret == -EEXIST) {
      /* already locked by another lc processor */
      ldpp_dout(this, 0) << "RGWLC::bucket_lc_post() failed to acquire lock on "
                         << obj_names[index] << ", sleep 5, try again " << dendl;
      sleep(5);
      continue;
    }

    /* any other lock failure: give up silently (status not recorded) */
    if (ret < 0)
      return 0;
    ldpp_dout(this, 20) << "RGWLC::bucket_lc_post() lock " << obj_names[index]
                        << dendl;

    if (result == -ENOENT) {
      /* XXXX are we SURE the only way result could == ENOENT is when
       * there is no such bucket? It is currently the value returned
       * from bucket_lc_process(...) */
      ret = sal_lc->rm_entry(obj_names[index], entry);
      if (ret < 0) {
        ldpp_dout(this, 0) << "RGWLC::bucket_lc_post() failed to remove entry "
                           << obj_names[index] << dendl;
      }
      goto clean;
    } else if (result < 0) {
      entry.set_status(lc_failed);
    } else {
      entry.set_status(lc_complete);
    }

    ret = sal_lc->set_entry(obj_names[index], entry);
    if (ret < 0) {
      ldpp_dout(this, 0) << "RGWLC::process() failed to set entry on "
                         << obj_names[index] << dendl;
    }
clean:
    lock->unlock();
    ldpp_dout(this, 20) << "RGWLC::bucket_lc_post() unlock "
                        << obj_names[index] << dendl;
    return 0;
  } while (true);
} /* RGWLC::bucket_lc_post */
+
/* Accumulate up to max_entries lifecycle state entries across shards,
 * starting at (index, marker).  On return, index and marker identify
 * the resume position (marker is reset to "" each time a new shard is
 * entered).  Shards whose backing object is missing are skipped. */
int RGWLC::list_lc_progress(string& marker, uint32_t max_entries,
                            vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>>& progress_map,
                            int& index)
{
  progress_map.clear();
  for(; index < max_objs; index++, marker="") {
    vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>> entries;
    int ret = sal_lc->list_entries(obj_names[index], marker, max_entries, entries);
    if (ret < 0) {
      if (ret == -ENOENT) {
        ldpp_dout(this, 10) << __func__ << "() ignoring unfound lc object="
                            << obj_names[index] << dendl;
        continue;
      } else {
        return ret;
      }
    }
    progress_map.reserve(progress_map.size() + entries.size());
    /* entries hold unique_ptrs, so move rather than copy */
    std::move(begin(entries), end(entries), std::back_inserter(progress_map));
    //progress_map.insert(progress_map.end(), entries.begin(), entries.end());

    /* update index, marker tuple */
    if (progress_map.size() > 0)
      marker = progress_map.back()->get_bucket();

    if (progress_map.size() >= max_entries)
      break;
  }
  return 0;
}
+
/* Produce the integers [0, n) in uniformly shuffled order; used to
 * visit lc shards in an order uncorrelated with other processors. */
static inline std::vector<int> random_sequence(uint32_t n)
{
  std::vector<int> seq(n, 0);
  int next = 0;
  for (auto& elt : seq) {
    elt = next++;
  }
  std::random_device seed;
  std::default_random_engine engine{seed()};
  std::shuffle(seq.begin(), seq.end(), engine);
  return seq;
}
+
/* Map a bucket shard_id string to its lifecycle shard index in
 * [0, rgw_lc_max_objs), with rgw_lc_max_objs capped at HASH_PRIME. */
static inline int get_lc_index(CephContext *cct,
                               const std::string& shard_id)
{
  int max_objs =
    (cct->_conf->rgw_lc_max_objs > HASH_PRIME ? HASH_PRIME :
     cct->_conf->rgw_lc_max_objs);
  /* n.b. review hash algo */
  int index = ceph_str_hash_linux(shard_id.c_str(),
                                  shard_id.size()) % HASH_PRIME % max_objs;
  return index;
}
+
/* Compute the rados object name of the lifecycle shard that owns
 * shard_id: "<lc_oid_prefix>.<index>". */
static inline void get_lc_oid(CephContext *cct,
                              const std::string& shard_id, string *oid)
{
  /* n.b. review hash algo */
  int index = get_lc_index(cct, shard_id);
  *oid = lc_oid_prefix;
  char buf[32];
  snprintf(buf, 32, ".%d", index);
  oid->append(buf);
  return;
}
+
/* Build the lifecycle state key for a bucket: "tenant:name:marker"
 * (parsed back apart in bucket_lc_process()). */
static std::string get_bucket_lc_key(const rgw_bucket& bucket){
  return string_join_reserve(':', bucket.tenant, bucket.name, bucket.marker);
}
+
+int RGWLC::process(LCWorker* worker,
+ const std::unique_ptr<rgw::sal::Bucket>& optional_bucket,
+ bool once = false)
+{
+ int ret = 0;
+ int max_secs = cct->_conf->rgw_lc_lock_max_time;
+
+ if (optional_bucket) {
+ /* if a bucket is provided, this is a single-bucket run, and
+ * can be processed without traversing any state entries (we
+ * do need the entry {pro,epi}logue which update the state entry
+ * for this bucket) */
+ auto bucket_lc_key = get_bucket_lc_key(optional_bucket->get_key());
+ auto index = get_lc_index(driver->ctx(), bucket_lc_key);
+ ret = process_bucket(index, max_secs, worker, bucket_lc_key, once);
+ return ret;
+ } else {
+ /* generate an index-shard sequence unrelated to any other
+ * that might be running in parallel */
+ std::string all_buckets{""};
+ vector<int> shard_seq = random_sequence(max_objs);
+ for (auto index : shard_seq) {
+ ret = process(index, max_secs, worker, once);
+ if (ret < 0)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
/* Decide whether a lifecycle session that began at `started` is stale:
 * true when it is older than twice the scheduling interval (the debug
 * interval if set, else 24h).  Disabled entirely when
 * rgwlc_auto_session_clear is off. */
bool RGWLC::expired_session(time_t started)
{
  if (! cct->_conf->rgwlc_auto_session_clear) {
    return false;
  }

  time_t interval = (cct->_conf->rgw_lc_debug_interval > 0)
    ? cct->_conf->rgw_lc_debug_interval
    : 24*60*60;

  auto now = time(nullptr);

  ldpp_dout(this, 16) << "RGWLC::expired_session"
                      << " started: " << started
                      << " interval: " << interval << "(*2==" << 2*interval << ")"
                      << " now: " << now
                      << dendl;

  return (started + 2*interval < now);
}
+
/* Compute the deadline for the current lifecycle pass: now plus one
 * scheduling interval (the debug interval if set, else 24h). */
time_t RGWLC::thread_stop_at()
{
  uint64_t interval = (cct->_conf->rgw_lc_debug_interval > 0)
    ? cct->_conf->rgw_lc_debug_interval
    : 24*60*60;

  return time(nullptr) + interval;
}
+
+int RGWLC::process_bucket(int index, int max_lock_secs, LCWorker* worker,
+ const std::string& bucket_entry_marker,
+ bool once = false)
+{
+ ldpp_dout(this, 5) << "RGWLC::process_bucket(): ENTER: "
+ << "index: " << index << " worker ix: " << worker->ix
+ << dendl;
+
+ int ret = 0;
+ std::unique_ptr<rgw::sal::LCSerializer> serializer =
+ sal_lc->get_serializer(lc_index_lock_name, obj_names[index],
+ worker->thr_name());
+ std::unique_ptr<rgw::sal::Lifecycle::LCEntry> entry;
+ if (max_lock_secs <= 0) {
+ return -EAGAIN;
+ }
+
+ utime_t time(max_lock_secs, 0);
+ ret = serializer->try_lock(this, time, null_yield);
+ if (ret == -EBUSY || ret == -EEXIST) {
+ /* already locked by another lc processor */
+ ldpp_dout(this, 0) << "RGWLC::process() failed to acquire lock on "
+ << obj_names[index] << dendl;
+ return -EBUSY;
+ }
+ if (ret < 0)
+ return 0;
+
+ std::unique_lock<rgw::sal::LCSerializer> lock(
+ *(serializer.get()), std::adopt_lock);
+
+ ret = sal_lc->get_entry(obj_names[index], bucket_entry_marker, &entry);
+ if (ret >= 0) {
+ if (entry->get_status() == lc_processing) {
+ if (expired_session(entry->get_start_time())) {
+ ldpp_dout(this, 5) << "RGWLC::process_bucket(): STALE lc session found for: " << entry
+ << " index: " << index << " worker ix: " << worker->ix
+ << " (clearing)"
+ << dendl;
+ } else {
+ ldpp_dout(this, 5) << "RGWLC::process_bucket(): ACTIVE entry: "
+ << entry
+ << " index: " << index
+ << " worker ix: " << worker->ix
+ << dendl;
+ return ret;
+ }
+ }
+ }
+
+ /* do nothing if no bucket */
+ if (entry->get_bucket().empty()) {
+ return ret;
+ }
+
+ ldpp_dout(this, 5) << "RGWLC::process_bucket(): START entry 1: " << entry
+ << " index: " << index << " worker ix: " << worker->ix
+ << dendl;
+
+ entry->set_status(lc_processing);
+ ret = sal_lc->set_entry(obj_names[index], *entry);
+ if (ret < 0) {
+ ldpp_dout(this, 0) << "RGWLC::process_bucket() failed to set obj entry "
+ << obj_names[index] << entry->get_bucket() << entry->get_status()
+ << dendl;
+ return ret;
+ }
+
+ ldpp_dout(this, 5) << "RGWLC::process_bucket(): START entry 2: " << entry
+ << " index: " << index << " worker ix: " << worker->ix
+ << dendl;
+
+ lock.unlock();
+ ret = bucket_lc_process(entry->get_bucket(), worker, thread_stop_at(), once);
+ bucket_lc_post(index, max_lock_secs, *entry, ret, worker);
+
+ return ret;
+} /* RGWLC::process_bucket */
+
+static inline bool allow_shard_rollover(CephContext* cct, time_t now, time_t shard_rollover_date)
+{
+ /* return true iff:
+ * - non-debug scheduling is in effect, and
+ * - the current shard has not rolled over in the last 24 hours
+ */
+ if (((shard_rollover_date < now) &&
+ (now - shard_rollover_date > 24*60*60)) ||
+ (! shard_rollover_date /* no rollover date stored */) ||
+ (cct->_conf->rgw_lc_debug_interval > 0 /* defaults to -1 == disabled */)) {
+ return true;
+ }
+ return false;
+} /* allow_shard_rollover */
+
/* Return true when a run that started at start_date counts as
 * "today's" run: under debug scheduling, within one debug interval of
 * now; otherwise, within 24h of the local midnight preceding
 * start_date. */
static inline bool already_run_today(CephContext* cct, time_t start_date)
{
  struct tm bdt;
  time_t begin_of_day;
  utime_t now = ceph_clock_now();
  localtime_r(&start_date, &bdt);

  if (cct->_conf->rgw_lc_debug_interval > 0) {
    if (now - start_date < cct->_conf->rgw_lc_debug_interval)
      return true;
    else
      return false;
  }

  /* truncate start_date to local midnight and compare against now */
  bdt.tm_hour = 0;
  bdt.tm_min = 0;
  bdt.tm_sec = 0;
  begin_of_day = mktime(&bdt);
  if (now - begin_of_day < 24*60*60)
    return true;
  else
    return false;
} /* already_run_today */
+
/* Advance the shard head past `entry`: look up the entry that follows
 * it, store that bucket as the new head marker together with
 * start_date, and persist the head.  Returns <0 if either the lookup
 * or the head write fails. */
inline int RGWLC::advance_head(const std::string& lc_shard,
                               rgw::sal::Lifecycle::LCHead& head,
                               rgw::sal::Lifecycle::LCEntry& entry,
                               time_t start_date)
{
  int ret{0};
  std::unique_ptr<rgw::sal::Lifecycle::LCEntry> next_entry;

  ret = sal_lc->get_next_entry(lc_shard, entry.get_bucket(), &next_entry);
  if (ret < 0) {
    ldpp_dout(this, 0) << "RGWLC::process() failed to get obj entry "
                       << lc_shard << dendl;
    goto exit;
  }

  /* save the next position */
  head.set_marker(next_entry->get_bucket());
  head.set_start_date(start_date);

  ret = sal_lc->put_head(lc_shard, head);
  if (ret < 0) {
    ldpp_dout(this, 0) << "RGWLC::process() failed to put head "
                       << lc_shard
                       << dendl;
    goto exit;
  }
exit:
  return ret;
} /* advance head */
+
/* Run one lifecycle pass over shard `index`: acquire the shard lock with
 * backoff, then repeatedly (a) find/claim the bucket entry at the head
 * marker, (b) mark it lc_processing, (c) advance the head so other
 * processors can proceed, (d) drop the lock and run bucket_lc_process(),
 * (e) re-acquire the lock and record the final entry status, until the
 * shard is exhausted, `once` is set, or shutdown is requested.
 * Always returns 0; per-bucket failures are recorded in shard entries.
 * NOTE(review): `once = false` is a default argument on the definition —
 * valid only if the declaration omits it. */
int RGWLC::process(int index, int max_lock_secs, LCWorker* worker,
		   bool once = false)
{
  int ret{0};
  const auto& lc_shard = obj_names[index];

  std::unique_ptr<rgw::sal::Lifecycle::LCHead> head;
  std::unique_ptr<rgw::sal::Lifecycle::LCEntry> entry; //string = bucket_name:bucket_id, start_time, int = LC_BUCKET_STATUS

  ldpp_dout(this, 5) << "RGWLC::process(): ENTER: "
	  << "index: " << index << " worker ix: " << worker->ix
	  << dendl;

  std::unique_ptr<rgw::sal::LCSerializer> lock =
    sal_lc->get_serializer(lc_index_lock_name, lc_shard, worker->thr_name());

  utime_t lock_for_s(max_lock_secs, 0);
  /* try_lock wrapper for SimpleBackoff; the raw rc is captured in `ret` */
  const auto& lock_lambda = [&]() {
    ret = lock->try_lock(this, lock_for_s, null_yield);
    if (ret == 0) {
      return true;
    }
    if (ret == -EBUSY || ret == -EEXIST) {
      /* already locked by another lc processor */
      return false;
    }
    return false;
  };

  SimpleBackoff shard_lock(5 /* max retries */, 50ms);
  if (! shard_lock.wait_backoff(lock_lambda)) {
    ldpp_dout(this, 0) << "RGWLC::process(): failed to aquire lock on "
		       << lc_shard << " after " << shard_lock.get_retries()
		       << dendl;
    return 0;
  }

  do {
    utime_t now = ceph_clock_now();

    /* preamble: find an inital bucket/marker */
    ret = sal_lc->get_head(lc_shard, &head);
    if (ret < 0) {
      ldpp_dout(this, 0) << "RGWLC::process() failed to get obj head "
			 << lc_shard << ", ret=" << ret << dendl;
      goto exit;
    }

    /* if there is nothing at head, try to reinitialize head.marker with the
     * first entry in the queue */
    if (head->get_marker().empty() &&
	allow_shard_rollover(cct, now, head->get_shard_rollover_date()) /* prevent multiple passes by diff.
									 * rgws,in same cycle */) {

      ldpp_dout(this, 5) << "RGWLC::process() process shard rollover lc_shard=" << lc_shard
			 << " head.marker=" << head->get_marker()
			 << " head.shard_rollover_date=" << head->get_shard_rollover_date()
			 << dendl;

      vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>> entries;
      int ret = sal_lc->list_entries(lc_shard, head->get_marker(), 1, entries);
      if (ret < 0) {
	ldpp_dout(this, 0) << "RGWLC::process() sal_lc->list_entries(lc_shard, head.marker, 1, "
			   << "entries) returned error ret==" << ret << dendl;
	goto exit;
      }
      if (entries.size() > 0) {
	/* restart the cycle from the first queued bucket */
	entry = std::move(entries.front());
	head->set_marker(entry->get_bucket());
	head->set_start_date(now);
	head->set_shard_rollover_date(0);
      }
    } else {
      ldpp_dout(this, 0) << "RGWLC::process() head.marker !empty() at START for shard=="
			 << lc_shard << " head last stored at "
			 << rgw_to_asctime(utime_t(time_t(head->get_start_date()), 0))
			 << dendl;

      /* fetches the entry pointed to by head.bucket */
      ret = sal_lc->get_entry(lc_shard, head->get_marker(), &entry);
      if (ret < 0) {
	ldpp_dout(this, 0) << "RGWLC::process() sal_lc->get_entry(lc_shard, head.marker, entry) "
			   << "returned error ret==" << ret << dendl;
	goto exit;
      }
    }

    if (entry && !entry->get_bucket().empty()) {
      if (entry->get_status() == lc_processing) {
	/* an entry left in lc_processing either belongs to a live run
	 * elsewhere (skip it) or to a crashed/expired session (reclaim) */
	if (expired_session(entry->get_start_time())) {
	  ldpp_dout(this, 5)
	    << "RGWLC::process(): STALE lc session found for: " << entry
	    << " index: " << index << " worker ix: " << worker->ix
	    << " (clearing)" << dendl;
	} else {
	  ldpp_dout(this, 5)
	    << "RGWLC::process(): ACTIVE entry: " << entry
	    << " index: " << index << " worker ix: " << worker->ix << dendl;
	  /* skip to next entry */
	  if (advance_head(lc_shard, *head.get(), *entry.get(), now) < 0) {
	    goto exit;
	  }
	  /* done with this shard */
	  if (head->get_marker().empty()) {
	    ldpp_dout(this, 5) <<
	      "RGWLC::process() cycle finished lc_shard="
			       << lc_shard
			       << dendl;
	    head->set_shard_rollover_date(ceph_clock_now());
	    ret = sal_lc->put_head(lc_shard, *head.get());
	    if (ret < 0) {
	      ldpp_dout(this, 0) << "RGWLC::process() failed to put head "
				 << lc_shard
				 << dendl;
	    }
	    goto exit;
	  }
	  continue;
	}
      } else {
	if ((entry->get_status() == lc_complete) &&
	    already_run_today(cct, entry->get_start_time())) {
	  /* skip to next entry */
	  if (advance_head(lc_shard, *head.get(), *entry.get(), now) < 0) {
	    goto exit;
	  }
	  ldpp_dout(this, 5) << "RGWLC::process() worker ix; " << worker->ix
			     << " SKIP processing for already-processed bucket " << entry->get_bucket()
			     << dendl;
	  /* done with this shard */
	  if (head->get_marker().empty()) {
	    ldpp_dout(this, 5) <<
	      "RGWLC::process() cycle finished lc_shard="
			       << lc_shard
			       << dendl;
	    head->set_shard_rollover_date(ceph_clock_now());
	    ret = sal_lc->put_head(lc_shard, *head.get());
	    if (ret < 0) {
	      ldpp_dout(this, 0) << "RGWLC::process() failed to put head "
				 << lc_shard
				 << dendl;
	    }
	    goto exit;
	  }
	  continue;
	}
      }
    } else {
      ldpp_dout(this, 5) << "RGWLC::process() entry.bucket.empty() == true at START 1"
			 << " (this is possible mainly before any lc policy has been stored"
			 << " or after removal of an lc_shard object)"
			 << dendl;
      goto exit;
    }

    /* When there are no more entries to process, entry will be
     * equivalent to an empty marker and so the following resets the
     * processing for the shard automatically when processing is
     * finished for the shard */
    ldpp_dout(this, 5) << "RGWLC::process(): START entry 1: " << entry
		       << " index: " << index << " worker ix: " << worker->ix
		       << dendl;

    /* claim the entry for this worker before releasing the shard lock */
    entry->set_status(lc_processing);
    entry->set_start_time(now);

    ret = sal_lc->set_entry(lc_shard, *entry);
    if (ret < 0) {
      ldpp_dout(this, 0) << "RGWLC::process() failed to set obj entry "
			 << lc_shard << entry->get_bucket() << entry->get_status() << dendl;
      goto exit;
    }

    /* advance head for next waiter, then process */
    if (advance_head(lc_shard, *head.get(), *entry.get(), now) < 0) {
      goto exit;
    }

    ldpp_dout(this, 5) << "RGWLC::process(): START entry 2: " << entry
		       << " index: " << index << " worker ix: " << worker->ix
		       << dendl;

    /* drop lock so other instances can make progress while this
     * bucket is being processed */
    lock->unlock();
    ret = bucket_lc_process(entry->get_bucket(), worker, thread_stop_at(), once);

    /* postamble */
    //bucket_lc_post(index, max_lock_secs, entry, ret, worker);
    /* re-acquire the shard lock before updating this bucket's status */
    if (! shard_lock.wait_backoff(lock_lambda)) {
      ldpp_dout(this, 0) << "RGWLC::process(): failed to aquire lock on "
			 << lc_shard << " after " << shard_lock.get_retries()
			 << dendl;
      return 0;
    }

    if (ret == -ENOENT) {
      /* XXXX are we SURE the only way result could == ENOENT is when
       * there is no such bucket? It is currently the value returned
       * from bucket_lc_process(...) */
      ret = sal_lc->rm_entry(lc_shard, *entry);
      if (ret < 0) {
	ldpp_dout(this, 0) << "RGWLC::process() failed to remove entry "
			   << lc_shard << " (nonfatal)"
			   << dendl;
	/* not fatal, could result from a race */
      }
    } else {
      if (ret < 0) {
	entry->set_status(lc_failed);
      } else {
	entry->set_status(lc_complete);
      }
      ret = sal_lc->set_entry(lc_shard, *entry);
      if (ret < 0) {
	ldpp_dout(this, 0) << "RGWLC::process() failed to set entry on "
			   << lc_shard
			   << dendl;
	/* fatal, locked */
	goto exit;
      }
    }

    /* done with this shard */
    if (head->get_marker().empty()) {
      ldpp_dout(this, 5) <<
	"RGWLC::process() cycle finished lc_shard="
			 << lc_shard
			 << dendl;
      head->set_shard_rollover_date(ceph_clock_now());
      ret = sal_lc->put_head(lc_shard, *head.get());
      if (ret < 0) {
	ldpp_dout(this, 0) << "RGWLC::process() failed to put head "
			   << lc_shard
			   << dendl;
      }
      goto exit;
    }
  } while(1 && !once && !going_down());

exit:
  lock->unlock();
  return 0;
}
+
+void RGWLC::start_processor()
+{
+ auto maxw = cct->_conf->rgw_lc_max_worker;
+ workers.reserve(maxw);
+ for (int ix = 0; ix < maxw; ++ix) {
+ auto worker =
+ std::make_unique<RGWLC::LCWorker>(this /* dpp */, cct, this, ix);
+ worker->create((string{"lifecycle_thr_"} + to_string(ix)).c_str());
+ workers.emplace_back(std::move(worker));
+ }
+}
+
+void RGWLC::stop_processor()
+{
+ down_flag = true;
+ for (auto& worker : workers) {
+ worker->stop();
+ worker->join();
+ }
+ workers.clear();
+}
+
/* DoutPrefixProvider: log subsystem id for lifecycle messages. */
unsigned RGWLC::get_subsys() const
{
  return dout_subsys;
}
+
/* DoutPrefixProvider: prefix prepended to every lifecycle log line. */
std::ostream& RGWLC::gen_prefix(std::ostream& out) const
{
  return out << "lifecycle: ";
}
+
/* Wake the worker from its inter-cycle wait so it can observe shutdown
 * (the down flag itself is set by RGWLC::stop_processor()). */
void RGWLC::LCWorker::stop()
{
  std::lock_guard l{lock};
  cond.notify_all();
}
+
/* True once stop_processor() has requested shutdown. */
bool RGWLC::going_down()
{
  return down_flag;
}
+
+bool RGWLC::LCWorker::should_work(utime_t& now)
+{
+ int start_hour;
+ int start_minute;
+ int end_hour;
+ int end_minute;
+ string worktime = cct->_conf->rgw_lifecycle_work_time;
+ sscanf(worktime.c_str(),"%d:%d-%d:%d",&start_hour, &start_minute,
+ &end_hour, &end_minute);
+ struct tm bdt;
+ time_t tt = now.sec();
+ localtime_r(&tt, &bdt);
+
+ if (cct->_conf->rgw_lc_debug_interval > 0) {
+ /* We're debugging, so say we can run */
+ return true;
+ } else if ((bdt.tm_hour*60 + bdt.tm_min >= start_hour*60 + start_minute) &&
+ (bdt.tm_hour*60 + bdt.tm_min <= end_hour*60 + end_minute)) {
+ return true;
+ } else {
+ return false;
+ }
+
+}
+
+int RGWLC::LCWorker::schedule_next_start_time(utime_t &start, utime_t& now)
+{
+ int secs;
+
+ if (cct->_conf->rgw_lc_debug_interval > 0) {
+ secs = start + cct->_conf->rgw_lc_debug_interval - now;
+ if (secs < 0)
+ secs = 0;
+ return (secs);
+ }
+
+ int start_hour;
+ int start_minute;
+ int end_hour;
+ int end_minute;
+ string worktime = cct->_conf->rgw_lifecycle_work_time;
+ sscanf(worktime.c_str(),"%d:%d-%d:%d",&start_hour, &start_minute, &end_hour,
+ &end_minute);
+ struct tm bdt;
+ time_t tt = now.sec();
+ time_t nt;
+ localtime_r(&tt, &bdt);
+ bdt.tm_hour = start_hour;
+ bdt.tm_min = start_minute;
+ bdt.tm_sec = 0;
+ nt = mktime(&bdt);
+ secs = nt - tt;
+
+ return secs>0 ? secs : secs+24*60*60;
+}
+
RGWLC::LCWorker::~LCWorker()
{
  /* workpool is a raw owning pointer; released here (a unique_ptr in the
   * header would make this implicit) */
  delete workpool;
} /* ~LCWorker */
+
/* Produce instances for encode/decode round-trip testing (ceph-dencoder);
 * ownership of the pushed pointers transfers to the caller. */
void RGWLifecycleConfiguration::generate_test_instances(
  list<RGWLifecycleConfiguration*>& o)
{
  o.push_back(new RGWLifecycleConfiguration);
}
+
/* Run mutation `f` on the LC shard entry for `bucket` while holding the
 * shard's index lock. Retries lock acquisition every 100ms for up to 500
 * attempts (EBUSY/EEXIST), since PutLC requests have been observed to
 * starve here. Returns f's result, or the lock error. */
template<typename F>
static int guard_lc_modify(const DoutPrefixProvider *dpp,
			   rgw::sal::Driver* driver,
			   rgw::sal::Lifecycle* sal_lc,
			   const rgw_bucket& bucket, const string& cookie,
			   const F& f) {
  CephContext *cct = driver->ctx();

  auto bucket_lc_key = get_bucket_lc_key(bucket);
  string oid;
  get_lc_oid(cct, bucket_lc_key, &oid);

  /* XXX it makes sense to take shard_id for a bucket_id? */
  std::unique_ptr<rgw::sal::Lifecycle::LCEntry> entry = sal_lc->get_entry();
  entry->set_bucket(bucket_lc_key);
  entry->set_status(lc_uninitial);
  int max_lock_secs = cct->_conf->rgw_lc_lock_max_time;

  std::unique_ptr<rgw::sal::LCSerializer> lock =
    sal_lc->get_serializer(lc_index_lock_name, oid, cookie);
  utime_t time(max_lock_secs, 0);

  int ret;
  uint16_t retries{0};

  // due to reports of starvation trying to save lifecycle policy, try hard
  do {
    ret = lock->try_lock(dpp, time, null_yield);
    if (ret == -EBUSY || ret == -EEXIST) {
      ldpp_dout(dpp, 0) << "RGWLC::RGWPutLC() failed to acquire lock on "
			<< oid << ", retry in 100ms, ret=" << ret << dendl;
      std::this_thread::sleep_for(std::chrono::milliseconds(100));
      // the typical S3 client will time out in 60s
      if(retries++ < 500) {
	continue;
      }
      /* retries exhausted: fall through to the error branch below */
    }
    if (ret < 0) {
      ldpp_dout(dpp, 0) << "RGWLC::RGWPutLC() failed to acquire lock on "
			<< oid << ", ret=" << ret << dendl;
      break;
    }
    /* lock held: apply the caller's mutation to the fresh entry */
    ret = f(sal_lc, oid, *entry.get());
    if (ret < 0) {
      ldpp_dout(dpp, 0) << "RGWLC::RGWPutLC() failed to set entry on "
			<< oid << ", ret=" << ret << dendl;
    }
    break;
  } while(true);
  lock->unlock();
  return ret;
}
+
+int RGWLC::set_bucket_config(rgw::sal::Bucket* bucket,
+ const rgw::sal::Attrs& bucket_attrs,
+ RGWLifecycleConfiguration *config)
+{
+ int ret{0};
+ rgw::sal::Attrs attrs = bucket_attrs;
+ if (config) {
+ /* if no RGWLifecycleconfiguration provided, it means
+ * RGW_ATTR_LC is already valid and present */
+ bufferlist lc_bl;
+ config->encode(lc_bl);
+ attrs[RGW_ATTR_LC] = std::move(lc_bl);
+
+ ret =
+ bucket->merge_and_store_attrs(this, attrs, null_yield);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+ rgw_bucket& b = bucket->get_key();
+
+
+ ret = guard_lc_modify(this, driver, sal_lc.get(), b, cookie,
+ [&](rgw::sal::Lifecycle* sal_lc, const string& oid,
+ rgw::sal::Lifecycle::LCEntry& entry) {
+ return sal_lc->set_entry(oid, entry);
+ });
+
+ return ret;
+}
+
+int RGWLC::remove_bucket_config(rgw::sal::Bucket* bucket,
+ const rgw::sal::Attrs& bucket_attrs,
+ bool merge_attrs)
+{
+ rgw::sal::Attrs attrs = bucket_attrs;
+ rgw_bucket& b = bucket->get_key();
+ int ret{0};
+
+ if (merge_attrs) {
+ attrs.erase(RGW_ATTR_LC);
+ ret = bucket->merge_and_store_attrs(this, attrs, null_yield);
+
+ if (ret < 0) {
+ ldpp_dout(this, 0) << "RGWLC::RGWDeleteLC() failed to set attrs on bucket="
+ << b.name << " returned err=" << ret << dendl;
+ return ret;
+ }
+ }
+
+ ret = guard_lc_modify(this, driver, sal_lc.get(), b, cookie,
+ [&](rgw::sal::Lifecycle* sal_lc, const string& oid,
+ rgw::sal::Lifecycle::LCEntry& entry) {
+ return sal_lc->rm_entry(oid, entry);
+ });
+
+ return ret;
+} /* RGWLC::remove_bucket_config */
+
RGWLC::~RGWLC()
{
  /* join all worker threads before releasing state in finalize() */
  stop_processor();
  finalize();
} /* ~RGWLC() */
+
+namespace rgw::lc {
+
/* Ensure the LC shard entry for `bucket` exists under its current lc key;
 * creates the entry (under the shard lock) when missing. No-op when the
 * bucket has no LC policy attr. Returns 0 on success/no-op, <0 on error. */
int fix_lc_shard_entry(const DoutPrefixProvider *dpp,
                       rgw::sal::Driver* driver,
		       rgw::sal::Lifecycle* sal_lc,
		       rgw::sal::Bucket* bucket)
{
  if (auto aiter = bucket->get_attrs().find(RGW_ATTR_LC);
      aiter == bucket->get_attrs().end()) {
    return 0;    // No entry, nothing to fix
  }

  auto bucket_lc_key = get_bucket_lc_key(bucket->get_key());
  std::string lc_oid;
  get_lc_oid(driver->ctx(), bucket_lc_key, &lc_oid);

  std::unique_ptr<rgw::sal::Lifecycle::LCEntry> entry;
  // There are multiple cases we need to encounter here
  // 1. entry exists and is already set to marker, happens in plain buckets & newly resharded buckets
  // 2. entry doesn't exist, which usually happens when reshard has happened prior to update and next LC process has already dropped the update
  // 3. entry exists matching the current bucket id which was after a reshard (needs to be updated to the marker)
  // We are not dropping the old marker here as that would be caught by the next LC process update
  int ret = sal_lc->get_entry(lc_oid, bucket_lc_key, &entry);
  if (ret == 0) {
    ldpp_dout(dpp, 5) << "Entry already exists, nothing to do" << dendl;
    return ret; // entry is already existing correctly set to marker
  }
  ldpp_dout(dpp, 5) << "lc_get_entry errored ret code=" << ret << dendl;
  if (ret == -ENOENT) {
    ldpp_dout(dpp, 1) << "No entry for bucket=" << bucket
		      << " creating " << dendl;
    // TODO: we have too many ppl making cookies like this!
    char cookie_buf[COOKIE_LEN + 1];
    gen_rand_alphanumeric(driver->ctx(), cookie_buf, sizeof(cookie_buf) - 1);
    std::string cookie = cookie_buf;

    /* create the missing entry under the shard lock; the lambda ignores
     * the oid guard_lc_modify computes and reuses lc_oid from above */
    ret = guard_lc_modify(dpp,
			  driver, sal_lc, bucket->get_key(), cookie,
			  [&lc_oid](rgw::sal::Lifecycle* slc,
				    const string& oid,
				    rgw::sal::Lifecycle::LCEntry& entry) {
			    return slc->set_entry(lc_oid, entry);
			  });

  }

  return ret;
}
+
+std::string s3_expiration_header(
+ DoutPrefixProvider* dpp,
+ const rgw_obj_key& obj_key,
+ const RGWObjTags& obj_tagset,
+ const ceph::real_time& mtime,
+ const std::map<std::string, buffer::list>& bucket_attrs)
+{
+ CephContext* cct = dpp->get_cct();
+ RGWLifecycleConfiguration config(cct);
+ std::string hdr{""};
+
+ const auto& aiter = bucket_attrs.find(RGW_ATTR_LC);
+ if (aiter == bucket_attrs.end())
+ return hdr;
+
+ bufferlist::const_iterator iter{&aiter->second};
+ try {
+ config.decode(iter);
+ } catch (const buffer::error& e) {
+ ldpp_dout(dpp, 0) << __func__
+ << "() decode life cycle config failed"
+ << dendl;
+ return hdr;
+ } /* catch */
+
+ /* dump tags at debug level 16 */
+ RGWObjTags::tag_map_t obj_tag_map = obj_tagset.get_tags();
+ if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 16)) {
+ for (const auto& elt : obj_tag_map) {
+ ldpp_dout(dpp, 16) << __func__
+ << "() key=" << elt.first << " val=" << elt.second
+ << dendl;
+ }
+ }
+
+ boost::optional<ceph::real_time> expiration_date;
+ boost::optional<std::string> rule_id;
+
+ const auto& rule_map = config.get_rule_map();
+ for (const auto& ri : rule_map) {
+ const auto& rule = ri.second;
+ auto& id = rule.get_id();
+ auto& filter = rule.get_filter();
+ auto& prefix = filter.has_prefix() ? filter.get_prefix(): rule.get_prefix();
+ auto& expiration = rule.get_expiration();
+ auto& noncur_expiration = rule.get_noncur_expiration();
+
+ ldpp_dout(dpp, 10) << "rule: " << ri.first
+ << " prefix: " << prefix
+ << " expiration: "
+ << " date: " << expiration.get_date()
+ << " days: " << expiration.get_days()
+ << " noncur_expiration: "
+ << " date: " << noncur_expiration.get_date()
+ << " days: " << noncur_expiration.get_days()
+ << dendl;
+
+ /* skip if rule !enabled
+ * if rule has prefix, skip iff object !match prefix
+ * if rule has tags, skip iff object !match tags
+ * note if object is current or non-current, compare accordingly
+ * if rule has days, construct date expression and save iff older
+ * than last saved
+ * if rule has date, convert date expression and save iff older
+ * than last saved
+ * if the date accum has a value, format it into hdr
+ */
+
+ if (! rule.is_enabled())
+ continue;
+
+ if(! prefix.empty()) {
+ if (! boost::starts_with(obj_key.name, prefix))
+ continue;
+ }
+
+ if (filter.has_tags()) {
+ bool tag_match = false;
+ const RGWObjTags& rule_tagset = filter.get_tags();
+ for (auto& tag : rule_tagset.get_tags()) {
+ /* remember, S3 tags are {key,value} tuples */
+ tag_match = true;
+ auto obj_tag = obj_tag_map.find(tag.first);
+ if (obj_tag == obj_tag_map.end() || obj_tag->second != tag.second) {
+ ldpp_dout(dpp, 10) << "tag does not match obj_key=" << obj_key
+ << " rule_id=" << id
+ << " tag=" << tag
+ << dendl;
+ tag_match = false;
+ break;
+ }
+ }
+ if (! tag_match)
+ continue;
+ }
+
+ // compute a uniform expiration date
+ boost::optional<ceph::real_time> rule_expiration_date;
+ const LCExpiration& rule_expiration =
+ (obj_key.instance.empty()) ? expiration : noncur_expiration;
+
+ if (rule_expiration.has_date()) {
+ rule_expiration_date =
+ boost::optional<ceph::real_time>(
+ ceph::from_iso_8601(rule.get_expiration().get_date()));
+ } else {
+ if (rule_expiration.has_days()) {
+ rule_expiration_date =
+ boost::optional<ceph::real_time>(
+ mtime + make_timespan(double(rule_expiration.get_days())*24*60*60 - ceph::real_clock::to_time_t(mtime)%(24*60*60) + 24*60*60));
+ }
+ }
+
+ // update earliest expiration
+ if (rule_expiration_date) {
+ if ((! expiration_date) ||
+ (*expiration_date > *rule_expiration_date)) {
+ expiration_date =
+ boost::optional<ceph::real_time>(rule_expiration_date);
+ rule_id = boost::optional<std::string>(id);
+ }
+ }
+ }
+
+ // cond format header
+ if (expiration_date && rule_id) {
+ // Fri, 23 Dec 2012 00:00:00 GMT
+ char exp_buf[100];
+ time_t exp = ceph::real_clock::to_time_t(*expiration_date);
+ if (std::strftime(exp_buf, sizeof(exp_buf),
+ "%a, %d %b %Y %T %Z", std::gmtime(&exp))) {
+ hdr = fmt::format("expiry-date=\"{0}\", rule-id=\"{1}\"", exp_buf,
+ *rule_id);
+ } else {
+ ldpp_dout(dpp, 0) << __func__ <<
+ "() strftime of life cycle expiration header failed"
+ << dendl;
+ }
+ }
+
+ return hdr;
+
+} /* rgwlc_s3_expiration_header */
+
/* Compute the S3 "x-amz-abort-date"/"x-amz-abort-rule-id" values for an
 * in-progress multipart upload: pick the earliest abort date over all
 * enabled, prefix-matching rules with AbortIncompleteMultipartUpload.
 * On success fills abort_date/rule_id and returns true; returns false
 * when no rule applies or the LC config cannot be decoded. */
bool s3_multipart_abort_header(
  DoutPrefixProvider* dpp,
  const rgw_obj_key& obj_key,
  const ceph::real_time& mtime,
  const std::map<std::string, buffer::list>& bucket_attrs,
  ceph::real_time& abort_date,
  std::string& rule_id)
{
  CephContext* cct = dpp->get_cct();
  RGWLifecycleConfiguration config(cct);

  const auto& aiter = bucket_attrs.find(RGW_ATTR_LC);
  if (aiter == bucket_attrs.end())
    return false;

  bufferlist::const_iterator iter{&aiter->second};
  try {
    config.decode(iter);
  } catch (const buffer::error& e) {
    ldpp_dout(dpp, 0) << __func__
                      <<  "() decode life cycle config failed"
                      << dendl;
    return false;
  } /* catch */

  /* minimum abort date over all matching rules, and its rule id */
  std::optional<ceph::real_time> abort_date_tmp;
  std::optional<std::string_view> rule_id_tmp;
  const auto& rule_map = config.get_rule_map();
  for (const auto& ri : rule_map) {
    const auto& rule = ri.second;
    const auto& id = rule.get_id();
    const auto& filter = rule.get_filter();
    const auto& prefix = filter.has_prefix()?filter.get_prefix():rule.get_prefix();
    const auto& mp_expiration = rule.get_mp_expiration();
    if (!rule.is_enabled()) {
      continue;
    }
    if(!prefix.empty() && !boost::starts_with(obj_key.name, prefix)) {
      continue;
    }

    std::optional<ceph::real_time> rule_abort_date;
    if (mp_expiration.has_days()) {
      /* days-based: round mtime down to its UTC midnight, then add days+1 */
      rule_abort_date = std::optional<ceph::real_time>(
              mtime + make_timespan(mp_expiration.get_days()*24*60*60 - ceph::real_clock::to_time_t(mtime)%(24*60*60) + 24*60*60));
    }

    // update earliest abort date
    if (rule_abort_date) {
      if ((! abort_date_tmp) ||
          (*abort_date_tmp > *rule_abort_date)) {
        abort_date_tmp =
                std::optional<ceph::real_time>(rule_abort_date);
        rule_id_tmp = std::optional<std::string_view>(id);
      }
    }
  }
  if (abort_date_tmp && rule_id_tmp) {
    abort_date = *abort_date_tmp;
    rule_id = *rule_id_tmp;
    return true;
  } else {
    return false;
  }
}
+
+} /* namespace rgw::lc */
+
/* Emit this op's fields via the given Formatter (admin/debug output);
 * field order is part of the emitted format. */
void lc_op::dump(Formatter *f) const
{
  f->dump_bool("status", status);
  f->dump_bool("dm_expiration", dm_expiration);

  f->dump_int("expiration", expiration);
  f->dump_int("noncur_expiration", noncur_expiration);
  f->dump_int("mp_expiration", mp_expiration);
  if (expiration_date) {
    utime_t ut(*expiration_date);
    f->dump_stream("expiration_date") << ut;
  }
  if (obj_tags) {
    f->dump_object("obj_tags", *obj_tags);
  }
  f->open_object_section("transitions");
  for(auto& [storage_class, transition] : transitions) {
    f->dump_object(storage_class, transition);
  }
  f->close_section();

  f->open_object_section("noncur_transitions");
  for (auto& [storage_class, transition] : noncur_transitions) {
    f->dump_object(storage_class, transition);
  }
  f->close_section();
}
+
/* Emit this filter's prefix, tags, and flags via the given Formatter. */
void LCFilter::dump(Formatter *f) const
{
  f->dump_string("prefix", prefix);
  f->dump_object("obj_tags", obj_tags);
  if (have_flag(LCFlagType::ArchiveZone)) {
    f->dump_string("archivezone", "");
  }
}
+
/* Emit the raw days/date strings via the given Formatter. */
void LCExpiration::dump(Formatter *f) const
{
  f->dump_string("days", days);
  f->dump_string("date", date);
}
+
/* Emit all rule fields via the given Formatter; field order is part of
 * the emitted format. */
void LCRule::dump(Formatter *f) const
{
  f->dump_string("id", id);
  f->dump_string("prefix", prefix);
  f->dump_string("status", status);
  f->dump_object("expiration", expiration);
  f->dump_object("noncur_expiration", noncur_expiration);
  f->dump_object("mp_expiration", mp_expiration);
  f->dump_object("filter", filter);
  f->open_object_section("transitions");
  for (auto& [storage_class, transition] : transitions) {
    f->dump_object(storage_class, transition);
  }
  f->close_section();

  f->open_object_section("noncur_transitions");
  for (auto& [storage_class, transition] : noncur_transitions) {
    f->dump_object(storage_class, transition);
  }
  f->close_section();
  f->dump_bool("dm_expiration", dm_expiration);
}
+
+
/* Emit the prefix map and rule map via the given Formatter. */
void RGWLifecycleConfiguration::dump(Formatter *f) const
{
  f->open_object_section("prefix_map");
  for (auto& prefix : prefix_map) {
    f->dump_object(prefix.first.c_str(), prefix.second);
  }
  f->close_section();

  f->open_array_section("rule_map");
  for (auto& rule : rule_map) {
    f->open_object_section("entry");
    f->dump_string("id", rule.first);
    f->open_object_section("rule");
    rule.second.dump(f);
    f->close_section();
    f->close_section();
  }
  f->close_section();
}
+
diff --git a/src/rgw/rgw_lc.h b/src/rgw/rgw_lc.h
new file mode 100644
index 000000000..bd8efd9b6
--- /dev/null
+++ b/src/rgw/rgw_lc.h
@@ -0,0 +1,640 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <map>
+#include <array>
+#include <string>
+#include <iostream>
+
+#include "common/debug.h"
+
+#include "include/types.h"
+#include "include/rados/librados.hpp"
+#include "common/ceph_mutex.h"
+#include "common/Cond.h"
+#include "common/iso_8601.h"
+#include "common/Thread.h"
+#include "rgw_common.h"
+#include "cls/rgw/cls_rgw_types.h"
+#include "rgw_tag.h"
+#include "rgw_sal.h"
+
+#include <atomic>
+#include <tuple>
+
+#define HASH_PRIME 7877
+#define MAX_ID_LEN 255
+static std::string lc_oid_prefix = "lc";
+static std::string lc_index_lock_name = "lc_process";
+
+extern const char* LC_STATUS[];
+
/* Per-bucket lifecycle processing state stored in an LC shard entry. */
typedef enum {
  lc_uninitial = 0,  /* entry created, never processed */
  lc_processing,     /* currently claimed by a worker */
  lc_failed,         /* last processing attempt returned an error */
  lc_complete,       /* processed successfully this cycle */
} LC_BUCKET_STATUS;
+
/* Days- or date-based expiration element of an LC rule (S3 Expiration /
 * NoncurrentVersionExpiration); days and date are mutually exclusive. */
class LCExpiration
{
protected:
  std::string days;
  //At present only current object has expiration date
  std::string date;
public:
  LCExpiration() {}
  LCExpiration(const std::string& _days, const std::string& _date) : days(_days), date(_date) {}

  void encode(bufferlist& bl) const {
    ENCODE_START(3, 2, bl);
    encode(days, bl);
    encode(date, bl);
    ENCODE_FINISH(bl);
  }
  void decode(bufferlist::const_iterator& bl) {
    DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
    decode(days, bl);
    if (struct_v >= 3) {
      /* `date` was added in encoding v3 */
      decode(date, bl);
    }
    DECODE_FINISH(bl);
  }
  void dump(Formatter *f) const;
//  static void generate_test_instances(list<ACLOwner*>& o);
  void set_days(const std::string& _days) { days = _days; }
  std::string get_days_str() const {
    return days;
  }
  int get_days() const {return atoi(days.c_str()); }
  bool has_days() const {
    return !days.empty();
  }
  void set_date(const std::string& _date) { date = _date; }
  std::string get_date() const {
    return date;
  }
  bool has_date() const {
    return !date.empty();
  }
  bool empty() const {
    return days.empty() && date.empty();
  }
  /* valid iff at most one of days/date is set, and days (when set) is > 0 */
  bool valid() const {
    if (!days.empty() && !date.empty()) {
      return false;
    } else if (!days.empty() && get_days() <= 0) {
      return false;
    }
    //We've checked date in xml parsing
    return true;
  }
};
WRITE_CLASS_ENCODER(LCExpiration)
+
/* Transition element of an LC rule: move objects to `storage_class`
 * after `days` or at `date` (mutually exclusive). */
class LCTransition
{
protected:
  std::string days;
  std::string date;
  std::string storage_class;

public:
  int get_days() const {
    return atoi(days.c_str());
  }

  std::string get_date() const {
    return date;
  }

  std::string get_storage_class() const {
    return storage_class;
  }

  bool has_days() const {
    return !days.empty();
  }

  bool has_date() const {
    return !date.empty();
  }

  bool empty() const {
    return days.empty() && date.empty();
  }

  /* valid iff at most one of days/date is set; note days == 0 is allowed
   * here (unlike LCExpiration, which requires > 0) */
  bool valid() const {
    if (!days.empty() && !date.empty()) {
      return false;
    } else if (!days.empty() && get_days() < 0) {
      return false;
    }
    //We've checked date in xml parsing
    return true;
  }

  void encode(bufferlist& bl) const {
    ENCODE_START(1, 1, bl);
    encode(days, bl);
    encode(date, bl);
    encode(storage_class, bl);
    ENCODE_FINISH(bl);
  }

  void decode(bufferlist::const_iterator& bl) {
    DECODE_START(1, bl);
    decode(days, bl);
    decode(date, bl);
    decode(storage_class, bl);
    DECODE_FINISH(bl);
  }
  void dump(Formatter *f) const {
    f->dump_string("days", days);
    f->dump_string("date", date);
    f->dump_string("storage_class", storage_class);
  }
};
WRITE_CLASS_ENCODER(LCTransition)
+
/* Ordinals for non-S3 filter extensions; converted to a bitmask by
 * LCFilter::make_flag() (none maps to 0, i.e. no bits set). */
enum class LCFlagType : uint16_t
{
  none = 0,
  ArchiveZone,
};
+
/* Pairs an LCFlagType ordinal with its wire/XML name (see
 * LCFilter::filter_flags). */
class LCFlag {
public:
  LCFlagType bit;
  const char* name;

  constexpr LCFlag(LCFlagType ord, const char* name) : bit(ord), name(name)
    {}
};
+
/* S3 <Filter> element of an LC rule: optional prefix, tag set, and
 * extension flags (e.g. ArchiveZone). */
class LCFilter
{
 public:

  /* map an LCFlagType ordinal to its bitmask: none -> 0, the Nth
   * non-none flag -> bit N-1 */
  static constexpr uint32_t make_flag(LCFlagType type) {
    switch (type) {
    case LCFlagType::none:
      return 0;
      break;
    default:
      return 1 << (uint32_t(type) - 1);
    }
  }

  static constexpr std::array<LCFlag, 2> filter_flags =
  {
    LCFlag(LCFlagType::none, "none"),
    LCFlag(LCFlagType::ArchiveZone, "ArchiveZone"),
  };

protected:
  std::string prefix;
  RGWObjTags obj_tags;
  uint32_t flags;

public:

  LCFilter() : flags(make_flag(LCFlagType::none))
    {}

  const std::string& get_prefix() const {
    return prefix;
  }

  const RGWObjTags& get_tags() const {
    return obj_tags;
  }

  const uint32_t get_flags() const {
    return flags;
  }

  bool empty() const {
    return !(has_prefix() || has_tags() || has_flags());
  }

  // Determine if we need AND tag when creating xml
  bool has_multi_condition() const {
    if (obj_tags.count() + int(has_prefix()) + int(has_flags()) > 1) // Prefix is a member of Filter
      return true;
    return false;
  }

  bool has_prefix() const {
    return !prefix.empty();
  }

  bool has_tags() const {
    return !obj_tags.empty();
  }

  bool has_flags() const {
    return !(flags == uint32_t(LCFlagType::none));
  }

  bool have_flag(LCFlagType flag) const {
    return flags & make_flag(flag);
  }

  void encode(bufferlist& bl) const {
    ENCODE_START(3, 1, bl);
    encode(prefix, bl);
    encode(obj_tags, bl);
    encode(flags, bl);
    ENCODE_FINISH(bl);
  }
  void decode(bufferlist::const_iterator& bl) {
    DECODE_START(3, bl);
    decode(prefix, bl);
    if (struct_v >= 2) {
      /* obj_tags added in v2, flags in v3 */
      decode(obj_tags, bl);
      if (struct_v >= 3) {
        decode(flags, bl);
      }
    }
    DECODE_FINISH(bl);
  }
  void dump(Formatter *f) const;
};
WRITE_CLASS_ENCODER(LCFilter)
+
+class LCRule
+{
+protected:
+ std::string id;
+ std::string prefix;
+ std::string status;
+ LCExpiration expiration;
+ LCExpiration noncur_expiration;
+ LCExpiration mp_expiration;
+ LCFilter filter;
+ std::map<std::string, LCTransition> transitions;
+ std::map<std::string, LCTransition> noncur_transitions;
+ bool dm_expiration = false;
+
+public:
+
+ LCRule(){};
+ virtual ~LCRule() {}
+
+ const std::string& get_id() const {
+ return id;
+ }
+
+ const std::string& get_status() const {
+ return status;
+ }
+
+ bool is_enabled() const {
+ return status == "Enabled";
+ }
+
+ void set_enabled(bool flag) {
+ status = (flag ? "Enabled" : "Disabled");
+ }
+
+ const std::string& get_prefix() const {
+ return prefix;
+ }
+
+ const LCFilter& get_filter() const {
+ return filter;
+ }
+
+ const LCExpiration& get_expiration() const {
+ return expiration;
+ }
+
+ const LCExpiration& get_noncur_expiration() const {
+ return noncur_expiration;
+ }
+
+ const LCExpiration& get_mp_expiration() const {
+ return mp_expiration;
+ }
+
+ bool get_dm_expiration() const {
+ return dm_expiration;
+ }
+
+ const std::map<std::string, LCTransition>& get_transitions() const {
+ return transitions;
+ }
+
+ const std::map<std::string, LCTransition>& get_noncur_transitions() const {
+ return noncur_transitions;
+ }
+
+ void set_id(const std::string& _id) {
+ id = _id;
+ }
+
+ void set_prefix(const std::string& _prefix) {
+ prefix = _prefix;
+ }
+
+ void set_status(const std::string& _status) {
+ status = _status;
+ }
+
+ void set_expiration(const LCExpiration& _expiration) {
+ expiration = _expiration;
+ }
+
+ void set_noncur_expiration(const LCExpiration& _noncur_expiration) {
+ noncur_expiration = _noncur_expiration;
+ }
+
+ void set_mp_expiration(const LCExpiration& _mp_expiration) {
+ mp_expiration = _mp_expiration;
+ }
+
+ void set_dm_expiration(bool _dm_expiration) {
+ dm_expiration = _dm_expiration;
+ }
+
+ bool add_transition(const LCTransition& _transition) {
+ auto ret = transitions.emplace(_transition.get_storage_class(), _transition);
+ return ret.second;
+ }
+
+  // Add a noncurrent-version transition, keyed by destination storage
+  // class. Returns false if one already exists for that storage class.
+  bool add_noncur_transition(const LCTransition& _noncur_transition) {
+    auto ret = noncur_transitions.emplace(_noncur_transition.get_storage_class(), _noncur_transition);
+    return ret.second;
+  }
+
+ bool valid() const;
+
+  // Serialize the rule. Encoding version 6, compat 1; field order must
+  // match decode() exactly (see decode() for the per-version history of
+  // when each field was introduced).
+  void encode(bufferlist& bl) const {
+    ENCODE_START(6, 1, bl);
+    encode(id, bl);
+    encode(prefix, bl);
+    encode(status, bl);
+    encode(expiration, bl);
+    encode(noncur_expiration, bl);
+    encode(mp_expiration, bl);
+    encode(dm_expiration, bl);
+    encode(filter, bl);
+    encode(transitions, bl);
+    encode(noncur_transitions, bl);
+    ENCODE_FINISH(bl);
+  }
+  // Deserialize a rule encoded by any past version. Fields absent from
+  // older encodings keep their default values (set at declaration).
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START_LEGACY_COMPAT_LEN(6, 1, 1, bl);
+    decode(id, bl);
+    decode(prefix, bl);
+    decode(status, bl);
+    decode(expiration, bl);
+    if (struct_v >=2) {
+      // v2 added noncurrent-version expiration
+      decode(noncur_expiration, bl);
+    }
+    if (struct_v >= 3) {
+      // v3 added incomplete-multipart expiration
+      decode(mp_expiration, bl);
+    }
+    if (struct_v >= 4) {
+      // v4 added the delete-marker expiration flag
+      decode(dm_expiration, bl);
+    }
+    if (struct_v >= 5) {
+      // v5 added the Filter element (prefix/tags/flags)
+      decode(filter, bl);
+    }
+    if (struct_v >= 6) {
+      // v6 added storage-class transitions
+      decode(transitions, bl);
+      decode(noncur_transitions, bl);
+    }
+    DECODE_FINISH(bl);
+  }
+ void dump(Formatter *f) const;
+
+ void init_simple_days_rule(std::string_view _id, std::string_view _prefix, int num_days);
+};
+WRITE_CLASS_ENCODER(LCRule)
+
+/* action descriptor for a lifecycle transition: either a relative day
+ * count or an absolute date (exactly one is meaningful), plus the
+ * destination storage class */
+struct transition_action
+{
+  int days{0};
+  boost::optional<ceph::real_time> date;
+  std::string storage_class;
+  transition_action() = default;
+  void dump(Formatter *f) const {
+    /* an absolute date takes precedence over the day count */
+    if (date) {
+      utime_t ut(*date);
+      f->dump_stream("date") << ut;
+    } else {
+      f->dump_int("days", days);
+    }
+  }
+};
+
+/* XXX why not LCRule? */
+struct lc_op
+{
+ std::string id;
+ bool status{false};
+ bool dm_expiration{false};
+ int expiration{0};
+ int noncur_expiration{0};
+ int mp_expiration{0};
+ boost::optional<ceph::real_time> expiration_date;
+ boost::optional<RGWObjTags> obj_tags;
+ std::map<std::string, transition_action> transitions;
+ std::map<std::string, transition_action> noncur_transitions;
+ uint32_t rule_flags;
+
+ /* ctors are nice */
+ lc_op() = delete;
+
+ lc_op(const std::string id) : id(id)
+ {}
+
+ void dump(Formatter *f) const;
+};
+
+/* a bucket's full lifecycle configuration: the rule set as configured
+ * (rule_map) plus the derived per-prefix ops (prefix_map) rebuilt on
+ * decode via _add_rule() */
+class RGWLifecycleConfiguration
+{
+protected:
+  CephContext *cct;
+  std::multimap<std::string, lc_op> prefix_map;
+  std::multimap<std::string, LCRule> rule_map;
+  bool _add_rule(const LCRule& rule);
+  bool has_same_action(const lc_op& first, const lc_op& second);
+public:
+  explicit RGWLifecycleConfiguration(CephContext *_cct) : cct(_cct) {}
+  RGWLifecycleConfiguration() : cct(nullptr) {}
+
+  void set_ctx(CephContext *ctx) {
+    cct = ctx;
+  }
+
+  virtual ~RGWLifecycleConfiguration() {}
+
+// int get_perm(std::string& id, int perm_mask);
+// int get_group_perm(ACLGroupTypeEnum group, int perm_mask);
+  // only rule_map is persisted; prefix_map is derived state
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(rule_map, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START_LEGACY_COMPAT_LEN(1, 1, 1, bl);
+    decode(rule_map, bl);
+    // rebuild the derived prefix_map from the decoded rules
+    for (auto& item : rule_map) {
+      _add_rule(item.second);
+    }
+    DECODE_FINISH(bl);
+  }
+  void dump(Formatter *f) const;
+  static void generate_test_instances(std::list<RGWLifecycleConfiguration*>& o);
+
+  void add_rule(const LCRule& rule);
+
+  int check_and_add_rule(const LCRule& rule);
+
+  bool valid();
+
+  std::multimap<std::string, LCRule>& get_rule_map() { return rule_map; }
+  std::multimap<std::string, lc_op>& get_prefix_map() { return prefix_map; }
+/*
+  void create_default(std::string id, std::string name) {
+    ACLGrant grant;
+    grant.set_canon(id, name, RGW_PERM_FULL_CONTROL);
+    add_grant(&grant);
+  }
+*/
+};
+WRITE_CLASS_ENCODER(RGWLifecycleConfiguration)
+
+// Lifecycle processing engine: owns a pool of LCWorker threads that walk
+// the lc shards and apply each bucket's lifecycle configuration.
+class RGWLC : public DoutPrefixProvider {
+  CephContext *cct;
+  rgw::sal::Driver* driver;
+  std::unique_ptr<rgw::sal::Lifecycle> sal_lc;
+  int max_objs{0};
+  std::string *obj_names{nullptr};
+  std::atomic<bool> down_flag = { false };
+  std::string cookie;
+
+public:
+
+  class WorkPool;
+
+  // one lifecycle worker thread; index ix selects the shards it serves
+  class LCWorker : public Thread
+  {
+    const DoutPrefixProvider *dpp;
+    CephContext *cct;
+    RGWLC *lc;
+    int ix;
+    std::mutex lock;
+    std::condition_variable cond;
+    WorkPool* workpool{nullptr};
+    /* save the target bucket names created as part of object transition
+     * to cloud. This list is maintained for the duration of each RGWLC::process()
+     * post which it is discarded. */
+    std::set<std::string> cloud_targets;
+
+  public:
+
+    using lock_guard = std::lock_guard<std::mutex>;
+    using unique_lock = std::unique_lock<std::mutex>;
+
+    LCWorker(const DoutPrefixProvider* dpp, CephContext *_cct, RGWLC *_lc,
+	     int ix);
+    RGWLC* get_lc() { return lc; }
+
+    std::string thr_name() {
+      return std::string{"lc_thrd: "} + std::to_string(ix);
+    }
+
+    void *entry() override;
+    void stop();
+    // whether the configured lc work-time window includes `now`
+    bool should_work(utime_t& now);
+    int schedule_next_start_time(utime_t& start, utime_t& now);
+    std::set<std::string>& get_cloud_targets() { return cloud_targets; }
+    virtual ~LCWorker() override;
+
+    friend class RGWRados;
+    friend class RGWLC;
+    friend class WorkQ;
+  }; /* LCWorker */
+
+  friend class RGWRados;
+
+  std::vector<std::unique_ptr<RGWLC::LCWorker>> workers;
+
+  RGWLC() : cct(nullptr), driver(nullptr) {}
+  virtual ~RGWLC() override;
+
+  void initialize(CephContext *_cct, rgw::sal::Driver* _driver);
+  void finalize();
+
+  // process all shards (or just optional_bucket, when supplied)
+  int process(LCWorker* worker,
+	      const std::unique_ptr<rgw::sal::Bucket>& optional_bucket,
+	      bool once);
+  int advance_head(const std::string& lc_shard,
+		   rgw::sal::Lifecycle::LCHead& head,
+		   rgw::sal::Lifecycle::LCEntry& entry,
+		   time_t start_date);
+  int process(int index, int max_lock_secs, LCWorker* worker, bool once);
+  int process_bucket(int index, int max_lock_secs, LCWorker* worker,
+		     const std::string& bucket_entry_marker, bool once);
+  bool expired_session(time_t started);
+  time_t thread_stop_at();
+  int list_lc_progress(std::string& marker, uint32_t max_entries,
+		       std::vector<std::unique_ptr<rgw::sal::Lifecycle::LCEntry>>&,
+		       int& index);
+  int bucket_lc_process(std::string& shard_id, LCWorker* worker, time_t stop_at,
+			bool once);
+  int bucket_lc_post(int index, int max_lock_sec,
+		     rgw::sal::Lifecycle::LCEntry& entry, int& result, LCWorker* worker);
+  bool going_down();
+  void start_processor();
+  void stop_processor();
+  int set_bucket_config(rgw::sal::Bucket* bucket,
+                        const rgw::sal::Attrs& bucket_attrs,
+			RGWLifecycleConfiguration *config);
+  int remove_bucket_config(rgw::sal::Bucket* bucket,
+                           const rgw::sal::Attrs& bucket_attrs,
+			   bool merge_attrs = true);
+
+  CephContext *get_cct() const override { return cct; }
+  rgw::sal::Lifecycle* get_lc() const { return sal_lc.get(); }
+  unsigned get_subsys() const;
+  std::ostream& gen_prefix(std::ostream& out) const;
+
+  private:
+
+  int handle_multipart_expiration(rgw::sal::Bucket* target,
+				  const std::multimap<std::string, lc_op>& prefix_map,
+				  LCWorker* worker, time_t stop_at, bool once);
+};
+
+namespace rgw::lc {
+
+// Repair a bucket's missing/stale lc shard entry so the bucket is
+// (re)enrolled for lifecycle processing.
+int fix_lc_shard_entry(const DoutPrefixProvider *dpp,
+                       rgw::sal::Driver* driver,
+		       rgw::sal::Lifecycle* sal_lc,
+		       rgw::sal::Bucket* bucket);
+
+// Compute the value of the S3 "x-amz-expiration" response header for an
+// object, from the bucket's lifecycle configuration attributes.
+std::string s3_expiration_header(
+  DoutPrefixProvider* dpp,
+  const rgw_obj_key& obj_key,
+  const RGWObjTags& obj_tagset,
+  const ceph::real_time& mtime,
+  const std::map<std::string, buffer::list>& bucket_attrs);
+
+// Compute the S3 "x-amz-abort-date"/"x-amz-abort-rule-id" header values
+// for an in-progress multipart upload; returns true if a matching
+// abort-incomplete-multipart rule applies.
+bool s3_multipart_abort_header(
+  DoutPrefixProvider* dpp,
+  const rgw_obj_key& obj_key,
+  const ceph::real_time& mtime,
+  const std::map<std::string, buffer::list>& bucket_attrs,
+  ceph::real_time& abort_date,
+  std::string& rule_id);
+
+} // namespace rgw::lc
diff --git a/src/rgw/rgw_lc_s3.cc b/src/rgw/rgw_lc_s3.cc
new file mode 100644
index 000000000..cf152b84a
--- /dev/null
+++ b/src/rgw/rgw_lc_s3.cc
@@ -0,0 +1,353 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <string.h>
+
+#include <iostream>
+#include <map>
+
+#include "include/types.h"
+
+#include "rgw_user.h"
+#include "rgw_lc_s3.h"
+
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+/* a lifecycle date is valid only if it parses as ISO-8601 and falls
+ * exactly on a midnight boundary, per the S3 spec */
+static bool check_date(const string& s)
+{
+  const boost::optional<ceph::real_time> parsed = ceph::from_iso_8601(s);
+  if (!parsed) {
+    return false;
+  }
+  const struct timespec ts = ceph::real_clock::to_timespec(*parsed);
+  const bool at_midnight = (ts.tv_sec % (24*60*60) == 0) && (ts.tv_nsec == 0);
+  return at_midnight;
+}
+
+// Emit exactly one child element, in priority order:
+// ExpiredObjectDeleteMarker, then Days, then Date (mirrors the
+// one-of-three constraint enforced by decode_xml).
+void LCExpiration_S3::dump_xml(Formatter *f) const {
+  if (dm_expiration) {
+    encode_xml("ExpiredObjectDeleteMarker", "true", f);
+  } else if (!days.empty()) {
+    encode_xml("Days", days, f);
+  } else {
+    encode_xml("Date", date, f);
+  }
+}
+
+/* parse an <Expiration> element; exactly one of Days, Date or
+ * ExpiredObjectDeleteMarker may be present */
+void LCExpiration_S3::decode_xml(XMLObj *obj)
+{
+  const bool has_days = RGWXMLDecoder::decode_xml("Days", days, obj);
+  const bool has_date = RGWXMLDecoder::decode_xml("Date", date, obj);
+  string dm;
+  const bool has_dm =
+    RGWXMLDecoder::decode_xml("ExpiredObjectDeleteMarker", dm, obj);
+
+  const int num_present =
+    (has_days ? 1 : 0) + (has_date ? 1 : 0) + (has_dm ? 1 : 0);
+  if (num_present != 1) {
+    throw RGWXMLDecoder::err("bad Expiration section");
+  }
+
+  if (has_date && !check_date(date)) {
+    //We need return xml error according to S3
+    throw RGWXMLDecoder::err("bad date in Date section");
+  }
+
+  if (has_dm) {
+    dm_expiration = (dm == "true");
+  }
+}
+
+// Parse <NoncurrentVersionExpiration>; NoncurrentDays is mandatory
+// (last arg `true`) and its absence throws.
+void LCNoncurExpiration_S3::decode_xml(XMLObj *obj)
+{
+  RGWXMLDecoder::decode_xml("NoncurrentDays", days, obj, true);
+}
+
+// Emit the NoncurrentDays child element.
+void LCNoncurExpiration_S3::dump_xml(Formatter *f) const
+{
+  encode_xml("NoncurrentDays", days, f);
+}
+
+// Parse <AbortIncompleteMultipartUpload>; DaysAfterInitiation is
+// mandatory (last arg `true`) and its absence throws.
+void LCMPExpiration_S3::decode_xml(XMLObj *obj)
+{
+  RGWXMLDecoder::decode_xml("DaysAfterInitiation", days, obj, true);
+}
+
+// Emit the DaysAfterInitiation child element.
+void LCMPExpiration_S3::dump_xml(Formatter *f) const
+{
+  encode_xml("DaysAfterInitiation", days, f);
+}
+
+/* parse a <LifecycleConfiguration>: decode every <Rule>, assign random
+ * ids to rules that lack one, then enforce the configured rule limit */
+void RGWLifecycleConfiguration_S3::decode_xml(XMLObj *obj)
+{
+  if (!cct) {
+    throw RGWXMLDecoder::err("ERROR: RGWLifecycleConfiguration_S3 can't be decoded without cct initialized");
+  }
+
+  vector<LCRule_S3> parsed_rules;
+  RGWXMLDecoder::decode_xml("Rule", parsed_rules, obj, true);
+
+  for (auto& parsed : parsed_rules) {
+    if (parsed.get_id().empty()) {
+      // S3 generates a 48 bit random ID, maybe we could generate shorter IDs
+      static constexpr auto LC_ID_LENGTH = 48;
+      parsed.set_id(gen_rand_alphanumeric_lower(cct, LC_ID_LENGTH));
+    }
+    add_rule(parsed);
+  }
+
+  if (cct->_conf->rgw_lc_max_rules < rule_map.size()) {
+    stringstream ss;
+    ss << "Warn: The lifecycle config has too many rules, rule number is:"
+       << rule_map.size() << ", max number is:" << cct->_conf->rgw_lc_max_rules;
+    throw RGWXMLDecoder::err(ss.str());
+  }
+}
+
+/* emit a <Filter>; when more than one predicate is present they must be
+ * wrapped in an <And> element per the S3 schema */
+void LCFilter_S3::dump_xml(Formatter *f) const
+{
+  const bool wrap_in_and = has_multi_condition();
+  if (wrap_in_and) {
+    f->open_array_section("And");
+  }
+  if (has_prefix()) {
+    encode_xml("Prefix", prefix, f);
+  }
+  if (has_tags()) {
+    static_cast<const RGWObjTagSet_S3&>(obj_tags).dump_xml(f);
+  }
+  if (has_flags() && have_flag(LCFlagType::ArchiveZone)) {
+    encode_xml("ArchiveZone", "", f);
+  }
+  if (wrap_in_and) {
+    f->close_section(); // And
+  }
+}
+
+// Parse a <Filter> element. Predicates may appear directly under Filter
+// or wrapped in an <And>; both layouts are accepted by treating the And
+// node (when present) as the effective parent.
+void LCFilter_S3::decode_xml(XMLObj *obj)
+{
+  /*
+   * The prior logic here looked for an And element, but did not
+   * structurally parse the Filter clause (and incorrectly rejected
+   * the base case where a Prefix and one Tag were supplied). It
+   * could not reject generally malformed Filter syntax.
+   *
+   * Empty filters are allowed:
+   * https://docs.aws.amazon.com/AmazonS3/latest/dev/intro-lifecycle-rules.html
+   */
+  XMLObj* o = obj->find_first("And");
+  if (o == nullptr){
+    o = obj;
+  }
+
+  // Prefix is optional; absent leaves `prefix` empty
+  RGWXMLDecoder::decode_xml("Prefix", prefix, o);
+
+  /* parse optional ArchiveZone flag (extension) */
+  if (o->find_first("ArchiveZone")) {
+    flags |= make_flag(LCFlagType::ArchiveZone);
+  }
+
+  // NOTE(review): clear() guards against decode_xml being invoked on an
+  // already-populated object (e.g. element re-use) -- confirm whether
+  // that can actually happen here
+  obj_tags.clear(); // why is this needed?
+  // collect every <Tag><Key>/<Value> pair under the effective parent
+  auto tags_iter = o->find("Tag");
+  while (auto tag_xml = tags_iter.get_next()){
+    std::string _key,_val;
+    RGWXMLDecoder::decode_xml("Key", _key, tag_xml);
+    RGWXMLDecoder::decode_xml("Value", _val, tag_xml);
+    obj_tags.emplace_tag(std::move(_key), std::move(_val));
+  }
+}
+
+/* parse a <Transition>: exactly one of Days/Date, plus a mandatory
+ * StorageClass */
+void LCTransition_S3::decode_xml(XMLObj *obj)
+{
+  const bool has_days = RGWXMLDecoder::decode_xml("Days", days, obj);
+  const bool has_date = RGWXMLDecoder::decode_xml("Date", date, obj);
+  /* both present or both absent is malformed */
+  if (has_days == has_date) {
+    throw RGWXMLDecoder::err("bad Transition section");
+  }
+
+  if (has_date && !check_date(date)) {
+    //We need return xml error according to S3
+    throw RGWXMLDecoder::err("bad Date in Transition section");
+  }
+
+  if (!RGWXMLDecoder::decode_xml("StorageClass", storage_class, obj)) {
+    throw RGWXMLDecoder::err("missing StorageClass in Transition section");
+  }
+}
+
+// Emit Days when set, otherwise Date, then the StorageClass (mirrors
+// the one-of-two constraint enforced by decode_xml).
+void LCTransition_S3::dump_xml(Formatter *f) const {
+  if (!days.empty()) {
+    encode_xml("Days", days, f);
+  } else {
+    encode_xml("Date", date, f);
+  }
+  encode_xml("StorageClass", storage_class, f);
+}
+
+// Parse <NoncurrentVersionTransition>; both NoncurrentDays and
+// StorageClass are required.
+void LCNoncurTransition_S3::decode_xml(XMLObj *obj)
+{
+  if (!RGWXMLDecoder::decode_xml("NoncurrentDays", days, obj)) {
+    throw RGWXMLDecoder::err("missing NoncurrentDays in NoncurrentVersionTransition section");
+  }
+  if (!RGWXMLDecoder::decode_xml("StorageClass", storage_class, obj)) {
+    throw RGWXMLDecoder::err("missing StorageClass in NoncurrentVersionTransition section");
+  }
+}
+
+// Emit the NoncurrentDays and StorageClass child elements.
+void LCNoncurTransition_S3::dump_xml(Formatter *f) const
+{
+  encode_xml("NoncurrentDays", days, f);
+  encode_xml("StorageClass", storage_class, f);
+}
+
+// Parse one <Rule>. Accepts both the modern schema (Filter mandatory)
+// and the legacy one (top-level Prefix); requires Status and at least
+// one action (expiration or transition). Throws RGWXMLDecoder::err on
+// malformed input.
+void LCRule_S3::decode_xml(XMLObj *obj)
+{
+  // reset state in case this object is reused for another element
+  id.clear();
+  prefix.clear();
+  status.clear();
+  dm_expiration = false;
+
+  RGWXMLDecoder::decode_xml("ID", id, obj);
+
+  LCFilter_S3 filter_s3;
+  if (!RGWXMLDecoder::decode_xml("Filter", filter_s3, obj)) {
+    // Ideally the following code should be deprecated and we should return
+    // False here, The new S3 LC configuration xml spec. makes Filter mandatory
+    // and Prefix optional. However older clients including boto2 still generate
+    // xml according to the older spec, where Prefix existed outside of Filter
+    // and S3 itself seems to be sloppy on enforcing the mandatory Filter
+    // argument. A day will come when S3 enforces their own xml-spec, but it is
+    // not this day
+
+    if (!RGWXMLDecoder::decode_xml("Prefix", prefix, obj)) {
+      throw RGWXMLDecoder::err("missing Prefix in Filter");
+    }
+  }
+  filter = (LCFilter)filter_s3;
+
+  // Status is a direct child of Rule (error text previously said
+  // "in Filter", which was wrong)
+  if (!RGWXMLDecoder::decode_xml("Status", status, obj)) {
+    throw RGWXMLDecoder::err("missing Status in Rule");
+  }
+  if (status.compare("Enabled") != 0 && status.compare("Disabled") != 0) {
+    throw RGWXMLDecoder::err("bad Status in Rule");
+  }
+
+  LCExpiration_S3 s3_expiration;
+  LCNoncurExpiration_S3 s3_noncur_expiration;
+  LCMPExpiration_S3 s3_mp_expiration;
+
+  bool has_expiration = RGWXMLDecoder::decode_xml("Expiration", s3_expiration, obj);
+  bool has_noncur_expiration = RGWXMLDecoder::decode_xml("NoncurrentVersionExpiration", s3_noncur_expiration, obj);
+  bool has_mp_expiration = RGWXMLDecoder::decode_xml("AbortIncompleteMultipartUpload", s3_mp_expiration, obj);
+
+  vector<LCTransition_S3> transitions;
+  vector<LCNoncurTransition_S3> noncur_transitions;
+
+  bool has_transition = RGWXMLDecoder::decode_xml("Transition", transitions, obj);
+  bool has_noncur_transition = RGWXMLDecoder::decode_xml("NoncurrentVersionTransition", noncur_transitions, obj);
+
+  // a rule with no action at all is invalid
+  if (!has_expiration &&
+      !has_noncur_expiration &&
+      !has_mp_expiration &&
+      !has_transition &&
+      !has_noncur_transition) {
+    throw RGWXMLDecoder::err("bad Rule");
+  }
+
+  if (has_expiration) {
+    if (s3_expiration.has_days() ||
+        s3_expiration.has_date()) {
+      expiration = s3_expiration;
+    } else {
+      // an Expiration with only ExpiredObjectDeleteMarker
+      dm_expiration = s3_expiration.get_dm_expiration();
+    }
+  }
+  if (has_noncur_expiration) {
+    noncur_expiration = s3_noncur_expiration;
+  }
+  if (has_mp_expiration) {
+    mp_expiration = s3_mp_expiration;
+  }
+  for (auto& t : transitions) {
+    // duplicate StorageClass targets are rejected by add_transition
+    if (!add_transition(t)) {
+      throw RGWXMLDecoder::err("Failed to add transition");
+    }
+  }
+  for (auto& t : noncur_transitions) {
+    if (!add_noncur_transition(t)) {
+      throw RGWXMLDecoder::err("Failed to add non-current version transition");
+    }
+  }
+}
+
+/* emit one <Rule>; prefers the Filter form and falls back to the legacy
+ * top-level Prefix when the filter is empty */
+void LCRule_S3::dump_xml(Formatter *f) const {
+  encode_xml("ID", id, f);
+  // In case of an empty filter and an empty Prefix, we defer to Prefix.
+  if (!filter.empty()) {
+    encode_xml("Filter", static_cast<const LCFilter_S3&>(filter), f);
+  } else {
+    encode_xml("Prefix", prefix, f);
+  }
+  encode_xml("Status", status, f);
+  if (!expiration.empty() || dm_expiration) {
+    LCExpiration_S3 expir(expiration.get_days_str(), expiration.get_date(),
+                          dm_expiration);
+    encode_xml("Expiration", expir, f);
+  }
+  if (!noncur_expiration.empty()) {
+    encode_xml("NoncurrentVersionExpiration",
+               static_cast<const LCNoncurExpiration_S3&>(noncur_expiration), f);
+  }
+  if (!mp_expiration.empty()) {
+    encode_xml("AbortIncompleteMultipartUpload",
+               static_cast<const LCMPExpiration_S3&>(mp_expiration), f);
+  }
+  for (const auto& elem : transitions) {
+    encode_xml("Transition",
+               static_cast<const LCTransition_S3&>(elem.second), f);
+  }
+  for (const auto& elem : noncur_transitions) {
+    encode_xml("NoncurrentVersionTransition",
+               static_cast<const LCNoncurTransition_S3&>(elem.second), f);
+  }
+}
+
+/* validate-and-copy every parsed rule into `dest`; returns the first
+ * negative error from check_and_add_rule, or -ERR_INVALID_REQUEST when
+ * the resulting configuration as a whole is invalid */
+int RGWLifecycleConfiguration_S3::rebuild(RGWLifecycleConfiguration& dest)
+{
+  int ret = 0;
+  for (auto& item : rule_map) {
+    ret = dest.check_and_add_rule(item.second);
+    if (ret < 0)
+      return ret;
+  }
+  if (!dest.valid()) {
+    ret = -ERR_INVALID_REQUEST;
+  }
+  return ret;
+}
+
+
+/* emit every rule in the configuration as an S3 <Rule> element */
+void RGWLifecycleConfiguration_S3::dump_xml(Formatter *f) const
+{
+  for (const auto& item : rule_map) {
+    encode_xml("Rule", static_cast<const LCRule_S3&>(item.second), f);
+  }
+}
+
diff --git a/src/rgw/rgw_lc_s3.h b/src/rgw/rgw_lc_s3.h
new file mode 100644
index 000000000..5486aef35
--- /dev/null
+++ b/src/rgw/rgw_lc_s3.h
@@ -0,0 +1,100 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <iostream>
+#include <include/types.h>
+
+#include "include/str_list.h"
+#include "rgw_lc.h"
+#include "rgw_xml.h"
+#include "rgw_tag_s3.h"
+
+
+// S3 XML (de)serialization veneer over LCFilter.
+class LCFilter_S3 : public LCFilter
+{
+public:
+  void dump_xml(Formatter *f) const;
+  void decode_xml(XMLObj *obj);
+};
+
+// S3 XML (de)serialization veneer over LCExpiration, extended with the
+// ExpiredObjectDeleteMarker flag (which the base class does not carry).
+class LCExpiration_S3 : public LCExpiration
+{
+private:
+  bool dm_expiration{false};
+public:
+  LCExpiration_S3() {}
+  // sink params: taken by value and moved into the base
+  LCExpiration_S3(std::string _days, std::string _date, bool _dm_expiration)
+    : LCExpiration(std::move(_days), std::move(_date)),
+      dm_expiration(_dm_expiration) {}
+
+  void dump_xml(Formatter *f) const;
+  void decode_xml(XMLObj *obj);
+
+  void set_dm_expiration(bool _dm_expiration) {
+    dm_expiration = _dm_expiration;
+  }
+
+  // const added: the accessor does not mutate (backward compatible)
+  bool get_dm_expiration() const {
+    return dm_expiration;
+  }
+};
+
+// XML codec for <NoncurrentVersionExpiration> (NoncurrentDays).
+class LCNoncurExpiration_S3 : public LCExpiration
+{
+public:
+  LCNoncurExpiration_S3() {}
+
+  void decode_xml(XMLObj *obj);
+  void dump_xml(Formatter *f) const;
+};
+
+// XML codec for <AbortIncompleteMultipartUpload> (DaysAfterInitiation).
+class LCMPExpiration_S3 : public LCExpiration
+{
+public:
+  LCMPExpiration_S3() {}
+
+  void decode_xml(XMLObj *obj);
+  void dump_xml(Formatter *f) const;
+};
+
+// XML codec for <Transition> (Days/Date + StorageClass).
+class LCTransition_S3 : public LCTransition
+{
+public:
+  LCTransition_S3() {}
+
+  void decode_xml(XMLObj *obj);
+  void dump_xml(Formatter *f) const;
+};
+
+// XML codec for <NoncurrentVersionTransition> (NoncurrentDays +
+// StorageClass).
+class LCNoncurTransition_S3 : public LCTransition
+{
+public:
+  LCNoncurTransition_S3() {}
+  ~LCNoncurTransition_S3() {}
+
+  void decode_xml(XMLObj *obj);
+  void dump_xml(Formatter *f) const;
+};
+
+
+// XML codec for a single lifecycle <Rule>.
+class LCRule_S3 : public LCRule
+{
+public:
+  LCRule_S3() {}
+
+  void dump_xml(Formatter *f) const;
+  void decode_xml(XMLObj *obj);
+};
+
+// XML codec for a whole <LifecycleConfiguration>; rebuild() copies the
+// parsed rules into a plain RGWLifecycleConfiguration with validation.
+class RGWLifecycleConfiguration_S3 : public RGWLifecycleConfiguration
+{
+public:
+  explicit RGWLifecycleConfiguration_S3(CephContext *_cct) : RGWLifecycleConfiguration(_cct) {}
+  RGWLifecycleConfiguration_S3() : RGWLifecycleConfiguration(nullptr) {}
+
+  void decode_xml(XMLObj *obj);
+  int rebuild(RGWLifecycleConfiguration& dest);
+  void dump_xml(Formatter *f) const;
+};
diff --git a/src/rgw/rgw_ldap.cc b/src/rgw/rgw_ldap.cc
new file mode 100644
index 000000000..7ad6b74b1
--- /dev/null
+++ b/src/rgw/rgw_ldap.cc
@@ -0,0 +1,130 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_ldap.h"
+
+#include "common/ceph_crypto.h"
+#include "common/ceph_context.h"
+#include "common/common_init.h"
+#include "common/dout.h"
+#include "common/safe_io.h"
+#include <boost/algorithm/string.hpp>
+
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+// Read the LDAP bind password from the file named by rgw_ldap_secret,
+// trimming surrounding whitespace. Returns an empty string when no
+// secret file is configured or it cannot be read.
+std::string parse_rgw_ldap_bindpw(CephContext* ctx)
+{
+  string ldap_bindpw;
+  string ldap_secret = ctx->_conf->rgw_ldap_secret;
+
+  if (ldap_secret.empty()) {
+    ldout(ctx, 10)
+      << __func__ << " LDAP auth no rgw_ldap_secret file found in conf"
+      << dendl;
+  } else {
+    // FIPS zeroization audit 20191116: this memset is not intended to
+    // wipe out a secret after use.
+    char bindpw[1024];
+    memset(bindpw, 0, 1024);
+    int pwlen = safe_read_file("" /* base */, ldap_secret.c_str(),
+			       bindpw, 1023);
+    if (pwlen > 0) {
+      ldap_bindpw = bindpw;
+      boost::algorithm::trim(ldap_bindpw);
+      // guard: trim() can leave the string empty (whitespace-only
+      // secret file), and back() on an empty string is UB
+      if (!ldap_bindpw.empty() && ldap_bindpw.back() == '\n')
+	ldap_bindpw.pop_back();
+    }
+    ::ceph::crypto::zeroize_for_security(bindpw, sizeof(bindpw));
+  }
+
+  return ldap_bindpw;
+}
+
+#if defined(HAVE_OPENLDAP)
+namespace rgw {
+
+  // Authenticate `uid` with password `pwd`: search for the user's DN
+  // under searchdn with a filter derived from the configuration, then
+  // attempt a simple bind as that DN. Returns LDAP_SUCCESS (0) on
+  // success, -EACCES on any failure. Serialized by mtx; retries the
+  // search once after a rebind if the directory connection dropped.
+  int LDAPHelper::auth(const std::string &uid, const std::string &pwd) {
+    int ret;
+    std::string filter;
+    if (msad) {
+      // Active Directory: match on sAMAccountName
+      filter = "(&(objectClass=user)(sAMAccountName=";
+      filter += uid;
+      filter += "))";
+    } else {
+      /* openldap */
+      if (searchfilter.empty()) {
+        /* no search filter provided in config, we construct our own */
+        filter = "(";
+        filter += dnattr;
+        filter += "=";
+        filter += uid;
+        filter += ")";
+      } else {
+        if (searchfilter.find("@USERNAME@") != std::string::npos) {
+          /* we need to substitute the @USERNAME@ placeholder */
+	  filter = searchfilter;
+	  filter.replace(searchfilter.find("@USERNAME@"), std::string("@USERNAME@").length(), uid);
+        } else {
+          /* no placeholder for username, so we need to append our own username filter to the custom searchfilter */
+          filter = "(&(";
+          filter += searchfilter;
+          filter += ")(";
+          filter += dnattr;
+          filter += "=";
+          filter += uid;
+          filter += "))";
+        }
+      }
+    }
+    ldout(g_ceph_context, 12)
+      << __func__ << " search filter: " << filter
+      << dendl;
+    char *attrs[] = { const_cast<char*>(dnattr.c_str()), nullptr };
+    LDAPMessage *answer = nullptr, *entry = nullptr;
+    bool once = true; // allow exactly one rebind-and-retry
+
+    lock_guard guard(mtx);
+
+  retry_bind:
+    ret = ldap_search_s(ldap, searchdn.c_str(), LDAP_SCOPE_SUBTREE,
+			filter.c_str(), attrs, 0, &answer);
+    if (ret == LDAP_SUCCESS) {
+      entry = ldap_first_entry(ldap, answer);
+      if (entry) {
+	char *dn = ldap_get_dn(ldap, entry);
+	// verify the password by binding as the found DN
+	ret = simple_bind(dn, pwd);
+	if (ret != LDAP_SUCCESS) {
+	  ldout(g_ceph_context, 10)
+	    << __func__ << " simple_bind failed uid=" << uid
+	    << "ldap err=" << ret
+	    << dendl;
+	}
+	ldap_memfree(dn);
+      } else {
+	ldout(g_ceph_context, 12)
+	  << __func__ << " ldap_search_s no user matching uid=" << uid
+	  << dendl;
+	ret = LDAP_NO_SUCH_ATTRIBUTE; // fixup result
+      }
+      ldap_msgfree(answer);
+    } else {
+      ldout(g_ceph_context, 5)
+	<< __func__ << " ldap_search_s error uid=" << uid
+	<< " ldap err=" << ret
+	<< dendl;
+      /* search should never fail--try to rebind */
+      if (once) {
+	rebind();
+	once = false;
+	goto retry_bind;
+      }
+    }
+    return (ret == LDAP_SUCCESS) ? ret : -EACCES;
+  } /* LDAPHelper::auth */
+}
+
+#endif /* defined(HAVE_OPENLDAP) */
diff --git a/src/rgw/rgw_ldap.h b/src/rgw/rgw_ldap.h
new file mode 100644
index 000000000..05a48ce19
--- /dev/null
+++ b/src/rgw/rgw_ldap.h
@@ -0,0 +1,138 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "acconfig.h"
+
+#if defined(HAVE_OPENLDAP)
+#define LDAP_DEPRECATED 1
+#include "ldap.h"
+#endif
+
+#include <stdint.h>
+#include <tuple>
+#include <vector>
+#include <string>
+#include <iostream>
+#include <mutex>
+
+namespace rgw {
+
+#if defined(HAVE_OPENLDAP)
+
+  // Thin RAII wrapper over an OpenLDAP connection used for RGW user
+  // authentication: init() + bind() establish the service connection;
+  // auth() (rgw_ldap.cc) verifies end-user credentials.
+  class LDAPHelper
+  {
+    std::string uri;
+    std::string binddn;
+    std::string bindpw;
+    std::string searchdn;
+    std::string searchfilter;
+    std::string dnattr;
+    LDAP *ldap;
+    bool msad = false; /* TODO: possible future specialization */
+    std::mutex mtx;    // serializes auth() against the shared handle
+
+  public:
+    using lock_guard = std::lock_guard<std::mutex>;
+
+    LDAPHelper(std::string _uri, std::string _binddn, std::string _bindpw,
+	       const std::string &_searchdn, const std::string &_searchfilter, const std::string &_dnattr)
+      : uri(std::move(_uri)), binddn(std::move(_binddn)),
+	bindpw(std::move(_bindpw)), searchdn(_searchdn), searchfilter(_searchfilter), dnattr(_dnattr),
+	ldap(nullptr) {
+      // nothing
+    }
+
+    // Create the LDAP handle and force protocol v3, no referrals.
+    // Returns 0 (LDAP_SUCCESS) or -EINVAL.
+    int init() {
+      int ret;
+      ret = ldap_initialize(&ldap, uri.c_str());
+      if (ret == LDAP_SUCCESS) {
+	unsigned long ldap_ver = LDAP_VERSION3;
+	ret = ldap_set_option(ldap, LDAP_OPT_PROTOCOL_VERSION,
+			      (void*) &ldap_ver);
+      }
+      if (ret == LDAP_SUCCESS) {
+	ret = ldap_set_option(ldap, LDAP_OPT_REFERRALS, LDAP_OPT_OFF);
+      }
+      return (ret == LDAP_SUCCESS) ? ret : -EINVAL;
+    }
+
+    // Bind the service connection with the configured binddn/bindpw.
+    int bind() {
+      int ret;
+      ret = ldap_simple_bind_s(ldap, binddn.c_str(), bindpw.c_str());
+      return (ret == LDAP_SUCCESS) ? ret : -EINVAL;
+    }
+
+    // Tear down and re-establish the service connection (used by auth()
+    // after a failed search).
+    int rebind() {
+      if (ldap) {
+	(void) ldap_unbind(ldap);
+	(void) init();
+	return bind();
+      }
+      return -EINVAL;
+    }
+
+    // Verify credentials by binding a short-lived, separate connection
+    // as `dn` -- the shared service connection stays bound as binddn.
+    int simple_bind(const char *dn, const std::string& pwd) {
+      LDAP* tldap;
+      int ret = ldap_initialize(&tldap, uri.c_str());
+      if (ret == LDAP_SUCCESS) {
+	unsigned long ldap_ver = LDAP_VERSION3;
+	ret = ldap_set_option(tldap, LDAP_OPT_PROTOCOL_VERSION,
+			      (void*) &ldap_ver);
+	if (ret == LDAP_SUCCESS) {
+	  ret = ldap_simple_bind_s(tldap, dn, pwd.c_str());
+	}
+	(void) ldap_unbind(tldap);
+      }
+      return ret; // OpenLDAP client error space
+    }
+
+    int auth(const std::string &uid, const std::string &pwd);
+
+    ~LDAPHelper() {
+      if (ldap)
+	(void) ldap_unbind(ldap);
+    }
+
+  }; /* LDAPHelper */
+
+#else
+
+  // Stub used when built without OpenLDAP: same interface, but init()
+  // and bind() report -ENOTSUP and auth() always denies (-EACCES).
+  class LDAPHelper
+  {
+  public:
+    LDAPHelper(const std::string &_uri, const std::string &_binddn, const std::string &_bindpw,
+               const std::string &_searchdn, const std::string &_searchfilter, const std::string &_dnattr)
+      {}
+
+    int init() {
+      return -ENOTSUP;
+    }
+
+    int bind() {
+      return -ENOTSUP;
+    }
+
+    int auth(const std::string &uid, const std::string &pwd) {
+      return -EACCES;
+    }
+
+    ~LDAPHelper() {}
+
+  }; /* LDAPHelper */
+
+
+#endif /* HAVE_OPENLDAP */
+
+} /* namespace rgw */
+
+#include "common/ceph_context.h"
+#include "common/common_init.h"
+#include "common/dout.h"
+#include "common/safe_io.h"
+#include <boost/algorithm/string.hpp>
+
+#include "include/ceph_assert.h"
+
+std::string parse_rgw_ldap_bindpw(CephContext* ctx);
diff --git a/src/rgw/rgw_lib.cc b/src/rgw/rgw_lib.cc
new file mode 100644
index 000000000..f449cce21
--- /dev/null
+++ b/src/rgw/rgw_lib.cc
@@ -0,0 +1,610 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <sys/types.h>
+#include <string.h>
+#include <chrono>
+
+#include "include/rados/librgw.h"
+#include "rgw_acl.h"
+
+#include "include/str_list.h"
+#include "global/signal_handler.h"
+#include "common/Timer.h"
+#include "common/WorkQueue.h"
+#include "common/ceph_argparse.h"
+#include "common/ceph_context.h"
+#include "common/common_init.h"
+#include "common/dout.h"
+
+#include "rgw_op.h"
+#include "rgw_rest.h"
+#include "rgw_log.h"
+#include "rgw_frontend.h"
+#include "rgw_request.h"
+#include "rgw_process.h"
+#include "rgw_auth.h"
+#include "rgw_lib.h"
+#include "rgw_lib_frontend.h"
+#include "rgw_perf_counters.h"
+#include "rgw_signal.h"
+#include "rgw_main.h"
+
+#include <errno.h>
+#include <thread>
+#include <string>
+#include <mutex>
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+namespace rgw {
+
+ RGWLib* g_rgwlib = nullptr;
+
+  /* Timer callback armed during RGWLib::init(); if storage init does
+   * not complete before rgw_init_timeout fires, abort the process. */
+  class C_InitTimeout : public Context {
+  public:
+    C_InitTimeout() {}
+    void finish(int r) override {
+      derr << "Initialization timeout, failed to initialize" << dendl;
+      exit(1);
+    }
+  };
+
+  /* Block until every queued request in the work queue has drained. */
+  void RGWLibProcess::checkpoint()
+  {
+    m_tp.drain(&req_wq);
+  }
+
+#define MIN_EXPIRE_S 120
+
+  /* Main garbage-collection loop for mounted RGWLibFS instances.
+   * Repeatedly walks mounted_fs, running gc() and a user-info refresh
+   * on each, then sleeps up to delay_s (or until stop() notifies cv).
+   * The map lock is dropped around per-fs work; if register_fs or
+   * unregister_fs bumps 'gen' meanwhile, iteration restarts because
+   * the iterator may have been invalidated. */
+  void RGWLibProcess::run()
+  {
+    /* write completion interval */
+    RGWLibFS::write_completion_interval_s =
+      cct->_conf->rgw_nfs_write_completion_interval_s;
+
+    /* start write timer */
+    RGWLibFS::write_timer.resume();
+
+    /* gc loop -- NOTE(review): 'shutdown' is a plain bool written by
+     * stop() from another thread; confirm whether atomics are needed */
+    while (! shutdown) {
+      lsubdout(cct, rgw, 5) << "RGWLibProcess GC" << dendl;
+
+      /* dirent invalidate timeout--basically, the upper-bound on
+       * inconsistency with the S3 namespace */
+      auto expire_s = cct->_conf->rgw_nfs_namespace_expire_secs;
+
+      /* delay between gc cycles */
+      auto delay_s = std::max(int64_t(1), std::min(int64_t(MIN_EXPIRE_S), expire_s/2));
+
+      unique_lock uniq(mtx);
+    restart:
+      int cur_gen = gen;
+      for (auto iter = mounted_fs.begin(); iter != mounted_fs.end();
+           ++iter) {
+        RGWLibFS* fs = iter->first->ref();
+        uniq.unlock();
+        fs->gc();
+        const DoutPrefix dp(cct, dout_subsys, "librgw: ");
+        fs->update_user(&dp);
+        fs->rele();
+        uniq.lock();
+        if (cur_gen != gen)
+          goto restart; /* invalidated */
+      }
+      cv.wait_for(uniq, std::chrono::seconds(delay_s));
+      uniq.unlock();
+    }
+  }
+
+  /* Async work-queue handler: run the request through the synchronous
+   * entry point, then free it (the queue owns queued requests). */
+  void RGWLibProcess::handle_request(const DoutPrefixProvider *dpp, RGWRequest* r)
+  {
+    /*
+     * invariant: valid requests are derived from RGWLibRequest
+     */
+    RGWLibRequest* req = static_cast<RGWLibRequest*>(r);
+
+    /* delegate to process_request(req), which allocates the RGWLibIO
+     * and logs failures -- previously this body duplicated that code */
+    (void) process_request(req);
+    delete req;
+  } /* handle_request */
+
+  /* Synchronous entry point: execute req against a fresh RGWLibIO and
+   * return the op status (<0 on failure).  Ownership of req remains
+   * with the caller. */
+  int RGWLibProcess::process_request(RGWLibRequest* req)
+  {
+    // XXX move RGWLibIO and timing setup into process_request
+
+    RGWLibIO io_ctx;
+
+    int ret = process_request(req, &io_ctx);
+    if (ret < 0) {
+      /* we don't really care about return code */
+      dout(20) << "process_request() returned " << ret << dendl;
+    }
+    return ret;
+  } /* process_request */
+
+  /* Account a failed request in the perf counters.  The library path
+   * produces no HTTP error body here: the dump_errno /
+   * dump_bucket_from_state behaviors of abort_early (rgw_rest.cc)
+   * might be valuable but are not safe to call, as they emit HTTP
+   * data.  A null req_state is tolerated and simply ignored. */
+  static inline void abort_req(req_state *s, RGWOp *op, int err_no)
+  {
+    if (s) {
+      perfcounter->inc(l_rgw_failed_req);
+    }
+  } /* abort_req */
+
+  /* Full request state machine for the library (librgw/NFS) path:
+   * derive the RGWOp, fabricate the RGWEnv/req_state boilerplate that
+   * the REST path would normally build from HTTP headers, then walk
+   * init -> authorize -> permissions -> verify -> execute -> complete.
+   * Any stage failure funnels through abort_req and the 'done' label
+   * so the request is always logged and completed.  Returns the
+   * processing error if negative, else the op's err.ret. */
+  int RGWLibProcess::process_request(RGWLibRequest* req, RGWLibIO* io)
+  {
+    int ret = 0;
+    bool should_log = true; // XXX
+
+    dout(1) << "====== " << __func__
+            << " starting new request req=" << hex << req << dec
+            << " ======" << dendl;
+
+    /*
+     * invariant: valid requests are derived from RGWOp--well-formed
+     * requests should have assigned RGWRequest::op in their descendant
+     * constructor--if not, the compiler can find it, at the cost of
+     * a runtime check
+     */
+    RGWOp *op = (req->op) ? req->op : dynamic_cast<RGWOp*>(req);
+    if (! op) {
+      ldpp_dout(op, 1) << "failed to derive cognate RGWOp (invalid op?)" << dendl;
+      return -EINVAL;
+    }
+
+    io->init(req->cct);
+
+    perfcounter->inc(l_rgw_req);
+
+    RGWEnv& rgw_env = io->get_env();
+
+    /* XXX
+     * until major refactoring of req_state and req_info, we need
+     * to build their RGWEnv boilerplate from the RGWLibRequest,
+     * pre-staging any strings (HTTP_HOST) that provoke a crash when
+     * not found
+     */
+
+    /* XXX for now, use ""; could be a legit hostname, or, in future,
+     * perhaps a tenant (Yehuda) */
+    rgw_env.set("HTTP_HOST", "");
+
+    /* XXX and -then- bloat up req_state with string copies from it */
+    req_state rstate(req->cct, env, &rgw_env, req->id);
+    req_state *s = &rstate;
+
+    // XXX fix this
+    s->cio = io;
+
+    /* XXX and -then- stash req_state pointers everywhere they are needed */
+    ret = req->init(rgw_env, env.driver, io, s);
+    if (ret < 0) {
+      ldpp_dout(op, 10) << "failed to initialize request" << dendl;
+      abort_req(s, op, ret);
+      goto done;
+    }
+
+    /* req is-a RGWOp, currently initialized separately */
+    ret = req->op_init();
+    if (ret < 0) {
+      dout(10) << "failed to initialize RGWOp" << dendl;
+      abort_req(s, op, ret);
+      goto done;
+    }
+
+    /* now expected by rgw_log_op() */
+    rgw_env.set("REQUEST_METHOD", s->info.method);
+    rgw_env.set("REQUEST_URI", s->info.request_uri);
+    rgw_env.set("QUERY_STRING", "");
+
+    try {
+      /* XXX authorize does less here then in the REST path, e.g.,
+       * the user's info is cached, but still incomplete */
+      ldpp_dout(s, 2) << "authorizing" << dendl;
+      ret = req->authorize(op, null_yield);
+      if (ret < 0) {
+        dout(10) << "failed to authorize request" << dendl;
+        abort_req(s, op, ret);
+        goto done;
+      }
+
+      /* FIXME: remove this after switching all handlers to the new
+       * authentication infrastructure. */
+      if (! s->auth.identity) {
+        s->auth.identity = rgw::auth::transform_old_authinfo(s);
+      }
+
+      ldpp_dout(s, 2) << "reading op permissions" << dendl;
+      ret = req->read_permissions(op, null_yield);
+      if (ret < 0) {
+        abort_req(s, op, ret);
+        goto done;
+      }
+
+      ldpp_dout(s, 2) << "init op" << dendl;
+      ret = op->init_processing(null_yield);
+      if (ret < 0) {
+        abort_req(s, op, ret);
+        goto done;
+      }
+
+      ldpp_dout(s, 2) << "verifying op mask" << dendl;
+      ret = op->verify_op_mask();
+      if (ret < 0) {
+        abort_req(s, op, ret);
+        goto done;
+      }
+
+      /* permission failures may still proceed for system or admin
+       * callers; everything else aborts */
+      ldpp_dout(s, 2) << "verifying op permissions" << dendl;
+      ret = op->verify_permission(null_yield);
+      if (ret < 0) {
+        if (s->system_request) {
+          ldpp_dout(op, 2) << "overriding permissions due to system operation" << dendl;
+        } else if (s->auth.identity->is_admin_of(s->user->get_id())) {
+          ldpp_dout(op, 2) << "overriding permissions due to admin operation" << dendl;
+        } else {
+          abort_req(s, op, ret);
+          goto done;
+        }
+      }
+
+      ldpp_dout(s, 2) << "verifying op params" << dendl;
+      ret = op->verify_params();
+      if (ret < 0) {
+        abort_req(s, op, ret);
+        goto done;
+      }
+
+      ldpp_dout(s, 2) << "executing" << dendl;
+      op->pre_exec();
+      op->execute(null_yield);
+      op->complete();
+
+    } catch (const ceph::crypto::DigestException& e) {
+      dout(0) << "authentication failed" << e.what() << dendl;
+      abort_req(s, op, -ERR_INVALID_SECRET_KEY);
+    }
+
+  done:
+    /* always complete the IO and (optionally) write the ops log,
+     * regardless of which stage failed above */
+    try {
+      io->complete_request();
+    } catch (rgw::io::Exception& e) {
+      dout(0) << "ERROR: io->complete_request() returned "
+              << e.what() << dendl;
+    }
+    if (should_log) {
+      rgw_log_op(nullptr /* !rest */, s, op, env.olog);
+    }
+
+    int http_ret = s->err.http_ret;
+
+    ldpp_dout(s, 2) << "http status=" << http_ret << dendl;
+
+    ldpp_dout(op, 1) << "====== " << __func__
+                     << " req done req=" << hex << req << dec << " http_status="
+                     << http_ret
+                     << " ======" << dendl;
+
+    return (ret < 0 ? ret : s->err.ret);
+  } /* process_request */
+
+  /* First half of a continued (multi-call) request: runs the same
+   * init/authorize/permission/verify pipeline as process_request, but
+   * uses the request's own embedded req_state/RGWLibIO and stops after
+   * exec_start() -- exec_continue()/exec_finish() are driven later by
+   * the caller (see finish_request).  Returns the first stage error,
+   * else the op's err.ret. */
+  int RGWLibProcess::start_request(RGWLibContinuedReq* req)
+  {
+
+    dout(1) << "====== " << __func__
+            << " starting new continued request req=" << hex << req << dec
+            << " ======" << dendl;
+
+    /*
+     * invariant: valid requests are derived from RGWOp--well-formed
+     * requests should have assigned RGWRequest::op in their descendant
+     * constructor--if not, the compiler can find it, at the cost of
+     * a runtime check
+     */
+    RGWOp *op = (req->op) ? req->op : dynamic_cast<RGWOp*>(req);
+    if (! op) {
+      ldpp_dout(op, 1) << "failed to derive cognate RGWOp (invalid op?)" << dendl;
+      return -EINVAL;
+    }
+
+    req_state* s = req->get_state();
+    RGWLibIO& io_ctx = req->get_io();
+    RGWEnv& rgw_env = io_ctx.get_env();
+
+    /* pre-stage HTTP_HOST to avoid crashes on lookup (see
+     * process_request) */
+    rgw_env.set("HTTP_HOST", "");
+
+    int ret = req->init(rgw_env, env.driver, &io_ctx, s);
+    if (ret < 0) {
+      ldpp_dout(op, 10) << "failed to initialize request" << dendl;
+      abort_req(s, op, ret);
+      goto done;
+    }
+
+    /* req is-a RGWOp, currently initialized separately */
+    ret = req->op_init();
+    if (ret < 0) {
+      dout(10) << "failed to initialize RGWOp" << dendl;
+      abort_req(s, op, ret);
+      goto done;
+    }
+
+    /* XXX authorize does less here then in the REST path, e.g.,
+     * the user's info is cached, but still incomplete */
+    ldpp_dout(s, 2) << "authorizing" << dendl;
+    ret = req->authorize(op, null_yield);
+    if (ret < 0) {
+      dout(10) << "failed to authorize request" << dendl;
+      abort_req(s, op, ret);
+      goto done;
+    }
+
+    /* FIXME: remove this after switching all handlers to the new authentication
+     * infrastructure. */
+    if (! s->auth.identity) {
+      s->auth.identity = rgw::auth::transform_old_authinfo(s);
+    }
+
+    ldpp_dout(s, 2) << "reading op permissions" << dendl;
+    ret = req->read_permissions(op, null_yield);
+    if (ret < 0) {
+      abort_req(s, op, ret);
+      goto done;
+    }
+
+    ldpp_dout(s, 2) << "init op" << dendl;
+    ret = op->init_processing(null_yield);
+    if (ret < 0) {
+      abort_req(s, op, ret);
+      goto done;
+    }
+
+    ldpp_dout(s, 2) << "verifying op mask" << dendl;
+    ret = op->verify_op_mask();
+    if (ret < 0) {
+      abort_req(s, op, ret);
+      goto done;
+    }
+
+    /* permission failures may still proceed for system or admin
+     * callers, mirroring process_request */
+    ldpp_dout(s, 2) << "verifying op permissions" << dendl;
+    ret = op->verify_permission(null_yield);
+    if (ret < 0) {
+      if (s->system_request) {
+        ldpp_dout(op, 2) << "overriding permissions due to system operation" << dendl;
+      } else if (s->auth.identity->is_admin_of(s->user->get_id())) {
+        ldpp_dout(op, 2) << "overriding permissions due to admin operation" << dendl;
+      } else {
+        abort_req(s, op, ret);
+        goto done;
+      }
+    }
+
+    ldpp_dout(s, 2) << "verifying op params" << dendl;
+    ret = op->verify_params();
+    if (ret < 0) {
+      abort_req(s, op, ret);
+      goto done;
+    }
+
+    op->pre_exec();
+    req->exec_start();
+
+  done:
+    return (ret < 0 ? ret : s->err.ret);
+  }
+
+  /* Second half of a continued request: run exec_finish(), log the
+   * op's final status, and count the request.  Returns exec_finish's
+   * status (op_ret is logged but not returned). */
+  int RGWLibProcess::finish_request(RGWLibContinuedReq* req)
+  {
+    RGWOp *op = (req->op) ? req->op : dynamic_cast<RGWOp*>(req);
+    if (! op) {
+      ldpp_dout(op, 1) << "failed to derive cognate RGWOp (invalid op?)" << dendl;
+      return -EINVAL;
+    }
+
+    int ret = req->exec_finish();
+    int op_ret = op->get_ret();
+
+    ldpp_dout(op, 1) << "====== " << __func__
+                     << " finishing continued request req=" << hex << req << dec
+                     << " op status=" << op_ret
+                     << " ======" << dendl;
+
+    /* counted here (not in start_request) so each continued request is
+     * accounted exactly once */
+    perfcounter->inc(l_rgw_req);
+
+    return ret;
+  }
+
+  /* Allocate the frontend's RGWLibProcess worker (thread count from
+   * rgw_thread_pool_size); always succeeds. */
+  int RGWLibFrontend::init()
+  {
+    std::string uri_prefix; // empty
+    pprocess = new RGWLibProcess(g_ceph_context, env,
+                                 g_conf()->rgw_thread_pool_size, uri_prefix, conf);
+    return 0;
+  }
+
+  /* Record the frontend created during init_frontends2. */
+  void RGWLib::set_fe(rgw::RGWLibFrontend* fe)
+  {
+    this->fe = fe;
+  }
+
+  /* Convenience overload: initialize with no command-line arguments. */
+  int RGWLib::init()
+  {
+    vector<const char*> args;
+    return init(args);
+  }
+
+  /* Full library bring-up: global/ceph context init with library
+   * defaults, frontends, storage driver, APIs, LDAP, opslog, signal
+   * handling, tracing and lua.  A watchdog timer aborts the process if
+   * storage init exceeds rgw_init_timeout (see C_InitTimeout).
+   * Returns 0 on success, -EIO if the storage driver cannot be
+   * initialized. */
+  int RGWLib::init(vector<const char*>& args)
+  {
+    /* alternative default for module */
+    map<std::string,std::string> defaults = {
+      { "debug_rgw", "1/5" },
+      { "keyring", "$rgw_data/keyring" },
+      { "log_file", "/var/log/radosgw/$cluster-$name.log" },
+      { "objecter_inflight_ops", "24576" },
+      // require a secure mon connection by default
+      { "ms_mon_client_mode", "secure" },
+      { "auth_client_required", "cephx" },
+    };
+
+    cct = rgw_global_init(&defaults, args,
+                          CEPH_ENTITY_TYPE_CLIENT,
+                          CODE_ENVIRONMENT_DAEMON,
+                          CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS);
+
+    /* arm the init watchdog before any slow initialization */
+    ceph::mutex mutex = ceph::make_mutex("main");
+    SafeTimer init_timer(g_ceph_context, mutex);
+    init_timer.init();
+    mutex.lock();
+    init_timer.add_event_after(g_conf()->rgw_init_timeout, new C_InitTimeout);
+    mutex.unlock();
+
+    /* stage all front-ends (before common-init-finish) */
+    main.init_frontends1(true /* nfs */);
+
+    common_init_finish(g_ceph_context);
+
+    main.init_perfcounters();
+    main.init_http_clients();
+
+    main.init_storage();
+    if (! main.get_driver()) {
+      /* disarm the watchdog before bailing out */
+      mutex.lock();
+      init_timer.cancel_all_events();
+      init_timer.shutdown();
+      mutex.unlock();
+
+      derr << "Couldn't init storage provider (RADOS)" << dendl;
+      return -EIO;
+    }
+
+    main.cond_init_apis();
+
+    /* storage is up; disarm the watchdog */
+    mutex.lock();
+    init_timer.cancel_all_events();
+    init_timer.shutdown();
+    mutex.unlock();
+
+    main.init_ldap();
+    main.init_opslog();
+
+    init_async_signal_handler();
+    register_async_signal_handler(SIGUSR1, rgw::signal::handle_sigterm);
+
+    main.init_tracepoints();
+    main.init_frontends2(this /* rgwlib */);
+    main.init_notification_endpoints();
+    main.init_lua();
+
+    return 0;
+  } /* RGWLib::init() */
+
+  /* Orderly shutdown: AppMain::shutdown tears down frontends/storage,
+   * then the supplied callback unhooks the SIGUSR1 handler installed
+   * by init().  Always returns 0. */
+  int RGWLib::stop()
+  {
+    derr << "shutting down" << dendl;
+
+    const auto finalize_async_signals = []() {
+      unregister_async_signal_handler(SIGUSR1, rgw::signal::handle_sigterm);
+      shutdown_async_signal_handler();
+    };
+
+    main.shutdown(finalize_async_signals);
+
+    return 0;
+  } /* RGWLib::stop() */
+
+  /* Load uid's user record from the driver and cache it as this IO's
+   * user_info.  Returns load_user's status.
+   * NOTE(review): user_info is assigned from the user object even when
+   * the load failed (leaving it default/partial) -- confirm callers
+   * check the return code before trusting user_info. */
+  int RGWLibIO::set_uid(rgw::sal::Driver* driver, const rgw_user& uid)
+  {
+    const DoutPrefix dp(driver->ctx(), dout_subsys, "librgw: ");
+    std::unique_ptr<rgw::sal::User> user = driver->get_user(uid);
+    /* object exists, but policy is broken */
+    int ret = user->load_user(&dp, null_yield);
+    if (ret < 0) {
+      derr << "ERROR: failed reading user info: uid=" << uid << " ret="
+           << ret << dendl;
+    }
+    user_info = user->get_info();
+    return ret;
+  }
+
+  /* Build the bucket policies for the request's state, and object
+   * policies too unless the op is bucket-only.  -ENODATA from either
+   * step is mapped to -EACCES so missing policy reads as access
+   * denied rather than a storage error. */
+  int RGWLibRequest::read_permissions(RGWOp* op, optional_yield y) {
+    /* bucket and object ops */
+    int ret =
+      rgw_build_bucket_policies(op, g_rgwlib->get_driver(), get_state(), y);
+    if (ret < 0) {
+      ldpp_dout(op, 10) << "read_permissions (bucket policy) on "
+                        << get_state()->bucket << ":"
+                        << get_state()->object
+                        << " only_bucket=" << only_bucket()
+                        << " ret=" << ret << dendl;
+      if (ret == -ENODATA)
+        ret = -EACCES;
+    } else if (! only_bucket()) {
+      /* object ops */
+      ret = rgw_build_object_policies(op, g_rgwlib->get_driver(), get_state(),
+                                      op->prefetch_data(), y);
+      if (ret < 0) {
+        ldpp_dout(op, 10) << "read_permissions (object policy) on"
+                          << get_state()->bucket << ":"
+                          << get_state()->object
+                          << " ret=" << ret << dendl;
+        if (ret == -ENODATA)
+          ret = -EACCES;
+      }
+    }
+    return ret;
+  } /* RGWLibRequest::read_permissions */
+
+  /* Library-path authorization: the identity was validated at mount
+   * time, so grant full control and stamp the owner from the cached
+   * user.  Always returns 0. */
+  int RGWHandler_Lib::authorize(const DoutPrefixProvider *dpp, optional_yield y)
+  {
+    /* TODO: handle
+     * 1. subusers
+     * 2. anonymous access
+     * 3. system access
+     * 4. ?
+     *
+     * Much or all of this depends on handling the cached authorization
+     * correctly (e.g., dealing with keystone) at mount time.
+     */
+    s->perm_mask = RGW_PERM_FULL_CONTROL;
+
+    // populate the owner info
+    s->owner.set_id(s->user->get_id());
+    s->owner.set_name(s->user->get_display_name());
+
+    return 0;
+  } /* RGWHandler_Lib::authorize */
+
+} /* namespace rgw */
diff --git a/src/rgw/rgw_lib.h b/src/rgw/rgw_lib.h
new file mode 100644
index 000000000..1ad54b49b
--- /dev/null
+++ b/src/rgw/rgw_lib.h
@@ -0,0 +1,209 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <mutex>
+#include "rgw_common.h"
+#include "rgw_client_io.h"
+#include "rgw_rest.h"
+#include "rgw_request.h"
+#include "rgw_ldap.h"
+#include "include/ceph_assert.h"
+#include "rgw_main.h"
+
+class OpsLogSink;
+
+namespace rgw {
+
+ class RGWLibFrontend;
+
+  /* Top-level librgw singleton (see g_rgwlib): owns the CephContext
+   * and AppMain, and exposes the frontend/driver/LDAP accessors used
+   * by the NFS path.  Also serves as the logging prefix provider. */
+  class RGWLib : public DoutPrefixProvider {
+    boost::intrusive_ptr<CephContext> cct;
+    AppMain main;
+    RGWLibFrontend* fe;
+
+  public:
+    RGWLib() : main(this), fe(nullptr)
+      {}
+    ~RGWLib() {}
+
+    rgw::sal::Driver* get_driver() { return main.get_driver(); }
+
+    RGWLibFrontend* get_fe() { return fe; }
+
+    rgw::LDAPHelper* get_ldh() { return main.get_ldh(); }
+
+    /* DoutPrefixProvider interface -- all three marked override for
+     * consistency (previously only get_cct was) */
+    CephContext *get_cct() const override { return cct.get(); }
+    unsigned get_subsys() const override { return ceph_subsys_rgw; }
+    std::ostream& gen_prefix(std::ostream& out) const override { return out << "lib rgw: "; }
+
+    void set_fe(RGWLibFrontend* fe);
+
+    int init();
+    int init(std::vector<const char *>& args);
+    int stop();
+  };
+
+ extern RGWLib* g_rgwlib;
+
+/* request interface */
+
+  /* Minimal BasicClient/Accounter for the library path: carries the
+   * RGWEnv and cached user info; all accounting/transfer hooks are
+   * no-ops since there is no HTTP stream. */
+  class RGWLibIO : public rgw::io::BasicClient,
+                   public rgw::io::Accounter
+  {
+    RGWUserInfo user_info;
+    RGWEnv env;
+  public:
+    RGWLibIO() {
+      get_env().set("HTTP_HOST", "");
+    }
+    /* NOTE(review): unlike the default ctor, this one does not
+     * pre-stage HTTP_HOST -- confirm whether callers set it later */
+    explicit RGWLibIO(const RGWUserInfo &_user_info)
+      : user_info(_user_info) {}
+
+    int init_env(CephContext *cct) override {
+      env.init(cct);
+      return 0;
+    }
+
+    const RGWUserInfo& get_user() {
+      return user_info;
+    }
+
+    /* load and cache user info for uid (defined in rgw_lib.cc) */
+    int set_uid(rgw::sal::Driver* driver, const rgw_user& uid);
+
+    int write_data(const char *buf, int len);
+    int read_data(char *buf, int len);
+    int send_status(int status, const char *status_name);
+    int send_100_continue();
+    int complete_header();
+    int send_content_length(uint64_t len);
+
+    RGWEnv& get_env() noexcept override {
+      return env;
+    }
+
+    size_t complete_request() override { /* XXX */
+      return 0;
+    };
+
+    /* accounting is meaningless without a wire protocol: no-ops */
+    void set_account(bool) override {
+      return;
+    }
+
+    uint64_t get_bytes_sent() const override {
+      return 0;
+    }
+
+    uint64_t get_bytes_received() const override {
+      return 0;
+    }
+
+  }; /* RGWLibIO */
+
+  /* Placeholder REST manager for the library path (no REST routing). */
+  class RGWRESTMgr_Lib : public RGWRESTMgr {
+  public:
+    RGWRESTMgr_Lib() {}
+    ~RGWRESTMgr_Lib() override {}
+  }; /* RGWRESTMgr_Lib */
+
+  /* Handler base for library requests; authorize() grants full
+   * control based on the mount-time identity (see rgw_lib.cc). */
+  class RGWHandler_Lib : public RGWHandler {
+    friend class RGWRESTMgr_Lib;
+  public:
+
+    int authorize(const DoutPrefixProvider *dpp, optional_yield y) override;
+
+    RGWHandler_Lib() {}
+    ~RGWHandler_Lib() override {}
+    static int init_from_header(rgw::sal::Driver* driver,
+                                req_state *s);
+  }; /* RGWHandler_Lib */
+
+  /* Base class for one-shot library requests: combines RGWRequest
+   * state with the library handler.  Descendants implement
+   * header_init/op_init/only_bucket to describe their operation. */
+  class RGWLibRequest : public RGWRequest,
+                        public RGWHandler_Lib {
+  private:
+    std::unique_ptr<rgw::sal::User> tuser; // Don't use this. It's empty except during init.
+  public:
+    CephContext* cct;
+
+    /* unambiguiously return req_state */
+    inline req_state* get_state() { return this->RGWRequest::s; }
+
+    RGWLibRequest(CephContext* _cct, std::unique_ptr<rgw::sal::User> _user)
+      :  RGWRequest(g_rgwlib->get_driver()->get_new_req_id()),
+         tuser(std::move(_user)), cct(_cct)
+      {}
+
+    int postauth_init(optional_yield) override { return 0; }
+
+    /* descendant equivalent of *REST*::init_from_header(...):
+     * prepare request for execute()--should mean, fixup URI-alikes
+     * and any other expected stat vars in local req_state, for
+     * now */
+    virtual int header_init() = 0;
+
+    /* descendant initializer responsible to call RGWOp::init()--which
+     * descendants are required to inherit */
+    virtual int op_init() = 0;
+
+    using RGWHandler::init;
+
+    /* Wire the request, handler and state together: stamp req/trans
+     * ids, move the temporary user into the state, then run
+     * header_init() followed by init_from_header(). */
+    int init(const RGWEnv& rgw_env, rgw::sal::Driver* _driver,
+             RGWLibIO* io, req_state* _s) {
+
+      RGWRequest::init_state(_s);
+      RGWHandler::init(_driver, _s, io);
+
+      get_state()->req_id = driver->zone_unique_id(id);
+      get_state()->trans_id = driver->zone_unique_trans_id(id);
+      get_state()->bucket_tenant = tuser->get_tenant();
+      get_state()->set_user(tuser);
+
+      ldpp_dout(_s, 2) << "initializing for trans_id = "
+                       << get_state()->trans_id.c_str() << dendl;
+
+      int ret = header_init();
+      if (ret == 0) {
+        ret = init_from_header(driver, _s);
+      }
+      return ret;
+    }
+
+    /* true when the op targets only the bucket (no object policies) */
+    virtual bool only_bucket() = 0;
+
+    int read_permissions(RGWOp *op, optional_yield y) override;
+
+  }; /* RGWLibRequest */
+
+  /* Library request with continue semantics (start/continue/finish
+   * driven by the caller, e.g. NFS writes).  Owns its RGWLibIO and
+   * req_state so the request can outlive a single call. */
+  class RGWLibContinuedReq : public RGWLibRequest {
+    RGWLibIO io_ctx;
+    req_state rstate;
+  public:
+
+    RGWLibContinuedReq(CephContext* _cct, const RGWProcessEnv& penv,
+                       std::unique_ptr<rgw::sal::User> _user)
+      :  RGWLibRequest(_cct, std::move(_user)), io_ctx(),
+         rstate(_cct, penv, &io_ctx.get_env(), id)
+      {
+        io_ctx.init(_cct);
+
+        RGWRequest::init_state(&rstate);
+        RGWHandler::init(g_rgwlib->get_driver(), &rstate, &io_ctx);
+
+        get_state()->req_id = driver->zone_unique_id(id);
+        get_state()->trans_id = driver->zone_unique_trans_id(id);
+
+        ldpp_dout(get_state(), 2) << "initializing for trans_id = "
+                                  << get_state()->trans_id.c_str() << dendl;
+      }
+
+    inline rgw::sal::Driver* get_driver() { return driver; }
+    inline RGWLibIO& get_io() { return io_ctx; }
+
+    /* one-shot execute() is invalid for continued requests */
+    virtual int execute() final { ceph_abort(); }
+    virtual int exec_start() = 0;
+    virtual int exec_continue() = 0;
+    virtual int exec_finish() = 0;
+
+  }; /* RGWLibContinuedReq */
+
+} /* namespace rgw */
diff --git a/src/rgw/rgw_lib_frontend.h b/src/rgw/rgw_lib_frontend.h
new file mode 100644
index 000000000..1772724d2
--- /dev/null
+++ b/src/rgw/rgw_lib_frontend.h
@@ -0,0 +1,113 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <boost/container/flat_map.hpp>
+
+#include "rgw_lib.h"
+#include "rgw_file.h"
+
+namespace rgw {
+
+  /* Worker process for library requests: runs the gc loop (run()),
+   * tracks mounted RGWLibFS instances, and dispatches both one-shot
+   * and continued requests.  mtx guards mounted_fs and gen; cv wakes
+   * the gc loop. */
+  class RGWLibProcess : public RGWProcess {
+    RGWAccessKey access_key;
+    std::mutex mtx;
+    std::condition_variable cv;
+    int gen;            // bumped on every (un)mount; invalidates gc iteration
+    bool shutdown;      // NOTE(review): plain bool read cross-thread in run()
+
+    typedef flat_map<RGWLibFS*, RGWLibFS*> FSMAP;
+    FSMAP mounted_fs;
+
+    using lock_guard = std::lock_guard<std::mutex>;
+    using unique_lock = std::unique_lock<std::mutex>;
+
+  public:
+    RGWLibProcess(CephContext* cct, RGWProcessEnv& pe, int num_threads,
+                  std::string uri_prefix, RGWFrontendConfig* _conf) :
+      RGWProcess(cct, pe, num_threads, std::move(uri_prefix), _conf),
+      gen(0), shutdown(false) {}
+
+    void run() override;
+    void checkpoint();
+
+    /* request shutdown of the gc loop and stop all mounted filesystems */
+    void stop() {
+      shutdown = true;
+      {
+        /* mounted_fs is mutated by register_fs/unregister_fs under mtx;
+         * take the same lock here (previously iterated unlocked) */
+        lock_guard guard(mtx);
+        for (const auto& fs: mounted_fs) {
+          fs.second->stop();
+        }
+      }
+      cv.notify_all();
+    }
+
+    void register_fs(RGWLibFS* fs) {
+      lock_guard guard(mtx);
+      mounted_fs.insert(FSMAP::value_type(fs, fs));
+      ++gen;
+    }
+
+    void unregister_fs(RGWLibFS* fs) {
+      lock_guard guard(mtx);
+      FSMAP::iterator it = mounted_fs.find(fs);
+      if (it != mounted_fs.end()) {
+        mounted_fs.erase(it);
+        ++gen;
+      }
+    }
+
+    /* queue an async request (throttled); handle_request frees it */
+    void enqueue_req(RGWLibRequest* req) {
+
+      lsubdout(g_ceph_context, rgw, 10)
+        << __func__ << " enqueue request req="
+        << std::hex << req << std::dec << dendl;
+
+      req_throttle.get(1);
+      req_wq.queue(req);
+    } /* enqueue_req */
+
+    /* "regular" requests */
+    void handle_request(const DoutPrefixProvider *dpp, RGWRequest* req) override; // async handler, deletes req
+    int process_request(RGWLibRequest* req);
+    int process_request(RGWLibRequest* req, RGWLibIO* io);
+    void set_access_key(RGWAccessKey& key) { access_key = key; }
+
+    /* requests w/continue semantics */
+    int start_request(RGWLibContinuedReq* req);
+    int finish_request(RGWLibContinuedReq* req);
+  }; /* RGWLibProcess */
+
+  /* Thin frontend wrapper around RGWLibProcess: forwards enqueue /
+   * execute / start / finish to the worker process. */
+  class RGWLibFrontend : public RGWProcessFrontend {
+  public:
+    RGWLibFrontend(RGWProcessEnv& pe, RGWFrontendConfig *_conf)
+      : RGWProcessFrontend(pe, _conf) {}
+
+    int init() override;
+
+    /* stop the frontend, then the worker's gc loop and mounted fs */
+    void stop() override {
+      RGWProcessFrontend::stop();
+      get_process()->stop();
+    }
+
+    RGWLibProcess* get_process() {
+      return static_cast<RGWLibProcess*>(pprocess);
+    }
+
+    inline void enqueue_req(RGWLibRequest* req) {
+      static_cast<RGWLibProcess*>(pprocess)->enqueue_req(req); // async
+    }
+
+    inline int execute_req(RGWLibRequest* req) {
+      return static_cast<RGWLibProcess*>(pprocess)->process_request(req); // !async
+    }
+
+    inline int start_req(RGWLibContinuedReq* req) {
+      return static_cast<RGWLibProcess*>(pprocess)->start_request(req);
+    }
+
+    inline int finish_req(RGWLibContinuedReq* req) {
+      return static_cast<RGWLibProcess*>(pprocess)->finish_request(req);
+    }
+
+  }; /* RGWLibFrontend */
+
+} /* namespace rgw */
diff --git a/src/rgw/rgw_loadgen.cc b/src/rgw/rgw_loadgen.cc
new file mode 100644
index 000000000..015057e9c
--- /dev/null
+++ b/src/rgw/rgw_loadgen.cc
@@ -0,0 +1,131 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <algorithm>
+#include <sstream>
+#include <string.h>
+
+#include "rgw_loadgen.h"
+#include "rgw_auth_s3.h"
+
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+/* Render tm as an asctime-style date string for the synthetic request. */
+void RGWLoadGenRequestEnv::set_date(utime_t& tm)
+{
+  date_str = rgw_to_asctime(tm);
+}
+
+/* Compute an AWS v2 signature over the synthetic request and stash the
+ * HTTP_DATE / HTTP_AUTHORIZATION headers.  Returns 0 on success, or
+ * the (thrown) error code from signature generation. */
+int RGWLoadGenRequestEnv::sign(const DoutPrefixProvider *dpp, RGWAccessKey& access_key)
+{
+  meta_map_t meta_map;
+  map<string, string> sub_resources;
+
+  string canonical_header;
+  string digest;
+
+  rgw_create_s3_canonical_header(dpp,
+                                 request_method.c_str(),
+                                 nullptr, /* const char *content_md5 */
+                                 content_type.c_str(),
+                                 date_str.c_str(),
+                                 meta_map,
+                                 meta_map_t{},
+                                 uri.c_str(),
+                                 sub_resources,
+                                 canonical_header);
+
+  headers["HTTP_DATE"] = date_str;
+  try {
+    /* FIXME(rzarzynski): kill the dependency on g_ceph_context. */
+    const auto signature = static_cast<std::string>(
+      rgw::auth::s3::get_v2_signature(g_ceph_context, canonical_header,
+                                      access_key.key));
+    headers["HTTP_AUTHORIZATION"] = \
+      std::string("AWS ") + access_key.id + ":" + signature;
+  } catch (int ret) {
+    /* get_v2_signature signals failure by throwing an int error code */
+    return ret;
+  }
+
+  return 0;
+}
+
+/* Pretend to write: the generator discards output, reporting len as
+ * fully written. */
+size_t RGWLoadGenIO::write_data(const char* const buf,
+                                const size_t len)
+{
+  return len;
+}
+
+/* Pretend to read: consume from the synthetic content-length budget
+ * without touching buf. */
+size_t RGWLoadGenIO::read_data(char* const buf, const size_t len)
+{
+  const size_t read_len = std::min(left_to_read,
+                                   static_cast<uint64_t>(len));
+  left_to_read -= read_len;
+  return read_len;
+}
+
+/* Nothing buffered; nothing to flush. */
+void RGWLoadGenIO::flush()
+{
+}
+
+/* No wire protocol to finalize. */
+size_t RGWLoadGenIO::complete_request()
+{
+  return 0;
+}
+
+/* Populate the RGWEnv from the synthetic RGWLoadGenRequestEnv so the
+ * request looks like a parsed HTTP request to the op pipeline, and
+ * prime the read budget (left_to_read).  Always returns 0. */
+int RGWLoadGenIO::init_env(CephContext *cct)
+{
+  env.init(cct);
+
+  left_to_read = req->content_length;
+
+  char buf[32];
+  snprintf(buf, sizeof(buf), "%lld", (long long)req->content_length);
+  env.set("CONTENT_LENGTH", buf);
+
+  env.set("CONTENT_TYPE", req->content_type.c_str());
+  env.set("HTTP_DATE", req->date_str.c_str());
+
+  /* includes HTTP_AUTHORIZATION produced by sign() */
+  for (map<string, string>::iterator iter = req->headers.begin(); iter != req->headers.end(); ++iter) {
+    env.set(iter->first.c_str(), iter->second.c_str());
+  }
+
+  env.set("REQUEST_METHOD", req->request_method.c_str());
+  env.set("REQUEST_URI", req->uri.c_str());
+  env.set("QUERY_STRING", req->query_string.c_str());
+  env.set("SCRIPT_URI", req->uri.c_str());
+
+  char port_buf[16];
+  snprintf(port_buf, sizeof(port_buf), "%d", req->port);
+  env.set("SERVER_PORT", port_buf);
+  return 0;
+}
+
+/* The load generator emits no response stream: every send_* hook is a
+ * no-op reporting zero bytes written. */
+size_t RGWLoadGenIO::send_status(const int status,
+                                 const char* const status_name)
+{
+  return 0;
+}
+
+size_t RGWLoadGenIO::send_100_continue()
+{
+  return 0;
+}
+
+size_t RGWLoadGenIO::send_header(const std::string_view& name,
+                                 const std::string_view& value)
+{
+  return 0;
+}
+
+size_t RGWLoadGenIO::complete_header()
+{
+  return 0;
+}
+
+size_t RGWLoadGenIO::send_content_length(const uint64_t len)
+{
+  return 0;
+}
diff --git a/src/rgw/rgw_loadgen.h b/src/rgw/rgw_loadgen.h
new file mode 100644
index 000000000..7f3f847c2
--- /dev/null
+++ b/src/rgw/rgw_loadgen.h
@@ -0,0 +1,72 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <map>
+#include <string>
+
+#include "rgw_client_io.h"
+
+
+/* Synthetic request description used by the load-generator frontend:
+ * the fields stand in for a parsed HTTP request line and headers. */
+struct RGWLoadGenRequestEnv {
+  int port;
+  uint64_t content_length;
+  std::string content_type;
+  std::string request_method;
+  std::string uri;
+  std::string query_string;
+  std::string date_str;
+
+  /* extra headers, e.g. HTTP_DATE / HTTP_AUTHORIZATION from sign() */
+  std::map<std::string, std::string> headers;
+
+  RGWLoadGenRequestEnv()
+    : port(0),
+      content_length(0) {
+  }
+
+  /* render tm into date_str (asctime format) */
+  void set_date(utime_t& tm);
+  /* compute AWS v2 signature headers; see rgw_loadgen.cc */
+  int sign(const DoutPrefixProvider *dpp, RGWAccessKey& access_key);
+};
+
+/* XXX does RGWLoadGenIO actually want to perform stream/HTTP I/O,
+ * or (e.g) are these NOOPs? */
+/* XXX does RGWLoadGenIO actually want to perform stream/HTTP I/O,
+ * or (e.g) are these NOOPs? */
+/* RestfulClient over a synthetic request: reads decrement a byte
+ * budget (left_to_read), writes and header/status emission are
+ * no-ops (see rgw_loadgen.cc). */
+class RGWLoadGenIO : public rgw::io::RestfulClient
+{
+  uint64_t left_to_read;   // remaining bytes of the fake request body
+  RGWLoadGenRequestEnv* req;
+  RGWEnv env;
+
+  int init_env(CephContext *cct) override;
+  size_t read_data(char *buf, size_t len);
+  size_t write_data(const char *buf, size_t len);
+
+public:
+  explicit RGWLoadGenIO(RGWLoadGenRequestEnv* const req)
+    : left_to_read(0),
+      req(req) {
+  }
+
+  size_t send_status(int status, const char *status_name) override;
+  size_t send_100_continue() override;
+  size_t send_header(const std::string_view& name,
+                     const std::string_view& value) override;
+  size_t complete_header() override;
+  size_t send_content_length(uint64_t len) override;
+
+  size_t recv_body(char* buf, size_t max) override {
+    return read_data(buf, max);
+  }
+
+  size_t send_body(const char* buf, size_t len) override {
+    return write_data(buf, len);
+  }
+
+  void flush() override;
+
+  RGWEnv& get_env() noexcept override {
+    return env;
+  }
+
+  size_t complete_request() override;
+};
diff --git a/src/rgw/rgw_loadgen_process.cc b/src/rgw/rgw_loadgen_process.cc
new file mode 100644
index 000000000..f8165185d
--- /dev/null
+++ b/src/rgw/rgw_loadgen_process.cc
@@ -0,0 +1,147 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "common/errno.h"
+#include "common/Throttle.h"
+#include "common/WorkQueue.h"
+
+#include "rgw_rest.h"
+#include "rgw_frontend.h"
+#include "rgw_request.h"
+#include "rgw_process.h"
+#include "rgw_loadgen.h"
+#include "rgw_client_io.h"
+#include "rgw_signal.h"
+
+#include <atomic>
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+/* Wait until every queued synthetic request has been processed. */
+void RGWLoadGenProcess::checkpoint()
+{
+  m_tp.drain(&req_wq);
+}
+
+/* Drive the load-generation scenario: create num_buckets buckets,
+ * PUT num_objs 4KiB objects across them, GET each object, then
+ * DELETE objects and buckets.  A shared atomic flag aborts the run
+ * if a creation phase fails.  Shuts the process down when done. */
+void RGWLoadGenProcess::run()
+{
+  m_tp.start(); /* start thread pool */
+
+  int i;
+
+  int num_objs;
+
+  conf->get_val("num_objs", 1000, &num_objs);
+
+  int num_buckets;
+  conf->get_val("num_buckets", 1, &num_buckets);
+
+  vector<string> buckets(num_buckets);
+
+  std::atomic<bool> failed = { false };
+
+  for (i = 0; i < num_buckets; i++) {
+    buckets[i] = "/loadgen";
+    string& bucket = buckets[i];
+    append_rand_alpha(cct, bucket, bucket, 16);
+
+    /* first create a bucket */
+    gen_request("PUT", bucket, 0, &failed);
+    checkpoint();
+  }
+
+  /* vector instead of raw new[]/delete[]: RAII, no leak on early exit */
+  vector<string> objs(num_objs);
+
+  if (failed) {
+    derr << "ERROR: bucket creation failed" << dendl;
+    goto done;
+  }
+
+  /* object names distributed round-robin over the buckets */
+  for (i = 0; i < num_objs; i++) {
+    char buf[16 + 1];
+    gen_rand_alphanumeric(cct, buf, sizeof(buf));
+    buf[16] = '\0';
+    objs[i] = buckets[i % num_buckets] + "/" + buf;
+  }
+
+  for (i = 0; i < num_objs; i++) {
+    gen_request("PUT", objs[i], 4096, &failed);
+  }
+
+  checkpoint();
+
+  if (failed) {
+    /* was misreported as "bucket creation failed" */
+    derr << "ERROR: object creation failed" << dendl;
+    goto done;
+  }
+
+  for (i = 0; i < num_objs; i++) {
+    gen_request("GET", objs[i], 4096, NULL);
+  }
+
+  checkpoint();
+
+  for (i = 0; i < num_objs; i++) {
+    gen_request("DELETE", objs[i], 0, NULL);
+  }
+
+  checkpoint();
+
+  for (i = 0; i < num_buckets; i++) {
+    gen_request("DELETE", buckets[i], 0, NULL);
+  }
+
+done:
+  checkpoint();
+
+  m_tp.stop();
+
+  rgw::signal::signal_shutdown();
+} /* RGWLoadGenProcess::run() */
+
+/* Allocate a synthetic request and queue it on the work queue
+ * (throttled).  fail_flag, when non-null, lets the worker report a
+ * failure back to run(); the queued request is freed by
+ * handle_request. */
+void RGWLoadGenProcess::gen_request(const string& method,
+                                    const string& resource,
+                                    int content_length, std::atomic<bool>* fail_flag)
+{
+  RGWLoadGenRequest* req =
+    new RGWLoadGenRequest(env.driver->get_new_req_id(), method, resource,
+                          content_length, fail_flag);
+  dout(10) << "allocated request req=" << hex << req << dec << dendl;
+  req_throttle.get(1);
+  req_wq.queue(req);
+} /* RGWLoadGenProcess::gen_request */
+
+/* Work-queue handler: fabricate a signed request env for the queued
+ * RGWLoadGenRequest, run it through process_request, report failure
+ * via the shared fail flag, and free the request. */
+void RGWLoadGenProcess::handle_request(const DoutPrefixProvider *dpp, RGWRequest* r)
+{
+  RGWLoadGenRequest* req = static_cast<RGWLoadGenRequest*>(r);
+
+  RGWLoadGenRequestEnv renv;
+
+  utime_t tm = ceph_clock_now();
+
+  renv.port = 80;
+  renv.content_length = req->content_length;
+  renv.content_type = "binary/octet-stream";
+  renv.request_method = req->method;
+  renv.uri = req->resource;
+  renv.set_date(tm);
+  renv.sign(dpp, access_key);
+
+  RGWLoadGenIO real_client_io(&renv);
+  RGWRestfulIO client_io(cct, &real_client_io);
+  int ret = process_request(env, req, uri_prefix, &client_io,
+                            null_yield, nullptr, nullptr, nullptr);
+  if (ret < 0) {
+    /* we don't really care about return code */
+    dout(20) << "process_request() returned " << ret << dendl;
+
+    if (req->fail_flag) {
+      /* set the shared flag; previously the POINTER was incremented
+       * (req->fail_flag++), so run() never saw the failure */
+      *req->fail_flag = true;
+    }
+  }
+
+  delete req;
+} /* RGWLoadGenProcess::handle_request */
diff --git a/src/rgw/rgw_log.cc b/src/rgw/rgw_log.cc
new file mode 100644
index 000000000..de67fcd4b
--- /dev/null
+++ b/src/rgw/rgw_log.cc
@@ -0,0 +1,722 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "common/Clock.h"
+#include "common/Timer.h"
+#include "common/utf8.h"
+#include "common/OutputDataSocket.h"
+#include "common/Formatter.h"
+
+#include "rgw_bucket.h"
+#include "rgw_log.h"
+#include "rgw_acl.h"
+#include "rgw_client_io.h"
+#include "rgw_rest.h"
+#include "rgw_zone.h"
+#include "rgw_rados.h"
+
+#include "services/svc_zone.h"
+
+#include <chrono>
+#include <math.h>
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+static void set_param_str(req_state *s, const char *name, string& str)
+{
+ const char *p = s->info.env->get(name);
+ if (p)
+ str = p;
+}
+
/* Expand an ops-log object-name template.
 *
 * Supported escapes: %Y %y %m %d %H %I %k %l %M (fields of *dt),
 * %i (bucket_id), %n (bucket_name), %% (literal '%').  An unknown escape is
 * copied through verbatim (e.g. "%q" stays "%q") so operators can spot the
 * typo in the resulting object names.
 *
 * Uses bounded snprintf throughout (the original sprintf/strcpy calls were
 * unbounded writes into a stack buffer).
 */
std::string render_log_object_name(const std::string& format,
                                   struct tm *dt, const std::string& bucket_id,
                                   const std::string& bucket_name)
{
  std::string o;
  for (unsigned i = 0; i < format.size(); i++) {
    if (format[i] == '%' && i + 1 < format.size()) {
      i++;
      char buf[32];
      switch (format[i]) {
      case '%':
        snprintf(buf, sizeof(buf), "%%");
        break;
      case 'Y':
        snprintf(buf, sizeof(buf), "%.4d", dt->tm_year + 1900);
        break;
      case 'y':
        snprintf(buf, sizeof(buf), "%.2d", dt->tm_year % 100);
        break;
      case 'm':
        snprintf(buf, sizeof(buf), "%.2d", dt->tm_mon + 1);
        break;
      case 'd':
        snprintf(buf, sizeof(buf), "%.2d", dt->tm_mday);
        break;
      case 'H':
        snprintf(buf, sizeof(buf), "%.2d", dt->tm_hour);
        break;
      case 'I':
        snprintf(buf, sizeof(buf), "%.2d", (dt->tm_hour % 12) + 1);
        break;
      case 'k':
        snprintf(buf, sizeof(buf), "%d", dt->tm_hour);
        break;
      case 'l':
        snprintf(buf, sizeof(buf), "%d", (dt->tm_hour % 12) + 1);
        break;
      case 'M':
        snprintf(buf, sizeof(buf), "%.2d", dt->tm_min);
        break;

      case 'i':
        o += bucket_id;
        continue;
      case 'n':
        o += bucket_name;
        continue;
      default:
        // unknown code: emit "%<c>" untouched
        snprintf(buf, sizeof(buf), "%%%c", format[i]);
        break;
      }
      o += buf;
      continue;
    }
    o += format[i];
  }
  return o;
}
+
+/* usage logger */
/* usage logger */
// Aggregates per-user/per-bucket usage deltas in memory and periodically
// flushes them to the backing driver.  A SafeTimer fires every
// rgw_usage_log_tick_interval seconds; a flush is also forced once more
// than rgw_usage_log_flush_threshold entries have accumulated.
class UsageLogger : public DoutPrefixProvider {
  CephContext *cct;
  rgw::sal::Driver* driver;
  map<rgw_user_bucket, RGWUsageBatch> usage_map; // pending, not yet flushed
  ceph::mutex lock = ceph::make_mutex("UsageLogger"); // guards usage_map / num_entries / round_timestamp
  int32_t num_entries;
  ceph::mutex timer_lock = ceph::make_mutex("UsageLogger::timer_lock"); // guards timer
  SafeTimer timer;
  utime_t round_timestamp; // start of the current accounting hour

  // Timer callback: flush pending usage and re-arm the timer.
  class C_UsageLogTimeout : public Context {
    UsageLogger *logger;
  public:
    explicit C_UsageLogTimeout(UsageLogger *_l) : logger(_l) {}
    void finish(int r) override {
      logger->flush();
      logger->set_timer();
    }
  };

  // Arm the next periodic flush; caller must hold timer_lock.
  void set_timer() {
    timer.add_event_after(cct->_conf->rgw_usage_log_tick_interval, new C_UsageLogTimeout(this));
  }
public:

  UsageLogger(CephContext *_cct, rgw::sal::Driver* _driver) : cct(_cct), driver(_driver), num_entries(0), timer(cct, timer_lock) {
    timer.init();
    std::lock_guard l{timer_lock};
    set_timer();
    utime_t ts = ceph_clock_now();
    recalc_round_timestamp(ts);
  }

  ~UsageLogger() {
    std::lock_guard l{timer_lock};
    flush(); // drain anything still buffered before tearing the timer down
    timer.cancel_all_events();
    timer.shutdown();
  }

  // Bucket timestamps to hour boundaries so batches line up per hour.
  void recalc_round_timestamp(utime_t& ts) {
    round_timestamp = ts.round_to_hour();
  }

  // Merge one usage entry for `user` into the in-memory batch, rolling the
  // accounting hour forward if `timestamp` has moved past it.
  void insert_user(utime_t& timestamp, const rgw_user& user, rgw_usage_log_entry& entry) {
    lock.lock();
    if (timestamp.sec() > round_timestamp + 3600)
      recalc_round_timestamp(timestamp);
    entry.epoch = round_timestamp.sec();
    bool account;
    string u = user.to_str();
    rgw_user_bucket ub(u, entry.bucket);
    real_time rt = round_timestamp.to_real_time();
    usage_map[ub].insert(rt, entry, &account);
    if (account)
      num_entries++;
    bool need_flush = (num_entries > cct->_conf->rgw_usage_log_flush_threshold);
    lock.unlock();
    if (need_flush) {
      // flush outside `lock` (flush takes it again); timer_lock serializes
      // concurrent flushes with the timer callback
      std::lock_guard l{timer_lock};
      flush();
    }
  }

  // Charge the payer when set (requester-pays), otherwise the owner.
  void insert(utime_t& timestamp, rgw_usage_log_entry& entry) {
    if (entry.payer.empty()) {
      insert_user(timestamp, entry.owner, entry);
    } else {
      insert_user(timestamp, entry.payer, entry);
    }
  }

  // Swap the pending map out under the lock, then write it to the driver
  // without holding the lock so slow backend I/O does not block inserters.
  void flush() {
    map<rgw_user_bucket, RGWUsageBatch> old_map;
    lock.lock();
    old_map.swap(usage_map);
    num_entries = 0;
    lock.unlock();

    driver->log_usage(this, old_map);
  }

  CephContext *get_cct() const override { return cct; }
  unsigned get_subsys() const override { return dout_subsys; }
  std::ostream& gen_prefix(std::ostream& out) const override { return out << "rgw UsageLogger: "; }
};
+
// Process-wide usage logger; NULL until rgw_log_usage_init() is called and
// again after rgw_log_usage_finalize().  log_usage() is a no-op while NULL.
static UsageLogger *usage_logger = NULL;

// Create the global usage logger (its flush timer starts immediately).
void rgw_log_usage_init(CephContext *cct, rgw::sal::Driver* driver)
{
  usage_logger = new UsageLogger(cct, driver);
}

// Flush any buffered usage (via the destructor) and destroy the logger.
void rgw_log_usage_finalize()
{
  delete usage_logger;
  usage_logger = NULL;
}
+
+static void log_usage(req_state *s, const string& op_name)
+{
+ if (s->system_request) /* don't log system user operations */
+ return;
+
+ if (!usage_logger)
+ return;
+
+ rgw_user user;
+ rgw_user payer;
+ string bucket_name;
+
+ bucket_name = s->bucket_name;
+
+ if (!bucket_name.empty()) {
+ bucket_name = s->bucket_name;
+ user = s->bucket_owner.get_id();
+ if (!rgw::sal::Bucket::empty(s->bucket.get()) &&
+ s->bucket->get_info().requester_pays) {
+ payer = s->user->get_id();
+ }
+ } else {
+ user = s->user->get_id();
+ }
+
+ bool error = s->err.is_err();
+ if (error && s->err.http_ret == 404) {
+ bucket_name = "-"; /* bucket not found, use the invalid '-' as bucket name */
+ }
+
+ string u = user.to_str();
+ string p = payer.to_str();
+ rgw_usage_log_entry entry(u, p, bucket_name);
+
+ uint64_t bytes_sent = ACCOUNTING_IO(s)->get_bytes_sent();
+ uint64_t bytes_received = ACCOUNTING_IO(s)->get_bytes_received();
+
+ rgw_usage_data data(bytes_sent, bytes_received);
+
+ data.ops = 1;
+ if (!s->is_err())
+ data.successful_ops = 1;
+
+ ldpp_dout(s, 30) << "log_usage: bucket_name=" << bucket_name
+ << " tenant=" << s->bucket_tenant
+ << ", bytes_sent=" << bytes_sent << ", bytes_received="
+ << bytes_received << ", success=" << data.successful_ops << dendl;
+
+ entry.add(op_name, data);
+
+ utime_t ts = ceph_clock_now();
+
+ usage_logger->insert(ts, entry);
+}
+
// Serialize one ops-log record into the Formatter as a "log_entry" object.
// Optional fields (object_owner, sts_info, access_key_id, subuser,
// multi-delete op_data) are emitted only when present.
void rgw_format_ops_log_entry(struct rgw_log_entry& entry, Formatter *formatter)
{
  formatter->open_object_section("log_entry");
  formatter->dump_string("bucket", entry.bucket);
  {
    auto t = utime_t{entry.time};
    t.gmtime(formatter->dump_stream("time")); // UTC
    t.localtime(formatter->dump_stream("time_local"));
  }
  formatter->dump_string("remote_addr", entry.remote_addr);
  string obj_owner = entry.object_owner.to_str();
  if (obj_owner.length())
    formatter->dump_string("object_owner", obj_owner);
  formatter->dump_string("user", entry.user);
  formatter->dump_string("operation", entry.op);
  formatter->dump_string("uri", entry.uri);
  formatter->dump_string("http_status", entry.http_status);
  formatter->dump_string("error_code", entry.error_code);
  formatter->dump_int("bytes_sent", entry.bytes_sent);
  formatter->dump_int("bytes_received", entry.bytes_received);
  formatter->dump_int("object_size", entry.obj_size);
  {
    // report elapsed time in milliseconds
    using namespace std::chrono;
    uint64_t total_time = duration_cast<milliseconds>(entry.total_time).count();
    formatter->dump_int("total_time", total_time);
  }
  formatter->dump_string("user_agent", entry.user_agent);
  formatter->dump_string("referrer", entry.referrer);
  if (entry.x_headers.size() > 0) {
    formatter->open_array_section("http_x_headers");
    for (const auto& iter: entry.x_headers) {
      formatter->open_object_section(iter.first.c_str());
      formatter->dump_string(iter.first.c_str(), iter.second);
      formatter->close_section();
    }
    formatter->close_section();
  }
  formatter->dump_string("trans_id", entry.trans_id);
  // map the internal identity-type enum to a human-readable label
  switch(entry.identity_type) {
    case TYPE_RGW:
      formatter->dump_string("authentication_type","Local");
      break;
    case TYPE_LDAP:
      formatter->dump_string("authentication_type","LDAP");
      break;
    case TYPE_KEYSTONE:
      formatter->dump_string("authentication_type","Keystone");
      break;
    case TYPE_WEB:
      formatter->dump_string("authentication_type","OIDC Provider");
      break;
    case TYPE_ROLE:
      formatter->dump_string("authentication_type","STS");
      break;
    default:
      break;
  }
  if (entry.token_claims.size() > 0) {
    // claims are stored as "key:value" strings; first element tags the source
    if (entry.token_claims[0] == "sts") {
      formatter->open_object_section("sts_info");
      for (const auto& iter: entry.token_claims) {
        auto pos = iter.find(":");
        if (pos != string::npos) {
          formatter->dump_string(iter.substr(0, pos), iter.substr(pos + 1));
        }
      }
      formatter->close_section();
    }
  }
  if (!entry.access_key_id.empty()) {
    formatter->dump_string("access_key_id", entry.access_key_id);
  }
  if (!entry.subuser.empty()) {
    formatter->dump_string("subuser", entry.subuser);
  }
  formatter->dump_bool("temp_url", entry.temp_url);

  if (entry.op == "multi_object_delete") {
    formatter->open_object_section("op_data");
    formatter->dump_int("num_ok", entry.delete_multi_obj_meta.num_ok);
    formatter->dump_int("num_err", entry.delete_multi_obj_meta.num_err);
    formatter->open_array_section("objects");
    for (const auto& iter: entry.delete_multi_obj_meta.objects) {
      formatter->open_object_section("");
      formatter->dump_string("key", iter.key);
      formatter->dump_string("version_id", iter.version_id);
      formatter->dump_int("http_status", iter.http_status);
      formatter->dump_bool("error", iter.error);
      if (iter.error) {
        formatter->dump_string("error_message", iter.error_message);
      } else {
        formatter->dump_bool("delete_marker", iter.delete_marker);
        formatter->dump_string("marker_version_id", iter.marker_version_id);
      }
      formatter->close_section();
    }
    formatter->close_section();
    formatter->close_section();
  }
  formatter->close_section();
}
+
+OpsLogManifold::~OpsLogManifold()
+{
+ for (const auto &sink : sinks) {
+ delete sink;
+ }
+}
+
+void OpsLogManifold::add_sink(OpsLogSink* sink)
+{
+ sinks.push_back(sink);
+}
+
+int OpsLogManifold::log(req_state* s, struct rgw_log_entry& entry)
+{
+ int ret = 0;
+ for (const auto &sink : sinks) {
+ if (sink->log(s, entry) < 0) {
+ ret = -1;
+ }
+ }
+ return ret;
+}
+
// File-backed sink.  Buffers at most max_data_size bytes of pending entries;
// the flush thread is not started here — call start().
OpsLogFile::OpsLogFile(CephContext* cct, std::string& path, uint64_t max_data_size) :
  cct(cct), data_size(0), max_data_size(max_data_size), path(path), need_reopen(false)
{
}

// Request that the next write reopens the file (e.g. after log rotation);
// safe to call from a signal-driven context since it only sets an atomic.
void OpsLogFile::reopen() {
  need_reopen = true;
}
+
// Write all buffered entries to the log file.  Pending entries are moved
// out of log_buffer under the lock, then written without holding it so
// request threads are never blocked on file I/O.  Failed writes retry with
// exponential backoff (capped at 60s) unless the sink is stopping.
void OpsLogFile::flush()
{
  {
    std::scoped_lock log_lock(mutex);
    assert(flush_buffer.empty());
    flush_buffer.swap(log_buffer);
    data_size = 0;
  }
  for (auto bl : flush_buffer) { // NOTE(review): iterates by copy — presumably so retries restart from an intact bufferlist; confirm write_stream() side effects
    int try_num = 0;
    while (true) {
      if (!file.is_open() || need_reopen) {
        need_reopen = false;
        file.close();
        file.open(path, std::ofstream::app);
      }
      bl.write_stream(file);
      if (!file) {
        ldpp_dout(this, 0) << "ERROR: failed to log RGW ops log file entry" << dendl;
        file.clear(); // reset the stream's error state before retrying
        if (stopped) {
          break; // shutting down: drop the entry rather than block
        }
        int sleep_time_secs = std::min((int) pow(2, try_num), 60);
        std::this_thread::sleep_for(std::chrono::seconds(sleep_time_secs));
        try_num++;
      } else {
        break;
      }
    }
  }
  flush_buffer.clear();
  file << std::endl; // newline-terminate and flush the batch
}
+
// Flush-thread main loop: sleep on the condvar until entries are queued or
// stop() is requested; perform one final flush on the way out so no
// buffered entries are lost at shutdown.
void* OpsLogFile::entry() {
  std::unique_lock lock(mutex);
  while (!stopped) {
    if (!log_buffer.empty()) {
      // drop the lock during file I/O; flush() re-acquires it internally
      lock.unlock();
      flush();
      lock.lock();
      continue;
    }
    cond.wait(lock);
  }
  lock.unlock();
  flush(); // final drain after stop()
  return NULL;
}
+
// Spawn the flush thread (Thread::create runs entry()).
void OpsLogFile::start() {
  stopped = false;
  create("ops_log_file");
}

// Wake the flush thread, tell it to exit, and join it.  `stopped` is set
// under the mutex so the thread cannot miss the state change between its
// empty-buffer check and cond.wait().
void OpsLogFile::stop() {
  {
    std::unique_lock lock(mutex);
    cond.notify_one();
    stopped = true;
  }
  join();
}

OpsLogFile::~OpsLogFile()
{
  // NOTE(review): `stopped` is only assigned in start()/stop(); if the
  // object is destroyed without ever calling start(), this reads an
  // uninitialized flag — confirm callers always start() the sink.
  if (!stopped) {
    stop();
  }
  file.close();
}
+
// Queue one serialized entry for the flush thread.  Entries are dropped
// (with an error log, returning -1) once the buffer would exceed
// max_data_size, so a stuck disk cannot grow memory without bound.
int OpsLogFile::log_json(req_state* s, bufferlist& bl)
{
  std::unique_lock lock(mutex);
  if (data_size + bl.length() >= max_data_size) {
    ldout(s->cct, 0) << "ERROR: RGW ops log file buffer too full, dropping log for txn: " << s->trans_id << dendl;
    return -1;
  }
  log_buffer.push_back(bl);
  data_size += bl.length();
  cond.notify_all();
  return 0;
}

unsigned OpsLogFile::get_subsys() const {
  return dout_subsys;
}
+
// The sink owns one JSON formatter, shared across requests and guarded by
// `lock` in log().
JsonOpsLogSink::JsonOpsLogSink() {
  formatter = new JSONFormatter;
}

JsonOpsLogSink::~JsonOpsLogSink() {
  delete formatter;
}
+
+void JsonOpsLogSink::formatter_to_bl(bufferlist& bl)
+{
+ stringstream ss;
+ formatter->flush(ss);
+ const string& s = ss.str();
+ bl.append(s);
+}
+
+int JsonOpsLogSink::log(req_state* s, struct rgw_log_entry& entry)
+{
+ bufferlist bl;
+
+ lock.lock();
+ rgw_format_ops_log_entry(entry, formatter);
+ formatter_to_bl(bl);
+ lock.unlock();
+
+ return log_json(s, bl);
+}
+
// New admin-socket client: open a JSON array; entries are then streamed
// separated by `delim`.
void OpsLogSocket::init_connection(bufferlist& bl)
{
  bl.append("[");
}

OpsLogSocket::OpsLogSocket(CephContext *cct, uint64_t _backlog) : OutputDataSocket(cct, _backlog)
{
  delim.append(",\n"); // separator emitted between entries by OutputDataSocket
}

// Entries are queued to the socket's backlog; never fails from here.
int OpsLogSocket::log_json(req_state* s, bufferlist& bl)
{
  append_output(bl);
  return 0;
}
+
// Holds a reference to main()'s driver pointer (not a copy) so a realm
// reload that swaps the driver is picked up automatically.
OpsLogRados::OpsLogRados(rgw::sal::Driver* const& driver): driver(driver)
{
}

// Persist one ops-log entry into the backing store, unless disabled via
// rgw_ops_log_rados.  The target object name is rendered from the
// rgw_log_object_name template using the request time (UTC or local,
// per rgw_log_object_name_utc).
int OpsLogRados::log(req_state* s, struct rgw_log_entry& entry)
{
  if (!s->cct->_conf->rgw_ops_log_rados) {
    return 0;
  }
  bufferlist bl;
  encode(entry, bl);

  struct tm bdt;
  time_t t = req_state::Clock::to_time_t(entry.time);
  if (s->cct->_conf->rgw_log_object_name_utc)
    gmtime_r(&t, &bdt);
  else
    localtime_r(&t, &bdt);
  string oid = render_log_object_name(s->cct->_conf->rgw_log_object_name, &bdt,
                                      entry.bucket_id, entry.bucket);
  if (driver->log_op(s, oid, bl) < 0) {
    ldpp_dout(s, 0) << "ERROR: failed to log RADOS RGW ops log entry for txn: " << s->trans_id << dendl;
    return -1;
  }
  return 0;
}
+
// Log one completed request.  Feeds the usage logger (when enabled for the
// request), then builds an ops-log entry from the request state and emits
// it to `olog`.  Returns 0 when logging succeeded or was skipped, negative
// when the sink failed.
int rgw_log_op(RGWREST* const rest, req_state *s, const RGWOp* op, OpsLogSink *olog)
{
  struct rgw_log_entry entry;
  string bucket_id;
  string op_name = (op ? op->name() : "unknown");

  if (s->enable_usage_log)
    log_usage(s, op_name);

  if (!s->enable_ops_log)
    return 0;

  if (s->bucket_name.empty()) {
    /* this case is needed for, e.g., list_buckets */
  } else {
    if (s->err.ret == -ERR_NO_SUCH_BUCKET ||
        rgw::sal::Bucket::empty(s->bucket.get())) {
      if (!s->cct->_conf->rgw_log_nonexistent_bucket) {
        ldout(s->cct, 5) << "bucket " << s->bucket_name << " doesn't exist, not logging" << dendl;
        return 0;
      }
      bucket_id = "";
    } else {
      bucket_id = s->bucket->get_bucket_id();
    }
    entry.bucket = rgw_make_bucket_entry_name(s->bucket_tenant, s->bucket_name);

    // refuse to log buckets whose name is not valid UTF-8
    if (check_utf8(entry.bucket.c_str(), entry.bucket.size()) != 0) {
      ldpp_dout(s, 5) << "not logging op on bucket with non-utf8 name" << dendl;
      return 0;
    }

    if (!rgw::sal::Object::empty(s->object.get())) {
      entry.obj = s->object->get_key();
    } else {
      entry.obj = rgw_obj_key("-"); // placeholder for bucket-level ops
    }

    entry.obj_size = s->obj_size;
  } /* !bucket empty */

  // client address: prefer the configured header (e.g. X-Forwarded-For)
  if (s->cct->_conf->rgw_remote_addr_param.length())
    set_param_str(s, s->cct->_conf->rgw_remote_addr_param.c_str(),
                  entry.remote_addr);
  else
    set_param_str(s, "REMOTE_ADDR", entry.remote_addr);
  set_param_str(s, "HTTP_USER_AGENT", entry.user_agent);
  // legacy apps are still using misspelling referer, such as curl -e option
  if (s->info.env->exists("HTTP_REFERRER"))
    set_param_str(s, "HTTP_REFERRER", entry.referrer);
  else
    set_param_str(s, "HTTP_REFERER", entry.referrer);

  // reconstruct the request line: "<METHOD> <URI> HTTP/<ver>"
  std::string uri;
  if (s->info.env->exists("REQUEST_METHOD")) {
    uri.append(s->info.env->get("REQUEST_METHOD"));
    uri.append(" ");
  }

  if (s->info.env->exists("REQUEST_URI")) {
    uri.append(s->info.env->get("REQUEST_URI"));
  }

  /* Formerly, we appended QUERY_STRING to uri, but in RGW, QUERY_STRING is a
   * substring of REQUEST_URI--appending qs to uri here duplicates qs to the
   * ops log */

  if (s->info.env->exists("HTTP_VERSION")) {
    uri.append(" ");
    uri.append("HTTP/");
    uri.append(s->info.env->get("HTTP_VERSION"));
  }

  entry.uri = std::move(uri);

  entry.op = op_name;
  if (op) {
    // let the op add operation-specific fields (e.g. multi-delete results)
    op->write_ops_log_entry(entry);
  }

  if (s->auth.identity) {
    entry.identity_type = s->auth.identity->get_identity_type();
    s->auth.identity->write_ops_log_entry(entry);
  } else {
    entry.identity_type = TYPE_NONE;
  }

  if (! s->token_claims.empty()) {
    entry.token_claims = std::move(s->token_claims);
  }

  /* custom header logging */
  if (rest) {
    if (rest->log_x_headers()) {
      for (const auto& iter : s->info.env->get_map()) {
        if (rest->log_x_header(iter.first)) {
          entry.x_headers.insert(
            rgw_log_entry::headers_map::value_type(iter.first, iter.second));
        }
      }
    }
  }

  entry.user = s->user->get_id().to_str();
  if (s->object_acl)
    entry.object_owner = s->object_acl->get_owner().get_id();
  entry.bucket_owner = s->bucket_owner.get_id();

  uint64_t bytes_sent = ACCOUNTING_IO(s)->get_bytes_sent();
  uint64_t bytes_received = ACCOUNTING_IO(s)->get_bytes_received();

  entry.time = s->time;
  entry.total_time = s->time_elapsed();
  entry.bytes_sent = bytes_sent;
  entry.bytes_received = bytes_received;
  if (s->err.http_ret) {
    char buf[16];
    snprintf(buf, sizeof(buf), "%d", s->err.http_ret);
    entry.http_status = buf;
  } else {
    entry.http_status = "200"; // default
  }
  entry.error_code = s->err.err_code;
  entry.bucket_id = bucket_id;
  entry.trans_id = s->trans_id;
  if (olog) {
    return olog->log(s, entry);
  }
  return 0;
}
+
// Produce sample instances for encode/decode round-trip tests: one fully
// populated entry and one default-constructed entry.
void rgw_log_entry::generate_test_instances(list<rgw_log_entry*>& o)
{
  rgw_log_entry *e = new rgw_log_entry;
  e->object_owner = "object_owner";
  e->bucket_owner = "bucket_owner";
  e->bucket = "bucket";
  e->remote_addr = "1.2.3.4";
  e->user = "user";
  e->obj = rgw_obj_key("obj");
  e->uri = "http://uri/bucket/obj";
  e->http_status = "200";
  e->error_code = "error_code";
  e->bytes_sent = 1024;
  e->bytes_received = 512;
  e->obj_size = 2048;
  e->user_agent = "user_agent";
  e->referrer = "referrer";
  e->bucket_id = "10";
  e->trans_id = "trans_id";
  e->identity_type = TYPE_RGW;
  o.push_back(e);
  o.push_back(new rgw_log_entry);
}
+
// Debug/inspection dump of every field (used by ceph-dencoder et al.);
// distinct from rgw_format_ops_log_entry(), which emits the public ops-log
// JSON shape.
void rgw_log_entry::dump(Formatter *f) const
{
  f->dump_string("object_owner", object_owner.to_str());
  f->dump_string("bucket_owner", bucket_owner.to_str());
  f->dump_string("bucket", bucket);
  f->dump_stream("time") << time;
  f->dump_string("remote_addr", remote_addr);
  f->dump_string("user", user);
  f->dump_stream("obj") << obj;
  f->dump_string("op", op);
  f->dump_string("uri", uri);
  f->dump_string("http_status", http_status);
  f->dump_string("error_code", error_code);
  f->dump_unsigned("bytes_sent", bytes_sent);
  f->dump_unsigned("bytes_received", bytes_received);
  f->dump_unsigned("obj_size", obj_size);
  f->dump_stream("total_time") << total_time;
  f->dump_string("user_agent", user_agent);
  f->dump_string("referrer", referrer);
  f->dump_string("bucket_id", bucket_id);
  f->dump_string("trans_id", trans_id);
  f->dump_unsigned("identity_type", identity_type);
}
diff --git a/src/rgw/rgw_log.h b/src/rgw/rgw_log.h
new file mode 100644
index 000000000..1dd79273e
--- /dev/null
+++ b/src/rgw/rgw_log.h
@@ -0,0 +1,289 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <boost/container/flat_map.hpp>
+#include "rgw_common.h"
+#include "common/OutputDataSocket.h"
+#include <vector>
+#include <fstream>
+#include "rgw_sal_fwd.h"
+
+class RGWOp;
+
// Per-object result of a multi-object delete, recorded in the ops log.
struct delete_multi_obj_entry {
  std::string key;
  std::string version_id;        // version requested for deletion, if any
  std::string error_message;     // set only when error == true
  std::string marker_version_id; // delete-marker version, set on success
  uint32_t http_status = 0;
  bool error = false;
  bool delete_marker = false;    // true if a delete marker was created

  void encode(bufferlist &bl) const {
    ENCODE_START(1, 1, bl);
    encode(key, bl);
    encode(version_id, bl);
    encode(error_message, bl);
    encode(marker_version_id, bl);
    encode(http_status, bl);
    encode(error, bl);
    encode(delete_marker, bl);
    ENCODE_FINISH(bl);
  }

  void decode(bufferlist::const_iterator &p) {
    DECODE_START_LEGACY_COMPAT_LEN(1, 1, 1, p);
    decode(key, p);
    decode(version_id, p);
    decode(error_message, p);
    decode(marker_version_id, p);
    decode(http_status, p);
    decode(error, p);
    decode(delete_marker, p);
    DECODE_FINISH(p);
  }
};
WRITE_CLASS_ENCODER(delete_multi_obj_entry)

// Aggregate outcome of one multi-object delete request.
struct delete_multi_obj_op_meta {
  uint32_t num_ok = 0;
  uint32_t num_err = 0;
  std::vector<delete_multi_obj_entry> objects;

  void encode(bufferlist &bl) const {
    ENCODE_START(1, 1, bl);
    encode(num_ok, bl);
    encode(num_err, bl);
    encode(objects, bl);
    ENCODE_FINISH(bl);
  }

  void decode(bufferlist::const_iterator &p) {
    DECODE_START_LEGACY_COMPAT_LEN(1, 1, 1, p);
    decode(num_ok, p);
    decode(num_err, p);
    decode(objects, p);
    DECODE_FINISH(p);
  }
};
WRITE_CLASS_ENCODER(delete_multi_obj_op_meta)
+
// One ops-log record describing a single completed request.  Encoded with
// version 14 (compat 5); the field order in encode()/decode() is the wire
// format and must never be reordered.
struct rgw_log_entry {

  using headers_map = boost::container::flat_map<std::string, std::string>;
  using Clock = req_state::Clock;

  rgw_user object_owner;
  rgw_user bucket_owner;
  std::string bucket;
  Clock::time_point time;          // request start time
  std::string remote_addr;
  std::string user;
  rgw_obj_key obj;
  std::string op;
  std::string uri;
  std::string http_status;
  std::string error_code;
  uint64_t bytes_sent = 0;
  uint64_t bytes_received = 0;
  uint64_t obj_size = 0;
  Clock::duration total_time{};
  std::string user_agent;
  std::string referrer;
  std::string bucket_id;
  headers_map x_headers;           // configured custom x-headers to log
  std::string trans_id;
  std::vector<std::string> token_claims; // "key:value" strings, [0] tags the source
  uint32_t identity_type = TYPE_NONE;
  std::string access_key_id;
  std::string subuser;
  bool temp_url {false};
  delete_multi_obj_op_meta delete_multi_obj_meta;

  void encode(bufferlist &bl) const {
    ENCODE_START(14, 5, bl);
    // legacy scalar forms kept for compat; full objects re-encoded below
    encode(object_owner.id, bl);
    encode(bucket_owner.id, bl);
    encode(bucket, bl);
    encode(time, bl);
    encode(remote_addr, bl);
    encode(user, bl);
    encode(obj.name, bl);
    encode(op, bl);
    encode(uri, bl);
    encode(http_status, bl);
    encode(error_code, bl);
    encode(bytes_sent, bl);
    encode(obj_size, bl);
    encode(total_time, bl);
    encode(user_agent, bl);
    encode(referrer, bl);
    encode(bytes_received, bl);
    encode(bucket_id, bl);
    encode(obj, bl);
    encode(object_owner, bl);
    encode(bucket_owner, bl);
    encode(x_headers, bl);
    encode(trans_id, bl);
    encode(token_claims, bl);
    encode(identity_type,bl);
    encode(access_key_id, bl);
    encode(subuser, bl);
    encode(temp_url, bl);
    encode(delete_multi_obj_meta, bl);
    ENCODE_FINISH(bl);
  }
  void decode(bufferlist::const_iterator &p) {
    DECODE_START_LEGACY_COMPAT_LEN(14, 5, 5, p);
    decode(object_owner.id, p);
    if (struct_v > 3)
      decode(bucket_owner.id, p);
    decode(bucket, p);
    decode(time, p);
    decode(remote_addr, p);
    decode(user, p);
    decode(obj.name, p);
    decode(op, p);
    decode(uri, p);
    decode(http_status, p);
    decode(error_code, p);
    decode(bytes_sent, p);
    decode(obj_size, p);
    decode(total_time, p);
    decode(user_agent, p);
    decode(referrer, p);
    if (struct_v >= 2)
      decode(bytes_received, p);
    else
      bytes_received = 0;

    if (struct_v >= 3) {
      if (struct_v <= 5) {
        // bucket_id was a numeric id before v6; render it as a string
        uint64_t id;
        decode(id, p);
        char buf[32];
        snprintf(buf, sizeof(buf), "%" PRIu64, id);
        bucket_id = buf;
      } else {
        decode(bucket_id, p);
      }
    } else {
      bucket_id = "";
    }
    if (struct_v >= 7) {
      decode(obj, p);
    }
    if (struct_v >= 8) {
      decode(object_owner, p);
      decode(bucket_owner, p);
    }
    if (struct_v >= 9) {
      decode(x_headers, p);
    }
    if (struct_v >= 10) {
      decode(trans_id, p);
    }
    if (struct_v >= 11) {
      decode(token_claims, p);
    }
    if (struct_v >= 12) {
      decode(identity_type, p);
    }
    if (struct_v >= 13) {
      decode(access_key_id, p);
      decode(subuser, p);
      decode(temp_url, p);
    }
    if (struct_v >= 14) {
      decode(delete_multi_obj_meta, p);
    }
    DECODE_FINISH(p);
  }
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<rgw_log_entry*>& o);
};
WRITE_CLASS_ENCODER(rgw_log_entry)
+
// Destination for ops-log entries.  Implementations: file, admin socket,
// RADOS object, or a manifold fanning out to several of them.
class OpsLogSink {
public:
  virtual int log(req_state* s, struct rgw_log_entry& entry) = 0;
  virtual ~OpsLogSink() = default;
};

// Fans one entry out to multiple sinks; owns (and deletes) the sinks
// registered via add_sink().
class OpsLogManifold: public OpsLogSink {
  std::vector<OpsLogSink*> sinks;
public:
  ~OpsLogManifold() override;
  void add_sink(OpsLogSink* sink);
  int log(req_state* s, struct rgw_log_entry& entry) override;
};

// Common base for sinks that emit JSON: serializes the entry with a shared
// formatter (guarded by `lock`) and delegates the bytes to log_json().
class JsonOpsLogSink : public OpsLogSink {
  ceph::Formatter *formatter;
  ceph::mutex lock = ceph::make_mutex("JsonOpsLogSink"); // guards formatter

  void formatter_to_bl(bufferlist& bl);
protected:
  virtual int log_json(req_state* s, bufferlist& bl) = 0;
public:
  JsonOpsLogSink();
  ~JsonOpsLogSink() override;
  int log(req_state* s, struct rgw_log_entry& entry) override;
};
+
+class OpsLogFile : public JsonOpsLogSink, public Thread, public DoutPrefixProvider {
+ CephContext* cct;
+ ceph::mutex mutex = ceph::make_mutex("OpsLogFile");
+ std::vector<bufferlist> log_buffer;
+ std::vector<bufferlist> flush_buffer;
+ ceph::condition_variable cond;
+ std::ofstream file;
+ bool stopped;
+ uint64_t data_size;
+ uint64_t max_data_size;
+ std::string path;
+ std::atomic_bool need_reopen;
+
+ void flush();
+protected:
+ int log_json(req_state* s, bufferlist& bl) override;
+ void *entry() override;
+public:
+ OpsLogFile(CephContext* cct, std::string& path, uint64_t max_data_size);
+ ~OpsLogFile() override;
+ CephContext *get_cct() const override { return cct; }
+ unsigned get_subsys() const override;
+ std::ostream& gen_prefix(std::ostream& out) const override { return out << "rgw OpsLogFile: "; }
+ void reopen();
+ void start();
+ void stop();
+};
+
// Streams JSON entries to clients of the ops-log admin socket.
class OpsLogSocket : public OutputDataSocket, public JsonOpsLogSink {
protected:
  int log_json(req_state* s, bufferlist& bl) override;
  void init_connection(bufferlist& bl) override;

public:
  OpsLogSocket(CephContext *cct, uint64_t _backlog);
};

// Persists entries as RADOS objects named via the rgw_log_object_name
// template.
class OpsLogRados : public OpsLogSink {
  // main()'s driver pointer as a reference, possibly modified by RGWRealmReloader
  rgw::sal::Driver* const& driver;

public:
  OpsLogRados(rgw::sal::Driver* const& driver);
  int log(req_state* s, struct rgw_log_entry& entry) override;
};
+
+class RGWREST;
+
+int rgw_log_op(RGWREST* const rest, struct req_state* s,
+ const RGWOp* op, OpsLogSink* olog);
+void rgw_log_usage_init(CephContext* cct, rgw::sal::Driver* driver);
+void rgw_log_usage_finalize();
+void rgw_format_ops_log_entry(struct rgw_log_entry& entry,
+ ceph::Formatter *formatter);
diff --git a/src/rgw/rgw_lua.cc b/src/rgw/rgw_lua.cc
new file mode 100644
index 000000000..33af60370
--- /dev/null
+++ b/src/rgw/rgw_lua.cc
@@ -0,0 +1,214 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <lua.hpp>
+#include "services/svc_zone.h"
+#include "rgw_lua_utils.h"
+#include "rgw_sal_rados.h"
+#include "rgw_lua.h"
+#ifdef WITH_RADOSGW_LUA_PACKAGES
+#include <filesystem>
+#include <boost/process.hpp>
+#endif
+
+#define dout_subsys ceph_subsys_rgw
+
+namespace rgw::lua {
+
+context to_context(const std::string& s)
+{
+ if (strcasecmp(s.c_str(), "prerequest") == 0) {
+ return context::preRequest;
+ }
+ if (strcasecmp(s.c_str(), "postrequest") == 0) {
+ return context::postRequest;
+ }
+ if (strcasecmp(s.c_str(), "background") == 0) {
+ return context::background;
+ }
+ if (strcasecmp(s.c_str(), "getdata") == 0) {
+ return context::getData;
+ }
+ if (strcasecmp(s.c_str(), "putdata") == 0) {
+ return context::putData;
+ }
+ return context::none;
+}
+
+std::string to_string(context ctx)
+{
+ switch (ctx) {
+ case context::preRequest:
+ return "prerequest";
+ case context::postRequest:
+ return "postrequest";
+ case context::background:
+ return "background";
+ case context::getData:
+ return "getdata";
+ case context::putData:
+ return "putdata";
+ case context::none:
+ break;
+ }
+ return "none";
+}
+
// Syntax-check a Lua script without executing it.  Returns true when the
// script compiles; on failure err_msg carries the Lua compiler error, on
// success it is cleared.
bool verify(const std::string& script, std::string& err_msg)
{
  lua_State *L = luaL_newstate();
  lua_state_guard guard(L); // closes the state on every return path
  open_standard_libs(L);
  try {
    if (luaL_loadstring(L, script.c_str()) != LUA_OK) {
      err_msg.assign(lua_tostring(L, -1)); // error text left on the stack
      return false;
    }
  } catch (const std::runtime_error& e) {
    // NOTE(review): presumably thrown by hooks installed via
    // open_standard_libs — confirm which callbacks can throw here
    err_msg = e.what();
    return false;
  }
  err_msg = "";
  return true;
}
+
+std::string script_oid(context ctx, const std::string& tenant) {
+ static const std::string SCRIPT_OID_PREFIX("script.");
+ return SCRIPT_OID_PREFIX + to_string(ctx) + "." + tenant;
+}
+
+
// Fetch the stored script for (ctx, tenant); -ENOENT when no manager is
// configured (and whatever the manager returns otherwise).
int read_script(const DoutPrefixProvider *dpp, sal::LuaManager* manager, const std::string& tenant, optional_yield y, context ctx, std::string& script)
{
  return manager ? manager->get_script(dpp, y, script_oid(ctx, tenant), script) : -ENOENT;
}

// Store (create or replace) the script for (ctx, tenant).
int write_script(const DoutPrefixProvider *dpp, sal::LuaManager* manager, const std::string& tenant, optional_yield y, context ctx, const std::string& script)
{
  return manager ? manager->put_script(dpp, y, script_oid(ctx, tenant), script) : -ENOENT;
}

// Remove the stored script for (ctx, tenant).
int delete_script(const DoutPrefixProvider *dpp, sal::LuaManager* manager, const std::string& tenant, optional_yield y, context ctx)
{
  return manager ? manager->del_script(dpp, y, script_oid(ctx, tenant)) : -ENOENT;
}
+
+#ifdef WITH_RADOSGW_LUA_PACKAGES
+
+namespace bp = boost::process;
+
// Add a Lua package to the allowlist.  First checks via `luarocks search`
// that the package exists (binary packages only unless allow_compilation),
// then removes any previously-allowed version before registering the new
// one.  Returns -ECHILD when luarocks is not installed, -EINVAL when the
// package is unknown, a negative luarocks exit code, or the manager result.
int add_package(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, optional_yield y, const std::string& package_name, bool allow_compilation)
{
  // verify that luarocks can load this package
  const auto p = bp::search_path("luarocks");
  if (p.empty()) {
    return -ECHILD;
  }
  bp::ipstream is;
  const auto cmd = p.string() + " search --porcelain" + (allow_compilation ? " " : " --binary ") + package_name;
  bp::child c(cmd,
      bp::std_in.close(),
      bp::std_err > bp::null,
      bp::std_out > is);

  // any non-empty output line means the search matched
  std::string line;
  bool package_found = false;
  while (c.running() && std::getline(is, line) && !line.empty()) {
    package_found = true;
  }
  c.wait();
  auto ret = c.exit_code();
  if (ret) {
    return -ret;
  }

  if (!package_found) {
    return -EINVAL;
  }

  //replace previous versions of the package
  const std::string package_name_no_version = package_name.substr(0, package_name.find(" "));
  ret = remove_package(dpp, driver, y, package_name_no_version);
  if (ret < 0) {
    return ret;
  }

  auto lua_mgr = driver->get_lua_manager();

  return lua_mgr->add_package(dpp, y, package_name);
}
+
// Remove a Lua package from the allowlist (delegates to the driver's
// Lua manager).
int remove_package(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, optional_yield y, const std::string& package_name)
{
  auto lua_mgr = driver->get_lua_manager();

  return lua_mgr->remove_package(dpp, y, package_name);
}
+
+namespace bp = boost::process;
+
+int list_packages(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, optional_yield y, packages_t& packages)
+{
+ auto lua_mgr = driver->get_lua_manager();
+
+ return lua_mgr->list_packages(dpp, y, packages);
+}
+
// Install every allowlisted package into `luarocks_path` (wiping the tree
// first so stale versions disappear).  Packages whose install fails are
// collected in failed_packages; the combined luarocks output is appended to
// `output`.  Returns 0 when the allowlist is empty or installs were
// attempted, -ECHILD when luarocks is missing, or a negative listing error.
int install_packages(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver,
                     optional_yield y, const std::string& luarocks_path,
                     packages_t& failed_packages, std::string& output) {
  // luarocks directory cleanup
  std::error_code ec;
  if (std::filesystem::remove_all(luarocks_path, ec)
      == static_cast<std::uintmax_t>(-1) &&
      ec != std::errc::no_such_file_or_directory) {
    output.append("failed to clear luarock directory: ");
    output.append(ec.message());
    output.append("\n");
    return ec.value();
  }

  packages_t packages;
  auto ret = list_packages(dpp, driver, y, packages);
  if (ret == -ENOENT) {
    // allowlist is empty
    return 0;
  }
  if (ret < 0) {
    return ret;
  }
  // verify that luarocks exists
  const auto p = bp::search_path("luarocks");
  if (p.empty()) {
    return -ECHILD;
  }

  // the lua rocks install dir will be created by luarocks the first time it is called
  for (const auto& package : packages) {
    bp::ipstream is;
    const auto cmd = p.string() + " install --lua-version " + CEPH_LUA_VERSION + " --tree " + luarocks_path + " --deps-mode one " + package;
    bp::child c(cmd, bp::std_in.close(), (bp::std_err & bp::std_out) > is);

    // once package reload is supported, code should yield when reading output
    std::string line = std::string("CMD: ") + cmd;

    do {
      if (!line.empty()) {
        output.append(line);
        output.append("\n");
      }
    } while (c.running() && std::getline(is, line));

    c.wait();
    if (c.exit_code()) {
      failed_packages.insert(package);
    }
  }

  return 0;
}
+
+#endif
+
+}
+
diff --git a/src/rgw/rgw_lua.h b/src/rgw/rgw_lua.h
new file mode 100644
index 000000000..a6ebcc2d0
--- /dev/null
+++ b/src/rgw/rgw_lua.h
@@ -0,0 +1,67 @@
+#pragma once
+
+#include <string>
+#include <set>
+#include "rgw_lua_version.h"
+#include "common/async/yield_context.h"
+#include "common/dout.h"
+#include "rgw_sal_fwd.h"
+
+class DoutPrefixProvider;
+class lua_State;
+class rgw_user;
+class DoutPrefixProvider;
+namespace rgw::sal {
+ class RadosStore;
+ class LuaManager;
+}
+
+namespace rgw::lua {
+
+// execution contexts in which a lua script may be hooked
+enum class context {
+  preRequest,
+  postRequest,
+  background,
+  getData,
+  putData,
+  none
+};
+
+// get context enum from string
+// the expected string is the same as the enum (case insensitive)
+// return "none" if not matched
+context to_context(const std::string& s);
+
+// verify a lua script
+bool verify(const std::string& script, std::string& err_msg);
+
+// store a lua script in a context
+int write_script(const DoutPrefixProvider *dpp, rgw::sal::LuaManager* manager, const std::string& tenant, optional_yield y, context ctx, const std::string& script);
+
+// read the stored lua script from a context
+int read_script(const DoutPrefixProvider *dpp, rgw::sal::LuaManager* manager, const std::string& tenant, optional_yield y, context ctx, std::string& script);
+
+// delete the stored lua script from a context
+int delete_script(const DoutPrefixProvider *dpp, rgw::sal::LuaManager* manager, const std::string& tenant, optional_yield y, context ctx);
+
+// set of lua package names
+using packages_t = std::set<std::string>;
+
+#ifdef WITH_RADOSGW_LUA_PACKAGES
+
+// add a lua package to the allowlist
+int add_package(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, optional_yield y, const std::string& package_name, bool allow_compilation);
+
+// remove a lua package from the allowlist
+int remove_package(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, optional_yield y, const std::string& package_name);
+
+// list lua packages in the allowlist
+int list_packages(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, optional_yield y, packages_t& packages);
+
+// install all packages from the allowlist
+// return the list of packages that failed to install and the output of the install command
+int install_packages(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver,
+    optional_yield y, const std::string& luarocks_path,
+    packages_t& failed_packages, std::string& output);
+#endif
+}
+
diff --git a/src/rgw/rgw_lua_background.cc b/src/rgw/rgw_lua_background.cc
new file mode 100644
index 000000000..35de4a7e9
--- /dev/null
+++ b/src/rgw/rgw_lua_background.cc
@@ -0,0 +1,181 @@
+#include "rgw_sal_rados.h"
+#include "rgw_lua_background.h"
+#include "rgw_lua.h"
+#include "rgw_lua_utils.h"
+#include "rgw_perf_counters.h"
+#include "include/ceph_assert.h"
+#include <lua.hpp>
+
+#define dout_subsys ceph_subsys_rgw
+
+namespace rgw::lua {
+
+const char* RGWTable::INCREMENT = "increment";
+const char* RGWTable::DECREMENT = "decrement";
+
+// lua closure implementing RGW.increment(key [, amount]) and
+// RGW.decrement(key [, amount]).
+// upvalues: (1) the shared background map, (2) its mutex,
+// (3) a boolean selecting decrement semantics.
+// the amount defaults to 1 (or -1 for decrement); missing keys are
+// silently ignored (no entry is created)
+int RGWTable::increment_by(lua_State* L) {
+  const auto map = reinterpret_cast<BackgroundMap*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+  auto& mtx = *reinterpret_cast<std::mutex*>(lua_touserdata(L, lua_upvalueindex(SECOND_UPVAL)));
+  auto decrement = lua_toboolean(L, lua_upvalueindex(THIRD_UPVAL));
+
+  const auto args = lua_gettop(L);
+  const auto index = luaL_checkstring(L, 1);
+
+  // by default we increment by 1/-1
+  const long long int default_inc = (decrement ? -1 : 1);
+  BackgroundMapValue inc_by = default_inc;
+  if (args == 2) {
+    if (lua_isinteger(L, 2)) {
+      inc_by = lua_tointeger(L, 2)*default_inc;
+    } else if (lua_isnumber(L, 2)){
+      inc_by = lua_tonumber(L, 2)*static_cast<double>(default_inc);
+    } else {
+      return luaL_error(L, "can increment only by numeric values");
+    }
+  }
+
+  std::unique_lock l(mtx);
+
+  const auto it = map->find(std::string(index));
+  if (it != map->end()) {
+    auto& value = it->second;
+    if (std::holds_alternative<double>(value) && std::holds_alternative<double>(inc_by)) {
+      value = std::get<double>(value) + std::get<double>(inc_by);
+    } else if (std::holds_alternative<long long int>(value) && std::holds_alternative<long long int>(inc_by)) {
+      value = std::get<long long int>(value) + std::get<long long int>(inc_by);
+    } else if (std::holds_alternative<double>(value) && std::holds_alternative<long long int>(inc_by)) {
+      value = std::get<double>(value) + static_cast<double>(std::get<long long int>(inc_by));
+    } else if (std::holds_alternative<long long int>(value) && std::holds_alternative<double>(inc_by)) {
+      value = static_cast<double>(std::get<long long int>(value)) + std::get<double>(inc_by);
+    } else {
+      // release through the unique_lock (not the raw mutex) so the lock's
+      // owning state stays consistent: unlocking "mtx" directly would leave
+      // "l" believing it still owns the mutex and trigger a double unlock
+      // (UB) if luaL_error propagates as a C++ exception during unwinding
+      l.unlock();
+      return luaL_error(L, "can increment only numeric values");
+    }
+  }
+
+  return 0;
+}
+
+// construct the background-script runner.
+// "execute_interval" is the sleep (in seconds) between script executions.
+// the thread is not started here - call start()
+Background::Background(rgw::sal::Driver* driver,
+    CephContext* cct,
+    const std::string& luarocks_path,
+    int execute_interval) :
+  execute_interval(execute_interval),
+  dp(cct, dout_subsys, "lua background: "),
+  lua_manager(driver->get_lua_manager()),
+  cct(cct),
+  luarocks_path(luarocks_path) {}
+
+// stop the background thread and join it. safe to call when the thread was
+// never started. after shutdown() the object may be start()ed again
+void Background::shutdown(){
+  {
+    // set "stopped" under the condition mutex so the runner cannot miss the
+    // notification between testing the predicate and blocking in cond.wait*()
+    std::unique_lock cond_lock(cond_mutex);
+    stopped = true;
+  }
+  cond.notify_all();
+  if (runner.joinable()) {
+    runner.join();
+  }
+  started = false;
+  stopped = false;
+}
+
+// spawn the runner thread; idempotent until shutdown() resets "started"
+void Background::start() {
+  if (started) {
+    // start the thread only once
+    return;
+  }
+  started = true;
+  runner = std::thread(&Background::run, this);
+  const auto rc = ceph_pthread_setname(runner.native_handle(),
+      "lua_background");
+  ceph_assert(rc == 0);
+}
+
+// RGWRealmReloader::Pauser: stop reading/executing scripts until resume()
+void Background::pause() {
+  {
+    std::unique_lock cond_lock(pause_mutex);
+    paused = true;
+  }
+  cond.notify_all();
+}
+
+// RGWRealmReloader::Pauser: swap in the new driver's lua manager and wake
+// the background thread.
+// NOTE(review): "paused" is cleared here without holding pause_mutex,
+// unlike in pause() - confirm this ordering is intended
+void Background::resume(rgw::sal::Driver* driver) {
+  lua_manager = driver->get_lua_manager();
+  paused = false;
+  cond.notify_all();
+}
+
+// load the background-context script (empty tenant) into rgw_script.
+// returns -EAGAIN while paused, since the lua manager may be replaced
+// during a realm reload
+int Background::read_script() {
+  std::unique_lock cond_lock(pause_mutex);
+  if (paused) {
+    return -EAGAIN;
+  }
+  std::string tenant;
+  return rgw::lua::read_script(&dp, lua_manager.get(), tenant, null_yield, rgw::lua::context::background, rgw_script);
+}
+
+// default-constructed value returned for missing keys
+const BackgroundMapValue Background::empty_table_value;
+
+// look up a table entry under table_mutex.
+// NOTE(review): the returned reference is read by the caller after the lock
+// is released - confirm entries are never erased concurrently
+const BackgroundMapValue& Background::get_table_value(const std::string& key) const {
+  std::unique_lock cond_lock(table_mutex);
+  const auto it = rgw_map.find(key);
+  if (it == rgw_map.end()) {
+    return empty_table_value;
+  }
+  return it->second;
+}
+
+//(1) Loads the script from the object if not paused
+//(2) Executes the script
+//(3) Sleep (configurable)
+//(1) Loads the script from the object if not paused
+//(2) Executes the script
+//(3) Sleep (configurable)
+// thread body: a single lua state is created once and reused across all
+// iterations until shutdown() sets "stopped"
+void Background::run() {
+  lua_State* const L = luaL_newstate();
+  rgw::lua::lua_state_guard lguard(L);  // closes the lua state on every exit path
+  open_standard_libs(L);
+  set_package_path(L, luarocks_path);
+  create_debug_action(L, cct);
+  create_background_metatable(L);
+  const DoutPrefixProvider* const dpp = &dp;
+
+  while (!stopped) {
+    if (paused) {
+      ldpp_dout(dpp, 10) << "Lua background thread paused" << dendl;
+      std::unique_lock cond_lock(cond_mutex);
+      // block until resume() clears "paused" or shutdown() sets "stopped"
+      cond.wait(cond_lock, [this]{return !paused || stopped;});
+      if (stopped) {
+        ldpp_dout(dpp, 10) << "Lua background thread stopped" << dendl;
+        return;
+      }
+      ldpp_dout(dpp, 10) << "Lua background thread resumed" << dendl;
+    }
+    const auto rc = read_script();
+    if (rc == -ENOENT || rc == -EAGAIN) {
+      // either no script or paused, nothing to do
+    } else if (rc < 0) {
+      ldpp_dout(dpp, 1) << "WARNING: failed to read background script. error " << rc << dendl;
+    } else {
+      auto failed = false;
+      try {
+        //execute the background lua script
+        if (luaL_dostring(L, rgw_script.c_str()) != LUA_OK) {
+          const std::string err(lua_tostring(L, -1));
+          ldpp_dout(dpp, 1) << "Lua ERROR: " << err << dendl;
+          failed = true;
+        }
+      } catch (const std::exception& e) {
+        ldpp_dout(dpp, 1) << "Lua ERROR: " << e.what() << dendl;
+        failed = true;
+      }
+      // account script success/failure in the rgw perf counters
+      if (perfcounter) {
+        perfcounter->inc((failed ? l_rgw_lua_script_fail : l_rgw_lua_script_ok), 1);
+      }
+    }
+    // sleep between executions; shutdown() interrupts the wait early
+    std::unique_lock cond_lock(cond_mutex);
+    cond.wait_for(cond_lock, std::chrono::seconds(execute_interval), [this]{return stopped;});
+  }
+  ldpp_dout(dpp, 10) << "Lua background thread stopped" << dendl;
+}
+
+// register the global "RGW" table in the given lua state, backed by this
+// object's map and protected by table_mutex
+void Background::create_background_metatable(lua_State* L) {
+  create_metatable<rgw::lua::RGWTable>(L, true, &rgw_map, &table_mutex);
+}
+
+} //namespace rgw::lua
+
diff --git a/src/rgw/rgw_lua_background.h b/src/rgw/rgw_lua_background.h
new file mode 100644
index 000000000..e1271bceb
--- /dev/null
+++ b/src/rgw/rgw_lua_background.h
@@ -0,0 +1,230 @@
+#pragma once
+#include "common/dout.h"
+#include "rgw_common.h"
+#include <string>
+#include <unordered_map>
+#include <variant>
+#include "rgw_lua_utils.h"
+#include "rgw_realm_reloader.h"
+
+namespace rgw::lua {
+
+//Interval between each execution of the script is set to 5 seconds
+constexpr const int INIT_EXECUTE_INTERVAL = 5;
+
+//Writeable meta table named RGW with mutex protection
+using BackgroundMapValue = std::variant<std::string, long long int, double, bool>;
+using BackgroundMap = std::unordered_map<std::string, BackgroundMapValue>;
+
+// push a BackgroundMapValue alternative onto the lua stack:
+// one overload per variant alternative, selected via std::visit
+inline void pushvalue(lua_State* L, const std::string& value) {
+  pushstring(L, value);
+}
+
+inline void pushvalue(lua_State* L, long long value) {
+  lua_pushinteger(L, value);
+}
+
+inline void pushvalue(lua_State* L, double value) {
+  lua_pushnumber(L, value);
+}
+
+inline void pushvalue(lua_State* L, bool value) {
+  lua_pushboolean(L, value);
+}
+
+
+// lua metatable exposing the shared background map as a writeable table
+// named "RGW". every closure receives the map and its mutex as upvalues
+struct RGWTable : EmptyMetaTable {
+
+  // reserved function names exposed on the table
+  static const char* INCREMENT;
+  static const char* DECREMENT;
+
+  static std::string TableName() {return "RGW";}
+  static std::string Name() {return TableName() + "Meta";}
+
+  // implements RGW.increment/RGW.decrement (defined in the .cc file)
+  static int increment_by(lua_State* L);
+
+  // __index: return the stored value for the key, nil if missing, or one of
+  // the increment/decrement closures for the reserved names
+  static int IndexClosure(lua_State* L) {
+    const auto map = reinterpret_cast<BackgroundMap*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+    auto& mtx = *reinterpret_cast<std::mutex*>(lua_touserdata(L, lua_upvalueindex(SECOND_UPVAL)));
+    const char* index = luaL_checkstring(L, 2);
+
+    if (strcasecmp(index, INCREMENT) == 0) {
+      lua_pushlightuserdata(L, map);
+      lua_pushlightuserdata(L, &mtx);
+      lua_pushboolean(L, false /*increment*/);
+      lua_pushcclosure(L, increment_by, THREE_UPVALS);
+      return ONE_RETURNVAL;
+    }
+    if (strcasecmp(index, DECREMENT) == 0) {
+      lua_pushlightuserdata(L, map);
+      lua_pushlightuserdata(L, &mtx);
+      lua_pushboolean(L, true /*decrement*/);
+      lua_pushcclosure(L, increment_by, THREE_UPVALS);
+      return ONE_RETURNVAL;
+    }
+
+    std::lock_guard l(mtx);
+
+    const auto it = map->find(std::string(index));
+    if (it == map->end()) {
+      lua_pushnil(L);
+    } else {
+      std::visit([L](auto&& value) { pushvalue(L, value); }, it->second);
+    }
+    return ONE_RETURNVAL;
+  }
+
+  // __len: number of entries in the map
+  static int LenClosure(lua_State* L) {
+    const auto map = reinterpret_cast<BackgroundMap*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+    auto& mtx = *reinterpret_cast<std::mutex*>(lua_touserdata(L, lua_upvalueindex(SECOND_UPVAL)));
+
+    std::lock_guard l(mtx);
+
+    lua_pushinteger(L, map->size());
+
+    return ONE_RETURNVAL;
+  }
+
+  // __newindex: store a value under the key (nil erases it), enforcing the
+  // per-entry size limit and the total entry-count limit
+  static int NewIndexClosure(lua_State* L) {
+    const auto map = reinterpret_cast<BackgroundMap*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+    auto& mtx = *reinterpret_cast<std::mutex*>(lua_touserdata(L, lua_upvalueindex(SECOND_UPVAL)));
+    const auto index = luaL_checkstring(L, 2);
+
+    if (strcasecmp(index, INCREMENT) == 0 || strcasecmp(index, DECREMENT) == 0) {
+      return luaL_error(L, "increment/decrement are reserved function names for RGW");
+    }
+
+    std::unique_lock l(mtx);
+
+    size_t len;
+    BackgroundMapValue value;
+    const int value_type = lua_type(L, 3);
+
+    switch (value_type) {
+      case LUA_TNIL:
+        map->erase(std::string(index));
+        return NO_RETURNVAL;
+      case LUA_TBOOLEAN:
+        value = static_cast<bool>(lua_toboolean(L, 3));
+        len = sizeof(bool);
+        break;
+      case LUA_TNUMBER:
+        if (lua_isinteger(L, 3)) {
+          value = lua_tointeger(L, 3);
+          len = sizeof(long long int);
+        } else {
+          value = lua_tonumber(L, 3);
+          len = sizeof(double);
+        }
+        break;
+      case LUA_TSTRING:
+      {
+        const auto str = lua_tolstring(L, 3, &len);
+        value = std::string{str, len};
+        break;
+      }
+      default:
+        l.unlock();
+        return luaL_error(L, "unsupported value type for RGW table");
+    }
+
+    if (len + strnlen(index, MAX_LUA_VALUE_SIZE)
+        > MAX_LUA_VALUE_SIZE) {
+      // release the lock before raising, like the other error paths:
+      // luaL_error does not return, so leaving the mutex locked here would
+      // deadlock any later access to the table
+      l.unlock();
+      return luaL_error(L, "Lua maximum size of entry limit exceeded");
+    } else if (map->size() > MAX_LUA_KEY_ENTRIES) {
+      l.unlock();
+      return luaL_error(L, "Lua max number of entries limit exceeded");
+    } else {
+      map->insert_or_assign(index, value);
+    }
+
+    return NO_RETURNVAL;
+  }
+
+  // __pairs: return a stateless iterator over the map
+  static int PairsClosure(lua_State* L) {
+    auto map = reinterpret_cast<BackgroundMap*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+    ceph_assert(map);
+    lua_pushlightuserdata(L, map);
+    lua_pushcclosure(L, stateless_iter, ONE_UPVAL); // push the stateless iterator function
+    lua_pushnil(L);                                 // indicate this is the first call
+    // return stateless_iter, nil
+
+    return TWO_RETURNVALS;
+  }
+
+  // iterator step: given the previous key (or nil), push the next key/value
+  static int stateless_iter(lua_State* L) {
+    // based on: http://lua-users.org/wiki/GeneralizedPairsAndIpairs
+    auto map = reinterpret_cast<BackgroundMap*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+    typename BackgroundMap::const_iterator next_it;
+    if (lua_isnil(L, -1)) {
+      next_it = map->begin();
+    } else {
+      const char* index = luaL_checkstring(L, 2);
+      const auto it = map->find(std::string(index));
+      ceph_assert(it != map->end());
+      next_it = std::next(it);
+    }
+
+    if (next_it == map->end()) {
+      // index of the last element was provided
+      lua_pushnil(L);
+      lua_pushnil(L);
+      // return nil, nil
+    } else {
+      pushstring(L, next_it->first);
+      std::visit([L](auto&& value) { pushvalue(L, value); }, next_it->second);
+      // return key, value
+    }
+
+    return TWO_RETURNVALS;
+  }
+};
+
+// periodically executes the "background" context lua script in its own
+// thread, exposing a shared key/value table ("RGW") to all lua contexts
+class Background : public RGWRealmReloader::Pauser {
+
+private:
+  BackgroundMap rgw_map;  // key/value table shared with lua scripts
+  // NOTE(review): these flags are written from other threads and not
+  // consistently guarded by cond_mutex - confirm the intended memory model
+  bool stopped = false;
+  bool started = false;
+  bool paused = false;
+  int execute_interval;  // seconds between script executions
+  const DoutPrefix dp;
+  std::unique_ptr<rgw::sal::LuaManager> lua_manager;
+  CephContext* const cct;
+  const std::string luarocks_path;  // lua package install location
+  std::thread runner;
+  mutable std::mutex table_mutex;  // protects rgw_map
+  std::mutex cond_mutex;           // paired with cond for sleep/stop/pause
+  std::mutex pause_mutex;          // serializes pause() against read_script()
+  std::condition_variable cond;
+  static const BackgroundMapValue empty_table_value;
+
+  // thread body: load and execute the background script in a loop
+  void run();
+
+protected:
+  std::string rgw_script;  // last script read from storage
+  virtual int read_script();
+
+public:
+  Background(rgw::sal::Driver* driver,
+      CephContext* cct,
+      const std::string& luarocks_path,
+      int execute_interval = INIT_EXECUTE_INTERVAL);
+
+  virtual ~Background() = default;
+  void start();
+  void shutdown();
+  void create_background_metatable(lua_State* L);
+  // return the value stored under "key", or a reference to a shared
+  // default-constructed value if the key is missing
+  const BackgroundMapValue& get_table_value(const std::string& key) const;
+  // store "value" under "key", overwriting any existing entry
+  template<typename T>
+  void put_table_value(const std::string& key, T value) {
+    std::unique_lock cond_lock(table_mutex);
+    rgw_map[key] = value;
+  }
+
+  void pause() override;
+  void resume(rgw::sal::Driver* _driver) override;
+};
+
+} //namespace rgw::lua
+
diff --git a/src/rgw/rgw_lua_data_filter.cc b/src/rgw/rgw_lua_data_filter.cc
new file mode 100644
index 000000000..9ebaf3453
--- /dev/null
+++ b/src/rgw/rgw_lua_data_filter.cc
@@ -0,0 +1,143 @@
+#include "rgw_lua_data_filter.h"
+#include "rgw_lua_utils.h"
+#include "rgw_lua_request.h"
+#include "rgw_lua_background.h"
+#include "rgw_process_env.h"
+#include <lua.hpp>
+
+namespace rgw::lua {
+
+// copy the single byte at the iterator's position and push it onto the
+// lua stack as a one-character string
+void push_bufferlist_byte(lua_State* L, bufferlist::iterator& it) {
+  char ch[1];
+  it.copy(1, ch);
+  lua_pushlstring(L, ch, 1);
+}
+
+// read-only lua view ("Data" table) over a bufferlist: 1-based byte
+// indexing, ipairs-style iteration, and # for the length
+struct BufferlistMetaTable : public EmptyMetaTable {
+
+  static std::string TableName() {return "Data";}
+  static std::string Name() {return TableName() + "Meta";}
+
+  // __index: Data[i] returns the i-th byte as a 1-char string, nil when
+  // out of range
+  static int IndexClosure(lua_State* L) {
+    auto bl = reinterpret_cast<bufferlist*>(lua_touserdata(L, lua_upvalueindex(1)));
+    const auto index = luaL_checkinteger(L, 2);
+    if (index <= 0 || index > bl->length()) {
+      // lua arrays start from 1
+      lua_pushnil(L);
+      return ONE_RETURNVAL;
+    }
+    auto it = bl->begin(index-1);
+    if (it != bl->end()) {
+      push_bufferlist_byte(L, it);
+    } else {
+      lua_pushnil(L);
+    }
+
+    return ONE_RETURNVAL;
+  }
+
+  // __pairs: return a stateless iterator over the bytes
+  static int PairsClosure(lua_State* L) {
+    auto bl = reinterpret_cast<bufferlist*>(lua_touserdata(L, lua_upvalueindex(1)));
+    ceph_assert(bl);
+    lua_pushlightuserdata(L, bl);
+    lua_pushcclosure(L, stateless_iter, ONE_UPVAL); // push the stateless iterator function
+    lua_pushnil(L);                                 // indicate this is the first call
+    // return stateless_iter, nil
+
+    return TWO_RETURNVALS;
+  }
+
+  // iterator step: given the previous index (or nil), push the next
+  // index/byte pair
+  static int stateless_iter(lua_State* L) {
+    // based on: http://lua-users.org/wiki/GeneralizedPairsAndIpairs
+    auto bl = reinterpret_cast<bufferlist*>(lua_touserdata(L, lua_upvalueindex(1)));
+    lua_Integer index;
+    if (lua_isnil(L, -1)) {
+      index = 1;
+    } else {
+      index = luaL_checkinteger(L, -1) + 1;
+    }
+
+    if (index > bl->length()) {
+      // index of the last element was provided
+      lua_pushnil(L);
+      lua_pushnil(L);
+      // return nil, nil
+    } else {
+      // check bounds BEFORE creating the iterator: the original constructed
+      // bl->begin(index-1) unconditionally, even for an out-of-range offset
+      // lua arrays start from 1
+      auto it = bl->begin(index-1);
+      lua_pushinteger(L, index);
+      push_bufferlist_byte(L, it);
+      // return key, value
+    }
+
+    return TWO_RETURNVALS;
+  }
+
+  // __len: total number of bytes
+  static int LenClosure(lua_State* L) {
+    const auto bl = reinterpret_cast<bufferlist*>(lua_touserdata(L, lua_upvalueindex(1)));
+
+    lua_pushinteger(L, bl->length());
+
+    return ONE_RETURNVAL;
+  }
+};
+
+// run the lua script over a single data chunk.
+// the script sees: the "Data" table (the chunk bytes), the "Request" table,
+// the integer global "Offset" of the chunk, and - when the background
+// thread exists - the shared "RGW" table.
+// returns 0 on success, -EINVAL if the script failed to execute
+int RGWObjFilter::execute(bufferlist& bl, off_t offset, const char* op_name) const {
+  auto L = luaL_newstate();
+  lua_state_guard lguard(L);  // closes the lua state on every return path
+
+  open_standard_libs(L);
+
+  create_debug_action(L, s->cct);
+
+  // create the "Data" table
+  create_metatable<BufferlistMetaTable>(L, true, &bl);
+  lua_getglobal(L, BufferlistMetaTable::TableName().c_str());
+  ceph_assert(lua_istable(L, -1));
+
+  // create the "Request" table
+  request::create_top_metatable(L, s, op_name);
+
+  // create the "Offset" variable
+  lua_pushinteger(L, offset);
+  lua_setglobal(L, "Offset");
+
+  if (s->penv.lua.background) {
+    // create the "RGW" table
+    s->penv.lua.background->create_background_metatable(L);
+    lua_getglobal(L, rgw::lua::RGWTable::TableName().c_str());
+    ceph_assert(lua_istable(L, -1));
+  }
+
+  try {
+    // execute the lua script
+    if (luaL_dostring(L, script.c_str()) != LUA_OK) {
+      const std::string err(lua_tostring(L, -1));
+      ldpp_dout(s, 1) << "Lua ERROR: " << err << dendl;
+      return -EINVAL;
+    }
+  } catch (const std::runtime_error& e) {
+    ldpp_dout(s, 1) << "Lua ERROR: " << e.what() << dendl;
+    return -EINVAL;
+  }
+
+  return 0;
+}
+
+// GET-path filter hook: run the lua script on the chunk, then pass the
+// chunk on unchanged. a failing script must not fail the request, so the
+// execute() return code is deliberately discarded
+int RGWGetObjFilter::handle_data(bufferlist& bl,
+    off_t bl_ofs,
+    off_t bl_len) {
+  static_cast<void>(filter.execute(bl, bl_ofs, "get_obj"));
+  return RGWGetObj_Filter::handle_data(bl, bl_ofs, bl_len);
+}
+
+// PUT-path filter hook: same contract as handle_data() above - the lua
+// result is ignored and the data is forwarded to the next processor
+int RGWPutObjFilter::process(bufferlist&& data, uint64_t logical_offset) {
+  static_cast<void>(filter.execute(data, logical_offset, "put_obj"));
+  return rgw::putobj::Pipe::process(std::move(data), logical_offset);
+}
+
+} // namespace rgw::lua
+
diff --git a/src/rgw/rgw_lua_data_filter.h b/src/rgw/rgw_lua_data_filter.h
new file mode 100644
index 000000000..75596b64e
--- /dev/null
+++ b/src/rgw/rgw_lua_data_filter.h
@@ -0,0 +1,52 @@
+#pragma once
+
+#include "rgw_op.h"
+
+class DoutPrefixProvider;
+
+namespace rgw::lua {
+
+// runs a lua script over object data chunks; shared by the GET and PUT
+// filter wrappers below
+class RGWObjFilter {
+  req_state* const s;
+  const std::string script;  // the lua source to execute per chunk
+
+public:
+  RGWObjFilter(req_state* s,
+      const std::string& script) :
+    s(s), script(script) {}
+
+  // execute the script on one chunk at the given offset; "op_name" is
+  // exposed to the script to identify the operation
+  int execute(bufferlist& bl, off_t offset, const char* op_name) const;
+};
+
+// GET-path filter: runs the lua script on each chunk before forwarding it
+class RGWGetObjFilter : public RGWGetObj_Filter {
+  const RGWObjFilter filter;
+
+public:
+  RGWGetObjFilter(req_state* s,
+      const std::string& script,
+      RGWGetObj_Filter* next) : RGWGetObj_Filter(next), filter(s, script)
+  {}
+
+  ~RGWGetObjFilter() override = default;
+
+  int handle_data(bufferlist& bl,
+      off_t bl_ofs,
+      off_t bl_len) override;
+
+};
+
+// PUT-path filter: runs the lua script on each chunk before forwarding it
+class RGWPutObjFilter : public rgw::putobj::Pipe {
+  const RGWObjFilter filter;
+
+public:
+  RGWPutObjFilter(req_state* s,
+      const std::string& script,
+      rgw::sal::DataProcessor* next) : rgw::putobj::Pipe(next), filter(s, script)
+  {}
+
+  ~RGWPutObjFilter() override = default;
+
+  int process(bufferlist&& data, uint64_t logical_offset) override;
+};
+} // namespace rgw::lua
+
diff --git a/src/rgw/rgw_lua_request.cc b/src/rgw/rgw_lua_request.cc
new file mode 100644
index 000000000..6d324d4fc
--- /dev/null
+++ b/src/rgw/rgw_lua_request.cc
@@ -0,0 +1,906 @@
+#include <sstream>
+#include <stdexcept>
+#include <lua.hpp>
+#include "common/dout.h"
+#include "services/svc_zone.h"
+#include "rgw_lua_utils.h"
+#include "rgw_lua.h"
+#include "rgw_common.h"
+#include "rgw_log.h"
+#include "rgw_op.h"
+#include "rgw_process_env.h"
+#include "rgw_zone.h"
+#include "rgw_acl.h"
+#include "rgw_sal_rados.h"
+#include "rgw_lua_background.h"
+#include "rgw_perf_counters.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+namespace rgw::lua::request {
+
+// closure that performs the ops log action
+// e.g.
+//    Request.Log()
+//
+constexpr const char* RequestLogAction{"Log"};
+
+// upvalues: RGWREST, OpsLogSink, req_state, RGWOp.
+// pushes the rgw_log_op() return code, or -EINVAL when there is no
+// request state
+int RequestLog(lua_State* L)
+{
+  const auto rest = reinterpret_cast<RGWREST*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+  const auto olog = reinterpret_cast<OpsLogSink*>(lua_touserdata(L, lua_upvalueindex(SECOND_UPVAL)));
+  const auto s = reinterpret_cast<req_state*>(lua_touserdata(L, lua_upvalueindex(THIRD_UPVAL)));
+  const auto op(reinterpret_cast<RGWOp*>(lua_touserdata(L, lua_upvalueindex(FOURTH_UPVAL))));
+  if (s) {
+    const auto rc = rgw_log_op(rest, s, op, olog);
+    lua_pushinteger(L, rc);
+  } else {
+    // NOTE(review): "s" is null on this branch, so it is also passed as a
+    // null prefix provider to ldpp_dout - confirm the macro tolerates that
+    ldpp_dout(s, 1) << "Lua ERROR: missing request state, cannot use ops log" << dendl;
+    lua_pushinteger(L, -EINVAL);
+  }
+
+  return ONE_RETURNVAL;
+}
+
+// lua: Request.Trace.SetAttribute(key, value)
+// attach a string or numeric attribute to the request's trace span.
+// no-op when tracing is disabled or the span is not recording;
+// raises a lua error for unsupported value types
+int SetAttribute(lua_State* L) {
+  auto s = reinterpret_cast<req_state*>(lua_touserdata(L, lua_upvalueindex(1)));
+
+  if (!s->trace || !s->trace->IsRecording()) {
+    return 0;
+  }
+
+  auto key = luaL_checkstring(L, 1);
+  int value_type = lua_type(L, 2);
+
+  switch (value_type) {
+    case LUA_TSTRING:
+      s->trace->SetAttribute(key, lua_tostring(L, 2));
+      break;
+
+    case LUA_TNUMBER:
+      if (lua_isinteger(L, 2)) {
+        s->trace->SetAttribute(key, static_cast<int64_t>(lua_tointeger(L, 2)));
+      } else {
+        s->trace->SetAttribute(key, static_cast<double>(lua_tonumber(L, 2)));
+      }
+      break;
+
+    default:
+      // luaL_error never returns; use the conventional "return luaL_error"
+      // form so the non-returning control flow is explicit
+      return luaL_error(L, "unsupported value type for SetAttribute");
+  }
+  return 0;
+}
+
+// lua: Request.Trace.AddEvent(name) or AddEvent(name, {k=v, ...})
+// add an event (optionally with string/numeric attributes) to the
+// request's trace span. no-op when tracing is off or not recording
+int AddEvent(lua_State* L) {
+  auto s = reinterpret_cast<req_state*>(lua_touserdata(L, lua_upvalueindex(1)));
+
+  if (!s->trace || !s->trace->IsRecording()) {
+    return 0;
+  }
+
+  int args = lua_gettop(L);
+  if (args == 1) {
+    // single-argument form: just an event name
+    auto log = luaL_checkstring(L, 1);
+    s->trace->AddEvent(log);
+  } else if(args == 2) {
+    auto event_name = luaL_checkstring(L, 1);
+    std::unordered_map<const char*, jspan_attribute> event_values;
+    // iterate the attribute table at stack index 2 with lua_next
+    lua_pushnil(L);
+    while (lua_next(L, 2) != 0) {
+      if (lua_type(L, -2) != LUA_TSTRING) {
+        // skip pair if key is not a string
+        lua_pop(L, 1);
+        continue;
+      }
+
+      auto key = luaL_checkstring(L, -2);
+      int value_type = lua_type(L, -1);
+      // only string and numeric attribute values are collected;
+      // other types are silently skipped
+      switch (value_type) {
+        case LUA_TSTRING:
+          event_values.emplace(key, lua_tostring(L, -1));
+          break;
+
+        case LUA_TNUMBER:
+          if (lua_isinteger(L, -1)) {
+            event_values.emplace(key, static_cast<int64_t>(lua_tointeger(L, -1)));
+          } else {
+            event_values.emplace(key, static_cast<double>(lua_tonumber(L, -1)));
+          }
+          break;
+      }
+      lua_pop(L, 1);
+    }
+    lua_pop(L, 1);
+    s->trace->AddEvent(event_name, event_values);
+  }
+  return 0;
+}
+
+// read/write lua view over the request's rgw_err (HTTP status, rgw code,
+// status string, message)
+struct ResponseMetaTable : public EmptyMetaTable {
+  static std::string TableName() {return "Response";}
+  static std::string Name() {return TableName() + "Meta";}
+
+  static int IndexClosure(lua_State* L) {
+    const auto err = reinterpret_cast<const rgw_err*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+
+    const char* index = luaL_checkstring(L, 2);
+
+    if (strcasecmp(index, "HTTPStatusCode") == 0) {
+      lua_pushinteger(L, err->http_ret);
+    } else if (strcasecmp(index, "RGWCode") == 0) {
+      lua_pushinteger(L, err->ret);
+    } else if (strcasecmp(index, "HTTPStatus") == 0) {
+      pushstring(L, err->err_code);
+    } else if (strcasecmp(index, "Message") == 0) {
+      pushstring(L, err->message);
+    } else {
+      return error_unknown_field(L, index, TableName());
+    }
+    return ONE_RETURNVAL;
+  }
+
+  // writes let the script override the response sent to the client
+  static int NewIndexClosure(lua_State* L) {
+    auto err = reinterpret_cast<rgw_err*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+
+    const char* index = luaL_checkstring(L, 2);
+
+    if (strcasecmp(index, "HTTPStatusCode") == 0) {
+      err->http_ret = luaL_checkinteger(L, 3);
+    } else if (strcasecmp(index, "RGWCode") == 0) {
+      err->ret = luaL_checkinteger(L, 3);
+    } else if (strcasecmp(index, "HTTPStatus") == 0) {
+      err->err_code.assign(luaL_checkstring(L, 3));
+    } else if (strcasecmp(index, "Message") == 0) {
+      err->message.assign(luaL_checkstring(L, 3));
+    } else {
+      return error_unknown_field(L, index, TableName());
+    }
+    return NO_RETURNVAL;
+  }
+};
+
+// read-only lua view over an RGWQuotaInfo
+struct QuotaMetaTable : public EmptyMetaTable {
+  static std::string TableName() {return "Quota";}
+  static std::string Name() {return TableName() + "Meta";}
+
+  static int IndexClosure(lua_State* L) {
+    const auto info = reinterpret_cast<RGWQuotaInfo*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+
+    const char* index = luaL_checkstring(L, 2);
+
+    if (strcasecmp(index, "MaxSize") == 0) {
+      lua_pushinteger(L, info->max_size);
+    } else if (strcasecmp(index, "MaxObjects") == 0) {
+      lua_pushinteger(L, info->max_objects);
+    } else if (strcasecmp(index, "Enabled") == 0) {
+      lua_pushboolean(L, info->enabled);
+    } else if (strcasecmp(index, "Rounded") == 0) {
+      // "Rounded" is the inverse of check_on_raw
+      lua_pushboolean(L, !info->check_on_raw);
+    } else {
+      return error_unknown_field(L, index, TableName());
+    }
+    return ONE_RETURNVAL;
+  }
+};
+
+// read-only lua view over an rgw_placement_rule
+struct PlacementRuleMetaTable : public EmptyMetaTable {
+  static std::string TableName() {return "PlacementRule";}
+  static std::string Name() {return TableName() + "Meta";}
+
+  static int IndexClosure(lua_State* L) {
+    const auto rule = reinterpret_cast<rgw_placement_rule*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+
+    const char* index = luaL_checkstring(L, 2);
+
+    if (strcasecmp(index, "Name") == 0) {
+      pushstring(L, rule->name);
+    } else if (strcasecmp(index, "StorageClass") == 0) {
+      pushstring(L, rule->storage_class);
+    } else {
+      return error_unknown_field(L, index, TableName());
+    }
+    return ONE_RETURNVAL;
+  }
+};
+
+// read-only lua view over an rgw_user (tenant + id)
+struct UserMetaTable : public EmptyMetaTable {
+  static std::string TableName() {return "User";}
+  static std::string Name() {return TableName() + "Meta";}
+
+  static int IndexClosure(lua_State* L) {
+    const auto user = reinterpret_cast<const rgw_user*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+
+    const char* index = luaL_checkstring(L, 2);
+
+    if (strcasecmp(index, "Tenant") == 0) {
+      pushstring(L, user->tenant);
+    } else if (strcasecmp(index, "Id") == 0) {
+      pushstring(L, user->id);
+    } else {
+      return error_unknown_field(L, index, TableName());
+    }
+    return ONE_RETURNVAL;
+  }
+};
+
+// lua view over request tracing: read/write "Enable" flag plus the
+// SetAttribute/AddEvent closures
+struct TraceMetaTable : public EmptyMetaTable {
+  static std::string TableName() {return "Trace";}
+  static std::string Name() {return TableName() + "Meta";}
+
+  static int IndexClosure(lua_State* L) {
+    const auto s = reinterpret_cast<req_state*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+
+    const char* index = luaL_checkstring(L, 2);
+
+    if (strcasecmp(index, "Enable") == 0) {
+      lua_pushboolean(L, s->trace_enabled);
+    } else if(strcasecmp(index, "SetAttribute") == 0) {
+      lua_pushlightuserdata(L, s);
+      lua_pushcclosure(L, SetAttribute, ONE_UPVAL);
+    } else if(strcasecmp(index, "AddEvent") == 0) {
+      lua_pushlightuserdata(L, s);
+      lua_pushcclosure(L, AddEvent, ONE_UPVAL);
+    } else {
+      return error_unknown_field(L, index, TableName());
+    }
+    return ONE_RETURNVAL;
+  }
+
+  // only "Enable" is writeable
+  static int NewIndexClosure(lua_State* L) {
+    const auto s = reinterpret_cast<req_state*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+
+    const char* index = luaL_checkstring(L, 2);
+
+    if (strcasecmp(index, "Enable") == 0) {
+      s->trace_enabled = lua_toboolean(L, 3);
+    } else {
+      return error_unknown_field(L, index, TableName());
+    }
+    return NO_RETURNVAL;
+  }
+};
+
+// read-only lua view over an ACLOwner (display name + owning user)
+struct OwnerMetaTable : public EmptyMetaTable {
+  static std::string TableName() {return "Owner";}
+  static std::string Name() {return TableName() + "Meta";}
+
+  static int IndexClosure(lua_State* L) {
+    const auto owner = reinterpret_cast<ACLOwner*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+
+    const char* index = luaL_checkstring(L, 2);
+
+    if (strcasecmp(index, "DisplayName") == 0) {
+      pushstring(L, owner->get_display_name());
+    } else if (strcasecmp(index, "User") == 0) {
+      create_metatable<UserMetaTable>(L, false, &(owner->get_id()));
+    } else {
+      return error_unknown_field(L, index, TableName());
+    }
+    return ONE_RETURNVAL;
+  }
+};
+
+// lua view over the request's bucket. when no bucket object exists yet
+// (e.g. pre-request), only "Name" (the URL bucket) is available and
+// writeable; all other fields read from the sal bucket
+struct BucketMetaTable : public EmptyMetaTable {
+  static std::string TableName() {return "Bucket";}
+  static std::string Name() {return TableName() + "Meta";}
+
+  static int IndexClosure(lua_State* L) {
+    const auto s = reinterpret_cast<req_state*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+    const auto bucket = s->bucket.get();
+
+    const char* index = luaL_checkstring(L, 2);
+
+    if (rgw::sal::Bucket::empty(bucket)) {
+      // no bucket object yet: only the URL bucket name is known
+      if (strcasecmp(index, "Name") == 0) {
+        pushstring(L, s->init_state.url_bucket);
+      } else {
+        lua_pushnil(L);
+      }
+    } else if (strcasecmp(index, "Tenant") == 0) {
+      pushstring(L, bucket->get_tenant());
+    } else if (strcasecmp(index, "Name") == 0) {
+      pushstring(L, bucket->get_name());
+    } else if (strcasecmp(index, "Marker") == 0) {
+      pushstring(L, bucket->get_marker());
+    } else if (strcasecmp(index, "Id") == 0) {
+      pushstring(L, bucket->get_bucket_id());
+    } else if (strcasecmp(index, "Count") == 0) {
+      lua_pushinteger(L, bucket->get_count());
+    } else if (strcasecmp(index, "Size") == 0) {
+      lua_pushinteger(L, bucket->get_size());
+    } else if (strcasecmp(index, "ZoneGroupId") == 0) {
+      pushstring(L, bucket->get_info().zonegroup);
+    } else if (strcasecmp(index, "CreationTime") == 0) {
+      pushtime(L, bucket->get_creation_time());
+    } else if (strcasecmp(index, "MTime") == 0) {
+      pushtime(L, bucket->get_modification_time());
+    } else if (strcasecmp(index, "Quota") == 0) {
+      create_metatable<QuotaMetaTable>(L, false, &(bucket->get_info().quota));
+    } else if (strcasecmp(index, "PlacementRule") == 0) {
+      create_metatable<PlacementRuleMetaTable>(L, false, &(bucket->get_info().placement_rule));
+    } else if (strcasecmp(index, "User") == 0) {
+      create_metatable<UserMetaTable>(L, false, &(bucket->get_info().owner));
+    } else {
+      return error_unknown_field(L, index, TableName());
+    }
+    return ONE_RETURNVAL;
+  }
+
+  // only "Name" is writeable, and only before the bucket object exists
+  static int NewIndexClosure(lua_State* L) {
+    const auto s = reinterpret_cast<req_state*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+    const auto bucket = s->bucket.get();
+
+    const char* index = luaL_checkstring(L, 2);
+
+    if (rgw::sal::Bucket::empty(bucket)) {
+      if (strcasecmp(index, "Name") == 0) {
+        s->init_state.url_bucket = luaL_checkstring(L, 3);
+        return NO_RETURNVAL;
+      }
+    }
+    return error_unknown_field(L, index, TableName());
+  }
+};
+
+// read-only lua view over a sal Object (name, instance, oid, size, mtime)
+struct ObjectMetaTable : public EmptyMetaTable {
+  static const std::string TableName() {return "Object";}
+  static std::string Name() {return TableName() + "Meta";}
+
+  using Type = rgw::sal::Object;
+
+  static int IndexClosure(lua_State* L) {
+    const auto obj = reinterpret_cast<const Type*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+
+    const char* index = luaL_checkstring(L, 2);
+
+    if (strcasecmp(index, "Name") == 0) {
+      pushstring(L, obj->get_name());
+    } else if (strcasecmp(index, "Instance") == 0) {
+      pushstring(L, obj->get_instance());
+    } else if (strcasecmp(index, "Id") == 0) {
+      pushstring(L, obj->get_oid());
+    } else if (strcasecmp(index, "Size") == 0) {
+      lua_pushinteger(L, obj->get_obj_size());
+    } else if (strcasecmp(index, "MTime") == 0) {
+      pushtime(L, obj->get_mtime());
+    } else {
+      return error_unknown_field(L, index, TableName());
+    }
+    return ONE_RETURNVAL;
+  }
+};
+
+// read-only lua view over an ACLGrant (type, grantee user, permission,
+// group type, referer)
+struct GrantMetaTable : public EmptyMetaTable {
+  static std::string TableName() {return "Grant";}
+  static std::string Name() {return TableName() + "Meta";}
+
+  static int IndexClosure(lua_State* L) {
+    const auto grant = reinterpret_cast<ACLGrant*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+
+    const char* index = luaL_checkstring(L, 2);
+
+    if (strcasecmp(index, "Type") == 0) {
+      lua_pushinteger(L, grant->get_type().get_type());
+    } else if (strcasecmp(index, "User") == 0) {
+      // a grant may have no user (e.g. group/referer grants) - push nil then
+      const auto id_ptr = grant->get_id();
+      if (id_ptr) {
+        create_metatable<UserMetaTable>(L, false, const_cast<rgw_user*>(id_ptr));
+      } else {
+        lua_pushnil(L);
+      }
+    } else if (strcasecmp(index, "Permission") == 0) {
+      lua_pushinteger(L, grant->get_permission().get_permissions());
+    } else if (strcasecmp(index, "GroupType") == 0) {
+      lua_pushinteger(L, grant->get_group());
+    } else if (strcasecmp(index, "Referer") == 0) {
+      pushstring(L, grant->get_referer());
+    } else {
+      return error_unknown_field(L, index, TableName());
+    }
+    return ONE_RETURNVAL;
+  }
+};
+
+struct GrantsMetaTable : public EmptyMetaTable {
+ static std::string TableName() {return "Grants";}
+ static std::string Name() {return TableName() + "Meta";}
+
+ static int IndexClosure(lua_State* L) {
+ const auto map = reinterpret_cast<ACLGrantMap*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+
+ const char* index = luaL_checkstring(L, 2);
+
+ const auto it = map->find(std::string(index));
+ if (it == map->end()) {
+ lua_pushnil(L);
+ } else {
+ create_metatable<GrantMetaTable>(L, false, &(it->second));
+ }
+ return ONE_RETURNVAL;
+ }
+
+ static int PairsClosure(lua_State* L) {
+ auto map = reinterpret_cast<ACLGrantMap*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+ ceph_assert(map);
+ lua_pushlightuserdata(L, map);
+ lua_pushcclosure(L, stateless_iter, ONE_UPVAL); // push the stateless iterator function
+ lua_pushnil(L); // indicate this is the first call
+ // return stateless_iter, nil
+
+ return TWO_RETURNVALS;
+ }
+
+ static int stateless_iter(lua_State* L) {
+ // based on: http://lua-users.org/wiki/GeneralizedPairsAndIpairs
+ auto map = reinterpret_cast<ACLGrantMap*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+ ACLGrantMap::iterator next_it;
+ if (lua_isnil(L, -1)) {
+ next_it = map->begin();
+ } else {
+ const char* index = luaL_checkstring(L, 2);
+ const auto it = map->find(std::string(index));
+ ceph_assert(it != map->end());
+ next_it = std::next(it);
+ }
+
+ if (next_it == map->end()) {
+ // index of the last element was provided
+ lua_pushnil(L);
+ lua_pushnil(L);
+ return TWO_RETURNVALS;
+ // return nil, nil
+ }
+
+ while (next_it->first.empty()) {
+ // this is a multimap and the next element does not have a unique key
+ ++next_it;
+ if (next_it == map->end()) {
+ // index of the last element was provided
+ lua_pushnil(L);
+ lua_pushnil(L);
+ return TWO_RETURNVALS;
+ // return nil, nil
+ }
+ }
+
+ pushstring(L, next_it->first);
+ create_metatable<GrantMetaTable>(L, false, &(next_it->second));
+ // return key, value
+
+ return TWO_RETURNVALS;
+ }
+
+ static int LenClosure(lua_State* L) {
+ const auto map = reinterpret_cast<ACLGrantMap*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+
+ lua_pushinteger(L, map->size());
+
+ return ONE_RETURNVAL;
+ }
+};
+
+struct ACLMetaTable : public EmptyMetaTable {
+ static std::string TableName() {return "ACL";}
+ static std::string Name() {return TableName() + "Meta";}
+
+ using Type = RGWAccessControlPolicy;
+
+ static int IndexClosure(lua_State* L) {
+ const auto acl = reinterpret_cast<Type*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+
+ const char* index = luaL_checkstring(L, 2);
+
+ if (strcasecmp(index, "Owner") == 0) {
+ create_metatable<OwnerMetaTable>(L, false, &(acl->get_owner()));
+ } else if (strcasecmp(index, "Grants") == 0) {
+ create_metatable<GrantsMetaTable>(L, false, &(acl->get_acl().get_grant_map()));
+ } else {
+ return error_unknown_field(L, index, TableName());
+ }
+ return ONE_RETURNVAL;
+ }
+};
+
+struct StatementsMetaTable : public EmptyMetaTable {
+ static std::string TableName() {return "Statements";}
+ static std::string Name() {return TableName() + "Meta";}
+
+ using Type = std::vector<rgw::IAM::Statement>;
+
+ static std::string statement_to_string(const rgw::IAM::Statement& statement) {
+ std::stringstream ss;
+ ss << statement;
+ return ss.str();
+ }
+
+ static int IndexClosure(lua_State* L) {
+ const auto statements = reinterpret_cast<Type*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+
+ const auto index = luaL_checkinteger(L, 2);
+
+ if (index >= (int)statements->size() || index < 0) {
+ lua_pushnil(L);
+ } else {
+ // TODO: policy language could be interpreted to lua and executed as such
+ pushstring(L, statement_to_string((*statements)[index]));
+ }
+ return ONE_RETURNVAL;
+ }
+
+ static int PairsClosure(lua_State* L) {
+ auto statements = reinterpret_cast<Type*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+ ceph_assert(statements);
+ lua_pushlightuserdata(L, statements);
+ lua_pushcclosure(L, stateless_iter, ONE_UPVAL); // push the stateless iterator function
+ lua_pushnil(L); // indicate this is the first call
+ // return stateless_iter, nil
+
+ return TWO_RETURNVALS;
+ }
+
+ static int stateless_iter(lua_State* L) {
+ auto statements = reinterpret_cast<Type*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+ size_t next_it;
+ if (lua_isnil(L, -1)) {
+ next_it = 0;
+ } else {
+ const auto it = luaL_checkinteger(L, -1);
+ next_it = it+1;
+ }
+
+ if (next_it >= statements->size()) {
+ // index of the last element was provided
+ lua_pushnil(L);
+ lua_pushnil(L);
+ // return nil, nil
+ } else {
+ lua_pushinteger(L, next_it);
+ pushstring(L, statement_to_string((*statements)[next_it]));
+ // return key, value
+ }
+
+ return TWO_RETURNVALS;
+ }
+
+ static int LenClosure(lua_State* L) {
+ const auto statements = reinterpret_cast<Type*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+
+ lua_pushinteger(L, statements->size());
+
+ return ONE_RETURNVAL;
+ }
+};
+
+struct PolicyMetaTable : public EmptyMetaTable {
+ static std::string TableName() {return "Policy";}
+ static std::string Name() {return TableName() + "Meta";}
+
+ static int IndexClosure(lua_State* L) {
+ const auto policy = reinterpret_cast<rgw::IAM::Policy*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+
+ const char* index = luaL_checkstring(L, 2);
+
+ if (strcasecmp(index, "Text") == 0) {
+ pushstring(L, policy->text);
+ } else if (strcasecmp(index, "Id") == 0) {
+ // TODO create pushstring for std::unique_ptr
+ if (!policy->id) {
+ lua_pushnil(L);
+ } else {
+ pushstring(L, policy->id.get());
+ }
+ } else if (strcasecmp(index, "Statements") == 0) {
+ create_metatable<StatementsMetaTable>(L, false, &(policy->statements));
+ } else {
+ return error_unknown_field(L, index, TableName());
+ }
+ return ONE_RETURNVAL;
+ }
+};
+
+struct PoliciesMetaTable : public EmptyMetaTable {
+ static std::string TableName() {return "Policies";}
+ static std::string Name() {return TableName() + "Meta";}
+
+ using Type = std::vector<rgw::IAM::Policy>;
+
+ static int IndexClosure(lua_State* L) {
+ const auto policies = reinterpret_cast<Type*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+
+ const auto index = luaL_checkinteger(L, 2);
+
+ if (index >= (int)policies->size() || index < 0) {
+ lua_pushnil(L);
+ } else {
+ create_metatable<PolicyMetaTable>(L, false, &((*policies)[index]));
+ }
+ return ONE_RETURNVAL;
+ }
+
+ static int PairsClosure(lua_State* L) {
+ auto policies = reinterpret_cast<Type*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+ ceph_assert(policies);
+ lua_pushlightuserdata(L, policies);
+ lua_pushcclosure(L, stateless_iter, ONE_UPVAL); // push the stateless iterator function
+ lua_pushnil(L); // indicate this is the first call
+ // return stateless_iter, nil
+
+ return TWO_RETURNVALS;
+ }
+
+ static int stateless_iter(lua_State* L) {
+ auto policies = reinterpret_cast<Type*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+ size_t next_it;
+ if (lua_isnil(L, -1)) {
+ next_it = 0;
+ } else {
+ ceph_assert(lua_isinteger(L, -1));
+ const auto it = luaL_checkinteger(L, -1);
+ next_it = it+1;
+ }
+
+ if (next_it >= policies->size()) {
+ // index of the last element was provided
+ lua_pushnil(L);
+ lua_pushnil(L);
+ // return nil, nil
+ } else {
+ lua_pushinteger(L, next_it);
+ create_metatable<PolicyMetaTable>(L, false, &((*policies)[next_it]));
+ // return key, value
+ }
+
+ return TWO_RETURNVALS;
+ }
+
+ static int LenClosure(lua_State* L) {
+ const auto policies = reinterpret_cast<Type*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+
+ lua_pushinteger(L, policies->size());
+
+ return ONE_RETURNVAL;
+ }
+};
+
+struct HTTPMetaTable : public EmptyMetaTable {
+ static std::string TableName() {return "HTTP";}
+ static std::string Name() {return TableName() + "Meta";}
+
+ static int IndexClosure(lua_State* L) {
+ const auto info = reinterpret_cast<req_info*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+
+ const char* index = luaL_checkstring(L, 2);
+
+ if (strcasecmp(index, "Parameters") == 0) {
+ create_metatable<StringMapMetaTable<>>(L, false, &(info->args.get_params()));
+ } else if (strcasecmp(index, "Resources") == 0) {
+ // TODO: add non-const api to get resources
+ create_metatable<StringMapMetaTable<>>(L, false,
+ const_cast<std::map<std::string, std::string>*>(&(info->args.get_sub_resources())));
+ } else if (strcasecmp(index, "Metadata") == 0) {
+ create_metatable<StringMapMetaTable<meta_map_t, StringMapWriteableNewIndex<meta_map_t>>>(L, false, &(info->x_meta_map));
+ } else if (strcasecmp(index, "Host") == 0) {
+ pushstring(L, info->host);
+ } else if (strcasecmp(index, "Method") == 0) {
+ pushstring(L, info->method);
+ } else if (strcasecmp(index, "URI") == 0) {
+ pushstring(L, info->request_uri);
+ } else if (strcasecmp(index, "QueryString") == 0) {
+ pushstring(L, info->request_params);
+ } else if (strcasecmp(index, "Domain") == 0) {
+ pushstring(L, info->domain);
+ } else if (strcasecmp(index, "StorageClass") == 0) {
+ pushstring(L, info->storage_class);
+ } else {
+ return error_unknown_field(L, index, TableName());
+ }
+ return ONE_RETURNVAL;
+ }
+
+ static int NewIndexClosure(lua_State* L) {
+ auto info = reinterpret_cast<req_info*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+
+ const char* index = luaL_checkstring(L, 2);
+
+ if (strcasecmp(index, "StorageClass") == 0) {
+ info->storage_class = luaL_checkstring(L, 3);
+ } else {
+ return error_unknown_field(L, index, TableName());
+ }
+ return NO_RETURNVAL;
+ }
+};
+
+struct CopyFromMetaTable : public EmptyMetaTable {
+ static std::string TableName() {return "CopyFrom";}
+ static std::string Name() {return TableName() + "Meta";}
+
+ static int IndexClosure(lua_State* L) {
+ const auto s = reinterpret_cast<req_state*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+
+ const char* index = luaL_checkstring(L, 2);
+
+ if (strcasecmp(index, "Tenant") == 0) {
+ pushstring(L, s->src_tenant_name);
+ } else if (strcasecmp(index, "Bucket") == 0) {
+ pushstring(L, s->src_bucket_name);
+ } else if (strcasecmp(index, "Object") == 0) {
+ create_metatable<ObjectMetaTable>(L, false, s->src_object);
+ } else {
+ return error_unknown_field(L, index, TableName());
+ }
+ return ONE_RETURNVAL;
+ }
+};
+
+struct ZoneGroupMetaTable : public EmptyMetaTable {
+ static std::string TableName() {return "ZoneGroup";}
+ static std::string Name() {return TableName() + "Meta";}
+
+ static int IndexClosure(lua_State* L) {
+ const auto s = reinterpret_cast<req_state*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+
+ const char* index = luaL_checkstring(L, 2);
+
+ if (strcasecmp(index, "Name") == 0) {
+ pushstring(L, s->zonegroup_name);
+ } else if (strcasecmp(index, "Endpoint") == 0) {
+ pushstring(L, s->zonegroup_endpoint);
+ } else {
+ return error_unknown_field(L, index, TableName());
+ }
+ return ONE_RETURNVAL;
+ }
+};
+
+struct RequestMetaTable : public EmptyMetaTable {
+ static std::string TableName() {return "Request";}
+ static std::string Name() {return TableName() + "Meta";}
+
+  // __index closure that expects req_state to be captured
+ static int IndexClosure(lua_State* L) {
+ const auto s = reinterpret_cast<req_state*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+ const auto op_name = reinterpret_cast<const char*>(lua_touserdata(L, lua_upvalueindex(SECOND_UPVAL)));
+
+ const char* index = luaL_checkstring(L, 2);
+
+ if (strcasecmp(index, "RGWOp") == 0) {
+ pushstring(L, op_name);
+ } else if (strcasecmp(index, "DecodedURI") == 0) {
+ pushstring(L, s->decoded_uri);
+ } else if (strcasecmp(index, "ContentLength") == 0) {
+ lua_pushinteger(L, s->content_length);
+ } else if (strcasecmp(index, "GenericAttributes") == 0) {
+ create_metatable<StringMapMetaTable<>>(L, false, &(s->generic_attrs));
+ } else if (strcasecmp(index, "Response") == 0) {
+ create_metatable<ResponseMetaTable>(L, false, &(s->err));
+ } else if (strcasecmp(index, "SwiftAccountName") == 0) {
+ if (s->dialect == "swift") {
+ pushstring(L, s->account_name);
+ } else {
+ lua_pushnil(L);
+ }
+ } else if (strcasecmp(index, "Bucket") == 0) {
+ create_metatable<BucketMetaTable>(L, false, s);
+ } else if (strcasecmp(index, "Object") == 0) {
+ create_metatable<ObjectMetaTable>(L, false, s->object);
+ } else if (strcasecmp(index, "CopyFrom") == 0) {
+ if (s->op_type == RGW_OP_COPY_OBJ) {
+ create_metatable<CopyFromMetaTable>(L, s);
+ } else {
+ lua_pushnil(L);
+ }
+ } else if (strcasecmp(index, "ObjectOwner") == 0) {
+ create_metatable<OwnerMetaTable>(L, false, &(s->owner));
+ } else if (strcasecmp(index, "ZoneGroup") == 0) {
+ create_metatable<ZoneGroupMetaTable>(L, false, s);
+ } else if (strcasecmp(index, "UserACL") == 0) {
+ create_metatable<ACLMetaTable>(L, false, s->user_acl);
+ } else if (strcasecmp(index, "BucketACL") == 0) {
+ create_metatable<ACLMetaTable>(L, false, s->bucket_acl);
+ } else if (strcasecmp(index, "ObjectACL") == 0) {
+ create_metatable<ACLMetaTable>(L, false, s->object_acl);
+ } else if (strcasecmp(index, "Environment") == 0) {
+ create_metatable<StringMapMetaTable<rgw::IAM::Environment>>(L, false, &(s->env));
+ } else if (strcasecmp(index, "Policy") == 0) {
+ // TODO: create a wrapper to std::optional
+ if (!s->iam_policy) {
+ lua_pushnil(L);
+ } else {
+ create_metatable<PolicyMetaTable>(L, false, s->iam_policy.get_ptr());
+ }
+ } else if (strcasecmp(index, "UserPolicies") == 0) {
+ create_metatable<PoliciesMetaTable>(L, false, &(s->iam_user_policies));
+ } else if (strcasecmp(index, "RGWId") == 0) {
+ pushstring(L, s->host_id);
+ } else if (strcasecmp(index, "HTTP") == 0) {
+ create_metatable<HTTPMetaTable>(L, false, &(s->info));
+ } else if (strcasecmp(index, "Time") == 0) {
+ pushtime(L, s->time);
+ } else if (strcasecmp(index, "Dialect") == 0) {
+ pushstring(L, s->dialect);
+ } else if (strcasecmp(index, "Id") == 0) {
+ pushstring(L, s->req_id);
+ } else if (strcasecmp(index, "TransactionId") == 0) {
+ pushstring(L, s->trans_id);
+ } else if (strcasecmp(index, "Tags") == 0) {
+ create_metatable<StringMapMetaTable<RGWObjTags::tag_map_t>>(L, false, &(s->tagset.get_tags()));
+ } else if (strcasecmp(index, "User") == 0) {
+ if (!s->user) {
+ lua_pushnil(L);
+ } else {
+ create_metatable<UserMetaTable>(L, false, const_cast<rgw_user*>(&(s->user->get_id())));
+ }
+ } else if (strcasecmp(index, "Trace") == 0) {
+ create_metatable<TraceMetaTable>(L, false, s);
+ } else {
+ return error_unknown_field(L, index, TableName());
+ }
+ return ONE_RETURNVAL;
+ }
+};
+
+void create_top_metatable(lua_State* L, req_state* s, const char* op_name) {
+ create_metatable<RequestMetaTable>(L, true, s, const_cast<char*>(op_name));
+ lua_getglobal(L, RequestMetaTable::TableName().c_str());
+ ceph_assert(lua_istable(L, -1));
+}
+
+int execute(
+ rgw::sal::Driver* driver,
+ RGWREST* rest,
+ OpsLogSink* olog,
+ req_state* s,
+ RGWOp* op,
+ const std::string& script)
+{
+ auto L = luaL_newstate();
+ const char* op_name = op ? op->name() : "Unknown";
+ lua_state_guard lguard(L);
+
+ open_standard_libs(L);
+ set_package_path(L, s->penv.lua.luarocks_path);
+
+ create_debug_action(L, s->cct);
+
+ create_metatable<RequestMetaTable>(L, true, s, const_cast<char*>(op_name));
+
+ lua_getglobal(L, RequestMetaTable::TableName().c_str());
+ ceph_assert(lua_istable(L, -1));
+
+ // add the ops log action
+ pushstring(L, RequestLogAction);
+ lua_pushlightuserdata(L, rest);
+ lua_pushlightuserdata(L, olog);
+ lua_pushlightuserdata(L, s);
+ lua_pushlightuserdata(L, op);
+ lua_pushcclosure(L, RequestLog, FOUR_UPVALS);
+ lua_rawset(L, -3);
+
+ if (s->penv.lua.background) {
+ s->penv.lua.background->create_background_metatable(L);
+ lua_getglobal(L, rgw::lua::RGWTable::TableName().c_str());
+ ceph_assert(lua_istable(L, -1));
+ }
+
+ int rc = 0;
+ try {
+ // execute the lua script
+ if (luaL_dostring(L, script.c_str()) != LUA_OK) {
+ const std::string err(lua_tostring(L, -1));
+ ldpp_dout(s, 1) << "Lua ERROR: " << err << dendl;
+ rc = -1;
+ }
+ } catch (const std::runtime_error& e) {
+ ldpp_dout(s, 1) << "Lua ERROR: " << e.what() << dendl;
+ rc = -1;
+ }
+ if (perfcounter) {
+ perfcounter->inc((rc == -1 ? l_rgw_lua_script_fail : l_rgw_lua_script_ok), 1);
+ }
+
+ return rc;
+}
+
+} // namespace rgw::lua::request
+
diff --git a/src/rgw/rgw_lua_request.h b/src/rgw/rgw_lua_request.h
new file mode 100644
index 000000000..7c85ac9cd
--- /dev/null
+++ b/src/rgw/rgw_lua_request.h
@@ -0,0 +1,26 @@
+#pragma once
+
+#include <string>
+#include "include/common_fwd.h"
+#include "rgw_sal_fwd.h"
+
+struct lua_State;
+class req_state;
+class RGWREST;
+class OpsLogSink;
+
+namespace rgw::lua::request {
+
+// create the request metatable
+void create_top_metatable(lua_State* L, req_state* s, const char* op_name);
+
+// execute a lua script in the Request context
+int execute(
+ rgw::sal::Driver* driver,
+ RGWREST* rest,
+ OpsLogSink* olog,
+ req_state *s,
+ RGWOp* op,
+ const std::string& script);
+} // namespace rgw::lua::request
+
diff --git a/src/rgw/rgw_lua_utils.cc b/src/rgw/rgw_lua_utils.cc
new file mode 100644
index 000000000..3ffe23662
--- /dev/null
+++ b/src/rgw/rgw_lua_utils.cc
@@ -0,0 +1,77 @@
+#include <string>
+#include <lua.hpp>
+#include "common/ceph_context.h"
+#include "common/dout.h"
+#include "rgw_lua_utils.h"
+#include "rgw_lua_version.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+namespace rgw::lua {
+
+// TODO - add the following generic functions
+// lua_push(lua_State* L, const std::string& str)
+// template<typename T> lua_push(lua_State* L, const std::optional<T>& val)
+// lua_push(lua_State* L, const ceph::real_time& tp)
+
+constexpr const char* RGWDebugLogAction{"RGWDebugLog"};
+
+int RGWDebugLog(lua_State* L)
+{
+ auto cct = reinterpret_cast<CephContext*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+
+ auto message = luaL_checkstring(L, 1);
+ ldout(cct, 20) << "Lua INFO: " << message << dendl;
+ return 0;
+}
+
+void create_debug_action(lua_State* L, CephContext* cct) {
+ lua_pushlightuserdata(L, cct);
+ lua_pushcclosure(L, RGWDebugLog, ONE_UPVAL);
+ lua_setglobal(L, RGWDebugLogAction);
+}
+
+void stack_dump(lua_State* L) {
+ int top = lua_gettop(L);
+ std::cout << std::endl << " ---------------- Stack Dump ----------------" << std::endl;
+ std::cout << "Stack Size: " << top << std::endl;
+ for (int i = 1, j = -top; i <= top; i++, j++) {
+ std::cout << "[" << i << "," << j << "]: " << luaL_tolstring(L, i, NULL) << std::endl;
+ lua_pop(L, 1);
+ }
+ std::cout << "--------------- Stack Dump Finished ---------------" << std::endl;
+}
+
+void set_package_path(lua_State* L, const std::string& install_dir) {
+ if (install_dir.empty()) {
+ return;
+ }
+ lua_getglobal(L, "package");
+ if (!lua_istable(L, -1)) {
+ return;
+ }
+ const auto path = install_dir+"/share/lua/"+CEPH_LUA_VERSION+"/?.lua";
+ pushstring(L, path);
+ lua_setfield(L, -2, "path");
+
+ const auto cpath = install_dir+"/lib/lua/"+CEPH_LUA_VERSION+"/?.so";
+ pushstring(L, cpath);
+ lua_setfield(L, -2, "cpath");
+}
+
+void open_standard_libs(lua_State* L) {
+ luaL_openlibs(L);
+ unsetglobal(L, "load");
+ unsetglobal(L, "loadfile");
+ unsetglobal(L, "loadstring");
+ unsetglobal(L, "dofile");
+ unsetglobal(L, "debug");
+ // remove os.exit()
+ lua_getglobal(L, "os");
+ lua_pushstring(L, "exit");
+ lua_pushnil(L);
+ lua_settable(L, -3);
+}
+
+} // namespace rgw::lua
+
diff --git a/src/rgw/rgw_lua_utils.h b/src/rgw/rgw_lua_utils.h
new file mode 100644
index 000000000..cc77dae7a
--- /dev/null
+++ b/src/rgw/rgw_lua_utils.h
@@ -0,0 +1,315 @@
+#pragma once
+
+#include <string.h>
+#include <memory>
+#include <map>
+#include <string>
+#include <string_view>
+#include <ctime>
+#include <lua.hpp>
+
+#include "include/common_fwd.h"
+#include "rgw_perf_counters.h"
+
+namespace rgw::lua {
+
+// push ceph time in string format: "%Y-%m-%d %H:%M:%S"
+template <typename CephTime>
+void pushtime(lua_State* L, const CephTime& tp)
+{
+ const auto tt = CephTime::clock::to_time_t(tp);
+ const auto tm = *std::localtime(&tt);
+ char buff[64];
+ std::strftime(buff, sizeof(buff), "%Y-%m-%d %H:%M:%S", &tm);
+ lua_pushstring(L, buff);
+}
+
+static inline void pushstring(lua_State* L, std::string_view str)
+{
+ lua_pushlstring(L, str.data(), str.size());
+}
+
+static inline void unsetglobal(lua_State* L, const char* name)
+{
+ lua_pushnil(L);
+ lua_setglobal(L, name);
+}
+
+// dump the lua stack to stdout
+void stack_dump(lua_State* L);
+
+class lua_state_guard {
+ lua_State* l;
+public:
+ lua_state_guard(lua_State* _l) : l(_l) {
+ if (perfcounter) {
+ perfcounter->inc(l_rgw_lua_current_vms, 1);
+ }
+ }
+ ~lua_state_guard() {
+ lua_close(l);
+ if (perfcounter) {
+ perfcounter->dec(l_rgw_lua_current_vms, 1);
+ }
+ }
+ void reset(lua_State* _l=nullptr) {l = _l;}
+};
+
+constexpr const int MAX_LUA_VALUE_SIZE = 1000;
+constexpr const int MAX_LUA_KEY_ENTRIES = 100000;
+
+constexpr auto ONE_UPVAL = 1;
+constexpr auto TWO_UPVALS = 2;
+constexpr auto THREE_UPVALS = 3;
+constexpr auto FOUR_UPVALS = 4;
+constexpr auto FIVE_UPVALS = 5;
+
+constexpr auto FIRST_UPVAL = 1;
+constexpr auto SECOND_UPVAL = 2;
+constexpr auto THIRD_UPVAL = 3;
+constexpr auto FOURTH_UPVAL = 4;
+constexpr auto FIFTH_UPVAL = 5;
+
+constexpr auto NO_RETURNVAL = 0;
+constexpr auto ONE_RETURNVAL = 1;
+constexpr auto TWO_RETURNVALS = 2;
+constexpr auto THREE_RETURNVALS = 3;
+constexpr auto FOUR_RETURNVALS = 4;
+// utility functions to create a metatable
+// and tie it to an unnamed table
+//
+// add an __index method to it, to allow reading values
+// if "readonly" parameter is set to "false", it will also add
+// a __newindex method to it, to allow writing values
+// if the "toplevel" parameter is set to "true", it will name the
+// table as well as the metatable, this would allow direct access from
+// the lua script.
+//
+// The MetaTable is expected to be a class with the following members:
+// Name (static function returning the unique name of the metatable)
+// TableName (static function returning the unique name of the table - needed only for "toplevel" tables)
+// Type (typename) - the type of the "upvalue" (the type that the meta table represent)
+// IndexClosure (static function return "int" and accept "lua_State*")
+// NewIndexClosure (static function return "int" and accept "lua_State*")
+// e.g.
+// struct MyStructMetaTable {
+// static std::string TableName() {
+// return "MyStruct";
+// }
+//
+// using Type = MyStruct;
+//
+// static int IndexClosure(lua_State* L) {
+// const auto value = reinterpret_cast<const Type*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+// ...
+// }
+
+// static int NewIndexClosure(lua_State* L) {
+// auto value = reinterpret_cast<Type*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+// ...
+// }
+// };
+//
+
+template<typename MetaTable, typename... Upvalues>
+void create_metatable(lua_State* L, bool toplevel, Upvalues... upvalues)
+{
+ constexpr auto upvals_size = sizeof...(upvalues);
+ const std::array<void*, upvals_size> upvalue_arr = {upvalues...};
+ // create table
+ lua_newtable(L);
+ if (toplevel) {
+ // duplicate the table to make sure it remain in the stack
+ lua_pushvalue(L, -1);
+    // give table a name (in case of "toplevel")
+ lua_setglobal(L, MetaTable::TableName().c_str());
+ }
+ // create metatable
+ [[maybe_unused]] const auto rc = luaL_newmetatable(L, MetaTable::Name().c_str());
+ lua_pushliteral(L, "__index");
+ for (const auto upvalue : upvalue_arr) {
+ lua_pushlightuserdata(L, upvalue);
+ }
+ lua_pushcclosure(L, MetaTable::IndexClosure, upvals_size);
+ lua_rawset(L, -3);
+ lua_pushliteral(L, "__newindex");
+ for (const auto upvalue : upvalue_arr) {
+ lua_pushlightuserdata(L, upvalue);
+ }
+ lua_pushcclosure(L, MetaTable::NewIndexClosure, upvals_size);
+ lua_rawset(L, -3);
+ lua_pushliteral(L, "__pairs");
+ for (const auto upvalue : upvalue_arr) {
+ lua_pushlightuserdata(L, upvalue);
+ }
+ lua_pushcclosure(L, MetaTable::PairsClosure, upvals_size);
+ lua_rawset(L, -3);
+ lua_pushliteral(L, "__len");
+ for (const auto upvalue : upvalue_arr) {
+ lua_pushlightuserdata(L, upvalue);
+ }
+ lua_pushcclosure(L, MetaTable::LenClosure, upvals_size);
+ lua_rawset(L, -3);
+ // tie metatable and table
+ lua_setmetatable(L, -2);
+}
+
+template<typename MetaTable>
+void create_metatable(lua_State* L, bool toplevel, std::unique_ptr<typename MetaTable::Type>& ptr)
+{
+ if (ptr) {
+ create_metatable<MetaTable>(L, toplevel, reinterpret_cast<void*>(ptr.get()));
+ } else {
+ lua_pushnil(L);
+ }
+}
+
+// following struct may be used as a base class for other MetaTable classes
+// note, however, this is not mandatory to use it as a base
+struct EmptyMetaTable {
+  // by default everything is "readonly"
+ // to change, overload this function in the derived
+ static int NewIndexClosure(lua_State* L) {
+ return luaL_error(L, "trying to write to readonly field");
+ }
+
+ // by default nothing is iterable
+ // to change, overload this function in the derived
+ static int PairsClosure(lua_State* L) {
+ return luaL_error(L, "trying to iterate over non-iterable field");
+ }
+
+ // by default nothing is iterable
+ // to change, overload this function in the derived
+ static int LenClosure(lua_State* L) {
+ return luaL_error(L, "trying to get length of non-iterable field");
+ }
+
+ static int error_unknown_field(lua_State* L, const std::string& index, const std::string& table) {
+ return luaL_error(L, "unknown field name: %s provided to: %s",
+ index.c_str(), table.c_str());
+ }
+};
+
+// create a debug log action
+// it expects CephContext to be captured
+// it expects one string parameter, which is the message to log
+// could be executed from any context that has CephContext
+// e.g.
+// RGWDebugLog("hello world from lua")
+//
+void create_debug_action(lua_State* L, CephContext* cct);
+
+// set the packages search path according to:
+// package.path = "<install_dir>/share/lua/5.3/?.lua" (LuaRocks layout)
+// package.cpath= "<install_dir>/lib/lua/5.3/?.so"
+void set_package_path(lua_State* L, const std::string& install_dir);
+
+// open standard lua libs and remove the following functions:
+// os.exit()
+// load()
+// loadfile()
+// loadstring()
+// dofile()
+// and the "debug" library
+void open_standard_libs(lua_State* L);
+
+typedef int MetaTableClosure(lua_State* L);
+
+template<typename MapType=std::map<std::string, std::string>>
+int StringMapWriteableNewIndex(lua_State* L) {
+ const auto map = reinterpret_cast<MapType*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+
+ const char* index = luaL_checkstring(L, 2);
+
+ if (lua_isnil(L, 3) == 0) {
+ const char* value = luaL_checkstring(L, 3);
+ if (strnlen(value, MAX_LUA_VALUE_SIZE) + strnlen(index, MAX_LUA_VALUE_SIZE)
+ > MAX_LUA_VALUE_SIZE) {
+ return luaL_error(L, "Lua maximum size of entry limit exceeded");
+ } else if (map->size() > MAX_LUA_KEY_ENTRIES) {
+ return luaL_error(L, "Lua max number of entries limit exceeded");
+ } else {
+ map->insert_or_assign(index, value);
+ }
+ } else {
+ map->erase(std::string(index));
+ }
+
+ return NO_RETURNVAL;
+}
+
+template<typename MapType=std::map<std::string, std::string>,
+ MetaTableClosure NewIndex=EmptyMetaTable::NewIndexClosure>
+struct StringMapMetaTable : public EmptyMetaTable {
+
+ static std::string TableName() {return "StringMap";}
+ static std::string Name() {return TableName() + "Meta";}
+
+ static int IndexClosure(lua_State* L) {
+ const auto map = reinterpret_cast<MapType*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+
+ const char* index = luaL_checkstring(L, 2);
+
+ const auto it = map->find(std::string(index));
+ if (it == map->end()) {
+ lua_pushnil(L);
+ } else {
+ pushstring(L, it->second);
+ }
+ return ONE_RETURNVAL;
+ }
+
+ static int NewIndexClosure(lua_State* L) {
+ return NewIndex(L);
+ }
+
+ static int PairsClosure(lua_State* L) {
+ auto map = reinterpret_cast<MapType*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+ ceph_assert(map);
+ lua_pushlightuserdata(L, map);
+ lua_pushcclosure(L, stateless_iter, ONE_UPVAL); // push the stateless iterator function
+ lua_pushnil(L); // indicate this is the first call
+ // return stateless_iter, nil
+
+ return TWO_RETURNVALS;
+ }
+
+ static int stateless_iter(lua_State* L) {
+ // based on: http://lua-users.org/wiki/GeneralizedPairsAndIpairs
+ auto map = reinterpret_cast<MapType*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+ typename MapType::const_iterator next_it;
+ if (lua_isnil(L, -1)) {
+ next_it = map->begin();
+ } else {
+ const char* index = luaL_checkstring(L, 2);
+ const auto it = map->find(std::string(index));
+ ceph_assert(it != map->end());
+ next_it = std::next(it);
+ }
+
+ if (next_it == map->end()) {
+ // index of the last element was provided
+ lua_pushnil(L);
+ lua_pushnil(L);
+ // return nil, nil
+ } else {
+ pushstring(L, next_it->first);
+ pushstring(L, next_it->second);
+ // return key, value
+ }
+
+ return TWO_RETURNVALS;
+ }
+
+ static int LenClosure(lua_State* L) {
+ const auto map = reinterpret_cast<MapType*>(lua_touserdata(L, lua_upvalueindex(FIRST_UPVAL)));
+
+ lua_pushinteger(L, map->size());
+
+ return ONE_RETURNVAL;
+ }
+};
+
+} // namespace rgw::lua
+
diff --git a/src/rgw/rgw_lua_version.h b/src/rgw/rgw_lua_version.h
new file mode 100644
index 000000000..ff096334a
--- /dev/null
+++ b/src/rgw/rgw_lua_version.h
@@ -0,0 +1,11 @@
+#pragma once
+
+#include <lua.hpp>
+#include <string>
+
+namespace rgw::lua {
+
+const std::string CEPH_LUA_VERSION(LUA_VERSION_MAJOR "." LUA_VERSION_MINOR);
+
+}
+
diff --git a/src/rgw/rgw_main.cc b/src/rgw/rgw_main.cc
new file mode 100644
index 000000000..6d2630251
--- /dev/null
+++ b/src/rgw/rgw_main.cc
@@ -0,0 +1,188 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <boost/intrusive/list.hpp>
+#include "common/ceph_argparse.h"
+#include "global/global_init.h"
+#include "global/signal_handler.h"
+#include "common/config.h"
+#include "common/errno.h"
+#include "common/Timer.h"
+#include "common/TracepointProvider.h"
+#include "rgw_main.h"
+#include "rgw_signal.h"
+#include "rgw_common.h"
+#include "rgw_lib.h"
+#include "rgw_log.h"
+
+#ifdef HAVE_SYS_PRCTL_H
+#include <sys/prctl.h>
+#endif
+
+using namespace std;
+
+static constexpr auto dout_subsys = ceph_subsys_rgw;
+
+static sig_t sighandler_alrm;
+
// SIGALRM handler used to force the process down: terminate immediately.
// _exit() (not exit()) is used because it is async-signal-safe and skips
// atexit/static destructors, which must not run from a signal context.
static void godown_alarm(int signum)
{
  _exit(0);
}
+
// Timer callback armed before startup begins: if initialization has not
// completed (and cancelled the timer) within rgw_init_timeout seconds,
// log and abort the process.
class C_InitTimeout : public Context {
public:
  C_InitTimeout() {}
  void finish(int r) override {
    derr << "Initialization timeout, failed to initialize" << dendl;
    exit(1);
  }
};
+
// Print radosgw command-line help followed by the generic daemon options.
// Always returns 0 so callers may write `return usage();`.
static int usage()
{
  cout << "usage: radosgw [options...]" << std::endl;
  cout << "options:\n";
  cout << " --rgw-region=<region> region in which radosgw runs\n";
  cout << " --rgw-zone=<zone> zone in which radosgw runs\n";
  cout << " --rgw-socket-path=<path> specify a unix domain socket path\n";
  cout << " -m monaddress[:port] connect to specified monitor\n";
  cout << " --keyring=<path> path to radosgw keyring\n";
  cout << " --logfile=<logfile> file to log debug output\n";
  cout << " --debug-rgw=<log-level>/<memory-level> set radosgw debug level\n";
  generic_server_usage();

  return 0;
}
+
+/*
+ * start up the RADOS connection and then handle HTTP messages as they come in
+ */
int main(int argc, char *argv[])
{
  int r{0};

  // dout() messages will be sent to stderr, but FCGX wants messages on stdout
  // Redirect stderr to stdout.
  TEMP_FAILURE_RETRY(close(STDERR_FILENO));
  if (TEMP_FAILURE_RETRY(dup2(STDOUT_FILENO, STDERR_FILENO)) < 0) {
    int err = errno;
    cout << "failed to redirect stderr to stdout: " << cpp_strerror(err)
         << std::endl;
    return ENOSYS;
  }

  /* alternative default for module */
  map<std::string,std::string> defaults = {
    { "debug_rgw", "1/5" },
    { "keyring", "$rgw_data/keyring" },
    { "objecter_inflight_ops", "24576" },
    // require a secure mon connection by default
    { "ms_mon_client_mode", "secure" },
    { "auth_client_required", "cephx" }
  };

  auto args = argv_to_vec(argc, argv);
  if (args.empty()) {
    cerr << argv[0] << ": -h or --help for usage" << std::endl;
    exit(1);
  }
  if (ceph_argparse_need_usage(args)) {
    usage();
    exit(0);
  }

  int flags = CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS;
  // Prevent global_init() from dropping permissions until frontends can bind
  // privileged ports
  flags |= CINIT_FLAG_DEFER_DROP_PRIVILEGES;

  auto cct = rgw_global_init(&defaults, args, CEPH_ENTITY_TYPE_CLIENT,
                             CODE_ENVIRONMENT_DAEMON, flags);

  DoutPrefix dp(cct.get(), dout_subsys, "rgw main: ");
  rgw::AppMain main(&dp);

  // phase 1: parse frontend configs and pin NUMA before daemonizing
  main.init_frontends1(false /* nfs */);
  main.init_numa();

  if (g_conf()->daemonize) {
    global_init_daemonize(g_ceph_context);
  }
  // arm the init watchdog: C_InitTimeout aborts if startup stalls
  ceph::mutex mutex = ceph::make_mutex("main");
  SafeTimer init_timer(g_ceph_context, mutex);
  init_timer.init();
  mutex.lock();
  init_timer.add_event_after(g_conf()->rgw_init_timeout, new C_InitTimeout);
  mutex.unlock();

  common_init_finish(g_ceph_context);
  init_async_signal_handler();

  /* XXXX check locations thru sighandler_alrm */
  register_async_signal_handler(SIGHUP, rgw::signal::sighup_handler);
  r = rgw::signal::signal_fd_init();
  if (r < 0) {
    derr << "ERROR: unable to initialize signal fds" << dendl;
    exit(1);
  }

  register_async_signal_handler(SIGTERM, rgw::signal::handle_sigterm);
  register_async_signal_handler(SIGINT, rgw::signal::handle_sigterm);
  register_async_signal_handler(SIGUSR1, rgw::signal::handle_sigterm);
  sighandler_alrm = signal(SIGALRM, godown_alarm);

  main.init_perfcounters();
  main.init_http_clients();

  // phase 2: connect to the storage backend; fatal if unavailable
  main.init_storage();
  if (! main.get_driver()) {
    mutex.lock();
    init_timer.cancel_all_events();
    init_timer.shutdown();
    mutex.unlock();

    derr << "Couldn't init storage provider (RADOS)" << dendl;
    return EIO;
  }

  main.cond_init_apis();

  // storage is up: disarm the init watchdog
  mutex.lock();
  init_timer.cancel_all_events();
  init_timer.shutdown();
  mutex.unlock();

  // phase 3: remaining subsystems, then start serving
  main.init_ldap();
  main.init_opslog();
  main.init_tracepoints();
  main.init_lua();
  main.init_frontends2(nullptr /* RGWLib */);
  main.init_notification_endpoints();

#if defined(HAVE_SYS_PRCTL_H)
  if (prctl(PR_SET_DUMPABLE, 1) == -1) {
    cerr << "warning: unable to set dumpable flag: " << cpp_strerror(errno) << std::endl;
  }
#endif

  // block until a shutdown signal arrives
  rgw::signal::wait_shutdown();

  derr << "shutting down" << dendl;

  const auto finalize_async_signals = []() {
    unregister_async_signal_handler(SIGHUP, rgw::signal::sighup_handler);
    unregister_async_signal_handler(SIGTERM, rgw::signal::handle_sigterm);
    unregister_async_signal_handler(SIGINT, rgw::signal::handle_sigterm);
    unregister_async_signal_handler(SIGUSR1, rgw::signal::handle_sigterm);
    shutdown_async_signal_handler();
  };

  main.shutdown(finalize_async_signals);

  dout(1) << "final shutdown" << dendl;

  rgw::signal::signal_fd_finalize();

  return 0;
} /* main(int argc, char* argv[]) */
diff --git a/src/rgw/rgw_main.h b/src/rgw/rgw_main.h
new file mode 100644
index 000000000..bbe514351
--- /dev/null
+++ b/src/rgw/rgw_main.h
@@ -0,0 +1,134 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <vector>
+#include <map>
+#include <string>
+#include "rgw_common.h"
+#include "rgw_rest.h"
+#include "rgw_frontend.h"
+#include "rgw_period_pusher.h"
+#include "rgw_realm_reloader.h"
+#include "rgw_ldap.h"
+#include "rgw_lua.h"
+#include "rgw_dmclock_scheduler_ctx.h"
+#include "rgw_ratelimit.h"
+
+
+class RGWPauser : public RGWRealmReloader::Pauser {
+ std::vector<Pauser*> pausers;
+
+public:
+ ~RGWPauser() override = default;
+
+ void add_pauser(Pauser* pauser) {
+ pausers.push_back(pauser);
+ }
+
+ void pause() override {
+ std::for_each(pausers.begin(), pausers.end(), [](Pauser* p){p->pause();});
+ }
+ void resume(rgw::sal::Driver* driver) override {
+ std::for_each(pausers.begin(), pausers.end(), [driver](Pauser* p){p->resume(driver);});
+ }
+
+};
+
+namespace rgw {
+
+namespace lua { class Background; }
+
+class RGWLib;
+class AppMain {
+ /* several components should be initalized only if librgw is
+ * also serving HTTP */
+ bool have_http_frontend{false};
+ bool nfs{false};
+
+ std::vector<RGWFrontend*> fes;
+ std::vector<RGWFrontendConfig*> fe_configs;
+ std::multimap<string, RGWFrontendConfig*> fe_map;
+ std::unique_ptr<rgw::LDAPHelper> ldh;
+ OpsLogSink* olog;
+ RGWREST rest;
+ std::unique_ptr<rgw::lua::Background> lua_background;
+ std::unique_ptr<rgw::auth::ImplicitTenants> implicit_tenant_context;
+ std::unique_ptr<rgw::dmclock::SchedulerCtx> sched_ctx;
+ std::unique_ptr<ActiveRateLimiter> ratelimiter;
+ std::map<std::string, std::string> service_map_meta;
+ // wow, realm reloader has a lot of parts
+ std::unique_ptr<RGWRealmReloader> reloader;
+ std::unique_ptr<RGWPeriodPusher> pusher;
+ std::unique_ptr<RGWFrontendPauser> fe_pauser;
+ std::unique_ptr<RGWRealmWatcher> realm_watcher;
+ std::unique_ptr<RGWPauser> rgw_pauser;
+ DoutPrefixProvider* dpp;
+ RGWProcessEnv env;
+
+public:
+ AppMain(DoutPrefixProvider* dpp)
+ : dpp(dpp)
+ {}
+
+ void shutdown(std::function<void(void)> finalize_async_signals
+ = []() { /* nada */});
+
+ rgw::sal::Driver* get_driver() {
+ return env.driver;
+ }
+
+ rgw::LDAPHelper* get_ldh() {
+ return ldh.get();
+ }
+
+ void init_frontends1(bool nfs = false);
+ void init_numa();
+ void init_storage();
+ void init_perfcounters();
+ void init_http_clients();
+ void cond_init_apis();
+ void init_ldap();
+ void init_opslog();
+ int init_frontends2(RGWLib* rgwlib = nullptr);
+ void init_tracepoints();
+ void init_notification_endpoints();
+ void init_lua();
+
+ bool have_http() {
+ return have_http_frontend;
+ }
+
+ static OpsLogFile* ops_log_file;
+}; /* AppMain */
+} // namespace rgw
+
// Enable ops-log recording on a REST manager and hand it back, allowing
// use inline in manager-registration expressions.
static inline RGWRESTMgr *set_logging(RGWRESTMgr* mgr)
{
  mgr->set_logging(true);
  return mgr;
}
+
+static inline RGWRESTMgr *rest_filter(rgw::sal::Driver* driver, int dialect, RGWRESTMgr* orig)
+{
+ RGWSyncModuleInstanceRef sync_module = driver->get_sync_module();
+ if (sync_module) {
+ return sync_module->get_rest_filter(dialect, orig);
+ } else {
+ return orig;
+ }
+}
+
diff --git a/src/rgw/rgw_mdlog.h b/src/rgw/rgw_mdlog.h
new file mode 100644
index 000000000..179cc2aca
--- /dev/null
+++ b/src/rgw/rgw_mdlog.h
@@ -0,0 +1,185 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#pragma once
+
+#include "common/RWLock.h"
+
+#include "rgw_metadata.h"
+#include "rgw_mdlog_types.h"
+
+#include "services/svc_rados.h"
+
+#define META_LOG_OBJ_PREFIX "meta.log."
+
// Per-shard status of the metadata log as reported by get_info():
// the current position marker and the time of the last update.
struct RGWMetadataLogInfo {
  std::string marker;
  real_time last_update;

  void dump(Formatter *f) const;
  void decode_json(JSONObj *obj);
};
+
+class RGWCompletionManager;
+
// Ref-counted completion for an async get_info() on a mdlog shard.
// The callback may be cancelled concurrently with the aio completing;
// `mutex` serializes cancel() against finish() so a cancelled callback
// is never invoked.
class RGWMetadataLogInfoCompletion : public RefCountedObject {
 public:
  using info_callback_t = std::function<void(int, const cls_log_header&)>;
 private:
  cls_log_header header;        // filled in by the cls op before finish()
  RGWSI_RADOS::Obj io_obj;
  librados::AioCompletion *completion;
  std::mutex mutex; //< protects callback between cancel/complete
  boost::optional<info_callback_t> callback; //< cleared on cancel
 public:
  explicit RGWMetadataLogInfoCompletion(info_callback_t callback);
  ~RGWMetadataLogInfoCompletion() override;

  RGWSI_RADOS::Obj& get_io_obj() { return io_obj; }
  cls_log_header& get_header() { return header; }
  librados::AioCompletion* get_completion() { return completion; }

  // rados completion callback: deliver (rc, header) unless cancelled
  void finish(librados::completion_t cb) {
    std::lock_guard<std::mutex> lock(mutex);
    if (callback) {
      (*callback)(completion->get_return_value(), header);
    }
  }
  // drop the callback; a later finish() becomes a no-op
  void cancel() {
    std::lock_guard<std::mutex> lock(mutex);
    callback = boost::none;
  }
};
+
// The metadata changes log for one period, sharded across RADOS objects
// named "<prefix><shard-id>".  Entries record metadata mutations for
// multisite sync; shards can be listed, trimmed and lease-locked.
class RGWMetadataLog {
  CephContext *cct;
  const std::string prefix;  // object-name prefix derived from the period id

  struct Svc {
    RGWSI_Zone *zone{nullptr};
    RGWSI_Cls *cls{nullptr};
  } svc;

  // "meta.log." for the default (empty) period, else "meta.log.<period>."
  static std::string make_prefix(const std::string& period) {
    if (period.empty())
      return META_LOG_OBJ_PREFIX;
    return META_LOG_OBJ_PREFIX + period + ".";
  }

  RWLock lock;  // guards modified_shards
  std::set<int> modified_shards;

  // note that a shard received a new entry (consumed by read_clear_modified)
  void mark_modified(int shard_id);
public:
  RGWMetadataLog(CephContext *_cct,
                 RGWSI_Zone *_zone_svc,
                 RGWSI_Cls *_cls_svc,
                 const std::string& period)
    : cct(_cct),
      prefix(make_prefix(period)),
      lock("RGWMetaLog::lock") {
    svc.zone = _zone_svc;
    svc.cls = _cls_svc;
  }


  // compose the RADOS object name for a shard id
  void get_shard_oid(int id, std::string& oid) const {
    char buf[16];
    snprintf(buf, sizeof(buf), "%d", id);
    oid = prefix + buf;
  }

  int add_entry(const DoutPrefixProvider *dpp, const std::string& hash_key, const std::string& section, const std::string& key, bufferlist& bl);
  int get_shard_id(const std::string& hash_key, int *shard_id);
  int store_entries_in_shard(const DoutPrefixProvider *dpp, std::list<cls_log_entry>& entries, int shard_id, librados::AioCompletion *completion);

  // opaque cursor for incremental listing of one shard within a time range
  struct LogListCtx {
    int cur_shard;
    std::string marker;
    real_time from_time;
    real_time end_time;

    std::string cur_oid;

    bool done;

    LogListCtx() : cur_shard(0), done(false) {}
  };

  void init_list_entries(int shard_id, const real_time& from_time,
                         const real_time& end_time, const std::string& marker,
                         void **handle);
  void complete_list_entries(void *handle);
  int list_entries(const DoutPrefixProvider *dpp,
                   void *handle,
                   int max_entries,
                   std::list<cls_log_entry>& entries,
                   std::string *out_marker,
                   bool *truncated);

  int trim(const DoutPrefixProvider *dpp, int shard_id, const real_time& from_time, const real_time& end_time, const std::string& start_marker, const std::string& end_marker);
  int get_info(const DoutPrefixProvider *dpp, int shard_id, RGWMetadataLogInfo *info);
  int get_info_async(const DoutPrefixProvider *dpp, int shard_id, RGWMetadataLogInfoCompletion *completion);
  // cooperative shard leases used by sync to serialize trimming
  int lock_exclusive(const DoutPrefixProvider *dpp, int shard_id, timespan duration, std::string&zone_id, std::string& owner_id);
  int unlock(const DoutPrefixProvider *dpp, int shard_id, std::string& zone_id, std::string& owner_id);

  int update_shards(std::list<int>& shards);

  // fetch-and-clear the set of shards modified since the last call
  void read_clear_modified(std::set<int> &modified);
};
+
// Formatter helper: wraps an RGWMDLogStatus so encode_json() emits its
// string form (see LogStatusDump::dump in rgw_metadata.cc).
struct LogStatusDump {
  RGWMDLogStatus status;

  explicit LogStatusDump(RGWMDLogStatus _status) : status(_status) {}
  void dump(Formatter *f) const;
};
+
// Payload of a metadata log entry: the object versions observed before
// and after a mutation, plus the operation's status.
struct RGWMetadataLogData {
  obj_version read_version;   // version read before applying the operation
  obj_version write_version;  // version written by the operation
  RGWMDLogStatus status;

  RGWMetadataLogData() : status(MDLOG_STATUS_UNKNOWN) {}

  void encode(bufferlist& bl) const;
  void decode(bufferlist::const_iterator& bl);
  void dump(Formatter *f) const;
  void decode_json(JSONObj *obj);
};
WRITE_CLASS_ENCODER(RGWMetadataLogData)
+WRITE_CLASS_ENCODER(RGWMetadataLogData)
+
// Persistent record of the oldest period whose metadata log is still
// retained; stored under a fixed oid so trimming knows where history begins.
struct RGWMetadataLogHistory {
  epoch_t oldest_realm_epoch;
  std::string oldest_period_id;

  void encode(bufferlist& bl) const {
    ENCODE_START(1, 1, bl);
    encode(oldest_realm_epoch, bl);
    encode(oldest_period_id, bl);
    ENCODE_FINISH(bl);
  }
  void decode(bufferlist::const_iterator& p) {
    DECODE_START(1, p);
    decode(oldest_realm_epoch, p);
    decode(oldest_period_id, p);
    DECODE_FINISH(p);
  }

  static const std::string oid;
};
WRITE_CLASS_ENCODER(RGWMetadataLogHistory)
+WRITE_CLASS_ENCODER(RGWMetadataLogHistory)
+
diff --git a/src/rgw/rgw_mdlog_types.h b/src/rgw/rgw_mdlog_types.h
new file mode 100644
index 000000000..1862974d8
--- /dev/null
+++ b/src/rgw/rgw_mdlog_types.h
@@ -0,0 +1,35 @@
+
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#pragma once
+
// Policy for applying a metadata entry received via sync
// (consumed by the check_versions() logic of the metadata handlers).
enum RGWMDLogSyncType {
  APPLY_ALWAYS,     // apply unconditionally
  APPLY_UPDATES,    // apply only on top of an existing entry
  APPLY_NEWER,      // apply only if incoming version/mtime is newer
  APPLY_EXCLUSIVE   // apply only if no entry exists yet
};
+
// Lifecycle state recorded with each metadata log entry
// (serialized as a string by LogStatusDump / decode_json_obj).
enum RGWMDLogStatus {
  MDLOG_STATUS_UNKNOWN,
  MDLOG_STATUS_WRITE,      // mutation in progress: write
  MDLOG_STATUS_SETATTRS,   // mutation in progress: set attrs
  MDLOG_STATUS_REMOVE,     // mutation in progress: removal
  MDLOG_STATUS_COMPLETE,   // mutation finished successfully
  MDLOG_STATUS_ABORT,      // mutation was aborted
};
+
diff --git a/src/rgw/rgw_meta_sync_status.h b/src/rgw/rgw_meta_sync_status.h
new file mode 100644
index 000000000..f8a2ae3ee
--- /dev/null
+++ b/src/rgw/rgw_meta_sync_status.h
@@ -0,0 +1,121 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <string>
+
+#include "common/ceph_time.h"
+
// Global state of metadata sync from the master zone: overall phase plus
// the period/epoch being synced.  Wire format is versioned; v2 added the
// period fields.
struct rgw_meta_sync_info {
  enum SyncState {
    StateInit = 0,
    StateBuildingFullSyncMaps = 1,
    StateSync = 2,
  };

  uint16_t state;       // one of SyncState
  uint32_t num_shards;
  std::string period; //< period id of current metadata log
  epoch_t realm_epoch = 0; //< realm epoch of period

  void encode(bufferlist& bl) const {
    ENCODE_START(2, 1, bl);
    encode(state, bl);
    encode(num_shards, bl);
    encode(period, bl);
    encode(realm_epoch, bl);
    ENCODE_FINISH(bl);
  }

  void decode(bufferlist::const_iterator& bl) {
    // compat version is 1, so older decoders remain able to read v2 data;
    // struct_v still reflects the stored version for the v2 check below
    DECODE_START(1, bl);
    decode(state, bl);
    decode(num_shards, bl);
    if (struct_v >= 2) {
      decode(period, bl);
      decode(realm_epoch, bl);
    }
    DECODE_FINISH(bl);
  }

  void decode_json(JSONObj *obj);
  void dump(Formatter *f) const;
  static void generate_test_instances(std::list<rgw_meta_sync_info*>& ls);

  rgw_meta_sync_info() : state((int)StateInit), num_shards(0) {}
};
WRITE_CLASS_ENCODER(rgw_meta_sync_info)
+WRITE_CLASS_ENCODER(rgw_meta_sync_info)
+
// Per-shard progress marker for metadata sync: the current phase (full vs
// incremental), the resume position, and counters for progress reporting.
struct rgw_meta_sync_marker {
  enum SyncState {
    FullSync = 0,
    IncrementalSync = 1,
  };
  uint16_t state;                // one of SyncState
  std::string marker;            // position within the current phase
  std::string next_step_marker;  // where incremental sync resumes after full sync
  uint64_t total_entries;
  uint64_t pos;
  real_time timestamp;
  epoch_t realm_epoch{0}; //< realm_epoch of period marker

  rgw_meta_sync_marker() : state(FullSync), total_entries(0), pos(0) {}

  void encode(bufferlist& bl) const {
    ENCODE_START(2, 1, bl);
    encode(state, bl);
    encode(marker, bl);
    encode(next_step_marker, bl);
    encode(total_entries, bl);
    encode(pos, bl);
    encode(timestamp, bl);
    encode(realm_epoch, bl);
    ENCODE_FINISH(bl);
  }

  void decode(bufferlist::const_iterator& bl) {
    DECODE_START(2, bl);
    decode(state, bl);
    decode(marker, bl);
    decode(next_step_marker, bl);
    decode(total_entries, bl);
    decode(pos, bl);
    decode(timestamp, bl);
    if (struct_v >= 2) {
      // realm_epoch was appended in v2
      decode(realm_epoch, bl);
    }
    DECODE_FINISH(bl);
  }

  void decode_json(JSONObj *obj);
  void dump(Formatter *f) const;
  static void generate_test_instances(std::list<rgw_meta_sync_marker*>& ls);
};
WRITE_CLASS_ENCODER(rgw_meta_sync_marker)
+WRITE_CLASS_ENCODER(rgw_meta_sync_marker)
+
// Complete metadata sync status: the global info plus one marker per shard.
struct rgw_meta_sync_status {
  rgw_meta_sync_info sync_info;
  std::map<uint32_t, rgw_meta_sync_marker> sync_markers;  // keyed by shard id

  rgw_meta_sync_status() {}

  void encode(bufferlist& bl) const {
    ENCODE_START(1, 1, bl);
    encode(sync_info, bl);
    encode(sync_markers, bl);
    ENCODE_FINISH(bl);
  }

  void decode(bufferlist::const_iterator& bl) {
    DECODE_START(1, bl);
    decode(sync_info, bl);
    decode(sync_markers, bl);
    DECODE_FINISH(bl);
  }

  void dump(Formatter *f) const;
  void decode_json(JSONObj *obj);
  static void generate_test_instances(std::list<rgw_meta_sync_status*>& ls);
};
WRITE_CLASS_ENCODER(rgw_meta_sync_status)
+WRITE_CLASS_ENCODER(rgw_meta_sync_status)
diff --git a/src/rgw/rgw_metadata.cc b/src/rgw/rgw_metadata.cc
new file mode 100644
index 000000000..7fd25ae75
--- /dev/null
+++ b/src/rgw/rgw_metadata.cc
@@ -0,0 +1,683 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_metadata.h"
+
+#include "rgw_mdlog.h"
+
+
+#include "services/svc_meta.h"
+#include "services/svc_meta_be_sobj.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+void LogStatusDump::dump(Formatter *f) const {
+ string s;
+ switch (status) {
+ case MDLOG_STATUS_WRITE:
+ s = "write";
+ break;
+ case MDLOG_STATUS_SETATTRS:
+ s = "set_attrs";
+ break;
+ case MDLOG_STATUS_REMOVE:
+ s = "remove";
+ break;
+ case MDLOG_STATUS_COMPLETE:
+ s = "complete";
+ break;
+ case MDLOG_STATUS_ABORT:
+ s = "abort";
+ break;
+ default:
+ s = "unknown";
+ break;
+ }
+ encode_json("status", s, f);
+}
+
// JSON-encode an obj_version as {"tag": ..., "ver": ...} under `name`.
void encode_json(const char *name, const obj_version& v, Formatter *f)
{
  f->open_object_section(name);
  f->dump_string("tag", v.tag);
  f->dump_unsigned("ver", v.ver);
  f->close_section();
}
+
// Inverse of encode_json() above: read "tag"/"ver" into an obj_version.
void decode_json_obj(obj_version& v, JSONObj *obj)
{
  JSONDecoder::decode_json("tag", v.tag, obj);
  JSONDecoder::decode_json("ver", v.ver, obj);
}
+
// Wire-encode; the status enum is widened to uint32_t for a stable format.
void RGWMetadataLogData::encode(bufferlist& bl) const {
  ENCODE_START(1, 1, bl);
  encode(read_version, bl);
  encode(write_version, bl);
  uint32_t s = (uint32_t)status;
  encode(s, bl);
  ENCODE_FINISH(bl);
}
+
// Wire-decode; mirrors encode() above (status travels as uint32_t).
void RGWMetadataLogData::decode(bufferlist::const_iterator& bl) {
  DECODE_START(1, bl);
  decode(read_version, bl);
  decode(write_version, bl);
  uint32_t s;
  decode(s, bl);
  status = (RGWMDLogStatus)s;
  DECODE_FINISH(bl);
}
+
// JSON representation used by `radosgw-admin mdlog list` etc.
void RGWMetadataLogData::dump(Formatter *f) const {
  encode_json("read_version", read_version, f);
  encode_json("write_version", write_version, f);
  encode_json("status", LogStatusDump(status), f);
}
+
+void decode_json_obj(RGWMDLogStatus& status, JSONObj *obj) {
+ string s;
+ JSONDecoder::decode_json("status", s, obj);
+ if (s == "complete") {
+ status = MDLOG_STATUS_COMPLETE;
+ } else if (s == "write") {
+ status = MDLOG_STATUS_WRITE;
+ } else if (s == "remove") {
+ status = MDLOG_STATUS_REMOVE;
+ } else if (s == "set_attrs") {
+ status = MDLOG_STATUS_SETATTRS;
+ } else if (s == "abort") {
+ status = MDLOG_STATUS_ABORT;
+ } else {
+ status = MDLOG_STATUS_UNKNOWN;
+ }
+}
+
// Inverse of dump(): parse the JSON form produced above.
void RGWMetadataLogData::decode_json(JSONObj *obj) {
  JSONDecoder::decode_json("read_version", read_version, obj);
  JSONDecoder::decode_json("write_version", write_version, obj);
  JSONDecoder::decode_json("status", status, obj);
}
+
// Capture all context for a metadata put operation; members are borrowed
// references owned by the caller for the duration of do_put_operate().
RGWMetadataHandler_GenericMetaBE::Put::Put(RGWMetadataHandler_GenericMetaBE *_handler,
                                           RGWSI_MetaBackend_Handler::Op *_op,
                                           string& _entry, RGWMetadataObject *_obj,
                                           RGWObjVersionTracker& _objv_tracker,
                                           optional_yield _y,
                                           RGWMDLogSyncType _type, bool _from_remote_zone):
  handler(_handler), op(_op),
  entry(_entry), obj(_obj),
  objv_tracker(_objv_tracker),
  apply_type(_type),
  y(_y),
  from_remote_zone(_from_remote_zone)
{
}
+
+int RGWMetadataHandler_GenericMetaBE::do_put_operate(Put *put_op, const DoutPrefixProvider *dpp)
+{
+ int r = put_op->put_pre(dpp);
+ if (r != 0) { /* r can also be STATUS_NO_APPLY */
+ return r;
+ }
+
+ r = put_op->put(dpp);
+ if (r != 0) {
+ return r;
+ }
+
+ r = put_op->put_post(dpp);
+ if (r != 0) { /* e.g., -error or STATUS_APPLIED */
+ return r;
+ }
+
+ return 0;
+}
+
// Fetch a metadata object by key, dispatching through the backend handler.
int RGWMetadataHandler_GenericMetaBE::get(string& entry, RGWMetadataObject **obj, optional_yield y, const DoutPrefixProvider *dpp)
{
  return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) {
    return do_get(op, entry, obj, y, dpp);
  });
}
+
// Store a metadata object, dispatching through the backend handler;
// `type`/`from_remote_zone` drive the sync apply-policy checks in do_put().
int RGWMetadataHandler_GenericMetaBE::put(string& entry, RGWMetadataObject *obj, RGWObjVersionTracker& objv_tracker,
                                          optional_yield y, const DoutPrefixProvider *dpp, RGWMDLogSyncType type, bool from_remote_zone)
{
  return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) {
    return do_put(op, entry, obj, objv_tracker, y, dpp, type, from_remote_zone);
  });
}
+
// Remove a metadata entry, dispatching through the backend handler.
int RGWMetadataHandler_GenericMetaBE::remove(string& entry, RGWObjVersionTracker& objv_tracker, optional_yield y, const DoutPrefixProvider *dpp)
{
  return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) {
    return do_remove(op, entry, objv_tracker, y, dpp);
  });
}
+
// Run `f` under the backend's mutate protocol, which brackets the change
// with mdlog entries of the given op_type.
int RGWMetadataHandler_GenericMetaBE::mutate(const string& entry,
                                             const ceph::real_time& mtime,
                                             RGWObjVersionTracker *objv_tracker,
                                             optional_yield y,
                                             const DoutPrefixProvider *dpp,
                                             RGWMDLogStatus op_type,
                                             std::function<int()> f)
{
  return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) {
    RGWSI_MetaBackend::MutateParams params(mtime, op_type);
    return op->mutate(entry,
                      params,
                      objv_tracker,
                      y,
                      f,
                      dpp);
  });
}
+
// Map an entry key to its mdlog shard id via the backend.
int RGWMetadataHandler_GenericMetaBE::get_shard_id(const string& entry, int *shard_id)
{
  return be_handler->call([&](RGWSI_MetaBackend_Handler::Op *op) {
    return op->get_shard_id(entry, shard_id);
  });
}
+
// Open a key listing starting at `marker`.  On success *phandle receives
// an owning pointer to the managed-context op; it is reclaimed (and freed)
// by list_keys_complete().
int RGWMetadataHandler_GenericMetaBE::list_keys_init(const DoutPrefixProvider *dpp, const string& marker, void **phandle)
{
  auto op = std::make_unique<RGWSI_MetaBackend_Handler::Op_ManagedCtx>(be_handler);

  int ret = op->list_init(dpp, marker);
  if (ret < 0) {
    return ret;
  }

  // ownership transfers to the caller through the opaque handle
  *phandle = (void *)op.release();

  return 0;
}
+
+int RGWMetadataHandler_GenericMetaBE::list_keys_next(const DoutPrefixProvider *dpp, void *handle, int max, list<string>& keys, bool *truncated)
+{
+ auto op = static_cast<RGWSI_MetaBackend_Handler::Op_ManagedCtx *>(handle);
+
+ int ret = op->list_next(dpp, max, &keys, truncated);
+ if (ret < 0 && ret != -ENOENT) {
+ return ret;
+ }
+ if (ret == -ENOENT) {
+ if (truncated) {
+ *truncated = false;
+ }
+ return 0;
+ }
+
+ return 0;
+}
+
// Release the listing op allocated by list_keys_init().
void RGWMetadataHandler_GenericMetaBE::list_keys_complete(void *handle)
{
  auto op = static_cast<RGWSI_MetaBackend_Handler::Op_ManagedCtx *>(handle);
  delete op;
}
+
// Return the current listing position; on backend failure the error is
// logged and an empty marker is returned (callers have no error channel).
string RGWMetadataHandler_GenericMetaBE::get_marker(void *handle)
{
  auto op = static_cast<RGWSI_MetaBackend_Handler::Op_ManagedCtx *>(handle);
  string marker;
  int r = op->list_get_marker(&marker);
  if (r < 0) {
    ldout(cct, 0) << "ERROR: " << __func__ << "(): list_get_marker() returned: r=" << r << dendl;
    /* not much else to do */
  }

  return marker;
}
+
// Thin constructor: all state is forwarded to the generic Put base.
RGWMetadataHandlerPut_SObj::RGWMetadataHandlerPut_SObj(RGWMetadataHandler_GenericMetaBE *handler,
                                                       RGWSI_MetaBackend_Handler::Op *op,
                                                       string& entry, RGWMetadataObject *obj, RGWObjVersionTracker& objv_tracker,
                                                       optional_yield y,
                                                       RGWMDLogSyncType type, bool from_remote_zone) : Put(handler, op, entry, obj, objv_tracker, y, type, from_remote_zone) {
}
+
// Pre-put phase: read the existing object (if any) and decide, per the
// sync apply-policy, whether this put should proceed at all.
int RGWMetadataHandlerPut_SObj::put_pre(const DoutPrefixProvider *dpp)
{
  int ret = get(&old_obj, dpp);
  if (ret < 0 && ret != -ENOENT) {
    return ret;
  }
  exists = (ret != -ENOENT);

  // oo takes ownership of old_obj so it is freed when the Put finishes
  oo.reset(old_obj);

  auto old_ver = (!old_obj ? obj_version() : old_obj->get_version());
  auto old_mtime = (!old_obj ? ceph::real_time() : old_obj->get_mtime());

  // are we actually going to perform this put, or is it too old?
  if (!handler->check_versions(exists, old_ver, old_mtime,
                               objv_tracker.write_version, obj->get_mtime(),
                               apply_type)) {
    return STATUS_NO_APPLY;
  }

  objv_tracker.read_version = old_ver; /* maintain the obj version we just read */

  return 0;
}
+
+int RGWMetadataHandlerPut_SObj::put(const DoutPrefixProvider *dpp)
+{
+ int ret = put_check(dpp);
+ if (ret != 0) {
+ return ret;
+ }
+
+ return put_checked(dpp);
+}
+
// Serialize the metadata object and write it through the backend op,
// carrying the object's attrs and mtime.
int RGWMetadataHandlerPut_SObj::put_checked(const DoutPrefixProvider *dpp)
{
  RGWSI_MBSObj_PutParams params(obj->get_pattrs(), obj->get_mtime());

  // handler-specific encoding of the object into the payload buffer
  encode_obj(&params.bl);

  int ret = op->put(entry, params, &objv_tracker, y, dpp);
  if (ret < 0) {
    return ret;
  }

  return 0;
}
+
// Handler bound to the empty ("") section: it cannot get/put/remove
// entries, but lists the registered section names so `metadata list`
// with no section enumerates the sections themselves.
class RGWMetadataTopHandler : public RGWMetadataHandler {
  // listing state: a sorted snapshot of section names plus a cursor
  struct iter_data {
    set<string> sections;
    set<string>::iterator iter;
  };

  struct Svc {
    RGWSI_Meta *meta{nullptr};
  } svc;

  RGWMetadataManager *mgr;

public:
  RGWMetadataTopHandler(RGWSI_Meta *meta_svc,
                        RGWMetadataManager *_mgr) : mgr(_mgr) {
    base_init(meta_svc->ctx());
    svc.meta = meta_svc;
  }

  // the top handler owns the empty section name
  string get_type() override { return string(); }

  RGWMetadataObject *get_meta_obj(JSONObj *jo, const obj_version& objv, const ceph::real_time& mtime) {
    return new RGWMetadataObject;
  }

  // entry-level operations are meaningless at the top level
  int get(string& entry, RGWMetadataObject **obj, optional_yield y, const DoutPrefixProvider *dpp) override {
    return -ENOTSUP;
  }

  int put(string& entry, RGWMetadataObject *obj, RGWObjVersionTracker& objv_tracker,
          optional_yield y, const DoutPrefixProvider *dpp, RGWMDLogSyncType type, bool from_remote_zone) override {
    return -ENOTSUP;
  }

  int remove(string& entry, RGWObjVersionTracker& objv_tracker, optional_yield y, const DoutPrefixProvider *dpp) override {
    return -ENOTSUP;
  }

  int mutate(const string& entry,
             const ceph::real_time& mtime,
             RGWObjVersionTracker *objv_tracker,
             optional_yield y,
             const DoutPrefixProvider *dpp,
             RGWMDLogStatus op_type,
             std::function<int()> f) {
    return -ENOTSUP;
  }

  // snapshot the registered sections and position the cursor at `marker`
  int list_keys_init(const DoutPrefixProvider *dpp, const string& marker, void **phandle) override {
    iter_data *data = new iter_data;
    list<string> sections;
    mgr->get_sections(sections);
    for (auto& s : sections) {
      data->sections.insert(s);
    }
    data->iter = data->sections.lower_bound(marker);

    *phandle = data;

    return 0;
  }
  int list_keys_next(const DoutPrefixProvider *dpp, void *handle, int max, list<string>& keys, bool *truncated) override {
    iter_data *data = static_cast<iter_data *>(handle);
    for (int i = 0; i < max && data->iter != data->sections.end(); ++i, ++(data->iter)) {
      keys.push_back(*data->iter);
    }

    *truncated = (data->iter != data->sections.end());

    return 0;
  }
  void list_keys_complete(void *handle) override {
    iter_data *data = static_cast<iter_data *>(handle);

    delete data;
  }

  // current cursor position, or "" once the listing is exhausted
  virtual string get_marker(void *handle) override {
    iter_data *data = static_cast<iter_data *>(handle);

    if (data->iter != data->sections.end()) {
      return *(data->iter);
    }

    return string();
  }
};
+
+RGWMetadataHandlerPut_SObj::~RGWMetadataHandlerPut_SObj() {}
+
// Register this handler with the manager under its section type.
int RGWMetadataHandler::attach(RGWMetadataManager *manager)
{
  return manager->register_handler(this);
}
+
+RGWMetadataHandler::~RGWMetadataHandler() {}
+
// Mutable access to the object's version (callers may update it in place).
obj_version& RGWMetadataObject::get_version()
{
  return objv;
}
+
// The manager always owns a top-level handler for the empty section,
// which lists the registered sections themselves.
RGWMetadataManager::RGWMetadataManager(RGWSI_Meta *_meta_svc)
  : cct(_meta_svc->ctx()), meta_svc(_meta_svc)
{
  md_top_handler.reset(new RGWMetadataTopHandler(meta_svc, this));
}
+
+RGWMetadataManager::~RGWMetadataManager()
+{
+}
+
+int RGWMetadataManager::register_handler(RGWMetadataHandler *handler)
+{
+ string type = handler->get_type();
+
+ if (handlers.find(type) != handlers.end())
+ return -EEXIST;
+
+ handlers[type] = handler;
+
+ return 0;
+}
+
+RGWMetadataHandler *RGWMetadataManager::get_handler(const string& type)
+{
+ map<string, RGWMetadataHandler *>::iterator iter = handlers.find(type);
+ if (iter == handlers.end())
+ return NULL;
+
+ return iter->second;
+}
+
+void RGWMetadataManager::parse_metadata_key(const string& metadata_key, string& type, string& entry)
+{
+ auto pos = metadata_key.find(':');
+ if (pos == string::npos) {
+ type = metadata_key;
+ } else {
+ type = metadata_key.substr(0, pos);
+ entry = metadata_key.substr(pos + 1);
+ }
+}
+
// Resolve a "section:entry" key to its handler; an empty section yields
// the top-level handler, an unknown section -ENOENT.
int RGWMetadataManager::find_handler(const string& metadata_key, RGWMetadataHandler **handler, string& entry)
{
  string type;

  parse_metadata_key(metadata_key, type, entry);

  if (type.empty()) {
    *handler = md_top_handler.get();
    return 0;
  }

  map<string, RGWMetadataHandler *>::iterator iter = handlers.find(type);
  if (iter == handlers.end())
    return -ENOENT;

  *handler = iter->second;

  return 0;

}
+
// Fetch the entry named by `metadata_key` and dump it as JSON:
// {key, ver, [mtime], data}.  The mtime field is omitted when zero.
int RGWMetadataManager::get(string& metadata_key, Formatter *f, optional_yield y, const DoutPrefixProvider *dpp)
{
  RGWMetadataHandler *handler;
  string entry;
  int ret = find_handler(metadata_key, &handler, entry);
  if (ret < 0) {
    return ret;
  }

  RGWMetadataObject *obj;

  ret = handler->get(entry, &obj, y, dpp);
  if (ret < 0) {
    return ret;
  }

  f->open_object_section("metadata_info");
  encode_json("key", metadata_key, f);
  encode_json("ver", obj->get_version(), f);
  real_time mtime = obj->get_mtime();
  if (!real_clock::is_zero(mtime)) {
    utime_t ut(mtime);
    encode_json("mtime", ut, f);
  }
  encode_json("data", *obj, f);
  f->close_section();

  delete obj;

  return 0;
}
+
// Store an entry from its JSON representation (the format produced by
// get() above).  sync_type governs version/mtime conflict resolution;
// on return *existing_version (if given) holds the version that was read.
int RGWMetadataManager::put(string& metadata_key, bufferlist& bl,
                            optional_yield y,
                            const DoutPrefixProvider *dpp,
                            RGWMDLogSyncType sync_type,
                            bool from_remote_zone,
                            obj_version *existing_version)
{
  RGWMetadataHandler *handler;
  string entry;

  int ret = find_handler(metadata_key, &handler, entry);
  if (ret < 0) {
    return ret;
  }

  JSONParser parser;
  if (!parser.parse(bl.c_str(), bl.length())) {
    return -EINVAL;
  }

  RGWObjVersionTracker objv_tracker;

  obj_version *objv = &objv_tracker.write_version;

  utime_t mtime;

  // the envelope fields; note this overwrites metadata_key with the
  // "key" field from the payload
  try {
    JSONDecoder::decode_json("key", metadata_key, &parser);
    JSONDecoder::decode_json("ver", *objv, &parser);
    JSONDecoder::decode_json("mtime", mtime, &parser);
  } catch (JSONDecoder::err& e) {
    return -EINVAL;
  }

  JSONObj *jo = parser.find_obj("data");
  if (!jo) {
    return -EINVAL;
  }
  // handler-specific decode of the "data" section
  RGWMetadataObject *obj = handler->get_meta_obj(jo, *objv, mtime.to_real_time());
  if (!obj) {
    return -EINVAL;
  }

  ret = handler->put(entry, obj, objv_tracker, y, dpp, sync_type, from_remote_zone);
  if (existing_version) {
    *existing_version = objv_tracker.read_version;
  }

  delete obj;

  return ret;
}
+
// Remove an entry.  The current object is fetched first so the removal
// can be conditioned on the version just read (read-check-remove).
int RGWMetadataManager::remove(string& metadata_key, optional_yield y, const DoutPrefixProvider *dpp)
{
  RGWMetadataHandler *handler;
  string entry;

  int ret = find_handler(metadata_key, &handler, entry);
  if (ret < 0) {
    return ret;
  }

  RGWMetadataObject *obj;
  ret = handler->get(entry, &obj, y, dpp);
  if (ret < 0) {
    return ret;
  }
  RGWObjVersionTracker objv_tracker;
  objv_tracker.read_version = obj->get_version();
  delete obj;

  return handler->remove(entry, objv_tracker, y, dpp);
}
+
// Run `f` under the owning handler's mutate protocol for this key.
int RGWMetadataManager::mutate(const string& metadata_key,
                               const ceph::real_time& mtime,
                               RGWObjVersionTracker *objv_tracker,
                               optional_yield y,
                               const DoutPrefixProvider *dpp,
                               RGWMDLogStatus op_type,
                               std::function<int()> f)
{
  RGWMetadataHandler *handler;
  string entry;

  int ret = find_handler(metadata_key, &handler, entry);
  if (ret < 0) {
    return ret;
  }

  return handler->mutate(entry, mtime, objv_tracker, y, dpp, op_type, f);
}
+
// Map (section, entry) to its mdlog shard id; -EINVAL for unknown sections.
int RGWMetadataManager::get_shard_id(const string& section, const string& entry, int *shard_id)
{
  RGWMetadataHandler *handler = get_handler(section);
  if (!handler) {
    return -EINVAL;
  }

  return handler->get_shard_id(entry, shard_id);
}
+
// Pairs a handler's opaque listing handle with the handler that owns it,
// so the manager-level list_keys_* calls can be forwarded.
struct list_keys_handle {
  void *handle;
  RGWMetadataHandler *handler;
};
+
// Convenience overload: start listing a section from the beginning.
int RGWMetadataManager::list_keys_init(const DoutPrefixProvider *dpp, const string& section, void **handle)
{
  return list_keys_init(dpp, section, string(), handle);
}
+
+int RGWMetadataManager::list_keys_init(const DoutPrefixProvider *dpp, const string& section,
+ const string& marker, void **handle)
+{
+ string entry;
+ RGWMetadataHandler *handler;
+
+ int ret;
+
+ ret = find_handler(section, &handler, entry);
+ if (ret < 0) {
+ return -ENOENT;
+ }
+
+ list_keys_handle *h = new list_keys_handle;
+ h->handler = handler;
+ ret = handler->list_keys_init(dpp, marker, &h->handle);
+ if (ret < 0) {
+ delete h;
+ return ret;
+ }
+
+ *handle = (void *)h;
+
+ return 0;
+}
+
+int RGWMetadataManager::list_keys_next(const DoutPrefixProvider *dpp, void *handle, int max, list<string>& keys, bool *truncated)
+{
+ list_keys_handle *h = static_cast<list_keys_handle *>(handle);
+
+ RGWMetadataHandler *handler = h->handler;
+
+ return handler->list_keys_next(dpp, h->handle, max, keys, truncated);
+}
+
+void RGWMetadataManager::list_keys_complete(void *handle)
+{
+ list_keys_handle *h = static_cast<list_keys_handle *>(handle);
+
+ RGWMetadataHandler *handler = h->handler;
+
+ handler->list_keys_complete(h->handle);
+ delete h;
+}
+
+string RGWMetadataManager::get_marker(void *handle)
+{
+ list_keys_handle *h = static_cast<list_keys_handle *>(handle);
+
+ return h->handler->get_marker(h->handle);
+}
+
// Dump one mdlog entry as JSON.  The entry payload is expected to be an
// encoded RGWMetadataLogData; a decode failure is logged and the "data"
// field simply omitted rather than failing the whole dump.
void RGWMetadataManager::dump_log_entry(cls_log_entry& entry, Formatter *f)
{
  f->open_object_section("entry");
  f->dump_string("id", entry.id);
  f->dump_string("section", entry.section);
  f->dump_string("name", entry.name);
  entry.timestamp.gmtime_nsec(f->dump_stream("timestamp"));

  try {
    RGWMetadataLogData log_data;
    auto iter = entry.data.cbegin();
    decode(log_data, iter);

    encode_json("data", log_data, f);
  } catch (buffer::error& err) {
    lderr(cct) << "failed to decode log entry: " << entry.section << ":" << entry.name<< " ts=" << entry.timestamp << dendl;
  }
  f->close_section();
}
+
+void RGWMetadataManager::get_sections(list<string>& sections)
+{
+ for (map<string, RGWMetadataHandler *>::iterator iter = handlers.begin(); iter != handlers.end(); ++iter) {
+ sections.push_back(iter->first);
+ }
+}
+
diff --git a/src/rgw/rgw_multi.cc b/src/rgw/rgw_multi.cc
new file mode 100644
index 000000000..6e090d6b5
--- /dev/null
+++ b/src/rgw/rgw_multi.cc
@@ -0,0 +1,103 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <string.h>
+
+#include <iostream>
+#include <map>
+
+#include "include/types.h"
+
+#include "rgw_xml.h"
+#include "rgw_multi.h"
+#include "rgw_op.h"
+#include "rgw_sal.h"
+#include "rgw_sal_rados.h"
+
+#include "services/svc_sys_obj.h"
+#include "services/svc_tier_rados.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+bool RGWMultiPart::xml_end(const char *el)
+{
+  // A <Part> element is only valid when it carries both a PartNumber and
+  // an ETag child; reject it otherwise.
+  auto *num_obj = static_cast<RGWMultiPartNumber *>(find_first("PartNumber"));
+  auto *etag_obj = static_cast<RGWMultiETag *>(find_first("ETag"));
+  if (num_obj == nullptr || etag_obj == nullptr) {
+    return false;
+  }
+
+  const string num_str = num_obj->get_data();
+  if (num_str.empty()) {
+    return false;
+  }
+
+  // NOTE: atoi() semantics kept intentionally — malformed digits yield 0
+  num = atoi(num_str.c_str());
+  etag = etag_obj->get_data();
+
+  return true;
+}
+
+bool RGWMultiCompleteUpload::xml_end(const char *el) {
+  // Collect every <Part> child into the part-number -> etag map; a later
+  // duplicate part number overwrites an earlier one.
+  XMLObjIter iter = find("Part");
+  for (XMLObj *o = iter.get_next(); o != nullptr; o = iter.get_next()) {
+    auto *part = static_cast<RGWMultiPart *>(o);
+    parts[part->get_num()] = part->get_etag();
+  }
+  return true;
+}
+
+// Out-of-line destructor definition (declared in rgw_multi.h); keeps the
+// class's key function in this translation unit.
+RGWMultiXMLParser::~RGWMultiXMLParser() {}
+
+XMLObj *RGWMultiXMLParser::alloc_obj(const char *el) {
+  // Factory hook: instantiate the XMLObj subclass matching the element
+  // name; unknown elements get a null object (generic base handling).
+  // "CompletedMultipartUpload" is incorrect per S3, but some versions of
+  // some client libraries send it anyway — accept it (see PR #41700).
+  if (strcmp(el, "CompleteMultipartUpload") == 0 ||
+      strcmp(el, "CompletedMultipartUpload") == 0 ||
+      strcmp(el, "MultipartUpload") == 0) {
+    return new RGWMultiCompleteUpload();
+  }
+  if (strcmp(el, "Part") == 0) {
+    return new RGWMultiPart();
+  }
+  if (strcmp(el, "PartNumber") == 0) {
+    return new RGWMultiPartNumber();
+  }
+  if (strcmp(el, "ETag") == 0) {
+    return new RGWMultiETag();
+  }
+  return nullptr;
+}
+
+bool is_v2_upload_id(const string& upload_id)
+{
+  // v2 upload ids start with one of the known prefixes ("2~" current,
+  // "2/" legacy); compare only the prefix portion.
+  constexpr size_t prefix_len = sizeof(MULTIPART_UPLOAD_ID_PREFIX) - 1;
+  constexpr size_t legacy_len = sizeof(MULTIPART_UPLOAD_ID_PREFIX_LEGACY) - 1;
+  return upload_id.compare(0, prefix_len, MULTIPART_UPLOAD_ID_PREFIX) == 0 ||
+         upload_id.compare(0, legacy_len, MULTIPART_UPLOAD_ID_PREFIX_LEGACY) == 0;
+}
+
+void RGWUploadPartInfo::generate_test_instances(list<RGWUploadPartInfo*>& o)
+{
+  // Provide one populated instance plus one default-constructed instance
+  // for the encode/decode test harness.
+  auto *populated = new RGWUploadPartInfo;
+  populated->num = 1;
+  populated->size = 10 * 1024 * 1024;
+  populated->etag = "etag";
+  o.push_back(populated);
+  o.push_back(new RGWUploadPartInfo);
+}
+
+// JSON-dump this part's metadata; the modification time is converted to
+// utime_t so it serializes in the conventional ceph time format.
+void RGWUploadPartInfo::dump(Formatter *f) const
+{
+  encode_json("num", num, f);
+  encode_json("size", size, f);
+  encode_json("etag", etag, f);
+  utime_t ut(modified);
+  encode_json("modified", ut, f);
+  encode_json("past_prefixes", past_prefixes, f);
+}
+
diff --git a/src/rgw/rgw_multi.h b/src/rgw/rgw_multi.h
new file mode 100644
index 000000000..f57c90e74
--- /dev/null
+++ b/src/rgw/rgw_multi.h
@@ -0,0 +1,62 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <map>
+#include "rgw_xml.h"
+#include "rgw_obj_types.h"
+#include "rgw_obj_manifest.h"
+#include "rgw_compression_types.h"
+#include "common/dout.h"
+#include "rgw_sal_fwd.h"
+
+#define MULTIPART_UPLOAD_ID_PREFIX_LEGACY "2/"
+#define MULTIPART_UPLOAD_ID_PREFIX "2~" // must contain a unique char that may not come up in gen_rand_alpha()
+
+// Parsed representation of an S3 CompleteMultipartUpload request body.
+class RGWMultiCompleteUpload : public XMLObj
+{
+public:
+  RGWMultiCompleteUpload() {}
+  ~RGWMultiCompleteUpload() override {}
+  bool xml_end(const char *el) override;
+
+  // part number -> ETag, populated by xml_end() from the <Part> children
+  std::map<int, std::string> parts;
+};
+
+// One <Part> element of a CompleteMultipartUpload document.
+class RGWMultiPart : public XMLObj
+{
+  std::string etag;  // value of the <ETag> child
+  int num;           // value of the <PartNumber> child (0 until parsed)
+public:
+  RGWMultiPart() : num(0) {}
+  ~RGWMultiPart() override {}
+  bool xml_end(const char *el) override;
+
+  std::string& get_etag() { return etag; }
+  int get_num() { return num; }
+};
+
+// Leaf element holder for <PartNumber>; text is read via XMLObj::get_data().
+class RGWMultiPartNumber : public XMLObj
+{
+public:
+  RGWMultiPartNumber() {}
+  ~RGWMultiPartNumber() override {}
+};
+
+// Leaf element holder for <ETag>; text is read via XMLObj::get_data().
+class RGWMultiETag : public XMLObj
+{
+public:
+  RGWMultiETag() {}
+  ~RGWMultiETag() override {}
+};
+
+// XML parser specialization for multipart-upload documents; creates the
+// RGWMulti* objects via alloc_obj().
+class RGWMultiXMLParser : public RGWXMLParser
+{
+  // factory hook: return the XMLObj subclass for a recognized element name
+  XMLObj *alloc_obj(const char *el) override;
+public:
+  RGWMultiXMLParser() {}
+  // defined out of line in rgw_multi.cc; 'virtual' is redundant alongside
+  // 'override' and has been dropped
+  ~RGWMultiXMLParser() override;
+};
+
+extern bool is_v2_upload_id(const std::string& upload_id);
diff --git a/src/rgw/rgw_multi_del.cc b/src/rgw/rgw_multi_del.cc
new file mode 100644
index 000000000..443ffd60a
--- /dev/null
+++ b/src/rgw/rgw_multi_del.cc
@@ -0,0 +1,73 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <string.h>
+
+#include <iostream>
+
+#include "include/types.h"
+
+#include "rgw_xml.h"
+#include "rgw_multi_del.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+bool RGWMultiDelObject::xml_end(const char *el)
+{
+  // An <Object> entry must carry a non-empty <Key>; <VersionId> is optional.
+  auto *key_obj = static_cast<RGWMultiDelKey *>(find_first("Key"));
+  auto *vid = static_cast<RGWMultiDelVersionId *>(find_first("VersionId"));
+
+  if (key_obj == nullptr) {
+    return false;
+  }
+
+  const string parsed_key = key_obj->get_data();
+  if (parsed_key.empty()) {
+    return false;
+  }
+  key = parsed_key;
+
+  if (vid != nullptr) {
+    version_id = vid->get_data();
+  }
+
+  return true;
+}
+
+bool RGWMultiDelDelete::xml_end(const char *el) {
+  // Optional <Quiet> flag: any case-insensitive "true" enables quiet mode.
+  auto *quiet_set = static_cast<RGWMultiDelQuiet *>(find_first("Quiet"));
+  if (quiet_set != nullptr) {
+    const string quiet_val = quiet_set->get_data();
+    quiet = (strcasecmp(quiet_val.c_str(), "true") == 0);
+  }
+
+  // Gather every <Object> entry as a (key, version-id) pair.
+  XMLObjIter iter = find("Object");
+  for (XMLObj *o = iter.get_next(); o != nullptr; o = iter.get_next()) {
+    auto *object = static_cast<RGWMultiDelObject *>(o);
+    objects.emplace_back(object->get_key(), object->get_version_id());
+  }
+  return true;
+}
+
+XMLObj *RGWMultiDelXMLParser::alloc_obj(const char *el) {
+  // Instantiate the matching XMLObj subclass for the elements of an S3
+  // multi-object Delete document; unknown elements yield nullptr.
+  if (strcmp(el, "Delete") == 0) {
+    return new RGWMultiDelDelete();
+  }
+  if (strcmp(el, "Quiet") == 0) {
+    return new RGWMultiDelQuiet();
+  }
+  if (strcmp(el, "Object") == 0) {
+    return new RGWMultiDelObject();
+  }
+  if (strcmp(el, "Key") == 0) {
+    return new RGWMultiDelKey();
+  }
+  if (strcmp(el, "VersionId") == 0) {
+    return new RGWMultiDelVersionId();
+  }
+  return nullptr;
+}
+
diff --git a/src/rgw/rgw_multi_del.h b/src/rgw/rgw_multi_del.h
new file mode 100644
index 000000000..b060decf4
--- /dev/null
+++ b/src/rgw/rgw_multi_del.h
@@ -0,0 +1,62 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <vector>
+#include "rgw_xml.h"
+#include "rgw_common.h"
+
+// Parsed S3 multi-object Delete request: the object list plus Quiet flag.
+class RGWMultiDelDelete : public XMLObj
+{
+public:
+  RGWMultiDelDelete() :quiet(false) {}
+  ~RGWMultiDelDelete() override {}
+  bool xml_end(const char *el) override;
+
+  // keys (with optional version ids) collected from the <Object> children
+  std::vector<rgw_obj_key> objects;
+  bool quiet;
+  bool is_quiet() { return quiet; }
+};
+
+// Leaf element holder for <Quiet>; text is read via XMLObj::get_data().
+class RGWMultiDelQuiet : public XMLObj
+{
+public:
+  RGWMultiDelQuiet() {}
+  ~RGWMultiDelQuiet() override {}
+};
+
+// One <Object> entry of a Delete request: a key plus optional version id.
+class RGWMultiDelObject : public XMLObj
+{
+  std::string key;         // required <Key> child
+  std::string version_id;  // optional <VersionId> child (empty if absent)
+public:
+  RGWMultiDelObject() {}
+  ~RGWMultiDelObject() override {}
+  bool xml_end(const char *el) override;
+
+  const std::string& get_key() { return key; }
+  const std::string& get_version_id() { return version_id; }
+};
+
+// Leaf element holder for <Key>; text is read via XMLObj::get_data().
+class RGWMultiDelKey : public XMLObj
+{
+public:
+  RGWMultiDelKey() {}
+  ~RGWMultiDelKey() override {}
+};
+
+// Leaf element holder for <VersionId>; text is read via XMLObj::get_data().
+class RGWMultiDelVersionId : public XMLObj
+{
+public:
+  RGWMultiDelVersionId() {}
+  ~RGWMultiDelVersionId() override {}
+};
+
+// XML parser specialization for multi-object Delete documents; creates
+// the RGWMultiDel* objects via alloc_obj().
+class RGWMultiDelXMLParser : public RGWXMLParser
+{
+  // factory hook: return the XMLObj subclass for a recognized element name
+  XMLObj *alloc_obj(const char *el) override;
+public:
+  RGWMultiDelXMLParser() {}
+  ~RGWMultiDelXMLParser() override {}
+};
diff --git a/src/rgw/rgw_multiparser.cc b/src/rgw/rgw_multiparser.cc
new file mode 100644
index 000000000..a8778abd9
--- /dev/null
+++ b/src/rgw/rgw_multiparser.cc
@@ -0,0 +1,47 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <string.h>
+
+#include <iostream>
+#include <map>
+
+#include "include/types.h"
+
+#include "rgw_multi.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+// Standalone test driver: feed stdin through RGWMultiXMLParser.
+// Fixes: fread()'s size_t result is no longer narrowed into an int, and a
+// parse failure now produces a nonzero exit status (previously the tool
+// always exited 0, so scripts could not detect bad input).
+int main(int argc, char **argv) {
+  RGWMultiXMLParser parser;
+
+  if (!parser.init())
+    exit(1);
+
+  char buf[1024];
+  bool parse_ok = true;
+
+  for (;;) {
+    size_t len = fread(buf, 1, sizeof(buf), stdin);
+    if (ferror(stdin)) {
+      fprintf(stderr, "Read error\n");
+      exit(-1);
+    }
+    int done = feof(stdin);
+
+    if (!parser.parse(buf, len, done)) {
+      cerr << "failed to parse!" << std::endl;
+      parse_ok = false;  // keep reading, but remember the failure
+    }
+
+    if (done)
+      break;
+  }
+
+  exit(parse_ok ? 0 : 1);
+}
+
diff --git a/src/rgw/rgw_multipart_meta_filter.cc b/src/rgw/rgw_multipart_meta_filter.cc
new file mode 100644
index 000000000..c616cd480
--- /dev/null
+++ b/src/rgw/rgw_multipart_meta_filter.cc
@@ -0,0 +1,32 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "svc_tier_rados.h"
+
+using namespace std;
+
+const std::string MP_META_SUFFIX = ".meta";
+
+// Recognize multipart-upload meta objects and extract the listing key.
+// Presumably names look like "<key>.<upload_id>.meta" — the code strips
+// the ".meta" suffix and then everything after the last remaining '.'
+// (TODO confirm against the code that generates these names).
+bool MultipartMetaFilter::filter(const string& name, string& key) {
+  // the length of the suffix so we can skip past it
+  static const size_t MP_META_SUFFIX_LEN = MP_META_SUFFIX.length();
+
+  size_t len = name.size();
+
+  // make sure there's room for suffix plus at least one more
+  // character
+  if (len <= MP_META_SUFFIX_LEN)
+    return false;
+
+  // suffix must appear exactly at the end (find anchored at len - suffix)
+  size_t pos = name.find(MP_META_SUFFIX, len - MP_META_SUFFIX_LEN);
+  if (pos == string::npos)
+    return false;
+
+  // back up to the '.' that separates the key from the upload id
+  pos = name.rfind('.', pos - 1);
+  if (pos == string::npos)
+    return false;
+
+  key = name.substr(0, pos);
+
+  return true;
+}
diff --git a/src/rgw/rgw_notify_event_type.cc b/src/rgw/rgw_notify_event_type.cc
new file mode 100644
index 000000000..7a0ef9568
--- /dev/null
+++ b/src/rgw/rgw_notify_event_type.cc
@@ -0,0 +1,119 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "rgw_notify_event_type.h"
+#include "include/str_list.h"
+
+namespace rgw::notify {
+
+  // Render an EventType as its S3 event string. Wildcard values map to
+  // the "...:*" form. NOTE(review): the short "AbortMPU" spelling emitted
+  // here differs from the long "AbortMultipartUpload" spelling accepted by
+  // from_string() — confirm round-trip expectations.
+  std::string to_string(EventType t) {
+    switch (t) {
+      case ObjectCreated:
+        return "s3:ObjectCreated:*";
+      case ObjectCreatedPut:
+        return "s3:ObjectCreated:Put";
+      case ObjectCreatedPost:
+        return "s3:ObjectCreated:Post";
+      case ObjectCreatedCopy:
+        return "s3:ObjectCreated:Copy";
+      case ObjectCreatedCompleteMultipartUpload:
+        return "s3:ObjectCreated:CompleteMultipartUpload";
+      case ObjectRemoved:
+        return "s3:ObjectRemoved:*";
+      case ObjectRemovedDelete:
+        return "s3:ObjectRemoved:Delete";
+      case ObjectRemovedDeleteMarkerCreated:
+        return "s3:ObjectRemoved:DeleteMarkerCreated";
+      case ObjectLifecycle:
+        return "s3:ObjectLifecycle:*";
+      case ObjectExpiration:
+        return "s3:ObjectLifecycle:Expiration:*";
+      case ObjectExpirationCurrent:
+        return "s3:ObjectLifecycle:Expiration:Current";
+      case ObjectExpirationNoncurrent:
+        return "s3:ObjectLifecycle:Expiration:Noncurrent";
+      case ObjectExpirationDeleteMarker:
+        return "s3:ObjectLifecycle:Expiration:DeleteMarker";
+      case ObjectExpirationAbortMPU:
+        return "s3:ObjectLifecycle:Expiration:AbortMPU";
+      case ObjectTransition:
+        return "s3:ObjectLifecycle:Transition:*";
+      case ObjectTransitionCurrent:
+        return "s3:ObjectLifecycle:Transition:Current";
+      case ObjectTransitionNoncurrent:
+        return "s3:ObjectLifecycle:Transition:Noncurrent";
+      case ObjectSynced:
+        return "s3:ObjectSynced:*";
+      case ObjectSyncedCreate:
+        return "s3:ObjectSynced:Create";
+      case ObjectSyncedDelete:
+        return "s3:ObjectSynced:Delete";
+      case ObjectSyncedDeletionMarkerCreated:
+        return "s3:ObjectSynced:DeletionMarkerCreated";
+      case UnknownEvent:
+        return "s3:UnknownEvent";
+    }
+    // unreachable for valid enum values; combined masks fall through here
+    return "s3:UnknownEvent";
+  }
+
+  // Same as to_string() but without the leading "s3:" prefix.
+  std::string to_event_string(EventType t) {
+    return to_string(t).substr(3);
+  }
+
+  // Map an S3 event-type string to its EventType value; unrecognized
+  // strings yield UnknownEvent. Fix: both spellings of the lifecycle
+  // abort-multipart event are accepted, so that strings produced by
+  // to_string() (which emits "AbortMPU") round-trip correctly; previously
+  // only "AbortMultipartUpload" was recognized.
+  EventType from_string(const std::string& s) {
+    if (s == "s3:ObjectCreated:*")
+      return ObjectCreated;
+    if (s == "s3:ObjectCreated:Put")
+      return ObjectCreatedPut;
+    if (s == "s3:ObjectCreated:Post")
+      return ObjectCreatedPost;
+    if (s == "s3:ObjectCreated:Copy")
+      return ObjectCreatedCopy;
+    if (s == "s3:ObjectCreated:CompleteMultipartUpload")
+      return ObjectCreatedCompleteMultipartUpload;
+    if (s == "s3:ObjectRemoved:*")
+      return ObjectRemoved;
+    if (s == "s3:ObjectRemoved:Delete")
+      return ObjectRemovedDelete;
+    if (s == "s3:ObjectRemoved:DeleteMarkerCreated")
+      return ObjectRemovedDeleteMarkerCreated;
+    if (s == "s3:ObjectLifecycle:*")
+      return ObjectLifecycle;
+    if (s == "s3:ObjectLifecycle:Expiration:*")
+      return ObjectExpiration;
+    if (s == "s3:ObjectLifecycle:Expiration:Current")
+      return ObjectExpirationCurrent;
+    if (s == "s3:ObjectLifecycle:Expiration:Noncurrent")
+      return ObjectExpirationNoncurrent;
+    if (s == "s3:ObjectLifecycle:Expiration:DeleteMarker")
+      return ObjectExpirationDeleteMarker;
+    if (s == "s3:ObjectLifecycle:Expiration:AbortMultipartUpload" ||
+        s == "s3:ObjectLifecycle:Expiration:AbortMPU")
+      return ObjectExpirationAbortMPU;
+    if (s == "s3:ObjectLifecycle:Transition:*")
+      return ObjectTransition;
+    if (s == "s3:ObjectLifecycle:Transition:Current")
+      return ObjectTransitionCurrent;
+    if (s == "s3:ObjectLifecycle:Transition:Noncurrent")
+      return ObjectTransitionNoncurrent;
+    if (s == "s3:ObjectSynced:*")
+      return ObjectSynced;
+    if (s == "s3:ObjectSynced:Create")
+      return ObjectSyncedCreate;
+    if (s == "s3:ObjectSynced:Delete")
+      return ObjectSyncedDelete;
+    if (s == "s3:ObjectSynced:DeletionMarkerCreated")
+      return ObjectSyncedDeletionMarkerCreated;
+    return UnknownEvent;
+  }
+
+// Event types "match" when their bit masks intersect; this lets a wildcard
+// value such as ObjectCreated (0xF) compare equal to any of its sub-events.
+bool operator==(EventType lhs, EventType rhs) {
+  return (lhs & rhs) != 0;
+}
+
+// Parse a comma-separated list of event-type strings into event_list,
+// replacing its previous contents. Unrecognized tokens become
+// UnknownEvent entries (from_string never fails).
+void from_string_list(const std::string& string_list, EventTypeList& event_list) {
+  event_list.clear();
+  ceph::for_each_substr(string_list, ",", [&event_list] (auto token) {
+    event_list.push_back(rgw::notify::from_string(std::string(token.begin(), token.end())));
+  });
+}
+}
diff --git a/src/rgw/rgw_notify_event_type.h b/src/rgw/rgw_notify_event_type.h
new file mode 100644
index 000000000..4fe1b5c90
--- /dev/null
+++ b/src/rgw/rgw_notify_event_type.h
@@ -0,0 +1,49 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+#include <string>
+#include <vector>
+
+namespace rgw::notify {
+  // Bitmask scheme: each wildcard value (e.g. ObjectCreated = 0xF) is the
+  // OR of its specific sub-events, so matching (see operator== below) is
+  // a simple bit intersection.
+  enum EventType {
+    ObjectCreated = 0xF,
+    ObjectCreatedPut = 0x1,
+    ObjectCreatedPost = 0x2,
+    ObjectCreatedCopy = 0x4,
+    ObjectCreatedCompleteMultipartUpload = 0x8,
+    ObjectRemoved = 0xF0,
+    ObjectRemovedDelete = 0x10,
+    ObjectRemovedDeleteMarkerCreated = 0x20,
+    // lifecycle events (RGW extension)
+    ObjectLifecycle = 0xFF00,
+    ObjectExpiration = 0xF00,
+    ObjectExpirationCurrent = 0x100,
+    ObjectExpirationNoncurrent = 0x200,
+    ObjectExpirationDeleteMarker = 0x400,
+    ObjectExpirationAbortMPU = 0x800,
+    ObjectTransition = 0xF000,
+    ObjectTransitionCurrent = 0x1000,
+    ObjectTransitionNoncurrent = 0x2000,
+    ObjectSynced = 0xF0000,
+    ObjectSyncedCreate = 0x10000,
+    ObjectSyncedDelete = 0x20000,
+    ObjectSyncedDeletionMarkerCreated = 0x40000,
+    UnknownEvent = 0x100000
+  };
+
+  using EventTypeList = std::vector<EventType>;
+
+  // two event types are considered equal if their bits intersect
+  bool operator==(EventType lhs, EventType rhs);
+
+  // render as the S3 event string, e.g. "s3:ObjectCreated:Put"
+  std::string to_string(EventType t);
+
+  // like to_string() but without the leading "s3:" prefix
+  std::string to_event_string(EventType t);
+
+  // parse a single event-type string; unknown input yields UnknownEvent
+  EventType from_string(const std::string& s);
+
+  // create a vector of event types from comma separated list of event types
+  void from_string_list(const std::string& string_list, EventTypeList& event_list);
+}
+
diff --git a/src/rgw/rgw_obj_manifest.cc b/src/rgw/rgw_obj_manifest.cc
new file mode 100644
index 000000000..1d1c3b5cf
--- /dev/null
+++ b/src/rgw/rgw_obj_manifest.cc
@@ -0,0 +1,260 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_obj_manifest.h"
+
+#include "rgw_rados.h" // RGW_OBJ_NS_SHADOW and RGW_OBJ_NS_MULTIPART
+
+using namespace std;
+
+// Advance the iterator to the next stripe of the object. Two layouts are
+// handled: an explicit object list, and the rule-based implicit layout
+// (head + fixed-size parts, each split into stripes).
+void RGWObjManifest::obj_iterator::operator++()
+{
+  if (manifest->explicit_objs) {
+    // explicit layout: just step to the next listed object
+    ++explicit_iter;
+
+    if (explicit_iter == manifest->objs.end()) {
+      // end position: ofs == obj_size, zero-length stripe
+      ofs = manifest->obj_size;
+      stripe_size = 0;
+      return;
+    }
+
+    update_explicit_pos();
+
+    update_location();
+    return;
+  }
+
+  uint64_t obj_size = manifest->get_obj_size();
+  uint64_t head_size = manifest->get_head_size();
+
+  if (ofs == obj_size) {
+    // already at end; ++ is a no-op
+    return;
+  }
+
+  if (manifest->rules.empty()) {
+    return;
+  }
+
+  /* are we still pointing at the head? */
+  if (ofs < head_size) {
+    // leaving the head: the first tail stripe starts right after it
+    rule_iter = manifest->rules.begin();
+    const RGWObjManifestRule *rule = &rule_iter->second;
+    ofs = std::min(head_size, obj_size);
+    stripe_ofs = ofs;
+    cur_stripe = 1;
+    stripe_size = std::min(obj_size - ofs, rule->stripe_max_size);
+    if (rule->part_size > 0) {
+      stripe_size = std::min(stripe_size, rule->part_size);
+    }
+    update_location();
+    return;
+  }
+
+  const RGWObjManifestRule *rule = &rule_iter->second;
+
+  stripe_ofs += rule->stripe_max_size;
+  cur_stripe++;
+  ldpp_dout(dpp, 20) << "RGWObjManifest::operator++(): rule->part_size=" << rule->part_size << " rules.size()=" << manifest->rules.size() << dendl;
+
+  if (rule->part_size > 0) {
+    /* multi part, multi stripes object */
+
+    ldpp_dout(dpp, 20) << "RGWObjManifest::operator++(): stripe_ofs=" << stripe_ofs << " part_ofs=" << part_ofs << " rule->part_size=" << rule->part_size << dendl;
+
+    if (stripe_ofs >= part_ofs + rule->part_size) {
+      /* moved to the next part */
+      cur_stripe = 0;
+      part_ofs += rule->part_size;
+      stripe_ofs = part_ofs;
+
+      bool last_rule = (next_rule_iter == manifest->rules.end());
+      /* move to the next rule? */
+      if (!last_rule && stripe_ofs >= next_rule_iter->second.start_ofs) {
+        rule_iter = next_rule_iter;
+        last_rule = (next_rule_iter == manifest->rules.end());
+        if (!last_rule) {
+          ++next_rule_iter;
+        }
+        cur_part_id = rule_iter->second.start_part_num;
+      } else {
+        cur_part_id++;
+      }
+
+      rule = &rule_iter->second;
+    }
+
+    // a stripe never extends past the end of its part
+    stripe_size = std::min(rule->part_size - (stripe_ofs - part_ofs), rule->stripe_max_size);
+  }
+
+  cur_override_prefix = rule->override_prefix;
+
+  ofs = stripe_ofs;
+  if (ofs > obj_size) {
+    // clamp the final stripe at the object's end
+    ofs = obj_size;
+    stripe_ofs = ofs;
+    stripe_size = 0;
+  }
+
+  ldpp_dout(dpp, 20) << "RGWObjManifest::operator++(): result: ofs=" << ofs << " stripe_ofs=" << stripe_ofs << " part_ofs=" << part_ofs << " rule->part_size=" << rule->part_size << dendl;
+  update_location();
+}
+
+// Position the iterator at absolute offset 'o', computing the containing
+// part, stripe and location for either the explicit or rule-based layout.
+void RGWObjManifest::obj_iterator::seek(uint64_t o)
+{
+  ofs = o;
+  if (manifest->explicit_objs) {
+    // find the last explicit entry whose offset is <= ofs
+    explicit_iter = manifest->objs.upper_bound(ofs);
+    if (explicit_iter != manifest->objs.begin()) {
+      --explicit_iter;
+    }
+    if (ofs < manifest->obj_size) {
+      update_explicit_pos();
+    } else {
+      // seeking at/past the end clamps to the end position
+      ofs = manifest->obj_size;
+    }
+    update_location();
+    return;
+  }
+  if (o < manifest->get_head_size()) {
+    // offset falls inside the head object
+    rule_iter = manifest->rules.begin();
+    stripe_ofs = 0;
+    stripe_size = manifest->get_head_size();
+    if (rule_iter != manifest->rules.end()) {
+      cur_part_id = rule_iter->second.start_part_num;
+      cur_override_prefix = rule_iter->second.override_prefix;
+    }
+    update_location();
+    return;
+  }
+
+  // find the rule governing this offset (last rule with start_ofs <= ofs)
+  rule_iter = manifest->rules.upper_bound(ofs);
+  next_rule_iter = rule_iter;
+  if (rule_iter != manifest->rules.begin()) {
+    --rule_iter;
+  }
+
+  if (rule_iter == manifest->rules.end()) {
+    update_location();
+    return;
+  }
+
+  const RGWObjManifestRule& rule = rule_iter->second;
+
+  if (rule.part_size > 0) {
+    cur_part_id = rule.start_part_num + (ofs - rule.start_ofs) / rule.part_size;
+  } else {
+    cur_part_id = rule.start_part_num;
+  }
+  part_ofs = rule.start_ofs + (cur_part_id - rule.start_part_num) * rule.part_size;
+
+  if (rule.stripe_max_size > 0) {
+    cur_stripe = (ofs - part_ofs) / rule.stripe_max_size;
+
+    stripe_ofs = part_ofs + cur_stripe * rule.stripe_max_size;
+    if (!cur_part_id && manifest->get_head_size() > 0) {
+      // part 0 stripe numbering is shifted by one: stripe 0 is the head
+      cur_stripe++;
+    }
+  } else {
+    cur_stripe = 0;
+    stripe_ofs = part_ofs;
+  }
+
+  if (!rule.part_size) {
+    // single-part object: stripe runs to stripe_max_size or object end
+    stripe_size = rule.stripe_max_size;
+    stripe_size = std::min(manifest->get_obj_size() - stripe_ofs, stripe_size);
+  } else {
+    // multipart: stripe ends at the part boundary if that comes first
+    uint64_t next = std::min(stripe_ofs + rule.stripe_max_size, part_ofs + rule.part_size);
+    stripe_size = next - stripe_ofs;
+  }
+
+  cur_override_prefix = rule.override_prefix;
+
+  update_location();
+}
+
+void RGWObjManifest::obj_iterator::update_explicit_pos()
+{
+  // Snap to the start of the explicit entry we point at; the stripe
+  // extends to the next entry, or to the end of the object for the last.
+  ofs = explicit_iter->first;
+  stripe_ofs = ofs;
+
+  auto next_iter = explicit_iter;
+  ++next_iter;
+  stripe_size = (next_iter == manifest->objs.end())
+      ? manifest->obj_size - ofs
+      : next_iter->first - ofs;
+}
+
+// Recompute 'location' for the current iterator position: explicit
+// layouts read it straight from the entry, the head maps to the main
+// object, and tail stripes are derived from the implicit naming scheme.
+void RGWObjManifest::obj_iterator::update_location()
+{
+  if (manifest->explicit_objs) {
+    if (manifest->empty()) {
+      location = rgw_obj_select{};
+    } else {
+      location = explicit_iter->second.loc;
+    }
+    return;
+  }
+
+  if (ofs < manifest->get_head_size()) {
+    // still within the head object
+    location = manifest->get_obj();
+    location.set_placement_rule(manifest->get_head_placement_rule());
+    return;
+  }
+
+  manifest->get_implicit_location(cur_part_id, cur_stripe, ofs, &cur_override_prefix, &location);
+}
+
+// Compute the rados object holding (part, stripe, ofs) under the implicit
+// naming scheme: head -> the main object; part 0 tail stripes ->
+// "<prefix><stripe>" in the shadow ns; multipart -> "<prefix>.<part>"
+// (first stripe, multipart ns) or "<prefix>.<part>_<stripe>" (shadow ns).
+void RGWObjManifest::get_implicit_location(uint64_t cur_part_id, uint64_t cur_stripe,
+  uint64_t ofs, string *override_prefix, rgw_obj_select *location) const
+{
+  rgw_obj loc;
+
+  string& oid = loc.key.name;
+  string& ns = loc.key.ns;
+
+  if (!override_prefix || override_prefix->empty()) {
+    oid = prefix;
+  } else {
+    oid = *override_prefix;
+  }
+
+  if (!cur_part_id) {
+    if (ofs < max_head_size) {
+      // head range: the location is the object itself
+      // NOTE(review): set_placement_rule() is called before the assignment
+      // from 'obj' — presumably rgw_obj_select's operator= preserves the
+      // placement rule; confirm.
+      location->set_placement_rule(head_placement_rule);
+      *location = obj;
+      return;
+    } else {
+      char buf[16];
+      snprintf(buf, sizeof(buf), "%d", (int)cur_stripe);
+      oid += buf;
+      ns = RGW_OBJ_NS_SHADOW;
+    }
+  } else {
+    char buf[32];
+    if (cur_stripe == 0) {
+      snprintf(buf, sizeof(buf), ".%d", (int)cur_part_id);
+      oid += buf;
+      ns= RGW_OBJ_NS_MULTIPART;
+    } else {
+      snprintf(buf, sizeof(buf), ".%d_%d", (int)cur_part_id, (int)cur_stripe);
+      oid += buf;
+      ns = RGW_OBJ_NS_SHADOW;
+    }
+  }
+
+  if (!tail_placement.bucket.name.empty()) {
+    loc.bucket = tail_placement.bucket;
+  } else {
+    loc.bucket = obj.bucket;
+  }
+
+  // Always overwrite instance with tail_instance
+  // to get the right shadow object location
+  loc.key.set_instance(tail_instance);
+
+  location->set_placement_rule(tail_placement.placement_rule);
+  *location = loc;
+}
+
diff --git a/src/rgw/rgw_obj_types.h b/src/rgw/rgw_obj_types.h
new file mode 100644
index 000000000..1347a8ad0
--- /dev/null
+++ b/src/rgw/rgw_obj_types.h
@@ -0,0 +1,622 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+/* N.B., this header defines fundamental serialized types. Do not
+ * include files which can only be compiled in radosgw or OSD
+ * contexts (e.g., rgw_sal.h, rgw_common.h) */
+
+#pragma once
+
+#include <fmt/format.h>
+
+#include "rgw_pool_types.h"
+#include "rgw_bucket_types.h"
+#include "rgw_user_types.h"
+
+#include "common/dout.h"
+#include "common/Formatter.h"
+
+// Bucket-index form of an object key: the (possibly namespace-mangled)
+// name plus version instance. Ordering/equality compare name first, then
+// instance. Serialized type — do not change the encoding.
+struct rgw_obj_index_key { // cls_rgw_obj_key now aliases this type
+  std::string name;
+  std::string instance;
+
+  rgw_obj_index_key() {}
+  rgw_obj_index_key(const std::string &_name) : name(_name) {}
+  rgw_obj_index_key(const std::string& n, const std::string& i) : name(n), instance(i) {}
+
+  std::string to_string() const {
+    return fmt::format("{}({})", name, instance);
+  }
+
+  bool empty() const {
+    return name.empty();
+  }
+
+  void set(const std::string& _name) {
+    name = _name;
+    instance.clear();
+  }
+
+  bool operator==(const rgw_obj_index_key& k) const {
+    return (name.compare(k.name) == 0) &&
+           (instance.compare(k.instance) == 0);
+  }
+
+  bool operator!=(const rgw_obj_index_key& k) const {
+    return (name.compare(k.name) != 0) ||
+           (instance.compare(k.instance) != 0);
+  }
+
+  bool operator<(const rgw_obj_index_key& k) const {
+    int r = name.compare(k.name);
+    if (r == 0) {
+      r = instance.compare(k.instance);
+    }
+    return (r < 0);
+  }
+
+  bool operator<=(const rgw_obj_index_key& k) const {
+    return !(k < *this);
+  }
+
+  void encode(ceph::buffer::list &bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(name, bl);
+    encode(instance, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode(ceph::buffer::list::const_iterator &bl) {
+    DECODE_START(1, bl);
+    decode(name, bl);
+    decode(instance, bl);
+    DECODE_FINISH(bl);
+  }
+  void dump(ceph::Formatter *f) const {
+    f->dump_string("name", name);
+    f->dump_string("instance", instance);
+  }
+  void decode_json(JSONObj *obj);
+  static void generate_test_instances(std::list<rgw_obj_index_key*>& ls) {
+    ls.push_back(new rgw_obj_index_key);
+    ls.push_back(new rgw_obj_index_key);
+    ls.back()->name = "name";
+    ls.back()->instance = "instance";
+  }
+
+  // Upper bound on the encoded size; must mirror encode() above
+  // (ENCODE_START header plus two length-prefixed strings).
+  size_t estimate_encoded_size() const {
+    constexpr size_t start_overhead = sizeof(__u8) + sizeof(__u8) + sizeof(ceph_le32); // version and length prefix
+    constexpr size_t string_overhead = sizeof(__u32); // strings are encoded with 32-bit length prefix
+    return start_overhead +
+        string_overhead + name.size() +
+        string_overhead + instance.size();
+  }
+};
+WRITE_CLASS_ENCODER(rgw_obj_index_key)
+
+// User-facing object key: plain name, version instance, and namespace.
+// The oid/index forms mangle the namespace into the name with an
+// underscore scheme ("_<ns>_<name>", leading '_' in a name escaped as
+// "__<name>"). Serialized type — do not change the encoding.
+struct rgw_obj_key {
+  std::string name;
+  std::string instance;
+  std::string ns;
+
+  rgw_obj_key() {}
+
+  // cppcheck-suppress noExplicitConstructor
+  rgw_obj_key(const std::string& n) : name(n) {}
+  rgw_obj_key(const std::string& n, const std::string& i) : name(n), instance(i) {}
+  rgw_obj_key(const std::string& n, const std::string& i, const std::string& _ns) : name(n), instance(i), ns(_ns) {}
+
+  rgw_obj_key(const rgw_obj_index_key& k) {
+    parse_index_key(k.name, &name, &ns);
+    instance = k.instance;
+  }
+
+  // Split a mangled index name into (name, ns). Names that do not start
+  // with '_' carry no namespace; "__x" unescapes to "_x".
+  static void parse_index_key(const std::string& key, std::string *name, std::string *ns) {
+    if (key[0] != '_') {
+      *name = key;
+      ns->clear();
+      return;
+    }
+    if (key[1] == '_') {
+      *name = key.substr(1);
+      ns->clear();
+      return;
+    }
+    ssize_t pos = key.find('_', 1);
+    if (pos < 0) {
+      /* shouldn't happen, just use key */
+      *name = key;
+      ns->clear();
+      return;
+    }
+
+    *name = key.substr(pos + 1);
+    *ns = key.substr(1, pos -1);
+  }
+
+  void set(const std::string& n) {
+    name = n;
+    instance.clear();
+    ns.clear();
+  }
+
+  void set(const std::string& n, const std::string& i) {
+    name = n;
+    instance = i;
+    ns.clear();
+  }
+
+  void set(const std::string& n, const std::string& i, const std::string& _ns) {
+    name = n;
+    instance = i;
+    ns = _ns;
+  }
+
+  bool set(const rgw_obj_index_key& index_key) {
+    if (!parse_raw_oid(index_key.name, this)) {
+      return false;
+    }
+    instance = index_key.instance;
+    return true;
+  }
+
+  void set_instance(const std::string& i) {
+    instance = i;
+  }
+
+  const std::string& get_instance() const {
+    return instance;
+  }
+
+  void set_ns(const std::string& _ns) {
+    ns = _ns;
+  }
+
+  const std::string& get_ns() const {
+    return ns;
+  }
+
+  // Mangled name as stored in the bucket index (namespace folded in,
+  // leading '_' escaped).
+  std::string get_index_key_name() const {
+    if (ns.empty()) {
+      if (name.size() < 1 || name[0] != '_') {
+        return name;
+      }
+      return std::string("_") + name;
+    };
+
+    // NOTE(review): variable-length array is a GCC/Clang extension, not
+    // standard C++
+    char buf[ns.size() + 16];
+    snprintf(buf, sizeof(buf), "_%s_", ns.c_str());
+    return std::string(buf) + name;
+  };
+
+  void get_index_key(rgw_obj_index_key* key) const {
+    key->name = get_index_key_name();
+    key->instance = instance;
+  }
+
+  std::string get_loc() const {
+    /*
+     * For backward compatibility. Older versions used to have object locator on all objects,
+     * however, the name was the effective object locator. This had the same effect as not
+     * having object locator at all for most objects but the ones that started with underscore as
+     * these were escaped.
+     */
+    if (name[0] == '_' && ns.empty()) {
+      return name;
+    }
+
+    return {};
+  }
+
+  bool empty() const {
+    return name.empty();
+  }
+
+  bool have_null_instance() const {
+    return instance == "null";
+  }
+
+  bool have_instance() const {
+    return !instance.empty();
+  }
+
+  // "null" instances are treated like no instance for oid encoding
+  bool need_to_encode_instance() const {
+    return have_instance() && !have_null_instance();
+  }
+
+  // Rados oid form: "_<ns>[:<instance>]_<name>", or the plain/escaped
+  // name when there is nothing to fold in.
+  std::string get_oid() const {
+    if (ns.empty() && !need_to_encode_instance()) {
+      if (name.size() < 1 || name[0] != '_') {
+        return name;
+      }
+      return std::string("_") + name;
+    }
+
+    std::string oid = "_";
+    oid.append(ns);
+    if (need_to_encode_instance()) {
+      oid.append(std::string(":") + instance);
+    }
+    oid.append("_");
+    oid.append(name);
+    return oid;
+  }
+
+  // NOTE: comparisons ignore 'ns', matching rgw_obj_index_key semantics
+  bool operator==(const rgw_obj_key& k) const {
+    return (name.compare(k.name) == 0) &&
+           (instance.compare(k.instance) == 0);
+  }
+
+  bool operator<(const rgw_obj_key& k) const {
+    int r = name.compare(k.name);
+    if (r == 0) {
+      r = instance.compare(k.instance);
+    }
+    return (r < 0);
+  }
+
+  bool operator<=(const rgw_obj_key& k) const {
+    return !(k < *this);
+  }
+
+  // Split "ns:instance" in place.
+  // NOTE(review): relies on std::string::npos converting to a negative
+  // int — works on common ABIs, but not strictly portable.
+  static void parse_ns_field(std::string& ns, std::string& instance) {
+    int pos = ns.find(':');
+    if (pos >= 0) {
+      instance = ns.substr(pos + 1);
+      ns = ns.substr(0, pos);
+    } else {
+      instance.clear();
+    }
+  }
+
+  // takes an oid and parses out the namespace (ns), name, and
+  // instance
+  static bool parse_raw_oid(const std::string& oid, rgw_obj_key *key) {
+    key->instance.clear();
+    key->ns.clear();
+    if (oid[0] != '_') {
+      key->name = oid;
+      return true;
+    }
+
+    if (oid.size() >= 2 && oid[1] == '_') {
+      key->name = oid.substr(1);
+      return true;
+    }
+
+    if (oid.size() < 3) // for namespace, min size would be 3: _x_
+      return false;
+
+    size_t pos = oid.find('_', 2); // oid must match ^_[^_].+$
+    if (pos == std::string::npos)
+      return false;
+
+    key->ns = oid.substr(1, pos - 1);
+    parse_ns_field(key->ns, key->instance);
+
+    key->name = oid.substr(pos + 1);
+    return true;
+  }
+
+  /**
+   * Translate a namespace-mangled object name to the user-facing name
+   * existing in the given namespace.
+   *
+   * If the object is part of the given namespace, it returns true
+   * and cuts down the name to the unmangled version. If it is not
+   * part of the given namespace, it returns false.
+   */
+  static bool oid_to_key_in_ns(const std::string& oid, rgw_obj_key *key, const std::string& ns) {
+    bool ret = parse_raw_oid(oid, key);
+    if (!ret) {
+      return ret;
+    }
+
+    return (ns == key->ns);
+  }
+
+  /**
+   * Given a mangled object name and an empty namespace std::string, this
+   * function extracts the namespace into the std::string and sets the object
+   * name to be the unmangled version.
+   *
+   * It returns true after successfully doing so, or
+   * false if it fails.
+   */
+  static bool strip_namespace_from_name(std::string& name, std::string& ns, std::string& instance) {
+    ns.clear();
+    instance.clear();
+    if (name[0] != '_') {
+      return true;
+    }
+
+    size_t pos = name.find('_', 1);
+    if (pos == std::string::npos) {
+      return false;
+    }
+
+    if (name[1] == '_') {
+      name = name.substr(1);
+      return true;
+    }
+
+    // a '.' before the namespace delimiter means this is not a mangled name
+    size_t period_pos = name.find('.');
+    if (period_pos < pos) {
+      return false;
+    }
+
+    ns = name.substr(1, pos-1);
+    name = name.substr(pos+1, std::string::npos);
+
+    parse_ns_field(ns, instance);
+    return true;
+  }
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(2, 1, bl);
+    encode(name, bl);
+    encode(instance, bl);
+    encode(ns, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(2, bl);
+    decode(name, bl);
+    decode(instance, bl);
+    if (struct_v >= 2) {
+      decode(ns, bl);
+    }
+    DECODE_FINISH(bl);
+  }
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(rgw_obj_key)
+
+#if FMT_VERSION >= 90000
+// fmt 9+ no longer formats via operator<< implicitly; print "name" or
+// "name[instance]", matching the stream operator in this header.
+template<> struct fmt::formatter<rgw_obj_key> : fmt::formatter<std::string_view> {
+  template <typename FormatContext>
+  auto format(const rgw_obj_key& key, FormatContext& ctx) const {
+    if (key.instance.empty()) {
+      return formatter<std::string_view>::format(key.name, ctx);
+    } else {
+      return fmt::format_to(ctx.out(), "{}[{}]", key.name, key.instance);
+    }
+  }
+};
+#endif
+
+// Stream as "name" or "name[instance]"; delegates to the fmt formatter
+// when fmt 9+ provides one so both paths render identically.
+inline std::ostream& operator<<(std::ostream& out, const rgw_obj_key &key) {
+#if FMT_VERSION >= 90000
+  return out << fmt::format("{}", key);
+#else
+  if (key.instance.empty()) {
+    return out << fmt::format("{}", key.name);
+  } else {
+    return out << fmt::format("{}[{}]", key.name, key.instance);
+  }
+#endif
+}
+
// A raw rados object reference: pool + oid (+ optional locator key).
// Unlike rgw_obj this carries no bucket/namespace semantics.
struct rgw_raw_obj {
  rgw_pool pool;
  std::string oid;
  std::string loc;  // rados locator key; usually empty

  rgw_raw_obj() {}
  rgw_raw_obj(const rgw_pool& _pool, const std::string& _oid) {
    init(_pool, _oid);
  }
  rgw_raw_obj(const rgw_pool& _pool, const std::string& _oid, const std::string& _loc) : loc(_loc) {
    init(_pool, _oid);
  }

  void init(const rgw_pool& _pool, const std::string& _oid) {
    pool = _pool;
    oid = _oid;
  }

  // Empty oid means "no object"; the pool alone does not count.
  bool empty() const {
    return oid.empty();
  }

  void encode(bufferlist& bl) const {
    // v6 is the first version encoded as rgw_raw_obj proper; older data was
    // encoded as rgw_obj (handled in decode below).
    ENCODE_START(6, 6, bl);
    encode(pool, bl);
    encode(oid, bl);
    encode(loc, bl);
    ENCODE_FINISH(bl);
  }

  void decode_from_rgw_obj(bufferlist::const_iterator& bl);

  void decode(bufferlist::const_iterator& bl) {
    // Remember the start offset so we can rewind for the legacy format.
    unsigned ofs = bl.get_off();
    DECODE_START(6, bl);
    if (struct_v < 6) {
      /*
       * this object was encoded as rgw_obj, prior to rgw_raw_obj been split out of it,
       * let's decode it as rgw_obj and convert it
       */
      bl.seek(ofs);
      decode_from_rgw_obj(bl);
      return;
    }
    decode(pool, bl);
    decode(oid, bl);
    decode(loc, bl);
    DECODE_FINISH(bl);
  }

  // Total order: pool, then oid, then loc.
  bool operator<(const rgw_raw_obj& o) const {
    int r = pool.compare(o.pool);
    if (r == 0) {
      r = oid.compare(o.oid);
      if (r == 0) {
        r = loc.compare(o.loc);
      }
    }
    return (r < 0);
  }

  bool operator==(const rgw_raw_obj& o) const {
    return (pool == o.pool && oid == o.oid && loc == o.loc);
  }

  void dump(Formatter *f) const;
  void decode_json(JSONObj *obj);
};
+WRITE_CLASS_ENCODER(rgw_raw_obj)
+
+inline std::ostream& operator<<(std::ostream& out, const rgw_raw_obj& o) {
+ out << o.pool << ":" << o.oid;
+ return out;
+}
+
// Logical RGW object reference: a bucket plus a (name, instance, ns) key.
struct rgw_obj {
  rgw_bucket bucket;
  rgw_obj_key key;

  bool in_extra_data{false}; /* in-memory only member, does not serialize */

  // Represents the hash index source for this object once it is set (non-empty)
  std::string index_hash_source;

  rgw_obj() {}
  rgw_obj(const rgw_bucket& b, const std::string& name) : bucket(b), key(name) {}
  rgw_obj(const rgw_bucket& b, const rgw_obj_key& k) : bucket(b), key(k) {}
  rgw_obj(const rgw_bucket& b, const rgw_obj_index_key& k) : bucket(b), key(k) {}

  void init(const rgw_bucket& b, const rgw_obj_key& k) {
    bucket = b;
    key = k;
  }

  void init(const rgw_bucket& b, const std::string& name) {
    bucket = b;
    key.set(name);
  }

  void init(const rgw_bucket& b, const std::string& name, const std::string& i, const std::string& n) {
    bucket = b;
    key.set(name, i, n);
  }

  // Sets bucket/name/ns and clears any version instance.
  void init_ns(const rgw_bucket& b, const std::string& name, const std::string& n) {
    bucket = b;
    key.name = name;
    key.instance.clear();
    key.ns = n;
  }

  bool empty() const {
    return key.empty();
  }

  void set_key(const rgw_obj_key& k) {
    key = k;
  }

  std::string get_oid() const {
    return key.get_oid();
  }

  // Name used for bucket-index shard hashing; falls back to the object name
  // unless an explicit hash source was set.
  const std::string& get_hash_object() const {
    return index_hash_source.empty() ? key.name : index_hash_source;
  }

  void set_in_extra_data(bool val) {
    in_extra_data = val;
  }

  bool is_in_extra_data() const {
    return in_extra_data;
  }

  void encode(bufferlist& bl) const {
    // v6: bucket + key fields encoded directly (see legacy path in decode).
    ENCODE_START(6, 6, bl);
    encode(bucket, bl);
    encode(key.ns, bl);
    encode(key.name, bl);
    encode(key.instance, bl);
//    encode(placement_id, bl);
    ENCODE_FINISH(bl);
  }

  void decode(bufferlist::const_iterator& bl) {
    DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, bl);
    if (struct_v < 6) {
      // Legacy (pre-v6) layout: fields were stored in a different order and
      // the object name may be prefixed/mangled; undo that below.
      std::string s;
      decode(bucket.name, bl); /* bucket.name */
      decode(s, bl); /* loc */
      decode(key.ns, bl);
      decode(key.name, bl);
      if (struct_v >= 2)
        decode(bucket, bl);
      if (struct_v >= 4)
        decode(key.instance, bl);
      if (key.ns.empty() && key.instance.empty()) {
        // Plain object: strip the '_' escape prefix if present.
        if (key.name[0] == '_') {
          key.name = key.name.substr(1);
        }
      } else {
        if (struct_v >= 5) {
          // v5 stored the unmangled name separately; re-read it.
          decode(key.name, bl);
        } else {
          // v4 and older stored "_<ns>_<name>"; recover <name>.
          ssize_t pos = key.name.find('_', 1);
          if (pos < 0) {
            throw buffer::malformed_input();
          }
          key.name = key.name.substr(pos + 1);
        }
      }
    } else {
      decode(bucket, bl);
      decode(key.ns, bl);
      decode(key.name, bl);
      decode(key.instance, bl);
//      decode(placement_id, bl);
    }
    DECODE_FINISH(bl);
  }
  void dump(Formatter *f) const;
  static void generate_test_instances(std::list<rgw_obj*>& o);

  bool operator==(const rgw_obj& o) const {
    return (key == o.key) &&
           (bucket == o.bucket);
  }
  bool operator<(const rgw_obj& o) const {
    int r = key.name.compare(o.key.name);
    if (r == 0) {
      r = bucket.bucket_id.compare(o.bucket.bucket_id); /* not comparing bucket.name, if bucket_id is equal so will be bucket.name */
      if (r == 0) {
        r = key.ns.compare(o.key.ns);
        if (r == 0) {
          r = key.instance.compare(o.key.instance);
        }
      }
    }

    return (r < 0);
  }

  // Data pool for explicit-placement buckets; the extra-data pool is used
  // only when requested and actually configured.
  const rgw_pool& get_explicit_data_pool() {
    if (!in_extra_data || bucket.explicit_placement.data_extra_pool.empty()) {
      return bucket.explicit_placement.data_pool;
    }
    return bucket.explicit_placement.data_extra_pool;
  }
};
+WRITE_CLASS_ENCODER(rgw_obj)
diff --git a/src/rgw/rgw_object_expirer.cc b/src/rgw/rgw_object_expirer.cc
new file mode 100644
index 000000000..fd36a49c6
--- /dev/null
+++ b/src/rgw/rgw_object_expirer.cc
@@ -0,0 +1,106 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <errno.h>
+#include <iostream>
+#include <sstream>
+#include <string>
+
+
+#include "auth/Crypto.h"
+
+#include "common/armor.h"
+#include "common/ceph_json.h"
+#include "common/config.h"
+#include "common/ceph_argparse.h"
+#include "common/Formatter.h"
+#include "common/errno.h"
+
+#include "global/global_init.h"
+
+#include "include/utime.h"
+#include "include/str_list.h"
+
+#include "rgw_user.h"
+#include "rgw_bucket.h"
+#include "rgw_acl.h"
+#include "rgw_acl_s3.h"
+#include "rgw_log.h"
+#include "rgw_formats.h"
+#include "rgw_usage.h"
+#include "rgw_object_expirer_core.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+static rgw::sal::Driver* driver = NULL;
+
// RAII guard: closes the storage driver when it goes out of scope, so every
// exit path from main() releases the backend cleanly.
class StoreDestructor {
  rgw::sal::Driver* driver;

public:
  explicit StoreDestructor(rgw::sal::Driver* _s) : driver(_s) {}
  ~StoreDestructor() {
    if (driver) {
      DriverManager::close_storage(driver);
    }
  }
};
+
// Prints the generic daemon usage text (shared with the other rgw daemons).
static void usage()
{
  generic_server_usage();
}
+
// Entry point of the standalone object-expirer daemon: initializes ceph
// globals, opens the rados SAL driver, starts the expirer processor thread
// and then sleeps forever (the processor does the actual work).
int main(const int argc, const char **argv)
{
  auto args = argv_to_vec(argc, argv);
  if (args.empty()) {
    std::cerr << argv[0] << ": -h or --help for usage" << std::endl;
    exit(1);
  }
  if (ceph_argparse_need_usage(args)) {
    usage();
    exit(0);
  }

  auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
			 CODE_ENVIRONMENT_DAEMON,
			 CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS);

  // Consume a possible "--" separator; any remaining args are ignored.
  for (std::vector<const char *>::iterator i = args.begin(); i != args.end(); ) {
    if (ceph_argparse_double_dash(args, i)) {
      break;
    }
  }

  if (g_conf()->daemonize) {
    global_init_daemonize(g_ceph_context);
  }

  common_init_finish(g_ceph_context);

  const DoutPrefix dp(cct.get(), dout_subsys, "rgw object expirer: ");
  DriverManager::Config cfg;
  cfg.store_name = "rados";
  cfg.filter_name = "none";
  driver = DriverManager::get_storage(&dp, g_ceph_context, cfg, false, false, false, false, false);
  if (!driver) {
    std::cerr << "couldn't init storage provider" << std::endl;
    return EIO;
  }

  /* Guard to not forget about closing the rados driver. */
  StoreDestructor store_dtor(driver);

  // The processor runs on its own thread; this thread only keeps the
  // process alive.
  RGWObjectExpirer objexp(driver);
  objexp.start_processor();

  const utime_t interval(g_ceph_context->_conf->rgw_objexp_gc_interval, 0);
  while (true) {
    interval.sleep();
  }

  /* unreachable */

  return EXIT_SUCCESS;
}
diff --git a/src/rgw/rgw_object_lock.cc b/src/rgw/rgw_object_lock.cc
new file mode 100644
index 000000000..1d44328fe
--- /dev/null
+++ b/src/rgw/rgw_object_lock.cc
@@ -0,0 +1,100 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+//
+#include "rgw_object_lock.h"
+
+using namespace std;
+
+void DefaultRetention::decode_xml(XMLObj *obj) {
+ RGWXMLDecoder::decode_xml("Mode", mode, obj, true);
+ if (mode.compare("GOVERNANCE") != 0 && mode.compare("COMPLIANCE") != 0) {
+ throw RGWXMLDecoder::err("bad Mode in lock rule");
+ }
+ bool days_exist = RGWXMLDecoder::decode_xml("Days", days, obj);
+ bool years_exist = RGWXMLDecoder::decode_xml("Years", years, obj);
+ if ((days_exist && years_exist) || (!days_exist && !years_exist)) {
+ throw RGWXMLDecoder::err("either Days or Years must be specified, but not both");
+ }
+}
+
+void DefaultRetention::dump_xml(Formatter *f) const {
+ encode_xml("Mode", mode, f);
+ if (days > 0) {
+ encode_xml("Days", days, f);
+ } else {
+ encode_xml("Years", years, f);
+ }
+}
+
// A <Rule> contains exactly one mandatory <DefaultRetention> element.
void ObjectLockRule::decode_xml(XMLObj *obj) {
  RGWXMLDecoder::decode_xml("DefaultRetention", defaultRetention, obj, true);
}
+
// Serializes the rule's single <DefaultRetention> child element.
void ObjectLockRule::dump_xml(Formatter *f) const {
  encode_xml("DefaultRetention", defaultRetention, f);
}
+
+void RGWObjectLock::decode_xml(XMLObj *obj) {
+ string enabled_str;
+ RGWXMLDecoder::decode_xml("ObjectLockEnabled", enabled_str, obj, true);
+ if (enabled_str.compare("Enabled") != 0) {
+ throw RGWXMLDecoder::err("invalid ObjectLockEnabled value");
+ } else {
+ enabled = true;
+ }
+ rule_exist = RGWXMLDecoder::decode_xml("Rule", rule, obj);
+}
+
+void RGWObjectLock::dump_xml(Formatter *f) const {
+ if (enabled) {
+ encode_xml("ObjectLockEnabled", "Enabled", f);
+ }
+ if (rule_exist) {
+ encode_xml("Rule", rule, f);
+ }
+}
+
// Computes the retain-until timestamp for an object written at 'mtime'.
// Returns the zero time point when no default-retention rule exists.
ceph::real_time RGWObjectLock::get_lock_until_date(const ceph::real_time& mtime) const {
  if (!rule_exist) {
    return ceph::real_time();
  }
  // The rule carries either Days or Years (never both); prefer Days if set.
  if (int days = get_days(); days > 0) {
    return mtime + std::chrono::days(days);
  }
  return mtime + std::chrono::years(get_years());
}
+
+void RGWObjectRetention::decode_xml(XMLObj *obj) {
+ RGWXMLDecoder::decode_xml("Mode", mode, obj, true);
+ if (mode.compare("GOVERNANCE") != 0 && mode.compare("COMPLIANCE") != 0) {
+ throw RGWXMLDecoder::err("bad Mode in retention");
+ }
+ string date_str;
+ RGWXMLDecoder::decode_xml("RetainUntilDate", date_str, obj, true);
+ boost::optional<ceph::real_time> date = ceph::from_iso_8601(date_str);
+ if (boost::none == date) {
+ throw RGWXMLDecoder::err("invalid RetainUntilDate value");
+ }
+ retain_until_date = *date;
+}
+
+void RGWObjectRetention::dump_xml(Formatter *f) const {
+ encode_xml("Mode", mode, f);
+ string date = ceph::to_iso_8601(retain_until_date);
+ encode_xml("RetainUntilDate", date, f);
+}
+
+void RGWObjectLegalHold::decode_xml(XMLObj *obj) {
+ RGWXMLDecoder::decode_xml("Status", status, obj, true);
+ if (status.compare("ON") != 0 && status.compare("OFF") != 0) {
+ throw RGWXMLDecoder::err("bad status in legal hold");
+ }
+}
+
// Serializes the hold status ("ON"/"OFF") as the single <Status> element.
void RGWObjectLegalHold::dump_xml(Formatter *f) const {
  encode_xml("Status", status, f);
}
+
+bool RGWObjectLegalHold::is_enabled() const {
+ return status.compare("ON") == 0;
+}
diff --git a/src/rgw/rgw_object_lock.h b/src/rgw/rgw_object_lock.h
new file mode 100644
index 000000000..27c73feae
--- /dev/null
+++ b/src/rgw/rgw_object_lock.h
@@ -0,0 +1,222 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
#include <string>
#include <utility>

#include "common/ceph_time.h"
#include "common/iso_8601.h"
#include "rgw_xml.h"
+
// Default retention period for a bucket's object-lock configuration:
// a mode ("GOVERNANCE"/"COMPLIANCE") and either Days or Years (never both;
// enforced in decode_xml).
class DefaultRetention
{
protected:
  std::string mode;
  int days;
  int years;

public:
  DefaultRetention(): days(0), years(0) {};

  int get_days() const {
    return days;
  }

  int get_years() const {
    return years;
  }

  std::string get_mode() const {
    return mode;
  }

  void encode(bufferlist& bl) const {
    ENCODE_START(1, 1, bl);
    encode(mode, bl);
    encode(days, bl);
    encode(years, bl);
    ENCODE_FINISH(bl);
  }

  void decode(bufferlist::const_iterator& bl) {
    DECODE_START(1, bl);
    decode(mode, bl);
    decode(days, bl);
    decode(years, bl);
    DECODE_FINISH(bl);
  }

  void decode_xml(XMLObj *obj);
  void dump_xml(Formatter *f) const;
};
+WRITE_CLASS_ENCODER(DefaultRetention)
+
// A single object-lock rule; currently just a thin wrapper around its
// DefaultRetention child, with pass-through accessors.
class ObjectLockRule
{
protected:
  DefaultRetention defaultRetention;
public:
  int get_days() const {
    return defaultRetention.get_days();
  }

  int get_years() const {
    return defaultRetention.get_years();
  }

  std::string get_mode() const {
    return defaultRetention.get_mode();
  }

  void encode(bufferlist& bl) const {
    ENCODE_START(1, 1, bl);
    encode(defaultRetention, bl);
    ENCODE_FINISH(bl);
  }

  void decode(bufferlist::const_iterator& bl) {
    DECODE_START(1, bl);
    decode(defaultRetention, bl);
    DECODE_FINISH(bl);
  }

  void decode_xml(XMLObj *obj);
  void dump_xml(Formatter *f) const;
};
+WRITE_CLASS_ENCODER(ObjectLockRule)
+
// Bucket-level object-lock configuration: an enabled flag plus an optional
// retention rule. The rule is only encoded/decoded when rule_exist is set.
class RGWObjectLock
{
protected:
  bool enabled;
  bool rule_exist;
  ObjectLockRule rule;

public:
  RGWObjectLock():enabled(true), rule_exist(false) {}

  int get_days() const {
    return rule.get_days();
  }

  int get_years() const {
    return rule.get_years();
  }

  std::string get_mode() const {
    return rule.get_mode();
  }

  bool retention_period_valid() const {
    // DefaultRetention requires either Days or Years.
    // You can't specify both at the same time.
    // see https://docs.aws.amazon.com/AmazonS3/latest/API/RESTBucketPUTObjectLockConfiguration.html
    return (get_years() > 0) != (get_days() > 0);
  }

  bool has_rule() const {
    return rule_exist;
  }

  void encode(bufferlist& bl) const {
    ENCODE_START(1, 1, bl);
    encode(enabled, bl);
    encode(rule_exist, bl);
    // The rule payload is conditional; rule_exist tells the decoder
    // whether to expect it.
    if (rule_exist) {
      encode(rule, bl);
    }
    ENCODE_FINISH(bl);
  }

  void decode(bufferlist::const_iterator& bl) {
    DECODE_START(1, bl);
    decode(enabled, bl);
    decode(rule_exist, bl);
    if (rule_exist) {
      decode(rule, bl);
    }
    DECODE_FINISH(bl);
  }

  void decode_xml(XMLObj *obj);
  void dump_xml(Formatter *f) const;
  ceph::real_time get_lock_until_date(const ceph::real_time& mtime) const;
};
+WRITE_CLASS_ENCODER(RGWObjectLock)
+
+class RGWObjectRetention
+{
+protected:
+ std::string mode;
+ ceph::real_time retain_until_date;
+public:
+ RGWObjectRetention() {}
+ RGWObjectRetention(std::string _mode, ceph::real_time _date): mode(_mode), retain_until_date(_date) {}
+
+ void set_mode(std::string _mode) {
+ mode = _mode;
+ }
+
+ std::string get_mode() const {
+ return mode;
+ }
+
+ void set_retain_until_date(ceph::real_time _retain_until_date) {
+ retain_until_date = _retain_until_date;
+ }
+
+ ceph::real_time get_retain_until_date() const {
+ return retain_until_date;
+ }
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(mode, bl);
+ encode(retain_until_date, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(mode, bl);
+ decode(retain_until_date, bl);
+ DECODE_FINISH(bl);
+ }
+
+ void decode_xml(XMLObj *obj);
+ void dump_xml(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(RGWObjectRetention)
+
+class RGWObjectLegalHold
+{
+protected:
+ std::string status;
+public:
+ RGWObjectLegalHold() {}
+ RGWObjectLegalHold(std::string _status): status(_status) {}
+ void set_status(std::string _status) {
+ status = _status;
+ }
+
+ std::string get_status() const {
+ return status;
+ }
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(status, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(status, bl);
+ DECODE_FINISH(bl);
+ }
+
+ void decode_xml(XMLObj *obj);
+ void dump_xml(Formatter *f) const;
+ bool is_enabled() const;
+};
+WRITE_CLASS_ENCODER(RGWObjectLegalHold)
diff --git a/src/rgw/rgw_oidc_provider.cc b/src/rgw/rgw_oidc_provider.cc
new file mode 100644
index 000000000..da6d73e23
--- /dev/null
+++ b/src/rgw/rgw_oidc_provider.cc
@@ -0,0 +1,182 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <errno.h>
+#include <ctime>
+#include <regex>
+
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/ceph_json.h"
+#include "common/ceph_time.h"
+#include "rgw_rados.h"
+#include "rgw_zone.h"
+
+#include "include/types.h"
+#include "rgw_string.h"
+
+#include "rgw_common.h"
+#include "rgw_tools.h"
+#include "rgw_oidc_provider.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_sys_obj.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+namespace rgw { namespace sal {
+
+const string RGWOIDCProvider::oidc_url_oid_prefix = "oidc_url.";
+const string RGWOIDCProvider::oidc_arn_prefix = "arn:aws:iam::";
+
// Splits this->arn into its tenant (ARN account field) and provider url
// (ARN resource field, minus the "oidc-provider/" prefix).
// Returns -EINVAL when the stored arn does not parse.
int RGWOIDCProvider::get_tenant_url_from_arn(string& tenant, string& url)
{
  auto provider_arn = rgw::ARN::parse(arn);
  if (!provider_arn) {
    return -EINVAL;
  }
  url = provider_arn->resource;
  tenant = provider_arn->account;
  auto pos = url.find("oidc-provider/");
  if (pos != std::string::npos) {
    url.erase(pos, 14); // 14 == strlen("oidc-provider/")
  }
  return 0;
}
+
+int RGWOIDCProvider::create(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y)
+{
+ int ret;
+
+ if (! validate_input(dpp)) {
+ return -EINVAL;
+ }
+
+ string idp_url = url_remove_prefix(provider_url);
+
+ /* check to see the name is not used */
+ ret = read_url(dpp, idp_url, tenant);
+ if (exclusive && ret == 0) {
+ ldpp_dout(dpp, 0) << "ERROR: url " << provider_url << " already in use"
+ << id << dendl;
+ return -EEXIST;
+ } else if ( ret < 0 && ret != -ENOENT) {
+ ldpp_dout(dpp, 0) << "failed reading provider url " << provider_url << ": "
+ << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ //arn
+ arn = oidc_arn_prefix + tenant + ":oidc-provider/" + idp_url;
+
+ // Creation time
+ real_clock::time_point t = real_clock::now();
+
+ struct timeval tv;
+ real_clock::to_timeval(t, tv);
+
+ char buf[30];
+ struct tm result;
+ gmtime_r(&tv.tv_sec, &result);
+ strftime(buf,30,"%Y-%m-%dT%H:%M:%S", &result);
+ sprintf(buf + strlen(buf),".%dZ",(int)tv.tv_usec/1000);
+ creation_date.assign(buf, strlen(buf));
+
+ ret = store_url(dpp, idp_url, exclusive, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: storing role info in OIDC pool: "
+ << provider_url << ": " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ return 0;
+}
+
// Loads the provider record addressed by this->arn. The tenant embedded in
// the arn must match this->tenant (set from the requesting user).
// Returns 0 on success, -EINVAL on a bad/mismatched arn, or the read error.
int RGWOIDCProvider::get(const DoutPrefixProvider *dpp)
{
  string url, tenant;
  auto ret = get_tenant_url_from_arn(tenant, url);
  if (ret < 0) {
    ldpp_dout(dpp, 0) << "ERROR: failed to parse arn" << dendl;
    return -EINVAL;
  }

  // Note: the local 'tenant' (from the arn) deliberately shadows the member.
  if (this->tenant != tenant) {
    ldpp_dout(dpp, 0) << "ERROR: tenant in arn doesn't match that of user " << this->tenant << ", "
                      << tenant << ": " << dendl;
    return -EINVAL;
  }

  ret = read_url(dpp, url, tenant);
  if (ret < 0) {
    return ret;
  }

  return 0;
}
+
// Minimal JSON dump: only the provider's ARN (used by list/create replies).
void RGWOIDCProvider::dump(Formatter *f) const
{
  encode_json("OpenIDConnectProviderArn", arn, f);
}
+
+void RGWOIDCProvider::dump_all(Formatter *f) const
+{
+ f->open_object_section("ClientIDList");
+ for (auto it : client_ids) {
+ encode_json("member", it, f);
+ }
+ f->close_section();
+ encode_json("CreateDate", creation_date, f);
+ f->open_object_section("ThumbprintList");
+ for (auto it : thumbprints) {
+ encode_json("member", it, f);
+ }
+ f->close_section();
+ encode_json("Url", provider_url, f);
+}
+
// JSON counterpart of dump(): only the ARN is decoded.
void RGWOIDCProvider::decode_json(JSONObj *obj)
{
  JSONDecoder::decode_json("OpenIDConnectProviderArn", arn, obj);
}
+
+bool RGWOIDCProvider::validate_input(const DoutPrefixProvider *dpp)
+{
+ if (provider_url.length() > MAX_OIDC_URL_LEN) {
+ ldpp_dout(dpp, 0) << "ERROR: Invalid length of url " << dendl;
+ return false;
+ }
+ if (client_ids.size() > MAX_OIDC_NUM_CLIENT_IDS) {
+ ldpp_dout(dpp, 0) << "ERROR: Invalid number of client ids " << dendl;
+ return false;
+ }
+
+ for (auto& it : client_ids) {
+ if (it.length() > MAX_OIDC_CLIENT_ID_LEN) {
+ return false;
+ }
+ }
+
+ if (thumbprints.size() > MAX_OIDC_NUM_THUMBPRINTS) {
+ ldpp_dout(dpp, 0) << "ERROR: Invalid number of thumbprints " << thumbprints.size() << dendl;
+ return false;
+ }
+
+ for (auto& it : thumbprints) {
+ if (it.length() > MAX_OIDC_THUMBPRINT_LEN) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
// Prefix used for the per-url rados oid of a provider record ("oidc_url.").
const string& RGWOIDCProvider::get_url_oid_prefix()
{
  return oidc_url_oid_prefix;
}
+
+} } // namespace rgw::sal
diff --git a/src/rgw/rgw_oidc_provider.h b/src/rgw/rgw_oidc_provider.h
new file mode 100644
index 000000000..581ee879a
--- /dev/null
+++ b/src/rgw/rgw_oidc_provider.h
@@ -0,0 +1,121 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <string>
+
+#include "common/ceph_context.h"
+#include "common/ceph_json.h"
+
+#include "rgw/rgw_sal.h"
+
+namespace rgw { namespace sal {
+
+class RGWOIDCProvider
+{
+public:
+ static const std::string oidc_url_oid_prefix;
+ static const std::string oidc_arn_prefix;
+ static constexpr int MAX_OIDC_NUM_CLIENT_IDS = 100;
+ static constexpr int MAX_OIDC_CLIENT_ID_LEN = 255;
+ static constexpr int MAX_OIDC_NUM_THUMBPRINTS = 5;
+ static constexpr int MAX_OIDC_THUMBPRINT_LEN = 40;
+ static constexpr int MAX_OIDC_URL_LEN = 255;
+
+protected:
+ std::string id;
+ std::string provider_url;
+ std::string arn;
+ std::string creation_date;
+ std::string tenant;
+ std::vector<std::string> client_ids;
+ std::vector<std::string> thumbprints;
+
+ int get_tenant_url_from_arn(std::string& tenant, std::string& url);
+ virtual int store_url(const DoutPrefixProvider *dpp, const std::string& url, bool exclusive, optional_yield y) = 0;
+ virtual int read_url(const DoutPrefixProvider *dpp, const std::string& url, const std::string& tenant) = 0;
+ bool validate_input(const DoutPrefixProvider *dpp);
+
+public:
+ void set_arn(std::string _arn) {
+ arn = _arn;
+ }
+ void set_url(std::string _provider_url) {
+ provider_url = _provider_url;
+ }
+ void set_tenant(std::string _tenant) {
+ tenant = _tenant;
+ }
+ void set_client_ids(std::vector<std::string>& _client_ids) {
+ client_ids = std::move(_client_ids);
+ }
+ void set_thumbprints(std::vector<std::string>& _thumbprints) {
+ thumbprints = std::move(_thumbprints);
+ }
+
+ RGWOIDCProvider(std::string provider_url,
+ std::string tenant,
+ std::vector<std::string> client_ids,
+ std::vector<std::string> thumbprints)
+ : provider_url(std::move(provider_url)),
+ tenant(std::move(tenant)),
+ client_ids(std::move(client_ids)),
+ thumbprints(std::move(thumbprints)) {
+ }
+
+ RGWOIDCProvider( std::string arn,
+ std::string tenant)
+ : arn(std::move(arn)),
+ tenant(std::move(tenant)) {
+ }
+
+ RGWOIDCProvider(std::string tenant)
+ : tenant(std::move(tenant)) {}
+
+ RGWOIDCProvider() {}
+
+ virtual ~RGWOIDCProvider() = default;
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(3, 1, bl);
+ encode(id, bl);
+ encode(provider_url, bl);
+ encode(arn, bl);
+ encode(creation_date, bl);
+ encode(tenant, bl);
+ encode(client_ids, bl);
+ encode(thumbprints, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(2, bl);
+ decode(id, bl);
+ decode(provider_url, bl);
+ decode(arn, bl);
+ decode(creation_date, bl);
+ decode(tenant, bl);
+ decode(client_ids, bl);
+ decode(thumbprints, bl);
+ DECODE_FINISH(bl);
+ }
+
+ const std::string& get_provider_url() const { return provider_url; }
+ const std::string& get_arn() const { return arn; }
+ const std::string& get_create_date() const { return creation_date; }
+ const std::vector<std::string>& get_client_ids() const { return client_ids;}
+ const std::vector<std::string>& get_thumbprints() const { return thumbprints; }
+
+ int create(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y);
+ virtual int delete_obj(const DoutPrefixProvider *dpp, optional_yield y) = 0;
+ int get(const DoutPrefixProvider *dpp);
+ void dump(Formatter *f) const;
+ void dump_all(Formatter *f) const;
+ void decode_json(JSONObj *obj);
+
+ static const std::string& get_url_oid_prefix();
+};
+WRITE_CLASS_ENCODER(RGWOIDCProvider)
+
+} } // namespace rgw::sal
diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc
new file mode 100644
index 000000000..71fb198f3
--- /dev/null
+++ b/src/rgw/rgw_op.cc
@@ -0,0 +1,8958 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <errno.h>
+#include <stdlib.h>
+#include <system_error>
+#include <unistd.h>
+
+#include <sstream>
+#include <string_view>
+
+#include <boost/algorithm/string/predicate.hpp>
+#include <boost/optional.hpp>
+#include <boost/utility/in_place_factory.hpp>
+
+#include "include/scope_guard.h"
+#include "common/Clock.h"
+#include "common/armor.h"
+#include "common/errno.h"
+#include "common/mime.h"
+#include "common/utf8.h"
+#include "common/ceph_json.h"
+#include "common/static_ptr.h"
+#include "rgw_tracer.h"
+
+#include "rgw_rados.h"
+#include "rgw_zone.h"
+#include "rgw_op.h"
+#include "rgw_rest.h"
+#include "rgw_acl.h"
+#include "rgw_acl_s3.h"
+#include "rgw_acl_swift.h"
+#include "rgw_user.h"
+#include "rgw_bucket.h"
+#include "rgw_log.h"
+#include "rgw_multi.h"
+#include "rgw_multi_del.h"
+#include "rgw_cors.h"
+#include "rgw_cors_s3.h"
+#include "rgw_rest_conn.h"
+#include "rgw_rest_s3.h"
+#include "rgw_tar.h"
+#include "rgw_client_io.h"
+#include "rgw_compression.h"
+#include "rgw_role.h"
+#include "rgw_tag_s3.h"
+#include "rgw_putobj_processor.h"
+#include "rgw_crypt.h"
+#include "rgw_perf_counters.h"
+#include "rgw_process_env.h"
+#include "rgw_notify.h"
+#include "rgw_notify_event_type.h"
+#include "rgw_sal.h"
+#include "rgw_sal_rados.h"
+#include "rgw_lua_data_filter.h"
+#include "rgw_lua.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_quota.h"
+#include "services/svc_sys_obj.h"
+
+#include "cls/lock/cls_lock_client.h"
+#include "cls/rgw/cls_rgw_client.h"
+
+
+#include "include/ceph_assert.h"
+
+#include "compressor/Compressor.h"
+
+#ifdef WITH_ARROW_FLIGHT
+#include "rgw_flight.h"
+#include "rgw_flight_frontend.h"
+#endif
+
+#ifdef WITH_LTTNG
+#define TRACEPOINT_DEFINE
+#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#include "tracing/rgw_op.h"
+#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#undef TRACEPOINT_DEFINE
+#else
+#define tracepoint(...)
+#endif
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+using namespace librados;
+using ceph::crypto::MD5;
+using boost::optional;
+using boost::none;
+
+using rgw::ARN;
+using rgw::IAM::Effect;
+using rgw::IAM::Policy;
+
// Namespaces used for multipart-upload parts and shadow (tail) objects.
static string mp_ns = RGW_OBJ_NS_MULTIPART;
static string shadow_ns = RGW_OBJ_NS_SHADOW;

static void forward_req_info(const DoutPrefixProvider *dpp, CephContext *cct, req_info& info, const std::string& bucket_name);

// Filters multipart meta objects out of bucket listings.
static MultipartMetaFilter mp_filter;

// this probably should belong in the rgw_iam_policy_keywords, I'll get it to it
// at some point
static constexpr auto S3_EXISTING_OBJTAG = "s3:ExistingObjectTag";
static constexpr auto S3_RESOURCE_TAG = "s3:ResourceTag";
static constexpr auto S3_RUNTIME_RESOURCE_VAL = "${s3:ResourceTag";
+
// Parses the HTTP Range header (member range_str) into the member offsets
// ofs/end and sets partial_content/range_parsed. Returns 0 when there is no
// usable range (full object) or when the range parsed cleanly; returns
// -ERANGE on a malformed range unless rgw_ignore_get_invalid_range is set,
// in which case the range is ignored and 0 is returned.
int RGWGetObj::parse_range(void)
{
  int r = -ERANGE;
  string rs(range_str);
  string ofs_str;
  string end_str;

  ignore_invalid_range = s->cct->_conf->rgw_ignore_get_invalid_range;
  partial_content = false;

  size_t pos = rs.find("bytes=");
  if (pos == string::npos) {
    // Tolerate variants like " Bytes = ...": skip whitespace, match the
    // unit token case-insensitively, then require '='.
    pos = 0;
    while (isspace(rs[pos]))
      pos++;
    int end = pos;
    while (isalpha(rs[end]))
      end++;
    if (strncasecmp(rs.c_str(), "bytes", end - pos) != 0)
      return 0;
    while (isspace(rs[end]))
      end++;
    if (rs[end] != '=')
      return 0;
    rs = rs.substr(end + 1);
  } else {
    rs = rs.substr(pos + 6); /* size of("bytes=") */
  }
  pos = rs.find('-');
  if (pos == string::npos)
    goto done;

  partial_content = true;

  ofs_str = rs.substr(0, pos);
  end_str = rs.substr(pos + 1);
  if (end_str.length()) {
    end = atoll(end_str.c_str());
    if (end < 0)
      goto done;
  }

  if (ofs_str.length()) {
    ofs = atoll(ofs_str.c_str());
  } else { // RFC2616 suffix-byte-range-spec
    // "-N" means the last N bytes; encoded here as a negative offset.
    ofs = -end;
    end = -1;
  }

  // A bounded range must not end before it starts.
  if (end >= 0 && end < ofs)
    goto done;

  range_parsed = true;
  return 0;

done:
  if (ignore_invalid_range) {
    // Fall back to serving the whole object instead of failing the request.
    partial_content = false;
    ofs = 0;
    end = -1;
    range_parsed = false; // allow retry
    r = 0;
  }

  return r;
}
+
// Decodes a serialized RGWAccessControlPolicy out of 'bl' into *policy.
// Returns 0 on success, -EIO if the buffer cannot be decoded. At debug
// level >= 15 the decoded policy is also logged as S3 XML.
static int decode_policy(const DoutPrefixProvider *dpp,
                         CephContext *cct,
                         bufferlist& bl,
                         RGWAccessControlPolicy *policy)
{
  auto iter = bl.cbegin();
  try {
    policy->decode(iter);
  } catch (buffer::error& err) {
    ldpp_dout(dpp, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
    return -EIO;
  }
  if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 15>()) {
    ldpp_dout(dpp, 15) << __func__ << " Read AccessControlPolicy";
    RGWAccessControlPolicy_S3 *s3policy = static_cast<RGWAccessControlPolicy_S3 *>(policy);
    s3policy->to_xml(*_dout);
    *_dout << dendl;
  }
  return 0;
}
+
+
+static int get_user_policy_from_attr(const DoutPrefixProvider *dpp,
+ CephContext * const cct,
+ map<string, bufferlist>& attrs,
+ RGWAccessControlPolicy& policy /* out */)
+{
+ auto aiter = attrs.find(RGW_ATTR_ACL);
+ if (aiter != attrs.end()) {
+ int ret = decode_policy(dpp, cct, aiter->second, &policy);
+ if (ret < 0) {
+ return ret;
+ }
+ } else {
+ return -ENOENT;
+ }
+
+ return 0;
+}
+
+/**
+ * Get the AccessControlPolicy for an object off of disk.
+ * policy: must point to a valid RGWACL, and will be filled upon return.
+ * bucket: name of the bucket containing the object.
+ * object: name of the object to get the ACL for.
+ * Returns: 0 on success, -ERR# otherwise.
+ */
int rgw_op_get_bucket_policy_from_attr(const DoutPrefixProvider *dpp,
                                       CephContext *cct,
                                       rgw::sal::Driver* driver,
                                       RGWBucketInfo& bucket_info,
                                       map<string, bufferlist>& bucket_attrs,
                                       RGWAccessControlPolicy *policy,
                                       optional_yield y)
{
  map<string, bufferlist>::iterator aiter = bucket_attrs.find(RGW_ATTR_ACL);

  if (aiter != bucket_attrs.end()) {
    int ret = decode_policy(dpp, cct, aiter->second, policy);
    if (ret < 0)
      return ret;
  } else {
    ldpp_dout(dpp, 0) << "WARNING: couldn't find acl header for bucket, generating default" << dendl;
    // No ACL attribute: synthesize a default owner-only policy from the
    // bucket owner's user record.
    std::unique_ptr<rgw::sal::User> user = driver->get_user(bucket_info.owner);
    /* object exists, but policy is broken */
    int r = user->load_user(dpp, y);
    if (r < 0)
      return r;

    policy->create_default(bucket_info.owner, user->get_display_name());
  }
  return 0;
}
+
// Reads an object's ACL (and optionally its storage class) via a ReadOp.
// A missing ACL attr (-ENODATA) is repaired by synthesizing a default
// policy from the bucket owner. Any other read error is propagated in the
// return value (note: the storage-class lookup below still runs in that
// case, but the original negative 'ret' is what gets returned).
static int get_obj_policy_from_attr(const DoutPrefixProvider *dpp,
                                    CephContext *cct,
                                    rgw::sal::Driver* driver,
                                    RGWBucketInfo& bucket_info,
                                    map<string, bufferlist>& bucket_attrs,
                                    RGWAccessControlPolicy *policy,
                                    string *storage_class,
                                    rgw::sal::Object* obj,
                                    optional_yield y)
{
  bufferlist bl;
  int ret = 0;

  std::unique_ptr<rgw::sal::Object::ReadOp> rop = obj->get_read_op();

  ret = rop->get_attr(dpp, RGW_ATTR_ACL, bl, y);
  if (ret >= 0) {
    ret = decode_policy(dpp, cct, bl, policy);
    if (ret < 0)
      return ret;
  } else if (ret == -ENODATA) {
    /* object exists, but policy is broken */
    ldpp_dout(dpp, 0) << "WARNING: couldn't find acl header for object, generating default" << dendl;
    std::unique_ptr<rgw::sal::User> user = driver->get_user(bucket_info.owner);
    ret = user->load_user(dpp, y);
    if (ret < 0)
      return ret;

    policy->create_default(bucket_info.owner, user->get_display_name());
  }

  // Optionally fetch the storage class; an absent attr just clears the out
  // parameter and does not affect 'ret'.
  if (storage_class) {
    bufferlist scbl;
    int r = rop->get_attr(dpp, RGW_ATTR_STORAGE_CLASS, scbl, y);
    if (r >= 0) {
      *storage_class = scbl.to_str();
    } else {
      storage_class->clear();
    }
  }

  return ret;
}
+
+
+static boost::optional<Policy> get_iam_policy_from_attr(CephContext* cct,
+ map<string, bufferlist>& attrs,
+ const string& tenant) {
+ auto i = attrs.find(RGW_ATTR_IAM_POLICY);
+ if (i != attrs.end()) {
+ return Policy(cct, tenant, i->second, false);
+ } else {
+ return none;
+ }
+}
+
+static boost::optional<PublicAccessBlockConfiguration>
+get_public_access_conf_from_attr(const map<string, bufferlist>& attrs)
+{
+ if (auto aiter = attrs.find(RGW_ATTR_PUBLIC_ACCESS);
+ aiter != attrs.end()) {
+ bufferlist::const_iterator iter{&aiter->second};
+ PublicAccessBlockConfiguration access_conf;
+ try {
+ access_conf.decode(iter);
+ } catch (const buffer::error& e) {
+ return boost::none;
+ }
+ return access_conf;
+ }
+ return boost::none;
+}
+
+vector<Policy> get_iam_user_policy_from_attr(CephContext* cct,
+ map<string, bufferlist>& attrs,
+ const string& tenant) {
+ vector<Policy> policies;
+ if (auto it = attrs.find(RGW_ATTR_USER_POLICY); it != attrs.end()) {
+ bufferlist out_bl = attrs[RGW_ATTR_USER_POLICY];
+ map<string, string> policy_map;
+ decode(policy_map, out_bl);
+ for (auto& it : policy_map) {
+ bufferlist bl = bufferlist::static_from_string(it.second);
+ Policy p(cct, tenant, bl, false);
+ policies.push_back(std::move(p));
+ }
+ }
+ return policies;
+}
+
+static int read_bucket_policy(const DoutPrefixProvider *dpp,
+ rgw::sal::Driver* driver,
+ req_state *s,
+ RGWBucketInfo& bucket_info,
+ map<string, bufferlist>& bucket_attrs,
+ RGWAccessControlPolicy *policy,
+ rgw_bucket& bucket,
+ optional_yield y)
+{
+ if (!s->system_request && bucket_info.flags & BUCKET_SUSPENDED) {
+ ldpp_dout(dpp, 0) << "NOTICE: bucket " << bucket_info.bucket.name
+ << " is suspended" << dendl;
+ return -ERR_USER_SUSPENDED;
+ }
+
+ if (bucket.name.empty()) {
+ return 0;
+ }
+
+ int ret = rgw_op_get_bucket_policy_from_attr(dpp, s->cct, driver, bucket_info, bucket_attrs, policy, y);
+ if (ret == -ENOENT) {
+ ret = -ERR_NO_SUCH_BUCKET;
+ }
+
+ return ret;
+}
+
/*
 * Load an object's ACL (and optionally its storage class) into *acl, and
 * the bucket's IAM policy into 'policy'.
 *
 * For a multipart upload (uploadId query param present and !copy_src) the
 * ACL is read from the upload's meta object rather than the target object.
 *
 * If the object does not exist (-ENOENT), the bucket ACL and the various
 * policies are evaluated for s3ListBucket so the S3-correct error can be
 * returned: -ENOENT when the requester is allowed to list the bucket,
 * -EACCES otherwise.
 */
static int read_obj_policy(const DoutPrefixProvider *dpp,
                           rgw::sal::Driver* driver,
                           req_state *s,
                           RGWBucketInfo& bucket_info,
                           map<string, bufferlist>& bucket_attrs,
                           RGWAccessControlPolicy* acl,
                           string *storage_class,
                           boost::optional<Policy>& policy,
                           rgw::sal::Bucket* bucket,
                           rgw::sal::Object* object,
                           optional_yield y,
                           bool copy_src=false)
{
  string upload_id;
  upload_id = s->info.args.get("uploadId");
  std::unique_ptr<rgw::sal::Object> mpobj;
  rgw_obj obj;

  if (!s->system_request && bucket_info.flags & BUCKET_SUSPENDED) {
    ldpp_dout(dpp, 0) << "NOTICE: bucket " << bucket_info.bucket.name
        << " is suspended" << dendl;
    return -ERR_USER_SUSPENDED;
  }

  // when getting policy info for copy-source obj, upload_id makes no sense.
  // 'copy_src' is used to make this function backward compatible.
  if (!upload_id.empty() && !copy_src) {
    /* multipart upload */
    std::unique_ptr<rgw::sal::MultipartUpload> upload;
    upload = bucket->get_multipart_upload(object->get_name(), upload_id);
    mpobj = upload->get_meta_obj();
    mpobj->set_in_extra_data(true);
    // mpobj owns the meta object for the rest of this function
    object = mpobj.get();
  }
  policy = get_iam_policy_from_attr(s->cct, bucket_attrs, bucket->get_tenant());

  int ret = get_obj_policy_from_attr(dpp, s->cct, driver, bucket_info,
                                     bucket_attrs, acl, storage_class, object,
                                     s->yield);
  if (ret == -ENOENT) {
    /* object does not exist checking the bucket's ACL to make sure
       that we send a proper error code */
    RGWAccessControlPolicy bucket_policy(s->cct);
    ret = rgw_op_get_bucket_policy_from_attr(dpp, s->cct, driver, bucket_info, bucket_attrs, &bucket_policy, y);
    if (ret < 0) {
      return ret;
    }
    const rgw_user& bucket_owner = bucket_policy.get_owner().get_id();
    // owners and admins always learn the object is missing
    if (bucket_owner.compare(s->user->get_id()) != 0 &&
        ! s->auth.identity->is_admin_of(bucket_owner)) {
      // identity/session/bucket policies are each given a chance to
      // explicitly allow (-> -ENOENT) or deny (-> -EACCES) s3ListBucket
      auto r = eval_identity_or_session_policies(dpp, s->iam_user_policies, s->env,
                                  rgw::IAM::s3ListBucket, ARN(bucket->get_key()));
      if (r == Effect::Allow)
        return -ENOENT;
      if (r == Effect::Deny)
        return -EACCES;
      if (policy) {
        ARN b_arn(bucket->get_key());
        r = policy->eval(s->env, *s->auth.identity, rgw::IAM::s3ListBucket, b_arn);
        if (r == Effect::Allow)
          return -ENOENT;
        if (r == Effect::Deny)
          return -EACCES;
      }
      if (! s->session_policies.empty()) {
        r = eval_identity_or_session_policies(dpp, s->session_policies, s->env,
                                  rgw::IAM::s3ListBucket, ARN(bucket->get_key()));
        if (r == Effect::Allow)
          return -ENOENT;
        if (r == Effect::Deny)
          return -EACCES;
      }
      // no policy decided: fall back to the bucket ACL's READ permission
      if (! bucket_policy.verify_permission(s, *s->auth.identity, s->perm_mask, RGW_PERM_READ))
        ret = -EACCES;
      else
        ret = -ENOENT;
    } else {
      ret = -ENOENT;
    }
  }

  return ret;
}
+
/**
 * Populate req_state with the ACLs and policies needed to authorize the
 * request: the account/user ACL (Swift dialect only), the bucket ACL, the
 * bucket's IAM policy, and the requester's managed user policies.
 * Also resolves s->bucket, placement, zonegroup and redirect information.
 * s: The req_state to draw information from.
 * Returns: 0 on success, -ERR# otherwise.
 */
int rgw_build_bucket_policies(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, req_state* s, optional_yield y)
{
  int ret = 0;

  // system requests may pin an exact bucket instance/shard
  string bi = s->info.args.get(RGW_SYS_PARAM_PREFIX "bucket-instance");
  if (!bi.empty()) {
    // note: overwrites s->bucket_name, may include a tenant/
    ret = rgw_bucket_parse_bucket_instance(bi, &s->bucket_name, &s->bucket_instance_id, &s->bucket_instance_shard_id);
    if (ret < 0) {
      return ret;
    }
  }

  // pick the ACL flavor matching the API dialect
  if(s->dialect.compare("s3") == 0) {
    s->bucket_acl = std::make_unique<RGWAccessControlPolicy_S3>(s->cct);
  } else if(s->dialect.compare("swift") == 0) {
    /* We aren't allocating the account policy for those operations using
     * the Swift's infrastructure that don't really need req_state::user.
     * Typical example here is the implementation of /info. */
    if (!s->user->get_id().empty()) {
      s->user_acl = std::make_unique<RGWAccessControlPolicy_SWIFTAcct>(s->cct);
    }
    s->bucket_acl = std::make_unique<RGWAccessControlPolicy_SWIFT>(s->cct);
  } else {
    s->bucket_acl = std::make_unique<RGWAccessControlPolicy>(s->cct);
  }

  /* check if copy source is within the current domain */
  if (!s->src_bucket_name.empty()) {
    std::unique_ptr<rgw::sal::Bucket> src_bucket;
    ret = driver->get_bucket(dpp, nullptr,
                             rgw_bucket_key(s->src_tenant_name,
                                            s->src_bucket_name),
                             &src_bucket, y);
    if (ret == 0) {
      string& zonegroup = src_bucket->get_info().zonegroup;
      s->local_source = driver->get_zone()->get_zonegroup().equals(zonegroup);
    }
  }

  // identity whose ACL attrs we read below; may be replaced by the
  // bucket owner once the bucket is resolved
  struct {
    rgw_user uid;
    std::string display_name;
  } acct_acl_user = {
    s->user->get_id(),
    s->user->get_display_name(),
  };

  if (!s->bucket_name.empty()) {
    s->bucket_exists = true;

    /* This is the only place that s->bucket is created.  It should never be
     * overwritten. */
    ret = driver->get_bucket(dpp, s->user.get(), rgw_bucket(s->bucket_tenant, s->bucket_name, s->bucket_instance_id), &s->bucket, y);
    if (ret < 0) {
      if (ret != -ENOENT) {
        string bucket_log;
        bucket_log = rgw_make_bucket_entry_name(s->bucket_tenant, s->bucket_name);
        ldpp_dout(dpp, 0) << "NOTICE: couldn't get bucket from bucket_name (name="
            << bucket_log << ")" << dendl;
        return ret;
      }
      s->bucket_exists = false;
      return -ERR_NO_SUCH_BUCKET;
    }
    if (!rgw::sal::Object::empty(s->object.get())) {
      s->object->set_bucket(s->bucket.get());
    }

    s->bucket_mtime = s->bucket->get_modification_time();
    s->bucket_attrs = s->bucket->get_attrs();
    ret = read_bucket_policy(dpp, driver, s, s->bucket->get_info(),
                             s->bucket->get_attrs(),
                             s->bucket_acl.get(), s->bucket->get_key(), y);
    // from here on, account ACLs are read for the bucket owner
    acct_acl_user = {
      s->bucket->get_info().owner,
      s->bucket_acl->get_owner().get_display_name(),
    };

    s->bucket_owner = s->bucket_acl->get_owner();

    std::unique_ptr<rgw::sal::ZoneGroup> zonegroup;
    int r = driver->get_zonegroup(s->bucket->get_info().zonegroup, &zonegroup);
    if (!r) {
      s->zonegroup_endpoint = zonegroup->get_endpoint();
      s->zonegroup_name = zonegroup->get_name();
    }
    // remember the zonegroup-lookup failure, but don't mask an earlier one
    if (r < 0 && ret == 0) {
      ret = r;
    }

    if (!driver->get_zone()->get_zonegroup().equals(s->bucket->get_info().zonegroup)) {
      ldpp_dout(dpp, 0) << "NOTICE: request for data in a different zonegroup ("
          << s->bucket->get_info().zonegroup << " != "
          << driver->get_zone()->get_zonegroup().get_id() << ")" << dendl;
      /* we now need to make sure that the operation actually requires copy source, that is
       * it's a copy operation
       */
      if (driver->get_zone()->get_zonegroup().is_master_zonegroup() && s->system_request) {
        /*If this is the master, don't redirect*/
      } else if (s->op_type == RGW_OP_GET_BUCKET_LOCATION ) {
        /* If op is get bucket location, don't redirect */
      } else if (!s->local_source ||
                 (s->op != OP_PUT && s->op != OP_COPY) ||
                 rgw::sal::Object::empty(s->object.get())) {
        return -ERR_PERMANENT_REDIRECT;
      }
    }

    /* init dest placement */
    s->dest_placement.storage_class = s->info.storage_class;
    s->dest_placement.inherit_from(s->bucket->get_placement_rule());

    if (!driver->valid_placement(s->dest_placement)) {
      ldpp_dout(dpp, 0) << "NOTICE: invalid dest placement: " << s->dest_placement.to_str() << dendl;
      return -EINVAL;
    }

    s->bucket_access_conf = get_public_access_conf_from_attr(s->bucket->get_attrs());
  }

  /* handle user ACL only for those APIs which support it */
  if (s->user_acl) {
    std::unique_ptr<rgw::sal::User> acl_user = driver->get_user(acct_acl_user.uid);

    ret = acl_user->read_attrs(dpp, y);
    if (!ret) {
      ret = get_user_policy_from_attr(dpp, s->cct, acl_user->get_attrs(), *s->user_acl);
    }
    if (-ENOENT == ret) {
      /* In already existing clusters users won't have ACL. In such case
       * assuming that only account owner has the rights seems to be
       * reasonable. That allows to have only one verification logic.
       * NOTE: there is small compatibility kludge for global, empty tenant:
       *  1. if we try to reach an existing bucket, its owner is considered
       *     as account owner.
       *  2. otherwise account owner is identity stored in s->user->user_id.  */
      s->user_acl->create_default(acct_acl_user.uid,
                                  acct_acl_user.display_name);
      ret = 0;
    } else if (ret < 0) {
      ldpp_dout(dpp, 0) << "NOTICE: couldn't get user attrs for handling ACL "
          "(user_id=" << s->user->get_id() << ", ret=" << ret << ")" << dendl;
      return ret;
    }
  }
  // We don't need user policies in case of STS token returned by AssumeRole,
  // hence the check for user type
  if (! s->user->get_id().empty() && s->auth.identity->get_identity_type() != TYPE_ROLE) {
    try {
      ret = s->user->read_attrs(dpp, y);
      if (ret == 0) {
        auto user_policies = get_iam_user_policy_from_attr(s->cct,
                                                           s->user->get_attrs(),
                                                           s->user->get_tenant());
        s->iam_user_policies.insert(s->iam_user_policies.end(),
                                    std::make_move_iterator(user_policies.begin()),
                                    std::make_move_iterator(user_policies.end()));
      } else {
        if (ret == -ENOENT)
          ret = 0;   // a user without attrs simply has no managed policies
        else ret = -EACCES;
      }
    } catch (const std::exception& e) {
      ldpp_dout(dpp, -1) << "Error reading IAM User Policy: " << e.what() << dendl;
      ret = -EACCES;
    }
  }

  try {
    s->iam_policy = get_iam_policy_from_attr(s->cct, s->bucket_attrs, s->bucket_tenant);
  } catch (const std::exception& e) {
    // Really this is a can't happen condition. We parse the policy
    // when it's given to us, so perhaps we should abort or otherwise
    // raise bloody murder.
    ldpp_dout(dpp, 0) << "Error reading IAM Policy: " << e.what() << dendl;
    ret = -EACCES;
  }

  bool success = driver->get_zone()->get_redirect_endpoint(&s->redirect_zone_endpoint);
  if (success) {
    ldpp_dout(dpp, 20) << "redirect_zone_endpoint=" << s->redirect_zone_endpoint << dendl;
  }

  return ret;
}
+
+/**
+ * Get the AccessControlPolicy for a bucket or object off of disk.
+ * s: The req_state to draw information from.
+ * only_bucket: If true, reads the bucket ACL rather than the object ACL.
+ * Returns: 0 on success, -ERR# otherwise.
+ */
+int rgw_build_object_policies(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver,
+ req_state *s, bool prefetch_data, optional_yield y)
+{
+ int ret = 0;
+
+ if (!rgw::sal::Object::empty(s->object.get())) {
+ if (!s->bucket_exists) {
+ return -ERR_NO_SUCH_BUCKET;
+ }
+ s->object_acl = std::make_unique<RGWAccessControlPolicy>(s->cct);
+
+ s->object->set_atomic();
+ if (prefetch_data) {
+ s->object->set_prefetch_data();
+ }
+ ret = read_obj_policy(dpp, driver, s, s->bucket->get_info(), s->bucket_attrs,
+ s->object_acl.get(), nullptr, s->iam_policy, s->bucket.get(),
+ s->object.get(), y);
+ }
+
+ return ret;
+}
+
+static int rgw_iam_remove_objtags(const DoutPrefixProvider *dpp, req_state* s, rgw::sal::Object* object, bool has_existing_obj_tag, bool has_resource_tag) {
+ object->set_atomic();
+ int op_ret = object->get_obj_attrs(s->yield, dpp);
+ if (op_ret < 0)
+ return op_ret;
+ rgw::sal::Attrs attrs = object->get_attrs();
+ auto tags = attrs.find(RGW_ATTR_TAGS);
+ if (tags != attrs.end()) {
+ RGWObjTags tagset;
+ try {
+ auto bliter = tags->second.cbegin();
+ tagset.decode(bliter);
+ } catch (buffer::error& err) {
+ ldpp_dout(s, 0) << "ERROR: caught buffer::error, couldn't decode TagSet" << dendl;
+ return -EIO;
+ }
+ for (auto& tag: tagset.get_tags()) {
+ if (has_existing_obj_tag) {
+ vector<std::unordered_multimap<string, string>::iterator> iters;
+ string key = "s3:ExistingObjectTag/" + tag.first;
+ auto result = s->env.equal_range(key);
+ for (auto& it = result.first; it != result.second; ++it)
+ {
+ if (tag.second == it->second) {
+ iters.emplace_back(it);
+ }
+ }
+ for (auto& it : iters) {
+ s->env.erase(it);
+ }
+ }//end if has_existing_obj_tag
+ if (has_resource_tag) {
+ vector<std::unordered_multimap<string, string>::iterator> iters;
+ string key = "s3:ResourceTag/" + tag.first;
+ auto result = s->env.equal_range(key);
+ for (auto& it = result.first; it != result.second; ++it)
+ {
+ if (tag.second == it->second) {
+ iters.emplace_back(it);
+ }
+ }
+ for (auto& it : iters) {
+ s->env.erase(it);
+ }
+ }//end if has_resource_tag
+ }
+ }
+ return 0;
+}
+
+void rgw_add_to_iam_environment(rgw::IAM::Environment& e, std::string_view key, std::string_view val){
+ // This variant just adds non empty key pairs to IAM env., values can be empty
+ // in certain cases like tagging
+ if (!key.empty())
+ e.emplace(key,val);
+}
+
+static int rgw_iam_add_tags_from_bl(req_state* s, bufferlist& bl, bool has_existing_obj_tag=false, bool has_resource_tag=false){
+ RGWObjTags& tagset = s->tagset;
+ try {
+ auto bliter = bl.cbegin();
+ tagset.decode(bliter);
+ } catch (buffer::error& err) {
+ ldpp_dout(s, 0) << "ERROR: caught buffer::error, couldn't decode TagSet" << dendl;
+ return -EIO;
+ }
+
+ for (const auto& tag: tagset.get_tags()){
+ if (has_existing_obj_tag)
+ rgw_add_to_iam_environment(s->env, "s3:ExistingObjectTag/" + tag.first, tag.second);
+ if (has_resource_tag)
+ rgw_add_to_iam_environment(s->env, "s3:ResourceTag/" + tag.first, tag.second);
+ }
+ return 0;
+}
+
+static int rgw_iam_add_objtags(const DoutPrefixProvider *dpp, req_state* s, rgw::sal::Object* object, bool has_existing_obj_tag, bool has_resource_tag) {
+ object->set_atomic();
+ int op_ret = object->get_obj_attrs(s->yield, dpp);
+ if (op_ret < 0)
+ return op_ret;
+ rgw::sal::Attrs attrs = object->get_attrs();
+ auto tags = attrs.find(RGW_ATTR_TAGS);
+ if (tags != attrs.end()){
+ return rgw_iam_add_tags_from_bl(s, tags->second, has_existing_obj_tag, has_resource_tag);
+ }
+ return 0;
+}
+
+static int rgw_iam_add_objtags(const DoutPrefixProvider *dpp, req_state* s, bool has_existing_obj_tag, bool has_resource_tag) {
+ if (!rgw::sal::Object::empty(s->object.get())) {
+ return rgw_iam_add_objtags(dpp, s, s->object.get(), has_existing_obj_tag, has_resource_tag);
+ }
+ return 0;
+}
+
+static int rgw_iam_add_buckettags(const DoutPrefixProvider *dpp, req_state* s, rgw::sal::Bucket* bucket) {
+ rgw::sal::Attrs attrs = bucket->get_attrs();
+ auto tags = attrs.find(RGW_ATTR_TAGS);
+ if (tags != attrs.end()) {
+ return rgw_iam_add_tags_from_bl(s, tags->second, false, true);
+ }
+ return 0;
+}
+
// Convenience overload operating on the request's bucket.
static int rgw_iam_add_buckettags(const DoutPrefixProvider *dpp, req_state* s) {
  return rgw_iam_add_buckettags(dpp, s, s->bucket.get());
}
+
+static void rgw_iam_add_crypt_attrs(rgw::IAM::Environment& e,
+ const meta_map_t& attrs)
+{
+ constexpr auto encrypt_attr = "x-amz-server-side-encryption";
+ constexpr auto s3_encrypt_attr = "s3:x-amz-server-side-encryption";
+ if (auto h = attrs.find(encrypt_attr); h != attrs.end()) {
+ rgw_add_to_iam_environment(e, s3_encrypt_attr, h->second);
+ }
+
+ constexpr auto kms_attr = "x-amz-server-side-encryption-aws-kms-key-id";
+ constexpr auto s3_kms_attr = "s3:x-amz-server-side-encryption-aws-kms-key-id";
+ if (auto h = attrs.find(kms_attr); h != attrs.end()) {
+ rgw_add_to_iam_environment(e, s3_kms_attr, h->second);
+ }
+}
+
+static std::tuple<bool, bool> rgw_check_policy_condition(const DoutPrefixProvider *dpp,
+ boost::optional<rgw::IAM::Policy> iam_policy,
+ boost::optional<vector<rgw::IAM::Policy>> identity_policies,
+ boost::optional<vector<rgw::IAM::Policy>> session_policies,
+ bool check_obj_exist_tag=true) {
+ bool has_existing_obj_tag = false, has_resource_tag = false;
+ bool iam_policy_s3_exist_tag = false, iam_policy_s3_resource_tag = false;
+ if (iam_policy) {
+ if (check_obj_exist_tag) {
+ iam_policy_s3_exist_tag = iam_policy->has_partial_conditional(S3_EXISTING_OBJTAG);
+ }
+ iam_policy_s3_resource_tag = iam_policy->has_partial_conditional(S3_RESOURCE_TAG) || iam_policy->has_partial_conditional_value(S3_RUNTIME_RESOURCE_VAL);
+ }
+
+ bool identity_policy_s3_exist_tag = false, identity_policy_s3_resource_tag = false;
+ if (identity_policies) {
+ for (auto& identity_policy : identity_policies.get()) {
+ if (check_obj_exist_tag) {
+ if (identity_policy.has_partial_conditional(S3_EXISTING_OBJTAG))
+ identity_policy_s3_exist_tag = true;
+ }
+ if (identity_policy.has_partial_conditional(S3_RESOURCE_TAG) || identity_policy.has_partial_conditional_value(S3_RUNTIME_RESOURCE_VAL))
+ identity_policy_s3_resource_tag = true;
+ if (identity_policy_s3_exist_tag && identity_policy_s3_resource_tag) // check all policies till both are set to true
+ break;
+ }
+ }
+
+ bool session_policy_s3_exist_tag = false, session_policy_s3_resource_flag = false;
+ if (session_policies) {
+ for (auto& session_policy : session_policies.get()) {
+ if (check_obj_exist_tag) {
+ if (session_policy.has_partial_conditional(S3_EXISTING_OBJTAG))
+ session_policy_s3_exist_tag = true;
+ }
+ if (session_policy.has_partial_conditional(S3_RESOURCE_TAG) || session_policy.has_partial_conditional_value(S3_RUNTIME_RESOURCE_VAL))
+ session_policy_s3_resource_flag = true;
+ if (session_policy_s3_exist_tag && session_policy_s3_resource_flag)
+ break;
+ }
+ }
+
+ has_existing_obj_tag = iam_policy_s3_exist_tag || identity_policy_s3_exist_tag || session_policy_s3_exist_tag;
+ has_resource_tag = iam_policy_s3_resource_tag || identity_policy_s3_resource_tag || session_policy_s3_resource_flag;
+ return make_tuple(has_existing_obj_tag, has_resource_tag);
+}
+
// Convenience overload: evaluate the request's bucket IAM policy,
// identity (user) policies and session policies from req_state.
static std::tuple<bool, bool> rgw_check_policy_condition(const DoutPrefixProvider *dpp, req_state* s, bool check_obj_exist_tag=true) {
  return rgw_check_policy_condition(dpp, s->iam_policy, s->iam_user_policies, s->session_policies, check_obj_exist_tag);
}
+
+static void rgw_add_grant_to_iam_environment(rgw::IAM::Environment& e, req_state *s){
+
+ using header_pair_t = std::pair <const char*, const char*>;
+ static const std::initializer_list <header_pair_t> acl_header_conditionals {
+ {"HTTP_X_AMZ_GRANT_READ", "s3:x-amz-grant-read"},
+ {"HTTP_X_AMZ_GRANT_WRITE", "s3:x-amz-grant-write"},
+ {"HTTP_X_AMZ_GRANT_READ_ACP", "s3:x-amz-grant-read-acp"},
+ {"HTTP_X_AMZ_GRANT_WRITE_ACP", "s3:x-amz-grant-write-acp"},
+ {"HTTP_X_AMZ_GRANT_FULL_CONTROL", "s3:x-amz-grant-full-control"}
+ };
+
+ if (s->has_acl_header){
+ for (const auto& c: acl_header_conditionals){
+ auto hdr = s->info.env->get(c.first);
+ if(hdr) {
+ e.emplace(c.second, hdr);
+ }
+ }
+ }
+}
+
+void rgw_build_iam_environment(rgw::sal::Driver* driver,
+ req_state* s)
+{
+ const auto& m = s->info.env->get_map();
+ auto t = ceph::real_clock::now();
+ s->env.emplace("aws:CurrentTime", std::to_string(ceph::real_clock::to_time_t(t)));
+ s->env.emplace("aws:EpochTime", ceph::to_iso_8601(t));
+ // TODO: This is fine for now, but once we have STS we'll need to
+ // look and see. Also this won't work with the IdentityApplier
+ // model, since we need to know the actual credential.
+ s->env.emplace("aws:PrincipalType", "User");
+
+ auto i = m.find("HTTP_REFERER");
+ if (i != m.end()) {
+ s->env.emplace("aws:Referer", i->second);
+ }
+
+ if (rgw_transport_is_secure(s->cct, *s->info.env)) {
+ s->env.emplace("aws:SecureTransport", "true");
+ }
+
+ const auto remote_addr_param = s->cct->_conf->rgw_remote_addr_param;
+ if (remote_addr_param.length()) {
+ i = m.find(remote_addr_param);
+ } else {
+ i = m.find("REMOTE_ADDR");
+ }
+ if (i != m.end()) {
+ const string* ip = &(i->second);
+ string temp;
+ if (remote_addr_param == "HTTP_X_FORWARDED_FOR") {
+ const auto comma = ip->find(',');
+ if (comma != string::npos) {
+ temp.assign(*ip, 0, comma);
+ ip = &temp;
+ }
+ }
+ s->env.emplace("aws:SourceIp", *ip);
+ }
+
+ i = m.find("HTTP_USER_AGENT"); {
+ if (i != m.end())
+ s->env.emplace("aws:UserAgent", i->second);
+ }
+
+ if (s->user) {
+ // What to do about aws::userid? One can have multiple access
+ // keys so that isn't really suitable. Do we have a durable
+ // identifier that can persist through name changes?
+ s->env.emplace("aws:username", s->user->get_id().id);
+ }
+
+ i = m.find("HTTP_X_AMZ_SECURITY_TOKEN");
+ if (i != m.end()) {
+ s->env.emplace("sts:authentication", "true");
+ } else {
+ s->env.emplace("sts:authentication", "false");
+ }
+}
+
/*
 * GET on CloudTiered objects is processed only when sent from the sync client.
 * In all other cases, fail with `ERR_INVALID_OBJECT_STATE`.
 *
 * For the sync client, the cloud-tier type/config attributes are copied
 * out of the manifest into 'attrs' so they replicate with the object.
 * Objects without a manifest, or whose manifest is not "cloud-s3", are
 * left untouched and 0 is returned.
 */
int handle_cloudtier_obj(rgw::sal::Attrs& attrs, bool sync_cloudtiered) {
  int op_ret = 0;
  auto attr_iter = attrs.find(RGW_ATTR_MANIFEST);
  if (attr_iter != attrs.end()) {
    RGWObjManifest m;
    try {
      decode(m, attr_iter->second);
      if (m.get_tier_type() == "cloud-s3") {
        if (!sync_cloudtiered) {
          /* XXX: Instead send presigned redirect or read-through */
          op_ret = -ERR_INVALID_OBJECT_STATE;
        } else { // fetch object for sync and set cloud_tier attrs
          bufferlist t, t_tier;
          RGWObjTier tier_config;
          m.get_tier_config(&tier_config);

          t.append("cloud-s3");
          attrs[RGW_ATTR_CLOUD_TIER_TYPE] = t;
          encode(tier_config, t_tier);
          attrs[RGW_ATTR_CLOUD_TIER_CONFIG] = t_tier;
        }
      }
    } catch (const buffer::end_of_buffer&) {
      // ignore empty manifest; it's not cloud-tiered
    } catch (const std::exception& e) {
      // NOTE(review): any other manifest-decode failure is silently
      // swallowed and the object is treated as not cloud-tiered --
      // confirm this is intended rather than surfacing an error.
    }
  }

  return op_ret;
}
+
/* Common pre-exec step for bucket/object operations: emit the
 * "100 Continue" interim response when the client sent Expect:
 * 100-continue, then dump bucket-derived state for the response. */
void rgw_bucket_object_pre_exec(req_state *s)
{
  if (s->expect_cont)
    dump_continue(s);

  dump_bucket_from_state(s);
}
+
+// So! Now and then when we try to update bucket information, the
+// bucket has changed during the course of the operation. (Or we have
+// a cache consistency problem that Watch/Notify isn't ruling out
+// completely.)
+//
+// When this happens, we need to update the bucket info and try
+// again. We have, however, to try the right *part* again. We can't
+// simply re-send, since that will obliterate the previous update.
+//
+// Thus, callers of this function should include everything that
+// merges information to be changed into the bucket information as
+// well as the call to set it.
+//
+// The called function must return an integer, negative on error. In
+// general, they should just return op_ret.
+namespace {
+template<typename F>
+int retry_raced_bucket_write(const DoutPrefixProvider *dpp, rgw::sal::Bucket* b, const F& f) {
+ auto r = f();
+ for (auto i = 0u; i < 15u && r == -ECANCELED; ++i) {
+ r = b->try_refresh_info(dpp, nullptr);
+ if (r >= 0) {
+ r = f();
+ }
+ }
+ return r;
+}
+}
+
+
+int RGWGetObj::verify_permission(optional_yield y)
+{
+ s->object->set_atomic();
+
+ if (prefetch_data()) {
+ s->object->set_prefetch_data();
+ }
+
+ auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s);
+ if (has_s3_existing_tag || has_s3_resource_tag)
+ rgw_iam_add_objtags(this, s, has_s3_existing_tag, has_s3_resource_tag);
+
+ if (torrent.get_flag()) {
+ if (s->object->get_instance().empty()) {
+ action = rgw::IAM::s3GetObjectTorrent;
+ } else {
+ action = rgw::IAM::s3GetObjectVersionTorrent;
+ }
+ } else {
+ if (s->object->get_instance().empty()) {
+ action = rgw::IAM::s3GetObject;
+ } else {
+ action = rgw::IAM::s3GetObjectVersion;
+ }
+ }
+
+ if (!verify_object_permission(this, s, action)) {
+ return -EACCES;
+ }
+
+ if (s->bucket->get_info().obj_lock_enabled()) {
+ get_retention = verify_object_permission(this, s, rgw::IAM::s3GetObjectRetention);
+ get_legal_hold = verify_object_permission(this, s, rgw::IAM::s3GetObjectLegalHold);
+ }
+
+ return 0;
+}
+
+RGWOp::~RGWOp(){};
+
/* Check that the authenticated user's op_mask covers this operation's
 * required mask, and reject modifying operations on a non-writeable zone
 * for non-system users. Returns 0 or -EPERM. */
int RGWOp::verify_op_mask()
{
  uint32_t required_mask = op_mask();

  ldpp_dout(this, 20) << "required_mask= " << required_mask
      << " user.op_mask=" << s->user->get_info().op_mask << dendl;

  // every bit of the required mask must be granted to the user
  if ((s->user->get_info().op_mask & required_mask) != required_mask) {
    return -EPERM;
  }

  if (!s->system_request && (required_mask & RGW_OP_TYPE_MODIFY) && !driver->get_zone()->is_writeable()) {
    ldpp_dout(this, 5) << "NOTICE: modify request to a read-only zone by a "
        "non-system user, permission denied"  << dendl;
    return -EPERM;
  }

  return 0;
}
+
+int RGWGetObjTags::verify_permission(optional_yield y)
+{
+ auto iam_action = s->object->get_instance().empty()?
+ rgw::IAM::s3GetObjectTagging:
+ rgw::IAM::s3GetObjectVersionTagging;
+
+ auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s);
+ if (has_s3_existing_tag || has_s3_resource_tag)
+ rgw_iam_add_objtags(this, s, has_s3_existing_tag, has_s3_resource_tag);
+ if (!verify_object_permission(this, s,iam_action))
+ return -EACCES;
+
+ return 0;
+}
+
// Standard pre-exec: 100-continue handling plus bucket state dump.
void RGWGetObjTags::pre_exec()
{
  rgw_bucket_object_pre_exec(s);
}
+
+void RGWGetObjTags::execute(optional_yield y)
+{
+ rgw::sal::Attrs attrs;
+
+ s->object->set_atomic();
+
+ op_ret = s->object->get_obj_attrs(y, this);
+
+ if (op_ret == 0){
+ attrs = s->object->get_attrs();
+ auto tags = attrs.find(RGW_ATTR_TAGS);
+ if(tags != attrs.end()){
+ has_tags = true;
+ tags_bl.append(tags->second);
+ }
+ }
+ send_response_data(tags_bl);
+}
+
+int RGWPutObjTags::verify_permission(optional_yield y)
+{
+ auto iam_action = s->object->get_instance().empty() ?
+ rgw::IAM::s3PutObjectTagging:
+ rgw::IAM::s3PutObjectVersionTagging;
+
+ //Using buckets tags for authorization makes more sense.
+ auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, true);
+ if (has_s3_existing_tag)
+ rgw_iam_add_objtags(this, s, true, false);
+ if (has_s3_resource_tag)
+ rgw_iam_add_buckettags(this, s);
+ if (!verify_object_permission(this, s,iam_action))
+ return -EACCES;
+ return 0;
+}
+
// Store the parsed tag set on the target object.
void RGWPutObjTags::execute(optional_yield y)
{
  op_ret = get_params(y);
  if (op_ret < 0)
    return;

  if (rgw::sal::Object::empty(s->object.get())){
    op_ret= -EINVAL; // we only support tagging on existing objects
    return;
  }

  s->object->set_atomic();
  op_ret = s->object->modify_obj_attrs(RGW_ATTR_TAGS, tags_bl, y, this);
  if (op_ret == -ECANCELED){
    // raced with a concurrent update of the object
    op_ret = -ERR_TAG_CONFLICT;
  }
}
+
// Standard pre-exec: 100-continue handling plus bucket state dump.
void RGWDeleteObjTags::pre_exec()
{
  rgw_bucket_object_pre_exec(s);
}
+
+
+int RGWDeleteObjTags::verify_permission(optional_yield y)
+{
+ if (!rgw::sal::Object::empty(s->object.get())) {
+ auto iam_action = s->object->get_instance().empty() ?
+ rgw::IAM::s3DeleteObjectTagging:
+ rgw::IAM::s3DeleteObjectVersionTagging;
+
+ auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s);
+ if (has_s3_existing_tag || has_s3_resource_tag)
+ rgw_iam_add_objtags(this, s, has_s3_existing_tag, has_s3_resource_tag);
+ if (!verify_object_permission(this, s, iam_action))
+ return -EACCES;
+ }
+ return 0;
+}
+
// Remove the tag-set attribute from the target object, if one was given.
void RGWDeleteObjTags::execute(optional_yield y)
{
  if (rgw::sal::Object::empty(s->object.get()))
    return;

  op_ret = s->object->delete_obj_attrs(this, RGW_ATTR_TAGS, y);
}
+
+int RGWGetBucketTags::verify_permission(optional_yield y)
+{
+ auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false);
+ if (has_s3_resource_tag)
+ rgw_iam_add_buckettags(this, s);
+
+ if (!verify_bucket_permission(this, s, rgw::IAM::s3GetBucketTagging)) {
+ return -EACCES;
+ }
+
+ return 0;
+}
+
// Standard pre-exec: 100-continue handling plus bucket state dump.
void RGWGetBucketTags::pre_exec()
{
  rgw_bucket_object_pre_exec(s);
}
+
+void RGWGetBucketTags::execute(optional_yield y)
+{
+ auto iter = s->bucket_attrs.find(RGW_ATTR_TAGS);
+ if (iter != s->bucket_attrs.end()) {
+ has_tags = true;
+ tags_bl.append(iter->second);
+ } else {
+ op_ret = -ERR_NO_SUCH_TAG_SET;
+ }
+ send_response_data(tags_bl);
+}
+
// Authorize setting the bucket's tag set (bucket-level op, so only
// resource tags feed the IAM environment).
int RGWPutBucketTags::verify_permission(optional_yield y) {
  auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false);
  if (has_s3_resource_tag)
    rgw_iam_add_buckettags(this, s);

  return verify_bucket_owner_or_policy(s, rgw::IAM::s3PutBucketTagging);
}
+
+void RGWPutBucketTags::execute(optional_yield y)
+{
+
+ op_ret = get_params(this, y);
+ if (op_ret < 0)
+ return;
+
+ op_ret = driver->forward_request_to_master(this, s->user.get(), nullptr, in_data, nullptr, s->info, y);
+ if (op_ret < 0) {
+ ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
+ }
+
+ op_ret = retry_raced_bucket_write(this, s->bucket.get(), [this, y] {
+ rgw::sal::Attrs attrs = s->bucket->get_attrs();
+ attrs[RGW_ATTR_TAGS] = tags_bl;
+ return s->bucket->merge_and_store_attrs(this, attrs, y);
+ });
+
+}
+
// Standard pre-exec: 100-continue handling plus bucket state dump.
void RGWDeleteBucketTags::pre_exec()
{
  rgw_bucket_object_pre_exec(s);
}
+
int RGWDeleteBucketTags::verify_permission(optional_yield y)
{
  auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false);
  if (has_s3_resource_tag)
    rgw_iam_add_buckettags(this, s);

  // DeleteBucketTagging is intentionally gated on s3:PutBucketTagging:
  // AWS defines no separate delete permission for bucket tagging.
  return verify_bucket_owner_or_policy(s, rgw::IAM::s3PutBucketTagging);
}
+
// Remove the bucket's tag set: forward to the master zone first, then
// erase the attribute locally with the raced-write retry helper.
void RGWDeleteBucketTags::execute(optional_yield y)
{
  bufferlist in_data;
  op_ret = driver->forward_request_to_master(this, s->user.get(), nullptr, in_data, nullptr, s->info, y);
  if (op_ret < 0) {
    ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
    return;
  }

  // bucket info may change concurrently; retry on -ECANCELED
  op_ret = retry_raced_bucket_write(this, s->bucket.get(), [this, y] {
    rgw::sal::Attrs attrs = s->bucket->get_attrs();
    attrs.erase(RGW_ATTR_TAGS);
    op_ret = s->bucket->merge_and_store_attrs(this, attrs, y);
    if (op_ret < 0) {
      ldpp_dout(this, 0) << "RGWDeleteBucketTags() failed to remove RGW_ATTR_TAGS on bucket="
			 << s->bucket->get_name()
			 << " returned err= " << op_ret << dendl;
    }
    return op_ret;
  });
}
+
+int RGWGetBucketReplication::verify_permission(optional_yield y)
+{
+ auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false);
+ if (has_s3_resource_tag)
+ rgw_iam_add_buckettags(this, s);
+
+ if (!verify_bucket_permission(this, s, rgw::IAM::s3GetReplicationConfiguration)) {
+ return -EACCES;
+ }
+
+ return 0;
+}
+
// Standard pre-exec: 100-continue handling plus bucket state dump.
void RGWGetBucketReplication::pre_exec()
{
  rgw_bucket_object_pre_exec(s);
}
+
// All the work happens during response serialization.
void RGWGetBucketReplication::execute(optional_yield y)
{
  send_response_data();
}
+
// Authorize writing the bucket's replication configuration.
int RGWPutBucketReplication::verify_permission(optional_yield y) {
  auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false);
  if (has_s3_resource_tag)
    rgw_iam_add_buckettags(this, s);
  return verify_bucket_owner_or_policy(s, rgw::IAM::s3PutReplicationConfiguration);
}
+
// Merge the parsed replication groups into the bucket's sync policy:
// forward to the master zone, then update bucket info locally with the
// raced-write retry helper.
void RGWPutBucketReplication::execute(optional_yield y) {

  op_ret = get_params(y);
  if (op_ret < 0)
    return;

  op_ret = driver->forward_request_to_master(this, s->user.get(), nullptr, in_data, nullptr, s->info, y);
  if (op_ret < 0) {
    ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
    return;
  }

  op_ret = retry_raced_bucket_write(this, s->bucket.get(), [this] {
    // start from the existing policy (if any) so unrelated groups survive
    auto sync_policy = (s->bucket->get_info().sync_policy ? *s->bucket->get_info().sync_policy : rgw_sync_policy_info());

    // upsert each configured group by id
    for (auto& group : sync_policy_groups) {
      sync_policy.groups[group.id] = group;
    }

    s->bucket->get_info().set_sync_policy(std::move(sync_policy));

    int ret = s->bucket->put_info(this, false, real_time());
    if (ret < 0) {
      ldpp_dout(this, 0) << "ERROR: put_bucket_instance_info (bucket=" << s->bucket << ") returned ret=" << ret << dendl;
      return ret;
    }

    return 0;
  });
}
+
+// Standard per-request pre-execution hook shared by bucket/object ops.
+void RGWDeleteBucketReplication::pre_exec()
+{
+  rgw_bucket_object_pre_exec(s);
+}
+
+int RGWDeleteBucketReplication::verify_permission(optional_yield y)
+{
+  // Bucket tags may be referenced by policy conditions; load them if needed.
+  const auto [has_s3_existing_tag, has_s3_resource_tag] =
+      rgw_check_policy_condition(this, s, false);
+  if (has_s3_resource_tag) {
+    rgw_iam_add_buckettags(this, s);
+  }
+
+  return verify_bucket_owner_or_policy(s, rgw::IAM::s3DeleteReplicationConfiguration);
+}
+
+void RGWDeleteBucketReplication::execute(optional_yield y)
+{
+  // Forward to the master zone first (empty body): metadata mutations must
+  // originate there in a multisite deployment.
+  bufferlist in_data;
+  op_ret = driver->forward_request_to_master(this, s->user.get(), nullptr, in_data, nullptr, s->info, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
+    return;
+  }
+
+  // Strip the replication-related entries from the bucket's sync policy and
+  // persist; retried on racing writes to the bucket instance info.
+  op_ret = retry_raced_bucket_write(this, s->bucket.get(), [this] {
+    // No sync policy stored means nothing to delete.
+    if (!s->bucket->get_info().sync_policy) {
+      return 0;
+    }
+
+    rgw_sync_policy_info sync_policy = *s->bucket->get_info().sync_policy;
+
+    // Subclass hook removes the groups this API manages from the policy.
+    update_sync_policy(&sync_policy);
+
+    s->bucket->get_info().set_sync_policy(std::move(sync_policy));
+
+    int ret = s->bucket->put_info(this, false, real_time());
+    if (ret < 0) {
+      ldpp_dout(this, 0) << "ERROR: put_bucket_instance_info (bucket=" << s->bucket << ") returned ret=" << ret << dendl;
+      return ret;
+    }
+
+    return 0;
+  });
+}
+
+// Finish AWS SigV4 payload verification, if a completer was installed for
+// this request. Returns 0 on success (or when nothing needs completing) and
+// -ERR_AMZ_CONTENT_SHA256_MISMATCH when the body hash does not verify.
+int RGWOp::do_aws4_auth_completion()
+{
+  ldpp_dout(this, 5) << "NOTICE: call to do_aws4_auth_completion" << dendl;
+  if (!s->auth.completer) {
+    return 0;
+  }
+
+  if (!s->auth.completer->complete()) {
+    return -ERR_AMZ_CONTENT_SHA256_MISMATCH;
+  }
+  ldpp_dout(this, 10) << "v4 auth ok -- do_aws4_auth_completion" << dendl;
+
+  /* TODO(rzarzynski): yes, we're really called twice on PUTs. Only first
+   * call passes, so we disable second one. This is old behaviour, sorry!
+   * Plan for tomorrow: seek and destroy. */
+  s->auth.completer = nullptr;
+
+  return 0;
+}
+
+// Populate this->quota from zone defaults, then override with bucket-level
+// and owner-level quota settings where enabled. Quota is skipped entirely
+// for system requests, non-modifying ops, and requests without a bucket.
+int RGWOp::init_quota()
+{
+  /* no quota enforcement for system requests */
+  if (s->system_request)
+    return 0;
+
+  /* init quota related stuff */
+  if (!(s->user->get_info().op_mask & RGW_OP_TYPE_MODIFY)) {
+    return 0;
+  }
+
+  /* Need a bucket to get quota */
+  if (rgw::sal::Bucket::empty(s->bucket.get())) {
+    return 0;
+  }
+
+  std::unique_ptr<rgw::sal::User> owner_user =
+    driver->get_user(s->bucket->get_info().owner);
+  rgw::sal::User* user;
+
+  // Use the requester's already-loaded user record when they own the bucket;
+  // otherwise load the bucket owner's record to read their quota settings.
+  // NOTE(review): the comparison uses s->bucket_owner while owner_user comes
+  // from bucket info's owner — presumably these always match; verify.
+  if (s->user->get_id() == s->bucket_owner.get_id()) {
+    user = s->user.get();
+  } else {
+    int r = owner_user->load_user(this, s->yield);
+    if (r < 0)
+      return r;
+    user = owner_user.get();
+
+  }
+
+  // Start from the zone/global defaults, then apply per-bucket and per-user
+  // overrides in precedence order (bucket info wins over owner's setting).
+  driver->get_quota(quota);
+
+  if (s->bucket->get_info().quota.enabled) {
+    quota.bucket_quota = s->bucket->get_info().quota;
+  } else if (user->get_info().quota.bucket_quota.enabled) {
+    quota.bucket_quota = user->get_info().quota.bucket_quota;
+  }
+
+  if (user->get_info().quota.user_quota.enabled) {
+    quota.user_quota = user->get_info().quota.user_quota;
+  }
+
+  return 0;
+}
+
+// Check whether the requested HTTP method is allowed by this CORS rule.
+// A null method is rejected outright.
+static bool validate_cors_rule_method(const DoutPrefixProvider *dpp, RGWCORSRule *rule, const char *req_meth) {
+  if (!req_meth) {
+    ldpp_dout(dpp, 5) << "req_meth is null" << dendl;
+    return false;
+  }
+
+  const uint8_t flags = get_cors_method_flags(req_meth);
+  const bool supported = (rule->get_allowed_methods() & flags) != 0;
+
+  if (supported) {
+    ldpp_dout(dpp, 10) << "Method " << req_meth << " is supported" << dendl;
+  } else {
+    ldpp_dout(dpp, 5) << "Method " << req_meth << " is not supported" << dendl;
+  }
+  return supported;
+}
+
+// Check that every header listed in the (comma-separated) request string is
+// permitted by the rule. A null header list trivially validates.
+static bool validate_cors_rule_header(const DoutPrefixProvider *dpp, RGWCORSRule *rule, const char *req_hdrs) {
+  if (!req_hdrs) {
+    return true;
+  }
+
+  vector<string> hdrs;
+  get_str_vec(req_hdrs, hdrs);
+  for (const auto& hdr : hdrs) {
+    if (!rule->is_header_allowed(hdr.c_str(), hdr.length())) {
+      ldpp_dout(dpp, 5) << "Header " << hdr << " is not registered in this rule" << dendl;
+      return false;
+    }
+  }
+  return true;
+}
+
+// Load and decode the bucket's CORS configuration from its xattrs into
+// this->bucket_cors, setting cors_exist accordingly. A missing attribute is
+// not an error (returns 0 with cors_exist=false); a corrupt one is -EIO.
+int RGWOp::read_bucket_cors()
+{
+  bufferlist bl;
+
+  map<string, bufferlist>::iterator aiter = s->bucket_attrs.find(RGW_ATTR_CORS);
+  if (aiter == s->bucket_attrs.end()) {
+    ldpp_dout(this, 20) << "no CORS configuration attr found" << dendl;
+    cors_exist = false;
+    return 0; /* no CORS configuration found */
+  }
+
+  cors_exist = true;
+
+  bl = aiter->second;
+
+  auto iter = bl.cbegin();
+  try {
+    bucket_cors.decode(iter);
+  } catch (buffer::error& err) {
+    ldpp_dout(this, 0) << "ERROR: could not decode CORS, caught buffer::error" << dendl;
+    return -EIO;
+  }
+  // At debug level 15+, dump the parsed configuration as XML. The XML is
+  // streamed directly into the active dout entry before it is terminated.
+  if (s->cct->_conf->subsys.should_gather<ceph_subsys_rgw, 15>()) {
+    RGWCORSConfiguration_S3 *s3cors = static_cast<RGWCORSConfiguration_S3 *>(&bucket_cors);
+    ldpp_dout(this, 15) << "Read RGWCORSConfiguration";
+    s3cors->to_xml(*_dout);
+    *_dout << dendl;
+  }
+  return 0;
+}
+
+/** CORS 6.2.6.
+ * If any of the header field-names is not a ASCII case-insensitive match for
+ * any of the values in list of headers do not set any additional headers and
+ * terminate this set of steps.
+ * */
+static void get_cors_response_headers(const DoutPrefixProvider *dpp, RGWCORSRule *rule, const char *req_hdrs, string& hdrs, string& exp_hdrs, unsigned *max_age) {
+  if (req_hdrs) {
+    list<string> hl;
+    get_str_list(req_hdrs, hl);
+    for (const auto& hdr : hl) {
+      if (!rule->is_header_allowed(hdr.c_str(), hdr.length())) {
+        ldpp_dout(dpp, 5) << "Header " << hdr << " is not registered in this rule" << dendl;
+      } else {
+        // Accumulate allowed headers as a comma-separated list.
+        if (!hdrs.empty()) {
+          hdrs.append(",");
+        }
+        hdrs.append(hdr);
+      }
+    }
+  }
+  rule->format_exp_headers(exp_hdrs);
+  *max_age = rule->get_max_age();
+}
+
+/**
+ * Generate the CORS header response
+ *
+ * This is described in the CORS standard, section 6.2.
+ *
+ * On success fills origin/method/headers/exp_headers/max_age from the
+ * matching CORS rule and returns true; returns false when CORS does not
+ * apply (no Origin header, no configuration, no matching rule/method).
+ * A failure to read the bucket CORS config is reported via op_ret.
+ */
+bool RGWOp::generate_cors_headers(string& origin, string& method, string& headers, string& exp_headers, unsigned *max_age)
+{
+  /* CORS 6.2.1. */
+  const char *orig = s->info.env->get("HTTP_ORIGIN");
+  if (!orig) {
+    return false;
+  }
+
+  /* Custom: */
+  origin = orig;
+  int temp_op_ret = read_bucket_cors();
+  if (temp_op_ret < 0) {
+    op_ret = temp_op_ret;
+    return false;
+  }
+
+  if (!cors_exist) {
+    ldpp_dout(this, 2) << "No CORS configuration set yet for this bucket" << dendl;
+    return false;
+  }
+
+  /* CORS 6.2.2. */
+  RGWCORSRule *rule = bucket_cors.host_name_rule(orig);
+  if (!rule)
+    return false;
+
+  /*
+   * Set the Allowed-Origin header to a asterisk if this is allowed in the rule
+   * and no Authorization was send by the client
+   *
+   * The origin parameter specifies a URI that may access the resource. The browser must enforce this.
+   * For requests without credentials, the server may specify "*" as a wildcard,
+   * thereby allowing any origin to access the resource.
+   */
+  const char *authorization = s->info.env->get("HTTP_AUTHORIZATION");
+  if (!authorization && rule->has_wildcard_origin())
+    origin = "*";
+
+  /* CORS 6.2.3. */
+  const char *req_meth = s->info.env->get("HTTP_ACCESS_CONTROL_REQUEST_METHOD");
+  if (!req_meth) {
+    // Non-preflight request: validate the actual request method instead.
+    req_meth = s->info.method;
+  }
+
+  if (req_meth) {
+    method = req_meth;
+    /* CORS 6.2.5. */
+    if (!validate_cors_rule_method(this, rule, req_meth)) {
+      return false;
+    }
+  }
+
+  /* CORS 6.2.4. */
+  const char *req_hdrs = s->info.env->get("HTTP_ACCESS_CONTROL_REQUEST_HEADERS");
+
+  /* CORS 6.2.6. */
+  get_cors_response_headers(this, rule, req_hdrs, headers, exp_headers, max_age);
+
+  return true;
+}
+
+// Decode an RGWAccessControlPolicy from the RGW_ATTR_ACL entry of an xattr
+// map. Returns -EIO if the attribute is missing or fails to decode.
+int rgw_policy_from_attrset(const DoutPrefixProvider *dpp, CephContext *cct, map<string, bufferlist>& attrset, RGWAccessControlPolicy *policy)
+{
+  map<string, bufferlist>::iterator aiter = attrset.find(RGW_ATTR_ACL);
+  if (aiter == attrset.end())
+    return -EIO;
+
+  bufferlist& bl = aiter->second;
+  auto iter = bl.cbegin();
+  try {
+    policy->decode(iter);
+  } catch (buffer::error& err) {
+    ldpp_dout(dpp, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
+    return -EIO;
+  }
+  // At debug level 15+, dump the decoded ACL as XML straight into the
+  // currently-open dout entry before terminating it with dendl.
+  if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 15>()) {
+    RGWAccessControlPolicy_S3 *s3policy = static_cast<RGWAccessControlPolicy_S3 *>(policy);
+    ldpp_dout(dpp, 15) << __func__ << " Read AccessControlPolicy";
+    s3policy->to_xml(*_dout);
+    *_dout << dendl;
+  }
+  return 0;
+}
+
+// Read one part of a DLO/SLO manifest and stream the byte range
+// [start_ofs, end_ofs) through the response filter chain. Verifies the
+// part's size (and, for DLO, its etag via if_match), decompresses when
+// needed, and enforces per-part object permissions.
+int RGWGetObj::read_user_manifest_part(rgw::sal::Bucket* bucket,
+                                       const rgw_bucket_dir_entry& ent,
+                                       RGWAccessControlPolicy * const bucket_acl,
+                                       const boost::optional<Policy>& bucket_policy,
+                                       const off_t start_ofs,
+                                       const off_t end_ofs,
+                                       bool swift_slo)
+{
+  ldpp_dout(this, 20) << "user manifest obj=" << ent.key.name
+      << "[" << ent.key.instance << "]" << dendl;
+  RGWGetObj_CB cb(this);
+  RGWGetObj_Filter* filter = &cb;
+  boost::optional<RGWGetObj_Decompress> decompress;
+
+  int64_t cur_ofs = start_ofs;
+  int64_t cur_end = end_ofs;
+
+  std::unique_ptr<rgw::sal::Object> part = bucket->get_object(ent.key);
+
+  RGWAccessControlPolicy obj_policy(s->cct);
+
+  ldpp_dout(this, 20) << "reading obj=" << part << " ofs=" << cur_ofs
+      << " end=" << cur_end << dendl;
+
+  part->set_atomic();
+  part->set_prefetch_data();
+
+  std::unique_ptr<rgw::sal::Object::ReadOp> read_op = part->get_read_op();
+
+  if (!swift_slo) {
+    /* SLO etag is optional */
+    read_op->params.if_match = ent.meta.etag.c_str();
+  }
+
+  op_ret = read_op->prepare(s->yield, this);
+  if (op_ret < 0)
+    return op_ret;
+  op_ret = part->range_to_ofs(ent.meta.accounted_size, cur_ofs, cur_end);
+  if (op_ret < 0)
+    return op_ret;
+  bool need_decompress;
+  op_ret = rgw_compression_info_from_attrset(part->get_attrs(), need_decompress, cs_info);
+  if (op_ret < 0) {
+    ldpp_dout(this, 0) << "ERROR: failed to decode compression info" << dendl;
+    return -EIO;
+  }
+
+  if (need_decompress)
+  {
+    // Compressed parts are validated against their pre-compression size.
+    if (cs_info.orig_size != ent.meta.accounted_size) {
+      // hmm.. something wrong, object not as expected, abort!
+      // NOTE: log the accounted size, since that is what the check compares.
+      ldpp_dout(this, 0) << "ERROR: expected cs_info.orig_size=" << cs_info.orig_size
+          << ", actual read size=" << ent.meta.accounted_size << dendl;
+      return -EIO;
+    }
+    decompress.emplace(s->cct, &cs_info, partial_content, filter);
+    filter = &*decompress;
+  }
+  else
+  {
+    if (part->get_obj_size() != ent.meta.size) {
+      // hmm.. something wrong, object not as expected, abort!
+      ldpp_dout(this, 0) << "ERROR: expected obj_size=" << part->get_obj_size()
+          << ", actual read size=" << ent.meta.size << dendl;
+      return -EIO;
+    }
+  }
+
+  op_ret = rgw_policy_from_attrset(s, s->cct, part->get_attrs(), &obj_policy);
+  if (op_ret < 0)
+    return op_ret;
+
+  /* We can use global user_acl because LOs cannot have segments
+   * stored inside different accounts. */
+  if (s->system_request) {
+    ldpp_dout(this, 2) << "overriding permissions due to system operation" << dendl;
+  } else if (s->auth.identity->is_admin_of(s->user->get_id())) {
+    ldpp_dout(this, 2) << "overriding permissions due to admin operation" << dendl;
+  } else if (!verify_object_permission(this, s, part->get_obj(), s->user_acl.get(),
+				       bucket_acl, &obj_policy, bucket_policy,
+				       s->iam_user_policies, s->session_policies, action)) {
+    return -EPERM;
+  }
+  if (ent.meta.size == 0) {
+    return 0;
+  }
+
+  perfcounter->inc(l_rgw_get_b, cur_end - cur_ofs);
+  filter->fixup_range(cur_ofs, cur_end);
+  op_ret = read_op->iterate(this, cur_ofs, cur_end, filter, s->yield);
+  if (op_ret >= 0)
+    op_ret = filter->flush();
+  return op_ret;
+}
+
+// Walk all objects under obj_prefix in the given bucket (the parts of a
+// Swift DLO) in listing order, tracking cumulative offsets. Depending on
+// which output pointers are supplied this computes the total length of the
+// requested [ofs,end] range, the overall object size, and/or the combined
+// etag, and optionally invokes cb for each part that overlaps the range.
+static int iterate_user_manifest_parts(const DoutPrefixProvider *dpp,
+                                       CephContext * const cct,
+                                       rgw::sal::Driver* const driver,
+                                       const off_t ofs,
+                                       const off_t end,
+                                       rgw::sal::Bucket* bucket,
+                                       const string& obj_prefix,
+                                       RGWAccessControlPolicy * const bucket_acl,
+                                       const boost::optional<Policy>& bucket_policy,
+                                       uint64_t * const ptotal_len,
+                                       uint64_t * const pobj_size,
+                                       string * const pobj_sum,
+                                       int (*cb)(rgw::sal::Bucket* bucket,
+                                                 const rgw_bucket_dir_entry& ent,
+                                                 RGWAccessControlPolicy * const bucket_acl,
+                                                 const boost::optional<Policy>& bucket_policy,
+                                                 off_t start_ofs,
+                                                 off_t end_ofs,
+                                                 void *param,
+                                                 bool swift_slo),
+                                       void * const cb_param,
+                                       optional_yield y)
+{
+  uint64_t obj_ofs = 0, len_count = 0;
+  bool found_start = false, found_end = false, handled_end = false;
+  string delim;
+
+  utime_t start_time = ceph_clock_now();
+
+  rgw::sal::Bucket::ListParams params;
+  params.prefix = obj_prefix;
+  params.delim = delim;
+
+  rgw::sal::Bucket::ListResults results;
+  MD5 etag_sum;
+  // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
+  etag_sum.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+  do {
+    static constexpr auto MAX_LIST_OBJS = 100u;
+    int r = bucket->list(dpp, params, MAX_LIST_OBJS, results, y);
+    if (r < 0) {
+      return r;
+    }
+
+    for (rgw_bucket_dir_entry& ent : results.objs) {
+      const uint64_t cur_total_len = obj_ofs;
+      const uint64_t obj_size = ent.meta.accounted_size;
+      uint64_t start_ofs = 0, end_ofs = obj_size;
+
+      // First part that crosses the requested start offset: trim its head.
+      if ((ptotal_len || cb) && !found_start && cur_total_len + obj_size > (uint64_t)ofs) {
+        start_ofs = ofs - obj_ofs;
+        found_start = true;
+      }
+
+      obj_ofs += obj_size;
+      if (pobj_sum) {
+        // The DLO etag is an MD5 over the concatenated part etags.
+        etag_sum.Update((const unsigned char *)ent.meta.etag.c_str(),
+                        ent.meta.etag.length());
+      }
+
+      // First part that crosses the requested end offset: trim its tail
+      // (end is inclusive, hence the +1).
+      if ((ptotal_len || cb) && !found_end && obj_ofs > (uint64_t)end) {
+        end_ofs = end - cur_total_len + 1;
+        found_end = true;
+      }
+
+      perfcounter->tinc(l_rgw_get_lat,
+                        (ceph_clock_now() - start_time));
+
+      // Only parts between the start and end crossings are emitted.
+      if (found_start && !handled_end) {
+        len_count += end_ofs - start_ofs;
+
+        if (cb) {
+          r = cb(bucket, ent, bucket_acl, bucket_policy, start_ofs, end_ofs,
+		 cb_param, false /* swift_slo */);
+          if (r < 0) {
+            return r;
+          }
+        }
+      }
+
+      handled_end = found_end;
+      start_time = ceph_clock_now();
+    }
+  } while (results.is_truncated);
+
+  if (ptotal_len) {
+    *ptotal_len = len_count;
+  }
+  if (pobj_size) {
+    *pobj_size = obj_ofs;
+  }
+  if (pobj_sum) {
+    complete_etag(etag_sum, pobj_sum);
+  }
+
+  return 0;
+}
+
+// One segment of a Swift Static Large Object, keyed by its cumulative
+// starting offset in iterate_slo_parts(). All pointers are non-owning and
+// refer to state kept alive by RGWGetObj::handle_slo_manifest().
+struct rgw_slo_part {
+  RGWAccessControlPolicy *bucket_acl = nullptr;
+  Policy* bucket_policy = nullptr;
+  rgw::sal::Bucket* bucket = nullptr; // was uninitialized; default to null like the other pointers
+  string obj_name;
+  uint64_t size = 0;
+  string etag;
+};
+
+// Iterate over SLO segments (keyed by cumulative start offset) that overlap
+// the inclusive byte range [ofs, end], invoking cb with each segment's
+// in-part [start_ofs, end_ofs] sub-range. Segments are synthesized into
+// rgw_bucket_dir_entry records so the same callback as the DLO path works.
+static int iterate_slo_parts(const DoutPrefixProvider *dpp,
+                             CephContext *cct,
+                             rgw::sal::Driver* driver,
+                             off_t ofs,
+                             off_t end,
+                             map<uint64_t, rgw_slo_part>& slo_parts,
+                             int (*cb)(rgw::sal::Bucket* bucket,
+                                       const rgw_bucket_dir_entry& ent,
+                                       RGWAccessControlPolicy *bucket_acl,
+                                       const boost::optional<Policy>& bucket_policy,
+                                       off_t start_ofs,
+                                       off_t end_ofs,
+                                       void *param,
+                                       bool swift_slo),
+                             void *cb_param)
+{
+  bool found_start = false, found_end = false;
+
+  if (slo_parts.empty()) {
+    return 0;
+  }
+
+  utime_t start_time = ceph_clock_now();
+
+  // Position on the last part whose start offset is <= ofs.
+  map<uint64_t, rgw_slo_part>::iterator iter = slo_parts.upper_bound(ofs);
+  if (iter != slo_parts.begin()) {
+    --iter;
+  }
+
+  uint64_t obj_ofs = iter->first;
+
+  for (; iter != slo_parts.end() && !found_end; ++iter) {
+    rgw_slo_part& part = iter->second;
+    rgw_bucket_dir_entry ent;
+
+    ent.key.name = part.obj_name;
+    ent.meta.accounted_size = ent.meta.size = part.size;
+    ent.meta.etag = part.etag;
+
+    uint64_t cur_total_len = obj_ofs;
+    uint64_t start_ofs = 0, end_ofs = ent.meta.size - 1;
+
+    // Trim the head of the first part that crosses the range start.
+    if (!found_start && cur_total_len + ent.meta.size > (uint64_t)ofs) {
+      start_ofs = ofs - obj_ofs;
+      found_start = true;
+    }
+
+    obj_ofs += ent.meta.size;
+
+    // Trim the tail of the first part that crosses the range end
+    // (end_ofs is inclusive here, unlike the DLO iteration).
+    if (!found_end && obj_ofs > (uint64_t)end) {
+      end_ofs = end - cur_total_len;
+      found_end = true;
+    }
+
+    perfcounter->tinc(l_rgw_get_lat,
+                      (ceph_clock_now() - start_time));
+
+    if (found_start) {
+      if (cb) {
+        ldpp_dout(dpp, 20) << "iterate_slo_parts()"
+                          << " obj=" << part.obj_name
+                          << " start_ofs=" << start_ofs
+                          << " end_ofs=" << end_ofs
+                          << dendl;
+
+	// SLO is a Swift thing, and Swift has no knowledge of S3 Policies.
+        int r = cb(part.bucket, ent, part.bucket_acl,
+		   (part.bucket_policy ?
+		    boost::optional<Policy>(*part.bucket_policy) : none),
+		   start_ofs, end_ofs, cb_param, true /* swift_slo */);
+	if (r < 0)
+          return r;
+      }
+    }
+
+    start_time = ceph_clock_now();
+  }
+
+  return 0;
+}
+
+// C-style trampoline used by the DLO/SLO part iterators: recovers the
+// RGWGetObj instance from the opaque param and forwards to its member.
+static int get_obj_user_manifest_iterate_cb(rgw::sal::Bucket* bucket,
+                                            const rgw_bucket_dir_entry& ent,
+                                            RGWAccessControlPolicy * const bucket_acl,
+                                            const boost::optional<Policy>& bucket_policy,
+                                            const off_t start_ofs,
+                                            const off_t end_ofs,
+                                            void * const param,
+                                            bool swift_slo = false)
+{
+  RGWGetObj *op = static_cast<RGWGetObj *>(param);
+  return op->read_user_manifest_part(
+    bucket, ent, bucket_acl, bucket_policy, start_ofs, end_ofs, swift_slo);
+}
+
+// Serve a Swift DLO: the manifest attribute holds "<bucket>/<prefix>"; the
+// object's content is the concatenation of all objects under that prefix.
+// Performs a dry-run pass to compute sizes/etag, then streams the parts.
+int RGWGetObj::handle_user_manifest(const char *prefix, optional_yield y)
+{
+  const std::string_view prefix_view(prefix);
+  ldpp_dout(this, 2) << "RGWGetObj::handle_user_manifest() prefix="
+                     << prefix_view << dendl;
+
+  const size_t pos = prefix_view.find('/');
+  if (pos == string::npos) {
+    return -EINVAL;
+  }
+
+  const std::string bucket_name = url_decode(prefix_view.substr(0, pos));
+  const std::string obj_prefix = url_decode(prefix_view.substr(pos + 1));
+
+  RGWAccessControlPolicy _bucket_acl(s->cct);
+  RGWAccessControlPolicy *bucket_acl;
+  boost::optional<Policy> _bucket_policy;
+  boost::optional<Policy>* bucket_policy;
+  RGWBucketInfo bucket_info;
+  std::unique_ptr<rgw::sal::Bucket> ubucket;
+  rgw::sal::Bucket* pbucket = NULL;
+  int r = 0;
+
+  // The manifest may point at a different bucket than the request's; in
+  // that case load that bucket plus its ACL and IAM policy separately.
+  if (bucket_name.compare(s->bucket->get_name()) != 0) {
+    map<string, bufferlist> bucket_attrs;
+    r = driver->get_bucket(this, s->user.get(), s->user->get_tenant(), bucket_name, &ubucket, y);
+    if (r < 0) {
+      ldpp_dout(this, 0) << "could not get bucket info for bucket="
+		         << bucket_name << dendl;
+      return r;
+    }
+    bucket_acl = &_bucket_acl;
+    r = read_bucket_policy(this, driver, s, ubucket->get_info(), bucket_attrs, bucket_acl, ubucket->get_key(), y);
+    if (r < 0) {
+      ldpp_dout(this, 0) << "failed to read bucket policy" << dendl;
+      return r;
+    }
+    _bucket_policy = get_iam_policy_from_attr(s->cct, bucket_attrs, s->user->get_tenant());
+    bucket_policy = &_bucket_policy;
+    pbucket = ubucket.get();
+  } else {
+    pbucket = s->bucket.get();
+    bucket_acl = s->bucket_acl.get();
+    bucket_policy = &s->iam_policy;
+  }
+
+  /* dry run to find out:
+   * - total length (of the parts we are going to send to client),
+   * - overall DLO's content size,
+   * - md5 sum of overall DLO's content (for etag of Swift API). */
+  r = iterate_user_manifest_parts(this, s->cct, driver, ofs, end,
+        pbucket, obj_prefix, bucket_acl, *bucket_policy,
+        nullptr, &s->obj_size, &lo_etag,
+	nullptr /* cb */, nullptr /* cb arg */, y);
+  if (r < 0) {
+    return r;
+  }
+  s->object->set_obj_size(s->obj_size);
+
+  // Clamp the requested range against the now-known logical object size.
+  r = s->object->range_to_ofs(s->obj_size, ofs, end);
+  if (r < 0) {
+    return r;
+  }
+
+  // Second pass: compute the byte count actually sent for this range.
+  r = iterate_user_manifest_parts(this, s->cct, driver, ofs, end,
+        pbucket, obj_prefix, bucket_acl, *bucket_policy,
+        &total_len, nullptr, nullptr,
+	nullptr, nullptr, y);
+  if (r < 0) {
+    return r;
+  }
+
+  // HEAD-style requests need only the headers computed above.
+  if (!get_data) {
+    bufferlist bl;
+    send_response_data(bl, 0, 0);
+    return 0;
+  }
+
+  // Final pass: stream each overlapping part through the response filter.
+  r = iterate_user_manifest_parts(this, s->cct, driver, ofs, end,
+        pbucket, obj_prefix, bucket_acl, *bucket_policy,
+        nullptr, nullptr, nullptr,
+	get_obj_user_manifest_iterate_cb, (void *)this, y);
+  if (r < 0) {
+    return r;
+  }
+
+  // An empty result range still needs a response to be emitted.
+  if (!total_len) {
+    bufferlist bl;
+    send_response_data(bl, 0, 0);
+  }
+
+  return r;
+}
+
+// Serve a Swift SLO: decode the manifest blob, resolve every listed
+// segment (caching per-bucket ACL/policy lookups), build the offset-keyed
+// part map plus the combined etag, then stream the requested range.
+int RGWGetObj::handle_slo_manifest(bufferlist& bl, optional_yield y)
+{
+  RGWSLOInfo slo_info;
+  auto bliter = bl.cbegin();
+  try {
+    decode(slo_info, bliter);
+  } catch (buffer::error& err) {
+    ldpp_dout(this, 0) << "ERROR: failed to decode slo manifest" << dendl;
+    return -EIO;
+  }
+  ldpp_dout(this, 2) << "RGWGetObj::handle_slo_manifest()" << dendl;
+
+  // ACL objects must outlive the raw pointers stored in slo_parts below.
+  vector<RGWAccessControlPolicy> allocated_acls;
+  map<string, pair<RGWAccessControlPolicy *, boost::optional<Policy>>> policies;
+  map<string, std::unique_ptr<rgw::sal::Bucket>> buckets;
+
+  map<uint64_t, rgw_slo_part> slo_parts;
+
+  MD5 etag_sum;
+  // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
+  etag_sum.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+  total_len = 0;
+
+  for (const auto& entry : slo_info.entries) {
+    const string& path = entry.path;
+
+    /* If the path starts with slashes, strip them all. */
+    const size_t pos_init = path.find_first_not_of('/');
+    /* According to the documentation of std::string::find following check
+     * is not necessary as we should get the std::string::npos propagation
+     * here. This might be true with the accuracy to implementation's bugs.
+     * See following question on SO:
+     * http://stackoverflow.com/questions/1011790/why-does-stdstring-findtext-stdstringnpos-not-return-npos
+     */
+    if (pos_init == string::npos) {
+      return -EINVAL;
+    }
+
+    const size_t pos_sep = path.find('/', pos_init);
+    if (pos_sep == string::npos) {
+      return -EINVAL;
+    }
+
+    string bucket_name = path.substr(pos_init, pos_sep - pos_init);
+    string obj_name = path.substr(pos_sep + 1);
+
+    rgw::sal::Bucket* bucket;
+    RGWAccessControlPolicy *bucket_acl;
+    Policy* bucket_policy;
+
+    // Segments in foreign buckets need that bucket's ACL and IAM policy;
+    // results are cached per bucket name for the rest of the manifest.
+    if (bucket_name.compare(s->bucket->get_name()) != 0) {
+      const auto& piter = policies.find(bucket_name);
+      if (piter != policies.end()) {
+        bucket_acl = piter->second.first;
+        bucket_policy = piter->second.second.get_ptr();
+	bucket = buckets[bucket_name].get();
+      } else {
+	allocated_acls.push_back(RGWAccessControlPolicy(s->cct));
+	RGWAccessControlPolicy& _bucket_acl = allocated_acls.back();
+
+	std::unique_ptr<rgw::sal::Bucket> tmp_bucket;
+	int r = driver->get_bucket(this, s->user.get(), s->user->get_tenant(), bucket_name, &tmp_bucket, y);
+        if (r < 0) {
+          ldpp_dout(this, 0) << "could not get bucket info for bucket="
+			     << bucket_name << dendl;
+          return r;
+        }
+        bucket = tmp_bucket.get();
+        bucket_acl = &_bucket_acl;
+        r = read_bucket_policy(this, driver, s, tmp_bucket->get_info(), tmp_bucket->get_attrs(), bucket_acl,
+                               tmp_bucket->get_key(), y);
+        if (r < 0) {
+          ldpp_dout(this, 0) << "failed to read bucket ACL for bucket "
+                             << bucket << dendl;
+          return r;
+	}
+	auto _bucket_policy = get_iam_policy_from_attr(
+          s->cct, tmp_bucket->get_attrs(), tmp_bucket->get_tenant());
+        bucket_policy = _bucket_policy.get_ptr();
+	buckets[bucket_name].swap(tmp_bucket);
+        policies[bucket_name] = make_pair(bucket_acl, _bucket_policy);
+      }
+    } else {
+      bucket = s->bucket.get();
+      bucket_acl = s->bucket_acl.get();
+      bucket_policy = s->iam_policy.get_ptr();
+    }
+
+    rgw_slo_part part;
+    part.bucket_acl = bucket_acl;
+    part.bucket_policy = bucket_policy;
+    part.bucket = bucket;
+    part.obj_name = obj_name;
+    part.size = entry.size_bytes;
+    part.etag = entry.etag;
+    ldpp_dout(this, 20) << "slo_part: bucket=" << part.bucket
+                        << " obj=" << part.obj_name
+                        << " size=" << part.size
+                        << " etag=" << part.etag
+                        << dendl;
+
+    // SLO etag is an MD5 of the concatenated segment etags.
+    etag_sum.Update((const unsigned char *)entry.etag.c_str(),
+                    entry.etag.length());
+
+    // Key each part by its cumulative starting offset.
+    slo_parts[total_len] = part;
+    total_len += part.size;
+  } /* foreach entry */
+
+  complete_etag(etag_sum, &lo_etag);
+
+  s->obj_size = slo_info.total_size;
+  s->object->set_obj_size(slo_info.total_size);
+  ldpp_dout(this, 20) << "s->obj_size=" << s->obj_size << dendl;
+
+  // Clamp the requested range to the logical size; total_len becomes the
+  // number of bytes actually returned.
+  int r = s->object->range_to_ofs(total_len, ofs, end);
+  if (r < 0) {
+    return r;
+  }
+
+  total_len = end - ofs + 1;
+  ldpp_dout(this, 20) << "Requested: ofs=" << ofs
+                      << " end=" << end
+                      << " total=" << total_len
+                      << dendl;
+
+  r = iterate_slo_parts(this, s->cct, driver, ofs, end, slo_parts,
+        get_obj_user_manifest_iterate_cb, (void *)this);
+  if (r < 0) {
+    return r;
+  }
+
+  return 0;
+}
+
+// Data callback invoked by the read iterator for each chunk; forwards the
+// buffer straight to the protocol-specific response writer.
+int RGWGetObj::get_data_cb(bufferlist& bl, off_t bl_ofs, off_t bl_len)
+{
+  /* garbage collection related handling:
+   * defer_gc disabled for https://tracker.ceph.com/issues/47866 */
+  return send_response_data(bl, bl_ofs, bl_len);
+}
+
+// Install a Lua getData filter in front of `cb` if a data script is
+// configured for this tenant. A missing script is not an error; any other
+// read failure is logged and propagated.
+int RGWGetObj::get_lua_filter(std::unique_ptr<RGWGetObj_Filter>* filter, RGWGetObj_Filter* cb) {
+  std::string script;
+  const auto rc = rgw::lua::read_script(s, s->penv.lua.manager.get(), s->bucket_tenant, s->yield, rgw::lua::context::getData, script);
+  if (rc == -ENOENT) {
+    // no script, nothing to do
+    return 0;
+  } else if (rc < 0) {
+    ldpp_dout(this, 5) << "WARNING: failed to read data script. error: " << rc << dendl;
+    return rc;
+  }
+  // prefer make_unique over raw new into reset(): exception-safe and idiomatic
+  *filter = std::make_unique<rgw::lua::RGWGetObjFilter>(s, script, cb);
+  return 0;
+}
+
+// Decide whether object data should be prefetched for this request.
+// HEAD-style requests, internal auth probes, and ranged reads skip prefetch.
+bool RGWGetObj::prefetch_data()
+{
+  /* HEAD request, stop prefetch*/
+  if (!get_data || s->info.env->exists("HTTP_X_RGW_AUTH")) {
+    return false;
+  }
+
+  range_str = s->info.env->get("HTTP_RANGE");
+  if (!range_str) {
+    return true;  // full-object GET: prefetching pays off
+  }
+
+  // TODO: add range prefetch
+  parse_range();
+  return false;
+}
+
+// Standard per-request pre-execution hook shared by bucket/object ops.
+void RGWGetObj::pre_exec()
+{
+  rgw_bucket_object_pre_exec(s);
+}
+
+// Decode the object's S3 tag set (if present) into the request state.
+// A corrupt tag attribute is logged and otherwise ignored.
+static inline void rgw_cond_decode_objtags(
+  req_state *s,
+  const std::map<std::string, buffer::list> &attrs)
+{
+  const auto tags = attrs.find(RGW_ATTR_TAGS);
+  if (tags == attrs.end()) {
+    return;
+  }
+
+  try {
+    bufferlist::const_iterator iter{&tags->second};
+    s->tagset.decode(iter);
+  } catch (buffer::error& err) {
+    ldpp_dout(s, 0)
+      << "ERROR: caught buffer::error, couldn't decode TagSet" << dendl;
+  }
+}
+
+// Main GET-object path: prepares the read op, builds the response filter
+// chain (lua -> flight -> decompress -> decrypt -> client cb), and handles
+// the special cases (STAT, torrent, DLO/SLO manifests, cloud-tiered
+// objects, ranges, expiry) before streaming data. Errors jump to done_err
+// so a proper error response is always emitted.
+void RGWGetObj::execute(optional_yield y)
+{
+  bufferlist bl;
+  gc_invalidate_time = ceph_clock_now();
+  gc_invalidate_time += (s->cct->_conf->rgw_gc_obj_min_wait / 2);
+
+  bool need_decompress = false;
+  int64_t ofs_x = 0, end_x = 0;
+  bool encrypted = false;
+
+  RGWGetObj_CB cb(this);
+  RGWGetObj_Filter* filter = (RGWGetObj_Filter *)&cb;
+  boost::optional<RGWGetObj_Decompress> decompress;
+#ifdef WITH_ARROW_FLIGHT
+  boost::optional<rgw::flight::FlightGetObj_Filter> flight_filter;
+#endif
+  std::unique_ptr<RGWGetObj_Filter> decrypt;
+  std::unique_ptr<RGWGetObj_Filter> run_lua;
+  map<string, bufferlist>::iterator attr_iter;
+
+  perfcounter->inc(l_rgw_get);
+
+  std::unique_ptr<rgw::sal::Object::ReadOp> read_op(s->object->get_read_op());
+
+  op_ret = get_params(y);
+  if (op_ret < 0)
+    goto done_err;
+
+  op_ret = init_common();
+  if (op_ret < 0)
+    goto done_err;
+
+  // Conditional-GET parameters feed straight into the read op.
+  read_op->params.mod_ptr = mod_ptr;
+  read_op->params.unmod_ptr = unmod_ptr;
+  read_op->params.high_precision_time = s->system_request; /* system request need to use high precision time */
+  read_op->params.mod_zone_id = mod_zone_id;
+  read_op->params.mod_pg_ver = mod_pg_ver;
+  read_op->params.if_match = if_match;
+  read_op->params.if_nomatch = if_nomatch;
+  read_op->params.lastmod = &lastmod;
+
+  op_ret = read_op->prepare(s->yield, this);
+  if (op_ret < 0)
+    goto done_err;
+  version_id = s->object->get_instance();
+  s->obj_size = s->object->get_obj_size();
+  attrs = s->object->get_attrs();
+
+  /* STAT ops don't need data, and do no i/o */
+  if (get_type() == RGW_OP_STAT_OBJ) {
+    return;
+  }
+  if (s->info.env->exists("HTTP_X_RGW_AUTH")) {
+    // Internal auth probe: existence check only, no body.
+    op_ret = 0;
+    goto done_err;
+  }
+  /* start gettorrent */
+  if (torrent.get_flag())
+  {
+    attr_iter = attrs.find(RGW_ATTR_CRYPT_MODE);
+    if (attr_iter != attrs.end() && attr_iter->second.to_str() == "SSE-C-AES256") {
+      ldpp_dout(this, 0) << "ERROR: torrents are not supported for objects "
+                            "encrypted with SSE-C" << dendl;
+      op_ret = -EINVAL;
+      goto done_err;
+    }
+    torrent.init(s, driver);
+    rgw_obj obj = s->object->get_obj();
+    op_ret = torrent.get_torrent_file(s->object.get(), total_len, bl, obj);
+    if (op_ret < 0)
+    {
+      ldpp_dout(this, 0) << "ERROR: failed to get_torrent_file ret= " << op_ret
+                         << dendl;
+      goto done_err;
+    }
+    op_ret = send_response_data(bl, 0, total_len);
+    if (op_ret < 0)
+    {
+      ldpp_dout(this, 0) << "ERROR: failed to send_response_data ret= " << op_ret << dendl;
+      goto done_err;
+    }
+    return;
+  }
+  /* end gettorrent */
+
+  // run lua script on decompressed and decrypted data - first filter runs last
+  op_ret = get_lua_filter(&run_lua, filter);
+  if (run_lua != nullptr) {
+    filter = run_lua.get();
+  }
+  if (op_ret < 0) {
+    goto done_err;
+  }
+
+#ifdef WITH_ARROW_FLIGHT
+  if (s->penv.flight_store) {
+    if (ofs == 0) {
+      // insert a GetObj_Filter to monitor and create flight
+      flight_filter.emplace(s, filter);
+      filter = &*flight_filter;
+    }
+  } else {
+    ldpp_dout(this, 0) << "ERROR: flight_store not created in " << __func__ << dendl;
+  }
+#endif
+
+  op_ret = rgw_compression_info_from_attrset(attrs, need_decompress, cs_info);
+  if (op_ret < 0) {
+    ldpp_dout(this, 0) << "ERROR: failed to decode compression info, cannot decompress" << dendl;
+    goto done_err;
+  }
+
+  // where encryption and compression are combined, compression was applied to
+  // the data before encryption. if the system header rgwx-skip-decrypt is
+  // present, we have to skip the decompression filter too
+  encrypted = attrs.count(RGW_ATTR_CRYPT_MODE);
+
+  if (need_decompress && (!encrypted || !skip_decrypt)) {
+      s->obj_size = cs_info.orig_size;
+      s->object->set_obj_size(cs_info.orig_size);
+      decompress.emplace(s->cct, &cs_info, partial_content, filter);
+      filter = &*decompress;
+  }
+
+  // Replication trace: if this zone already received the object, short-
+  // circuit with NOT_MODIFIED so sync does not copy it again.
+  attr_iter = attrs.find(RGW_ATTR_OBJ_REPLICATION_TRACE);
+  if (attr_iter != attrs.end()) {
+    try {
+      std::vector<rgw_zone_set_entry> zones;
+      auto p = attr_iter->second.cbegin();
+      decode(zones, p);
+      for (const auto& zone: zones) {
+        if (zone == dst_zone_trace) {
+          op_ret = -ERR_NOT_MODIFIED;
+          ldpp_dout(this, 4) << "Object already has been copied to this destination. Returning "
+            << op_ret << dendl;
+          goto done_err;
+        }
+      }
+    } catch (const buffer::error&) {}
+  }
+
+  if (get_type() == RGW_OP_GET_OBJ && get_data) {
+    op_ret = handle_cloudtier_obj(attrs, sync_cloudtiered);
+    if (op_ret < 0) {
+      ldpp_dout(this, 4) << "Cannot get cloud tiered object: " << *s->object
+        <<". Failing with " << op_ret << dendl;
+      if (op_ret == -ERR_INVALID_OBJECT_STATE) {
+        s->err.message = "This object was transitioned to cloud-s3";
+      }
+      goto done_err;
+    }
+  }
+
+  // DLO: the object is a manifest listing a bucket/prefix of parts.
+  attr_iter = attrs.find(RGW_ATTR_USER_MANIFEST);
+  if (attr_iter != attrs.end() && !skip_manifest) {
+    op_ret = handle_user_manifest(attr_iter->second.c_str(), y);
+    if (op_ret < 0) {
+      ldpp_dout(this, 0) << "ERROR: failed to handle user manifest ret="
+		         << op_ret << dendl;
+      goto done_err;
+    }
+    return;
+  }
+
+  // SLO: the object is a static manifest listing explicit segments.
+  attr_iter = attrs.find(RGW_ATTR_SLO_MANIFEST);
+  if (attr_iter != attrs.end() && !skip_manifest) {
+    is_slo = true;
+    op_ret = handle_slo_manifest(attr_iter->second, y);
+    if (op_ret < 0) {
+      ldpp_dout(this, 0) << "ERROR: failed to handle slo manifest ret=" << op_ret
+		         << dendl;
+      goto done_err;
+    }
+    return;
+  }
+
+  // for range requests with obj size 0
+  if (range_str && !(s->obj_size)) {
+    total_len = 0;
+    op_ret = -ERANGE;
+    goto done_err;
+  }
+
+  op_ret = s->object->range_to_ofs(s->obj_size, ofs, end);
+  if (op_ret < 0)
+    goto done_err;
+  total_len = (ofs <= end ? end + 1 - ofs : 0);
+
+  ofs_x = ofs;
+  end_x = end;
+  // Filters may need to read a wider range than the client requested
+  // (e.g. whole compression blocks).
+  filter->fixup_range(ofs_x, end_x);
+
+  /* Check whether the object has expired. Swift API documentation
+   * stands that we should return 404 Not Found in such case. */
+  if (need_object_expiration() && s->object->is_expired()) {
+    op_ret = -ENOENT;
+    goto done_err;
+  }
+
+  /* Decode S3 objtags, if any */
+  rgw_cond_decode_objtags(s, attrs);
+
+  start = ofs;
+
+  attr_iter = attrs.find(RGW_ATTR_MANIFEST);
+  op_ret = this->get_decrypt_filter(&decrypt, filter,
+                                    attr_iter != attrs.end() ? &(attr_iter->second) : nullptr);
+  if (decrypt != nullptr) {
+    filter = decrypt.get();
+    filter->fixup_range(ofs_x, end_x);
+  }
+  if (op_ret < 0) {
+    goto done_err;
+  }
+
+
+  if (!get_data || ofs > end) {
+    send_response_data(bl, 0, 0);
+    return;
+  }
+
+  perfcounter->inc(l_rgw_get_b, end - ofs);
+
+  op_ret = read_op->iterate(this, ofs_x, end_x, filter, s->yield);
+
+  if (op_ret >= 0)
+    op_ret = filter->flush();
+
+  perfcounter->tinc(l_rgw_get_lat, s->time_elapsed());
+  if (op_ret < 0) {
+    goto done_err;
+  }
+
+  op_ret = send_response_data(bl, 0, 0);
+  if (op_ret < 0) {
+    goto done_err;
+  }
+  return;
+
+done_err:
+  send_response_data_error(y);
+}
+
+// Parse the Range and If-(Un)Modified-Since request headers into the
+// member state the read op consumes. Returns 0, a parse_range() error,
+// or -EINVAL on a malformed time header.
+int RGWGetObj::init_common()
+{
+  // prefetch_data() may have parsed the range already.
+  if (range_str && !range_parsed) {
+    if (int r = parse_range(); r < 0) {
+      return r;
+    }
+  }
+
+  if (if_mod) {
+    if (parse_time(if_mod, &mod_time) < 0) {
+      return -EINVAL;
+    }
+    mod_ptr = &mod_time;
+  }
+
+  if (if_unmod) {
+    if (parse_time(if_unmod, &unmod_time) < 0) {
+      return -EINVAL;
+    }
+    unmod_ptr = &unmod_time;
+  }
+
+  return 0;
+}
+
+int RGWListBuckets::verify_permission(optional_yield y)
+{
+  const rgw::Partition partition = rgw::Partition::aws;
+  const rgw::Service service = rgw::Service::s3;
+
+  // Role identities are scoped to the role's tenant; regular users to their own.
+  const string tenant = (s->auth.identity->get_identity_type() == TYPE_ROLE)
+      ? s->auth.identity->get_role_tenant()
+      : s->user->get_tenant();
+
+  const bool allowed = verify_user_permission(
+      this, s, ARN(partition, service, "", tenant, "*"),
+      rgw::IAM::s3ListAllMyBuckets, false);
+  return allowed ? 0 : -EACCES;
+}
+
+int RGWGetUsage::verify_permission(optional_yield y)
+{
+ if (s->auth.identity->is_anonymous()) {
+ return -EACCES;
+ }
+
+ return 0;
+}
+
// List the caller's buckets in chunks, streaming the response as each
// chunk arrives and aggregating per-placement-policy usage stats.
// On any early failure the begin/end response framing is still emitted.
void RGWListBuckets::execute(optional_yield y)
{
  bool done;
  bool started = false;     // true once send_response_begin() has run
  uint64_t total_count = 0; // buckets emitted so far (enforces `limit`)

  // Upper bound on buckets fetched per list_buckets() call.
  const uint64_t max_buckets = s->cct->_conf->rgw_list_buckets_max_chunk;

  op_ret = get_params(y);
  if (op_ret < 0) {
    goto send_end;
  }

  // Swift emits account metadata with the listing, so the user's attrs
  // must be loaded up front.
  if (supports_account_metadata()) {
    op_ret = s->user->read_attrs(this, s->yield);
    if (op_ret < 0) {
      goto send_end;
    }
  }

  is_truncated = false;
  do {
    rgw::sal::BucketList buckets;
    uint64_t read_count;
    // Never read past the client-requested `limit` (negative = no limit).
    if (limit >= 0) {
      read_count = min(limit - total_count, max_buckets);
    } else {
      read_count = max_buckets;
    }

    op_ret = s->user->list_buckets(this, marker, end_marker, read_count, should_get_stats(), buckets, y);

    if (op_ret < 0) {
      /* hmm.. something wrong here.. the user was authenticated, so it
         should exist */
      ldpp_dout(this, 10) << "WARNING: failed on rgw_get_user_buckets uid="
                          << s->user->get_id() << dendl;
      break;
    }

    is_truncated = buckets.is_truncated();

    /* We need to have stats for all our policies - even if a given policy
     * isn't actually used in a given account. In such situation its usage
     * stats would be simply full of zeros. */
    std::set<std::string> targets;
    driver->get_zone()->get_zonegroup().get_placement_target_names(targets);
    for (const auto& policy : targets) {
      policies_stats.emplace(policy, decltype(policies_stats)::mapped_type());
    }

    // Accumulate global and per-placement-policy stats for this chunk.
    std::map<std::string, std::unique_ptr<rgw::sal::Bucket>>& m = buckets.get_buckets();
    for (const auto& kv : m) {
      const auto& bucket = kv.second;

      global_stats.bytes_used += bucket->get_size();
      global_stats.bytes_used_rounded += bucket->get_size_rounded();
      global_stats.objects_count += bucket->get_count();

      /* operator[] still can create a new entry for storage policy seen
       * for first time. */
      auto& policy_stats = policies_stats[bucket->get_placement_rule().to_str()];
      policy_stats.bytes_used += bucket->get_size();
      policy_stats.bytes_used_rounded += bucket->get_size_rounded();
      policy_stats.buckets_count++;
      policy_stats.objects_count += bucket->get_count();
    }
    global_stats.buckets_count += m.size();
    total_count += m.size();

    // Done when the backend returned a short chunk or the limit is hit.
    done = (m.size() < read_count || (limit >= 0 && total_count >= (uint64_t)limit));

    if (!started) {
      send_response_begin(buckets.count() > 0);
      started = true;
    }

    if (read_count > 0 &&
        !m.empty()) {
      // Advance the paging marker to the last key seen, then stream the chunk.
      auto riter = m.rbegin();
      marker = riter->first;

      handle_listing_chunk(std::move(buckets));
    }
  } while (is_truncated && !done);

send_end:
  // Ensure the response framing is complete even on early failure.
  if (!started) {
    send_response_begin(false);
  }
  send_response_end();
}
+
// Collect usage-log entries for the optional [start_date, end_date]
// window, then refresh and gather account-wide and per-bucket stats.
void RGWGetUsage::execute(optional_yield y)
{
  uint64_t start_epoch = 0;
  uint64_t end_epoch = (uint64_t)-1; // default: unbounded time range
  op_ret = get_params(y);
  if (op_ret < 0)
    return;

  if (!start_date.empty()) {
    op_ret = utime_t::parse_date(start_date, &start_epoch, NULL);
    if (op_ret < 0) {
      ldpp_dout(this, 0) << "ERROR: failed to parse start date" << dendl;
      return;
    }
  }

  if (!end_date.empty()) {
    op_ret = utime_t::parse_date(end_date, &end_epoch, NULL);
    if (op_ret < 0) {
      ldpp_dout(this, 0) << "ERROR: failed to parse end date" << dendl;
      return;
    }
  }

  uint32_t max_entries = 1000; // page size for read_usage()

  bool is_truncated = true;

  RGWUsageIter usage_iter;

  // Page through the bucket's usage log. -ENOENT means no usage has
  // been recorded yet and is treated as success, not an error.
  while (s->bucket && is_truncated) {
    op_ret = s->bucket->read_usage(this, start_epoch, end_epoch, max_entries, &is_truncated,
                                   usage_iter, usage);
    if (op_ret == -ENOENT) {
      op_ret = 0;
      is_truncated = false;
    }

    if (op_ret < 0) {
      return;
    }
  }

  // Flush pending stats so the numbers reported below are current.
  op_ret = rgw_user_sync_all_stats(this, driver, s->user.get(), y);
  if (op_ret < 0) {
    ldpp_dout(this, 0) << "ERROR: failed to sync user stats" << dendl;
    return;
  }

  op_ret = rgw_user_get_all_buckets_stats(this, driver, s->user.get(), buckets_usage, y);
  if (op_ret < 0) {
    ldpp_dout(this, 0) << "ERROR: failed to get user's buckets stats" << dendl;
    return;
  }

  op_ret = s->user->read_stats(this, y, &stats);
  if (op_ret < 0) {
    ldpp_dout(this, 0) << "ERROR: can't read user header" << dendl;
    return;
  }

  return;
}
+
+int RGWStatAccount::verify_permission(optional_yield y)
+{
+ if (!verify_user_permission_no_policy(this, s, RGW_PERM_READ)) {
+ return -EACCES;
+ }
+
+ return 0;
+}
+
// Aggregate account-level statistics (global and per placement policy)
// by paging through all of the user's buckets.
void RGWStatAccount::execute(optional_yield y)
{
  string marker;                 // paging cursor across list_buckets calls
  rgw::sal::BucketList buckets;
  uint64_t max_buckets = s->cct->_conf->rgw_list_buckets_max_chunk;
  const string *lastmarker;      // last key seen in the current chunk

  do {

    lastmarker = nullptr;
    op_ret = s->user->list_buckets(this, marker, string(), max_buckets, true, buckets, y);
    if (op_ret < 0) {
      /* hmm.. something wrong here.. the user was authenticated, so it
         should exist */
      ldpp_dout(this, 10) << "WARNING: failed on list_buckets uid="
                          << s->user->get_id() << " ret=" << op_ret << dendl;
      break;
    } else {
      /* We need to have stats for all our policies - even if a given policy
       * isn't actually used in a given account. In such situation its usage
       * stats would be simply full of zeros. */
      std::set<std::string> names;
      driver->get_zone()->get_zonegroup().get_placement_target_names(names);
      for (const auto& policy : names) {
        policies_stats.emplace(policy, decltype(policies_stats)::mapped_type());
      }

      std::map<std::string, std::unique_ptr<rgw::sal::Bucket>>& m = buckets.get_buckets();
      for (const auto& kv : m) {
        const auto& bucket = kv.second;
        lastmarker = &kv.first;

        global_stats.bytes_used += bucket->get_size();
        global_stats.bytes_used_rounded += bucket->get_size_rounded();
        global_stats.objects_count += bucket->get_count();

        /* operator[] still can create a new entry for storage policy seen
         * for first time. */
        auto& policy_stats = policies_stats[bucket->get_placement_rule().to_str()];
        policy_stats.bytes_used += bucket->get_size();
        policy_stats.bytes_used_rounded += bucket->get_size_rounded();
        policy_stats.buckets_count++;
        policy_stats.objects_count += bucket->get_count();
      }
      global_stats.buckets_count += m.size();

    }
    // A truncated listing that yields no entries would otherwise loop
    // forever on the same marker; bail out instead of spinning.
    if (!lastmarker) {
      ldpp_dout(this, -1) << "ERROR: rgw_read_user_buckets, stasis at marker="
                          << marker << " uid=" << s->user->get_id() << dendl;
      break;
    }
    marker = *lastmarker;
  } while (buckets.is_truncated());
}
+
+int RGWGetBucketVersioning::verify_permission(optional_yield y)
+{
+ auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false);
+ if (has_s3_resource_tag)
+ rgw_iam_add_buckettags(this, s);
+
+ return verify_bucket_owner_or_policy(s, rgw::IAM::s3GetBucketVersioning);
+}
+
// Standard pre-execution hook: common bucket/object request logging.
void RGWGetBucketVersioning::pre_exec()
{
  rgw_bucket_object_pre_exec(s);
}
+
+void RGWGetBucketVersioning::execute(optional_yield y)
+{
+ if (! s->bucket_exists) {
+ op_ret = -ERR_NO_SUCH_BUCKET;
+ return;
+ }
+
+ versioned = s->bucket->versioned();
+ versioning_enabled = s->bucket->versioning_enabled();
+ mfa_enabled = s->bucket->get_info().mfa_enabled();
+}
+
+int RGWSetBucketVersioning::verify_permission(optional_yield y)
+{
+ auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false);
+ if (has_s3_resource_tag)
+ rgw_iam_add_buckettags(this, s);
+
+ return verify_bucket_owner_or_policy(s, rgw::IAM::s3PutBucketVersioning);
+}
+
// Standard pre-execution hook: common bucket/object request logging.
void RGWSetBucketVersioning::pre_exec()
{
  rgw_bucket_object_pre_exec(s);
}
+
// Apply a PutBucketVersioning request: validates object-lock and MFA
// constraints, forwards to the master zone (multisite), then updates
// the bucket-info flags under a racing-write retry loop.
void RGWSetBucketVersioning::execute(optional_yield y)
{
  op_ret = get_params(y);
  if (op_ret < 0)
    return;

  if (! s->bucket_exists) {
    op_ret = -ERR_NO_SUCH_BUCKET;
    return;
  }

  // Object Lock requires versioning to stay enabled.
  if (s->bucket->get_info().obj_lock_enabled() && versioning_status != VersioningEnabled) {
    s->err.message = "bucket versioning cannot be disabled on buckets with object lock enabled";
    ldpp_dout(this, 4) << "ERROR: " << s->err.message << dendl;
    op_ret = -ERR_INVALID_BUCKET_STATE;
    return;
  }

  bool cur_mfa_status = s->bucket->get_info().mfa_enabled();

  // Only treat the request as an MFA state change if it actually differs
  // from the bucket's current MFA setting.
  mfa_set_status &= (mfa_status != cur_mfa_status);

  // Changing the MFA-delete setting itself requires a verified MFA token.
  if (mfa_set_status &&
      !s->mfa_verified) {
    op_ret = -ERR_MFA_REQUIRED;
    return;
  }
  //if mfa is enabled for bucket, make sure mfa code is validated in case versioned status gets changed
  if (cur_mfa_status) {
    bool req_versioning_status = false;
    //if requested versioning status is not the same as the one set for the bucket, return error
    if (versioning_status == VersioningEnabled) {
      req_versioning_status = (s->bucket->get_info().flags & BUCKET_VERSIONS_SUSPENDED) != 0;
    } else if (versioning_status == VersioningSuspended) {
      req_versioning_status = (s->bucket->get_info().flags & BUCKET_VERSIONS_SUSPENDED) == 0;
    }
    if (req_versioning_status && !s->mfa_verified) {
      op_ret = -ERR_MFA_REQUIRED;
      return;
    }
  }

  // Multisite: the master zone must accept the change first.
  op_ret = driver->forward_request_to_master(this, s->user.get(), nullptr, in_data, nullptr, s->info, y);
  if (op_ret < 0) {
    ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
    return;
  }

  bool modified = mfa_set_status;

  // retry_raced_bucket_write re-runs the lambda on -ECANCELED so a
  // concurrent bucket-info writer doesn't lose our flag update.
  op_ret = retry_raced_bucket_write(this, s->bucket.get(), [&] {
    if (mfa_set_status) {
      if (mfa_status) {
        s->bucket->get_info().flags |= BUCKET_MFA_ENABLED;
      } else {
        s->bucket->get_info().flags &= ~BUCKET_MFA_ENABLED;
      }
    }

    if (versioning_status == VersioningEnabled) {
      s->bucket->get_info().flags |= BUCKET_VERSIONED;
      s->bucket->get_info().flags &= ~BUCKET_VERSIONS_SUSPENDED;
      modified = true;
    } else if (versioning_status == VersioningSuspended) {
      s->bucket->get_info().flags |= (BUCKET_VERSIONED | BUCKET_VERSIONS_SUSPENDED);
      modified = true;
    } else {
      // no versioning change requested; nothing to persist here
      return op_ret;
    }
    s->bucket->set_attrs(rgw::sal::Attrs(s->bucket_attrs));
    return s->bucket->put_info(this, false, real_time());
  });

  if (!modified) {
    return;
  }

  if (op_ret < 0) {
    ldpp_dout(this, 0) << "NOTICE: put_bucket_info on bucket=" << s->bucket->get_name()
                       << " returned err=" << op_ret << dendl;
    return;
  }
}
+
+int RGWGetBucketWebsite::verify_permission(optional_yield y)
+{
+ auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false);
+ if (has_s3_resource_tag)
+ rgw_iam_add_buckettags(this, s);
+
+ return verify_bucket_owner_or_policy(s, rgw::IAM::s3GetBucketWebsite);
+}
+
// Standard pre-execution hook: common bucket/object request logging.
void RGWGetBucketWebsite::pre_exec()
{
  rgw_bucket_object_pre_exec(s);
}
+
+void RGWGetBucketWebsite::execute(optional_yield y)
+{
+ if (!s->bucket->get_info().has_website) {
+ op_ret = -ERR_NO_SUCH_WEBSITE_CONFIGURATION;
+ }
+}
+
+int RGWSetBucketWebsite::verify_permission(optional_yield y)
+{
+ auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false);
+ if (has_s3_resource_tag)
+ rgw_iam_add_buckettags(this, s);
+
+ return verify_bucket_owner_or_policy(s, rgw::IAM::s3PutBucketWebsite);
+}
+
// Standard pre-execution hook: common bucket/object request logging.
void RGWSetBucketWebsite::pre_exec()
{
  rgw_bucket_object_pre_exec(s);
}
+
// Install a website configuration on the bucket: parse params, forward
// to the master zone (multisite), then persist the new config under a
// racing-write retry loop.
void RGWSetBucketWebsite::execute(optional_yield y)
{
  op_ret = get_params(y);

  if (op_ret < 0)
    return;

  if (!s->bucket_exists) {
    op_ret = -ERR_NO_SUCH_BUCKET;
    return;
  }

  // Multisite: the master zone must accept the change first.
  op_ret = driver->forward_request_to_master(this, s->user.get(), nullptr, in_data, nullptr, s->info, y);
  if (op_ret < 0) {
    ldpp_dout(this, 0) << " forward_request_to_master returned ret=" << op_ret << dendl;
    return;
  }

  // Retried on -ECANCELED so concurrent bucket-info writers don't
  // clobber the update.
  op_ret = retry_raced_bucket_write(this, s->bucket.get(), [this] {
    s->bucket->get_info().has_website = true;
    s->bucket->get_info().website_conf = website_conf;
    op_ret = s->bucket->put_info(this, false, real_time());
    return op_ret;
  });

  if (op_ret < 0) {
    ldpp_dout(this, 0) << "NOTICE: put_bucket_info on bucket=" << s->bucket->get_name()
                       << " returned err=" << op_ret << dendl;
    return;
  }
}
+
+int RGWDeleteBucketWebsite::verify_permission(optional_yield y)
+{
+ auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false);
+ if (has_s3_resource_tag)
+ rgw_iam_add_buckettags(this, s);
+
+ return verify_bucket_owner_or_policy(s, rgw::IAM::s3DeleteBucketWebsite);
+}
+
// Standard pre-execution hook: common bucket/object request logging.
void RGWDeleteBucketWebsite::pre_exec()
{
  rgw_bucket_object_pre_exec(s);
}
+
+void RGWDeleteBucketWebsite::execute(optional_yield y)
+{
+ if (!s->bucket_exists) {
+ op_ret = -ERR_NO_SUCH_BUCKET;
+ return;
+ }
+
+ bufferlist in_data;
+
+ op_ret = driver->forward_request_to_master(this, s->user.get(), nullptr, in_data, nullptr, s->info, y);
+ if (op_ret < 0) {
+ ldpp_dout(this, 0) << "NOTICE: forward_to_master failed on bucket=" << s->bucket->get_name()
+ << "returned err=" << op_ret << dendl;
+ return;
+ }
+ op_ret = retry_raced_bucket_write(this, s->bucket.get(), [this] {
+ s->bucket->get_info().has_website = false;
+ s->bucket->get_info().website_conf = RGWBucketWebsiteConf();
+ op_ret = s->bucket->put_info(this, false, real_time());
+ return op_ret;
+ });
+ if (op_ret < 0) {
+ ldpp_dout(this, 0) << "NOTICE: put_bucket_info on bucket=" << s->bucket
+ << " returned err=" << op_ret << dendl;
+ return;
+ }
+}
+
+int RGWStatBucket::verify_permission(optional_yield y)
+{
+ auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false);
+ if (has_s3_resource_tag)
+ rgw_iam_add_buckettags(this, s);
+
+ // This (a HEAD request on a bucket) is governed by the s3:ListBucket permission.
+ if (!verify_bucket_permission(this, s, rgw::IAM::s3ListBucket)) {
+ return -EACCES;
+ }
+
+ return 0;
+}
+
// Standard pre-execution hook: common bucket/object request logging.
void RGWStatBucket::pre_exec()
{
  rgw_bucket_object_pre_exec(s);
}
+
+void RGWStatBucket::execute(optional_yield y)
+{
+ if (!s->bucket_exists) {
+ op_ret = -ERR_NO_SUCH_BUCKET;
+ return;
+ }
+
+ op_ret = driver->get_bucket(this, s->user.get(), s->bucket->get_key(), &bucket, y);
+ if (op_ret) {
+ return;
+ }
+ op_ret = bucket->update_container_stats(s);
+}
+
// Authorize a bucket listing. Note: parses request params here (not in
// execute()) because prefix/delimiter/max-keys must be in the IAM
// environment before the policy check runs; also sets op_ret on
// parameter failure as a side effect.
int RGWListBucket::verify_permission(optional_yield y)
{
  op_ret = get_params(y);
  if (op_ret < 0) {
    return op_ret;
  }
  // Expose listing parameters as IAM condition keys (s3:prefix,
  // s3:delimiter, s3:max-keys) for policy evaluation.
  if (!prefix.empty())
    s->env.emplace("s3:prefix", prefix);

  if (!delimiter.empty())
    s->env.emplace("s3:delimiter", delimiter);

  s->env.emplace("s3:max-keys", std::to_string(max));

  auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false);
  if (has_s3_resource_tag)
    rgw_iam_add_buckettags(this, s);

  // Version listing is a distinct permission from plain object listing.
  if (!verify_bucket_permission(this,
                                s,
                                list_versions ?
                                rgw::IAM::s3ListBucketVersions :
                                rgw::IAM::s3ListBucket)) {
    return -EACCES;
  }

  return 0;
}
+
// Parse the request's max-keys value into `max`, clamped to
// [0, rgw_max_listing_results], falling back to default_max when absent.
// Returns 0 on success, negative error on a malformed value.
int RGWListBucket::parse_max_keys()
{
  // Bound max value of max-keys to configured value for security
  // Bound min value of max-keys to '0'
  // Some S3 clients explicitly send max-keys=0 to detect if the bucket is
  // empty without listing any items.
  return parse_value_and_bound(max_keys, max, 0,
                               g_conf().get_val<uint64_t>("rgw_max_listing_results"),
                               default_max);
}
+
// Standard pre-execution hook: common bucket/object request logging.
void RGWListBucket::pre_exec()
{
  rgw_bucket_object_pre_exec(s);
}
+
+void RGWListBucket::execute(optional_yield y)
+{
+ if (!s->bucket_exists) {
+ op_ret = -ERR_NO_SUCH_BUCKET;
+ return;
+ }
+
+ if (allow_unordered && !delimiter.empty()) {
+ ldpp_dout(this, 0) <<
+ "ERROR: unordered bucket listing requested with a delimiter" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+
+ if (need_container_stats()) {
+ op_ret = s->bucket->update_container_stats(s);
+ }
+
+ rgw::sal::Bucket::ListParams params;
+ params.prefix = prefix;
+ params.delim = delimiter;
+ params.marker = marker;
+ params.end_marker = end_marker;
+ params.list_versions = list_versions;
+ params.allow_unordered = allow_unordered;
+ params.shard_id = shard_id;
+
+ rgw::sal::Bucket::ListResults results;
+
+ op_ret = s->bucket->list(this, params, max, results, y);
+ if (op_ret >= 0) {
+ next_marker = results.next_marker;
+ is_truncated = results.is_truncated;
+ objs = std::move(results.objs);
+ common_prefixes = std::move(results.common_prefixes);
+ }
+}
+
+int RGWGetBucketLogging::verify_permission(optional_yield y)
+{
+ auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false);
+ if (has_s3_resource_tag)
+ rgw_iam_add_buckettags(this, s);
+
+ return verify_bucket_owner_or_policy(s, rgw::IAM::s3GetBucketLogging);
+}
+
+int RGWGetBucketLocation::verify_permission(optional_yield y)
+{
+ auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false);
+ if (has_s3_resource_tag)
+ rgw_iam_add_buckettags(this, s);
+
+ return verify_bucket_owner_or_policy(s, rgw::IAM::s3GetBucketLocation);
+}
+
// Authorize bucket creation: requires an authenticated identity with
// s3:CreateBucket on the target ARN, same-tenant (or assumed-role
// cross-tenant) placement, and headroom under the user's bucket quota.
// Note: sets op_ret as a side effect when listing buckets fails.
int RGWCreateBucket::verify_permission(optional_yield y)
{
  /* This check is mostly needed for S3 that doesn't support account ACL.
   * Swift doesn't allow to delegate any permission to an anonymous user,
   * so it will become an early exit in such case. */
  if (s->auth.identity->is_anonymous()) {
    return -EACCES;
  }

  rgw_bucket bucket;
  bucket.name = s->bucket_name;
  bucket.tenant = s->bucket_tenant;
  ARN arn = ARN(bucket);
  if (!verify_user_permission(this, s, arn, rgw::IAM::s3CreateBucket, false)) {
    return -EACCES;
  }

  if (s->user->get_tenant() != s->bucket_tenant) {
    //AssumeRole is meant for cross account access
    if (s->auth.identity->get_identity_type() != TYPE_ROLE) {
      ldpp_dout(this, 10) << "user cannot create a bucket in a different tenant"
                          << " (user_id.tenant=" << s->user->get_tenant()
                          << " requested=" << s->bucket_tenant << ")"
                          << dendl;
      return -EACCES;
    }
  }

  // A negative max_buckets setting denies bucket creation outright.
  if (s->user->get_max_buckets() < 0) {
    return -EPERM;
  }

  // A positive max_buckets enforces a per-user bucket-count quota;
  // zero means unlimited (this branch is skipped).
  if (s->user->get_max_buckets()) {
    rgw::sal::BucketList buckets;
    string marker;
    op_ret = s->user->list_buckets(this, marker, string(), s->user->get_max_buckets(),
                                   false, buckets, y);
    if (op_ret < 0) {
      return op_ret;
    }

    if ((int)buckets.count() >= s->user->get_max_buckets()) {
      return -ERR_TOO_MANY_BUCKETS;
    }
  }

  return 0;
}
+
// Standard pre-execution hook: common bucket/object request logging.
void RGWCreateBucket::pre_exec()
{
  rgw_bucket_object_pre_exec(s);
}
+
+static void prepare_add_del_attrs(const map<string, bufferlist>& orig_attrs,
+ map<string, bufferlist>& out_attrs,
+ map<string, bufferlist>& out_rmattrs)
+{
+ for (const auto& kv : orig_attrs) {
+ const string& name = kv.first;
+
+ /* Check if the attr is user-defined metadata item. */
+ if (name.compare(0, sizeof(RGW_ATTR_META_PREFIX) - 1,
+ RGW_ATTR_META_PREFIX) == 0) {
+ /* For the objects all existing meta attrs have to be removed. */
+ out_rmattrs[name] = kv.second;
+ } else if (out_attrs.find(name) == std::end(out_attrs)) {
+ out_attrs[name] = kv.second;
+ }
+ }
+}
+
+/* Fuse resource metadata basing on original attributes in @orig_attrs, set
+ * of _custom_ attribute names to remove in @rmattr_names and attributes in
+ * @out_attrs. Place results in @out_attrs.
+ *
+ * NOTE: it's supposed that all special attrs already present in @out_attrs
+ * will be preserved without any change. Special attributes are those which
+ * names start with RGW_ATTR_META_PREFIX. They're complement to custom ones
+ * used for X-Account-Meta-*, X-Container-Meta-*, X-Amz-Meta and so on. */
+static void prepare_add_del_attrs(const map<string, bufferlist>& orig_attrs,
+ const set<string>& rmattr_names,
+ map<string, bufferlist>& out_attrs)
+{
+ for (const auto& kv : orig_attrs) {
+ const string& name = kv.first;
+
+ /* Check if the attr is user-defined metadata item. */
+ if (name.compare(0, strlen(RGW_ATTR_META_PREFIX),
+ RGW_ATTR_META_PREFIX) == 0) {
+ /* For the buckets all existing meta attrs are preserved,
+ except those that are listed in rmattr_names. */
+ if (rmattr_names.find(name) != std::end(rmattr_names)) {
+ const auto aiter = out_attrs.find(name);
+
+ if (aiter != std::end(out_attrs)) {
+ out_attrs.erase(aiter);
+ }
+ } else {
+ /* emplace() won't alter the map if the key is already present.
+ * This behaviour is fully intensional here. */
+ out_attrs.emplace(kv);
+ }
+ } else if (out_attrs.find(name) == std::end(out_attrs)) {
+ out_attrs[name] = kv.second;
+ }
+ }
+}
+
+
// Copy the request's generic (header-derived) attributes into @out_attrs,
// overwriting any existing value for the same key.
static void populate_with_generic_attrs(const req_state * const s,
                                        map<string, bufferlist>& out_attrs)
{
  for (const auto& kv : s->generic_attrs) {
    bufferlist& attrbl = out_attrs[kv.first];
    const string& val = kv.second;
    attrbl.clear();
    // size() + 1 deliberately includes the terminating NUL in the
    // stored attribute value.
    attrbl.append(val.c_str(), val.size() + 1);
  }
}
+
+
// Extract Swift quota attributes (max objects / max size) out of
// @add_attrs and @rmattr_names into @quota, removing them from the attr
// map so they are not stored as plain xattrs. Sets *quota_extracted (if
// provided) when any quota attribute was seen. Returns 0 on success or
// -EINVAL on an unparseable numeric value.
static int filter_out_quota_info(std::map<std::string, bufferlist>& add_attrs,
                                 const std::set<std::string>& rmattr_names,
                                 RGWQuotaInfo& quota,
                                 bool * quota_extracted = nullptr)
{
  bool extracted = false;

  /* Put new limit on max objects. */
  auto iter = add_attrs.find(RGW_ATTR_QUOTA_NOBJS);
  std::string err;
  if (std::end(add_attrs) != iter) {
    quota.max_objects =
      static_cast<int64_t>(strict_strtoll(iter->second.c_str(), 10, &err));
    if (!err.empty()) {
      return -EINVAL;
    }
    add_attrs.erase(iter);
    extracted = true;
  }

  /* Put new limit on bucket (container) size. */
  iter = add_attrs.find(RGW_ATTR_QUOTA_MSIZE);
  if (iter != add_attrs.end()) {
    quota.max_size =
      static_cast<int64_t>(strict_strtoll(iter->second.c_str(), 10, &err));
    if (!err.empty()) {
      return -EINVAL;
    }
    add_attrs.erase(iter);
    extracted = true;
  }

  // Attributes listed for removal reset the corresponding limit to
  // "unlimited" (-1).
  for (const auto& name : rmattr_names) {
    /* Remove limit on max objects. */
    if (name.compare(RGW_ATTR_QUOTA_NOBJS) == 0) {
      quota.max_objects = -1;
      extracted = true;
    }

    /* Remove limit on max bucket size. */
    if (name.compare(RGW_ATTR_QUOTA_MSIZE) == 0) {
      quota.max_size = -1;
      extracted = true;
    }
  }

  /* Swift requries checking on raw usage instead of the 4 KiB rounded one. */
  quota.check_on_raw = true;
  // Quota is active only while at least one positive limit remains.
  quota.enabled = quota.max_size > 0 || quota.max_objects > 0;

  if (quota_extracted) {
    *quota_extracted = extracted;
  }

  return 0;
}
+
+
// Move Swift static-website attributes out of @add_attrs into the
// corresponding fields of @ws_conf; names listed in @rmattr_names reset
// their field to empty. The attrs are erased from the map so they are
// not stored as plain xattrs.
static void filter_out_website(std::map<std::string, ceph::bufferlist>& add_attrs,
                               const std::set<std::string>& rmattr_names,
                               RGWBucketWebsiteConf& ws_conf)
{
  std::string lstval; // staging for the boolean "listings" attr

  /* Let's define a mapping between each custom attribute and the memory where
   * attribute's value should be stored. The memory location is expressed by
   * a non-const reference. */
  const auto mapping = {
    std::make_pair(RGW_ATTR_WEB_INDEX, std::ref(ws_conf.index_doc_suffix)),
    std::make_pair(RGW_ATTR_WEB_ERROR, std::ref(ws_conf.error_doc)),
    std::make_pair(RGW_ATTR_WEB_LISTINGS, std::ref(lstval)),
    std::make_pair(RGW_ATTR_WEB_LIST_CSS, std::ref(ws_conf.listing_css_doc)),
    std::make_pair(RGW_ATTR_SUBDIR_MARKER, std::ref(ws_conf.subdir_marker))
  };

  for (const auto& kv : mapping) {
    const char * const key = kv.first;
    auto& target = kv.second;

    auto iter = add_attrs.find(key);

    if (std::end(add_attrs) != iter) {
      /* The "target" is a reference to ws_conf. */
      target = iter->second.c_str();
      add_attrs.erase(iter);
    }

    // Removal wins over any value extracted above.
    if (rmattr_names.count(key)) {
      target = std::string();
    }
  }

  // The listings flag is textual ("true"/"false", case-insensitive).
  if (! lstval.empty()) {
    ws_conf.listing_enabled = boost::algorithm::iequals(lstval, "true");
  }
}
+
+
// Create (or idempotently re-create) a bucket: validate the location
// constraint and placement target, read any pre-existing bucket info,
// assemble ACL/CORS/quota/website metadata, then create the bucket.
// If the bucket already existed, re-apply metadata (Swift semantics)
// and report -ERR_BUCKET_EXISTS where appropriate.
void RGWCreateBucket::execute(optional_yield y)
{
  buffer::list aclbl;
  buffer::list corsbl;
  string bucket_name = rgw_make_bucket_entry_name(s->bucket_tenant, s->bucket_name);

  op_ret = get_params(y);
  if (op_ret < 0)
    return;

  // LocationConstraint must name a zonegroup API known to this realm
  // (unless relaxed enforcement is configured).
  if (!relaxed_region_enforcement &&
      !location_constraint.empty() &&
      !driver->get_zone()->has_zonegroup_api(location_constraint)) {
    ldpp_dout(this, 0) << "location constraint (" << location_constraint << ")"
                       << " can't be found." << dendl;
    op_ret = -ERR_INVALID_LOCATION_CONSTRAINT;
    s->err.message = "The specified location-constraint is not valid";
    return;
  }

  // On non-master zonegroups the constraint must match the local
  // zonegroup's API name.
  if (!relaxed_region_enforcement && !driver->get_zone()->get_zonegroup().is_master_zonegroup() && !location_constraint.empty() &&
      driver->get_zone()->get_zonegroup().get_api_name() != location_constraint) {
    ldpp_dout(this, 0) << "location constraint (" << location_constraint << ")"
                       << " doesn't match zonegroup" << " (" << driver->get_zone()->get_zonegroup().get_api_name() << ")"
                       << dendl;
    op_ret = -ERR_INVALID_LOCATION_CONSTRAINT;
    s->err.message = "The specified location-constraint is not valid";
    return;
  }

  // The requested placement rule must exist in this zonegroup.
  std::set<std::string> names;
  driver->get_zone()->get_zonegroup().get_placement_target_names(names);
  if (!placement_rule.name.empty() &&
      !names.count(placement_rule.name)) {
    ldpp_dout(this, 0) << "placement target (" << placement_rule.name << ")"
                       << " doesn't exist in the placement targets of zonegroup"
                       << " (" << driver->get_zone()->get_zonegroup().get_api_name() << ")" << dendl;
    op_ret = -ERR_INVALID_LOCATION_CONSTRAINT;
    s->err.message = "The specified placement target does not exist";
    return;
  }

  /* we need to make sure we read bucket info, it's not read before for this
   * specific request */
  {
    std::unique_ptr<rgw::sal::Bucket> tmp_bucket;
    op_ret = driver->get_bucket(this, s->user.get(), s->bucket_tenant,
                                s->bucket_name, &tmp_bucket, y);
    if (op_ret < 0 && op_ret != -ENOENT)
      return;
    s->bucket_exists = (op_ret != -ENOENT);

    if (s->bucket_exists) {
      // A bucket homed in a different zonegroup cannot be re-created here.
      if (!s->system_request &&
          driver->get_zone()->get_zonegroup().get_id() !=
          tmp_bucket->get_info().zonegroup) {
        op_ret = -EEXIST;
        return;
      }
      /* Initialize info from req_state */
      info = tmp_bucket->get_info();
    }
  }

  s->bucket_owner.set_id(s->user->get_id()); /* XXX dang use s->bucket->owner */
  s->bucket_owner.set_name(s->user->get_display_name());

  // System requests may pin an explicit zonegroup; otherwise the local
  // zonegroup is used.
  string zonegroup_id;

  if (s->system_request) {
    zonegroup_id = s->info.args.get(RGW_SYS_PARAM_PREFIX "zonegroup");
    if (zonegroup_id.empty()) {
      zonegroup_id = driver->get_zone()->get_zonegroup().get_id();
    }
  } else {
    zonegroup_id = driver->get_zone()->get_zonegroup().get_id();
  }

  /* Encode special metadata first as we're using std::map::emplace under
   * the hood. This method will add the new items only if the map doesn't
   * contain such keys yet. */
  policy.encode(aclbl);
  emplace_attr(RGW_ATTR_ACL, std::move(aclbl));

  if (has_cors) {
    cors_config.encode(corsbl);
    emplace_attr(RGW_ATTR_CORS, std::move(corsbl));
  }

  RGWQuotaInfo quota_info;
  const RGWQuotaInfo * pquota_info = nullptr;
  if (need_metadata_upload()) {
    /* It's supposed that following functions WILL NOT change any special
     * attributes (like RGW_ATTR_ACL) if they are already present in attrs. */
    op_ret = rgw_get_request_metadata(this, s->cct, s->info, attrs, false);
    if (op_ret < 0) {
      return;
    }
    prepare_add_del_attrs(s->bucket_attrs, rmattr_names, attrs);
    populate_with_generic_attrs(s, attrs);

    op_ret = filter_out_quota_info(attrs, rmattr_names, quota_info);
    if (op_ret < 0) {
      return;
    } else {
      pquota_info = &quota_info;
    }

    /* Web site of Swift API. */
    filter_out_website(attrs, rmattr_names, info.website_conf);
    info.has_website = !info.website_conf.is_empty();
  }

  rgw_bucket tmp_bucket;
  tmp_bucket.tenant = s->bucket_tenant; /* ignored if bucket exists */
  tmp_bucket.name = s->bucket_name;

  /* Handle updates of the metadata for Swift's object versioning. */
  if (swift_ver_location) {
    info.swift_ver_location = *swift_ver_location;
    info.swift_versioning = (! swift_ver_location->empty());
  }

  /* We're replacing bucket with the newly created one */
  ldpp_dout(this, 10) << "user=" << s->user << " bucket=" << tmp_bucket << dendl;
  op_ret = s->user->create_bucket(this, tmp_bucket, zonegroup_id,
                                  placement_rule,
                                  info.swift_ver_location,
                                  pquota_info, policy, attrs, info, ep_objv,
                                  true, obj_lock_enabled, &s->bucket_exists, s->info,
                                  &s->bucket, y);

  /* continue if EEXIST and create_bucket will fail below. this way we can
   * recover from a partial create by retrying it. */
  ldpp_dout(this, 20) << "rgw_create_bucket returned ret=" << op_ret << " bucket=" << s->bucket.get() << dendl;

  if (op_ret)
    return;

  const bool existed = s->bucket_exists;
  if (need_metadata_upload() && existed) {
    /* OK, it looks we lost race with another request. As it's required to
     * handle metadata fusion and upload, the whole operation becomes very
     * similar in nature to PutMetadataBucket. However, as the attrs may
     * changed in the meantime, we have to refresh. */
    short tries = 0;
    do {
      map<string, bufferlist> battrs;

      op_ret = s->bucket->load_bucket(this, y);
      if (op_ret < 0) {
        return;
      } else if (!s->bucket->is_owner(s->user.get())) {
        /* New bucket doesn't belong to the account we're operating on. */
        op_ret = -EEXIST;
        return;
      } else {
        s->bucket_attrs = s->bucket->get_attrs();
      }

      attrs.clear();

      op_ret = rgw_get_request_metadata(this, s->cct, s->info, attrs, false);
      if (op_ret < 0) {
        return;
      }
      prepare_add_del_attrs(s->bucket_attrs, rmattr_names, attrs);
      populate_with_generic_attrs(s, attrs);
      op_ret = filter_out_quota_info(attrs, rmattr_names, s->bucket->get_info().quota);
      if (op_ret < 0) {
        return;
      }

      /* Handle updates of the metadata for Swift's object versioning. */
      if (swift_ver_location) {
        s->bucket->get_info().swift_ver_location = *swift_ver_location;
        s->bucket->get_info().swift_versioning = (! swift_ver_location->empty());
      }

      /* Web site of Swift API. */
      filter_out_website(attrs, rmattr_names, s->bucket->get_info().website_conf);
      s->bucket->get_info().has_website = !s->bucket->get_info().website_conf.is_empty();

      /* This will also set the quota on the bucket. */
      op_ret = s->bucket->merge_and_store_attrs(this, attrs, y);
    } while (op_ret == -ECANCELED && tries++ < 20);

    /* Restore the proper return code. */
    if (op_ret >= 0) {
      op_ret = -ERR_BUCKET_EXISTS;
    }
  }
}
+
+int RGWDeleteBucket::verify_permission(optional_yield y)
+{
+ auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false);
+ if (has_s3_resource_tag)
+ rgw_iam_add_buckettags(this, s);
+
+ if (!verify_bucket_permission(this, s, rgw::IAM::s3DeleteBucket)) {
+ return -EACCES;
+ }
+
+ return 0;
+}
+
// Standard pre-execution hook: common bucket/object request logging.
void RGWDeleteBucket::pre_exec()
{
  rgw_bucket_object_pre_exec(s);
}
+
+void RGWDeleteBucket::execute(optional_yield y)
+{
+ if (s->bucket_name.empty()) {
+ op_ret = -EINVAL;
+ return;
+ }
+
+ if (!s->bucket_exists) {
+ ldpp_dout(this, 0) << "ERROR: bucket " << s->bucket_name << " not found" << dendl;
+ op_ret = -ERR_NO_SUCH_BUCKET;
+ return;
+ }
+ RGWObjVersionTracker ot;
+ ot.read_version = s->bucket->get_version();
+
+ if (s->system_request) {
+ string tag = s->info.args.get(RGW_SYS_PARAM_PREFIX "tag");
+ string ver_str = s->info.args.get(RGW_SYS_PARAM_PREFIX "ver");
+ if (!tag.empty()) {
+ ot.read_version.tag = tag;
+ uint64_t ver;
+ string err;
+ ver = strict_strtol(ver_str.c_str(), 10, &err);
+ if (!err.empty()) {
+ ldpp_dout(this, 0) << "failed to parse ver param" << dendl;
+ op_ret = -EINVAL;
+ return;
+ }
+ ot.read_version.ver = ver;
+ }
+ }
+
+ op_ret = s->bucket->sync_user_stats(this, y);
+ if ( op_ret < 0) {
+ ldpp_dout(this, 1) << "WARNING: failed to sync user stats before bucket delete: op_ret= " << op_ret << dendl;
+ }
+
+ op_ret = s->bucket->check_empty(this, y);
+ if (op_ret < 0) {
+ return;
+ }
+
+ bufferlist in_data;
+ op_ret = driver->forward_request_to_master(this, s->user.get(), &ot.read_version, in_data, nullptr, s->info, y);
+ if (op_ret < 0) {
+ if (op_ret == -ENOENT) {
+ /* adjust error, we want to return with NoSuchBucket and not
+ * NoSuchKey */
+ op_ret = -ERR_NO_SUCH_BUCKET;
+ }
+ return;
+ }
+
+ op_ret = rgw_remove_sse_s3_bucket_key(s);
+ if (op_ret != 0) {
+ // do nothing; it will already have been logged
+ }
+
+ op_ret = s->bucket->remove_bucket(this, false, false, nullptr, y);
+ if (op_ret < 0 && op_ret == -ECANCELED) {
+ // lost a race, either with mdlog sync or another delete bucket operation.
+ // in either case, we've already called ctl.bucket->unlink_bucket()
+ op_ret = 0;
+ }
+
+ return;
+}
+
// Parse copy-source related request headers before generic op init.
//
// Handles x-amz-copy-source ("[/][tenant:]bucket/object[?versionId=v]")
// and x-amz-copy-source-range ("bytes=first-last"), filling
// copy_source_{tenant,bucket,object}_name, copy_source_version_id,
// copy_source_bucket_info and copy_source_range_{fst,lst}.
// Returns 0 on success, or a negative error code on malformed headers
// or a missing/unloadable source bucket.
int RGWPutObj::init_processing(optional_yield y) {
  copy_source = url_decode(s->info.env->get("HTTP_X_AMZ_COPY_SOURCE", ""));
  copy_source_range = s->info.env->get("HTTP_X_AMZ_COPY_SOURCE_RANGE");
  size_t pos;
  int ret;

  /* handle x-amz-copy-source */
  std::string_view cs_view(copy_source);
  if (! cs_view.empty()) {
    if (cs_view[0] == '/')
      cs_view.remove_prefix(1);
    copy_source_bucket_name = std::string(cs_view);
    // split "bucket/object" at the first '/'
    pos = copy_source_bucket_name.find("/");
    if (pos == std::string::npos) {
      ret = -EINVAL;
      ldpp_dout(this, 5) << "x-amz-copy-source bad format" << dendl;
      return ret;
    }
    copy_source_object_name =
      copy_source_bucket_name.substr(pos + 1, copy_source_bucket_name.size());
    copy_source_bucket_name = copy_source_bucket_name.substr(0, pos);
#define VERSION_ID_STR "?versionId="
    // strip an optional "?versionId=<id>" suffix; only the object-name
    // part is url-decoded, the version id is taken verbatim
    pos = copy_source_object_name.find(VERSION_ID_STR);
    if (pos == std::string::npos) {
      copy_source_object_name = url_decode(copy_source_object_name);
    } else {
      copy_source_version_id =
        copy_source_object_name.substr(pos + sizeof(VERSION_ID_STR) - 1);
      copy_source_object_name =
        url_decode(copy_source_object_name.substr(0, pos));
    }
    // split an optional "tenant:" prefix off the bucket name
    pos = copy_source_bucket_name.find(":");
    if (pos == std::string::npos) {
      // if tenant is not specified in x-amz-copy-source, use tenant of the requester
      copy_source_tenant_name = s->user->get_tenant();
    } else {
      copy_source_tenant_name = copy_source_bucket_name.substr(0, pos);
      copy_source_bucket_name = copy_source_bucket_name.substr(pos + 1, copy_source_bucket_name.size());
      if (copy_source_bucket_name.empty()) {
        ret = -EINVAL;
        ldpp_dout(this, 5) << "source bucket name is empty" << dendl;
        return ret;
      }
    }
    // resolve and load the source bucket so later stages (permission
    // checks, data copy) can use copy_source_bucket_info
    std::unique_ptr<rgw::sal::Bucket> bucket;
    ret = driver->get_bucket(this, s->user.get(), copy_source_tenant_name, copy_source_bucket_name,
                             &bucket, y);
    if (ret < 0) {
      ldpp_dout(this, 5) << __func__ << "(): get_bucket() returned ret=" << ret << dendl;
      if (ret == -ENOENT) {
        ret = -ERR_NO_SUCH_BUCKET;
      }
      return ret;
    }

    ret = bucket->load_bucket(this, y);
    if (ret < 0) {
      ldpp_dout(this, 5) << __func__ << "(): load_bucket() returned ret=" << ret << dendl;
      return ret;
    }
    copy_source_bucket_info = bucket->get_info();

    /* handle x-amz-copy-source-range */
    if (copy_source_range) {
      string range = copy_source_range;
      // header must start with "bytes="
      pos = range.find("bytes=");
      if (pos == std::string::npos || pos != 0) {
        ret = -EINVAL;
        ldpp_dout(this, 5) << "x-amz-copy-source-range bad format" << dendl;
        return ret;
      }
      /* 6 is the length of "bytes=" */
      range = range.substr(pos + 6);
      pos = range.find("-");
      if (pos == std::string::npos) {
        ret = -EINVAL;
        ldpp_dout(this, 5) << "x-amz-copy-source-range bad format" << dendl;
        return ret;
      }
      string first = range.substr(0, pos);
      string last = range.substr(pos + 1);
      // both endpoints must be plain unsigned decimal numbers
      if (first.find_first_not_of("0123456789") != std::string::npos ||
          last.find_first_not_of("0123456789") != std::string::npos) {
        ldpp_dout(this, 5) << "x-amz-copy-source-range bad format not an integer" << dendl;
        ret = -EINVAL;
        return ret;
      }
      copy_source_range_fst = strtoull(first.c_str(), NULL, 10);
      copy_source_range_lst = strtoull(last.c_str(), NULL, 10);
      // the range is inclusive, so first == last is a valid 1-byte range
      if (copy_source_range_fst > copy_source_range_lst) {
        ret = -ERANGE;
        ldpp_dout(this, 5) << "x-amz-copy-source-range bad format first number bigger than second" << dendl;
        return ret;
      }
    }

  } /* copy_source */
  return RGWOp::init_processing(y);
}
+
+int RGWPutObj::verify_permission(optional_yield y)
+{
+ if (! copy_source.empty()) {
+
+ RGWAccessControlPolicy cs_acl(s->cct);
+ boost::optional<Policy> policy;
+ map<string, bufferlist> cs_attrs;
+ std::unique_ptr<rgw::sal::Bucket> cs_bucket;
+ int ret = driver->get_bucket(NULL, copy_source_bucket_info, &cs_bucket);
+ if (ret < 0)
+ return ret;
+
+ std::unique_ptr<rgw::sal::Object> cs_object =
+ cs_bucket->get_object(rgw_obj_key(copy_source_object_name, copy_source_version_id));
+
+ cs_object->set_atomic();
+ cs_object->set_prefetch_data();
+
+ /* check source object permissions */
+ if (ret = read_obj_policy(this, driver, s, copy_source_bucket_info, cs_attrs, &cs_acl, nullptr,
+ policy, cs_bucket.get(), cs_object.get(), y, true); ret < 0) {
+ return ret;
+ }
+
+ /* admin request overrides permission checks */
+ if (! s->auth.identity->is_admin_of(cs_acl.get_owner().get_id())) {
+ if (policy || ! s->iam_user_policies.empty() || !s->session_policies.empty()) {
+ //add source object tags for permission evaluation
+ auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, policy, s->iam_user_policies, s->session_policies);
+ if (has_s3_existing_tag || has_s3_resource_tag)
+ rgw_iam_add_objtags(this, s, cs_object.get(), has_s3_existing_tag, has_s3_resource_tag);
+ auto usr_policy_res = Effect::Pass;
+ rgw::ARN obj_arn(cs_object->get_obj());
+ for (auto& user_policy : s->iam_user_policies) {
+ if (usr_policy_res = user_policy.eval(s->env, *s->auth.identity,
+ cs_object->get_instance().empty() ?
+ rgw::IAM::s3GetObject :
+ rgw::IAM::s3GetObjectVersion,
+ obj_arn); usr_policy_res == Effect::Deny)
+ return -EACCES;
+ else if (usr_policy_res == Effect::Allow)
+ break;
+ }
+ rgw::IAM::Effect e = Effect::Pass;
+ if (policy) {
+ rgw::ARN obj_arn(cs_object->get_obj());
+ e = policy->eval(s->env, *s->auth.identity,
+ cs_object->get_instance().empty() ?
+ rgw::IAM::s3GetObject :
+ rgw::IAM::s3GetObjectVersion,
+ obj_arn);
+ }
+ if (e == Effect::Deny) {
+ return -EACCES;
+ } else if (usr_policy_res == Effect::Pass && e == Effect::Pass &&
+ !cs_acl.verify_permission(this, *s->auth.identity, s->perm_mask,
+ RGW_PERM_READ)) {
+ return -EACCES;
+ }
+ rgw_iam_remove_objtags(this, s, cs_object.get(), has_s3_existing_tag, has_s3_resource_tag);
+ } else if (!cs_acl.verify_permission(this, *s->auth.identity, s->perm_mask,
+ RGW_PERM_READ)) {
+ return -EACCES;
+ }
+ }
+ }
+
+ if (s->bucket_access_conf && s->bucket_access_conf->block_public_acls()) {
+ if (s->canned_acl.compare("public-read") ||
+ s->canned_acl.compare("public-read-write") ||
+ s->canned_acl.compare("authenticated-read"))
+ return -EACCES;
+ }
+
+ auto op_ret = get_params(y);
+ if (op_ret < 0) {
+ ldpp_dout(this, 20) << "get_params() returned ret=" << op_ret << dendl;
+ return op_ret;
+ }
+
+ if (s->iam_policy || ! s->iam_user_policies.empty() || !s->session_policies.empty()) {
+ rgw_add_grant_to_iam_environment(s->env, s);
+
+ rgw_add_to_iam_environment(s->env, "s3:x-amz-acl", s->canned_acl);
+
+ if (obj_tags != nullptr && obj_tags->count() > 0){
+ auto tags = obj_tags->get_tags();
+ for (const auto& kv: tags){
+ rgw_add_to_iam_environment(s->env, "s3:RequestObjectTag/"+kv.first, kv.second);
+ }
+ }
+
+ // add server-side encryption headers
+ rgw_iam_add_crypt_attrs(s->env, s->info.crypt_attribute_map);
+
+ // Add bucket tags for authorization
+ auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false);
+ if (has_s3_resource_tag)
+ rgw_iam_add_buckettags(this, s);
+
+ auto identity_policy_res = eval_identity_or_session_policies(this, s->iam_user_policies, s->env,
+ rgw::IAM::s3PutObject,
+ s->object->get_obj());
+ if (identity_policy_res == Effect::Deny)
+ return -EACCES;
+
+ rgw::IAM::Effect e = Effect::Pass;
+ rgw::IAM::PolicyPrincipal princ_type = rgw::IAM::PolicyPrincipal::Other;
+ if (s->iam_policy) {
+ ARN obj_arn(s->object->get_obj());
+ e = s->iam_policy->eval(s->env, *s->auth.identity,
+ rgw::IAM::s3PutObject,
+ obj_arn,
+ princ_type);
+ }
+ if (e == Effect::Deny) {
+ return -EACCES;
+ }
+
+ if (!s->session_policies.empty()) {
+ auto session_policy_res = eval_identity_or_session_policies(this, s->session_policies, s->env,
+ rgw::IAM::s3PutObject,
+ s->object->get_obj());
+ if (session_policy_res == Effect::Deny) {
+ return -EACCES;
+ }
+ if (princ_type == rgw::IAM::PolicyPrincipal::Role) {
+ //Intersection of session policy and identity policy plus intersection of session policy and bucket policy
+ if ((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) ||
+ (session_policy_res == Effect::Allow && e == Effect::Allow))
+ return 0;
+ } else if (princ_type == rgw::IAM::PolicyPrincipal::Session) {
+ //Intersection of session policy and identity policy plus bucket policy
+ if ((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) || e == Effect::Allow)
+ return 0;
+ } else if (princ_type == rgw::IAM::PolicyPrincipal::Other) {// there was no match in the bucket policy
+ if (session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow)
+ return 0;
+ }
+ return -EACCES;
+ }
+ if (e == Effect::Allow || identity_policy_res == Effect::Allow) {
+ return 0;
+ }
+ }
+
+ if (!verify_bucket_permission_no_policy(this, s, RGW_PERM_WRITE)) {
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+
+void RGWPutObj::pre_exec()
+{
+ rgw_bucket_object_pre_exec(s);
+}
+
+class RGWPutObj_CB : public RGWGetObj_Filter
+{
+ RGWPutObj *op;
+public:
+ explicit RGWPutObj_CB(RGWPutObj *_op) : op(_op) {}
+ ~RGWPutObj_CB() override {}
+
+ int handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) override {
+ return op->get_data_cb(bl, bl_ofs, bl_len);
+ }
+};
+
+int RGWPutObj::get_data_cb(bufferlist& bl, off_t bl_ofs, off_t bl_len)
+{
+ bufferlist bl_tmp;
+ bl.begin(bl_ofs).copy(bl_len, bl_tmp);
+
+ bl_aux.append(bl_tmp);
+
+ return bl_len;
+}
+
// Read bytes [fst, lst] (inclusive) of the copy-source object into bl.
//
// Builds a GetObj filter chain — optional decompression and decryption
// feeding RGWPutObj_CB, which accumulates chunks in bl_aux — then
// iterates the source range through it and claims the accumulated data
// into the caller's bufferlist.  Returns >= 0 on success, negative
// error code otherwise.  NOTE: decode/decrypt failures are also stored
// in op_ret before returning.
int RGWPutObj::get_data(const off_t fst, const off_t lst, bufferlist& bl)
{
  RGWPutObj_CB cb(this);
  RGWGetObj_Filter* filter = &cb;
  boost::optional<RGWGetObj_Decompress> decompress;
  std::unique_ptr<RGWGetObj_Filter> decrypt;
  RGWCompressionInfo cs_info;
  map<string, bufferlist> attrs;
  int ret = 0;

  uint64_t obj_size;
  int64_t new_ofs, new_end;

  new_ofs = fst;
  new_end = lst;

  std::unique_ptr<rgw::sal::Bucket> bucket;
  ret = driver->get_bucket(nullptr, copy_source_bucket_info, &bucket);
  if (ret < 0)
    return ret;

  std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(rgw_obj_key(copy_source_object_name, copy_source_version_id));
  std::unique_ptr<rgw::sal::Object::ReadOp> read_op(obj->get_read_op());

  ret = read_op->prepare(s->yield, this);
  if (ret < 0)
    return ret;

  obj_size = obj->get_obj_size();

  // if the source is compressed, decompress on the fly and use the
  // original (uncompressed) size for range arithmetic
  bool need_decompress;
  op_ret = rgw_compression_info_from_attrset(obj->get_attrs(), need_decompress, cs_info);
  if (op_ret < 0) {
    ldpp_dout(this, 0) << "ERROR: failed to decode compression info" << dendl;
    return -EIO;
  }

  bool partial_content = true;
  if (need_decompress)
  {
    obj_size = cs_info.orig_size;
    decompress.emplace(s->cct, &cs_info, partial_content, filter);
    filter = &*decompress;
  }

  // install a decryption filter if the source object is encrypted;
  // the manifest attr (when present) is passed through for SSE setup
  auto attr_iter = obj->get_attrs().find(RGW_ATTR_MANIFEST);
  op_ret = this->get_decrypt_filter(&decrypt,
                                    filter,
                                    obj->get_attrs(),
                                    attr_iter != obj->get_attrs().end() ? &(attr_iter->second) : nullptr);
  if (decrypt != nullptr) {
    filter = decrypt.get();
  }
  if (op_ret < 0) {
    return op_ret;
  }

  // clamp the requested range against the actual object size
  ret = obj->range_to_ofs(obj_size, new_ofs, new_end);
  if (ret < 0)
    return ret;

  filter->fixup_range(new_ofs, new_end);
  ret = read_op->iterate(this, new_ofs, new_end, filter, s->yield);

  if (ret >= 0)
    ret = filter->flush();

  // hand the buffered chunks (collected via get_data_cb) to the caller
  bl.claim_append(bl_aux);

  return ret;
}
+
+// special handling for compression type = "random" with multipart uploads
+static CompressorRef get_compressor_plugin(const req_state *s,
+ const std::string& compression_type)
+{
+ if (compression_type != "random") {
+ return Compressor::create(s->cct, compression_type);
+ }
+
+ bool is_multipart{false};
+ const auto& upload_id = s->info.args.get("uploadId", &is_multipart);
+
+ if (!is_multipart) {
+ return Compressor::create(s->cct, compression_type);
+ }
+
+ // use a hash of the multipart upload id so all parts use the same plugin
+ const auto alg = std::hash<std::string>{}(upload_id) % Compressor::COMP_ALG_LAST;
+ if (alg == Compressor::COMP_ALG_NONE) {
+ return nullptr;
+ }
+ return Compressor::create(s->cct, alg);
+}
+
+int RGWPutObj::get_lua_filter(std::unique_ptr<rgw::sal::DataProcessor>* filter, rgw::sal::DataProcessor* cb) {
+ std::string script;
+ const auto rc = rgw::lua::read_script(s, s->penv.lua.manager.get(), s->bucket_tenant, s->yield, rgw::lua::context::putData, script);
+ if (rc == -ENOENT) {
+ // no script, nothing to do
+ return 0;
+ } else if (rc < 0) {
+ ldpp_dout(this, 5) << "WARNING: failed to read data script. error: " << rc << dendl;
+ return rc;
+ }
+ filter->reset(new rgw::lua::RGWPutObjFilter(s, script, cb));
+ return 0;
+}
+
// Write the request body (or a copy-source range) as a new object.
//
// Chooses one of three writers — multipart-part, append, or atomic —
// then streams data through an optional filter stack
// (lua -> compression -> encryption -> writer), maintaining an MD5 of
// the plaintext for ETag / Content-MD5 verification.  On success the
// object is completed with all attributes (ACL, ETag, tags, retention,
// ...) and a notification is published.  Result goes to op_ret.
void RGWPutObj::execute(optional_yield y)
{
  char supplied_md5_bin[CEPH_CRYPTO_MD5_DIGESTSIZE + 1];
  char supplied_md5[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
  char calc_md5[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
  unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE];
  MD5 hash;
  // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
  hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
  bufferlist bl, aclbl, bs;
  int len;

  off_t fst;
  off_t lst;

  // DLO/SLO manifest uploads have their own ETag handling, so the MD5
  // is only computed for plain objects (or when Content-MD5 was sent)
  bool need_calc_md5 = (dlo_manifest == NULL) && (slo_info == NULL);
  perfcounter->inc(l_rgw_put);
  // report latency on return
  auto put_lat = make_scope_guard([&] {
      perfcounter->tinc(l_rgw_put_lat, s->time_elapsed());
    });

  op_ret = -EINVAL;
  if (rgw::sal::Object::empty(s->object.get())) {
    return;
  }

  if (!s->bucket_exists) {
    op_ret = -ERR_NO_SUCH_BUCKET;
    return;
  }

  op_ret = get_system_versioning_params(s, &olh_epoch, &version_id);
  if (op_ret < 0) {
    ldpp_dout(this, 20) << "get_system_versioning_params() returned ret="
                        << op_ret << dendl;
    return;
  }

  // decode the client-supplied Content-MD5 (base64) for later comparison
  if (supplied_md5_b64) {
    need_calc_md5 = true;

    ldpp_dout(this, 15) << "supplied_md5_b64=" << supplied_md5_b64 << dendl;
    op_ret = ceph_unarmor(supplied_md5_bin, &supplied_md5_bin[CEPH_CRYPTO_MD5_DIGESTSIZE + 1],
                          supplied_md5_b64, supplied_md5_b64 + strlen(supplied_md5_b64));
    ldpp_dout(this, 15) << "ceph_armor ret=" << op_ret << dendl;
    if (op_ret != CEPH_CRYPTO_MD5_DIGESTSIZE) {
      op_ret = -ERR_INVALID_DIGEST;
      return;
    }

    buf_to_hex((const unsigned char *)supplied_md5_bin, CEPH_CRYPTO_MD5_DIGESTSIZE, supplied_md5);
    ldpp_dout(this, 15) << "supplied_md5=" << supplied_md5 << dendl;
  }

  if (!chunked_upload) { /* with chunked upload we don't know how big is the upload.
                            we also check sizes at the end anyway */
    op_ret = s->bucket->check_quota(this, quota, s->content_length, y);
    if (op_ret < 0) {
      ldpp_dout(this, 20) << "check_quota() returned ret=" << op_ret << dendl;
      return;
    }
  }

  if (supplied_etag) {
    strncpy(supplied_md5, supplied_etag, sizeof(supplied_md5) - 1);
    supplied_md5[sizeof(supplied_md5) - 1] = '\0';
  }

  const bool multipart = !multipart_upload_id.empty();

  /* Handle object versioning of Swift API. */
  if (! multipart) {
    op_ret = s->object->swift_versioning_copy(this, s->yield);
    if (op_ret < 0) {
      return;
    }
  }

  // make reservation for notification if needed
  std::unique_ptr<rgw::sal::Notification> res
    = driver->get_notification(
      s->object.get(), s->src_object.get(), s,
      rgw::notify::ObjectCreatedPut, y);
  if(!multipart) {
    op_ret = res->publish_reserve(this, obj_tags.get());
    if (op_ret < 0) {
      return;
    }
  }

  // create the object processor
  std::unique_ptr<rgw::sal::Writer> processor;

  rgw_placement_rule *pdest_placement = &s->dest_placement;

  if (multipart) {
    // part of an ongoing multipart upload: look up the upload record
    // and write through the upload's part writer
    std::unique_ptr<rgw::sal::MultipartUpload> upload;
    upload = s->bucket->get_multipart_upload(s->object->get_name(),
                                             multipart_upload_id);
    op_ret = upload->get_info(this, s->yield, &pdest_placement);

    s->trace->SetAttribute(tracing::rgw::UPLOAD_ID, multipart_upload_id);
    multipart_trace = tracing::rgw::tracer.add_span(name(), upload->get_trace());

    if (op_ret < 0) {
      if (op_ret != -ENOENT) {
        ldpp_dout(this, 0) << "ERROR: get_multipart_info returned " << op_ret << ": " << cpp_strerror(-op_ret) << dendl;
      } else {// -ENOENT: raced with upload complete/cancel, no need to spam log
        ldpp_dout(this, 20) << "failed to get multipart info (returned " << op_ret << ": " << cpp_strerror(-op_ret) << "): probably raced with upload complete / cancel" << dendl;
      }
      return;
    }
    /* upload will go out of scope, so copy the dest placement for later use */
    s->dest_placement = *pdest_placement;
    pdest_placement = &s->dest_placement;
    ldpp_dout(this, 20) << "dest_placement for part=" << *pdest_placement << dendl;
    processor = upload->get_writer(this, s->yield, s->object.get(),
                                   s->user->get_id(), pdest_placement,
                                   multipart_part_num, multipart_part_str);
  } else if(append) {
    // append writes are incompatible with versioned buckets
    if (s->bucket->versioned()) {
      op_ret = -ERR_INVALID_BUCKET_STATE;
      return;
    }
    processor = driver->get_append_writer(this, s->yield, s->object.get(),
                                          s->bucket_owner.get_id(),
                                          pdest_placement, s->req_id, position,
                                          &cur_accounted_size);
  } else {
    // plain atomic write; assign/generate a version instance first
    if (s->bucket->versioning_enabled()) {
      if (!version_id.empty()) {
        s->object->set_instance(version_id);
      } else {
        s->object->gen_rand_obj_instance_name();
        version_id = s->object->get_instance();
      }
    }
    processor = driver->get_atomic_writer(this, s->yield, s->object.get(),
                                          s->bucket_owner.get_id(),
                                          pdest_placement, olh_epoch, s->req_id);
  }

  op_ret = processor->prepare(s->yield);
  if (op_ret < 0) {
    ldpp_dout(this, 20) << "processor->prepare() returned ret=" << op_ret
                        << dendl;
    return;
  }
  // server-side copy without an explicit range: derive the range end
  // from the source object's accounted size
  if ((! copy_source.empty()) && !copy_source_range) {
    std::unique_ptr<rgw::sal::Bucket> bucket;
    op_ret = driver->get_bucket(nullptr, copy_source_bucket_info, &bucket);
    if (op_ret < 0) {
      ldpp_dout(this, 0) << "ERROR: failed to get bucket with error" << op_ret << dendl;
      return;
    }
    std::unique_ptr<rgw::sal::Object> obj =
      bucket->get_object(rgw_obj_key(copy_source_object_name, copy_source_version_id));

    RGWObjState *astate;
    op_ret = obj->get_obj_state(this, &astate, s->yield);
    if (op_ret < 0) {
      ldpp_dout(this, 0) << "ERROR: get copy source obj state returned with error" << op_ret << dendl;
      return;
    }
    bufferlist bl;
    // refuse to copy a source that was transitioned to cloud tier
    if (astate->get_attr(RGW_ATTR_MANIFEST, bl)) {
      RGWObjManifest m;
      try{
        decode(m, bl);
        if (m.get_tier_type() == "cloud-s3") {
          op_ret = -ERR_INVALID_OBJECT_STATE;
          s->err.message = "This object was transitioned to cloud-s3";
          ldpp_dout(this, 4) << "Cannot copy cloud tiered object. Failing with "
                             << op_ret << dendl;
          return;
        }
      } catch (const buffer::end_of_buffer&) {
        // ignore empty manifest; it's not cloud-tiered
      } catch (const std::exception& e) {
        ldpp_dout(this, 1) << "WARNING: failed to decode object manifest for "
                           << *s->object << ": " << e.what() << dendl;
      }
    }

    if (!astate->exists){
      op_ret = -ENOENT;
      return;
    }
    lst = astate->accounted_size - 1;
  } else {
    lst = copy_source_range_lst;
  }
  fst = copy_source_range_fst;

  // no filters by default
  rgw::sal::DataProcessor *filter = processor.get();

  const auto& compression_type = driver->get_compression_type(*pdest_placement);
  CompressorRef plugin;
  boost::optional<RGWPutObj_Compress> compressor;

  std::unique_ptr<rgw::sal::DataProcessor> encrypt;
  std::unique_ptr<rgw::sal::DataProcessor> run_lua;

  if (!append) { // compression and encryption only apply to full object uploads
    op_ret = get_encrypt_filter(&encrypt, filter);
    if (op_ret < 0) {
      return;
    }
    if (encrypt != nullptr) {
      filter = &*encrypt;
    }
    // a zonegroup feature is required to combine compression and encryption
    const rgw::sal::ZoneGroup& zonegroup = driver->get_zone()->get_zonegroup();
    const bool compress_encrypted = zonegroup.supports(rgw::zone_features::compress_encrypted);
    if (compression_type != "none" &&
        (encrypt == nullptr || compress_encrypted)) {
      plugin = get_compressor_plugin(s, compression_type);
      if (!plugin) {
        ldpp_dout(this, 1) << "Cannot load plugin for compression type "
                           << compression_type << dendl;
      } else {
        compressor.emplace(s->cct, plugin, filter);
        filter = &*compressor;
        // always send incompressible hint when rgw is itself doing compression
        s->object->set_compressed();
      }
    }
    // run lua script before data is compressed and encrypted - last filter runs first
    op_ret = get_lua_filter(&run_lua, filter);
    if (op_ret < 0) {
      return;
    }
    if (run_lua) {
      filter = &*run_lua;
    }
  }
  tracepoint(rgw_op, before_data_transfer, s->req_id.c_str());
  // main transfer loop: pull a chunk (from the client, or from the copy
  // source via get_data(fst, lst)) and push it through the filter chain
  do {
    bufferlist data;
    if (fst > lst)
      break;
    if (copy_source.empty()) {
      len = get_data(data);
    } else {
      off_t cur_lst = min<off_t>(fst + s->cct->_conf->rgw_max_chunk_size - 1, lst);
      op_ret = get_data(fst, cur_lst, data);
      if (op_ret < 0)
        return;
      len = data.length();
      s->content_length += len;
      fst += len;
    }
    if (len < 0) {
      op_ret = len;
      ldpp_dout(this, 20) << "get_data() returned ret=" << op_ret << dendl;
      return;
    } else if (len == 0) {
      break;
    }

    if (need_calc_md5) {
      hash.Update((const unsigned char *)data.c_str(), data.length());
    }

    /* update torrent */
    torrent.update(data);

    op_ret = filter->process(std::move(data), ofs);
    if (op_ret < 0) {
      ldpp_dout(this, 20) << "processor->process() returned ret="
                          << op_ret << dendl;
      return;
    }

    ofs += len;
  } while (len > 0);
  tracepoint(rgw_op, after_data_transfer, s->req_id.c_str(), ofs);

  // flush any data in filters
  op_ret = filter->process({}, ofs);
  if (op_ret < 0) {
    return;
  }

  // non-chunked uploads must deliver exactly Content-Length bytes
  if (!chunked_upload && ofs != s->content_length) {
    op_ret = -ERR_REQUEST_TIMEOUT;
    return;
  }
  s->obj_size = ofs;
  s->object->set_obj_size(ofs);

  perfcounter->inc(l_rgw_put_b, s->obj_size);

  op_ret = do_aws4_auth_completion();
  if (op_ret < 0) {
    return;
  }

  // re-check quota against the actual size (covers chunked uploads)
  op_ret = s->bucket->check_quota(this, quota, s->obj_size, y);
  if (op_ret < 0) {
    ldpp_dout(this, 20) << "second check_quota() returned op_ret=" << op_ret << dendl;
    return;
  }

  hash.Final(m);

  // record compression metadata if any data was actually compressed
  if (compressor && compressor->is_compressed()) {
    bufferlist tmp;
    RGWCompressionInfo cs_info;
    cs_info.compression_type = plugin->get_type_name();
    cs_info.orig_size = s->obj_size;
    cs_info.compressor_message = compressor->get_compressor_message();
    cs_info.blocks = move(compressor->get_compression_blocks());
    encode(cs_info, tmp);
    attrs[RGW_ATTR_COMPRESSION] = tmp;
    ldpp_dout(this, 20) << "storing " << RGW_ATTR_COMPRESSION
                        << " with type=" << cs_info.compression_type
                        << ", orig_size=" << cs_info.orig_size
                        << ", blocks=" << cs_info.blocks.size() << dendl;
  }

  buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, calc_md5);

  etag = calc_md5;

  // Content-MD5 mismatch -> reject the upload
  if (supplied_md5_b64 && strcmp(calc_md5, supplied_md5)) {
    op_ret = -ERR_BAD_DIGEST;
    return;
  }

  policy.encode(aclbl);
  emplace_attr(RGW_ATTR_ACL, std::move(aclbl));

  if (dlo_manifest) {
    op_ret = encode_dlo_manifest_attr(dlo_manifest, attrs);
    if (op_ret < 0) {
      ldpp_dout(this, 0) << "bad user manifest: " << dlo_manifest << dendl;
      return;
    }
  }

  if (slo_info) {
    bufferlist manifest_bl;
    encode(*slo_info, manifest_bl);
    emplace_attr(RGW_ATTR_SLO_MANIFEST, std::move(manifest_bl));
  }

  // Swift-style supplied ETag must match the computed one
  if (supplied_etag && etag.compare(supplied_etag) != 0) {
    op_ret = -ERR_UNPROCESSABLE_ENTITY;
    return;
  }
  bl.append(etag.c_str(), etag.size());
  emplace_attr(RGW_ATTR_ETAG, std::move(bl));

  populate_with_generic_attrs(s, attrs);
  op_ret = rgw_get_request_metadata(this, s->cct, s->info, attrs);
  if (op_ret < 0) {
    return;
  }
  encode_delete_at_attr(delete_at, attrs);
  encode_obj_tags_attr(obj_tags.get(), attrs);
  rgw_cond_decode_objtags(s, attrs);

  /* Add a custom metadata to expose the information whether an object
   * is an SLO or not. Appending the attribute must be performed AFTER
   * processing any input from user in order to prohibit overwriting. */
  if (slo_info) {
    bufferlist slo_userindicator_bl;
    slo_userindicator_bl.append("True", 4);
    emplace_attr(RGW_ATTR_SLO_UINDICATOR, std::move(slo_userindicator_bl));
  }
  if (obj_legal_hold) {
    bufferlist obj_legal_hold_bl;
    obj_legal_hold->encode(obj_legal_hold_bl);
    emplace_attr(RGW_ATTR_OBJECT_LEGAL_HOLD, std::move(obj_legal_hold_bl));
  }
  if (obj_retention) {
    bufferlist obj_retention_bl;
    obj_retention->encode(obj_retention_bl);
    emplace_attr(RGW_ATTR_OBJECT_RETENTION, std::move(obj_retention_bl));
  }

  tracepoint(rgw_op, processor_complete_enter, s->req_id.c_str());
  op_ret = processor->complete(s->obj_size, etag, &mtime, real_time(), attrs,
                               (delete_at ? *delete_at : real_time()), if_match, if_nomatch,
                               (user_data.empty() ? nullptr : &user_data), nullptr, nullptr,
                               s->yield);
  tracepoint(rgw_op, processor_complete_exit, s->req_id.c_str());

  /* produce torrent */
  if (s->cct->_conf->rgw_torrent_flag && (ofs == torrent.get_data_len()))
  {
    torrent.init(s, driver);
    torrent.set_create_date(mtime);
    op_ret = torrent.complete(y);
    if (0 != op_ret)
    {
      ldpp_dout(this, 0) << "ERROR: torrent.handle_data() returned " << op_ret << dendl;
      return;
    }
  }

  // send request to notification manager
  int ret = res->publish_commit(this, s->obj_size, mtime, etag, s->object->get_instance());
  if (ret < 0) {
    ldpp_dout(this, 1) << "ERROR: publishing notification failed, with error: " << ret << dendl;
    // too late to rollback operation, hence op_ret is not set here
  }
}
+
+int RGWPostObj::verify_permission(optional_yield y)
+{
+ return 0;
+}
+
+void RGWPostObj::pre_exec()
+{
+ rgw_bucket_object_pre_exec(s);
+}
+
// Handle a browser-style POST object upload (S3 POST / Swift FormPost).
//
// Parses the multipart form, performs the deferred permission checks
// (identity/session/bucket policy, then bucket ACL), reserves a
// notification, then loops over every file field in the form: each one
// is streamed through an optional encryption-or-compression filter into
// an atomic writer, with MD5, quota and min/max size enforcement.
// Result goes to op_ret.
void RGWPostObj::execute(optional_yield y)
{
  boost::optional<RGWPutObj_Compress> compressor;
  CompressorRef plugin;
  char supplied_md5[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];

  /* Read in the data from the POST form. */
  op_ret = get_params(y);
  if (op_ret < 0) {
    return;
  }

  op_ret = verify_params();
  if (op_ret < 0) {
    return;
  }

  // add server-side encryption headers
  rgw_iam_add_crypt_attrs(s->env, s->info.crypt_attribute_map);

  // deferred permission evaluation (see verify_permission())
  if (s->iam_policy || ! s->iam_user_policies.empty() || !s->session_policies.empty()) {
    auto identity_policy_res = eval_identity_or_session_policies(this, s->iam_user_policies, s->env,
                                                                 rgw::IAM::s3PutObject,
                                                                 s->object->get_obj());
    if (identity_policy_res == Effect::Deny) {
      op_ret = -EACCES;
      return;
    }

    rgw::IAM::Effect e = Effect::Pass;
    rgw::IAM::PolicyPrincipal princ_type = rgw::IAM::PolicyPrincipal::Other;
    if (s->iam_policy) {
      ARN obj_arn(s->object->get_obj());
      e = s->iam_policy->eval(s->env, *s->auth.identity,
                              rgw::IAM::s3PutObject,
                              obj_arn,
                              princ_type);
    }
    if (e == Effect::Deny) {
      op_ret = -EACCES;
      return;
    }

    // session policies combine with identity/bucket policy depending on
    // whether the principal matched as a Role, a Session, or not at all
    if (!s->session_policies.empty()) {
      auto session_policy_res = eval_identity_or_session_policies(this, s->session_policies, s->env,
                                                                  rgw::IAM::s3PutObject,
                                                                  s->object->get_obj());
      if (session_policy_res == Effect::Deny) {
        op_ret = -EACCES;
        return;
      }
      if (princ_type == rgw::IAM::PolicyPrincipal::Role) {
        //Intersection of session policy and identity policy plus intersection of session policy and bucket policy
        if ((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) ||
            (session_policy_res == Effect::Allow && e == Effect::Allow)) {
          op_ret = 0;
          return;
        }
      } else if (princ_type == rgw::IAM::PolicyPrincipal::Session) {
        //Intersection of session policy and identity policy plus bucket policy
        if ((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) || e == Effect::Allow) {
          op_ret = 0;
          return;
        }
      } else if (princ_type == rgw::IAM::PolicyPrincipal::Other) {// there was no match in the bucket policy
        if (session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) {
          op_ret = 0;
          return;
        }
      }
      op_ret = -EACCES;
      return;
    }
    if (identity_policy_res == Effect::Pass && e == Effect::Pass && !verify_bucket_permission_no_policy(this, s, RGW_PERM_WRITE)) {
      op_ret = -EACCES;
      return;
    }
  } else if (!verify_bucket_permission_no_policy(this, s, RGW_PERM_WRITE)) {
    op_ret = -EACCES;
    return;
  }

  // make reservation for notification if needed
  std::unique_ptr<rgw::sal::Notification> res
    = driver->get_notification(s->object.get(), s->src_object.get(), s, rgw::notify::ObjectCreatedPost, y);
  op_ret = res->publish_reserve(this);
  if (op_ret < 0) {
    return;
  }

  /* Start iteration over data fields. It's necessary as Swift's FormPost
   * is capable to handle multiple files in single form. */
  do {
    char calc_md5[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
    unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE];
    MD5 hash;
    // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
    hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
    ceph::buffer::list bl, aclbl;

    op_ret = s->bucket->check_quota(this, quota, s->content_length, y);
    if (op_ret < 0) {
      return;
    }

    // decode the client-supplied Content-MD5 (base64) for this file
    if (supplied_md5_b64) {
      char supplied_md5_bin[CEPH_CRYPTO_MD5_DIGESTSIZE + 1];
      ldpp_dout(this, 15) << "supplied_md5_b64=" << supplied_md5_b64 << dendl;
      op_ret = ceph_unarmor(supplied_md5_bin, &supplied_md5_bin[CEPH_CRYPTO_MD5_DIGESTSIZE + 1],
                            supplied_md5_b64, supplied_md5_b64 + strlen(supplied_md5_b64));
      ldpp_dout(this, 15) << "ceph_armor ret=" << op_ret << dendl;
      if (op_ret != CEPH_CRYPTO_MD5_DIGESTSIZE) {
        op_ret = -ERR_INVALID_DIGEST;
        return;
      }

      buf_to_hex((const unsigned char *)supplied_md5_bin, CEPH_CRYPTO_MD5_DIGESTSIZE, supplied_md5);
      ldpp_dout(this, 15) << "supplied_md5=" << supplied_md5 << dendl;
    }

    // target object name comes from the current form field
    std::unique_ptr<rgw::sal::Object> obj =
      s->bucket->get_object(rgw_obj_key(get_current_filename()));
    if (s->bucket->versioning_enabled()) {
      obj->gen_rand_obj_instance_name();
    }

    std::unique_ptr<rgw::sal::Writer> processor;
    processor = driver->get_atomic_writer(this, s->yield, obj.get(),
                                          s->bucket_owner.get_id(),
                                          &s->dest_placement, 0, s->req_id);
    op_ret = processor->prepare(s->yield);
    if (op_ret < 0) {
      return;
    }

    /* No filters by default. */
    rgw::sal::DataProcessor *filter = processor.get();

    // install either encryption or compression (mutually exclusive here)
    std::unique_ptr<rgw::sal::DataProcessor> encrypt;
    op_ret = get_encrypt_filter(&encrypt, filter);
    if (op_ret < 0) {
      return;
    }
    if (encrypt != nullptr) {
      filter = encrypt.get();
    } else {
      const auto& compression_type = driver->get_compression_type(s->dest_placement);
      if (compression_type != "none") {
        plugin = Compressor::create(s->cct, compression_type);
        if (!plugin) {
          ldpp_dout(this, 1) << "Cannot load plugin for compression type "
                             << compression_type << dendl;
        } else {
          compressor.emplace(s->cct, plugin, filter);
          filter = &*compressor;
        }
      }
    }

    // stream this file's data through the filter chain
    bool again;
    do {
      ceph::bufferlist data;
      int len = get_data(data, again);

      if (len < 0) {
        op_ret = len;
        return;
      }

      if (!len) {
        break;
      }

      hash.Update((const unsigned char *)data.c_str(), data.length());
      op_ret = filter->process(std::move(data), ofs);
      if (op_ret < 0) {
        return;
      }

      ofs += len;

      // enforce the policy's content-length-range upper bound
      if (ofs > max_len) {
        op_ret = -ERR_TOO_LARGE;
        return;
      }
    } while (again);

    // flush
    op_ret = filter->process({}, ofs);
    if (op_ret < 0) {
      return;
    }

    // enforce the policy's content-length-range lower bound
    if (ofs < min_len) {
      op_ret = -ERR_TOO_SMALL;
      return;
    }

    s->obj_size = ofs;
    s->object->set_obj_size(ofs);


    // re-check quota against the actual uploaded size
    op_ret = s->bucket->check_quota(this, quota, s->obj_size, y);
    if (op_ret < 0) {
      return;
    }

    hash.Final(m);
    buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, calc_md5);

    etag = calc_md5;

    // Content-MD5 mismatch -> reject
    if (supplied_md5_b64 && strcmp(calc_md5, supplied_md5)) {
      op_ret = -ERR_BAD_DIGEST;
      return;
    }

    bl.append(etag.c_str(), etag.size());
    emplace_attr(RGW_ATTR_ETAG, std::move(bl));

    policy.encode(aclbl);
    emplace_attr(RGW_ATTR_ACL, std::move(aclbl));

    const std::string content_type = get_current_content_type();
    if (! content_type.empty()) {
      ceph::bufferlist ct_bl;
      // include the trailing NUL, matching rgw's content-type convention
      ct_bl.append(content_type.c_str(), content_type.size() + 1);
      emplace_attr(RGW_ATTR_CONTENT_TYPE, std::move(ct_bl));
    }

    // record compression metadata if any data was actually compressed
    if (compressor && compressor->is_compressed()) {
      ceph::bufferlist tmp;
      RGWCompressionInfo cs_info;
      cs_info.compression_type = plugin->get_type_name();
      cs_info.orig_size = s->obj_size;
      cs_info.compressor_message = compressor->get_compressor_message();
      cs_info.blocks = move(compressor->get_compression_blocks());
      encode(cs_info, tmp);
      emplace_attr(RGW_ATTR_COMPRESSION, std::move(tmp));
    }

    op_ret = processor->complete(s->obj_size, etag, nullptr, real_time(), attrs,
                                 (delete_at ? *delete_at : real_time()),
                                 nullptr, nullptr, nullptr, nullptr, nullptr,
                                 s->yield);
    if (op_ret < 0) {
      return;
    }
  } while (is_next_file_to_upload());

  // send request to notification manager
  int ret = res->publish_commit(this, ofs, s->object->get_mtime(), etag, s->object->get_instance());
  if (ret < 0) {
    ldpp_dout(this, 1) << "ERROR: publishing notification failed, with error: " << ret << dendl;
    // too late to rollback operation, hence op_ret is not set here
  }
}
+
+
+void RGWPutMetadataAccount::filter_out_temp_url(map<string, bufferlist>& add_attrs,
+ const set<string>& rmattr_names,
+ map<int, string>& temp_url_keys)
+{
+ map<string, bufferlist>::iterator iter;
+
+ iter = add_attrs.find(RGW_ATTR_TEMPURL_KEY1);
+ if (iter != add_attrs.end()) {
+ temp_url_keys[0] = iter->second.c_str();
+ add_attrs.erase(iter);
+ }
+
+ iter = add_attrs.find(RGW_ATTR_TEMPURL_KEY2);
+ if (iter != add_attrs.end()) {
+ temp_url_keys[1] = iter->second.c_str();
+ add_attrs.erase(iter);
+ }
+
+ for (const string& name : rmattr_names) {
+ if (name.compare(RGW_ATTR_TEMPURL_KEY1) == 0) {
+ temp_url_keys[0] = string();
+ }
+ if (name.compare(RGW_ATTR_TEMPURL_KEY2) == 0) {
+ temp_url_keys[1] = string();
+ }
+ }
+}
+
/* Gather and pre-process everything needed to update account (user)
 * metadata before verify_permission()/execute() run: request params, the
 * user's current xattrs, the requested ACL, generic request metadata, and
 * the Swift-specific TempURL keys and account quota.
 * Returns 0 on success or a negative error code. */
int RGWPutMetadataAccount::init_processing(optional_yield y)
{
  /* First, go to the base class. At the time of writing the method was
   * responsible only for initializing the quota. This isn't necessary
   * here as we are touching metadata only. I'm putting this call only
   * for the future. */
  op_ret = RGWOp::init_processing(y);
  if (op_ret < 0) {
    return op_ret;
  }

  op_ret = get_params(y);
  if (op_ret < 0) {
    return op_ret;
  }

  /* Snapshot the user's current attrs so add/del filtering below can be
   * computed against them. */
  op_ret = s->user->read_attrs(this, y);
  if (op_ret < 0) {
    return op_ret;
  }
  orig_attrs = s->user->get_attrs();

  if (has_policy) {
    bufferlist acl_bl;
    policy.encode(acl_bl);
    attrs.emplace(RGW_ATTR_ACL, std::move(acl_bl));
  }

  op_ret = rgw_get_request_metadata(this, s->cct, s->info, attrs, false);
  if (op_ret < 0) {
    return op_ret;
  }
  prepare_add_del_attrs(orig_attrs, rmattr_names, attrs);
  populate_with_generic_attrs(s, attrs);

  /* Try extract the TempURL-related stuff now to allow verify_permission
   * evaluate whether we need FULL_CONTROL or not. */
  filter_out_temp_url(attrs, rmattr_names, temp_url_keys);

  /* The same with quota except a client needs to be reseller admin. */
  op_ret = filter_out_quota_info(attrs, rmattr_names, new_quota,
                                 &new_quota_extracted);
  if (op_ret < 0) {
    return op_ret;
  }

  return 0;
}
+
+int RGWPutMetadataAccount::verify_permission(optional_yield y)
+{
+ if (s->auth.identity->is_anonymous()) {
+ return -EACCES;
+ }
+
+ if (!verify_user_permission_no_policy(this, s, RGW_PERM_WRITE)) {
+ return -EACCES;
+ }
+
+ /* Altering TempURL keys requires FULL_CONTROL. */
+ if (!temp_url_keys.empty() && s->perm_mask != RGW_PERM_FULL_CONTROL) {
+ return -EPERM;
+ }
+
+ /* We are failing this intensionally to allow system user/reseller admin
+ * override in rgw_process.cc. This is the way to specify a given RGWOp
+ * expect extra privileges. */
+ if (new_quota_extracted) {
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+void RGWPutMetadataAccount::execute(optional_yield y)
+{
+ /* Params have been extracted earlier. See init_processing(). */
+ op_ret = s->user->load_user(this, y);
+ if (op_ret < 0) {
+ return;
+ }
+
+ /* Handle the TempURL-related stuff. */
+ if (!temp_url_keys.empty()) {
+ for (auto& pair : temp_url_keys) {
+ s->user->get_info().temp_url_keys[pair.first] = std::move(pair.second);
+ }
+ }
+
+ /* Handle the quota extracted at the verify_permission step. */
+ if (new_quota_extracted) {
+ s->user->get_info().quota.user_quota = std::move(new_quota);
+ }
+
+ /* We are passing here the current (old) user info to allow the function
+ * optimize-out some operations. */
+ s->user->set_attrs(attrs);
+ op_ret = s->user->store_user(this, y, false, &s->user->get_info());
+}
+
+int RGWPutMetadataBucket::verify_permission(optional_yield y)
+{
+ if (!verify_bucket_permission_no_policy(this, s, RGW_PERM_WRITE)) {
+ return -EACCES;
+ }
+
+ return 0;
+}
+
/* Standard pre-exec hook for bucket/object ops (logging, perf counters). */
void RGWPutMetadataBucket::pre_exec()
{
  rgw_bucket_object_pre_exec(s);
}
+
/* Update bucket metadata (Swift POST/PUT on a container): ACL, CORS,
 * quota, versioning location and static-website config, plus generic
 * user attrs.  The write is wrapped in retry_raced_bucket_write() so a
 * concurrent bucket-info change triggers a reload-and-retry. */
void RGWPutMetadataBucket::execute(optional_yield y)
{
  op_ret = get_params(y);
  if (op_ret < 0) {
    return;
  }

  op_ret = rgw_get_request_metadata(this, s->cct, s->info, attrs, false);
  if (op_ret < 0) {
    return;
  }

  /* The placement rule of an existing bucket cannot be changed. */
  if (!placement_rule.empty() &&
      placement_rule != s->bucket->get_placement_rule()) {
    op_ret = -EEXIST;
    return;
  }

  op_ret = retry_raced_bucket_write(this, s->bucket.get(), [this] {
      /* Encode special metadata first as we're using std::map::emplace under
       * the hood. This method will add the new items only if the map doesn't
       * contain such keys yet. */
      if (has_policy) {
	if (s->dialect.compare("swift") == 0) {
	  /* Swift semantics: merge the incoming grants (restricted by
	   * policy_rw_mask) into the pre-existing container ACL. */
	  auto old_policy = \
	    static_cast<RGWAccessControlPolicy_SWIFT*>(s->bucket_acl.get());
	  auto new_policy = static_cast<RGWAccessControlPolicy_SWIFT*>(&policy);
	  new_policy->filter_merge(policy_rw_mask, old_policy);
	  policy = *new_policy;
	}
	buffer::list bl;
	policy.encode(bl);
	emplace_attr(RGW_ATTR_ACL, std::move(bl));
      }

      if (has_cors) {
	buffer::list bl;
	cors_config.encode(bl);
	emplace_attr(RGW_ATTR_CORS, std::move(bl));
      }

      /* It's supposed that following functions WILL NOT change any
       * special attributes (like RGW_ATTR_ACL) if they are already
       * present in attrs. */
      prepare_add_del_attrs(s->bucket_attrs, rmattr_names, attrs);
      populate_with_generic_attrs(s, attrs);

      /* According to the Swift's behaviour and its container_quota
       * WSGI middleware implementation: anyone with write permissions
       * is able to set the bucket quota. This stays in contrast to
       * account quotas that can be set only by clients holding
       * reseller admin privileges. */
      op_ret = filter_out_quota_info(attrs, rmattr_names, s->bucket->get_info().quota);
      if (op_ret < 0) {
	return op_ret;
      }

      if (swift_ver_location) {
	s->bucket->get_info().swift_ver_location = *swift_ver_location;
	s->bucket->get_info().swift_versioning = (!swift_ver_location->empty());
      }

      /* Web site of Swift API. */
      filter_out_website(attrs, rmattr_names, s->bucket->get_info().website_conf);
      s->bucket->get_info().has_website = !s->bucket->get_info().website_conf.is_empty();

      /* Setting attributes also stores the provided bucket info. Due
       * to this fact, the new quota settings can be serialized with
       * the same call. */
      op_ret = s->bucket->merge_and_store_attrs(this, attrs, s->yield);
      return op_ret;
    });
}
+
+int RGWPutMetadataObject::verify_permission(optional_yield y)
+{
+ // This looks to be something specific to Swift. We could add
+ // operations like swift:PutMetadataObject to the Policy Engine.
+ if (!verify_object_permission_no_policy(this, s, RGW_PERM_WRITE)) {
+ return -EACCES;
+ }
+
+ return 0;
+}
+
/* Standard pre-exec hook for bucket/object ops (logging, perf counters). */
void RGWPutMetadataObject::pre_exec()
{
  rgw_bucket_object_pre_exec(s);
}
+
/* Replace an object's user metadata (Swift POST on an object): read the
 * current attrs, compute the add/remove sets, fold in generic attrs,
 * delete-at and DLO manifest, then write them back in a single call. */
void RGWPutMetadataObject::execute(optional_yield y)
{
  rgw_obj target_obj;
  rgw::sal::Attrs attrs, rmattrs;

  s->object->set_atomic();

  op_ret = get_params(y);
  if (op_ret < 0) {
    return;
  }

  op_ret = rgw_get_request_metadata(this, s->cct, s->info, attrs);
  if (op_ret < 0) {
    return;
  }

  /* check if obj exists, read orig attrs */
  op_ret = s->object->get_obj_attrs(s->yield, s, &target_obj);
  if (op_ret < 0) {
    return;
  }

  /* Check whether the object has expired. Swift API documentation
   * stands that we should return 404 Not Found in such case. */
  if (need_object_expiration() && s->object->is_expired()) {
    op_ret = -ENOENT;
    return;
  }

  /* Filter currently existing attributes. */
  prepare_add_del_attrs(s->object->get_attrs(), attrs, rmattrs);
  populate_with_generic_attrs(s, attrs);
  encode_delete_at_attr(delete_at, attrs);

  if (dlo_manifest) {
    op_ret = encode_dlo_manifest_attr(dlo_manifest, attrs);
    if (op_ret < 0) {
      ldpp_dout(this, 0) << "bad user manifest: " << dlo_manifest << dendl;
      return;
    }
  }

  /* Single round-trip: set the new attrs and drop the removed ones. */
  op_ret = s->object->set_obj_attrs(this, &attrs, &rmattrs, s->yield);
}
+
+int RGWDeleteObj::handle_slo_manifest(bufferlist& bl, optional_yield y)
+{
+ RGWSLOInfo slo_info;
+ auto bliter = bl.cbegin();
+ try {
+ decode(slo_info, bliter);
+ } catch (buffer::error& err) {
+ ldpp_dout(this, 0) << "ERROR: failed to decode slo manifest" << dendl;
+ return -EIO;
+ }
+
+ try {
+ deleter = std::unique_ptr<RGWBulkDelete::Deleter>(\
+ new RGWBulkDelete::Deleter(this, driver, s));
+ } catch (const std::bad_alloc&) {
+ return -ENOMEM;
+ }
+
+ list<RGWBulkDelete::acct_path_t> items;
+ for (const auto& iter : slo_info.entries) {
+ const string& path_str = iter.path;
+
+ const size_t pos_init = path_str.find_first_not_of('/');
+ if (std::string_view::npos == pos_init) {
+ return -EINVAL;
+ }
+
+ const size_t sep_pos = path_str.find('/', pos_init);
+ if (std::string_view::npos == sep_pos) {
+ return -EINVAL;
+ }
+
+ RGWBulkDelete::acct_path_t path;
+
+ path.bucket_name = url_decode(path_str.substr(pos_init, sep_pos - pos_init));
+ path.obj_key = url_decode(path_str.substr(sep_pos + 1));
+
+ items.push_back(path);
+ }
+
+ /* Request removal of the manifest object itself. */
+ RGWBulkDelete::acct_path_t path;
+ path.bucket_name = s->bucket_name;
+ path.obj_key = s->object->get_key();
+ items.push_back(path);
+
+ int ret = deleter->delete_chunk(items, y);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return 0;
+}
+
/* Authorize an object delete.  Evaluates, in order: whether the caller
 * may bypass governance-mode retention, then the IAM identity/session/
 * bucket policies for s3:DeleteObject[Version], falling back to the
 * bucket ACL when no policy matched, and finally the MFA requirement
 * for deleting a specific version from an MFA-enabled bucket. */
int RGWDeleteObj::verify_permission(optional_yield y)
{
  /* NOTE(review): local op_ret shadows the member of the same name —
   * presumably intentional, as only the return value is used here. */
  int op_ret = get_params(y);
  if (op_ret) {
    return op_ret;
  }

  auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s);
  if (has_s3_existing_tag || has_s3_resource_tag)
    rgw_iam_add_objtags(this, s, has_s3_existing_tag, has_s3_resource_tag);

  if (s->iam_policy || ! s->iam_user_policies.empty() || ! s->session_policies.empty()) {
    /* Object lock + requested bypass: the caller keeps bypass_perm only if
     * no policy explicitly denies s3:BypassGovernanceRetention. */
    if (s->bucket->get_info().obj_lock_enabled() && bypass_governance_mode) {
      auto r = eval_identity_or_session_policies(this, s->iam_user_policies, s->env,
                                               rgw::IAM::s3BypassGovernanceRetention, ARN(s->bucket->get_key(), s->object->get_name()));
      if (r == Effect::Deny) {
        bypass_perm = false;
      } else if (r == Effect::Pass && s->iam_policy) {
        ARN obj_arn(ARN(s->bucket->get_key(), s->object->get_name()));
        r = s->iam_policy->eval(s->env, *s->auth.identity, rgw::IAM::s3BypassGovernanceRetention, obj_arn);
        if (r == Effect::Deny) {
          bypass_perm = false;
        }
      } else if (r == Effect::Pass && !s->session_policies.empty()) {
        r = eval_identity_or_session_policies(this, s->session_policies, s->env,
                                               rgw::IAM::s3BypassGovernanceRetention, ARN(s->bucket->get_key(), s->object->get_name()));
        if (r == Effect::Deny) {
          bypass_perm = false;
        }
      }
    }
    /* Deleting a specific version maps to s3:DeleteObjectVersion,
     * otherwise s3:DeleteObject. */
    auto identity_policy_res = eval_identity_or_session_policies(this, s->iam_user_policies, s->env,
                                              s->object->get_instance().empty() ?
                                              rgw::IAM::s3DeleteObject :
                                              rgw::IAM::s3DeleteObjectVersion,
                                              ARN(s->bucket->get_key(), s->object->get_name()));
    if (identity_policy_res == Effect::Deny) {
      return -EACCES;
    }

    rgw::IAM::Effect r = Effect::Pass;
    rgw::IAM::PolicyPrincipal princ_type = rgw::IAM::PolicyPrincipal::Other;
    ARN obj_arn(ARN(s->bucket->get_key(), s->object->get_name()));
    if (s->iam_policy) {
      r = s->iam_policy->eval(s->env, *s->auth.identity,
				 s->object->get_instance().empty() ?
				 rgw::IAM::s3DeleteObject :
				 rgw::IAM::s3DeleteObjectVersion,
				 obj_arn,
				 princ_type);
    }
    if (r == Effect::Deny)
      return -EACCES;

    if (!s->session_policies.empty()) {
      /* Session policies must intersect with the identity and/or bucket
       * policy; which combination suffices depends on the principal that
       * matched in the bucket policy. */
      auto session_policy_res = eval_identity_or_session_policies(this, s->session_policies, s->env,
                                              s->object->get_instance().empty() ?
                                              rgw::IAM::s3DeleteObject :
                                              rgw::IAM::s3DeleteObjectVersion,
                                              obj_arn);
      if (session_policy_res == Effect::Deny) {
          return -EACCES;
      }
      if (princ_type == rgw::IAM::PolicyPrincipal::Role) {
        //Intersection of session policy and identity policy plus intersection of session policy and bucket policy
        if ((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) ||
            (session_policy_res == Effect::Allow && r == Effect::Allow)) {
          return 0;
        }
      } else if (princ_type == rgw::IAM::PolicyPrincipal::Session) {
        //Intersection of session policy and identity policy plus bucket policy
        if ((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) || r == Effect::Allow) {
          return 0;
        }
      } else if (princ_type == rgw::IAM::PolicyPrincipal::Other) {// there was no match in the bucket policy
        if (session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) {
          return 0;
        }
      }
      return -EACCES;
    }
    if (r == Effect::Allow || identity_policy_res == Effect::Allow)
      return 0;
  }

  /* No policy decided the request — fall back to the bucket ACL. */
  if (!verify_bucket_permission_no_policy(this, s, RGW_PERM_WRITE)) {
    return -EACCES;
  }

  /* Version deletes from an MFA-enabled bucket require a verified MFA. */
  if (s->bucket->get_info().mfa_enabled() &&
      !s->object->get_instance().empty() &&
      !s->mfa_verified) {
    ldpp_dout(this, 5) << "NOTICE: object delete request with a versioned object, mfa auth not provided" << dendl;
    return -ERR_MFA_REQUIRED;
  }

  return 0;
}
+
/* Standard pre-exec hook for bucket/object ops (logging, perf counters). */
void RGWDeleteObj::pre_exec()
{
  rgw_bucket_object_pre_exec(s);
}
+
/* Delete an object (or create a delete marker on a versioned bucket).
 * Handles object-lock verification, Swift SLO multipart deletes, Swift
 * object versioning restore, and fires an ObjectRemoved* notification
 * after the fact. */
void RGWDeleteObj::execute(optional_yield y)
{
  if (!s->bucket_exists) {
    op_ret = -ERR_NO_SUCH_BUCKET;
    return;
  }

  if (!rgw::sal::Object::empty(s->object.get())) {
    uint64_t obj_size = 0;
    std::string etag;
    {
      RGWObjState* astate = nullptr;
      bool check_obj_lock = s->object->have_instance() && s->bucket->get_info().obj_lock_enabled();

      op_ret = s->object->get_obj_state(this, &astate, s->yield, true);
      if (op_ret < 0) {
        if (need_object_expiration() || multipart_delete) {
          return;
        }

        if (check_obj_lock) {
          /* check if obj exists, read orig attrs */
          if (op_ret == -ENOENT) {
            /* object maybe delete_marker, skip check_obj_lock*/
            check_obj_lock = false;
          } else {
            return;
          }
        }
      } else {
        /* size/etag captured for the notification payload below */
        obj_size = astate->size;
        etag = astate->attrset[RGW_ATTR_ETAG].to_str();
      }

      // ignore return value from get_obj_state in all other cases
      op_ret = 0;

      if (check_obj_lock) {
        ceph_assert(astate);
        int object_lock_response = verify_object_lock(this, astate->attrset, bypass_perm, bypass_governance_mode);
        if (object_lock_response != 0) {
          op_ret = object_lock_response;
          if (op_ret == -EACCES) {
            s->err.message = "forbidden by object lock";
          }
          return;
        }
      }

      if (multipart_delete) {
        /* Swift "?multipart-manifest=delete": expand and delete the SLO's
         * segments, not just the manifest object. */
        if (!astate) {
          op_ret = -ERR_NOT_SLO_MANIFEST;
          return;
        }

        const auto slo_attr = astate->attrset.find(RGW_ATTR_SLO_MANIFEST);

        if (slo_attr != astate->attrset.end()) {
          op_ret = handle_slo_manifest(slo_attr->second, y);
          if (op_ret < 0) {
            ldpp_dout(this, 0) << "ERROR: failed to handle slo manifest ret=" << op_ret << dendl;
          }
        } else {
          op_ret = -ERR_NOT_SLO_MANIFEST;
        }

        return;
      }
    }

    // make reservation for notification if needed
    const auto versioned_object = s->bucket->versioning_enabled();
    const auto event_type = versioned_object &&
      s->object->get_instance().empty() ?
      rgw::notify::ObjectRemovedDeleteMarkerCreated :
      rgw::notify::ObjectRemovedDelete;
    std::unique_ptr<rgw::sal::Notification> res
      = driver->get_notification(s->object.get(), s->src_object.get(), s,
				 event_type, y);
    op_ret = res->publish_reserve(this);
    if (op_ret < 0) {
      return;
    }

    s->object->set_atomic();

    /* Swift versioning may satisfy the delete by restoring the previous
     * version instead of removing anything. */
    bool ver_restored = false;
    op_ret = s->object->swift_versioning_restore(ver_restored, this);
    if (op_ret < 0) {
      return;
    }

    if (!ver_restored) {
      uint64_t epoch = 0;

      /* Swift's versioning mechanism hasn't found any previous version of
       * the object that could be restored. This means we should proceed
       * with the regular delete path. */
      op_ret = get_system_versioning_params(s, &epoch, &version_id);
      if (op_ret < 0) {
        return;
      }

      std::unique_ptr<rgw::sal::Object::DeleteOp> del_op = s->object->get_delete_op();
      del_op->params.obj_owner = s->owner;
      del_op->params.bucket_owner = s->bucket_owner;
      del_op->params.versioning_status = s->bucket->get_info().versioning_status();
      del_op->params.unmod_since = unmod_since;
      del_op->params.high_precision_time = s->system_request;
      del_op->params.olh_epoch = epoch;
      del_op->params.marker_version_id = version_id;

      op_ret = del_op->delete_obj(this, y);
      if (op_ret >= 0) {
        delete_marker = del_op->result.delete_marker;
        version_id = del_op->result.version_id;
      }

      /* Check whether the object has expired. Swift API documentation
       * stands that we should return 404 Not Found in such case. */
      if (need_object_expiration() && s->object->is_expired()) {
        op_ret = -ENOENT;
        return;
      }
    }

    /* A raced delete (-ECANCELED) and a tolerated precondition failure
     * are reported as success. */
    if (op_ret == -ECANCELED) {
      op_ret = 0;
    }
    if (op_ret == -ERR_PRECONDITION_FAILED && no_precondition_error) {
      op_ret = 0;
    }

    // send request to notification manager
    int ret = res->publish_commit(this, obj_size, ceph::real_clock::now(), etag, version_id);
    if (ret < 0) {
      ldpp_dout(this, 1) << "ERROR: publishing notification failed, with error: " << ret << dendl;
      // too late to rollback operation, hence op_ret is not set here
    }
  } else {
    op_ret = -EINVAL;
  }
}
+
+bool RGWCopyObj::parse_copy_location(const std::string_view& url_src,
+ string& bucket_name,
+ rgw_obj_key& key,
+ req_state* s)
+{
+ std::string_view name_str;
+ std::string_view params_str;
+
+ // search for ? before url-decoding so we don't accidentally match %3F
+ size_t pos = url_src.find('?');
+ if (pos == string::npos) {
+ name_str = url_src;
+ } else {
+ name_str = url_src.substr(0, pos);
+ params_str = url_src.substr(pos + 1);
+ }
+
+ if (name_str[0] == '/') // trim leading slash
+ name_str.remove_prefix(1);
+
+ std::string dec_src = url_decode(name_str);
+
+ pos = dec_src.find('/');
+ if (pos == string::npos)
+ return false;
+
+ bucket_name = dec_src.substr(0, pos);
+ key.name = dec_src.substr(pos + 1);
+
+ if (key.name.empty()) {
+ return false;
+ }
+
+ if (! params_str.empty()) {
+ RGWHTTPArgs args;
+ args.set(std::string(params_str));
+ args.parse(s);
+
+ key.instance = args.get("versionId", NULL);
+ }
+
+ return true;
+}
+
/* Prepare a copy: run base-class init, read request params and system
 * versioning params, then resolve the source bucket and attach it to the
 * source object.  Returns 0 or a negative error code. */
int RGWCopyObj::init_processing(optional_yield y)
{
  op_ret = RGWOp::init_processing(y);
  if (op_ret < 0) {
    return op_ret;
  }

  op_ret = get_params(y);
  if (op_ret < 0)
    return op_ret;

  op_ret = get_system_versioning_params(s, &olh_epoch, &version_id);
  if (op_ret < 0) {
    return op_ret;
  }

  /* Resolve the source bucket; a missing bucket is surfaced as the
   * S3-level NoSuchBucket error. */
  op_ret = driver->get_bucket(this, s->user.get(),
			      rgw_bucket_key(s->src_tenant_name,
					     s->src_bucket_name),
			      &src_bucket, y);
  if (op_ret < 0) {
    if (op_ret == -ENOENT) {
      op_ret = -ERR_NO_SUCH_BUCKET;
    }
    return op_ret;
  }

  /* This is the only place the bucket is set on src_object */
  s->src_object->set_bucket(src_bucket.get());
  return 0;
}
+
+int RGWCopyObj::verify_permission(optional_yield y)
+{
+ RGWAccessControlPolicy src_acl(s->cct);
+ boost::optional<Policy> src_policy;
+
+ /* get buckets info (source and dest) */
+ if (s->local_source && source_zone.empty()) {
+ s->src_object->set_atomic();
+ s->src_object->set_prefetch_data();
+
+ rgw_placement_rule src_placement;
+
+ /* check source object permissions */
+ op_ret = read_obj_policy(this, driver, s, src_bucket->get_info(), src_bucket->get_attrs(), &src_acl, &src_placement.storage_class,
+ src_policy, src_bucket.get(), s->src_object.get(), y);
+ if (op_ret < 0) {
+ return op_ret;
+ }
+
+ /* follow up on previous checks that required reading source object head */
+ if (need_to_check_storage_class) {
+ src_placement.inherit_from(src_bucket->get_placement_rule());
+
+ op_ret = check_storage_class(src_placement);
+ if (op_ret < 0) {
+ return op_ret;
+ }
+ }
+
+ /* admin request overrides permission checks */
+ if (!s->auth.identity->is_admin_of(src_acl.get_owner().get_id())) {
+ if (src_policy || ! s->iam_user_policies.empty() || !s->session_policies.empty()) {
+ auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, src_policy, s->iam_user_policies, s->session_policies);
+ if (has_s3_existing_tag || has_s3_resource_tag)
+ rgw_iam_add_objtags(this, s, s->src_object.get(), has_s3_existing_tag, has_s3_resource_tag);
+
+ ARN obj_arn(s->src_object->get_obj());
+ auto identity_policy_res = eval_identity_or_session_policies(this, s->iam_user_policies, s->env,
+ s->src_object->get_instance().empty() ?
+ rgw::IAM::s3GetObject :
+ rgw::IAM::s3GetObjectVersion,
+ obj_arn);
+ if (identity_policy_res == Effect::Deny) {
+ return -EACCES;
+ }
+ auto e = Effect::Pass;
+ rgw::IAM::PolicyPrincipal princ_type = rgw::IAM::PolicyPrincipal::Other;
+ if (src_policy) {
+ e = src_policy->eval(s->env, *s->auth.identity,
+ s->src_object->get_instance().empty() ?
+ rgw::IAM::s3GetObject :
+ rgw::IAM::s3GetObjectVersion,
+ obj_arn,
+ princ_type);
+ }
+ if (e == Effect::Deny) {
+ return -EACCES;
+ }
+ if (!s->session_policies.empty()) {
+ auto session_policy_res = eval_identity_or_session_policies(this, s->session_policies, s->env,
+ s->src_object->get_instance().empty() ?
+ rgw::IAM::s3GetObject :
+ rgw::IAM::s3GetObjectVersion,
+ obj_arn);
+ if (session_policy_res == Effect::Deny) {
+ return -EACCES;
+ }
+ if (princ_type == rgw::IAM::PolicyPrincipal::Role) {
+ //Intersection of session policy and identity policy plus intersection of session policy and bucket policy
+ if ((session_policy_res != Effect::Allow || identity_policy_res != Effect::Allow) &&
+ (session_policy_res != Effect::Allow || e != Effect::Allow)) {
+ return -EACCES;
+ }
+ } else if (princ_type == rgw::IAM::PolicyPrincipal::Session) {
+ //Intersection of session policy and identity policy plus bucket policy
+ if ((session_policy_res != Effect::Allow || identity_policy_res != Effect::Allow) && e != Effect::Allow) {
+ return -EACCES;
+ }
+ } else if (princ_type == rgw::IAM::PolicyPrincipal::Other) {// there was no match in the bucket policy
+ if (session_policy_res != Effect::Allow || identity_policy_res != Effect::Allow) {
+ return -EACCES;
+ }
+ }
+ }
+ if (identity_policy_res == Effect::Pass && e == Effect::Pass &&
+ !src_acl.verify_permission(this, *s->auth.identity, s->perm_mask,
+ RGW_PERM_READ)) {
+ return -EACCES;
+ }
+ //remove src object tags as it may interfere with policy evaluation of destination obj
+ if (has_s3_existing_tag || has_s3_resource_tag)
+ rgw_iam_remove_objtags(this, s, s->src_object.get(), has_s3_existing_tag, has_s3_resource_tag);
+
+ } else if (!src_acl.verify_permission(this, *s->auth.identity,
+ s->perm_mask,
+ RGW_PERM_READ)) {
+ return -EACCES;
+ }
+ }
+ }
+
+ RGWAccessControlPolicy dest_bucket_policy(s->cct);
+
+ s->object->set_atomic();
+
+ /* check dest bucket permissions */
+ op_ret = read_bucket_policy(this, driver, s, s->bucket->get_info(),
+ s->bucket->get_attrs(),
+ &dest_bucket_policy, s->bucket->get_key(), y);
+ if (op_ret < 0) {
+ return op_ret;
+ }
+ auto dest_iam_policy = get_iam_policy_from_attr(s->cct, s->bucket->get_attrs(), s->bucket->get_tenant());
+ /* admin request overrides permission checks */
+ if (! s->auth.identity->is_admin_of(dest_policy.get_owner().get_id())){
+ if (dest_iam_policy != boost::none || ! s->iam_user_policies.empty() || !s->session_policies.empty()) {
+ //Add destination bucket tags for authorization
+ auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, dest_iam_policy, s->iam_user_policies, s->session_policies);
+ if (has_s3_resource_tag)
+ rgw_iam_add_buckettags(this, s, s->bucket.get());
+
+ rgw_add_to_iam_environment(s->env, "s3:x-amz-copy-source", copy_source);
+ if (md_directive)
+ rgw_add_to_iam_environment(s->env, "s3:x-amz-metadata-directive",
+ *md_directive);
+
+ ARN obj_arn(s->object->get_obj());
+ auto identity_policy_res = eval_identity_or_session_policies(this, s->iam_user_policies,
+ s->env,
+ rgw::IAM::s3PutObject,
+ obj_arn);
+ if (identity_policy_res == Effect::Deny) {
+ return -EACCES;
+ }
+ auto e = Effect::Pass;
+ rgw::IAM::PolicyPrincipal princ_type = rgw::IAM::PolicyPrincipal::Other;
+ if (dest_iam_policy) {
+ e = dest_iam_policy->eval(s->env, *s->auth.identity,
+ rgw::IAM::s3PutObject,
+ obj_arn,
+ princ_type);
+ }
+ if (e == Effect::Deny) {
+ return -EACCES;
+ }
+ if (!s->session_policies.empty()) {
+ auto session_policy_res = eval_identity_or_session_policies(this, s->session_policies, s->env,
+ rgw::IAM::s3PutObject, obj_arn);
+ if (session_policy_res == Effect::Deny) {
+ return false;
+ }
+ if (princ_type == rgw::IAM::PolicyPrincipal::Role) {
+ //Intersection of session policy and identity policy plus intersection of session policy and bucket policy
+ if ((session_policy_res != Effect::Allow || identity_policy_res != Effect::Allow) &&
+ (session_policy_res != Effect::Allow || e == Effect::Allow)) {
+ return -EACCES;
+ }
+ } else if (princ_type == rgw::IAM::PolicyPrincipal::Session) {
+ //Intersection of session policy and identity policy plus bucket policy
+ if ((session_policy_res != Effect::Allow || identity_policy_res != Effect::Allow) && e != Effect::Allow) {
+ return -EACCES;
+ }
+ } else if (princ_type == rgw::IAM::PolicyPrincipal::Other) {// there was no match in the bucket policy
+ if (session_policy_res != Effect::Allow || identity_policy_res != Effect::Allow) {
+ return -EACCES;
+ }
+ }
+ }
+ if (identity_policy_res == Effect::Pass && e == Effect::Pass &&
+ ! dest_bucket_policy.verify_permission(this,
+ *s->auth.identity,
+ s->perm_mask,
+ RGW_PERM_WRITE)){
+ return -EACCES;
+ }
+ } else if (! dest_bucket_policy.verify_permission(this, *s->auth.identity, s->perm_mask,
+ RGW_PERM_WRITE)) {
+ return -EACCES;
+ }
+
+ }
+
+ op_ret = init_dest_policy();
+ if (op_ret < 0) {
+ return op_ret;
+ }
+
+ return 0;
+}
+
+
+int RGWCopyObj::init_common()
+{
+ if (if_mod) {
+ if (parse_time(if_mod, &mod_time) < 0) {
+ op_ret = -EINVAL;
+ return op_ret;
+ }
+ mod_ptr = &mod_time;
+ }
+
+ if (if_unmod) {
+ if (parse_time(if_unmod, &unmod_time) < 0) {
+ op_ret = -EINVAL;
+ return op_ret;
+ }
+ unmod_ptr = &unmod_time;
+ }
+
+ bufferlist aclbl;
+ dest_policy.encode(aclbl);
+ emplace_attr(RGW_ATTR_ACL, std::move(aclbl));
+
+ op_ret = rgw_get_request_metadata(this, s->cct, s->info, attrs);
+ if (op_ret < 0) {
+ return op_ret;
+ }
+ populate_with_generic_attrs(s, attrs);
+
+ return 0;
+}
+
+static void copy_obj_progress_cb(off_t ofs, void *param)
+{
+ RGWCopyObj *op = static_cast<RGWCopyObj *>(param);
+ op->progress_cb(ofs);
+}
+
+void RGWCopyObj::progress_cb(off_t ofs)
+{
+ if (!s->cct->_conf->rgw_copy_obj_progress)
+ return;
+
+ if (ofs - last_ofs <
+ static_cast<off_t>(s->cct->_conf->rgw_copy_obj_progress_every_bytes)) {
+ return;
+ }
+
+ send_partial_response(ofs);
+
+ last_ofs = ofs;
+}
+
/* Standard pre-exec hook for bucket/object ops (logging, perf counters). */
void RGWCopyObj::pre_exec()
{
  rgw_bucket_object_pre_exec(s);
}
+
/* Perform the copy: reserve a notification, pick/generate the destination
 * version, reject cloud-tiered sources, enforce size/quota limits, run
 * Swift versioning on the destination, delegate to copy_object(), and
 * finally publish the ObjectCreatedCopy notification. */
void RGWCopyObj::execute(optional_yield y)
{
  if (init_common() < 0)
    return;

  // make reservation for notification if needed
  std::unique_ptr<rgw::sal::Notification> res
				   = driver->get_notification(
				     s->object.get(), s->src_object.get(),
				     s, rgw::notify::ObjectCreatedCopy, y);
  op_ret = res->publish_reserve(this);
  if (op_ret < 0) {
    return;
  }

  /* An explicit version id wins; otherwise a versioned bucket gets a
   * freshly generated instance name. */
  if ( ! version_id.empty()) {
    s->object->set_instance(version_id);
  } else if (s->bucket->versioning_enabled()) {
    s->object->gen_rand_obj_instance_name();
  }

  s->src_object->set_atomic();
  s->object->set_atomic();

  encode_delete_at_attr(delete_at, attrs);

  if (obj_retention) {
    bufferlist obj_retention_bl;
    obj_retention->encode(obj_retention_bl);
    emplace_attr(RGW_ATTR_OBJECT_RETENTION, std::move(obj_retention_bl));
  }
  if (obj_legal_hold) {
    bufferlist obj_legal_hold_bl;
    obj_legal_hold->encode(obj_legal_hold_bl);
    emplace_attr(RGW_ATTR_OBJECT_LEGAL_HOLD, std::move(obj_legal_hold_bl));
  }

  uint64_t obj_size = 0;
  {
    // get src object size (cached in obj_ctx from verify_permission())
    RGWObjState* astate = nullptr;
    op_ret = s->src_object->get_obj_state(this, &astate, s->yield, true);
    if (op_ret < 0) {
      return;
    }

    /* Check if the src object is cloud-tiered */
    bufferlist bl;
    if (astate->get_attr(RGW_ATTR_MANIFEST, bl)) {
      RGWObjManifest m;
      try{
        decode(m, bl);
        if (m.get_tier_type() == "cloud-s3") {
          op_ret = -ERR_INVALID_OBJECT_STATE;
          s->err.message = "This object was transitioned to cloud-s3";
          ldpp_dout(this, 4) << "Cannot copy cloud tiered object. Failing with "
			     << op_ret << dendl;
          return;
        }
      } catch (const buffer::end_of_buffer&) {
        // ignore empty manifest; it's not cloud-tiered
      } catch (const std::exception& e) {
        ldpp_dout(this, 1) << "WARNING: failed to decode object manifest for "
			   << *s->object << ": " << e.what() << dendl;
      }
    }

    obj_size = astate->size;
  
    if (!s->system_request) { // no quota enforcement for system requests
      if (astate->accounted_size > static_cast<size_t>(s->cct->_conf->rgw_max_put_size)) {
        op_ret = -ERR_TOO_LARGE;
        return;
      }
      // enforce quota against the destination bucket owner
      op_ret = s->bucket->check_quota(this, quota, astate->accounted_size, y);
      if (op_ret < 0) {
        return;
      }
    }
  }

  bool high_precision_time = (s->system_request);

  /* Handle object versioning of Swift API. In case of copying to remote this
   * should fail gently (op_ret == 0) as the dst_obj will not exist here. */
  op_ret = s->object->swift_versioning_copy(this, s->yield);
  if (op_ret < 0) {
    return;
  }

  /* The actual data transfer/metadata rewrite is delegated to the SAL
   * layer with the conditions and attrs collected above. */
  op_ret = s->src_object->copy_object(s->user.get(),
	   &s->info,
	   source_zone,
	   s->object.get(),
	   s->bucket.get(),
	   src_bucket.get(),
	   s->dest_placement,
	   &src_mtime,
	   &mtime,
	   mod_ptr,
	   unmod_ptr,
	   high_precision_time,
	   if_match,
	   if_nomatch,
	   attrs_mod,
	   copy_if_newer,
	   attrs,
	   RGWObjCategory::Main,
	   olh_epoch,
	   delete_at,
	   (version_id.empty() ? NULL : &version_id),
	   &s->req_id, /* use req_id as tag */
	   &etag,
	   copy_obj_progress_cb, (void *)this,
	   this,
	   s->yield);

  // send request to notification manager
  int ret = res->publish_commit(this, obj_size, mtime, etag, s->object->get_instance());
  if (ret < 0) {
    ldpp_dout(this, 1) << "ERROR: publishing notification failed, with error: " << ret << dendl;
    // too late to rollback operation, hence op_ret is not set here
  }
}
+
/* Authorize reading an ACL: object requests map to
 * s3:GetObject(Version)Acl, bucket requests to s3:GetBucketAcl; the
 * relevant tags are injected into the IAM environment beforehand. */
int RGWGetACLs::verify_permission(optional_yield y)
{
  bool perm;
  auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s);
  if (!rgw::sal::Object::empty(s->object.get())) {
    auto iam_action = s->object->get_instance().empty() ?
      rgw::IAM::s3GetObjectAcl :
      rgw::IAM::s3GetObjectVersionAcl;
    if (has_s3_existing_tag || has_s3_resource_tag)
      rgw_iam_add_objtags(this, s, has_s3_existing_tag, has_s3_resource_tag);
    perm = verify_object_permission(this, s, iam_action);
  } else {
    if (!s->bucket_exists) {
      return -ERR_NO_SUCH_BUCKET;
    }
    if (has_s3_resource_tag)
      rgw_iam_add_buckettags(this, s);
    perm = verify_bucket_permission(this, s, rgw::IAM::s3GetBucketAcl);
  }
  if (!perm)
    return -EACCES;

  return 0;
}
+
/* Standard pre-exec hook for bucket/object ops (logging, perf counters). */
void RGWGetACLs::pre_exec()
{
  rgw_bucket_object_pre_exec(s);
}
+
+void RGWGetACLs::execute(optional_yield y)
+{
+ stringstream ss;
+ RGWAccessControlPolicy* const acl = \
+ (!rgw::sal::Object::empty(s->object.get()) ? s->object_acl.get() : s->bucket_acl.get());
+ RGWAccessControlPolicy_S3* const s3policy = \
+ static_cast<RGWAccessControlPolicy_S3*>(acl);
+ s3policy->to_xml(ss);
+ acls = ss.str();
+}
+
+
+
/* Authorize writing an ACL: object requests map to
 * s3:PutObject(Version)Acl, bucket requests to s3:PutBucketAcl.  The
 * canned-acl and grant headers are exposed to policy conditions first. */
int RGWPutACLs::verify_permission(optional_yield y)
{
  bool perm;

  rgw_add_to_iam_environment(s->env, "s3:x-amz-acl", s->canned_acl);

  rgw_add_grant_to_iam_environment(s->env, s);
  if (!rgw::sal::Object::empty(s->object.get())) {
    auto iam_action = s->object->get_instance().empty() ? rgw::IAM::s3PutObjectAcl : rgw::IAM::s3PutObjectVersionAcl;
    /* NOTE(review): op_ret from the tag helpers is stored but not checked
     * before the permission check — presumably best-effort; confirm. */
    op_ret = rgw_iam_add_objtags(this, s, true, true);
    perm = verify_object_permission(this, s, iam_action);
  } else {
    op_ret = rgw_iam_add_buckettags(this, s);
    perm = verify_bucket_permission(this, s, rgw::IAM::s3PutBucketAcl);
  }
  if (!perm)
    return -EACCES;

  return 0;
}
+
+// Authorize s3:GetLifecycleConfiguration on the bucket.
+int RGWGetLC::verify_permission(optional_yield y)
+{
+  const auto [has_s3_existing_tag, has_s3_resource_tag] =
+    rgw_check_policy_condition(this, s, false);
+  if (has_s3_resource_tag) {
+    rgw_iam_add_buckettags(this, s);
+  }
+  if (verify_bucket_permission(this, s, rgw::IAM::s3GetLifecycleConfiguration)) {
+    return 0;
+  }
+  return -EACCES;
+}
+
+// Authorize s3:PutLifecycleConfiguration on the bucket.
+int RGWPutLC::verify_permission(optional_yield y)
+{
+  const auto [has_s3_existing_tag, has_s3_resource_tag] =
+    rgw_check_policy_condition(this, s, false);
+  if (has_s3_resource_tag) {
+    rgw_iam_add_buckettags(this, s);
+  }
+  if (verify_bucket_permission(this, s, rgw::IAM::s3PutLifecycleConfiguration)) {
+    return 0;
+  }
+  return -EACCES;
+}
+
+// Authorize lifecycle deletion. Like AWS, DeleteBucketLifecycle is gated
+// by s3:PutLifecycleConfiguration — there is no separate delete action.
+int RGWDeleteLC::verify_permission(optional_yield y)
+{
+  const auto [has_s3_existing_tag, has_s3_resource_tag] =
+    rgw_check_policy_condition(this, s, false);
+  if (has_s3_resource_tag) {
+    rgw_iam_add_buckettags(this, s);
+  }
+  if (verify_bucket_permission(this, s, rgw::IAM::s3PutLifecycleConfiguration)) {
+    return 0;
+  }
+  return -EACCES;
+}
+
+// Delegates to the shared bucket/object pre-exec helper.
+void RGWPutACLs::pre_exec()
+{
+  rgw_bucket_object_pre_exec(s);
+}
+
+// Delegates to the shared bucket/object pre-exec helper.
+void RGWGetLC::pre_exec()
+{
+  rgw_bucket_object_pre_exec(s);
+}
+
+// Delegates to the shared bucket/object pre-exec helper.
+void RGWPutLC::pre_exec()
+{
+  rgw_bucket_object_pre_exec(s);
+}
+
+// Delegates to the shared bucket/object pre-exec helper.
+void RGWDeleteLC::pre_exec()
+{
+  rgw_bucket_object_pre_exec(s);
+}
+
+// Handle PutBucketAcl / PutObjectAcl: obtain the requested ACL (raw XML
+// body, or one synthesized from a canned ACL / grant headers), validate
+// it, rebuild it against known users, and persist it as RGW_ATTR_ACL on
+// the object or bucket. All failures are reported through op_ret.
+// Fix over previous revision: removed the unused function-scope
+// `map<string, bufferlist> attrs;` that was shadowed by the else-branch
+// local of the same name, and the stray `\` line continuations.
+void RGWPutACLs::execute(optional_yield y)
+{
+  bufferlist bl;
+
+  RGWAccessControlPolicy_S3 *policy = NULL;
+  RGWACLXMLParser_S3 parser(s->cct);
+  RGWAccessControlPolicy_S3 new_policy(s->cct);
+  stringstream ss;
+
+  op_ret = 0; /* XXX redundant? */
+
+  if (!parser.init()) {
+    op_ret = -EINVAL;
+    return;
+  }
+
+  // the rebuilt ACL keeps the owner of whatever policy currently applies
+  RGWAccessControlPolicy* const existing_policy =
+    (rgw::sal::Object::empty(s->object.get()) ? s->bucket_acl.get() : s->object_acl.get());
+
+  owner = existing_policy->get_owner();
+
+  op_ret = get_params(y);
+  if (op_ret < 0) {
+    if (op_ret == -ERANGE) {
+      ldpp_dout(this, 4) << "The size of request xml data is larger than the max limitation, data size = "
+                         << s->length << dendl;
+      op_ret = -ERR_MALFORMED_XML;
+      s->err.message = "The XML you provided was larger than the maximum " +
+                       std::to_string(s->cct->_conf->rgw_max_put_param_size) +
+                       " bytes allowed.";
+    }
+    return;
+  }
+
+  char* buf = data.c_str();
+  ldpp_dout(this, 15) << "read len=" << data.length() << " data=" << (buf ? buf : "") << dendl;
+
+  // a canned ACL and an explicit XML body are mutually exclusive
+  if (!s->canned_acl.empty() && data.length() > 0) {
+    op_ret = -EINVAL;
+    return;
+  }
+
+  // synthesize the ACL XML from the canned ACL / x-amz-grant-* headers
+  if (!s->canned_acl.empty() || s->has_acl_header) {
+    op_ret = get_policy_from_state(driver, s, ss);
+    if (op_ret < 0)
+      return;
+
+    data.clear();
+    data.append(ss.str());
+  }
+
+  if (!parser.parse(data.c_str(), data.length(), 1)) {
+    op_ret = -EINVAL;
+    return;
+  }
+  policy = static_cast<RGWAccessControlPolicy_S3 *>(parser.find_first("AccessControlPolicy"));
+  if (!policy) {
+    op_ret = -EINVAL;
+    return;
+  }
+
+  // cap the number of grants (configurable; fallback default of 100)
+  const RGWAccessControlList& req_acl = policy->get_acl();
+  const multimap<string, ACLGrant>& req_grant_map = req_acl.get_grant_map();
+#define ACL_GRANTS_MAX_NUM 100
+  int max_num = s->cct->_conf->rgw_acl_grants_max_num;
+  if (max_num < 0) {
+    max_num = ACL_GRANTS_MAX_NUM;
+  }
+
+  int grants_num = req_grant_map.size();
+  if (grants_num > max_num) {
+    ldpp_dout(this, 4) << "An acl can have up to " << max_num
+                       << " grants, request acl grants num: " << grants_num << dendl;
+    op_ret = -ERR_LIMIT_EXCEEDED;
+    s->err.message = "The request is rejected, because the acl grants number you requested is larger than the maximum "
+                     + std::to_string(max_num)
+                     + " grants allowed in an acl.";
+    return;
+  }
+
+  // forward bucket acl requests to meta master zone
+  if ((rgw::sal::Object::empty(s->object.get()))) {
+    bufferlist in_data;
+    // include acl data unless it was generated from a canned_acl
+    if (s->canned_acl.empty()) {
+      in_data.append(data);
+    }
+    op_ret = driver->forward_request_to_master(this, s->user.get(), nullptr, in_data, nullptr, s->info, y);
+    if (op_ret < 0) {
+      ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
+      return;
+    }
+  }
+
+  if (s->cct->_conf->subsys.should_gather<ceph_subsys_rgw, 15>()) {
+    ldpp_dout(this, 15) << "Old AccessControlPolicy";
+    policy->to_xml(*_dout);
+    *_dout << dendl;
+  }
+
+  // resolve grantees and rewrite the policy into its canonical form
+  op_ret = policy->rebuild(this, driver, &owner, new_policy, s->err.message);
+  if (op_ret < 0)
+    return;
+
+  if (s->cct->_conf->subsys.should_gather<ceph_subsys_rgw, 15>()) {
+    ldpp_dout(this, 15) << "New AccessControlPolicy:";
+    new_policy.to_xml(*_dout);
+    *_dout << dendl;
+  }
+
+  // honor the bucket's public-access block for ACLs that grant public access
+  if (s->bucket_access_conf &&
+      s->bucket_access_conf->block_public_acls() &&
+      new_policy.is_public(this)) {
+    op_ret = -EACCES;
+    return;
+  }
+  new_policy.encode(bl);
+
+  if (!rgw::sal::Object::empty(s->object.get())) {
+    s->object->set_atomic();
+    //if instance is empty, we should modify the latest object
+    op_ret = s->object->modify_obj_attrs(RGW_ATTR_ACL, bl, s->yield, this);
+  } else {
+    map<string,bufferlist> attrs = s->bucket_attrs;
+    attrs[RGW_ATTR_ACL] = bl;
+    op_ret = s->bucket->merge_and_store_attrs(this, attrs, y);
+  }
+  if (op_ret == -ECANCELED) {
+    op_ret = 0; /* lost a race, but it's ok because acls are immutable */
+  }
+}
+
+// Handle PutBucketLifecycleConfiguration: optionally verify the request
+// body against a client-supplied Content-MD5, parse and canonicalize the
+// lifecycle XML, forward the request to the metadata master, and persist
+// the configuration. Failures are reported through op_ret.
+void RGWPutLC::execute(optional_yield y)
+{
+  bufferlist bl;
+
+  RGWLifecycleConfiguration_S3 config(s->cct);
+  RGWXMLParser parser;
+  RGWLifecycleConfiguration_S3 new_config(s->cct);
+
+  // amazon says that Content-MD5 is required for this op specifically, but MD5
+  // is not a security primitive and FIPS mode makes it difficult to use. if the
+  // client provides the header we'll try to verify its checksum, but the header
+  // itself is no longer required
+  std::optional<std::string> content_md5_bin;
+
+  content_md5 = s->info.env->get("HTTP_CONTENT_MD5");
+  if (content_md5 != nullptr) {
+    try {
+      // header is base64; decode to the raw 16-byte digest for comparison
+      content_md5_bin = rgw::from_base64(std::string_view(content_md5));
+    } catch (...) {
+      s->err.message = "Request header Content-MD5 contains character "
+                       "that is not base64 encoded.";
+      ldpp_dout(this, 5) << s->err.message << dendl;
+      op_ret = -ERR_BAD_DIGEST;
+      return;
+    }
+  }
+
+  if (!parser.init()) {
+    op_ret = -EINVAL;
+    return;
+  }
+
+  op_ret = get_params(y);
+  if (op_ret < 0)
+    return;
+
+  char* buf = data.c_str();
+  ldpp_dout(this, 15) << "read len=" << data.length() << " data=" << (buf ? buf : "") << dendl;
+
+  if (content_md5_bin) {
+    MD5 data_hash;
+    // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
+    data_hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+    unsigned char data_hash_res[CEPH_CRYPTO_MD5_DIGESTSIZE];
+    data_hash.Update(reinterpret_cast<const unsigned char*>(buf), data.length());
+    data_hash.Final(data_hash_res);
+
+    if (memcmp(data_hash_res, content_md5_bin->c_str(), CEPH_CRYPTO_MD5_DIGESTSIZE) != 0) {
+      op_ret = -ERR_BAD_DIGEST;
+      s->err.message = "The Content-MD5 you specified did not match what we received.";
+      // NOTE(review): data_hash_res is the raw digest, so this log line
+      // emits unencoded bytes rather than hex — confirm intended.
+      ldpp_dout(this, 5) << s->err.message
+                         << " Specified content md5: " << content_md5
+                         << ", calculated content md5: " << data_hash_res
+                         << dendl;
+      return;
+    }
+  }
+
+  if (!parser.parse(buf, data.length(), 1)) {
+    op_ret = -ERR_MALFORMED_XML;
+    return;
+  }
+
+  try {
+    RGWXMLDecoder::decode_xml("LifecycleConfiguration", config, &parser);
+  } catch (RGWXMLDecoder::err& err) {
+    ldpp_dout(this, 5) << "Bad lifecycle configuration: " << err << dendl;
+    op_ret = -ERR_MALFORMED_XML;
+    return;
+  }
+
+  // validate and canonicalize the decoded configuration into new_config
+  op_ret = config.rebuild(new_config);
+  if (op_ret < 0)
+    return;
+
+  if (s->cct->_conf->subsys.should_gather<ceph_subsys_rgw, 15>()) {
+    XMLFormatter xf;
+    new_config.dump_xml(&xf);
+    stringstream ss;
+    xf.flush(ss);
+    ldpp_dout(this, 15) << "New LifecycleConfiguration:" << ss.str() << dendl;
+  }
+
+  // apply on the metadata master first; bail out if that fails
+  op_ret = driver->forward_request_to_master(this, s->user.get(), nullptr, data, nullptr, s->info, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
+    return;
+  }
+
+  op_ret = driver->get_rgwlc()->set_bucket_config(s->bucket.get(), s->bucket_attrs, &new_config);
+  if (op_ret < 0) {
+    return;
+  }
+  return;
+}
+
+// Handle DeleteBucketLifecycle: apply on the metadata master first, then
+// remove the stored lifecycle configuration locally.
+void RGWDeleteLC::execute(optional_yield y)
+{
+  bufferlist no_data;
+  op_ret = driver->forward_request_to_master(this, s->user.get(), nullptr, no_data, nullptr, s->info, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
+    return;
+  }
+
+  op_ret = driver->get_rgwlc()->remove_bucket_config(s->bucket.get(), s->bucket_attrs);
+}
+
+// Authorize s3:GetBucketCORS (bucket owner or matching policy).
+int RGWGetCORS::verify_permission(optional_yield y)
+{
+  const auto [has_s3_existing_tag, has_s3_resource_tag] =
+    rgw_check_policy_condition(this, s, false);
+  if (has_s3_resource_tag) {
+    rgw_iam_add_buckettags(this, s);
+  }
+  return verify_bucket_owner_or_policy(s, rgw::IAM::s3GetBucketCORS);
+}
+
+void RGWGetCORS::execute(optional_yield y)
+{
+ op_ret = read_bucket_cors();
+ if (op_ret < 0)
+ return ;
+
+ if (!cors_exist) {
+ ldpp_dout(this, 2) << "No CORS configuration set yet for this bucket" << dendl;
+ op_ret = -ERR_NO_CORS_FOUND;
+ return;
+ }
+}
+
+// Authorize s3:PutBucketCORS (bucket owner or matching policy).
+int RGWPutCORS::verify_permission(optional_yield y)
+{
+  const auto [has_s3_existing_tag, has_s3_resource_tag] =
+    rgw_check_policy_condition(this, s, false);
+  if (has_s3_resource_tag) {
+    rgw_iam_add_buckettags(this, s);
+  }
+  return verify_bucket_owner_or_policy(s, rgw::IAM::s3PutBucketCORS);
+}
+
+// Handle PutBucketCORS: parse the request, apply on the metadata master,
+// then store the encoded configuration (cors_bl) under RGW_ATTR_CORS,
+// retrying on racing bucket-metadata writes.
+// Fix over previous revision: removed the unused local `rgw_raw_obj obj;`.
+void RGWPutCORS::execute(optional_yield y)
+{
+  op_ret = get_params(y);
+  if (op_ret < 0)
+    return;
+
+  op_ret = driver->forward_request_to_master(this, s->user.get(), nullptr, in_data, nullptr, s->info, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
+    return;
+  }
+
+  // retry_raced_bucket_write re-reads bucket info on -ECANCELED races
+  op_ret = retry_raced_bucket_write(this, s->bucket.get(), [this] {
+      rgw::sal::Attrs attrs(s->bucket_attrs);
+      attrs[RGW_ATTR_CORS] = cors_bl;
+      return s->bucket->merge_and_store_attrs(this, attrs, s->yield);
+    });
+}
+
+// Authorize CORS deletion. There is no separate delete action, so this
+// is gated by s3:PutBucketCORS, as on AWS.
+int RGWDeleteCORS::verify_permission(optional_yield y)
+{
+  const auto [has_s3_existing_tag, has_s3_resource_tag] =
+    rgw_check_policy_condition(this, s, false);
+  if (has_s3_resource_tag) {
+    rgw_iam_add_buckettags(this, s);
+  }
+  // No separate delete permission
+  return verify_bucket_owner_or_policy(s, rgw::IAM::s3PutBucketCORS);
+}
+
+// Handle DeleteBucketCORS: apply on the metadata master, then erase
+// RGW_ATTR_CORS from the bucket attrs, retrying on racing writes. The
+// lambda both sets op_ret and returns it so retry_raced_bucket_write can
+// decide whether to retry.
+void RGWDeleteCORS::execute(optional_yield y)
+{
+  bufferlist data;
+  op_ret = driver->forward_request_to_master(this, s->user.get(), nullptr, data, nullptr, s->info, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
+    return;
+  }
+
+  op_ret = retry_raced_bucket_write(this, s->bucket.get(), [this] {
+      // re-check existence inside the retry loop: bucket attrs may have
+      // been reloaded since the last attempt
+      op_ret = read_bucket_cors();
+      if (op_ret < 0)
+	return op_ret;
+
+      if (!cors_exist) {
+	ldpp_dout(this, 2) << "No CORS configuration set yet for this bucket" << dendl;
+	op_ret = -ENOENT;
+	return op_ret;
+      }
+
+      rgw::sal::Attrs attrs(s->bucket_attrs);
+      attrs.erase(RGW_ATTR_CORS);
+      op_ret = s->bucket->merge_and_store_attrs(this, attrs, s->yield);
+      if (op_ret < 0) {
+	ldpp_dout(this, 0) << "RGWLC::RGWDeleteCORS() failed to set attrs on bucket=" << s->bucket->get_name()
+			   << " returned err=" << op_ret << dendl;
+      }
+      return op_ret;
+    });
+}
+
+// Fill the CORS response headers from the rule matched by
+// validate_cors_request() and the request's header list.
+void RGWOptionsCORS::get_response_params(string& hdrs, string& exp_hdrs, unsigned *max_age) {
+  get_cors_response_headers(this, rule, req_hdrs, hdrs, exp_hdrs, max_age);
+}
+
+// Match the preflight request against the stored CORS configuration:
+// origin first, then the requested method and headers against the
+// matched rule. Returns -ENOENT when any of them is not allowed; the
+// matched rule (if any) is kept in `rule`.
+int RGWOptionsCORS::validate_cors_request(RGWCORSConfiguration *cc) {
+  rule = cc->host_name_rule(origin);
+  if (!rule) {
+    ldpp_dout(this, 10) << "There is no cors rule present for " << origin << dendl;
+    return -ENOENT;
+  }
+
+  if (!validate_cors_rule_method(this, rule, req_meth) ||
+      !validate_cors_rule_header(this, rule, req_hdrs)) {
+    return -ENOENT;
+  }
+
+  return 0;
+}
+
+// Handle a CORS preflight (OPTIONS) request: load the bucket's CORS
+// config, require the Origin and Access-Control-Request-Method headers,
+// then validate the request against the configuration.
+void RGWOptionsCORS::execute(optional_yield y)
+{
+  op_ret = read_bucket_cors();
+  if (op_ret < 0)
+    return;
+
+  origin = s->info.env->get("HTTP_ORIGIN");
+  if (!origin) {
+    ldpp_dout(this, 0) << "Missing mandatory Origin header" << dendl;
+    op_ret = -EINVAL;
+    return;
+  }
+  req_meth = s->info.env->get("HTTP_ACCESS_CONTROL_REQUEST_METHOD");
+  if (!req_meth) {
+    ldpp_dout(this, 0) << "Missing mandatory Access-control-request-method header" << dendl;
+    op_ret = -EINVAL;
+    return;
+  }
+  if (!cors_exist) {
+    ldpp_dout(this, 2) << "No CORS configuration set yet for this bucket" << dendl;
+    op_ret = -ENOENT;
+    return;
+  }
+  // request-headers are optional; validated against the matched rule
+  req_hdrs = s->info.env->get("HTTP_ACCESS_CONTROL_REQUEST_HEADERS");
+  op_ret = validate_cors_request(&bucket_cors);
+  if (!rule) {
+    // no rule matched the origin: clear the inputs so the response
+    // layer emits no CORS headers
+    origin = req_meth = NULL;
+    return;
+  }
+  return;
+}
+
+// Authorize s3:GetBucketRequestPayment (bucket owner or matching policy).
+int RGWGetRequestPayment::verify_permission(optional_yield y)
+{
+  const auto [has_s3_existing_tag, has_s3_resource_tag] =
+    rgw_check_policy_condition(this, s, false);
+  if (has_s3_resource_tag) {
+    rgw_iam_add_buckettags(this, s);
+  }
+  return verify_bucket_owner_or_policy(s, rgw::IAM::s3GetBucketRequestPayment);
+}
+
+// Delegates to the shared bucket/object pre-exec helper.
+void RGWGetRequestPayment::pre_exec()
+{
+  rgw_bucket_object_pre_exec(s);
+}
+
+// Report the bucket's requester-pays flag from its stored info.
+void RGWGetRequestPayment::execute(optional_yield y)
+{
+  requester_pays = s->bucket->get_info().requester_pays;
+}
+
+// Authorize s3:PutBucketRequestPayment (bucket owner or matching policy).
+int RGWSetRequestPayment::verify_permission(optional_yield y)
+{
+  const auto [has_s3_existing_tag, has_s3_resource_tag] =
+    rgw_check_policy_condition(this, s, false);
+  if (has_s3_resource_tag) {
+    rgw_iam_add_buckettags(this, s);
+  }
+  return verify_bucket_owner_or_policy(s, rgw::IAM::s3PutBucketRequestPayment);
+}
+
+// Delegates to the shared bucket/object pre-exec helper.
+void RGWSetRequestPayment::pre_exec()
+{
+  rgw_bucket_object_pre_exec(s);
+}
+
+// Handle PutBucketRequestPayment: parse the requested flag, apply on the
+// metadata master, then update the bucket's stored info locally.
+void RGWSetRequestPayment::execute(optional_yield y)
+{
+
+  op_ret = get_params(y);
+  if (op_ret < 0)
+    return;
+
+  op_ret = driver->forward_request_to_master(this, s->user.get(), nullptr, in_data, nullptr, s->info, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
+    return;
+  }
+
+  s->bucket->get_info().requester_pays = requester_pays;
+  op_ret = s->bucket->put_info(this, false, real_time());
+  if (op_ret < 0) {
+    ldpp_dout(this, 0) << "NOTICE: put_bucket_info on bucket=" << s->bucket->get_name()
+		       << " returned err=" << op_ret << dendl;
+    return;
+  }
+  // refresh the cached attrs so later stages see the updated bucket state
+  s->bucket_attrs = s->bucket->get_attrs();
+}
+
+// Authorize CreateMultipartUpload. As on AWS, initiating a multipart
+// upload is gated by s3:PutObject. Identity (user) policies, the bucket
+// policy and session policies are evaluated in order; any explicit Deny
+// wins, and session policies must intersect with the other grants.
+int RGWInitMultipart::verify_permission(optional_yield y)
+{
+  auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s);
+  if (has_s3_existing_tag || has_s3_resource_tag)
+    rgw_iam_add_objtags(this, s, has_s3_existing_tag, has_s3_resource_tag);
+
+  // add server-side encryption headers
+  rgw_iam_add_crypt_attrs(s->env, s->info.crypt_attribute_map);
+
+  if (s->iam_policy || ! s->iam_user_policies.empty() || !s->session_policies.empty()) {
+    auto identity_policy_res = eval_identity_or_session_policies(this, s->iam_user_policies, s->env,
+                                              rgw::IAM::s3PutObject,
+                                              s->object->get_obj());
+    if (identity_policy_res == Effect::Deny) {
+      return -EACCES;
+    }
+
+    rgw::IAM::Effect e = Effect::Pass;
+    rgw::IAM::PolicyPrincipal princ_type = rgw::IAM::PolicyPrincipal::Other;
+    ARN obj_arn(s->object->get_obj());
+    if (s->iam_policy) {
+      // bucket policy; princ_type records how the principal matched
+      e = s->iam_policy->eval(s->env, *s->auth.identity,
+				 rgw::IAM::s3PutObject,
+				 obj_arn,
+				 princ_type);
+    }
+    if (e == Effect::Deny) {
+      return -EACCES;
+    }
+
+    if (!s->session_policies.empty()) {
+      auto session_policy_res = eval_identity_or_session_policies(this, s->session_policies, s->env,
+                                              rgw::IAM::s3PutObject,
+                                              s->object->get_obj());
+      if (session_policy_res == Effect::Deny) {
+          return -EACCES;
+      }
+      if (princ_type == rgw::IAM::PolicyPrincipal::Role) {
+        //Intersection of session policy and identity policy plus intersection of session policy and bucket policy
+        if ((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) ||
+            (session_policy_res == Effect::Allow && e == Effect::Allow)) {
+          return 0;
+        }
+      } else if (princ_type == rgw::IAM::PolicyPrincipal::Session) {
+        //Intersection of session policy and identity policy plus bucket policy
+        if ((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) || e == Effect::Allow) {
+          return 0;
+        }
+      } else if (princ_type == rgw::IAM::PolicyPrincipal::Other) {// there was no match in the bucket policy
+        if (session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) {
+          return 0;
+        }
+      }
+      return -EACCES;
+    }
+    if (e == Effect::Allow || identity_policy_res == Effect::Allow) {
+      return 0;
+    }
+  }
+
+  // fall back to legacy ACL-based bucket write permission
+  if (!verify_bucket_permission_no_policy(this, s, RGW_PERM_WRITE)) {
+    return -EACCES;
+  }
+
+  return 0;
+}
+
+// Delegates to the shared bucket/object pre-exec helper.
+void RGWInitMultipart::pre_exec()
+{
+  rgw_bucket_object_pre_exec(s);
+}
+
+// Handle CreateMultipartUpload: collect the upload's attributes (trace
+// context, ACL, generic and encryption attrs, request metadata) and
+// initialize the multipart upload, which assigns the upload id.
+void RGWInitMultipart::execute(optional_yield y)
+{
+  multipart_trace = tracing::rgw::tracer.start_trace(tracing::rgw::MULTIPART, s->trace_enabled);
+  bufferlist aclbl, tracebl;
+  rgw::sal::Attrs attrs;
+
+  op_ret = get_params(y);
+  if (op_ret < 0) {
+    return;
+  }
+
+  // nothing to do without a target object name
+  if (rgw::sal::Object::empty(s->object.get()))
+    return;
+
+  if (multipart_trace) {
+    // persist the trace context so later part/complete ops can link spans
+    tracing::encode(multipart_trace->GetContext(), tracebl);
+    attrs[RGW_ATTR_TRACE] = tracebl;
+  }
+
+  policy.encode(aclbl);
+  attrs[RGW_ATTR_ACL] = aclbl;
+
+  populate_with_generic_attrs(s, attrs);
+
+  /* select encryption mode */
+  op_ret = prepare_encryption(attrs);
+  if (op_ret != 0)
+    return;
+
+  op_ret = rgw_get_request_metadata(this, s->cct, s->info, attrs);
+  if (op_ret < 0) {
+    return;
+  }
+
+  std::unique_ptr<rgw::sal::MultipartUpload> upload;
+  upload = s->bucket->get_multipart_upload(s->object->get_name(),
+				       upload_id);
+  op_ret = upload->init(this, s->yield, s->owner, s->dest_placement, attrs);
+
+  if (op_ret == 0) {
+    // init() allocated the upload id; record it for the response
+    upload_id = upload->get_upload_id();
+  }
+  s->trace->SetAttribute(tracing::rgw::UPLOAD_ID, upload_id);
+  multipart_trace->UpdateName(tracing::rgw::MULTIPART + upload_id);
+
+}
+
+// Authorize CompleteMultipartUpload. As on AWS, completion is gated by
+// s3:PutObject; the evaluation order and session-policy intersection
+// semantics mirror RGWInitMultipart::verify_permission.
+int RGWCompleteMultipart::verify_permission(optional_yield y)
+{
+  auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s);
+  if (has_s3_existing_tag || has_s3_resource_tag)
+    rgw_iam_add_objtags(this, s, has_s3_existing_tag, has_s3_resource_tag);
+
+  // add server-side encryption headers
+  rgw_iam_add_crypt_attrs(s->env, s->info.crypt_attribute_map);
+
+  if (s->iam_policy || ! s->iam_user_policies.empty() || ! s->session_policies.empty()) {
+    auto identity_policy_res = eval_identity_or_session_policies(this, s->iam_user_policies, s->env,
+                                              rgw::IAM::s3PutObject,
+                                              s->object->get_obj());
+    if (identity_policy_res == Effect::Deny) {
+      return -EACCES;
+    }
+
+    rgw::IAM::Effect e = Effect::Pass;
+    rgw::IAM::PolicyPrincipal princ_type = rgw::IAM::PolicyPrincipal::Other;
+    rgw::ARN obj_arn(s->object->get_obj());
+    if (s->iam_policy) {
+      // bucket policy; princ_type records how the principal matched
+      e = s->iam_policy->eval(s->env, *s->auth.identity,
+				 rgw::IAM::s3PutObject,
+				 obj_arn,
+				 princ_type);
+    }
+    if (e == Effect::Deny) {
+      return -EACCES;
+    }
+
+    if (!s->session_policies.empty()) {
+      auto session_policy_res = eval_identity_or_session_policies(this, s->session_policies, s->env,
+                                              rgw::IAM::s3PutObject,
+                                              s->object->get_obj());
+      if (session_policy_res == Effect::Deny) {
+          return -EACCES;
+      }
+      if (princ_type == rgw::IAM::PolicyPrincipal::Role) {
+        //Intersection of session policy and identity policy plus intersection of session policy and bucket policy
+        if ((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) ||
+            (session_policy_res == Effect::Allow && e == Effect::Allow)) {
+          return 0;
+        }
+      } else if (princ_type == rgw::IAM::PolicyPrincipal::Session) {
+        //Intersection of session policy and identity policy plus bucket policy
+        if ((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) || e == Effect::Allow) {
+          return 0;
+        }
+      } else if (princ_type == rgw::IAM::PolicyPrincipal::Other) {// there was no match in the bucket policy
+        if (session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) {
+          return 0;
+        }
+      }
+      return -EACCES;
+    }
+    if (e == Effect::Allow || identity_policy_res == Effect::Allow) {
+      return 0;
+    }
+  }
+
+  // fall back to legacy ACL-based bucket write permission
+  if (!verify_bucket_permission_no_policy(this, s, RGW_PERM_WRITE)) {
+    return -EACCES;
+  }
+
+  return 0;
+}
+
+// Delegates to the shared bucket/object pre-exec helper.
+void RGWCompleteMultipart::pre_exec()
+{
+  rgw_bucket_object_pre_exec(s);
+}
+
+// Handle CompleteMultipartUpload: parse the part list, take a cls lock on
+// the upload's meta object to serialize racing completions/retries,
+// reserve a notification, assemble the target object from the parts, then
+// delete the meta object and commit the notification. Statement order is
+// significant throughout (lock before attrs read, reserve before
+// complete); errors are reported through op_ret.
+void RGWCompleteMultipart::execute(optional_yield y)
+{
+  RGWMultiCompleteUpload *parts;
+  RGWMultiXMLParser parser;
+  std::unique_ptr<rgw::sal::MultipartUpload> upload;
+  off_t ofs = 0;
+  std::unique_ptr<rgw::sal::Object> meta_obj;
+  std::unique_ptr<rgw::sal::Object> target_obj;
+  uint64_t olh_epoch = 0;
+
+  op_ret = get_params(y);
+  if (op_ret < 0)
+    return;
+  op_ret = get_system_versioning_params(s, &olh_epoch, &version_id);
+  if (op_ret < 0) {
+    return;
+  }
+
+  if (!data.length()) {
+    op_ret = -ERR_MALFORMED_XML;
+    return;
+  }
+
+  if (!parser.init()) {
+    op_ret = -EIO;
+    return;
+  }
+
+  if (!parser.parse(data.c_str(), data.length(), 1)) {
+    op_ret = -ERR_MALFORMED_XML;
+    return;
+  }
+
+  parts = static_cast<RGWMultiCompleteUpload *>(parser.find_first("CompleteMultipartUpload"));
+  if (!parts || parts->parts.empty()) {
+    // CompletedMultipartUpload is incorrect but some versions of some libraries use it, see PR #41700
+    parts = static_cast<RGWMultiCompleteUpload *>(parser.find_first("CompletedMultipartUpload"));
+  }
+
+  if (!parts || parts->parts.empty()) {
+    op_ret = -ERR_MALFORMED_XML;
+    return;
+  }
+
+
+  if ((int)parts->parts.size() >
+      s->cct->_conf->rgw_multipart_part_upload_limit) {
+    op_ret = -ERANGE;
+    return;
+  }
+
+  upload = s->bucket->get_multipart_upload(s->object->get_name(), upload_id);
+
+  RGWCompressionInfo cs_info;
+  bool compressed = false;
+  uint64_t accounted_size = 0;
+
+  list<rgw_obj_index_key> remove_objs; /* objects to be removed from index listing */
+
+  meta_obj = upload->get_meta_obj();
+  meta_obj->set_in_extra_data(true);
+  meta_obj->set_hash_source(s->object->get_name());
+
+  /*take a cls lock on meta_obj to prevent racing completions (or retries)
+    from deleting the parts*/
+  int max_lock_secs_mp =
+    s->cct->_conf.get_val<int64_t>("rgw_mp_lock_max_time");
+  utime_t dur(max_lock_secs_mp, 0);
+
+  serializer = meta_obj->get_serializer(this, "RGWCompleteMultipart");
+  op_ret = serializer->try_lock(this, dur, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 0) << "failed to acquire lock" << dendl;
+    // -ENOENT can mean a prior completion already removed the meta obj;
+    // verify via the recomputed etag before reporting success
+    if (op_ret == -ENOENT && check_previously_completed(parts)) {
+      ldpp_dout(this, 1) << "NOTICE: This multipart completion is already completed" << dendl;
+      op_ret = 0;
+      return;
+    }
+    op_ret = -ERR_INTERNAL_ERROR;
+    s->err.message = "This multipart completion is already in progress";
+    return;
+  }
+
+  op_ret = meta_obj->get_obj_attrs(s->yield, this);
+  if (op_ret < 0) {
+    ldpp_dout(this, 0) << "ERROR: failed to get obj attrs, obj=" << meta_obj
+		       << " ret=" << op_ret << dendl;
+    return;
+  }
+  s->trace->SetAttribute(tracing::rgw::UPLOAD_ID, upload_id);
+  jspan_context trace_ctx(false, false);
+  // continue the trace started by RGWInitMultipart, if any
+  extract_span_context(meta_obj->get_attrs(), trace_ctx);
+  multipart_trace = tracing::rgw::tracer.add_span(name(), trace_ctx);
+
+
+  // make reservation for notification if needed
+  std::unique_ptr<rgw::sal::Notification> res
+    = driver->get_notification(meta_obj.get(), nullptr, s, rgw::notify::ObjectCreatedCompleteMultipartUpload, y, &s->object->get_name());
+  op_ret = res->publish_reserve(this);
+  if (op_ret < 0) {
+    return;
+  }
+
+  target_obj = s->bucket->get_object(rgw_obj_key(s->object->get_name()));
+  if (s->bucket->versioning_enabled()) {
+    if (!version_id.empty()) {
+      target_obj->set_instance(version_id);
+    } else {
+      // completion creates the version; generate it now so it can be returned
+      target_obj->gen_rand_obj_instance_name();
+      version_id = target_obj->get_instance();
+    }
+  }
+  target_obj->set_attrs(meta_obj->get_attrs());
+
+  op_ret = upload->complete(this, y, s->cct, parts->parts, remove_objs, accounted_size, compressed, cs_info, ofs, s->req_id, s->owner, olh_epoch, target_obj.get());
+  if (op_ret < 0) {
+    ldpp_dout(this, 0) << "ERROR: upload complete failed ret=" << op_ret << dendl;
+    return;
+  }
+
+  // remove the upload meta object ; the meta object is not versioned
+  // when the bucket is, as that would add an unneeded delete marker
+  int r = meta_obj->delete_object(this, y, true /* prevent versioning */);
+  if (r >= 0) {
+    /* serializer's exclusive lock is released */
+    serializer->clear_locked();
+  } else {
+    ldpp_dout(this, 0) << "WARNING: failed to remove object " << meta_obj << dendl;
+  }
+
+  // send request to notification manager
+  int ret = res->publish_commit(this, ofs, upload->get_mtime(), etag, target_obj->get_instance());
+  if (ret < 0) {
+    ldpp_dout(this, 1) << "ERROR: publishing notification failed, with error: " << ret << dendl;
+    // too late to rollback operation, hence op_ret is not set here
+  }
+} // RGWCompleteMultipart::execute
+
+// Detect a retried completion: recompute the multipart etag (MD5 of the
+// concatenated per-part etag digests, suffixed with "-<part count>") from
+// the request's part list and compare it with the etag already stored on
+// the target object. Returns true only when they match.
+bool RGWCompleteMultipart::check_previously_completed(const RGWMultiCompleteUpload* parts)
+{
+  // re-calculate the etag from the parts and compare to the existing object
+  int ret = s->object->get_obj_attrs(s->yield, this);
+  if (ret < 0) {
+    ldpp_dout(this, 0) << __func__ << "() ERROR: get_obj_attrs() returned ret=" << ret << dendl;
+    return false;
+  }
+  rgw::sal::Attrs sattrs = s->object->get_attrs();
+  string oetag = sattrs[RGW_ATTR_ETAG].to_str();
+
+  MD5 hash;
+  // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
+  hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+  for (const auto& [index, part] : parts->parts) {
+    // each part etag is hex; feed its raw 16-byte digest into the hash
+    std::string partetag = rgw_string_unquote(part);
+    char petag[CEPH_CRYPTO_MD5_DIGESTSIZE];
+    hex_to_buf(partetag.c_str(), petag, CEPH_CRYPTO_MD5_DIGESTSIZE);
+    hash.Update((const unsigned char *)petag, sizeof(petag));
+    ldpp_dout(this, 20) << __func__ << "() re-calculating multipart etag: part: "
+                                    << index << ", etag: " << partetag << dendl;
+  }
+
+  unsigned char final_etag[CEPH_CRYPTO_MD5_DIGESTSIZE];
+  char final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16];
+  hash.Final(final_etag);
+  buf_to_hex(final_etag, CEPH_CRYPTO_MD5_DIGESTSIZE, final_etag_str);
+  snprintf(&final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2], sizeof(final_etag_str) - CEPH_CRYPTO_MD5_DIGESTSIZE * 2,
+           "-%lld", (long long)parts->parts.size());
+
+  if (oetag.compare(final_etag_str) != 0) {
+    ldpp_dout(this, 1) << __func__ << "() NOTICE: etag mismatch: object etag:"
+                                   << oetag << ", re-calculated etag:" << final_etag_str << dendl;
+    return false;
+  }
+  ldpp_dout(this, 5) << __func__ << "() object etag and re-calculated etag match, etag: " << oetag << dendl;
+  return true;
+}
+
+// Final stage of the op: release the meta-object lock if execute() left
+// it held (error paths), capture the resulting object's etag for the
+// response, and send it.
+void RGWCompleteMultipart::complete()
+{
+  /* release exclusive lock iff not already */
+  if (unlikely(serializer.get() && serializer->is_locked())) {
+    int r = serializer->unlock();
+    if (r < 0) {
+      ldpp_dout(this, 0) << "WARNING: failed to unlock " << *serializer.get() << dendl;
+    }
+  }
+
+  etag = s->object->get_attrs()[RGW_ATTR_ETAG].to_str();
+
+  send_response();
+}
+
+// Authorize AbortMultipartUpload (s3:AbortMultipartUpload). Identity
+// policies, the bucket policy and session policies are evaluated in
+// order; any explicit Deny wins, and session policies must intersect
+// with the other grants, mirroring RGWInitMultipart::verify_permission.
+// Fix over previous revision: the session-policy evaluation used
+// rgw::IAM::s3PutObject while the identity- and bucket-policy checks in
+// this same function used s3AbortMultipartUpload; all three now evaluate
+// the abort action consistently.
+int RGWAbortMultipart::verify_permission(optional_yield y)
+{
+  auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s);
+  if (has_s3_existing_tag || has_s3_resource_tag)
+    rgw_iam_add_objtags(this, s, has_s3_existing_tag, has_s3_resource_tag);
+
+  if (s->iam_policy || ! s->iam_user_policies.empty() || !s->session_policies.empty()) {
+    auto identity_policy_res = eval_identity_or_session_policies(this, s->iam_user_policies, s->env,
+                                              rgw::IAM::s3AbortMultipartUpload,
+                                              s->object->get_obj());
+    if (identity_policy_res == Effect::Deny) {
+      return -EACCES;
+    }
+
+    rgw::IAM::Effect e = Effect::Pass;
+    rgw::IAM::PolicyPrincipal princ_type = rgw::IAM::PolicyPrincipal::Other;
+    ARN obj_arn(s->object->get_obj());
+    if (s->iam_policy) {
+      e = s->iam_policy->eval(s->env, *s->auth.identity,
+				 rgw::IAM::s3AbortMultipartUpload,
+				 obj_arn, princ_type);
+    }
+
+    if (e == Effect::Deny) {
+      return -EACCES;
+    }
+
+    if (!s->session_policies.empty()) {
+      auto session_policy_res = eval_identity_or_session_policies(this, s->session_policies, s->env,
+                                              rgw::IAM::s3AbortMultipartUpload,
+                                              s->object->get_obj());
+      if (session_policy_res == Effect::Deny) {
+          return -EACCES;
+      }
+      if (princ_type == rgw::IAM::PolicyPrincipal::Role) {
+        //Intersection of session policy and identity policy plus intersection of session policy and bucket policy
+        if ((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) ||
+            (session_policy_res == Effect::Allow && e == Effect::Allow)) {
+          return 0;
+        }
+      } else if (princ_type == rgw::IAM::PolicyPrincipal::Session) {
+        //Intersection of session policy and identity policy plus bucket policy
+        if ((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) || e == Effect::Allow) {
+          return 0;
+        }
+      } else if (princ_type == rgw::IAM::PolicyPrincipal::Other) {// there was no match in the bucket policy
+        if (session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) {
+          return 0;
+        }
+      }
+      return -EACCES;
+    }
+    if (e == Effect::Allow || identity_policy_res == Effect::Allow) {
+      return 0;
+    }
+  }
+
+  // fall back to legacy ACL-based bucket write permission
+  if (!verify_bucket_permission_no_policy(this, s, RGW_PERM_WRITE)) {
+    return -EACCES;
+  }
+
+  return 0;
+}
+
+// Delegates to the shared bucket/object pre-exec helper.
+void RGWAbortMultipart::pre_exec()
+{
+  rgw_bucket_object_pre_exec(s);
+}
+
+// Handle AbortMultipartUpload: require an uploadId and an object name,
+// optionally recover the trace context from the upload's meta object,
+// then abort the upload.
+void RGWAbortMultipart::execute(optional_yield y)
+{
+  op_ret = -EINVAL;
+  string upload_id;
+  upload_id = s->info.args.get("uploadId");
+  std::unique_ptr<rgw::sal::Object> meta_obj;
+  std::unique_ptr<rgw::sal::MultipartUpload> upload;
+
+  // missing uploadId or object name -> keep op_ret = -EINVAL
+  if (upload_id.empty() || rgw::sal::Object::empty(s->object.get()))
+    return;
+
+  upload = s->bucket->get_multipart_upload(s->object->get_name(), upload_id);
+  jspan_context trace_ctx(false, false);
+  if (tracing::rgw::tracer.is_enabled()) {
+    // read meta object attributes for trace info
+    meta_obj = upload->get_meta_obj();
+    meta_obj->set_in_extra_data(true);
+    meta_obj->get_obj_attrs(s->yield, this);
+    extract_span_context(meta_obj->get_attrs(), trace_ctx);
+  }
+  multipart_trace = tracing::rgw::tracer.add_span(name(), trace_ctx);
+
+  op_ret = upload->abort(this, s->cct);
+}
+
+// Authorize s3:ListMultipartUploadParts on the target object.
+int RGWListMultipart::verify_permission(optional_yield y)
+{
+  const auto [has_s3_existing_tag, has_s3_resource_tag] =
+    rgw_check_policy_condition(this, s);
+  if (has_s3_existing_tag || has_s3_resource_tag) {
+    rgw_iam_add_objtags(this, s, has_s3_existing_tag, has_s3_resource_tag);
+  }
+  if (verify_object_permission(this, s, rgw::IAM::s3ListMultipartUploadParts)) {
+    return 0;
+  }
+  return -EACCES;
+}
+
+// Delegates to the shared bucket/object pre-exec helper.
+void RGWListMultipart::pre_exec()
+{
+  rgw_bucket_object_pre_exec(s);
+}
+
+// Handle ListParts: load the upload's info and ACL policy, then list the
+// uploaded parts from the given marker.
+void RGWListMultipart::execute(optional_yield y)
+{
+  op_ret = get_params(y);
+  if (op_ret < 0)
+    return;
+
+  upload = s->bucket->get_multipart_upload(s->object->get_name(), upload_id);
+
+  rgw::sal::Attrs attrs;
+  op_ret = upload->get_info(this, s->yield, &placement, &attrs);
+  /* decode policy */
+  // NOTE: the ACL is decoded before op_ret from get_info() is checked;
+  // on failure attrs is empty so the find() simply misses
+  map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_ACL);
+  if (iter != attrs.end()) {
+    auto bliter = iter->second.cbegin();
+    try {
+      policy.decode(bliter);
+    } catch (buffer::error& err) {
+      ldpp_dout(this, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
+      op_ret = -EIO;
+    }
+  }
+  if (op_ret < 0)
+    return;
+
+  op_ret = upload->list_parts(this, s->cct, max_parts, marker, NULL, &truncated);
+}
+
+// Authorize s3:ListBucketMultipartUploads on the bucket.
+int RGWListBucketMultiparts::verify_permission(optional_yield y)
+{
+  const auto [has_s3_existing_tag, has_s3_resource_tag] =
+    rgw_check_policy_condition(this, s, false);
+  if (has_s3_resource_tag) {
+    rgw_iam_add_buckettags(this, s);
+  }
+  if (verify_bucket_permission(this, s, rgw::IAM::s3ListBucketMultipartUploads)) {
+    return 0;
+  }
+  return -EACCES;
+}
+
+// Delegates to the shared bucket/object pre-exec helper.
+void RGWListBucketMultiparts::pre_exec()
+{
+  rgw_bucket_object_pre_exec(s);
+}
+
+// Handle ListMultipartUploads: translate Swift's "path" arg into a
+// prefix/delimiter pair when applicable, list in-progress uploads, and
+// record the continuation markers for a truncated listing.
+void RGWListBucketMultiparts::execute(optional_yield y)
+{
+  op_ret = get_params(y);
+  if (op_ret < 0)
+    return;
+
+  if (s->prot_flags & RGW_REST_SWIFT) {
+    string path_args;
+    path_args = s->info.args.get("path");
+    if (!path_args.empty()) {
+      // "path" is mutually exclusive with explicit prefix/delimiter
+      if (!delimiter.empty() || !prefix.empty()) {
+        op_ret = -EINVAL;
+        return;
+      }
+      prefix = path_args;
+      delimiter="/";
+    }
+  }
+
+  op_ret = s->bucket->list_multiparts(this, prefix, marker_meta,
+				      delimiter, max_uploads, uploads,
+				      &common_prefixes, &is_truncated);
+  if (op_ret < 0) {
+    return;
+  }
+
+  if (!uploads.empty()) {
+    // continuation markers for the next page
+    next_marker_key = uploads.back()->get_key();
+    next_marker_upload_id = uploads.back()->get_upload_id();
+  }
+}
+
+void RGWGetHealthCheck::execute(optional_yield y)
+{
+ if (!g_conf()->rgw_healthcheck_disabling_path.empty() &&
+ (::access(g_conf()->rgw_healthcheck_disabling_path.c_str(), F_OK) == 0)) {
+ /* Disabling path specified & existent in the filesystem. */
+ op_ret = -ERR_SERVICE_UNAVAILABLE; /* 503 */
+ } else {
+ op_ret = 0; /* 200 OK */
+ }
+}
+
/// Request-level authorization for multi-object delete, evaluated against
/// the *bucket* ARN; each object is re-checked individually later in
/// handle_individual_object(). Returns 0 or a negative errno.
int RGWDeleteMultiObj::verify_permission(optional_yield y)
{
  // NOTE(review): this local deliberately(?) shadows the member `op_ret`;
  // the member is only assigned later during execute() — confirm intended.
  int op_ret = get_params(y);
  if (op_ret) {
    return op_ret;
  }

  auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s);
  if (has_s3_existing_tag || has_s3_resource_tag)
    rgw_iam_add_objtags(this, s, has_s3_existing_tag, has_s3_resource_tag);

  if (s->iam_policy || ! s->iam_user_policies.empty() || ! s->session_policies.empty()) {
    // With object lock enabled and bypass requested, drop the bypass
    // permission if any applicable policy explicitly denies
    // s3:BypassGovernanceRetention.
    if (s->bucket->get_info().obj_lock_enabled() && bypass_governance_mode) {
      ARN bucket_arn(s->bucket->get_key());
      auto r = eval_identity_or_session_policies(this, s->iam_user_policies, s->env,
                                                 rgw::IAM::s3BypassGovernanceRetention, ARN(s->bucket->get_key()));
      if (r == Effect::Deny) {
        bypass_perm = false;
      } else if (r == Effect::Pass && s->iam_policy) {
        r = s->iam_policy->eval(s->env, *s->auth.identity, rgw::IAM::s3BypassGovernanceRetention,
                                bucket_arn);
        if (r == Effect::Deny) {
          bypass_perm = false;
        }
      } else if (r == Effect::Pass && !s->session_policies.empty()) {
        r = eval_identity_or_session_policies(this, s->session_policies, s->env,
                                              rgw::IAM::s3BypassGovernanceRetention, ARN(s->bucket->get_key()));
        if (r == Effect::Deny) {
          bypass_perm = false;
        }
      }
    }

    // s3:DeleteObjectVersion applies only when a specific version instance
    // was named in the request.
    bool not_versioned = rgw::sal::Object::empty(s->object.get()) || s->object->get_instance().empty();

    auto identity_policy_res = eval_identity_or_session_policies(this, s->iam_user_policies, s->env,
                                                                 not_versioned ?
                                                                 rgw::IAM::s3DeleteObject :
                                                                 rgw::IAM::s3DeleteObjectVersion,
                                                                 ARN(s->bucket->get_key()));
    if (identity_policy_res == Effect::Deny) {
      return -EACCES;
    }

    rgw::IAM::Effect r = Effect::Pass;
    rgw::IAM::PolicyPrincipal princ_type = rgw::IAM::PolicyPrincipal::Other;
    rgw::ARN bucket_arn(s->bucket->get_key());
    if (s->iam_policy) {
      r = s->iam_policy->eval(s->env, *s->auth.identity,
                              not_versioned ?
                              rgw::IAM::s3DeleteObject :
                              rgw::IAM::s3DeleteObjectVersion,
                              bucket_arn,
                              princ_type);
    }
    if (r == Effect::Deny)
      return -EACCES;

    if (!s->session_policies.empty()) {
      auto session_policy_res = eval_identity_or_session_policies(this, s->session_policies, s->env,
                                                                  not_versioned ?
                                                                  rgw::IAM::s3DeleteObject :
                                                                  rgw::IAM::s3DeleteObjectVersion,
                                                                  ARN(s->bucket->get_key()));
      if (session_policy_res == Effect::Deny) {
        return -EACCES;
      }
      // A session-policy grant only takes effect when intersected with the
      // identity and/or resource policy result, depending on the principal.
      if (princ_type == rgw::IAM::PolicyPrincipal::Role) {
        //Intersection of session policy and identity policy plus intersection of session policy and bucket policy
        if ((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) ||
            (session_policy_res == Effect::Allow && r == Effect::Allow)) {
          return 0;
        }
      } else if (princ_type == rgw::IAM::PolicyPrincipal::Session) {
        //Intersection of session policy and identity policy plus bucket policy
        if ((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) || r == Effect::Allow) {
          return 0;
        }
      } else if (princ_type == rgw::IAM::PolicyPrincipal::Other) {// there was no match in the bucket policy
        if (session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) {
          return 0;
        }
      }
      return -EACCES;
    }
    if (r == Effect::Allow || identity_policy_res == Effect::Allow)
      return 0;
  }

  // No policy produced a decision: fall back to the bucket ACL, and remember
  // the answer for the per-object checks performed later.
  acl_allowed = verify_bucket_permission_no_policy(this, s, RGW_PERM_WRITE);
  if (!acl_allowed)
    return -EACCES;

  return 0;
}
+
/// Pre-execution hook: emits the standard bucket/object debug trace line.
void RGWDeleteMultiObj::pre_exec()
{
  rgw_bucket_object_pre_exec(s);
}
+
+void RGWDeleteMultiObj::write_ops_log_entry(rgw_log_entry& entry) const {
+ int num_err = 0;
+ int num_ok = 0;
+ for (auto iter = ops_log_entries.begin();
+ iter != ops_log_entries.end();
+ ++iter) {
+ if (iter->error) {
+ num_err++;
+ } else {
+ num_ok++;
+ }
+ }
+ entry.delete_multi_obj_meta.num_err = num_err;
+ entry.delete_multi_obj_meta.num_ok = num_ok;
+ entry.delete_multi_obj_meta.objects = std::move(ops_log_entries);
+}
+
/// Coroutine-only helper: suspend on the deadline timer until `predicate`
/// becomes true, flushing buffered formatter output after each wakeup.
/// The timer acts as a condition variable — each async_wait is expected to
/// complete via cancellation from elsewhere (TODO confirm: presumably
/// send_partial_response), hence the ignored error_code.
/// No-op when there is no yield context or no timer.
void RGWDeleteMultiObj::wait_flush(optional_yield y,
                                   boost::asio::deadline_timer *formatter_flush_cond,
                                   std::function<bool()> predicate)
{
  if (y && formatter_flush_cond) {
    auto yc = y.get_yield_context();
    while (!predicate()) {
      boost::system::error_code error;
      formatter_flush_cond->async_wait(yc[error]);
      rgw_flush_formatter(s, s->formatter);
    }
  }
}
+
+void RGWDeleteMultiObj::handle_individual_object(const rgw_obj_key& o, optional_yield y,
+ boost::asio::deadline_timer *formatter_flush_cond)
+{
+ std::string version_id;
+ std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(o);
+ if (s->iam_policy || ! s->iam_user_policies.empty() || !s->session_policies.empty()) {
+ auto identity_policy_res = eval_identity_or_session_policies(this, s->iam_user_policies, s->env,
+ o.instance.empty() ?
+ rgw::IAM::s3DeleteObject :
+ rgw::IAM::s3DeleteObjectVersion,
+ ARN(obj->get_obj()));
+ if (identity_policy_res == Effect::Deny) {
+ send_partial_response(o, false, "", -EACCES, formatter_flush_cond);
+ return;
+ }
+
+ rgw::IAM::Effect e = Effect::Pass;
+ rgw::IAM::PolicyPrincipal princ_type = rgw::IAM::PolicyPrincipal::Other;
+ if (s->iam_policy) {
+ ARN obj_arn(obj->get_obj());
+ e = s->iam_policy->eval(s->env,
+ *s->auth.identity,
+ o.instance.empty() ?
+ rgw::IAM::s3DeleteObject :
+ rgw::IAM::s3DeleteObjectVersion,
+ obj_arn,
+ princ_type);
+ }
+ if (e == Effect::Deny) {
+ send_partial_response(o, false, "", -EACCES, formatter_flush_cond);
+ return;
+ }
+
+ if (!s->session_policies.empty()) {
+ auto session_policy_res = eval_identity_or_session_policies(this, s->session_policies, s->env,
+ o.instance.empty() ?
+ rgw::IAM::s3DeleteObject :
+ rgw::IAM::s3DeleteObjectVersion,
+ ARN(obj->get_obj()));
+ if (session_policy_res == Effect::Deny) {
+ send_partial_response(o, false, "", -EACCES, formatter_flush_cond);
+ return;
+ }
+ if (princ_type == rgw::IAM::PolicyPrincipal::Role) {
+ //Intersection of session policy and identity policy plus intersection of session policy and bucket policy
+ if ((session_policy_res != Effect::Allow || identity_policy_res != Effect::Allow) &&
+ (session_policy_res != Effect::Allow || e != Effect::Allow)) {
+ send_partial_response(o, false, "", -EACCES, formatter_flush_cond);
+ return;
+ }
+ } else if (princ_type == rgw::IAM::PolicyPrincipal::Session) {
+ //Intersection of session policy and identity policy plus bucket policy
+ if ((session_policy_res != Effect::Allow || identity_policy_res != Effect::Allow) && e != Effect::Allow) {
+ send_partial_response(o, false, "", -EACCES, formatter_flush_cond);
+ return;
+ }
+ } else if (princ_type == rgw::IAM::PolicyPrincipal::Other) {// there was no match in the bucket policy
+ if (session_policy_res != Effect::Allow || identity_policy_res != Effect::Allow) {
+ send_partial_response(o, false, "", -EACCES, formatter_flush_cond);
+ return;
+ }
+ }
+ send_partial_response(o, false, "", -EACCES, formatter_flush_cond);
+ return;
+ }
+
+ if ((identity_policy_res == Effect::Pass && e == Effect::Pass && !acl_allowed)) {
+ send_partial_response(o, false, "", -EACCES, formatter_flush_cond);
+ return;
+ }
+ }
+
+ uint64_t obj_size = 0;
+ std::string etag;
+
+ if (!rgw::sal::Object::empty(obj.get())) {
+ RGWObjState* astate = nullptr;
+ bool check_obj_lock = obj->have_instance() && bucket->get_info().obj_lock_enabled();
+ const auto ret = obj->get_obj_state(this, &astate, y, true);
+
+ if (ret < 0) {
+ if (ret == -ENOENT) {
+ // object maybe delete_marker, skip check_obj_lock
+ check_obj_lock = false;
+ } else {
+ // Something went wrong.
+ send_partial_response(o, false, "", ret, formatter_flush_cond);
+ return;
+ }
+ } else {
+ obj_size = astate->size;
+ etag = astate->attrset[RGW_ATTR_ETAG].to_str();
+ }
+
+ if (check_obj_lock) {
+ ceph_assert(astate);
+ int object_lock_response = verify_object_lock(this, astate->attrset, bypass_perm, bypass_governance_mode);
+ if (object_lock_response != 0) {
+ send_partial_response(o, false, "", object_lock_response, formatter_flush_cond);
+ return;
+ }
+ }
+ }
+
+ // make reservation for notification if needed
+ const auto versioned_object = s->bucket->versioning_enabled();
+ const auto event_type = versioned_object && obj->get_instance().empty() ?
+ rgw::notify::ObjectRemovedDeleteMarkerCreated :
+ rgw::notify::ObjectRemovedDelete;
+ std::unique_ptr<rgw::sal::Notification> res
+ = driver->get_notification(obj.get(), s->src_object.get(), s, event_type, y);
+ op_ret = res->publish_reserve(this);
+ if (op_ret < 0) {
+ send_partial_response(o, false, "", op_ret, formatter_flush_cond);
+ return;
+ }
+
+ obj->set_atomic();
+
+ std::unique_ptr<rgw::sal::Object::DeleteOp> del_op = obj->get_delete_op();
+ del_op->params.versioning_status = obj->get_bucket()->get_info().versioning_status();
+ del_op->params.obj_owner = s->owner;
+ del_op->params.bucket_owner = s->bucket_owner;
+ del_op->params.marker_version_id = version_id;
+
+ op_ret = del_op->delete_obj(this, y);
+ if (op_ret == -ENOENT) {
+ op_ret = 0;
+ }
+
+ send_partial_response(o, del_op->result.delete_marker, del_op->result.version_id, op_ret, formatter_flush_cond);
+
+ // send request to notification manager
+ int ret = res->publish_commit(this, obj_size, ceph::real_clock::now(), etag, version_id);
+ if (ret < 0) {
+ ldpp_dout(this, 1) << "ERROR: publishing notification failed, with error: " << ret << dendl;
+ // too late to rollback operation, hence op_ret is not set here
+ }
+}
+
/// Parse the multi-delete XML body, enforce count/MFA limits, then delete
/// each key — concurrently (up to rgw_multi_obj_del_max_aio coroutines) when
/// a yield context is available, serially otherwise. Responses are streamed
/// per object via send_partial_response().
void RGWDeleteMultiObj::execute(optional_yield y)
{
  RGWMultiDelDelete *multi_delete;
  vector<rgw_obj_key>::iterator iter;
  RGWMultiDelXMLParser parser;
  uint32_t aio_count = 0;
  const uint32_t max_aio = std::max<uint32_t>(1, s->cct->_conf->rgw_multi_obj_del_max_aio);
  char* buf;
  // Timer doubling as a condition variable that paces concurrent deletes
  // and triggers formatter flushes; only used on the coroutine path.
  std::optional<boost::asio::deadline_timer> formatter_flush_cond;
  if (y) {
    formatter_flush_cond = std::make_optional<boost::asio::deadline_timer>(y.get_io_context());
  }

  buf = data.c_str();
  if (!buf) {
    op_ret = -EINVAL;
    goto error;
  }

  if (!parser.init()) {
    op_ret = -EINVAL;
    goto error;
  }

  if (!parser.parse(buf, data.length(), 1)) {
    op_ret = -EINVAL;
    goto error;
  }

  multi_delete = static_cast<RGWMultiDelDelete *>(parser.find_first("Delete"));
  if (!multi_delete) {
    op_ret = -EINVAL;
    goto error;
  } else {
#define DELETE_MULTI_OBJ_MAX_NUM 1000
    int max_num = s->cct->_conf->rgw_delete_multi_obj_max_num;
    if (max_num < 0) {
      // Negative config value means "use the built-in default".
      max_num = DELETE_MULTI_OBJ_MAX_NUM;
    }
    int multi_delete_object_num = multi_delete->objects.size();
    if (multi_delete_object_num > max_num) {
      op_ret = -ERR_MALFORMED_XML;
      goto error;
    }
  }

  if (multi_delete->is_quiet())
    quiet = true;

  // Deleting specific versions requires MFA when the bucket enforces it.
  if (s->bucket->get_info().mfa_enabled()) {
    bool has_versioned = false;
    for (auto i : multi_delete->objects) {
      if (!i.instance.empty()) {
        has_versioned = true;
        break;
      }
    }
    if (has_versioned && !s->mfa_verified) {
      ldpp_dout(this, 5) << "NOTICE: multi-object delete request with a versioned object, mfa auth not provided" << dendl;
      op_ret = -ERR_MFA_REQUIRED;
      goto error;
    }
  }

  begin_response();
  if (multi_delete->objects.empty()) {
    goto done;
  }

  for (iter = multi_delete->objects.begin();
       iter != multi_delete->objects.end();
       ++iter) {
    rgw_obj_key obj_key = *iter;
    if (y) {
      // Throttle: wait until a slot below max_aio frees up, then spawn one
      // coroutine per object.
      wait_flush(y, &*formatter_flush_cond, [&aio_count, max_aio] {
        return aio_count < max_aio;
      });
      aio_count++;
      spawn::spawn(y.get_yield_context(), [this, &y, &aio_count, obj_key, &formatter_flush_cond] (yield_context yield) {
        handle_individual_object(obj_key, optional_yield { y.get_io_context(), yield }, &*formatter_flush_cond);
        aio_count--;
      });
    } else {
      handle_individual_object(obj_key, y, nullptr);
    }
  }
  if (formatter_flush_cond) {
    // Wait for every spawned delete to report before closing the response.
    wait_flush(y, &*formatter_flush_cond, [this, n=multi_delete->objects.size()] {
      return n == ops_log_entries.size();
    });
  }

  /* set the return code to zero, errors at this point will be
     dumped to the response */
  op_ret = 0;

done:
  // will likely segfault if begin_response() has not been called
  end_response();
  return;

error:
  send_status();
  return;

}
+
/// Authorize one bulk-delete path against the target bucket's ACL and
/// policies; also reports the bucket owner through `bucket_owner`.
/// Returns true when access is granted.
bool RGWBulkDelete::Deleter::verify_permission(RGWBucketInfo& binfo,
                                               map<string, bufferlist>& battrs,
                                               ACLOwner& bucket_owner /* out */,
                                               optional_yield y)
{
  RGWAccessControlPolicy bacl(driver->ctx());
  int ret = read_bucket_policy(dpp, driver, s, binfo, battrs, &bacl, binfo.bucket, y);
  if (ret < 0) {
    return false;
  }

  auto policy = get_iam_policy_from_attr(s->cct, battrs, binfo.bucket.tenant);

  bucket_owner = bacl.get_owner();

  /* We can use global user_acl because each BulkDelete request is allowed
   * to work on entities from a single account only. */
  // NOTE(review): the action checked is s3:DeleteBucket even though a path
  // may name an object — confirm this coarse-grained check is intentional
  // for the Swift bulk-delete API (which removes containers and objects).
  return verify_bucket_permission(dpp, s, binfo.bucket, s->user_acl.get(),
                                  &bacl, policy, s->iam_user_policies, s->session_policies, rgw::IAM::s3DeleteBucket);
}
+
/// Delete one entry of a Swift bulk-delete request: an object when
/// path.obj_key is set, otherwise the bucket itself. Failures never abort
/// the request; they bump num_unfound or are appended to `failures`.
/// Returns true only on successful deletion.
bool RGWBulkDelete::Deleter::delete_single(const acct_path_t& path, optional_yield y)
{
  std::unique_ptr<rgw::sal::Bucket> bucket;
  ACLOwner bowner;
  RGWObjVersionTracker ot;

  int ret = driver->get_bucket(dpp, s->user.get(), s->user->get_tenant(), path.bucket_name, &bucket, y);
  if (ret < 0) {
    goto binfo_fail;
  }

  ret = bucket->load_bucket(dpp, s->yield);
  if (ret < 0) {
    goto binfo_fail;
  }

  if (!verify_permission(bucket->get_info(), bucket->get_attrs(), bowner, y)) {
    ret = -EACCES;
    goto auth_fail;
  }

  if (!path.obj_key.empty()) {
    // Object path: delete the single object.
    ACLOwner bucket_owner;

    bucket_owner.set_id(bucket->get_info().owner);
    std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(path.obj_key);
    obj->set_atomic();

    std::unique_ptr<rgw::sal::Object::DeleteOp> del_op = obj->get_delete_op();
    del_op->params.versioning_status = obj->get_bucket()->get_info().versioning_status();
    del_op->params.obj_owner = bowner;
    del_op->params.bucket_owner = bucket_owner;

    ret = del_op->delete_obj(dpp, y);
    if (ret < 0) {
      goto delop_fail;
    }
  } else {
    // Bucket-only path: remove the (empty) bucket.
    ret = bucket->remove_bucket(dpp, false, true, &s->info, s->yield);
    if (ret < 0) {
      goto delop_fail;
    }
  }

  num_deleted++;
  return true;

binfo_fail:
  // Bucket lookup/load failed: missing buckets only bump num_unfound,
  // anything else becomes a per-path failure entry.
  if (-ENOENT == ret) {
    ldpp_dout(dpp, 20) << "cannot find bucket = " << path.bucket_name << dendl;
    num_unfound++;
  } else {
    ldpp_dout(dpp, 20) << "cannot get bucket info, ret = " << ret << dendl;

    fail_desc_t failed_item = {
      .err = ret,
      .path = path
    };
    failures.push_back(failed_item);
  }
  return false;

auth_fail:
  ldpp_dout(dpp, 20) << "wrong auth for " << path << dendl;
  {
    fail_desc_t failed_item = {
      .err = ret,
      .path = path
    };
    failures.push_back(failed_item);
  }
  return false;

delop_fail:
  // Delete failed: ENOENT is tolerated (already gone), else record failure.
  if (-ENOENT == ret) {
    ldpp_dout(dpp, 20) << "cannot find entry " << path << dendl;
    num_unfound++;
  } else {
    fail_desc_t failed_item = {
      .err = ret,
      .path = path
    };
    failures.push_back(failed_item);
  }
  return false;
}
+
+bool RGWBulkDelete::Deleter::delete_chunk(const std::list<acct_path_t>& paths, optional_yield y)
+{
+ ldpp_dout(dpp, 20) << "in delete_chunk" << dendl;
+ for (auto path : paths) {
+ ldpp_dout(dpp, 20) << "bulk deleting path: " << path << dendl;
+ delete_single(path, y);
+ }
+
+ return true;
+}
+
/// Request-level check is a no-op: authorization for bulk delete happens
/// per entry, inside Deleter::verify_permission() for each path.
int RGWBulkDelete::verify_permission(optional_yield y)
{
  return 0;
}
+
/// Pre-execution hook: emits the standard bucket/object debug trace line.
void RGWBulkDelete::pre_exec()
{
  rgw_bucket_object_pre_exec(s);
}
+
+void RGWBulkDelete::execute(optional_yield y)
+{
+ deleter = std::unique_ptr<Deleter>(new Deleter(this, driver, s));
+
+ bool is_truncated = false;
+ do {
+ list<RGWBulkDelete::acct_path_t> items;
+
+ int ret = get_data(items, &is_truncated);
+ if (ret < 0) {
+ return;
+ }
+
+ ret = deleter->delete_chunk(items, y);
+ } while (!op_ret && is_truncated);
+
+ return;
+}
+
+
// Out-of-line definition of the static constexpr member. Required to avoid
// ODR-use link errors before C++17; under C++17 static constexpr members are
// implicitly inline, making this a harmless redundancy.
constexpr std::array<int, 2> RGWBulkUploadOp::terminal_errors;
+
+int RGWBulkUploadOp::verify_permission(optional_yield y)
+{
+ if (s->auth.identity->is_anonymous()) {
+ return -EACCES;
+ }
+
+ if (! verify_user_permission_no_policy(this, s, RGW_PERM_WRITE)) {
+ return -EACCES;
+ }
+
+ if (s->user->get_tenant() != s->bucket_tenant) {
+ ldpp_dout(this, 10) << "user cannot create a bucket in a different tenant"
+ << " (user_id.tenant=" << s->user->get_tenant()
+ << " requested=" << s->bucket_tenant << ")" << dendl;
+ return -EACCES;
+ }
+
+ if (s->user->get_max_buckets() < 0) {
+ return -EPERM;
+ }
+
+ return 0;
+}
+
/// Pre-execution hook: emits the standard bucket/object debug trace line.
void RGWBulkUploadOp::pre_exec()
{
  rgw_bucket_object_pre_exec(s);
}
+
+boost::optional<std::pair<std::string, rgw_obj_key>>
+RGWBulkUploadOp::parse_path(const std::string_view& path)
+{
+ /* We need to skip all slashes at the beginning in order to preserve
+ * compliance with Swift. */
+ const size_t start_pos = path.find_first_not_of('/');
+
+ if (std::string_view::npos != start_pos) {
+ /* Seperator is the first slash after the leading ones. */
+ const size_t sep_pos = path.substr(start_pos).find('/');
+
+ if (std::string_view::npos != sep_pos) {
+ const auto bucket_name = path.substr(start_pos, sep_pos - start_pos);
+ const auto obj_name = path.substr(sep_pos + 1);
+
+ return std::make_pair(std::string(bucket_name),
+ rgw_obj_key(std::string(obj_name)));
+ } else {
+ /* It's guaranteed here that bucket name is at least one character
+ * long and is different than slash. */
+ return std::make_pair(std::string(path.substr(start_pos)),
+ rgw_obj_key());
+ }
+ }
+
+ return none;
+}
+
+std::pair<std::string, std::string>
+RGWBulkUploadOp::handle_upload_path(req_state *s)
+{
+ std::string bucket_path, file_prefix;
+ if (! s->init_state.url_bucket.empty()) {
+ file_prefix = bucket_path = s->init_state.url_bucket + "/";
+ if (!rgw::sal::Object::empty(s->object.get())) {
+ const std::string& object_name = s->object->get_name();
+
+ /* As rgw_obj_key::empty() already verified emptiness of s->object->get_name(),
+ * we can safely examine its last element. */
+ if (object_name.back() == '/') {
+ file_prefix.append(object_name);
+ } else {
+ file_prefix.append(object_name).append("/");
+ }
+ }
+ }
+ return std::make_pair(bucket_path, file_prefix);
+}
+
/// Enforce the per-user bucket quota before creating a bucket for a TAR
/// directory entry. When max_buckets <= 0 no count check is performed here
/// (negative values were already rejected in verify_permission()).
/// Returns 0, -ERR_TOO_MANY_BUCKETS, or a listing error.
int RGWBulkUploadOp::handle_dir_verify_permission(optional_yield y)
{
  if (s->user->get_max_buckets() > 0) {
    rgw::sal::BucketList buckets;
    std::string marker;
    op_ret = s->user->list_buckets(this, marker, std::string(), s->user->get_max_buckets(),
                                   false, buckets, y);
    if (op_ret < 0) {
      return op_ret;
    }

    if (buckets.count() >= static_cast<size_t>(s->user->get_max_buckets())) {
      return -ERR_TOO_MANY_BUCKETS;
    }
  }

  return 0;
}
+
+static void forward_req_info(const DoutPrefixProvider *dpp, CephContext *cct, req_info& info, const std::string& bucket_name)
+{
+ /* the request of container or object level will contain bucket name.
+ * only at account level need to append the bucket name */
+ if (info.script_uri.find(bucket_name) != std::string::npos) {
+ return;
+ }
+
+ ldpp_dout(dpp, 20) << "append the bucket: "<< bucket_name << " to req_info" << dendl;
+ info.script_uri.append("/").append(bucket_name);
+ info.request_uri_aws4 = info.request_uri = info.script_uri;
+ info.effective_uri = "/" + bucket_name;
+}
+
/// Currently adds nothing beyond the base-class initialization; kept as an
/// override point for bulk-upload-specific setup.
void RGWBulkUploadOp::init(rgw::sal::Driver* const driver,
                           req_state* const s,
                           RGWHandler* const h)
{
  RGWOp::init(driver, s, h);
}
+
/// Handle a TAR directory entry, which maps onto bucket creation in Swift's
/// bulk-upload semantics. Returns op_ret (negative errno on failure;
/// -ERR_BUCKET_EXISTS is tolerated by the caller).
int RGWBulkUploadOp::handle_dir(const std::string_view path, optional_yield y)
{
  ldpp_dout(this, 20) << "got directory=" << path << dendl;

  op_ret = handle_dir_verify_permission(y);
  if (op_ret < 0) {
    return op_ret;
  }

  // Only the bucket component is used; directories carry no object key.
  std::string bucket_name;
  rgw_obj_key object_junk;
  std::tie(bucket_name, object_junk) = *parse_path(path);

  /* we need to make sure we read bucket info, it's not read before for this
   * specific request */
  std::unique_ptr<rgw::sal::Bucket> bucket;

  /* Create metadata: ACLs. */
  std::map<std::string, ceph::bufferlist> attrs;
  RGWAccessControlPolicy policy;
  policy.create_default(s->user->get_id(), s->user->get_display_name());
  ceph::bufferlist aclbl;
  policy.encode(aclbl);
  attrs.emplace(RGW_ATTR_ACL, std::move(aclbl));

  obj_version objv, ep_objv;
  bool bucket_exists;
  RGWQuotaInfo quota_info;
  const RGWQuotaInfo* pquota_info = nullptr;
  RGWBucketInfo out_info;
  string swift_ver_location;
  rgw_bucket new_bucket;
  req_info info = s->info;
  new_bucket.tenant = s->bucket_tenant; /* ignored if bucket exists */
  new_bucket.name = bucket_name;
  rgw_placement_rule placement_rule;
  placement_rule.storage_class = s->info.storage_class;
  forward_req_info(this, s->cct, info, bucket_name);

  op_ret = s->user->create_bucket(this, new_bucket,
                                  driver->get_zone()->get_zonegroup().get_id(),
                                  placement_rule, swift_ver_location,
                                  pquota_info, policy, attrs,
                                  out_info, ep_objv,
                                  true, false, &bucket_exists,
                                  info, &bucket, y);
  /* continue if EEXIST and create_bucket will fail below. this way we can
   * recover from a partial create by retrying it. */
  ldpp_dout(this, 20) << "rgw_create_bucket returned ret=" << op_ret
                      << ", bucket=" << bucket << dendl;

  return op_ret;
}
+
+
+bool RGWBulkUploadOp::handle_file_verify_permission(RGWBucketInfo& binfo,
+ const rgw_obj& obj,
+ std::map<std::string, ceph::bufferlist>& battrs,
+ ACLOwner& bucket_owner /* out */,
+ optional_yield y)
+{
+ RGWAccessControlPolicy bacl(driver->ctx());
+ op_ret = read_bucket_policy(this, driver, s, binfo, battrs, &bacl, binfo.bucket, y);
+ if (op_ret < 0) {
+ ldpp_dout(this, 20) << "cannot read_policy() for bucket" << dendl;
+ return false;
+ }
+
+ auto policy = get_iam_policy_from_attr(s->cct, battrs, binfo.bucket.tenant);
+
+ bucket_owner = bacl.get_owner();
+ if (policy || ! s->iam_user_policies.empty() || !s->session_policies.empty()) {
+ auto identity_policy_res = eval_identity_or_session_policies(this, s->iam_user_policies, s->env,
+ rgw::IAM::s3PutObject, obj);
+ if (identity_policy_res == Effect::Deny) {
+ return false;
+ }
+
+ rgw::IAM::PolicyPrincipal princ_type = rgw::IAM::PolicyPrincipal::Other;
+ ARN obj_arn(obj);
+ auto e = policy->eval(s->env, *s->auth.identity,
+ rgw::IAM::s3PutObject, obj_arn, princ_type);
+ if (e == Effect::Deny) {
+ return false;
+ }
+
+ if (!s->session_policies.empty()) {
+ auto session_policy_res = eval_identity_or_session_policies(this, s->session_policies, s->env,
+ rgw::IAM::s3PutObject, obj);
+ if (session_policy_res == Effect::Deny) {
+ return false;
+ }
+ if (princ_type == rgw::IAM::PolicyPrincipal::Role) {
+ //Intersection of session policy and identity policy plus intersection of session policy and bucket policy
+ if ((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) ||
+ (session_policy_res == Effect::Allow && e == Effect::Allow)) {
+ return true;
+ }
+ } else if (princ_type == rgw::IAM::PolicyPrincipal::Session) {
+ //Intersection of session policy and identity policy plus bucket policy
+ if ((session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) || e == Effect::Allow) {
+ return true;
+ }
+ } else if (princ_type == rgw::IAM::PolicyPrincipal::Other) {// there was no match in the bucket policy
+ if (session_policy_res == Effect::Allow && identity_policy_res == Effect::Allow) {
+ return true;
+ }
+ }
+ return false;
+ }
+ if (e == Effect::Allow || identity_policy_res == Effect::Allow) {
+ return true;
+ }
+ }
+
+ return verify_bucket_permission_no_policy(this, s, s->user_acl.get(),
+ &bacl, RGW_PERM_WRITE);
+}
+
/// Store one TAR file entry as an object: size gate, authorization, quota
/// checks, streaming the body through an optional compression filter while
/// computing the MD5 ETag, then completing the atomic write with
/// ETag/ACL/compression attributes. Returns op_ret.
int RGWBulkUploadOp::handle_file(const std::string_view path,
                                 const size_t size,
                                 AlignedStreamGetter& body, optional_yield y)
{

  ldpp_dout(this, 20) << "got file=" << path << ", size=" << size << dendl;

  if (size > static_cast<size_t>(s->cct->_conf->rgw_max_put_size)) {
    op_ret = -ERR_TOO_LARGE;
    return op_ret;
  }

  std::string bucket_name;
  rgw_obj_key object;
  std::tie(bucket_name, object) = *parse_path(path);

  std::unique_ptr<rgw::sal::Bucket> bucket;
  ACLOwner bowner;

  op_ret = driver->get_bucket(this, s->user.get(), rgw_bucket(rgw_bucket_key(s->user->get_tenant(), bucket_name)), &bucket, y);
  if (op_ret < 0) {
    if (op_ret == -ENOENT) {
      ldpp_dout(this, 20) << "non existent directory=" << bucket_name << dendl;
    }
    return op_ret;
  }

  std::unique_ptr<rgw::sal::Object> obj = bucket->get_object(object);

  if (! handle_file_verify_permission(bucket->get_info(),
                                      obj->get_obj(),
                                      bucket->get_attrs(), bowner, y)) {
    ldpp_dout(this, 20) << "object creation unauthorized" << dendl;
    op_ret = -EACCES;
    return op_ret;
  }

  // Pre-check quota against the declared size; re-checked after upload.
  op_ret = bucket->check_quota(this, quota, size, y);
  if (op_ret < 0) {
    return op_ret;
  }

  if (bucket->versioning_enabled()) {
    obj->gen_rand_obj_instance_name();
  }

  rgw_placement_rule dest_placement = s->dest_placement;
  dest_placement.inherit_from(bucket->get_placement_rule());

  std::unique_ptr<rgw::sal::Writer> processor;
  // NOTE(review): the writer receives &s->dest_placement, although the
  // bucket-inherited `dest_placement` computed just above is what the
  // compression lookup below uses — confirm which rule the writer should get.
  processor = driver->get_atomic_writer(this, s->yield, obj.get(),
                                        bowner.get_id(),
                                        &s->dest_placement, 0, s->req_id);
  op_ret = processor->prepare(s->yield);
  if (op_ret < 0) {
    ldpp_dout(this, 20) << "cannot prepare processor due to ret=" << op_ret << dendl;
    return op_ret;
  }

  /* No filters by default. */
  rgw::sal::DataProcessor *filter = processor.get();

  const auto& compression_type = driver->get_compression_type(dest_placement);
  CompressorRef plugin;
  boost::optional<RGWPutObj_Compress> compressor;
  if (compression_type != "none") {
    plugin = Compressor::create(s->cct, compression_type);
    if (! plugin) {
      // Non-fatal: upload proceeds uncompressed.
      ldpp_dout(this, 1) << "Cannot load plugin for rgw_compression_type "
                         << compression_type << dendl;
    } else {
      compressor.emplace(s->cct, plugin, filter);
      filter = &*compressor;
    }
  }

  /* Upload file content. */
  ssize_t len = 0;
  size_t ofs = 0;
  MD5 hash;
  // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
  hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
  do {
    ceph::bufferlist data;
    len = body.get_at_most(s->cct->_conf->rgw_max_chunk_size, data);

    // NOTE(review): logs the raw chunk at debug level 20 — noisy and
    // potentially sensitive; consider logging only the length.
    ldpp_dout(this, 20) << "body=" << data.c_str() << dendl;
    if (len < 0) {
      op_ret = len;
      return op_ret;
    } else if (len > 0) {
      hash.Update((const unsigned char *)data.c_str(), data.length());
      op_ret = filter->process(std::move(data), ofs);
      if (op_ret < 0) {
        ldpp_dout(this, 20) << "filter->process() returned ret=" << op_ret << dendl;
        return op_ret;
      }

      ofs += len;
    }

  } while (len > 0);

  // flush
  op_ret = filter->process({}, ofs);
  if (op_ret < 0) {
    return op_ret;
  }

  // The TAR header declared `size`; a mismatch means a truncated/corrupt entry.
  if (ofs != size) {
    ldpp_dout(this, 10) << "real file size different from declared" << dendl;
    op_ret = -EINVAL;
    return op_ret;
  }

  op_ret = bucket->check_quota(this, quota, size, y);
  if (op_ret < 0) {
    ldpp_dout(this, 20) << "quota exceeded for path=" << path << dendl;
    return op_ret;
  }

  char calc_md5[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
  unsigned char m[CEPH_CRYPTO_MD5_DIGESTSIZE];
  hash.Final(m);
  buf_to_hex(m, CEPH_CRYPTO_MD5_DIGESTSIZE, calc_md5);

  /* Create metadata: ETAG. */
  std::map<std::string, ceph::bufferlist> attrs;
  std::string etag = calc_md5;
  ceph::bufferlist etag_bl;
  etag_bl.append(etag.c_str(), etag.size() + 1);
  attrs.emplace(RGW_ATTR_ETAG, std::move(etag_bl));

  /* Create metadata: ACLs. */
  RGWAccessControlPolicy policy;
  policy.create_default(s->user->get_id(), s->user->get_display_name());
  ceph::bufferlist aclbl;
  policy.encode(aclbl);
  attrs.emplace(RGW_ATTR_ACL, std::move(aclbl));

  /* Create metadata: compression info. */
  if (compressor && compressor->is_compressed()) {
    ceph::bufferlist tmp;
    RGWCompressionInfo cs_info;
    cs_info.compression_type = plugin->get_type_name();
    cs_info.orig_size = size;
    cs_info.compressor_message = compressor->get_compressor_message();
    cs_info.blocks = std::move(compressor->get_compression_blocks());
    encode(cs_info, tmp);
    attrs.emplace(RGW_ATTR_COMPRESSION, std::move(tmp));
  }

  /* Complete the transaction. */
  op_ret = processor->complete(size, etag, nullptr, ceph::real_time(),
                               attrs, ceph::real_time() /* delete_at */,
                               nullptr, nullptr, nullptr, nullptr, nullptr,
                               s->yield);
  if (op_ret < 0) {
    ldpp_dout(this, 20) << "processor::complete returned op_ret=" << op_ret << dendl;
  }

  return op_ret;
}
+
/// Drive the Swift bulk upload: read the request body as a TAR archive one
/// 512-byte block at a time, dispatch regular-file entries to handle_file()
/// and directory entries to handle_dir(), and stop early on a terminal
/// (authorization) error. Per-entry failures are collected in `failures`.
void RGWBulkUploadOp::execute(optional_yield y)
{
  ceph::bufferlist buffer(64 * 1024);

  ldpp_dout(this, 20) << "start" << dendl;

  /* Create an instance of stream-abstracting class. Having this indirection
   * allows for easy introduction of decompressors like gzip and bzip2. */
  auto stream = create_stream();
  if (! stream) {
    return;
  }

  /* Handling the $UPLOAD_PATH accordingly to the Swift's Bulk middleware. See:
   * https://github.com/openstack/swift/blob/2.13.0/swift/common/middleware/bulk.py#L31-L41 */
  std::string bucket_path, file_prefix;
  std::tie(bucket_path, file_prefix) = handle_upload_path(s);

  auto status = rgw::tar::StatusIndicator::create();
  do {
    op_ret = stream->get_exactly(rgw::tar::BLOCK_SIZE, buffer);
    if (op_ret < 0) {
      ldpp_dout(this, 2) << "cannot read header" << dendl;
      return;
    }

    /* We need to re-interpret the buffer as a TAR block. Exactly two blocks
     * must be tracked to detect out end-of-archive. It occurs when both of
     * them are empty (zeroed). Tracing this particular inter-block dependency
     * is responsibility of the rgw::tar::StatusIndicator class. */
    boost::optional<rgw::tar::HeaderView> header;
    std::tie(status, header) = rgw::tar::interpret_block(status, buffer);

    if (! status.empty() && header) {
      /* This specific block isn't empty (entirely zeroed), so we can parse
       * it as a TAR header and dispatch. At the moment we do support only
       * regular files and directories. Everything else (symlinks, devices)
       * will be ignored but won't cease the whole upload. */
      switch (header->get_filetype()) {
        case rgw::tar::FileType::NORMAL_FILE: {
          ldpp_dout(this, 2) << "handling regular file" << dendl;

          std::string filename;
          if (bucket_path.empty())
            filename = header->get_filename();
          else
            filename = file_prefix + std::string(header->get_filename());
          // AlignedStreamGetter pads reads up to the TAR block boundary.
          auto body = AlignedStreamGetter(0, header->get_filesize(),
                                          rgw::tar::BLOCK_SIZE, *stream);
          op_ret = handle_file(filename,
                               header->get_filesize(),
                               body, y);
          if (! op_ret) {
            /* Only regular files counts. */
            num_created++;
          } else {
            failures.emplace_back(op_ret, std::string(filename));
          }
          break;
        }
        case rgw::tar::FileType::DIRECTORY: {
          ldpp_dout(this, 2) << "handling regular directory" << dendl;

          std::string_view dirname = bucket_path.empty() ? header->get_filename() : bucket_path;
          op_ret = handle_dir(dirname, y);
          // Re-creating an existing bucket is not reported as a failure.
          if (op_ret < 0 && op_ret != -ERR_BUCKET_EXISTS) {
            failures.emplace_back(op_ret, std::string(dirname));
          }
          break;
        }
        default: {
          /* Not recognized. Skip. */
          op_ret = 0;
          break;
        }
      }

      /* In case of any problems with sub-request authorization Swift simply
       * terminates whole upload immediately. */
      if (boost::algorithm::contains(std::initializer_list<int>{ op_ret },
                                     terminal_errors)) {
        ldpp_dout(this, 2) << "terminating due to ret=" << op_ret << dendl;
        break;
      }
    } else {
      ldpp_dout(this, 2) << "an empty block" << dendl;
      op_ret = 0;
    }

    buffer.clear();
  } while (! status.eof());

  return;
}
+
+RGWBulkUploadOp::AlignedStreamGetter::~AlignedStreamGetter()
+{
+ const size_t aligned_legnth = length + (-length % alignment);
+ ceph::bufferlist junk;
+
+ DecoratedStreamGetter::get_exactly(aligned_legnth - position, junk);
+}
+
+ssize_t RGWBulkUploadOp::AlignedStreamGetter::get_at_most(const size_t want,
+ ceph::bufferlist& dst)
+{
+ const size_t max_to_read = std::min(want, length - position);
+ const auto len = DecoratedStreamGetter::get_at_most(max_to_read, dst);
+ if (len > 0) {
+ position += len;
+ }
+ return len;
+}
+
+ssize_t RGWBulkUploadOp::AlignedStreamGetter::get_exactly(const size_t want,
+ ceph::bufferlist& dst)
+{
+ const auto len = DecoratedStreamGetter::get_exactly(want, dst);
+ if (len > 0) {
+ position += len;
+ }
+ return len;
+}
+
+int RGWGetAttrs::verify_permission(optional_yield y)
+{
+ s->object->set_atomic();
+
+ auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s);
+ if (has_s3_existing_tag || has_s3_resource_tag)
+ rgw_iam_add_objtags(this, s, has_s3_existing_tag, has_s3_resource_tag);
+
+ auto iam_action = s->object->get_instance().empty() ?
+ rgw::IAM::s3GetObject :
+ rgw::IAM::s3GetObjectVersion;
+
+ if (!verify_object_permission(this, s, iam_action)) {
+ return -EACCES;
+ }
+
+ return 0;
+}
+
/* Standard pre-execution hook: record bucket/object info for op logging. */
void RGWGetAttrs::pre_exec()
{
  rgw_bucket_object_pre_exec(s);
}
+
+void RGWGetAttrs::execute(optional_yield y)
+{
+ op_ret = get_params();
+ if (op_ret < 0)
+ return;
+
+ s->object->set_atomic();
+
+ op_ret = s->object->get_obj_attrs(s->yield, this);
+ if (op_ret < 0) {
+ ldpp_dout(this, 0) << "ERROR: failed to get obj attrs, obj=" << s->object
+ << " ret=" << op_ret << dendl;
+ return;
+ }
+
+ /* XXX RGWObject::get_obj_attrs() does not support filtering (yet) */
+ auto& obj_attrs = s->object->get_attrs();
+ if (attrs.size() != 0) {
+ /* return only attrs requested */
+ for (auto& att : attrs) {
+ auto iter = obj_attrs.find(att.first);
+ if (iter != obj_attrs.end()) {
+ att.second = iter->second;
+ }
+ }
+ } else {
+ /* return all attrs */
+ for (auto& att : obj_attrs) {
+ attrs.insert(get_attrs_t::value_type(att.first, att.second));;
+ }
+ }
+
+ return;
+ }
+
+int RGWRMAttrs::verify_permission(optional_yield y)
+{
+ // This looks to be part of the RGW-NFS machinery and has no S3 or
+ // Swift equivalent.
+ bool perm;
+ if (!rgw::sal::Object::empty(s->object.get())) {
+ perm = verify_object_permission_no_policy(this, s, RGW_PERM_WRITE);
+ } else {
+ perm = verify_bucket_permission_no_policy(this, s, RGW_PERM_WRITE);
+ }
+ if (!perm)
+ return -EACCES;
+
+ return 0;
+}
+
/* Standard pre-execution hook: record bucket/object info for op logging. */
void RGWRMAttrs::pre_exec()
{
  rgw_bucket_object_pre_exec(s);
}
+
+void RGWRMAttrs::execute(optional_yield y)
+{
+ op_ret = get_params();
+ if (op_ret < 0)
+ return;
+
+ s->object->set_atomic();
+
+ op_ret = s->object->set_obj_attrs(this, nullptr, &attrs, y);
+ if (op_ret < 0) {
+ ldpp_dout(this, 0) << "ERROR: failed to delete obj attrs, obj=" << s->object
+ << " ret=" << op_ret << dendl;
+ }
+ return;
+}
+
+int RGWSetAttrs::verify_permission(optional_yield y)
+{
+ // This looks to be part of the RGW-NFS machinery and has no S3 or
+ // Swift equivalent.
+ bool perm;
+ if (!rgw::sal::Object::empty(s->object.get())) {
+ perm = verify_object_permission_no_policy(this, s, RGW_PERM_WRITE);
+ } else {
+ perm = verify_bucket_permission_no_policy(this, s, RGW_PERM_WRITE);
+ }
+ if (!perm)
+ return -EACCES;
+
+ return 0;
+}
+
/* Standard pre-execution hook: record bucket/object info for op logging. */
void RGWSetAttrs::pre_exec()
{
  rgw_bucket_object_pre_exec(s);
}
+
+void RGWSetAttrs::execute(optional_yield y)
+{
+ op_ret = get_params(y);
+ if (op_ret < 0)
+ return;
+
+ if (!rgw::sal::Object::empty(s->object.get())) {
+ rgw::sal::Attrs a(attrs);
+ op_ret = s->object->set_obj_attrs(this, &a, nullptr, y);
+ } else {
+ op_ret = s->bucket->merge_and_store_attrs(this, attrs, y);
+ }
+
+} /* RGWSetAttrs::execute() */
+
/* Standard pre-execution hook: record bucket/object info for op logging. */
void RGWGetObjLayout::pre_exec()
{
  rgw_bucket_object_pre_exec(s);
}
+
/* Intentionally empty: the response is produced entirely by the
 * dialect-specific send_response() implementation. */
void RGWGetObjLayout::execute(optional_yield y)
{
}
+
+
+int RGWConfigBucketMetaSearch::verify_permission(optional_yield y)
+{
+ if (!s->auth.identity->is_owner_of(s->bucket_owner.get_id())) {
+ return -EACCES;
+ }
+
+ return 0;
+}
+
/* Standard pre-execution hook: record bucket/object info for op logging. */
void RGWConfigBucketMetaSearch::pre_exec()
{
  rgw_bucket_object_pre_exec(s);
}
+
/* Persist the parsed mdsearch configuration into the bucket's info and
 * refresh the cached bucket attrs on success. */
void RGWConfigBucketMetaSearch::execute(optional_yield y)
{
  op_ret = get_params(y);
  if (op_ret < 0) {
    ldpp_dout(this, 20) << "NOTICE: get_params() returned ret=" << op_ret << dendl;
    return;
  }

  s->bucket->get_info().mdsearch_config = mdsearch_config;

  /* false: do not create the bucket if missing; real_time() leaves mtime
   * handling to the store */
  op_ret = s->bucket->put_info(this, false, real_time());
  if (op_ret < 0) {
    ldpp_dout(this, 0) << "NOTICE: put_bucket_info on bucket=" << s->bucket->get_name()
        << " returned err=" << op_ret << dendl;
    return;
  }
  s->bucket_attrs = s->bucket->get_attrs();
}
+
+int RGWGetBucketMetaSearch::verify_permission(optional_yield y)
+{
+ if (!s->auth.identity->is_owner_of(s->bucket_owner.get_id())) {
+ return -EACCES;
+ }
+
+ return 0;
+}
+
/* Standard pre-execution hook: record bucket/object info for op logging. */
void RGWGetBucketMetaSearch::pre_exec()
{
  rgw_bucket_object_pre_exec(s);
}
+
+int RGWDelBucketMetaSearch::verify_permission(optional_yield y)
+{
+ if (!s->auth.identity->is_owner_of(s->bucket_owner.get_id())) {
+ return -EACCES;
+ }
+
+ return 0;
+}
+
/* Standard pre-execution hook: record bucket/object info for op logging. */
void RGWDelBucketMetaSearch::pre_exec()
{
  rgw_bucket_object_pre_exec(s);
}
+
/* Clear the bucket's mdsearch configuration and persist the bucket info,
 * refreshing the cached attrs on success. */
void RGWDelBucketMetaSearch::execute(optional_yield y)
{
  s->bucket->get_info().mdsearch_config.clear();

  op_ret = s->bucket->put_info(this, false, real_time());
  if (op_ret < 0) {
    ldpp_dout(this, 0) << "NOTICE: put_bucket_info on bucket=" << s->bucket->get_name()
        << " returned err=" << op_ret << dendl;
    return;
  }
  s->bucket_attrs = s->bucket->get_attrs();
}
+
+
/* Out-of-line virtual destructor anchors the vtable for RGWHandler. */
RGWHandler::~RGWHandler()
{
}
+
/* Bind the handler to the store driver and request state.
 * Note: `cio` is accepted for interface compatibility but unused here;
 * derived handlers that need the client IO override init(). */
int RGWHandler::init(rgw::sal::Driver* _driver,
                     req_state *_s,
                     rgw::io::BasicClient *cio)
{
  driver = _driver;
  s = _s;

  return 0;
}
+
/* Build the bucket-level policies/ACLs for this request and populate the
 * IAM request environment.  -ENODATA from policy build is reported to the
 * client as -EACCES so internal "no data" states don't leak. */
int RGWHandler::do_init_permissions(const DoutPrefixProvider *dpp, optional_yield y)
{
  int ret = rgw_build_bucket_policies(dpp, driver, s, y);
  if (ret < 0) {
    ldpp_dout(dpp, 10) << "init_permissions on " << s->bucket
        << " failed, ret=" << ret << dendl;
    return ret==-ENODATA ? -EACCES : ret;
  }

  rgw_build_iam_environment(driver, s);
  return ret;
}
+
/* Load object-level policies/ACLs unless the op is bucket-only (bucket
 * policies were already read in do_init_permissions).  Error mapping:
 * -ENODATA becomes -EACCES, and -EACCES for anonymous callers becomes
 * -EPERM so clients are prompted to authenticate. */
int RGWHandler::do_read_permissions(RGWOp *op, bool only_bucket, optional_yield y)
{
  if (only_bucket) {
    /* already read bucket info */
    return 0;
  }
  int ret = rgw_build_object_policies(op, driver, s, op->prefetch_data(), y);

  if (ret < 0) {
    ldpp_dout(op, 10) << "read_permissions on " << s->bucket << ":"
        << s->object << " only_bucket=" << only_bucket
        << " ret=" << ret << dendl;
    if (ret == -ENODATA)
      ret = -EACCES;
    if (s->auth.identity->is_anonymous() && ret == -EACCES)
      ret = -EPERM;
  }

  return ret;
}
+
/* Delegate error translation to the dialect (S3/Swift) handler. */
int RGWOp::error_handler(int err_no, string *error_content, optional_yield y) {
  return dialect_handler->error_handler(err_no, error_content, y);
}
+
/* Base-class error handler: passes the error through unchanged. */
int RGWHandler::error_handler(int err_no, string *error_content, optional_yield) {
  // This is the do-nothing error handler
  return err_no;
}
+
/* Log-line prefix for this op: request prefix plus "<dialect>:<op name>". */
std::ostream& RGWOp::gen_prefix(std::ostream& out) const
{
  // append <dialect>:<op name> to the prefix
  return s->gen_prefix(out) << s->dialect << ':' << name() << ' ';
}
+
/* Minimal response: status derived from op_ret, headers only, no body. */
void RGWDefaultResponseOp::send_response() {
  if (op_ret) {
    set_req_state_err(s, op_ret);
  }
  dump_errno(s);
  end_header(s);
}
+
+void RGWPutBucketPolicy::send_response()
+{
+ if (!op_ret) {
+ /* A successful Put Bucket Policy should return a 204 on success */
+ op_ret = STATUS_NO_CONTENT;
+ }
+ if (op_ret) {
+ set_req_state_err(s, op_ret);
+ }
+ dump_errno(s);
+ end_header(s);
+}
+
+int RGWPutBucketPolicy::verify_permission(optional_yield y)
+{
+ auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false);
+ if (has_s3_resource_tag)
+ rgw_iam_add_buckettags(this, s);
+
+ if (!verify_bucket_permission(this, s, rgw::IAM::s3PutBucketPolicy)) {
+ return -EACCES;
+ }
+
+ return 0;
+}
+
/* Read the whole policy document from the request body into `data`,
 * bounded by rgw_max_put_param_size.  Returns the read status in op_ret. */
int RGWPutBucketPolicy::get_params(optional_yield y)
{
  const auto max_size = s->cct->_conf->rgw_max_put_param_size;
  // At some point when I have more time I want to make a version of
  // rgw_rest_read_all_input that doesn't use malloc.
  std::tie(op_ret, data) = read_all_input(s, max_size, false);

  // And throws exceptions.
  return op_ret;
}
+
/* Validate and store a bucket policy:
 *  1. read the body, 2. forward to the master zone (multisite consistency),
 *  3. parse/validate the document, 4. enforce PublicAccessBlock, and
 *  5. write the policy into the bucket attrs with raced-write retry. */
void RGWPutBucketPolicy::execute(optional_yield y)
{
  op_ret = get_params(y);
  if (op_ret < 0) {
    return;
  }

  op_ret = driver->forward_request_to_master(this, s->user.get(), nullptr, data, nullptr, s->info, y);
  if (op_ret < 0) {
    ldpp_dout(this, 20) << "forward_request_to_master returned ret=" << op_ret << dendl;
    return;
  }

  try {
    /* The Policy constructor performs full validation and throws
     * PolicyParseException on malformed input. */
    const Policy p(
      s->cct, s->bucket_tenant, data,
      s->cct->_conf.get_val<bool>("rgw_policy_reject_invalid_principals"));
    rgw::sal::Attrs attrs(s->bucket_attrs);
    /* Honor BlockPublicPolicy: refuse to attach a public policy. */
    if (s->bucket_access_conf &&
        s->bucket_access_conf->block_public_policy() &&
        rgw::IAM::is_public(p)) {
      op_ret = -EACCES;
      return;
    }

    op_ret = retry_raced_bucket_write(this, s->bucket.get(), [&p, this, &attrs] {
	attrs[RGW_ATTR_IAM_POLICY].clear();
	attrs[RGW_ATTR_IAM_POLICY].append(p.text);
	op_ret = s->bucket->merge_and_store_attrs(this, attrs, s->yield);
	return op_ret;
      });
  } catch (rgw::IAM::PolicyParseException& e) {
    ldpp_dout(this, 5) << "failed to parse policy: " << e.what() << dendl;
    op_ret = -EINVAL;
    s->err.message = e.what();
  }
}
+
/* Emit the stored policy document as the JSON response body. */
void RGWGetBucketPolicy::send_response()
{
  if (op_ret) {
    set_req_state_err(s, op_ret);
  }
  dump_errno(s);
  end_header(s, this, "application/json");
  dump_body(s, policy);
}
+
+int RGWGetBucketPolicy::verify_permission(optional_yield y)
+{
+ auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false);
+ if (has_s3_resource_tag)
+ rgw_iam_add_buckettags(this, s);
+
+ if (!verify_bucket_permission(this, s, rgw::IAM::s3GetBucketPolicy)) {
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+void RGWGetBucketPolicy::execute(optional_yield y)
+{
+ rgw::sal::Attrs attrs(s->bucket_attrs);
+ auto aiter = attrs.find(RGW_ATTR_IAM_POLICY);
+ if (aiter == attrs.end()) {
+ ldpp_dout(this, 0) << "can't find bucket IAM POLICY attr bucket_name = "
+ << s->bucket_name << dendl;
+ op_ret = -ERR_NO_SUCH_BUCKET_POLICY;
+ s->err.message = "The bucket policy does not exist";
+ return;
+ } else {
+ policy = attrs[RGW_ATTR_IAM_POLICY];
+
+ if (policy.length() == 0) {
+ ldpp_dout(this, 10) << "The bucket policy does not exist, bucket: "
+ << s->bucket_name << dendl;
+ op_ret = -ERR_NO_SUCH_BUCKET_POLICY;
+ s->err.message = "The bucket policy does not exist";
+ return;
+ }
+ }
+}
+
/* Headers-only response; status derived from op_ret. */
void RGWDeleteBucketPolicy::send_response()
{
  if (op_ret) {
    set_req_state_err(s, op_ret);
  }
  dump_errno(s);
  end_header(s);
}
+
+int RGWDeleteBucketPolicy::verify_permission(optional_yield y)
+{
+ auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false);
+ if (has_s3_resource_tag)
+ rgw_iam_add_buckettags(this, s);
+
+ if (!verify_bucket_permission(this, s, rgw::IAM::s3DeleteBucketPolicy)) {
+ return -EACCES;
+ }
+
+ return 0;
+}
+
/* Remove the bucket policy attr, after forwarding the delete to the master
 * zone for multisite consistency. */
void RGWDeleteBucketPolicy::execute(optional_yield y)
{
  bufferlist data;
  op_ret = driver->forward_request_to_master(this, s->user.get(), nullptr, data, nullptr, s->info, y);
  if (op_ret < 0) {
    ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
    return;
  }

  /* Retried on racing writers; the lambda re-reads and re-stores attrs. */
  op_ret = retry_raced_bucket_write(this, s->bucket.get(), [this] {
      rgw::sal::Attrs attrs(s->bucket_attrs);
      attrs.erase(RGW_ATTR_IAM_POLICY);
      op_ret = s->bucket->merge_and_store_attrs(this, attrs, s->yield);
      return op_ret;
    });
}
+
/* Standard pre-execution hook: record bucket/object info for op logging. */
void RGWPutBucketObjectLock::pre_exec()
{
  rgw_bucket_object_pre_exec(s);
}
+
+int RGWPutBucketObjectLock::verify_permission(optional_yield y)
+{
+ auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false);
+ if (has_s3_resource_tag)
+ rgw_iam_add_buckettags(this, s);
+
+ return verify_bucket_owner_or_policy(s, rgw::IAM::s3PutBucketObjectLockConfiguration);
+}
+
/* Store an ObjectLockConfiguration on the bucket.  Requires that object
 * lock was enabled at bucket creation; validates the XML and the retention
 * period, forwards to the master zone, then persists bucket info with
 * raced-write retry. */
void RGWPutBucketObjectLock::execute(optional_yield y)
{
  if (!s->bucket->get_info().obj_lock_enabled()) {
    s->err.message = "object lock configuration can't be set if bucket object lock not enabled";
    ldpp_dout(this, 4) << "ERROR: " << s->err.message << dendl;
    op_ret = -ERR_INVALID_BUCKET_STATE;
    return;
  }

  RGWXMLDecoder::XMLParser parser;
  if (!parser.init()) {
    ldpp_dout(this, 0) << "ERROR: failed to initialize parser" << dendl;
    op_ret = -EINVAL;
    return;
  }
  op_ret = get_params(y);
  if (op_ret < 0) {
    return;
  }
  if (!parser.parse(data.c_str(), data.length(), 1)) {
    op_ret = -ERR_MALFORMED_XML;
    return;
  }

  try {
    RGWXMLDecoder::decode_xml("ObjectLockConfiguration", obj_lock, &parser, true);
  } catch (RGWXMLDecoder::err& err) {
    ldpp_dout(this, 5) << "unexpected xml:" << err << dendl;
    op_ret = -ERR_MALFORMED_XML;
    return;
  }
  /* A default-retention rule must carry a positive Days/Years value. */
  if (obj_lock.has_rule() && !obj_lock.retention_period_valid()) {
    s->err.message = "retention period must be a positive integer value";
    ldpp_dout(this, 4) << "ERROR: " << s->err.message << dendl;
    op_ret = -ERR_INVALID_RETENTION_PERIOD;
    return;
  }

  op_ret = driver->forward_request_to_master(this, s->user.get(), nullptr, data, nullptr, s->info, y);
  if (op_ret < 0) {
    ldpp_dout(this, 20) << __func__ << "forward_request_to_master returned ret=" << op_ret << dendl;
    return;
  }

  op_ret = retry_raced_bucket_write(this, s->bucket.get(), [this] {
      s->bucket->get_info().obj_lock = obj_lock;
      op_ret = s->bucket->put_info(this, false, real_time());
      return op_ret;
    });
  return;
}
+
/* Standard pre-execution hook: record bucket/object info for op logging. */
void RGWGetBucketObjectLock::pre_exec()
{
  rgw_bucket_object_pre_exec(s);
}
+
+int RGWGetBucketObjectLock::verify_permission(optional_yield y)
+{
+ auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false);
+ if (has_s3_resource_tag)
+ rgw_iam_add_buckettags(this, s);
+
+ return verify_bucket_owner_or_policy(s, rgw::IAM::s3GetBucketObjectLockConfiguration);
+}
+
/* Nothing to fetch: the lock configuration already lives in bucket info.
 * Only verify that object lock is enabled; send_response() renders it. */
void RGWGetBucketObjectLock::execute(optional_yield y)
{
  if (!s->bucket->get_info().obj_lock_enabled()) {
    op_ret = -ERR_NO_SUCH_OBJECT_LOCK_CONFIGURATION;
    return;
  }
}
+
/* Permission check for PutObjectRetention.  Note the params are parsed
 * here (not in execute) because the bypass-governance header influences
 * which permissions must be probed. */
int RGWPutObjRetention::verify_permission(optional_yield y)
{
  auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s);
  if (has_s3_existing_tag || has_s3_resource_tag)
    rgw_iam_add_objtags(this, s, has_s3_existing_tag, has_s3_resource_tag);

  if (!verify_object_permission(this, s, rgw::IAM::s3PutObjectRetention)) {
    return -EACCES;
  }
  op_ret = get_params(y);
  if (op_ret) {
    return op_ret;
  }
  /* Record whether the caller may bypass GOVERNANCE-mode retention;
   * consumed later in execute(). */
  if (bypass_governance_mode) {
    bypass_perm = verify_object_permission(this, s, rgw::IAM::s3BypassGovernanceRetention);
  }
  return 0;
}
+
/* Standard pre-execution hook: record bucket/object info for op logging. */
void RGWPutObjRetention::pre_exec()
{
  rgw_bucket_object_pre_exec(s);
}
+
/* Apply an Object Lock retention setting to the object.  Enforces the
 * rules for tightening/loosening existing retention: shortening a period
 * or leaving GOVERNANCE mode requires the governance-bypass permission,
 * and COMPLIANCE mode can never be downgraded to GOVERNANCE. */
void RGWPutObjRetention::execute(optional_yield y)
{
  if (!s->bucket->get_info().obj_lock_enabled()) {
    s->err.message = "object retention can't be set if bucket object lock not configured";
    ldpp_dout(this, 4) << "ERROR: " << s->err.message << dendl;
    op_ret = -ERR_INVALID_REQUEST;
    return;
  }

  RGWXMLDecoder::XMLParser parser;
  if (!parser.init()) {
    ldpp_dout(this, 0) << "ERROR: failed to initialize parser" << dendl;
    op_ret = -EINVAL;
    return;
  }

  /* body was already read by get_params() during verify_permission() */
  if (!parser.parse(data.c_str(), data.length(), 1)) {
    op_ret = -ERR_MALFORMED_XML;
    return;
  }

  try {
    RGWXMLDecoder::decode_xml("Retention", obj_retention, &parser, true);
  } catch (RGWXMLDecoder::err& err) {
    ldpp_dout(this, 5) << "unexpected xml:" << err << dendl;
    op_ret = -ERR_MALFORMED_XML;
    return;
  }

  if (ceph::real_clock::to_time_t(obj_retention.get_retain_until_date()) < ceph_clock_now()) {
    s->err.message = "the retain-until date must be in the future";
    ldpp_dout(this, 0) << "ERROR: " << s->err.message << dendl;
    op_ret = -EINVAL;
    return;
  }
  bufferlist bl;
  obj_retention.encode(bl);

  //check old retention
  op_ret = s->object->get_obj_attrs(s->yield, this);
  if (op_ret < 0) {
    ldpp_dout(this, 0) << "ERROR: get obj attr error"<< dendl;
    return;
  }
  rgw::sal::Attrs attrs = s->object->get_attrs();
  auto aiter = attrs.find(RGW_ATTR_OBJECT_RETENTION);
  if (aiter != attrs.end()) {
    RGWObjectRetention old_obj_retention;
    try {
      decode(old_obj_retention, aiter->second);
    } catch (buffer::error& err) {
      ldpp_dout(this, 0) << "ERROR: failed to decode RGWObjectRetention" << dendl;
      op_ret = -EIO;
      return;
    }
    /* Shortening an existing retention window needs GOVERNANCE mode plus
     * a permitted, requested governance bypass. */
    if (ceph::real_clock::to_time_t(obj_retention.get_retain_until_date()) < ceph::real_clock::to_time_t(old_obj_retention.get_retain_until_date())) {
      if (old_obj_retention.get_mode().compare("GOVERNANCE") != 0 || !bypass_perm || !bypass_governance_mode) {
	s->err.message = "proposed retain-until date shortens an existing retention period and governance bypass check failed";
        op_ret = -EACCES;
        return;
      }
    } else if (old_obj_retention.get_mode() == obj_retention.get_mode()) {
      // ok if retention mode doesn't change
    } else if (obj_retention.get_mode() == "GOVERNANCE") {
      /* old mode must be COMPLIANCE here: downgrade is never allowed */
      s->err.message = "can't change retention mode from COMPLIANCE to GOVERNANCE";
      op_ret = -EACCES;
      return;
    } else if (!bypass_perm || !bypass_governance_mode) {
      s->err.message = "can't change retention mode from GOVERNANCE without governance bypass";
      op_ret = -EACCES;
      return;
    }
  }

  op_ret = s->object->modify_obj_attrs(RGW_ATTR_OBJECT_RETENTION, bl, s->yield, this);

  return;
}
+
+int RGWGetObjRetention::verify_permission(optional_yield y)
+{
+ auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s);
+ if (has_s3_existing_tag || has_s3_resource_tag)
+ rgw_iam_add_objtags(this, s, has_s3_existing_tag, has_s3_resource_tag);
+
+ if (!verify_object_permission(this, s, rgw::IAM::s3GetObjectRetention)) {
+ return -EACCES;
+ }
+ return 0;
+}
+
/* Standard pre-execution hook: record bucket/object info for op logging. */
void RGWGetObjRetention::pre_exec()
{
  rgw_bucket_object_pre_exec(s);
}
+
+void RGWGetObjRetention::execute(optional_yield y)
+{
+ if (!s->bucket->get_info().obj_lock_enabled()) {
+ s->err.message = "bucket object lock not configured";
+ ldpp_dout(this, 4) << "ERROR: " << s->err.message << dendl;
+ op_ret = -ERR_INVALID_REQUEST;
+ return;
+ }
+ op_ret = s->object->get_obj_attrs(s->yield, this);
+ if (op_ret < 0) {
+ ldpp_dout(this, 0) << "ERROR: failed to get obj attrs, obj=" << s->object
+ << " ret=" << op_ret << dendl;
+ return;
+ }
+ rgw::sal::Attrs attrs = s->object->get_attrs();
+ auto aiter = attrs.find(RGW_ATTR_OBJECT_RETENTION);
+ if (aiter == attrs.end()) {
+ op_ret = -ERR_NO_SUCH_OBJECT_LOCK_CONFIGURATION;
+ return;
+ }
+
+ bufferlist::const_iterator iter{&aiter->second};
+ try {
+ obj_retention.decode(iter);
+ } catch (const buffer::error& e) {
+ ldpp_dout(this, 0) << __func__ << "decode object retention config failed" << dendl;
+ op_ret = -EIO;
+ return;
+ }
+ return;
+}
+
+int RGWPutObjLegalHold::verify_permission(optional_yield y)
+{
+ auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s);
+ if (has_s3_existing_tag || has_s3_resource_tag)
+ rgw_iam_add_objtags(this, s, has_s3_existing_tag, has_s3_resource_tag);
+
+ if (!verify_object_permission(this, s, rgw::IAM::s3PutObjectLegalHold)) {
+ return -EACCES;
+ }
+ return 0;
+}
+
/* Standard pre-execution hook: record bucket/object info for op logging. */
void RGWPutObjLegalHold::pre_exec()
{
  rgw_bucket_object_pre_exec(s);
}
+
/* Parse a LegalHold XML document from the request body and store it on
 * the object as RGW_ATTR_OBJECT_LEGAL_HOLD.  Requires object lock to be
 * enabled on the bucket. */
void RGWPutObjLegalHold::execute(optional_yield y) {
  if (!s->bucket->get_info().obj_lock_enabled()) {
    s->err.message = "object legal hold can't be set if bucket object lock not enabled";
    ldpp_dout(this, 4) << "ERROR: " << s->err.message << dendl;
    op_ret = -ERR_INVALID_REQUEST;
    return;
  }

  RGWXMLDecoder::XMLParser parser;
  if (!parser.init()) {
    ldpp_dout(this, 0) << "ERROR: failed to initialize parser" << dendl;
    op_ret = -EINVAL;
    return;
  }

  op_ret = get_params(y);
  if (op_ret < 0)
    return;

  if (!parser.parse(data.c_str(), data.length(), 1)) {
    op_ret = -ERR_MALFORMED_XML;
    return;
  }

  try {
    RGWXMLDecoder::decode_xml("LegalHold", obj_legal_hold, &parser, true);
  } catch (RGWXMLDecoder::err &err) {
    ldpp_dout(this, 5) << "unexpected xml:" << err << dendl;
    op_ret = -ERR_MALFORMED_XML;
    return;
  }
  bufferlist bl;
  obj_legal_hold.encode(bl);
  //if instance is empty, we should modify the latest object
  op_ret = s->object->modify_obj_attrs(RGW_ATTR_OBJECT_LEGAL_HOLD, bl, s->yield, this);
  return;
}
+
+int RGWGetObjLegalHold::verify_permission(optional_yield y)
+{
+ auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s);
+ if (has_s3_existing_tag || has_s3_resource_tag)
+ rgw_iam_add_objtags(this, s, has_s3_existing_tag, has_s3_resource_tag);
+
+ if (!verify_object_permission(this, s, rgw::IAM::s3GetObjectLegalHold)) {
+ return -EACCES;
+ }
+ return 0;
+}
+
/* Standard pre-execution hook: record bucket/object info for op logging. */
void RGWGetObjLegalHold::pre_exec()
{
  rgw_bucket_object_pre_exec(s);
}
+
+void RGWGetObjLegalHold::execute(optional_yield y)
+{
+ if (!s->bucket->get_info().obj_lock_enabled()) {
+ s->err.message = "bucket object lock not configured";
+ ldpp_dout(this, 4) << "ERROR: " << s->err.message << dendl;
+ op_ret = -ERR_INVALID_REQUEST;
+ return;
+ }
+ map<string, bufferlist> attrs;
+ op_ret = s->object->get_obj_attrs(s->yield, this);
+ if (op_ret < 0) {
+ ldpp_dout(this, 0) << "ERROR: failed to get obj attrs, obj=" << s->object
+ << " ret=" << op_ret << dendl;
+ return;
+ }
+ auto aiter = s->object->get_attrs().find(RGW_ATTR_OBJECT_LEGAL_HOLD);
+ if (aiter == s->object->get_attrs().end()) {
+ op_ret = -ERR_NO_SUCH_OBJECT_LOCK_CONFIGURATION;
+ return;
+ }
+
+ bufferlist::const_iterator iter{&aiter->second};
+ try {
+ obj_legal_hold.decode(iter);
+ } catch (const buffer::error& e) {
+ ldpp_dout(this, 0) << __func__ << "decode object legal hold config failed" << dendl;
+ op_ret = -EIO;
+ return;
+ }
+ return;
+}
+
/* Fill `stats_op` with cluster-wide usage statistics from the driver. */
void RGWGetClusterStat::execute(optional_yield y)
{
  op_ret = driver->cluster_stat(stats_op);
}
+
+int RGWGetBucketPolicyStatus::verify_permission(optional_yield y)
+{
+ auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false);
+ if (has_s3_resource_tag)
+ rgw_iam_add_buckettags(this, s);
+
+ if (!verify_bucket_permission(this, s, rgw::IAM::s3GetBucketPolicyStatus)) {
+ return -EACCES;
+ }
+
+ return 0;
+}
+
/* A bucket is "public" if either its IAM policy or its ACL grants public
 * access. */
void RGWGetBucketPolicyStatus::execute(optional_yield y)
{
  isPublic = (s->iam_policy && rgw::IAM::is_public(*s->iam_policy)) || s->bucket_acl->is_public(this);
}
+
+int RGWPutBucketPublicAccessBlock::verify_permission(optional_yield y)
+{
+ auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false);
+ if (has_s3_resource_tag)
+ rgw_iam_add_buckettags(this, s);
+
+ if (!verify_bucket_permission(this, s, rgw::IAM::s3PutBucketPublicAccessBlock)) {
+ return -EACCES;
+ }
+
+ return 0;
+}
+
/* Read the whole request body into `data`, bounded by
 * rgw_max_put_param_size; status is left in op_ret. */
int RGWPutBucketPublicAccessBlock::get_params(optional_yield y)
{
  const auto max_size = s->cct->_conf->rgw_max_put_param_size;
  std::tie(op_ret, data) = read_all_input(s, max_size, false);
  return op_ret;
}
+
/* Parse a PublicAccessBlockConfiguration XML body, forward it to the
 * master zone, then persist it encoded in the RGW_ATTR_PUBLIC_ACCESS
 * bucket attr (with raced-write retry). */
void RGWPutBucketPublicAccessBlock::execute(optional_yield y)
{
  RGWXMLDecoder::XMLParser parser;
  if (!parser.init()) {
    ldpp_dout(this, 0) << "ERROR: failed to initialize parser" << dendl;
    op_ret = -EINVAL;
    return;
  }

  op_ret = get_params(y);
  if (op_ret < 0)
    return;

  if (!parser.parse(data.c_str(), data.length(), 1)) {
    ldpp_dout(this, 0) << "ERROR: malformed XML" << dendl;
    op_ret = -ERR_MALFORMED_XML;
    return;
  }

  try {
    RGWXMLDecoder::decode_xml("PublicAccessBlockConfiguration", access_conf, &parser, true);
  } catch (RGWXMLDecoder::err &err) {
    ldpp_dout(this, 5) << "unexpected xml:" << err << dendl;
    op_ret = -ERR_MALFORMED_XML;
    return;
  }

  op_ret = driver->forward_request_to_master(this, s->user.get(), nullptr, data, nullptr, s->info, y);
  if (op_ret < 0) {
    ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
    return;
  }

  bufferlist bl;
  access_conf.encode(bl);
  op_ret = retry_raced_bucket_write(this, s->bucket.get(), [this, &bl] {
      rgw::sal::Attrs attrs(s->bucket_attrs);
      attrs[RGW_ATTR_PUBLIC_ACCESS] = bl;
      return s->bucket->merge_and_store_attrs(this, attrs, s->yield);
    });

}
+
+int RGWGetBucketPublicAccessBlock::verify_permission(optional_yield y)
+{
+ auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false);
+ if (has_s3_resource_tag)
+ rgw_iam_add_buckettags(this, s);
+
+ if (!verify_bucket_permission(this, s, rgw::IAM::s3GetBucketPolicy)) {
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+void RGWGetBucketPublicAccessBlock::execute(optional_yield y)
+{
+ auto attrs = s->bucket_attrs;
+ if (auto aiter = attrs.find(RGW_ATTR_PUBLIC_ACCESS);
+ aiter == attrs.end()) {
+ ldpp_dout(this, 0) << "can't find bucket IAM POLICY attr bucket_name = "
+ << s->bucket_name << dendl;
+ // return the default;
+ return;
+ } else {
+ bufferlist::const_iterator iter{&aiter->second};
+ try {
+ access_conf.decode(iter);
+ } catch (const buffer::error& e) {
+ ldpp_dout(this, 0) << __func__ << "decode access_conf failed" << dendl;
+ op_ret = -EIO;
+ return;
+ }
+ }
+}
+
+
/* Headers-only response; status derived from op_ret. */
void RGWDeleteBucketPublicAccessBlock::send_response()
{
  if (op_ret) {
    set_req_state_err(s, op_ret);
  }
  dump_errno(s);
  end_header(s);
}
+
+int RGWDeleteBucketPublicAccessBlock::verify_permission(optional_yield y)
+{
+ auto [has_s3_existing_tag, has_s3_resource_tag] = rgw_check_policy_condition(this, s, false);
+ if (has_s3_resource_tag)
+ rgw_iam_add_buckettags(this, s);
+
+ if (!verify_bucket_permission(this, s, rgw::IAM::s3PutBucketPublicAccessBlock)) {
+ return -EACCES;
+ }
+
+ return 0;
+}
+
/* Drop the RGW_ATTR_PUBLIC_ACCESS attr from the bucket, after forwarding
 * the delete to the master zone for multisite consistency. */
void RGWDeleteBucketPublicAccessBlock::execute(optional_yield y)
{
  bufferlist data;
  op_ret = driver->forward_request_to_master(this, s->user.get(), nullptr, data, nullptr, s->info, y);
  if (op_ret < 0) {
    ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
    return;
  }

  op_ret = retry_raced_bucket_write(this, s->bucket.get(), [this] {
      rgw::sal::Attrs attrs(s->bucket_attrs);
      attrs.erase(RGW_ATTR_PUBLIC_ACCESS);
      op_ret = s->bucket->merge_and_store_attrs(this, attrs, s->yield);
      return op_ret;
    });
}
+
+int RGWPutBucketEncryption::get_params(optional_yield y)
+{
+ const auto max_size = s->cct->_conf->rgw_max_put_param_size;
+ std::tie(op_ret, data) = read_all_input(s, max_size, false);
+ return op_ret;
+}
+
+int RGWPutBucketEncryption::verify_permission(optional_yield y)
+{
+ if (!verify_bucket_permission(this, s, rgw::IAM::s3PutBucketEncryption)) {
+ return -EACCES;
+ }
+ return 0;
+}
+
/* Parse a ServerSideEncryptionConfiguration XML body, forward it to the
 * master zone, then persist it encoded in the bucket's encryption-policy
 * attr (with raced-write retry). */
void RGWPutBucketEncryption::execute(optional_yield y)
{
  RGWXMLDecoder::XMLParser parser;
  if (!parser.init()) {
    ldpp_dout(this, 0) << "ERROR: failed to initialize parser" << dendl;
    op_ret = -EINVAL;
    return;
  }
  op_ret = get_params(y);
  if (op_ret < 0) {
    return;
  }
  if (!parser.parse(data.c_str(), data.length(), 1)) {
    ldpp_dout(this, 0) << "ERROR: malformed XML" << dendl;
    op_ret = -ERR_MALFORMED_XML;
    return;
  }

  try {
    RGWXMLDecoder::decode_xml("ServerSideEncryptionConfiguration", bucket_encryption_conf, &parser, true);
  } catch (RGWXMLDecoder::err& err) {
    ldpp_dout(this, 5) << "unexpected xml:" << err << dendl;
    op_ret = -ERR_MALFORMED_XML;
    return;
  }

  op_ret = driver->forward_request_to_master(this, s->user.get(), nullptr, data, nullptr, s->info, y);
  if (op_ret < 0) {
    ldpp_dout(this, 20) << "forward_request_to_master returned ret=" << op_ret << dendl;
    return;
  }

  bufferlist conf_bl;
  bucket_encryption_conf.encode(conf_bl);
  op_ret = retry_raced_bucket_write(this, s->bucket.get(), [this, y, &conf_bl] {
      rgw::sal::Attrs attrs = s->bucket->get_attrs();
      attrs[RGW_ATTR_BUCKET_ENCRYPTION_POLICY] = conf_bl;
      return s->bucket->merge_and_store_attrs(this, attrs, y);
    });
}
+
+int RGWGetBucketEncryption::verify_permission(optional_yield y)
+{
+ if (!verify_bucket_permission(this, s, rgw::IAM::s3GetBucketEncryption)) {
+ return -EACCES;
+ }
+ return 0;
+}
+
/* Load and decode the bucket's server-side-encryption configuration from
 * its attrs; -ENOENT when none has been set. */
void RGWGetBucketEncryption::execute(optional_yield y)
{
  const auto& attrs = s->bucket_attrs;
  if (auto aiter = attrs.find(RGW_ATTR_BUCKET_ENCRYPTION_POLICY);
      aiter == attrs.end()) {
    ldpp_dout(this, 0) << "can't find BUCKET ENCRYPTION attr for bucket_name = " << s->bucket_name << dendl;
    op_ret = -ENOENT;
    s->err.message = "The server side encryption configuration was not found";
    return;
  } else {
    bufferlist::const_iterator iter{&aiter->second};
    try {
      bucket_encryption_conf.decode(iter);
    } catch (const buffer::error& e) {
      // NOTE(review): no separator after __func__ here, so the log reads
      // "executedecode ..." — cosmetic only.
      ldpp_dout(this, 0) << __func__ << "decode bucket_encryption_conf failed" << dendl;
      op_ret = -EIO;
      return;
    }
  }
}
+
+int RGWDeleteBucketEncryption::verify_permission(optional_yield y)
+{
+ if (!verify_bucket_permission(this, s, rgw::IAM::s3PutBucketEncryption)) {
+ return -EACCES;
+ }
+ return 0;
+}
+
/* Remove the bucket's encryption policy and key-id attrs, after forwarding
 * the delete to the master zone for multisite consistency. */
void RGWDeleteBucketEncryption::execute(optional_yield y)
{
  bufferlist data;
  op_ret = driver->forward_request_to_master(this, s->user.get(), nullptr, data, nullptr, s->info, y);
  if (op_ret < 0) {
    ldpp_dout(this, 0) << "forward_request_to_master returned ret=" << op_ret << dendl;
    return;
  }

  op_ret = retry_raced_bucket_write(this, s->bucket.get(), [this, y] {
      rgw::sal::Attrs attrs = s->bucket->get_attrs();
      attrs.erase(RGW_ATTR_BUCKET_ENCRYPTION_POLICY);
      attrs.erase(RGW_ATTR_BUCKET_ENCRYPTION_KEY_ID);
      op_ret = s->bucket->merge_and_store_attrs(this, attrs, y);
      return op_ret;
    });
}
+
+void rgw_slo_entry::decode_json(JSONObj *obj)
+{
+ JSONDecoder::decode_json("path", path, obj);
+ JSONDecoder::decode_json("etag", etag, obj);
+ JSONDecoder::decode_json("size_bytes", size_bytes, obj);
+};
+
diff --git a/src/rgw/rgw_op.h b/src/rgw/rgw_op.h
new file mode 100644
index 000000000..f398b5b15
--- /dev/null
+++ b/src/rgw/rgw_op.h
@@ -0,0 +1,2672 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/**
+ * All operations via the rados gateway are carried out by
+ * small classes known as RGWOps. This class contains a req_state
+ * and each possible command is a subclass of this with a defined
+ * execute() method that does whatever the subclass name implies.
+ * These subclasses must be further subclassed (by interface type)
+ * to provide additional virtual methods such as send_response or get_params.
+ */
+
+#pragma once
+
+#include <limits.h>
+
+#include <array>
+#include <memory>
+#include <string>
+#include <set>
+#include <map>
+#include <vector>
+
+#include <boost/optional.hpp>
+#include <boost/utility/in_place_factory.hpp>
+#include <boost/function.hpp>
+#include <boost/container/flat_map.hpp>
+#include <boost/asio/deadline_timer.hpp>
+
+#include "common/armor.h"
+#include "common/mime.h"
+#include "common/utf8.h"
+#include "common/ceph_json.h"
+#include "common/ceph_time.h"
+
+#include "rgw_common.h"
+#include "rgw_dmclock.h"
+#include "rgw_sal.h"
+#include "rgw_user.h"
+#include "rgw_bucket.h"
+#include "rgw_acl.h"
+#include "rgw_cors.h"
+#include "rgw_quota.h"
+#include "rgw_putobj.h"
+#include "rgw_sal.h"
+#include "rgw_compression_types.h"
+#include "rgw_log.h"
+
+#include "rgw_lc.h"
+#include "rgw_torrent.h"
+#include "rgw_tag.h"
+#include "rgw_object_lock.h"
+#include "cls/rgw/cls_rgw_client.h"
+#include "rgw_public_access.h"
+#include "rgw_bucket_encryption.h"
+#include "rgw_tracer.h"
+
+#include "services/svc_sys_obj.h"
+#include "services/svc_tier_rados.h"
+
+#include "include/ceph_assert.h"
+
+using ceph::crypto::SHA1;
+
+struct req_state;
+class RGWOp;
+class RGWRados;
+class RGWMultiCompleteUpload;
+
+
+namespace rgw {
+namespace auth {
+namespace registry {
+
+class StrategyRegistry;
+
+}
+}
+}
+
+int rgw_op_get_bucket_policy_from_attr(const DoutPrefixProvider *dpp,
+ CephContext *cct,
+ rgw::sal::Driver* driver,
+ RGWBucketInfo& bucket_info,
+ std::map<std::string, bufferlist>& bucket_attrs,
+ RGWAccessControlPolicy *policy,
+ optional_yield y);
+
// Per-request protocol handler (one concrete subclass per frontend
// dialect, e.g. S3 or Swift). It owns dialect-specific initialization,
// authentication and permission loading for the RGWOp it dispatches.
class RGWHandler {
protected:
  rgw::sal::Driver* driver{nullptr};   // non-owning backend driver
  req_state *s{nullptr};               // non-owning request state

  int do_init_permissions(const DoutPrefixProvider *dpp, optional_yield y);
  int do_read_permissions(RGWOp* op, bool only_bucket, optional_yield y);

public:
  RGWHandler() {}
  virtual ~RGWHandler();

  // Bind the handler to a request; subclasses extend this.
  virtual int init(rgw::sal::Driver* driver,
                   req_state* _s,
                   rgw::io::BasicClient* cio);

  virtual int init_permissions(RGWOp*, optional_yield y) {
    return 0;
  }

  // Give the dialect a chance to swap the op for another one
  // (e.g. website redirects); the default keeps the original op.
  virtual int retarget(RGWOp* op, RGWOp** new_op, optional_yield) {
    *new_op = op;
    return 0;
  }

  virtual int read_permissions(RGWOp* op, optional_yield y) = 0;
  virtual int authorize(const DoutPrefixProvider* dpp, optional_yield y) = 0;
  virtual int postauth_init(optional_yield y) = 0;
  virtual int error_handler(int err_no, std::string* error_content, optional_yield y);
  virtual void dump(const std::string& code, const std::string& message) const {}

  // Whether ops dispatched through this handler should enforce quota
  // (see RGWOp::init_processing).
  virtual bool supports_quota() {
    return true;
  }
};
+
+
+
+void rgw_bucket_object_pre_exec(req_state *s);
+
+namespace dmc = rgw::dmclock;
+
+std::tuple<int, bufferlist > rgw_rest_read_all_input(req_state *s,
+ const uint64_t max_len,
+ const bool allow_chunked=true);
+
+template <class T>
+int rgw_rest_get_json_input(CephContext *cct, req_state *s, T& out,
+ uint64_t max_len, bool *empty)
+{
+ if (empty)
+ *empty = false;
+
+ int rv = 0;
+ bufferlist data;
+ std::tie(rv, data) = rgw_rest_read_all_input(s, max_len);
+ if (rv < 0) {
+ return rv;
+ }
+
+ if (!data.length()) {
+ if (empty) {
+ *empty = true;
+ }
+
+ return -EINVAL;
+ }
+
+ JSONParser parser;
+
+ if (!parser.parse(data.c_str(), data.length())) {
+ return -EINVAL;
+ }
+
+ try {
+ decode_json_obj(out, &parser);
+ } catch (JSONDecoder::err& e) {
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/**
+ * Provide the base class for all ops.
+ */
/**
 * Provide the base class for all ops.
 *
 * Lifecycle per request: init() -> init_processing() -> verify_requester()
 * -> verify_permission() -> pre_exec() -> execute() -> complete().
 * The result of execute() is stored in op_ret rather than returned.
 */
class RGWOp : public DoutPrefixProvider {
protected:
  req_state *s;                    // non-owning request state
  RGWHandler *dialect_handler;     // non-owning dialect (S3/Swift) handler
  rgw::sal::Driver* driver;        // non-owning backend driver
  RGWCORSConfiguration bucket_cors;
  bool cors_exist;
  RGWQuota quota;
  int op_ret;                      // result of execute(); 0 on success
  int do_aws4_auth_completion();
  bool init_called = false;        // guards against double init()

  virtual int init_quota();

  // Read the whole request body; on success also finish AWS v4
  // chunked-payload authentication (which needs the full body).
  std::tuple<int, bufferlist> read_all_input(req_state *s,
                                             const uint64_t max_len,
                                             const bool allow_chunked=true) {

    int rv = 0;
    bufferlist data;
    std::tie(rv, data) = rgw_rest_read_all_input(s, max_len);
    if (rv >= 0) {
      do_aws4_auth_completion();
    }

    return std::make_tuple(rv, std::move(data));
  }

  // JSON-decode the request body into `out`, completing AWS v4 auth on
  // success (same contract as rgw_rest_get_json_input).
  template <class T>
  int get_json_input(CephContext *cct, req_state *s, T& out,
                     uint64_t max_len, bool *empty) {
    int r = rgw_rest_get_json_input(cct, s, out, max_len, empty);
    if (r >= 0) {
      do_aws4_auth_completion();
    }
    return r;
  }

public:
  RGWOp()
    : s(nullptr),
      dialect_handler(nullptr),
      driver(nullptr),
      cors_exist(false),
      op_ret(0) {
  }

  virtual ~RGWOp() override;

  int get_ret() const { return op_ret; }

  // Pre-execute setup; by default loads quota when the dialect uses it.
  virtual int init_processing(optional_yield y) {
    if (dialect_handler->supports_quota()) {
      op_ret = init_quota();
      if (op_ret < 0)
        return op_ret;
    }

    return 0;
  }

  // Bind the op to its request; idempotent (subsequent calls are no-ops).
  virtual void init(rgw::sal::Driver* driver, req_state *s, RGWHandler *dialect_handler) {
    if (init_called) return;
    this->driver = driver;
    init_called = true;
    this->s = s;
    this->dialect_handler = dialect_handler;
  }
  int read_bucket_cors();
  bool generate_cors_headers(std::string& origin, std::string& method, std::string& headers, std::string& exp_headers, unsigned *max_age);

  virtual int verify_params() { return 0; }
  virtual bool prefetch_data() { return false; }

  /* Authenticate requester -- verify its identity.
   *
   * NOTE: typically the procedure is common across all operations of the same
   * dialect (S3, Swift API). However, there are significant exceptions in
   * both APIs: browser uploads, /info and OPTIONS handlers. All of them use
   * different, specific authentication schema driving the need for per-op
   * authentication. The alternative is to duplicate parts of the method-
   * dispatch logic in RGWHandler::authorize() and pollute it with a lot
   * of special cases. */
  virtual int verify_requester(const rgw::auth::StrategyRegistry& auth_registry, optional_yield y) {
    /* TODO(rzarzynski): rename RGWHandler::authorize to generic_authenticate. */
    return dialect_handler->authorize(this, y);
  }
  virtual int verify_permission(optional_yield y) = 0;
  virtual int verify_op_mask();
  virtual void pre_exec() {}
  virtual void execute(optional_yield y) = 0;
  virtual void send_response() {}
  virtual void complete() {
    send_response();
  }
  virtual const char* name() const = 0;
  virtual RGWOpType get_type() { return RGW_OP_UNKNOWN; }

  // RGW_OP_TYPE_* bitmask checked against the user's allowed op mask.
  virtual uint32_t op_mask() { return 0; }

  virtual int error_handler(int err_no, std::string *error_content, optional_yield y);

  // implements DoutPrefixProvider
  std::ostream& gen_prefix(std::ostream& out) const override;
  CephContext* get_cct() const override { return s->cct; }
  unsigned get_subsys() const override { return ceph_subsys_rgw; }

  // dmclock QoS classification: most ops are metadata; data-path ops override.
  virtual dmc::client_id dmclock_client() { return dmc::client_id::metadata; }
  virtual dmc::Cost dmclock_cost() { return 1; }
  virtual void write_ops_log_entry(rgw_log_entry& entry) const {};
};
+
// Convenience base for ops that only need the stock (status-only)
// response; supplies a shared send_response() implementation.
class RGWDefaultResponseOp : public RGWOp {
public:
  void send_response() override;
};
+
+class RGWGetObj_Filter : public RGWGetDataCB
+{
+protected:
+ RGWGetObj_Filter *next{nullptr};
+public:
+ RGWGetObj_Filter() {}
+ explicit RGWGetObj_Filter(RGWGetObj_Filter *next): next(next) {}
+ ~RGWGetObj_Filter() override {}
+ /**
+ * Passes data through filter.
+ * Filter can modify content of bl.
+ * When bl_len == 0 , it means 'flush
+ */
+ int handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) override {
+ if (next) {
+ return next->handle_data(bl, bl_ofs, bl_len);
+ }
+ return 0;
+ }
+ /**
+ * Flushes any cached data. Used by RGWGetObjFilter.
+ * Return logic same as handle_data.
+ */
+ virtual int flush() {
+ if (next) {
+ return next->flush();
+ }
+ return 0;
+ }
+ /**
+ * Allows filter to extend range required for successful filtering
+ */
+ virtual int fixup_range(off_t& ofs, off_t& end) {
+ if (next) {
+ return next->fixup_range(ofs, end);
+ }
+ return 0;
+ }
+};
+
+class RGWGetObj : public RGWOp {
+protected:
+ seed torrent; // get torrent
+ const char *range_str;
+ const char *if_mod;
+ const char *if_unmod;
+ const char *if_match;
+ const char *if_nomatch;
+ uint32_t mod_zone_id;
+ uint64_t mod_pg_ver;
+ off_t ofs;
+ uint64_t total_len;
+ off_t start;
+ off_t end;
+ ceph::real_time mod_time;
+ ceph::real_time lastmod;
+ ceph::real_time unmod_time;
+ ceph::real_time *mod_ptr;
+ ceph::real_time *unmod_ptr;
+ rgw::sal::Attrs attrs;
+ bool get_data;
+ bool partial_content;
+ bool ignore_invalid_range;
+ bool range_parsed;
+ bool skip_manifest;
+ bool skip_decrypt{false};
+ bool sync_cloudtiered{false};
+ utime_t gc_invalidate_time;
+ bool is_slo;
+ std::string lo_etag;
+ bool rgwx_stat; /* extended rgw stat operation */
+ std::string version_id;
+ rgw_zone_set_entry dst_zone_trace;
+
+ // compression attrs
+ RGWCompressionInfo cs_info;
+ off_t first_block, last_block;
+ off_t q_ofs, q_len;
+ bool first_data;
+ uint64_t cur_ofs;
+ bufferlist waiting;
+ uint64_t action = 0;
+
+ bool get_retention;
+ bool get_legal_hold;
+
+ int init_common();
+public:
+ RGWGetObj() {
+ range_str = NULL;
+ if_mod = NULL;
+ if_unmod = NULL;
+ if_match = NULL;
+ if_nomatch = NULL;
+ mod_zone_id = 0;
+ mod_pg_ver = 0;
+ start = 0;
+ ofs = 0;
+ total_len = 0;
+ end = -1;
+ mod_ptr = NULL;
+ unmod_ptr = NULL;
+ get_data = false;
+ partial_content = false;
+ range_parsed = false;
+ skip_manifest = false;
+ is_slo = false;
+ first_block = 0;
+ last_block = 0;
+ q_ofs = 0;
+ q_len = 0;
+ first_data = true;
+ cur_ofs = 0;
+ get_retention = false;
+ get_legal_hold = false;
+ }
+
+ bool prefetch_data() override;
+
+ void set_get_data(bool get_data) {
+ this->get_data = get_data;
+ }
+
+ int verify_permission(optional_yield y) override;
+ void pre_exec() override;
+ void execute(optional_yield y) override;
+ int parse_range();
+ int read_user_manifest_part(
+ rgw::sal::Bucket* bucket,
+ const rgw_bucket_dir_entry& ent,
+ RGWAccessControlPolicy * const bucket_acl,
+ const boost::optional<rgw::IAM::Policy>& bucket_policy,
+ const off_t start_ofs,
+ const off_t end_ofs,
+ bool swift_slo);
+ int handle_user_manifest(const char *prefix, optional_yield y);
+ int handle_slo_manifest(bufferlist& bl, optional_yield y);
+
+ int get_data_cb(bufferlist& bl, off_t ofs, off_t len);
+
+ virtual int get_params(optional_yield y) = 0;
+ virtual int send_response_data_error(optional_yield y) = 0;
+ virtual int send_response_data(bufferlist& bl, off_t ofs, off_t len) = 0;
+
+ const char* name() const override { return "get_obj"; }
+ RGWOpType get_type() override { return RGW_OP_GET_OBJ; }
+ uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
+ virtual bool need_object_expiration() { return false; }
+ /**
+ * calculates filter used to decrypt RGW objects data
+ */
+ virtual int get_decrypt_filter(std::unique_ptr<RGWGetObj_Filter>* filter, RGWGetObj_Filter* cb, bufferlist* manifest_bl) {
+ *filter = nullptr;
+ return 0;
+ }
+
+ // get lua script to run as a "get object" filter
+ int get_lua_filter(std::unique_ptr<RGWGetObj_Filter>* filter,
+ RGWGetObj_Filter* cb);
+
+ dmc::client_id dmclock_client() override { return dmc::client_id::data; }
+};
+
// Terminal GET-path callback: hands each chunk back to the owning
// RGWGetObj so the frontend can stream it to the client.
class RGWGetObj_CB : public RGWGetObj_Filter
{
  RGWGetObj *op;   // non-owning back-pointer to the owning op
public:
  explicit RGWGetObj_CB(RGWGetObj *_op) : op(_op) {}
  ~RGWGetObj_CB() override {}

  int handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) override {
    return op->get_data_cb(bl, bl_ofs, bl_len);
  }
};
+
// GET object tagging: reads the object's tag set into tags_bl.
class RGWGetObjTags : public RGWOp {
 protected:
  bufferlist tags_bl;     // encoded tag set, filled by execute()
  bool has_tags{false};   // whether the object carried a tag attr
 public:
  int verify_permission(optional_yield y) override;
  void execute(optional_yield y) override;
  void pre_exec() override;

  virtual void send_response_data(bufferlist& bl) = 0;
  const char* name() const override { return "get_obj_tags"; }
  virtual uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
  RGWOpType get_type() override { return RGW_OP_GET_OBJ_TAGGING; }

};
+
// PUT object tagging: replaces the object's tag set with tags_bl
// (parsed from the request body by the dialect's get_params()).
class RGWPutObjTags : public RGWOp {
 protected:
  bufferlist tags_bl;   // encoded tag set to store
 public:
  int verify_permission(optional_yield y) override;
  void execute(optional_yield y) override;

  virtual void send_response() override = 0;
  virtual int get_params(optional_yield y) = 0;
  const char* name() const override { return "put_obj_tags"; }
  virtual uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; }
  RGWOpType get_type() override { return RGW_OP_PUT_OBJ_TAGGING; }

};
+
// DELETE object tagging: removes the object's tag set.
class RGWDeleteObjTags: public RGWOp {
 public:
  void pre_exec() override;
  int verify_permission(optional_yield y) override;
  void execute(optional_yield y) override;

  const char* name() const override { return "delete_obj_tags"; }
  virtual uint32_t op_mask() override { return RGW_OP_TYPE_DELETE; }
  RGWOpType get_type() override { return RGW_OP_DELETE_OBJ_TAGGING;}
};
+
// GET bucket tagging: reads the bucket's tag set into tags_bl.
class RGWGetBucketTags : public RGWOp {
protected:
  bufferlist tags_bl;     // encoded tag set, filled by execute()
  bool has_tags{false};   // whether the bucket carried a tag attr
public:
  int verify_permission(optional_yield y) override;
  void execute(optional_yield y) override;
  void pre_exec() override;

  virtual void send_response_data(bufferlist& bl) = 0;
  const char* name() const override { return "get_bucket_tags"; }
  virtual uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
  RGWOpType get_type() override { return RGW_OP_GET_BUCKET_TAGGING; }
};
+
// PUT bucket tagging: stores tags_bl on the bucket; in_data keeps the
// raw request body (needed when forwarding to the master zone).
class RGWPutBucketTags : public RGWOp {
protected:
  bufferlist tags_bl;   // encoded tag set to store
  bufferlist in_data;   // raw request body for multisite forwarding
public:
  int verify_permission(optional_yield y) override;
  void execute(optional_yield y) override;

  virtual void send_response() override = 0;
  virtual int get_params(const DoutPrefixProvider *dpp, optional_yield y) = 0;
  const char* name() const override { return "put_bucket_tags"; }
  virtual uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; }
  RGWOpType get_type() override { return RGW_OP_PUT_BUCKET_TAGGING; }
};
+
// DELETE bucket tagging: removes the bucket's tag set.
class RGWDeleteBucketTags : public RGWOp {
public:
  void pre_exec() override;
  int verify_permission(optional_yield y) override;
  void execute(optional_yield y) override;

  const char* name() const override { return "delete_bucket_tags"; }
  virtual uint32_t op_mask() override { return RGW_OP_TYPE_DELETE; }
  RGWOpType get_type() override { return RGW_OP_DELETE_BUCKET_TAGGING;}
};
+
+struct rgw_sync_policy_group;
+
// GET bucket replication configuration (rendered from the bucket's
// sync policy by the dialect's send_response_data()).
class RGWGetBucketReplication : public RGWOp {
public:
  int verify_permission(optional_yield y) override;
  void execute(optional_yield y) override;
  void pre_exec() override;

  virtual void send_response_data() = 0;
  const char* name() const override { return "get_bucket_replication"; }
  virtual uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
  RGWOpType get_type() override { return RGW_OP_GET_BUCKET_REPLICATION; }
};
+
// PUT bucket replication: get_params() parses the body into
// sync_policy_groups; in_data keeps the raw body for master forwarding.
class RGWPutBucketReplication : public RGWOp {
protected:
  bufferlist in_data;   // raw request body for multisite forwarding
  std::vector<rgw_sync_policy_group> sync_policy_groups;
public:
  int verify_permission(optional_yield y) override;
  void execute(optional_yield y) override;

  virtual void send_response() override = 0;
  virtual int get_params(optional_yield y) = 0;
  const char* name() const override { return "put_bucket_replication"; }
  virtual uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; }
  RGWOpType get_type() override { return RGW_OP_PUT_BUCKET_REPLICATION; }
};
+
// DELETE bucket replication: subclasses strip the replication rules
// from the bucket's sync policy via update_sync_policy().
class RGWDeleteBucketReplication : public RGWOp {
protected:
  virtual void update_sync_policy(rgw_sync_policy_info *policy) = 0;
public:
  void pre_exec() override;
  int verify_permission(optional_yield y) override;
  void execute(optional_yield y) override;

  const char* name() const override { return "delete_bucket_replication"; }
  virtual uint32_t op_mask() override { return RGW_OP_TYPE_DELETE; }
  RGWOpType get_type() override { return RGW_OP_DELETE_BUCKET_REPLICATION;}
};
+
+class RGWBulkDelete : public RGWOp {
+public:
+ struct acct_path_t {
+ std::string bucket_name;
+ rgw_obj_key obj_key;
+ };
+
+ struct fail_desc_t {
+ int err;
+ acct_path_t path;
+ };
+
+ class Deleter {
+ protected:
+ const DoutPrefixProvider * dpp;
+ unsigned int num_deleted;
+ unsigned int num_unfound;
+ std::list<fail_desc_t> failures;
+
+ rgw::sal::Driver* const driver;
+ req_state * const s;
+
+ public:
+ Deleter(const DoutPrefixProvider* dpp, rgw::sal::Driver* const str, req_state * const s)
+ : dpp(dpp),
+ num_deleted(0),
+ num_unfound(0),
+ driver(str),
+ s(s) {
+ }
+
+ unsigned int get_num_deleted() const {
+ return num_deleted;
+ }
+
+ unsigned int get_num_unfound() const {
+ return num_unfound;
+ }
+
+ const std::list<fail_desc_t> get_failures() const {
+ return failures;
+ }
+
+ bool verify_permission(RGWBucketInfo& binfo,
+ std::map<std::string, bufferlist>& battrs,
+ ACLOwner& bucket_owner /* out */,
+ optional_yield y);
+ bool delete_single(const acct_path_t& path, optional_yield y);
+ bool delete_chunk(const std::list<acct_path_t>& paths, optional_yield y);
+ };
+ /* End of Deleter subclass */
+
+ static const size_t MAX_CHUNK_ENTRIES = 1024;
+
+protected:
+ std::unique_ptr<Deleter> deleter;
+
+public:
+ RGWBulkDelete()
+ : deleter(nullptr) {
+ }
+
+ int verify_permission(optional_yield y) override;
+ void pre_exec() override;
+ void execute(optional_yield y) override;
+
+ virtual int get_data(std::list<acct_path_t>& items,
+ bool * is_truncated) = 0;
+ void send_response() override = 0;
+
+ const char* name() const override { return "bulk_delete"; }
+ RGWOpType get_type() override { return RGW_OP_BULK_DELETE; }
+ uint32_t op_mask() override { return RGW_OP_TYPE_DELETE; }
+ dmc::client_id dmclock_client() override { return dmc::client_id::data; }
+};
+
+inline std::ostream& operator<<(std::ostream& out, const RGWBulkDelete::acct_path_t &o) {
+ return out << o.bucket_name << "/" << o.obj_key;
+}
+
+
// Swift bulk upload (extract-archive): creates buckets ("dirs") and
// objects ("files") from a streamed archive, recording per-path
// failures; terminal_errors abort the whole upload early.
class RGWBulkUploadOp : public RGWOp {
protected:
  // A per-path failure record (errno + path inside the archive).
  class fail_desc_t {
  public:
    fail_desc_t(const int err, std::string path)
      : err(err),
        path(std::move(path)) {
    }

    const int err;
    const std::string path;
  };

  // Errors that stop processing instead of being recorded and skipped.
  static constexpr std::array<int, 2> terminal_errors = {
    { -EACCES, -EPERM }
  };

  /* FIXME: boost::container::small_vector<fail_desc_t, 4> failures; */
  std::vector<fail_desc_t> failures;
  size_t num_created;

  class StreamGetter;
  class DecoratedStreamGetter;
  class AlignedStreamGetter;

  // Dialect supplies the archive byte stream.
  virtual std::unique_ptr<StreamGetter> create_stream() = 0;
  virtual void send_response() override = 0;

  // Split an archive path into (bucket, object key); boost::none when
  // the path has no object component.
  boost::optional<std::pair<std::string, rgw_obj_key>>
  parse_path(const std::string_view& path);

  std::pair<std::string, std::string>
  handle_upload_path(req_state *s);

  bool handle_file_verify_permission(RGWBucketInfo& binfo,
                                     const rgw_obj& obj,
                                     std::map<std::string, ceph::bufferlist>& battrs,
                                     ACLOwner& bucket_owner /* out */,
                                     optional_yield y);
  int handle_file(std::string_view path,
                  size_t size,
                  AlignedStreamGetter& body,
                  optional_yield y);

  int handle_dir_verify_permission(optional_yield y);
  int handle_dir(std::string_view path, optional_yield y);

public:
  RGWBulkUploadOp()
    : num_created(0) {
  }

  void init(rgw::sal::Driver* const driver,
            req_state* const s,
            RGWHandler* const h) override;

  int verify_permission(optional_yield y) override;
  void pre_exec() override;
  void execute(optional_yield y) override;

  const char* name() const override { return "bulk_upload"; }

  RGWOpType get_type() override {
    return RGW_OP_BULK_UPLOAD;
  }

  uint32_t op_mask() override {
    return RGW_OP_TYPE_WRITE;
  }
  dmc::client_id dmclock_client() override { return dmc::client_id::data; }
}; /* RGWBulkUploadOp */
+
+
// Abstract byte source for bulk upload: pull-style access to the
// request body stream.
class RGWBulkUploadOp::StreamGetter {
public:
  StreamGetter() = default;
  virtual ~StreamGetter() = default;

  // Read up to `want` bytes; may return fewer.
  virtual ssize_t get_at_most(size_t want, ceph::bufferlist& dst) = 0;
  // Read exactly `want` bytes (or fail/short-read at end of stream).
  virtual ssize_t get_exactly(size_t want, ceph::bufferlist& dst) = 0;
}; /* End of nested subclass StreamGetter */
+
+
// Decorator base: forwards every read to a wrapped StreamGetter so
// subclasses can intercept/augment the stream (see AlignedStreamGetter).
class RGWBulkUploadOp::DecoratedStreamGetter : public StreamGetter {
  StreamGetter& decoratee;   // non-owning wrapped stream

protected:
  StreamGetter& get_decoratee() {
    return decoratee;
  }

public:
  explicit DecoratedStreamGetter(StreamGetter& decoratee)
    : decoratee(decoratee) {
  }
  virtual ~DecoratedStreamGetter() = default;

  ssize_t get_at_most(const size_t want, ceph::bufferlist& dst) override {
    return get_decoratee().get_at_most(want, dst);
  }

  ssize_t get_exactly(const size_t want, ceph::bufferlist& dst) override {
    return get_decoratee().get_exactly(want, dst);
  }
}; /* RGWBulkUploadOp::DecoratedStreamGetter */
+
+
// Stream decorator that tracks position/length within an archive entry;
// the destructor is expected to consume padding up to `alignment`
// (implementation lives in the .cc file).
class RGWBulkUploadOp::AlignedStreamGetter
  : public RGWBulkUploadOp::DecoratedStreamGetter {
  size_t position;    // bytes consumed so far
  size_t length;      // logical entry length
  size_t alignment;   // archive block alignment to pad to

public:
  template <typename U>
  AlignedStreamGetter(const size_t position,
                      const size_t length,
                      const size_t alignment,
                      U&& decoratee)
    : DecoratedStreamGetter(std::forward<U>(decoratee)),
      position(position),
      length(length),
      alignment(alignment) {
  }
  virtual ~AlignedStreamGetter();
  ssize_t get_at_most(size_t want, ceph::bufferlist& dst) override;
  ssize_t get_exactly(size_t want, ceph::bufferlist& dst) override;
}; /* RGWBulkUploadOp::AlignedStreamGetter */
+
+
// Aggregated account/placement usage counters (bytes both raw and
// rounded to allocation units, plus bucket/object counts).
struct RGWUsageStats {
  uint64_t bytes_used = 0;
  uint64_t bytes_used_rounded = 0;
  uint64_t buckets_count = 0;
  uint64_t objects_count = 0;
};
+
+#define RGW_LIST_BUCKETS_LIMIT_MAX 10000
+
// List the requester's buckets, paginating in chunks and optionally
// aggregating usage stats per storage policy.
class RGWListBuckets : public RGWOp {
protected:
  bool sent_data;           // whether any listing chunk was already sent
  std::string marker;       // resume-after marker
  std::string end_marker;
  int64_t limit;            // per-chunk limit requested by the client
  uint64_t limit_max;       // hard cap (RGW_LIST_BUCKETS_LIMIT_MAX)
  bool is_truncated;

  RGWUsageStats global_stats;
  std::map<std::string, RGWUsageStats> policies_stats;

  // Dialect default page size; note the ctor starts `limit` at the hard
  // cap, and get_params() is expected to apply this default.
  virtual uint64_t get_default_max() const {
    return 1000;
  }

public:
  RGWListBuckets()
    : sent_data(false),
      limit(RGW_LIST_BUCKETS_LIMIT_MAX),
      limit_max(RGW_LIST_BUCKETS_LIMIT_MAX),
      is_truncated(false) {
  }

  int verify_permission(optional_yield y) override;
  void execute(optional_yield y) override;

  virtual int get_params(optional_yield y) = 0;
  virtual void handle_listing_chunk(rgw::sal::BucketList&& buckets) {
    /* The default implementation, used by e.g. S3, just generates a new
     * part of listing and sends it client immediately. Swift can behave
     * differently: when the reverse option is requested, all incoming
     * instances of RGWBucketList are buffered and finally reversed. */
    return send_response_data(buckets);
  }
  virtual void send_response_begin(bool has_buckets) = 0;
  virtual void send_response_data(rgw::sal::BucketList& buckets) = 0;
  virtual void send_response_end() = 0;
  void send_response() override {}

  virtual bool should_get_stats() { return false; }
  virtual bool supports_account_metadata() { return false; }

  const char* name() const override { return "list_buckets"; }
  RGWOpType get_type() override { return RGW_OP_LIST_BUCKETS; }
  uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
}; // class RGWListBuckets
+
// Report usage-log statistics for the requesting user over an optional
// date range, summarized per bucket and per category.
class RGWGetUsage : public RGWOp {
protected:
  bool sent_data;
  std::string start_date;
  std::string end_date;
  int show_log_entries;   // bool-valued flags kept as int
  int show_log_sum;
  std::map<std::string, bool> categories;   // category filter (empty = all)
  std::map<rgw_user_bucket, rgw_usage_log_entry> usage;
  std::map<std::string, rgw_usage_log_entry> summary_map;
  std::map<std::string, bucket_meta_entry> buckets_usage;
  cls_user_header header;
  RGWStorageStats stats;
public:
  RGWGetUsage() : sent_data(false), show_log_entries(true), show_log_sum(true){
  }

  int verify_permission(optional_yield y) override;
  void execute(optional_yield y) override;

  virtual int get_params(optional_yield y) = 0;
  void send_response() override {}

  virtual bool should_get_stats() { return false; }

  const char* name() const override { return "get_self_usage"; }
  uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
};
+
// Swift account HEAD: aggregate usage stats for the account, globally
// and broken down per storage policy.
class RGWStatAccount : public RGWOp {
protected:
  RGWUsageStats global_stats;
  std::map<std::string, RGWUsageStats> policies_stats;

public:
  RGWStatAccount() = default;

  int verify_permission(optional_yield y) override;
  void execute(optional_yield y) override;

  void send_response() override = 0;
  const char* name() const override { return "stat_account"; }
  RGWOpType get_type() override { return RGW_OP_STAT_ACCOUNT; }
  uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
};
+
// List objects within a bucket (optionally versioned), with prefix /
// delimiter / marker pagination semantics.
class RGWListBucket : public RGWOp {
protected:
  std::string prefix;
  rgw_obj_key marker;
  rgw_obj_key next_marker;
  rgw_obj_key end_marker;
  std::string max_keys;      // raw client value; parsed by parse_max_keys()
  std::string delimiter;
  std::string encoding_type;
  bool list_versions;
  int max;                   // effective per-page entry cap
  std::vector<rgw_bucket_dir_entry> objs;
  std::map<std::string, bool> common_prefixes;

  int default_max;
  bool is_truncated;
  bool allow_unordered;      // permit unordered listing (cheaper on sharded buckets)

  int shard_id;              // -1 = all shards

  int parse_max_keys();

public:
  RGWListBucket() : list_versions(false), max(0),
                    default_max(0), is_truncated(false),
                    allow_unordered(false), shard_id(-1) {}
  int verify_permission(optional_yield y) override;
  void pre_exec() override;
  void execute(optional_yield y) override;

  // NOTE(review): this override only delegates to the base class and
  // looks redundant; kept as-is since removing an override is a
  // semantic change if a subclass relies on it.
  void init(rgw::sal::Driver* driver, req_state *s, RGWHandler *h) override {
    RGWOp::init(driver, s, h);
  }
  virtual int get_params(optional_yield y) = 0;
  void send_response() override = 0;
  const char* name() const override { return "list_bucket"; }
  RGWOpType get_type() override { return RGW_OP_LIST_BUCKET; }
  uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
  virtual bool need_container_stats() { return false; }
};
+
// GET bucket logging; execute() is a no-op — the response is rendered
// entirely by the dialect's send_response().
class RGWGetBucketLogging : public RGWOp {
public:
  RGWGetBucketLogging() {}
  int verify_permission(optional_yield y) override;
  void execute(optional_yield) override { }

  void send_response() override = 0;
  const char* name() const override { return "get_bucket_logging"; }
  RGWOpType get_type() override { return RGW_OP_GET_BUCKET_LOGGING; }
  uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
};
+
// GET bucket location; execute() is a no-op — the zonegroup / location
// constraint is rendered by the dialect's send_response().
class RGWGetBucketLocation : public RGWOp {
public:
  RGWGetBucketLocation() {}
  ~RGWGetBucketLocation() override {}
  int verify_permission(optional_yield y) override;
  void execute(optional_yield) override { }

  void send_response() override = 0;
  const char* name() const override { return "get_bucket_location"; }
  RGWOpType get_type() override { return RGW_OP_GET_BUCKET_LOCATION; }
  uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
};
+
// GET bucket versioning state (plus MFA-delete status).
class RGWGetBucketVersioning : public RGWOp {
protected:
  bool versioned{false};            // versioning ever enabled on the bucket
  bool versioning_enabled{false};   // currently Enabled (vs Suspended)
  bool mfa_enabled{false};
public:
  RGWGetBucketVersioning() = default;

  int verify_permission(optional_yield y) override;
  void pre_exec() override;
  void execute(optional_yield y) override;

  void send_response() override = 0;
  const char* name() const override { return "get_bucket_versioning"; }
  RGWOpType get_type() override { return RGW_OP_GET_BUCKET_VERSIONING; }
  uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
};
+
// Requested versioning-state transition parsed from a
// PutBucketVersioning request body.
enum BucketVersionStatus {
  VersioningStatusInvalid = -1,  // body did not parse to a known status
  VersioningNotChanged = 0,
  VersioningEnabled = 1,
  VersioningSuspended = 2,
};
+
// PUT bucket versioning: applies the transition parsed by get_params()
// into versioning_status, optionally toggling MFA-delete.
class RGWSetBucketVersioning : public RGWOp {
protected:
  int versioning_status;        // a BucketVersionStatus value
  bool mfa_set_status{false};   // whether the request carried an MFA setting
  bool mfa_status{false};
  bufferlist in_data;           // raw request body for multisite forwarding
public:
  RGWSetBucketVersioning() : versioning_status(VersioningNotChanged) {}

  int verify_permission(optional_yield y) override;
  void pre_exec() override;
  void execute(optional_yield y) override;

  virtual int get_params(optional_yield y) { return 0; }

  void send_response() override = 0;
  const char* name() const override { return "set_bucket_versioning"; }
  RGWOpType get_type() override { return RGW_OP_SET_BUCKET_VERSIONING; }
  uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; }
};
+
// GET bucket website configuration.
class RGWGetBucketWebsite : public RGWOp {
public:
  RGWGetBucketWebsite() {}

  int verify_permission(optional_yield y) override;
  void pre_exec() override;
  void execute(optional_yield y) override;

  void send_response() override = 0;
  const char* name() const override { return "get_bucket_website"; }
  RGWOpType get_type() override { return RGW_OP_GET_BUCKET_WEBSITE; }
  uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
};
+
// PUT bucket website: stores website_conf (parsed by get_params()).
class RGWSetBucketWebsite : public RGWOp {
protected:
  bufferlist in_data;   // raw request body for multisite forwarding
  RGWBucketWebsiteConf website_conf;
public:
  RGWSetBucketWebsite() {}

  int verify_permission(optional_yield y) override;
  void pre_exec() override;
  void execute(optional_yield y) override;

  virtual int get_params(optional_yield y) { return 0; }

  void send_response() override = 0;
  const char* name() const override { return "set_bucket_website"; }
  RGWOpType get_type() override { return RGW_OP_SET_BUCKET_WEBSITE; }
  uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; }
};
+
// DELETE bucket website configuration.
class RGWDeleteBucketWebsite : public RGWOp {
public:
  RGWDeleteBucketWebsite() {}

  int verify_permission(optional_yield y) override;
  void pre_exec() override;
  void execute(optional_yield y) override;

  void send_response() override = 0;
  const char* name() const override { return "delete_bucket_website"; }
  // NOTE(review): returns the SET website op type, and op_mask says
  // WRITE, not DELETE — looks like a copy-paste from RGWSetBucketWebsite.
  // Not changed here: a dedicated RGW_OP_DELETE_BUCKET_WEBSITE value may
  // not exist in RGWOpType, and ops-log consumers may depend on this.
  RGWOpType get_type() override { return RGW_OP_SET_BUCKET_WEBSITE; }
  uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; }
};
+
// HEAD bucket: loads the bucket (with stats) for the dialect to render.
class RGWStatBucket : public RGWOp {
protected:
  std::unique_ptr<rgw::sal::Bucket> bucket;   // owned copy loaded by execute()

public:
  int verify_permission(optional_yield y) override;
  void pre_exec() override;
  void execute(optional_yield y) override;

  void send_response() override = 0;
  const char* name() const override { return "stat_bucket"; }
  RGWOpType get_type() override { return RGW_OP_STAT_BUCKET; }
  uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
};
+
// PUT bucket: create (or, for Swift metadata uploads, update) a bucket
// with ACL, placement, CORS, object-lock and optional Swift versioning.
class RGWCreateBucket : public RGWOp {
protected:
  RGWAccessControlPolicy policy;
  std::string location_constraint;      // client-requested region/zonegroup
  rgw_placement_rule placement_rule;
  RGWBucketInfo info;
  obj_version ep_objv;
  bool has_cors;
  bool relaxed_region_enforcement;      // from rgw_relaxed_region_enforcement conf
  bool obj_lock_enabled;
  RGWCORSConfiguration cors_config;
  boost::optional<std::string> swift_ver_location;
  std::map<std::string, buffer::list> attrs;
  std::set<std::string> rmattr_names;   // attrs to remove on metadata update

  bufferlist in_data;                   // raw request body for multisite forwarding

  // Swift uploads account/container metadata with the create; S3 doesn't.
  virtual bool need_metadata_upload() const { return false; }

public:
  RGWCreateBucket() : has_cors(false), relaxed_region_enforcement(false), obj_lock_enabled(false) {}

  void emplace_attr(std::string&& key, buffer::list&& bl) {
    attrs.emplace(std::move(key), std::move(bl)); /* key and bl are r-value refs */
  }

  int verify_permission(optional_yield y) override;
  void pre_exec() override;
  void execute(optional_yield y) override;
  void init(rgw::sal::Driver* driver, req_state *s, RGWHandler *h) override {
    RGWOp::init(driver, s, h);
    policy.set_ctx(s->cct);
    relaxed_region_enforcement =
      s->cct->_conf.get_val<bool>("rgw_relaxed_region_enforcement");
  }
  virtual int get_params(optional_yield y) { return 0; }
  void send_response() override = 0;
  const char* name() const override { return "create_bucket"; }
  RGWOpType get_type() override { return RGW_OP_CREATE_BUCKET; }
  uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; }
};
+
// DELETE bucket (must be empty); objv_tracker guards against racing
// metadata writers.
class RGWDeleteBucket : public RGWOp {
protected:
  RGWObjVersionTracker objv_tracker;

public:
  RGWDeleteBucket() {}

  int verify_permission(optional_yield y) override;
  void pre_exec() override;
  void execute(optional_yield y) override;

  void send_response() override = 0;
  const char* name() const override { return "delete_bucket"; }
  RGWOpType get_type() override { return RGW_OP_DELETE_BUCKET; }
  uint32_t op_mask() override { return RGW_OP_TYPE_DELETE; }
};
+
// One segment of a Swift static large object (SLO) manifest:
// the segment's path, expected etag, and size in bytes.
struct rgw_slo_entry {
  std::string path;
  std::string etag;
  uint64_t size_bytes;

  rgw_slo_entry() : size_bytes(0) {}

  void encode(bufferlist& bl) const {
    ENCODE_START(1, 1, bl);
    encode(path, bl);
    encode(etag, bl);
    encode(size_bytes, bl);
    ENCODE_FINISH(bl);
  }

  void decode(bufferlist::const_iterator& bl) {
    DECODE_START(1, bl);
    decode(path, bl);
    decode(etag, bl);
    decode(size_bytes, bl);
    DECODE_FINISH(bl);
  }

  // JSON form used by the Swift SLO upload API (defined in rgw_op.cc).
  void decode_json(JSONObj *obj);
};
WRITE_CLASS_ENCODER(rgw_slo_entry)
+WRITE_CLASS_ENCODER(rgw_slo_entry)
+
+struct RGWSLOInfo {
+ std::vector<rgw_slo_entry> entries;
+ uint64_t total_size;
+
+ /* in memory only */
+ bufferlist raw_data;
+
+ RGWSLOInfo() : total_size(0) {}
+ ~RGWSLOInfo() {}
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(entries, bl);
+ encode(total_size, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(entries, bl);
+ decode(total_size, bl);
+ DECODE_FINISH(bl);
+ }
+};
+WRITE_CLASS_ENCODER(RGWSLOInfo)
+
+class RGWPutObj : public RGWOp {
+protected:
+ seed torrent;
+ off_t ofs;
+ const char *supplied_md5_b64;
+ const char *supplied_etag;
+ const char *if_match;
+ const char *if_nomatch;
+ std::string copy_source;
+ const char *copy_source_range;
+ RGWBucketInfo copy_source_bucket_info;
+ std::string copy_source_tenant_name;
+ std::string copy_source_bucket_name;
+ std::string copy_source_object_name;
+ std::string copy_source_version_id;
+ off_t copy_source_range_fst;
+ off_t copy_source_range_lst;
+ std::string etag;
+ bool chunked_upload;
+ RGWAccessControlPolicy policy;
+ std::unique_ptr <RGWObjTags> obj_tags;
+ const char *dlo_manifest;
+ RGWSLOInfo *slo_info;
+ rgw::sal::Attrs attrs;
+ ceph::real_time mtime;
+ uint64_t olh_epoch;
+ std::string version_id;
+ bufferlist bl_aux;
+ std::map<std::string, std::string> crypt_http_responses;
+ std::string user_data;
+
+ std::string multipart_upload_id;
+ std::string multipart_part_str;
+ int multipart_part_num = 0;
+ jspan multipart_trace;
+
+ boost::optional<ceph::real_time> delete_at;
+ //append obj
+ bool append;
+ uint64_t position;
+ uint64_t cur_accounted_size;
+
+ //object lock
+ RGWObjectRetention *obj_retention;
+ RGWObjectLegalHold *obj_legal_hold;
+
+public:
+ RGWPutObj() : ofs(0),
+ supplied_md5_b64(NULL),
+ supplied_etag(NULL),
+ if_match(NULL),
+ if_nomatch(NULL),
+ copy_source_range(NULL),
+ copy_source_range_fst(0),
+ copy_source_range_lst(0),
+ chunked_upload(0),
+ dlo_manifest(NULL),
+ slo_info(NULL),
+ olh_epoch(0),
+ append(false),
+ position(0),
+ cur_accounted_size(0),
+ obj_retention(nullptr),
+ obj_legal_hold(nullptr) {}
+
+ ~RGWPutObj() override {
+ delete slo_info;
+ delete obj_retention;
+ delete obj_legal_hold;
+ }
+
+ void init(rgw::sal::Driver* driver, req_state *s, RGWHandler *h) override {
+ RGWOp::init(driver, s, h);
+ policy.set_ctx(s->cct);
+ }
+
+ virtual int init_processing(optional_yield y) override;
+
+ void emplace_attr(std::string&& key, buffer::list&& bl) {
+ attrs.emplace(std::move(key), std::move(bl)); /* key and bl are r-value refs */
+ }
+
+ int verify_permission(optional_yield y) override;
+ void pre_exec() override;
+ void execute(optional_yield y) override;
+
+ /* this is for cases when copying data from other object */
+ virtual int get_decrypt_filter(std::unique_ptr<RGWGetObj_Filter>* filter,
+ RGWGetObj_Filter* cb,
+ std::map<std::string, bufferlist>& attrs,
+ bufferlist* manifest_bl) {
+ *filter = nullptr;
+ return 0;
+ }
+ virtual int get_encrypt_filter(std::unique_ptr<rgw::sal::DataProcessor> *filter,
+ rgw::sal::DataProcessor *cb) {
+ return 0;
+ }
+
+ // get lua script to run as a "put object" filter
+ int get_lua_filter(std::unique_ptr<rgw::sal::DataProcessor>* filter,
+ rgw::sal::DataProcessor* cb);
+
+ int get_data_cb(bufferlist& bl, off_t bl_ofs, off_t bl_len);
+ int get_data(const off_t fst, const off_t lst, bufferlist& bl);
+
+ virtual int get_params(optional_yield y) = 0;
+ virtual int get_data(bufferlist& bl) = 0;
+ void send_response() override = 0;
+ const char* name() const override { return "put_obj"; }
+ RGWOpType get_type() override { return RGW_OP_PUT_OBJ; }
+ uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; }
+ dmc::client_id dmclock_client() override { return dmc::client_id::data; }
+};
+
+class RGWPostObj : public RGWOp {
+protected:
+ off_t min_len;
+ off_t max_len;
+ int len;
+ off_t ofs;
+ const char *supplied_md5_b64;
+ const char *supplied_etag;
+ std::string etag;
+ RGWAccessControlPolicy policy;
+ std::map<std::string, bufferlist> attrs;
+ boost::optional<ceph::real_time> delete_at;
+
+ /* Must be called after get_data() or the result is undefined. */
+ virtual std::string get_current_filename() const = 0;
+ virtual std::string get_current_content_type() const = 0;
+ virtual bool is_next_file_to_upload() {
+ return false;
+ }
+public:
+ RGWPostObj() : min_len(0),
+ max_len(LLONG_MAX),
+ len(0),
+ ofs(0),
+ supplied_md5_b64(nullptr),
+ supplied_etag(nullptr) {
+ }
+
+ void emplace_attr(std::string&& key, buffer::list&& bl) {
+ attrs.emplace(std::move(key), std::move(bl)); /* key and bl are r-value refs */
+ }
+
+ void init(rgw::sal::Driver* driver, req_state *s, RGWHandler *h) override {
+ RGWOp::init(driver, s, h);
+ policy.set_ctx(s->cct);
+ }
+
+ int verify_permission(optional_yield y) override;
+ void pre_exec() override;
+ void execute(optional_yield y) override;
+
+ virtual int get_encrypt_filter(std::unique_ptr<rgw::sal::DataProcessor> *filter,
+ rgw::sal::DataProcessor *cb) {
+ return 0;
+ }
+ virtual int get_params(optional_yield y) = 0;
+ virtual int get_data(ceph::bufferlist& bl, bool& again) = 0;
+ void send_response() override = 0;
+ const char* name() const override { return "post_obj"; }
+ RGWOpType get_type() override { return RGW_OP_POST_OBJ; }
+ uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; }
+ dmc::client_id dmclock_client() override { return dmc::client_id::data; }
+};
+
+class RGWPutMetadataAccount : public RGWOp {
+protected:
+ std::set<std::string> rmattr_names;
+ std::map<std::string, bufferlist> attrs, orig_attrs;
+ std::map<int, std::string> temp_url_keys;
+ RGWQuotaInfo new_quota;
+ bool new_quota_extracted;
+
+ RGWAccessControlPolicy policy;
+ bool has_policy;
+
+public:
+ RGWPutMetadataAccount()
+ : new_quota_extracted(false),
+ has_policy(false) {
+ }
+
+ void init(rgw::sal::Driver* driver, req_state *s, RGWHandler *h) override {
+ RGWOp::init(driver, s, h);
+ policy.set_ctx(s->cct);
+ }
+ int init_processing(optional_yield y) override;
+ int verify_permission(optional_yield y) override;
+ void pre_exec() override { }
+ void execute(optional_yield y) override;
+
+ virtual int get_params(optional_yield y) = 0;
+ void send_response() override = 0;
+ virtual void filter_out_temp_url(std::map<std::string, bufferlist>& add_attrs,
+ const std::set<std::string>& rmattr_names,
+ std::map<int, std::string>& temp_url_keys);
+ const char* name() const override { return "put_account_metadata"; }
+ RGWOpType get_type() override { return RGW_OP_PUT_METADATA_ACCOUNT; }
+ uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; }
+};
+
+class RGWPutMetadataBucket : public RGWOp {
+protected:
+ rgw::sal::Attrs attrs;
+ std::set<std::string> rmattr_names;
+ bool has_policy, has_cors;
+ uint32_t policy_rw_mask;
+ RGWAccessControlPolicy policy;
+ RGWCORSConfiguration cors_config;
+ rgw_placement_rule placement_rule;
+ boost::optional<std::string> swift_ver_location;
+
+public:
+ RGWPutMetadataBucket()
+ : has_policy(false), has_cors(false), policy_rw_mask(0)
+ {}
+
+ void emplace_attr(std::string&& key, buffer::list&& bl) {
+ attrs.emplace(std::move(key), std::move(bl)); /* key and bl are r-value refs */
+ }
+
+ void init(rgw::sal::Driver* driver, req_state *s, RGWHandler *h) override {
+ RGWOp::init(driver, s, h);
+ policy.set_ctx(s->cct);
+ }
+
+ int verify_permission(optional_yield y) override;
+ void pre_exec() override;
+ void execute(optional_yield y) override;
+
+ virtual int get_params(optional_yield y) = 0;
+ void send_response() override = 0;
+ const char* name() const override { return "put_bucket_metadata"; }
+ RGWOpType get_type() override { return RGW_OP_PUT_METADATA_BUCKET; }
+ uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; }
+};
+
+class RGWPutMetadataObject : public RGWOp {
+protected:
+ RGWAccessControlPolicy policy;
+ boost::optional<ceph::real_time> delete_at;
+ const char *dlo_manifest;
+
+public:
+ RGWPutMetadataObject()
+ : dlo_manifest(NULL)
+ {}
+
+ void init(rgw::sal::Driver* driver, req_state *s, RGWHandler *h) override {
+ RGWOp::init(driver, s, h);
+ policy.set_ctx(s->cct);
+ }
+ int verify_permission(optional_yield y) override;
+ void pre_exec() override;
+ void execute(optional_yield y) override;
+
+ virtual int get_params(optional_yield y) = 0;
+ void send_response() override = 0;
+ const char* name() const override { return "put_obj_metadata"; }
+ RGWOpType get_type() override { return RGW_OP_PUT_METADATA_OBJECT; }
+ uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; }
+ virtual bool need_object_expiration() { return false; }
+};
+
+class RGWDeleteObj : public RGWOp {
+protected:
+ bool delete_marker;
+ bool multipart_delete;
+ std::string version_id;
+ ceph::real_time unmod_since; /* if unmodified since */
+ bool no_precondition_error;
+ std::unique_ptr<RGWBulkDelete::Deleter> deleter;
+ bool bypass_perm;
+ bool bypass_governance_mode;
+
+public:
+ RGWDeleteObj()
+ : delete_marker(false),
+ multipart_delete(false),
+ no_precondition_error(false),
+ deleter(nullptr),
+ bypass_perm(true),
+ bypass_governance_mode(false) {
+ }
+
+ int verify_permission(optional_yield y) override;
+ void pre_exec() override;
+ void execute(optional_yield y) override;
+ int handle_slo_manifest(bufferlist& bl, optional_yield y);
+
+ virtual int get_params(optional_yield y) { return 0; }
+ void send_response() override = 0;
+ const char* name() const override { return "delete_obj"; }
+ RGWOpType get_type() override { return RGW_OP_DELETE_OBJ; }
+ uint32_t op_mask() override { return RGW_OP_TYPE_DELETE; }
+ virtual bool need_object_expiration() { return false; }
+ dmc::client_id dmclock_client() override { return dmc::client_id::data; }
+};
+
+class RGWCopyObj : public RGWOp {
+protected:
+ RGWAccessControlPolicy dest_policy;
+ const char *if_mod;
+ const char *if_unmod;
+ const char *if_match;
+ const char *if_nomatch;
+ // Required or it is not a copy operation
+ std::string_view copy_source;
+ // Not actually required
+ std::optional<std::string_view> md_directive;
+
+ off_t ofs;
+ off_t len;
+ off_t end;
+ ceph::real_time mod_time;
+ ceph::real_time unmod_time;
+ ceph::real_time *mod_ptr;
+ ceph::real_time *unmod_ptr;
+ rgw::sal::Attrs attrs;
+ std::unique_ptr<rgw::sal::Bucket> src_bucket;
+ ceph::real_time src_mtime;
+ ceph::real_time mtime;
+ rgw::sal::AttrsMod attrs_mod;
+ std::string source_zone;
+ std::string etag;
+
+ off_t last_ofs;
+
+ std::string version_id;
+ uint64_t olh_epoch;
+
+ boost::optional<ceph::real_time> delete_at;
+ bool copy_if_newer;
+
+ bool need_to_check_storage_class = false;
+
+ //object lock
+ RGWObjectRetention *obj_retention;
+ RGWObjectLegalHold *obj_legal_hold;
+
+ int init_common();
+
+public:
+ RGWCopyObj() {
+ if_mod = NULL;
+ if_unmod = NULL;
+ if_match = NULL;
+ if_nomatch = NULL;
+ ofs = 0;
+ len = 0;
+ end = -1;
+ mod_ptr = NULL;
+ unmod_ptr = NULL;
+ attrs_mod = rgw::sal::ATTRSMOD_NONE;
+ last_ofs = 0;
+ olh_epoch = 0;
+ copy_if_newer = false;
+ obj_retention = nullptr;
+ obj_legal_hold = nullptr;
+ }
+
+ ~RGWCopyObj() override {
+ delete obj_retention;
+ delete obj_legal_hold;
+ }
+
+ static bool parse_copy_location(const std::string_view& src,
+ std::string& bucket_name,
+ rgw_obj_key& object,
+ req_state *s);
+
+ void emplace_attr(std::string&& key, buffer::list&& bl) {
+ attrs.emplace(std::move(key), std::move(bl));
+ }
+
+ void init(rgw::sal::Driver* driver, req_state *s, RGWHandler *h) override {
+ RGWOp::init(driver, s, h);
+ dest_policy.set_ctx(s->cct);
+ }
+ int init_processing(optional_yield y) override;
+ int verify_permission(optional_yield y) override;
+ void pre_exec() override;
+ void execute(optional_yield y) override;
+ void progress_cb(off_t ofs);
+
+ virtual int check_storage_class(const rgw_placement_rule& src_placement) {
+ return 0;
+ }
+
+ virtual int init_dest_policy() { return 0; }
+ virtual int get_params(optional_yield y) = 0;
+ virtual void send_partial_response(off_t ofs) {}
+ void send_response() override = 0;
+ const char* name() const override { return "copy_obj"; }
+ RGWOpType get_type() override { return RGW_OP_COPY_OBJ; }
+ uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; }
+ dmc::client_id dmclock_client() override { return dmc::client_id::data; }
+};
+
/// Base op behind ACL reads ("get_acls"); frontends render the result.
class RGWGetACLs : public RGWOp {
protected:
  std::string acls; // serialized ACL document produced by execute()

public:
  RGWGetACLs() {}

  int verify_permission(optional_yield y) override;
  void pre_exec() override;
  void execute(optional_yield y) override;

  void send_response() override = 0; // protocol-specific
  const char* name() const override { return "get_acls"; }
  RGWOpType get_type() override { return RGW_OP_GET_ACLS; }
  uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
};
+
/// Base op behind ACL replacement ("put_acls").
class RGWPutACLs : public RGWOp {
protected:
  bufferlist data; // raw ACL document from the request body
  ACLOwner owner;

public:
  RGWPutACLs() {}
  ~RGWPutACLs() override {}

  int verify_permission(optional_yield y) override;
  void pre_exec() override;
  void execute(optional_yield y) override;

  // Protocol-specific ACL parsing hook; the default is a no-op.
  virtual int get_policy_from_state(rgw::sal::Driver* driver, req_state *s, std::stringstream& ss) { return 0; }
  virtual int get_params(optional_yield y) = 0;
  void send_response() override = 0;
  const char* name() const override { return "put_acls"; }
  RGWOpType get_type() override { return RGW_OP_PUT_ACLS; }
  uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; }
};
+
/// Base op behind lifecycle-configuration reads ("get_lifecycle").
/// Note: execute() itself is pure virtual — frontends do the retrieval.
class RGWGetLC : public RGWOp {
protected:

public:
  RGWGetLC() { }
  ~RGWGetLC() override { }

  int verify_permission(optional_yield y) override;
  void pre_exec() override;
  void execute(optional_yield) override = 0;

  void send_response() override = 0;
  const char* name() const override { return "get_lifecycle"; }
  RGWOpType get_type() override { return RGW_OP_GET_LC; }
  uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
};
+
/// Base op behind lifecycle-configuration installation ("put_lifecycle").
class RGWPutLC : public RGWOp {
protected:
  bufferlist data;          // lifecycle document from the request body
  const char *content_md5;  // Content-MD5 value, when supplied
  std::string cookie;       // random cookie generated in init()

public:
  RGWPutLC() {
    content_md5 = nullptr;
  }
  ~RGWPutLC() override {}

  void init(rgw::sal::Driver* driver, req_state *s, RGWHandler *dialect_handler) override {
    static constexpr std::size_t COOKIE_LEN = 16;
    char buf[COOKIE_LEN + 1]; // +1 for the NUL terminator

    RGWOp::init(driver, s, dialect_handler);
    // fill buf with COOKIE_LEN random alphanumeric characters
    gen_rand_alphanumeric(s->cct, buf, sizeof(buf) - 1);
    cookie = buf;
  }

  int verify_permission(optional_yield y) override;
  void pre_exec() override;
  void execute(optional_yield y) override;

//  virtual int get_policy_from_state(RGWRados* driver, req_state *s, std::stringstream& ss) { return 0; }
  virtual int get_params(optional_yield y) = 0;
  void send_response() override = 0;
  const char* name() const override { return "put_lifecycle"; }
  RGWOpType get_type() override { return RGW_OP_PUT_LC; }
  uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; }
};
+
/// Base op behind lifecycle-configuration removal ("delete_lifecycle").
class RGWDeleteLC : public RGWOp {
public:
  RGWDeleteLC() = default;
  int verify_permission(optional_yield y) override;
  void pre_exec() override;
  void execute(optional_yield y) override;

  void send_response() override = 0; // protocol-specific
  const char* name() const override { return "delete_lifecycle"; }
  RGWOpType get_type() override { return RGW_OP_DELETE_LC; }
  uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; }
};
+
/// Base op behind CORS-configuration reads ("get_cors").
class RGWGetCORS : public RGWOp {
protected:

public:
  RGWGetCORS() {}

  int verify_permission(optional_yield y) override;
  void execute(optional_yield y) override;

  void send_response() override = 0; // protocol-specific
  const char* name() const override { return "get_cors"; }
  RGWOpType get_type() override { return RGW_OP_GET_CORS; }
  uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
};
+
/// Base op behind CORS-configuration installation ("put_cors").
class RGWPutCORS : public RGWOp {
protected:
  bufferlist cors_bl;  // encoded CORS configuration to store
  bufferlist in_data;  // raw request body

public:
  RGWPutCORS() {}
  ~RGWPutCORS() override {}

  int verify_permission(optional_yield y) override;
  void execute(optional_yield y) override;

  virtual int get_params(optional_yield y) = 0;
  void send_response() override = 0;
  const char* name() const override { return "put_cors"; }
  RGWOpType get_type() override { return RGW_OP_PUT_CORS; }
  uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; }
};
+
/// Base op behind CORS-configuration removal ("delete_cors").
class RGWDeleteCORS : public RGWOp {
protected:

public:
  RGWDeleteCORS() {}

  int verify_permission(optional_yield y) override;
  void execute(optional_yield y) override;

  void send_response() override = 0; // protocol-specific
  const char* name() const override { return "delete_cors"; }
  RGWOpType get_type() override { return RGW_OP_DELETE_CORS; }
  uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; }
};
+
+class RGWOptionsCORS : public RGWOp {
+protected:
+ RGWCORSRule *rule;
+ const char *origin, *req_hdrs, *req_meth;
+
+public:
+ RGWOptionsCORS() : rule(NULL), origin(NULL),
+ req_hdrs(NULL), req_meth(NULL) {
+ }
+
+ int verify_permission(optional_yield y) override {return 0;}
+ int validate_cors_request(RGWCORSConfiguration *cc);
+ void execute(optional_yield y) override;
+ void get_response_params(std::string& allowed_hdrs, std::string& exp_hdrs, unsigned *max_age);
+ void send_response() override = 0;
+ const char* name() const override { return "options_cors"; }
+ RGWOpType get_type() override { return RGW_OP_OPTIONS_CORS; }
+ uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
+};
+
/// Base op behind bucket-encryption installation ("put_bucket_encryption").
class RGWPutBucketEncryption : public RGWOp {
protected:
  RGWBucketEncryptionConfig bucket_encryption_conf;
  bufferlist data; // raw request body
public:
  RGWPutBucketEncryption() = default;
  ~RGWPutBucketEncryption() {}

  int get_params(optional_yield y);
  int verify_permission(optional_yield y) override;
  void execute(optional_yield y) override;
  const char* name() const override { return "put_bucket_encryption"; }
  RGWOpType get_type() override { return RGW_OP_PUT_BUCKET_ENCRYPTION; }
  uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; }
};
+
/// Base op behind bucket-encryption reads ("get_bucket_encryption").
class RGWGetBucketEncryption : public RGWOp {
protected:
  RGWBucketEncryptionConfig bucket_encryption_conf;
public:
  RGWGetBucketEncryption() {}

  int get_params(optional_yield y);
  int verify_permission(optional_yield y) override;
  void execute(optional_yield y) override;
  const char* name() const override { return "get_bucket_encryption"; }
  RGWOpType get_type() override { return RGW_OP_GET_BUCKET_ENCRYPTION; }
  uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
};
+
/// Base op behind bucket-encryption removal ("delete_bucket_encryption").
class RGWDeleteBucketEncryption : public RGWOp {
protected:
  RGWBucketEncryptionConfig bucket_encryption_conf;
public:
  RGWDeleteBucketEncryption() {}

  int get_params(optional_yield y);
  int verify_permission(optional_yield y) override;
  void execute(optional_yield y) override;
  const char* name() const override { return "delete_bucket_encryption"; }
  RGWOpType get_type() override { return RGW_OP_DELETE_BUCKET_ENCRYPTION; }
  uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; }
};
+
+class RGWGetRequestPayment : public RGWOp {
+protected:
+ bool requester_pays;
+
+public:
+ RGWGetRequestPayment() : requester_pays(0) {}
+
+ int verify_permission(optional_yield y) override;
+ void pre_exec() override;
+ void execute(optional_yield y) override;
+
+ void send_response() override = 0;
+ const char* name() const override { return "get_request_payment"; }
+ RGWOpType get_type() override { return RGW_OP_GET_REQUEST_PAYMENT; }
+ uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
+};
+
+class RGWSetRequestPayment : public RGWOp {
+protected:
+ bool requester_pays;
+ bufferlist in_data;
+public:
+ RGWSetRequestPayment() : requester_pays(false) {}
+
+ int verify_permission(optional_yield y) override;
+ void pre_exec() override;
+ void execute(optional_yield y) override;
+
+ virtual int get_params(optional_yield y) { return 0; }
+
+ void send_response() override = 0;
+ const char* name() const override { return "set_request_payment"; }
+ RGWOpType get_type() override { return RGW_OP_SET_REQUEST_PAYMENT; }
+ uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; }
+};
+
/// Base op behind multipart-upload initiation ("init_multipart").
class RGWInitMultipart : public RGWOp {
protected:
  std::string upload_id; // assigned upload id, filled in by execute()
  RGWAccessControlPolicy policy;
  ceph::real_time mtime;
  jspan multipart_trace; // tracing span covering the whole upload

public:
  RGWInitMultipart() {}

  void init(rgw::sal::Driver* driver, req_state *s, RGWHandler *h) override {
    RGWOp::init(driver, s, h);
    policy.set_ctx(s->cct);
  }
  int verify_permission(optional_yield y) override;
  void pre_exec() override;
  void execute(optional_yield y) override;

  virtual int get_params(optional_yield y) = 0;
  void send_response() override = 0;
  const char* name() const override { return "init_multipart"; }
  RGWOpType get_type() override { return RGW_OP_INIT_MULTIPART; }
  uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; }
  // Hook for SSE setup on the upload's attributes; default is a no-op.
  virtual int prepare_encryption(std::map<std::string, bufferlist>& attrs) { return 0; }
};
+
/// Base op behind multipart-upload completion ("complete_multipart").
class RGWCompleteMultipart : public RGWOp {
protected:
  std::string upload_id;
  std::string etag;
  std::string version_id;
  bufferlist data; // raw CompleteMultipartUpload body
  // serializer taken for the completion; see execute()/complete()
  std::unique_ptr<rgw::sal::MPSerializer> serializer;
  jspan multipart_trace;

public:
  RGWCompleteMultipart() {}
  ~RGWCompleteMultipart() = default;

  int verify_permission(optional_yield y) override;
  void pre_exec() override;
  void execute(optional_yield y) override;
  // Detects a retry of an already-completed upload (idempotent complete).
  bool check_previously_completed(const RGWMultiCompleteUpload* parts);
  void complete() override;

  virtual int get_params(optional_yield y) = 0;
  void send_response() override = 0;
  const char* name() const override { return "complete_multipart"; }
  RGWOpType get_type() override { return RGW_OP_COMPLETE_MULTIPART; }
  uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; }
};
+
/// Base op behind multipart-upload abort ("abort_multipart").
class RGWAbortMultipart : public RGWOp {
protected:
  jspan multipart_trace;
public:
  RGWAbortMultipart() {}

  int verify_permission(optional_yield y) override;
  void pre_exec() override;
  void execute(optional_yield y) override;

  void send_response() override = 0; // protocol-specific
  const char* name() const override { return "abort_multipart"; }
  RGWOpType get_type() override { return RGW_OP_ABORT_MULTIPART; }
  uint32_t op_mask() override { return RGW_OP_TYPE_DELETE; }
};
+
+class RGWListMultipart : public RGWOp {
+protected:
+ std::string upload_id;
+ std::unique_ptr<rgw::sal::MultipartUpload> upload;
+ int max_parts;
+ int marker;
+ RGWAccessControlPolicy policy;
+ bool truncated;
+ rgw_placement_rule* placement;
+
+public:
+ RGWListMultipart() {
+ max_parts = 1000;
+ marker = 0;
+ truncated = false;
+ }
+
+ void init(rgw::sal::Driver* driver, req_state *s, RGWHandler *h) override {
+ RGWOp::init(driver, s, h);
+ policy = RGWAccessControlPolicy(s->cct);
+ }
+ int verify_permission(optional_yield y) override;
+ void pre_exec() override;
+ void execute(optional_yield y) override;
+
+ virtual int get_params(optional_yield y) = 0;
+ void send_response() override = 0;
+ const char* name() const override { return "list_multipart"; }
+ RGWOpType get_type() override { return RGW_OP_LIST_MULTIPART; }
+ uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
+};
+
+class RGWListBucketMultiparts : public RGWOp {
+protected:
+ std::string prefix;
+ std::string marker_meta;
+ std::string marker_key;
+ std::string marker_upload_id;
+ std::string next_marker_key;
+ std::string next_marker_upload_id;
+ int max_uploads;
+ std::string delimiter;
+ std::vector<std::unique_ptr<rgw::sal::MultipartUpload>> uploads;
+ std::map<std::string, bool> common_prefixes;
+ bool is_truncated;
+ int default_max;
+ bool encode_url {false};
+
+public:
+ RGWListBucketMultiparts() {
+ max_uploads = 0;
+ is_truncated = false;
+ default_max = 0;
+ }
+
+ void init(rgw::sal::Driver* driver, req_state *s, RGWHandler *h) override {
+ RGWOp::init(driver, s, h);
+ max_uploads = default_max;
+ }
+
+ int verify_permission(optional_yield y) override;
+ void pre_exec() override;
+ void execute(optional_yield y) override;
+
+ virtual int get_params(optional_yield y) = 0;
+ void send_response() override = 0;
+ const char* name() const override { return "list_bucket_multiparts"; }
+ RGWOpType get_type() override { return RGW_OP_LIST_BUCKET_MULTIPARTS; }
+ uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
+};
+
+
/// Base op behind the cross-domain policy endpoint
/// ("get_crossdomain_policy"); always succeeds, no permission check.
class RGWGetCrossDomainPolicy : public RGWOp {
public:
  RGWGetCrossDomainPolicy() = default;
  ~RGWGetCrossDomainPolicy() override = default;

  int verify_permission(optional_yield) override {
    return 0;
  }

  void execute(optional_yield) override {
    op_ret = 0;
  }

  const char* name() const override { return "get_crossdomain_policy"; }

  RGWOpType get_type() override {
    return RGW_OP_GET_CROSS_DOMAIN_POLICY;
  }

  uint32_t op_mask() override {
    return RGW_OP_TYPE_READ;
  }
};
+
+
/// Base op behind the health-check endpoint ("get_health_check");
/// unauthenticated (verify_permission always allows).
class RGWGetHealthCheck : public RGWOp {
public:
  RGWGetHealthCheck() = default;
  ~RGWGetHealthCheck() override = default;

  int verify_permission(optional_yield) override {
    return 0;
  }

  void execute(optional_yield y) override;

  const char* name() const override { return "get_health_check"; }

  RGWOpType get_type() override {
    return RGW_OP_GET_HEALTH_CHECK;
  }

  uint32_t op_mask() override {
    return RGW_OP_TYPE_READ;
  }
};
+
+
+class RGWDeleteMultiObj : public RGWOp {
+ /**
+ * Handles the deletion of an individual object and uses
+ * set_partial_response to record the outcome.
+ */
+ void handle_individual_object(const rgw_obj_key& o,
+ optional_yield y,
+ boost::asio::deadline_timer *formatter_flush_cond);
+
+ /**
+ * When the request is being executed in a coroutine, performs
+ * the actual formatter flushing and is responsible for the
+ * termination condition (when when all partial object responses
+ * have been sent). Note that the formatter flushing must be handled
+ * on the coroutine that invokes the execute method vs. the
+ * coroutines that are spawned to handle individual objects because
+ * the flush logic uses a yield context that was captured
+ * and saved on the req_state vs. one that is passed on the stack.
+ * This is a no-op in the case where we're not executing as a coroutine.
+ */
+ void wait_flush(optional_yield y,
+ boost::asio::deadline_timer *formatter_flush_cond,
+ std::function<bool()> predicate);
+
+protected:
+ std::vector<delete_multi_obj_entry> ops_log_entries;
+ bufferlist data;
+ rgw::sal::Bucket* bucket;
+ bool quiet;
+ bool status_dumped;
+ bool acl_allowed = false;
+ bool bypass_perm;
+ bool bypass_governance_mode;
+
+public:
+ RGWDeleteMultiObj() {
+ quiet = false;
+ status_dumped = false;
+ bypass_perm = true;
+ bypass_governance_mode = false;
+ }
+
+ int verify_permission(optional_yield y) override;
+ void pre_exec() override;
+ void execute(optional_yield y) override;
+
+ virtual int get_params(optional_yield y) = 0;
+ virtual void send_status() = 0;
+ virtual void begin_response() = 0;
+ virtual void send_partial_response(const rgw_obj_key& key, bool delete_marker,
+ const std::string& marker_version_id, int ret,
+ boost::asio::deadline_timer *formatter_flush_cond) = 0;
+ virtual void end_response() = 0;
+ const char* name() const override { return "multi_object_delete"; }
+ RGWOpType get_type() override { return RGW_OP_DELETE_MULTI_OBJ; }
+ uint32_t op_mask() override { return RGW_OP_TYPE_DELETE; }
+
+ void write_ops_log_entry(rgw_log_entry& entry) const override;
+};
+
/// Base op behind the service-info endpoint ("get info"); unauthenticated.
class RGWInfo: public RGWOp {
public:
  RGWInfo() = default;
  ~RGWInfo() override = default;

  int verify_permission(optional_yield) override { return 0; }
  const char* name() const override { return "get info"; }
  RGWOpType get_type() override { return RGW_OP_GET_INFO; }
  uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
};
+
/* Policy/IAM helper declarations shared by the op implementations
 * (definitions live in the corresponding .cc). */
extern int rgw_build_bucket_policies(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver,
                                     req_state* s, optional_yield y);
extern int rgw_build_object_policies(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver,
                                     req_state *s, bool prefetch_data, optional_yield y);
extern void rgw_build_iam_environment(rgw::sal::Driver* driver,
                                      req_state* s);
extern std::vector<rgw::IAM::Policy> get_iam_user_policy_from_attr(CephContext* cct,
                                                                   std::map<std::string, bufferlist>& attrs,
                                                                   const std::string& tenant);
+
+inline int get_system_versioning_params(req_state *s,
+ uint64_t *olh_epoch,
+ std::string *version_id)
+{
+ if (!s->system_request) {
+ return 0;
+ }
+
+ if (olh_epoch) {
+ std::string epoch_str = s->info.args.get(RGW_SYS_PARAM_PREFIX "versioned-epoch");
+ if (!epoch_str.empty()) {
+ std::string err;
+ *olh_epoch = strict_strtol(epoch_str.c_str(), 10, &err);
+ if (!err.empty()) {
+ ldpp_subdout(s, rgw, 0) << "failed to parse versioned-epoch param"
+ << dendl;
+ return -EINVAL;
+ }
+ }
+ }
+
+ if (version_id) {
+ *version_id = s->info.args.get(RGW_SYS_PARAM_PREFIX "version-id");
+ }
+
+ return 0;
+} /* get_system_versioning_params */
+
+static inline void format_xattr(std::string &xattr)
+{
+ /* If the extended attribute is not valid UTF-8, we encode it using
+ * quoted-printable encoding.
+ */
+ if ((check_utf8(xattr.c_str(), xattr.length()) != 0) ||
+ (check_for_control_characters(xattr.c_str(), xattr.length()) != 0)) {
+ static const char MIME_PREFIX_STR[] = "=?UTF-8?Q?";
+ static const int MIME_PREFIX_LEN = sizeof(MIME_PREFIX_STR) - 1;
+ static const char MIME_SUFFIX_STR[] = "?=";
+ static const int MIME_SUFFIX_LEN = sizeof(MIME_SUFFIX_STR) - 1;
+ int mlen = mime_encode_as_qp(xattr.c_str(), NULL, 0);
+ char *mime = new char[MIME_PREFIX_LEN + mlen + MIME_SUFFIX_LEN + 1];
+ strcpy(mime, MIME_PREFIX_STR);
+ mime_encode_as_qp(xattr.c_str(), mime + MIME_PREFIX_LEN, mlen);
+ strcpy(mime + MIME_PREFIX_LEN + (mlen - 1), MIME_SUFFIX_STR);
+ xattr.assign(mime);
+ delete [] mime;
+ }
+} /* format_xattr */
+
/**
 * Get the HTTP request metadata out of the req_state as a
 * map(<attr_name, attr_contents>, where attr_name is RGW_ATTR_PREFIX.HTTP_NAME)
 * s: The request state
 * attrs: will be filled up with attrs mapped as <attr_name, attr_contents>
 * On success returns 0.
 * On failure returns a negative error code
 * (-ENAMETOOLONG, -EFBIG or -E2BIG when a configured limit is exceeded).
 */
inline int rgw_get_request_metadata(const DoutPrefixProvider *dpp,
                                    CephContext* const cct,
                                    struct req_info& info,
                                    std::map<std::string, ceph::bufferlist>& attrs,
                                    const bool allow_empty_attrs = true)
{
  /* Headers that must never be stored as user metadata. */
  static const std::set<std::string> blocklisted_headers = {
      "x-amz-server-side-encryption-customer-algorithm",
      "x-amz-server-side-encryption-customer-key",
      "x-amz-server-side-encryption-customer-key-md5",
      "x-amz-storage-class"
  };

  size_t valid_meta_count = 0;
  for (auto& kv : info.x_meta_map) {
    const std::string& name = kv.first;
    std::string& xattr = kv.second;

    if (blocklisted_headers.count(name) == 1) {
      ldpp_subdout(dpp, rgw, 10) << "skipping x>> " << name << dendl;
      continue;
    } else if (allow_empty_attrs || !xattr.empty()) {
      ldpp_subdout(dpp, rgw, 10) << "x>> " << name << ":" << xattr << dendl;
      format_xattr(xattr); /* quoted-printable-encode non-UTF-8 values in place */

      std::string attr_name(RGW_ATTR_PREFIX);
      attr_name.append(name);

      /* Check roughly whether we aren't going behind the limit on attribute
       * name. Passing here doesn't guarantee that an OSD will accept that
       * as ObjectStore::get_max_attr_name_length() can set the limit even
       * lower than the "osd_max_attr_name_len" configurable. */
      const auto max_attr_name_len = cct->_conf->rgw_max_attr_name_len;
      if (max_attr_name_len && attr_name.length() > max_attr_name_len) {
        return -ENAMETOOLONG;
      }

      /* Similar remarks apply to the check for value size. We're verifying
       * it early at the RGW's side as it's being claimed in /info. */
      const auto max_attr_size = cct->_conf->rgw_max_attr_size;
      if (max_attr_size && xattr.length() > max_attr_size) {
        return -EFBIG;
      }

      /* Swift allows administrators to limit the number of metadata items
       * sent _in a single request_. */
      const auto max_attrs_num_in_req = cct->_conf->rgw_max_attrs_num_in_req;
      if (max_attrs_num_in_req &&
          ++valid_meta_count > max_attrs_num_in_req) {
        return -E2BIG;
      }

      auto rval = attrs.emplace(std::move(attr_name), ceph::bufferlist());
      /* At the moment the value of the freshly created attribute key-value
       * pair is an empty bufferlist. */

      ceph::bufferlist& bl = rval.first->second;
      bl.append(xattr.c_str(), xattr.size() + 1); /* store the trailing NUL too */
    }
  }

  return 0;
} /* rgw_get_request_metadata */
+
+/* Record an optional per-object expiration time as the
+ * RGW_ATTR_DELETE_AT xattr; a disengaged optional leaves attrs untouched. */
+inline void encode_delete_at_attr(boost::optional<ceph::real_time> delete_at,
+                                  std::map<std::string, bufferlist>& attrs)
+{
+  if (!delete_at) {
+    return;
+  }
+
+  bufferlist bl;
+  encode(*delete_at, bl);
+  attrs[RGW_ATTR_DELETE_AT] = std::move(bl);
+} /* encode_delete_at_attr */
+
+/* Serialize a parsed tag set into the RGW_ATTR_TAGS xattr.  A null
+ * obj_tags means the submitted tag format could not be parsed earlier
+ * (it wouldn't be readable by get/put obj tags anyway), so nothing is
+ * stored. */
+inline void encode_obj_tags_attr(RGWObjTags* obj_tags, std::map<std::string, bufferlist>& attrs)
+{
+  if (obj_tags == nullptr) {
+    return;
+  }
+
+  bufferlist bl;
+  obj_tags->encode(bl);
+  attrs[RGW_ATTR_TAGS] = std::move(bl);
+}
+
+/* Validate and store a Swift DLO manifest ("<container>/<prefix>") as the
+ * RGW_ATTR_USER_MANIFEST xattr.  Returns 0 on success, -EINVAL when the
+ * manifest lacks the '/' separator. */
+inline int encode_dlo_manifest_attr(const char * const dlo_manifest,
+                                    std::map<std::string, bufferlist>& attrs)
+{
+  if (strchr(dlo_manifest, '/') == nullptr) {
+    return -EINVAL;
+  }
+
+  bufferlist bl;
+  /* Keep the trailing NUL so readers may treat the value as a C string. */
+  bl.append(dlo_manifest, strlen(dlo_manifest) + 1);
+  attrs[RGW_ATTR_USER_MANIFEST] = std::move(bl);
+
+  return 0;
+} /* encode_dlo_manifest_attr */
+
+/* Finalize an MD5 digest and render it as the lowercase-hex ETag string.
+ * hash: the accumulated MD5 state; Final() is invoked here.
+ * etag: out parameter receiving the hex digest. */
+inline void complete_etag(MD5& hash, std::string *etag)
+{
+  char etag_buf[CEPH_CRYPTO_MD5_DIGESTSIZE];
+  /* 2 hex chars per digest byte, plus generous slack for the terminator. */
+  char etag_buf_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16];
+
+  hash.Final((unsigned char *)etag_buf);
+  buf_to_hex((const unsigned char *)etag_buf, CEPH_CRYPTO_MD5_DIGESTSIZE,
+	     etag_buf_str);
+
+  *etag = etag_buf_str;
+} /* complete_etag */
+
+using boost::container::flat_map;
+
+/* Retrieve a caller-specified set of attributes from an object.  Attribute
+ * names are registered up front via emplace_key(); execute() fills in the
+ * matching values, leaving std::nullopt for names that do not exist.
+ *
+ * Consistency fix: virtual members now carry `override` (and the redundant
+ * `virtual` is dropped), matching the sibling RGWSetAttrs declaration so the
+ * compiler verifies the base-class signatures. */
+class RGWGetAttrs : public RGWOp {
+public:
+  using get_attrs_t = flat_map<std::string, std::optional<buffer::list>>;
+protected:
+  get_attrs_t attrs;
+
+public:
+  RGWGetAttrs() {}
+
+  ~RGWGetAttrs() override {}
+
+  /* Register an attribute name to fetch; its value is filled in later. */
+  void emplace_key(std::string&& key) {
+    attrs.emplace(std::move(key), std::nullopt);
+  }
+
+  int verify_permission(optional_yield y) override;
+  void pre_exec() override;
+  void execute(optional_yield y) override;
+
+  virtual int get_params() = 0;
+  void send_response() override = 0;
+  const char* name() const override { return "get_attrs"; }
+  RGWOpType get_type() override { return RGW_OP_GET_ATTRS; }
+  uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
+}; /* RGWGetAttrs */
+
+/* Set/replace attributes on an object.  Attribute pairs are accumulated
+ * via emplace_attr() and written out by execute(). */
+class RGWSetAttrs : public RGWOp {
+protected:
+  std::map<std::string, buffer::list> attrs;
+
+public:
+  RGWSetAttrs() {}
+  ~RGWSetAttrs() override {}
+
+  /* Queue one (name, value) pair for the update. */
+  void emplace_attr(std::string&& key, buffer::list&& bl) {
+    attrs.emplace(std::move(key), std::move(bl));
+  }
+
+  int verify_permission(optional_yield y) override;
+  void pre_exec() override;
+  void execute(optional_yield y) override;
+
+  virtual int get_params(optional_yield y) = 0;
+  void send_response() override = 0;
+  const char* name() const override { return "set_attrs"; }
+  RGWOpType get_type() override { return RGW_OP_SET_ATTRS; }
+  uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; }
+};
+
+/* Remove a caller-specified set of attributes from an object.  Names are
+ * registered via emplace_key() (values are placeholders).
+ *
+ * Consistency fix: virtual members now carry `override` (and the redundant
+ * `virtual` is dropped), matching the sibling RGWSetAttrs declaration so the
+ * compiler verifies the base-class signatures. */
+class RGWRMAttrs : public RGWOp {
+protected:
+  rgw::sal::Attrs attrs;
+
+public:
+  RGWRMAttrs() {}
+
+  ~RGWRMAttrs() override {}
+
+  /* Register an attribute name to delete. */
+  void emplace_key(std::string&& key) {
+    attrs.emplace(std::move(key), buffer::list());
+  }
+
+  int verify_permission(optional_yield y) override;
+  void pre_exec() override;
+  void execute(optional_yield y) override;
+
+  virtual int get_params() = 0;
+  void send_response() override = 0;
+  const char* name() const override { return "rm_attrs"; }
+  RGWOpType get_type() override { return RGW_OP_DELETE_ATTRS; }
+  uint32_t op_mask() override { return RGW_OP_TYPE_DELETE; }
+}; /* RGWRMAttrs */
+
+/* Admin-only op: report the internal layout of an object.  Gated on the
+ * "admin" read capability rather than bucket/object ACLs. */
+class RGWGetObjLayout : public RGWOp {
+public:
+  RGWGetObjLayout() {}
+
+  /* Require the "admin" capability with read access. */
+  int check_caps(RGWUserCaps& caps) {
+    return caps.check_cap("admin", RGW_CAP_READ);
+  }
+  int verify_permission(optional_yield) override {
+    return check_caps(s->user->get_info().caps);
+  }
+  void pre_exec() override;
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "get_obj_layout"; }
+  RGWOpType get_type() override { return RGW_OP_GET_OBJ_LAYOUT; }
+  uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
+};
+
+/* Store a bucket policy document supplied in the request body. */
+class RGWPutBucketPolicy : public RGWOp {
+  bufferlist data; // raw policy document read from the request body
+public:
+  RGWPutBucketPolicy() = default;
+  ~RGWPutBucketPolicy() {
+  }
+  void send_response() override;
+  int verify_permission(optional_yield y) override;
+  uint32_t op_mask() override {
+    return RGW_OP_TYPE_WRITE;
+  }
+  void execute(optional_yield y) override;
+  int get_params(optional_yield y);
+  const char* name() const override { return "put_bucket_policy"; }
+  RGWOpType get_type() override {
+    return RGW_OP_PUT_BUCKET_POLICY;
+  }
+};
+
+/* Return the bucket policy document, if one is set. */
+class RGWGetBucketPolicy : public RGWOp {
+  buffer::list policy; // serialized policy returned to the client
+public:
+  RGWGetBucketPolicy() = default;
+  void send_response() override;
+  int verify_permission(optional_yield y) override;
+  uint32_t op_mask() override {
+    return RGW_OP_TYPE_READ;
+  }
+  void execute(optional_yield y) override;
+  const char* name() const override { return "get_bucket_policy"; }
+  RGWOpType get_type() override {
+    return RGW_OP_GET_BUCKET_POLICY;
+  }
+};
+
+/* Remove the bucket policy document from a bucket. */
+class RGWDeleteBucketPolicy : public RGWOp {
+public:
+  RGWDeleteBucketPolicy() = default;
+  void send_response() override;
+  int verify_permission(optional_yield y) override;
+  uint32_t op_mask() override {
+    return RGW_OP_TYPE_WRITE;
+  }
+  void execute(optional_yield y) override;
+  int get_params(optional_yield y);
+  const char* name() const override { return "delete_bucket_policy"; }
+  RGWOpType get_type() override {
+    return RGW_OP_DELETE_BUCKET_POLICY;
+  }
+};
+
+/* Configure the object-lock settings on a bucket.  get_params() (frontend
+ * specific) fills `data`; execute() parses it into `obj_lock`. */
+class RGWPutBucketObjectLock : public RGWOp {
+protected:
+  bufferlist data;        // raw request body
+  bufferlist obj_lock_bl; // encoded configuration to be stored
+  RGWObjectLock obj_lock; // parsed configuration
+public:
+  RGWPutBucketObjectLock() = default;
+  ~RGWPutBucketObjectLock() {}
+  int verify_permission(optional_yield y) override;
+  void pre_exec() override;
+  void execute(optional_yield y) override;
+  virtual void send_response() override = 0;
+  virtual int get_params(optional_yield y) = 0;
+  const char* name() const override { return "put_bucket_object_lock"; }
+  RGWOpType get_type() override { return RGW_OP_PUT_BUCKET_OBJ_LOCK; }
+  uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; }
+};
+
+/* Return the bucket's object-lock configuration. */
+class RGWGetBucketObjectLock : public RGWOp {
+public:
+  int verify_permission(optional_yield y) override;
+  void pre_exec() override;
+  void execute(optional_yield y) override;
+  virtual void send_response() override = 0;
+  const char* name() const override {return "get_bucket_object_lock"; }
+  RGWOpType get_type() override { return RGW_OP_GET_BUCKET_OBJ_LOCK; }
+  uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
+};
+
+/* Set the retention configuration on an object version. */
+class RGWPutObjRetention : public RGWOp {
+protected:
+  bufferlist data;                 // raw request body
+  RGWObjectRetention obj_retention;
+  // NOTE(review): bypass_perm defaults to true and bypass_governance_mode to
+  // false; their interaction is decided in verify_permission()/execute()
+  // (not visible here) — confirm semantics against the .cc implementation.
+  bool bypass_perm;
+  bool bypass_governance_mode;
+public:
+  RGWPutObjRetention():bypass_perm(true), bypass_governance_mode(false) {}
+  int verify_permission(optional_yield y) override;
+  void pre_exec() override;
+  void execute(optional_yield y) override;
+  virtual void send_response() override = 0;
+  virtual int get_params(optional_yield y) = 0;
+  const char* name() const override { return "put_obj_retention"; }
+  uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; }
+  RGWOpType get_type() override { return RGW_OP_PUT_OBJ_RETENTION; }
+};
+
+/* Return the retention configuration of an object version. */
+class RGWGetObjRetention : public RGWOp {
+protected:
+  RGWObjectRetention obj_retention;
+public:
+  int verify_permission(optional_yield y) override;
+  void pre_exec() override;
+  void execute(optional_yield y) override;
+  virtual void send_response() override = 0;
+  const char* name() const override {return "get_obj_retention"; }
+  RGWOpType get_type() override { return RGW_OP_GET_OBJ_RETENTION; }
+  uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
+};
+
+/* Set the legal-hold status on an object version. */
+class RGWPutObjLegalHold : public RGWOp {
+protected:
+  bufferlist data; // raw request body
+  RGWObjectLegalHold obj_legal_hold;
+public:
+  int verify_permission(optional_yield y) override;
+  void pre_exec() override;
+  void execute(optional_yield y) override;
+  virtual void send_response() override = 0;
+  virtual int get_params(optional_yield y) = 0;
+  const char* name() const override { return "put_obj_legal_hold"; }
+  uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; }
+  RGWOpType get_type() override { return RGW_OP_PUT_OBJ_LEGAL_HOLD; }
+};
+
+/* Return the legal-hold status of an object version. */
+class RGWGetObjLegalHold : public RGWOp {
+protected:
+  RGWObjectLegalHold obj_legal_hold;
+public:
+  int verify_permission(optional_yield y) override;
+  void pre_exec() override;
+  void execute(optional_yield y) override;
+  virtual void send_response() override = 0;
+  const char* name() const override {return "get_obj_legal_hold"; }
+  RGWOpType get_type() override { return RGW_OP_GET_OBJ_LEGAL_HOLD; }
+  uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
+};
+
+
+/* Configure metadata-search on a bucket: maps attribute names to the
+ * search-index type encoded as a uint32_t. */
+class RGWConfigBucketMetaSearch : public RGWOp {
+protected:
+  std::map<std::string, uint32_t> mdsearch_config;
+public:
+  RGWConfigBucketMetaSearch() {}
+
+  int verify_permission(optional_yield y) override;
+  void pre_exec() override;
+  void execute(optional_yield y) override;
+
+  virtual int get_params(optional_yield y) = 0;
+  const char* name() const override { return "config_bucket_meta_search"; }
+  virtual RGWOpType get_type() override { return RGW_OP_CONFIG_BUCKET_META_SEARCH; }
+  virtual uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; }
+};
+
+/* Return the bucket's metadata-search configuration.  execute() is a no-op:
+ * the response is built entirely from already-loaded bucket info. */
+class RGWGetBucketMetaSearch : public RGWOp {
+public:
+  RGWGetBucketMetaSearch() {}
+
+  int verify_permission(optional_yield y) override;
+  void pre_exec() override;
+  void execute(optional_yield) override {}
+
+  const char* name() const override { return "get_bucket_meta_search"; }
+  virtual RGWOpType get_type() override { return RGW_OP_GET_BUCKET_META_SEARCH; }
+  virtual uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
+};
+
+/* Remove the metadata-search configuration from a bucket.
+ *
+ * Fix: the original declared only `delete_type()`, which overrides nothing
+ * in RGWOp, so the generic get_type() machinery still reported the base
+ * default for this op.  A real get_type() override is added; delete_type()
+ * is kept for backward compatibility with any existing callers. */
+class RGWDelBucketMetaSearch : public RGWOp {
+public:
+  RGWDelBucketMetaSearch() {}
+
+  int verify_permission(optional_yield y) override;
+  void pre_exec() override;
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "delete_bucket_meta_search"; }
+  virtual RGWOpType delete_type() { return RGW_OP_DEL_BUCKET_META_SEARCH; }
+  RGWOpType get_type() override { return RGW_OP_DEL_BUCKET_META_SEARCH; }
+  uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; }
+};
+
+/* Admin op: report cluster-wide storage statistics. */
+class RGWGetClusterStat : public RGWOp {
+protected:
+  RGWClusterStat stats_op; // filled in by execute()
+public:
+  RGWGetClusterStat() {}
+
+  /* Simply forwards to the base init(); kept as an explicit hook point. */
+  void init(rgw::sal::Driver* driver, req_state *s, RGWHandler *h) override {
+    RGWOp::init(driver, s, h);
+  }
+  /* No per-user permission check: admin-API routing gates access. */
+  int verify_permission(optional_yield) override {return 0;}
+  virtual void send_response() override = 0;
+  virtual int get_params(optional_yield y) = 0;
+  void execute(optional_yield y) override;
+  const char* name() const override { return "get_cluster_stat"; }
+  dmc::client_id dmclock_client() override { return dmc::client_id::admin; }
+};
+
+/* Report whether a bucket is effectively public (policy status). */
+class RGWGetBucketPolicyStatus : public RGWOp {
+protected:
+  bool isPublic {false}; // set by execute(), rendered by the frontend
+public:
+  int verify_permission(optional_yield y) override;
+  const char* name() const override { return "get_bucket_policy_status"; }
+  virtual RGWOpType get_type() override { return RGW_OP_GET_BUCKET_POLICY_STATUS; }
+  virtual uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
+  void execute(optional_yield y) override;
+  dmc::client_id dmclock_client() override { return dmc::client_id::metadata; }
+};
+
+/* Store a PublicAccessBlock configuration on a bucket. */
+class RGWPutBucketPublicAccessBlock : public RGWOp {
+protected:
+  bufferlist data; // raw request body
+  PublicAccessBlockConfiguration access_conf;
+public:
+  int verify_permission(optional_yield y) override;
+  const char* name() const override { return "put_bucket_public_access_block";}
+  virtual RGWOpType get_type() override { return RGW_OP_PUT_BUCKET_PUBLIC_ACCESS_BLOCK; }
+  virtual uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; }
+  int get_params(optional_yield y);
+  void execute(optional_yield y) override;
+  dmc::client_id dmclock_client() override { return dmc::client_id::metadata; }
+};
+
+/* Return the bucket's PublicAccessBlock configuration. */
+class RGWGetBucketPublicAccessBlock : public RGWOp {
+protected:
+  PublicAccessBlockConfiguration access_conf;
+public:
+  int verify_permission(optional_yield y) override;
+  const char* name() const override { return "get_bucket_public_access_block";}
+  virtual RGWOpType get_type() override { return RGW_OP_GET_BUCKET_PUBLIC_ACCESS_BLOCK; }
+  virtual uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
+  int get_params(optional_yield y);
+  void execute(optional_yield y) override;
+  dmc::client_id dmclock_client() override { return dmc::client_id::metadata; }
+};
+
+/* Remove the PublicAccessBlock configuration from a bucket. */
+class RGWDeleteBucketPublicAccessBlock : public RGWOp {
+protected:
+  PublicAccessBlockConfiguration access_conf;
+public:
+  int verify_permission(optional_yield y) override;
+  const char* name() const override { return "delete_bucket_public_access_block";}
+  virtual RGWOpType get_type() override { return RGW_OP_DELETE_BUCKET_PUBLIC_ACCESS_BLOCK; }
+  virtual uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; }
+  int get_params(optional_yield y);
+  void execute(optional_yield y) override;
+  void send_response() override;
+  dmc::client_id dmclock_client() override { return dmc::client_id::metadata; }
+};
+
+/* Parse a decimal integer from `input`, clamp it to
+ * [lower_bound, upper_bound], and store it in `output`.  An empty input
+ * yields `default_val`.  Trailing whitespace is ignored; any other
+ * trailing garbage, or input with no digits at all, is rejected.
+ *
+ * Returns 0 on success, -EINVAL on malformed input.
+ *
+ * Fixes vs. the original: the strtol result was assigned to the int
+ * `output` *before* the bound checks, so values outside int range were
+ * silently truncated and then compared against the long bounds; parsing
+ * and clamping now happen in a long.  The always-true `if (endptr)` test
+ * (strtol always sets a non-null endptr) is gone. */
+inline int parse_value_and_bound(
+  const std::string &input,
+  int &output,
+  const long lower_bound,
+  const long upper_bound,
+  const long default_val)
+{
+  if (input.empty()) {
+    output = default_val;
+    return 0;
+  }
+
+  char *endptr = nullptr;
+  long val = strtol(input.c_str(), &endptr, 10);
+  if (endptr == input.c_str()) {
+    return -EINVAL; /* no digits consumed */
+  }
+  while (*endptr && isspace(*endptr)) { /* ignore trailing white space */
+    endptr++;
+  }
+  if (*endptr) {
+    return -EINVAL; /* trailing non-numeric garbage */
+  }
+
+  if (val > upper_bound) {
+    val = upper_bound;
+  }
+  if (val < lower_bound) {
+    val = lower_bound;
+  }
+  output = val;
+
+  return 0;
+}
+
+/* Decode an RGWAccessControlPolicy out of an attribute set (the ACL xattr).
+ * Returns 0 on success or a negative error code. */
+int rgw_policy_from_attrset(const DoutPrefixProvider *dpp,
+                            CephContext *cct,
+                            std::map<std::string, bufferlist>& attrset,
+                            RGWAccessControlPolicy *policy);
diff --git a/src/rgw/rgw_op_type.h b/src/rgw/rgw_op_type.h
new file mode 100644
index 000000000..375c7348b
--- /dev/null
+++ b/src/rgw/rgw_op_type.h
@@ -0,0 +1,133 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+/* Identifier for each logical RGW operation; op classes report theirs via
+ * RGWOp::get_type().  Ordering is append-only within each section — values
+ * are not persisted, but keep new entries at section ends to minimize
+ * churn. */
+enum RGWOpType {
+  RGW_OP_UNKNOWN = 0,
+  RGW_OP_GET_OBJ,
+  RGW_OP_LIST_BUCKETS,
+  RGW_OP_STAT_ACCOUNT,
+  RGW_OP_LIST_BUCKET,
+  RGW_OP_GET_BUCKET_LOGGING,
+  RGW_OP_GET_BUCKET_LOCATION,
+  RGW_OP_GET_BUCKET_VERSIONING,
+  RGW_OP_SET_BUCKET_VERSIONING,
+  RGW_OP_GET_BUCKET_WEBSITE,
+  RGW_OP_SET_BUCKET_WEBSITE,
+  RGW_OP_STAT_BUCKET,
+  RGW_OP_CREATE_BUCKET,
+  RGW_OP_DELETE_BUCKET,
+  RGW_OP_PUT_OBJ,
+  RGW_OP_STAT_OBJ,
+  RGW_OP_POST_OBJ,
+  RGW_OP_PUT_METADATA_ACCOUNT,
+  RGW_OP_PUT_METADATA_BUCKET,
+  RGW_OP_PUT_METADATA_OBJECT,
+  RGW_OP_SET_TEMPURL,
+  RGW_OP_DELETE_OBJ,
+  RGW_OP_COPY_OBJ,
+  RGW_OP_GET_ACLS,
+  RGW_OP_PUT_ACLS,
+  RGW_OP_GET_CORS,
+  RGW_OP_PUT_CORS,
+  RGW_OP_DELETE_CORS,
+  RGW_OP_OPTIONS_CORS,
+  RGW_OP_GET_BUCKET_ENCRYPTION,
+  RGW_OP_PUT_BUCKET_ENCRYPTION,
+  RGW_OP_DELETE_BUCKET_ENCRYPTION,
+  RGW_OP_GET_REQUEST_PAYMENT,
+  RGW_OP_SET_REQUEST_PAYMENT,
+  RGW_OP_INIT_MULTIPART,
+  RGW_OP_COMPLETE_MULTIPART,
+  RGW_OP_ABORT_MULTIPART,
+  RGW_OP_LIST_MULTIPART,
+  RGW_OP_LIST_BUCKET_MULTIPARTS,
+  RGW_OP_DELETE_MULTI_OBJ,
+  RGW_OP_BULK_DELETE,
+  RGW_OP_GET_KEYS,
+  RGW_OP_GET_ATTRS,
+  RGW_OP_DELETE_ATTRS,
+  RGW_OP_SET_ATTRS,
+  RGW_OP_GET_CROSS_DOMAIN_POLICY,
+  RGW_OP_GET_HEALTH_CHECK,
+  RGW_OP_GET_INFO,
+  RGW_OP_CREATE_ROLE,
+  RGW_OP_DELETE_ROLE,
+  RGW_OP_GET_ROLE,
+  RGW_OP_MODIFY_ROLE_TRUST_POLICY,
+  RGW_OP_LIST_ROLES,
+  RGW_OP_PUT_ROLE_POLICY,
+  RGW_OP_GET_ROLE_POLICY,
+  RGW_OP_LIST_ROLE_POLICIES,
+  RGW_OP_DELETE_ROLE_POLICY,
+  RGW_OP_TAG_ROLE,
+  RGW_OP_LIST_ROLE_TAGS,
+  RGW_OP_UNTAG_ROLE,
+  RGW_OP_UPDATE_ROLE,
+  RGW_OP_PUT_BUCKET_POLICY,
+  RGW_OP_GET_BUCKET_POLICY,
+  RGW_OP_DELETE_BUCKET_POLICY,
+  RGW_OP_PUT_OBJ_TAGGING,
+  RGW_OP_GET_OBJ_TAGGING,
+  RGW_OP_DELETE_OBJ_TAGGING,
+  RGW_OP_PUT_LC,
+  RGW_OP_GET_LC,
+  RGW_OP_DELETE_LC,
+  RGW_OP_PUT_USER_POLICY,
+  RGW_OP_GET_USER_POLICY,
+  RGW_OP_LIST_USER_POLICIES,
+  RGW_OP_DELETE_USER_POLICY,
+  RGW_OP_PUT_BUCKET_OBJ_LOCK,
+  RGW_OP_GET_BUCKET_OBJ_LOCK,
+  RGW_OP_PUT_OBJ_RETENTION,
+  RGW_OP_GET_OBJ_RETENTION,
+  RGW_OP_PUT_OBJ_LEGAL_HOLD,
+  RGW_OP_GET_OBJ_LEGAL_HOLD,
+  /* rgw specific */
+  RGW_OP_ADMIN_SET_METADATA,
+  RGW_OP_GET_OBJ_LAYOUT,
+  RGW_OP_BULK_UPLOAD,
+  RGW_OP_METADATA_SEARCH,
+  RGW_OP_CONFIG_BUCKET_META_SEARCH,
+  RGW_OP_GET_BUCKET_META_SEARCH,
+  RGW_OP_DEL_BUCKET_META_SEARCH,
+  RGW_OP_SYNC_DATALOG_NOTIFY,
+  RGW_OP_SYNC_DATALOG_NOTIFY2,
+  RGW_OP_SYNC_MDLOG_NOTIFY,
+  RGW_OP_PERIOD_POST,
+  /* sts specific*/
+  RGW_STS_ASSUME_ROLE,
+  RGW_STS_GET_SESSION_TOKEN,
+  RGW_STS_ASSUME_ROLE_WEB_IDENTITY,
+  /* pubsub */
+  RGW_OP_PUBSUB_TOPIC_CREATE,
+  RGW_OP_PUBSUB_TOPICS_LIST,
+  RGW_OP_PUBSUB_TOPIC_GET,
+  RGW_OP_PUBSUB_TOPIC_DELETE,
+  RGW_OP_PUBSUB_SUB_CREATE,
+  RGW_OP_PUBSUB_SUB_GET,
+  RGW_OP_PUBSUB_SUB_DELETE,
+  RGW_OP_PUBSUB_SUB_PULL,
+  RGW_OP_PUBSUB_SUB_ACK,
+  RGW_OP_PUBSUB_NOTIF_CREATE,
+  RGW_OP_PUBSUB_NOTIF_DELETE,
+  RGW_OP_PUBSUB_NOTIF_LIST,
+  RGW_OP_GET_BUCKET_TAGGING,
+  RGW_OP_PUT_BUCKET_TAGGING,
+  RGW_OP_DELETE_BUCKET_TAGGING,
+  RGW_OP_GET_BUCKET_REPLICATION,
+  RGW_OP_PUT_BUCKET_REPLICATION,
+  RGW_OP_DELETE_BUCKET_REPLICATION,
+  /* public access */
+  RGW_OP_GET_BUCKET_POLICY_STATUS,
+  RGW_OP_PUT_BUCKET_PUBLIC_ACCESS_BLOCK,
+  RGW_OP_GET_BUCKET_PUBLIC_ACCESS_BLOCK,
+  RGW_OP_DELETE_BUCKET_PUBLIC_ACCESS_BLOCK,
+  /*OIDC provider specific*/
+  RGW_OP_CREATE_OIDC_PROVIDER,
+  RGW_OP_DELETE_OIDC_PROVIDER,
+  RGW_OP_GET_OIDC_PROVIDER,
+  RGW_OP_LIST_OIDC_PROVIDERS,
+};
+
diff --git a/src/rgw/rgw_opa.cc b/src/rgw/rgw_opa.cc
new file mode 100644
index 000000000..7422615ae
--- /dev/null
+++ b/src/rgw/rgw_opa.cc
@@ -0,0 +1,97 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_opa.h"
+#include "rgw_http_client.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+/* Authorize the in-flight request by POSTing a JSON description of it to
+ * the configured OPA (Open Policy Agent) endpoint and inspecting the
+ * boolean "result" field of the reply.
+ *
+ * Returns 0 when OPA permits the request; -ERR_INVALID_REQUEST when no
+ * OPA URL is configured; -EINVAL on a malformed reply; -EPERM on denial;
+ * or the HTTP transport error.
+ *
+ * Fix: `opa_result` was previously left uninitialized, so a reply without
+ * a boolean "result" field made the decision depend on indeterminate
+ * stack contents (UB).  It now defaults to false, i.e. we fail closed. */
+int rgw_opa_authorize(RGWOp *& op,
+                      req_state * const s)
+{
+
+  ldpp_dout(op, 2) << "authorizing request using OPA" << dendl;
+
+  /* get OPA url */
+  const string& opa_url = s->cct->_conf->rgw_opa_url;
+  if (opa_url == "") {
+    ldpp_dout(op, 2) << "OPA_URL not provided" << dendl;
+    return -ERR_INVALID_REQUEST;
+  }
+  ldpp_dout(op, 2) << "OPA URL= " << opa_url.c_str() << dendl;
+
+  /* get authentication token for OPA */
+  const string& opa_token = s->cct->_conf->rgw_opa_token;
+
+  int ret;
+  bufferlist bl;
+  RGWHTTPTransceiver req(s->cct, "POST", opa_url.c_str(), &bl);
+
+  /* set required headers for OPA request */
+  req.append_header("X-Auth-Token", opa_token);
+  req.append_header("Content-Type", "application/json");
+  req.append_header("Expect", "100-continue");
+
+  /* check if we want to verify OPA server SSL certificate */
+  req.set_verify_ssl(s->cct->_conf->rgw_opa_verify_ssl);
+
+  /* create json request body */
+  JSONFormatter jf;
+  jf.open_object_section("");
+  jf.open_object_section("input");
+  const char *request_method = s->info.env->get("REQUEST_METHOD");
+  if (request_method) {
+    jf.dump_string("method", request_method);
+  }
+  jf.dump_string("relative_uri", s->relative_uri.c_str());
+  jf.dump_string("decoded_uri", s->decoded_uri.c_str());
+  jf.dump_string("params", s->info.request_params.c_str());
+  jf.dump_string("request_uri_aws4", s->info.request_uri_aws4.c_str());
+  if (s->object) {
+    jf.dump_string("object_name", s->object->get_name().c_str());
+  }
+  if (s->auth.identity) {
+    jf.dump_string("subuser", s->auth.identity->get_subuser().c_str());
+  }
+  if (s->user) {
+    jf.dump_object("user_info", s->user->get_info());
+  }
+  if (s->bucket) {
+    jf.dump_object("bucket_info", s->bucket->get_info());
+  }
+  jf.close_section();
+  jf.close_section();
+
+  std::stringstream ss;
+  jf.flush(ss);
+  req.set_post_data(ss.str());
+  req.set_send_length(ss.str().length());
+
+  /* send request */
+  ret = req.process(null_yield);
+  if (ret < 0) {
+    ldpp_dout(op, 2) << "OPA process error:" << bl.c_str() << dendl;
+    return ret;
+  }
+
+  /* check OPA response */
+  JSONParser parser;
+  if (!parser.parse(bl.c_str(), bl.length())) {
+    ldpp_dout(op, 2) << "OPA parse error: malformed json" << dendl;
+    return -EINVAL;
+  }
+
+  /* fail closed: if the reply carries no boolean "result" field, the
+   * default of false below denies the request */
+  bool opa_result = false;
+  JSONDecoder::decode_json("result", opa_result, &parser);
+
+  if (opa_result == false) {
+    ldpp_dout(op, 2) << "OPA rejecting request" << dendl;
+    return -EPERM;
+  }
+
+  ldpp_dout(op, 2) << "OPA accepting request" << dendl;
+  return 0;
+}
diff --git a/src/rgw/rgw_opa.h b/src/rgw/rgw_opa.h
new file mode 100644
index 000000000..6fd3b21bd
--- /dev/null
+++ b/src/rgw/rgw_opa.h
@@ -0,0 +1,11 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_common.h"
+#include "rgw_op.h"
+
+/* Authorize the in-flight request by consulting the configured OPA
+ * (Open Policy Agent) endpoint; returns 0 when OPA permits the request,
+ * a negative error code otherwise.  Implemented in rgw_opa.cc. */
+int rgw_opa_authorize(RGWOp*& op,
+                      req_state* s);
diff --git a/src/rgw/rgw_orphan.cc b/src/rgw/rgw_orphan.cc
new file mode 100644
index 000000000..a8b4f5296
--- /dev/null
+++ b/src/rgw/rgw_orphan.cc
@@ -0,0 +1,1598 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <string>
+
+
+#include "common/config.h"
+#include "common/Formatter.h"
+#include "common/errno.h"
+
+#include "rgw_op.h"
+#include "rgw_multi.h"
+#include "rgw_orphan.h"
+#include "rgw_zone.h"
+#include "rgw_bucket.h"
+#include "rgw_sal_rados.h"
+
+#include "services/svc_zone.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+#define DEFAULT_NUM_SHARDS 64
+
+using namespace std;
+
+/* Reduce a raw RADOS oid ("<bucket marker>_<obj>") to a fingerprint used
+ * to shard/deduplicate orphan-scan entries: head objects are returned
+ * verbatim, namespaced objects get any trailing numeric/'.'/'_' suffix
+ * (e.g. a shadow-part ordinal) stripped.
+ * force_ns: when non-null, the key is re-marshaled through rgw_obj before
+ *   fingerprinting.  NOTE(review): the force_ns *value* is never applied to
+ *   key.ns — confirm whether the namespace override was intended. */
+static string obj_fingerprint(const string& oid, const char *force_ns = NULL)
+{
+  ssize_t pos = oid.find('_');
+  if (pos < 0) {
+    /* NOTE(review): only reports the anomaly; execution continues and the
+     * substr calls below operate with pos == npos-as-(-1). */
+    cerr << "ERROR: object does not have a bucket marker: " << oid << std::endl;
+  }
+
+  string obj_marker = oid.substr(0, pos);
+
+  rgw_obj_key key;
+
+  rgw_obj_key::parse_raw_oid(oid.substr(pos + 1), &key);
+
+  if (key.ns.empty()) {
+    /* head object: use the oid as-is */
+    return oid;
+  }
+
+  string s = oid;
+
+  if (force_ns) {
+    rgw_bucket b;
+    rgw_obj new_obj(b, key);
+    s = obj_marker + "_" + new_obj.get_oid();
+  }
+
+  /* cut out suffix: scan back over at most 10 trailing digit/'.'/'_'
+   * characters (covers part/shadow ordinals) */
+  size_t i = s.size() - 1;
+  for (; i >= s.size() - 10; --i) {
+    char c = s[i];
+    if (!isdigit(c) && c != '.' && c != '_') {
+      break;
+    }
+  }
+
+  return s.substr(0, i + 1);
+}
+
+/* Load the persisted state of an orphan-scan job from the index object's
+ * omap.  Returns 0 on success, -ENOENT when no such job is recorded, -EIO
+ * when the stored blob cannot be decoded, or the librados error. */
+int RGWOrphanStore::read_job(const string& job_name, RGWOrphanSearchState & state)
+{
+  set<string> wanted{job_name};
+  map<string, bufferlist> found;
+  const int r = ioctx.omap_get_vals_by_keys(oid, wanted, &found);
+  if (r < 0) {
+    return r;
+  }
+
+  auto it = found.find(job_name);
+  if (it == found.end()) {
+    return -ENOENT;
+  }
+
+  try {
+    decode(state, it->second);
+  } catch (buffer::error& err) {
+    lderr(store->ctx()) << "ERROR: could not decode buffer" << dendl;
+    return -EIO;
+  }
+
+  return 0;
+}
+
+/* Persist the state of an orphan-scan job under its name in the index
+ * object's omap.  Returns 0 on success or the librados error. */
+int RGWOrphanStore::write_job(const string& job_name, const RGWOrphanSearchState& state)
+{
+  bufferlist bl;
+  encode(state, bl);
+
+  map<string, bufferlist> vals = {{job_name, bl}};
+  const int r = ioctx.omap_set(oid, vals);
+  return (r < 0) ? r : 0;
+}
+
+/* Drop a job's entry from the index object's omap.  Returns 0 on success
+ * or the librados error. */
+int RGWOrphanStore::remove_job(const string& job_name)
+{
+  const set<string> keys{job_name};
+  const int r = ioctx.omap_rm_keys(oid, keys);
+  return (r < 0) ? r : 0;
+}
+
+/* Enumerate all recorded orphan-scan jobs, decoding each omap value into
+ * job_list keyed by job name.  Returns 0 on success, -EIO on a corrupt
+ * entry, or the librados error. */
+int RGWOrphanStore::list_jobs(map <string,RGWOrphanSearchState>& job_list)
+{
+  map <string,bufferlist> vals;
+  int MAX_READ=1024;
+  string marker="";
+  int r = 0;
+
+  // loop through all the omap vals from index object, storing them to job_list,
+  // read in batches of 1024, we update the marker every iteration and exit the
+  // loop when we find that total size read out is less than batch size
+  do {
+    // NOTE(review): `vals` is an out-param assumed to be replaced (not
+    // appended to) by omap_get_vals on each pass — confirm against librados.
+    r = ioctx.omap_get_vals(oid, marker, MAX_READ, &vals);
+    if (r < 0) {
+      return r;
+    }
+    r = vals.size();
+
+    for (const auto &it : vals) {
+      marker=it.first;
+      RGWOrphanSearchState state;
+      try {
+        // decode consumes the bufferlist, so work on a copy of the value
+        bufferlist bl = it.second;
+        decode(state, bl);
+      } catch (buffer::error& err) {
+        lderr(store->ctx()) << "ERROR: could not decode buffer" << dendl;
+        return -EIO;
+      }
+      job_list[it.first] = state;
+    }
+  } while (r == MAX_READ);
+
+  return 0;
+}
+
+/* Open an IoCtx on the zone's log pool, where orphan-scan index objects
+ * live.  Returns 0 on success or the rgw_init_ioctx error.
+ * Fix: the error message had an unbalanced '(' around the pool name. */
+int RGWOrphanStore::init(const DoutPrefixProvider *dpp)
+{
+  const rgw_pool& log_pool = static_cast<rgw::sal::RadosStore*>(store)->svc()->zone->get_zone_params().log_pool;
+  int r = rgw_init_ioctx(dpp, static_cast<rgw::sal::RadosStore*>(store)->getRados()->get_rados_handle(), log_pool, ioctx);
+  if (r < 0) {
+    cerr << "ERROR: failed to open log pool (" << log_pool << ") ret=" << r << std::endl;
+    return r;
+  }
+
+  return 0;
+}
+
+/* Write a batch of omap entries onto the given index object.
+ * Returns 0 on success or the rgw_rados_operate error.
+ * Fix: the operate() failure was logged but swallowed — the function
+ * unconditionally returned 0, so callers believed the entries persisted. */
+int RGWOrphanStore::store_entries(const DoutPrefixProvider *dpp, const string& oid, const map<string, bufferlist>& entries)
+{
+  librados::ObjectWriteOperation op;
+  op.omap_set(entries);
+  cout << "storing " << entries.size() << " entries at " << oid << std::endl;
+  ldpp_dout(dpp, 20) << "storing " << entries.size() << " entries at " << oid << ": " << dendl;
+  for (map<string, bufferlist>::const_iterator iter = entries.begin(); iter != entries.end(); ++iter) {
+    ldpp_dout(dpp, 20) << " > " << iter->first << dendl;
+  }
+  int ret = rgw_rados_operate(dpp, ioctx, oid, &op, null_yield);
+  if (ret < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: " << __func__ << "(" << oid << ") returned ret=" << ret << dendl;
+    return ret; /* propagate the failure to the caller */
+  }
+
+  return 0;
+}
+
+/* Read up to MAX_OMAP_GET omap entries after `marker` from the given index
+ * object; *truncated reports whether a full batch was returned (so more
+ * entries may remain).  -ENOENT (object absent) is treated as an empty
+ * listing.  Returns 0 on success or the librados error.
+ * Fix: real errors were logged but swallowed, letting callers consume a
+ * stale/undefined `entries` map — they are now propagated. */
+int RGWOrphanStore::read_entries(const string& oid, const string& marker, map<string, bufferlist> *entries, bool *truncated)
+{
+#define MAX_OMAP_GET 100
+  int ret = ioctx.omap_get_vals(oid, marker, MAX_OMAP_GET, entries);
+  if (ret < 0 && ret != -ENOENT) {
+    cerr << "ERROR: " << __func__ << "(" << oid << ") returned ret=" << cpp_strerror(-ret) << std::endl;
+    return ret;
+  }
+
+  *truncated = (entries->size() == MAX_OMAP_GET);
+
+  return 0;
+}
+
+/* Initialize (or resume) an orphan-scan job.  An existing job's persisted
+ * state is loaded; otherwise, when `info` is provided, a fresh job is
+ * created at ORPHAN_SEARCH_STAGE_INIT and persisted.  Also precomputes the
+ * per-shard index object names.  Returns 0 on success, -ENOENT when the
+ * job does not exist and no `info` was given, or another negative error. */
+int RGWOrphanSearch::init(const DoutPrefixProvider *dpp, const string& job_name, RGWOrphanSearchInfo *info, bool _detailed_mode)
+{
+  int r = orphan_store.init(dpp);
+  if (r < 0) {
+    return r;
+  }
+
+  constexpr int64_t MAX_LIST_OBJS_ENTRIES=100;
+
+  max_list_bucket_entries = std::max(store->ctx()->_conf->rgw_list_bucket_min_readahead,
+                                     MAX_LIST_OBJS_ENTRIES);
+
+  detailed_mode = _detailed_mode;
+  RGWOrphanSearchState state;
+  r = orphan_store.read_job(job_name, state);
+  if (r < 0 && r != -ENOENT) {
+    ldpp_dout(dpp, -1) << "ERROR: failed to read state ret=" << r << dendl;
+    return r;
+  }
+
+  if (r == 0) {
+    /* resume: pick up where the persisted job left off */
+    search_info = state.info;
+    search_stage = state.stage;
+  } else if (info) { /* r == -ENOENT, initiate a new job if info was provided */
+    search_info = *info;
+    search_info.job_name = job_name;
+    search_info.num_shards = (info->num_shards ? info->num_shards : DEFAULT_NUM_SHARDS);
+    search_info.start_time = ceph_clock_now();
+    search_stage = RGWOrphanSearchStage(ORPHAN_SEARCH_STAGE_INIT);
+
+    r = save_state();
+    if (r < 0) {
+      ldpp_dout(dpp, -1) << "ERROR: failed to write state ret=" << r << dendl;
+      return r;
+    }
+  } else {
+    ldpp_dout(dpp, -1) << "ERROR: job not found" << dendl;
+    return r;
+  }
+
+  /* per-shard index object names: <prefix>.<job>.{rados,buckets,linked}.<shard> */
+  index_objs_prefix = RGW_ORPHAN_INDEX_PREFIX + string(".");
+  index_objs_prefix += job_name;
+
+  for (int i = 0; i < search_info.num_shards; i++) {
+    char buf[128];
+
+    snprintf(buf, sizeof(buf), "%s.rados.%d", index_objs_prefix.c_str(), i);
+    all_objs_index[i] = buf;
+
+    snprintf(buf, sizeof(buf), "%s.buckets.%d", index_objs_prefix.c_str(), i);
+    buckets_instance_index[i] = buf;
+
+    snprintf(buf, sizeof(buf), "%s.linked.%d", index_objs_prefix.c_str(), i);
+    linked_objs_index[i] = buf;
+  }
+  return 0;
+}
+
+/* Flush per-shard oid lists into their log objects' omaps, round-robining
+ * across shards in batches of MAX_OMAP_SET_ENTRIES so no single shard's
+ * write dominates.  log_shards maps shard -> log object name; oids maps
+ * shard -> pending oid list.  Returns 0 on success or the store error. */
+int RGWOrphanSearch::log_oids(const DoutPrefixProvider *dpp, map<int, string>& log_shards, map<int, list<string> >& oids)
+{
+  map<int, list<string> >::iterator miter = oids.begin();
+
+  list<log_iter_info> liters; /* a list of iterator pairs for begin and end */
+
+  for (; miter != oids.end(); ++miter) {
+    log_iter_info info;
+    info.oid = log_shards[miter->first];
+    info.cur = miter->second.begin();
+    info.end = miter->second.end();
+    liters.push_back(info);
+  }
+
+  list<log_iter_info>::iterator list_iter;
+  while (!liters.empty()) {
+    list_iter = liters.begin();
+
+    while (list_iter != liters.end()) {
+      log_iter_info& cur_info = *list_iter;
+
+      list<string>::iterator& cur = cur_info.cur;
+      list<string>::iterator& end = cur_info.end;
+
+      /* take at most one batch from this shard, then move to the next */
+      map<string, bufferlist> entries;
+#define MAX_OMAP_SET_ENTRIES 100
+      for (int j = 0; cur != end && j != MAX_OMAP_SET_ENTRIES; ++cur, ++j) {
+        ldpp_dout(dpp, 20) << "adding obj: " << *cur << dendl;
+        entries[*cur] = bufferlist();
+      }
+
+      int ret = orphan_store.store_entries(dpp, cur_info.oid, entries);
+      if (ret < 0) {
+        return ret;
+      }
+      /* advance before a possible erase so the loop iterator stays valid */
+      list<log_iter_info>::iterator tmp = list_iter;
+      ++list_iter;
+      if (cur == end) {
+        liters.erase(tmp); /* this shard is fully flushed */
+      }
+    }
+  }
+  return 0;
+}
+
+/* Stage 1 of the orphan scan: iterate over every object in the data pool
+ * (all namespaces), fingerprint the non-head oids, and log them sharded
+ * into the all_objs_index objects, flushing every COUNT_BEFORE_FLUSH
+ * entries.  Head objects are skipped — removing them would race with
+ * object rewrites.  Returns 0 on success or a negative error. */
+int RGWOrphanSearch::build_all_oids_index(const DoutPrefixProvider *dpp)
+{
+  librados::IoCtx ioctx;
+
+  int ret = rgw_init_ioctx(dpp, static_cast<rgw::sal::RadosStore*>(store)->getRados()->get_rados_handle(), search_info.pool, ioctx);
+  if (ret < 0) {
+    ldpp_dout(dpp, -1) << __func__ << ": rgw_init_ioctx() returned ret=" << ret << dendl;
+    return ret;
+  }
+
+  ioctx.set_namespace(librados::all_nspaces);
+  librados::NObjectIterator i = ioctx.nobjects_begin();
+  librados::NObjectIterator i_end = ioctx.nobjects_end();
+
+  map<int, list<string> > oids; // shard -> pending oids awaiting flush
+
+  int count = 0;
+  uint64_t total = 0;
+
+  cout << "logging all objects in the pool" << std::endl;
+
+  for (; i != i_end; ++i) {
+    string nspace = i->get_nspace();
+    string oid = i->get_oid();
+    string locator = i->get_locator();
+
+    ssize_t pos = oid.find('_');
+    if (pos < 0) {
+      cout << "unidentified oid: " << oid << ", skipping" << std::endl;
+      /* what is this object, oids should be in the format of <bucket marker>_<obj>,
+       * skip this entry
+       */
+      continue;
+    }
+    string stripped_oid = oid.substr(pos + 1);
+    rgw_obj_key key;
+    if (!rgw_obj_key::parse_raw_oid(stripped_oid, &key)) {
+      cout << "cannot parse oid: " << oid << ", skipping" << std::endl;
+      continue;
+    }
+
+    if (key.ns.empty()) {
+      /* skipping head objects, we don't want to remove these as they are mutable and
+       * cleaning them up is racy (can race with object removal and a later recreation)
+       */
+      cout << "skipping head object: oid=" << oid << std::endl;
+      continue;
+    }
+
+    string oid_fp = obj_fingerprint(oid);
+
+    // NOTE(review): uses ldout(store->ctx(), ...) while the rest of this
+    // file uses ldpp_dout(dpp, ...) — consider unifying.
+    ldout(store->ctx(), 20) << "oid_fp=" << oid_fp << dendl;
+
+    int shard = orphan_shard(oid_fp);
+    oids[shard].push_back(oid);
+
+#define COUNT_BEFORE_FLUSH 1000
+    ++total;
+    if (++count >= COUNT_BEFORE_FLUSH) {
+      ldout(store->ctx(), 1) << "iterated through " << total << " objects" << dendl;
+      ret = log_oids(dpp, all_objs_index, oids);
+      if (ret < 0) {
+        cerr << __func__ << ": ERROR: log_oids() returned ret=" << ret << std::endl;
+        return ret;
+      }
+      count = 0;
+      oids.clear();
+    }
+  }
+  /* flush the final partial batch */
+  ret = log_oids(dpp, all_objs_index, oids);
+  if (ret < 0) {
+    cerr << __func__ << ": ERROR: log_oids() returned ret=" << ret << std::endl;
+    return ret;
+  }
+
+  return 0;
+}
+
+/* Stage 2 of the orphan scan: enumerate all "bucket.instance" metadata
+ * keys and log them sharded into the buckets_instance_index objects,
+ * flushing every COUNT_BEFORE_FLUSH entries.  Returns 0 on success or a
+ * negative error.
+ *
+ * Fixes: the meta-listing handle was leaked on the error-return paths
+ * (meta_list_keys_complete was only called on success); the unused local
+ * RGWObjectCtx is removed. */
+int RGWOrphanSearch::build_buckets_instance_index(const DoutPrefixProvider *dpp)
+{
+  void *handle;
+  int max = 1000;
+  string section = "bucket.instance";
+  int ret = store->meta_list_keys_init(dpp, section, string(), &handle);
+  if (ret < 0) {
+    ldpp_dout(dpp, -1) << "ERROR: can't get key: " << cpp_strerror(-ret) << dendl;
+    return ret;
+  }
+
+  map<int, list<string> > instances; // shard -> pending instance keys
+
+  bool truncated;
+
+  int count = 0;
+  uint64_t total = 0;
+
+  do {
+    list<string> keys;
+    ret = store->meta_list_keys_next(dpp, handle, max, keys, &truncated);
+    if (ret < 0) {
+      ldpp_dout(dpp, -1) << "ERROR: lists_keys_next(): " << cpp_strerror(-ret) << dendl;
+      store->meta_list_keys_complete(handle); /* release listing handle */
+      return ret;
+    }
+
+    for (list<string>::iterator iter = keys.begin(); iter != keys.end(); ++iter) {
+      ++total;
+      ldpp_dout(dpp, 10) << "bucket_instance=" << *iter << " total=" << total << dendl;
+      int shard = orphan_shard(*iter);
+      instances[shard].push_back(*iter);
+
+      if (++count >= COUNT_BEFORE_FLUSH) {
+        ret = log_oids(dpp, buckets_instance_index, instances);
+        if (ret < 0) {
+          ldpp_dout(dpp, -1) << __func__ << ": ERROR: log_oids() returned ret=" << ret << dendl;
+          store->meta_list_keys_complete(handle); /* release listing handle */
+          return ret;
+        }
+        count = 0;
+        instances.clear();
+      }
+    }
+
+  } while (truncated);
+
+  store->meta_list_keys_complete(handle);
+
+  /* flush the final partial batch */
+  ret = log_oids(dpp, buckets_instance_index, instances);
+  if (ret < 0) {
+    ldpp_dout(dpp, -1) << __func__ << ": ERROR: log_oids() returned ret=" << ret << dendl;
+    return ret;
+  }
+
+  return 0;
+}
+
+int RGWOrphanSearch::handle_stat_result(const DoutPrefixProvider *dpp, map<int, list<string> >& oids, RGWRados::Object::Stat::Result& result)
+{
+ set<string> obj_oids;
+ rgw_bucket& bucket = result.obj.bucket;
+ if (!result.manifest) { /* a very very old object, or part of a multipart upload during upload */
+ const string loc = bucket.bucket_id + "_" + result.obj.get_oid();
+ obj_oids.insert(obj_fingerprint(loc));
+
+ /*
+ * multipart parts don't have manifest on them, it's in the meta object. Instead of reading the
+ * meta object, just add a "shadow" object to the mix
+ */
+ obj_oids.insert(obj_fingerprint(loc, "shadow"));
+ } else {
+ RGWObjManifest& manifest = *result.manifest;
+
+ if (!detailed_mode &&
+ manifest.get_obj_size() <= manifest.get_head_size()) {
+ ldpp_dout(dpp, 5) << "skipping object as it fits in a head" << dendl;
+ return 0;
+ }
+
+ RGWObjManifest::obj_iterator miter;
+ for (miter = manifest.obj_begin(dpp); miter != manifest.obj_end(dpp); ++miter) {
+ const rgw_raw_obj& loc = miter.get_location().get_raw_obj(store->getRados());
+ string s = loc.oid;
+ obj_oids.insert(obj_fingerprint(s));
+ }
+ }
+
+ for (set<string>::iterator iter = obj_oids.begin(); iter != obj_oids.end(); ++iter) {
+ ldpp_dout(dpp, 20) << __func__ << ": oid for obj=" << result.obj << ": " << *iter << dendl;
+
+ int shard = orphan_shard(*iter);
+ oids[shard].push_back(*iter);
+ }
+
+ return 0;
+}
+
+int RGWOrphanSearch::pop_and_handle_stat_op(const DoutPrefixProvider *dpp, map<int, list<string> >& oids, std::deque<RGWRados::Object::Stat>& ops)
+{
+ RGWRados::Object::Stat& front_op = ops.front();
+
+ int ret = front_op.wait(dpp);
+ if (ret < 0) {
+ if (ret != -ENOENT) {
+ ldpp_dout(dpp, -1) << "ERROR: stat_async() returned error: " << cpp_strerror(-ret) << dendl;
+ }
+ goto done;
+ }
+ ret = handle_stat_result(dpp, oids, front_op.result);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: handle_stat_response() returned error: " << cpp_strerror(-ret) << dendl;
+ }
+done:
+ ops.pop_front();
+ return ret;
+}
+
/* For one bucket instance: list every bucket-index entry, stat the matching
 * rados head objects asynchronously (bounded by max_concurrent_ios), and have
 * handle_stat_result() record the fingerprints of all reachable rados objects
 * into the per-shard 'oids' map. Shards are flushed to linked_objs_index once
 * the map holds COUNT_BEFORE_FLUSH shards. Stale instances and buckets that
 * are mid-reshard are skipped; ENOENT (bucket deleted mid-scan) is success. */
int RGWOrphanSearch::build_linked_oids_for_bucket(const DoutPrefixProvider *dpp, const string& bucket_instance_id, map<int, list<string> >& oids)
{
  RGWObjectCtx obj_ctx(store);
  rgw_bucket orphan_bucket;
  int shard_id;
  int ret = rgw_bucket_parse_bucket_key(store->ctx(), bucket_instance_id,
                                        &orphan_bucket, &shard_id);
  if (ret < 0) {
    ldpp_dout(dpp, 0) << __func__ << " failed to parse bucket instance: "
                      << bucket_instance_id << " skipping" << dendl;
    return ret;
  }

  std::unique_ptr<rgw::sal::Bucket> cur_bucket;
  ret = store->get_bucket(dpp, nullptr, orphan_bucket, &cur_bucket, null_yield);
  if (ret < 0) {
    if (ret == -ENOENT) {
      /* probably raced with bucket removal */
      return 0;
    }
    ldpp_dout(dpp, -1) << __func__ << ": ERROR: RGWRados::get_bucket_instance_info() returned ret=" << ret << dendl;
    return ret;
  }

  // a stale instance id means the bucket was resharded or recreated; its
  // objects will be covered under the current instance instead
  if (cur_bucket->get_bucket_id() != orphan_bucket.bucket_id) {
    ldpp_dout(dpp, 0) << __func__ << ": Skipping stale bucket instance: "
                      << orphan_bucket.name << ": "
                      << orphan_bucket.bucket_id << dendl;
    return 0;
  }

  // skip buckets whose index is currently being resharded
  if (cur_bucket->get_info().layout.resharding != rgw::BucketReshardState::None) {
    ldpp_dout(dpp, 0) << __func__ << ": reshard in progress. Skipping "
                      << orphan_bucket.name << ": "
                      << orphan_bucket.bucket_id << dendl;
    return 0;
  }

  // NOTE(review): this second lookup (key without instance id) is not used
  // below beyond its error check -- confirm whether it is still needed
  rgw_bucket b;
  rgw_bucket_parse_bucket_key(store->ctx(), bucket_instance_id, &b, nullptr);
  std::unique_ptr<rgw::sal::Bucket> bucket;
  ret = store->get_bucket(dpp, nullptr, b, &bucket, null_yield);
  if (ret < 0) {
    if (ret == -ENOENT) {
      /* probably raced with bucket removal */
      return 0;
    }
    ldpp_dout(dpp, -1) << __func__ << ": ERROR: RGWRados::get_bucket_instance_info() returned ret=" << ret << dendl;
    return ret;
  }

  ldpp_dout(dpp, 10) << "building linked oids for bucket instance: " << bucket_instance_id << dendl;
  RGWRados::Bucket target(store->getRados(), cur_bucket->get_info());
  RGWRados::Bucket::List list_op(&target);

  string marker;
  list_op.params.marker = rgw_obj_key(marker);
  list_op.params.list_versions = true;  // visit every version of each object
  list_op.params.enforce_ns = false;    // include namespaced (multipart/shadow) entries

  bool truncated;

  deque<RGWRados::Object::Stat> stat_ops;  // in-flight async stat requests

  do {
    vector<rgw_bucket_dir_entry> result;

    ret = list_op.list_objects(dpp, max_list_bucket_entries,
                               &result, nullptr, &truncated, null_yield);
    if (ret < 0) {
      cerr << "ERROR: store->list_objects(): " << cpp_strerror(-ret) << std::endl;
      return ret;
    }

    for (vector<rgw_bucket_dir_entry>::iterator iter = result.begin(); iter != result.end(); ++iter) {
      rgw_bucket_dir_entry& entry = *iter;
      if (entry.key.instance.empty()) {
        ldpp_dout(dpp, 20) << "obj entry: " << entry.key.name << dendl;
      } else {
        ldpp_dout(dpp, 20) << "obj entry: " << entry.key.name << " [" << entry.key.instance << "]" << dendl;
      }

      ldpp_dout(dpp, 20) << __func__ << ": entry.key.name=" << entry.key.name << " entry.key.instance=" << entry.key.instance << dendl;

      // outside detailed mode, objects small enough to live entirely in the
      // head object need no stat at all
      if (!detailed_mode &&
          entry.meta.accounted_size <= (uint64_t)store->ctx()->_conf->rgw_max_chunk_size) {
        ldpp_dout(dpp, 5) << __func__ << "skipping stat as the object " << entry.key.name
                          << "fits in a head" << dendl;
        continue;
      }

      rgw_obj obj(cur_bucket->get_key(), entry.key);

      RGWRados::Object op_target(store->getRados(), cur_bucket->get_info(), obj_ctx, obj);

      stat_ops.push_back(RGWRados::Object::Stat(&op_target));
      RGWRados::Object::Stat& op = stat_ops.back();

      ret = op.stat_async(dpp);
      if (ret < 0) {
        ldpp_dout(dpp, -1) << "ERROR: stat_async() returned error: " << cpp_strerror(-ret) << dendl;
        return ret;
      }
      // cap concurrency: reap the oldest op once the window is full
      if (stat_ops.size() >= max_concurrent_ios) {
        ret = pop_and_handle_stat_op(dpp, oids, stat_ops);
        if (ret < 0) {
          if (ret != -ENOENT) {
            ldpp_dout(dpp, -1) << "ERROR: stat_async() returned error: " << cpp_strerror(-ret) << dendl;
          }
        }
      }
      // bound memory: persist accumulated shards once enough are present
      if (oids.size() >= COUNT_BEFORE_FLUSH) {
        ret = log_oids(dpp, linked_objs_index, oids);
        if (ret < 0) {
          cerr << __func__ << ": ERROR: log_oids() returned ret=" << ret << std::endl;
          return ret;
        }
        oids.clear();
      }
    }
  } while (truncated);

  // drain any stats still in flight
  while (!stat_ops.empty()) {
    ret = pop_and_handle_stat_op(dpp, oids, stat_ops);
    if (ret < 0) {
      if (ret != -ENOENT) {
        ldpp_dout(dpp, -1) << "ERROR: stat_async() returned error: " << cpp_strerror(-ret) << dendl;
      }
    }
  }

  return 0;
}
+
/* Stage ITERATE_BI: walk the buckets_instance_index shards, resuming from
 * the persisted search_stage.shard/marker, and for each recorded bucket
 * instance gather linked-object fingerprints via
 * build_linked_oids_for_bucket(). Progress is checkpointed into
 * search_stage so an interrupted job can resume mid-shard. */
int RGWOrphanSearch::build_linked_oids_index(const DoutPrefixProvider *dpp)
{
  map<int, list<string> > oids;
  map<int, string>::iterator iter = buckets_instance_index.find(search_stage.shard);
  for (; iter != buckets_instance_index.end(); ++iter) {
    ldpp_dout(dpp, 0) << "building linked oids index: " << iter->first << "/" << buckets_instance_index.size() << dendl;
    bool truncated;

    string oid = iter->second;

    do {
      map<string, bufferlist> entries;
      int ret = orphan_store.read_entries(oid, search_stage.marker, &entries, &truncated);
      if (ret == -ENOENT) {
        // a missing shard log object is treated as an empty one
        truncated = false;
        ret = 0;
      }

      if (ret < 0) {
        ldpp_dout(dpp, -1) << __func__ << ": ERROR: read_entries() oid=" << oid << " returned ret=" << ret << dendl;
        return ret;
      }

      if (entries.empty()) {
        break;
      }

      for (map<string, bufferlist>::iterator eiter = entries.begin(); eiter != entries.end(); ++eiter) {
        ldpp_dout(dpp, 20) << " indexed entry: " << eiter->first << dendl;
        ret = build_linked_oids_for_bucket(dpp, eiter->first, oids);
        if (ret < 0) {
          ldpp_dout(dpp, -1) << __func__ << ": ERROR: build_linked_oids_for_bucket() indexed entry=" << eiter->first
                             << " returned ret=" << ret << dendl;
          return ret;
        }
      }

      // checkpoint how far we got within this shard
      search_stage.shard = iter->first;
      search_stage.marker = entries.rbegin()->first; /* last entry */
    } while (truncated);

    // the next shard is read from its beginning
    search_stage.marker.clear();
  }

  // flush fingerprints accumulated since the last flush inside
  // build_linked_oids_for_bucket()
  int ret = log_oids(dpp, linked_objs_index, oids);
  if (ret < 0) {
    cerr << __func__ << ": ERROR: log_oids() returned ret=" << ret << std::endl;
    return ret;
  }

  ret = save_state();
  if (ret < 0) {
    cerr << __func__ << ": ERROR: failed to write state ret=" << ret << std::endl;
    return ret;
  }

  return 0;
}
+
+class OMAPReader {
+ librados::IoCtx ioctx;
+ string oid;
+
+ map<string, bufferlist> entries;
+ map<string, bufferlist>::iterator iter;
+ string marker;
+ bool truncated;
+
+public:
+ OMAPReader(librados::IoCtx& _ioctx, const string& _oid) : ioctx(_ioctx), oid(_oid), truncated(true) {
+ iter = entries.end();
+ }
+
+ int get_next(string *key, bufferlist *pbl, bool *done);
+};
+
+int OMAPReader::get_next(string *key, bufferlist *pbl, bool *done)
+{
+ if (iter != entries.end()) {
+ *key = iter->first;
+ if (pbl) {
+ *pbl = iter->second;
+ }
+ ++iter;
+ *done = false;
+ marker = *key;
+ return 0;
+ }
+
+ if (!truncated) {
+ *done = true;
+ return 0;
+ }
+
+#define MAX_OMAP_GET_ENTRIES 100
+ int ret = ioctx.omap_get_vals(oid, marker, MAX_OMAP_GET_ENTRIES, &entries);
+ if (ret < 0) {
+ if (ret == -ENOENT) {
+ *done = true;
+ return 0;
+ }
+ return ret;
+ }
+
+ truncated = (entries.size() == MAX_OMAP_GET_ENTRIES);
+ iter = entries.begin();
+ return get_next(key, pbl, done);
+}
+
/* Stage COMPARE: merge-compare the "all objects" and "linked objects"
 * fingerprint indexes shard by shard. Both omaps are key-sorted, so a single
 * forward pass over each suffices. An object whose fingerprint never appears
 * in the linked index, and whose mtime is older than the stale threshold, is
 * reported as leaked on stdout. */
int RGWOrphanSearch::compare_oid_indexes(const DoutPrefixProvider *dpp)
{
  ceph_assert(linked_objs_index.size() == all_objs_index.size());

  librados::IoCtx& ioctx = orphan_store.get_ioctx();

  librados::IoCtx data_ioctx;

  int ret = rgw_init_ioctx(dpp, static_cast<rgw::sal::RadosStore*>(store)->getRados()->get_rados_handle(), search_info.pool, data_ioctx);
  if (ret < 0) {
    ldpp_dout(dpp, -1) << __func__ << ": rgw_init_ioctx() returned ret=" << ret << dendl;
    return ret;
  }

  // objects modified after this cutoff are skipped, avoiding false positives
  // on uploads that were in flight when the scan started
  uint64_t time_threshold = search_info.start_time.sec() - stale_secs;

  map<int, string>::iterator liter = linked_objs_index.begin();
  map<int, string>::iterator aiter = all_objs_index.begin();

  for (; liter != linked_objs_index.end(); ++liter, ++aiter) {
    OMAPReader linked_entries(ioctx, liter->second);
    OMAPReader all_entries(ioctx, aiter->second);

    bool done;

    string cur_linked;
    bool linked_done = false;


    do {
      string key;
      int r = all_entries.get_next(&key, NULL, &done);
      if (r < 0) {
        return r;
      }
      if (done) {
        break;
      }

      string key_fp = obj_fingerprint(key);

      // advance the linked-index cursor until it reaches or passes key_fp
      while (cur_linked < key_fp && !linked_done) {
        r = linked_entries.get_next(&cur_linked, NULL, &linked_done);
        if (r < 0) {
          return r;
        }
      }

      if (cur_linked == key_fp) {
        ldpp_dout(dpp, 20) << "linked: " << key << dendl;
        continue;
      }

      time_t mtime;
      r = data_ioctx.stat(key, NULL, &mtime);
      if (r < 0) {
        if (r != -ENOENT) {
          ldpp_dout(dpp, -1) << "ERROR: ioctx.stat(" << key << ") returned ret=" << r << dendl;
        }
        // ENOENT: object vanished since indexing; not a leak
        continue;
      }
      // too-recent objects are skipped (may still be legitimately in flight)
      if (stale_secs && (uint64_t)mtime >= time_threshold) {
        ldpp_dout(dpp, 20) << "skipping: " << key << " (mtime=" << mtime << " threshold=" << time_threshold << ")" << dendl;
        continue;
      }
      ldpp_dout(dpp, 20) << "leaked: " << key << dendl;
      cout << "leaked: " << key << std::endl;
    } while (!done);
  }

  return 0;
}
+
/* Top-level driver: a resumable state machine. Each completed stage persists
 * the *next* stage via save_state() and then deliberately falls through, so a
 * restarted job continues at the first unfinished stage. */
int RGWOrphanSearch::run(const DoutPrefixProvider *dpp)
{
  int r;

  switch (search_stage.stage) {
    
  case ORPHAN_SEARCH_STAGE_INIT:
    ldpp_dout(dpp, 0) << __func__ << "(): initializing state" << dendl;
    search_stage = RGWOrphanSearchStage(ORPHAN_SEARCH_STAGE_LSPOOL);
    r = save_state();
    if (r < 0) {
      ldpp_dout(dpp, -1) << __func__ << ": ERROR: failed to save state, ret=" << r << dendl;
      return r;
    }
    // fall through
  case ORPHAN_SEARCH_STAGE_LSPOOL:
    ldpp_dout(dpp, 0) << __func__ << "(): building index of all objects in pool" << dendl;
    r = build_all_oids_index(dpp);
    if (r < 0) {
      ldpp_dout(dpp, -1) << __func__ << ": ERROR: build_all_objs_index returned ret=" << r << dendl;
      return r;
    }

    search_stage = RGWOrphanSearchStage(ORPHAN_SEARCH_STAGE_LSBUCKETS);
    r = save_state();
    if (r < 0) {
      ldpp_dout(dpp, -1) << __func__ << ": ERROR: failed to save state, ret=" << r << dendl;
      return r;
    }
    // fall through

  case ORPHAN_SEARCH_STAGE_LSBUCKETS:
    ldpp_dout(dpp, 0) << __func__ << "(): building index of all bucket indexes" << dendl;
    r = build_buckets_instance_index(dpp);
    if (r < 0) {
      // NOTE(review): message text looks copy-pasted from the LSPOOL stage
      ldpp_dout(dpp, -1) << __func__ << ": ERROR: build_all_objs_index returned ret=" << r << dendl;
      return r;
    }

    search_stage = RGWOrphanSearchStage(ORPHAN_SEARCH_STAGE_ITERATE_BI);
    r = save_state();
    if (r < 0) {
      ldpp_dout(dpp, -1) << __func__ << ": ERROR: failed to save state, ret=" << r << dendl;
      return r;
    }
    // fall through


  case ORPHAN_SEARCH_STAGE_ITERATE_BI:
    ldpp_dout(dpp, 0) << __func__ << "(): building index of all linked objects" << dendl;
    r = build_linked_oids_index(dpp);
    if (r < 0) {
      // NOTE(review): message text looks copy-pasted from the LSPOOL stage
      ldpp_dout(dpp, -1) << __func__ << ": ERROR: build_all_objs_index returned ret=" << r << dendl;
      return r;
    }

    search_stage = RGWOrphanSearchStage(ORPHAN_SEARCH_STAGE_COMPARE);
    r = save_state();
    if (r < 0) {
      ldpp_dout(dpp, -1) << __func__ << ": ERROR: failed to save state, ret=" << r << dendl;
      return r;
    }
    // fall through

  case ORPHAN_SEARCH_STAGE_COMPARE:
    r = compare_oid_indexes(dpp);
    if (r < 0) {
      // NOTE(review): message text looks copy-pasted from the LSPOOL stage
      ldpp_dout(dpp, -1) << __func__ << ": ERROR: build_all_objs_index returned ret=" << r << dendl;
      return r;
    }

    break;

  default:
    ceph_abort();  // unknown persisted stage: state object is corrupt
  };

  return 0;
}
+
+
+int RGWOrphanSearch::remove_index(map<int, string>& index)
+{
+ librados::IoCtx& ioctx = orphan_store.get_ioctx();
+
+ for (map<int, string>::iterator iter = index.begin(); iter != index.end(); ++iter) {
+ int r = ioctx.remove(iter->second);
+ if (r < 0) {
+ if (r != -ENOENT) {
+ ldout(store->ctx(), 0) << "ERROR: couldn't remove " << iter->second << ": ret=" << r << dendl;
+ }
+ }
+ }
+ return 0;
+}
+
+int RGWOrphanSearch::finish()
+{
+ int r = remove_index(all_objs_index);
+ if (r < 0) {
+ ldout(store->ctx(), 0) << "ERROR: remove_index(" << all_objs_index << ") returned ret=" << r << dendl;
+ }
+ r = remove_index(buckets_instance_index);
+ if (r < 0) {
+ ldout(store->ctx(), 0) << "ERROR: remove_index(" << buckets_instance_index << ") returned ret=" << r << dendl;
+ }
+ r = remove_index(linked_objs_index);
+ if (r < 0) {
+ ldout(store->ctx(), 0) << "ERROR: remove_index(" << linked_objs_index << ") returned ret=" << r << dendl;
+ }
+
+ r = orphan_store.remove_job(search_info.job_name);
+ if (r < 0) {
+ ldout(store->ctx(), 0) << "ERROR: could not remove job name (" << search_info.job_name << ") ret=" << r << dendl;
+ }
+
+ return r;
+}
+
+
+int RGWRadosList::handle_stat_result(const DoutPrefixProvider *dpp,
+ RGWRados::Object::Stat::Result& result,
+ std::string& bucket_name,
+ rgw_obj_key& obj_key,
+ std::set<string>& obj_oids)
+{
+ obj_oids.clear();
+
+ rgw_bucket& bucket = result.obj.bucket;
+
+ ldpp_dout(dpp, 20) << "RGWRadosList::" << __func__ <<
+ " bucket=" << bucket <<
+ ", has_manifest=" << result.manifest.has_value() <<
+ dendl;
+
+ // iterator to store result of dlo/slo attribute find
+ decltype(result.attrs)::iterator attr_it = result.attrs.end();
+ const std::string oid = bucket.marker + "_" + result.obj.get_oid();
+ ldpp_dout(dpp, 20) << "radoslist processing object=\"" <<
+ oid << "\"" << dendl;
+ if (visited_oids.find(oid) != visited_oids.end()) {
+ // apparently we hit a loop; don't continue with this oid
+ ldpp_dout(dpp, 15) <<
+ "radoslist stopped loop at already visited object=\"" <<
+ oid << "\"" << dendl;
+ return 0;
+ }
+
+ bucket_name = bucket.name;
+ obj_key = result.obj.key;
+
+ if (!result.manifest) {
+ /* a very very old object, or part of a multipart upload during upload */
+ obj_oids.insert(oid);
+
+ /*
+ * multipart parts don't have manifest on them, it's in the meta
+ * object; we'll process them in
+ * RGWRadosList::do_incomplete_multipart
+ */
+ } else if ((attr_it = result.attrs.find(RGW_ATTR_USER_MANIFEST)) !=
+ result.attrs.end()) {
+ // *** handle DLO object ***
+
+ obj_oids.insert(oid);
+ visited_oids.insert(oid); // prevent dlo loops
+ ldpp_dout(dpp, 15) << "radoslist added to visited list DLO=\"" <<
+ oid << "\"" << dendl;
+
+ char* prefix_path_c = attr_it->second.c_str();
+ const std::string& prefix_path = prefix_path_c;
+
+ const size_t sep_pos = prefix_path.find('/');
+ if (string::npos == sep_pos) {
+ return -EINVAL;
+ }
+
+ const std::string bucket_name = prefix_path.substr(0, sep_pos);
+ const std::string prefix = prefix_path.substr(sep_pos + 1);
+
+ add_bucket_prefix(bucket_name, prefix);
+ ldpp_dout(dpp, 25) << "radoslist DLO oid=\"" << oid <<
+ "\" added bucket=\"" << bucket_name << "\" prefix=\"" <<
+ prefix << "\" to process list" << dendl;
+ } else if ((attr_it = result.attrs.find(RGW_ATTR_USER_MANIFEST)) !=
+ result.attrs.end()) {
+ // *** handle SLO object ***
+
+ obj_oids.insert(oid);
+ visited_oids.insert(oid); // prevent slo loops
+ ldpp_dout(dpp, 15) << "radoslist added to visited list SLO=\"" <<
+ oid << "\"" << dendl;
+
+ RGWSLOInfo slo_info;
+ bufferlist::const_iterator bliter = attr_it->second.begin();
+ try {
+ ::decode(slo_info, bliter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) <<
+ "ERROR: failed to decode slo manifest for " << oid << dendl;
+ return -EIO;
+ }
+
+ for (const auto& iter : slo_info.entries) {
+ const string& path_str = iter.path;
+
+ const size_t sep_pos = path_str.find('/', 1 /* skip initial slash */);
+ if (string::npos == sep_pos) {
+ return -EINVAL;
+ }
+
+ std::string bucket_name;
+ std::string obj_name;
+
+ bucket_name = url_decode(path_str.substr(1, sep_pos - 1));
+ obj_name = url_decode(path_str.substr(sep_pos + 1));
+
+ const rgw_obj_key obj_key(obj_name);
+ add_bucket_filter(bucket_name, obj_key);
+ ldpp_dout(dpp, 25) << "radoslist SLO oid=\"" << oid <<
+ "\" added bucket=\"" << bucket_name << "\" obj_key=\"" <<
+ obj_key << "\" to process list" << dendl;
+ }
+ } else {
+ RGWObjManifest& manifest = *result.manifest;
+
+ // in multipart, the head object contains no data and just has the
+ // manifest AND empty objects have no manifest, but they're
+ // realized as empty rados objects
+ if (0 == manifest.get_max_head_size() ||
+ manifest.obj_begin(dpp) == manifest.obj_end(dpp)) {
+ obj_oids.insert(oid);
+ // first_insert = true;
+ }
+
+ RGWObjManifest::obj_iterator miter;
+ for (miter = manifest.obj_begin(dpp); miter != manifest.obj_end(dpp); ++miter) {
+ const rgw_raw_obj& loc =
+ miter.get_location().get_raw_obj(store->getRados());
+ string s = loc.oid;
+ obj_oids.insert(s);
+ }
+ }
+
+ return 0;
+} // RGWRadosList::handle_stat_result
+
+int RGWRadosList::pop_and_handle_stat_op(
+ const DoutPrefixProvider *dpp,
+ RGWObjectCtx& obj_ctx,
+ std::deque<RGWRados::Object::Stat>& ops)
+{
+ std::string bucket_name;
+ rgw_obj_key obj_key;
+ std::set<std::string> obj_oids;
+ RGWRados::Object::Stat& front_op = ops.front();
+
+ int ret = front_op.wait(dpp);
+ if (ret < 0) {
+ if (ret != -ENOENT) {
+ ldpp_dout(dpp, -1) << "ERROR: stat_async() returned error: " <<
+ cpp_strerror(-ret) << dendl;
+ }
+ goto done;
+ }
+
+ ret = handle_stat_result(dpp, front_op.result, bucket_name, obj_key, obj_oids);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: handle_stat_result() returned error: " <<
+ cpp_strerror(-ret) << dendl;
+ }
+
+ // output results
+ for (const auto& o : obj_oids) {
+ if (include_rgw_obj_name) {
+ std::cout << o <<
+ field_separator << bucket_name <<
+ field_separator << obj_key <<
+ std::endl;
+ } else {
+ std::cout << o << std::endl;
+ }
+ }
+
+done:
+
+ // invalidate object context for this object to avoid memory leak
+ // (see pr https://github.com/ceph/ceph/pull/30174)
+ obj_ctx.invalidate(front_op.result.obj);
+
+ ops.pop_front();
+ return ret;
+}
+
+
#if 0 // code that may be the basis for expansion
/* DISABLED: pre-SAL variant of building a sharded index of all bucket
 * instances, kept only as a reference for possible future expansion. It uses
 * the old meta_mgr API and the non-dpp log_oids() signature, so it would not
 * compile as-is against the current codebase. */
int RGWRadosList::build_buckets_instance_index()
{
  void *handle;
  int max = 1000;
  string section = "bucket.instance";
  int ret = store->meta_mgr->list_keys_init(section, &handle);
  if (ret < 0) {
    lderr(store->ctx()) << "ERROR: can't get key: " << cpp_strerror(-ret) << dendl;
    return ret;
  }

  map<int, list<string> > instances;

  bool truncated;

  RGWObjectCtx obj_ctx(store);

  int count = 0;
  uint64_t total = 0;

  do {
    list<string> keys;
    ret = store->meta_mgr->list_keys_next(handle, max, keys, &truncated);
    if (ret < 0) {
      lderr(store->ctx()) << "ERROR: lists_keys_next(): " << cpp_strerror(-ret) << dendl;
      return ret;
    }

    for (list<string>::iterator iter = keys.begin(); iter != keys.end(); ++iter) {
      ++total;
      ldout(store->ctx(), 10) << "bucket_instance=" << *iter << " total=" << total << dendl;
      int shard = orphan_shard(*iter);
      instances[shard].push_back(*iter);

      if (++count >= COUNT_BEFORE_FLUSH) {
        ret = log_oids(buckets_instance_index, instances);
        if (ret < 0) {
          lderr(store->ctx()) << __func__ << ": ERROR: log_oids() returned ret=" << ret << dendl;
          return ret;
        }
        count = 0;
        instances.clear();
      }
    }
  } while (truncated);

  ret = log_oids(buckets_instance_index, instances);
  if (ret < 0) {
    lderr(store->ctx()) << __func__ << ": ERROR: log_oids() returned ret=" << ret << dendl;
    return ret;
  }
  store->meta_mgr->list_keys_complete(handle);

  return 0;
}
#endif
+
+
/* List one bucket instance (optionally restricted to a prefix and/or an
 * explicit set of keys) and asynchronously stat each listed entry so
 * pop_and_handle_stat_op() can print the rados oids backing it. For
 * versioned entries the plain (uninstanced) head object is statted once per
 * object name as well. In-flight stats are bounded by max_concurrent_ios.
 * ENOENT at any point is treated as a bucket-removal race and is success. */
int RGWRadosList::process_bucket(
  const DoutPrefixProvider *dpp,
  const std::string& bucket_instance_id,
  const std::string& prefix,
  const std::set<rgw_obj_key>& entries_filter)
{
  ldpp_dout(dpp, 10) << "RGWRadosList::" << __func__ <<
    " bucket_instance_id=" << bucket_instance_id <<
    ", prefix=" << prefix <<
    ", entries_filter.size=" << entries_filter.size() << dendl;

  RGWBucketInfo bucket_info;
  int ret = store->getRados()->get_bucket_instance_info(bucket_instance_id,
                                                        bucket_info,
                                                        nullptr,
                                                        nullptr,
                                                        null_yield,
                                                        dpp);
  if (ret < 0) {
    if (ret == -ENOENT) {
      // probably raced with bucket removal
      return 0;
    }
    ldpp_dout(dpp, -1) << __func__ <<
      ": ERROR: RGWRados::get_bucket_instance_info() returned ret=" <<
      ret << dendl;
    return ret;
  }

  RGWRados::Bucket target(store->getRados(), bucket_info);
  RGWRados::Bucket::List list_op(&target);

  std::string marker;
  list_op.params.marker = rgw_obj_key(marker);
  list_op.params.list_versions = true;  // visit every version
  list_op.params.enforce_ns = false;    // include namespaced entries
  list_op.params.allow_unordered = false;
  list_op.params.prefix = prefix;

  bool truncated;

  std::deque<RGWRados::Object::Stat> stat_ops;  // in-flight async stats
  std::string prev_versioned_key_name = "";

  RGWObjectCtx obj_ctx(store);

  do {
    std::vector<rgw_bucket_dir_entry> result;
    constexpr int64_t LIST_OBJS_MAX_ENTRIES = 100;
    ret = list_op.list_objects(dpp, LIST_OBJS_MAX_ENTRIES, &result,
                               NULL, &truncated, null_yield);
    if (ret == -ENOENT) {
      // race with bucket delete?
      ret = 0;
      break;
    } else if (ret < 0) {
      std::cerr << "ERROR: store->list_objects(): " << cpp_strerror(-ret) <<
        std::endl;
      return ret;
    }

    for (std::vector<rgw_bucket_dir_entry>::iterator iter = result.begin();
         iter != result.end();
         ++iter) {
      rgw_bucket_dir_entry& entry = *iter;

      if (entry.key.instance.empty()) {
        ldpp_dout(dpp, 20) << "obj entry: " << entry.key.name << dendl;
      } else {
        ldpp_dout(dpp, 20) << "obj entry: " << entry.key.name <<
          " [" << entry.key.instance << "]" << dendl;
      }

      ldpp_dout(dpp, 20) << __func__ << ": entry.key.name=" <<
        entry.key.name << " entry.key.instance=" << entry.key.instance <<
        dendl;

      // ignore entries that are not in the filter if there is a filter
      if (!entries_filter.empty() &&
          entries_filter.find(entry.key) == entries_filter.cend()) {
        continue;
      }

      // NOTE(review): this per-entry bucket handle is not referenced below
      // -- confirm whether the lookup is still required
      std::unique_ptr<rgw::sal::Bucket> bucket;
      store->get_bucket(nullptr, bucket_info, &bucket);
      // we need to do this in two cases below, so use a lambda
      auto do_stat_key =
        [&](const rgw_obj_key& key) -> int {
          int ret;

          rgw_obj obj(bucket_info.bucket, key);
          RGWRados::Object op_target(store->getRados(), bucket_info,
                                     obj_ctx, obj);

          stat_ops.push_back(RGWRados::Object::Stat(&op_target));
          RGWRados::Object::Stat& op = stat_ops.back();

          ret = op.stat_async(dpp);
          if (ret < 0) {
            ldpp_dout(dpp, -1) << "ERROR: stat_async() returned error: " <<
              cpp_strerror(-ret) << dendl;
            return ret;
          }

          // cap concurrency: reap the oldest op once the window is full
          if (stat_ops.size() >= max_concurrent_ios) {
            ret = pop_and_handle_stat_op(dpp, obj_ctx, stat_ops);
            if (ret < 0) {
              if (ret != -ENOENT) {
                ldpp_dout(dpp, -1) <<
                  "ERROR: pop_and_handle_stat_op() returned error: " <<
                  cpp_strerror(-ret) << dendl;
              }

              // clear error, so we'll continue processing directory
              ret = 0;
            }
          }

          return ret;
        }; // do_stat_key lambda

      // for versioned objects, make sure the head object is handled
      // as well by ignoring the instance identifier
      if (!entry.key.instance.empty() &&
          entry.key.name != prev_versioned_key_name) {
        // don't do the same key twice; even though out bucket index
        // listing allows unordered, since all versions of an object
        // use the same bucket index key, they'll all end up together
        // and sorted
        prev_versioned_key_name = entry.key.name;

        rgw_obj_key uninstanced(entry.key.name);

        ret = do_stat_key(uninstanced);
        if (ret < 0) {
          return ret;
        }
      }

      ret = do_stat_key(entry.key);
      if (ret < 0) {
        return ret;
      }
    } // for iter loop
  } while (truncated);

  // drain any stats still in flight
  while (!stat_ops.empty()) {
    ret = pop_and_handle_stat_op(dpp, obj_ctx, stat_ops);
    if (ret < 0) {
      if (ret != -ENOENT) {
        ldpp_dout(dpp, -1) << "ERROR: stat_async() returned error: " <<
          cpp_strerror(-ret) << dendl;
      }
    }
  }

  return 0;
}
+
+
/* Run radoslist across every bucket: list all "bucket" metadata keys and
 * invoke the per-bucket run() on each. ENOENT buckets (deleted mid-scan) are
 * skipped. Indexless buckets surface as -EINVAL from the per-bucket run:
 * with yes_i_really_mean_it we warn once and continue, otherwise we fail. */
int RGWRadosList::run(const DoutPrefixProvider *dpp,
                      const bool yes_i_really_mean_it)
{
  int ret;
  void* handle = nullptr;

  ret = store->meta_list_keys_init(dpp, "bucket", string(), &handle);
  if (ret < 0) {
    ldpp_dout(dpp, -1) << "RGWRadosList::" << __func__ <<
      " ERROR: list_keys_init returned " <<
      cpp_strerror(-ret) << dendl;
    return ret;
  }

  constexpr int max_keys = 1000;
  bool truncated = true;
  bool warned_indexless = false;

  do {
    std::list<std::string> buckets;
    // NOTE(review): this return value is overwritten below without being
    // checked, and no meta_list_keys_complete(handle) call is visible in
    // this function -- confirm both are intentional
    ret = store->meta_list_keys_next(dpp, handle, max_keys, buckets, &truncated);

    for (std::string& bucket_id : buckets) {
      ret = run(dpp, bucket_id, true);
      if (ret == -ENOENT) {
        continue;
      } else if (ret == -EINVAL) {
        if (! warned_indexless) {
          if (yes_i_really_mean_it) {
            std::cerr <<
              "WARNING: because there is at least one indexless bucket (" <<
              bucket_id <<
              ") the results of radoslist are *incomplete*; continuing due to --yes-i-really-mean-it" <<
              std::endl;
            warned_indexless = true;
          } else {
            std::cerr << "ERROR: because there is at least one indexless bucket (" <<
              bucket_id <<
              ") the results of radoslist are *incomplete*; use --yes-i-really-mean-it to bypass error" <<
              std::endl;
            return ret;
          }
        }
        continue;
      } else if (ret < 0) {
        return ret;
      }
    }
  } while (truncated);

  return 0;
} // RGWRadosList::run(DoutPrefixProvider, bool)
+
+
/* Run radoslist over one bucket, plus any buckets its DLO/SLO objects
 * reference. bucket_process_map acts as a work queue: processing a bucket
 * may (via handle_stat_result) enqueue prefixes/filters for further buckets.
 * Unless we are printing rgw object names (multi-bucket mode), the starting
 * bucket's incomplete multipart uploads are listed at the end too.
 * Returns -EINVAL for an indexless bucket (warning suppressed when
 * silent_indexless is set); ENOENT races resolve to success. */
int RGWRadosList::run(const DoutPrefixProvider *dpp,
                      const std::string& start_bucket_name,
                      const bool silent_indexless)
{
  int ret;

  add_bucket_entire(start_bucket_name);

  while (! bucket_process_map.empty()) {
    // pop item from map and capture its key data
    auto front = bucket_process_map.begin();
    std::string bucket_name = front->first;
    process_t process;
    std::swap(process, front->second);
    bucket_process_map.erase(front);

    std::unique_ptr<rgw::sal::Bucket> bucket;
    ret = store->get_bucket(dpp, nullptr, tenant_name, bucket_name, &bucket, null_yield);
    if (ret == -ENOENT) {
      std::cerr << "WARNING: bucket " << bucket_name <<
        " does not exist; could it have been deleted very recently?" <<
        std::endl;
      continue;
    } else if (ret < 0) {
      std::cerr << "ERROR: could not get info for bucket " << bucket_name <<
        " -- " << cpp_strerror(-ret) << std::endl;
      return ret;
    } else if (bucket->get_info().is_indexless()) {
      // an indexless bucket cannot be listed; the caller decides if fatal
      if (! silent_indexless) {
        std::cerr << "ERROR: unable to run radoslist on indexless bucket " <<
          bucket_name << std::endl;
      }
      return -EINVAL;
    }

    const std::string bucket_id = bucket->get_key().get_key();

    static const std::set<rgw_obj_key> empty_filter;
    static const std::string empty_prefix;

    // wraps process_bucket(), treating ENOENT (deletion race) as success
    auto do_process_bucket =
      [dpp, &bucket_id, this]
      (const std::string& prefix,
       const std::set<rgw_obj_key>& entries_filter) -> int {
        int ret = process_bucket(dpp, bucket_id, prefix, entries_filter);
        if (ret == -ENOENT) {
          // bucket deletion race?
          return 0;
        } if (ret < 0) {
          ldpp_dout(dpp, -1) << "RGWRadosList::" << __func__ <<
            ": ERROR: process_bucket(); bucket_id=" <<
            bucket_id << " returned ret=" << ret << dendl;
        }

        return ret;
      };

    // either process the whole bucket *or* process the filters and/or
    // the prefixes
    if (process.entire_container) {
      ret = do_process_bucket(empty_prefix, empty_filter);
      if (ret < 0) {
        return ret;
      }
    } else {
      if (! process.filter_keys.empty()) {
        ret = do_process_bucket(empty_prefix, process.filter_keys);
        if (ret < 0) {
          return ret;
        }
      }
      for (const auto& p : process.prefixes) {
        ret = do_process_bucket(p, empty_filter);
        if (ret < 0) {
          return ret;
        }
      }
    }
  } // while (! bucket_process_map.empty())

  if (include_rgw_obj_name) {
    return 0;
  }

  // now handle incomplete multipart uploads by going back to the
  // initial bucket

  std::unique_ptr<rgw::sal::Bucket> bucket;
  ret = store->get_bucket(dpp, nullptr, tenant_name, start_bucket_name, &bucket, null_yield);
  if (ret == -ENOENT) {
    // bucket deletion race?
    return 0;
  } else if (ret < 0) {
    ldpp_dout(dpp, -1) << "RGWRadosList::" << __func__ <<
      ": ERROR: get_bucket_info returned ret=" << ret << dendl;
    return ret;
  }

  ret = do_incomplete_multipart(dpp, bucket.get());
  if (ret < 0) {
    ldpp_dout(dpp, -1) << "RGWRadosList::" << __func__ <<
      ": ERROR: do_incomplete_multipart returned ret=" << ret << dendl;
    return ret;
  }

  return 0;
} // RGWRadosList::run(DoutPrefixProvider, string, bool)
+
+
/* Print the rados oids of every part of every incomplete multipart upload in
 * the given bucket. Until an upload completes, each part's data locations
 * live only in that part's own manifest (the head has none), so parts are
 * enumerated via list_multiparts()/list_parts() and their manifests walked
 * directly. ENOENT anywhere is treated as a bucket/upload removal race. */
int RGWRadosList::do_incomplete_multipart(const DoutPrefixProvider *dpp,
                                          rgw::sal::Bucket* bucket)
{
  constexpr int max_uploads = 1000;
  constexpr int max_parts = 1000;
  std::string marker;  // advanced in place by list_multiparts()
  vector<std::unique_ptr<rgw::sal::MultipartUpload>> uploads;
  bool is_truncated;
  int ret;

  // use empty strings for params.{prefix,delim}

  do {
    ret = bucket->list_multiparts(dpp, string(), marker, string(), max_uploads, uploads, nullptr, &is_truncated);
    if (ret == -ENOENT) {
      // could bucket have been removed while this is running?
      ldpp_dout(dpp, 5) << "RGWRadosList::" << __func__ <<
        ": WARNING: call to list_objects of multipart namespace got ENOENT; "
        "assuming bucket removal race" << dendl;
      break;
    } else if (ret < 0) {
      ldpp_dout(dpp, -1) << "RGWRadosList::" << __func__ <<
        ": ERROR: list_objects op returned ret=" << ret << dendl;
      return ret;
    }

    if (!uploads.empty()) {
      // now process the uploads vector
      for (const auto& upload : uploads) {
        int parts_marker = 0;  // pagination cursor within this upload's parts
        bool is_parts_truncated = false;

        do { // while (is_parts_truncated);
          ret = upload->list_parts(dpp, store->ctx(), max_parts, parts_marker,
                                   &parts_marker, &is_parts_truncated);
          if (ret == -ENOENT) {
            ldpp_dout(dpp, 5) << "RGWRadosList::" << __func__ <<
              ": WARNING: list_multipart_parts returned ret=-ENOENT "
              "for " << upload->get_upload_id() << ", moving on" << dendl;
            break;
          } else if (ret < 0) {
            ldpp_dout(dpp, -1) << "RGWRadosList::" << __func__ <<
              ": ERROR: list_multipart_parts returned ret=" << ret <<
              dendl;
            return ret;
          }

          // emit every rados location named by each part's manifest
          for (auto& p : upload->get_parts()) {
            rgw::sal::RadosMultipartPart* part =
              dynamic_cast<rgw::sal::RadosMultipartPart*>(p.second.get());
            RGWObjManifest& manifest = part->get_manifest();
            for (auto obj_it = manifest.obj_begin(dpp);
                 obj_it != manifest.obj_end(dpp);
                 ++obj_it) {
              const rgw_raw_obj& loc =
                obj_it.get_location().get_raw_obj(store->getRados());
              std::cout << loc.oid << std::endl;
            } // for (auto obj_it
          } // for (auto& p
        } while (is_parts_truncated);
      } // for (const auto& upload
    } // if objs not empty
  } while (is_truncated);

  return 0;
} // RGWRadosList::do_incomplete_multipart
+
+void RGWOrphanSearchStage::dump(Formatter *f) const
+{
+ f->open_object_section("orphan_search_stage");
+ string s;
+ switch(stage){
+ case ORPHAN_SEARCH_STAGE_INIT:
+ s = "init";
+ break;
+ case ORPHAN_SEARCH_STAGE_LSPOOL:
+ s = "lspool";
+ break;
+ case ORPHAN_SEARCH_STAGE_LSBUCKETS:
+ s = "lsbuckets";
+ break;
+ case ORPHAN_SEARCH_STAGE_ITERATE_BI:
+ s = "iterate_bucket_index";
+ break;
+ case ORPHAN_SEARCH_STAGE_COMPARE:
+ s = "comparing";
+ break;
+ default:
+ s = "unknown";
+ }
+ f->dump_string("search_stage", s);
+ f->dump_int("shard",shard);
+ f->dump_string("marker",marker);
+ f->close_section();
+}
+
/* Serialize the static parameters of an orphan-search job (name, pool,
 * shard count, start time) via the supplied Formatter. */
void RGWOrphanSearchInfo::dump(Formatter *f) const
{
  f->open_object_section("orphan_search_info");
  f->dump_string("job_name", job_name);
  encode_json("pool", pool, f);
  f->dump_int("num_shards", num_shards);
  encode_json("start_time", start_time, f);
  f->close_section();
}
+
/* Serialize full job state: the static info plus the resumable stage. */
void RGWOrphanSearchState::dump(Formatter *f) const
{
  f->open_object_section("orphan_search_state");
  encode_json("info", info, f);
  encode_json("stage", stage, f);
  f->close_section();
}
+
+
diff --git a/src/rgw/rgw_orphan.h b/src/rgw/rgw_orphan.h
new file mode 100644
index 000000000..db811d31d
--- /dev/null
+++ b/src/rgw/rgw_orphan.h
@@ -0,0 +1,304 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "common/config.h"
+#include "common/Formatter.h"
+#include "common/errno.h"
+
+#include "rgw_sal_rados.h"
+
+#define RGW_ORPHAN_INDEX_OID "orphan.index"
+#define RGW_ORPHAN_INDEX_PREFIX "orphan.scan"
+
+
+enum RGWOrphanSearchStageId {
+ ORPHAN_SEARCH_STAGE_UNKNOWN = 0,
+ ORPHAN_SEARCH_STAGE_INIT = 1,
+ ORPHAN_SEARCH_STAGE_LSPOOL = 2,
+ ORPHAN_SEARCH_STAGE_LSBUCKETS = 3,
+ ORPHAN_SEARCH_STAGE_ITERATE_BI = 4,
+ ORPHAN_SEARCH_STAGE_COMPARE = 5,
+};
+
+
+// Persisted progress of an orphan-search job: which stage the scan is in,
+// which shard it is currently processing, and the resume marker within
+// that shard. Encoded/decoded so an interrupted job can be resumed.
+struct RGWOrphanSearchStage {
+  RGWOrphanSearchStageId stage;
+  int shard;
+  std::string marker;
+
+  RGWOrphanSearchStage() : stage(ORPHAN_SEARCH_STAGE_UNKNOWN), shard(0) {}
+  explicit RGWOrphanSearchStage(RGWOrphanSearchStageId _stage) : stage(_stage), shard(0) {}
+  RGWOrphanSearchStage(RGWOrphanSearchStageId _stage, int _shard, const std::string& _marker) : stage(_stage), shard(_shard), marker(_marker) {}
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    // the stage enum is encoded as a plain int to keep the wire format
+    // independent of the enum's underlying type
+    encode((int)stage, bl);
+    encode(shard, bl);
+    encode(marker, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    int s;
+    decode(s, bl);
+    stage = (RGWOrphanSearchStageId)s;
+    decode(shard, bl);
+    decode(marker, bl);
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(RGWOrphanSearchStage)
+
+// Immutable description of an orphan-search job, fixed when the job is
+// created: its name, the data pool to scan, how many index shards to use,
+// and when the scan started.
+struct RGWOrphanSearchInfo {
+  std::string job_name;
+  rgw_pool pool;
+  uint16_t num_shards;
+  utime_t start_time;
+
+  void encode(bufferlist& bl) const {
+    // version 2, compat 1
+    ENCODE_START(2, 1, bl);
+    encode(job_name, bl);
+    // the pool is serialized via its string form rather than as rgw_pool
+    encode(pool.to_str(), bl);
+    encode(num_shards, bl);
+    encode(start_time, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(2, bl);
+    decode(job_name, bl);
+    std::string s;
+    decode(s, bl);
+    pool.from_str(s);
+    decode(num_shards, bl);
+    decode(start_time, bl);
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(RGWOrphanSearchInfo)
+
+// Complete persisted state of an orphan-search job: the static job info
+// plus the mutable stage/progress. This is what RGWOrphanStore reads and
+// writes per job.
+struct RGWOrphanSearchState {
+  RGWOrphanSearchInfo info;
+  RGWOrphanSearchStage stage;
+
+  RGWOrphanSearchState() : stage(ORPHAN_SEARCH_STAGE_UNKNOWN) {}
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(info, bl);
+    encode(stage, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(info, bl);
+    decode(stage, bl);
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(RGWOrphanSearchState)
+
+// Persistence layer for orphan-search jobs and their intermediate scan
+// entries. Jobs live under a single rados index object (default name
+// RGW_ORPHAN_INDEX_OID); entries are stored/read in batches keyed by oid.
+// NOTE(review): ioctx is presumably opened in init() — confirm before use.
+class RGWOrphanStore {
+  rgw::sal::RadosStore* store;
+  librados::IoCtx ioctx;
+
+  std::string oid;
+
+public:
+  explicit RGWOrphanStore(rgw::sal::RadosStore* _store) : store(_store), oid(RGW_ORPHAN_INDEX_OID) {}
+
+  librados::IoCtx& get_ioctx() { return ioctx; }
+
+  int init(const DoutPrefixProvider *dpp);
+
+  // per-job state CRUD, keyed by job name
+  int read_job(const std::string& job_name, RGWOrphanSearchState& state);
+  int write_job(const std::string& job_name, const RGWOrphanSearchState& state);
+  int remove_job(const std::string& job_name);
+  int list_jobs(std::map<std::string,RGWOrphanSearchState> &job_list);
+
+
+  // batched access to intermediate scan entries; 'truncated' signals that
+  // another read_entries() call is needed to continue from 'marker'
+  int store_entries(const DoutPrefixProvider *dpp, const std::string& oid, const std::map<std::string, bufferlist>& entries);
+  int read_entries(const std::string& oid, const std::string& marker, std::map<std::string, bufferlist> *entries, bool *truncated);
+};
+
+
+// Driver for an orphan-search job: builds an index of all rados oids in the
+// data pool, an index of bucket instances, and an index of oids actually
+// linked from bucket indexes, then compares them to find orphaned objects.
+class RGWOrphanSearch {
+  rgw::sal::RadosStore* store;
+
+  RGWOrphanStore orphan_store;
+
+  RGWOrphanSearchInfo search_info;
+  RGWOrphanSearchStage search_stage;
+
+  // per-shard log object names for each index built during the scan
+  std::map<int, std::string> all_objs_index;
+  std::map<int, std::string> buckets_instance_index;
+  std::map<int, std::string> linked_objs_index;
+
+  std::string index_objs_prefix;
+
+  uint16_t max_concurrent_ios;
+  uint64_t stale_secs;
+  int64_t max_list_bucket_entries;
+
+  bool detailed_mode;
+
+  // iteration state over one shard's list of logged oids
+  struct log_iter_info {
+    std::string oid;
+    std::list<std::string>::iterator cur;
+    std::list<std::string>::iterator end;
+  };
+
+  int log_oids(const DoutPrefixProvider *dpp, std::map<int, std::string>& log_shards, std::map<int, std::list<std::string> >& oids);
+
+#define RGW_ORPHANSEARCH_HASH_PRIME 7877
+  // map an oid to one of search_info.num_shards shards; reducing by a prime
+  // first spreads the hash before the modulo by the (small) shard count
+  int orphan_shard(const std::string& str) {
+    return ceph_str_hash_linux(str.c_str(), str.size()) % RGW_ORPHANSEARCH_HASH_PRIME % search_info.num_shards;
+  }
+
+  int handle_stat_result(const DoutPrefixProvider *dpp, std::map<int, std::list<std::string> >& oids, RGWRados::Object::Stat::Result& result);
+  int pop_and_handle_stat_op(const DoutPrefixProvider *dpp, std::map<int, std::list<std::string> >& oids, std::deque<RGWRados::Object::Stat>& ops);
+
+  int remove_index(std::map<int, std::string>& index);
+public:
+  // NOTE(review): search_stage, max_list_bucket_entries and detailed_mode
+  // are not initialized here; presumably init()/create() set them — confirm
+  RGWOrphanSearch(rgw::sal::RadosStore* _store, int _max_ios, uint64_t _stale_secs) : store(_store), orphan_store(store), max_concurrent_ios(_max_ios), stale_secs(_stale_secs) {}
+
+  // persist the current stage/progress so an interrupted job can resume
+  int save_state() {
+    RGWOrphanSearchState state;
+    state.info = search_info;
+    state.stage = search_stage;
+    return orphan_store.write_job(search_info.job_name, state);
+  }
+
+  int init(const DoutPrefixProvider *dpp, const std::string& job_name, RGWOrphanSearchInfo *info, bool _detailed_mode=false);
+
+  int create(const std::string& job_name, int num_shards);
+
+  // the individual scan stages, driven in order by run()
+  int build_all_oids_index(const DoutPrefixProvider *dpp);
+  int build_buckets_instance_index(const DoutPrefixProvider *dpp);
+  int build_linked_oids_for_bucket(const DoutPrefixProvider *dpp, const std::string& bucket_instance_id, std::map<int, std::list<std::string> >& oids);
+  int build_linked_oids_index(const DoutPrefixProvider *dpp);
+  int compare_oid_indexes(const DoutPrefixProvider *dpp);
+
+  int run(const DoutPrefixProvider *dpp);
+  int finish();
+};
+
+
+// Lists the rados objects that back RGW objects, either for whole buckets
+// or for selected keys/prefixes within them, emitting the underlying oids.
+class RGWRadosList {
+
+  /*
+   * process_t describes how to process a directory, we will either
+   * process the whole thing (entire_container == true) or a portion
+   * of it (entire_container == false). When we only process a
+   * portion, we will list the specific keys and/or specific lexical
+   * prefixes.
+   */
+  struct process_t {
+    bool entire_container;
+    std::set<rgw_obj_key> filter_keys;
+    std::set<std::string> prefixes;
+
+    process_t() :
+      entire_container(false)
+    {}
+  };
+
+  // bucket name -> what portion of that bucket to process
+  std::map<std::string,process_t> bucket_process_map;
+  std::set<std::string> visited_oids;
+
+  // the emplace calls below are no-ops when the bucket is already mapped;
+  // p.first then points at the existing entry, which is updated in place
+  void add_bucket_entire(const std::string& bucket_name) {
+    auto p = bucket_process_map.emplace(std::make_pair(bucket_name,
+						       process_t()));
+    p.first->second.entire_container = true;
+  }
+
+  void add_bucket_prefix(const std::string& bucket_name,
+			 const std::string& prefix) {
+    auto p = bucket_process_map.emplace(std::make_pair(bucket_name,
+						       process_t()));
+    p.first->second.prefixes.insert(prefix);
+  }
+
+  void add_bucket_filter(const std::string& bucket_name,
+			 const rgw_obj_key& obj_key) {
+    auto p = bucket_process_map.emplace(std::make_pair(bucket_name,
+						       process_t()));
+    p.first->second.filter_keys.insert(obj_key);
+  }
+
+  rgw::sal::RadosStore* store;
+
+  uint16_t max_concurrent_ios;
+  uint64_t stale_secs;
+  std::string tenant_name;
+
+  // when true, output is "<bucket><sep><obj-name><sep><oid>" instead of
+  // just the oid; toggled via set_field_separator()
+  bool include_rgw_obj_name;
+  std::string field_separator;
+
+  int handle_stat_result(const DoutPrefixProvider *dpp,
+			 RGWRados::Object::Stat::Result& result,
+			 std::string& bucket_name,
+			 rgw_obj_key& obj_key,
+			 std::set<std::string>& obj_oids);
+  int pop_and_handle_stat_op(const DoutPrefixProvider *dpp,
+			     RGWObjectCtx& obj_ctx,
+			     std::deque<RGWRados::Object::Stat>& ops);
+
+public:
+
+  RGWRadosList(rgw::sal::RadosStore* _store,
+	       int _max_ios,
+	       uint64_t _stale_secs,
+	       const std::string& _tenant_name) :
+    store(_store),
+    max_concurrent_ios(_max_ios),
+    stale_secs(_stale_secs),
+    tenant_name(_tenant_name),
+    include_rgw_obj_name(false)
+  {}
+
+  int process_bucket(const DoutPrefixProvider *dpp,
+		     const std::string& bucket_instance_id,
+		     const std::string& prefix,
+		     const std::set<rgw_obj_key>& entries_filter);
+
+  int do_incomplete_multipart(const DoutPrefixProvider *dpp,
+			      rgw::sal::Bucket* bucket);
+
+  int build_linked_oids_index();
+
+  int run(const DoutPrefixProvider *dpp,
+	  const std::string& bucket_id,
+	  const bool silent_indexless = false);
+  int run(const DoutPrefixProvider *dpp,
+	  const bool yes_i_really_mean_it = false);
+
+  // if there's a non-empty field separator, that means we'll display
+  // bucket and object names
+  void set_field_separator(const std::string& fs) {
+    field_separator = fs;
+    include_rgw_obj_name = !field_separator.empty();
+  }
+}; // class RGWRadosList
diff --git a/src/rgw/rgw_os_lib.cc b/src/rgw/rgw_os_lib.cc
new file mode 100644
index 000000000..55eb2fb4b
--- /dev/null
+++ b/src/rgw/rgw_os_lib.cc
@@ -0,0 +1,63 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_rest.h"
+#include "rgw_rest_s3.h"
+#include "rgw_rest_user.h"
+#include "rgw_os_lib.h"
+#include "rgw_file.h"
+#include "rgw_lib_frontend.h"
+
+namespace rgw {
+
+/* static */
+  // Parse s->relative_uri into bucket name and (optionally) object key for
+  // librgw/rgw_file requests, also populating s->info.args. Returns 0 even
+  // for URIs without a leading '/' or with an empty path (nothing to set).
+  int RGWHandler_Lib::init_from_header(rgw::sal::Driver* driver,
+				       req_state *s)
+  {
+    string req;
+    string first;
+
+    const char *req_name = s->relative_uri.c_str();
+    const char *p;
+
+    /* skip request_params parsing, rgw_file should not be
+     * seeing any */
+    if (*req_name == '?') {
+      p = req_name;
+    } else {
+      p = s->info.request_params.c_str();
+    }
+
+    s->info.args.set(p);
+    s->info.args.parse(s);
+
+    if (*req_name != '/')
+      return 0;
+
+    req_name++;
+
+    if (!*req_name)
+      return 0;
+
+    req = req_name;
+    // NOTE(review): string::find returns npos, which narrows to -1 in an
+    // int on common platforms; the pos >= 0 checks below rely on that
+    int pos = req.find('/');
+    if (pos >= 0) {
+      first = req.substr(0, pos);
+    } else {
+      first = req;
+    }
+
+    if (s->bucket_name.empty()) {
+      s->bucket_name = std::move(first);
+      if (pos >= 0) {
+	// XXX ugh, another copy
+	string encoded_obj_str = req.substr(pos+1);
+	s->object = driver->get_object(rgw_obj_key(encoded_obj_str, s->info.args.get("versionId")));
+      }
+    } else {
+      s->object = driver->get_object(rgw_obj_key(req_name, s->info.args.get("versionId")));
+    }
+    return 0;
+  } /* init_from_header */
+
+} /* namespace rgw */
diff --git a/src/rgw/rgw_os_lib.h b/src/rgw/rgw_os_lib.h
new file mode 100644
index 000000000..65df0a726
--- /dev/null
+++ b/src/rgw/rgw_os_lib.h
@@ -0,0 +1,9 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <functional>
+#include "rgw_common.h"
+#include "rgw_lib.h"
+
diff --git a/src/rgw/rgw_perf_counters.cc b/src/rgw/rgw_perf_counters.cc
new file mode 100644
index 000000000..fd058ab00
--- /dev/null
+++ b/src/rgw/rgw_perf_counters.cc
@@ -0,0 +1,78 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_perf_counters.h"
+#include "common/perf_counters.h"
+#include "common/ceph_context.h"
+
+PerfCounters *perfcounter = NULL;
+
+// Build and register the global RGW perf counters ("rgw" logger, id range
+// [l_rgw_first, l_rgw_last)) with the context's counter collection.
+// Always returns 0; sets the global 'perfcounter'.
+int rgw_perf_start(CephContext *cct)
+{
+  PerfCountersBuilder plb(cct, "rgw", l_rgw_first, l_rgw_last);
+
+  // RGW emits comparatively few metrics, so let's be generous
+  // and mark them all USEFUL to get transmission to ceph-mgr by default.
+  plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
+
+  plb.add_u64_counter(l_rgw_req, "req", "Requests");
+  plb.add_u64_counter(l_rgw_failed_req, "failed_req", "Aborted requests");
+
+  plb.add_u64_counter(l_rgw_get, "get", "Gets");
+  plb.add_u64_counter(l_rgw_get_b, "get_b", "Size of gets");
+  plb.add_time_avg(l_rgw_get_lat, "get_initial_lat", "Get latency");
+  plb.add_u64_counter(l_rgw_put, "put", "Puts");
+  plb.add_u64_counter(l_rgw_put_b, "put_b", "Size of puts");
+  plb.add_time_avg(l_rgw_put_lat, "put_initial_lat", "Put latency");
+
+  plb.add_u64(l_rgw_qlen, "qlen", "Queue length");
+  plb.add_u64(l_rgw_qactive, "qactive", "Active requests queue");
+
+  plb.add_u64_counter(l_rgw_cache_hit, "cache_hit", "Cache hits");
+  plb.add_u64_counter(l_rgw_cache_miss, "cache_miss", "Cache miss");
+
+  plb.add_u64_counter(l_rgw_keystone_token_cache_hit, "keystone_token_cache_hit", "Keystone token cache hits");
+  plb.add_u64_counter(l_rgw_keystone_token_cache_miss, "keystone_token_cache_miss", "Keystone token cache miss");
+
+  plb.add_u64_counter(l_rgw_gc_retire, "gc_retire_object", "GC object retires");
+
+  plb.add_u64_counter(l_rgw_lc_expire_current, "lc_expire_current",
+		      "Lifecycle current expiration");
+  plb.add_u64_counter(l_rgw_lc_expire_noncurrent, "lc_expire_noncurrent",
+		      "Lifecycle non-current expiration");
+  plb.add_u64_counter(l_rgw_lc_expire_dm, "lc_expire_dm",
+		      "Lifecycle delete-marker expiration");
+  plb.add_u64_counter(l_rgw_lc_transition_current, "lc_transition_current",
+		      "Lifecycle current transition");
+  plb.add_u64_counter(l_rgw_lc_transition_noncurrent,
+		      "lc_transition_noncurrent",
+		      "Lifecycle non-current transition");
+  plb.add_u64_counter(l_rgw_lc_abort_mpu, "lc_abort_mpu",
+		      "Lifecycle abort multipart upload");
+
+  plb.add_u64_counter(l_rgw_pubsub_event_triggered, "pubsub_event_triggered", "Pubsub events with at least one topic");
+  plb.add_u64_counter(l_rgw_pubsub_event_lost, "pubsub_event_lost", "Pubsub events lost");
+  plb.add_u64_counter(l_rgw_pubsub_store_ok, "pubsub_store_ok", "Pubsub events successfully stored");
+  plb.add_u64_counter(l_rgw_pubsub_store_fail, "pubsub_store_fail", "Pubsub events failed to be stored");
+  plb.add_u64(l_rgw_pubsub_events, "pubsub_events", "Pubsub events in store");
+  plb.add_u64_counter(l_rgw_pubsub_push_ok, "pubsub_push_ok", "Pubsub events pushed to an endpoint");
+  plb.add_u64_counter(l_rgw_pubsub_push_failed, "pubsub_push_failed", "Pubsub events failed to be pushed to an endpoint");
+  plb.add_u64(l_rgw_pubsub_push_pending, "pubsub_push_pending", "Pubsub events pending reply from endpoint");
+  plb.add_u64_counter(l_rgw_pubsub_missing_conf, "pubsub_missing_conf", "Pubsub events could not be handled because of missing configuration");
+
+  // fixed description typo: "Successfull" -> "Successful"
+  plb.add_u64_counter(l_rgw_lua_script_ok, "lua_script_ok", "Successful executions of lua scripts");
+  plb.add_u64_counter(l_rgw_lua_script_fail, "lua_script_fail", "Failed executions of lua scripts");
+  plb.add_u64(l_rgw_lua_current_vms, "lua_current_vms", "Number of Lua VMs currently being executed");
+
+  perfcounter = plb.create_perf_counters();
+  cct->get_perfcounters_collection()->add(perfcounter);
+  return 0;
+}
+
+// Tear down the global RGW perf counters: unregister them from the
+// context's collection, free them, and clear the global pointer so any
+// later use (or a repeated stop) fails the assert instead of touching a
+// dangling pointer.
+void rgw_perf_stop(CephContext *cct)
+{
+  ceph_assert(perfcounter);
+  cct->get_perfcounters_collection()->remove(perfcounter);
+  delete perfcounter;
+  perfcounter = nullptr;
+}
+
diff --git a/src/rgw/rgw_perf_counters.h b/src/rgw/rgw_perf_counters.h
new file mode 100644
index 000000000..3c4e4e97f
--- /dev/null
+++ b/src/rgw/rgw_perf_counters.h
@@ -0,0 +1,60 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "include/common_fwd.h"
+
+extern PerfCounters *perfcounter;
+
+extern int rgw_perf_start(CephContext *cct);
+extern void rgw_perf_stop(CephContext *cct);
+
+// Perf counter ids for the "rgw" logger; must stay within
+// [l_rgw_first, l_rgw_last) as passed to PerfCountersBuilder.
+enum {
+  l_rgw_first = 15000,
+  // request totals
+  l_rgw_req,
+  l_rgw_failed_req,
+
+  // GET/PUT counts, byte totals and initial latencies
+  l_rgw_get,
+  l_rgw_get_b,
+  l_rgw_get_lat,
+
+  l_rgw_put,
+  l_rgw_put_b,
+  l_rgw_put_lat,
+
+  // request queue gauges
+  l_rgw_qlen,
+  l_rgw_qactive,
+
+  // metadata cache
+  l_rgw_cache_hit,
+  l_rgw_cache_miss,
+
+  // keystone token cache
+  l_rgw_keystone_token_cache_hit,
+  l_rgw_keystone_token_cache_miss,
+
+  // garbage collection
+  l_rgw_gc_retire,
+
+  // object lifecycle processing
+  l_rgw_lc_expire_current,
+  l_rgw_lc_expire_noncurrent,
+  l_rgw_lc_expire_dm,
+  l_rgw_lc_transition_current,
+  l_rgw_lc_transition_noncurrent,
+  l_rgw_lc_abort_mpu,
+
+  // bucket notifications / pubsub
+  l_rgw_pubsub_event_triggered,
+  l_rgw_pubsub_event_lost,
+  l_rgw_pubsub_store_ok,
+  l_rgw_pubsub_store_fail,
+  l_rgw_pubsub_events,
+  l_rgw_pubsub_push_ok,
+  l_rgw_pubsub_push_failed,
+  l_rgw_pubsub_push_pending,
+  l_rgw_pubsub_missing_conf,
+
+  // lua scripting
+  l_rgw_lua_current_vms,
+  l_rgw_lua_script_ok,
+  l_rgw_lua_script_fail,
+
+  l_rgw_last,
+};
+
diff --git a/src/rgw/rgw_period.cc b/src/rgw/rgw_period.cc
new file mode 100644
index 000000000..1e7de60ea
--- /dev/null
+++ b/src/rgw/rgw_period.cc
@@ -0,0 +1,350 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_sync.h"
+
+using namespace std;
+using namespace rgw_zone_defaults;
+
+// oid suffix/prefix used to compose period object names in the period pool
+std::string period_latest_epoch_info_oid = ".latest_epoch";
+std::string period_info_oid_prefix = "periods.";
+
+// epoch assigned to a freshly created period
+#define FIRST_EPOCH 1
+
+// Initialize the period handle. When setup_obj is set, resolve a missing
+// id from the realm's current period, resolve a missing epoch from the
+// stored latest-epoch object, then read the period's stored info.
+// Returns 0 on success or a negative error code.
+int RGWPeriod::init(const DoutPrefixProvider *dpp,
+                    CephContext *_cct, RGWSI_SysObj *_sysobj_svc,
+		    optional_yield y, bool setup_obj)
+{
+  cct = _cct;
+  sysobj_svc = _sysobj_svc;
+
+  if (!setup_obj)
+    return 0;
+
+  if (id.empty()) {
+    // no explicit period id: use the current period of the realm
+    RGWRealm realm(realm_id, realm_name);
+    int ret = realm.init(dpp, cct, sysobj_svc, y);
+    if (ret < 0) {
+      ldpp_dout(dpp, 4) << "RGWPeriod::init failed to init realm " << realm_name << " id " << realm_id << " : " <<
+	cpp_strerror(-ret) << dendl;
+      return ret;
+    }
+    id = realm.get_current_period();
+    realm_id = realm.get_id();
+  }
+
+  if (!epoch) {
+    // no explicit epoch: pick up the stored latest epoch for this period
+    int ret = use_latest_epoch(dpp, y);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "failed to use_latest_epoch period id " << id << " realm " << realm_name  << " id " << realm_id
+	   << " : " << cpp_strerror(-ret) << dendl;
+      return ret;
+    }
+  }
+
+  return read_info(dpp, y);
+}
+
+// Convenience overload: record the realm id/name first, then delegate to
+// the main init() above.
+int RGWPeriod::init(const DoutPrefixProvider *dpp, CephContext *_cct, RGWSI_SysObj *_sysobj_svc,
+		    const string& period_realm_id, optional_yield y,
+		    const string& period_realm_name, bool setup_obj)
+{
+  cct = _cct;
+  sysobj_svc = _sysobj_svc;
+
+  realm_id = period_realm_id;
+  realm_name = period_realm_name;
+
+  if (!setup_obj)
+    return 0;
+
+  return init(dpp, _cct, _sysobj_svc, y, setup_obj);
+}
+
+// oid suffix for the latest-epoch object; configurable via
+// rgw_period_latest_epoch_info_oid, defaulting to ".latest_epoch"
+const string& RGWPeriod::get_latest_epoch_oid() const
+{
+  if (cct->_conf->rgw_period_latest_epoch_info_oid.empty()) {
+    return period_latest_epoch_info_oid;
+  }
+  return cct->_conf->rgw_period_latest_epoch_info_oid;
+}
+
+// fixed "periods." prefix shared by all period objects
+const string& RGWPeriod::get_info_oid_prefix() const
+{
+  return period_info_oid_prefix;
+}
+
+// "periods.<id>" — the oid prefix for this specific period's objects
+const string RGWPeriod::get_period_oid_prefix() const
+{
+  return get_info_oid_prefix() + id;
+}
+
+// "periods.<id>.<epoch>" for committed periods; the staging period has a
+// single mutable object, so its oid omits the epoch
+const string RGWPeriod::get_period_oid() const
+{
+  std::ostringstream oss;
+  oss << get_period_oid_prefix();
+  // skip the epoch for the staging period
+  if (id != get_staging_id(realm_id))
+    oss << "." << epoch;
+  return oss.str();
+}
+
+// Look up a zone id in this period's map; on success copy its zonegroup
+// into *pzonegroup. Returns whether the zone was found.
+// NOTE(review): dpp and y are unused here, and the matched RGWZone itself
+// is discarded — only the zonegroup is reported back.
+bool RGWPeriod::find_zone(const DoutPrefixProvider *dpp,
+                          const rgw_zone_id& zid,
+			  RGWZoneGroup *pzonegroup,
+			  optional_yield y) const
+{
+  RGWZoneGroup zg;
+  RGWZone zone;
+
+  bool found = period_map.find_zone_by_id(zid, &zg, &zone);
+  if (found) {
+    *pzonegroup = zg;
+  }
+
+  return found;
+}
+
+// pool holding period objects; configurable via rgw_period_root_pool,
+// falling back to the compile-time default
+rgw_pool RGWPeriod::get_pool(CephContext *cct) const
+{
+  if (cct->_conf->rgw_period_root_pool.empty()) {
+    return rgw_pool(RGW_DEFAULT_PERIOD_ROOT_POOL);
+  }
+  return rgw_pool(cct->_conf->rgw_period_root_pool);
+}
+
+// Write the latest-epoch object for this period. With 'exclusive' set the
+// write fails with -EEXIST if the object already exists (used for the
+// initial atomic create).
+// NOTE(review): the objv tracker parameter is accepted but not applied to
+// the write op here — confirm whether version-conditional writes were
+// intended.
+int RGWPeriod::set_latest_epoch(const DoutPrefixProvider *dpp,
+				optional_yield y,
+				epoch_t epoch, bool exclusive,
+                                RGWObjVersionTracker *objv)
+{
+  string oid = get_period_oid_prefix() + get_latest_epoch_oid();
+
+  rgw_pool pool(get_pool(cct));
+  bufferlist bl;
+
+  RGWPeriodLatestEpochInfo info;
+  info.epoch = epoch;
+
+  using ceph::encode;
+  encode(info, bl);
+
+  auto sysobj = sysobj_svc->get_obj(rgw_raw_obj(pool, oid));
+  return sysobj.wop()
+               .set_exclusive(exclusive)
+               .write(dpp, bl, y);
+}
+
+// Read and decode this period's stored object (selected by id/epoch) into
+// *this. Returns 0 on success, a negative error on read failure, or -EIO
+// when the stored blob fails to decode.
+int RGWPeriod::read_info(const DoutPrefixProvider *dpp, optional_yield y)
+{
+  rgw_pool pool(get_pool(cct));
+
+  bufferlist bl;
+
+  auto sysobj = sysobj_svc->get_obj(rgw_raw_obj{pool, get_period_oid()});
+  int ret = sysobj.rop().read(dpp, &bl, y);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "failed reading obj info from " << pool << ":" << get_period_oid() << ": " << cpp_strerror(-ret) << dendl;
+    return ret;
+  }
+
+  try {
+    using ceph::decode;
+    auto iter = bl.cbegin();
+    decode(*this, iter);
+  } catch (buffer::error& err) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to decode obj from " << pool << ":" << get_period_oid() << dendl;
+    return -EIO;
+  }
+
+  return 0;
+}
+
+// Encode and write this period to its object in the period pool. With
+// 'exclusive' set the write fails if the object already exists.
+int RGWPeriod::store_info(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y)
+{
+  rgw_pool pool(get_pool(cct));
+
+  string oid = get_period_oid();
+  bufferlist bl;
+  using ceph::encode;
+  encode(*this, bl);
+
+  auto sysobj = sysobj_svc->get_obj(rgw_raw_obj(pool, oid));
+  return sysobj.wop()
+               .set_exclusive(exclusive)
+               .write(dpp, bl, y);
+}
+
+// Create a brand-new period: generate a fresh uuid id, start at
+// FIRST_EPOCH, store the period object, then record the latest epoch.
+// Returns 0 on success or the first failing step's error code.
+int RGWPeriod::create(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive)
+{
+  int ret;
+
+  /* create unique id */
+  uuid_d new_uuid;
+  char uuid_str[37];
+  new_uuid.generate_random();
+  new_uuid.print(uuid_str);
+  id = uuid_str;
+
+  epoch = FIRST_EPOCH;
+
+  period_map.id = id;
+
+  ret = store_info(dpp, exclusive, y);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR:  storing info for " << id << ": " << cpp_strerror(-ret) << dendl;
+    return ret;
+  }
+
+  ret = set_latest_epoch(dpp, y, epoch);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: setting latest epoch " << id << ": " << cpp_strerror(-ret) << dendl;
+  }
+
+  return ret;
+}
+
+// Apply this period's contents to the local configuration: persist every
+// zonegroup in the period map (promoting the master zonegroup to default
+// if none is set) and write out the period config.
+int RGWPeriod::reflect(const DoutPrefixProvider *dpp, optional_yield y)
+{
+  for (auto& iter : period_map.zonegroups) {
+    RGWZoneGroup& zg = iter.second;
+    zg.reinit_instance(cct, sysobj_svc);
+    int r = zg.write(dpp, false, y);
+    if (r < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to store zonegroup info for zonegroup=" << iter.first << ": " << cpp_strerror(-r) << dendl;
+      return r;
+    }
+    if (zg.is_master_zonegroup()) {
+      // set master as default if no default exists
+      r = zg.set_as_default(dpp, y, true);
+      if (r == 0) {
+        ldpp_dout(dpp, 1) << "Set the period's master zonegroup " << zg.get_id()
+            << " as the default" << dendl;
+      }
+    }
+  }
+
+  int r = period_config.write(dpp, sysobj_svc, realm_id, y);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to store period config: "
+        << cpp_strerror(-r) << dendl;
+    return r;
+  }
+  return 0;
+}
+
+// JSON serialization of the full period; field set mirrors decode_json()
+void RGWPeriod::dump(Formatter *f) const
+{
+  encode_json("id", id, f);
+  encode_json("epoch", epoch , f);
+  encode_json("predecessor_uuid", predecessor_uuid, f);
+  encode_json("sync_status", sync_status, f);
+  encode_json("period_map", period_map, f);
+  encode_json("master_zonegroup", master_zonegroup, f);
+  encode_json("master_zone", master_zone, f);
+  encode_json("period_config", period_config, f);
+  encode_json("realm_id", realm_id, f);
+  encode_json("realm_name", realm_name, f);
+  encode_json("realm_epoch", realm_epoch, f);
+}
+
+// JSON deserialization; must stay field-for-field in sync with dump()
+void RGWPeriod::decode_json(JSONObj *obj)
+{
+  JSONDecoder::decode_json("id", id, obj);
+  JSONDecoder::decode_json("epoch", epoch, obj);
+  JSONDecoder::decode_json("predecessor_uuid", predecessor_uuid, obj);
+  JSONDecoder::decode_json("sync_status", sync_status, obj);
+  JSONDecoder::decode_json("period_map", period_map, obj);
+  JSONDecoder::decode_json("master_zonegroup", master_zonegroup, obj);
+  JSONDecoder::decode_json("master_zone", master_zone, obj);
+  JSONDecoder::decode_json("period_config", period_config, obj);
+  JSONDecoder::decode_json("realm_id", realm_id, obj);
+  JSONDecoder::decode_json("realm_name", realm_name, obj);
+  JSONDecoder::decode_json("realm_epoch", realm_epoch, obj);
+}
+
+// Advance the stored latest epoch to 'epoch' with read-modify-write
+// retries. Returns 0 on success, -EEXIST if the stored epoch is already
+// >= the given one, or -ECANCELED after exhausting retries against
+// concurrent writers.
+int RGWPeriod::update_latest_epoch(const DoutPrefixProvider *dpp, epoch_t epoch, optional_yield y)
+{
+  static constexpr int MAX_RETRIES = 20;
+
+  for (int i = 0; i < MAX_RETRIES; i++) {
+    RGWPeriodLatestEpochInfo info;
+    RGWObjVersionTracker objv;
+    bool exclusive = false;
+
+    // read existing epoch
+    int r = read_latest_epoch(dpp, info, y, &objv);
+    if (r == -ENOENT) {
+      // use an exclusive create to set the epoch atomically
+      exclusive = true;
+      ldpp_dout(dpp, 20) << "creating initial latest_epoch=" << epoch
+          << " for period=" << id << dendl;
+    } else if (r < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to read latest_epoch" << dendl;
+      return r;
+    } else if (epoch <= info.epoch) {
+      r = -EEXIST; // fail with EEXIST if epoch is not newer
+      ldpp_dout(dpp, 10) << "found existing latest_epoch " << info.epoch
+          << " >= given epoch " << epoch << ", returning r=" << r << dendl;
+      return r;
+    } else {
+      ldpp_dout(dpp, 20) << "updating latest_epoch from " << info.epoch
+          << " -> " << epoch << " on period=" << id << dendl;
+    }
+
+    r = set_latest_epoch(dpp, y, epoch, exclusive, &objv);
+    if (r == -EEXIST) {
+      continue; // exclusive create raced with another update, retry
+    } else if (r == -ECANCELED) {
+      continue; // write raced with a conflicting version, retry
+    }
+    if (r < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: failed to write latest_epoch" << dendl;
+      return r;
+    }
+    return 0; // return success
+  }
+
+  return -ECANCELED; // fail after max retries
+}
+
+// Read and decode the latest-epoch object for this period into 'info'.
+// Returns 0 on success, the read error (e.g. -ENOENT when it has never
+// been written), or -EIO when the stored blob fails to decode.
+// NOTE(review): the objv tracker parameter is accepted but not applied to
+// the read op — confirm whether conditional reads were intended.
+int RGWPeriod::read_latest_epoch(const DoutPrefixProvider *dpp,
+                                 RGWPeriodLatestEpochInfo& info,
+				 optional_yield y,
+                                 RGWObjVersionTracker *objv)
+{
+  string oid = get_period_oid_prefix() + get_latest_epoch_oid();
+
+  rgw_pool pool(get_pool(cct));
+  bufferlist bl;
+  auto sysobj = sysobj_svc->get_obj(rgw_raw_obj{pool, oid});
+  int ret = sysobj.rop().read(dpp, &bl, y);
+  if (ret < 0) {
+    // log-message typo fixed: "read_lastest_epoch" -> "read_latest_epoch"
+    ldpp_dout(dpp, 1) << "error read_latest_epoch " << pool << ":" << oid << dendl;
+    return ret;
+  }
+  try {
+    auto iter = bl.cbegin();
+    using ceph::decode;
+    decode(info, iter);
+  } catch (buffer::error& err) {
+    ldpp_dout(dpp, 0) << "error decoding data from " << pool << ":" << oid << dendl;
+    return -EIO;
+  }
+
+  return 0;
+}
+
+// Set this handle's epoch from the stored latest-epoch object; propagates
+// any read error unchanged.
+int RGWPeriod::use_latest_epoch(const DoutPrefixProvider *dpp, optional_yield y)
+{
+  RGWPeriodLatestEpochInfo info;
+  int ret = read_latest_epoch(dpp, info, y);
+  if (ret < 0) {
+    return ret;
+  }
+
+  epoch = info.epoch;
+
+  return 0;
+}
+
diff --git a/src/rgw/rgw_period_history.cc b/src/rgw/rgw_period_history.cc
new file mode 100644
index 000000000..abbd998cf
--- /dev/null
+++ b/src/rgw/rgw_period_history.cc
@@ -0,0 +1,353 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_period_history.h"
+#include "rgw_zone.h"
+
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+#undef dout_prefix
+#define dout_prefix (*_dout << "rgw period history: ")
+
+/// an ordered history of consecutive periods
+/// 'periods' holds realm-epoch-consecutive periods, so a period can be
+/// addressed by epoch offset from the front. All accessors assume the
+/// deque is non-empty (front()/back() on an empty deque is UB).
+class RGWPeriodHistory::History : public bi::avl_set_base_hook<> {
+ public:
+  std::deque<RGWPeriod> periods;
+
+  epoch_t get_oldest_epoch() const {
+    return periods.front().get_realm_epoch();
+  }
+  epoch_t get_newest_epoch() const {
+    return periods.back().get_realm_epoch();
+  }
+  bool contains(epoch_t epoch) const {
+    return get_oldest_epoch() <= epoch && epoch <= get_newest_epoch();
+  }
+  // epoch-consecutive storage lets us index directly by epoch offset
+  RGWPeriod& get(epoch_t epoch) {
+    return periods[epoch - get_oldest_epoch()];
+  }
+  const RGWPeriod& get(epoch_t epoch) const {
+    return periods[epoch - get_oldest_epoch()];
+  }
+  const std::string& get_predecessor_id() const {
+    return periods.front().get_predecessor();
+  }
+};
+
+/// value comparison for avl_set: histories are ordered by newest epoch,
+/// which is safe because disjoint histories never overlap
+bool operator<(const RGWPeriodHistory::History& lhs,
+               const RGWPeriodHistory::History& rhs)
+{
+  return lhs.get_newest_epoch() < rhs.get_newest_epoch();
+}
+
+/// key-value comparison for avl_set lookups by epoch (lower_bound etc.)
+struct NewestEpochLess {
+  bool operator()(const RGWPeriodHistory::History& value, epoch_t key) const {
+    return value.get_newest_epoch() < key;
+  }
+};
+
+
+using Cursor = RGWPeriodHistory::Cursor;
+
+// Return the period this cursor points at; locks the shared mutex since
+// the underlying history can grow concurrently.
+const RGWPeriod& Cursor::get_period() const
+{
+  std::lock_guard<std::mutex> lock(*mutex);
+  return history->get(epoch);
+}
+// True when an earlier period exists in this cursor's history.
+bool Cursor::has_prev() const
+{
+  std::lock_guard<std::mutex> lock(*mutex);
+  return epoch > history->get_oldest_epoch();
+}
+// True when a later period exists in this cursor's history.
+bool Cursor::has_next() const
+{
+  std::lock_guard<std::mutex> lock(*mutex);
+  return epoch < history->get_newest_epoch();
+}
+
+// Cursors are equal when they reference the same history object at the
+// same realm epoch.
+bool operator==(const Cursor& lhs, const Cursor& rhs)
+{
+  return lhs.history == rhs.history && lhs.epoch == rhs.epoch;
+}
+
+// defined in terms of operator== for consistency
+bool operator!=(const Cursor& lhs, const Cursor& rhs)
+{
+  return !(lhs == rhs);
+}
+
+// Private implementation of RGWPeriodHistory: maintains a set of disjoint
+// period histories, pulling missing predecessor periods on demand to
+// connect them into the history containing the realm's current period.
+class RGWPeriodHistory::Impl final {
+ public:
+  Impl(CephContext* cct, Puller* puller, const RGWPeriod& current_period);
+  ~Impl();
+
+  Cursor get_current() const { return current_cursor; }
+  Cursor attach(const DoutPrefixProvider *dpp, RGWPeriod&& period, optional_yield y);
+  Cursor insert(RGWPeriod&& period);
+  Cursor lookup(epoch_t realm_epoch);
+
+ private:
+  /// an intrusive set of histories, ordered by their newest epoch. although
+  /// the newest epoch of each history is mutable, the ordering cannot change
+  /// because we prevent the histories from overlapping
+  using Set = bi::avl_set<RGWPeriodHistory::History>;
+
+  /// insert the given period into the period history, creating new unconnected
+  /// histories or merging existing histories as necessary. expects the caller
+  /// to hold a lock on mutex. returns a valid cursor regardless of whether it
+  /// ends up in current_history, though cursors in other histories are only
+  /// valid within the context of the lock
+  Cursor insert_locked(RGWPeriod&& period);
+
+  /// merge the periods from the src history onto the end of the dst history,
+  /// and return an iterator to the merged history
+  Set::iterator merge(Set::iterator dst, Set::iterator src);
+
+  /// construct a Cursor object using Cursor's private constructor
+  Cursor make_cursor(Set::const_iterator history, epoch_t epoch);
+
+  CephContext *const cct;
+  Puller *const puller; //< interface for pulling missing periods
+  Cursor current_cursor; //< Cursor to realm's current period
+
+  mutable std::mutex mutex; //< protects the histories
+
+  /// set of disjoint histories that are missing intermediate periods needed to
+  /// connect them together
+  Set histories;
+
+  /// iterator to the history that contains the realm's current period
+  Set::const_iterator current_history;
+};
+
+// Seed the history set with the realm's current period (if any); with an
+// empty current period, current_history stays end() and attach()/insert()
+// will refuse to operate.
+RGWPeriodHistory::Impl::Impl(CephContext* cct, Puller* puller,
+                             const RGWPeriod& current_period)
+  : cct(cct), puller(puller)
+{
+  if (!current_period.get_id().empty()) {
+    // copy the current period into a new history
+    auto history = new History;
+    history->periods.push_back(current_period);
+
+    // insert as our current history
+    current_history = histories.insert(*history).first;
+
+    // get a cursor to the current period
+    current_cursor = make_cursor(current_history, current_period.get_realm_epoch());
+  } else {
+    current_history = histories.end();
+  }
+}
+
+// Histories are heap-allocated but intrusively linked, so the set must
+// dispose of each node as it unlinks it.
+RGWPeriodHistory::Impl::~Impl()
+{
+  // clear the histories and delete each entry
+  histories.clear_and_dispose(std::default_delete<History>{});
+}
+
+// Insert the given period and keep pulling predecessor periods (via the
+// Puller, outside the lock) until the current history contains the
+// requested realm epoch. Returns a cursor to that epoch, or an error
+// cursor (-EINVAL with no current history or an empty predecessor id;
+// puller/insert errors are propagated).
+Cursor RGWPeriodHistory::Impl::attach(const DoutPrefixProvider *dpp, RGWPeriod&& period, optional_yield y)
+{
+  if (current_history == histories.end()) {
+    return Cursor{-EINVAL};
+  }
+
+  const auto epoch = period.get_realm_epoch();
+
+  std::string predecessor_id;
+  for (;;) {
+    {
+      // hold the lock over insert, and while accessing the unsafe cursor
+      std::lock_guard<std::mutex> lock(mutex);
+
+      auto cursor = insert_locked(std::move(period));
+      if (!cursor) {
+        return cursor;
+      }
+      if (current_history->contains(epoch)) {
+        break; // the history is complete
+      }
+
+      // take the predecessor id of the most recent history
+      if (cursor.get_epoch() > current_cursor.get_epoch()) {
+        predecessor_id = cursor.history->get_predecessor_id();
+      } else {
+        predecessor_id = current_history->get_predecessor_id();
+      }
+    }
+
+    if (predecessor_id.empty()) {
+      ldpp_dout(dpp, -1) << "reached a period with an empty predecessor id" << dendl;
+      return Cursor{-EINVAL};
+    }
+
+    // pull the period outside of the lock
+    int r = puller->pull(dpp, predecessor_id, period, y);
+    if (r < 0) {
+      return Cursor{r};
+    }
+  }
+
+  // return a cursor to the requested period
+  return make_cursor(current_history, epoch);
+}
+
+// Insert a period under the lock. Returns a usable cursor only when the
+// period landed in the current history; otherwise an empty cursor (or an
+// error cursor from insert_locked).
+Cursor RGWPeriodHistory::Impl::insert(RGWPeriod&& period)
+{
+  if (current_history == histories.end()) {
+    return Cursor{-EINVAL};
+  }
+
+  std::lock_guard<std::mutex> lock(mutex);
+
+  auto cursor = insert_locked(std::move(period));
+
+  if (cursor.get_error()) {
+    return cursor;
+  }
+  // we can only provide cursors that are safe to use outside of the mutex if
+  // they're within the current_history, because other histories can disappear
+  // in a merge. see merge() for the special handling of current_history
+  if (cursor.history == &*current_history) {
+    return cursor;
+  }
+  return Cursor{};
+}
+
+Cursor RGWPeriodHistory::Impl::lookup(epoch_t realm_epoch)
+{
+ if (current_history != histories.end() &&
+ current_history->contains(realm_epoch)) {
+ return make_cursor(current_history, realm_epoch);
+ }
+ return Cursor{};
+}
+
+Cursor RGWPeriodHistory::Impl::insert_locked(RGWPeriod&& period)
+{
+ auto epoch = period.get_realm_epoch();
+
+ // find the first history whose newest epoch comes at or after this period
+ auto i = histories.lower_bound(epoch, NewestEpochLess{});
+
+ if (i == histories.end()) {
+ // epoch is past the end of our newest history
+ auto last = --Set::iterator{i}; // last = i - 1
+
+ if (epoch == last->get_newest_epoch() + 1) {
+ // insert at the back of the last history
+ last->periods.emplace_back(std::move(period));
+ return make_cursor(last, epoch);
+ }
+
+ // create a new history for this period
+ auto history = new History;
+ history->periods.emplace_back(std::move(period));
+ histories.insert(last, *history);
+
+ i = Set::s_iterator_to(*history);
+ return make_cursor(i, epoch);
+ }
+
+ if (i->contains(epoch)) {
+ // already resident in this history
+ auto& existing = i->get(epoch);
+ // verify that the period ids match; otherwise we've forked the history
+ if (period.get_id() != existing.get_id()) {
+ lderr(cct) << "Got two different periods, " << period.get_id()
+ << " and " << existing.get_id() << ", with the same realm epoch "
+ << epoch << "! This indicates a fork in the period history." << dendl;
+ return Cursor{-EEXIST};
+ }
+ // update the existing period if we got a newer period epoch
+ if (period.get_epoch() > existing.get_epoch()) {
+ existing = std::move(period);
+ }
+ return make_cursor(i, epoch);
+ }
+
+ if (epoch + 1 == i->get_oldest_epoch()) {
+ // insert at the front of this history
+ i->periods.emplace_front(std::move(period));
+
+ // try to merge with the previous history
+ if (i != histories.begin()) {
+ auto prev = --Set::iterator{i};
+ if (epoch == prev->get_newest_epoch() + 1) {
+ i = merge(prev, i);
+ }
+ }
+ return make_cursor(i, epoch);
+ }
+
+ if (i != histories.begin()) {
+ auto prev = --Set::iterator{i};
+ if (epoch == prev->get_newest_epoch() + 1) {
+ // insert at the back of the previous history
+ prev->periods.emplace_back(std::move(period));
+ return make_cursor(prev, epoch);
+ }
+ }
+
+ // create a new history for this period
+ auto history = new History;
+ history->periods.emplace_back(std::move(period));
+ histories.insert(i, *history);
+
+ i = Set::s_iterator_to(*history);
+ return make_cursor(i, epoch);
+}
+
+RGWPeriodHistory::Impl::Set::iterator
+RGWPeriodHistory::Impl::merge(Set::iterator dst, Set::iterator src)
+{
+ ceph_assert(dst->get_newest_epoch() + 1 == src->get_oldest_epoch());
+
+ // always merge into current_history
+ if (src == current_history) {
+ // move the periods from dst onto the front of src
+ src->periods.insert(src->periods.begin(),
+ std::make_move_iterator(dst->periods.begin()),
+ std::make_move_iterator(dst->periods.end()));
+ histories.erase_and_dispose(dst, std::default_delete<History>{});
+ return src;
+ }
+
+ // move the periods from src onto the end of dst
+ dst->periods.insert(dst->periods.end(),
+ std::make_move_iterator(src->periods.begin()),
+ std::make_move_iterator(src->periods.end()));
+ histories.erase_and_dispose(src, std::default_delete<History>{});
+ return dst;
+}
+
+Cursor RGWPeriodHistory::Impl::make_cursor(Set::const_iterator history,
+ epoch_t epoch) {
+ return Cursor{&*history, &mutex, epoch};
+}
+
+
+RGWPeriodHistory::RGWPeriodHistory(CephContext* cct, Puller* puller,
+ const RGWPeriod& current_period)
+ : impl(new Impl(cct, puller, current_period)) {}
+
+RGWPeriodHistory::~RGWPeriodHistory() = default;
+
+Cursor RGWPeriodHistory::get_current() const
+{
+ return impl->get_current();
+}
+Cursor RGWPeriodHistory::attach(const DoutPrefixProvider *dpp, RGWPeriod&& period, optional_yield y)
+{
+ return impl->attach(dpp, std::move(period), y);
+}
+Cursor RGWPeriodHistory::insert(RGWPeriod&& period)
+{
+ return impl->insert(std::move(period));
+}
+Cursor RGWPeriodHistory::lookup(epoch_t realm_epoch)
+{
+ return impl->lookup(realm_epoch);
+}
diff --git a/src/rgw/rgw_period_history.h b/src/rgw/rgw_period_history.h
new file mode 100644
index 000000000..3d18fbf9e
--- /dev/null
+++ b/src/rgw/rgw_period_history.h
@@ -0,0 +1,114 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <deque>
+#include <mutex>
+#include <system_error>
+#include <boost/intrusive/avl_set.hpp>
+#include "include/ceph_assert.h"
+#include "include/types.h"
+#include "common/async/yield_context.h"
+#include "common/dout.h"
+
+namespace bi = boost::intrusive;
+
+class RGWPeriod;
+
+/**
+ * RGWPeriodHistory tracks the relative history of all inserted periods,
+ * coordinates the pulling of missing intermediate periods, and provides a
+ * Cursor object for traversing through the connected history.
+ */
+class RGWPeriodHistory final {
+ private:
+ /// an ordered history of consecutive periods
+ class History;
+
+ // comparisons for avl_set ordering
+ friend bool operator<(const History& lhs, const History& rhs);
+ friend struct NewestEpochLess;
+
+ class Impl;
+ std::unique_ptr<Impl> impl;
+
+ public:
+ /**
+ * Puller is a synchronous interface for pulling periods from the master
+ * zone. The abstraction exists mainly to support unit testing.
+ */
+ class Puller {
+ public:
+ virtual ~Puller() = default;
+
+ virtual int pull(const DoutPrefixProvider *dpp, const std::string& period_id, RGWPeriod& period,
+ optional_yield y) = 0;
+ };
+
+ RGWPeriodHistory(CephContext* cct, Puller* puller,
+ const RGWPeriod& current_period);
+ ~RGWPeriodHistory();
+
+ /**
+ * Cursor tracks a position in the period history and allows forward and
+ * backward traversal. Only periods that are fully connected to the
+ * current_period are reachable via a Cursor, because other histories are
+ * temporary and can be merged away. Cursors to periods in disjoint
+ * histories, as provided by insert() or lookup(), are therefore invalid and
+ * their operator bool() will return false.
+ */
+ class Cursor final {
+ public:
+ Cursor() = default;
+ explicit Cursor(int error) : error(error) {}
+
+ int get_error() const { return error; }
+
+ /// return false for a default-constructed or error Cursor
+ operator bool() const { return history != nullptr; }
+
+ epoch_t get_epoch() const { return epoch; }
+ const RGWPeriod& get_period() const;
+
+ bool has_prev() const;
+ bool has_next() const;
+
+ void prev() { epoch--; }
+ void next() { epoch++; }
+
+ friend bool operator==(const Cursor& lhs, const Cursor& rhs);
+ friend bool operator!=(const Cursor& lhs, const Cursor& rhs);
+
+ private:
+ // private constructors for RGWPeriodHistory
+ friend class RGWPeriodHistory::Impl;
+
+ Cursor(const History* history, std::mutex* mutex, epoch_t epoch)
+ : history(history), mutex(mutex), epoch(epoch) {}
+
+ int error{0};
+ const History* history{nullptr};
+ std::mutex* mutex{nullptr};
+ epoch_t epoch{0}; //< realm epoch of cursor position
+ };
+
+ /// return a cursor to the current period
+ Cursor get_current() const;
+
+ /// build up a connected period history that covers the span between
+ /// current_period and the given period, reading predecessor periods or
+ /// fetching them from the master as necessary. returns a cursor at the
+ /// given period that can be used to traverse the current_history
+ Cursor attach(const DoutPrefixProvider *dpp, RGWPeriod&& period, optional_yield y);
+
+ /// insert the given period into an existing history, or create a new
+ /// unconnected history. similar to attach(), but it doesn't try to fetch
+ /// missing periods. returns a cursor to the inserted period iff it's in
+ /// the current_history
+ Cursor insert(RGWPeriod&& period);
+
+ /// search for a period by realm epoch, returning a valid Cursor iff it's in
+ /// the current_history
+ Cursor lookup(epoch_t realm_epoch);
+};
diff --git a/src/rgw/rgw_period_puller.cc b/src/rgw/rgw_period_puller.cc
new file mode 100644
index 000000000..ea2f28e56
--- /dev/null
+++ b/src/rgw/rgw_period_puller.cc
@@ -0,0 +1,123 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_rados.h"
+#include "rgw_zone.h"
+#include "rgw_rest_conn.h"
+#include "common/ceph_json.h"
+#include "common/errno.h"
+
+#include "services/svc_zone.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+#undef dout_prefix
+#define dout_prefix (*_dout << "rgw period puller: ")
+
+RGWPeriodPuller::RGWPeriodPuller(RGWSI_Zone *zone_svc, RGWSI_SysObj *sysobj_svc)
+{
+ cct = zone_svc->ctx();
+ svc.zone = zone_svc;
+ svc.sysobj = sysobj_svc;
+}
+
+namespace {
+
+// pull the given period over the connection
+int pull_period(const DoutPrefixProvider *dpp, RGWRESTConn* conn, const std::string& period_id,
+ const std::string& realm_id, RGWPeriod& period,
+ optional_yield y)
+{
+ rgw_user user;
+ RGWEnv env;
+ req_info info(conn->get_ctx(), &env);
+ info.method = "GET";
+ info.request_uri = "/admin/realm/period";
+
+ auto& params = info.args.get_params();
+ params["realm_id"] = realm_id;
+ params["period_id"] = period_id;
+
+ bufferlist data;
+#define MAX_REST_RESPONSE (128 * 1024)
+ int r = conn->forward(dpp, user, info, nullptr, MAX_REST_RESPONSE, nullptr, &data, y);
+ if (r < 0) {
+ return r;
+ }
+
+ JSONParser parser;
+ r = parser.parse(data.c_str(), data.length());
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << "request failed: " << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ try {
+ decode_json_obj(period, &parser);
+ } catch (const JSONDecoder::err& e) {
+ ldpp_dout(dpp, -1) << "failed to decode JSON input: "
+ << e.what() << dendl;
+ return -EINVAL;
+ }
+ return 0;
+}
+
+} // anonymous namespace
+
+int RGWPeriodPuller::pull(const DoutPrefixProvider *dpp, const std::string& period_id, RGWPeriod& period,
+ optional_yield y)
+{
+ // try to read the period from rados
+ period.set_id(period_id);
+ period.set_epoch(0);
+ int r = period.init(dpp, cct, svc.sysobj, y);
+ if (r < 0) {
+ if (svc.zone->is_meta_master()) {
+ // can't pull if we're the master
+ ldpp_dout(dpp, 1) << "metadata master failed to read period "
+ << period_id << " from local storage: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ ldpp_dout(dpp, 14) << "pulling period " << period_id
+ << " from master" << dendl;
+ // request the period from the master zone
+ r = pull_period(dpp, svc.zone->get_master_conn(), period_id,
+ svc.zone->get_realm().get_id(), period, y);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << "failed to pull period " << period_id << dendl;
+ return r;
+ }
+ // write the period to rados
+ r = period.store_info(dpp, true, y);
+ if (r == -EEXIST) {
+ r = 0;
+ } else if (r < 0) {
+ ldpp_dout(dpp, -1) << "failed to store period " << period_id << dendl;
+ return r;
+ }
+ // update latest epoch
+ r = period.update_latest_epoch(dpp, period.get_epoch(), y);
+ if (r == -EEXIST) {
+ // already have this epoch (or a more recent one)
+ return 0;
+ }
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << "failed to update latest_epoch for period "
+ << period_id << dendl;
+ return r;
+ }
+ // reflect period objects if this is the latest version
+ if (svc.zone->get_realm().get_current_period() == period_id) {
+ r = period.reflect(dpp, y);
+ if (r < 0) {
+ return r;
+ }
+ }
+ ldpp_dout(dpp, 14) << "period " << period_id
+ << " pulled and written to local storage" << dendl;
+ } else {
+ ldpp_dout(dpp, 14) << "found period " << period_id
+ << " in local storage" << dendl;
+ }
+ return 0;
+}
diff --git a/src/rgw/rgw_period_puller.h b/src/rgw/rgw_period_puller.h
new file mode 100644
index 000000000..88138d36b
--- /dev/null
+++ b/src/rgw/rgw_period_puller.h
@@ -0,0 +1,24 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_period_history.h"
+#include "include/common_fwd.h"
+#include "rgw/services/svc_sys_obj.h"
+
+class RGWPeriod;
+
+class RGWPeriodPuller : public RGWPeriodHistory::Puller {
+ CephContext *cct;
+
+ struct {
+ RGWSI_Zone *zone;
+ RGWSI_SysObj *sysobj;
+ } svc;
+
+ public:
+ explicit RGWPeriodPuller(RGWSI_Zone *zone_svc, RGWSI_SysObj *sysobj_svc);
+
+ int pull(const DoutPrefixProvider *dpp, const std::string& period_id, RGWPeriod& period, optional_yield y) override;
+};
diff --git a/src/rgw/rgw_period_pusher.cc b/src/rgw/rgw_period_pusher.cc
new file mode 100644
index 000000000..d9c899e5c
--- /dev/null
+++ b/src/rgw/rgw_period_pusher.cc
@@ -0,0 +1,316 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <map>
+#include <thread>
+
+#include "rgw_period_pusher.h"
+#include "rgw_cr_rest.h"
+#include "rgw_zone.h"
+#include "rgw_sal.h"
+#include "rgw_sal_rados.h"
+
+#include "services/svc_zone.h"
+
+#include "common/errno.h"
+
+#include <boost/asio/yield.hpp>
+
+#define dout_subsys ceph_subsys_rgw
+
+#undef dout_prefix
+#define dout_prefix (*_dout << "rgw period pusher: ")
+
+/// A coroutine to post the period over the given connection.
+using PushCR = RGWPostRESTResourceCR<RGWPeriod, int>;
+
+/// A coroutine that calls PushCR, and retries with backoff until success.
+class PushAndRetryCR : public RGWCoroutine {
+ const std::string& zone;
+ RGWRESTConn *const conn;
+ RGWHTTPManager *const http;
+ RGWPeriod& period;
+ const std::string epoch; //< epoch string for params
+ double timeout; //< current interval between retries
+ const double timeout_max; //< maximum interval between retries
+ uint32_t counter; //< number of failures since backoff increased
+
+ public:
+ PushAndRetryCR(CephContext* cct, const std::string& zone, RGWRESTConn* conn,
+ RGWHTTPManager* http, RGWPeriod& period)
+ : RGWCoroutine(cct), zone(zone), conn(conn), http(http), period(period),
+ epoch(std::to_string(period.get_epoch())),
+ timeout(cct->_conf->rgw_period_push_interval),
+ timeout_max(cct->_conf->rgw_period_push_interval_max),
+ counter(0)
+ {}
+
+ int operate(const DoutPrefixProvider *dpp) override;
+};
+
+int PushAndRetryCR::operate(const DoutPrefixProvider *dpp)
+{
+ reenter(this) {
+ for (;;) {
+ yield {
+ ldpp_dout(dpp, 10) << "pushing period " << period.get_id()
+ << " to " << zone << dendl;
+ // initialize the http params
+ rgw_http_param_pair params[] = {
+ { "period", period.get_id().c_str() },
+ { "epoch", epoch.c_str() },
+ { nullptr, nullptr }
+ };
+ call(new PushCR(cct, conn, http, "/admin/realm/period",
+ params, period, nullptr));
+ }
+
+ // stop on success
+ if (get_ret_status() == 0) {
+ ldpp_dout(dpp, 10) << "push to " << zone << " succeeded" << dendl;
+ return set_cr_done();
+ }
+
+ // try each endpoint in the connection before waiting
+ if (++counter < conn->get_endpoint_count())
+ continue;
+ counter = 0;
+
+ // wait with exponential backoff up to timeout_max
+ yield {
+ utime_t dur;
+ dur.set_from_double(timeout);
+
+ ldpp_dout(dpp, 10) << "waiting " << dur << "s for retry.." << dendl;
+ wait(dur);
+
+ timeout *= 2;
+ if (timeout > timeout_max)
+ timeout = timeout_max;
+ }
+ }
+ }
+ return 0;
+}
+
+/**
+ * PushAllCR is a coroutine that sends the period over all of the given
+ * connections, retrying until they are all marked as completed.
+ */
+class PushAllCR : public RGWCoroutine {
+ RGWHTTPManager *const http;
+ RGWPeriod period; //< period object to push
+ std::map<std::string, RGWRESTConn> conns; //< zones that need the period
+
+ public:
+ PushAllCR(CephContext* cct, RGWHTTPManager* http, RGWPeriod&& period,
+ std::map<std::string, RGWRESTConn>&& conns)
+ : RGWCoroutine(cct), http(http),
+ period(std::move(period)),
+ conns(std::move(conns))
+ {}
+
+ int operate(const DoutPrefixProvider *dpp) override;
+};
+
+int PushAllCR::operate(const DoutPrefixProvider *dpp)
+{
+ reenter(this) {
+ // spawn a coroutine to push the period over each connection
+ yield {
+ ldpp_dout(dpp, 4) << "sending " << conns.size() << " periods" << dendl;
+ for (auto& c : conns)
+ spawn(new PushAndRetryCR(cct, c.first, &c.second, http, period), false);
+ }
+ // wait for all to complete
+ drain_all();
+ return set_cr_done();
+ }
+ return 0;
+}
+
+/// A background thread to run the PushAllCR coroutine and exit.
+class RGWPeriodPusher::CRThread : public DoutPrefixProvider {
+ CephContext* cct;
+ RGWCoroutinesManager coroutines;
+ RGWHTTPManager http;
+ boost::intrusive_ptr<PushAllCR> push_all;
+ std::thread thread;
+
+ public:
+ CRThread(CephContext* cct, RGWPeriod&& period,
+ std::map<std::string, RGWRESTConn>&& conns)
+ : cct(cct), coroutines(cct, NULL),
+ http(cct, coroutines.get_completion_mgr()),
+ push_all(new PushAllCR(cct, &http, std::move(period), std::move(conns)))
+ {
+ http.start();
+ // must spawn the CR thread after start
+ thread = std::thread([this]() noexcept { coroutines.run(this, push_all.get()); });
+ }
+ ~CRThread()
+ {
+ push_all.reset();
+ coroutines.stop();
+ http.stop();
+ if (thread.joinable())
+ thread.join();
+ }
+
+ CephContext *get_cct() const override { return cct; }
+ unsigned get_subsys() const override { return dout_subsys; }
+ std::ostream& gen_prefix(std::ostream& out) const override { return out << "rgw period pusher CR thread: "; }
+};
+
+
+RGWPeriodPusher::RGWPeriodPusher(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver,
+ optional_yield y)
+ : cct(driver->ctx()), driver(driver)
+{
+ rgw::sal::Zone* zone = driver->get_zone();
+ auto& realm_id = zone->get_realm_id();
+ if (realm_id.empty()) // no realm configuration
+ return;
+
+ // always send out the current period on startup
+ RGWPeriod period;
+ // XXX dang
+ int r = period.init(dpp, cct, static_cast<rgw::sal::RadosStore* >(driver)->svc()->sysobj, realm_id, y, zone->get_realm_name());
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << "failed to load period for realm " << realm_id << dendl;
+ return;
+ }
+
+ std::lock_guard<std::mutex> lock(mutex);
+ handle_notify(std::move(period));
+}
+
+// destructor is here because CRThread is incomplete in the header
+RGWPeriodPusher::~RGWPeriodPusher() = default;
+
+void RGWPeriodPusher::handle_notify(RGWRealmNotify type,
+ bufferlist::const_iterator& p)
+{
+ // decode the period
+ RGWZonesNeedPeriod info;
+ try {
+ decode(info, p);
+ } catch (buffer::error& e) {
+ lderr(cct) << "Failed to decode the period: " << e.what() << dendl;
+ return;
+ }
+
+ std::lock_guard<std::mutex> lock(mutex);
+
+ // we can't process this notification without access to our current realm
+ // configuration. queue it until resume()
+ if (driver == nullptr) {
+ pending_periods.emplace_back(std::move(info));
+ return;
+ }
+
+ handle_notify(std::move(info));
+}
+
+// expects the caller to hold a lock on mutex
+void RGWPeriodPusher::handle_notify(RGWZonesNeedPeriod&& period)
+{
+ if (period.get_realm_epoch() < realm_epoch) {
+ ldout(cct, 10) << "period's realm epoch " << period.get_realm_epoch()
+ << " is not newer than current realm epoch " << realm_epoch
+ << ", discarding update" << dendl;
+ return;
+ }
+ if (period.get_realm_epoch() == realm_epoch &&
+ period.get_epoch() <= period_epoch) {
+ ldout(cct, 10) << "period epoch " << period.get_epoch() << " is not newer "
+ "than current epoch " << period_epoch << ", discarding update" << dendl;
+ return;
+ }
+
+ // find our zonegroup in the new period
+ auto& zonegroups = period.get_map().zonegroups;
+ auto i = zonegroups.find(driver->get_zone()->get_zonegroup().get_id());
+ if (i == zonegroups.end()) {
+ lderr(cct) << "The new period does not contain my zonegroup!" << dendl;
+ return;
+ }
+ auto& my_zonegroup = i->second;
+
+ // if we're not a master zone, we're not responsible for pushing any updates
+ if (my_zonegroup.master_zone != driver->get_zone()->get_id())
+ return;
+
+ // construct a map of the zones that need this period. the map uses the same
+ // keys/ordering as the zone[group] map, so we can use a hint for insertions
+ std::map<std::string, RGWRESTConn> conns;
+ auto hint = conns.end();
+
+ // are we the master zonegroup in this period?
+ if (period.get_map().master_zonegroup == driver->get_zone()->get_zonegroup().get_id()) {
+ // update other zonegroup endpoints
+ for (auto& zg : zonegroups) {
+ auto& zonegroup = zg.second;
+ if (zonegroup.get_id() == driver->get_zone()->get_zonegroup().get_id())
+ continue;
+ if (zonegroup.endpoints.empty())
+ continue;
+
+ hint = conns.emplace_hint(
+ hint, std::piecewise_construct,
+ std::forward_as_tuple(zonegroup.get_id()),
+ std::forward_as_tuple(cct, driver, zonegroup.get_id(), zonegroup.endpoints, zonegroup.api_name));
+ }
+ }
+
+ // update other zone endpoints
+ for (auto& z : my_zonegroup.zones) {
+ auto& zone = z.second;
+ if (zone.id == driver->get_zone()->get_id())
+ continue;
+ if (zone.endpoints.empty())
+ continue;
+
+ hint = conns.emplace_hint(
+ hint, std::piecewise_construct,
+ std::forward_as_tuple(zone.id),
+ std::forward_as_tuple(cct, driver, zone.id, zone.endpoints, my_zonegroup.api_name));
+ }
+
+ if (conns.empty()) {
+ ldout(cct, 4) << "No zones to update" << dendl;
+ return;
+ }
+
+ realm_epoch = period.get_realm_epoch();
+ period_epoch = period.get_epoch();
+
+ ldout(cct, 4) << "Zone master pushing period " << period.get_id()
+ << " epoch " << period_epoch << " to "
+ << conns.size() << " other zones" << dendl;
+
+ // spawn a new coroutine thread, destroying the previous one
+ cr_thread.reset(new CRThread(cct, std::move(period), std::move(conns)));
+}
+
+void RGWPeriodPusher::pause()
+{
+ ldout(cct, 4) << "paused for realm update" << dendl;
+ std::lock_guard<std::mutex> lock(mutex);
+ driver = nullptr;
+}
+
+void RGWPeriodPusher::resume(rgw::sal::Driver* driver)
+{
+ std::lock_guard<std::mutex> lock(mutex);
+ this->driver = driver;
+
+ ldout(cct, 4) << "resume with " << pending_periods.size()
+ << " periods pending" << dendl;
+
+ // process notification queue
+ for (auto& info : pending_periods) {
+ handle_notify(std::move(info));
+ }
+ pending_periods.clear();
+}
diff --git a/src/rgw/rgw_period_pusher.h b/src/rgw/rgw_period_pusher.h
new file mode 100644
index 000000000..3ea7bd7dd
--- /dev/null
+++ b/src/rgw/rgw_period_pusher.h
@@ -0,0 +1,54 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <memory>
+#include <mutex>
+#include <vector>
+
+#include "common/async/yield_context.h"
+#include "rgw_realm_reloader.h"
+#include "rgw_sal_fwd.h"
+
+class RGWPeriod;
+
+// RGWRealmNotify payload for push coordination
+using RGWZonesNeedPeriod = RGWPeriod;
+
+/**
+ * RGWPeriodPusher coordinates with other nodes via the realm watcher to manage
+ * the responsibility for pushing period updates to other zones or zonegroups.
+ */
+class RGWPeriodPusher final : public RGWRealmWatcher::Watcher,
+ public RGWRealmReloader::Pauser {
+ public:
+ explicit RGWPeriodPusher(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, optional_yield y);
+ ~RGWPeriodPusher() override;
+
+ /// respond to realm notifications by pushing new periods to other zones
+ void handle_notify(RGWRealmNotify type, bufferlist::const_iterator& p) override;
+
+ /// avoid accessing RGWRados while dynamic reconfiguration is in progress.
+ /// notifications will be enqueued until resume()
+ void pause() override;
+
+ /// continue processing notifications with a new RGWRados instance
+ void resume(rgw::sal::Driver* driver) override;
+
+ private:
+ void handle_notify(RGWZonesNeedPeriod&& period);
+
+ CephContext *const cct;
+ rgw::sal::Driver* driver;
+
+ std::mutex mutex;
+ epoch_t realm_epoch{0}; //< the current realm epoch being sent
+ epoch_t period_epoch{0}; //< the current period epoch being sent
+
+ /// while paused for reconfiguration, we need to queue up notifications
+ std::vector<RGWZonesNeedPeriod> pending_periods;
+
+ class CRThread; //< contains thread, coroutine manager, http manager
+ std::unique_ptr<CRThread> cr_thread; //< thread to run the push coroutines
+};
diff --git a/src/rgw/rgw_placement_types.h b/src/rgw/rgw_placement_types.h
new file mode 100644
index 000000000..bcf7a4af7
--- /dev/null
+++ b/src/rgw/rgw_placement_types.h
@@ -0,0 +1,118 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <string>
+
+#include "include/types.h"
+
+#include "common/Formatter.h"
+
+
+static std::string RGW_STORAGE_CLASS_STANDARD = "STANDARD";
+
+struct rgw_placement_rule {
+ std::string name;
+ std::string storage_class;
+
+ rgw_placement_rule() {}
+ rgw_placement_rule(const std::string& _n, const std::string& _sc) : name(_n), storage_class(_sc) {}
+ rgw_placement_rule(const rgw_placement_rule& _r, const std::string& _sc) : name(_r.name) {
+ if (!_sc.empty()) {
+ storage_class = _sc;
+ } else {
+ storage_class = _r.storage_class;
+ }
+ }
+
+ bool empty() const {
+ return name.empty() && storage_class.empty();
+ }
+
+ void inherit_from(const rgw_placement_rule& r) {
+ if (name.empty()) {
+ name = r.name;
+ }
+ if (storage_class.empty()) {
+ storage_class = r.storage_class;
+ }
+ }
+
+ void clear() {
+ name.clear();
+ storage_class.clear();
+ }
+
+ void init(const std::string& n, const std::string& c) {
+ name = n;
+ storage_class = c;
+ }
+
+ static const std::string& get_canonical_storage_class(const std::string& storage_class) {
+ if (storage_class.empty()) {
+ return RGW_STORAGE_CLASS_STANDARD;
+ }
+ return storage_class;
+ }
+
+ const std::string& get_storage_class() const {
+ return get_canonical_storage_class(storage_class);
+ }
+
+ int compare(const rgw_placement_rule& r) const {
+ int c = name.compare(r.name);
+ if (c != 0) {
+ return c;
+ }
+ return get_storage_class().compare(r.get_storage_class());
+ }
+
+ bool operator==(const rgw_placement_rule& r) const {
+ return (name == r.name &&
+ get_storage_class() == r.get_storage_class());
+ }
+
+ bool operator!=(const rgw_placement_rule& r) const {
+ return !(*this == r);
+ }
+
+ void encode(bufferlist& bl) const {
+ /* no ENCODE_START/END due to backward compatibility */
+ std::string s = to_str();
+ ceph::encode(s, bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ std::string s;
+ ceph::decode(s, bl);
+ from_str(s);
+ }
+
+ std::string to_str() const {
+ if (standard_storage_class()) {
+ return name;
+ }
+ return to_str_explicit();
+ }
+
+ std::string to_str_explicit() const {
+ return name + "/" + storage_class;
+ }
+
+ void from_str(const std::string& s) {
+ size_t pos = s.find("/");
+ if (pos == std::string::npos) {
+ name = s;
+ storage_class.clear();
+ return;
+ }
+ name = s.substr(0, pos);
+ storage_class = s.substr(pos + 1);
+ }
+
+ bool standard_storage_class() const {
+ return storage_class.empty() || storage_class == RGW_STORAGE_CLASS_STANDARD;
+ }
+};
+WRITE_CLASS_ENCODER(rgw_placement_rule)
diff --git a/src/rgw/rgw_policy_s3.cc b/src/rgw/rgw_policy_s3.cc
new file mode 100644
index 000000000..e017cc887
--- /dev/null
+++ b/src/rgw/rgw_policy_s3.cc
@@ -0,0 +1,305 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <errno.h>
+
+#include "common/ceph_json.h"
+#include "rgw_policy_s3.h"
+#include "rgw_common.h"
+#include "rgw_crypt_sanitize.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+class RGWPolicyCondition {
+protected:
+ string v1;
+ string v2;
+
+ virtual bool check(const string& first, const string& second, string& err_msg) = 0;
+
+public:
+ virtual ~RGWPolicyCondition() {}
+
+ void set_vals(const string& _v1, const string& _v2) {
+ v1 = _v1;
+ v2 = _v2;
+ }
+
+ bool check(RGWPolicyEnv *env, map<string, bool, ltstr_nocase>& checked_vars, string& err_msg) {
+ string first, second;
+ env->get_value(v1, first, checked_vars);
+ env->get_value(v2, second, checked_vars);
+ dout(1) << "policy condition check " << v1 << " ["
+ << rgw::crypt_sanitize::s3_policy{v1, first}
+ << "] " << v2 << " ["
+ << rgw::crypt_sanitize::s3_policy{v2, second}
+ << "]" << dendl;
+ bool ret = check(first, second, err_msg);
+ if (!ret) {
+ err_msg.append(": ");
+ err_msg.append(v1);
+ err_msg.append(", ");
+ err_msg.append(v2);
+ }
+ return ret;
+ }
+
+};
+
+
+class RGWPolicyCondition_StrEqual : public RGWPolicyCondition {
+protected:
+ bool check(const string& first, const string& second, string& msg) override {
+ bool ret = first.compare(second) == 0;
+ if (!ret) {
+ msg = "Policy condition failed: eq";
+ }
+ return ret;
+ }
+};
+
+class RGWPolicyCondition_StrStartsWith : public RGWPolicyCondition {
+protected:
+ bool check(const string& first, const string& second, string& msg) override {
+ bool ret = first.compare(0, second.size(), second) == 0;
+ if (!ret) {
+ msg = "Policy condition failed: starts-with";
+ }
+ return ret;
+ }
+};
+
+void RGWPolicyEnv::add_var(const string& name, const string& value)
+{
+ vars[name] = value;
+}
+
+bool RGWPolicyEnv::get_var(const string& name, string& val)
+{
+ map<string, string, ltstr_nocase>::iterator iter = vars.find(name);
+ if (iter == vars.end())
+ return false;
+
+ val = iter->second;
+
+ return true;
+}
+
+bool RGWPolicyEnv::get_value(const string& s, string& val, map<string, bool, ltstr_nocase>& checked_vars)
+{
+ if (s.empty() || s[0] != '$') {
+ val = s;
+ return true;
+ }
+
+ const string& var = s.substr(1);
+ checked_vars[var] = true;
+
+ return get_var(var, val);
+}
+
+
+bool RGWPolicyEnv::match_policy_vars(map<string, bool, ltstr_nocase>& policy_vars, string& err_msg)
+{
+ map<string, string, ltstr_nocase>::iterator iter;
+ string ignore_prefix = "x-ignore-";
+ for (iter = vars.begin(); iter != vars.end(); ++iter) {
+ const string& var = iter->first;
+ if (strncasecmp(ignore_prefix.c_str(), var.c_str(), ignore_prefix.size()) == 0)
+ continue;
+ if (policy_vars.count(var) == 0) {
+ err_msg = "Policy missing condition: ";
+ err_msg.append(iter->first);
+ dout(1) << "env var missing in policy: " << iter->first << dendl;
+ return false;
+ }
+ }
+ return true;
+}
+
+RGWPolicy::~RGWPolicy()
+{
+ list<RGWPolicyCondition *>::iterator citer;
+ for (citer = conditions.begin(); citer != conditions.end(); ++citer) {
+ RGWPolicyCondition *cond = *citer;
+ delete cond;
+ }
+}
+
+int RGWPolicy::set_expires(const string& e)
+{
+ struct tm t;
+ if (!parse_iso8601(e.c_str(), &t))
+ return -EINVAL;
+
+ expires = internal_timegm(&t);
+
+ return 0;
+}
+
+int RGWPolicy::add_condition(const string& op, const string& first, const string& second, string& err_msg)
+{
+ RGWPolicyCondition *cond = NULL;
+ if (stringcasecmp(op, "eq") == 0) {
+ cond = new RGWPolicyCondition_StrEqual;
+ } else if (stringcasecmp(op, "starts-with") == 0) {
+ cond = new RGWPolicyCondition_StrStartsWith;
+ } else if (stringcasecmp(op, "content-length-range") == 0) {
+ off_t min, max;
+ int r = stringtoll(first, &min);
+ if (r < 0) {
+ err_msg = "Bad content-length-range param";
+ dout(0) << "bad content-length-range param: " << first << dendl;
+ return r;
+ }
+
+ r = stringtoll(second, &max);
+ if (r < 0) {
+ err_msg = "Bad content-length-range param";
+ dout(0) << "bad content-length-range param: " << second << dendl;
+ return r;
+ }
+
+ if (min > min_length)
+ min_length = min;
+
+ if (max < max_length)
+ max_length = max;
+
+ return 0;
+ }
+
+ if (!cond) {
+ err_msg = "Invalid condition: ";
+ err_msg.append(op);
+ dout(0) << "invalid condition: " << op << dendl;
+ return -EINVAL;
+ }
+
+ cond->set_vals(first, second);
+
+ conditions.push_back(cond);
+
+ return 0;
+}
+
+int RGWPolicy::check(RGWPolicyEnv *env, string& err_msg)
+{
+ uint64_t now = ceph_clock_now().sec();
+ if (expires <= now) {
+ dout(0) << "NOTICE: policy calculated as expired: " << expiration_str << dendl;
+ err_msg = "Policy expired";
+ return -EACCES; // change to condition about expired policy following S3
+ }
+
+ list<pair<string, string> >::iterator viter;
+ for (viter = var_checks.begin(); viter != var_checks.end(); ++viter) {
+ pair<string, string>& p = *viter;
+ const string& name = p.first;
+ const string& check_val = p.second;
+ string val;
+ if (!env->get_var(name, val)) {
+ dout(20) << " policy check failed, variable not found: '" << name << "'" << dendl;
+ err_msg = "Policy check failed, variable not found: ";
+ err_msg.append(name);
+ return -EACCES;
+ }
+
+ set_var_checked(name);
+
+ dout(20) << "comparing " << name << " [" << val << "], " << check_val << dendl;
+ if (val.compare(check_val) != 0) {
+ err_msg = "Policy check failed, variable not met condition: ";
+ err_msg.append(name);
+ dout(1) << "policy check failed, val=" << val << " != " << check_val << dendl;
+ return -EACCES;
+ }
+ }
+
+ list<RGWPolicyCondition *>::iterator citer;
+ for (citer = conditions.begin(); citer != conditions.end(); ++citer) {
+ RGWPolicyCondition *cond = *citer;
+ if (!cond->check(env, checked_vars, err_msg)) {
+ return -EACCES;
+ }
+ }
+
+ if (!env->match_policy_vars(checked_vars, err_msg)) {
+ dout(1) << "missing policy condition" << dendl;
+ return -EACCES;
+ }
+ return 0;
+}
+
+
+int RGWPolicy::from_json(bufferlist& bl, string& err_msg)
+{
+ JSONParser parser;
+
+ if (!parser.parse(bl.c_str(), bl.length())) {
+ err_msg = "Malformed JSON";
+ dout(0) << "malformed json" << dendl;
+ return -EINVAL;
+ }
+
+ // as no time was included in the request, we hope that the user has included a short timeout
+ JSONObjIter iter = parser.find_first("expiration");
+ if (iter.end()) {
+ err_msg = "Policy missing expiration";
+ dout(0) << "expiration not found" << dendl;
+ return -EINVAL; // change to a "no expiration" error following S3
+ }
+
+ JSONObj *obj = *iter;
+ expiration_str = obj->get_data();
+ int r = set_expires(expiration_str);
+ if (r < 0) {
+ err_msg = "Failed to parse policy expiration";
+ return r;
+ }
+
+ iter = parser.find_first("conditions");
+ if (iter.end()) {
+ err_msg = "Policy missing conditions";
+ dout(0) << "conditions not found" << dendl;
+ return -EINVAL; // change to a "no conditions" error following S3
+ }
+
+ obj = *iter;
+
+ iter = obj->find_first();
+ for (; !iter.end(); ++iter) {
+ JSONObj *child = *iter;
+ dout(20) << "data=" << child->get_data() << dendl;
+ dout(20) << "is_object=" << child->is_object() << dendl;
+ dout(20) << "is_array=" << child->is_array() << dendl;
+ JSONObjIter citer = child->find_first();
+ if (child->is_array()) {
+ vector<string> v;
+ int i;
+ for (i = 0; !citer.end() && i < 3; ++citer, ++i) {
+ JSONObj *o = *citer;
+ v.push_back(o->get_data());
+ }
+ if (i != 3 || !citer.end()) { /* we expect exactly 3 arguments here */
+ err_msg = "Bad condition array, expecting 3 arguments";
+ return -EINVAL;
+ }
+
+ int r = add_condition(v[0], v[1], v[2], err_msg);
+ if (r < 0)
+ return r;
+ } else if (!citer.end()) {
+ JSONObj *c = *citer;
+ dout(20) << "adding simple_check: " << c->get_name() << " : " << c->get_data() << dendl;
+
+ add_simple_check(c->get_name(), c->get_data());
+ } else {
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
diff --git a/src/rgw/rgw_policy_s3.h b/src/rgw/rgw_policy_s3.h
new file mode 100644
index 000000000..2a8a7ab09
--- /dev/null
+++ b/src/rgw/rgw_policy_s3.h
@@ -0,0 +1,57 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <limits.h>
+
+#include <map>
+#include <list>
+#include <string>
+
+#include "include/utime.h"
+
+#include "rgw_string.h"
+
+
+// Collection of request variables (e.g. POST form fields) that a
+// browser-upload POST policy is evaluated against.
+class RGWPolicyEnv {
+  std::map<std::string, std::string, ltstr_nocase> vars;  // case-insensitive name -> value
+
+public:
+  // Record one request variable in the environment.
+  void add_var(const std::string& name, const std::string& value);
+  // Fetch a variable into 'val'; returns false when it is absent.
+  bool get_var(const std::string& name, std::string& val);
+  // Resolve 's' into 'val', recording consulted variables in
+  // 'checked_vars' -- presumably handles $var substitution; see the
+  // .cc for the exact rules.
+  bool get_value(const std::string& s, std::string& val, std::map<std::string, bool, ltstr_nocase>& checked_vars);
+  // Verify the policy's variable coverage against this environment;
+  // on failure 'err_msg' is filled with a diagnostic.
+  bool match_policy_vars(std::map<std::string, bool, ltstr_nocase>& policy_vars, std::string& err_msg);
+};
+
+class RGWPolicyCondition;
+
+
+// In-memory form of an S3 browser-upload (POST form) policy: an
+// expiration timestamp plus the conditions the request must satisfy.
+class RGWPolicy {
+  uint64_t expires;  // parsed expiration (set via set_expires())
+  std::string expiration_str;  // raw "expiration" value from the JSON document
+  std::list<RGWPolicyCondition *> conditions;  // heap-allocated; see ~RGWPolicy()
+  std::list<std::pair<std::string, std::string> > var_checks;  // simple name/value checks
+  std::map<std::string, bool, ltstr_nocase> checked_vars;  // vars already covered by a condition
+
+public:
+  // allowed size bounds; default to the widest range (0 .. LLONG_MAX)
+  off_t min_length;
+  off_t max_length;
+
+  RGWPolicy() : expires(0), min_length(0), max_length(LLONG_MAX) {}
+  ~RGWPolicy();
+
+  // Parse and store the expiration timestamp; negative on bad input.
+  int set_expires(const std::string& e);
+
+  // Mark a variable as covered by some policy condition.
+  void set_var_checked(const std::string& var) {
+    checked_vars[var] = true;
+  }
+
+  // Add a three-argument condition (op, first, second); returns
+  // negative and fills 'err_msg' on error.
+  int add_condition(const std::string& op, const std::string& first, const std::string& second, std::string& err_msg);
+  // Add an exact-match check of a single variable.
+  void add_simple_check(const std::string& var, const std::string& value) {
+    var_checks.emplace_back(var, value);
+  }
+
+  // Evaluate all conditions and checks against the environment.
+  int check(RGWPolicyEnv *env, std::string& err_msg);
+  // Build this policy from a JSON policy document.
+  int from_json(bufferlist& bl, std::string& err_msg);
+};
diff --git a/src/rgw/rgw_polparser.cc b/src/rgw/rgw_polparser.cc
new file mode 100644
index 000000000..eca5066b3
--- /dev/null
+++ b/src/rgw/rgw_polparser.cc
@@ -0,0 +1,105 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <cstdint>
+#include <cstdlib>
+#include <exception>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <string_view>
+
+#include "include/buffer.h"
+
+#include "common/ceph_argparse.h"
+#include "common/common_init.h"
+
+#include "global/global_init.h"
+
+#include "rgw/rgw_iam_policy.h"
+
+// Returns true on success
+// Returns true on success.
+// Reads the whole of 'in', constructs an rgw::IAM::Policy for 'tenant'
+// and reports any failure on stderr prefixed with 'fname'.  The
+// function is noexcept, so every failure must be (and is) converted
+// into a false return.
+bool parse(CephContext* cct, const std::string& tenant,
+           const std::string& fname, std::istream& in) noexcept
+{
+  try {
+    // bufferlist::append() may allocate; keep it inside the try block
+    // so an allocation failure cannot escape this noexcept function
+    // and terminate the process.
+    bufferlist bl;
+    bl.append(in);
+    auto p = rgw::IAM::Policy(
+      cct, tenant, bl,
+      cct->_conf.get_val<bool>("rgw_policy_reject_invalid_principals"));
+  } catch (const rgw::IAM::PolicyParseException& e) {
+    std::cerr << fname << ": " << e.what() << std::endl;
+    return false;
+  } catch (const std::exception& e) {
+    std::cerr << fname << ": caught exception: " << e.what() << std::endl;
+    return false;
+  }
+  return true;
+}
+
+// Point the user at the usage text and exit with failure status.
+void helpful_exit(std::string_view cmdname)
+{
+  // add the missing separator: the old message ran the command name
+  // and the flag together ("rgw_polparser-h for usage")
+  std::cerr << cmdname << " -h for usage" << std::endl;
+  exit(1);
+}
+
+// Emit the one-line usage synopsis for this tool on stdout.
+void usage(std::string_view cmdname)
+{
+  std::cout << "usage: " << cmdname << " -t <tenant> [filename]" << std::endl;
+}
+
+// Entry point: parse each policy file named on the command line (or
+// stdin when none is given) under the tenant supplied via -t/--tenant.
+// Exit status is 0 only when every input parses cleanly.
+int main(int argc, const char** argv)
+{
+  std::string_view cmdname = argv[0];
+  std::string tenant;
+
+  auto args = argv_to_vec(argc, argv);
+  if (ceph_argparse_need_usage(args)) {
+    usage(cmdname);
+    exit(0);
+  }
+
+  auto cct = global_init(nullptr, args, CEPH_ENTITY_TYPE_CLIENT,
+                         CODE_ENVIRONMENT_UTILITY,
+                         CINIT_FLAG_NO_DAEMON_ACTIONS |
+                         CINIT_FLAG_NO_MON_CONFIG);
+  common_init_finish(cct.get());
+  std::string val;
+  for (std::vector<const char*>::iterator i = args.begin(); i != args.end(); ) {
+    if (ceph_argparse_double_dash(args, i)) {
+      break;
+    } else if (ceph_argparse_witharg(args, i, &val, "--tenant", "-t",
+                                     (char*)nullptr)) {
+      tenant = std::move(val);
+    } else {
+      ++i;
+    }
+  }
+
+  if (tenant.empty()) {
+    std::cerr << cmdname << ": must specify tenant name" << std::endl;
+    helpful_exit(cmdname);
+  }
+
+  bool success = true;
+
+  if (args.empty()) {
+    success = parse(cct.get(), tenant, "(stdin)", std::cin);
+  } else {
+    for (const auto& file : args) {
+      std::ifstream in;
+      in.open(file, std::ifstream::in);
+      if (!in.is_open()) {
+        std::cerr << "Can't read " << file << std::endl;
+        success = false;
+        // don't fall through and parse a stream we failed to open;
+        // that produced a second, misleading error per file
+        continue;
+      }
+      if (!parse(cct.get(), tenant, file, in)) {
+        success = false;
+      }
+    }
+  }
+
+  return success ? 0 : 1;
+}
diff --git a/src/rgw/rgw_pool_types.h b/src/rgw/rgw_pool_types.h
new file mode 100644
index 000000000..b23e7d005
--- /dev/null
+++ b/src/rgw/rgw_pool_types.h
@@ -0,0 +1,157 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+/* N.B., this header defines fundamental serialized types. Do not
+ * introduce changes or include files which can only be compiled in
+ * radosgw or OSD contexts (e.g., rgw_sal.h, rgw_common.h)
+ */
+
+#pragma once
+
+#include <string>
+#include <fmt/format.h>
+
+#include "include/types.h"
+#include "common/Formatter.h"
+
+class JSONObj;
+
+// Identifies a rados pool, optionally qualified by a namespace within
+// the pool.  This is a fundamental serialized type: keep the
+// encode/decode layout stable.
+struct rgw_pool {
+  std::string name;
+  std::string ns;
+
+  rgw_pool() = default;
+  rgw_pool(const rgw_pool& _p) : name(_p.name), ns(_p.ns) {}
+  rgw_pool(rgw_pool&&) = default;
+  // Parse from the string form produced by to_str().
+  rgw_pool(const std::string& _s) {
+    from_str(_s);
+  }
+  rgw_pool(const std::string& _name, const std::string& _ns) : name(_name), ns(_ns) {}
+
+  std::string to_str() const;
+  void from_str(const std::string& s);
+
+  void init(const std::string& _s) {
+    from_str(_s);
+  }
+
+  // A pool with no name is unset; the namespace alone does not make
+  // it non-empty.
+  bool empty() const {
+    return name.empty();
+  }
+
+  // Lexicographic comparison over (name, ns); <0, 0, >0 like strcmp.
+  int compare(const rgw_pool& p) const {
+    int r = name.compare(p.name);
+    if (r != 0) {
+      return r;
+    }
+    return ns.compare(p.ns);
+  }
+
+  void encode(ceph::buffer::list& bl) const {
+    ENCODE_START(10, 10, bl);
+    encode(name, bl);
+    encode(ns, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  // Decode from a serialized rgw_bucket (legacy layout); defined in
+  // the .cc.
+  void decode_from_bucket(ceph::buffer::list::const_iterator& bl);
+
+  void decode(ceph::buffer::list::const_iterator& bl) {
+    DECODE_START_LEGACY_COMPAT_LEN(10, 3, 3, bl);
+
+    decode(name, bl);
+
+    if (struct_v < 10) {
+
+    /*
+     * note that rgw_pool can be used where rgw_bucket was used before
+     * therefore we inherit rgw_bucket's old versions. However, we only
+     * need the first field from rgw_bucket. unless we add more fields
+     * in which case we'll need to look at struct_v, and check the actual
+     * version. Anything older than 10 needs to be treated as old rgw_bucket
+     */
+
+    } else {
+      decode(ns, bl);
+    }
+
+    DECODE_FINISH(bl);
+  }
+
+  rgw_pool& operator=(const rgw_pool&) = default;
+
+  bool operator==(const rgw_pool& p) const {
+    return (compare(p) == 0);
+  }
+  bool operator!=(const rgw_pool& p) const {
+    return !(*this == p);
+  }
+  // Ordering consistent with compare(): by name, then namespace.
+  bool operator<(const rgw_pool& p) const {
+    int r = name.compare(p.name);
+    if (r == 0) {
+      return (ns.compare(p.ns) < 0);
+    }
+    return (r < 0);
+  }
+};
+WRITE_CLASS_ENCODER(rgw_pool)
+
+// Stream a pool in its canonical string form (see rgw_pool::to_str()).
+inline std::ostream& operator<<(std::ostream& out, const rgw_pool& p) {
+  return out << p.to_str();
+}
+
+// The triple of pools a bucket's data placement resolves to.
+struct rgw_data_placement_target {
+  rgw_pool data_pool;        // main object data
+  rgw_pool data_extra_pool;  // secondary data; see get_data_extra_pool()
+  rgw_pool index_pool;       // bucket index
+
+  rgw_data_placement_target() = default;
+  rgw_data_placement_target(const rgw_data_placement_target&) = default;
+  rgw_data_placement_target(rgw_data_placement_target&&) = default;
+
+  rgw_data_placement_target(const rgw_pool& data_pool,
+                            const rgw_pool& data_extra_pool,
+                            const rgw_pool& index_pool)
+    : data_pool(data_pool),
+      data_extra_pool(data_extra_pool),
+      index_pool(index_pool) {
+  }
+
+  rgw_data_placement_target&
+  operator=(const rgw_data_placement_target&) = default;
+
+  // The extra pool, falling back to the data pool when none is set.
+  const rgw_pool& get_data_extra_pool() const {
+    if (data_extra_pool.empty()) {
+      return data_pool;
+    }
+    return data_extra_pool;
+  }
+
+  // Three-way comparison over (data, extra, index) pools; marked const
+  // so it can be invoked on const placement targets.
+  int compare(const rgw_data_placement_target& t) const {
+    int c = data_pool.compare(t.data_pool);
+    if (c != 0) {
+      return c;
+    }
+    c = data_extra_pool.compare(t.data_extra_pool);
+    if (c != 0) {
+      return c;
+    }
+    return index_pool.compare(t.index_pool);
+  }
+
+  void dump(ceph::Formatter *f) const;
+  void decode_json(JSONObj *obj);
+};
diff --git a/src/rgw/rgw_process.cc b/src/rgw/rgw_process.cc
new file mode 100644
index 000000000..8d20251f8
--- /dev/null
+++ b/src/rgw/rgw_process.cc
@@ -0,0 +1,472 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "common/errno.h"
+#include "common/Throttle.h"
+#include "common/WorkQueue.h"
+#include "include/scope_guard.h"
+
+#include <utility>
+#include "rgw_auth_registry.h"
+#include "rgw_dmclock_scheduler.h"
+#include "rgw_rest.h"
+#include "rgw_frontend.h"
+#include "rgw_request.h"
+#include "rgw_process.h"
+#include "rgw_loadgen.h"
+#include "rgw_client_io.h"
+#include "rgw_opa.h"
+#include "rgw_perf_counters.h"
+#include "rgw_lua.h"
+#include "rgw_lua_request.h"
+#include "rgw_tracer.h"
+#include "rgw_ratelimit.h"
+
+#include "services/svc_zone_utils.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+using rgw::dmclock::Scheduler;
+
+// Debug helper: log the pointers of all queued requests at level 20.
+// Cheap no-op when the rgw subsystem is not gathering at that level.
+void RGWProcess::RGWWQ::_dump_queue()
+{
+  if (!g_conf()->subsys.should_gather<ceph_subsys_rgw, 20>()) {
+    return;
+  }
+  deque<RGWRequest *>::iterator iter;
+  if (process->m_req_queue.empty()) {
+    dout(20) << "RGWWQ: empty" << dendl;
+    return;
+  }
+  dout(20) << "RGWWQ:" << dendl;
+  for (iter = process->m_req_queue.begin();
+       iter != process->m_req_queue.end(); ++iter) {
+    dout(20) << "req: " << hex << *iter << dec << dendl;
+  }
+} /* RGWProcess::RGWWQ::_dump_queue */
+
+// Submit the request to the dmclock scheduler, if one is configured,
+// returning {result, completer}.  A null scheduler short-circuits to
+// success with an empty completer.
+auto schedule_request(Scheduler *scheduler, req_state *s, RGWOp *op)
+{
+  using rgw::dmclock::SchedulerCompleter;
+  if (!scheduler)
+    return std::make_pair(0,SchedulerCompleter{});
+
+  const auto client = op->dmclock_client();
+  const auto cost = op->dmclock_cost();
+  // guard the log statement itself: the operands are cheap but the
+  // config lookup is not free
+  if (s->cct->_conf->subsys.should_gather(ceph_subsys_rgw, 10)) {
+    ldpp_dout(op,10) << "scheduling with "
+                     << s->cct->_conf.get_val<std::string>("rgw_scheduler_type")
+                     << " client=" << static_cast<int>(client)
+                     << " cost=" << cost << dendl;
+  }
+  return scheduler->schedule_request(client, {},
+                                     req_state::Clock::to_double(s->time),
+                                     cost,
+                                     s->yield);
+}
+
+// Queue a request for the worker threads and bump the queue-length
+// perf counter.  Always succeeds.
+bool RGWProcess::RGWWQ::_enqueue(RGWRequest* req) {
+  process->m_req_queue.push_back(req);
+  perfcounter->inc(l_rgw_qlen);
+  dout(20) << "enqueued request req=" << hex << req << dec << dendl;
+  _dump_queue();
+  return true;
+}
+
+// Pop the oldest queued request (FIFO) and decrement the queue-length
+// perf counter; returns NULL when the queue is empty.
+RGWRequest* RGWProcess::RGWWQ::_dequeue() {
+  if (process->m_req_queue.empty())
+    return NULL;
+  RGWRequest *req = process->m_req_queue.front();
+  process->m_req_queue.pop_front();
+  dout(20) << "dequeued request req=" << hex << req << dec << dendl;
+  _dump_queue();
+  perfcounter->inc(l_rgw_qlen, -1);
+  return req;
+}
+
+// Worker-thread entry: run one request, tracking it in the
+// active-requests counter and releasing its throttle slot afterwards.
+void RGWProcess::RGWWQ::_process(RGWRequest *req, ThreadPool::TPHandle &) {
+  perfcounter->inc(l_rgw_qactive);
+  process->handle_request(this, req);
+  process->req_throttle.put(1);
+  perfcounter->inc(l_rgw_qactive, -1);
+}
+// Decide whether this request exceeds its rate limits.  Global limits
+// come from the driver and can be overridden per user and per bucket
+// via RGW_ATTR_RATELIMIT attributes, or by the global anonymous limit
+// for anonymous users.  Returns true when the request should be
+// rejected.
+bool rate_limit(rgw::sal::Driver* driver, req_state* s) {
+  // we dont want to limit health check or system or admin requests
+  const auto& is_admin_or_system = s->user->get_info();
+  if ((s->op_type == RGW_OP_GET_HEALTH_CHECK) || is_admin_or_system.admin || is_admin_or_system.system)
+    return false;
+  std::string userfind;
+  RGWRateLimitInfo global_user;
+  RGWRateLimitInfo global_bucket;
+  RGWRateLimitInfo global_anon;
+  RGWRateLimitInfo* bucket_ratelimit;
+  RGWRateLimitInfo* user_ratelimit;
+  driver->get_ratelimit(global_bucket, global_user, global_anon);
+  bucket_ratelimit = &global_bucket;
+  user_ratelimit = &global_user;
+  // limiter keys are prefixed ("u"/"b") to keep user and bucket
+  // entries distinct in the shared table
+  s->user->get_id().to_str(userfind);
+  userfind = "u" + userfind;
+  s->ratelimit_user_name = userfind;
+  std::string bucketfind = !rgw::sal::Bucket::empty(s->bucket.get()) ? "b" + s->bucket->get_marker() : "";
+  s->ratelimit_bucket_marker = bucketfind;
+  const char *method = s->info.method;
+
+  auto iter = s->user->get_attrs().find(RGW_ATTR_RATELIMIT);
+  if(iter != s->user->get_attrs().end()) {
+    try {
+      RGWRateLimitInfo user_ratelimit_temp;
+      bufferlist& bl = iter->second;
+      auto biter = bl.cbegin();
+      decode(user_ratelimit_temp, biter);
+      // override global rate limiting only if local rate limiting is enabled
+      if (user_ratelimit_temp.enabled)
+        *user_ratelimit = user_ratelimit_temp;
+    } catch (buffer::error& err) {
+      ldpp_dout(s, 0) << "ERROR: failed to decode rate limit" << dendl;
+      // this function returns bool; the previous "return -EIO" here
+      // implicitly converted to true, so make that explicit: an
+      // undecodable limit keeps the request rate-limited
+      return true;
+    }
+  }
+  if (s->user->get_id().id == RGW_USER_ANON_ID && global_anon.enabled) {
+    *user_ratelimit = global_anon;
+  }
+  bool limit_bucket = false;
+  bool limit_user = s->ratelimit_data->should_rate_limit(method, s->ratelimit_user_name, s->time, user_ratelimit);
+
+  if(!rgw::sal::Bucket::empty(s->bucket.get()))
+  {
+    iter = s->bucket->get_attrs().find(RGW_ATTR_RATELIMIT);
+    if(iter != s->bucket->get_attrs().end()) {
+      try {
+        RGWRateLimitInfo bucket_ratelimit_temp;
+        bufferlist& bl = iter->second;
+        auto biter = bl.cbegin();
+        decode(bucket_ratelimit_temp, biter);
+        // override global rate limiting only if local rate limiting is enabled
+        if (bucket_ratelimit_temp.enabled)
+          *bucket_ratelimit = bucket_ratelimit_temp;
+      } catch (buffer::error& err) {
+        ldpp_dout(s, 0) << "ERROR: failed to decode rate limit" << dendl;
+        // see the matching comment above: was "return -EIO", which is
+        // true when converted to bool
+        return true;
+      }
+    }
+    // only consult the bucket limiter if the user limiter passed, so a
+    // single request never consumes tokens from both
+    if (!limit_user) {
+      limit_bucket = s->ratelimit_data->should_rate_limit(method, s->ratelimit_bucket_marker, s->time, bucket_ratelimit);
+    }
+  }
+  // the user limiter's tokens were spent but the bucket limiter
+  // rejected the request: give the user tokens back
+  if(limit_bucket && !limit_user) {
+    s->ratelimit_data->giveback_tokens(method, s->ratelimit_user_name);
+  }
+  s->user_ratelimit = *user_ratelimit;
+  s->bucket_ratelimit = *bucket_ratelimit;
+  return (limit_user || limit_bucket);
+}
+
+// Run the post-authentication phases of a request, in strict order:
+// permissions init, optional retarget, ACL read, op init, op-mask and
+// OPA checks, permission verification, param verification, pre_exec,
+// rate limiting, execute, complete.  Returns 0 on success or the first
+// negative error; the op's own result is retrieved separately via
+// op->get_ret().
+int rgw_process_authenticated(RGWHandler_REST * const handler,
+                              RGWOp *& op,
+                              RGWRequest * const req,
+                              req_state * const s,
+                              optional_yield y,
+                              rgw::sal::Driver* driver,
+                              const bool skip_retarget)
+{
+  ldpp_dout(op, 2) << "init permissions" << dendl;
+  int ret = handler->init_permissions(op, y);
+  if (ret < 0) {
+    return ret;
+  }
+
+  /**
+   * Only some accesses support website mode, and website mode does NOT apply
+   * if you are using the REST endpoint either (ergo, no authenticated access)
+   */
+  if (! skip_retarget) {
+    ldpp_dout(op, 2) << "recalculating target" << dendl;
+    // retarget may replace 'op'; keep req->op in sync with the new one
+    ret = handler->retarget(op, &op, y);
+    if (ret < 0) {
+      return ret;
+    }
+    req->op = op;
+  } else {
+    ldpp_dout(op, 2) << "retargeting skipped because of SubOp mode" << dendl;
+  }
+
+  /* If necessary extract object ACL and put them into req_state. */
+  ldpp_dout(op, 2) << "reading permissions" << dendl;
+  ret = handler->read_permissions(op, y);
+  if (ret < 0) {
+    return ret;
+  }
+
+  ldpp_dout(op, 2) << "init op" << dendl;
+  ret = op->init_processing(y);
+  if (ret < 0) {
+    return ret;
+  }
+
+  ldpp_dout(op, 2) << "verifying op mask" << dendl;
+  ret = op->verify_op_mask();
+  if (ret < 0) {
+    return ret;
+  }
+
+  /* Check if OPA is used to authorize requests */
+  if (s->cct->_conf->rgw_use_opa_authz) {
+    ret = rgw_opa_authorize(op, s);
+    if (ret < 0) {
+      return ret;
+    }
+  }
+
+  ldpp_dout(op, 2) << "verifying op permissions" << dendl;
+  {
+    // temporarily make the "verify_permission" span the current trace
+    // for the duration of the call, then restore the parent
+    auto span = tracing::rgw::tracer.add_span("verify_permission", s->trace);
+    std::swap(span, s->trace);
+    ret = op->verify_permission(y);
+    std::swap(span, s->trace);
+  }
+  if (ret < 0) {
+    // system and admin requests may proceed despite a permission error
+    if (s->system_request) {
+      dout(2) << "overriding permissions due to system operation" << dendl;
+    } else if (s->auth.identity->is_admin_of(s->user->get_id())) {
+      dout(2) << "overriding permissions due to admin operation" << dendl;
+    } else {
+      return ret;
+    }
+  }
+
+  ldpp_dout(op, 2) << "verifying op params" << dendl;
+  ret = op->verify_params();
+  if (ret < 0) {
+    return ret;
+  }
+
+  ldpp_dout(op, 2) << "pre-executing" << dendl;
+  op->pre_exec();
+
+  ldpp_dout(op, 2) << "check rate limiting" << dendl;
+  if (rate_limit(driver, s)) {
+    return -ERR_RATE_LIMITED;
+  }
+  ldpp_dout(op, 2) << "executing" << dendl;
+  {
+    auto span = tracing::rgw::tracer.add_span("execute", s->trace);
+    std::swap(span, s->trace);
+    op->execute(y);
+    std::swap(span, s->trace);
+  }
+
+  ldpp_dout(op, 2) << "completing" << dendl;
+  op->complete();
+
+  return 0;
+}
+
+// Drive a single HTTP request end to end: client IO init, handler/op
+// resolution, pre-request lua hook, dmclock scheduling,
+// authentication, the authenticated pipeline, post-request lua hook,
+// op logging and final accounting.  Uses goto-based cleanup ("done:"),
+// so be careful about where new early exits are added.  Optional
+// out-params report the acting user, request latency and HTTP status.
+int process_request(const RGWProcessEnv& penv,
+                    RGWRequest* const req,
+                    const std::string& frontend_prefix,
+                    RGWRestfulIO* const client_io,
+                    optional_yield yield,
+                    rgw::dmclock::Scheduler *scheduler,
+                    string* user,
+                    ceph::coarse_real_clock::duration* latency,
+                    int* http_ret)
+{
+  int ret = client_io->init(g_ceph_context);
+  dout(1) << "====== starting new request req=" << hex << req << dec
+	  << " =====" << dendl;
+  perfcounter->inc(l_rgw_req);
+
+  RGWEnv& rgw_env = client_io->get_env();
+
+  req_state rstate(g_ceph_context, penv, &rgw_env, req->id);
+  req_state *s = &rstate;
+
+  s->ratelimit_data = penv.ratelimiting->get_active();
+
+  rgw::sal::Driver* driver = penv.driver;
+  std::unique_ptr<rgw::sal::User> u = driver->get_user(rgw_user());
+  s->set_user(u);
+
+  // the init() result is checked only now, after req_state is set up
+  // enough for abort_early() to emit a response
+  if (ret < 0) {
+    s->cio = client_io;
+    abort_early(s, nullptr, ret, nullptr, yield);
+    return ret;
+  }
+
+  s->req_id = driver->zone_unique_id(req->id);
+  s->trans_id = driver->zone_unique_trans_id(req->id);
+  s->host_id = driver->get_host_id();
+  s->yield = yield;
+
+  ldpp_dout(s, 2) << "initializing for trans_id = " << s->trans_id << dendl;
+
+  RGWOp* op = nullptr;
+  int init_error = 0;
+  bool should_log = false;
+  RGWREST* rest = penv.rest;
+  RGWRESTMgr *mgr;
+  RGWHandler_REST *handler = rest->get_handler(driver, s,
+                                               *penv.auth_registry,
+                                               frontend_prefix,
+                                               client_io, &mgr, &init_error);
+  rgw::dmclock::SchedulerCompleter c;
+
+  if (init_error != 0) {
+    abort_early(s, nullptr, init_error, nullptr, yield);
+    goto done;
+  }
+  ldpp_dout(s, 10) << "handler=" << typeid(*handler).name() << dendl;
+
+  should_log = mgr->get_logging();
+
+  ldpp_dout(s, 2) << "getting op " << s->op << dendl;
+  op = handler->get_op();
+  if (!op) {
+    abort_early(s, NULL, -ERR_METHOD_NOT_ALLOWED, handler, yield);
+    goto done;
+  }
+  {
+    // preRequest lua hook: a missing script is not an error, and a
+    // failing script only warns -- it must not fail the request
+    s->trace_enabled = tracing::rgw::tracer.is_enabled();
+    std::string script;
+    auto rc = rgw::lua::read_script(s, penv.lua.manager.get(), s->bucket_tenant, s->yield, rgw::lua::context::preRequest, script);
+    if (rc == -ENOENT) {
+      // no script, nothing to do
+    } else if (rc < 0) {
+      ldpp_dout(op, 5) << "WARNING: failed to read pre request script. error: " << rc << dendl;
+    } else {
+      rc = rgw::lua::request::execute(driver, rest, penv.olog, s, op, script);
+      if (rc < 0) {
+        ldpp_dout(op, 5) << "WARNING: failed to execute pre request script. error: " << rc << dendl;
+      }
+    }
+  }
+  std::tie(ret,c) = schedule_request(scheduler, s, op);
+  if (ret < 0) {
+    if (ret == -EAGAIN) {
+      ret = -ERR_RATE_LIMITED;
+    }
+    ldpp_dout(op,0) << "Scheduling request failed with " << ret << dendl;
+    abort_early(s, op, ret, handler, yield);
+    goto done;
+  }
+  req->op = op;
+  ldpp_dout(op, 10) << "op=" << typeid(*op).name() << dendl;
+  s->op_type = op->get_type();
+
+  try {
+    ldpp_dout(op, 2) << "verifying requester" << dendl;
+    ret = op->verify_requester(*penv.auth_registry, yield);
+    if (ret < 0) {
+      dout(10) << "failed to authorize request" << dendl;
+      abort_early(s, op, ret, handler, yield);
+      goto done;
+    }
+
+    /* FIXME: remove this after switching all handlers to the new authentication
+     * infrastructure. */
+    if (nullptr == s->auth.identity) {
+      s->auth.identity = rgw::auth::transform_old_authinfo(s);
+    }
+
+    ldpp_dout(op, 2) << "normalizing buckets and tenants" << dendl;
+    ret = handler->postauth_init(yield);
+    if (ret < 0) {
+      dout(10) << "failed to run post-auth init" << dendl;
+      abort_early(s, op, ret, handler, yield);
+      goto done;
+    }
+
+    if (s->user->get_info().suspended) {
+      dout(10) << "user is suspended, uid=" << s->user->get_id() << dendl;
+      abort_early(s, op, -ERR_USER_SUSPENDED, handler, yield);
+      goto done;
+    }
+
+
+    const auto trace_name = std::string(op->name()) + " " + s->trans_id;
+    s->trace = tracing::rgw::tracer.start_trace(trace_name, s->trace_enabled);
+    s->trace->SetAttribute(tracing::rgw::OP, op->name());
+    s->trace->SetAttribute(tracing::rgw::TYPE, tracing::rgw::REQUEST);
+
+    ret = rgw_process_authenticated(handler, op, req, s, yield, driver);
+    if (ret < 0) {
+      abort_early(s, op, ret, handler, yield);
+      goto done;
+    }
+  } catch (const ceph::crypto::DigestException& e) {
+    dout(0) << "authentication failed" << e.what() << dendl;
+    abort_early(s, op, -ERR_INVALID_SECRET_KEY, handler, yield);
+  }
+
+done:
+  if (op) {
+    // record final trace attributes and run the postRequest lua hook;
+    // like the pre hook, script failures only warn
+    if (s->trace) {
+      s->trace->SetAttribute(tracing::rgw::RETURN, op->get_ret());
+      if (!rgw::sal::User::empty(s->user)) {
+        s->trace->SetAttribute(tracing::rgw::USER_ID, s->user->get_id().id);
+      }
+      if (!rgw::sal::Bucket::empty(s->bucket)) {
+        s->trace->SetAttribute(tracing::rgw::BUCKET_NAME, s->bucket->get_name());
+      }
+      if (!rgw::sal::Object::empty(s->object)) {
+        s->trace->SetAttribute(tracing::rgw::OBJECT_NAME, s->object->get_name());
+      }
+    }
+    std::string script;
+    auto rc = rgw::lua::read_script(s, penv.lua.manager.get(), s->bucket_tenant, s->yield, rgw::lua::context::postRequest, script);
+    if (rc == -ENOENT) {
+      // no script, nothing to do
+    } else if (rc < 0) {
+      ldpp_dout(op, 5) << "WARNING: failed to read post request script. error: " << rc << dendl;
+    } else {
+      rc = rgw::lua::request::execute(driver, rest, penv.olog, s, op, script);
+      if (rc < 0) {
+        ldpp_dout(op, 5) << "WARNING: failed to execute post request script. error: " << rc << dendl;
+      }
+    }
+  }
+
+  try {
+    client_io->complete_request();
+  } catch (rgw::io::Exception& e) {
+    dout(0) << "ERROR: client_io->complete_request() returned "
+            << e.what() << dendl;
+  }
+  if (should_log) {
+    rgw_log_op(rest, s, op, penv.olog);
+  }
+
+  if (http_ret != nullptr) {
+    *http_ret = s->err.http_ret;
+  }
+  int op_ret = 0;
+
+  if (user && !rgw::sal::User::empty(s->user.get())) {
+    *user = s->user->get_id().to_str();
+  }
+
+  if (op) {
+    op_ret = op->get_ret();
+    ldpp_dout(op, 2) << "op status=" << op_ret << dendl;
+    ldpp_dout(op, 2) << "http status=" << s->err.http_ret << dendl;
+  } else {
+    ldpp_dout(s, 2) << "http status=" << s->err.http_ret << dendl;
+  }
+  if (handler)
+    handler->put_op(op);
+  rest->put_handler(handler);
+
+  const auto lat = s->time_elapsed();
+  if (latency) {
+    *latency = lat;
+  }
+  dout(1) << "====== req done req=" << hex << req << dec
+	  << " op status=" << op_ret
+	  << " http_status=" << s->err.http_ret
+	  << " latency=" << lat
+	  << " ======"
+	  << dendl;
+
+  return (ret < 0 ? ret : s->err.ret);
+} /* process_request */
diff --git a/src/rgw/rgw_process.h b/src/rgw/rgw_process.h
new file mode 100644
index 000000000..640f07842
--- /dev/null
+++ b/src/rgw/rgw_process.h
@@ -0,0 +1,159 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_common.h"
+#include "rgw_acl.h"
+#include "rgw_user.h"
+#include "rgw_rest.h"
+#include "include/ceph_assert.h"
+
+#include "common/WorkQueue.h"
+#include "common/Throttle.h"
+
+#include <atomic>
+
+#define dout_context g_ceph_context
+
+
+namespace rgw::dmclock {
+ class Scheduler;
+}
+
+struct RGWProcessEnv;
+class RGWFrontendConfig;
+class RGWRequest;
+
+// Base class for a request-processing frontend: owns the request
+// queue, the worker thread pool and the in-flight throttle.  Concrete
+// frontends implement run() and handle_request().
+class RGWProcess {
+  std::deque<RGWRequest*> m_req_queue;  // pending requests, FIFO
+protected:
+  CephContext *cct;
+  RGWProcessEnv& env;  // shared driver/REST/auth/ratelimit context
+  ThreadPool m_tp;  // worker threads draining req_wq
+  Throttle req_throttle;  // caps in-flight requests at 2x thread count
+  RGWFrontendConfig* conf;
+  int sock_fd;  // listening socket, -1 when closed
+  std::string uri_prefix;
+
+  // Work queue adapter feeding m_req_queue into the thread pool.
+  struct RGWWQ : public DoutPrefixProvider, public ThreadPool::WorkQueue<RGWRequest> {
+    RGWProcess* process;
+    RGWWQ(RGWProcess* p, ceph::timespan timeout, ceph::timespan suicide_timeout,
+	  ThreadPool* tp)
+      : ThreadPool::WorkQueue<RGWRequest>("RGWWQ", timeout, suicide_timeout,
+					  tp), process(p) {}
+
+    bool _enqueue(RGWRequest* req) override;
+
+    // targeted dequeue of a specific request is never used here
+    void _dequeue(RGWRequest* req) override {
+      ceph_abort();
+    }
+
+    bool _empty() override {
+      return process->m_req_queue.empty();
+    }
+
+    RGWRequest* _dequeue() override;
+
+    using ThreadPool::WorkQueue<RGWRequest>::_process;
+
+    void _process(RGWRequest *req, ThreadPool::TPHandle &) override;
+
+    // log queue contents at debug level 20
+    void _dump_queue();
+
+    void _clear() override {
+      ceph_assert(process->m_req_queue.empty());
+    }
+
+    CephContext *get_cct() const override { return process->cct; }
+    unsigned get_subsys() const { return ceph_subsys_rgw; }
+    std::ostream& gen_prefix(std::ostream& out) const { return out << "rgw request work queue: ";}
+
+  } req_wq;
+
+public:
+  RGWProcess(CephContext* const cct,
+             RGWProcessEnv& env,
+             const int num_threads,
+             std::string uri_prefix,
+             RGWFrontendConfig* const conf)
+    : cct(cct), env(env),
+      m_tp(cct, "RGWProcess::m_tp", "tp_rgw_process", num_threads),
+      req_throttle(cct, "rgw_ops", num_threads * 2),
+      conf(conf),
+      sock_fd(-1),
+      uri_prefix(std::move(uri_prefix)),
+      req_wq(this,
+	     ceph::make_timespan(g_conf()->rgw_op_thread_timeout),
+	     ceph::make_timespan(g_conf()->rgw_op_thread_suicide_timeout),
+	     &m_tp) {
+  }
+
+  virtual ~RGWProcess() = default;
+
+  const RGWProcessEnv& get_env() const { return env; }
+
+  // main loop of the frontend (typically run in a control thread)
+  virtual void run() = 0;
+  virtual void handle_request(const DoutPrefixProvider *dpp, RGWRequest *req) = 0;
+
+  void pause() {
+    m_tp.pause();
+  }
+
+  void unpause_with_new_config() {
+    m_tp.unpause();
+  }
+
+  void close_fd() {
+    if (sock_fd >= 0) {
+      ::close(sock_fd);
+      sock_fd = -1;
+    }
+  }
+}; /* RGWProcess */
+
+// Thin thread wrapper that runs an RGWProcess's main loop; does not
+// own the process.
+class RGWProcessControlThread : public Thread {
+  RGWProcess *pprocess;
+public:
+  explicit RGWProcessControlThread(RGWProcess *_pprocess) : pprocess(_pprocess) {}
+
+  void *entry() override {
+    pprocess->run();
+    return NULL;
+  }
+};
+
+// Load-generation frontend: synthesizes requests against the gateway
+// using a fixed access key instead of accepting network clients.
+class RGWLoadGenProcess : public RGWProcess {
+  RGWAccessKey access_key;  // credentials used for generated requests
+public:
+  RGWLoadGenProcess(CephContext* cct, RGWProcessEnv& env, int num_threads,
+                    std::string uri_prefix, RGWFrontendConfig* _conf)
+    : RGWProcess(cct, env, num_threads, std::move(uri_prefix), _conf) {}
+  void run() override;
+  void checkpoint();
+  void handle_request(const DoutPrefixProvider *dpp, RGWRequest* req) override;
+  // Enqueue one synthetic request; sets *fail_flag on failure.
+  void gen_request(const std::string& method, const std::string& resource,
+                   int content_length, std::atomic<bool>* fail_flag);
+
+  void set_access_key(RGWAccessKey& key) { access_key = key; }
+};
+/* process stream request */
+extern int process_request(const RGWProcessEnv& penv,
+ RGWRequest* req,
+ const std::string& frontend_prefix,
+ RGWRestfulIO* client_io,
+ optional_yield y,
+ rgw::dmclock::Scheduler *scheduler,
+ std::string* user,
+ ceph::coarse_real_clock::duration* latency,
+ int* http_ret = nullptr);
+
+extern int rgw_process_authenticated(RGWHandler_REST* handler,
+ RGWOp*& op,
+ RGWRequest* req,
+ req_state* s,
+ optional_yield y,
+ rgw::sal::Driver* driver,
+ bool skip_retarget = false);
+
+#undef dout_context
diff --git a/src/rgw/rgw_process_env.h b/src/rgw/rgw_process_env.h
new file mode 100644
index 000000000..4becf21a1
--- /dev/null
+++ b/src/rgw/rgw_process_env.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <memory>
+
+class ActiveRateLimiter;
+class OpsLogSink;
+class RGWREST;
+
+namespace rgw::auth {
+ class StrategyRegistry;
+}
+namespace rgw::lua {
+ class Background;
+}
+namespace rgw::sal {
+  class Store;
+  // RGWProcessEnv::driver below uses rgw::sal::Driver*, which was
+  // never forward-declared here (Store is the pre-rename name)
+  class Driver;
+  class LuaManager;
+}
+
+#ifdef WITH_ARROW_FLIGHT
+namespace rgw::flight {
+ class FlightServer;
+ class FlightStore;
+}
+#endif
+
+// Lua scripting context shared by the request pipeline.
+struct RGWLuaProcessEnv {
+  std::string luarocks_path;  // presumably the luarocks install location -- see frontend setup
+  rgw::lua::Background* background = nullptr;  // background lua execution; may be null
+  std::unique_ptr<rgw::sal::LuaManager> manager;  // loads/stores lua scripts
+};
+
+// Shared, non-owning (except auth_registry/lua.manager) context handed
+// to every frontend and request: storage driver, REST dispatcher, ops
+// log, auth strategies and rate limiting.
+struct RGWProcessEnv {
+  RGWLuaProcessEnv lua;
+  rgw::sal::Driver* driver = nullptr;  // storage backend
+  RGWREST *rest = nullptr;  // REST handler registry
+  OpsLogSink *olog = nullptr;  // ops log destination
+  std::unique_ptr<rgw::auth::StrategyRegistry> auth_registry;
+  ActiveRateLimiter* ratelimiting = nullptr;
+
+#ifdef WITH_ARROW_FLIGHT
+  // managed by rgw:flight::FlightFrontend in rgw_flight_frontend.cc
+  rgw::flight::FlightServer* flight_server = nullptr;
+  rgw::flight::FlightStore* flight_store = nullptr;
+#endif
+};
+
diff --git a/src/rgw/rgw_public_access.cc b/src/rgw/rgw_public_access.cc
new file mode 100644
index 000000000..6298bb306
--- /dev/null
+++ b/src/rgw/rgw_public_access.cc
@@ -0,0 +1,33 @@
+#include "rgw_public_access.h"
+#include "rgw_xml.h"
+
+// Populate the four public-access flags from a
+// PublicAccessBlockConfiguration XML document.
+void PublicAccessBlockConfiguration::decode_xml(XMLObj *obj) {
+  RGWXMLDecoder::decode_xml("BlockPublicAcls", BlockPublicAcls, obj);
+  RGWXMLDecoder::decode_xml("IgnorePublicAcls", IgnorePublicAcls, obj);
+  RGWXMLDecoder::decode_xml("BlockPublicPolicy", BlockPublicPolicy, obj);
+  RGWXMLDecoder::decode_xml("RestrictPublicBuckets", RestrictPublicBuckets, obj);
+}
+
+// Serialize the four flags as an XML section for GET responses.
+void PublicAccessBlockConfiguration::dump_xml(Formatter *f) const {
+  Formatter::ObjectSection os(*f, "BlockPublicAccessBlockConfiguration");
+  // Note: AWS spec mentions the values to be ALL CAPs, but clients seem to
+  // require all small letters, and S3 itself doesn't seem to follow the API
+  // spec here
+  f->dump_bool("BlockPublicAcls", BlockPublicAcls);
+  f->dump_bool("IgnorePublicAcls", IgnorePublicAcls);
+  f->dump_bool("BlockPublicPolicy", BlockPublicPolicy);
+  f->dump_bool("RestrictPublicBuckets", RestrictPublicBuckets);
+}
+
+
+// Human-readable dump for logging/debugging: one "Name: value" line
+// per flag.
+std::ostream& operator<< (std::ostream& os, const PublicAccessBlockConfiguration& access_conf)
+{
+  os << std::boolalpha
+     << "BlockPublicAcls: " << access_conf.block_public_acls() << std::endl
+     << "IgnorePublicAcls: " << access_conf.ignore_public_acls() << std::endl
+     // the last two lines were missing their ": " separator, running
+     // the label and value together
+     << "BlockPublicPolicy: " << access_conf.block_public_policy() << std::endl
+     << "RestrictPublicBuckets: " << access_conf.restrict_public_buckets() << std::endl;
+
+  return os;
+}
+
diff --git a/src/rgw/rgw_public_access.h b/src/rgw/rgw_public_access.h
new file mode 100644
index 000000000..87d2a16a3
--- /dev/null
+++ b/src/rgw/rgw_public_access.h
@@ -0,0 +1,67 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 SUSE LLC
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+#include <include/types.h>
+
+class XMLObj;
+
+// S3 PublicAccessBlock settings for a bucket or account; all flags
+// default to false (nothing blocked).  Serialized -- keep the
+// encode/decode order stable.
+class PublicAccessBlockConfiguration {
+  bool BlockPublicAcls;
+  bool IgnorePublicAcls;
+  bool BlockPublicPolicy;
+  bool RestrictPublicBuckets;
+ public:
+  PublicAccessBlockConfiguration():
+    BlockPublicAcls(false), IgnorePublicAcls(false),
+    BlockPublicPolicy(false), RestrictPublicBuckets(false)
+  {}
+
+  auto block_public_acls() const {
+    return BlockPublicAcls;
+  }
+  auto ignore_public_acls() const {
+    return IgnorePublicAcls;
+  }
+  auto block_public_policy() const {
+    return BlockPublicPolicy;
+  }
+  auto restrict_public_buckets() const {
+    return RestrictPublicBuckets;
+  }
+
+  void encode(ceph::bufferlist& bl) const {
+    ENCODE_START(1,1, bl);
+    encode(BlockPublicAcls, bl);
+    encode(IgnorePublicAcls, bl);
+    encode(BlockPublicPolicy, bl);
+    encode(RestrictPublicBuckets, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(ceph::bufferlist::const_iterator& bl) {
+    DECODE_START(1,bl);
+    decode(BlockPublicAcls, bl);
+    decode(IgnorePublicAcls, bl);
+    decode(BlockPublicPolicy, bl);
+    decode(RestrictPublicBuckets, bl);
+    DECODE_FINISH(bl);
+  }
+
+  // XML (de)serialization for the S3 REST API; see the .cc.
+  void decode_xml(XMLObj *obj);
+  void dump_xml(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(PublicAccessBlockConfiguration)
+std::ostream& operator<< (std::ostream& os, const PublicAccessBlockConfiguration& access_conf);
diff --git a/src/rgw/rgw_pubsub.cc b/src/rgw/rgw_pubsub.cc
new file mode 100644
index 000000000..2b0cffd47
--- /dev/null
+++ b/src/rgw/rgw_pubsub.cc
@@ -0,0 +1,736 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "services/svc_zone.h"
+#include "rgw_b64.h"
+#include "rgw_sal.h"
+#include "rgw_pubsub.h"
+#include "rgw_tools.h"
+#include "rgw_xml.h"
+#include "rgw_arn.h"
+#include "rgw_pubsub_push.h"
+#include <regex>
+#include <algorithm>
+
+#define dout_subsys ceph_subsys_rgw
+
+// Build a unique event id of the form "<sec>.<usec>.<hash>".
+// snprintf() returns the length the output *would* have had, so when the
+// formatted string is truncated, 'len' can exceed sizeof(buf); assigning
+// that many bytes would read past the end of 'buf'. Clamp to the actual
+// buffer contents.
+void set_event_id(std::string& id, const std::string& hash, const utime_t& ts) {
+  char buf[64];
+  const auto len = snprintf(buf, sizeof(buf), "%010ld.%06ld.%s", (long)ts.sec(), (long)ts.usec(), hash.c_str());
+  if (len > 0) {
+    // on truncation only sizeof(buf)-1 characters (plus NUL) were written
+    id.assign(buf, std::min(static_cast<std::size_t>(len), sizeof(buf) - 1));
+  }
+}
+
+// JSON-dump the key filter as a list of S3 "FilterRule" entries,
+// emitting only the rules that are actually set.
+void rgw_s3_key_filter::dump(Formatter *f) const {
+  auto dump_rule = [f](const char* name, const std::string& value) {
+    if (value.empty()) {
+      return;
+    }
+    f->open_object_section("FilterRule");
+    ::encode_json("Name", name, f);
+    ::encode_json("Value", value, f);
+    f->close_section();
+  };
+  dump_rule("prefix", prefix_rule);
+  dump_rule("suffix", suffix_rule);
+  dump_rule("regex", regex_rule);
+}
+
+// Parse the S3Key filter from XML: a list of <FilterRule> elements, each
+// holding a Name ("prefix"/"suffix"/"regex") and a Value. Each rule name may
+// appear at most once; an unknown or repeated name raises a decode error.
+bool rgw_s3_key_filter::decode_xml(XMLObj* obj) {
+  XMLObjIter iter = obj->find("FilterRule");
+  XMLObj *o;
+
+  const auto throw_if_missing = true;
+  // track which rule names were already seen, to reject duplicates
+  auto prefix_not_set = true;
+  auto suffix_not_set = true;
+  auto regex_not_set = true;
+  std::string name;
+
+  while ((o = iter.get_next())) {
+    RGWXMLDecoder::decode_xml("Name", name, o, throw_if_missing);
+    if (name == "prefix" && prefix_not_set) {
+      prefix_not_set = false;
+      RGWXMLDecoder::decode_xml("Value", prefix_rule, o, throw_if_missing);
+    } else if (name == "suffix" && suffix_not_set) {
+      suffix_not_set = false;
+      RGWXMLDecoder::decode_xml("Value", suffix_rule, o, throw_if_missing);
+    } else if (name == "regex" && regex_not_set) {
+      regex_not_set = false;
+      RGWXMLDecoder::decode_xml("Value", regex_rule, o, throw_if_missing);
+    } else {
+      throw RGWXMLDecoder::err("invalid/duplicate S3Key filter rule name: '" + name + "'");
+    }
+  }
+  return true;
+}
+
+// XML-dump the key filter as <FilterRule> elements; rules that are not set
+// are omitted, mirroring dump().
+void rgw_s3_key_filter::dump_xml(Formatter *f) const {
+  auto emit_rule = [f](const char* name, const std::string& value) {
+    if (value.empty()) {
+      return;
+    }
+    f->open_object_section("FilterRule");
+    ::encode_xml("Name", name, f);
+    ::encode_xml("Value", value, f);
+    f->close_section();
+  };
+  emit_rule("prefix", prefix_rule);
+  emit_rule("suffix", suffix_rule);
+  emit_rule("regex", regex_rule);
+}
+
+// True when at least one of the prefix/suffix/regex rules is set.
+bool rgw_s3_key_filter::has_content() const {
+  return !prefix_rule.empty() || !suffix_rule.empty() || !regex_rule.empty();
+}
+
+// JSON-dump every key/value rule as an S3 "FilterRule" entry.
+void rgw_s3_key_value_filter::dump(Formatter *f) const {
+  for (const auto& [name, value] : kv) {
+    f->open_object_section("FilterRule");
+    ::encode_json("Name", name, f);
+    ::encode_json("Value", value, f);
+    f->close_section();
+  }
+}
+
+// Parse generic Name/Value <FilterRule> pairs (used for S3Metadata and
+// S3Tags) into the flat map; both Name and Value are mandatory. A repeated
+// key keeps its first value (flat_map::emplace does not overwrite).
+bool rgw_s3_key_value_filter::decode_xml(XMLObj* obj) {
+  kv.clear();
+  XMLObjIter iter = obj->find("FilterRule");
+  XMLObj *o;
+
+  const auto throw_if_missing = true;
+
+  std::string key;
+  std::string value;
+
+  while ((o = iter.get_next())) {
+    RGWXMLDecoder::decode_xml("Name", key, o, throw_if_missing);
+    RGWXMLDecoder::decode_xml("Value", value, o, throw_if_missing);
+    kv.emplace(key, value);
+  }
+  return true;
+}
+
+// XML-dump every key/value rule as a <FilterRule> element.
+void rgw_s3_key_value_filter::dump_xml(Formatter *f) const {
+  for (const auto& [name, value] : kv) {
+    f->open_object_section("FilterRule");
+    ::encode_xml("Name", name, f);
+    ::encode_xml("Value", value, f);
+    f->close_section();
+  }
+}
+
+// True when the filter holds at least one key/value rule.
+bool rgw_s3_key_value_filter::has_content() const {
+  return !kv.empty();
+}
+
+// JSON-dump all three sub-filters unconditionally (even when empty).
+void rgw_s3_filter::dump(Formatter *f) const {
+  encode_json("S3Key", key_filter, f);
+  encode_json("S3Metadata", metadata_filter, f);
+  encode_json("S3Tags", tag_filter, f);
+}
+
+// Parse the optional S3Key/S3Metadata/S3Tags sections of a <Filter> element;
+// missing sections are not an error (no throw_if_missing flag is passed).
+bool rgw_s3_filter::decode_xml(XMLObj* obj) {
+  RGWXMLDecoder::decode_xml("S3Key", key_filter, obj);
+  RGWXMLDecoder::decode_xml("S3Metadata", metadata_filter, obj);
+  RGWXMLDecoder::decode_xml("S3Tags", tag_filter, obj);
+  return true;
+}
+
+// XML-dump only the sub-filters that actually carry rules, so empty
+// sections are not emitted.
+void rgw_s3_filter::dump_xml(Formatter *f) const {
+  if (key_filter.has_content()) {
+    ::encode_xml("S3Key", key_filter, f);
+  }
+  if (metadata_filter.has_content()) {
+    ::encode_xml("S3Metadata", metadata_filter, f);
+  }
+  if (tag_filter.has_content()) {
+    ::encode_xml("S3Tags", tag_filter, f);
+  }
+}
+
+// True when any of the key/metadata/tag sub-filters carries rules.
+bool rgw_s3_filter::has_content() const {
+  if (key_filter.has_content()) {
+    return true;
+  }
+  if (metadata_filter.has_content()) {
+    return true;
+  }
+  return tag_filter.has_content();
+}
+
+// Return true when 'key' satisfies every rule present in the filter.
+// The prefix, suffix and regex rules are ANDed; a rule that is not set
+// always matches.
+bool match(const rgw_s3_key_filter& filter, const std::string& key) {
+  const auto& prefix = filter.prefix_rule;
+  if (!prefix.empty()) {
+    // a prefix longer than the key can never match
+    if (key.size() < prefix.size() ||
+        key.compare(0, prefix.size(), prefix) != 0) {
+      return false;
+    }
+  }
+  const auto& suffix = filter.suffix_rule;
+  if (!suffix.empty()) {
+    // a suffix longer than the key can never match
+    if (key.size() < suffix.size() ||
+        key.compare(key.size() - suffix.size(), suffix.size(), suffix) != 0) {
+      return false;
+    }
+  }
+  if (!filter.regex_rule.empty()) {
+    // TODO: add regex caching in the filter
+    const std::regex base_regex(filter.regex_rule);
+    if (!std::regex_match(key, base_regex)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Return true when every (key,value) pair of the filter is present in 'kv'.
+// Both flat_maps iterate in key order, so std::includes can compare the two
+// sorted pair sequences in a single linear pass.
+bool match(const rgw_s3_key_value_filter& filter, const KeyValueMap& kv) {
+  // all filter pairs must exist with the same value in the object's metadata/tags
+  // object metadata/tags may include items not in the filter
+  return std::includes(kv.begin(), kv.end(), filter.kv.begin(), filter.kv.end());
+}
+
+// Return true when every (key,value) pair of the filter appears in the
+// object's multimap of metadata/tags; the multimap may hold extra keys and
+// multiple values per key. Fixes two issues in the original: the loop
+// variable shadowed the 'filter' parameter, and the lambda parameter type
+// did not match multimap::value_type (pair<const string, string>), forcing
+// a copy of every candidate pair.
+bool match(const rgw_s3_key_value_filter& filter, const KeyMultiValueMap& kv) {
+  // all filter pairs must exist with the same value in the object's metadata/tags
+  // object metadata/tags may include items not in the filter
+  for (const auto& rule : filter.kv) {
+    const auto candidates = kv.equal_range(rule.first);
+    const bool matched = std::any_of(candidates.first, candidates.second,
+        [&rule](const std::pair<const std::string, std::string>& p) {
+          return p.second == rule.second;
+        });
+    if (!matched) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Return true when the event passes the event-type filter: an empty list
+// matches everything, otherwise the event must appear in the list.
+bool match(const rgw::notify::EventTypeList& events, rgw::notify::EventType event) {
+  if (events.empty()) {
+    return true;
+  }
+  return std::find(events.begin(), events.end(), event) != events.end();
+}
+
+// Decode a repeated XML element 'name' into a list of event types,
+// converting each element's text via rgw::notify::from_string().
+// Any previous content of 'l' is discarded.
+void do_decode_xml_obj(rgw::notify::EventTypeList& l, const std::string& name, XMLObj *obj) {
+  l.clear();
+
+  XMLObjIter iter = obj->find(name);
+  XMLObj *o;
+
+  while ((o = iter.get_next())) {
+    std::string val;
+    decode_xml_obj(val, o);
+    l.push_back(rgw::notify::from_string(val));
+  }
+}
+
+// Parse one <TopicConfiguration>: Id and Topic (ARN) are mandatory, the
+// Filter section is optional, and an empty <Event> list defaults to both
+// ObjectCreated and ObjectRemoved.
+bool rgw_pubsub_s3_notification::decode_xml(XMLObj *obj) {
+  const auto throw_if_missing = true;
+  RGWXMLDecoder::decode_xml("Id", id, obj, throw_if_missing);
+
+  RGWXMLDecoder::decode_xml("Topic", topic_arn, obj, throw_if_missing);
+
+  RGWXMLDecoder::decode_xml("Filter", filter, obj);
+
+  do_decode_xml_obj(events, "Event", obj);
+  if (events.empty()) {
+    // if no events are provided, we assume all events
+    events.push_back(rgw::notify::ObjectCreated);
+    events.push_back(rgw::notify::ObjectRemoved);
+  }
+  return true;
+}
+
+// XML-dump one notification configuration; the Filter section is emitted
+// only when it carries rules, and each event becomes its own <Event>.
+void rgw_pubsub_s3_notification::dump_xml(Formatter *f) const {
+  ::encode_xml("Id", id, f);
+  ::encode_xml("Topic", topic_arn.c_str(), f);
+  if (filter.has_content()) {
+    ::encode_xml("Filter", filter, f);
+  }
+  for (const auto& event : events) {
+    ::encode_xml("Event", rgw::notify::to_string(event), f);
+  }
+}
+
+// Parse all <TopicConfiguration> elements of a NotificationConfiguration.
+bool rgw_pubsub_s3_notifications::decode_xml(XMLObj *obj) {
+  do_decode_xml_obj(list, "TopicConfiguration", obj);
+  return true;
+}
+
+// Build the S3-facing notification view from an internal topic filter
+// (used when getting/listing notifications).
+rgw_pubsub_s3_notification::rgw_pubsub_s3_notification(const rgw_pubsub_topic_filter& topic_filter) :
+  id(topic_filter.s3_id), events(topic_filter.events), topic_arn(topic_filter.topic.arn), filter(topic_filter.s3_filter) {}
+
+// Wrap all notifications in a <NotificationConfiguration> root element.
+void rgw_pubsub_s3_notifications::dump_xml(Formatter *f) const {
+  do_encode_xml("NotificationConfiguration", list, "TopicConfiguration", f);
+}
+
+// Serialize the event as an S3-style JSON record; the nesting of the
+// sections (userIdentity/requestParameters/responseElements/s3/...) follows
+// the AWS notification content structure documented on the header struct.
+void rgw_pubsub_s3_event::dump(Formatter *f) const {
+  encode_json("eventVersion", eventVersion, f);
+  encode_json("eventSource", eventSource, f);
+  encode_json("awsRegion", awsRegion, f);
+  // eventTime is stored as real_time; convert for JSON output
+  utime_t ut(eventTime);
+  encode_json("eventTime", ut, f);
+  encode_json("eventName", eventName, f);
+  {
+    Formatter::ObjectSection s(*f, "userIdentity");
+    encode_json("principalId", userIdentity, f);
+  }
+  {
+    Formatter::ObjectSection s(*f, "requestParameters");
+    encode_json("sourceIPAddress", sourceIPAddress, f);
+  }
+  {
+    Formatter::ObjectSection s(*f, "responseElements");
+    encode_json("x-amz-request-id", x_amz_request_id, f);
+    encode_json("x-amz-id-2", x_amz_id_2, f);
+  }
+  {
+    Formatter::ObjectSection s(*f, "s3");
+    encode_json("s3SchemaVersion", s3SchemaVersion, f);
+    encode_json("configurationId", configurationId, f);
+    {
+      Formatter::ObjectSection sub_s(*f, "bucket");
+      encode_json("name", bucket_name, f);
+      {
+        Formatter::ObjectSection sub_sub_s(*f, "ownerIdentity");
+        encode_json("principalId", bucket_ownerIdentity, f);
+      }
+      encode_json("arn", bucket_arn, f);
+      encode_json("id", bucket_id, f);
+    }
+    {
+      Formatter::ObjectSection sub_s(*f, "object");
+      encode_json("key", object_key, f);
+      encode_json("size", object_size, f);
+      encode_json("eTag", object_etag, f);
+      encode_json("versionId", object_versionId, f);
+      encode_json("sequencer", object_sequencer, f);
+      encode_json("metadata", x_meta_map, f);
+      encode_json("tags", tags, f);
+    }
+  }
+  // rgw extensions, emitted at top level next to the standard fields
+  encode_json("eventId", id, f);
+  encode_json("opaqueData", opaque_data, f);
+}
+
+// JSON-dump the topic (admin/radosgw-admin oriented field names).
+void rgw_pubsub_topic::dump(Formatter *f) const
+{
+  encode_json("user", user, f);
+  encode_json("name", name, f);
+  encode_json("dest", dest, f);
+  encode_json("arn", arn, f);
+  encode_json("opaqueData", opaque_data, f);
+}
+
+// XML-dump the topic with SNS-style element names.
+void rgw_pubsub_topic::dump_xml(Formatter *f) const
+{
+  encode_xml("User", user, f);
+  encode_xml("Name", name, f);
+  encode_xml("EndPoint", dest, f);
+  encode_xml("TopicArn", arn, f);
+  encode_xml("OpaqueData", opaque_data, f);
+}
+
+// Emit one <entry><key>..</key><value>..</value></entry> element; helper
+// for the SNS GetTopicAttributes-style output.
+void encode_xml_key_value_entry(const std::string& key, const std::string& value, Formatter *f) {
+  f->open_object_section("entry");
+  encode_xml("key", key, f);
+  encode_xml("value", value, f);
+  f->close_section(); // entry
+}
+
+// XML-dump the topic as an SNS-style "Attributes" list of key/value
+// entries; the destination is flattened to its JSON representation.
+void rgw_pubsub_topic::dump_xml_as_attributes(Formatter *f) const
+{
+  f->open_array_section("Attributes");
+  std::string str_user;
+  user.to_str(str_user);
+  encode_xml_key_value_entry("User", str_user, f);
+  encode_xml_key_value_entry("Name", name, f);
+  encode_xml_key_value_entry("EndPoint", dest.to_json_str(), f);
+  encode_xml_key_value_entry("TopicArn", arn, f);
+  encode_xml_key_value_entry("OpaqueData", opaque_data, f);
+  f->close_section(); // Attributes
+}
+
+// JSON-encode an event-type list as an array of their string names.
+void encode_json(const char *name, const rgw::notify::EventTypeList& l, Formatter *f)
+{
+  f->open_array_section(name);
+  for (const auto& event : l) {
+    f->dump_string("obj", rgw::notify::to_string(event));
+  }
+  f->close_section();
+}
+
+// JSON-dump a bucket notification entry: target topic, S3 id, events
+// and filter rules.
+void rgw_pubsub_topic_filter::dump(Formatter *f) const
+{
+  encode_json("TopicArn", topic.arn, f);
+  encode_json("Id", s3_id, f);
+  encode_json("Events", events, f);
+  encode_json("Filter", s3_filter, f);
+}
+
+// JSON-dump all notifications configured on a bucket, keyed by topic name.
+void rgw_pubsub_bucket_topics::dump(Formatter *f) const
+{
+  Formatter::ArraySection s(*f, "notifications");
+  for (auto& t : topics) {
+    encode_json(t.first.c_str(), t.second, f);
+  }
+}
+
+// JSON-dump the topic map, but only topics whose name matches their
+// destination's arn_topic (skips entries that don't satisfy that check).
+void rgw_pubsub_topics::dump(Formatter *f) const
+{
+  Formatter::ArraySection s(*f, "topics");
+  for (auto& t : topics) {
+    auto& topic = t.second;
+    if (topic.name == topic.dest.arn_topic) {
+      encode_json(t.first.c_str(), topic, f);
+    }
+  }
+}
+
+// XML-dump every topic as a <member> element (SNS list response style).
+void rgw_pubsub_topics::dump_xml(Formatter *f) const
+{
+  for (auto& t : topics) {
+    encode_xml("member", t.second, f);
+  }
+}
+
+// JSON-dump the push destination (admin-facing field names).
+void rgw_pubsub_dest::dump(Formatter *f) const
+{
+  encode_json("push_endpoint", push_endpoint, f);
+  encode_json("push_endpoint_args", push_endpoint_args, f);
+  encode_json("push_endpoint_topic", arn_topic, f);
+  encode_json("stored_secret", stored_secret, f);
+  encode_json("persistent", persistent, f);
+}
+
+// XML-dump the push destination (S3/SNS-facing element names).
+void rgw_pubsub_dest::dump_xml(Formatter *f) const
+{
+  encode_xml("EndpointAddress", push_endpoint, f);
+  encode_xml("EndpointArgs", push_endpoint_args, f);
+  encode_xml("EndpointTopic", arn_topic, f);
+  encode_xml("HasStoredSecret", stored_secret, f);
+  encode_xml("Persistent", persistent, f);
+}
+
+// Render the destination as a standalone JSON object string; used when the
+// destination must be embedded as a single attribute value.
+std::string rgw_pubsub_dest::to_json_str() const
+{
+  JSONFormatter f;
+  f.open_object_section("");
+  encode_json("EndpointAddress", push_endpoint, &f);
+  encode_json("EndpointArgs", push_endpoint_args, &f);
+  encode_json("EndpointTopic", arn_topic, &f);
+  encode_json("HasStoredSecret", stored_secret, &f);
+  encode_json("Persistent", persistent, &f);
+  f.close_section();
+  std::stringstream ss;
+  f.flush(ss);
+  return ss.str();
+}
+
+// Bind the pubsub API to a storage driver and a tenant; all topic
+// reads/writes below are scoped to this tenant.
+RGWPubSub::RGWPubSub(rgw::sal::Driver* _driver, const std::string& _tenant)
+  : driver(_driver), tenant(_tenant)
+{}
+
+// Load the tenant's topic map from the driver. Any failure (including
+// ENOENT) is propagated to the caller; it is only logged at a low level
+// since a missing topics object is a common case.
+int RGWPubSub::read_topics(const DoutPrefixProvider *dpp, rgw_pubsub_topics& result,
+    RGWObjVersionTracker *objv_tracker, optional_yield y) const
+{
+  const int ret = driver->read_topics(tenant, result, objv_tracker, y, dpp);
+  if (ret < 0) {
+    ldpp_dout(dpp, 10) << "WARNING: failed to read topics info: ret=" << ret << dendl;
+    return ret;
+  }
+  return 0;
+}
+
+// Persist the tenant's topic map through the driver.
+// NOTE(review): -ENOENT is treated as success here, which looks copied from
+// the read path — confirm a write can legitimately return ENOENT.
+int RGWPubSub::write_topics(const DoutPrefixProvider *dpp, const rgw_pubsub_topics& topics,
+			RGWObjVersionTracker *objv_tracker, optional_yield y) const
+{
+  const int ret = driver->write_topics(tenant, topics, objv_tracker, y, dpp);
+  if (ret < 0 && ret != -ENOENT) {
+    ldpp_dout(dpp, 1) << "ERROR: failed to write topics info: ret=" << ret << dendl;
+    return ret;
+  }
+  return 0;
+}
+
+// Load the bucket's notification map. A missing notifications object
+// (-ENOENT) is not an error: the result simply stays empty and 0 is
+// returned.
+int RGWPubSub::Bucket::read_topics(const DoutPrefixProvider *dpp, rgw_pubsub_bucket_topics& result,
+    RGWObjVersionTracker *objv_tracker, optional_yield y) const
+{
+  const int ret = bucket->read_topics(result, objv_tracker, y, dpp);
+  if (ret < 0 && ret != -ENOENT) {
+    ldpp_dout(dpp, 1) << "ERROR: failed to read bucket topics info: ret=" << ret << dendl;
+    return ret;
+  }
+  return 0;
+}
+
+// Persist the bucket's notification map; all errors are propagated.
+int RGWPubSub::Bucket::write_topics(const DoutPrefixProvider *dpp, const rgw_pubsub_bucket_topics& topics,
+				    RGWObjVersionTracker *objv_tracker,
+				    optional_yield y) const
+{
+  const int ret = bucket->write_topics(topics, objv_tracker, y, dpp);
+  if (ret < 0) {
+    ldpp_dout(dpp, 1) << "ERROR: failed to write bucket topics info: ret=" << ret << dendl;
+    return ret;
+  }
+
+  return 0;
+}
+
+// Look up a single topic by name in the tenant's topic map.
+// Returns -ENOENT when the topic does not exist.
+int RGWPubSub::get_topic(const DoutPrefixProvider *dpp, const std::string& name, rgw_pubsub_topic& result, optional_yield y) const
+{
+  rgw_pubsub_topics topics;
+  const int ret = read_topics(dpp, topics, nullptr, y);
+  if (ret < 0) {
+    ldpp_dout(dpp, 1) << "ERROR: failed to read topics info: ret=" << ret << dendl;
+    return ret;
+  }
+
+  auto iter = topics.topics.find(name);
+  if (iter == topics.topics.end()) {
+    ldpp_dout(dpp, 1) << "ERROR: topic not found" << dendl;
+    return -ENOENT;
+  }
+
+  result = iter->second;
+  return 0;
+}
+
+// from list of bucket topics, find the one that was auto-generated by a notification
+// from list of bucket topics, find the one that was auto-generated by a notification
+auto find_unique_topic(const rgw_pubsub_bucket_topics &bucket_topics, const std::string &notification_id) {
+  using result_t = std::optional<std::reference_wrapper<const rgw_pubsub_topic_filter>>;
+  for (const auto& entry : bucket_topics.topics) {
+    if (entry.second.s3_id == notification_id) {
+      return result_t(entry.second);
+    }
+  }
+  return result_t(std::nullopt);
+}
+
+// Find the bucket notification whose S3 id matches 'notification_id'.
+// Returns -ENOENT when no notification carries that id.
+int RGWPubSub::Bucket::get_notification_by_id(const DoutPrefixProvider *dpp, const std::string& notification_id,
+                                              rgw_pubsub_topic_filter& result, optional_yield y) const {
+  rgw_pubsub_bucket_topics bucket_topics;
+  const int ret = read_topics(dpp, bucket_topics, nullptr, y);
+  if (ret < 0) {
+    ldpp_dout(dpp, 1) << "ERROR: failed to read bucket_topics info: ret=" << ret << dendl;
+    return ret;
+  }
+
+  auto iter = find_unique_topic(bucket_topics, notification_id);
+  if (!iter) {
+    ldpp_dout(dpp, 1) << "ERROR: notification was not found" << dendl;
+    return -ENOENT;
+  }
+
+  result = iter->get();
+  return 0;
+}
+
+
+// Convenience overload: create a notification with no filter and an empty
+// S3 notification name.
+int RGWPubSub::Bucket::create_notification(const DoutPrefixProvider *dpp, const std::string& topic_name,
+    const rgw::notify::EventTypeList& events, optional_yield y) const {
+  return create_notification(dpp, topic_name, events, std::nullopt, "", y);
+}
+
+// Create or overwrite the bucket notification entry for 'topic_name':
+// resolve the topic, read the bucket's notification map (version-tracked),
+// upsert the entry and write the map back.
+int RGWPubSub::Bucket::create_notification(const DoutPrefixProvider *dpp, const std::string& topic_name,
+    const rgw::notify::EventTypeList& events, OptionalFilter s3_filter, const std::string& notif_name, optional_yield y) const {
+  rgw_pubsub_topic topic_info;
+
+  int ret = ps.get_topic(dpp, topic_name, topic_info, y);
+  if (ret < 0) {
+    ldpp_dout(dpp, 1) << "ERROR: failed to read topic '" << topic_name << "' info: ret=" << ret << dendl;
+    return ret;
+  }
+  ldpp_dout(dpp, 20) << "successfully read topic '" << topic_name << "' info" << dendl;
+
+  RGWObjVersionTracker objv_tracker;
+  rgw_pubsub_bucket_topics bucket_topics;
+
+  ret = read_topics(dpp, bucket_topics, &objv_tracker, y);
+  if (ret < 0) {
+    ldpp_dout(dpp, 1) << "ERROR: failed to read topics from bucket '" <<
+      bucket->get_name() << "': ret=" << ret << dendl;
+    return ret;
+  }
+  ldpp_dout(dpp, 20) << "successfully read " << bucket_topics.topics.size() << " topics from bucket '" <<
+    bucket->get_name() << "'" << dendl;
+
+  // operator[] inserts a fresh entry when the topic is not yet configured
+  auto& topic_filter = bucket_topics.topics[topic_name];
+  topic_filter.topic = topic_info;
+  topic_filter.events = events;
+  topic_filter.s3_id = notif_name;
+  if (s3_filter) {
+    topic_filter.s3_filter = *s3_filter;
+  }
+
+  ret = write_topics(dpp, bucket_topics, &objv_tracker, y);
+  if (ret < 0) {
+    ldpp_dout(dpp, 1) << "ERROR: failed to write topics to bucket '" << bucket->get_name() << "': ret=" << ret << dendl;
+    return ret;
+  }
+
+  ldpp_dout(dpp, 20) << "successfully wrote " << bucket_topics.topics.size() << " topics to bucket '" << bucket->get_name() << "'" << dendl;
+
+  return 0;
+}
+
+// Remove a notification addressed by its topic name.
+int RGWPubSub::Bucket::remove_notification(const DoutPrefixProvider *dpp, const std::string& topic_name, optional_yield y) const
+{
+  return remove_notification_inner(dpp, topic_name, false, y);
+}
+
+// Remove one notification from the bucket's map. 'notification_id' is
+// either a topic name (is_notification_id == false) or an S3 notification
+// id that must first be resolved to its topic. When the last notification
+// is removed, the whole per-bucket notifications object is deleted.
+// (The original used std::unique_ptr<std::string> for the resolved name; a
+// plain std::string avoids the pointless heap allocations.)
+int RGWPubSub::Bucket::remove_notification_inner(const DoutPrefixProvider *dpp, const std::string& notification_id,
+    bool is_notification_id, optional_yield y) const
+{
+  RGWObjVersionTracker objv_tracker;
+  rgw_pubsub_bucket_topics bucket_topics;
+
+  auto ret = read_topics(dpp, bucket_topics, &objv_tracker, y);
+  if (ret < 0) {
+    ldpp_dout(dpp, 1) << "ERROR: failed to read bucket topics info: ret=" << ret << dendl;
+    return ret;
+  }
+
+  // resolve the topic name to erase
+  std::string topic_name = notification_id;
+  if (is_notification_id) {
+    const auto iter = find_unique_topic(bucket_topics, notification_id);
+    if (!iter) {
+      ldpp_dout(dpp, 1) << "ERROR: notification was not found" << dendl;
+      return -ENOENT;
+    }
+    topic_name = iter->get().topic.name;
+  }
+
+  if (bucket_topics.topics.erase(topic_name) == 0) {
+    ldpp_dout(dpp, 1) << "INFO: no need to remove, topic does not exist" << dendl;
+    return 0;
+  }
+
+  if (bucket_topics.topics.empty()) {
+    // no more topics - delete the notification object of the bucket
+    ret = bucket->remove_topics(&objv_tracker, y, dpp);
+    if (ret < 0 && ret != -ENOENT) {
+      ldpp_dout(dpp, 1) << "ERROR: failed to remove bucket topics: ret=" << ret << dendl;
+      return ret;
+    }
+    return 0;
+  }
+
+  // write back the notifications without the deleted one
+  ret = write_topics(dpp, bucket_topics, &objv_tracker, y);
+  if (ret < 0) {
+    ldpp_dout(dpp, 1) << "ERROR: failed to write topics info: ret=" << ret << dendl;
+    return ret;
+  }
+
+  return 0;
+}
+
+// Remove a notification addressed by its S3 notification id.
+int RGWPubSub::Bucket::remove_notification_by_id(const DoutPrefixProvider *dpp, const std::string& notif_id, optional_yield y) const
+{
+  return remove_notification_inner(dpp, notif_id, true, y);
+}
+
+// Remove all notifications from a bucket: best-effort removal of each
+// referenced topic, then deletion of the bucket's notifications object.
+int RGWPubSub::Bucket::remove_notifications(const DoutPrefixProvider *dpp, optional_yield y) const
+{
+  // get all topics on a bucket
+  rgw_pubsub_bucket_topics bucket_topics;
+  auto ret = get_topics(dpp, bucket_topics, y);
+  if (ret < 0 && ret != -ENOENT) {
+    ldpp_dout(dpp, 1) << "ERROR: failed to get list of topics from bucket '" << bucket->get_name() << "', ret=" << ret << dendl;
+    return ret ;
+  }
+
+  // remove all auto-generated topics (failures are logged, not fatal)
+  for (const auto& topic : bucket_topics.topics) {
+    const auto& topic_name = topic.first;
+    ret = ps.remove_topic(dpp, topic_name, y);
+    if (ret < 0 && ret != -ENOENT) {
+      ldpp_dout(dpp, 5) << "WARNING: failed to remove auto-generated topic '" << topic_name << "', ret=" << ret << dendl;
+    }
+  }
+
+  // delete the notification object of the bucket
+  ret = bucket->remove_topics(nullptr, y, dpp);
+  if (ret < 0 && ret != -ENOENT) {
+    ldpp_dout(dpp, 1) << "ERROR: failed to remove bucket topics: ret=" << ret << dendl;
+    return ret;
+  }
+
+  return 0;
+}
+
+// Convenience overload: create a topic with an empty destination, ARN and
+// opaque data.
+int RGWPubSub::create_topic(const DoutPrefixProvider *dpp, const std::string& name, optional_yield y) const {
+  return create_topic(dpp, name, rgw_pubsub_dest{}, "", "", y);
+}
+
+// Create or overwrite a topic in the tenant's topic map (version-tracked
+// read-modify-write).
+int RGWPubSub::create_topic(const DoutPrefixProvider *dpp, const std::string& name, const rgw_pubsub_dest& dest,
+    const std::string& arn, const std::string& opaque_data, optional_yield y) const {
+  RGWObjVersionTracker objv_tracker;
+  rgw_pubsub_topics topics;
+
+  int ret = read_topics(dpp, topics, &objv_tracker, y);
+  if (ret < 0 && ret != -ENOENT) {
+    // it's not an error if no topics exist, we create one
+    ldpp_dout(dpp, 1) << "ERROR: failed to read topics info: ret=" << ret << dendl;
+    return ret;
+  }
+
+  // operator[] inserts a fresh entry when the topic does not exist yet
+  rgw_pubsub_topic& new_topic = topics.topics[name];
+  new_topic.user = rgw_user("", tenant);
+  new_topic.name = name;
+  new_topic.dest = dest;
+  new_topic.arn = arn;
+  new_topic.opaque_data = opaque_data;
+
+  ret = write_topics(dpp, topics, &objv_tracker, y);
+  if (ret < 0) {
+    ldpp_dout(dpp, 1) << "ERROR: failed to write topics info: ret=" << ret << dendl;
+    return ret;
+  }
+
+  return 0;
+}
+
+// Remove a topic from the tenant's topic map; a missing topics object
+// makes the deletion a no-op.
+int RGWPubSub::remove_topic(const DoutPrefixProvider *dpp, const std::string& name, optional_yield y) const
+{
+  RGWObjVersionTracker objv_tracker;
+  rgw_pubsub_topics topics;
+
+  int ret = read_topics(dpp, topics, &objv_tracker, y);
+  if (ret < 0 && ret != -ENOENT) {
+    ldpp_dout(dpp, 1) << "ERROR: failed to read topics info: ret=" << ret << dendl;
+    return ret;
+  } else if (ret == -ENOENT) {
+    // its not an error if no topics exist, just a no-op
+    ldpp_dout(dpp, 10) << "WARNING: failed to read topics info, deletion is a no-op: ret=" << ret << dendl;
+    return 0;
+  }
+
+  topics.topics.erase(name);
+
+  ret = write_topics(dpp, topics, &objv_tracker, y);
+  if (ret < 0) {
+    ldpp_dout(dpp, 1) << "ERROR: failed to remove topics info: ret=" << ret << dendl;
+    return ret;
+  }
+
+  return 0;
+}
+
diff --git a/src/rgw/rgw_pubsub.h b/src/rgw/rgw_pubsub.h
new file mode 100644
index 000000000..290c52c2b
--- /dev/null
+++ b/src/rgw/rgw_pubsub.h
@@ -0,0 +1,629 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_sal.h"
+#include "rgw_tools.h"
+#include "rgw_zone.h"
+#include "rgw_notify_event_type.h"
+#include <boost/container/flat_map.hpp>
+
+class XMLObj;
+
+// S3Key filter: optional prefix/suffix/regex rules applied to object keys.
+// An empty rule means "not set" and always matches.
+struct rgw_s3_key_filter {
+  std::string prefix_rule;
+  std::string suffix_rule;
+  std::string regex_rule;
+
+  bool has_content() const;
+
+  void dump(Formatter *f) const;
+  bool decode_xml(XMLObj *obj);
+  void dump_xml(Formatter *f) const;
+
+  // wire format v1: the three rules in declaration order
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(prefix_rule, bl);
+    encode(suffix_rule, bl);
+    encode(regex_rule, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(prefix_rule, bl);
+    decode(suffix_rule, bl);
+    decode(regex_rule, bl);
+    DECODE_FINISH(bl);
+  }
+};
+WRITE_CLASS_ENCODER(rgw_s3_key_filter)
+
+using KeyValueMap = boost::container::flat_map<std::string, std::string>;
+using KeyMultiValueMap = std::multimap<std::string, std::string>;
+
+// Key/value filter (used for S3Metadata and S3Tags): all pairs must match
+// the object's metadata/tags for the filter to pass.
+struct rgw_s3_key_value_filter {
+  KeyValueMap kv;
+
+  bool has_content() const;
+
+  void dump(Formatter *f) const;
+  bool decode_xml(XMLObj *obj);
+  void dump_xml(Formatter *f) const;
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(kv, bl);
+    ENCODE_FINISH(bl);
+  }
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(kv, bl);
+    DECODE_FINISH(bl);
+  }
+};
+WRITE_CLASS_ENCODER(rgw_s3_key_value_filter)
+
+// Composite S3 notification filter: key, metadata and tag sub-filters.
+struct rgw_s3_filter {
+  rgw_s3_key_filter key_filter;
+  rgw_s3_key_value_filter metadata_filter;
+  rgw_s3_key_value_filter tag_filter;
+
+  bool has_content() const;
+
+  void dump(Formatter *f) const;
+  bool decode_xml(XMLObj *obj);
+  void dump_xml(Formatter *f) const;
+
+  // wire format v2: v1 had only key and metadata filters; the tag filter
+  // was added in v2 (see the struct_v guard in decode())
+  void encode(bufferlist& bl) const {
+    ENCODE_START(2, 1, bl);
+    encode(key_filter, bl);
+    encode(metadata_filter, bl);
+    encode(tag_filter, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(2, bl);
+    decode(key_filter, bl);
+    decode(metadata_filter, bl);
+    if (struct_v >= 2) {
+      decode(tag_filter, bl);
+    }
+    DECODE_FINISH(bl);
+  }
+};
+WRITE_CLASS_ENCODER(rgw_s3_filter)
+
+using OptionalFilter = std::optional<rgw_s3_filter>;
+
+struct rgw_pubsub_topic_filter;
+/* S3 notification configuration
+ * based on: https://docs.aws.amazon.com/AmazonS3/latest/API/RESTBucketPUTnotification.html
+<NotificationConfiguration xmlns="http://s3.amazonaws.com/doc/2006-03-01/">
+ <TopicConfiguration>
+ <Filter>
+ <S3Key>
+ <FilterRule>
+ <Name>suffix</Name>
+ <Value>jpg</Value>
+ </FilterRule>
+ </S3Key>
+ <S3Metadata>
+ <FilterRule>
+ <Name></Name>
+ <Value></Value>
+ </FilterRule>
+ </S3Metadata>
+ <S3Tags>
+ <FilterRule>
+ <Name></Name>
+ <Value></Value>
+ </FilterRule>
+ </S3Tags>
+ </Filter>
+ <Id>notification1</Id>
+ <Topic>arn:aws:sns:<region>:<account>:<topic></Topic>
+ <Event>s3:ObjectCreated:*</Event>
+ <Event>s3:ObjectRemoved:*</Event>
+ </TopicConfiguration>
+</NotificationConfiguration>
+*/
+// One S3 <TopicConfiguration>: id, event list, target topic ARN and
+// optional filter (see the XML sample above).
+struct rgw_pubsub_s3_notification {
+  // notification id
+  std::string id;
+  // types of events
+  rgw::notify::EventTypeList events;
+  // topic ARN
+  std::string topic_arn;
+  // filter rules
+  rgw_s3_filter filter;
+
+  bool decode_xml(XMLObj *obj);
+  void dump_xml(Formatter *f) const;
+
+  rgw_pubsub_s3_notification() = default;
+  // construct from rgw_pubsub_topic_filter (used by get/list notifications)
+  explicit rgw_pubsub_s3_notification(const rgw_pubsub_topic_filter& topic_filter);
+};
+
+// return true if the key matches the prefix/suffix/regex rules of the key filter
+bool match(const rgw_s3_key_filter& filter, const std::string& key);
+
+// return true if the key matches the metadata rules of the metadata filter
+bool match(const rgw_s3_key_value_filter& filter, const KeyValueMap& kv);
+
+// return true if the key matches the tag rules of the tag filter
+bool match(const rgw_s3_key_value_filter& filter, const KeyMultiValueMap& kv);
+
+// return true if the event type matches (equal or contained in) one of the events in the list
+bool match(const rgw::notify::EventTypeList& events, rgw::notify::EventType event);
+
+// The full <NotificationConfiguration> payload: a list of topic
+// configurations.
+struct rgw_pubsub_s3_notifications {
+  std::list<rgw_pubsub_s3_notification> list;
+  bool decode_xml(XMLObj *obj);
+  void dump_xml(Formatter *f) const;
+};
+
+/* S3 event records structure
+ * based on: https://docs.aws.amazon.com/AmazonS3/latest/dev/notification-content-structure.html
+{
+"Records":[
+ {
+ "eventVersion":""
+ "eventSource":"",
+ "awsRegion":"",
+ "eventTime":"",
+ "eventName":"",
+ "userIdentity":{
+ "principalId":""
+ },
+ "requestParameters":{
+ "sourceIPAddress":""
+ },
+ "responseElements":{
+ "x-amz-request-id":"",
+ "x-amz-id-2":""
+ },
+ "s3":{
+ "s3SchemaVersion":"1.0",
+ "configurationId":"",
+ "bucket":{
+ "name":"",
+ "ownerIdentity":{
+ "principalId":""
+ },
+ "arn":""
+ "id": ""
+ },
+ "object":{
+ "key":"",
+ "size": ,
+ "eTag":"",
+ "versionId":"",
+ "sequencer": "",
+ "metadata": ""
+ "tags": ""
+ }
+ },
+ "eventId":"",
+ }
+]
+}*/
+
+// One S3 event record (see the JSON sample above). The encode/decode pair
+// is versioned: v1 is the base record, v2 added bucket_id and x_meta_map,
+// v3 added tags, v4 added opaque_data.
+struct rgw_pubsub_s3_event {
+  constexpr static const char* const json_type_plural = "Records";
+  std::string eventVersion = "2.2";
+  // aws:s3
+  std::string eventSource = "ceph:s3";
+  // zonegroup
+  std::string awsRegion;
+  // time of the request
+  ceph::real_time eventTime;
+  // type of the event
+  std::string eventName;
+  // user that sent the request
+  std::string userIdentity;
+  // IP address of source of the request (not implemented)
+  std::string sourceIPAddress;
+  // request ID (not implemented)
+  std::string x_amz_request_id;
+  // radosgw that received the request
+  std::string x_amz_id_2;
+  std::string s3SchemaVersion = "1.0";
+  // ID received in the notification request
+  std::string configurationId;
+  // bucket name
+  std::string bucket_name;
+  // bucket owner
+  std::string bucket_ownerIdentity;
+  // bucket ARN
+  std::string bucket_arn;
+  // object key
+  std::string object_key;
+  // object size
+  uint64_t object_size = 0;
+  // object etag
+  std::string object_etag;
+  // object version id bucket is versioned
+  std::string object_versionId;
+  // hexadecimal value used to determine event order for specific key
+  std::string object_sequencer;
+  // this is an rgw extension (not S3 standard)
+  // used to store a globally unique identifier of the event
+  // that could be used for acking or any other identification of the event
+  std::string id;
+  // this is an rgw extension holding the internal bucket id
+  std::string bucket_id;
+  // meta data
+  KeyValueMap x_meta_map;
+  // tags
+  KeyMultiValueMap tags;
+  // opaque data received from the topic
+  // could be used to identify the gateway
+  std::string opaque_data;
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(4, 1, bl);
+    encode(eventVersion, bl);
+    encode(eventSource, bl);
+    encode(awsRegion, bl);
+    encode(eventTime, bl);
+    encode(eventName, bl);
+    encode(userIdentity, bl);
+    encode(sourceIPAddress, bl);
+    encode(x_amz_request_id, bl);
+    encode(x_amz_id_2, bl);
+    encode(s3SchemaVersion, bl);
+    encode(configurationId, bl);
+    encode(bucket_name, bl);
+    encode(bucket_ownerIdentity, bl);
+    encode(bucket_arn, bl);
+    encode(object_key, bl);
+    encode(object_size, bl);
+    encode(object_etag, bl);
+    encode(object_versionId, bl);
+    encode(object_sequencer, bl);
+    encode(id, bl);
+    encode(bucket_id, bl);
+    encode(x_meta_map, bl);
+    encode(tags, bl);
+    encode(opaque_data, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(4, bl);
+    decode(eventVersion, bl);
+    decode(eventSource, bl);
+    decode(awsRegion, bl);
+    decode(eventTime, bl);
+    decode(eventName, bl);
+    decode(userIdentity, bl);
+    decode(sourceIPAddress, bl);
+    decode(x_amz_request_id, bl);
+    decode(x_amz_id_2, bl);
+    decode(s3SchemaVersion, bl);
+    decode(configurationId, bl);
+    decode(bucket_name, bl);
+    decode(bucket_ownerIdentity, bl);
+    decode(bucket_arn, bl);
+    decode(object_key, bl);
+    decode(object_size, bl);
+    decode(object_etag, bl);
+    decode(object_versionId, bl);
+    decode(object_sequencer, bl);
+    decode(id, bl);
+    // fields added after v1; older encodings simply leave the defaults
+    if (struct_v >= 2) {
+      decode(bucket_id, bl);
+      decode(x_meta_map, bl);
+    }
+    if (struct_v >= 3) {
+      decode(tags, bl);
+    }
+    if (struct_v >= 4) {
+      decode(opaque_data, bl);
+    }
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(rgw_pubsub_s3_event)
+
+// setting a unique ID for an event based on object hash and timestamp
+void set_event_id(std::string& id, const std::string& hash, const utime_t& ts);
+
// push-destination settings of a topic (endpoint + delivery options)
struct rgw_pubsub_dest {
  std::string push_endpoint;       // endpoint URI to push notifications to
  std::string push_endpoint_args;  // extra endpoint parameters (URL-style args)
  std::string arn_topic;           // topic name component used in the ARN
  bool stored_secret = false;      // endpoint args contain a secret (do not log/expose)
  bool persistent = false;         // notifications are persisted before delivery

  void encode(bufferlist& bl) const {
    ENCODE_START(5, 1, bl);
    // two placeholder strings: fields that existed in older versions were
    // removed, but the slots are kept so the wire format stays compatible
    // (decode() reads them into a dummy)
    encode("", bl);
    encode("", bl);
    encode(push_endpoint, bl);
    encode(push_endpoint_args, bl);
    encode(arn_topic, bl);
    encode(stored_secret, bl);
    encode(persistent, bl);
    ENCODE_FINISH(bl);
  }

  void decode(bufferlist::const_iterator& bl) {
    DECODE_START(5, bl);
    // skip the two legacy placeholder strings (see encode())
    std::string dummy;
    decode(dummy, bl);
    decode(dummy, bl);
    decode(push_endpoint, bl);
    // fields below were added incrementally; gate on the encoded version
    if (struct_v >= 2) {
      decode(push_endpoint_args, bl);
    }
    if (struct_v >= 3) {
      decode(arn_topic, bl);
    }
    if (struct_v >= 4) {
      decode(stored_secret, bl);
    }
    if (struct_v >= 5) {
      decode(persistent, bl);
    }
    DECODE_FINISH(bl);
  }

  void dump(Formatter *f) const;
  void dump_xml(Formatter *f) const;
  std::string to_json_str() const;
};
+WRITE_CLASS_ENCODER(rgw_pubsub_dest)
+
+struct rgw_pubsub_topic {
+ rgw_user user;
+ std::string name;
+ rgw_pubsub_dest dest;
+ std::string arn;
+ std::string opaque_data;
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(3, 1, bl);
+ encode(user, bl);
+ encode(name, bl);
+ encode(dest, bl);
+ encode(arn, bl);
+ encode(opaque_data, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(3, bl);
+ decode(user, bl);
+ decode(name, bl);
+ if (struct_v >= 2) {
+ decode(dest, bl);
+ decode(arn, bl);
+ }
+ if (struct_v >= 3) {
+ decode(opaque_data, bl);
+ }
+ DECODE_FINISH(bl);
+ }
+
+ std::string to_str() const {
+ return user.tenant + "/" + name;
+ }
+
+ void dump(Formatter *f) const;
+ void dump_xml(Formatter *f) const;
+ void dump_xml_as_attributes(Formatter *f) const;
+
+ bool operator<(const rgw_pubsub_topic& t) const {
+ return to_str().compare(t.to_str());
+ }
+};
+WRITE_CLASS_ENCODER(rgw_pubsub_topic)
+
+// this struct deprecated and remain only for backward compatibility
// deprecated: topic + its subscription names. only decoded when reading
// v1-era rgw_pubsub_topics data (see rgw_pubsub_topics::decode()).
struct rgw_pubsub_topic_subs {
  rgw_pubsub_topic topic;
  std::set<std::string> subs;  // subscription names (unused by current code paths)

  void encode(bufferlist& bl) const {
    ENCODE_START(1, 1, bl);
    encode(topic, bl);
    encode(subs, bl);
    ENCODE_FINISH(bl);
  }

  void decode(bufferlist::const_iterator& bl) {
    DECODE_START(1, bl);
    decode(topic, bl);
    decode(subs, bl);
    DECODE_FINISH(bl);
  }

  void dump(Formatter *f) const;
};
+WRITE_CLASS_ENCODER(rgw_pubsub_topic_subs)
+
+struct rgw_pubsub_topic_filter {
+ rgw_pubsub_topic topic;
+ rgw::notify::EventTypeList events;
+ std::string s3_id;
+ rgw_s3_filter s3_filter;
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(3, 1, bl);
+ encode(topic, bl);
+ // events are stored as a vector of std::strings
+ std::vector<std::string> tmp_events;
+ std::transform(events.begin(), events.end(), std::back_inserter(tmp_events), rgw::notify::to_string);
+ encode(tmp_events, bl);
+ encode(s3_id, bl);
+ encode(s3_filter, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(3, bl);
+ decode(topic, bl);
+ // events are stored as a vector of std::strings
+ events.clear();
+ std::vector<std::string> tmp_events;
+ decode(tmp_events, bl);
+ std::transform(tmp_events.begin(), tmp_events.end(), std::back_inserter(events), rgw::notify::from_string);
+ if (struct_v >= 2) {
+ decode(s3_id, bl);
+ }
+ if (struct_v >= 3) {
+ decode(s3_filter, bl);
+ }
+ DECODE_FINISH(bl);
+ }
+
+ void dump(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(rgw_pubsub_topic_filter)
+
// all notifications configured on a single bucket
struct rgw_pubsub_bucket_topics {
  // keyed by a per-bucket identifier of the notification
  // (presumably the notification/topic name — confirm against callers)
  std::map<std::string, rgw_pubsub_topic_filter> topics;

  void encode(bufferlist& bl) const {
    ENCODE_START(1, 1, bl);
    encode(topics, bl);
    ENCODE_FINISH(bl);
  }

  void decode(bufferlist::const_iterator& bl) {
    DECODE_START(1, bl);
    decode(topics, bl);
    DECODE_FINISH(bl);
  }

  void dump(Formatter *f) const;
};
+WRITE_CLASS_ENCODER(rgw_pubsub_bucket_topics)
+
+struct rgw_pubsub_topics {
+ std::map<std::string, rgw_pubsub_topic> topics;
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(2, 2, bl);
+ encode(topics, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(2, bl);
+ if (struct_v >= 2) {
+ decode(topics, bl);
+ } else {
+ std::map<std::string, rgw_pubsub_topic_subs> v1topics;
+ decode(v1topics, bl);
+ std::transform(v1topics.begin(), v1topics.end(), std::inserter(topics, topics.end()),
+ [](const auto& entry) {
+ return std::pair<std::string, rgw_pubsub_topic>(entry.first, entry.second.topic);
+ });
+ }
+ DECODE_FINISH(bl);
+ }
+
+ void dump(Formatter *f) const;
+ void dump_xml(Formatter *f) const;
+};
+WRITE_CLASS_ENCODER(rgw_pubsub_topics)
+
// entry point for pubsub management: topics are stored per tenant,
// notifications per bucket (via the nested Bucket class)
class RGWPubSub
{
  friend class Bucket;

  rgw::sal::Driver* const driver;
  const std::string tenant;

  // read the tenant's topic list into result
  // use version tracker to enforce atomicity between read/write
  int read_topics(const DoutPrefixProvider *dpp, rgw_pubsub_topics& result,
		      RGWObjVersionTracker* objv_tracker, optional_yield y) const;
  // write back the tenant's topic list under the same version tracker
  int write_topics(const DoutPrefixProvider *dpp, const rgw_pubsub_topics& topics,
			RGWObjVersionTracker* objv_tracker, optional_yield y) const;

public:
  RGWPubSub(rgw::sal::Driver* _driver, const std::string& tenant);

  // per-bucket view: manages the notifications attached to one bucket
  class Bucket {
    friend class RGWPubSub;
    const RGWPubSub& ps;
    rgw::sal::Bucket* const bucket;

    // read the list of topics associated with a bucket and populate into result
    // use version tracker to enforce atomicity between read/write
    // return 0 on success or if no topic was associated with the bucket, error code otherwise
    int read_topics(const DoutPrefixProvider *dpp, rgw_pubsub_bucket_topics& result, 
        RGWObjVersionTracker* objv_tracker, optional_yield y) const;
    // set the list of topics associated with a bucket
    // use version tracker to enforce atomicity between read/write
    // return 0 on success, error code otherwise
    int write_topics(const DoutPrefixProvider *dpp, const rgw_pubsub_bucket_topics& topics,
			RGWObjVersionTracker* objv_tracker, optional_yield y) const;
    // shared implementation of both removal flavors below;
    // notif_id_or_topic selects how notification_id is interpreted
    int remove_notification_inner(const DoutPrefixProvider *dpp, const std::string& notification_id,
                                  bool notif_id_or_topic, optional_yield y) const;
  public:
    Bucket(const RGWPubSub& _ps, rgw::sal::Bucket* _bucket) : 
      ps(_ps), bucket(_bucket)
    {}

    // get the list of topics associated with a bucket and populate into result
    // return 0 on success or if no topic was associated with the bucket, error code otherwise
    int get_topics(const DoutPrefixProvider *dpp, rgw_pubsub_bucket_topics& result, optional_yield y) const {
      return read_topics(dpp, result, nullptr, y);
    }
    // get a bucket_topic by its name and populate it into "result"
    // return -ENOENT if the topic does not exist
    // return 0 on success, error code otherwise
    int get_notification_by_id(const DoutPrefixProvider *dpp, const std::string& notification_id, rgw_pubsub_topic_filter& result, optional_yield y) const;
    // adds a topic + filter (event list, and possibly name metadata or tags filters) to a bucket
    // assigning a notification name is optional (needed for S3 compatible notifications)
    // if the topic already exist on the bucket, the filter event list may be updated
    // for S3 compliant notifications the version with: s3_filter and notif_name should be used
    // return -ENOENT if the topic does not exist
    // return 0 on success, error code otherwise
    int create_notification(const DoutPrefixProvider *dpp, const std::string& topic_name, 
        const rgw::notify::EventTypeList& events, optional_yield y) const;
    int create_notification(const DoutPrefixProvider *dpp, const std::string& topic_name, 
        const rgw::notify::EventTypeList& events, OptionalFilter s3_filter, const std::string& notif_name, optional_yield y) const;
    // remove a topic and filter from bucket
    // if the topic does not exists on the bucket it is a no-op (considered success)
    // return -ENOENT if the notification-id/topic does not exist
    // return 0 on success, error code otherwise
    int remove_notification_by_id(const DoutPrefixProvider *dpp, const std::string& notif_id, optional_yield y) const;
    int remove_notification(const DoutPrefixProvider *dpp, const std::string& topic_name, optional_yield y) const;
    // remove all notifications (and autogenerated topics) associated with the bucket
    // return 0 on success or if no topic was associated with the bucket, error code otherwise
    int remove_notifications(const DoutPrefixProvider *dpp, optional_yield y) const;
  };

  // get the list of topics
  // return 0 on success or if no topic was associated with the bucket, error code otherwise
  int get_topics(const DoutPrefixProvider *dpp, rgw_pubsub_topics& result, optional_yield y) const {
    return read_topics(dpp, result, nullptr, y);
  }
  // get a topic by its name and populate it into "result"
  // return -ENOENT if the topic does not exist
  // return 0 on success, error code otherwise
  int get_topic(const DoutPrefixProvider *dpp, const std::string& name, rgw_pubsub_topic& result, optional_yield y) const;
  // create a topic with a name only
  // if the topic already exists it is a no-op (considered success)
  // return 0 on success, error code otherwise
  int create_topic(const DoutPrefixProvider *dpp, const std::string& name, optional_yield y) const;
  // create a topic with push destination information and ARN
  // if the topic already exists the destination and ARN values may be updated (considered success)
  // return 0 on success, error code otherwise
  int create_topic(const DoutPrefixProvider *dpp, const std::string& name, const rgw_pubsub_dest& dest, 
      const std::string& arn, const std::string& opaque_data, optional_yield y) const;
  // remove a topic according to its name
  // if the topic does not exist it is a no-op (considered success)
  // return 0 on success, error code otherwise
  int remove_topic(const DoutPrefixProvider *dpp, const std::string& name, optional_yield y) const;
};
+
diff --git a/src/rgw/rgw_putobj.cc b/src/rgw/rgw_putobj.cc
new file mode 100644
index 000000000..24a4b3275
--- /dev/null
+++ b/src/rgw/rgw_putobj.cc
@@ -0,0 +1,99 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "rgw_putobj.h"
+
+namespace rgw::putobj {
+
+int ChunkProcessor::process(bufferlist&& data, uint64_t offset)
+{
+ ceph_assert(offset >= chunk.length());
+ uint64_t position = offset - chunk.length();
+
+ const bool flush = (data.length() == 0);
+ if (flush) {
+ if (chunk.length() > 0) {
+ int r = Pipe::process(std::move(chunk), position);
+ if (r < 0) {
+ return r;
+ }
+ }
+ return Pipe::process({}, offset);
+ }
+ chunk.claim_append(data);
+
+ // write each full chunk
+ while (chunk.length() >= chunk_size) {
+ bufferlist bl;
+ chunk.splice(0, chunk_size, &bl);
+
+ int r = Pipe::process(std::move(bl), position);
+ if (r < 0) {
+ return r;
+ }
+ position += chunk_size;
+ }
+ return 0;
+}
+
+
+int StripeProcessor::process(bufferlist&& data, uint64_t offset)
+{
+ ceph_assert(offset >= bounds.first);
+
+ const bool flush = (data.length() == 0);
+ if (flush) {
+ return Pipe::process({}, offset - bounds.first);
+ }
+
+ auto max = bounds.second - offset;
+ while (data.length() > max) {
+ if (max > 0) {
+ bufferlist bl;
+ data.splice(0, max, &bl);
+
+ int r = Pipe::process(std::move(bl), offset - bounds.first);
+ if (r < 0) {
+ return r;
+ }
+ offset += max;
+ }
+
+ // flush the current chunk
+ int r = Pipe::process({}, offset - bounds.first);
+ if (r < 0) {
+ return r;
+ }
+ // generate the next stripe
+ uint64_t stripe_size;
+ r = gen->next(offset, &stripe_size);
+ if (r < 0) {
+ return r;
+ }
+ ceph_assert(stripe_size > 0);
+
+ bounds.first = offset;
+ bounds.second = offset + stripe_size;
+
+ max = stripe_size;
+ }
+
+ if (data.length() == 0) { // don't flush the chunk here
+ return 0;
+ }
+ return Pipe::process(std::move(data), offset - bounds.first);
+}
+
+} // namespace rgw::putobj
diff --git a/src/rgw/rgw_putobj.h b/src/rgw/rgw_putobj.h
new file mode 100644
index 000000000..6740e88ce
--- /dev/null
+++ b/src/rgw/rgw_putobj.h
@@ -0,0 +1,73 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "include/buffer.h"
+#include "rgw_sal.h"
+
+namespace rgw::putobj {
+
+// for composing data processors into a pipeline
+class Pipe : public rgw::sal::DataProcessor {
+ rgw::sal::DataProcessor *next;
+ public:
+ explicit Pipe(rgw::sal::DataProcessor *next) : next(next) {}
+
+ virtual ~Pipe() override {}
+
+ // passes the data on to the next processor
+ int process(bufferlist&& data, uint64_t offset) override {
+ return next->process(std::move(data), offset);
+ }
+};
+
+// pipe that writes to the next processor in discrete chunks
+class ChunkProcessor : public Pipe {
+ uint64_t chunk_size;
+ bufferlist chunk; // leftover bytes from the last call to process()
+ public:
+ ChunkProcessor(rgw::sal::DataProcessor *next, uint64_t chunk_size)
+ : Pipe(next), chunk_size(chunk_size)
+ {}
+ virtual ~ChunkProcessor() override {}
+
+ int process(bufferlist&& data, uint64_t offset) override;
+};
+
+
+// interface to generate the next stripe description
+class StripeGenerator {
+ public:
+ virtual ~StripeGenerator() {}
+
+ virtual int next(uint64_t offset, uint64_t *stripe_size) = 0;
+};
+
+// pipe that respects stripe boundaries and restarts each stripe at offset 0
+class StripeProcessor : public Pipe {
+ StripeGenerator *gen;
+ std::pair<uint64_t, uint64_t> bounds; // bounds of current stripe
+ public:
+ StripeProcessor(rgw::sal::DataProcessor *next, StripeGenerator *gen,
+ uint64_t first_stripe_size)
+ : Pipe(next), gen(gen), bounds(0, first_stripe_size)
+ {}
+ virtual ~StripeProcessor() override {}
+
+ int process(bufferlist&& data, uint64_t data_offset) override;
+};
+
+} // namespace rgw::putobj
diff --git a/src/rgw/rgw_quota.cc b/src/rgw/rgw_quota.cc
new file mode 100644
index 000000000..f1ae34f93
--- /dev/null
+++ b/src/rgw/rgw_quota.cc
@@ -0,0 +1,1049 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include "include/utime.h"
+#include "common/lru_map.h"
+#include "common/RefCountedObj.h"
+#include "common/Thread.h"
+#include "common/ceph_mutex.h"
+
+#include "rgw_common.h"
+#include "rgw_sal.h"
+#include "rgw_sal_rados.h"
+#include "rgw_quota.h"
+#include "rgw_bucket.h"
+#include "rgw_user.h"
+
+#include "services/svc_sys_obj.h"
+#include "services/svc_meta.h"
+
+#include <atomic>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
// one cached stats entry: the stats themselves, when the entry goes stale,
// and when a background refresh should be started (half the TTL — see
// RGWQuotaCache::set_stats())
struct RGWQuotaCacheStats {
  RGWStorageStats stats;
  utime_t expiration;          // entry is stale once past this time
  utime_t async_refresh_time;  // kick off async refresh once past this time (0 = refresh in flight)
};
+
+template<class T>
+class RGWQuotaCache {
+protected:
+ rgw::sal::Driver* driver;
+ lru_map<T, RGWQuotaCacheStats> stats_map;
+ RefCountedWaitObject *async_refcount;
+
+ class StatsAsyncTestSet : public lru_map<T, RGWQuotaCacheStats>::UpdateContext {
+ int objs_delta;
+ uint64_t added_bytes;
+ uint64_t removed_bytes;
+ public:
+ StatsAsyncTestSet() : objs_delta(0), added_bytes(0), removed_bytes(0) {}
+ bool update(RGWQuotaCacheStats *entry) override {
+ if (entry->async_refresh_time.sec() == 0)
+ return false;
+
+ entry->async_refresh_time = utime_t(0, 0);
+
+ return true;
+ }
+ };
+
+ virtual int fetch_stats_from_storage(const rgw_user& user, const rgw_bucket& bucket, RGWStorageStats& stats, optional_yield y, const DoutPrefixProvider *dpp) = 0;
+
+ virtual bool map_find(const rgw_user& user, const rgw_bucket& bucket, RGWQuotaCacheStats& qs) = 0;
+
+ virtual bool map_find_and_update(const rgw_user& user, const rgw_bucket& bucket, typename lru_map<T, RGWQuotaCacheStats>::UpdateContext *ctx) = 0;
+ virtual void map_add(const rgw_user& user, const rgw_bucket& bucket, RGWQuotaCacheStats& qs) = 0;
+
+ virtual void data_modified(const rgw_user& user, rgw_bucket& bucket) {}
+public:
+ RGWQuotaCache(rgw::sal::Driver* _driver, int size) : driver(_driver), stats_map(size) {
+ async_refcount = new RefCountedWaitObject;
+ }
+ virtual ~RGWQuotaCache() {
+ async_refcount->put_wait(); /* wait for all pending async requests to complete */
+ }
+
+ int get_stats(const rgw_user& user, const rgw_bucket& bucket, RGWStorageStats& stats, optional_yield y,
+ const DoutPrefixProvider* dpp);
+ void adjust_stats(const rgw_user& user, rgw_bucket& bucket, int objs_delta, uint64_t added_bytes, uint64_t removed_bytes);
+
+ void set_stats(const rgw_user& user, const rgw_bucket& bucket, RGWQuotaCacheStats& qs, RGWStorageStats& stats);
+ int async_refresh(const rgw_user& user, const rgw_bucket& bucket, RGWQuotaCacheStats& qs);
+ void async_refresh_response(const rgw_user& user, rgw_bucket& bucket, RGWStorageStats& stats);
+ void async_refresh_fail(const rgw_user& user, rgw_bucket& bucket);
+
+ class AsyncRefreshHandler {
+ protected:
+ rgw::sal::Driver* driver;
+ RGWQuotaCache<T> *cache;
+ public:
+ AsyncRefreshHandler(rgw::sal::Driver* _driver, RGWQuotaCache<T> *_cache) : driver(_driver), cache(_cache) {}
+ virtual ~AsyncRefreshHandler() {}
+
+ virtual int init_fetch() = 0;
+ virtual void drop_reference() = 0;
+ };
+
+ virtual AsyncRefreshHandler *allocate_refresh_handler(const rgw_user& user, const rgw_bucket& bucket) = 0;
+};
+
+template<class T>
+int RGWQuotaCache<T>::async_refresh(const rgw_user& user, const rgw_bucket& bucket, RGWQuotaCacheStats& qs)
+{
+ /* protect against multiple updates */
+ StatsAsyncTestSet test_update;
+ if (!map_find_and_update(user, bucket, &test_update)) {
+ /* most likely we just raced with another update */
+ return 0;
+ }
+
+ async_refcount->get();
+
+
+ AsyncRefreshHandler *handler = allocate_refresh_handler(user, bucket);
+
+ int ret = handler->init_fetch();
+ if (ret < 0) {
+ async_refcount->put();
+ handler->drop_reference();
+ return ret;
+ }
+
+ return 0;
+}
+
+template<class T>
+void RGWQuotaCache<T>::async_refresh_fail(const rgw_user& user, rgw_bucket& bucket)
+{
+ ldout(driver->ctx(), 20) << "async stats refresh response for bucket=" << bucket << dendl;
+
+ async_refcount->put();
+}
+
+template<class T>
+void RGWQuotaCache<T>::async_refresh_response(const rgw_user& user, rgw_bucket& bucket, RGWStorageStats& stats)
+{
+ ldout(driver->ctx(), 20) << "async stats refresh response for bucket=" << bucket << dendl;
+
+ RGWQuotaCacheStats qs;
+
+ map_find(user, bucket, qs);
+
+ set_stats(user, bucket, qs, stats);
+
+ async_refcount->put();
+}
+
+template<class T>
+void RGWQuotaCache<T>::set_stats(const rgw_user& user, const rgw_bucket& bucket, RGWQuotaCacheStats& qs, RGWStorageStats& stats)
+{
+ qs.stats = stats;
+ qs.expiration = ceph_clock_now();
+ qs.async_refresh_time = qs.expiration;
+ qs.expiration += driver->ctx()->_conf->rgw_bucket_quota_ttl;
+ qs.async_refresh_time += driver->ctx()->_conf->rgw_bucket_quota_ttl / 2;
+
+ map_add(user, bucket, qs);
+}
+
+template<class T>
+int RGWQuotaCache<T>::get_stats(const rgw_user& user, const rgw_bucket& bucket, RGWStorageStats& stats, optional_yield y, const DoutPrefixProvider* dpp) {
+ RGWQuotaCacheStats qs;
+ utime_t now = ceph_clock_now();
+ if (map_find(user, bucket, qs)) {
+ if (qs.async_refresh_time.sec() > 0 && now >= qs.async_refresh_time) {
+ int r = async_refresh(user, bucket, qs);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: quota async refresh returned ret=" << r << dendl;
+
+ /* continue processing, might be a transient error, async refresh is just optimization */
+ }
+ }
+
+ if (qs.expiration > ceph_clock_now()) {
+ stats = qs.stats;
+ return 0;
+ }
+ }
+
+ int ret = fetch_stats_from_storage(user, bucket, stats, y, dpp);
+ if (ret < 0 && ret != -ENOENT)
+ return ret;
+
+ set_stats(user, bucket, qs, stats);
+
+ return 0;
+}
+
+
+template<class T>
+class RGWQuotaStatsUpdate : public lru_map<T, RGWQuotaCacheStats>::UpdateContext {
+ const int objs_delta;
+ const uint64_t added_bytes;
+ const uint64_t removed_bytes;
+public:
+ RGWQuotaStatsUpdate(const int objs_delta,
+ const uint64_t added_bytes,
+ const uint64_t removed_bytes)
+ : objs_delta(objs_delta),
+ added_bytes(added_bytes),
+ removed_bytes(removed_bytes) {
+ }
+
+ bool update(RGWQuotaCacheStats * const entry) override {
+ const uint64_t rounded_added = rgw_rounded_objsize(added_bytes);
+ const uint64_t rounded_removed = rgw_rounded_objsize(removed_bytes);
+
+ if (((int64_t)(entry->stats.size + added_bytes - removed_bytes)) >= 0) {
+ entry->stats.size += added_bytes - removed_bytes;
+ } else {
+ entry->stats.size = 0;
+ }
+
+ if (((int64_t)(entry->stats.size_rounded + rounded_added - rounded_removed)) >= 0) {
+ entry->stats.size_rounded += rounded_added - rounded_removed;
+ } else {
+ entry->stats.size_rounded = 0;
+ }
+
+ if (((int64_t)(entry->stats.num_objects + objs_delta)) >= 0) {
+ entry->stats.num_objects += objs_delta;
+ } else {
+ entry->stats.num_objects = 0;
+ }
+
+ return true;
+ }
+};
+
+
+template<class T>
+void RGWQuotaCache<T>::adjust_stats(const rgw_user& user, rgw_bucket& bucket, int objs_delta,
+ uint64_t added_bytes, uint64_t removed_bytes)
+{
+ RGWQuotaStatsUpdate<T> update(objs_delta, added_bytes, removed_bytes);
+ map_find_and_update(user, bucket, &update);
+
+ data_modified(user, bucket);
+}
+
// async refresh handler for bucket stats: starts read_stats_async() and
// feeds the summed per-category result back into the bucket cache.
// the bucket itself is carried by the RGWGetBucketStats_CB base.
class BucketAsyncRefreshHandler : public RGWQuotaCache<rgw_bucket>::AsyncRefreshHandler,
                                  public RGWGetBucketStats_CB {
  rgw_user user; // key used when reporting back to the cache
public:
  BucketAsyncRefreshHandler(rgw::sal::Driver* _driver, RGWQuotaCache<rgw_bucket> *_cache,
                            const rgw_user& _user, const rgw_bucket& _bucket) :
                                      RGWQuotaCache<rgw_bucket>::AsyncRefreshHandler(_driver, _cache),
                                      RGWGetBucketStats_CB(_bucket), user(_user) {}

  void drop_reference() override { put(); }
  void handle_response(int r) override;
  int init_fetch() override;
};
+
// start the async bucket-stats read; the result arrives in handle_response()
int BucketAsyncRefreshHandler::init_fetch()
{
  std::unique_ptr<rgw::sal::Bucket> rbucket;

  const DoutPrefix dp(driver->ctx(), dout_subsys, "rgw bucket async refresh handler: ");
  int r = driver->get_bucket(&dp, nullptr, bucket, &rbucket, null_yield);
  if (r < 0) {
    ldpp_dout(&dp, 0) << "could not get bucket info for bucket=" << bucket << " r=" << r << dendl;
    return r;
  }

  ldpp_dout(&dp, 20) << "initiating async quota refresh for bucket=" << bucket << dendl;

  // indexless buckets have no bucket index to read stats from
  // NOTE(review): this early return reports success without scheduling the
  // callback, so neither async_refresh_response() nor async_refresh_fail()
  // will run for this request — it looks like the handler reference and the
  // cache's async_refcount are never released on this path; confirm.
  const auto& index = rbucket->get_info().get_current_index();
  if (is_layout_indexless(index)) {
    return 0;
  }

  r = rbucket->read_stats_async(&dp, index, RGW_NO_SHARD, this);
  if (r < 0) {
    ldpp_dout(&dp, 0) << "could not get bucket info for bucket=" << bucket.name << dendl;

    /* read_stats_async() dropped our reference already */
    return r;
  }

  return 0;
}
+
+void BucketAsyncRefreshHandler::handle_response(const int r)
+{
+ if (r < 0) {
+ ldout(driver->ctx(), 20) << "AsyncRefreshHandler::handle_response() r=" << r << dendl;
+ cache->async_refresh_fail(user, bucket);
+ return;
+ }
+
+ RGWStorageStats bs;
+
+ for (const auto& pair : *stats) {
+ const RGWStorageStats& s = pair.second;
+
+ bs.size += s.size;
+ bs.size_rounded += s.size_rounded;
+ bs.num_objects += s.num_objects;
+ }
+
+ cache->async_refresh_response(user, bucket, bs);
+}
+
// bucket-quota flavor of the cache: entries are keyed by bucket only
// (the user argument of the map_* hooks is ignored)
class RGWBucketStatsCache : public RGWQuotaCache<rgw_bucket> {
protected:
  bool map_find(const rgw_user& user, const rgw_bucket& bucket, RGWQuotaCacheStats& qs) override {
    return stats_map.find(bucket, qs);
  }

  bool map_find_and_update(const rgw_user& user, const rgw_bucket& bucket, lru_map<rgw_bucket, RGWQuotaCacheStats>::UpdateContext *ctx) override {
    return stats_map.find_and_update(bucket, NULL, ctx);
  }

  void map_add(const rgw_user& user, const rgw_bucket& bucket, RGWQuotaCacheStats& qs) override {
    stats_map.add(bucket, qs);
  }

  // synchronous read of bucket stats from the backend (see below)
  int fetch_stats_from_storage(const rgw_user& user, const rgw_bucket& bucket, RGWStorageStats& stats, optional_yield y, const DoutPrefixProvider *dpp) override;

public:
  explicit RGWBucketStatsCache(rgw::sal::Driver* _driver) : RGWQuotaCache<rgw_bucket>(_driver, _driver->ctx()->_conf->rgw_bucket_quota_cache_size) {
  }

  AsyncRefreshHandler *allocate_refresh_handler(const rgw_user& user, const rgw_bucket& bucket) override {
    return new BucketAsyncRefreshHandler(driver, this, user, bucket);
  }
};
+
+int RGWBucketStatsCache::fetch_stats_from_storage(const rgw_user& _u, const rgw_bucket& _b, RGWStorageStats& stats, optional_yield y, const DoutPrefixProvider *dpp)
+{
+ std::unique_ptr<rgw::sal::User> user = driver->get_user(_u);
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+
+ int r = driver->get_bucket(dpp, user.get(), _b, &bucket, y);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "could not get bucket info for bucket=" << _b << " r=" << r << dendl;
+ return r;
+ }
+
+ stats = RGWStorageStats();
+
+ const auto& index = bucket->get_info().get_current_index();
+ if (is_layout_indexless(index)) {
+ return 0;
+ }
+
+ string bucket_ver;
+ string master_ver;
+
+ map<RGWObjCategory, RGWStorageStats> bucket_stats;
+ r = bucket->read_stats(dpp, index, RGW_NO_SHARD, &bucket_ver,
+ &master_ver, bucket_stats, nullptr);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "could not get bucket stats for bucket="
+ << _b.name << dendl;
+ return r;
+ }
+
+ for (const auto& pair : bucket_stats) {
+ const RGWStorageStats& s = pair.second;
+
+ stats.size += s.size;
+ stats.size_rounded += s.size_rounded;
+ stats.num_objects += s.num_objects;
+ }
+
+ return 0;
+}
+
// async refresh handler for user stats: starts read_stats_async() on the
// user and reports the result back into the user cache.
// the user id is carried by the RGWGetUserStats_CB base.
class UserAsyncRefreshHandler : public RGWQuotaCache<rgw_user>::AsyncRefreshHandler,
                                public RGWGetUserStats_CB {
  const DoutPrefixProvider *dpp;
  rgw_bucket bucket; // key used when reporting back to the cache
public:
  UserAsyncRefreshHandler(const DoutPrefixProvider *_dpp, rgw::sal::Driver* _driver, RGWQuotaCache<rgw_user> *_cache,
                          const rgw_user& _user, const rgw_bucket& _bucket) :
                          RGWQuotaCache<rgw_user>::AsyncRefreshHandler(_driver, _cache),
                          RGWGetUserStats_CB(_user),
                          dpp(_dpp),
                          bucket(_bucket) {}

  void drop_reference() override { put(); }
  int init_fetch() override;
  void handle_response(int r) override;
};
+
+int UserAsyncRefreshHandler::init_fetch()
+{
+ std::unique_ptr<rgw::sal::User> ruser = driver->get_user(user);
+
+ ldpp_dout(dpp, 20) << "initiating async quota refresh for user=" << user << dendl;
+ int r = ruser->read_stats_async(dpp, this);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "could not get bucket info for user=" << user << dendl;
+
+ /* get_bucket_stats_async() dropped our reference already */
+ return r;
+ }
+
+ return 0;
+}
+
+void UserAsyncRefreshHandler::handle_response(int r)
+{
+ if (r < 0) {
+ ldout(driver->ctx(), 20) << "AsyncRefreshHandler::handle_response() r=" << r << dendl;
+ cache->async_refresh_fail(user, bucket);
+ return;
+ }
+
+ cache->async_refresh_response(user, bucket, stats);
+}
+
+class RGWUserStatsCache : public RGWQuotaCache<rgw_user> {
+ const DoutPrefixProvider *dpp;
+ std::atomic<bool> down_flag = { false };
+ ceph::shared_mutex mutex = ceph::make_shared_mutex("RGWUserStatsCache");
+ map<rgw_bucket, rgw_user> modified_buckets;
+
+ /* thread, sync recent modified buckets info */
+ class BucketsSyncThread : public Thread {
+ CephContext *cct;
+ RGWUserStatsCache *stats;
+
+ ceph::mutex lock = ceph::make_mutex("RGWUserStatsCache::BucketsSyncThread");
+ ceph::condition_variable cond;
+ public:
+
+ BucketsSyncThread(CephContext *_cct, RGWUserStatsCache *_s) : cct(_cct), stats(_s) {}
+
+ void *entry() override {
+ ldout(cct, 20) << "BucketsSyncThread: start" << dendl;
+ do {
+ map<rgw_bucket, rgw_user> buckets;
+
+ stats->swap_modified_buckets(buckets);
+
+ for (map<rgw_bucket, rgw_user>::iterator iter = buckets.begin(); iter != buckets.end(); ++iter) {
+ rgw_bucket bucket = iter->first;
+ rgw_user& user = iter->second;
+ ldout(cct, 20) << "BucketsSyncThread: sync user=" << user << " bucket=" << bucket << dendl;
+ const DoutPrefix dp(cct, dout_subsys, "rgw bucket sync thread: ");
+ int r = stats->sync_bucket(user, bucket, null_yield, &dp);
+ if (r < 0) {
+ ldout(cct, 0) << "WARNING: sync_bucket() returned r=" << r << dendl;
+ }
+ }
+
+ if (stats->going_down())
+ break;
+
+ std::unique_lock locker{lock};
+ cond.wait_for(
+ locker,
+ std::chrono::seconds(cct->_conf->rgw_user_quota_bucket_sync_interval));
+ } while (!stats->going_down());
+ ldout(cct, 20) << "BucketsSyncThread: done" << dendl;
+
+ return NULL;
+ }
+
+ void stop() {
+ std::lock_guard l{lock};
+ cond.notify_all();
+ }
+ };
+
+ /*
+ * thread, full sync all users stats periodically
+ *
+ * only sync non idle users or ones that never got synced before, this is needed so that
+ * users that didn't have quota turned on before (or existed before the user objclass
+ * tracked stats) need to get their backend stats up to date.
+ */
+ class UserSyncThread : public Thread {
+ CephContext *cct;
+ RGWUserStatsCache *stats;
+
+ ceph::mutex lock = ceph::make_mutex("RGWUserStatsCache::UserSyncThread");
+ ceph::condition_variable cond;
+ public:
+
+ UserSyncThread(CephContext *_cct, RGWUserStatsCache *_s) : cct(_cct), stats(_s) {}
+
+ void *entry() override {
+ ldout(cct, 20) << "UserSyncThread: start" << dendl;
+ do {
+ const DoutPrefix dp(cct, dout_subsys, "rgw user sync thread: ");
+ int ret = stats->sync_all_users(&dp, null_yield);
+ if (ret < 0) {
+ ldout(cct, 5) << "ERROR: sync_all_users() returned ret=" << ret << dendl;
+ }
+
+ if (stats->going_down())
+ break;
+
+ std::unique_lock l{lock};
+ cond.wait_for(l, std::chrono::seconds(cct->_conf->rgw_user_quota_sync_interval));
+ } while (!stats->going_down());
+ ldout(cct, 20) << "UserSyncThread: done" << dendl;
+
+ return NULL;
+ }
+
+ void stop() {
+ std::lock_guard l{lock};
+ cond.notify_all();
+ }
+ };
+
+ BucketsSyncThread *buckets_sync_thread;
+ UserSyncThread *user_sync_thread;
+protected:
+ bool map_find(const rgw_user& user,const rgw_bucket& bucket, RGWQuotaCacheStats& qs) override {
+ return stats_map.find(user, qs);
+ }
+
+ bool map_find_and_update(const rgw_user& user, const rgw_bucket& bucket, lru_map<rgw_user, RGWQuotaCacheStats>::UpdateContext *ctx) override {
+ return stats_map.find_and_update(user, NULL, ctx);
+ }
+
+ void map_add(const rgw_user& user, const rgw_bucket& bucket, RGWQuotaCacheStats& qs) override {
+ stats_map.add(user, qs);
+ }
+
+ int fetch_stats_from_storage(const rgw_user& user, const rgw_bucket& bucket, RGWStorageStats& stats, optional_yield y, const DoutPrefixProvider *dpp) override;
+ int sync_bucket(const rgw_user& rgw_user, rgw_bucket& bucket, optional_yield y, const DoutPrefixProvider *dpp);
+ int sync_user(const DoutPrefixProvider *dpp, const rgw_user& user, optional_yield y);
+ int sync_all_users(const DoutPrefixProvider *dpp, optional_yield y);
+
+ void data_modified(const rgw_user& user, rgw_bucket& bucket) override;
+
+ void swap_modified_buckets(map<rgw_bucket, rgw_user>& out) {
+ std::unique_lock lock{mutex};
+ modified_buckets.swap(out);
+ }
+
+ template<class T> /* easier doing it as a template, Thread doesn't have ->stop() */
+ void stop_thread(T **pthr) {
+ T *thread = *pthr;
+ if (!thread)
+ return;
+
+ thread->stop();
+ thread->join();
+ delete thread;
+ *pthr = NULL;
+ }
+
+public:
  /* Build the per-user stats cache; when quota_threads is set, also spawn
   * the background threads that flush bucket stats and re-sync users.
   * NOTE(review): the cache size is taken from rgw_bucket_quota_cache_size
   * even though this is the *user* cache — matches upstream, but confirm
   * whether rgw_user_quota_cache_size was intended. */
  RGWUserStatsCache(const DoutPrefixProvider *dpp, rgw::sal::Driver* _driver, bool quota_threads)
    : RGWQuotaCache<rgw_user>(_driver, _driver->ctx()->_conf->rgw_bucket_quota_cache_size), dpp(dpp)
  {
    if (quota_threads) {
      buckets_sync_thread = new BucketsSyncThread(driver->ctx(), this);
      buckets_sync_thread->create("rgw_buck_st_syn");
      user_sync_thread = new UserSyncThread(driver->ctx(), this);
      user_sync_thread->create("rgw_user_st_syn");
    } else {
      buckets_sync_thread = NULL;
      user_sync_thread = NULL;
    }
  }
  /* Stops and joins any background threads before the cache is torn down. */
  ~RGWUserStatsCache() override {
    stop();
  }
+
  /* Factory for the async refresh handler used by the base cache; caller
   * takes ownership of the returned object. */
  AsyncRefreshHandler *allocate_refresh_handler(const rgw_user& user, const rgw_bucket& bucket) override {
    return new UserAsyncRefreshHandler(dpp, driver, this, user, bucket);
  }
+
  /* True once stop() has been requested; polled by the sync threads. */
  bool going_down() {
    return down_flag;
  }
+
  /* Request shutdown: raise down_flag so the worker loops exit, then stop
   * and join both threads. The buckets thread is stopped while holding
   * the mutex; NOTE(review): this assumes that thread exits without
   * re-acquiring the mutex after being woken, otherwise join would
   * deadlock — confirm against BucketsSyncThread::entry(). */
  void stop() {
    down_flag = true;
    {
      std::unique_lock lock{mutex};
      stop_thread(&buckets_sync_thread);
    }
    stop_thread(&user_sync_thread);
  }
+};
+
+int RGWUserStatsCache::fetch_stats_from_storage(const rgw_user& _u,
+ const rgw_bucket& _b,
+ RGWStorageStats& stats,
+ optional_yield y,
+ const DoutPrefixProvider *dpp)
+{
+ std::unique_ptr<rgw::sal::User> user = driver->get_user(_u);
+ int r = user->read_stats(dpp, y, &stats);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "could not get user stats for user=" << user << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+int RGWUserStatsCache::sync_bucket(const rgw_user& _u, rgw_bucket& _b, optional_yield y, const DoutPrefixProvider *dpp)
+{
+ std::unique_ptr<rgw::sal::User> user = driver->get_user(_u);
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+
+ int r = driver->get_bucket(dpp, user.get(), _b, &bucket, y);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "could not get bucket info for bucket=" << _b << " r=" << r << dendl;
+ return r;
+ }
+
+ r = bucket->sync_user_stats(dpp, y);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: sync_user_stats() for user=" << _u << ", bucket=" << bucket << " returned " << r << dendl;
+ return r;
+ }
+
+ return bucket->check_bucket_shards(dpp);
+}
+
/* Re-sync a single user's stats: read the current header, skip idle users
 * when configured to, then push fresh stats for all of the user's buckets. */
int RGWUserStatsCache::sync_user(const DoutPrefixProvider *dpp, const rgw_user& _u, optional_yield y)
{
  RGWStorageStats stats;
  ceph::real_time last_stats_sync;    // when the last full sync completed
  ceph::real_time last_stats_update;  // when the stats last changed
  std::unique_ptr<rgw::sal::User> user = driver->get_user(rgw_user(_u.to_str()));

  int ret = user->read_stats(dpp, y, &stats, &last_stats_sync, &last_stats_update);
  if (ret < 0) {
    ldpp_dout(dpp, 5) << "ERROR: can't read user header: ret=" << ret << dendl;
    return ret;
  }

  /* A user with no updates since its last sync is idle and can be skipped
   * unless rgw_user_quota_sync_idle_users forces a sync anyway. */
  if (!driver->ctx()->_conf->rgw_user_quota_sync_idle_users &&
      last_stats_update < last_stats_sync) {
    ldpp_dout(dpp, 20) << "user is idle, not doing a full sync (user=" << user << ")" << dendl;
    return 0;
  }

  real_time when_need_full_sync = last_stats_sync;
  when_need_full_sync += make_timespan(driver->ctx()->_conf->rgw_user_quota_sync_wait_time);

  // check if enough time passed since last full sync
  /* FIXME: missing check? when_need_full_sync is computed above but never
   * compared against the current time, so a full sync runs on every pass. */

  ret = rgw_user_sync_all_stats(dpp, driver, user.get(), y);
  if (ret < 0) {
    ldpp_dout(dpp, 0) << "ERROR: failed user stats sync, ret=" << ret << dendl;
    return ret;
  }

  return 0;
}
+
+int RGWUserStatsCache::sync_all_users(const DoutPrefixProvider *dpp, optional_yield y)
+{
+ string key = "user";
+ void *handle;
+
+ int ret = driver->meta_list_keys_init(dpp, key, string(), &handle);
+ if (ret < 0) {
+ ldpp_dout(dpp, 10) << "ERROR: can't get key: ret=" << ret << dendl;
+ return ret;
+ }
+
+ bool truncated;
+ int max = 1000;
+
+ do {
+ list<string> keys;
+ ret = driver->meta_list_keys_next(dpp, handle, max, keys, &truncated);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: lists_keys_next(): ret=" << ret << dendl;
+ goto done;
+ }
+ for (list<string>::iterator iter = keys.begin();
+ iter != keys.end() && !going_down();
+ ++iter) {
+ rgw_user user(*iter);
+ ldpp_dout(dpp, 20) << "RGWUserStatsCache: sync user=" << user << dendl;
+ int ret = sync_user(dpp, user, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 5) << "ERROR: sync_user() failed, user=" << user << " ret=" << ret << dendl;
+
+ /* continuing to next user */
+ continue;
+ }
+ }
+ } while (truncated);
+
+ ret = 0;
+done:
+ driver->meta_list_keys_complete(handle);
+ return ret;
+}
+
+void RGWUserStatsCache::data_modified(const rgw_user& user, rgw_bucket& bucket)
+{
+ /* racy, but it's ok */
+ mutex.lock_shared();
+ bool need_update = modified_buckets.find(bucket) == modified_buckets.end();
+ mutex.unlock_shared();
+
+ if (need_update) {
+ std::unique_lock lock{mutex};
+ modified_buckets[bucket] = user;
+ }
+}
+
+
/* Strategy interface for deciding whether an operation would exceed a
 * quota. Two concrete flavors exist below: one compares rounded sizes,
 * the other raw sizes (selected by RGWQuotaInfo::check_on_raw). */
class RGWQuotaInfoApplier {
  /* NOTE: no non-static field allowed as instances are supposed to live in
   * the static memory only. */
protected:
  RGWQuotaInfoApplier() = default;

public:
  virtual ~RGWQuotaInfoApplier() {}

  /* Would adding `size` bytes push `stats` past qinfo.max_size? */
  virtual bool is_size_exceeded(const DoutPrefixProvider *dpp,
                                const char * const entity,
                                const RGWQuotaInfo& qinfo,
                                const RGWStorageStats& stats,
                                const uint64_t size) const = 0;

  /* Would adding `num_objs` objects push `stats` past qinfo.max_objects? */
  virtual bool is_num_objs_exceeded(const DoutPrefixProvider *dpp,
                                    const char * const entity,
                                    const RGWQuotaInfo& qinfo,
                                    const RGWStorageStats& stats,
                                    const uint64_t num_objs) const = 0;

  /* Pick the applier matching qinfo.check_on_raw (static singletons). */
  static const RGWQuotaInfoApplier& get_instance(const RGWQuotaInfo& qinfo);
};
+
/* Default applier: compares rounded sizes (stats.size_rounded plus the
 * rounded size of the incoming object) against max_size. */
class RGWQuotaInfoDefApplier : public RGWQuotaInfoApplier {
public:
  bool is_size_exceeded(const DoutPrefixProvider *dpp, const char * const entity,
                        const RGWQuotaInfo& qinfo,
                        const RGWStorageStats& stats,
                        const uint64_t size) const override;

  bool is_num_objs_exceeded(const DoutPrefixProvider *dpp, const char * const entity,
                            const RGWQuotaInfo& qinfo,
                            const RGWStorageStats& stats,
                            const uint64_t num_objs) const override;
};
+
/* Raw applier: compares exact byte counts (stats.size) against max_size,
 * used when RGWQuotaInfo::check_on_raw is set. */
class RGWQuotaInfoRawApplier : public RGWQuotaInfoApplier {
public:
  bool is_size_exceeded(const DoutPrefixProvider *dpp, const char * const entity,
                        const RGWQuotaInfo& qinfo,
                        const RGWStorageStats& stats,
                        const uint64_t size) const override;

  bool is_num_objs_exceeded(const DoutPrefixProvider *dpp, const char * const entity,
                            const RGWQuotaInfo& qinfo,
                            const RGWStorageStats& stats,
                            const uint64_t num_objs) const override;
};
+
+
+bool RGWQuotaInfoDefApplier::is_size_exceeded(const DoutPrefixProvider *dpp,
+ const char * const entity,
+ const RGWQuotaInfo& qinfo,
+ const RGWStorageStats& stats,
+ const uint64_t size) const
+{
+ if (qinfo.max_size < 0) {
+ /* The limit is not enabled. */
+ return false;
+ }
+
+ const uint64_t cur_size = stats.size_rounded;
+ const uint64_t new_size = rgw_rounded_objsize(size);
+
+ if (std::cmp_greater(cur_size + new_size, qinfo.max_size)) {
+ ldpp_dout(dpp, 10) << "quota exceeded: stats.size_rounded=" << stats.size_rounded
+ << " size=" << new_size << " "
+ << entity << "_quota.max_size=" << qinfo.max_size << dendl;
+ return true;
+ }
+
+ return false;
+}
+
+bool RGWQuotaInfoDefApplier::is_num_objs_exceeded(const DoutPrefixProvider *dpp,
+ const char * const entity,
+ const RGWQuotaInfo& qinfo,
+ const RGWStorageStats& stats,
+ const uint64_t num_objs) const
+{
+ if (qinfo.max_objects < 0) {
+ /* The limit is not enabled. */
+ return false;
+ }
+
+ if (std::cmp_greater(stats.num_objects + num_objs, qinfo.max_objects)) {
+ ldpp_dout(dpp, 10) << "quota exceeded: stats.num_objects=" << stats.num_objects
+ << " " << entity << "_quota.max_objects=" << qinfo.max_objects
+ << dendl;
+ return true;
+ }
+
+ return false;
+}
+
+bool RGWQuotaInfoRawApplier::is_size_exceeded(const DoutPrefixProvider *dpp,
+ const char * const entity,
+ const RGWQuotaInfo& qinfo,
+ const RGWStorageStats& stats,
+ const uint64_t size) const
+{
+ if (qinfo.max_size < 0) {
+ /* The limit is not enabled. */
+ return false;
+ }
+
+ const uint64_t cur_size = stats.size;
+
+ if (std::cmp_greater(cur_size + size, qinfo.max_size)) {
+ ldpp_dout(dpp, 10) << "quota exceeded: stats.size=" << stats.size
+ << " size=" << size << " "
+ << entity << "_quota.max_size=" << qinfo.max_size << dendl;
+ return true;
+ }
+
+ return false;
+}
+
+bool RGWQuotaInfoRawApplier::is_num_objs_exceeded(const DoutPrefixProvider *dpp,
+ const char * const entity,
+ const RGWQuotaInfo& qinfo,
+ const RGWStorageStats& stats,
+ const uint64_t num_objs) const
+{
+ if (qinfo.max_objects < 0) {
+ /* The limit is not enabled. */
+ return false;
+ }
+
+ if (std::cmp_greater(stats.num_objects + num_objs, qinfo.max_objects)) {
+ ldpp_dout(dpp, 10) << "quota exceeded: stats.num_objects=" << stats.num_objects
+ << " " << entity << "_quota.max_objects=" << qinfo.max_objects
+ << dendl;
+ return true;
+ }
+
+ return false;
+}
+
+const RGWQuotaInfoApplier& RGWQuotaInfoApplier::get_instance(
+ const RGWQuotaInfo& qinfo)
+{
+ static RGWQuotaInfoDefApplier default_qapplier;
+ static RGWQuotaInfoRawApplier raw_qapplier;
+
+ if (qinfo.check_on_raw) {
+ return raw_qapplier;
+ } else {
+ return default_qapplier;
+ }
+}
+
+
/* Concrete RGWQuotaHandler: caches bucket- and user-level stats and
 * enforces both quota sets on write operations. */
class RGWQuotaHandlerImpl : public RGWQuotaHandler {
  rgw::sal::Driver* driver;
  RGWBucketStatsCache bucket_stats_cache;
  RGWUserStatsCache user_stats_cache;

  /* Check one quota (bucket or user) against cached stats plus the
   * incoming delta. Returns 0 when within limits, -ERR_QUOTA_EXCEEDED
   * when the object count or size limit would be crossed. */
  int check_quota(const DoutPrefixProvider *dpp,
                  const char * const entity,
                  const RGWQuotaInfo& quota,
                  const RGWStorageStats& stats,
                  const uint64_t num_objs,
                  const uint64_t size) {
    if (!quota.enabled) {
      return 0;
    }

    /* raw vs rounded size comparison, per quota.check_on_raw */
    const auto& quota_applier = RGWQuotaInfoApplier::get_instance(quota);

    ldpp_dout(dpp, 20) << entity
                       << " quota: max_objects=" << quota.max_objects
                       << " max_size=" << quota.max_size << dendl;


    if (quota_applier.is_num_objs_exceeded(dpp, entity, quota, stats, num_objs)) {
      return -ERR_QUOTA_EXCEEDED;
    }

    if (quota_applier.is_size_exceeded(dpp, entity, quota, stats, size)) {
      return -ERR_QUOTA_EXCEEDED;
    }

    ldpp_dout(dpp, 20) << entity << " quota OK:"
                       << " stats.num_objects=" << stats.num_objects
                       << " stats.size=" << stats.size << dendl;
    return 0;
  }
public:
  RGWQuotaHandlerImpl(const DoutPrefixProvider *dpp, rgw::sal::Driver* _driver, bool quota_threads) : driver(_driver),
                                    bucket_stats_cache(_driver),
                                    user_stats_cache(dpp, _driver, quota_threads) {}

  /* Enforce both bucket and user quotas for an operation adding num_objs
   * objects and size bytes. Returns 0 on success or a negative error. */
  int check_quota(const DoutPrefixProvider *dpp,
                  const rgw_user& user,
                  rgw_bucket& bucket,
                  RGWQuota& quota,
                  uint64_t num_objs,
                  uint64_t size, optional_yield y) override {

    if (!quota.bucket_quota.enabled && !quota.user_quota.enabled) {
      return 0;
    }

    /*
     * we need to fetch bucket stats if the user quota is enabled, because
     * the whole system relies on us periodically updating the user's bucket
     * stats in the user's header, this happens in get_stats() if we actually
     * fetch that info and not rely on cached data
     */

    const DoutPrefix dp(driver->ctx(), dout_subsys, "rgw quota handler: ");
    if (quota.bucket_quota.enabled) {
      RGWStorageStats bucket_stats;
      int ret = bucket_stats_cache.get_stats(user, bucket, bucket_stats, y, &dp);
      if (ret < 0) {
        return ret;
      }
      ret = check_quota(dpp, "bucket", quota.bucket_quota, bucket_stats, num_objs, size);
      if (ret < 0) {
        return ret;
      }
    }

    if (quota.user_quota.enabled) {
      RGWStorageStats user_stats;
      int ret = user_stats_cache.get_stats(user, bucket, user_stats, y, &dp);
      if (ret < 0) {
        return ret;
      }
      ret = check_quota(dpp, "user", quota.user_quota, user_stats, num_objs, size);
      if (ret < 0) {
        return ret;
      }
    }
    return 0;
  }

  /* Feed an operation's object/byte deltas into both stats caches. */
  void update_stats(const rgw_user& user, rgw_bucket& bucket, int obj_delta, uint64_t added_bytes, uint64_t removed_bytes) override {
    bucket_stats_cache.adjust_stats(user, bucket, obj_delta, added_bytes, removed_bytes);
    user_stats_cache.adjust_stats(user, bucket, obj_delta, added_bytes, removed_bytes);
  }

  /* Flag the bucket for resharding when the average objects-per-shard
   * limit is exceeded, and suggest a new shard count scaled by 2x (or 8x
   * under multisite, where resharding is far more expensive). */
  void check_bucket_shards(const DoutPrefixProvider *dpp, uint64_t max_objs_per_shard,
                           uint64_t num_shards, uint64_t num_objs, bool is_multisite,
                           bool& need_resharding, uint32_t *suggested_num_shards) override
  {
    if (num_objs > num_shards * max_objs_per_shard) {
      ldpp_dout(dpp, 0) << __func__ << ": resharding needed: stats.num_objects=" << num_objs
                        << " shard max_objects=" << max_objs_per_shard * num_shards << dendl;
      need_resharding = true;
      if (suggested_num_shards) {
        uint32_t obj_multiplier = 2;
        if (is_multisite) {
          // if we're maintaining bilogs for multisite, reshards are significantly
          // more expensive. scale up the shard count much faster to minimize the
          // number of reshard events during a write workload
          obj_multiplier = 8;
        }
        *suggested_num_shards = num_objs * obj_multiplier / max_objs_per_shard;
      }
    } else {
      need_resharding = false;
    }
  }
};
+
+
/* Factory: callers own the returned handler and must release it with
 * RGWQuotaHandler::free_handler(). */
RGWQuotaHandler *RGWQuotaHandler::generate_handler(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, bool quota_threads)
{
  return new RGWQuotaHandlerImpl(dpp, driver, quota_threads);
}
+
/* Counterpart of generate_handler(); safe to call with NULL. */
void RGWQuotaHandler::free_handler(RGWQuotaHandler *handler)
{
  delete handler;
}
+
+
+void rgw_apply_default_bucket_quota(RGWQuotaInfo& quota, const ConfigProxy& conf)
+{
+ if (conf->rgw_bucket_default_quota_max_objects >= 0) {
+ quota.max_objects = conf->rgw_bucket_default_quota_max_objects;
+ quota.enabled = true;
+ }
+ if (conf->rgw_bucket_default_quota_max_size >= 0) {
+ quota.max_size = conf->rgw_bucket_default_quota_max_size;
+ quota.enabled = true;
+ }
+}
+
+void rgw_apply_default_user_quota(RGWQuotaInfo& quota, const ConfigProxy& conf)
+{
+ if (conf->rgw_user_default_quota_max_objects >= 0) {
+ quota.max_objects = conf->rgw_user_default_quota_max_objects;
+ quota.enabled = true;
+ }
+ if (conf->rgw_user_default_quota_max_size >= 0) {
+ quota.max_size = conf->rgw_user_default_quota_max_size;
+ quota.enabled = true;
+ }
+}
+
/* Dump quota settings; max_size is emitted in both bytes and (rounded-up)
 * KiB for compatibility with older consumers. */
void RGWQuotaInfo::dump(Formatter *f) const
{
  f->dump_bool("enabled", enabled);
  f->dump_bool("check_on_raw", check_on_raw);

  f->dump_int("max_size", max_size);
  f->dump_int("max_size_kb", rgw_rounded_kb(max_size));
  f->dump_int("max_objects", max_objects);
}
+
/* Decode quota settings from JSON. Prefers the byte-precise "max_size";
 * falls back to the legacy "max_size_kb" field (older structs stored the
 * limit in KiB only). */
void RGWQuotaInfo::decode_json(JSONObj *obj)
{
  if (false == JSONDecoder::decode_json("max_size", max_size, obj)) {
    /* We're parsing an older version of the struct. */
    int64_t max_size_kb = 0;

    JSONDecoder::decode_json("max_size_kb", max_size_kb, obj);
    max_size = max_size_kb * 1024;
  }
  JSONDecoder::decode_json("max_objects", max_objects, obj);

  JSONDecoder::decode_json("check_on_raw", check_on_raw, obj);
  JSONDecoder::decode_json("enabled", enabled, obj);
}
+
diff --git a/src/rgw/rgw_quota.h b/src/rgw/rgw_quota.h
new file mode 100644
index 000000000..632cb4817
--- /dev/null
+++ b/src/rgw/rgw_quota.h
@@ -0,0 +1,49 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "include/utime.h"
+#include "common/config_fwd.h"
+#include "common/lru_map.h"
+
+#include "rgw/rgw_quota_types.h"
+#include "common/async/yield_context.h"
+#include "rgw_sal_fwd.h"
+
+struct rgw_bucket;
+
/* Abstract quota-enforcement interface (implemented in rgw_quota.cc).
 * Obtain instances via generate_handler() and release them with
 * free_handler(). */
class RGWQuotaHandler {
public:
  RGWQuotaHandler() {}
  virtual ~RGWQuotaHandler() {
  }
  /* Check user and bucket quotas for an operation adding num_objs objects
   * and size bytes; returns 0 when allowed, a negative error otherwise. */
  virtual int check_quota(const DoutPrefixProvider *dpp, const rgw_user& bucket_owner, rgw_bucket& bucket,
                          RGWQuota& quota,
                          uint64_t num_objs, uint64_t size, optional_yield y) = 0;

  /* Decide whether the bucket needs resharding given its current object
   * count and shard count; may suggest a new shard count. */
  virtual void check_bucket_shards(const DoutPrefixProvider *dpp, uint64_t max_objs_per_shard,
                                   uint64_t num_shards, uint64_t num_objs, bool is_multisite,
                                   bool& need_resharding, uint32_t *suggested_num_shards) = 0;

  /* Feed an operation's object/byte deltas into the cached stats. */
  virtual void update_stats(const rgw_user& bucket_owner, rgw_bucket& bucket, int obj_delta, uint64_t added_bytes, uint64_t removed_bytes) = 0;

  static RGWQuotaHandler *generate_handler(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver, bool quota_threads);
  static void free_handler(RGWQuotaHandler *handler);
};
+
+// apply default quotas from configuration
+void rgw_apply_default_bucket_quota(RGWQuotaInfo& quota, const ConfigProxy& conf);
+void rgw_apply_default_user_quota(RGWQuotaInfo& quota, const ConfigProxy& conf);
diff --git a/src/rgw/rgw_quota_types.h b/src/rgw/rgw_quota_types.h
new file mode 100644
index 000000000..830696815
--- /dev/null
+++ b/src/rgw/rgw_quota_types.h
@@ -0,0 +1,87 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+/* N.B., this header defines fundamental serialized types. Do not
+ * introduce changes or include files which can only be compiled in
+ * radosgw or OSD contexts (e.g., rgw_sal.h, rgw_common.h)
+ */
+
+#pragma once
+
/* Convert a byte count to KiB, rounding any partial KiB upward. */
static inline int64_t rgw_rounded_kb(int64_t bytes)
{
  constexpr int64_t kib = 1024;
  return (bytes + kib - 1) / kib;
}
+
+class JSONObj;
+
/* Serialized quota settings. -1 for max_size/max_objects means the limit
 * is disabled; `enabled` gates enforcement as a whole. */
struct RGWQuotaInfo {
  template<class T> friend class RGWQuotaCache;
public:
  int64_t max_size;      // byte limit, -1 = unlimited
  int64_t max_objects;   // object-count limit, -1 = unlimited
  bool enabled;
  /* Do we want to compare with raw, not rounded RGWStorageStats::size (true)
   * or maybe rounded-to-4KiB RGWStorageStats::size_rounded (false)? */
  bool check_on_raw;

  RGWQuotaInfo()
    : max_size(-1),
      max_objects(-1),
      enabled(false),
      check_on_raw(false) {
  }

  /* Encoding history: v1 stored max_size in KiB; v2 added the byte-precise
   * max_size; v3 added check_on_raw. The KiB field is still written first
   * for backward compatibility (negated when the limit is disabled).
   * NOTE(review): bare abs() on an int64_t may resolve to the int overload
   * depending on headers — harmless for the conventional -1 sentinel, but
   * worth confirming for other negative values. */
  void encode(bufferlist& bl) const {
    ENCODE_START(3, 1, bl);
    if (max_size < 0) {
      encode(-rgw_rounded_kb(abs(max_size)), bl);
    } else {
      encode(rgw_rounded_kb(max_size), bl);
    }
    encode(max_objects, bl);
    encode(enabled, bl);
    encode(max_size, bl);
    encode(check_on_raw, bl);
    ENCODE_FINISH(bl);
  }
  void decode(bufferlist::const_iterator& bl) {
    DECODE_START_LEGACY_COMPAT_LEN(3, 1, 1, bl);
    int64_t max_size_kb;
    decode(max_size_kb, bl);
    decode(max_objects, bl);
    decode(enabled, bl);
    if (struct_v < 2) {
      /* v1 only carried the KiB-granular limit */
      max_size = max_size_kb * 1024;
    } else {
      decode(max_size, bl);
    }
    if (struct_v >= 3) {
      decode(check_on_raw, bl);
    }
    DECODE_FINISH(bl);
  }

  void dump(Formatter *f) const;

  void decode_json(JSONObj *obj);

};
+WRITE_CLASS_ENCODER(RGWQuotaInfo)
+
/* The pair of quota settings consulted on writes: per-user and per-bucket. */
struct RGWQuota {
  RGWQuotaInfo user_quota;
  RGWQuotaInfo bucket_quota;
};
diff --git a/src/rgw/rgw_ratelimit.h b/src/rgw/rgw_ratelimit.h
new file mode 100644
index 000000000..2639d4d42
--- /dev/null
+++ b/src/rgw/rgw_ratelimit.h
@@ -0,0 +1,292 @@
+#pragma once
+#include <chrono>
+#include <thread>
+#include <condition_variable>
+#include "rgw_common.h"
+
+
/* Per-key (user/bucket) token bucket. All counters are kept in fixed-point
 * sub-tokens so that fractions of a token earned between requests are not
 * lost to integer truncation. */
class RateLimiterEntry {
  /*
    fixed_point_rgw_ratelimit is important to preserve the precision of the token calculation
    for example: a user have a limit of single op per minute, the user will consume its single token and then will send another request, 1s after it.
    in that case, without this method, the user will get 0 tokens although it should get 0.016 tokens.
    using this method it will add 16 tokens to the user, and the user will have 16 tokens, each time rgw will do comparison rgw will divide by fixed_point_rgw_ratelimit, so the user will be blocked anyway until it has enough tokens.
  */
  static constexpr int64_t fixed_point_rgw_ratelimit = 1000;
  // counters are tracked in multiples of fixed_point_rgw_ratelimit
  struct counters {
    int64_t ops = 0;
    int64_t bytes = 0;
  };
  counters read;
  counters write;
  ceph::timespan ts;      // timestamp of the last token refill
  bool first_run = true;  // counters not yet initialized from the limits
  std::mutex ts_lock;     // guards counters and ts
  // Those functions are returning the integer value of the tokens
  int64_t read_ops () const
  {
    return read.ops / fixed_point_rgw_ratelimit;
  }
  int64_t write_ops() const
  {
    return write.ops / fixed_point_rgw_ratelimit;
  }
  int64_t read_bytes() const
  {
    return read.bytes / fixed_point_rgw_ratelimit;
  }
  int64_t write_bytes() const
  {
    return write.bytes / fixed_point_rgw_ratelimit;
  }
  /* Returns true when the request must be rejected. byte counters may be
   * negative because decrease_bytes() allows a bounded debt. */
  bool should_rate_limit_read(int64_t ops_limit, int64_t bw_limit) {
    //check if tenants did not reach their bw or ops limits and that the limits are not 0 (which is unlimited)
    if(((read_ops() - 1 < 0) && (ops_limit > 0)) ||
       (read_bytes() < 0 && bw_limit > 0))
    {
      return true;
    }
    // we don't want to reduce ops' tokens if we've rejected it.
    /* reached only on the allow path: consume one whole op token */
    read.ops -= fixed_point_rgw_ratelimit;
    return false;
  }
  bool should_rate_limit_write(int64_t ops_limit, int64_t bw_limit)
  {
    //check if tenants did not reach their bw or ops limits and that the limits are not 0 (which is unlimited)
    if(((write_ops() - 1 < 0) && (ops_limit > 0)) ||
       (write_bytes() < 0 && bw_limit > 0))
    {
      return true;
    }

    // we don't want to reduce ops' tokens if we've rejected it.
    /* reached only on the allow path: consume one whole op token */
    write.ops -= fixed_point_rgw_ratelimit;
    return false;
  }
  /* The purpose of this function is to minimum time before overriding the stored timestamp
     This function is necessary to force the increase tokens add at least 1 token when it updates the last stored timestamp.
     That way the user/bucket will not lose tokens because of rounding
     (minimum interval = 60s / fixed_point = 60ms)
  */
  bool minimum_time_reached(ceph::timespan curr_timestamp) const
  {
    using namespace std::chrono;
    constexpr auto min_duration = duration_cast<ceph::timespan>(seconds(60)) / fixed_point_rgw_ratelimit;
    const auto delta = curr_timestamp - ts;
    if (delta < min_duration)
    {
      return false;
    }
    return true;
  }

  /* Refill tokens proportionally to elapsed time, capped at the full
   * per-minute allowance. On first use the bucket starts full. */
  void increase_tokens(ceph::timespan curr_timestamp,
                       const RGWRateLimitInfo* info)
  {
    constexpr int fixed_point = fixed_point_rgw_ratelimit;
    if (first_run)
    {
      write.ops = info->max_write_ops * fixed_point;
      write.bytes = info->max_write_bytes * fixed_point;
      read.ops = info->max_read_ops * fixed_point;
      read.bytes = info->max_read_bytes * fixed_point;
      ts = curr_timestamp;
      first_run = false;
      return;
    }
    else if(curr_timestamp > ts && minimum_time_reached(curr_timestamp))
    {
      const int64_t time_in_ms = std::chrono::duration_cast<std::chrono::milliseconds>(curr_timestamp - ts).count() / 60.0 / std::milli::den * fixed_point; // / 60 to make it work with 1 min token bucket
      ts = curr_timestamp;
      const int64_t write_ops = info->max_write_ops * time_in_ms;
      const int64_t write_bw = info->max_write_bytes * time_in_ms;
      const int64_t read_ops = info->max_read_ops * time_in_ms;
      const int64_t read_bw = info->max_read_bytes * time_in_ms;
      read.ops = std::min(info->max_read_ops * fixed_point, read_ops + read.ops);
      read.bytes = std::min(info->max_read_bytes * fixed_point, read_bw + read.bytes);
      write.ops = std::min(info->max_write_ops * fixed_point, write_ops + write.ops);
      write.bytes = std::min(info->max_write_bytes * fixed_point, write_bw + write.bytes);
    }
  }

  public:
  /* Refill, then try to take one op token for the request; returns true
   * when the request should be throttled. */
  bool should_rate_limit(bool is_read, const RGWRateLimitInfo* ratelimit_info, ceph::timespan curr_timestamp)
  {
    std::unique_lock lock(ts_lock);
    increase_tokens(curr_timestamp, ratelimit_info);
    if (is_read)
    {
      return should_rate_limit_read(ratelimit_info->max_read_ops, ratelimit_info->max_read_bytes);
    }
    return should_rate_limit_write(ratelimit_info->max_write_ops, ratelimit_info->max_write_bytes);
  }
  /* Charge transferred bytes; the balance may go negative (debt), bounded
   * below at two minutes' worth of the limit. */
  void decrease_bytes(bool is_read, int64_t amount, const RGWRateLimitInfo* info) {
    std::unique_lock lock(ts_lock);
    // we don't want the tenant to be with higher debt than 120 seconds(2 min) of its limit
    if (is_read)
    {
      read.bytes = std::max(read.bytes - amount * fixed_point_rgw_ratelimit,info->max_read_bytes * fixed_point_rgw_ratelimit * -2);
    } else {
      write.bytes = std::max(write.bytes - amount * fixed_point_rgw_ratelimit,info->max_write_bytes * fixed_point_rgw_ratelimit * -2);
    }
  }
  /* Return one previously consumed op token.
   * NOTE(review): presumably called when a charged op did not actually
   * execute — confirm against the callers in the frontend. */
  void giveback_tokens(bool is_read)
  {
    std::unique_lock lock(ts_lock);
    if (is_read)
    {
      read.ops += fixed_point_rgw_ratelimit;
    } else {
      write.ops += fixed_point_rgw_ratelimit;
    }
  }
};
+
/* A map of rate-limit keys to token buckets. Lookup takes a shared lock;
 * inserting a new key takes an exclusive lock. The map never rehashes
 * (max_load_factor is huge), so references returned by find_or_create()
 * stay valid while entries are only added, never erased (clear() is only
 * invoked on the passive instance by ActiveRateLimiter). */
class RateLimiter {

  static constexpr size_t map_size = 2000000; // will create it with the closest upper prime number
  std::shared_mutex insert_lock;
  // shared with ActiveRateLimiter: raised when this map nears capacity
  std::atomic_bool& replacing;
  std::condition_variable& cv;
  typedef std::unordered_map<std::string, RateLimiterEntry> hash_map;
  hash_map ratelimit_entries{map_size};
  static bool is_read_op(const std::string_view method) {
    if (method == "GET" || method == "HEAD")
    {
      return true;
    }
    return false;
  }

  // find or create an entry, and return its iterator
  /* Two threads may both miss under the shared lock; the losing emplace()
   * simply returns the existing entry, so the race is harmless. When the
   * map passes 90% of map_size, wake the GC thread to swap instances. */
  auto& find_or_create(const std::string& key) {
    std::shared_lock rlock(insert_lock);
    if (ratelimit_entries.size() > 0.9 * map_size && replacing == false)
    {
      replacing = true;
      cv.notify_all();
    }
    auto ret = ratelimit_entries.find(key);
    rlock.unlock();
    if (ret == ratelimit_entries.end())
    {
      std::unique_lock wlock(insert_lock);
      ret = ratelimit_entries.emplace(std::piecewise_construct,
                                      std::forward_as_tuple(key),
                                      std::forward_as_tuple()).first;
    }
    return ret->second;
  }



  public:
  RateLimiter(const RateLimiter&) = delete;
  RateLimiter& operator =(const RateLimiter&) = delete;
  RateLimiter(RateLimiter&&) = delete;
  RateLimiter& operator =(RateLimiter&&) = delete;
  RateLimiter() = delete;
  RateLimiter(std::atomic_bool& replacing, std::condition_variable& cv)
    : replacing(replacing), cv(cv)
  {
    // prevents rehash, so no iterators invalidation
    ratelimit_entries.max_load_factor(1000);
  };

  /* Returns true when the request must be throttled. Empty and
   * single-character keys are exempt (NOTE(review): presumably the
   * anonymous/system principals — confirm with callers), as are
   * disabled limits. */
  bool should_rate_limit(const char *method, const std::string& key, ceph::coarse_real_time curr_timestamp, const RGWRateLimitInfo* ratelimit_info) {
    if (key.empty() || key.length() == 1 || !ratelimit_info->enabled)
    {
      return false;
    }
    bool is_read = is_read_op(method);
    auto& it = find_or_create(key);
    auto curr_ts = curr_timestamp.time_since_epoch();
    return it.should_rate_limit(is_read ,ratelimit_info, curr_ts);
  }
  /* Return one op token for the key (see RateLimiterEntry::giveback_tokens). */
  void giveback_tokens(const char *method, const std::string& key)
  {
    bool is_read = is_read_op(method);
    auto& it = find_or_create(key);
    it.giveback_tokens(is_read);
  }
  /* Charge transferred bytes against the key's bandwidth budget. */
  void decrease_bytes(const char *method, const std::string& key, const int64_t amount, const RGWRateLimitInfo* info) {
    if (key.empty() || key.length() == 1 || !info->enabled)
    {
      return;
    }
    bool is_read = is_read_op(method);
    if ((is_read && !info->max_read_bytes) || (!is_read && !info->max_write_bytes))
    {
      return;
    }
    auto& it = find_or_create(key);
    it.decrease_bytes(is_read, amount, info);
  }
  /* Drop all entries; called only on the passive instance by the GC. */
  void clear() {
    ratelimit_entries.clear();
  }
};
+// This class purpose is to hold 2 RateLimiter instances, one active and one passive.
+// once the active has reached the watermark for clearing it will call the replace_active() thread using cv
+// The replace_active will clear the previous RateLimiter after all requests to it has been done (use_count() > 1)
+// In the meanwhile new requests will come into the newer active
+class ActiveRateLimiter : public DoutPrefix {
+ std::atomic_uint8_t stopped = {false};
+ std::condition_variable cv;
+ std::mutex cv_m;
+ std::thread runner;
+ std::atomic_bool replacing = false;
+ std::atomic_uint8_t current_active = 0;
+ std::shared_ptr<RateLimiter> ratelimit[2];
+ void replace_active() {
+ using namespace std::chrono_literals;
+ std::unique_lock<std::mutex> lk(cv_m);
+ while (!stopped) {
+ cv.wait(lk);
+ current_active = current_active ^ 1;
+ ldpp_dout(this, 20) << "replacing active ratelimit data structure" << dendl;
+ while (!stopped && ratelimit[(current_active ^ 1)].use_count() > 1 ) {
+ if (cv.wait_for(lk, 1min) != std::cv_status::timeout && stopped)
+ {
+ return;
+ }
+ }
+ if (stopped)
+ {
+ return;
+ }
+ ldpp_dout(this, 20) << "clearing passive ratelimit data structure" << dendl;
+ ratelimit[(current_active ^ 1)]->clear();
+ replacing = false;
+ }
+ }
+ public:
+ ActiveRateLimiter(const ActiveRateLimiter&) = delete;
+ ActiveRateLimiter& operator =(const ActiveRateLimiter&) = delete;
+ ActiveRateLimiter(ActiveRateLimiter&&) = delete;
+ ActiveRateLimiter& operator =(ActiveRateLimiter&&) = delete;
+ ActiveRateLimiter() = delete;
+ ActiveRateLimiter(CephContext* cct) :
+ DoutPrefix(cct, ceph_subsys_rgw, "rate limiter: ")
+ {
+ ratelimit[0] = std::make_shared<RateLimiter>(replacing, cv);
+ ratelimit[1] = std::make_shared<RateLimiter>(replacing, cv);
+ }
+ ~ActiveRateLimiter() {
+ ldpp_dout(this, 20) << "stopping ratelimit_gc thread" << dendl;
+ cv_m.lock();
+ stopped = true;
+ cv_m.unlock();
+ cv.notify_all();
+ runner.join();
+ }
+ std::shared_ptr<RateLimiter> get_active() {
+ return ratelimit[current_active];
+ }
+ void start() {
+ ldpp_dout(this, 20) << "starting ratelimit_gc thread" << dendl;
+ runner = std::thread(&ActiveRateLimiter::replace_active, this);
+ const auto rc = ceph_pthread_setname(runner.native_handle(), "ratelimit_gc");
+ ceph_assert(rc==0);
+ }
+};
diff --git a/src/rgw/rgw_realm.cc b/src/rgw/rgw_realm.cc
new file mode 100644
index 000000000..8dd6d6f50
--- /dev/null
+++ b/src/rgw/rgw_realm.cc
@@ -0,0 +1,265 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <optional>
+
+#include "common/errno.h"
+
+#include "rgw_zone.h"
+#include "rgw_realm_watcher.h"
+#include "rgw_meta_sync_status.h"
+#include "rgw_sal_config.h"
+#include "rgw_string.h"
+#include "rgw_sync.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_sys_obj.h"
+
+#include "common/ceph_json.h"
+#include "common/Formatter.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+namespace rgw_zone_defaults {
+
+std::string realm_info_oid_prefix = "realms.";
+std::string realm_names_oid_prefix = "realms_names.";
+std::string default_realm_info_oid = "default.realm";
+std::string RGW_DEFAULT_REALM_ROOT_POOL = "rgw.root";
+
+}
+
+using namespace std;
+using namespace rgw_zone_defaults;
+
+RGWRealm::~RGWRealm() {}
+
+RGWRemoteMetaLog::~RGWRemoteMetaLog()
+{
+ // error_logger is owned by this object
+ delete error_logger;
+}
+
+// realm id pinned via the "rgw_realm_id" config option, if any
+string RGWRealm::get_predefined_id(CephContext *cct) const {
+ return cct->_conf.get_val<string>("rgw_realm_id");
+}
+
+// realm name pinned via the rgw_realm config option, if any
+const string& RGWRealm::get_predefined_name(CephContext *cct) const {
+ return cct->_conf->rgw_realm;
+}
+
+// Create the realm metadata object, its watch/notify control object, and
+// (when the realm has no current period yet) an initial period; finally try
+// to mark this realm as the cluster default. Returns 0 or a negative errno.
+int RGWRealm::create(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive)
+{
+ int ret = RGWSystemMetaObj::create(dpp, y, exclusive);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR creating new realm object " << name << ": " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+ // create the control object for watch/notify
+ ret = create_control(dpp, exclusive, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR creating control for new realm " << name << ": " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+ RGWPeriod period;
+ if (current_period.empty()) {
+ /* create new period for the realm */
+ ret = period.init(dpp, cct, sysobj_svc, id, y, name, false);
+ if (ret < 0 ) {
+ return ret;
+ }
+ ret = period.create(dpp, y, true);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: creating new period for realm " << name << ": " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+ } else {
+ // a current period id was supplied; load it instead of creating one
+ period = RGWPeriod(current_period, 0);
+ int ret = period.init(dpp, cct, sysobj_svc, id, y, name);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to init period " << current_period << dendl;
+ return ret;
+ }
+ }
+ ret = set_current_period(dpp, period, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed set current period " << current_period << dendl;
+ return ret;
+ }
+ // try to set as default. may race with another create, so pass exclusive=true
+ // so we don't override an existing default
+ ret = set_as_default(dpp, y, true);
+ if (ret < 0 && ret != -EEXIST) {
+ ldpp_dout(dpp, 0) << "WARNING: failed to set realm as default realm, ret=" << ret << dendl;
+ }
+
+ return 0;
+}
+
+// Remove the realm metadata object and then its watch/notify control object.
+int RGWRealm::delete_obj(const DoutPrefixProvider *dpp, optional_yield y)
+{
+ int ret = RGWSystemMetaObj::delete_obj(dpp, y);
+ if (ret < 0) {
+ return ret;
+ }
+ return delete_control(dpp, y);
+}
+
+// Write the (empty) control object that RGWRealmWatcher watches for
+// realm/period change notifications.
+int RGWRealm::create_control(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y)
+{
+ auto pool = rgw_pool{get_pool(cct)};
+ auto oid = get_control_oid();
+ bufferlist bl;
+ auto sysobj = sysobj_svc->get_obj(rgw_raw_obj{pool, oid});
+ return sysobj.wop()
+ .set_exclusive(exclusive)
+ .write(dpp, bl, y);
+}
+
+// Remove the watch/notify control object.
+int RGWRealm::delete_control(const DoutPrefixProvider *dpp, optional_yield y)
+{
+ auto pool = rgw_pool{get_pool(cct)};
+ auto obj = rgw_raw_obj{pool, get_control_oid()};
+ auto sysobj = sysobj_svc->get_obj(obj);
+ return sysobj.wop().remove(dpp, y);
+}
+
+// Pool holding realm metadata: rgw_realm_root_pool if configured,
+// otherwise the built-in default "rgw.root".
+rgw_pool RGWRealm::get_pool(CephContext *cct) const
+{
+ if (cct->_conf->rgw_realm_root_pool.empty()) {
+ return rgw_pool(RGW_DEFAULT_REALM_ROOT_POOL);
+ }
+ return rgw_pool(cct->_conf->rgw_realm_root_pool);
+}
+
+// Object name that records the default realm id (old_format is ignored).
+const string RGWRealm::get_default_oid(bool old_format) const
+{
+ if (cct->_conf->rgw_default_realm_info_oid.empty()) {
+ return default_realm_info_oid;
+ }
+ return cct->_conf->rgw_default_realm_info_oid;
+}
+
+// Prefix for the name -> id mapping objects ("realms_names.").
+const string& RGWRealm::get_names_oid_prefix() const
+{
+ return realm_names_oid_prefix;
+}
+
+// Prefix for the realm info objects ("realms."); old_format is ignored.
+const string& RGWRealm::get_info_oid_prefix(bool old_format) const
+{
+ return realm_info_oid_prefix;
+}
+
+// Make 'period' the realm's current period. The realm epoch may only move
+// forward, and a same-epoch update must refer to the same period id.
+// Persists the realm, then reflects the period's config into the local
+// objects via period.reflect().
+int RGWRealm::set_current_period(const DoutPrefixProvider *dpp, RGWPeriod& period, optional_yield y)
+{
+ // update realm epoch to match the period's
+ if (epoch > period.get_realm_epoch()) {
+ ldpp_dout(dpp, 0) << "ERROR: set_current_period with old realm epoch "
+ << period.get_realm_epoch() << ", current epoch=" << epoch << dendl;
+ return -EINVAL;
+ }
+ if (epoch == period.get_realm_epoch() && current_period != period.get_id()) {
+ ldpp_dout(dpp, 0) << "ERROR: set_current_period with same realm epoch "
+ << period.get_realm_epoch() << ", but different period id "
+ << period.get_id() << " != " << current_period << dendl;
+ return -EINVAL;
+ }
+
+ epoch = period.get_realm_epoch();
+ current_period = period.get_id();
+
+ // persist the updated realm object
+ int ret = update(dpp, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: period update: " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ ret = period.reflect(dpp, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: period.reflect(): " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ return 0;
+}
+
+// Name of this realm's watch/notify control object ("realms.<id>.control").
+string RGWRealm::get_control_oid() const
+{
+ return get_info_oid_prefix() + id + ".control";
+}
+
+// Send a notify on the control object; payload 'bl' is decoded by
+// RGWRealmWatcher::handle_notify() on the receiving side.
+int RGWRealm::notify_zone(const DoutPrefixProvider *dpp, bufferlist& bl, optional_yield y)
+{
+ rgw_pool pool{get_pool(cct)};
+ auto sysobj = sysobj_svc->get_obj(rgw_raw_obj{pool, get_control_oid()});
+ int ret = sysobj.wn().notify(dpp, bl, 0, nullptr, y);
+ if (ret < 0) {
+ return ret;
+ }
+ return 0;
+}
+
+// Announce a new period: the payload carries the period itself followed by
+// a Reload marker so gateways re-initialize with the new configuration.
+int RGWRealm::notify_new_period(const DoutPrefixProvider *dpp, const RGWPeriod& period, optional_yield y)
+{
+ bufferlist bl;
+ using ceph::encode;
+ // push the period to dependent zonegroups/zones
+ encode(RGWRealmNotify::ZonesNeedPeriod, bl);
+ encode(period, bl);
+ // reload the gateway with the new period
+ encode(RGWRealmNotify::Reload, bl);
+
+ return notify_zone(dpp, bl, y);
+}
+
+
+// Look up zone 'zid' in this realm's current period. On success *pfound
+// says whether the zone exists; when found, *pperiod and *pzonegroup are
+// filled in. Returns a negative errno only if the period cannot be loaded.
+int RGWRealm::find_zone(const DoutPrefixProvider *dpp,
+ const rgw_zone_id& zid,
+ RGWPeriod *pperiod,
+ RGWZoneGroup *pzonegroup,
+ bool *pfound,
+ optional_yield y) const
+{
+ auto& found = *pfound;
+
+ found = false;
+
+ string period_id;
+ epoch_t epoch = 0;
+
+ // empty period_id: init() resolves the realm's current period
+ RGWPeriod period(period_id, epoch);
+ int r = period.init(dpp, cct, sysobj_svc, get_id(), y, get_name());
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "WARNING: period init failed: " << cpp_strerror(-r) << " ... skipping" << dendl;
+ return r;
+ }
+
+ found = period.find_zone(dpp, zid, pzonegroup, y);
+ if (found) {
+ *pperiod = period;
+ }
+ return 0;
+}
+
+// Produce default-constructed instances for encode/decode unit tests;
+// ownership of the allocations passes to the caller.
+void RGWRealm::generate_test_instances(list<RGWRealm*> &o)
+{
+ RGWRealm *z = new RGWRealm;
+ o.push_back(z);
+ o.push_back(new RGWRealm);
+}
+
+// JSON output: base object fields plus current_period and epoch.
+void RGWRealm::dump(Formatter *f) const
+{
+ RGWSystemMetaObj::dump(f);
+ encode_json("current_period", current_period, f);
+ encode_json("epoch", epoch, f);
+}
+
+
+// JSON input: mirror of dump().
+void RGWRealm::decode_json(JSONObj *obj)
+{
+ RGWSystemMetaObj::decode_json(obj);
+ JSONDecoder::decode_json("current_period", current_period, obj);
+ JSONDecoder::decode_json("epoch", epoch, obj);
+}
+
diff --git a/src/rgw/rgw_realm_reloader.cc b/src/rgw/rgw_realm_reloader.cc
new file mode 100644
index 000000000..182cf1639
--- /dev/null
+++ b/src/rgw/rgw_realm_reloader.cc
@@ -0,0 +1,188 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_realm_reloader.h"
+
+#include "rgw_auth_registry.h"
+#include "rgw_bucket.h"
+#include "rgw_log.h"
+#include "rgw_rest.h"
+#include "rgw_user.h"
+#include "rgw_process_env.h"
+#include "rgw_sal.h"
+#include "rgw_sal_rados.h"
+
+#include "services/svc_zone.h"
+
+#include "common/errno.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+#undef dout_prefix
+#define dout_prefix (*_dout << "rgw realm reloader: ")
+
+
+// safe callbacks from SafeTimer are unnecessary. reload() can take a long
+// time, so we don't want to hold the mutex and block handle_notify() for the
+// duration
+static constexpr bool USE_SAFE_TIMER_CALLBACKS = false;
+
+
+RGWRealmReloader::RGWRealmReloader(RGWProcessEnv& env,
+ const rgw::auth::ImplicitTenants& implicit_tenants,
+ std::map<std::string, std::string>& service_map_meta,
+ Pauser* frontends)
+ : env(env),
+ implicit_tenants(implicit_tenants),
+ service_map_meta(service_map_meta),
+ frontends(frontends),
+ // NOTE(review): members initialize in declaration order, and 'timer' is
+ // declared before 'mutex', so 'timer' is constructed first while taking
+ // 'mutex' by reference -- safe only if SafeTimer merely stores the
+ // reference during construction; confirm against SafeTimer's ctor.
+ timer(env.driver->ctx(), mutex, USE_SAFE_TIMER_CALLBACKS),
+ mutex(ceph::make_mutex("RGWRealmReloader")),
+ reload_scheduled(nullptr)
+{
+ timer.init();
+}
+
+RGWRealmReloader::~RGWRealmReloader()
+{
+ // with safe callbacks disabled, SafeTimer expects its lock to be held
+ // around shutdown()
+ std::lock_guard lock{mutex};
+ timer.shutdown();
+}
+
+// Timer callback context that forwards to RGWRealmReloader::reload().
+class RGWRealmReloader::C_Reload : public Context {
+ RGWRealmReloader* reloader;
+ public:
+ explicit C_Reload(RGWRealmReloader* reloader) : reloader(reloader) {}
+ void finish(int r) override { reloader->reload(); }
+};
+
+// Realm-change notification entry point. Coalesces bursts of notifications
+// into a single scheduled reload(): if one is already pending, the new
+// notification is dropped.
+void RGWRealmReloader::handle_notify(RGWRealmNotify type,
+ bufferlist::const_iterator& p)
+{
+ if (!env.driver) {
+ /* we're in the middle of reload */
+ return;
+ }
+
+ CephContext *const cct = env.driver->ctx();
+
+ std::lock_guard lock{mutex};
+ if (reload_scheduled) {
+ ldout(cct, 4) << "Notification on realm, reconfiguration "
+ "already scheduled" << dendl;
+ return;
+ }
+
+ reload_scheduled = new C_Reload(this);
+ cond.notify_one(); // wake reload() if it blocked on a bad configuration
+
+ // schedule reload() without delay
+ timer.add_event_after(0, reload_scheduled);
+
+ ldout(cct, 4) << "Notification on realm, reconfiguration scheduled" << dendl;
+}
+
+// Tear down and rebuild the storage driver under the new realm
+// configuration: pause the frontends, destroy env.driver, then loop
+// creating a new driver until one initializes without a newer notification
+// superseding it; finally re-run the driver-dependent init (REST, usage
+// logging, auth registry, lua) and resume the frontends.
+void RGWRealmReloader::reload()
+{
+ CephContext *const cct = env.driver->ctx();
+ const DoutPrefix dp(cct, dout_subsys, "rgw realm reloader: ");
+ ldpp_dout(&dp, 1) << "Pausing frontends for realm update..." << dendl;
+
+ frontends->pause();
+
+ ldpp_dout(&dp, 1) << "Frontends paused" << dendl;
+
+ // TODO: make RGWRados responsible for rgw_log_usage lifetime
+ rgw_log_usage_finalize();
+
+ // destroy the existing driver
+ DriverManager::close_storage(env.driver);
+ env.driver = nullptr;
+
+ ldpp_dout(&dp, 1) << "driver closed" << dendl;
+ {
+ // allow a new notify to reschedule us. it's important that we do this
+ // before we start loading the new realm, or we could miss some updates
+ std::lock_guard lock{mutex};
+ reload_scheduled = nullptr;
+ }
+
+
+ while (!env.driver) {
+ // recreate and initialize a new driver
+ DriverManager::Config cfg;
+ cfg.store_name = "rados";
+ cfg.filter_name = "none";
+ env.driver =
+ DriverManager::get_storage(&dp, cct,
+ cfg,
+ cct->_conf->rgw_enable_gc_threads,
+ cct->_conf->rgw_enable_lc_threads,
+ cct->_conf->rgw_enable_quota_threads,
+ cct->_conf->rgw_run_sync_thread,
+ cct->_conf.get_val<bool>("rgw_dynamic_resharding"),
+ cct->_conf->rgw_cache_enabled);
+
+ ldpp_dout(&dp, 1) << "Creating new driver" << dendl;
+
+ rgw::sal::Driver* store_cleanup = nullptr;
+ {
+ std::unique_lock lock{mutex};
+
+ // failure to recreate RGWRados is not a recoverable error, but we
+ // don't want to assert or abort the entire cluster. instead, just
+ // sleep until we get another notification, and retry until we get
+ // a working configuration
+ if (env.driver == nullptr) {
+ ldpp_dout(&dp, -1) << "Failed to reinitialize RGWRados after a realm "
+ "configuration update. Waiting for a new update." << dendl;
+
+ // sleep until another event is scheduled
+ cond.wait(lock, [this] { return reload_scheduled; });
+ ldout(cct, 1) << "Woke up with a new configuration, retrying "
+ "RGWRados initialization." << dendl;
+ }
+
+ if (reload_scheduled) {
+ // cancel the event; we'll handle it now
+ timer.cancel_event(reload_scheduled);
+ reload_scheduled = nullptr;
+
+ // if we successfully created a driver, clean it up outside of the lock,
+ // then continue to loop and recreate another
+ std::swap(env.driver, store_cleanup);
+ }
+ }
+
+ if (store_cleanup) {
+ ldpp_dout(&dp, 4) << "Got another notification, restarting RGWRados "
+ "initialization." << dendl;
+
+ DriverManager::close_storage(store_cleanup);
+ }
+ }
+
+ int r = env.driver->register_to_service_map(&dp, "rgw", service_map_meta);
+ if (r < 0) {
+ ldpp_dout(&dp, -1) << "ERROR: failed to register to service map: " << cpp_strerror(-r) << dendl;
+
+ /* ignore error */
+ }
+
+ ldpp_dout(&dp, 1) << "Finishing initialization of new driver" << dendl;
+ // finish initializing the new driver
+ ldpp_dout(&dp, 1) << " - REST subsystem init" << dendl;
+ rgw_rest_init(cct, env.driver->get_zone()->get_zonegroup());
+ ldpp_dout(&dp, 1) << " - usage subsystem init" << dendl;
+ rgw_log_usage_init(cct, env.driver);
+
+ /* Initialize the registry of auth strategies which will coordinate
+ * the dynamic reconfiguration. */
+ env.auth_registry = rgw::auth::StrategyRegistry::create(
+ cct, implicit_tenants, env.driver);
+ env.lua.manager = env.driver->get_lua_manager();
+
+ ldpp_dout(&dp, 1) << "Resuming frontends with new realm configuration." << dendl;
+
+ frontends->resume(env.driver);
+}
diff --git a/src/rgw/rgw_realm_reloader.h b/src/rgw/rgw_realm_reloader.h
new file mode 100644
index 000000000..25082a2e4
--- /dev/null
+++ b/src/rgw/rgw_realm_reloader.h
@@ -0,0 +1,64 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_realm_watcher.h"
+#include "common/Cond.h"
+#include "rgw_sal_fwd.h"
+
+struct RGWProcessEnv;
+namespace rgw::auth { class ImplicitTenants; }
+
+/**
+ * RGWRealmReloader responds to new period notifications by recreating RGWRados
+ * with the updated realm configuration.
+ */
+class RGWRealmReloader : public RGWRealmWatcher::Watcher {
+ public:
+ /**
+ * Pauser is an interface to pause/resume frontends. Frontend cooperation
+ * is required to ensure that they stop issuing requests on the old
+ * RGWRados instance, and restart with the updated configuration.
+ *
+ * This abstraction avoids a dependency on class RGWFrontend.
+ */
+ class Pauser {
+ public:
+ virtual ~Pauser() = default;
+
+ /// pause all frontends while realm reconfiguration is in progress
+ virtual void pause() = 0;
+ /// resume all frontends with the given RGWRados instance
+ virtual void resume(rgw::sal::Driver* driver) = 0;
+ };
+
+ RGWRealmReloader(RGWProcessEnv& env,
+ const rgw::auth::ImplicitTenants& implicit_tenants,
+ std::map<std::string, std::string>& service_map_meta,
+ Pauser* frontends);
+ ~RGWRealmReloader() override;
+
+ /// respond to realm notifications by scheduling a reload()
+ void handle_notify(RGWRealmNotify type, bufferlist::const_iterator& p) override;
+
+ private:
+ /// pause frontends and replace the RGWRados instance
+ void reload();
+
+ class C_Reload; ///< Context that calls reload()
+
+ RGWProcessEnv& env;
+ const rgw::auth::ImplicitTenants& implicit_tenants;
+ std::map<std::string, std::string>& service_map_meta;
+ Pauser *const frontends;
+
+ /// reload() takes a significant amount of time, so we don't want to run
+ /// it in the handle_notify() thread. we choose a timer thread instead of a
+ /// Finisher because it allows us to cancel events that were scheduled while
+ /// reload() is still running
+ SafeTimer timer;
+ ceph::mutex mutex; ///< protects access to timer and reload_scheduled
+ ceph::condition_variable cond; ///< to signal reload() after an invalid realm config
+ C_Reload* reload_scheduled; ///< reload() context if scheduled
+};
diff --git a/src/rgw/rgw_realm_watcher.cc b/src/rgw/rgw_realm_watcher.cc
new file mode 100644
index 000000000..f6cd34759
--- /dev/null
+++ b/src/rgw/rgw_realm_watcher.cc
@@ -0,0 +1,148 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "common/errno.h"
+
+#include "rgw_realm_watcher.h"
+#include "rgw_tools.h"
+#include "rgw_zone.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+#undef dout_prefix
+#define dout_prefix (*_dout << "rgw realm watcher: ")
+
+
+// Establish a watch on the realm's control object. If there is no realm,
+// or the watch cannot be established, the watcher is left inert (no error
+// is propagated): dynamic reconfiguration is simply disabled.
+RGWRealmWatcher::RGWRealmWatcher(const DoutPrefixProvider *dpp, CephContext* cct, const RGWRealm& realm)
+ : cct(cct)
+{
+ // no default realm, nothing to watch
+ if (realm.get_id().empty()) {
+ ldpp_dout(dpp, 4) << "No realm, disabling dynamic reconfiguration." << dendl;
+ return;
+ }
+
+ // establish the watch on RGWRealm
+ int r = watch_start(dpp, realm);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << "Failed to establish a watch on RGWRealm, "
+ "disabling dynamic reconfiguration." << dendl;
+ return;
+ }
+}
+
+RGWRealmWatcher::~RGWRealmWatcher()
+{
+ watch_stop();
+}
+
+// Register 'watcher' to receive notifications of the given type.
+// Only one watcher per type: later registrations for the same type are
+// ignored by std::map::emplace.
+void RGWRealmWatcher::add_watcher(RGWRealmNotify type, Watcher& watcher)
+{
+ watchers.emplace(type, watcher);
+}
+
+// librados watch callback: ack the notify, then decode a sequence of
+// (RGWRealmNotify, payload) entries and dispatch each to its registered
+// watcher. Decoding stops at the first type with no watcher, since that
+// watcher would have consumed the payload that follows.
+void RGWRealmWatcher::handle_notify(uint64_t notify_id, uint64_t cookie,
+ uint64_t notifier_id, bufferlist& bl)
+{
+ if (cookie != watch_handle)
+ return;
+
+ // send an empty notify ack
+ bufferlist reply;
+ pool_ctx.notify_ack(watch_oid, notify_id, cookie, reply);
+
+ try {
+ auto p = bl.cbegin();
+ while (!p.end()) {
+ RGWRealmNotify notify;
+ decode(notify, p);
+ auto watcher = watchers.find(notify);
+ if (watcher == watchers.end()) {
+ lderr(cct) << "Failed to find a watcher for notify type "
+ << static_cast<int>(notify) << dendl;
+ break;
+ }
+ watcher->second.handle_notify(notify, p);
+ }
+ } catch (const buffer::error &e) {
+ lderr(cct) << "Failed to decode realm notifications." << dendl;
+ }
+}
+
+// librados error callback: reestablish the watch if it was ours.
+void RGWRealmWatcher::handle_error(uint64_t cookie, int err)
+{
+ lderr(cct) << "RGWRealmWatcher::handle_error oid=" << watch_oid << " err=" << err << dendl;
+ if (cookie != watch_handle)
+ return;
+
+ watch_restart();
+}
+
+// Connect a dedicated Rados client, open the realm pool, and register the
+// watch on the realm's control object. On success watch_oid is set; on any
+// failure everything opened so far is torn down and the errno returned.
+int RGWRealmWatcher::watch_start(const DoutPrefixProvider *dpp, const RGWRealm& realm)
+{
+ // initialize a Rados client
+ int r = rados.init_with_context(cct);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << "Rados client initialization failed with "
+ << cpp_strerror(-r) << dendl;
+ return r;
+ }
+ r = rados.connect();
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << "Rados client connection failed with "
+ << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ // open an IoCtx for the realm's pool
+ rgw_pool pool(realm.get_pool(cct));
+ r = rgw_init_ioctx(dpp, &rados, pool, pool_ctx);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << "Failed to open pool " << pool
+ << " with " << cpp_strerror(-r) << dendl;
+ rados.shutdown();
+ return r;
+ }
+
+ // register a watch on the realm's control object
+ auto oid = realm.get_control_oid();
+ r = pool_ctx.watch2(oid, &watch_handle, this);
+ if (r < 0) {
+ ldpp_dout(dpp, -1) << "Failed to watch " << oid
+ << " with " << cpp_strerror(-r) << dendl;
+ pool_ctx.close();
+ rados.shutdown();
+ return r;
+ }
+
+ ldpp_dout(dpp, 10) << "Watching " << oid << dendl;
+ std::swap(watch_oid, oid);
+ return 0;
+}
+
+// Drop and re-register the watch after an error callback. If re-watching
+// fails, the IoCtx is closed and watch_oid cleared, leaving the watcher
+// permanently disabled.
+int RGWRealmWatcher::watch_restart()
+{
+ ceph_assert(!watch_oid.empty());
+ int r = pool_ctx.unwatch2(watch_handle);
+ if (r < 0) {
+ lderr(cct) << "Failed to unwatch on " << watch_oid
+ << " with " << cpp_strerror(-r) << dendl;
+ }
+ r = pool_ctx.watch2(watch_oid, &watch_handle, this);
+ if (r < 0) {
+ lderr(cct) << "Failed to restart watch on " << watch_oid
+ << " with " << cpp_strerror(-r) << dendl;
+ pool_ctx.close();
+ watch_oid.clear();
+ }
+ return r;
+}
+
+// Tear down the watch (if any); safe to call when no watch is active.
+void RGWRealmWatcher::watch_stop()
+{
+ if (!watch_oid.empty()) {
+ pool_ctx.unwatch2(watch_handle);
+ pool_ctx.close();
+ watch_oid.clear();
+ }
+}
diff --git a/src/rgw/rgw_realm_watcher.h b/src/rgw/rgw_realm_watcher.h
new file mode 100644
index 000000000..2a0c0d076
--- /dev/null
+++ b/src/rgw/rgw_realm_watcher.h
@@ -0,0 +1,66 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "include/rados/librados.hpp"
+#include "include/ceph_assert.h"
+#include "common/Timer.h"
+#include "common/Cond.h"
+
+class RGWRados;
+class RGWRealm;
+
+// Notification types carried on the realm control object; encoded raw
+// (see WRITE_RAW_ENCODER below), so values must stay stable on the wire.
+enum class RGWRealmNotify {
+ Reload,
+ ZonesNeedPeriod,
+};
+WRITE_RAW_ENCODER(RGWRealmNotify);
+
+/**
+ * RGWRealmWatcher establishes a watch on the current RGWRealm's control object,
+ * and forwards notifications to registered observers.
+ */
+class RGWRealmWatcher : public librados::WatchCtx2 {
+ public:
+ /**
+ * Watcher is an interface that allows the RGWRealmWatcher to pass
+ * notifications on to other interested objects.
+ */
+ class Watcher {
+ public:
+ virtual ~Watcher() = default;
+
+ virtual void handle_notify(RGWRealmNotify type,
+ bufferlist::const_iterator& p) = 0;
+ };
+
+ RGWRealmWatcher(const DoutPrefixProvider *dpp, CephContext* cct, const RGWRealm& realm);
+ ~RGWRealmWatcher() override;
+
+ /// register a watcher for the given notification type
+ void add_watcher(RGWRealmNotify type, Watcher& watcher);
+
+ /// respond to realm notifications by calling the appropriate watcher
+ void handle_notify(uint64_t notify_id, uint64_t cookie,
+ uint64_t notifier_id, bufferlist& bl) override;
+
+ /// reestablish the watch if it gets disconnected
+ void handle_error(uint64_t cookie, int err) override;
+
+ private:
+ CephContext *const cct;
+
+ /// keep a separate Rados client whose lifetime is independent of RGWRados
+ /// so that we don't miss notifications during realm reconfiguration
+ librados::Rados rados;
+ librados::IoCtx pool_ctx;
+ uint64_t watch_handle = 0;
+ std::string watch_oid; ///< empty when no watch is established
+
+ int watch_start(const DoutPrefixProvider *dpp, const RGWRealm& realm);
+ int watch_restart();
+ void watch_stop();
+
+ /// one registered observer per notification type
+ std::map<RGWRealmNotify, Watcher&> watchers;
+};
diff --git a/src/rgw/rgw_request.h b/src/rgw/rgw_request.h
new file mode 100644
index 000000000..cd05f51c9
--- /dev/null
+++ b/src/rgw/rgw_request.h
@@ -0,0 +1,40 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_common.h"
+#include "rgw_acl.h"
+#include "rgw_user.h"
+#include "rgw_op.h"
+
+#include "common/QueueRing.h"
+
+#include <atomic>
+
+// Base class for one in-flight request: ties the request id to its
+// req_state and resolved RGWOp (both non-owning, set later).
+struct RGWRequest
+{
+ uint64_t id;
+ req_state *s;
+ RGWOp *op;
+
+ // nullptr (not NULL) for the not-yet-initialized pointers
+ explicit RGWRequest(uint64_t id) : id(id), s(nullptr), op(nullptr) {}
+
+ virtual ~RGWRequest() = default;
+
+ // attach the parsed request state once it exists
+ void init_state(req_state *_s) {
+ s = _s;
+ }
+}; /* RGWRequest */
+
+// Synthetic request used by the load-generator frontend; carries the
+// method/resource to exercise and a shared flag set on failure.
+struct RGWLoadGenRequest : public RGWRequest {
+ std::string method;
+ std::string resource;
+ int content_length;
+ // not owned; may be null when failures are not tracked
+ std::atomic<bool>* fail_flag = nullptr;
+
+RGWLoadGenRequest(uint64_t req_id, const std::string& _m, const std::string& _r, int _cl,
+ std::atomic<bool> *ff)
+ : RGWRequest(req_id), method(_m), resource(_r), content_length(_cl),
+ fail_flag(ff) {}
+};
diff --git a/src/rgw/rgw_resolve.cc b/src/rgw/rgw_resolve.cc
new file mode 100644
index 000000000..b6f258ee0
--- /dev/null
+++ b/src/rgw/rgw_resolve.cc
@@ -0,0 +1,45 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <arpa/nameser.h>
+#include <resolv.h>
+
+#include "acconfig.h"
+
+#ifdef HAVE_ARPA_NAMESER_COMPAT_H
+#include <arpa/nameser_compat.h>
+#endif
+
+#include "rgw_common.h"
+#include "rgw_resolve.h"
+#include "common/dns_resolve.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+RGWResolver::~RGWResolver() {
+}
+
+// resolver is the process-wide DNSResolver singleton; not owned here,
+// so the destructor does not free it.
+RGWResolver::RGWResolver() {
+ resolver = DNSResolver::get_instance();
+}
+
+// Resolve the CNAME record for 'hostname'; *found says whether one exists.
+// Returns a negative errno on lookup failure.
+int RGWResolver::resolve_cname(const string& hostname, string& cname, bool *found) {
+ return resolver->resolve_cname(g_ceph_context, hostname, &cname, found);
+}
+
+// process-wide resolver instance, managed by the init/shutdown pair below
+RGWResolver *rgw_resolver;
+
+
+void rgw_init_resolver()
+{
+ rgw_resolver = new RGWResolver();
+}
+
+void rgw_shutdown_resolver()
+{
+ delete rgw_resolver;
+}
diff --git a/src/rgw/rgw_resolve.h b/src/rgw/rgw_resolve.h
new file mode 100644
index 000000000..0428e0a02
--- /dev/null
+++ b/src/rgw/rgw_resolve.h
@@ -0,0 +1,24 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_common.h"
+
+namespace ceph {
+ class DNSResolver;
+}
+
+// Thin wrapper over ceph's DNSResolver used for CNAME lookups when mapping
+// virtual-host style bucket names.
+class RGWResolver {
+ DNSResolver *resolver; // singleton, not owned
+
+public:
+ ~RGWResolver();
+ RGWResolver();
+ // resolve the CNAME for 'hostname'; *found reports whether one exists
+ int resolve_cname(const std::string& hostname, std::string& cname, bool *found);
+};
+
+
+extern void rgw_init_resolver(void);
+extern void rgw_shutdown_resolver(void);
+extern RGWResolver *rgw_resolver;
diff --git a/src/rgw/rgw_rest.cc b/src/rgw/rgw_rest.cc
new file mode 100644
index 000000000..a1741e6dc
--- /dev/null
+++ b/src/rgw/rgw_rest.cc
@@ -0,0 +1,2335 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+
+#include <errno.h>
+#include <limits.h>
+
+#include <boost/algorithm/string.hpp>
+#include <boost/tokenizer.hpp>
+#include "common/Formatter.h"
+#include "common/HTMLFormatter.h"
+#include "common/utf8.h"
+#include "include/str_list.h"
+#include "rgw_common.h"
+#include "rgw_rados.h"
+#include "rgw_zone.h"
+#include "rgw_auth_s3.h"
+#include "rgw_formats.h"
+#include "rgw_op.h"
+#include "rgw_rest.h"
+#include "rgw_rest_swift.h"
+#include "rgw_rest_s3.h"
+#include "rgw_swift_auth.h"
+#include "rgw_cors_s3.h"
+#include "rgw_perf_counters.h"
+
+#include "rgw_client_io.h"
+#include "rgw_resolve.h"
+#include "rgw_sal_rados.h"
+
+#include "rgw_ratelimit.h"
+#include <numeric>
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+// numeric HTTP status <-> reason-phrase pair
+struct rgw_http_status_code {
+ int code;
+ const char *name;
+};
+
+// table of status lines RGW can emit; terminated by a {0, NULL} sentinel
+// (rgw_rest_init copies it into http_status_names for lookup by code)
+const static struct rgw_http_status_code http_codes[] = {
+ { 100, "Continue" },
+ { 200, "OK" },
+ { 201, "Created" },
+ { 202, "Accepted" },
+ { 204, "No Content" },
+ { 205, "Reset Content" },
+ { 206, "Partial Content" },
+ { 207, "Multi Status" },
+ { 208, "Already Reported" },
+ { 300, "Multiple Choices" },
+ { 301, "Moved Permanently" },
+ { 302, "Found" },
+ { 303, "See Other" },
+ { 304, "Not Modified" },
+ { 305, "User Proxy" },
+ { 306, "Switch Proxy" },
+ { 307, "Temporary Redirect" },
+ { 308, "Permanent Redirect" },
+ { 400, "Bad Request" },
+ { 401, "Unauthorized" },
+ { 402, "Payment Required" },
+ { 403, "Forbidden" },
+ { 404, "Not Found" },
+ { 405, "Method Not Allowed" },
+ { 406, "Not Acceptable" },
+ { 407, "Proxy Authentication Required" },
+ { 408, "Request Timeout" },
+ { 409, "Conflict" },
+ { 410, "Gone" },
+ { 411, "Length Required" },
+ { 412, "Precondition Failed" },
+ { 413, "Request Entity Too Large" },
+ { 414, "Request-URI Too Long" },
+ { 415, "Unsupported Media Type" },
+ { 416, "Requested Range Not Satisfiable" },
+ { 417, "Expectation Failed" },
+ { 422, "Unprocessable Entity" },
+ { 498, "Rate Limited"},
+ { 500, "Internal Server Error" },
+ { 501, "Not Implemented" },
+ { 503, "Slow Down"},
+ { 0, NULL },
+};
+
+// one rgw object attr -> HTTP response header mapping
+struct rgw_http_attr {
+ const char *rgw_attr;
+ const char *http_attr;
+};
+
+/*
+ * mapping between rgw object attrs and output http fields
+ */
+static const struct rgw_http_attr base_rgw_to_http_attrs[] = {
+ { RGW_ATTR_CONTENT_LANG, "Content-Language" },
+ { RGW_ATTR_EXPIRES, "Expires" },
+ { RGW_ATTR_CACHE_CONTROL, "Cache-Control" },
+ { RGW_ATTR_CONTENT_DISP, "Content-Disposition" },
+ { RGW_ATTR_CONTENT_ENC, "Content-Encoding" },
+ { RGW_ATTR_USER_MANIFEST, "X-Object-Manifest" },
+ { RGW_ATTR_X_ROBOTS_TAG , "X-Robots-Tag" },
+ { RGW_ATTR_STORAGE_CLASS , "X-Amz-Storage-Class" },
+ /* RGW_ATTR_AMZ_WEBSITE_REDIRECT_LOCATION header depends on access mode:
+ * S3 endpoint: x-amz-website-redirect-location
+ * S3Website endpoint: Location
+ */
+ { RGW_ATTR_AMZ_WEBSITE_REDIRECT_LOCATION, "x-amz-website-redirect-location" },
+};
+
+
+// one incoming HTTP env field -> rgw object attr mapping
+struct generic_attr {
+ const char *http_header;
+ const char *rgw_attr;
+};
+
+/*
+ * mapping between http env fields and rgw object attrs
+ */
+static const struct generic_attr generic_attrs[] = {
+ { "CONTENT_TYPE", RGW_ATTR_CONTENT_TYPE },
+ { "HTTP_CONTENT_LANGUAGE", RGW_ATTR_CONTENT_LANG },
+ { "HTTP_EXPIRES", RGW_ATTR_EXPIRES },
+ { "HTTP_CACHE_CONTROL", RGW_ATTR_CACHE_CONTROL },
+ { "HTTP_CONTENT_DISPOSITION", RGW_ATTR_CONTENT_DISP },
+ { "HTTP_CONTENT_ENCODING", RGW_ATTR_CONTENT_ENC },
+ { "HTTP_X_ROBOTS_TAG", RGW_ATTR_X_ROBOTS_TAG },
+};
+
+// lookup maps built from the tables above (plus rgw_extended_http_attrs)
+// by rgw_rest_init(); read-only afterwards
+map<string, string> rgw_to_http_attrs;
+static map<string, string> generic_attrs_map;
+map<int, const char *> http_status_names;
+
+/*
+ * make attrs look_like_this
+ * converts dashes to underscores
+ */
+/*
+ * make attrs look_like_this
+ * converts dashes to underscores
+ */
+string lowercase_underscore_http_attr(const string& orig)
+{
+ // Build into a std::string: the previous 'char buf[orig.size() + 1]' was a
+ // variable-length array, which is not standard C++, and calling tolower()
+ // on a plain char is undefined for negative values.
+ string result;
+ result.reserve(orig.size());
+ for (const char c : orig) {
+ if (c == '-') {
+ result.push_back('_');
+ } else {
+ result.push_back(static_cast<char>(tolower(static_cast<unsigned char>(c))));
+ }
+ }
+ return result;
+}
+
+/*
+ * make attrs LOOK_LIKE_THIS
+ * converts dashes to underscores
+ */
+/*
+ * make attrs LOOK_LIKE_THIS
+ * converts dashes to underscores
+ */
+string uppercase_underscore_http_attr(const string& orig)
+{
+ // Build into a std::string: the previous 'char buf[orig.size() + 1]' was a
+ // variable-length array, which is not standard C++, and calling toupper()
+ // on a plain char is undefined for negative values.
+ string result;
+ result.reserve(orig.size());
+ for (const char c : orig) {
+ if (c == '-') {
+ result.push_back('_');
+ } else {
+ result.push_back(static_cast<char>(toupper(static_cast<unsigned char>(c))));
+ }
+ }
+ return result;
+}
+
+/* avoid duplicate hostnames in hostnames lists */
+static set<string> hostnames_set;
+static set<string> hostnames_s3website_set;
+
+// Populate the global attr/status lookup maps and the hostname sets from
+// configuration and the zonegroup. Called at startup and again on realm
+// reload (see RGWRealmReloader::reload), so the sets accumulate entries.
+void rgw_rest_init(CephContext *cct, const rgw::sal::ZoneGroup& zone_group)
+{
+ for (const auto& rgw2http : base_rgw_to_http_attrs) {
+ rgw_to_http_attrs[rgw2http.rgw_attr] = rgw2http.http_attr;
+ }
+
+ for (const auto& http2rgw : generic_attrs) {
+ generic_attrs_map[http2rgw.http_header] = http2rgw.rgw_attr;
+ }
+
+ // user-configured extra attrs get both directions of mapping
+ list<string> extended_http_attrs;
+ get_str_list(cct->_conf->rgw_extended_http_attrs, extended_http_attrs);
+
+ list<string>::iterator iter;
+ for (iter = extended_http_attrs.begin(); iter != extended_http_attrs.end(); ++iter) {
+ string rgw_attr = RGW_ATTR_PREFIX;
+ rgw_attr.append(lowercase_underscore_http_attr(*iter));
+
+ rgw_to_http_attrs[rgw_attr] = camelcase_dash_http_attr(*iter);
+
+ string http_header = "HTTP_";
+ http_header.append(uppercase_underscore_http_attr(*iter));
+
+ generic_attrs_map[http_header] = rgw_attr;
+ }
+
+ for (const struct rgw_http_status_code *h = http_codes; h->code; h++) {
+ http_status_names[h->code] = h->name;
+ }
+
+ std::list<std::string> rgw_dns_names;
+ std::string rgw_dns_names_str = cct->_conf->rgw_dns_name;
+ get_str_list(rgw_dns_names_str, ", ", rgw_dns_names);
+ hostnames_set.insert(rgw_dns_names.begin(), rgw_dns_names.end());
+
+ std::list<std::string> names;
+ zone_group.get_hostnames(names);
+ hostnames_set.insert(names.begin(), names.end());
+ hostnames_set.erase(""); // filter out empty hostnames
+ ldout(cct, 20) << "RGW hostnames: " << hostnames_set << dendl;
+ /* TODO: We should have a sanity check that no hostname matches the end of
+ * any other hostname, otherwise we will get ambiguous results from
+ * rgw_find_host_in_domains.
+ * Eg:
+ * Hostnames: [A, B.A]
+ * Inputs: [Z.A, X.B.A]
+ * Z.A clearly splits to subdomain=Z, domain=A
+ * X.B.A ambiguously splits to both {X, B.A} and {X.B, A}
+ */
+
+ zone_group.get_s3website_hostnames(names);
+ hostnames_s3website_set.insert(cct->_conf->rgw_dns_s3website_name);
+ hostnames_s3website_set.insert(names.begin(), names.end());
+ hostnames_s3website_set.erase(""); // filter out empty hostnames
+ ldout(cct, 20) << "RGW S3website hostnames: " << hostnames_s3website_set << dendl;
+ /* TODO: we should repeat the hostnames_set sanity check here
+ * and ALSO decide about overlap, if any
+ */
+}
+
+// Case-insensitive "does 's' end with 'suffix'"; on a length match, *pos is
+// set to the offset where the suffix would begin (even if the compare fails).
+static bool str_ends_with_nocase(const string& s, const string& suffix, size_t *pos)
+{
+ size_t len = suffix.size();
+ if (len > (size_t)s.size()) {
+ return false;
+ }
+
+ ssize_t p = s.size() - len;
+ if (pos) {
+ *pos = p;
+ }
+
+ return boost::algorithm::iends_with(s, suffix);
+}
+
+// Split 'host' into subdomain + domain against the configured hostname set.
+// A hostname matches only as the whole host or at a '.' label boundary; the
+// first matching set entry wins (see the ambiguity TODO in rgw_rest_init).
+static bool rgw_find_host_in_domains(const string& host, string *domain, string *subdomain,
+ const set<string>& valid_hostnames_set)
+{
+ set<string>::iterator iter;
+ /** TODO, Future optimization
+ * store hostnames_set elements _reversed_, and look for a prefix match,
+ * which is much faster than a suffix match.
+ */
+ for (iter = valid_hostnames_set.begin(); iter != valid_hostnames_set.end(); ++iter) {
+ size_t pos;
+ if (!str_ends_with_nocase(host, *iter, &pos))
+ continue;
+
+ if (pos == 0) {
+ // exact match: the whole host is the domain
+ *domain = host;
+ subdomain->clear();
+ } else {
+ if (host[pos - 1] != '.') {
+ continue;
+ }
+
+ *domain = host.substr(pos);
+ *subdomain = host.substr(0, pos - 1);
+ }
+ return true;
+ }
+ return false;
+}
+
/* Record the HTTP status on the formatter and push the status line out
 * through the RESTful I/O layer.  I/O failures are logged and swallowed:
 * at this point there is nothing useful the caller can do with them. */
static void dump_status(req_state *s, int status,
			const char *status_name)
{
  s->formatter->set_status(status, status_name);
  try {
    RESTFUL_IO(s)->send_status(status, status_name);
  } catch (rgw::io::Exception& e) {
    ldpp_dout(s, 0) << "ERROR: s->cio->send_status() returned err="
                    << e.what() << dendl;
  }
}
+
/* Emit the formatter's footer, flush its buffered output as response body
 * (suppressed for HEAD requests, which carry no body), then reset the
 * request's formatter for any subsequent output.
 * NOTE(review): the footer/flush operate on the 'formatter' argument while
 * the reset operates on s->formatter; callers appear to always pass
 * s->formatter so both are the same object — confirm before passing a
 * different formatter here. */
void rgw_flush_formatter_and_reset(req_state *s, Formatter *formatter)
{
  std::ostringstream oss;
  formatter->output_footer();
  formatter->flush(oss);
  std::string outs(oss.str());
  if (!outs.empty() && s->op != OP_HEAD) {
    dump_body(s, outs);
  }

  s->formatter->reset();
}
+
+void rgw_flush_formatter(req_state *s, Formatter *formatter)
+{
+ std::ostringstream oss;
+ formatter->flush(oss);
+ std::string outs(oss.str());
+ if (!outs.empty() && s->op != OP_HEAD) {
+ dump_body(s, outs);
+ }
+}
+
+void dump_errno(int http_ret, string& out) {
+ stringstream ss;
+
+ ss << http_ret << " " << http_status_names[http_ret];
+ out = ss.str();
+}
+
/* Render the status-line text for the HTTP status carried by 'err'. */
void dump_errno(const struct rgw_err &err, string& out) {
  dump_errno(err.http_ret, out);
}
+
/* Send the status line for the error currently recorded on the request. */
void dump_errno(req_state *s)
{
  dump_status(s, s->err.http_ret, http_status_names[s->err.http_ret]);
}
+
/* Send the status line for an explicitly chosen HTTP status code. */
void dump_errno(req_state *s, int http_ret)
{
  dump_status(s, http_ret, http_status_names[http_ret]);
}
+
/* Emit a single response header through the RESTful I/O layer.
 * Failures are logged and swallowed; header emission is best-effort. */
void dump_header(req_state* const s,
                 const std::string_view& name,
                 const std::string_view& val)
{
  try {
    RESTFUL_IO(s)->send_header(name, val);
  } catch (rgw::io::Exception& e) {
    ldpp_dout(s, 0) << "ERROR: s->cio->send_header() returned err="
                    << e.what() << dendl;
  }
}
+
/* Emit a header whose value comes from a bufferlist; the value is
 * sanitized first so stray control characters cannot corrupt the header. */
void dump_header(req_state* const s,
                 const std::string_view& name,
                 ceph::buffer::list& bl)
{
  return dump_header(s, name, rgw_sanitized_hdrval(bl));
}
+
+void dump_header(req_state* const s,
+ const std::string_view& name,
+ const long long val)
+{
+ char buf[32];
+ const auto len = snprintf(buf, sizeof(buf), "%lld", val);
+
+ return dump_header(s, name, std::string_view(buf, len));
+}
+
+void dump_header(req_state* const s,
+ const std::string_view& name,
+ const utime_t& ut)
+{
+ char buf[32];
+ const auto len = snprintf(buf, sizeof(buf), "%lld.%05d",
+ static_cast<long long>(ut.sec()),
+ static_cast<int>(ut.usec() / 10));
+
+ return dump_header(s, name, std::string_view(buf, len));
+}
+
/* Send Content-Length and advertise byte-range support via Accept-Ranges.
 * Transport failures are logged but do not abort the response. */
void dump_content_length(req_state* const s, const uint64_t len)
{
  try {
    RESTFUL_IO(s)->send_content_length(len);
  } catch (rgw::io::Exception& e) {
    ldpp_dout(s, 0) << "ERROR: s->cio->send_content_length() returned err="
                    << e.what() << dendl;
  }
  dump_header(s, "Accept-Ranges", "bytes");
}
+
/* Switch the response to chunked transfer encoding (no Content-Length). */
static void dump_chunked_encoding(req_state* const s)
{
  try {
    RESTFUL_IO(s)->send_chunked_transfer_encoding();
  } catch (rgw::io::Exception& e) {
    ldpp_dout(s, 0) << "ERROR: RESTFUL_IO(s)->send_chunked_transfer_encoding()"
                    << " returned err=" << e.what() << dendl;
  }
}
+
+void dump_etag(req_state* const s,
+ const std::string_view& etag,
+ const bool quoted)
+{
+ if (etag.empty()) {
+ return;
+ }
+
+ if (s->prot_flags & RGW_REST_SWIFT && ! quoted) {
+ return dump_header(s, "etag", etag);
+ } else {
+ return dump_header_quoted(s, "ETag", etag);
+ }
+}
+
+void dump_bucket_from_state(req_state *s)
+{
+ if (g_conf()->rgw_expose_bucket && ! s->bucket_name.empty()) {
+ if (! s->bucket_tenant.empty()) {
+ dump_header(s, "Bucket",
+ url_encode(s->bucket_tenant + "/" + s->bucket_name));
+ } else {
+ dump_header(s, "Bucket", url_encode(s->bucket_name));
+ }
+ }
+}
+
/* Emit a Location header for a redirect; skipped when 'redirect' is empty. */
void dump_redirect(req_state * const s, const std::string& redirect)
{
  return dump_header_if_nonempty(s, "Location", redirect);
}
+
/* Format 't' into 'timestr' as an HTTP-date ("%a, %d %b %Y %H:%M:%S %Z",
 * broken down in UTC via gmtime_r).  Returns the formatted length, or 0
 * on failure — in which case the contents of 'timestr' are unspecified. */
static size_t dump_time_header_impl(char (&timestr)[TIME_BUF_SIZE],
                                    const real_time t)
{
  const utime_t ut(t);
  time_t secs = static_cast<time_t>(ut.sec());

  struct tm result;
  const struct tm * const tmp = gmtime_r(&secs, &result);
  if (tmp == nullptr) {
    return 0;
  }

  return strftime(timestr, sizeof(timestr),
                  "%a, %d %b %Y %H:%M:%S %Z", tmp);
}
+
+void dump_time_header(req_state *s, const char *name, real_time t)
+{
+ char timestr[TIME_BUF_SIZE];
+
+ const size_t len = dump_time_header_impl(timestr, t);
+ if (len == 0) {
+ return;
+ }
+
+ return dump_header(s, name, std::string_view(timestr, len));
+}
+
+std::string dump_time_to_str(const real_time& t)
+{
+ char timestr[TIME_BUF_SIZE];
+ dump_time_header_impl(timestr, t);
+
+ return timestr;
+}
+
+
/* Standard Last-Modified response header. */
void dump_last_modified(req_state *s, real_time t)
{
  dump_time_header(s, "Last-Modified", t);
}
+
+void dump_epoch_header(req_state *s, const char *name, real_time t)
+{
+ utime_t ut(t);
+ char buf[65];
+ const auto len = snprintf(buf, sizeof(buf), "%lld.%09lld",
+ (long long)ut.sec(),
+ (long long)ut.nsec());
+
+ return dump_header(s, name, std::string_view(buf, len));
+}
+
+void dump_time(req_state *s, const char *name, real_time t)
+{
+ char buf[TIME_BUF_SIZE];
+ rgw_to_iso8601(t, buf, sizeof(buf));
+
+ s->formatter->dump_string(name, buf);
+}
+
+void dump_owner(req_state *s, const rgw_user& id, const string& name,
+ const char *section)
+{
+ if (!section)
+ section = "Owner";
+ s->formatter->open_object_section(section);
+ s->formatter->dump_string("ID", id.to_str());
+ s->formatter->dump_string("DisplayName", name);
+ s->formatter->close_section();
+}
+
/* Emit the CORS response headers for an already-matched rule.  Nothing is
 * emitted unless an allowed origin is present; the remaining headers are
 * each conditional on their value being non-empty/valid. */
void dump_access_control(req_state *s, const char *origin,
                         const char *meth,
                         const char *hdr, const char *exp_hdr,
                         uint32_t max_age) {
  if (origin && (origin[0] != '\0')) {
    dump_header(s, "Access-Control-Allow-Origin", origin);
    /* If the server specifies an origin host rather than "*",
     * then it must also include Origin in the Vary response header
     * to indicate to clients that server responses will differ
     * based on the value of the Origin request header.
     */
    if (strcmp(origin, "*") != 0) {
      dump_header(s, "Vary", "Origin");
    }

    if (meth && (meth[0] != '\0')) {
      dump_header(s, "Access-Control-Allow-Methods", meth);
    }
    if (hdr && (hdr[0] != '\0')) {
      dump_header(s, "Access-Control-Allow-Headers", hdr);
    }
    if (exp_hdr && (exp_hdr[0] != '\0')) {
      dump_header(s, "Access-Control-Expose-Headers", exp_hdr);
    }
    if (max_age != CORS_MAX_AGE_INVALID) {
      dump_header(s, "Access-Control-Max-Age", max_age);
    }
  }
}
+
+void dump_access_control(req_state *s, RGWOp *op)
+{
+ string origin;
+ string method;
+ string header;
+ string exp_header;
+ unsigned max_age = CORS_MAX_AGE_INVALID;
+
+ if (!op->generate_cors_headers(origin, method, header, exp_header, &max_age))
+ return;
+
+ dump_access_control(s, origin.c_str(), method.c_str(), header.c_str(),
+ exp_header.c_str(), max_age);
+}
+
+void dump_start(req_state *s)
+{
+ if (!s->content_started) {
+ s->formatter->output_header();
+ s->content_started = true;
+ }
+}
+
+void dump_trans_id(req_state *s)
+{
+ if (s->prot_flags & RGW_REST_SWIFT) {
+ dump_header(s, "X-Trans-Id", s->trans_id);
+ dump_header(s, "X-Openstack-Request-Id", s->trans_id);
+ } else if (s->trans_id.length()) {
+ dump_header(s, "x-amz-request-id", s->trans_id);
+ }
+}
+
/* Finish the response header section for this request.
 *
 * Emits, in order: the transaction id; requester-pays accounting (when a
 * non-owner accesses a requester-pays bucket); CORS headers for the op;
 * Content-Type (forced for Swift without an explicit type, on errors, or
 * when the formatter already buffered output); then either the error
 * payload's length (error path) or the proposed content length / chunked
 * encoding; the optional Server header; and finally completes the header
 * via the I/O layer and flushes buffered formatter output as the start of
 * the body.
 *
 * proposed_content_length is an explicit length, NO_CONTENT_LENGTH, or
 * CHUNKED_TRANSFER_ENCODING.  force_no_error emits a normal header even
 * when the request is in an error state (the caller supplies its own
 * error body). */
void end_header(req_state* s, RGWOp* op, const char *content_type,
		const int64_t proposed_content_length, bool force_content_type,
		bool force_no_error)
{
  string ctype;

  dump_trans_id(s);

  if ((!s->is_err()) && s->bucket &&
      (s->bucket->get_info().owner != s->user->get_id()) &&
      (s->bucket->get_info().requester_pays)) {
    dump_header(s, "x-amz-request-charged", "requester");
  }

  if (op) {
    dump_access_control(s, op);
  }

  if (s->prot_flags & RGW_REST_SWIFT && !content_type) {
    force_content_type = true;
  }

  /* do not send content type if content length is zero
     and the content type was not set by the user */
  if (force_content_type ||
      (!content_type && s->formatter->get_len() != 0) || s->is_err()){
    ctype = to_mime_type(s->format);
    if (s->prot_flags & RGW_REST_SWIFT)
      ctype.append("; charset=utf-8");
    content_type = ctype.c_str();
  }
  if (!force_no_error && s->is_err()) {
    dump_start(s);
    dump(s);
    dump_content_length(s, s->formatter->get_len());
  } else {
    if (proposed_content_length == CHUNKED_TRANSFER_ENCODING) {
      dump_chunked_encoding(s);
    } else if (proposed_content_length != NO_CONTENT_LENGTH) {
      dump_content_length(s, proposed_content_length);
    }
  }

  if (content_type) {
    dump_header(s, "Content-Type", content_type);
  }
  dump_header_if_nonempty(s, "Server", g_conf()->rgw_service_provider_name);

  try {
    RESTFUL_IO(s)->complete_header();
  } catch (rgw::io::Exception& e) {
    ldpp_dout(s, 0) << "ERROR: RESTFUL_IO(s)->complete_header() returned err="
                    << e.what() << dendl;
  }

  ACCOUNTING_IO(s)->set_account(true);
  rgw_flush_formatter_and_reset(s, s->formatter);
}
+
+static void build_redirect_url(req_state *s, const string& redirect_base, string *redirect_url)
+{
+ string& dest_uri = *redirect_url;
+
+ dest_uri = redirect_base;
+ /*
+ * reqest_uri is always start with slash, so we need to remove
+ * the unnecessary slash at the end of dest_uri.
+ */
+ if (dest_uri[dest_uri.size() - 1] == '/') {
+ dest_uri = dest_uri.substr(0, dest_uri.size() - 1);
+ }
+ dest_uri += s->info.request_uri;
+ dest_uri += "?";
+ dest_uri += s->info.request_params;
+}
+
/* Terminate the request early with an error (or redirect) response.
 *
 * The op's — or, failing that, the handler's — error_handler gets a
 * chance to translate the error and/or produce an error document; a
 * resulting err_no of 0 means it was handled completely and nothing more
 * is sent here.  A 404 with a configured redirect zone endpoint becomes a
 * 301 permanent redirect.  Any error-document content is sent as the
 * response body.  Always bumps the failed-request perf counter. */
void abort_early(req_state *s, RGWOp* op, int err_no,
		 RGWHandler* handler, optional_yield y)
{
  string error_content("");
  if (!s->formatter) {
    /* ensure a formatter exists so error output can be rendered */
    s->formatter = new JSONFormatter;
    s->format = RGWFormat::JSON;
  }

  // op->error_handler is responsible for calling it's handler error_handler
  if (op != NULL) {
    int new_err_no;
    new_err_no = op->error_handler(err_no, &error_content, y);
    ldpp_dout(s, 1) << "op->ERRORHANDLER: err_no=" << err_no
                    << " new_err_no=" << new_err_no << dendl;
    err_no = new_err_no;
  } else if (handler != NULL) {
    int new_err_no;
    new_err_no = handler->error_handler(err_no, &error_content, y);
    ldpp_dout(s, 1) << "handler->ERRORHANDLER: err_no=" << err_no
                    << " new_err_no=" << new_err_no << dendl;
    err_no = new_err_no;
  }

  // If the error handler(s) above dealt with it completely, they should have
  // returned 0. If non-zero, we need to continue here.
  if (err_no) {
    // Watch out, we might have a custom error state already set!
    if (!s->err.http_ret || s->err.http_ret == 200) {
      set_req_state_err(s, err_no);
    }

    if (s->err.http_ret == 404 && !s->redirect_zone_endpoint.empty()) {
      s->err.http_ret = 301;
      err_no = -ERR_PERMANENT_REDIRECT;
      build_redirect_url(s, s->redirect_zone_endpoint, &s->redirect);
    }

    dump_errno(s);
    dump_bucket_from_state(s);
    if (err_no == -ERR_PERMANENT_REDIRECT || err_no == -ERR_WEBSITE_REDIRECT) {
      string dest_uri;
      if (!s->redirect.empty()) {
        dest_uri = s->redirect;
      } else if (!s->zonegroup_endpoint.empty()) {
        build_redirect_url(s, s->zonegroup_endpoint, &dest_uri);
      }

      if (!dest_uri.empty()) {
        dump_redirect(s, dest_uri);
      }
    }

    if (!error_content.empty()) {
      /*
       * TODO we must add all error entries as headers here:
       * when having a working errordoc, then the s3 error fields are
       * rendered as HTTP headers, e.g.:
       *   x-amz-error-code: NoSuchKey
       *   x-amz-error-message: The specified key does not exist.
       *   x-amz-error-detail-Key: foo
       */
      end_header(s, op, NULL, error_content.size(), false, true);
      RESTFUL_IO(s)->send_body(error_content.c_str(), error_content.size());
    } else {
      end_header(s, op);
    }
  }
  perfcounter->inc(l_rgw_failed_req);
}
+
/* Send an HTTP "100 Continue" interim response; failures are logged only. */
void dump_continue(req_state * const s)
{
  try {
    RESTFUL_IO(s)->send_100_continue();
  } catch (rgw::io::Exception& e) {
    ldpp_dout(s, 0) << "ERROR: RESTFUL_IO(s)->send_100_continue() returned err="
                    << e.what() << dendl;
  }
}
+
// Emit a Content-Range header: "bytes <ofs>-<end>/<total>", or the
// unsatisfied form "bytes */<total>" when 'total' is zero.
// NOTE(review): with total == 0 the unsatisfied form renders "bytes */0";
// presumably this is meant for the unsatisfiable-range reply — confirm
// against callers.
void dump_range(req_state* const s,
                const uint64_t ofs,
                const uint64_t end,
                const uint64_t total)
{
  /* dumping range into temp buffer first, as libfcgi will fail to digest
   * %lld */
  char range_buf[128];
  size_t len;

  if (! total) {
    len = snprintf(range_buf, sizeof(range_buf), "bytes */%lld",
                   static_cast<long long>(total));
  } else {
    len = snprintf(range_buf, sizeof(range_buf), "bytes %lld-%lld/%lld",
                   static_cast<long long>(ofs),
                   static_cast<long long>(end),
                   static_cast<long long>(total));
  }

  return dump_header(s, "Content-Range", std::string_view(range_buf, len));
}
+
+
/* Write response body bytes through the I/O layer.  Bytes are charged
 * against the per-user and (when a bucket is set) per-bucket rate limits;
 * health-check requests are exempt so probes are never throttled.
 * Returns the send_body() result or a negative errno from the transport. */
int dump_body(req_state* const s,
              const char* const buf,
              const size_t len)
{
  bool healthchk = false;
  // we dont want to limit health checks
  if(s->op_type == RGW_OP_GET_HEALTH_CHECK)
    healthchk = true;
  if(len > 0 && !healthchk) {
    const char *method = s->info.method;
    s->ratelimit_data->decrease_bytes(method, s->ratelimit_user_name, len, &s->user_ratelimit);
    if(!rgw::sal::Bucket::empty(s->bucket.get()))
      s->ratelimit_data->decrease_bytes(method, s->ratelimit_bucket_marker, len, &s->bucket_ratelimit);
  }
  try {
    return RESTFUL_IO(s)->send_body(buf, len);
  } catch (rgw::io::Exception& e) {
    return -e.code().value();
  }
}
+
/* Convenience overload: send a bufferlist as the response body. */
int dump_body(req_state* const s, /* const */ ceph::buffer::list& bl)
{
  return dump_body(s, bl.c_str(), bl.length());
}
+
/* Convenience overload: send a std::string as the response body. */
int dump_body(req_state* const s, const std::string& str)
{
  return dump_body(s, str.c_str(), str.length());
}
+
/* Read up to 'max' request-body bytes.  Mirrors dump_body(): received
 * bytes are charged against user/bucket rate limits, with health checks
 * exempt.  Returns bytes read or a negative errno from the transport. */
int recv_body(req_state* const s,
              char* const buf,
              const size_t max)
{
  int len;
  try {
    len = RESTFUL_IO(s)->recv_body(buf, max);
  } catch (rgw::io::Exception& e) {
    return -e.code().value();
  }
  bool healthchk = false;
  // we dont want to limit health checks
  if(s->op_type == RGW_OP_GET_HEALTH_CHECK)
    healthchk = true;
  if(len > 0 && !healthchk) {
    const char *method = s->info.method;
    s->ratelimit_data->decrease_bytes(method, s->ratelimit_user_name, len, &s->user_ratelimit);
    if(!rgw::sal::Bucket::empty(s->bucket.get()))
      s->ratelimit_data->decrease_bytes(method, s->ratelimit_bucket_marker, len, &s->bucket_ratelimit);
  }
  return len;

}
+
/* Collect range and conditional-GET headers from the request environment.
 * System (inter-zone) requests additionally pick up destination zone /
 * placement-group version hints, and the rgwx "stat" flag suppresses the
 * data read.  A torrent query arg delegates to torrent param parsing. */
int RGWGetObj_ObjStore::get_params(optional_yield y)
{
  range_str = s->info.env->get("HTTP_RANGE");
  if_mod = s->info.env->get("HTTP_IF_MODIFIED_SINCE");
  if_unmod = s->info.env->get("HTTP_IF_UNMODIFIED_SINCE");
  if_match = s->info.env->get("HTTP_IF_MATCH");
  if_nomatch = s->info.env->get("HTTP_IF_NONE_MATCH");

  if (s->system_request) {
    mod_zone_id = s->info.env->get_int("HTTP_DEST_ZONE_SHORT_ID", 0);
    mod_pg_ver = s->info.env->get_int("HTTP_DEST_PG_VER", 0);
    rgwx_stat = s->info.args.exists(RGW_SYS_PARAM_PREFIX "stat");
    get_data &= (!rgwx_stat);  /* stat-only: suppress object data read */
  }

  if (s->info.args.exists(GET_TORRENT)) {
    return torrent.get_params();
  }
  return 0;
}
+
+int RESTArgs::get_string(req_state *s, const string& name,
+ const string& def_val, string *val, bool *existed)
+{
+ bool exists;
+ *val = s->info.args.get(name, &exists);
+
+ if (existed)
+ *existed = exists;
+
+ if (!exists) {
+ *val = def_val;
+ return 0;
+ }
+
+ return 0;
+}
+
+int RESTArgs::get_uint64(req_state *s, const string& name,
+ uint64_t def_val, uint64_t *val, bool *existed)
+{
+ bool exists;
+ string sval = s->info.args.get(name, &exists);
+
+ if (existed)
+ *existed = exists;
+
+ if (!exists) {
+ *val = def_val;
+ return 0;
+ }
+
+ int r = stringtoull(sval, val);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+int RESTArgs::get_int64(req_state *s, const string& name,
+ int64_t def_val, int64_t *val, bool *existed)
+{
+ bool exists;
+ string sval = s->info.args.get(name, &exists);
+
+ if (existed)
+ *existed = exists;
+
+ if (!exists) {
+ *val = def_val;
+ return 0;
+ }
+
+ int r = stringtoll(sval, val);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+int RESTArgs::get_uint32(req_state *s, const string& name,
+ uint32_t def_val, uint32_t *val, bool *existed)
+{
+ bool exists;
+ string sval = s->info.args.get(name, &exists);
+
+ if (existed)
+ *existed = exists;
+
+ if (!exists) {
+ *val = def_val;
+ return 0;
+ }
+
+ int r = stringtoul(sval, val);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+int RESTArgs::get_int32(req_state *s, const string& name,
+ int32_t def_val, int32_t *val, bool *existed)
+{
+ bool exists;
+ string sval = s->info.args.get(name, &exists);
+
+ if (existed)
+ *existed = exists;
+
+ if (!exists) {
+ *val = def_val;
+ return 0;
+ }
+
+ int r = stringtol(sval, val);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+int RESTArgs::get_time(req_state *s, const string& name,
+ const utime_t& def_val, utime_t *val, bool *existed)
+{
+ bool exists;
+ string sval = s->info.args.get(name, &exists);
+
+ if (existed)
+ *existed = exists;
+
+ if (!exists) {
+ *val = def_val;
+ return 0;
+ }
+
+ uint64_t epoch, nsec;
+
+ int r = utime_t::parse_date(sval, &epoch, &nsec);
+ if (r < 0)
+ return r;
+
+ *val = utime_t(epoch, nsec);
+
+ return 0;
+}
+
+int RESTArgs::get_epoch(req_state *s, const string& name, uint64_t def_val, uint64_t *epoch, bool *existed)
+{
+ bool exists;
+ string date = s->info.args.get(name, &exists);
+
+ if (existed)
+ *existed = exists;
+
+ if (!exists) {
+ *epoch = def_val;
+ return 0;
+ }
+
+ int r = utime_t::parse_date(date, epoch, NULL);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+int RESTArgs::get_bool(req_state *s, const string& name, bool def_val, bool *val, bool *existed)
+{
+ bool exists;
+ string sval = s->info.args.get(name, &exists);
+
+ if (existed)
+ *existed = exists;
+
+ if (!exists) {
+ *val = def_val;
+ return 0;
+ }
+
+ const char *str = sval.c_str();
+
+ if (sval.empty() ||
+ strcasecmp(str, "true") == 0 ||
+ sval.compare("1") == 0) {
+ *val = true;
+ return 0;
+ }
+
+ if (strcasecmp(str, "false") != 0 &&
+ sval.compare("0") != 0) {
+ *val = def_val;
+ return -EINVAL;
+ }
+
+ *val = false;
+ return 0;
+}
+
+
/* First flush: commit the final status (no going back from here), emit
 * status/headers and the formatter's opening output, then flush. */
void RGWRESTFlusher::do_start(int ret)
{
  set_req_state_err(s, ret); /* no going back from here */
  dump_errno(s);
  dump_start(s);
  end_header(s, op);
  rgw_flush_formatter_and_reset(s, s->formatter);
}
+
/* Subsequent flushes just stream out what the formatter has buffered. */
void RGWRESTFlusher::do_flush()
{
  rgw_flush_formatter(s, s->formatter);
}
+
+int RGWPutObj_ObjStore::verify_params()
+{
+ if (s->length) {
+ off_t len = atoll(s->length);
+ if (len > (off_t)(s->cct->_conf->rgw_max_put_size)) {
+ return -ERR_TOO_LARGE;
+ }
+ }
+
+ return 0;
+}
+
/* Gather PUT-object parameters: optional torrent-file production (when
 * rgw_torrent_flag is set) and the client-supplied Content-MD5 header. */
int RGWPutObj_ObjStore::get_params(optional_yield y)
{
  /* start gettorrent */
  if (s->cct->_conf->rgw_torrent_flag)
  {
    int ret = 0;
    ret = torrent.get_params();
    ldpp_dout(s, 5) << "NOTICE: open produce torrent file " << dendl;
    if (ret < 0)
    {
      return ret;
    }
    torrent.set_info_name(s->object->get_name());
  }
  /* end gettorrent */
  supplied_md5_b64 = s->info.env->get("HTTP_CONTENT_MD5");

  return 0;
}
+
/* Read the next chunk of the upload body into 'bl': at most
 * rgw_max_chunk_size, capped by the remaining declared length when one
 * exists.  Reads are accounted while in flight.  Returns bytes appended
 * or a negative error; enforces rgw_max_put_size on the running offset. */
int RGWPutObj_ObjStore::get_data(bufferlist& bl)
{
  size_t cl;
  uint64_t chunk_size = s->cct->_conf->rgw_max_chunk_size;
  if (s->length) {
    /* bounded upload: never read past the declared length */
    cl = atoll(s->length) - ofs;
    if (cl > chunk_size)
      cl = chunk_size;
  } else {
    cl = chunk_size;
  }

  int len = 0;
  {
    ACCOUNTING_IO(s)->set_account(true);
    bufferptr bp(cl);

    const auto read_len = recv_body(s, bp.c_str(), cl);
    if (read_len < 0) {
      return read_len;
    }

    len = read_len;
    bl.append(bp, 0, len);

    ACCOUNTING_IO(s)->set_account(false);
  }

  if ((uint64_t)ofs + len > s->cct->_conf->rgw_max_put_size) {
    return -ERR_TOO_LARGE;
  }

  return len;
}
+
+
+/*
+ * parses params in the format: 'first; param1=foo; param2=bar'
+ */
+void RGWPostObj_ObjStore::parse_boundary_params(const std::string& params_str,
+ std::string& first,
+ std::map<std::string,
+ std::string>& params)
+{
+ size_t pos = params_str.find(';');
+ if (std::string::npos == pos) {
+ first = rgw_trim_whitespace(params_str);
+ return;
+ }
+
+ first = rgw_trim_whitespace(params_str.substr(0, pos));
+ pos++;
+
+ while (pos < params_str.size()) {
+ size_t end = params_str.find(';', pos);
+ if (std::string::npos == end) {
+ end = params_str.size();
+ }
+
+ std::string param = params_str.substr(pos, end - pos);
+ size_t eqpos = param.find('=');
+
+ if (std::string::npos != eqpos) {
+ std::string param_name = rgw_trim_whitespace(param.substr(0, eqpos));
+ std::string val = rgw_trim_quotes(param.substr(eqpos + 1));
+ params[std::move(param_name)] = std::move(val);
+ } else {
+ params[rgw_trim_whitespace(param)] = "";
+ }
+
+ pos = end + 1;
+ }
+}
+
+int RGWPostObj_ObjStore::parse_part_field(const std::string& line,
+ std::string& field_name, /* out */
+ post_part_field& field) /* out */
+{
+ size_t pos = line.find(':');
+ if (pos == string::npos)
+ return -EINVAL;
+
+ field_name = line.substr(0, pos);
+ if (pos >= line.size() - 1)
+ return 0;
+
+ parse_boundary_params(line.substr(pos + 1), field.val, field.params);
+
+ return 0;
+}
+
/* True when s points at a CR immediately followed by LF. */
static bool is_crlf(const char *s)
{
  return s[0] == '\r' && s[1] == '\n';
}
+
+/*
+ * find the index of the boundary, if exists, or optionally the next end of line
+ * also returns how many bytes to skip
+ */
/* Scan up to max_len bytes of 'bl' for 'str' (the multipart boundary),
 * optionally stopping at the first CRLF when check_crlf is set.
 * Returns the index at which the caller's payload ends (or -1 when
 * neither is found), sets reached_boundary, and reports in 'skip' how
 * many bytes (boundary plus any preceding CRLF) to discard. */
static int index_of(ceph::bufferlist& bl,
                    uint64_t max_len,
                    const std::string& str,
                    const bool check_crlf,
                    bool& reached_boundary,
                    int& skip)
{
  reached_boundary = false;
  skip = 0;

  if (str.size() < 2) // we assume boundary is at least 2 chars (makes it easier with crlf checks)
    return -EINVAL;

  if (bl.length() < str.size())
    return -1;

  const char *buf = bl.c_str();
  const char *s = str.c_str();

  if (max_len > bl.length())
    max_len = bl.length();

  for (uint64_t i = 0; i < max_len; i++, buf++) {
    if (check_crlf &&
	i >= 1 &&
	is_crlf(buf - 1)) {
      return i + 1; // skip the crlf
    }
    /* cheap two-byte prefix check before the full strncmp */
    if ((i < max_len - str.size() + 1) &&
	(buf[0] == s[0] && buf[1] == s[1]) &&
	(strncmp(buf, s, str.size()) == 0)) {
      reached_boundary = true;
      skip = str.size();

      /* oh, great, now we need to swallow the preceding crlf
       * if exists
       */
      if ((i >= 2) &&
	  is_crlf(buf - 2)) {
	i -= 2;
	skip += 2;
      }
      return i;
    }
  }

  return -1;
}
+
/* Read up to 'max' payload bytes into 'bl', stopping at the multipart
 * boundary (and, when check_crlf is set, at end-of-line).  Maintains the
 * in_data buffer across calls: bytes consumed — payload, boundary, any
 * surrounding CRLF, and the final "--" terminator — are removed from it.
 * Sets reached_boundary when the boundary was hit and done when the
 * closing "--" was seen.  Returns 0 or a negative receive error. */
int RGWPostObj_ObjStore::read_with_boundary(ceph::bufferlist& bl,
                                            uint64_t max,
                                            const bool check_crlf,
                                            bool& reached_boundary,
                                            bool& done)
{
  /* enough to hold the payload plus a trailing CRLF and the boundary */
  uint64_t cl = max + 2 + boundary.size();

  if (max > in_data.length()) {
    uint64_t need_to_read = cl - in_data.length();

    bufferptr bp(need_to_read);

    const auto read_len = recv_body(s, bp.c_str(), need_to_read);
    if (read_len < 0) {
      return read_len;
    }
    in_data.append(bp, 0, read_len);
  }

  done = false;
  int skip;
  const int index = index_of(in_data, cl, boundary, check_crlf,
                             reached_boundary, skip);
  if (index >= 0) {
    max = index;
  }

  if (max > in_data.length()) {
    max = in_data.length();
  }

  bl.substr_of(in_data, 0, max);

  ceph::bufferlist new_read_data;

  /*
   * now we need to skip boundary for next time, also skip any crlf, or
   * check to see if it's the last final boundary (marked with "--" at the end
   */
  if (reached_boundary) {
    int left = in_data.length() - max;
    if (left < skip + 2) {
      /* not enough buffered to inspect past the boundary — read more */
      int need = skip + 2 - left;
      bufferptr boundary_bp(need);
      const int r = recv_body(s, boundary_bp.c_str(), need);
      if (r < 0) {
	return r;
      }
      in_data.append(boundary_bp);
    }
    max += skip; // skip boundary for next time
    if (in_data.length() >= max + 2) {
      const char *data = in_data.c_str();
      if (is_crlf(data + max)) {
	max += 2;
      } else {
	if (*(data + max) == '-' &&
	    *(data + max + 1) == '-') {
	  /* "--" after the boundary: this was the final boundary */
	  done = true;
	  max += 2;
	}
      }
    }
  }

  new_read_data.substr_of(in_data, max, in_data.length() - max);
  in_data = new_read_data;

  return 0;
}
+
/* Read a single header-style line (stops at CRLF or at the boundary). */
int RGWPostObj_ObjStore::read_line(ceph::bufferlist& bl,
                                   const uint64_t max,
                                   bool& reached_boundary,
                                   bool& done)
{
  return read_with_boundary(bl, max, true, reached_boundary, done);
}
+
/* Read raw part payload (stops only at the boundary, not at CRLF). */
int RGWPostObj_ObjStore::read_data(ceph::bufferlist& bl,
                                   const uint64_t max,
                                   bool& reached_boundary,
                                   bool& done)
{
  return read_with_boundary(bl, max, false, reached_boundary, done);
}
+
+
/* Parse one multipart form part's header block: skip the leading
 * boundary if present, then read header lines until a blank line,
 * recording each field and extracting the part name from
 * Content-Disposition.  'done' is set when the closing boundary was
 * reached.  Returns 0 or a negative read/parse error. */
int RGWPostObj_ObjStore::read_form_part_header(struct post_form_part* const part,
                                               bool& done)
{
  bufferlist bl;
  bool reached_boundary;
  uint64_t chunk_size = s->cct->_conf->rgw_max_chunk_size;
  int r = read_line(bl, chunk_size, reached_boundary, done);
  if (r < 0) {
    return r;
  }

  if (done) {
    return 0;
  }

  if (reached_boundary) { // skip the first boundary
    r = read_line(bl, chunk_size, reached_boundary, done);
    if (r < 0) {
      return r;
    } else if (done) {
      return 0;
    }
  }

  while (true) {
    /*
     * iterate through fields
     */
    std::string line = rgw_trim_whitespace(string(bl.c_str(), bl.length()));

    if (line.empty()) {
      /* blank line terminates the header block */
      break;
    }

    struct post_part_field field;

    string field_name;
    r = parse_part_field(line, field_name, field);
    if (r < 0) {
      return r;
    }

    part->fields[field_name] = field;

    if (stringcasecmp(field_name, "Content-Disposition") == 0) {
      part->name = field.params["name"];
    }

    if (reached_boundary) {
      break;
    }

    r = read_line(bl, chunk_size, reached_boundary, done);
    if (r < 0) {
      return r;
    }
  }

  return 0;
}
+
+bool RGWPostObj_ObjStore::part_str(parts_collection_t& parts,
+ const std::string& name,
+ std::string* val)
+{
+ const auto iter = parts.find(name);
+ if (std::end(parts) == iter) {
+ return false;
+ }
+
+ ceph::bufferlist& data = iter->second.data;
+ std::string str = string(data.c_str(), data.length());
+ *val = rgw_trim_whitespace(str);
+ return true;
+}
+
+std::string RGWPostObj_ObjStore::get_part_str(parts_collection_t& parts,
+ const std::string& name,
+ const std::string& def_val)
+{
+ std::string val;
+
+ if (part_str(parts, name, &val)) {
+ return val;
+ } else {
+ return rgw_trim_whitespace(def_val);
+ }
+}
+
+bool RGWPostObj_ObjStore::part_bl(parts_collection_t& parts,
+ const std::string& name,
+ ceph::bufferlist* pbl)
+{
+ const auto iter = parts.find(name);
+ if (std::end(parts) == iter) {
+ return false;
+ }
+
+ *pbl = iter->second.data;
+ return true;
+}
+
+int RGWPostObj_ObjStore::verify_params()
+{
+ /* check that we have enough memory to store the object
+ note that this test isn't exact and may fail unintentionally
+ for large requests is */
+ if (!s->length) {
+ return -ERR_LENGTH_REQUIRED;
+ }
+ off_t len = atoll(s->length);
+ if (len > (off_t)(s->cct->_conf->rgw_max_put_size)) {
+ return -ERR_TOO_LARGE;
+ }
+
+ supplied_md5_b64 = s->info.env->get("HTTP_CONTENT_MD5");
+
+ return 0;
+}
+
/* Validate the POST form request: require Content-Type
 * multipart/form-data and extract the multipart boundary from its
 * parameters.  Sends "100 Continue" first when the client expects it,
 * since the form parameters live in the request body. */
int RGWPostObj_ObjStore::get_params(optional_yield y)
{
  if (s->expect_cont) {
    /* OK, here it really gets ugly. With POST, the params are embedded in the
     * request body, so we need to continue before being able to actually look
     * at them. This diverts from the usual request flow. */
    dump_continue(s);
    s->expect_cont = false;
  }

  std::string req_content_type_str = s->info.env->get("CONTENT_TYPE", "");
  std::string req_content_type;
  std::map<std::string, std::string> params;
  parse_boundary_params(req_content_type_str, req_content_type, params);

  if (req_content_type.compare("multipart/form-data") != 0) {
    err_msg = "Request Content-Type is not multipart/form-data";
    return -EINVAL;
  }

  if (s->cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
    ldpp_dout(s, 20) << "request content_type_str="
                     << req_content_type_str << dendl;
    ldpp_dout(s, 20) << "request content_type params:" << dendl;

    for (const auto& pair : params) {
      ldpp_dout(s, 20) << " " << pair.first << " -> " << pair.second
                       << dendl;
    }
  }

  const auto iter = params.find("boundary");
  if (std::end(params) == iter) {
    err_msg = "Missing multipart boundary specification";
    return -EINVAL;
  }

  /* Create the boundary. */
  boundary = "--";
  boundary.append(iter->second);

  return 0;
}
+
+
+int RGWPutACLs_ObjStore::get_params(optional_yield y)
+{
+ const auto max_size = s->cct->_conf->rgw_max_put_param_size;
+ std::tie(op_ret, data) = read_all_input(s, max_size, false);
+ ldpp_dout(s, 0) << "RGWPutACLs_ObjStore::get_params read data is: " << data.c_str() << dendl;
+ return op_ret;
+}
+
/* Read the lifecycle-configuration body (bounded by
 * rgw_max_put_param_size). */
int RGWPutLC_ObjStore::get_params(optional_yield y)
{
  const auto max_size = s->cct->_conf->rgw_max_put_param_size;
  std::tie(op_ret, data) = read_all_input(s, max_size, false);
  return op_ret;
}
+
/* Read the object-lock configuration body (bounded by
 * rgw_max_put_param_size). */
int RGWPutBucketObjectLock_ObjStore::get_params(optional_yield y)
{
  const auto max_size = s->cct->_conf->rgw_max_put_param_size;
  std::tie(op_ret, data) = read_all_input(s, max_size, false);
  return op_ret;
}
+
/* Read the legal-hold configuration body (bounded by
 * rgw_max_put_param_size). */
int RGWPutObjLegalHold_ObjStore::get_params(optional_yield y)
{
  const auto max_size = s->cct->_conf->rgw_max_put_param_size;
  std::tie(op_ret, data) = read_all_input(s, max_size, false);
  return op_ret;
}
+
+
/* Drain a chunked (no Content-Length) request body.  The per-iteration
 * read size grows geometrically from READ_CHUNK up to MAX_READ_CHUNK; the
 * read fails with -ERANGE once the running total exceeds 'max_read'.
 * Returns {0, data} on success or {negative errno, partial data}. */
static std::tuple<int, bufferlist> read_all_chunked_input(req_state *s, const uint64_t max_read)
{
#define READ_CHUNK 4096
#define MAX_READ_CHUNK (128 * 1024)
  int need_to_read = READ_CHUNK;
  int total = need_to_read;
  bufferlist bl;

  int read_len = 0;
  do {
    bufferptr bp(need_to_read + 1);
    read_len = recv_body(s, bp.c_str(), need_to_read);
    if (read_len < 0) {
      return std::make_tuple(read_len, std::move(bl));
    }

    bp.c_str()[read_len] = '\0';  /* NUL-terminate for C-string consumers */
    bp.set_length(read_len);
    bl.append(bp);

    if (read_len == need_to_read) {
      if (need_to_read < MAX_READ_CHUNK)
	need_to_read *= 2;

      if ((unsigned)total > max_read) {
	return std::make_tuple(-ERANGE, std::move(bl));
      }
      total += need_to_read;
    } else {
      break;  /* short read: end of body */
    }
  } while (true);

  return std::make_tuple(0, std::move(bl));
}
+
/* Read the entire request body, up to 'max_len' bytes.
 * With a Content-Length, the whole declared length is read in one pass
 * (-ERANGE when it exceeds max_len).  Without one, chunked transfer
 * encoding is accepted when allow_chunked is set; otherwise the request
 * is rejected with -ERR_LENGTH_REQUIRED.
 * Returns {0, data} or {negative error, possibly-partial data}. */
std::tuple<int, bufferlist > rgw_rest_read_all_input(req_state *s,
                                                     const uint64_t max_len,
                                                     const bool allow_chunked)
{
  size_t cl = 0;
  int len = 0;
  bufferlist bl;

  if (s->length)
    cl = atoll(s->length);
  else if (!allow_chunked)
    return std::make_tuple(-ERR_LENGTH_REQUIRED, std::move(bl));

  if (cl) {
    if (cl > (size_t)max_len) {
      return std::make_tuple(-ERANGE, std::move(bl));
    }

    bufferptr bp(cl + 1);

    len = recv_body(s, bp.c_str(), cl);
    if (len < 0) {
      return std::make_tuple(len, std::move(bl));
    }

    bp.c_str()[len] = '\0';  /* NUL-terminate for C-string consumers */
    bp.set_length(len);
    bl.append(bp);

  } else if (allow_chunked && !s->length) {
    const char *encoding = s->info.env->get("HTTP_TRANSFER_ENCODING");
    if (!encoding || strcmp(encoding, "chunked") != 0)
      return std::make_tuple(-ERR_LENGTH_REQUIRED, std::move(bl));

    int ret = 0;
    std::tie(ret, bl) = read_all_chunked_input(s, max_len);
    if (ret < 0)
      return std::make_tuple(ret, std::move(bl));
  }

  return std::make_tuple(0, std::move(bl));
}
+
+int RGWCompleteMultipart_ObjStore::get_params(optional_yield y)
+{
+ upload_id = s->info.args.get("uploadId");
+
+ if (upload_id.empty()) {
+ op_ret = -ENOTSUP;
+ return op_ret;
+ }
+
+ const auto max_size = s->cct->_conf->rgw_max_put_param_size;
+ std::tie(op_ret, data) = read_all_input(s, max_size);
+ if (op_ret < 0)
+ return op_ret;
+
+ return 0;
+}
+
+int RGWListMultipart_ObjStore::get_params(optional_yield y)
+{
+ upload_id = s->info.args.get("uploadId");
+
+ if (upload_id.empty()) {
+ op_ret = -ENOTSUP;
+ }
+ string marker_str = s->info.args.get("part-number-marker");
+
+ if (!marker_str.empty()) {
+ string err;
+ marker = strict_strtol(marker_str.c_str(), 10, &err);
+ if (!err.empty()) {
+ ldpp_dout(s, 20) << "bad marker: " << marker << dendl;
+ op_ret = -EINVAL;
+ return op_ret;
+ }
+ }
+
+ string str = s->info.args.get("max-parts");
+ op_ret = parse_value_and_bound(str, max_parts, 0,
+ g_conf().get_val<uint64_t>("rgw_max_listing_results"),
+ max_parts);
+
+ return op_ret;
+}
+
/* Parse list-multipart-uploads parameters: delimiter, prefix, bounded
 * max-uploads, optional encoding-type (only "url" is valid), and the
 * key/upload-id markers, which are resolved to marker metadata via a
 * MultipartUpload handle. */
int RGWListBucketMultiparts_ObjStore::get_params(optional_yield y)
{
  delimiter = s->info.args.get("delimiter");
  prefix = s->info.args.get("prefix");
  string str = s->info.args.get("max-uploads");
  op_ret = parse_value_and_bound(str, max_uploads, 0,
			g_conf().get_val<uint64_t>("rgw_max_listing_results"),
			default_max);
  if (op_ret < 0) {
    return op_ret;
  }

  if (auto encoding_type = s->info.args.get_optional("encoding-type");
      encoding_type != boost::none) {
    if (strcasecmp(encoding_type->c_str(), "url") != 0) {
      op_ret = -EINVAL;
      s->err.message="Invalid Encoding Method specified in Request";
      return op_ret;
    }
    encode_url = true;
  }

  string key_marker = s->info.args.get("key-marker");
  string upload_id_marker = s->info.args.get("upload-id-marker");
  if (!key_marker.empty()) {
    std::unique_ptr<rgw::sal::MultipartUpload> upload;
    upload = s->bucket->get_multipart_upload(key_marker,
					 upload_id_marker);
    marker_meta = upload->get_meta();
    marker_key = upload->get_key();
    marker_upload_id = upload->get_upload_id();
  }

  return 0;
}
+
+int RGWDeleteMultiObj_ObjStore::get_params(optional_yield y)
+{
+
+ if (s->bucket_name.empty()) {
+ op_ret = -EINVAL;
+ return op_ret;
+ }
+
+ // everything is probably fine, set the bucket
+ bucket = s->bucket.get();
+
+ const auto max_size = s->cct->_conf->rgw_max_put_param_size;
+ std::tie(op_ret, data) = read_all_input(s, max_size, false);
+ return op_ret;
+}
+
+
// Emit the HTTP response for an admin/REST op.  If nothing has been
// flushed yet, send the status line and headers first, then flush any
// formatter output the op accumulated.
void RGWRESTOp::send_response()
{
  if (!flusher.did_start()) {
    set_req_state_err(s, get_ret());
    dump_errno(s);
    end_header(s, this);
  }
  flusher.flush();
}
+
// Admin-style permission check: defer to the op's check_caps() against
// the authenticated user's capability set (default check_caps denies).
int RGWRESTOp::verify_permission(optional_yield)
{
  return check_caps(s->user->get_info().caps);
}
+
+RGWOp* RGWHandler_REST::get_op(void)
+{
+ RGWOp *op;
+ switch (s->op) {
+ case OP_GET:
+ op = op_get();
+ break;
+ case OP_PUT:
+ op = op_put();
+ break;
+ case OP_DELETE:
+ op = op_delete();
+ break;
+ case OP_HEAD:
+ op = op_head();
+ break;
+ case OP_POST:
+ op = op_post();
+ break;
+ case OP_COPY:
+ op = op_copy();
+ break;
+ case OP_OPTIONS:
+ op = op_options();
+ break;
+ default:
+ return NULL;
+ }
+
+ if (op) {
+ op->init(driver, s, this);
+ }
+ return op;
+} /* get_op */
+
// Release an op previously returned by get_op().  Ops are allocated
// fresh per request, so a plain delete is sufficient.
void RGWHandler_REST::put_op(RGWOp* op)
{
  delete op;
} /* put_op */
+
+int RGWHandler_REST::allocate_formatter(req_state *s,
+ RGWFormat default_type,
+ bool configurable)
+{
+ s->format = RGWFormat::BAD_FORMAT; // set to invalid value to allocation happens anyway
+ auto type = default_type;
+ if (configurable) {
+ string format_str = s->info.args.get("format");
+ if (format_str.compare("xml") == 0) {
+ type = RGWFormat::XML;
+ } else if (format_str.compare("json") == 0) {
+ type = RGWFormat::JSON;
+ } else if (format_str.compare("html") == 0) {
+ type = RGWFormat::HTML;
+ } else {
+ const char *accept = s->info.env->get("HTTP_ACCEPT");
+ if (accept) {
+ // trim at first ;
+ std::string_view format = accept;
+ format = format.substr(0, format.find(';'));
+
+ if (format == "text/xml" || format == "application/xml") {
+ type = RGWFormat::XML;
+ } else if (format == "application/json") {
+ type = RGWFormat::JSON;
+ } else if (format == "text/html") {
+ type = RGWFormat::HTML;
+ }
+ }
+ }
+ }
+ return RGWHandler_REST::reallocate_formatter(s, type);
+}
+
// (Re)create s->formatter for the requested format.  If the format is
// unchanged the existing formatter is simply reset; otherwise the old
// one is destroyed and a new one matching `type` is allocated.
// Returns 0 on success, -EINVAL for an unknown format.
int RGWHandler_REST::reallocate_formatter(req_state *s, const RGWFormat type)
{
  if (s->format == type) {
    // do nothing, just reset
    ceph_assert(s->formatter);
    s->formatter->reset();
    return 0;
  }

  delete s->formatter;
  s->formatter = nullptr;
  s->format = type;

  // Swift bulk operations (bulk-delete, multipart-manifest delete,
  // extract-archive) need slightly different output conventions from
  // the chosen formatter.
  const string& mm = s->info.args.get("multipart-manifest");
  const bool multipart_delete = (mm.compare("delete") == 0);
  const bool swift_bulkupload = s->prot_flags & RGW_REST_SWIFT &&
    s->info.args.exists("extract-archive");
  switch (s->format) {
    case RGWFormat::PLAIN:
      {
        const bool use_kv_syntax = s->info.args.exists("bulk-delete") ||
          multipart_delete || swift_bulkupload;
        s->formatter = new RGWFormatter_Plain(use_kv_syntax);
        break;
      }
    case RGWFormat::XML:
      {
        const bool lowercase_underscore = s->info.args.exists("bulk-delete") ||
          multipart_delete || swift_bulkupload;

        s->formatter = new XMLFormatter(false, lowercase_underscore);
        break;
      }
    case RGWFormat::JSON:
      s->formatter = new JSONFormatter(false);
      break;
    case RGWFormat::HTML:
      s->formatter = new HTMLFormatter(s->prot_flags & RGW_REST_WEBSITE);
      break;
    default:
      return -EINVAL;

  };
  //s->formatter->reset(); // All formatters should reset on create already

  return 0;
}
+// This function enforces Amazon's spec for bucket names.
+// (The requirements, not the recommendations.)
+int RGWHandler_REST::validate_bucket_name(const string& bucket)
+{
+ int len = bucket.size();
+ if (len < 3) {
+ if (len == 0) {
+ // This request doesn't specify a bucket at all
+ return 0;
+ }
+ // Name too short
+ return -ERR_INVALID_BUCKET_NAME;
+ }
+ else if (len > MAX_BUCKET_NAME_LEN) {
+ // Name too long
+ return -ERR_INVALID_BUCKET_NAME;
+ }
+
+ const char *s = bucket.c_str();
+ for (int i = 0; i < len; ++i, ++s) {
+ if (*(unsigned char *)s == 0xff)
+ return -ERR_INVALID_BUCKET_NAME;
+ if (*(unsigned char *)s == '/')
+ return -ERR_INVALID_BUCKET_NAME;
+ }
+
+ return 0;
+}
+
+// "The name for a key is a sequence of Unicode characters whose UTF-8 encoding
+// is at most 1024 bytes long."
+// However, we can still have control characters and other nasties in there.
+// Just as long as they're utf-8 nasties.
+int RGWHandler_REST::validate_object_name(const string& object)
+{
+ int len = object.size();
+ if (len > MAX_OBJ_NAME_LEN) {
+ // Name too long
+ return -ERR_INVALID_OBJECT_NAME;
+ }
+
+ if (check_utf8(object.c_str(), len)) {
+ // Object names must be valid UTF-8.
+ return -ERR_INVALID_OBJECT_NAME;
+ }
+ return 0;
+}
+
+static http_op op_from_method(const char *method)
+{
+ if (!method)
+ return OP_UNKNOWN;
+ if (strcmp(method, "GET") == 0)
+ return OP_GET;
+ if (strcmp(method, "PUT") == 0)
+ return OP_PUT;
+ if (strcmp(method, "DELETE") == 0)
+ return OP_DELETE;
+ if (strcmp(method, "HEAD") == 0)
+ return OP_HEAD;
+ if (strcmp(method, "POST") == 0)
+ return OP_POST;
+ if (strcmp(method, "COPY") == 0)
+ return OP_COPY;
+ if (strcmp(method, "OPTIONS") == 0)
+ return OP_OPTIONS;
+
+ return OP_UNKNOWN;
+}
+
// Prepare request permissions/policies before op execution.  Bucket
// creation is special-cased: there is no bucket to read policy from
// yet, so only the user's IAM policies (skipped for role/STS
// identities) and the IAM request environment are set up.  All other
// ops delegate to the dialect's do_init_permissions().
int RGWHandler_REST::init_permissions(RGWOp* op, optional_yield y)
{
  if (op->get_type() == RGW_OP_CREATE_BUCKET) {
    // We don't need user policies in case of STS token returned by AssumeRole, hence the check for user type
    if (! s->user->get_id().empty() && s->auth.identity->get_identity_type() != TYPE_ROLE) {
      try {
        if (auto ret = s->user->read_attrs(s, y); ! ret) {
          // merge the user's attached IAM policies into the request state
          auto user_policies = get_iam_user_policy_from_attr(s->cct, s->user->get_attrs(), s->user->get_tenant());
          s->iam_user_policies.insert(s->iam_user_policies.end(),
                                      std::make_move_iterator(user_policies.begin()),
                                      std::make_move_iterator(user_policies.end()));

        }
      } catch (const std::exception& e) {
        // best-effort: a malformed policy must not fail bucket creation
        ldpp_dout(op, -1) << "Error reading IAM User Policy: " << e.what() << dendl;
      }
    }
    rgw_build_iam_environment(driver, s);
    return 0;
  }

  return do_init_permissions(op, y);
}
+
// Decide, from the HTTP verb and query arguments, whether this request
// needs only bucket-level permission data or object-level data too,
// then delegate to the dialect's do_read_permissions().
int RGWHandler_REST::read_permissions(RGWOp* op_obj, optional_yield y)
{
  bool only_bucket = false;

  switch (s->op) {
  case OP_HEAD:
  case OP_GET:
    // reads may target an object, so load object permissions as well
    only_bucket = false;
    break;
  case OP_PUT:
  case OP_POST:
  case OP_COPY:
    /* is it a 'multi-object delete' request? */
    if (s->info.args.exists("delete")) {
      only_bucket = true;
      break;
    }
    if (is_obj_update_op()) {
      only_bucket = false;
      break;
    }
    /* is it a 'create bucket' request? */
    if (op_obj->get_type() == RGW_OP_CREATE_BUCKET)
      return 0;

    only_bucket = true;
    break;
  case OP_DELETE:
    // deleting object tagging needs the object's permissions; other
    // DELETEs are bucket-scoped
    if (!s->info.args.exists("tagging")){
      only_bucket = true;
    }
    break;
  case OP_OPTIONS:
    only_bucket = true;
    break;
  default:
    return -EINVAL;
  }

  return do_read_permissions(op_obj, only_bucket, y);
}
+
// Register `mgr` as the handler for the URI entry point `resource`
// (stored as "/resource").  Any previously registered manager for the
// same path is replaced (and freed).  Intermediate path components get
// default do-nothing managers so prefix lookup always has a node.
void RGWRESTMgr::register_resource(string resource, RGWRESTMgr *mgr)
{
  string r = "/";
  r.append(resource);

  /* do we have a resource manager registered for this entry point? */
  map<string, RGWRESTMgr *>::iterator iter = resource_mgrs.find(r);
  if (iter != resource_mgrs.end()) {
    delete iter->second;
  }
  resource_mgrs[r] = mgr;
  // resources_by_size lets get_resource_mgr() try longer (more
  // specific) prefixes first
  resources_by_size.insert(pair<size_t, string>(r.size(), r));

  /* now build default resource managers for the path (instead of nested entry points)
   * e.g., if the entry point is /auth/v1.0/ then we'd want to create a default
   * manager for /auth/
   */

  size_t pos = r.find('/', 1);

  while (pos != r.size() - 1 && pos != string::npos) {
    string s = r.substr(0, pos);

    iter = resource_mgrs.find(s);
    if (iter == resource_mgrs.end()) { /* only register it if one does not exist */
      resource_mgrs[s] = new RGWRESTMgr; /* a default do-nothing manager */
      resources_by_size.insert(pair<size_t, string>(s.size(), s));
    }

    pos = r.find('/', pos + 1);
  }
}
+
// Install (replacing and freeing any previous) the fallback manager
// used when no registered resource prefix matches the request URI.
void RGWRESTMgr::register_default_mgr(RGWRESTMgr *mgr)
{
  delete default_mgr;
  default_mgr = mgr;
}
+
// Recursively resolve the manager responsible for `uri`.  Registered
// prefixes are tried longest-first; on a match the remaining suffix is
// resolved by the matching sub-manager and *out_uri is rewritten to
// that suffix.  Falls back to default_mgr, or to this manager itself
// when nothing matches.
RGWRESTMgr* RGWRESTMgr::get_resource_mgr(req_state* const s,
                                         const std::string& uri,
                                         std::string* const out_uri)
{
  *out_uri = uri;

  multimap<size_t, string>::reverse_iterator iter;

  for (iter = resources_by_size.rbegin(); iter != resources_by_size.rend(); ++iter) {
    string& resource = iter->second;
    // the prefix must match AND end exactly at a path-segment boundary
    if (uri.compare(0, iter->first, resource) == 0 &&
	(uri.size() == iter->first ||
	 uri[iter->first] == '/')) {
      std::string suffix = uri.substr(iter->first);
      return resource_mgrs[resource]->get_resource_mgr(s, suffix, out_uri);
    }
  }

  if (default_mgr) {
    return default_mgr->get_resource_mgr_as_default(s, uri, out_uri);
  }

  return this;
}
+
+void RGWREST::register_x_headers(const string& s_headers)
+{
+ std::vector<std::string> hdrs = get_str_vec(s_headers);
+ for (auto& hdr : hdrs) {
+ boost::algorithm::to_upper(hdr); // XXX
+ (void) x_headers.insert(hdr);
+ }
+}
+
+RGWRESTMgr::~RGWRESTMgr()
+{
+ map<string, RGWRESTMgr *>::iterator iter;
+ for (iter = resource_mgrs.begin(); iter != resource_mgrs.end(); ++iter) {
+ delete iter->second;
+ }
+ delete default_mgr;
+}
+
+int64_t parse_content_length(const char *content_length)
+{
+ int64_t len = -1;
+
+ if (*content_length == '\0') {
+ len = 0;
+ } else {
+ string err;
+ len = strict_strtoll(content_length, 10, &err);
+ if (!err.empty()) {
+ len = -1;
+ }
+ }
+
+ return len;
+}
+
// Pre-parse the incoming request before a handler is chosen: resolve
// virtual-host (subdomain) bucket addressing including s3website
// priorities and CNAME resolution, URL-decode the request URI, and
// normalize the CONTENT_LENGTH / HTTP_CONTENT_LENGTH pair.
// Returns 0 on success, or a negative error (-ERR_ZERO_IN_URL for a
// NUL embedded in the URI, -EINVAL for a bad or negative length).
int RGWREST::preprocess(req_state *s, rgw::io::BasicClient* cio)
{
  req_info& info = s->info;

  /* save the request uri used to hash on the client side. request_uri may suffer
     modifications as part of the bucket encoding in the subdomain calling format.
     request_uri_aws4 will be used under aws4 auth */
  s->info.request_uri_aws4 = s->info.request_uri;

  s->cio = cio;

  // We need to know if this RGW instance is running the s3website API with a
  // higher priority than regular S3 API, or possibly in place of the regular
  // S3 API.
  // Map the listing of rgw_enable_apis in REVERSE order, so that items near
  // the front of the list have a higher number assigned (and -1 for items not in the list).
  list<string> apis;
  get_str_list(g_conf()->rgw_enable_apis, apis);
  int api_priority_s3 = -1;
  int api_priority_s3website = -1;
  auto api_s3website_priority_rawpos = std::find(apis.begin(), apis.end(), "s3website");
  auto api_s3_priority_rawpos = std::find(apis.begin(), apis.end(), "s3");
  if (api_s3_priority_rawpos != apis.end()) {
    api_priority_s3 = apis.size() - std::distance(apis.begin(), api_s3_priority_rawpos);
  }
  if (api_s3website_priority_rawpos != apis.end()) {
    api_priority_s3website = apis.size() - std::distance(apis.begin(), api_s3website_priority_rawpos);
  }
  ldpp_dout(s, 10) << "rgw api priority: s3=" << api_priority_s3 << " s3website=" << api_priority_s3website << dendl;
  bool s3website_enabled = api_priority_s3website >= 0;

  if (info.host.size()) {
    ssize_t pos;
    // Strip an IPv6 literal's brackets, or a trailing ":port", from the
    // Host header before matching it against configured hostnames.
    if (info.host.find('[') == 0) {
      pos = info.host.find(']');
      if (pos >=1) {
        info.host = info.host.substr(1, pos-1);
      }
    } else {
      pos = info.host.find(':');
      if (pos >= 0) {
        info.host = info.host.substr(0, pos);
      }
    }
    ldpp_dout(s, 10) << "host=" << info.host << dendl;
    string domain;
    string subdomain;
    bool in_hosted_domain_s3website = false;
    bool in_hosted_domain = rgw_find_host_in_domains(info.host, &domain, &subdomain, hostnames_set);

    string s3website_domain;
    string s3website_subdomain;

    if (s3website_enabled) {
      in_hosted_domain_s3website = rgw_find_host_in_domains(info.host, &s3website_domain, &s3website_subdomain, hostnames_s3website_set);
      if (in_hosted_domain_s3website) {
	in_hosted_domain = true; // TODO: should hostnames be a strict superset of hostnames_s3website?
        domain = s3website_domain;
        subdomain = s3website_subdomain;
      }
    }

    ldpp_dout(s, 20)
      << "subdomain=" << subdomain
      << " domain=" << domain
      << " in_hosted_domain=" << in_hosted_domain
      << " in_hosted_domain_s3website=" << in_hosted_domain_s3website
      << dendl;

    // Optionally chase a CNAME for hosts that didn't match directly.
    if (g_conf()->rgw_resolve_cname
	&& !in_hosted_domain
	&& !in_hosted_domain_s3website) {
      string cname;
      bool found;
      int r = rgw_resolver->resolve_cname(info.host, cname, &found);
      if (r < 0) {
	ldpp_dout(s, 0)
	  << "WARNING: rgw_resolver->resolve_cname() returned r=" << r
	  << dendl;
      }

      if (found) {
	ldpp_dout(s, 5) << "resolved host cname " << info.host << " -> "
			<< cname << dendl;
	in_hosted_domain =
	  rgw_find_host_in_domains(cname, &domain, &subdomain, hostnames_set);

	if (s3website_enabled
	    && !in_hosted_domain_s3website) {
	  in_hosted_domain_s3website =
	    rgw_find_host_in_domains(cname, &s3website_domain,
				     &s3website_subdomain,
				     hostnames_s3website_set);
	  if (in_hosted_domain_s3website) {
	    in_hosted_domain = true; // TODO: should hostnames be a
				     // strict superset of hostnames_s3website?
	    domain = s3website_domain;
	    subdomain = s3website_subdomain;
	  }
	}

	ldpp_dout(s, 20)
	  << "subdomain=" << subdomain
	  << " domain=" << domain
	  << " in_hosted_domain=" << in_hosted_domain
	  << " in_hosted_domain_s3website=" << in_hosted_domain_s3website
	  << dendl;
      }
    }

    // Handle A/CNAME records that point to the RGW storage, but do not match
    // the CNAME test above, per issue http://tracker.ceph.com/issues/15975
    // If BOTH domain & subdomain variables are empty, then none of the above
    // cases matched anything, and we should fall back to using the Host header
    // directly as the bucket name.
    // As additional checks:
    // - if the Host header is an IP, we're using path-style access without DNS
    // - Also check that the Host header is a valid bucket name before using it.
    // - Don't enable virtual hosting if no hostnames are configured
    if (subdomain.empty()
        && (domain.empty() || domain != info.host)
        && !looks_like_ip_address(info.host.c_str())
        && RGWHandler_REST::validate_bucket_name(info.host) == 0
        && !(hostnames_set.empty() && hostnames_s3website_set.empty())) {
      subdomain.append(info.host);
      in_hosted_domain = 1;
    }

    if (s3website_enabled && api_priority_s3website > api_priority_s3) {
      in_hosted_domain_s3website = 1;
    }

    if (in_hosted_domain_s3website) {
      s->prot_flags |= RGW_REST_WEBSITE;
    }


    // Virtual-host style: fold the bucket (subdomain) back into the
    // request URI so downstream parsing sees "/bucket/...".
    if (in_hosted_domain && !subdomain.empty()) {
      string encoded_bucket = "/";
      encoded_bucket.append(subdomain);
      if (s->info.request_uri[0] != '/')
        encoded_bucket.append("/");
      encoded_bucket.append(s->info.request_uri);
      s->info.request_uri = encoded_bucket;
    }

    if (!domain.empty()) {
      s->info.domain = domain;
    }

    ldpp_dout(s, 20)
      << "final domain/bucket"
      << " subdomain=" << subdomain
      << " domain=" << domain
      << " in_hosted_domain=" << in_hosted_domain
      << " in_hosted_domain_s3website=" << in_hosted_domain_s3website
      << " s->info.domain=" << s->info.domain
      << " s->info.request_uri=" << s->info.request_uri
      << dendl;
  }

  if (s->info.domain.empty()) {
    s->info.domain = s->cct->_conf->rgw_dns_name;
  }

  s->decoded_uri = url_decode(s->info.request_uri);
  /* Validate for being free of the '\0' buried in the middle of the string. */
  if (std::strlen(s->decoded_uri.c_str()) != s->decoded_uri.length()) {
    return -ERR_ZERO_IN_URL;
  }

  /* FastCGI specification, section 6.3
   * http://www.fastcgi.com/devkit/doc/fcgi-spec.html#S6.3
   * ===
   * The Authorizer application receives HTTP request information from the Web
   * server on the FCGI_PARAMS stream, in the same format as a Responder. The
   * Web server does not send CONTENT_LENGTH, PATH_INFO, PATH_TRANSLATED, and
   * SCRIPT_NAME headers.
   * ===
   * Ergo if we are in Authorizer role, we MUST look at HTTP_CONTENT_LENGTH
   * instead of CONTENT_LENGTH for the Content-Length.
   *
   * There is one slight wrinkle in this, and that's older versions of
   * nginx/lighttpd/apache setting BOTH headers. As a result, we have to check
   * both headers and can't always simply pick A or B.
   */
  const char* content_length = info.env->get("CONTENT_LENGTH");
  const char* http_content_length = info.env->get("HTTP_CONTENT_LENGTH");
  if (!http_content_length != !content_length) {
    /* Easy case: one or the other is missing */
    s->length = (content_length ? content_length : http_content_length);
  } else if (s->cct->_conf->rgw_content_length_compat &&
	     content_length && http_content_length) {
    /* Hard case: Both are set, we have to disambiguate */
    int64_t content_length_i, http_content_length_i;

    content_length_i = parse_content_length(content_length);
    http_content_length_i = parse_content_length(http_content_length);

    // Now check them:
    if (http_content_length_i < 0) {
      // HTTP_CONTENT_LENGTH is invalid, ignore it
    } else if (content_length_i < 0) {
      // CONTENT_LENGTH is invalid, and HTTP_CONTENT_LENGTH is valid
      // Swap entries
      content_length = http_content_length;
    } else {
      // both CONTENT_LENGTH and HTTP_CONTENT_LENGTH are valid
      // Let's pick the larger size
      if (content_length_i < http_content_length_i) {
	// prefer the larger value
	content_length = http_content_length;
      }
    }
    s->length = content_length;
    // End of: else if (s->cct->_conf->rgw_content_length_compat &&
    //   content_length &&
    // http_content_length)
  } else {
    /* no content length was defined */
    s->length = NULL;
  }

  // Parse the chosen Content-Length string strictly; an empty value
  // means a zero-length body.
  if (s->length) {
    if (*s->length == '\0') {
      s->content_length = 0;
    } else {
      string err;
      s->content_length = strict_strtoll(s->length, 10, &err);
      if (!err.empty()) {
	ldpp_dout(s, 10) << "bad content length, aborting" << dendl;
	return -EINVAL;
      }
    }
  }

  if (s->content_length < 0) {
    ldpp_dout(s, 10) << "negative content length, aborting" << dendl;
    return -EINVAL;
  }

  // Copy recognized generic headers (configured in generic_attrs_map)
  // into the request's generic attributes.
  map<string, string>::iterator giter;
  for (giter = generic_attrs_map.begin(); giter != generic_attrs_map.end();
       ++giter) {
    const char *env = info.env->get(giter->first.c_str());
    if (env) {
      s->generic_attrs[giter->second] = env;
    }
  }

  if (g_conf()->rgw_print_continue) {
    const char *expect = info.env->get("HTTP_EXPECT");
    s->expect_cont = (expect && !strcasecmp(expect, "100-continue"));
  }
  s->op = op_from_method(info.method);

  info.init_meta_info(s, &s->has_bad_meta);

  return 0;
}
+
// Preprocess the request, locate the REST manager matching the decoded
// URI, and obtain + initialize the dialect handler for it.  On any
// failure *init_error is set (negative) and nullptr is returned; on
// success *pmgr (if non-null) receives the matched manager.
RGWHandler_REST* RGWREST::get_handler(
  rgw::sal::Driver* const driver,
  req_state* const s,
  const rgw::auth::StrategyRegistry& auth_registry,
  const std::string& frontend_prefix,
  RGWRestfulIO* const rio,
  RGWRESTMgr** const pmgr,
  int* const init_error
) {
  *init_error = preprocess(s, rio);
  if (*init_error < 0) {
    return nullptr;
  }

  RGWRESTMgr *m = mgr.get_manager(s, frontend_prefix, s->decoded_uri,
				  &s->relative_uri);
  if (! m) {
    *init_error = -ERR_METHOD_NOT_ALLOWED;
    return nullptr;
  }

  if (pmgr) {
    *pmgr = m;
  }

  RGWHandler_REST* handler = m->get_handler(driver, s, auth_registry, frontend_prefix);
  if (! handler) {
    *init_error = -ERR_METHOD_NOT_ALLOWED;
    return NULL;
  }

  ldpp_dout(s, 20) << __func__ << " handler=" << typeid(*handler).name() << dendl;

  *init_error = handler->init(driver, s, rio);
  if (*init_error < 0) {
    // handler ownership goes back to the manager on failed init
    m->put_handler(handler);
    return nullptr;
  }

  return handler;
} /* get stream handler */
diff --git a/src/rgw/rgw_rest.h b/src/rgw/rgw_rest.h
new file mode 100644
index 000000000..434de99e9
--- /dev/null
+++ b/src/rgw/rgw_rest.h
@@ -0,0 +1,819 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#define TIME_BUF_SIZE 128
+
+#include <string_view>
+#include <boost/container/flat_set.hpp>
+#include "common/sstring.hh"
+#include "common/ceph_json.h"
+#include "include/ceph_assert.h" /* needed because of common/ceph_json.h */
+#include "rgw_op.h"
+#include "rgw_formats.h"
+#include "rgw_client_io.h"
+#include "rgw_lua_background.h"
+
// Mapping of RGW attribute names to the HTTP header names they are
// emitted as.
extern std::map<std::string, std::string> rgw_to_http_attrs;

// One-time REST-layer initialization for a zonegroup (attr/hostname tables).
extern void rgw_rest_init(CephContext *cct, const rgw::sal::ZoneGroup& zone_group);

// Flush pending formatter output to the client; the _and_reset variant
// also resets the formatter for reuse.
extern void rgw_flush_formatter_and_reset(req_state *s,
					 ceph::Formatter *formatter);

extern void rgw_flush_formatter(req_state *s,
				ceph::Formatter *formatter);
+
+inline std::string_view rgw_sanitized_hdrval(ceph::buffer::list& raw)
+{
+ /* std::string and thus std::string_view ARE OBLIGED to carry multiple
+ * 0x00 and count them to the length of a string. We need to take that
+ * into consideration and sanitize the size of a ceph::buffer::list used
+ * to store metadata values (x-amz-meta-*, X-Container-Meta-*, etags).
+ * Otherwise we might send 0x00 to clients. */
+ const char* const data = raw.c_str();
+ size_t len = raw.length();
+
+ if (len && data[len - 1] == '\0') {
+ /* That's the case - the null byte has been included at the last position
+ * of the bufferlist. We need to restore the proper string length we'll
+ * pass to string_ref. */
+ len--;
+ }
+
+ return std::string_view(data, len);
+}
+
// Read the whole request body (up to max_len), parse it as JSON and
// decode it into `out`.  Returns {0, body} on success; {-EINVAL, body}
// for an empty body, unparseable JSON, or a decode failure; or the
// negative error from reading the input.  The raw body is returned in
// all cases so callers can keep it (e.g. for auditing/forwarding).
template <class T>
std::tuple<int, bufferlist > rgw_rest_get_json_input_keep_data(CephContext *cct, req_state *s, T& out, uint64_t max_len)
{
  int rv = 0;
  bufferlist data;
  std::tie(rv, data) = rgw_rest_read_all_input(s, max_len);
  if (rv < 0) {
    return std::make_tuple(rv, std::move(data));
  }

  if (!data.length()) {
    return std::make_tuple(-EINVAL, std::move(data));
  }

  JSONParser parser;

  if (!parser.parse(data.c_str(), data.length())) {
    return std::make_tuple(-EINVAL, std::move(data));
  }

  try {
    decode_json_obj(out, &parser);
  } catch (JSONDecoder::err& e) {
    // decode errors are reported uniformly as -EINVAL
    return std::make_tuple(-EINVAL, std::move(data));
  }

  return std::make_tuple(0, std::move(data));
}
+
// Typed accessors over a request's query-string arguments.  Each
// getter writes def_val into *val when the argument is absent and, if
// `existed` is non-null, reports whether the argument was present.
class RESTArgs {
public:
  static int get_string(req_state *s, const std::string& name,
                        const std::string& def_val, std::string *val,
                        bool *existed = NULL);
  static int get_uint64(req_state *s, const std::string& name,
                        uint64_t def_val, uint64_t *val, bool *existed = NULL);
  static int get_int64(req_state *s, const std::string& name,
                       int64_t def_val, int64_t *val, bool *existed = NULL);
  static int get_uint32(req_state *s, const std::string& name,
                        uint32_t def_val, uint32_t *val, bool *existed = NULL);
  static int get_int32(req_state *s, const std::string& name,
                       int32_t def_val, int32_t *val, bool *existed = NULL);
  static int get_time(req_state *s, const std::string& name,
                      const utime_t& def_val, utime_t *val,
                      bool *existed = NULL);
  // parses the argument as seconds-since-epoch into *epoch
  static int get_epoch(req_state *s, const std::string& name,
                       uint64_t def_val, uint64_t *epoch,
                       bool *existed = NULL);
  static int get_bool(req_state *s, const std::string& name, bool def_val,
                      bool *val, bool *existed = NULL);
};
+
// Formatter flusher bound to a request/op pair; do_start() sends the
// HTTP headers before the first flush (see RGWRESTOp::send_response).
class RGWRESTFlusher : public RGWFormatterFlusher {
  req_state *s;
  RGWOp *op;
protected:
  void do_flush() override;
  void do_start(int ret) override;
public:
  RGWRESTFlusher(req_state *_s, RGWOp *_op) :
    RGWFormatterFlusher(_s->formatter), s(_s), op(_op) {}
  RGWRESTFlusher() : RGWFormatterFlusher(NULL), s(NULL), op(NULL) {}

  // late binding used by RGWRESTOp::init() after the op is constructed
  void init(req_state *_s, RGWOp *_op) {
    s = _s;
    op = _op;
    set_formatter(s->formatter);
  }
};
+
// REST-layer GET-object base; tracks whether the response header has
// already been sent so streaming responses emit it exactly once.
class RGWGetObj_ObjStore : public RGWGetObj
{
protected:
  bool sent_header;  // true once the HTTP response header went out
public:
  RGWGetObj_ObjStore() : sent_header(false) {}

  void init(rgw::sal::Driver* driver, req_state *s, RGWHandler *h) override {
    RGWGetObj::init(driver, s, h);
    sent_header = false;
  }

  int get_params(optional_yield y) override;
};
+
+class RGWGetObjTags_ObjStore : public RGWGetObjTags {
+public:
+ RGWGetObjTags_ObjStore() {};
+ ~RGWGetObjTags_ObjStore() {};
+};
+
+class RGWPutObjTags_ObjStore: public RGWPutObjTags {
+public:
+ RGWPutObjTags_ObjStore() {};
+ ~RGWPutObjTags_ObjStore() {};
+};
+
+class RGWGetBucketTags_ObjStore : public RGWGetBucketTags {
+public:
+ RGWGetBucketTags_ObjStore() = default;
+ virtual ~RGWGetBucketTags_ObjStore() = default;
+};
+
+class RGWPutBucketTags_ObjStore: public RGWPutBucketTags {
+public:
+ RGWPutBucketTags_ObjStore() = default;
+ virtual ~RGWPutBucketTags_ObjStore() = default;
+};
+
+class RGWGetBucketReplication_ObjStore : public RGWGetBucketReplication {
+public:
+ RGWGetBucketReplication_ObjStore() {};
+ ~RGWGetBucketReplication_ObjStore() {};
+};
+
+class RGWPutBucketReplication_ObjStore: public RGWPutBucketReplication {
+public:
+ RGWPutBucketReplication_ObjStore() = default;
+ virtual ~RGWPutBucketReplication_ObjStore() = default;
+};
+
+class RGWDeleteBucketReplication_ObjStore: public RGWDeleteBucketReplication {
+public:
+ RGWDeleteBucketReplication_ObjStore() = default;
+ virtual ~RGWDeleteBucketReplication_ObjStore() = default;
+};
+
// REST-layer bases for account/bucket listing, stat, and bucket
// create/delete ops.  No behavior is added at this layer.

class RGWListBuckets_ObjStore : public RGWListBuckets {
public:
  RGWListBuckets_ObjStore() {}
  ~RGWListBuckets_ObjStore() override {}
};

class RGWGetUsage_ObjStore : public RGWGetUsage {
public:
  RGWGetUsage_ObjStore() {}
  ~RGWGetUsage_ObjStore() override {}
};

class RGWListBucket_ObjStore : public RGWListBucket {
public:
  RGWListBucket_ObjStore() {}
  ~RGWListBucket_ObjStore() override {}
};

class RGWStatAccount_ObjStore : public RGWStatAccount {
public:
  RGWStatAccount_ObjStore() {}
  ~RGWStatAccount_ObjStore() override {}
};

class RGWStatBucket_ObjStore : public RGWStatBucket {
public:
  RGWStatBucket_ObjStore() {}
  ~RGWStatBucket_ObjStore() override {}
};

class RGWCreateBucket_ObjStore : public RGWCreateBucket {
public:
  RGWCreateBucket_ObjStore() {}
  ~RGWCreateBucket_ObjStore() override {}
};

class RGWDeleteBucket_ObjStore : public RGWDeleteBucket {
public:
  RGWDeleteBucket_ObjStore() {}
  ~RGWDeleteBucket_ObjStore() override {}
};
+
// REST-layer PUT-object base: supplies the generic parameter
// validation/parsing and the request-body reader shared by the
// protocol frontends (implemented in rgw_rest.cc).
class RGWPutObj_ObjStore : public RGWPutObj
{
public:
  RGWPutObj_ObjStore() {}
  ~RGWPutObj_ObjStore() override {}

  int verify_params() override;
  int get_params(optional_yield y) override;
  int get_data(bufferlist& bl) override;
};
+
// REST-layer base for browser-style POST object uploads
// (multipart/form-data): provides the MIME-boundary reading and
// form-part parsing helpers shared by the S3 and Swift frontends.
class RGWPostObj_ObjStore : public RGWPostObj
{
  std::string boundary;  // MIME boundary taken from the Content-Type header

public:
  // a single form field: its value plus any header parameters
  struct post_part_field {
    std::string val;
    std::map<std::string, std::string> params;
  };

  // one part of the multipart/form-data body
  struct post_form_part {
    std::string name;
    std::map<std::string, post_part_field, ltstr_nocase> fields;
    ceph::bufferlist data;
  };

protected:
  using parts_collection_t = \
    std::map<std::string, post_form_part, const ltstr_nocase>;

  std::string err_msg;
  ceph::bufferlist in_data;

  // read up to `max` bytes until the boundary; reports whether the
  // boundary and/or the end of input was reached
  int read_with_boundary(ceph::bufferlist& bl,
                         uint64_t max,
                         bool check_eol,
                         bool& reached_boundary,
                         bool& done);

  int read_line(ceph::bufferlist& bl,
                uint64_t max,
                bool& reached_boundary,
                bool& done);

  int read_data(ceph::bufferlist& bl,
                uint64_t max,
                bool& reached_boundary,
                bool& done);

  // parse one part's header lines into *part
  int read_form_part_header(struct post_form_part *part, bool& done);

  int get_params(optional_yield y) override;

  // parse a "Name: value; p1=v1; ..." header line
  static int parse_part_field(const std::string& line,
                              std::string& field_name, /* out */
                              post_part_field& field); /* out */

  static void parse_boundary_params(const std::string& params_str,
                                    std::string& first,
                                    std::map<std::string, std::string>& params);

  // lookup helpers over a parsed parts collection
  static bool part_str(parts_collection_t& parts,
                       const std::string& name,
                       std::string *val);

  static std::string get_part_str(parts_collection_t& parts,
                                  const std::string& name,
                                  const std::string& def_val = std::string());

  static bool part_bl(parts_collection_t& parts,
                      const std::string& name,
                      ceph::bufferlist *pbl);

public:
  RGWPostObj_ObjStore() {}
  ~RGWPostObj_ObjStore() override {}

  int verify_params() override;
};
+
+
// REST-layer bases for metadata updates, object delete, and the
// crossdomain/healthcheck endpoints.  No behavior is added here.

class RGWPutMetadataAccount_ObjStore : public RGWPutMetadataAccount
{
public:
  RGWPutMetadataAccount_ObjStore() {}
  ~RGWPutMetadataAccount_ObjStore() override {}
};

class RGWPutMetadataBucket_ObjStore : public RGWPutMetadataBucket
{
public:
  RGWPutMetadataBucket_ObjStore() {}
  ~RGWPutMetadataBucket_ObjStore() override {}
};

class RGWPutMetadataObject_ObjStore : public RGWPutMetadataObject
{
public:
  RGWPutMetadataObject_ObjStore() {}
  ~RGWPutMetadataObject_ObjStore() override {}
};

class RGWDeleteObj_ObjStore : public RGWDeleteObj {
public:
  RGWDeleteObj_ObjStore() {}
  ~RGWDeleteObj_ObjStore() override {}
};

class RGWGetCrossDomainPolicy_ObjStore : public RGWGetCrossDomainPolicy {
public:
  RGWGetCrossDomainPolicy_ObjStore() = default;
  ~RGWGetCrossDomainPolicy_ObjStore() override = default;
};

class RGWGetHealthCheck_ObjStore : public RGWGetHealthCheck {
public:
  RGWGetHealthCheck_ObjStore() = default;
  ~RGWGetHealthCheck_ObjStore() override = default;
};
+
// REST-layer bases for copy, ACL, lifecycle (LC), and CORS ops.  The
// Put* variants override get_params() to read their XML documents
// (implemented in rgw_rest.cc); the rest add no behavior.

class RGWCopyObj_ObjStore : public RGWCopyObj {
public:
  RGWCopyObj_ObjStore() {}
  ~RGWCopyObj_ObjStore() override {}
};

class RGWGetACLs_ObjStore : public RGWGetACLs {
public:
  RGWGetACLs_ObjStore() {}
  ~RGWGetACLs_ObjStore() override {}
};

class RGWPutACLs_ObjStore : public RGWPutACLs {
public:
  RGWPutACLs_ObjStore() {}
  ~RGWPutACLs_ObjStore() override {}

  int get_params(optional_yield y) override;
};

class RGWGetLC_ObjStore : public RGWGetLC {
public:
  RGWGetLC_ObjStore() {}
  ~RGWGetLC_ObjStore() override {}
};

class RGWPutLC_ObjStore : public RGWPutLC {
public:
  RGWPutLC_ObjStore() {}
  ~RGWPutLC_ObjStore() override {}

  int get_params(optional_yield y) override;
};

class RGWDeleteLC_ObjStore : public RGWDeleteLC {
public:
  RGWDeleteLC_ObjStore() {}
  ~RGWDeleteLC_ObjStore() override {}

};

class RGWGetCORS_ObjStore : public RGWGetCORS {
public:
  RGWGetCORS_ObjStore() {}
  ~RGWGetCORS_ObjStore() override {}
};

class RGWPutCORS_ObjStore : public RGWPutCORS {
public:
  RGWPutCORS_ObjStore() {}
  ~RGWPutCORS_ObjStore() override {}
};

class RGWDeleteCORS_ObjStore : public RGWDeleteCORS {
public:
  RGWDeleteCORS_ObjStore() {}
  ~RGWDeleteCORS_ObjStore() override {}
};

class RGWOptionsCORS_ObjStore : public RGWOptionsCORS {
public:
  RGWOptionsCORS_ObjStore() {}
  ~RGWOptionsCORS_ObjStore() override {}
};
+
// REST-layer bases for the bucket encryption configuration ops.

class RGWGetBucketEncryption_ObjStore : public RGWGetBucketEncryption {
public:
  RGWGetBucketEncryption_ObjStore() {}
  ~RGWGetBucketEncryption_ObjStore() override {}
};

class RGWPutBucketEncryption_ObjStore : public RGWPutBucketEncryption {
public:
  RGWPutBucketEncryption_ObjStore() {}
  ~RGWPutBucketEncryption_ObjStore() override {}
};

class RGWDeleteBucketEncryption_ObjStore : public RGWDeleteBucketEncryption {
public:
  RGWDeleteBucketEncryption_ObjStore() {}
  ~RGWDeleteBucketEncryption_ObjStore() override {}
};
+
// REST-layer bases for the multipart-upload ops.  Complete/List
// variants override get_params() (implemented in rgw_rest.cc) to parse
// query arguments and request bodies.

class RGWInitMultipart_ObjStore : public RGWInitMultipart {
public:
  RGWInitMultipart_ObjStore() {}
  ~RGWInitMultipart_ObjStore() override {}
};

class RGWCompleteMultipart_ObjStore : public RGWCompleteMultipart {
public:
  RGWCompleteMultipart_ObjStore() {}
  ~RGWCompleteMultipart_ObjStore() override {}

  int get_params(optional_yield y) override;
};

class RGWAbortMultipart_ObjStore : public RGWAbortMultipart {
public:
  RGWAbortMultipart_ObjStore() {}
  ~RGWAbortMultipart_ObjStore() override {}
};

class RGWListMultipart_ObjStore : public RGWListMultipart {
public:
  RGWListMultipart_ObjStore() {}
  ~RGWListMultipart_ObjStore() override {}

  int get_params(optional_yield y) override;
};

class RGWListBucketMultiparts_ObjStore : public RGWListBucketMultiparts {
public:
  RGWListBucketMultiparts_ObjStore() {}
  ~RGWListBucketMultiparts_ObjStore() override {}

  int get_params(optional_yield y) override;
};
+
+class RGWBulkDelete_ObjStore : public RGWBulkDelete {
+public:
+ RGWBulkDelete_ObjStore() {}
+ ~RGWBulkDelete_ObjStore() override {}
+};
+
+// REST/ObjStore specialization of the bulk-upload op.  The destructor is
+// marked 'override' for consistency with the sibling *_ObjStore classes
+// (it overrides the base op class's virtual destructor).
+class RGWBulkUploadOp_ObjStore : public RGWBulkUploadOp {
+public:
+  RGWBulkUploadOp_ObjStore() = default;
+  ~RGWBulkUploadOp_ObjStore() override = default;
+};
+
+// REST/ObjStore specialization of multi-object delete; get_params() is
+// defined in the frontend-specific .cc.
+class RGWDeleteMultiObj_ObjStore : public RGWDeleteMultiObj {
+public:
+  RGWDeleteMultiObj_ObjStore() {}
+  ~RGWDeleteMultiObj_ObjStore() override {}
+
+  int get_params(optional_yield y) override;
+};
+
+// REST/ObjStore specialization of the info op; adds no behavior.
+class RGWInfo_ObjStore : public RGWInfo {
+public:
+  RGWInfo_ObjStore() = default;
+  ~RGWInfo_ObjStore() override = default;
+};
+
+// REST/ObjStore specialization of put-bucket-object-lock; get_params() is
+// defined in the frontend-specific .cc.  The destructor is marked
+// 'override' for consistency with the sibling *_ObjStore classes.
+class RGWPutBucketObjectLock_ObjStore : public RGWPutBucketObjectLock {
+public:
+  RGWPutBucketObjectLock_ObjStore() = default;
+  ~RGWPutBucketObjectLock_ObjStore() override = default;
+  int get_params(optional_yield y) override;
+};
+
+// REST/ObjStore specialization of get-bucket-object-lock; adds no behavior.
+class RGWGetBucketObjectLock_ObjStore : public RGWGetBucketObjectLock {
+public:
+  RGWGetBucketObjectLock_ObjStore() = default;
+  ~RGWGetBucketObjectLock_ObjStore() override = default;
+};
+
+// REST/ObjStore specialization of put-object-retention; adds no behavior.
+class RGWPutObjRetention_ObjStore : public RGWPutObjRetention {
+public:
+  RGWPutObjRetention_ObjStore() = default;
+  ~RGWPutObjRetention_ObjStore() override = default;
+};
+
+// REST/ObjStore specialization of get-object-retention; adds no behavior.
+// The destructor is marked 'override' for consistency with the sibling
+// *_ObjStore classes.
+class RGWGetObjRetention_ObjStore : public RGWGetObjRetention {
+public:
+  RGWGetObjRetention_ObjStore() = default;
+  ~RGWGetObjRetention_ObjStore() override = default;
+};
+
+// REST/ObjStore specialization of put-object-legal-hold; get_params() is
+// defined in the frontend-specific .cc.
+class RGWPutObjLegalHold_ObjStore : public RGWPutObjLegalHold {
+public:
+  RGWPutObjLegalHold_ObjStore() = default;
+  ~RGWPutObjLegalHold_ObjStore() override = default;
+  int get_params(optional_yield y) override;
+};
+
+// REST/ObjStore specialization of get-object-legal-hold; adds no behavior.
+// The destructor is marked 'override' for consistency with the sibling
+// *_ObjStore classes.
+class RGWGetObjLegalHold_ObjStore : public RGWGetObjLegalHold {
+public:
+  RGWGetObjLegalHold_ObjStore() = default;
+  ~RGWGetObjLegalHold_ObjStore() override = default;
+};
+
+// Base class for REST (admin-style) ops: wires an RGWRESTFlusher into the
+// op lifecycle and routes permission checks through user caps.
+class RGWRESTOp : public RGWOp {
+protected:
+  RGWRESTFlusher flusher;
+
+public:
+  void init(rgw::sal::Driver* driver, req_state *s,
+            RGWHandler *dialect_handler) override {
+    RGWOp::init(driver, s, dialect_handler);
+    flusher.init(s, this);
+  }
+  void send_response() override;
+  // Default denies; subclasses override to grant based on the user's caps.
+  virtual int check_caps(const RGWUserCaps& caps)
+  { return -EPERM; } /* should be implemented by subclasses */
+  int verify_permission(optional_yield y) override;
+  // REST ops are billed to the admin dmclock client class.
+  dmc::client_id dmclock_client() override { return dmc::client_id::admin; }
+};
+
+// Base handler for REST dialects (S3, Swift, admin).  Subclasses map HTTP
+// verbs to ops via the op_*() hooks; the base provides name validation,
+// formatter allocation, and permission plumbing.
+class RGWHandler_REST : public RGWHandler {
+protected:
+
+  // Hooks return nullptr when the verb is unsupported by the dialect.
+  virtual bool is_obj_update_op() const { return false; }
+  virtual RGWOp *op_get() { return NULL; }
+  virtual RGWOp *op_put() { return NULL; }
+  virtual RGWOp *op_delete() { return NULL; }
+  virtual RGWOp *op_head() { return NULL; }
+  virtual RGWOp *op_post() { return NULL; }
+  virtual RGWOp *op_copy() { return NULL; }
+  virtual RGWOp *op_options() { return NULL; }
+
+public:
+  static int allocate_formatter(req_state *s, RGWFormat default_formatter,
+				bool configurable);
+
+  // Upper bounds enforced by validate_bucket_name()/validate_object_name().
+  static constexpr int MAX_BUCKET_NAME_LEN = 255;
+  static constexpr int MAX_OBJ_NAME_LEN = 1024;
+
+  RGWHandler_REST() {}
+  ~RGWHandler_REST() override {}
+
+  static int validate_bucket_name(const std::string& bucket);
+  static int validate_object_name(const std::string& object);
+  static int reallocate_formatter(req_state *s, RGWFormat type);
+
+  int init_permissions(RGWOp* op, optional_yield y) override;
+  int read_permissions(RGWOp* op, optional_yield y) override;
+
+  // get_op() selects the op for the current request; put_op() releases it.
+  virtual RGWOp* get_op(void);
+  virtual void put_op(RGWOp* op);
+};
+
+class RGWHandler_REST_SWIFT;
+class RGWHandler_SWIFT_Auth;
+class RGWHandler_REST_S3;
+
+namespace rgw::auth {
+
+class StrategyRegistry;
+
+}
+
+/* Routing table for REST resources.  Sub-managers are registered per URI
+ * resource; get_manager() resolves a request URI to the manager that should
+ * produce its handler, falling back to default_mgr when set. */
+class RGWRESTMgr {
+  bool should_log;
+
+protected:
+  std::map<std::string, RGWRESTMgr*> resource_mgrs;
+  // resource names keyed by length — presumably so lookup can prefer the
+  // longest matching prefix; confirm in get_resource_mgr()
+  std::multimap<size_t, std::string> resources_by_size;
+  RGWRESTMgr* default_mgr;
+
+  virtual RGWRESTMgr* get_resource_mgr(req_state* s,
+                                       const std::string& uri,
+                                       std::string* out_uri);
+
+  /* Fallback used when no registered resource matches.  (Parameter renamed
+   * from the misspelled 'our_uri' to 'out_uri' to match get_resource_mgr.) */
+  virtual RGWRESTMgr* get_resource_mgr_as_default(req_state* const s,
+                                                  const std::string& uri,
+                                                  std::string* out_uri) {
+    return this;
+  }
+
+public:
+  RGWRESTMgr()
+    : should_log(false),
+      default_mgr(nullptr) {
+  }
+  virtual ~RGWRESTMgr();
+
+  void register_resource(std::string resource, RGWRESTMgr* mgr);
+  void register_default_mgr(RGWRESTMgr* mgr);
+
+  virtual RGWRESTMgr* get_manager(req_state* const s,
+                                  /* Prefix to be concatenated with @uri
+                                   * during the lookup. */
+                                  const std::string& frontend_prefix,
+                                  const std::string& uri,
+                                  std::string* out_uri) final {
+    return get_resource_mgr(s, frontend_prefix + uri, out_uri);
+  }
+
+  virtual RGWHandler_REST* get_handler(
+    rgw::sal::Driver* driver,
+    req_state* const s,
+    const rgw::auth::StrategyRegistry& auth_registry,
+    const std::string& frontend_prefix
+  ) {
+    return nullptr;
+  }
+
+  /* By default the manager owns handlers it vends and frees them here. */
+  virtual void put_handler(RGWHandler_REST* const handler) {
+    delete handler;
+  }
+
+  void set_logging(bool _should_log) {
+    should_log = _should_log;
+  }
+
+  bool get_logging() const {
+    return should_log;
+  }
+};
+
+class RGWLibIO;
+class RGWRestfulIO;
+
+// Top-level REST dispatcher: owns the root RGWRESTMgr, preprocesses
+// requests, and tracks which x-* headers should be logged.
+class RGWREST {
+  using x_header = basic_sstring<char, uint16_t, 32>;
+  boost::container::flat_set<x_header> x_headers;
+  RGWRESTMgr mgr;
+
+  static int preprocess(req_state *s, rgw::io::BasicClient* rio);
+public:
+  RGWREST() {}
+  RGWHandler_REST *get_handler(rgw::sal::Driver* driver,
+                               req_state *s,
+                               const rgw::auth::StrategyRegistry& auth_registry,
+                               const std::string& frontend_prefix,
+                               RGWRestfulIO *rio,
+                               RGWRESTMgr **pmgr,
+                               int *init_error);
+#if 0
+  // Dead code kept from an earlier RGWLibIO-based interface.
+  RGWHandler *get_handler(RGWRados *driver, req_state *s,
+                          RGWLibIO *io, RGWRESTMgr **pmgr,
+                          int *init_error);
+#endif
+
+  void put_handler(RGWHandler_REST *handler) {
+    mgr.put_handler(handler);
+  }
+
+  // Empty resource names are ignored unless register_empty is set.
+  void register_resource(std::string resource, RGWRESTMgr *m,
+                         bool register_empty = false) {
+    if (!register_empty && resource.empty())
+      return;
+
+    mgr.register_resource(resource, m);
+  }
+
+  void register_default_mgr(RGWRESTMgr *m) {
+    mgr.register_default_mgr(m);
+  }
+
+  void register_x_headers(const std::string& headers);
+
+  bool log_x_headers(void) {
+    return (x_headers.size() > 0);
+  }
+
+  bool log_x_header(const std::string& header) {
+    return (x_headers.find(header) != x_headers.end());
+  }
+};
+
+static constexpr int64_t NO_CONTENT_LENGTH = -1;
+static constexpr int64_t CHUNKED_TRANSFER_ENCODING = -2;
+
+extern void dump_errno(int http_ret, std::string& out);
+extern void dump_errno(const struct rgw_err &err, std::string& out);
+extern void dump_errno(req_state *s);
+extern void dump_errno(req_state *s, int http_ret);
+extern void end_header(req_state *s,
+ RGWOp* op = nullptr,
+ const char *content_type = nullptr,
+ const int64_t proposed_content_length =
+ NO_CONTENT_LENGTH,
+ bool force_content_type = false,
+ bool force_no_error = false);
+extern void dump_start(req_state *s);
+extern void list_all_buckets_start(req_state *s);
+extern void dump_owner(req_state *s, const rgw_user& id,
+ const std::string& name, const char *section = NULL);
+extern void dump_header(req_state* s,
+ const std::string_view& name,
+ const std::string_view& val);
+extern void dump_header(req_state* s,
+ const std::string_view& name,
+ ceph::buffer::list& bl);
+extern void dump_header(req_state* s,
+ const std::string_view& name,
+ long long val);
+extern void dump_header(req_state* s,
+ const std::string_view& name,
+ const utime_t& val);
+
+/* Dump an HTTP header whose name is name_prefix + name; the remaining
+ * arguments are forwarded to dump_header().  The name is built in a
+ * std::string rather than a variable-length array (VLAs are a non-standard
+ * extension in C++). */
+template <class... Args>
+inline void dump_header_prefixed(req_state* s,
+                                 const std::string_view& name_prefix,
+                                 const std::string_view& name,
+                                 Args&&... args) {
+  std::string full_name;
+  full_name.reserve(name_prefix.size() + name.size());
+  full_name.append(name_prefix);
+  full_name.append(name);
+  return dump_header(s, std::string_view(full_name), std::forward<Args>(args)...);
+}
+
+/* Dump an HTTP header whose name is prefix + infix + sufix; the remaining
+ * arguments are forwarded to dump_header().  The name is built in a
+ * std::string rather than a variable-length array (VLAs are a non-standard
+ * extension in C++). */
+template <class... Args>
+inline void dump_header_infixed(req_state* s,
+                                const std::string_view& prefix,
+                                const std::string_view& infix,
+                                const std::string_view& sufix,
+                                Args&&... args) {
+  std::string full_name;
+  full_name.reserve(prefix.size() + infix.size() + sufix.size());
+  full_name.append(prefix);
+  full_name.append(infix);
+  full_name.append(sufix);
+  return dump_header(s, std::string_view(full_name), std::forward<Args>(args)...);
+}
+
+/* Dump an HTTP header with the value wrapped in double quotes.  The quoted
+ * value is built in a std::string rather than a variable-length array
+ * (VLAs are a non-standard extension in C++). */
+template <class... Args>
+inline void dump_header_quoted(req_state* s,
+                               const std::string_view& name,
+                               const std::string_view& val) {
+  /* We need two extra bytes for quotes. */
+  std::string quoted;
+  quoted.reserve(val.size() + 2);
+  quoted.push_back('"');
+  quoted.append(val);
+  quoted.push_back('"');
+  return dump_header(s, name, std::string_view(quoted));
+}
+
+/* Dump the header only when both the name and the value are non-empty. */
+template <class ValueT>
+inline void dump_header_if_nonempty(req_state* s,
+                                    const std::string_view& name,
+                                    const ValueT& value) {
+  if (name.empty() || value.length() == 0) {
+    return;
+  }
+  dump_header(s, name, value);
+}
+
+/* Compute the request's base URI: the configured domain when present,
+ * otherwise scheme + host reconstructed from the request environment. */
+inline std::string compute_domain_uri(const req_state *s) {
+  if (!s->info.domain.empty()) {
+    return s->info.domain;
+  }
+
+  RGWEnv const &env(*(s->info.env));
+  std::string uri =
+    env.get("SERVER_PORT_SECURE") ? "https://" : "http://";
+  if (env.exists("SERVER_NAME")) {
+    uri.append(env.get("SERVER_NAME", "<SERVER_NAME>"));
+  } else {
+    uri.append(env.get("HTTP_HOST", "<HTTP_HOST>"));
+  }
+  return uri;
+}
+
+extern void dump_content_length(req_state *s, uint64_t len);
+extern int64_t parse_content_length(const char *content_length);
+extern void dump_etag(req_state *s,
+ const std::string_view& etag,
+ bool quoted = false);
+extern void dump_epoch_header(req_state *s, const char *name, real_time t);
+extern void dump_time_header(req_state *s, const char *name, real_time t);
+extern void dump_last_modified(req_state *s, real_time t);
+extern void abort_early(req_state* s, RGWOp* op, int err,
+ RGWHandler* handler, optional_yield y);
+extern void dump_range(req_state* s, uint64_t ofs, uint64_t end,
+ uint64_t total_size);
+extern void dump_continue(req_state *s);
+extern void list_all_buckets_end(req_state *s);
+extern void dump_time(req_state *s, const char *name, real_time t);
+extern std::string dump_time_to_str(const real_time& t);
+extern void dump_bucket_from_state(req_state *s);
+extern void dump_redirect(req_state *s, const std::string& redirect);
+extern bool is_valid_url(const char *url);
+extern void dump_access_control(req_state *s, const char *origin,
+ const char *meth,
+ const char *hdr, const char *exp_hdr,
+ uint32_t max_age);
+extern void dump_access_control(req_state *s, RGWOp *op);
+
+extern int dump_body(req_state* s, const char* buf, size_t len);
+extern int dump_body(req_state* s, /* const */ ceph::buffer::list& bl);
+extern int dump_body(req_state* s, const std::string& str);
+extern int recv_body(req_state* s, char* buf, size_t max);
diff --git a/src/rgw/rgw_rest_admin.h b/src/rgw/rgw_rest_admin.h
new file mode 100644
index 000000000..91230af6c
--- /dev/null
+++ b/src/rgw/rgw_rest_admin.h
@@ -0,0 +1,12 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw/rgw_rest.h"
+
+// REST manager for the admin API; behavior comes entirely from RGWRESTMgr.
+class RGWRESTMgr_Admin : public RGWRESTMgr {
+public:
+  RGWRESTMgr_Admin() = default;
+  ~RGWRESTMgr_Admin() override = default;
+};
diff --git a/src/rgw/rgw_rest_client.cc b/src/rgw/rgw_rest_client.cc
new file mode 100644
index 000000000..b0b8fcc84
--- /dev/null
+++ b/src/rgw/rgw_rest_client.cc
@@ -0,0 +1,1124 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_common.h"
+#include "rgw_rest_client.h"
+#include "rgw_auth_s3.h"
+#include "rgw_http_errors.h"
+
+#include "common/armor.h"
+#include "common/strtol.h"
+#include "include/str_list.h"
+#include "rgw_crypt_sanitize.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+int RGWHTTPSimpleRequest::get_status()
+{
+  // A transport-level failure (negative retcode) takes precedence over the
+  // HTTP status mapped from the response.
+  const int retcode = get_req_retcode();
+  return (retcode < 0) ? retcode : status;
+}
+
+int RGWHTTPSimpleRequest::handle_header(const string& name, const string& val)
+{
+  // Only CONTENT_LENGTH matters here: it caps how much response body
+  // receive_data() will buffer.
+  if (name != "CONTENT_LENGTH") {
+    return 0;
+  }
+
+  string err;
+  const long len = strict_strtol(val.c_str(), 10, &err);
+  if (!err.empty()) {
+    ldpp_dout(this, 0) << "ERROR: failed converting content length (" << val << ") to int " << dendl;
+    return -EINVAL;
+  }
+
+  max_response = len;
+  return 0;
+}
+
+// Curl header callback: accumulates bytes into a line buffer, and on each
+// '\n' parses either the HTTP status line or a "Name: value" header.
+// Parsed headers are stored (name upper-cased, '-' mapped to '_') in
+// out_headers and forwarded to handle_header().
+int RGWHTTPSimpleRequest::receive_header(void *ptr, size_t len)
+{
+  unique_lock guard(out_headers_lock);
+
+  char line[len + 1];
+
+  char *s = (char *)ptr, *end = (char *)ptr + len;
+  char *p = line;
+  ldpp_dout(this, 30) << "receive_http_header" << dendl;
+
+  // NOTE(review): 'p' is never reset to 'line' after a completed line, so
+  // this relies on curl delivering one header line per callback — confirm.
+  while (s != end) {
+    if (*s == '\r') {
+      s++;
+      continue;
+    }
+    if (*s == '\n') {
+      *p = '\0';
+      ldpp_dout(this, 30) << "received header:" << line << dendl;
+      // TODO: fill whatever data required here
+      char *l = line;
+      char *tok = strsep(&l, " \t:");
+      if (tok && l) {
+        // skip whitespace between separator and value
+        while (*l == ' ')
+          l++;
+ 
+        if (strcmp(tok, "HTTP") == 0 || strncmp(tok, "HTTP/", 5) == 0) {
+          http_status = atoi(l);
+          if (http_status == 100) /* 100-continue response */
+            continue;
+          status = rgw_http_error_to_errno(http_status);
+        } else {
+          /* convert header field name to upper case */
+          char *src = tok;
+          char buf[len + 1];
+          size_t i;
+          for (i = 0; i < len && *src; ++i, ++src) {
+            switch (*src) {
+              case '-':
+                buf[i] = '_';
+                break;
+              default:
+                buf[i] = toupper(*src);
+            }
+          }
+          buf[i] = '\0';
+          out_headers[buf] = l;
+          int r = handle_header(buf, l);
+          if (r < 0)
+            return r;
+        }
+      }
+    }
+    if (s != end)
+      *p++ = *s++;
+  }
+  return 0;
+}
+
+// Current time in the asctime-style format used for AWS v2 signing.
+static void get_new_date_str(string& date_str)
+{
+  date_str = rgw_to_asctime(ceph_clock_now());
+}
+
+// Current time as an RFC-1123-style date rendered in UTC.
+// NOTE(review): '%z' after gmtime_r yields "+0000" rather than the literal
+// "GMT" conventional for HTTP Date headers — confirm peers accept it.
+static void get_gmt_date_str(string& date_str)
+{
+  auto now_time = ceph::real_clock::now();
+  time_t rawtime = ceph::real_clock::to_time_t(now_time);
+
+  char buffer[80];
+
+  struct tm timeInfo;
+  gmtime_r(&rawtime, &timeInfo);
+  strftime(buffer, sizeof(buffer), "%a, %d %b %Y %H:%M:%S %z", &timeInfo);
+
+  date_str = buffer;
+}
+
+int RGWHTTPSimpleRequest::send_data(void *ptr, size_t len, bool* pause)
+{
+  // Nothing to send unless a request-body iterator has been armed.
+  if (!send_iter) {
+    return 0;
+  }
+
+  // Never copy past the end of the buffered request body.
+  size_t to_copy = len;
+  const size_t remaining = send_iter->get_remaining();
+  if (to_copy > remaining) {
+    to_copy = remaining;
+  }
+
+  send_iter->copy(to_copy, static_cast<char *>(ptr));
+  return to_copy;
+}
+
+int RGWHTTPSimpleRequest::receive_data(void *ptr, size_t len, bool *pause)
+{
+  // Buffer at most max_response bytes; anything beyond that is dropped.
+  const size_t have = response.length();
+  if (have >= max_response) {
+    return 0; /* don't read extra data */
+  }
+
+  const size_t room = max_response - have;
+  const size_t cp_len = (len > room) ? room : len;
+  bufferptr p(static_cast<char *>(ptr), cp_len);
+  response.append(p);
+
+  return 0;
+}
+
+// Append one url-encoded "name" or "name=value" query parameter to dest,
+// starting the query string with '?' and chaining later params with '&'.
+static void append_param(string& dest, const string& name, const string& val)
+{
+  dest.append(dest.empty() ? "?" : "&");
+
+  string url_name;
+  url_encode(name, url_name);
+  dest.append(url_name);
+
+  if (val.empty()) {
+    return;
+  }
+  string url_val;
+  url_encode(val, url_val);
+  dest.append("=");
+  dest.append(url_val);
+}
+
+// Render extra_args followed by params into dest as a query string.
+static void do_get_params_str(const param_vec_t& params, map<string, string>& extra_args, string& dest)
+{
+  for (const auto& [name, val] : extra_args) {
+    append_param(dest, name, val);
+  }
+  for (const auto& [name, val] : params) {
+    append_param(dest, name, val);
+  }
+}
+
+// Render this request's params plus extra_args as a query string into dest.
+void RGWHTTPSimpleRequest::get_params_str(map<string, string>& extra_args, string& dest)
+{
+  do_get_params_str(params, extra_args, dest);
+}
+
+// Hand the collected response headers to the caller and reset our copy.
+void RGWHTTPSimpleRequest::get_out_headers(map<string, string> *pheaders)
+{
+  unique_lock guard(out_headers_lock);
+  *pheaders = std::move(out_headers);
+  out_headers.clear();
+}
+
+// Sign the request with AWS Signature V2: build the S3 canonical header
+// string, HMAC it with the secret key, and set the AUTHORIZATION header.
+// Returns 0 on success (including the no-key no-op case).
+static int sign_request_v2(const DoutPrefixProvider *dpp, const RGWAccessKey& key,
+                           const string& region, const string& service,
+                           RGWEnv& env, req_info& info,
+                           const bufferlist *opt_content)
+{
+  /* don't sign if no key is provided */
+  if (key.key.empty()) {
+    return 0;
+  }
+
+  auto cct = dpp->get_cct();
+
+  // Dump the environment at high debug, with sensitive values sanitized.
+  if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
+    for (const auto& i: env.get_map()) {
+      ldpp_dout(dpp, 20) << __func__ << "():> " << i.first << " -> " << rgw::crypt_sanitize::x_meta_map{i.first, i.second} << dendl;
+    }
+  }
+
+  string canonical_header;
+  if (!rgw_create_s3_canonical_header(dpp, info, NULL, canonical_header, false)) {
+    ldpp_dout(dpp, 0) << "failed to create canonical s3 header" << dendl;
+    return -EINVAL;
+  }
+
+  ldpp_dout(dpp, 10) << "generated canonical header: " << canonical_header << dendl;
+
+  string digest;
+  try {
+    digest = rgw::auth::s3::get_v2_signature(cct, key.key, canonical_header);
+  } catch (int ret) {
+    return ret;
+  }
+
+  string auth_hdr = "AWS " + key.id + ":" + digest;
+  ldpp_dout(dpp, 15) << "generated auth header: " << auth_hdr << dendl;
+
+  env.set("AUTHORIZATION", auth_hdr);
+
+  return 0;
+}
+
+// Sign the request with AWS Signature V4 and install the resulting headers
+// into the outgoing environment.  Returns 0 on success (including the
+// no-key no-op case).
+static int sign_request_v4(const DoutPrefixProvider *dpp, const RGWAccessKey& key,
+                           const string& region, const string& service,
+                           RGWEnv& env, req_info& info,
+                           const bufferlist *opt_content)
+{
+  /* don't sign if no key is provided */
+  if (key.key.empty()) {
+    return 0;
+  }
+
+  auto cct = dpp->get_cct();
+
+  // Dump the environment at high debug, with sensitive values sanitized.
+  if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
+    for (const auto& i: env.get_map()) {
+      ldpp_dout(dpp, 20) << __func__ << "():> " << i.first << " -> " << rgw::crypt_sanitize::x_meta_map{i.first, i.second} << dendl;
+    }
+  }
+
+  // The final argument to prepare() differs only by service: true for "s3".
+  const bool s3_flag = (service == "s3");
+  auto sigv4_data = rgw::auth::s3::AWSSignerV4::prepare(dpp, key.id, region, service, info, opt_content, s3_flag);
+  auto sigv4_headers = sigv4_data.signature_factory(dpp, key.key, sigv4_data);
+
+  for (auto& [hdr_name, hdr_val] : sigv4_headers) {
+    ldpp_dout(dpp, 20) << __func__ << "(): sigv4 header: " << hdr_name << ": " << hdr_val << dendl;
+    env.set(hdr_name, hdr_val);
+  }
+
+  return 0;
+}
+
+// Dispatch to v2 or v4 signing based on the rgw_s3_client_max_sig_ver
+// config: a value in [1, 3] selects v2, anything else selects v4.
+static int sign_request(const DoutPrefixProvider *dpp, const RGWAccessKey& key,
+                        const string& region, const string& service,
+                        RGWEnv& env, req_info& info,
+                        const bufferlist *opt_content)
+{
+  auto authv = dpp->get_cct()->_conf.get_val<int64_t>("rgw_s3_client_max_sig_ver");
+  if (authv > 0 &&
+      authv <= 3) {
+    return sign_request_v2(dpp, key, region, service, env, info, opt_content);
+  }
+
+  return sign_request_v4(dpp, key, region, service, env, info, opt_content);
+}
+
+// Map an endpoint host-name component to a region name: plain "s3" means
+// the default region, "s3-<region>" strips the prefix, anything else is
+// returned unchanged.
+static string extract_region_name(string&& s)
+{
+  if (s == "s3") {
+    return "us-east-1";
+  }
+  if (s.rfind("s3-", 0) == 0) {
+    return s.substr(3);
+  }
+  return std::move(s);
+}
+
+
+// Derive the SigV4 region (and possibly service) from an AWS endpoint host
+// name.  Returns false when the host is not an amazonaws.com endpoint or no
+// region component could be found.  (Removed a dead local that copied
+// 'service' but was never read.)
+static bool identify_scope(const DoutPrefixProvider *dpp,
+                           CephContext *cct,
+                           const string& host,
+                           string *region,
+                           string& service)
+{
+  if (!boost::algorithm::ends_with(host, "amazonaws.com")) {
+    ldpp_dout(dpp, 20) << "NOTICE: cannot identify region for connection to: " << host << dendl;
+    return false;
+  }
+
+  vector<string> vec;
+
+  get_str_vec(host, ".", vec);
+
+  if (service.empty()) {
+    service = "s3"; /* default */
+  }
+
+  for (auto iter = vec.begin(); iter != vec.end(); ++iter) {
+    auto& s = *iter;
+    if (s == "s3" ||
+        s == "execute-api" ||
+        s == "iam") {
+      if (s == "execute-api") {
+        service = s;
+      }
+      // The region (or "amazonaws" for the legacy global endpoint) is the
+      // next host-name component.
+      ++iter;
+      if (iter == vec.end()) {
+        ldpp_dout(dpp, 0) << "WARNING: cannot identify region name from host name: " << host << dendl;
+        return false;
+      }
+      auto& next = *iter;
+      if (next == "amazonaws") {
+        *region = "us-east-1";
+        return true;
+      }
+      *region = next;
+      return true;
+    } else if (boost::algorithm::starts_with(s, "s3-")) {
+      // "s3-<region>" style endpoint component.
+      *region = extract_region_name(std::move(s));
+      return true;
+    }
+  }
+
+  return false;
+}
+
+// Resolve the signing scope (region + service).  An explicit api_name names
+// the region directly (service defaults to s3); otherwise the host name is
+// probed, falling back to the local zonegroup when that fails.
+static void scope_from_api_name(const DoutPrefixProvider *dpp,
+                                CephContext *cct,
+                                const string& host,
+                                std::optional<string> api_name,
+                                string *region,
+                                string& service)
+{
+  if (api_name && service.empty()) {
+    *region = *api_name;
+    service = "s3";
+    return;
+  }
+
+  if (identify_scope(dpp, cct, host, region, service)) {
+    return;
+  }
+
+  // Fallback: local zonegroup; only non-iam callers get the s3 default.
+  *region = cct->_conf->rgw_zonegroup;
+  if (service != "iam") {
+    service = "s3";
+  }
+}
+
+// Re-sign and forward an incoming request to a remote endpoint: rebuilds
+// the request info with a url-encoded bucket, signs it for the resolved
+// region/service, sends the body from inbl, and returns the remote HTTP
+// status (negative errno on transport failure).  The max_response
+// parameter shadows the member of the same name; the member is what
+// receive_data() consults — NOTE(review): confirm that is intended.
+int RGWRESTSimpleRequest::forward_request(const DoutPrefixProvider *dpp, const RGWAccessKey& key, req_info& info, size_t max_response, bufferlist *inbl, bufferlist *outbl, optional_yield y, std::string service)
+{
+
+  string date_str;
+  get_new_date_str(date_str);
+
+  RGWEnv new_env;
+  req_info new_info(cct, &new_env);
+  new_info.rebuild_from(info);
+  // url-encode the bucket (first path component); the rest of the path is
+  // appended unchanged
+  string bucket_encode;
+  string request_uri_encode;
+  size_t pos = new_info.request_uri.substr(1, new_info.request_uri.size() - 1).find("/");
+  string bucket = new_info.request_uri.substr(1, pos);
+  url_encode(bucket, bucket_encode);
+  if (std::string::npos != pos)
+    request_uri_encode = string("/") + bucket_encode + new_info.request_uri.substr(pos + 1);
+  else
+    request_uri_encode = string("/") + bucket_encode;
+  new_info.request_uri = request_uri_encode;
+
+  for (auto& param : params) {
+    new_info.args.append(param.first, param.second);
+  }
+
+  new_env.set("HTTP_DATE", date_str.c_str());
+  // preserve the original body checksum headers for the re-signed request
+  const char* const content_md5 = info.env->get("HTTP_CONTENT_MD5");
+  if (content_md5) {
+    new_env.set("HTTP_CONTENT_MD5", content_md5);
+  }
+
+  string region;
+  string s;
+  if (!service.empty()) {
+    s = service;
+  }
+
+  scope_from_api_name(dpp, cct, host, api_name, &region, s);
+
+  const char *maybe_payload_hash = info.env->get("HTTP_X_AMZ_CONTENT_SHA256");
+  if (maybe_payload_hash && s != "iam") {
+    new_env.set("HTTP_X_AMZ_CONTENT_SHA256", maybe_payload_hash);
+  }
+
+  int ret = sign_request(dpp, key, region, s, new_env, new_info, nullptr);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to sign request" << dendl;
+    return ret;
+  }
+
+  if (s == "iam") {
+    info.args.remove("PayloadHash");
+  }
+
+  // signed environment + x-meta headers become the outgoing header list
+  for (const auto& kv: new_env.get_map()) {
+    headers.emplace_back(kv);
+  }
+
+  meta_map_t& meta_map = new_info.x_meta_map;
+  for (const auto& kv: meta_map) {
+    headers.emplace_back(kv);
+  }
+
+  string params_str;
+  get_params_str(info.args.get_params(), params_str);
+
+  // join base url and resource without doubling the '/' separator
+  string new_url = url;
+  string& resource = new_info.request_uri;
+  string new_resource = resource;
+  if (new_url[new_url.size() - 1] == '/' && resource[0] == '/') {
+    new_url = new_url.substr(0, new_url.size() - 1);
+  } else if (resource[0] != '/') {
+    new_resource = "/";
+    new_resource.append(resource);
+  }
+  new_url.append(new_resource + params_str);
+
+  bufferlist::iterator bliter;
+
+  if (inbl) {
+    bliter = inbl->begin();
+    send_iter = &bliter;
+
+    set_send_length(inbl->length());
+  }
+
+  method = new_info.method;
+  url = new_url;
+
+  int r = process(y);
+  if (r < 0){
+    if (r == -EINVAL){
+      // curl_easy has errored, generally means the service is not available
+      r = -ERR_SERVICE_UNAVAILABLE;
+    }
+    return r;
+  }
+
+  response.append((char)0); /* NULL terminate response */
+
+  if (outbl) {
+    *outbl = std::move(response);
+  }
+
+  return status;
+}
+
+// Adapts object-iteration data callbacks into add_send_data() calls on a
+// streaming PUT request; does not own the request it feeds.
+class RGWRESTStreamOutCB : public RGWGetDataCB {
+  RGWRESTStreamS3PutObj *req;
+public:
+  explicit RGWRESTStreamOutCB(RGWRESTStreamS3PutObj *_req) : req(_req) {}
+  int handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) override; /* callback for object iteration when sending data */
+};
+
+int RGWRESTStreamOutCB::handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len)
+{
+  dout(20) << "RGWRESTStreamOutCB::handle_data bl.length()=" << bl.length() << " bl_ofs=" << bl_ofs << " bl_len=" << bl_len << dendl;
+  // Fast path: the whole bufferlist is the payload.
+  if (bl_ofs == 0 && bl_len == bl.length()) {
+    req->add_send_data(bl);
+    return 0;
+  }
+
+  // Otherwise wrap just the requested window in a fresh bufferlist.
+  bufferptr window(bl.c_str() + bl_ofs, bl_len);
+  bufferlist window_bl;
+  window_bl.push_back(window);
+
+  req->add_send_data(window_bl);
+  return 0;
+}
+
+// Release the output callback allocated in send_ready().
+RGWRESTStreamS3PutObj::~RGWRESTStreamS3PutObj()
+{
+  delete out_cb;
+}
+
+// Append one grantee to the comma-separated grant string for 'perm',
+// rendered as id=/uri=/emailAddress= depending on the grantee type.
+static void grants_by_type_add_one_grant(map<int, string>& grants_by_type, int perm, ACLGrant& grant)
+{
+  string& s = grants_by_type[perm];
+
+  if (!s.empty()) {
+    s.append(", ");
+  }
+
+  string id_type_str;
+  switch (grant.get_type().get_type()) {
+  case ACL_TYPE_GROUP:
+    id_type_str = "uri";
+    break;
+  case ACL_TYPE_EMAIL_USER:
+    id_type_str = "emailAddress";
+    break;
+  default:
+    id_type_str = "id";
+  }
+
+  rgw_user id;
+  grant.get_id(id);
+  s.append(id_type_str + "=\"" + id.to_str() + "\"");
+}
+
+// Maps an RGW permission bit to its x-amz-grant-* request header.
+struct grant_type_to_header {
+  int type;
+  const char *header;
+};
+
+// Permission-to-header table; the NULL-header entry terminates it.
+struct grant_type_to_header grants_headers_def[] = {
+  { RGW_PERM_FULL_CONTROL, "x-amz-grant-full-control"},
+  { RGW_PERM_READ,         "x-amz-grant-read"},
+  { RGW_PERM_WRITE,        "x-amz-grant-write"},
+  { RGW_PERM_READ_ACP,     "x-amz-grant-read-acp"},
+  { RGW_PERM_WRITE_ACP,    "x-amz-grant-write-acp"},
+  { 0, NULL}
+};
+
+// Record the grant under check_perm iff every bit of check_perm is set in
+// perm; returns whether the grant was recorded.
+static bool grants_by_type_check_perm(map<int, string>& grants_by_type, int perm, ACLGrant& grant, int check_perm)
+{
+  if ((perm & check_perm) != check_perm) {
+    return false;
+  }
+  grants_by_type_add_one_grant(grants_by_type, check_perm, grant);
+  return true;
+}
+
+// File the grant under the first (strongest) header whose permission bits
+// are fully contained in perm, following the table's order.
+static void grants_by_type_add_perm(map<int, string>& grants_by_type, int perm, ACLGrant& grant)
+{
+  for (auto *t = grants_headers_def; t->header; ++t) {
+    if (grants_by_type_check_perm(grants_by_type, perm, grant, t->type)) {
+      return;
+    }
+  }
+}
+
+// Emit one x-amz-grant-* header (env + x-meta map) per permission class
+// that collected grantees.
+static void add_grants_headers(map<int, string>& grants, RGWEnv& env, meta_map_t& meta_map)
+{
+  for (auto *t = grants_headers_def; t->header; ++t) {
+    auto iter = grants.find(t->type);
+    if (iter == grants.end()) {
+      continue;
+    }
+    env.set(t->header, iter->second);
+    meta_map[t->header] = iter->second;
+  }
+}
+
+// Builds the environment and request info for an outgoing signed request;
+// writes into the caller-owned env/info passed here.
+RGWRESTGenerateHTTPHeaders::RGWRESTGenerateHTTPHeaders(CephContext *_cct, RGWEnv *_env, req_info *_info) :
+  DoutPrefix(_cct, dout_subsys, "rest gen http headers: "),
+  cct(_cct),
+  new_env(_env),
+  new_info(_info) {
+}
+
+// Resolve the signing scope, assemble the final URL (resource + query
+// string), and seed the Date/Host headers and request-info fields.
+void RGWRESTGenerateHTTPHeaders::init(const string& _method, const string& host,
+                                      const string& resource_prefix, const string& _url,
+                                      const string& resource, const param_vec_t& params,
+                                      std::optional<string> api_name)
+{
+  scope_from_api_name(this, cct, host, api_name, &region, service);
+
+  string params_str;
+  map<string, string>& args = new_info->args.get_params();
+  do_get_params_str(params, args, params_str);
+
+  /* merge params with extra args so that we can sign correctly */
+  for (auto iter = params.begin(); iter != params.end(); ++iter) {
+    new_info->args.append(iter->first, iter->second);
+  }
+
+  url = _url + resource + params_str;
+
+  string date_str;
+  get_gmt_date_str(date_str);
+
+  new_env->set("HTTP_DATE", date_str.c_str());
+  new_env->set("HTTP_HOST", host);
+
+  method = _method;
+  new_info->method = method.c_str();
+  new_info->host = host;
+
+  new_info->script_uri = "/";
+  new_info->script_uri.append(resource_prefix);
+  new_info->script_uri.append(resource);
+  new_info->request_uri = new_info->script_uri;
+}
+
+// True when the header name starts with the "x-amz-" prefix.
+static bool is_x_amz(const string& s) {
+  return s.rfind("x-amz-", 0) == 0;
+}
+
+// Install caller-supplied headers into the outgoing environment; x-amz-*
+// headers are additionally recorded in x_meta_map.  Iterates by const
+// reference (the original copied every map entry per iteration).
+void RGWRESTGenerateHTTPHeaders::set_extra_headers(const map<string, string>& extra_headers)
+{
+  for (const auto& iter : extra_headers) {
+    const string& name = lowercase_dash_http_attr(iter.first);
+    new_env->set(name, iter.second.c_str());
+    if (is_x_amz(name)) {
+      new_info->x_meta_map[name] = iter.second;
+    }
+  }
+}
+
+// Translate RGW object attrs into outgoing headers: user-meta attrs become
+// x-amz-meta-* headers, and the stored ACL policy becomes grant headers.
+// NOTE(review): 'bl.c_str()' is consumed as a C string — assumes attr
+// values are NUL-terminated; confirm against how attrs are stored.
+int RGWRESTGenerateHTTPHeaders::set_obj_attrs(const DoutPrefixProvider *dpp, map<string, bufferlist>& rgw_attrs)
+{
+  map<string, string> new_attrs;
+
+  /* merge send headers */
+  for (auto& attr: rgw_attrs) {
+    bufferlist& bl = attr.second;
+    const string& name = attr.first;
+    string val = bl.c_str();
+    if (name.compare(0, sizeof(RGW_ATTR_META_PREFIX) - 1, RGW_ATTR_META_PREFIX) == 0) {
+      // rewrite the internal meta-attr prefix to the wire x-amz-meta- prefix
+      string header_name = RGW_AMZ_META_PREFIX;
+      header_name.append(name.substr(sizeof(RGW_ATTR_META_PREFIX) - 1));
+      new_attrs[header_name] = val;
+    }
+  }
+
+  RGWAccessControlPolicy policy;
+  int ret = rgw_policy_from_attrset(dpp, cct, rgw_attrs, &policy);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: couldn't get policy ret=" << ret << dendl;
+    return ret;
+  }
+
+  set_http_attrs(new_attrs);
+  set_policy(policy);
+
+  return 0;
+}
+
+// Install plain HTTP attrs: x-amz-* names go to both the environment and
+// the x-meta map; everything else only to the environment.
+void RGWRESTGenerateHTTPHeaders::set_http_attrs(const map<string, string>& http_attrs)
+{
+  /* merge send headers */
+  for (auto& attr: http_attrs) {
+    const string& val = attr.second;
+    const string& name = lowercase_dash_http_attr(attr.first);
+    if (is_x_amz(name)) {
+      new_env->set(name, val);
+      new_info->x_meta_map[name] = val;
+    } else {
+      new_env->set(attr.first, val); /* Ugh, using the uppercase representation,
+                                       as the signing function calls info.env.get("CONTENT_TYPE").
+                                       This needs to be cleaned up! */
+    }
+  }
+}
+
+// Convert the policy's grant map into x-amz-grant-* headers, grouping
+// grantees by permission class first.
+void RGWRESTGenerateHTTPHeaders::set_policy(RGWAccessControlPolicy& policy)
+{
+  /* update acl headers */
+  map<int, string> grants_by_type;
+  for (auto& [who, grant] : policy.get_acl().get_grant_map()) {
+    grants_by_type_add_perm(grants_by_type,
+                            grant.get_permission().get_permissions(),
+                            grant);
+  }
+  add_grants_headers(grants_by_type, *new_env, new_info->x_meta_map);
+}
+
+// Sign the assembled request with the resolved region/service scope;
+// returns the signing error (negative) on failure, 0 otherwise.
+int RGWRESTGenerateHTTPHeaders::sign(const DoutPrefixProvider *dpp, RGWAccessKey& key, const bufferlist *opt_content)
+{
+  int ret = sign_request(dpp, key, region, service, *new_env, *new_info, opt_content);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to sign request" << dendl;
+    return ret;
+  }
+
+  return 0;
+}
+
+// Build the PUT URL/resource for the object, using virtual-hosted-style
+// (bucket in the host name) or path-style addressing per host_style.
+void RGWRESTStreamS3PutObj::send_init(const rgw_obj& obj)
+{
+  string resource_str;
+  string resource;
+  string new_url = url;
+  string new_host = host;
+
+  const auto& bucket_name = obj.bucket.name;
+
+  if (host_style == VirtualStyle) {
+    resource_str = obj.get_oid();
+
+    new_url = bucket_name + "." + new_url;
+    new_host = bucket_name + "." + new_host;
+  } else {
+    resource_str = bucket_name + "/" + obj.get_oid();
+  }
+
+  //do not encode slash in object key name
+  url_encode(resource_str, resource, false);
+
+  if (new_url[new_url.size() - 1] != '/')
+    new_url.append("/");
+
+  method = "PUT";
+  headers_gen.init(method, new_host, resource_prefix, new_url, resource, params, api_name);
+
+  url = headers_gen.get_url();
+}
+
+// Variant taking RGW object attrs: translate them to headers, then sign.
+void RGWRESTStreamS3PutObj::send_ready(const DoutPrefixProvider *dpp, RGWAccessKey& key, map<string, bufferlist>& rgw_attrs)
+{
+  headers_gen.set_obj_attrs(dpp, rgw_attrs);
+
+  send_ready(dpp, key);
+}
+
+// Variant taking plain HTTP attrs and an explicit ACL policy.
+void RGWRESTStreamS3PutObj::send_ready(const DoutPrefixProvider *dpp, RGWAccessKey& key, const map<string, string>& http_attrs,
+                                       RGWAccessControlPolicy& policy)
+{
+  headers_gen.set_http_attrs(http_attrs);
+  headers_gen.set_policy(policy);
+
+  send_ready(dpp, key);
+}
+
+// Final step: sign, materialize the header list, and allocate the output
+// callback (freed in the destructor).
+void RGWRESTStreamS3PutObj::send_ready(const DoutPrefixProvider *dpp, RGWAccessKey& key)
+{
+  headers_gen.sign(dpp, key, nullptr);
+
+  for (const auto& kv: new_env.get_map()) {
+    headers.emplace_back(kv);
+  }
+
+  out_cb = new RGWRESTStreamOutCB(this);
+}
+
+// Convenience wrapper: send_init() followed by the attrs-based send_ready().
+void RGWRESTStreamS3PutObj::put_obj_init(const DoutPrefixProvider *dpp, RGWAccessKey& key, const rgw_obj& obj, map<string, bufferlist>& attrs)
+{
+  send_init(obj);
+  send_ready(dpp, key, attrs);
+}
+
+// Copy the named header's value into str, or clear str when it is absent.
+void set_str_from_headers(map<string, string>& out_headers, const string& header_name, string& str)
+{
+  auto iter = out_headers.find(header_name);
+  if (iter == out_headers.end()) {
+    str.clear();
+  } else {
+    str = iter->second;
+  }
+}
+
+// Parse an rgwx mtime value of the form "<secs>[.<nsecs>]" into a
+// real_time; returns -EINVAL on empty or non-numeric input.
+static int parse_rgwx_mtime(const DoutPrefixProvider *dpp, CephContext *cct, const string& s, ceph::real_time *rt)
+{
+  string err;
+  vector<string> vec;
+
+  get_str_vec(s, ".", vec);
+
+  if (vec.empty()) {
+    return -EINVAL;
+  }
+
+  long secs = strict_strtol(vec[0].c_str(), 10, &err);
+  long nsecs = 0;
+  if (!err.empty()) {
+    ldpp_dout(dpp, 0) << "ERROR: failed converting mtime (" << s << ") to real_time " << dendl;
+    return -EINVAL;
+  }
+
+  // fractional part is optional
+  if (vec.size() > 1) {
+    nsecs = strict_strtol(vec[1].c_str(), 10, &err);
+    if (!err.empty()) {
+      ldpp_dout(dpp, 0) << "ERROR: failed converting mtime (" << s << ") to real_time " << dendl;
+      return -EINVAL;
+    }
+  }
+
+  *rt = utime_t(secs, nsecs).to_real_time();
+
+  return 0;
+}
+
+// Build a "<bucket>/<object>" resource string for 'obj', URL-encoding both
+// components (the tenant:bucket key is formatted with ':' and no shard).
+static void send_prepare_convert(const rgw_obj& obj, string *resource)
+{
+  string urlsafe_bucket, urlsafe_object;
+  url_encode(obj.bucket.get_key(':', 0), urlsafe_bucket);
+  url_encode(obj.key.name, urlsafe_object);
+  *resource = urlsafe_bucket + "/" + urlsafe_object;
+}
+
+// Prepare and send a request targeting 'obj' (resource derived from the
+// object's bucket/key). Returns the result of the string-resource overload.
+int RGWRESTStreamRWRequest::send_request(const DoutPrefixProvider *dpp, RGWAccessKey& key, map<string, string>& extra_headers, const rgw_obj& obj, RGWHTTPManager *mgr)
+{
+  string resource;
+  send_prepare_convert(obj, &resource);
+
+  return send_request(dpp, &key, extra_headers, resource, mgr);
+}
+
+// Prepare (but do not send) a request targeting 'obj'; the resource string
+// is already URL-encoded by send_prepare_convert().
+int RGWRESTStreamRWRequest::send_prepare(const DoutPrefixProvider *dpp, RGWAccessKey& key, map<string, string>& extra_headers, const rgw_obj& obj)
+{
+  string resource;
+  send_prepare_convert(obj, &resource);
+
+  return do_send_prepare(dpp, &key, extra_headers, resource);
+}
+
+// Prepare (but do not send) a request for a caller-supplied resource path.
+// The path is URL-encoded here with slashes preserved, then handed to
+// do_send_prepare().
+int RGWRESTStreamRWRequest::send_prepare(const DoutPrefixProvider *dpp, RGWAccessKey *key, map<string, string>& extra_headers, const string& resource,
+                                         bufferlist *send_data)
+{
+  string new_resource;
+  //do not encode slash
+  url_encode(resource, new_resource, false);
+
+  return do_send_prepare(dpp, key, extra_headers, new_resource, send_data);
+}
+
+// Core request-preparation step: normalizes the URL and resource, applies
+// virtual-hosted-style addressing when configured, constructs the header
+// generator, and records the signing key and optional request body.
+// Does not perform any I/O; send() does the actual transfer.
+int RGWRESTStreamRWRequest::do_send_prepare(const DoutPrefixProvider *dpp, RGWAccessKey *key, map<string, string>& extra_headers, const string& resource,
+                                            bufferlist *send_data)
+{
+  string new_url = url;
+  if (!new_url.empty() && new_url.back() != '/')
+    new_url.append("/");
+
+  string new_resource;
+  string bucket_name;
+  string old_resource = resource;
+
+  // strip a leading '/' so the first path segment is the bucket name
+  if (resource[0] == '/') {
+    new_resource = resource.substr(1);
+  } else {
+    new_resource = resource;
+  }
+
+  size_t pos = new_resource.find("/");
+  bucket_name = new_resource.substr(0, pos);
+
+  //when dest is a bucket with out other params, uri should end up with '/'
+  if(pos == string::npos && params.size() == 0 && host_style == VirtualStyle) {
+    new_resource.append("/");
+  }
+
+  // virtual-hosted style: bucket moves into the hostname and is removed
+  // from the resource path
+  if (host_style == VirtualStyle) {
+    new_url = protocol + "://" + bucket_name + "." + host;
+    if(pos == string::npos) {
+      new_resource = "";
+    } else {
+      new_resource = new_resource.substr(pos+1);
+    }
+  }
+
+  headers_gen.emplace(cct, &new_env, &new_info);
+
+  headers_gen->init(method, host, resource_prefix, new_url, new_resource, params, api_name);
+
+  headers_gen->set_http_attrs(extra_headers);
+
+  // remember the key; the actual signing happens in send(), once the body
+  // (if any) is known
+  if (key) {
+    sign_key = *key;
+  }
+
+  if (send_data) {
+    set_send_length(send_data->length());
+    set_outbl(*send_data);
+    set_send_data_hint(true);
+  }
+
+  method = new_info.method;
+  url = headers_gen->get_url();
+
+  return 0;
+}
+
+// Prepare and immediately send a request for 'resource'; combines
+// send_prepare() and send() for callers that don't need the split.
+int RGWRESTStreamRWRequest::send_request(const DoutPrefixProvider *dpp, RGWAccessKey *key, map<string, string>& extra_headers, const string& resource,
+                                         RGWHTTPManager *mgr, bufferlist *send_data)
+{
+  int ret = send_prepare(dpp, key, extra_headers, resource, send_data);
+  if (ret < 0) {
+    return ret;
+  }
+
+  return send(mgr);
+}
+
+
+// Sign (if a key was recorded by send_prepare()) and submit the prepared
+// request, copying the generated environment into the outgoing headers.
+// Requires a prior successful send_prepare(); returns -EINVAL otherwise.
+int RGWRESTStreamRWRequest::send(RGWHTTPManager *mgr)
+{
+  if (!headers_gen) {
+    // fixed typo: "likey" -> "likely"
+    ldpp_dout(this, 0) << "ERROR: " << __func__ << "(): send_prepare() was not called: likely a bug!" << dendl;
+    return -EINVAL;
+  }
+
+  const bufferlist *outblp{nullptr};
+
+  // only hand the body to the signer when the whole payload is buffered
+  if (send_len == outbl.length()) {
+    outblp = &outbl;
+  }
+
+  if (sign_key) {
+    int r = headers_gen->sign(this, *sign_key, outblp);
+    if (r < 0) {
+      ldpp_dout(this, 0) << "ERROR: failed to sign request" << dendl;
+      return r;
+    }
+  }
+
+  for (const auto& kv: new_env.get_map()) {
+    headers.emplace_back(kv);
+  }
+
+  return RGWHTTPStreamRWRequest::send(mgr);
+}
+
+// Wait for the request to finish, then extract common response metadata:
+// ETag, RGWX mtime/size headers, and all RGWX_ATTR_-prefixed attributes
+// (with '_' mapped to '-' and lowercased). Returns the HTTP-derived status.
+int RGWHTTPStreamRWRequest::complete_request(optional_yield y,
+                                             string *etag,
+                                             real_time *mtime,
+                                             uint64_t *psize,
+                                             map<string, string> *pattrs,
+                                             map<string, string> *pheaders)
+{
+  int ret = wait(y);
+  if (ret < 0) {
+    return ret;
+  }
+
+  unique_lock guard(out_headers_lock);
+
+  if (etag) {
+    set_str_from_headers(out_headers, "ETAG", *etag);
+  }
+  // only trust the rgwx metadata headers on a successful response
+  if (status >= 0) {
+    if (mtime) {
+      string mtime_str;
+      set_str_from_headers(out_headers, "RGWX_MTIME", mtime_str);
+      if (!mtime_str.empty()) {
+        int ret = parse_rgwx_mtime(this, cct, mtime_str, mtime);
+        if (ret < 0) {
+          return ret;
+        }
+      } else {
+        *mtime = real_time();
+      }
+    }
+    if (psize) {
+      string size_str;
+      set_str_from_headers(out_headers, "RGWX_OBJECT_SIZE", size_str);
+      string err;
+      *psize = strict_strtoll(size_str.c_str(), 10, &err);
+      if (!err.empty()) {
+        ldpp_dout(this, 0) << "ERROR: failed parsing embedded metadata object size (" << size_str << ") to int " << dendl;
+        return -EIO;
+      }
+    }
+  }
+
+  // translate RGWX_ATTR_FOO_BAR into attribute key "foo-bar"
+  for (auto iter = out_headers.begin(); pattrs && iter != out_headers.end(); ++iter) {
+    const string& attr_name = iter->first;
+    if (attr_name.compare(0, sizeof(RGW_HTTP_RGWX_ATTR_PREFIX) - 1, RGW_HTTP_RGWX_ATTR_PREFIX) == 0) {
+      string name = attr_name.substr(sizeof(RGW_HTTP_RGWX_ATTR_PREFIX) - 1);
+      const char *src = name.c_str();
+      // NOTE(review): variable-length array is a GCC/Clang extension, not
+      // standard C++ -- works with the toolchains this builds on.
+      char buf[name.size() + 1];
+      char *dest = buf;
+      for (; *src; ++src, ++dest) {
+        switch(*src) {
+          case '_':
+            *dest = '-';
+            break;
+          default:
+            *dest = tolower(*src);
+        }
+      }
+      *dest = '\0';
+      (*pattrs)[buf] = iter->second;
+    }
+  }
+
+  if (pheaders) {
+    // hand the header map to the caller; out_headers is left moved-from
+    *pheaders = std::move(out_headers);
+  }
+  return status;
+}
+
+// Per-header hook: when the peer announces embedded metadata, forward its
+// length to the receive callback so it can split metadata from payload.
+int RGWHTTPStreamRWRequest::handle_header(const string& name, const string& val)
+{
+  if (name == "RGWX_EMBEDDED_METADATA_LEN") {
+    string err;
+    long len = strict_strtol(val.c_str(), 10, &err);
+    if (!err.empty()) {
+      ldpp_dout(this, 0) << "ERROR: failed converting embedded metadata len (" << val << ") to int " << dendl;
+      return -EINVAL;
+    }
+
+    // NOTE(review): cb is dereferenced without a null check here, while
+    // receive_data() guards on cb -- confirm a callback is always installed
+    // before this header can arrive.
+    cb->set_extra_data_len(len);
+  }
+  return 0;
+}
+
+// Feed received bytes to the callback. The callback may consume everything
+// (return 0), or report a partial read (return >0 = bytes consumed), in
+// which case the unconsumed tail is kept in in_data for the next call.
+// Always reports the full 'len' as consumed to the transport.
+int RGWHTTPStreamRWRequest::receive_data(void *ptr, size_t len, bool *pause)
+{
+  size_t orig_len = len;
+
+  if (cb) {
+    in_data.append((const char *)ptr, len);
+
+    size_t orig_in_data_len = in_data.length();
+
+    int ret = cb->handle_data(in_data, pause);
+    if (ret < 0)
+      return ret;
+    if (ret == 0) {
+      in_data.clear();
+    } else {
+      /* partial read */
+      ceph_assert(in_data.length() <= orig_in_data_len);
+      len = ret;
+      bufferlist bl;
+      size_t left_to_read = orig_in_data_len - len;
+      // splice off the consumed prefix, keeping only the unread remainder
+      if (in_data.length() > left_to_read) {
+        in_data.splice(0, in_data.length() - left_to_read, &bl);
+      }
+    }
+  }
+  ofs += len;
+  return orig_len;
+}
+
+// Enable/disable streaming-write mode (body supplied incrementally via
+// add_send_data() rather than buffered up front).
+void RGWHTTPStreamRWRequest::set_stream_write(bool s) {
+  std::lock_guard wl{write_lock};
+  stream_writes = s;
+}
+
+// Resume a paused receive stream.
+void RGWHTTPStreamRWRequest::unpause_receive()
+{
+  std::lock_guard req_locker{get_req_lock()};
+  // NOTE(review): the guard looks inverted (acts only when NOT paused);
+  // this matches the code as written upstream -- confirm intent before
+  // changing, read_paused may be tracking a different state than expected.
+  if (!read_paused) {
+    _set_read_paused(false);
+  }
+}
+
+// Append more outgoing body data (streaming-write mode) and unpause the
+// writer so the transport picks it up.
+void RGWHTTPStreamRWRequest::add_send_data(bufferlist& bl)
+{
+  std::scoped_lock locker{get_req_lock(), write_lock};
+  outbl.claim_append(bl);
+  _set_write_paused(false);
+}
+
+// Number of outgoing bytes buffered but not yet handed to the transport.
+uint64_t RGWHTTPStreamRWRequest::get_pending_send_size()
+{
+  std::lock_guard wl{write_lock};
+  return outbl.length();
+}
+
+// Mark the streaming write as complete and unpause the writer so it can
+// drain whatever remains in outbl and finish the request.
+void RGWHTTPStreamRWRequest::finish_write()
+{
+  std::scoped_lock locker{get_req_lock(), write_lock};
+  write_stream_complete = true;
+  _set_write_paused(false);
+}
+
+// Transport pull callback: copy up to 'len' buffered bytes into 'ptr'.
+// If no data is buffered but more is expected (streaming not complete, or
+// declared send_len not yet reached), sets *pause so the transport waits.
+// Notifies the drain callback (outside the lock) with the remaining size.
+int RGWHTTPStreamRWRequest::send_data(void *ptr, size_t len, bool *pause)
+{
+  uint64_t out_len;
+  uint64_t send_size;
+  {
+    std::lock_guard wl{write_lock};
+
+    if (outbl.length() == 0) {
+      if ((stream_writes && !write_stream_complete) ||
+          (write_ofs < send_len)) {
+        *pause = true;
+      }
+      return 0;
+    }
+
+    len = std::min(len, (size_t)outbl.length());
+
+    bufferlist bl;
+    outbl.splice(0, len, &bl);
+    send_size = bl.length();
+    if (send_size > 0) {
+      memcpy(ptr, bl.c_str(), send_size);
+      write_ofs += send_size;
+    }
+
+    out_len = outbl.length();
+  }
+  /* don't need to be under write_lock here, avoid deadlocks in case notify callback
+   * needs to lock */
+  if (write_drain_cb) {
+    write_drain_cb->notify(out_len);
+  }
+  return send_size;
+}
+
+// Submit the request: synchronously via RGWHTTP when no manager is given,
+// otherwise queue it on the supplied HTTP manager.
+int RGWHTTPStreamRWRequest::send(RGWHTTPManager *mgr)
+{
+  if (!mgr) {
+    return RGWHTTP::send(this);
+  }
+
+  int r = mgr->add_request(this);
+  if (r < 0)
+    return r;
+
+  return 0;
+}
diff --git a/src/rgw/rgw_rest_client.h b/src/rgw/rgw_rest_client.h
new file mode 100644
index 000000000..97cf899fd
--- /dev/null
+++ b/src/rgw/rgw_rest_client.h
@@ -0,0 +1,257 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_http_client.h"
+
+class RGWGetDataCB;
+
+// Basic non-streaming HTTP request: the whole response is buffered in
+// 'response' (bounded by max_response) and response headers are collected
+// into out_headers under out_headers_lock.
+class RGWHTTPSimpleRequest : public RGWHTTPClient {
+protected:
+  int http_status;
+  int status;
+
+  using unique_lock = std::unique_lock<std::mutex>;
+
+  std::mutex out_headers_lock;
+  std::map<std::string, std::string> out_headers;
+  param_vec_t params;
+
+  bufferlist::iterator *send_iter;
+
+  size_t max_response; /* we need this as we don't stream out response */
+  bufferlist response;
+
+  virtual int handle_header(const std::string& name, const std::string& val);
+  void get_params_str(std::map<std::string, std::string>& extra_args, std::string& dest);
+
+public:
+  RGWHTTPSimpleRequest(CephContext *_cct, const std::string& _method, const std::string& _url,
+                       param_vec_t *_headers, param_vec_t *_params) : RGWHTTPClient(_cct, _method, _url),
+                http_status(0), status(0),
+                send_iter(NULL),
+                max_response(0) {
+    set_headers(_headers);
+    set_params(_params);
+  }
+
+  void set_headers(param_vec_t *_headers) {
+    if (_headers)
+      headers = *_headers;
+  }
+
+  void set_params(param_vec_t *_params) {
+    if (_params)
+      params = *_params;
+  }
+
+  int receive_header(void *ptr, size_t len) override;
+  int receive_data(void *ptr, size_t len, bool *pause) override;
+  int send_data(void *ptr, size_t len, bool* pause=nullptr) override;
+
+  bufferlist& get_response() { return response; }
+
+  void get_out_headers(std::map<std::string, std::string> *pheaders); /* modifies out_headers */
+
+  int get_http_status() { return http_status; }
+  int get_status();
+};
+
+// Simple signed REST request used to forward a request verbatim to another
+// endpoint (e.g. another zone), optionally scoped to a service for SigV4.
+class RGWRESTSimpleRequest : public RGWHTTPSimpleRequest {
+  std::optional<std::string> api_name;
+public:
+  RGWRESTSimpleRequest(CephContext *_cct, const std::string& _method, const std::string& _url,
+                       param_vec_t *_headers, param_vec_t *_params,
+                       std::optional<std::string> _api_name) : RGWHTTPSimpleRequest(_cct, _method, _url, _headers, _params), api_name(_api_name) {}
+
+  int forward_request(const DoutPrefixProvider *dpp, const RGWAccessKey& key, req_info& info, size_t max_response, bufferlist *inbl, bufferlist *outbl, optional_yield y, std::string service="");
+};
+
+// Callback interface notified with the remaining buffered size each time
+// the outgoing write buffer drains (see RGWHTTPStreamRWRequest::send_data).
+class RGWWriteDrainCB {
+public:
+  RGWWriteDrainCB() = default;
+  virtual ~RGWWriteDrainCB() = default;
+  virtual void notify(uint64_t pending_size) = 0;
+};
+
+// Builds the header set for an outgoing REST request: target URL/resource,
+// object/HTTP attributes, ACL policy, and finally the AWS signature.
+// Populates the provided RGWEnv/req_info rather than owning them.
+class RGWRESTGenerateHTTPHeaders : public DoutPrefix {
+  CephContext *cct;
+  RGWEnv *new_env;
+  req_info *new_info;
+  std::string region;
+  std::string service;
+  std::string method;
+  std::string url;
+  std::string resource;
+
+public:
+  RGWRESTGenerateHTTPHeaders(CephContext *_cct, RGWEnv *_env, req_info *_info);
+  void init(const std::string& method, const std::string& host,
+            const std::string& resource_prefix, const std::string& url,
+            const std::string& resource, const param_vec_t& params,
+            std::optional<std::string> api_name);
+  void set_extra_headers(const std::map<std::string, std::string>& extra_headers);
+  int set_obj_attrs(const DoutPrefixProvider *dpp, std::map<std::string, bufferlist>& rgw_attrs);
+  void set_http_attrs(const std::map<std::string, std::string>& http_attrs);
+  void set_policy(RGWAccessControlPolicy& policy);
+  int sign(const DoutPrefixProvider *dpp, RGWAccessKey& key, const bufferlist *opt_content);
+
+  const std::string& get_url() { return url; }
+};
+
+// Streaming read/write HTTP request. Incoming data is delivered through a
+// ReceiveCB (which may apply backpressure via 'pause'); outgoing data can
+// either be buffered up front (set_outbl) or streamed incrementally
+// (set_stream_write + add_send_data + finish_write).
+class RGWHTTPStreamRWRequest : public RGWHTTPSimpleRequest {
+public:
+  class ReceiveCB;
+
+private:
+  ceph::mutex lock =
+    ceph::make_mutex("RGWHTTPStreamRWRequest");
+  ceph::mutex write_lock =
+    ceph::make_mutex("RGWHTTPStreamRWRequest::write_lock");
+  ReceiveCB *cb{nullptr};
+  RGWWriteDrainCB *write_drain_cb{nullptr};
+  bufferlist in_data;
+  size_t chunk_ofs{0};
+  size_t ofs{0};
+  uint64_t write_ofs{0};
+  bool read_paused{false};
+  bool send_paused{false};
+  bool stream_writes{false};
+  bool write_stream_complete{false};
+protected:
+  bufferlist outbl;
+
+  int handle_header(const std::string& name, const std::string& val) override;
+public:
+  int send_data(void *ptr, size_t len, bool *pause) override;
+  int receive_data(void *ptr, size_t len, bool *pause) override;
+
+  // Consumer of the response body; handle_data() may report partial
+  // consumption (return >0 bytes) or request a pause.
+  class ReceiveCB {
+  protected:
+    uint64_t extra_data_len{0};
+  public:
+    ReceiveCB() = default;
+    virtual ~ReceiveCB() = default;
+    virtual int handle_data(bufferlist& bl, bool *pause = nullptr) = 0;
+    virtual void set_extra_data_len(uint64_t len) {
+      extra_data_len = len;
+    }
+  };
+
+  RGWHTTPStreamRWRequest(CephContext *_cct, const std::string& _method, const std::string& _url,
+                         param_vec_t *_headers, param_vec_t *_params) : RGWHTTPSimpleRequest(_cct, _method, _url, _headers, _params) {
+  }
+  RGWHTTPStreamRWRequest(CephContext *_cct, const std::string& _method, const std::string& _url, ReceiveCB *_cb,
+                         param_vec_t *_headers, param_vec_t *_params) : RGWHTTPSimpleRequest(_cct, _method, _url, _headers, _params),
+                         cb(_cb) {
+  }
+  virtual ~RGWHTTPStreamRWRequest() override {}
+
+  void set_outbl(bufferlist& _outbl) {
+    outbl.swap(_outbl);
+  }
+
+  void set_in_cb(ReceiveCB *_cb) { cb = _cb; }
+  void set_write_drain_cb(RGWWriteDrainCB *_cb) { write_drain_cb = _cb; }
+
+  void unpause_receive();
+
+  void add_send_data(bufferlist& bl);
+
+  void set_stream_write(bool s);
+
+  uint64_t get_pending_send_size();
+
+  /* finish streaming writes */
+  void finish_write();
+
+  virtual int send(RGWHTTPManager *mgr);
+
+  int complete_request(optional_yield y,
+                       std::string *etag = nullptr,
+                       real_time *mtime = nullptr,
+                       uint64_t *psize = nullptr,
+                       std::map<std::string, std::string> *pattrs = nullptr,
+                       std::map<std::string, std::string> *pheaders = nullptr);
+};
+
+// Streaming request with AWS signing: send_prepare() builds and signs the
+// request (path- or virtual-hosted-style), send() submits it. The signing
+// key and header generator are stashed between the two calls.
+class RGWRESTStreamRWRequest : public RGWHTTPStreamRWRequest {
+  std::optional<RGWAccessKey> sign_key;
+  std::optional<RGWRESTGenerateHTTPHeaders> headers_gen;
+  RGWEnv new_env;
+  req_info new_info;
+
+protected:
+  std::optional<std::string> api_name;
+  HostStyle host_style;
+public:
+  RGWRESTStreamRWRequest(CephContext *_cct, const std::string& _method, const std::string& _url, RGWHTTPStreamRWRequest::ReceiveCB *_cb,
+                         param_vec_t *_headers, param_vec_t *_params,
+                         std::optional<std::string> _api_name, HostStyle _host_style = PathStyle) :
+    RGWHTTPStreamRWRequest(_cct, _method, _url, _cb, _headers, _params),
+    new_info(_cct, &new_env),
+    api_name(_api_name), host_style(_host_style) {
+  }
+  virtual ~RGWRESTStreamRWRequest() override {}
+
+  int send_prepare(const DoutPrefixProvider *dpp, RGWAccessKey *key, std::map<std::string, std::string>& extra_headers, const std::string& resource, bufferlist *send_data = nullptr /* optional input data */);
+  int send_prepare(const DoutPrefixProvider *dpp, RGWAccessKey& key, std::map<std::string, std::string>& extra_headers, const rgw_obj& obj);
+  int send(RGWHTTPManager *mgr) override;
+
+  int send_request(const DoutPrefixProvider *dpp, RGWAccessKey& key, std::map<std::string, std::string>& extra_headers, const rgw_obj& obj, RGWHTTPManager *mgr);
+  int send_request(const DoutPrefixProvider *dpp, RGWAccessKey *key, std::map<std::string, std::string>& extra_headers, const std::string& resource, RGWHTTPManager *mgr, bufferlist *send_data = nullptr /* optional input data */);
+
+  void add_params(param_vec_t *params);
+
+private:
+  int do_send_prepare(const DoutPrefixProvider *dpp, RGWAccessKey *key, std::map<std::string, std::string>& extra_headers, const std::string& resource, bufferlist *send_data = nullptr /* optional input data */);
+};
+
+// Convenience subclass: a signed streaming GET.
+class RGWRESTStreamReadRequest : public RGWRESTStreamRWRequest {
+public:
+  RGWRESTStreamReadRequest(CephContext *_cct, const std::string& _url, ReceiveCB *_cb, param_vec_t *_headers,
+                           param_vec_t *_params, std::optional<std::string> _api_name,
+                           HostStyle _host_style = PathStyle) : RGWRESTStreamRWRequest(_cct, "GET", _url, _cb, _headers, _params, _api_name, _host_style) {}
+};
+
+// Convenience subclass: a signed streaming HEAD (path-style only).
+class RGWRESTStreamHeadRequest : public RGWRESTStreamRWRequest {
+public:
+  RGWRESTStreamHeadRequest(CephContext *_cct, const std::string& _url, ReceiveCB *_cb, param_vec_t *_headers,
+                           param_vec_t *_params, std::optional<std::string> _api_name) : RGWRESTStreamRWRequest(_cct, "HEAD", _url, _cb, _headers, _params, _api_name) {}
+};
+
+// Convenience subclass: a signed streaming request with a caller-chosen
+// HTTP method (used e.g. for PUT/POST/DELETE on admin resources).
+class RGWRESTStreamSendRequest : public RGWRESTStreamRWRequest {
+public:
+  RGWRESTStreamSendRequest(CephContext *_cct, const std::string& method,
+                           const std::string& _url,
+                           ReceiveCB *_cb, param_vec_t *_headers, param_vec_t *_params,
+                           std::optional<std::string> _api_name,
+                           HostStyle _host_style = PathStyle) : RGWRESTStreamRWRequest(_cct, method, _url, _cb, _headers, _params, _api_name, _host_style) {}
+};
+
+// Streaming S3 PUT-object request: send_init() builds the target resource
+// from an rgw_obj, send_ready() attaches attributes/policy and signs, then
+// the object body is streamed through out_cb.
+class RGWRESTStreamS3PutObj : public RGWHTTPStreamRWRequest {
+  std::optional<std::string> api_name;
+  HostStyle host_style;
+  RGWGetDataCB *out_cb;
+  RGWEnv new_env;
+  req_info new_info;
+  RGWRESTGenerateHTTPHeaders headers_gen;
+public:
+  RGWRESTStreamS3PutObj(CephContext *_cct, const std::string& _method, const std::string& _url, param_vec_t *_headers,
+                        param_vec_t *_params, std::optional<std::string> _api_name,
+                        HostStyle _host_style) : RGWHTTPStreamRWRequest(_cct, _method, _url, nullptr, _headers, _params),
+                        api_name(_api_name), host_style(_host_style),
+                        out_cb(NULL), new_info(cct, &new_env), headers_gen(_cct, &new_env, &new_info) {}
+  ~RGWRESTStreamS3PutObj() override;
+
+  void send_init(const rgw_obj& obj);
+  void send_ready(const DoutPrefixProvider *dpp, RGWAccessKey& key, std::map<std::string, bufferlist>& rgw_attrs);
+  void send_ready(const DoutPrefixProvider *dpp, RGWAccessKey& key, const std::map<std::string, std::string>& http_attrs,
+                  RGWAccessControlPolicy& policy);
+  void send_ready(const DoutPrefixProvider *dpp, RGWAccessKey& key);
+
+  void put_obj_init(const DoutPrefixProvider *dpp, RGWAccessKey& key, const rgw_obj& obj, std::map<std::string, bufferlist>& attrs);
+
+  RGWGetDataCB *get_out_cb() { return out_cb; }
+};
diff --git a/src/rgw/rgw_rest_config.cc b/src/rgw/rgw_rest_config.cc
new file mode 100644
index 000000000..a3b93ea3a
--- /dev/null
+++ b/src/rgw/rgw_rest_config.cc
@@ -0,0 +1,57 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 eNovance SAS <licensing@enovance.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/ceph_json.h"
+#include "common/strtol.h"
+#include "rgw_rest.h"
+#include "rgw_op.h"
+#include "rgw_rados.h"
+#include "rgw_rest_s3.h"
+#include "rgw_rest_config.h"
+#include "rgw_client_io.h"
+#include "rgw_sal_rados.h"
+#include "common/errno.h"
+#include "include/ceph_assert.h"
+
+#include "services/svc_zone.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+// Emit the current zone parameters as JSON. The data comes straight from
+// the RADOS driver's zone service, so execute() had nothing to compute.
+void RGWOp_ZoneConfig_Get::send_response() {
+  const RGWZoneParams& zone_params = static_cast<rgw::sal::RadosStore*>(driver)->svc()->zone->get_zone_params();
+
+  set_req_state_err(s, op_ret);
+  dump_errno(s);
+  end_header(s);
+
+  if (op_ret < 0)
+    return;
+
+  encode_json("zone_params", zone_params, s->formatter);
+  flusher.flush();
+}
+
+// Route GET /?type=... : only "zone" is supported; anything else yields no
+// op (and hence an error upstream).
+RGWOp* RGWHandler_Config::op_get() {
+  bool exists;
+  string type = s->info.args.get("type", &exists);
+
+  if (type.compare("zone") == 0) {
+    return new RGWOp_ZoneConfig_Get();
+  }
+  return nullptr;
+}
diff --git a/src/rgw/rgw_rest_config.h b/src/rgw/rgw_rest_config.h
new file mode 100644
index 000000000..1910cbe0b
--- /dev/null
+++ b/src/rgw/rgw_rest_config.h
@@ -0,0 +1,64 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 eNovance SAS <licensing@enovance.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "rgw_auth_s3.h"
+#include "rgw_rest.h"
+#include "rgw_zone.h"
+
+// Admin op returning the zone configuration as JSON; requires the "zone"
+// read capability. All work happens in send_response().
+class RGWOp_ZoneConfig_Get : public RGWRESTOp {
+  RGWZoneParams zone_params;
+public:
+  RGWOp_ZoneConfig_Get() {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("zone", RGW_CAP_READ);
+  }
+  int verify_permission(optional_yield) override {
+    return check_caps(s->user->get_caps());
+  }
+  void execute(optional_yield) override {} /* driver already has the info we need, just need to send response */
+  void send_response() override ;
+  const char* name() const override {
+    return "get_zone_config";
+  }
+};
+
+// REST handler for the config endpoint; authentication is inherited from
+// the S3-auth admin handler, read permissions are unconditionally granted
+// (per-op capability checks gate access instead).
+class RGWHandler_Config : public RGWHandler_Auth_S3 {
+protected:
+  RGWOp *op_get() override;
+
+  int read_permissions(RGWOp*, optional_yield) override {
+    return 0;
+  }
+public:
+  using RGWHandler_Auth_S3::RGWHandler_Auth_S3;
+  ~RGWHandler_Config() override = default;
+};
+
+
+// REST manager that hands every request on this prefix to a fresh
+// RGWHandler_Config instance.
+class RGWRESTMgr_Config : public RGWRESTMgr {
+public:
+  RGWRESTMgr_Config() = default;
+  ~RGWRESTMgr_Config() override = default;
+
+  RGWHandler_REST* get_handler(rgw::sal::Driver* ,
+                               req_state*,
+                               const rgw::auth::StrategyRegistry& auth_registry,
+                               const std::string&) override {
+    return new RGWHandler_Config(auth_registry);
+  }
+};
diff --git a/src/rgw/rgw_rest_conn.cc b/src/rgw/rgw_rest_conn.cc
new file mode 100644
index 000000000..ffb536ed9
--- /dev/null
+++ b/src/rgw/rgw_rest_conn.cc
@@ -0,0 +1,526 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_zone.h"
+#include "rgw_rest_conn.h"
+#include "rgw_sal.h"
+#include "rgw_rados.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+// Construct a connection to a remote zone, taking the system credentials
+// and local zonegroup id from the local driver (when one is provided).
+RGWRESTConn::RGWRESTConn(CephContext *_cct, rgw::sal::Driver* driver,
+                         const string& _remote_id,
+                         const list<string>& remote_endpoints,
+                         std::optional<string> _api_name,
+                         HostStyle _host_style)
+  : cct(_cct),
+    endpoints(remote_endpoints.begin(), remote_endpoints.end()),
+    remote_id(_remote_id),
+    api_name(_api_name),
+    host_style(_host_style)
+{
+  if (driver) {
+    key = driver->get_zone()->get_system_key();
+    self_zone_group = driver->get_zone()->get_zonegroup().get_id();
+  }
+}
+
+// Construct a connection with explicitly supplied credentials and local
+// zonegroup id (no local driver required).
+RGWRESTConn::RGWRESTConn(CephContext *_cct,
+                         const string& _remote_id,
+                         const list<string>& remote_endpoints,
+                         RGWAccessKey _cred,
+                         std::string _zone_group,
+                         std::optional<string> _api_name,
+                         HostStyle _host_style)
+  : cct(_cct),
+    endpoints(remote_endpoints.begin(), remote_endpoints.end()),
+    key(_cred),
+    self_zone_group(_zone_group),
+    remote_id(_remote_id),
+    api_name(_api_name),
+    host_style(_host_style)
+{
+}
+
+// Move constructor. Fix: api_name and host_style were not transferred
+// (both value constructors initialize them), so a moved-to connection
+// silently lost its API name and host addressing style.
+RGWRESTConn::RGWRESTConn(RGWRESTConn&& other)
+  : cct(other.cct),
+    endpoints(std::move(other.endpoints)),
+    key(std::move(other.key)),
+    self_zone_group(std::move(other.self_zone_group)),
+    remote_id(std::move(other.remote_id)),
+    api_name(std::move(other.api_name)),
+    host_style(other.host_style),
+    counter(other.counter.load())
+{
+}
+
+// Move assignment. Fix: api_name and host_style were not transferred,
+// mirroring the omission in the move constructor.
+RGWRESTConn& RGWRESTConn::operator=(RGWRESTConn&& other)
+{
+  cct = other.cct;
+  endpoints = std::move(other.endpoints);
+  key = std::move(other.key);
+  self_zone_group = std::move(other.self_zone_group);
+  remote_id = std::move(other.remote_id);
+  api_name = std::move(other.api_name);
+  host_style = other.host_style;
+  counter = other.counter.load();
+  return *this;
+}
+
+// Pick the next endpoint round-robin (atomic counter modulo endpoint
+// count); returns -EIO when no endpoints are configured.
+int RGWRESTConn::get_url(string& endpoint)
+{
+  if (endpoints.empty()) {
+    ldout(cct, 0) << "ERROR: endpoints not configured for upstream zone" << dendl;
+    return -EIO;
+  }
+
+  int i = ++counter;
+  endpoint = endpoints[i % endpoints.size()];
+
+  return 0;
+}
+
+// Value-returning variant; on error (no endpoints) the returned string is
+// empty -- the error code from get_url(endpoint) is deliberately dropped.
+string RGWRESTConn::get_url()
+{
+  string endpoint;
+  get_url(endpoint);
+  return endpoint;
+}
+
+// Append the standard system parameters (acting uid and source zonegroup)
+// to an outgoing parameter list.
+void RGWRESTConn::populate_params(param_vec_t& params, const rgw_user *uid, const string& zonegroup)
+{
+  populate_uid(params, uid);
+  populate_zonegroup(params, zonegroup);
+}
+
+// Forward an incoming request verbatim to the remote endpoint, tagging it
+// with the acting uid, source zonegroup and (optionally) an object version
+// so the remote side can apply it consistently.
+int RGWRESTConn::forward(const DoutPrefixProvider *dpp, const rgw_user& uid, req_info& info, obj_version *objv, size_t max_response, bufferlist *inbl, bufferlist *outbl, optional_yield y)
+{
+  string url;
+  int ret = get_url(url);
+  if (ret < 0)
+    return ret;
+  param_vec_t params;
+  populate_params(params, &uid, self_zone_group);
+  if (objv) {
+    params.push_back(param_pair_t(RGW_SYS_PARAM_PREFIX "tag", objv->tag));
+    // NOTE(review): buf[16] can truncate a full-range long long (up to 20
+    // digits); version numbers are presumably small -- confirm.
+    char buf[16];
+    snprintf(buf, sizeof(buf), "%lld", (long long)objv->ver);
+    params.push_back(param_pair_t(RGW_SYS_PARAM_PREFIX "ver", buf));
+  }
+  RGWRESTSimpleRequest req(cct, info.method, url, NULL, &params, api_name);
+  return req.forward_request(dpp, key, info, max_response, inbl, outbl, y);
+}
+
+// Forward a request to the remote endpoint signed for the "iam" service
+// (SigV4 service scope) with caller-provided credentials; unlike forward(),
+// no uid/zonegroup parameters are attached.
+int RGWRESTConn::forward_iam_request(const DoutPrefixProvider *dpp, const RGWAccessKey& key, req_info& info, obj_version *objv, size_t max_response, bufferlist *inbl, bufferlist *outbl, optional_yield y)
+{
+  string url;
+  int ret = get_url(url);
+  if (ret < 0)
+    return ret;
+  param_vec_t params;
+  if (objv) {
+    params.push_back(param_pair_t(RGW_SYS_PARAM_PREFIX "tag", objv->tag));
+    char buf[16];
+    snprintf(buf, sizeof(buf), "%lld", (long long)objv->ver);
+    params.push_back(param_pair_t(RGW_SYS_PARAM_PREFIX "ver", buf));
+  }
+  std::string service = "iam";
+  RGWRESTSimpleRequest req(cct, info.method, url, NULL, &params, api_name);
+  return req.forward_request(dpp, key, info, max_response, inbl, outbl, y, service);
+}
+
+// Create a streaming PUT-object request targeting 'obj' on the remote,
+// initialized but not yet signed; caller takes ownership of *req.
+// Note: an empty rgw_user is used for the uid parameter here.
+int RGWRESTConn::put_obj_send_init(const rgw_obj& obj, const rgw_http_param_pair *extra_params, RGWRESTStreamS3PutObj **req)
+{
+  string url;
+  int ret = get_url(url);
+  if (ret < 0)
+    return ret;
+
+  rgw_user uid;
+  param_vec_t params;
+  populate_params(params, &uid, self_zone_group);
+
+  if (extra_params) {
+    append_param_list(params, extra_params);
+  }
+
+  RGWRESTStreamS3PutObj *wr = new RGWRESTStreamS3PutObj(cct, "PUT", url, NULL, &params, api_name, host_style);
+  wr->send_init(obj);
+  *req = wr;
+  return 0;
+}
+
+// Create a fully prepared (target built + signed, with object attributes)
+// streaming PUT-object request; caller takes ownership of *req and drives
+// the body transfer.
+int RGWRESTConn::put_obj_async_init(const DoutPrefixProvider *dpp, const rgw_user& uid, const rgw_obj& obj,
+                                    map<string, bufferlist>& attrs,
+                                    RGWRESTStreamS3PutObj **req)
+{
+  string url;
+  int ret = get_url(url);
+  if (ret < 0)
+    return ret;
+
+  param_vec_t params;
+  populate_params(params, &uid, self_zone_group);
+  RGWRESTStreamS3PutObj *wr = new RGWRESTStreamS3PutObj(cct, "PUT", url, NULL, &params, api_name, host_style);
+  wr->put_obj_init(dpp, key, obj, attrs);
+  *req = wr;
+  return 0;
+}
+
+// Wait for a PUT-object request to finish, collect its ETag/mtime, and
+// free the request object (ownership transfers here).
+int RGWRESTConn::complete_request(RGWRESTStreamS3PutObj *req, string& etag,
+                                  real_time *mtime, optional_yield y)
+{
+  int ret = req->complete_request(y, &etag, mtime);
+  delete req;
+
+  return ret;
+}
+
+// Format a timestamp into 'headers[header_name]' (GMT), using nanosecond
+// precision when requested; no-op when 't' is null.
+static void set_date_header(const real_time *t, map<string, string>& headers, bool high_precision_time, const string& header_name)
+{
+  if (!t) {
+    return;
+  }
+  stringstream s;
+  utime_t tm = utime_t(*t);
+  if (high_precision_time) {
+    tm.gmtime_nsec(s);
+  } else {
+    tm.gmtime(s);
+  }
+  headers[header_name] = s.str();
+}
+
+// Stringify any streamable value into 'headers[header_name]'.
+template <class T>
+static void set_header(T val, map<string, string>& headers, const string& header_name)
+{
+  stringstream s;
+  s << val;
+  headers[header_name] = s.str();
+}
+
+
+// Legacy wrapper: packs the long argument list into get_obj_params and
+// forwards to the struct-based overload.
+// Fix: unmod_ptr and mod_zone_id were accepted but never copied into the
+// params struct (the struct-based overload reads both, see its use of
+// in_params.unmod_ptr / in_params.mod_zone_id), so the If-Unmodified-Since
+// and dest-zone-short-id conditions were silently dropped.
+int RGWRESTConn::get_obj(const DoutPrefixProvider *dpp, const rgw_user& uid, req_info *info /* optional */, const rgw_obj& obj,
+                         const real_time *mod_ptr, const real_time *unmod_ptr,
+                         uint32_t mod_zone_id, uint64_t mod_pg_ver,
+                         bool prepend_metadata, bool get_op, bool rgwx_stat,
+                         bool sync_manifest, bool skip_decrypt,
+                         rgw_zone_set_entry *dst_zone_trace, bool sync_cloudtiered,
+                         bool send, RGWHTTPStreamRWRequest::ReceiveCB *cb, RGWRESTStreamRWRequest **req)
+{
+  get_obj_params params;
+  params.uid = uid;
+  params.info = info;
+  params.mod_ptr = mod_ptr;
+  params.unmod_ptr = unmod_ptr;
+  params.mod_zone_id = mod_zone_id;
+  params.mod_pg_ver = mod_pg_ver;
+  params.prepend_metadata = prepend_metadata;
+  params.get_op = get_op;
+  params.rgwx_stat = rgwx_stat;
+  params.sync_manifest = sync_manifest;
+  params.skip_decrypt = skip_decrypt;
+  params.sync_cloudtiered = sync_cloudtiered;
+  params.dst_zone_trace = dst_zone_trace;
+  params.cb = cb;
+  return get_obj(dpp, obj, params, send, req);
+}
+
+// Build a GET (or HEAD when !get_op) request for 'obj' on the remote:
+// attaches the rgwx system parameters, forwards original x-amz-* headers,
+// sets conditional/range headers, prepares (and optionally sends) the
+// request. On success *req is owned by the caller; on failure it is freed
+// and nulled.
+int RGWRESTConn::get_obj(const DoutPrefixProvider *dpp, const rgw_obj& obj, const get_obj_params& in_params, bool send, RGWRESTStreamRWRequest **req)
+{
+  string url;
+  int ret = get_url(url);
+  if (ret < 0)
+    return ret;
+
+  param_vec_t params;
+  populate_params(params, &in_params.uid, self_zone_group);
+  if (in_params.prepend_metadata) {
+    params.push_back(param_pair_t(RGW_SYS_PARAM_PREFIX "prepend-metadata", "true"));
+  }
+  if (in_params.rgwx_stat) {
+    params.push_back(param_pair_t(RGW_SYS_PARAM_PREFIX "stat", "true"));
+  }
+  if (in_params.sync_manifest) {
+    params.push_back(param_pair_t(RGW_SYS_PARAM_PREFIX "sync-manifest", ""));
+  }
+  if (in_params.sync_cloudtiered) {
+    params.push_back(param_pair_t(RGW_SYS_PARAM_PREFIX "sync-cloudtiered", ""));
+  }
+  if (in_params.skip_decrypt) {
+    params.push_back(param_pair_t(RGW_SYS_PARAM_PREFIX "skip-decrypt", ""));
+  }
+  if (in_params.dst_zone_trace) {
+    params.push_back(param_pair_t(RGW_SYS_PARAM_PREFIX "if-not-replicated-to", in_params.dst_zone_trace->to_str()));
+  }
+  if (!obj.key.instance.empty()) {
+    params.push_back(param_pair_t("versionId", obj.key.instance));
+  }
+  if (in_params.get_op) {
+    *req = new RGWRESTStreamReadRequest(cct, url, in_params.cb, NULL, &params, api_name, host_style);
+  } else {
+    *req = new RGWRESTStreamHeadRequest(cct, url, in_params.cb, NULL, &params, api_name);
+  }
+  map<string, string> extra_headers;
+  if (in_params.info) {
+    const auto& orig_map = in_params.info->env->get_map();
+
+    /* add original headers that start with HTTP_X_AMZ_ */
+    static constexpr char SEARCH_AMZ_PREFIX[] = "HTTP_X_AMZ_";
+    for (auto iter= orig_map.lower_bound(SEARCH_AMZ_PREFIX); iter != orig_map.end(); ++iter) {
+      const string& name = iter->first;
+      if (name == "HTTP_X_AMZ_DATE") /* don't forward date from original request */
+        continue;
+      // the map is sorted, so we can stop at the first non-matching key
+      if (name.compare(0, strlen(SEARCH_AMZ_PREFIX), SEARCH_AMZ_PREFIX) != 0)
+        break;
+      extra_headers[iter->first] = iter->second;
+    }
+  }
+
+  set_date_header(in_params.mod_ptr, extra_headers, in_params.high_precision_time, "HTTP_IF_MODIFIED_SINCE");
+  set_date_header(in_params.unmod_ptr, extra_headers, in_params.high_precision_time, "HTTP_IF_UNMODIFIED_SINCE");
+  if (!in_params.etag.empty()) {
+    set_header(in_params.etag, extra_headers, "HTTP_IF_MATCH");
+  }
+  if (in_params.mod_zone_id != 0) {
+    set_header(in_params.mod_zone_id, extra_headers, "HTTP_DEST_ZONE_SHORT_ID");
+  }
+  if (in_params.mod_pg_ver != 0) {
+    set_header(in_params.mod_pg_ver, extra_headers, "HTTP_DEST_PG_VER");
+  }
+  if (in_params.range_is_set) {
+    char buf[64];
+    snprintf(buf, sizeof(buf), "bytes=%lld-%lld", (long long)in_params.range_start, (long long)in_params.range_end);
+    set_header(buf, extra_headers, "RANGE");
+  }
+
+  int r = (*req)->send_prepare(dpp, key, extra_headers, obj);
+  if (r < 0) {
+    goto done_err;
+  }
+
+  if (!send) {
+    return 0;
+  }
+
+  r = (*req)->send(nullptr);
+  if (r < 0) {
+    goto done_err;
+  }
+  return 0;
+done_err:
+  delete *req;
+  *req = nullptr;
+  return r;
+}
+
+// Wait for a streaming read/head request, extract the requested response
+// metadata, and free the request object (ownership transfers here).
+int RGWRESTConn::complete_request(RGWRESTStreamRWRequest *req,
+                                  string *etag,
+                                  real_time *mtime,
+                                  uint64_t *psize,
+                                  map<string, string> *pattrs,
+                                  map<string, string> *pheaders,
+                                  optional_yield y)
+{
+  int ret = req->complete_request(y, etag, mtime, psize, pattrs, pheaders);
+  delete req;
+
+  return ret;
+}
+
+// Synchronously GET an arbitrary resource path on the remote, collecting
+// the response body into 'bl'. Extra query params/headers are optional;
+// 'send_data' supplies an optional request body.
+int RGWRESTConn::get_resource(const DoutPrefixProvider *dpp,
+                              const string& resource,
+                              param_vec_t *extra_params,
+                              map<string, string> *extra_headers,
+                              bufferlist& bl,
+                              bufferlist *send_data,
+                              RGWHTTPManager *mgr,
+                              optional_yield y)
+{
+  string url;
+  int ret = get_url(url);
+  if (ret < 0)
+    return ret;
+
+  param_vec_t params;
+
+  if (extra_params) {
+    params.insert(params.end(), extra_params->begin(), extra_params->end());
+  }
+
+  populate_params(params, nullptr, self_zone_group);
+
+  RGWStreamIntoBufferlist cb(bl);
+
+  RGWRESTStreamReadRequest req(cct, url, &cb, NULL, &params, api_name, host_style);
+
+  map<string, string> headers;
+  if (extra_headers) {
+    headers.insert(extra_headers->begin(), extra_headers->end());
+  }
+
+  ret = req.send_request(dpp, &key, headers, resource, mgr, send_data);
+  if (ret < 0) {
+    ldpp_dout(dpp, 5) << __func__ << ": send_request() resource=" << resource << " returned ret=" << ret << dendl;
+    return ret;
+  }
+
+  return req.complete_request(y);
+}
+
/*
 * Synchronously issue a request with an arbitrary HTTP @method against
 * @resource on this connection, buffering the response body into @bl.
 * @send_data, when non-null, is sent as the request body.  Returns 0 on
 * success or a negative error code; failures of both phases are logged.
 */
int RGWRESTConn::send_resource(const DoutPrefixProvider *dpp, const std::string& method,
                               const std::string& resource, rgw_http_param_pair *extra_params,
                               std::map<std::string, std::string> *extra_headers, bufferlist& bl,
                               bufferlist *send_data, RGWHTTPManager *mgr, optional_yield y)
{
  std::string url;
  int ret = get_url(url);
  if (ret < 0)
    return ret;

  param_vec_t params;

  if (extra_params) {
    params = make_param_list(extra_params);
  }

  // add the connection-type-specific system params (zonegroup etc.)
  populate_params(params, nullptr, self_zone_group);

  RGWStreamIntoBufferlist cb(bl);

  RGWRESTStreamSendRequest req(cct, method, url, &cb, NULL, &params, api_name, host_style);

  std::map<std::string, std::string> headers;
  if (extra_headers) {
    headers.insert(extra_headers->begin(), extra_headers->end());
  }

  ret = req.send_request(dpp, &key, headers, resource, mgr, send_data);
  if (ret < 0) {
    ldpp_dout(dpp, 5) << __func__ << ": send_request() resource=" << resource << " returned ret=" << ret << dendl;
    return ret;
  }

  ret = req.complete_request(y);
  if (ret < 0) {
    ldpp_dout(dpp, 5) << __func__ << ": complete_request() resource=" << resource << " returned ret=" << ret << dendl;
  }

  return ret;
}
+
/*
 * Construct a read resource from a null-terminated key/value param array.
 * The stream request is built against the connection's base URL; params
 * and extra headers are attached in init_common().
 */
RGWRESTReadResource::RGWRESTReadResource(RGWRESTConn *_conn,
                                         const string& _resource,
                                         const rgw_http_param_pair *pp,
                                         param_vec_t *extra_headers,
                                         RGWHTTPManager *_mgr)
  : cct(_conn->get_ctx()), conn(_conn), resource(_resource),
    params(make_param_list(pp)), cb(bl), mgr(_mgr),
    req(cct, conn->get_url(), &cb, NULL, NULL, _conn->get_api_name())
{
  init_common(extra_headers);
}
+
/*
 * Construct a read resource from an already-built param vector (copied).
 * Otherwise identical to the rgw_http_param_pair overload above.
 */
RGWRESTReadResource::RGWRESTReadResource(RGWRESTConn *_conn,
                                         const string& _resource,
                                         param_vec_t& _params,
                                         param_vec_t *extra_headers,
                                         RGWHTTPManager *_mgr)
  : cct(_conn->get_ctx()), conn(_conn), resource(_resource), params(_params),
    cb(bl), mgr(_mgr), req(cct, conn->get_url(), &cb, NULL, NULL, _conn->get_api_name())
{
  init_common(extra_headers);
}
+
+void RGWRESTReadResource::init_common(param_vec_t *extra_headers)
+{
+ conn->populate_params(params, nullptr, conn->get_self_zonegroup());
+
+ if (extra_headers) {
+ headers.insert(extra_headers->begin(), extra_headers->end());
+ }
+
+ req.set_params(&params);
+}
+
+int RGWRESTReadResource::read(const DoutPrefixProvider *dpp, optional_yield y)
+{
+ int ret = req.send_request(dpp, &conn->get_key(), headers, resource, mgr);
+ if (ret < 0) {
+ ldpp_dout(dpp, 5) << __func__ << ": send_request() resource=" << resource << " returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ return req.complete_request(y);
+}
+
+int RGWRESTReadResource::aio_read(const DoutPrefixProvider *dpp)
+{
+ int ret = req.send_request(dpp, &conn->get_key(), headers, resource, mgr);
+ if (ret < 0) {
+ ldpp_dout(dpp, 5) << __func__ << ": send_request() resource=" << resource << " returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ return 0;
+}
+
/*
 * Construct a send resource (arbitrary HTTP method) from a
 * null-terminated key/value param array.  Params and extra headers are
 * attached in init_common().
 */
RGWRESTSendResource::RGWRESTSendResource(RGWRESTConn *_conn,
                                         const string& _method,
                                         const string& _resource,
                                         const rgw_http_param_pair *pp,
                                         param_vec_t *extra_headers,
                                         RGWHTTPManager *_mgr)
  : cct(_conn->get_ctx()), conn(_conn), method(_method), resource(_resource),
    params(make_param_list(pp)), cb(bl), mgr(_mgr),
    req(cct, method.c_str(), conn->get_url(), &cb, NULL, NULL, _conn->get_api_name(), _conn->get_host_style())
{
  init_common(extra_headers);
}
+
/*
 * Construct a send resource from an already-built param vector (copied).
 * Otherwise identical to the rgw_http_param_pair overload above.
 */
RGWRESTSendResource::RGWRESTSendResource(RGWRESTConn *_conn,
                                         const string& _method,
                                         const string& _resource,
                                         param_vec_t& params,
                                         param_vec_t *extra_headers,
                                         RGWHTTPManager *_mgr)
  : cct(_conn->get_ctx()), conn(_conn), method(_method), resource(_resource), params(params),
    cb(bl), mgr(_mgr), req(cct, method.c_str(), conn->get_url(), &cb, NULL, NULL, _conn->get_api_name(), _conn->get_host_style())
{
  init_common(extra_headers);
}
+
+void RGWRESTSendResource::init_common(param_vec_t *extra_headers)
+{
+ conn->populate_params(params, nullptr, conn->get_self_zonegroup());
+
+ if (extra_headers) {
+ headers.insert(extra_headers->begin(), extra_headers->end());
+ }
+
+ req.set_params(&params);
+}
+
+int RGWRESTSendResource::send(const DoutPrefixProvider *dpp, bufferlist& outbl, optional_yield y)
+{
+ req.set_send_length(outbl.length());
+ req.set_outbl(outbl);
+
+ int ret = req.send_request(dpp, &conn->get_key(), headers, resource, mgr);
+ if (ret < 0) {
+ ldpp_dout(dpp, 5) << __func__ << ": send_request() resource=" << resource << " returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ return req.complete_request(y);
+}
+
+int RGWRESTSendResource::aio_send(const DoutPrefixProvider *dpp, bufferlist& outbl)
+{
+ req.set_send_length(outbl.length());
+ req.set_outbl(outbl);
+
+ int ret = req.send_request(dpp, &conn->get_key(), headers, resource, mgr);
+ if (ret < 0) {
+ ldpp_dout(dpp, 5) << __func__ << ": send_request() resource=" << resource << " returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ return 0;
+}
diff --git a/src/rgw/rgw_rest_conn.h b/src/rgw/rgw_rest_conn.h
new file mode 100644
index 000000000..81f839f49
--- /dev/null
+++ b/src/rgw/rgw_rest_conn.h
@@ -0,0 +1,557 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_rest_client.h"
+#include "common/ceph_json.h"
+#include "common/RefCountedObj.h"
+#include "include/common_fwd.h"
+#include "rgw_sal_fwd.h"
+
+#include <atomic>
+
+class RGWSI_Zone;
+
+template<class T>
+inline int parse_decode_json(T& t, bufferlist& bl)
+{
+ JSONParser p;
+ if (!p.parse(bl.c_str(), bl.length())) {
+ return -EINVAL;
+ }
+
+ try {
+ decode_json_obj(t, &p);
+ } catch (JSONDecoder::err& e) {
+ return -EINVAL;
+ }
+ return 0;
+}
+
// key/value pair for building query-string parameter lists; arrays of
// these are terminated by an entry with key == nullptr
struct rgw_http_param_pair {
  const char *key;
  const char *val;  // nullptr is treated as an empty value
};
+
+// append a null-terminated rgw_http_param_pair list into a list of string pairs
+inline void append_param_list(param_vec_t& params, const rgw_http_param_pair* pp)
+{
+ while (pp && pp->key) {
+ std::string k = pp->key;
+ std::string v = (pp->val ? pp->val : "");
+ params.emplace_back(make_pair(std::move(k), std::move(v)));
+ ++pp;
+ }
+}
+
+// copy a null-terminated rgw_http_param_pair list into a list of std::string pairs
+inline param_vec_t make_param_list(const rgw_http_param_pair* pp)
+{
+ param_vec_t params;
+ append_param_list(params, pp);
+ return params;
+}
+
+inline param_vec_t make_param_list(const std::map<std::string, std::string> *pp)
+{
+ param_vec_t params;
+ if (!pp) {
+ return params;
+ }
+ for (auto iter : *pp) {
+ params.emplace_back(make_pair(iter.first, iter.second));
+ }
+ return params;
+}
+
/*
 * Connection to a remote RGW endpoint (e.g. a peer zone).  Holds the
 * candidate endpoint URLs, the access key used to sign requests, and the
 * host addressing style, and offers sync/async helpers for forwarding
 * requests and fetching objects/resources.
 */
class RGWRESTConn
{
  CephContext *cct;
  std::vector<std::string> endpoints;
  RGWAccessKey key;
  std::string self_zone_group;
  std::string remote_id;
  std::optional<std::string> api_name;
  HostStyle host_style;
  // per-connection request counter consumed by get_url() (definition not
  // in this header) -- presumably used to spread requests across
  // 'endpoints'; atomic, which is why the custom move ops below exist
  std::atomic<int64_t> counter = { 0 };

public:

  RGWRESTConn(CephContext *_cct,
              rgw::sal::Driver* driver,
              const std::string& _remote_id,
              const std::list<std::string>& endpoints,
              std::optional<std::string> _api_name,
              HostStyle _host_style = PathStyle);
  RGWRESTConn(CephContext *_cct,
              const std::string& _remote_id,
              const std::list<std::string>& endpoints,
              RGWAccessKey _cred,
              std::string _zone_group,
              std::optional<std::string> _api_name,
              HostStyle _host_style = PathStyle);

  // custom move needed for atomic
  RGWRESTConn(RGWRESTConn&& other);
  RGWRESTConn& operator=(RGWRESTConn&& other);
  virtual ~RGWRESTConn() = default;

  // pick an endpoint URL; int overload reports failure via return code
  int get_url(std::string& endpoint);
  std::string get_url();
  const std::string& get_self_zonegroup() {
    return self_zone_group;
  }
  const std::string& get_remote_id() {
    return remote_id;
  }
  RGWAccessKey& get_key() {
    return key;
  }

  std::optional<std::string> get_api_name() const {
    return api_name;
  }

  HostStyle get_host_style() {
    return host_style;
  }

  CephContext *get_ctx() {
    return cct;
  }
  size_t get_endpoint_count() const { return endpoints.size(); }

  // add connection-type-specific system query params (zonegroup/uid);
  // overridden to a no-op by S3RESTConn below
  virtual void populate_params(param_vec_t& params, const rgw_user *uid, const std::string& zonegroup);

  /* sync request: forward an incoming request to the remote end */
  int forward(const DoutPrefixProvider *dpp, const rgw_user& uid, req_info& info, obj_version *objv, size_t max_response, bufferlist *inbl, bufferlist *outbl, optional_yield y);

  /* sync request: forward an IAM request, signed with the given key */
  int forward_iam_request(const DoutPrefixProvider *dpp, const RGWAccessKey& key, req_info& info, obj_version *objv, size_t max_response, bufferlist *inbl, bufferlist *outbl, optional_yield y);


  /* async requests */
  int put_obj_send_init(const rgw_obj& obj, const rgw_http_param_pair *extra_params, RGWRESTStreamS3PutObj **req);
  int put_obj_async_init(const DoutPrefixProvider *dpp, const rgw_user& uid, const rgw_obj& obj,
                         std::map<std::string, bufferlist>& attrs, RGWRESTStreamS3PutObj **req);
  int complete_request(RGWRESTStreamS3PutObj *req, std::string& etag,
                       ceph::real_time *mtime, optional_yield y);

  /* parameter bundle for the struct-based get_obj() overload below;
   * fields map onto conditional-GET request headers (If-Modified-Since,
   * If-Match, zone/pg version checks, byte range) */
  struct get_obj_params {
    rgw_user uid;
    req_info *info{nullptr};
    const ceph::real_time *mod_ptr{nullptr};    // If-Modified-Since
    const ceph::real_time *unmod_ptr{nullptr};  // If-Unmodified-Since
    bool high_precision_time{true};

    std::string etag;  // If-Match value; empty means "don't send"

    uint32_t mod_zone_id{0};  // sent as HTTP_DEST_ZONE_SHORT_ID when non-zero
    uint64_t mod_pg_ver{0};   // sent as HTTP_DEST_PG_VER when non-zero

    bool prepend_metadata{false};
    bool get_op{false};
    bool rgwx_stat{false};
    bool sync_manifest{false};
    bool sync_cloudtiered{false};

    bool skip_decrypt{true};
    RGWHTTPStreamRWRequest::ReceiveCB *cb{nullptr};

    bool range_is_set{false};  // when true, send "Range: bytes=start-end"
    uint64_t range_start{0};
    uint64_t range_end{0};
    rgw_zone_set_entry *dst_zone_trace{nullptr};
  };

  int get_obj(const DoutPrefixProvider *dpp, const rgw_obj& obj, const get_obj_params& params, bool send, RGWRESTStreamRWRequest **req);

  // legacy flat-argument overload of the above
  int get_obj(const DoutPrefixProvider *dpp, const rgw_user& uid, req_info *info /* optional */, const rgw_obj& obj,
              const ceph::real_time *mod_ptr, const ceph::real_time *unmod_ptr,
              uint32_t mod_zone_id, uint64_t mod_pg_ver,
              bool prepend_metadata, bool get_op, bool rgwx_stat, bool sync_manifest,
              bool skip_decrypt, rgw_zone_set_entry *dst_zone_trace, bool sync_cloudtiered,
              bool send, RGWHTTPStreamRWRequest::ReceiveCB *cb, RGWRESTStreamRWRequest **req);
  // finish and delete a request started by get_obj()
  int complete_request(RGWRESTStreamRWRequest *req,
                       std::string *etag,
                       ceph::real_time *mtime,
                       uint64_t *psize,
                       std::map<std::string, std::string> *pattrs,
                       std::map<std::string, std::string> *pheaders,
                       optional_yield y);

  int get_resource(const DoutPrefixProvider *dpp,
                   const std::string& resource,
                   param_vec_t *extra_params,
                   std::map<std::string, std::string>* extra_headers,
                   bufferlist& bl,
                   bufferlist *send_data,
                   RGWHTTPManager *mgr,
                   optional_yield y);

  int send_resource(const DoutPrefixProvider *dpp,
                    const std::string& method,
                    const std::string& resource,
                    rgw_http_param_pair *extra_params,
                    std::map<std::string, std::string>* extra_headers,
                    bufferlist& bl,
                    bufferlist *send_data,
                    RGWHTTPManager *mgr,
                    optional_yield y);

  // get_resource() + JSON-decode of the response into 't'
  template <class T>
  int get_json_resource(const DoutPrefixProvider *dpp, const std::string& resource, param_vec_t *params,
                        bufferlist *in_data, optional_yield y, T& t);
  template <class T>
  int get_json_resource(const DoutPrefixProvider *dpp, const std::string& resource, param_vec_t *params,
                        optional_yield y, T& t);
  template <class T>
  int get_json_resource(const DoutPrefixProvider *dpp, const std::string& resource, const rgw_http_param_pair *pp,
                        optional_yield y, T& t);

private:
  // append the rgwx zonegroup system param when a zonegroup is known
  void populate_zonegroup(param_vec_t& params, const std::string& zonegroup) {
    if (!zonegroup.empty()) {
      params.push_back(param_pair_t(RGW_SYS_PARAM_PREFIX "zonegroup", zonegroup));
    }
  }
  // append the rgwx uid system param for non-empty users
  void populate_uid(param_vec_t& params, const rgw_user *uid) {
    if (uid) {
      std::string uid_str = uid->to_str();
      if (!uid->empty()){
        params.push_back(param_pair_t(RGW_SYS_PARAM_PREFIX "uid", uid_str));
      }
    }
  }
};
+
/*
 * RGWRESTConn variant for talking to plain S3 endpoints: suppresses the
 * RGW system query params (zonegroup/uid) that the base class adds,
 * since a generic S3 server would not understand them.
 */
class S3RESTConn : public RGWRESTConn {

public:

  S3RESTConn(CephContext *_cct, rgw::sal::Driver* driver, const std::string& _remote_id, const std::list<std::string>& endpoints, std::optional<std::string> _api_name, HostStyle _host_style = PathStyle) :
    RGWRESTConn(_cct, driver, _remote_id, endpoints, _api_name, _host_style) {}
  S3RESTConn(CephContext *_cct, const std::string& _remote_id, const std::list<std::string>& endpoints, RGWAccessKey _cred, std::string _zone_group, std::optional<std::string> _api_name, HostStyle _host_style = PathStyle):
    RGWRESTConn(_cct, _remote_id, endpoints, _cred, _zone_group, _api_name, _host_style) {}
  ~S3RESTConn() override = default;

  void populate_params(param_vec_t& params, const rgw_user *uid, const std::string& zonegroup) override {
    // do not populate any params in S3 REST Connection.
    return;
  }
};
+
+
+template<class T>
+int RGWRESTConn::get_json_resource(const DoutPrefixProvider *dpp, const std::string& resource, param_vec_t *params,
+ bufferlist *in_data, optional_yield y, T& t)
+{
+ bufferlist bl;
+ int ret = get_resource(dpp, resource, params, nullptr, bl, in_data, nullptr, y);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = parse_decode_json(t, bl);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return 0;
+}
+
// convenience overload: no request body
template<class T>
int RGWRESTConn::get_json_resource(const DoutPrefixProvider *dpp, const std::string& resource, param_vec_t *params,
                                   optional_yield y, T& t)
{
  return get_json_resource(dpp, resource, params, nullptr, y, t);
}
+
// convenience overload: params given as a null-terminated pair array
template<class T>
int RGWRESTConn::get_json_resource(const DoutPrefixProvider *dpp, const std::string& resource, const rgw_http_param_pair *pp,
                                   optional_yield y, T& t)
{
  param_vec_t params = make_param_list(pp);
  return get_json_resource(dpp, resource, &params, y, t);
}
+
// ReceiveCB that accumulates all received response data into a
// caller-owned bufferlist.
class RGWStreamIntoBufferlist : public RGWHTTPStreamRWRequest::ReceiveCB {
  bufferlist& bl;
public:
  explicit RGWStreamIntoBufferlist(bufferlist& _bl) : bl(_bl) {}
  int handle_data(bufferlist& inbl, bool *pause) override {
    bl.claim_append(inbl);
    // NOTE(review): claim_append() moves the buffers out of inbl, so
    // this returns inbl's post-move length (0) -- confirm the HTTP layer
    // treats that as "all data consumed".
    return inbl.length();
  }
};
+
/*
 * Ref-counted helper for reading a REST resource from a peer connection
 * into an internal bufferlist.  Use read()/fetch() for synchronous
 * access, or aio_read() followed by one of the wait() overloads when
 * driving the request through an RGWHTTPManager.
 */
class RGWRESTReadResource : public RefCountedObject, public RGWIOProvider {
  CephContext *cct;
  RGWRESTConn *conn;
  std::string resource;
  param_vec_t params;
  std::map<std::string, std::string> headers;
  bufferlist bl;               // accumulated response body
  RGWStreamIntoBufferlist cb;  // streams response data into bl

  RGWHTTPManager *mgr;
  RGWRESTStreamReadRequest req;

  void init_common(param_vec_t *extra_headers);

public:
  RGWRESTReadResource(RGWRESTConn *_conn,
                      const std::string& _resource,
                      const rgw_http_param_pair *pp,
                      param_vec_t *extra_headers,
                      RGWHTTPManager *_mgr);

  RGWRESTReadResource(RGWRESTConn *_conn,
                      const std::string& _resource,
                      param_vec_t& _params,
                      param_vec_t *extra_headers,
                      RGWHTTPManager *_mgr);
  ~RGWRESTReadResource() = default;

  rgw_io_id get_io_id(int io_type) {
    return req.get_io_id(io_type);
  }

  void set_io_user_info(void *user_info) override {
    req.set_io_user_info(user_info);
  }

  void *get_io_user_info() override {
    return req.get_io_user_info();
  }

  // JSON-decode the buffered response body into *dest; fails if the
  // request status is negative
  template <class T>
  int decode_resource(T *dest);

  // send the request and wait for completion
  int read(const DoutPrefixProvider *dpp, optional_yield y);

  // send the request without waiting; pair with wait()
  int aio_read(const DoutPrefixProvider *dpp);

  std::string to_str() {
    return req.to_str();
  }

  int get_http_status() {
    return req.get_http_status();
  }

  // wait for an aio_read() to finish and hand back the raw body
  int wait(bufferlist *pbl, optional_yield y) {
    int ret = req.wait(y);
    if (ret < 0) {
      return ret;
    }

    if (req.get_status() < 0) {
      return req.get_status();
    }
    *pbl = bl;
    return 0;
  }

  // wait for an aio_read() to finish and JSON-decode the body into *dest
  template <class T>
  int wait(T *dest, optional_yield y);

  // read() followed by decode_resource()
  template <class T>
  int fetch(const DoutPrefixProvider *dpp, T *dest, optional_yield y);
};
+
+
+template <class T>
+int RGWRESTReadResource::decode_resource(T *dest)
+{
+ int ret = req.get_status();
+ if (ret < 0) {
+ return ret;
+ }
+ ret = parse_decode_json(*dest, bl);
+ if (ret < 0) {
+ return ret;
+ }
+ return 0;
+}
+
+template <class T>
+int RGWRESTReadResource::fetch(const DoutPrefixProvider *dpp, T *dest, optional_yield y)
+{
+ int ret = read(dpp, y);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = decode_resource(dest);
+ if (ret < 0) {
+ return ret;
+ }
+ return 0;
+}
+
+template <class T>
+int RGWRESTReadResource::wait(T *dest, optional_yield y)
+{
+ int ret = req.wait(y);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = decode_resource(dest);
+ if (ret < 0) {
+ return ret;
+ }
+ return 0;
+}
+
/*
 * Ref-counted helper for sending a request body to a peer connection
 * with an arbitrary HTTP method (see the POST/PUT/DELETE subclasses
 * below), buffering the response.  Use send() synchronously, or
 * aio_send() followed by one of the wait() overloads.
 */
class RGWRESTSendResource : public RefCountedObject, public RGWIOProvider {
  CephContext *cct;
  RGWRESTConn *conn;
  std::string method;  // HTTP method, fixed at construction
  std::string resource;
  param_vec_t params;
  std::map<std::string, std::string> headers;
  bufferlist bl;               // accumulated response body
  RGWStreamIntoBufferlist cb;  // streams response data into bl

  RGWHTTPManager *mgr;
  RGWRESTStreamRWRequest req;

  void init_common(param_vec_t *extra_headers);

public:
  RGWRESTSendResource(RGWRESTConn *_conn,
                      const std::string& _method,
                      const std::string& _resource,
                      const rgw_http_param_pair *pp,
                      param_vec_t *extra_headers,
                      RGWHTTPManager *_mgr);

  RGWRESTSendResource(RGWRESTConn *_conn,
                      const std::string& _method,
                      const std::string& _resource,
                      param_vec_t& params,
                      param_vec_t *extra_headers,
                      RGWHTTPManager *_mgr);

  ~RGWRESTSendResource() = default;

  rgw_io_id get_io_id(int io_type) {
    return req.get_io_id(io_type);
  }

  void set_io_user_info(void *user_info) override {
    req.set_io_user_info(user_info);
  }

  void *get_io_user_info() override {
    return req.get_io_user_info();
  }

  // send bl as the request body and wait for completion
  int send(const DoutPrefixProvider *dpp, bufferlist& bl, optional_yield y);

  // send bl without waiting; pair with wait()
  int aio_send(const DoutPrefixProvider *dpp, bufferlist& bl);

  std::string to_str() {
    return req.to_str();
  }

  int get_http_status() {
    return req.get_http_status();
  }

  // Wait for an aio_send() to finish; hands back the raw response body
  // and, on failure, optionally decodes it into *err_result.
  template <class E = int>
  int wait(bufferlist *pbl, optional_yield y, E *err_result = nullptr) {
    int ret = req.wait(y);
    *pbl = bl;

    if (ret < 0 && err_result ) {
      ret = parse_decode_json(*err_result, bl);
    }

    // NOTE(review): 'ret' (wait/parse result) is discarded and the
    // request-level status is returned instead -- looks intentional for
    // surfacing the server-side error, but confirm.
    return req.get_status();
  }

  template <class T, class E = int>
  int wait(T *dest, optional_yield y, E *err_result = nullptr);
};
+
/*
 * Wait for an aio_send() to finish and JSON-decode the response body
 * into *dest.  On failure, the body is first decoded into *err_result
 * when one is provided.  NOTE(review): a successful error-body decode
 * resets 'ret' to 0, so decoding into *dest is still attempted in that
 * case -- confirm this is intended.
 */
template <class T, class E>
int RGWRESTSendResource::wait(T *dest, optional_yield y, E *err_result)
{
  int ret = req.wait(y);
  if (ret >= 0) {
    // transport succeeded; use the request-level status instead
    ret = req.get_status();
  }

  if (ret < 0 && err_result) {
    ret = parse_decode_json(*err_result, bl);
  }

  if (ret < 0) {
    return ret;
  }

  ret = parse_decode_json(*dest, bl);
  if (ret < 0) {
    return ret;
  }
  return 0;

}
+
// RGWRESTSendResource specialization with the HTTP method fixed to POST
class RGWRESTPostResource : public RGWRESTSendResource {
public:
  RGWRESTPostResource(RGWRESTConn *_conn,
                      const std::string& _resource,
                      const rgw_http_param_pair *pp,
                      param_vec_t *extra_headers,
                      RGWHTTPManager *_mgr) : RGWRESTSendResource(_conn, "POST", _resource,
                                                                  pp, extra_headers, _mgr) {}

  RGWRESTPostResource(RGWRESTConn *_conn,
                      const std::string& _resource,
                      param_vec_t& params,
                      param_vec_t *extra_headers,
                      RGWHTTPManager *_mgr) : RGWRESTSendResource(_conn, "POST", _resource,
                                                                  params, extra_headers, _mgr) {}

};
+
// RGWRESTSendResource specialization with the HTTP method fixed to PUT
class RGWRESTPutResource : public RGWRESTSendResource {
public:
  RGWRESTPutResource(RGWRESTConn *_conn,
                     const std::string& _resource,
                     const rgw_http_param_pair *pp,
                     param_vec_t *extra_headers,
                     RGWHTTPManager *_mgr) : RGWRESTSendResource(_conn, "PUT", _resource,
                                                                 pp, extra_headers, _mgr) {}

  RGWRESTPutResource(RGWRESTConn *_conn,
                     const std::string& _resource,
                     param_vec_t& params,
                     param_vec_t *extra_headers,
                     RGWHTTPManager *_mgr) : RGWRESTSendResource(_conn, "PUT", _resource,
                                                                 params, extra_headers, _mgr) {}

};
+
// RGWRESTSendResource specialization with the HTTP method fixed to DELETE
class RGWRESTDeleteResource : public RGWRESTSendResource {
public:
  RGWRESTDeleteResource(RGWRESTConn *_conn,
                        const std::string& _resource,
                        const rgw_http_param_pair *pp,
                        param_vec_t *extra_headers,
                        RGWHTTPManager *_mgr) : RGWRESTSendResource(_conn, "DELETE", _resource,
                                                                    pp, extra_headers, _mgr) {}

  RGWRESTDeleteResource(RGWRESTConn *_conn,
                        const std::string& _resource,
                        param_vec_t& params,
                        param_vec_t *extra_headers,
                        RGWHTTPManager *_mgr) : RGWRESTSendResource(_conn, "DELETE", _resource,
                                                                    params, extra_headers, _mgr) {}

};
diff --git a/src/rgw/rgw_rest_iam.cc b/src/rgw/rgw_rest_iam.cc
new file mode 100644
index 000000000..b9e8779c1
--- /dev/null
+++ b/src/rgw/rgw_rest_iam.cc
@@ -0,0 +1,90 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <boost/tokenizer.hpp>
+
+#include "rgw_auth_s3.h"
+#include "rgw_rest_iam.h"
+
+#include "rgw_rest_role.h"
+#include "rgw_rest_user_policy.h"
+#include "rgw_rest_oidc_provider.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
using op_generator = RGWOp*(*)(const bufferlist&);
// Dispatch table mapping the "Action" request argument to a factory for
// the matching RGWOp.  Some ops take the raw POST body in their
// constructor; the remaining factories ignore the argument.
static const std::unordered_map<std::string_view, op_generator> op_generators = {
  {"CreateRole", [](const bufferlist& bl_post_body) -> RGWOp* {return new RGWCreateRole(bl_post_body);}},
  {"DeleteRole", [](const bufferlist& bl_post_body) -> RGWOp* {return new RGWDeleteRole(bl_post_body);}},
  {"GetRole", [](const bufferlist& bl_post_body) -> RGWOp* {return new RGWGetRole;}},
  {"UpdateAssumeRolePolicy", [](const bufferlist& bl_post_body) -> RGWOp* {return new RGWModifyRoleTrustPolicy(bl_post_body);}},
  {"ListRoles", [](const bufferlist& bl_post_body) -> RGWOp* {return new RGWListRoles;}},
  {"PutRolePolicy", [](const bufferlist& bl_post_body) -> RGWOp* {return new RGWPutRolePolicy(bl_post_body);}},
  {"GetRolePolicy", [](const bufferlist& bl_post_body) -> RGWOp* {return new RGWGetRolePolicy;}},
  {"ListRolePolicies", [](const bufferlist& bl_post_body) -> RGWOp* {return new RGWListRolePolicies;}},
  {"DeleteRolePolicy", [](const bufferlist& bl_post_body) -> RGWOp* {return new RGWDeleteRolePolicy(bl_post_body);}},
  {"PutUserPolicy", [](const bufferlist& bl_post_body) -> RGWOp* {return new RGWPutUserPolicy;}},
  {"GetUserPolicy", [](const bufferlist& bl_post_body) -> RGWOp* {return new RGWGetUserPolicy;}},
  {"ListUserPolicies", [](const bufferlist& bl_post_body) -> RGWOp* {return new RGWListUserPolicies;}},
  {"DeleteUserPolicy", [](const bufferlist& bl_post_body) -> RGWOp* {return new RGWDeleteUserPolicy;}},
  {"CreateOpenIDConnectProvider", [](const bufferlist& bl_post_body) -> RGWOp* {return new RGWCreateOIDCProvider;}},
  {"ListOpenIDConnectProviders", [](const bufferlist& bl_post_body) -> RGWOp* {return new RGWListOIDCProviders;}},
  {"GetOpenIDConnectProvider", [](const bufferlist& bl_post_body) -> RGWOp* {return new RGWGetOIDCProvider;}},
  {"DeleteOpenIDConnectProvider", [](const bufferlist& bl_post_body) -> RGWOp* {return new RGWDeleteOIDCProvider;}},
  {"TagRole", [](const bufferlist& bl_post_body) -> RGWOp* {return new RGWTagRole(bl_post_body);}},
  {"ListRoleTags", [](const bufferlist& bl_post_body) -> RGWOp* {return new RGWListRoleTags;}},
  {"UntagRole", [](const bufferlist& bl_post_body) -> RGWOp* {return new RGWUntagRole(bl_post_body);}},
  {"UpdateRole", [](const bufferlist& bl_post_body) -> RGWOp* {return new RGWUpdateRole(bl_post_body);}}
};
+
+bool RGWHandler_REST_IAM::action_exists(const req_state* s)
+{
+ if (s->info.args.exists("Action")) {
+ const std::string action_name = s->info.args.get("Action");
+ return op_generators.contains(action_name);
+ }
+ return false;
+}
+
+RGWOp *RGWHandler_REST_IAM::op_post()
+{
+ if (s->info.args.exists("Action")) {
+ const std::string action_name = s->info.args.get("Action");
+ const auto action_it = op_generators.find(action_name);
+ if (action_it != op_generators.end()) {
+ return action_it->second(bl_post_body);
+ }
+ ldpp_dout(s, 10) << "unknown action '" << action_name << "' for IAM handler" << dendl;
+ } else {
+ ldpp_dout(s, 10) << "missing action argument in IAM handler" << dendl;
+ }
+ return nullptr;
+}
+
// Per-request initialization: tag the request as IAM-dialect before
// delegating to the common REST handler init.
int RGWHandler_REST_IAM::init(rgw::sal::Driver* driver,
                              req_state *s,
                              rgw::io::BasicClient *cio)
{
  s->dialect = "iam";
  s->prot_flags = RGW_REST_IAM;

  return RGWHandler_REST::init(driver, s, cio);
}
+
// IAM requests authenticate with S3-style credentials/signatures.
int RGWHandler_REST_IAM::authorize(const DoutPrefixProvider* dpp, optional_yield y)
{
  return RGW_Auth_S3::authorize(dpp, driver, auth_registry, s, y);
}
+
RGWHandler_REST*
RGWRESTMgr_IAM::get_handler(rgw::sal::Driver* driver,
                            req_state* const s,
                            const rgw::auth::StrategyRegistry& auth_registry,
                            const std::string& frontend_prefix)
{
  // NOTE(review): the handler is constructed with an empty POST body
  // here -- presumably the real body is supplied through a different
  // path; confirm against the callers.
  bufferlist bl;
  return new RGWHandler_REST_IAM(auth_registry, bl);
}
diff --git a/src/rgw/rgw_rest_iam.h b/src/rgw/rgw_rest_iam.h
new file mode 100644
index 000000000..3e579ab35
--- /dev/null
+++ b/src/rgw/rgw_rest_iam.h
@@ -0,0 +1,48 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_auth.h"
+#include "rgw_auth_filters.h"
+#include "rgw_rest.h"
+
// REST handler for the IAM API.  Only POST is implemented (op_post);
// the raw POST body is retained so op factories that need it can
// receive it.
class RGWHandler_REST_IAM : public RGWHandler_REST {
  const rgw::auth::StrategyRegistry& auth_registry;
  bufferlist bl_post_body;  // raw POST body handed to op factories
  RGWOp *op_post() override;

public:

  // true iff the request's "Action" arg names a supported IAM op
  static bool action_exists(const req_state* s);

  RGWHandler_REST_IAM(const rgw::auth::StrategyRegistry& auth_registry,
                      bufferlist& bl_post_body)
    : RGWHandler_REST(),
      auth_registry(auth_registry),
      bl_post_body(bl_post_body) {}
  ~RGWHandler_REST_IAM() override = default;

  int init(rgw::sal::Driver* driver,
           req_state *s,
           rgw::io::BasicClient *cio) override;
  int authorize(const DoutPrefixProvider* dpp, optional_yield y) override;
  int postauth_init(optional_yield y) override { return 0; }
};
+
// REST manager for the IAM API; serves as its own resource manager and
// hands out RGWHandler_REST_IAM instances.
class RGWRESTMgr_IAM : public RGWRESTMgr {
public:
  RGWRESTMgr_IAM() = default;
  ~RGWRESTMgr_IAM() override = default;

  RGWRESTMgr *get_resource_mgr(req_state* const s,
                               const std::string& uri,
                               std::string* const out_uri) override {
    return this;
  }

  RGWHandler_REST* get_handler(rgw::sal::Driver* driver,
                               req_state*,
                               const rgw::auth::StrategyRegistry&,
                               const std::string&) override;
};
diff --git a/src/rgw/rgw_rest_info.cc b/src/rgw/rgw_rest_info.cc
new file mode 100644
index 000000000..65323dd00
--- /dev/null
+++ b/src/rgw/rgw_rest_info.cc
@@ -0,0 +1,49 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_op.h"
+#include "rgw_rest_info.h"
+#include "rgw_sal.h"
+
+#define dout_subsys ceph_subsys_rgw
+
/* Admin op returning general service information; requires read access
 * to the "info" capability. */
class RGWOp_Info_Get : public RGWRESTOp {

public:
  RGWOp_Info_Get() {}

  int check_caps(const RGWUserCaps& caps) override {
    return caps.check_cap("info", RGW_CAP_READ);
  }
  void execute(optional_yield y) override;

  const char* name() const override { return "get_info"; }
};
+
/* Emit the service-info JSON document through the request's formatter. */
void RGWOp_Info_Get::execute(optional_yield y) {
  Formatter *formatter = flusher.get_formatter();
  flusher.start(0);

  /* extensible array of general info sections, currently only
   * storage backend is defined:
   * {"info":{"storage_backends":[{"name":"rados","cluster_id":"75d1938b-2949-4933-8386-fb2d1449ff03"}]}}
   */
  formatter->open_object_section("dummy");  // unnamed JSON root object
  formatter->open_object_section("info");
  formatter->open_array_section("storage_backends");
  // for now, just return the backend that is accessible
  formatter->open_object_section("dummy");  // anonymous array element
  formatter->dump_string("name", driver->get_name());
  formatter->dump_string("cluster_id", driver->get_cluster_id(this, y));
  formatter->close_section();  // backend entry
  formatter->close_section();  // storage_backends
  formatter->close_section();  // info
  formatter->close_section();  // root

  flusher.flush();
} /* RGWOp_Info_Get::execute */
+
// GET on the info endpoint always maps to RGWOp_Info_Get
RGWOp *RGWHandler_Info::op_get()
{
  return new RGWOp_Info_Get;
}
diff --git a/src/rgw/rgw_rest_info.h b/src/rgw/rgw_rest_info.h
new file mode 100644
index 000000000..0c4467073
--- /dev/null
+++ b/src/rgw/rgw_rest_info.h
@@ -0,0 +1,33 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_rest.h"
+#include "rgw_rest_s3.h"
+
+
// Handler for the admin "info" endpoint; GET only.  Permission checking
// is done via caps in the op itself, so read_permissions is a no-op.
class RGWHandler_Info : public RGWHandler_Auth_S3 {
protected:
  RGWOp *op_get() override;
public:
  using RGWHandler_Auth_S3::RGWHandler_Auth_S3;
  ~RGWHandler_Info() override = default;

  int read_permissions(RGWOp*, optional_yield) override {
    return 0;
  }
};
+
// REST manager for the info endpoint; hands out RGWHandler_Info
// instances.
class RGWRESTMgr_Info : public RGWRESTMgr {
public:
  RGWRESTMgr_Info() = default;
  ~RGWRESTMgr_Info() override = default;

  RGWHandler_REST* get_handler(rgw::sal::Driver* driver,
                               req_state*,
                               const rgw::auth::StrategyRegistry& auth_registry,
                               const std::string&) override {
    return new RGWHandler_Info(auth_registry);
  }
};
diff --git a/src/rgw/rgw_rest_metadata.cc b/src/rgw/rgw_rest_metadata.cc
new file mode 100644
index 000000000..23f78819c
--- /dev/null
+++ b/src/rgw/rgw_rest_metadata.cc
@@ -0,0 +1,321 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 eNovance SAS <licensing@enovance.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/page.h"
+
+#include "rgw_rest.h"
+#include "rgw_op.h"
+#include "rgw_rest_s3.h"
+#include "rgw_rest_metadata.h"
+#include "rgw_client_io.h"
+#include "rgw_mdlog_types.h"
+#include "rgw_sal_rados.h"
+#include "common/errno.h"
+#include "common/strtol.h"
+#include "rgw/rgw_b64.h"
+#include "include/ceph_assert.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+// Build the metadata-manager key for this request.  The URL "bucket"
+// component names the metadata section and the optional "key" query
+// argument names an entry within it, yielding "section" or
+// "section:key".  When the URL has no section, the "key" argument
+// itself is treated as the section name.
+static inline void frame_metadata_key(req_state *s, string& out) {
+  bool exists;
+  string key = s->info.args.get("key", &exists);
+
+  string section;
+  if (!s->init_state.url_bucket.empty()) {
+    section = s->init_state.url_bucket;
+  } else {
+    // no section in the URL: interpret "key" as the section
+    section = key;
+    key.clear();
+  }
+
+  out = section;
+
+  if (!key.empty()) {
+    out += string(":") + key;
+  }
+}
+
+// GET with "key": fetch a single metadata entry and emit it through
+// the request formatter.  RadosStore-specific: goes directly to the
+// rados metadata manager.
+void RGWOp_Metadata_Get::execute(optional_yield y) {
+  string metadata_key;
+
+  frame_metadata_key(s, metadata_key);
+
+  auto meta_mgr = static_cast<rgw::sal::RadosStore*>(driver)->ctl()->meta.mgr;
+
+  /* Get keys */
+  op_ret = meta_mgr->get(metadata_key, s->formatter, s->yield, s);
+  if (op_ret < 0) {
+    ldpp_dout(s, 5) << "ERROR: can't get key: " << cpp_strerror(op_ret) << dendl;
+    return;
+  }
+
+  op_ret = 0;
+}
+
+// GET with "myself": inject the authenticated user's own id as the
+// "key" argument, then delegate to the regular metadata Get op.
+void RGWOp_Metadata_Get_Myself::execute(optional_yield y) {
+  string owner_id;
+
+  owner_id = s->owner.get_id().to_str();
+  s->info.args.append("key", owner_id);
+
+  return RGWOp_Metadata_Get::execute(y);
+}
+
+// GET without "key": list the keys of a metadata section.  With
+// "max-entries" the extended response format is used (object carrying
+// "keys", "truncated", "count" and a resumable base64 "marker");
+// without it, the legacy bare array of keys is returned.
+void RGWOp_Metadata_List::execute(optional_yield y) {
+  string marker;
+  ldpp_dout(this, 16) << __func__
+                      << " raw marker " << s->info.args.get("marker")
+                      << dendl;
+
+  try {
+    marker = s->info.args.get("marker");
+    if (!marker.empty()) {
+      marker = rgw::from_base64(marker);
+    }
+    ldpp_dout(this, 16) << __func__
+                        << " marker " << marker << dendl;
+  } catch (...) {
+    // an undecodable marker restarts the listing from the beginning
+    marker = std::string("");
+  }
+
+  bool max_entries_specified;
+  string max_entries_str =
+    s->info.args.get("max-entries", &max_entries_specified);
+
+  bool extended_response = (max_entries_specified); /* for backward compatibility, if max-entries is not specified
+                                                       we will send the old response format */
+  uint64_t max_entries = 0;
+
+  if (max_entries_specified) {
+    string err;
+    max_entries = (unsigned)strict_strtol(max_entries_str.c_str(), 10, &err);
+    if (!err.empty()) {
+      ldpp_dout(this, 5) << "Error parsing max-entries " << max_entries_str << dendl;
+      op_ret = -EINVAL;
+      return;
+    }
+  }
+
+  string metadata_key;
+
+  frame_metadata_key(s, metadata_key);
+  /* List keys */
+  void *handle;
+  int max = 1000;
+
+  /* example markers:
+     marker = "3:b55a9110:root::bu_9:head";
+     marker = "3:b9a8b2a6:root::sorry_janefonda_890:head";
+     marker = "3:bf885d8f:root::sorry_janefonda_665:head";
+  */
+
+  op_ret = driver->meta_list_keys_init(this, metadata_key, marker, &handle);
+  if (op_ret < 0) {
+    ldpp_dout(this, 5) << "ERROR: can't get key: " << cpp_strerror(op_ret) << dendl;
+    return;
+  }
+
+  bool truncated;
+  uint64_t count = 0;
+
+  if (extended_response) {
+    s->formatter->open_object_section("result");
+  }
+
+  s->formatter->open_array_section("keys");
+
+  uint64_t left;
+  do {
+    list<string> keys;
+    left = (max_entries_specified ? max_entries - count : max);
+    op_ret = driver->meta_list_keys_next(this, handle, left, keys, &truncated);
+    if (op_ret < 0) {
+      ldpp_dout(this, 5) << "ERROR: lists_keys_next(): " << cpp_strerror(op_ret)
+                         << dendl;
+      // release the listing handle before bailing out; the early
+      // return previously leaked it
+      driver->meta_list_keys_complete(handle);
+      return;
+    }
+
+    for (list<string>::iterator iter = keys.begin(); iter != keys.end();
+         ++iter) {
+      s->formatter->dump_string("key", *iter);
+      ++count;
+    }
+
+  } while (truncated && left > 0);
+
+  s->formatter->close_section();
+
+  if (extended_response) {
+    encode_json("truncated", truncated, s->formatter);
+    encode_json("count", count, s->formatter);
+    if (truncated) {
+      string esc_marker =
+        rgw::to_base64(driver->meta_get_marker(handle));
+      encode_json("marker", esc_marker, s->formatter);
+    }
+    s->formatter->close_section();
+  }
+  driver->meta_list_keys_complete(handle);
+
+  op_ret = 0;
+}
+
+// Read the request body into 'bl'.  A Content-Length body is read in
+// one shot; otherwise a chunked transfer-encoding body is consumed in
+// CEPH_PAGE_SIZE pieces.  Returns 0 on success, -ENOMEM on allocation
+// failure, -ERR_LENGTH_REQUIRED when neither a length nor chunked
+// encoding is present, or the negative error from recv_body().
+int RGWOp_Metadata_Put::get_data(bufferlist& bl) {
+  size_t cl = 0;
+  char *data;
+  int read_len;
+
+  if (s->length)
+    cl = atoll(s->length);
+  if (cl) {
+    data = (char *)malloc(cl + 1);
+    if (!data) {
+      return -ENOMEM;
+    }
+    read_len = recv_body(s, data, cl);
+    // check for an error before the length comparison: a negative
+    // read_len would wrap around in the size_t cast below and log a
+    // misleading "incomplete" message
+    if (read_len < 0) {
+      free(data);
+      return read_len;
+    }
+    if (cl != (size_t)read_len) {
+      ldpp_dout(this, 10) << "recv_body incomplete" << dendl;
+    }
+    bl.append(data, read_len);
+  } else {
+    int chunk_size = CEPH_PAGE_SIZE;
+    const char *enc = s->info.env->get("HTTP_TRANSFER_ENCODING");
+    if (!enc || strcmp(enc, "chunked")) {
+      return -ERR_LENGTH_REQUIRED;
+    }
+    data = (char *)malloc(chunk_size);
+    if (!data) {
+      return -ENOMEM;
+    }
+    do {
+      read_len = recv_body(s, data, chunk_size);
+      if (read_len < 0) {
+        free(data);
+        return read_len;
+      }
+      bl.append(data, read_len);
+    } while (read_len == chunk_size); // a short read ends the chunk stream
+  }
+
+  free(data);
+  return 0;
+}
+
+// Map the "update-type" request argument onto an RGWMDLogSyncType.
+// Returns false when the string names no known sync mode (and leaves
+// 'type' untouched).
+static bool string_to_sync_type(const string& sync_string,
+                                RGWMDLogSyncType& type) {
+  if (sync_string == "update-by-version") {
+    type = APPLY_UPDATES;
+    return true;
+  }
+  if (sync_string == "update-by-timestamp") {
+    type = APPLY_NEWER;
+    return true;
+  }
+  if (sync_string == "always") {
+    type = APPLY_ALWAYS;
+    return true;
+  }
+  return false;
+}
+
+// PUT: store a raw metadata entry.  The body is the serialized entry;
+// the optional "update-type" argument selects the conflict policy
+// (default: apply unconditionally).  On success, put() may return one
+// of the positive STATUS_* codes which are translated into the
+// RGWX_UPDATE_STATUS response header by send_response().
+void RGWOp_Metadata_Put::execute(optional_yield y) {
+  bufferlist bl;
+  string metadata_key;
+
+  op_ret = get_data(bl);
+  if (op_ret < 0) {
+    return;
+  }
+
+  op_ret = do_aws4_auth_completion();
+  if (op_ret < 0) {
+    return;
+  }
+
+  frame_metadata_key(s, metadata_key);
+
+  RGWMDLogSyncType sync_type = RGWMDLogSyncType::APPLY_ALWAYS;
+
+  bool mode_exists = false;
+  string mode_string = s->info.args.get("update-type", &mode_exists);
+  if (mode_exists) {
+    bool parsed = string_to_sync_type(mode_string,
+                                      sync_type);
+    if (!parsed) {
+      op_ret = -EINVAL;
+      return;
+    }
+  }
+
+  // RadosStore-specific path; ondisk_version is filled in for the
+  // RGWX_UPDATE_VERSION response header
+  op_ret = static_cast<rgw::sal::RadosStore*>(driver)->ctl()->meta.mgr->put(metadata_key, bl, s->yield, s, sync_type,
+                                                                            false, &ondisk_version);
+  if (op_ret < 0) {
+    ldpp_dout(s, 5) << "ERROR: can't put key: " << cpp_strerror(op_ret) << dendl;
+    return;
+  }
+  // translate internal codes into return header
+  if (op_ret == STATUS_NO_APPLY)
+    update_status = "skipped";
+  else if (op_ret == STATUS_APPLIED)
+    update_status = "applied";
+}
+
+// Emit the PUT response: internal STATUS_NO_APPLY/STATUS_APPLIED codes
+// collapse to 204 No Content, with the outcome and the on-disk object
+// version reported via the RGWX_UPDATE_* headers.
+void RGWOp_Metadata_Put::send_response() {
+  int op_return_code = op_ret;
+  if ((op_ret == STATUS_NO_APPLY) || (op_ret == STATUS_APPLIED))
+    op_return_code = STATUS_NO_CONTENT;
+  set_req_state_err(s, op_return_code);
+  dump_errno(s);
+  stringstream ver_stream;
+  ver_stream << "ver:" << ondisk_version.ver
+             <<",tag:" << ondisk_version.tag;
+  dump_header_if_nonempty(s, "RGWX_UPDATE_STATUS", update_status);
+  dump_header_if_nonempty(s, "RGWX_UPDATE_VERSION", ver_stream.str());
+  end_header(s);
+}
+
+// DELETE: remove the metadata entry addressed by section[:key].
+// RadosStore-specific path through the rados metadata manager.
+void RGWOp_Metadata_Delete::execute(optional_yield y) {
+  string metadata_key;
+
+  frame_metadata_key(s, metadata_key);
+  op_ret = static_cast<rgw::sal::RadosStore*>(driver)->ctl()->meta.mgr->remove(metadata_key, s->yield, s);
+  if (op_ret < 0) {
+    ldpp_dout(s, 5) << "ERROR: can't remove key: " << cpp_strerror(op_ret) << dendl;
+    return;
+  }
+  op_ret = 0;
+}
+
+// GET dispatch: "myself" wins over "key"; with neither, the request is
+// a section listing.  The returned op is owned by the caller.
+RGWOp *RGWHandler_Metadata::op_get() {
+  if (s->info.args.exists("myself"))
+    return new RGWOp_Metadata_Get_Myself;
+  if (s->info.args.exists("key"))
+    return new RGWOp_Metadata_Get;
+  else
+    return new RGWOp_Metadata_List;
+}
+
+// PUT dispatch; the returned op is owned by the caller.
+RGWOp *RGWHandler_Metadata::op_put() {
+  return new RGWOp_Metadata_Put;
+}
+
+// DELETE dispatch; the returned op is owned by the caller.
+RGWOp *RGWHandler_Metadata::op_delete() {
+  return new RGWOp_Metadata_Delete;
+}
+
diff --git a/src/rgw/rgw_rest_metadata.h b/src/rgw/rgw_rest_metadata.h
new file mode 100644
index 000000000..ea7376a1b
--- /dev/null
+++ b/src/rgw/rgw_rest_metadata.h
@@ -0,0 +1,107 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 eNovance SAS <licensing@enovance.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "rgw/rgw_rest.h"
+#include "rgw/rgw_auth_s3.h"
+
+// List the keys of a metadata section; requires "metadata" read cap.
+class RGWOp_Metadata_List : public RGWRESTOp {
+public:
+  RGWOp_Metadata_List() {}
+  ~RGWOp_Metadata_List() override {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("metadata", RGW_CAP_READ);
+  }
+  void execute(optional_yield y) override;
+  const char* name() const override { return "list_metadata"; }
+};
+
+// Fetch a single metadata entry; requires "metadata" read cap.
+class RGWOp_Metadata_Get : public RGWRESTOp {
+public:
+  RGWOp_Metadata_Get() {}
+  ~RGWOp_Metadata_Get() override {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("metadata", RGW_CAP_READ);
+  }
+  void execute(optional_yield y) override;
+  const char* name() const override { return "get_metadata"; }
+};
+
+// Variant of Get that resolves the requesting user's own metadata.
+class RGWOp_Metadata_Get_Myself : public RGWOp_Metadata_Get {
+public:
+  RGWOp_Metadata_Get_Myself() {}
+  ~RGWOp_Metadata_Get_Myself() override {}
+
+  void execute(optional_yield y) override;
+};
+
+// Store a raw metadata entry; requires "metadata" write cap.
+class RGWOp_Metadata_Put : public RGWRESTOp {
+  int get_data(bufferlist& bl);
+  std::string update_status;    // "applied"/"skipped", reported via header
+  obj_version ondisk_version;   // version written, reported via header
+public:
+  RGWOp_Metadata_Put() {}
+  ~RGWOp_Metadata_Put() override {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("metadata", RGW_CAP_WRITE);
+  }
+  void execute(optional_yield y) override;
+  void send_response() override;
+  const char* name() const override { return "set_metadata"; }
+  RGWOpType get_type() override { return RGW_OP_ADMIN_SET_METADATA; }
+};
+
+// Remove a metadata entry; requires "metadata" write cap.
+class RGWOp_Metadata_Delete : public RGWRESTOp {
+public:
+  RGWOp_Metadata_Delete() {}
+  ~RGWOp_Metadata_Delete() override {}
+
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("metadata", RGW_CAP_WRITE);
+  }
+  void execute(optional_yield y) override;
+  const char* name() const override { return "remove_metadata"; }
+};
+
+// REST handler for the admin metadata resource (GET/PUT/DELETE);
+// authorization is done via user caps in the ops, not per-object perms.
+class RGWHandler_Metadata : public RGWHandler_Auth_S3 {
+protected:
+  RGWOp *op_get() override;
+  RGWOp *op_put() override;
+  RGWOp *op_delete() override;
+
+  int read_permissions(RGWOp*, optional_yield y) override {
+    return 0;
+  }
+public:
+  using RGWHandler_Auth_S3::RGWHandler_Auth_S3;
+  ~RGWHandler_Metadata() override = default;
+};
+
+// REST manager producing RGWHandler_Metadata instances.
+class RGWRESTMgr_Metadata : public RGWRESTMgr {
+public:
+  RGWRESTMgr_Metadata() = default;
+  ~RGWRESTMgr_Metadata() override = default;
+
+  RGWHandler_REST* get_handler(rgw::sal::Driver* driver,
+                               req_state* const s,
+                               const rgw::auth::StrategyRegistry& auth_registry,
+                               const std::string& frontend_prefix) override {
+    return new RGWHandler_Metadata(auth_registry);
+  }
+};
diff --git a/src/rgw/rgw_rest_oidc_provider.cc b/src/rgw/rgw_rest_oidc_provider.cc
new file mode 100644
index 000000000..db4bc12fc
--- /dev/null
+++ b/src/rgw/rgw_rest_oidc_provider.cc
@@ -0,0 +1,233 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <errno.h>
+
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/ceph_json.h"
+
+#include "include/types.h"
+#include "rgw_string.h"
+
+#include "rgw_common.h"
+#include "rgw_op.h"
+#include "rgw_rest.h"
+#include "rgw_role.h"
+#include "rgw_rest_oidc_provider.h"
+#include "rgw_oidc_provider.h"
+#include "rgw_sal.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+// Base permission check for ARN-addressed OIDC provider ops (get and
+// delete): anonymous is rejected, a matching admin cap short-circuits
+// to allow, otherwise the IAM policy is evaluated against the provider
+// ARN from the request.
+int RGWRestOIDCProvider::verify_permission(optional_yield y)
+{
+  if (s->auth.identity->is_anonymous()) {
+    return -EACCES;
+  }
+
+  provider_arn = s->info.args.get("OpenIDConnectProviderArn");
+  if (provider_arn.empty()) {
+    ldpp_dout(this, 20) << "ERROR: Provider ARN is empty"<< dendl;
+    return -EINVAL;
+  }
+
+  // admin caps bypass the IAM policy check
+  auto ret = check_caps(s->user->get_caps());
+  if (ret == 0) {
+    return ret;
+  }
+
+  uint64_t op = get_op();
+  auto rgw_arn = rgw::ARN::parse(provider_arn, true);
+  if (rgw_arn) {
+    if (!verify_user_permission(this, s, *rgw_arn, op)) {
+      return -EACCES;
+    }
+  } else {
+    // unparsable ARN: deny rather than fall through
+    return -EACCES;
+  }
+
+  return 0;
+}
+
+// Common response epilogue: set the error (if any), dump status and
+// headers.  Subclasses emit their XML bodies from execute().
+void RGWRestOIDCProvider::send_response()
+{
+  if (op_ret) {
+    set_req_state_err(s, op_ret);
+  }
+  dump_errno(s);
+  end_header(s, this);
+}
+
+// Read-side ops require the "oidc-provider" read cap.
+int RGWRestOIDCProviderRead::check_caps(const RGWUserCaps& caps)
+{
+  return caps.check_cap("oidc-provider", RGW_CAP_READ);
+}
+
+// Write-side ops require the "oidc-provider" write cap.
+int RGWRestOIDCProviderWrite::check_caps(const RGWUserCaps& caps)
+{
+  return caps.check_cap("oidc-provider", RGW_CAP_WRITE);
+}
+
+// CreateOIDCProvider has no ARN yet, so the base-class check (which
+// requires OpenIDConnectProviderArn) does not apply; authorize against
+// an ARN synthesized from the provider URL and the user's tenant.
+int RGWCreateOIDCProvider::verify_permission(optional_yield y)
+{
+  if (s->auth.identity->is_anonymous()) {
+    return -EACCES;
+  }
+
+  // admin caps bypass the IAM policy check
+  auto ret = check_caps(s->user->get_caps());
+  if (ret == 0) {
+    return ret;
+  }
+
+  string idp_url = url_remove_prefix(provider_url);
+  if (!verify_user_permission(this,
+                              s,
+                              rgw::ARN(idp_url,
+                                        "oidc-provider",
+                                        s->user->get_tenant(), true),
+                              get_op())) {
+    return -EACCES;
+  }
+  return 0;
+}
+
+// Collect CreateOpenIDConnectProvider parameters: the issuer Url plus
+// the ClientIDList.member.N / ThumbprintList.member.N repeated args.
+// Returns -EINVAL when Url or the thumbprint list is missing.
+int RGWCreateOIDCProvider::get_params()
+{
+  provider_url = s->info.args.get("Url");
+
+  auto val_map = s->info.args.get_params();
+  for (auto& it : val_map) {
+    // NOTE(review): substring match, not a prefix match — a parameter
+    // merely containing "ClientIDList.member." would also be picked
+    // up; presumably fine for AWS-shaped requests, but confirm
+    if (it.first.find("ClientIDList.member.") != string::npos) {
+      client_ids.emplace_back(it.second);
+    }
+    if (it.first.find("ThumbprintList.member.") != string::npos) {
+      thumbprints.emplace_back(it.second);
+    }
+  }
+
+  if (provider_url.empty() || thumbprints.empty()) {
+    ldpp_dout(this, 20) << "ERROR: one of url or thumbprints is empty" << dendl;
+    return -EINVAL;
+  }
+
+  return 0;
+}
+
+// Create the OIDC provider object in the user's tenant and, on
+// success, emit the AWS-style CreateOpenIDConnectProviderResponse.
+void RGWCreateOIDCProvider::execute(optional_yield y)
+{
+  op_ret = get_params();
+  if (op_ret < 0) {
+    return;
+  }
+
+  std::unique_ptr<rgw::sal::RGWOIDCProvider> provider = driver->get_oidc_provider();
+  provider->set_url(provider_url);
+  provider->set_tenant(s->user->get_tenant());
+  provider->set_client_ids(client_ids);
+  provider->set_thumbprints(thumbprints);
+  op_ret = provider->create(s, true, y);
+
+  if (op_ret == 0) {
+    s->formatter->open_object_section("CreateOpenIDConnectProviderResponse");
+    s->formatter->open_object_section("CreateOpenIDConnectProviderResult");
+    provider->dump(s->formatter);
+    s->formatter->close_section();
+    s->formatter->open_object_section("ResponseMetadata");
+    s->formatter->dump_string("RequestId", s->trans_id);
+    s->formatter->close_section();
+    s->formatter->close_section();
+  }
+
+}
+
+// Delete the provider addressed by the ARN captured in the base
+// class's verify_permission().  Unexpected failures (anything other
+// than ENOENT/EINVAL) are reported as an internal error.
+void RGWDeleteOIDCProvider::execute(optional_yield y)
+{
+  std::unique_ptr<rgw::sal::RGWOIDCProvider> provider = driver->get_oidc_provider();
+  provider->set_arn(provider_arn);
+  provider->set_tenant(s->user->get_tenant());
+  op_ret = provider->delete_obj(s, y);
+
+  if (op_ret < 0 && op_ret != -ENOENT && op_ret != -EINVAL) {
+    op_ret = ERR_INTERNAL_ERROR;
+  }
+
+  if (op_ret == 0) {
+    s->formatter->open_object_section("DeleteOpenIDConnectProviderResponse");
+    s->formatter->open_object_section("ResponseMetadata");
+    s->formatter->dump_string("RequestId", s->trans_id);
+    s->formatter->close_section();
+    s->formatter->close_section();
+  }
+}
+
+// Fetch the provider addressed by the ARN from verify_permission()
+// and emit the AWS-style GetOpenIDConnectProviderResponse.
+void RGWGetOIDCProvider::execute(optional_yield y)
+{
+  std::unique_ptr<rgw::sal::RGWOIDCProvider> provider = driver->get_oidc_provider();
+  provider->set_arn(provider_arn);
+  provider->set_tenant(s->user->get_tenant());
+  op_ret = provider->get(s);
+
+  if (op_ret < 0 && op_ret != -ENOENT && op_ret != -EINVAL) {
+    op_ret = ERR_INTERNAL_ERROR;
+  }
+
+  if (op_ret == 0) {
+    s->formatter->open_object_section("GetOpenIDConnectProviderResponse");
+    s->formatter->open_object_section("ResponseMetadata");
+    s->formatter->dump_string("RequestId", s->trans_id);
+    s->formatter->close_section();
+    s->formatter->open_object_section("GetOpenIDConnectProviderResult");
+    provider->dump_all(s->formatter);
+    s->formatter->close_section();
+    s->formatter->close_section();
+  }
+}
+
+// ListOIDCProviders is not ARN-addressed, so the base check (which
+// requires OpenIDConnectProviderArn) is replaced with a policy check
+// against an empty ARN; admin caps still short-circuit to allow.
+int RGWListOIDCProviders::verify_permission(optional_yield y)
+{
+  if (s->auth.identity->is_anonymous()) {
+    return -EACCES;
+  }
+
+  if (int ret = check_caps(s->user->get_caps()); ret == 0) {
+    return ret;
+  }
+
+  if (!verify_user_permission(this,
+                              s,
+                              rgw::ARN(),
+                              get_op())) {
+    return -EACCES;
+  }
+
+  return 0;
+}
+
+// List all providers in the user's tenant and emit the AWS-style
+// ListOpenIDConnectProvidersResponse (members carry only the Arn).
+void RGWListOIDCProviders::execute(optional_yield y)
+{
+  vector<std::unique_ptr<rgw::sal::RGWOIDCProvider>> result;
+  op_ret = driver->get_oidc_providers(s, s->user->get_tenant(), result);
+
+  if (op_ret == 0) {
+    s->formatter->open_array_section("ListOpenIDConnectProvidersResponse");
+    s->formatter->open_object_section("ResponseMetadata");
+    s->formatter->dump_string("RequestId", s->trans_id);
+    s->formatter->close_section();
+    s->formatter->open_object_section("ListOpenIDConnectProvidersResult");
+    s->formatter->open_array_section("OpenIDConnectProviderList");
+    for (const auto& it : result) {
+      s->formatter->open_object_section("member");
+      auto& arn = it->get_arn();
+      ldpp_dout(s, 0) << "ARN: " << arn << dendl;
+      s->formatter->dump_string("Arn", arn);
+      s->formatter->close_section();
+    }
+    s->formatter->close_section();
+    s->formatter->close_section();
+    s->formatter->close_section();
+  }
+}
+
diff --git a/src/rgw/rgw_rest_oidc_provider.h b/src/rgw/rgw_rest_oidc_provider.h
new file mode 100644
index 000000000..33535c6b5
--- /dev/null
+++ b/src/rgw/rgw_rest_oidc_provider.h
@@ -0,0 +1,71 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_rest.h"
+#include "rgw_oidc_provider.h"
+
+// Common base for OIDC provider REST ops: holds the request params and
+// the shared ARN-based permission check; get_op() supplies the IAM
+// action each subclass is authorized against.
+class RGWRestOIDCProvider : public RGWRESTOp {
+protected:
+  std::vector<std::string> client_ids;
+  std::vector<std::string> thumbprints;
+  std::string provider_url; //'iss' field in JWT
+  std::string provider_arn;
+public:
+  int verify_permission(optional_yield y) override;
+  void send_response() override;
+  virtual uint64_t get_op() = 0;
+};
+
+// Read-side base: caps check against the "oidc-provider" read cap.
+class RGWRestOIDCProviderRead : public RGWRestOIDCProvider {
+public:
+  RGWRestOIDCProviderRead() = default;
+  int check_caps(const RGWUserCaps& caps) override;
+};
+
+// Write-side base: caps check against the "oidc-provider" write cap.
+class RGWRestOIDCProviderWrite : public RGWRestOIDCProvider {
+public:
+  RGWRestOIDCProviderWrite() = default;
+  int check_caps(const RGWUserCaps& caps) override;
+};
+
+// CreateOpenIDConnectProvider: overrides verify_permission because no
+// provider ARN exists yet at creation time.
+class RGWCreateOIDCProvider : public RGWRestOIDCProviderWrite {
+public:
+  RGWCreateOIDCProvider() = default;
+  int verify_permission(optional_yield y) override;
+  void execute(optional_yield y) override;
+  int get_params();
+  const char* name() const override { return "create_oidc_provider"; }
+  RGWOpType get_type() override { return RGW_OP_CREATE_OIDC_PROVIDER; }
+  uint64_t get_op() override { return rgw::IAM::iamCreateOIDCProvider; }
+};
+
+// DeleteOpenIDConnectProvider: uses the base ARN permission check.
+class RGWDeleteOIDCProvider : public RGWRestOIDCProviderWrite {
+public:
+  RGWDeleteOIDCProvider() = default;
+  void execute(optional_yield y) override;
+  const char* name() const override { return "delete_oidc_provider"; }
+  RGWOpType get_type() override { return RGW_OP_DELETE_OIDC_PROVIDER; }
+  uint64_t get_op() override { return rgw::IAM::iamDeleteOIDCProvider; }
+};
+
+// GetOpenIDConnectProvider: uses the base ARN permission check.
+class RGWGetOIDCProvider : public RGWRestOIDCProviderRead {
+public:
+  RGWGetOIDCProvider() = default;
+  void execute(optional_yield y) override;
+  const char* name() const override { return "get_oidc_provider"; }
+  RGWOpType get_type() override { return RGW_OP_GET_OIDC_PROVIDER; }
+  uint64_t get_op() override { return rgw::IAM::iamGetOIDCProvider; }
+};
+
+// ListOpenIDConnectProviders: overrides verify_permission since the
+// request carries no provider ARN.
+// NOTE(review): get_params() is declared but no definition is visible
+// in this translation unit — confirm it exists or drop the declaration.
+class RGWListOIDCProviders : public RGWRestOIDCProviderRead {
+public:
+  RGWListOIDCProviders() = default;
+  int verify_permission(optional_yield y) override;
+  void execute(optional_yield y) override;
+  int get_params();
+  const char* name() const override { return "list_oidc_providers"; }
+  RGWOpType get_type() override { return RGW_OP_LIST_OIDC_PROVIDERS; }
+  uint64_t get_op() override { return rgw::IAM::iamListOIDCProviders; }
+};
diff --git a/src/rgw/rgw_rest_pubsub.cc b/src/rgw/rgw_rest_pubsub.cc
new file mode 100644
index 000000000..793232866
--- /dev/null
+++ b/src/rgw/rgw_rest_pubsub.cc
@@ -0,0 +1,954 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <algorithm>
+#include <boost/tokenizer.hpp>
+#include <optional>
+#include "rgw_rest_pubsub.h"
+#include "rgw_pubsub_push.h"
+#include "rgw_pubsub.h"
+#include "rgw_op.h"
+#include "rgw_rest.h"
+#include "rgw_rest_s3.h"
+#include "rgw_arn.h"
+#include "rgw_auth_s3.h"
+#include "rgw_notify.h"
+#include "services/svc_zone.h"
+#include "common/dout.h"
+#include "rgw_url.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+static const char* AWS_SNS_NS("https://sns.amazonaws.com/doc/2010-03-31/");
+
+// Decide whether secrets may travel on this connection: true when the
+// transport is secure, or when the operator explicitly opted into
+// cleartext via rgw_allow_notification_secrets_in_cleartext.
+bool verify_transport_security(CephContext *cct, const RGWEnv& env) {
+  const auto is_secure = rgw_transport_is_secure(cct, env);
+  if (!is_secure && g_conf().get_val<bool>("rgw_allow_notification_secrets_in_cleartext")) {
+    ldout(cct, 0) << "WARNING: bypassing endpoint validation, allows sending secrets over insecure transport" << dendl;
+    return true;
+  }
+  return is_secure;
+}
+
+// make sure that endpoint is a valid URL
+// make sure that if user/password are passed inside URL, it is over secure connection
+// update rgw_pubsub_dest to indicate that a password is stored in the URL
+// make sure that endpoint is a valid URL
+// make sure that if user/password are passed inside URL, it is over secure connection
+// update rgw_pubsub_dest to indicate that a password is stored in the URL
+bool validate_and_update_endpoint_secret(rgw_pubsub_dest& dest, CephContext *cct, const RGWEnv& env) {
+  // no endpoint configured: nothing to validate
+  if (dest.push_endpoint.empty()) {
+    return true;
+  }
+  std::string user;
+  std::string password;
+  if (!rgw::parse_url_userinfo(dest.push_endpoint, user, password)) {
+    ldout(cct, 1) << "endpoint validation error: malformed endpoint URL:" << dest.push_endpoint << dendl;
+    return false;
+  }
+  // this should be verified inside parse_url()
+  ceph_assert(user.empty() == password.empty());
+  if (!user.empty()) {
+    // credentials embedded in the URL: mark the dest and require a
+    // secure transport (or the explicit cleartext override)
+    dest.stored_secret = true;
+    if (!verify_transport_security(cct, env)) {
+      ldout(cct, 1) << "endpoint validation error: sending secrets over insecure transport" << dendl;
+      return false;
+    }
+  }
+  return true;
+}
+
+// True when the topic's push endpoint URL carries embedded credentials.
+bool topic_has_endpoint_secret(const rgw_pubsub_topic& topic) {
+    return topic.dest.stored_secret;
+}
+
+// True when any topic in the set carries an endpoint secret.
+bool topics_has_endpoint_secret(const rgw_pubsub_topics& topics) {
+    for (const auto& topic : topics.topics) {
+        if (topic_has_endpoint_secret(topic.second)) return true;
+    }
+    return false;
+}
+
+// command (AWS compliant):
+// POST
+// Action=CreateTopic&Name=<topic-name>[&OpaqueData=data][&push-endpoint=<endpoint>[&persistent][&<arg1>=<value1>]]
+class RGWPSCreateTopicOp : public RGWOp {
+  private:
+  std::string topic_name;
+  rgw_pubsub_dest dest;       // endpoint info parsed from the request
+  std::string topic_arn;      // ARN synthesized for the reply
+  std::string opaque_data;
+
+  // Parse the CreateTopic arguments: required Name, optional
+  // OpaqueData / push-endpoint / persistent, with all remaining args
+  // forwarded verbatim as endpoint arguments.  For a persistent
+  // endpoint, the backing notification queue is created here.
+  int get_params() {
+    topic_name = s->info.args.get("Name");
+    if (topic_name.empty()) {
+      ldpp_dout(this, 1) << "CreateTopic Action 'Name' argument is missing" << dendl;
+      return -EINVAL;
+    }
+
+    opaque_data = s->info.args.get("OpaqueData");
+
+    dest.push_endpoint = s->info.args.get("push-endpoint");
+    s->info.args.get_bool("persistent", &dest.persistent, false);
+
+    if (!validate_and_update_endpoint_secret(dest, s->cct, *(s->info.env))) {
+      return -EINVAL;
+    }
+    for (const auto& param : s->info.args.get_params()) {
+      if (param.first == "Action" || param.first == "Name" || param.first == "PayloadHash") {
+        continue;
+      }
+      dest.push_endpoint_args.append(param.first+"="+param.second+"&");
+    }
+
+    if (!dest.push_endpoint_args.empty()) {
+      // remove last separator
+      dest.push_endpoint_args.pop_back();
+    }
+    if (!dest.push_endpoint.empty() && dest.persistent) {
+      const auto ret = rgw::notify::add_persistent_topic(topic_name, s->yield);
+      if (ret < 0) {
+        ldpp_dout(this, 1) << "CreateTopic Action failed to create queue for persistent topics. error:" << ret << dendl;
+        return ret;
+      }
+    }
+
+    // dest object only stores endpoint info
+    dest.arn_topic = topic_name;
+    // the topic ARN will be sent in the reply
+    const rgw::ARN arn(rgw::Partition::aws, rgw::Service::sns,
+                       driver->get_zone()->get_zonegroup().get_name(),
+                       s->user->get_tenant(), topic_name);
+    topic_arn = arn.to_string();
+    return 0;
+  }
+
+  public:
+  // NOTE(review): no permission checks here — any authenticated user
+  // may create a topic; confirm this is intentional
+  int verify_permission(optional_yield) override {
+    return 0;
+  }
+
+  void pre_exec() override {
+    rgw_bucket_object_pre_exec(s);
+  }
+  void execute(optional_yield) override;
+
+  const char* name() const override { return "pubsub_topic_create"; }
+  RGWOpType get_type() override { return RGW_OP_PUBSUB_TOPIC_CREATE; }
+  uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; }
+
+  // AWS SNS-style XML reply carrying the new topic's ARN
+  void send_response() override {
+    if (op_ret) {
+      set_req_state_err(s, op_ret);
+    }
+    dump_errno(s);
+    end_header(s, this, "application/xml");
+
+    if (op_ret < 0) {
+      return;
+    }
+
+    const auto f = s->formatter;
+    f->open_object_section_in_ns("CreateTopicResponse", AWS_SNS_NS);
+    f->open_object_section("CreateTopicResult");
+    encode_xml("TopicArn", topic_arn, f);
+    f->close_section(); // CreateTopicResult
+    f->open_object_section("ResponseMetadata");
+    encode_xml("RequestId", s->req_id, f);
+    f->close_section(); // ResponseMetadata
+    f->close_section(); // CreateTopicResponse
+    rgw_flush_formatter_and_reset(s, f);
+  }
+};
+
+// Parse the request then create/update the topic record in the
+// owner's tenant.
+void RGWPSCreateTopicOp::execute(optional_yield y) {
+  op_ret = get_params();
+  if (op_ret < 0) {
+    return;
+  }
+
+  const RGWPubSub ps(driver, s->owner.get_id().tenant);
+  op_ret = ps.create_topic(this, topic_name, dest, topic_arn, opaque_data, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 1) << "failed to create topic '" << topic_name << "', ret=" << op_ret << dendl;
+    return;
+  }
+  ldpp_dout(this, 20) << "successfully created topic '" << topic_name << "'" << dendl;
+}
+
+// command (AWS compliant):
+// POST
+// Action=ListTopics
+class RGWPSListTopicsOp : public RGWOp {
+private:
+  rgw_pubsub_topics result;   // filled by execute(), rendered in send_response()
+
+public:
+  // NOTE(review): no permission checks — any authenticated user may
+  // list the tenant's topics; confirm this is intentional
+  int verify_permission(optional_yield) override {
+    return 0;
+  }
+  void pre_exec() override {
+    rgw_bucket_object_pre_exec(s);
+  }
+  void execute(optional_yield) override;
+
+  const char* name() const override { return "pubsub_topics_list"; }
+  RGWOpType get_type() override { return RGW_OP_PUBSUB_TOPICS_LIST; }
+  uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
+
+  // AWS SNS-style XML reply listing the topics
+  void send_response() override {
+    if (op_ret) {
+      set_req_state_err(s, op_ret);
+    }
+    dump_errno(s);
+    end_header(s, this, "application/xml");
+
+    if (op_ret < 0) {
+      return;
+    }
+
+    const auto f = s->formatter;
+    f->open_object_section_in_ns("ListTopicsResponse", AWS_SNS_NS);
+    f->open_object_section("ListTopicsResult");
+    encode_xml("Topics", result, f);
+    f->close_section(); // ListTopicsResult
+    f->open_object_section("ResponseMetadata");
+    encode_xml("RequestId", s->req_id, f);
+    f->close_section(); // ResponseMetadata
+    f->close_section(); // ListTopicsResponse
+    rgw_flush_formatter_and_reset(s, f);
+  }
+};
+
+// Fetch all topics of the owner's tenant; refuses to return them over
+// an insecure transport when any topic embeds endpoint credentials.
+void RGWPSListTopicsOp::execute(optional_yield y) {
+  const RGWPubSub ps(driver, s->owner.get_id().tenant);
+  op_ret = ps.get_topics(this, result, y);
+  // if there are no topics it is not considered an error
+  op_ret = op_ret == -ENOENT ? 0 : op_ret;
+  if (op_ret < 0) {
+    ldpp_dout(this, 1) << "failed to get topics, ret=" << op_ret << dendl;
+    return;
+  }
+  if (topics_has_endpoint_secret(result) && !verify_transport_security(s->cct, *(s->info.env))) {
+    ldpp_dout(this, 1) << "topics contain secrets and cannot be sent over insecure transport" << dendl;
+    op_ret = -EPERM;
+    return;
+  }
+  ldpp_dout(this, 20) << "successfully got topics" << dendl;
+}
+
+// command (extension to AWS):
+// POST
+// Action=GetTopic&TopicArn=<topic-arn>
+class RGWPSGetTopicOp : public RGWOp {
+  private:
+  std::string topic_name;     // resource component of the TopicArn argument
+  rgw_pubsub_topic result;
+
+  // Extract the topic name from the required TopicArn argument.
+  int get_params() {
+    const auto topic_arn = rgw::ARN::parse((s->info.args.get("TopicArn")));
+
+    if (!topic_arn || topic_arn->resource.empty()) {
+      ldpp_dout(this, 1) << "GetTopic Action 'TopicArn' argument is missing or invalid" << dendl;
+      return -EINVAL;
+    }
+
+    topic_name = topic_arn->resource;
+    return 0;
+  }
+
+  public:
+  // NOTE(review): no permission checks — confirm this is intentional
+  int verify_permission(optional_yield y) override {
+    return 0;
+  }
+  void pre_exec() override {
+    rgw_bucket_object_pre_exec(s);
+  }
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "pubsub_topic_get"; }
+  RGWOpType get_type() override { return RGW_OP_PUBSUB_TOPIC_GET; }
+  uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
+
+  // non-AWS extension: plain GetTopicResponse XML (no SNS namespace)
+  void send_response() override {
+    if (op_ret) {
+      set_req_state_err(s, op_ret);
+    }
+    dump_errno(s);
+    end_header(s, this, "application/xml");
+
+    if (op_ret < 0) {
+      return;
+    }
+
+    const auto f = s->formatter;
+    f->open_object_section("GetTopicResponse");
+    f->open_object_section("GetTopicResult");
+    encode_xml("Topic", result, f);
+    f->close_section();
+    f->open_object_section("ResponseMetadata");
+    encode_xml("RequestId", s->req_id, f);
+    f->close_section();
+    f->close_section();
+    rgw_flush_formatter_and_reset(s, f);
+  }
+};
+
+// Look up the named topic; refuse to return it over an insecure
+// transport when its endpoint URL embeds credentials.
+void RGWPSGetTopicOp::execute(optional_yield y) {
+  op_ret = get_params();
+  if (op_ret < 0) {
+    return;
+  }
+  const RGWPubSub ps(driver, s->owner.get_id().tenant);
+  op_ret = ps.get_topic(this, topic_name, result, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 1) << "failed to get topic '" << topic_name << "', ret=" << op_ret << dendl;
+    return;
+  }
+  if (topic_has_endpoint_secret(result) && !verify_transport_security(s->cct, *(s->info.env))) {
+    ldpp_dout(this, 1) << "topic '" << topic_name << "' contain secret and cannot be sent over insecure transport" << dendl;
+    op_ret = -EPERM;
+    return;
+  }
+  ldpp_dout(this, 1) << "successfully got topic '" << topic_name << "'" << dendl;
+}
+
+// command (AWS compliant):
+// POST
+// Action=GetTopicAttributes&TopicArn=<topic-arn>
+class RGWPSGetTopicAttributesOp : public RGWOp {
+  private:
+  std::string topic_name;     // resource component of the TopicArn argument
+  rgw_pubsub_topic result;
+
+  // Extract the topic name from the required TopicArn argument.
+  int get_params() {
+    const auto topic_arn = rgw::ARN::parse((s->info.args.get("TopicArn")));
+
+    if (!topic_arn || topic_arn->resource.empty()) {
+      ldpp_dout(this, 1) << "GetTopicAttribute Action 'TopicArn' argument is missing or invalid" << dendl;
+      return -EINVAL;
+    }
+
+    topic_name = topic_arn->resource;
+    return 0;
+  }
+
+  public:
+  // NOTE(review): no permission checks — confirm this is intentional
+  int verify_permission(optional_yield y) override {
+    return 0;
+  }
+  void pre_exec() override {
+    rgw_bucket_object_pre_exec(s);
+  }
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "pubsub_topic_get"; }
+  RGWOpType get_type() override { return RGW_OP_PUBSUB_TOPIC_GET; }
+  uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
+
+  // AWS SNS-style reply: topic fields rendered as attribute entries
+  void send_response() override {
+    if (op_ret) {
+      set_req_state_err(s, op_ret);
+    }
+    dump_errno(s);
+    end_header(s, this, "application/xml");
+
+    if (op_ret < 0) {
+      return;
+    }
+
+    const auto f = s->formatter;
+    f->open_object_section_in_ns("GetTopicAttributesResponse", AWS_SNS_NS);
+    f->open_object_section("GetTopicAttributesResult");
+    result.dump_xml_as_attributes(f);
+    f->close_section(); // GetTopicAttributesResult
+    f->open_object_section("ResponseMetadata");
+    encode_xml("RequestId", s->req_id, f);
+    f->close_section(); // ResponseMetadata
+    f->close_section(); // GetTopicAttributesResponse
+    rgw_flush_formatter_and_reset(s, f);
+  }
+};
+
+// Look up the named topic for the attributes reply; refuse to return
+// it over an insecure transport when its endpoint embeds credentials.
+void RGWPSGetTopicAttributesOp::execute(optional_yield y) {
+  op_ret = get_params();
+  if (op_ret < 0) {
+    return;
+  }
+  const RGWPubSub ps(driver, s->owner.get_id().tenant);
+  op_ret = ps.get_topic(this, topic_name, result, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 1) << "failed to get topic '" << topic_name << "', ret=" << op_ret << dendl;
+    return;
+  }
+  if (topic_has_endpoint_secret(result) && !verify_transport_security(s->cct, *(s->info.env))) {
+    ldpp_dout(this, 1) << "topic '" << topic_name << "' contain secret and cannot be sent over insecure transport" << dendl;
+    op_ret = -EPERM;
+    return;
+  }
+  ldpp_dout(this, 1) << "successfully got topic '" << topic_name << "'" << dendl;
+}
+
+// command (AWS compliant):
+// POST
+// Action=DeleteTopic&TopicArn=<topic-arn>
+class RGWPSDeleteTopicOp : public RGWOp {
+  private:
+  // topic name extracted from the 'TopicArn' request argument
+  std::string topic_name;
+
+  // Parse and validate the 'TopicArn' argument, and remove the queue backing
+  // a persistent topic (best effort - at this point it is unknown whether the
+  // topic is persistent). Returns 0 on success, -EINVAL on a missing/invalid
+  // ARN, or a negative error from the queue removal.
+  int get_params() {
+    const auto topic_arn = rgw::ARN::parse((s->info.args.get("TopicArn")));
+
+    if (!topic_arn || topic_arn->resource.empty()) {
+      ldpp_dout(this, 1) << "DeleteTopic Action 'TopicArn' argument is missing or invalid" << dendl;
+      return -EINVAL;
+    }
+
+    topic_name = topic_arn->resource;
+
+    // upon deletion it is not known if topic is persistent or not
+    // will try to delete the persistent topic anyway
+    const auto ret = rgw::notify::remove_persistent_topic(topic_name, s->yield);
+    if (ret == -ENOENT) {
+      // topic was not persistent, or already deleted
+      return 0;
+    }
+    if (ret < 0) {
+      ldpp_dout(this, 1) << "DeleteTopic Action failed to remove queue for persistent topics. error:" << ret << dendl;
+      return ret;
+    }
+
+    return 0;
+  }
+
+  public:
+  // NOTE(review): permission is not checked here (always allowed);
+  // authorization happens in RGWHandler_REST_PSTopic_AWS::authorize()
+  int verify_permission(optional_yield) override {
+    return 0;
+  }
+  void pre_exec() override {
+    rgw_bucket_object_pre_exec(s);
+  }
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "pubsub_topic_delete"; }
+  RGWOpType get_type() override { return RGW_OP_PUBSUB_TOPIC_DELETE; }
+  uint32_t op_mask() override { return RGW_OP_TYPE_DELETE; }
+
+  // Emit the SNS-style XML response; on error only the error header is sent.
+  void send_response() override {
+    if (op_ret) {
+      set_req_state_err(s, op_ret);
+    }
+    dump_errno(s);
+    end_header(s, this, "application/xml");
+
+    if (op_ret < 0) {
+      return;
+    }
+
+    const auto f = s->formatter;
+    f->open_object_section_in_ns("DeleteTopicResponse", AWS_SNS_NS);
+    f->open_object_section("ResponseMetadata");
+    encode_xml("RequestId", s->req_id, f);
+    f->close_section(); // ResponseMetadata
+    f->close_section(); // DeleteTopicResponse
+    rgw_flush_formatter_and_reset(s, f);
+  }
+};
+
+// Execute SNS DeleteTopic: remove the topic configuration for the caller's
+// tenant. The persistent-topic queue (if any) was already removed by
+// get_params().
+void RGWPSDeleteTopicOp::execute(optional_yield y) {
+  op_ret = get_params();
+  if (op_ret < 0) {
+    return;
+  }
+  const RGWPubSub ps(driver, s->owner.get_id().tenant);
+  op_ret = ps.remove_topic(this, topic_name, y);
+  if (op_ret < 0) {
+    // fix: close the quote around the topic name in the log message
+    ldpp_dout(this, 1) << "failed to remove topic '" << topic_name << "', ret=" << op_ret << dendl;
+    return;
+  }
+  ldpp_dout(this, 1) << "successfully removed topic '" << topic_name << "'" << dendl;
+}
+
+// factory table mapping SNS "Action" argument values to op constructors;
+// consulted by action_exists() and op_post() below
+using op_generator = RGWOp*(*)();
+static const std::unordered_map<std::string, op_generator> op_generators = {
+  {"CreateTopic", []() -> RGWOp* {return new RGWPSCreateTopicOp;}},
+  {"DeleteTopic", []() -> RGWOp* {return new RGWPSDeleteTopicOp;}},
+  {"ListTopics", []() -> RGWOp* {return new RGWPSListTopicsOp;}},
+  {"GetTopic", []() -> RGWOp* {return new RGWPSGetTopicOp;}},
+  {"GetTopicAttributes", []() -> RGWOp* {return new RGWPSGetTopicAttributesOp;}}
+};
+
+// An action is known iff the "Action" argument is present and registered
+// in the op_generators factory table.
+bool RGWHandler_REST_PSTopic_AWS::action_exists(const req_state* s)
+{
+  if (!s->info.args.exists("Action")) {
+    return false;
+  }
+  return op_generators.count(s->info.args.get("Action")) > 0;
+}
+
+// Dispatch a POST request to the op matching its "Action" argument,
+// or return nullptr (with a debug log) when the action is missing/unknown.
+RGWOp *RGWHandler_REST_PSTopic_AWS::op_post()
+{
+  s->dialect = "sns";
+  s->prot_flags = RGW_REST_STS;
+
+  if (!s->info.args.exists("Action")) {
+    ldpp_dout(s, 10) << "missing action argument in Topic handler" << dendl;
+    return nullptr;
+  }
+  const std::string action_name = s->info.args.get("Action");
+  if (const auto action_it = op_generators.find(action_name); action_it != op_generators.end()) {
+    return action_it->second();
+  }
+  ldpp_dout(s, 10) << "unknown action '" << action_name << "' for Topic handler" << dendl;
+  return nullptr;
+}
+
+// Authorize a topic request via standard S3 auth, additionally rejecting
+// anonymous identities (topic operations always require an authenticated user).
+int RGWHandler_REST_PSTopic_AWS::authorize(const DoutPrefixProvider* dpp, optional_yield y) {
+  const auto rc = RGW_Auth_S3::authorize(dpp, driver, auth_registry, s, y);
+  if (rc < 0) {
+    return rc;
+  }
+  if (s->auth.identity->is_anonymous()) {
+    ldpp_dout(dpp, 1) << "anonymous user not allowed in topic operations" << dendl;
+    return -ERR_INVALID_REQUEST;
+  }
+  return 0;
+}
+
+namespace {
+// return a unique topic by prefixing with the notification name: <notification>_<topic>
+std::string topic_to_unique(const std::string& topic, const std::string& notification) {
+  return notification + "_" + topic;
+}
+
+// extract the topic from a unique topic of the form: <notification>_<topic>
+// returns an empty string when 'unique_topic' does not start with the
+// "<notification>_" prefix. fix: the original used find() != npos, which
+// would also accept the marker appearing anywhere in the string, not only
+// as a prefix.
+[[maybe_unused]] std::string unique_to_topic(const std::string& unique_topic, const std::string& notification) {
+  const std::string prefix = notification + "_";
+  if (unique_topic.compare(0, prefix.size(), prefix) != 0) {
+    return "";
+  }
+  return unique_topic.substr(prefix.size());
+}
+
+// from list of bucket topics, find the one that was auto-generated by a notification
+auto find_unique_topic(const rgw_pubsub_bucket_topics& bucket_topics, const std::string& notif_name) {
+  auto it = std::find_if(bucket_topics.topics.begin(), bucket_topics.topics.end(), [&](const auto& val) { return notif_name == val.second.s3_id; });
+  return it != bucket_topics.topics.end() ?
+      std::optional<std::reference_wrapper<const rgw_pubsub_topic_filter>>(it->second):
+      std::nullopt;
+}
+}
+
+// Remove both the bucket notification and its auto-generated topic.
+// Both removals are attempted even if the first one fails.
+// fix: the error from remove_notification() was previously discarded and
+// overwritten by remove_topic()'s result; the first failure is now the one
+// reported, and 0 is returned only when both removals succeed.
+int remove_notification_by_topic(const DoutPrefixProvider *dpp, const std::string& topic_name, const RGWPubSub::Bucket& b, optional_yield y, const RGWPubSub& ps) {
+  const int notif_ret = b.remove_notification(dpp, topic_name, y);
+  if (notif_ret < 0) {
+    ldpp_dout(dpp, 1) << "failed to remove notification of topic '" << topic_name << "', ret=" << notif_ret << dendl;
+  }
+  const int topic_ret = ps.remove_topic(dpp, topic_name, y);
+  if (topic_ret < 0) {
+    ldpp_dout(dpp, 1) << "failed to remove auto-generated topic '" << topic_name << "', ret=" << topic_ret << dendl;
+  }
+  return notif_ret < 0 ? notif_ret : topic_ret;
+}
+
+// Delete every notification configured on a bucket (together with its
+// auto-generated topic), stopping at the first failure.
+int delete_all_notifications(const DoutPrefixProvider *dpp, const rgw_pubsub_bucket_topics& bucket_topics, const RGWPubSub::Bucket& b, optional_yield y, const RGWPubSub& ps) {
+  for (const auto& entry : bucket_topics.topics) {
+    if (const auto ret = remove_notification_by_topic(dpp, entry.first, b, y, ps); ret < 0) {
+      return ret;
+    }
+  }
+  return 0;
+}
+
+// command (S3 compliant): PUT /<bucket name>?notification
+// a "notification" and a subscription will be auto-generated
+// actual configuration is XML encoded in the body of the message
+class RGWPSCreateNotifOp : public RGWDefaultResponseOp {
+  // Validate the request shape: a value-less 'notification' query param on a
+  // bucket request. Returns 0 on success, -EINVAL otherwise.
+  int verify_params() override {
+    bool exists;
+    const auto no_value = s->info.args.get("notification", &exists);
+    if (!exists) {
+      ldpp_dout(this, 1) << "missing required param 'notification'" << dendl;
+      return -EINVAL;
+    }
+    if (no_value.length() > 0) {
+      ldpp_dout(this, 1) << "param 'notification' should not have any value" << dendl;
+      return -EINVAL;
+    }
+    if (s->bucket_name.empty()) {
+      ldpp_dout(this, 1) << "request must be on a bucket" << dendl;
+      return -EINVAL;
+    }
+    return 0;
+  }
+
+  // Read and XML-decode the request body into 'configurations'.
+  // Returns 0 on success, a read error, -EINVAL, or -ERR_MALFORMED_XML.
+  int get_params_from_body(rgw_pubsub_s3_notifications& configurations) {
+    const auto max_size = s->cct->_conf->rgw_max_put_param_size;
+    int r;
+    bufferlist data;
+    std::tie(r, data) = read_all_input(s, max_size, false);
+
+    if (r < 0) {
+      ldpp_dout(this, 1) << "failed to read XML payload" << dendl;
+      return r;
+    }
+    if (data.length() == 0) {
+      ldpp_dout(this, 1) << "XML payload missing" << dendl;
+      return -EINVAL;
+    }
+
+    RGWXMLDecoder::XMLParser parser;
+
+    if (!parser.init()){
+      ldpp_dout(this, 1) << "failed to initialize XML parser" << dendl;
+      return -EINVAL;
+    }
+    if (!parser.parse(data.c_str(), data.length(), 1)) {
+      ldpp_dout(this, 1) << "failed to parse XML payload" << dendl;
+      return -ERR_MALFORMED_XML;
+    }
+    try {
+      // NotificationConfigurations is mandatory
+      // It can be empty which means we delete all the notifications
+      RGWXMLDecoder::decode_xml("NotificationConfiguration", configurations, &parser, true);
+    } catch (RGWXMLDecoder::err& err) {
+      ldpp_dout(this, 1) << "failed to parse XML payload. error: " << err << dendl;
+      return -ERR_MALFORMED_XML;
+    }
+    return 0;
+  }
+public:
+  int verify_permission(optional_yield y) override;
+
+  void pre_exec() override {
+    rgw_bucket_object_pre_exec(s);
+  }
+
+  const char* name() const override { return "pubsub_notification_create_s3"; }
+  RGWOpType get_type() override { return RGW_OP_PUBSUB_NOTIF_CREATE; }
+  uint32_t op_mask() override { return RGW_OP_TYPE_WRITE; }
+
+
+  void execute(optional_yield) override;
+};
+
+// Execute PUT /<bucket>?notification: an empty configuration deletes all
+// notifications on the bucket; otherwise, for each configuration entry an
+// internal unique topic and a matching notification are auto-generated.
+void RGWPSCreateNotifOp::execute(optional_yield y) {
+  op_ret = verify_params();
+  if (op_ret < 0) {
+    return;
+  }
+
+  rgw_pubsub_s3_notifications configurations;
+  op_ret = get_params_from_body(configurations);
+  if (op_ret < 0) {
+    return;
+  }
+
+  std::unique_ptr<rgw::sal::User> user = driver->get_user(s->owner.get_id());
+  std::unique_ptr<rgw::sal::Bucket> bucket;
+  op_ret = driver->get_bucket(this, user.get(), s->bucket_tenant, s->bucket_name, &bucket, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 1) << "failed to get bucket '" <<
+      (s->bucket_tenant.empty() ? s->bucket_name : s->bucket_tenant + ":" + s->bucket_name) <<
+      "' info, ret = " << op_ret << dendl;
+    return;
+  }
+
+  const RGWPubSub ps(driver, s->owner.get_id().tenant);
+  const RGWPubSub::Bucket b(ps, bucket.get());
+
+  if(configurations.list.empty()) {
+    // an empty configuration means delete all notifications on the bucket
+    rgw_pubsub_bucket_topics bucket_topics;
+    op_ret = b.get_topics(this, bucket_topics, y);
+    if (op_ret < 0) {
+      ldpp_dout(this, 1) << "failed to get list of topics from bucket '" << s->bucket_name << "', ret=" << op_ret << dendl;
+      return;
+    }
+
+    op_ret = delete_all_notifications(this, bucket_topics, b, y, ps);
+    return;
+  }
+
+  for (const auto& c : configurations.list) {
+    const auto& notif_name = c.id;
+    if (notif_name.empty()) {
+      ldpp_dout(this, 1) << "missing notification id" << dendl;
+      op_ret = -EINVAL;
+      return;
+    }
+    if (c.topic_arn.empty()) {
+      ldpp_dout(this, 1) << "missing topic ARN in notification: '" << notif_name << "'" << dendl;
+      op_ret = -EINVAL;
+      return;
+    }
+
+    const auto arn = rgw::ARN::parse(c.topic_arn);
+    if (!arn || arn->resource.empty()) {
+      ldpp_dout(this, 1) << "topic ARN has invalid format: '" << c.topic_arn << "' in notification: '" << notif_name << "'" << dendl;
+      op_ret = -EINVAL;
+      return;
+    }
+
+    if (std::find(c.events.begin(), c.events.end(), rgw::notify::UnknownEvent) != c.events.end()) {
+      ldpp_dout(this, 1) << "unknown event type in notification: '" << notif_name << "'" << dendl;
+      op_ret = -EINVAL;
+      return;
+    }
+
+    const auto topic_name = arn->resource;
+
+    // get topic information. destination information is stored in the topic
+    rgw_pubsub_topic topic_info;
+    op_ret = ps.get_topic(this, topic_name, topic_info, y);
+    if (op_ret < 0) {
+      ldpp_dout(this, 1) << "failed to get topic '" << topic_name << "', ret=" << op_ret << dendl;
+      return;
+    }
+    // make sure that full topic configuration match
+    // TODO: use ARN match function
+
+    // create unique topic name. this has 2 reasons:
+    // (1) topics cannot be shared between different S3 notifications because they hold the filter information
+    // (2) make topic cleanup easier, when notification is removed
+    const auto unique_topic_name = topic_to_unique(topic_name, notif_name);
+    // generate the internal topic. destination is stored here for the "push-only" case
+    // when no subscription exists
+    // ARN is cached to make the "GET" method faster
+    op_ret = ps.create_topic(this, unique_topic_name, topic_info.dest, topic_info.arn, topic_info.opaque_data, y);
+    if (op_ret < 0) {
+      ldpp_dout(this, 1) << "failed to auto-generate unique topic '" << unique_topic_name <<
+        "', ret=" << op_ret << dendl;
+      return;
+    }
+    ldpp_dout(this, 20) << "successfully auto-generated unique topic '" << unique_topic_name << "'" << dendl;
+    // generate the notification (the unused local EventTypeList was removed)
+    op_ret = b.create_notification(this, unique_topic_name, c.events, std::make_optional(c.filter), notif_name, y);
+    if (op_ret < 0) {
+      ldpp_dout(this, 1) << "failed to auto-generate notification for unique topic '" << unique_topic_name <<
+        "', ret=" << op_ret << dendl;
+      // rollback generated topic (ignore return value)
+      ps.remove_topic(this, unique_topic_name, y);
+      return;
+    }
+    ldpp_dout(this, 20) << "successfully auto-generated notification for unique topic '" << unique_topic_name << "'" << dendl;
+  }
+}
+
+// Creating or replacing notifications requires s3:PutBucketNotification
+// on the bucket.
+int RGWPSCreateNotifOp::verify_permission(optional_yield y) {
+  return verify_bucket_permission(this, s, rgw::IAM::s3PutBucketNotification) ? 0 : -EACCES;
+}
+
+// command (extension to S3): DELETE /bucket?notification[=<notification-id>]
+class RGWPSDeleteNotifOp : public RGWDefaultResponseOp {
+  // Extract the (possibly empty) notification id from the 'notification'
+  // query param; an empty id means "delete all notifications on the bucket".
+  int get_params(std::string& notif_name) const {
+    bool exists;
+    notif_name = s->info.args.get("notification", &exists);
+    if (!exists) {
+      ldpp_dout(this, 1) << "missing required param 'notification'" << dendl;
+      return -EINVAL;
+    }
+    if (s->bucket_name.empty()) {
+      ldpp_dout(this, 1) << "request must be on a bucket" << dendl;
+      return -EINVAL;
+    }
+    return 0;
+  }
+
+public:
+  int verify_permission(optional_yield y) override;
+
+  void pre_exec() override {
+    rgw_bucket_object_pre_exec(s);
+  }
+
+  const char* name() const override { return "pubsub_notification_delete_s3"; }
+  RGWOpType get_type() override { return RGW_OP_PUBSUB_NOTIF_DELETE; }
+  uint32_t op_mask() override { return RGW_OP_TYPE_DELETE; }
+
+  void execute(optional_yield y) override;
+};
+
+// Execute DELETE /bucket?notification[=<id>]: remove a specific notification
+// (found via its auto-generated unique topic) or, when no id is given, all
+// notifications on the bucket. Deleting a non-existent notification succeeds.
+void RGWPSDeleteNotifOp::execute(optional_yield y) {
+  std::string notif_name;
+  op_ret = get_params(notif_name);
+  if (op_ret < 0) {
+    return;
+  }
+
+  std::unique_ptr<rgw::sal::User> user = driver->get_user(s->owner.get_id());
+  std::unique_ptr<rgw::sal::Bucket> bucket;
+  op_ret = driver->get_bucket(this, user.get(), s->bucket_tenant, s->bucket_name, &bucket, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 1) << "failed to get bucket '" <<
+      (s->bucket_tenant.empty() ? s->bucket_name : s->bucket_tenant + ":" + s->bucket_name) <<
+      "' info, ret = " << op_ret << dendl;
+    return;
+  }
+
+  const RGWPubSub ps(driver, s->owner.get_id().tenant);
+  const RGWPubSub::Bucket b(ps, bucket.get());
+
+  // get all topics on a bucket
+  rgw_pubsub_bucket_topics bucket_topics;
+  op_ret = b.get_topics(this, bucket_topics, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 1) << "failed to get list of topics from bucket '" << s->bucket_name << "', ret=" << op_ret << dendl;
+    return;
+  }
+
+  if (!notif_name.empty()) {
+    // delete a specific notification
+    const auto unique_topic = find_unique_topic(bucket_topics, notif_name);
+    if (unique_topic) {
+      const auto unique_topic_name = unique_topic->get().topic.name;
+      op_ret = remove_notification_by_topic(this, unique_topic_name, b, y, ps);
+      return;
+    }
+    // notification to be removed is not found - considered success
+    ldpp_dout(this, 20) << "notification '" << notif_name << "' already removed" << dendl;
+    return;
+  }
+
+  op_ret = delete_all_notifications(this, bucket_topics, b, y, ps);
+}
+
+// Deleting notifications requires s3:PutBucketNotification on the bucket
+// (same permission that governs creating them).
+int RGWPSDeleteNotifOp::verify_permission(optional_yield y) {
+  return verify_bucket_permission(this, s, rgw::IAM::s3PutBucketNotification) ? 0 : -EACCES;
+}
+
+// command (S3 compliant): GET /bucket?notification[=<notification-id>]
+class RGWPSListNotifsOp : public RGWOp {
+  // response payload, filled by execute()
+  rgw_pubsub_s3_notifications notifications;
+
+  // Extract the (possibly empty) notification id from the 'notification'
+  // query param; an empty id means "list all notifications on the bucket".
+  int get_params(std::string& notif_name) const {
+    bool exists;
+    notif_name = s->info.args.get("notification", &exists);
+    if (!exists) {
+      ldpp_dout(this, 1) << "missing required param 'notification'" << dendl;
+      return -EINVAL;
+    }
+    if (s->bucket_name.empty()) {
+      ldpp_dout(this, 1) << "request must be on a bucket" << dendl;
+      return -EINVAL;
+    }
+    return 0;
+  }
+
+public:
+  int verify_permission(optional_yield y) override;
+
+  void pre_exec() override {
+    rgw_bucket_object_pre_exec(s);
+  }
+
+  const char* name() const override { return "pubsub_notifications_get_s3"; }
+  RGWOpType get_type() override { return RGW_OP_PUBSUB_NOTIF_LIST; }
+  uint32_t op_mask() override { return RGW_OP_TYPE_READ; }
+
+  void execute(optional_yield y) override;
+  // Emit the collected notifications as XML; on error only headers are sent.
+  void send_response() override {
+    if (op_ret) {
+      set_req_state_err(s, op_ret);
+    }
+    dump_errno(s);
+    end_header(s, this, "application/xml");
+
+    if (op_ret < 0) {
+      return;
+    }
+    notifications.dump_xml(s->formatter);
+    rgw_flush_formatter_and_reset(s, s->formatter);
+  }
+};
+
+// Execute GET /bucket?notification[=<id>]: return a specific notification
+// (-ENOENT when absent) or, when no id is given, all S3 notifications on
+// the bucket (entries without an s3_id are skipped).
+void RGWPSListNotifsOp::execute(optional_yield y) {
+  std::string notif_name;
+  op_ret = get_params(notif_name);
+  if (op_ret < 0) {
+    return;
+  }
+
+  std::unique_ptr<rgw::sal::User> user = driver->get_user(s->owner.get_id());
+  std::unique_ptr<rgw::sal::Bucket> bucket;
+  op_ret = driver->get_bucket(this, user.get(), s->bucket_tenant, s->bucket_name, &bucket, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 1) << "failed to get bucket '" <<
+      (s->bucket_tenant.empty() ? s->bucket_name : s->bucket_tenant + ":" + s->bucket_name) <<
+      "' info, ret = " << op_ret << dendl;
+    return;
+  }
+
+  const RGWPubSub ps(driver, s->owner.get_id().tenant);
+  const RGWPubSub::Bucket b(ps, bucket.get());
+
+  // get all topics on a bucket
+  rgw_pubsub_bucket_topics bucket_topics;
+  op_ret = b.get_topics(this, bucket_topics, y);
+  if (op_ret < 0) {
+    ldpp_dout(this, 1) << "failed to get list of topics from bucket '" << s->bucket_name << "', ret=" << op_ret << dendl;
+    return;
+  }
+  if (!notif_name.empty()) {
+    // get info of a specific notification
+    const auto unique_topic = find_unique_topic(bucket_topics, notif_name);
+    if (unique_topic) {
+      notifications.list.emplace_back(unique_topic->get());
+      return;
+    }
+    op_ret = -ENOENT;
+    ldpp_dout(this, 1) << "failed to get notification info for '" << notif_name << "', ret=" << op_ret << dendl;
+    return;
+  }
+  // loop through all topics of the bucket
+  for (const auto& topic : bucket_topics.topics) {
+    if (topic.second.s3_id.empty()) {
+      // not an s3 notification
+      continue;
+    }
+    notifications.list.emplace_back(topic.second);
+  }
+}
+
+// Listing notifications requires s3:GetBucketNotification on the bucket.
+int RGWPSListNotifsOp::verify_permission(optional_yield y) {
+  return verify_bucket_permission(this, s, rgw::IAM::s3GetBucketNotification) ? 0 : -EACCES;
+}
+
+// HTTP-verb dispatch for the S3 bucket-notification handler
+RGWOp* RGWHandler_REST_PSNotifs_S3::op_get() {
+  return new RGWPSListNotifsOp();
+}
+
+RGWOp* RGWHandler_REST_PSNotifs_S3::op_put() {
+  return new RGWPSCreateNotifOp();
+}
+
+RGWOp* RGWHandler_REST_PSNotifs_S3::op_delete() {
+  return new RGWPSDeleteNotifOp();
+}
+
+// factory variants used when the op is created outside verb dispatch
+RGWOp* RGWHandler_REST_PSNotifs_S3::create_get_op() {
+  return new RGWPSListNotifsOp();
+}
+
+RGWOp* RGWHandler_REST_PSNotifs_S3::create_put_op() {
+  return new RGWPSCreateNotifOp();
+}
+
+RGWOp* RGWHandler_REST_PSNotifs_S3::create_delete_op() {
+  return new RGWPSDeleteNotifOp();
+}
+
diff --git a/src/rgw/rgw_rest_ratelimit.cc b/src/rgw/rgw_rest_ratelimit.cc
new file mode 100644
index 000000000..b482b4f82
--- /dev/null
+++ b/src/rgw/rgw_rest_ratelimit.cc
@@ -0,0 +1,349 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+#include "rgw_rest_ratelimit.h"
+// GET handler of the admin ratelimit REST API: reads bucket/user/global
+// ratelimit configuration (see execute() below).
+class RGWOp_Ratelimit_Info : public RGWRESTOp {
+// requires the "ratelimit" read capability
+int check_caps(const RGWUserCaps& caps) override {
+  return caps.check_cap("ratelimit", RGW_CAP_READ);
+}
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "get_ratelimit_info"; }
+};
+// Read ratelimit configuration for one of three scopes:
+//  - ratelimit-scope=bucket&bucket=<name>: the bucket's RGW_ATTR_RATELIMIT attr
+//  - ratelimit-scope=user&uid=<uid>:       the user's RGW_ATTR_RATELIMIT attr
+//  - global=true:                          the realm's period configuration
+// Any other combination yields -EINVAL.
+void RGWOp_Ratelimit_Info::execute(optional_yield y)
+{
+  std::string uid_str;
+  std::string ratelimit_scope;
+  std::string bucket_name;
+  std::string tenant_name;
+  bool global = false;
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  RESTArgs::get_string(s, "ratelimit-scope", ratelimit_scope, &ratelimit_scope);
+  RESTArgs::get_string(s, "bucket", bucket_name, &bucket_name);
+  RESTArgs::get_string(s, "tenant", tenant_name, &tenant_name);
+  // RESTArgs::get_bool default value to true even if global is empty,
+  // so validate the raw string before parsing it
+  bool exists;
+  std::string sval = s->info.args.get("global", &exists);
+  if (exists) {
+    if (!boost::iequals(sval,"true") && !boost::iequals(sval,"false")) {
+      op_ret = -EINVAL;
+      ldpp_dout(this, 20) << "global is not equal to true or false" << dendl;
+      return;
+    }
+  }
+  RESTArgs::get_bool(s, "global", false, &global);
+
+  if (ratelimit_scope == "bucket" && !bucket_name.empty() && !global) {
+    std::unique_ptr<rgw::sal::Bucket> bucket;
+    int r = driver->get_bucket(s, nullptr, tenant_name, bucket_name, &bucket, y);
+    if (r != 0) {
+      op_ret = r;
+      ldpp_dout(this, 0) << "Error on getting bucket info" << dendl;
+      return;
+    }
+    // a missing attr is not an error: report a default-constructed config
+    RGWRateLimitInfo ratelimit_info;
+    auto iter = bucket->get_attrs().find(RGW_ATTR_RATELIMIT);
+    if (iter != bucket->get_attrs().end()) {
+      try {
+        bufferlist& bl = iter->second;
+        auto biter = bl.cbegin();
+        decode(ratelimit_info, biter);
+      } catch (buffer::error& err) {
+        ldpp_dout(this, 0) << "Error on decoding ratelimit info from bucket" << dendl;
+        op_ret = -EIO;
+        return;
+      }
+    }
+    flusher.start(0);
+    s->formatter->open_object_section("bucket_ratelimit");
+    encode_json("bucket_ratelimit", ratelimit_info, s->formatter);
+    s->formatter->close_section();
+    flusher.flush();
+    return;
+  }
+  if (ratelimit_scope == "user" && !uid_str.empty() && !global) {
+    RGWRateLimitInfo ratelimit_info;
+    rgw_user user(uid_str);
+    std::unique_ptr<rgw::sal::User> user_sal;
+    user_sal = driver->get_user(user);
+    if (!rgw::sal::User::empty(user_sal)) {
+      op_ret = user_sal->load_user(this, y);
+      if (op_ret) {
+        ldpp_dout(this, 0) << "Cannot load user info" << dendl;
+        return;
+      }
+    } else {
+      ldpp_dout(this, 0) << "User does not exist" << dendl;
+      op_ret = -ENOENT;
+      return;
+    }
+
+    auto iter = user_sal->get_attrs().find(RGW_ATTR_RATELIMIT);
+    if(iter != user_sal->get_attrs().end()) {
+      try {
+        bufferlist& bl = iter->second;
+        auto biter = bl.cbegin();
+        decode(ratelimit_info, biter);
+      } catch (buffer::error& err) {
+        ldpp_dout(this, 0) << "Error on decoding ratelimit info from user" << dendl;
+        op_ret = -EIO;
+        return;
+      }
+    }
+    flusher.start(0);
+    s->formatter->open_object_section("user_ratelimit");
+    encode_json("user_ratelimit", ratelimit_info, s->formatter);
+    s->formatter->close_section();
+    flusher.flush();
+    // fix: without this return, a successful user-scope read fell through
+    // and set op_ret = -EINVAL below
+    return;
+  }
+  if (global) {
+    std::string realm_id = driver->get_zone()->get_realm_id();
+    RGWPeriodConfig period_config;
+    op_ret = period_config.read(this, static_cast<rgw::sal::RadosStore*>(driver)->svc()->sysobj, realm_id, y);
+    if (op_ret && op_ret != -ENOENT) {
+      ldpp_dout(this, 0) << "Error on period config read" << dendl;
+      return;
+    }
+    flusher.start(0);
+    s->formatter->open_object_section("period_config");
+    encode_json("bucket_ratelimit", period_config.bucket_ratelimit, s->formatter);
+    encode_json("user_ratelimit", period_config.user_ratelimit, s->formatter);
+    encode_json("anonymous_ratelimit", period_config.anon_ratelimit, s->formatter);
+    s->formatter->close_section();
+    flusher.flush();
+    return;
+  }
+  op_ret = -EINVAL;
+  return;
+}
+
+// POST handler of the admin ratelimit REST API: sets bucket/user/global
+// ratelimit configuration (see execute() below).
+class RGWOp_Ratelimit_Set : public RGWRESTOp {
+  // requires the "ratelimit" write capability
+  int check_caps(const RGWUserCaps& caps) override {
+    return caps.check_cap("ratelimit", RGW_CAP_WRITE);
+  }
+
+  void execute(optional_yield y) override;
+
+  const char* name() const override { return "put_ratelimit_info"; }
+
+  // Apply the have_*-flagged arguments onto 'ratelimit_info';
+  // sets op_ret = -EINVAL when no argument was provided at all.
+  void set_ratelimit_info(bool have_max_read_ops, int64_t max_read_ops, bool have_max_write_ops, int64_t max_write_ops,
+                          bool have_max_read_bytes, int64_t max_read_bytes, bool have_max_write_bytes, int64_t max_write_bytes,
+                          bool have_enabled, bool enabled, bool& ratelimit_configured, RGWRateLimitInfo& ratelimit_info);
+};
+
+
+// Copy each limit that was supplied (have_* flag set) and non-negative into
+// 'ratelimit_info', marking 'ratelimit_configured' whenever anything was
+// applied. If nothing at all was supplied, report -EINVAL via op_ret.
+void RGWOp_Ratelimit_Set::set_ratelimit_info(bool have_max_read_ops, int64_t max_read_ops, bool have_max_write_ops, int64_t max_write_ops,
+                                             bool have_max_read_bytes, int64_t max_read_bytes, bool have_max_write_bytes, int64_t max_write_bytes,
+                                             bool have_enabled, bool enabled, bool& ratelimit_configured, RGWRateLimitInfo& ratelimit_info)
+{
+  if (have_max_read_ops && max_read_ops >= 0) {
+    ratelimit_info.max_read_ops = max_read_ops;
+    ratelimit_configured = true;
+  }
+  if (have_max_write_ops && max_write_ops >= 0) {
+    ratelimit_info.max_write_ops = max_write_ops;
+    ratelimit_configured = true;
+  }
+  if (have_max_read_bytes && max_read_bytes >= 0) {
+    ratelimit_info.max_read_bytes = max_read_bytes;
+    ratelimit_configured = true;
+  }
+  if (have_max_write_bytes && max_write_bytes >= 0) {
+    ratelimit_info.max_write_bytes = max_write_bytes;
+    ratelimit_configured = true;
+  }
+  if (have_enabled) {
+    ratelimit_info.enabled = enabled;
+    ratelimit_configured = true;
+  }
+  if (!ratelimit_configured) {
+    ldpp_dout(this, 0) << "No rate limit configuration arguments have been sent" << dendl;
+    op_ret = -EINVAL;
+  }
+}
+
+
+// Set ratelimit configuration for one of three scopes:
+//  - ratelimit-scope=user&uid=<uid>:       merge into the user's attr
+//  - ratelimit-scope=bucket&bucket=<name>: merge into the bucket's attr
+//  - global=true (+scope bucket/anon/user): merge into the period config
+// The existing configuration is decoded first, so unspecified limits are
+// preserved. Any other combination yields -EINVAL.
+void RGWOp_Ratelimit_Set::execute(optional_yield y)
+{
+  std::string uid_str;
+  std::string ratelimit_scope;
+  std::string bucket_name;
+  std::string tenant_name;
+  RGWRateLimitInfo ratelimit_info;
+  bool ratelimit_configured = false;
+  bool enabled = false;
+  bool have_enabled = false;
+  bool global = false;
+  int64_t max_read_ops = 0;
+  bool have_max_read_ops = false;
+  int64_t max_write_ops = 0;
+  bool have_max_write_ops = false;
+  int64_t max_read_bytes = 0;
+  bool have_max_read_bytes = false;
+  int64_t max_write_bytes = 0;
+  bool have_max_write_bytes = false;
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  RESTArgs::get_string(s, "ratelimit-scope", ratelimit_scope, &ratelimit_scope);
+  RESTArgs::get_string(s, "bucket", bucket_name, &bucket_name);
+  RESTArgs::get_string(s, "tenant", tenant_name, &tenant_name);
+  // check there was no -EINVAL coming from get_int64
+  op_ret = RESTArgs::get_int64(s, "max-read-ops", 0, &max_read_ops, &have_max_read_ops);
+  op_ret |= RESTArgs::get_int64(s, "max-write-ops", 0, &max_write_ops, &have_max_write_ops);
+  op_ret |= RESTArgs::get_int64(s, "max-read-bytes", 0, &max_read_bytes, &have_max_read_bytes);
+  op_ret |= RESTArgs::get_int64(s, "max-write-bytes", 0, &max_write_bytes, &have_max_write_bytes);
+  if (op_ret) {
+    ldpp_dout(this, 0) << "one of the maximum arguments could not be parsed" << dendl;
+    return;
+  }
+  // RESTArgs::get_bool default value to true even if enabled or global are empty,
+  // so validate the raw string values first
+  std::string sval = s->info.args.get("enabled", &have_enabled);
+  if (have_enabled) {
+    if (!boost::iequals(sval,"true") && !boost::iequals(sval,"false")) {
+      ldpp_dout(this, 20) << "enabled is not equal to true or false" << dendl;
+      op_ret = -EINVAL;
+      return;
+    }
+  }
+  RESTArgs::get_bool(s, "enabled", false, &enabled, &have_enabled);
+  bool exists;
+  sval = s->info.args.get("global", &exists);
+  if (exists) {
+    if (!boost::iequals(sval,"true") && !boost::iequals(sval,"false")) {
+      // fix: log message typo "faslse" -> "false"
+      ldpp_dout(this, 20) << "global is not equal to true or false" << dendl;
+      op_ret = -EINVAL;
+      return;
+    }
+  }
+  RESTArgs::get_bool(s, "global", false, &global, nullptr);
+  // dry-run validation: fail early if no configuration argument was sent
+  set_ratelimit_info(have_max_read_ops, max_read_ops, have_max_write_ops, max_write_ops,
+                     have_max_read_bytes, max_read_bytes, have_max_write_bytes, max_write_bytes,
+                     have_enabled, enabled, ratelimit_configured, ratelimit_info);
+  if (op_ret) {
+    return;
+  }
+  if (ratelimit_scope == "user" && !uid_str.empty() && !global) {
+    rgw_user user(uid_str);
+    std::unique_ptr<rgw::sal::User> user_sal;
+    user_sal = driver->get_user(user);
+    if (!rgw::sal::User::empty(user_sal)) {
+      op_ret = user_sal->load_user(this, y);
+      if (op_ret) {
+        ldpp_dout(this, 0) << "Cannot load user info" << dendl;
+        return;
+      }
+    } else {
+      ldpp_dout(this, 0) << "User does not exist" << dendl;
+      op_ret = -ENOENT;
+      return;
+    }
+    // start from the user's current configuration (if any)
+    auto iter = user_sal->get_attrs().find(RGW_ATTR_RATELIMIT);
+    if (iter != user_sal->get_attrs().end()) {
+      try {
+        bufferlist& bl = iter->second;
+        auto biter = bl.cbegin();
+        decode(ratelimit_info, biter);
+      } catch (buffer::error& err) {
+        ldpp_dout(this, 0) << "Error on decoding ratelimit info from user" << dendl;
+        op_ret = -EIO;
+        return;
+      }
+    }
+    set_ratelimit_info(have_max_read_ops, max_read_ops, have_max_write_ops, max_write_ops,
+                       have_max_read_bytes, max_read_bytes, have_max_write_bytes, max_write_bytes,
+                       have_enabled, enabled, ratelimit_configured, ratelimit_info);
+    bufferlist bl;
+    ratelimit_info.encode(bl);
+    rgw::sal::Attrs attr;
+    attr[RGW_ATTR_RATELIMIT] = bl;
+    op_ret = user_sal->merge_and_store_attrs(this, attr, y);
+    return;
+  }
+
+  if (ratelimit_scope == "bucket" && !bucket_name.empty() && !global) {
+    ldpp_dout(this, 0) << "getting bucket info" << dendl;
+    std::unique_ptr<rgw::sal::Bucket> bucket;
+    op_ret = driver->get_bucket(this, nullptr, tenant_name, bucket_name, &bucket, y);
+    if (op_ret) {
+      ldpp_dout(this, 0) << "Error on getting bucket info" << dendl;
+      return;
+    }
+    // start from the bucket's current configuration (if any)
+    auto iter = bucket->get_attrs().find(RGW_ATTR_RATELIMIT);
+    if (iter != bucket->get_attrs().end()) {
+      try {
+        bufferlist& bl = iter->second;
+        auto biter = bl.cbegin();
+        decode(ratelimit_info, biter);
+      } catch (buffer::error& err) {
+        ldpp_dout(this, 0) << "Error on decoding ratelimit info from bucket" << dendl;
+        op_ret = -EIO;
+        return;
+      }
+    }
+    bufferlist bl;
+    set_ratelimit_info(have_max_read_ops, max_read_ops, have_max_write_ops, max_write_ops,
+                       have_max_read_bytes, max_read_bytes, have_max_write_bytes, max_write_bytes,
+                       have_enabled, enabled, ratelimit_configured, ratelimit_info);
+    ratelimit_info.encode(bl);
+    rgw::sal::Attrs attr;
+    attr[RGW_ATTR_RATELIMIT] = bl;
+    op_ret = bucket->merge_and_store_attrs(this, attr, y);
+    return;
+  }
+  if (global) {
+    std::string realm_id = driver->get_zone()->get_realm_id();
+    RGWPeriodConfig period_config;
+    op_ret = period_config.read(s, static_cast<rgw::sal::RadosStore*>(driver)->svc()->sysobj, realm_id, y);
+    if (op_ret && op_ret != -ENOENT) {
+      ldpp_dout(this, 0) << "Error on period config read" << dendl;
+      return;
+    }
+    if (ratelimit_scope == "bucket") {
+      ratelimit_info = period_config.bucket_ratelimit;
+      set_ratelimit_info(have_max_read_ops, max_read_ops, have_max_write_ops, max_write_ops,
+                         have_max_read_bytes, max_read_bytes, have_max_write_bytes, max_write_bytes,
+                         have_enabled, enabled, ratelimit_configured, ratelimit_info);
+      period_config.bucket_ratelimit = ratelimit_info;
+      op_ret = period_config.write(s, static_cast<rgw::sal::RadosStore*>(driver)->svc()->sysobj, realm_id, y);
+      return;
+    }
+    if (ratelimit_scope == "anon") {
+      ratelimit_info = period_config.anon_ratelimit;
+      set_ratelimit_info(have_max_read_ops, max_read_ops, have_max_write_ops, max_write_ops,
+                         have_max_read_bytes, max_read_bytes, have_max_write_bytes, max_write_bytes,
+                         have_enabled, enabled, ratelimit_configured, ratelimit_info);
+      period_config.anon_ratelimit = ratelimit_info;
+      op_ret = period_config.write(s, static_cast<rgw::sal::RadosStore*>(driver)->svc()->sysobj, realm_id, y);
+      return;
+    }
+    if (ratelimit_scope == "user") {
+      ratelimit_info = period_config.user_ratelimit;
+      set_ratelimit_info(have_max_read_ops, max_read_ops, have_max_write_ops, max_write_ops,
+                         have_max_read_bytes, max_read_bytes, have_max_write_bytes, max_write_bytes,
+                         have_enabled, enabled, ratelimit_configured, ratelimit_info);
+      period_config.user_ratelimit = ratelimit_info;
+      op_ret = period_config.write(s, static_cast<rgw::sal::RadosStore*>(driver)->svc()->sysobj, realm_id, y);
+      return;
+    }
+  }
+  op_ret = -EINVAL;
+  return;
+}
+// GET reads ratelimit configuration
+RGWOp* RGWHandler_Ratelimit::op_get()
+{
+  return new RGWOp_Ratelimit_Info;
+}
+// POST sets ratelimit configuration
+RGWOp* RGWHandler_Ratelimit::op_post()
+{
+  return new RGWOp_Ratelimit_Set;
+}
diff --git a/src/rgw/rgw_rest_ratelimit.h b/src/rgw/rgw_rest_ratelimit.h
new file mode 100644
index 000000000..c3a942b19
--- /dev/null
+++ b/src/rgw/rgw_rest_ratelimit.h
@@ -0,0 +1,34 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_rest.h"
+#include "rgw_rest_s3.h"
+#include "rgw_sal_rados.h"
+
+// REST handler for the admin ratelimit endpoint; supports GET and POST only.
+class RGWHandler_Ratelimit : public RGWHandler_Auth_S3 {
+protected:
+  RGWOp *op_get() override;
+  RGWOp *op_post() override;
+public:
+  using RGWHandler_Auth_S3::RGWHandler_Auth_S3;
+  ~RGWHandler_Ratelimit() override = default;
+
+  // no per-resource read permissions; ops gate access via "ratelimit" caps
+  int read_permissions(RGWOp*, optional_yield) override {
+    return 0;
+  }
+};
+
+// REST manager that instantiates RGWHandler_Ratelimit for every request.
+class RGWRESTMgr_Ratelimit : public RGWRESTMgr {
+public:
+  RGWRESTMgr_Ratelimit() = default;
+  ~RGWRESTMgr_Ratelimit() override = default;
+
+  RGWHandler_REST *get_handler(rgw::sal::Driver* driver,
+                               req_state*,
+                               const rgw::auth::StrategyRegistry& auth_registry,
+                               const std::string&) override {
+    return new RGWHandler_Ratelimit(auth_registry);
+  }
+};
diff --git a/src/rgw/rgw_rest_role.cc b/src/rgw/rgw_rest_role.cc
new file mode 100644
index 000000000..e71dff570
--- /dev/null
+++ b/src/rgw/rgw_rest_role.cc
@@ -0,0 +1,1022 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <errno.h>
+#include <regex>
+
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/ceph_json.h"
+
+#include "include/types.h"
+#include "rgw_string.h"
+
+#include "rgw_common.h"
+#include "rgw_op.h"
+#include "rgw_rest.h"
+#include "rgw_role.h"
+#include "rgw_rest_role.h"
+#include "rgw_sal.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+// Common authorization for role ops that operate on an existing role.
+// Loads the role named by the "RoleName" query arg (stashing it in _role for
+// execute()), then allows the request if the user holds the "roles" admin cap
+// or an IAM policy grants get_op() on the role's ARN.
+int RGWRestRole::verify_permission(optional_yield y)
+{
+  if (s->auth.identity->is_anonymous()) {
+    return -EACCES;
+  }
+
+  string role_name = s->info.args.get("RoleName");
+  std::unique_ptr<rgw::sal::RGWRole> role = driver->get_role(role_name,
+                                                             s->user->get_tenant());
+  if (op_ret = role->get(s, y); op_ret < 0) {
+    if (op_ret == -ENOENT) {
+      op_ret = -ERR_NO_ROLE_FOUND;
+    }
+    return op_ret;
+  }
+
+  // Admin caps short-circuit IAM policy evaluation.
+  if (int ret = check_caps(s->user->get_caps()); ret == 0) {
+    _role = std::move(role);
+    return ret;
+  }
+
+  string resource_name = role->get_path() + role_name;
+  uint64_t op = get_op();
+  if (!verify_user_permission(this,
+                              s,
+                              rgw::ARN(resource_name,
+                                       "role",
+                                       s->user->get_tenant(), true),
+                              op)) {
+    return -EACCES;
+  }
+
+  _role = std::move(role);
+
+  return 0;
+}
+
+// Parse Tags.member.<N>.Key / Tags.member.<N>.Value query parameters into the
+// 'tags' multimap, pairing keys and values positionally by member index.
+//
+// Returns 0 on success, -EINVAL when the number of keys and values differ.
+//
+// Fix: the previous implementation inserted into vectors at offset (index-1)
+// with no bounds check, so parameters arriving out of order (e.g. member.3
+// before member.1) or with index gaps produced an out-of-range
+// vector::insert iterator — undefined behavior. Collecting into index-keyed
+// maps is insensitive to arrival order and keeps ascending-index pairing.
+int RGWRestRole::parse_tags()
+{
+  std::map<int, string> keys, vals;
+  auto val_map = s->info.args.get_params();
+  const regex pattern_key("Tags.member.([0-9]+).Key");
+  const regex pattern_value("Tags.member.([0-9]+).Value");
+  for (const auto& v : val_map) {
+    smatch match;
+    if (regex_match(v.first, match, pattern_key)) {
+      ldout(s->cct, 20) << "Key index: " << match.str(1) << dendl;
+      keys[stoi(match.str(1))] = v.second;
+    } else if (regex_match(v.first, match, pattern_value)) {
+      ldout(s->cct, 20) << "Value index: " << match.str(1) << dendl;
+      vals[stoi(match.str(1))] = v.second;
+    }
+  }
+  if (keys.size() != vals.size()) {
+    ldout(s->cct, 0) << "No. of keys doesn't match with no. of values in tags" << dendl;
+    return -EINVAL;
+  }
+  // Pair keys and values in ascending member-index order.
+  auto k = keys.begin();
+  auto v = vals.begin();
+  for (; k != keys.end(); ++k, ++v) {
+    tags.emplace(k->second, v->second);
+    ldout(s->cct, 0) << "Tag Key: " << k->second << " Tag Value is: " << v->second << dendl;
+  }
+  return 0;
+}
+
+// Emit the final status line and headers for a role op. A non-zero op_ret is
+// first folded into the request error state so the right HTTP code goes out.
+void RGWRestRole::send_response()
+{
+  if (op_ret != 0)
+    set_req_state_err(s, op_ret);
+  dump_errno(s);
+  end_header(s, this);
+}
+
+// Read-only role ops require the "roles=read" admin capability.
+int RGWRoleRead::check_caps(const RGWUserCaps& caps)
+{
+  return caps.check_cap("roles", RGW_CAP_READ);
+}
+
+// Mutating role ops require the "roles=write" admin capability.
+int RGWRoleWrite::check_caps(const RGWUserCaps& caps)
+{
+  return caps.check_cap("roles", RGW_CAP_WRITE);
+}
+
+// CreateRole authorization: there is no existing role to load, so authorize
+// against the ARN the new role would have (path + name).
+int RGWCreateRole::verify_permission(optional_yield y)
+{
+  if (s->auth.identity->is_anonymous()) {
+    return -EACCES;
+  }
+
+  // "roles" admin cap bypasses IAM policy evaluation.
+  if (int ret = check_caps(s->user->get_caps()); ret == 0) {
+    return ret;
+  }
+
+  string role_name = s->info.args.get("RoleName");
+  string role_path = s->info.args.get("Path");
+
+  string resource_name = role_path + role_name;
+  if (!verify_user_permission(this,
+                              s,
+                              rgw::ARN(resource_name,
+                                       "role",
+                                       s->user->get_tenant(), true),
+                              get_op())) {
+    return -EACCES;
+  }
+  return 0;
+}
+
+// Parse and validate CreateRole parameters. RoleName and
+// AssumeRolePolicyDocument are mandatory; the trust policy must parse as a
+// valid IAM policy, and at most 50 tags are accepted (AWS limit).
+int RGWCreateRole::get_params()
+{
+  role_name = s->info.args.get("RoleName");
+  role_path = s->info.args.get("Path");
+  trust_policy = s->info.args.get("AssumeRolePolicyDocument");
+  max_session_duration = s->info.args.get("MaxSessionDuration");
+
+  if (role_name.empty() || trust_policy.empty()) {
+    ldpp_dout(this, 20) << "ERROR: one of role name or assume role policy document is empty"
+                        << dendl;
+    return -EINVAL;
+  }
+
+  // Validate the trust policy by constructing (and discarding) a Policy.
+  bufferlist bl = bufferlist::static_from_string(trust_policy);
+  try {
+    const rgw::IAM::Policy p(
+      s->cct, s->user->get_tenant(), bl,
+      s->cct->_conf.get_val<bool>("rgw_policy_reject_invalid_principals"));
+  }
+  catch (rgw::IAM::PolicyParseException& e) {
+    ldpp_dout(this, 5) << "failed to parse policy: " << e.what() << dendl;
+    s->err.message = e.what();
+    return -ERR_MALFORMED_DOC;
+  }
+
+  int ret = parse_tags();
+  if (ret < 0) {
+    return ret;
+  }
+
+  if (tags.size() > 50) {
+    ldout(s->cct, 0) << "No. tags is greater than 50" << dendl;
+    return -EINVAL;
+  }
+
+  return 0;
+}
+
+// Create the role. On a secondary zone the request is first forwarded to the
+// metadata master so the master allocates the canonical RoleId, which is then
+// decoded from the master's XML response and used for the local create.
+void RGWCreateRole::execute(optional_yield y)
+{
+  op_ret = get_params();
+  if (op_ret < 0) {
+    return;
+  }
+  std::string user_tenant = s->user->get_tenant();
+  std::unique_ptr<rgw::sal::RGWRole> role = driver->get_role(role_name,
+                                                             user_tenant,
+                                                             role_path,
+                                                             trust_policy,
+                                                             max_session_duration,
+                                                             tags);
+  if (!user_tenant.empty() && role->get_tenant() != user_tenant) {
+    ldpp_dout(this, 20) << "ERROR: the tenant provided in the role name does not match with the tenant of the user creating the role"
+                        << dendl;
+    op_ret = -EINVAL;
+    return;
+  }
+
+  std::string role_id;
+
+  if (!driver->is_meta_master()) {
+    RGWXMLDecoder::XMLParser parser;
+    if (!parser.init()) {
+      ldpp_dout(this, 0) << "ERROR: failed to initialize xml parser" << dendl;
+      op_ret = -EINVAL;
+      return;
+    }
+
+    bufferlist data;
+    // Strip the parameters the master re-derives from the forwarded post
+    // body, including every Tags.member.* entry.
+    s->info.args.remove("RoleName");
+    s->info.args.remove("Path");
+    s->info.args.remove("AssumeRolePolicyDocument");
+    s->info.args.remove("MaxSessionDuration");
+    s->info.args.remove("Action");
+    s->info.args.remove("Version");
+    auto& val_map = s->info.args.get_params();
+    // FIX: erase() invalidates the erased iterator; the previous loop kept
+    // advancing it afterwards (undefined behavior). Use the iterator
+    // returned by erase() instead.
+    for (auto it = val_map.begin(); it != val_map.end(); ) {
+      if (it->first.find("Tags.member.") == 0) {
+        it = val_map.erase(it);
+      } else {
+        ++it;
+      }
+    }
+
+    RGWUserInfo info = s->user->get_info();
+    const auto& it = info.access_keys.begin();
+    RGWAccessKey key;
+    if (it != info.access_keys.end()) {
+      key.id = it->first;
+      RGWAccessKey cred = it->second;
+      key.key = cred.key;
+    }
+    op_ret = driver->forward_iam_request_to_master(s, key, nullptr, bl_post_body, &parser, s->info, y);
+    if (op_ret < 0) {
+      ldpp_dout(this, 20) << "ERROR: forward_iam_request_to_master failed with error code: " << op_ret << dendl;
+      return;
+    }
+
+    // Walk CreateRoleResponse/CreateRoleResult/Role/RoleId in the master's
+    // reply; each level is mandatory.
+    XMLObj* create_role_resp_obj = parser.find_first("CreateRoleResponse");
+    if (!create_role_resp_obj) {
+      ldpp_dout(this, 5) << "ERROR: unexpected xml: CreateRoleResponse" << dendl;
+      op_ret = -EINVAL;
+      return;
+    }
+
+    XMLObj* create_role_res_obj = create_role_resp_obj->find_first("CreateRoleResult");
+    if (!create_role_res_obj) {
+      ldpp_dout(this, 5) << "ERROR: unexpected xml: CreateRoleResult" << dendl;
+      op_ret = -EINVAL;
+      return;
+    }
+
+    XMLObj* role_obj = create_role_res_obj->find_first("Role");
+    if (!role_obj) {
+      ldpp_dout(this, 5) << "ERROR: unexpected xml: Role" << dendl;
+      op_ret = -EINVAL;
+      return;
+    }
+
+    try {
+      RGWXMLDecoder::decode_xml("RoleId", role_id, role_obj, true);
+    } catch (RGWXMLDecoder::err& err) {
+      ldpp_dout(this, 5) << "ERROR: unexpected xml: RoleId" << dendl;
+      op_ret = -EINVAL;
+      return;
+    }
+    ldpp_dout(this, 0) << "role_id decoded from master zonegroup response is" << role_id << dendl;
+  }
+
+  // role_id is empty on the master; create() generates one in that case.
+  op_ret = role->create(s, true, role_id, y);
+  if (op_ret == -EEXIST) {
+    op_ret = -ERR_ROLE_EXISTS;
+    return;
+  }
+
+  if (op_ret == 0) {
+    s->formatter->open_object_section("CreateRoleResponse");
+    s->formatter->open_object_section("CreateRoleResult");
+    s->formatter->open_object_section("Role");
+    role->dump(s->formatter);
+    s->formatter->close_section();
+    s->formatter->close_section();
+    s->formatter->open_object_section("ResponseMetadata");
+    s->formatter->dump_string("RequestId", s->trans_id);
+    s->formatter->close_section();
+    s->formatter->close_section();
+  }
+}
+
+// DeleteRole parameters: only RoleName, which must be non-empty.
+int RGWDeleteRole::get_params()
+{
+  role_name = s->info.args.get("RoleName");
+
+  if (role_name.empty()) {
+    ldpp_dout(this, 20) << "ERROR: Role name is empty"<< dendl;
+    return -EINVAL;
+  }
+
+  return 0;
+}
+
+// Delete the role. On a secondary zone the delete is first forwarded to the
+// metadata master; a local -ENOENT afterwards is tolerated because metadata
+// sync may already have removed the role.
+void RGWDeleteRole::execute(optional_yield y)
+{
+  bool is_master = true;
+  int master_op_ret = 0;
+  op_ret = get_params();
+  if (op_ret < 0) {
+    return;
+  }
+
+  if (!driver->is_meta_master()) {
+    is_master = false;
+    RGWXMLDecoder::XMLParser parser;
+    if (!parser.init()) {
+      ldpp_dout(this, 0) << "ERROR: failed to initialize xml parser" << dendl;
+      op_ret = -EINVAL;
+      // FIX: previously fell through and forwarded the request with a broken
+      // parser; bail out on init failure like every other role op does.
+      return;
+    }
+
+    bufferlist data;
+    s->info.args.remove("RoleName");
+    s->info.args.remove("Action");
+    s->info.args.remove("Version");
+
+    RGWUserInfo info = s->user->get_info();
+    const auto& it = info.access_keys.begin();
+    RGWAccessKey key;
+    if (it != info.access_keys.end()) {
+      key.id = it->first;
+      RGWAccessKey cred = it->second;
+      key.key = cred.key;
+    }
+    master_op_ret = driver->forward_iam_request_to_master(s, key, nullptr, bl_post_body, &parser, s->info, y);
+    if (master_op_ret < 0) {
+      op_ret = master_op_ret;
+      ldpp_dout(this, 0) << "forward_iam_request_to_master returned ret=" << op_ret << dendl;
+      return;
+    }
+  }
+
+  op_ret = _role->delete_obj(s, y);
+
+  if (op_ret == -ENOENT) {
+    // Role has been deleted since metadata from master has synced up.
+    if (!is_master && master_op_ret == 0) {
+      op_ret = 0;
+    } else {
+      op_ret = -ERR_NO_ROLE_FOUND;
+    }
+    return;
+  }
+  if (!op_ret) {
+    s->formatter->open_object_section("DeleteRoleResponse");
+    s->formatter->open_object_section("ResponseMetadata");
+    s->formatter->dump_string("RequestId", s->trans_id);
+    s->formatter->close_section();
+    s->formatter->close_section();
+  }
+}
+
+// GetRole defers authorization to execute(): the role must be loaded first so
+// its path can be used to build the ARN, hence this override is a no-op.
+int RGWGetRole::verify_permission(optional_yield y)
+{
+  return 0;
+}
+
+// Real authorization, called from execute() once the role is loaded: admin
+// cap, or an IAM policy granting get_op() on the role's ARN.
+int RGWGetRole::_verify_permission(const rgw::sal::RGWRole* role)
+{
+  if (s->auth.identity->is_anonymous()) {
+    return -EACCES;
+  }
+
+  if (int ret = check_caps(s->user->get_caps()); ret == 0) {
+    return ret;
+  }
+
+  string resource_name = role->get_path() + role->get_name();
+  if (!verify_user_permission(this,
+                              s,
+                              rgw::ARN(resource_name,
+                                       "role",
+                                       s->user->get_tenant(), true),
+                              get_op())) {
+    return -EACCES;
+  }
+  return 0;
+}
+
+// GetRole parameters: only RoleName, which must be non-empty.
+int RGWGetRole::get_params()
+{
+  role_name = s->info.args.get("RoleName");
+
+  if (role_name.empty()) {
+    ldpp_dout(this, 20) << "ERROR: Role name is empty"<< dendl;
+    return -EINVAL;
+  }
+
+  return 0;
+}
+
+// Load the role, authorize (deferred from verify_permission — see above),
+// then dump it in GetRoleResponse XML.
+void RGWGetRole::execute(optional_yield y)
+{
+  op_ret = get_params();
+  if (op_ret < 0) {
+    return;
+  }
+  std::unique_ptr<rgw::sal::RGWRole> role = driver->get_role(role_name,
+                                                             s->user->get_tenant());
+  op_ret = role->get(s, y);
+  if (op_ret < 0) {
+    // FIX: previously only -ENOENT returned early; any other load failure
+    // fell through into the permission check, masking the original error.
+    if (op_ret == -ENOENT) {
+      op_ret = -ERR_NO_ROLE_FOUND;
+    }
+    return;
+  }
+
+  op_ret = _verify_permission(role.get());
+
+  if (op_ret == 0) {
+    s->formatter->open_object_section("GetRoleResponse");
+    s->formatter->open_object_section("ResponseMetadata");
+    s->formatter->dump_string("RequestId", s->trans_id);
+    s->formatter->close_section();
+    s->formatter->open_object_section("GetRoleResult");
+    s->formatter->open_object_section("Role");
+    role->dump(s->formatter);
+    s->formatter->close_section();
+    s->formatter->close_section();
+    s->formatter->close_section();
+  }
+}
+
+// UpdateAssumeRolePolicy parameters: RoleName and PolicyDocument are both
+// required. Note the document is only validated as well-formed JSON here,
+// not as a full IAM policy (unlike CreateRole/PutRolePolicy).
+int RGWModifyRoleTrustPolicy::get_params()
+{
+  role_name = s->info.args.get("RoleName");
+  trust_policy = s->info.args.get("PolicyDocument");
+
+  if (role_name.empty() || trust_policy.empty()) {
+    ldpp_dout(this, 20) << "ERROR: One of role name or trust policy is empty"<< dendl;
+    return -EINVAL;
+  }
+  JSONParser p;
+  if (!p.parse(trust_policy.c_str(), trust_policy.length())) {
+    ldpp_dout(this, 20) << "ERROR: failed to parse assume role policy doc" << dendl;
+    return -ERR_MALFORMED_DOC;
+  }
+
+  return 0;
+}
+
+// Replace the role's assume-role (trust) policy, forwarding to the metadata
+// master first on a secondary zone.
+void RGWModifyRoleTrustPolicy::execute(optional_yield y)
+{
+  op_ret = get_params();
+  if (op_ret < 0) {
+    return;
+  }
+
+  if (!driver->is_meta_master()) {
+    RGWXMLDecoder::XMLParser parser;
+    if (!parser.init()) {
+      ldpp_dout(this, 0) << "ERROR: failed to initialize xml parser" << dendl;
+      op_ret = -EINVAL;
+      return;
+    }
+
+    bufferlist data;
+    s->info.args.remove("RoleName");
+    s->info.args.remove("PolicyDocument");
+    s->info.args.remove("Action");
+    s->info.args.remove("Version");
+
+    // Forward with the user's first access key as the signing credential.
+    RGWUserInfo info = s->user->get_info();
+    const auto& it = info.access_keys.begin();
+    RGWAccessKey key;
+    if (it != info.access_keys.end()) {
+      key.id = it->first;
+      RGWAccessKey cred = it->second;
+      key.key = cred.key;
+    }
+    op_ret = driver->forward_iam_request_to_master(s, key, nullptr, bl_post_body, &parser, s->info, y);
+    if (op_ret < 0) {
+      ldpp_dout(this, 20) << "ERROR: forward_iam_request_to_master failed with error code: " << op_ret << dendl;
+      return;
+    }
+  }
+
+  _role->update_trust_policy(trust_policy);
+  op_ret = _role->update(this, y);
+
+  // NOTE(review): the response body is emitted even when update() failed,
+  // unlike sibling ops which guard on op_ret == 0 — confirm intended.
+  s->formatter->open_object_section("UpdateAssumeRolePolicyResponse");
+  s->formatter->open_object_section("ResponseMetadata");
+  s->formatter->dump_string("RequestId", s->trans_id);
+  s->formatter->close_section();
+  s->formatter->close_section();
+}
+
+// ListRoles authorization: no single resource, so check admin caps or an IAM
+// policy grant against the empty (account-wide) ARN.
+int RGWListRoles::verify_permission(optional_yield y)
+{
+  if (s->auth.identity->is_anonymous()) {
+    return -EACCES;
+  }
+
+  if (int ret = check_caps(s->user->get_caps()); ret == 0) {
+    return ret;
+  }
+
+  if (!verify_user_permission(this,
+                              s,
+                              rgw::ARN(),
+                              get_op())) {
+    return -EACCES;
+  }
+
+  return 0;
+}
+
+// ListRoles parameters: optional PathPrefix filter; nothing is mandatory.
+int RGWListRoles::get_params()
+{
+  path_prefix = s->info.args.get("PathPrefix");
+
+  return 0;
+}
+
+// List the tenant's roles (optionally filtered by path prefix) and dump them
+// as ListRolesResponse XML.
+void RGWListRoles::execute(optional_yield y)
+{
+  op_ret = get_params();
+  if (op_ret < 0) {
+    return;
+  }
+  vector<std::unique_ptr<rgw::sal::RGWRole>> result;
+  op_ret = driver->get_roles(s, y, path_prefix, s->user->get_tenant(), result);
+
+  if (op_ret == 0) {
+    // NOTE(review): the outer sections use open_array_section where sibling
+    // ops use open_object_section — verify the rendered XML is as intended.
+    s->formatter->open_array_section("ListRolesResponse");
+    s->formatter->open_array_section("ListRolesResult");
+    s->formatter->open_object_section("Roles");
+    for (const auto& it : result) {
+      s->formatter->open_object_section("member");
+      it->dump(s->formatter);
+      s->formatter->close_section();
+    }
+    s->formatter->close_section();
+    s->formatter->close_section();
+    s->formatter->open_object_section("ResponseMetadata");
+    s->formatter->dump_string("RequestId", s->trans_id);
+    s->formatter->close_section();
+    s->formatter->close_section();
+  }
+}
+
+// PutRolePolicy parameters: RoleName, PolicyName and PolicyDocument are all
+// required, and the document must parse as a valid IAM policy.
+int RGWPutRolePolicy::get_params()
+{
+  role_name = s->info.args.get("RoleName");
+  policy_name = s->info.args.get("PolicyName");
+  perm_policy = s->info.args.get("PolicyDocument");
+
+  if (role_name.empty() || policy_name.empty() || perm_policy.empty()) {
+    ldpp_dout(this, 20) << "ERROR: One of role name, policy name or perm policy is empty"<< dendl;
+    return -EINVAL;
+  }
+  // Validate by constructing (and discarding) a Policy object.
+  bufferlist bl = bufferlist::static_from_string(perm_policy);
+  try {
+    const rgw::IAM::Policy p(
+      s->cct, s->user->get_tenant(), bl,
+      s->cct->_conf.get_val<bool>("rgw_policy_reject_invalid_principals"));
+  }
+  catch (rgw::IAM::PolicyParseException& e) {
+    ldpp_dout(this, 20) << "failed to parse policy: " << e.what() << dendl;
+    s->err.message = e.what();
+    return -ERR_MALFORMED_DOC;
+  }
+  return 0;
+}
+
+// Attach (or replace) an inline permission policy on the role, forwarding to
+// the metadata master first on a secondary zone.
+void RGWPutRolePolicy::execute(optional_yield y)
+{
+  op_ret = get_params();
+  if (op_ret < 0) {
+    return;
+  }
+
+  if (!driver->is_meta_master()) {
+    RGWXMLDecoder::XMLParser parser;
+    if (!parser.init()) {
+      ldpp_dout(this, 0) << "ERROR: failed to initialize xml parser" << dendl;
+      op_ret = -EINVAL;
+      return;
+    }
+
+    bufferlist data;
+    s->info.args.remove("RoleName");
+    s->info.args.remove("PolicyName");
+    s->info.args.remove("PolicyDocument");
+    s->info.args.remove("Action");
+    s->info.args.remove("Version");
+
+    // Forward with the user's first access key as the signing credential.
+    RGWUserInfo info = s->user->get_info();
+    const auto& it = info.access_keys.begin();
+    RGWAccessKey key;
+    if (it != info.access_keys.end()) {
+      key.id = it->first;
+      RGWAccessKey cred = it->second;
+      key.key = cred.key;
+    }
+    op_ret = driver->forward_iam_request_to_master(s, key, nullptr, bl_post_body, &parser, s->info, y);
+    if (op_ret < 0) {
+      ldpp_dout(this, 20) << "ERROR: forward_iam_request_to_master failed with error code: " << op_ret << dendl;
+      return;
+    }
+  }
+
+  _role->set_perm_policy(policy_name, perm_policy);
+  op_ret = _role->update(this, y);
+
+  if (op_ret == 0) {
+    s->formatter->open_object_section("PutRolePolicyResponse");
+    s->formatter->open_object_section("ResponseMetadata");
+    s->formatter->dump_string("RequestId", s->trans_id);
+    s->formatter->close_section();
+    s->formatter->close_section();
+  }
+}
+
+// GetRolePolicy parameters: RoleName and PolicyName are both required.
+int RGWGetRolePolicy::get_params()
+{
+  role_name = s->info.args.get("RoleName");
+  policy_name = s->info.args.get("PolicyName");
+
+  if (role_name.empty() || policy_name.empty()) {
+    ldpp_dout(this, 20) << "ERROR: One of role name or policy name is empty"<< dendl;
+    return -EINVAL;
+  }
+  return 0;
+}
+
+// Fetch the named inline policy document from the role (loaded into _role by
+// verify_permission) and dump it as GetRolePolicyResponse XML.
+void RGWGetRolePolicy::execute(optional_yield y)
+{
+  op_ret = get_params();
+  if (op_ret < 0) {
+    return;
+  }
+
+  string perm_policy;
+  op_ret = _role->get_role_policy(this, policy_name, perm_policy);
+  if (op_ret == -ENOENT) {
+    op_ret = -ERR_NO_SUCH_ENTITY;
+  }
+
+  if (op_ret == 0) {
+    s->formatter->open_object_section("GetRolePolicyResponse");
+    s->formatter->open_object_section("ResponseMetadata");
+    s->formatter->dump_string("RequestId", s->trans_id);
+    s->formatter->close_section();
+    s->formatter->open_object_section("GetRolePolicyResult");
+    s->formatter->dump_string("PolicyName", policy_name);
+    s->formatter->dump_string("RoleName", role_name);
+    s->formatter->dump_string("PolicyDocument", perm_policy);
+    s->formatter->close_section();
+    s->formatter->close_section();
+  }
+}
+
+// ListRolePolicies parameters: only RoleName, which must be non-empty.
+int RGWListRolePolicies::get_params()
+{
+  role_name = s->info.args.get("RoleName");
+
+  if (role_name.empty()) {
+    ldpp_dout(this, 20) << "ERROR: Role name is empty"<< dendl;
+    return -EINVAL;
+  }
+  return 0;
+}
+
+// List the names of the role's inline policies as ListRolePoliciesResponse.
+void RGWListRolePolicies::execute(optional_yield y)
+{
+  op_ret = get_params();
+  if (op_ret < 0) {
+    return;
+  }
+
+  std::vector<string> policy_names = _role->get_role_policy_names();
+  s->formatter->open_object_section("ListRolePoliciesResponse");
+  s->formatter->open_object_section("ResponseMetadata");
+  s->formatter->dump_string("RequestId", s->trans_id);
+  s->formatter->close_section();
+  s->formatter->open_object_section("ListRolePoliciesResult");
+  s->formatter->open_array_section("PolicyNames");
+  for (const auto& it : policy_names) {
+    s->formatter->dump_string("member", it);
+  }
+  s->formatter->close_section();
+  s->formatter->close_section();
+  s->formatter->close_section();
+}
+
+// DeleteRolePolicy parameters: RoleName and PolicyName are both required.
+int RGWDeleteRolePolicy::get_params()
+{
+  role_name = s->info.args.get("RoleName");
+  policy_name = s->info.args.get("PolicyName");
+
+  if (role_name.empty() || policy_name.empty()) {
+    ldpp_dout(this, 20) << "ERROR: One of role name or policy name is empty"<< dendl;
+    return -EINVAL;
+  }
+  return 0;
+}
+
+// Delete an inline policy from the role, forwarding to the metadata master
+// first on a secondary zone; the role is persisted only if delete succeeded.
+void RGWDeleteRolePolicy::execute(optional_yield y)
+{
+  op_ret = get_params();
+  if (op_ret < 0) {
+    return;
+  }
+
+  if (!driver->is_meta_master()) {
+    RGWXMLDecoder::XMLParser parser;
+    if (!parser.init()) {
+      ldpp_dout(this, 0) << "ERROR: failed to initialize xml parser" << dendl;
+      op_ret = -EINVAL;
+      return;
+    }
+
+    bufferlist data;
+    s->info.args.remove("RoleName");
+    s->info.args.remove("PolicyName");
+    s->info.args.remove("Action");
+    s->info.args.remove("Version");
+
+    // Forward with the user's first access key as the signing credential.
+    RGWUserInfo info = s->user->get_info();
+    const auto& it = info.access_keys.begin();
+    RGWAccessKey key;
+    if (it != info.access_keys.end()) {
+      key.id = it->first;
+      RGWAccessKey cred = it->second;
+      key.key = cred.key;
+    }
+    op_ret = driver->forward_iam_request_to_master(s, key, nullptr, bl_post_body, &parser, s->info, y);
+    if (op_ret < 0) {
+      ldpp_dout(this, 20) << "ERROR: forward_iam_request_to_master failed with error code: " << op_ret << dendl;
+      return;
+    }
+  }
+
+  op_ret = _role->delete_policy(this, policy_name);
+  if (op_ret == -ENOENT) {
+    op_ret = -ERR_NO_ROLE_FOUND;
+    return;
+  }
+
+  if (op_ret == 0) {
+    op_ret = _role->update(this, y);
+  }
+
+  // NOTE(review): response emitted regardless of op_ret, unlike most siblings.
+  s->formatter->open_object_section("DeleteRolePoliciesResponse");
+  s->formatter->open_object_section("ResponseMetadata");
+  s->formatter->dump_string("RequestId", s->trans_id);
+  s->formatter->close_section();
+  s->formatter->close_section();
+}
+
+// TagRole parameters: RoleName plus the Tags.member.* pairs (via parse_tags).
+int RGWTagRole::get_params()
+{
+  role_name = s->info.args.get("RoleName");
+
+  if (role_name.empty()) {
+    ldout(s->cct, 0) << "ERROR: Role name is empty" << dendl;
+    return -EINVAL;
+  }
+  int ret = parse_tags();
+  if (ret < 0) {
+    return ret;
+  }
+
+  return 0;
+}
+
+// Apply the request's tags to the role, forwarding to the metadata master
+// first on a secondary zone.
+void RGWTagRole::execute(optional_yield y)
+{
+  op_ret = get_params();
+  if (op_ret < 0) {
+    return;
+  }
+
+  if (!driver->is_meta_master()) {
+    RGWXMLDecoder::XMLParser parser;
+    if (!parser.init()) {
+      ldpp_dout(this, 0) << "ERROR: failed to initialize xml parser" << dendl;
+      op_ret = -EINVAL;
+      return;
+    }
+
+    bufferlist data;
+    s->info.args.remove("RoleName");
+    s->info.args.remove("Action");
+    s->info.args.remove("Version");
+    auto& val_map = s->info.args.get_params();
+    // FIX: erase() invalidates the erased iterator; the previous loop kept
+    // advancing it afterwards (undefined behavior). Use the iterator
+    // returned by erase() instead.
+    for (auto it = val_map.begin(); it != val_map.end(); ) {
+      if (it->first.find("Tags.member.") == 0) {
+        it = val_map.erase(it);
+      } else {
+        ++it;
+      }
+    }
+
+    RGWUserInfo info = s->user->get_info();
+    const auto& it = info.access_keys.begin();
+    RGWAccessKey key;
+    if (it != info.access_keys.end()) {
+      key.id = it->first;
+      RGWAccessKey cred = it->second;
+      key.key = cred.key;
+    }
+    op_ret = driver->forward_iam_request_to_master(s, key, nullptr, bl_post_body, &parser, s->info, y);
+    if (op_ret < 0) {
+      ldpp_dout(this, 20) << "ERROR: forward_iam_request_to_master failed with error code: " << op_ret << dendl;
+      return;
+    }
+  }
+
+  op_ret = _role->set_tags(this, tags);
+  if (op_ret == 0) {
+    op_ret = _role->update(this, y);
+  }
+
+  if (op_ret == 0) {
+    s->formatter->open_object_section("TagRoleResponse");
+    s->formatter->open_object_section("ResponseMetadata");
+    s->formatter->dump_string("RequestId", s->trans_id);
+    s->formatter->close_section();
+    s->formatter->close_section();
+  }
+}
+
+// ListRoleTags parameters: only RoleName, which must be non-empty.
+int RGWListRoleTags::get_params()
+{
+  role_name = s->info.args.get("RoleName");
+
+  if (role_name.empty()) {
+    ldout(s->cct, 0) << "ERROR: Role name is empty" << dendl;
+    return -EINVAL;
+  }
+
+  return 0;
+}
+
+// Dump the role's tags (if any) as ListRoleTagsResponse XML.
+void RGWListRoleTags::execute(optional_yield y)
+{
+  op_ret = get_params();
+  if (op_ret < 0) {
+    return;
+  }
+
+  boost::optional<multimap<string,string>> tag_map = _role->get_tags();
+  s->formatter->open_object_section("ListRoleTagsResponse");
+  s->formatter->open_object_section("ListRoleTagsResult");
+  if (tag_map) {
+    s->formatter->open_array_section("Tags");
+    for (const auto& it : tag_map.get()) {
+      s->formatter->open_object_section("Key");
+      encode_json("Key", it.first, s->formatter);
+      s->formatter->close_section();
+      s->formatter->open_object_section("Value");
+      encode_json("Value", it.second, s->formatter);
+      s->formatter->close_section();
+    }
+    s->formatter->close_section();
+  }
+  s->formatter->close_section();
+  s->formatter->open_object_section("ResponseMetadata");
+  s->formatter->dump_string("RequestId", s->trans_id);
+  s->formatter->close_section();
+  s->formatter->close_section();
+}
+
+// UntagRole parameters: RoleName plus the TagKeys.member.* list of tag keys
+// to remove.
+int RGWUntagRole::get_params()
+{
+  role_name = s->info.args.get("RoleName");
+
+  if (role_name.empty()) {
+    ldout(s->cct, 0) << "ERROR: Role name is empty" << dendl;
+    return -EINVAL;
+  }
+
+  auto val_map = s->info.args.get_params();
+  for (auto& it : val_map) {
+    // NOTE(review): this is a substring match (!= npos) while sibling ops use
+    // a prefix match (find(...) == 0) — confirm the looser match is intended.
+    if (it.first.find("TagKeys.member.") != string::npos) {
+      tagKeys.emplace_back(it.second);
+    }
+  }
+  return 0;
+}
+
+// Remove the requested tag keys from the role, forwarding to the metadata
+// master first on a secondary zone.
+void RGWUntagRole::execute(optional_yield y)
+{
+  op_ret = get_params();
+  if (op_ret < 0) {
+    return;
+  }
+
+  if (!driver->is_meta_master()) {
+    RGWXMLDecoder::XMLParser parser;
+    if (!parser.init()) {
+      ldpp_dout(this, 0) << "ERROR: failed to initialize xml parser" << dendl;
+      op_ret = -EINVAL;
+      return;
+    }
+
+    bufferlist data;
+    s->info.args.remove("RoleName");
+    s->info.args.remove("Action");
+    s->info.args.remove("Version");
+    auto& val_map = s->info.args.get_params();
+    // Collect the iterators first, then erase; multimap::erase only
+    // invalidates the erased iterator, so the collected ones stay valid.
+    std::vector<std::multimap<std::string, std::string>::iterator> iters;
+    for (auto it = val_map.begin(); it!= val_map.end(); it++) {
+      if (it->first.find("Tags.member.") == 0) {
+        iters.emplace_back(it);
+      }
+    }
+
+    for (auto& it : iters) {
+      val_map.erase(it);
+    }
+    RGWUserInfo info = s->user->get_info();
+    const auto& it = info.access_keys.begin();
+    RGWAccessKey key;
+    if (it != info.access_keys.end()) {
+      key.id = it->first;
+      RGWAccessKey cred = it->second;
+      key.key = cred.key;
+    }
+    op_ret = driver->forward_iam_request_to_master(s, key, nullptr, bl_post_body, &parser, s->info, y);
+    if (op_ret < 0) {
+      ldpp_dout(this, 20) << "ERROR: forward_iam_request_to_master failed with error code: " << op_ret << dendl;
+      return;
+    }
+  }
+
+  _role->erase_tags(tagKeys);
+  op_ret = _role->update(this, y);
+
+  if (op_ret == 0) {
+    s->formatter->open_object_section("UntagRoleResponse");
+    s->formatter->open_object_section("ResponseMetadata");
+    s->formatter->dump_string("RequestId", s->trans_id);
+    s->formatter->close_section();
+    s->formatter->close_section();
+  }
+}
+
+// UpdateRole parameters: RoleName is required; MaxSessionDuration is the
+// (optional) value to apply, validated later in execute().
+int RGWUpdateRole::get_params()
+{
+  role_name = s->info.args.get("RoleName");
+  max_session_duration = s->info.args.get("MaxSessionDuration");
+
+  if (role_name.empty()) {
+    ldpp_dout(this, 20) << "ERROR: Role name is empty"<< dendl;
+    return -EINVAL;
+  }
+
+  return 0;
+}
+
+// Update the role's max session duration, forwarding to the metadata master
+// first on a secondary zone.
+void RGWUpdateRole::execute(optional_yield y)
+{
+  op_ret = get_params();
+  if (op_ret < 0) {
+    return;
+  }
+
+  if (!driver->is_meta_master()) {
+    RGWXMLDecoder::XMLParser parser;
+    if (!parser.init()) {
+      ldpp_dout(this, 0) << "ERROR: failed to initialize xml parser" << dendl;
+      op_ret = -EINVAL;
+      return;
+    }
+
+    bufferlist data;
+    s->info.args.remove("RoleName");
+    s->info.args.remove("MaxSessionDuration");
+    s->info.args.remove("Action");
+    s->info.args.remove("Version");
+
+    RGWUserInfo info = s->user->get_info();
+    const auto& it = info.access_keys.begin();
+    RGWAccessKey key;
+    if (it != info.access_keys.end()) {
+      key.id = it->first;
+      RGWAccessKey cred = it->second;
+      key.key = cred.key;
+    }
+    op_ret = driver->forward_iam_request_to_master(s, key, nullptr, bl_post_body, &parser, s->info, y);
+    if (op_ret < 0) {
+      ldpp_dout(this, 20) << "ERROR: forward_iam_request_to_master failed with error code: " << op_ret << dendl;
+      return;
+    }
+  }
+
+  // NOTE(review): validation happens after the request was already forwarded
+  // to the master — an invalid duration is rejected locally only.
+  if (!_role->validate_max_session_duration(this)) {
+    op_ret = -EINVAL;
+    return;
+  }
+
+  _role->update_max_session_duration(max_session_duration);
+  op_ret = _role->update(this, y);
+
+  // NOTE(review): three sections are opened but only two closed
+  // (UpdateRoleResponse is never closed) — confirm the formatter flush
+  // balances this, or a close_section is missing.
+  s->formatter->open_object_section("UpdateRoleResponse");
+  s->formatter->open_object_section("UpdateRoleResult");
+  s->formatter->open_object_section("ResponseMetadata");
+  s->formatter->dump_string("RequestId", s->trans_id);
+  s->formatter->close_section();
+  s->formatter->close_section();
+}
diff --git a/src/rgw/rgw_rest_role.h b/src/rgw/rgw_rest_role.h
new file mode 100644
index 000000000..98a08833b
--- /dev/null
+++ b/src/rgw/rgw_rest_role.h
@@ -0,0 +1,181 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "common/async/yield_context.h"
+
+#include "rgw_role.h"
+#include "rgw_rest.h"
+
+// Base class for all IAM role REST ops. Holds the parsed request parameters
+// and, after verify_permission(), the loaded role in _role. get_op() maps the
+// op to its IAM action id for policy evaluation.
+class RGWRestRole : public RGWRESTOp {
+protected:
+  std::string role_name;
+  std::string role_path;
+  std::string trust_policy;
+  std::string policy_name;
+  std::string perm_policy;
+  std::string path_prefix;
+  std::string max_session_duration;
+  std::multimap<std::string,std::string> tags;
+  std::vector<std::string> tagKeys;
+  std::unique_ptr<rgw::sal::RGWRole> _role;
+  int verify_permission(optional_yield y) override;
+  void send_response() override;
+  virtual uint64_t get_op() = 0;
+  int parse_tags();
+};
+
+// Marker base: ops requiring the "roles=read" admin cap.
+class RGWRoleRead : public RGWRestRole {
+public:
+  RGWRoleRead() = default;
+  int check_caps(const RGWUserCaps& caps) override;
+};
+
+// Marker base: ops requiring the "roles=write" admin cap.
+class RGWRoleWrite : public RGWRestRole {
+public:
+  RGWRoleWrite() = default;
+  int check_caps(const RGWUserCaps& caps) override;
+};
+
+// iam:CreateRole. Keeps the original POST body for forwarding to the master.
+class RGWCreateRole : public RGWRoleWrite {
+  bufferlist bl_post_body;
+public:
+  RGWCreateRole(const bufferlist& bl_post_body) : bl_post_body(bl_post_body) {};
+  int verify_permission(optional_yield y) override;
+  void execute(optional_yield y) override;
+  int get_params();
+  const char* name() const override { return "create_role"; }
+  RGWOpType get_type() override { return RGW_OP_CREATE_ROLE; }
+  uint64_t get_op() override { return rgw::IAM::iamCreateRole; }
+};
+
+// iam:DeleteRole.
+class RGWDeleteRole : public RGWRoleWrite {
+  bufferlist bl_post_body;
+public:
+  RGWDeleteRole(const bufferlist& bl_post_body) : bl_post_body(bl_post_body) {};
+  void execute(optional_yield y) override;
+  int get_params();
+  const char* name() const override { return "delete_role"; }
+  RGWOpType get_type() override { return RGW_OP_DELETE_ROLE; }
+  uint64_t get_op() override { return rgw::IAM::iamDeleteRole; }
+};
+
+// iam:GetRole. Permission is checked in execute() via _verify_permission
+// because the role must be loaded before its ARN can be built.
+class RGWGetRole : public RGWRoleRead {
+  int _verify_permission(const rgw::sal::RGWRole* role);
+public:
+  RGWGetRole() = default;
+  int verify_permission(optional_yield y) override;
+  void execute(optional_yield y) override;
+  int get_params();
+  const char* name() const override { return "get_role"; }
+  RGWOpType get_type() override { return RGW_OP_GET_ROLE; }
+  uint64_t get_op() override { return rgw::IAM::iamGetRole; }
+};
+
+// iam:UpdateAssumeRolePolicy.
+class RGWModifyRoleTrustPolicy : public RGWRoleWrite {
+  bufferlist bl_post_body;
+public:
+  RGWModifyRoleTrustPolicy(const bufferlist& bl_post_body) : bl_post_body(bl_post_body) {};
+  void execute(optional_yield y) override;
+  int get_params();
+  const char* name() const override { return "modify_role_trust_policy"; }
+  RGWOpType get_type() override { return RGW_OP_MODIFY_ROLE_TRUST_POLICY; }
+  uint64_t get_op() override { return rgw::IAM::iamModifyRoleTrustPolicy; }
+};
+
+// iam:ListRoles.
+class RGWListRoles : public RGWRoleRead {
+public:
+  RGWListRoles() = default;
+  int verify_permission(optional_yield y) override;
+  void execute(optional_yield y) override;
+  int get_params();
+  const char* name() const override { return "list_roles"; }
+  RGWOpType get_type() override { return RGW_OP_LIST_ROLES; }
+  uint64_t get_op() override { return rgw::IAM::iamListRoles; }
+};
+
+// iam:PutRolePolicy.
+class RGWPutRolePolicy : public RGWRoleWrite {
+  bufferlist bl_post_body;
+public:
+  RGWPutRolePolicy(const bufferlist& bl_post_body) : bl_post_body(bl_post_body) {};
+  void execute(optional_yield y) override;
+  int get_params();
+  const char* name() const override { return "put_role_policy"; }
+  RGWOpType get_type() override { return RGW_OP_PUT_ROLE_POLICY; }
+  uint64_t get_op() override { return rgw::IAM::iamPutRolePolicy; }
+};
+
+// iam:GetRolePolicy.
+class RGWGetRolePolicy : public RGWRoleRead {
+public:
+  RGWGetRolePolicy() = default;
+  void execute(optional_yield y) override;
+  int get_params();
+  const char* name() const override { return "get_role_policy"; }
+  RGWOpType get_type() override { return RGW_OP_GET_ROLE_POLICY; }
+  uint64_t get_op() override { return rgw::IAM::iamGetRolePolicy; }
+};
+
+// iam:ListRolePolicies.
+class RGWListRolePolicies : public RGWRoleRead {
+public:
+  RGWListRolePolicies() = default;
+  void execute(optional_yield y) override;
+  int get_params();
+  const char* name() const override { return "list_role_policies"; }
+  RGWOpType get_type() override { return RGW_OP_LIST_ROLE_POLICIES; }
+  uint64_t get_op() override { return rgw::IAM::iamListRolePolicies; }
+};
+
+// iam:DeleteRolePolicy.
+class RGWDeleteRolePolicy : public RGWRoleWrite {
+  bufferlist bl_post_body;
+public:
+  RGWDeleteRolePolicy(const bufferlist& bl_post_body) : bl_post_body(bl_post_body) {};
+  void execute(optional_yield y) override;
+  int get_params();
+  const char* name() const override { return "delete_role_policy"; }
+  RGWOpType get_type() override { return RGW_OP_DELETE_ROLE_POLICY; }
+  uint64_t get_op() override { return rgw::IAM::iamDeleteRolePolicy; }
+};
+
+// iam:TagRole.
+class RGWTagRole : public RGWRoleWrite {
+  bufferlist bl_post_body;
+public:
+  RGWTagRole(const bufferlist& bl_post_body) : bl_post_body(bl_post_body) {};
+  void execute(optional_yield y) override;
+  int get_params();
+  const char* name() const override { return "tag_role"; }
+  RGWOpType get_type() override { return RGW_OP_TAG_ROLE; }
+  uint64_t get_op() override { return rgw::IAM::iamTagRole; }
+};
+
+// iam:ListRoleTags.
+class RGWListRoleTags : public RGWRoleRead {
+public:
+  RGWListRoleTags() = default;
+  void execute(optional_yield y) override;
+  int get_params();
+  const char* name() const override { return "list_role_tags"; }
+  RGWOpType get_type() override { return RGW_OP_LIST_ROLE_TAGS; }
+  uint64_t get_op() override { return rgw::IAM::iamListRoleTags; }
+};
+
+// iam:UntagRole.
+class RGWUntagRole : public RGWRoleWrite {
+  bufferlist bl_post_body;
+public:
+  RGWUntagRole(const bufferlist& bl_post_body) : bl_post_body(bl_post_body) {};
+  void execute(optional_yield y) override;
+  int get_params();
+  const char* name() const override { return "untag_role"; }
+  RGWOpType get_type() override { return RGW_OP_UNTAG_ROLE; }
+  uint64_t get_op() override { return rgw::IAM::iamUntagRole; }
+};
+
+// iam:UpdateRole (max session duration only).
+class RGWUpdateRole : public RGWRoleWrite {
+  bufferlist bl_post_body;
+public:
+  RGWUpdateRole(const bufferlist& bl_post_body) : bl_post_body(bl_post_body) {};
+  void execute(optional_yield y) override;
+  int get_params();
+  const char* name() const override { return "update_role"; }
+  RGWOpType get_type() override { return RGW_OP_UPDATE_ROLE; }
+  uint64_t get_op() override { return rgw::IAM::iamUpdateRole; }
+}; \ No newline at end of file
diff --git a/src/rgw/rgw_rest_s3.cc b/src/rgw/rgw_rest_s3.cc
new file mode 100644
index 000000000..0b997f30b
--- /dev/null
+++ b/src/rgw/rgw_rest_s3.cc
@@ -0,0 +1,6477 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <errno.h>
+#include <array>
+#include <string.h>
+#include <string_view>
+
+#include "common/ceph_crypto.h"
+#include "common/split.h"
+#include "common/Formatter.h"
+#include "common/utf8.h"
+#include "common/ceph_json.h"
+#include "common/safe_io.h"
+#include "common/errno.h"
+#include "auth/Crypto.h"
+#include <boost/algorithm/string.hpp>
+#include <boost/algorithm/string/replace.hpp>
+#include <boost/algorithm/string/predicate.hpp>
+#include <boost/tokenizer.hpp>
+#define BOOST_BIND_GLOBAL_PLACEHOLDERS
+#ifdef HAVE_WARN_IMPLICIT_CONST_INT_FLOAT_CONVERSION
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wimplicit-const-int-float-conversion"
+#endif
+#ifdef HAVE_WARN_IMPLICIT_CONST_INT_FLOAT_CONVERSION
+#pragma clang diagnostic pop
+#endif
+#undef BOOST_BIND_GLOBAL_PLACEHOLDERS
+
+#include <liboath/oath.h>
+
+#include "rgw_rest.h"
+#include "rgw_rest_s3.h"
+#include "rgw_rest_s3website.h"
+#include "rgw_rest_pubsub.h"
+#include "rgw_auth_s3.h"
+#include "rgw_acl.h"
+#include "rgw_policy_s3.h"
+#include "rgw_user.h"
+#include "rgw_cors.h"
+#include "rgw_cors_s3.h"
+#include "rgw_tag_s3.h"
+
+#include "rgw_client_io.h"
+
+#include "rgw_keystone.h"
+#include "rgw_auth_keystone.h"
+#include "rgw_auth_registry.h"
+
+#include "rgw_es_query.h"
+
+#include <typeinfo> // for 'typeid'
+
+#include "rgw_ldap.h"
+#include "rgw_token.h"
+#include "rgw_rest_role.h"
+#include "rgw_crypt.h"
+#include "rgw_crypt_sanitize.h"
+#include "rgw_rest_user_policy.h"
+#include "rgw_zone.h"
+#include "rgw_bucket_sync.h"
+
+#include "include/ceph_assert.h"
+#include "rgw_role.h"
+#include "rgw_rest_sts.h"
+#include "rgw_rest_iam.h"
+#include "rgw_sts.h"
+#include "rgw_sal_rados.h"
+
+#include "rgw_s3select.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+using namespace rgw;
+using namespace ceph::crypto;
+
// Open the XML root element of a ListAllMyBuckets (GET Service) response.
void list_all_buckets_start(req_state *s)
{
  s->formatter->open_array_section_in_ns("ListAllMyBucketsResult", XMLNS_AWS_S3);
}
+
// Close the XML root element opened by list_all_buckets_start().
void list_all_buckets_end(req_state *s)
{
  s->formatter->close_section();
}
+
// Emit one <Bucket> entry (name + creation date) of a service listing.
void dump_bucket(req_state *s, rgw::sal::Bucket& obj)
{
  s->formatter->open_object_section("Bucket");
  s->formatter->dump_string("Name", obj.get_name());
  dump_time(s, "CreationDate", obj.get_creation_time());
  s->formatter->close_section();
}
+
+void rgw_get_errno_s3(rgw_http_error *e , int err_no)
+{
+ rgw_http_errors::const_iterator r = rgw_http_s3_errors.find(err_no);
+
+ if (r != rgw_http_s3_errors.end()) {
+ e->http_ret = r->second.first;
+ e->s3_code = r->second.second;
+ } else {
+ e->http_ret = 500;
+ e->s3_code = "UnknownError";
+ }
+}
+
// Build the x-amz-expiration header value for this object from the
// bucket's lifecycle configuration, the object's tag set and its mtime.
static inline std::string get_s3_expiration_header(
  req_state* s,
  const ceph::real_time& mtime)
{
  return rgw::lc::s3_expiration_header(
    s, s->object->get_key(), s->tagset, mtime, s->bucket_attrs);
}
+
// Look up the lifecycle abort-incomplete-multipart rule applying to this
// upload; on a match fills `date` and `rule_id` and returns true.
static inline bool get_s3_multipart_abort_header(
  req_state* s, const ceph::real_time& mtime,
  ceph::real_time& date, std::string& rule_id)
{
  return rgw::lc::s3_multipart_abort_header(
    s, s->object->get_key(), mtime, s->bucket_attrs, date, rule_id);
}
+
/* Mapping of the S3 GetObject "response-*" query parameters to the HTTP
 * response headers they override (sentinel-terminated array). */
struct response_attr_param {
  const char *param;     // query-string parameter name
  const char *http_attr; // response header it overrides
};

static struct response_attr_param resp_attr_params[] = {
  {"response-content-type", "Content-Type"},
  {"response-content-language", "Content-Language"},
  {"response-expires", "Expires"},
  {"response-cache-control", "Cache-Control"},
  {"response-content-disposition", "Content-Disposition"},
  {"response-content-encoding", "Content-Encoding"},
  {NULL, NULL},
};

/* Bit flags classifying which SSE family a request header belongs to;
 * used by get_encryption_defaults() to reject mixed SSE-C / SSE-KMS use. */
#define SSE_C_GROUP 1
#define KMS_GROUP 2
+
/*
 * Apply the bucket's default encryption configuration
 * (RGW_ATTR_BUCKET_ENCRYPTION_POLICY) to this request's crypt attribute
 * map when the client did not supply its own SSE headers.
 *
 * Rules enforced here:
 *  - SSE-C headers and SSE-KMS/SSE-S3 headers are mutually exclusive;
 *  - an explicit SSE-C request disables all bucket defaults;
 *  - kms key id / bucket-key / context headers are only valid together
 *    with an explicit x-amz-server-side-encryption header ("rest_only");
 *  - if any kms-related attribute ends up set, the algorithm must
 *    resolve to "aws:kms".
 *
 * Returns 0 on success, -EINVAL (with s->err.message set) on conflicts.
 */
int get_encryption_defaults(req_state *s)
{
  int meta_sse_group = 0;
  constexpr auto sse_c_prefix = "x-amz-server-side-encryption-customer-";
  constexpr auto encrypt_attr = "x-amz-server-side-encryption";
  constexpr auto context_attr = "x-amz-server-side-encryption-context";
  constexpr auto kms_attr = "x-amz-server-side-encryption-aws-kms-key-id";
  constexpr auto bucket_key_attr = "x-amz-server-side-encryption-bucket-key-enabled";
  bool bucket_configuration_found { false };
  bool rest_only { false };

  // classify which SSE family the client's headers belong to
  for (auto& kv : s->info.crypt_attribute_map) {
    if (kv.first.find(sse_c_prefix) == 0)
      meta_sse_group |= SSE_C_GROUP;
    else if (kv.first.find(encrypt_attr) == 0)
      meta_sse_group |= KMS_GROUP;
  }
  if (meta_sse_group == (SSE_C_GROUP|KMS_GROUP)) {
    s->err.message = "Server side error - can't do sse-c & sse-kms|sse-s3";
    return -EINVAL;
  }

  // load the bucket's default-encryption policy, if one is stored
  const auto& buck_attrs = s->bucket_attrs;
  auto aiter = buck_attrs.find(RGW_ATTR_BUCKET_ENCRYPTION_POLICY);
  RGWBucketEncryptionConfig bucket_encryption_conf;
  if (aiter != buck_attrs.end()) {
    ldpp_dout(s, 5) << "Found RGW_ATTR_BUCKET_ENCRYPTION_POLICY on "
		    << s->bucket_name << dendl;

    bufferlist::const_iterator iter{&aiter->second};

    try {
      bucket_encryption_conf.decode(iter);
      bucket_configuration_found = true;
    } catch (const buffer::error& e) {
      s->err.message = "Server side error - can't decode bucket_encryption_conf";
      ldpp_dout(s, 5) << __func__ <<  "decode bucket_encryption_conf failed" << dendl;
      return -EINVAL;
    }
  }
  if (meta_sse_group & SSE_C_GROUP) {
    ldpp_dout(s, 20) << "get_encryption_defaults: no defaults cause sse-c forced"
		     << dendl;
    return 0;		// sse-c: no defaults here
  }
  std::string sse_algorithm { bucket_encryption_conf.sse_algorithm() };
  auto kms_master_key_id { bucket_encryption_conf.kms_master_key_id() };
  bool bucket_key_enabled { bucket_encryption_conf.bucket_key_enabled() };
  bool kms_attr_seen = false;
  if (bucket_configuration_found) {
    ldpp_dout(s, 5) << "RGW_ATTR_BUCKET_ENCRYPTION ALGO: "
		    << sse_algorithm << dendl;
  }

  // explicit algorithm header wins over the bucket default
  auto iter = s->info.crypt_attribute_map.find(encrypt_attr);
  if (iter != s->info.crypt_attribute_map.end()) {
ldpp_dout(s, 20) << "get_encryption_defaults: found encrypt_attr " << encrypt_attr << " = " << iter->second << ", setting sse_algorithm to that" << dendl;
    rest_only = true;
    sse_algorithm = iter->second;
  } else if (sse_algorithm != "") {
    rgw_set_amz_meta_header(s->info.crypt_attribute_map, encrypt_attr, sse_algorithm, OVERWRITE);
  }

  // kms key id: explicit header requires rest_only; otherwise inherit default
  iter = s->info.crypt_attribute_map.find(kms_attr);
  if (iter != s->info.crypt_attribute_map.end()) {
ldpp_dout(s, 20) << "get_encryption_defaults: found kms_attr " << kms_attr << " = " << iter->second << ", setting kms_attr_seen" << dendl;
    if (!rest_only) {
      s->err.message = std::string("incomplete rest sse parms: ") + kms_attr + " not valid without kms";
      ldpp_dout(s, 5) << __func__ << "argument problem: " << s->err.message << dendl;
      return -EINVAL;
    }
    kms_attr_seen = true;
  } else if (!rest_only && kms_master_key_id != "") {
ldpp_dout(s, 20) << "get_encryption_defaults: no kms_attr, but kms_master_key_id = " << kms_master_key_id << ", settig kms_attr_seen" << dendl;
    kms_attr_seen = true;
    rgw_set_amz_meta_header(s->info.crypt_attribute_map, kms_attr, kms_master_key_id, OVERWRITE);
  }

  // bucket-key flag: same rest_only constraint as the key id
  iter = s->info.crypt_attribute_map.find(bucket_key_attr);
  if (iter != s->info.crypt_attribute_map.end()) {
ldpp_dout(s, 20) << "get_encryption_defaults: found bucket_key_attr " << bucket_key_attr << " = " << iter->second << ", setting kms_attr_seen" << dendl;
    if (!rest_only) {
      s->err.message = std::string("incomplete rest sse parms: ") + bucket_key_attr + " not valid without kms";
      ldpp_dout(s, 5) << __func__ << "argument problem: " << s->err.message << dendl;
      return -EINVAL;
    }
    kms_attr_seen = true;
  } else if (!rest_only && bucket_key_enabled) {
ldpp_dout(s, 20) << "get_encryption_defaults: no bucket_key_attr, but bucket_key_enabled,  setting kms_attr_seen" << dendl;
    kms_attr_seen = true;
    rgw_set_amz_meta_header(s->info.crypt_attribute_map, bucket_key_attr, "true", OVERWRITE);
  }

  // encryption context has no bucket-level default; header only
  iter = s->info.crypt_attribute_map.find(context_attr);
  if (iter != s->info.crypt_attribute_map.end()) {
ldpp_dout(s, 20) << "get_encryption_defaults: found context_attr " << context_attr << " = " << iter->second << ", setting kms_attr_seen" << dendl;
    if (!rest_only) {
      s->err.message = std::string("incomplete rest sse parms: ") + context_attr + " not valid without kms";
      ldpp_dout(s, 5) << __func__ << "argument problem: " << s->err.message << dendl;
      return -EINVAL;
    }
    kms_attr_seen = true;
  }

  // any kms attribute implies the aws:kms algorithm
  if (kms_attr_seen && sse_algorithm == "") {
ldpp_dout(s, 20) << "get_encryption_defaults: kms_attr but no algorithm, defaulting to aws_kms" << dendl;
    sse_algorithm = "aws:kms";
  }
for (const auto& kv: s->info.crypt_attribute_map) {
ldpp_dout(s, 20) << "get_encryption_defaults:  final map: " << kv.first << " = " << kv.second << dendl;
}
ldpp_dout(s, 20) << "get_encryption_defaults:  kms_attr_seen is " << kms_attr_seen << " and sse_algorithm is " << sse_algorithm << dendl;
  if (kms_attr_seen && sse_algorithm != "aws:kms") {
    s->err.message = "algorithm <" + sse_algorithm + "> but got sse-kms attributes";
    return -EINVAL;
  }

  return 0;
}
+
// Website GET: if the object carries an x-amz-website-redirect-location
// attribute, answer with a 301 redirect instead of the object body;
// otherwise fall through to the regular S3 response path.
int RGWGetObj_ObjStore_S3Website::send_response_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) {
  map<string, bufferlist>::iterator iter;
  iter = attrs.find(RGW_ATTR_AMZ_WEBSITE_REDIRECT_LOCATION);
  if (iter != attrs.end()) {
    bufferlist &bl = iter->second;  // shadows the data buffer; redirect target
    s->redirect = bl.c_str();
    s->err.http_ret = 301;
    ldpp_dout(this, 20) << __CEPH_ASSERT_FUNCTION << " redirecting per x-amz-website-redirect-location=" << s->redirect << dendl;
    op_ret = -ERR_WEBSITE_REDIRECT;
    set_req_state_err(s, op_ret);
    dump_errno(s);
    dump_content_length(s, 0);
    dump_redirect(s, s->redirect);
    end_header(s, this);
    return op_ret;
  } else {
    return RGWGetObj_ObjStore_S3::send_response_data(bl, bl_ofs, bl_len);
  }
}
+
// Error path for website GETs: delegate to the plain S3 handler.
int RGWGetObj_ObjStore_S3Website::send_response_data_error(optional_yield y)
{
  return RGWGetObj_ObjStore_S3::send_response_data_error(y);
}
+
+int RGWGetObj_ObjStore_S3::get_params(optional_yield y)
+{
+ // for multisite sync requests, only read the slo manifest itself, rather than
+ // all of the data from its parts. the parts will sync as separate objects
+ skip_manifest = s->info.args.exists(RGW_SYS_PARAM_PREFIX "sync-manifest");
+
+ // multisite sync requests should fetch encrypted data, along with the
+ // attributes needed to support decryption on the other zone
+ if (s->system_request) {
+ skip_decrypt = s->info.args.exists(RGW_SYS_PARAM_PREFIX "skip-decrypt");
+ }
+
+ // multisite sync requests should fetch cloudtiered objects
+ sync_cloudtiered = s->info.args.exists(RGW_SYS_PARAM_PREFIX "sync-cloudtiered");
+
+ dst_zone_trace = s->info.args.get(RGW_SYS_PARAM_PREFIX "if-not-replicated-to");
+
+ return RGWGetObj_ObjStore::get_params(y);
+}
+
+int RGWGetObj_ObjStore_S3::send_response_data_error(optional_yield y)
+{
+ bufferlist bl;
+ return send_response_data(bl, 0 , 0);
+}
+
+template <class T>
+int decode_attr_bl_single_value(map<string, bufferlist>& attrs, const char *attr_name, T *result, T def_val)
+{
+ map<string, bufferlist>::iterator iter = attrs.find(attr_name);
+ if (iter == attrs.end()) {
+ *result = def_val;
+ return 0;
+ }
+ bufferlist& bl = iter->second;
+ if (bl.length() == 0) {
+ *result = def_val;
+ return 0;
+ }
+ auto bliter = bl.cbegin();
+ try {
+ decode(*result, bliter);
+ } catch (buffer::error& err) {
+ return -EIO;
+ }
+ return 0;
+}
+
// Return true when the string contains any control character (per
// iscntrl()); used to reject CRLF/header injection in response params.
// Fixes two issues in the original: the parameter was taken by value
// (copying the whole string on every call), and chars were passed to
// iscntrl() unconverted — feeding a negative char is undefined behavior.
inline bool str_has_cntrl(const std::string& s) {
  return std::any_of(s.begin(), s.end(),
                     [](unsigned char c) { return std::iscntrl(c) != 0; });
}
+
// C-string overload: scan the bytes in place. The original materialized
// a temporary std::string (a heap allocation for long inputs) just to
// run the scan; this walks the NUL-terminated buffer directly and casts
// to unsigned char before iscntrl() to avoid UB on negative chars.
inline bool str_has_cntrl(const char* s) {
  for (; *s != '\0'; ++s) {
    if (std::iscntrl(static_cast<unsigned char>(*s))) {
      return true;
    }
  }
  return false;
}
+
/*
 * Emit the S3 GetObject response. On the first call this writes the
 * status line and all headers (range, system-request metadata, crypt
 * headers, etag, object-lock/tag headers, "response-*" overrides), then
 * the payload chunk bl[bl_ofs, bl_ofs+bl_len). Subsequent calls see
 * sent_header set and jump straight to the data write. The goto-based
 * flow shares the header epilogue between the error and success paths;
 * header emission order is significant — do not reorder.
 */
int RGWGetObj_ObjStore_S3::send_response_data(bufferlist& bl, off_t bl_ofs,
					      off_t bl_len)
{
  const char *content_type = NULL;
  string content_type_str;
  map<string, string> response_attrs;
  map<string, string>::iterator riter;
  bufferlist metadata_bl;

  // x-amz-expiration value derived from lifecycle rules + object tags
  string expires = get_s3_expiration_header(s, lastmod);

  if (sent_header)
    goto send_data;

  if (custom_http_ret) {
    set_req_state_err(s, 0);
    dump_errno(s, custom_http_ret);
  } else {
    set_req_state_err(s, (partial_content && !op_ret) ? STATUS_PARTIAL_CONTENT
		      : op_ret);
    dump_errno(s);
  }

  if (op_ret)
    goto done;

  if (range_str)
    dump_range(s, start, end, s->obj_size);

  // multisite sync: optionally prepend JSON-encoded object metadata to
  // the body so the peer zone can restore attrs alongside the data
  if (s->system_request &&
      s->info.args.exists(RGW_SYS_PARAM_PREFIX "prepend-metadata")) {

    dump_header(s, "Rgwx-Object-Size", (long long)total_len);

    if (rgwx_stat) {
      /*
       * in this case, we're not returning the object's content, only the prepended
       * extra metadata
       */
      total_len = 0;
    }

    /* JSON encode object metadata */
    JSONFormatter jf;
    jf.open_object_section("obj_metadata");
    encode_json("attrs", attrs, &jf);
    utime_t ut(lastmod);
    encode_json("mtime", ut, &jf);
    jf.close_section();
    stringstream ss;
    jf.flush(ss);
    metadata_bl.append(ss.str());
    dump_header(s, "Rgwx-Embedded-Metadata-Len", metadata_bl.length());
    total_len += metadata_bl.length();
  }

  if (s->system_request && !real_clock::is_zero(lastmod)) {
    /* we end up dumping mtime in two different methods, a bit redundant */
    dump_epoch_header(s, "Rgwx-Mtime", lastmod);
    uint64_t pg_ver = 0;
    int r = decode_attr_bl_single_value(attrs, RGW_ATTR_PG_VER, &pg_ver, (uint64_t)0);
    if (r < 0) {
      ldpp_dout(this, 0) << "ERROR: failed to decode pg ver attr, ignoring" << dendl;
    }
    dump_header(s, "Rgwx-Obj-PG-Ver", pg_ver);

    uint32_t source_zone_short_id = 0;
    r = decode_attr_bl_single_value(attrs, RGW_ATTR_SOURCE_ZONE, &source_zone_short_id, (uint32_t)0);
    if (r < 0) {
      ldpp_dout(this, 0) << "ERROR: failed to decode pg ver attr, ignoring" << dendl;
    }
    if (source_zone_short_id != 0) {
      dump_header(s, "Rgwx-Source-Zone-Short-Id", source_zone_short_id);
    }
  }

  // headers produced by the decryption filter (SSE-C / SSE-KMS echoes)
  for (auto &it : crypt_http_responses)
    dump_header(s, it.first, it.second);

  dump_content_length(s, total_len);
  dump_last_modified(s, lastmod);
  dump_header_if_nonempty(s, "x-amz-version-id", version_id);
  dump_header_if_nonempty(s, "x-amz-expiration", expires);

  if (attrs.find(RGW_ATTR_APPEND_PART_NUM) != attrs.end()) {
    dump_header(s, "x-rgw-object-type", "Appendable");
    dump_header(s, "x-rgw-next-append-position", s->obj_size);
  } else {
    dump_header(s, "x-rgw-object-type", "Normal");
  }
  // replication status
  if (auto i = attrs.find(RGW_ATTR_OBJ_REPLICATION_STATUS);
      i != attrs.end()) {
    dump_header(s, "x-amz-replication-status", i->second);
  }
  if (auto i = attrs.find(RGW_ATTR_OBJ_REPLICATION_TRACE);
      i != attrs.end()) {
    try {
      std::vector<rgw_zone_set_entry> zones;
      auto p = i->second.cbegin();
      decode(zones, p);
      for (const auto& zone : zones) {
        dump_header(s, "x-rgw-replicated-from", zone.to_str());
      }
    } catch (const buffer::error&) {} // omit x-rgw-replicated-from headers
  }

  if (! op_ret) {
    if (! lo_etag.empty()) {
      /* Handle etag of Swift API's large objects (DLO/SLO). It's entirerly
       * legit to perform GET on them through S3 API. In such situation,
       * a client should receive the composited content with corresponding
       * etag value. */
      dump_etag(s, lo_etag);
    } else {
      auto iter = attrs.find(RGW_ATTR_ETAG);
      if (iter != attrs.end()) {
        dump_etag(s, iter->second.to_str());
      }
    }

    // apply any "response-*" query-parameter header overrides
    for (struct response_attr_param *p = resp_attr_params; p->param; p++) {
      bool exists;
      string val = s->info.args.get(p->param, &exists);
      if (exists) {
	/* reject unauthenticated response header manipulation, see
	 * https://docs.aws.amazon.com/AmazonS3/latest/API/API_GetObject.html */
	if (s->auth.identity->is_anonymous()) {
	  return -ERR_INVALID_REQUEST;
	}
	/* HTTP specification says no control characters should be present in
	 * header values: https://tools.ietf.org/html/rfc7230#section-3.2
	 *      field-vchar    = VCHAR / obs-text
	 *
	 * Failure to validate this permits a CRLF injection in HTTP headers,
	 * whereas S3 GetObject only permits specific headers.
	 */
	if(str_has_cntrl(val)) {
	  /* TODO: return a more distinct error in future;
	   * stating what the problem is */
	  return -ERR_INVALID_REQUEST;
	}

	if (strcmp(p->param, "response-content-type") != 0) {
	  response_attrs[p->http_attr] = val;
	} else {
	  content_type_str = val;
	  content_type = content_type_str.c_str();
	}
      }
    }

    // walk the object's xattrs and surface them as response headers
    for (auto iter = attrs.begin(); iter != attrs.end(); ++iter) {
      const char *name = iter->first.c_str();
      map<string, string>::iterator aiter = rgw_to_http_attrs.find(name);
      if (aiter != rgw_to_http_attrs.end()) {
        if (response_attrs.count(aiter->second) == 0) {
          /* Was not already overridden by a response param. */

          // trim trailing NUL bytes; NOTE this local 's' shadows the
          // req_state* member for the rest of this scope
          size_t len = iter->second.length();
          string s(iter->second.c_str(), len);
          while (len && !s[len - 1]) {
            --len;
            s.resize(len);
          }
          response_attrs[aiter->second] = s;
        }
      } else if (iter->first.compare(RGW_ATTR_CONTENT_TYPE) == 0) {
        /* Special handling for content_type. */
        if (!content_type) {
          content_type_str = rgw_bl_str(iter->second);
          content_type = content_type_str.c_str();
        }
      } else if (strcmp(name, RGW_ATTR_SLO_UINDICATOR) == 0) {
        // this attr has an extra length prefix from encode() in prior versions
        dump_header(s, "X-Object-Meta-Static-Large-Object", "True");
      } else if (strncmp(name, RGW_ATTR_META_PREFIX,
			 sizeof(RGW_ATTR_META_PREFIX)-1) == 0) {
        /* User custom metadata. */
        name += sizeof(RGW_ATTR_PREFIX) - 1;
        dump_header(s, name, iter->second);
      } else if (iter->first.compare(RGW_ATTR_TAGS) == 0) {
        RGWObjTags obj_tags;
        try{
          auto it = iter->second.cbegin();
          obj_tags.decode(it);
        } catch (buffer::error &err) {
          ldpp_dout(this,0) << "Error caught buffer::error couldn't decode TagSet " << dendl;
        }
        dump_header(s, RGW_AMZ_TAG_COUNT, obj_tags.count());
      } else if (iter->first.compare(RGW_ATTR_OBJECT_RETENTION) == 0 && get_retention){
        RGWObjectRetention retention;
        try {
          decode(retention, iter->second);
          dump_header(s, "x-amz-object-lock-mode", retention.get_mode());
          string date = ceph::to_iso_8601(retention.get_retain_until_date());
          dump_header(s, "x-amz-object-lock-retain-until-date", date.c_str());
        } catch (buffer::error& err) {
          ldpp_dout(this, 0) << "ERROR: failed to decode RGWObjectRetention" << dendl;
        }
      } else if (iter->first.compare(RGW_ATTR_OBJECT_LEGAL_HOLD) == 0 && get_legal_hold) {
        RGWObjectLegalHold legal_hold;
        try {
          decode(legal_hold, iter->second);
          dump_header(s, "x-amz-object-lock-legal-hold",legal_hold.get_status());
        } catch (buffer::error& err) {
          ldpp_dout(this, 0) << "ERROR: failed to decode RGWObjectLegalHold" << dendl;
        }
      }
    }
  }

done:
  // flush collected headers, then terminate the header section
  for (riter = response_attrs.begin(); riter != response_attrs.end();
       ++riter) {
    dump_header(s, riter->first, riter->second);
  }

  if (op_ret == -ERR_NOT_MODIFIED) {
    end_header(s, this);
  } else {
    if (!content_type)
      content_type = "binary/octet-stream";

    end_header(s, this, content_type);
  }

  if (metadata_bl.length()) {
    dump_body(s, metadata_bl);
  }
  sent_header = true;

send_data:
  if (get_data && !op_ret) {
    int r = dump_body(s, bl.c_str() + bl_ofs, bl_len);
    if (r < 0)
      return r;
  }

  return 0;
}
+
/*
 * Install a block-decryption filter in front of `cb` when the object is
 * encrypted. No-op (returns 0 with *filter untouched) when decryption is
 * skipped for multisite sync or the object is not encrypted. For
 * multipart uploads the per-part lengths are needed to decrypt across
 * part boundaries; they come from RGW_ATTR_CRYPT_PARTS (replicated
 * objects) or are derived from the manifest.
 */
int RGWGetObj_ObjStore_S3::get_decrypt_filter(std::unique_ptr<RGWGetObj_Filter> *filter, RGWGetObj_Filter* cb, bufferlist* manifest_bl)
{
  if (skip_decrypt) { // bypass decryption for multisite sync requests
    return 0;
  }

  std::unique_ptr<BlockCrypt> block_crypt;
  int res = rgw_s3_prepare_decrypt(s, attrs, &block_crypt, crypt_http_responses);
  if (res < 0) {
    return res;
  }
  if (block_crypt == nullptr) {
    // object is not encrypted; no filter needed
    return 0;
  }

  // in case of a multipart upload, we need to know the part lengths to
  // correctly decrypt across part boundaries
  std::vector<size_t> parts_len;

  // for replicated objects, the original part lengths are preserved in an xattr
  if (auto i = attrs.find(RGW_ATTR_CRYPT_PARTS); i != attrs.end()) {
    try {
      auto p = i->second.cbegin();
      using ceph::decode;
      decode(parts_len, p);
    } catch (const buffer::error&) {
      ldpp_dout(this, 1) << "failed to decode RGW_ATTR_CRYPT_PARTS" << dendl;
      return -EIO;
    }
  } else if (manifest_bl) {
    // otherwise, we read the part lengths from the manifest
    res = RGWGetObj_BlockDecrypt::read_manifest_parts(this, *manifest_bl,
                                                      parts_len);
    if (res < 0) {
      return res;
    }
  }

  *filter = std::make_unique<RGWGetObj_BlockDecrypt>(
      s, s->cct, cb, std::move(block_crypt),
      std::move(parts_len));
  return 0;
}
+int RGWGetObj_ObjStore_S3::verify_requester(const rgw::auth::StrategyRegistry& auth_registry, optional_yield y)
+{
+ int ret = -EINVAL;
+ ret = RGWOp::verify_requester(auth_registry, y);
+ if(!s->user->get_caps().check_cap("amz-cache", RGW_CAP_READ) && !ret && s->info.env->exists("HTTP_X_AMZ_CACHE"))
+ ret = override_range_hdr(auth_registry, y);
+ return ret;
+}
+
/*
 * Apply header overrides packed into the X-Amz-Cache header, re-run
 * requester verification with those headers in effect, then restore the
 * caller's original Range header. The packed format is a list of
 * key/value pairs: pairs are separated by byte 178 and key from value by
 * byte 177; keys are dash-separated header names which are mapped to
 * their CGI form (HTTP_ prefix, dashes to underscores). A malformed pair
 * aborts with -EINVAL.
 */
int RGWGetObj_ObjStore_S3::override_range_hdr(const rgw::auth::StrategyRegistry& auth_registry, optional_yield y)
{
  int ret = -EINVAL;
  ldpp_dout(this, 10) << "cache override headers" << dendl;
  RGWEnv* rgw_env = const_cast<RGWEnv *>(s->info.env);
  const char* backup_range = rgw_env->get("HTTP_RANGE"); // saved for restore below
  const char hdrs_split[2] = {(char)178,'\0'};  // pair separator
  const char kv_split[2] = {(char)177,'\0'};    // key/value separator
  const char* cache_hdr = rgw_env->get("HTTP_X_AMZ_CACHE");
  for (std::string_view hdr : ceph::split(cache_hdr, hdrs_split)) {
    auto kv = ceph::split(hdr, kv_split);
    auto k = kv.begin();
    if (std::distance(k, kv.end()) != 2) {
      return -EINVAL; // each entry must be exactly key + value
    }
    auto v = std::next(k);
    std::string key = "HTTP_";
    key.append(*k);
    boost::replace_all(key, "-", "_");
    ldpp_dout(this, 10) << "after splitting cache kv key: " << key << " " << *v  << dendl;
    rgw_env->set(std::move(key), std::string(*v));
  }
  ret = RGWOp::verify_requester(auth_registry, y);
  // restore the original Range header (or drop the override-installed one)
  if(!ret && backup_range) {
    rgw_env->set("HTTP_RANGE",backup_range);
  } else {
    rgw_env->remove("HTTP_RANGE");
  }
  return ret;
}
+
+
// GetObjectTagging response: emit <Tagging><TagSet>…</TagSet></Tagging>
// decoded from the stored tag blob. Note: on decode failure op_ret is
// set to -EIO but headers have already been sent, so the XML document is
// simply left unflushed/truncated.
void RGWGetObjTags_ObjStore_S3::send_response_data(bufferlist& bl)
{
  if (op_ret)
    set_req_state_err(s, op_ret);
  dump_errno(s);
  end_header(s, this, to_mime_type(s->format));
  dump_start(s);

  if (!op_ret){
    s->formatter->open_object_section_in_ns("Tagging", XMLNS_AWS_S3);
    s->formatter->open_object_section("TagSet");
    if (has_tags){
      RGWObjTagSet_S3 tagset;
      auto iter = bl.cbegin();
      try {
        tagset.decode(iter);
      } catch (buffer::error& err) {
        ldpp_dout(this,0) << "ERROR: caught buffer::error, couldn't decode TagSet" << dendl;
        op_ret= -EIO;
        return;
      }
      tagset.dump_xml(s->formatter);
    }
    s->formatter->close_section();
    s->formatter->close_section();
    rgw_flush_formatter_and_reset(s, s->formatter);
  }
}
+
+
/*
 * Parse a PutObjectTagging request: read the XML body (bounded by
 * rgw_max_put_param_size), decode the <Tagging> document, validate and
 * rebuild it into an RGWObjTags set, and encode the result into tags_bl
 * for storage. Returns -ERR_MALFORMED_XML on parse/decode failures.
 */
int RGWPutObjTags_ObjStore_S3::get_params(optional_yield y)
{
  RGWXMLParser parser;

  if (!parser.init()){
    return -EINVAL;
  }

  const auto max_size = s->cct->_conf->rgw_max_put_param_size;

  int r = 0;
  bufferlist data;
  std::tie(r, data) = read_all_input(s, max_size, false);

  if (r < 0)
    return r;

  if (!parser.parse(data.c_str(), data.length(), 1)) {
    return -ERR_MALFORMED_XML;
  }

  RGWObjTagging_S3 tagging;

  try {
    RGWXMLDecoder::decode_xml("Tagging", tagging, &parser);
  } catch (RGWXMLDecoder::err& err) {
    ldpp_dout(this, 5) << "Malformed tagging request: " << err << dendl;
    return -ERR_MALFORMED_XML;
  }

  RGWObjTags obj_tags;
  // rebuild() validates tag count/lengths and copies into obj_tags
  r = tagging.rebuild(obj_tags);
  if (r < 0)
    return r;

  obj_tags.encode(tags_bl);
  // NOTE(review): log message lacks a space before "tags"
  ldpp_dout(this, 20) << "Read " << obj_tags.count() << "tags" << dendl;

  return 0;
}
+
+void RGWPutObjTags_ObjStore_S3::send_response()
+{
+ if (op_ret)
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s, this, to_mime_type(s->format));
+ dump_start(s);
+
+}
+
+void RGWDeleteObjTags_ObjStore_S3::send_response()
+{
+ if (op_ret == 0){
+ op_ret = STATUS_NO_CONTENT;
+ }
+ if (op_ret)
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s, this, to_mime_type(s->format));
+ dump_start(s);
+}
+
// GetBucketTagging response: same shape as the object-tagging variant —
// <Tagging><TagSet>…</TagSet></Tagging> decoded from the stored blob.
// On decode failure op_ret becomes -EIO after headers were sent, so the
// XML document is left unflushed.
void RGWGetBucketTags_ObjStore_S3::send_response_data(bufferlist& bl)
{
  if (op_ret)
    set_req_state_err(s, op_ret);
  dump_errno(s);
  end_header(s, this, to_mime_type(s->format));
  dump_start(s);

  if (!op_ret) {
  s->formatter->open_object_section_in_ns("Tagging", XMLNS_AWS_S3);
  s->formatter->open_object_section("TagSet");
  if (has_tags){
    RGWObjTagSet_S3 tagset;
    auto iter = bl.cbegin();
    try {
      tagset.decode(iter);
    } catch (buffer::error& err) {
      ldpp_dout(this,0) << "ERROR: caught buffer::error, couldn't decode TagSet" << dendl;
      op_ret= -EIO;
      return;
    }
    tagset.dump_xml(s->formatter);
  }
  s->formatter->close_section();
  s->formatter->close_section();
  rgw_flush_formatter_and_reset(s, s->formatter);
  }
}
+
/*
 * Parse a PutBucketTagging request: read the XML body (bounded by
 * rgw_max_put_param_size), decode <Tagging>, rebuild into an RGWObjTags
 * limited to 50 tags (the S3 bucket tag-set maximum), and encode the
 * result into tags_bl. On non-meta-master zones the raw body is kept in
 * in_data so the request can be forwarded to the meta master.
 */
int RGWPutBucketTags_ObjStore_S3::get_params(const DoutPrefixProvider *dpp, optional_yield y)
{
  RGWXMLParser parser;

  if (!parser.init()){
    return -EINVAL;
  }

  const auto max_size = s->cct->_conf->rgw_max_put_param_size;
  int r = 0;
  bufferlist data;

  std::tie(r, data) = read_all_input(s, max_size, false);

  if (r < 0)
    return r;

  if (!parser.parse(data.c_str(), data.length(), 1)) {
    return -ERR_MALFORMED_XML;
  }

  RGWObjTagging_S3 tagging;
  try {
    RGWXMLDecoder::decode_xml("Tagging", tagging, &parser);
  } catch (RGWXMLDecoder::err& err) {

    ldpp_dout(dpp, 5) << "Malformed tagging request: " << err << dendl;
    return -ERR_MALFORMED_XML;
  }

  RGWObjTags obj_tags(50); // A tag set can contain as many as 50 tags, or it can be empty.
  r = tagging.rebuild(obj_tags);
  if (r < 0)
    return r;

  obj_tags.encode(tags_bl);
  ldpp_dout(dpp, 20) << "Read " << obj_tags.count() << "tags" << dendl;

  // forward bucket tags requests to meta master zone
  if (!driver->is_meta_master()) {
    /* only need to keep this data around if we're not meta master */
    in_data = std::move(data);
  }

  return 0;
}
+
+void RGWPutBucketTags_ObjStore_S3::send_response()
+{
+ if (op_ret)
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s, this, to_mime_type(s->format));
+ dump_start(s);
+}
+
+void RGWDeleteBucketTags_ObjStore_S3::send_response()
+{
+ // A successful DeleteBucketTagging should
+ // return a 204 status code.
+ if (op_ret == 0)
+ op_ret = STATUS_NO_CONTENT;
+ if (op_ret)
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s, this, to_mime_type(s->format));
+ dump_start(s);
+}
+
+namespace {
+
// True iff `s` is one of the two values S3 permits in a replication
// configuration Status element.
bool is_valid_status(const std::string& s) {
  return s == "Enabled" || s == "Disabled";
}
+
// Well-known sync-group ids used to model the S3 replication rule
// Status ("Enabled"/"Disabled") as rgw sync-policy groups.
static string enabled_group_id = "s3-bucket-replication:enabled";
static string disabled_group_id = "s3-bucket-replication:disabled";
+
+struct ReplicationConfiguration {
+ string role;
+
+ struct Rule {
    // <DeleteMarkerReplication>: whether delete markers replicate;
    // Status must be "Enabled" or "Disabled".
    struct DeleteMarkerReplication {
      string status;

      void decode_xml(XMLObj *obj) {
        RGWXMLDecoder::decode_xml("Status", status, obj);
      }

      void dump_xml(Formatter *f) const {
        encode_xml("Status", status, f);
      }

      bool is_valid(CephContext *cct) const {
        bool result = is_valid_status(status);
        if (!result) {
          ldout(cct, 5) << "NOTICE: bad status provided in DeleteMarkerReplication element (status=" << status << ")" << dendl;
        }
        return result;
      }
    };
+
    // <Source>: rgw extension listing the zones a rule replicates from.
    struct Source { /* rgw extension */
      std::vector<string> zone_names;

      void decode_xml(XMLObj *obj) {
        RGWXMLDecoder::decode_xml("Zone", zone_names, obj);
      }

      void dump_xml(Formatter *f) const {
        encode_xml("Zone", zone_names, f);
      }
    };
+
    // <Destination>: replication target — bucket, optional account /
    // storage class / ACL-owner translation, plus the rgw zone-list
    // extension. Empty optional strings are normalized to "absent".
    struct Destination {
      struct AccessControlTranslation {
        string owner;

        void decode_xml(XMLObj *obj) {
          RGWXMLDecoder::decode_xml("Owner", owner, obj);
        }
        void dump_xml(Formatter *f) const {
          encode_xml("Owner", owner, f);
        }
      };

      std::optional<AccessControlTranslation> acl_translation;
      std::optional<string> account;
      string bucket;
      std::optional<string> storage_class;
      std::vector<string> zone_names;

      void decode_xml(XMLObj *obj) {
        RGWXMLDecoder::decode_xml("AccessControlTranslation", acl_translation, obj);
        RGWXMLDecoder::decode_xml("Account", account, obj);
        if (account && account->empty()) {
          account.reset(); // treat empty <Account> as unset
        }
        RGWXMLDecoder::decode_xml("Bucket", bucket, obj);
        RGWXMLDecoder::decode_xml("StorageClass", storage_class, obj);
        if (storage_class && storage_class->empty()) {
          storage_class.reset(); // treat empty <StorageClass> as unset
        }
        RGWXMLDecoder::decode_xml("Zone", zone_names, obj); /* rgw extension */
      }

      void dump_xml(Formatter *f) const {
        encode_xml("AccessControlTranslation", acl_translation, f);
        encode_xml("Account", account, f);
        encode_xml("Bucket", bucket, f);
        encode_xml("StorageClass", storage_class, f);
        encode_xml("Zone", zone_names, f);
      }
    };
+
    // <Filter>: selects which objects a replication rule applies to —
    // a bare Prefix, a single Tag, or an <And> combining a prefix with
    // multiple tags. Also converts to/from the rgw sync-pipe filter
    // representation. Empty sub-elements are normalized to "absent".
    struct Filter {
      struct Tag {
        string key;
        string value;

        bool empty() const {
          return key.empty() && value.empty();
        }

        void decode_xml(XMLObj *obj) {
          RGWXMLDecoder::decode_xml("Key", key, obj);
          RGWXMLDecoder::decode_xml("Value", value, obj);
        };

        void dump_xml(Formatter *f) const {
          encode_xml("Key", key, f);
          encode_xml("Value", value, f);
        }
      };

      // <And>: prefix plus any number of tags, all of which must match
      struct AndElements {
        std::optional<string> prefix;
        std::vector<Tag> tags;

        bool empty() const {
          return !prefix &&
            (tags.size() == 0);
        }

        void decode_xml(XMLObj *obj) {
          std::vector<Tag> _tags;
          RGWXMLDecoder::decode_xml("Prefix", prefix, obj);
          if (prefix && prefix->empty()) {
            prefix.reset();
          }
          RGWXMLDecoder::decode_xml("Tag", _tags, obj);
          // keep only non-empty tag elements
          for (auto& t : _tags) {
            if (!t.empty()) {
              tags.push_back(std::move(t));
            }
          }
        };

        void dump_xml(Formatter *f) const {
          encode_xml("Prefix", prefix, f);
          encode_xml("Tag", tags, f);
        }
      };

      std::optional<string> prefix;
      std::optional<Tag> tag;
      std::optional<AndElements> and_elements;

      bool empty() const {
        return (!prefix && !tag && !and_elements);
      }

      void decode_xml(XMLObj *obj) {
        RGWXMLDecoder::decode_xml("Prefix", prefix, obj);
        if (prefix && prefix->empty()) {
          prefix.reset();
        }
        RGWXMLDecoder::decode_xml("Tag", tag, obj);
        if (tag && tag->empty()) {
          tag.reset();
        }
        RGWXMLDecoder::decode_xml("And", and_elements, obj);
        if (and_elements && and_elements->empty()) {
          and_elements.reset();
        }
      };

      void dump_xml(Formatter *f) const {
        encode_xml("Prefix", prefix, f);
        encode_xml("Tag", tag, f);
        encode_xml("And", and_elements, f);
      }

      // a filter may carry either a Tag or a Prefix (not both), and a
      // prefix may not appear both directly and inside <And>
      bool is_valid(CephContext *cct) const {
        if (tag && prefix) {
          ldout(cct, 5) << "NOTICE: both tag and prefix were provided in replication filter rule" << dendl;
          return false;
        }

        if (and_elements) {
          if (prefix && and_elements->prefix) {
            ldout(cct, 5) << "NOTICE: too many prefixes were provided in re" << dendl;
            return false;
          }
        }
        return true;
      };

      // Flatten this filter into the rgw sync-pipe filter form (single
      // prefix + tag set). Returns -EINVAL for invalid combinations.
      int to_sync_pipe_filter(CephContext *cct,
                              rgw_sync_pipe_filter *f) const {
        if (!is_valid(cct)) {
          return -EINVAL;
        }
        if (prefix) {
          f->prefix = *prefix;
        }
        if (tag) {
          f->tags.insert(rgw_sync_pipe_filter_tag(tag->key, tag->value));
        }

        if (and_elements) {
          if (and_elements->prefix) {
            f->prefix = *and_elements->prefix;
          }
          for (auto& t : and_elements->tags) {
            f->tags.insert(rgw_sync_pipe_filter_tag(t.key, t.value));
          }
        }
        return 0;
      }

      // Rebuild the S3 XML representation from a sync-pipe filter:
      // prefix-only -> <Prefix>; single tag -> <Tag>; otherwise <And>.
      void from_sync_pipe_filter(const rgw_sync_pipe_filter& f) {
        if (f.prefix && f.tags.empty()) {
          prefix = f.prefix;
          return;
        }
        if (f.prefix) {
          and_elements.emplace();
          and_elements->prefix = f.prefix;
        } else if (f.tags.size() == 1) {
          auto iter = f.tags.begin();
          if (iter == f.tags.end()) {
            /* should never happen */
            return;
          }
          auto& t = *iter;
          tag.emplace();
          tag->key = t.key;
          tag->value = t.value;
          return;
        }

        if (f.tags.empty()) {
          return;
        }

        if (!and_elements) {
          and_elements.emplace();
        }

        for (auto& t : f.tags) {
          auto& tag = and_elements->tags.emplace_back();
          tag.key = t.key;
          tag.value = t.value;
        }
      }
    };
+
+    // Resolve zone names to zone ids within the current zonegroup.
+    // Names that fail to resolve are silently dropped, matching the
+    // best-effort behavior expected by the sync-policy conversion.
+    set<rgw_zone_id> get_zone_ids_from_names(rgw::sal::Driver* driver,
+                                             const vector<string>& zone_names) const {
+      set<rgw_zone_id> result;
+
+      for (const auto& zone_name : zone_names) {
+        std::unique_ptr<rgw::sal::Zone> zone;
+        if (driver->get_zone()->get_zonegroup().get_zone_by_name(zone_name, &zone) >= 0) {
+          result.insert(zone->get_id());
+        }
+      }
+
+      return result;
+    }
+
+    // Resolve zone ids back to their display names; unresolvable ids are
+    // skipped (best-effort, mirrors get_zone_ids_from_names above).
+    vector<string> get_zone_names_from_ids(rgw::sal::Driver* driver,
+                                           const set<rgw_zone_id>& zone_ids) const {
+      vector<string> result;
+
+      for (const auto& zone_id : zone_ids) {
+        std::unique_ptr<rgw::sal::Zone> zone;
+        if (driver->get_zone()->get_zonegroup().get_zone_by_id(zone_id.id, &zone) >= 0) {
+          result.emplace_back(zone->get_name());
+        }
+      }
+
+      return result;
+    }
+
+    // Parsed <Rule> fields (AWS S3 replication configuration schema).
+    std::optional<DeleteMarkerReplication> delete_marker_replication;
+    std::optional<Source> source;   // absent -> replicate from all zones
+    Destination destination;        // required destination element
+    std::optional<Filter> filter;   // <Filter>, or legacy top-level <Prefix>
+    string id;                      // rule id; becomes the sync pipe id
+    int32_t priority;               // copied into pipe->params.priority
+    string status;                  // "Enabled" or "Disabled"
+
+    // Parse one <Rule> element. Handles both the current schema (prefix
+    // inside <Filter>) and the legacy schema (top-level <Prefix>).
+    void decode_xml(XMLObj *obj) {
+      RGWXMLDecoder::decode_xml("DeleteMarkerReplication", delete_marker_replication, obj);
+      RGWXMLDecoder::decode_xml("Source", source, obj);
+      RGWXMLDecoder::decode_xml("Destination", destination, obj);
+      RGWXMLDecoder::decode_xml("ID", id, obj);
+
+      // legacy top-level <Prefix>: fold it into a fresh filter
+      std::optional<string> prefix;
+      RGWXMLDecoder::decode_xml("Prefix", prefix, obj);
+      if (prefix) {
+        filter.emplace();
+        filter->prefix = prefix;
+      }
+
+      if (!filter) {
+        RGWXMLDecoder::decode_xml("Filter", filter, obj);
+      } else {
+        /* don't want to have filter reset because it might have been initialized
+         * when decoding prefix
+         */
+        RGWXMLDecoder::decode_xml("Filter", *filter, obj);
+      }
+
+      RGWXMLDecoder::decode_xml("Priority", priority, obj);
+      RGWXMLDecoder::decode_xml("Status", status, obj);
+    }
+
+    // Serialize this rule back to XML; unset optionals are omitted.
+    void dump_xml(Formatter *f) const {
+      encode_xml("DeleteMarkerReplication", delete_marker_replication, f);
+      encode_xml("Source", source, f);
+      encode_xml("Destination", destination, f);
+      encode_xml("Filter", filter, f);
+      encode_xml("ID", id, f);
+      encode_xml("Priority", priority, f);
+      encode_xml("Status", status, f);
+    }
+
+    // A rule is valid when its status string is recognized and every
+    // optional sub-element that is present validates on its own.
+    bool is_valid(CephContext *cct) const {
+      if (!is_valid_status(status)) {
+        ldout(cct, 5) << "NOTICE: bad status provided in rule (status=" << status << ")" << dendl;
+        return false;
+      }
+      if (filter && !filter->is_valid(cct)) {
+        return false;
+      }
+      if (delete_marker_replication && !delete_marker_replication->is_valid(cct)) {
+        return false;
+      }
+      return true;
+    }
+
+    // Convert this replication rule into an internal sync-policy pipe.
+    // *enabled reports whether the rule's status is "Enabled" so the caller
+    // can file the pipe into the right policy group.
+    // Returns 0 on success, -EINVAL if the rule fails validation.
+    int to_sync_policy_pipe(req_state *s, rgw::sal::Driver* driver,
+                            rgw_sync_bucket_pipes *pipe,
+                            bool *enabled) const {
+      if (!is_valid(s->cct)) {
+        return -EINVAL;
+      }
+
+      pipe->id = id;
+      pipe->params.priority = priority;
+
+      const auto& user_id = s->user->get_id();
+
+      // destination bucket inherits the requesting user's tenant
+      rgw_bucket_key dest_bk(user_id.tenant,
+                             destination.bucket);
+
+      // empty zone list on either side means "all zones"
+      if (source && !source->zone_names.empty()) {
+        pipe->source.zones = get_zone_ids_from_names(driver, source->zone_names);
+      } else {
+        pipe->source.set_all_zones(true);
+      }
+      if (!destination.zone_names.empty()) {
+        pipe->dest.zones = get_zone_ids_from_names(driver, destination.zone_names);
+      } else {
+        pipe->dest.set_all_zones(true);
+      }
+      pipe->dest.bucket.emplace(dest_bk);
+
+      if (filter) {
+        int r = filter->to_sync_pipe_filter(s->cct, &pipe->params.source.filter);
+        if (r < 0) {
+          return r;
+        }
+      }
+      if (destination.acl_translation) {
+        rgw_user u;
+        u.tenant = user_id.tenant;
+        u.from_str(destination.acl_translation->owner); /* explicit tenant will override tenant,
+                                                           otherwise will inherit it from s->user */
+        pipe->params.dest.acl_translation.emplace();
+        pipe->params.dest.acl_translation->owner = u;
+      }
+      pipe->params.dest.storage_class = destination.storage_class;
+
+      *enabled = (status == "Enabled");
+
+      // pipes created via the S3 API always run in user mode, on behalf of
+      // the requesting user
+      pipe->params.mode = rgw_sync_pipe_params::Mode::MODE_USER;
+      pipe->params.user = user_id.to_str();
+
+      return 0;
+    }
+
+    // Populate this rule from an internal sync-policy pipe (the inverse of
+    // to_sync_policy_pipe). 'enabled' selects the reported status string.
+    void from_sync_policy_pipe(rgw::sal::Driver* driver,
+                               const rgw_sync_bucket_pipes& pipe,
+                               bool enabled) {
+      id = pipe.id;
+      status = (enabled ? "Enabled" : "Disabled");
+      priority = pipe.params.priority;
+
+      if (pipe.source.all_zones) {
+        // "all zones" is represented by omitting <Source> entirely
+        source.reset();
+      } else if (pipe.source.zones) {
+        source.emplace();
+        source->zone_names = get_zone_names_from_ids(driver, *pipe.source.zones);
+      }
+
+      if (!pipe.dest.all_zones &&
+          pipe.dest.zones) {
+        destination.zone_names = get_zone_names_from_ids(driver, *pipe.dest.zones);
+      }
+
+      if (pipe.params.dest.acl_translation) {
+        destination.acl_translation.emplace();
+        destination.acl_translation->owner = pipe.params.dest.acl_translation->owner.to_str();
+      }
+
+      if (pipe.params.dest.storage_class) {
+        destination.storage_class = *pipe.params.dest.storage_class;
+      }
+
+      if (pipe.dest.bucket) {
+        destination.bucket = pipe.dest.bucket->get_key();
+      }
+
+      // build the filter unconditionally, then drop it again if it carries
+      // no information, so empty filters are not serialized
+      filter.emplace();
+      filter->from_sync_pipe_filter(pipe.params.source.filter);
+
+      if (filter->empty()) {
+        filter.reset();
+      }
+    }
+ };
+
+  // All replication rules parsed from the configuration, in document order.
+  std::vector<Rule> rules;
+
+  // Parse the top-level <ReplicationConfiguration> children.
+  void decode_xml(XMLObj *obj) {
+    RGWXMLDecoder::decode_xml("Role", role, obj);
+    RGWXMLDecoder::decode_xml("Rule", rules, obj);
+  }
+
+  // Serialize the top-level configuration.
+  void dump_xml(Formatter *f) const {
+    encode_xml("Role", role, f);
+    encode_xml("Rule", rules, f);
+  }
+
+  // Convert the whole configuration into exactly two sync-policy groups:
+  // result[0] holds pipes from "Enabled" rules (group status ENABLED),
+  // result[1] holds pipes from "Disabled" rules (group status ALLOWED).
+  // Returns 0 on success or the first rule-conversion error.
+  int to_sync_policy_groups(req_state *s, rgw::sal::Driver* driver,
+                            vector<rgw_sync_policy_group> *result) const {
+    result->resize(2);
+
+    rgw_sync_policy_group& enabled_group = (*result)[0];
+    rgw_sync_policy_group& disabled_group = (*result)[1];
+
+    enabled_group.id = enabled_group_id;
+    enabled_group.status = rgw_sync_policy_group::Status::ENABLED;
+    disabled_group.id = disabled_group_id;
+    disabled_group.status = rgw_sync_policy_group::Status::ALLOWED; /* not enabled, not forbidden */
+
+    for (auto& rule : rules) {
+      rgw_sync_bucket_pipes pipe;
+      bool enabled;
+      int r = rule.to_sync_policy_pipe(s, driver, &pipe, &enabled);
+      if (r < 0) {
+        ldpp_dout(s, 5) << "NOTICE: failed to convert replication configuration into sync policy pipe (rule.id=" << rule.id << "): " << cpp_strerror(-r) << dendl;
+        return r;
+      }
+
+      if (enabled) {
+        enabled_group.pipes.emplace_back(std::move(pipe));
+      } else {
+        disabled_group.pipes.emplace_back(std::move(pipe));
+      }
+    }
+    return 0;
+  }
+
+  // Append one Rule per pipe in the group. The group's status decides
+  // whether the resulting rules report "Enabled" or "Disabled".
+  void from_sync_policy_group(rgw::sal::Driver* driver,
+                              const rgw_sync_policy_group& group) {
+
+    const bool enabled = (group.status == rgw_sync_policy_group::Status::ENABLED);
+
+    for (auto& pipe : group.pipes) {
+      rules.emplace_back().from_sync_policy_pipe(driver, pipe, enabled);
+    }
+  }
+};
+
+}
+
+// GET bucket replication: render the bucket's sync policy (both the
+// enabled and the allowed/disabled groups) as an S3
+// <ReplicationConfiguration> document.
+void RGWGetBucketReplication_ObjStore_S3::send_response_data()
+{
+  if (op_ret)
+    set_req_state_err(s, op_ret);
+  dump_errno(s);
+  end_header(s, this, to_mime_type(s->format));
+  dump_start(s);
+
+  ReplicationConfiguration conf;
+
+  if (s->bucket->get_info().sync_policy) {
+    auto policy = s->bucket->get_info().sync_policy;
+
+    // merge rules from both groups the PUT handler may have created
+    auto iter = policy->groups.find(enabled_group_id);
+    if (iter != policy->groups.end()) {
+      conf.from_sync_policy_group(driver, iter->second);
+    }
+    iter = policy->groups.find(disabled_group_id);
+    if (iter != policy->groups.end()) {
+      conf.from_sync_policy_group(driver, iter->second);
+    }
+  }
+
+  if (!op_ret) {
+    s->formatter->open_object_section_in_ns("ReplicationConfiguration", XMLNS_AWS_S3);
+    conf.dump_xml(s->formatter);
+    s->formatter->close_section();
+    rgw_flush_formatter_and_reset(s, s->formatter);
+  }
+}
+
+// PUT bucket replication: read and parse the XML request body, convert it
+// into sync-policy groups, and stash the raw body for forwarding when this
+// zone is not the metadata master.
+// Returns 0 on success, a negative error (-EINVAL, -ERR_MALFORMED_XML, or a
+// conversion error) on failure.
+int RGWPutBucketReplication_ObjStore_S3::get_params(optional_yield y)
+{
+  RGWXMLParser parser;
+
+  if (!parser.init()){
+    return -EINVAL;
+  }
+
+  const auto max_size = s->cct->_conf->rgw_max_put_param_size;
+  int r = 0;
+  bufferlist data;
+
+  std::tie(r, data) = read_all_input(s, max_size, false);
+
+  if (r < 0)
+    return r;
+
+  if (!parser.parse(data.c_str(), data.length(), 1)) {
+    return -ERR_MALFORMED_XML;
+  }
+
+  ReplicationConfiguration conf;
+  try {
+    RGWXMLDecoder::decode_xml("ReplicationConfiguration", conf, &parser);
+  } catch (RGWXMLDecoder::err& err) {
+    /* fixed log text that was copy-pasted from the tagging handler */
+    ldpp_dout(this, 5) << "Malformed replication configuration request: " << err << dendl;
+    return -ERR_MALFORMED_XML;
+  }
+
+  r = conf.to_sync_policy_groups(s, driver, &sync_policy_groups);
+  if (r < 0) {
+    return r;
+  }
+
+  // forward requests to meta master zone
+  if (!driver->is_meta_master()) {
+    /* only need to keep this data around if we're not meta master */
+    in_data = std::move(data);
+  }
+
+  return 0;
+}
+
+// PUT bucket replication: empty-body response, status code only.
+void RGWPutBucketReplication_ObjStore_S3::send_response()
+{
+  if (op_ret)
+    set_req_state_err(s, op_ret);
+  dump_errno(s);
+  end_header(s, this, to_mime_type(s->format));
+  dump_start(s);
+}
+
+// DELETE bucket replication: drop both sync-policy groups (enabled and
+// allowed/disabled) that the PUT handler may have created.
+void RGWDeleteBucketReplication_ObjStore_S3::update_sync_policy(rgw_sync_policy_info *policy)
+{
+  // the two erases are independent; order is irrelevant
+  policy->groups.erase(disabled_group_id);
+  policy->groups.erase(enabled_group_id);
+}
+
+// DELETE bucket replication: empty-body response, status code only.
+void RGWDeleteBucketReplication_ObjStore_S3::send_response()
+{
+  if (op_ret)
+    set_req_state_err(s, op_ret);
+  dump_errno(s);
+  end_header(s, this, to_mime_type(s->format));
+  dump_start(s);
+}
+
+// ListBuckets: emit headers and open the <Buckets> array. Sets sent_data so
+// that send_response_data()/send_response_end() know the listing started.
+void RGWListBuckets_ObjStore_S3::send_response_begin(bool has_buckets)
+{
+  if (op_ret)
+    set_req_state_err(s, op_ret);
+  dump_errno(s);
+  dump_start(s);
+  // Explicitly use chunked transfer encoding so that we can stream the result
+  // to the user without having to wait for the full length of it.
+  end_header(s, NULL, to_mime_type(s->format), CHUNKED_TRANSFER_ENCODING);
+
+  if (! op_ret) {
+    list_all_buckets_start(s);
+    dump_owner(s, s->user->get_id(), s->user->get_display_name());
+    s->formatter->open_array_section("Buckets");
+    sent_data = true;
+  }
+}
+
+// ListBuckets: stream one batch of bucket entries into the already-open
+// "Buckets" array. No-op unless send_response_begin() opened the listing.
+void RGWListBuckets_ObjStore_S3::send_response_data(rgw::sal::BucketList& buckets)
+{
+  if (!sent_data)
+    return;
+
+  for (const auto& [name, bucket] : buckets.get_buckets()) {
+    dump_bucket(s, *bucket);
+  }
+  rgw_flush_formatter(s, s->formatter);
+}
+
+// ListBuckets: close the <Buckets> array opened by send_response_begin()
+// and finish the document. No-op if the listing never started.
+void RGWListBuckets_ObjStore_S3::send_response_end()
+{
+  if (sent_data) {
+    s->formatter->close_section();
+    list_all_buckets_end(s);
+    rgw_flush_formatter_and_reset(s, s->formatter);
+  }
+}
+
+// GetUsage: optional "start-date"/"end-date" query args bound the reported
+// interval; empty strings mean unbounded. Always succeeds.
+int RGWGetUsage_ObjStore_S3::get_params(optional_yield y)
+{
+  start_date = s->info.args.get("start-date");
+  end_date = s->info.args.get("end-date");
+  return 0;
+}
+
+// Emit the per-category usage breakdown of one usage-log entry as a
+// "categories" JSON array. A non-empty *categories acts as an allow-list;
+// categories not present in it are skipped.
+static void dump_usage_categories_info(Formatter *formatter, const rgw_usage_log_entry& entry, map<string, bool> *categories)
+{
+  formatter->open_array_section("categories");
+  for (const auto& [category, usage] : entry.usage_map) {
+    if (categories && !categories->empty() && !categories->count(category))
+      continue;
+    formatter->open_object_section("Entry");
+    encode_json("Category", category, formatter);
+    encode_json("BytesSent", usage.bytes_sent, formatter);
+    encode_json("BytesReceived", usage.bytes_received, formatter);
+    encode_json("Ops", usage.ops, formatter);
+    encode_json("SuccessfulOps", usage.successful_ops, formatter);
+    formatter->close_section(); // Entry
+  }
+  formatter->close_section(); // categories
+}
+
+// Emit one bucket's capacity entry (raw and allocation-rounded byte counts)
+// for the CapacityUsed section of the usage report.
+static void dump_usage_bucket_info(Formatter *formatter, const std::string& name, const bucket_meta_entry& entry)
+{
+  formatter->open_object_section("Entry");
+  encode_json("Bucket", name, formatter);
+  encode_json("Bytes", entry.size, formatter);
+  encode_json("Bytes_Rounded", entry.size_rounded, formatter);
+  formatter->close_section(); // entry
+}
+
+// GetUsage: stream the usage report -- per-bucket log entries (optional),
+// per-user summaries with quota info (optional), and per-bucket capacity.
+void RGWGetUsage_ObjStore_S3::send_response()
+{
+  if (op_ret < 0)
+    set_req_state_err(s, op_ret);
+  dump_errno(s);
+
+  // Explicitly use chunked transfer encoding so that we can stream the result
+  // to the user without having to wait for the full length of it.
+  end_header(s, this, to_mime_type(s->format), CHUNKED_TRANSFER_ENCODING);
+  dump_start(s);
+  if (op_ret < 0)
+    return;
+
+  Formatter *formatter = s->formatter;
+  string last_owner;
+  bool user_section_open = false;
+
+  formatter->open_object_section("Usage");
+  if (show_log_entries) {
+    formatter->open_array_section("Entries");
+  }
+  // entries are grouped by owner: open a new User section whenever the
+  // owner changes, closing the previous one first
+  map<rgw_user_bucket, rgw_usage_log_entry>::iterator iter;
+  for (iter = usage.begin(); iter != usage.end(); ++iter) {
+    const rgw_user_bucket& ub = iter->first;
+    const rgw_usage_log_entry& entry = iter->second;
+
+    if (show_log_entries) {
+      if (ub.user.compare(last_owner) != 0) {
+        if (user_section_open) {
+          formatter->close_section();
+          formatter->close_section();
+        }
+        formatter->open_object_section("User");
+        formatter->dump_string("Owner", ub.user);
+        formatter->open_array_section("Buckets");
+        user_section_open = true;
+        last_owner = ub.user;
+      }
+      formatter->open_object_section("Bucket");
+      formatter->dump_string("Bucket", ub.bucket);
+      utime_t ut(entry.epoch, 0);
+      ut.gmtime(formatter->dump_stream("Time"));
+      formatter->dump_int("Epoch", entry.epoch);
+      dump_usage_categories_info(formatter, entry, &categories);
+      formatter->close_section(); // bucket
+    }
+
+    // summaries are accumulated even when log entries are suppressed
+    summary_map[ub.user].aggregate(entry, &categories);
+  }
+
+  if (show_log_entries) {
+    if (user_section_open) {
+      formatter->close_section(); // buckets
+      formatter->close_section(); //user
+    }
+    formatter->close_section(); // entries
+  }
+
+  if (show_log_sum) {
+    formatter->open_array_section("Summary");
+    map<string, rgw_usage_log_entry>::iterator siter;
+    for (siter = summary_map.begin(); siter != summary_map.end(); ++siter) {
+      const rgw_usage_log_entry& entry = siter->second;
+      formatter->open_object_section("User");
+      formatter->dump_string("User", siter->first);
+      dump_usage_categories_info(formatter, entry, &categories);
+      rgw_usage_data total_usage;
+      entry.sum(total_usage, categories);
+      formatter->open_object_section("Total");
+      encode_json("BytesSent", total_usage.bytes_sent, formatter);
+      encode_json("BytesReceived", total_usage.bytes_received, formatter);
+      encode_json("Ops", total_usage.ops, formatter);
+      encode_json("SuccessfulOps", total_usage.successful_ops, formatter);
+      formatter->close_section(); // total
+      formatter->close_section(); // user
+    }
+
+    if (s->cct->_conf->rgw_rest_getusage_op_compat) {
+      formatter->open_object_section("Stats");
+    }
+
+    // send info about quota config
+    auto user_info = s->user->get_info();
+    encode_json("QuotaMaxBytes", user_info.quota.user_quota.max_size, formatter);
+    encode_json("QuotaMaxBuckets", user_info.max_buckets, formatter);
+    encode_json("QuotaMaxObjCount", user_info.quota.user_quota.max_objects, formatter);
+    /* fixed: the per-bucket byte and object-count quotas were swapped
+     * (max_objects was reported as bytes and max_size as a count) */
+    encode_json("QuotaMaxBytesPerBucket", user_info.quota.bucket_quota.max_size, formatter);
+    encode_json("QuotaMaxObjCountPerBucket", user_info.quota.bucket_quota.max_objects, formatter);
+    // send info about user's capacity utilization
+    encode_json("TotalBytes", stats.size, formatter);
+    encode_json("TotalBytesRounded", stats.size_rounded, formatter);
+    encode_json("TotalEntries", stats.num_objects, formatter);
+
+    if (s->cct->_conf->rgw_rest_getusage_op_compat) {
+      formatter->close_section(); //Stats
+    }
+
+    formatter->close_section(); // summary
+  }
+
+  formatter->open_array_section("CapacityUsed");
+  formatter->open_object_section("User");
+  formatter->open_array_section("Buckets");
+  for (const auto& biter : buckets_usage) {
+    const bucket_meta_entry& entry = biter.second;
+    dump_usage_bucket_info(formatter, biter.first, entry);
+  }
+  formatter->close_section(); // Buckets
+  formatter->close_section(); // User
+  formatter->close_section(); // CapacityUsed
+
+  formatter->close_section(); // usage
+  rgw_flush_formatter_and_reset(s, s->formatter);
+}
+
+// Parse the listing parameters shared by ListObjects v1/v2 and
+// ListObjectVersions: prefix, delimiter, max-keys, encoding-type, plus the
+// non-standard allow-unordered flag and (for system requests) shard id.
+// Returns 0 on success or a negative error on bad max-keys / shard id.
+int RGWListBucket_ObjStore_S3::get_common_params()
+{
+  list_versions = s->info.args.exists("versions");
+  prefix = s->info.args.get("prefix");
+
+  // non-standard
+  s->info.args.get_bool("allow-unordered", &allow_unordered, false);
+  delimiter = s->info.args.get("delimiter");
+  max_keys = s->info.args.get("max-keys");
+  op_ret = parse_max_keys();
+  if (op_ret < 0) {
+    return op_ret;
+  }
+  encoding_type = s->info.args.get("encoding-type");
+  if (s->system_request) {
+    s->info.args.get_bool("objs-container", &objs_container, false);
+    // explicit shard id from the header wins; otherwise fall back to the
+    // shard derived from the bucket instance
+    const char *shard_id_str = s->info.env->get("HTTP_RGWX_SHARD_ID");
+    if (shard_id_str) {
+      string err;
+      shard_id = strict_strtol(shard_id_str, 10, &err);
+      if (!err.empty()) {
+        ldpp_dout(this, 5) << "bad shard id specified: " << shard_id_str << dendl;
+        return -EINVAL;
+      }
+    } else {
+      shard_id = s->bucket_instance_shard_id;
+    }
+  }
+  return 0;
+}
+
+// ListObjects v1 / ListObjectVersions: common params plus the pagination
+// marker ("marker", or "key-marker"/"version-id-marker" for versions).
+int RGWListBucket_ObjStore_S3::get_params(optional_yield y)
+{
+  int ret = get_common_params();
+  if (ret < 0) {
+    return ret;
+  }
+  if (!list_versions) {
+    marker = s->info.args.get("marker");
+  } else {
+    marker.name = s->info.args.get("key-marker");
+    marker.instance = s->info.args.get("version-id-marker");
+  }
+  return 0;
+}
+
+// ListObjectsV2: common params plus the v2 pagination scheme -- a
+// "continuation-token" resumes a listing, otherwise "start-after" (if any)
+// seeds the marker.
+int RGWListBucket_ObjStore_S3v2::get_params(optional_yield y)
+{
+  int ret = get_common_params();
+  if (ret < 0) {
+    return ret;
+  }
+
+  s->info.args.get_bool("fetch-owner", &fetchOwner, false);
+  startAfter = s->info.args.get("start-after", &start_after_exist);
+  continuation_token = s->info.args.get("continuation-token", &continuation_token_exist);
+  marker = continuation_token_exist ? continuation_token : startAfter;
+  return 0;
+}
+
+// Emit the response fields shared by the versioned listings: bucket
+// identity, listing parameters, truncation flag and common prefixes.
+void RGWListBucket_ObjStore_S3::send_common_versioned_response()
+{
+  if (!s->bucket_tenant.empty()) {
+    s->formatter->dump_string("Tenant", s->bucket_tenant);
+  }
+  s->formatter->dump_string("Name", s->bucket_name);
+  s->formatter->dump_string("Prefix", prefix);
+  s->formatter->dump_int("MaxKeys", max);
+  if (!delimiter.empty()) {
+    /* NOTE(review): unlike send_common_response(), the delimiter is not
+     * url-encoded here even when encoding-type=url was requested -- looks
+     * inconsistent; confirm against the AWS ListObjectVersions spec */
+    s->formatter->dump_string("Delimiter", delimiter);
+  }
+  s->formatter->dump_string("IsTruncated", (max && is_truncated ? "true"
+                                            : "false"));
+
+  if (!common_prefixes.empty()) {
+    map<string, bool>::iterator pref_iter;
+    for (pref_iter = common_prefixes.begin();
+         pref_iter != common_prefixes.end(); ++pref_iter) {
+      s->formatter->open_array_section("CommonPrefixes");
+      if (encode_key) {
+        s->formatter->dump_string("Prefix", url_encode(pref_iter->first, false));
+      } else {
+        s->formatter->dump_string("Prefix", pref_iter->first);
+      }
+
+      s->formatter->close_section();
+    }
+  }
+ }
+
+// ListObjectVersions: emit the full ListVersionsResult document, one
+// Version/DeleteMarker section per entry.
+void RGWListBucket_ObjStore_S3::send_versioned_response()
+{
+  s->formatter->open_object_section_in_ns("ListVersionsResult", XMLNS_AWS_S3);
+  if (strcasecmp(encoding_type.c_str(), "url") == 0) {
+    s->formatter->dump_string("EncodingType", "url");
+    encode_key = true;
+  }
+  RGWListBucket_ObjStore_S3::send_common_versioned_response();
+  s->formatter->dump_string("KeyMarker", marker.name);
+  s->formatter->dump_string("VersionIdMarker", marker.instance);
+  if (is_truncated && !next_marker.empty()) {
+    s->formatter->dump_string("NextKeyMarker", next_marker.name);
+    if (next_marker.instance.empty()) {
+      // unversioned entries are reported with the "null" version id
+      s->formatter->dump_string("NextVersionIdMarker", "null");
+    }
+    else {
+      s->formatter->dump_string("NextVersionIdMarker", next_marker.instance);
+    }
+  }
+
+  if (op_ret >= 0) {
+    if (objs_container) {
+      s->formatter->open_array_section("Entries");
+    }
+
+    vector<rgw_bucket_dir_entry>::iterator iter;
+    for (iter = objs.begin(); iter != objs.end(); ++iter) {
+      const char *section_name = (iter->is_delete_marker() ? "DeleteMarker"
+                                  : "Version");
+      s->formatter->open_object_section(section_name);
+      if (objs_container) {
+        s->formatter->dump_bool("IsDeleteMarker", iter->is_delete_marker());
+      }
+      rgw_obj_key key(iter->key);
+      if (encode_key) {
+        string key_name;
+        url_encode(key.name, key_name);
+        s->formatter->dump_string("Key", key_name);
+      }
+      else {
+        s->formatter->dump_string("Key", key.name);
+      }
+      string version_id = key.instance;
+      if (version_id.empty()) {
+        version_id = "null";
+      }
+      if (s->system_request) {
+        // extra internal metadata for rgw-to-rgw (sync) requests
+        if (iter->versioned_epoch > 0) {
+          s->formatter->dump_int("VersionedEpoch", iter->versioned_epoch);
+        }
+        s->formatter->dump_string("RgwxTag", iter->tag);
+        utime_t ut(iter->meta.mtime);
+        ut.gmtime_nsec(s->formatter->dump_stream("RgwxMtime"));
+      }
+      s->formatter->dump_string("VersionId", version_id);
+      s->formatter->dump_bool("IsLatest", iter->is_current());
+      dump_time(s, "LastModified", iter->meta.mtime);
+      if (!iter->is_delete_marker()) {
+        s->formatter->dump_format("ETag", "\"%s\"", iter->meta.etag.c_str());
+        s->formatter->dump_int("Size", iter->meta.accounted_size);
+        auto& storage_class = rgw_placement_rule::get_canonical_storage_class(iter->meta.storage_class);
+        s->formatter->dump_string("StorageClass", storage_class.c_str());
+      }
+      dump_owner(s, rgw_user(iter->meta.owner), iter->meta.owner_display_name);
+      if (iter->meta.appendable) {
+        s->formatter->dump_string("Type", "Appendable");
+      } else {
+        s->formatter->dump_string("Type", "Normal");
+      }
+      s->formatter->close_section(); // Version/DeleteMarker
+    }
+    if (objs_container) {
+      s->formatter->close_section(); // Entries
+    }
+  }
+  /* fixed: closing the ListVersionsResult section was inside the op_ret
+   * check, leaving the document unbalanced on error before the flush */
+  s->formatter->close_section(); // ListVersionsResult
+  rgw_flush_formatter_and_reset(s, s->formatter);
+}
+
+
+// Emit the response fields shared by the flat (non-versioned) listings:
+// bucket identity, listing parameters, truncation flag, common prefixes.
+// The delimiter and prefixes honor encoding-type=url via encode_key.
+void RGWListBucket_ObjStore_S3::send_common_response()
+{
+  if (!s->bucket_tenant.empty()) {
+    s->formatter->dump_string("Tenant", s->bucket_tenant);
+  }
+  s->formatter->dump_string("Name", s->bucket_name);
+  s->formatter->dump_string("Prefix", prefix);
+  s->formatter->dump_int("MaxKeys", max);
+  if (!delimiter.empty()) {
+    if (encode_key) {
+      s->formatter->dump_string("Delimiter", url_encode(delimiter, false));
+    } else {
+      s->formatter->dump_string("Delimiter", delimiter);
+    }
+  }
+  s->formatter->dump_string("IsTruncated", (max && is_truncated ? "true"
+                                            : "false"));
+
+  if (!common_prefixes.empty()) {
+    map<string, bool>::iterator pref_iter;
+    for (pref_iter = common_prefixes.begin();
+         pref_iter != common_prefixes.end(); ++pref_iter) {
+      s->formatter->open_array_section("CommonPrefixes");
+      if (encode_key) {
+        s->formatter->dump_string("Prefix", url_encode(pref_iter->first, false));
+      } else {
+        s->formatter->dump_string("Prefix", pref_iter->first);
+      }
+      s->formatter->close_section();
+    }
+  }
+ }
+
+// ListObjects v1: headers, then either the versioned response or the flat
+// ListBucketResult document with one Contents section per object.
+void RGWListBucket_ObjStore_S3::send_response()
+{
+  if (op_ret < 0) {
+    set_req_state_err(s, op_ret);
+  }
+  dump_errno(s);
+
+  // Explicitly use chunked transfer encoding so that we can stream the result
+  // to the user without having to wait for the full length of it.
+  end_header(s, this, to_mime_type(s->format), CHUNKED_TRANSFER_ENCODING);
+  dump_start(s);
+  if (op_ret < 0) {
+    return;
+  }
+  if (list_versions) {
+    send_versioned_response();
+    return;
+  }
+
+  s->formatter->open_object_section_in_ns("ListBucketResult", XMLNS_AWS_S3);
+  if (strcasecmp(encoding_type.c_str(), "url") == 0) {
+    s->formatter->dump_string("EncodingType", "url");
+    encode_key = true;
+  }
+
+  RGWListBucket_ObjStore_S3::send_common_response();
+
+  if (op_ret >= 0) {
+    // JSON output wraps all entries in a single "Contents" array
+    if (s->format == RGWFormat::JSON) {
+      s->formatter->open_array_section("Contents");
+    }
+    vector<rgw_bucket_dir_entry>::iterator iter;
+    for (iter = objs.begin(); iter != objs.end(); ++iter) {
+
+      rgw_obj_key key(iter->key);
+      std::string key_name;
+
+      if (encode_key) {
+        url_encode(key.name, key_name);
+      } else {
+        key_name = key.name;
+      }
+      /* conditionally format JSON in the obvious way--I'm unsure if
+       * AWS actually does this */
+      if (s->format == RGWFormat::XML) {
+        s->formatter->open_array_section("Contents");
+      } else {
+        // json
+        s->formatter->open_object_section("dummy");
+      }
+      s->formatter->dump_string("Key", key_name);
+      dump_time(s, "LastModified", iter->meta.mtime);
+      s->formatter->dump_format("ETag", "\"%s\"", iter->meta.etag.c_str());
+      s->formatter->dump_int("Size", iter->meta.accounted_size);
+      auto& storage_class = rgw_placement_rule::get_canonical_storage_class(iter->meta.storage_class);
+      s->formatter->dump_string("StorageClass", storage_class.c_str());
+      dump_owner(s, rgw_user(iter->meta.owner), iter->meta.owner_display_name);
+      if (s->system_request) {
+        s->formatter->dump_string("RgwxTag", iter->tag);
+      }
+      if (iter->meta.appendable) {
+        s->formatter->dump_string("Type", "Appendable");
+      } else {
+        s->formatter->dump_string("Type", "Normal");
+      }
+      // JSON has one extra section per element
+      s->formatter->close_section();
+    } // foreach obj
+    if (s->format == RGWFormat::JSON) {
+      s->formatter->close_section();
+    }
+  }
+  s->formatter->dump_string("Marker", marker.name);
+  if (is_truncated && !next_marker.empty()) {
+    s->formatter->dump_string("NextMarker", next_marker.name);
+  }
+  s->formatter->close_section();
+  rgw_flush_formatter_and_reset(s, s->formatter);
+} /* RGWListBucket_ObjStore_S3::send_response() */
+
+// ListObjectsV2, versioned variant: emit the ListVersionsResult document
+// using the v2 continuation-token element names.
+void RGWListBucket_ObjStore_S3v2::send_versioned_response()
+{
+  s->formatter->open_object_section_in_ns("ListVersionsResult", XMLNS_AWS_S3);
+  RGWListBucket_ObjStore_S3v2::send_common_versioned_response();
+  s->formatter->dump_string("KeyContinuationToken", marker.name);
+  s->formatter->dump_string("VersionIdContinuationToken", marker.instance);
+  if (is_truncated && !next_marker.empty()) {
+    s->formatter->dump_string("NextKeyContinuationToken", next_marker.name);
+    s->formatter->dump_string("NextVersionIdContinuationToken", next_marker.instance);
+  }
+
+  if (strcasecmp(encoding_type.c_str(), "url") == 0) {
+    s->formatter->dump_string("EncodingType", "url");
+    encode_key = true;
+  }
+
+  if (op_ret >= 0) {
+    if (objs_container) {
+      s->formatter->open_array_section("Entries");
+    }
+
+    vector<rgw_bucket_dir_entry>::iterator iter;
+    for (iter = objs.begin(); iter != objs.end(); ++iter) {
+      const char *section_name = (iter->is_delete_marker() ? "DeleteContinuationToken"
+                                  : "Version");
+      s->formatter->open_object_section(section_name);
+      if (objs_container) {
+        s->formatter->dump_bool("IsDeleteContinuationToken", iter->is_delete_marker());
+      }
+      rgw_obj_key key(iter->key);
+      if (encode_key) {
+        string key_name;
+        url_encode(key.name, key_name);
+        s->formatter->dump_string("Key", key_name);
+      }
+      else {
+        s->formatter->dump_string("Key", key.name);
+      }
+      string version_id = key.instance;
+      if (version_id.empty()) {
+        version_id = "null";
+      }
+      if (s->system_request) {
+        // extra internal metadata for rgw-to-rgw (sync) requests
+        if (iter->versioned_epoch > 0) {
+          s->formatter->dump_int("VersionedEpoch", iter->versioned_epoch);
+        }
+        s->formatter->dump_string("RgwxTag", iter->tag);
+        utime_t ut(iter->meta.mtime);
+        ut.gmtime_nsec(s->formatter->dump_stream("RgwxMtime"));
+      }
+      s->formatter->dump_string("VersionId", version_id);
+      s->formatter->dump_bool("IsLatest", iter->is_current());
+      dump_time(s, "LastModified", iter->meta.mtime);
+      if (!iter->is_delete_marker()) {
+        s->formatter->dump_format("ETag", "\"%s\"", iter->meta.etag.c_str());
+        s->formatter->dump_int("Size", iter->meta.accounted_size);
+        auto& storage_class = rgw_placement_rule::get_canonical_storage_class(iter->meta.storage_class);
+        s->formatter->dump_string("StorageClass", storage_class.c_str());
+      }
+      if (fetchOwner == true) {
+        dump_owner(s, rgw_user(iter->meta.owner), iter->meta.owner_display_name);
+      }
+      s->formatter->close_section();
+    }
+
+
+    if (objs_container) {
+      s->formatter->close_section();
+    }
+
+    if (!common_prefixes.empty()) {
+      map<string, bool>::iterator pref_iter;
+      for (pref_iter = common_prefixes.begin();
+           pref_iter != common_prefixes.end(); ++pref_iter) {
+        s->formatter->open_array_section("CommonPrefixes");
+        if (encode_key) {
+          s->formatter->dump_string("Prefix", url_encode(pref_iter->first, false));
+        } else {
+          s->formatter->dump_string("Prefix", pref_iter->first);
+        }
+
+        s->formatter->dump_int("KeyCount",objs.size());
+        if (start_after_exist) {
+          s->formatter->dump_string("StartAfter", startAfter);
+        }
+        s->formatter->close_section();
+      }
+    }
+  }
+  /* fixed: the closing of ListVersionsResult and the flush were inside the
+   * op_ret check, so on error the section stayed open and nothing was
+   * flushed */
+  s->formatter->close_section(); // ListVersionsResult
+  rgw_flush_formatter_and_reset(s, s->formatter);
+}
+
+// ListObjectsV2: headers, then either the versioned response or the flat
+// ListBucketResult with v2 extras (KeyCount, tokens, optional owner).
+void RGWListBucket_ObjStore_S3v2::send_response()
+{
+  if (op_ret < 0) {
+    set_req_state_err(s, op_ret);
+  }
+  dump_errno(s);
+
+  // Explicitly use chunked transfer encoding so that we can stream the result
+  // to the user without having to wait for the full length of it.
+  end_header(s, this, to_mime_type(s->format), CHUNKED_TRANSFER_ENCODING);
+  dump_start(s);
+  if (op_ret < 0) {
+    return;
+  }
+  if (list_versions) {
+    send_versioned_response();
+    return;
+  }
+
+  s->formatter->open_object_section_in_ns("ListBucketResult", XMLNS_AWS_S3);
+  if (strcasecmp(encoding_type.c_str(), "url") == 0) {
+    s->formatter->dump_string("EncodingType", "url");
+    encode_key = true;
+  }
+
+  RGWListBucket_ObjStore_S3::send_common_response();
+  if (op_ret >= 0) {
+    vector<rgw_bucket_dir_entry>::iterator iter;
+    for (iter = objs.begin(); iter != objs.end(); ++iter) {
+      rgw_obj_key key(iter->key);
+      s->formatter->open_array_section("Contents");
+      if (encode_key) {
+        string key_name;
+        url_encode(key.name, key_name);
+        s->formatter->dump_string("Key", key_name);
+      }
+      else {
+        s->formatter->dump_string("Key", key.name);
+      }
+      dump_time(s, "LastModified", iter->meta.mtime);
+      s->formatter->dump_format("ETag", "\"%s\"", iter->meta.etag.c_str());
+      s->formatter->dump_int("Size", iter->meta.accounted_size);
+      auto& storage_class = rgw_placement_rule::get_canonical_storage_class(iter->meta.storage_class);
+      s->formatter->dump_string("StorageClass", storage_class.c_str());
+      // v2 only reports the owner when fetch-owner was requested
+      if (fetchOwner == true) {
+        dump_owner(s, rgw_user(iter->meta.owner), iter->meta.owner_display_name);
+      }
+      if (s->system_request) {
+        s->formatter->dump_string("RgwxTag", iter->tag);
+      }
+      if (iter->meta.appendable) {
+        s->formatter->dump_string("Type", "Appendable");
+      } else {
+        s->formatter->dump_string("Type", "Normal");
+      }
+      s->formatter->close_section();
+    }
+  }
+  if (continuation_token_exist) {
+    s->formatter->dump_string("ContinuationToken", continuation_token);
+  }
+  if (is_truncated && !next_marker.empty()) {
+    s->formatter->dump_string("NextContinuationToken", next_marker.name);
+  }
+  // KeyCount counts both objects and common prefixes, per the v2 API
+  s->formatter->dump_int("KeyCount", objs.size() + common_prefixes.size());
+  if (start_after_exist) {
+    s->formatter->dump_string("StartAfter", startAfter);
+  }
+  s->formatter->close_section();
+  rgw_flush_formatter_and_reset(s, s->formatter);
+}
+
+// GetBucketLogging: bucket logging is not implemented, so always return an
+// empty <BucketLoggingStatus> element (logging disabled).
+void RGWGetBucketLogging_ObjStore_S3::send_response()
+{
+  dump_errno(s);
+  end_header(s, this, to_mime_type(s->format));
+  dump_start(s);
+
+  s->formatter->open_object_section_in_ns("BucketLoggingStatus", XMLNS_AWS_S3);
+  s->formatter->close_section();
+  rgw_flush_formatter_and_reset(s, s->formatter);
+}
+
+// GetBucketLocation: report the bucket's zonegroup api_name as the S3
+// LocationConstraint. If the zonegroup cannot be resolved, fall back to the
+// raw zonegroup id -- except for "default", which maps to the empty
+// constraint like the AWS us-east-1 convention.
+void RGWGetBucketLocation_ObjStore_S3::send_response()
+{
+  dump_errno(s);
+  end_header(s, this);
+  dump_start(s);
+
+  string api_name;
+
+  std::unique_ptr<rgw::sal::ZoneGroup> zonegroup;
+  if (driver->get_zonegroup(s->bucket->get_info().zonegroup, &zonegroup) >= 0) {
+    api_name = zonegroup->get_api_name();
+  } else if (s->bucket->get_info().zonegroup != "default") {
+    api_name = s->bucket->get_info().zonegroup;
+  }
+
+  s->formatter->dump_format_ns("LocationConstraint", XMLNS_AWS_S3,
+                               "%s", api_name.c_str());
+  rgw_flush_formatter_and_reset(s, s->formatter);
+}
+
+// GetBucketVersioning: an empty <VersioningConfiguration> means versioning
+// was never configured; otherwise Status and MfaDelete are reported.
+void RGWGetBucketVersioning_ObjStore_S3::send_response()
+{
+  if (op_ret)
+    set_req_state_err(s, op_ret);
+  dump_errno(s);
+  end_header(s, this, to_mime_type(s->format));
+  dump_start(s);
+
+  s->formatter->open_object_section_in_ns("VersioningConfiguration", XMLNS_AWS_S3);
+  if (versioned) {
+    const char *status = (versioning_enabled ? "Enabled" : "Suspended");
+    s->formatter->dump_string("Status", status);
+    const char *mfa_status = (mfa_enabled ? "Enabled" : "Disabled");
+    s->formatter->dump_string("MfaDelete", mfa_status);
+  }
+  s->formatter->close_section();
+  rgw_flush_formatter_and_reset(s, s->formatter);
+}
+
+// Decoded <VersioningConfiguration> request body for PutBucketVersioning.
+struct ver_config_status {
+  // resulting versioning state; anything other than "Enabled"/"Suspended"
+  // (including a missing <Status>) yields VersioningStatusInvalid
+  int status{VersioningSuspended};
+
+  enum MFAStatus {
+    MFA_UNKNOWN,   // <MfaDelete> absent
+    MFA_DISABLED,
+    MFA_ENABLED,
+  } mfa_status{MFA_UNKNOWN};
+  // set to -EINVAL when <MfaDelete> is present but unrecognized
+  int retcode{0};
+
+  void decode_xml(XMLObj *obj) {
+    string status_str;
+    string mfa_str;
+    RGWXMLDecoder::decode_xml("Status", status_str, obj);
+    if (status_str == "Enabled") {
+      status = VersioningEnabled;
+    } else if (status_str != "Suspended") {
+      status = VersioningStatusInvalid;
+    }
+
+
+    if (RGWXMLDecoder::decode_xml("MfaDelete", mfa_str, obj)) {
+      if (mfa_str == "Enabled") {
+        mfa_status = MFA_ENABLED;
+      } else if (mfa_str == "Disabled") {
+        mfa_status = MFA_DISABLED;
+      } else {
+        retcode = -EINVAL;
+      }
+    }
+  }
+};
+
+// PutBucketVersioning: parse the XML body into versioning_status and the
+// optional MFA-delete flag; keep the raw body for forwarding when this zone
+// is not the metadata master.
+// Returns 0 on success or a negative error on read/parse/validation failure.
+int RGWSetBucketVersioning_ObjStore_S3::get_params(optional_yield y)
+{
+  int r = 0;
+  bufferlist data;
+  std::tie(r, data) =
+    read_all_input(s, s->cct->_conf->rgw_max_put_param_size, false);
+  if (r < 0) {
+    return r;
+  }
+
+  RGWXMLDecoder::XMLParser parser;
+  if (!parser.init()) {
+    ldpp_dout(this, 0) << "ERROR: failed to initialize parser" << dendl;
+    return -EIO;
+  }
+
+  char* buf = data.c_str();
+  if (!parser.parse(buf, data.length(), 1)) {
+    ldpp_dout(this, 10) << "NOTICE: failed to parse data: " << buf << dendl;
+    r = -EINVAL;
+    return r;
+  }
+
+  ver_config_status status_conf;
+
+  if (!RGWXMLDecoder::decode_xml("VersioningConfiguration", status_conf, &parser)) {
+    ldpp_dout(this, 10) << "NOTICE: bad versioning config input" << dendl;
+    return -EINVAL;
+  }
+
+  if (!driver->is_meta_master()) {
+    /* only need to keep this data around if we're not meta master */
+    in_data.append(data);
+  }
+
+  versioning_status = status_conf.status;
+  if (versioning_status == VersioningStatusInvalid) {
+    r = -EINVAL;
+  }
+
+  // MFA flag is only applied when the request actually carried <MfaDelete>
+  if (status_conf.mfa_status != ver_config_status::MFA_UNKNOWN) {
+    mfa_set_status = true;
+    switch (status_conf.mfa_status) {
+    case ver_config_status::MFA_DISABLED:
+      mfa_status = false;
+      break;
+    case ver_config_status::MFA_ENABLED:
+      mfa_status = true;
+      break;
+    default:
+      ldpp_dout(this, 0) << "ERROR: RGWSetBucketVersioning_ObjStore_S3::get_params(optional_yield y): unexpected switch case mfa_status=" << status_conf.mfa_status << dendl;
+      r = -EIO;
+    }
+  } else if (status_conf.retcode < 0) {
+    r = status_conf.retcode;
+  }
+  return r;
+}
+
+// PutBucketVersioning: no response body, just status and standard headers.
+void RGWSetBucketVersioning_ObjStore_S3::send_response()
+{
+ if (op_ret != 0) {
+ set_req_state_err(s, op_ret);
+ }
+ dump_errno(s);
+ end_header(s, this, to_mime_type(s->format));
+}
+
+// Parse and validate a PutBucketWebsite request body into website_conf.
+// Enforces AWS-style constraints: RedirectAllRequestsTo needs a hostname,
+// otherwise an IndexDocument Suffix is mandatory, and the number of routing
+// rules is capped.  Returns 0 on success or a negative error code.
+int RGWSetBucketWebsite_ObjStore_S3::get_params(optional_yield y)
+{
+ const auto max_size = s->cct->_conf->rgw_max_put_param_size;
+
+ int r = 0;
+ bufferlist data;
+ std::tie(r, data) = read_all_input(s, max_size, false);
+
+ if (r < 0) {
+ return r;
+ }
+
+ // keep the raw body for forwarding (e.g. to the metadata master zone)
+ in_data.append(data);
+
+ RGWXMLDecoder::XMLParser parser;
+ if (!parser.init()) {
+ ldpp_dout(this, 0) << "ERROR: failed to initialize parser" << dendl;
+ return -EIO;
+ }
+
+ char* buf = data.c_str();
+ if (!parser.parse(buf, data.length(), 1)) {
+ ldpp_dout(this, 5) << "failed to parse xml: " << buf << dendl;
+ return -EINVAL;
+ }
+
+ try {
+ RGWXMLDecoder::decode_xml("WebsiteConfiguration", website_conf, &parser, true);
+ } catch (RGWXMLDecoder::err& err) {
+ ldpp_dout(this, 5) << "unexpected xml: " << buf << dendl;
+ return -EINVAL;
+ }
+
+ // semantic validation mirroring AWS error messages
+ if (website_conf.is_redirect_all && website_conf.redirect_all.hostname.empty()) {
+ s->err.message = "A host name must be provided to redirect all requests (e.g. \"example.com\").";
+ ldpp_dout(this, 5) << s->err.message << dendl;
+ return -EINVAL;
+ } else if (!website_conf.is_redirect_all && !website_conf.is_set_index_doc) {
+ s->err.message = "A value for IndexDocument Suffix must be provided if RedirectAllRequestsTo is empty";
+ ldpp_dout(this, 5) << s->err.message << dendl;
+ return -EINVAL;
+ } else if (!website_conf.is_redirect_all && website_conf.is_set_index_doc &&
+ website_conf.index_doc_suffix.empty()) {
+ s->err.message = "The IndexDocument Suffix is not well formed";
+ ldpp_dout(this, 5) << s->err.message << dendl;
+ return -EINVAL;
+ }
+
+// fallback cap used when the config option is set to a negative value
+#define WEBSITE_ROUTING_RULES_MAX_NUM 50
+ int max_num = s->cct->_conf->rgw_website_routing_rules_max_num;
+ if (max_num < 0) {
+ max_num = WEBSITE_ROUTING_RULES_MAX_NUM;
+ }
+ int routing_rules_num = website_conf.routing_rules.rules.size();
+ if (routing_rules_num > max_num) {
+ ldpp_dout(this, 4) << "An website routing config can have up to "
+ << max_num
+ << " rules, request website routing rules num: "
+ << routing_rules_num << dendl;
+ op_ret = -ERR_INVALID_WEBSITE_ROUTING_RULES_ERROR;
+ s->err.message = std::to_string(routing_rules_num) +" routing rules provided, the number of routing rules in a website configuration is limited to "
+ + std::to_string(max_num)
+ + ".";
+ return -ERR_INVALID_REQUEST;
+ }
+
+ return 0;
+}
+
+// PutBucketWebsite: no response body, just status and standard headers.
+void RGWSetBucketWebsite_ObjStore_S3::send_response()
+{
+ if (op_ret < 0) {
+ set_req_state_err(s, op_ret);
+ }
+ dump_errno(s);
+ end_header(s, this, to_mime_type(s->format));
+}
+
+// DeleteBucketWebsite: a successful delete is reported as 204 No Content.
+void RGWDeleteBucketWebsite_ObjStore_S3::send_response()
+{
+ if (!op_ret) {
+ op_ret = STATUS_NO_CONTENT;
+ }
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s, this, to_mime_type(s->format));
+}
+
+// GetBucketWebsite: return the bucket's stored website configuration as a
+// <WebsiteConfiguration> XML document.
+void RGWGetBucketWebsite_ObjStore_S3::send_response()
+{
+ if (op_ret)
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s, this, to_mime_type(s->format));
+ dump_start(s);
+
+ // status and headers are already out; on error there is no body to emit
+ if (op_ret < 0) {
+ return;
+ }
+
+ RGWBucketWebsiteConf& conf = s->bucket->get_info().website_conf;
+
+ s->formatter->open_object_section_in_ns("WebsiteConfiguration", XMLNS_AWS_S3);
+ conf.dump_xml(s->formatter);
+ s->formatter->close_section(); // WebsiteConfiguration
+ rgw_flush_formatter_and_reset(s, s->formatter);
+}
+
+// Emit bucket usage statistics as X-RGW-* response headers; quota limits are
+// included only when the requester owns the bucket.
+static void dump_bucket_metadata(req_state *s, rgw::sal::Bucket* bucket)
+{
+ dump_header(s, "X-RGW-Object-Count", static_cast<long long>(bucket->get_count()));
+ dump_header(s, "X-RGW-Bytes-Used", static_cast<long long>(bucket->get_size()));
+ // only bucket's owner is allowed to get the quota settings of the account
+ if (bucket->is_owner(s->user.get())) {
+ auto user_info = s->user->get_info();
+ auto bucket_quota = s->bucket->get_info().quota; // bucket quota
+ dump_header(s, "X-RGW-Quota-User-Size", static_cast<long long>(user_info.quota.user_quota.max_size));
+ dump_header(s, "X-RGW-Quota-User-Objects", static_cast<long long>(user_info.quota.user_quota.max_objects));
+ dump_header(s, "X-RGW-Quota-Max-Buckets", static_cast<long long>(user_info.max_buckets));
+ dump_header(s, "X-RGW-Quota-Bucket-Size", static_cast<long long>(bucket_quota.max_size));
+ dump_header(s, "X-RGW-Quota-Bucket-Objects", static_cast<long long>(bucket_quota.max_objects));
+ }
+}
+
+// HEAD bucket: respond with status plus the X-RGW-* usage/quota headers;
+// there is never a body.
+void RGWStatBucket_ObjStore_S3::send_response()
+{
+ if (op_ret >= 0) {
+ dump_bucket_metadata(s, bucket.get());
+ }
+
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+
+ end_header(s, this);
+ dump_start(s);
+}
+
+// Build the request's ACL policy either from explicit x-amz-grant-* style
+// headers or from a canned ACL string.  Supplying both forms at once is
+// rejected as an invalid request.
+static int create_s3_policy(req_state *s, rgw::sal::Driver* driver,
+			    RGWAccessControlPolicy_S3& s3policy,
+			    ACLOwner& owner)
+{
+ if (!s->has_acl_header) {
+ return s3policy.create_canned(owner, s->bucket_owner, s->canned_acl);
+ }
+ // explicit grant headers and a canned ACL are mutually exclusive
+ if (!s->canned_acl.empty()) {
+ return -ERR_INVALID_REQUEST;
+ }
+ return s3policy.create_from_headers(s, driver, s->info.env, owner);
+}
+
+// XMLObj node for a <LocationConstraint> element; captures the element's
+// text content when the closing tag is seen.
+class RGWLocationConstraint : public XMLObj
+{
+public:
+ RGWLocationConstraint() {}
+ ~RGWLocationConstraint() override {}
+ bool xml_end(const char *el) override {
+ if (!el)
+ return false;
+
+ location_constraint = get_data();
+
+ return true;
+ }
+
+ string location_constraint; // text content of the element, e.g. a zonegroup name
+};
+
+// Placeholder XMLObj for the <CreateBucketConfiguration> container element;
+// the interesting child (<LocationConstraint>) is read via the parser.
+class RGWCreateBucketConfig : public XMLObj
+{
+public:
+ RGWCreateBucketConfig() {}
+ ~RGWCreateBucketConfig() override {}
+};
+
+// XML parser for the CreateBucket request body; exposes the value of
+// <CreateBucketConfiguration><LocationConstraint>.
+class RGWCreateBucketParser : public RGWXMLParser
+{
+ // every element is modeled as a generic XMLObj node
+ XMLObj *alloc_obj(const char *el) override {
+ return new XMLObj;
+ }
+
+public:
+ RGWCreateBucketParser() {}
+ ~RGWCreateBucketParser() override {}
+
+ // Copy the LocationConstraint text into zone_group; returns false when
+ // either element is absent from the parsed document.
+ bool get_location_constraint(string& zone_group) {
+ if (XMLObj *config = find_first("CreateBucketConfiguration")) {
+ if (XMLObj *constraint = config->find_first("LocationConstraint")) {
+ zone_group = constraint->get_data();
+ return true;
+ }
+ }
+ return false;
+ }
+};
+
+// Gather CreateBucket parameters: validate the bucket name, build the ACL
+// policy, parse the optional <CreateBucketConfiguration> body into
+// location_constraint / placement_rule, and read the object-lock header.
+// Returns 0 on success or a negative error code.
+int RGWCreateBucket_ObjStore_S3::get_params(optional_yield y)
+{
+ RGWAccessControlPolicy_S3 s3policy(s->cct);
+ bool relaxed_names = s->cct->_conf->rgw_relaxed_s3_bucket_names;
+
+ int r;
+ // system requests (inter-zone) may carry names that fail S3 validation
+ if (!s->system_request) {
+ r = valid_s3_bucket_name(s->bucket_name, relaxed_names);
+ if (r) return r;
+ }
+
+ r = create_s3_policy(s, driver, s3policy, s->owner);
+ if (r < 0)
+ return r;
+
+ policy = s3policy;
+
+ const auto max_size = s->cct->_conf->rgw_max_put_param_size;
+
+ int op_ret = 0;
+ bufferlist data;
+ std::tie(op_ret, data) = read_all_input(s, max_size, false);
+
+ // an absent body is fine for CreateBucket, so length-required is tolerated
+ if ((op_ret < 0) && (op_ret != -ERR_LENGTH_REQUIRED))
+ return op_ret;
+
+ in_data.append(data);
+
+ if (data.length()) {
+ RGWCreateBucketParser parser;
+
+ if (!parser.init()) {
+ ldpp_dout(this, 0) << "ERROR: failed to initialize parser" << dendl;
+ return -EIO;
+ }
+
+ char* buf = data.c_str();
+ bool success = parser.parse(buf, data.length(), 1);
+ ldpp_dout(this, 20) << "create bucket input data=" << buf << dendl;
+
+ if (!success) {
+ ldpp_dout(this, 0) << "failed to parse input: " << buf << dendl;
+ return -EINVAL;
+ }
+
+ if (!parser.get_location_constraint(location_constraint)) {
+ ldpp_dout(this, 0) << "provided input did not specify location constraint correctly" << dendl;
+ return -EINVAL;
+ }
+
+ ldpp_dout(this, 10) << "create bucket location constraint: "
+ << location_constraint << dendl;
+ }
+
+ // a constraint of the form "zonegroup:placement" selects a placement rule
+ size_t pos = location_constraint.find(':');
+ if (pos != string::npos) {
+ placement_rule.init(location_constraint.substr(pos + 1), s->info.storage_class);
+ location_constraint = location_constraint.substr(0, pos);
+ } else {
+ placement_rule.storage_class = s->info.storage_class;
+ }
+ // x-amz-bucket-object-lock-enabled must be exactly "true" or "false"
+ auto iter = s->info.x_meta_map.find("x-amz-bucket-object-lock-enabled");
+ if (iter != s->info.x_meta_map.end()) {
+ if (!boost::algorithm::iequals(iter->second, "true") && !boost::algorithm::iequals(iter->second, "false")) {
+ return -EINVAL;
+ }
+ obj_lock_enabled = boost::algorithm::iequals(iter->second, "true");
+ }
+ return 0;
+}
+
+// CreateBucket response.  "Bucket already exists" is reported as success
+// (presumably matching S3's idempotent create for the same owner — the
+// distinct ERR_BUCKET_EXISTS error is produced elsewhere when appropriate).
+// System (inter-zone) requests additionally get a JSON info body.
+void RGWCreateBucket_ObjStore_S3::send_response()
+{
+ if (op_ret == -ERR_BUCKET_EXISTS)
+ op_ret = 0;
+ if (op_ret)
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s);
+
+ if (op_ret < 0)
+ return;
+
+ if (s->system_request) {
+ JSONFormatter f; /* use json formatter for system requests output */
+
+ f.open_object_section("info");
+ encode_json("entry_point_object_ver", ep_objv, &f);
+ encode_json("object_ver", info.objv_tracker.read_version, &f);
+ encode_json("bucket_info", info, &f);
+ f.close_section();
+ rgw_flush_formatter_and_reset(s, &f);
+ }
+}
+
+// DeleteBucket: success is reported as 204 No Content; op_ret itself is
+// left untouched.
+void RGWDeleteBucket_ObjStore_S3::send_response()
+{
+ const int r = (op_ret == 0) ? STATUS_NO_CONTENT : op_ret;
+ set_req_state_err(s, r);
+ dump_errno(s);
+ end_header(s, this);
+}
+
+// Merge x-amz-meta-* (and optionally x-amz-server-side-encryption*) entries
+// supplied as query-string parameters into the maps normally populated from
+// request headers, so both sources are handled uniformly downstream.
+static inline void map_qs_metadata(req_state* s, bool crypto_too)
+{
+ /* merge S3 valid user metadata from the query-string into
+ * x_meta_map, which maps them to attributes */
+ // const_cast: get_params() is non-const on RGWHTTPArgs; no mutation occurs
+ const auto& params = const_cast<RGWHTTPArgs&>(s->info.args).get_params();
+ for (const auto& elt : params) {
+ std::string k = boost::algorithm::to_lower_copy(elt.first);
+ if (k.find("x-amz-meta-") == /* offset */ 0) {
+ rgw_add_amz_meta_header(s->info.x_meta_map, k, elt.second);
+ }
+ if (crypto_too && k.find("x-amz-server-side-encryption") == /* offset */ 0) {
+ rgw_set_amz_meta_header(s->info.crypt_attribute_map, k, elt.second, OVERWRITE);
+ }
+ }
+}
+
+// Collect and validate all PutObject request parameters: content length /
+// chunked encoding, encryption defaults, ACL policy, conditional headers,
+// object tagging, object-lock (retention + legal hold), multipart part
+// info, and append-position.  Returns 0 on success or a negative error.
+int RGWPutObj_ObjStore_S3::get_params(optional_yield y)
+{
+ // either Content-Length or Transfer-Encoding: chunked is mandatory
+ if (!s->length) {
+ const char *encoding = s->info.env->get("HTTP_TRANSFER_ENCODING");
+ if (!encoding || strcmp(encoding, "chunked") != 0) {
+ ldout(s->cct, 20) << "neither length nor chunked encoding" << dendl;
+ return -ERR_LENGTH_REQUIRED;
+ }
+
+ chunked_upload = true;
+ }
+
+ int ret;
+
+ map_qs_metadata(s, true);
+ ret = get_encryption_defaults(s);
+ if (ret < 0) {
+ ldpp_dout(this, 5) << __func__ << "(): get_encryption_defaults() returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ RGWAccessControlPolicy_S3 s3policy(s->cct);
+ ret = create_s3_policy(s, driver, s3policy, s->owner);
+ if (ret < 0)
+ return ret;
+
+ policy = s3policy;
+
+ // conditional-write preconditions
+ if_match = s->info.env->get("HTTP_IF_MATCH");
+ if_nomatch = s->info.env->get("HTTP_IF_NONE_MATCH");
+
+ /* handle object tagging */
+ auto tag_str = s->info.env->get("HTTP_X_AMZ_TAGGING");
+ if (tag_str){
+ obj_tags = std::make_unique<RGWObjTags>();
+ ret = obj_tags->set_from_string(tag_str);
+ if (ret < 0){
+ ldpp_dout(this,0) << "setting obj tags failed with " << ret << dendl;
+ if (ret == -ERR_INVALID_TAG){
+ ret = -EINVAL; //s3 returns only -EINVAL for PUT requests
+ }
+
+ return ret;
+ }
+ }
+
+ //handle object lock
+ auto obj_lock_mode_str = s->info.env->get("HTTP_X_AMZ_OBJECT_LOCK_MODE");
+ auto obj_lock_date_str = s->info.env->get("HTTP_X_AMZ_OBJECT_LOCK_RETAIN_UNTIL_DATE");
+ auto obj_legal_hold_str = s->info.env->get("HTTP_X_AMZ_OBJECT_LOCK_LEGAL_HOLD");
+ // retention needs both mode and a future retain-until date
+ if (obj_lock_mode_str && obj_lock_date_str) {
+ boost::optional<ceph::real_time> date = ceph::from_iso_8601(obj_lock_date_str);
+ if (boost::none == date || ceph::real_clock::to_time_t(*date) <= ceph_clock_now()) {
+ ret = -EINVAL;
+ ldpp_dout(this,0) << "invalid x-amz-object-lock-retain-until-date value" << dendl;
+ return ret;
+ }
+ if (strcmp(obj_lock_mode_str, "GOVERNANCE") != 0 && strcmp(obj_lock_mode_str, "COMPLIANCE") != 0) {
+ ret = -EINVAL;
+ ldpp_dout(this,0) << "invalid x-amz-object-lock-mode value" << dendl;
+ return ret;
+ }
+ obj_retention = new RGWObjectRetention(obj_lock_mode_str, *date);
+ } else if ((obj_lock_mode_str && !obj_lock_date_str) || (!obj_lock_mode_str && obj_lock_date_str)) {
+ ret = -EINVAL;
+ ldpp_dout(this,0) << "need both x-amz-object-lock-mode and x-amz-object-lock-retain-until-date " << dendl;
+ return ret;
+ }
+ if (obj_legal_hold_str) {
+ if (strcmp(obj_legal_hold_str, "ON") != 0 && strcmp(obj_legal_hold_str, "OFF") != 0) {
+ ret = -EINVAL;
+ ldpp_dout(this,0) << "invalid x-amz-object-lock-legal-hold value" << dendl;
+ return ret;
+ }
+ obj_legal_hold = new RGWObjectLegalHold(obj_legal_hold_str);
+ }
+ // lock settings require the bucket to have object lock enabled
+ if (!s->bucket->get_info().obj_lock_enabled() && (obj_retention || obj_legal_hold)) {
+ ldpp_dout(this, 0) << "ERROR: object retention or legal hold can't be set if bucket object lock not configured" << dendl;
+ ret = -ERR_INVALID_REQUEST;
+ return ret;
+ }
+ // multipart UploadPart: partNumber and uploadId must appear together
+ multipart_upload_id = s->info.args.get("uploadId");
+ multipart_part_str = s->info.args.get("partNumber");
+ if (!multipart_part_str.empty()) {
+ string err;
+ multipart_part_num = strict_strtol(multipart_part_str.c_str(), 10, &err);
+ if (!err.empty()) {
+ ldpp_dout(s, 10) << "bad part number: " << multipart_part_str << ": " << err << dendl;
+ return -EINVAL;
+ }
+ } else if (!multipart_upload_id.empty()) {
+ ldpp_dout(s, 10) << "part number with no multipart upload id" << dendl;
+ return -EINVAL;
+ }
+
+ // append-object extension: requires a non-negative "position" argument
+ append = s->info.args.exists("append");
+ if (append) {
+ string pos_str = s->info.args.get("position");
+ string err;
+ long long pos_tmp = strict_strtoll(pos_str.c_str(), 10, &err);
+ if (!err.empty()) {
+ ldpp_dout(s, 10) << "bad position: " << pos_str << ": " << err << dendl;
+ return -EINVAL;
+ } else if (pos_tmp < 0) {
+ ldpp_dout(s, 10) << "bad position: " << pos_str << ": " << "position shouldn't be negative" << dendl;
+ return -EINVAL;
+ }
+ position = uint64_t(pos_tmp);
+ }
+
+ return RGWPutObj_ObjStore::get_params(y);
+}
+
+// Read the next chunk of the PUT payload.  A return of 0 marks the end of
+// the stream, at which point any pending AWSv4 chunked-signature
+// verification is completed.
+int RGWPutObj_ObjStore_S3::get_data(bufferlist& bl)
+{
+ const int ret = RGWPutObj_ObjStore::get_data(bl);
+ if (ret != 0) {
+ return ret;
+ }
+ const int ret_auth = do_aws4_auth_completion();
+ return (ret_auth < 0) ? ret_auth : ret;
+}
+
+// Map a configured HTTP success code to the corresponding internal status
+// constant; 0 means "no override", leaving the default success status.
+static int get_success_retcode(int code)
+{
+ if (code == 201) {
+ return STATUS_CREATED;
+ }
+ if (code == 204) {
+ return STATUS_NO_CONTENT;
+ }
+ return 0;
+}
+
+// PutObject / UploadPartCopy response.  Plain PUTs answer with headers only
+// (ETag, version id, expiration); copy-source requests (UploadPartCopy)
+// instead emit a <CopyPartResult> XML body.  Append uploads advertise the
+// next write position; system requests get the object mtime as a header.
+void RGWPutObj_ObjStore_S3::send_response()
+{
+ if (op_ret) {
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ } else {
+ // optionally override the 200 with a configured 201/204 success code
+ if (s->cct->_conf->rgw_s3_success_create_obj_status) {
+ op_ret = get_success_retcode(
+ s->cct->_conf->rgw_s3_success_create_obj_status);
+ set_req_state_err(s, op_ret);
+ }
+
+ string expires = get_s3_expiration_header(s, mtime);
+
+ if (copy_source.empty()) {
+ dump_errno(s);
+ dump_etag(s, etag);
+ dump_content_length(s, 0);
+ dump_header_if_nonempty(s, "x-amz-version-id", version_id);
+ dump_header_if_nonempty(s, "x-amz-expiration", expires);
+ for (auto &it : crypt_http_responses)
+ dump_header(s, it.first, it.second);
+ } else {
+ // UploadPartCopy: body carries LastModified/ETag instead of headers
+ dump_errno(s);
+ dump_header_if_nonempty(s, "x-amz-version-id", version_id);
+ dump_header_if_nonempty(s, "x-amz-expiration", expires);
+ end_header(s, this, to_mime_type(s->format));
+ dump_start(s);
+ struct tm tmp;
+ utime_t ut(mtime);
+ time_t secs = (time_t)ut.sec();
+ gmtime_r(&secs, &tmp);
+ char buf[TIME_BUF_SIZE];
+ s->formatter->open_object_section_in_ns("CopyPartResult",
+ "http://s3.amazonaws.com/doc/2006-03-01/");
+ if (strftime(buf, sizeof(buf), "%Y-%m-%dT%T.000Z", &tmp) > 0) {
+ s->formatter->dump_string("LastModified", buf);
+ }
+ s->formatter->dump_string("ETag", etag);
+ s->formatter->close_section();
+ rgw_flush_formatter_and_reset(s, s->formatter);
+ // headers + body already fully emitted for the copy case
+ return;
+ }
+ }
+ if (append) {
+ if (op_ret == 0 || op_ret == -ERR_POSITION_NOT_EQUAL_TO_LENGTH) {
+ dump_header(s, "x-rgw-next-append-position", cur_accounted_size);
+ }
+ }
+ if (s->system_request && !real_clock::is_zero(mtime)) {
+ dump_epoch_header(s, "Rgwx-Mtime", mtime);
+ }
+ end_header(s, this);
+}
+
+// Store `value` under `key` in the xattr map, encoded into a bufferlist.
+static inline void set_attr(map<string, bufferlist>& attrs, const char* key, const std::string& value)
+{
+ bufferlist bl;
+ encode(value,bl);
+ attrs.emplace(key, std::move(bl));
+}
+
+// Overload for C-string values; same encoding as the std::string variant.
+static inline void set_attr(map<string, bufferlist>& attrs, const char* key, const char* value)
+{
+ bufferlist bl;
+ encode(value,bl);
+ attrs.emplace(key, std::move(bl));
+}
+
+// Build a decrypt filter for reading back an encrypted source object
+// (NOTE(review): presumably the copy-source path of PutObject — confirm
+// against callers).  Leaves *filter unset and returns 0 when the source is
+// not encrypted; returns a negative error on setup/decode failure.
+int RGWPutObj_ObjStore_S3::get_decrypt_filter(
+ std::unique_ptr<RGWGetObj_Filter>* filter,
+ RGWGetObj_Filter* cb,
+ map<string, bufferlist>& attrs,
+ bufferlist* manifest_bl)
+{
+ std::map<std::string, std::string> crypt_http_responses_unused;
+
+ std::unique_ptr<BlockCrypt> block_crypt;
+ int res = rgw_s3_prepare_decrypt(s, attrs, &block_crypt, crypt_http_responses_unused);
+ if (res < 0) {
+ return res;
+ }
+ // null block_crypt means the object is not encrypted: nothing to do
+ if (block_crypt == nullptr) {
+ return 0;
+ }
+
+ // in case of a multipart upload, we need to know the part lengths to
+ // correctly decrypt across part boundaries
+ std::vector<size_t> parts_len;
+
+ // for replicated objects, the original part lengths are preserved in an xattr
+ if (auto i = attrs.find(RGW_ATTR_CRYPT_PARTS); i != attrs.end()) {
+ try {
+ auto p = i->second.cbegin();
+ using ceph::decode;
+ decode(parts_len, p);
+ } catch (const buffer::error&) {
+ ldpp_dout(this, 1) << "failed to decode RGW_ATTR_CRYPT_PARTS" << dendl;
+ return -EIO;
+ }
+ } else if (manifest_bl) {
+ // otherwise, we read the part lengths from the manifest
+ res = RGWGetObj_BlockDecrypt::read_manifest_parts(this, *manifest_bl,
+ parts_len);
+ if (res < 0) {
+ return res;
+ }
+ }
+
+ *filter = std::make_unique<RGWGetObj_BlockDecrypt>(
+ s, s->cct, cb, std::move(block_crypt),
+ std::move(parts_len));
+ return 0;
+}
+
+// Wrap the data processor in an encryption filter when SSE applies.  For a
+// multipart UploadPart the crypto parameters are taken from the upload's
+// meta object (configured at CreateMultipartUpload); otherwise they come
+// from this request.  *filter is left unset when no encryption is needed.
+int RGWPutObj_ObjStore_S3::get_encrypt_filter(
+ std::unique_ptr<rgw::sal::DataProcessor> *filter,
+ rgw::sal::DataProcessor *cb)
+{
+ int res = 0;
+ if (!multipart_upload_id.empty()) {
+ std::unique_ptr<rgw::sal::MultipartUpload> upload =
+ s->bucket->get_multipart_upload(s->object->get_name(),
+ multipart_upload_id);
+ std::unique_ptr<rgw::sal::Object> obj = upload->get_meta_obj();
+ obj->set_in_extra_data(true);
+ res = obj->get_obj_attrs(s->yield, this);
+ if (res == 0) {
+ std::unique_ptr<BlockCrypt> block_crypt;
+ /* We are adding to existing object.
+ * We use crypto mode that configured as if we were decrypting. */
+ res = rgw_s3_prepare_decrypt(s, obj->get_attrs(), &block_crypt, crypt_http_responses);
+ if (res == 0 && block_crypt != nullptr)
+ filter->reset(new RGWPutObj_BlockEncrypt(s, s->cct, cb, std::move(block_crypt)));
+ }
+ /* it is ok, to not have encryption at all */
+ }
+ else
+ {
+ std::unique_ptr<BlockCrypt> block_crypt;
+ res = rgw_s3_prepare_encrypt(s, attrs, &block_crypt, crypt_http_responses);
+ if (res == 0 && block_crypt != nullptr) {
+ filter->reset(new RGWPutObj_BlockEncrypt(s, s->cct, cb, std::move(block_crypt)));
+ }
+ }
+ return res;
+}
+
+// Replace the "$(unknown)" placeholder in a POST Object key with the
+// uploaded file's name (taken from the Content-Disposition "filename"
+// parameter).  No-op when the key contains no placeholder.
+void RGWPostObj_ObjStore_S3::rebuild_key(rgw::sal::Object* obj)
+{
+ string key = obj->get_name();
+ static const string var = "$(unknown)";
+ // fix: string::find() returns size_type; compare against npos instead of
+ // relying on implementation-defined narrowing of npos to a negative int
+ const size_t pos = key.find(var);
+ if (pos == string::npos)
+ return;
+
+ string new_key = key.substr(0, pos);
+ new_key.append(filename);
+ new_key.append(key.substr(pos + var.size()));
+
+ obj->set_key(new_key);
+}
+
+// The object key doubles as the current upload's file name.
+std::string RGWPostObj_ObjStore_S3::get_current_filename() const
+{
+ return s->object->get_name();
+}
+
+// Content type captured from the form's "Content-Type" part (may be empty).
+std::string RGWPostObj_ObjStore_S3::get_current_content_type() const
+{
+ return content_type;
+}
+
+// Parse a browser-based POST Object upload (multipart/form-data): read form
+// parts up to the "file" part, collect encryption/metadata/tagging fields
+// into attrs, resolve the object key, and verify the POST policy.  Parts
+// after "file" are consumed later via get_data()/complete_get_params().
+// Returns 0 on success or a negative error (err_msg set for the client).
+int RGWPostObj_ObjStore_S3::get_params(optional_yield y)
+{
+ op_ret = RGWPostObj_ObjStore::get_params(y);
+ if (op_ret < 0) {
+ return op_ret;
+ }
+
+ map_qs_metadata(s, false);
+
+ // read form parts until the "file" part, which begins the payload
+ bool done;
+ do {
+ struct post_form_part part;
+ int r = read_form_part_header(&part, done);
+ if (r < 0)
+ return r;
+
+ if (s->cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
+ ldpp_dout(this, 20) << "read part header -- part.name="
+ << part.name << dendl;
+
+ for (const auto& pair : part.fields) {
+ ldpp_dout(this, 20) << "field.name=" << pair.first << dendl;
+ ldpp_dout(this, 20) << "field.val=" << pair.second.val << dendl;
+ ldpp_dout(this, 20) << "field.params:" << dendl;
+
+ for (const auto& param_pair : pair.second.params) {
+ ldpp_dout(this, 20) << " " << param_pair.first
+ << " -> " << param_pair.second << dendl;
+ }
+ }
+ }
+
+ if (done) { /* unexpected here */
+ // the form ended before a "file" part was seen
+ err_msg = "Malformed request";
+ return -EINVAL;
+ }
+
+ if (stringcasecmp(part.name, "file") == 0) { /* beginning of data transfer */
+ struct post_part_field& field = part.fields["Content-Disposition"];
+ map<string, string>::iterator iter = field.params.find("filename");
+ if (iter != field.params.end()) {
+ filename = iter->second;
+ }
+ parts[part.name] = part;
+ break;
+ }
+
+ // non-file part: slurp its value and expose it to the policy engine
+ bool boundary;
+ uint64_t chunk_size = s->cct->_conf->rgw_max_chunk_size;
+ r = read_data(part.data, chunk_size, boundary, done);
+ if (r < 0 || !boundary) {
+ err_msg = "Couldn't find boundary";
+ return -EINVAL;
+ }
+ parts[part.name] = part;
+ string part_str(part.data.c_str(), part.data.length());
+ env.add_var(part.name, part_str);
+ } while (!done);
+
+ // collect SSE-related form fields into the crypto attribute map
+ for (auto &p: parts) {
+ if (! boost::istarts_with(p.first, "x-amz-server-side-encryption")) {
+ continue;
+ }
+ bufferlist &d { p.second.data };
+ std::string v { rgw_trim_whitespace(std::string_view(d.c_str(), d.length())) };
+ rgw_set_amz_meta_header(s->info.crypt_attribute_map, p.first, v, OVERWRITE);
+ }
+ int r = get_encryption_defaults(s);
+ if (r < 0) {
+ ldpp_dout(this, 5) << __func__ << "(): get_encryption_defaults() returned ret=" << r << dendl;
+ return r;
+ }
+
+ ldpp_dout(this, 20) << "adding bucket to policy env: " << s->bucket->get_name()
+ << dendl;
+ env.add_var("bucket", s->bucket->get_name());
+
+ string object_str;
+ if (!part_str(parts, "key", &object_str)) {
+ err_msg = "Key not specified";
+ return -EINVAL;
+ }
+
+ s->object = s->bucket->get_object(rgw_obj_key(object_str));
+
+ // substitute the uploaded file name into the key's placeholder, if any
+ rebuild_key(s->object.get());
+
+ if (rgw::sal::Object::empty(s->object.get())) {
+ err_msg = "Empty object name";
+ return -EINVAL;
+ }
+
+ env.add_var("key", s->object->get_name());
+
+ part_str(parts, "Content-Type", &content_type);
+
+ /* AWS permits POST without Content-Type: http://tracker.ceph.com/issues/20201 */
+ if (! content_type.empty()) {
+ env.add_var("Content-Type", content_type);
+ }
+
+ std::string storage_class;
+ part_str(parts, "x-amz-storage-class", &storage_class);
+
+ if (! storage_class.empty()) {
+ s->dest_placement.storage_class = storage_class;
+ if (!driver->valid_placement(s->dest_placement)) {
+ ldpp_dout(this, 0) << "NOTICE: invalid dest placement: " << s->dest_placement.to_str() << dendl;
+ err_msg = "The storage class you specified is not valid";
+ return -EINVAL;
+ }
+ }
+
+ // copy x-amz-meta-* form fields into object xattrs (null-terminated)
+ map<string, struct post_form_part, ltstr_nocase>::iterator piter =
+ parts.upper_bound(RGW_AMZ_META_PREFIX);
+ for (; piter != parts.end(); ++piter) {
+ string n = piter->first;
+ if (strncasecmp(n.c_str(), RGW_AMZ_META_PREFIX,
+ sizeof(RGW_AMZ_META_PREFIX) - 1) != 0)
+ break;
+
+ string attr_name = RGW_ATTR_PREFIX;
+ attr_name.append(n);
+
+ /* need to null terminate it */
+ bufferlist& data = piter->second.data;
+ string str = string(data.c_str(), data.length());
+
+ bufferlist attr_bl;
+ attr_bl.append(str.c_str(), str.size() + 1);
+
+ attrs[attr_name] = attr_bl;
+ }
+ // TODO: refactor this and the above loop to share code
+ piter = parts.find(RGW_AMZ_WEBSITE_REDIRECT_LOCATION);
+ if (piter != parts.end()) {
+ string n = piter->first;
+ string attr_name = RGW_ATTR_PREFIX;
+ attr_name.append(n);
+ /* need to null terminate it */
+ bufferlist& data = piter->second.data;
+ string str = string(data.c_str(), data.length());
+
+ bufferlist attr_bl;
+ attr_bl.append(str.c_str(), str.size() + 1);
+
+ attrs[attr_name] = attr_bl;
+ }
+
+ // verify signature + POST policy and build the ACL
+ r = get_policy(y);
+ if (r < 0)
+ return r;
+
+ r = get_tags();
+ if (r < 0)
+ return r;
+
+
+ min_len = post_policy.min_length;
+ max_len = post_policy.max_length;
+
+
+
+ return 0;
+}
+
+// Extract the optional "tagging" form part of a POST Object request, parse
+// it as a <Tagging> XML document and stash the encoded tag set in
+// attrs[RGW_ATTR_TAGS].  Returns 0 when the part is absent or was parsed
+// successfully, negative errno otherwise (err_msg set for the client).
+int RGWPostObj_ObjStore_S3::get_tags()
+{
+ string tags_str;
+ if (part_str(parts, "tagging", &tags_str)) {
+ RGWXMLParser parser;
+ if (!parser.init()) {
+ ldpp_dout(this, 0) << "Couldn't init RGWObjTags XML parser" << dendl;
+ err_msg = "Server couldn't process the request";
+ return -EINVAL; // TODO: This class of errors in rgw code should be a 5XX error
+ }
+ if (!parser.parse(tags_str.c_str(), tags_str.size(), 1)) {
+ ldpp_dout(this, 0) << "Invalid Tagging XML" << dendl;
+ err_msg = "Invalid Tagging XML";
+ return -EINVAL;
+ }
+
+ RGWObjTagging_S3 tagging;
+
+ try {
+ RGWXMLDecoder::decode_xml("Tagging", tagging, &parser);
+ } catch (RGWXMLDecoder::err& err) {
+ ldpp_dout(this, 5) << "Malformed tagging request: " << err << dendl;
+ return -EINVAL;
+ }
+
+ RGWObjTags obj_tags;
+ int r = tagging.rebuild(obj_tags);
+ if (r < 0)
+ return r;
+
+ bufferlist tags_bl;
+ obj_tags.encode(tags_bl);
+ // fix: add the missing space so the log reads "Read N tags", not "Read Ntags"
+ ldpp_dout(this, 20) << "Read " << obj_tags.count() << " tags" << dendl;
+ attrs[RGW_ATTR_TAGS] = tags_bl;
+ }
+
+
+ return 0;
+}
+
+// Authenticate a browser POST upload and evaluate its POST policy: detect
+// AWS v2 vs v4 signing from the form fields, run the s3-post auth strategy,
+// base64-decode and check the policy document against the collected form
+// variables, then build the ACL from the optional canned "acl" field.
+// Returns 0 on success, -EACCES on auth failure, -EINVAL on malformed input.
+int RGWPostObj_ObjStore_S3::get_policy(optional_yield y)
+{
+ if (part_bl(parts, "policy", &s->auth.s3_postobj_creds.encoded_policy)) {
+ bool aws4_auth = false;
+
+ /* x-amz-algorithm handling */
+ using rgw::auth::s3::AWS4_HMAC_SHA256_STR;
+ if ((part_str(parts, "x-amz-algorithm", &s->auth.s3_postobj_creds.x_amz_algorithm)) &&
+ (s->auth.s3_postobj_creds.x_amz_algorithm == AWS4_HMAC_SHA256_STR)) {
+ ldpp_dout(this, 0) << "Signature verification algorithm AWS v4 (AWS4-HMAC-SHA256)" << dendl;
+ aws4_auth = true;
+ } else {
+ ldpp_dout(this, 0) << "Signature verification algorithm AWS v2" << dendl;
+ }
+
+ // check that the signature matches the encoded policy
+ if (aws4_auth) {
+ /* AWS4 */
+
+ /* x-amz-credential handling */
+ if (!part_str(parts, "x-amz-credential",
+ &s->auth.s3_postobj_creds.x_amz_credential)) {
+ ldpp_dout(this, 0) << "No S3 aws4 credential found!" << dendl;
+ err_msg = "Missing aws4 credential";
+ return -EINVAL;
+ }
+
+ /* x-amz-signature handling */
+ if (!part_str(parts, "x-amz-signature",
+ &s->auth.s3_postobj_creds.signature)) {
+ ldpp_dout(this, 0) << "No aws4 signature found!" << dendl;
+ err_msg = "Missing aws4 signature";
+ return -EINVAL;
+ }
+
+ /* x-amz-date handling */
+ // presence check only; the date value itself is validated by the
+ // auth strategy below
+ std::string received_date_str;
+ if (!part_str(parts, "x-amz-date", &received_date_str)) {
+ ldpp_dout(this, 0) << "No aws4 date found!" << dendl;
+ err_msg = "Missing aws4 date";
+ return -EINVAL;
+ }
+ } else {
+ /* AWS2 */
+
+ // check that the signature matches the encoded policy
+ if (!part_str(parts, "AWSAccessKeyId",
+ &s->auth.s3_postobj_creds.access_key)) {
+ ldpp_dout(this, 0) << "No S3 aws2 access key found!" << dendl;
+ err_msg = "Missing aws2 access key";
+ return -EINVAL;
+ }
+
+ if (!part_str(parts, "signature", &s->auth.s3_postobj_creds.signature)) {
+ ldpp_dout(this, 0) << "No aws2 signature found!" << dendl;
+ err_msg = "Missing aws2 signature";
+ return -EINVAL;
+ }
+ }
+
+ // STS session token, when present, must be non-empty
+ if (part_str(parts, "x-amz-security-token", &s->auth.s3_postobj_creds.x_amz_security_token)) {
+ if (s->auth.s3_postobj_creds.x_amz_security_token.size() == 0) {
+ err_msg = "Invalid token";
+ return -EINVAL;
+ }
+ }
+
+ /* FIXME: this is a makeshift solution. The browser upload authentication will be
+ * handled by an instance of rgw::auth::Completer spawned in Handler's authorize()
+ * method. */
+ const int ret = rgw::auth::Strategy::apply(this, auth_registry_ptr->get_s3_post(), s, y);
+ if (ret != 0) {
+ return -EACCES;
+ } else {
+ /* Populate the owner info. */
+ s->owner.set_id(s->user->get_id());
+ s->owner.set_name(s->user->get_display_name());
+ ldpp_dout(this, 20) << "Successful Signature Verification!" << dendl;
+ }
+
+ ceph::bufferlist decoded_policy;
+ try {
+ decoded_policy.decode_base64(s->auth.s3_postobj_creds.encoded_policy);
+ } catch (buffer::error& err) {
+ ldpp_dout(this, 0) << "failed to decode_base64 policy" << dendl;
+ err_msg = "Could not decode policy";
+ return -EINVAL;
+ }
+
+ decoded_policy.append('\0'); // NULL terminate
+ ldpp_dout(this, 20) << "POST policy: " << decoded_policy.c_str() << dendl;
+
+
+ int r = post_policy.from_json(decoded_policy, err_msg);
+ if (r < 0) {
+ if (err_msg.empty()) {
+ err_msg = "Failed to parse policy";
+ }
+ ldpp_dout(this, 0) << "failed to parse policy" << dendl;
+ return -EINVAL;
+ }
+
+ // mark the auth-related form fields as known so the policy check does
+ // not reject them as unexpected conditions
+ if (aws4_auth) {
+ /* AWS4 */
+ post_policy.set_var_checked("x-amz-signature");
+ } else {
+ /* AWS2 */
+ post_policy.set_var_checked("AWSAccessKeyId");
+ post_policy.set_var_checked("signature");
+ }
+ post_policy.set_var_checked("policy");
+
+ r = post_policy.check(&env, err_msg);
+ if (r < 0) {
+ if (err_msg.empty()) {
+ err_msg = "Policy check failed";
+ }
+ ldpp_dout(this, 0) << "policy check failed" << dendl;
+ return r;
+ }
+
+ } else {
+ // no policy attached: request proceeds unauthenticated (anonymous)
+ ldpp_dout(this, 0) << "No attached policy found!" << dendl;
+ }
+
+ string canned_acl;
+ part_str(parts, "acl", &canned_acl);
+
+ RGWAccessControlPolicy_S3 s3policy(s->cct);
+ ldpp_dout(this, 20) << "canned_acl=" << canned_acl << dendl;
+ if (s3policy.create_canned(s->owner, s->bucket_owner, canned_acl) < 0) {
+ err_msg = "Bad canned ACLs";
+ return -EINVAL;
+ }
+
+ policy = s3policy;
+
+ return 0;
+}
+
+// After the "file" part has been fully read, consume and discard any
+// remaining multipart/form-data parts so the request body is fully drained.
+// Returns 0 on success, negative errno on a malformed part.
+int RGWPostObj_ObjStore_S3::complete_get_params()
+{
+ bool done;
+ do {
+ struct post_form_part part;
+ int r = read_form_part_header(&part, done);
+ if (r < 0) {
+ return r;
+ }
+
+ // fix: dropped the unused local `ceph::bufferlist part_data` — the data
+ // is read into part.data and intentionally discarded
+ bool boundary;
+ const uint64_t chunk_size = s->cct->_conf->rgw_max_chunk_size;
+ r = read_data(part.data, chunk_size, boundary, done);
+ if (r < 0 || !boundary) {
+ return -EINVAL;
+ }
+
+ /* Just reading the data but not storing any results of that. */
+ } while (!done);
+
+ return 0;
+}
+
+// Read the next chunk of the "file" part into bl.  `again` is set while
+// more file data remains; once the part boundary is reached any trailing
+// form fields are drained.  Returns the chunk length or a negative error.
+int RGWPostObj_ObjStore_S3::get_data(ceph::bufferlist& bl, bool& again)
+{
+ bool boundary = false;
+ bool done = false;
+
+ const uint64_t chunk_size = s->cct->_conf->rgw_max_chunk_size;
+ int r = read_data(bl, chunk_size, boundary, done);
+ if (r < 0) {
+ return r;
+ }
+
+ if (boundary && !done) {
+ /* Reached end of data, let's drain the rest of the params */
+ r = complete_get_params();
+ if (r < 0) {
+ return r;
+ }
+ }
+
+ again = !boundary;
+ return bl.length();
+}
+
+// POST Object response.  On success the behavior follows the form fields:
+// success_action_redirect issues a 303 redirect carrying bucket/key/etag,
+// success_action_status selects 200/201/204 (201 gets an XML <PostResponse>
+// body), and the default is 204 No Content.
+void RGWPostObj_ObjStore_S3::send_response()
+{
+ if (op_ret == 0 && parts.count("success_action_redirect")) {
+ string redirect;
+
+ part_str(parts, "success_action_redirect", &redirect);
+
+ string tenant;
+ string bucket;
+ string key;
+ string etag_str = "\"";
+
+ etag_str.append(etag);
+ etag_str.append("\"");
+
+ string etag_url;
+
+ url_encode(s->bucket_tenant, tenant); /* surely overkill, but cheap */
+ url_encode(s->bucket_name, bucket);
+ url_encode(s->object->get_name(), key);
+ url_encode(etag_str, etag_url);
+
+ if (!s->bucket_tenant.empty()) {
+ /*
+ * What we really would like is to quaily the bucket name, so
+ * that the client could simply copy it and paste into next request.
+ * Unfortunately, in S3 we cannot know if the client will decide
+ * to come through DNS, with "bucket.tenant" sytanx, or through
+ * URL with "tenant\bucket" syntax. Therefore, we provide the
+ * tenant separately.
+ */
+ redirect.append("?tenant=");
+ redirect.append(tenant);
+ redirect.append("&bucket=");
+ redirect.append(bucket);
+ } else {
+ redirect.append("?bucket=");
+ redirect.append(bucket);
+ }
+ redirect.append("&key=");
+ redirect.append(key);
+ redirect.append("&etag=");
+ redirect.append(etag_url);
+
+ // reject redirect targets that are not valid UTF-8
+ int r = check_utf8(redirect.c_str(), redirect.size());
+ if (r < 0) {
+ op_ret = r;
+ goto done;
+ }
+ dump_redirect(s, redirect);
+ op_ret = STATUS_REDIRECT;
+ } else if (op_ret == 0 && parts.count("success_action_status")) {
+ string status_string;
+ uint32_t status_int;
+
+ part_str(parts, "success_action_status", &status_string);
+
+ int r = stringtoul(status_string, &status_int);
+ if (r < 0) {
+ op_ret = r;
+ goto done;
+ }
+
+ // only 200/201/204 are honored; anything else falls back to 204
+ switch (status_int) {
+ case 200:
+ break;
+ case 201:
+ op_ret = STATUS_CREATED;
+ break;
+ default:
+ op_ret = STATUS_NO_CONTENT;
+ break;
+ }
+ } else if (! op_ret) {
+ op_ret = STATUS_NO_CONTENT;
+ }
+
+done:
+ // a 201 response carries an XML body describing the created object
+ if (op_ret == STATUS_CREATED) {
+ for (auto &it : crypt_http_responses)
+ dump_header(s, it.first, it.second);
+ s->formatter->open_object_section("PostResponse");
+ std::string base_uri = compute_domain_uri(s);
+ if (!s->bucket_tenant.empty()){
+ s->formatter->dump_format("Location", "%s/%s:%s/%s",
+ base_uri.c_str(),
+ url_encode(s->bucket_tenant).c_str(),
+ url_encode(s->bucket_name).c_str(),
+ url_encode(s->object->get_name()).c_str());
+ s->formatter->dump_string("Tenant", s->bucket_tenant);
+ } else {
+ s->formatter->dump_format("Location", "%s/%s/%s",
+ base_uri.c_str(),
+ url_encode(s->bucket_name).c_str(),
+ url_encode(s->object->get_name()).c_str());
+ }
+ s->formatter->dump_string("Bucket", s->bucket_name);
+ s->formatter->dump_string("Key", s->object->get_name());
+ s->formatter->dump_string("ETag", etag);
+ s->formatter->close_section();
+ }
+ s->err.message = err_msg;
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ if (op_ret >= 0) {
+ dump_content_length(s, s->formatter->get_len());
+ }
+ end_header(s, this);
+ if (op_ret != STATUS_CREATED)
+ return;
+
+ rgw_flush_formatter_and_reset(s, s->formatter);
+}
+
+int RGWPostObj_ObjStore_S3::get_encrypt_filter(
+ std::unique_ptr<rgw::sal::DataProcessor> *filter,
+ rgw::sal::DataProcessor *cb)
+{
+ std::unique_ptr<BlockCrypt> block_crypt;
+ int res = rgw_s3_prepare_encrypt(s, attrs, &block_crypt,
+ crypt_http_responses);
+ if (res == 0 && block_crypt != nullptr) {
+ filter->reset(new RGWPutObj_BlockEncrypt(s, s->cct, cb, std::move(block_crypt)));
+ }
+ return res;
+}
+
+int RGWDeleteObj_ObjStore_S3::get_params(optional_yield y)
+{
+ const char *if_unmod = s->info.env->get("HTTP_X_AMZ_DELETE_IF_UNMODIFIED_SINCE");
+
+ if (s->system_request) {
+ s->info.args.get_bool(RGW_SYS_PARAM_PREFIX "no-precondition-error", &no_precondition_error, false);
+ }
+
+ if (if_unmod) {
+ std::string if_unmod_decoded = url_decode(if_unmod);
+ uint64_t epoch;
+ uint64_t nsec;
+ if (utime_t::parse_date(if_unmod_decoded, &epoch, &nsec) < 0) {
+ ldpp_dout(this, 10) << "failed to parse time: " << if_unmod_decoded << dendl;
+ return -EINVAL;
+ }
+ unmod_since = utime_t(epoch, nsec).to_real_time();
+ }
+
+ const char *bypass_gov_header = s->info.env->get("HTTP_X_AMZ_BYPASS_GOVERNANCE_RETENTION");
+ if (bypass_gov_header) {
+ std::string bypass_gov_decoded = url_decode(bypass_gov_header);
+ bypass_governance_mode = boost::algorithm::iequals(bypass_gov_decoded, "true");
+ }
+
+ return 0;
+}
+
+void RGWDeleteObj_ObjStore_S3::send_response()
+{
+ int r = op_ret;
+ if (r == -ENOENT)
+ r = 0;
+ if (!r)
+ r = STATUS_NO_CONTENT;
+
+ set_req_state_err(s, r);
+ dump_errno(s);
+ dump_header_if_nonempty(s, "x-amz-version-id", version_id);
+ if (delete_marker) {
+ dump_header(s, "x-amz-delete-marker", "true");
+ }
+ end_header(s, this);
+}
+
+int RGWCopyObj_ObjStore_S3::init_dest_policy()
+{
+ RGWAccessControlPolicy_S3 s3policy(s->cct);
+
+ /* build a policy for the target object */
+ int r = create_s3_policy(s, driver, s3policy, s->owner);
+ if (r < 0)
+ return r;
+
+ dest_policy = s3policy;
+
+ return 0;
+}
+
int RGWCopyObj_ObjStore_S3::get_params(optional_yield y)
{
  /* Parse copy-specific request headers: object-lock settings, copy-source
   * preconditions, the metadata directive and the copy source itself.
   * Returns 0 on success, -EINVAL on malformed input. */

  //handle object lock
  auto obj_lock_mode_str = s->info.env->get("HTTP_X_AMZ_OBJECT_LOCK_MODE");
  auto obj_lock_date_str = s->info.env->get("HTTP_X_AMZ_OBJECT_LOCK_RETAIN_UNTIL_DATE");
  auto obj_legal_hold_str = s->info.env->get("HTTP_X_AMZ_OBJECT_LOCK_LEGAL_HOLD");
  if (obj_lock_mode_str && obj_lock_date_str) {
    /* retain-until-date must be valid ISO-8601 and lie in the future */
    boost::optional<ceph::real_time> date = ceph::from_iso_8601(obj_lock_date_str);
    if (boost::none == date || ceph::real_clock::to_time_t(*date) <= ceph_clock_now()) {
      s->err.message = "invalid x-amz-object-lock-retain-until-date value";
      ldpp_dout(this,0) << s->err.message << dendl;
      return -EINVAL;
    }
    if (strcmp(obj_lock_mode_str, "GOVERNANCE") != 0 && strcmp(obj_lock_mode_str, "COMPLIANCE") != 0) {
      s->err.message = "invalid x-amz-object-lock-mode value";
      ldpp_dout(this,0) << s->err.message << dendl;
      return -EINVAL;
    }
    // NOTE(review): raw new — presumably owned/freed by the op's
    // destructor; confirm before touching ownership.
    obj_retention = new RGWObjectRetention(obj_lock_mode_str, *date);
  } else if (obj_lock_mode_str || obj_lock_date_str) {
    /* the two lock headers are only valid as a pair */
    s->err.message = "need both x-amz-object-lock-mode and x-amz-object-lock-retain-until-date ";
    ldpp_dout(this,0) << s->err.message << dendl;
    return -EINVAL;
  }
  if (obj_legal_hold_str) {
    if (strcmp(obj_legal_hold_str, "ON") != 0 && strcmp(obj_legal_hold_str, "OFF") != 0) {
      s->err.message = "invalid x-amz-object-lock-legal-hold value";
      ldpp_dout(this,0) << s->err.message << dendl;
      return -EINVAL;
    }
    obj_legal_hold = new RGWObjectLegalHold(obj_legal_hold_str);
  }

  /* copy-source conditional headers; validated later when the source
   * object's attributes are known */
  if_mod = s->info.env->get("HTTP_X_AMZ_COPY_SOURCE_IF_MODIFIED_SINCE");
  if_unmod = s->info.env->get("HTTP_X_AMZ_COPY_SOURCE_IF_UNMODIFIED_SINCE");
  if_match = s->info.env->get("HTTP_X_AMZ_COPY_SOURCE_IF_MATCH");
  if_nomatch = s->info.env->get("HTTP_X_AMZ_COPY_SOURCE_IF_NONE_MATCH");

  if (s->system_request) {
    source_zone = s->info.args.get(RGW_SYS_PARAM_PREFIX "source-zone");
    s->info.args.get_bool(RGW_SYS_PARAM_PREFIX "copy-if-newer", &copy_if_newer, false);
  }

  const char *copy_source_temp = s->info.env->get("HTTP_X_AMZ_COPY_SOURCE");
  if (copy_source_temp) {
    copy_source = copy_source_temp;
  }
  /* COPY keeps source attrs, REPLACE takes the request's; an unknown value
   * is tolerated only for intra-zonegroup (system) copies */
  auto tmp_md_d = s->info.env->get("HTTP_X_AMZ_METADATA_DIRECTIVE");
  if (tmp_md_d) {
    if (strcasecmp(tmp_md_d, "COPY") == 0) {
      attrs_mod = rgw::sal::ATTRSMOD_NONE;
    } else if (strcasecmp(tmp_md_d, "REPLACE") == 0) {
      attrs_mod = rgw::sal::ATTRSMOD_REPLACE;
    } else if (!source_zone.empty()) {
      attrs_mod = rgw::sal::ATTRSMOD_NONE; // default for intra-zone_group copy
    } else {
      s->err.message = "Unknown metadata directive.";
      ldpp_dout(this, 0) << s->err.message << dendl;
      return -EINVAL;
    }
    md_directive = tmp_md_d;
  }

  /* An exact self-copy (same tenant/bucket/key, no version, attrs kept) is
   * only legal when it changes the storage class — flag it for a later
   * check_storage_class() call. */
  if (source_zone.empty() &&
      (s->bucket->get_tenant() == s->src_tenant_name) &&
      (s->bucket->get_name() == s->src_bucket_name) &&
      (s->object->get_name() == s->src_object->get_name()) &&
      s->src_object->get_instance().empty() &&
      (attrs_mod != rgw::sal::ATTRSMOD_REPLACE)) {
    need_to_check_storage_class = true;
  }

  return 0;
}
+
+int RGWCopyObj_ObjStore_S3::check_storage_class(const rgw_placement_rule& src_placement)
+{
+ if (src_placement == s->dest_placement) {
+ /* can only copy object into itself if replacing attrs */
+ s->err.message = "This copy request is illegal because it is trying to copy "
+ "an object to itself without changing the object's metadata, "
+ "storage class, website redirect location or encryption attributes.";
+ ldpp_dout(this, 0) << s->err.message << dendl;
+ return -ERR_INVALID_REQUEST;
+ }
+ return 0;
+}
+
+void RGWCopyObj_ObjStore_S3::send_partial_response(off_t ofs)
+{
+ if (! sent_header) {
+ if (op_ret)
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+
+ // Explicitly use chunked transfer encoding so that we can stream the result
+ // to the user without having to wait for the full length of it.
+ end_header(s, this, to_mime_type(s->format), CHUNKED_TRANSFER_ENCODING);
+ dump_start(s);
+ if (op_ret == 0) {
+ s->formatter->open_object_section_in_ns("CopyObjectResult", XMLNS_AWS_S3);
+ }
+ sent_header = true;
+ } else {
+ /* Send progress field. Note that this diverge from the original S3
+ * spec. We do this in order to keep connection alive.
+ */
+ s->formatter->dump_int("Progress", (uint64_t)ofs);
+ }
+ rgw_flush_formatter(s, s->formatter);
+}
+
+void RGWCopyObj_ObjStore_S3::send_response()
+{
+ if (!sent_header)
+ send_partial_response(0);
+
+ if (op_ret == 0) {
+ dump_time(s, "LastModified", mtime);
+ if (!etag.empty()) {
+ s->formatter->dump_format("ETag", "\"%s\"",etag.c_str());
+ }
+ s->formatter->close_section();
+ rgw_flush_formatter_and_reset(s, s->formatter);
+ }
+}
+
+void RGWGetACLs_ObjStore_S3::send_response()
+{
+ if (op_ret)
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s, this, to_mime_type(s->format));
+ dump_start(s);
+ rgw_flush_formatter(s, s->formatter);
+ dump_body(s, acls);
+}
+
+int RGWPutACLs_ObjStore_S3::get_params(optional_yield y)
+{
+ int ret = RGWPutACLs_ObjStore::get_params(y);
+ if (ret >= 0) {
+ const int ret_auth = do_aws4_auth_completion();
+ if (ret_auth < 0) {
+ return ret_auth;
+ }
+ } else {
+ /* a request body is not required an S3 PutACLs request--n.b.,
+ * s->length is non-null iff a content length was parsed (the
+ * ACP or canned ACL could be in any of 3 headers, don't worry
+ * about that here) */
+ if ((ret == -ERR_LENGTH_REQUIRED) &&
+ !!(s->length)) {
+ return 0;
+ }
+ }
+ return ret;
+}
+
+int RGWPutACLs_ObjStore_S3::get_policy_from_state(rgw::sal::Driver* driver,
+ req_state *s,
+ stringstream& ss)
+{
+ RGWAccessControlPolicy_S3 s3policy(s->cct);
+
+ // bucket-* canned acls do not apply to bucket
+ if (rgw::sal::Object::empty(s->object.get())) {
+ if (s->canned_acl.find("bucket") != string::npos)
+ s->canned_acl.clear();
+ }
+
+ int r = create_s3_policy(s, driver, s3policy, owner);
+ if (r < 0)
+ return r;
+
+ s3policy.to_xml(ss);
+
+ return 0;
+}
+
+void RGWPutACLs_ObjStore_S3::send_response()
+{
+ if (op_ret)
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s, this, to_mime_type(s->format));
+ dump_start(s);
+}
+
+void RGWGetLC_ObjStore_S3::execute(optional_yield y)
+{
+ config.set_ctx(s->cct);
+
+ map<string, bufferlist>::iterator aiter = s->bucket_attrs.find(RGW_ATTR_LC);
+ if (aiter == s->bucket_attrs.end()) {
+ op_ret = -ENOENT;
+ return;
+ }
+
+ bufferlist::const_iterator iter{&aiter->second};
+ try {
+ config.decode(iter);
+ } catch (const buffer::error& e) {
+ ldpp_dout(this, 0) << __func__ << "decode life cycle config failed" << dendl;
+ op_ret = -EIO;
+ return;
+ }
+}
+
+void RGWGetLC_ObjStore_S3::send_response()
+{
+ if (op_ret) {
+ if (op_ret == -ENOENT) {
+ set_req_state_err(s, ERR_NO_SUCH_LC);
+ } else {
+ set_req_state_err(s, op_ret);
+ }
+ }
+ dump_errno(s);
+ end_header(s, this, to_mime_type(s->format));
+ dump_start(s);
+
+ if (op_ret < 0)
+ return;
+
+ encode_xml("LifecycleConfiguration", XMLNS_AWS_S3, config, s->formatter);
+ rgw_flush_formatter_and_reset(s, s->formatter);
+}
+
+void RGWPutLC_ObjStore_S3::send_response()
+{
+ if (op_ret)
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s, this, to_mime_type(s->format));
+ dump_start(s);
+}
+
+void RGWDeleteLC_ObjStore_S3::send_response()
+{
+ if (op_ret == 0)
+ op_ret = STATUS_NO_CONTENT;
+ if (op_ret) {
+ set_req_state_err(s, op_ret);
+ }
+ dump_errno(s);
+ end_header(s, this, to_mime_type(s->format));
+ dump_start(s);
+}
+
+void RGWGetCORS_ObjStore_S3::send_response()
+{
+ if (op_ret) {
+ if (op_ret == -ENOENT)
+ set_req_state_err(s, ERR_NO_SUCH_CORS_CONFIGURATION);
+ else
+ set_req_state_err(s, op_ret);
+ }
+ dump_errno(s);
+ end_header(s, NULL, to_mime_type(s->format));
+ dump_start(s);
+ if (! op_ret) {
+ string cors;
+ RGWCORSConfiguration_S3 *s3cors =
+ static_cast<RGWCORSConfiguration_S3 *>(&bucket_cors);
+ stringstream ss;
+
+ s3cors->to_xml(ss);
+ cors = ss.str();
+ dump_body(s, cors);
+ }
+}
+
int RGWPutCORS_ObjStore_S3::get_params(optional_yield y)
{
  /* Read and validate a CORSConfiguration XML body, enforce the rule-count
   * limit, and encode the result into cors_bl for storage. */
  RGWCORSXMLParser_S3 parser(this, s->cct);
  RGWCORSConfiguration_S3 *cors_config;

  const auto max_size = s->cct->_conf->rgw_max_put_param_size;

  int r = 0;
  bufferlist data;
  std::tie(r, data) = read_all_input(s, max_size, false);
  if (r < 0) {
    return r;
  }

  if (!parser.init()) {
    return -EINVAL;
  }

  char* buf = data.c_str();
  if (!buf || !parser.parse(buf, data.length(), 1)) {
    return -ERR_MALFORMED_XML;
  }
  cors_config =
    static_cast<RGWCORSConfiguration_S3 *>(parser.find_first(
                                             "CORSConfiguration"));
  if (!cors_config) {
    return -ERR_MALFORMED_XML;
  }

/* fallback ceiling when rgw_cors_rules_max_num is negative (unset) */
#define CORS_RULES_MAX_NUM 100
  int max_num = s->cct->_conf->rgw_cors_rules_max_num;
  if (max_num < 0) {
    max_num = CORS_RULES_MAX_NUM;
  }
  int cors_rules_num = cors_config->get_rules().size();
  if (cors_rules_num > max_num) {
    ldpp_dout(this, 4) << "An cors config can have up to "
                       << max_num
                       << " rules, request cors rules num: "
                       << cors_rules_num << dendl;
    // NOTE(review): op_ret is set to -ERR_INVALID_CORS_RULES_ERROR but the
    // function returns -ERR_INVALID_REQUEST; the return value is what the
    // caller sees — confirm the intended error code before changing.
    op_ret = -ERR_INVALID_CORS_RULES_ERROR;
    s->err.message = "The number of CORS rules should not exceed allowed limit of "
      + std::to_string(max_num) + " rules.";
    return -ERR_INVALID_REQUEST;
  }

  // forward bucket cors requests to meta master zone
  if (!driver->is_meta_master()) {
    /* only need to keep this data around if we're not meta master */
    in_data.append(data);
  }

  /* dump the parsed configuration at debug level 15 */
  if (s->cct->_conf->subsys.should_gather<ceph_subsys_rgw, 15>()) {
    ldpp_dout(this, 15) << "CORSConfiguration";
    cors_config->to_xml(*_dout);
    *_dout << dendl;
  }

  cors_config->encode(cors_bl);

  return 0;
}
+
+void RGWPutCORS_ObjStore_S3::send_response()
+{
+ if (op_ret)
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s, NULL, to_mime_type(s->format));
+ dump_start(s);
+}
+
+void RGWDeleteCORS_ObjStore_S3::send_response()
+{
+ int r = op_ret;
+ if (!r || r == -ENOENT)
+ r = STATUS_NO_CONTENT;
+
+ set_req_state_err(s, r);
+ dump_errno(s);
+ end_header(s, NULL);
+}
+
+void RGWOptionsCORS_ObjStore_S3::send_response()
+{
+ string hdrs, exp_hdrs;
+ uint32_t max_age = CORS_MAX_AGE_INVALID;
+ /*EACCES means, there is no CORS registered yet for the bucket
+ *ENOENT means, there is no match of the Origin in the list of CORSRule
+ */
+ if (op_ret == -ENOENT)
+ op_ret = -EACCES;
+ if (op_ret < 0) {
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s, NULL);
+ return;
+ }
+ get_response_params(hdrs, exp_hdrs, &max_age);
+
+ dump_errno(s);
+ dump_access_control(s, origin, req_meth, hdrs.c_str(), exp_hdrs.c_str(),
+ max_age);
+ end_header(s, NULL);
+}
+
+void RGWPutBucketEncryption_ObjStore_S3::send_response()
+{
+ if (op_ret) {
+ set_req_state_err(s, op_ret);
+ }
+ dump_errno(s);
+ end_header(s);
+}
+
+void RGWGetBucketEncryption_ObjStore_S3::send_response()
+{
+ if (op_ret) {
+ if (op_ret == -ENOENT)
+ set_req_state_err(s, ERR_NO_SUCH_BUCKET_ENCRYPTION_CONFIGURATION);
+ else
+ set_req_state_err(s, op_ret);
+ }
+
+ dump_errno(s);
+ end_header(s, this, to_mime_type(s->format));
+ dump_start(s);
+
+ if (!op_ret) {
+ encode_xml("ServerSideEncryptionConfiguration", XMLNS_AWS_S3,
+ bucket_encryption_conf, s->formatter);
+ rgw_flush_formatter_and_reset(s, s->formatter);
+ }
+}
+
+void RGWDeleteBucketEncryption_ObjStore_S3::send_response()
+{
+ if (op_ret == 0) {
+ op_ret = STATUS_NO_CONTENT;
+ }
+
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s);
+}
+
+void RGWGetRequestPayment_ObjStore_S3::send_response()
+{
+ dump_errno(s);
+ end_header(s, this, to_mime_type(s->format));
+ dump_start(s);
+
+ s->formatter->open_object_section_in_ns("RequestPaymentConfiguration", XMLNS_AWS_S3);
+ const char *payer = requester_pays ? "Requester" : "BucketOwner";
+ s->formatter->dump_string("Payer", payer);
+ s->formatter->close_section();
+ rgw_flush_formatter_and_reset(s, s->formatter);
+}
+
+class RGWSetRequestPaymentParser : public RGWXMLParser
+{
+ XMLObj *alloc_obj(const char *el) override {
+ return new XMLObj;
+ }
+
+public:
+ RGWSetRequestPaymentParser() {}
+ ~RGWSetRequestPaymentParser() override {}
+
+ int get_request_payment_payer(bool *requester_pays) {
+ XMLObj *config = find_first("RequestPaymentConfiguration");
+ if (!config)
+ return -EINVAL;
+
+ *requester_pays = false;
+
+ XMLObj *field = config->find_first("Payer");
+ if (!field)
+ return 0;
+
+ auto& s = field->get_data();
+
+ if (stringcasecmp(s, "Requester") == 0) {
+ *requester_pays = true;
+ } else if (stringcasecmp(s, "BucketOwner") != 0) {
+ return -EINVAL;
+ }
+
+ return 0;
+ }
+};
+
+int RGWSetRequestPayment_ObjStore_S3::get_params(optional_yield y)
+{
+ const auto max_size = s->cct->_conf->rgw_max_put_param_size;
+
+ int r = 0;
+ std::tie(r, in_data) = read_all_input(s, max_size, false);
+
+ if (r < 0) {
+ return r;
+ }
+
+
+ RGWSetRequestPaymentParser parser;
+
+ if (!parser.init()) {
+ ldpp_dout(this, 0) << "ERROR: failed to initialize parser" << dendl;
+ return -EIO;
+ }
+
+ char* buf = in_data.c_str();
+ if (!parser.parse(buf, in_data.length(), 1)) {
+ ldpp_dout(this, 10) << "failed to parse data: " << buf << dendl;
+ return -EINVAL;
+ }
+
+ return parser.get_request_payment_payer(&requester_pays);
+}
+
+void RGWSetRequestPayment_ObjStore_S3::send_response()
+{
+ if (op_ret)
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s);
+}
+
+int RGWInitMultipart_ObjStore_S3::get_params(optional_yield y)
+{
+ int ret;
+
+ ret = get_encryption_defaults(s);
+ if (ret < 0) {
+ ldpp_dout(this, 5) << __func__ << "(): get_encryption_defaults() returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ RGWAccessControlPolicy_S3 s3policy(s->cct);
+ ret = create_s3_policy(s, driver, s3policy, s->owner);
+ if (ret < 0)
+ return ret;
+
+ policy = s3policy;
+
+ return 0;
+}
+
+void RGWInitMultipart_ObjStore_S3::send_response()
+{
+ if (op_ret)
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ for (auto &it : crypt_http_responses)
+ dump_header(s, it.first, it.second);
+ ceph::real_time abort_date;
+ string rule_id;
+ bool exist_multipart_abort = get_s3_multipart_abort_header(s, mtime, abort_date, rule_id);
+ if (exist_multipart_abort) {
+ dump_time_header(s, "x-amz-abort-date", abort_date);
+ dump_header_if_nonempty(s, "x-amz-abort-rule-id", rule_id);
+ }
+ end_header(s, this, to_mime_type(s->format));
+ if (op_ret == 0) {
+ dump_start(s);
+ s->formatter->open_object_section_in_ns("InitiateMultipartUploadResult", XMLNS_AWS_S3);
+ if (!s->bucket_tenant.empty())
+ s->formatter->dump_string("Tenant", s->bucket_tenant);
+ s->formatter->dump_string("Bucket", s->bucket_name);
+ s->formatter->dump_string("Key", s->object->get_name());
+ s->formatter->dump_string("UploadId", upload_id);
+ s->formatter->close_section();
+ rgw_flush_formatter_and_reset(s, s->formatter);
+ }
+}
+
+int RGWInitMultipart_ObjStore_S3::prepare_encryption(map<string, bufferlist>& attrs)
+{
+ int res = 0;
+ res = rgw_s3_prepare_encrypt(s, attrs, nullptr, crypt_http_responses);
+ return res;
+}
+
+int RGWCompleteMultipart_ObjStore_S3::get_params(optional_yield y)
+{
+ int ret = RGWCompleteMultipart_ObjStore::get_params(y);
+ if (ret < 0) {
+ return ret;
+ }
+
+ map_qs_metadata(s, true);
+
+ return do_aws4_auth_completion();
+}
+
void RGWCompleteMultipart_ObjStore_S3::send_response()
{
  /* Emit the CompleteMultipartUploadResult document on success; errors get
   * only status + headers. */
  if (op_ret)
    set_req_state_err(s, op_ret);
  dump_errno(s);
  dump_header_if_nonempty(s, "x-amz-version-id", version_id);
  end_header(s, this, to_mime_type(s->format));
  if (op_ret == 0) {
    dump_start(s);
    s->formatter->open_object_section_in_ns("CompleteMultipartUploadResult", XMLNS_AWS_S3);
    std::string base_uri = compute_domain_uri(s);
    /* NOTE(review): unlike RGWPostObj_ObjStore_S3::send_response(), the
     * Location components here are not url_encode()d — confirm whether
     * that is intentional before changing. */
    if (!s->bucket_tenant.empty()) {
      s->formatter->dump_format("Location", "%s/%s:%s/%s",
                                base_uri.c_str(),
                                s->bucket_tenant.c_str(),
                                s->bucket_name.c_str(),
                                s->object->get_name().c_str()
                                );
      s->formatter->dump_string("Tenant", s->bucket_tenant);
    } else {
      s->formatter->dump_format("Location", "%s/%s/%s",
                                base_uri.c_str(),
                                s->bucket_name.c_str(),
                                s->object->get_name().c_str()
                                );
    }
    s->formatter->dump_string("Bucket", s->bucket_name);
    s->formatter->dump_string("Key", s->object->get_name());
    s->formatter->dump_string("ETag", etag);
    s->formatter->close_section();
    rgw_flush_formatter_and_reset(s, s->formatter);
  }
}
+
+void RGWAbortMultipart_ObjStore_S3::send_response()
+{
+ int r = op_ret;
+ if (!r)
+ r = STATUS_NO_CONTENT;
+
+ set_req_state_err(s, r);
+ dump_errno(s);
+ end_header(s, this);
+}
+
void RGWListMultipart_ObjStore_S3::send_response()
{
  /* Emit the ListPartsResult document for an in-progress multipart upload,
   * streamed with chunked transfer encoding. */
  if (op_ret)
    set_req_state_err(s, op_ret);
  dump_errno(s);
  // Explicitly use chunked transfer encoding so that we can stream the result
  // to the user without having to wait for the full length of it.
  end_header(s, this, to_mime_type(s->format), CHUNKED_TRANSFER_ENCODING);

  if (op_ret == 0) {
    dump_start(s);
    s->formatter->open_object_section_in_ns("ListPartsResult", XMLNS_AWS_S3);
    map<uint32_t, std::unique_ptr<rgw::sal::MultipartPart>>::iterator iter;
    map<uint32_t, std::unique_ptr<rgw::sal::MultipartPart>>::reverse_iterator test_iter;
    int cur_max = 0;

    /* the highest part number present becomes NextPartNumberMarker */
    iter = upload->get_parts().begin();
    test_iter = upload->get_parts().rbegin();
    if (test_iter != upload->get_parts().rend()) {
      cur_max = test_iter->first;
    }
    if (!s->bucket_tenant.empty())
      s->formatter->dump_string("Tenant", s->bucket_tenant);
    s->formatter->dump_string("Bucket", s->bucket_name);
    s->formatter->dump_string("Key", s->object->get_name());
    s->formatter->dump_string("UploadId", upload_id);
    s->formatter->dump_string("StorageClass", placement->get_storage_class());
    s->formatter->dump_int("PartNumberMarker", marker);
    s->formatter->dump_int("NextPartNumberMarker", cur_max);
    s->formatter->dump_int("MaxParts", max_parts);
    s->formatter->dump_string("IsTruncated", (truncated ? "true" : "false"));

    ACLOwner& owner = policy.get_owner();
    dump_owner(s, owner.get_id(), owner.get_display_name());

    /* one Part element per stored part, in part-number order */
    for (; iter != upload->get_parts().end(); ++iter) {
      rgw::sal::MultipartPart* part = iter->second.get();

      s->formatter->open_object_section("Part");

      dump_time(s, "LastModified", part->get_mtime());

      s->formatter->dump_unsigned("PartNumber", part->get_num());
      s->formatter->dump_format("ETag", "\"%s\"", part->get_etag().c_str());
      s->formatter->dump_unsigned("Size", part->get_size());
      s->formatter->close_section();
    }
    s->formatter->close_section();
    rgw_flush_formatter_and_reset(s, s->formatter);
  }
}
+
void RGWListBucketMultiparts_ObjStore_S3::send_response()
{
  /* Emit the ListMultipartUploadsResult document; optional elements are
   * omitted when the corresponding value is empty. */
  if (op_ret < 0)
    set_req_state_err(s, op_ret);
  dump_errno(s);

  // Explicitly use chunked transfer encoding so that we can stream the result
  // to the user without having to wait for the full length of it.
  end_header(s, this, to_mime_type(s->format), CHUNKED_TRANSFER_ENCODING);
  dump_start(s);
  if (op_ret < 0)
    return;

  s->formatter->open_object_section_in_ns("ListMultipartUploadsResult", XMLNS_AWS_S3);
  if (!s->bucket_tenant.empty())
    s->formatter->dump_string("Tenant", s->bucket_tenant);
  s->formatter->dump_string("Bucket", s->bucket_name);
  if (!prefix.empty())
    s->formatter->dump_string("Prefix", prefix);
  if (!marker_key.empty())
    s->formatter->dump_string("KeyMarker", marker_key);
  if (!marker_upload_id.empty())
    s->formatter->dump_string("UploadIdMarker", marker_upload_id);
  if (!next_marker_key.empty())
    s->formatter->dump_string("NextKeyMarker", next_marker_key);
  if (!next_marker_upload_id.empty())
    s->formatter->dump_string("NextUploadIdMarker", next_marker_upload_id);
  s->formatter->dump_int("MaxUploads", max_uploads);
  if (!delimiter.empty())
    s->formatter->dump_string("Delimiter", delimiter);
  s->formatter->dump_string("IsTruncated", (is_truncated ? "true" : "false"));

  if (op_ret >= 0) {
    /* one Upload element per in-progress multipart upload */
    vector<std::unique_ptr<rgw::sal::MultipartUpload>>::iterator iter;
    for (iter = uploads.begin(); iter != uploads.end(); ++iter) {
      rgw::sal::MultipartUpload* upload = iter->get();
      s->formatter->open_array_section("Upload");
      /* keys are percent-encoded when encoding-type=url was requested */
      if (encode_url) {
        s->formatter->dump_string("Key", url_encode(upload->get_key(), false));
      } else {
        s->formatter->dump_string("Key", upload->get_key());
      }
      s->formatter->dump_string("UploadId", upload->get_upload_id());
      /* Initiator and Owner are emitted identically here */
      const ACLOwner& owner = upload->get_owner();
      dump_owner(s, owner.get_id(), owner.get_display_name(), "Initiator");
      dump_owner(s, owner.get_id(), owner.get_display_name()); // Owner
      s->formatter->dump_string("StorageClass", "STANDARD");
      dump_time(s, "Initiated", upload->get_mtime());
      s->formatter->close_section();
    }
    if (!common_prefixes.empty()) {
      s->formatter->open_array_section("CommonPrefixes");
      for (const auto& kv : common_prefixes) {
        if (encode_url) {
          s->formatter->dump_string("Prefix", url_encode(kv.first, false));
        } else {
          s->formatter->dump_string("Prefix", kv.first);
        }
      }
      s->formatter->close_section();
    }
  }
  s->formatter->close_section();
  rgw_flush_formatter_and_reset(s, s->formatter);
}
+
+int RGWDeleteMultiObj_ObjStore_S3::get_params(optional_yield y)
+{
+ int ret = RGWDeleteMultiObj_ObjStore::get_params(y);
+ if (ret < 0) {
+ return ret;
+ }
+
+ const char *bypass_gov_header = s->info.env->get("HTTP_X_AMZ_BYPASS_GOVERNANCE_RETENTION");
+ if (bypass_gov_header) {
+ std::string bypass_gov_decoded = url_decode(bypass_gov_header);
+ bypass_governance_mode = boost::algorithm::iequals(bypass_gov_decoded, "true");
+ }
+
+ return do_aws4_auth_completion();
+}
+
+void RGWDeleteMultiObj_ObjStore_S3::send_status()
+{
+ if (! status_dumped) {
+ if (op_ret < 0)
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ status_dumped = true;
+ }
+}
+
+void RGWDeleteMultiObj_ObjStore_S3::begin_response()
+{
+
+ if (!status_dumped) {
+ send_status();
+ }
+
+ dump_start(s);
+ // Explicitly use chunked transfer encoding so that we can stream the result
+ // to the user without having to wait for the full length of it.
+ end_header(s, this, to_mime_type(s->format), CHUNKED_TRANSFER_ENCODING);
+ s->formatter->open_object_section_in_ns("DeleteResult", XMLNS_AWS_S3);
+
+ rgw_flush_formatter(s, s->formatter);
+}
+
void RGWDeleteMultiObj_ObjStore_S3::send_partial_response(const rgw_obj_key& key,
                                                          bool delete_marker,
                                                          const string& marker_version_id,
                                                          int ret,
                                                          boost::asio::deadline_timer *formatter_flush_cond)
{
  /* Emit one per-object result (Deleted or Error element) into the
   * streamed DeleteResult body, and record a matching ops-log entry.
   * An empty key produces no output at all. */
  if (!key.empty()) {
    delete_multi_obj_entry ops_log_entry;
    ops_log_entry.key = key.name;
    ops_log_entry.version_id = key.instance;
    if (ret == 0) {
      /* success: in quiet mode S3 suppresses Deleted elements, but the
       * ops-log entry is still recorded */
      ops_log_entry.error = false;
      ops_log_entry.http_status = 200;
      ops_log_entry.delete_marker = delete_marker;
      if (delete_marker) {
        ops_log_entry.marker_version_id = marker_version_id;
      }
      if (!quiet) {
        s->formatter->open_object_section("Deleted");
        s->formatter->dump_string("Key", key.name);
        if (!key.instance.empty()) {
          s->formatter->dump_string("VersionId", key.instance);
        }
        if (delete_marker) {
          s->formatter->dump_bool("DeleteMarker", true);
          s->formatter->dump_string("DeleteMarkerVersionId", marker_version_id);
        }
        s->formatter->close_section();
      }
    } else if (ret < 0) {
      /* failure: translate errno to the S3 error code; note that Message
       * is populated with the S3 code as well */
      struct rgw_http_error r;
      int err_no;

      s->formatter->open_object_section("Error");

      err_no = -ret;
      rgw_get_errno_s3(&r, err_no);

      ops_log_entry.error = true;
      ops_log_entry.http_status = r.http_ret;
      ops_log_entry.error_message = r.s3_code;

      s->formatter->dump_string("Key", key.name);
      s->formatter->dump_string("VersionId", key.instance);
      s->formatter->dump_string("Code", r.s3_code);
      s->formatter->dump_string("Message", r.s3_code);
      s->formatter->close_section();
    }

    ops_log_entries.push_back(std::move(ops_log_entry));
    /* with a flush condition, cancel() presumably wakes an async flusher
     * waiting on the timer — confirm against the caller; otherwise flush
     * the formatter inline */
    if (formatter_flush_cond) {
      formatter_flush_cond->cancel();
    } else {
      rgw_flush_formatter(s, s->formatter);
    }
  }
}
+
+void RGWDeleteMultiObj_ObjStore_S3::end_response()
+{
+
+ s->formatter->close_section();
+ rgw_flush_formatter_and_reset(s, s->formatter);
+}
+
+void RGWGetObjLayout_ObjStore_S3::send_response()
+{
+ if (op_ret)
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s, this, "application/json");
+
+ JSONFormatter f;
+
+ if (op_ret < 0) {
+ return;
+ }
+
+ f.open_object_section("result");
+ s->object->dump_obj_layout(this, s->yield, &f);
+ f.close_section();
+ rgw_flush_formatter(s, &f);
+}
+
+int RGWConfigBucketMetaSearch_ObjStore_S3::get_params(optional_yield y)
+{
+ auto iter = s->info.x_meta_map.find("x-amz-meta-search");
+ if (iter == s->info.x_meta_map.end()) {
+ s->err.message = "X-Rgw-Meta-Search header not provided";
+ ldpp_dout(this, 5) << s->err.message << dendl;
+ return -EINVAL;
+ }
+
+ list<string> expressions;
+ get_str_list(iter->second, ",", expressions);
+
+ for (auto& expression : expressions) {
+ vector<string> args;
+ get_str_vec(expression, ";", args);
+
+ if (args.empty()) {
+ s->err.message = "invalid empty expression";
+ ldpp_dout(this, 5) << s->err.message << dendl;
+ return -EINVAL;
+ }
+ if (args.size() > 2) {
+ s->err.message = string("invalid expression: ") + expression;
+ ldpp_dout(this, 5) << s->err.message << dendl;
+ return -EINVAL;
+ }
+
+ string key = boost::algorithm::to_lower_copy(rgw_trim_whitespace(args[0]));
+ string val;
+ if (args.size() > 1) {
+ val = boost::algorithm::to_lower_copy(rgw_trim_whitespace(args[1]));
+ }
+
+ if (!boost::algorithm::starts_with(key, RGW_AMZ_META_PREFIX)) {
+ s->err.message = string("invalid expression, key must start with '" RGW_AMZ_META_PREFIX "' : ") + expression;
+ ldpp_dout(this, 5) << s->err.message << dendl;
+ return -EINVAL;
+ }
+
+ key = key.substr(sizeof(RGW_AMZ_META_PREFIX) - 1);
+
+ ESEntityTypeMap::EntityType entity_type;
+
+ if (val.empty() || val == "str" || val == "string") {
+ entity_type = ESEntityTypeMap::ES_ENTITY_STR;
+ } else if (val == "int" || val == "integer") {
+ entity_type = ESEntityTypeMap::ES_ENTITY_INT;
+ } else if (val == "date" || val == "datetime") {
+ entity_type = ESEntityTypeMap::ES_ENTITY_DATE;
+ } else {
+ s->err.message = string("invalid entity type: ") + val;
+ ldpp_dout(this, 5) << s->err.message << dendl;
+ return -EINVAL;
+ }
+
+ mdsearch_config[key] = entity_type;
+ }
+
+ return 0;
+}
+
+void RGWConfigBucketMetaSearch_ObjStore_S3::send_response()
+{
+ if (op_ret)
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s, this);
+}
+
+void RGWGetBucketMetaSearch_ObjStore_S3::send_response()
+{
+ if (op_ret)
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s, NULL, to_mime_type(s->format));
+
+ Formatter *f = s->formatter;
+ f->open_array_section("GetBucketMetaSearchResult");
+ for (auto& e : s->bucket->get_info().mdsearch_config) {
+ f->open_object_section("Entry");
+ string k = string("x-amz-meta-") + e.first;
+ f->dump_string("Key", k.c_str());
+ const char *type;
+ switch (e.second) {
+ case ESEntityTypeMap::ES_ENTITY_INT:
+ type = "int";
+ break;
+ case ESEntityTypeMap::ES_ENTITY_DATE:
+ type = "date";
+ break;
+ default:
+ type = "str";
+ }
+ f->dump_string("Type", type);
+ f->close_section();
+ }
+ f->close_section();
+ rgw_flush_formatter(s, f);
+}
+
+void RGWDelBucketMetaSearch_ObjStore_S3::send_response()
+{
+ if (op_ret)
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s, this);
+}
+
+void RGWPutBucketObjectLock_ObjStore_S3::send_response()
+{
+ if (op_ret) {
+ set_req_state_err(s, op_ret);
+ }
+ dump_errno(s);
+ end_header(s);
+}
+
+void RGWGetBucketObjectLock_ObjStore_S3::send_response()
+{
+ if (op_ret) {
+ set_req_state_err(s, op_ret);
+ }
+ dump_errno(s);
+ end_header(s, this, to_mime_type(s->format));
+ dump_start(s);
+
+ if (op_ret) {
+ return;
+ }
+ encode_xml("ObjectLockConfiguration", s->bucket->get_info().obj_lock, s->formatter);
+ rgw_flush_formatter_and_reset(s, s->formatter);
+}
+
+
+int RGWPutObjRetention_ObjStore_S3::get_params(optional_yield y)
+{
+ const char *bypass_gov_header = s->info.env->get("HTTP_X_AMZ_BYPASS_GOVERNANCE_RETENTION");
+ if (bypass_gov_header) {
+ std::string bypass_gov_decoded = url_decode(bypass_gov_header);
+ bypass_governance_mode = boost::algorithm::iequals(bypass_gov_decoded, "true");
+ }
+
+ const auto max_size = s->cct->_conf->rgw_max_put_param_size;
+ std::tie(op_ret, data) = read_all_input(s, max_size, false);
+ return op_ret;
+}
+
+void RGWPutObjRetention_ObjStore_S3::send_response()
+{
+ if (op_ret) {
+ set_req_state_err(s, op_ret);
+ }
+ dump_errno(s);
+ end_header(s);
+}
+
void RGWGetObjRetention_ObjStore_S3::send_response()
{
  // GET ?retention: emit the Retention XML (filled in by execute()) only
  // when the op succeeded.
  if (op_ret) {
    set_req_state_err(s, op_ret);
  }
  dump_errno(s);
  end_header(s, this, to_mime_type(s->format));
  dump_start(s);

  if (op_ret) {
    return; // error already reported; no body
  }
  encode_xml("Retention", obj_retention, s->formatter);
  rgw_flush_formatter_and_reset(s, s->formatter);
}
+
void RGWPutObjLegalHold_ObjStore_S3::send_response()
{
  // PUT ?legal-hold has no response body; status/headers only.
  if (op_ret) {
    set_req_state_err(s, op_ret);
  }
  dump_errno(s);
  end_header(s);
}
+
void RGWGetObjLegalHold_ObjStore_S3::send_response()
{
  // GET ?legal-hold: emit the LegalHold XML only when the op succeeded.
  if (op_ret) {
    set_req_state_err(s, op_ret);
  }
  dump_errno(s);
  end_header(s, this, to_mime_type(s->format));
  dump_start(s);

  if (op_ret) {
    return; // error already reported; no body
  }
  encode_xml("LegalHold", obj_legal_hold, s->formatter);
  rgw_flush_formatter_and_reset(s, s->formatter);
}
+
void RGWGetBucketPolicyStatus_ObjStore_S3::send_response()
{
  if (op_ret) {
    set_req_state_err(s, op_ret);
  }
  dump_errno(s);
  end_header(s, this, to_mime_type(s->format));
  dump_start(s);

  // NOTE(review): unlike most GET responders above there is no early
  // return on op_ret, so the PolicyStatus body is emitted even after an
  // error status was set - confirm this is intentional.
  s->formatter->open_object_section_in_ns("PolicyStatus", XMLNS_AWS_S3);
  // https://docs.aws.amazon.com/AmazonS3/latest/API/RESTBucketGETPolicyStatus.html
  // mentions TRUE and FALSE, but boto/aws official clients seem to want lower
  // case which is returned by AWS as well; so let's be bug to bug compatible
  // with the API
  s->formatter->dump_bool("IsPublic", isPublic);
  s->formatter->close_section();
  rgw_flush_formatter_and_reset(s, s->formatter);

}
+
void RGWPutBucketPublicAccessBlock_ObjStore_S3::send_response()
{
  // PUT ?publicAccessBlock has no response body; status/headers only.
  if (op_ret) {
    set_req_state_err(s, op_ret);
  }
  dump_errno(s);
  end_header(s);
}
+
void RGWGetBucketPublicAccessBlock_ObjStore_S3::send_response()
{
  if (op_ret) {
    set_req_state_err(s, op_ret);
  }
  dump_errno(s);
  end_header(s, this, to_mime_type(s->format));
  dump_start(s);

  // NOTE(review): no early return on op_ret here, so the configuration
  // XML is dumped even on error - confirm this is intentional.
  access_conf.dump_xml(s->formatter);
  rgw_flush_formatter_and_reset(s, s->formatter);
}
+
+RGWOp *RGWHandler_REST_Service_S3::op_get()
+{
+ if (is_usage_op()) {
+ return new RGWGetUsage_ObjStore_S3;
+ } else {
+ return new RGWListBuckets_ObjStore_S3;
+ }
+}
+
RGWOp *RGWHandler_REST_Service_S3::op_head()
{
  // HEAD on the service root is served by the bucket-listing op.
  return new RGWListBuckets_ObjStore_S3;
}
+
+RGWOp *RGWHandler_REST_Bucket_S3::get_obj_op(bool get_data) const
+{
+ // Non-website mode
+ if (get_data) {
+ int list_type = 1;
+ s->info.args.get_int("list-type", &list_type, 1);
+ switch (list_type) {
+ case 1:
+ return new RGWListBucket_ObjStore_S3;
+ case 2:
+ return new RGWListBucket_ObjStore_S3v2;
+ default:
+ ldpp_dout(s, 5) << __func__ << ": unsupported list-type " << list_type << dendl;
+ return new RGWListBucket_ObjStore_S3;
+ }
+ } else {
+ return new RGWStatBucket_ObjStore_S3;
+ }
+}
+
RGWOp *RGWHandler_REST_Bucket_S3::op_get()
{
  // Dispatch GET-on-bucket by sub-resource. Evaluation order defines
  // precedence; the final fallback is an object listing.

  // NOTE(review): ?encryption is rejected here yet also routed below via
  // is_bucket_encryption_op(); presumably sub_resource_exists() matches
  // only the "encryption=value" form - confirm.
  if (s->info.args.sub_resource_exists("encryption"))
    return nullptr;

  if (s->info.args.sub_resource_exists("logging"))
    return new RGWGetBucketLogging_ObjStore_S3;

  if (s->info.args.sub_resource_exists("location"))
    return new RGWGetBucketLocation_ObjStore_S3;

  if (s->info.args.sub_resource_exists("versioning"))
    return new RGWGetBucketVersioning_ObjStore_S3;

  // website config is only exposed when static websites are enabled
  if (s->info.args.sub_resource_exists("website")) {
    if (!s->cct->_conf->rgw_enable_static_website) {
      return NULL;
    }
    return new RGWGetBucketWebsite_ObjStore_S3;
  }

  if (s->info.args.exists("mdsearch")) {
    return new RGWGetBucketMetaSearch_ObjStore_S3;
  }

  if (is_acl_op()) {
    return new RGWGetACLs_ObjStore_S3;
  } else if (is_cors_op()) {
    return new RGWGetCORS_ObjStore_S3;
  } else if (is_request_payment_op()) {
    return new RGWGetRequestPayment_ObjStore_S3;
  } else if (s->info.args.exists("uploads")) {
    return new RGWListBucketMultiparts_ObjStore_S3;
  } else if(is_lc_op()) {
    return new RGWGetLC_ObjStore_S3;
  } else if(is_policy_op()) {
    return new RGWGetBucketPolicy;
  } else if (is_tagging_op()) {
    return new RGWGetBucketTags_ObjStore_S3;
  } else if (is_object_lock_op()) {
    return new RGWGetBucketObjectLock_ObjStore_S3;
  } else if (is_notification_op()) {
    return RGWHandler_REST_PSNotifs_S3::create_get_op();
  } else if (is_replication_op()) {
    return new RGWGetBucketReplication_ObjStore_S3;
  } else if (is_policy_status_op()) {
    return new RGWGetBucketPolicyStatus_ObjStore_S3;
  } else if (is_block_public_access_op()) {
    return new RGWGetBucketPublicAccessBlock_ObjStore_S3;
  } else if (is_bucket_encryption_op()) {
    return new RGWGetBucketEncryption_ObjStore_S3;
  }
  // no sub-resource: plain bucket listing
  return get_obj_op(true);
}
+
RGWOp *RGWHandler_REST_Bucket_S3::op_head()
{
  // HEAD on a bucket: only ?acl and ?uploads are special; everything
  // else stats the bucket via get_obj_op(false).
  if (is_acl_op()) {
    return new RGWGetACLs_ObjStore_S3;
  } else if (s->info.args.exists("uploads")) {
    return new RGWListBucketMultiparts_ObjStore_S3;
  }
  return get_obj_op(false);
}
+
RGWOp *RGWHandler_REST_Bucket_S3::op_put()
{
  // Dispatch PUT-on-bucket by sub-resource; the fallback (no
  // sub-resource) is bucket creation.

  // PUT ?logging / ?encryption (with a value) are not supported
  if (s->info.args.sub_resource_exists("logging") ||
      s->info.args.sub_resource_exists("encryption"))
    return nullptr;
  if (s->info.args.sub_resource_exists("versioning"))
    return new RGWSetBucketVersioning_ObjStore_S3;
  if (s->info.args.sub_resource_exists("website")) {
    if (!s->cct->_conf->rgw_enable_static_website) {
      return NULL;
    }
    return new RGWSetBucketWebsite_ObjStore_S3;
  }
  if (is_tagging_op()) {
    return new RGWPutBucketTags_ObjStore_S3;
  } else if (is_acl_op()) {
    return new RGWPutACLs_ObjStore_S3;
  } else if (is_cors_op()) {
    return new RGWPutCORS_ObjStore_S3;
  } else if (is_request_payment_op()) {
    return new RGWSetRequestPayment_ObjStore_S3;
  } else if(is_lc_op()) {
    return new RGWPutLC_ObjStore_S3;
  } else if(is_policy_op()) {
    return new RGWPutBucketPolicy;
  } else if (is_object_lock_op()) {
    return new RGWPutBucketObjectLock_ObjStore_S3;
  } else if (is_notification_op()) {
    return RGWHandler_REST_PSNotifs_S3::create_put_op();
  } else if (is_replication_op()) {
    // replication config is rejected outright when the zonegroup still
    // uses the legacy sync policy
    RGWBucketSyncPolicyHandlerRef sync_policy_handler;
    int ret = driver->get_sync_policy_handler(s, nullopt, nullopt,
                                              &sync_policy_handler, null_yield);
    if (ret < 0 || !sync_policy_handler ||
        sync_policy_handler->is_legacy_config()) {
      return nullptr;
    }

    return new RGWPutBucketReplication_ObjStore_S3;
  } else if (is_block_public_access_op()) {
    return new RGWPutBucketPublicAccessBlock_ObjStore_S3;
  } else if (is_bucket_encryption_op()) {
    return new RGWPutBucketEncryption_ObjStore_S3;
  }
  return new RGWCreateBucket_ObjStore_S3;
}
+
RGWOp *RGWHandler_REST_Bucket_S3::op_delete()
{
  // Dispatch DELETE-on-bucket by sub-resource; the fallback deletes the
  // bucket itself.

  // DELETE ?logging / ?encryption (with a value) are not supported
  if (s->info.args.sub_resource_exists("logging") ||
      s->info.args.sub_resource_exists("encryption"))
    return nullptr;

  if (is_tagging_op()) {
    return new RGWDeleteBucketTags_ObjStore_S3;
  } else if (is_cors_op()) {
    return new RGWDeleteCORS_ObjStore_S3;
  } else if(is_lc_op()) {
    return new RGWDeleteLC_ObjStore_S3;
  } else if(is_policy_op()) {
    return new RGWDeleteBucketPolicy;
  } else if (is_notification_op()) {
    return RGWHandler_REST_PSNotifs_S3::create_delete_op();
  } else if (is_replication_op()) {
    return new RGWDeleteBucketReplication_ObjStore_S3;
  } else if (is_block_public_access_op()) {
    return new RGWDeleteBucketPublicAccessBlock;
  } else if (is_bucket_encryption_op()) {
    return new RGWDeleteBucketEncryption_ObjStore_S3;
  }

  if (s->info.args.sub_resource_exists("website")) {
    if (!s->cct->_conf->rgw_enable_static_website) {
      return NULL;
    }
    return new RGWDeleteBucketWebsite_ObjStore_S3;
  }

  if (s->info.args.exists("mdsearch")) {
    return new RGWDelBucketMetaSearch_ObjStore_S3;
  }

  return new RGWDeleteBucket_ObjStore_S3;
}
+
RGWOp *RGWHandler_REST_Bucket_S3::op_post()
{
  // POST on a bucket: multi-object delete, mdsearch configuration, or
  // (default) a browser form upload.
  if (s->info.args.exists("delete")) {
    return new RGWDeleteMultiObj_ObjStore_S3;
  }

  if (s->info.args.exists("mdsearch")) {
    return new RGWConfigBucketMetaSearch_ObjStore_S3;
  }

  return new RGWPostObj_ObjStore_S3;
}
+
RGWOp *RGWHandler_REST_Bucket_S3::op_options()
{
  // OPTIONS requests are handled as CORS preflights.
  return new RGWOptionsCORS_ObjStore_S3;
}
+
+RGWOp *RGWHandler_REST_Obj_S3::get_obj_op(bool get_data)
+{
+ RGWGetObj_ObjStore_S3 *get_obj_op = new RGWGetObj_ObjStore_S3;
+ get_obj_op->set_get_data(get_data);
+ return get_obj_op;
+}
+
RGWOp *RGWHandler_REST_Obj_S3::op_get()
{
  // GET on an object: sub-resources (?acl, ?uploadId, ?layout, ?tagging,
  // ?retention, ?legal-hold) take precedence over a plain object read.
  if (is_acl_op()) {
    return new RGWGetACLs_ObjStore_S3;
  } else if (s->info.args.exists("uploadId")) {
    return new RGWListMultipart_ObjStore_S3;
  } else if (s->info.args.exists("layout")) {
    return new RGWGetObjLayout_ObjStore_S3;
  } else if (is_tagging_op()) {
    return new RGWGetObjTags_ObjStore_S3;
  } else if (is_obj_retention_op()) {
    return new RGWGetObjRetention_ObjStore_S3;
  } else if (is_obj_legal_hold_op()) {
    return new RGWGetObjLegalHold_ObjStore_S3;
  }
  return get_obj_op(true);
}
+
RGWOp *RGWHandler_REST_Obj_S3::op_head()
{
  // HEAD on an object: ?acl and ?uploadId are special; otherwise a
  // body-less object read.
  if (is_acl_op()) {
    return new RGWGetACLs_ObjStore_S3;
  } else if (s->info.args.exists("uploadId")) {
    return new RGWListMultipart_ObjStore_S3;
  }
  return get_obj_op(false);
}
+
RGWOp *RGWHandler_REST_Obj_S3::op_put()
{
  // PUT on an object: sub-resources first; then a copy if
  // x-amz-copy-source populated init_state.src_bucket (see init()),
  // otherwise a plain upload.
  if (is_acl_op()) {
    return new RGWPutACLs_ObjStore_S3;
  } else if (is_tagging_op()) {
    return new RGWPutObjTags_ObjStore_S3;
  } else if (is_obj_retention_op()) {
    return new RGWPutObjRetention_ObjStore_S3;
  } else if (is_obj_legal_hold_op()) {
    return new RGWPutObjLegalHold_ObjStore_S3;
  }

  if (s->init_state.src_bucket.empty())
    return new RGWPutObj_ObjStore_S3;
  else
    return new RGWCopyObj_ObjStore_S3;
}
+
+RGWOp *RGWHandler_REST_Obj_S3::op_delete()
+{
+ if (is_tagging_op()) {
+ return new RGWDeleteObjTags_ObjStore_S3;
+ }
+ string upload_id = s->info.args.get("uploadId");
+
+ if (upload_id.empty())
+ return new RGWDeleteObj_ObjStore_S3;
+ else
+ return new RGWAbortMultipart_ObjStore_S3;
+}
+
RGWOp *RGWHandler_REST_Obj_S3::op_post()
{
  // POST on an object: complete / initiate multipart, S3 Select, or
  // (default) a form upload.
  if (s->info.args.exists("uploadId"))
    return new RGWCompleteMultipart_ObjStore_S3;

  if (s->info.args.exists("uploads"))
    return new RGWInitMultipart_ObjStore_S3;

  if (is_select_op())
    return rgw::s3select::create_s3select_op();

  return new RGWPostObj_ObjStore_S3;
}
+
RGWOp *RGWHandler_REST_Obj_S3::op_options()
{
  // OPTIONS requests are handled as CORS preflights.
  return new RGWOptionsCORS_ObjStore_S3;
}
+
/*
 * Parse the request line: populate s->info.args, allocate the output
 * formatter, and split the URL path into bucket and object components.
 * Returns 0 on success or a negative error from formatter allocation.
 */
int RGWHandler_REST_S3::init_from_header(rgw::sal::Driver* driver,
                                         req_state* s,
                                         RGWFormat default_formatter,
                                         bool configurable_format)
{
  string req;
  string first;

  const char *req_name = s->relative_uri.c_str();
  const char *p;

  // a URI that is nothing but a query string carries the params itself
  if (*req_name == '?') {
    p = req_name;
  } else {
    p = s->info.request_params.c_str();
  }

  s->info.args.set(p);
  s->info.args.parse(s);

  /* must be called after the args parsing */
  int ret = allocate_formatter(s, default_formatter, configurable_format);
  if (ret < 0)
    return ret;

  if (*req_name != '/')
    return 0;

  req_name++;

  if (!*req_name)
    return 0;

  req = req_name;
  int pos = req.find('/');
  if (pos >= 0) {
    // "bucket/object...": the first path segment is the bucket
    first = req.substr(0, pos);
  } else {
    first = req;
  }

  /*
   * XXX The intent of the check for empty is apparently to let the bucket
   * name from DNS to be set ahead. However, we currently take the DNS
   * bucket and re-insert it into URL in rgw_rest.cc:RGWREST::preprocess().
   * So, this check is meaningless.
   *
   * Rather than dropping this, the code needs to be changed into putting
   * the bucket (and its tenant) from DNS and Host: header (HTTP_HOST)
   * into req_status.bucket_name directly.
   */
  if (s->init_state.url_bucket.empty()) {
    // Save bucket to tide us over until token is parsed.
    s->init_state.url_bucket = first;
    string encoded_obj_str;
    if (pos >= 0) {
      encoded_obj_str = req.substr(pos+1);
    }

    /* dang: s->bucket is never set here, since it's created with permissions.
     * These calls will always create an object with no bucket. */
    if (!encoded_obj_str.empty()) {
      if (s->bucket) {
        s->object = s->bucket->get_object(rgw_obj_key(encoded_obj_str, s->info.args.get("versionId")));
      } else {
        s->object = driver->get_object(rgw_obj_key(encoded_obj_str, s->info.args.get("versionId")));
      }
    }
  } else {
    // bucket already known (e.g. virtual-host style request): the whole
    // remaining path is the object key
    if (s->bucket) {
      s->object = s->bucket->get_object(rgw_obj_key(req_name, s->info.args.get("versionId")));
    } else {
      s->object = driver->get_object(rgw_obj_key(req_name, s->info.args.get("versionId")));
    }
  }
  return 0;
}
+
+int RGWHandler_REST_S3::postauth_init(optional_yield y)
+{
+ struct req_init_state *t = &s->init_state;
+
+ int ret = rgw_parse_url_bucket(t->url_bucket, s->user->get_tenant(),
+ s->bucket_tenant, s->bucket_name);
+ if (ret) {
+ return ret;
+ }
+ if (s->auth.identity->get_identity_type() == TYPE_ROLE) {
+ s->bucket_tenant = s->auth.identity->get_role_tenant();
+ }
+
+ ldpp_dout(s, 10) << "s->object=" << s->object
+ << " s->bucket=" << rgw_make_bucket_entry_name(s->bucket_tenant, s->bucket_name) << dendl;
+
+ ret = rgw_validate_tenant_name(s->bucket_tenant);
+ if (ret)
+ return ret;
+ if (!s->bucket_name.empty() && !rgw::sal::Object::empty(s->object.get())) {
+ ret = validate_object_name(s->object->get_name());
+ if (ret)
+ return ret;
+ }
+
+ if (!t->src_bucket.empty()) {
+ string auth_tenant;
+ if (s->auth.identity->get_identity_type() == TYPE_ROLE) {
+ auth_tenant = s->auth.identity->get_role_tenant();
+ } else {
+ auth_tenant = s->user->get_tenant();
+ }
+ ret = rgw_parse_url_bucket(t->src_bucket, auth_tenant,
+ s->src_tenant_name, s->src_bucket_name);
+ if (ret) {
+ return ret;
+ }
+ ret = rgw_validate_tenant_name(s->src_tenant_name);
+ if (ret)
+ return ret;
+ }
+
+ const char *mfa = s->info.env->get("HTTP_X_AMZ_MFA");
+ if (mfa) {
+ ret = s->user->verify_mfa(string(mfa), &s->mfa_verified, s, y);
+ }
+
+ return 0;
+}
+
+int RGWHandler_REST_S3::init(rgw::sal::Driver* driver, req_state *s,
+ rgw::io::BasicClient *cio)
+{
+ int ret;
+
+ s->dialect = "s3";
+
+ ret = rgw_validate_tenant_name(s->bucket_tenant);
+ if (ret)
+ return ret;
+ if (!s->bucket_name.empty()) {
+ ret = validate_object_name(s->object->get_name());
+ if (ret)
+ return ret;
+ }
+
+ const char *cacl = s->info.env->get("HTTP_X_AMZ_ACL");
+ if (cacl)
+ s->canned_acl = cacl;
+
+ s->has_acl_header = s->info.env->exists_prefix("HTTP_X_AMZ_GRANT");
+
+ const char *copy_source = s->info.env->get("HTTP_X_AMZ_COPY_SOURCE");
+ if (copy_source &&
+ (! s->info.env->get("HTTP_X_AMZ_COPY_SOURCE_RANGE")) &&
+ (! s->info.args.exists("uploadId"))) {
+ rgw_obj_key key;
+
+ ret = RGWCopyObj::parse_copy_location(copy_source,
+ s->init_state.src_bucket,
+ key,
+ s);
+ if (!ret) {
+ ldpp_dout(s, 0) << "failed to parse copy location" << dendl;
+ return -EINVAL; // XXX why not -ERR_INVALID_BUCKET_NAME or -ERR_BAD_URL?
+ }
+ s->src_object = driver->get_object(key);
+ }
+
+ const char *sc = s->info.env->get("HTTP_X_AMZ_STORAGE_CLASS");
+ if (sc) {
+ s->info.storage_class = sc;
+ }
+
+ return RGWHandler_REST::init(driver, s, cio);
+}
+
int RGWHandler_REST_S3::authorize(const DoutPrefixProvider *dpp, optional_yield y)
{
  // AssumeRoleWithWebIdentity is authenticated by the STS engine;
  // everything else goes through the regular S3 auth strategies.
  if (s->info.args.exists("Action") && s->info.args.get("Action") == "AssumeRoleWithWebIdentity") {
    return RGW_Auth_STS::authorize(dpp, driver, auth_registry, s, y);
  }
  return RGW_Auth_S3::authorize(dpp, driver, auth_registry, s, y);
}
+
// Which AWS signature version the request carries.
enum class AwsVersion {
  UNKNOWN,
  V2,
  V4
};

// Where the credentials were supplied: query string (presigned URL) or
// the Authorization header.
enum class AwsRoute {
  UNKNOWN,
  QUERY_STRING,
  HEADERS
};
+
+static inline std::pair<AwsVersion, AwsRoute>
+discover_aws_flavour(const req_info& info)
+{
+ using rgw::auth::s3::AWS4_HMAC_SHA256_STR;
+
+ AwsVersion version = AwsVersion::UNKNOWN;
+ AwsRoute route = AwsRoute::UNKNOWN;
+
+ const char* http_auth = info.env->get("HTTP_AUTHORIZATION");
+ if (http_auth && http_auth[0]) {
+ /* Authorization in Header */
+ route = AwsRoute::HEADERS;
+
+ if (!strncmp(http_auth, AWS4_HMAC_SHA256_STR,
+ strlen(AWS4_HMAC_SHA256_STR))) {
+ /* AWS v4 */
+ version = AwsVersion::V4;
+ } else if (!strncmp(http_auth, "AWS ", 4)) {
+ /* AWS v2 */
+ version = AwsVersion::V2;
+ }
+ } else {
+ route = AwsRoute::QUERY_STRING;
+
+ if (info.args.get("x-amz-algorithm") == AWS4_HMAC_SHA256_STR) {
+ /* AWS v4 */
+ version = AwsVersion::V4;
+ } else if (!info.args.get("AWSAccessKeyId").empty()) {
+ /* AWS v2 */
+ version = AwsVersion::V2;
+ }
+ }
+
+ return std::make_pair(version, route);
+}
+
/*
 * verify that a signed request comes from the keyholder
 * by checking the signature against our locally-computed version
 *
 * it tries AWS v4 before AWS v2
 */
int RGW_Auth_S3::authorize(const DoutPrefixProvider *dpp,
                           rgw::sal::Driver* const driver,
                           const rgw::auth::StrategyRegistry& auth_registry,
                           req_state* const s, optional_yield y)
{

  /* no auth backend (rados, keystone or ldap) enabled; warn and refuse */
  if (!driver->ctx()->_conf->rgw_s3_auth_use_rados &&
      !driver->ctx()->_conf->rgw_s3_auth_use_keystone &&
      !driver->ctx()->_conf->rgw_s3_auth_use_ldap) {
    ldpp_dout(dpp, 0) << "WARNING: no authorization backend enabled! Users will never authenticate." << dendl;
    return -EPERM;
  }

  const auto ret = rgw::auth::Strategy::apply(dpp, auth_registry.get_s3_main(), s, y);
  if (ret == 0) {
    /* Populate the owner info. */
    s->owner.set_id(s->user->get_id());
    s->owner.set_name(s->user->get_display_name());
  }
  return ret;
}
+
int RGWHandler_Auth_S3::init(rgw::sal::Driver* driver, req_state *state,
                             rgw::io::BasicClient *cio)
{
  // Auth-only handler: parse URL/args (JSON output by default), then run
  // the generic REST initialization.
  int ret = RGWHandler_REST_S3::init_from_header(driver, state, RGWFormat::JSON, true);
  if (ret < 0)
    return ret;

  return RGWHandler_REST::init(driver, state, cio);
}
+
+namespace {
+// utility classes and functions for handling parameters with the following format:
+// Attributes.entry.{N}.{key|value}={VALUE}
+// N - any unsigned number
+// VALUE - url encoded string
+
+// and Attribute is holding key and value
+// ctor and set are done according to the "type" argument
+// if type is not "key" or "value" its a no-op
+class Attribute {
+ std::string key;
+ std::string value;
+public:
+ Attribute(const std::string& type, const std::string& key_or_value) {
+ set(type, key_or_value);
+ }
+ void set(const std::string& type, const std::string& key_or_value) {
+ if (type == "key") {
+ key = key_or_value;
+ } else if (type == "value") {
+ value = key_or_value;
+ }
+ }
+ const std::string& get_key() const { return key; }
+ const std::string& get_value() const { return value; }
+};
+
+using AttributeMap = std::map<unsigned, Attribute>;
+
+// aggregate the attributes into a map
+// the key and value are associated by the index (N)
+// no assumptions are made on the order in which these parameters are added
+void update_attribute_map(const std::string& input, AttributeMap& map) {
+ const boost::char_separator<char> sep(".");
+ const boost::tokenizer tokens(input, sep);
+ auto token = tokens.begin();
+ if (*token != "Attributes") {
+ return;
+ }
+ ++token;
+
+ if (*token != "entry") {
+ return;
+ }
+ ++token;
+
+ unsigned idx;
+ try {
+ idx = std::stoul(*token);
+ } catch (const std::invalid_argument&) {
+ return;
+ }
+ ++token;
+
+ std::string key_or_value = "";
+ // get the rest of the string regardless of dots
+ // this is to allow dots in the value
+ while (token != tokens.end()) {
+ key_or_value.append(*token+".");
+ ++token;
+ }
+ // remove last separator
+ key_or_value.pop_back();
+
+ auto pos = key_or_value.find("=");
+ if (pos != std::string::npos) {
+ const auto key_or_value_lhs = key_or_value.substr(0, pos);
+ const auto key_or_value_rhs = url_decode(key_or_value.substr(pos + 1, key_or_value.size() - 1));
+ const auto map_it = map.find(idx);
+ if (map_it == map.end()) {
+ // new entry
+ map.emplace(std::make_pair(idx, Attribute(key_or_value_lhs, key_or_value_rhs)));
+ } else {
+ // existing entry
+ map_it->second.set(key_or_value_lhs, key_or_value_rhs);
+ }
+ }
+}
+}
+
void parse_post_action(const std::string& post_body, req_state* s)
{
  // Explode a url-encoded POST body ("Action=...&k=v&...") into
  // s->info.args so the IAM/STS/topic handlers can treat body parameters
  // like query parameters. Attributes.entry.N.{key,value} pairs are
  // aggregated by index first, then appended as regular args.
  if (post_body.size() > 0) {
    ldpp_dout(s, 10) << "Content of POST: " << post_body << dendl;

    if (post_body.find("Action") != string::npos) {
      const boost::char_separator<char> sep("&");
      const boost::tokenizer<boost::char_separator<char>> tokens(post_body, sep);
      AttributeMap map;
      for (const auto& t : tokens) {
        const auto pos = t.find("=");
        if (pos != string::npos) {
          const auto key = t.substr(0, pos);
          if (boost::starts_with(key, "Attributes.")) {
            update_attribute_map(t, map);
          } else {
            s->info.args.append(t.substr(0, pos),
                              url_decode(t.substr(pos+1, t.size() -1)));
          }
        }
      }
      // update the regular args with the content of the attribute map
      for (const auto& attr : map) {
        s->info.args.append(attr.second.get_key(), attr.second.get_value());
      }
    }
  }
  // always record the v4 payload hash; it is needed later to verify
  // signed non-s3 POST requests
  const auto payload_hash = rgw::auth::s3::calc_v4_payload_hash(post_body);
  s->info.args.append("PayloadHash", payload_hash);
}
+
+RGWHandler_REST* RGWRESTMgr_S3::get_handler(rgw::sal::Driver* driver,
+ req_state* const s,
+ const rgw::auth::StrategyRegistry& auth_registry,
+ const std::string& frontend_prefix)
+{
+ bool is_s3website = enable_s3website && (s->prot_flags & RGW_REST_WEBSITE);
+ int ret =
+ RGWHandler_REST_S3::init_from_header(driver, s,
+ is_s3website ? RGWFormat::HTML :
+ RGWFormat::XML, true);
+ if (ret < 0) {
+ return nullptr;
+ }
+
+ if (is_s3website) {
+ if (s->init_state.url_bucket.empty()) {
+ return new RGWHandler_REST_Service_S3Website(auth_registry);
+ }
+ if (rgw::sal::Object::empty(s->object.get())) {
+ return new RGWHandler_REST_Bucket_S3Website(auth_registry);
+ }
+ return new RGWHandler_REST_Obj_S3Website(auth_registry);
+ }
+
+ if (s->init_state.url_bucket.empty()) {
+ // no bucket
+ if (s->op == OP_POST) {
+ // POST will be one of: IAM, STS or topic service
+ const auto max_size = s->cct->_conf->rgw_max_put_param_size;
+ int ret;
+ bufferlist data;
+ std::tie(ret, data) = rgw_rest_read_all_input(s, max_size, false);
+ if (ret < 0) {
+ return nullptr;
+ }
+ parse_post_action(data.to_str(), s);
+ if (enable_sts && RGWHandler_REST_STS::action_exists(s)) {
+ return new RGWHandler_REST_STS(auth_registry);
+ }
+ if (enable_iam && RGWHandler_REST_IAM::action_exists(s)) {
+ return new RGWHandler_REST_IAM(auth_registry, data);
+ }
+ if (enable_pubsub && RGWHandler_REST_PSTopic_AWS::action_exists(s)) {
+ return new RGWHandler_REST_PSTopic_AWS(auth_registry);
+ }
+ return nullptr;
+ }
+ // non-POST S3 service without a bucket
+ return new RGWHandler_REST_Service_S3(auth_registry);
+ }
+ if (!rgw::sal::Object::empty(s->object.get())) {
+ // has object
+ return new RGWHandler_REST_Obj_S3(auth_registry);
+ }
+ if (s->info.args.exist_obj_excl_sub_resource()) {
+ return nullptr;
+ }
+ // has bucket
+ return new RGWHandler_REST_Bucket_S3(auth_registry, enable_pubsub);
+}
+
+bool RGWHandler_REST_S3Website::web_dir() const {
+ std::string subdir_name;
+ if (!rgw::sal::Object::empty(s->object.get())) {
+ subdir_name = url_decode(s->object->get_name());
+ }
+
+ if (subdir_name.empty()) {
+ return false;
+ } else if (subdir_name.back() == '/' && subdir_name.size() > 1) {
+ subdir_name.pop_back();
+ }
+
+ std::unique_ptr<rgw::sal::Object> obj = s->bucket->get_object(rgw_obj_key(subdir_name));
+
+ obj->set_atomic();
+
+ RGWObjState* state = nullptr;
+ if (obj->get_obj_state(s, &state, s->yield) < 0) {
+ return false;
+ }
+ if (! state->exists) {
+ return false;
+ }
+ return state->exists;
+}
+
+int RGWHandler_REST_S3Website::init(rgw::sal::Driver* driver, req_state *s,
+ rgw::io::BasicClient* cio)
+{
+ // save the original object name before retarget() replaces it with the
+ // result of get_effective_key(). the error_handler() needs the original
+ // object name for redirect handling
+ if (!rgw::sal::Object::empty(s->object.get())) {
+ original_object_name = s->object->get_name();
+ } else {
+ original_object_name = "";
+ }
+
+ return RGWHandler_REST_S3::init(driver, s, cio);
+}
+
/*
 * Rewrite a website request onto its effective object: apply the
 * IndexDocument suffix and the bucket's redirect rules. May return
 * -ERR_WEBSITE_REDIRECT with s->redirect populated.
 */
int RGWHandler_REST_S3Website::retarget(RGWOp* op, RGWOp** new_op, optional_yield y) {
  *new_op = op;
  ldpp_dout(s, 10) << __func__ << " Starting retarget" << dendl;

  if (!(s->prot_flags & RGW_REST_WEBSITE))
    return 0;

  if (rgw::sal::Bucket::empty(s->bucket.get())) {
    // TODO-FUTURE: if the bucket does not exist, maybe expose it here?
    return -ERR_NO_SUCH_BUCKET;
  }

  if (!s->bucket->get_info().has_website) {
    // TODO-FUTURE: if the bucket has no WebsiteConfig, expose it here
    return -ERR_NO_SUCH_WEBSITE_CONFIGURATION;
  }

  rgw_obj_key new_obj;
  string key_name;
  if (!rgw::sal::Object::empty(s->object.get())) {
    key_name = s->object->get_name();
  }
  // append the IndexDocument suffix when the key names a directory
  bool get_res = s->bucket->get_info().website_conf.get_effective_key(key_name, &new_obj.name, web_dir());
  if (!get_res) {
    s->err.message = "The IndexDocument Suffix is not configurated or not well formed!";
    ldpp_dout(s, 5) << s->err.message << dendl;
    return -EINVAL;
  }

  ldpp_dout(s, 10) << "retarget get_effective_key " << s->object << " -> "
                   << new_obj << dendl;

  RGWBWRoutingRule rrule;
  bool should_redirect =
    s->bucket->get_info().website_conf.should_redirect(new_obj.name, 0, &rrule);

  if (should_redirect) {
    const string& hostname = s->info.env->get("HTTP_HOST", "");
    const string& protocol =
      (s->info.env->get("SERVER_PORT_SECURE") ? "https" : "http");
    int redirect_code = 0;
    rrule.apply_rule(protocol, hostname, key_name, &s->redirect,
                     &redirect_code);
    // Apply a custom HTTP response code, if the rule specifies one
    if (redirect_code > 0)
      s->err.http_ret = redirect_code;
    ldpp_dout(s, 10) << "retarget redirect code=" << redirect_code
                     << " proto+host:" << protocol << "://" << hostname
                     << " -> " << s->redirect << dendl;
    return -ERR_WEBSITE_REDIRECT;
  }

  /*
   * FIXME: if s->object != new_obj, drop op and create a new op to handle
   * operation. Or remove this comment if it's not applicable anymore
   * dang: This could be problematic, since we're not actually replacing op, but
   * we are replacing s->object. Something might have a pointer to it.
   */
  s->object = s->bucket->get_object(new_obj);

  return 0;
}
+
RGWOp* RGWHandler_REST_S3Website::op_get()
{
  // Website GET always goes through the suffix-rewriting object op.
  return get_obj_op(true);
}
+
RGWOp* RGWHandler_REST_S3Website::op_head()
{
  // Website HEAD uses the same op as GET, without the payload.
  return get_obj_op(false);
}
+
/*
 * Serve the bucket's configured ErrorDocument in place of a plain error
 * page by building and running a fresh GetObj op against errordoc_key.
 * Returns 0 when the page was fully served, -1 to trigger the caller's
 * double-error handler.
 */
int RGWHandler_REST_S3Website::serve_errordoc(const DoutPrefixProvider *dpp, int http_ret, const string& errordoc_key, optional_yield y) {
  int ret = 0;
  s->formatter->reset(); /* Try to throw it all away */

  std::shared_ptr<RGWGetObj_ObjStore_S3Website> getop( static_cast<RGWGetObj_ObjStore_S3Website*>(op_get()));
  if (getop.get() == NULL) {
    return -1; // Trigger double error handler
  }
  getop->init(driver, s, this);
  // serve the whole document: drop any range/conditional headers carried
  // over from the original request
  getop->range_str = NULL;
  getop->if_mod = NULL;
  getop->if_unmod = NULL;
  getop->if_match = NULL;
  getop->if_nomatch = NULL;
  /* This is okay. It's an error, so nothing will run after this, and it can be
   * called by abort_early(), which can be called before s->object or s->bucket
   * are set up. Note, it won't have bucket. */
  s->object = driver->get_object(errordoc_key);

  ret = init_permissions(getop.get(), y);
  if (ret < 0) {
    ldpp_dout(s, 20) << "serve_errordoc failed, init_permissions ret=" << ret << dendl;
    return -1; // Trigger double error handler
  }

  ret = read_permissions(getop.get(), y);
  if (ret < 0) {
    ldpp_dout(s, 20) << "serve_errordoc failed, read_permissions ret=" << ret << dendl;
    return -1; // Trigger double error handler
  }

  // keep reporting the original error status while serving the document
  if (http_ret) {
    getop->set_custom_http_response(http_ret);
  }

  ret = getop->init_processing(y);
  if (ret < 0) {
    ldpp_dout(s, 20) << "serve_errordoc failed, init_processing ret=" << ret << dendl;
    return -1; // Trigger double error handler
  }

  ret = getop->verify_op_mask();
  if (ret < 0) {
    ldpp_dout(s, 20) << "serve_errordoc failed, verify_op_mask ret=" << ret << dendl;
    return -1; // Trigger double error handler
  }

  ret = getop->verify_permission(y);
  if (ret < 0) {
    ldpp_dout(s, 20) << "serve_errordoc failed, verify_permission ret=" << ret << dendl;
    return -1; // Trigger double error handler
  }

  ret = getop->verify_params();
  if (ret < 0) {
    ldpp_dout(s, 20) << "serve_errordoc failed, verify_params ret=" << ret << dendl;
    return -1; // Trigger double error handler
  }

  // No going back now
  getop->pre_exec();
  /*
   * FIXME Missing headers:
   * With a working errordoc, the s3 error fields are rendered as HTTP headers,
   * x-amz-error-code: NoSuchKey
   * x-amz-error-message: The specified key does not exist.
   * x-amz-error-detail-Key: foo
   */
  getop->execute(y);
  getop->complete();
  return 0;
}
+
/*
 * Website error handling: first try the bucket's redirect rules, then
 * the configured ErrorDocument; otherwise return err_no unchanged so the
 * default error rendering applies.
 */
int RGWHandler_REST_S3Website::error_handler(int err_no,
                                             string* error_content,
                                             optional_yield y) {
  int new_err_no = -1;
  // map the rgw error to its HTTP status for redirect-rule matching
  rgw_http_errors::const_iterator r = rgw_http_s3_errors.find(err_no > 0 ? err_no : -err_no);
  int http_error_code = -1;

  if (r != rgw_http_s3_errors.end()) {
    http_error_code = r->second.first;
  }
  ldpp_dout(s, 10) << "RGWHandler_REST_S3Website::error_handler err_no=" << err_no << " http_ret=" << http_error_code << dendl;

  RGWBWRoutingRule rrule;
  bool have_bucket = !rgw::sal::Bucket::empty(s->bucket.get());
  bool should_redirect = false;
  if (have_bucket) {
    should_redirect =
      s->bucket->get_info().website_conf.should_redirect(original_object_name,
                                                         http_error_code, &rrule);
  }

  if (should_redirect) {
    const string& hostname = s->info.env->get("HTTP_HOST", "");
    const string& protocol =
      (s->info.env->get("SERVER_PORT_SECURE") ? "https" : "http");
    int redirect_code = 0;
    rrule.apply_rule(protocol, hostname, original_object_name,
                     &s->redirect, &redirect_code);
    // Apply a custom HTTP response code
    if (redirect_code > 0)
      s->err.http_ret = redirect_code; // Apply a custom HTTP response code
    ldpp_dout(s, 10) << "error handler redirect code=" << redirect_code
                     << " proto+host:" << protocol << "://" << hostname
                     << " -> " << s->redirect << dendl;
    return -ERR_WEBSITE_REDIRECT;
  } else if (err_no == -ERR_WEBSITE_REDIRECT) {
    // Do nothing here, this redirect will be handled in abort_early's ERR_WEBSITE_REDIRECT block
    // Do NOT fire the ErrorDoc handler
  } else if (have_bucket && !s->bucket->get_info().website_conf.error_doc.empty()) {
    /* This serves an entire page!
       On success, it will return zero, and no further content should be sent to the socket
       On failure, we need the double-error handler
     */
    new_err_no = RGWHandler_REST_S3Website::serve_errordoc(s, http_error_code, s->bucket->get_info().website_conf.error_doc, y);
    if (new_err_no != -1) {
      err_no = new_err_no;
    }
  } else {
    ldpp_dout(s, 20) << "No special error handling today!" << dendl;
  }

  return err_no;
}
+
+RGWOp* RGWHandler_REST_Obj_S3Website::get_obj_op(bool get_data)
+{
+ /** If we are in website mode, then it is explicitly impossible to run GET or
+ * HEAD on the actual directory. We must convert the request to run on the
+ * suffix object instead!
+ */
+ RGWGetObj_ObjStore_S3Website* op = new RGWGetObj_ObjStore_S3Website;
+ op->set_get_data(get_data);
+ return op;
+}
+
+RGWOp* RGWHandler_REST_Bucket_S3Website::get_obj_op(bool get_data)
+{
+ /** If we are in website mode, then it is explicitly impossible to run GET or
+ * HEAD on the actual directory. We must convert the request to run on the
+ * suffix object instead!
+ */
+ RGWGetObj_ObjStore_S3Website* op = new RGWGetObj_ObjStore_S3Website;
+ op->set_get_data(get_data);
+ return op;
+}
+
+RGWOp* RGWHandler_REST_Service_S3Website::get_obj_op(bool get_data)
+{
+ /** If we are in website mode, then it is explicitly impossible to run GET or
+ * HEAD on the actual directory. We must convert the request to run on the
+ * suffix object instead!
+ */
+ RGWGetObj_ObjStore_S3Website* op = new RGWGetObj_ObjStore_S3Website;
+ op->set_get_data(get_data);
+ return op;
+}
+
+
+namespace rgw::auth::s3 {
+
+static rgw::auth::Completer::cmplptr_t
+null_completer_factory(const boost::optional<std::string>& secret_key)
+{
+ return nullptr;
+}
+
+
AWSEngine::VersionAbstractor::auth_data_t
AWSGeneralAbstractor::get_auth_data(const req_state* const s) const
{
  // Sniff the signature flavor (v2/v4, header vs. query string) and
  // delegate to the matching extractor; unrecognized credentials throw.
  AwsVersion version;
  AwsRoute route;
  std::tie(version, route) = discover_aws_flavour(s->info);

  if (version == AwsVersion::V2) {
    return get_auth_data_v2(s);
  } else if (version == AwsVersion::V4) {
    return get_auth_data_v4(s, route == AwsRoute::QUERY_STRING);
  } else {
    /* FIXME(rzarzynski): handle anon user. */
    throw -EINVAL;
  }
}
+
boost::optional<std::string>
AWSGeneralAbstractor::get_v4_canonical_headers(
  const req_info& info,
  const std::string_view& signedheaders,
  const bool using_qs) const
{
  // Delegate to the free-function implementation.
  // NOTE(review): the trailing literal 'false' selects a variant of the
  // free function - confirm its parameter name/meaning at the callee.
  return rgw::auth::s3::get_v4_canonical_headers(info, signedheaders,
                                                 using_qs, false);
}
+
+/* Build all SigV4 signing inputs (credential scope, canonical request hash,
+ * string-to-sign, extra headers) for an outbound request that RGW itself
+ * signs.  The returned prepare_result_t is later combined with the secret
+ * key by gen_v4_signature() to produce the final Authorization header.
+ * When opt_content is provided, its SHA-256 is pinned into
+ * x-amz-content-sha256 and used as the payload hash. */
+AWSSignerV4::prepare_result_t
+AWSSignerV4::prepare(const DoutPrefixProvider *dpp,
+                     const std::string& access_key_id,
+                     const string& region,
+                     const string& service,
+                     const req_info& info,
+                     const bufferlist *opt_content,
+                     bool s3_op)
+{
+  std::string signed_hdrs;
+
+  // Sign with the current wall-clock time; both the x-amz-date header and
+  // the credential scope are derived from this single timestamp.
+  ceph::real_time timestamp = ceph::real_clock::now();
+
+  map<string, string> extra_headers;
+
+  std::string date = ceph::to_iso_8601_no_separators(timestamp, ceph::iso_8601_format::YMDhms);
+
+  std::string credential_scope = gen_v4_scope(timestamp, region, service);
+
+  extra_headers["x-amz-date"] = date;
+
+  string content_hash;
+
+  if (opt_content) {
+    content_hash = rgw::auth::s3::calc_v4_payload_hash(opt_content->to_str());
+    extra_headers["x-amz-content-sha256"] = content_hash;
+
+  }
+
+  /* craft canonical headers */
+  std::string canonical_headers = \
+    gen_v4_canonical_headers(info, extra_headers, &signed_hdrs);
+
+  using sanitize = rgw::crypt_sanitize::log_content;
+  ldpp_dout(dpp, 10) << "canonical headers format = "
+                     << sanitize{canonical_headers} << dendl;
+
+  bool is_non_s3_op = !s3_op;
+
+  // Pick the payload hash that goes into the canonical request: for non-S3
+  // ops the caller supplies it via the "PayloadHash" query arg; for S3 ops
+  // it is either the hash computed above or whatever the request expects.
+  const char* exp_payload_hash = nullptr;
+  string payload_hash;
+  if (is_non_s3_op) {
+    //For non s3 ops, we need to calculate the payload hash
+    payload_hash = info.args.get("PayloadHash");
+    exp_payload_hash = payload_hash.c_str();
+  } else {
+    /* Get the expected hash. */
+    if (content_hash.empty()) {
+      exp_payload_hash = rgw::auth::s3::get_v4_exp_payload_hash(info);
+    } else {
+      exp_payload_hash = content_hash.c_str();
+    }
+  }
+
+  /* Craft canonical URI. Using std::move later so let it be non-const. */
+  auto canonical_uri = rgw::auth::s3::gen_v4_canonical_uri(info);
+
+
+  /* Craft canonical query string. std::moving later so non-const here. */
+  auto canonical_qs = rgw::auth::s3::gen_v4_canonical_qs(info, is_non_s3_op);
+
+  auto cct = dpp->get_cct();
+
+  /* Craft canonical request. */
+  auto canonical_req_hash = \
+    rgw::auth::s3::get_v4_canon_req_hash(cct,
+                                         info.method,
+                                         std::move(canonical_uri),
+                                         std::move(canonical_qs),
+                                         std::move(canonical_headers),
+                                         signed_hdrs,
+                                         exp_payload_hash,
+                                         dpp);
+
+  auto string_to_sign = \
+    rgw::auth::s3::get_v4_string_to_sign(cct,
+                                         AWS4_HMAC_SHA256_STR,
+                                         date,
+                                         credential_scope,
+                                         std::move(canonical_req_hash),
+                                         dpp);
+
+  const auto sig_factory = gen_v4_signature;
+
+  /* Requests authenticated with the Query Parameters are treated as unsigned.
+   * From "Authenticating Requests: Using Query Parameters (AWS Signature
+   * Version 4)":
+   *
+   *   You don't include a payload hash in the Canonical Request, because
+   *   when you create a presigned URL, you don't know the payload content
+   *   because the URL is used to upload an arbitrary payload. Instead, you
+   *   use a constant string UNSIGNED-PAYLOAD.
+   *
+   * This means we have absolutely no business in spawning completer. Both
+   * aws4_auth_needs_complete and aws4_auth_streaming_mode are set to false
+   * by default. We don't need to change that. */
+  return {
+    access_key_id,
+    date,
+    credential_scope,
+    std::move(signed_hdrs),
+    std::move(string_to_sign),
+    std::move(extra_headers),
+    sig_factory,
+  };
+}
+
+/* Final step of outbound SigV4 signing: derive the signature from the
+ * string-to-sign prepared by AWSSignerV4::prepare() and assemble the full
+ * set of request headers, including the Authorization header.  If no
+ * x-amz-content-sha256 was pinned during prepare(), the constant
+ * UNSIGNED-PAYLOAD hash is substituted. */
+AWSSignerV4::signature_headers_t
+gen_v4_signature(const DoutPrefixProvider *dpp,
+                 const std::string_view& secret_key,
+                 const AWSSignerV4::prepare_result_t& sig_info)
+{
+  auto signature = rgw::auth::s3::get_v4_signature(sig_info.scope,
+                                                   dpp->get_cct(),
+                                                   secret_key,
+                                                   sig_info.string_to_sign,
+                                                   dpp);
+  AWSSignerV4::signature_headers_t result;
+
+  // Start from the extra headers (x-amz-date, optional content hash) that
+  // were covered by the signature.
+  for (auto& entry : sig_info.extra_headers) {
+    result[entry.first] = entry.second;
+  }
+  // operator[] default-inserts an empty value, which we then overwrite with
+  // the UNSIGNED-PAYLOAD sentinel if no real hash was recorded.
+  auto& payload_hash = result["x-amz-content-sha256"];
+  if (payload_hash.empty()) {
+    payload_hash = AWS4_UNSIGNED_PAYLOAD_HASH;
+  }
+  string auth_header = string("AWS4-HMAC-SHA256 Credential=").append(sig_info.access_key_id) + "/";
+  auth_header.append(sig_info.scope + ",SignedHeaders=")
+             .append(sig_info.signed_headers + ",Signature=")
+             .append(signature);
+  result["Authorization"] = auth_header;
+
+  return result;
+}
+
+
+/* Parse and pre-validate an inbound SigV4 request: extract the client's
+ * credentials, rebuild the canonical request and string-to-sign server-side,
+ * and select the completer that will verify the payload once the body has
+ * been read.  'using_qs' selects presigned-URL (query-string) semantics.
+ * Throws a negative errno / rgw error code on malformed auth input. */
+AWSEngine::VersionAbstractor::auth_data_t
+AWSGeneralAbstractor::get_auth_data_v4(const req_state* const s,
+                                       const bool using_qs) const
+{
+  std::string_view access_key_id;
+  std::string_view signed_hdrs;
+
+  std::string_view date;
+  std::string_view credential_scope;
+  std::string_view client_signature;
+  std::string_view session_token;
+
+  int ret = rgw::auth::s3::parse_v4_credentials(s->info,
+                                                access_key_id,
+                                                credential_scope,
+                                                signed_hdrs,
+                                                client_signature,
+                                                date,
+                                                session_token,
+                                                using_qs,
+                                                s);
+  if (ret < 0) {
+    throw ret;
+  }
+
+  /* craft canonical headers */
+  boost::optional<std::string> canonical_headers = \
+    get_v4_canonical_headers(s->info, signed_hdrs, using_qs);
+  if (canonical_headers) {
+    using sanitize = rgw::crypt_sanitize::log_content;
+    ldpp_dout(s, 10) << "canonical headers format = "
+                     << sanitize{*canonical_headers} << dendl;
+  } else {
+    throw -EPERM;
+  }
+
+  bool is_non_s3_op = rgw::auth::s3::is_non_s3_op(s->op_type);
+
+  // Expected payload hash: non-S3 ops (e.g. STS/IAM-style calls) carry it in
+  // the "PayloadHash" query arg; regular S3 ops derive it from the request.
+  const char* exp_payload_hash = nullptr;
+  string payload_hash;
+  if (is_non_s3_op) {
+    //For non s3 ops, we need to calculate the payload hash
+    payload_hash = s->info.args.get("PayloadHash");
+    exp_payload_hash = payload_hash.c_str();
+  } else {
+    /* Get the expected hash. */
+    exp_payload_hash = rgw::auth::s3::get_v4_exp_payload_hash(s->info);
+  }
+
+  /* Craft canonical URI. Using std::move later so let it be non-const. */
+  auto canonical_uri = rgw::auth::s3::get_v4_canonical_uri(s->info);
+
+  /* Craft canonical query string. std::moving later so non-const here. */
+  auto canonical_qs = rgw::auth::s3::get_v4_canonical_qs(s->info, using_qs);
+
+  /* Craft canonical method. */
+  auto canonical_method = rgw::auth::s3::get_v4_canonical_method(s);
+
+  /* Craft canonical request. */
+  auto canonical_req_hash = \
+    rgw::auth::s3::get_v4_canon_req_hash(s->cct,
+                                         std::move(canonical_method),
+                                         std::move(canonical_uri),
+                                         std::move(canonical_qs),
+                                         std::move(*canonical_headers),
+                                         signed_hdrs,
+                                         exp_payload_hash,
+                                         s);
+
+  auto string_to_sign = \
+    rgw::auth::s3::get_v4_string_to_sign(s->cct,
+                                         AWS4_HMAC_SHA256_STR,
+                                         date,
+                                         credential_scope,
+                                         std::move(canonical_req_hash),
+                                         s);
+
+  const auto sig_factory = std::bind(rgw::auth::s3::get_v4_signature,
+                                     credential_scope,
+                                     std::placeholders::_1,
+                                     std::placeholders::_2,
+                                     std::placeholders::_3,
+                                     s);
+
+  /* Requests authenticated with the Query Parameters are treated as unsigned.
+   * From "Authenticating Requests: Using Query Parameters (AWS Signature
+   * Version 4)":
+   *
+   *   You don't include a payload hash in the Canonical Request, because
+   *   when you create a presigned URL, you don't know the payload content
+   *   because the URL is used to upload an arbitrary payload. Instead, you
+   *   use a constant string UNSIGNED-PAYLOAD.
+   *
+   * This means we have absolutely no business in spawning completer. Both
+   * aws4_auth_needs_complete and aws4_auth_streaming_mode are set to false
+   * by default. We don't need to change that. */
+  if (is_v4_payload_unsigned(exp_payload_hash) || is_v4_payload_empty(s) || is_non_s3_op) {
+    return {
+      access_key_id,
+      client_signature,
+      session_token,
+      std::move(string_to_sign),
+      sig_factory,
+      null_completer_factory
+    };
+  } else {
+    /* We're going to handle a signed payload. Be aware that even empty HTTP
+     * body (no payload) requires verification:
+     *
+     *   The x-amz-content-sha256 header is required for all AWS Signature
+     *   Version 4 requests. It provides a hash of the request payload. If
+     *   there is no payload, you must provide the hash of an empty string. */
+    if (!is_v4_payload_streamed(exp_payload_hash)) {
+      ldpp_dout(s, 10) << "delaying v4 auth" << dendl;
+
+      /* payload in a single chunk */
+      // Whitelist of operations for which single-chunk AWS4 payload
+      // completion is implemented; anything else is rejected explicitly.
+      switch (s->op_type)
+      {
+        case RGW_OP_CREATE_BUCKET:
+        case RGW_OP_PUT_OBJ:
+        case RGW_OP_PUT_ACLS:
+        case RGW_OP_PUT_CORS:
+        case RGW_OP_PUT_BUCKET_ENCRYPTION:
+        case RGW_OP_GET_BUCKET_ENCRYPTION:
+        case RGW_OP_DELETE_BUCKET_ENCRYPTION:
+        case RGW_OP_INIT_MULTIPART: // in case that Init Multipart uses CHUNK encoding
+        case RGW_OP_COMPLETE_MULTIPART:
+        case RGW_OP_SET_BUCKET_VERSIONING:
+        case RGW_OP_DELETE_MULTI_OBJ:
+        case RGW_OP_ADMIN_SET_METADATA:
+        case RGW_OP_SYNC_DATALOG_NOTIFY:
+        case RGW_OP_SYNC_DATALOG_NOTIFY2:
+        case RGW_OP_SYNC_MDLOG_NOTIFY:
+        case RGW_OP_PERIOD_POST:
+        case RGW_OP_SET_BUCKET_WEBSITE:
+        case RGW_OP_PUT_BUCKET_POLICY:
+        case RGW_OP_PUT_OBJ_TAGGING:
+        case RGW_OP_PUT_BUCKET_TAGGING:
+        case RGW_OP_PUT_BUCKET_REPLICATION:
+        case RGW_OP_PUT_LC:
+        case RGW_OP_SET_REQUEST_PAYMENT:
+        case RGW_OP_PUBSUB_NOTIF_CREATE:
+        case RGW_OP_PUBSUB_NOTIF_DELETE:
+        case RGW_OP_PUBSUB_NOTIF_LIST:
+        case RGW_OP_PUT_BUCKET_OBJ_LOCK:
+        case RGW_OP_PUT_OBJ_RETENTION:
+        case RGW_OP_PUT_OBJ_LEGAL_HOLD:
+        case RGW_STS_GET_SESSION_TOKEN:
+        case RGW_STS_ASSUME_ROLE:
+        case RGW_OP_PUT_BUCKET_PUBLIC_ACCESS_BLOCK:
+        case RGW_OP_GET_BUCKET_PUBLIC_ACCESS_BLOCK:
+        case RGW_OP_DELETE_BUCKET_PUBLIC_ACCESS_BLOCK:
+        case RGW_OP_GET_OBJ://s3select its post-method(payload contain the query) , the request is get-object
+          break;
+        default:
+          ldpp_dout(s, 10) << "ERROR: AWS4 completion for operation: " << s->op_type << ", NOT IMPLEMENTED" << dendl;
+          throw -ERR_NOT_IMPLEMENTED;
+      }
+
+      const auto cmpl_factory = std::bind(AWSv4ComplSingle::create,
+                                          s,
+                                          std::placeholders::_1);
+      return {
+        access_key_id,
+        client_signature,
+        session_token,
+        std::move(string_to_sign),
+        sig_factory,
+        cmpl_factory
+      };
+    } else {
+      /* IMHO "streamed" doesn't fit too good here. I would prefer to call
+       * it "chunked" but let's be coherent with Amazon's terminology. */
+
+      ldpp_dout(s, 10) << "body content detected in multiple chunks" << dendl;
+
+      /* payload in multiple chunks */
+
+      // Streaming (chunked) signing is only implemented for object upload.
+      switch(s->op_type)
+      {
+        case RGW_OP_PUT_OBJ:
+          break;
+        default:
+          ldpp_dout(s, 10) << "ERROR: AWS4 completion for this operation NOT IMPLEMENTED (streaming mode)" << dendl;
+          throw -ERR_NOT_IMPLEMENTED;
+      }
+
+      ldpp_dout(s, 10) << "aws4 seed signature ok... delaying v4 auth" << dendl;
+
+      /* In the case of streamed payload client sets the x-amz-content-sha256
+       * to "STREAMING-AWS4-HMAC-SHA256-PAYLOAD" but uses "UNSIGNED-PAYLOAD"
+       * when constructing the Canonical Request. */
+
+      /* In the case of single-chunk upload client set the header's value is
+       * coherent with the one used for Canonical Request crafting. */
+
+      /* In the case of query string-based authentication there should be no
+       * x-amz-content-sha256 header and the value "UNSIGNED-PAYLOAD" is used
+       * for CanonReq. */
+      const auto cmpl_factory = std::bind(AWSv4ComplMulti::create,
+                                          s,
+                                          date,
+                                          credential_scope,
+                                          client_signature,
+                                          std::placeholders::_1);
+      return {
+        access_key_id,
+        client_signature,
+        session_token,
+        std::move(string_to_sign),
+        sig_factory,
+        cmpl_factory
+      };
+    }
+  }
+}
+
+
+/* Boto2-compatible variant: identical to the general abstractor's wrapper
+ * except that the trailing argument is 'true' — presumably enabling boto2
+ * header-canonicalization quirks; confirm against the free
+ * get_v4_canonical_headers() implementation. */
+boost::optional<std::string>
+AWSGeneralBoto2Abstractor::get_v4_canonical_headers(
+  const req_info& info,
+  const std::string_view& signedheaders,
+  const bool using_qs) const
+{
+  return rgw::auth::s3::get_v4_canonical_headers(info, signedheaders,
+                                                 using_qs, true);
+}
+
+
+/* Extract AWS v2 credentials from either the query string (presigned URL
+ * with AWSAccessKeyId/Signature/Expires) or the "AWS key:sig" Authorization
+ * header, then build the v2 canonical string-to-sign.  Throws -EPERM on
+ * malformed/expired input and -ERR_REQUEST_TIME_SKEWED when the Date header
+ * drifts too far (header-based auth only). */
+AWSEngine::VersionAbstractor::auth_data_t
+AWSGeneralAbstractor::get_auth_data_v2(const req_state* const s) const
+{
+  std::string_view access_key_id;
+  std::string_view signature;
+  std::string_view session_token;
+  bool qsr = false;
+
+  const char* http_auth = s->info.env->get("HTTP_AUTHORIZATION");
+  if (! http_auth || http_auth[0] == '\0') {
+    /* Credentials are provided in query string. We also need to verify
+     * the "Expires" parameter now. */
+    access_key_id = s->info.args.get("AWSAccessKeyId");
+    signature = s->info.args.get("Signature");
+    qsr = true;
+
+    std::string_view expires = s->info.args.get("Expires");
+    if (expires.empty()) {
+      throw -EPERM;
+    }
+
+    /* It looks we have the guarantee that expires is a null-terminated,
+     * and thus string_view::data() can be safely used. */
+    const time_t exp = atoll(expires.data());
+    time_t now;
+    time(&now);
+
+    if (now >= exp) {
+      throw -EPERM;
+    }
+    if (s->info.args.exists("x-amz-security-token")) {
+      session_token = s->info.args.get("x-amz-security-token");
+      if (session_token.size() == 0) {
+        throw -EPERM;
+      }
+    }
+
+  } else {
+    /* The "Authorization" HTTP header is being used. */
+    /* NOTE(review): this assumes http_auth begins with "AWS " — presumably
+     * guaranteed by discover_aws_flavour() before v2 is selected; confirm,
+     * as a shorter header would make this pointer arithmetic read past the
+     * terminator. */
+    const std::string_view auth_str(http_auth + strlen("AWS "));
+    const size_t pos = auth_str.rfind(':');
+    if (pos != std::string_view::npos) {
+      access_key_id = auth_str.substr(0, pos);
+      signature = auth_str.substr(pos + 1);
+    }
+
+    auto token = s->info.env->get_optional("HTTP_X_AMZ_SECURITY_TOKEN");
+    if (token) {
+      session_token = *token;
+      if (session_token.size() == 0) {
+        throw -EPERM;
+      }
+    }
+  }
+
+  /* Let's canonize the HTTP headers that are covered by the AWS auth v2. */
+  std::string string_to_sign;
+  utime_t header_time;
+  if (! rgw_create_s3_canonical_header(s, s->info, &header_time, string_to_sign,
+        qsr)) {
+    ldpp_dout(s, 10) << "failed to create the canonized auth header\n"
+                     << rgw::crypt_sanitize::auth{s,string_to_sign} << dendl;
+    throw -EPERM;
+  }
+
+  ldpp_dout(s, 10) << "string_to_sign:\n"
+                   << rgw::crypt_sanitize::auth{s,string_to_sign} << dendl;
+
+  // Presigned (query-string) requests are bounded by Expires instead of the
+  // Date header, so the clock-skew check only applies to header-based auth.
+  if (!qsr && !is_time_skew_ok(header_time)) {
+    throw -ERR_REQUEST_TIME_SKEWED;
+  }
+
+  return {
+    std::move(access_key_id),
+    std::move(signature),
+    std::move(session_token),
+    std::move(string_to_sign),
+    rgw::auth::s3::get_v2_signature,
+    null_completer_factory
+  };
+}
+
+
+/* Browser-based POST upload, v2 flavour: the credentials were already
+ * parsed out of the POST form into s->auth.s3_postobj_creds, and the
+ * base64-encoded policy document serves as the string-to-sign.  No payload
+ * completer is needed. */
+AWSEngine::VersionAbstractor::auth_data_t
+AWSBrowserUploadAbstractor::get_auth_data_v2(const req_state* const s) const
+{
+  return {
+    s->auth.s3_postobj_creds.access_key,
+    s->auth.s3_postobj_creds.signature,
+    s->auth.s3_postobj_creds.x_amz_security_token,
+    s->auth.s3_postobj_creds.encoded_policy.to_str(),
+    rgw::auth::s3::get_v2_signature,
+    null_completer_factory
+  };
+}
+
+/* Browser-based POST upload, v4 flavour: split the form's X-Amz-Credential
+ * ("<access_key>/<scope>") into its two parts and sign the base64-encoded
+ * policy document.  Like v2, the policy is the string-to-sign and no
+ * payload completer is required. */
+AWSEngine::VersionAbstractor::auth_data_t
+AWSBrowserUploadAbstractor::get_auth_data_v4(const req_state* const s) const
+{
+  const std::string_view credential = s->auth.s3_postobj_creds.x_amz_credential;
+
+  /* grab access key id */
+  const size_t pos = credential.find("/");
+  const std::string_view access_key_id = credential.substr(0, pos);
+  ldpp_dout(s, 10) << "access key id = " << access_key_id << dendl;
+
+  /* grab credential scope */
+  const std::string_view credential_scope = credential.substr(pos + 1);
+  ldpp_dout(s, 10) << "credential scope = " << credential_scope << dendl;
+
+  const auto sig_factory = std::bind(rgw::auth::s3::get_v4_signature,
+                                     credential_scope,
+                                     std::placeholders::_1,
+                                     std::placeholders::_2,
+                                     std::placeholders::_3,
+                                     s);
+
+  return {
+    access_key_id,
+    s->auth.s3_postobj_creds.signature,
+    s->auth.s3_postobj_creds.x_amz_security_token,
+    s->auth.s3_postobj_creds.encoded_policy.to_str(),
+    sig_factory,
+    null_completer_factory
+  };
+}
+
+/* Dispatch browser-upload auth by the X-Amz-Algorithm form field: the
+ * AWS4-HMAC-SHA256 marker selects v4, anything else falls back to v2. */
+AWSEngine::VersionAbstractor::auth_data_t
+AWSBrowserUploadAbstractor::get_auth_data(const req_state* const s) const
+{
+  if (s->auth.s3_postobj_creds.x_amz_algorithm == AWS4_HMAC_SHA256_STR) {
+    ldpp_dout(s, 0) << "Signature verification algorithm AWS v4"
+                    << " (AWS4-HMAC-SHA256)" << dendl;
+    return get_auth_data_v4(s);
+  } else {
+    ldpp_dout(s, 0) << "Signature verification algorithm AWS v2" << dendl;
+    return get_auth_data_v2(s);
+  }
+}
+
+/* Common entry point for all AWS auth engines: let the version abstractor
+ * extract the credentials (it may throw), reject requests with no usable
+ * key/signature, and hand the parsed material to the engine-specific
+ * authenticate() overload. */
+AWSEngine::result_t
+AWSEngine::authenticate(const DoutPrefixProvider* dpp, const req_state* const s, optional_yield y) const
+{
+  /* Small reminder: an ver_abstractor is allowed to throw! */
+  const auto auth_data = ver_abstractor.get_auth_data(s);
+
+  if (auth_data.access_key_id.empty() || auth_data.client_signature.empty()) {
+    return result_t::deny(-EINVAL);
+  } else {
+    return authenticate(dpp,
+                        auth_data.access_key_id,
+                        auth_data.client_signature,
+                        auth_data.session_token,
+                        auth_data.string_to_sign,
+                        auth_data.signature_factory,
+                        auth_data.completer_factory,
+                        s, y);
+  }
+}
+
+} // namespace rgw::auth::s3
+
+// Process-wide, lazily-initialized LDAP helper shared by every LDAPEngine
+// instance; access is serialized through mtx (see init()/valid()).
+rgw::LDAPHelper* rgw::auth::s3::LDAPEngine::ldh = nullptr;
+std::mutex rgw::auth::s3::LDAPEngine::mtx;
+
+/* One-time setup of the shared LDAP helper from ceph.conf settings.
+ * No-op unless LDAP auth is enabled and an LDAP URI is configured.
+ * Uses a check/lock/re-check (double-checked locking) sequence on 'ldh' so
+ * concurrent callers construct the helper at most once. */
+void rgw::auth::s3::LDAPEngine::init(CephContext* const cct)
+{
+  if (! cct->_conf->rgw_s3_auth_use_ldap ||
+      cct->_conf->rgw_ldap_uri.empty()) {
+    return;
+  }
+
+  if (! ldh) {
+    std::lock_guard<std::mutex> lck(mtx);
+    if (! ldh) {
+      const string& ldap_uri = cct->_conf->rgw_ldap_uri;
+      const string& ldap_binddn = cct->_conf->rgw_ldap_binddn;
+      const string& ldap_searchdn = cct->_conf->rgw_ldap_searchdn;
+      const string& ldap_searchfilter = cct->_conf->rgw_ldap_searchfilter;
+      const string& ldap_dnattr = cct->_conf->rgw_ldap_dnattr;
+      std::string ldap_bindpw = parse_rgw_ldap_bindpw(cct);
+
+      ldh = new rgw::LDAPHelper(ldap_uri, ldap_binddn, ldap_bindpw,
+                                ldap_searchdn, ldap_searchfilter, ldap_dnattr);
+
+      ldh->init();
+      ldh->bind();
+    }
+  }
+}
+
+bool rgw::auth::s3::LDAPEngine::valid() {
+  // The engine is usable once init() has constructed the shared helper.
+  std::lock_guard<std::mutex> guard(mtx);
+  return ldh != nullptr;
+}
+
+/* LDAP users need no extra ACL translation: returning a null strategy lets
+ * the default handling in get_perms_from_aclspec apply. */
+rgw::auth::RemoteApplier::acl_strategy_t
+rgw::auth::s3::LDAPEngine::get_acl_strategy() const
+{
+  //This is based on the assumption that the default acl strategy in
+  // get_perms_from_aclspec, will take care. Extra acl spec is not required.
+  return nullptr;
+}
+
+/* Translate a validated LDAP token into the RemoteApplier's credential
+ * record: the token id doubles as both user id and display name, with full
+ * permissions, plain-account privilege and TYPE_LDAP provenance. */
+rgw::auth::RemoteApplier::AuthInfo
+rgw::auth::s3::LDAPEngine::get_creds_info(const rgw::RGWToken& token) const noexcept
+{
+  /* The short form of "using" can't be used here -- we're aliasing a class'
+   * member. */
+  using acct_privilege_t = \
+    rgw::auth::RemoteApplier::AuthInfo::acct_privilege_t;
+
+  return rgw::auth::RemoteApplier::AuthInfo {
+    rgw_user(token.id),
+    token.id,
+    RGW_PERM_FULL_CONTROL,
+    acct_privilege_t::IS_PLAIN_ACCT,
+    rgw::auth::RemoteApplier::AuthInfo::NO_ACCESS_KEY,
+    rgw::auth::RemoteApplier::AuthInfo::NO_SUBUSER,
+    TYPE_LDAP
+  };
+}
+
+/* LDAP auth path: the S3 access key is expected to be a base64-encoded
+ * RGWToken (id + key); decode it and bind against the LDAP server.  A
+ * malformed token is a soft deny (other engines may still match), while a
+ * failed LDAP bind is denied with ERR_INVALID_ACCESS_KEY.  The signature
+ * itself is not verified here — only the token credentials are. */
+rgw::auth::Engine::result_t
+rgw::auth::s3::LDAPEngine::authenticate(
+  const DoutPrefixProvider* dpp,
+  const std::string_view& access_key_id,
+  const std::string_view& signature,
+  const std::string_view& session_token,
+  const string_to_sign_t& string_to_sign,
+  const signature_factory_t&,
+  const completer_factory_t& completer_factory,
+  const req_state* const s,
+  optional_yield y) const
+{
+  /* boost filters and/or string_ref may throw on invalid input */
+  rgw::RGWToken base64_token;
+  try {
+    base64_token = rgw::from_base64(access_key_id);
+  } catch (...) {
+    base64_token = std::string("");
+  }
+
+  if (! base64_token.valid()) {
+    return result_t::deny();
+  }
+
+  //TODO: Uncomment, when we have a migration plan in place.
+  //Check if a user of type other than 'ldap' is already present, if yes, then
+  //return error.
+  /*RGWUserInfo user_info;
+  user_info.user_id = base64_token.id;
+  if (rgw_get_user_info_by_uid(driver, user_info.user_id, user_info) >= 0) {
+    if (user_info.type != TYPE_LDAP) {
+      ldpp_dout(dpp, 10) << "ERROR: User id of type: " << user_info.type << " is already present" << dendl;
+      return nullptr;
+    }
+  }*/
+
+  if (ldh->auth(base64_token.id, base64_token.key) != 0) {
+    return result_t::deny(-ERR_INVALID_ACCESS_KEY);
+  }
+
+  auto apl = apl_factory->create_apl_remote(cct, s, get_acl_strategy(),
+                                            get_creds_info(base64_token));
+  return result_t::grant(std::move(apl), completer_factory(boost::none));
+} /* rgw::auth::s3::LDAPEngine::authenticate */
+
+void rgw::auth::s3::LDAPEngine::shutdown() {
+  // Release the process-wide LDAP helper.  Deleting a null pointer is a
+  // well-defined no-op, so the guard can be collapsed into a plain
+  // delete-and-reset.
+  delete ldh;
+  ldh = nullptr;
+}
+
+/* LocalEngine */
+/* Authenticate against locally-stored RGW users: look up the user by access
+ * key, recompute the signature server-side with the matching secret, and
+ * compare it to what the client sent.  On success the completer is armed
+ * with the secret key so payload verification can proceed. */
+rgw::auth::Engine::result_t
+rgw::auth::s3::LocalEngine::authenticate(
+  const DoutPrefixProvider* dpp,
+  const std::string_view& _access_key_id,
+  const std::string_view& signature,
+  const std::string_view& session_token,
+  const string_to_sign_t& string_to_sign,
+  const signature_factory_t& signature_factory,
+  const completer_factory_t& completer_factory,
+  const req_state* const s,
+  optional_yield y) const
+{
+  /* get the user info */
+  std::unique_ptr<rgw::sal::User> user;
+  const std::string access_key_id(_access_key_id);
+  /* TODO(rzarzynski): we need to have string-view taking variant. */
+  if (driver->get_user_by_access_key(dpp, access_key_id, y, &user) < 0) {
+      ldpp_dout(dpp, 5) << "error reading user info, uid=" << access_key_id
+              << " can't authenticate" << dendl;
+      return result_t::deny(-ERR_INVALID_ACCESS_KEY);
+  }
+  //TODO: Uncomment, when we have a migration plan in place.
+  /*else {
+    if (s->user->type != TYPE_RGW) {
+      ldpp_dout(dpp, 10) << "ERROR: User id of type: " << s->user->type
+                     << " is present" << dendl;
+      throw -EPERM;
+    }
+  }*/
+
+  const auto iter = user->get_info().access_keys.find(access_key_id);
+  if (iter == std::end(user->get_info().access_keys)) {
+    ldpp_dout(dpp, 0) << "ERROR: access key not encoded in user info" << dendl;
+    return result_t::deny(-EPERM);
+  }
+  const RGWAccessKey& k = iter->second;
+
+  /* NOTE(review): std::string compare() is not constant-time; presumably
+   * acceptable here since the compared value is an HMAC, not the secret —
+   * but worth confirming against project policy on timing side channels. */
+  const VersionAbstractor::server_signature_t server_signature = \
+    signature_factory(cct, k.key, string_to_sign);
+  auto compare = signature.compare(server_signature);
+
+  ldpp_dout(dpp, 15) << "string_to_sign="
+                     << rgw::crypt_sanitize::log_content{string_to_sign}
+                     << dendl;
+  ldpp_dout(dpp, 15) << "server signature=" << server_signature << dendl;
+  ldpp_dout(dpp, 15) << "client signature=" << signature << dendl;
+  ldpp_dout(dpp, 15) << "compare=" << compare << dendl;
+
+  if (compare != 0) {
+    return result_t::deny(-ERR_SIGNATURE_NO_MATCH);
+  }
+
+  auto apl = apl_factory->create_apl_local(cct, s, user->get_info(),
+                                           k.subuser, std::nullopt, access_key_id);
+  return result_t::grant(std::move(apl), completer_factory(k.key));
+}
+
+/* Translate a decrypted STS session token into the RemoteApplier's
+ * credential record, carrying over the token's user, permission mask,
+ * admin flag, access key and original account type. */
+rgw::auth::RemoteApplier::AuthInfo
+rgw::auth::s3::STSEngine::get_creds_info(const STS::SessionToken& token) const noexcept
+{
+  using acct_privilege_t = \
+    rgw::auth::RemoteApplier::AuthInfo::acct_privilege_t;
+
+  return rgw::auth::RemoteApplier::AuthInfo {
+    token.user,
+    token.acct_name,
+    token.perm_mask,
+    (token.is_admin) ? acct_privilege_t::IS_ADMIN_ACCT: acct_privilege_t::IS_PLAIN_ACCT,
+    token.access_key_id,
+    rgw::auth::RemoteApplier::AuthInfo::NO_SUBUSER,
+    token.acct_type
+  };
+}
+
+/* Decode, decrypt (AES, keyed by rgw_sts_key) and deserialize an STS
+ * session token into 'token'.  Returns 0 on success, -EINVAL on malformed
+ * input or bad key material, -EPERM when decryption fails. */
+int
+rgw::auth::s3::STSEngine::get_session_token(const DoutPrefixProvider* dpp, const std::string_view& session_token,
+                                            STS::SessionToken& token) const
+{
+  string decodedSessionToken;
+  try {
+    decodedSessionToken = rgw::from_base64(session_token);
+  } catch (...) {
+    ldpp_dout(dpp, 0) << "ERROR: Invalid session token, not base64 encoded." << dendl;
+    return -EINVAL;
+  }
+
+  auto* cryptohandler = cct->get_crypto_handler(CEPH_CRYPTO_AES);
+  if (! cryptohandler) {
+    return -EINVAL;
+  }
+  string secret_s = cct->_conf->rgw_sts_key;
+  buffer::ptr secret(secret_s.c_str(), secret_s.length());
+  int ret = 0;
+  if (ret = cryptohandler->validate_secret(secret); ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: Invalid secret key" << dendl;
+    return -EINVAL;
+  }
+  string error;
+  std::unique_ptr<CryptoKeyHandler> keyhandler(cryptohandler->get_key_handler(secret, error));
+  if (! keyhandler) {
+    return -EINVAL;
+  }
+  error.clear();
+
+  buffer::list en_input, dec_output;
+  en_input = buffer::list::static_from_string(decodedSessionToken);
+
+  ret = keyhandler->decrypt(en_input, dec_output, &error);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: Decryption failed: " << error << dendl;
+    return -EPERM;
+  } else {
+    try {
+      // NUL-terminate so the decode iterator sees a bounded buffer.
+      dec_output.append('\0');
+      auto iter = dec_output.cbegin();
+      decode(token, iter);
+    } catch (const buffer::error& e) {
+      // Log the actual decode failure; the previous code printed the stale
+      // (empty) 'error' string from the decrypt step instead of e.what().
+      ldpp_dout(dpp, 0) << "ERROR: decode SessionToken failed: " << e.what() << dendl;
+      return -EINVAL;
+    }
+  }
+  return 0;
+}
+
+/* STS auth path: decrypt the session token, validate it against the request
+ * (access key match, expiry, recomputed signature using the token's secret),
+ * then grant through the applier matching the token's account type —
+ * remote for KEYSTONE/LDAP, role applier for TYPE_ROLE, local otherwise.
+ * Soft-denies when no security token is present so other engines can run. */
+rgw::auth::Engine::result_t
+rgw::auth::s3::STSEngine::authenticate(
+  const DoutPrefixProvider* dpp,
+  const std::string_view& _access_key_id,
+  const std::string_view& signature,
+  const std::string_view& session_token,
+  const string_to_sign_t& string_to_sign,
+  const signature_factory_t& signature_factory,
+  const completer_factory_t& completer_factory,
+  const req_state* const s,
+  optional_yield y) const
+{
+  if (! s->info.args.exists("x-amz-security-token") &&
+      ! s->info.env->exists("HTTP_X_AMZ_SECURITY_TOKEN") &&
+      s->auth.s3_postobj_creds.x_amz_security_token.empty()) {
+    return result_t::deny();
+  }
+
+  STS::SessionToken token;
+  if (int ret = get_session_token(dpp, session_token, token); ret < 0) {
+    return result_t::reject(ret);
+  }
+  //Authentication
+  //Check if access key is not the same passed in by client
+  if (token.access_key_id != _access_key_id) {
+    ldpp_dout(dpp, 0) << "Invalid access key" << dendl;
+    return result_t::reject(-EPERM);
+  }
+  //Check if the token has expired
+  if (! token.expiration.empty()) {
+    std::string expiration = token.expiration;
+    if (! expiration.empty()) {
+      boost::optional<real_clock::time_point> exp = ceph::from_iso_8601(expiration, false);
+      if (exp) {
+        real_clock::time_point now = real_clock::now();
+        if (now >= *exp) {
+          ldpp_dout(dpp, 0) << "ERROR: Token expired" << dendl;
+          return result_t::reject(-EPERM);
+        }
+      } else {
+        ldpp_dout(dpp, 0) << "ERROR: Invalid expiration: " << expiration << dendl;
+        return result_t::reject(-EPERM);
+      }
+    }
+  }
+  //Check for signature mismatch
+  // The signature is recomputed with the secret carried inside the token,
+  // proving the client holds the temporary credentials it claims.
+  const VersionAbstractor::server_signature_t server_signature = \
+    signature_factory(cct, token.secret_access_key, string_to_sign);
+  auto compare = signature.compare(server_signature);
+
+  ldpp_dout(dpp, 15) << "string_to_sign="
+                     << rgw::crypt_sanitize::log_content{string_to_sign}
+                     << dendl;
+  ldpp_dout(dpp, 15) << "server signature=" << server_signature << dendl;
+  ldpp_dout(dpp, 15) << "client signature=" << signature << dendl;
+  ldpp_dout(dpp, 15) << "compare=" << compare << dendl;
+
+  if (compare != 0) {
+    return result_t::reject(-ERR_SIGNATURE_NO_MATCH);
+  }
+
+  // Get all the authorization info
+  std::unique_ptr<rgw::sal::User> user;
+  rgw_user user_id;
+  string role_id;
+  rgw::auth::RoleApplier::Role r;
+  rgw::auth::RoleApplier::TokenAttrs t_attrs;
+  if (! token.roleId.empty()) {
+    // Assumed-role token: resolve the role and collect its inline policies.
+    std::unique_ptr<rgw::sal::RGWRole> role = driver->get_role(token.roleId);
+    if (role->get_by_id(dpp, y) < 0) {
+      return result_t::deny(-EPERM);
+    }
+    r.id = token.roleId;
+    r.name = role->get_name();
+    r.tenant = role->get_tenant();
+
+    vector<string> role_policy_names = role->get_role_policy_names();
+    for (auto& policy_name : role_policy_names) {
+      string perm_policy;
+      if (int ret = role->get_role_policy(dpp, policy_name, perm_policy); ret == 0) {
+        r.role_policies.push_back(std::move(perm_policy));
+      }
+    }
+  }
+
+  user = driver->get_user(token.user);
+  if (! token.user.empty() && token.acct_type != TYPE_ROLE) {
+    // get user info
+    int ret = user->load_user(dpp, y);
+    if (ret < 0) {
+      ldpp_dout(dpp, 5) << "ERROR: failed reading user info: uid=" << token.user << dendl;
+      return result_t::reject(-EPERM);
+    }
+  }
+
+  if (token.acct_type == TYPE_KEYSTONE || token.acct_type == TYPE_LDAP) {
+    auto apl = remote_apl_factory->create_apl_remote(cct, s, get_acl_strategy(),
+                                                     get_creds_info(token));
+    return result_t::grant(std::move(apl), completer_factory(token.secret_access_key));
+  } else if (token.acct_type == TYPE_ROLE) {
+    t_attrs.user_id = std::move(token.user); // This is mostly needed to assign the owner of a bucket during its creation
+    t_attrs.token_policy = std::move(token.policy);
+    t_attrs.role_session_name = std::move(token.role_session);
+    t_attrs.token_claims = std::move(token.token_claims);
+    t_attrs.token_issued_at = std::move(token.issued_at);
+    t_attrs.principal_tags = std::move(token.principal_tags);
+    auto apl = role_apl_factory->create_apl_role(cct, s, r, t_attrs);
+    return result_t::grant(std::move(apl), completer_factory(token.secret_access_key));
+  } else { // This is for all local users of type TYPE_RGW or TYPE_NONE
+    string subuser;
+    auto apl = local_apl_factory->create_apl_local(cct, s, user->get_info(), subuser, token.perm_mask, std::string(_access_key_id));
+    return result_t::grant(std::move(apl), completer_factory(token.secret_access_key));
+  }
+}
+
+/* The anonymous engine applies only when the request carries no
+ * recognizable AWS credentials: either an unauthenticated HTTP OPTIONS
+ * (CORS preflight), or a query-string-routed request with no detected
+ * auth version. */
+bool rgw::auth::s3::S3AnonymousEngine::is_applicable(
+  const req_state* s
+) const noexcept {
+  AwsVersion version;
+  AwsRoute route;
+  std::tie(version, route) = discover_aws_flavour(s->info);
+
+  /* If HTTP OPTIONS and no authentication provided using the
+   * anonymous engine is applicable */
+  if (s->op == OP_OPTIONS && version == AwsVersion::UNKNOWN) {
+    return true;
+  }
+
+  return route == AwsRoute::QUERY_STRING && version == AwsVersion::UNKNOWN;
+}
diff --git a/src/rgw/rgw_rest_s3.h b/src/rgw/rgw_rest_s3.h
new file mode 100644
index 000000000..20237166b
--- /dev/null
+++ b/src/rgw/rgw_rest_s3.h
@@ -0,0 +1,1215 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#define TIME_BUF_SIZE 128
+
+#include <mutex>
+#include <string_view>
+
+#include <boost/container/static_vector.hpp>
+#include <boost/crc.hpp>
+
+#include "common/sstring.hh"
+#include "rgw_op.h"
+#include "rgw_rest.h"
+#include "rgw_http_errors.h"
+#include "rgw_acl_s3.h"
+#include "rgw_policy_s3.h"
+#include "rgw_lc_s3.h"
+#include "rgw_keystone.h"
+#include "rgw_rest_conn.h"
+#include "rgw_ldap.h"
+
+#include "rgw_token.h"
+#include "include/ceph_assert.h"
+
+#include "rgw_auth.h"
+#include "rgw_auth_filters.h"
+#include "rgw_sts.h"
+
+struct rgw_http_error {
+ int http_ret;
+ const char *s3_code;
+};
+
+void rgw_get_errno_s3(struct rgw_http_error *e, int err_no);
+
+// S3 REST flavour of the GetObject operation.
+class RGWGetObj_ObjStore_S3 : public RGWGetObj_ObjStore
+{
+protected:
+  // Serving a custom error page from an object is really a 200 response with
+  // just the status line altered.
+  int custom_http_ret = 0;
+  // Extra response headers for encrypted objects.
+  // NOTE(review): presumably populated by get_decrypt_filter() — confirm in
+  // the corresponding .cc file.
+  std::map<std::string, std::string> crypt_http_responses;
+  int override_range_hdr(const rgw::auth::StrategyRegistry& auth_registry, optional_yield y);
+public:
+  RGWGetObj_ObjStore_S3() {}
+  ~RGWGetObj_ObjStore_S3() override {}
+
+  int verify_requester(const rgw::auth::StrategyRegistry& auth_registry, optional_yield y) override;
+  int get_params(optional_yield y) override;
+  int send_response_data_error(optional_yield y) override;
+  int send_response_data(bufferlist& bl, off_t ofs, off_t len) override;
+  // Replace the normal HTTP status line (custom error-page case above).
+  void set_custom_http_response(int http_ret) { custom_http_ret = http_ret; }
+  int get_decrypt_filter(std::unique_ptr<RGWGetObj_Filter>* filter,
+                         RGWGetObj_Filter* cb,
+                         bufferlist* manifest_bl) override;
+};
+
+class RGWGetObjTags_ObjStore_S3 : public RGWGetObjTags_ObjStore
+{
+public:
+ RGWGetObjTags_ObjStore_S3() {}
+ ~RGWGetObjTags_ObjStore_S3() {}
+
+ void send_response_data(bufferlist &bl) override;
+};
+
+class RGWPutObjTags_ObjStore_S3 : public RGWPutObjTags_ObjStore
+{
+public:
+ RGWPutObjTags_ObjStore_S3() {}
+ ~RGWPutObjTags_ObjStore_S3() {}
+
+ int get_params(optional_yield y) override;
+ void send_response() override;
+};
+
+class RGWDeleteObjTags_ObjStore_S3 : public RGWDeleteObjTags
+{
+public:
+ ~RGWDeleteObjTags_ObjStore_S3() override {}
+ void send_response() override;
+};
+
+class RGWGetBucketTags_ObjStore_S3 : public RGWGetBucketTags_ObjStore
+{
+ bufferlist tags_bl;
+public:
+ void send_response_data(bufferlist &bl) override;
+};
+
+class RGWPutBucketTags_ObjStore_S3 : public RGWPutBucketTags_ObjStore
+{
+public:
+ int get_params(const DoutPrefixProvider *dpp, optional_yield y) override;
+ void send_response() override;
+};
+
+class RGWDeleteBucketTags_ObjStore_S3 : public RGWDeleteBucketTags
+{
+public:
+ void send_response() override;
+};
+
+class RGWGetBucketReplication_ObjStore_S3 : public RGWGetBucketReplication_ObjStore
+{
+public:
+ void send_response_data() override;
+};
+
+class RGWPutBucketReplication_ObjStore_S3 : public RGWPutBucketReplication_ObjStore
+{
+public:
+ int get_params(optional_yield y) override;
+ void send_response() override;
+};
+
+class RGWDeleteBucketReplication_ObjStore_S3 : public RGWDeleteBucketReplication_ObjStore
+{
+protected:
+ void update_sync_policy(rgw_sync_policy_info *policy) override;
+public:
+ void send_response() override;
+};
+
+class RGWListBuckets_ObjStore_S3 : public RGWListBuckets_ObjStore {
+public:
+ RGWListBuckets_ObjStore_S3() {}
+ ~RGWListBuckets_ObjStore_S3() override {}
+
+ int get_params(optional_yield y) override {
+ limit = -1; /* no limit */
+ return 0;
+ }
+ void send_response_begin(bool has_buckets) override;
+ void send_response_data(rgw::sal::BucketList& buckets) override;
+ void send_response_end() override;
+};
+
+class RGWGetUsage_ObjStore_S3 : public RGWGetUsage_ObjStore {
+public:
+ RGWGetUsage_ObjStore_S3() {}
+ ~RGWGetUsage_ObjStore_S3() override {}
+
+ int get_params(optional_yield y) override ;
+ void send_response() override;
+};
+
+class RGWListBucket_ObjStore_S3 : public RGWListBucket_ObjStore {
+protected:
+ bool objs_container;
+ bool encode_key {false};
+ int get_common_params();
+ void send_common_response();
+ void send_common_versioned_response();
+ public:
+ RGWListBucket_ObjStore_S3() : objs_container(false) {
+ default_max = 1000;
+ }
+ ~RGWListBucket_ObjStore_S3() override {}
+
+ int get_params(optional_yield y) override;
+ void send_response() override;
+ void send_versioned_response();
+};
+
+class RGWListBucket_ObjStore_S3v2 : public RGWListBucket_ObjStore_S3 {
+ bool fetchOwner;
+ bool start_after_exist;
+ bool continuation_token_exist;
+ std::string startAfter;
+ std::string continuation_token;
+public:
+ RGWListBucket_ObjStore_S3v2() : fetchOwner(false) {
+ }
+ ~RGWListBucket_ObjStore_S3v2() override {}
+
+ int get_params(optional_yield y) override;
+ void send_response() override;
+ void send_versioned_response();
+};
+
+class RGWGetBucketLogging_ObjStore_S3 : public RGWGetBucketLogging {
+public:
+ RGWGetBucketLogging_ObjStore_S3() {}
+ ~RGWGetBucketLogging_ObjStore_S3() override {}
+
+ void send_response() override;
+};
+
+class RGWGetBucketLocation_ObjStore_S3 : public RGWGetBucketLocation {
+public:
+ RGWGetBucketLocation_ObjStore_S3() {}
+ ~RGWGetBucketLocation_ObjStore_S3() override {}
+
+ void send_response() override;
+};
+
+class RGWGetBucketVersioning_ObjStore_S3 : public RGWGetBucketVersioning {
+public:
+ RGWGetBucketVersioning_ObjStore_S3() {}
+ ~RGWGetBucketVersioning_ObjStore_S3() override {}
+
+ void send_response() override;
+};
+
+class RGWSetBucketVersioning_ObjStore_S3 : public RGWSetBucketVersioning {
+public:
+ RGWSetBucketVersioning_ObjStore_S3() {}
+ ~RGWSetBucketVersioning_ObjStore_S3() override {}
+
+ int get_params(optional_yield y) override;
+ void send_response() override;
+};
+
+class RGWGetBucketWebsite_ObjStore_S3 : public RGWGetBucketWebsite {
+public:
+ RGWGetBucketWebsite_ObjStore_S3() {}
+ ~RGWGetBucketWebsite_ObjStore_S3() override {}
+
+ void send_response() override;
+};
+
+class RGWSetBucketWebsite_ObjStore_S3 : public RGWSetBucketWebsite {
+public:
+ RGWSetBucketWebsite_ObjStore_S3() {}
+ ~RGWSetBucketWebsite_ObjStore_S3() override {}
+
+ int get_params(optional_yield y) override;
+ void send_response() override;
+};
+
+class RGWDeleteBucketWebsite_ObjStore_S3 : public RGWDeleteBucketWebsite {
+public:
+ RGWDeleteBucketWebsite_ObjStore_S3() {}
+ ~RGWDeleteBucketWebsite_ObjStore_S3() override {}
+
+ void send_response() override;
+};
+
+class RGWStatBucket_ObjStore_S3 : public RGWStatBucket_ObjStore {
+public:
+ RGWStatBucket_ObjStore_S3() {}
+ ~RGWStatBucket_ObjStore_S3() override {}
+
+ void send_response() override;
+};
+
+class RGWCreateBucket_ObjStore_S3 : public RGWCreateBucket_ObjStore {
+public:
+ RGWCreateBucket_ObjStore_S3() {}
+ ~RGWCreateBucket_ObjStore_S3() override {}
+
+ int get_params(optional_yield y) override;
+ void send_response() override;
+};
+
+class RGWDeleteBucket_ObjStore_S3 : public RGWDeleteBucket_ObjStore {
+public:
+ RGWDeleteBucket_ObjStore_S3() {}
+ ~RGWDeleteBucket_ObjStore_S3() override {}
+
+ void send_response() override;
+};
+
+class RGWPutObj_ObjStore_S3 : public RGWPutObj_ObjStore {
+private:
+ std::map<std::string, std::string> crypt_http_responses;
+
+public:
+ RGWPutObj_ObjStore_S3() {}
+ ~RGWPutObj_ObjStore_S3() override {}
+
+ int get_params(optional_yield y) override;
+ int get_data(bufferlist& bl) override;
+ void send_response() override;
+
+ int get_encrypt_filter(std::unique_ptr<rgw::sal::DataProcessor> *filter,
+ rgw::sal::DataProcessor *cb) override;
+ int get_decrypt_filter(std::unique_ptr<RGWGetObj_Filter>* filter,
+ RGWGetObj_Filter* cb,
+ std::map<std::string, bufferlist>& attrs,
+ bufferlist* manifest_bl) override;
+};
+
+class RGWPostObj_ObjStore_S3 : public RGWPostObj_ObjStore {
+ parts_collection_t parts;
+ std::string filename;
+ std::string content_type;
+ RGWPolicyEnv env;
+ RGWPolicy post_policy;
+ std::map<std::string, std::string> crypt_http_responses;
+
+ const rgw::auth::StrategyRegistry* auth_registry_ptr = nullptr;
+
+ int get_policy(optional_yield y);
+ int get_tags();
+ void rebuild_key(rgw::sal::Object* obj);
+
+ std::string get_current_filename() const override;
+ std::string get_current_content_type() const override;
+
+public:
+ RGWPostObj_ObjStore_S3() {}
+ ~RGWPostObj_ObjStore_S3() override {}
+
+ int verify_requester(const rgw::auth::StrategyRegistry& auth_registry, optional_yield y) override {
+ auth_registry_ptr = &auth_registry;
+ return RGWPostObj_ObjStore::verify_requester(auth_registry, y);
+ }
+
+ int get_params(optional_yield y) override;
+ int complete_get_params();
+
+ void send_response() override;
+ int get_data(ceph::bufferlist& bl, bool& again) override;
+ int get_encrypt_filter(std::unique_ptr<rgw::sal::DataProcessor> *filter,
+ rgw::sal::DataProcessor *cb) override;
+};
+
+class RGWDeleteObj_ObjStore_S3 : public RGWDeleteObj_ObjStore {
+public:
+ RGWDeleteObj_ObjStore_S3() {}
+ ~RGWDeleteObj_ObjStore_S3() override {}
+
+ int get_params(optional_yield y) override;
+ void send_response() override;
+};
+
+class RGWCopyObj_ObjStore_S3 : public RGWCopyObj_ObjStore {
+ bool sent_header;
+public:
+ RGWCopyObj_ObjStore_S3() : sent_header(false) {}
+ ~RGWCopyObj_ObjStore_S3() override {}
+
+ int init_dest_policy() override;
+ int get_params(optional_yield y) override;
+ int check_storage_class(const rgw_placement_rule& src_placement) override;
+ void send_partial_response(off_t ofs) override;
+ void send_response() override;
+};
+
+class RGWGetACLs_ObjStore_S3 : public RGWGetACLs_ObjStore {
+public:
+ RGWGetACLs_ObjStore_S3() {}
+ ~RGWGetACLs_ObjStore_S3() override {}
+
+ void send_response() override;
+};
+
+class RGWPutACLs_ObjStore_S3 : public RGWPutACLs_ObjStore {
+public:
+ RGWPutACLs_ObjStore_S3() {}
+ ~RGWPutACLs_ObjStore_S3() override {}
+
+ int get_policy_from_state(rgw::sal::Driver* driver, req_state *s, std::stringstream& ss) override;
+ void send_response() override;
+ int get_params(optional_yield y) override;
+};
+
+class RGWGetLC_ObjStore_S3 : public RGWGetLC_ObjStore {
+protected:
+ RGWLifecycleConfiguration_S3 config;
+public:
+ RGWGetLC_ObjStore_S3() {}
+ ~RGWGetLC_ObjStore_S3() override {}
+ void execute(optional_yield y) override;
+
+ void send_response() override;
+};
+
+class RGWPutLC_ObjStore_S3 : public RGWPutLC_ObjStore {
+public:
+ RGWPutLC_ObjStore_S3() {}
+ ~RGWPutLC_ObjStore_S3() override {}
+
+ void send_response() override;
+};
+
+class RGWDeleteLC_ObjStore_S3 : public RGWDeleteLC_ObjStore {
+public:
+ RGWDeleteLC_ObjStore_S3() {}
+ ~RGWDeleteLC_ObjStore_S3() override {}
+
+ void send_response() override;
+};
+
+class RGWGetCORS_ObjStore_S3 : public RGWGetCORS_ObjStore {
+public:
+ RGWGetCORS_ObjStore_S3() {}
+ ~RGWGetCORS_ObjStore_S3() override {}
+
+ void send_response() override;
+};
+
+class RGWPutCORS_ObjStore_S3 : public RGWPutCORS_ObjStore {
+public:
+ RGWPutCORS_ObjStore_S3() {}
+ ~RGWPutCORS_ObjStore_S3() override {}
+
+ int get_params(optional_yield y) override;
+ void send_response() override;
+};
+
+class RGWDeleteCORS_ObjStore_S3 : public RGWDeleteCORS_ObjStore {
+public:
+ RGWDeleteCORS_ObjStore_S3() {}
+ ~RGWDeleteCORS_ObjStore_S3() override {}
+
+ void send_response() override;
+};
+
+class RGWOptionsCORS_ObjStore_S3 : public RGWOptionsCORS_ObjStore {
+public:
+ RGWOptionsCORS_ObjStore_S3() {}
+ ~RGWOptionsCORS_ObjStore_S3() override {}
+
+ void send_response() override;
+};
+
+class RGWGetBucketEncryption_ObjStore_S3 : public RGWGetBucketEncryption_ObjStore {
+public:
+ RGWGetBucketEncryption_ObjStore_S3() {}
+ ~RGWGetBucketEncryption_ObjStore_S3() override {}
+
+ void send_response() override;
+};
+
+class RGWPutBucketEncryption_ObjStore_S3 : public RGWPutBucketEncryption_ObjStore {
+public:
+ RGWPutBucketEncryption_ObjStore_S3() {}
+ ~RGWPutBucketEncryption_ObjStore_S3() override {}
+
+ void send_response() override;
+};
+
+class RGWDeleteBucketEncryption_ObjStore_S3 : public RGWDeleteBucketEncryption_ObjStore {
+public:
+ RGWDeleteBucketEncryption_ObjStore_S3() {}
+ ~RGWDeleteBucketEncryption_ObjStore_S3() override {}
+
+ void send_response() override;
+};
+
+class RGWGetRequestPayment_ObjStore_S3 : public RGWGetRequestPayment {
+public:
+ RGWGetRequestPayment_ObjStore_S3() {}
+ ~RGWGetRequestPayment_ObjStore_S3() override {}
+
+ void send_response() override;
+};
+
+class RGWSetRequestPayment_ObjStore_S3 : public RGWSetRequestPayment {
+public:
+ RGWSetRequestPayment_ObjStore_S3() {}
+ ~RGWSetRequestPayment_ObjStore_S3() override {}
+
+ int get_params(optional_yield y) override;
+ void send_response() override;
+};
+
+class RGWInitMultipart_ObjStore_S3 : public RGWInitMultipart_ObjStore {
+private:
+ std::map<std::string, std::string> crypt_http_responses;
+public:
+ RGWInitMultipart_ObjStore_S3() {}
+ ~RGWInitMultipart_ObjStore_S3() override {}
+
+ int get_params(optional_yield y) override;
+ void send_response() override;
+ int prepare_encryption(std::map<std::string, bufferlist>& attrs) override;
+};
+
+class RGWCompleteMultipart_ObjStore_S3 : public RGWCompleteMultipart_ObjStore {
+public:
+ RGWCompleteMultipart_ObjStore_S3() {}
+ ~RGWCompleteMultipart_ObjStore_S3() override {}
+
+ int get_params(optional_yield y) override;
+ void send_response() override;
+};
+
+class RGWAbortMultipart_ObjStore_S3 : public RGWAbortMultipart_ObjStore {
+public:
+ RGWAbortMultipart_ObjStore_S3() {}
+ ~RGWAbortMultipart_ObjStore_S3() override {}
+
+ void send_response() override;
+};
+
+class RGWListMultipart_ObjStore_S3 : public RGWListMultipart_ObjStore {
+public:
+ RGWListMultipart_ObjStore_S3() {}
+ ~RGWListMultipart_ObjStore_S3() override {}
+
+ void send_response() override;
+};
+
+class RGWListBucketMultiparts_ObjStore_S3 : public RGWListBucketMultiparts_ObjStore {
+public:
+ RGWListBucketMultiparts_ObjStore_S3() {
+ default_max = 1000;
+ }
+ ~RGWListBucketMultiparts_ObjStore_S3() override {}
+
+ void send_response() override;
+};
+
+class RGWDeleteMultiObj_ObjStore_S3 : public RGWDeleteMultiObj_ObjStore {
+public:
+ RGWDeleteMultiObj_ObjStore_S3() {}
+ ~RGWDeleteMultiObj_ObjStore_S3() override {}
+
+ int get_params(optional_yield y) override;
+ void send_status() override;
+ void begin_response() override;
+ void send_partial_response(const rgw_obj_key& key, bool delete_marker,
+ const std::string& marker_version_id, int ret,
+ boost::asio::deadline_timer *formatter_flush_cond) override;
+ void end_response() override;
+};
+
+class RGWPutBucketObjectLock_ObjStore_S3 : public RGWPutBucketObjectLock_ObjStore {
+public:
+ RGWPutBucketObjectLock_ObjStore_S3() {}
+ ~RGWPutBucketObjectLock_ObjStore_S3() override {}
+ void send_response() override;
+};
+
+class RGWGetBucketObjectLock_ObjStore_S3 : public RGWGetBucketObjectLock_ObjStore {
+public:
+ RGWGetBucketObjectLock_ObjStore_S3() {}
+ ~RGWGetBucketObjectLock_ObjStore_S3() {}
+ void send_response() override;
+};
+
+class RGWPutObjRetention_ObjStore_S3 : public RGWPutObjRetention_ObjStore {
+public:
+ RGWPutObjRetention_ObjStore_S3() {}
+ ~RGWPutObjRetention_ObjStore_S3() {}
+ int get_params(optional_yield y) override;
+ void send_response() override;
+};
+
+class RGWGetObjRetention_ObjStore_S3 : public RGWGetObjRetention_ObjStore {
+public:
+ RGWGetObjRetention_ObjStore_S3() {}
+ ~RGWGetObjRetention_ObjStore_S3() {}
+ void send_response() override;
+};
+
+class RGWPutObjLegalHold_ObjStore_S3 : public RGWPutObjLegalHold_ObjStore {
+public:
+ RGWPutObjLegalHold_ObjStore_S3() {}
+ ~RGWPutObjLegalHold_ObjStore_S3() {}
+ void send_response() override;
+};
+
+class RGWGetObjLegalHold_ObjStore_S3 : public RGWGetObjLegalHold_ObjStore {
+public:
+ RGWGetObjLegalHold_ObjStore_S3() {}
+ ~RGWGetObjLegalHold_ObjStore_S3() {}
+ void send_response() override;
+};
+
+class RGWGetObjLayout_ObjStore_S3 : public RGWGetObjLayout {
+public:
+ RGWGetObjLayout_ObjStore_S3() {}
+ ~RGWGetObjLayout_ObjStore_S3() {}
+
+ void send_response() override;
+};
+
+class RGWConfigBucketMetaSearch_ObjStore_S3 : public RGWConfigBucketMetaSearch {
+public:
+ RGWConfigBucketMetaSearch_ObjStore_S3() {}
+ ~RGWConfigBucketMetaSearch_ObjStore_S3() {}
+
+ int get_params(optional_yield y) override;
+ void send_response() override;
+};
+
+class RGWGetBucketMetaSearch_ObjStore_S3 : public RGWGetBucketMetaSearch {
+public:
+ RGWGetBucketMetaSearch_ObjStore_S3() {}
+ ~RGWGetBucketMetaSearch_ObjStore_S3() {}
+
+ void send_response() override;
+};
+
+class RGWDelBucketMetaSearch_ObjStore_S3 : public RGWDelBucketMetaSearch {
+public:
+ RGWDelBucketMetaSearch_ObjStore_S3() {}
+ ~RGWDelBucketMetaSearch_ObjStore_S3() {}
+
+ void send_response() override;
+};
+
+class RGWGetBucketPolicyStatus_ObjStore_S3 : public RGWGetBucketPolicyStatus {
+public:
+ void send_response() override;
+};
+
+class RGWPutBucketPublicAccessBlock_ObjStore_S3 : public RGWPutBucketPublicAccessBlock {
+public:
+ void send_response() override;
+};
+
+class RGWGetBucketPublicAccessBlock_ObjStore_S3 : public RGWGetBucketPublicAccessBlock {
+public:
+ void send_response() override;
+};
+
+class RGW_Auth_S3 {
+public:
+ static int authorize(const DoutPrefixProvider *dpp,
+ rgw::sal::Driver* driver,
+ const rgw::auth::StrategyRegistry& auth_registry,
+ req_state *s, optional_yield y);
+};
+
+class RGWHandler_Auth_S3 : public RGWHandler_REST {
+ friend class RGWRESTMgr_S3;
+
+ const rgw::auth::StrategyRegistry& auth_registry;
+
+public:
+ explicit RGWHandler_Auth_S3(const rgw::auth::StrategyRegistry& auth_registry)
+ : RGWHandler_REST(),
+ auth_registry(auth_registry) {
+ }
+ ~RGWHandler_Auth_S3() override = default;
+
+ static int validate_bucket_name(const std::string& bucket);
+ static int validate_object_name(const std::string& bucket);
+
+ int init(rgw::sal::Driver* driver,
+ req_state *s,
+ rgw::io::BasicClient *cio) override;
+ int authorize(const DoutPrefixProvider *dpp, optional_yield y) override {
+ return RGW_Auth_S3::authorize(dpp, driver, auth_registry, s, y);
+ }
+ int postauth_init(optional_yield) override { return 0; }
+};
+
+class RGWHandler_REST_S3 : public RGWHandler_REST {
+ friend class RGWRESTMgr_S3;
+protected:
+ const rgw::auth::StrategyRegistry& auth_registry;
+public:
+ static int init_from_header(rgw::sal::Driver* driver, req_state *s, RGWFormat default_formatter,
+ bool configurable_format);
+
+ explicit RGWHandler_REST_S3(const rgw::auth::StrategyRegistry& auth_registry)
+ : RGWHandler_REST(),
+ auth_registry(auth_registry) {
+ }
+ ~RGWHandler_REST_S3() override = default;
+
+ int init(rgw::sal::Driver* driver,
+ req_state *s,
+ rgw::io::BasicClient *cio) override;
+ int authorize(const DoutPrefixProvider *dpp, optional_yield y) override;
+ int postauth_init(optional_yield y) override;
+};
+
+class RGWHandler_REST_Service_S3 : public RGWHandler_REST_S3 {
+protected:
+ bool is_usage_op() const {
+ return s->info.args.exists("usage");
+ }
+ RGWOp *op_get() override;
+ RGWOp *op_head() override;
+public:
+ RGWHandler_REST_Service_S3(const rgw::auth::StrategyRegistry& auth_registry) :
+ RGWHandler_REST_S3(auth_registry) {}
+ ~RGWHandler_REST_Service_S3() override = default;
+};
+
+class RGWHandler_REST_Bucket_S3 : public RGWHandler_REST_S3 {
+ const bool enable_pubsub;
+protected:
+ bool is_acl_op() const {
+ return s->info.args.exists("acl");
+ }
+ bool is_cors_op() const {
+ return s->info.args.exists("cors");
+ }
+ bool is_lc_op() const {
+ return s->info.args.exists("lifecycle");
+ }
+ bool is_obj_update_op() const override {
+ return is_acl_op() || is_cors_op();
+ }
+ bool is_tagging_op() const {
+ return s->info.args.exists("tagging");
+ }
+ bool is_request_payment_op() const {
+ return s->info.args.exists("requestPayment");
+ }
+ bool is_policy_op() const {
+ return s->info.args.exists("policy");
+ }
+ bool is_object_lock_op() const {
+ return s->info.args.exists("object-lock");
+ }
+ bool is_notification_op() const {
+ if (enable_pubsub) {
+ return s->info.args.exists("notification");
+ }
+ return false;
+ }
+ bool is_replication_op() const {
+ return s->info.args.exists("replication");
+ }
+ bool is_policy_status_op() {
+ return s->info.args.exists("policyStatus");
+ }
+ bool is_block_public_access_op() {
+ return s->info.args.exists("publicAccessBlock");
+ }
+ bool is_bucket_encryption_op() {
+ return s->info.args.exists("encryption");
+ }
+
+ RGWOp *get_obj_op(bool get_data) const;
+ RGWOp *op_get() override;
+ RGWOp *op_head() override;
+ RGWOp *op_put() override;
+ RGWOp *op_delete() override;
+ RGWOp *op_post() override;
+ RGWOp *op_options() override;
+public:
+ RGWHandler_REST_Bucket_S3(const rgw::auth::StrategyRegistry& auth_registry, bool _enable_pubsub) :
+ RGWHandler_REST_S3(auth_registry), enable_pubsub(_enable_pubsub) {}
+ ~RGWHandler_REST_Bucket_S3() override = default;
+};
+
+class RGWHandler_REST_Obj_S3 : public RGWHandler_REST_S3 {
+protected:
+ bool is_acl_op() const {
+ return s->info.args.exists("acl");
+ }
+ bool is_tagging_op() const {
+ return s->info.args.exists("tagging");
+ }
+ bool is_obj_retention_op() const {
+ return s->info.args.exists("retention");
+ }
+ bool is_obj_legal_hold_op() const {
+ return s->info.args.exists("legal-hold");
+ }
+
+ bool is_select_op() const {
+ return s->info.args.exists("select-type");
+ }
+
+ bool is_obj_update_op() const override {
+ return is_acl_op() || is_tagging_op() || is_obj_retention_op() || is_obj_legal_hold_op() || is_select_op();
+ }
+ RGWOp *get_obj_op(bool get_data);
+
+ RGWOp *op_get() override;
+ RGWOp *op_head() override;
+ RGWOp *op_put() override;
+ RGWOp *op_delete() override;
+ RGWOp *op_post() override;
+ RGWOp *op_options() override;
+public:
+ using RGWHandler_REST_S3::RGWHandler_REST_S3;
+ ~RGWHandler_REST_Obj_S3() override = default;
+};
+
+class RGWRESTMgr_S3 : public RGWRESTMgr {
+private:
+ const bool enable_s3website;
+ const bool enable_sts;
+ const bool enable_iam;
+ const bool enable_pubsub;
+public:
+ explicit RGWRESTMgr_S3(bool _enable_s3website=false, bool _enable_sts=false, bool _enable_iam=false, bool _enable_pubsub=false)
+ : enable_s3website(_enable_s3website),
+ enable_sts(_enable_sts),
+ enable_iam(_enable_iam),
+ enable_pubsub(_enable_pubsub) {
+ }
+
+ ~RGWRESTMgr_S3() override = default;
+
+ RGWHandler_REST *get_handler(rgw::sal::Driver* driver,
+ req_state* s,
+ const rgw::auth::StrategyRegistry& auth_registry,
+ const std::string& frontend_prefix) override;
+};
+
+class RGWHandler_REST_Obj_S3Website;
+
// Return true if 'bucket' looks like a literal IP address: either a valid
// IPv6 address (per inet_pton), or an IPv4-style name consisting only of
// digit runs separated by periods, with exactly three periods total.
// S3 naming rules reject bucket names that look like IP addresses.
static inline bool looks_like_ip_address(const char *bucket)
{
  struct in6_addr a;
  if (inet_pton(AF_INET6, bucket, static_cast<void*>(&a)) == 1) {
    return true;
  }
  int num_periods = 0;
  bool expect_period = false;  // becomes true once a digit starts a run
  for (const char *b = bucket; *b; ++b) {
    if (*b == '.') {
      if (!expect_period)
        return false;  // leading or consecutive period
      ++num_periods;
      if (num_periods > 3)
        return false;
      expect_period = false;
    }
    // Cast to unsigned char: passing a plain (possibly negative) char to
    // <cctype> functions is undefined behavior per the C standard.
    else if (isdigit(static_cast<unsigned char>(*b))) {
      expect_period = true;
    }
    else {
      return false;  // any non-digit, non-period character
    }
  }
  return (num_periods == 3);
}
+
+// Validate an S3 object key: at most 1024 bytes and valid UTF-8.
+// Returns 0 on success, -ERR_INVALID_OBJECT_NAME otherwise.
+// NOTE(review): assumes check_utf8() returns non-zero for an invalid
+// byte sequence (so a non-zero result is an error) — confirm against its
+// definition in common/utf8.c.
+inline int valid_s3_object_name(const std::string& name) {
+  if (name.size() > 1024) {
+    return -ERR_INVALID_OBJECT_NAME;
+  }
+  if (check_utf8(name.c_str(), name.size())) {
+    return -ERR_INVALID_OBJECT_NAME;
+  }
+  return 0;
+}
+
+inline int valid_s3_bucket_name(const std::string& name, bool relaxed=false)
+{
+ // This function enforces Amazon's spec for bucket names.
+ // (The requirements, not the recommendations.)
+ int len = name.size();
+ int max = (relaxed ? 255 : 63);
+
+ if (len < 3) {
+ // Name too short
+ return -ERR_INVALID_BUCKET_NAME;
+ } else if (len > max) {
+ // Name too long
+ return -ERR_INVALID_BUCKET_NAME;
+ }
+
+ // bucket names must start with a number or letter
+ if (!(isalpha(name[0]) || isdigit(name[0]))) {
+ if (!relaxed)
+ return -ERR_INVALID_BUCKET_NAME;
+ else if (!(name[0] == '_' || name[0] == '.' || name[0] == '-'))
+ return -ERR_INVALID_BUCKET_NAME;
+ }
+
+ // bucket names must end with a number or letter
+ if (!(isalpha(name[len-1]) || isdigit(name[len-1])))
+ if (!relaxed)
+ return -ERR_INVALID_BUCKET_NAME;
+
+ for (const char *s = name.c_str(); *s; ++s) {
+ char c = *s;
+ if (isdigit(c))
+ continue;
+
+ if (isalpha(c)) {
+ // name cannot contain uppercase letters
+ if (relaxed || islower(c))
+ continue;
+ }
+
+ if (c == '_')
+ // name cannot contain underscore
+ if (relaxed)
+ continue;
+
+ if (c == '-')
+ continue;
+
+ if (c == '.') {
+ if (!relaxed && s && *s) {
+ // name cannot have consecutive periods or dashes
+ // adjacent to periods
+ // ensure s is neither the first nor the last character
+ char p = *(s-1);
+ char n = *(s+1);
+ if ((p != '-') && (n != '.') && (n != '-'))
+ continue;
+ } else {
+ continue;
+ }
+ }
+
+ // Invalid character
+ return -ERR_INVALID_BUCKET_NAME;
+ }
+
+ if (looks_like_ip_address(name.c_str()))
+ return -ERR_INVALID_BUCKET_NAME;
+
+ return 0;
+}
+
+namespace rgw::auth::s3 {
+
+// Common base for the AWS-signature authentication engines declared below
+// (LocalEngine, LDAPEngine, STSEngine).  The nested VersionAbstractor hides
+// the differences between AWS auth versions (v2 vs v4) and flavours behind
+// a single get_auth_data() interface.
+class AWSEngine : public rgw::auth::Engine {
+public:
+  class VersionAbstractor {
+    static constexpr size_t DIGEST_SIZE_V2 = CEPH_CRYPTO_HMACSHA1_DIGESTSIZE;
+    static constexpr size_t DIGEST_SIZE_V4 = CEPH_CRYPTO_HMACSHA256_DIGESTSIZE;
+
+    /* Knowing the signature max size allows us to employ the sstring, and thus
+     * avoid dynamic allocations. The multiplier comes from representing digest
+     * in the base64-encoded form. */
+    static constexpr size_t SIGNATURE_MAX_SIZE = \
+      std::max(DIGEST_SIZE_V2, DIGEST_SIZE_V4) * 2 + sizeof('\0');
+
+  public:
+    virtual ~VersionAbstractor() {};
+
+    using access_key_id_t = std::string_view;
+    using client_signature_t = std::string_view;
+    using session_token_t = std::string_view;
+    using server_signature_t = basic_sstring<char, uint16_t, SIGNATURE_MAX_SIZE>;
+    using string_to_sign_t = std::string;
+
+    /* Transformation for crafting the AWS signature at server side which is
+     * used later to compare with the user-provided one. The methodology for
+     * doing that depends on AWS auth version. */
+    using signature_factory_t = \
+      std::function<server_signature_t(CephContext* cct,
+                                       const std::string& secret_key,
+                                       const string_to_sign_t& string_to_sign)>;
+
+    /* Return an instance of Completer for verifying the payload's fingerprint
+     * if necessary. Otherwise caller gets nullptr. Caller may provide secret
+     * key */
+    using completer_factory_t = \
+      std::function<rgw::auth::Completer::cmplptr_t(
+        const boost::optional<std::string>& secret_key)>;
+
+    // Everything extracted from the request that is needed to verify it.
+    struct auth_data_t {
+      access_key_id_t access_key_id;
+      client_signature_t client_signature;
+      session_token_t session_token;
+      string_to_sign_t string_to_sign;
+      signature_factory_t signature_factory;
+      completer_factory_t completer_factory;
+    };
+
+    virtual auth_data_t get_auth_data(const req_state* s) const = 0;
+  };
+
+protected:
+  CephContext* cct;
+  const VersionAbstractor& ver_abstractor;
+
+  AWSEngine(CephContext* const cct, const VersionAbstractor& ver_abstractor)
+    : cct(cct),
+      ver_abstractor(ver_abstractor) {
+  }
+
+  using result_t = rgw::auth::Engine::result_t;
+  using string_to_sign_t = VersionAbstractor::string_to_sign_t;
+  using signature_factory_t = VersionAbstractor::signature_factory_t;
+  using completer_factory_t = VersionAbstractor::completer_factory_t;
+
+  /* TODO(rzarzynski): clean up. We have too many input parameters here. Also
+   * the signature of VersionAbstractor::get_auth_data() is too complicated.
+   * Replace these things with a simple, dedicated structure. */
+  virtual result_t authenticate(const DoutPrefixProvider* dpp,
+                                const std::string_view& access_key_id,
+                                const std::string_view& signature,
+                                const std::string_view& session_token,
+                                const string_to_sign_t& string_to_sign,
+                                const signature_factory_t& signature_factory,
+                                const completer_factory_t& completer_factory,
+                                const req_state* s,
+                                optional_yield y) const = 0;
+
+public:
+  result_t authenticate(const DoutPrefixProvider* dpp, const req_state* const s,
+                        optional_yield y) const final;
+};
+
+
+class AWSGeneralAbstractor : public AWSEngine::VersionAbstractor {
+ CephContext* const cct;
+
+ virtual boost::optional<std::string>
+ get_v4_canonical_headers(const req_info& info,
+ const std::string_view& signedheaders,
+ const bool using_qs) const;
+
+ auth_data_t get_auth_data_v2(const req_state* s) const;
+ auth_data_t get_auth_data_v4(const req_state* s, const bool using_qs) const;
+
+public:
+ explicit AWSGeneralAbstractor(CephContext* const cct)
+ : cct(cct) {
+ }
+
+ auth_data_t get_auth_data(const req_state* s) const override;
+};
+
+class AWSGeneralBoto2Abstractor : public AWSGeneralAbstractor {
+ boost::optional<std::string>
+ get_v4_canonical_headers(const req_info& info,
+ const std::string_view& signedheaders,
+ const bool using_qs) const override;
+
+public:
+ using AWSGeneralAbstractor::AWSGeneralAbstractor;
+};
+
+class AWSBrowserUploadAbstractor : public AWSEngine::VersionAbstractor {
+ static std::string to_string(ceph::bufferlist bl) {
+ return std::string(bl.c_str(),
+ static_cast<std::string::size_type>(bl.length()));
+ }
+
+ auth_data_t get_auth_data_v2(const req_state* s) const;
+ auth_data_t get_auth_data_v4(const req_state* s) const;
+
+public:
+ explicit AWSBrowserUploadAbstractor(CephContext*) {
+ }
+
+ auth_data_t get_auth_data(const req_state* s) const override;
+};
+
+class AWSSignerV4 {
+ const DoutPrefixProvider *dpp;
+ CephContext *cct;
+
+public:
+ AWSSignerV4(const DoutPrefixProvider *_dpp) : dpp(_dpp),
+ cct(_dpp->get_cct()) {}
+
+ using access_key_id_t = std::string_view;
+ using string_to_sign_t = AWSEngine::VersionAbstractor::string_to_sign_t;
+ using signature_headers_t = std::map<std::string, std::string>;
+
+ struct prepare_result_t;
+
+ using signature_factory_t = \
+ std::function<signature_headers_t(const DoutPrefixProvider* dpp,
+ const std::string& secret_key,
+ const prepare_result_t&)>;
+
+ struct prepare_result_t {
+ access_key_id_t access_key_id;
+ std::string date;
+ std::string scope;
+ std::string signed_headers;
+ string_to_sign_t string_to_sign;
+ std::map<std::string, std::string> extra_headers;
+ signature_factory_t signature_factory;
+ };
+
+ static prepare_result_t prepare(const DoutPrefixProvider *dpp,
+ const std::string& access_key_id,
+ const string& region,
+ const string& service,
+ const req_info& info,
+ const bufferlist *opt_content,
+ bool s3_op);
+};
+
+
+extern AWSSignerV4::signature_headers_t
+gen_v4_signature(const DoutPrefixProvider *dpp,
+ const std::string_view& secret_key,
+ const AWSSignerV4::prepare_result_t& sig_info);
+
+class LDAPEngine : public AWSEngine {
+ static rgw::LDAPHelper* ldh;
+ static std::mutex mtx;
+
+ static void init(CephContext* const cct);
+
+ using acl_strategy_t = rgw::auth::RemoteApplier::acl_strategy_t;
+ using auth_info_t = rgw::auth::RemoteApplier::AuthInfo;
+ using result_t = rgw::auth::Engine::result_t;
+
+protected:
+ rgw::sal::Driver* driver;
+ const rgw::auth::RemoteApplier::Factory* const apl_factory;
+
+ acl_strategy_t get_acl_strategy() const;
+ auth_info_t get_creds_info(const rgw::RGWToken& token) const noexcept;
+
+ result_t authenticate(const DoutPrefixProvider* dpp,
+ const std::string_view& access_key_id,
+ const std::string_view& signature,
+ const std::string_view& session_token,
+ const string_to_sign_t& string_to_sign,
+ const signature_factory_t&,
+ const completer_factory_t& completer_factory,
+ const req_state* s,
+ optional_yield y) const override;
+public:
+ LDAPEngine(CephContext* const cct,
+ rgw::sal::Driver* driver,
+ const VersionAbstractor& ver_abstractor,
+ const rgw::auth::RemoteApplier::Factory* const apl_factory)
+ : AWSEngine(cct, ver_abstractor),
+ driver(driver),
+ apl_factory(apl_factory) {
+ init(cct);
+ }
+
+ using AWSEngine::authenticate;
+
+ const char* get_name() const noexcept override {
+ return "rgw::auth::s3::LDAPEngine";
+ }
+
+ static bool valid();
+ static void shutdown();
+};
+
+class LocalEngine : public AWSEngine {
+ rgw::sal::Driver* driver;
+ const rgw::auth::LocalApplier::Factory* const apl_factory;
+
+ result_t authenticate(const DoutPrefixProvider* dpp,
+ const std::string_view& access_key_id,
+ const std::string_view& signature,
+ const std::string_view& session_token,
+ const string_to_sign_t& string_to_sign,
+ const signature_factory_t& signature_factory,
+ const completer_factory_t& completer_factory,
+ const req_state* s,
+ optional_yield y) const override;
+public:
+ LocalEngine(CephContext* const cct,
+ rgw::sal::Driver* driver,
+ const VersionAbstractor& ver_abstractor,
+ const rgw::auth::LocalApplier::Factory* const apl_factory)
+ : AWSEngine(cct, ver_abstractor),
+ driver(driver),
+ apl_factory(apl_factory) {
+ }
+
+ using AWSEngine::authenticate;
+
+ const char* get_name() const noexcept override {
+ return "rgw::auth::s3::LocalEngine";
+ }
+};
+
+class STSEngine : public AWSEngine {
+ rgw::sal::Driver* driver;
+ const rgw::auth::LocalApplier::Factory* const local_apl_factory;
+ const rgw::auth::RemoteApplier::Factory* const remote_apl_factory;
+ const rgw::auth::RoleApplier::Factory* const role_apl_factory;
+
+ using acl_strategy_t = rgw::auth::RemoteApplier::acl_strategy_t;
+ using auth_info_t = rgw::auth::RemoteApplier::AuthInfo;
+
+ acl_strategy_t get_acl_strategy() const { return nullptr; };
+ auth_info_t get_creds_info(const STS::SessionToken& token) const noexcept;
+
+ int get_session_token(const DoutPrefixProvider* dpp, const std::string_view& session_token,
+ STS::SessionToken& token) const;
+
+ result_t authenticate(const DoutPrefixProvider* dpp,
+ const std::string_view& access_key_id,
+ const std::string_view& signature,
+ const std::string_view& session_token,
+ const string_to_sign_t& string_to_sign,
+ const signature_factory_t& signature_factory,
+ const completer_factory_t& completer_factory,
+ const req_state* s,
+ optional_yield y) const override;
+public:
+ STSEngine(CephContext* const cct,
+ rgw::sal::Driver* driver,
+ const VersionAbstractor& ver_abstractor,
+ const rgw::auth::LocalApplier::Factory* const local_apl_factory,
+ const rgw::auth::RemoteApplier::Factory* const remote_apl_factory,
+ const rgw::auth::RoleApplier::Factory* const role_apl_factory)
+ : AWSEngine(cct, ver_abstractor),
+ driver(driver),
+ local_apl_factory(local_apl_factory),
+ remote_apl_factory(remote_apl_factory),
+ role_apl_factory(role_apl_factory) {
+ }
+
+ using AWSEngine::authenticate;
+
+ const char* get_name() const noexcept override {
+ return "rgw::auth::s3::STSEngine";
+ }
+};
+
+class S3AnonymousEngine : public rgw::auth::AnonymousEngine {
+ bool is_applicable(const req_state* s) const noexcept override;
+
+public:
+ /* Let's reuse the parent class' constructor. */
+ using rgw::auth::AnonymousEngine::AnonymousEngine;
+
+ const char* get_name() const noexcept override {
+ return "rgw::auth::s3::S3AnonymousEngine";
+ }
+};
+
+
+} // namespace rgw::auth::s3
diff --git a/src/rgw/rgw_rest_s3website.h b/src/rgw/rgw_rest_s3website.h
new file mode 100644
index 000000000..3030926a7
--- /dev/null
+++ b/src/rgw/rgw_rest_s3website.h
@@ -0,0 +1,100 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Robin H. Johnson <robin.johnson@dreamhost.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "rgw_rest_s3.h"
+
+// REST handler for S3 static-website requests. Only GET and HEAD are
+// dispatched to real ops; every other verb returns NULL (no op) below.
+class RGWHandler_REST_S3Website : public RGWHandler_REST_S3 {
+  std::string original_object_name; // object name before retarget()
+  bool web_dir() const;
+protected:
+  int retarget(RGWOp *op, RGWOp **new_op, optional_yield y) override;
+  // TODO: this should be virtual I think, and ensure that it's always
+  // overridden, but that conflates that op_get/op_head are defined in this
+  // class and call this; and don't need to be overridden later.
+  virtual RGWOp *get_obj_op(bool get_data) { return NULL; }
+  RGWOp *op_get() override;
+  RGWOp *op_head() override;
+  // Only allowed to use GET+HEAD
+  RGWOp *op_put() override { return NULL; }
+  RGWOp *op_delete() override { return NULL; }
+  RGWOp *op_post() override { return NULL; }
+  RGWOp *op_copy() override { return NULL; }
+  RGWOp *op_options() override { return NULL; }
+
+  // Serve the website's configured error document for the given HTTP status.
+  int serve_errordoc(const DoutPrefixProvider *dpp, int http_ret, const std::string &errordoc_key, optional_yield y);
+public:
+  using RGWHandler_REST_S3::RGWHandler_REST_S3;
+  ~RGWHandler_REST_S3Website() override = default;
+
+  int init(rgw::sal::Driver* driver, req_state *s, rgw::io::BasicClient* cio) override;
+  int error_handler(int err_no, std::string *error_content, optional_yield y) override;
+};
+
+// Website handler for service-level requests; supplies its own object-op
+// factory (defined in the .cc file).
+class RGWHandler_REST_Service_S3Website : public RGWHandler_REST_S3Website {
+protected:
+  RGWOp *get_obj_op(bool get_data) override;
+public:
+  using RGWHandler_REST_S3Website::RGWHandler_REST_S3Website;
+  ~RGWHandler_REST_Service_S3Website() override = default;
+};
+
+// Website handler for object-level requests; supplies its own object-op
+// factory (defined in the .cc file).
+class RGWHandler_REST_Obj_S3Website : public RGWHandler_REST_S3Website {
+protected:
+  RGWOp *get_obj_op(bool get_data) override;
+public:
+  using RGWHandler_REST_S3Website::RGWHandler_REST_S3Website;
+  ~RGWHandler_REST_Obj_S3Website() override = default;
+};
+
+/* The cross-inheritance from Obj to Bucket is deliberate!
+ * S3Websites do NOT support any bucket operations
+ */
+// Website handler for bucket-level requests; supplies its own object-op
+// factory (defined in the .cc file). See the note above about the
+// deliberate Obj→Bucket cross-inheritance.
+class RGWHandler_REST_Bucket_S3Website : public RGWHandler_REST_S3Website {
+protected:
+  RGWOp *get_obj_op(bool get_data) override;
+public:
+  using RGWHandler_REST_S3Website::RGWHandler_REST_S3Website;
+  ~RGWHandler_REST_Bucket_S3Website() override = default;
+};
+
+// TODO: do we actually need this?
+class RGWGetObj_ObjStore_S3Website : public RGWGetObj_ObjStore_S3
+{
+ friend class RGWHandler_REST_S3Website;
+private:
+ bool is_errordoc_request;
+public:
+ RGWGetObj_ObjStore_S3Website() : is_errordoc_request(false) {}
+ explicit RGWGetObj_ObjStore_S3Website(bool is_errordoc_request) : is_errordoc_request(false) { this->is_errordoc_request = is_errordoc_request; }
+ ~RGWGetObj_ObjStore_S3Website() override {}
+ int send_response_data_error(optional_yield y) override;
+ int send_response_data(bufferlist& bl, off_t ofs, off_t len) override;
+ // We override RGWGetObj_ObjStore::get_params here, to allow ignoring all
+ // conditional params for error pages.
+ int get_params(optional_yield y) override {
+ if (is_errordoc_request) {
+ range_str = NULL;
+ if_mod = NULL;
+ if_unmod = NULL;
+ if_match = NULL;
+ if_nomatch = NULL;
+ return 0;
+ } else {
+ return RGWGetObj_ObjStore_S3::get_params(y);
+ }
+ }
+};
diff --git a/src/rgw/rgw_rest_sts.cc b/src/rgw/rgw_rest_sts.cc
new file mode 100644
index 000000000..09f77f61d
--- /dev/null
+++ b/src/rgw/rgw_rest_sts.cc
@@ -0,0 +1,819 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+#include <vector>
+#include <string>
+#include <array>
+#include <string_view>
+#include <sstream>
+#include <memory>
+
+#include <boost/algorithm/string/predicate.hpp>
+#include <boost/format.hpp>
+#include <boost/optional.hpp>
+#include <boost/utility/in_place_factory.hpp>
+#include <boost/tokenizer.hpp>
+
+
+
+#include "ceph_ver.h"
+#include "common/Formatter.h"
+#include "common/utf8.h"
+#include "common/ceph_json.h"
+
+#include "rgw_rest.h"
+#include "rgw_auth.h"
+#include "rgw_auth_registry.h"
+#include "jwt-cpp/jwt.h"
+#include "rgw_rest_sts.h"
+
+#include "rgw_formats.h"
+#include "rgw_client_io.h"
+
+#include "rgw_request.h"
+#include "rgw_process.h"
+#include "rgw_iam_policy.h"
+#include "rgw_iam_policy_keywords.h"
+
+#include "rgw_sts.h"
+#include "rgw_rest_oidc_provider.h"
+
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+namespace rgw::auth::sts {
+
+bool
+WebTokenEngine::is_applicable(const std::string& token) const noexcept
+{
+  // The engine can only run when a web-identity token was supplied.
+  if (token.empty()) {
+    return false;
+  }
+  return true;
+}
+
+std::string
+WebTokenEngine::get_role_tenant(const string& role_arn) const
+{
+  // The tenant is carried in the account field of the role ARN; an
+  // unparseable ARN yields an empty tenant.
+  const auto parsed = rgw::ARN::parse(role_arn);
+  return parsed ? parsed->account : string();
+}
+
+std::string
+WebTokenEngine::get_role_name(const string& role_arn) const
+{
+  // The role name is the last path component of the ARN's resource
+  // field; an unparseable ARN yields an empty name.
+  string name;
+  if (const auto parsed = rgw::ARN::parse(role_arn)) {
+    name = parsed->resource;
+  }
+  const auto slash = name.find_last_of('/');
+  if (slash != string::npos) {
+    name.erase(0, slash + 1);
+  }
+  return name;
+}
+
+// Look up the OIDC provider registered for the tenant encoded in the
+// role ARN whose URL matches the token issuer. The issuer URL is
+// normalized by stripping a scheme prefix ("http://"/"https://") or,
+// failing that, a "www." prefix before building the provider ARN.
+// Returns nullptr when no matching provider is stored.
+std::unique_ptr<rgw::sal::RGWOIDCProvider>
+WebTokenEngine::get_provider(const DoutPrefixProvider *dpp, const string& role_arn, const string& iss) const
+{
+  string tenant = get_role_tenant(role_arn);
+
+  string idp_url = iss;
+  // NOTE(review): find() matches the substring anywhere in the string,
+  // not only as a prefix; issuers are presumably scheme-prefixed URLs —
+  // confirm this cannot mis-strip a mid-string occurrence.
+  auto pos = idp_url.find("http://");
+  if (pos == std::string::npos) {
+    pos = idp_url.find("https://");
+    if (pos != std::string::npos) {
+      idp_url.erase(pos, 8);
+    } else {
+      // Only strip "www." when no scheme was present at all.
+      pos = idp_url.find("www.");
+      if (pos != std::string::npos) {
+        idp_url.erase(pos, 4);
+      }
+    }
+  } else {
+    idp_url.erase(pos, 7);
+  }
+  auto provider_arn = rgw::ARN(idp_url, "oidc-provider", tenant);
+  string p_arn = provider_arn.to_string();
+  std::unique_ptr<rgw::sal::RGWOIDCProvider> provider = driver->get_oidc_provider();
+  provider->set_arn(p_arn);
+  provider->set_tenant(tenant);
+  auto ret = provider->get(dpp);
+  if (ret < 0) {
+    return nullptr;
+  }
+  return provider;
+}
+
+// Return true iff client_id appears in the provider's registered list.
+bool
+WebTokenEngine::is_client_id_valid(vector<string>& client_ids, const string& client_id) const
+{
+  // Iterate by const reference: the original loop took each element by
+  // value, copying every registered id string per comparison.
+  for (const auto& id : client_ids) {
+    if (id == client_id) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Compute the SHA-1 fingerprint of the PEM-encoded certificate and
+// compare it — as lowercase hex without separators, case-insensitively —
+// against the thumbprints registered with the OIDC provider.
+bool
+WebTokenEngine::is_cert_valid(const vector<string>& thumbprints, const string& cert) const
+{
+  //calculate thumbprint of cert
+  std::unique_ptr<BIO, decltype(&BIO_free_all)> certbio(BIO_new_mem_buf(cert.data(), cert.size()), BIO_free_all);
+  string pw="";
+  std::unique_ptr<X509, decltype(&X509_free)> x_509cert(PEM_read_bio_X509(certbio.get(), nullptr, nullptr, const_cast<char*>(pw.c_str())), X509_free);
+  // Guard against an unparseable certificate: PEM_read_bio_X509 returns
+  // NULL on failure and X509_digest must not be handed a null X509.
+  // (Also dropped an unused keybio BIO the original allocated.)
+  if (!x_509cert) {
+    return false;
+  }
+  const EVP_MD* fprint_type = EVP_sha1();
+  unsigned int fprint_size;
+  unsigned char fprint[EVP_MAX_MD_SIZE];
+
+  if (!X509_digest(x_509cert.get(), fprint_type, fprint, &fprint_size)) {
+    return false;
+  }
+  // Render the raw digest bytes as two-digit lowercase hex.
+  stringstream ss;
+  for (unsigned int i = 0; i < fprint_size; i++) {
+    ss << std::setfill('0') << std::setw(2) << std::hex << (0xFF & (unsigned int)fprint[i]);
+  }
+  std::string digest = ss.str();
+
+  for (auto& it : thumbprints) {
+    if (boost::iequals(it,digest)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Flatten a JWT claim into container t as (key, stringified-value)
+// pairs. Scalars are serialized; arrays repeat the same key once per
+// element; nested objects recurse using each member's own key (the
+// incoming key is discarded); null claims are skipped entirely.
+// T is any container with emplace(pair<string,string>) — the token
+// multimap or the principal-tags set.
+template <typename T>
+void
+WebTokenEngine::recurse_and_insert(const string& key, const jwt::claim& c, T& t) const
+{
+  string s_val;
+  jwt::claim::type c_type = c.get_type();
+  switch(c_type) {
+    case jwt::claim::type::null:
+      break;
+    case jwt::claim::type::boolean:
+    case jwt::claim::type::number:
+    case jwt::claim::type::int64:
+    {
+      // Non-string scalars keep their JSON representation (e.g. "true", "42").
+      s_val = c.to_json().serialize();
+      t.emplace(std::make_pair(key, s_val));
+      break;
+    }
+    case jwt::claim::type::string:
+    {
+      // Strings are inserted without surrounding JSON quotes.
+      s_val = c.to_json().to_str();
+      t.emplace(std::make_pair(key, s_val));
+      break;
+    }
+    case jwt::claim::type::array:
+    {
+      const picojson::array& arr = c.as_array();
+      for (auto& a : arr) {
+        recurse_and_insert(key, jwt::claim(a), t);
+      }
+      break;
+    }
+    case jwt::claim::type::object:
+    {
+      const picojson::object& obj = c.as_object();
+      for (auto& m : obj) {
+        recurse_and_insert(m.first, jwt::claim(m.second), t);
+      }
+      break;
+    }
+  }
+  return;
+}
+
+//Extract all token claims so that they can be later used in the Condition element of Role's trust policy
+WebTokenEngine::token_t
+WebTokenEngine::get_token_claims(const jwt::decoded_jwt& decoded) const
+{
+ WebTokenEngine::token_t token;
+ const auto& claims = decoded.get_payload_claims();
+
+ for (auto& c : claims) {
+ if (c.first == string(princTagsNamespace)) {
+ continue;
+ }
+ recurse_and_insert(c.first, c.second, token);
+ }
+ return token;
+}
+
+//Offline validation of incoming Web Token which is a signed JWT (JSON Web Token)
+// Decode and validate the web token. On success returns (claims,
+// principal tags); returns (none, none) for a malformed or unsigned
+// token; throws -EACCES/-EINVAL for policy-level failures (unknown
+// provider, client-id mismatch, bad signature, malformed tags).
+std::tuple<boost::optional<WebTokenEngine::token_t>, boost::optional<WebTokenEngine::principal_tags_t>>
+WebTokenEngine::get_from_jwt(const DoutPrefixProvider* dpp, const std::string& token, const req_state* const s,
+                             optional_yield y) const
+{
+  WebTokenEngine::token_t t;
+  WebTokenEngine::principal_tags_t principal_tags;
+  try {
+    const auto& decoded = jwt::decode(token);
+
+    auto& payload = decoded.get_payload();
+    ldpp_dout(dpp, 20) << " payload = " << payload << dendl;
+
+    t = get_token_claims(decoded);
+
+    string iss;
+    if (decoded.has_issuer()) {
+      iss = decoded.get_issuer();
+    }
+
+    set<string> aud;
+    if (decoded.has_audience()) {
+      aud = decoded.get_audience();
+    }
+
+    // The client id can arrive under either spelling depending on the IDP.
+    string client_id;
+    if (decoded.has_payload_claim("client_id")) {
+      client_id = decoded.get_payload_claim("client_id").as_string();
+    }
+    if (client_id.empty() && decoded.has_payload_claim("clientId")) {
+      client_id = decoded.get_payload_claim("clientId").as_string();
+    }
+    string azp;
+    if (decoded.has_payload_claim("azp")) {
+      azp = decoded.get_payload_claim("azp").as_string();
+    }
+
+    // The issuer must correspond to an OIDC provider registered for the
+    // role's tenant.
+    string role_arn = s->info.args.get("RoleArn");
+    auto provider = get_provider(dpp, role_arn, iss);
+    if (! provider) {
+      ldpp_dout(dpp, 0) << "Couldn't get oidc provider info using input iss" << iss << dendl;
+      throw -EACCES;
+    }
+    // Principal tags must be a JSON object/array under the AWS tags
+    // namespace claim; anything else is rejected.
+    if (decoded.has_payload_claim(string(princTagsNamespace))) {
+      auto& cl = decoded.get_payload_claim(string(princTagsNamespace));
+      if (cl.get_type() == jwt::claim::type::object || cl.get_type() == jwt::claim::type::array) {
+        recurse_and_insert("dummy", cl, principal_tags);
+        for (auto it : principal_tags) {
+          ldpp_dout(dpp, 5) << "Key: " << it.first << " Value: " << it.second << dendl;
+        }
+      } else {
+        ldpp_dout(dpp, 0) << "Malformed principal tags" << cl.as_string() << dendl;
+        throw -EINVAL;
+      }
+    }
+    // When the provider registers client ids, one of aud / client_id /
+    // azp must match.
+    vector<string> client_ids = provider->get_client_ids();
+    vector<string> thumbprints = provider->get_thumbprints();
+    if (! client_ids.empty()) {
+      bool found = false;
+      for (auto& it : aud) {
+        if (is_client_id_valid(client_ids, it)) {
+          found = true;
+          break;
+        }
+      }
+      if (! found && ! is_client_id_valid(client_ids, client_id) && ! is_client_id_valid(client_ids, azp)) {
+        ldpp_dout(dpp, 0) << "Client id in token doesn't match with that registered with oidc provider" << dendl;
+        throw -EACCES;
+      }
+    }
+    //Validate signature
+    if (decoded.has_algorithm()) {
+      auto& algorithm = decoded.get_algorithm();
+      try {
+        validate_signature(dpp, decoded, algorithm, iss, thumbprints, y);
+      } catch (...) {
+        throw -EACCES;
+      }
+    } else {
+      // Unsigned token: treated as invalid rather than as a hard error.
+      return {boost::none, boost::none};
+    }
+  } catch (int error) {
+    // Re-throw access denials raised above; any other int means the
+    // token itself was invalid.
+    if (error == -EACCES) {
+      throw -EACCES;
+    }
+    ldpp_dout(dpp, 5) << "Invalid JWT token" << dendl;
+    return {boost::none, boost::none};
+  }
+  catch (...) {
+    // jwt::decode throws on malformed input.
+    ldpp_dout(dpp, 5) << "Invalid JWT token" << dendl;
+    return {boost::none, boost::none};
+  }
+  return {t, principal_tags};
+}
+
+// Fetch the issuer's openid-configuration document and return its
+// "jwks_uri" (the URL of the provider's signing keys). Returns an empty
+// string when the document is malformed; throws -EINVAL when the HTTP
+// request itself fails.
+std::string
+WebTokenEngine::get_cert_url(const string& iss, const DoutPrefixProvider *dpp, optional_yield y) const
+{
+  string cert_url;
+  string openidc_wellknown_url = iss;
+  bufferlist openidc_resp;
+
+  // Guard: back()/pop_back() on an empty string is undefined behavior,
+  // and an empty issuer cannot yield a usable discovery URL anyway.
+  if (openidc_wellknown_url.empty()) {
+    ldpp_dout(dpp, 10) << "Empty issuer; cannot build openid-configuration URL" << dendl;
+    return cert_url;
+  }
+  if (openidc_wellknown_url.back() == '/') {
+    openidc_wellknown_url.pop_back();
+  }
+  openidc_wellknown_url.append("/.well-known/openid-configuration");
+
+  RGWHTTPTransceiver openidc_req(cct, "GET", openidc_wellknown_url, &openidc_resp);
+
+  //Headers
+  openidc_req.append_header("Content-Type", "application/x-www-form-urlencoded");
+
+  int res = openidc_req.process(y);
+  if (res < 0) {
+    ldpp_dout(dpp, 10) << "HTTP request res: " << res << dendl;
+    throw -EINVAL;
+  }
+
+  //Debug only
+  ldpp_dout(dpp, 20) << "HTTP status: " << openidc_req.get_http_status() << dendl;
+  ldpp_dout(dpp, 20) << "JSON Response is: " << openidc_resp.c_str() << dendl;
+
+  JSONParser parser;
+  if (parser.parse(openidc_resp.c_str(), openidc_resp.length())) {
+    JSONObj::data_val val;
+    if (parser.get_data("jwks_uri", &val)) {
+      // Assign the string directly (the previous c_str() round-trip
+      // would truncate at an embedded NUL).
+      cert_url = val.str;
+      ldpp_dout(dpp, 20) << "Cert URL is: " << cert_url.c_str() << dendl;
+    } else {
+      ldpp_dout(dpp, 0) << "Malformed json returned while fetching openidc url" << dendl;
+    }
+  }
+  return cert_url;
+}
+
+// Verify the JWT's signature against the issuer's published certs:
+// fetch jwks_uri from the openid-configuration, extract the x5c cert
+// chain, require one cert to match a thumbprint registered with the
+// OIDC provider, then verify the token with the declared algorithm.
+// HMAC (HS*) and unrecognized algorithms are rejected. Throws -EINVAL
+// (or rethrows the verifier's exception) on any failure.
+void
+WebTokenEngine::validate_signature(const DoutPrefixProvider* dpp, const jwt::decoded_jwt& decoded, const string& algorithm, const string& iss, const vector<string>& thumbprints, optional_yield y) const
+{
+  if (algorithm != "HS256" && algorithm != "HS384" && algorithm != "HS512") {
+    string cert_url = get_cert_url(iss, dpp, y);
+    if (cert_url.empty()) {
+      throw -EINVAL;
+    }
+
+    // Get certificate
+    bufferlist cert_resp;
+    RGWHTTPTransceiver cert_req(cct, "GET", cert_url, &cert_resp);
+    //Headers
+    cert_req.append_header("Content-Type", "application/x-www-form-urlencoded");
+
+    int res = cert_req.process(y);
+    if (res < 0) {
+      ldpp_dout(dpp, 10) << "HTTP request res: " << res << dendl;
+      throw -EINVAL;
+    }
+    //Debug only
+    ldpp_dout(dpp, 20) << "HTTP status: " << cert_req.get_http_status() << dendl;
+    ldpp_dout(dpp, 20) << "JSON Response is: " << cert_resp.c_str() << dendl;
+
+    JSONParser parser;
+    if (parser.parse(cert_resp.c_str(), cert_resp.length())) {
+      JSONObj::data_val val;
+      if (parser.get_data("keys", &val)) {
+        // Strip the surrounding brackets of the "keys" array so the
+        // first JWK object can be re-parsed on its own.
+        if (val.str[0] == '[') {
+          val.str.erase(0, 1);
+        }
+        if (val.str[val.str.size() - 1] == ']') {
+          val.str = val.str.erase(val.str.size() - 1, 1);
+        }
+        if (parser.parse(val.str.c_str(), val.str.size())) {
+          vector<string> x5c;
+          if (JSONDecoder::decode_json("x5c", x5c, &parser)) {
+            string cert;
+            bool found_valid_cert = false;
+            for (auto& it : x5c) {
+              cert = "-----BEGIN CERTIFICATE-----\n" + it + "\n-----END CERTIFICATE-----";
+              ldpp_dout(dpp, 20) << "Certificate is: " << cert.c_str() << dendl;
+              if (is_cert_valid(thumbprints, cert)) {
+                found_valid_cert = true;
+                break;
+              }
+              // BUGFIX: removed an unconditional `found_valid_cert = true;`
+              // that followed the check above — it made the thumbprint
+              // validation ineffective (any cert chain was accepted).
+            }
+            if (! found_valid_cert) {
+              ldpp_dout(dpp, 0) << "Cert doesn't match that with the thumbprints registered with oidc provider: " << cert.c_str() << dendl;
+              throw -EINVAL;
+            }
+            try {
+              //verify method takes care of expired tokens also
+              if (algorithm == "RS256") {
+                auto verifier = jwt::verify()
+                            .allow_algorithm(jwt::algorithm::rs256{cert});
+
+                verifier.verify(decoded);
+              } else if (algorithm == "RS384") {
+                auto verifier = jwt::verify()
+                            .allow_algorithm(jwt::algorithm::rs384{cert});
+
+                verifier.verify(decoded);
+              } else if (algorithm == "RS512") {
+                auto verifier = jwt::verify()
+                            .allow_algorithm(jwt::algorithm::rs512{cert});
+
+                verifier.verify(decoded);
+              } else if (algorithm == "ES256") {
+                auto verifier = jwt::verify()
+                            .allow_algorithm(jwt::algorithm::es256{cert});
+
+                verifier.verify(decoded);
+              } else if (algorithm == "ES384") {
+                auto verifier = jwt::verify()
+                            .allow_algorithm(jwt::algorithm::es384{cert});
+
+                verifier.verify(decoded);
+              } else if (algorithm == "ES512") {
+                auto verifier = jwt::verify()
+                            .allow_algorithm(jwt::algorithm::es512{cert});
+
+                verifier.verify(decoded);
+              } else if (algorithm == "PS256") {
+                auto verifier = jwt::verify()
+                            .allow_algorithm(jwt::algorithm::ps256{cert});
+
+                verifier.verify(decoded);
+              } else if (algorithm == "PS384") {
+                auto verifier = jwt::verify()
+                            .allow_algorithm(jwt::algorithm::ps384{cert});
+
+                verifier.verify(decoded);
+              } else if (algorithm == "PS512") {
+                auto verifier = jwt::verify()
+                            .allow_algorithm(jwt::algorithm::ps512{cert});
+
+                verifier.verify(decoded);
+              } else {
+                // BUGFIX: previously an unrecognized algorithm (e.g.
+                // "none") fell through all branches and the token was
+                // accepted without any signature verification.
+                ldpp_dout(dpp, 0) << "Unsupported algorithm: " << algorithm << dendl;
+                throw -EINVAL;
+              }
+            } catch (std::runtime_error& e) {
+              ldpp_dout(dpp, 0) << "Signature validation failed: " << e.what() << dendl;
+              throw;
+            }
+            catch (...) {
+              ldpp_dout(dpp, 0) << "Signature validation failed" << dendl;
+              throw;
+            }
+          } else {
+            ldpp_dout(dpp, 0) << "x5c not present" << dendl;
+            throw -EINVAL;
+          }
+        } else {
+          ldpp_dout(dpp, 0) << "Malformed JSON object for keys" << dendl;
+          throw -EINVAL;
+        }
+      } else {
+        ldpp_dout(dpp, 0) << "keys not present in JSON" << dendl;
+        throw -EINVAL;
+      } //if-else get-data
+    } else {
+      ldpp_dout(dpp, 0) << "Malformed json returned while fetching cert" << dendl;
+      throw -EINVAL;
+    } //if-else parser cert_resp
+  } else {
+    ldpp_dout(dpp, 0) << "JWT signed by HMAC algos are currently not supported" << dendl;
+    throw -EINVAL;
+  }
+}
+
+// Authenticate an AssumeRoleWithWebIdentity request: validate the web
+// token (signature, client id, provider registration), load the role
+// named by the RoleArn arg, then grant a web-identity applier carrying
+// the token claims, role tags, and principal tags.
+WebTokenEngine::result_t
+WebTokenEngine::authenticate( const DoutPrefixProvider* dpp,
+                              const std::string& token,
+                              const req_state* const s,
+                              optional_yield y) const
+{
+  if (! is_applicable(token)) {
+    return result_t::deny();
+  }
+
+  try {
+    auto [t, princ_tags] = get_from_jwt(dpp, token, s, y);
+    if (t) {
+      string role_session = s->info.args.get("RoleSessionName");
+      if (role_session.empty()) {
+        ldout(s->cct, 0) << "Role Session Name is empty " << dendl;
+        return result_t::deny(-EACCES);
+      }
+      string role_arn = s->info.args.get("RoleArn");
+      string role_tenant = get_role_tenant(role_arn);
+      string role_name = get_role_name(role_arn);
+      std::unique_ptr<rgw::sal::RGWRole> role = driver->get_role(role_name, role_tenant);
+      int ret = role->get(dpp, y);
+      if (ret < 0) {
+        ldpp_dout(dpp, 0) << "Role not found: name:" << role_name << " tenant: " << role_tenant << dendl;
+        return result_t::deny(-EACCES);
+      }
+      boost::optional<multimap<string,string>> role_tags = role->get_tags();
+      auto apl = apl_factory->create_apl_web_identity(cct, s, role_session, role_tenant, *t, role_tags, princ_tags);
+      return result_t::grant(std::move(apl));
+    }
+    return result_t::deny(-EACCES);
+  }
+  catch (...) {
+    // get_from_jwt signals validation failure by throwing; treat any
+    // exception as an access denial.
+    return result_t::deny(-EACCES);
+  }
+}
+
+} // namespace rgw::auth::sts
+
+// Authorize an AssumeRole* request against the target role's trust
+// (assume-role) policy: the authenticated identity must be allowed the
+// matching sts action, and sts:TagSession as well when principal tags
+// are present.
+int RGWREST_STS::verify_permission(optional_yield y)
+{
+  STS::STSService _sts(s->cct, driver, s->user->get_id(), s->auth.identity.get());
+  sts = std::move(_sts);
+
+  string rArn = s->info.args.get("RoleArn");
+  const auto& [ret, role] = sts.getRoleInfo(s, rArn, y);
+  if (ret < 0) {
+    ldpp_dout(this, 0) << "failed to get role info using role arn: " << rArn << dendl;
+    return ret;
+  }
+  string policy = role->get_assume_role_policy();
+  buffer::list bl = buffer::list::static_from_string(policy);
+
+  //Parse the policy
+  //TODO - This step should be part of Role Creation
+  try {
+    const rgw::IAM::Policy p(s->cct, s->user->get_tenant(), bl, false);
+    if (!s->principal_tags.empty()) {
+      // Passing session tags requires an explicit sts:TagSession allow.
+      auto res = p.eval(s->env, *s->auth.identity, rgw::IAM::stsTagSession, boost::none);
+      if (res != rgw::IAM::Effect::Allow) {
+        ldout(s->cct, 0) << "evaluating policy for stsTagSession returned deny/pass" << dendl;
+        return -EPERM;
+      }
+    }
+    uint64_t op;
+    if (get_type() == RGW_STS_ASSUME_ROLE_WEB_IDENTITY) {
+      op = rgw::IAM::stsAssumeRoleWithWebIdentity;
+    } else {
+      op = rgw::IAM::stsAssumeRole;
+    }
+
+    auto res = p.eval(s->env, *s->auth.identity, op, boost::none);
+    if (res != rgw::IAM::Effect::Allow) {
+      ldout(s->cct, 0) << "evaluating policy for op: " << op << " returned deny/pass" << dendl;
+      return -EPERM;
+    }
+  } catch (rgw::IAM::PolicyParseException& e) {
+    ldpp_dout(this, 0) << "failed to parse policy: " << e.what() << dendl;
+    return -EPERM;
+  }
+
+  return 0;
+}
+
+void RGWREST_STS::send_response()
+{
+  // Record a failed op in the request's error state, then emit the
+  // standard error number and response headers.
+  if (op_ret != 0) {
+    set_req_state_err(s, op_ret);
+  }
+  dump_errno(s);
+  end_header(s);
+}
+
+// GetSessionToken requires the caller to hold sts:GetSessionToken on an
+// s3 ARN scoped to the user's tenant.
+int RGWSTSGetSessionToken::verify_permission(optional_yield y)
+{
+  rgw::Partition partition = rgw::Partition::aws;
+  rgw::Service service = rgw::Service::s3;
+  if (!verify_user_permission(this,
+                              s,
+                              rgw::ARN(partition, service, "", s->user->get_tenant(), ""),
+                              rgw::IAM::stsGetSessionToken)) {
+    // Fixed typo in the log message ("permssion" -> "permission").
+    ldpp_dout(this, 0) << "User does not have permission to perform GetSessionToken" << dendl;
+    return -EACCES;
+  }
+
+  return 0;
+}
+
+// Parse and validate GetSessionToken query-string parameters.
+// DurationSeconds, when present, must parse as an integer within
+// [minimum duration, rgw_sts_max_session_duration].
+int RGWSTSGetSessionToken::get_params()
+{
+  duration = s->info.args.get("DurationSeconds");
+  serialNumber = s->info.args.get("SerialNumber");
+  tokenCode = s->info.args.get("TokenCode");
+
+  if (! duration.empty()) {
+    string err;
+    uint64_t duration_in_secs = strict_strtoll(duration.c_str(), 10, &err);
+    if (!err.empty()) {
+      ldpp_dout(this, 0) << "Invalid value of input duration: " << duration << dendl;
+      return -EINVAL;
+    }
+
+    // NOTE(review): a negative input wraps to a huge uint64_t here and
+    // is then rejected by the max-duration check below.
+    if (duration_in_secs < STS::GetSessionTokenRequest::getMinDuration() ||
+            duration_in_secs > s->cct->_conf->rgw_sts_max_session_duration) {
+      ldpp_dout(this, 0) << "Invalid duration in secs: " << duration_in_secs << dendl;
+      return -EINVAL;
+    }
+  }
+
+  return 0;
+}
+
+// Issue temporary credentials and render the GetSessionToken XML reply.
+void RGWSTSGetSessionToken::execute(optional_yield y)
+{
+  op_ret = get_params();
+  if (op_ret < 0) {
+    return;
+  }
+
+  STS::STSService sts(s->cct, driver, s->user->get_id(), s->auth.identity.get());
+
+  STS::GetSessionTokenRequest req(duration, serialNumber, tokenCode);
+  const auto& [ret, creds] = sts.getSessionToken(this, req);
+  op_ret = ret;
+  if (op_ret != 0) {
+    return;
+  }
+  //Dump the output
+  s->formatter->open_object_section("GetSessionTokenResponse");
+  s->formatter->open_object_section("GetSessionTokenResult");
+  s->formatter->open_object_section("Credentials");
+  creds.dump(s->formatter);
+  s->formatter->close_section();
+  s->formatter->close_section();
+  s->formatter->close_section();
+}
+
+// Collect AssumeRoleWithWebIdentity parameters from the query string and
+// syntax-check the optional session Policy document.
+int RGWSTSAssumeRoleWithWebIdentity::get_params()
+{
+  duration = s->info.args.get("DurationSeconds");
+  providerId = s->info.args.get("ProviderId");
+  policy = s->info.args.get("Policy");
+  roleArn = s->info.args.get("RoleArn");
+  roleSessionName = s->info.args.get("RoleSessionName");
+  iss = s->info.args.get("provider_id");
+  sub = s->info.args.get("sub");
+  aud = s->info.args.get("aud");
+
+  if (roleArn.empty() || roleSessionName.empty() || sub.empty() || aud.empty()) {
+    ldpp_dout(this, 0) << "ERROR: one of role arn or role session name or token is empty" << dendl;
+    return -EINVAL;
+  }
+
+  if (! policy.empty()) {
+    // Validate the session policy early so a malformed document fails
+    // the request before any role lookup happens.
+    bufferlist bl = bufferlist::static_from_string(policy);
+    try {
+      const rgw::IAM::Policy p(
+        s->cct, s->user->get_tenant(), bl,
+        s->cct->_conf.get_val<bool>("rgw_policy_reject_invalid_principals"));
+    }
+    catch (rgw::IAM::PolicyParseException& e) {
+      // Fixed log formatting: the message previously ran the exception
+      // text and the policy document together with no separator.
+      ldpp_dout(this, 5) << "failed to parse policy: " << e.what() << ", policy: " << policy << dendl;
+      s->err.message = e.what();
+      return -ERR_MALFORMED_DOC;
+    }
+  }
+
+  return 0;
+}
+
+// Perform the role assumption for an already-authenticated web-identity
+// request and render the AssumeRoleWithWebIdentity XML reply.
+void RGWSTSAssumeRoleWithWebIdentity::execute(optional_yield y)
+{
+  if (op_ret = get_params(); op_ret < 0) {
+    return;
+  }
+
+  STS::AssumeRoleWithWebIdentityRequest req(s->cct, duration, providerId, policy, roleArn,
+                                            roleSessionName, iss, sub, aud, s->principal_tags);
+  STS::AssumeRoleWithWebIdentityResponse response = sts.assumeRoleWithWebIdentity(this, req);
+  op_ret = std::move(response.assumeRoleResp.retCode);
+
+  //Dump the output
+  if (op_ret == 0) {
+    s->formatter->open_object_section("AssumeRoleWithWebIdentityResponse");
+    s->formatter->open_object_section("AssumeRoleWithWebIdentityResult");
+    encode_json("SubjectFromWebIdentityToken", response.sub , s->formatter);
+    encode_json("Audience", response.aud , s->formatter);
+    s->formatter->open_object_section("AssumedRoleUser");
+    response.assumeRoleResp.user.dump(s->formatter);
+    s->formatter->close_section();
+    s->formatter->open_object_section("Credentials");
+    response.assumeRoleResp.creds.dump(s->formatter);
+    s->formatter->close_section();
+    encode_json("Provider", response.providerId , s->formatter);
+    encode_json("PackedPolicySize", response.assumeRoleResp.packedPolicySize , s->formatter);
+    s->formatter->close_section();
+    s->formatter->close_section();
+  }
+}
+
+// Collect AssumeRole parameters from the query string and syntax-check
+// the optional session Policy document.
+int RGWSTSAssumeRole::get_params()
+{
+  duration = s->info.args.get("DurationSeconds");
+  externalId = s->info.args.get("ExternalId");
+  policy = s->info.args.get("Policy");
+  roleArn = s->info.args.get("RoleArn");
+  roleSessionName = s->info.args.get("RoleSessionName");
+  serialNumber = s->info.args.get("SerialNumber");
+  tokenCode = s->info.args.get("TokenCode");
+
+  if (roleArn.empty() || roleSessionName.empty()) {
+    ldpp_dout(this, 0) << "ERROR: one of role arn or role session name is empty" << dendl;
+    return -EINVAL;
+  }
+
+  if (! policy.empty()) {
+    // Validate the session policy early so a malformed document fails
+    // the request before any role lookup happens.
+    bufferlist bl = bufferlist::static_from_string(policy);
+    try {
+      const rgw::IAM::Policy p(
+        s->cct, s->user->get_tenant(), bl,
+        s->cct->_conf.get_val<bool>("rgw_policy_reject_invalid_principals"));
+    }
+    catch (rgw::IAM::PolicyParseException& e) {
+      // Fixed log formatting: the message previously ran the exception
+      // text and the policy document together with no separator.
+      ldpp_dout(this, 0) << "failed to parse policy: " << e.what() << ", policy: " << policy << dendl;
+      s->err.message = e.what();
+      return -ERR_MALFORMED_DOC;
+    }
+  }
+
+  return 0;
+}
+
+// Perform the role assumption and render the AssumeRole XML reply.
+void RGWSTSAssumeRole::execute(optional_yield y)
+{
+  if (op_ret = get_params(); op_ret < 0) {
+    return;
+  }
+
+  STS::AssumeRoleRequest req(s->cct, duration, externalId, policy, roleArn,
+                             roleSessionName, serialNumber, tokenCode);
+  STS::AssumeRoleResponse response = sts.assumeRole(s, req, y);
+  op_ret = std::move(response.retCode);
+  //Dump the output
+  if (op_ret == 0) {
+    s->formatter->open_object_section("AssumeRoleResponse");
+    s->formatter->open_object_section("AssumeRoleResult");
+    s->formatter->open_object_section("Credentials");
+    response.creds.dump(s->formatter);
+    s->formatter->close_section();
+    s->formatter->open_object_section("AssumedRoleUser");
+    response.user.dump(s->formatter);
+    s->formatter->close_section();
+    encode_json("PackedPolicySize", response.packedPolicySize , s->formatter);
+    s->formatter->close_section();
+    s->formatter->close_section();
+  }
+}
+
+// Run the request through the STS auth strategy chain from the registry.
+int RGW_Auth_STS::authorize(const DoutPrefixProvider *dpp,
+                            rgw::sal::Driver* driver,
+                            const rgw::auth::StrategyRegistry& auth_registry,
+                            req_state *s, optional_yield y)
+{
+  return rgw::auth::Strategy::apply(dpp, auth_registry.get_sts(), s, y);
+}
+
+// Factory table mapping the STS "Action" query argument to the RGWOp
+// that implements it; consulted by action_exists() and op_post() below.
+using op_generator = RGWOp*(*)();
+static const std::unordered_map<std::string_view, op_generator> op_generators = {
+  {"AssumeRole", []() -> RGWOp* {return new RGWSTSAssumeRole;}},
+  {"GetSessionToken", []() -> RGWOp* {return new RGWSTSGetSessionToken;}},
+  {"AssumeRoleWithWebIdentity", []() -> RGWOp* {return new RGWSTSAssumeRoleWithWebIdentity;}}
+};
+
+bool RGWHandler_REST_STS::action_exists(const req_state* s)
+{
+  // True iff the request names an Action this handler can instantiate.
+  if (! s->info.args.exists("Action")) {
+    return false;
+  }
+  return op_generators.contains(s->info.args.get("Action"));
+}
+
+// Dispatch POST to the RGWOp matching the "Action" argument, or nullptr
+// (with a log line) when the action is missing or unknown.
+RGWOp *RGWHandler_REST_STS::op_post()
+{
+  if (! s->info.args.exists("Action")) {
+    ldpp_dout(s, 10) << "missing action argument in STS handler" << dendl;
+    return nullptr;
+  }
+  const std::string action_name = s->info.args.get("Action");
+  if (const auto it = op_generators.find(action_name); it != op_generators.end()) {
+    return it->second();
+  }
+  ldpp_dout(s, 10) << "unknown action '" << action_name << "' for STS handler" << dendl;
+  return nullptr;
+}
+
+// Tag the request as the STS dialect before the generic REST init runs.
+int RGWHandler_REST_STS::init(rgw::sal::Driver* driver,
+                              req_state *s,
+                              rgw::io::BasicClient *cio)
+{
+  s->dialect = "sts";
+  s->prot_flags = RGW_REST_STS;
+
+  return RGWHandler_REST::init(driver, s, cio);
+}
+
+// AssumeRoleWithWebIdentity authenticates with the web token via the
+// STS auth stack; every other action uses the regular S3 auth path.
+int RGWHandler_REST_STS::authorize(const DoutPrefixProvider* dpp, optional_yield y)
+{
+  if (s->info.args.exists("Action") && s->info.args.get("Action") == "AssumeRoleWithWebIdentity") {
+    return RGW_Auth_STS::authorize(dpp, driver, auth_registry, s, y);
+  }
+  return RGW_Auth_S3::authorize(dpp, driver, auth_registry, s, y);
+}
+
+// All STS requests share a single handler type; the driver and
+// frontend_prefix arguments are unused here.
+RGWHandler_REST*
+RGWRESTMgr_STS::get_handler(rgw::sal::Driver* driver,
+                            req_state* const s,
+                            const rgw::auth::StrategyRegistry& auth_registry,
+                            const std::string& frontend_prefix)
+{
+  return new RGWHandler_REST_STS(auth_registry);
+}
diff --git a/src/rgw/rgw_rest_sts.h b/src/rgw/rgw_rest_sts.h
new file mode 100644
index 000000000..ec15de245
--- /dev/null
+++ b/src/rgw/rgw_rest_sts.h
@@ -0,0 +1,235 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_auth.h"
+#include "rgw_auth_filters.h"
+#include "rgw_rest.h"
+#include "rgw_sts.h"
+#include "rgw_web_idp.h"
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#include "jwt-cpp/jwt.h"
+#pragma clang diagnostic pop
+#pragma GCC diagnostic pop
+#include "rgw_oidc_provider.h"
+
+
+namespace rgw::auth::sts {
+
+/* Auth engine that authenticates AssumeRoleWithWebIdentity requests by
+ * decoding and validating an OIDC web token (JWT) and, on success,
+ * producing an identity applier via the injected factory. */
+class WebTokenEngine : public rgw::auth::Engine {
+ // AWS namespace under which session/principal tags travel in the token.
+ static constexpr std::string_view princTagsNamespace = "https://aws.amazon.com/tags";
+ CephContext* const cct;
+ rgw::sal::Driver* driver;
+
+ using result_t = rgw::auth::Engine::result_t;
+ using Pair = std::pair<std::string, std::string>;
+ // Claims extracted from the JWT; multimap because a claim may repeat.
+ using token_t = std::unordered_multimap<string, string>;
+ using principal_tags_t = std::set<Pair>;
+
+ // Non-owning collaborators supplied by the enclosing strategy.
+ const rgw::auth::TokenExtractor* const extractor;
+ const rgw::auth::WebIdentityApplier::Factory* const apl_factory;
+
+ bool is_applicable(const std::string& token) const noexcept;
+
+ bool is_client_id_valid(std::vector<std::string>& client_ids, const std::string& client_id) const;
+
+ bool is_cert_valid(const std::vector<std::string>& thumbprints, const std::string& cert) const;
+
+ // Look up the OIDC provider matching the token issuer for the role's tenant.
+ std::unique_ptr<rgw::sal::RGWOIDCProvider> get_provider(const DoutPrefixProvider *dpp, const std::string& role_arn, const std::string& iss) const;
+
+ std::string get_role_tenant(const std::string& role_arn) const;
+
+ std::string get_role_name(const string& role_arn) const;
+
+ std::string get_cert_url(const std::string& iss, const DoutPrefixProvider *dpp,optional_yield y) const;
+
+ // Decode the JWT and return its claims plus any principal tags.
+ std::tuple<boost::optional<WebTokenEngine::token_t>, boost::optional<WebTokenEngine::principal_tags_t>>
+ get_from_jwt(const DoutPrefixProvider* dpp, const std::string& token, const req_state* const s, optional_yield y) const;
+
+ void validate_signature (const DoutPrefixProvider* dpp, const jwt::decoded_jwt& decoded, const std::string& algorithm, const std::string& iss, const std::vector<std::string>& thumbprints, optional_yield y) const;
+
+ result_t authenticate(const DoutPrefixProvider* dpp,
+ const std::string& token,
+ const req_state* s, optional_yield y) const;
+
+ // Recursively flatten a (possibly nested) JWT claim into t.
+ template <typename T>
+ void recurse_and_insert(const string& key, const jwt::claim& c, T& t) const;
+ WebTokenEngine::token_t get_token_claims(const jwt::decoded_jwt& decoded) const;
+
+public:
+ WebTokenEngine(CephContext* const cct,
+ rgw::sal::Driver* driver,
+ const rgw::auth::TokenExtractor* const extractor,
+ const rgw::auth::WebIdentityApplier::Factory* const apl_factory)
+ : cct(cct),
+ driver(driver),
+ extractor(extractor),
+ apl_factory(apl_factory) {
+ }
+
+ const char* get_name() const noexcept override {
+ return "rgw::auth::sts::WebTokenEngine";
+ }
+
+ // Engine entry point: pull the token out of the request and verify it.
+ result_t authenticate(const DoutPrefixProvider* dpp, const req_state* const s, optional_yield y) const override {
+ return authenticate(dpp, extractor->get_token(s), s, y);
+ }
+}; /* class WebTokenEngine */
+
+/* Default STS auth strategy: wires a single WebTokenEngine together with
+ * the token extraction (from the WebIdentityToken request argument) and
+ * the applier factory, both implemented by this class itself. */
+class DefaultStrategy : public rgw::auth::Strategy,
+ public rgw::auth::TokenExtractor,
+ public rgw::auth::WebIdentityApplier::Factory {
+ rgw::sal::Driver* driver;
+ const ImplicitTenants& implicit_tenant_context;
+
+ /* The engine. */
+ const WebTokenEngine web_token_engine;
+
+ using aplptr_t = rgw::auth::IdentityApplier::aplptr_t;
+
+ /* The method implements TokenExtractor for Web Token in req_state. */
+ std::string get_token(const req_state* const s) const override {
+ return s->info.args.get("WebIdentityToken");
+ }
+
+ // Factory hook: build the identity applier for a validated web identity,
+ // wrapped by add_sysreq() to honor system-request overrides.
+ aplptr_t create_apl_web_identity( CephContext* cct,
+ const req_state* s,
+ const std::string& role_session,
+ const std::string& role_tenant,
+ const std::unordered_multimap<std::string, std::string>& token,
+ boost::optional<std::multimap<std::string, std::string>> role_tags,
+ boost::optional<std::set<std::pair<std::string, std::string>>> principal_tags) const override {
+ auto apl = rgw::auth::add_sysreq(cct, driver, s,
+ rgw::auth::WebIdentityApplier(cct, driver, role_session, role_tenant, token, role_tags, principal_tags));
+ return aplptr_t(new decltype(apl)(std::move(apl)));
+ }
+
+public:
+ DefaultStrategy(CephContext* const cct,
+ const ImplicitTenants& implicit_tenant_context,
+ rgw::sal::Driver* driver)
+ : driver(driver),
+ implicit_tenant_context(implicit_tenant_context),
+ web_token_engine(cct, driver,
+ static_cast<rgw::auth::TokenExtractor*>(this),
+ static_cast<rgw::auth::WebIdentityApplier::Factory*>(this)) {
+ /* When the constructor's body is being executed, all member engines
+ * should be initialized. Thus, we can safely add them. */
+ using Control = rgw::auth::Strategy::Control;
+ add_engine(Control::SUFFICIENT, web_token_engine);
+ }
+
+ const char* get_name() const noexcept override {
+ return "rgw::auth::sts::DefaultStrategy";
+ }
+};
+
+} // namespace rgw::auth::sts
+
+/* Common base for STS REST operations; holds the STSService instance and
+ * shares the permission check and response plumbing across actions. */
+class RGWREST_STS : public RGWRESTOp {
+protected:
+ STS::STSService sts;
+public:
+ RGWREST_STS() = default;
+ int verify_permission(optional_yield y) override;
+ void send_response() override;
+};
+
+/* Implements the STS AssumeRoleWithWebIdentity action. The sub/aud/iss
+ * members carry claims taken from the validated web identity token. */
+class RGWSTSAssumeRoleWithWebIdentity : public RGWREST_STS {
+protected:
+ std::string duration;
+ std::string providerId;
+ std::string policy;
+ std::string roleArn;
+ std::string roleSessionName;
+ std::string sub;
+ std::string aud;
+ std::string iss;
+public:
+ RGWSTSAssumeRoleWithWebIdentity() = default;
+ void execute(optional_yield y) override;
+ int get_params();
+ const char* name() const override { return "assume_role_web_identity"; }
+ RGWOpType get_type() override { return RGW_STS_ASSUME_ROLE_WEB_IDENTITY; }
+};
+
+/* Implements the STS AssumeRole action; members mirror the request
+ * parameters of the AWS API call. */
+class RGWSTSAssumeRole : public RGWREST_STS {
+protected:
+ std::string duration;
+ std::string externalId;
+ std::string policy;
+ std::string roleArn;
+ std::string roleSessionName;
+ std::string serialNumber;
+ std::string tokenCode;
+public:
+ RGWSTSAssumeRole() = default;
+ void execute(optional_yield y) override;
+ int get_params();
+ const char* name() const override { return "assume_role"; }
+ RGWOpType get_type() override { return RGW_STS_ASSUME_ROLE; }
+};
+
+/* Implements the STS GetSessionToken action. Overrides
+ * verify_permission() separately from the RGWREST_STS base. */
+class RGWSTSGetSessionToken : public RGWREST_STS {
+protected:
+ std::string duration;
+ std::string serialNumber;
+ std::string tokenCode;
+public:
+ RGWSTSGetSessionToken() = default;
+ void execute(optional_yield y) override;
+ int verify_permission(optional_yield y) override;
+ int get_params();
+ const char* name() const override { return "get_session_token"; }
+ RGWOpType get_type() override { return RGW_STS_GET_SESSION_TOKEN; }
+};
+
+/* Static entry point used by RGWHandler_REST_STS::authorize() to run the
+ * STS (web identity) authentication path. */
+class RGW_Auth_STS {
+public:
+ static int authorize(const DoutPrefixProvider *dpp,
+ rgw::sal::Driver* driver,
+ const rgw::auth::StrategyRegistry& auth_registry,
+ req_state *s, optional_yield y);
+};
+
+/* REST handler for the STS endpoint. Only POST is dispatched (op_post);
+ * the supported actions live in the op_generators table in the .cc. */
+class RGWHandler_REST_STS : public RGWHandler_REST {
+ const rgw::auth::StrategyRegistry& auth_registry;
+ RGWOp *op_post() override;
+public:
+
+ // True iff the request's "Action" argument names a known STS action.
+ static bool action_exists(const req_state* s);
+
+ RGWHandler_REST_STS(const rgw::auth::StrategyRegistry& auth_registry)
+ : RGWHandler_REST(),
+ auth_registry(auth_registry) {}
+ ~RGWHandler_REST_STS() override = default;
+
+ int init(rgw::sal::Driver* driver,
+ req_state *s,
+ rgw::io::BasicClient *cio) override;
+ int authorize(const DoutPrefixProvider* dpp, optional_yield y) override;
+ int postauth_init(optional_yield y) override { return 0; }
+};
+
+/* REST manager for STS: handles every sub-resource itself (no nested
+ * managers) and hands out RGWHandler_REST_STS instances. */
+class RGWRESTMgr_STS : public RGWRESTMgr {
+public:
+ RGWRESTMgr_STS() = default;
+ ~RGWRESTMgr_STS() override = default;
+
+ // No per-URI sub-managers; this manager serves the whole subtree.
+ RGWRESTMgr *get_resource_mgr(req_state* const s,
+ const std::string& uri,
+ std::string* const out_uri) override {
+ return this;
+ }
+
+ RGWHandler_REST* get_handler(rgw::sal::Driver* driver,
+ req_state*,
+ const rgw::auth::StrategyRegistry&,
+ const std::string&) override;
+};
+
diff --git a/src/rgw/rgw_rest_swift.cc b/src/rgw/rgw_rest_swift.cc
new file mode 100644
index 000000000..ee943ea44
--- /dev/null
+++ b/src/rgw/rgw_rest_swift.cc
@@ -0,0 +1,3114 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <boost/algorithm/string/predicate.hpp>
+#include <boost/format.hpp>
+#include <boost/optional.hpp>
+#include <boost/utility/in_place_factory.hpp>
+
+#include "include/ceph_assert.h"
+#include "ceph_ver.h"
+
+#include "common/Formatter.h"
+#include "common/utf8.h"
+#include "common/ceph_json.h"
+
+#include "rgw_rest_swift.h"
+#include "rgw_acl_swift.h"
+#include "rgw_cors_swift.h"
+#include "rgw_formats.h"
+#include "rgw_client_io.h"
+#include "rgw_compression.h"
+
+#include "rgw_auth.h"
+#include "rgw_auth_registry.h"
+#include "rgw_swift_auth.h"
+
+#include "rgw_request.h"
+#include "rgw_process.h"
+
+#include "rgw_zone.h"
+#include "rgw_sal.h"
+
+#include "services/svc_zone.h"
+
+#include <array>
+#include <string_view>
+#include <sstream>
+#include <memory>
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+int RGWListBuckets_ObjStore_SWIFT::get_params(optional_yield y)
+{
+  // Parse the query parameters of a Swift account (container listing)
+  // request: prefix/marker/end_marker/limit, plus the non-standard
+  // "reverse" and "stats" extensions.
+  prefix = s->info.args.get("prefix");
+  marker = s->info.args.get("marker");
+  end_marker = s->info.args.get("end_marker");
+  wants_reversed = s->info.args.exists("reverse");
+
+  // A reversed listing walks from end_marker back towards marker, so the
+  // two bounds swap roles.
+  if (wants_reversed) {
+    std::swap(marker, end_marker);
+  }
+
+  const std::string limit_str = s->info.args.get("limit");
+  if (! limit_str.empty()) {
+    std::string err;
+    const long parsed = strict_strtol(limit_str.c_str(), 10, &err);
+    if (! err.empty()) {
+      return -EINVAL;
+    }
+    // Unlike S3 (which silently caps), Swift aborts with 412 when the
+    // limit is negative or exceeds the configured maximum.
+    if (parsed < 0 || parsed > (long)limit_max) {
+      return -ERR_PRECONDITION_FAILED;
+    }
+    limit = (uint64_t)parsed;
+  }
+
+  if (s->cct->_conf->rgw_swift_need_stats) {
+    bool stats = false, exists = false;
+    const int r = s->info.args.get_bool("stats", &stats, &exists);
+    if (r < 0) {
+      return r;
+    }
+    // Only override the default when the client supplied "stats".
+    if (exists) {
+      need_stats = stats;
+    }
+  } else {
+    need_stats = false;
+  }
+
+  return 0;
+}
+
+// Emit all Swift account-level response headers: global and
+// per-storage-policy usage stats, TempURL keys (full-control callers
+// only), quota, user metadata (X-Account-Meta-*) and the account ACL.
+static void dump_account_metadata(req_state * const s,
+ const RGWUsageStats& global_stats,
+ const std::map<std::string, RGWUsageStats> &policies_stats,
+ /* const */map<string, bufferlist>& attrs,
+ const RGWQuotaInfo& quota,
+ const RGWAccessControlPolicy_SWIFTAcct &policy)
+{
+ /* Adding X-Timestamp to keep align with Swift API */
+ dump_header(s, "X-Timestamp", ceph_clock_now());
+
+ dump_header(s, "X-Account-Container-Count", global_stats.buckets_count);
+ dump_header(s, "X-Account-Object-Count", global_stats.objects_count);
+ dump_header(s, "X-Account-Bytes-Used", global_stats.bytes_used);
+ dump_header(s, "X-Account-Bytes-Used-Actual", global_stats.bytes_used_rounded);
+
+ for (const auto& kv : policies_stats) {
+ const auto& policy_name = camelcase_dash_http_attr(kv.first);
+ const auto& policy_stats = kv.second;
+
+ dump_header_infixed(s, "X-Account-Storage-Policy-", policy_name,
+ "-Container-Count", policy_stats.buckets_count);
+ dump_header_infixed(s, "X-Account-Storage-Policy-", policy_name,
+ "-Object-Count", policy_stats.objects_count);
+ dump_header_infixed(s, "X-Account-Storage-Policy-", policy_name,
+ "-Bytes-Used", policy_stats.bytes_used);
+ dump_header_infixed(s, "X-Account-Storage-Policy-", policy_name,
+ "-Bytes-Used-Actual", policy_stats.bytes_used_rounded);
+ }
+
+ /* Dump TempURL-related stuff */
+ if (s->perm_mask == RGW_PERM_FULL_CONTROL) {
+ auto iter = s->user->get_info().temp_url_keys.find(0);
+ if (iter != std::end(s->user->get_info().temp_url_keys) && ! iter->second.empty()) {
+ dump_header(s, "X-Account-Meta-Temp-Url-Key", iter->second);
+ }
+
+ iter = s->user->get_info().temp_url_keys.find(1);
+ if (iter != std::end(s->user->get_info().temp_url_keys) && ! iter->second.empty()) {
+ dump_header(s, "X-Account-Meta-Temp-Url-Key-2", iter->second);
+ }
+ }
+
+ /* Dump quota headers. */
+ if (quota.enabled) {
+ if (quota.max_size >= 0) {
+ dump_header(s, "X-Account-Meta-Quota-Bytes", quota.max_size);
+ }
+
+ /* Limit on the number of objects in a given account is a RadosGW's
+ * extension. Swift's account quota WSGI filter doesn't support it. */
+ if (quota.max_objects >= 0) {
+ dump_header(s, "X-Account-Meta-Quota-Count", quota.max_objects);
+ }
+ }
+
+ /* Dump user-defined metadata items and generic attrs. */
+ const size_t PREFIX_LEN = sizeof(RGW_ATTR_META_PREFIX) - 1;
+ map<string, bufferlist>::iterator iter;
+ for (iter = attrs.lower_bound(RGW_ATTR_PREFIX); iter != attrs.end(); ++iter) {
+ const char *name = iter->first.c_str();
+ map<string, string>::const_iterator geniter = rgw_to_http_attrs.find(name);
+
+ if (geniter != rgw_to_http_attrs.end()) {
+ dump_header(s, geniter->second, iter->second);
+ } else if (strncmp(name, RGW_ATTR_META_PREFIX, PREFIX_LEN) == 0) {
+ // User metadata: strip the internal prefix and re-expose it in
+ // Swift's CamelCase-Dash form under X-Account-Meta-.
+ dump_header_prefixed(s, "X-Account-Meta-",
+ camelcase_dash_http_attr(name + PREFIX_LEN),
+ iter->second);
+ }
+ }
+
+ /* Dump account ACLs */
+ auto account_acls = policy.to_str();
+ if (account_acls) {
+ dump_header(s, "X-Account-Access-Control", std::move(*account_acls));
+ }
+}
+
+// Start the account-listing response: set the HTTP status, optionally
+// emit headers immediately (when Content-Length is not enforced and thus
+// the body can be streamed), and open the "account" formatter section.
+void RGWListBuckets_ObjStore_SWIFT::send_response_begin(bool has_buckets)
+{
+ if (op_ret) {
+ set_req_state_err(s, op_ret);
+ } else if (!has_buckets && s->format == RGWFormat::PLAIN) {
+ // Swift returns 204 for an empty plain-text listing.
+ op_ret = STATUS_NO_CONTENT;
+ set_req_state_err(s, op_ret);
+ }
+
+ if (! s->cct->_conf->rgw_swift_enforce_content_length) {
+ /* Adding account stats in the header to keep align with Swift API */
+ dump_account_metadata(s,
+ global_stats,
+ policies_stats,
+ s->user->get_attrs(),
+ s->user->get_info().quota.user_quota,
+ static_cast<RGWAccessControlPolicy_SWIFTAcct&>(*s->user_acl));
+ dump_errno(s);
+ dump_header(s, "Accept-Ranges", "bytes");
+ end_header(s, NULL, NULL, NO_CONTENT_LENGTH, true);
+ }
+
+ if (! op_ret) {
+ dump_start(s);
+ s->formatter->open_array_section_with_attrs("account",
+ FormatterAttrs("name", s->user->get_display_name().c_str(), NULL));
+
+ // Signals to the later stages that body data may be emitted.
+ sent_data = true;
+ }
+}
+
+void RGWListBuckets_ObjStore_SWIFT::handle_listing_chunk(rgw::sal::BucketList&& buckets)
+{
+  if (! wants_reversed) {
+    // Forward listing: stream the chunk out straight away.
+    return send_response_data(buckets);
+  }
+  /* Reversed listing: stash the chunk at the front of the reversal
+   * buffer; its content is emitted later by send_response_end(). */
+  reverse_buffer.emplace(std::begin(reverse_buffer), std::move(buckets));
+}
+
+void RGWListBuckets_ObjStore_SWIFT::send_response_data(rgw::sal::BucketList& buckets)
+{
+  if (! sent_data) {
+    return;
+  }
+
+  /* Apply Swift's "prefix" filter only at dump time. There is no point in
+   * filtering earlier: every entry must be visited anyway because the
+   * account-wide headers (e.g. X-Account-Container-Count) are not
+   * affected by the prefix. */
+  const auto& m = buckets.get_buckets();
+  auto iter = m.lower_bound(prefix);
+  while (iter != m.end() && boost::algorithm::starts_with(iter->first, prefix)) {
+    dump_bucket_entry(*iter->second);
+    ++iter;
+  }
+}
+
+void RGWListBuckets_ObjStore_SWIFT::dump_bucket_entry(const rgw::sal::Bucket& bucket)
+{
+  // Emit one "container" record; the count/bytes stats are optional.
+  auto* const formatter = s->formatter;
+  formatter->open_object_section("container");
+  formatter->dump_string("name", bucket.get_name());
+  if (need_stats) {
+    formatter->dump_int("count", bucket.get_count());
+    formatter->dump_int("bytes", bucket.get_size());
+  }
+  formatter->close_section();
+
+  // When Content-Length is not enforced the entry can be streamed out
+  // immediately instead of being buffered for a final length count.
+  if (! s->cct->_conf->rgw_swift_enforce_content_length) {
+    rgw_flush_formatter(s, s->formatter);
+  }
+}
+
+// Reverse-order counterpart of send_response_data(): walks the chunk
+// from the back, first skipping entries past the prefix range, then
+// dumping every entry that still matches the prefix.
+void RGWListBuckets_ObjStore_SWIFT::send_response_data_reversed(rgw::sal::BucketList& buckets)
+{
+ if (! sent_data) {
+ return;
+ }
+
+ /* Take care of the prefix parameter of Swift API. There is no business
+ * in applying the filter earlier as we really need to go through all
+ * entries regardless of it (the headers like X-Account-Container-Count
+ * aren't affected by specifying prefix). */
+ auto& m = buckets.get_buckets();
+
+ // Phase 1: skip (from the end) everything that sorts after the prefix
+ // range and therefore doesn't match it.
+ auto iter = m.rbegin();
+ for (/* initialized above */;
+ iter != m.rend() && !boost::algorithm::starts_with(iter->first, prefix);
+ ++iter) {
+ /* NOP */;
+ }
+
+ // Phase 2: dump the contiguous run of prefix matches in reverse order.
+ for (/* iter carried */;
+ iter != m.rend() && boost::algorithm::starts_with(iter->first, prefix);
+ ++iter) {
+ dump_bucket_entry(*iter->second);
+ }
+}
+
+// Finish the account listing: flush any buffered reversed chunks, close
+// the formatter section, and — when Content-Length enforcement is on —
+// emit the headers now that the final body length is known.
+void RGWListBuckets_ObjStore_SWIFT::send_response_end()
+{
+ if (wants_reversed) {
+ // Chunks were buffered by handle_listing_chunk(); emit them now.
+ for (auto& buckets : reverse_buffer) {
+ send_response_data_reversed(buckets);
+ }
+ }
+
+ if (sent_data) {
+ s->formatter->close_section();
+ }
+
+ if (s->cct->_conf->rgw_swift_enforce_content_length) {
+ /* Adding account stats in the header to keep align with Swift API */
+ dump_account_metadata(s,
+ global_stats,
+ policies_stats,
+ s->user->get_attrs(),
+ s->user->get_info().quota.user_quota,
+ static_cast<RGWAccessControlPolicy_SWIFTAcct&>(*s->user_acl));
+ dump_errno(s);
+ end_header(s, nullptr, nullptr, s->formatter->get_len(), true);
+ }
+
+ if (sent_data || s->cct->_conf->rgw_swift_enforce_content_length) {
+ rgw_flush_formatter_and_reset(s, s->formatter);
+ }
+}
+
+// Parse the query parameters of a Swift container (object listing)
+// request. The "path" parameter emulates a pseudo-directory listing:
+// it forces delimiter "/" and turns the path into the prefix.
+int RGWListBucket_ObjStore_SWIFT::get_params(optional_yield y)
+{
+ prefix = s->info.args.get("prefix");
+ marker = s->info.args.get("marker");
+ end_marker = s->info.args.get("end_marker");
+ max_keys = s->info.args.get("limit");
+
+ // non-standard
+ s->info.args.get_bool("allow_unordered", &allow_unordered, false);
+
+ delimiter = s->info.args.get("delimiter");
+
+ op_ret = parse_max_keys();
+ if (op_ret < 0) {
+ return op_ret;
+ }
+ // S3 behavior is to silently cap the max-keys.
+ // Swift behavior is to abort.
+ if (max > default_max)
+ return -ERR_PRECONDITION_FAILED;
+
+ string path_args;
+ if (s->info.args.exists("path")) { // should handle empty path
+ path_args = s->info.args.get("path");
+ // "path" is mutually exclusive with explicit delimiter/prefix.
+ if (!delimiter.empty() || !prefix.empty()) {
+ return -EINVAL;
+ }
+ prefix = path_args;
+ delimiter="/";
+
+ // Remember the normalized (trailing-slash) path: the entry equal to
+ // it is suppressed from the listing later in send_response().
+ path = prefix;
+ if (path.size() && path[path.size() - 1] != '/')
+ path.append("/");
+
+ int len = prefix.size();
+ int delim_size = delimiter.size();
+
+ // Ensure the prefix itself ends with the delimiter too.
+ if (len >= delim_size) {
+ if (prefix.substr(len - delim_size).compare(delimiter) != 0)
+ prefix.append(delimiter);
+ }
+ }
+
+ return 0;
+}
+
+static void dump_container_metadata(req_state *,
+ const rgw::sal::Bucket*,
+ const RGWQuotaInfo&,
+ const RGWBucketWebsiteConf&);
+
+// Emit the container listing. Objects and common prefixes ("subdirs")
+// arrive in two separate sorted sequences; the while loop below merges
+// them into a single lexicographically ordered output stream. The logic
+// is order-sensitive (including the goto-based skip of the "path"
+// entry), so it is documented rather than restructured here.
+void RGWListBucket_ObjStore_SWIFT::send_response()
+{
+ vector<rgw_bucket_dir_entry>::iterator iter = objs.begin();
+ map<string, bool>::iterator pref_iter = common_prefixes.begin();
+
+ dump_start(s);
+ dump_container_metadata(s, s->bucket.get(), quota.bucket_quota,
+ s->bucket->get_info().website_conf);
+
+ s->formatter->open_array_section_with_attrs("container",
+ FormatterAttrs("name",
+ s->bucket->get_name().c_str(),
+ NULL));
+
+ while (iter != objs.end() || pref_iter != common_prefixes.end()) {
+ // Decide which sequence supplies the next entry: whichever key
+ // sorts first, objects winning ties (the prefix is then consumed).
+ bool do_pref = false;
+ bool do_objs = false;
+ rgw_obj_key key;
+ if (iter != objs.end()) {
+ key = iter->key;
+ }
+ if (pref_iter == common_prefixes.end())
+ do_objs = true;
+ else if (iter == objs.end())
+ do_pref = true;
+ else if (!key.empty() && key.name.compare(pref_iter->first) == 0) {
+ do_objs = true;
+ ++pref_iter;
+ } else if (!key.empty() && key.name.compare(pref_iter->first) <= 0)
+ do_objs = true;
+ else
+ do_pref = true;
+
+ if (do_objs && (allow_unordered || marker.empty() || marker < key)) {
+ // Suppress the pseudo-directory placeholder itself (see the
+ // "path" handling in get_params()).
+ if (key.name.compare(path) == 0)
+ goto next;
+
+ s->formatter->open_object_section("object");
+ s->formatter->dump_string("name", key.name);
+ s->formatter->dump_string("hash", iter->meta.etag);
+ s->formatter->dump_int("bytes", iter->meta.accounted_size);
+ if (!iter->meta.user_data.empty())
+ s->formatter->dump_string("user_custom_data", iter->meta.user_data);
+ string single_content_type = iter->meta.content_type;
+ if (iter->meta.content_type.size()) {
+ // content type might hold multiple values, just dump the last one
+ ssize_t pos = iter->meta.content_type.rfind(',');
+ if (pos > 0) {
+ ++pos;
+ while (single_content_type[pos] == ' ')
+ ++pos;
+ single_content_type = single_content_type.substr(pos);
+ }
+ s->formatter->dump_string("content_type", single_content_type);
+ }
+ dump_time(s, "last_modified", iter->meta.mtime);
+ s->formatter->close_section();
+ }
+
+ if (do_pref && (marker.empty() || pref_iter->first.compare(marker.name) > 0)) {
+ const string& name = pref_iter->first;
+ // A prefix equal to the bare delimiter is never listed.
+ if (name.compare(delimiter) == 0)
+ goto next;
+
+ s->formatter->open_object_section_with_attrs("subdir", FormatterAttrs("name", name.c_str(), NULL));
+
+ /* swift is a bit inconsistent here */
+ switch (s->format) {
+ case RGWFormat::XML:
+ s->formatter->dump_string("name", name);
+ break;
+ default:
+ s->formatter->dump_string("subdir", name);
+ }
+ s->formatter->close_section();
+ }
+next:
+ // Advance whichever sequence produced (or skipped) this entry.
+ if (do_objs)
+ ++iter;
+ else
+ ++pref_iter;
+ }
+
+ s->formatter->close_section();
+
+ int64_t content_len = 0;
+ if (! op_ret) {
+ content_len = s->formatter->get_len();
+ if (content_len == 0) {
+ // An empty (but successful) listing maps to 204 No Content.
+ op_ret = STATUS_NO_CONTENT;
+ }
+ } else if (op_ret > 0) {
+ op_ret = 0;
+ }
+
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s, this, NULL, content_len);
+ if (op_ret < 0) {
+ return;
+ }
+
+ rgw_flush_formatter_and_reset(s, s->formatter);
+} // RGWListBucket_ObjStore_SWIFT::send_response
+
+// Emit all Swift container-level response headers: stats, ACLs, storage
+// policy, user metadata, versioning, quota, static-website settings and
+// the container's modification time.
+// NOTE(review): some lines read via s->bucket while others use the
+// `bucket` parameter — presumably both refer to the same bucket here;
+// worth confirming against the callers.
+static void dump_container_metadata(req_state *s,
+ const rgw::sal::Bucket* bucket,
+ const RGWQuotaInfo& quota,
+ const RGWBucketWebsiteConf& ws_conf)
+{
+ /* Adding X-Timestamp to keep align with Swift API */
+ dump_header(s, "X-Timestamp", utime_t(s->bucket->get_info().creation_time));
+
+ dump_header(s, "X-Container-Object-Count", bucket->get_count());
+ dump_header(s, "X-Container-Bytes-Used", bucket->get_size());
+ dump_header(s, "X-Container-Bytes-Used-Actual", bucket->get_size_rounded());
+
+ // ACLs, storage policy and user metadata are only dumped for
+ // container-level requests (i.e. when no object is addressed).
+ if (rgw::sal::Object::empty(s->object.get())) {
+ auto swift_policy = \
+ static_cast<RGWAccessControlPolicy_SWIFT*>(s->bucket_acl.get());
+ std::string read_acl, write_acl;
+ swift_policy->to_str(read_acl, write_acl);
+
+ if (read_acl.size()) {
+ dump_header(s, "X-Container-Read", read_acl);
+ }
+ if (write_acl.size()) {
+ dump_header(s, "X-Container-Write", write_acl);
+ }
+ if (!s->bucket->get_placement_rule().name.empty()) {
+ dump_header(s, "X-Storage-Policy", s->bucket->get_placement_rule().name);
+ }
+ dump_header(s, "X-Storage-Class", s->bucket->get_placement_rule().get_storage_class());
+
+ /* Dump user-defined metadata items and generic attrs. */
+ const size_t PREFIX_LEN = sizeof(RGW_ATTR_META_PREFIX) - 1;
+ map<string, bufferlist>::iterator iter;
+ for (iter = s->bucket_attrs.lower_bound(RGW_ATTR_PREFIX);
+ iter != s->bucket_attrs.end();
+ ++iter) {
+ const char *name = iter->first.c_str();
+ map<string, string>::const_iterator geniter = rgw_to_http_attrs.find(name);
+
+ if (geniter != rgw_to_http_attrs.end()) {
+ dump_header(s, geniter->second, iter->second);
+ } else if (strncmp(name, RGW_ATTR_META_PREFIX, PREFIX_LEN) == 0) {
+ dump_header_prefixed(s, "X-Container-Meta-",
+ camelcase_dash_http_attr(name + PREFIX_LEN),
+ iter->second);
+ }
+ }
+ }
+
+ /* Dump container versioning info. */
+ if (! s->bucket->get_info().swift_ver_location.empty()) {
+ dump_header(s, "X-Versions-Location",
+ url_encode(s->bucket->get_info().swift_ver_location));
+ }
+
+ /* Dump quota headers. */
+ if (quota.enabled) {
+ if (quota.max_size >= 0) {
+ dump_header(s, "X-Container-Meta-Quota-Bytes", quota.max_size);
+ }
+
+ if (quota.max_objects >= 0) {
+ dump_header(s, "X-Container-Meta-Quota-Count", quota.max_objects);
+ }
+ }
+
+ /* Dump Static Website headers. */
+ if (! ws_conf.index_doc_suffix.empty()) {
+ dump_header(s, "X-Container-Meta-Web-Index", ws_conf.index_doc_suffix);
+ }
+
+ if (! ws_conf.error_doc.empty()) {
+ dump_header(s, "X-Container-Meta-Web-Error", ws_conf.error_doc);
+ }
+
+ if (! ws_conf.subdir_marker.empty()) {
+ dump_header(s, "X-Container-Meta-Web-Directory-Type",
+ ws_conf.subdir_marker);
+ }
+
+ if (! ws_conf.listing_css_doc.empty()) {
+ dump_header(s, "X-Container-Meta-Web-Listings-CSS",
+ ws_conf.listing_css_doc);
+ }
+
+ if (ws_conf.listing_enabled) {
+ dump_header(s, "X-Container-Meta-Web-Listings", "true");
+ }
+
+ /* Dump bucket's modification time. Compliance with the Swift API really
+ * needs that. */
+ dump_last_modified(s, s->bucket_mtime);
+}
+
+// Run the generic account-stat logic, then additionally load the user's
+// attributes so send_response() can dump X-Account-Meta-* headers.
+// NOTE(review): the read_attrs() result overwrites any op_ret set by the
+// base execute() — looks intentional (attrs are required for the
+// response), but worth confirming that a base-class failure should not
+// be preserved here.
+void RGWStatAccount_ObjStore_SWIFT::execute(optional_yield y)
+{
+ RGWStatAccount_ObjStore::execute(y);
+ op_ret = s->user->read_attrs(s, s->yield);
+ attrs = s->user->get_attrs();
+}
+
+// HEAD on an account: all information travels in the headers (dumped by
+// dump_account_metadata), the body is empty and the status is 204.
+void RGWStatAccount_ObjStore_SWIFT::send_response()
+{
+ if (op_ret >= 0) {
+ op_ret = STATUS_NO_CONTENT;
+ dump_account_metadata(s,
+ global_stats,
+ policies_stats,
+ attrs,
+ s->user->get_info().quota.user_quota,
+ static_cast<RGWAccessControlPolicy_SWIFTAcct&>(*s->user_acl));
+ }
+
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+
+ end_header(s, NULL, NULL, 0, true);
+
+ dump_start(s);
+}
+
+// HEAD on a container: mirrors the account variant — headers carry the
+// container metadata, the body is empty, status 204 on success.
+void RGWStatBucket_ObjStore_SWIFT::send_response()
+{
+ if (op_ret >= 0) {
+ op_ret = STATUS_NO_CONTENT;
+ dump_container_metadata(s, bucket.get(), quota.bucket_quota,
+ s->bucket->get_info().website_conf);
+ }
+
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+
+ end_header(s, this, NULL, 0, true);
+ dump_start(s);
+}
+
+// Parse the Swift container-level settings carried in request headers:
+// ACLs (X-Container-Read/Write) and CORS (X-Container-Meta-Access-
+// Control-*). On return, *has_policy and *has_cors tell the caller
+// which of the two output structures were actually populated.
+// Returns 0 on success or a negative error code.
+static int get_swift_container_settings(req_state * const s,
+                                        rgw::sal::Driver* const driver,
+                                        RGWAccessControlPolicy * const policy,
+                                        bool * const has_policy,
+                                        uint32_t * rw_mask,
+                                        RGWCORSConfiguration * const cors_config,
+                                        bool * const has_cors)
+{
+  const char * const read_list = s->info.env->get("HTTP_X_CONTAINER_READ");
+  const char * const write_list = s->info.env->get("HTTP_X_CONTAINER_WRITE");
+
+  *has_policy = false;
+
+  if (read_list || write_list) {
+    RGWAccessControlPolicy_SWIFT swift_policy(s->cct);
+    const auto r = swift_policy.create(s, driver,
+                                       s->user->get_id(),
+                                       s->user->get_display_name(),
+                                       read_list,
+                                       write_list,
+                                       *rw_mask);
+    if (r < 0) {
+      return r;
+    }
+
+    *policy = swift_policy;
+    *has_policy = true;
+  }
+
+  *has_cors = false;
+
+  /* Check and update CORS configuration. */
+  const char *allow_origins = s->info.env->get("HTTP_X_CONTAINER_META_ACCESS_CONTROL_ALLOW_ORIGIN");
+  const char *allow_headers = s->info.env->get("HTTP_X_CONTAINER_META_ACCESS_CONTROL_ALLOW_HEADERS");
+  const char *expose_headers = s->info.env->get("HTTP_X_CONTAINER_META_ACCESS_CONTROL_EXPOSE_HEADERS");
+  const char *max_age = s->info.env->get("HTTP_X_CONTAINER_META_ACCESS_CONTROL_MAX_AGE");
+  if (allow_origins) {
+    // Fix: keep the CORS configuration on the stack. The previous code
+    // heap-allocated it with raw new and had to remember to delete it on
+    // both the error and the success path (and would have leaked it had
+    // create_update thrown).
+    RGWCORSConfiguration_SWIFT swift_cors;
+    int r = swift_cors.create_update(allow_origins, allow_headers, expose_headers, max_age);
+    if (r < 0) {
+      ldpp_dout(s, 0) << "Error creating/updating the cors configuration" << dendl;
+      return r;
+    }
+    *has_cors = true;
+    *cors_config = swift_cors;
+    cors_config->dump();
+  }
+
+  return 0;
+}
+
+#define ACCT_REMOVE_ATTR_PREFIX "HTTP_X_REMOVE_ACCOUNT_META_"
+#define ACCT_PUT_ATTR_PREFIX "HTTP_X_ACCOUNT_META_"
+#define CONT_REMOVE_ATTR_PREFIX "HTTP_X_REMOVE_CONTAINER_META_"
+#define CONT_PUT_ATTR_PREFIX "HTTP_X_CONTAINER_META_"
+
+// Collect the attribute names the client asked to remove, either via an
+// explicit X-Remove-* header or by sending an empty value for a regular
+// metadata header. Matching names (prefixed with RGW_ATTR_META_PREFIX
+// and lowercased) are inserted into rmattr_names.
+static void get_rmattrs_from_headers(const req_state * const s,
+                                     const char * const put_prefix,
+                                     const char * const del_prefix,
+                                     set<string>& rmattr_names)
+{
+  const size_t put_prefix_len = strlen(put_prefix);
+  const size_t del_prefix_len = strlen(del_prefix);
+
+  for (const auto& [hdr_name, hdr_val] : s->info.env->get_map()) {
+    const char * const p = hdr_name.c_str();
+    size_t prefix_len = 0;
+
+    if (strncasecmp(p, del_prefix, del_prefix_len) == 0) {
+      /* Explicitly requested removal. */
+      prefix_len = del_prefix_len;
+    } else if (hdr_val.empty() &&
+               strncasecmp(p, put_prefix, put_prefix_len) == 0) {
+      /* Removal requested by putting an empty value. */
+      prefix_len = put_prefix_len;
+    }
+
+    if (prefix_len > 0) {
+      string name(RGW_ATTR_META_PREFIX);
+      name.append(lowercase_dash_http_attr(p + prefix_len));
+      rmattr_names.insert(name);
+    }
+  }
+}
+
+// Interpret the Swift versioning headers: X-Remove-Versions-Location
+// clears the setting, X-Versions-Location sets it (rejected with 412
+// when versioning is globally disabled). swift_ver_location stays
+// unset when neither header is present.
+static int get_swift_versioning_settings(
+ req_state * const s,
+ boost::optional<std::string>& swift_ver_location)
+{
+ /* Removing the Swift's versions location has lower priority than setting
+ * a new one. That's the reason why we're handling it first. */
+ const std::string vlocdel =
+ s->info.env->get("HTTP_X_REMOVE_VERSIONS_LOCATION", "");
+ if (vlocdel.size()) {
+ // An empty string signals "remove the versions location".
+ swift_ver_location = boost::in_place(std::string());
+ }
+
+ if (s->info.env->exists("HTTP_X_VERSIONS_LOCATION")) {
+ /* If the Swift's versioning is globally disabled but someone wants to
+ * enable it for a given container, new version of Swift will generate
+ * the precondition failed error. */
+ if (! s->cct->_conf->rgw_swift_versioning_enabled) {
+ return -ERR_PRECONDITION_FAILED;
+ }
+
+ swift_ver_location = s->info.env->get("HTTP_X_VERSIONS_LOCATION", "");
+ }
+
+ return 0;
+}
+
+// Gather everything a Swift container PUT needs: ACL/CORS from headers,
+// metadata removals, placement policy and versioning settings.
+int RGWCreateBucket_ObjStore_SWIFT::get_params(optional_yield y)
+{
+ bool has_policy;
+ uint32_t policy_rw_mask = 0;
+
+ int r = get_swift_container_settings(s, driver, &policy, &has_policy,
+ &policy_rw_mask, &cors_config, &has_cors);
+ if (r < 0) {
+ return r;
+ }
+
+ // Without explicit X-Container-Read/Write headers the owner gets a
+ // default full-control ACL.
+ if (!has_policy) {
+ policy.create_default(s->user->get_id(), s->user->get_display_name());
+ }
+
+ location_constraint = driver->get_zone()->get_zonegroup().get_api_name();
+ get_rmattrs_from_headers(s, CONT_PUT_ATTR_PREFIX,
+ CONT_REMOVE_ATTR_PREFIX, rmattr_names);
+ placement_rule.init(s->info.env->get("HTTP_X_STORAGE_POLICY", ""), s->info.storage_class);
+
+ return get_swift_versioning_settings(s, swift_ver_location);
+}
+
+// Translate the internal metadata-limit errors into the EINVAL responses
+// (with human-readable messages) that Swift clients expect; any other
+// code is passed through untouched.
+static inline int handle_metadata_errors(req_state* const s, const int op_ret)
+{
+  switch (op_ret) {
+  case -EFBIG: {
+    /* A single custom attribute (stored as an xattr) was too large. */
+    const auto error_message = boost::str(
+      boost::format("Metadata value longer than %lld")
+        % s->cct->_conf.get_val<Option::size_t>("rgw_max_attr_size"));
+    set_req_state_err(s, EINVAL, error_message);
+    return -EINVAL;
+  }
+  case -E2BIG: {
+    /* The request carried more metadata items than permitted. */
+    const auto error_message = boost::str(
+      boost::format("Too many metadata items; max %lld")
+        % s->cct->_conf.get_val<uint64_t>("rgw_max_attrs_num_in_req"));
+    set_req_state_err(s, EINVAL, error_message);
+    return -EINVAL;
+  }
+  default:
+    return op_ret;
+  }
+}
+
+// Respond to a container PUT: metadata-limit errors become 400s,
+// success becomes 201 Created, and re-creating an existing container is
+// accepted with 202 (Swift semantics).
+void RGWCreateBucket_ObjStore_SWIFT::send_response()
+{
+ const auto meta_ret = handle_metadata_errors(s, op_ret);
+ if (meta_ret != op_ret) {
+ op_ret = meta_ret;
+ } else {
+ if (!op_ret) {
+ op_ret = STATUS_CREATED;
+ } else if (op_ret == -ERR_BUCKET_EXISTS) {
+ op_ret = STATUS_ACCEPTED;
+ }
+ set_req_state_err(s, op_ret);
+ }
+
+ dump_errno(s);
+ /* Propose ending HTTP header with 0 Content-Length header. */
+ end_header(s, NULL, NULL, 0);
+ rgw_flush_formatter_and_reset(s, s->formatter);
+}
+
+void RGWDeleteBucket_ObjStore_SWIFT::send_response()
+{
+  // Swift reports a successful container deletion as 204 No Content;
+  // any failure keeps its original error code.
+  const int r = op_ret ? op_ret : STATUS_NO_CONTENT;
+
+  set_req_state_err(s, r);
+  dump_errno(s);
+  end_header(s, this, NULL, 0);
+  rgw_flush_formatter_and_reset(s, s->formatter);
+}
+
+// Resolve the Swift object-expiration headers into an absolute time:
+// X-Delete-After (relative, added to now) takes precedence over
+// X-Delete-At (absolute epoch seconds). X-Remove-Delete-At yields an
+// engaged-but-zero time meaning "clear the expiration"; with none of
+// the headers present, delete_at stays boost::none.
+static int get_delete_at_param(req_state *s, boost::optional<real_time> &delete_at)
+{
+ /* Handle Swift object expiration. */
+ real_time delat_proposal;
+ string x_delete = s->info.env->get("HTTP_X_DELETE_AFTER", "");
+
+ if (x_delete.empty()) {
+ x_delete = s->info.env->get("HTTP_X_DELETE_AT", "");
+ } else {
+ /* X-Delete-After HTTP is present. It means we need add its value
+ * to the current time. */
+ delat_proposal = real_clock::now();
+ }
+
+ if (x_delete.empty()) {
+ delete_at = boost::none;
+ if (s->info.env->exists("HTTP_X_REMOVE_DELETE_AT")) {
+ delete_at = boost::in_place(real_time());
+ }
+ return 0;
+ }
+ string err;
+ long ts = strict_strtoll(x_delete.c_str(), 10, &err);
+
+ if (!err.empty()) {
+ return -EINVAL;
+ }
+
+ // For X-Delete-At, delat_proposal is still the epoch here, so adding
+ // ts produces the absolute timestamp directly.
+ delat_proposal += make_timespan(ts);
+ // Expiration times in the past are rejected.
+ if (delat_proposal < real_clock::now()) {
+ return -EINVAL;
+ }
+
+ delete_at = delat_proposal;
+
+ return 0;
+}
+
+int RGWPutObj_ObjStore_SWIFT::verify_permission(optional_yield y)
+{
+  op_ret = RGWPutObj_ObjStore::verify_permission(y);
+
+  /* Swift differentiates the denied-access status codes: an anonymous
+   * caller gets 401 Unauthorized (-EPERM), while an authenticated one
+   * lacking permission keeps 403 Forbidden (-EACCES). */
+  if (op_ret == -EACCES && s->auth.identity->is_anonymous()) {
+    return -EPERM;
+  }
+  return op_ret;
+}
+
/* Resolve and validate the stored size of a single SLO segment.
 *
 * entry.path has the form "/<bucket>/<object>" (any number of leading
 * slashes). The segment's actual size is read from the stored object
 * (decompressed size when a compression xattr is present). If the manifest
 * supplied a non-zero size_bytes that disagrees with the stored size, the
 * request is rejected; otherwise entry.size_bytes is filled in/confirmed.
 * Returns 0 on success or a negative error code. */
int RGWPutObj_ObjStore_SWIFT::update_slo_segment_size(rgw_slo_entry& entry) {

  int r = 0;
  const string& path = entry.path;

  /* If the path starts with slashes, strip them all. */
  const size_t pos_init = path.find_first_not_of('/');

  if (pos_init == string::npos) {
    return -EINVAL;
  }

  const size_t pos_sep = path.find('/', pos_init);
  if (pos_sep == string::npos) {
    return -EINVAL;
  }

  string bucket_name = path.substr(pos_init, pos_sep - pos_init);
  string obj_name = path.substr(pos_sep + 1);

  std::unique_ptr<rgw::sal::Bucket> bucket;

  /* Segments may live in a different bucket than the manifest itself. */
  if (bucket_name.compare(s->bucket->get_name()) != 0) {
    r = driver->get_bucket(s, s->user.get(), s->user->get_id().tenant, bucket_name, &bucket, s->yield);
    if (r < 0) {
      ldpp_dout(this, 0) << "could not get bucket info for bucket="
			 << bucket_name << dendl;
      return r;
    }
  } else {
    bucket = s->bucket->clone();
  }

  /* fetch the stored size of the seg (or error if not valid) */
  std::unique_ptr<rgw::sal::Object> slo_seg = bucket->get_object(rgw_obj_key(obj_name));

  /* no prefetch */
  slo_seg->set_atomic();

  bool compressed;
  RGWCompressionInfo cs_info;
  uint64_t size_bytes{0};

  r = slo_seg->get_obj_attrs(s->yield, s);
  if (r < 0) {
    return r;
  }

  size_bytes = slo_seg->get_obj_size();

  r = rgw_compression_info_from_attrset(slo_seg->get_attrs(), compressed, cs_info);
  if (r < 0) {
    return -EIO;
  }

  /* Sizes in the manifest refer to the logical (uncompressed) data. */
  if (compressed) {
    size_bytes = cs_info.orig_size;
  }

  /* "When the PUT operation sees the multipart-manifest=put query
   * parameter, it reads the request body and verifies that each
   * segment object exists and that the sizes and ETags match. If
   * there is a mismatch, the PUT operation fails."
   */
  if (entry.size_bytes &&
      (entry.size_bytes != size_bytes)) {
    return -EINVAL;
  }

  entry.size_bytes = size_bytes;

  return 0;
} /* RGWPutObj_ObjStore_SWIFT::update_slo_segment_sizes */
+
/* Gather all Swift-specific parameters of an object PUT:
 * chunked-upload detection, supplied ETag, content-type guessing from the
 * object suffix, X-Delete-At/After, the configured custom user-data header,
 * DLO manifest, and — for "multipart-manifest=put" — parsing and validating
 * the SLO manifest body. Returns 0 or a negative error code. */
int RGWPutObj_ObjStore_SWIFT::get_params(optional_yield y)
{
  if (s->has_bad_meta) {
    return -EINVAL;
  }

  /* Without a Content-Length the client must be streaming with
   * Transfer-Encoding: chunked; anything else is a 411. */
  if (!s->length) {
    const char *encoding = s->info.env->get("HTTP_TRANSFER_ENCODING");
    if (!encoding || strcmp(encoding, "chunked") != 0) {
      ldpp_dout(this, 20) << "neither length nor chunked encoding" << dendl;
      return -ERR_LENGTH_REQUIRED;
    }

    chunked_upload = true;
  }

  supplied_etag = s->info.env->get("HTTP_ETAG");

  /* No explicit Content-Type: try to derive one from the object name's
   * file extension via the mime map. */
  if (!s->generic_attrs.count(RGW_ATTR_CONTENT_TYPE)) {
    ldpp_dout(this, 5) << "content type wasn't provided, trying to guess" << dendl;
    const char *suffix = strrchr(s->object->get_name().c_str(), '.');
    if (suffix) {
      suffix++;
      if (*suffix) {
	string suffix_str(suffix);
	const char *mime = rgw_find_mime_by_ext(suffix_str);
	if (mime) {
	  s->generic_attrs[RGW_ATTR_CONTENT_TYPE] = mime;
	}
      }
    }
  }

  policy.create_default(s->user->get_id(), s->user->get_display_name());

  int r = get_delete_at_param(s, delete_at);
  if (r < 0) {
    ldpp_dout(this, 5) << "ERROR: failed to get Delete-At param" << dendl;
    return r;
  }

  /* Optionally capture a deployment-configured extra header verbatim. */
  if (!s->cct->_conf->rgw_swift_custom_header.empty()) {
    string custom_header = s->cct->_conf->rgw_swift_custom_header;
    auto data = s->info.env->get_optional(custom_header);
    if (data) {
      user_data = *data;
    }
  }

  dlo_manifest = s->info.env->get("HTTP_X_OBJECT_MANIFEST");
  bool exists;
  string multipart_manifest = s->info.args.get("multipart-manifest", &exists);
  if (exists) {
    if (multipart_manifest != "put") {
      ldpp_dout(this, 5) << "invalid multipart-manifest http param: " << multipart_manifest << dendl;
      return -EINVAL;
    }

#define MAX_SLO_ENTRY_SIZE (1024 + 128) // 1024 - max obj name, 128 - enough extra for other info
    uint64_t max_len = s->cct->_conf->rgw_max_slo_entries * MAX_SLO_ENTRY_SIZE;

    slo_info = new RGWSLOInfo;

    /* Read the JSON manifest; the raw body is kept so it can be stored
     * verbatim as the manifest object's data. */
    int r = 0;
    std::tie(r, slo_info->raw_data) = rgw_rest_get_json_input_keep_data(s->cct, s, slo_info->entries, max_len);
    if (r < 0) {
      ldpp_dout(this, 5) << "failed to read input for slo r=" << r << dendl;
      return r;
    }

    if ((int64_t)slo_info->entries.size() > s->cct->_conf->rgw_max_slo_entries) {
      ldpp_dout(this, 5) << "too many entries in slo request: " << slo_info->entries.size() << dendl;
      return -EINVAL;
    }

    /* The large object's ETag is the MD5 over the concatenated segment
     * ETags, not over the data itself. */
    MD5 etag_sum;
    // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
    etag_sum.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
    uint64_t total_size = 0;
    for (auto& entry : slo_info->entries) {
      etag_sum.Update((const unsigned char *)entry.etag.c_str(),
                      entry.etag.length());

      /* if size_bytes == 0, it should be replaced with the
       * real segment size (which could be 0); this follows from the
       * fact that Swift requires all segments to exist, but permits
       * the size_bytes element to be omitted from the SLO manifest, see
       * https://docs.openstack.org/swift/latest/api/large_objects.html
       */
      r = update_slo_segment_size(entry);
      if (r < 0) {
	return r;
      }

      total_size += entry.size_bytes;

      ldpp_dout(this, 20) << "slo_part: " << entry.path
                          << " size=" << entry.size_bytes
                          << " etag=" << entry.etag
                          << dendl;
    }
    complete_etag(etag_sum, &lo_etag);
    slo_info->total_size = total_size;

    /* The stored manifest object's size is the manifest text itself. */
    ofs = slo_info->raw_data.length();
  }

  return RGWPutObj_ObjStore::get_params(y);
}
+
+void RGWPutObj_ObjStore_SWIFT::send_response()
+{
+ const auto meta_ret = handle_metadata_errors(s, op_ret);
+ if (meta_ret) {
+ op_ret = meta_ret;
+ } else {
+ if (!op_ret) {
+ op_ret = STATUS_CREATED;
+ }
+ set_req_state_err(s, op_ret);
+ }
+
+ if (! lo_etag.empty()) {
+ /* Static Large Object of Swift API has two etags represented by
+ * following members:
+ * - etag - for the manifest itself (it will be stored in xattrs),
+ * - lo_etag - for the content composited from SLO's segments.
+ * The value is calculated basing on segments' etags.
+ * In response for PUT request we have to expose the second one.
+ * The first one may be obtained by GET with "multipart-manifest=get"
+ * in query string on a given SLO. */
+ dump_etag(s, lo_etag, true /* quoted */);
+ } else {
+ dump_etag(s, etag);
+ }
+
+ dump_last_modified(s, mtime);
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s, this);
+ rgw_flush_formatter_and_reset(s, s->formatter);
+}
+
+static int get_swift_account_settings(req_state * const s,
+ rgw::sal::Driver* const driver,
+ RGWAccessControlPolicy_SWIFTAcct* const policy,
+ bool * const has_policy)
+{
+ *has_policy = false;
+
+ const char * const acl_attr = s->info.env->get("HTTP_X_ACCOUNT_ACCESS_CONTROL");
+ if (acl_attr) {
+ RGWAccessControlPolicy_SWIFTAcct swift_acct_policy(s->cct);
+ const bool r = swift_acct_policy.create(s, driver,
+ s->user->get_id(),
+ s->user->get_display_name(),
+ string(acl_attr));
+ if (r != true) {
+ return -EINVAL;
+ }
+
+ *policy = swift_acct_policy;
+ *has_policy = true;
+ }
+
+ return 0;
+}
+
+int RGWPutMetadataAccount_ObjStore_SWIFT::get_params(optional_yield y)
+{
+ if (s->has_bad_meta) {
+ return -EINVAL;
+ }
+
+ int ret = get_swift_account_settings(s,
+ driver,
+ // FIXME: we need to carry unique_ptr in generic class
+ // and allocate appropriate ACL class in the ctor
+ static_cast<RGWAccessControlPolicy_SWIFTAcct *>(&policy),
+ &has_policy);
+ if (ret < 0) {
+ return ret;
+ }
+
+ get_rmattrs_from_headers(s, ACCT_PUT_ATTR_PREFIX, ACCT_REMOVE_ATTR_PREFIX,
+ rmattr_names);
+ return 0;
+}
+
+void RGWPutMetadataAccount_ObjStore_SWIFT::send_response()
+{
+ const auto meta_ret = handle_metadata_errors(s, op_ret);
+ if (meta_ret != op_ret) {
+ op_ret = meta_ret;
+ } else {
+ if (!op_ret) {
+ op_ret = STATUS_NO_CONTENT;
+ }
+ set_req_state_err(s, op_ret);
+ }
+
+ dump_errno(s);
+ end_header(s, this);
+ rgw_flush_formatter_and_reset(s, s->formatter);
+}
+
+int RGWPutMetadataBucket_ObjStore_SWIFT::get_params(optional_yield y)
+{
+ if (s->has_bad_meta) {
+ return -EINVAL;
+ }
+
+ int r = get_swift_container_settings(s, driver, &policy, &has_policy,
+ &policy_rw_mask, &cors_config, &has_cors);
+ if (r < 0) {
+ return r;
+ }
+
+ get_rmattrs_from_headers(s, CONT_PUT_ATTR_PREFIX, CONT_REMOVE_ATTR_PREFIX,
+ rmattr_names);
+ placement_rule.init(s->info.env->get("HTTP_X_STORAGE_POLICY", ""), s->info.storage_class);
+
+ return get_swift_versioning_settings(s, swift_ver_location);
+}
+
+void RGWPutMetadataBucket_ObjStore_SWIFT::send_response()
+{
+ const auto meta_ret = handle_metadata_errors(s, op_ret);
+ if (meta_ret != op_ret) {
+ op_ret = meta_ret;
+ } else {
+ if (!op_ret && (op_ret != -EINVAL)) {
+ op_ret = STATUS_NO_CONTENT;
+ }
+ set_req_state_err(s, op_ret);
+ }
+
+ dump_errno(s);
+ end_header(s, this);
+ rgw_flush_formatter_and_reset(s, s->formatter);
+}
+
+int RGWPutMetadataObject_ObjStore_SWIFT::get_params(optional_yield y)
+{
+ if (s->has_bad_meta) {
+ return -EINVAL;
+ }
+
+ /* Handle Swift object expiration. */
+ int r = get_delete_at_param(s, delete_at);
+ if (r < 0) {
+ ldpp_dout(this, 5) << "ERROR: failed to get Delete-At param" << dendl;
+ return r;
+ }
+
+ dlo_manifest = s->info.env->get("HTTP_X_OBJECT_MANIFEST");
+
+ return 0;
+}
+
+void RGWPutMetadataObject_ObjStore_SWIFT::send_response()
+{
+ const auto meta_ret = handle_metadata_errors(s, op_ret);
+ if (meta_ret != op_ret) {
+ op_ret = meta_ret;
+ } else {
+ if (!op_ret) {
+ op_ret = STATUS_ACCEPTED;
+ }
+ set_req_state_err(s, op_ret);
+ }
+
+ if (!s->is_err()) {
+ dump_content_length(s, 0);
+ }
+
+ dump_errno(s);
+ end_header(s, this);
+ rgw_flush_formatter_and_reset(s, s->formatter);
+}
+
/* Render the JSON body of a Swift bulk-delete response.
 *
 * The per-request HTTP status is always 200; the real outcome is carried in
 * the "Response Status" field inside the body, per Swift's bulk middleware
 * conventions. A non-ENOENT/EACCES failure dominates the reported status;
 * zero deleted + zero unfound means the request body was empty/invalid. */
static void bulkdelete_respond(const unsigned num_deleted,
                               const unsigned int num_unfound,
                               const std::list<RGWBulkDelete::fail_desc_t>& failures,
                               const int prot_flags,                 /* in  */
                               ceph::Formatter& formatter)           /* out */
{
  formatter.open_object_section("delete");

  string resp_status;
  string resp_body;

  if (!failures.empty()) {
    int reason = ERR_INVALID_REQUEST;
    /* The last "serious" error (anything but not-found/forbidden) wins. */
    for (const auto& fail_desc : failures) {
      if (-ENOENT != fail_desc.err && -EACCES != fail_desc.err) {
        reason = fail_desc.err;
      }
    }
    rgw_err err;
    set_req_state_err(err, reason, prot_flags);
    dump_errno(err, resp_status);
  } else if (0 == num_deleted && 0 == num_unfound) {
    /* 400 Bad Request */
    dump_errno(400, resp_status);
    resp_body = "Invalid bulk delete.";
  } else {
    /* 200 OK */
    dump_errno(200, resp_status);
  }

  encode_json("Number Deleted", num_deleted, &formatter);
  encode_json("Number Not Found", num_unfound, &formatter);
  encode_json("Response Body", resp_body, &formatter);
  encode_json("Response Status", resp_status, &formatter);

  /* Per-path failure details: [name, status] pairs. */
  formatter.open_array_section("Errors");
  for (const auto& fail_desc : failures) {
    formatter.open_array_section("object");

    stringstream ss_name;
    ss_name << fail_desc.path;
    encode_json("Name", ss_name.str(), &formatter);

    rgw_err err;
    set_req_state_err(err, fail_desc.err, prot_flags);
    string status;
    dump_errno(err, status);
    encode_json("Status", status, &formatter);
    formatter.close_section();
  }
  formatter.close_section();

  formatter.close_section();
}
+
+int RGWDeleteObj_ObjStore_SWIFT::verify_permission(optional_yield y)
+{
+ op_ret = RGWDeleteObj_ObjStore::verify_permission(y);
+
+ /* We have to differentiate error codes depending on whether user is
+ * anonymous (401 Unauthorized) or he doesn't have necessary permissions
+ * (403 Forbidden). */
+ if (s->auth.identity->is_anonymous() && op_ret == -EACCES) {
+ return -EPERM;
+ } else {
+ return op_ret;
+ }
+}
+
+int RGWDeleteObj_ObjStore_SWIFT::get_params(optional_yield y)
+{
+ const string& mm = s->info.args.get("multipart-manifest");
+ multipart_delete = (mm.compare("delete") == 0);
+
+ return RGWDeleteObj_ObjStore::get_params(y);
+}
+
/* Emit the DELETE response. Plain deletes report 204 No Content; the
 * multipart-manifest=delete variant always answers 200 and reports the real
 * outcome in a bulk-delete JSON body (chunked, since its length is not
 * known up front). */
void RGWDeleteObj_ObjStore_SWIFT::send_response()
{
  int r = op_ret;

  if (multipart_delete) {
    /* Bulk path: the HTTP status is forced to success; per-object results
     * go into the body below. */
    r = 0;
  } else if(!r) {
    r = STATUS_NO_CONTENT;
  }

  set_req_state_err(s, r);
  dump_errno(s);

  if (multipart_delete) {
    end_header(s, this /* RGWOp */, nullptr /* contype */,
               CHUNKED_TRANSFER_ENCODING);

    if (deleter) {
      bulkdelete_respond(deleter->get_num_deleted(),
                         deleter->get_num_unfound(),
                         deleter->get_failures(),
                         s->prot_flags,
                         *s->formatter);
    } else if (-ENOENT == op_ret) {
      /* The deleter was never created: the manifest itself was missing. */
      bulkdelete_respond(0, 1, {}, s->prot_flags, *s->formatter);
    } else {
      /* Some other early failure: report it as a single failed path. */
      RGWBulkDelete::acct_path_t path;
      path.bucket_name = s->bucket_name;
      path.obj_key = s->object->get_key();

      RGWBulkDelete::fail_desc_t fail_desc;
      fail_desc.err = op_ret;
      fail_desc.path = path;

      bulkdelete_respond(0, 0, { fail_desc }, s->prot_flags, *s->formatter);
    }
  } else {
    end_header(s, this);
  }

  rgw_flush_formatter_and_reset(s, s->formatter);

}
+
+static void get_contype_from_attrs(map<string, bufferlist>& attrs,
+ string& content_type)
+{
+ map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_CONTENT_TYPE);
+ if (iter != attrs.end()) {
+ content_type = rgw_bl_str(iter->second);
+ }
+}
+
+static void dump_object_metadata(const DoutPrefixProvider* dpp, req_state * const s,
+ const map<string, bufferlist>& attrs)
+{
+ map<string, string> response_attrs;
+
+ for (auto kv : attrs) {
+ const char * name = kv.first.c_str();
+ const auto aiter = rgw_to_http_attrs.find(name);
+
+ if (aiter != std::end(rgw_to_http_attrs)) {
+ response_attrs[aiter->second] = rgw_bl_str(kv.second);
+ } else if (strcmp(name, RGW_ATTR_SLO_UINDICATOR) == 0) {
+ // this attr has an extra length prefix from encode() in prior versions
+ dump_header(s, "X-Object-Meta-Static-Large-Object", "True");
+ } else if (strncmp(name, RGW_ATTR_META_PREFIX,
+ sizeof(RGW_ATTR_META_PREFIX)-1) == 0) {
+ name += sizeof(RGW_ATTR_META_PREFIX) - 1;
+ dump_header_prefixed(s, "X-Object-Meta-",
+ camelcase_dash_http_attr(name), kv.second);
+ }
+ }
+
+ /* Handle override and fallback for Content-Disposition HTTP header.
+ * At the moment this will be used only by TempURL of the Swift API. */
+ const auto cditer = rgw_to_http_attrs.find(RGW_ATTR_CONTENT_DISP);
+ if (cditer != std::end(rgw_to_http_attrs)) {
+ const auto& name = cditer->second;
+
+ if (!s->content_disp.override.empty()) {
+ response_attrs[name] = s->content_disp.override;
+ } else if (!s->content_disp.fallback.empty()
+ && response_attrs.find(name) == std::end(response_attrs)) {
+ response_attrs[name] = s->content_disp.fallback;
+ }
+ }
+
+ for (const auto& kv : response_attrs) {
+ dump_header(s, kv.first, kv.second);
+ }
+
+ const auto iter = attrs.find(RGW_ATTR_DELETE_AT);
+ if (iter != std::end(attrs)) {
+ utime_t delete_at;
+ try {
+ decode(delete_at, iter->second);
+ if (!delete_at.is_zero()) {
+ dump_header(s, "X-Delete-At", delete_at.sec());
+ }
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: cannot decode object's " RGW_ATTR_DELETE_AT
+ " attr, ignoring"
+ << dendl;
+ }
+ }
+}
+
/* The copy destination always gets a default (owner-full-control) ACL;
 * Swift has no way to supply one in a COPY request. Always returns 0. */
int RGWCopyObj_ObjStore_SWIFT::init_dest_policy()
{
  dest_policy.create_default(s->user->get_id(), s->user->get_display_name());

  return 0;
}
+
+int RGWCopyObj_ObjStore_SWIFT::get_params(optional_yield y)
+{
+ if_mod = s->info.env->get("HTTP_IF_MODIFIED_SINCE");
+ if_unmod = s->info.env->get("HTTP_IF_UNMODIFIED_SINCE");
+ if_match = s->info.env->get("HTTP_COPY_IF_MATCH");
+ if_nomatch = s->info.env->get("HTTP_COPY_IF_NONE_MATCH");
+
+ const char * const fresh_meta = s->info.env->get("HTTP_X_FRESH_METADATA");
+ if (fresh_meta && strcasecmp(fresh_meta, "TRUE") == 0) {
+ attrs_mod = rgw::sal::ATTRSMOD_REPLACE;
+ } else {
+ attrs_mod = rgw::sal::ATTRSMOD_MERGE;
+ }
+
+ int r = get_delete_at_param(s, delete_at);
+ if (r < 0) {
+ ldpp_dout(this, 5) << "ERROR: failed to get Delete-At param" << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+void RGWCopyObj_ObjStore_SWIFT::send_partial_response(off_t ofs)
+{
+ if (! sent_header) {
+ if (! op_ret)
+ op_ret = STATUS_CREATED;
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s, this);
+
+ /* Send progress information. Note that this diverge from the original swift
+ * spec. We do this in order to keep connection alive.
+ */
+ if (op_ret == 0) {
+ s->formatter->open_array_section("progress");
+ }
+ sent_header = true;
+ } else {
+ s->formatter->dump_int("ofs", (uint64_t)ofs);
+ }
+ rgw_flush_formatter(s, s->formatter);
+}
+
+void RGWCopyObj_ObjStore_SWIFT::dump_copy_info()
+{
+ /* Dump X-Copied-From. */
+ dump_header(s, "X-Copied-From", url_encode(src_bucket->get_name()) +
+ "/" + url_encode(s->src_object->get_name()));
+
+ /* Dump X-Copied-From-Account. */
+ /* XXX tenant */
+ dump_header(s, "X-Copied-From-Account", url_encode(s->user->get_id().id));
+
+ /* Dump X-Copied-From-Last-Modified. */
+ dump_time_header(s, "X-Copied-From-Last-Modified", src_mtime);
+}
+
+void RGWCopyObj_ObjStore_SWIFT::send_response()
+{
+ if (! sent_header) {
+ string content_type;
+ if (! op_ret)
+ op_ret = STATUS_CREATED;
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ dump_etag(s, etag);
+ dump_last_modified(s, mtime);
+ dump_copy_info();
+ get_contype_from_attrs(attrs, content_type);
+ dump_object_metadata(this, s, attrs);
+ end_header(s, this, !content_type.empty() ? content_type.c_str()
+ : "binary/octet-stream");
+ } else {
+ s->formatter->close_section();
+ rgw_flush_formatter(s, s->formatter);
+ }
+}
+
+int RGWGetObj_ObjStore_SWIFT::verify_permission(optional_yield y)
+{
+ op_ret = RGWGetObj_ObjStore::verify_permission(y);
+
+ /* We have to differentiate error codes depending on whether user is
+ * anonymous (401 Unauthorized) or he doesn't have necessary permissions
+ * (403 Forbidden). */
+ if (s->auth.identity->is_anonymous() && op_ret == -EACCES) {
+ return -EPERM;
+ } else {
+ return op_ret;
+ }
+}
+
+int RGWGetObj_ObjStore_SWIFT::get_params(optional_yield y)
+{
+ const string& mm = s->info.args.get("multipart-manifest");
+ skip_manifest = (mm.compare("get") == 0);
+
+ return RGWGetObj_ObjStore::get_params(y);
+}
+
+int RGWGetObj_ObjStore_SWIFT::send_response_data_error(optional_yield y)
+{
+ std::string error_content;
+ op_ret = error_handler(op_ret, &error_content, y);
+ if (! op_ret) {
+ /* The error handler has taken care of the error. */
+ return 0;
+ }
+
+ bufferlist error_bl;
+ error_bl.append(error_content);
+ return send_response_data(error_bl, 0, error_bl.length());
+}
+
/* Stream one chunk of a GET response. On the first call this emits the full
 * header set (status, Range, Content-Length, timestamps, etag, metadata);
 * subsequent calls jump straight to the body. Header emission order is part
 * of the wire format — do not reorder. */
int RGWGetObj_ObjStore_SWIFT::send_response_data(bufferlist& bl,
                                                 const off_t bl_ofs,
                                                 const off_t bl_len)
{
  string content_type;

  if (sent_header) {
    goto send_data;
  }

  if (custom_http_ret) {
    /* A handler (e.g. staticweb) forced a specific HTTP status. */
    set_req_state_err(s, 0);
    dump_errno(s, custom_http_ret);
  } else {
    set_req_state_err(s, (partial_content && !op_ret) ? STATUS_PARTIAL_CONTENT
		      : op_ret);
    dump_errno(s);

    if (s->is_err()) {
      end_header(s, NULL);
      return 0;
    }
  }

  if (range_str) {
    dump_range(s, ofs, end, s->obj_size);
  }

  /* Second is_err() check: also covers the custom_http_ret path, which
   * skipped the one above. */
  if (s->is_err()) {
    end_header(s, NULL);
    return 0;
  }

  dump_content_length(s, total_len);
  dump_last_modified(s, lastmod);
  dump_header(s, "X-Timestamp", utime_t(lastmod));
  if (is_slo) {
    dump_header(s, "X-Static-Large-Object", "True");
  }

  if (! op_ret) {
    /* For an SLO expose the composite etag; otherwise the stored one. */
    if (! lo_etag.empty()) {
      dump_etag(s, lo_etag, true /* quoted */);
    } else {
      auto iter = attrs.find(RGW_ATTR_ETAG);
      if (iter != attrs.end()) {
        dump_etag(s, iter->second.to_str());
      }
    }

    get_contype_from_attrs(attrs, content_type);
    dump_object_metadata(this, s, attrs);
  }

  end_header(s, this, !content_type.empty() ? content_type.c_str()
	     : "binary/octet-stream");

  sent_header = true;

send_data:
  /* Body bytes are suppressed for HEAD (get_data == false) and errors. */
  if (get_data && !op_ret) {
    const auto r = dump_body(s, bl.c_str() + bl_ofs, bl_len);
    if (r < 0) {
      return r;
    }
  }
  rgw_flush_formatter_and_reset(s, s->formatter);

  return 0;
}
+
+void RGWOptionsCORS_ObjStore_SWIFT::send_response()
+{
+ string hdrs, exp_hdrs;
+ uint32_t max_age = CORS_MAX_AGE_INVALID;
+ /*EACCES means, there is no CORS registered yet for the bucket
+ *ENOENT means, there is no match of the Origin in the list of CORSRule
+ */
+ if (op_ret == -ENOENT)
+ op_ret = -EACCES;
+ if (op_ret < 0) {
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s, NULL);
+ return;
+ }
+ get_response_params(hdrs, exp_hdrs, &max_age);
+ dump_errno(s);
+ dump_access_control(s, origin, req_meth, hdrs.c_str(), exp_hdrs.c_str(),
+ max_age);
+ end_header(s, NULL);
+}
+
+int RGWBulkDelete_ObjStore_SWIFT::get_data(
+ list<RGWBulkDelete::acct_path_t>& items, bool * const is_truncated)
+{
+ constexpr size_t MAX_LINE_SIZE = 2048;
+
+ RGWClientIOStreamBuf ciosb(static_cast<RGWRestfulIO&>(*(s->cio)),
+ size_t(s->cct->_conf->rgw_max_chunk_size));
+ istream cioin(&ciosb);
+
+ char buf[MAX_LINE_SIZE];
+ while (cioin.getline(buf, sizeof(buf))) {
+ string path_str(buf);
+
+ ldpp_dout(this, 20) << "extracted Bulk Delete entry: " << path_str << dendl;
+
+ RGWBulkDelete::acct_path_t path;
+
+ /* We need to skip all slashes at the beginning in order to preserve
+ * compliance with Swift. */
+ const size_t start_pos = path_str.find_first_not_of('/');
+
+ if (string::npos != start_pos) {
+ /* Seperator is the first slash after the leading ones. */
+ const size_t sep_pos = path_str.find('/', start_pos);
+
+ if (string::npos != sep_pos) {
+ path.bucket_name = url_decode(path_str.substr(start_pos,
+ sep_pos - start_pos));
+ path.obj_key = url_decode(path_str.substr(sep_pos + 1));
+ } else {
+ /* It's guaranteed here that bucket name is at least one character
+ * long and is different than slash. */
+ path.bucket_name = url_decode(path_str.substr(start_pos));
+ }
+
+ items.push_back(path);
+ }
+
+ if (items.size() == MAX_CHUNK_ENTRIES) {
+ *is_truncated = true;
+ return 0;
+ }
+ }
+
+ *is_truncated = false;
+ return 0;
+}
+
+void RGWBulkDelete_ObjStore_SWIFT::send_response()
+{
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s, this /* RGWOp */, nullptr /* contype */,
+ CHUNKED_TRANSFER_ENCODING);
+
+ bulkdelete_respond(deleter->get_num_deleted(),
+ deleter->get_num_unfound(),
+ deleter->get_failures(),
+ s->prot_flags,
+ *s->formatter);
+ rgw_flush_formatter_and_reset(s, s->formatter);
+}
+
+
+std::unique_ptr<RGWBulkUploadOp::StreamGetter>
+RGWBulkUploadOp_ObjStore_SWIFT::create_stream()
+{
+ class SwiftStreamGetter : public StreamGetter {
+ const DoutPrefixProvider* dpp;
+ const size_t conlen;
+ size_t curpos;
+ req_state* const s;
+
+ public:
+ SwiftStreamGetter(const DoutPrefixProvider* dpp, req_state* const s, const size_t conlen)
+ : dpp(dpp),
+ conlen(conlen),
+ curpos(0),
+ s(s) {
+ }
+
+ ssize_t get_at_most(size_t want, ceph::bufferlist& dst) override {
+ /* maximum requested by a caller */
+ /* data provided by client */
+ /* RadosGW's limit. */
+ const size_t max_chunk_size = \
+ static_cast<size_t>(s->cct->_conf->rgw_max_chunk_size);
+ const size_t max_to_read = std::min({ want, conlen - curpos, max_chunk_size });
+
+ ldpp_dout(dpp, 20) << "bulk_upload: get_at_most max_to_read="
+ << max_to_read
+ << ", dst.c_str()=" << reinterpret_cast<intptr_t>(dst.c_str()) << dendl;
+
+ bufferptr bp(max_to_read);
+ const auto read_len = recv_body(s, bp.c_str(), max_to_read);
+ dst.append(bp, 0, read_len);
+ //const auto read_len = recv_body(s, dst.c_str(), max_to_read);
+ if (read_len < 0) {
+ return read_len;
+ }
+
+ curpos += read_len;
+ return curpos > s->cct->_conf->rgw_max_put_size ? -ERR_TOO_LARGE
+ : read_len;
+ }
+
+ ssize_t get_exactly(size_t want, ceph::bufferlist& dst) override {
+ ldpp_dout(dpp, 20) << "bulk_upload: get_exactly want=" << want << dendl;
+
+ /* FIXME: do this in a loop. */
+ const auto ret = get_at_most(want, dst);
+ ldpp_dout(dpp, 20) << "bulk_upload: get_exactly ret=" << ret << dendl;
+ if (ret < 0) {
+ return ret;
+ } else if (static_cast<size_t>(ret) != want) {
+ return -EINVAL;
+ } else {
+ return want;
+ }
+ }
+ };
+
+ if (! s->length) {
+ op_ret = -EINVAL;
+ return nullptr;
+ } else {
+ ldpp_dout(this, 20) << "bulk upload: create_stream for length="
+ << s->length << dendl;
+
+ const size_t conlen = atoll(s->length);
+ return std::unique_ptr<SwiftStreamGetter>(new SwiftStreamGetter(this, s, conlen));
+ }
+}
+
+void RGWBulkUploadOp_ObjStore_SWIFT::send_response()
+{
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s, this /* RGWOp */, nullptr /* contype */,
+ CHUNKED_TRANSFER_ENCODING);
+ rgw_flush_formatter_and_reset(s, s->formatter);
+
+ s->formatter->open_object_section("delete");
+
+ std::string resp_status;
+ std::string resp_body;
+
+ if (! failures.empty()) {
+ rgw_err err;
+
+ const auto last_err = { failures.back().err };
+ if (boost::algorithm::contains(last_err, terminal_errors)) {
+ /* The terminal errors are affecting the status of the whole upload. */
+ set_req_state_err(err, failures.back().err, s->prot_flags);
+ } else {
+ set_req_state_err(err, ERR_INVALID_REQUEST, s->prot_flags);
+ }
+
+ dump_errno(err, resp_status);
+ } else if (0 == num_created && failures.empty()) {
+ /* Nothing created, nothing failed. This means the archive contained no
+ * entity we could understand (regular file or directory). We need to
+ * send 400 Bad Request to an HTTP client in the internal status field. */
+ dump_errno(400, resp_status);
+ resp_body = "Invalid Tar File: No Valid Files";
+ } else {
+ /* 200 OK */
+ dump_errno(201, resp_status);
+ }
+
+ encode_json("Number Files Created", num_created, s->formatter);
+ encode_json("Response Body", resp_body, s->formatter);
+ encode_json("Response Status", resp_status, s->formatter);
+
+ s->formatter->open_array_section("Errors");
+ for (const auto& fail_desc : failures) {
+ s->formatter->open_array_section("object");
+
+ encode_json("Name", fail_desc.path, s->formatter);
+
+ rgw_err err;
+ set_req_state_err(err, fail_desc.err, s->prot_flags);
+ std::string status;
+ dump_errno(err, status);
+ encode_json("Status", status, s->formatter);
+
+ s->formatter->close_section();
+ }
+ s->formatter->close_section();
+
+ s->formatter->close_section();
+ rgw_flush_formatter_and_reset(s, s->formatter);
+}
+
+
+void RGWGetCrossDomainPolicy_ObjStore_SWIFT::send_response()
+{
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s, this, "application/xml");
+
+ std::stringstream ss;
+
+ ss << R"(<?xml version="1.0"?>)" << "\n"
+ << R"(<!DOCTYPE cross-domain-policy SYSTEM )"
+ << R"("http://www.adobe.com/xml/dtds/cross-domain-policy.dtd" >)" << "\n"
+ << R"(<cross-domain-policy>)" << "\n"
+ << g_conf()->rgw_cross_domain_policy << "\n"
+ << R"(</cross-domain-policy>)";
+
+ dump_body(s, ss.str());
+}
+
+void RGWGetHealthCheck_ObjStore_SWIFT::send_response()
+{
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s, this, "application/xml");
+
+ if (op_ret) {
+ static constexpr char DISABLED[] = "DISABLED BY FILE";
+ dump_body(s, DISABLED, strlen(DISABLED));
+ }
+}
+
/* Capability table backing GET /info. Each entry maps a capability name to
 * { is_admin_info, list_data }: admin-only entries are shown only with a
 * valid swiftinfo_sig/swiftinfo_expires pair, and a null list_data renders
 * the capability as an empty JSON object. */
const vector<pair<string, RGWInfo_ObjStore_SWIFT::info>> RGWInfo_ObjStore_SWIFT::swift_info =
{
    {"bulk_delete", {false, nullptr}},
    {"container_quotas", {false, nullptr}},
    {"swift", {false, RGWInfo_ObjStore_SWIFT::list_swift_data}},
    {"tempurl", { false, RGWInfo_ObjStore_SWIFT::list_tempurl_data}},
    {"slo", {false, RGWInfo_ObjStore_SWIFT::list_slo_data}},
    {"account_quotas", {false, nullptr}},
    {"staticweb", {false, nullptr}},
    {"tempauth", {false, RGWInfo_ObjStore_SWIFT::list_tempauth_data}},
};
+
+void RGWInfo_ObjStore_SWIFT::execute(optional_yield y)
+{
+ bool is_admin_info_enabled = false;
+
+ const string& swiftinfo_sig = s->info.args.get("swiftinfo_sig");
+ const string& swiftinfo_expires = s->info.args.get("swiftinfo_expires");
+
+ if (!swiftinfo_sig.empty() &&
+ !swiftinfo_expires.empty() &&
+ !is_expired(swiftinfo_expires, this)) {
+ is_admin_info_enabled = true;
+ }
+
+ s->formatter->open_object_section("info");
+
+ for (const auto& pair : swift_info) {
+ if(!is_admin_info_enabled && pair.second.is_admin_info)
+ continue;
+
+ if (!pair.second.list_data) {
+ s->formatter->open_object_section((pair.first).c_str());
+ s->formatter->close_section();
+ }
+ else {
+ pair.second.list_data(*(s->formatter), s->cct->_conf, driver);
+ }
+ }
+
+ s->formatter->close_section();
+}
+
+void RGWInfo_ObjStore_SWIFT::send_response()
+{
+ if (op_ret < 0) {
+ op_ret = STATUS_NO_CONTENT;
+ }
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ end_header(s, this);
+ rgw_flush_formatter_and_reset(s, s->formatter);
+}
+
+void RGWInfo_ObjStore_SWIFT::list_swift_data(Formatter& formatter,
+ const ConfigProxy& config,
+ rgw::sal::Driver* driver)
+{
+ formatter.open_object_section("swift");
+ formatter.dump_int("max_file_size", config->rgw_max_put_size);
+ formatter.dump_int("container_listing_limit", RGW_LIST_BUCKETS_LIMIT_MAX);
+
+ string ceph_version(CEPH_GIT_NICE_VER);
+ formatter.dump_string("version", ceph_version);
+
+ const size_t max_attr_name_len = \
+ g_conf().get_val<Option::size_t>("rgw_max_attr_name_len");
+ if (max_attr_name_len) {
+ const size_t meta_name_limit = \
+ max_attr_name_len - strlen(RGW_ATTR_PREFIX RGW_AMZ_META_PREFIX);
+ formatter.dump_int("max_meta_name_length", meta_name_limit);
+ }
+
+ const size_t meta_value_limit = g_conf().get_val<Option::size_t>("rgw_max_attr_size");
+ if (meta_value_limit) {
+ formatter.dump_int("max_meta_value_length", meta_value_limit);
+ }
+
+ const size_t meta_num_limit = \
+ g_conf().get_val<uint64_t>("rgw_max_attrs_num_in_req");
+ if (meta_num_limit) {
+ formatter.dump_int("max_meta_count", meta_num_limit);
+ }
+
+ formatter.open_array_section("policies");
+ const rgw::sal::ZoneGroup& zonegroup = driver->get_zone()->get_zonegroup();
+
+ std::set<std::string> targets;
+ zonegroup.get_placement_target_names(targets);
+ for (const auto& placement_targets : targets) {
+ formatter.open_object_section("policy");
+ if (placement_targets.compare(zonegroup.get_default_placement_name()) == 0)
+ formatter.dump_bool("default", true);
+ formatter.dump_string("name", placement_targets.c_str());
+ formatter.close_section();
+ }
+ formatter.close_section();
+
+ formatter.dump_int("max_object_name_size", RGWHandler_REST::MAX_OBJ_NAME_LEN);
+ formatter.dump_bool("strict_cors_mode", true);
+ formatter.dump_int("max_container_name_length", RGWHandler_REST::MAX_BUCKET_NAME_LEN);
+ formatter.close_section();
+}
+
/* Emit the "tempauth" capability section of GET /info; RGW always supports
 * account-level ACLs, so the flag is hard-coded true. */
void RGWInfo_ObjStore_SWIFT::list_tempauth_data(Formatter& formatter,
                                                 const ConfigProxy& config,
                                                 rgw::sal::Driver* driver)
{
  formatter.open_object_section("tempauth");
  formatter.dump_bool("account_acls", true);
  formatter.close_section();
}
+void RGWInfo_ObjStore_SWIFT::list_tempurl_data(Formatter& formatter,
+ const ConfigProxy& config,
+ rgw::sal::Driver* driver)
+{
+ formatter.open_object_section("tempurl");
+ formatter.open_array_section("methods");
+ formatter.dump_string("methodname", "GET");
+ formatter.dump_string("methodname", "HEAD");
+ formatter.dump_string("methodname", "PUT");
+ formatter.dump_string("methodname", "POST");
+ formatter.dump_string("methodname", "DELETE");
+ formatter.close_section();
+ formatter.close_section();
+}
+
/* Emit the "slo" capability section of GET /info: the maximum number of
 * segments allowed in a static large object manifest. */
void RGWInfo_ObjStore_SWIFT::list_slo_data(Formatter& formatter,
                                            const ConfigProxy& config,
                                            rgw::sal::Driver* driver)
{
  formatter.open_object_section("slo");
  formatter.dump_int("max_manifest_segments", config->rgw_max_slo_entries);
  formatter.close_section();
}
+
+bool RGWInfo_ObjStore_SWIFT::is_expired(const std::string& expires, const DoutPrefixProvider *dpp)
+{
+ string err;
+ const utime_t now = ceph_clock_now();
+ const uint64_t expiration = (uint64_t)strict_strtoll(expires.c_str(),
+ 10, &err);
+ if (!err.empty()) {
+ ldpp_dout(dpp, 5) << "failed to parse siginfo_expires: " << err << dendl;
+ return true;
+ }
+
+ if (expiration <= (uint64_t)now.sec()) {
+ ldpp_dout(dpp, 5) << "siginfo expired: " << expiration << " <= " << now.sec() << dendl;
+ return true;
+ }
+
+ return false;
+}
+
+
/* FormPost targets a *prefix*, not a concrete object: the object name from
 * the URL is captured as the upload prefix and the request's object key is
 * cleared before the generic POST initialization runs. */
void RGWFormPost::init(rgw::sal::Driver* const driver,
                       req_state* const s,
                       RGWHandler* const dialect_handler)
{
  if (!rgw::sal::Object::empty(s->object)) {
    /* NOTE(review): std::move here only actually steals the string if
     * get_name() returns a non-const reference; the subsequent set_key()
     * resets the object's key either way. Keep this ordering. */
    prefix = std::move(s->object->get_name());
    s->object->set_key(rgw_obj_key());
  }

  return RGWPostObj_ObjStore::init(driver, s, dialect_handler);
}
+
+/* Parse the form's "max_file_size" control part. Returns 0 (i.e. no file
+ * allowed) when the part is absent or fails to parse as an integer. */
+std::size_t RGWFormPost::get_max_file_size() /*const*/
+{
+ std::string max_str = get_part_str(ctrl_parts, "max_file_size", "0");
+
+ std::string err;
+ const std::size_t max_file_size =
+ static_cast<uint64_t>(strict_strtoll(max_str.c_str(), 10, &err));
+
+ if (! err.empty()) {
+ ldpp_dout(this, 5) << "failed to parse FormPost's max_file_size: " << err
+ << dendl;
+ return 0;
+ }
+
+ return max_file_size;
+}
+
+/* Check the form's "expires" control part (Unix-epoch seconds) against the
+ * current clock. Returns false on parse failure or when the deadline has
+ * passed, so a malformed form is rejected rather than accepted. */
+bool RGWFormPost::is_non_expired()
+{
+ std::string expires = get_part_str(ctrl_parts, "expires", "0");
+
+ std::string err;
+ const uint64_t expires_timestamp =
+ static_cast<uint64_t>(strict_strtoll(expires.c_str(), 10, &err));
+
+ if (! err.empty()) {
+ ldpp_dout(this, 5) << "failed to parse FormPost's expires: " << err << dendl;
+ return false;
+ }
+
+ const utime_t now = ceph_clock_now();
+ /* cmp_less_equal avoids signed/unsigned comparison pitfalls vs now.sec(). */
+ if (std::cmp_less_equal(expires_timestamp, now.sec())) {
+ ldpp_dout(this, 5) << "FormPost form expired: "
+ << expires_timestamp << " <= " << now.sec() << dendl;
+ return false;
+ }
+
+ return true;
+}
+
+/* Verify the form's "signature" control part against every temp_url_key
+ * configured on the account owner. Returns true as soon as one key's
+ * computed HMAC matches; false when the owner cannot be resolved or no key
+ * matches. Also installs the owner's identity into s->auth as a side effect. */
+bool RGWFormPost::is_integral()
+{
+ const std::string form_signature = get_part_str(ctrl_parts, "signature");
+
+ try {
+ get_owner_info(s, s->user->get_info());
+ s->auth.identity = rgw::auth::transform_old_authinfo(s);
+ } catch (...) {
+ ldpp_dout(this, 5) << "cannot get user_info of account's owner" << dendl;
+ return false;
+ }
+
+ for (const auto& kv : s->user->get_info().temp_url_keys) {
+ const int temp_url_key_num = kv.first;
+ const string& temp_url_key = kv.second;
+
+ if (temp_url_key.empty()) {
+ continue;
+ }
+
+ /* Signature covers the request URI plus the form's control parts. */
+ SignatureHelper sig_helper;
+ sig_helper.calc(temp_url_key,
+ s->info.request_uri,
+ get_part_str(ctrl_parts, "redirect"),
+ get_part_str(ctrl_parts, "max_file_size", "0"),
+ get_part_str(ctrl_parts, "max_file_count", "0"),
+ get_part_str(ctrl_parts, "expires", "0"));
+
+ const auto local_sig = sig_helper.get_signature();
+
+ ldpp_dout(this, 20) << "FormPost signature [" << temp_url_key_num << "]"
+ << " (calculated): " << local_sig << dendl;
+
+ if (sig_helper.is_equal_to(form_signature)) {
+ return true;
+ } else {
+ ldpp_dout(this, 5) << "FormPost's signature mismatch: "
+ << local_sig << " != " << form_signature << dendl;
+ }
+ }
+
+ return false;
+}
+
+/* Resolve the user info of the target bucket's owner (whose temp_url_keys
+ * authorize the FormPost). Throws a negative errno (-EPERM or a driver error)
+ * on failure; is_integral() catches it. The optional account name in the URL
+ * is tried first in tenanted form, then as a plain uid. */
+void RGWFormPost::get_owner_info(const req_state* const s,
+ RGWUserInfo& owner_info) const
+{
+ /* We cannot use req_state::bucket_name because it isn't available
+ * now. It will be initialized in RGWHandler_REST_SWIFT::postauth_init(). */
+ const string& bucket_name = s->init_state.url_bucket;
+
+ std::unique_ptr<rgw::sal::User> user;
+
+ /* TempURL in Formpost only requires that bucket name is specified. */
+ if (bucket_name.empty()) {
+ throw -EPERM;
+ }
+
+ if (!s->account_name.empty()) {
+ RGWUserInfo uinfo;
+ bool found = false;
+
+ const rgw_user uid(s->account_name);
+ if (uid.tenant.empty()) {
+ /* First try the Swift convention of tenant == uid. */
+ const rgw_user tenanted_uid(uid.id, uid.id);
+ user = driver->get_user(tenanted_uid);
+
+ if (user->load_user(s, s->yield) >= 0) {
+ /* Succeeded. */
+ found = true;
+ }
+ }
+
+ if (!found) {
+ user = driver->get_user(uid);
+ if (user->load_user(s, s->yield) < 0) {
+ throw -EPERM;
+ }
+ }
+ }
+
+ /* Need to get user info of bucket owner. */
+ std::unique_ptr<rgw::sal::Bucket> bucket;
+ int ret = driver->get_bucket(s, user.get(), user->get_tenant(), bucket_name, &bucket, s->yield);
+ if (ret < 0) {
+ throw ret;
+ }
+
+ ldpp_dout(this, 20) << "temp url user (bucket owner): " << bucket->get_info().owner
+ << dendl;
+
+ user = driver->get_user(bucket->get_info().owner);
+ if (user->load_user(s, s->yield) < 0) {
+ throw -EPERM;
+ }
+
+ owner_info = user->get_info();
+}
+
+/* Parse the multipart/form-data body up to (and excluding) the first file
+ * part, collecting the control parts (signature, expires, redirect, ...),
+ * then validate expiry and signature. Returns 0 on success or a negative
+ * errno with err_msg set for the client. */
+int RGWFormPost::get_params(optional_yield y)
+{
+ /* The parent class extracts boundary info from the Content-Type. */
+ int ret = RGWPostObj_ObjStore::get_params(y);
+ if (ret < 0) {
+ return ret;
+ }
+
+ policy.create_default(s->user->get_id(), s->user->get_display_name());
+
+ /* Let's start parsing the HTTP body by parsing each form part step-
+ * by-step till encountering the first part with file data. */
+ do {
+ struct post_form_part part;
+ ret = read_form_part_header(&part, stream_done);
+ if (ret < 0) {
+ return ret;
+ }
+
+ if (s->cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
+ ldpp_dout(this, 20) << "read part header -- part.name="
+ << part.name << dendl;
+
+ for (const auto& pair : part.fields) {
+ ldpp_dout(this, 20) << "field.name=" << pair.first << dendl;
+ ldpp_dout(this, 20) << "field.val=" << pair.second.val << dendl;
+ ldpp_dout(this, 20) << "field.params:" << dendl;
+
+ for (const auto& param_pair : pair.second.params) {
+ ldpp_dout(this, 20) << " " << param_pair.first
+ << " -> " << param_pair.second << dendl;
+ }
+ }
+ }
+
+ if (stream_done) {
+ /* Unexpected here. */
+ err_msg = "Malformed request";
+ return -EINVAL;
+ }
+
+ /* A part carrying a "filename" param is a data part, not a control part. */
+ const auto field_iter = part.fields.find("Content-Disposition");
+ if (std::end(part.fields) != field_iter &&
+ std::end(field_iter->second.params) != field_iter->second.params.find("filename")) {
+ /* First data part ahead. */
+ current_data_part = std::move(part);
+
+ /* Stop the iteration. We can assume that all control parts have been
+ * already parsed. The rest of HTTP body should contain data parts
+ * only. They will be picked up by ::get_data(). */
+ break;
+ } else {
+ /* Control part ahead. Receive, parse and store for later usage. */
+ bool boundary;
+ ret = read_data(part.data, s->cct->_conf->rgw_max_chunk_size,
+ boundary, stream_done);
+ if (ret < 0) {
+ return ret;
+ } else if (! boundary) {
+ err_msg = "Couldn't find boundary";
+ return -EINVAL;
+ }
+
+ ctrl_parts[part.name] = std::move(part);
+ }
+ } while (! stream_done);
+
+ min_len = 0;
+ max_len = get_max_file_size();
+
+ if (! current_data_part) {
+ err_msg = "FormPost: no files to process";
+ return -EINVAL;
+ }
+
+ if (! is_non_expired()) {
+ err_msg = "FormPost: Form Expired";
+ return -EPERM;
+ }
+
+ if (! is_integral()) {
+ err_msg = "FormPost: Invalid Signature";
+ return -EPERM;
+ }
+
+ return 0;
+}
+
+/* Target object name for the part being uploaded: the request prefix plus
+ * the part's Content-Disposition "filename" param (prefix alone if absent). */
+std::string RGWFormPost::get_current_filename() const
+{
+ try {
+ const auto& field = current_data_part->fields.at("Content-Disposition");
+ const auto iter = field.params.find("filename");
+
+ if (std::end(field.params) != iter) {
+ return prefix + iter->second;
+ }
+ } catch (std::out_of_range&) {
+ /* NOP */;
+ }
+
+ return prefix;
+}
+
+/* Content-Type declared for the current data part, or an empty string when
+ * the part carries no Content-Type field. */
+std::string RGWFormPost::get_current_content_type() const
+{
+ try {
+ const auto& field = current_data_part->fields.at("Content-Type");
+ return field.val;
+ } catch (std::out_of_range&) {
+ /* NOP */;
+ }
+
+ return std::string();
+}
+
+/* Advance to the next part of the body. Returns true (and makes the part
+ * current) only when the stream still has data and the next part carries a
+ * non-empty "filename" param; everything else ends the upload loop. */
+bool RGWFormPost::is_next_file_to_upload()
+{
+ if (! stream_done) {
+ /* We have at least one additional part in the body. */
+ struct post_form_part part;
+ int r = read_form_part_header(&part, stream_done);
+ if (r < 0) {
+ return false;
+ }
+
+ const auto field_iter = part.fields.find("Content-Disposition");
+ if (std::end(part.fields) != field_iter) {
+ const auto& params = field_iter->second.params;
+ const auto& filename_iter = params.find("filename");
+
+ if (std::end(params) != filename_iter && ! filename_iter->second.empty()) {
+ current_data_part = std::move(part);
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+/* Read the next chunk of the current data part into bl. Sets `again` while
+ * the part's boundary has not been reached yet. Returns the number of bytes
+ * read, or a negative errno. */
+int RGWFormPost::get_data(ceph::bufferlist& bl, bool& again)
+{
+ bool boundary;
+
+ int r = read_data(bl, s->cct->_conf->rgw_max_chunk_size,
+ boundary, stream_done);
+ if (r < 0) {
+ return r;
+ }
+
+ /* Tell RGWPostObj::execute(optional_yield y) that it has some data to put. */
+ again = !boundary;
+
+ return bl.length();
+}
+
+/* Send the FormPost response. When the form supplied a "redirect" control
+ * part, the status is forced to a redirect and a Location header is emitted. */
+void RGWFormPost::send_response()
+{
+ std::string redirect = get_part_str(ctrl_parts, "redirect");
+ if (! redirect.empty()) {
+ op_ret = STATUS_REDIRECT;
+ }
+
+ set_req_state_err(s, op_ret);
+ s->err.err_code = err_msg;
+ dump_errno(s);
+ if (! redirect.empty()) {
+ dump_redirect(s, redirect);
+ }
+ end_header(s, this);
+}
+
+/* A request is a FormPost when its Content-Type is multipart/form-data
+ * (case-insensitive) and declares a boundary parameter. */
+bool RGWFormPost::is_formpost_req(req_state* const s)
+{
+ std::string content_type;
+ std::map<std::string, std::string> params;
+
+ parse_boundary_params(s->info.env->get("CONTENT_TYPE", ""),
+ content_type, params);
+
+ return boost::algorithm::iequals(content_type, "multipart/form-data") &&
+ params.count("boundary") > 0;
+}
+
+
+/* Operation dispatch for account-level (service) Swift requests:
+ * GET lists buckets, HEAD stats the account, PUT handles bulk-upload
+ * archives only, POST handles bulk-delete or account metadata, DELETE is
+ * only valid for bulk-delete. */
+RGWOp *RGWHandler_REST_Service_SWIFT::op_get()
+{
+ return new RGWListBuckets_ObjStore_SWIFT;
+}
+
+RGWOp *RGWHandler_REST_Service_SWIFT::op_head()
+{
+ return new RGWStatAccount_ObjStore_SWIFT;
+}
+
+RGWOp *RGWHandler_REST_Service_SWIFT::op_put()
+{
+ if (s->info.args.exists("extract-archive")) {
+ return new RGWBulkUploadOp_ObjStore_SWIFT;
+ }
+ return nullptr;
+}
+
+RGWOp *RGWHandler_REST_Service_SWIFT::op_post()
+{
+ if (s->info.args.exists("bulk-delete")) {
+ return new RGWBulkDelete_ObjStore_SWIFT;
+ }
+ return new RGWPutMetadataAccount_ObjStore_SWIFT;
+}
+
+RGWOp *RGWHandler_REST_Service_SWIFT::op_delete()
+{
+ if (s->info.args.exists("bulk-delete")) {
+ return new RGWBulkDelete_ObjStore_SWIFT;
+ }
+ return NULL;
+}
+
+/* Serve the static-website custom error document "<http_ret><error_doc>"
+ * with the original HTTP status preserved. Resets the formatter first and
+ * re-runs request processing with a GetObj op pointed at the error page. */
+int RGWSwiftWebsiteHandler::serve_errordoc(const int http_ret,
+ const std::string error_doc,
+ optional_yield y)
+{
+ /* Try to throw it all away. */
+ s->formatter->reset();
+
+ class RGWGetErrorPage : public RGWGetObj_ObjStore_SWIFT {
+ public:
+ RGWGetErrorPage(rgw::sal::Driver* const driver,
+ RGWHandler_REST* const handler,
+ req_state* const s,
+ const int http_ret) {
+ /* Calling a virtual from the base class is safe as the subobject should
+ * be properly initialized and we haven't overridden the init method. */
+ init(driver, s, handler);
+ set_get_data(true);
+ set_custom_http_response(http_ret);
+ }
+
+ int error_handler(const int err_no,
+ std::string* const error_content, optional_yield y) override {
+ /* Enforce that any error generated while getting the error page will
+ * not be send to a client. This allows us to recover from the double
+ * fault situation by sending the original message. */
+ return 0;
+ }
+ } get_errpage_op(driver, handler, s, http_ret);
+
+ /* This is okay. It's an error, so nothing will run after this, and it can be
+ * called by abort_early(), which can be called before s->object or s->bucket
+ * are set up. */
+ if (!rgw::sal::Bucket::empty(s->bucket.get())) {
+ s->object = s->bucket->get_object(rgw_obj_key(std::to_string(http_ret) + error_doc));
+ } else {
+ s->object = driver->get_object(rgw_obj_key(std::to_string(http_ret) + error_doc));
+ }
+
+ RGWOp* newop = &get_errpage_op;
+ RGWRequest req(0);
+ return rgw_process_authenticated(handler, newop, &req, s, y, driver, true);
+}
+
+/* Website-aware error hook: when the request qualifies as a website request
+ * and the bucket configures an error document, serve it; otherwise fall back
+ * to the default behaviour by returning err_no unchanged. */
+int RGWSwiftWebsiteHandler::error_handler(const int err_no,
+ std::string* const error_content,
+ optional_yield y)
+{
+ if (!s->bucket.get()) {
+ /* No bucket, default no-op handler */
+ return err_no;
+ }
+
+ const auto& ws_conf = s->bucket->get_info().website_conf;
+
+ if (can_be_website_req() && ! ws_conf.error_doc.empty()) {
+ set_req_state_err(s, err_no);
+ return serve_errordoc(s->err.http_ret, ws_conf.error_doc, y);
+ }
+
+ /* Let's go to the default, no-op handler. */
+ return err_no;
+}
+
+/* True when the client explicitly requested website semantics via the
+ * X-Web-Mode: true header (case-insensitive). */
+bool RGWSwiftWebsiteHandler::is_web_mode() const
+{
+ const std::string_view webmode = s->info.env->get("HTTP_X_WEB_MODE", "");
+ return boost::algorithm::iequals(webmode, "true");
+}
+
+/* Decide whether this request may be handled as a static-website request:
+ * GET/HEAD only, and either anonymous (or unauthenticated) or explicitly
+ * opted in via X-Web-Mode. */
+bool RGWSwiftWebsiteHandler::can_be_website_req() const
+{
+ /* Static website works only with the GET or HEAD method. Nothing more. */
+ static const std::set<std::string_view> ws_methods = { "GET", "HEAD" };
+ if (ws_methods.count(s->info.method) == 0) {
+ return false;
+ }
+
+ /* We also need to handle early failures from the auth system. In such cases
+ * req_state::auth.identity may be empty. Let's treat that the same way as
+ * the anonymous access. */
+ if (! s->auth.identity) {
+ return true;
+ }
+
+ /* Swift serves websites only for anonymous requests unless client explicitly
+ * requested this behaviour by supplying X-Web-Mode HTTP header set to true. */
+ if (s->auth.identity->is_anonymous() || is_web_mode()) {
+ return true;
+ }
+
+ return false;
+}
+
+/* Build an op that answers with a permanent redirect to the same URI with a
+ * trailing slash appended — used to canonicalize directory-style requests. */
+RGWOp* RGWSwiftWebsiteHandler::get_ws_redirect_op()
+{
+ class RGWMovedPermanently: public RGWOp {
+ const std::string location;
+ public:
+ explicit RGWMovedPermanently(const std::string& location)
+ : location(location) {
+ }
+
+ int verify_permission(optional_yield) override {
+ return 0;
+ }
+
+ void execute(optional_yield) override {
+ op_ret = -ERR_PERMANENT_REDIRECT;
+ return;
+ }
+
+ void send_response() override {
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ dump_content_length(s, 0);
+ dump_redirect(s, location);
+ end_header(s, this);
+ }
+
+ const char* name() const override {
+ return "RGWMovedPermanently";
+ }
+ };
+
+ return new RGWMovedPermanently(s->info.request_uri + '/');
+}
+
+/* Build a GetObj op retargeted at the website index document (appended to
+ * the current object name when one is present). Data is fetched for GET
+ * only; HEAD gets headers alone. */
+RGWOp* RGWSwiftWebsiteHandler::get_ws_index_op()
+{
+ /* Retarget to get obj on requested index file. */
+ if (! s->object->empty()) {
+ s->object->set_name(s->object->get_name() +
+ s->bucket->get_info().website_conf.get_index_doc());
+ } else {
+ s->object->set_name(s->bucket->get_info().website_conf.get_index_doc());
+ }
+
+ auto getop = new RGWGetObj_ObjStore_SWIFT;
+ getop->set_get_data(boost::algorithm::equals("GET", s->info.method));
+
+ return getop;
+}
+
+/* Build a bucket-listing op that renders an HTML directory listing for the
+ * website mode, using the current object name as the listing prefix (the
+ * object key is cleared since the op lists the bucket, not an object). */
+RGWOp* RGWSwiftWebsiteHandler::get_ws_listing_op()
+{
+ class RGWWebsiteListing : public RGWListBucket_ObjStore_SWIFT {
+ const std::string prefix_override;
+
+ int get_params(optional_yield) override {
+ prefix = prefix_override;
+ max = default_max;
+ delimiter = "/";
+ return 0;
+ }
+
+ void send_response() override {
+ /* Generate the header now. */
+ set_req_state_err(s, op_ret);
+ dump_errno(s);
+ dump_container_metadata(s, s->bucket.get(), quota.bucket_quota,
+ s->bucket->get_info().website_conf);
+ end_header(s, this, "text/html");
+ if (op_ret < 0) {
+ return;
+ }
+
+ /* Now it's the time to start generating HTML bucket listing.
+ * All the crazy stuff with crafting tags will be delegated to
+ * RGWSwiftWebsiteListingFormatter. */
+ std::stringstream ss;
+ RGWSwiftWebsiteListingFormatter htmler(ss, prefix);
+
+ const auto& ws_conf = s->bucket->get_info().website_conf;
+ htmler.generate_header(s->decoded_uri,
+ ws_conf.listing_css_doc);
+
+ for (const auto& pair : common_prefixes) {
+ std::string subdir_name = pair.first;
+ if (! subdir_name.empty()) {
+ /* To be compliant with Swift we need to remove the trailing
+ * slash. */
+ subdir_name.pop_back();
+ }
+
+ htmler.dump_subdir(subdir_name);
+ }
+
+ for (const rgw_bucket_dir_entry& obj : objs) {
+ if (! common_prefixes.count(obj.key.name + '/')) {
+ htmler.dump_object(obj);
+ }
+ }
+
+ htmler.generate_footer();
+ dump_body(s, ss.str());
+ }
+ public:
+ /* Taking prefix_override by value to leverage std::string r-value ref
+ * ctor and thus avoid extra memory copying/increasing ref counter. */
+ explicit RGWWebsiteListing(std::string prefix_override)
+ : prefix_override(std::move(prefix_override)) {
+ }
+ };
+
+ std::string prefix = std::move(s->object->get_name());
+ s->object->set_key(rgw_obj_key());
+
+ return new RGWWebsiteListing(std::move(prefix));
+}
+
+/* Check whether the requested object is a "subdirectory marker": an existing
+ * object whose content type matches the configured subdir marker (default
+ * "application/directory") and whose size is at most 1 byte. */
+bool RGWSwiftWebsiteHandler::is_web_dir() const
+{
+ std::string subdir_name = url_decode(s->object->get_name());
+
+ /* Remove character from the subdir name if it is "/". */
+ if (subdir_name.empty()) {
+ return false;
+ } else if (subdir_name.back() == '/') {
+ subdir_name.pop_back();
+ if (subdir_name.empty()) {
+ return false;
+ }
+ }
+
+ std::unique_ptr<rgw::sal::Object> obj = s->bucket->get_object(rgw_obj_key(std::move(subdir_name)));
+
+ /* First, get attrset of the object we'll try to retrieve. */
+ obj->set_atomic();
+ obj->set_prefetch_data();
+
+ RGWObjState* state = nullptr;
+ if (obj->get_obj_state(s, &state, s->yield, false)) {
+ return false;
+ }
+
+ /* A nonexistent object cannot be considered as a marker representing
+ * the emulation of catalog in FS hierarchy. */
+ if (! state->exists) {
+ return false;
+ }
+
+ /* Decode the content type. */
+ std::string content_type;
+ get_contype_from_attrs(state->attrset, content_type);
+
+ const auto& ws_conf = s->bucket->get_info().website_conf;
+ const std::string subdir_marker = ws_conf.subdir_marker.empty()
+ ? "application/directory"
+ : ws_conf.subdir_marker;
+ return subdir_marker == content_type && state->size <= 1;
+}
+
+/* True when the named index document exists in the current bucket. */
+bool RGWSwiftWebsiteHandler::is_index_present(const std::string& index) const
+{
+ std::unique_ptr<rgw::sal::Object> obj = s->bucket->get_object(rgw_obj_key(index));
+
+ obj->set_atomic();
+ obj->set_prefetch_data();
+
+ RGWObjState* state = nullptr;
+ if (obj->get_obj_state(s, &state, s->yield, false)) {
+ return false;
+ }
+
+ /* A nonexistent object cannot be considered as a viable index. We will
+ * try to list the bucket or - if this is impossible - return an error. */
+ return state->exists;
+}
+
+/* Possibly replace a bucket-level op with a website op (redirect without a
+ * trailing slash, index doc, or HTML listing). Leaves *new_op == op when no
+ * override applies; in forced web mode that case yields -ENOENT. */
+int RGWSwiftWebsiteHandler::retarget_bucket(RGWOp* op, RGWOp** new_op)
+{
+ ldpp_dout(s, 10) << "Starting retarget" << dendl;
+ RGWOp* op_override = nullptr;
+
+ /* In Swift static web content is served if the request is anonymous or
+ * has X-Web-Mode HTTP header specified to true. */
+ if (can_be_website_req()) {
+ const auto& ws_conf = s->bucket->get_info().website_conf;
+ const auto& index = s->bucket->get_info().website_conf.get_index_doc();
+
+ if (s->decoded_uri.back() != '/') {
+ op_override = get_ws_redirect_op();
+ } else if (! index.empty() && is_index_present(index)) {
+ op_override = get_ws_index_op();
+ } else if (ws_conf.listing_enabled) {
+ op_override = get_ws_listing_op();
+ }
+ }
+
+ if (op_override) {
+ handler->put_op(op);
+ op_override->init(driver, s, handler);
+
+ *new_op = op_override;
+ } else {
+ *new_op = op;
+ }
+
+ /* Return 404 Not Found if the request has web mode enforced but static web
+ * wasn't able to serve it accordingly. */
+ return ! op_override && is_web_mode() ? -ENOENT : 0;
+}
+
+/* Possibly replace an object-level op with a website op, but only when the
+ * target object is a subdirectory marker (see is_web_dir()). A regular
+ * object request passes through untouched; a marker we cannot retarget for
+ * yields -ENOENT. */
+int RGWSwiftWebsiteHandler::retarget_object(RGWOp* op, RGWOp** new_op)
+{
+ ldpp_dout(s, 10) << "Starting object retarget" << dendl;
+ RGWOp* op_override = nullptr;
+
+ /* In Swift static web content is served if the request is anonymous or
+ * has X-Web-Mode HTTP header specified to true. */
+ if (can_be_website_req() && is_web_dir()) {
+ const auto& ws_conf = s->bucket->get_info().website_conf;
+ const auto& index = s->bucket->get_info().website_conf.get_index_doc();
+
+ if (s->decoded_uri.back() != '/') {
+ op_override = get_ws_redirect_op();
+ } else if (! index.empty() && is_index_present(index)) {
+ op_override = get_ws_index_op();
+ } else if (ws_conf.listing_enabled) {
+ op_override = get_ws_listing_op();
+ }
+ } else {
+ /* A regular request or the specified object isn't a subdirectory marker.
+ * We don't need any re-targeting. Error handling (like sending a custom
+ * error page) will be performed by error_handler of the actual RGWOp. */
+ return 0;
+ }
+
+ if (op_override) {
+ handler->put_op(op);
+ op_override->init(driver, s, handler);
+
+ *new_op = op_override;
+ } else {
+ *new_op = op;
+ }
+
+ /* Return 404 Not Found if we aren't able to re-target for subdir marker. */
+ return ! op_override ? -ENOENT : 0;
+}
+
+
+/* Operation dispatch for container-level Swift requests. ACL query args win
+ * first; GET lists, HEAD stats, PUT creates (or bulk-uploads an archive),
+ * POST is either FormPost or container metadata, OPTIONS serves CORS. */
+RGWOp *RGWHandler_REST_Bucket_SWIFT::get_obj_op(bool get_data)
+{
+ if (is_acl_op()) {
+ return new RGWGetACLs_ObjStore_SWIFT;
+ }
+
+ if (get_data)
+ return new RGWListBucket_ObjStore_SWIFT;
+ else
+ return new RGWStatBucket_ObjStore_SWIFT;
+}
+
+RGWOp *RGWHandler_REST_Bucket_SWIFT::op_get()
+{
+ return get_obj_op(true);
+}
+
+RGWOp *RGWHandler_REST_Bucket_SWIFT::op_head()
+{
+ return get_obj_op(false);
+}
+
+RGWOp *RGWHandler_REST_Bucket_SWIFT::op_put()
+{
+ if (is_acl_op()) {
+ return new RGWPutACLs_ObjStore_SWIFT;
+ }
+ if(s->info.args.exists("extract-archive")) {
+ return new RGWBulkUploadOp_ObjStore_SWIFT;
+ }
+ return new RGWCreateBucket_ObjStore_SWIFT;
+}
+
+RGWOp *RGWHandler_REST_Bucket_SWIFT::op_delete()
+{
+ return new RGWDeleteBucket_ObjStore_SWIFT;
+}
+
+RGWOp *RGWHandler_REST_Bucket_SWIFT::op_post()
+{
+ if (RGWFormPost::is_formpost_req(s)) {
+ return new RGWFormPost;
+ } else {
+ return new RGWPutMetadataBucket_ObjStore_SWIFT;
+ }
+}
+
+RGWOp *RGWHandler_REST_Bucket_SWIFT::op_options()
+{
+ return new RGWOptionsCORS_ObjStore_SWIFT;
+}
+
+
+/* Operation dispatch for object-level Swift requests. ACL query args win
+ * first; PUT becomes a copy when a source bucket was parsed from the request,
+ * POST is either FormPost or object metadata, OPTIONS serves CORS. */
+RGWOp *RGWHandler_REST_Obj_SWIFT::get_obj_op(bool get_data)
+{
+ if (is_acl_op()) {
+ return new RGWGetACLs_ObjStore_SWIFT;
+ }
+
+ RGWGetObj_ObjStore_SWIFT *get_obj_op = new RGWGetObj_ObjStore_SWIFT;
+ get_obj_op->set_get_data(get_data);
+ return get_obj_op;
+}
+
+RGWOp *RGWHandler_REST_Obj_SWIFT::op_get()
+{
+ return get_obj_op(true);
+}
+
+RGWOp *RGWHandler_REST_Obj_SWIFT::op_head()
+{
+ return get_obj_op(false);
+}
+
+RGWOp *RGWHandler_REST_Obj_SWIFT::op_put()
+{
+ if (is_acl_op()) {
+ return new RGWPutACLs_ObjStore_SWIFT;
+ }
+ if(s->info.args.exists("extract-archive")) {
+ return new RGWBulkUploadOp_ObjStore_SWIFT;
+ }
+ if (s->init_state.src_bucket.empty())
+ return new RGWPutObj_ObjStore_SWIFT;
+ else
+ return new RGWCopyObj_ObjStore_SWIFT;
+}
+
+RGWOp *RGWHandler_REST_Obj_SWIFT::op_delete()
+{
+ return new RGWDeleteObj_ObjStore_SWIFT;
+}
+
+RGWOp *RGWHandler_REST_Obj_SWIFT::op_post()
+{
+ if (RGWFormPost::is_formpost_req(s)) {
+ return new RGWFormPost;
+ } else {
+ return new RGWPutMetadataObject_ObjStore_SWIFT;
+ }
+}
+
+RGWOp *RGWHandler_REST_Obj_SWIFT::op_copy()
+{
+ return new RGWCopyObj_ObjStore_SWIFT;
+}
+
+RGWOp *RGWHandler_REST_Obj_SWIFT::op_options()
+{
+ return new RGWOptionsCORS_ObjStore_SWIFT;
+}
+
+
+/* Run the configured Swift auth strategy chain against the request. */
+int RGWHandler_REST_SWIFT::authorize(const DoutPrefixProvider *dpp, optional_yield y)
+{
+ return rgw::auth::Strategy::apply(dpp, auth_strategy, s, y);
+}
+
+/* Finish request setup once authentication resolved the user: pick the
+ * bucket tenant (URL account for anonymous access when rgw_swift_account_in_url
+ * is set, otherwise the user's tenant), materialize an empty object if none,
+ * and validate tenant/bucket/object names — including the copy source when
+ * the request carries one. */
+int RGWHandler_REST_SWIFT::postauth_init(optional_yield y)
+{
+ struct req_init_state* t = &s->init_state;
+
+ /* XXX Stub this until Swift Auth sets account into URL. */
+ if (g_conf()->rgw_swift_account_in_url
+ && s->user->get_id().id == RGW_USER_ANON_ID) {
+ s->bucket_tenant = s->account_name;
+ } else {
+ s->bucket_tenant = s->user->get_tenant();
+ }
+ s->bucket_name = t->url_bucket;
+
+ if (!s->object) {
+ /* Need an object, even an empty one */
+ s->object = driver->get_object(rgw_obj_key());
+ }
+
+ ldpp_dout(s, 10) << "s->object=" <<
+ (!s->object->empty() ? s->object->get_key() : rgw_obj_key("<NULL>"))
+ << " s->bucket="
+ << rgw_make_bucket_entry_name(s->bucket_tenant, s->bucket_name)
+ << dendl;
+
+ int ret;
+ ret = rgw_validate_tenant_name(s->bucket_tenant);
+ if (ret)
+ return ret;
+ ret = validate_bucket_name(s->bucket_name);
+ if (ret)
+ return ret;
+ ret = validate_object_name(s->object->get_name());
+ if (ret)
+ return ret;
+
+ if (!t->src_bucket.empty()) {
+ /*
+ * We don't allow cross-tenant copy at present. It requires account
+ * names in the URL for Swift.
+ */
+ s->src_tenant_name = s->user->get_tenant();
+ s->src_bucket_name = t->src_bucket;
+
+ ret = validate_bucket_name(s->src_bucket_name);
+ if (ret < 0) {
+ return ret;
+ }
+ ret = validate_object_name(s->src_object->get_name());
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+/* Validate a Swift container name: length limit, no leading '.', valid
+ * UTF-8, and no 0xff or '/' bytes. An empty name is accepted (it denotes an
+ * account-level request). Returns 0 or a negative ERR_* code. */
+int RGWHandler_REST_SWIFT::validate_bucket_name(const string& bucket)
+{
+ const size_t len = bucket.size();
+
+ if (len > MAX_BUCKET_NAME_LEN) {
+ /* Bucket Name too long. Generate custom error message and bind it
+ * to an R-value reference. */
+ const auto msg = boost::str(
+ boost::format("Container name length of %lld longer than %lld")
+ % len % int(MAX_BUCKET_NAME_LEN));
+ set_req_state_err(s, ERR_INVALID_BUCKET_NAME, msg);
+ return -ERR_INVALID_BUCKET_NAME;
+ }
+
+
+ if (len == 0)
+ return 0;
+
+ if (bucket[0] == '.')
+ return -ERR_INVALID_BUCKET_NAME;
+
+ if (check_utf8(bucket.c_str(), len))
+ return -ERR_INVALID_UTF8;
+
+ const char *s = bucket.c_str();
+
+ for (size_t i = 0; i < len; ++i, ++s) {
+ if (*(unsigned char *)s == 0xff)
+ return -ERR_INVALID_BUCKET_NAME;
+ if (*(unsigned char *)s == '/')
+ return -ERR_INVALID_BUCKET_NAME;
+ }
+
+ return 0;
+}
+
+/* Split off the next delimiter-separated token from `str` into `tok`,
+ * consuming it (and the delimiter) from `str`. When no delimiter is found
+ * past position 0, the whole remainder becomes the token and `str` empties. */
+static void next_tok(string& str, string& tok, char delim)
+{
+ if (str.size() == 0) {
+ tok = "";
+ return;
+ }
+ tok = str;
+ int pos = str.find(delim);
+ if (pos > 0) {
+ tok = str.substr(0, pos);
+ str = str.substr(pos + 1);
+ } else {
+ str = "";
+ }
+}
+
+/* Parse a Swift request URI ("[prefix]/v1[/AUTH_account]/container/object")
+ * into req_state: query args, account name, url_bucket and object key.
+ * Returns 0 on success, -ERR_BAD_URL / -ENOENT / -ERR_PRECONDITION_FAILED on
+ * malformed or mismatching URIs. */
+int RGWHandler_REST_SWIFT::init_from_header(rgw::sal::Driver* driver,
+ req_state* const s,
+ const std::string& frontend_prefix)
+{
+ string req;
+ string first;
+
+ s->prot_flags |= RGW_REST_SWIFT;
+
+ /* NOTE(review): VLA + sprintf; sizes are derived from the two inputs so the
+ * buffer fits, but snprintf would be the safer idiom here. */
+ char reqbuf[frontend_prefix.length() + s->decoded_uri.length() + 1];
+ sprintf(reqbuf, "%s%s", frontend_prefix.c_str(), s->decoded_uri.c_str());
+ const char *req_name = reqbuf;
+
+ const char *p;
+
+ if (*req_name == '?') {
+ p = req_name;
+ } else {
+ p = s->info.request_params.c_str();
+ }
+
+ s->info.args.set(p);
+ s->info.args.parse(s);
+
+ /* Skip the leading slash of URL hierarchy. */
+ if (req_name[0] != '/') {
+ return 0;
+ } else {
+ req_name++;
+ }
+
+ if ('\0' == req_name[0]) {
+ return g_conf()->rgw_swift_url_prefix == "/" ? -ERR_BAD_URL : 0;
+ }
+
+ req = req_name;
+
+ size_t pos = req.find('/');
+ if (std::string::npos != pos && g_conf()->rgw_swift_url_prefix != "/") {
+ bool cut_url = g_conf()->rgw_swift_url_prefix.length();
+ first = req.substr(0, pos);
+
+ if (first.compare(g_conf()->rgw_swift_url_prefix) == 0) {
+ if (cut_url) {
+ /* Rewind to the "v1/..." part. */
+ next_tok(req, first, '/');
+ }
+ }
+ } else if (req.compare(g_conf()->rgw_swift_url_prefix) == 0) {
+ s->formatter = new RGWFormatter_Plain;
+ return -ERR_BAD_URL;
+ } else {
+ first = req;
+ }
+
+ std::string tenant_path;
+ if (! g_conf()->rgw_swift_tenant_name.empty()) {
+ tenant_path = "/AUTH_";
+ tenant_path.append(g_conf()->rgw_swift_tenant_name);
+ }
+
+ /* verify that the request_uri conforms with what's expected */
+ char buf[g_conf()->rgw_swift_url_prefix.length() + 16 + tenant_path.length()];
+ int blen;
+ if (g_conf()->rgw_swift_url_prefix == "/") {
+ blen = sprintf(buf, "/v1%s", tenant_path.c_str());
+ } else {
+ blen = sprintf(buf, "/%s/v1%s",
+ g_conf()->rgw_swift_url_prefix.c_str(), tenant_path.c_str());
+ }
+
+ if (strncmp(reqbuf, buf, blen) != 0) {
+ return -ENOENT;
+ }
+
+ int ret = allocate_formatter(s, RGWFormat::PLAIN, true);
+ if (ret < 0)
+ return ret;
+
+ string ver;
+
+ next_tok(req, ver, '/');
+
+ if (!tenant_path.empty() || g_conf()->rgw_swift_account_in_url) {
+ string account_name;
+ next_tok(req, account_name, '/');
+
+ /* Erase all pre-defined prefixes like "AUTH_" or "KEY_". */
+ const vector<string> skipped_prefixes = { "AUTH_", "KEY_" };
+
+ for (const auto& pfx : skipped_prefixes) {
+ const size_t comp_len = min(account_name.length(), pfx.length());
+ if (account_name.compare(0, comp_len, pfx) == 0) {
+ /* Prefix is present. Drop it. */
+ account_name = account_name.substr(comp_len);
+ break;
+ }
+ }
+
+ if (account_name.empty()) {
+ return -ERR_PRECONDITION_FAILED;
+ } else {
+ s->account_name = account_name;
+ }
+ }
+
+ next_tok(req, first, '/');
+
+ ldpp_dout(s, 10) << "ver=" << ver << " first=" << first << " req=" << req << dendl;
+ if (first.size() == 0)
+ return 0;
+
+ s->info.effective_uri = "/" + first;
+
+ // Save bucket to tide us over until token is parsed.
+ s->init_state.url_bucket = first;
+
+ if (req.size()) {
+ s->object = driver->get_object(
+ rgw_obj_key(req, s->info.env->get("HTTP_X_OBJECT_VERSION_ID", ""))); /* rgw swift extension */
+ s->info.effective_uri.append("/" + s->object->get_name());
+ }
+
+ return 0;
+}
+
+/* Dialect-specific request init: record the "swift" dialect, resolve an
+ * X-Copy-From source if present, rewrite a COPY (with Destination header)
+ * into an equivalent PUT, pick up the storage class header, then delegate to
+ * the generic REST init. */
+int RGWHandler_REST_SWIFT::init(rgw::sal::Driver* driver, req_state* s,
+ rgw::io::BasicClient *cio)
+{
+ struct req_init_state *t = &s->init_state;
+
+ s->dialect = "swift";
+
+ std::string copy_source = s->info.env->get("HTTP_X_COPY_FROM", "");
+ if (! copy_source.empty()) {
+ rgw_obj_key key;
+ bool result = RGWCopyObj::parse_copy_location(copy_source, t->src_bucket, key, s);
+ if (!result)
+ return -ERR_BAD_URL;
+ s->src_object = driver->get_object(key);
+ if (!s->src_object)
+ return -ERR_BAD_URL;
+ }
+
+ if (s->op == OP_COPY) {
+ std::string req_dest = s->info.env->get("HTTP_DESTINATION", "");
+ if (req_dest.empty())
+ return -ERR_BAD_URL;
+
+ std::string dest_bucket_name;
+ rgw_obj_key dest_obj_key;
+ bool result =
+ RGWCopyObj::parse_copy_location(req_dest, dest_bucket_name,
+ dest_obj_key, s);
+ if (!result)
+ return -ERR_BAD_URL;
+
+ std::string dest_object_name = dest_obj_key.name;
+
+ /* convert COPY operation into PUT */
+ t->src_bucket = t->url_bucket;
+ s->src_object = s->object->clone();
+ t->url_bucket = dest_bucket_name;
+ s->object->set_name(dest_object_name);
+ s->op = OP_PUT;
+ }
+
+ s->info.storage_class = s->info.env->get("HTTP_X_OBJECT_STORAGE_CLASS", "");
+
+ return RGWHandler_REST::init(driver, s, cio);
+}
+
+/* Pick the Swift handler for a request based on what init_from_header
+ * parsed: no bucket -> service handler, bucket but no object -> bucket
+ * handler, otherwise object handler. Returns nullptr on a bad URI. */
+RGWHandler_REST*
+RGWRESTMgr_SWIFT::get_handler(rgw::sal::Driver* driver,
+ req_state* const s,
+ const rgw::auth::StrategyRegistry& auth_registry,
+ const std::string& frontend_prefix)
+{
+ int ret = RGWHandler_REST_SWIFT::init_from_header(driver, s, frontend_prefix);
+ if (ret < 0) {
+ ldpp_dout(s, 10) << "init_from_header returned err=" << ret << dendl;
+ return nullptr;
+ }
+
+ const auto& auth_strategy = auth_registry.get_swift();
+
+ if (s->init_state.url_bucket.empty()) {
+ return new RGWHandler_REST_Service_SWIFT(auth_strategy);
+ }
+
+ if (rgw::sal::Object::empty(s->object.get())) {
+ return new RGWHandler_REST_Bucket_SWIFT(auth_strategy);
+ }
+
+ return new RGWHandler_REST_Obj_SWIFT(auth_strategy);
+}
+
+/* Handler factory for the Swift /info endpoint. */
+RGWHandler_REST* RGWRESTMgr_SWIFT_Info::get_handler(
+ rgw::sal::Driver* driver,
+ req_state* const s,
+ const rgw::auth::StrategyRegistry& auth_registry,
+ const std::string& frontend_prefix)
+{
+ s->prot_flags |= RGW_REST_SWIFT;
+ const auto& auth_strategy = auth_registry.get_swift();
+ return new RGWHandler_REST_SWIFT_Info(auth_strategy);
+}
diff --git a/src/rgw/rgw_rest_swift.h b/src/rgw/rgw_rest_swift.h
new file mode 100644
index 000000000..89873131c
--- /dev/null
+++ b/src/rgw/rgw_rest_swift.h
@@ -0,0 +1,685 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+#define TIME_BUF_SIZE 128
+
+#include <string_view>
+
+#include <boost/optional.hpp>
+#include <boost/utility/typed_in_place_factory.hpp>
+
+#include "rgw_op.h"
+#include "rgw_rest.h"
+#include "rgw_swift_auth.h"
+#include "rgw_http_errors.h"
+
+
+// Swift-dialect GET/HEAD object operation.
+class RGWGetObj_ObjStore_SWIFT : public RGWGetObj_ObjStore {
+  // When non-zero, overrides the HTTP status sent with the response
+  // (set via set_custom_http_response(), e.g. by website handling).
+  int custom_http_ret = 0;
+public:
+  RGWGetObj_ObjStore_SWIFT() {}
+  ~RGWGetObj_ObjStore_SWIFT() override {}
+
+  int verify_permission(optional_yield y) override;
+  int get_params(optional_yield y) override;
+  int send_response_data_error(optional_yield y) override;
+  int send_response_data(bufferlist& bl, off_t ofs, off_t len) override;
+
+  // Force a specific HTTP status code on the eventual response.
+  void set_custom_http_response(const int http_ret) {
+    custom_http_ret = http_ret;
+  }
+
+  // Swift exposes X-Delete-At/X-Delete-After, so object expiration
+  // attributes must be honoured on read.
+  bool need_object_expiration() override {
+    return true;
+  }
+};
+
+// Swift-dialect account listing (GET on the account).  Supports the
+// Swift "reverse" listing extension by buffering chunks and replaying
+// them backwards.
+class RGWListBuckets_ObjStore_SWIFT : public RGWListBuckets_ObjStore {
+  bool need_stats;          // whether to include per-container stats
+  bool wants_reversed;      // client asked for reverse=true
+  std::string prefix;       // container-name prefix filter
+  // Chunks accumulated while listing forward; replayed in reverse order
+  // by send_response_data_reversed() when wants_reversed is set.
+  std::vector<rgw::sal::BucketList> reverse_buffer;
+
+  // 0 means "no enforced chunk limit" for account listings.
+  uint64_t get_default_max() const override {
+    return 0;
+  }
+
+public:
+  RGWListBuckets_ObjStore_SWIFT()
+    : need_stats(true),
+      wants_reversed(false) {
+  }
+  ~RGWListBuckets_ObjStore_SWIFT() override {}
+
+  int get_params(optional_yield y) override;
+  void handle_listing_chunk(rgw::sal::BucketList&& buckets) override;
+  void send_response_begin(bool has_buckets) override;
+  void send_response_data(rgw::sal::BucketList& buckets) override;
+  void send_response_data_reversed(rgw::sal::BucketList& buckets);
+  void dump_bucket_entry(const rgw::sal::Bucket& obj);
+  void send_response_end() override;
+
+  bool should_get_stats() override { return need_stats; }
+  // Swift account POST/HEAD carry account metadata (X-Account-Meta-*).
+  bool supports_account_metadata() override { return true; }
+};
+
+class RGWListBucket_ObjStore_SWIFT : public RGWListBucket_ObjStore {
+ std::string path;
+public:
+ RGWListBucket_ObjStore_SWIFT() {
+ default_max = 10000;
+ }
+ ~RGWListBucket_ObjStore_SWIFT() override {}
+
+ int get_params(optional_yield y) override;
+ void send_response() override;
+ bool need_container_stats() override { return true; }
+};
+
+class RGWStatAccount_ObjStore_SWIFT : public RGWStatAccount_ObjStore {
+ std::map<std::string, bufferlist> attrs;
+public:
+ RGWStatAccount_ObjStore_SWIFT() {
+ }
+ ~RGWStatAccount_ObjStore_SWIFT() override {}
+
+ void execute(optional_yield y) override;
+ void send_response() override;
+};
+
+class RGWStatBucket_ObjStore_SWIFT : public RGWStatBucket_ObjStore {
+public:
+ RGWStatBucket_ObjStore_SWIFT() {}
+ ~RGWStatBucket_ObjStore_SWIFT() override {}
+
+ void send_response() override;
+};
+
+class RGWCreateBucket_ObjStore_SWIFT : public RGWCreateBucket_ObjStore {
+protected:
+ bool need_metadata_upload() const override { return true; }
+public:
+ RGWCreateBucket_ObjStore_SWIFT() {}
+ ~RGWCreateBucket_ObjStore_SWIFT() override {}
+
+ int get_params(optional_yield y) override;
+ void send_response() override;
+};
+
+class RGWDeleteBucket_ObjStore_SWIFT : public RGWDeleteBucket_ObjStore {
+public:
+ RGWDeleteBucket_ObjStore_SWIFT() {}
+ ~RGWDeleteBucket_ObjStore_SWIFT() override {}
+
+ void send_response() override;
+};
+
+class RGWPutObj_ObjStore_SWIFT : public RGWPutObj_ObjStore {
+ std::string lo_etag;
+public:
+ RGWPutObj_ObjStore_SWIFT() {}
+ ~RGWPutObj_ObjStore_SWIFT() override {}
+
+ int update_slo_segment_size(rgw_slo_entry& entry);
+
+ int verify_permission(optional_yield y) override;
+ int get_params(optional_yield y) override;
+ void send_response() override;
+};
+
+class RGWPutMetadataAccount_ObjStore_SWIFT : public RGWPutMetadataAccount_ObjStore {
+public:
+ RGWPutMetadataAccount_ObjStore_SWIFT() {}
+ ~RGWPutMetadataAccount_ObjStore_SWIFT() override {}
+
+ int get_params(optional_yield y) override;
+ void send_response() override;
+};
+
+class RGWPutMetadataBucket_ObjStore_SWIFT : public RGWPutMetadataBucket_ObjStore {
+public:
+ RGWPutMetadataBucket_ObjStore_SWIFT() {}
+ ~RGWPutMetadataBucket_ObjStore_SWIFT() override {}
+
+ int get_params(optional_yield y) override;
+ void send_response() override;
+};
+
+class RGWPutMetadataObject_ObjStore_SWIFT : public RGWPutMetadataObject_ObjStore {
+public:
+ RGWPutMetadataObject_ObjStore_SWIFT() {}
+ ~RGWPutMetadataObject_ObjStore_SWIFT() override {}
+
+ int get_params(optional_yield y) override;
+ void send_response() override;
+ bool need_object_expiration() override { return true; }
+};
+
+class RGWDeleteObj_ObjStore_SWIFT : public RGWDeleteObj_ObjStore {
+public:
+ RGWDeleteObj_ObjStore_SWIFT() {}
+ ~RGWDeleteObj_ObjStore_SWIFT() override {}
+
+ int verify_permission(optional_yield y) override;
+ int get_params(optional_yield y) override;
+ bool need_object_expiration() override { return true; }
+ void send_response() override;
+};
+
+class RGWCopyObj_ObjStore_SWIFT : public RGWCopyObj_ObjStore {
+ bool sent_header;
+protected:
+ void dump_copy_info();
+public:
+ RGWCopyObj_ObjStore_SWIFT() : sent_header(false) {}
+ ~RGWCopyObj_ObjStore_SWIFT() override {}
+
+ int init_dest_policy() override;
+ int get_params(optional_yield y) override;
+ void send_response() override;
+ void send_partial_response(off_t ofs) override;
+};
+
+class RGWGetACLs_ObjStore_SWIFT : public RGWGetACLs_ObjStore {
+public:
+ RGWGetACLs_ObjStore_SWIFT() {}
+ ~RGWGetACLs_ObjStore_SWIFT() override {}
+
+ void send_response() override {}
+};
+
+class RGWPutACLs_ObjStore_SWIFT : public RGWPutACLs_ObjStore {
+public:
+ RGWPutACLs_ObjStore_SWIFT() : RGWPutACLs_ObjStore() {}
+ ~RGWPutACLs_ObjStore_SWIFT() override {}
+
+ void send_response() override {}
+};
+
+class RGWOptionsCORS_ObjStore_SWIFT : public RGWOptionsCORS_ObjStore {
+public:
+ RGWOptionsCORS_ObjStore_SWIFT() {}
+ ~RGWOptionsCORS_ObjStore_SWIFT() override {}
+
+ void send_response() override;
+};
+
+class RGWBulkDelete_ObjStore_SWIFT : public RGWBulkDelete_ObjStore {
+public:
+ RGWBulkDelete_ObjStore_SWIFT() {}
+ ~RGWBulkDelete_ObjStore_SWIFT() override {}
+
+ int get_data(std::list<RGWBulkDelete::acct_path_t>& items,
+ bool * is_truncated) override;
+ void send_response() override;
+};
+
+class RGWBulkUploadOp_ObjStore_SWIFT : public RGWBulkUploadOp_ObjStore {
+ size_t conlen;
+ size_t curpos;
+
+public:
+ RGWBulkUploadOp_ObjStore_SWIFT()
+ : conlen(0),
+ curpos(0) {
+ }
+ ~RGWBulkUploadOp_ObjStore_SWIFT() = default;
+
+ std::unique_ptr<StreamGetter> create_stream() override;
+ void send_response() override;
+};
+
+class RGWInfo_ObjStore_SWIFT : public RGWInfo_ObjStore {
+protected:
+ struct info
+ {
+ bool is_admin_info;
+ std::function<void (Formatter&, const ConfigProxy&, rgw::sal::Driver*)> list_data;
+ };
+
+ static const std::vector<std::pair<std::string, struct info>> swift_info;
+public:
+ RGWInfo_ObjStore_SWIFT() {}
+ ~RGWInfo_ObjStore_SWIFT() override {}
+
+ void execute(optional_yield y) override;
+ void send_response() override;
+ static void list_swift_data(Formatter& formatter, const ConfigProxy& config, rgw::sal::Driver* driver);
+ static void list_tempauth_data(Formatter& formatter, const ConfigProxy& config, rgw::sal::Driver* driver);
+ static void list_tempurl_data(Formatter& formatter, const ConfigProxy& config, rgw::sal::Driver* driver);
+ static void list_slo_data(Formatter& formatter, const ConfigProxy& config, rgw::sal::Driver* driver);
+ static bool is_expired(const std::string& expires, const DoutPrefixProvider* dpp);
+};
+
+
+class RGWFormPost : public RGWPostObj_ObjStore {
+ std::string get_current_filename() const override;
+ std::string get_current_content_type() const override;
+ std::size_t get_max_file_size() /*const*/;
+ bool is_next_file_to_upload() override;
+ bool is_integral();
+ bool is_non_expired();
+ void get_owner_info(const req_state* s,
+ RGWUserInfo& owner_info) const;
+
+ parts_collection_t ctrl_parts;
+ boost::optional<post_form_part> current_data_part;
+ std::string prefix;
+ bool stream_done = false;
+
+ class SignatureHelper;
+public:
+ RGWFormPost() = default;
+ ~RGWFormPost() = default;
+
+ void init(rgw::sal::Driver* driver,
+ req_state* s,
+ RGWHandler* dialect_handler) override;
+
+ int get_params(optional_yield y) override;
+ int get_data(ceph::bufferlist& bl, bool& again) override;
+ void send_response() override;
+
+ static bool is_formpost_req(req_state* const s);
+};
+
+// Computes and verifies the Swift FormPost signature: an HMAC-SHA1 over
+// the newline-joined tuple (path_info, redirect, max_file_size,
+// max_file_count, expires), rendered as a hex string.
+class RGWFormPost::SignatureHelper
+{
+private:
+  // Hex digest length plus the trailing NUL written by buf_to_hex().
+  static constexpr uint32_t output_size =
+    CEPH_CRYPTO_HMACSHA1_DIGESTSIZE * 2 + 1;
+
+  unsigned char dest[CEPH_CRYPTO_HMACSHA1_DIGESTSIZE]; // 20
+  char dest_str[output_size];
+
+public:
+  SignatureHelper() = default;
+
+  // Feed the five form fields, '\n'-separated, through HMAC-SHA1 keyed
+  // with the account's temp-url key and return the hex digest.  The
+  // returned pointer aliases internal storage and stays valid until the
+  // next calc() call or destruction of this helper.
+  const char* calc(const std::string& key,
+                   const std::string_view& path_info,
+                   const std::string_view& redirect,
+                   const std::string_view& max_file_size,
+                   const std::string_view& max_file_count,
+                   const std::string_view& expires) {
+    using ceph::crypto::HMACSHA1;
+    using UCHARPTR = const unsigned char*;
+
+    HMACSHA1 hmac((UCHARPTR) key.data(), key.size());
+
+    hmac.Update((UCHARPTR) path_info.data(), path_info.size());
+    hmac.Update((UCHARPTR) "\n", 1);
+
+    hmac.Update((UCHARPTR) redirect.data(), redirect.size());
+    hmac.Update((UCHARPTR) "\n", 1);
+
+    hmac.Update((UCHARPTR) max_file_size.data(), max_file_size.size());
+    hmac.Update((UCHARPTR) "\n", 1);
+
+    hmac.Update((UCHARPTR) max_file_count.data(), max_file_count.size());
+    hmac.Update((UCHARPTR) "\n", 1);
+
+    // Note: no trailing '\n' after the last field, matching the Swift
+    // FormPost signing convention.
+    hmac.Update((UCHARPTR) expires.data(), expires.size());
+
+    hmac.Final(dest);
+
+    buf_to_hex((UCHARPTR) dest, sizeof(dest), dest_str);
+
+    return dest_str;
+  }
+
+  // Hex digest of the last calc(); undefined contents before first calc().
+  const char* get_signature() const {
+    return dest_str;
+  }
+
+  // Compare a client-supplied signature with the computed one.
+  // NOTE(review): std::string::compare is not constant-time; confirm
+  // whether a timing side channel matters for this verification path.
+  bool is_equal_to(const std::string& rhs) const {
+    /* never allow out-of-range exception */
+    if (rhs.size() < (output_size - 1)) {
+      return false;
+    }
+    return rhs.compare(0 /* pos */, output_size, dest_str) == 0;
+  }
+
+}; /* RGWFormPost::SignatureHelper */
+
+
+class RGWSwiftWebsiteHandler {
+ rgw::sal::Driver* const driver;
+ req_state* const s;
+ RGWHandler_REST* const handler;
+
+ bool is_web_mode() const;
+ bool can_be_website_req() const;
+ bool is_web_dir() const;
+ bool is_index_present(const std::string& index) const;
+
+ int serve_errordoc(int http_ret, std::string error_doc, optional_yield y);
+
+ RGWOp* get_ws_redirect_op();
+ RGWOp* get_ws_index_op();
+ RGWOp* get_ws_listing_op();
+public:
+ RGWSwiftWebsiteHandler(rgw::sal::Driver* const driver,
+ req_state* const s,
+ RGWHandler_REST* const handler)
+ : driver(driver),
+ s(s),
+ handler(handler) {
+ }
+
+ int error_handler(const int err_no,
+ std::string* const error_content,
+ optional_yield y);
+ int retarget_bucket(RGWOp* op, RGWOp** new_op);
+ int retarget_object(RGWOp* op, RGWOp** new_op);
+};
+
+
+// Common base for all Swift REST handlers: carries the Swift auth
+// strategy and the shared init/authorize plumbing.
+class RGWHandler_REST_SWIFT : public RGWHandler_REST {
+  friend class RGWRESTMgr_SWIFT;
+  friend class RGWRESTMgr_SWIFT_Info;
+protected:
+  const rgw::auth::Strategy& auth_strategy;
+
+  virtual bool is_acl_op() const {
+    return false;
+  }
+
+  // Parses the URL (account/container/object split, copy handling etc.)
+  // into req_state; shared by the handler-factory code above.
+  static int init_from_header(rgw::sal::Driver* driver, req_state* s,
+                              const std::string& frontend_prefix);
+public:
+  explicit RGWHandler_REST_SWIFT(const rgw::auth::Strategy& auth_strategy)
+    : auth_strategy(auth_strategy) {
+  }
+  ~RGWHandler_REST_SWIFT() override = default;
+
+  int validate_bucket_name(const std::string& bucket);
+
+  int init(rgw::sal::Driver* driver, req_state *s, rgw::io::BasicClient *cio) override;
+  int authorize(const DoutPrefixProvider *dpp, optional_yield y) override;
+  int postauth_init(optional_yield y) override;
+
+  // Swift carries ACL-like data in container metadata, so no standalone
+  // policy object is allocated here (hence the nullptr).
+  RGWAccessControlPolicy *alloc_policy() { return nullptr; /* return new RGWAccessControlPolicy_SWIFT; */ }
+  void free_policy(RGWAccessControlPolicy *policy) { delete policy; }
+};
+
+class RGWHandler_REST_Service_SWIFT : public RGWHandler_REST_SWIFT {
+protected:
+ RGWOp *op_get() override;
+ RGWOp *op_head() override;
+ RGWOp *op_put() override;
+ RGWOp *op_post() override;
+ RGWOp *op_delete() override;
+public:
+ using RGWHandler_REST_SWIFT::RGWHandler_REST_SWIFT;
+ ~RGWHandler_REST_Service_SWIFT() override = default;
+};
+
+class RGWHandler_REST_Bucket_SWIFT : public RGWHandler_REST_SWIFT {
+ /* We need the boost::optional here only because of handler's late
+ * initialization (see the init() method). */
+ boost::optional<RGWSwiftWebsiteHandler> website_handler;
+protected:
+ bool is_obj_update_op() const override {
+ return s->op == OP_POST;
+ }
+
+ RGWOp *get_obj_op(bool get_data);
+ RGWOp *op_get() override;
+ RGWOp *op_head() override;
+ RGWOp *op_put() override;
+ RGWOp *op_delete() override;
+ RGWOp *op_post() override;
+ RGWOp *op_options() override;
+public:
+ using RGWHandler_REST_SWIFT::RGWHandler_REST_SWIFT;
+ ~RGWHandler_REST_Bucket_SWIFT() override = default;
+
+ int error_handler(int err_no, std::string *error_content, optional_yield y) override {
+ return website_handler->error_handler(err_no, error_content, y);
+ }
+
+ int retarget(RGWOp* op, RGWOp** new_op, optional_yield) override {
+ return website_handler->retarget_bucket(op, new_op);
+ }
+
+ int init(rgw::sal::Driver* const driver,
+ req_state* const s,
+ rgw::io::BasicClient* const cio) override {
+ website_handler = boost::in_place<RGWSwiftWebsiteHandler>(driver, s, this);
+ return RGWHandler_REST_SWIFT::init(driver, s, cio);
+ }
+};
+
+class RGWHandler_REST_Obj_SWIFT : public RGWHandler_REST_SWIFT {
+ /* We need the boost::optional here only because of handler's late
+ * initialization (see the init() method). */
+ boost::optional<RGWSwiftWebsiteHandler> website_handler;
+protected:
+ bool is_obj_update_op() const override {
+ return s->op == OP_POST;
+ }
+
+ RGWOp *get_obj_op(bool get_data);
+ RGWOp *op_get() override;
+ RGWOp *op_head() override;
+ RGWOp *op_put() override;
+ RGWOp *op_delete() override;
+ RGWOp *op_post() override;
+ RGWOp *op_copy() override;
+ RGWOp *op_options() override;
+
+public:
+ using RGWHandler_REST_SWIFT::RGWHandler_REST_SWIFT;
+ ~RGWHandler_REST_Obj_SWIFT() override = default;
+
+ int error_handler(int err_no, std::string *error_content,
+ optional_yield y) override {
+ return website_handler->error_handler(err_no, error_content, y);
+ }
+
+ int retarget(RGWOp* op, RGWOp** new_op, optional_yield) override {
+ return website_handler->retarget_object(op, new_op);
+ }
+
+ int init(rgw::sal::Driver* const driver,
+ req_state* const s,
+ rgw::io::BasicClient* const cio) override {
+ website_handler = boost::in_place<RGWSwiftWebsiteHandler>(driver, s, this);
+ return RGWHandler_REST_SWIFT::init(driver, s, cio);
+ }
+};
+
+class RGWRESTMgr_SWIFT : public RGWRESTMgr {
+protected:
+ RGWRESTMgr* get_resource_mgr_as_default(req_state* const s,
+ const std::string& uri,
+ std::string* const out_uri) override {
+ return this->get_resource_mgr(s, uri, out_uri);
+ }
+
+public:
+ RGWRESTMgr_SWIFT() = default;
+ ~RGWRESTMgr_SWIFT() override = default;
+
+ RGWHandler_REST *get_handler(rgw::sal::Driver* driver,
+ req_state *s,
+ const rgw::auth::StrategyRegistry& auth_registry,
+ const std::string& frontend_prefix) override;
+};
+
+
+class RGWGetCrossDomainPolicy_ObjStore_SWIFT
+ : public RGWGetCrossDomainPolicy_ObjStore {
+public:
+ RGWGetCrossDomainPolicy_ObjStore_SWIFT() = default;
+ ~RGWGetCrossDomainPolicy_ObjStore_SWIFT() override = default;
+
+ void send_response() override;
+};
+
+class RGWGetHealthCheck_ObjStore_SWIFT
+ : public RGWGetHealthCheck_ObjStore {
+public:
+ RGWGetHealthCheck_ObjStore_SWIFT() = default;
+ ~RGWGetHealthCheck_ObjStore_SWIFT() override = default;
+
+ void send_response() override;
+};
+
+class RGWHandler_SWIFT_CrossDomain : public RGWHandler_REST {
+public:
+ RGWHandler_SWIFT_CrossDomain() = default;
+ ~RGWHandler_SWIFT_CrossDomain() override = default;
+
+ RGWOp *op_get() override {
+ return new RGWGetCrossDomainPolicy_ObjStore_SWIFT();
+ }
+
+ int init(rgw::sal::Driver* const driver,
+ req_state* const state,
+ rgw::io::BasicClient* const cio) override {
+ state->dialect = "swift";
+ state->formatter = new JSONFormatter;
+ state->format = RGWFormat::JSON;
+
+ return RGWHandler::init(driver, state, cio);
+ }
+
+ int authorize(const DoutPrefixProvider *dpp, optional_yield) override {
+ return 0;
+ }
+
+ int postauth_init(optional_yield) override {
+ return 0;
+ }
+
+ int read_permissions(RGWOp *, optional_yield y) override {
+ return 0;
+ }
+
+ virtual RGWAccessControlPolicy *alloc_policy() { return nullptr; }
+ virtual void free_policy(RGWAccessControlPolicy *policy) {}
+};
+
+class RGWRESTMgr_SWIFT_CrossDomain : public RGWRESTMgr {
+protected:
+ RGWRESTMgr *get_resource_mgr(req_state* const s,
+ const std::string& uri,
+ std::string* const out_uri) override {
+ return this;
+ }
+
+public:
+ RGWRESTMgr_SWIFT_CrossDomain() = default;
+ ~RGWRESTMgr_SWIFT_CrossDomain() override = default;
+
+ RGWHandler_REST* get_handler(rgw::sal::Driver* driver,
+ req_state* const s,
+ const rgw::auth::StrategyRegistry&,
+ const std::string&) override {
+ s->prot_flags |= RGW_REST_SWIFT;
+ return new RGWHandler_SWIFT_CrossDomain;
+ }
+};
+
+
+class RGWHandler_SWIFT_HealthCheck : public RGWHandler_REST {
+public:
+ RGWHandler_SWIFT_HealthCheck() = default;
+ ~RGWHandler_SWIFT_HealthCheck() override = default;
+
+ RGWOp *op_get() override {
+ return new RGWGetHealthCheck_ObjStore_SWIFT();
+ }
+
+ int init(rgw::sal::Driver* const driver,
+ req_state* const state,
+ rgw::io::BasicClient* const cio) override {
+ state->dialect = "swift";
+ state->formatter = new JSONFormatter;
+ state->format = RGWFormat::JSON;
+
+ return RGWHandler::init(driver, state, cio);
+ }
+
+ int authorize(const DoutPrefixProvider *dpp, optional_yield y) override {
+ return 0;
+ }
+
+ int postauth_init(optional_yield) override {
+ return 0;
+ }
+
+ int read_permissions(RGWOp *, optional_yield y) override {
+ return 0;
+ }
+
+ virtual RGWAccessControlPolicy *alloc_policy() { return nullptr; }
+ virtual void free_policy(RGWAccessControlPolicy *policy) {}
+};
+
+class RGWRESTMgr_SWIFT_HealthCheck : public RGWRESTMgr {
+protected:
+ RGWRESTMgr *get_resource_mgr(req_state* const s,
+ const std::string& uri,
+ std::string* const out_uri) override {
+ return this;
+ }
+
+public:
+ RGWRESTMgr_SWIFT_HealthCheck() = default;
+ ~RGWRESTMgr_SWIFT_HealthCheck() override = default;
+
+ RGWHandler_REST* get_handler(rgw::sal::Driver* driver,
+ req_state* const s,
+ const rgw::auth::StrategyRegistry&,
+ const std::string&) override {
+ s->prot_flags |= RGW_REST_SWIFT;
+ return new RGWHandler_SWIFT_HealthCheck;
+ }
+};
+
+
+class RGWHandler_REST_SWIFT_Info : public RGWHandler_REST_SWIFT {
+public:
+ using RGWHandler_REST_SWIFT::RGWHandler_REST_SWIFT;
+ ~RGWHandler_REST_SWIFT_Info() override = default;
+
+ RGWOp *op_get() override {
+ return new RGWInfo_ObjStore_SWIFT();
+ }
+
+ int init(rgw::sal::Driver* const driver,
+ req_state* const state,
+ rgw::io::BasicClient* const cio) override {
+ state->dialect = "swift";
+ state->formatter = new JSONFormatter;
+ state->format = RGWFormat::JSON;
+
+ return RGWHandler::init(driver, state, cio);
+ }
+
+ int authorize(const DoutPrefixProvider *dpp, optional_yield) override {
+ return 0;
+ }
+
+ int postauth_init(optional_yield) override {
+ return 0;
+ }
+
+ int read_permissions(RGWOp *, optional_yield y) override {
+ return 0;
+ }
+};
+
+class RGWRESTMgr_SWIFT_Info : public RGWRESTMgr {
+public:
+ RGWRESTMgr_SWIFT_Info() = default;
+ ~RGWRESTMgr_SWIFT_Info() override = default;
+
+ RGWHandler_REST *get_handler(rgw::sal::Driver* driver,
+ req_state* s,
+ const rgw::auth::StrategyRegistry& auth_registry,
+ const std::string& frontend_prefix) override;
+};
diff --git a/src/rgw/rgw_rest_usage.cc b/src/rgw/rgw_rest_usage.cc
new file mode 100644
index 000000000..9207a68cd
--- /dev/null
+++ b/src/rgw/rgw_rest_usage.cc
@@ -0,0 +1,121 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_op.h"
+#include "rgw_usage.h"
+#include "rgw_rest_usage.h"
+#include "rgw_sal.h"
+
+#include "include/str_list.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+class RGWOp_Usage_Get : public RGWRESTOp {
+
+public:
+ RGWOp_Usage_Get() {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("usage", RGW_CAP_READ);
+ }
+ void execute(optional_yield y) override;
+
+ const char* name() const override { return "get_usage"; }
+};
+
+// Show usage statistics, optionally filtered by user, bucket, time range
+// and category list; output is streamed through the op's flusher.
+void RGWOp_Usage_Get::execute(optional_yield y) {
+  map<std::string, bool> categories;
+
+  string uid_str;
+  string bucket_name;
+  uint64_t start, end;
+  bool show_entries;
+  bool show_summary;
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  RESTArgs::get_string(s, "bucket", bucket_name, &bucket_name);
+  std::unique_ptr<rgw::sal::User> user = driver->get_user(rgw_user(uid_str));
+  std::unique_ptr<rgw::sal::Bucket> bucket;
+
+  if (!bucket_name.empty()) {
+    // Propagate the request's optional_yield instead of null_yield so a
+    // coroutine frontend can suspend rather than block the thread, and
+    // fail the request instead of silently reporting account-wide usage
+    // when the named bucket cannot be loaded.
+    op_ret = driver->get_bucket(this, user.get(), std::string(), bucket_name,
+                                &bucket, y);
+    if (op_ret < 0) {
+      ldpp_dout(this, 4) << "failed to load bucket=" << bucket_name
+                         << " ret=" << op_ret << dendl;
+      return;
+    }
+  }
+
+  RESTArgs::get_epoch(s, "start", 0, &start);
+  RESTArgs::get_epoch(s, "end", (uint64_t)-1, &end);
+  RESTArgs::get_bool(s, "show-entries", true, &show_entries);
+  RESTArgs::get_bool(s, "show-summary", true, &show_summary);
+
+  string cat_str;
+  RESTArgs::get_string(s, "categories", cat_str, &cat_str);
+
+  if (!cat_str.empty()) {
+    // "categories" is a separated list; presence in the map acts as an
+    // allow-filter for RGWUsage::show().
+    list<string> cat_list;
+    get_str_list(cat_str, cat_list);
+    for (const auto& cat : cat_list) {
+      categories[cat] = true;
+    }
+  }
+
+  op_ret = RGWUsage::show(this, driver, user.get(), bucket.get(), start, end,
+                          show_entries, show_summary, &categories, flusher);
+}
+
+class RGWOp_Usage_Delete : public RGWRESTOp {
+
+public:
+ RGWOp_Usage_Delete() {}
+
+ int check_caps(const RGWUserCaps& caps) override {
+ return caps.check_cap("usage", RGW_CAP_WRITE);
+ }
+ void execute(optional_yield y) override;
+
+ const char* name() const override { return "trim_usage"; }
+};
+
+// Trim (delete) usage records for a user and/or bucket within a time
+// range.  A fully-unscoped trim (no uid, no bucket, full time range)
+// requires the explicit remove-all=true flag as a safety latch.
+void RGWOp_Usage_Delete::execute(optional_yield y) {
+  string uid_str;
+  string bucket_name;
+  uint64_t start, end;
+
+  RESTArgs::get_string(s, "uid", uid_str, &uid_str);
+  RESTArgs::get_string(s, "bucket", bucket_name, &bucket_name);
+  std::unique_ptr<rgw::sal::User> user = driver->get_user(rgw_user(uid_str));
+  std::unique_ptr<rgw::sal::Bucket> bucket;
+
+  if (!bucket_name.empty()) {
+    // NOTE(review): the return value is ignored and null_yield is used
+    // despite `y` being available; on lookup failure `bucket` stays null
+    // and the trim silently loses its bucket filter — confirm intended.
+    driver->get_bucket(nullptr, user.get(), std::string(), bucket_name, &bucket, null_yield);
+  }
+
+  RESTArgs::get_epoch(s, "start", 0, &start);
+  RESTArgs::get_epoch(s, "end", (uint64_t)-1, &end);
+
+  // No scoping at all would wipe every usage record: demand an explicit
+  // remove-all=true confirmation before allowing that.
+  if (rgw::sal::User::empty(user.get()) &&
+      bucket_name.empty() &&
+      !start &&
+      end == (uint64_t)-1) {
+    bool remove_all;
+    RESTArgs::get_bool(s, "remove-all", false, &remove_all);
+    if (!remove_all) {
+      op_ret = -EINVAL;
+      return;
+    }
+  }
+
+  op_ret = RGWUsage::trim(this, driver, user.get(), bucket.get(), start, end);
+}
+
+RGWOp *RGWHandler_Usage::op_get()
+{
+ return new RGWOp_Usage_Get;
+}
+
+RGWOp *RGWHandler_Usage::op_delete()
+{
+ return new RGWOp_Usage_Delete;
+}
+
+
diff --git a/src/rgw/rgw_rest_usage.h b/src/rgw/rgw_rest_usage.h
new file mode 100644
index 000000000..f68edb0ec
--- /dev/null
+++ b/src/rgw/rgw_rest_usage.h
@@ -0,0 +1,34 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_rest.h"
+#include "rgw_rest_s3.h"
+
+
+class RGWHandler_Usage : public RGWHandler_Auth_S3 {
+protected:
+ RGWOp *op_get() override;
+ RGWOp *op_delete() override;
+public:
+ using RGWHandler_Auth_S3::RGWHandler_Auth_S3;
+ ~RGWHandler_Usage() override = default;
+
+ int read_permissions(RGWOp*, optional_yield) override {
+ return 0;
+ }
+};
+
+class RGWRESTMgr_Usage : public RGWRESTMgr {
+public:
+ RGWRESTMgr_Usage() = default;
+ ~RGWRESTMgr_Usage() override = default;
+
+ RGWHandler_REST* get_handler(rgw::sal::Driver* driver,
+ req_state*,
+ const rgw::auth::StrategyRegistry& auth_registry,
+ const std::string&) override {
+ return new RGWHandler_Usage(auth_registry);
+ }
+};
diff --git a/src/rgw/rgw_rest_user_policy.cc b/src/rgw/rgw_rest_user_policy.cc
new file mode 100644
index 000000000..2e300468b
--- /dev/null
+++ b/src/rgw/rgw_rest_user_policy.cc
@@ -0,0 +1,413 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <errno.h>
+#include <regex>
+
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/ceph_json.h"
+
+#include "include/types.h"
+#include "rgw_string.h"
+
+#include "rgw_common.h"
+#include "rgw_op.h"
+#include "rgw_rest.h"
+#include "rgw_rest_user_policy.h"
+#include "rgw_sal.h"
+#include "services/svc_zone.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+
+void RGWRestUserPolicy::dump(Formatter *f) const
+{
+ encode_json("PolicyName", policy_name , f);
+ encode_json("UserName", user_name , f);
+ encode_json("PolicyDocument", policy, f);
+}
+
+void RGWRestUserPolicy::send_response()
+{
+ if (op_ret) {
+ set_req_state_err(s, op_ret);
+ }
+ dump_errno(s);
+ end_header(s);
+}
+
+// Authorization for all user-policy ops: anonymous callers are rejected,
+// then either an admin cap ("user-policy" read/write, per subclass) or an
+// IAM permission on the target user's ARN grants access.
+int RGWRestUserPolicy::verify_permission(optional_yield y)
+{
+  if (s->auth.identity->is_anonymous()) {
+    return -EACCES;
+  }
+
+  // Admin caps short-circuit the IAM evaluation.
+  if(int ret = check_caps(s->user->get_caps()); ret == 0) {
+    return ret;
+  }
+
+  // Fall back to an IAM check against arn:...:user/<UserName> with the
+  // subclass-specific action (get_op()).
+  uint64_t op = get_op();
+  std::string user_name = s->info.args.get("UserName");
+  rgw_user user_id(user_name);
+  if (! verify_user_permission(this, s, rgw::ARN(rgw::ARN(user_id.id,
+                                                          "user",
+                                                          user_id.tenant)), op)) {
+    return -EACCES;
+  }
+  return 0;
+}
+
+// Validate the client-supplied policy name: bounded length and the IAM
+// policy-name character set.  Returns false (and logs) on violation.
+bool RGWRestUserPolicy::validate_input()
+{
+  // Cheap length check first, before running the regex.
+  if (policy_name.length() > MAX_POLICY_NAME_LEN) {
+    ldpp_dout(this, 0) << "ERROR: Invalid policy name length " << dendl;
+    return false;
+  }
+
+  // std::regex construction is expensive; build the (immutable) pattern
+  // once and reuse it across requests.  Magic-static init is thread-safe.
+  static const std::regex regex_policy_name("[A-Za-z0-9:=,.@-]+");
+  if (! std::regex_match(policy_name, regex_policy_name)) {
+    ldpp_dout(this, 0) << "ERROR: Invalid chars in policy name " << dendl;
+    return false;
+  }
+
+  return true;
+}
+
+int RGWUserPolicyRead::check_caps(const RGWUserCaps& caps)
+{
+ return caps.check_cap("user-policy", RGW_CAP_READ);
+}
+
+int RGWUserPolicyWrite::check_caps(const RGWUserCaps& caps)
+{
+ return caps.check_cap("user-policy", RGW_CAP_WRITE);
+}
+
+uint64_t RGWPutUserPolicy::get_op()
+{
+ return rgw::IAM::iamPutUserPolicy;
+}
+
+int RGWPutUserPolicy::get_params()
+{
+ policy_name = url_decode(s->info.args.get("PolicyName"), true);
+ user_name = url_decode(s->info.args.get("UserName"), true);
+ policy = url_decode(s->info.args.get("PolicyDocument"), true);
+
+ if (policy_name.empty() || user_name.empty() || policy.empty()) {
+ ldpp_dout(this, 20) << "ERROR: one of policy name, user name or policy document is empty"
+ << dendl;
+ return -EINVAL;
+ }
+
+ if (! validate_input()) {
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+void RGWPutUserPolicy::execute(optional_yield y)
+{
+ op_ret = get_params();
+ if (op_ret < 0) {
+ return;
+ }
+
+ bufferlist bl = bufferlist::static_from_string(policy);
+
+ std::unique_ptr<rgw::sal::User> user = driver->get_user(rgw_user(user_name));
+
+ op_ret = user->load_user(s, s->yield);
+ if (op_ret < 0) {
+ op_ret = -ERR_NO_SUCH_ENTITY;
+ return;
+ }
+
+ op_ret = user->read_attrs(s, s->yield);
+ if (op_ret == -ENOENT) {
+ op_ret = -ERR_NO_SUCH_ENTITY;
+ return;
+ }
+
+ ceph::bufferlist in_data;
+ op_ret = driver->forward_request_to_master(this, s->user.get(), nullptr, in_data, nullptr, s->info, y);
+ if (op_ret < 0) {
+ ldpp_dout(this, 0) << "ERROR: forward_request_to_master returned ret=" << op_ret << dendl;
+ return;
+ }
+
+ try {
+ const rgw::IAM::Policy p(
+ s->cct, s->user->get_tenant(), bl,
+ s->cct->_conf.get_val<bool>("rgw_policy_reject_invalid_principals"));
+ std::map<std::string, std::string> policies;
+ if (auto it = user->get_attrs().find(RGW_ATTR_USER_POLICY); it != user->get_attrs().end()) {
+ bufferlist out_bl = it->second;
+ decode(policies, out_bl);
+ }
+ bufferlist in_bl;
+ policies[policy_name] = policy;
+ constexpr unsigned int USER_POLICIES_MAX_NUM = 100;
+ const unsigned int max_num = s->cct->_conf->rgw_user_policies_max_num < 0 ?
+ USER_POLICIES_MAX_NUM : s->cct->_conf->rgw_user_policies_max_num;
+ if (policies.size() > max_num) {
+ ldpp_dout(this, 4) << "IAM user policies has reached the num config: "
+ << max_num << ", cant add another" << dendl;
+ op_ret = -ERR_INVALID_REQUEST;
+ s->err.message =
+ "The number of IAM user policies should not exceed allowed limit "
+ "of " +
+ std::to_string(max_num) + " policies.";
+ return;
+ }
+ encode(policies, in_bl);
+ user->get_attrs()[RGW_ATTR_USER_POLICY] = in_bl;
+
+ op_ret = user->store_user(s, s->yield, false);
+ if (op_ret < 0) {
+ op_ret = -ERR_INTERNAL_ERROR;
+ }
+ } catch (buffer::error& err) {
+ ldpp_dout(this, 0) << "ERROR: failed to decode user policies" << dendl;
+ op_ret = -EIO;
+ } catch (rgw::IAM::PolicyParseException& e) {
+ ldpp_dout(this, 5) << "failed to parse policy: " << e.what() << dendl;
+ s->err.message = e.what();
+ op_ret = -ERR_MALFORMED_DOC;
+ }
+
+ if (op_ret == 0) {
+ s->formatter->open_object_section("PutUserPolicyResponse");
+ s->formatter->open_object_section("ResponseMetadata");
+ s->formatter->dump_string("RequestId", s->trans_id);
+ s->formatter->close_section();
+ s->formatter->close_section();
+ }
+}
+
+uint64_t RGWGetUserPolicy::get_op()
+{
+ return rgw::IAM::iamGetUserPolicy;
+}
+
+int RGWGetUserPolicy::get_params()
+{
+ policy_name = s->info.args.get("PolicyName");
+ user_name = s->info.args.get("UserName");
+
+ if (policy_name.empty() || user_name.empty()) {
+ ldpp_dout(this, 20) << "ERROR: one of policy name or user name is empty"
+ << dendl;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+// Fetch one named inline policy from the user's RGW_ATTR_USER_POLICY
+// attribute and emit it as a GetUserPolicyResponse document.
+void RGWGetUserPolicy::execute(optional_yield y)
+{
+  op_ret = get_params();
+  if (op_ret < 0) {
+    return;
+  }
+
+  std::unique_ptr<rgw::sal::User> user = driver->get_user(rgw_user(user_name));
+  op_ret = user->read_attrs(s, s->yield);
+  if (op_ret == -ENOENT) {
+    ldpp_dout(this, 0) << "ERROR: attrs not found for user " << user_name << dendl;
+    op_ret = -ERR_NO_SUCH_ENTITY;
+    return;
+  }
+
+  if (op_ret == 0) {
+    s->formatter->open_object_section("GetUserPolicyResponse");
+    s->formatter->open_object_section("ResponseMetadata");
+    s->formatter->dump_string("RequestId", s->trans_id);
+    s->formatter->close_section();
+    s->formatter->open_object_section("GetUserPolicyResult");
+    std::map<std::string, std::string> policies;
+    if (auto it = user->get_attrs().find(RGW_ATTR_USER_POLICY); it != user->get_attrs().end()) {
+      bufferlist bl = it->second;
+      try {
+        decode(policies, bl);
+      } catch (buffer::error& err) {
+        ldpp_dout(this, 0) << "ERROR: failed to decode user policies" << dendl;
+        op_ret = -EIO;
+        return;
+      }
+      // Single lookup: reuse the iterator instead of a second
+      // policies[policy_name] access (which also avoids shadowing the
+      // attrs iterator above).
+      if (auto pit = policies.find(policy_name); pit != policies.end()) {
+        policy = pit->second;
+        dump(s->formatter);
+      } else {
+        // Log the requested name; `policy` is still empty on this path,
+        // so logging it (as the old code did) printed nothing useful.
+        ldpp_dout(this, 0) << "ERROR: policy not found: " << policy_name << dendl;
+        op_ret = -ERR_NO_SUCH_ENTITY;
+        return;
+      }
+    } else {
+      ldpp_dout(this, 0) << "ERROR: RGW_ATTR_USER_POLICY not found" << dendl;
+      op_ret = -ERR_NO_SUCH_ENTITY;
+      return;
+    }
+    s->formatter->close_section();
+    s->formatter->close_section();
+  }
+  // Any other read_attrs failure surfaces as an internal error.
+  if (op_ret < 0) {
+    op_ret = -ERR_INTERNAL_ERROR;
+  }
+}
+
+uint64_t RGWListUserPolicies::get_op()
+{
+ return rgw::IAM::iamListUserPolicies;
+}
+
+int RGWListUserPolicies::get_params()
+{
+ user_name = s->info.args.get("UserName");
+
+ if (user_name.empty()) {
+ ldpp_dout(this, 20) << "ERROR: user name is empty" << dendl;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+void RGWListUserPolicies::execute(optional_yield y)
+{
+ op_ret = get_params();
+ if (op_ret < 0) {
+ return;
+ }
+
+ std::unique_ptr<rgw::sal::User> user = driver->get_user(rgw_user(user_name));
+ op_ret = user->read_attrs(s, s->yield);
+ if (op_ret == -ENOENT) {
+ ldpp_dout(this, 0) << "ERROR: attrs not found for user" << user_name << dendl;
+ op_ret = -ERR_NO_SUCH_ENTITY;
+ return;
+ }
+
+ if (op_ret == 0) {
+ std::map<std::string, std::string> policies;
+ if (auto it = user->get_attrs().find(RGW_ATTR_USER_POLICY); it != user->get_attrs().end()) {
+ s->formatter->open_object_section("ListUserPoliciesResponse");
+ s->formatter->open_object_section("ResponseMetadata");
+ s->formatter->dump_string("RequestId", s->trans_id);
+ s->formatter->close_section();
+ s->formatter->open_object_section("ListUserPoliciesResult");
+ bufferlist bl = it->second;
+ try {
+ decode(policies, bl);
+ } catch (buffer::error& err) {
+ ldpp_dout(this, 0) << "ERROR: failed to decode user policies" << dendl;
+ op_ret = -EIO;
+ return;
+ }
+ s->formatter->open_object_section("PolicyNames");
+ for (const auto& p : policies) {
+ s->formatter->dump_string("member", p.first);
+ }
+ s->formatter->close_section();
+ s->formatter->close_section();
+ s->formatter->close_section();
+ } else {
+ ldpp_dout(this, 0) << "ERROR: RGW_ATTR_USER_POLICY not found" << dendl;
+ op_ret = -ERR_NO_SUCH_ENTITY;
+ return;
+ }
+ }
+ if (op_ret < 0) {
+ op_ret = -ERR_INTERNAL_ERROR;
+ }
+}
+
+uint64_t RGWDeleteUserPolicy::get_op()
+{
+ return rgw::IAM::iamDeleteUserPolicy;
+}
+
+int RGWDeleteUserPolicy::get_params()
+{
+ policy_name = s->info.args.get("PolicyName");
+ user_name = s->info.args.get("UserName");
+
+ if (policy_name.empty() || user_name.empty()) {
+ ldpp_dout(this, 20) << "ERROR: One of policy name or user name is empty"<< dendl;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+// Remove one named inline policy from the user's RGW_ATTR_USER_POLICY
+// attribute, forwarding the request to the master zone first in
+// multisite deployments.
+void RGWDeleteUserPolicy::execute(optional_yield y)
+{
+  op_ret = get_params();
+  if (op_ret < 0) {
+    return;
+  }
+
+  std::unique_ptr<rgw::sal::User> user = driver->get_user(rgw_user(user_name));
+  op_ret = user->load_user(s, s->yield);
+  if (op_ret < 0) {
+    op_ret = -ERR_NO_SUCH_ENTITY;
+    return;
+  }
+
+  op_ret = user->read_attrs(this, s->yield);
+  if (op_ret == -ENOENT) {
+    op_ret = -ERR_NO_SUCH_ENTITY;
+    return;
+  }
+
+  ceph::bufferlist in_data;
+  op_ret = driver->forward_request_to_master(this, s->user.get(), nullptr, in_data, nullptr, s->info, y);
+  if (op_ret < 0) {
+    // a policy might've been uploaded to this site when there was no sync
+    // req. in earlier releases, proceed deletion
+    if (op_ret != -ENOENT) {
+      ldpp_dout(this, 5) << "forward_request_to_master returned ret=" << op_ret << dendl;
+      return;
+    }
+    // -ENOENT from the master is tolerated: delete locally anyway.
+    ldpp_dout(this, 0) << "ERROR: forward_request_to_master returned ret=" << op_ret << dendl;
+  }
+
+  // Decode the attribute, erase the entry, re-encode and persist.
+  std::map<std::string, std::string> policies;
+  if (auto it = user->get_attrs().find(RGW_ATTR_USER_POLICY); it != user->get_attrs().end()) {
+    bufferlist out_bl = it->second;
+    try {
+      decode(policies, out_bl);
+    } catch (buffer::error& err) {
+      ldpp_dout(this, 0) << "ERROR: failed to decode user policies" << dendl;
+      op_ret = -EIO;
+      return;
+    }
+
+    if (auto p = policies.find(policy_name); p != policies.end()) {
+      bufferlist in_bl;
+      policies.erase(p);
+      encode(policies, in_bl);
+      user->get_attrs()[RGW_ATTR_USER_POLICY] = in_bl;
+
+      // exclusive=false: overwrite the existing user record.
+      op_ret = user->store_user(s, s->yield, false);
+      if (op_ret < 0) {
+        op_ret = -ERR_INTERNAL_ERROR;
+      }
+      if (op_ret == 0) {
+        s->formatter->open_object_section("DeleteUserPoliciesResponse");
+        s->formatter->open_object_section("ResponseMetadata");
+        s->formatter->dump_string("RequestId", s->trans_id);
+        s->formatter->close_section();
+        s->formatter->close_section();
+      }
+    } else {
+      // Named policy not present in the attribute map.
+      op_ret = -ERR_NO_SUCH_ENTITY;
+      return;
+    }
+  } else {
+    // User has no inline policies at all.
+    op_ret = -ERR_NO_SUCH_ENTITY;
+    return;
+  }
+}
diff --git a/src/rgw/rgw_rest_user_policy.h b/src/rgw/rgw_rest_user_policy.h
new file mode 100644
index 000000000..4a123456e
--- /dev/null
+++ b/src/rgw/rgw_rest_user_policy.h
@@ -0,0 +1,73 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+#include "rgw_rest.h"
+
+// Common base for the IAM user-policy REST ops (Put/Get/List/Delete).
+// Holds the parsed request fields and the shared permission/response logic.
+class RGWRestUserPolicy : public RGWRESTOp {
+protected:
+  static constexpr int MAX_POLICY_NAME_LEN = 128;
+  std::string policy_name;  // "PolicyName" request parameter
+  std::string user_name;    // "UserName" request parameter; resolved via driver->get_user()
+  std::string policy;       // policy document payload (used by the Put op)
+
+  // Validates the parsed request fields; see the .cc for the exact rules.
+  bool validate_input();
+
+public:
+  int verify_permission(optional_yield y) override;
+  // IAM action bit this concrete op requires (checked in verify_permission).
+  virtual uint64_t get_op() = 0;
+  void send_response() override;
+  void dump(Formatter *f) const;
+};
+
+// Read-side capability check shared by Get/List user-policy ops.
+class RGWUserPolicyRead : public RGWRestUserPolicy {
+public:
+  RGWUserPolicyRead() = default;
+  int check_caps(const RGWUserCaps& caps) override;
+};
+
+// Write-side capability check shared by Put/Delete user-policy ops.
+class RGWUserPolicyWrite : public RGWRestUserPolicy {
+public:
+  RGWUserPolicyWrite() = default;
+  int check_caps(const RGWUserCaps& caps) override;
+};
+
+// PutUserPolicy: attach/replace an inline policy document on a user.
+class RGWPutUserPolicy : public RGWUserPolicyWrite {
+public:
+  RGWPutUserPolicy() = default;
+  void execute(optional_yield y) override;
+  int get_params();
+  // Fixed: was "put_user-policy" — the stray hyphen was inconsistent with
+  // every sibling op name (get_user_policy, list_user_policies,
+  // delete_user_policy) and with the underscore convention used in logs.
+  const char* name() const override { return "put_user_policy"; }
+  uint64_t get_op() override;
+  RGWOpType get_type() override { return RGW_OP_PUT_USER_POLICY; }
+};
+
+// GetUserPolicy: fetch one inline policy document by name.
+class RGWGetUserPolicy : public RGWUserPolicyRead {
+public:
+  RGWGetUserPolicy() = default;
+  void execute(optional_yield y) override;
+  int get_params();
+  const char* name() const override { return "get_user_policy"; }
+  uint64_t get_op() override;
+  RGWOpType get_type() override { return RGW_OP_GET_USER_POLICY; }
+};
+
+// ListUserPolicies: enumerate the names of a user's inline policies.
+class RGWListUserPolicies : public RGWUserPolicyRead {
+public:
+  RGWListUserPolicies() = default;
+  void execute(optional_yield y) override;
+  int get_params();
+  const char* name() const override { return "list_user_policies"; }
+  uint64_t get_op() override;
+  RGWOpType get_type() override { return RGW_OP_LIST_USER_POLICIES; }
+};
+
+// DeleteUserPolicy: remove one inline policy document by name.
+class RGWDeleteUserPolicy : public RGWUserPolicyWrite {
+public:
+  RGWDeleteUserPolicy() = default;
+  void execute(optional_yield y) override;
+  int get_params();
+  const char* name() const override { return "delete_user_policy"; }
+  uint64_t get_op() override;
+  RGWOpType get_type() override { return RGW_OP_DELETE_USER_POLICY; }
+};
diff --git a/src/rgw/rgw_role.cc b/src/rgw/rgw_role.cc
new file mode 100644
index 000000000..fb188e7f8
--- /dev/null
+++ b/src/rgw/rgw_role.cc
@@ -0,0 +1,444 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <errno.h>
+#include <ctime>
+#include <regex>
+#include <boost/algorithm/string/replace.hpp>
+
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/ceph_json.h"
+#include "common/ceph_time.h"
+#include "rgw_rados.h"
+#include "rgw_zone.h"
+
+#include "include/types.h"
+#include "rgw_string.h"
+
+#include "rgw_common.h"
+#include "rgw_tools.h"
+#include "rgw_role.h"
+
+#include "services/svc_zone.h"
+#include "services/svc_sys_obj.h"
+#include "services/svc_meta_be_sobj.h"
+#include "services/svc_meta.h"
+#include "services/svc_role_rados.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+namespace rgw { namespace sal {
+
+// Object-name prefixes for the per-role index/info records, plus the ARN
+// prefix used when composing role ARNs.
+const string RGWRole::role_name_oid_prefix = "role_names.";
+const string RGWRole::role_oid_prefix = "roles.";
+const string RGWRole::role_path_oid_prefix = "role_paths.";
+const string RGWRole::role_arn_prefix = "arn:aws:iam::";
+
+// Serialize the role in AWS-IAM-style JSON (GetRole/ListRoles key names).
+void RGWRoleInfo::dump(Formatter *f) const
+{
+  encode_json("RoleId", id , f);
+  // RoleName embeds the tenant inline as "<tenant>$<name>" so that
+  // decode_json() can split it back apart on the way in.
+  std::string role_name;
+  if (tenant.empty()) {
+    role_name = name;
+  } else {
+    role_name = tenant + '$' + name;
+  }
+  encode_json("RoleName", role_name , f);
+  encode_json("Path", path, f);
+  encode_json("Arn", arn, f);
+  encode_json("CreateDate", creation_date, f);
+  encode_json("MaxSessionDuration", max_session_duration, f);
+  encode_json("AssumeRolePolicyDocument", trust_policy, f);
+  // Optional sections: only emitted when non-empty.
+  if (!perm_policy_map.empty()) {
+    f->open_array_section("PermissionPolicies");
+    for (const auto& it : perm_policy_map) {
+      f->open_object_section("Policy");
+      encode_json("PolicyName", it.first, f);
+      encode_json("PolicyValue", it.second, f);
+      f->close_section();
+    }
+    f->close_section();
+  }
+  if (!tags.empty()) {
+    f->open_array_section("Tags");
+    for (const auto& it : tags) {
+      f->open_object_section("Tag");
+      encode_json("Key", it.first, f);
+      encode_json("Value", it.second, f);
+      f->close_section();
+    }
+    f->close_section();
+  }
+}
+
+// Inverse of dump(): populate this info record from AWS-IAM-style JSON.
+void RGWRoleInfo::decode_json(JSONObj *obj)
+{
+  JSONDecoder::decode_json("RoleId", id, obj);
+  JSONDecoder::decode_json("RoleName", name, obj);
+  JSONDecoder::decode_json("Path", path, obj);
+  JSONDecoder::decode_json("Arn", arn, obj);
+  JSONDecoder::decode_json("CreateDate", creation_date, obj);
+  JSONDecoder::decode_json("MaxSessionDuration", max_session_duration, obj);
+  JSONDecoder::decode_json("AssumeRolePolicyDocument", trust_policy, obj);
+
+  // Tags and PermissionPolicies are arrays of {Key,Value}/{PolicyName,
+  // PolicyValue} objects; both are optional in the input.
+  auto tags_iter = obj->find_first("Tags");
+  if (!tags_iter.end()) {
+    JSONObj* tags_json = *tags_iter;
+    auto iter = tags_json->find_first();
+
+    for (; !iter.end(); ++iter) {
+      std::string key, val;
+      JSONDecoder::decode_json("Key", key, *iter);
+      JSONDecoder::decode_json("Value", val, *iter);
+      this->tags.emplace(key, val);
+    }
+  }
+
+  auto perm_policy_iter = obj->find_first("PermissionPolicies");
+  if (!perm_policy_iter.end()) {
+    JSONObj* perm_policies = *perm_policy_iter;
+    auto iter = perm_policies->find_first();
+
+    for (; !iter.end(); ++iter) {
+      std::string policy_name, policy_val;
+      JSONDecoder::decode_json("PolicyName", policy_name, *iter);
+      JSONDecoder::decode_json("PolicyValue", policy_val, *iter);
+      this->perm_policy_map.emplace(policy_name, policy_val);
+    }
+  }
+
+  // A serialized RoleName of the form "<tenant>$<name>" carries the tenant
+  // inline (see dump()); split it back into the two fields.
+  if (auto pos = name.find('$'); pos != std::string::npos) {
+    tenant = name.substr(0, pos);
+    name = name.substr(pos+1);
+  }
+}
+
+// Construct a new (not yet persisted) role from user-supplied fields.
+// The path defaults to "/" and a tenant may be embedded in the name as
+// "<tenant>$<name>" (split out by extract_name_tenant).
+RGWRole::RGWRole(std::string name,
+                 std::string tenant,
+                 std::string path,
+                 std::string trust_policy,
+                 std::string max_session_duration_str,
+                 std::multimap<std::string,std::string> tags)
+{
+  info.name = std::move(name);
+  info.path = std::move(path);
+  info.trust_policy = std::move(trust_policy);
+  info.tenant = std::move(tenant);
+  info.tags = std::move(tags);
+  if (this->info.path.empty())
+    this->info.path = "/";
+  extract_name_tenant(this->info.name);
+  if (max_session_duration_str.empty()) {
+    info.max_session_duration = SESSION_DURATION_MIN;
+  } else {
+    // NOTE(review): std::stoull throws on non-numeric input — assumes the
+    // caller has already validated the string; confirm at call sites.
+    info.max_session_duration = std::stoull(max_session_duration_str);
+  }
+  info.mtime = real_time();
+}
+
+// Construct a role handle identified only by id; the rest of the record is
+// filled in by a subsequent read_info()/get_by_id().
+RGWRole::RGWRole(std::string id)
+{
+  info.id = std::move(id);
+}
+
+// Load the role by name: first resolve the name -> id index entry, then
+// read the full info record. Returns 0 on success or a negative errno.
+int RGWRole::get(const DoutPrefixProvider *dpp, optional_yield y)
+{
+  if (int r = read_name(dpp, y); r < 0) {
+    return r;
+  }
+  if (int r = read_info(dpp, y); r < 0) {
+    return r;
+  }
+  return 0;
+}
+
+// Load the role directly by id — no name-index lookup is needed.
+// Returns 0 on success or a negative errno.
+int RGWRole::get_by_id(const DoutPrefixProvider *dpp, optional_yield y)
+{
+  const int r = read_info(dpp, y);
+  return r < 0 ? r : 0;
+}
+
+// Delegate JSON serialization to the embedded info record.
+void RGWRole::dump(Formatter *f) const
+{
+  info.dump(f);
+}
+
+// Delegate JSON deserialization to the embedded info record.
+void RGWRole::decode_json(JSONObj *obj)
+{
+  info.decode_json(obj);
+}
+
+// Check that the configured max session duration is inside the AWS-style
+// allowed window [SESSION_DURATION_MIN, SESSION_DURATION_MAX].
+// The bounds are logged from the constants themselves so the message can
+// never drift from the real limits (previously hard-coded "3600 and 43200").
+bool RGWRole::validate_max_session_duration(const DoutPrefixProvider* dpp)
+{
+  if (info.max_session_duration < SESSION_DURATION_MIN ||
+      info.max_session_duration > SESSION_DURATION_MAX) {
+    ldpp_dout(dpp, 0) << "ERROR: Invalid session duration, should be between "
+                      << SESSION_DURATION_MIN << " and "
+                      << SESSION_DURATION_MAX << " seconds " << dendl;
+    return false;
+  }
+  return true;
+}
+
+// Validate the role's name/path (length + allowed character set) and the
+// max session duration. Returns false and logs on the first violation.
+bool RGWRole::validate_input(const DoutPrefixProvider* dpp)
+{
+  if (info.name.length() > MAX_ROLE_NAME_LEN) {
+    ldpp_dout(dpp, 0) << "ERROR: Invalid name length " << dendl;
+    return false;
+  }
+
+  if (info.path.length() > MAX_PATH_NAME_LEN) {
+    ldpp_dout(dpp, 0) << "ERROR: Invalid chars in path " << dendl;
+    return false;
+  }
+
+  // The patterns are immutable, so compile them once: std::regex
+  // construction is expensive and was previously re-done on every call.
+  static const std::regex regex_name("[A-Za-z0-9:=,.@-]+");
+  if (! std::regex_match(info.name, regex_name)) {
+    ldpp_dout(dpp, 0) << "ERROR: Invalid chars in name " << dendl;
+    return false;
+  }
+
+  // Path must be "/" or "/<printable-ASCII>/".
+  static const std::regex regex_path("(/[!-~]+/)|(/)");
+  if (! std::regex_match(info.path,regex_path)) {
+    ldpp_dout(dpp, 0) << "ERROR: Invalid chars in path " << dendl;
+    return false;
+  }
+
+  if (!validate_max_session_duration(dpp)) {
+    return false;
+  }
+  return true;
+}
+
+// Split a "<tenant>$<name>" compound into the tenant and name fields.
+// Strings without a '$' separator leave both fields untouched.
+void RGWRole::extract_name_tenant(const std::string& str) {
+  const auto sep = str.find('$');
+  if (sep == std::string::npos) {
+    return;
+  }
+  info.tenant = str.substr(0, sep);
+  info.name = str.substr(sep + 1);
+}
+
+// Persist the current info record over the existing one (non-exclusive
+// write). Returns 0 on success or the negative errno from store_info().
+int RGWRole::update(const DoutPrefixProvider *dpp, optional_yield y)
+{
+  int ret = store_info(dpp, false, y);
+  if (ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: storing info in Role pool: "
+                  << info.id << ": " << cpp_strerror(-ret) << dendl;
+    return ret;
+  }
+
+  return 0;
+}
+
+// Add or replace (in memory only) the inline policy named policy_name.
+void RGWRole::set_perm_policy(const string& policy_name, const string& perm_policy)
+{
+  info.perm_policy_map[policy_name] = perm_policy;
+}
+
+// Return the names of all inline permission policies attached to the role.
+vector<string> RGWRole::get_role_policy_names()
+{
+  vector<string> policy_names;
+  policy_names.reserve(info.perm_policy_map.size());
+  for (const auto& [policy_name, policy] : info.perm_policy_map) {
+    // The previous std::move(it.first) was a silent copy anyway (map keys
+    // are const); copy explicitly so the intent is clear.
+    policy_names.push_back(policy_name);
+  }
+
+  return policy_names;
+}
+
+// Look up the inline policy named policy_name and copy its document into
+// perm_policy. Returns 0 on success, -ENOENT (with a log line) if absent.
+int RGWRole::get_role_policy(const DoutPrefixProvider* dpp, const string& policy_name, string& perm_policy)
+{
+  auto found = info.perm_policy_map.find(policy_name);
+  if (found == info.perm_policy_map.end()) {
+    ldpp_dout(dpp, 0) << "ERROR: Policy name: " << policy_name << " not found" << dendl;
+    return -ENOENT;
+  }
+  perm_policy = found->second;
+  return 0;
+}
+
+// Remove (in memory only) the inline policy named policy_name.
+// Returns 0 on success, -ENOENT (with a log line) if absent.
+int RGWRole::delete_policy(const DoutPrefixProvider* dpp, const string& policy_name)
+{
+  auto found = info.perm_policy_map.find(policy_name);
+  if (found == info.perm_policy_map.end()) {
+    ldpp_dout(dpp, 0) << "ERROR: Policy name: " << policy_name << " not found" << dendl;
+    return -ENOENT;
+  }
+  info.perm_policy_map.erase(found);
+  return 0;
+}
+
+// Replace (in memory only) the role's assume-role trust policy document.
+// NOTE(review): the parameter is taken by non-const reference but is only
+// read here — could be const&; confirm no caller relies on the signature.
+void RGWRole::update_trust_policy(string& trust_policy)
+{
+  this->info.trust_policy = trust_policy;
+}
+
+// Merge tags_map into the role's tag set, enforcing the AWS-style cap of
+// 50 tags per role. The limit is now checked *before* inserting, so a
+// rejected request no longer leaves the role with a partially merged tag
+// set (previously the tags were emplaced first and -EINVAL returned after).
+int RGWRole::set_tags(const DoutPrefixProvider* dpp, const multimap<string,string>& tags_map)
+{
+  constexpr size_t MAX_ROLE_TAGS = 50;
+  // multimap::emplace always inserts, so the post-merge size is exactly
+  // the sum of the two sizes — the same condition the old code tested.
+  if (this->info.tags.size() + tags_map.size() > MAX_ROLE_TAGS) {
+    ldpp_dout(dpp, 0) << "No. of tags is greater than 50" << dendl;
+    return -EINVAL;
+  }
+  for (auto& it : tags_map) {
+    this->info.tags.emplace(it.first, it.second);
+  }
+  return 0;
+}
+
+// Return the role's tags, or boost::none when the role has no tags at all
+// (callers treat an empty set as "untagged").
+boost::optional<multimap<string,string>> RGWRole::get_tags()
+{
+  if (!this->info.tags.empty()) {
+    return this->info.tags;
+  }
+  return boost::none;
+}
+
+// Drop every tag (including duplicates) whose key appears in tagKeys.
+void RGWRole::erase_tags(const vector<string>& tagKeys)
+{
+  for (const auto& key : tagKeys) {
+    this->info.tags.erase(key);
+  }
+}
+
+// Set the max session duration from a decimal string; an empty string
+// resets it to the minimum.
+// NOTE(review): std::stoull throws on non-numeric input — assumes the
+// caller has validated the string; confirm at call sites.
+void RGWRole::update_max_session_duration(const std::string& max_session_duration_str)
+{
+  if (max_session_duration_str.empty()) {
+    info.max_session_duration = SESSION_DURATION_MIN;
+  } else {
+    info.max_session_duration = std::stoull(max_session_duration_str);
+  }
+}
+
+// Object-name prefix for the name -> id index entries.
+const string& RGWRole::get_names_oid_prefix()
+{
+  return role_name_oid_prefix;
+}
+
+// Object-name prefix for the per-role info records.
+const string& RGWRole::get_info_oid_prefix()
+{
+  return role_oid_prefix;
+}
+
+// Object-name prefix for the path index entries.
+const string& RGWRole::get_path_oid_prefix()
+{
+  return role_path_oid_prefix;
+}
+
+// Wire the role metadata handler to its backend: remember the SAL driver
+// and initialize the generic SObj machinery from the role service.
+RGWRoleMetadataHandler::RGWRoleMetadataHandler(Driver* driver,
+                                               RGWSI_Role_RADOS *role_svc)
+{
+  this->driver = driver;
+  base_init(role_svc->ctx(), role_svc->get_be_handler());
+}
+
+// Build a role metadata object from its JSON representation.
+// A malformed document yields nullptr rather than letting the decode
+// exception escape.
+RGWMetadataObject *RGWRoleMetadataHandler::get_meta_obj(JSONObj *jo,
+                                                        const obj_version& objv,
+                                                        const ceph::real_time& mtime)
+{
+  RGWRoleInfo info;
+  try {
+    info.decode_json(jo);
+  } catch (const JSONDecoder::err&) {
+    return nullptr;
+  }
+  return new RGWRoleMetadataObject(info, objv, mtime, driver);
+}
+
+// Fetch the role named by `entry` and wrap it in a metadata object for the
+// generic metadata GET path. Returns a negative errno on lookup failure.
+int RGWRoleMetadataHandler::do_get(RGWSI_MetaBackend_Handler::Op *op,
+                                   std::string& entry,
+                                   RGWMetadataObject **obj,
+                                   optional_yield y,
+                                   const DoutPrefixProvider *dpp)
+{
+  std::unique_ptr<rgw::sal::RGWRole> role = driver->get_role(entry);
+  int ret = role->read_info(dpp, y);
+  if (ret < 0) {
+    return ret;
+  }
+
+  // Bind by reference instead of copying the tracker and the whole
+  // RGWRoleInfo — RGWRoleMetadataObject's constructor takes its own copy
+  // of the info, so the intermediate copies were pure overhead.
+  const RGWObjVersionTracker& objv_tracker = role->get_objv_tracker();
+  const real_time& mtime = role->get_mtime();
+  RGWRoleInfo& info = role->get_info();
+
+  *obj = new RGWRoleMetadataObject(info, objv_tracker.read_version,
+                                   mtime, driver);
+
+  return 0;
+}
+
+// Delete the role named by `entry` for the generic metadata REMOVE path.
+// A role that no longer exists is treated as already removed (returns 0).
+int RGWRoleMetadataHandler::do_remove(RGWSI_MetaBackend_Handler::Op *op,
+                                      std::string& entry,
+                                      RGWObjVersionTracker& objv_tracker,
+                                      optional_yield y,
+                                      const DoutPrefixProvider *dpp)
+{
+  auto role = driver->get_role(entry);
+  const int ret = role->read_info(dpp, y);
+  if (ret == -ENOENT) {
+    return 0;
+  }
+  if (ret < 0) {
+    return ret;
+  }
+  return role->delete_obj(dpp, y);
+}
+
+// Put operation for role metadata: applies a decoded RGWRoleInfo by
+// creating the role, or updating it in place if it already exists
+// (e.g. on replays during metadata sync).
+class RGWMetadataHandlerPut_Role : public RGWMetadataHandlerPut_SObj
+{
+  RGWRoleMetadataHandler *rhandler; // owning handler
+  RGWRoleMetadataObject *mdo;       // the decoded object being stored
+public:
+  RGWMetadataHandlerPut_Role(RGWRoleMetadataHandler *handler,
+                             RGWSI_MetaBackend_Handler::Op *op,
+                             std::string& entry,
+                             RGWMetadataObject *obj,
+                             RGWObjVersionTracker& objv_tracker,
+                             optional_yield y,
+                             RGWMDLogSyncType type,
+                             bool from_remote_zone) :
+    RGWMetadataHandlerPut_SObj(handler, op, entry, obj, objv_tracker, y, type, from_remote_zone),
+    rhandler(handler) {
+    mdo = static_cast<RGWRoleMetadataObject*>(obj);
+  }
+
+  // Called by the generic put flow once version checks have passed.
+  int put_checked(const DoutPrefixProvider *dpp) override {
+    auto& info = mdo->get_role_info();
+    auto mtime = mdo->get_mtime();
+    auto* driver = mdo->get_driver();
+    info.mtime = mtime;
+    std::unique_ptr<rgw::sal::RGWRole> role = driver->get_role(info);
+    // Exclusive create first; fall back to a plain update when the role
+    // already exists.
+    int ret = role->create(dpp, true, info.id, y);
+    if (ret == -EEXIST) {
+      ret = role->update(dpp, y);
+    }
+
+    return ret < 0 ? ret : STATUS_APPLIED;
+  }
+};
+
+// Store the role metadata object via the generic SObj put flow, using the
+// role-specific put op defined above.
+int RGWRoleMetadataHandler::do_put(RGWSI_MetaBackend_Handler::Op *op,
+                                   std::string& entry,
+                                   RGWMetadataObject *obj,
+                                   RGWObjVersionTracker& objv_tracker,
+                                   optional_yield y,
+                                   const DoutPrefixProvider *dpp,
+                                   RGWMDLogSyncType type,
+                                   bool from_remote_zone)
+{
+  RGWMetadataHandlerPut_Role put_op(this, op, entry, obj, objv_tracker,
+                                    y, type, from_remote_zone);
+  return do_put_operate(&put_op, dpp);
+}
+
+} } // namespace rgw::sal
diff --git a/src/rgw/rgw_role.h b/src/rgw/rgw_role.h
new file mode 100644
index 000000000..9183829d9
--- /dev/null
+++ b/src/rgw/rgw_role.h
@@ -0,0 +1,209 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <string>
+
+#include "common/async/yield_context.h"
+
+#include "common/ceph_json.h"
+#include "common/ceph_context.h"
+#include "rgw_rados.h"
+#include "rgw_metadata.h"
+
+class RGWRados;
+
+namespace rgw { namespace sal {
+// Plain data record for an IAM role: identity, trust policy, inline
+// permission policies, tags and persistence bookkeeping. Encode/decode are
+// versioned (v3); the on-wire layout must not change without bumping it.
+struct RGWRoleInfo
+{
+  std::string id;            // unique role id (key of the info object)
+  std::string name;          // role name, unique per tenant
+  std::string path;          // IAM path, defaults to "/"
+  std::string arn;           // composed "arn:aws:iam::..." identifier
+  std::string creation_date;
+  std::string trust_policy;  // assume-role policy document (JSON text)
+  std::map<std::string, std::string> perm_policy_map; // name -> inline policy doc
+  std::string tenant;
+  uint64_t max_session_duration;
+  std::multimap<std::string,std::string> tags;
+  std::map<std::string, bufferlist> attrs;  // not serialized here
+  RGWObjVersionTracker objv_tracker;        // not serialized here
+  real_time mtime;                          // not serialized here
+
+  RGWRoleInfo() = default;
+
+  ~RGWRoleInfo() = default;
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(3, 1, bl);
+    encode(id, bl);
+    encode(name, bl);
+    encode(path, bl);
+    encode(arn, bl);
+    encode(creation_date, bl);
+    encode(trust_policy, bl);
+    encode(perm_policy_map, bl);
+    encode(tenant, bl);              // added in v2
+    encode(max_session_duration, bl); // added in v3
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(3, bl);
+    decode(id, bl);
+    decode(name, bl);
+    decode(path, bl);
+    decode(arn, bl);
+    decode(creation_date, bl);
+    decode(trust_policy, bl);
+    decode(perm_policy_map, bl);
+    // Fields below were added later; old records simply lack them.
+    if (struct_v >= 2) {
+      decode(tenant, bl);
+    }
+    if (struct_v >= 3) {
+      decode(max_session_duration, bl);
+    }
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(RGWRoleInfo)
+
+// Abstract SAL handle for an IAM role. Concrete stores (RADOS, dbstore, …)
+// implement the persistence primitives (store_*/read_*/create/delete_obj);
+// this base supplies the shared validation, accessors and policy/tag logic
+// operating on the embedded RGWRoleInfo.
+class RGWRole
+{
+public:
+  static const std::string role_name_oid_prefix;
+  static const std::string role_oid_prefix;
+  static const std::string role_path_oid_prefix;
+  static const std::string role_arn_prefix;
+  static constexpr int MAX_ROLE_NAME_LEN = 64;
+  static constexpr int MAX_PATH_NAME_LEN = 512;
+  static constexpr uint64_t SESSION_DURATION_MIN = 3600; // in seconds
+  static constexpr uint64_t SESSION_DURATION_MAX = 43200; // in seconds
+protected:
+  RGWRoleInfo info;
+public:
+  // Store-specific persistence primitives.
+  virtual int store_info(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y) = 0;
+  virtual int store_name(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y) = 0;
+  virtual int store_path(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y) = 0;
+  virtual int read_id(const DoutPrefixProvider *dpp, const std::string& role_name, const std::string& tenant, std::string& role_id, optional_yield y) = 0;
+  virtual int read_name(const DoutPrefixProvider *dpp, optional_yield y) = 0;
+  virtual int read_info(const DoutPrefixProvider *dpp, optional_yield y) = 0;
+  // Shared validation helpers (see rgw_role.cc for the rules).
+  bool validate_max_session_duration(const DoutPrefixProvider* dpp);
+  bool validate_input(const DoutPrefixProvider* dpp);
+  // Splits a "<tenant>$<name>" compound into the two info fields.
+  void extract_name_tenant(const std::string& str);
+
+  RGWRole(std::string name,
+          std::string tenant,
+          std::string path="",
+          std::string trust_policy="",
+          std::string max_session_duration_str="",
+          std::multimap<std::string,std::string> tags={});
+
+  explicit RGWRole(std::string id);
+
+  explicit RGWRole(const RGWRoleInfo& info) : info(info) {}
+
+  RGWRole() = default;
+
+  virtual ~RGWRole() = default;
+
+  // Read-only accessors over the embedded info record.
+  const std::string& get_id() const { return info.id; }
+  const std::string& get_name() const { return info.name; }
+  const std::string& get_tenant() const { return info.tenant; }
+  const std::string& get_path() const { return info.path; }
+  const std::string& get_create_date() const { return info.creation_date; }
+  const std::string& get_assume_role_policy() const { return info.trust_policy;}
+  const uint64_t& get_max_session_duration() const { return info.max_session_duration; }
+  const RGWObjVersionTracker& get_objv_tracker() const { return info.objv_tracker; }
+  const real_time& get_mtime() const { return info.mtime; }
+  std::map<std::string, bufferlist>& get_attrs() { return info.attrs; }
+  RGWRoleInfo& get_info() { return info; }
+
+  void set_id(const std::string& id) { this->info.id = id; }
+  void set_mtime(const real_time& mtime) { this->info.mtime = mtime; }
+
+  virtual int create(const DoutPrefixProvider *dpp, bool exclusive, const std::string &role_id, optional_yield y) = 0;
+  virtual int delete_obj(const DoutPrefixProvider *dpp, optional_yield y) = 0;
+  // Load by name (name index + info) or directly by id.
+  int get(const DoutPrefixProvider *dpp, optional_yield y);
+  int get_by_id(const DoutPrefixProvider *dpp, optional_yield y);
+  // Rewrite the stored info record with the in-memory state.
+  int update(const DoutPrefixProvider *dpp, optional_yield y);
+  // In-memory mutators; callers persist via update()/store_info().
+  void update_trust_policy(std::string& trust_policy);
+  void set_perm_policy(const std::string& policy_name, const std::string& perm_policy);
+  std::vector<std::string> get_role_policy_names();
+  int get_role_policy(const DoutPrefixProvider* dpp, const std::string& policy_name, std::string& perm_policy);
+  int delete_policy(const DoutPrefixProvider* dpp, const std::string& policy_name);
+  int set_tags(const DoutPrefixProvider* dpp, const std::multimap<std::string,std::string>& tags_map);
+  boost::optional<std::multimap<std::string,std::string>> get_tags();
+  void erase_tags(const std::vector<std::string>& tagKeys);
+  void update_max_session_duration(const std::string& max_session_duration_str);
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+
+  static const std::string& get_names_oid_prefix();
+  static const std::string& get_info_oid_prefix();
+  static const std::string& get_path_oid_prefix();
+};
+
+// Metadata-framework wrapper around an RGWRoleInfo snapshot, pairing it
+// with its object version, mtime and the SAL driver needed to apply it.
+class RGWRoleMetadataObject: public RGWMetadataObject {
+  RGWRoleInfo info;
+  Driver* driver;
+public:
+  RGWRoleMetadataObject() = default;
+  RGWRoleMetadataObject(RGWRoleInfo& info,
+                        const obj_version& v,
+                        real_time m,
+                        Driver* driver) : RGWMetadataObject(v,m), info(info), driver(driver) {}
+
+  void dump(Formatter *f) const override {
+    info.dump(f);
+  }
+
+  RGWRoleInfo& get_role_info() {
+    return info;
+  }
+
+  Driver* get_driver() {
+    return driver;
+  }
+};
+
+// Metadata handler for the "roles" section: translates between JSON role
+// representations and persisted RGWRole objects for metadata get/put/remove.
+class RGWRoleMetadataHandler: public RGWMetadataHandler_GenericMetaBE
+{
+public:
+  RGWRoleMetadataHandler(Driver* driver, RGWSI_Role_RADOS *role_svc);
+
+  std::string get_type() final { return "roles"; }
+
+  // Decode a JSON document into a role metadata object (nullptr on error).
+  RGWMetadataObject *get_meta_obj(JSONObj *jo,
+                                  const obj_version& objv,
+                                  const ceph::real_time& mtime);
+
+  int do_get(RGWSI_MetaBackend_Handler::Op *op,
+             std::string& entry,
+             RGWMetadataObject **obj,
+             optional_yield y,
+             const DoutPrefixProvider *dpp) final;
+
+  int do_remove(RGWSI_MetaBackend_Handler::Op *op,
+                std::string& entry,
+                RGWObjVersionTracker& objv_tracker,
+                optional_yield y,
+                const DoutPrefixProvider *dpp) final;
+
+  // Declaration parameter renamed from the typo "entr" to "entry" to match
+  // the definition in rgw_role.cc; declaration-only, no ABI/caller impact.
+  int do_put(RGWSI_MetaBackend_Handler::Op *op,
+             std::string& entry,
+             RGWMetadataObject *obj,
+             RGWObjVersionTracker& objv_tracker,
+             optional_yield y,
+             const DoutPrefixProvider *dpp,
+             RGWMDLogSyncType type,
+             bool from_remote_zone) override;
+
+private:
+  Driver* driver;
+};
+} } // namespace rgw::sal
diff --git a/src/rgw/rgw_s3select.cc b/src/rgw/rgw_s3select.cc
new file mode 100644
index 000000000..c7eaa6984
--- /dev/null
+++ b/src/rgw/rgw_s3select.cc
@@ -0,0 +1,1001 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_s3select_private.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+namespace rgw::s3select {
+// Factory used by the S3 REST dispatcher to instantiate the SELECT op;
+// ownership passes to the op framework (SAL convention for RGWOp).
+RGWOp* create_s3select_op()
+{
+  return new RGWSelectObj_ObjStore_S3();
+}
+};
+
+using namespace s3selectEngine;
+
+// Mutable access to the in-progress response buffer (holds framing bytes,
+// event-stream headers and the SQL result payload).
+std::string& aws_response_handler::get_sql_result()
+{
+  return sql_result;
+}
+
+// Total input bytes processed so far (reported in Progress/Stats events).
+uint64_t aws_response_handler::get_processed_size()
+{
+  return processed_size;
+}
+
+// Accumulates (adds) `value` onto the processed-bytes counter.
+void aws_response_handler::update_processed_size(uint64_t value)
+{
+  processed_size += value;
+}
+
+// Total payload bytes returned to the client so far.
+uint64_t aws_response_handler::get_total_bytes_returned()
+{
+  return total_bytes_returned;
+}
+
+// Overwrites (does not accumulate, unlike update_processed_size) the
+// bytes-returned counter with `value`.
+void aws_response_handler::update_total_bytes_returned(uint64_t value)
+{
+  total_bytes_returned = value;
+}
+
+// Append one AWS event-stream header to m_buff_header in wire format:
+// [name-len:1][name][value-type:1][value-len:2 big-endian][value].
+void aws_response_handler::push_header(const char* header_name, const char* header_value)
+{
+  char x;
+  short s;
+  // NOTE(review): assumes the header name length fits in a (possibly
+  // signed) char — callers only pass short fixed names; confirm.
+  x = char(strlen(header_name));
+  m_buff_header.append(&x, sizeof(x));
+  m_buff_header.append(header_name);
+  x = char(7);  // header-value type 7 = string in the event-stream encoding
+  m_buff_header.append(&x, sizeof(x));
+  s = htons(uint16_t(strlen(header_value)));  // value length, big-endian
+  m_buff_header.append(reinterpret_cast<char*>(&s), sizeof(s));
+  m_buff_header.append(header_value);
+}
+
+#define IDX( x ) static_cast<int>( x )
+
+// Build the header block for a "Records" event; returns its byte size.
+int aws_response_handler::create_header_records()
+{
+  //headers description(AWS)
+  //[header-name-byte-length:1][header-name:variable-length][header-value-type:1][header-value:variable-length]
+  //1
+  push_header(header_name_str[IDX(header_name_En::EVENT_TYPE)], header_value_str[IDX(header_value_En::RECORDS)]);
+  //2
+  push_header(header_name_str[IDX(header_name_En::CONTENT_TYPE)], header_value_str[IDX(header_value_En::OCTET_STREAM)]);
+  //3
+  push_header(header_name_str[IDX(header_name_En::MESSAGE_TYPE)], header_value_str[IDX(header_value_En::EVENT)]);
+  return m_buff_header.size();
+}
+
+// Build the header block for a "Cont" (keep-alive) event; returns its size.
+int aws_response_handler::create_header_continuation()
+{
+  //headers description(AWS)
+  //1
+  push_header(header_name_str[IDX(header_name_En::EVENT_TYPE)], header_value_str[IDX(header_value_En::CONT)]);
+  //2
+  push_header(header_name_str[IDX(header_name_En::MESSAGE_TYPE)], header_value_str[IDX(header_value_En::EVENT)]);
+  return m_buff_header.size();
+}
+
+// Build the header block for a "Progress" event (XML body); returns its size.
+int aws_response_handler::create_header_progress()
+{
+  //headers description(AWS)
+  //1
+  push_header(header_name_str[IDX(header_name_En::EVENT_TYPE)], header_value_str[IDX(header_value_En::PROGRESS)]);
+  //2
+  push_header(header_name_str[IDX(header_name_En::CONTENT_TYPE)], header_value_str[IDX(header_value_En::XML)]);
+  //3
+  push_header(header_name_str[IDX(header_name_En::MESSAGE_TYPE)], header_value_str[IDX(header_value_En::EVENT)]);
+  return m_buff_header.size();
+}
+
+// Build the header block for a "Stats" event (XML body); returns its size.
+int aws_response_handler::create_header_stats()
+{
+  //headers description(AWS)
+  //1
+  push_header(header_name_str[IDX(header_name_En::EVENT_TYPE)], header_value_str[IDX(header_value_En::STATS)]);
+  //2
+  push_header(header_name_str[IDX(header_name_En::CONTENT_TYPE)], header_value_str[IDX(header_value_En::XML)]);
+  //3
+  push_header(header_name_str[IDX(header_name_En::MESSAGE_TYPE)], header_value_str[IDX(header_value_En::EVENT)]);
+  return m_buff_header.size();
+}
+
+// Build the header block for the terminal "End" event; returns its size.
+int aws_response_handler::create_header_end()
+{
+  //headers description(AWS)
+  //1
+  push_header(header_name_str[IDX(header_name_En::EVENT_TYPE)], header_value_str[IDX(header_value_En::END)]);
+  //2
+  push_header(header_name_str[IDX(header_name_En::MESSAGE_TYPE)], header_value_str[IDX(header_value_En::EVENT)]);
+  return m_buff_header.size();
+}
+
+// Build the header block for an error message carrying `error_message`;
+// returns its byte size.
+int aws_response_handler::create_error_header_records(const char* error_message)
+{
+  //headers description(AWS)
+  //[header-name-byte-length:1][header-name:variable-length][header-value-type:1][header-value:variable-length]
+  //1
+  push_header(header_name_str[IDX(header_name_En::ERROR_CODE)], header_value_str[IDX(header_value_En::ENGINE_ERROR)]);
+  //2
+  push_header(header_name_str[IDX(header_name_En::ERROR_MESSAGE)], error_message);
+  //3
+  push_header(header_name_str[IDX(header_name_En::MESSAGE_TYPE)], header_value_str[IDX(header_value_En::ERROR_TYPE)]);
+  return m_buff_header.size();
+}
+
+// Finalize sql_result into a complete event-stream message and return its
+// size. The first 12 bytes of sql_result were pre-reserved (init_response)
+// for total-length, header-length and the prelude CRC; the trailing
+// message CRC is appended here.
+int aws_response_handler::create_message(u_int32_t header_len)
+{
+  //message description(AWS):
+  //[total-byte-length:4][header-byte-length:4][crc:4][headers:variable-length][payload:variable-length][crc:4]
+  //s3select result is produced into sql_result, the sql_result is also the response-message, thus the attach headers and CRC
+  //are created later to the produced SQL result, and actually wrapping the payload.
+  auto push_encode_int = [&](u_int32_t s, int pos) {
+    // write s big-endian into the pre-reserved slot at `pos`
+    u_int32_t x = htonl(s);
+    sql_result.replace(pos, sizeof(x), reinterpret_cast<char*>(&x), sizeof(x));
+  };
+  u_int32_t total_byte_len = 0;
+  u_int32_t preload_crc = 0;
+  u_int32_t message_crc = 0;
+  total_byte_len = sql_result.size() + 4; //the total is greater in 4 bytes than current size
+  push_encode_int(total_byte_len, 0);
+  push_encode_int(header_len, 4);
+  crc32.reset();
+  crc32 = std::for_each(sql_result.data(), sql_result.data() + 8, crc32); //crc for starting 8 bytes
+  preload_crc = crc32();
+  push_encode_int(preload_crc, 8);
+  crc32.reset();
+  crc32 = std::for_each(sql_result.begin(), sql_result.end(), crc32); //crc for payload + checksum
+  message_crc = crc32();
+  // trailing CRC is appended (not replaced) — it sits past the payload
+  u_int32_t x = htonl(message_crc);
+  sql_result.append(reinterpret_cast<char*>(&x), sizeof(x));
+  return sql_result.size();
+}
+
+// Reserve the 12-byte message prelude (lengths + CRC) at the front of the
+// response buffer; create_message() fills it in later.
+void aws_response_handler::init_response()
+{
+  //12 positions for header-crc
+  sql_result.resize(header_crc_size, '\0');
+}
+
+// Start a "Records" event: append its headers after the reserved prelude;
+// the SQL payload is appended afterwards by the query engine.
+void aws_response_handler::init_success_response()
+{
+  m_buff_header.clear();
+  header_size = create_header_records();
+  sql_result.append(m_buff_header.c_str(), header_size);
+#ifdef PAYLOAD_TAG
+  sql_result.append(PAYLOAD_LINE);
+#endif
+}
+
+// Emit a complete "Cont" keep-alive event to the client and flush it.
+void aws_response_handler::send_continuation_response()
+{
+  sql_result.resize(header_crc_size, '\0');  // reset buffer to the prelude slot
+  m_buff_header.clear();
+  header_size = create_header_continuation();
+  sql_result.append(m_buff_header.c_str(), header_size);
+  int buff_len = create_message(header_size);
+  s->formatter->write_bin_data(sql_result.data(), buff_len);
+  rgw_flush_formatter_and_reset(s, s->formatter);
+}
+
+// Start a "Progress" event; the XML body and framing are added by
+// send_progress_response().
+void aws_response_handler::init_progress_response()
+{
+  sql_result.resize(header_crc_size, '\0');
+  m_buff_header.clear();
+  header_size = create_header_progress();
+  sql_result.append(m_buff_header.c_str(), header_size);
+}
+
+// Start a "Stats" event; the XML body and framing are added by
+// send_stats_response().
+void aws_response_handler::init_stats_response()
+{
+  sql_result.resize(header_crc_size, '\0');
+  m_buff_header.clear();
+  header_size = create_header_stats();
+  sql_result.append(m_buff_header.c_str(), header_size);
+}
+
+// Emit the terminal "End" event (built and sent in one step) and flush.
+void aws_response_handler::init_end_response()
+{
+  sql_result.resize(header_crc_size, '\0');
+  m_buff_header.clear();
+  header_size = create_header_end();
+  sql_result.append(m_buff_header.c_str(), header_size);
+  int buff_len = create_message(header_size);
+  s->formatter->write_bin_data(sql_result.data(), buff_len);
+  rgw_flush_formatter_and_reset(s, s->formatter);
+}
+
+// Start an error event carrying `error_message` in its headers.
+void aws_response_handler::init_error_response(const char* error_message)
+{
+  //currently not in use. the headers in the case of error, are not extracted by AWS-cli.
+  m_buff_header.clear();
+  header_size = create_error_header_records(error_message);
+  sql_result.append(m_buff_header.c_str(), header_size);
+}
+
+// Frame the accumulated "Records" payload into a message and flush it.
+void aws_response_handler::send_success_response()
+{
+#ifdef PAYLOAD_TAG
+  sql_result.append(END_PAYLOAD_LINE);
+#endif
+  int buff_len = create_message(header_size);
+  s->formatter->write_bin_data(sql_result.data(), buff_len);
+  rgw_flush_formatter_and_reset(s, s->formatter);
+}
+
+// Send a plain-XML S3 error document (HTTP 400) instead of an event stream.
+// NOTE(review): the Resource element is intentionally the placeholder
+// "#Resource#" and resource_id is reported as RequestId — matches the
+// existing response shape; confirm before changing.
+void aws_response_handler::send_error_response(const char* error_code,
+                                               const char* error_message,
+                                               const char* resource_id)
+{
+  set_req_state_err(s, 0);
+  dump_errno(s, 400);
+  end_header(s, m_rgwop, "application/xml", CHUNKED_TRANSFER_ENCODING);
+  dump_start(s);
+  s->formatter->open_object_section("Error");
+  s->formatter->dump_string("Code", error_code);
+  s->formatter->dump_string("Message", error_message);
+  s->formatter->dump_string("Resource", "#Resource#");
+  s->formatter->dump_string("RequestId", resource_id);
+  s->formatter->close_section();
+  rgw_flush_formatter_and_reset(s, s->formatter);
+}
+
+// Append the Progress XML body (scanned == processed here), frame and flush.
+void aws_response_handler::send_progress_response()
+{
+  std::string progress_payload = fmt::format("<?xml version=\"1.0\" encoding=\"UTF-8\"?><Progress><BytesScanned>{}</BytesScanned><BytesProcessed>{}</BytesProcessed><BytesReturned>{}</BytesReturned></Progress>"
+                                 , get_processed_size(), get_processed_size(), get_total_bytes_returned());
+  sql_result.append(progress_payload);
+  int buff_len = create_message(header_size);
+  s->formatter->write_bin_data(sql_result.data(), buff_len);
+  rgw_flush_formatter_and_reset(s, s->formatter);
+}
+
+// Append the Stats XML body (scanned == processed here), frame and flush.
+void aws_response_handler::send_stats_response()
+{
+  std::string stats_payload = fmt::format("<?xml version=\"1.0\" encoding=\"UTF-8\"?><Stats><BytesScanned>{}</BytesScanned><BytesProcessed>{}</BytesProcessed><BytesReturned>{}</BytesReturned></Stats>"
+                              , get_processed_size(), get_processed_size(), get_total_bytes_returned());
+  sql_result.append(stats_payload);
+  int buff_len = create_message(header_size);
+  s->formatter->write_bin_data(sql_result.data(), buff_len);
+  rgw_flush_formatter_and_reset(s, s->formatter);
+}
+
+// Construct the SELECT op: initialize state flags and wire up the callback
+// lambdas used by the s3select engine (object-size/range access, response
+// formatting, debug logging and chunked-transfer setup).
+RGWSelectObj_ObjStore_S3::RGWSelectObj_ObjStore_S3():
+  m_buff_header(std::make_unique<char[]>(1000)),
+  m_scan_range_ind(false),
+  m_start_scan_sz(0),
+  m_end_scan_sz(0),
+  m_object_size_for_processing(0),
+  m_parquet_type(false),
+  m_json_type(false),
+  chunk_number(0),
+  m_requested_range(0),
+  m_scan_offset(1024),
+  m_skip_next_chunk(false),
+  m_is_trino_request(false)
+{
+  set_get_data(true);
+  // Callbacks capture `this`; they must not outlive the op (they don't —
+  // they are invoked synchronously during execute()).
+  fp_get_obj_size = [&]() {
+    return get_obj_size();
+  };
+  fp_range_req = [&](int64_t start, int64_t len, void* buff, optional_yield* y) {
+    ldout(s->cct, 10) << "S3select: range-request start: " << start << " length: " << len << dendl;
+    auto status = range_request(start, len, buff, *y);
+    return status;
+  };
+#ifdef _ARROW_EXIST
+  // Parquet support (via Arrow) reads the object by explicit ranges.
+  m_rgw_api.set_get_size_api(fp_get_obj_size);
+  m_rgw_api.set_range_req_api(fp_range_req);
+#endif
+  fp_result_header_format = [this](std::string& result) {
+    m_aws_response_handler.init_response();
+    m_aws_response_handler.init_success_response();
+    return 0;
+  };
+  fp_s3select_result_format = [this](std::string& result) {
+    fp_chunked_transfer_encoding();
+    m_aws_response_handler.send_success_response();
+    return 0;
+  };
+
+  fp_debug_mesg = [&](const char* mesg){
+    ldpp_dout(this, 10) << mesg << dendl;
+  };
+
+  fp_chunked_transfer_encoding = [&](void){
+    // Headers are emitted exactly once, before the first chunk.
+    if (chunk_number == 0) {
+      if (op_ret < 0) {
+        set_req_state_err(s, op_ret);
+      }
+      dump_errno(s);
+      // Explicitly use chunked transfer encoding so that we can stream the result
+      // to the user without having to wait for the full length of it.
+      end_header(s, this, "application/xml", CHUNKED_TRANSFER_ENCODING);
+    }
+    chunk_number++;
+  };
+}
+
+// All members are RAII-managed; the defaulted destructor is sufficient.
+RGWSelectObj_ObjStore_S3::~RGWSelectObj_ObjStore_S3() = default;
+
+/*
+ * Read the SelectObjectContent XML payload from the request body, detect a
+ * Trino client (which needs special chunk shaping), and parse the AWS CLI
+ * parameters out of the payload.
+ * Returns 0 on success, the read_all_input error, or -1 on an empty payload.
+ */
+int RGWSelectObj_ObjStore_S3::get_params(optional_yield y)
+{
+  if(m_s3select_query.empty() == false) {
+    //the payload was already retrieved; nothing more to do
+    return 0;
+  }
+#ifndef _ARROW_EXIST
+  m_parquet_type = false;
+  ldpp_dout(this, 10) << "arrow library is not installed" << dendl;
+#endif
+
+  //retrieve s3-select query from payload
+  bufferlist data;
+  int ret;
+  int max_size = 4096;
+  std::tie(ret, data) = read_all_input(s, max_size, false);
+  if (ret != 0) {
+    ldpp_dout(this, 10) << "s3-select query: failed to retrieve query; ret = " << ret << dendl;
+    return ret;
+  }
+  m_s3select_query = data.to_str();
+  if (m_s3select_query.length() > 0) {
+    ldpp_dout(this, 10) << "s3-select query: " << m_s3select_query << dendl;
+  } else {
+    ldpp_dout(this, 10) << "s3-select query: failed to retrieve query;" << dendl;
+    return -1;
+  }
+  //Trino requests are handled specially: see shape_chunk_per_trino_requests()
+  const auto& m = s->info.env->get_map();
+  auto user_agent = m.find("HTTP_USER_AGENT");
+  if (user_agent != m.end() && user_agent->second.find("Trino") != std::string::npos) {
+    m_is_trino_request = true;
+    ldpp_dout(this, 10) << "s3-select query: request sent by Trino." << dendl;
+  }
+
+  int status = handle_aws_cli_parameters(m_sql_query);
+  if (status<0) {
+    return status;
+  }
+  return RGWGetObj_ObjStore_S3::get_params(y);
+}
+
+/*
+ * Execute the SQL statement on one CSV chunk.
+ * Applies the input/output serialization options parsed from the request,
+ * runs the s3select engine on [input, input+input_length), and streams the
+ * result (or a syntax/processing error event) back to the client.
+ * Returns the engine status; -1 on syntax or processing errors.
+ */
+int RGWSelectObj_ObjStore_S3::run_s3select_on_csv(const char* query, const char* input, size_t input_length)
+{
+  int status = 0;
+  uint32_t length_before_processing, length_post_processing;
+  csv_object::csv_defintions csv;
+  const char* s3select_syntax_error = "s3select-Syntax-Error";
+  const char* s3select_resource_id = "resourcse-id";
+  const char* s3select_processTime_error = "s3select-ProcessingTime-Error";
+
+  s3select_syntax.parse_query(query);
+  //each delimiter/quote option is a single character; only override the
+  //engine defaults when the request supplied a value
+  if (m_row_delimiter.size()) {
+    csv.row_delimiter = *m_row_delimiter.c_str();
+  }
+  if (m_column_delimiter.size()) {
+    csv.column_delimiter = *m_column_delimiter.c_str();
+  }
+  if (m_quot.size()) {
+    csv.quot_char = *m_quot.c_str();
+  }
+  if (m_escape_char.size()) {
+    csv.escape_char = *m_escape_char.c_str();
+  }
+  if (output_row_delimiter.size()) {
+    csv.output_row_delimiter = *output_row_delimiter.c_str();
+  }
+  if (output_column_delimiter.size()) {
+    csv.output_column_delimiter = *output_column_delimiter.c_str();
+  }
+  if (output_quot.size()) {
+    csv.output_quot_char = *output_quot.c_str();
+  }
+  if (output_escape_char.size()) {
+    csv.output_escape_char = *output_escape_char.c_str();
+  }
+  if(output_quote_fields.compare("ALWAYS") == 0) {
+    csv.quote_fields_always = true;
+  } else if(output_quote_fields.compare("ASNEEDED") == 0) {
+    csv.quote_fields_asneeded = true;
+  }
+  if(m_header_info.compare("IGNORE")==0) {
+    csv.ignore_header_info=true;
+  } else if(m_header_info.compare("USE")==0) {
+    csv.use_header_info=true;
+  }
+  //m_s3_csv_object.set_external_debug_system(fp_debug_mesg);
+  m_s3_csv_object.set_result_formatters(fp_s3select_result_format,fp_result_header_format);
+  m_s3_csv_object.set_csv_query(&s3select_syntax, csv);
+  if (s3select_syntax.get_error_description().empty() == false) {
+    //error-flow (syntax-error)
+    m_aws_response_handler.send_error_response(s3select_syntax_error,
+        s3select_syntax.get_error_description().c_str(),
+        s3select_resource_id);
+    ldpp_dout(this, 10) << "s3-select query: failed to prase the following query {" << query << "}" << dendl;
+    ldpp_dout(this, 10) << "s3-select query: syntax-error {" << s3select_syntax.get_error_description() << "}" << dendl;
+    return -1;
+  } else {
+    if (input == nullptr) {
+      //nullptr marks an empty object; the engine handles a zero-length stream
+      input = "";
+    }
+    fp_result_header_format(m_aws_response_handler.get_sql_result());
+    length_before_processing = m_s3_csv_object.get_return_result_size();
+    //query is correct(syntax), processing is starting.
+    status = m_s3_csv_object.run_s3select_on_stream(m_aws_response_handler.get_sql_result(), input, input_length, m_object_size_for_processing);
+    length_post_processing = m_s3_csv_object.get_return_result_size();
+    m_aws_response_handler.update_total_bytes_returned( m_s3_csv_object.get_return_result_size() );
+
+    if (status < 0) {
+      //error flow(processing-time)
+      m_aws_response_handler.send_error_response(s3select_processTime_error,
+          m_s3_csv_object.get_error_description().c_str(),
+          s3select_resource_id);
+      ldpp_dout(this, 10) << "s3-select query: failed to process query; {" << m_s3_csv_object.get_error_description() << "}" << dendl;
+      return -1;
+    }
+
+  }
+  //no bytes were produced by this chunk: send a continuation event to keep
+  //the client connection alive
+  if ((length_post_processing-length_before_processing) != 0) {
+    ldpp_dout(this, 10) << "s3-select: sql-result-size = " << m_aws_response_handler.get_sql_result().size() << dendl;
+  } else {
+    m_aws_response_handler.send_continuation_response();
+  }
+  ldpp_dout(this, 10) << "s3-select: complete chunk processing : chunk length = " << input_length << dendl;
+  if (enable_progress == true) {
+    fp_chunked_transfer_encoding();
+    m_aws_response_handler.init_progress_response();
+    m_aws_response_handler.send_progress_response();
+  }
+  return status;
+}
+
+/*
+ * Execute the SQL statement on a Parquet object.
+ * Unlike CSV/JSON, the Parquet path is pull-driven: the reader issues its own
+ * range requests (via m_rgw_api) for the object metadata and column chunks.
+ * Compiled to a no-op returning 0 when arrow is not available.
+ */
+int RGWSelectObj_ObjStore_S3::run_s3select_on_parquet(const char* query)
+{
+  int status = 0;
+#ifdef _ARROW_EXIST
+  if (!m_s3_parquet_object.is_set()) {
+    //parsing the SQL statement.
+    //NOTE(review): execute() already parsed m_sql_query before calling here —
+    //this second parse looks redundant; confirm parse_query is idempotent.
+    s3select_syntax.parse_query(m_sql_query.c_str());
+    //m_s3_parquet_object.set_external_debug_system(fp_debug_mesg);
+    try {
+      //at this stage the Parquet-processing requires for the meta-data that reside on Parquet object
+      m_s3_parquet_object.set_parquet_object(std::string("s3object"), &s3select_syntax, &m_rgw_api);
+    } catch(base_s3select_exception& e) {
+      ldpp_dout(this, 10) << "S3select: failed upon parquet-reader construction: " << e.what() << dendl;
+      fp_result_header_format(m_aws_response_handler.get_sql_result());
+      m_aws_response_handler.get_sql_result().append(e.what());
+      fp_s3select_result_format(m_aws_response_handler.get_sql_result());
+      return -1;
+    }
+  }
+  if (s3select_syntax.get_error_description().empty() == false) {
+    //the SQL statement failed the syntax parser
+    fp_result_header_format(m_aws_response_handler.get_sql_result());
+    m_aws_response_handler.get_sql_result().append(s3select_syntax.get_error_description().data());
+    fp_s3select_result_format(m_aws_response_handler.get_sql_result());
+    ldpp_dout(this, 10) << "s3-select query: failed to prase query; {" << s3select_syntax.get_error_description() << "}" << dendl;
+    status = -1;
+  } else {
+    fp_result_header_format(m_aws_response_handler.get_sql_result());
+    //at this stage the Parquet-processing "takes control", it keep calling to s3-range-request according to the SQL statement.
+    status = m_s3_parquet_object.run_s3select_on_object(m_aws_response_handler.get_sql_result(), fp_s3select_result_format, fp_result_header_format);
+    if (status < 0) {
+      m_aws_response_handler.get_sql_result().append(m_s3_parquet_object.get_error_description());
+      fp_s3select_result_format(m_aws_response_handler.get_sql_result());
+      ldout(s->cct, 10) << "S3select: failure while execution" << m_s3_parquet_object.get_error_description() << dendl;
+    }
+  }
+#endif
+  return status;
+}
+
+/*
+ * Execute the SQL statement on one JSON chunk.
+ * Validates the JSON data-type (only DOCUMENT is supported), parses the SQL,
+ * runs the engine on [input, input+input_length) and streams the result or an
+ * error event. Returns the engine status; -EINVAL on any error.
+ */
+int RGWSelectObj_ObjStore_S3::run_s3select_on_json(const char* query, const char* input, size_t input_length)
+{
+  int status = 0;
+
+  const char* s3select_processTime_error = "s3select-ProcessingTime-Error";
+  const char* s3select_syntax_error = "s3select-Syntax-Error";
+  const char* s3select_resource_id = "resourcse-id";
+  const char* s3select_json_error = "json-Format-Error";
+
+  m_aws_response_handler.init_response();
+
+  //the JSON data-type should be(currently) only DOCUMENT
+  if (m_json_datatype.compare("DOCUMENT") != 0) {
+    const char* s3select_json_error_msg = "s3-select query: wrong json dataType should use DOCUMENT; ";
+    m_aws_response_handler.send_error_response(s3select_json_error,
+        s3select_json_error_msg,
+        s3select_resource_id);
+    ldpp_dout(this, 10) << s3select_json_error_msg << dendl;
+    return -EINVAL;
+  }
+
+  //parsing the SQL statement
+  s3select_syntax.parse_query(m_sql_query.c_str());
+  if (s3select_syntax.get_error_description().empty() == false) {
+    //SQL statement is wrong(syntax).
+    m_aws_response_handler.send_error_response(s3select_syntax_error,
+        s3select_syntax.get_error_description().c_str(),
+        s3select_resource_id);
+    ldpp_dout(this, 10) << "s3-select query: failed to prase query; {" << s3select_syntax.get_error_description() << "}" << dendl;
+    return -EINVAL;
+  }
+
+  //initializing json processor
+  m_s3_json_object.set_json_query(&s3select_syntax);
+
+  if (input == nullptr) {
+    //nullptr marks an empty object / final flush; the engine accepts a zero-length stream
+    input = "";
+  }
+  m_aws_response_handler.init_success_response();
+  uint32_t length_before_processing = m_aws_response_handler.get_sql_result().size();
+  //query is correct(syntax), processing is starting.
+  try {
+    status = m_s3_json_object.run_s3select_on_stream(m_aws_response_handler.get_sql_result(), input, input_length, m_object_size_for_processing);
+  } catch(base_s3select_exception& e) {
+    ldpp_dout(this, 10) << "S3select: failed to process JSON object: " << e.what() << dendl;
+    m_aws_response_handler.get_sql_result().append(e.what());
+    m_aws_response_handler.send_error_response(s3select_processTime_error,
+        e.what(),
+        s3select_resource_id);
+    return -EINVAL;
+  }
+  uint32_t length_post_processing = m_aws_response_handler.get_sql_result().size();
+  m_aws_response_handler.update_total_bytes_returned(length_post_processing - length_before_processing);
+  if (status < 0) {
+    //error flow(processing-time)
+    m_aws_response_handler.send_error_response(s3select_processTime_error,
+        m_s3_json_object.get_error_description().c_str(),
+        s3select_resource_id);
+    ldpp_dout(this, 10) << "s3-select query: failed to process query; {" << m_s3_json_object.get_error_description() << "}" << dendl;
+    return -EINVAL;
+  }
+  fp_chunked_transfer_encoding();
+
+  //no output for this chunk: send a continuation event to keep the stream alive
+  if (length_post_processing-length_before_processing != 0) {
+    m_aws_response_handler.send_success_response();
+  } else {
+    m_aws_response_handler.send_continuation_response();
+  }
+  if (enable_progress == true) {
+    m_aws_response_handler.init_progress_response();
+    m_aws_response_handler.send_progress_response();
+  }
+
+  return status;
+}
+
+/*
+ * Parse the SelectObjectContent XML payload: unescape the SQL expression,
+ * detect the input format (CSV/JSON/Parquet), and extract the
+ * InputSerialization / OutputSerialization / ScanRange options.
+ * Returns 0 on success, -1 on a malformed payload or unsupported option.
+ * Fixes vs. previous revision:
+ *  - the OutputSerialization close-tag was searched starting from the
+ *    InputSerialization position (_qi) instead of the output section (_qo);
+ *  - missing section tags made substr() operate on npos-derived positions
+ *    (possible std::out_of_range on short payloads); now guarded;
+ *  - std::stol on a non-numeric Start/End no longer escapes as an
+ *    uncaught exception.
+ */
+int RGWSelectObj_ObjStore_S3::handle_aws_cli_parameters(std::string& sql_query)
+{
+  std::string input_tag{"InputSerialization"};
+  std::string output_tag{"OutputSerialization"};
+  if (chunk_number !=0) {
+    //the payload is parsed only once, upon the first chunk
+    return 0;
+  }
+#define GT "&gt;"
+#define LT "&lt;"
+#define APOS "&apos;"
+
+  //the SQL expression arrives XML-escaped; restore the original characters
+  if (m_s3select_query.find(GT) != std::string::npos) {
+    boost::replace_all(m_s3select_query, GT, ">");
+  }
+  if (m_s3select_query.find(LT) != std::string::npos) {
+    boost::replace_all(m_s3select_query, LT, "<");
+  }
+  if (m_s3select_query.find(APOS) != std::string::npos) {
+    boost::replace_all(m_s3select_query, APOS, "'");
+  }
+  //AWS cli s3select parameters
+  if (m_s3select_query.find(input_tag+"><CSV") != std::string::npos) {
+    ldpp_dout(this, 10) << "s3select: engine is set to process CSV objects" << dendl;
+  }
+  else if (m_s3select_query.find(input_tag+"><JSON") != std::string::npos) {
+    m_json_type=true;
+    ldpp_dout(this, 10) << "s3select: engine is set to process JSON objects" << dendl;
+  } else if (m_s3select_query.find(input_tag+"><Parquet") != std::string::npos) {
+    m_parquet_type=true;
+    ldpp_dout(this, 10) << "s3select: engine is set to process Parquet objects" << dendl;
+  }
+
+  extract_by_tag(m_s3select_query, "Expression", sql_query);
+  extract_by_tag(m_s3select_query, "Enabled", m_enable_progress);
+  size_t _qi = m_s3select_query.find("<" + input_tag + ">", 0);
+  size_t _qe = m_s3select_query.find("</" + input_tag + ">", _qi);
+  if (_qi == std::string::npos || _qe == std::string::npos) {
+    //malformed payload: InputSerialization section is mandatory
+    ldpp_dout(this, 10) << "s3select: failed to locate " << input_tag << " in the request payload" << dendl;
+    return -1;
+  }
+  m_s3select_input = m_s3select_query.substr(_qi + input_tag.size() + 2, _qe - (_qi + input_tag.size() + 2));
+  extract_by_tag(m_s3select_input, "FieldDelimiter", m_column_delimiter);
+  extract_by_tag(m_s3select_input, "QuoteCharacter", m_quot);
+  extract_by_tag(m_s3select_input, "RecordDelimiter", m_row_delimiter);
+  extract_by_tag(m_s3select_input, "FileHeaderInfo", m_header_info);
+  extract_by_tag(m_s3select_input, "Type", m_json_datatype);
+  if (m_row_delimiter.size()==0) {
+    m_row_delimiter='\n';
+  } else if (m_row_delimiter.compare("&#10;") == 0) {
+    //presto change
+    m_row_delimiter='\n';
+  }
+  extract_by_tag(m_s3select_input, "QuoteEscapeCharacter", m_escape_char);
+  extract_by_tag(m_s3select_input, "CompressionType", m_compression_type);
+  size_t _qo = m_s3select_query.find("<" + output_tag + ">", 0);
+  //search for the close tag from the *output* section start (was _qi)
+  size_t _qs = m_s3select_query.find("</" + output_tag + ">", _qo);
+  if (_qo == std::string::npos || _qs == std::string::npos) {
+    //malformed payload: OutputSerialization section is mandatory
+    ldpp_dout(this, 10) << "s3select: failed to locate " << output_tag << " in the request payload" << dendl;
+    return -1;
+  }
+  m_s3select_output = m_s3select_query.substr(_qo + output_tag.size() + 2, _qs - (_qo + output_tag.size() + 2));
+  extract_by_tag(m_s3select_output, "FieldDelimiter", output_column_delimiter);
+  extract_by_tag(m_s3select_output, "QuoteCharacter", output_quot);
+  extract_by_tag(m_s3select_output, "QuoteEscapeCharacter", output_escape_char);
+  extract_by_tag(m_s3select_output, "QuoteFields", output_quote_fields);
+  extract_by_tag(m_s3select_output, "RecordDelimiter", output_row_delimiter);
+  if (output_row_delimiter.size()==0) {
+    output_row_delimiter='\n';
+  } else if (output_row_delimiter.compare("&#10;") == 0) {
+    //presto change
+    output_row_delimiter='\n';
+  }
+  if (m_compression_type.length()>0 && m_compression_type.compare("NONE") != 0) {
+    ldpp_dout(this, 10) << "RGW supports currently only NONE option for compression type" << dendl;
+    return -1;
+  }
+  extract_by_tag(m_s3select_query, "Start", m_start_scan);
+  extract_by_tag(m_s3select_query, "End", m_end_scan);
+  if (m_start_scan.size() || m_end_scan.size()) {
+    m_scan_range_ind = true;
+    try {
+      if (m_start_scan.size()) {
+        m_start_scan_sz = std::stol(m_start_scan);
+      }
+      if (m_end_scan.size()) {
+        m_end_scan_sz = std::stol(m_end_scan);
+      } else {
+        //open-ended range: scan to the end of the object
+        m_end_scan_sz = std::numeric_limits<std::int64_t>::max();
+      }
+    } catch (const std::exception&) {
+      //non-numeric Start/End; reject the request instead of throwing
+      ldpp_dout(this, 10) << "s3select: invalid scan-range boundaries" << dendl;
+      return -1;
+    }
+  }
+  if (m_enable_progress.compare("true")==0) {
+    enable_progress = true;
+  } else {
+    enable_progress = false;
+  }
+  return 0;
+}
+
+/*
+ * Extract the text between <tag_name> and </tag_name> in 'input' into
+ * 'result'. Returns 0 on success; -1 (with result cleared) when either tag
+ * is absent. Fix: the content offset is now computed only after the open
+ * tag is found, instead of deriving it from a possibly-npos position.
+ */
+int RGWSelectObj_ObjStore_S3::extract_by_tag(std::string input, std::string tag_name, std::string& result)
+{
+  result = "";
+  size_t _qs = input.find("<" + tag_name + ">", 0);
+  if (_qs == std::string::npos) {
+    return -1;
+  }
+  //first character after the open tag ("<" + tag_name + ">" is tag_name.size()+2 chars)
+  size_t qs_input = _qs + tag_name.size() + 2;
+  size_t _qe = input.find("</" + tag_name + ">", qs_input);
+  if (_qe == std::string::npos) {
+    return -1;
+  }
+  result = input.substr(qs_input, _qe - qs_input);
+  return 0;
+}
+
+// Accessor handed to the parquet reader (fp_get_obj_size): the full object size.
+size_t RGWSelectObj_ObjStore_S3::get_obj_size()
+{
+  return s->obj_size;
+}
+
+/*
+ * Synchronously fetch [ofs, ofs+len) of the object into 'buff' (may be null,
+ * in which case the data only accumulates in requested_buffer).
+ * Implements arrow::ReadAt for the parquet reader: a standard Range header is
+ * synthesized, RGWGetObj::execute() drives the fetch, and send_response_data()
+ * (the callback) appends each piece into requested_buffer until complete.
+ * Returns len.
+ */
+int RGWSelectObj_ObjStore_S3::range_request(int64_t ofs, int64_t len, void* buff, optional_yield y)
+{
+  //purpose: implementation for arrow::ReadAt, this may take several async calls.
+  //send_response_data(call_back) accumulates buffer, upon completion control is back to ReadAt.
+  range_req_str = "bytes=" + std::to_string(ofs) + "-" + std::to_string(ofs+len-1);
+  range_str = range_req_str.c_str();
+  range_parsed = false;
+  RGWGetObj::parse_range();
+  requested_buffer.clear();
+  m_request_range = len;
+  ldout(s->cct, 10) << "S3select: calling execute(async):" << " request-offset :" << ofs << " request-length :" << len << " buffer size : " << requested_buffer.size() << dendl;
+  RGWGetObj::execute(y);
+  if (buff) {
+    //NOTE(review): copies 'len' bytes unconditionally — assumes the fetch
+    //delivered the full range into requested_buffer; confirm short reads are impossible here.
+    memcpy(buff, requested_buffer.data(), len);
+  }
+  ldout(s->cct, 10) << "S3select: done waiting, buffer is complete buffer-size:" << requested_buffer.size() << dendl;
+  return len;
+}
+
+/*
+ * Entry point for the SelectObjectContent operation.
+ * Parses the request (get_params), then dispatches per input format:
+ *  - Parquet: validate the magic number, then hand control to the reader
+ *    which pulls ranges on its own;
+ *  - CSV/JSON: either fetch the requested scan-range, or stream the whole
+ *    object via RGWGetObj::execute (send_response_data does the processing).
+ * Fix: the return value of get_params() was previously ignored, so a failed
+ * payload parse still proceeded to process an undefined query.
+ */
+void RGWSelectObj_ObjStore_S3::execute(optional_yield y)
+{
+  int status = 0;
+  char parquet_magic[4];
+  static constexpr uint8_t parquet_magic1[4] = {'P', 'A', 'R', '1'};
+  static constexpr uint8_t parquet_magicE[4] = {'P', 'A', 'R', 'E'};
+  status = get_params(y);
+  if (status < 0) {
+    //failed to retrieve/parse the request payload; abort the operation
+    op_ret = status;
+    return;
+  }
+#ifdef _ARROW_EXIST
+  m_rgw_api.m_y = &y;
+#endif
+  if (m_parquet_type) {
+    //parquet processing
+    range_request(0, 4, parquet_magic, y);
+    //a parquet file starts with "PAR1" (or "PARE" for encrypted footers)
+    if (memcmp(parquet_magic, parquet_magic1, 4) && memcmp(parquet_magic, parquet_magicE, 4)) {
+      ldout(s->cct, 10) << s->object->get_name() << " does not contain parquet magic" << dendl;
+      op_ret = -ERR_INVALID_REQUEST;
+      return;
+    }
+    s3select_syntax.parse_query(m_sql_query.c_str());
+    status = run_s3select_on_parquet(m_sql_query.c_str());
+    if (status) {
+      ldout(s->cct, 10) << "S3select: failed to process query <" << m_sql_query << "> on object " << s->object->get_name() << dendl;
+      op_ret = -ERR_INVALID_REQUEST;
+    } else {
+      ldout(s->cct, 10) << "S3select: complete query with success " << dendl;
+    }
+  } else {
+    //CSV or JSON processing
+    if (m_scan_range_ind) {
+
+      m_requested_range = (m_end_scan_sz - m_start_scan_sz);
+
+      if(m_is_trino_request){
+        // fetch more than requested(m_scan_offset), that additional bytes are scanned for end of row,
+        // thus the additional length will be processed, and no broken row for Trino.
+        // assumption: row is smaller than m_scan_offset. (a different approach is to request for additional range)
+        range_request(m_start_scan_sz, m_requested_range + m_scan_offset, nullptr, y);
+      } else {
+        range_request(m_start_scan_sz, m_requested_range, nullptr, y);
+      }
+
+    } else {
+      RGWGetObj::execute(y);
+    }
+  }//if (m_parquet_type)
+}
+
+/*
+ * GetObj data callback used while serving a parquet range_request():
+ * accumulates incoming pieces into requested_buffer until the full
+ * m_request_range has arrived; the parquet reader then resumes with the
+ * complete buffer. Always returns 0 (fetching continues until complete).
+ */
+int RGWSelectObj_ObjStore_S3::parquet_processing(bufferlist& bl, off_t ofs, off_t len)
+{
+  fp_chunked_transfer_encoding();
+  size_t append_in_callback = 0;
+  int part_no = 1;
+  //concat the requested buffer
+  for (auto& it : bl.buffers()) {
+    if (it.length() == 0) {
+      ldout(s->cct, 10) << "S3select: get zero-buffer while appending request-buffer " << dendl;
+    }
+    append_in_callback += it.length();
+    ldout(s->cct, 10) << "S3select: part " << part_no++ << " it.length() = " << it.length() << dendl;
+    //NOTE(review): the same ofs/len pair is applied to every buffer in the
+    //list — this looks only correct when bl holds a single buffer; confirm.
+    requested_buffer.append(&(it)[0]+ofs, len);
+  }
+  ldout(s->cct, 10) << "S3select:append_in_callback = " << append_in_callback << dendl;
+  if (requested_buffer.size() < m_request_range) {
+    ldout(s->cct, 10) << "S3select: need another round buffe-size: " << requested_buffer.size() << " request range length:" << m_request_range << dendl;
+    return 0;
+  } else {//buffer is complete
+    ldout(s->cct, 10) << "S3select: buffer is complete " << requested_buffer.size() << " request range length:" << m_request_range << dendl;
+    m_request_range = 0;
+  }
+  return 0;
+}
+
+/*
+ * Adjust (ofs, len) of the current chunk for a Trino scan-range request so
+ * the processed slice starts and ends on row boundaries: Trino splits one
+ * object across several workers, and each slice must contain only whole rows.
+ * First chunk: skip forward past the first row delimiter (the partial row
+ * belongs to the previous slice). Last chunk: extend up to and including the
+ * first delimiter past the requested range, then mark the remaining chunks
+ * to be skipped.
+ */
+void RGWSelectObj_ObjStore_S3::shape_chunk_per_trino_requests(const char* it_cp, off_t& ofs, off_t& len)
+{
+//in case it is a scan range request and sent by Trino client.
+//this routine chops the start/end of chunks.
+//the purpose is to return "perfect" results, with no broken or missing lines.
+
+  off_t new_offset = 0;
+  if(m_scan_range_ind){//only upon range-scan
+    int64_t sc=0;
+    int64_t start =0;
+    const char* row_delimiter = m_row_delimiter.c_str();
+
+    ldpp_dout(this, 10) << "s3select query: per Trino request the first and last chunk should modified." << dendl;
+
+    //chop the head of the first chunk and only upon the slice does not include the head of the object.
+    if(m_start_scan_sz && (m_aws_response_handler.get_processed_size()==0)){
+      char* p = const_cast<char*>(it_cp+ofs);
+      //scan forward for the first row delimiter within this chunk
+      while(strncmp(row_delimiter,p,1) && (p - (it_cp+ofs)) < len)p++;
+      if(!strncmp(row_delimiter,p,1)){
+        new_offset += (p - (it_cp+ofs))+1;
+      }
+    }
+
+    //RR : end of the range-request. the original request sent by Trino client
+    //RD : row-delimiter
+    //[ ... ] : chunk boundaries
+
+    //chop the end of the last chunk for this request
+    //if it's the last chunk, search for first row-delimiter for the following different use-cases
+    if((m_aws_response_handler.get_processed_size()+len) >= m_requested_range){
+      //had pass the requested range, start to search for first delimiter
+      if(m_aws_response_handler.get_processed_size()>m_requested_range){
+        //the previous chunk contain the complete request(all data) and an extra bytes.
+        //thus, search for the first row-delimiter
+        //[:previous (RR) ... ][:current (RD) ]
+        start = 0;
+      } else if(m_aws_response_handler.get_processed_size()){
+        //the *current* chunk contain the complete request in the middle of the chunk.
+        //thus, search for the first row-delimiter after the complete request position
+        //[:current (RR) .... (RD) ]
+        start = m_requested_range - m_aws_response_handler.get_processed_size();
+      } else {
+        //the current chunk is the first chunk and it contains complete request
+        //[:current:first-chunk (RR) .... (RD) ]
+        start = m_requested_range;
+      }
+
+      for(sc=start;sc<len;sc++)//assumption : row-delimiter must exist or it's the end of the object
+      {
+        char* p = const_cast<char*>(it_cp) + ofs + sc;
+        if(!strncmp(row_delimiter,p,1)){
+          ldout(s->cct, 10) << "S3select: found row-delimiter on " << sc << " get_processed_size = " << m_aws_response_handler.get_processed_size() << dendl;
+          len = sc + 1;//+1 is for delimiter. TODO what about m_object_size_for_processing (to update according to len)
+          //the end of row exist in current chunk.
+          //thus, the next chunk should be skipped
+          m_skip_next_chunk = true;
+          break;
+        }
+      }
+    }
+    ofs += new_offset;
+  }
+
+  ldout(s->cct, 10) << "S3select: shape_chunk_per_trino_requests:update progress len = " << len << dendl;
+  len -= new_offset;
+}
+
+/*
+ * GetObj data callback for CSV objects: runs the SQL on each incoming buffer
+ * segment, shaping chunks first for Trino scan-range requests. When the whole
+ * object (or the SQL LIMIT) has been processed, emits the Stats and End
+ * events. Returns 0/engine status, -EINVAL on a processing error, or -ENOENT
+ * to tell the fetch layer to stop once the LIMIT was reached.
+ */
+int RGWSelectObj_ObjStore_S3::csv_processing(bufferlist& bl, off_t ofs, off_t len)
+{
+  int status = 0;
+  //set by shape_chunk_per_trino_requests() once the end-of-range row was
+  //emitted; everything after it is outside this worker's slice
+  if(m_skip_next_chunk == true){
+    return status;
+  }
+
+  if (s->obj_size == 0 || m_object_size_for_processing == 0) {
+    //empty object: run once with an empty stream so aggregation queries still produce a result
+    status = run_s3select_on_csv(m_sql_query.c_str(), nullptr, 0);
+    if (status<0){
+      return -EINVAL;
+    }
+  } else {
+    auto bl_len = bl.get_num_buffers();
+    int buff_no=0;
+    for(auto& it : bl.buffers()) {
+      ldpp_dout(this, 10) << "s3select :processing segment " << buff_no << " out of " << bl_len << " off " << ofs
+                          << " len " << len << " obj-size " << m_object_size_for_processing << dendl;
+      if (it.length() == 0 || len == 0) {
+        ldpp_dout(this, 10) << "s3select :it->_len is zero. segment " << buff_no << " out of " << bl_len
+                            <<  " obj-size " << m_object_size_for_processing << dendl;
+        continue;
+      }
+
+      //clamp (ofs,len) into the current buffer to avoid reading past its end
+      if((ofs + len) > it.length()){
+        ldpp_dout(this, 10) << "offset and length may cause invalid read: ofs = " << ofs << " len = " << len << " it.length() = " << it.length() << dendl;
+        ofs = 0;
+        len = it.length();
+      }
+
+      if(m_is_trino_request){
+        shape_chunk_per_trino_requests(&(it)[0], ofs, len);
+      }
+
+      ldpp_dout(this, 10) << "s3select: chunk: ofs = " << ofs << " len = " << len << " it.length() = " << it.length() << " m_object_size_for_processing = " << m_object_size_for_processing << dendl;
+
+      m_aws_response_handler.update_processed_size(it.length());//NOTE : to run analysis to validate len is aligned with m_processed_bytes
+      status = run_s3select_on_csv(m_sql_query.c_str(), &(it)[0] + ofs, len);
+      if (status<0) {
+        return -EINVAL;
+      }
+      if (m_s3_csv_object.is_sql_limit_reached()) {
+        break;
+      }
+      buff_no++;
+    }//for
+  }//else
+
+  ldpp_dout(this, 10) << "s3select : m_aws_response_handler.get_processed_size() " << m_aws_response_handler.get_processed_size()
+                      << " m_object_size_for_processing " << uint64_t(m_object_size_for_processing) << dendl;
+
+  //end of object (or LIMIT reached): close the event stream
+  if (m_aws_response_handler.get_processed_size() >= uint64_t(m_object_size_for_processing) || m_s3_csv_object.is_sql_limit_reached()) {
+    if (status >=0) {
+      m_aws_response_handler.init_stats_response();
+      m_aws_response_handler.send_stats_response();
+      m_aws_response_handler.init_end_response();
+      ldpp_dout(this, 10) << "s3select : reached the end of query request : aws_response_handler.get_processed_size() " << m_aws_response_handler.get_processed_size()
+                          << "m_object_size_for_processing : " << m_object_size_for_processing << dendl;
+    }
+    if (m_s3_csv_object.is_sql_limit_reached()) {
+      //stop fetching chunks
+      ldpp_dout(this, 10) << "s3select : reached the limit :" << m_aws_response_handler.get_processed_size() << dendl;
+      status = -ENOENT;
+    }
+  }
+
+  return status;
+}
+
+/*
+ * GetObj data callback for JSON objects: runs the SQL on each incoming
+ * buffer segment, flushes the engine's internal buffer after the last chunk,
+ * and emits the Stats and End events. Returns 0/engine status, -EINVAL on a
+ * processing error, or -ENOENT to stop fetching once the SQL LIMIT was hit.
+ */
+int RGWSelectObj_ObjStore_S3::json_processing(bufferlist& bl, off_t ofs, off_t len)
+{
+  int status = 0;
+
+  if (s->obj_size == 0 || m_object_size_for_processing == 0) {
+    //in case of empty object the s3select-function returns a correct "empty" result(for aggregation and non-aggregation queries).
+    status = run_s3select_on_json(m_sql_query.c_str(), nullptr, 0);
+    if (status<0)
+      return -EINVAL;
+  } else {
+    //loop on buffer-list(chunks)
+    auto bl_len = bl.get_num_buffers();
+    int i=0;
+    for(auto& it : bl.buffers()) {
+      ldpp_dout(this, 10) << "processing segment " << i << " out of " << bl_len << " off " << ofs
+                          << " len " << len << " obj-size " << m_object_size_for_processing << dendl;
+      //skipping the empty chunks
+      if (len == 0) {
+        ldpp_dout(this, 10) << "s3select:it->_len is zero. segment " << i << " out of " << bl_len
+                            <<  " obj-size " << m_object_size_for_processing << dendl;
+        continue;
+      }
+
+      //clamp (ofs,len) into the current buffer to avoid reading past its end
+      if((ofs + len) > it.length()){
+        ldpp_dout(this, 10) << "s3select: offset and length may cause invalid read: ofs = " << ofs << " len = " << len << " it.length() = " << it.length() << dendl;
+        ofs = 0;
+        len = it.length();
+      }
+
+      m_aws_response_handler.update_processed_size(len);
+      status = run_s3select_on_json(m_sql_query.c_str(), &(it)[0] + ofs, len);
+      if (status<0) {
+        status = -EINVAL;
+        break;
+      }
+      if (m_s3_json_object.is_sql_limit_reached()) {
+        break;
+      }
+      i++;
+    }//for
+  }//else
+
+  //end of object (or LIMIT reached): flush the engine and close the event stream
+  if (status>=0 && (m_aws_response_handler.get_processed_size() == uint64_t(m_object_size_for_processing) || m_s3_json_object.is_sql_limit_reached())) {
+    //flush the internal JSON buffer(upon last chunk)
+    status = run_s3select_on_json(m_sql_query.c_str(), nullptr, 0);
+    if (status<0) {
+      return -EINVAL;
+    }
+    if (status >=0) {
+      m_aws_response_handler.init_stats_response();
+      m_aws_response_handler.send_stats_response();
+      m_aws_response_handler.init_end_response();
+    }
+    if (m_s3_json_object.is_sql_limit_reached()){
+      //stop fetching chunks
+      status = -ENOENT;
+      ldpp_dout(this, 10) << "s3select : reached the limit :" << m_aws_response_handler.get_processed_size() << dendl;
+    }
+  }
+  return status;
+}
+
+/*
+ * GetObj data callback: computes the effective size to process (whole object
+ * or the requested scan-range, clamped to the object size), lazily binds the
+ * response handler, then dispatches the chunk to the parquet/JSON/CSV path.
+ */
+int RGWSelectObj_ObjStore_S3::send_response_data(bufferlist& bl, off_t ofs, off_t len)
+{
+  if (m_scan_range_ind == false){
+    m_object_size_for_processing = s->obj_size;
+  }
+  if (m_scan_range_ind == true){
+    //NOTE(review): m_end_scan_sz is initialized to 0 and set from the payload;
+    //confirm which path sets it to -1 (open-ended range) before this check.
+    if (m_end_scan_sz == -1){
+      m_end_scan_sz = s->obj_size;
+    }
+    if (static_cast<uint64_t>((m_end_scan_sz - m_start_scan_sz))>s->obj_size){ //in the case user provides range bigger than object-size
+      m_object_size_for_processing = s->obj_size;
+    } else {
+      m_object_size_for_processing = m_end_scan_sz - m_start_scan_sz;
+    }
+  }
+  if (!m_aws_response_handler.is_set()) {
+    m_aws_response_handler.set(s, this);
+  }
+  //ignore empty callbacks for non-empty objects
+  if (len == 0 && s->obj_size != 0) {
+    return 0;
+  }
+  if (m_parquet_type) {
+    return parquet_processing(bl,ofs,len);
+  }
+  if (m_json_type) {
+    return json_processing(bl,ofs,len);
+  }
+  return csv_processing(bl,ofs,len);
+}
+
diff --git a/src/rgw/rgw_s3select.h b/src/rgw/rgw_s3select.h
new file mode 100644
index 000000000..4a506ba4c
--- /dev/null
+++ b/src/rgw/rgw_s3select.h
@@ -0,0 +1,10 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+//
+
+#pragma once
+
+namespace rgw::s3select {
+// Factory returning a new RGWOp that serves s3-select (SelectObjectContent) requests.
+RGWOp* create_s3select_op();
+}
+
diff --git a/src/rgw/rgw_s3select_private.h b/src/rgw/rgw_s3select_private.h
new file mode 100644
index 000000000..fa595b0da
--- /dev/null
+++ b/src/rgw/rgw_s3select_private.h
@@ -0,0 +1,258 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+//
+#pragma once
+
+#include <errno.h>
+#include <array>
+#include <string.h>
+#include <string_view>
+
+#include "common/ceph_crypto.h"
+#include "common/split.h"
+#include "common/Formatter.h"
+#include "common/utf8.h"
+#include "common/ceph_json.h"
+#include "common/safe_io.h"
+#include "common/errno.h"
+#include "auth/Crypto.h"
+#include <boost/algorithm/string.hpp>
+#include <boost/algorithm/string/replace.hpp>
+#include <boost/tokenizer.hpp>
+#define BOOST_BIND_GLOBAL_PLACEHOLDERS
+#ifdef HAVE_WARN_IMPLICIT_CONST_INT_FLOAT_CONVERSION
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wimplicit-const-int-float-conversion"
+#endif
+#ifdef HAVE_WARN_IMPLICIT_CONST_INT_FLOAT_CONVERSION
+#pragma clang diagnostic pop
+#endif
+#undef BOOST_BIND_GLOBAL_PLACEHOLDERS
+
+#include <liboath/oath.h>
+
+
+#pragma GCC diagnostic push
+#pragma clang diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated"
+#pragma clang diagnostic ignored "-Wdeprecated"
+#include <s3select/include/s3select.h>
+#pragma GCC diagnostic pop
+#pragma clang diagnostic pop
+
+#include "rgw_rest_s3.h"
+#include "rgw_s3select.h"
+
+/*
+ * Builds and sends AWS SelectObjectContent event-stream messages
+ * (Records / Cont / Progress / Stats / End / error events): prelude,
+ * headers, payload and CRC32 checksums, streamed through the owning RGWOp.
+ */
+class aws_response_handler
+{
+
+private:
+  std::string sql_result;              // accumulates the current payload to be framed and sent
+  req_state* s;                        // request state; null until set()
+  uint32_t header_size;                // size of the headers section of the message being built
+  // the parameters are according to CRC-32 algorithm and its aligned with AWS-cli checksum
+  boost::crc_optimal<32, 0x04C11DB7, 0xFFFFFFFF, 0xFFFFFFFF, true, true> crc32;
+  RGWOp* m_rgwop;                      // op used to emit the response; null until set()
+  std::string m_buff_header;           // scratch buffer for the event-stream headers
+  uint64_t total_bytes_returned;       // Stats: BytesReturned
+  uint64_t processed_size;             // Stats: BytesScanned/BytesProcessed
+
+  // indices into header_name_str
+  enum class header_name_En {
+    EVENT_TYPE,
+    CONTENT_TYPE,
+    MESSAGE_TYPE,
+    ERROR_CODE,
+    ERROR_MESSAGE
+  };
+
+  // indices into header_value_str
+  enum class header_value_En {
+    RECORDS,
+    OCTET_STREAM,
+    EVENT,
+    CONT,
+    PROGRESS,
+    END,
+    XML,
+    STATS,
+    ENGINE_ERROR,
+    ERROR_TYPE
+  };
+
+  const char* PAYLOAD_LINE= "\n<Payload>\n<Records>\n<Payload>\n";
+  const char* END_PAYLOAD_LINE= "\n</Payload></Records></Payload>";
+  const char* header_name_str[5] = {":event-type", ":content-type", ":message-type", ":error-code", ":error-message"};
+  const char* header_value_str[10] = {"Records", "application/octet-stream", "event", "Cont", "Progress", "End", "text/xml", "Stats", "s3select-engine-error", "error"};
+  static constexpr size_t header_crc_size = 12;
+
+  // append one name/value header to the headers buffer
+  void push_header(const char* header_name, const char* header_value);
+
+  // frame sql_result into a complete event-stream message; returns its length
+  int create_message(u_int32_t header_len);
+
+public:
+  aws_response_handler(req_state* ps, RGWOp* rgwop) : s(ps), m_rgwop(rgwop), total_bytes_returned{0}, processed_size{0}
+  {}
+
+  // default-constructed handler is unusable until set() is called
+  aws_response_handler() : s(nullptr), m_rgwop(nullptr), total_bytes_returned{0}, processed_size{0}
+  {}
+
+  // true once both the request state and the op were bound via ctor or set()
+  bool is_set()
+  {
+    if(s==nullptr || m_rgwop == nullptr){
+      return false;
+    }
+    return true;
+  }
+
+  void set(req_state* ps, RGWOp* rgwop)
+  {
+    s = ps;
+    m_rgwop = rgwop;
+  }
+
+  std::string& get_sql_result();
+
+  uint64_t get_processed_size();
+
+  void update_processed_size(uint64_t value);
+
+  uint64_t get_total_bytes_returned();
+
+  void update_total_bytes_returned(uint64_t value);
+
+  int create_header_records();
+
+  int create_header_continuation();
+
+  int create_header_progress();
+
+  int create_header_stats();
+
+  int create_header_end();
+
+  int create_error_header_records(const char* error_message);
+
+  void init_response();
+
+  void init_success_response();
+
+  void send_continuation_response();
+
+  void init_progress_response();
+
+  void init_end_response();
+
+  void init_stats_response();
+
+  void init_error_response(const char* error_message);
+
+  void send_success_response();
+
+  void send_progress_response();
+
+  void send_stats_response();
+
+  void send_error_response(const char* error_code,
+                           const char* error_message,
+                           const char* resource_id);
+
+}; //end class aws_response_handler
+
+/*
+ * The s3-select (SelectObjectContent) operation: a GetObj specialization that
+ * feeds the fetched object data through the s3select engine (CSV, JSON or
+ * Parquet) and streams the SQL result back as an AWS event stream.
+ */
+class RGWSelectObj_ObjStore_S3 : public RGWGetObj_ObjStore_S3
+{
+
+private:
+  s3selectEngine::s3select s3select_syntax;      // SQL parser/AST
+  std::string m_s3select_query;                  // raw XML request payload
+  std::string m_s3select_input;                  // InputSerialization section
+  std::string m_s3select_output;                 // OutputSerialization section
+  s3selectEngine::csv_object m_s3_csv_object;
+#ifdef _ARROW_EXIST
+  s3selectEngine::parquet_object m_s3_parquet_object;
+#endif
+  s3selectEngine::json_object m_s3_json_object;
+  // CSV input-serialization options (single characters, kept as strings)
+  std::string m_column_delimiter;
+  std::string m_quot;
+  std::string m_row_delimiter;
+  std::string m_compression_type;
+  std::string m_escape_char;
+  std::unique_ptr<char[]> m_buff_header;
+  std::string m_header_info;
+  std::string m_sql_query;                       // the extracted SQL Expression
+  std::string m_enable_progress;
+  // CSV output-serialization options
+  std::string output_column_delimiter;
+  std::string output_quot;
+  std::string output_escape_char;
+  std::string output_quote_fields;
+  std::string output_row_delimiter;
+  // ScanRange boundaries as extracted from the payload
+  std::string m_start_scan;
+  std::string m_end_scan;
+  bool m_scan_range_ind;                         // true when the request carries a ScanRange
+  int64_t m_start_scan_sz;
+  int64_t m_end_scan_sz;
+  int64_t m_object_size_for_processing;          // effective byte count to process
+  aws_response_handler m_aws_response_handler;
+  bool enable_progress;
+
+  //parquet request
+  bool m_parquet_type;
+  //json request
+  std::string m_json_datatype;
+  bool m_json_type;
+#ifdef _ARROW_EXIST
+  s3selectEngine::rgw_s3select_api m_rgw_api;    // size/range callbacks for the parquet reader
+#endif
+  //a range request may be satisfied by several calls to send_response_data;
+  size_t m_request_range;
+  std::string requested_buffer;                  // accumulates range-request data
+  std::string range_req_str;                     // backing storage for the synthesized Range header
+  std::function<int(std::string&)> fp_result_header_format;
+  std::function<int(std::string&)> fp_s3select_result_format;
+  std::function<void(const char*)> fp_debug_mesg;
+  std::function<void(void)> fp_chunked_transfer_encoding;
+  int m_header_size;
+
+public:
+  unsigned int chunk_number;                     // 0 until the first response chunk was emitted
+  size_t m_requested_range;                      // length of the requested scan range
+  size_t m_scan_offset;                          // extra bytes fetched for Trino row alignment
+  bool m_skip_next_chunk;                        // set once a Trino slice is fully served
+  bool m_is_trino_request;                       // client identified as Trino via User-Agent
+
+  RGWSelectObj_ObjStore_S3();
+  virtual ~RGWSelectObj_ObjStore_S3();
+
+  virtual int send_response_data(bufferlist& bl, off_t ofs, off_t len) override;
+
+  virtual int get_params(optional_yield y) override;
+
+  virtual void execute(optional_yield) override;
+
+private:
+
+  // per-format data callbacks (dispatched from send_response_data)
+  int csv_processing(bufferlist& bl, off_t ofs, off_t len);
+
+  int parquet_processing(bufferlist& bl, off_t ofs, off_t len);
+
+  int json_processing(bufferlist& bl, off_t ofs, off_t len);
+
+  // run the SQL on one chunk / the whole object, per input format
+  int run_s3select_on_csv(const char* query, const char* input, size_t input_length);
+
+  int run_s3select_on_parquet(const char* query);
+
+  int run_s3select_on_json(const char* query, const char* input, size_t input_length);
+
+  // extract the text between <tag_name> and </tag_name> from input
+  int extract_by_tag(std::string input, std::string tag_name, std::string& result);
+
+  void convert_escape_seq(std::string& esc);
+
+  int handle_aws_cli_parameters(std::string& sql_query);
+
+  // synchronous ranged fetch used by the parquet reader and scan-range requests
+  int range_request(int64_t start, int64_t len, void*, optional_yield);
+
+  size_t get_obj_size();
+  std::function<int(int64_t, int64_t, void*, optional_yield*)> fp_range_req;
+  std::function<size_t(void)> fp_get_obj_size;
+
+  void shape_chunk_per_trino_requests(const char*, off_t& ofs, off_t& len);
+};
+
diff --git a/src/rgw/rgw_sal.cc b/src/rgw/rgw_sal.cc
new file mode 100644
index 000000000..58a21f707
--- /dev/null
+++ b/src/rgw/rgw_sal.cc
@@ -0,0 +1,402 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <errno.h>
+#include <stdlib.h>
+#include <system_error>
+#include <unistd.h>
+#include <sstream>
+
+#include "common/errno.h"
+
+#include "rgw_sal.h"
+#include "rgw_sal_rados.h"
+#include "driver/rados/config/store.h"
+#include "driver/json_config/store.h"
+#include "rgw_d3n_datacache.h"
+
+#ifdef WITH_RADOSGW_DBSTORE
+#include "rgw_sal_dbstore.h"
+#include "driver/dbstore/config/store.h"
+#endif
+
+#ifdef WITH_RADOSGW_MOTR
+#include "rgw_sal_motr.h"
+#endif
+
+#ifdef WITH_RADOSGW_DAOS
+#include "rgw_sal_daos.h"
+#endif
+
+#define dout_subsys ceph_subsys_rgw
+
+extern "C" {
+extern rgw::sal::Driver* newRadosStore(void);
+#ifdef WITH_RADOSGW_DBSTORE
+extern rgw::sal::Driver* newDBStore(CephContext *cct);
+#endif
+#ifdef WITH_RADOSGW_MOTR
+extern rgw::sal::Driver* newMotrStore(CephContext *cct);
+#endif
+#ifdef WITH_RADOSGW_DAOS
+extern rgw::sal::Driver* newDaosStore(CephContext *cct);
+#endif
+extern rgw::sal::Driver* newBaseFilter(rgw::sal::Driver* next);
+
+}
+
// Default-construct an RGWObjState; all members rely on the in-class
// initializers declared in rgw_sal.h.
RGWObjState::RGWObjState() {
}

RGWObjState::~RGWObjState() {
}
+
+RGWObjState::RGWObjState(const RGWObjState& rhs) : obj (rhs.obj) {
+ is_atomic = rhs.is_atomic;
+ has_attrs = rhs.has_attrs;
+ exists = rhs.exists;
+ size = rhs.size;
+ accounted_size = rhs.accounted_size;
+ mtime = rhs.mtime;
+ epoch = rhs.epoch;
+ if (rhs.obj_tag.length()) {
+ obj_tag = rhs.obj_tag;
+ }
+ if (rhs.tail_tag.length()) {
+ tail_tag = rhs.tail_tag;
+ }
+ write_tag = rhs.write_tag;
+ fake_tag = rhs.fake_tag;
+ shadow_obj = rhs.shadow_obj;
+ has_data = rhs.has_data;
+ if (rhs.data.length()) {
+ data = rhs.data;
+ }
+ prefetch_data = rhs.prefetch_data;
+ keep_tail = rhs.keep_tail;
+ is_olh = rhs.is_olh;
+ objv_tracker = rhs.objv_tracker;
+ pg_ver = rhs.pg_ver;
+ compressed = rhs.compressed;
+}
+
+rgw::sal::Driver* DriverManager::init_storage_provider(const DoutPrefixProvider* dpp,
+ CephContext* cct,
+ const Config& cfg,
+ bool use_gc_thread,
+ bool use_lc_thread,
+ bool quota_threads,
+ bool run_sync_thread,
+ bool run_reshard_thread,
+ bool use_cache,
+ bool use_gc)
+{
+ rgw::sal::Driver* driver{nullptr};
+
+ if (cfg.store_name.compare("rados") == 0) {
+ driver = newRadosStore();
+ RGWRados* rados = static_cast<rgw::sal::RadosStore* >(driver)->getRados();
+
+ if ((*rados).set_use_cache(use_cache)
+ .set_use_datacache(false)
+ .set_use_gc(use_gc)
+ .set_run_gc_thread(use_gc_thread)
+ .set_run_lc_thread(use_lc_thread)
+ .set_run_quota_threads(quota_threads)
+ .set_run_sync_thread(run_sync_thread)
+ .set_run_reshard_thread(run_reshard_thread)
+ .init_begin(cct, dpp) < 0) {
+ delete driver;
+ return nullptr;
+ }
+ if (driver->initialize(cct, dpp) < 0) {
+ delete driver;
+ return nullptr;
+ }
+ if (rados->init_complete(dpp) < 0) {
+ delete driver;
+ return nullptr;
+ }
+ }
+ else if (cfg.store_name.compare("d3n") == 0) {
+ driver = new rgw::sal::RadosStore();
+ RGWRados* rados = new D3nRGWDataCache<RGWRados>;
+ dynamic_cast<rgw::sal::RadosStore*>(driver)->setRados(rados);
+ rados->set_store(static_cast<rgw::sal::RadosStore* >(driver));
+
+ if ((*rados).set_use_cache(use_cache)
+ .set_use_datacache(true)
+ .set_run_gc_thread(use_gc_thread)
+ .set_run_lc_thread(use_lc_thread)
+ .set_run_quota_threads(quota_threads)
+ .set_run_sync_thread(run_sync_thread)
+ .set_run_reshard_thread(run_reshard_thread)
+ .init_begin(cct, dpp) < 0) {
+ delete driver;
+ return nullptr;
+ }
+ if (driver->initialize(cct, dpp) < 0) {
+ delete driver;
+ return nullptr;
+ }
+ if (rados->init_complete(dpp) < 0) {
+ delete driver;
+ return nullptr;
+ }
+
+ lsubdout(cct, rgw, 1) << "rgw_d3n: rgw_d3n_l1_local_datacache_enabled=" <<
+ cct->_conf->rgw_d3n_l1_local_datacache_enabled << dendl;
+ lsubdout(cct, rgw, 1) << "rgw_d3n: rgw_d3n_l1_datacache_persistent_path='" <<
+ cct->_conf->rgw_d3n_l1_datacache_persistent_path << "'" << dendl;
+ lsubdout(cct, rgw, 1) << "rgw_d3n: rgw_d3n_l1_datacache_size=" <<
+ cct->_conf->rgw_d3n_l1_datacache_size << dendl;
+ lsubdout(cct, rgw, 1) << "rgw_d3n: rgw_d3n_l1_evict_cache_on_start=" <<
+ cct->_conf->rgw_d3n_l1_evict_cache_on_start << dendl;
+ lsubdout(cct, rgw, 1) << "rgw_d3n: rgw_d3n_l1_fadvise=" <<
+ cct->_conf->rgw_d3n_l1_fadvise << dendl;
+ lsubdout(cct, rgw, 1) << "rgw_d3n: rgw_d3n_l1_eviction_policy=" <<
+ cct->_conf->rgw_d3n_l1_eviction_policy << dendl;
+ }
+#ifdef WITH_RADOSGW_DBSTORE
+ else if (cfg.store_name.compare("dbstore") == 0) {
+ driver = newDBStore(cct);
+
+ if ((*(rgw::sal::DBStore*)driver).set_run_lc_thread(use_lc_thread)
+ .initialize(cct, dpp) < 0) {
+ delete driver;
+ return nullptr;
+ }
+ }
+#endif
+
+#ifdef WITH_RADOSGW_MOTR
+ else if (cfg.store_name.compare("motr") == 0) {
+ driver = newMotrStore(cct);
+ if (driver == nullptr) {
+ ldpp_dout(dpp, 0) << "newMotrStore() failed!" << dendl;
+ return driver;
+ }
+ ((rgw::sal::MotrStore *)driver)->init_metadata_cache(dpp, cct);
+
+ return store;
+ }
+#endif
+
+#ifdef WITH_RADOSGW_DAOS
+ else if (cfg.store_name.compare("daos") == 0) {
+ driver = newDaosStore(cct);
+ if (driver == nullptr) {
+ ldpp_dout(dpp, 0) << "newDaosStore() failed!" << dendl;
+ return driver;
+ }
+ int ret = driver->initialize(cct, dpp);
+ if (ret != 0) {
+ ldpp_dout(dpp, 20) << "ERROR: store->initialize() failed: " << ret << dendl;
+ delete driver;
+ return nullptr;
+ }
+ }
+#endif
+
+ if (cfg.filter_name.compare("base") == 0) {
+ rgw::sal::Driver* next = driver;
+ driver = newBaseFilter(next);
+
+ if (driver->initialize(cct, dpp) < 0) {
+ delete driver;
+ delete next;
+ return nullptr;
+ }
+ }
+
+ return driver;
+}
+
+rgw::sal::Driver* DriverManager::init_raw_storage_provider(const DoutPrefixProvider* dpp, CephContext* cct, const Config& cfg)
+{
+ rgw::sal::Driver* driver = nullptr;
+ if (cfg.store_name.compare("rados") == 0) {
+ driver = newRadosStore();
+ RGWRados* rados = static_cast<rgw::sal::RadosStore* >(driver)->getRados();
+
+ rados->set_context(cct);
+
+ int ret = rados->init_svc(true, dpp);
+ if (ret < 0) {
+ ldout(cct, 0) << "ERROR: failed to init services (ret=" << cpp_strerror(-ret) << ")" << dendl;
+ delete driver;
+ return nullptr;
+ }
+
+ if (rados->init_rados() < 0) {
+ delete driver;
+ return nullptr;
+ }
+ if (driver->initialize(cct, dpp) < 0) {
+ delete driver;
+ return nullptr;
+ }
+ } else if (cfg.store_name.compare("dbstore") == 0) {
+#ifdef WITH_RADOSGW_DBSTORE
+ driver = newDBStore(cct);
+
+ if ((*(rgw::sal::DBStore*)driver).initialize(cct, dpp) < 0) {
+ delete driver;
+ return nullptr;
+ }
+#else
+ driver = nullptr;
+#endif
+ } else if (cfg.store_name.compare("motr") == 0) {
+#ifdef WITH_RADOSGW_MOTR
+ driver = newMotrStore(cct);
+#else
+ driver = nullptr;
+#endif
+ } else if (cfg.store_name.compare("daos") == 0) {
+#ifdef WITH_RADOSGW_DAOS
+ driver = newDaosStore(cct);
+
+ if (driver->initialize(cct, dpp) < 0) {
+ delete driver;
+ return nullptr;
+ }
+#else
+ driver = nullptr;
+#endif
+ }
+
+ if (cfg.filter_name.compare("base") == 0) {
+ rgw::sal::Driver* next = driver;
+ driver = newBaseFilter(next);
+
+ if (driver->initialize(cct, dpp) < 0) {
+ delete driver;
+ delete next;
+ return nullptr;
+ }
+ }
+
+ return driver;
+}
+
+void DriverManager::close_storage(rgw::sal::Driver* driver)
+{
+ if (!driver)
+ return;
+
+ driver->finalize();
+
+ delete driver;
+}
+
/**
 * Build a DriverManager::Config from the ceph configuration.
 *
 * Reads "rgw_backend_store" to pick the store backend and "rgw_filter" to
 * pick an optional filter layer. For a non-admin rados instance the store is
 * upgraded to "d3n" when the local datacache is enabled and its
 * prerequisites hold; otherwise a warning is logged and plain rados is used.
 */
DriverManager::Config DriverManager::get_config(bool admin, CephContext* cct)
{
  DriverManager::Config cfg;

  // Get the store backend
  const auto& config_store = g_conf().get_val<std::string>("rgw_backend_store");
  if (config_store == "rados") {
    cfg.store_name = "rados";

    /* Check to see if d3n is configured, but only for non-admin */
    const auto& d3n = g_conf().get_val<bool>("rgw_d3n_l1_local_datacache_enabled");
    if (!admin && d3n) {
      // D3N requires chunk_size == stripe_size and the beast frontend's
      // async (yield) support; fall back to plain rados otherwise.
      if (g_conf().get_val<Option::size_t>("rgw_max_chunk_size") !=
	  g_conf().get_val<Option::size_t>("rgw_obj_stripe_size")) {
	lsubdout(cct, rgw_datacache, 0) << "rgw_d3n: WARNING: D3N DataCache disabling (D3N requires that the chunk_size equals stripe_size)" << dendl;
      } else if (!g_conf().get_val<bool>("rgw_beast_enable_async")) {
	lsubdout(cct, rgw_datacache, 0) << "rgw_d3n: WARNING: D3N DataCache disabling (D3N requires yield context - rgw_beast_enable_async=true)" << dendl;
      } else {
	cfg.store_name = "d3n";
      }
    }
  }
#ifdef WITH_RADOSGW_DBSTORE
  else if (config_store == "dbstore") {
    cfg.store_name = "dbstore";
  }
#endif
#ifdef WITH_RADOSGW_MOTR
  else if (config_store == "motr") {
    cfg.store_name = "motr";
  }
#endif
#ifdef WITH_RADOSGW_DAOS
  else if (config_store == "daos") {
    cfg.store_name = "daos";
  }
#endif
  // NOTE: if config_store names a compiled-out backend, cfg.store_name stays
  // empty and the init_* factories will return nullptr.

  // Get the filter
  cfg.filter_name = "none";
  const auto& config_filter = g_conf().get_val<std::string>("rgw_filter");
  if (config_filter == "base") {
    cfg.filter_name = "base";
  }

  return cfg;
}
+
/**
 * Instantiate a ConfigStore of the given type ("rados", "dbstore" when
 * compiled in, or "json").
 *
 * Any exception thrown by a backend factory is caught and logged; failures
 * and unrecognized types both yield nullptr.
 */
auto DriverManager::create_config_store(const DoutPrefixProvider* dpp,
                                        std::string_view type)
  -> std::unique_ptr<rgw::sal::ConfigStore>
{
  try {
    if (type == "rados") {
      return rgw::rados::create_config_store(dpp);
#ifdef WITH_RADOSGW_DBSTORE
    } else if (type == "dbstore") {
      const auto uri = g_conf().get_val<std::string>("dbstore_config_uri");
      return rgw::dbstore::create_config_store(dpp, uri);
#endif
    } else if (type == "json") {
      auto filename = g_conf().get_val<std::string>("rgw_json_config");
      return rgw::sal::create_json_config_store(dpp, filename);
    } else {
      ldpp_dout(dpp, -1) << "ERROR: unrecognized config store type '"
          << type << "'" << dendl;
      return nullptr;
    }
  } catch (const std::exception& e) {
    ldpp_dout(dpp, -1) << "ERROR: failed to initialize config store '"
        << type << "': " << e.what() << dendl;
  }
  return nullptr;
}
+
+namespace rgw::sal {
+int Object::range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end)
+{
+ if (ofs < 0) {
+ ofs += obj_size;
+ if (ofs < 0)
+ ofs = 0;
+ end = obj_size - 1;
+ } else if (end < 0) {
+ end = obj_size - 1;
+ }
+
+ if (obj_size > 0) {
+ if (ofs >= (off_t)obj_size) {
+ return -ERANGE;
+ }
+ if (end >= (off_t)obj_size) {
+ end = obj_size - 1;
+ }
+ }
+ return 0;
+}
+} // namespace rgw::sal
diff --git a/src/rgw/rgw_sal.h b/src/rgw/rgw_sal.h
new file mode 100644
index 000000000..944737dee
--- /dev/null
+++ b/src/rgw/rgw_sal.h
@@ -0,0 +1,1644 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "rgw_sal_fwd.h"
+#include "rgw_lua.h"
+#include "rgw_user.h"
+#include "rgw_notify_event_type.h"
+#include "common/tracer.h"
+#include "rgw_datalog_notify.h"
+#include "include/random.h"
+
+class RGWRESTMgr;
+class RGWAccessListFilter;
+class RGWLC;
+struct rgw_user_bucket;
+class RGWUsageBatch;
+class RGWCoroutinesManagerRegistry;
+class RGWBucketSyncPolicyHandler;
+using RGWBucketSyncPolicyHandlerRef = std::shared_ptr<RGWBucketSyncPolicyHandler>;
+class RGWDataSyncStatusManager;
+class RGWSyncModuleInstance;
+typedef std::shared_ptr<RGWSyncModuleInstance> RGWSyncModuleInstanceRef;
+class RGWCompressionInfo;
+struct rgw_pubsub_topics;
+struct rgw_pubsub_bucket_topics;
+
+
+using RGWBucketListNameFilter = std::function<bool (const std::string&)>;
+
+
+namespace rgw {
+ class Aio;
+ namespace IAM { struct Policy; }
+}
+
+class RGWGetDataCB {
+public:
+ virtual int handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) = 0;
+ RGWGetDataCB() {}
+ virtual ~RGWGetDataCB() {}
+};
+
/** Cursor state used when paging through usage-log entries. */
struct RGWUsageIter {
  std::string read_iter; //< opaque continuation marker
  uint32_t index = 0;    //< current index within the listing
};
+
/**
 * @struct RGWClusterStat
 * Cluster-wide usage information.
 *
 * Fix: members are now zero-initialized so a default-constructed instance
 * does not carry indeterminate values. The struct remains an aggregate, so
 * existing brace initialization keeps working.
 */
struct RGWClusterStat {
  /// total device size
  uint64_t kb = 0;
  /// total used
  uint64_t kb_used = 0;
  /// total available/free
  uint64_t kb_avail = 0;
  /// number of objects
  uint64_t num_objects = 0;
};
+
+class RGWGetBucketStats_CB : public RefCountedObject {
+protected:
+ rgw_bucket bucket;
+ std::map<RGWObjCategory, RGWStorageStats>* stats;
+public:
+ explicit RGWGetBucketStats_CB(const rgw_bucket& _bucket) : bucket(_bucket), stats(NULL) {}
+ ~RGWGetBucketStats_CB() override {}
+ virtual void handle_response(int r) = 0;
+ virtual void set_response(std::map<RGWObjCategory, RGWStorageStats>* _stats) {
+ stats = _stats;
+ }
+};
+
/**
 * Ref-counted callback for asynchronous per-user stats requests.
 *
 * set_response() copies the stats into this object; handle_response() is
 * invoked with the operation's return code.
 */
class RGWGetUserStats_CB : public RefCountedObject {
protected:
  rgw_user user;          // user the stats were requested for
  RGWStorageStats stats;  // filled in by set_response()
public:
  explicit RGWGetUserStats_CB(const rgw_user& _user) : user(_user) {}
  ~RGWGetUserStats_CB() override {}
  virtual void handle_response(int r) = 0;
  virtual void set_response(RGWStorageStats& _stats) {
    stats = _stats;
  }
};
+
+struct RGWObjState {
+ rgw_obj obj;
+ bool is_atomic{false};
+ bool has_attrs{false};
+ bool exists{false};
+ uint64_t size{0}; //< size of raw object
+ uint64_t accounted_size{0}; //< size before compression, encryption
+ ceph::real_time mtime;
+ uint64_t epoch{0};
+ bufferlist obj_tag;
+ bufferlist tail_tag;
+ std::string write_tag;
+ bool fake_tag{false};
+ std::string shadow_obj;
+ bool has_data{false};
+ bufferlist data;
+ bool prefetch_data{false};
+ bool keep_tail{false};
+ bool is_olh{false};
+ bufferlist olh_tag;
+ uint64_t pg_ver{false};
+ uint32_t zone_short_id{0};
+ bool compressed{false};
+
+ /* important! don't forget to update copy constructor */
+
+ RGWObjVersionTracker objv_tracker;
+
+ std::map<std::string, ceph::buffer::list> attrset;
+
+ RGWObjState();
+ RGWObjState(const RGWObjState& rhs);
+ ~RGWObjState();
+
+ bool get_attr(std::string name, bufferlist& dest) {
+ auto iter = attrset.find(name);
+ if (iter != attrset.end()) {
+ dest = iter->second;
+ return true;
+ }
+ return false;
+ }
+};
+
+/**
+ * @defgroup RGWSAL RGW Store Abstraction Layer
+ *
+ * The Store Abstraction Layer is an API that separates the top layer of RGW that
+ * handles client protocols (such as S3 or Swift) from the bottom layer of RGW that
+ * interacts with a backing store. It allows the creation of multiple backing stores
+ * that can co-exist with a single RGW instance, and allows the creation of stacking
+ * layers of translators that can modify operations as they pass down the stack.
+ * Examples of translators might be a cache layer, a duplication layer that copies
+ * operations to multiple stores, or a policy layer that sends some operations to one
+ * store and some to another.
+ *
+ * The basic unit of a SAL implementation is the Store. Whether an actual backing store
+ * or a translator, there will be a Store implementation that represents it. Examples
+ * are the RadosStore that communicates via RADOS with a Ceph cluster, and the DBStore
+ * that uses a SQL db (such as SQLite3) as a backing store. There is a singleton
+ * instance of each Store.
+ *
+ * Data within RGW is owned by a User. The User is the unit of authentication and
+ * access control.
+ *
+ * Data within RGW is stored as an Object. Each Object is a single chunk of data, owned
+ * by a single User, contained within a single Bucket. It has metadata associated with
+ * it, such as size, owner, and so on, and a set of key-value attributes that can
+ * contain anything needed by the top half.
+ *
+ * Data within RGW is organized into Buckets. Each Bucket is owned by a User, and
+ * contains Objects. There is a single, flat layer of Buckets, there is no hierarchy,
+ * and each Object is contained in a single Bucket.
+ *
+ * Instantiations of SAL classes are done as unique pointers, using std::unique_ptr.
+ * Instances of these classes are acquired via getters, and it's up to the caller to
+ * manage the lifetime.
+ *
+ * @note Anything using RGWObjContext is subject to change, as that type will not be
+ * used in the final API.
+ * @{
+ */
+
+/**
+ * @file rgw_sal.h
+ * @brief Base abstractions and API for SAL
+ */
+
+namespace rgw { namespace sal {
+
+/**
+ * @addtogroup RGWSAL
+ * @{
+ */
+
+#define RGW_SAL_VERSION 1
+
+struct MPSerializer;
+class GCChain;
+class RGWOIDCProvider;
+class RGWRole;
+
/** Attribute-handling mode for object write/copy operations.
 * NOTE(review): semantics inferred from the names — NONE leaves attrs
 * untouched, REPLACE overwrites them, MERGE combines with existing attrs;
 * confirm against the store implementations. Values are part of the API;
 * do not renumber. */
enum AttrsMod {
  ATTRSMOD_NONE = 0,
  ATTRSMOD_REPLACE = 1,
  ATTRSMOD_MERGE = 2
};
+
+// a simple streaming data processing abstraction
+/**
+ * @brief A simple streaming data processing abstraction
+ */
class DataProcessor {
 public:
  virtual ~DataProcessor() {}

  /**
   * @brief Consume a bufferlist in its entirety at the given object offset.
   *
   * An empty bufferlist is given to request that any buffered data be flushed, though this doesn't
   * wait for completions
   *
   * @param data   buffer to consume (moved in; empty requests a flush)
   * @param offset byte offset of this buffer within the object
   * @return 0 on success, negative error code on failure
   */
  virtual int process(bufferlist&& data, uint64_t offset) = 0;
};
+
+/**
+ * @brief a data consumer that writes an object in a bucket
+ */
class ObjectProcessor : public DataProcessor {
 public:
  /** prepare to start processing object data */
  virtual int prepare(optional_yield y) = 0;

  /** complete the operation and make its result visible to clients
   *
   * @param accounted_size logical (pre-compression/encryption) size
   * @param etag           entity tag to record for the object
   * @param mtime          out: resulting modification time (may be null)
   * @param set_mtime      modification time to set
   * @param attrs          object attributes to write
   * @param delete_at      expiration time, if any
   * @param if_match / if_nomatch  conditional-write ETags (may be null)
   * @param user_data      opaque user data (may be null)
   * @param zones_trace    multisite trace info (may be null)
   * @param canceled       out: set when the write lost a race and was canceled
   */
  virtual int complete(size_t accounted_size, const std::string& etag,
                       ceph::real_time *mtime, ceph::real_time set_mtime,
                       std::map<std::string, bufferlist>& attrs,
                       ceph::real_time delete_at,
                       const char *if_match, const char *if_nomatch,
                       const std::string *user_data,
                       rgw_zone_set *zones_trace, bool *canceled,
                       optional_yield y) = 0;
};
+
/**
 * Base class for AIO completions: drain() blocks until the tracked
 * asynchronous operations have finished.
 */
class Completions {
 public:
  Completions() = default;
  virtual ~Completions() = default;
  /// wait for all outstanding async operations; returns an error code
  virtual int drain() = 0;
};
+
+/** A list of key-value attributes */
+ using Attrs = std::map<std::string, ceph::buffer::list>;
+
+/**
+ * @brief Base singleton representing a Store or Filter
+ *
+ * The Driver is the base abstraction of the SAL layer. It represents a base storage
+ * mechanism, or an intermediate stacking layer. There is a single instance of a given
+ * Driver per RGW, and this Driver mediates all access to its backing.
+ *
+ * A Driver contains, loosely, @a User, @a Bucket, and @a Object entities. The @a Object
+ * contains data, and its associated metadata. The @a Bucket contains Objects, and
+ * metadata about the bucket. Both Buckets and Objects are owned by a @a User, which is
+ * the basic unit of access control.
+ *
+ * A Driver also has metadata and some global responsibilities. For example, a driver is
+ * responsible for managing the LifeCycle activities for its data.
+ */
class Driver {
  public:
    Driver() {}
    virtual ~Driver() = default;

    /** Post-creation initialization of driver */
    virtual int initialize(CephContext *cct, const DoutPrefixProvider *dpp) = 0;
    /** Name of this driver provider (e.g., "rados") */
    virtual const std::string get_name() const = 0;
    /** Get cluster unique identifier */
    virtual std::string get_cluster_id(const DoutPrefixProvider* dpp, optional_yield y) = 0;
    /** Get a User from a rgw_user. Does not query driver for user info, so quick */
    virtual std::unique_ptr<User> get_user(const rgw_user& u) = 0;
    /** Lookup a User by access key. Queries driver for user info. */
    virtual int get_user_by_access_key(const DoutPrefixProvider* dpp, const std::string& key, optional_yield y, std::unique_ptr<User>* user) = 0;
    /** Lookup a User by email address. Queries driver for user info. */
    virtual int get_user_by_email(const DoutPrefixProvider* dpp, const std::string& email, optional_yield y, std::unique_ptr<User>* user) = 0;
    /** Lookup a User by swift username. Queries driver for user info. */
    virtual int get_user_by_swift(const DoutPrefixProvider* dpp, const std::string& user_str, optional_yield y, std::unique_ptr<User>* user) = 0;
    /** Get a basic Object. This Object is not looked up, and is incomplete, since it
     * does not have a bucket. This should only be used when an Object is needed before
     * there is a Bucket, otherwise use the get_object() in the Bucket class. */
    virtual std::unique_ptr<Object> get_object(const rgw_obj_key& k) = 0;
    /** Get a Bucket by info. Does not query the driver, just uses the given bucket info. */
    virtual int get_bucket(User* u, const RGWBucketInfo& i, std::unique_ptr<Bucket>* bucket) = 0;
    /** Lookup a Bucket by key. Queries driver for bucket info. */
    virtual int get_bucket(const DoutPrefixProvider* dpp, User* u, const rgw_bucket& b, std::unique_ptr<Bucket>* bucket, optional_yield y) = 0;
    /** Lookup a Bucket by name. Queries driver for bucket info. */
    virtual int get_bucket(const DoutPrefixProvider* dpp, User* u, const std::string& tenant, const std::string& name, std::unique_ptr<Bucket>* bucket, optional_yield y) = 0;
    /** For multisite, this driver is the zone's master */
    virtual bool is_meta_master() = 0;
    /** For multisite, forward an OP to the zone's master */
    virtual int forward_request_to_master(const DoutPrefixProvider *dpp, User* user, obj_version* objv,
					  bufferlist& in_data, JSONParser* jp, req_info& info,
					  optional_yield y) = 0;
    /** Like forward_request_to_master(), but for IAM requests signed with the given access key */
    virtual int forward_iam_request_to_master(const DoutPrefixProvider *dpp, const RGWAccessKey& key, obj_version* objv,
					      bufferlist& in_data,
					      RGWXMLDecoder::XMLParser* parser, req_info& info,
					      optional_yield y) = 0;
    /** Get zone info for this driver */
    virtual Zone* get_zone() = 0;
    /** Get a unique ID specific to this zone. */
    virtual std::string zone_unique_id(uint64_t unique_num) = 0;
    /** Get a unique Swift transaction ID specific to this zone */
    virtual std::string zone_unique_trans_id(const uint64_t unique_num) = 0;
    /** Lookup a zonegroup by ID */
    virtual int get_zonegroup(const std::string& id, std::unique_ptr<ZoneGroup>* zonegroup) = 0;
    /** List all zones in all zone groups by ID */
    virtual int list_all_zones(const DoutPrefixProvider* dpp, std::list<std::string>& zone_ids) = 0;
    /** Get statistics about the cluster represented by this driver */
    virtual int cluster_stat(RGWClusterStat& stats) = 0;
    /** Get a @a Lifecycle object. Used to manage/run lifecycle transitions */
    virtual std::unique_ptr<Lifecycle> get_lifecycle(void) = 0;
    /** Get a @a Completions object. Used for Async I/O tracking */
    virtual std::unique_ptr<Completions> get_completions(void) = 0;

    /** Get a @a Notification object. Used to communicate with non-RGW daemons, such as
     * management/tracking software */
    /** RGWOp variant */
    virtual std::unique_ptr<Notification> get_notification(rgw::sal::Object* obj, rgw::sal::Object* src_obj, req_state* s,
        rgw::notify::EventType event_type, optional_yield y, const std::string* object_name=nullptr) = 0;
    /** No-req_state variant (e.g., rgwlc) */
    virtual std::unique_ptr<Notification> get_notification(
        const DoutPrefixProvider* dpp, rgw::sal::Object* obj, rgw::sal::Object* src_obj,
        rgw::notify::EventType event_type, rgw::sal::Bucket* _bucket, std::string& _user_id, std::string& _user_tenant,
        std::string& _req_id, optional_yield y) = 0;
    /** Read the topic config entry into @a data and (optionally) @a objv_tracker */
    virtual int read_topics(const std::string& tenant, rgw_pubsub_topics& topics, RGWObjVersionTracker* objv_tracker,
        optional_yield y, const DoutPrefixProvider *dpp) = 0;
    /** Write @a info and (optionally) @a objv_tracker into the config */
    virtual int write_topics(const std::string& tenant, const rgw_pubsub_topics& topics, RGWObjVersionTracker* objv_tracker,
	optional_yield y, const DoutPrefixProvider *dpp) = 0;
    /** Remove the topic config, optionally a specific version */
    virtual int remove_topics(const std::string& tenant, RGWObjVersionTracker* objv_tracker,
        optional_yield y,const DoutPrefixProvider *dpp) = 0;
    /** Get access to the lifecycle management thread */
    virtual RGWLC* get_rgwlc(void) = 0;
    /** Get access to the coroutine registry. Used to create new coroutine managers */
    virtual RGWCoroutinesManagerRegistry* get_cr_registry() = 0;

    /** Log usage data to the driver. Usage data is things like bytes sent/received and
     * op count */
    virtual int log_usage(const DoutPrefixProvider *dpp, std::map<rgw_user_bucket, RGWUsageBatch>& usage_info) = 0;
    /** Log OP data to the driver. Data is opaque to SAL */
    virtual int log_op(const DoutPrefixProvider *dpp, std::string& oid, bufferlist& bl) = 0;
    /** Register this driver to the service map. Somewhat Rados specific; may be removed*/
    virtual int register_to_service_map(const DoutPrefixProvider *dpp, const std::string& daemon_type,
					const std::map<std::string, std::string>& meta) = 0;
    /** Get default quota info. Used as fallback if a user or bucket has no quota set*/
    virtual void get_quota(RGWQuota& quota) = 0;
    /** Get global rate limit configuration*/
    virtual void get_ratelimit(RGWRateLimitInfo& bucket_ratelimit, RGWRateLimitInfo& user_ratelimit, RGWRateLimitInfo& anon_ratelimit) = 0;
    /** Enable or disable a set of buckets. e.g. if a User is suspended */
    virtual int set_buckets_enabled(const DoutPrefixProvider* dpp, std::vector<rgw_bucket>& buckets, bool enabled) = 0;
    /** Get a new request ID */
    virtual uint64_t get_new_req_id() = 0;
    /** Get a handler for bucket sync policy. */
    virtual int get_sync_policy_handler(const DoutPrefixProvider* dpp,
					std::optional<rgw_zone_id> zone,
					std::optional<rgw_bucket> bucket,
					RGWBucketSyncPolicyHandlerRef* phandler,
					optional_yield y) = 0;
    /** Get a status manager for bucket sync */
    virtual RGWDataSyncStatusManager* get_data_sync_manager(const rgw_zone_id& source_zone) = 0;
    /** Wake up sync threads for bucket metadata sync */
    virtual void wakeup_meta_sync_shards(std::set<int>& shard_ids) = 0;
    /** Wake up sync threads for bucket data sync */
    virtual void wakeup_data_sync_shards(const DoutPrefixProvider *dpp, const rgw_zone_id& source_zone, boost::container::flat_map<int, boost::container::flat_set<rgw_data_notify_entry>>& shard_ids) = 0;
    /** Clear all usage statistics globally */
    virtual int clear_usage(const DoutPrefixProvider *dpp) = 0;
    /** Get usage statistics for all users and buckets */
    virtual int read_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch,
			       uint32_t max_entries, bool* is_truncated,
			       RGWUsageIter& usage_iter,
			       std::map<rgw_user_bucket, rgw_usage_log_entry>& usage) = 0;
    /** Trim usage log for all users and buckets */
    virtual int trim_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) = 0;
    /** Get a configuration value for the given name */
    virtual int get_config_key_val(std::string name, bufferlist* bl) = 0;
    /** Start a metadata listing of the given section */
    virtual int meta_list_keys_init(const DoutPrefixProvider *dpp, const std::string& section, const std::string& marker, void** phandle) = 0;
    /** Get the next key from a metadata list */
    virtual int meta_list_keys_next(const DoutPrefixProvider *dpp, void* handle, int max, std::list<std::string>& keys, bool* truncated) = 0;
    /** Complete a metadata listing */
    virtual void meta_list_keys_complete(void* handle) = 0;
    /** Get the marker associated with the current metadata listing */
    virtual std::string meta_get_marker(void* handle) = 0;
    /** Remove a specific metadata key */
    virtual int meta_remove(const DoutPrefixProvider* dpp, std::string& metadata_key, optional_yield y) = 0;
    /** Get an instance of the Sync module for bucket sync */
    virtual const RGWSyncModuleInstanceRef& get_sync_module() = 0;
    /** Get the ID of the current host */
    virtual std::string get_host_id() = 0;
    /** Get a Lua script manager for running lua scripts */
    virtual std::unique_ptr<LuaManager> get_lua_manager() = 0;
    /** Get an IAM Role by name etc. */
    virtual std::unique_ptr<RGWRole> get_role(std::string name,
					      std::string tenant,
					      std::string path="",
					      std::string trust_policy="",
					      std::string max_session_duration_str="",
					      std::multimap<std::string,std::string> tags={}) = 0;
    /** Get an IAM Role by ID */
    virtual std::unique_ptr<RGWRole> get_role(std::string id) = 0;
    /** Get an IAM Role from an already-loaded RGWRoleInfo */
    virtual std::unique_ptr<RGWRole> get_role(const RGWRoleInfo& info) = 0;
    /** Get all IAM Roles optionally filtered by path */
    virtual int get_roles(const DoutPrefixProvider *dpp,
			  optional_yield y,
			  const std::string& path_prefix,
			  const std::string& tenant,
			  std::vector<std::unique_ptr<RGWRole>>& roles) = 0;
    /** Get an empty Open ID Connector provider */
    virtual std::unique_ptr<RGWOIDCProvider> get_oidc_provider() = 0;
    /** Get all Open ID Connector providers, optionally filtered by tenant */
    virtual int get_oidc_providers(const DoutPrefixProvider *dpp,
				   const std::string& tenant,
				   std::vector<std::unique_ptr<RGWOIDCProvider>>& providers) = 0;
    /** Get a Writer that appends to an object */
    virtual std::unique_ptr<Writer> get_append_writer(const DoutPrefixProvider *dpp,
				  optional_yield y,
				  rgw::sal::Object* obj,
				  const rgw_user& owner,
				  const rgw_placement_rule *ptail_placement_rule,
				  const std::string& unique_tag,
				  uint64_t position,
				  uint64_t *cur_accounted_size) = 0;
    /** Get a Writer that atomically writes an entire object */
    virtual std::unique_ptr<Writer> get_atomic_writer(const DoutPrefixProvider *dpp,
				  optional_yield y,
				  rgw::sal::Object* obj,
				  const rgw_user& owner,
				  const rgw_placement_rule *ptail_placement_rule,
				  uint64_t olh_epoch,
				  const std::string& unique_tag) = 0;

    /** Get the compression type of a placement rule */
    virtual const std::string& get_compression_type(const rgw_placement_rule& rule) = 0;
    /** Check to see if this placement rule is valid */
    virtual bool valid_placement(const rgw_placement_rule& rule) = 0;

    /** Clean up a driver for termination */
    virtual void finalize(void) = 0;

    /** Get the Ceph context associated with this driver. May be removed. */
    virtual CephContext* ctx(void) = 0;

    /** Register admin APIs unique to this driver */
    virtual void register_admin_apis(RGWRESTMgr* mgr) = 0;
};
+
+/**
+ * @brief User abstraction
+ *
+ * This represents a user. In general, there will be a @a User associated with an OP
+ * (the user performing the OP), and potentially several others acting as owners.
+ * Lifetime of a User is a bit tricky, since it must last as long as any Buckets
+ * associated with it. A User has associated metadata, including a set of key/value
+ * attributes, and statistics (including usage) about the User.
+ */
+class User {
+ public:
+ User() {}
+ virtual ~User() = default;
+
+ /** Clone a copy of this user. Used when modification is necessary of the copy */
+ virtual std::unique_ptr<User> clone() = 0;
+ /** List the buckets owned by a user */
+ virtual int list_buckets(const DoutPrefixProvider* dpp,
+ const std::string& marker, const std::string& end_marker,
+ uint64_t max, bool need_stats, BucketList& buckets,
+ optional_yield y) = 0;
+ /** Create a new bucket owned by this user. Creates in the backing store, not just the instantiation. */
+ virtual int create_bucket(const DoutPrefixProvider* dpp,
+ const rgw_bucket& b,
+ const std::string& zonegroup_id,
+ rgw_placement_rule& placement_rule,
+ std::string& swift_ver_location,
+ const RGWQuotaInfo* pquota_info,
+ const RGWAccessControlPolicy& policy,
+ Attrs& attrs,
+ RGWBucketInfo& info,
+ obj_version& ep_objv,
+ bool exclusive,
+ bool obj_lock_enabled,
+ bool* existed,
+ req_info& req_info,
+ std::unique_ptr<Bucket>* bucket,
+ optional_yield y) = 0;
+
+ /** Get the display name for this User */
+ virtual std::string& get_display_name() = 0;
+ /** Get the tenant name for this User */
+ virtual const std::string& get_tenant() = 0;
+ /** Set the tenant name for this User */
+ virtual void set_tenant(std::string& _t) = 0;
+ /** Get the namespace for this User */
+ virtual const std::string& get_ns() = 0;
+ /** Set the namespace for this User */
+ virtual void set_ns(std::string& _ns) = 0;
+ /** Clear the namespace for this User */
+ virtual void clear_ns() = 0;
+ /** Get the full ID for this User */
+ virtual const rgw_user& get_id() const = 0;
+ /** Get the type of this User */
+ virtual uint32_t get_type() const = 0;
+ /** Get the maximum number of buckets allowed for this User */
+ virtual int32_t get_max_buckets() const = 0;
+ /** Get the capabilities for this User */
+ virtual const RGWUserCaps& get_caps() const = 0;
+ /** Get the version tracker for this User */
+ virtual RGWObjVersionTracker& get_version_tracker() = 0;
+ /** Get the cached attributes for this User */
+ virtual Attrs& get_attrs() = 0;
+ /** Set the cached attributes for this User */
+ virtual void set_attrs(Attrs& _attrs) = 0;
+ /** Check if a User is empty */
+ virtual bool empty() const = 0;
+ /** Check if a User pointer is empty */
+ static bool empty(const User* u) { return (!u || u->empty()); }
+ /** Check if a User unique_pointer is empty */
+ static bool empty(const std::unique_ptr<User>& u) { return (!u || u->empty()); }
+ /** Read the User attributes from the backing Store */
+ virtual int read_attrs(const DoutPrefixProvider* dpp, optional_yield y) = 0;
+ /** Set the attributes in attrs, leaving any other existing attrs set, and
+ * write them to the backing store; a merge operation */
+ virtual int merge_and_store_attrs(const DoutPrefixProvider* dpp, Attrs& new_attrs, optional_yield y) = 0;
+ /** Read the User stats from the backing Store, synchronous */
+ virtual int read_stats(const DoutPrefixProvider *dpp,
+ optional_yield y, RGWStorageStats* stats,
+ ceph::real_time* last_stats_sync = nullptr,
+ ceph::real_time* last_stats_update = nullptr) = 0;
+ /** Read the User stats from the backing Store, asynchronous; @a cb is invoked on completion */
+ virtual int read_stats_async(const DoutPrefixProvider *dpp, RGWGetUserStats_CB* cb) = 0;
+ /** Flush accumulated stat changes for this User to the backing store */
+ virtual int complete_flush_stats(const DoutPrefixProvider *dpp, optional_yield y) = 0;
+ /** Read detailed usage stats for this User from the backing store, over the
+ * epoch range [start_epoch, end_epoch], up to @a max_entries at a time */
+ virtual int read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch,
+ uint64_t end_epoch, uint32_t max_entries,
+ bool* is_truncated, RGWUsageIter& usage_iter,
+ std::map<rgw_user_bucket, rgw_usage_log_entry>& usage) = 0;
+ /** Trim User usage stats to the given epoch range */
+ virtual int trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) = 0;
+
+ /** Load this User from the backing store. requires ID to be set, fills all other fields. */
+ virtual int load_user(const DoutPrefixProvider* dpp, optional_yield y) = 0;
+ /** Store this User to the backing store */
+ virtual int store_user(const DoutPrefixProvider* dpp, optional_yield y, bool exclusive, RGWUserInfo* old_info = nullptr) = 0;
+ /** Remove this User from the backing store */
+ virtual int remove_user(const DoutPrefixProvider* dpp, optional_yield y) = 0;
+ /** Verify multi-factor authentication for this user */
+ virtual int verify_mfa(const std::string& mfa_str, bool* verified, const DoutPrefixProvider* dpp, optional_yield y) = 0;
+
+ /* dang temporary; will be removed when User is complete */
+ virtual RGWUserInfo& get_info() = 0;
+
+ /** Print the User to @a out */
+ virtual void print(std::ostream& out) const = 0;
+
+ friend inline std::ostream& operator<<(std::ostream& out, const User& u) {
+ u.print(out);
+ return out;
+ }
+
+ friend inline std::ostream& operator<<(std::ostream& out, const User* u) {
+ if (!u)
+ out << "<NULL>";
+ else
+ u->print(out);
+ return out;
+ }
+
+ friend inline std::ostream& operator<<(std::ostream& out, const std::unique_ptr<User>& p) {
+ out << p.get();
+ return out;
+ }
+};
+
+/**
+ * @brief Bucket abstraction
+ *
+ * This represents a bucket. A bucket is a container for objects. It is owned by a user, and has
+ * its own set of metadata, including a set of key/value attributes. A bucket may not contain
+ * other buckets, only objects. Buckets have Access Control Lists (ACLs) that control what users
+ * can access the contents of the bucket, and in what ways.
+ */
+class Bucket {
+ public:
+
+ /**
+ * @brief Parameters for a bucket list operation
+ */
+ struct ListParams {
+ std::string prefix;
+ std::string delim;
+ rgw_obj_key marker;
+ rgw_obj_key end_marker;
+ std::string ns;
+ bool enforce_ns{true};
+ RGWAccessListFilter* access_list_filter{nullptr};
+ RGWBucketListNameFilter force_check_filter;
+ bool list_versions{false};
+ bool allow_unordered{false};
+ int shard_id{RGW_NO_SHARD};
+
+ friend std::ostream& operator<<(std::ostream& out, const ListParams& p) {
+ out << "rgw::sal::Bucket::ListParams{ prefix=\"" << p.prefix <<
+ "\", delim=\"" << p.delim <<
+ "\", marker=\"" << p.marker <<
+ "\", end_marker=\"" << p.end_marker <<
+ "\", ns=\"" << p.ns <<
+ "\", enforce_ns=" << p.enforce_ns <<
+ ", list_versions=" << p.list_versions <<
+ ", allow_unordered=" << p.allow_unordered <<
+ ", shard_id=" << p.shard_id <<
+ " }";
+ return out;
+ }
+ };
+ /**
+ * @brief Results from a bucket list operation
+ */
+ struct ListResults {
+ std::vector<rgw_bucket_dir_entry> objs;
+ std::map<std::string, bool> common_prefixes;
+ bool is_truncated{false};
+ rgw_obj_key next_marker;
+ };
+
+ Bucket() = default;
+ virtual ~Bucket() = default;
+
+ /** Get an @a Object belonging to this bucket */
+ virtual std::unique_ptr<Object> get_object(const rgw_obj_key& key) = 0;
+ /** List the contents of this bucket */
+ virtual int list(const DoutPrefixProvider* dpp, ListParams&, int, ListResults&, optional_yield y) = 0;
+ /** Get the cached attributes associated with this bucket */
+ virtual Attrs& get_attrs(void) = 0;
+ /** Set the cached attributes on this bucket */
+ virtual int set_attrs(Attrs a) = 0;
+ /** Remove this bucket from the backing store */
+ virtual int remove_bucket(const DoutPrefixProvider* dpp, bool delete_children, bool forward_to_master, req_info* req_info, optional_yield y) = 0;
+ /** Remove this bucket, bypassing garbage collection. May be removed */
+ virtual int remove_bucket_bypass_gc(int concurrent_max, bool
+ keep_index_consistent,
+ optional_yield y, const
+ DoutPrefixProvider *dpp) = 0;
+ /** Get the ACL for this bucket */
+ virtual RGWAccessControlPolicy& get_acl(void) = 0;
+ /** Set the ACL for this bucket */
+ virtual int set_acl(const DoutPrefixProvider* dpp, RGWAccessControlPolicy& acl, optional_yield y) = 0;
+
+ // XXXX hack
+ virtual void set_owner(rgw::sal::User* _owner) = 0;
+
+ /** Load this bucket from the backing store. Requires the key to be set, fills other fields.
+ * If @a get_stats is true, then statistics on the bucket are also looked up. */
+ virtual int load_bucket(const DoutPrefixProvider* dpp, optional_yield y, bool get_stats = false) = 0;
+ /** Read the bucket stats from the backing Store, synchronous */
+ virtual int read_stats(const DoutPrefixProvider *dpp,
+ const bucket_index_layout_generation& idx_layout,
+ int shard_id, std::string* bucket_ver, std::string* master_ver,
+ std::map<RGWObjCategory, RGWStorageStats>& stats,
+ std::string* max_marker = nullptr,
+ bool* syncstopped = nullptr) = 0;
+ /** Read the bucket stats from the backing Store, asynchronous; @a ctx is invoked on completion */
+ virtual int read_stats_async(const DoutPrefixProvider *dpp,
+ const bucket_index_layout_generation& idx_layout,
+ int shard_id, RGWGetBucketStats_CB* ctx) = 0;
+ /** Sync this bucket's stats to the owning user's stats in the backing store */
+ virtual int sync_user_stats(const DoutPrefixProvider *dpp, optional_yield y) = 0;
+ /** Refresh the metadata stats (size, count, and so on) from the backing store */
+ virtual int update_container_stats(const DoutPrefixProvider* dpp) = 0;
+ /** Check if this bucket needs resharding, and schedule it if it does */
+ virtual int check_bucket_shards(const DoutPrefixProvider* dpp) = 0;
+ /** Change the owner of this bucket in the backing store. Current owner must be set. Does not
+ * change ownership of the objects in the bucket. */
+ virtual int chown(const DoutPrefixProvider* dpp, User& new_user, optional_yield y) = 0;
+ /** Store the cached bucket info into the backing store */
+ virtual int put_info(const DoutPrefixProvider* dpp, bool exclusive, ceph::real_time mtime) = 0;
+ /** Check to see if the given user is the owner of this bucket */
+ virtual bool is_owner(User* user) = 0;
+ /** Get the owner of this bucket */
+ virtual User* get_owner(void) = 0;
+ /** Get the owner of this bucket in the form of an ACLOwner object */
+ virtual ACLOwner get_acl_owner(void) = 0;
+ /** Check in the backing store if this bucket is empty */
+ virtual int check_empty(const DoutPrefixProvider* dpp, optional_yield y) = 0;
+ /** Check if the given size fits within the quota */
+ virtual int check_quota(const DoutPrefixProvider *dpp, RGWQuota& quota, uint64_t obj_size, optional_yield y, bool check_size_only = false) = 0;
+ /** Set the attributes in attrs, leaving any other existing attrs set, and
+ * write them to the backing store; a merge operation */
+ virtual int merge_and_store_attrs(const DoutPrefixProvider* dpp, Attrs& new_attrs, optional_yield y) = 0;
+ /** Try to refresh the cached bucket info from the backing store. Used in
+ * read-modify-update loop. */
+ virtual int try_refresh_info(const DoutPrefixProvider* dpp, ceph::real_time* pmtime) = 0;
+ /** Read usage information about this bucket from the backing store */
+ virtual int read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
+ bool* is_truncated, RGWUsageIter& usage_iter,
+ std::map<rgw_user_bucket, rgw_usage_log_entry>& usage) = 0;
+ /** Trim the usage information to the given epoch range */
+ virtual int trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) = 0;
+ /** Remove objects from the bucket index of this bucket. May be removed from API */
+ virtual int remove_objs_from_index(const DoutPrefixProvider *dpp, std::list<rgw_obj_index_key>& objs_to_unlink) = 0;
+ /** Check the state of the bucket index, and get stats from it. May be removed from API */
+ virtual int check_index(const DoutPrefixProvider *dpp, std::map<RGWObjCategory, RGWStorageStats>& existing_stats, std::map<RGWObjCategory, RGWStorageStats>& calculated_stats) = 0;
+ /** Rebuild the bucket index. May be removed from API */
+ virtual int rebuild_index(const DoutPrefixProvider *dpp) = 0;
+ /** Set a timeout on the check_index() call. May be removed from API */
+ virtual int set_tag_timeout(const DoutPrefixProvider *dpp, uint64_t timeout) = 0;
+ /** Remove this specific bucket instance from the backing store. May be removed from API */
+ virtual int purge_instance(const DoutPrefixProvider* dpp) = 0;
+
+ /** Check if this instantiation is empty */
+ virtual bool empty() const = 0;
+ /** Get the cached name of this bucket */
+ virtual const std::string& get_name() const = 0;
+ /** Get the cached tenant of this bucket */
+ virtual const std::string& get_tenant() const = 0;
+ /** Get the cached marker of this bucket */
+ virtual const std::string& get_marker() const = 0;
+ /** Get the cached ID of this bucket */
+ virtual const std::string& get_bucket_id() const = 0;
+ /** Get the cached size of this bucket */
+ virtual size_t get_size() const = 0;
+ /** Get the cached rounded size of this bucket */
+ virtual size_t get_size_rounded() const = 0;
+ /** Get the cached object count of this bucket */
+ virtual uint64_t get_count() const = 0;
+ /** Get the cached placement rule of this bucket */
+ virtual rgw_placement_rule& get_placement_rule() = 0;
+ /** Get the cached creation time of this bucket */
+ virtual ceph::real_time& get_creation_time() = 0;
+ /** Get the cached modification time of this bucket */
+ virtual ceph::real_time& get_modification_time() = 0;
+ /** Get the cached version of this bucket */
+ virtual obj_version& get_version() = 0;
+ /** Set the cached version of this bucket */
+ virtual void set_version(obj_version &ver) = 0;
+ /** Check if this bucket is versioned */
+ virtual bool versioned() = 0;
+ /** Check if this bucket has versioning enabled */
+ virtual bool versioning_enabled() = 0;
+
+ /** Check if a Bucket pointer is empty */
+ static bool empty(const Bucket* b) { return (!b || b->empty()); }
+ /** Check if a Bucket unique pointer is empty */
+ static bool empty(const std::unique_ptr<Bucket>& b) { return (!b || b->empty()); }
+ /** Clone a copy of this bucket. Used when modification is necessary of the copy */
+ virtual std::unique_ptr<Bucket> clone() = 0;
+
+ /** Create a multipart upload in this bucket */
+ virtual std::unique_ptr<MultipartUpload> get_multipart_upload(
+ const std::string& oid,
+ std::optional<std::string> upload_id=std::nullopt,
+ ACLOwner owner={}, ceph::real_time mtime=real_clock::now()) = 0;
+ /** List multipart uploads currently in this bucket */
+ virtual int list_multiparts(const DoutPrefixProvider *dpp,
+ const std::string& prefix,
+ std::string& marker,
+ const std::string& delim,
+ const int& max_uploads,
+ std::vector<std::unique_ptr<MultipartUpload>>& uploads,
+ std::map<std::string, bool> *common_prefixes,
+ bool *is_truncated) = 0;
+ /** Abort multipart uploads in a bucket */
+ virtual int abort_multiparts(const DoutPrefixProvider* dpp,
+ CephContext* cct) = 0;
+
+ /** Read the bucket notification config into @a notifications with and (optionally) @a objv_tracker */
+ virtual int read_topics(rgw_pubsub_bucket_topics& notifications,
+ RGWObjVersionTracker* objv_tracker, optional_yield y, const DoutPrefixProvider *dpp) = 0;
+ /** Write @a notifications with (optionally) @a objv_tracker into the bucket notification config */
+ virtual int write_topics(const rgw_pubsub_bucket_topics& notifications, RGWObjVersionTracker* objv_tracker,
+ optional_yield y, const DoutPrefixProvider *dpp) = 0;
+ /** Remove the bucket notification config with (optionally) @a objv_tracker */
+ virtual int remove_topics(RGWObjVersionTracker* objv_tracker,
+ optional_yield y, const DoutPrefixProvider *dpp) = 0;
+
+ /* dang - This is temporary, until the API is completed */
+ virtual rgw_bucket& get_key() = 0;
+ virtual RGWBucketInfo& get_info() = 0;
+
+ /** Print the Bucket to @a out */
+ virtual void print(std::ostream& out) const = 0;
+
+ friend inline std::ostream& operator<<(std::ostream& out, const Bucket& b) {
+ b.print(out);
+ return out;
+ }
+
+ friend inline std::ostream& operator<<(std::ostream& out, const Bucket* b) {
+ if (!b)
+ out << "<NULL>";
+ else
+ b->print(out);
+ return out;
+ }
+
+ friend inline std::ostream& operator<<(std::ostream& out, const std::unique_ptr<Bucket>& p) {
+ out << p.get();
+ return out;
+ }
+
+ virtual bool operator==(const Bucket& b) const = 0;
+ virtual bool operator!=(const Bucket& b) const = 0;
+
+ friend class BucketList;
+};
+
+/**
+ * @brief A list of buckets
+ *
+ * This is the result from a bucket listing operation.
+ */
+class BucketList {
+  std::map<std::string, std::unique_ptr<Bucket>> buckets; // owned, keyed by bucket name
+  bool truncated; // true if more buckets remain to be listed
+
+public:
+  BucketList() : buckets(), truncated(false) {}
+  BucketList(BucketList&& _bl) :
+    buckets(std::move(_bl.buckets)),
+    truncated(_bl.truncated)
+  { }
+  BucketList& operator=(const BucketList&) = delete;
+  /** Move-merge the buckets of @a _bl into this list.  Uses std::map::merge()
+   * (C++17 node splicing), which keeps the previous emplace-loop's semantics
+   * of preserving an existing entry on a name collision, but without silently
+   * destroying the colliding bucket (it remains owned by @a _bl) and without
+   * leaving moved-from entries behind in the source map. */
+  BucketList& operator=(BucketList&& _bl) {
+    buckets.merge(_bl.buckets);
+    truncated = _bl.truncated;
+    return *this;
+  }
+
+  /** Get the list of buckets. The list is a map of <bucket-name, Bucket> pairs. */
+  std::map<std::string, std::unique_ptr<Bucket>>& get_buckets() { return buckets; }
+  /** True if the list is truncated (that is, there are more buckets to list) */
+  bool is_truncated(void) const { return truncated; }
+  /** Set the truncated state of the list */
+  void set_truncated(bool trunc) { truncated = trunc; }
+  /** Add a bucket to the list. Takes ownership of the bucket */
+  void add(std::unique_ptr<Bucket> bucket) {
+    buckets.emplace(bucket->get_name(), std::move(bucket));
+  }
+  /** The number of buckets in this list */
+  size_t count() const { return buckets.size(); }
+  /** Clear the list */
+  void clear(void) {
+    buckets.clear();
+    truncated = false;
+  }
+};
+
+/**
+ * @brief Object abstraction
+ *
+ * This represents an Object. An Object is the basic unit of data storage. It
+ * represents a blob of data, a set of metadata (such as size, owner, ACLs, etc.) and
+ * a set of key/value attributes. Objects may be versioned. If a versioned object
+ * is written to, a new object with the same name but a different version is created,
+ * and the old version of the object is still accessible. If an unversioned object
+ * is written to, it is replaced, and the old data is not accessible.
+ */
+class Object {
+ public:
+
+ /**
+ * @brief Read operation on an Object
+ *
+ * This represents a Read operation on an Object. Read operations are optionally
+ * asynchronous, using the iterate() API.
+ */
+ struct ReadOp {
+ struct Params {
+ const ceph::real_time* mod_ptr{nullptr};
+ const ceph::real_time* unmod_ptr{nullptr};
+ bool high_precision_time{false};
+ uint32_t mod_zone_id{0};
+ uint64_t mod_pg_ver{0};
+ const char* if_match{nullptr};
+ const char* if_nomatch{nullptr};
+ ceph::real_time* lastmod{nullptr};
+ rgw_obj* target_obj{nullptr}; // XXX dang remove?
+ } params;
+
+ virtual ~ReadOp() = default;
+
+ /** Prepare the Read op. Must be called first */
+ virtual int prepare(optional_yield y, const DoutPrefixProvider* dpp) = 0;
+
+ /** Synchronous read. Read from @a ofs to @a end (inclusive)
+ * into @a bl. Length is `end - ofs + 1`. */
+ virtual int read(int64_t ofs, int64_t end, bufferlist& bl,
+ optional_yield y, const DoutPrefixProvider* dpp) = 0;
+
+ /** Asynchronous read. Read from @a ofs to @a end (inclusive)
+ * calling @a cb on each read chunk. Length is `end - ofs +
+ * 1`. */
+ virtual int iterate(const DoutPrefixProvider* dpp, int64_t ofs,
+ int64_t end, RGWGetDataCB* cb, optional_yield y) = 0;
+
+ /** Get an attribute by name */
+ virtual int get_attr(const DoutPrefixProvider* dpp, const char* name, bufferlist& dest, optional_yield y) = 0;
+ };
+
+ /**
+ * @brief Delete operation on an Object
+ *
+ * This deletes an Object from the backing store.
+ */
+ struct DeleteOp {
+ struct Params {
+ ACLOwner bucket_owner;
+ ACLOwner obj_owner;
+ int versioning_status{0};
+ uint64_t olh_epoch{0};
+ std::string marker_version_id;
+ uint32_t bilog_flags{0};
+ std::list<rgw_obj_index_key>* remove_objs{nullptr};
+ ceph::real_time expiration_time;
+ ceph::real_time unmod_since;
+ ceph::real_time mtime;
+ bool high_precision_time{false};
+ rgw_zone_set* zones_trace{nullptr};
+ bool abortmp{false};
+ uint64_t parts_accounted_size{0};
+ } params;
+
+ struct Result {
+ bool delete_marker{false};
+ std::string version_id;
+ } result;
+
+ virtual ~DeleteOp() = default;
+
+ /** Delete the object */
+ virtual int delete_obj(const DoutPrefixProvider* dpp, optional_yield y) = 0;
+ };
+
+ Object() {}
+ virtual ~Object() = default;
+
+ /** Shortcut synchronous delete call for common deletes */
+ virtual int delete_object(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ bool prevent_versioning = false) = 0;
+ /** Asynchronous delete call */
+ virtual int delete_obj_aio(const DoutPrefixProvider* dpp, RGWObjState* astate, Completions* aio,
+ bool keep_index_consistent, optional_yield y) = 0;
+ /** Copy this object to another object. */
+ virtual int copy_object(User* user,
+ req_info* info, const rgw_zone_id& source_zone,
+ rgw::sal::Object* dest_object, rgw::sal::Bucket* dest_bucket,
+ rgw::sal::Bucket* src_bucket,
+ const rgw_placement_rule& dest_placement,
+ ceph::real_time* src_mtime, ceph::real_time* mtime,
+ const ceph::real_time* mod_ptr, const ceph::real_time* unmod_ptr,
+ bool high_precision_time,
+ const char* if_match, const char* if_nomatch,
+ AttrsMod attrs_mod, bool copy_if_newer, Attrs& attrs,
+ RGWObjCategory category, uint64_t olh_epoch,
+ boost::optional<ceph::real_time> delete_at,
+ std::string* version_id, std::string* tag, std::string* etag,
+ void (*progress_cb)(off_t, void *), void* progress_data,
+ const DoutPrefixProvider* dpp, optional_yield y) = 0;
+ /** Get the ACL for this object */
+ virtual RGWAccessControlPolicy& get_acl(void) = 0;
+ /** Set the ACL for this object */
+ virtual int set_acl(const RGWAccessControlPolicy& acl) = 0;
+ /** Mark further operations on this object as being atomic */
+ virtual void set_atomic() = 0;
+ /** Check if this object is atomic */
+ virtual bool is_atomic() = 0;
+ /** Pre-fetch data when reading */
+ virtual void set_prefetch_data() = 0;
+ /** Check if this object should prefetch */
+ virtual bool is_prefetch_data() = 0;
+ /** Mark data as compressed */
+ virtual void set_compressed() = 0;
+ /** Check if this object is compressed */
+ virtual bool is_compressed() = 0;
+ /** Invalidate cached info about this object, except atomic, prefetch, and
+ * compressed */
+ virtual void invalidate() = 0;
+
+ /** Check to see if this object has an empty key. This means it's uninitialized */
+ virtual bool empty() const = 0;
+ /** Get the name of this object */
+ virtual const std::string &get_name() const = 0;
+
+ /** Get the object state for this object. Will be removed in the future */
+ virtual int get_obj_state(const DoutPrefixProvider* dpp, RGWObjState **state, optional_yield y, bool follow_olh = true) = 0;
+ /** Set attributes for this object in the backing store. Attrs can be set or
+ * deleted. @note the attribute APIs may be revisited in the future. */
+ virtual int set_obj_attrs(const DoutPrefixProvider* dpp, Attrs* setattrs, Attrs* delattrs, optional_yield y) = 0;
+ /** Get attributes for this object */
+ virtual int get_obj_attrs(optional_yield y, const DoutPrefixProvider* dpp, rgw_obj* target_obj = NULL) = 0;
+ /** Modify attributes for this object. */
+ virtual int modify_obj_attrs(const char* attr_name, bufferlist& attr_val, optional_yield y, const DoutPrefixProvider* dpp) = 0;
+ /** Delete attributes for this object */
+ virtual int delete_obj_attrs(const DoutPrefixProvider* dpp, const char* attr_name, optional_yield y) = 0;
+ /** Check to see if this object has expired */
+ virtual bool is_expired() = 0;
+ /** Create a randomized instance ID for this object */
+ virtual void gen_rand_obj_instance_name() = 0;
+ /** Get a multipart serializer for this object */
+ virtual std::unique_ptr<MPSerializer> get_serializer(const DoutPrefixProvider *dpp,
+ const std::string& lock_name) = 0;
+ /** Move the data of an object to new placement storage */
+ virtual int transition(Bucket* bucket,
+ const rgw_placement_rule& placement_rule,
+ const real_time& mtime,
+ uint64_t olh_epoch,
+ const DoutPrefixProvider* dpp,
+ optional_yield y) = 0;
+ /** Move an object to the cloud */
+ virtual int transition_to_cloud(Bucket* bucket,
+ rgw::sal::PlacementTier* tier,
+ rgw_bucket_dir_entry& o,
+ std::set<std::string>& cloud_targets,
+ CephContext* cct,
+ bool update_object,
+ const DoutPrefixProvider* dpp,
+ optional_yield y) = 0;
+ /** Check to see if two placement rules match */
+ virtual bool placement_rules_match(rgw_placement_rule& r1, rgw_placement_rule& r2) = 0;
+ /** Dump driver-specific object layout info in JSON */
+ virtual int dump_obj_layout(const DoutPrefixProvider *dpp, optional_yield y, Formatter* f) = 0;
+
+ /** Get the cached attributes for this object */
+ virtual Attrs& get_attrs(void) = 0;
+ /** Get the (const) cached attributes for this object */
+ virtual const Attrs& get_attrs(void) const = 0;
+ /** Set the cached attributes for this object */
+ virtual int set_attrs(Attrs a) = 0;
+ /** Check to see if attributes are cached on this object */
+ virtual bool has_attrs(void) = 0;
+ /** Get the cached modification time for this object */
+ virtual ceph::real_time get_mtime(void) const = 0;
+ /** Get the cached size for this object */
+ virtual uint64_t get_obj_size(void) const = 0;
+ /** Get the bucket containing this object */
+ virtual Bucket* get_bucket(void) const = 0;
+ /** Set the bucket containing this object */
+ virtual void set_bucket(Bucket* b) = 0;
+ /** Get the sharding hash representation of this object */
+ virtual std::string get_hash_source(void) = 0;
+ /** Set the sharding hash representation of this object */
+ virtual void set_hash_source(std::string s) = 0;
+ /** Build an Object Identifier string for this object */
+ virtual std::string get_oid(void) const = 0;
+ /** True if this object is a delete marker (newest version is deleted) */
+ virtual bool get_delete_marker(void) = 0;
+ /** True if this object is stored in the extra data pool */
+ virtual bool get_in_extra_data(void) = 0;
+ /** Set the in_extra_data field */
+ virtual void set_in_extra_data(bool i) = 0;
+ /** Helper to sanitize object size, offset, and end values */
+ int range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end);
+ /** Set the cached size of this object */
+ virtual void set_obj_size(uint64_t s) = 0;
+ /** Set the cached name of this object */
+ virtual void set_name(const std::string& n) = 0;
+ /** Set the cached key of this object */
+ virtual void set_key(const rgw_obj_key& k) = 0;
+ /** Get an rgw_obj representing this object */
+ virtual rgw_obj get_obj(void) const = 0;
+
+ /** Restore the previous swift version of this object */
+ virtual int swift_versioning_restore(bool& restored, /* out */
+ const DoutPrefixProvider* dpp) = 0;
+ /** Copy the current version of a swift object to the configured destination bucket*/
+ virtual int swift_versioning_copy(const DoutPrefixProvider* dpp,
+ optional_yield y) = 0;
+
+ /** Get a new ReadOp for this object */
+ virtual std::unique_ptr<ReadOp> get_read_op() = 0;
+ /** Get a new DeleteOp for this object */
+ virtual std::unique_ptr<DeleteOp> get_delete_op() = 0;
+
+ /** Get @a count OMAP values via listing, starting at @a marker for this object */
+ virtual int omap_get_vals(const DoutPrefixProvider *dpp, const std::string& marker, uint64_t count,
+ std::map<std::string, bufferlist>* m,
+ bool* pmore, optional_yield y) = 0;
+ /** Get all OMAP key/value pairs for this object */
+ virtual int omap_get_all(const DoutPrefixProvider *dpp, std::map<std::string, bufferlist>* m,
+ optional_yield y) = 0;
+ /** Get the OMAP values matching the given set of keys */
+ virtual int omap_get_vals_by_keys(const DoutPrefixProvider *dpp, const std::string& oid,
+ const std::set<std::string>& keys,
+ Attrs* vals) = 0;
+ /** Set a single OMAP value for the given key */
+ virtual int omap_set_val_by_key(const DoutPrefixProvider *dpp, const std::string& key, bufferlist& val,
+ bool must_exist, optional_yield y) = 0;
+ /** Change the ownership of this object */
+ virtual int chown(User& new_user, const DoutPrefixProvider* dpp, optional_yield y) = 0;
+
+ /** Check to see if the given object pointer is uninitialized */
+ static bool empty(const Object* o) { return (!o || o->empty()); }
+ /** Check to see if the given object unique pointer is uninitialized */
+ static bool empty(const std::unique_ptr<Object>& o) { return (!o || o->empty()); }
+ /** Get a unique copy of this object */
+ virtual std::unique_ptr<Object> clone() = 0;
+
+ /* dang - This is temporary, until the API is completed */
+ /** Get the key for this object */
+ virtual rgw_obj_key& get_key() = 0;
+ /** Set the instance for this object */
+ virtual void set_instance(const std::string &i) = 0;
+ /** Get the instance for this object */
+ virtual const std::string &get_instance() const = 0;
+ /** Check to see if this object has an instance set */
+ virtual bool have_instance(void) = 0;
+ /** Clear the instance on this object */
+ virtual void clear_instance() = 0;
+
+ /** Print the Object to @a out */
+ virtual void print(std::ostream& out) const = 0;
+
+ friend inline std::ostream& operator<<(std::ostream& out, const Object& o) {
+ o.print(out);
+ return out;
+ }
+ friend inline std::ostream& operator<<(std::ostream& out, const Object* o) {
+ if (!o)
+ out << "<NULL>";
+ else
+ o->print(out);
+ return out;
+ }
+ friend inline std::ostream& operator<<(std::ostream& out, const std::unique_ptr<Object>& p) {
+ out << p.get();
+ return out;
+ }
+};
+
+/**
+ * @brief Abstraction of a single part of a multipart upload
+ */
+class MultipartPart {
+public:
+ MultipartPart() = default;
+ virtual ~MultipartPart() = default;
+
+ /** Get the part number of this part */
+ virtual uint32_t get_num() = 0;
+ /** Get the size (in bytes) of this part */
+ virtual uint64_t get_size() = 0;
+ /** Get the etag of this part */
+ virtual const std::string& get_etag() = 0;
+ /** Get the modification time of this part */
+ virtual ceph::real_time& get_mtime() = 0;
+};
+
+/**
+ * @brief Abstraction of a multipart upload
+ *
+ * This represents a multipart upload. For large objects, it's inefficient to do a
+ * single, long-lived upload of the object. Instead, protocols such as S3 allow the
+ * client to start a multipart upload, and then upload object in smaller parts in
+ * parallel. A MultipartUpload consists of a target bucket, a unique identifier, and a
+ * set of upload parts.
+ */
+class MultipartUpload {
+public:
+ MultipartUpload() = default;
+ virtual ~MultipartUpload() = default;
+
+ /** Get the name of the object representing this upload in the backing store */
+ virtual const std::string& get_meta() const = 0;
+ /** Get the name of the target object for this upload */
+ virtual const std::string& get_key() const = 0;
+ /** Get the unique ID of this upload */
+ virtual const std::string& get_upload_id() const = 0;
+ /** Get the owner of this upload */
+ virtual const ACLOwner& get_owner() const = 0;
+ /** Get the modification time of this upload */
+ virtual ceph::real_time& get_mtime() = 0;
+
+ /** Get all the cached parts that make up this upload, keyed by part number */
+ virtual std::map<uint32_t, std::unique_ptr<MultipartPart>>& get_parts() = 0;
+
+ /** Get the trace context of this upload */
+ virtual const jspan_context& get_trace() = 0;
+
+ /** Get the Object that represents this upload */
+ virtual std::unique_ptr<rgw::sal::Object> get_meta_obj() = 0;
+
+ /** Initialize this upload */
+ virtual int init(const DoutPrefixProvider* dpp, optional_yield y, ACLOwner& owner, rgw_placement_rule& dest_placement, rgw::sal::Attrs& attrs) = 0;
+ /** List all the parts of this upload, filling the parts cache */
+ virtual int list_parts(const DoutPrefixProvider* dpp, CephContext* cct,
+ int num_parts, int marker,
+ int* next_marker, bool* truncated,
+ bool assume_unsorted = false) = 0;
+ /** Abort this upload */
+ virtual int abort(const DoutPrefixProvider* dpp, CephContext* cct) = 0;
+ /** Complete this upload, making it available as a normal object */
+ virtual int complete(const DoutPrefixProvider* dpp,
+ optional_yield y, CephContext* cct,
+ std::map<int, std::string>& part_etags,
+ std::list<rgw_obj_index_key>& remove_objs,
+ uint64_t& accounted_size, bool& compressed,
+ RGWCompressionInfo& cs_info, off_t& ofs,
+ std::string& tag, ACLOwner& owner,
+ uint64_t olh_epoch,
+ rgw::sal::Object* target_obj) = 0;
+
+ /** Get placement and/or attribute info for this upload */
+ virtual int get_info(const DoutPrefixProvider *dpp, optional_yield y, rgw_placement_rule** rule, rgw::sal::Attrs* attrs = nullptr) = 0;
+
+ /** Get a Writer to write to a part of this upload; the part is identified
+ * by @a part_num / @a part_num_str */
+ virtual std::unique_ptr<Writer> get_writer(const DoutPrefixProvider *dpp,
+ optional_yield y,
+ rgw::sal::Object* obj,
+ const rgw_user& owner,
+ const rgw_placement_rule *ptail_placement_rule,
+ uint64_t part_num,
+ const std::string& part_num_str) = 0;
+
+ /** Print the Upload to @a out */
+ virtual void print(std::ostream& out) const = 0;
+
+ friend inline std::ostream& operator<<(std::ostream& out, const MultipartUpload& u) {
+ u.print(out);
+ return out;
+ }
+ friend inline std::ostream& operator<<(std::ostream& out, const MultipartUpload* u) {
+ if (!u)
+ out << "<NULL>";
+ else
+ u->print(out);
+ return out;
+ }
+ friend inline std::ostream& operator<<(std::ostream& out, const
+ std::unique_ptr<MultipartUpload>& p) {
+ out << p.get();
+ return out;
+ }
+};
+
+/**
+ * @brief Interface of a lock/serialization
+ */
+class Serializer {
+public:
+ Serializer() = default;
+ virtual ~Serializer() = default;
+
+ /** Try to take the lock for the given amount of time.
+ * NOTE(review): callers appear to treat 0 as success and a negative
+ * errno-style value as failure — confirm against each backend. */
+ virtual int try_lock(const DoutPrefixProvider *dpp, utime_t dur, optional_yield y) = 0;
+ /** Unlock the lock */
+ virtual int unlock() = 0;
+
+ /** Print the Serializer to @a out */
+ virtual void print(std::ostream& out) const = 0;
+
+ /** Stream a Serializer by delegating to its virtual print() */
+ friend inline std::ostream& operator<<(std::ostream& out, const Serializer& s) {
+ s.print(out);
+ return out;
+ }
+ /** Stream a Serializer pointer; a null pointer prints "<NULL>" */
+ friend inline std::ostream& operator<<(std::ostream& out, const Serializer* s) {
+ if (!s)
+ out << "<NULL>";
+ else
+ s->print(out);
+ return out;
+ }
+};
+
+/** @brief Abstraction of a serializer for multipart uploads
+ */
+class MPSerializer : public Serializer {
+public:
+ MPSerializer() = default;
+ virtual ~MPSerializer() = default;
+
+ /** Clear the locally cached "locked" state.
+ * NOTE(review): nothing here suggests this releases the backend lock —
+ * presumably it only resets the flag; verify implementations. */
+ virtual void clear_locked() = 0;
+ /** Check to see if locked */
+ virtual bool is_locked() = 0;
+};
+
+/** @brief Abstraction of a serializer for Lifecycle
+ */
+class LCSerializer : public Serializer {
+public:
+ // Adds no interface over Serializer; exists as a distinct type so
+ // lifecycle code can require a lifecycle-specific lock (see
+ // Lifecycle::get_serializer()).
+ LCSerializer() {}
+ virtual ~LCSerializer() = default;
+};
+
+/**
+ * @brief Abstraction for lifecycle processing
+ *
+ * Lifecycle processing loops over the objects in a bucket, applying per-bucket policy
+ * to each object. Examples of policy can be deleting after a certain amount of time,
+ * deleting extra versions, changing the storage class, and so on.
+ */
+class Lifecycle {
+public:
+ /** Head of a lifecycle run. Used for tracking parallel lifecycle runs. */
+ struct LCHead {
+ LCHead() = default;
+ virtual ~LCHead() = default;
+
+ // Accessors for one shard's run state: when the run started, the
+ // resume marker, and when the shard last rolled over.
+ virtual time_t& get_start_date() = 0;
+ virtual void set_start_date(time_t) = 0;
+ virtual std::string& get_marker() = 0;
+ virtual void set_marker(const std::string&) = 0;
+ virtual time_t& get_shard_rollover_date() = 0;
+ virtual void set_shard_rollover_date(time_t) = 0;
+ };
+
+ /** Single entry in a lifecycle run. Multiple entries can exist processing different
+ * buckets. */
+ struct LCEntry {
+ LCEntry() = default;
+ virtual ~LCEntry() = default;
+
+ // Accessors for the bucket being processed, the backing object id,
+ // the entry's start time, and its status code.
+ virtual std::string& get_bucket() = 0;
+ virtual void set_bucket(const std::string&) = 0;
+ virtual std::string& get_oid() = 0;
+ virtual void set_oid(const std::string&) = 0;
+ virtual uint64_t get_start_time() = 0;
+ virtual void set_start_time(uint64_t) = 0;
+ virtual uint32_t get_status() = 0;
+ virtual void set_status(uint32_t) = 0;
+
+ /** Print the entry to @a out */
+ virtual void print(std::ostream& out) const = 0;
+
+ /** Stream an entry by delegating to its virtual print() */
+ friend inline std::ostream& operator<<(std::ostream& out, const LCEntry& e) {
+ e.print(out);
+ return out;
+ }
+ /** Stream an entry pointer; a null pointer prints "<NULL>" */
+ friend inline std::ostream& operator<<(std::ostream& out, const LCEntry* e) {
+ if (!e)
+ out << "<NULL>";
+ else
+ e->print(out);
+ return out;
+ }
+ friend inline std::ostream& operator<<(std::ostream& out, const std::unique_ptr<LCEntry>& p) {
+ out << p.get();
+ return out;
+ }
+ };
+
+ Lifecycle() = default;
+ virtual ~Lifecycle() = default;
+
+ /** Get an empty entry */
+ virtual std::unique_ptr<LCEntry> get_entry() = 0;
+ /** Get an entry matching the given marker */
+ virtual int get_entry(const std::string& oid, const std::string& marker, std::unique_ptr<LCEntry>* entry) = 0;
+ /** Get the entry following the given marker */
+ virtual int get_next_entry(const std::string& oid, const std::string& marker, std::unique_ptr<LCEntry>* entry) = 0;
+ /** Store a modified entry in the backing store */
+ virtual int set_entry(const std::string& oid, LCEntry& entry) = 0;
+ /** List all known entries */
+ virtual int list_entries(const std::string& oid, const std::string& marker,
+ uint32_t max_entries,
+ std::vector<std::unique_ptr<LCEntry>>& entries) = 0;
+ /** Remove an entry from the backing store */
+ virtual int rm_entry(const std::string& oid, LCEntry& entry) = 0;
+ /** Get a head */
+ virtual int get_head(const std::string& oid, std::unique_ptr<LCHead>* head) = 0;
+ /** Store a modified head to the backing store */
+ virtual int put_head(const std::string& oid, LCHead& head) = 0;
+
+ /** Get a serializer for lifecycle */
+ virtual std::unique_ptr<LCSerializer> get_serializer(const std::string& lock_name,
+ const std::string& oid,
+ const std::string& cookie) = 0;
+};
+
+/**
+ * @brief Abstraction for a Notification event
+ *
+ * RGW can generate notifications for various events, such as object creation or
+ * deletion.
+ */
+class Notification {
+public:
+ // Removed a dead `protected:` specifier that was immediately overridden
+ // by `public:` and guarded no members.
+ Notification() {}
+
+ virtual ~Notification() = default;
+
+ /** Indicate the start of the event associated with this notification */
+ virtual int publish_reserve(const DoutPrefixProvider *dpp, RGWObjTags* obj_tags = nullptr) = 0;
+ /** Indicate the successful completion of the event associated with this notification */
+ virtual int publish_commit(const DoutPrefixProvider* dpp, uint64_t size,
+ const ceph::real_time& mtime, const std::string& etag, const std::string& version) = 0;
+};
+
+/**
+ * @brief Abstraction for an asynchronous writer
+ *
+ * Writing is done through a set of filters. This allows chaining filters to do things
+ * like compression and encryption on async writes. This is the base abstraction for
+ * those filters.
+ */
+class Writer : public ObjectProcessor {
+public:
+ Writer() {}
+ virtual ~Writer() = default;
+
+ /** prepare to start processing object data */
+ virtual int prepare(optional_yield y) = 0;
+
+ /**
+ * Process a buffer. Called multiple times to write different buffers.
+ * data.length() == 0 indicates the last call and may be used to flush
+ * the data buffers.
+ */
+ virtual int process(bufferlist&& data, uint64_t offset) = 0;
+
+ /** complete the operation and make its result visible to clients.
+ * NOTE(review): if_match/if_nomatch look like conditional-write ETag
+ * checks and *canceled like a raced-write flag — confirm against the
+ * concrete store implementations. */
+ virtual int complete(size_t accounted_size, const std::string& etag,
+ ceph::real_time *mtime, ceph::real_time set_mtime,
+ std::map<std::string, bufferlist>& attrs,
+ ceph::real_time delete_at,
+ const char *if_match, const char *if_nomatch,
+ const std::string *user_data,
+ rgw_zone_set *zones_trace, bool *canceled,
+ optional_yield y) = 0;
+};
+
+
+/**
+ * @brief Abstraction of a placement tier
+ *
+ * This abstraction allows access to information about placement tiers,
+ * including storage class.
+ */
+class PlacementTier {
+public:
+ virtual ~PlacementTier() = default;
+
+ /** Get the type of this tier */
+ virtual const std::string& get_tier_type() = 0;
+ /** Get the storage class of this tier */
+ virtual const std::string& get_storage_class() = 0;
+ /** Should we retain the head object when transitioning */
+ virtual bool retain_head_object() = 0;
+ // NOTE(review): a dangling comment "Get the placement rule associated
+ // with this tier" followed here with no declaration; the accessor was
+ // apparently removed. Dropped the orphaned comment.
+};
+
+/**
+ * @brief Abstraction of a zone group
+ *
+ * This class allows access to information about a zonegroup. It may be the
+ * group containing the current zone, or another group.
+ */
+class ZoneGroup {
+public:
+ virtual ~ZoneGroup() = default;
+ /** Get the ID of this zonegroup */
+ virtual const std::string& get_id() const = 0;
+ /** Get the name of this zonegroup */
+ virtual const std::string& get_name() const = 0;
+ /** Determine if two zonegroups are the same */
+ virtual int equals(const std::string& other_zonegroup) const = 0;
+ /** Get the endpoint from zonegroup, or from master zone if not set */
+ virtual const std::string& get_endpoint() const = 0;
+ /** Check if a placement target (by name) exists in this zonegroup */
+ virtual bool placement_target_exists(std::string& target) const = 0;
+ /** Check if this is the master zonegroup */
+ virtual bool is_master_zonegroup() const = 0;
+ /** Get the API name of this zonegroup */
+ virtual const std::string& get_api_name() const = 0;
+ /** Get the list of placement target names for this zonegroup */
+ virtual void get_placement_target_names(std::set<std::string>& names) const = 0;
+ /** Get the name of the default placement target for this zonegroup */
+ virtual const std::string& get_default_placement_name() const = 0;
+ /** Get the list of hostnames from this zonegroup */
+ virtual int get_hostnames(std::list<std::string>& names) const = 0;
+ /** Get the list of hostnames that host s3 websites from this zonegroup */
+ virtual int get_s3website_hostnames(std::list<std::string>& names) const = 0;
+ /** Get the number of zones in this zonegroup */
+ virtual int get_zone_count() const = 0;
+ /** Get the placement tier associated with the rule */
+ virtual int get_placement_tier(const rgw_placement_rule& rule, std::unique_ptr<PlacementTier>* tier) = 0;
+ /** Get a zone by ID */
+ virtual int get_zone_by_id(const std::string& id, std::unique_ptr<Zone>* zone) = 0;
+ /** Get a zone by Name */
+ virtual int get_zone_by_name(const std::string& name, std::unique_ptr<Zone>* zone) = 0;
+ /** List zones in zone group by ID */
+ virtual int list_zones(std::list<std::string>& zone_ids) = 0;
+ /// Return true if the given feature is enabled in the zonegroup.
+ virtual bool supports(std::string_view feature) const = 0;
+ /** Clone a copy of this zonegroup. */
+ virtual std::unique_ptr<ZoneGroup> clone() = 0;
+};
+
+/**
+ * @brief Abstraction of a Zone
+ *
+ * This abstraction allows access to information about zones. This can be the zone
+ * containing the RGW, or another zone.
+ */
+class Zone {
+ public:
+ virtual ~Zone() = default;
+
+ /** Clone a copy of this zone. */
+ virtual std::unique_ptr<Zone> clone() = 0;
+ /** Get info about the zonegroup containing this zone */
+ virtual ZoneGroup& get_zonegroup() = 0;
+ /** Get the ID of this zone */
+ virtual const std::string& get_id() = 0;
+ /** Get the name of this zone */
+ virtual const std::string& get_name() const = 0;
+ /** True if this zone is writable */
+ virtual bool is_writeable() = 0;
+ /** Get the URL for the endpoint for redirecting to this zone */
+ virtual bool get_redirect_endpoint(std::string* endpoint) = 0;
+ /** Check to see if the given API is supported in this zone */
+ virtual bool has_zonegroup_api(const std::string& api) const = 0;
+ /** Get the current period ID for this zone */
+ virtual const std::string& get_current_period_id() = 0;
+ /** Get the system access key for this zone */
+ virtual const RGWAccessKey& get_system_key() = 0;
+ /** Get the name of the realm containing this zone */
+ virtual const std::string& get_realm_name() = 0;
+ /** Get the ID of the realm containing this zone */
+ virtual const std::string& get_realm_id() = 0;
+ /** Get the tier type for the zone */
+ virtual const std::string_view get_tier_type() = 0;
+ /** Get a handler for zone sync policy. */
+ virtual RGWBucketSyncPolicyHandlerRef get_sync_policy_handler() = 0;
+};
+
+/**
+ * @brief Abstraction of a manager for Lua scripts and packages
+ *
+ * RGW can load and process Lua scripts. This will handle loading/storing scripts; adding, deleting, and listing packages
+ */
+class LuaManager {
+public:
+ virtual ~LuaManager() = default;
+
+ // All operations return 0 on success; scripts and packages are
+ // addressed by opaque string keys/names chosen by the caller.
+ /** Get a script named with the given key from the backing store */
+ virtual int get_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, std::string& script) = 0;
+ /** Put a script named with the given key to the backing store */
+ virtual int put_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, const std::string& script) = 0;
+ /** Delete a script named with the given key from the backing store */
+ virtual int del_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key) = 0;
+ /** Add a lua package */
+ virtual int add_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name) = 0;
+ /** Remove a lua package */
+ virtual int remove_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name) = 0;
+ /** List lua packages */
+ virtual int list_packages(const DoutPrefixProvider* dpp, optional_yield y, rgw::lua::packages_t& packages) = 0;
+};
+
+/** @} namespace rgw::sal in group RGWSAL */
+} } // namespace rgw::sal
+
+/**
+ * @brief A manager for Drivers
+ *
+ * This will manage the singleton instances of the various drivers. Drivers come in two
+ * varieties: Full and Raw. A full driver is suitable for use in a radosgw daemon. It
+ * has full access to the cluster, if any. A raw driver is a stripped down driver, used
+ * for admin commands.
+ */
+class DriverManager {
+public:
+ /** Selection of which backing store and optional filter to build. */
+ struct Config {
+ /** Name of store to create */
+ std::string store_name;
+ /** Name of filter to create or "none" */
+ std::string filter_name;
+ };
+
+ DriverManager() {}
+ /** Get a full driver by service name.
+ * Thin convenience wrapper: all construction happens in
+ * init_storage_provider(); no state is kept in this class. */
+ static rgw::sal::Driver* get_storage(const DoutPrefixProvider* dpp,
+ CephContext* cct,
+ const Config& cfg,
+ bool use_gc_thread,
+ bool use_lc_thread,
+ bool quota_threads,
+ bool run_sync_thread,
+ bool run_reshard_thread,
+ bool use_cache = true,
+ bool use_gc = true) {
+ return init_storage_provider(dpp, cct, cfg, use_gc_thread, use_lc_thread,
+ quota_threads, run_sync_thread,
+ run_reshard_thread, use_cache, use_gc);
+ }
+ /** Get a stripped down driver by service name (admin tooling). */
+ static rgw::sal::Driver* get_raw_storage(const DoutPrefixProvider* dpp,
+ CephContext* cct, const Config& cfg) {
+ return init_raw_storage_provider(dpp, cct, cfg);
+ }
+ /** Initialize a new full Driver */
+ static rgw::sal::Driver* init_storage_provider(const DoutPrefixProvider* dpp,
+ CephContext* cct,
+ const Config& cfg,
+ bool use_gc_thread,
+ bool use_lc_thread,
+ bool quota_threads,
+ bool run_sync_thread,
+ bool run_reshard_thread,
+ bool use_metadata_cache,
+ bool use_gc);
+ /** Initialize a new raw Driver */
+ static rgw::sal::Driver* init_raw_storage_provider(const DoutPrefixProvider* dpp,
+ CephContext* cct,
+ const Config& cfg);
+ /** Close a Driver when it's no longer needed */
+ static void close_storage(rgw::sal::Driver* driver);
+
+ /** Get the config for Drivers */
+ static Config get_config(bool admin, CephContext* cct);
+
+ /** Create a ConfigStore */
+ static auto create_config_store(const DoutPrefixProvider* dpp,
+ std::string_view type)
+ -> std::unique_ptr<rgw::sal::ConfigStore>;
+
+};
+
+/** @} */
diff --git a/src/rgw/rgw_sal_config.h b/src/rgw/rgw_sal_config.h
new file mode 100644
index 000000000..705094022
--- /dev/null
+++ b/src/rgw/rgw_sal_config.h
@@ -0,0 +1,301 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <memory>
+#include <optional>
+#include <span>
+#include <string>
+#include "rgw_sal_fwd.h"
+
+class DoutPrefixProvider;
+class optional_yield;
+struct RGWPeriod;
+struct RGWPeriodConfig;
+struct RGWRealm;
+struct RGWZoneGroup;
+struct RGWZoneParams;
+
+namespace rgw::sal {
+
+/// Results of a listing operation
+/// Results of a listing operation
+template <typename T>
+struct ListResult {
+ /// The subspan of the input entries that contain results
+ /// (views storage owned by the caller-provided buffer)
+ std::span<T> entries;
+ /// The next marker to resume listing, or empty
+ std::string next;
+};
+
+/// Storage abstraction for realm/zonegroup/zone configuration
+class ConfigStore {
+ public:
+ virtual ~ConfigStore() {}
+
+ /// @group Realm
+ ///@{
+
+ /// Set the cluster-wide default realm id
+ virtual int write_default_realm_id(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ std::string_view realm_id) = 0;
+ /// Read the cluster's default realm id
+ virtual int read_default_realm_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string& realm_id) = 0;
+ /// Delete the cluster's default realm id
+ virtual int delete_default_realm_id(const DoutPrefixProvider* dpp,
+ optional_yield y) = 0;
+
+ /// Create a realm
+ virtual int create_realm(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ const RGWRealm& info,
+ std::unique_ptr<RealmWriter>* writer) = 0;
+ /// Read a realm by id
+ virtual int read_realm_by_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id,
+ RGWRealm& info,
+ std::unique_ptr<RealmWriter>* writer) = 0;
+ /// Read a realm by name
+ virtual int read_realm_by_name(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_name,
+ RGWRealm& info,
+ std::unique_ptr<RealmWriter>* writer) = 0;
+ /// Read the cluster's default realm
+ virtual int read_default_realm(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ RGWRealm& info,
+ std::unique_ptr<RealmWriter>* writer) = 0;
+ /// Look up a realm id by its name
+ virtual int read_realm_id(const DoutPrefixProvider* dpp,
+ optional_yield y, std::string_view realm_name,
+ std::string& realm_id) = 0;
+ /// Notify the cluster of a new period, so radosgws can reload with the new
+ /// configuration
+ virtual int realm_notify_new_period(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ const RGWPeriod& period) = 0;
+ /// List up to 'entries.size()' realm names starting from the given marker
+ virtual int list_realm_names(const DoutPrefixProvider* dpp,
+ optional_yield y, const std::string& marker,
+ std::span<std::string> entries,
+ ListResult<std::string>& result) = 0;
+ ///@}
+
+ /// @group Period
+ ///@{
+
+ /// Write a period and advance its latest epoch
+ virtual int create_period(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ const RGWPeriod& info) = 0;
+ /// Read a period by id and epoch. If no epoch is given, read the latest
+ virtual int read_period(const DoutPrefixProvider* dpp,
+ optional_yield y, std::string_view period_id,
+ std::optional<uint32_t> epoch, RGWPeriod& info) = 0;
+ /// Delete all period epochs with the given period id
+ virtual int delete_period(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view period_id) = 0;
+ /// List up to 'entries.size()' period ids starting from the given marker
+ virtual int list_period_ids(const DoutPrefixProvider* dpp,
+ optional_yield y, const std::string& marker,
+ std::span<std::string> entries,
+ ListResult<std::string>& result) = 0;
+ ///@}
+
+ /// @group ZoneGroup
+ ///@{
+
+ /// Set the cluster-wide default zonegroup id
+ virtual int write_default_zonegroup_id(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ std::string_view realm_id,
+ std::string_view zonegroup_id) = 0;
+ /// Read the cluster's default zonegroup id
+ virtual int read_default_zonegroup_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id,
+ std::string& zonegroup_id) = 0;
+ /// Delete the cluster's default zonegroup id
+ virtual int delete_default_zonegroup_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id) = 0;
+
+ /// Create a zonegroup
+ virtual int create_zonegroup(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ const RGWZoneGroup& info,
+ std::unique_ptr<ZoneGroupWriter>* writer) = 0;
+ /// Read a zonegroup by id
+ virtual int read_zonegroup_by_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view zonegroup_id,
+ RGWZoneGroup& info,
+ std::unique_ptr<ZoneGroupWriter>* writer) = 0;
+ /// Read a zonegroup by name
+ virtual int read_zonegroup_by_name(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view zonegroup_name,
+ RGWZoneGroup& info,
+ std::unique_ptr<ZoneGroupWriter>* writer) = 0;
+ /// Read the cluster's default zonegroup
+ virtual int read_default_zonegroup(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id,
+ RGWZoneGroup& info,
+ std::unique_ptr<ZoneGroupWriter>* writer) = 0;
+ /// List up to 'entries.size()' zonegroup names starting from the given marker
+ virtual int list_zonegroup_names(const DoutPrefixProvider* dpp,
+ optional_yield y, const std::string& marker,
+ std::span<std::string> entries,
+ ListResult<std::string>& result) = 0;
+ ///@}
+
+ /// @group Zone
+ ///@{
+
+ /// Set the realm-wide default zone id
+ virtual int write_default_zone_id(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ std::string_view realm_id,
+ std::string_view zone_id) = 0;
+ /// Read the realm's default zone id
+ virtual int read_default_zone_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id,
+ std::string& zone_id) = 0;
+ /// Delete the realm's default zone id
+ virtual int delete_default_zone_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id) = 0;
+
+ /// Create a zone
+ virtual int create_zone(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ const RGWZoneParams& info,
+ std::unique_ptr<ZoneWriter>* writer) = 0;
+ /// Read a zone by id
+ virtual int read_zone_by_id(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view zone_id,
+ RGWZoneParams& info,
+ std::unique_ptr<ZoneWriter>* writer) = 0;
+ /// Read a zone by name
+ /// (NOTE(review): the previous comment promised by-id-or-name fallback
+ /// semantics that this signature cannot deliver; it only takes a name)
+ virtual int read_zone_by_name(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view zone_name,
+ RGWZoneParams& info,
+ std::unique_ptr<ZoneWriter>* writer) = 0;
+ /// Read the realm's default zone
+ virtual int read_default_zone(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id,
+ RGWZoneParams& info,
+ std::unique_ptr<ZoneWriter>* writer) = 0;
+ /// List up to 'entries.size()' zone names starting from the given marker
+ virtual int list_zone_names(const DoutPrefixProvider* dpp,
+ optional_yield y, const std::string& marker,
+ std::span<std::string> entries,
+ ListResult<std::string>& result) = 0;
+ ///@}
+
+ /// @group PeriodConfig
+ ///@{
+
+ /// Read period config object
+ virtual int read_period_config(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ std::string_view realm_id,
+ RGWPeriodConfig& info) = 0;
+ /// Write period config object
+ virtual int write_period_config(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive,
+ std::string_view realm_id,
+ const RGWPeriodConfig& info) = 0;
+ ///@}
+
+}; // ConfigStore
+
+
+/// A handle to manage the atomic updates of an existing realm object. This
+/// is initialized on read, and any subsequent writes through this handle will
+/// fail with -ECANCELED if another writer updates the object in the meantime.
+class RealmWriter {
+ public:
+ virtual ~RealmWriter() {}
+
+ // Handles are produced by ConfigStore::create_realm()/read_realm_*().
+ /// Overwrite an existing realm. Must not change id or name
+ virtual int write(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ const RGWRealm& info) = 0;
+ /// Rename an existing realm. Must not change id
+ virtual int rename(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ RGWRealm& info,
+ std::string_view new_name) = 0;
+ /// Delete an existing realm
+ virtual int remove(const DoutPrefixProvider* dpp,
+ optional_yield y) = 0;
+};
+
+/// A handle to manage the atomic updates of an existing zonegroup object. This
+/// is initialized on read, and any subsequent writes through this handle will
+/// fail with -ECANCELED if another writer updates the object in the meantime.
+class ZoneGroupWriter {
+ public:
+ virtual ~ZoneGroupWriter() {}
+
+ // Handles are produced by ConfigStore::create_zonegroup()/read_zonegroup_*().
+ /// Overwrite an existing zonegroup. Must not change id or name
+ virtual int write(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ const RGWZoneGroup& info) = 0;
+ /// Rename an existing zonegroup. Must not change id
+ virtual int rename(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ RGWZoneGroup& info,
+ std::string_view new_name) = 0;
+ /// Delete an existing zonegroup
+ virtual int remove(const DoutPrefixProvider* dpp,
+ optional_yield y) = 0;
+};
+
+/// A handle to manage the atomic updates of an existing zone object. This
+/// is initialized on read, and any subsequent writes through this handle will
+/// fail with -ECANCELED if another writer updates the object in the meantime.
+class ZoneWriter {
+ public:
+ virtual ~ZoneWriter() {}
+
+ // Handles are produced by ConfigStore::create_zone()/read_zone_*().
+ /// Overwrite an existing zone. Must not change id or name
+ virtual int write(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ const RGWZoneParams& info) = 0;
+ /// Rename an existing zone. Must not change id
+ virtual int rename(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ RGWZoneParams& info,
+ std::string_view new_name) = 0;
+ /// Delete an existing zone
+ virtual int remove(const DoutPrefixProvider* dpp,
+ optional_yield y) = 0;
+};
+
+} // namespace rgw::sal
diff --git a/src/rgw/rgw_sal_daos.cc b/src/rgw/rgw_sal_daos.cc
new file mode 100644
index 000000000..4b0234b1f
--- /dev/null
+++ b/src/rgw/rgw_sal_daos.cc
@@ -0,0 +1,2473 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=2 sw=2 expandtab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * SAL implementation for the CORTX DAOS backend
+ *
+ * Copyright (C) 2022 Seagate Technology LLC and/or its Affiliates
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "rgw_sal_daos.h"
+
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <filesystem>
+#include <system_error>
+
+#include "common/Clock.h"
+#include "common/errno.h"
+#include "rgw_bucket.h"
+#include "rgw_compression.h"
+#include "rgw_sal.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using std::list;
+using std::map;
+using std::set;
+using std::string;
+using std::vector;
+
+namespace fs = std::filesystem;
+
+namespace rgw::sal {
+
+using ::ceph::decode;
+using ::ceph::encode;
+
+/** List up to @a max of this user's buckets starting at @a marker.
+ * Decodes each ds3 record into a DaosBucket and appends it to @a buckets. */
+int DaosUser::list_buckets(const DoutPrefixProvider* dpp, const string& marker,
+                           const string& end_marker, uint64_t max,
+                           bool need_stats, BucketList& buckets,
+                           optional_yield y) {
+  ldpp_dout(dpp, 20) << "DEBUG: list_user_buckets: marker=" << marker
+                     << " end_marker=" << end_marker << " max=" << max << dendl;
+  int ret = 0;
+  bool is_truncated = false;
+  buckets.clear();
+  // One result slot plus one encode buffer per slot; ds3_bucket_list()
+  // writes encoded records in place and shrinks bcount to the actual count.
+  vector<struct ds3_bucket_info> bucket_infos(max);
+  daos_size_t bcount = bucket_infos.size();
+  vector<vector<uint8_t>> values(bcount, vector<uint8_t>(DS3_MAX_ENCODED_LEN));
+  for (daos_size_t i = 0; i < bcount; i++) {
+    bucket_infos[i].encoded = values[i].data();
+    bucket_infos[i].encoded_length = values[i].size();
+  }
+
+  char daos_marker[DS3_MAX_BUCKET_NAME];
+  // BUGFIX: strncpy() does not NUL-terminate when the source is as long as
+  // the buffer; copy one byte less and terminate explicitly so an over-long
+  // marker cannot cause ds3_bucket_list() to read past the array.
+  std::strncpy(daos_marker, marker.c_str(), sizeof(daos_marker) - 1);
+  daos_marker[sizeof(daos_marker) - 1] = '\0';
+  ret = ds3_bucket_list(&bcount, bucket_infos.data(), daos_marker,
+                        &is_truncated, store->ds3, nullptr);
+  ldpp_dout(dpp, 20) << "DEBUG: ds3_bucket_list: bcount=" << bcount
+                     << " ret=" << ret << dendl;
+  if (ret != 0) {
+    ldpp_dout(dpp, 0) << "ERROR: ds3_bucket_list failed!" << ret << dendl;
+    return ret;
+  }
+
+  // Trim both vectors to the entries actually returned. Shrinking `values`
+  // only destroys trailing buffers, so retained encoded pointers stay valid.
+  bucket_infos.resize(bcount);
+  values.resize(bcount);
+
+  for (const auto& bi : bucket_infos) {
+    DaosBucketInfo dbinfo;
+    bufferlist bl;
+    bl.append(reinterpret_cast<char*>(bi.encoded), bi.encoded_length);
+    auto iter = bl.cbegin();
+    dbinfo.decode(iter);
+    buckets.add(std::make_unique<DaosBucket>(this->store, dbinfo.info, this));
+  }
+
+  buckets.set_truncated(is_truncated);
+  return 0;
+}
+
+// Create (or look up) bucket `b` for this user. Sets *existed, fills
+// *bucket_out with a handle in either case. Returns 0 on success (including
+// when the bucket already existed), negative error otherwise.
+// NOTE(review): pquota_info and policy are currently unused (ACL handling is
+// the commented-out TODO below) — confirm this is intentional.
+int DaosUser::create_bucket(
+ const DoutPrefixProvider* dpp, const rgw_bucket& b,
+ const std::string& zonegroup_id, rgw_placement_rule& placement_rule,
+ std::string& swift_ver_location, const RGWQuotaInfo* pquota_info,
+ const RGWAccessControlPolicy& policy, Attrs& attrs, RGWBucketInfo& info,
+ obj_version& ep_objv, bool exclusive, bool obj_lock_enabled, bool* existed,
+ req_info& req_info, std::unique_ptr<Bucket>* bucket_out, optional_yield y) {
+ ldpp_dout(dpp, 20) << "DEBUG: create_bucket:" << b.name << dendl;
+ int ret;
+ std::unique_ptr<Bucket> bucket;
+
+ // Look up the bucket. Create it if it doesn't exist.
+ ret = this->store->get_bucket(dpp, this, b, &bucket, y);
+ if (ret != 0 && ret != -ENOENT) {
+ return ret;
+ }
+
+ if (ret != -ENOENT) {
+ // Bucket already exists: inherit versioning location and placement
+ // from the stored bucket rather than the caller's values.
+ *existed = true;
+ if (swift_ver_location.empty()) {
+ swift_ver_location = bucket->get_info().swift_ver_location;
+ }
+ placement_rule.inherit_from(bucket->get_info().placement_rule);
+
+ // TODO: ACL policy
+ // // don't allow changes to the acl policy
+ // RGWAccessControlPolicy old_policy(ctx());
+ // int rc = rgw_op_get_bucket_policy_from_attr(
+ // dpp, this, u, bucket->get_attrs(), &old_policy, y);
+ // if (rc >= 0 && old_policy != policy) {
+ // bucket_out->swap(bucket);
+ // return -EEXIST;
+ //}
+ } else {
+ placement_rule.name = "default";
+ placement_rule.storage_class = "STANDARD";
+ bucket = std::make_unique<DaosBucket>(store, b, this);
+ bucket->set_attrs(attrs);
+
+ *existed = false;
+ }
+
+ // TODO: how to handle zone and multi-site.
+
+ if (!*existed) {
+ // Fill in the caller's info struct and persist a brand-new bucket.
+ info.placement_rule = placement_rule;
+ info.bucket = b;
+ info.owner = this->get_info().user_id;
+ info.zonegroup = zonegroup_id;
+ info.creation_time = ceph::real_clock::now();
+ if (obj_lock_enabled)
+ info.flags = BUCKET_VERSIONED | BUCKET_OBJ_LOCK_ENABLED;
+ bucket->set_version(ep_objv);
+ bucket->get_info() = info;
+
+ // Create a new bucket:
+ DaosBucket* daos_bucket = static_cast<DaosBucket*>(bucket.get());
+ bufferlist bl;
+ std::unique_ptr<struct ds3_bucket_info> bucket_info =
+ daos_bucket->get_encoded_info(bl, ceph::real_time());
+ ret = ds3_bucket_create(bucket->get_name().c_str(), bucket_info.get(),
+ nullptr, store->ds3, nullptr);
+ if (ret != 0) {
+ ldpp_dout(dpp, 0) << "ERROR: ds3_bucket_create failed! ret=" << ret
+ << dendl;
+ return ret;
+ }
+ } else {
+ // NOTE(review): this overwrites the info we just fetched from the store
+ // with the caller-provided `info` — verify callers always pass a fully
+ // populated RGWBucketInfo in the already-exists path.
+ bucket->set_version(ep_objv);
+ bucket->get_info() = info;
+ }
+
+ bucket_out->swap(bucket);
+
+ return ret;
+}
+
+// Stub: reading user attributes is not implemented for the DAOS backend;
+// logs and returns the backend's not-implemented error code.
+int DaosUser::read_attrs(const DoutPrefixProvider* dpp, optional_yield y) {
+ return DAOS_NOT_IMPLEMENTED_LOG(dpp);
+}
+
+// Stub: user storage stats are not implemented for the DAOS backend.
+int DaosUser::read_stats(const DoutPrefixProvider* dpp, optional_yield y,
+ RGWStorageStats* stats,
+ ceph::real_time* last_stats_sync,
+ ceph::real_time* last_stats_update) {
+ return DAOS_NOT_IMPLEMENTED_LOG(dpp);
+}
+
+/* stats - Not for first pass */
+// Stub: async stats read is not implemented for the DAOS backend.
+int DaosUser::read_stats_async(const DoutPrefixProvider* dpp,
+ RGWGetUserStats_CB* cb) {
+ return DAOS_NOT_IMPLEMENTED_LOG(dpp);
+}
+
+// Stub: flushing user stats is not implemented for the DAOS backend.
+int DaosUser::complete_flush_stats(const DoutPrefixProvider* dpp,
+ optional_yield y) {
+ return DAOS_NOT_IMPLEMENTED_LOG(dpp);
+}
+
+// Stub: usage-log reads are not implemented for the DAOS backend.
+int DaosUser::read_usage(const DoutPrefixProvider* dpp, uint64_t start_epoch,
+ uint64_t end_epoch, uint32_t max_entries,
+ bool* is_truncated, RGWUsageIter& usage_iter,
+ map<rgw_user_bucket, rgw_usage_log_entry>& usage) {
+ return DAOS_NOT_IMPLEMENTED_LOG(dpp);
+}
+
+// Stub: usage-log trimming is not implemented for the DAOS backend.
+int DaosUser::trim_usage(const DoutPrefixProvider* dpp, uint64_t start_epoch,
+ uint64_t end_epoch) {
+ return DAOS_NOT_IMPLEMENTED_LOG(dpp);
+}
+
+/** Load this user's record from the DAOS store into the in-memory fields
+ * (info, attrs, objv_tracker). Returns 0 on success, negative on error. */
+int DaosUser::load_user(const DoutPrefixProvider* dpp, optional_yield y) {
+  const string user_name = info.user_id.to_str();
+  ldpp_dout(dpp, 20) << "DEBUG: load_user, name=" << user_name << dendl;
+
+  DaosUserInfo fetched;
+  if (const int ret = read_user(dpp, user_name, &fetched); ret != 0) {
+    ldpp_dout(dpp, 0) << "ERROR: load_user failed, name=" << user_name << dendl;
+    return ret;
+  }
+
+  // Publish the freshly fetched record into our cached state.
+  info = fetched.info;
+  attrs = fetched.attrs;
+  objv_tracker.read_version = fetched.user_version;
+  return 0;
+}
+
+/** Overlay @a new_attrs on the cached attribute map (new values win) and
+ * persist the merged user record. Returns store_user()'s result. */
+int DaosUser::merge_and_store_attrs(const DoutPrefixProvider* dpp,
+                                    Attrs& new_attrs, optional_yield y) {
+  ldpp_dout(dpp, 20) << "DEBUG: merge_and_store_attrs, new_attrs=" << new_attrs
+                     << dendl;
+  for (const auto& [key, value] : new_attrs) {
+    attrs[key] = value;
+  }
+  // Non-exclusive write: overwrite whatever is stored.
+  return store_user(dpp, y, false);
+}
+
+/** Persist this user's record, guarding against concurrent updates via the
+ * stored object version. If the user exists: returns -ECANCELED on a version
+ * mismatch, returns early when @a exclusive is set, and passes the previous
+ * record to ds3_user_set() so stale access-key mappings can be replaced.
+ * On success, fills *old_info (if non-null) with the prior record. */
+int DaosUser::store_user(const DoutPrefixProvider* dpp, optional_yield y,
+                         bool exclusive, RGWUserInfo* old_info) {
+  const string name = info.user_id.to_str();
+  ldpp_dout(dpp, 10) << "DEBUG: Store_user(): User name=" << name << dendl;
+
+  // Read the currently stored record (if any).
+  int ret = 0;
+  struct DaosUserInfo duinfo;
+  ret = read_user(dpp, name, &duinfo);
+  obj_version obj_ver = duinfo.user_version;
+  std::unique_ptr<struct ds3_user_info> old_user_info;
+  std::vector<const char*> old_access_ids;
+  // BUGFIX: keep the old user name alive until ds3_user_set() runs. The
+  // previous code did `.name = duinfo.info.user_id.to_str().c_str()`, taking
+  // c_str() of a temporary string — old_user_info->name dangled.
+  std::string old_name;
+
+  // Check if the user already exists
+  if (ret == 0 && obj_ver.ver) {
+    // already exists.
+
+    if (old_info) {
+      *old_info = duinfo.info;
+    }
+
+    if (objv_tracker.read_version.ver != obj_ver.ver) {
+      // Object version mismatch.. return ECANCELED
+      ret = -ECANCELED;
+      ldpp_dout(dpp, 0) << "User Read version mismatch read_version="
+                        << objv_tracker.read_version.ver
+                        << " obj_ver=" << obj_ver.ver << dendl;
+      return ret;
+    }
+
+    if (exclusive) {
+      // Exclusive create requested but the user exists; nothing to do.
+      return ret;
+    }
+    obj_ver.ver++;
+
+    // Collect the existing access-key ids; the strings live in duinfo,
+    // which outlives the ds3_user_set() call below.
+    for (auto const& [id, key] : duinfo.info.access_keys) {
+      old_access_ids.push_back(id.c_str());
+    }
+    old_name = duinfo.info.user_id.to_str();
+    old_user_info.reset(
+        new ds3_user_info{.name = old_name.c_str(),
+                          .email = duinfo.info.user_email.c_str(),
+                          .access_ids = old_access_ids.data(),
+                          .access_ids_nr = old_access_ids.size()});
+  } else {
+    // Fresh user: start the version sequence.
+    obj_ver.ver = 1;
+    obj_ver.tag = "UserTAG";
+  }
+
+  bufferlist bl;
+  std::unique_ptr<struct ds3_user_info> user_info =
+      get_encoded_info(bl, obj_ver);
+
+  ret = ds3_user_set(name.c_str(), user_info.get(), old_user_info.get(),
+                     store->ds3, nullptr);
+
+  if (ret != 0) {
+    ldpp_dout(dpp, 0) << "Error: ds3_user_set failed, name=" << name
+                      << " ret=" << ret << dendl;
+  }
+
+  return ret;
+}
+
// Fetch and decode the persisted record for user `name` into *duinfo.
// Returns 0 on success or the ds3 error code on failure.
int DaosUser::read_user(const DoutPrefixProvider* dpp, std::string name,
                        DaosUserInfo* duinfo) {
  // Initialize ds3_user_info
  // Reserve a maximum-size hole in bl; ds3_user_get fills it in place.
  bufferlist bl;
  uint64_t size = DS3_MAX_ENCODED_LEN;
  struct ds3_user_info user_info = {.encoded = bl.append_hole(size).c_str(),
                                    .encoded_length = size};

  int ret = ds3_user_get(name.c_str(), &user_info, store->ds3, nullptr);

  if (ret != 0) {
    ldpp_dout(dpp, 0) << "Error: ds3_user_get failed, name=" << name
                      << " ret=" << ret << dendl;
    return ret;
  }

  // Decode
  bufferlist& blr = bl;
  auto iter = blr.cbegin();
  duinfo->decode(iter);
  return ret;
}
+
// Encode this user's info/attrs/version into `bl` and build the
// ds3_user_info handed to libds3. The returned struct borrows pointers into
// `bl` and into this object's members, so `bl` must outlive it.
std::unique_ptr<struct ds3_user_info> DaosUser::get_encoded_info(
    bufferlist& bl, obj_version& obj_ver) {
  // Encode user data
  struct DaosUserInfo duinfo;
  duinfo.info = info;
  duinfo.attrs = attrs;
  duinfo.user_version = obj_ver;
  duinfo.encode(bl);

  // Initialize ds3_user_info
  // access_ids is a member so the c_str() pointers remain valid after this
  // function returns.
  access_ids.clear();
  for (auto const& [id, key] : info.access_keys) {
    access_ids.push_back(id.c_str());
  }
  // NOTE(review): info.user_id.to_str() appears to return a temporary
  // std::string, so `.name` may dangle as soon as this full expression ends
  // — confirm rgw_user::to_str()'s return type and, if it is by value, cache
  // the string in a member (as done for access_ids).
  return std::unique_ptr<struct ds3_user_info>(
      new ds3_user_info{.name = info.user_id.to_str().c_str(),
                        .email = info.user_email.c_str(),
                        .access_ids = access_ids.data(),
                        .access_ids_nr = access_ids.size(),
                        .encoded = bl.c_str(),
                        .encoded_length = bl.length()});
}
+
+int DaosUser::remove_user(const DoutPrefixProvider* dpp, optional_yield y) {
+ const string name = info.user_id.to_str();
+
+ // TODO: the expectation is that the object version needs to be passed in as a
+ // method arg see int DB::remove_user(const DoutPrefixProvider *dpp,
+ // RGWUserInfo& uinfo, RGWObjVersionTracker *pobjv)
+ obj_version obj_ver;
+ bufferlist bl;
+ std::unique_ptr<struct ds3_user_info> user_info =
+ get_encoded_info(bl, obj_ver);
+
+ // Remove user
+ int ret = ds3_user_remove(name.c_str(), user_info.get(), store->ds3, nullptr);
+ if (ret != 0) {
+ ldpp_dout(dpp, 0) << "Error: ds3_user_set failed, name=" << name
+ << " ret=" << ret << dendl;
+ }
+ return ret;
+}
+
+DaosBucket::~DaosBucket() { close(nullptr); }
+
+int DaosBucket::open(const DoutPrefixProvider* dpp) {
+ ldpp_dout(dpp, 20) << "DEBUG: open, name=" << info.bucket.name.c_str()
+ << dendl;
+ // Idempotent
+ if (is_open()) {
+ return 0;
+ }
+
+ int ret = ds3_bucket_open(get_name().c_str(), &ds3b, store->ds3, nullptr);
+ ldpp_dout(dpp, 20) << "DEBUG: ds3_bucket_open, name=" << get_name()
+ << ", ret=" << ret << dendl;
+
+ return ret;
+}
+
+int DaosBucket::close(const DoutPrefixProvider* dpp) {
+ ldpp_dout(dpp, 20) << "DEBUG: close" << dendl;
+ // Idempotent
+ if (!is_open()) {
+ return 0;
+ }
+
+ int ret = ds3_bucket_close(ds3b, nullptr);
+ ds3b = nullptr;
+ ldpp_dout(dpp, 20) << "DEBUG: ds3_bucket_close ret=" << ret << dendl;
+
+ return ret;
+}
+
+std::unique_ptr<struct ds3_bucket_info> DaosBucket::get_encoded_info(
+ bufferlist& bl, ceph::real_time _mtime) {
+ DaosBucketInfo dbinfo;
+ dbinfo.info = info;
+ dbinfo.bucket_attrs = attrs;
+ dbinfo.mtime = _mtime;
+ dbinfo.bucket_version = bucket_version;
+ dbinfo.encode(bl);
+
+ auto bucket_info = std::make_unique<struct ds3_bucket_info>();
+ bucket_info->encoded = bl.c_str();
+ bucket_info->encoded_length = bl.length();
+ std::strncpy(bucket_info->name, get_name().c_str(), sizeof(bucket_info->name));
+ return bucket_info;
+}
+
+int DaosBucket::remove_bucket(const DoutPrefixProvider* dpp,
+ bool delete_children, bool forward_to_master,
+ req_info* req_info, optional_yield y) {
+ ldpp_dout(dpp, 20) << "DEBUG: remove_bucket, delete_children="
+
+ << delete_children
+
+ << " forward_to_master=" << forward_to_master << dendl;
+
+ return ds3_bucket_destroy(get_name().c_str(), delete_children, store->ds3,
+ nullptr);
+}
+
+int DaosBucket::remove_bucket_bypass_gc(int concurrent_max,
+ bool keep_index_consistent,
+ optional_yield y,
+ const DoutPrefixProvider* dpp) {
+ ldpp_dout(dpp, 20) << "DEBUG: remove_bucket_bypass_gc, concurrent_max="
+
+ << concurrent_max
+
+ << " keep_index_consistent=" << keep_index_consistent
+
+ << dendl;
+ return ds3_bucket_destroy(get_name().c_str(), true, store->ds3, nullptr);
+}
+
+int DaosBucket::put_info(const DoutPrefixProvider* dpp, bool exclusive,
+ ceph::real_time _mtime) {
+ ldpp_dout(dpp, 20) << "DEBUG: put_info(): bucket name=" << get_name()
+ << dendl;
+
+ int ret = open(dpp);
+ if (ret != 0) {
+ return ret;
+ }
+
+ bufferlist bl;
+ std::unique_ptr<struct ds3_bucket_info> bucket_info =
+ get_encoded_info(bl, ceph::real_time());
+
+ ret = ds3_bucket_set_info(bucket_info.get(), ds3b, nullptr);
+ if (ret != 0) {
+ ldpp_dout(dpp, 0) << "ERROR: ds3_bucket_set_info failed: " << ret << dendl;
+ }
+ return ret;
+}
+
// Load this bucket's persisted info/attrs/version from libds3 into the
// in-memory state. `get_stats` is ignored (stats are not implemented yet).
int DaosBucket::load_bucket(const DoutPrefixProvider* dpp, optional_yield y,
                            bool get_stats) {
  ldpp_dout(dpp, 20) << "DEBUG: load_bucket(): bucket name=" << get_name()
                     << dendl;
  int ret = open(dpp);
  if (ret != 0) {
    return ret;
  }

  // Reserve a maximum-size hole in bl; ds3_bucket_get_info fills it.
  bufferlist bl;
  DaosBucketInfo dbinfo;
  uint64_t size = DS3_MAX_ENCODED_LEN;
  struct ds3_bucket_info bucket_info = {.encoded = bl.append_hole(size).c_str(),
                                        .encoded_length = size};

  ret = ds3_bucket_get_info(&bucket_info, ds3b, nullptr);
  if (ret != 0) {
    ldpp_dout(dpp, 0) << "ERROR: ds3_bucket_get_info failed: " << ret << dendl;
    return ret;
  }

  auto iter = bl.cbegin();
  dbinfo.decode(iter);
  info = dbinfo.info;
  // The DAOS backend only supports a single placement rule, so the stored
  // rule is overridden with the default/STANDARD pair.
  rgw_placement_rule placement_rule;
  placement_rule.name = "default";
  placement_rule.storage_class = "STANDARD";
  info.placement_rule = placement_rule;

  attrs = dbinfo.bucket_attrs;
  mtime = dbinfo.mtime;
  bucket_version = dbinfo.bucket_version;
  return ret;
}
+
+/* stats - Not for first pass */
int DaosBucket::read_stats(const DoutPrefixProvider* dpp,
                           const bucket_index_layout_generation& idx_layout,
                           int shard_id, std::string* bucket_ver,
                           std::string* master_ver,
                           std::map<RGWObjCategory, RGWStorageStats>& stats,
                           std::string* max_marker, bool* syncstopped) {
  // Bucket stats are not implemented in the first pass.
  return DAOS_NOT_IMPLEMENTED_LOG(dpp);
}
+
int DaosBucket::read_stats_async(
    const DoutPrefixProvider* dpp,
    const bucket_index_layout_generation& idx_layout, int shard_id,
    RGWGetBucketStats_CB* ctx) {
  // Asynchronous bucket stats are not implemented in the first pass.
  return DAOS_NOT_IMPLEMENTED_LOG(dpp);
}
+
int DaosBucket::sync_user_stats(const DoutPrefixProvider* dpp,
                                optional_yield y) {
  // User stats syncing is not implemented in the first pass.
  return DAOS_NOT_IMPLEMENTED_LOG(dpp);
}
+
int DaosBucket::update_container_stats(const DoutPrefixProvider* dpp) {
  // Container stats are not implemented in the first pass.
  return DAOS_NOT_IMPLEMENTED_LOG(dpp);
}
+
int DaosBucket::check_bucket_shards(const DoutPrefixProvider* dpp) {
  // Bucket sharding does not apply to the DAOS backend; not implemented.
  return DAOS_NOT_IMPLEMENTED_LOG(dpp);
}
+
int DaosBucket::chown(const DoutPrefixProvider* dpp, User& new_user,
                      optional_yield y) {
  // Bucket ownership transfer is not implemented in the first pass.
  return DAOS_NOT_IMPLEMENTED_LOG(dpp);
}
+
+/* Make sure to call load_bucket() if you need it first */
+bool DaosBucket::is_owner(User* user) {
+ return (info.owner.compare(user->get_id()) == 0);
+}
+
int DaosBucket::check_empty(const DoutPrefixProvider* dpp, optional_yield y) {
  /* XXX: Check if bucket contains any objects */
  // Not implemented yet; always reports the stub error.
  return DAOS_NOT_IMPLEMENTED_LOG(dpp);
}
+
int DaosBucket::check_quota(const DoutPrefixProvider* dpp, RGWQuota& quota,
                            uint64_t obj_size, optional_yield y,
                            bool check_size_only) {
  /* Not Handled in the first pass as stats are also needed */
  return DAOS_NOT_IMPLEMENTED_LOG(dpp);
}
+
// Overlay new_attrs onto the cached bucket attrs (new values win), then
// persist the bucket record via put_info().
int DaosBucket::merge_and_store_attrs(const DoutPrefixProvider* dpp,
                                      Attrs& new_attrs, optional_yield y) {
  ldpp_dout(dpp, 20) << "DEBUG: merge_and_store_attrs, new_attrs=" << new_attrs
                     << dendl;
  for (auto& it : new_attrs) {
    attrs[it.first] = it.second;
  }

  // NOTE(review): put_info() is declared (dpp, bool exclusive, real_time),
  // yet `y` (an optional_yield) is passed in the bool position — confirm
  // this conversion is intentional and not a slipped argument.
  return put_info(dpp, y, ceph::real_time());
}
+
int DaosBucket::try_refresh_info(const DoutPrefixProvider* dpp,
                                 ceph::real_time* pmtime) {
  // Info refresh is not implemented in the first pass.
  return DAOS_NOT_IMPLEMENTED_LOG(dpp);
}
+
/* XXX: usage and stats not supported in the first pass */
int DaosBucket::read_usage(const DoutPrefixProvider* dpp, uint64_t start_epoch,
                           uint64_t end_epoch, uint32_t max_entries,
                           bool* is_truncated, RGWUsageIter& usage_iter,
                           map<rgw_user_bucket, rgw_usage_log_entry>& usage) {
  return DAOS_NOT_IMPLEMENTED_LOG(dpp);
}
+
int DaosBucket::trim_usage(const DoutPrefixProvider* dpp, uint64_t start_epoch,
                           uint64_t end_epoch) {
  // Usage trimming is not implemented in the first pass.
  return DAOS_NOT_IMPLEMENTED_LOG(dpp);
}
+
int DaosBucket::remove_objs_from_index(
    const DoutPrefixProvider* dpp,
    std::list<rgw_obj_index_key>& objs_to_unlink) {
  /* XXX: CHECK: Unlike RadosStore, there is no seperate bucket index table.
   * Delete all the object in the list from the object table of this
   * bucket
   */
  return DAOS_NOT_IMPLEMENTED_LOG(dpp);
}
+
int DaosBucket::check_index(
    const DoutPrefixProvider* dpp,
    std::map<RGWObjCategory, RGWStorageStats>& existing_stats,
    std::map<RGWObjCategory, RGWStorageStats>& calculated_stats) {
  /* XXX: stats not supported yet */
  return DAOS_NOT_IMPLEMENTED_LOG(dpp);
}
+
int DaosBucket::rebuild_index(const DoutPrefixProvider* dpp) {
  /* there is no index table in DAOS. Not applicable */
  return DAOS_NOT_IMPLEMENTED_LOG(dpp);
}
+
int DaosBucket::set_tag_timeout(const DoutPrefixProvider* dpp,
                                uint64_t timeout) {
  /* XXX: CHECK: set tag timeout for all the bucket objects? */
  return DAOS_NOT_IMPLEMENTED_LOG(dpp);
}
+
int DaosBucket::purge_instance(const DoutPrefixProvider* dpp) {
  /* XXX: CHECK: for DAOS only single instance supported.
   * Remove all the objects for that instance? Anything extra needed?
   */
  return DAOS_NOT_IMPLEMENTED_LOG(dpp);
}
+
+int DaosBucket::set_acl(const DoutPrefixProvider* dpp,
+ RGWAccessControlPolicy& acl, optional_yield y) {
+ ldpp_dout(dpp, 20) << "DEBUG: set_acl" << dendl;
+ int ret = 0;
+ bufferlist aclbl;
+
+ acls = acl;
+ acl.encode(aclbl);
+
+ Attrs attrs = get_attrs();
+ attrs[RGW_ATTR_ACL] = aclbl;
+
+ return ret;
+}
+
// Factory for a DaosObject bound to this bucket and the given key.
std::unique_ptr<Object> DaosBucket::get_object(const rgw_obj_key& k) {
  return std::make_unique<DaosObject>(this->store, k, this);
}
+
+bool compare_rgw_bucket_dir_entry(rgw_bucket_dir_entry& entry1,
+ rgw_bucket_dir_entry& entry2) {
+ return (entry1.key < entry2.key);
+}
+
+bool compare_multipart_upload(std::unique_ptr<MultipartUpload>& upload1,
+ std::unique_ptr<MultipartUpload>& upload2) {
+ return (upload1->get_key() < upload2->get_key());
+}
+
+int DaosBucket::list(const DoutPrefixProvider* dpp, ListParams& params, int max,
+ ListResults& results, optional_yield y) {
+ ldpp_dout(dpp, 20) << "DEBUG: list bucket=" << get_name() << " max=" << max
+ << " params=" << params << dendl;
+ // End
+ if (max == 0) {
+ return 0;
+ }
+
+ int ret = open(dpp);
+ if (ret != 0) {
+ return ret;
+ }
+
+ // Init needed structures
+ vector<struct ds3_object_info> object_infos(max);
+ uint32_t nobj = object_infos.size();
+ vector<vector<uint8_t>> values(nobj, vector<uint8_t>(DS3_MAX_ENCODED_LEN));
+ for (uint32_t i = 0; i < nobj; i++) {
+ object_infos[i].encoded = values[i].data();
+ object_infos[i].encoded_length = values[i].size();
+ }
+
+ vector<struct ds3_common_prefix_info> common_prefixes(max);
+ uint32_t ncp = common_prefixes.size();
+
+ char daos_marker[DS3_MAX_KEY_BUFF];
+ std::strncpy(daos_marker, params.marker.get_oid().c_str(), sizeof(daos_marker));
+
+ ret = ds3_bucket_list_obj(&nobj, object_infos.data(), &ncp,
+ common_prefixes.data(), params.prefix.c_str(),
+ params.delim.c_str(), daos_marker,
+ params.list_versions, &results.is_truncated, ds3b);
+
+ if (ret != 0) {
+ ldpp_dout(dpp, 0) << "ERROR: ds3_bucket_list_obj failed, name="
+ << get_name() << ", ret=" << ret << dendl;
+ return ret;
+ }
+
+ object_infos.resize(nobj);
+ values.resize(nobj);
+ common_prefixes.resize(ncp);
+
+ // Fill common prefixes
+ for (auto const& cp : common_prefixes) {
+ results.common_prefixes[cp.prefix] = true;
+ }
+
+ // Decode objs
+ for (auto const& obj : object_infos) {
+ bufferlist bl;
+ rgw_bucket_dir_entry ent;
+ bl.append(reinterpret_cast<char*>(obj.encoded), obj.encoded_length);
+ auto iter = bl.cbegin();
+ ent.decode(iter);
+ if (params.list_versions || ent.is_visible()) {
+ results.objs.emplace_back(std::move(ent));
+ }
+ }
+
+ if (!params.allow_unordered) {
+ std::sort(results.objs.begin(), results.objs.end(),
+ compare_rgw_bucket_dir_entry);
+ }
+
+ return ret;
+}
+
+int DaosBucket::list_multiparts(
+ const DoutPrefixProvider* dpp, const string& prefix, string& marker,
+ const string& delim, const int& max_uploads,
+ vector<std::unique_ptr<MultipartUpload>>& uploads,
+ map<string, bool>* common_prefixes, bool* is_truncated) {
+ ldpp_dout(dpp, 20) << "DEBUG: list_multiparts" << dendl;
+ // End of uploading
+ if (max_uploads == 0) {
+ *is_truncated = false;
+ return 0;
+ }
+
+ // Init needed structures
+ vector<struct ds3_multipart_upload_info> multipart_upload_infos(max_uploads);
+ uint32_t nmp = multipart_upload_infos.size();
+ vector<vector<uint8_t>> values(nmp, vector<uint8_t>(DS3_MAX_ENCODED_LEN));
+ for (uint32_t i = 0; i < nmp; i++) {
+ multipart_upload_infos[i].encoded = values[i].data();
+ multipart_upload_infos[i].encoded_length = values[i].size();
+ }
+
+ vector<struct ds3_common_prefix_info> cps(max_uploads);
+ uint32_t ncp = cps.size();
+
+ char daos_marker[DS3_MAX_KEY_BUFF];
+ std::strncpy(daos_marker, marker.c_str(), sizeof(daos_marker));
+
+ int ret = ds3_bucket_list_multipart(
+ get_name().c_str(), &nmp, multipart_upload_infos.data(), &ncp, cps.data(),
+ prefix.c_str(), delim.c_str(), daos_marker, is_truncated, store->ds3);
+
+ multipart_upload_infos.resize(nmp);
+ values.resize(nmp);
+ cps.resize(ncp);
+
+ // Fill common prefixes
+ for (auto const& cp : cps) {
+ (*common_prefixes)[cp.prefix] = true;
+ }
+
+ for (auto const& mp : multipart_upload_infos) {
+ // Decode the xattr
+ bufferlist bl;
+ rgw_bucket_dir_entry ent;
+ bl.append(reinterpret_cast<char*>(mp.encoded), mp.encoded_length);
+ auto iter = bl.cbegin();
+ ent.decode(iter);
+ string name = ent.key.name;
+
+ ACLOwner owner(rgw_user(ent.meta.owner));
+ owner.set_name(ent.meta.owner_display_name);
+ uploads.push_back(this->get_multipart_upload(
+ name, mp.upload_id, std::move(owner), ent.meta.mtime));
+ }
+
+ // Sort uploads
+ std::sort(uploads.begin(), uploads.end(), compare_multipart_upload);
+
+ return ret;
+}
+
int DaosBucket::abort_multiparts(const DoutPrefixProvider* dpp,
                                 CephContext* cct) {
  // Aborting all multipart uploads is not implemented in the first pass.
  return DAOS_NOT_IMPLEMENTED_LOG(dpp);
}
+
+void DaosStore::finalize(void) {
+ ldout(cctx, 20) << "DEBUG: finalize" << dendl;
+ int ret;
+
+ ret = ds3_disconnect(ds3, nullptr);
+ if (ret != 0) {
+ ldout(cctx, 0) << "ERROR: ds3_disconnect() failed: " << ret << dendl;
+ }
+ ds3 = nullptr;
+
+ ret = ds3_fini();
+ if (ret != 0) {
+ ldout(cctx, 0) << "ERROR: daos_fini() failed: " << ret << dendl;
+ }
+}
+
// Bring up the store: initialize libds3 (tolerating double-init) and connect
// to the configured DAOS pool. On connect failure libds3 is finalized again.
int DaosStore::initialize(CephContext* cct, const DoutPrefixProvider* dpp) {
  ldpp_dout(dpp, 20) << "DEBUG: initialize" << dendl;
  int ret = ds3_init();

  // DS3 init failed, allow the case where init is already done
  if (ret != 0 && ret != DER_ALREADY) {
    ldout(cct, 0) << "ERROR: ds3_init() failed: " << ret << dendl;
    return ret;
  }

  // XXX: these params should be taken from config settings and
  // cct somehow?
  const auto& daos_pool = cct->_conf.get_val<std::string>("daos_pool");
  ldout(cct, 20) << "INFO: daos pool: " << daos_pool << dendl;

  ret = ds3_connect(daos_pool.c_str(), nullptr, &ds3, nullptr);

  if (ret != 0) {
    ldout(cct, 0) << "ERROR: ds3_connect() failed: " << ret << dendl;
    // Undo ds3_init() so a later retry starts clean.
    ds3_fini();
  }

  return ret;
}
+
+const std::string& DaosZoneGroup::get_endpoint() const {
+ if (!group.endpoints.empty()) {
+ return group.endpoints.front();
+ } else {
+ // use zonegroup's master zone endpoints
+ auto z = group.zones.find(group.master_zone);
+ if (z != group.zones.end() && !z->second.endpoints.empty()) {
+ return z->second.endpoints.front();
+ }
+ }
+ return empty;
+}
+
+bool DaosZoneGroup::placement_target_exists(std::string& target) const {
+ return !!group.placement_targets.count(target);
+}
+
+void DaosZoneGroup::get_placement_target_names(
+ std::set<std::string>& names) const {
+ for (const auto& target : group.placement_targets) {
+ names.emplace(target.second.name);
+ }
+}
+
+int DaosZoneGroup::get_placement_tier(const rgw_placement_rule& rule,
+ std::unique_ptr<PlacementTier>* tier) {
+ std::map<std::string, RGWZoneGroupPlacementTarget>::const_iterator titer;
+ titer = group.placement_targets.find(rule.name);
+ if (titer == group.placement_targets.end()) {
+ return -ENOENT;
+ }
+
+ const auto& target_rule = titer->second;
+ std::map<std::string, RGWZoneGroupPlacementTier>::const_iterator ttier;
+ ttier = target_rule.tier_targets.find(rule.storage_class);
+ if (ttier == target_rule.tier_targets.end()) {
+ // not found
+ return -ENOENT;
+ }
+
+ PlacementTier* t;
+ t = new DaosPlacementTier(store, ttier->second);
+ if (!t) return -ENOMEM;
+
+ tier->reset(t);
+ return 0;
+}
+
+ZoneGroup& DaosZone::get_zonegroup() { return zonegroup; }
+
+int DaosZone::get_zonegroup(const std::string& id,
+ std::unique_ptr<ZoneGroup>* group) {
+ /* XXX: for now only one zonegroup supported */
+ ZoneGroup* zg;
+ zg = new DaosZoneGroup(store, zonegroup.get_group());
+
+ group->reset(zg);
+ return 0;
+}
+
+const rgw_zone_id& DaosZone::get_id() { return cur_zone_id; }
+
// Accessor for the zone name, delegated to the zone params.
const std::string& DaosZone::get_name() const {
  return zone_params->get_name();
}
+
+bool DaosZone::is_writeable() { return true; }
+
+bool DaosZone::get_redirect_endpoint(std::string* endpoint) { return false; }
+
+bool DaosZone::has_zonegroup_api(const std::string& api) const { return false; }
+
// Accessor for the current period id, delegated to the period object.
const std::string& DaosZone::get_current_period_id() {
  return current_period->get_id();
}
+
// Factory for the store's Lua scripting manager.
std::unique_ptr<LuaManager> DaosStore::get_lua_manager() {
  return std::make_unique<DaosLuaManager>(this);
}
+
// Populate this object's RGWObjState (size, mtime, etag) from its bucket
// dirent and expose it via *_state. `follow_olh` is currently ignored.
int DaosObject::get_obj_state(const DoutPrefixProvider* dpp,
                              RGWObjState** _state, optional_yield y,
                              bool follow_olh) {
  // Get object's metadata (those stored in rgw_bucket_dir_entry)
  ldpp_dout(dpp, 20) << "DEBUG: get_obj_state" << dendl;
  rgw_bucket_dir_entry ent;
  *_state = &state;  // state is required even if a failure occurs

  int ret = get_dir_entry_attrs(dpp, &ent);
  if (ret != 0) {
    return ret;
  }

  // Set object state.
  state.exists = true;
  state.size = ent.meta.size;
  state.accounted_size = ent.meta.size;
  state.mtime = ent.meta.mtime;

  state.has_attrs = true;
  bufferlist etag_bl;
  string& etag = ent.meta.etag;
  ldpp_dout(dpp, 20) << __func__ << ": object's etag: " << ent.meta.etag
                     << dendl;
  etag_bl.append(etag);
  state.attrset[RGW_ATTR_ETAG] = etag_bl;
  return 0;
}
+
+DaosObject::~DaosObject() { close(nullptr); }
+
+int DaosObject::set_obj_attrs(const DoutPrefixProvider* dpp, Attrs* setattrs,
+ Attrs* delattrs, optional_yield y) {
+ ldpp_dout(dpp, 20) << "DEBUG: DaosObject::set_obj_attrs()" << dendl;
+ // TODO handle target_obj
+ // Get object's metadata (those stored in rgw_bucket_dir_entry)
+ rgw_bucket_dir_entry ent;
+ int ret = get_dir_entry_attrs(dpp, &ent);
+ if (ret != 0) {
+ return ret;
+ }
+
+ // Update object metadata
+ Attrs updateattrs = setattrs == nullptr ? attrs : *setattrs;
+ if (delattrs) {
+ for (auto const& [attr, attrval] : *delattrs) {
+ updateattrs.erase(attr);
+ }
+ }
+
+ ret = set_dir_entry_attrs(dpp, &ent, &updateattrs);
+ return ret;
+}
+
// Refresh this object's cached attribute map from its bucket dirent.
int DaosObject::get_obj_attrs(optional_yield y, const DoutPrefixProvider* dpp,
                              rgw_obj* target_obj) {
  ldpp_dout(dpp, 20) << "DEBUG: DaosObject::get_obj_attrs()" << dendl;
  // TODO handle target_obj
  // Get object's metadata (those stored in rgw_bucket_dir_entry)
  rgw_bucket_dir_entry ent;
  int ret = get_dir_entry_attrs(dpp, &ent, &attrs);
  return ret;
}
+
// Set (or overwrite) a single attribute on this object and persist the
// resulting attribute map.
int DaosObject::modify_obj_attrs(const char* attr_name, bufferlist& attr_val,
                                 optional_yield y,
                                 const DoutPrefixProvider* dpp) {
  // Get object's metadata (those stored in rgw_bucket_dir_entry)
  ldpp_dout(dpp, 20) << "DEBUG: modify_obj_attrs" << dendl;
  rgw_bucket_dir_entry ent;
  int ret = get_dir_entry_attrs(dpp, &ent, &attrs);
  if (ret != 0) {
    return ret;
  }

  // Update object attrs
  set_atomic();
  attrs[attr_name] = attr_val;

  ret = set_dir_entry_attrs(dpp, &ent, &attrs);
  return ret;
}
+
+int DaosObject::delete_obj_attrs(const DoutPrefixProvider* dpp,
+ const char* attr_name, optional_yield y) {
+ ldpp_dout(dpp, 20) << "DEBUG: delete_obj_attrs" << dendl;
+ rgw_obj target = get_obj();
+ Attrs rmattr;
+ bufferlist bl;
+
+ rmattr[attr_name] = bl;
+ return set_obj_attrs(dpp, nullptr, &rmattr, y);
+}
+
+bool DaosObject::is_expired() {
+ auto iter = attrs.find(RGW_ATTR_DELETE_AT);
+ if (iter != attrs.end()) {
+ utime_t delete_at;
+ try {
+ auto bufit = iter->second.cbegin();
+ decode(delete_at, bufit);
+ } catch (buffer::error& err) {
+ ldout(store->ctx(), 0)
+ << "ERROR: " << __func__
+ << ": failed to decode " RGW_ATTR_DELETE_AT " attr" << dendl;
+ return false;
+ }
+
+ if (delete_at <= ceph_clock_now() && !delete_at.is_zero()) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
// Taken from rgw_rados.cc
// Assign a random 32-character alphanumeric version (instance) id to this
// object's key.
void DaosObject::gen_rand_obj_instance_name() {
  enum { OBJ_INSTANCE_LEN = 32 };
  char buf[OBJ_INSTANCE_LEN + 1];

  gen_rand_alphanumeric_no_underscore(store->ctx(), buf, OBJ_INSTANCE_LEN);
  state.obj.key.set_instance(buf);
}
+
int DaosObject::omap_get_vals(const DoutPrefixProvider* dpp,
                              const std::string& marker, uint64_t count,
                              std::map<std::string, bufferlist>* m, bool* pmore,
                              optional_yield y) {
  // omap operations are not implemented for the DAOS backend.
  return DAOS_NOT_IMPLEMENTED_LOG(dpp);
}
+
int DaosObject::omap_get_all(const DoutPrefixProvider* dpp,
                             std::map<std::string, bufferlist>* m,
                             optional_yield y) {
  // omap operations are not implemented for the DAOS backend.
  return DAOS_NOT_IMPLEMENTED_LOG(dpp);
}
+
int DaosObject::omap_get_vals_by_keys(const DoutPrefixProvider* dpp,
                                      const std::string& oid,
                                      const std::set<std::string>& keys,
                                      Attrs* vals) {
  // omap operations are not implemented for the DAOS backend.
  return DAOS_NOT_IMPLEMENTED_LOG(dpp);
}
+
int DaosObject::omap_set_val_by_key(const DoutPrefixProvider* dpp,
                                    const std::string& key, bufferlist& val,
                                    bool must_exist, optional_yield y) {
  // omap operations are not implemented for the DAOS backend.
  return DAOS_NOT_IMPLEMENTED_LOG(dpp);
}
+
// Object ownership transfer is a silent no-op for the DAOS backend.
int DaosObject::chown(User& new_user, const DoutPrefixProvider* dpp, optional_yield y) {
  return 0;
}
+
// Factory for the multipart lock serializer bound to this object.
std::unique_ptr<MPSerializer> DaosObject::get_serializer(
    const DoutPrefixProvider* dpp, const std::string& lock_name) {
  return std::make_unique<MPDaosSerializer>(dpp, store, this, lock_name);
}
+
int DaosObject::transition(Bucket* bucket,
                           const rgw_placement_rule& placement_rule,
                           const real_time& mtime, uint64_t olh_epoch,
                           const DoutPrefixProvider* dpp, optional_yield y) {
  // Storage-class transitions are not implemented for the DAOS backend.
  return DAOS_NOT_IMPLEMENTED_LOG(dpp);
}
+
int DaosObject::transition_to_cloud(
    Bucket* bucket, rgw::sal::PlacementTier* tier, rgw_bucket_dir_entry& o,
    std::set<std::string>& cloud_targets, CephContext* cct, bool update_object,
    const DoutPrefixProvider* dpp, optional_yield y) {
  // Cloud tiering is not implemented for the DAOS backend.
  return DAOS_NOT_IMPLEMENTED_LOG(dpp);
}
+
bool DaosObject::placement_rules_match(rgw_placement_rule& r1,
                                       rgw_placement_rule& r2) {
  /* XXX: support single default zone and zonegroup for now */
  // With a single placement rule, any two rules are treated as matching.
  return true;
}
+
int DaosObject::dump_obj_layout(const DoutPrefixProvider* dpp, optional_yield y,
                                Formatter* f) {
  // Layout dumping is not implemented for the DAOS backend.
  return DAOS_NOT_IMPLEMENTED_LOG(dpp);
}
+
// Factory for a read operation bound to this object.
std::unique_ptr<Object::ReadOp> DaosObject::get_read_op() {
  return std::make_unique<DaosObject::DaosReadOp>(this);
}
+
+DaosObject::DaosReadOp::DaosReadOp(DaosObject* _source) : source(_source) {}
+
+int DaosObject::DaosReadOp::prepare(optional_yield y,
+ const DoutPrefixProvider* dpp) {
+ ldpp_dout(dpp, 20) << __func__
+ << ": bucket=" << source->get_bucket()->get_name()
+ << dendl;
+
+ if (source->get_bucket()->versioned() && !source->have_instance()) {
+ // If the bucket is versioned and no version is specified, get the latest
+ // version
+ source->set_instance(DS3_LATEST_INSTANCE);
+ }
+
+ rgw_bucket_dir_entry ent;
+ int ret = source->get_dir_entry_attrs(dpp, &ent);
+
+ // Set source object's attrs. The attrs is key/value map and is used
+ // in send_response_data() to set attributes, including etag.
+ bufferlist etag_bl;
+ string& etag = ent.meta.etag;
+ ldpp_dout(dpp, 20) << __func__ << ": object's etag: " << ent.meta.etag
+ << dendl;
+ etag_bl.append(etag.c_str(), etag.size());
+ source->get_attrs().emplace(std::move(RGW_ATTR_ETAG), std::move(etag_bl));
+
+ source->set_key(ent.key);
+ source->set_obj_size(ent.meta.size);
+ ldpp_dout(dpp, 20) << __func__ << ": object's size: " << ent.meta.size
+ << dendl;
+
+ return ret;
+}
+
+int DaosObject::DaosReadOp::read(int64_t off, int64_t end, bufferlist& bl,
+ optional_yield y,
+ const DoutPrefixProvider* dpp) {
+ ldpp_dout(dpp, 20) << __func__ << ": off=" << off << " end=" << end << dendl;
+ int ret = source->lookup(dpp);
+ if (ret != 0) {
+ return ret;
+ }
+
+ // Calculate size, end is inclusive
+ uint64_t size = end - off + 1;
+
+ // Read
+ ret = source->read(dpp, bl, off, size);
+ if (ret != 0) {
+ return ret;
+ }
+
+ return ret;
+}
+
+// RGWGetObj::execute() calls ReadOp::iterate() to read object from 'off' to
+// 'end'. The returned data is processed in 'cb' which is a chain of
+// post-processing filters such as decompression, de-encryption and sending back
+// data to client (RGWGetObj_CB::handle_dta which in turn calls
+// RGWGetObj::get_data_cb() to send data back.).
+//
+// POC implements a simple sync version of iterate() function in which it reads
+// a block of data each time and call 'cb' for post-processing.
// Synchronous iterate: reads the whole [off, end] span in one call and
// passes it to the callback chain (see the comment block above).
int DaosObject::DaosReadOp::iterate(const DoutPrefixProvider* dpp, int64_t off,
                                    int64_t end, RGWGetDataCB* cb,
                                    optional_yield y) {
  ldpp_dout(dpp, 20) << __func__ << ": off=" << off << " end=" << end << dendl;
  int ret = source->lookup(dpp);
  if (ret != 0) {
    return ret;
  }

  // Calculate size, end is inclusive
  uint64_t size = end - off + 1;

  // Reserve buffers and read
  bufferlist bl;
  ret = source->read(dpp, bl, off, size);
  if (ret != 0) {
    return ret;
  }

  // Call cb to process returned data.
  // NOTE: the return value of handle_data is not checked here.
  ldpp_dout(dpp, 20) << __func__ << ": call cb to process data, actual=" << size
                     << dendl;
  cb->handle_data(bl, off, size);
  return ret;
}
+
+int DaosObject::DaosReadOp::get_attr(const DoutPrefixProvider* dpp,
+ const char* name, bufferlist& dest,
+ optional_yield y) {
+ Attrs attrs;
+ int ret = source->get_dir_entry_attrs(dpp, nullptr, &attrs);
+ if (!ret) {
+ return -ENODATA;
+ }
+
+ auto search = attrs.find(name);
+ if (search == attrs.end()) {
+ return -ENODATA;
+ }
+
+ dest = search->second;
+ return 0;
+}
+
// Factory for a delete operation bound to this object.
std::unique_ptr<Object::DeleteOp> DaosObject::get_delete_op() {
  return std::make_unique<DaosObject::DaosDeleteOp>(this);
}
+
+DaosObject::DaosDeleteOp::DaosDeleteOp(DaosObject* _source) : source(_source) {}
+
+// Implementation of DELETE OBJ also requires DaosObject::get_obj_state()
+// to retrieve and set object's state from object's metadata.
+//
+// TODO:
+// 1. The POC only deletes the Daos objects. It doesn't handle the
+// DeleteOp::params. Delete::delete_obj() in rgw_rados.cc shows how rados
+// backend process the params.
+// 2. Delete an object when its versioning is turned on.
+// 3. Handle empty directories
+// 4. Fail when file doesn't exist
// Destroy the backing ds3 object. See the TODO block above: DeleteOp
// params, versioned deletes and missing-object errors are not handled yet.
int DaosObject::DaosDeleteOp::delete_obj(const DoutPrefixProvider* dpp,
                                         optional_yield y) {
  ldpp_dout(dpp, 20) << "DaosDeleteOp::delete_obj "
                     << source->get_key().get_oid() << " from "
                     << source->get_bucket()->get_name() << dendl;
  // "null" selects the un-versioned instance.
  if (source->get_instance() == "null") {
    source->clear_instance();
  }

  // Open bucket
  int ret = 0;
  std::string key = source->get_key().get_oid();
  DaosBucket* daos_bucket = source->get_daos_bucket();
  ret = daos_bucket->open(dpp);
  if (ret != 0) {
    return ret;
  }

  // Remove the daos object
  ret = ds3_obj_destroy(key.c_str(), daos_bucket->ds3b);
  ldpp_dout(dpp, 20) << "DEBUG: ds3_obj_destroy key=" << key << " ret=" << ret
                     << dendl;

  // result.delete_marker = parent_op.result.delete_marker;
  // result.version_id = parent_op.result.version_id;

  return ret;
}
+
+int DaosObject::delete_object(const DoutPrefixProvider* dpp, optional_yield y,
+ bool prevent_versioning) {
+ ldpp_dout(dpp, 20) << "DEBUG: delete_object" << dendl;
+ DaosObject::DaosDeleteOp del_op(this);
+ del_op.params.bucket_owner = bucket->get_info().owner;
+ del_op.params.versioning_status = bucket->get_info().versioning_status();
+
+ return del_op.delete_obj(dpp, y);
+}
+
int DaosObject::delete_obj_aio(const DoutPrefixProvider* dpp,
                               RGWObjState* astate, Completions* aio,
                               bool keep_index_consistent, optional_yield y) {
  /* XXX: Make it async */
  return DAOS_NOT_IMPLEMENTED_LOG(dpp);
}
+
int DaosObject::copy_object(
    User* user, req_info* info, const rgw_zone_id& source_zone,
    rgw::sal::Object* dest_object, rgw::sal::Bucket* dest_bucket,
    rgw::sal::Bucket* src_bucket, const rgw_placement_rule& dest_placement,
    ceph::real_time* src_mtime, ceph::real_time* mtime,
    const ceph::real_time* mod_ptr, const ceph::real_time* unmod_ptr,
    bool high_precision_time, const char* if_match, const char* if_nomatch,
    AttrsMod attrs_mod, bool copy_if_newer, Attrs& attrs,
    RGWObjCategory category, uint64_t olh_epoch,
    boost::optional<ceph::real_time> delete_at, std::string* version_id,
    std::string* tag, std::string* etag, void (*progress_cb)(off_t, void*),
    void* progress_data, const DoutPrefixProvider* dpp, optional_yield y) {
  // Server-side object copy is not implemented for the DAOS backend.
  return DAOS_NOT_IMPLEMENTED_LOG(dpp);
}
+
int DaosObject::swift_versioning_restore(bool& restored,
                                         const DoutPrefixProvider* dpp) {
  // Swift object versioning is not implemented for the DAOS backend.
  return DAOS_NOT_IMPLEMENTED_LOG(dpp);
}
+
int DaosObject::swift_versioning_copy(const DoutPrefixProvider* dpp,
                                      optional_yield y) {
  // Swift object versioning is not implemented for the DAOS backend.
  return DAOS_NOT_IMPLEMENTED_LOG(dpp);
}
+
// Open the underlying ds3 object handle for an EXISTING object (idempotent).
// -ENOENT is logged at debug level since callers use it as an existence
// probe; other failures are logged as errors.
int DaosObject::lookup(const DoutPrefixProvider* dpp) {
  ldpp_dout(dpp, 20) << "DEBUG: lookup" << dendl;
  if (is_open()) {
    return 0;
  }

  // "null" selects the un-versioned instance.
  if (get_instance() == "null") {
    clear_instance();
  }

  int ret = 0;
  DaosBucket* daos_bucket = get_daos_bucket();
  ret = daos_bucket->open(dpp);
  if (ret != 0) {
    return ret;
  }

  ret = ds3_obj_open(get_key().get_oid().c_str(), &ds3o, daos_bucket->ds3b);

  if (ret == -ENOENT) {
    ldpp_dout(dpp, 20) << "DEBUG: daos object (" << get_bucket()->get_name()
                       << ", " << get_key().get_oid()
                       << ") does not exist: ret=" << ret << dendl;
  } else if (ret != 0) {
    ldpp_dout(dpp, 0) << "ERROR: failed to open daos object ("
                      << get_bucket()->get_name() << ", " << get_key().get_oid()
                      << "): ret=" << ret << dendl;
  }
  return ret;
}
+
+// Create the backing DAOS object for this key and cache the handle in ds3o.
+// A no-op when a handle is already open. Returns 0 on success or a negative
+// ds3 error code.
+int DaosObject::create(const DoutPrefixProvider* dpp) {
+ ldpp_dout(dpp, 20) << "DEBUG: create" << dendl;
+ if (is_open()) {
+ return 0;
+ }
+
+ // "null" denotes the unversioned instance; strip it from the key.
+ if (get_instance() == "null") {
+ clear_instance();
+ }
+
+ // The containing bucket must be open before the object can be created.
+ DaosBucket* daos_bucket = get_daos_bucket();
+ int rc = daos_bucket->open(dpp);
+ if (rc != 0) {
+ return rc;
+ }
+
+ rc = ds3_obj_create(get_key().get_oid().c_str(), &ds3o, daos_bucket->ds3b);
+ if (rc != 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to create daos object ("
+ << get_bucket()->get_name() << ", " << get_key().get_oid()
+ << "): ret=" << rc << dendl;
+ }
+ return rc;
+}
+
+// Close the cached DAOS object handle, if any. The handle member is reset
+// unconditionally so a failed close cannot leave a dangling pointer behind.
+int DaosObject::close(const DoutPrefixProvider* dpp) {
+ ldpp_dout(dpp, 20) << "DEBUG: close" << dendl;
+ if (!is_open()) {
+ return 0;
+ }
+
+ auto* handle = ds3o;
+ ds3o = nullptr;
+ int rc = ds3_obj_close(handle);
+ ldpp_dout(dpp, 20) << "DEBUG: ds3_obj_close ret=" << rc << dendl;
+ return rc;
+}
+
+// Write `data` into the object at byte `offset`.
+// NOTE(review): assumes the handle was already opened via lookup()/create();
+// ds3_obj_write takes the buffer length in/out through `size`.
+int DaosObject::write(const DoutPrefixProvider* dpp, bufferlist&& data,
+ uint64_t offset) {
+ ldpp_dout(dpp, 20) << "DEBUG: write" << dendl;
+ uint64_t size = data.length();
+ int ret = ds3_obj_write(data.c_str(), offset, &size, get_daos_bucket()->ds3b,
+ ds3o, nullptr);
+ if (ret != 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to write into daos object ("
+ << get_bucket()->get_name() << ", " << get_key().get_oid()
+ << "): ret=" << ret << dendl;
+ }
+ return ret;
+}
+
+// Read up to `size` bytes at `offset` into `data`. `size` is in/out: on
+// return it holds the number of bytes actually read. A hole of `size` bytes
+// is appended to `data` to serve as the destination buffer.
+int DaosObject::read(const DoutPrefixProvider* dpp, bufferlist& data,
+ uint64_t offset, uint64_t& size) {
+ ldpp_dout(dpp, 20) << "DEBUG: read" << dendl;
+ int ret = ds3_obj_read(data.append_hole(size).c_str(), offset, &size,
+ get_daos_bucket()->ds3b, ds3o, nullptr);
+ if (ret != 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to read from daos object ("
+ << get_bucket()->get_name() << ", " << get_key().get_oid()
+ << "): ret=" << ret << dendl;
+ }
+ return ret;
+}
+
+// Get the object's dirent and attrs
+int DaosObject::get_dir_entry_attrs(const DoutPrefixProvider* dpp,
+ rgw_bucket_dir_entry* ent,
+ Attrs* getattrs) {
+ ldpp_dout(dpp, 20) << "DEBUG: get_dir_entry_attrs" << dendl;
+ int ret = 0;
+ vector<uint8_t> value(DS3_MAX_ENCODED_LEN);
+ uint32_t size = value.size();
+
+ if (get_key().ns == RGW_OBJ_NS_MULTIPART) {
+ struct ds3_multipart_upload_info ui = {.encoded = value.data(),
+ .encoded_length = size};
+ ret = ds3_upload_get_info(&ui, bucket->get_name().c_str(),
+ get_key().get_oid().c_str(), store->ds3);
+ } else {
+ ret = lookup(dpp);
+ if (ret != 0) {
+ return ret;
+ }
+
+ auto object_info = std::make_unique<struct ds3_object_info>();
+ object_info->encoded = value.data();
+ object_info->encoded_length = size;
+ ret = ds3_obj_get_info(object_info.get(), get_daos_bucket()->ds3b, ds3o);
+ size = object_info->encoded_length;
+ }
+
+ if (ret != 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to get info of daos object ("
+ << get_bucket()->get_name() << ", " << get_key().get_oid()
+ << "): ret=" << ret << dendl;
+ return ret;
+ }
+
+ rgw_bucket_dir_entry dummy_ent;
+ if (!ent) {
+ // if ent is not passed, use a dummy ent
+ ent = &dummy_ent;
+ }
+
+ bufferlist bl;
+ bl.append(reinterpret_cast<char*>(value.data()), size);
+ auto iter = bl.cbegin();
+ ent->decode(iter);
+ if (getattrs) {
+ decode(*getattrs, iter);
+ }
+
+ return ret;
+}
+// Set the object's dirent and attrs
+int DaosObject::set_dir_entry_attrs(const DoutPrefixProvider* dpp,
+ rgw_bucket_dir_entry* ent,
+ Attrs* setattrs) {
+ ldpp_dout(dpp, 20) << "DEBUG: set_dir_entry_attrs" << dendl;
+ int ret = lookup(dpp);
+ if (ret != 0) {
+ return ret;
+ }
+
+ // Set defaults
+ if (!ent) {
+ // if ent is not passed, return an error
+ return -EINVAL;
+ }
+
+ if (!setattrs) {
+ // if setattrs is not passed, use object attrs
+ setattrs = &attrs;
+ }
+
+ bufferlist wbl;
+ ent->encode(wbl);
+ encode(*setattrs, wbl);
+
+ // Write rgw_bucket_dir_entry into object xattr
+ auto object_info = std::make_unique<struct ds3_object_info>();
+ object_info->encoded = wbl.c_str();
+ object_info->encoded_length = wbl.length();
+ ret = ds3_obj_set_info(object_info.get(), get_daos_bucket()->ds3b, ds3o);
+ if (ret != 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to set info of daos object ("
+ << get_bucket()->get_name() << ", " << get_key().get_oid()
+ << "): ret=" << ret << dendl;
+ }
+ return ret;
+}
+
+// Make this version the bucket's "latest" for its key: demote the previous
+// [latest] link's dirent (drop FLAG_CURRENT, keep FLAG_VER) and repoint the
+// link at this object via ds3_obj_mark_latest.
+int DaosObject::mark_as_latest(const DoutPrefixProvider* dpp,
+ ceph::real_time set_mtime) {
+ // TODO handle deletion
+ // TODO understand race conditions
+ ldpp_dout(dpp, 20) << "DEBUG: mark_as_latest" << dendl;
+
+ // Get latest version so far
+ std::unique_ptr<DaosObject> latest_object = std::make_unique<DaosObject>(
+ store, rgw_obj_key(get_name(), DS3_LATEST_INSTANCE), get_bucket());
+
+ ldpp_dout(dpp, 20) << __func__ << ": key=" << get_key().get_oid()
+ << " latest_object_key= "
+ << latest_object->get_key().get_oid() << dendl;
+
+ int ret = latest_object->lookup(dpp);
+ if (ret == 0) {
+ // Get metadata only if file exists
+ rgw_bucket_dir_entry latest_ent;
+ Attrs latest_attrs;
+ ret = latest_object->get_dir_entry_attrs(dpp, &latest_ent, &latest_attrs);
+ if (ret != 0) {
+ return ret;
+ }
+
+ // Update flags: the previous latest is now just a plain version.
+ latest_ent.flags = rgw_bucket_dir_entry::FLAG_VER;
+ latest_ent.meta.mtime = set_mtime;
+ ret = latest_object->set_dir_entry_attrs(dpp, &latest_ent, &latest_attrs);
+ if (ret != 0) {
+ return ret;
+ }
+ }
+
+ // Get or create the link [latest], make it link to the current latest
+ // version.
+ ret =
+ ds3_obj_mark_latest(get_key().get_oid().c_str(), get_daos_bucket()->ds3b);
+ ldpp_dout(dpp, 20) << "DEBUG: ds3_obj_mark_latest ret=" << ret << dendl;
+ return ret;
+}
+
+// Atomic (single-object, non-multipart) writer. Copies the target object's
+// key and bucket so the writer owns an independent DaosObject handle for
+// the duration of the upload.
+DaosAtomicWriter::DaosAtomicWriter(
+ const DoutPrefixProvider* dpp, optional_yield y,
+ rgw::sal::Object* obj, DaosStore* _store,
+ const rgw_user& _owner, const rgw_placement_rule* _ptail_placement_rule,
+ uint64_t _olh_epoch, const std::string& _unique_tag)
+ : StoreWriter(dpp, y),
+ store(_store),
+ owner(_owner),
+ ptail_placement_rule(_ptail_placement_rule),
+ olh_epoch(_olh_epoch),
+ unique_tag(_unique_tag),
+ obj(_store, obj->get_key(), obj->get_bucket()) {}
+
+// Prepare the upload: creating (and opening) the backing DAOS object is
+// all the preparation required.
+int DaosAtomicWriter::prepare(optional_yield y) {
+ ldpp_dout(dpp, 20) << "DEBUG: prepare" << dendl;
+ return obj.create(dpp);
+}
+
+// TODO: Handle concurrent writes, a unique object id is a possible solution, or
+// use DAOS transactions
+// XXX: Do we need to accumulate writes as motr does?
+// TODO: Handle concurrent writes, a unique object id is a possible solution, or
+// use DAOS transactions
+// XXX: Do we need to accumulate writes as motr does?
+// Stream one chunk of upload data into the object at `offset`; an empty
+// chunk (the end-of-stream signal) is a no-op. Accumulates the running
+// total in total_data_size for complete().
+int DaosAtomicWriter::process(bufferlist&& data, uint64_t offset) {
+ ldpp_dout(dpp, 20) << "DEBUG: process" << dendl;
+ if (data.length() == 0) {
+ return 0;
+ }
+
+ int ret = 0;
+ if (!obj.is_open()) {
+ ret = obj.lookup(dpp);
+ if (ret != 0) {
+ return ret;
+ }
+ }
+
+ // XXX: Combine multiple streams into one as motr does
+ // Capture the length before the move, which empties `data`.
+ uint64_t data_size = data.length();
+ ret = obj.write(dpp, std::move(data), offset);
+ if (ret == 0) {
+ total_data_size += data_size;
+ }
+ return ret;
+}
+
+// Finalize the atomic upload: build the bucket dir entry (size, etag,
+// owner, mtime, optional object-lock retention), persist it together with
+// the request attrs, and on versioned buckets promote this version to
+// "latest". Fix: a failure from set_dir_entry_attrs() is now propagated
+// instead of being silently overwritten by mark_as_latest().
+int DaosAtomicWriter::complete(
+ size_t accounted_size, const std::string& etag, ceph::real_time* mtime,
+ ceph::real_time set_mtime, std::map<std::string, bufferlist>& attrs,
+ ceph::real_time delete_at, const char* if_match, const char* if_nomatch,
+ const std::string* user_data, rgw_zone_set* zones_trace, bool* canceled,
+ optional_yield y) {
+ ldpp_dout(dpp, 20) << "DEBUG: complete" << dendl;
+ rgw_bucket_dir_entry ent;
+ int ret;
+
+ // Set rgw_bucket_dir_entry. Some of the members of this structure may not
+ // apply to daos.
+ //
+ // Checkout AtomicObjectProcessor::complete() in rgw_putobj_processor.cc
+ // and RGWRados::Object::Write::write_meta() in rgw_rados.cc for what and
+ // how to set the dir entry. Only set the basic ones for POC, no ACLs and
+ // other attrs.
+ obj.get_key().get_index_key(&ent.key);
+ ent.meta.size = total_data_size;
+ ent.meta.accounted_size = accounted_size;
+ ent.meta.mtime =
+ real_clock::is_zero(set_mtime) ? ceph::real_clock::now() : set_mtime;
+ ent.meta.etag = etag;
+ ent.meta.owner = owner.to_str();
+ ent.meta.owner_display_name =
+ obj.get_bucket()->get_owner()->get_display_name();
+ bool is_versioned = obj.get_bucket()->versioned();
+ if (is_versioned)
+ ent.flags =
+ rgw_bucket_dir_entry::FLAG_VER | rgw_bucket_dir_entry::FLAG_CURRENT;
+ ldpp_dout(dpp, 20) << __func__ << ": key=" << obj.get_key().get_oid()
+ << " etag: " << etag << dendl;
+ if (user_data) ent.meta.user_data = *user_data;
+
+ // Apply the bucket's default object-lock retention when the request did
+ // not carry an explicit retention attribute.
+ RGWBucketInfo& info = obj.get_bucket()->get_info();
+ if (info.obj_lock_enabled() && info.obj_lock.has_rule()) {
+ auto iter = attrs.find(RGW_ATTR_OBJECT_RETENTION);
+ if (iter == attrs.end()) {
+ real_time lock_until_date =
+ info.obj_lock.get_lock_until_date(ent.meta.mtime);
+ string mode = info.obj_lock.get_mode();
+ RGWObjectRetention obj_retention(mode, lock_until_date);
+ bufferlist retention_bl;
+ obj_retention.encode(retention_bl);
+ attrs[RGW_ATTR_OBJECT_RETENTION] = retention_bl;
+ }
+ }
+
+ ret = obj.set_dir_entry_attrs(dpp, &ent, &attrs);
+ if (ret != 0) {
+ // Propagate the failure: previously this error was clobbered by
+ // mark_as_latest() whenever the bucket was versioned.
+ return ret;
+ }
+
+ if (is_versioned) {
+ ret = obj.mark_as_latest(dpp, set_mtime);
+ }
+
+ return ret;
+}
+
+// Abort the multipart upload by removing it (and its parts) from the
+// bucket's multipart index.
+int DaosMultipartUpload::abort(const DoutPrefixProvider* dpp,
+ CephContext* cct) {
+ // Remove upload from bucket multipart index
+ ldpp_dout(dpp, 20) << "DEBUG: abort" << dendl;
+ return ds3_upload_remove(bucket->get_name().c_str(), get_upload_id().c_str(),
+ store->ds3);
+}
+
+// The meta object for an upload lives in the multipart namespace under the
+// upload id; it carries the upload's dirent, attrs and placement info.
+std::unique_ptr<rgw::sal::Object> DaosMultipartUpload::get_meta_obj() {
+ return bucket->get_object(
+ rgw_obj_key(get_upload_id(), string(), RGW_OBJ_NS_MULTIPART));
+}
+
+// Start a multipart upload: generate a unique upload id (prefix + random
+// suffix, retried on collision) and record the upload's initial dirent,
+// attrs and placement in the bucket's multipart index.
+// Fix: strncpy() does not null-terminate when the source fills the buffer;
+// ui.key is now always terminated before use.
+int DaosMultipartUpload::init(const DoutPrefixProvider* dpp, optional_yield y,
+ ACLOwner& _owner,
+ rgw_placement_rule& dest_placement,
+ rgw::sal::Attrs& attrs) {
+ ldpp_dout(dpp, 20) << "DEBUG: init" << dendl;
+ int ret;
+ std::string oid = mp_obj.get_key();
+
+ // Create an initial entry in the bucket. The entry will be
+ // updated when multipart upload is completed, for example,
+ // size, etag etc.
+ bufferlist bl;
+ rgw_bucket_dir_entry ent;
+ ent.key.name = oid;
+ ent.meta.owner = owner.get_id().to_str();
+ ent.meta.category = RGWObjCategory::MultiMeta;
+ ent.meta.mtime = ceph::real_clock::now();
+
+ multipart_upload_info upload_info;
+ upload_info.dest_placement = dest_placement;
+
+ ent.encode(bl);
+ encode(attrs, bl);
+ encode(upload_info, bl);
+
+ struct ds3_multipart_upload_info ui;
+ std::strcpy(ui.upload_id, MULTIPART_UPLOAD_ID_PREFIX);
+ std::strncpy(ui.key, oid.c_str(), sizeof(ui.key));
+ // strncpy() leaves the buffer unterminated when oid's length is >=
+ // sizeof(ui.key); force termination so ui.key is always a valid C string.
+ ui.key[sizeof(ui.key) - 1] = '\0';
+ ui.encoded = bl.c_str();
+ ui.encoded_length = bl.length();
+ int prefix_length = strlen(ui.upload_id);
+
+ // Retry with a fresh random suffix until the id is unique in the index.
+ do {
+ gen_rand_alphanumeric(store->ctx(), ui.upload_id + prefix_length,
+ sizeof(ui.upload_id) - 1 - prefix_length);
+ mp_obj.init(oid, ui.upload_id);
+ ret = ds3_upload_init(&ui, bucket->get_name().c_str(), store->ds3);
+ } while (ret == -EEXIST);
+
+ if (ret != 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to create multipart upload dir ("
+ << bucket->get_name() << "/" << get_upload_id()
+ << "): ret=" << ret << dendl;
+ }
+ return ret;
+}
+
+// List up to `num_parts` parts of this upload starting after `marker`,
+// repopulating the `parts` map. -ENOENT from the index is mapped to
+// -ERR_NO_SUCH_UPLOAD. `next_marker`/`truncated` support paging.
+// `assume_unsorted` is ignored by this backend.
+int DaosMultipartUpload::list_parts(const DoutPrefixProvider* dpp,
+ CephContext* cct, int num_parts, int marker,
+ int* next_marker, bool* truncated,
+ bool assume_unsorted) {
+ ldpp_dout(dpp, 20) << "DEBUG: list_parts" << dendl;
+ // Init needed structures: one DS3_MAX_ENCODED_LEN buffer per requested
+ // part to receive the encoded RGWUploadPartInfo blobs.
+ vector<struct ds3_multipart_part_info> multipart_part_infos(num_parts);
+ uint32_t npart = multipart_part_infos.size();
+ vector<vector<uint8_t>> values(npart, vector<uint8_t>(DS3_MAX_ENCODED_LEN));
+ for (uint32_t i = 0; i < npart; i++) {
+ multipart_part_infos[i].encoded = values[i].data();
+ multipart_part_infos[i].encoded_length = values[i].size();
+ }
+
+ uint32_t daos_marker = marker;
+ // npart is in/out: on return it holds the number of parts filled in.
+ int ret = ds3_upload_list_parts(
+ bucket->get_name().c_str(), get_upload_id().c_str(), &npart,
+ multipart_part_infos.data(), &daos_marker, truncated, store->ds3);
+
+ if (ret != 0) {
+ if (ret == -ENOENT) {
+ ret = -ERR_NO_SUCH_UPLOAD;
+ }
+ return ret;
+ }
+
+ multipart_part_infos.resize(npart);
+ values.resize(npart);
+ parts.clear();
+
+ for (auto const& pi : multipart_part_infos) {
+ bufferlist bl;
+ bl.append(reinterpret_cast<char*>(pi.encoded), pi.encoded_length);
+
+ std::unique_ptr<DaosMultipartPart> part =
+ std::make_unique<DaosMultipartPart>();
+ auto iter = bl.cbegin();
+ decode(part->info, iter);
+ parts[pi.part_num] = std::move(part);
+ }
+
+ if (next_marker) {
+ *next_marker = daos_marker;
+ }
+ return ret;
+}
+
+// Heavily copied from rgw_sal_rados.cc
+int DaosMultipartUpload::complete(
+ const DoutPrefixProvider* dpp, optional_yield y, CephContext* cct,
+ map<int, string>& part_etags, list<rgw_obj_index_key>& remove_objs,
+ uint64_t& accounted_size, bool& compressed, RGWCompressionInfo& cs_info,
+ off_t& off, std::string& tag, ACLOwner& owner, uint64_t olh_epoch,
+ rgw::sal::Object* target_obj) {
+ ldpp_dout(dpp, 20) << "DEBUG: complete" << dendl;
+ char final_etag[CEPH_CRYPTO_MD5_DIGESTSIZE];
+ char final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16];
+ std::string etag;
+ bufferlist etag_bl;
+ MD5 hash;
+ // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
+ hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+ bool truncated;
+ int ret;
+
+ ldpp_dout(dpp, 20) << "DaosMultipartUpload::complete(): enter" << dendl;
+ int total_parts = 0;
+ int handled_parts = 0;
+ int max_parts = 1000;
+ int marker = 0;
+ uint64_t min_part_size = cct->_conf->rgw_multipart_min_part_size;
+ auto etags_iter = part_etags.begin();
+ rgw::sal::Attrs attrs = target_obj->get_attrs();
+
+ do {
+ ldpp_dout(dpp, 20) << "DaosMultipartUpload::complete(): list_parts()"
+ << dendl;
+ ret = list_parts(dpp, cct, max_parts, marker, &marker, &truncated);
+ if (ret == -ENOENT) {
+ ret = -ERR_NO_SUCH_UPLOAD;
+ }
+ if (ret != 0) return ret;
+
+ total_parts += parts.size();
+ if (!truncated && total_parts != (int)part_etags.size()) {
+ ldpp_dout(dpp, 0) << "NOTICE: total parts mismatch: have: " << total_parts
+ << " expected: " << part_etags.size() << dendl;
+ ret = -ERR_INVALID_PART;
+ return ret;
+ }
+ ldpp_dout(dpp, 20) << "DaosMultipartUpload::complete(): parts.size()="
+ << parts.size() << dendl;
+
+ for (auto obj_iter = parts.begin();
+ etags_iter != part_etags.end() && obj_iter != parts.end();
+ ++etags_iter, ++obj_iter, ++handled_parts) {
+ DaosMultipartPart* part =
+ dynamic_cast<rgw::sal::DaosMultipartPart*>(obj_iter->second.get());
+ uint64_t part_size = part->get_size();
+ ldpp_dout(dpp, 20) << "DaosMultipartUpload::complete(): part_size="
+ << part_size << dendl;
+ if (handled_parts < (int)part_etags.size() - 1 &&
+ part_size < min_part_size) {
+ ret = -ERR_TOO_SMALL;
+ return ret;
+ }
+
+ char petag[CEPH_CRYPTO_MD5_DIGESTSIZE];
+ if (etags_iter->first != (int)obj_iter->first) {
+ ldpp_dout(dpp, 0) << "NOTICE: parts num mismatch: next requested: "
+ << etags_iter->first
+ << " next uploaded: " << obj_iter->first << dendl;
+ ret = -ERR_INVALID_PART;
+ return ret;
+ }
+ string part_etag = rgw_string_unquote(etags_iter->second);
+ if (part_etag.compare(part->get_etag()) != 0) {
+ ldpp_dout(dpp, 0) << "NOTICE: etag mismatch: part: "
+ << etags_iter->first
+ << " etag: " << etags_iter->second << dendl;
+ ret = -ERR_INVALID_PART;
+ return ret;
+ }
+
+ hex_to_buf(part->get_etag().c_str(), petag, CEPH_CRYPTO_MD5_DIGESTSIZE);
+ hash.Update((const unsigned char*)petag, sizeof(petag));
+ ldpp_dout(dpp, 20) << "DaosMultipartUpload::complete(): calc etag "
+ << dendl;
+
+ RGWUploadPartInfo& obj_part = part->info;
+ string oid = mp_obj.get_part(obj_part.num);
+ rgw_obj src_obj;
+ src_obj.init_ns(bucket->get_key(), oid, RGW_OBJ_NS_MULTIPART);
+
+ bool part_compressed = (obj_part.cs_info.compression_type != "none");
+ if ((handled_parts > 0) &&
+ ((part_compressed != compressed) ||
+ (cs_info.compression_type != obj_part.cs_info.compression_type))) {
+ ldpp_dout(dpp, 0)
+ << "ERROR: compression type was changed during multipart upload ("
+ << cs_info.compression_type << ">>"
+ << obj_part.cs_info.compression_type << ")" << dendl;
+ ret = -ERR_INVALID_PART;
+ return ret;
+ }
+
+ ldpp_dout(dpp, 20) << "DaosMultipartUpload::complete(): part compression"
+ << dendl;
+ if (part_compressed) {
+ int64_t new_ofs; // offset in compression data for new part
+ if (cs_info.blocks.size() > 0)
+ new_ofs = cs_info.blocks.back().new_ofs + cs_info.blocks.back().len;
+ else
+ new_ofs = 0;
+ for (const auto& block : obj_part.cs_info.blocks) {
+ compression_block cb;
+ cb.old_ofs = block.old_ofs + cs_info.orig_size;
+ cb.new_ofs = new_ofs;
+ cb.len = block.len;
+ cs_info.blocks.push_back(cb);
+ new_ofs = cb.new_ofs + cb.len;
+ }
+ if (!compressed)
+ cs_info.compression_type = obj_part.cs_info.compression_type;
+ cs_info.orig_size += obj_part.cs_info.orig_size;
+ compressed = true;
+ }
+
+ // We may not need to do the following as remove_objs are those
+ // don't show when listing a bucket. As we store in-progress uploaded
+ // object's metadata in a separate index, they are not shown when
+ // listing a bucket.
+ rgw_obj_index_key remove_key;
+ src_obj.key.get_index_key(&remove_key);
+
+ remove_objs.push_back(remove_key);
+
+ off += obj_part.size;
+ accounted_size += obj_part.accounted_size;
+ ldpp_dout(dpp, 20) << "DaosMultipartUpload::complete(): off=" << off
+ << ", accounted_size = " << accounted_size << dendl;
+ }
+ } while (truncated);
+ hash.Final((unsigned char*)final_etag);
+
+ buf_to_hex((unsigned char*)final_etag, sizeof(final_etag), final_etag_str);
+ snprintf(&final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2],
+ sizeof(final_etag_str) - CEPH_CRYPTO_MD5_DIGESTSIZE * 2, "-%lld",
+ (long long)part_etags.size());
+ etag = final_etag_str;
+ ldpp_dout(dpp, 10) << "calculated etag: " << etag << dendl;
+
+ etag_bl.append(etag);
+
+ attrs[RGW_ATTR_ETAG] = etag_bl;
+
+ if (compressed) {
+ // write compression attribute to full object
+ bufferlist tmp;
+ encode(cs_info, tmp);
+ attrs[RGW_ATTR_COMPRESSION] = tmp;
+ }
+
+ // Different from rgw_sal_rados.cc starts here
+ // Read the object's multipart info
+ bufferlist bl;
+ uint64_t size = DS3_MAX_ENCODED_LEN;
+ struct ds3_multipart_upload_info ui = {
+ .encoded = bl.append_hole(size).c_str(), .encoded_length = size};
+ ret = ds3_upload_get_info(&ui, bucket->get_name().c_str(),
+ get_upload_id().c_str(), store->ds3);
+ ldpp_dout(dpp, 20) << "DEBUG: ds3_upload_get_info entry="
+ << bucket->get_name() << "/" << get_upload_id() << dendl;
+ if (ret != 0) {
+ if (ret == -ENOENT) {
+ ret = -ERR_NO_SUCH_UPLOAD;
+ }
+ return ret;
+ }
+
+ rgw_bucket_dir_entry ent;
+ auto iter = bl.cbegin();
+ ent.decode(iter);
+
+ // Update entry data and name
+ target_obj->get_key().get_index_key(&ent.key);
+ ent.meta.size = off;
+ ent.meta.accounted_size = accounted_size;
+ ldpp_dout(dpp, 20) << "DaosMultipartUpload::complete(): obj size="
+ << ent.meta.size
+ << " obj accounted size=" << ent.meta.accounted_size
+ << dendl;
+ ent.meta.category = RGWObjCategory::Main;
+ ent.meta.mtime = ceph::real_clock::now();
+ bool is_versioned = target_obj->get_bucket()->versioned();
+ if (is_versioned)
+ ent.flags =
+ rgw_bucket_dir_entry::FLAG_VER | rgw_bucket_dir_entry::FLAG_CURRENT;
+ ent.meta.etag = etag;
+
+ // Open object
+ DaosObject* obj = static_cast<DaosObject*>(target_obj);
+ ret = obj->create(dpp);
+ if (ret != 0) {
+ return ret;
+ }
+
+ // Copy data from parts to object
+ uint64_t write_off = 0;
+ for (auto const& [part_num, part] : get_parts()) {
+ ds3_part_t* ds3p;
+ ret = ds3_part_open(get_bucket_name().c_str(), get_upload_id().c_str(),
+ part_num, false, &ds3p, store->ds3);
+ if (ret != 0) {
+ return ret;
+ }
+
+ // Reserve buffers and read
+ uint64_t size = part->get_size();
+ bufferlist bl;
+ ret = ds3_part_read(bl.append_hole(size).c_str(), 0, &size, ds3p,
+ store->ds3, nullptr);
+ if (ret != 0) {
+ ds3_part_close(ds3p);
+ return ret;
+ }
+
+ ldpp_dout(dpp, 20) << "DaosMultipartUpload::complete(): part " << part_num
+ << " size is " << size << dendl;
+
+ // write to obj
+ obj->write(dpp, std::move(bl), write_off);
+ ds3_part_close(ds3p);
+ write_off += part->get_size();
+ }
+
+ // Set attributes
+ ret = obj->set_dir_entry_attrs(dpp, &ent, &attrs);
+
+ if (is_versioned) {
+ ret = obj->mark_as_latest(dpp, ent.meta.mtime);
+ if (ret != 0) {
+ return ret;
+ }
+ }
+
+ // Remove upload from bucket multipart index
+ ret = ds3_upload_remove(get_bucket_name().c_str(), get_upload_id().c_str(),
+ store->ds3);
+ return ret;
+}
+
+// Return the upload's placement rule and/or attributes. The placement is
+// cached in `placement` after the first read; either out-parameter may be
+// null. -ENOENT from the index maps to -ERR_NO_SUCH_UPLOAD.
+// Fix: the debug log used to stream the `attrs` out-pointer, printing an
+// address instead of anything about the decoded attributes.
+int DaosMultipartUpload::get_info(const DoutPrefixProvider* dpp,
+ optional_yield y, rgw_placement_rule** rule,
+ rgw::sal::Attrs* attrs) {
+ ldpp_dout(dpp, 20) << "DaosMultipartUpload::get_info(): enter" << dendl;
+ if (!rule && !attrs) {
+ return 0;
+ }
+
+ if (rule) {
+ if (!placement.empty()) {
+ *rule = &placement;
+ if (!attrs) {
+ // Don't need attrs, done
+ return 0;
+ }
+ } else {
+ *rule = nullptr;
+ }
+ }
+
+ // Read the multipart upload dirent from index
+ bufferlist bl;
+ uint64_t size = DS3_MAX_ENCODED_LEN;
+ struct ds3_multipart_upload_info ui = {
+ .encoded = bl.append_hole(size).c_str(), .encoded_length = size};
+ int ret = ds3_upload_get_info(&ui, bucket->get_name().c_str(),
+ get_upload_id().c_str(), store->ds3);
+
+ if (ret != 0) {
+ if (ret == -ENOENT) {
+ ret = -ERR_NO_SUCH_UPLOAD;
+ }
+ return ret;
+ }
+
+ multipart_upload_info upload_info;
+ rgw_bucket_dir_entry ent;
+ Attrs decoded_attrs;
+ auto iter = bl.cbegin();
+ ent.decode(iter);
+ decode(decoded_attrs, iter);
+ ldpp_dout(dpp, 20) << "DEBUG: decoded_attrs.size()=" << decoded_attrs.size()
+ << dendl;
+
+ if (attrs) {
+ *attrs = decoded_attrs;
+ if (!rule || *rule != nullptr) {
+ // placement was cached; don't actually read
+ return 0;
+ }
+ }
+
+ // Now decode the placement rule
+ decode(upload_info, iter);
+ placement = upload_info.dest_placement;
+ *rule = &placement;
+
+ return 0;
+}
+
+// Create a writer for one part of this upload.
+// Fix: the log message referenced `_head_obj`, an identifier that no longer
+// exists in this scope after the signature was refactored to take a raw
+// rgw::sal::Object pointer; log the head object's key instead.
+std::unique_ptr<Writer> DaosMultipartUpload::get_writer(
+ const DoutPrefixProvider* dpp, optional_yield y,
+ rgw::sal::Object* obj, const rgw_user& owner,
+ const rgw_placement_rule* ptail_placement_rule, uint64_t part_num,
+ const std::string& part_num_str) {
+ ldpp_dout(dpp, 20) << "DaosMultipartUpload::get_writer(): enter part="
+ << part_num << " head_obj="
+ << (obj ? obj->get_key().get_oid() : std::string("null"))
+ << dendl;
+ return std::make_unique<DaosMultipartWriter>(
+ dpp, y, this, obj, store, owner, ptail_placement_rule,
+ part_num, part_num_str);
+}
+
+// RAII cleanup: release the DAOS part handle opened in prepare(), if any.
+DaosMultipartWriter::~DaosMultipartWriter() {
+ if (is_open()) ds3_part_close(ds3p);
+}
+
+// Open (create) the DAOS part object for this part number. -ENOENT means
+// the upload itself no longer exists and maps to -ERR_NO_SUCH_UPLOAD.
+int DaosMultipartWriter::prepare(optional_yield y) {
+ ldpp_dout(dpp, 20) << "DaosMultipartWriter::prepare(): enter part="
+ << part_num_str << dendl;
+ int ret = ds3_part_open(get_bucket_name().c_str(), upload_id.c_str(),
+ part_num, true, &ds3p, store->ds3);
+ if (ret == -ENOENT) {
+ ret = -ERR_NO_SUCH_UPLOAD;
+ }
+ return ret;
+}
+
+// Convenience accessor: the bucket name comes from the owning upload.
+const std::string& DaosMultipartWriter::get_bucket_name() {
+ return static_cast<DaosMultipartUpload*>(upload)->get_bucket_name();
+}
+
+// Stream one chunk of part data at `offset`; an empty chunk (end-of-stream
+// signal) is a no-op. Accumulates actual_part_size for complete().
+int DaosMultipartWriter::process(bufferlist&& data, uint64_t offset) {
+ ldpp_dout(dpp, 20) << "DaosMultipartWriter::process(): enter part="
+ << part_num_str << " offset=" << offset << dendl;
+ if (data.length() == 0) {
+ return 0;
+ }
+
+ uint64_t size = data.length();
+ int ret =
+ ds3_part_write(data.c_str(), offset, &size, ds3p, store->ds3, nullptr);
+ if (ret == 0) {
+ // XXX: Combine multiple streams into one as motr does
+ // `size` was updated by ds3_part_write to the bytes actually written.
+ actual_part_size += size;
+ } else {
+ ldpp_dout(dpp, 0) << "ERROR: failed to write into part ("
+ << get_bucket_name() << ", " << upload_id << ", "
+ << part_num << "): ret=" << ret << dendl;
+ }
+ return ret;
+}
+
+// Finalize one part: record its RGWUploadPartInfo (etag, sizes, mtime,
+// compression info) plus the request attrs in the part index.
+// Fix: the ENOENT translation compared against the positive errno value;
+// the ds3 layer returns negative errnos (see prepare()/list_parts()), so
+// the branch could never fire. Compare against -ENOENT.
+int DaosMultipartWriter::complete(
+ size_t accounted_size, const std::string& etag, ceph::real_time* mtime,
+ ceph::real_time set_mtime, std::map<std::string, bufferlist>& attrs,
+ ceph::real_time delete_at, const char* if_match, const char* if_nomatch,
+ const std::string* user_data, rgw_zone_set* zones_trace, bool* canceled,
+ optional_yield y) {
+ ldpp_dout(dpp, 20) << "DaosMultipartWriter::complete(): enter part="
+ << part_num_str << dendl;
+
+ // Add an entry into part index
+ bufferlist bl;
+ RGWUploadPartInfo info;
+ info.num = part_num;
+ info.etag = etag;
+ info.size = actual_part_size;
+ info.accounted_size = accounted_size;
+ info.modified = real_clock::now();
+
+ bool compressed;
+ int ret = rgw_compression_info_from_attrset(attrs, compressed, info.cs_info);
+ ldpp_dout(dpp, 20) << "DaosMultipartWriter::complete(): compression ret="
+ << ret << dendl;
+ if (ret != 0) {
+ ldpp_dout(dpp, 1) << "cannot get compression info" << dendl;
+ return ret;
+ }
+ encode(info, bl);
+ encode(attrs, bl);
+ ldpp_dout(dpp, 20) << "DaosMultipartWriter::complete(): entry size"
+ << bl.length() << dendl;
+
+ struct ds3_multipart_part_info part_info = {.part_num = part_num,
+ .encoded = bl.c_str(),
+ .encoded_length = bl.length()};
+
+ ret = ds3_part_set_info(&part_info, ds3p, store->ds3, nullptr);
+
+ if (ret != 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to set part info (" << get_bucket_name()
+ << ", " << upload_id << ", " << part_num
+ << "): ret=" << ret << dendl;
+ if (ret == -ENOENT) {
+ ret = -ERR_NO_SUCH_UPLOAD;
+ }
+ }
+
+ return ret;
+}
+
+// IAM roles are not supported by the DAOS backend; all role factories
+// return an empty pointer and the list operations report not-implemented.
+std::unique_ptr<RGWRole> DaosStore::get_role(
+ std::string name, std::string tenant, std::string path,
+ std::string trust_policy, std::string max_session_duration_str,
+ std::multimap<std::string, std::string> tags) {
+ RGWRole* p = nullptr;
+ return std::unique_ptr<RGWRole>(p);
+}
+
+std::unique_ptr<RGWRole> DaosStore::get_role(const RGWRoleInfo& info) {
+ RGWRole* p = nullptr;
+ return std::unique_ptr<RGWRole>(p);
+}
+
+std::unique_ptr<RGWRole> DaosStore::get_role(std::string id) {
+ RGWRole* p = nullptr;
+ return std::unique_ptr<RGWRole>(p);
+}
+
+int DaosStore::get_roles(const DoutPrefixProvider* dpp, optional_yield y,
+ const std::string& path_prefix,
+ const std::string& tenant,
+ vector<std::unique_ptr<RGWRole>>& roles) {
+ return DAOS_NOT_IMPLEMENTED_LOG(dpp);
+}
+
+// OIDC providers are likewise unsupported.
+std::unique_ptr<RGWOIDCProvider> DaosStore::get_oidc_provider() {
+ RGWOIDCProvider* p = nullptr;
+ return std::unique_ptr<RGWOIDCProvider>(p);
+}
+
+int DaosStore::get_oidc_providers(
+ const DoutPrefixProvider* dpp, const std::string& tenant,
+ vector<std::unique_ptr<RGWOIDCProvider>>& providers) {
+ return DAOS_NOT_IMPLEMENTED_LOG(dpp);
+}
+
+// Factory for a multipart upload handle scoped to this bucket.
+std::unique_ptr<MultipartUpload> DaosBucket::get_multipart_upload(
+ const std::string& oid, std::optional<std::string> upload_id,
+ ACLOwner owner, ceph::real_time mtime) {
+ return std::make_unique<DaosMultipartUpload>(store, this, oid, upload_id,
+ owner, mtime);
+}
+
+// Append writes (Swift append semantics) are not supported; callers must
+// handle the null return.
+std::unique_ptr<Writer> DaosStore::get_append_writer(
+ const DoutPrefixProvider* dpp, optional_yield y,
+ rgw::sal::Object* obj, const rgw_user& owner,
+ const rgw_placement_rule* ptail_placement_rule,
+ const std::string& unique_tag, uint64_t position,
+ uint64_t* cur_accounted_size) {
+ DAOS_NOT_IMPLEMENTED_LOG(dpp);
+ return nullptr;
+}
+
+// Factory for the whole-object (atomic) writer.
+std::unique_ptr<Writer> DaosStore::get_atomic_writer(
+ const DoutPrefixProvider* dpp, optional_yield y,
+ rgw::sal::Object* obj, const rgw_user& owner,
+ const rgw_placement_rule* ptail_placement_rule, uint64_t olh_epoch,
+ const std::string& unique_tag) {
+ ldpp_dout(dpp, 20) << "get_atomic_writer" << dendl;
+ return std::make_unique<DaosAtomicWriter>(dpp, y, obj, this,
+ owner, ptail_placement_rule,
+ olh_epoch, unique_tag);
+}
+
+// Placement queries delegate to the zone parameters.
+const std::string& DaosStore::get_compression_type(
+ const rgw_placement_rule& rule) {
+ return zone.zone_params->get_compression_type(rule);
+}
+
+bool DaosStore::valid_placement(const rgw_placement_rule& rule) {
+ return zone.zone_params->valid_placement(rule);
+}
+
+// Build a user handle; no I/O happens until load_user()/store_user().
+std::unique_ptr<User> DaosStore::get_user(const rgw_user& u) {
+ ldout(cctx, 20) << "DEBUG: bucket's user: " << u.to_str() << dendl;
+ return std::make_unique<DaosUser>(this, u);
+}
+
+// Look up a user by S3 access key and return a DaosUser handle.
+// Fixes: the post-`new` null check was dead code (operator new throws
+// std::bad_alloc, it never returns null) — replaced raw new with
+// make_unique; removed the pointless bufferlist reference alias.
+int DaosStore::get_user_by_access_key(const DoutPrefixProvider* dpp,
+ const std::string& key, optional_yield y,
+ std::unique_ptr<User>* user) {
+ // Initialize ds3_user_info with a buffer sized for the encoded user blob
+ bufferlist bl;
+ uint64_t size = DS3_MAX_ENCODED_LEN;
+ struct ds3_user_info user_info = {.encoded = bl.append_hole(size).c_str(),
+ .encoded_length = size};
+
+ int ret = ds3_user_get_by_key(key.c_str(), &user_info, ds3, nullptr);
+
+ if (ret != 0) {
+ ldpp_dout(dpp, 0) << "Error: ds3_user_get_by_key failed, key=" << key
+ << " ret=" << ret << dendl;
+ return ret;
+ }
+
+ // Decode the stored user info and hand back an owning handle.
+ DaosUserInfo duinfo;
+ auto iter = bl.cbegin();
+ duinfo.decode(iter);
+
+ *user = std::make_unique<DaosUser>(this, duinfo.info);
+ return 0;
+}
+
+// Look up a user by e-mail address and return a DaosUser handle.
+// Fixes: same as get_user_by_access_key — dead null check after `new`
+// removed in favor of make_unique; bufferlist alias dropped.
+int DaosStore::get_user_by_email(const DoutPrefixProvider* dpp,
+ const std::string& email, optional_yield y,
+ std::unique_ptr<User>* user) {
+ // Initialize ds3_user_info with a buffer sized for the encoded user blob
+ bufferlist bl;
+ uint64_t size = DS3_MAX_ENCODED_LEN;
+ struct ds3_user_info user_info = {.encoded = bl.append_hole(size).c_str(),
+ .encoded_length = size};
+
+ int ret = ds3_user_get_by_email(email.c_str(), &user_info, ds3, nullptr);
+
+ if (ret != 0) {
+ ldpp_dout(dpp, 0) << "Error: ds3_user_get_by_email failed, email=" << email
+ << " ret=" << ret << dendl;
+ return ret;
+ }
+
+ // Decode the stored user info and hand back an owning handle.
+ DaosUserInfo duinfo;
+ auto iter = bl.cbegin();
+ duinfo.decode(iter);
+
+ *user = std::make_unique<DaosUser>(this, duinfo.info);
+ return 0;
+}
+
+int DaosStore::get_user_by_swift(const DoutPrefixProvider* dpp,
+ const std::string& user_str, optional_yield y,
+ std::unique_ptr<User>* user) {
+ /* Swift keys and subusers are not supported for now */
+ return DAOS_NOT_IMPLEMENTED_LOG(dpp);
+}
+
+// Build a bucket-less object handle for key `k`.
+std::unique_ptr<Object> DaosStore::get_object(const rgw_obj_key& k) {
+ return std::make_unique<DaosObject>(this, k);
+}
+
+// Stream helper used by the get_bucket debug logs; tolerates null users.
+inline std::ostream& operator<<(std::ostream& out, const rgw_user* u) {
+ std::string s;
+ if (u != nullptr)
+ u->to_str(s);
+ else
+ s = "(nullptr)";
+ return out << s;
+}
+
+// Load bucket `b` for user `u`, returning an owning handle on success.
+// Fix: replaced the raw new/delete ownership dance with std::unique_ptr so
+// the bucket cannot leak if load_bucket() fails (or throws).
+int DaosStore::get_bucket(const DoutPrefixProvider* dpp, User* u,
+ const rgw_bucket& b, std::unique_ptr<Bucket>* bucket,
+ optional_yield y) {
+ ldpp_dout(dpp, 20) << "DEBUG: get_bucket1: User: " << u << dendl;
+
+ auto bp = std::make_unique<DaosBucket>(this, b, u);
+ int ret = bp->load_bucket(dpp, y);
+ if (ret != 0) {
+ return ret;
+ }
+
+ *bucket = std::move(bp);
+ return 0;
+}
+
+// Overload taking pre-fetched bucket info: wraps it without any I/O.
+int DaosStore::get_bucket(User* u, const RGWBucketInfo& i,
+ std::unique_ptr<Bucket>* bucket) {
+ DaosBucket* bp;
+
+ bp = new DaosBucket(this, i, u);
+ /* Don't need to fetch the bucket info, use the provided one */
+
+ bucket->reset(bp);
+ return 0;
+}
+
+// Overload taking tenant + name: builds the rgw_bucket key and delegates
+// to the loading overload above.
+int DaosStore::get_bucket(const DoutPrefixProvider* dpp, User* u,
+ const std::string& tenant, const std::string& name,
+ std::unique_ptr<Bucket>* bucket, optional_yield y) {
+ ldpp_dout(dpp, 20) << "get_bucket" << dendl;
+ rgw_bucket b;
+
+ b.tenant = tenant;
+ b.name = name;
+
+ return get_bucket(dpp, u, b, bucket, y);
+}
+
+// Single-site deployment: this store is always the metadata master, so
+// nothing is ever forwarded.
+bool DaosStore::is_meta_master() { return true; }
+
+int DaosStore::forward_request_to_master(const DoutPrefixProvider* dpp,
+ User* user, obj_version* objv,
+ bufferlist& in_data, JSONParser* jp,
+ req_info& info, optional_yield y) {
+ return DAOS_NOT_IMPLEMENTED_LOG(dpp);
+}
+
+int DaosStore::forward_iam_request_to_master(const DoutPrefixProvider* dpp,
+ const RGWAccessKey& key,
+ obj_version* objv,
+ bufferlist& in_data,
+ RGWXMLDecoder::XMLParser* parser,
+ req_info& info, optional_yield y) {
+ return DAOS_NOT_IMPLEMENTED_LOG(dpp);
+}
+
+// Zone-unique id generation is not implemented; callers receive "".
+std::string DaosStore::zone_unique_id(uint64_t unique_num) { return ""; }
+
+std::string DaosStore::zone_unique_trans_id(const uint64_t unique_num) {
+ return "";
+}
+
+int DaosStore::cluster_stat(RGWClusterStat& stats) {
+ return DAOS_NOT_IMPLEMENTED_LOG(nullptr);
+}
+
+// Lifecycle processing is not implemented; callers receive an empty
+// pointer. (Idiom fix: return nullptr rather than the literal 0 for a
+// std::unique_ptr.)
+std::unique_ptr<Lifecycle> DaosStore::get_lifecycle(void) {
+ DAOS_NOT_IMPLEMENTED_LOG(nullptr);
+ return nullptr;
+}
+
+// Async completions are not implemented; callers receive an empty pointer.
+std::unique_ptr<Completions> DaosStore::get_completions(void) {
+ DAOS_NOT_IMPLEMENTED_LOG(nullptr);
+ return nullptr;
+}
+
+// Bucket-notification factories; DaosNotification is a stub, so the extra
+// identifiers (bucket, user, tenant, request id) are intentionally unused.
+std::unique_ptr<Notification> DaosStore::get_notification(
+ rgw::sal::Object* obj, rgw::sal::Object* src_obj, struct req_state* s,
+ rgw::notify::EventType event_type, const std::string* object_name) {
+ return std::make_unique<DaosNotification>(obj, src_obj, event_type);
+}
+
+std::unique_ptr<Notification> DaosStore::get_notification(
+ const DoutPrefixProvider* dpp, Object* obj, Object* src_obj,
+ rgw::notify::EventType event_type, rgw::sal::Bucket* _bucket,
+ std::string& _user_id, std::string& _user_tenant, std::string& _req_id,
+ optional_yield y) {
+ ldpp_dout(dpp, 20) << "get_notification" << dendl;
+ return std::make_unique<DaosNotification>(obj, src_obj, event_type);
+}
+
+// Usage logging is a no-op (returns success so request handling proceeds).
+int DaosStore::log_usage(const DoutPrefixProvider* dpp,
+ map<rgw_user_bucket, RGWUsageBatch>& usage_info) {
+ DAOS_NOT_IMPLEMENTED_LOG(dpp);
+ return 0;
+}
+
+int DaosStore::log_op(const DoutPrefixProvider* dpp, string& oid,
+ bufferlist& bl) {
+ return DAOS_NOT_IMPLEMENTED_LOG(dpp);
+}
+
+int DaosStore::register_to_service_map(const DoutPrefixProvider* dpp,
+ const string& daemon_type,
+ const map<string, string>& meta) {
+ return DAOS_NOT_IMPLEMENTED_LOG(dpp);
+}
+
+// Quotas and rate limits are left untouched (no enforcement).
+void DaosStore::get_quota(RGWQuota& quota) {
+ // XXX: Not handled for the first pass
+ return;
+}
+
+void DaosStore::get_ratelimit(RGWRateLimitInfo& bucket_ratelimit,
+ RGWRateLimitInfo& user_ratelimit,
+ RGWRateLimitInfo& anon_ratelimit) {
+ return;
+}
+
+int DaosStore::set_buckets_enabled(const DoutPrefixProvider* dpp,
+ std::vector<rgw_bucket>& buckets,
+ bool enabled) {
+ return DAOS_NOT_IMPLEMENTED_LOG(dpp);
+}
+
+int DaosStore::get_sync_policy_handler(const DoutPrefixProvider* dpp,
+ std::optional<rgw_zone_id> zone,
+ std::optional<rgw_bucket> bucket,
+ RGWBucketSyncPolicyHandlerRef* phandler,
+ optional_yield y) {
+ return DAOS_NOT_IMPLEMENTED_LOG(dpp);
+}
+
+// Multi-site data sync is not supported; callers receive a null manager.
+// (Idiom fix: return nullptr rather than the literal 0 for a pointer.)
+RGWDataSyncStatusManager* DaosStore::get_data_sync_manager(
+ const rgw_zone_id& source_zone) {
+ DAOS_NOT_IMPLEMENTED_LOG(nullptr);
+ return nullptr;
+}
+
+// Usage accounting and metadata-key enumeration are not implemented in the
+// DAOS backend; each entry point reports not-implemented (or a benign
+// empty result for the void/string variants).
+int DaosStore::read_all_usage(
+ const DoutPrefixProvider* dpp, uint64_t start_epoch, uint64_t end_epoch,
+ uint32_t max_entries, bool* is_truncated, RGWUsageIter& usage_iter,
+ map<rgw_user_bucket, rgw_usage_log_entry>& usage) {
+ return DAOS_NOT_IMPLEMENTED_LOG(dpp);
+}
+
+int DaosStore::trim_all_usage(const DoutPrefixProvider* dpp,
+ uint64_t start_epoch, uint64_t end_epoch) {
+ return DAOS_NOT_IMPLEMENTED_LOG(dpp);
+}
+
+int DaosStore::get_config_key_val(string name, bufferlist* bl) {
+ return DAOS_NOT_IMPLEMENTED_LOG(nullptr);
+}
+
+int DaosStore::meta_list_keys_init(const DoutPrefixProvider* dpp,
+ const string& section, const string& marker,
+ void** phandle) {
+ return DAOS_NOT_IMPLEMENTED_LOG(dpp);
+}
+
+int DaosStore::meta_list_keys_next(const DoutPrefixProvider* dpp, void* handle,
+ int max, list<string>& keys,
+ bool* truncated) {
+ return DAOS_NOT_IMPLEMENTED_LOG(dpp);
+}
+
+void DaosStore::meta_list_keys_complete(void* handle) { return; }
+
+std::string DaosStore::meta_get_marker(void* handle) { return ""; }
+
+int DaosStore::meta_remove(const DoutPrefixProvider* dpp, string& metadata_key,
+ optional_yield y) {
+ return DAOS_NOT_IMPLEMENTED_LOG(dpp);
+}
+
+std::string DaosStore::get_cluster_id(const DoutPrefixProvider* dpp,
+ optional_yield y) {
+ DAOS_NOT_IMPLEMENTED_LOG(dpp);
+ return "";
+}
+
+} // namespace rgw::sal
+
+extern "C" {
+
+// Factory entry point used by the RGW SAL backend loader to instantiate the
+// DAOS store. Returns an owning raw pointer; the loader manages lifetime.
+void* newDaosStore(CephContext* cct) {
+ return new rgw::sal::DaosStore(cct);
+}
+}
diff --git a/src/rgw/rgw_sal_daos.h b/src/rgw/rgw_sal_daos.h
new file mode 100644
index 000000000..64bf49c7c
--- /dev/null
+++ b/src/rgw/rgw_sal_daos.h
@@ -0,0 +1,1054 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=2 sw=2 expandtab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * SAL implementation for the CORTX Daos backend
+ *
+ * Copyright (C) 2022 Seagate Technology LLC and/or its Affiliates
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <daos.h>
+#include <daos_s3.h>
+#include <uuid/uuid.h>
+
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "rgw_multi.h"
+#include "rgw_notify.h"
+#include "rgw_oidc_provider.h"
+#include "rgw_putobj_processor.h"
+#include "rgw_rados.h"
+#include "rgw_role.h"
+#include "rgw_sal_store.h"
+
+// Best-effort debugger detection, compiled in only for DEBUG builds
+// (otherwise always returns false). Linux-specific: reads
+// /proc/self/status and reports whether the TracerPid field holds a
+// non-zero pid, i.e. some process is ptrace-attached to us.
+inline bool IsDebuggerAttached() {
+#ifdef DEBUG
+  // Only the first 4095 bytes of the status file are examined; TracerPid
+  // appears near the top, so this is sufficient in practice.
+  char buf[4096];
+
+  const int status_fd = ::open("/proc/self/status", O_RDONLY);
+  if (status_fd == -1) return false;
+
+  const ssize_t num_read = ::read(status_fd, buf, sizeof(buf) - 1);
+  ::close(status_fd);
+
+  if (num_read <= 0) return false;
+
+  buf[num_read] = '\0';
+  constexpr char tracerPidString[] = "TracerPid:";
+  const auto tracer_pid_ptr = ::strstr(buf, tracerPidString);
+  if (!tracer_pid_ptr) return false;
+
+  // Skip whitespace after "TracerPid:"; the first non-space character
+  // decides: attached iff it is a digit other than '0'.
+  // NOTE(review): the `<=` bound ends the scan on the terminating NUL
+  // (harmless, as NUL is neither space nor digit), and passing a plain
+  // char to ::isspace/::isdigit is UB for negative values — /proc status
+  // is ASCII in practice, but a cast to unsigned char would be cleaner.
+  for (const char* characterPtr = tracer_pid_ptr + sizeof(tracerPidString) - 1;
+       characterPtr <= buf + num_read; ++characterPtr) {
+    if (::isspace(*characterPtr))
+      continue;
+    else
+      return ::isdigit(*characterPtr) != 0 && *characterPtr != '0';
+  }
+#endif  // DEBUG
+  return false;
+}
+
+// Stop in the debugger (DEBUG builds only). Raises SIGINT, which GDB
+// traps as a breakpoint that can be continued; a no-op when no debugger
+// is attached so normal runs are not interrupted.
+inline void DebugBreak() {
+#ifdef DEBUG
+  // only break into the debugger if the debugger is attached
+  if (IsDebuggerAttached())
+    raise(SIGINT);  // breaks into GDB and stops, can be continued
+#endif  // DEBUG
+}
+
+// Log "<file>(<line>) <function>: Not implemented" at debug level 20 when
+// a prefix provider is available (silent otherwise) and return 0, so
+// unimplemented SAL entry points report success to callers.
+inline int NotImplementedLog(const DoutPrefixProvider* ldpp,
+                             const char* filename, int linenumber,
+                             const char* functionname) {
+  if (ldpp)
+    ldpp_dout(ldpp, 20) << filename << "(" << linenumber << ") " << functionname
+                        << ": Not implemented" << dendl;
+  return 0;
+}
+
+// Like NotImplementedLog, but additionally breaks into an attached
+// debugger (DEBUG builds) so unimplemented paths can be caught live.
+// Always returns 0.
+inline int NotImplementedGdbBreak(const DoutPrefixProvider* ldpp,
+                                  const char* filename, int linenumber,
+                                  const char* functionname) {
+  NotImplementedLog(ldpp, filename, linenumber, functionname);
+  DebugBreak();
+  return 0;
+}
+
+// Convenience wrappers that capture file/line/function at the call site.
+#define DAOS_NOT_IMPLEMENTED_GDB_BREAK(ldpp) \
+  NotImplementedGdbBreak(ldpp, __FILE__, __LINE__, __FUNCTION__)
+#define DAOS_NOT_IMPLEMENTED_LOG(ldpp) \
+  NotImplementedLog(ldpp, __FILE__, __LINE__, __FUNCTION__)
+
+namespace rgw::sal {
+
+class DaosStore;
+class DaosObject;
+
+#ifdef DEBUG
+// Prepends each log entry with the "filename(source_line) function_name". Makes
+// it simple to
+// associate log entries with the source that generated the log entry
+#undef ldpp_dout
+#define ldpp_dout(dpp, v) \
+ if (decltype(auto) pdpp = (dpp); \
+ pdpp) /* workaround -Wnonnull-compare for 'this' */ \
+ dout_impl(pdpp->get_cct(), ceph::dout::need_dynamic(pdpp->get_subsys()), v) \
+ pdpp->gen_prefix(*_dout) \
+ << __FILE__ << "(" << __LINE__ << ") " << __FUNCTION__ << " - "
+#endif // DEBUG
+
+// Durable representation of a user: the core RGWUserInfo plus the version
+// of the stored record (for optimistic-concurrency checks) and the user's
+// xattrs. Encoded/decoded with ceph's ENCODE/DECODE framework (v3).
+struct DaosUserInfo {
+  RGWUserInfo info;          // core user record
+  obj_version user_version;  // version of the stored record
+  rgw::sal::Attrs attrs;     // user xattrs
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(3, 3, bl);
+    encode(info, bl);
+    encode(user_version, bl);
+    encode(attrs, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(3, bl);
+    decode(info, bl);
+    decode(user_version, bl);
+    decode(attrs, bl);
+    DECODE_FINISH(bl);
+  }
+};
+WRITE_CLASS_ENCODER(DaosUserInfo);
+
+// Bucket-notification stub for DAOS: both publish phases are log-only
+// no-ops that return 0, so notification-enabled requests still succeed
+// without any event actually being emitted.
+class DaosNotification : public StoreNotification {
+ public:
+  DaosNotification(Object* _obj, Object* _src_obj, rgw::notify::EventType _type)
+      : StoreNotification(_obj, _src_obj, _type) {}
+  ~DaosNotification() = default;
+
+  // Reserve a notification slot before the operation runs. Stub.
+  virtual int publish_reserve(const DoutPrefixProvider* dpp,
+                              RGWObjTags* obj_tags = nullptr) override {
+    return DAOS_NOT_IMPLEMENTED_LOG(dpp);
+  }
+  // Commit the reserved notification after the operation completes. Stub.
+  virtual int publish_commit(const DoutPrefixProvider* dpp, uint64_t size,
+                             const ceph::real_time& mtime,
+                             const std::string& etag,
+                             const std::string& version) override {
+    return DAOS_NOT_IMPLEMENTED_LOG(dpp);
+  }
+};
+
+// SAL User implementation backed by DAOS (via the ds3 API). Wraps the
+// generic StoreUser state with a back-pointer to the owning DaosStore.
+class DaosUser : public StoreUser {
+ private:
+  DaosStore* store;
+  // Access keys associated with this user.
+  // NOTE(review): holds const char* — verify the pointed-to strings
+  // outlive this vector.
+  std::vector<const char*> access_ids;
+
+ public:
+  DaosUser(DaosStore* _st, const rgw_user& _u) : StoreUser(_u), store(_st) {}
+  DaosUser(DaosStore* _st, const RGWUserInfo& _i) : StoreUser(_i), store(_st) {}
+  DaosUser(DaosStore* _st) : store(_st) {}
+  DaosUser(DaosUser& _o) = default;
+  // NOTE(review): the default ctor leaves `store` uninitialized — confirm
+  // no caller uses a default-constructed DaosUser before assigning it.
+  DaosUser() {}
+
+  virtual std::unique_ptr<User> clone() override {
+    return std::make_unique<DaosUser>(*this);
+  }
+  // List this user's buckets between marker/end_marker, up to max entries.
+  int list_buckets(const DoutPrefixProvider* dpp, const std::string& marker,
+                   const std::string& end_marker, uint64_t max, bool need_stats,
+                   BucketList& buckets, optional_yield y) override;
+  virtual int create_bucket(
+      const DoutPrefixProvider* dpp, const rgw_bucket& b,
+      const std::string& zonegroup_id, rgw_placement_rule& placement_rule,
+      std::string& swift_ver_location, const RGWQuotaInfo* pquota_info,
+      const RGWAccessControlPolicy& policy, Attrs& attrs, RGWBucketInfo& info,
+      obj_version& ep_objv, bool exclusive, bool obj_lock_enabled,
+      bool* existed, req_info& req_info, std::unique_ptr<Bucket>* bucket,
+      optional_yield y) override;
+  virtual int read_attrs(const DoutPrefixProvider* dpp,
+                         optional_yield y) override;
+  virtual int merge_and_store_attrs(const DoutPrefixProvider* dpp,
+                                    Attrs& new_attrs,
+                                    optional_yield y) override;
+  virtual int read_stats(const DoutPrefixProvider* dpp, optional_yield y,
+                         RGWStorageStats* stats,
+                         ceph::real_time* last_stats_sync = nullptr,
+                         ceph::real_time* last_stats_update = nullptr) override;
+  virtual int read_stats_async(const DoutPrefixProvider* dpp,
+                               RGWGetUserStats_CB* cb) override;
+  virtual int complete_flush_stats(const DoutPrefixProvider* dpp,
+                                   optional_yield y) override;
+  virtual int read_usage(
+      const DoutPrefixProvider* dpp, uint64_t start_epoch, uint64_t end_epoch,
+      uint32_t max_entries, bool* is_truncated, RGWUsageIter& usage_iter,
+      std::map<rgw_user_bucket, rgw_usage_log_entry>& usage) override;
+  virtual int trim_usage(const DoutPrefixProvider* dpp, uint64_t start_epoch,
+                         uint64_t end_epoch) override;
+
+  virtual int load_user(const DoutPrefixProvider* dpp,
+                        optional_yield y) override;
+  virtual int store_user(const DoutPrefixProvider* dpp, optional_yield y,
+                         bool exclusive,
+                         RGWUserInfo* old_info = nullptr) override;
+  virtual int remove_user(const DoutPrefixProvider* dpp,
+                          optional_yield y) override;
+
+  /** Read user info without loading it */
+  int read_user(const DoutPrefixProvider* dpp, std::string name,
+                DaosUserInfo* duinfo);
+
+  // Encode this user's info/version into bl and wrap it in the ds3
+  // user-info struct expected by the DAOS ds3 API.
+  std::unique_ptr<struct ds3_user_info> get_encoded_info(bufferlist& bl,
+                                                         obj_version& obj_ver);
+
+  friend class DaosBucket;
+};
+
+// RGWBucketInfo and other information that are shown when listing a bucket is
+// represented in struct DaosBucketInfo. The structure is encoded and stored
+// as the value of the global bucket instance index.
+// TODO: compare pros and cons of separating the bucket_attrs (ACLs, tag etc.)
+// into a different index.
+struct DaosBucketInfo {
+  RGWBucketInfo info;  // core bucket record
+
+  obj_version bucket_version;  // version of the stored record
+  ceph::real_time mtime;       // last-modified time of the bucket entry
+
+  rgw::sal::Attrs bucket_attrs;  // bucket xattrs (ACLs, tags, ...)
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(4, 4, bl);
+    encode(info, bl);
+    encode(bucket_version, bl);
+    encode(mtime, bl);
+    encode(bucket_attrs, bl);  // rgw_cache.h example for a map
+    ENCODE_FINISH(bl);
+  }
+
+  // Decode must mirror the field order of encode() exactly (v4 format).
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(4, bl);
+    decode(info, bl);
+    decode(bucket_version, bl);
+    decode(mtime, bl);
+    decode(bucket_attrs, bl);
+    DECODE_FINISH(bl);
+  }
+};
+WRITE_CLASS_ENCODER(DaosBucketInfo);
+
+// SAL Bucket implementation backed by a DAOS ds3 bucket handle. The
+// handle (ds3b) is opened lazily via open() and released via close();
+// most constructors leave it null until the bucket is actually used.
+class DaosBucket : public StoreBucket {
+ private:
+  DaosStore* store;
+  RGWAccessControlPolicy acls;
+
+ public:
+  /** Container ds3b handle */
+  ds3_bucket_t* ds3b = nullptr;
+
+  DaosBucket(DaosStore* _st) : store(_st), acls() {}
+
+  // Copy ctor deliberately does NOT duplicate the ds3b handle; the copy
+  // starts closed and must open() its own handle.
+  DaosBucket(const DaosBucket& _daos_bucket)
+      : store(_daos_bucket.store), acls(), ds3b(nullptr) {
+    // TODO: deep copy all objects
+  }
+
+  DaosBucket(DaosStore* _st, User* _u) : StoreBucket(_u), store(_st), acls() {}
+
+  DaosBucket(DaosStore* _st, const rgw_bucket& _b)
+      : StoreBucket(_b), store(_st), acls() {}
+
+  DaosBucket(DaosStore* _st, const RGWBucketEnt& _e)
+      : StoreBucket(_e), store(_st), acls() {}
+
+  DaosBucket(DaosStore* _st, const RGWBucketInfo& _i)
+      : StoreBucket(_i), store(_st), acls() {}
+
+  DaosBucket(DaosStore* _st, const rgw_bucket& _b, User* _u)
+      : StoreBucket(_b, _u), store(_st), acls() {}
+
+  DaosBucket(DaosStore* _st, const RGWBucketEnt& _e, User* _u)
+      : StoreBucket(_e, _u), store(_st), acls() {}
+
+  DaosBucket(DaosStore* _st, const RGWBucketInfo& _i, User* _u)
+      : StoreBucket(_i, _u), store(_st), acls() {}
+
+  ~DaosBucket();
+
+  virtual std::unique_ptr<Object> get_object(const rgw_obj_key& k) override;
+  virtual int list(const DoutPrefixProvider* dpp, ListParams&, int,
+                   ListResults&, optional_yield y) override;
+  virtual int remove_bucket(const DoutPrefixProvider* dpp, bool delete_children,
+                            bool forward_to_master, req_info* req_info,
+                            optional_yield y) override;
+  virtual int remove_bucket_bypass_gc(int concurrent_max,
+                                      bool keep_index_consistent,
+                                      optional_yield y,
+                                      const DoutPrefixProvider* dpp) override;
+  virtual RGWAccessControlPolicy& get_acl(void) override { return acls; }
+  virtual int set_acl(const DoutPrefixProvider* dpp,
+                      RGWAccessControlPolicy& acl, optional_yield y) override;
+  virtual int load_bucket(const DoutPrefixProvider* dpp, optional_yield y,
+                          bool get_stats = false) override;
+  virtual int read_stats(const DoutPrefixProvider* dpp,
+                         const bucket_index_layout_generation& idx_layout,
+                         int shard_id, std::string* bucket_ver,
+                         std::string* master_ver,
+                         std::map<RGWObjCategory, RGWStorageStats>& stats,
+                         std::string* max_marker = nullptr,
+                         bool* syncstopped = nullptr) override;
+  virtual int read_stats_async(const DoutPrefixProvider* dpp,
+                               const bucket_index_layout_generation& idx_layout,
+                               int shard_id,
+                               RGWGetBucketStats_CB* ctx) override;
+  virtual int sync_user_stats(const DoutPrefixProvider* dpp,
+                              optional_yield y) override;
+  virtual int update_container_stats(const DoutPrefixProvider* dpp) override;
+  virtual int check_bucket_shards(const DoutPrefixProvider* dpp) override;
+  virtual int chown(const DoutPrefixProvider* dpp, User& new_user,
+                    optional_yield y) override;
+  virtual int put_info(const DoutPrefixProvider* dpp, bool exclusive,
+                       ceph::real_time mtime) override;
+  virtual bool is_owner(User* user) override;
+  virtual int check_empty(const DoutPrefixProvider* dpp,
+                          optional_yield y) override;
+  virtual int check_quota(const DoutPrefixProvider* dpp, RGWQuota& quota,
+                          uint64_t obj_size, optional_yield y,
+                          bool check_size_only = false) override;
+  virtual int merge_and_store_attrs(const DoutPrefixProvider* dpp, Attrs& attrs,
+                                    optional_yield y) override;
+  virtual int try_refresh_info(const DoutPrefixProvider* dpp,
+                               ceph::real_time* pmtime) override;
+  virtual int read_usage(
+      const DoutPrefixProvider* dpp, uint64_t start_epoch, uint64_t end_epoch,
+      uint32_t max_entries, bool* is_truncated, RGWUsageIter& usage_iter,
+      std::map<rgw_user_bucket, rgw_usage_log_entry>& usage) override;
+  virtual int trim_usage(const DoutPrefixProvider* dpp, uint64_t start_epoch,
+                         uint64_t end_epoch) override;
+  virtual int remove_objs_from_index(
+      const DoutPrefixProvider* dpp,
+      std::list<rgw_obj_index_key>& objs_to_unlink) override;
+  virtual int check_index(
+      const DoutPrefixProvider* dpp,
+      std::map<RGWObjCategory, RGWStorageStats>& existing_stats,
+      std::map<RGWObjCategory, RGWStorageStats>& calculated_stats) override;
+  virtual int rebuild_index(const DoutPrefixProvider* dpp) override;
+  virtual int set_tag_timeout(const DoutPrefixProvider* dpp,
+                              uint64_t timeout) override;
+  virtual int purge_instance(const DoutPrefixProvider* dpp) override;
+  virtual std::unique_ptr<Bucket> clone() override {
+    return std::make_unique<DaosBucket>(*this);
+  }
+  virtual std::unique_ptr<MultipartUpload> get_multipart_upload(
+      const std::string& oid,
+      std::optional<std::string> upload_id = std::nullopt, ACLOwner owner = {},
+      ceph::real_time mtime = real_clock::now()) override;
+  virtual int list_multiparts(
+      const DoutPrefixProvider* dpp, const std::string& prefix,
+      std::string& marker, const std::string& delim, const int& max_uploads,
+      std::vector<std::unique_ptr<MultipartUpload>>& uploads,
+      std::map<std::string, bool>* common_prefixes,
+      bool* is_truncated) override;
+  virtual int abort_multiparts(const DoutPrefixProvider* dpp,
+                               CephContext* cct) override;
+
+  // DAOS-specific handle management: open/close the underlying ds3 bucket.
+  int open(const DoutPrefixProvider* dpp);
+  int close(const DoutPrefixProvider* dpp);
+  bool is_open() { return ds3b != nullptr; }
+  // Encode this bucket's info into bl and wrap it in the ds3 bucket-info
+  // struct expected by the DAOS ds3 API.
+  std::unique_ptr<struct ds3_bucket_info> get_encoded_info(
+      bufferlist& bl, ceph::real_time mtime);
+
+  friend class DaosStore;
+};
+
+// Thin wrapper exposing a zonegroup placement tier through the SAL
+// PlacementTier interface; holds its own copy of the tier config.
+class DaosPlacementTier : public StorePlacementTier {
+  DaosStore* store;
+  RGWZoneGroupPlacementTier tier;
+
+ public:
+  DaosPlacementTier(DaosStore* _store, const RGWZoneGroupPlacementTier& _tier)
+      : store(_store), tier(_tier) {}
+  virtual ~DaosPlacementTier() = default;
+
+  virtual const std::string& get_tier_type() { return tier.tier_type; }
+  virtual const std::string& get_storage_class() { return tier.storage_class; }
+  virtual bool retain_head_object() { return tier.retain_head_object; }
+  // Access the underlying tier configuration.
+  RGWZoneGroupPlacementTier& get_rt() { return tier; }
+};
+
+// Read-only SAL view over an RGWZoneGroup; holds an immutable copy of the
+// group config and forwards all accessors to it.
+class DaosZoneGroup : public StoreZoneGroup {
+  DaosStore* store;
+  const RGWZoneGroup group;
+  std::string empty;
+
+ public:
+  DaosZoneGroup(DaosStore* _store) : store(_store), group() {}
+  DaosZoneGroup(DaosStore* _store, const RGWZoneGroup& _group)
+      : store(_store), group(_group) {}
+  virtual ~DaosZoneGroup() = default;
+
+  virtual const std::string& get_id() const override { return group.get_id(); };
+  virtual const std::string& get_name() const override {
+    return group.get_name();
+  };
+  virtual int equals(const std::string& other_zonegroup) const override {
+    return group.equals(other_zonegroup);
+  };
+  /** Get the endpoint from zonegroup, or from master zone if not set */
+  virtual const std::string& get_endpoint() const override;
+  virtual bool placement_target_exists(std::string& target) const override;
+  virtual bool is_master_zonegroup() const override {
+    return group.is_master_zonegroup();
+  };
+  virtual const std::string& get_api_name() const override {
+    return group.api_name;
+  };
+  virtual void get_placement_target_names(
+      std::set<std::string>& names) const override;
+  virtual const std::string& get_default_placement_name() const override {
+    return group.default_placement.name;
+  };
+  virtual int get_hostnames(std::list<std::string>& names) const override {
+    names = group.hostnames;
+    return 0;
+  };
+  virtual int get_s3website_hostnames(
+      std::list<std::string>& names) const override {
+    names = group.hostnames_s3website;
+    return 0;
+  };
+  virtual int get_zone_count() const override { return group.zones.size(); }
+  virtual int get_placement_tier(const rgw_placement_rule& rule,
+                                 std::unique_ptr<PlacementTier>* tier);
+  bool supports(std::string_view feature) const override {
+    return group.supports(feature);
+  }
+  virtual std::unique_ptr<ZoneGroup> clone() override {
+    return std::make_unique<DaosZoneGroup>(store, group);
+  }
+  // Access the underlying zonegroup configuration.
+  const RGWZoneGroup& get_group() { return group; }
+};
+
+// Single-zone SAL Zone for DAOS. Both constructors fabricate default
+// realm/zone/period objects (DAOS has no multisite configuration) and
+// register one placement pool, "default", with only the STANDARD storage
+// class.
+// NOTE(review): the constructors allocate realm, zone_public_config,
+// zone_params and current_period with new, but the destructor is
+// defaulted and clone() builds a fresh DaosZone — these allocations
+// appear to leak; confirm intended ownership/lifetime.
+class DaosZone : public StoreZone {
+ protected:
+  DaosStore* store;
+  RGWRealm* realm{nullptr};
+  DaosZoneGroup zonegroup;
+  RGWZone* zone_public_config{
+      nullptr}; /* external zone params, e.g., entrypoints, log flags, etc. */
+  RGWZoneParams* zone_params{
+      nullptr}; /* internal zone params, e.g., rados pools */
+  RGWPeriod* current_period{nullptr};
+  rgw_zone_id cur_zone_id;
+
+ public:
+  DaosZone(DaosStore* _store) : store(_store), zonegroup(_store) {
+    realm = new RGWRealm();
+    zone_public_config = new RGWZone();
+    zone_params = new RGWZoneParams();
+    current_period = new RGWPeriod();
+    cur_zone_id = rgw_zone_id(zone_params->get_id());
+
+    // XXX: only default and STANDARD supported for now
+    RGWZonePlacementInfo info;
+    RGWZoneStorageClasses sc;
+    sc.set_storage_class("STANDARD", nullptr, nullptr);
+    info.storage_classes = sc;
+    zone_params->placement_pools["default"] = info;
+  }
+  // Same as above, but adopts a copy of an existing zonegroup.
+  DaosZone(DaosStore* _store, DaosZoneGroup _zg)
+      : store(_store), zonegroup(_zg) {
+    realm = new RGWRealm();
+    zone_public_config = new RGWZone();
+    zone_params = new RGWZoneParams();
+    current_period = new RGWPeriod();
+    cur_zone_id = rgw_zone_id(zone_params->get_id());
+
+    // XXX: only default and STANDARD supported for now
+    RGWZonePlacementInfo info;
+    RGWZoneStorageClasses sc;
+    sc.set_storage_class("STANDARD", nullptr, nullptr);
+    info.storage_classes = sc;
+    zone_params->placement_pools["default"] = info;
+  }
+  ~DaosZone() = default;
+
+  // NOTE(review): clone() does not carry over this zone's zonegroup.
+  virtual std::unique_ptr<Zone> clone() override {
+    return std::make_unique<DaosZone>(store);
+  }
+  virtual ZoneGroup& get_zonegroup() override;
+  virtual int get_zonegroup(const std::string& id,
+                            std::unique_ptr<ZoneGroup>* zonegroup) override;
+  virtual const rgw_zone_id& get_id() override;
+  virtual const std::string& get_name() const override;
+  virtual bool is_writeable() override;
+  virtual bool get_redirect_endpoint(std::string* endpoint) override;
+  virtual bool has_zonegroup_api(const std::string& api) const override;
+  virtual const std::string& get_current_period_id() override;
+  virtual const RGWAccessKey& get_system_key() {
+    return zone_params->system_key;
+  }
+  virtual const std::string& get_realm_name() { return realm->get_name(); }
+  virtual const std::string& get_realm_id() { return realm->get_id(); }
+  virtual const std::string_view get_tier_type() { return "rgw"; }
+
+  friend class DaosStore;
+};
+
+// Lua scripting is not supported on the DAOS backend. Every operation
+// logs "Not implemented" and returns -ENOENT (unlike most DAOS stubs,
+// which return 0), so callers see scripts/packages as simply absent.
+class DaosLuaManager : public StoreLuaManager {
+  DaosStore* store;
+
+ public:
+  DaosLuaManager(DaosStore* _s) : store(_s) {}
+  virtual ~DaosLuaManager() = default;
+
+  virtual int get_script(const DoutPrefixProvider* dpp, optional_yield y,
+                         const std::string& key, std::string& script) override {
+    DAOS_NOT_IMPLEMENTED_LOG(dpp);
+    return -ENOENT;
+  };
+
+  virtual int put_script(const DoutPrefixProvider* dpp, optional_yield y,
+                         const std::string& key,
+                         const std::string& script) override {
+    DAOS_NOT_IMPLEMENTED_LOG(dpp);
+    return -ENOENT;
+  };
+
+  virtual int del_script(const DoutPrefixProvider* dpp, optional_yield y,
+                         const std::string& key) override {
+    DAOS_NOT_IMPLEMENTED_LOG(dpp);
+    return -ENOENT;
+  };
+
+  virtual int add_package(const DoutPrefixProvider* dpp, optional_yield y,
+                          const std::string& package_name) override {
+    DAOS_NOT_IMPLEMENTED_LOG(dpp);
+    return -ENOENT;
+  };
+
+  virtual int remove_package(const DoutPrefixProvider* dpp, optional_yield y,
+                             const std::string& package_name) override {
+    DAOS_NOT_IMPLEMENTED_LOG(dpp);
+    return -ENOENT;
+  };
+
+  virtual int list_packages(const DoutPrefixProvider* dpp, optional_yield y,
+                            rgw::lua::packages_t& packages) override {
+    DAOS_NOT_IMPLEMENTED_LOG(dpp);
+    return -ENOENT;
+  };
+};
+
+// SAL Object backed by a DAOS ds3 object handle. The handle (ds3o) is
+// acquired via lookup()/create() and released via close(); DAOS-specific
+// helpers at the bottom implement raw read/write and dirent/attr access.
+class DaosObject : public StoreObject {
+ private:
+  DaosStore* store;
+  RGWAccessControlPolicy acls;
+
+ public:
+  // Read-side operation state for a single GET/stat of this object.
+  struct DaosReadOp : public StoreReadOp {
+   private:
+    DaosObject* source;
+
+   public:
+    DaosReadOp(DaosObject* _source);
+
+    virtual int prepare(optional_yield y,
+                        const DoutPrefixProvider* dpp) override;
+
+    /*
+     * Both `read` and `iterate` read up through index `end`
+     * *inclusive*. The number of bytes that could be returned is
+     * `end - ofs + 1`.
+     */
+    virtual int read(int64_t off, int64_t end, bufferlist& bl, optional_yield y,
+                     const DoutPrefixProvider* dpp) override;
+    virtual int iterate(const DoutPrefixProvider* dpp, int64_t off, int64_t end,
+                        RGWGetDataCB* cb, optional_yield y) override;
+
+    virtual int get_attr(const DoutPrefixProvider* dpp, const char* name,
+                         bufferlist& dest, optional_yield y) override;
+  };
+
+  // Delete-side operation state for removing this object.
+  struct DaosDeleteOp : public StoreDeleteOp {
+   private:
+    DaosObject* source;
+
+   public:
+    DaosDeleteOp(DaosObject* _source);
+
+    virtual int delete_obj(const DoutPrefixProvider* dpp,
+                           optional_yield y) override;
+  };
+
+  // Open ds3 object handle; null while the object is not open.
+  ds3_obj_t* ds3o = nullptr;
+
+  // NOTE(review): default construction leaves `store` uninitialized.
+  DaosObject() = default;
+
+  DaosObject(DaosStore* _st, const rgw_obj_key& _k)
+      : StoreObject(_k), store(_st), acls() {}
+  DaosObject(DaosStore* _st, const rgw_obj_key& _k, Bucket* _b)
+      : StoreObject(_k, _b), store(_st), acls() {}
+
+  DaosObject(DaosObject& _o) = default;
+
+  virtual ~DaosObject();
+
+  virtual int delete_object(const DoutPrefixProvider* dpp, optional_yield y,
+                            bool prevent_versioning = false) override;
+  virtual int delete_obj_aio(const DoutPrefixProvider* dpp, RGWObjState* astate,
+                             Completions* aio, bool keep_index_consistent,
+                             optional_yield y) override;
+  virtual int copy_object(
+      User* user, req_info* info, const rgw_zone_id& source_zone,
+      rgw::sal::Object* dest_object, rgw::sal::Bucket* dest_bucket,
+      rgw::sal::Bucket* src_bucket, const rgw_placement_rule& dest_placement,
+      ceph::real_time* src_mtime, ceph::real_time* mtime,
+      const ceph::real_time* mod_ptr, const ceph::real_time* unmod_ptr,
+      bool high_precision_time, const char* if_match, const char* if_nomatch,
+      AttrsMod attrs_mod, bool copy_if_newer, Attrs& attrs,
+      RGWObjCategory category, uint64_t olh_epoch,
+      boost::optional<ceph::real_time> delete_at, std::string* version_id,
+      std::string* tag, std::string* etag, void (*progress_cb)(off_t, void*),
+      void* progress_data, const DoutPrefixProvider* dpp,
+      optional_yield y) override;
+  virtual RGWAccessControlPolicy& get_acl(void) override { return acls; }
+  virtual int set_acl(const RGWAccessControlPolicy& acl) override {
+    acls = acl;
+    return 0;
+  }
+
+  virtual int get_obj_state(const DoutPrefixProvider* dpp, RGWObjState** state,
+                            optional_yield y, bool follow_olh = true) override;
+  virtual int set_obj_attrs(const DoutPrefixProvider* dpp, Attrs* setattrs,
+                            Attrs* delattrs, optional_yield y) override;
+  virtual int get_obj_attrs(optional_yield y, const DoutPrefixProvider* dpp,
+                            rgw_obj* target_obj = NULL) override;
+  virtual int modify_obj_attrs(const char* attr_name, bufferlist& attr_val,
+                               optional_yield y,
+                               const DoutPrefixProvider* dpp) override;
+  virtual int delete_obj_attrs(const DoutPrefixProvider* dpp,
+                               const char* attr_name,
+                               optional_yield y) override;
+  virtual bool is_expired() override;
+  virtual void gen_rand_obj_instance_name() override;
+  virtual std::unique_ptr<Object> clone() override {
+    return std::make_unique<DaosObject>(*this);
+  }
+  virtual std::unique_ptr<MPSerializer> get_serializer(
+      const DoutPrefixProvider* dpp, const std::string& lock_name) override;
+  virtual int transition(Bucket* bucket,
+                         const rgw_placement_rule& placement_rule,
+                         const real_time& mtime, uint64_t olh_epoch,
+                         const DoutPrefixProvider* dpp,
+                         optional_yield y) override;
+  virtual int transition_to_cloud(Bucket* bucket, rgw::sal::PlacementTier* tier,
+                                  rgw_bucket_dir_entry& o,
+                                  std::set<std::string>& cloud_targets,
+                                  CephContext* cct, bool update_object,
+                                  const DoutPrefixProvider* dpp,
+                                  optional_yield y) override;
+  virtual bool placement_rules_match(rgw_placement_rule& r1,
+                                     rgw_placement_rule& r2) override;
+  virtual int dump_obj_layout(const DoutPrefixProvider* dpp, optional_yield y,
+                              Formatter* f) override;
+
+  /* Swift versioning */
+  virtual int swift_versioning_restore(bool& restored,
+                                       const DoutPrefixProvider* dpp) override;
+  virtual int swift_versioning_copy(const DoutPrefixProvider* dpp,
+                                    optional_yield y) override;
+
+  /* OPs */
+  virtual std::unique_ptr<ReadOp> get_read_op() override;
+  virtual std::unique_ptr<DeleteOp> get_delete_op() override;
+
+  /* OMAP */
+  virtual int omap_get_vals(const DoutPrefixProvider* dpp,
+                            const std::string& marker, uint64_t count,
+                            std::map<std::string, bufferlist>* m, bool* pmore,
+                            optional_yield y) override;
+  virtual int omap_get_all(const DoutPrefixProvider* dpp,
+                           std::map<std::string, bufferlist>* m,
+                           optional_yield y) override;
+  virtual int omap_get_vals_by_keys(const DoutPrefixProvider* dpp,
+                                    const std::string& oid,
+                                    const std::set<std::string>& keys,
+                                    Attrs* vals) override;
+  virtual int omap_set_val_by_key(const DoutPrefixProvider* dpp,
+                                  const std::string& key, bufferlist& val,
+                                  bool must_exist, optional_yield y) override;
+  virtual int chown(User& new_user, const DoutPrefixProvider* dpp,
+                    optional_yield y) override;
+
+  bool is_open() { return ds3o != nullptr; };
+  // Only lookup the object, do not create
+  int lookup(const DoutPrefixProvider* dpp);
+  // Create the object, truncate if exists
+  int create(const DoutPrefixProvider* dpp);
+  // Release the daos resources
+  int close(const DoutPrefixProvider* dpp);
+  // Write to object starting from offset
+  int write(const DoutPrefixProvider* dpp, bufferlist&& data, uint64_t offset);
+  // Read size bytes from object starting from offset
+  int read(const DoutPrefixProvider* dpp, bufferlist& data, uint64_t offset,
+           uint64_t& size);
+  // Get the object's dirent and attrs
+  int get_dir_entry_attrs(const DoutPrefixProvider* dpp,
+                          rgw_bucket_dir_entry* ent, Attrs* getattrs = nullptr);
+  // Set the object's dirent and attrs
+  int set_dir_entry_attrs(const DoutPrefixProvider* dpp,
+                          rgw_bucket_dir_entry* ent, Attrs* setattrs = nullptr);
+  // Marks this DAOS object as being the latest version and unmarks all other
+  // versions as latest
+  int mark_as_latest(const DoutPrefixProvider* dpp, ceph::real_time set_mtime);
+  // get_bucket casted as DaosBucket*
+  DaosBucket* get_daos_bucket() {
+    return static_cast<DaosBucket*>(get_bucket());
+  }
+};
+
+// A placeholder locking class for multipart upload. Locking is not
+// implemented: try_lock() and unlock() log and return 0, i.e. every lock
+// attempt "succeeds" without providing any mutual exclusion.
+class MPDaosSerializer : public StoreMPSerializer {
+ public:
+  MPDaosSerializer(const DoutPrefixProvider* dpp, DaosStore* store,
+                   DaosObject* obj, const std::string& lock_name) {}
+
+  virtual int try_lock(const DoutPrefixProvider* dpp, utime_t dur,
+                       optional_yield y) override {
+    return DAOS_NOT_IMPLEMENTED_LOG(dpp);
+  }
+  virtual int unlock() override { return DAOS_NOT_IMPLEMENTED_LOG(nullptr); }
+};
+
+// Writer for a whole-object (non-multipart) PUT. Owns a DaosObject by
+// value and accumulates the total bytes streamed through process().
+class DaosAtomicWriter : public StoreWriter {
+ protected:
+  rgw::sal::DaosStore* store;
+  const rgw_user& owner;  // reference member: must outlive this writer
+  const rgw_placement_rule* ptail_placement_rule;
+  uint64_t olh_epoch;
+  const std::string& unique_tag;  // reference member: must outlive this writer
+  DaosObject obj;                 // destination object, held by value
+  uint64_t total_data_size = 0;  // for total data being uploaded
+
+ public:
+  DaosAtomicWriter(const DoutPrefixProvider* dpp, optional_yield y,
+                   rgw::sal::Object* obj,
+                   DaosStore* _store, const rgw_user& _owner,
+                   const rgw_placement_rule* _ptail_placement_rule,
+                   uint64_t _olh_epoch, const std::string& _unique_tag);
+  ~DaosAtomicWriter() = default;
+
+  // prepare to start processing object data
+  virtual int prepare(optional_yield y) override;
+
+  // Process a bufferlist
+  virtual int process(bufferlist&& data, uint64_t offset) override;
+
+  // complete the operation and make its result visible to clients
+  virtual int complete(size_t accounted_size, const std::string& etag,
+                       ceph::real_time* mtime, ceph::real_time set_mtime,
+                       std::map<std::string, bufferlist>& attrs,
+                       ceph::real_time delete_at, const char* if_match,
+                       const char* if_nomatch, const std::string* user_data,
+                       rgw_zone_set* zones_trace, bool* canceled,
+                       optional_yield y) override;
+};
+
+// Writer for one part of a multipart upload. Holds the owning upload (by
+// raw pointer), the part number, and an open ds3 part handle while data
+// is being streamed.
+class DaosMultipartWriter : public StoreWriter {
+ protected:
+  rgw::sal::DaosStore* store;
+  MultipartUpload* upload;  // not owned
+  std::string upload_id;    // cached from upload->get_upload_id()
+
+  // Part parameters.
+  const uint64_t part_num;
+  const std::string part_num_str;
+  uint64_t actual_part_size = 0;  // bytes actually written for this part
+
+  // Open ds3 part handle; null until prepare() opens it.
+  ds3_part_t* ds3p = nullptr;
+  bool is_open() { return ds3p != nullptr; };
+
+ public:
+  DaosMultipartWriter(const DoutPrefixProvider* dpp, optional_yield y,
+                      MultipartUpload* _upload,
+                      rgw::sal::Object* obj,
+                      DaosStore* _store, const rgw_user& owner,
+                      const rgw_placement_rule* ptail_placement_rule,
+                      uint64_t _part_num, const std::string& part_num_str)
+      : StoreWriter(dpp, y),
+        store(_store),
+        upload(_upload),
+        upload_id(_upload->get_upload_id()),
+        part_num(_part_num),
+        part_num_str(part_num_str) {}
+  virtual ~DaosMultipartWriter();
+
+  // prepare to start processing object data
+  virtual int prepare(optional_yield y) override;
+
+  // Process a bufferlist
+  virtual int process(bufferlist&& data, uint64_t offset) override;
+
+  // complete the operation and make its result visible to clients
+  virtual int complete(size_t accounted_size, const std::string& etag,
+                       ceph::real_time* mtime, ceph::real_time set_mtime,
+                       std::map<std::string, bufferlist>& attrs,
+                       ceph::real_time delete_at, const char* if_match,
+                       const char* if_nomatch, const std::string* user_data,
+                       rgw_zone_set* zones_trace, bool* canceled,
+                       optional_yield y) override;
+
+  // Name of the bucket this part is being uploaded into.
+  const std::string& get_bucket_name();
+};
+
+// One completed part of a multipart upload; a thin accessor wrapper over
+// the persisted RGWUploadPartInfo record.
+class DaosMultipartPart : public StoreMultipartPart {
+ protected:
+  RGWUploadPartInfo info;
+
+ public:
+  DaosMultipartPart() = default;
+  virtual ~DaosMultipartPart() = default;
+
+  virtual uint32_t get_num() { return info.num; }
+  virtual uint64_t get_size() { return info.accounted_size; }
+  virtual const std::string& get_etag() { return info.etag; }
+  virtual ceph::real_time& get_mtime() { return info.modified; }
+
+  // DaosMultipartUpload fills `info` directly when listing parts.
+  friend class DaosMultipartUpload;
+};
+
+// SAL MultipartUpload for DAOS: tracks the upload's identity (via
+// RGWMPObj), owner, mtime and placement, and hands out
+// DaosMultipartWriter instances for individual parts.
+class DaosMultipartUpload : public StoreMultipartUpload {
+  DaosStore* store;
+  RGWMPObj mp_obj;  // encodes oid + upload_id (meta key)
+  ACLOwner owner;
+  ceph::real_time mtime;
+  rgw_placement_rule placement;
+  RGWObjManifest manifest;
+
+ public:
+  DaosMultipartUpload(DaosStore* _store, Bucket* _bucket,
+                      const std::string& oid,
+                      std::optional<std::string> upload_id, ACLOwner _owner,
+                      ceph::real_time _mtime)
+      : StoreMultipartUpload(_bucket),
+        store(_store),
+        mp_obj(oid, upload_id),
+        owner(_owner),
+        mtime(_mtime) {}
+  virtual ~DaosMultipartUpload() = default;
+
+  virtual const std::string& get_meta() const { return mp_obj.get_meta(); }
+  virtual const std::string& get_key() const { return mp_obj.get_key(); }
+  virtual const std::string& get_upload_id() const {
+    return mp_obj.get_upload_id();
+  }
+  virtual const ACLOwner& get_owner() const override { return owner; }
+  virtual ceph::real_time& get_mtime() { return mtime; }
+  virtual std::unique_ptr<rgw::sal::Object> get_meta_obj() override;
+  virtual int init(const DoutPrefixProvider* dpp, optional_yield y,
+                   ACLOwner& owner, rgw_placement_rule& dest_placement,
+                   rgw::sal::Attrs& attrs) override;
+  virtual int list_parts(const DoutPrefixProvider* dpp, CephContext* cct,
+                         int num_parts, int marker, int* next_marker,
+                         bool* truncated,
+                         bool assume_unsorted = false) override;
+  virtual int abort(const DoutPrefixProvider* dpp, CephContext* cct) override;
+  virtual int complete(const DoutPrefixProvider* dpp, optional_yield y,
+                       CephContext* cct, std::map<int, std::string>& part_etags,
+                       std::list<rgw_obj_index_key>& remove_objs,
+                       uint64_t& accounted_size, bool& compressed,
+                       RGWCompressionInfo& cs_info, off_t& off,
+                       std::string& tag, ACLOwner& owner, uint64_t olh_epoch,
+                       rgw::sal::Object* target_obj) override;
+  virtual int get_info(const DoutPrefixProvider* dpp, optional_yield y,
+                       rgw_placement_rule** rule,
+                       rgw::sal::Attrs* attrs = nullptr) override;
+  virtual std::unique_ptr<Writer> get_writer(
+      const DoutPrefixProvider* dpp, optional_yield y,
+      rgw::sal::Object* obj, const rgw_user& owner,
+      const rgw_placement_rule* ptail_placement_rule, uint64_t part_num,
+      const std::string& part_num_str) override;
+  // Name of the bucket this upload targets.
+  const std::string& get_bucket_name() { return bucket->get_name(); }
+};
+
+// DAOS-backed implementation of the SAL StoreDriver interface.
+// Holds a single DaosZone and a raw ds3 client handle; several of the
+// sync/usage entry points below are stubs that return
+// DAOS_NOT_IMPLEMENTED_LOG or NULL.
+class DaosStore : public StoreDriver {
+ private:
+ DaosZone zone;
+ RGWSyncModuleInstanceRef sync_module;
+
+ public:
+ // Raw DAOS S3 client handle.
+ // NOTE(review): presumably opened in initialize() and released in
+ // finalize() — confirm ownership/lifetime against the .cc file.
+ ds3_t* ds3 = nullptr;
+
+ CephContext* cctx;
+
+ DaosStore(CephContext* c) : zone(this), cctx(c) {}
+ // NOTE(review): destructor is non-virtual here; deleting through a
+ // StoreDriver* is only safe if the base declares a virtual dtor —
+ // confirm in the base class.
+ ~DaosStore() = default;
+
+ virtual const std::string get_name() const override { return "daos"; }
+
+ // --- user/bucket/object lookup ---
+ virtual std::unique_ptr<User> get_user(const rgw_user& u) override;
+ virtual std::string get_cluster_id(const DoutPrefixProvider* dpp,
+ optional_yield y) override;
+ virtual int get_user_by_access_key(const DoutPrefixProvider* dpp,
+ const std::string& key, optional_yield y,
+ std::unique_ptr<User>* user) override;
+ virtual int get_user_by_email(const DoutPrefixProvider* dpp,
+ const std::string& email, optional_yield y,
+ std::unique_ptr<User>* user) override;
+ virtual int get_user_by_swift(const DoutPrefixProvider* dpp,
+ const std::string& user_str, optional_yield y,
+ std::unique_ptr<User>* user) override;
+ virtual std::unique_ptr<Object> get_object(const rgw_obj_key& k) override;
+ virtual int get_bucket(const DoutPrefixProvider* dpp, User* u,
+ const rgw_bucket& b, std::unique_ptr<Bucket>* bucket,
+ optional_yield y) override;
+ virtual int get_bucket(User* u, const RGWBucketInfo& i,
+ std::unique_ptr<Bucket>* bucket) override;
+ virtual int get_bucket(const DoutPrefixProvider* dpp, User* u,
+ const std::string& tenant, const std::string& name,
+ std::unique_ptr<Bucket>* bucket,
+ optional_yield y) override;
+ // --- multisite / zone plumbing (single-zone for DAOS) ---
+ virtual bool is_meta_master() override;
+ virtual int forward_request_to_master(const DoutPrefixProvider* dpp,
+ User* user, obj_version* objv,
+ bufferlist& in_data, JSONParser* jp,
+ req_info& info,
+ optional_yield y) override;
+ virtual int forward_iam_request_to_master(
+ const DoutPrefixProvider* dpp, const RGWAccessKey& key, obj_version* objv,
+ bufferlist& in_data, RGWXMLDecoder::XMLParser* parser, req_info& info,
+ optional_yield y) override;
+ // NOTE(review): no `override` here — confirm this exactly matches the
+ // StoreDriver::get_zone() signature (C++ Core Guidelines C.128).
+ virtual Zone* get_zone() { return &zone; }
+ virtual std::string zone_unique_id(uint64_t unique_num) override;
+ virtual std::string zone_unique_trans_id(const uint64_t unique_num) override;
+ virtual int cluster_stat(RGWClusterStat& stats) override;
+ virtual std::unique_ptr<Lifecycle> get_lifecycle(void) override;
+ virtual std::unique_ptr<Completions> get_completions(void) override;
+ virtual std::unique_ptr<Notification> get_notification(
+ rgw::sal::Object* obj, rgw::sal::Object* src_obj, struct req_state* s,
+ rgw::notify::EventType event_type, optional_yield y,
+ const std::string* object_name = nullptr) override;
+ virtual std::unique_ptr<Notification> get_notification(
+ const DoutPrefixProvider* dpp, rgw::sal::Object* obj,
+ rgw::sal::Object* src_obj, rgw::notify::EventType event_type,
+ rgw::sal::Bucket* _bucket, std::string& _user_id,
+ std::string& _user_tenant, std::string& _req_id,
+ optional_yield y) override;
+ // Lifecycle and coroutine registry are not supported by this driver.
+ virtual RGWLC* get_rgwlc(void) override { return NULL; }
+ virtual RGWCoroutinesManagerRegistry* get_cr_registry() override {
+ return NULL;
+ }
+
+ // --- usage logging / quotas / rate limits ---
+ virtual int log_usage(
+ const DoutPrefixProvider* dpp,
+ std::map<rgw_user_bucket, RGWUsageBatch>& usage_info) override;
+ virtual int log_op(const DoutPrefixProvider* dpp, std::string& oid,
+ bufferlist& bl) override;
+ virtual int register_to_service_map(
+ const DoutPrefixProvider* dpp, const std::string& daemon_type,
+ const std::map<std::string, std::string>& meta) override;
+ virtual void get_quota(RGWQuota& quota) override;
+ virtual void get_ratelimit(RGWRateLimitInfo& bucket_ratelimit,
+ RGWRateLimitInfo& user_ratelimit,
+ RGWRateLimitInfo& anon_ratelimit) override;
+ virtual int set_buckets_enabled(const DoutPrefixProvider* dpp,
+ std::vector<rgw_bucket>& buckets,
+ bool enabled) override;
+ // Not implemented: returns the DAOS_NOT_IMPLEMENTED_LOG sentinel.
+ virtual uint64_t get_new_req_id() override {
+ return DAOS_NOT_IMPLEMENTED_LOG(nullptr);
+ }
+ virtual int get_sync_policy_handler(const DoutPrefixProvider* dpp,
+ std::optional<rgw_zone_id> zone,
+ std::optional<rgw_bucket> bucket,
+ RGWBucketSyncPolicyHandlerRef* phandler,
+ optional_yield y) override;
+ virtual RGWDataSyncStatusManager* get_data_sync_manager(
+ const rgw_zone_id& source_zone) override;
+ // Sync wakeups are no-ops: DAOS is single-zone, nothing to wake.
+ virtual void wakeup_meta_sync_shards(std::set<int>& shard_ids) override {
+ return;
+ }
+ virtual void wakeup_data_sync_shards(
+ const DoutPrefixProvider* dpp, const rgw_zone_id& source_zone,
+ boost::container::flat_map<
+ int, boost::container::flat_set<rgw_data_notify_entry>>& shard_ids)
+ override {
+ return;
+ }
+ virtual int clear_usage(const DoutPrefixProvider* dpp) override {
+ return DAOS_NOT_IMPLEMENTED_LOG(dpp);
+ }
+ virtual int read_all_usage(
+ const DoutPrefixProvider* dpp, uint64_t start_epoch, uint64_t end_epoch,
+ uint32_t max_entries, bool* is_truncated, RGWUsageIter& usage_iter,
+ std::map<rgw_user_bucket, rgw_usage_log_entry>& usage) override;
+ virtual int trim_all_usage(const DoutPrefixProvider* dpp,
+ uint64_t start_epoch, uint64_t end_epoch) override;
+ virtual int get_config_key_val(std::string name, bufferlist* bl) override;
+ // --- metadata listing ---
+ virtual int meta_list_keys_init(const DoutPrefixProvider* dpp,
+ const std::string& section,
+ const std::string& marker,
+ void** phandle) override;
+ virtual int meta_list_keys_next(const DoutPrefixProvider* dpp, void* handle,
+ int max, std::list<std::string>& keys,
+ bool* truncated) override;
+ virtual void meta_list_keys_complete(void* handle) override;
+ virtual std::string meta_get_marker(void* handle) override;
+ virtual int meta_remove(const DoutPrefixProvider* dpp,
+ std::string& metadata_key, optional_yield y) override;
+
+ // NOTE(review): the next two also lack `override` — confirm they match
+ // the base-class signatures.
+ virtual const RGWSyncModuleInstanceRef& get_sync_module() {
+ return sync_module;
+ }
+ virtual std::string get_host_id() { return ""; }
+
+ virtual std::unique_ptr<LuaManager> get_lua_manager() override;
+ // --- IAM roles / OIDC providers ---
+ virtual std::unique_ptr<RGWRole> get_role(
+ std::string name, std::string tenant, std::string path = "",
+ std::string trust_policy = "", std::string max_session_duration_str = "",
+ std::multimap<std::string, std::string> tags = {}) override;
+ virtual std::unique_ptr<RGWRole> get_role(const RGWRoleInfo& info) override;
+ virtual std::unique_ptr<RGWRole> get_role(std::string id) override;
+ virtual int get_roles(const DoutPrefixProvider* dpp, optional_yield y,
+ const std::string& path_prefix,
+ const std::string& tenant,
+ std::vector<std::unique_ptr<RGWRole>>& roles) override;
+ virtual std::unique_ptr<RGWOIDCProvider> get_oidc_provider() override;
+ virtual int get_oidc_providers(
+ const DoutPrefixProvider* dpp, const std::string& tenant,
+ std::vector<std::unique_ptr<RGWOIDCProvider>>& providers) override;
+ // --- writer factories ---
+ virtual std::unique_ptr<Writer> get_append_writer(
+ const DoutPrefixProvider* dpp, optional_yield y,
+ rgw::sal::Object* obj, const rgw_user& owner,
+ const rgw_placement_rule* ptail_placement_rule,
+ const std::string& unique_tag, uint64_t position,
+ uint64_t* cur_accounted_size) override;
+ virtual std::unique_ptr<Writer> get_atomic_writer(
+ const DoutPrefixProvider* dpp, optional_yield y,
+ rgw::sal::Object* obj, const rgw_user& owner,
+ const rgw_placement_rule* ptail_placement_rule, uint64_t olh_epoch,
+ const std::string& unique_tag) override;
+ virtual const std::string& get_compression_type(
+ const rgw_placement_rule& rule) override;
+ virtual bool valid_placement(const rgw_placement_rule& rule) override;
+
+ virtual void finalize(void) override;
+
+ virtual CephContext* ctx(void) override { return cctx; }
+
+ virtual int initialize(CephContext* cct,
+ const DoutPrefixProvider* dpp) override;
+};
+
+} // namespace rgw::sal
diff --git a/src/rgw/rgw_sal_dbstore.cc b/src/rgw/rgw_sal_dbstore.cc
new file mode 100644
index 000000000..5100dc41e
--- /dev/null
+++ b/src/rgw/rgw_sal_dbstore.cc
@@ -0,0 +1,2045 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2021 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <errno.h>
+#include <stdlib.h>
+#include <system_error>
+#include <unistd.h>
+#include <sstream>
+
+#include "common/Clock.h"
+#include "common/errno.h"
+
+#include "rgw_sal.h"
+#include "rgw_sal_dbstore.h"
+#include "rgw_bucket.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+namespace rgw::sal {
+
+ /// List the buckets owned by this user, delegating to the DB backend.
+ /// Results are appended to `buckets`; the truncation flag from the
+ /// backend is propagated so callers can continue from `marker`.
+ int DBUser::list_buckets(const DoutPrefixProvider *dpp, const string& marker,
+     const string& end_marker, uint64_t max, bool need_stats,
+     BucketList &buckets, optional_yield y)
+ {
+   RGWUserBuckets user_buckets;
+   bool truncated = false;
+
+   buckets.clear();
+   const int r = store->getDB()->list_buckets(dpp, "", info.user_id, marker,
+       end_marker, max, need_stats, &user_buckets, &truncated);
+   if (r < 0) {
+     return r;
+   }
+
+   buckets.set_truncated(truncated);
+   // Wrap each backend entry in a SAL DBBucket owned by the result list.
+   for (const auto& [name, entry] : user_buckets.get_buckets()) {
+     buckets.add(std::make_unique<DBBucket>(this->store, entry, this));
+   }
+
+   return 0;
+ }
+
+ /// Create a bucket for this user, or return the existing one.
+ /// On success *bucket_out owns the (new or pre-existing) bucket and
+ /// *existed reports whether it was already present. Note that DBStore
+ /// is single-zone: the master-zone forwarding and placement-selection
+ /// paths are intentionally stubbed out below.
+ int DBUser::create_bucket(const DoutPrefixProvider *dpp,
+     const rgw_bucket& b,
+     const string& zonegroup_id,
+     rgw_placement_rule& placement_rule,
+     string& swift_ver_location,
+     const RGWQuotaInfo * pquota_info,
+     const RGWAccessControlPolicy& policy,
+     Attrs& attrs,
+     RGWBucketInfo& info,
+     obj_version& ep_objv,
+     bool exclusive,
+     bool obj_lock_enabled,
+     bool *existed,
+     req_info& req_info,
+     std::unique_ptr<Bucket>* bucket_out,
+     optional_yield y)
+ {
+   int ret;
+   bufferlist in_data;
+   RGWBucketInfo master_info;
+   rgw_bucket *pmaster_bucket = nullptr;
+   uint32_t *pmaster_num_shards = nullptr;
+   real_time creation_time;
+   std::unique_ptr<Bucket> bucket;
+   obj_version objv, *pobjv = NULL;
+
+   /* If it exists, look it up; otherwise create it */
+   ret = store->get_bucket(dpp, this, b, &bucket, y);
+   if (ret < 0 && ret != -ENOENT)
+     return ret;
+
+   if (ret != -ENOENT) {
+     // Bucket already exists: inherit its swift versioning location and
+     // placement rule rather than the caller-supplied values.
+     RGWAccessControlPolicy old_policy(store->ctx());
+     *existed = true;
+     if (swift_ver_location.empty()) {
+       swift_ver_location = bucket->get_info().swift_ver_location;
+     }
+     placement_rule.inherit_from(bucket->get_info().placement_rule);
+
+     // don't allow changes to the acl policy
+     /* int r = rgw_op_get_bucket_policy_from_attr(dpp, this, this, bucket->get_attrs(),
+         &old_policy, y);
+     if (r >= 0 && old_policy != policy) {
+       bucket_out->swap(bucket);
+       return -EEXIST;
+     }*/
+   } else {
+     bucket = std::make_unique<DBBucket>(store, b, this);
+     *existed = false;
+     bucket->set_attrs(attrs);
+     // XXX: For now single default zone and STANDARD storage class
+     // supported.
+     placement_rule.name = "default";
+     placement_rule.storage_class = "STANDARD";
+   }
+
+   /*
+    * XXX: If not master zone, fwd the request to master zone.
+    * For now DBStore has single zone.
+    */
+   std::string zid = zonegroup_id;
+   /* if (zid.empty()) {
+     zid = svc()->zone->get_zonegroup().get_id();
+   } */
+
+   if (*existed) {
+     rgw_placement_rule selected_placement_rule;
+     /* XXX: Handle this when zone is implemented
+     ret = svc()->zone->select_bucket_placement(this.get_info(),
+         zid, placement_rule,
+         &selected_placement_rule, nullptr, y);
+     if (selected_placement_rule != info.placement_rule) {
+       ret = -EEXIST;
+       bucket_out->swap(bucket);
+       return ret;
+     } */
+   } else {
+
+     /* XXX: We may not need to send all these params. Cleanup the unused ones */
+     ret = store->getDB()->create_bucket(dpp, this->get_info(), bucket->get_key(),
+         zid, placement_rule, swift_ver_location, pquota_info,
+         attrs, info, pobjv, &ep_objv, creation_time,
+         pmaster_bucket, pmaster_num_shards, y, exclusive);
+     // A concurrent creation racing us is not an error: report existence.
+     if (ret == -EEXIST) {
+       *existed = true;
+       ret = 0;
+     } else if (ret != 0) {
+       return ret;
+     }
+   }
+
+   // Publish the endpoint object version and backend info on the SAL
+   // bucket before handing ownership to the caller.
+   bucket->set_version(ep_objv);
+   bucket->get_info() = info;
+
+   bucket_out->swap(bucket);
+
+   return ret;
+ }
+
+ int DBUser::read_attrs(const DoutPrefixProvider* dpp, optional_yield y)
+ {
+ int ret;
+ ret = store->getDB()->get_user(dpp, string("user_id"), get_id().id, info, &attrs,
+ &objv_tracker);
+ return ret;
+ }
+
+ int DBUser::read_stats(const DoutPrefixProvider *dpp,
+ optional_yield y, RGWStorageStats* stats,
+ ceph::real_time *last_stats_sync,
+ ceph::real_time *last_stats_update)
+ {
+ return 0;
+ }
+
+ /* stats - Not for first pass */
+ int DBUser::read_stats_async(const DoutPrefixProvider *dpp, RGWGetUserStats_CB *cb)
+ {
+ return 0;
+ }
+
+ int DBUser::complete_flush_stats(const DoutPrefixProvider *dpp, optional_yield y)
+ {
+ return 0;
+ }
+
+ int DBUser::read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
+ bool *is_truncated, RGWUsageIter& usage_iter,
+ map<rgw_user_bucket, rgw_usage_log_entry>& usage)
+ {
+ return 0;
+ }
+
+ int DBUser::trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch)
+ {
+ return 0;
+ }
+
+ /// Load this user's record (info, attrs, objv tracker) from the DB
+ /// backend, keyed by user id.
+ int DBUser::load_user(const DoutPrefixProvider *dpp, optional_yield y)
+ {
+   return store->getDB()->get_user(dpp, string("user_id"), get_id().id,
+                                   info, &attrs, &objv_tracker);
+ }
+ /// Merge `new_attrs` into the cached attrs (new values win) and
+ /// persist the user record non-exclusively.
+ int DBUser::merge_and_store_attrs(const DoutPrefixProvider* dpp, Attrs& new_attrs, optional_yield y)
+ {
+   for (const auto& [name, value] : new_attrs) {
+     attrs[name] = value;
+   }
+   return store_user(dpp, y, false);
+ }
+ /// Persist this user's info and attrs; if `old_info` is non-null the
+ /// backend fills it with the previous record.
+ int DBUser::store_user(const DoutPrefixProvider* dpp, optional_yield y, bool exclusive, RGWUserInfo* old_info)
+ {
+   return store->getDB()->store_user(dpp, info, exclusive, &attrs,
+                                     &objv_tracker, old_info);
+ }
+
+ /// Delete this user's record from the DB backend, guarded by the
+ /// object-version tracker.
+ int DBUser::remove_user(const DoutPrefixProvider* dpp, optional_yield y)
+ {
+   return store->getDB()->remove_user(dpp, info, &objv_tracker);
+ }
+
+ int DBUser::verify_mfa(const std::string& mfa_str, bool* verified, const DoutPrefixProvider *dpp, optional_yield y)
+ {
+ *verified = false;
+ return 0;
+ }
+
+ /// Remove this bucket. Unless `delete_children` is set, the bucket is
+ /// first listed (up to two entries, all versions, unordered) and the
+ /// removal is rejected with -ENOTEMPTY if any object exists.
+ /// Fix: the emptiness-check list() previously passed `null_yield`,
+ /// discarding the caller's yield context `y`; it is now forwarded.
+ int DBBucket::remove_bucket(const DoutPrefixProvider *dpp, bool delete_children, bool forward_to_master, req_info* req_info, optional_yield y)
+ {
+   int ret;
+
+   // Refresh bucket info/attrs before acting on it.
+   ret = load_bucket(dpp, y);
+   if (ret < 0)
+     return ret;
+
+   /* XXX: handle delete_children */
+
+   if (!delete_children) {
+     /* Check if there are any objects */
+     rgw::sal::Bucket::ListParams params;
+     params.list_versions = true;
+     params.allow_unordered = true;
+
+     rgw::sal::Bucket::ListResults results;
+
+     results.objs.clear();
+
+     // Listing at most 2 entries is enough to detect non-emptiness.
+     ret = list(dpp, params, 2, results, y);
+
+     if (ret < 0) {
+       ldpp_dout(dpp, 20) << __func__ << ": Bucket list objects returned " <<
+         ret << dendl;
+       return ret;
+     }
+
+     if (!results.objs.empty()) {
+       ret = -ENOTEMPTY;
+       ldpp_dout(dpp, -1) << __func__ << ": Bucket Not Empty.. returning " <<
+         ret << dendl;
+       return ret;
+     }
+   }
+
+   ret = store->getDB()->remove_bucket(dpp, info);
+
+   return ret;
+ }
+
+ int DBBucket::remove_bucket_bypass_gc(int concurrent_max, bool
+ keep_index_consistent,
+ optional_yield y, const
+ DoutPrefixProvider *dpp) {
+ return 0;
+ }
+
+ int DBBucket::load_bucket(const DoutPrefixProvider *dpp, optional_yield y, bool get_stats)
+ {
+ int ret = 0;
+
+ ret = store->getDB()->get_bucket_info(dpp, string("name"), "", info, &attrs,
+ &mtime, &bucket_version);
+
+ return ret;
+ }
+
+ /* stats - Not for first pass */
+ int DBBucket::read_stats(const DoutPrefixProvider *dpp,
+ const bucket_index_layout_generation& idx_layout,
+ int shard_id,
+ std::string *bucket_ver, std::string *master_ver,
+ std::map<RGWObjCategory, RGWStorageStats>& stats,
+ std::string *max_marker, bool *syncstopped)
+ {
+ return 0;
+ }
+
+ int DBBucket::read_stats_async(const DoutPrefixProvider *dpp, const bucket_index_layout_generation& idx_layout, int shard_id, RGWGetBucketStats_CB *ctx)
+ {
+ return 0;
+ }
+
+ int DBBucket::sync_user_stats(const DoutPrefixProvider *dpp, optional_yield y)
+ {
+ return 0;
+ }
+
+ int DBBucket::update_container_stats(const DoutPrefixProvider *dpp)
+ {
+ return 0;
+ }
+
+ int DBBucket::check_bucket_shards(const DoutPrefixProvider *dpp)
+ {
+ return 0;
+ }
+
+ int DBBucket::chown(const DoutPrefixProvider *dpp, User& new_user, optional_yield y)
+ {
+ int ret;
+
+ ret = store->getDB()->update_bucket(dpp, "owner", info, false, &(new_user.get_id()), nullptr, nullptr, nullptr);
+ return ret;
+ }
+
+ int DBBucket::put_info(const DoutPrefixProvider *dpp, bool exclusive, ceph::real_time _mtime)
+ {
+ int ret;
+
+ ret = store->getDB()->update_bucket(dpp, "info", info, exclusive, nullptr, nullptr, &_mtime, &info.objv_tracker);
+
+ return ret;
+
+ }
+
+ /* Make sure to call get_bucket_info() if you need it first */
+ bool DBBucket::is_owner(User* user)
+ {
+ return (info.owner.compare(user->get_id()) == 0);
+ }
+
+ int DBBucket::check_empty(const DoutPrefixProvider *dpp, optional_yield y)
+ {
+ /* XXX: Check if bucket contains any objects */
+ return 0;
+ }
+
+ int DBBucket::check_quota(const DoutPrefixProvider *dpp, RGWQuota& quota, uint64_t obj_size,
+ optional_yield y, bool check_size_only)
+ {
+ /* Not Handled in the first pass as stats are also needed */
+ return 0;
+ }
+
+ /// Merge `new_attrs` into the cached bucket attrs (new values win) and
+ /// persist them via a non-exclusive "attrs" bucket update.
+ int DBBucket::merge_and_store_attrs(const DoutPrefixProvider *dpp, Attrs& new_attrs, optional_yield y)
+ {
+   for (const auto& [name, value] : new_attrs) {
+     attrs[name] = value;
+   }
+
+   /* XXX: handle has_instance_obj like in set_bucket_instance_attrs() */
+
+   return store->getDB()->update_bucket(dpp, "attrs", info, false, nullptr,
+                                        &new_attrs, nullptr,
+                                        &get_info().objv_tracker);
+ }
+
+ /// Re-read this bucket's info/attrs by name from the DB backend,
+ /// reporting the stored mtime through `pmtime`.
+ int DBBucket::try_refresh_info(const DoutPrefixProvider *dpp, ceph::real_time *pmtime)
+ {
+   return store->getDB()->get_bucket_info(dpp, string("name"), "", info,
+                                          &attrs, pmtime, &bucket_version);
+ }
+
+ /* XXX: usage and stats not supported in the first pass */
+ int DBBucket::read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch,
+ uint32_t max_entries, bool *is_truncated,
+ RGWUsageIter& usage_iter,
+ map<rgw_user_bucket, rgw_usage_log_entry>& usage)
+ {
+ return 0;
+ }
+
+ int DBBucket::trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch)
+ {
+ return 0;
+ }
+
+ int DBBucket::remove_objs_from_index(const DoutPrefixProvider *dpp, std::list<rgw_obj_index_key>& objs_to_unlink)
+ {
+ /* XXX: CHECK: Unlike RadosStore, there is no seperate bucket index table.
+ * Delete all the object in the list from the object table of this
+ * bucket
+ */
+ return 0;
+ }
+
+ int DBBucket::check_index(const DoutPrefixProvider *dpp, std::map<RGWObjCategory, RGWStorageStats>& existing_stats, std::map<RGWObjCategory, RGWStorageStats>& calculated_stats)
+ {
+ /* XXX: stats not supported yet */
+ return 0;
+ }
+
+ int DBBucket::rebuild_index(const DoutPrefixProvider *dpp)
+ {
+ /* there is no index table in dbstore. Not applicable */
+ return 0;
+ }
+
+ int DBBucket::set_tag_timeout(const DoutPrefixProvider *dpp, uint64_t timeout)
+ {
+ /* XXX: CHECK: set tag timeout for all the bucket objects? */
+ return 0;
+ }
+
+ int DBBucket::purge_instance(const DoutPrefixProvider *dpp)
+ {
+ /* XXX: CHECK: for dbstore only single instance supported.
+ * Remove all the objects for that instance? Anything extra needed?
+ */
+ return 0;
+ }
+
+ /// Install `acl` as this bucket's policy: cache it, encode it into the
+ /// RGW_ATTR_ACL xattr, and persist attrs + owner in one bucket update.
+ int DBBucket::set_acl(const DoutPrefixProvider *dpp, RGWAccessControlPolicy &acl, optional_yield y)
+ {
+   bufferlist encoded_acl;
+
+   acls = acl;
+   acl.encode(encoded_acl);
+
+   Attrs attrs = get_attrs();
+   attrs[RGW_ATTR_ACL] = encoded_acl;
+
+   return store->getDB()->update_bucket(dpp, "attrs", info, false,
+                                        &(acl.get_owner().get_id()), &attrs,
+                                        nullptr, nullptr);
+ }
+
+ std::unique_ptr<Object> DBBucket::get_object(const rgw_obj_key& k)
+ {
+ return std::make_unique<DBObject>(this->store, k, this);
+ }
+
+ /// List objects in this bucket through DB::Bucket::List, copying the
+ /// caller's ListParams into the backend list op. On success the next
+ /// marker is published both in `results` and back into `params` so the
+ /// caller can resume pagination.
+ /// Fix: removed a duplicated `list_op.params.ns = params.ns;` and a
+ /// duplicated `results.objs.clear();` — both were dead repeats.
+ int DBBucket::list(const DoutPrefixProvider *dpp, ListParams& params, int max, ListResults& results, optional_yield y)
+ {
+   int ret = 0;
+
+   results.objs.clear();
+
+   DB::Bucket target(store->getDB(), get_info());
+   DB::Bucket::List list_op(&target);
+
+   list_op.params.prefix = params.prefix;
+   list_op.params.delim = params.delim;
+   list_op.params.marker = params.marker;
+   list_op.params.ns = params.ns;
+   list_op.params.end_marker = params.end_marker;
+   list_op.params.enforce_ns = params.enforce_ns;
+   list_op.params.access_list_filter = params.access_list_filter;
+   list_op.params.force_check_filter = params.force_check_filter;
+   list_op.params.list_versions = params.list_versions;
+   list_op.params.allow_unordered = params.allow_unordered;
+
+   ret = list_op.list_objects(dpp, max, &results.objs, &results.common_prefixes, &results.is_truncated);
+   if (ret >= 0) {
+     results.next_marker = list_op.get_next_marker();
+     params.marker = results.next_marker;
+   }
+
+   return ret;
+ }
+
+ std::unique_ptr<MultipartUpload> DBBucket::get_multipart_upload(
+ const std::string& oid,
+ std::optional<std::string> upload_id,
+ ACLOwner owner, ceph::real_time mtime) {
+ return std::make_unique<DBMultipartUpload>(this->store, this, oid, upload_id,
+ std::move(owner), mtime);
+ }
+
+ int DBBucket::list_multiparts(const DoutPrefixProvider *dpp,
+ const string& prefix,
+ string& marker,
+ const string& delim,
+ const int& max_uploads,
+ vector<std::unique_ptr<MultipartUpload>>& uploads,
+ map<string, bool> *common_prefixes,
+ bool *is_truncated) {
+ return 0;
+ }
+
+ int DBBucket::abort_multiparts(const DoutPrefixProvider* dpp,
+ CephContext* cct) {
+ return 0;
+ }
+
+ void DBStore::finalize(void)
+ {
+ if (dbsm)
+ dbsm->destroyAllHandles();
+ }
+
+ /// Return the zonegroup's first endpoint; if it has none, fall back to
+ /// the master zone's first endpoint; otherwise return the cached empty
+ /// string (a member, so the returned reference stays valid).
+ const std::string& DBZoneGroup::get_endpoint() const {
+   if (!group->endpoints.empty()) {
+     return group->endpoints.front();
+   }
+   // use zonegroup's master zone endpoints
+   auto z = group->zones.find(group->master_zone);
+   if (z != group->zones.end() && !z->second.endpoints.empty()) {
+     return z->second.endpoints.front();
+   }
+   return empty;
+ }
+
+ bool DBZoneGroup::placement_target_exists(std::string& target) const {
+ return !!group->placement_targets.count(target);
+ }
+
+ void DBZoneGroup::get_placement_target_names(std::set<std::string>& names) const {
+ for (const auto& target : group->placement_targets) {
+ names.emplace(target.second.name);
+ }
+ }
+
+ ZoneGroup& DBZone::get_zonegroup()
+ {
+ return *zonegroup;
+ }
+
+ const RGWZoneParams& DBZone::get_rgw_params()
+ {
+ return *zone_params;
+ }
+
+ const std::string& DBZone::get_id()
+ {
+ return zone_params->get_id();
+ }
+
+
+ const std::string& DBZone::get_name() const
+ {
+ return zone_params->get_name();
+ }
+
+ bool DBZone::is_writeable()
+ {
+ return true;
+ }
+
+ bool DBZone::get_redirect_endpoint(std::string* endpoint)
+ {
+ return false;
+ }
+
+ bool DBZone::has_zonegroup_api(const std::string& api) const
+ {
+ return false;
+ }
+
+ const std::string& DBZone::get_current_period_id()
+ {
+ return current_period->get_id();
+ }
+
+ const RGWAccessKey& DBZone::get_system_key()
+ {
+ return zone_params->system_key;
+ }
+
+ const std::string& DBZone::get_realm_name()
+ {
+ return realm->get_name();
+ }
+
+ const std::string& DBZone::get_realm_id()
+ {
+ return realm->get_id();
+ }
+
+ RGWBucketSyncPolicyHandlerRef DBZone::get_sync_policy_handler()
+ {
+ return nullptr;
+ }
+
+ std::unique_ptr<LuaManager> DBStore::get_lua_manager()
+ {
+ return std::make_unique<DBLuaManager>(this);
+ }
+
+ /// Fetch the backend object state and copy it into this object's
+ /// cached `state`, returning a pointer to that cache via *pstate.
+ /// The obj key, atomic flag and prefetch flag set on the SAL object
+ /// are deliberately preserved across the copy.
+ int DBObject::get_obj_state(const DoutPrefixProvider* dpp, RGWObjState **pstate, optional_yield y, bool follow_olh)
+ {
+   RGWObjState* astate;
+   DB::Object op_target(store->getDB(), get_bucket()->get_info(), get_obj());
+   int ret = op_target.get_obj_state(dpp, get_bucket()->get_info(), get_obj(), follow_olh, &astate);
+   if (ret < 0) {
+     return ret;
+   }
+
+   /* Don't overwrite obj, atomic, or prefetch */
+   // Save the locally-set fields before the wholesale state assignment
+   // below clobbers them, then restore afterwards.
+   rgw_obj obj = get_obj();
+   bool is_atomic = state.is_atomic;
+   bool prefetch_data = state.prefetch_data;
+
+   state = *astate;
+   *pstate = &state;
+
+   state.obj = obj;
+   state.is_atomic = is_atomic;
+   state.prefetch_data = prefetch_data;
+   return ret;
+ }
+
+ int DBObject::read_attrs(const DoutPrefixProvider* dpp, DB::Object::Read &read_op, optional_yield y, rgw_obj* target_obj)
+ {
+ read_op.params.attrs = &state.attrset;
+ read_op.params.target_obj = target_obj;
+ read_op.params.obj_size = &state.size;
+ read_op.params.lastmod = &state.mtime;
+
+ return read_op.prepare(dpp);
+ }
+
+ int DBObject::set_obj_attrs(const DoutPrefixProvider* dpp, Attrs* setattrs, Attrs* delattrs, optional_yield y)
+ {
+ Attrs empty;
+ DB::Object op_target(store->getDB(),
+ get_bucket()->get_info(), get_obj());
+ return op_target.set_attrs(dpp, setattrs ? *setattrs : empty, delattrs);
+ }
+
+ int DBObject::get_obj_attrs(optional_yield y, const DoutPrefixProvider* dpp, rgw_obj* target_obj)
+ {
+ DB::Object op_target(store->getDB(), get_bucket()->get_info(), get_obj());
+ DB::Object::Read read_op(&op_target);
+
+ return read_attrs(dpp, read_op, y, target_obj);
+ }
+
+ /// Read-modify-write a single object attribute: refresh the attr set
+ /// from the backend, overwrite `attr_name`, and persist the full set.
+ int DBObject::modify_obj_attrs(const char* attr_name, bufferlist& attr_val, optional_yield y, const DoutPrefixProvider* dpp)
+ {
+   rgw_obj obj = get_obj();
+   const int ret = get_obj_attrs(y, dpp, &obj);
+   if (ret < 0) {
+     return ret;
+   }
+
+   set_atomic();
+   state.attrset[attr_name] = attr_val;
+   return set_obj_attrs(dpp, &state.attrset, nullptr, y);
+ }
+
+ int DBObject::delete_obj_attrs(const DoutPrefixProvider* dpp, const char* attr_name, optional_yield y)
+ {
+ Attrs rmattr;
+ bufferlist bl;
+
+ set_atomic();
+ rmattr[attr_name] = bl;
+ return set_obj_attrs(dpp, nullptr, &rmattr, y);
+ }
+
+ bool DBObject::is_expired() {
+ return false;
+ }
+
+ void DBObject::gen_rand_obj_instance_name()
+ {
+ store->getDB()->gen_rand_obj_instance_name(&state.obj.key);
+ }
+
+
+ int DBObject::omap_get_vals(const DoutPrefixProvider *dpp, const std::string& marker, uint64_t count,
+ std::map<std::string, bufferlist> *m,
+ bool* pmore, optional_yield y)
+ {
+ DB::Object op_target(store->getDB(),
+ get_bucket()->get_info(), get_obj());
+ return op_target.obj_omap_get_vals(dpp, marker, count, m, pmore);
+ }
+
+ int DBObject::omap_get_all(const DoutPrefixProvider *dpp, std::map<std::string, bufferlist> *m,
+ optional_yield y)
+ {
+ DB::Object op_target(store->getDB(),
+ get_bucket()->get_info(), get_obj());
+ return op_target.obj_omap_get_all(dpp, m);
+ }
+
+ int DBObject::omap_get_vals_by_keys(const DoutPrefixProvider *dpp, const std::string& oid,
+ const std::set<std::string>& keys,
+ Attrs* vals)
+ {
+ DB::Object op_target(store->getDB(),
+ get_bucket()->get_info(), get_obj());
+ return op_target.obj_omap_get_vals_by_keys(dpp, oid, keys, vals);
+ }
+
+ int DBObject::omap_set_val_by_key(const DoutPrefixProvider *dpp, const std::string& key, bufferlist& val,
+ bool must_exist, optional_yield y)
+ {
+ DB::Object op_target(store->getDB(),
+ get_bucket()->get_info(), get_obj());
+ return op_target.obj_omap_set_val_by_key(dpp, key, val, must_exist);
+ }
+
+ int DBObject::chown(User& new_user, const DoutPrefixProvider* dpp, optional_yield y)
+ {
+ return 0;
+ }
+
+ std::unique_ptr<MPSerializer> DBObject::get_serializer(const DoutPrefixProvider *dpp,
+ const std::string& lock_name)
+ {
+ return std::make_unique<MPDBSerializer>(dpp, store, this, lock_name);
+ }
+
+ int DBObject::transition(Bucket* bucket,
+ const rgw_placement_rule& placement_rule,
+ const real_time& mtime,
+ uint64_t olh_epoch,
+ const DoutPrefixProvider* dpp,
+ optional_yield y)
+ {
+ DB::Object op_target(store->getDB(),
+ get_bucket()->get_info(), get_obj());
+ return op_target.transition(dpp, placement_rule, mtime, olh_epoch);
+ }
+
+ bool DBObject::placement_rules_match(rgw_placement_rule& r1, rgw_placement_rule& r2)
+ {
+ /* XXX: support single default zone and zonegroup for now */
+ return true;
+ }
+
+ int DBObject::dump_obj_layout(const DoutPrefixProvider *dpp, optional_yield y, Formatter* f)
+ {
+ return 0;
+ }
+
+ std::unique_ptr<Object::ReadOp> DBObject::get_read_op()
+ {
+ return std::make_unique<DBObject::DBReadOp>(this, nullptr);
+ }
+
+ DBObject::DBReadOp::DBReadOp(DBObject *_source, RGWObjectCtx *_rctx) :
+ source(_source),
+ rctx(_rctx),
+ op_target(_source->store->getDB(),
+ _source->get_bucket()->get_info(),
+ _source->get_obj()),
+ parent_op(&op_target)
+ { }
+
+ /// Prepare the read: copy the SAL-level conditions/params into the
+ /// backend read op, run its prepare, and on success publish the
+ /// resolved key and object size back onto the source SAL object.
+ int DBObject::DBReadOp::prepare(optional_yield y, const DoutPrefixProvider* dpp)
+ {
+   // Local out-param; filled by parent_op.prepare() below.
+   uint64_t obj_size;
+
+   parent_op.conds.mod_ptr = params.mod_ptr;
+   parent_op.conds.unmod_ptr = params.unmod_ptr;
+   parent_op.conds.high_precision_time = params.high_precision_time;
+   parent_op.conds.mod_zone_id = params.mod_zone_id;
+   parent_op.conds.mod_pg_ver = params.mod_pg_ver;
+   parent_op.conds.if_match = params.if_match;
+   parent_op.conds.if_nomatch = params.if_nomatch;
+   parent_op.params.lastmod = params.lastmod;
+   parent_op.params.target_obj = params.target_obj;
+   parent_op.params.obj_size = &obj_size;
+   // Attrs are written directly into the source object's attr cache.
+   parent_op.params.attrs = &source->get_attrs();
+
+   int ret = parent_op.prepare(dpp);
+   if (ret < 0)
+     return ret;
+
+   // Reflect what the backend resolved (e.g. version instance) and the
+   // on-disk size onto the SAL object.
+   source->set_key(parent_op.state.obj.key);
+   source->set_obj_size(obj_size);
+
+   return ret;
+ }
+
+ int DBObject::DBReadOp::read(int64_t ofs, int64_t end, bufferlist& bl, optional_yield y, const DoutPrefixProvider* dpp)
+ {
+ return parent_op.read(ofs, end, bl, dpp);
+ }
+
+ int DBObject::DBReadOp::get_attr(const DoutPrefixProvider* dpp, const char* name, bufferlist& dest, optional_yield y)
+ {
+ return parent_op.get_attr(dpp, name, dest);
+ }
+
+ std::unique_ptr<Object::DeleteOp> DBObject::get_delete_op()
+ {
+ return std::make_unique<DBObject::DBDeleteOp>(this);
+ }
+
+ DBObject::DBDeleteOp::DBDeleteOp(DBObject *_source) :
+ source(_source),
+ op_target(_source->store->getDB(),
+ _source->get_bucket()->get_info(),
+ _source->get_obj()),
+ parent_op(&op_target)
+ { }
+
+ int DBObject::DBDeleteOp::delete_obj(const DoutPrefixProvider* dpp, optional_yield y)
+ {
+ parent_op.params.bucket_owner = params.bucket_owner.get_id();
+ parent_op.params.versioning_status = params.versioning_status;
+ parent_op.params.obj_owner = params.obj_owner;
+ parent_op.params.olh_epoch = params.olh_epoch;
+ parent_op.params.marker_version_id = params.marker_version_id;
+ parent_op.params.bilog_flags = params.bilog_flags;
+ parent_op.params.remove_objs = params.remove_objs;
+ parent_op.params.expiration_time = params.expiration_time;
+ parent_op.params.unmod_since = params.unmod_since;
+ parent_op.params.mtime = params.mtime;
+ parent_op.params.high_precision_time = params.high_precision_time;
+ parent_op.params.zones_trace = params.zones_trace;
+ parent_op.params.abortmp = params.abortmp;
+ parent_op.params.parts_accounted_size = params.parts_accounted_size;
+
+ int ret = parent_op.delete_obj(dpp);
+ if (ret < 0)
+ return ret;
+
+ result.delete_marker = parent_op.result.delete_marker;
+ result.version_id = parent_op.result.version_id;
+
+ return ret;
+ }
+
+ int DBObject::delete_object(const DoutPrefixProvider* dpp, optional_yield y, bool prevent_versioning)
+ {
+ DB::Object del_target(store->getDB(), bucket->get_info(), get_obj());
+ DB::Object::Delete del_op(&del_target);
+
+ del_op.params.bucket_owner = bucket->get_info().owner;
+ del_op.params.versioning_status = bucket->get_info().versioning_status();
+
+ return del_op.delete_obj(dpp);
+ }
+
+ int DBObject::delete_obj_aio(const DoutPrefixProvider* dpp, RGWObjState* astate,
+ Completions* aio, bool keep_index_consistent,
+ optional_yield y)
+ {
+ /* XXX: Make it async */
+ return 0;
+ }
+
+ int DBObject::copy_object(User* user,
+ req_info* info,
+ const rgw_zone_id& source_zone,
+ rgw::sal::Object* dest_object,
+ rgw::sal::Bucket* dest_bucket,
+ rgw::sal::Bucket* src_bucket,
+ const rgw_placement_rule& dest_placement,
+ ceph::real_time* src_mtime,
+ ceph::real_time* mtime,
+ const ceph::real_time* mod_ptr,
+ const ceph::real_time* unmod_ptr,
+ bool high_precision_time,
+ const char* if_match,
+ const char* if_nomatch,
+ AttrsMod attrs_mod,
+ bool copy_if_newer,
+ Attrs& attrs,
+ RGWObjCategory category,
+ uint64_t olh_epoch,
+ boost::optional<ceph::real_time> delete_at,
+ std::string* version_id,
+ std::string* tag,
+ std::string* etag,
+ void (*progress_cb)(off_t, void *),
+ void* progress_data,
+ const DoutPrefixProvider* dpp,
+ optional_yield y)
+ {
+ return 0;
+ }
+
+ int DBObject::DBReadOp::iterate(const DoutPrefixProvider* dpp, int64_t ofs, int64_t end, RGWGetDataCB* cb, optional_yield y)
+ {
+ return parent_op.iterate(dpp, ofs, end, cb);
+ }
+
+ int DBObject::swift_versioning_restore(bool& restored,
+ const DoutPrefixProvider* dpp)
+ {
+ return 0;
+ }
+
int DBObject::swift_versioning_copy(const DoutPrefixProvider* dpp,
    optional_yield y)
{
  // Swift object versioning is not supported by dbstore; stub succeeds.
  return 0;
}
+
int DBMultipartUpload::abort(const DoutPrefixProvider *dpp, CephContext *cct)
{
  // Abort the upload by deleting its meta object; part data is keyed to
  // the meta object in the DB, so this drops everything uploaded so far.
  std::unique_ptr<rgw::sal::Object> meta_obj = get_meta_obj();
  meta_obj->set_in_extra_data(true);
  meta_obj->set_hash_source(mp_obj.get_key());
  int ret;

  std::unique_ptr<rgw::sal::Object::DeleteOp> del_op = meta_obj->get_delete_op();
  del_op->params.bucket_owner = bucket->get_acl_owner();
  del_op->params.versioning_status = 0;

  // Since the data objects are associated with meta obj till
  // MultipartUpload::Complete() is done, removing the metadata obj
  // should remove all the uploads so far.
  ret = del_op->delete_obj(dpp, null_yield);
  if (ret < 0) {
    ldpp_dout(dpp, 20) << __func__ << ": del_op.delete_obj returned " <<
      ret << dendl;
  }
  // Map "already gone" to the S3 NoSuchUpload error for callers.
  return (ret == -ENOENT) ? -ERR_NO_SUCH_UPLOAD : ret;
}
+
// Object namespace used for multipart meta/part entries (keeps them out
// of the bucket's main listing namespace).
static string mp_ns = RGW_OBJ_NS_MULTIPART;
+
std::unique_ptr<rgw::sal::Object> DBMultipartUpload::get_meta_obj()
{
  // The meta object lives under the multipart namespace, keyed by get_meta().
  return bucket->get_object(rgw_obj_key(get_meta(), string(), mp_ns));
}
+
int DBMultipartUpload::init(const DoutPrefixProvider *dpp, optional_yield y, ACLOwner& owner, rgw_placement_rule& dest_placement, rgw::sal::Attrs& attrs)
{
  // Start a new multipart upload: generate a random v2 upload id, then
  // exclusively create the meta object carrying the multipart_upload_info.
  int ret;
  std::string oid = mp_obj.get_key();

  char buf[33];
  std::unique_ptr<rgw::sal::Object> obj; // create meta obj
  gen_rand_alphanumeric(store->ctx(), buf, sizeof(buf) - 1);
  std::string upload_id = MULTIPART_UPLOAD_ID_PREFIX; /* v2 upload id */
  upload_id.append(buf);

  mp_obj.init(oid, upload_id);
  obj = get_meta_obj();

  DB::Object op_target(store->getDB(), obj->get_bucket()->get_info(),
      obj->get_obj());
  DB::Object::Write obj_op(&op_target);

  /* Create meta object */
  obj_op.meta.owner = owner.get_id();
  obj_op.meta.category = RGWObjCategory::MultiMeta;
  obj_op.meta.flags = PUT_OBJ_CREATE_EXCL; // fail if the meta obj already exists
  obj_op.meta.mtime = &mtime;

  multipart_upload_info upload_info;
  upload_info.dest_placement = dest_placement;

  // The serialized upload_info becomes the meta object's head data.
  bufferlist bl;
  encode(upload_info, bl);
  obj_op.meta.data = &bl;
  ret = obj_op.prepare(dpp);
  if (ret < 0)
    return ret;
  ret = obj_op.write_meta(dpp, bl.length(), bl.length(), attrs);

  return ret;
}
+
int DBMultipartUpload::list_parts(const DoutPrefixProvider *dpp, CephContext *cct,
    int num_parts, int marker,
    int *next_marker, bool *truncated,
    bool assume_unsorted)
{
  // Load up to 'num_parts' part records with part-number > 'marker' into
  // this->parts; reports truncation and the next paging marker.
  std::list<RGWUploadPartInfo> parts_map;

  std::unique_ptr<rgw::sal::Object> obj = get_meta_obj();

  parts.clear();
  int ret;

  DB::Object op_target(store->getDB(),
      obj->get_bucket()->get_info(), obj->get_obj());
  ret = op_target.get_mp_parts_list(dpp, parts_map);
  if (ret < 0) {
    return ret;
  }

  int last_num = 0;

  // Keep only entries past the marker; the map orders them by part number.
  while (!parts_map.empty()) {
    std::unique_ptr<DBMultipartPart> part = std::make_unique<DBMultipartPart>();
    RGWUploadPartInfo &pinfo = parts_map.front();
    part->set_info(pinfo);
    if ((int)pinfo.num > marker) {
      last_num = pinfo.num;
      parts[pinfo.num] = std::move(part);
    }
    parts_map.pop_front();
  }

  /* rebuild a map with only num_parts entries */
  std::map<uint32_t, std::unique_ptr<MultipartPart>> new_parts;
  std::map<uint32_t, std::unique_ptr<MultipartPart>>::iterator piter;
  int i;
  for (i = 0, piter = parts.begin();
      i < num_parts && piter != parts.end();
      ++i, ++piter) {
    last_num = piter->first;
    new_parts[piter->first] = std::move(piter->second);
  }

  if (truncated) {
    *truncated = (piter != parts.end());
  }

  parts.swap(new_parts);

  if (next_marker) {
    *next_marker = last_num;
  }

  return 0;
}
+
+ int DBMultipartUpload::complete(const DoutPrefixProvider *dpp,
+ optional_yield y, CephContext* cct,
+ map<int, string>& part_etags,
+ list<rgw_obj_index_key>& remove_objs,
+ uint64_t& accounted_size, bool& compressed,
+ RGWCompressionInfo& cs_info, off_t& ofs,
+ std::string& tag, ACLOwner& owner,
+ uint64_t olh_epoch,
+ rgw::sal::Object* target_obj)
+ {
+ char final_etag[CEPH_CRYPTO_MD5_DIGESTSIZE];
+ char final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16];
+ std::string etag;
+ bufferlist etag_bl;
+ MD5 hash;
+ bool truncated;
+ int ret;
+
+ int total_parts = 0;
+ int handled_parts = 0;
+ int max_parts = 1000;
+ int marker = 0;
+ uint64_t min_part_size = cct->_conf->rgw_multipart_min_part_size;
+ auto etags_iter = part_etags.begin();
+ rgw::sal::Attrs attrs = target_obj->get_attrs();
+
+ ofs = 0;
+ accounted_size = 0;
+ do {
+ ret = list_parts(dpp, cct, max_parts, marker, &marker, &truncated);
+ if (ret == -ENOENT) {
+ ret = -ERR_NO_SUCH_UPLOAD;
+ }
+ if (ret < 0)
+ return ret;
+
+ total_parts += parts.size();
+ if (!truncated && total_parts != (int)part_etags.size()) {
+ ldpp_dout(dpp, 0) << "NOTICE: total parts mismatch: have: " << total_parts
+ << " expected: " << part_etags.size() << dendl;
+ ret = -ERR_INVALID_PART;
+ return ret;
+ }
+
+ for (auto obj_iter = parts.begin(); etags_iter != part_etags.end() && obj_iter != parts.end(); ++etags_iter, ++obj_iter, ++handled_parts) {
+ DBMultipartPart* part = dynamic_cast<rgw::sal::DBMultipartPart*>(obj_iter->second.get());
+ uint64_t part_size = part->get_size();
+ if (handled_parts < (int)part_etags.size() - 1 &&
+ part_size < min_part_size) {
+ ret = -ERR_TOO_SMALL;
+ return ret;
+ }
+
+ char petag[CEPH_CRYPTO_MD5_DIGESTSIZE];
+ if (etags_iter->first != (int)obj_iter->first) {
+ ldpp_dout(dpp, 0) << "NOTICE: parts num mismatch: next requested: "
+ << etags_iter->first << " next uploaded: "
+ << obj_iter->first << dendl;
+ ret = -ERR_INVALID_PART;
+ return ret;
+ }
+ string part_etag = rgw_string_unquote(etags_iter->second);
+ if (part_etag.compare(part->get_etag()) != 0) {
+ ldpp_dout(dpp, 0) << "NOTICE: etag mismatch: part: " << etags_iter->first
+ << " etag: " << etags_iter->second << dendl;
+ ret = -ERR_INVALID_PART;
+ return ret;
+ }
+
+ hex_to_buf(part->get_etag().c_str(), petag,
+ CEPH_CRYPTO_MD5_DIGESTSIZE);
+ hash.Update((const unsigned char *)petag, sizeof(petag));
+
+ RGWUploadPartInfo& obj_part = part->get_info();
+
+ ofs += obj_part.size;
+ accounted_size += obj_part.accounted_size;
+ }
+ } while (truncated);
+ hash.Final((unsigned char *)final_etag);
+
+ buf_to_hex((unsigned char *)final_etag, sizeof(final_etag), final_etag_str);
+ snprintf(&final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2],
+ sizeof(final_etag_str) - CEPH_CRYPTO_MD5_DIGESTSIZE * 2,
+ "-%lld", (long long)part_etags.size());
+ etag = final_etag_str;
+ ldpp_dout(dpp, 10) << "calculated etag: " << etag << dendl;
+
+ etag_bl.append(etag);
+
+ attrs[RGW_ATTR_ETAG] = etag_bl;
+
+ /* XXX: handle compression ? */
+
+ /* Rename all the object data entries with original object name (i.e
+ * from 'head_obj.name + "." + upload_id' to head_obj.name) */
+
+ /* Original head object */
+ DB::Object op_target(store->getDB(),
+ target_obj->get_bucket()->get_info(),
+ target_obj->get_obj(), get_upload_id());
+ DB::Object::Write obj_op(&op_target);
+ ret = obj_op.prepare(dpp);
+
+ obj_op.meta.owner = owner.get_id();
+ obj_op.meta.flags = PUT_OBJ_CREATE;
+ obj_op.meta.category = RGWObjCategory::Main;
+ obj_op.meta.modify_tail = true;
+ obj_op.meta.completeMultipart = true;
+
+ ret = obj_op.write_meta(dpp, ofs, accounted_size, attrs);
+ if (ret < 0)
+ return ret;
+
+ /* No need to delete Meta obj here. It is deleted from sal */
+ return ret;
+ }
+
int DBMultipartUpload::get_info(const DoutPrefixProvider *dpp, optional_yield y, rgw_placement_rule** rule, rgw::sal::Attrs* attrs)
{
  // Return the upload's placement rule and/or attributes.  Placement is
  // cached in 'placement' after the first successful read; attrs always
  // require reading the multipart meta object.
  if (!rule && !attrs) {
    return 0;
  }

  if (rule) {
    if (!placement.empty()) {
      *rule = &placement;
      if (!attrs) {
        /* Don't need attrs, done */
        return 0;
      }
    } else {
      *rule = nullptr;
    }
  }

  /* We need either attributes or placement, so we need a read */
  std::unique_ptr<rgw::sal::Object> meta_obj;
  meta_obj = get_meta_obj();
  meta_obj->set_in_extra_data(true);

  multipart_upload_info upload_info;
  bufferlist headbl;

  /* Read the obj head which contains the multipart_upload_info */
  std::unique_ptr<rgw::sal::Object::ReadOp> read_op = meta_obj->get_read_op();
  int ret = read_op->prepare(y, dpp);
  if (ret < 0) {
    if (ret == -ENOENT) {
      // A missing meta object means the upload doesn't exist (or completed).
      return -ERR_NO_SUCH_UPLOAD;
    }
    return ret;
  }

  if (attrs) {
    /* Attrs are filled in by prepare */
    *attrs = meta_obj->get_attrs();
    if (!rule || *rule != nullptr) {
      /* placement was cached; don't actually read */
      return 0;
    }
  }

  /* Now read the placement from the head */
  ret = read_op->read(0, store->getDB()->get_max_head_size(), headbl, y, dpp);
  if (ret < 0) {
    if (ret == -ENOENT) {
      return -ERR_NO_SUCH_UPLOAD;
    }
    return ret;
  }

  if (headbl.length() <= 0) {
    return -ERR_NO_SUCH_UPLOAD;
  }

  /* Decode multipart_upload_info */
  auto hiter = headbl.cbegin();
  try {
    decode(upload_info, hiter);
  } catch (buffer::error& err) {
    ldpp_dout(dpp, 0) << "ERROR: failed to decode multipart upload info" << dendl;
    return -EIO;
  }
  // Cache the placement for subsequent calls.
  placement = upload_info.dest_placement;
  *rule = &placement;

  return 0;
}
+
std::unique_ptr<Writer> DBMultipartUpload::get_writer(
    const DoutPrefixProvider *dpp,
    optional_yield y,
    rgw::sal::Object* obj,
    const rgw_user& owner,
    const rgw_placement_rule *ptail_placement_rule,
    uint64_t part_num,
    const std::string& part_num_str)
{
  // Hand out a part writer bound to this upload and part number.
  return std::make_unique<DBMultipartWriter>(dpp, y, this, obj, store, owner,
      ptail_placement_rule, part_num, part_num_str);
}
+
// Part writer for one multipart part.  The part's backing oid is
// "<head_obj>.<upload_id>.<part_num>"; writes go through a DB write op
// scoped to the head object + upload id.
DBMultipartWriter::DBMultipartWriter(const DoutPrefixProvider *dpp,
    optional_yield y,
    MultipartUpload* upload,
    rgw::sal::Object* obj,
    DBStore* _driver,
    const rgw_user& _owner,
    const rgw_placement_rule *_ptail_placement_rule,
    uint64_t _part_num, const std::string& _part_num_str):
  StoreWriter(dpp, y),
  store(_driver),
  owner(_owner),
  ptail_placement_rule(_ptail_placement_rule),
  head_obj(obj),
  upload_id(upload->get_upload_id()),
  part_num(_part_num),
  oid(head_obj->get_name() + "." + upload_id +
      "." + std::to_string(part_num)),
  meta_obj(((DBMultipartUpload*)upload)->get_meta_obj()),
  op_target(_driver->getDB(), head_obj->get_bucket()->get_info(), head_obj->get_obj(), upload_id),
  parent_op(&op_target),
  part_num_str(_part_num_str) {}
+
+ int DBMultipartWriter::prepare(optional_yield y)
+ {
+ parent_op.prepare(NULL);
+ parent_op.set_mp_part_str(upload_id + "." + std::to_string(part_num));
+ // XXX: do we need to handle part_num_str??
+ return 0;
+ }
+
int DBMultipartWriter::process(bufferlist&& data, uint64_t offset)
{
  // Buffer incoming part data and flush it to the store in max_chunk_size
  // pieces; an empty 'data' means "flush whatever remains buffered".
  /* XXX: same as AtomicWriter..consolidate code */
  total_data_size += data.length();

  /* XXX: Optimize all bufferlist copies in this function */

  /* copy head_data into meta. But for multipart we do not
   * need to write head_data */
  uint64_t max_chunk_size = store->getDB()->get_max_chunk_size();
  int excess_size = 0;

  /* Accumulate tail_data till max_chunk_size or flush op */
  bufferlist tail_data;

  if (data.length() != 0) {
    parent_op.meta.data = &head_data; /* Null data ?? */

    /* handle tail parts.
     * First accumulate and write data into dbstore in its chunk_size
     * parts
     */
    if (!tail_part_size) { /* new tail part */
      tail_part_offset = offset;
    }
    data.begin(0).copy(data.length(), tail_data);
    tail_part_size += tail_data.length();
    tail_part_data.append(tail_data);

    if (tail_part_size < max_chunk_size) {
      // Not enough buffered yet; wait for more data (or the final flush).
      return 0;
    } else {
      int write_ofs = 0;
      while (tail_part_size >= max_chunk_size) {
        excess_size = tail_part_size - max_chunk_size;
        bufferlist tmp;
        tail_part_data.begin(write_ofs).copy(max_chunk_size, tmp);
        /* write tail objects data */
        int ret = parent_op.write_data(dpp, tmp, tail_part_offset);

        if (ret < 0) {
          return ret;
        }

        tail_part_size -= max_chunk_size;
        write_ofs += max_chunk_size;
        tail_part_offset += max_chunk_size;
      }
      /* reset tail parts or update if excess data */
      if (excess_size > 0) { /* wrote max_chunk_size data */
        // Keep the leftover bytes buffered for the next chunk.
        tail_part_size = excess_size;
        bufferlist tmp;
        tail_part_data.begin(write_ofs).copy(excess_size, tmp);
        tail_part_data = tmp;
      } else {
        tail_part_size = 0;
        tail_part_data.clear();
        tail_part_offset = 0;
      }
    }
  } else {
    if (tail_part_size == 0) {
      return 0; /* nothing more to write */
    }

    /* flush whatever tail data is present */
    int ret = parent_op.write_data(dpp, tail_part_data, tail_part_offset);
    if (ret < 0) {
      return ret;
    }
    tail_part_size = 0;
    tail_part_data.clear();
    tail_part_offset = 0;
  }

  return 0;
}
+
+ int DBMultipartWriter::complete(size_t accounted_size, const std::string& etag,
+ ceph::real_time *mtime, ceph::real_time set_mtime,
+ std::map<std::string, bufferlist>& attrs,
+ ceph::real_time delete_at,
+ const char *if_match, const char *if_nomatch,
+ const std::string *user_data,
+ rgw_zone_set *zones_trace, bool *canceled,
+ optional_yield y)
+ {
+ int ret = 0;
+ /* XXX: same as AtomicWriter..consolidate code */
+ parent_op.meta.mtime = mtime;
+ parent_op.meta.delete_at = delete_at;
+ parent_op.meta.if_match = if_match;
+ parent_op.meta.if_nomatch = if_nomatch;
+ parent_op.meta.user_data = user_data;
+ parent_op.meta.zones_trace = zones_trace;
+
+ /* XXX: handle accounted size */
+ accounted_size = total_data_size;
+
+ if (ret < 0)
+ return ret;
+
+ RGWUploadPartInfo info;
+ info.num = part_num;
+ info.etag = etag;
+ info.size = total_data_size;
+ info.accounted_size = accounted_size;
+ info.modified = real_clock::now();
+ //info.manifest = manifest;
+
+ DB::Object op_target(store->getDB(),
+ meta_obj->get_bucket()->get_info(), meta_obj->get_obj());
+ ret = op_target.add_mp_part(dpp, info);
+ if (ret < 0) {
+ return ret == -ENOENT ? -ERR_NO_SUCH_UPLOAD : ret;
+ }
+
+ return 0;
+ }
+
// Atomic (single-object) writer: binds a DB write op to the target
// object's bucket/key so process()/complete() can stream data and then
// persist the head metadata in one logical put.
DBAtomicWriter::DBAtomicWriter(const DoutPrefixProvider *dpp,
    optional_yield y,
    rgw::sal::Object* _obj,
    DBStore* _driver,
    const rgw_user& _owner,
    const rgw_placement_rule *_ptail_placement_rule,
    uint64_t _olh_epoch,
    const std::string& _unique_tag) :
  StoreWriter(dpp, y),
  store(_driver),
  owner(_owner),
  ptail_placement_rule(_ptail_placement_rule),
  olh_epoch(_olh_epoch),
  unique_tag(_unique_tag),
  obj(_driver, _obj->get_key(), _obj->get_bucket()),
  op_target(_driver->getDB(), obj.get_bucket()->get_info(), obj.get_obj()),
  parent_op(&op_target) {}
+
int DBAtomicWriter::prepare(optional_yield y)
{
  // Prepare the underlying DB write op; errors propagate to the caller.
  return parent_op.prepare(NULL); /* send dpp */
}
+
int DBAtomicWriter::process(bufferlist&& data, uint64_t offset)
{
  // Stream object data: the first max_head_size bytes are held as head
  // data (written with the meta row); the rest is buffered and flushed
  // in max_chunk_size tail chunks.  Empty 'data' means "flush remainder".
  total_data_size += data.length();

  /* XXX: Optimize all bufferlist copies in this function */

  /* copy head_data into meta. */
  uint64_t head_size = store->getDB()->get_max_head_size();
  unsigned head_len = 0;
  uint64_t max_chunk_size = store->getDB()->get_max_chunk_size();
  int excess_size = 0;

  /* Accumulate tail_data till max_chunk_size or flush op */
  bufferlist tail_data;

  if (data.length() != 0) {
    if (offset < head_size) {
      /* XXX: handle case (if exists) where offset > 0 & < head_size */
      head_len = std::min((uint64_t)data.length(),
          head_size - offset);
      bufferlist tmp;
      data.begin(0).copy(head_len, tmp);
      head_data.append(tmp);

      parent_op.meta.data = &head_data;
      if (head_len == data.length()) {
        // Everything fit in the head; nothing to buffer as tail.
        return 0;
      }

      /* Move offset by copy_len */
      offset = head_len;
    }

    /* handle tail parts.
     * First accumulate and write data into dbstore in its chunk_size
     * parts
     */
    if (!tail_part_size) { /* new tail part */
      tail_part_offset = offset;
    }
    data.begin(head_len).copy(data.length() - head_len, tail_data);
    tail_part_size += tail_data.length();
    tail_part_data.append(tail_data);

    if (tail_part_size < max_chunk_size) {
      // Not enough buffered yet; wait for more data (or the final flush).
      return 0;
    } else {
      int write_ofs = 0;
      while (tail_part_size >= max_chunk_size) {
        excess_size = tail_part_size - max_chunk_size;
        bufferlist tmp;
        tail_part_data.begin(write_ofs).copy(max_chunk_size, tmp);
        /* write tail objects data */
        int ret = parent_op.write_data(dpp, tmp, tail_part_offset);

        if (ret < 0) {
          return ret;
        }

        tail_part_size -= max_chunk_size;
        write_ofs += max_chunk_size;
        tail_part_offset += max_chunk_size;
      }
      /* reset tail parts or update if excess data */
      if (excess_size > 0) { /* wrote max_chunk_size data */
        // Keep the leftover bytes buffered for the next chunk.
        tail_part_size = excess_size;
        bufferlist tmp;
        tail_part_data.begin(write_ofs).copy(excess_size, tmp);
        tail_part_data = tmp;
      } else {
        tail_part_size = 0;
        tail_part_data.clear();
        tail_part_offset = 0;
      }
    }
  } else {
    if (tail_part_size == 0) {
      return 0; /* nothing more to write */
    }

    /* flush whatever tail data is present */
    int ret = parent_op.write_data(dpp, tail_part_data, tail_part_offset);
    if (ret < 0) {
      return ret;
    }
    tail_part_size = 0;
    tail_part_data.clear();
    tail_part_offset = 0;
  }

  return 0;
}
+
int DBAtomicWriter::complete(size_t accounted_size, const std::string& etag,
    ceph::real_time *mtime, ceph::real_time set_mtime,
    std::map<std::string, bufferlist>& attrs,
    ceph::real_time delete_at,
    const char *if_match, const char *if_nomatch,
    const std::string *user_data,
    rgw_zone_set *zones_trace, bool *canceled,
    optional_yield y)
{
  // Finalize the atomic write: stamp the op metadata and persist the
  // object's head/meta row.  Tail data was already flushed by process().
  parent_op.meta.mtime = mtime;
  parent_op.meta.delete_at = delete_at;
  parent_op.meta.if_match = if_match;
  parent_op.meta.if_nomatch = if_nomatch;
  parent_op.meta.user_data = user_data;
  parent_op.meta.zones_trace = zones_trace;
  parent_op.meta.category = RGWObjCategory::Main;

  /* XXX: handle accounted size */
  // NOTE(review): the caller-supplied accounted_size is overwritten with
  // the byte count seen by process() — confirm intended.
  accounted_size = total_data_size;
  int ret = parent_op.write_meta(dpp, total_data_size, accounted_size, attrs);
  if (canceled) {
    // Report whether the write raced with (and lost to) a newer writer.
    *canceled = parent_op.meta.canceled;
  }

  return ret;

}
+
+ std::unique_ptr<RGWRole> DBStore::get_role(std::string name,
+ std::string tenant,
+ std::string path,
+ std::string trust_policy,
+ std::string max_session_duration_str,
+ std::multimap<std::string,std::string> tags)
+ {
+ RGWRole* p = nullptr;
+ return std::unique_ptr<RGWRole>(p);
+ }
+
+ std::unique_ptr<RGWRole> DBStore::get_role(std::string id)
+ {
+ RGWRole* p = nullptr;
+ return std::unique_ptr<RGWRole>(p);
+ }
+
+ std::unique_ptr<RGWRole> DBStore::get_role(const RGWRoleInfo& info)
+ {
+ RGWRole* p = nullptr;
+ return std::unique_ptr<RGWRole>(p);
+ }
+
int DBStore::get_roles(const DoutPrefixProvider *dpp,
    optional_yield y,
    const std::string& path_prefix,
    const std::string& tenant,
    vector<std::unique_ptr<RGWRole>>& roles)
{
  // IAM roles are not implemented for dbstore; returns an empty list.
  return 0;
}
+
+ std::unique_ptr<RGWOIDCProvider> DBStore::get_oidc_provider()
+ {
+ RGWOIDCProvider* p = nullptr;
+ return std::unique_ptr<RGWOIDCProvider>(p);
+ }
+
int DBStore::get_oidc_providers(const DoutPrefixProvider *dpp,
    const std::string& tenant,
    vector<std::unique_ptr<RGWOIDCProvider>>& providers)
{
  // OIDC providers are not implemented for dbstore; returns an empty list.
  return 0;
}
+
std::unique_ptr<Writer> DBStore::get_append_writer(const DoutPrefixProvider *dpp,
    optional_yield y,
    rgw::sal::Object* obj,
    const rgw_user& owner,
    const rgw_placement_rule *ptail_placement_rule,
    const std::string& unique_tag,
    uint64_t position,
    uint64_t *cur_accounted_size) {
  // Append writes are not supported by dbstore; callers get no writer.
  return nullptr;
}
+
std::unique_ptr<Writer> DBStore::get_atomic_writer(const DoutPrefixProvider *dpp,
    optional_yield y,
    rgw::sal::Object* obj,
    const rgw_user& owner,
    const rgw_placement_rule *ptail_placement_rule,
    uint64_t olh_epoch,
    const std::string& unique_tag) {
  // Hand out a writer that performs a whole-object (atomic) put.
  return std::make_unique<DBAtomicWriter>(dpp, y, obj, this, owner,
      ptail_placement_rule, olh_epoch, unique_tag);
}
+
const std::string& DBStore::get_compression_type(const rgw_placement_rule& rule) {
  // Delegate to the zone's RGW params for the placement's compression setting.
  return zone.get_rgw_params().get_compression_type(rule);
}
+
bool DBStore::valid_placement(const rgw_placement_rule& rule)
{
  // XXX: Till zonegroup, zone and storage-classes can be configured
  // for dbstore return true
  return true; //zone.get_rgw_params().valid_placement(rule);
}
+
std::unique_ptr<User> DBStore::get_user(const rgw_user &u)
{
  // Wrap the user id in a DBUser handle; no DB lookup happens here.
  return std::make_unique<DBUser>(this, u);
}
+
+ int DBStore::get_user_by_access_key(const DoutPrefixProvider *dpp, const std::string& key, optional_yield y, std::unique_ptr<User>* user)
+ {
+ RGWUserInfo uinfo;
+ User *u;
+ int ret = 0;
+ RGWObjVersionTracker objv_tracker;
+
+ ret = getDB()->get_user(dpp, string("access_key"), key, uinfo, nullptr,
+ &objv_tracker);
+
+ if (ret < 0)
+ return ret;
+
+ u = new DBUser(this, uinfo);
+
+ if (!u)
+ return -ENOMEM;
+
+ u->get_version_tracker() = objv_tracker;
+ user->reset(u);
+
+ return 0;
+ }
+
+ int DBStore::get_user_by_email(const DoutPrefixProvider *dpp, const std::string& email, optional_yield y, std::unique_ptr<User>* user)
+ {
+ RGWUserInfo uinfo;
+ User *u;
+ int ret = 0;
+ RGWObjVersionTracker objv_tracker;
+
+ ret = getDB()->get_user(dpp, string("email"), email, uinfo, nullptr,
+ &objv_tracker);
+
+ if (ret < 0)
+ return ret;
+
+ u = new DBUser(this, uinfo);
+
+ if (!u)
+ return -ENOMEM;
+
+ u->get_version_tracker() = objv_tracker;
+ user->reset(u);
+
+ return ret;
+ }
+
int DBStore::get_user_by_swift(const DoutPrefixProvider *dpp, const std::string& user_str, optional_yield y, std::unique_ptr<User>* user)
{
  /* Swift keys and subusers are not supported for now */
  return -ENOTSUP;
}
+
std::string DBStore::get_cluster_id(const DoutPrefixProvider* dpp, optional_yield y)
{
  // dbstore has no cluster concept; a fixed placeholder stands in for a
  // per-instance unique identifier.
  return "PLACEHOLDER"; // for instance unique identifier
}
+
std::unique_ptr<Object> DBStore::get_object(const rgw_obj_key& k)
{
  // Wrap the key in a DBObject handle; no DB access happens here.
  return std::make_unique<DBObject>(this, k);
}
+
+
+ int DBStore::get_bucket(const DoutPrefixProvider *dpp, User* u, const rgw_bucket& b, std::unique_ptr<Bucket>* bucket, optional_yield y)
+ {
+ int ret;
+ Bucket* bp;
+
+ bp = new DBBucket(this, b, u);
+ ret = bp->load_bucket(dpp, y);
+ if (ret < 0) {
+ delete bp;
+ return ret;
+ }
+
+ bucket->reset(bp);
+ return 0;
+ }
+
+ int DBStore::get_bucket(User* u, const RGWBucketInfo& i, std::unique_ptr<Bucket>* bucket)
+ {
+ Bucket* bp;
+
+ bp = new DBBucket(this, i, u);
+ /* Don't need to fetch the bucket info, use the provided one */
+
+ bucket->reset(bp);
+ return 0;
+ }
+
+ int DBStore::get_bucket(const DoutPrefixProvider *dpp, User* u, const std::string& tenant, const std::string& name, std::unique_ptr<Bucket>* bucket, optional_yield y)
+ {
+ rgw_bucket b;
+
+ b.tenant = tenant;
+ b.name = name;
+
+ return get_bucket(dpp, u, b, bucket, y);
+ }
+
bool DBStore::is_meta_master()
{
  // dbstore is single-site; this instance is always the metadata master.
  return true;
}
+
int DBStore::forward_request_to_master(const DoutPrefixProvider *dpp, User* user, obj_version *objv,
    bufferlist& in_data,
    JSONParser *jp, req_info& info,
    optional_yield y)
{
  // Single-site: there is no master to forward to; trivially succeed.
  return 0;
}
+
int DBStore::forward_iam_request_to_master(const DoutPrefixProvider *dpp, const RGWAccessKey& key, obj_version* objv,
    bufferlist& in_data,
    RGWXMLDecoder::XMLParser* parser, req_info& info,
    optional_yield y)
{
  // Single-site: there is no master to forward IAM requests to; succeed.
  return 0;
}
+
std::string DBStore::zone_unique_id(uint64_t unique_num)
{
  // Zone-unique ids are not generated by dbstore; empty string stub.
  return "";
}
+
std::string DBStore::zone_unique_trans_id(const uint64_t unique_num)
{
  // Zone-unique transaction ids are not generated by dbstore; empty stub.
  return "";
}
+
+ int DBStore::get_zonegroup(const std::string& id, std::unique_ptr<ZoneGroup>* zg)
+ {
+ /* XXX: for now only one zonegroup supported */
+ ZoneGroup* group = new DBZoneGroup(this, std::make_unique<RGWZoneGroup>());
+ if (!group)
+ return -ENOMEM;
+
+ zg->reset(group);
+ return 0;
+ }
+
int DBStore::list_all_zones(const DoutPrefixProvider* dpp,
    std::list<std::string>& zone_ids)
{
  // Single-zone deployment: only this store's own zone id is reported.
  zone_ids.push_back(zone.get_id());
  return 0;
}
+
int DBStore::cluster_stat(RGWClusterStat& stats)
{
  // Cluster statistics are not collected for dbstore; stats left untouched.
  return 0;
}
+
std::unique_ptr<Lifecycle> DBStore::get_lifecycle(void)
{
  // Hand out the dbstore-backed lifecycle (LC) implementation.
  return std::make_unique<DBLifecycle>(this);
}
+
+ std::unique_ptr<Completions> DBStore::get_completions(void)
+ {
+ return 0;
+ }
+
int DBLifecycle::get_entry(const std::string& oid, const std::string& marker,
    std::unique_ptr<LCEntry>* entry)
{
  // Thin forwarder to the DB backend's LC entry table.
  return store->getDB()->get_entry(oid, marker, entry);
}
+
int DBLifecycle::get_next_entry(const std::string& oid, const std::string& marker,
    std::unique_ptr<LCEntry>* entry)
{
  // Thin forwarder: fetch the LC entry following 'marker' from the DB.
  return store->getDB()->get_next_entry(oid, marker, entry);
}
+
int DBLifecycle::set_entry(const std::string& oid, LCEntry& entry)
{
  // Thin forwarder: upsert the LC entry in the DB.
  return store->getDB()->set_entry(oid, entry);
}
+
int DBLifecycle::list_entries(const std::string& oid, const std::string& marker,
    uint32_t max_entries, vector<std::unique_ptr<LCEntry>>& entries)
{
  // Thin forwarder: page through LC entries stored in the DB.
  return store->getDB()->list_entries(oid, marker, max_entries, entries);
}
+
int DBLifecycle::rm_entry(const std::string& oid, LCEntry& entry)
{
  // Thin forwarder: remove the LC entry from the DB.
  return store->getDB()->rm_entry(oid, entry);
}
+
int DBLifecycle::get_head(const std::string& oid, std::unique_ptr<LCHead>* head)
{
  // Thin forwarder: read the LC shard head record from the DB.
  return store->getDB()->get_head(oid, head);
}
+
int DBLifecycle::put_head(const std::string& oid, LCHead& head)
{
  // Thin forwarder: persist the LC shard head record to the DB.
  return store->getDB()->put_head(oid, head);
}
+
std::unique_ptr<LCSerializer> DBLifecycle::get_serializer(const std::string& lock_name,
    const std::string& oid,
    const std::string& cookie)
{
  // LCDBSerializer is a no-op lock (dbstore runs a single LC worker).
  return std::make_unique<LCDBSerializer>(store, oid, lock_name, cookie);
}
+
std::unique_ptr<Notification> DBStore::get_notification(
    rgw::sal::Object* obj, rgw::sal::Object* src_obj, req_state* s,
    rgw::notify::EventType event_type, optional_yield y,
    const std::string* object_name)
{
  // Bucket notifications are stubbed for dbstore; DBNotification is inert.
  return std::make_unique<DBNotification>(obj, src_obj, event_type);
}
+
std::unique_ptr<Notification> DBStore::get_notification(
    const DoutPrefixProvider* dpp, rgw::sal::Object* obj,
    rgw::sal::Object* src_obj,
    rgw::notify::EventType event_type, rgw::sal::Bucket* _bucket,
    std::string& _user_id, std::string& _user_tenant, std::string& _req_id,
    optional_yield y)
{
  // Bucket notifications are stubbed for dbstore; the extra context
  // (bucket/user/request id) is ignored by the inert DBNotification.
  return std::make_unique<DBNotification>(obj, src_obj, event_type);
}
+
RGWLC* DBStore::get_rgwlc(void) {
  // Accessor for the lifecycle processor created in initialize().
  return lc;
}
+
int DBStore::log_usage(const DoutPrefixProvider *dpp, map<rgw_user_bucket, RGWUsageBatch>& usage_info)
{
  // Usage logging is not implemented for dbstore; entries are dropped.
  return 0;
}
+
int DBStore::log_op(const DoutPrefixProvider *dpp, string& oid, bufferlist& bl)
{
  // Op logging is not implemented for dbstore; the record is dropped.
  return 0;
}
+
int DBStore::register_to_service_map(const DoutPrefixProvider *dpp, const string& daemon_type,
    const map<string, string>& meta)
{
  // No service map (no RADOS mon) in dbstore; registration is a no-op.
  return 0;
}
+
void DBStore::get_ratelimit(RGWRateLimitInfo& bucket_ratelimit, RGWRateLimitInfo& user_ratelimit, RGWRateLimitInfo& anon_ratelimit)
{
  // Rate limiting is not configured for dbstore; outputs left untouched.
  return;
}
+
void DBStore::get_quota(RGWQuota& quota)
{
  // XXX: Not handled for the first pass
  return;
}
+
+ int DBStore::set_buckets_enabled(const DoutPrefixProvider *dpp, vector<rgw_bucket>& buckets, bool enabled)
+ {
+ int ret = 0;
+
+ vector<rgw_bucket>::iterator iter;
+
+ for (iter = buckets.begin(); iter != buckets.end(); ++iter) {
+ rgw_bucket& bucket = *iter;
+ if (enabled) {
+ ldpp_dout(dpp, 20) << "enabling bucket name=" << bucket.name << dendl;
+ } else {
+ ldpp_dout(dpp, 20) << "disabling bucket name=" << bucket.name << dendl;
+ }
+
+ RGWBucketInfo info;
+ map<string, bufferlist> attrs;
+ int r = getDB()->get_bucket_info(dpp, string("name"), "", info, &attrs,
+ nullptr, nullptr);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
+ ret = r;
+ continue;
+ }
+ if (enabled) {
+ info.flags &= ~BUCKET_SUSPENDED;
+ } else {
+ info.flags |= BUCKET_SUSPENDED;
+ }
+
+ r = getDB()->update_bucket(dpp, "info", info, false, nullptr, &attrs, nullptr, &info.objv_tracker);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
+ ret = r;
+ continue;
+ }
+ }
+ return ret;
+ }
+
int DBStore::get_sync_policy_handler(const DoutPrefixProvider *dpp,
    std::optional<rgw_zone_id> zone,
    std::optional<rgw_bucket> bucket,
    RGWBucketSyncPolicyHandlerRef *phandler,
    optional_yield y)
{
  // Multisite sync is not supported by dbstore; no handler is produced.
  return 0;
}
+
RGWDataSyncStatusManager* DBStore::get_data_sync_manager(const rgw_zone_id& source_zone)
{
  // Multisite data sync is not supported by dbstore; no manager exists.
  return 0;
}
+
int DBStore::read_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch,
    uint32_t max_entries, bool *is_truncated,
    RGWUsageIter& usage_iter,
    map<rgw_user_bucket, rgw_usage_log_entry>& usage)
{
  // Usage accounting is not implemented for dbstore; returns no entries.
  return 0;
}
+
int DBStore::trim_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch)
{
  // Usage accounting is not implemented for dbstore; nothing to trim.
  return 0;
}
+
int DBStore::get_config_key_val(string name, bufferlist *bl)
{
  // Config-key storage (mon-backed in RADOS) is unavailable in dbstore.
  return -ENOTSUP;
}
+
int DBStore::meta_list_keys_init(const DoutPrefixProvider *dpp, const string& section, const string& marker, void** phandle)
{
  // Metadata key listing is not implemented for dbstore; stub.
  return 0;
}
+
int DBStore::meta_list_keys_next(const DoutPrefixProvider *dpp, void* handle, int max, list<string>& keys, bool* truncated)
{
  // Metadata key listing is not implemented for dbstore; returns no keys.
  return 0;
}
+
void DBStore::meta_list_keys_complete(void* handle)
{
  // Nothing to clean up: meta_list_keys_init never allocates a handle.
  return;
}
+
std::string DBStore::meta_get_marker(void* handle)
{
  // Metadata key listing is not implemented; no marker to report.
  return "";
}
+
int DBStore::meta_remove(const DoutPrefixProvider *dpp, string& metadata_key, optional_yield y)
{
  // Metadata removal is not implemented for dbstore; stub succeeds.
  return 0;
}
+
+ int DBStore::initialize(CephContext *_cct, const DoutPrefixProvider *_dpp) {
+ int ret = 0;
+ cct = _cct;
+ dpp = _dpp;
+
+ lc = new RGWLC();
+ lc->initialize(cct, this);
+
+ if (use_lc_thread) {
+ ret = db->createLCTables(dpp);
+ lc->start_processor();
+ }
+
+ ret = db->createGC(dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) <<"GC thread creation failed: ret = " << ret << dendl;
+ }
+
+ return ret;
+ }
+
int DBLuaManager::get_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, std::string& script)
{
  // Lua scripting is not supported by dbstore; behave as if absent.
  return -ENOENT;
}
+
int DBLuaManager::put_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, const std::string& script)
{
  // Lua scripting is not supported by dbstore; behave as if absent.
  return -ENOENT;
}
+
int DBLuaManager::del_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key)
{
  // Lua scripting is not supported by dbstore; behave as if absent.
  return -ENOENT;
}
+
int DBLuaManager::add_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name)
{
  // Lua package management is not supported by dbstore.
  return -ENOENT;
}
+
int DBLuaManager::remove_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name)
{
  // Lua package management is not supported by dbstore.
  return -ENOENT;
}
+
int DBLuaManager::list_packages(const DoutPrefixProvider* dpp, optional_yield y, rgw::lua::packages_t& packages)
{
  // Lua package management is not supported by dbstore.
  return -ENOENT;
}
+} // namespace rgw::sal
+
extern "C" {

  // Factory entry point used by the SAL driver loader: builds a DBStore
  // plus its DBStoreManager/DB backend and wires them together.
  // Returns nullptr if the backing DB cannot be opened.
  void *newDBStore(CephContext *cct)
  {
    rgw::sal::DBStore *driver = new rgw::sal::DBStore();
    DBStoreManager *dbsm = new DBStoreManager(cct);

    DB *db = dbsm->getDB();
    if (!db) {
      // Clean up both halves before reporting failure to the loader.
      delete dbsm;
      delete driver;
      return nullptr;
    }

    driver->setDBStoreManager(dbsm);
    driver->setDB(db);
    db->set_driver((rgw::sal::Driver*)driver);
    db->set_context(cct);

    return driver;
  }

}
diff --git a/src/rgw/rgw_sal_dbstore.h b/src/rgw/rgw_sal_dbstore.h
new file mode 100644
index 000000000..3acdb4ba3
--- /dev/null
+++ b/src/rgw/rgw_sal_dbstore.h
@@ -0,0 +1,921 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2021 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "rgw_sal_store.h"
+#include "rgw_oidc_provider.h"
+#include "rgw_role.h"
+#include "rgw_lc.h"
+#include "rgw_multi.h"
+
+#include "driver/dbstore/common/dbstore.h"
+#include "driver/dbstore/dbstore_mgr.h"
+
+namespace rgw { namespace sal {
+
+ class DBStore;
+
+/* No-op lifecycle lock serializer for dbstore: try_lock/unlock always
+ * succeed (a single-process backend needs no cross-RGW LC locking —
+ * NOTE(review): assumption, confirm if dbstore ever runs multi-RGW). */
+class LCDBSerializer : public StoreLCSerializer {
+
+public:
+ LCDBSerializer(DBStore* store, const std::string& oid, const std::string& lock_name, const std::string& cookie) {}
+
+ virtual int try_lock(const DoutPrefixProvider *dpp, utime_t dur, optional_yield y) override { return 0; }
+ virtual int unlock() override {
+ return 0;
+ }
+};
+
+/* Lifecycle (LC) state access for dbstore: entry/head get/set/list/rm
+ * operations over LC entries, plus a serializer factory. Definitions
+ * live in rgw_sal_dbstore.cc. */
+class DBLifecycle : public StoreLifecycle {
+ DBStore* store;
+
+public:
+ DBLifecycle(DBStore* _st) : store(_st) {}
+
+ using StoreLifecycle::get_entry;
+ virtual int get_entry(const std::string& oid, const std::string& marker, std::unique_ptr<LCEntry>* entry) override;
+ virtual int get_next_entry(const std::string& oid, const std::string& marker, std::unique_ptr<LCEntry>* entry) override;
+ virtual int set_entry(const std::string& oid, LCEntry& entry) override;
+ virtual int list_entries(const std::string& oid, const std::string& marker,
+ uint32_t max_entries,
+ std::vector<std::unique_ptr<LCEntry>>& entries) override;
+ virtual int rm_entry(const std::string& oid, LCEntry& entry) override;
+ virtual int get_head(const std::string& oid, std::unique_ptr<LCHead>* head) override;
+ virtual int put_head(const std::string& oid, LCHead& head) override;
+ virtual std::unique_ptr<LCSerializer> get_serializer(const std::string& lock_name,
+ const std::string& oid,
+ const std::string& cookie) override;
+};
+
+/* Notification stub for dbstore: bucket notifications are not wired up,
+ * so publish_reserve/publish_commit succeed without publishing anything.
+ * (Removed a stray `protected:` that was immediately overridden by
+ * `public:` — it declared nothing and only misled readers.) */
+class DBNotification : public StoreNotification {
+ public:
+ DBNotification(Object* _obj, Object* _src_obj, rgw::notify::EventType _type)
+ : StoreNotification(_obj, _src_obj, _type) {}
+ ~DBNotification() = default;
+
+ virtual int publish_reserve(const DoutPrefixProvider *dpp, RGWObjTags* obj_tags = nullptr) override { return 0;}
+ virtual int publish_commit(const DoutPrefixProvider* dpp, uint64_t size,
+ const ceph::real_time& mtime, const std::string& etag, const std::string& version) override { return 0; }
+};
+
+ /* SAL User backed by dbstore. Holds a non-owning pointer to the
+  * parent DBStore; user records are read/written through it (method
+  * bodies live in rgw_sal_dbstore.cc). */
+ class DBUser : public StoreUser {
+ private:
+ DBStore *store;
+
+ public:
+ DBUser(DBStore *_st, const rgw_user& _u) : StoreUser(_u), store(_st) { }
+ DBUser(DBStore *_st, const RGWUserInfo& _i) : StoreUser(_i), store(_st) { }
+ DBUser(DBStore *_st) : store(_st) { }
+ // NOTE(review): copy ctor takes non-const ref (clone() depends on it);
+ // consider const& — confirm no caller relies on the non-const form.
+ DBUser(DBUser& _o) = default;
+ DBUser() {}
+
+ virtual std::unique_ptr<User> clone() override {
+ return std::unique_ptr<User>(new DBUser(*this));
+ }
+ int list_buckets(const DoutPrefixProvider *dpp, const std::string& marker, const std::string& end_marker,
+ uint64_t max, bool need_stats, BucketList& buckets, optional_yield y) override;
+ virtual int create_bucket(const DoutPrefixProvider* dpp,
+ const rgw_bucket& b,
+ const std::string& zonegroup_id,
+ rgw_placement_rule& placement_rule,
+ std::string& swift_ver_location,
+ const RGWQuotaInfo* pquota_info,
+ const RGWAccessControlPolicy& policy,
+ Attrs& attrs,
+ RGWBucketInfo& info,
+ obj_version& ep_objv,
+ bool exclusive,
+ bool obj_lock_enabled,
+ bool* existed,
+ req_info& req_info,
+ std::unique_ptr<Bucket>* bucket,
+ optional_yield y) override;
+ virtual int read_attrs(const DoutPrefixProvider* dpp, optional_yield y) override;
+ virtual int read_stats(const DoutPrefixProvider *dpp,
+ optional_yield y, RGWStorageStats* stats,
+ ceph::real_time *last_stats_sync = nullptr,
+ ceph::real_time *last_stats_update = nullptr) override;
+ virtual int read_stats_async(const DoutPrefixProvider *dpp, RGWGetUserStats_CB* cb) override;
+ virtual int complete_flush_stats(const DoutPrefixProvider *dpp, optional_yield y) override;
+ virtual int read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
+ bool* is_truncated, RGWUsageIter& usage_iter,
+ std::map<rgw_user_bucket, rgw_usage_log_entry>& usage) override;
+ virtual int trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) override;
+
+ /* Placeholders */
+ virtual int merge_and_store_attrs(const DoutPrefixProvider* dpp, Attrs& new_attrs, optional_yield y) override;
+ virtual int load_user(const DoutPrefixProvider* dpp, optional_yield y) override;
+ virtual int store_user(const DoutPrefixProvider* dpp, optional_yield y, bool exclusive, RGWUserInfo* old_info = nullptr) override;
+ virtual int remove_user(const DoutPrefixProvider* dpp, optional_yield y) override;
+ virtual int verify_mfa(const std::string& mfa_str, bool* verified, const DoutPrefixProvider* dpp, optional_yield y) override;
+
+ friend class DBBucket;
+ };
+
+ /* SAL Bucket backed by dbstore. Wraps the various StoreBucket
+  * construction forms (entrypoint, info, ent, user combinations) and
+  * declares the bucket operations implemented in rgw_sal_dbstore.cc.
+  * `acls` caches the bucket ACL returned by get_acl(). */
+ class DBBucket : public StoreBucket {
+ private:
+ DBStore *store;
+ RGWAccessControlPolicy acls;
+
+ public:
+ DBBucket(DBStore *_st)
+ : store(_st),
+ acls() {
+ }
+
+ DBBucket(DBStore *_st, User* _u)
+ : StoreBucket(_u),
+ store(_st),
+ acls() {
+ }
+
+ DBBucket(DBStore *_st, const rgw_bucket& _b)
+ : StoreBucket(_b),
+ store(_st),
+ acls() {
+ }
+
+ DBBucket(DBStore *_st, const RGWBucketEnt& _e)
+ : StoreBucket(_e),
+ store(_st),
+ acls() {
+ }
+
+ DBBucket(DBStore *_st, const RGWBucketInfo& _i)
+ : StoreBucket(_i),
+ store(_st),
+ acls() {
+ }
+
+ DBBucket(DBStore *_st, const rgw_bucket& _b, User* _u)
+ : StoreBucket(_b, _u),
+ store(_st),
+ acls() {
+ }
+
+ DBBucket(DBStore *_st, const RGWBucketEnt& _e, User* _u)
+ : StoreBucket(_e, _u),
+ store(_st),
+ acls() {
+ }
+
+ DBBucket(DBStore *_st, const RGWBucketInfo& _i, User* _u)
+ : StoreBucket(_i, _u),
+ store(_st),
+ acls() {
+ }
+
+ ~DBBucket() { }
+
+ virtual std::unique_ptr<Object> get_object(const rgw_obj_key& k) override;
+ virtual int list(const DoutPrefixProvider *dpp, ListParams&, int, ListResults&, optional_yield y) override;
+ virtual int remove_bucket(const DoutPrefixProvider *dpp, bool delete_children, bool forward_to_master, req_info* req_info, optional_yield y) override;
+ virtual int remove_bucket_bypass_gc(int concurrent_max, bool
+ keep_index_consistent,
+ optional_yield y, const
+ DoutPrefixProvider *dpp) override;
+ virtual RGWAccessControlPolicy& get_acl(void) override { return acls; }
+ virtual int set_acl(const DoutPrefixProvider *dpp, RGWAccessControlPolicy& acl, optional_yield y) override;
+ virtual int load_bucket(const DoutPrefixProvider *dpp, optional_yield y, bool get_stats = false) override;
+ virtual int read_stats(const DoutPrefixProvider *dpp,
+ const bucket_index_layout_generation& idx_layout,
+ int shard_id,
+ std::string *bucket_ver, std::string *master_ver,
+ std::map<RGWObjCategory, RGWStorageStats>& stats,
+ std::string *max_marker = nullptr,
+ bool *syncstopped = nullptr) override;
+ virtual int read_stats_async(const DoutPrefixProvider *dpp, const bucket_index_layout_generation& idx_layout, int shard_id, RGWGetBucketStats_CB* ctx) override;
+ virtual int sync_user_stats(const DoutPrefixProvider *dpp, optional_yield y) override;
+ virtual int update_container_stats(const DoutPrefixProvider *dpp) override;
+ virtual int check_bucket_shards(const DoutPrefixProvider *dpp) override;
+ virtual int chown(const DoutPrefixProvider *dpp, User& new_user, optional_yield y) override;
+ virtual int put_info(const DoutPrefixProvider *dpp, bool exclusive, ceph::real_time mtime) override;
+ virtual bool is_owner(User* user) override;
+ virtual int check_empty(const DoutPrefixProvider *dpp, optional_yield y) override;
+ virtual int check_quota(const DoutPrefixProvider *dpp, RGWQuota& quota, uint64_t obj_size, optional_yield y, bool check_size_only = false) override;
+ virtual int merge_and_store_attrs(const DoutPrefixProvider *dpp, Attrs& attrs, optional_yield y) override;
+ virtual int try_refresh_info(const DoutPrefixProvider *dpp, ceph::real_time *pmtime) override;
+ virtual int read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
+ bool *is_truncated, RGWUsageIter& usage_iter,
+ std::map<rgw_user_bucket, rgw_usage_log_entry>& usage) override;
+ virtual int trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) override;
+ virtual int remove_objs_from_index(const DoutPrefixProvider *dpp, std::list<rgw_obj_index_key>& objs_to_unlink) override;
+ virtual int check_index(const DoutPrefixProvider *dpp, std::map<RGWObjCategory, RGWStorageStats>& existing_stats, std::map<RGWObjCategory, RGWStorageStats>& calculated_stats) override;
+ virtual int rebuild_index(const DoutPrefixProvider *dpp) override;
+ virtual int set_tag_timeout(const DoutPrefixProvider *dpp, uint64_t timeout) override;
+ virtual int purge_instance(const DoutPrefixProvider *dpp) override;
+ virtual std::unique_ptr<Bucket> clone() override {
+ return std::make_unique<DBBucket>(*this);
+ }
+ virtual std::unique_ptr<MultipartUpload> get_multipart_upload(
+ const std::string& oid, std::optional<std::string> upload_id,
+ ACLOwner owner={}, ceph::real_time mtime=ceph::real_clock::now()) override;
+ virtual int list_multiparts(const DoutPrefixProvider *dpp,
+ const std::string& prefix,
+ std::string& marker,
+ const std::string& delim,
+ const int& max_uploads,
+ std::vector<std::unique_ptr<MultipartUpload>>& uploads,
+ std::map<std::string, bool> *common_prefixes,
+ bool *is_truncated) override;
+ virtual int abort_multiparts(const DoutPrefixProvider* dpp,
+ CephContext* cct) override;
+
+ friend class DBStore;
+ };
+
+ /* Thin read-only wrapper exposing a zonegroup placement tier to the
+  * SAL layer; holds the tier description by value.
+  * NOTE(review): the accessors lack `override` — confirm whether the
+  * StorePlacementTier bases are virtual with matching signatures. */
+ class DBPlacementTier: public StorePlacementTier {
+ DBStore* store;
+ RGWZoneGroupPlacementTier tier;
+ public:
+ DBPlacementTier(DBStore* _store, const RGWZoneGroupPlacementTier& _tier) : store(_store), tier(_tier) {}
+ virtual ~DBPlacementTier() = default;
+
+ virtual const std::string& get_tier_type() { return tier.tier_type; }
+ virtual const std::string& get_storage_class() { return tier.storage_class; }
+ virtual bool retain_head_object() { return tier.retain_head_object; }
+ RGWZoneGroupPlacementTier& get_rt() { return tier; }
+ };
+
+ /* Zonegroup view for dbstore. Wraps an owned RGWZoneGroup and mostly
+  * forwards to it; multisite-related lookups (placement tier, zone by
+  * id/name) are unimplemented and return -1, and only a single zone is
+  * ever reported. */
+ class DBZoneGroup : public StoreZoneGroup {
+ DBStore* store;
+ std::unique_ptr<RGWZoneGroup> group;
+ std::string empty;
+ public:
+ DBZoneGroup(DBStore* _store, std::unique_ptr<RGWZoneGroup> _group) : store(_store), group(std::move(_group)) {}
+ virtual ~DBZoneGroup() = default;
+
+ virtual const std::string& get_id() const override { return group->get_id(); };
+ virtual const std::string& get_name() const override { return group->get_name(); };
+ virtual int equals(const std::string& other_zonegroup) const override {
+ return group->equals(other_zonegroup);
+ };
+ /** Get the endpoint from zonegroup, or from master zone if not set */
+ virtual const std::string& get_endpoint() const override;
+ virtual bool placement_target_exists(std::string& target) const override;
+ virtual bool is_master_zonegroup() const override {
+ return group->is_master_zonegroup();
+ };
+ virtual const std::string& get_api_name() const override { return group->api_name; };
+ virtual void get_placement_target_names(std::set<std::string>& names) const override;
+ virtual const std::string& get_default_placement_name() const override {
+ return group->default_placement.name; };
+ virtual int get_hostnames(std::list<std::string>& names) const override {
+ names = group->hostnames;
+ return 0;
+ };
+ virtual int get_s3website_hostnames(std::list<std::string>& names) const override {
+ names = group->hostnames_s3website;
+ return 0;
+ };
+ virtual int get_zone_count() const override {
+ /* currently only 1 zone supported */
+ return 1;
+ }
+ /* Unsupported lookups below return -1 (not implemented for dbstore). */
+ virtual int get_placement_tier(const rgw_placement_rule& rule,
+ std::unique_ptr<PlacementTier>* tier) {
+ return -1;
+ }
+ virtual int get_zone_by_id(const std::string& id, std::unique_ptr<Zone>* zone) override {
+ return -1;
+ }
+ virtual int get_zone_by_name(const std::string& name, std::unique_ptr<Zone>* zone) override {
+ return -1;
+ }
+ virtual int list_zones(std::list<std::string>& zone_ids) override {
+ zone_ids.clear();
+ return 0;
+ }
+ bool supports(std::string_view feature) const override {
+ return group->supports(feature);
+ }
+ virtual std::unique_ptr<ZoneGroup> clone() override {
+ std::unique_ptr<RGWZoneGroup>zg = std::make_unique<RGWZoneGroup>(*group.get());
+ return std::make_unique<DBZoneGroup>(store, std::move(zg));
+ }
+ };
+
+ /* Single-zone environment presented to the SAL layer. The constructor
+  * builds a default realm/zonegroup/zone/period with one "default"
+  * placement pool offering only the STANDARD storage class.
+  *
+  * All members are owned raw pointers released in the destructor, so
+  * copying is explicitly disabled (rule of three) — the implicitly
+  * generated copy operations would lead to a double-delete. clone()
+  * constructs a fresh instance instead of copying. */
+ class DBZone : public StoreZone {
+ protected:
+ DBStore* store;
+ RGWRealm *realm{nullptr};
+ DBZoneGroup *zonegroup{nullptr};
+ RGWZone *zone_public_config{nullptr}; /* external zone params, e.g., entrypoints, log flags, etc. */
+ RGWZoneParams *zone_params{nullptr}; /* internal zone params, e.g., rados pools */
+ RGWPeriod *current_period{nullptr};
+
+ public:
+ DBZone(DBStore* _store) : store(_store) {
+ realm = new RGWRealm();
+ zonegroup = new DBZoneGroup(store, std::make_unique<RGWZoneGroup>());
+ zone_public_config = new RGWZone();
+ zone_params = new RGWZoneParams();
+ current_period = new RGWPeriod();
+
+ // XXX: only default and STANDARD supported for now
+ RGWZonePlacementInfo info;
+ RGWZoneStorageClasses sc;
+ sc.set_storage_class("STANDARD", nullptr, nullptr);
+ info.storage_classes = sc;
+ zone_params->placement_pools["default"] = info;
+ }
+ ~DBZone() {
+ delete realm;
+ delete zonegroup;
+ delete zone_public_config;
+ delete zone_params;
+ delete current_period;
+ }
+
+ /* Owning raw pointers + user-defined destructor: forbid copies to
+  * prevent double-free. */
+ DBZone(const DBZone&) = delete;
+ DBZone& operator=(const DBZone&) = delete;
+
+ virtual std::unique_ptr<Zone> clone() override {
+ return std::make_unique<DBZone>(store);
+ }
+ virtual ZoneGroup& get_zonegroup() override;
+ const RGWZoneParams& get_rgw_params();
+ virtual const std::string& get_id() override;
+ virtual const std::string& get_name() const override;
+ virtual bool is_writeable() override;
+ virtual bool get_redirect_endpoint(std::string* endpoint) override;
+ virtual bool has_zonegroup_api(const std::string& api) const override;
+ virtual const std::string& get_current_period_id() override;
+ virtual const RGWAccessKey& get_system_key() override;
+ virtual const std::string& get_realm_name() override;
+ virtual const std::string& get_realm_id() override;
+ virtual const std::string_view get_tier_type() override { return "rgw"; }
+ virtual RGWBucketSyncPolicyHandlerRef get_sync_policy_handler() override;
+ };
+
+ /* Lua manager declaration for dbstore. The definitions in
+  * rgw_sal_dbstore.cc are stubs that return -ENOENT for every
+  * operation (Lua scripting is not supported by this backend). */
+ class DBLuaManager : public StoreLuaManager {
+ DBStore* store;
+
+ public:
+ DBLuaManager(DBStore* _s) : store(_s)
+ {
+ }
+ virtual ~DBLuaManager() = default;
+
+ /** Get a script named with the given key from the backing store */
+ virtual int get_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, std::string& script) override;
+ /** Put a script named with the given key to the backing store */
+ virtual int put_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, const std::string& script) override;
+ /** Delete a script named with the given key from the backing store */
+ virtual int del_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key) override;
+ /** Add a lua package */
+ virtual int add_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name) override;
+ /** Remove a lua package */
+ virtual int remove_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name) override;
+ /** List lua packages */
+ virtual int list_packages(const DoutPrefixProvider* dpp, optional_yield y, rgw::lua::packages_t& packages) override;
+ };
+
+ /* OIDC provider stub for dbstore: store/read/delete of provider URLs
+  * are no-ops returning success. encode/decode simply forward to the
+  * RGWOIDCProvider base. */
+ class DBOIDCProvider : public RGWOIDCProvider {
+ DBStore* store;
+ public:
+ DBOIDCProvider(DBStore* _store) : store(_store) {}
+ ~DBOIDCProvider() = default;
+
+ virtual int store_url(const DoutPrefixProvider *dpp, const std::string& url, bool exclusive, optional_yield y) override { return 0; }
+ virtual int read_url(const DoutPrefixProvider *dpp, const std::string& url, const std::string& tenant) override { return 0; }
+ virtual int delete_obj(const DoutPrefixProvider *dpp, optional_yield y) override { return 0;}
+
+ void encode(bufferlist& bl) const {
+ RGWOIDCProvider::encode(bl);
+ }
+ void decode(bufferlist::const_iterator& bl) {
+ RGWOIDCProvider::decode(bl);
+ }
+ };
+
+ /*
+ * For multipart upload, below is the process flow -
+ *
+ * MultipartUpload::Init - create head object of meta obj (src_obj_name + "." + upload_id)
+ * [ Meta object stores all the parts upload info]
+ * MultipartWriter::process - create all data/tail objects with obj_name same as
+ * meta obj (so that they can all be identified & deleted
+ * during abort)
+ * MultipartUpload::Abort - Just delete meta obj .. that will indirectly delete all the
+ * uploads associated with that upload id / meta obj so far.
+ * MultipartUpload::Complete - create head object of the original object (if not exists) &
+ * rename all data/tail objects to orig object name and update
+ * metadata of the orig object.
+ */
+ /* One uploaded part of a multipart upload; a value wrapper around
+  * RGWUploadPartInfo exposing number/size/etag/mtime accessors. */
+ class DBMultipartPart : public StoreMultipartPart {
+ protected:
+ RGWUploadPartInfo info; /* XXX: info contains manifest also which is not needed */
+
+ public:
+ DBMultipartPart() = default;
+ virtual ~DBMultipartPart() = default;
+
+ virtual RGWUploadPartInfo& get_info() { return info; }
+ virtual void set_info(const RGWUploadPartInfo& _info) { info = _info; }
+ virtual uint32_t get_num() { return info.num; }
+ virtual uint64_t get_size() { return info.accounted_size; }
+ virtual const std::string& get_etag() { return info.etag; }
+ virtual ceph::real_time& get_mtime() { return info.modified; }
+
+ };
+
+ /* Helper that derives/parses the multipart meta-object name, which is
+  * always "<oid>.<upload_id>". from_meta() splits at the LAST '.' so
+  * object names containing dots still round-trip. */
+ class DBMPObj {
+ std::string oid; // object name
+ std::string upload_id;
+ std::string meta; // multipart meta object = <oid>.<upload_id>
+ public:
+ DBMPObj() {}
+ DBMPObj(const std::string& _oid, const std::string& _upload_id) {
+ init(_oid, _upload_id, _upload_id);
+ }
+ DBMPObj(const std::string& _oid, std::optional<std::string> _upload_id) {
+ if (_upload_id) {
+ init(_oid, *_upload_id, *_upload_id);
+ } else {
+ from_meta(_oid);
+ }
+ }
+ void init(const std::string& _oid, const std::string& _upload_id) {
+ init(_oid, _upload_id, _upload_id);
+ }
+ // NOTE(review): part_unique_str is currently unused by this overload;
+ // callers always pass the upload_id — confirm before removing it.
+ void init(const std::string& _oid, const std::string& _upload_id, const std::string& part_unique_str) {
+ if (_oid.empty()) {
+ clear();
+ return;
+ }
+ oid = _oid;
+ upload_id = _upload_id;
+ meta = oid + "." + upload_id;
+ }
+ const std::string& get_upload_id() const {
+ return upload_id;
+ }
+ const std::string& get_key() const {
+ return oid;
+ }
+ const std::string& get_meta() const { return meta; }
+ /* Parse "<key>.<upload_id>"; returns false if no '.' is present. */
+ bool from_meta(const std::string& meta) {
+ int end_pos = meta.length();
+ int mid_pos = meta.rfind('.', end_pos - 1); // <key>.<upload_id>
+ if (mid_pos < 0)
+ return false;
+ oid = meta.substr(0, mid_pos);
+ upload_id = meta.substr(mid_pos + 1, end_pos - mid_pos - 1);
+ init(oid, upload_id, upload_id);
+ return true;
+ }
+ void clear() {
+ oid = "";
+ meta = "";
+ upload_id = "";
+ }
+ };
+
+ /* Multipart upload handle for dbstore; see the flow comment above.
+  * State lives in the meta object named by DBMPObj; init/list_parts/
+  * abort/complete are implemented in rgw_sal_dbstore.cc. */
+ class DBMultipartUpload : public StoreMultipartUpload {
+ DBStore* store;
+ DBMPObj mp_obj;
+ ACLOwner owner;
+ ceph::real_time mtime;
+ rgw_placement_rule placement;
+
+ public:
+ DBMultipartUpload(DBStore* _store, Bucket* _bucket, const std::string& oid, std::optional<std::string> upload_id, ACLOwner _owner, ceph::real_time _mtime) : StoreMultipartUpload(_bucket), store(_store), mp_obj(oid, upload_id), owner(_owner), mtime(_mtime) {}
+ virtual ~DBMultipartUpload() = default;
+
+ virtual const std::string& get_meta() const { return mp_obj.get_meta(); }
+ virtual const std::string& get_key() const { return mp_obj.get_key(); }
+ virtual const std::string& get_upload_id() const { return mp_obj.get_upload_id(); }
+ virtual const ACLOwner& get_owner() const override { return owner; }
+ virtual ceph::real_time& get_mtime() { return mtime; }
+ virtual std::unique_ptr<rgw::sal::Object> get_meta_obj() override;
+ virtual int init(const DoutPrefixProvider* dpp, optional_yield y, ACLOwner& owner, rgw_placement_rule& dest_placement, rgw::sal::Attrs& attrs) override;
+ virtual int list_parts(const DoutPrefixProvider* dpp, CephContext* cct,
+ int num_parts, int marker,
+ int* next_marker, bool* truncated,
+ bool assume_unsorted = false) override;
+ virtual int abort(const DoutPrefixProvider* dpp, CephContext* cct) override;
+ virtual int complete(const DoutPrefixProvider* dpp,
+ optional_yield y, CephContext* cct,
+ std::map<int, std::string>& part_etags,
+ std::list<rgw_obj_index_key>& remove_objs,
+ uint64_t& accounted_size, bool& compressed,
+ RGWCompressionInfo& cs_info, off_t& ofs,
+ std::string& tag, ACLOwner& owner,
+ uint64_t olh_epoch,
+ rgw::sal::Object* target_obj) override;
+ virtual int get_info(const DoutPrefixProvider *dpp, optional_yield y, rgw_placement_rule** rule, rgw::sal::Attrs* attrs = nullptr) override;
+ virtual std::unique_ptr<Writer> get_writer(const DoutPrefixProvider *dpp,
+ optional_yield y,
+ rgw::sal::Object* obj,
+ const rgw_user& owner,
+ const rgw_placement_rule *ptail_placement_rule,
+ uint64_t part_num,
+ const std::string& part_num_str) override;
+ };
+
+ /* SAL Object backed by dbstore. Nested DBReadOp/DBDeleteOp wrap the
+  * lower-level DB::Object operation targets; all op bodies live in
+  * rgw_sal_dbstore.cc. `acls` caches the object ACL. */
+ class DBObject : public StoreObject {
+ private:
+ DBStore* store;
+ RGWAccessControlPolicy acls;
+
+ public:
+ struct DBReadOp : public ReadOp {
+ private:
+ DBObject* source;
+ RGWObjectCtx* rctx;
+ DB::Object op_target;
+ DB::Object::Read parent_op;
+
+ public:
+ DBReadOp(DBObject *_source, RGWObjectCtx *_rctx);
+
+ virtual int prepare(optional_yield y, const DoutPrefixProvider* dpp) override;
+
+ /*
+ * Both `read` and `iterate` read up through index `end`
+ * *inclusive*. The number of bytes that could be returned is
+ * `end - ofs + 1`.
+ */
+ virtual int read(int64_t ofs, int64_t end, bufferlist& bl,
+ optional_yield y,
+ const DoutPrefixProvider* dpp) override;
+ virtual int iterate(const DoutPrefixProvider* dpp, int64_t ofs,
+ int64_t end, RGWGetDataCB* cb,
+ optional_yield y) override;
+
+ virtual int get_attr(const DoutPrefixProvider* dpp, const char* name, bufferlist& dest, optional_yield y) override;
+ };
+
+ struct DBDeleteOp : public DeleteOp {
+ private:
+ DBObject* source;
+ DB::Object op_target;
+ DB::Object::Delete parent_op;
+
+ public:
+ DBDeleteOp(DBObject* _source);
+
+ virtual int delete_obj(const DoutPrefixProvider* dpp, optional_yield y) override;
+ };
+
+ // NOTE(review): default construction leaves `store` uninitialized —
+ // confirm every default-constructed DBObject is assigned before use.
+ DBObject() = default;
+
+ DBObject(DBStore *_st, const rgw_obj_key& _k)
+ : StoreObject(_k),
+ store(_st),
+ acls() {}
+
+ DBObject(DBStore *_st, const rgw_obj_key& _k, Bucket* _b)
+ : StoreObject(_k, _b),
+ store(_st),
+ acls() {}
+
+ DBObject(DBObject& _o) = default;
+
+ virtual int delete_object(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ bool prevent_versioning = false) override;
+ virtual int delete_obj_aio(const DoutPrefixProvider* dpp, RGWObjState* astate, Completions* aio,
+ bool keep_index_consistent, optional_yield y) override;
+ virtual int copy_object(User* user,
+ req_info* info, const rgw_zone_id& source_zone,
+ rgw::sal::Object* dest_object, rgw::sal::Bucket* dest_bucket,
+ rgw::sal::Bucket* src_bucket,
+ const rgw_placement_rule& dest_placement,
+ ceph::real_time* src_mtime, ceph::real_time* mtime,
+ const ceph::real_time* mod_ptr, const ceph::real_time* unmod_ptr,
+ bool high_precision_time,
+ const char* if_match, const char* if_nomatch,
+ AttrsMod attrs_mod, bool copy_if_newer, Attrs& attrs,
+ RGWObjCategory category, uint64_t olh_epoch,
+ boost::optional<ceph::real_time> delete_at,
+ std::string* version_id, std::string* tag, std::string* etag,
+ void (*progress_cb)(off_t, void *), void* progress_data,
+ const DoutPrefixProvider* dpp, optional_yield y) override;
+ virtual RGWAccessControlPolicy& get_acl(void) override { return acls; }
+ virtual int set_acl(const RGWAccessControlPolicy& acl) override { acls = acl; return 0; }
+
+ virtual int get_obj_state(const DoutPrefixProvider* dpp, RGWObjState **state, optional_yield y, bool follow_olh = true) override;
+ virtual int set_obj_attrs(const DoutPrefixProvider* dpp, Attrs* setattrs, Attrs* delattrs, optional_yield y) override;
+ virtual int get_obj_attrs(optional_yield y, const DoutPrefixProvider* dpp, rgw_obj* target_obj = NULL) override;
+ virtual int modify_obj_attrs(const char* attr_name, bufferlist& attr_val, optional_yield y, const DoutPrefixProvider* dpp) override;
+ virtual int delete_obj_attrs(const DoutPrefixProvider* dpp, const char* attr_name, optional_yield y) override;
+ virtual bool is_expired() override;
+ virtual void gen_rand_obj_instance_name() override;
+ virtual std::unique_ptr<Object> clone() override {
+ return std::unique_ptr<Object>(new DBObject(*this));
+ }
+ virtual std::unique_ptr<MPSerializer> get_serializer(const DoutPrefixProvider *dpp,
+ const std::string& lock_name) override;
+ virtual int transition(Bucket* bucket,
+ const rgw_placement_rule& placement_rule,
+ const real_time& mtime,
+ uint64_t olh_epoch,
+ const DoutPrefixProvider* dpp,
+ optional_yield y) override;
+ virtual bool placement_rules_match(rgw_placement_rule& r1, rgw_placement_rule& r2) override;
+ virtual int dump_obj_layout(const DoutPrefixProvider *dpp, optional_yield y, Formatter* f) override;
+
+ /* Swift versioning */
+ virtual int swift_versioning_restore(bool& restored,
+ const DoutPrefixProvider* dpp) override;
+ virtual int swift_versioning_copy(const DoutPrefixProvider* dpp,
+ optional_yield y) override;
+
+ /* OPs */
+ virtual std::unique_ptr<ReadOp> get_read_op() override;
+ virtual std::unique_ptr<DeleteOp> get_delete_op() override;
+
+ /* OMAP */
+ virtual int omap_get_vals(const DoutPrefixProvider *dpp, const std::string& marker, uint64_t count,
+ std::map<std::string, bufferlist> *m,
+ bool* pmore, optional_yield y) override;
+ virtual int omap_get_all(const DoutPrefixProvider *dpp, std::map<std::string, bufferlist> *m,
+ optional_yield y) override;
+ virtual int omap_get_vals_by_keys(const DoutPrefixProvider *dpp, const std::string& oid,
+ const std::set<std::string>& keys,
+ Attrs* vals) override;
+ virtual int omap_set_val_by_key(const DoutPrefixProvider *dpp, const std::string& key, bufferlist& val,
+ bool must_exist, optional_yield y) override;
+ virtual int chown(User& new_user, const DoutPrefixProvider* dpp, optional_yield y) override;
+ private:
+ int read_attrs(const DoutPrefixProvider* dpp, DB::Object::Read &read_op, optional_yield y, rgw_obj* target_obj = nullptr);
+ };
+
+ /* No-op multipart serializer: try_lock/unlock always succeed (no
+  * cross-process locking needed for this backend). */
+ class MPDBSerializer : public StoreMPSerializer {
+
+ public:
+ MPDBSerializer(const DoutPrefixProvider *dpp, DBStore* store, DBObject* obj, const std::string& lock_name) {}
+
+ virtual int try_lock(const DoutPrefixProvider *dpp, utime_t dur, optional_yield y) override {return 0; }
+ virtual int unlock() override { return 0;}
+ };
+
+ /* Writer for whole-object (atomic) PUTs. Buffers the head data and the
+  * current tail part, tracking total and per-tail-part sizes; the write
+  * is materialized through op_target/parent_op in complete(). */
+ class DBAtomicWriter : public StoreWriter {
+ protected:
+ rgw::sal::DBStore* store;
+ const rgw_user& owner;
+ const rgw_placement_rule *ptail_placement_rule;
+ uint64_t olh_epoch;
+ const std::string& unique_tag;
+ DBObject obj;
+ DB::Object op_target;
+ DB::Object::Write parent_op;
+ uint64_t total_data_size = 0; /* for total data being uploaded */
+ bufferlist head_data;
+ bufferlist tail_part_data;
+ uint64_t tail_part_offset;
+ uint64_t tail_part_size = 0; /* corresponds to each tail part being
+ written to dbstore */
+
+ public:
+ DBAtomicWriter(const DoutPrefixProvider *dpp,
+ optional_yield y,
+ rgw::sal::Object* obj,
+ DBStore* _store,
+ const rgw_user& _owner,
+ const rgw_placement_rule *_ptail_placement_rule,
+ uint64_t _olh_epoch,
+ const std::string& _unique_tag);
+ ~DBAtomicWriter() = default;
+
+ // prepare to start processing object data
+ virtual int prepare(optional_yield y) override;
+
+ // Process a bufferlist
+ virtual int process(bufferlist&& data, uint64_t offset) override;
+
+ // complete the operation and make its result visible to clients
+ virtual int complete(size_t accounted_size, const std::string& etag,
+ ceph::real_time *mtime, ceph::real_time set_mtime,
+ std::map<std::string, bufferlist>& attrs,
+ ceph::real_time delete_at,
+ const char *if_match, const char *if_nomatch,
+ const std::string *user_data,
+ rgw_zone_set *zones_trace, bool *canceled,
+ optional_yield y) override;
+ };
+
+ /* Writer for one part of a multipart upload. Part objects share the
+  * meta object's name (see the flow comment above) so abort can find
+  * and delete them; complete() records the part in the meta object. */
+ class DBMultipartWriter : public StoreWriter {
+ protected:
+ rgw::sal::DBStore* store;
+ const rgw_user& owner;
+ const rgw_placement_rule *ptail_placement_rule;
+ uint64_t olh_epoch;
+ rgw::sal::Object* head_obj;
+ std::string upload_id;
+ int part_num;
+ std::string oid; /* object->name() + "." + "upload_id" + "." + part_num */
+ std::unique_ptr<rgw::sal::Object> meta_obj;
+ DB::Object op_target;
+ DB::Object::Write parent_op;
+ std::string part_num_str;
+ uint64_t total_data_size = 0; /* for total data being uploaded */
+ bufferlist head_data;
+ bufferlist tail_part_data;
+ uint64_t tail_part_offset;
+ uint64_t tail_part_size = 0; /* corresponds to each tail part being
+ written to dbstore */
+
+public:
+ DBMultipartWriter(const DoutPrefixProvider *dpp,
+ optional_yield y, MultipartUpload* upload,
+ rgw::sal::Object* obj,
+ DBStore* _store,
+ const rgw_user& owner,
+ const rgw_placement_rule *ptail_placement_rule,
+ uint64_t part_num, const std::string& part_num_str);
+ ~DBMultipartWriter() = default;
+
+ // prepare to start processing object data
+ virtual int prepare(optional_yield y) override;
+
+ // Process a bufferlist
+ virtual int process(bufferlist&& data, uint64_t offset) override;
+
+ // complete the operation and make its result visible to clients
+ virtual int complete(size_t accounted_size, const std::string& etag,
+ ceph::real_time *mtime, ceph::real_time set_mtime,
+ std::map<std::string, bufferlist>& attrs,
+ ceph::real_time delete_at,
+ const char *if_match, const char *if_nomatch,
+ const std::string *user_data,
+ rgw_zone_set *zones_trace, bool *canceled,
+ optional_yield y) override;
+ };
+
+ class DBStore : public StoreDriver {
+ private:
+ /* DBStoreManager is used in case multiple
+ * connections are needed one for each tenant.
+ */
+ DBStoreManager *dbsm;
+ /* default db (single connection). If needed
+ * multiple db handles (for eg., one for each tenant),
+ * use dbsm->getDB(tenant) */
+ DB *db;
+ DBZone zone;
+ RGWSyncModuleInstanceRef sync_module;
+ RGWLC* lc;
+ CephContext *cct;
+ const DoutPrefixProvider *dpp;
+ bool use_lc_thread;
+
+ public:
+ DBStore(): dbsm(nullptr), zone(this), cct(nullptr), dpp(nullptr),
+ use_lc_thread(false) {}
+ ~DBStore() { delete dbsm; }
+
+ DBStore& set_run_lc_thread(bool _use_lc_thread) {
+ use_lc_thread = _use_lc_thread;
+ return *this;
+ }
+
+ virtual int initialize(CephContext *cct, const DoutPrefixProvider *dpp) override;
+
+ virtual const std::string get_name() const override {
+ return "dbstore";
+ }
+
+ virtual std::unique_ptr<User> get_user(const rgw_user& u) override;
+ virtual int get_user_by_access_key(const DoutPrefixProvider *dpp, const std::string& key, optional_yield y, std::unique_ptr<User>* user) override;
+ virtual int get_user_by_email(const DoutPrefixProvider *dpp, const std::string& email, optional_yield y, std::unique_ptr<User>* user) override;
+ virtual int get_user_by_swift(const DoutPrefixProvider *dpp, const std::string& user_str, optional_yield y, std::unique_ptr<User>* user) override;
+ virtual std::unique_ptr<Object> get_object(const rgw_obj_key& k) override;
+ virtual std::string get_cluster_id(const DoutPrefixProvider* dpp, optional_yield y);
+ virtual int get_bucket(const DoutPrefixProvider *dpp, User* u, const rgw_bucket& b, std::unique_ptr<Bucket>* bucket, optional_yield y) override;
+ virtual int get_bucket(User* u, const RGWBucketInfo& i, std::unique_ptr<Bucket>* bucket) override;
+ virtual int get_bucket(const DoutPrefixProvider *dpp, User* u, const std::string& tenant, const std::string&name, std::unique_ptr<Bucket>* bucket, optional_yield y) override;
+ virtual bool is_meta_master() override;
+ virtual int forward_request_to_master(const DoutPrefixProvider *dpp, User* user, obj_version* objv,
+ bufferlist& in_data, JSONParser *jp, req_info& info,
+ optional_yield y) override;
+ virtual int forward_iam_request_to_master(const DoutPrefixProvider *dpp, const RGWAccessKey& key, obj_version* objv,
+ bufferlist& in_data,
+ RGWXMLDecoder::XMLParser* parser, req_info& info,
+ optional_yield y) override;
+ virtual Zone* get_zone() { return &zone; }
+ virtual std::string zone_unique_id(uint64_t unique_num) override;
+ virtual std::string zone_unique_trans_id(const uint64_t unique_num) override;
+ virtual int get_zonegroup(const std::string& id, std::unique_ptr<ZoneGroup>* zonegroup) override;
+ virtual int list_all_zones(const DoutPrefixProvider* dpp, std::list<std::string>& zone_ids) override;
+ virtual int cluster_stat(RGWClusterStat& stats) override;
+ virtual std::unique_ptr<Lifecycle> get_lifecycle(void) override;
+ virtual std::unique_ptr<Completions> get_completions(void) override;
+
+ virtual std::unique_ptr<Notification> get_notification(
+ rgw::sal::Object* obj, rgw::sal::Object* src_obj, req_state* s,
+ rgw::notify::EventType event_type, optional_yield y, const std::string* object_name) override;
+
+ virtual std::unique_ptr<Notification> get_notification(
+ const DoutPrefixProvider* dpp, rgw::sal::Object* obj,
+ rgw::sal::Object* src_obj,
+ rgw::notify::EventType event_type, rgw::sal::Bucket* _bucket,
+ std::string& _user_id, std::string& _user_tenant, std::string& _req_id,
+ optional_yield y) override;
+
+ virtual RGWLC* get_rgwlc(void) override;
+ virtual RGWCoroutinesManagerRegistry* get_cr_registry() override { return NULL; }
+ virtual int log_usage(const DoutPrefixProvider *dpp, std::map<rgw_user_bucket, RGWUsageBatch>& usage_info) override;
+ virtual int log_op(const DoutPrefixProvider *dpp, std::string& oid, bufferlist& bl) override;
+ virtual int register_to_service_map(const DoutPrefixProvider *dpp, const std::string& daemon_type,
+ const std::map<std::string, std::string>& meta) override;
+ virtual void get_ratelimit(RGWRateLimitInfo& bucket_ratelimit, RGWRateLimitInfo& user_ratelimit, RGWRateLimitInfo& anon_ratelimit) override;
+ virtual void get_quota(RGWQuota& quota) override;
+ virtual int set_buckets_enabled(const DoutPrefixProvider *dpp, std::vector<rgw_bucket>& buckets, bool enabled) override;
+ virtual int get_sync_policy_handler(const DoutPrefixProvider *dpp,
+ std::optional<rgw_zone_id> zone,
+ std::optional<rgw_bucket> bucket,
+ RGWBucketSyncPolicyHandlerRef *phandler,
+ optional_yield y) override;
+ virtual RGWDataSyncStatusManager* get_data_sync_manager(const rgw_zone_id& source_zone) override;
+ virtual void wakeup_meta_sync_shards(std::set<int>& shard_ids) override { return; }
+ virtual void wakeup_data_sync_shards(const DoutPrefixProvider *dpp,
+ const rgw_zone_id& source_zone,
+ boost::container::flat_map<
+ int,
+ boost::container::flat_set<rgw_data_notify_entry>>& shard_ids) override { return; }
+ virtual int clear_usage(const DoutPrefixProvider *dpp) override { return 0; }
+ virtual int read_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch,
+ uint32_t max_entries, bool *is_truncated,
+ RGWUsageIter& usage_iter,
+ std::map<rgw_user_bucket, rgw_usage_log_entry>& usage) override;
+ virtual int trim_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) override;
+ virtual int get_config_key_val(std::string name, bufferlist* bl) override;
+ virtual int meta_list_keys_init(const DoutPrefixProvider *dpp, const std::string& section, const std::string& marker, void** phandle) override;
+ virtual int meta_list_keys_next(const DoutPrefixProvider *dpp, void* handle, int max, std::list<std::string>& keys, bool* truncated) override;
+ virtual void meta_list_keys_complete(void* handle) override;
+ virtual std::string meta_get_marker(void *handle) override;
+ virtual int meta_remove(const DoutPrefixProvider *dpp, std::string& metadata_key, optional_yield y) override;
+
+ virtual const RGWSyncModuleInstanceRef& get_sync_module() { return sync_module; }
+ virtual std::string get_host_id() { return ""; }
+
+ virtual std::unique_ptr<LuaManager> get_lua_manager() override;
+ virtual std::unique_ptr<RGWRole> get_role(std::string name,
+ std::string tenant,
+ std::string path="",
+ std::string trust_policy="",
+ std::string max_session_duration_str="",
+ std::multimap<std::string,std::string> tags={}) override;
+ virtual std::unique_ptr<RGWRole> get_role(std::string id) override;
+ virtual std::unique_ptr<RGWRole> get_role(const RGWRoleInfo& info) override;
+ virtual int get_roles(const DoutPrefixProvider *dpp,
+ optional_yield y,
+ const std::string& path_prefix,
+ const std::string& tenant,
+ std::vector<std::unique_ptr<RGWRole>>& roles) override;
+ virtual std::unique_ptr<RGWOIDCProvider> get_oidc_provider() override;
+ virtual int get_oidc_providers(const DoutPrefixProvider *dpp,
+ const std::string& tenant,
+ std::vector<std::unique_ptr<RGWOIDCProvider>>& providers) override;
+ virtual std::unique_ptr<Writer> get_append_writer(const DoutPrefixProvider *dpp,
+ optional_yield y,
+ rgw::sal::Object* obj,
+ const rgw_user& owner,
+ const rgw_placement_rule *ptail_placement_rule,
+ const std::string& unique_tag,
+ uint64_t position,
+ uint64_t *cur_accounted_size) override;
+ virtual std::unique_ptr<Writer> get_atomic_writer(const DoutPrefixProvider *dpp,
+ optional_yield y,
+ rgw::sal::Object* obj,
+ const rgw_user& owner,
+ const rgw_placement_rule *ptail_placement_rule,
+ uint64_t olh_epoch,
+ const std::string& unique_tag) override;
+
+ virtual const std::string& get_compression_type(const rgw_placement_rule& rule) override;
+ virtual bool valid_placement(const rgw_placement_rule& rule) override;
+
+ virtual void finalize(void) override;
+
+ virtual CephContext *ctx(void) override {
+ return db->ctx();
+ }
+
+ virtual void register_admin_apis(RGWRESTMgr* mgr) override { };
+
+ /* Unique to DBStore */
+ void setDBStoreManager(DBStoreManager *stm) { dbsm = stm; }
+ DBStoreManager *getDBStoreManager(void) { return dbsm; }
+
+ void setDB(DB * st) { db = st; }
+ DB *getDB(void) { return db; }
+
+ DB *getDB(std::string tenant) { return dbsm->getDB(tenant, false); }
+ };
+
+} } // namespace rgw::sal
diff --git a/src/rgw/rgw_sal_filter.cc b/src/rgw/rgw_sal_filter.cc
new file mode 100644
index 000000000..2a48cec9c
--- /dev/null
+++ b/src/rgw/rgw_sal_filter.cc
@@ -0,0 +1,1370 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "rgw_sal_filter.h"
+
+namespace rgw { namespace sal {
+
+/* These are helpers for getting 'next' out of an object, handling nullptr */
+static inline PlacementTier* nextPlacementTier(PlacementTier* t)
+{
+ if (!t)
+ return nullptr;
+
+ return dynamic_cast<FilterPlacementTier*>(t)->get_next();
+}
+
+static inline User* nextUser(User* t)
+{
+ if (!t)
+ return nullptr;
+
+ return dynamic_cast<FilterUser*>(t)->get_next();
+}
+
+static inline Bucket* nextBucket(Bucket* t)
+{
+ if (!t)
+ return nullptr;
+
+ return dynamic_cast<FilterBucket*>(t)->get_next();
+}
+
+static inline Object* nextObject(Object* t)
+{
+ if (!t)
+ return nullptr;
+
+ return dynamic_cast<FilterObject*>(t)->get_next();
+}
+
+int FilterZoneGroup::get_placement_tier(const rgw_placement_rule& rule,
+ std::unique_ptr<PlacementTier>* tier)
+{
+ std::unique_ptr<PlacementTier> nt;
+ int ret;
+
+ ret = next->get_placement_tier(rule, &nt);
+ if (ret != 0)
+ return ret;
+
+ PlacementTier* t = new FilterPlacementTier(std::move(nt));
+ tier->reset(t);
+ return 0;
+}
+
+int FilterZoneGroup::get_zone_by_id(const std::string& id, std::unique_ptr<Zone>* zone)
+{
+ std::unique_ptr<Zone> nz;
+ int ret = next->get_zone_by_id(id, &nz);
+ if (ret < 0)
+ return ret;
+ Zone *z = new FilterZone(std::move(nz));
+
+ zone->reset(z);
+ return 0;
+}
+
+int FilterZoneGroup::get_zone_by_name(const std::string& name, std::unique_ptr<Zone>* zone)
+{
+ std::unique_ptr<Zone> nz;
+ int ret = next->get_zone_by_name(name, &nz);
+ if (ret < 0)
+ return ret;
+ Zone *z = new FilterZone(std::move(nz));
+
+ zone->reset(z);
+ return 0;
+}
+
// Initialize the filter layer: snapshot the next driver's zone so zone
// accessors can hand out a wrapped FilterZone.  cct/dpp are currently
// unused here; presumably the wrapped driver is initialized separately by
// the driver manager -- TODO confirm.  Always succeeds.
int FilterDriver::initialize(CephContext *cct, const DoutPrefixProvider *dpp)
{
  zone = std::make_unique<FilterZone>(next->get_zone()->clone());

  return 0;
}
+
+const std::string FilterDriver::get_name() const
+{
+ std::string name = "filter<" + next->get_name() + ">";
+ return name;
+}
+
+std::string FilterDriver::get_cluster_id(const DoutPrefixProvider* dpp, optional_yield y)
+{
+ return next->get_cluster_id(dpp, y);
+}
+
+std::unique_ptr<User> FilterDriver::get_user(const rgw_user &u)
+{
+ std::unique_ptr<User> user = next->get_user(u);
+ return std::make_unique<FilterUser>(std::move(user));
+}
+
+int FilterDriver::get_user_by_access_key(const DoutPrefixProvider* dpp, const std::string& key, optional_yield y, std::unique_ptr<User>* user)
+{
+ std::unique_ptr<User> nu;
+ int ret;
+
+ ret = next->get_user_by_access_key(dpp, key, y, &nu);
+ if (ret != 0)
+ return ret;
+
+ User* u = new FilterUser(std::move(nu));
+ user->reset(u);
+ return 0;
+}
+
+int FilterDriver::get_user_by_email(const DoutPrefixProvider* dpp, const std::string& email, optional_yield y, std::unique_ptr<User>* user)
+{
+ std::unique_ptr<User> nu;
+ int ret;
+
+ ret = next->get_user_by_email(dpp, email, y, &nu);
+ if (ret != 0)
+ return ret;
+
+ User* u = new FilterUser(std::move(nu));
+ user->reset(u);
+ return 0;
+}
+
+int FilterDriver::get_user_by_swift(const DoutPrefixProvider* dpp, const std::string& user_str, optional_yield y, std::unique_ptr<User>* user)
+{
+ std::unique_ptr<User> nu;
+ int ret;
+
+ ret = next->get_user_by_swift(dpp, user_str, y, &nu);
+ if (ret != 0)
+ return ret;
+
+ User* u = new FilterUser(std::move(nu));
+ user->reset(u);
+ return 0;
+}
+
+std::unique_ptr<Object> FilterDriver::get_object(const rgw_obj_key& k)
+{
+ std::unique_ptr<Object> o = next->get_object(k);
+ return std::make_unique<FilterObject>(std::move(o));
+}
+
+int FilterDriver::get_bucket(const DoutPrefixProvider* dpp, User* u, const rgw_bucket& b, std::unique_ptr<Bucket>* bucket, optional_yield y)
+{
+ std::unique_ptr<Bucket> nb;
+ int ret;
+ User* nu = nextUser(u);
+
+ ret = next->get_bucket(dpp, nu, b, &nb, y);
+ if (ret != 0)
+ return ret;
+
+ Bucket* fb = new FilterBucket(std::move(nb), u);
+ bucket->reset(fb);
+ return 0;
+}
+
+int FilterDriver::get_bucket(User* u, const RGWBucketInfo& i, std::unique_ptr<Bucket>* bucket)
+{
+ std::unique_ptr<Bucket> nb;
+ int ret;
+ User* nu = nextUser(u);
+
+ ret = next->get_bucket(nu, i, &nb);
+ if (ret != 0)
+ return ret;
+
+ Bucket* fb = new FilterBucket(std::move(nb), u);
+ bucket->reset(fb);
+ return 0;
+}
+
+int FilterDriver::get_bucket(const DoutPrefixProvider* dpp, User* u, const std::string& tenant, const std::string& name, std::unique_ptr<Bucket>* bucket, optional_yield y)
+{
+ std::unique_ptr<Bucket> nb;
+ int ret;
+ User* nu = nextUser(u);
+
+ ret = next->get_bucket(dpp, nu, tenant, name, &nb, y);
+ if (ret != 0)
+ return ret;
+
+ Bucket* fb = new FilterBucket(std::move(nb), u);
+ bucket->reset(fb);
+ return 0;
+}
+
// Multisite role query: delegated to the wrapped driver.
bool FilterDriver::is_meta_master()
{
  return next->is_meta_master();
}

// Forward an admin/metadata request to the master zone via the wrapped driver.
int FilterDriver::forward_request_to_master(const DoutPrefixProvider *dpp,
					    User* user, obj_version* objv,
					    bufferlist& in_data,
					    JSONParser* jp, req_info& info,
					    optional_yield y)
{
  return next->forward_request_to_master(dpp, user, objv, in_data, jp, info, y);
}

// Forward an IAM request to the master zone via the wrapped driver.
int FilterDriver::forward_iam_request_to_master(const DoutPrefixProvider *dpp,
						const RGWAccessKey& key,
						obj_version* objv,
						bufferlist& in_data,
						RGWXMLDecoder::XMLParser* parser,
						req_info& info,
						optional_yield y)
{
  return next->forward_iam_request_to_master(dpp, key, objv, in_data, parser, info, y);
}

// Zone-scoped unique id generation: pure pass-through.
std::string FilterDriver::zone_unique_id(uint64_t unique_num)
{
  return next->zone_unique_id(unique_num);
}

// Zone-scoped unique transaction id: pure pass-through.
std::string FilterDriver::zone_unique_trans_id(uint64_t unique_num)
{
  return next->zone_unique_trans_id(unique_num);
}

// Look up a zonegroup via the wrapped driver and return it wrapped in a
// FilterZoneGroup.
int FilterDriver::get_zonegroup(const std::string& id,
				std::unique_ptr<ZoneGroup>* zonegroup)
{
  std::unique_ptr<ZoneGroup> ngz;
  int ret;

  ret = next->get_zonegroup(id, &ngz);
  if (ret != 0)
    return ret;

  ZoneGroup* zg = new FilterZoneGroup(std::move(ngz));
  zonegroup->reset(zg);
  return 0;
}

// Cluster statistics: pure pass-through.
int FilterDriver::cluster_stat(RGWClusterStat& stats)
{
  return next->cluster_stat(stats);
}

// Lifecycle handle from the wrapped driver, wrapped in a FilterLifecycle.
std::unique_ptr<Lifecycle> FilterDriver::get_lifecycle(void)
{
  std::unique_ptr<Lifecycle> lc = next->get_lifecycle();
  return std::make_unique<FilterLifecycle>(std::move(lc));
}

// Async completions handle, wrapped in a FilterCompletions.
std::unique_ptr<Completions> FilterDriver::get_completions(void)
{
  std::unique_ptr<Completions> c = next->get_completions();
  return std::make_unique<FilterCompletions>(std::move(c));
}

// Build a notification via the wrapped driver; objects are unwrapped on the
// way down and the result is wrapped in a FilterNotification.
std::unique_ptr<Notification> FilterDriver::get_notification(rgw::sal::Object* obj,
				rgw::sal::Object* src_obj, req_state* s,
				rgw::notify::EventType event_type, optional_yield y,
				const std::string* object_name)
{
  std::unique_ptr<Notification> n = next->get_notification(nextObject(obj),
							   nextObject(src_obj),
							   s, event_type, y,
							   object_name);
  return std::make_unique<FilterNotification>(std::move(n));
}

// Non-request variant of the above; bucket is unwrapped as well.
std::unique_ptr<Notification> FilterDriver::get_notification(const DoutPrefixProvider* dpp,
				rgw::sal::Object* obj, rgw::sal::Object* src_obj,
				rgw::notify::EventType event_type,
				rgw::sal::Bucket* _bucket, std::string& _user_id,
				std::string& _user_tenant, std::string& _req_id,
				optional_yield y)
{
  std::unique_ptr<Notification> n = next->get_notification(dpp, nextObject(obj),
							   nextObject(src_obj),
							   event_type,
							   nextBucket(_bucket),
							   _user_id,
							   _user_tenant,
							   _req_id, y);
  return std::make_unique<FilterNotification>(std::move(n));
}
+
/* The following FilterDriver methods add no behavior of their own: each one
 * simply forwards to the wrapped ('next') driver. */

RGWLC* FilterDriver::get_rgwlc()
{
  return next->get_rgwlc();
}

RGWCoroutinesManagerRegistry* FilterDriver::get_cr_registry()
{
  return next->get_cr_registry();
}

int FilterDriver::log_usage(const DoutPrefixProvider *dpp, std::map<rgw_user_bucket, RGWUsageBatch>& usage_info)
{
  return next->log_usage(dpp, usage_info);
}

int FilterDriver::log_op(const DoutPrefixProvider *dpp, std::string& oid, bufferlist& bl)
{
  return next->log_op(dpp, oid, bl);
}

int FilterDriver::register_to_service_map(const DoutPrefixProvider *dpp,
					  const std::string& daemon_type,
					  const std::map<std::string, std::string>& meta)
{
  return next->register_to_service_map(dpp, daemon_type, meta);
}

void FilterDriver::get_quota(RGWQuota& quota)
{
  return next->get_quota(quota);
}

void FilterDriver::get_ratelimit(RGWRateLimitInfo& bucket_ratelimit,
				 RGWRateLimitInfo& user_ratelimit,
				 RGWRateLimitInfo& anon_ratelimit)
{
  return next->get_ratelimit(bucket_ratelimit, user_ratelimit, anon_ratelimit);
}

int FilterDriver::set_buckets_enabled(const DoutPrefixProvider* dpp,
				      std::vector<rgw_bucket>& buckets, bool enabled)
{
  return next->set_buckets_enabled(dpp, buckets, enabled);
}

uint64_t FilterDriver::get_new_req_id()
{
  return next->get_new_req_id();
}

int FilterDriver::get_sync_policy_handler(const DoutPrefixProvider* dpp,
					  std::optional<rgw_zone_id> zone,
					  std::optional<rgw_bucket> bucket,
					  RGWBucketSyncPolicyHandlerRef* phandler,
					  optional_yield y)
{
  return next->get_sync_policy_handler(dpp, zone, bucket, phandler, y);
}

RGWDataSyncStatusManager* FilterDriver::get_data_sync_manager(const rgw_zone_id& source_zone)
{
  return next->get_data_sync_manager(source_zone);
}

void FilterDriver::wakeup_meta_sync_shards(std::set<int>& shard_ids)
{
  return next->wakeup_meta_sync_shards(shard_ids);
}

void FilterDriver::wakeup_data_sync_shards(const DoutPrefixProvider *dpp,
					   const rgw_zone_id& source_zone,
					   boost::container::flat_map<int, boost::container::flat_set<rgw_data_notify_entry>>& shard_ids)
{
  return next->wakeup_data_sync_shards(dpp, source_zone, shard_ids);
}

int FilterDriver::clear_usage(const DoutPrefixProvider *dpp)
{
  return next->clear_usage(dpp);
}

int FilterDriver::read_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch,
				 uint64_t end_epoch, uint32_t max_entries,
				 bool* is_truncated, RGWUsageIter& usage_iter,
				 std::map<rgw_user_bucket, rgw_usage_log_entry>& usage)
{
  return next->read_all_usage(dpp, start_epoch, end_epoch, max_entries,
			      is_truncated, usage_iter, usage);
}

int FilterDriver::trim_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch,
				 uint64_t end_epoch)
{
  return next->trim_all_usage(dpp, start_epoch, end_epoch);
}

int FilterDriver::get_config_key_val(std::string name, bufferlist* bl)
{
  return next->get_config_key_val(name, bl);
}

// Metadata-key iteration: the opaque handle produced/consumed here belongs
// to the wrapped driver and is passed through untouched.
int FilterDriver::meta_list_keys_init(const DoutPrefixProvider *dpp,
				      const std::string& section,
				      const std::string& marker, void** phandle)
{
  return next->meta_list_keys_init(dpp, section, marker, phandle);
}

int FilterDriver::meta_list_keys_next(const DoutPrefixProvider *dpp, void* handle,
				      int max, std::list<std::string>& keys,
				      bool* truncated)
{
  return next->meta_list_keys_next(dpp, handle, max, keys, truncated);
}

void FilterDriver::meta_list_keys_complete(void* handle)
{
  next->meta_list_keys_complete(handle);
}

std::string FilterDriver::meta_get_marker(void* handle)
{
  return next->meta_get_marker(handle);
}

int FilterDriver::meta_remove(const DoutPrefixProvider* dpp, std::string& metadata_key,
			      optional_yield y)
{
  return next->meta_remove(dpp, metadata_key, y);
}

const RGWSyncModuleInstanceRef& FilterDriver::get_sync_module()
{
  return next->get_sync_module();
}
+
// Lua manager from the wrapped driver, wrapped in a FilterLuaManager.
std::unique_ptr<LuaManager> FilterDriver::get_lua_manager()
{
  std::unique_ptr<LuaManager> nm = next->get_lua_manager();

  return std::make_unique<FilterLuaManager>(std::move(nm));
}

/* Role and OIDC-provider accessors are pure pass-throughs: the returned
 * objects are NOT wrapped in filter types. */

std::unique_ptr<RGWRole> FilterDriver::get_role(std::string name,
						std::string tenant,
						std::string path,
						std::string trust_policy,
						std::string max_session_duration_str,
						std::multimap<std::string,std::string> tags)
{
  return next->get_role(name, tenant, path, trust_policy, max_session_duration_str, tags);
}

std::unique_ptr<RGWRole> FilterDriver::get_role(std::string id)
{
  return next->get_role(id);
}

std::unique_ptr<RGWRole> FilterDriver::get_role(const RGWRoleInfo& info)
{
  return next->get_role(info);
}

int FilterDriver::get_roles(const DoutPrefixProvider *dpp,
			    optional_yield y,
			    const std::string& path_prefix,
			    const std::string& tenant,
			    std::vector<std::unique_ptr<RGWRole>>& roles)
{
  return next->get_roles(dpp, y, path_prefix, tenant, roles);
}

std::unique_ptr<RGWOIDCProvider> FilterDriver::get_oidc_provider()
{
  return next->get_oidc_provider();
}

int FilterDriver::get_oidc_providers(const DoutPrefixProvider *dpp,
				     const std::string& tenant,
				     std::vector<std::unique_ptr<RGWOIDCProvider>>& providers)
{
  return next->get_oidc_providers(dpp, tenant, providers);
}
+
// Build an append writer via the wrapped driver.  The object is unwrapped
// on the way down; the FilterWriter keeps the caller's wrapped object.
std::unique_ptr<Writer> FilterDriver::get_append_writer(const DoutPrefixProvider *dpp,
				  optional_yield y,
				  rgw::sal::Object* obj,
				  const rgw_user& owner,
				  const rgw_placement_rule *ptail_placement_rule,
				  const std::string& unique_tag,
				  uint64_t position,
				  uint64_t *cur_accounted_size)
{
  std::unique_ptr<Writer> writer = next->get_append_writer(dpp, y, nextObject(obj),
							   owner, ptail_placement_rule,
							   unique_tag, position,
							   cur_accounted_size);

  return std::make_unique<FilterWriter>(std::move(writer), obj);
}

// Same pattern for an atomic (whole-object) writer.
std::unique_ptr<Writer> FilterDriver::get_atomic_writer(const DoutPrefixProvider *dpp,
				  optional_yield y,
				  rgw::sal::Object* obj,
				  const rgw_user& owner,
				  const rgw_placement_rule *ptail_placement_rule,
				  uint64_t olh_epoch,
				  const std::string& unique_tag)
{
  std::unique_ptr<Writer> writer = next->get_atomic_writer(dpp, y, nextObject(obj),
							   owner, ptail_placement_rule,
							   olh_epoch, unique_tag);

  return std::make_unique<FilterWriter>(std::move(writer), obj);
}

// Placement queries: pure pass-throughs.
const std::string& FilterDriver::get_compression_type(const rgw_placement_rule& rule)
{
  return next->get_compression_type(rule);
}

bool FilterDriver::valid_placement(const rgw_placement_rule& rule)
{
  return next->valid_placement(rule);
}

// Shut down the wrapped driver.
void FilterDriver::finalize(void)
{
  next->finalize();
}

// CephContext comes from the wrapped driver.
CephContext* FilterDriver::ctx(void)
{
  return next->ctx();
}
+
// List this user's buckets via the wrapped user, re-wrapping each returned
// bucket in a FilterBucket that points back at this FilterUser.  The
// caller's list is cleared first, so partial prior contents are discarded.
int FilterUser::list_buckets(const DoutPrefixProvider* dpp, const std::string& marker,
			     const std::string& end_marker, uint64_t max,
			     bool need_stats, BucketList &buckets, optional_yield y)
{
  BucketList bl;
  int ret;

  buckets.clear();
  ret = next->list_buckets(dpp, marker, end_marker, max, need_stats, bl, y);
  if (ret < 0)
    return ret;

  buckets.set_truncated(bl.is_truncated());
  for (auto& ent : bl.get_buckets()) {
    // moves each unwrapped bucket out of the temporary list into a wrapper
    buckets.add(std::make_unique<FilterBucket>(std::move(ent.second), this));
  }

  return 0;
}

// Create a bucket via the wrapped user; on success the new bucket is handed
// back wrapped in a FilterBucket owned by this FilterUser.
int FilterUser::create_bucket(const DoutPrefixProvider* dpp,
			      const rgw_bucket& b,
			      const std::string& zonegroup_id,
			      rgw_placement_rule& placement_rule,
			      std::string& swift_ver_location,
			      const RGWQuotaInfo * pquota_info,
			      const RGWAccessControlPolicy& policy,
			      Attrs& attrs,
			      RGWBucketInfo& info,
			      obj_version& ep_objv,
			      bool exclusive,
			      bool obj_lock_enabled,
			      bool* existed,
			      req_info& req_info,
			      std::unique_ptr<Bucket>* bucket_out,
			      optional_yield y)
{
  std::unique_ptr<Bucket> nb;
  int ret;

  ret = next->create_bucket(dpp, b, zonegroup_id, placement_rule, swift_ver_location, pquota_info, policy, attrs, info, ep_objv, exclusive, obj_lock_enabled, existed, req_info, &nb, y);
  if (ret < 0)
    return ret;

  Bucket* fb = new FilterBucket(std::move(nb), this);
  bucket_out->reset(fb);
  return 0;
}
+
/* The following FilterUser methods forward directly to the wrapped user. */

int FilterUser::read_attrs(const DoutPrefixProvider* dpp, optional_yield y)
{
  return next->read_attrs(dpp, y);
}

int FilterUser::merge_and_store_attrs(const DoutPrefixProvider* dpp,
				      Attrs& new_attrs, optional_yield y)
{
  return next->merge_and_store_attrs(dpp, new_attrs, y);
}

int FilterUser::read_stats(const DoutPrefixProvider *dpp,
			   optional_yield y, RGWStorageStats* stats,
			   ceph::real_time* last_stats_sync,
			   ceph::real_time* last_stats_update)
{
  return next->read_stats(dpp, y, stats, last_stats_sync, last_stats_update);
}

int FilterUser::read_stats_async(const DoutPrefixProvider *dpp, RGWGetUserStats_CB* cb)
{
  return next->read_stats_async(dpp, cb);
}

int FilterUser::complete_flush_stats(const DoutPrefixProvider *dpp, optional_yield y)
{
  return next->complete_flush_stats(dpp, y);
}

int FilterUser::read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch,
			   uint64_t end_epoch, uint32_t max_entries,
			   bool* is_truncated, RGWUsageIter& usage_iter,
			   std::map<rgw_user_bucket, rgw_usage_log_entry>& usage)
{
  return next->read_usage(dpp, start_epoch, end_epoch, max_entries,
			  is_truncated, usage_iter, usage);
}

int FilterUser::trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch,
			   uint64_t end_epoch)
{
  return next->trim_usage(dpp, start_epoch, end_epoch);
}

int FilterUser::load_user(const DoutPrefixProvider* dpp, optional_yield y)
{
  return next->load_user(dpp, y);
}

int FilterUser::store_user(const DoutPrefixProvider* dpp, optional_yield y, bool exclusive, RGWUserInfo* old_info)
{
  return next->store_user(dpp, y, exclusive, old_info);
}

int FilterUser::remove_user(const DoutPrefixProvider* dpp, optional_yield y)
{
  return next->remove_user(dpp, y);
}

int FilterUser::verify_mfa(const std::string& mfa_str, bool* verified,
			   const DoutPrefixProvider* dpp, optional_yield y)
{
  return next->verify_mfa(mfa_str, verified, dpp, y);
}

// Object handle from the wrapped bucket, wrapped in a FilterObject that
// points back at this FilterBucket.
std::unique_ptr<Object> FilterBucket::get_object(const rgw_obj_key& k)
{
  std::unique_ptr<Object> o = next->get_object(k);

  return std::make_unique<FilterObject>(std::move(o), this);
}
+
/* FilterBucket: most methods forward directly to the wrapped bucket. */

int FilterBucket::list(const DoutPrefixProvider* dpp, ListParams& params, int max,
		       ListResults& results, optional_yield y)
{
  return next->list(dpp, params, max, results, y);
}

int FilterBucket::remove_bucket(const DoutPrefixProvider* dpp,
				bool delete_children,
				bool forward_to_master,
				req_info* req_info,
				optional_yield y)
{
  return next->remove_bucket(dpp, delete_children, forward_to_master, req_info, y);
}

int FilterBucket::remove_bucket_bypass_gc(int concurrent_max,
					  bool keep_index_consistent,
					  optional_yield y,
					  const DoutPrefixProvider *dpp)
{
  return next->remove_bucket_bypass_gc(concurrent_max, keep_index_consistent, y, dpp);
}

int FilterBucket::set_acl(const DoutPrefixProvider* dpp,
			  RGWAccessControlPolicy &acl, optional_yield y)
{
  return next->set_acl(dpp, acl, y);
}

int FilterBucket::load_bucket(const DoutPrefixProvider* dpp, optional_yield y,
			      bool get_stats)
{
  return next->load_bucket(dpp, y, get_stats);
}

int FilterBucket::read_stats(const DoutPrefixProvider *dpp,
			     const bucket_index_layout_generation& idx_layout,
			     int shard_id, std::string* bucket_ver,
			     std::string* master_ver,
			     std::map<RGWObjCategory, RGWStorageStats>& stats,
			     std::string* max_marker, bool* syncstopped)
{
  return next->read_stats(dpp, idx_layout, shard_id, bucket_ver, master_ver,
			  stats, max_marker, syncstopped);
}

int FilterBucket::read_stats_async(const DoutPrefixProvider *dpp,
				   const bucket_index_layout_generation& idx_layout,
				   int shard_id, RGWGetBucketStats_CB* ctx)
{
  return next->read_stats_async(dpp, idx_layout, shard_id, ctx);
}

int FilterBucket::sync_user_stats(const DoutPrefixProvider *dpp, optional_yield y)
{
  return next->sync_user_stats(dpp, y);
}

int FilterBucket::update_container_stats(const DoutPrefixProvider* dpp)
{
  return next->update_container_stats(dpp);
}

int FilterBucket::check_bucket_shards(const DoutPrefixProvider* dpp)
{
  return next->check_bucket_shards(dpp);
}

int FilterBucket::chown(const DoutPrefixProvider* dpp, User& new_user, optional_yield y)
{
  return next->chown(dpp, new_user, y);
}

int FilterBucket::put_info(const DoutPrefixProvider* dpp, bool exclusive,
			   ceph::real_time _mtime)
{
  return next->put_info(dpp, exclusive, _mtime);
}

// Ownership check is delegated with the UNWRAPPED user, since the wrapped
// bucket compares against its own (unwrapped) owner record.
bool FilterBucket::is_owner(User* user)
{
  return next->is_owner(nextUser(user));
}

int FilterBucket::check_empty(const DoutPrefixProvider* dpp, optional_yield y)
{
  return next->check_empty(dpp, y);
}

int FilterBucket::check_quota(const DoutPrefixProvider *dpp, RGWQuota& quota,
			      uint64_t obj_size, optional_yield y,
			      bool check_size_only)
{
  return next->check_quota(dpp, quota, obj_size, y, check_size_only);
}

int FilterBucket::merge_and_store_attrs(const DoutPrefixProvider* dpp,
					Attrs& new_attrs, optional_yield y)
{
  return next->merge_and_store_attrs(dpp, new_attrs, y);
}

int FilterBucket::try_refresh_info(const DoutPrefixProvider* dpp,
				   ceph::real_time* pmtime)
{
  return next->try_refresh_info(dpp, pmtime);
}

int FilterBucket::read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch,
			     uint64_t end_epoch, uint32_t max_entries,
			     bool* is_truncated, RGWUsageIter& usage_iter,
			     std::map<rgw_user_bucket, rgw_usage_log_entry>& usage)
{
  return next->read_usage(dpp, start_epoch, end_epoch, max_entries,
			  is_truncated, usage_iter, usage);
}

int FilterBucket::trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch,
			     uint64_t end_epoch)
{
  return next->trim_usage(dpp, start_epoch, end_epoch);
}

int FilterBucket::remove_objs_from_index(const DoutPrefixProvider *dpp,
					 std::list<rgw_obj_index_key>& objs_to_unlink)
{
  return next->remove_objs_from_index(dpp, objs_to_unlink);
}

int FilterBucket::check_index(const DoutPrefixProvider *dpp,
			      std::map<RGWObjCategory, RGWStorageStats>& existing_stats,
			      std::map<RGWObjCategory, RGWStorageStats>& calculated_stats)
{
  return next->check_index(dpp, existing_stats, calculated_stats);
}

int FilterBucket::rebuild_index(const DoutPrefixProvider *dpp)
{
  return next->rebuild_index(dpp);
}

int FilterBucket::set_tag_timeout(const DoutPrefixProvider *dpp, uint64_t timeout)
{
  return next->set_tag_timeout(dpp, timeout);
}

int FilterBucket::purge_instance(const DoutPrefixProvider* dpp)
{
  return next->purge_instance(dpp);
}

// Multipart upload handle from the wrapped bucket, wrapped for this bucket.
std::unique_ptr<MultipartUpload> FilterBucket::get_multipart_upload(
				  const std::string& oid,
				  std::optional<std::string> upload_id,
				  ACLOwner owner, ceph::real_time mtime)
{
  std::unique_ptr<MultipartUpload> nmu =
    next->get_multipart_upload(oid, upload_id, owner, mtime);

  return std::make_unique<FilterMultipartUpload>(std::move(nmu), this);
}

// List in-progress multipart uploads via the wrapped bucket, wrapping each
// result.  Note: 'uploads' is appended to, not cleared.
int FilterBucket::list_multiparts(const DoutPrefixProvider *dpp,
				  const std::string& prefix,
				  std::string& marker,
				  const std::string& delim,
				  const int& max_uploads,
				  std::vector<std::unique_ptr<MultipartUpload>>& uploads,
				  std::map<std::string, bool> *common_prefixes,
				  bool *is_truncated)
{
  std::vector<std::unique_ptr<MultipartUpload>> nup;
  int ret;

  ret = next->list_multiparts(dpp, prefix, marker, delim, max_uploads, nup,
			      common_prefixes, is_truncated);
  if (ret < 0)
    return ret;

  for (auto& ent : nup) {
    uploads.emplace_back(std::make_unique<FilterMultipartUpload>(std::move(ent), this));
  }

  return 0;
}

int FilterBucket::abort_multiparts(const DoutPrefixProvider* dpp, CephContext* cct)
{
  return next->abort_multiparts(dpp, cct);
}
+
+int FilterObject::delete_object(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ bool prevent_versioning)
+{
+ return next->delete_object(dpp, y, prevent_versioning);
+}
+
+int FilterObject::delete_obj_aio(const DoutPrefixProvider* dpp, RGWObjState* astate,
+ Completions* aio, bool keep_index_consistent,
+ optional_yield y)
+{
+ return next->delete_obj_aio(dpp, astate, aio, keep_index_consistent, y);
+}
+
+int FilterObject::copy_object(User* user,
+ req_info* info,
+ const rgw_zone_id& source_zone,
+ rgw::sal::Object* dest_object,
+ rgw::sal::Bucket* dest_bucket,
+ rgw::sal::Bucket* src_bucket,
+ const rgw_placement_rule& dest_placement,
+ ceph::real_time* src_mtime,
+ ceph::real_time* mtime,
+ const ceph::real_time* mod_ptr,
+ const ceph::real_time* unmod_ptr,
+ bool high_precision_time,
+ const char* if_match,
+ const char* if_nomatch,
+ AttrsMod attrs_mod,
+ bool copy_if_newer,
+ Attrs& attrs,
+ RGWObjCategory category,
+ uint64_t olh_epoch,
+ boost::optional<ceph::real_time> delete_at,
+ std::string* version_id,
+ std::string* tag,
+ std::string* etag,
+ void (*progress_cb)(off_t, void *),
+ void* progress_data,
+ const DoutPrefixProvider* dpp,
+ optional_yield y)
+{
+ return next->copy_object(user, info, source_zone,
+ nextObject(dest_object),
+ nextBucket(dest_bucket),
+ nextBucket(src_bucket),
+ dest_placement, src_mtime, mtime,
+ mod_ptr, unmod_ptr, high_precision_time, if_match,
+ if_nomatch, attrs_mod, copy_if_newer, attrs,
+ category, olh_epoch, delete_at, version_id, tag,
+ etag, progress_cb, progress_data, dpp, y);
+}
+
+RGWAccessControlPolicy& FilterObject::get_acl()
+{
+ return next->get_acl();
+}
+
+int FilterObject::get_obj_state(const DoutPrefixProvider* dpp, RGWObjState **pstate,
+ optional_yield y, bool follow_olh)
+{
+ return next->get_obj_state(dpp, pstate, y, follow_olh);
+}
+
+int FilterObject::set_obj_attrs(const DoutPrefixProvider* dpp, Attrs* setattrs,
+ Attrs* delattrs, optional_yield y)
+{
+ return next->set_obj_attrs(dpp, setattrs, delattrs, y);
+}
+
+int FilterObject::get_obj_attrs(optional_yield y, const DoutPrefixProvider* dpp,
+ rgw_obj* target_obj)
+{
+ return next->get_obj_attrs(y, dpp, target_obj);
+}
+
+/* Set (create or overwrite) a single attribute on the wrapped object. */
+int FilterObject::modify_obj_attrs(const char* attr_name, bufferlist& attr_val,
+                                   optional_yield y, const DoutPrefixProvider* dpp)
+{
+  return next->modify_obj_attrs(attr_name, attr_val, y, dpp);
+}
+
+/* Remove a single attribute from the wrapped object. */
+int FilterObject::delete_obj_attrs(const DoutPrefixProvider* dpp,
+                                   const char* attr_name, optional_yield y)
+{
+  return next->delete_obj_attrs(dpp, attr_name, y);
+}
+
+/* Delegate the expiration check to the wrapped object. */
+bool FilterObject::is_expired()
+{
+  return next->is_expired();
+}
+
+/* Have the wrapped object generate a random version-instance name. */
+void FilterObject::gen_rand_obj_instance_name()
+{
+  return next->gen_rand_obj_instance_name();
+}
+
+/* Obtain a multipart serializer for lock_name from the wrapped layer and
+ * re-wrap it so lock calls keep flowing through the filter stack. */
+std::unique_ptr<MPSerializer> FilterObject::get_serializer(const DoutPrefixProvider *dpp,
+                                                           const std::string& lock_name)
+{
+  return std::make_unique<FilterMPSerializer>(next->get_serializer(dpp, lock_name));
+}
+
+/* Transition this object to another placement/storage class via the
+ * wrapped layer; the bucket argument is unwrapped first. */
+int FilterObject::transition(Bucket* bucket,
+                             const rgw_placement_rule& placement_rule,
+                             const real_time& mtime,
+                             uint64_t olh_epoch,
+                             const DoutPrefixProvider* dpp,
+                             optional_yield y)
+{
+  return next->transition(nextBucket(bucket), placement_rule, mtime, olh_epoch,
+                          dpp, y);
+}
+
+/* Transition this object to a cloud tier via the wrapped layer; both the
+ * bucket and the placement tier are unwrapped before forwarding. */
+int FilterObject::transition_to_cloud(Bucket* bucket,
+                                      rgw::sal::PlacementTier* tier,
+                                      rgw_bucket_dir_entry& o,
+                                      std::set<std::string>& cloud_targets,
+                                      CephContext* cct,
+                                      bool update_object,
+                                      const DoutPrefixProvider* dpp,
+                                      optional_yield y)
+{
+  return next->transition_to_cloud(nextBucket(bucket), nextPlacementTier(tier),
+                                   o, cloud_targets, cct, update_object, dpp, y);
+}
+
+/* Ask the wrapped layer whether the two placement rules are equivalent. */
+bool FilterObject::placement_rules_match(rgw_placement_rule& r1, rgw_placement_rule& r2)
+{
+  return next->placement_rules_match(r1, r2);
+}
+
+/* Dump the object's on-store layout into formatter f (admin/debug path). */
+int FilterObject::dump_obj_layout(const DoutPrefixProvider *dpp, optional_yield y,
+                                  Formatter* f)
+{
+  return next->dump_obj_layout(dpp, y, f);
+}
+
+/* Re-point this object at bucket b.  This layer keeps the wrapped bucket;
+ * the next layer is handed the unwrapped one so each layer sees its own
+ * implementation type.  (Also drops the stray ';' that followed the
+ * function body.) */
+void FilterObject::set_bucket(Bucket* b)
+{
+  bucket = b;
+  next->set_bucket(nextBucket(b));
+}
+
+/* Swift object-versioning restore; restored reports whether the wrapped
+ * layer brought a version back. */
+int FilterObject::swift_versioning_restore(bool& restored,
+                                           const DoutPrefixProvider* dpp)
+{
+  return next->swift_versioning_restore(restored, dpp);
+}
+
+/* Swift object-versioning copy, delegated to the wrapped layer. */
+int FilterObject::swift_versioning_copy(const DoutPrefixProvider* dpp,
+                                        optional_yield y)
+{
+  return next->swift_versioning_copy(dpp, y);
+}
+
+/* Create a read operation: wrap the next layer's ReadOp in a FilterReadOp
+ * so params flow down and results flow back up through this filter. */
+std::unique_ptr<Object::ReadOp> FilterObject::get_read_op()
+{
+  return std::make_unique<FilterReadOp>(next->get_read_op());
+}
+
+/* Create a delete operation wrapping the next layer's DeleteOp. */
+std::unique_ptr<Object::DeleteOp> FilterObject::get_delete_op()
+{
+  return std::make_unique<FilterDeleteOp>(next->get_delete_op());
+}
+
+/* Page through the object's omap starting at marker, returning up to
+ * count entries in *m; *pmore tells whether more remain. */
+int FilterObject::omap_get_vals(const DoutPrefixProvider *dpp, const std::string& marker,
+                                uint64_t count, std::map<std::string, bufferlist> *m,
+                                bool* pmore, optional_yield y)
+{
+  return next->omap_get_vals(dpp, marker, count, m, pmore, y);
+}
+
+/* Fetch the object's entire omap into *m via the wrapped layer. */
+int FilterObject::omap_get_all(const DoutPrefixProvider *dpp,
+                               std::map<std::string, bufferlist> *m,
+                               optional_yield y)
+{
+  return next->omap_get_all(dpp, m, y);
+}
+
+/* Look up a specific set of omap keys on raw object oid; results land in
+ * *vals. */
+int FilterObject::omap_get_vals_by_keys(const DoutPrefixProvider *dpp,
+                                        const std::string& oid,
+                                        const std::set<std::string>& keys,
+                                        Attrs* vals)
+{
+  return next->omap_get_vals_by_keys(dpp, oid, keys, vals);
+}
+
+/* Store one omap key/value; must_exist asks the wrapped layer to fail if
+ * the object is absent. */
+int FilterObject::omap_set_val_by_key(const DoutPrefixProvider *dpp,
+                                      const std::string& key, bufferlist& val,
+                                      bool must_exist, optional_yield y)
+{
+  return next->omap_set_val_by_key(dpp, key, val, must_exist, y);
+}
+
+/* Change the object's owner to new_user via the wrapped layer. */
+int FilterObject::chown(User& new_user, const DoutPrefixProvider* dpp, optional_yield y)
+{
+  return next->chown(new_user, dpp, y);
+}
+
+/* Prepare the read: push this op's params down into the wrapped ReadOp,
+ * then let it validate/stat the object. */
+int FilterObject::FilterReadOp::prepare(optional_yield y, const DoutPrefixProvider* dpp)
+{
+  /* Copy params into next */
+  next->params = params;
+  return next->prepare(y, dpp);
+}
+
+/* Synchronous ranged read [ofs, end] into bl.  On success, mirror the
+ * out-params the wrapped op filled in back into our own params so upper
+ * layers can see them; on error, return the code untouched. */
+int FilterObject::FilterReadOp::read(int64_t ofs, int64_t end, bufferlist& bl,
+                                     optional_yield y, const DoutPrefixProvider* dpp)
+{
+  const int ret = next->read(ofs, end, bl, y, dpp);
+  if (ret >= 0) {
+    params = next->params;
+  }
+  return ret;
+}
+
+/* Read a single attribute (name) of the object into dest. */
+int FilterObject::FilterReadOp::get_attr(const DoutPrefixProvider* dpp, const char* name, bufferlist& dest, optional_yield y)
+{
+  return next->get_attr(dpp, name, dest, y);
+}
+
+/* Streaming read over [ofs, end], delivering data through cb.  As with
+ * read(), the wrapped op's out-params are copied back only on success. */
+int FilterObject::FilterReadOp::iterate(const DoutPrefixProvider* dpp, int64_t ofs,
+                                        int64_t end, RGWGetDataCB* cb, optional_yield y)
+{
+  const int ret = next->iterate(dpp, ofs, end, cb, y);
+  if (ret >= 0) {
+    params = next->params;
+  }
+  return ret;
+}
+
+/* Execute the delete: push our params down, run the wrapped delete, then
+ * surface its result block regardless of the return code. */
+int FilterObject::FilterDeleteOp::delete_obj(const DoutPrefixProvider* dpp,
+                                             optional_yield y)
+{
+  next->params = params;
+  const int ret = next->delete_obj(dpp, y);
+  result = next->result;
+  return ret;
+}
+
+/* Return the upload's meta object, wrapped as a FilterObject that belongs
+ * to this filter's bucket. */
+std::unique_ptr<rgw::sal::Object> FilterMultipartUpload::get_meta_obj()
+{
+  return std::make_unique<FilterObject>(next->get_meta_obj(), bucket);
+}
+
+/* Initialize the multipart upload in the wrapped layer (owner, placement
+ * and initial attrs are forwarded unchanged). */
+int FilterMultipartUpload::init(const DoutPrefixProvider *dpp, optional_yield y,
+                                ACLOwner& owner, rgw_placement_rule& dest_placement,
+                                rgw::sal::Attrs& attrs)
+{
+  return next->init(dpp, y, owner, dest_placement, attrs);
+}
+
+/* List up to num_parts parts starting after marker.  On success, rebuild
+ * this layer's cached part map by wrapping each of the next layer's parts
+ * in a FilterMultipartPart; on error, leave our cache untouched. */
+int FilterMultipartUpload::list_parts(const DoutPrefixProvider *dpp, CephContext *cct,
+                                      int num_parts, int marker,
+                                      int *next_marker, bool *truncated,
+                                      bool assume_unsorted)
+{
+  const int ret = next->list_parts(dpp, cct, num_parts, marker, next_marker,
+                                   truncated, assume_unsorted);
+  if (ret < 0) {
+    return ret;
+  }
+
+  parts.clear();
+  for (auto& [num, part] : next->get_parts()) {
+    parts.emplace(num, std::make_unique<FilterMultipartPart>(std::move(part)));
+  }
+  return 0;
+}
+
+/* Abort the multipart upload via the wrapped layer. */
+int FilterMultipartUpload::abort(const DoutPrefixProvider *dpp, CephContext *cct)
+{
+  return next->abort(dpp, cct);
+}
+
+/* Complete the multipart upload.  All bookkeeping arguments are forwarded
+ * unchanged; only target_obj is unwrapped so the next layer writes to its
+ * own object type. */
+int FilterMultipartUpload::complete(const DoutPrefixProvider *dpp,
+                                    optional_yield y, CephContext* cct,
+                                    std::map<int, std::string>& part_etags,
+                                    std::list<rgw_obj_index_key>& remove_objs,
+                                    uint64_t& accounted_size, bool& compressed,
+                                    RGWCompressionInfo& cs_info, off_t& ofs,
+                                    std::string& tag, ACLOwner& owner,
+                                    uint64_t olh_epoch,
+                                    rgw::sal::Object* target_obj)
+{
+  return next->complete(dpp, y, cct, part_etags, remove_objs, accounted_size,
+                        compressed, cs_info, ofs, tag, owner, olh_epoch,
+                        nextObject(target_obj));
+}
+
+/* Fetch the upload's placement rule and (optionally) attrs from the
+ * wrapped layer. */
+int FilterMultipartUpload::get_info(const DoutPrefixProvider *dpp,
+                                    optional_yield y, rgw_placement_rule** rule,
+                                    rgw::sal::Attrs* attrs)
+{
+  return next->get_info(dpp, y, rule, attrs);
+}
+
+/* Create a part writer.  The next layer writes through the unwrapped
+ * object; the FilterWriter keeps the wrapped object for upper layers. */
+std::unique_ptr<Writer> FilterMultipartUpload::get_writer(
+    const DoutPrefixProvider *dpp,
+    optional_yield y,
+    rgw::sal::Object* obj,
+    const rgw_user& owner,
+    const rgw_placement_rule *ptail_placement_rule,
+    uint64_t part_num,
+    const std::string& part_num_str)
+{
+  return std::make_unique<FilterWriter>(
+      next->get_writer(dpp, y, nextObject(obj), owner, ptail_placement_rule,
+                       part_num, part_num_str),
+      obj);
+}
+
+/* Try to take the multipart lock for duration dur via the wrapped
+ * serializer. */
+int FilterMPSerializer::try_lock(const DoutPrefixProvider *dpp, utime_t dur,
+                                 optional_yield y)
+{
+  return next->try_lock(dpp, dur, y);
+}
+
+/* Try to take the lifecycle lock for duration dur via the wrapped
+ * serializer. */
+int FilterLCSerializer::try_lock(const DoutPrefixProvider *dpp, utime_t dur,
+                                 optional_yield y)
+{
+  return next->try_lock(dpp, dur, y);
+}
+
+/* Allocate a fresh (empty) lifecycle entry, wrapped for this layer. */
+std::unique_ptr<Lifecycle::LCEntry> FilterLifecycle::get_entry()
+{
+  return std::make_unique<FilterLCEntry>(next->get_entry());
+}
+
+/* Look up the lifecycle entry for marker in shard object oid.
+ * On success returns 0 and stores a FilterLCEntry wrapping the next
+ * layer's entry in *entry; on error returns the negative code and leaves
+ * *entry untouched. */
+int FilterLifecycle::get_entry(const std::string& oid, const std::string& marker,
+                               std::unique_ptr<LCEntry>* entry)
+{
+  std::unique_ptr<LCEntry> ne;
+  int ret = next->get_entry(oid, marker, &ne);
+  if (ret < 0)
+    return ret;
+
+  /* make_unique instead of raw new + reset(): no naked new. */
+  *entry = std::make_unique<FilterLCEntry>(std::move(ne));
+
+  return 0;
+}
+
+/* Fetch the lifecycle entry following marker in shard object oid.
+ * Same contract as get_entry(): *entry is set only on success. */
+int FilterLifecycle::get_next_entry(const std::string& oid, const std::string& marker,
+                                    std::unique_ptr<LCEntry>* entry)
+{
+  std::unique_ptr<LCEntry> ne;
+  int ret = next->get_next_entry(oid, marker, &ne);
+  if (ret < 0)
+    return ret;
+
+  /* make_unique instead of raw new + reset(): no naked new. */
+  *entry = std::make_unique<FilterLCEntry>(std::move(ne));
+
+  return 0;
+}
+
+/* Persist entry in shard object oid via the wrapped layer. */
+int FilterLifecycle::set_entry(const std::string& oid, LCEntry& entry)
+{
+  return next->set_entry(oid, entry);
+}
+
+/* List up to max_entries lifecycle entries after marker from shard oid,
+ * appending each one to entries wrapped as a FilterLCEntry.  On error the
+ * output vector is left unmodified. */
+int FilterLifecycle::list_entries(const std::string& oid, const std::string& marker,
+                                  uint32_t max_entries,
+                                  std::vector<std::unique_ptr<LCEntry>>& entries)
+{
+  std::vector<std::unique_ptr<LCEntry>> unwrapped;
+  const int ret = next->list_entries(oid, marker, max_entries, unwrapped);
+  if (ret < 0) {
+    return ret;
+  }
+
+  entries.reserve(entries.size() + unwrapped.size());
+  for (auto& ent : unwrapped) {
+    entries.emplace_back(std::make_unique<FilterLCEntry>(std::move(ent)));
+  }
+  return 0;
+}
+
+/* Remove entry from shard object oid via the wrapped layer. */
+int FilterLifecycle::rm_entry(const std::string& oid, LCEntry& entry)
+{
+  return next->rm_entry(oid, entry);
+}
+
+/* Read the lifecycle head record of shard object oid.  On success stores
+ * a FilterLCHead wrapping the next layer's head in *head; on error *head
+ * is left untouched. */
+int FilterLifecycle::get_head(const std::string& oid, std::unique_ptr<LCHead>* head)
+{
+  std::unique_ptr<LCHead> nh;
+  int ret = next->get_head(oid, &nh);
+  if (ret < 0)
+    return ret;
+
+  /* make_unique instead of raw new + reset(): no naked new. */
+  *head = std::make_unique<FilterLCHead>(std::move(nh));
+
+  return 0;
+}
+
+/* Persist head by handing the *unwrapped* head down to the next layer.
+ * NOTE(review): head must be a FilterLCHead (as produced by get_head());
+ * the reference dynamic_cast throws std::bad_cast otherwise -- confirm all
+ * callers only pass heads obtained from this layer. */
+int FilterLifecycle::put_head(const std::string& oid, LCHead& head)
+{
+  return next->put_head(oid, *(dynamic_cast<FilterLCHead&>(head).next.get()));
+}
+
+/* Build a lifecycle lock serializer for (lock_name, oid, cookie), wrapped
+ * so lock calls keep flowing through the filter stack. */
+std::unique_ptr<LCSerializer> FilterLifecycle::get_serializer(
+    const std::string& lock_name,
+    const std::string& oid,
+    const std::string& cookie)
+{
+  return std::make_unique<FilterLCSerializer>(
+      next->get_serializer(lock_name, oid, cookie));
+}
+
+/* Reserve a notification slot (pre-publish) via the wrapped layer. */
+int FilterNotification::publish_reserve(const DoutPrefixProvider *dpp,
+                                        RGWObjTags* obj_tags)
+{
+  return next->publish_reserve(dpp, obj_tags);
+}
+
+/* Commit the reserved notification with the final object metadata. */
+int FilterNotification::publish_commit(const DoutPrefixProvider* dpp, uint64_t size,
+                                       const ceph::real_time& mtime, const
+                                       std::string& etag, const std::string& version)
+{
+  return next->publish_commit(dpp, size, mtime, etag, version);
+}
+
+/* Feed one chunk of data at offset into the wrapped writer. */
+int FilterWriter::process(bufferlist&& data, uint64_t offset)
+{
+  return next->process(std::move(data), offset);
+}
+
+/* Finalize the write: all metadata (etag, mtimes, attrs, conditionals,
+ * zone trace) is forwarded unchanged to the wrapped writer. */
+int FilterWriter::complete(size_t accounted_size, const std::string& etag,
+                           ceph::real_time *mtime, ceph::real_time set_mtime,
+                           std::map<std::string, bufferlist>& attrs,
+                           ceph::real_time delete_at,
+                           const char *if_match, const char *if_nomatch,
+                           const std::string *user_data,
+                           rgw_zone_set *zones_trace, bool *canceled,
+                           optional_yield y)
+{
+  return next->complete(accounted_size, etag, mtime, set_mtime, attrs,
+                        delete_at, if_match, if_nomatch, user_data, zones_trace,
+                        canceled, y);
+}
+
+/* Read the Lua script stored under key into script. */
+int FilterLuaManager::get_script(const DoutPrefixProvider* dpp, optional_yield y,
+                                 const std::string& key, std::string& script)
+{
+  return next->get_script(dpp, y, key, script);
+}
+
+/* Store script under key via the wrapped manager. */
+int FilterLuaManager::put_script(const DoutPrefixProvider* dpp, optional_yield y,
+                                 const std::string& key, const std::string& script)
+{
+  return next->put_script(dpp, y, key, script);
+}
+
+/* Delete the Lua script stored under key. */
+int FilterLuaManager::del_script(const DoutPrefixProvider* dpp, optional_yield y,
+                                 const std::string& key)
+{
+  return next->del_script(dpp, y, key);
+}
+
+/* Register a Lua package for installation via the wrapped manager. */
+int FilterLuaManager::add_package(const DoutPrefixProvider* dpp, optional_yield y,
+                                  const std::string& package_name)
+{
+  return next->add_package(dpp, y, package_name);
+}
+
+/* Unregister a Lua package via the wrapped manager. */
+int FilterLuaManager::remove_package(const DoutPrefixProvider* dpp, optional_yield y,
+                                     const std::string& package_name)
+{
+  return next->remove_package(dpp, y, package_name);
+}
+
+/* List the registered Lua packages into packages. */
+int FilterLuaManager::list_packages(const DoutPrefixProvider* dpp, optional_yield y,
+                                    rgw::lua::packages_t& packages)
+{
+  return next->list_packages(dpp, y, packages);
+}
+
+} } // namespace rgw::sal
+
+extern "C" {
+
+/* C entry point used by the driver loader: wrap 'next' in the pass-through
+ * base filter.  Ownership of the returned Driver passes to the caller. */
+rgw::sal::Driver* newBaseFilter(rgw::sal::Driver* next)
+{
+  return new rgw::sal::FilterDriver(next);
+}
+
+}
diff --git a/src/rgw/rgw_sal_filter.h b/src/rgw/rgw_sal_filter.h
new file mode 100644
index 000000000..951a1de5f
--- /dev/null
+++ b/src/rgw/rgw_sal_filter.h
@@ -0,0 +1,921 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "rgw_sal.h"
+#include "rgw_oidc_provider.h"
+#include "rgw_role.h"
+
+namespace rgw { namespace sal {
+
+/* Pass-through wrapper for async Completions: owns the next layer's
+ * Completions and forwards drain() to it. */
+class FilterCompletions : public Completions {
+protected:
+  std::unique_ptr<Completions> next;
+
+public:
+  FilterCompletions(std::unique_ptr<Completions> _next) : next(std::move(_next)) {}
+  virtual ~FilterCompletions() = default;
+  virtual int drain() override { return next->drain(); }
+};
+
+/* Pass-through wrapper for a PlacementTier; every accessor delegates to
+ * the wrapped tier.  get_next() exposes the wrapped tier to sibling
+ * filter classes that must unwrap before forwarding. */
+class FilterPlacementTier : public PlacementTier {
+protected:
+  std::unique_ptr<PlacementTier> next;
+
+public:
+  FilterPlacementTier(std::unique_ptr<PlacementTier> _next) : next(std::move(_next)) {}
+  virtual ~FilterPlacementTier() = default;
+
+  virtual const std::string& get_tier_type() override { return next->get_tier_type(); }
+  virtual const std::string& get_storage_class() override { return next->get_storage_class(); }
+  virtual bool retain_head_object() override { return next->retain_head_object(); }
+
+  /* Internal to Filters */
+  PlacementTier* get_next() { return next.get(); }
+};
+
+/* Pass-through wrapper for a ZoneGroup.  All read-only accessors delegate
+ * directly; the lookups that produce new objects (get_placement_tier,
+ * get_zone_by_id/name) are defined out of line so they can wrap their
+ * results, and clone() re-wraps the next layer's clone. */
+class FilterZoneGroup : public ZoneGroup {
+protected:
+  std::unique_ptr<ZoneGroup> next;
+
+public:
+  FilterZoneGroup(std::unique_ptr<ZoneGroup> _next) : next(std::move(_next)) {}
+  virtual ~FilterZoneGroup() = default;
+  virtual const std::string& get_id() const override
+    { return next->get_id(); }
+  virtual const std::string& get_name() const override
+    { return next->get_name(); }
+  virtual int equals(const std::string& other_zonegroup) const override
+    { return next->equals(other_zonegroup); }
+  virtual const std::string& get_endpoint() const override
+    { return next->get_endpoint(); }
+  virtual bool placement_target_exists(std::string& target) const override
+    { return next->placement_target_exists(target); }
+  virtual bool is_master_zonegroup() const override
+    { return next->is_master_zonegroup(); }
+  virtual const std::string& get_api_name() const override
+    { return next->get_api_name(); }
+  virtual void get_placement_target_names(std::set<std::string>& names) const override
+    { next->get_placement_target_names(names); }
+  virtual const std::string& get_default_placement_name() const override
+    { return next->get_default_placement_name(); }
+  virtual int get_hostnames(std::list<std::string>& names) const override
+    { return next->get_hostnames(names); }
+  virtual int get_s3website_hostnames(std::list<std::string>& names) const override
+    { return next->get_s3website_hostnames(names); }
+  virtual int get_zone_count() const override
+    { return next->get_zone_count(); }
+  virtual int get_placement_tier(const rgw_placement_rule& rule, std::unique_ptr<PlacementTier>* tier) override;
+  virtual int get_zone_by_id(const std::string& id, std::unique_ptr<Zone>* zone) override;
+  virtual int get_zone_by_name(const std::string& name, std::unique_ptr<Zone>* zone) override;
+  virtual int list_zones(std::list<std::string>& zone_ids) override
+    { return next->list_zones(zone_ids); }
+  bool supports(std::string_view feature) const override {
+    return next->supports(feature);
+  }
+  virtual std::unique_ptr<ZoneGroup> clone() override {
+    std::unique_ptr<ZoneGroup> nzg = next->clone();
+    return std::make_unique<FilterZoneGroup>(std::move(nzg));
+  }
+};
+
+/* Pass-through wrapper for a Zone.  The constructor eagerly clones the
+ * next layer's zonegroup and wraps it in a FilterZoneGroup, so
+ * get_zonegroup() can hand out a stable reference owned by this object.
+ * Everything else delegates to the wrapped zone. */
+class FilterZone : public Zone {
+protected:
+  std::unique_ptr<Zone> next;
+private:
+  std::unique_ptr<ZoneGroup> group;
+
+public:
+  FilterZone(std::unique_ptr<Zone> _next) : next(std::move(_next))
+  {
+    group = std::make_unique<FilterZoneGroup>(next->get_zonegroup().clone());
+  }
+  virtual ~FilterZone() = default;
+
+  virtual std::unique_ptr<Zone> clone() override {
+    std::unique_ptr<Zone> nz = next->clone();
+    return std::make_unique<FilterZone>(std::move(nz));
+  }
+  virtual ZoneGroup& get_zonegroup() override {
+    return *group.get();
+  }
+  virtual const std::string& get_id() override {
+    return next->get_id();
+  }
+  virtual const std::string& get_name() const override {
+    return next->get_name();
+  }
+  virtual bool is_writeable() override {
+    return next->is_writeable();
+  }
+  virtual bool get_redirect_endpoint(std::string* endpoint) override {
+    return next->get_redirect_endpoint(endpoint);
+  }
+  virtual bool has_zonegroup_api(const std::string& api) const override {
+    return next->has_zonegroup_api(api);
+  }
+  virtual const std::string& get_current_period_id() override {
+    return next->get_current_period_id();
+  }
+  virtual const RGWAccessKey& get_system_key() override {
+    return next->get_system_key();
+  }
+  virtual const std::string& get_realm_name() override {
+    return next->get_realm_name();
+  }
+  virtual const std::string& get_realm_id() override {
+    return next->get_realm_id();
+  }
+  virtual const std::string_view get_tier_type() override {
+    return next->get_tier_type();
+  }
+  virtual RGWBucketSyncPolicyHandlerRef get_sync_policy_handler() override {
+    return next->get_sync_policy_handler();
+  }
+};
+
+/* Base pass-through SAL driver ("zipper" filter).  Holds a non-owning
+ * pointer to the next Driver in the stack and forwards every operation to
+ * it; filter implementations derive from this and override what they need.
+ * NOTE(review): 'zone' is not set by the constructor -- presumably it is
+ * populated in initialize(); get_zone() would return nullptr before that
+ * (confirm against the .cc). */
+class FilterDriver : public Driver {
+protected:
+  Driver* next;        // next driver in the stack; not owned by this filter
+private:
+  std::unique_ptr<FilterZone> zone;
+
+public:
+  FilterDriver(Driver* _next) : next(_next) {}
+  virtual ~FilterDriver() = default;
+
+  virtual int initialize(CephContext *cct, const DoutPrefixProvider *dpp) override;
+  virtual const std::string get_name() const override;
+  virtual std::string get_cluster_id(const DoutPrefixProvider* dpp,
+                                     optional_yield y) override;
+  /* User lookup/factory entry points */
+  virtual std::unique_ptr<User> get_user(const rgw_user& u) override;
+  virtual int get_user_by_access_key(const DoutPrefixProvider* dpp,
+                                     const std::string& key, optional_yield y,
+                                     std::unique_ptr<User>* user) override;
+  virtual int get_user_by_email(const DoutPrefixProvider* dpp,
+                                const std::string& email, optional_yield y,
+                                std::unique_ptr<User>* user) override;
+  virtual int get_user_by_swift(const DoutPrefixProvider* dpp,
+                                const std::string& user_str, optional_yield y,
+                                std::unique_ptr<User>* user) override;
+  virtual std::unique_ptr<Object> get_object(const rgw_obj_key& k) override;
+  /* Bucket lookup by info, by rgw_bucket, and by tenant/name */
+  virtual int get_bucket(User* u, const RGWBucketInfo& i,
+                         std::unique_ptr<Bucket>* bucket) override;
+  virtual int get_bucket(const DoutPrefixProvider* dpp, User* u,
+                         const rgw_bucket& b, std::unique_ptr<Bucket>* bucket,
+                         optional_yield y) override;
+  virtual int get_bucket(const DoutPrefixProvider* dpp, User* u,
+                         const std::string& tenant, const std::string& name,
+                         std::unique_ptr<Bucket>* bucket, optional_yield y) override;
+  virtual bool is_meta_master() override;
+  virtual int forward_request_to_master(const DoutPrefixProvider *dpp, User* user,
+                                        obj_version* objv, bufferlist& in_data,
+                                        JSONParser* jp, req_info& info,
+                                        optional_yield y) override;
+  virtual int forward_iam_request_to_master(const DoutPrefixProvider *dpp,
+                                            const RGWAccessKey& key,
+                                            obj_version* objv,
+                                            bufferlist& in_data,
+                                            RGWXMLDecoder::XMLParser* parser,
+                                            req_info& info,
+                                            optional_yield y) override;
+  virtual Zone* get_zone() override { return zone.get(); }
+  virtual std::string zone_unique_id(uint64_t unique_num) override;
+  virtual std::string zone_unique_trans_id(const uint64_t unique_num) override;
+  virtual int get_zonegroup(const std::string& id, std::unique_ptr<ZoneGroup>* zonegroup) override;
+  virtual int list_all_zones(const DoutPrefixProvider* dpp, std::list<std::string>& zone_ids) override {
+    return next->list_all_zones(dpp, zone_ids);
+  }
+  virtual int cluster_stat(RGWClusterStat& stats) override;
+  virtual std::unique_ptr<Lifecycle> get_lifecycle(void) override;
+  virtual std::unique_ptr<Completions> get_completions(void) override;
+
+  /* Notification factories: request-context and non-request variants */
+  virtual std::unique_ptr<Notification> get_notification(rgw::sal::Object* obj,
+      rgw::sal::Object* src_obj, struct req_state* s,
+      rgw::notify::EventType event_type, optional_yield y,
+      const std::string* object_name=nullptr) override;
+  virtual std::unique_ptr<Notification> get_notification(
+      const DoutPrefixProvider* dpp, rgw::sal::Object* obj, rgw::sal::Object* src_obj,
+      rgw::notify::EventType event_type, rgw::sal::Bucket* _bucket,
+      std::string& _user_id, std::string& _user_tenant,
+      std::string& _req_id, optional_yield y) override;
+
+  /* Pub/sub topic storage is forwarded inline */
+  int read_topics(const std::string& tenant, rgw_pubsub_topics& topics, RGWObjVersionTracker* objv_tracker,
+                  optional_yield y, const DoutPrefixProvider *dpp) override {
+    return next->read_topics(tenant, topics, objv_tracker, y, dpp);
+  }
+  int write_topics(const std::string& tenant, const rgw_pubsub_topics& topics, RGWObjVersionTracker* objv_tracker,
+                   optional_yield y, const DoutPrefixProvider *dpp) override {
+    return next->write_topics(tenant, topics, objv_tracker, y, dpp);
+  }
+  int remove_topics(const std::string& tenant, RGWObjVersionTracker* objv_tracker,
+                    optional_yield y, const DoutPrefixProvider *dpp) override {
+    return next->remove_topics(tenant, objv_tracker, y, dpp);
+  }
+
+  virtual RGWLC* get_rgwlc(void) override;
+  virtual RGWCoroutinesManagerRegistry* get_cr_registry() override;
+
+  /* Usage logging / service registration / quota & rate limits */
+  virtual int log_usage(const DoutPrefixProvider *dpp, std::map<rgw_user_bucket,
+                        RGWUsageBatch>& usage_info) override;
+  virtual int log_op(const DoutPrefixProvider *dpp, std::string& oid,
+                     bufferlist& bl) override;
+  virtual int register_to_service_map(const DoutPrefixProvider *dpp,
+                                      const std::string& daemon_type,
+                                      const std::map<std::string,
+                                      std::string>& meta) override;
+  virtual void get_quota(RGWQuota& quota) override;
+  virtual void get_ratelimit(RGWRateLimitInfo& bucket_ratelimit,
+                             RGWRateLimitInfo& user_ratelimit,
+                             RGWRateLimitInfo& anon_ratelimit) override;
+  virtual int set_buckets_enabled(const DoutPrefixProvider* dpp,
+                                  std::vector<rgw_bucket>& buckets,
+                                  bool enabled) override;
+  virtual uint64_t get_new_req_id() override;
+  /* Multisite sync plumbing */
+  virtual int get_sync_policy_handler(const DoutPrefixProvider* dpp,
+                                      std::optional<rgw_zone_id> zone,
+                                      std::optional<rgw_bucket> bucket,
+                                      RGWBucketSyncPolicyHandlerRef* phandler,
+                                      optional_yield y) override;
+  virtual RGWDataSyncStatusManager* get_data_sync_manager(const rgw_zone_id& source_zone) override;
+  virtual void wakeup_meta_sync_shards(std::set<int>& shard_ids) override;
+  virtual void wakeup_data_sync_shards(const DoutPrefixProvider *dpp,
+                                       const rgw_zone_id& source_zone,
+                                       boost::container::flat_map<int, boost::container::flat_set<rgw_data_notify_entry>>& shard_ids) override;
+  virtual int clear_usage(const DoutPrefixProvider *dpp) override;
+  virtual int read_all_usage(const DoutPrefixProvider *dpp,
+                             uint64_t start_epoch, uint64_t end_epoch,
+                             uint32_t max_entries, bool* is_truncated,
+                             RGWUsageIter& usage_iter,
+                             std::map<rgw_user_bucket, rgw_usage_log_entry>& usage) override;
+  virtual int trim_all_usage(const DoutPrefixProvider *dpp,
+                             uint64_t start_epoch, uint64_t end_epoch) override;
+  virtual int get_config_key_val(std::string name, bufferlist* bl) override;
+  /* Metadata key iteration (radosgw-admin metadata list) */
+  virtual int meta_list_keys_init(const DoutPrefixProvider *dpp,
+                                  const std::string& section,
+                                  const std::string& marker,
+                                  void** phandle) override;
+  virtual int meta_list_keys_next(const DoutPrefixProvider *dpp, void* handle,
+                                  int max, std::list<std::string>& keys,
+                                  bool* truncated) override;
+  virtual void meta_list_keys_complete(void* handle) override;
+  virtual std::string meta_get_marker(void* handle) override;
+  virtual int meta_remove(const DoutPrefixProvider* dpp,
+                          std::string& metadata_key, optional_yield y) override;
+  virtual const RGWSyncModuleInstanceRef& get_sync_module() override;
+  virtual std::string get_host_id() override { return next->get_host_id(); }
+  virtual std::unique_ptr<LuaManager> get_lua_manager() override;
+  /* IAM role / OIDC provider factories */
+  virtual std::unique_ptr<RGWRole> get_role(std::string name,
+                                            std::string tenant,
+                                            std::string path="",
+                                            std::string trust_policy="",
+                                            std::string max_session_duration_str="",
+                                            std::multimap<std::string,std::string> tags={}) override;
+  virtual std::unique_ptr<RGWRole> get_role(std::string id) override;
+  virtual std::unique_ptr<RGWRole> get_role(const RGWRoleInfo& info) override;
+  virtual int get_roles(const DoutPrefixProvider *dpp,
+                        optional_yield y,
+                        const std::string& path_prefix,
+                        const std::string& tenant,
+                        std::vector<std::unique_ptr<RGWRole>>& roles) override;
+  virtual std::unique_ptr<RGWOIDCProvider> get_oidc_provider() override;
+  virtual int get_oidc_providers(const DoutPrefixProvider *dpp,
+                                 const std::string& tenant,
+                                 std::vector<std::unique_ptr<RGWOIDCProvider>>& providers) override;
+  /* Writer factories for append and atomic uploads */
+  virtual std::unique_ptr<Writer> get_append_writer(const DoutPrefixProvider *dpp,
+                                                    optional_yield y,
+                                                    rgw::sal::Object* obj,
+                                                    const rgw_user& owner,
+                                                    const rgw_placement_rule *ptail_placement_rule,
+                                                    const std::string& unique_tag,
+                                                    uint64_t position,
+                                                    uint64_t *cur_accounted_size) override;
+  virtual std::unique_ptr<Writer> get_atomic_writer(const DoutPrefixProvider *dpp,
+                                                    optional_yield y,
+                                                    rgw::sal::Object* obj,
+                                                    const rgw_user& owner,
+                                                    const rgw_placement_rule *ptail_placement_rule,
+                                                    uint64_t olh_epoch,
+                                                    const std::string& unique_tag) override;
+
+  virtual const std::string& get_compression_type(const rgw_placement_rule& rule) override;
+  virtual bool valid_placement(const rgw_placement_rule& rule) override;
+
+  virtual void finalize(void) override;
+
+  virtual CephContext* ctx(void) override;
+
+  virtual void register_admin_apis(RGWRESTMgr* mgr) override {
+    return next->register_admin_apis(mgr);
+  }
+};
+
+/* Pass-through wrapper for a User.  Owns the wrapped User; the copy
+ * constructor deep-copies by cloning the wrapped user, and clone() builds
+ * a new FilterUser from that copy.  Simple accessors delegate inline;
+ * operations that touch storage are defined out of line in the .cc. */
+class FilterUser : public User {
+protected:
+  std::unique_ptr<User> next;
+
+public:
+  FilterUser(std::unique_ptr<User> _next) : next(std::move(_next)) {}
+  FilterUser(FilterUser& u) : next(u.next->clone()) {};
+  virtual ~FilterUser() = default;
+
+  virtual std::unique_ptr<User> clone() override {
+    return std::make_unique<FilterUser>(*this);
+  }
+  virtual int list_buckets(const DoutPrefixProvider* dpp,
+                           const std::string& marker, const std::string& end_marker,
+                           uint64_t max, bool need_stats, BucketList& buckets,
+                           optional_yield y) override;
+  virtual int create_bucket(const DoutPrefixProvider* dpp,
+                            const rgw_bucket& b,
+                            const std::string& zonegroup_id,
+                            rgw_placement_rule& placement_rule,
+                            std::string& swift_ver_location,
+                            const RGWQuotaInfo* pquota_info,
+                            const RGWAccessControlPolicy& policy,
+                            Attrs& attrs,
+                            RGWBucketInfo& info,
+                            obj_version& ep_objv,
+                            bool exclusive,
+                            bool obj_lock_enabled,
+                            bool* existed,
+                            req_info& req_info,
+                            std::unique_ptr<Bucket>* bucket,
+                            optional_yield y) override;
+
+  virtual std::string& get_display_name() override { return next->get_display_name(); }
+  virtual const std::string& get_tenant() override { return next->get_tenant(); }
+  virtual void set_tenant(std::string& _t) override { next->set_tenant(_t); }
+  virtual const std::string& get_ns() override { return next->get_ns(); }
+  virtual void set_ns(std::string& _ns) override { next->set_ns(_ns); }
+  virtual void clear_ns() override { next->clear_ns(); }
+  virtual const rgw_user& get_id() const override { return next->get_id(); }
+  virtual uint32_t get_type() const override { return next->get_type(); }
+  virtual int32_t get_max_buckets() const override { return next->get_max_buckets(); }
+  virtual const RGWUserCaps& get_caps() const override { return next->get_caps(); }
+  virtual RGWObjVersionTracker& get_version_tracker() override {
+    return next->get_version_tracker();
+  }
+  virtual Attrs& get_attrs() override { return next->get_attrs(); }
+  virtual void set_attrs(Attrs& _attrs) override { next->set_attrs(_attrs); }
+  virtual bool empty() const override { return next->empty(); }
+  virtual int read_attrs(const DoutPrefixProvider* dpp, optional_yield y) override;
+  virtual int merge_and_store_attrs(const DoutPrefixProvider* dpp,
+                                    Attrs& new_attrs, optional_yield y) override;
+  virtual int read_stats(const DoutPrefixProvider *dpp,
+                         optional_yield y, RGWStorageStats* stats,
+                         ceph::real_time* last_stats_sync = nullptr,
+                         ceph::real_time* last_stats_update = nullptr) override;
+  virtual int read_stats_async(const DoutPrefixProvider *dpp,
+                               RGWGetUserStats_CB* cb) override;
+  virtual int complete_flush_stats(const DoutPrefixProvider *dpp, optional_yield y) override;
+  virtual int read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch,
+                         uint64_t end_epoch, uint32_t max_entries,
+                         bool* is_truncated, RGWUsageIter& usage_iter,
+                         std::map<rgw_user_bucket, rgw_usage_log_entry>& usage) override;
+  virtual int trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch,
+                         uint64_t end_epoch) override;
+
+  virtual int load_user(const DoutPrefixProvider* dpp, optional_yield y) override;
+  virtual int store_user(const DoutPrefixProvider* dpp, optional_yield y,
+                         bool exclusive, RGWUserInfo* old_info = nullptr) override;
+  virtual int remove_user(const DoutPrefixProvider* dpp, optional_yield y) override;
+  virtual int verify_mfa(const std::string& mfa_str, bool* verified,
+                         const DoutPrefixProvider* dpp, optional_yield y) override;
+
+  RGWUserInfo& get_info() override { return next->get_info(); }
+  virtual void print(std::ostream& out) const override { return next->print(out); }
+
+  /* Internal to Filters */
+  User* get_next() { return next.get(); }
+};
+
+class FilterBucket : public Bucket {
+protected:
+ std::unique_ptr<Bucket> next;
+private:
+ User* user;
+
+public:
+
+ FilterBucket(std::unique_ptr<Bucket> _next, User* _user) :
+ next(std::move(_next)), user(_user) {}
+ virtual ~FilterBucket() = default;
+
+ virtual std::unique_ptr<Object> get_object(const rgw_obj_key& key) override;
+ virtual int list(const DoutPrefixProvider* dpp, ListParams&, int,
+ ListResults&, optional_yield y) override;
+ virtual Attrs& get_attrs(void) override { return next->get_attrs(); }
+ virtual int set_attrs(Attrs a) override { return next->set_attrs(a); }
+ virtual int remove_bucket(const DoutPrefixProvider* dpp, bool delete_children,
+ bool forward_to_master, req_info* req_info,
+ optional_yield y) override;
+ virtual int remove_bucket_bypass_gc(int concurrent_max, bool
+ keep_index_consistent,
+ optional_yield y, const
+ DoutPrefixProvider *dpp) override;
  /* ACL access: the read side forwards to the wrapped bucket; the write
   * side is defined out of line so filters can intercept it. */
  virtual RGWAccessControlPolicy& get_acl(void) override { return next->get_acl(); }
  virtual int set_acl(const DoutPrefixProvider* dpp, RGWAccessControlPolicy& acl,
                      optional_yield y) override;

  virtual void set_owner(rgw::sal::User* _owner) override { next->set_owner(_owner); }
  virtual int load_bucket(const DoutPrefixProvider* dpp, optional_yield y,
                          bool get_stats = false) override;
  /* Stats / index maintenance entry points (all forwarded out of line). */
  virtual int read_stats(const DoutPrefixProvider *dpp,
                         const bucket_index_layout_generation& idx_layout,
                         int shard_id, std::string* bucket_ver, std::string* master_ver,
                         std::map<RGWObjCategory, RGWStorageStats>& stats,
                         std::string* max_marker = nullptr,
                         bool* syncstopped = nullptr) override;
  virtual int read_stats_async(const DoutPrefixProvider *dpp,
                               const bucket_index_layout_generation& idx_layout,
                               int shard_id, RGWGetBucketStats_CB* ctx) override;
  virtual int sync_user_stats(const DoutPrefixProvider *dpp, optional_yield y) override;
  virtual int update_container_stats(const DoutPrefixProvider* dpp) override;
  virtual int check_bucket_shards(const DoutPrefixProvider* dpp) override;
  virtual int chown(const DoutPrefixProvider* dpp, User& new_user,
                    optional_yield y) override;
  virtual int put_info(const DoutPrefixProvider* dpp, bool exclusive,
                       ceph::real_time mtime) override;
  virtual bool is_owner(User* user) override;
  /* NOTE(review): get_owner() returns the filter's own `user` member, not
   * next->get_owner() — the filter caches the owning user; confirm the two
   * stay in sync. */
  virtual User* get_owner(void) override { return user; }
  virtual ACLOwner get_acl_owner(void) override { return next->get_acl_owner(); }
  virtual int check_empty(const DoutPrefixProvider* dpp, optional_yield y) override;
  virtual int check_quota(const DoutPrefixProvider *dpp, RGWQuota& quota,
                          uint64_t obj_size, optional_yield y,
                          bool check_size_only = false) override;
  virtual int merge_and_store_attrs(const DoutPrefixProvider* dpp,
                                    Attrs& new_attrs, optional_yield y) override;
  virtual int try_refresh_info(const DoutPrefixProvider* dpp,
                               ceph::real_time* pmtime) override;
  /* Usage-log accounting for this bucket. */
  virtual int read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch,
                         uint64_t end_epoch, uint32_t max_entries,
                         bool* is_truncated, RGWUsageIter& usage_iter,
                         std::map<rgw_user_bucket,
                         rgw_usage_log_entry>& usage) override;
  virtual int trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch,
                         uint64_t end_epoch) override;
  virtual int remove_objs_from_index(const DoutPrefixProvider *dpp,
                                     std::list<rgw_obj_index_key>&
                                     objs_to_unlink) override;
  virtual int check_index(const DoutPrefixProvider *dpp,
                          std::map<RGWObjCategory, RGWStorageStats>&
                          existing_stats,
                          std::map<RGWObjCategory, RGWStorageStats>&
                          calculated_stats) override;
  virtual int rebuild_index(const DoutPrefixProvider *dpp) override;
  virtual int set_tag_timeout(const DoutPrefixProvider *dpp, uint64_t timeout) override;
  virtual int purge_instance(const DoutPrefixProvider* dpp) override;
  /* Simple accessors: pure pass-throughs to the wrapped bucket. */
  virtual bool empty() const override { return next->empty(); }
  virtual const std::string& get_name() const override { return next->get_name(); }
  virtual const std::string& get_tenant() const override { return next->get_tenant(); }
  virtual const std::string& get_marker() const override { return next->get_marker(); }
  virtual const std::string& get_bucket_id() const override { return next->get_bucket_id(); }
  virtual size_t get_size() const override { return next->get_size(); }
  virtual size_t get_size_rounded() const override { return next->get_size_rounded(); }
  virtual uint64_t get_count() const override { return next->get_count(); }
  virtual rgw_placement_rule& get_placement_rule() override { return next->get_placement_rule(); }
  virtual ceph::real_time& get_creation_time() override { return next->get_creation_time(); }
  virtual ceph::real_time& get_modification_time() override { return next->get_modification_time(); }
  virtual obj_version& get_version() override { return next->get_version(); }
  virtual void set_version(obj_version &ver) override { next->set_version(ver); }
  virtual bool versioned() override { return next->versioned(); }
  virtual bool versioning_enabled() override { return next->versioning_enabled(); }

  /* Clone the wrapped bucket and re-wrap it so the copy is also filtered. */
  virtual std::unique_ptr<Bucket> clone() override {
    std::unique_ptr<Bucket> nb = next->clone();
    return std::make_unique<FilterBucket>(std::move(nb), user);
  }

  /* Multipart-upload plumbing (forwarded out of line). */
  virtual std::unique_ptr<MultipartUpload> get_multipart_upload(
				  const std::string& oid,
				  std::optional<std::string> upload_id=std::nullopt,
				  ACLOwner owner={}, ceph::real_time mtime=real_clock::now()) override;
  virtual int list_multiparts(const DoutPrefixProvider *dpp,
			      const std::string& prefix,
			      std::string& marker,
			      const std::string& delim,
			      const int& max_uploads,
			      std::vector<std::unique_ptr<MultipartUpload>>& uploads,
			      std::map<std::string, bool> *common_prefixes,
			      bool *is_truncated) override;
  virtual int abort_multiparts(const DoutPrefixProvider* dpp,
			       CephContext* cct) override;

  /* Bucket-notification topic persistence: trivial pass-throughs. */
  int read_topics(rgw_pubsub_bucket_topics& notifications, RGWObjVersionTracker* objv_tracker,
      optional_yield y, const DoutPrefixProvider *dpp) override {
    return next->read_topics(notifications, objv_tracker, y, dpp);
  }
  int write_topics(const rgw_pubsub_bucket_topics& notifications, RGWObjVersionTracker* obj_tracker,
      optional_yield y, const DoutPrefixProvider *dpp) override {
    return next->write_topics(notifications, obj_tracker, y, dpp);
  }
  int remove_topics(RGWObjVersionTracker* objv_tracker,
      optional_yield y, const DoutPrefixProvider *dpp) override {
    return next->remove_topics(objv_tracker, y, dpp);
  }

  virtual rgw_bucket& get_key() override { return next->get_key(); }
  virtual RGWBucketInfo& get_info() override { return next->get_info(); }

  virtual void print(std::ostream& out) const override { return next->print(out); }

  virtual bool operator==(const Bucket& b) const override { return next->operator==(b); }
  virtual bool operator!=(const Bucket& b) const override { return next->operator!=(b); }

  friend class BucketList;

  /* Internal to Filters: expose the wrapped (next-layer) bucket. */
  Bucket* get_next() { return next.get(); }
};
+
/* Pass-through Object wrapper for the SAL filter layer: every operation
 * forwards to the wrapped `next` object unless a derived filter overrides
 * it.  Holds a non-owning pointer to the (filtered) Bucket it belongs to. */
class FilterObject : public Object {
protected:
  std::unique_ptr<Object> next;  // the wrapped next-layer object (owned)
private:
  Bucket* bucket{nullptr};       // non-owning back-pointer to containing bucket

public:

  /* Read-op wrapper: forwards prepare/read/iterate/get_attr to `next`. */
  struct FilterReadOp : ReadOp {
    std::unique_ptr<ReadOp> next;

    FilterReadOp(std::unique_ptr<ReadOp> _next) : next(std::move(_next)) {}
    virtual ~FilterReadOp() = default;

    virtual int prepare(optional_yield y, const DoutPrefixProvider* dpp) override;
    virtual int read(int64_t ofs, int64_t end, bufferlist& bl, optional_yield y,
		     const DoutPrefixProvider* dpp) override;
    virtual int iterate(const DoutPrefixProvider* dpp, int64_t ofs, int64_t end,
			RGWGetDataCB* cb, optional_yield y) override;
    virtual int get_attr(const DoutPrefixProvider* dpp, const char* name,
			 bufferlist& dest, optional_yield y) override;
  };

  /* Delete-op wrapper: forwards delete_obj to `next`. */
  struct FilterDeleteOp : DeleteOp {
    std::unique_ptr<DeleteOp> next;

    FilterDeleteOp(std::unique_ptr<DeleteOp> _next) : next(std::move(_next)) {}
    virtual ~FilterDeleteOp() = default;

    virtual int delete_obj(const DoutPrefixProvider* dpp, optional_yield y) override;
  };

  FilterObject(std::unique_ptr<Object> _next) : next(std::move(_next)) {}
  FilterObject(std::unique_ptr<Object> _next, Bucket* _bucket) :
			next(std::move(_next)), bucket(_bucket) {}
  /* Copy constructor deep-clones the wrapped object; takes a non-const
   * reference because Object::clone() is non-const. */
  FilterObject(FilterObject& _o) {
    next = _o.next->clone();
    bucket = _o.bucket;
  }
  virtual ~FilterObject() = default;

  virtual int delete_object(const DoutPrefixProvider* dpp,
			    optional_yield y,
			    bool prevent_versioning = false) override;
  virtual int delete_obj_aio(const DoutPrefixProvider* dpp, RGWObjState* astate,
			     Completions* aio,
			     bool keep_index_consistent, optional_yield y) override;
  virtual int copy_object(User* user,
			  req_info* info, const rgw_zone_id& source_zone,
			  rgw::sal::Object* dest_object, rgw::sal::Bucket* dest_bucket,
			  rgw::sal::Bucket* src_bucket,
			  const rgw_placement_rule& dest_placement,
			  ceph::real_time* src_mtime, ceph::real_time* mtime,
			  const ceph::real_time* mod_ptr, const ceph::real_time* unmod_ptr,
			  bool high_precision_time,
			  const char* if_match, const char* if_nomatch,
			  AttrsMod attrs_mod, bool copy_if_newer, Attrs& attrs,
			  RGWObjCategory category, uint64_t olh_epoch,
			  boost::optional<ceph::real_time> delete_at,
			  std::string* version_id, std::string* tag, std::string* etag,
			  void (*progress_cb)(off_t, void *), void* progress_data,
			  const DoutPrefixProvider* dpp, optional_yield y) override;
  virtual RGWAccessControlPolicy& get_acl(void) override;
  virtual int set_acl(const RGWAccessControlPolicy& acl) override { return next->set_acl(acl); }
  /* Object-state flags: straight pass-throughs. */
  virtual void set_atomic() override { return next->set_atomic(); }
  virtual bool is_atomic() override { return next->is_atomic(); }
  virtual void set_prefetch_data() override { return next->set_prefetch_data(); }
  virtual bool is_prefetch_data() override { return next->is_prefetch_data(); }
  virtual void set_compressed() override { return next->set_compressed(); }
  virtual bool is_compressed() override { return next->is_compressed(); }
  virtual void invalidate() override { return next->invalidate(); }
  virtual bool empty() const override { return next->empty(); }
  virtual const std::string &get_name() const override { return next->get_name(); }

  virtual int get_obj_state(const DoutPrefixProvider* dpp, RGWObjState **state,
			    optional_yield y, bool follow_olh = true) override;
  virtual int set_obj_attrs(const DoutPrefixProvider* dpp, Attrs* setattrs,
			    Attrs* delattrs, optional_yield y) override;
  virtual int get_obj_attrs(optional_yield y, const DoutPrefixProvider* dpp,
			    rgw_obj* target_obj = NULL) override;
  virtual int modify_obj_attrs(const char* attr_name, bufferlist& attr_val,
			       optional_yield y, const DoutPrefixProvider* dpp) override;
  virtual int delete_obj_attrs(const DoutPrefixProvider* dpp, const char* attr_name,
			       optional_yield y) override;
  virtual bool is_expired() override;
  virtual void gen_rand_obj_instance_name() override;
  virtual std::unique_ptr<MPSerializer> get_serializer(const DoutPrefixProvider *dpp,
						       const std::string& lock_name) override;
  /* Storage-class / lifecycle transitions. */
  virtual int transition(Bucket* bucket,
			 const rgw_placement_rule& placement_rule,
			 const real_time& mtime,
			 uint64_t olh_epoch,
			 const DoutPrefixProvider* dpp,
			 optional_yield y) override;
  virtual int transition_to_cloud(Bucket* bucket,
				  rgw::sal::PlacementTier* tier,
				  rgw_bucket_dir_entry& o,
				  std::set<std::string>& cloud_targets,
				  CephContext* cct,
				  bool update_object,
				  const DoutPrefixProvider* dpp,
				  optional_yield y) override;
  virtual bool placement_rules_match(rgw_placement_rule& r1, rgw_placement_rule& r2) override;
  virtual int dump_obj_layout(const DoutPrefixProvider *dpp, optional_yield y,
			      Formatter* f) override;

  /* Attribute / metadata accessors: pass-throughs. */
  virtual Attrs& get_attrs(void) override { return next->get_attrs(); };
  virtual const Attrs& get_attrs(void) const override { return next->get_attrs(); };
  virtual int set_attrs(Attrs a) override { return next->set_attrs(a); };
  virtual bool has_attrs(void) override { return next->has_attrs(); };
  virtual ceph::real_time get_mtime(void) const override { return next->get_mtime(); };
  virtual uint64_t get_obj_size(void) const override { return next->get_obj_size(); };
  /* Returns the filter's own bucket pointer, not next->get_bucket(). */
  virtual Bucket* get_bucket(void) const override { return bucket; };
  virtual void set_bucket(Bucket* b) override;
  virtual std::string get_hash_source(void) override { return next->get_hash_source(); };
  virtual void set_hash_source(std::string s) override { return next->set_hash_source(s); };
  virtual std::string get_oid(void) const override { return next->get_oid(); };
  virtual bool get_delete_marker(void) override { return next->get_delete_marker(); };
  virtual bool get_in_extra_data(void) override { return next->get_in_extra_data(); };
  virtual void set_in_extra_data(bool i) override { return next->set_in_extra_data(i); };
  /* NOTE(review): non-virtual — hides rather than overrides the base
   * method if one exists; confirm this matches Object's declaration. */
  int range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end) {
    return next->range_to_ofs(obj_size, ofs, end);
  };
  virtual void set_obj_size(uint64_t s) override { return next->set_obj_size(s); };
  virtual void set_name(const std::string& n) override { return next->set_name(n); };
  virtual void set_key(const rgw_obj_key& k) override { return next->set_key(k); };
  virtual rgw_obj get_obj(void) const override { return next->get_obj(); };
  virtual rgw_obj_key& get_key() override { return next->get_key(); }
  virtual void set_instance(const std::string &i) override { return next->set_instance(i); }
  virtual const std::string &get_instance() const override { return next->get_instance(); }
  virtual bool have_instance(void) override { return next->have_instance(); }
  virtual void clear_instance() override { return next->clear_instance(); }

  /* Swift object versioning hooks. */
  virtual int swift_versioning_restore(bool& restored,   /* out */
				       const DoutPrefixProvider* dpp) override;
  virtual int swift_versioning_copy(const DoutPrefixProvider* dpp,
				    optional_yield y) override;

  /* Factories returning the Filter* op wrappers above. */
  virtual std::unique_ptr<ReadOp> get_read_op() override;
  virtual std::unique_ptr<DeleteOp> get_delete_op() override;

  /* OMAP access on the underlying object. */
  virtual int omap_get_vals(const DoutPrefixProvider *dpp, const std::string& marker,
			    uint64_t count, std::map<std::string, bufferlist>* m,
			    bool* pmore, optional_yield y) override;
  virtual int omap_get_all(const DoutPrefixProvider *dpp,
			   std::map<std::string, bufferlist>* m,
			   optional_yield y) override;
  virtual int omap_get_vals_by_keys(const DoutPrefixProvider *dpp,
				    const std::string& oid,
				    const std::set<std::string>& keys,
				    Attrs* vals) override;
  virtual int omap_set_val_by_key(const DoutPrefixProvider *dpp,
				  const std::string& key, bufferlist& val,
				  bool must_exist, optional_yield y) override;
  virtual int chown(User& new_user, const DoutPrefixProvider* dpp,
		    optional_yield y) override;

  /* Deep copy via the cloning copy constructor above. */
  virtual std::unique_ptr<Object> clone() override {
    return std::make_unique<FilterObject>(*this);
  }

  virtual void print(std::ostream& out) const override { return next->print(out); }

  /* Internal to Filters: expose the wrapped (next-layer) object. */
  Object* get_next() { return next.get(); }
};
+
/* Pass-through MultipartPart wrapper: all accessors forward to the
 * wrapped next-layer part. */
class FilterMultipartPart : public MultipartPart {
protected:
  std::unique_ptr<MultipartPart> next;  // wrapped next-layer part (owned)

public:
  FilterMultipartPart(std::unique_ptr<MultipartPart> _next) : next(std::move(_next)) {}
  virtual ~FilterMultipartPart() = default;

  virtual uint32_t get_num() override { return next->get_num(); }
  virtual uint64_t get_size() override { return next->get_size(); }
  virtual const std::string& get_etag() override { return next->get_etag(); }
  virtual ceph::real_time& get_mtime() override { return next->get_mtime(); }
};
+
/* Pass-through MultipartUpload wrapper.  Keeps its own `parts` map so that
 * list_parts() can wrap each part; holds a non-owning pointer to the
 * (filtered) bucket the upload belongs to. */
class FilterMultipartUpload : public MultipartUpload {
protected:
  std::unique_ptr<MultipartUpload> next;  // wrapped next-layer upload (owned)
  Bucket* bucket;                         // non-owning containing bucket
  std::map<uint32_t, std::unique_ptr<MultipartPart>> parts;  // filter-local parts

public:
  FilterMultipartUpload(std::unique_ptr<MultipartUpload> _next, Bucket* _b) :
    next(std::move(_next)), bucket(_b) {}
  virtual ~FilterMultipartUpload() = default;

  virtual const std::string& get_meta() const override { return next->get_meta(); }
  virtual const std::string& get_key() const override { return next->get_key(); }
  virtual const std::string& get_upload_id() const override { return next->get_upload_id(); }
  virtual const ACLOwner& get_owner() const override { return next->get_owner(); }
  virtual ceph::real_time& get_mtime() override { return next->get_mtime(); }

  /* NOTE(review): returns the filter's local map, not next->get_parts();
   * it is presumably populated by list_parts() — verify in the .cc. */
  virtual std::map<uint32_t, std::unique_ptr<MultipartPart>>& get_parts() override { return parts; }

  virtual const jspan_context& get_trace() override { return next->get_trace(); }

  virtual std::unique_ptr<rgw::sal::Object> get_meta_obj() override;

  virtual int init(const DoutPrefixProvider* dpp, optional_yield y, ACLOwner& owner, rgw_placement_rule& dest_placement, rgw::sal::Attrs& attrs) override;
  virtual int list_parts(const DoutPrefixProvider* dpp, CephContext* cct,
			 int num_parts, int marker,
			 int* next_marker, bool* truncated,
			 bool assume_unsorted = false) override;
  virtual int abort(const DoutPrefixProvider* dpp, CephContext* cct) override;
  virtual int complete(const DoutPrefixProvider* dpp,
		       optional_yield y, CephContext* cct,
		       std::map<int, std::string>& part_etags,
		       std::list<rgw_obj_index_key>& remove_objs,
		       uint64_t& accounted_size, bool& compressed,
		       RGWCompressionInfo& cs_info, off_t& ofs,
		       std::string& tag, ACLOwner& owner,
		       uint64_t olh_epoch,
		       rgw::sal::Object* target_obj) override;

  virtual int get_info(const DoutPrefixProvider *dpp, optional_yield y,
		       rgw_placement_rule** rule,
		       rgw::sal::Attrs* attrs = nullptr) override;

  virtual std::unique_ptr<Writer> get_writer(const DoutPrefixProvider *dpp,
			  optional_yield y,
			  rgw::sal::Object* obj,
			  const rgw_user& owner,
			  const rgw_placement_rule *ptail_placement_rule,
			  uint64_t part_num,
			  const std::string& part_num_str) override;
  virtual void print(std::ostream& out) const override { return next->print(out); }
};
+
/* Pass-through multipart-lock serializer wrapper. */
class FilterMPSerializer : public MPSerializer {
protected:
  std::unique_ptr<MPSerializer> next;  // wrapped next-layer serializer (owned)

public:
  FilterMPSerializer(std::unique_ptr<MPSerializer> _next) : next(std::move(_next)) {}
  virtual ~FilterMPSerializer() = default;

  virtual int try_lock(const DoutPrefixProvider *dpp, utime_t dur, optional_yield y) override;
  virtual int unlock() override { return next->unlock(); }
  virtual void clear_locked() override { next->clear_locked(); }
  virtual bool is_locked() override { return next->is_locked(); }
  virtual void print(std::ostream& out) const override { return next->print(out); }
};
+
/* Pass-through lifecycle-lock serializer wrapper. */
class FilterLCSerializer : public LCSerializer {
protected:
  std::unique_ptr<LCSerializer> next;  // wrapped next-layer serializer (owned)

public:
  FilterLCSerializer(std::unique_ptr<LCSerializer> _next) : next(std::move(_next)) {}
  virtual ~FilterLCSerializer() = default;

  virtual int try_lock(const DoutPrefixProvider *dpp, utime_t dur, optional_yield y) override;
  virtual int unlock() override { return next->unlock(); }
  virtual void print(std::ostream& out) const override { return next->print(out); }
};
+
/* Pass-through Lifecycle wrapper: forwards LC head/entry persistence to the
 * wrapped next-layer Lifecycle, wrapping returned heads/entries in the
 * nested Filter* structs below. */
class FilterLifecycle : public Lifecycle {
protected:
  std::unique_ptr<Lifecycle> next;  // wrapped next-layer lifecycle (owned)

public:
  /* LC shard-head wrapper: all accessors forward. */
  struct FilterLCHead : LCHead {
    std::unique_ptr<LCHead> next;

    FilterLCHead(std::unique_ptr<LCHead> _next) : next(std::move(_next)) {}
    virtual ~FilterLCHead() = default;

    virtual time_t& get_start_date() override { return next->get_start_date(); }
    virtual void set_start_date(time_t t) override { next->set_start_date(t); }
    virtual std::string& get_marker() override { return next->get_marker(); }
    virtual void set_marker(const std::string& m) override { next->set_marker(m); }
    virtual time_t& get_shard_rollover_date() override { return next->get_shard_rollover_date(); }
    virtual void set_shard_rollover_date(time_t t) override { next->set_shard_rollover_date(t); }
  };

  /* LC work-list entry wrapper: all accessors forward. */
  struct FilterLCEntry : LCEntry {
    std::unique_ptr<LCEntry> next;

    FilterLCEntry(std::unique_ptr<LCEntry> _next) : next(std::move(_next)) {}
    virtual ~FilterLCEntry() = default;

    virtual std::string& get_bucket() override { return next->get_bucket(); }
    virtual void set_bucket(const std::string& b) override { next->set_bucket(b); }
    virtual std::string& get_oid() override { return next->get_oid(); }
    virtual void set_oid(const std::string& o) override { next->set_oid(o); }
    virtual uint64_t get_start_time() override { return next->get_start_time(); }
    virtual void set_start_time(uint64_t t) override { next->set_start_time(t); }
    virtual uint32_t get_status() override { return next->get_status(); }
    virtual void set_status(uint32_t s) override { next->set_status(s); }
    virtual void print(std::ostream& out) const override { return next->print(out); }
  };

  FilterLifecycle(std::unique_ptr<Lifecycle> _next) : next(std::move(_next)) {}
  virtual ~FilterLifecycle() = default;

  /* Entry/head persistence: all defined out of line, forwarding to `next`. */
  virtual std::unique_ptr<LCEntry> get_entry() override;
  virtual int get_entry(const std::string& oid, const std::string& marker,
			std::unique_ptr<LCEntry>* entry) override;
  virtual int get_next_entry(const std::string& oid, const std::string& marker,
			     std::unique_ptr<LCEntry>* entry) override;
  virtual int set_entry(const std::string& oid, LCEntry& entry) override;
  virtual int list_entries(const std::string& oid, const std::string& marker,
			   uint32_t max_entries,
			   std::vector<std::unique_ptr<LCEntry>>& entries) override;
  virtual int rm_entry(const std::string& oid, LCEntry& entry) override;
  virtual int get_head(const std::string& oid, std::unique_ptr<LCHead>* head) override;
  virtual int put_head(const std::string& oid, LCHead& head) override;
  virtual std::unique_ptr<LCSerializer> get_serializer(const std::string& lock_name,
						       const std::string& oid,
						       const std::string& cookie) override;
};
+
/* Pass-through bucket-notification wrapper: reserve/commit are defined out
 * of line, forwarding to the wrapped next-layer Notification. */
class FilterNotification : public Notification {
protected:
  std::unique_ptr<Notification> next;  // wrapped next-layer notification (owned)

public:
  FilterNotification(std::unique_ptr<Notification> _next) : next(std::move(_next)) {}

  virtual ~FilterNotification() = default;

  virtual int publish_reserve(const DoutPrefixProvider *dpp,
			      RGWObjTags* obj_tags = nullptr) override;
  virtual int publish_commit(const DoutPrefixProvider* dpp, uint64_t size,
			     const ceph::real_time& mtime,
			     const std::string& etag,
			     const std::string& version) override;
};
+
+class FilterWriter : public Writer {
+protected:
+ std::unique_ptr<Writer> next;
+ Object* obj;
+public:
+ FilterWriter(std::unique_ptr<Writer> _next, Object* _obj) :
+ next(std::move(_next)), obj(_obj) {}
+ virtual ~FilterWriter() = default;
+
+ virtual int prepare(optional_yield y) { return next->prepare(y); }
+ virtual int process(bufferlist&& data, uint64_t offset) override;
+ virtual int complete(size_t accounted_size, const std::string& etag,
+ ceph::real_time *mtime, ceph::real_time set_mtime,
+ std::map<std::string, bufferlist>& attrs,
+ ceph::real_time delete_at,
+ const char *if_match, const char *if_nomatch,
+ const std::string *user_data,
+ rgw_zone_set *zones_trace, bool *canceled,
+ optional_yield y) override;
+};
+
/* Pass-through Lua script/package manager wrapper: all operations are
 * defined out of line, forwarding to the wrapped next-layer LuaManager. */
class FilterLuaManager : public LuaManager {
protected:
  std::unique_ptr<LuaManager> next;  // wrapped next-layer manager (owned)

public:
  FilterLuaManager(std::unique_ptr<LuaManager> _next) : next(std::move(_next)) {}
  virtual ~FilterLuaManager() = default;

  virtual int get_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, std::string& script) override;
  virtual int put_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, const std::string& script) override;
  virtual int del_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key) override;
  virtual int add_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name) override;
  virtual int remove_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name) override;
  virtual int list_packages(const DoutPrefixProvider* dpp, optional_yield y, rgw::lua::packages_t& packages) override;
};
+
+} } // namespace rgw::sal
diff --git a/src/rgw/rgw_sal_fwd.h b/src/rgw/rgw_sal_fwd.h
new file mode 100644
index 000000000..08866c2be
--- /dev/null
+++ b/src/rgw/rgw_sal_fwd.h
@@ -0,0 +1,41 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+
namespace rgw { namespace sal {

  /* Forward declarations of the SAL (Store Abstraction Layer) interface
   * types, so headers can refer to them without pulling in rgw_sal.h. */
  class Driver;
  class User;
  class Bucket;
  class BucketList;
  class Object;
  class MultipartUpload;
  class Lifecycle;
  class Notification;
  class Writer;
  class PlacementTier;
  class ZoneGroup;
  class Zone;
  class LuaManager;
  struct RGWRoleInfo;

  /* Config-store side of the SAL. */
  class ConfigStore;
  class RealmWriter;
  class ZoneGroupWriter;
  class ZoneWriter;

} } // namespace rgw::sal
diff --git a/src/rgw/rgw_sal_motr.cc b/src/rgw/rgw_sal_motr.cc
new file mode 100644
index 000000000..de18ba944
--- /dev/null
+++ b/src/rgw/rgw_sal_motr.cc
@@ -0,0 +1,4024 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=2 sw=2 expandtab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * SAL implementation for the CORTX Motr backend
+ *
+ * Copyright (C) 2021 Seagate Technology LLC and/or its Affiliates
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+extern "C" {
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wextern-c-compat"
+#pragma clang diagnostic ignored "-Wdeprecated-anon-enum-enum-conversion"
+#include "motr/config.h"
+#include "lib/types.h"
+#include "lib/trace.h" // m0_trace_set_mmapped_buffer
+#include "motr/layout.h" // M0_OBJ_LAYOUT_ID
+#include "helpers/helpers.h" // m0_ufid_next
+#pragma clang diagnostic pop
+}
+
+#include "common/Clock.h"
+#include "common/errno.h"
+
+#include "rgw_compression.h"
+#include "rgw_sal.h"
+#include "rgw_sal_motr.h"
+#include "rgw_bucket.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using std::string;
+using std::map;
+using std::vector;
+using std::set;
+using std::list;
+
+static string mp_ns = RGW_OBJ_NS_MULTIPART;
+static struct m0_ufid_generator ufid_gr;
+
+namespace rgw::sal {
+
+using ::ceph::encode;
+using ::ceph::decode;
+
+static std::string motr_global_indices[] = {
+ RGW_MOTR_USERS_IDX_NAME,
+ RGW_MOTR_BUCKET_INST_IDX_NAME,
+ RGW_MOTR_BUCKET_HD_IDX_NAME,
+ RGW_IAM_MOTR_ACCESS_KEY,
+ RGW_IAM_MOTR_EMAIL_KEY
+};
+
// Invalidate (drop) the local cache entry for `name`.  Unlike remove(),
// this does not notify peer rgw instances.
void MotrMetaCache::invalid(const DoutPrefixProvider *dpp,
                            const string& name)
{
  cache.invalidate_remove(dpp, name);
}
+
+int MotrMetaCache::put(const DoutPrefixProvider *dpp,
+ const string& name,
+ const bufferlist& data)
+{
+ ldpp_dout(dpp, 0) << "Put into cache: name = " << name << dendl;
+
+ ObjectCacheInfo info;
+ info.status = 0;
+ info.data = data;
+ info.flags = CACHE_FLAG_DATA;
+ info.meta.mtime = ceph::real_clock::now();
+ info.meta.size = data.length();
+ cache.put(dpp, name, info, NULL);
+
+ // Inform other rgw instances. Do nothing if it gets some error?
+ int rc = distribute_cache(dpp, name, info, UPDATE_OBJ);
+ if (rc < 0)
+ ldpp_dout(dpp, 0) << "ERROR: failed to distribute cache for " << name << dendl;
+
+ return 0;
+}
+
+int MotrMetaCache::get(const DoutPrefixProvider *dpp,
+ const string& name,
+ bufferlist& data)
+{
+ ObjectCacheInfo info;
+ uint32_t flags = CACHE_FLAG_DATA;
+ int rc = cache.get(dpp, name, info, flags, NULL);
+ if (rc == 0) {
+ if (info.status < 0)
+ return info.status;
+
+ bufferlist& bl = info.data;
+ bufferlist::iterator it = bl.begin();
+ data.clear();
+
+ it.copy_all(data);
+ ldpp_dout(dpp, 0) << "Cache hit: name = " << name << dendl;
+ return 0;
+ }
+ ldpp_dout(dpp, 0) << "Cache miss: name = " << name << ", rc = "<< rc << dendl;
+ if(rc == -ENODATA)
+ return -ENOENT;
+
+ return rc;
+}
+
+int MotrMetaCache::remove(const DoutPrefixProvider *dpp,
+ const string& name)
+
+{
+ cache.invalidate_remove(dpp, name);
+
+ ObjectCacheInfo info;
+ int rc = distribute_cache(dpp, name, info, INVALIDATE_OBJ);
+ if (rc < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " <<__func__<< "(): failed to distribute cache: rc =" << rc << dendl;
+ }
+
+ ldpp_dout(dpp, 0) << "Remove from cache: name = " << name << dendl;
+ return 0;
+}
+
// Broadcast a cache update/invalidation to peer rgw instances.
// TODO: not implemented for the Motr backend yet; stub reports success.
int MotrMetaCache::distribute_cache(const DoutPrefixProvider *dpp,
                                    const string& normal_name,
                                    ObjectCacheInfo& obj_info, int op)
{
  return 0;
}
+
// Callback for cache-invalidation notifications from peers.
// TODO: not implemented for the Motr backend yet; stub reports success.
int MotrMetaCache::watch_cb(const DoutPrefixProvider *dpp,
                            uint64_t notify_id,
                            uint64_t cookie,
                            uint64_t notifier_id,
                            bufferlist& bl)
{
  return 0;
}
+
// Enable or disable the underlying object cache.
void MotrMetaCache::set_enabled(bool status)
{
  cache.set_enabled(status);
}
+
+// TODO: properly handle the number of key/value pairs to get in
+// one query. Now the POC simply tries to retrieve all `max` number of pairs
+// with starting key `marker`.
+int MotrUser::list_buckets(const DoutPrefixProvider *dpp, const string& marker,
+ const string& end_marker, uint64_t max, bool need_stats,
+ BucketList &buckets, optional_yield y)
+{
+ int rc;
+ vector<string> keys(max);
+ vector<bufferlist> vals(max);
+ bool is_truncated = false;
+
+ ldpp_dout(dpp, 20) <<__func__<< ": list_user_buckets: marker=" << marker
+ << " end_marker=" << end_marker
+ << " max=" << max << dendl;
+
+ // Retrieve all `max` number of pairs.
+ buckets.clear();
+ string user_info_iname = "motr.rgw.user.info." + info.user_id.to_str();
+ keys[0] = marker;
+ rc = store->next_query_by_name(user_info_iname, keys, vals);
+ if (rc < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: NEXT query failed. " << rc << dendl;
+ return rc;
+ }
+
+ // Process the returned pairs to add into BucketList.
+ uint64_t bcount = 0;
+ for (const auto& bl: vals) {
+ if (bl.length() == 0)
+ break;
+
+ RGWBucketEnt ent;
+ auto iter = bl.cbegin();
+ ent.decode(iter);
+
+ std::time_t ctime = ceph::real_clock::to_time_t(ent.creation_time);
+ ldpp_dout(dpp, 20) << "got creation time: << " << std::put_time(std::localtime(&ctime), "%F %T") << dendl;
+
+ if (!end_marker.empty() &&
+ end_marker.compare(ent.bucket.marker) <= 0)
+ break;
+
+ buckets.add(std::make_unique<MotrBucket>(this->store, ent, this));
+ bcount++;
+ }
+ if (bcount == max)
+ is_truncated = true;
+ buckets.set_truncated(is_truncated);
+
+ return 0;
+}
+
// Create a bucket owned by this user: look it up first, report -EEXIST if
// it already exists, otherwise create the bucket-instance entry, the
// bucket index and the multipart indices, then link the bucket to the
// user.  On success *bucket_out receives the new bucket.
// NOTE(review): zone/multi-site handling, swift_ver_location, quota and
// ACL-policy comparison are not implemented yet (see TODOs below).
int MotrUser::create_bucket(const DoutPrefixProvider* dpp,
                            const rgw_bucket& b,
                            const std::string& zonegroup_id,
                            rgw_placement_rule& placement_rule,
                            std::string& swift_ver_location,
                            const RGWQuotaInfo* pquota_info,
                            const RGWAccessControlPolicy& policy,
                            Attrs& attrs,
                            RGWBucketInfo& info,
                            obj_version& ep_objv,
                            bool exclusive,
                            bool obj_lock_enabled,
                            bool* existed,
                            req_info& req_info,
                            std::unique_ptr<Bucket>* bucket_out,
                            optional_yield y)
{
  int ret;
  std::unique_ptr<Bucket> bucket;

  // Look up the bucket. Create it if it doesn't exist.
  ret = this->store->get_bucket(dpp, this, b, &bucket, y);
  if (ret < 0 && ret != -ENOENT)
    return ret;

  if (ret != -ENOENT) {
    *existed = true;
    // if (swift_ver_location.empty()) {
    //   swift_ver_location = bucket->get_info().swift_ver_location;
    // }
    // placement_rule.inherit_from(bucket->get_info().placement_rule);

    // TODO: ACL policy
    // // don't allow changes to the acl policy
    //RGWAccessControlPolicy old_policy(ctx());
    //int rc = rgw_op_get_bucket_policy_from_attr(
    //           dpp, this, u, bucket->get_attrs(), &old_policy, y);
    //if (rc >= 0 && old_policy != policy) {
    //  bucket_out->swap(bucket);
    //  return -EEXIST;
    //}
  } else {
    // New bucket: placement is hard-coded for the POC.
    placement_rule.name = "default";
    placement_rule.storage_class = "STANDARD";
    bucket = std::make_unique<MotrBucket>(store, b, this);
    bucket->set_attrs(attrs);
    *existed = false;
  }

  if (!*existed){
    // TODO: how to handle zone and multi-site.
    info.placement_rule = placement_rule;
    info.bucket = b;
    info.owner = this->get_info().user_id;
    info.zonegroup = zonegroup_id;
    if (obj_lock_enabled)
      info.flags = BUCKET_VERSIONED | BUCKET_OBJ_LOCK_ENABLED;
    bucket->set_version(ep_objv);
    bucket->get_info() = info;

    // Create a new bucket: (1) Add a key/value pair in the
    // bucket instance index. (2) Create a new bucket index.
    // The `?:` chain runs each step only if the previous one returned 0.
    MotrBucket* mbucket = static_cast<MotrBucket*>(bucket.get());
    ret = mbucket->put_info(dpp, y, ceph::real_time())? :
          mbucket->create_bucket_index() ? :
          mbucket->create_multipart_indices();
    if (ret < 0)
      ldpp_dout(dpp, 0) << "ERROR: failed to create bucket indices! " << ret << dendl;

    // Insert the bucket entry into the user info index.
    ret = mbucket->link_user(dpp, this, y);
    if (ret < 0)
      ldpp_dout(dpp, 0) << "ERROR: failed to add bucket entry! " << ret << dendl;
  } else {
    return -EEXIST;
    // bucket->set_version(ep_objv);
    // bucket->get_info() = info;
  }

  bucket_out->swap(bucket);

  return ret;
}
+
// Read the user's attributes.
// TODO: not implemented for the Motr backend yet; stub reports success.
int MotrUser::read_attrs(const DoutPrefixProvider* dpp, optional_yield y)
{
  return 0;
}
+
// Read per-user storage stats.
// TODO: not implemented yet; out-parameters are left untouched.
int MotrUser::read_stats(const DoutPrefixProvider *dpp,
    optional_yield y, RGWStorageStats* stats,
    ceph::real_time *last_stats_sync,
    ceph::real_time *last_stats_update)
{
  return 0;
}
+
+/* stats - Not for first pass */
// Asynchronous user stats read.
// TODO: not implemented yet; the callback is never invoked.
int MotrUser::read_stats_async(const DoutPrefixProvider *dpp, RGWGetUserStats_CB *cb)
{
  return 0;
}
+
// Flush pending user stats.
// TODO: not implemented for the Motr backend yet; stub reports success.
int MotrUser::complete_flush_stats(const DoutPrefixProvider *dpp, optional_yield y)
{
  return 0;
}
+
// Read the user's usage log between two epochs.
// TODO: not implemented yet; `usage` and the iterator are left untouched.
int MotrUser::read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
    bool *is_truncated, RGWUsageIter& usage_iter,
    map<rgw_user_bucket, rgw_usage_log_entry>& usage)
{
  return 0;
}
+
// Trim the user's usage log between two epochs.
// TODO: not implemented for the Motr backend yet; stub reports success.
int MotrUser::trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch)
{
  return 0;
}
+
+int MotrUser::load_user_from_idx(const DoutPrefixProvider *dpp,
+ MotrStore *store,
+ RGWUserInfo& info, map<string, bufferlist> *attrs,
+ RGWObjVersionTracker *objv_tr)
+{
+ struct MotrUserInfo muinfo;
+ bufferlist bl;
+ ldpp_dout(dpp, 20) << "info.user_id.id = " << info.user_id.id << dendl;
+ if (store->get_user_cache()->get(dpp, info.user_id.id, bl)) {
+ // Cache misses
+ int rc = store->do_idx_op_by_name(RGW_MOTR_USERS_IDX_NAME,
+ M0_IC_GET, info.user_id.to_str(), bl);
+ ldpp_dout(dpp, 20) << "do_idx_op_by_name() = " << rc << dendl;
+ if (rc < 0)
+ return rc;
+
+ // Put into cache.
+ store->get_user_cache()->put(dpp, info.user_id.id, bl);
+ }
+
+ bufferlist& blr = bl;
+ auto iter = blr.cbegin();
+ muinfo.decode(iter);
+ info = muinfo.info;
+ if (attrs)
+ *attrs = muinfo.attrs;
+ if (objv_tr)
+ {
+ objv_tr->read_version = muinfo.user_version;
+ objv_tracker.read_version = objv_tr->read_version;
+ }
+
+ if (!info.access_keys.empty()) {
+ for(auto key : info.access_keys) {
+ access_key_tracker.insert(key.first);
+ }
+ }
+
+ return 0;
+}
+
+int MotrUser::load_user(const DoutPrefixProvider *dpp,
+ optional_yield y)
+{
+ ldpp_dout(dpp, 20) << "load user: user id = " << info.user_id.to_str() << dendl;
+ return load_user_from_idx(dpp, store, info, &attrs, &objv_tracker);
+}
+
+int MotrUser::create_user_info_idx()
+{
+ string user_info_iname = "motr.rgw.user.info." + info.user_id.to_str();
+ return store->create_motr_idx_by_name(user_info_iname);
+}
+
+int MotrUser::merge_and_store_attrs(const DoutPrefixProvider* dpp, Attrs& new_attrs, optional_yield y)
+{
+ for (auto& it : new_attrs)
+ attrs[it.first] = it.second;
+
+ return store_user(dpp, y, false);
+}
+
+int MotrUser::store_user(const DoutPrefixProvider* dpp,
+ optional_yield y, bool exclusive, RGWUserInfo* old_info)
+{
+ bufferlist bl;
+ struct MotrUserInfo muinfo;
+ RGWUserInfo orig_info;
+ RGWObjVersionTracker objv_tr = {};
+ obj_version& obj_ver = objv_tr.read_version;
+
+ ldpp_dout(dpp, 20) << "Store_user(): User = " << info.user_id.id << dendl;
+ orig_info.user_id = info.user_id;
+ // XXX: we open and close motr idx 2 times in this method:
+ // 1) on load_user_from_idx() here and 2) on do_idx_op_by_name(PUT) below.
+ // Maybe this can be optimised later somewhow.
+ int rc = load_user_from_idx(dpp, store, orig_info, nullptr, &objv_tr);
+ ldpp_dout(dpp, 10) << "Get user: rc = " << rc << dendl;
+
+ // Check if the user already exists
+ if (rc == 0 && obj_ver.ver > 0) {
+ if (old_info)
+ *old_info = orig_info;
+
+ if (obj_ver.ver != objv_tracker.read_version.ver) {
+ rc = -ECANCELED;
+ ldpp_dout(dpp, 0) << "ERROR: User Read version mismatch" << dendl;
+ goto out;
+ }
+
+ if (exclusive)
+ return rc;
+
+ obj_ver.ver++;
+ } else {
+ obj_ver.ver = 1;
+ obj_ver.tag = "UserTAG";
+ }
+
+ // Insert the user to user info index.
+ muinfo.info = info;
+ muinfo.attrs = attrs;
+ muinfo.user_version = obj_ver;
+ muinfo.encode(bl);
+ rc = store->do_idx_op_by_name(RGW_MOTR_USERS_IDX_NAME,
+ M0_IC_PUT, info.user_id.to_str(), bl);
+ ldpp_dout(dpp, 10) << "Store user to motr index: rc = " << rc << dendl;
+ if (rc == 0) {
+ objv_tracker.read_version = obj_ver;
+ objv_tracker.write_version = obj_ver;
+ }
+
+ // Store access key in access key index
+ if (!info.access_keys.empty()) {
+ std::string access_key;
+ std::string secret_key;
+ std::map<std::string, RGWAccessKey>::const_iterator iter = info.access_keys.begin();
+ const RGWAccessKey& k = iter->second;
+ access_key = k.id;
+ secret_key = k.key;
+ MotrAccessKey MGWUserKeys(access_key, secret_key, info.user_id.to_str());
+ store->store_access_key(dpp, y, MGWUserKeys);
+ access_key_tracker.insert(access_key);
+ }
+
+ // Check if any key need to be deleted
+ if (access_key_tracker.size() != info.access_keys.size()) {
+ std::string key_for_deletion;
+ for (auto key : access_key_tracker) {
+ if (!info.get_key(key)) {
+ key_for_deletion = key;
+ ldpp_dout(dpp, 0) << "Deleting access key: " << key_for_deletion << dendl;
+ store->delete_access_key(dpp, y, key_for_deletion);
+ if (rc < 0) {
+ ldpp_dout(dpp, 0) << "Unable to delete access key" << rc << dendl;
+ }
+ }
+ }
+ if(rc >= 0){
+ access_key_tracker.erase(key_for_deletion);
+ }
+ }
+
+ if (!info.user_email.empty()) {
+ MotrEmailInfo MGWEmailInfo(info.user_id.to_str(), info.user_email);
+ store->store_email_info(dpp, y, MGWEmailInfo);
+ }
+
+ // Create user info index to store all buckets that are belong
+ // to this bucket.
+ rc = create_user_info_idx();
+ if (rc < 0 && rc != -EEXIST) {
+ ldpp_dout(dpp, 0) << "Failed to create user info index: rc = " << rc << dendl;
+ goto out;
+ }
+
+ // Put the user info into cache.
+ rc = store->get_user_cache()->put(dpp, info.user_id.id, bl);
+
+out:
+ return rc;
+}
+
+int MotrUser::remove_user(const DoutPrefixProvider* dpp, optional_yield y)
+{
+ // Remove user info from cache
+ // Delete access keys for user
+ // Delete user info
+ // Delete user from user index
+ // Delete email for user - TODO
+ bufferlist bl;
+ int rc;
+ // Remove the user info from cache.
+ store->get_user_cache()->remove(dpp, info.user_id.id);
+
+ // Delete all access key of user
+ if (!info.access_keys.empty()) {
+ for(auto acc_key = info.access_keys.begin(); acc_key != info.access_keys.end(); acc_key++) {
+ auto access_key = acc_key->first;
+ rc = store->delete_access_key(dpp, y, access_key);
+ // TODO
+ // Check error code for access_key does not exist
+ // Continue to next step only if delete failed because key doesn't exists
+ if (rc < 0){
+ ldpp_dout(dpp, 0) << "Unable to delete access key" << rc << dendl;
+ }
+ }
+ }
+
+ //Delete email id
+ if (!info.user_email.empty()) {
+ rc = store->do_idx_op_by_name(RGW_IAM_MOTR_EMAIL_KEY,
+ M0_IC_DEL, info.user_email, bl);
+ if (rc < 0 && rc != -ENOENT) {
+ ldpp_dout(dpp, 0) << "Unable to delete email id " << rc << dendl;
+ }
+ }
+
+ // Delete user info index
+ string user_info_iname = "motr.rgw.user.info." + info.user_id.to_str();
+ store->delete_motr_idx_by_name(user_info_iname);
+ ldpp_dout(dpp, 10) << "Deleted user info index - " << user_info_iname << dendl;
+
+ // Delete user from user index
+ rc = store->do_idx_op_by_name(RGW_MOTR_USERS_IDX_NAME,
+ M0_IC_DEL, info.user_id.to_str(), bl);
+ if (rc < 0){
+ ldpp_dout(dpp, 0) << "Unable to delete user from user index " << rc << dendl;
+ return rc;
+ }
+
+ // TODO
+ // Delete email for user
+ // rc = store->do_idx_op_by_name(RGW_IAM_MOTR_EMAIL_KEY,
+ // M0_IC_DEL, info.user_email, bl);
+ // if (rc < 0){
+ // ldpp_dout(dpp, 0) << "Unable to delete email for user" << rc << dendl;
+ // return rc;
+ // }
+ return 0;
+}
+
// Stub: MFA is not supported by this backend; always reports "not verified"
// with a success return code.
int MotrUser::verify_mfa(const std::string& mfa_str, bool* verified, const DoutPrefixProvider *dpp, optional_yield y)
{
  *verified = false;
  return 0;
}
+
+int MotrBucket::remove_bucket(const DoutPrefixProvider *dpp, bool delete_children, bool forward_to_master, req_info* req_info, optional_yield y)
+{
+ int ret;
+
+ ldpp_dout(dpp, 20) << "remove_bucket Entry=" << info.bucket.name << dendl;
+
+ // Refresh info
+ ret = load_bucket(dpp, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: remove_bucket load_bucket failed rc=" << ret << dendl;
+ return ret;
+ }
+
+ ListParams params;
+ params.list_versions = true;
+ params.allow_unordered = true;
+
+ ListResults results;
+
+ // 1. Check if Bucket has objects.
+ // If bucket contains objects and delete_children is true, delete all objects.
+ // Else throw error that bucket is not empty.
+ do {
+ results.objs.clear();
+
+ // Check if bucket has objects.
+ ret = list(dpp, params, 1000, results, y);
+ if (ret < 0) {
+ return ret;
+ }
+
+ // If result contains entries, bucket is not empty.
+ if (!results.objs.empty() && !delete_children) {
+ ldpp_dout(dpp, 0) << "ERROR: could not remove non-empty bucket " << info.bucket.name << dendl;
+ return -ENOTEMPTY;
+ }
+
+ for (const auto& obj : results.objs) {
+ rgw_obj_key key(obj.key);
+ /* xxx dang */
+ ret = rgw_remove_object(dpp, store, this, key);
+ if (ret < 0 && ret != -ENOENT) {
+ ldpp_dout(dpp, 0) << "ERROR: remove_bucket rgw_remove_object failed rc=" << ret << dendl;
+ return ret;
+ }
+ }
+ } while(results.is_truncated);
+
+ // 2. Abort Mp uploads on the bucket.
+ ret = abort_multiparts(dpp, store->ctx());
+ if (ret < 0) {
+ return ret;
+ }
+
+ // 3. Remove mp index??
+ string bucket_multipart_iname = "motr.rgw.bucket." + info.bucket.name + ".multiparts";
+ ret = store->delete_motr_idx_by_name(bucket_multipart_iname);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: remove_bucket failed to remove multipart index rc=" << ret << dendl;
+ return ret;
+ }
+
+ // 4. Sync user stats.
+ ret = this->sync_user_stats(dpp, y);
+ if (ret < 0) {
+ ldout(store->ctx(), 1) << "WARNING: failed sync user stats before bucket delete. ret=" << ret << dendl;
+ }
+
+ // 5. Remove the bucket from user info index. (unlink user)
+ ret = this->unlink_user(dpp, owner, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: remove_bucket unlink_user failed rc=" << ret << dendl;
+ return ret;
+ }
+
+ // 6. Remove bucket index.
+ string bucket_index_iname = "motr.rgw.bucket.index." + info.bucket.name;
+ ret = store->delete_motr_idx_by_name(bucket_index_iname);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: remove_bucket unlink_user failed rc=" << ret << dendl;
+ return ret;
+ }
+
+ // 7. Remove bucket instance info.
+ bufferlist bl;
+ ret = store->get_bucket_inst_cache()->remove(dpp, info.bucket.name);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: remove_bucket failed to remove bucket instance from cache rc="
+ << ret << dendl;
+ return ret;
+ }
+
+ ret = store->do_idx_op_by_name(RGW_MOTR_BUCKET_INST_IDX_NAME,
+ M0_IC_DEL, info.bucket.name, bl);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: remove_bucket failed to remove bucket instance rc="
+ << ret << dendl;
+ return ret;
+ }
+
+ // TODO :
+ // 8. Remove Notifications
+ // if bucket has notification definitions associated with it
+ // they should be removed (note that any pending notifications on the bucket are still going to be sent)
+
+ // 9. Forward request to master.
+ if (forward_to_master) {
+ bufferlist in_data;
+ ret = store->forward_request_to_master(dpp, owner, &bucket_version, in_data, nullptr, *req_info, y);
+ if (ret < 0) {
+ if (ret == -ENOENT) {
+ /* adjust error, we want to return with NoSuchBucket and not
+ * NoSuchKey */
+ ret = -ERR_NO_SUCH_BUCKET;
+ }
+ ldpp_dout(dpp, 0) << "ERROR: Forward to master failed. ret=" << ret << dendl;
+ return ret;
+ }
+ }
+
+ ldpp_dout(dpp, 20) << "remove_bucket Exit=" << info.bucket.name << dendl;
+
+ return ret;
+}
+
// Stub: GC-bypassing bucket removal is not implemented; always returns 0.
int MotrBucket::remove_bucket_bypass_gc(int concurrent_max, bool
        keep_index_consistent,
        optional_yield y, const
        DoutPrefixProvider *dpp) {
  return 0;
}
+
+int MotrBucket::put_info(const DoutPrefixProvider *dpp, bool exclusive, ceph::real_time _mtime)
+{
+ bufferlist bl;
+ struct MotrBucketInfo mbinfo;
+
+ ldpp_dout(dpp, 20) << "put_info(): bucket_id=" << info.bucket.bucket_id << dendl;
+ mbinfo.info = info;
+ mbinfo.bucket_attrs = attrs;
+ mbinfo.mtime = _mtime;
+ mbinfo.bucket_version = bucket_version;
+ mbinfo.encode(bl);
+
+ // Insert bucket instance using bucket's marker (string).
+ int rc = store->do_idx_op_by_name(RGW_MOTR_BUCKET_INST_IDX_NAME,
+ M0_IC_PUT, info.bucket.name, bl, !exclusive);
+ if (rc == 0)
+ store->get_bucket_inst_cache()->put(dpp, info.bucket.name, bl);
+
+ return rc;
+}
+
+int MotrBucket::load_bucket(const DoutPrefixProvider *dpp, optional_yield y, bool get_stats)
+{
+ // Get bucket instance using bucket's name (string). or bucket id?
+ bufferlist bl;
+ if (store->get_bucket_inst_cache()->get(dpp, info.bucket.name, bl)) {
+ // Cache misses.
+ ldpp_dout(dpp, 20) << "load_bucket(): name=" << info.bucket.name << dendl;
+ int rc = store->do_idx_op_by_name(RGW_MOTR_BUCKET_INST_IDX_NAME,
+ M0_IC_GET, info.bucket.name, bl);
+ ldpp_dout(dpp, 20) << "load_bucket(): rc=" << rc << dendl;
+ if (rc < 0)
+ return rc;
+ store->get_bucket_inst_cache()->put(dpp, info.bucket.name, bl);
+ }
+
+ struct MotrBucketInfo mbinfo;
+ bufferlist& blr = bl;
+ auto iter =blr.cbegin();
+ mbinfo.decode(iter); //Decode into MotrBucketInfo.
+
+ info = mbinfo.info;
+ ldpp_dout(dpp, 20) << "load_bucket(): bucket_id=" << info.bucket.bucket_id << dendl;
+ rgw_placement_rule placement_rule;
+ placement_rule.name = "default";
+ placement_rule.storage_class = "STANDARD";
+ info.placement_rule = placement_rule;
+
+ attrs = mbinfo.bucket_attrs;
+ mtime = mbinfo.mtime;
+ bucket_version = mbinfo.bucket_version;
+
+ return 0;
+}
+
+int MotrBucket::link_user(const DoutPrefixProvider* dpp, User* new_user, optional_yield y)
+{
+ bufferlist bl;
+ RGWBucketEnt new_bucket;
+ ceph::real_time creation_time = get_creation_time();
+
+ // RGWBucketEnt or cls_user_bucket_entry is the structure that is stored.
+ new_bucket.bucket = info.bucket;
+ new_bucket.size = 0;
+ if (real_clock::is_zero(creation_time))
+ creation_time = ceph::real_clock::now();
+ new_bucket.creation_time = creation_time;
+ new_bucket.encode(bl);
+ std::time_t ctime = ceph::real_clock::to_time_t(new_bucket.creation_time);
+ ldpp_dout(dpp, 20) << "got creation time: << " << std::put_time(std::localtime(&ctime), "%F %T") << dendl;
+
+ // Insert the user into the user info index.
+ string user_info_idx_name = "motr.rgw.user.info." + new_user->get_info().user_id.to_str();
+ return store->do_idx_op_by_name(user_info_idx_name,
+ M0_IC_PUT, info.bucket.name, bl);
+
+}
+
+int MotrBucket::unlink_user(const DoutPrefixProvider* dpp, User* new_user, optional_yield y)
+{
+ // Remove the user into the user info index.
+ bufferlist bl;
+ string user_info_idx_name = "motr.rgw.user.info." + new_user->get_info().user_id.to_str();
+ return store->do_idx_op_by_name(user_info_idx_name,
+ M0_IC_DEL, info.bucket.name, bl);
+}
+
/* stats - Not for first pass */
// Stub: bucket index stats are not implemented; out-params untouched.
int MotrBucket::read_stats(const DoutPrefixProvider *dpp,
    const bucket_index_layout_generation& idx_layout, int shard_id,
    std::string *bucket_ver, std::string *master_ver,
    std::map<RGWObjCategory, RGWStorageStats>& stats,
    std::string *max_marker, bool *syncstopped)
{
  return 0;
}
+
+int MotrBucket::create_bucket_index()
+{
+ string bucket_index_iname = "motr.rgw.bucket.index." + info.bucket.name;
+ return store->create_motr_idx_by_name(bucket_index_iname);
+}
+
+int MotrBucket::create_multipart_indices()
+{
+ int rc;
+
+ // Bucket multipart index stores in-progress multipart uploads.
+ // Key is the object name + upload_id, value is a rgw_bucket_dir_entry.
+ // An entry is inserted when a multipart upload is initialised (
+ // MotrMultipartUpload::init()) and will be removed when the upload
+ // is completed (MotrMultipartUpload::complete()).
+ // MotrBucket::list_multiparts() will scan this index to return all
+ // in-progress multipart uploads in the bucket.
+ string bucket_multipart_iname = "motr.rgw.bucket." + info.bucket.name + ".multiparts";
+ rc = store->create_motr_idx_by_name(bucket_multipart_iname);
+ if (rc < 0) {
+ ldout(store->cctx, 0) << "Failed to create bucket multipart index " << bucket_multipart_iname << dendl;
+ return rc;
+ }
+
+ return 0;
+}
+
+
// Stub: async bucket stats not implemented; 'ctx' is never invoked.
int MotrBucket::read_stats_async(const DoutPrefixProvider *dpp,
                                 const bucket_index_layout_generation& idx_layout,
                                 int shard_id, RGWGetBucketStats_CB *ctx)
{
  return 0;
}
+
// Stub: user-stats syncing not implemented; always returns 0.
int MotrBucket::sync_user_stats(const DoutPrefixProvider *dpp, optional_yield y)
{
  return 0;
}
+
// Stub: container stats not implemented; always returns 0.
int MotrBucket::update_container_stats(const DoutPrefixProvider *dpp)
{
  return 0;
}
+
// Stub: bucket sharding not applicable to this backend; always returns 0.
int MotrBucket::check_bucket_shards(const DoutPrefixProvider *dpp)
{
  return 0;
}
+
// Stub: owner change is not applied to the stored bucket record yet.
int MotrBucket::chown(const DoutPrefixProvider *dpp, User& new_user, optional_yield y)
{
  // TODO: update bucket with new owner
  return 0;
}
+
+/* Make sure to call load_bucket() if you need it first */
+bool MotrBucket::is_owner(User* user)
+{
+ return (info.owner.compare(user->get_id()) == 0);
+}
+
// Stub: always reports "empty" (returns 0) — the object listing check
// is not implemented yet.
int MotrBucket::check_empty(const DoutPrefixProvider *dpp, optional_yield y)
{
  /* XXX: Check if bucket contains any objects */
  return 0;
}
+
// Stub: quota enforcement requires stats, which are not implemented;
// always allows the write (returns 0).
int MotrBucket::check_quota(const DoutPrefixProvider *dpp, RGWQuota& quota, uint64_t obj_size,
    optional_yield y, bool check_size_only)
{
  /* Not Handled in the first pass as stats are also needed */
  return 0;
}
+
+int MotrBucket::merge_and_store_attrs(const DoutPrefixProvider *dpp, Attrs& new_attrs, optional_yield y)
+{
+ for (auto& it : new_attrs)
+ attrs[it.first] = it.second;
+
+ return put_info(dpp, y, ceph::real_time());
+}
+
// Stub: info refresh not implemented; 'pmtime' is left untouched.
int MotrBucket::try_refresh_info(const DoutPrefixProvider *dpp, ceph::real_time *pmtime)
{
  return 0;
}
+
/* XXX: usage and stats not supported in the first pass */
// Stub: 'usage' stays empty; always returns 0.
int MotrBucket::read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch,
    uint32_t max_entries, bool *is_truncated,
    RGWUsageIter& usage_iter,
    map<rgw_user_bucket, rgw_usage_log_entry>& usage)
{
  return 0;
}
+
// Stub: no usage logs recorded, so nothing to trim; always returns 0.
int MotrBucket::trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch)
{
  return 0;
}
+
// Stub: object-entry removal from the bucket index is not implemented.
int MotrBucket::remove_objs_from_index(const DoutPrefixProvider *dpp, std::list<rgw_obj_index_key>& objs_to_unlink)
{
  /* XXX: CHECK: Unlike RadosStore, there is no seperate bucket index table.
   * Delete all the object in the list from the object table of this
   * bucket
   */
  return 0;
}
+
// Stub: index consistency check not implemented; out-maps untouched.
int MotrBucket::check_index(const DoutPrefixProvider *dpp, std::map<RGWObjCategory, RGWStorageStats>& existing_stats, std::map<RGWObjCategory, RGWStorageStats>& calculated_stats)
{
  /* XXX: stats not supported yet */
  return 0;
}
+
// Stub: index rebuild is not applicable to this backend.
int MotrBucket::rebuild_index(const DoutPrefixProvider *dpp)
{
  /* there is no index table in dbstore. Not applicable */
  return 0;
}
+
// Stub: tag timeouts are not supported; always returns 0.
int MotrBucket::set_tag_timeout(const DoutPrefixProvider *dpp, uint64_t timeout)
{
  /* XXX: CHECK: set tag timeout for all the bucket objects? */
  return 0;
}
+
// Stub: instance purge not implemented; always returns 0.
int MotrBucket::purge_instance(const DoutPrefixProvider *dpp)
{
  /* XXX: CHECK: for dbstore only single instance supported.
   * Remove all the objects for that instance? Anything extra needed?
   */
  return 0;
}
+
+int MotrBucket::set_acl(const DoutPrefixProvider *dpp, RGWAccessControlPolicy &acl, optional_yield y)
+{
+ int ret = 0;
+ bufferlist aclbl;
+
+ acls = acl;
+ acl.encode(aclbl);
+
+ Attrs attrs = get_attrs();
+ attrs[RGW_ATTR_ACL] = aclbl;
+
+ // TODO: update bucket entry with the new attrs
+
+ return ret;
+}
+
// Factory: wrap key 'k' in a MotrObject bound to this bucket and store.
std::unique_ptr<Object> MotrBucket::get_object(const rgw_obj_key& k)
{
  return std::make_unique<MotrObject>(this->store, k, this);
}
+
// List up to 'max' entries of this bucket via a NEXT query on the bucket
// index, starting from params.marker (or params.prefix when no marker).
// Values of zero length are treated as common prefixes; non-empty values
// decode to rgw_bucket_dir_entry and are filtered by visibility unless
// list_versions is set.
// NOTE(review): truncation is inferred from a full batch (i == max), and
// next_marker is the last key plus a trailing space — presumably to make
// the next NEXT query strictly greater than the last returned key; confirm
// against next_query_by_name()'s range semantics.
int MotrBucket::list(const DoutPrefixProvider *dpp, ListParams& params, int max, ListResults& results, optional_yield y)
{
  int rc;
  vector<string> keys(max);
  vector<bufferlist> vals(max);

  ldpp_dout(dpp, 20) << "bucket=" << info.bucket.name
                    << " prefix=" << params.prefix
                    << " marker=" << params.marker
                    << " max=" << max << dendl;

  // Retrieve all `max` number of pairs.
  string bucket_index_iname = "motr.rgw.bucket.index." + info.bucket.name;
  keys[0] = params.marker.empty() ? params.prefix :
            params.marker.get_oid();
  rc = store->next_query_by_name(bucket_index_iname, keys, vals, params.prefix,
                                 params.delim);
  if (rc < 0) {
    ldpp_dout(dpp, 0) << "ERROR: NEXT query failed. " << rc << dendl;
    return rc;
  }

  // Process the returned pairs to add into ListResults.
  // 'rc' is the number of key/value pairs returned (assumed <= max).
  int i = 0;
  for (; i < rc; ++i) {
    if (vals[i].length() == 0) {
      // Zero-length value marks a common prefix (delimiter rollup).
      results.common_prefixes[keys[i]] = true;
    } else {
      rgw_bucket_dir_entry ent;
      auto iter = vals[i].cbegin();
      ent.decode(iter);
      if (params.list_versions || ent.is_visible())
        results.objs.emplace_back(std::move(ent));
    }
  }

  if (i == max) {
    results.is_truncated = true;
    results.next_marker = keys[max - 1] + " ";
  } else {
    results.is_truncated = false;
  }

  return 0;
}
+
// List in-progress multipart uploads recorded in the bucket's multipart
// index. Starting from 'marker', entries are decoded, filtered by
// 'prefix', and wrapped in MultipartUpload objects; 'marker' is updated
// to the last returned key. A zero-length value terminates the scan
// (treated as end of data). Common prefixes and 'delim' are not handled
// in this POC.
int MotrBucket::list_multiparts(const DoutPrefixProvider *dpp,
      const string& prefix,
      string& marker,
      const string& delim,
      const int& max_uploads,
      vector<std::unique_ptr<MultipartUpload>>& uploads,
      map<string, bool> *common_prefixes,
      bool *is_truncated)
{
  int rc;
  vector<string> key_vec(max_uploads);
  vector<bufferlist> val_vec(max_uploads);

  string bucket_multipart_iname =
      "motr.rgw.bucket." + this->get_name() + ".multiparts";
  key_vec[0].clear();
  key_vec[0].assign(marker.begin(), marker.end());
  rc = store->next_query_by_name(bucket_multipart_iname, key_vec, val_vec);
  if (rc < 0) {
    ldpp_dout(dpp, 0) << "ERROR: NEXT query failed. " << rc << dendl;
    return rc;
  }

  // Process the returned pairs to add into ListResults.
  // The POC can only support listing all objects or selecting
  // with prefix.
  int ocount = 0;
  rgw_obj_key last_obj_key;
  *is_truncated = false;
  for (const auto& bl: val_vec) {
    if (bl.length() == 0)
      break;  // empty value: no more entries

    rgw_bucket_dir_entry ent;
    auto iter = bl.cbegin();
    ent.decode(iter);

    // Skip entries outside the requested prefix.
    if (prefix.size() &&
        (0 != ent.key.name.compare(0, prefix.size(), prefix))) {
      ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ <<
        ": skippping \"" << ent.key <<
        "\" because doesn't match prefix" << dendl;
      continue;
    }

    rgw_obj_key key(ent.key);
    uploads.push_back(this->get_multipart_upload(key.name));
    last_obj_key = key;
    ocount++;
    if (ocount == max_uploads) {
      // Batch full: report truncation so the caller pages again.
      *is_truncated = true;
      break;
    }
  }
  marker = last_obj_key.name;

  // What is common prefix? We don't handle it for now.

  return 0;

}
+
// Stub: aborting all in-progress multipart uploads is not implemented yet.
int MotrBucket::abort_multiparts(const DoutPrefixProvider *dpp, CephContext *cct)
{
  return 0;
}
+
// Tear down the store: shut down the Motr client instance.
void MotrStore::finalize(void)
{
  // close connection with motr
  m0_client_fini(this->instance, true);
}
+
+const std::string& MotrZoneGroup::get_endpoint() const
+{
+ if (!group.endpoints.empty()) {
+ return group.endpoints.front();
+ } else {
+ // use zonegroup's master zone endpoints
+ auto z = group.zones.find(group.master_zone);
+ if (z != group.zones.end() && !z->second.endpoints.empty()) {
+ return z->second.endpoints.front();
+ }
+ }
+ return empty;
+}
+
+bool MotrZoneGroup::placement_target_exists(std::string& target) const
+{
+ return !!group.placement_targets.count(target);
+}
+
+void MotrZoneGroup::get_placement_target_names(std::set<std::string>& names) const
+{
+ for (const auto& target : group.placement_targets) {
+ names.emplace(target.second.name);
+ }
+}
+
+int MotrZoneGroup::get_placement_tier(const rgw_placement_rule& rule,
+ std::unique_ptr<PlacementTier>* tier)
+{
+ std::map<std::string, RGWZoneGroupPlacementTarget>::const_iterator titer;
+ titer = group.placement_targets.find(rule.name);
+ if (titer == group.placement_targets.end()) {
+ return -ENOENT;
+ }
+
+ const auto& target_rule = titer->second;
+ std::map<std::string, RGWZoneGroupPlacementTier>::const_iterator ttier;
+ ttier = target_rule.tier_targets.find(rule.storage_class);
+ if (ttier == target_rule.tier_targets.end()) {
+ // not found
+ return -ENOENT;
+ }
+
+ PlacementTier* t;
+ t = new MotrPlacementTier(store, ttier->second);
+ if (!t)
+ return -ENOMEM;
+
+ tier->reset(t);
+ return 0;
+}
+
// Accessor for the zone's owning zonegroup.
ZoneGroup& MotrZone::get_zonegroup()
{
  return zonegroup;
}
+
// Accessor: zone id, delegated to the zone params.
const std::string& MotrZone::get_id()
{
  return zone_params->get_id();
}
+
// Accessor: zone name, delegated to the zone params.
const std::string& MotrZone::get_name() const
{
  return zone_params->get_name();
}
+
// This zone is always writeable (no read-only replica support).
bool MotrZone::is_writeable()
{
  return true;
}
+
// No redirect endpoint is configured for this backend; 'endpoint' is
// left untouched.
bool MotrZone::get_redirect_endpoint(std::string* endpoint)
{
  return false;
}
+
// True iff the zonegroup's api_name matches the requested api string.
bool MotrZone::has_zonegroup_api(const std::string& api) const
{
  return (zonegroup->api_name == api);
}
+
// Accessor: id of the current realm period.
const std::string& MotrZone::get_current_period_id()
{
  return current_period->get_id();
}
+
// Factory: Lua script manager bound to this store.
std::unique_ptr<LuaManager> MotrStore::get_lua_manager()
{
  return std::make_unique<MotrLuaManager>(this);
}
+
// Populate this object's in-memory 'state' (exists/size/mtime/etag) from
// its rgw_bucket_dir_entry, fetched from the object-meta cache or, on a
// miss, from the bucket index (and then cached).
// NOTE(review): the '_state' out-parameter is never assigned here — callers
// presumably rely on state being reachable some other way; confirm whether
// '*_state = &state' is missing.
int MotrObject::get_obj_state(const DoutPrefixProvider* dpp, RGWObjState **_state, optional_yield y, bool follow_olh)
{
  // Get object's metadata (those stored in rgw_bucket_dir_entry).
  bufferlist bl;
  if (this->store->get_obj_meta_cache()->get(dpp, this->get_key().get_oid(), bl)) {
    // Cache misses.
    string bucket_index_iname = "motr.rgw.bucket.index." + this->get_bucket()->get_name();
    int rc = this->store->do_idx_op_by_name(bucket_index_iname,
                                            M0_IC_GET, this->get_key().get_oid(), bl);
    if (rc < 0) {
      ldpp_dout(dpp, 0) << "Failed to get object's entry from bucket index. " << dendl;
      return rc;
    }

    // Put into cache.
    this->store->get_obj_meta_cache()->put(dpp, this->get_key().get_oid(), bl);
  }

  rgw_bucket_dir_entry ent;
  bufferlist& blr = bl;
  auto iter = blr.cbegin();
  ent.decode(iter);

  // Set object's type.
  this->category = ent.meta.category;

  // Set object state.
  state.exists = true;
  state.size = ent.meta.size;
  state.accounted_size = ent.meta.size;
  state.mtime = ent.meta.mtime;

  // Expose the etag as an attribute, as readers expect it in attrset.
  state.has_attrs = true;
  bufferlist etag_bl;
  string& etag = ent.meta.etag;
  ldpp_dout(dpp, 20) <<__func__<< ": object's etag:  " << ent.meta.etag << dendl;
  etag_bl.append(etag);
  state.attrset[RGW_ATTR_ETAG] = etag_bl;

  return 0;
}
+
// Destructor: release the underlying Motr object handle, if open.
MotrObject::~MotrObject() {
  this->close_mobj();
}
+
+// int MotrObject::read_attrs(const DoutPrefixProvider* dpp, Motr::Object::Read &read_op, optional_yield y, rgw_obj* target_obj)
+// {
+// read_op.params.attrs = &attrs;
+// read_op.params.target_obj = target_obj;
+// read_op.params.obj_size = &obj_size;
+// read_op.params.lastmod = &mtime;
+//
+// return read_op.prepare(dpp);
+// }
+
// Stub: attribute set/delete is not persisted yet; always returns 0.
int MotrObject::set_obj_attrs(const DoutPrefixProvider* dpp, Attrs* setattrs, Attrs* delattrs, optional_yield y)
{
  // TODO: implement
  ldpp_dout(dpp, 20) <<__func__<< ": MotrObject::set_obj_attrs()" << dendl;
  return 0;
}
+
+int MotrObject::get_obj_attrs(optional_yield y, const DoutPrefixProvider* dpp, rgw_obj* target_obj)
+{
+ if (this->category == RGWObjCategory::MultiMeta)
+ return 0;
+
+ string bname, key;
+ if (target_obj) {
+ bname = target_obj->bucket.name;
+ key = target_obj->key.get_oid();
+ } else {
+ bname = this->get_bucket()->get_name();
+ key = this->get_key().get_oid();
+ }
+ ldpp_dout(dpp, 20) << "MotrObject::get_obj_attrs(): "
+ << bname << "/" << key << dendl;
+
+ // Get object's metadata (those stored in rgw_bucket_dir_entry).
+ bufferlist bl;
+ if (this->store->get_obj_meta_cache()->get(dpp, key, bl)) {
+ // Cache misses.
+ string bucket_index_iname = "motr.rgw.bucket.index." + bname;
+ int rc = this->store->do_idx_op_by_name(bucket_index_iname, M0_IC_GET, key, bl);
+ if (rc < 0) {
+ ldpp_dout(dpp, 0) << "Failed to get object's entry from bucket index. " << dendl;
+ return rc;
+ }
+
+ // Put into cache.
+ this->store->get_obj_meta_cache()->put(dpp, key, bl);
+ }
+
+ rgw_bucket_dir_entry ent;
+ bufferlist& blr = bl;
+ auto iter = blr.cbegin();
+ ent.decode(iter);
+ decode(attrs, iter);
+
+ return 0;
+}
+
+int MotrObject::modify_obj_attrs(const char* attr_name, bufferlist& attr_val, optional_yield y, const DoutPrefixProvider* dpp)
+{
+ rgw_obj target = get_obj();
+ int r = get_obj_attrs(y, dpp, &target);
+ if (r < 0) {
+ return r;
+ }
+ set_atomic();
+ attrs[attr_name] = attr_val;
+ return set_obj_attrs(dpp, &attrs, nullptr, y);
+}
+
+int MotrObject::delete_obj_attrs(const DoutPrefixProvider* dpp, const char* attr_name, optional_yield y)
+{
+ rgw_obj target = get_obj();
+ Attrs rmattr;
+ bufferlist bl;
+
+ set_atomic();
+ rmattr[attr_name] = bl;
+ return set_obj_attrs(dpp, nullptr, &rmattr, y);
+}
+
// Stub: object expiration is not supported; objects never expire.
bool MotrObject::is_expired() {
  return false;
}
+
+// Taken from rgw_rados.cc
+void MotrObject::gen_rand_obj_instance_name()
+{
+ enum {OBJ_INSTANCE_LEN = 32};
+ char buf[OBJ_INSTANCE_LEN + 1];
+
+ gen_rand_alphanumeric_no_underscore(store->ctx(), buf, OBJ_INSTANCE_LEN);
+ state.obj.key.set_instance(buf);
+}
+
// Stub: omap is not supported for this backend; 'm'/'pmore' untouched.
int MotrObject::omap_get_vals(const DoutPrefixProvider *dpp, const std::string& marker, uint64_t count,
    std::map<std::string, bufferlist> *m,
    bool* pmore, optional_yield y)
{
  return 0;
}
+
// Stub: omap is not supported; 'm' is left empty.
int MotrObject::omap_get_all(const DoutPrefixProvider *dpp, std::map<std::string, bufferlist> *m,
    optional_yield y)
{
  return 0;
}
+
// Stub: omap is not supported; 'vals' is left untouched.
int MotrObject::omap_get_vals_by_keys(const DoutPrefixProvider *dpp, const std::string& oid,
    const std::set<std::string>& keys,
    Attrs* vals)
{
  return 0;
}
+
// Stub: omap is not supported; the value is not stored.
int MotrObject::omap_set_val_by_key(const DoutPrefixProvider *dpp, const std::string& key, bufferlist& val,
    bool must_exist, optional_yield y)
{
  return 0;
}
+
// Stub: per-object ownership change is not implemented.
int MotrObject::chown(User& new_user, const DoutPrefixProvider* dpp, optional_yield y)
{
  return 0;
}
+
// Factory: multipart serializer (lock) for this object under 'lock_name'.
std::unique_ptr<MPSerializer> MotrObject::get_serializer(const DoutPrefixProvider *dpp,
                                                         const std::string& lock_name)
{
  return std::make_unique<MPMotrSerializer>(dpp, store, this, lock_name);
}
+
// Stub: storage-class/placement transition is not implemented.
int MotrObject::transition(Bucket* bucket,
    const rgw_placement_rule& placement_rule,
    const real_time& mtime,
    uint64_t olh_epoch,
    const DoutPrefixProvider* dpp,
    optional_yield y)
{
  return 0;
}
+
// Always true: only a single default zone/zonegroup/placement is
// supported, so any two rules are considered equivalent.
bool MotrObject::placement_rules_match(rgw_placement_rule& r1, rgw_placement_rule& r2)
{
  /* XXX: support single default zone and zonegroup for now */
  return true;
}
+
// Stub: layout dumping is not implemented; nothing is written to 'f'.
int MotrObject::dump_obj_layout(const DoutPrefixProvider *dpp, optional_yield y, Formatter* f)
{
  return 0;
}
+
// Factory: read operation bound to this object.
std::unique_ptr<Object::ReadOp> MotrObject::get_read_op()
{
  return std::make_unique<MotrObject::MotrReadOp>(this);
}
+
// Bind the read op to its source object (non-owning pointer).
MotrObject::MotrReadOp::MotrReadOp(MotrObject *_source) :
  source(_source)
{ }
+
+int MotrObject::MotrReadOp::prepare(optional_yield y, const DoutPrefixProvider* dpp)
+{
+ int rc;
+ ldpp_dout(dpp, 20) <<__func__<< ": bucket=" << source->get_bucket()->get_name() << dendl;
+
+ rgw_bucket_dir_entry ent;
+ rc = source->get_bucket_dir_ent(dpp, ent);
+ if (rc < 0)
+ return rc;
+
+ // Set source object's attrs. The attrs is key/value map and is used
+ // in send_response_data() to set attributes, including etag.
+ bufferlist etag_bl;
+ string& etag = ent.meta.etag;
+ ldpp_dout(dpp, 20) <<__func__<< ": object's etag: " << ent.meta.etag << dendl;
+ etag_bl.append(etag.c_str(), etag.size());
+ source->get_attrs().emplace(std::move(RGW_ATTR_ETAG), std::move(etag_bl));
+
+ source->set_key(ent.key);
+ source->set_obj_size(ent.meta.size);
+ source->category = ent.meta.category;
+ *params.lastmod = ent.meta.mtime;
+
+ if (params.mod_ptr || params.unmod_ptr) {
+ // Convert all times go GMT to make them compatible
+ obj_time_weight src_weight;
+ src_weight.init(*params.lastmod, params.mod_zone_id, params.mod_pg_ver);
+ src_weight.high_precision = params.high_precision_time;
+
+ obj_time_weight dest_weight;
+ dest_weight.high_precision = params.high_precision_time;
+
+ // Check if-modified-since condition
+ if (params.mod_ptr && !params.if_nomatch) {
+ dest_weight.init(*params.mod_ptr, params.mod_zone_id, params.mod_pg_ver);
+ ldpp_dout(dpp, 10) << "If-Modified-Since: " << dest_weight << " & "
+ << "Last-Modified: " << src_weight << dendl;
+ if (!(dest_weight < src_weight)) {
+ return -ERR_NOT_MODIFIED;
+ }
+ }
+
+ // Check if-unmodified-since condition
+ if (params.unmod_ptr && !params.if_match) {
+ dest_weight.init(*params.unmod_ptr, params.mod_zone_id, params.mod_pg_ver);
+ ldpp_dout(dpp, 10) << "If-UnModified-Since: " << dest_weight << " & "
+ << "Last-Modified: " << src_weight << dendl;
+ if (dest_weight < src_weight) {
+ return -ERR_PRECONDITION_FAILED;
+ }
+ }
+ }
+ // Check if-match condition
+ if (params.if_match) {
+ string if_match_str = rgw_string_unquote(params.if_match);
+ ldpp_dout(dpp, 10) << "ETag: " << etag << " & "
+ << "If-Match: " << if_match_str << dendl;
+ if (if_match_str.compare(etag) != 0) {
+ return -ERR_PRECONDITION_FAILED;
+ }
+ }
+ // Check if-none-match condition
+ if (params.if_nomatch) {
+ string if_nomatch_str = rgw_string_unquote(params.if_nomatch);
+ ldpp_dout(dpp, 10) << "ETag: " << etag << " & "
+ << "If-NoMatch: " << if_nomatch_str << dendl;
+ if (if_nomatch_str.compare(etag) == 0) {
+ return -ERR_NOT_MODIFIED;
+ }
+ }
+
+ // Skip opening an empty object.
+ if(source->get_obj_size() == 0)
+ return 0;
+
+ // Open the object here.
+ if (source->category == RGWObjCategory::MultiMeta) {
+ ldpp_dout(dpp, 20) <<__func__<< ": open obj parts..." << dendl;
+ rc = source->get_part_objs(dpp, this->part_objs)? :
+ source->open_part_objs(dpp, this->part_objs);
+ return rc;
+ } else {
+ ldpp_dout(dpp, 20) <<__func__<< ": open object..." << dendl;
+ return source->open_mobj(dpp);
+ }
+}
+
// Stub: synchronous range read is not implemented (use iterate() with a
// callback instead); 'bl' is left untouched.
int MotrObject::MotrReadOp::read(int64_t off, int64_t end, bufferlist& bl, optional_yield y, const DoutPrefixProvider* dpp)
{
  ldpp_dout(dpp, 20) << "MotrReadOp::read(): sync read." << dendl;
  return 0;
}
+
+// RGWGetObj::execute() calls ReadOp::iterate() to read object from 'off' to 'end'.
+// The returned data is processed in 'cb' which is a chain of post-processing
+// filters such as decompression, de-encryption and sending back data to client
+// (RGWGetObj_CB::handle_dta which in turn calls RGWGetObj::get_data_cb() to
+// send data back.).
+//
+// POC implements a simple sync version of iterate() function in which it reads
+// a block of data each time and call 'cb' for post-processing.
+int MotrObject::MotrReadOp::iterate(const DoutPrefixProvider* dpp, int64_t off, int64_t end, RGWGetDataCB* cb, optional_yield y)
+{
+ int rc;
+
+ if (source->category == RGWObjCategory::MultiMeta)
+ rc = source->read_multipart_obj(dpp, off, end, cb, part_objs);
+ else
+ rc = source->read_mobj(dpp, off, end, cb);
+
+ return rc;
+}
+
// Attribute lookup via the read op is not implemented for the Motr
// backend; report "no such attribute" so callers fall back cleanly.
int MotrObject::MotrReadOp::get_attr(const DoutPrefixProvider* dpp, const char* name, bufferlist& dest, optional_yield y)
{
  //return 0;
  return -ENODATA;
}
+
// Factory for the Motr-specific delete operation on this object.
std::unique_ptr<Object::DeleteOp> MotrObject::get_delete_op()
{
  return std::make_unique<MotrObject::MotrDeleteOp>(this);
}
+
// A delete op only needs a back-pointer to the object it will remove.
MotrObject::MotrDeleteOp::MotrDeleteOp(MotrObject *_source) :
  source(_source)
{ }
+
// Implementation of DELETE OBJ also requires MotrObject::get_obj_state()
// to retrieve and set object's state from object's metadata.
//
// TODO:
// 1. The POC only remove the object's entry from bucket index and delete
//    corresponding Motr objects. It doesn't handle the DeleteOp::params.
//    Delete::delete_obj() in rgw_rados.cc shows how rados backend process the
//    params.
// 2. Delete an object when its versioning is turned on.
//
// Order of operations: metadata cache -> bucket index entry -> Motr
// object data. The index entry goes before the data so a partial
// failure cannot leave an index entry pointing at deleted data.
int MotrObject::MotrDeleteOp::delete_obj(const DoutPrefixProvider* dpp, optional_yield y)
{
  ldpp_dout(dpp, 20) << "delete " << source->get_key().get_oid() << " from " << source->get_bucket()->get_name() << dendl;

  // Fetch the bucket index entry: it carries the object's size and the
  // Motr metadata (oid/layout) needed to delete the backing object.
  rgw_bucket_dir_entry ent;
  int rc = source->get_bucket_dir_ent(dpp, ent);
  if (rc < 0) {
    return rc;
  }

  //TODO: When integrating with background GC for object deletion,
  // we should consider adding object entry to GC before deleting the metadata.
  // Delete from the cache first.
  source->store->get_obj_meta_cache()->remove(dpp, source->get_key().get_oid());

  // Delete the object's entry from the bucket index.
  bufferlist bl;
  string bucket_index_iname = "motr.rgw.bucket.index." + source->get_bucket()->get_name();
  rc = source->store->do_idx_op_by_name(bucket_index_iname,
                                        M0_IC_DEL, source->get_key().get_oid(), bl);
  if (rc < 0) {
    ldpp_dout(dpp, 0) << "Failed to del object's entry from bucket index. " << dendl;
    return rc;
  }

  // A zero-size object has no backing Motr object (see prepare/complete).
  if (ent.meta.size == 0) {
    ldpp_dout(dpp, 0) << __func__ << ": Object size is 0, not deleting motr object." << dendl;
    return 0;
  }
  // Remove the motr objects.
  if (source->category == RGWObjCategory::MultiMeta)
    rc = source->delete_part_objs(dpp);
  else
    rc = source->delete_mobj(dpp);
  if (rc < 0) {
    ldpp_dout(dpp, 0) << "Failed to delete the object from Motr. " << dendl;
    return rc;
  }

  //result.delete_marker = parent_op.result.delete_marker;
  //result.version_id = parent_op.result.version_id;
  return 0;
}
+
+int MotrObject::delete_object(const DoutPrefixProvider* dpp, optional_yield y, bool prevent_versioning)
+{
+ MotrObject::MotrDeleteOp del_op(this);
+ del_op.params.bucket_owner = bucket->get_info().owner;
+ del_op.params.versioning_status = bucket->get_info().versioning_status();
+
+ return del_op.delete_obj(dpp, y);
+}
+
// Async object deletion is not implemented for the Motr backend;
// pretend success so callers using the async path do not fail.
int MotrObject::delete_obj_aio(const DoutPrefixProvider* dpp, RGWObjState* astate,
                               Completions* aio, bool keep_index_consistent,
                               optional_yield y)
{
  /* XXX: Make it async */
  return 0;
}
+
// Server-side object copy is not implemented for the Motr backend yet.
// Returning 0 makes a copy request appear successful without moving any
// data; callers must not rely on this until it is implemented.
int MotrObject::copy_object(User* user,
                            req_info* info,
                            const rgw_zone_id& source_zone,
                            rgw::sal::Object* dest_object,
                            rgw::sal::Bucket* dest_bucket,
                            rgw::sal::Bucket* src_bucket,
                            const rgw_placement_rule& dest_placement,
                            ceph::real_time* src_mtime,
                            ceph::real_time* mtime,
                            const ceph::real_time* mod_ptr,
                            const ceph::real_time* unmod_ptr,
                            bool high_precision_time,
                            const char* if_match,
                            const char* if_nomatch,
                            AttrsMod attrs_mod,
                            bool copy_if_newer,
                            Attrs& attrs,
                            RGWObjCategory category,
                            uint64_t olh_epoch,
                            boost::optional<ceph::real_time> delete_at,
                            std::string* version_id,
                            std::string* tag,
                            std::string* etag,
                            void (*progress_cb)(off_t, void *),
                            void* progress_data,
                            const DoutPrefixProvider* dpp,
                            optional_yield y)
{
  return 0;
}
+
// Swift object versioning restore: not implemented for Motr (no-op).
int MotrObject::swift_versioning_restore(bool& restored,
                                         const DoutPrefixProvider* dpp)
{
  return 0;
}
+
// Swift object versioning copy: not implemented for Motr (no-op).
int MotrObject::swift_versioning_copy(const DoutPrefixProvider* dpp,
                                      optional_yield y)
{
  return 0;
}
+
// Atomic (whole-object) writer. Data arrives via process(), is
// accumulated, written to a Motr object, and the bucket index is
// updated in complete(). 'obj' is the object being written; 'old_obj'
// refers to the same key and is used to locate (and later delete) any
// pre-existing object stored under that key.
MotrAtomicWriter::MotrAtomicWriter(const DoutPrefixProvider *dpp,
                                   optional_yield y,
                                   rgw::sal::Object* obj,
                                   MotrStore* _store,
                                   const rgw_user& _owner,
                                   const rgw_placement_rule *_ptail_placement_rule,
                                   uint64_t _olh_epoch,
                                   const std::string& _unique_tag) :
  StoreWriter(dpp, y),
  store(_store),
  owner(_owner),
  ptail_placement_rule(_ptail_placement_rule),
  olh_epoch(_olh_epoch),
  unique_tag(_unique_tag),
  obj(_store, obj->get_key(), obj->get_bucket()),
  old_obj(_store, obj->get_key(), obj->get_bucket()) {}
+
// Maximum number of scatter/gather fragments per Motr io op
// (see populate_bvec()).
static const unsigned MAX_BUFVEC_NR = 256;

// Prepare for writing: probe whether an object already exists under
// this key (its metadata, loaded into old_obj here, is used later in
// complete() to delete the old data), then allocate the reusable io
// vectors. On allocation failure everything is released again.
int MotrAtomicWriter::prepare(optional_yield y)
{
  total_data_size = 0;

  if (obj.is_opened())
    return 0;

  rgw_bucket_dir_entry ent;
  int rc = old_obj.get_bucket_dir_ent(dpp, ent);
  if (rc == 0) {
    ldpp_dout(dpp, 20) << __func__ << ": object exists." << dendl;
  }

  // Chained ?: allocations: stop at the first failure.
  rc = m0_bufvec_empty_alloc(&buf, MAX_BUFVEC_NR) ?:
       m0_bufvec_alloc(&attr, MAX_BUFVEC_NR, 1) ?:
       m0_indexvec_alloc(&ext, MAX_BUFVEC_NR);
  if (rc != 0)
    this->cleanup();

  return rc;
}
+
// Create a new Motr object sized for 'sz' bytes of data: allocate a
// fresh unique fid, pick a layout id fitting the object size, then run
// a synchronous entity-create op. On success meta.oid, meta.layout_id
// and meta.pver describe the new object.
int MotrObject::create_mobj(const DoutPrefixProvider *dpp, uint64_t sz)
{
  if (mobj != nullptr) {
    ldpp_dout(dpp, 0) <<__func__<< "ERROR: object is already opened" << dendl;
    return -EINVAL;
  }

  // Generate a cluster-wide unique fid for the new object.
  int rc = m0_ufid_next(&ufid_gr, 1, &meta.oid);
  if (rc != 0) {
    ldpp_dout(dpp, 0) <<__func__<< "ERROR: m0_ufid_next() failed: " << rc << dendl;
    return rc;
  }

  char fid_str[M0_FID_STR_LEN];
  snprintf(fid_str, ARRAY_SIZE(fid_str), U128X_F, U128_P(&meta.oid));
  ldpp_dout(dpp, 20) <<__func__<< ": sz=" << sz << " oid=" << fid_str << dendl;

  // Let Motr choose the layout (unit size) best suited for this size.
  int64_t lid = m0_layout_find_by_objsz(store->instance, nullptr, sz);
  M0_ASSERT(lid > 0);

  M0_ASSERT(mobj == nullptr);
  mobj = new m0_obj();
  m0_obj_init(mobj, &store->container.co_realm, &meta.oid, lid);

  // Synchronous create: launch the op and wait until it fails or
  // becomes stable; close_mobj() cleans up on any failure.
  struct m0_op *op = nullptr;
  mobj->ob_entity.en_flags |= M0_ENF_META;
  rc = m0_entity_create(nullptr, &mobj->ob_entity, &op);
  if (rc != 0) {
    this->close_mobj();
    ldpp_dout(dpp, 0) << "ERROR: m0_entity_create() failed: " << rc << dendl;
    return rc;
  }
  ldpp_dout(dpp, 20) <<__func__<< ": call m0_op_launch()..." << dendl;
  m0_op_launch(&op, 1);
  rc = m0_op_wait(op, M0_BITS(M0_OS_FAILED, M0_OS_STABLE), M0_TIME_NEVER) ?:
       m0_rc(op);
  m0_op_fini(op);
  m0_op_free(op);

  if (rc != 0) {
    this->close_mobj();
    ldpp_dout(dpp, 0) << "ERROR: failed to create motr object: " << rc << dendl;
    return rc;
  }

  // Record the layout and pool version actually assigned by Motr.
  meta.layout_id = mobj->ob_attr.oa_layout_id;
  meta.pver = mobj->ob_attr.oa_pver;
  ldpp_dout(dpp, 20) <<__func__<< ": lid=0x" << std::hex << meta.layout_id
                     << std::dec << " rc=" << rc << dendl;

  // TODO: add key:user+bucket+key+obj.meta.oid value:timestamp to
  // gc.queue.index. See more at github.com/Seagate/cortx-rgw/issues/7.

  return rc;
}
+
// Open the Motr object described by this->meta (oid, layout id, pool
// version). If the layout id is not known yet, the bucket index entry
// is fetched first to populate meta; if it is still zero afterwards the
// object does not exist (-ENOENT).
int MotrObject::open_mobj(const DoutPrefixProvider *dpp)
{
  char fid_str[M0_FID_STR_LEN];
  snprintf(fid_str, ARRAY_SIZE(fid_str), U128X_F, U128_P(&meta.oid));
  ldpp_dout(dpp, 20) <<__func__<< ": oid=" << fid_str << dendl;

  int rc;
  if (meta.layout_id == 0) {
    rgw_bucket_dir_entry ent;
    rc = this->get_bucket_dir_ent(dpp, ent);
    if (rc < 0) {
      ldpp_dout(dpp, 0) << "ERROR: open_mobj() failed: rc=" << rc << dendl;
      return rc;
    }
  }

  if (meta.layout_id == 0)
    return -ENOENT;

  M0_ASSERT(mobj == nullptr);
  mobj = new m0_obj();
  memset(mobj, 0, sizeof *mobj);
  m0_obj_init(mobj, &store->container.co_realm, &meta.oid, store->conf.mc_layout_id);

  // Synchronous entity-open with the layout/pver recorded at creation
  // time; close_mobj() cleans up on any failure.
  struct m0_op *op = nullptr;
  mobj->ob_attr.oa_layout_id = meta.layout_id;
  mobj->ob_attr.oa_pver = meta.pver;
  mobj->ob_entity.en_flags |= M0_ENF_META;
  rc = m0_entity_open(&mobj->ob_entity, &op);
  if (rc != 0) {
    ldpp_dout(dpp, 0) << "ERROR: m0_entity_open() failed: rc=" << rc << dendl;
    this->close_mobj();
    return rc;
  }
  m0_op_launch(&op, 1);
  rc = m0_op_wait(op, M0_BITS(M0_OS_FAILED, M0_OS_STABLE), M0_TIME_NEVER) ?:
       m0_rc(op);
  m0_op_fini(op);
  m0_op_free(op);

  if (rc < 0) {
    ldpp_dout(dpp, 10) << "ERROR: failed to open motr object: rc=" << rc << dendl;
    this->close_mobj();
    return rc;
  }

  ldpp_dout(dpp, 20) <<__func__<< ": rc=" << rc << dendl;

  return 0;
}
+
+int MotrObject::delete_mobj(const DoutPrefixProvider *dpp)
+{
+ int rc;
+ char fid_str[M0_FID_STR_LEN];
+ snprintf(fid_str, ARRAY_SIZE(fid_str), U128X_F, U128_P(&meta.oid));
+ if (!meta.oid.u_hi || !meta.oid.u_lo) {
+ ldpp_dout(dpp, 20) << __func__ << ": invalid motr object oid=" << fid_str << dendl;
+ return -EINVAL;
+ }
+ ldpp_dout(dpp, 20) << __func__ << ": deleting motr object oid=" << fid_str << dendl;
+
+ // Open the object.
+ if (mobj == nullptr) {
+ rc = this->open_mobj(dpp);
+ if (rc < 0)
+ return rc;
+ }
+
+ // Create an DELETE op and execute it (sync version).
+ struct m0_op *op = nullptr;
+ mobj->ob_entity.en_flags |= M0_ENF_META;
+ rc = m0_entity_delete(&mobj->ob_entity, &op);
+ if (rc != 0) {
+ ldpp_dout(dpp, 0) << "ERROR: m0_entity_delete() failed: " << rc << dendl;
+ return rc;
+ }
+ m0_op_launch(&op, 1);
+ rc = m0_op_wait(op, M0_BITS(M0_OS_FAILED, M0_OS_STABLE), M0_TIME_NEVER) ?:
+ m0_rc(op);
+ m0_op_fini(op);
+ m0_op_free(op);
+
+ if (rc < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to open motr object: " << rc << dendl;
+ return rc;
+ }
+
+ this->close_mobj();
+
+ return 0;
+}
+
+void MotrObject::close_mobj()
+{
+ if (mobj == nullptr)
+ return;
+ m0_obj_fini(mobj);
+ delete mobj; mobj = nullptr;
+}
+
+int MotrObject::write_mobj(const DoutPrefixProvider *dpp, bufferlist&& data, uint64_t offset)
+{
+ int rc;
+ unsigned bs, left;
+ struct m0_op *op;
+ char *start, *p;
+ struct m0_bufvec buf;
+ struct m0_bufvec attr;
+ struct m0_indexvec ext;
+
+ left = data.length();
+ if (left == 0)
+ return 0;
+
+ rc = m0_bufvec_empty_alloc(&buf, 1) ?:
+ m0_bufvec_alloc(&attr, 1, 1) ?:
+ m0_indexvec_alloc(&ext, 1);
+ if (rc != 0)
+ goto out;
+
+ bs = this->get_optimal_bs(left);
+ ldpp_dout(dpp, 20) <<__func__<< ": left=" << left << " bs=" << bs << dendl;
+
+ start = data.c_str();
+
+ for (p = start; left > 0; left -= bs, p += bs, offset += bs) {
+ if (left < bs)
+ bs = this->get_optimal_bs(left);
+ if (left < bs) {
+ data.append_zero(bs - left);
+ left = bs;
+ p = data.c_str();
+ }
+ buf.ov_buf[0] = p;
+ buf.ov_vec.v_count[0] = bs;
+ ext.iv_index[0] = offset;
+ ext.iv_vec.v_count[0] = bs;
+ attr.ov_vec.v_count[0] = 0;
+
+ op = nullptr;
+ rc = m0_obj_op(this->mobj, M0_OC_WRITE, &ext, &buf, &attr, 0, 0, &op);
+ if (rc != 0)
+ goto out;
+ m0_op_launch(&op, 1);
+ rc = m0_op_wait(op, M0_BITS(M0_OS_FAILED, M0_OS_STABLE), M0_TIME_NEVER) ?:
+ m0_rc(op);
+ m0_op_fini(op);
+ m0_op_free(op);
+ if (rc != 0)
+ goto out;
+ }
+
+out:
+ m0_indexvec_free(&ext);
+ m0_bufvec_free(&attr);
+ m0_bufvec_free2(&buf);
+ return rc;
+}
+
+int MotrObject::read_mobj(const DoutPrefixProvider* dpp, int64_t off, int64_t end, RGWGetDataCB* cb)
+{
+ int rc;
+ unsigned bs, actual, left;
+ struct m0_op *op;
+ struct m0_bufvec buf;
+ struct m0_bufvec attr;
+ struct m0_indexvec ext;
+
+ // make end pointer exclusive:
+ // it's easier to work with it this way
+ end++;
+ ldpp_dout(dpp, 20) << "MotrObject::read_mobj(): off=" << off <<
+ " end=" << end << dendl;
+ // As `off` may not be parity group size aligned, even using optimal
+ // buffer block size, simply reading data from offset `off` could come
+ // across parity group boundary. And Motr only allows page-size aligned
+ // offset.
+ //
+ // The optimal size of each IO should also take into account the data
+ // transfer size to s3 client. For example, 16MB may be nice to read
+ // data from motr, but it could be too big for network transfer.
+ //
+ // TODO: We leave proper handling of offset in the future.
+ bs = this->get_optimal_bs(end - off);
+ ldpp_dout(dpp, 20) << "MotrObject::read_mobj(): bs=" << bs << dendl;
+
+ rc = m0_bufvec_empty_alloc(&buf, 1) ? :
+ m0_bufvec_alloc(&attr, 1, 1) ? :
+ m0_indexvec_alloc(&ext, 1);
+ if (rc < 0)
+ goto out;
+
+ left = end - off;
+ for (; left > 0; off += actual) {
+ if (left < bs)
+ bs = this->get_optimal_bs(left);
+ actual = bs;
+ if (left < bs)
+ actual = left;
+ ldpp_dout(dpp, 20) << "MotrObject::read_mobj(): off=" << off <<
+ " actual=" << actual << dendl;
+ bufferlist bl;
+ buf.ov_buf[0] = bl.append_hole(bs).c_str();
+ buf.ov_vec.v_count[0] = bs;
+ ext.iv_index[0] = off;
+ ext.iv_vec.v_count[0] = bs;
+ attr.ov_vec.v_count[0] = 0;
+
+ left -= actual;
+ // Read from Motr.
+ op = nullptr;
+ rc = m0_obj_op(this->mobj, M0_OC_READ, &ext, &buf, &attr, 0, 0, &op);
+ ldpp_dout(dpp, 20) << "MotrObject::read_mobj(): init read op rc=" << rc << dendl;
+ if (rc != 0) {
+ ldpp_dout(dpp, 0) << __func__ << ": read failed during m0_obj_op, rc=" << rc << dendl;
+ goto out;
+ }
+ m0_op_launch(&op, 1);
+ rc = m0_op_wait(op, M0_BITS(M0_OS_FAILED, M0_OS_STABLE), M0_TIME_NEVER) ?:
+ m0_rc(op);
+ m0_op_fini(op);
+ m0_op_free(op);
+ if (rc != 0) {
+ ldpp_dout(dpp, 0) << __func__ << ": read failed, m0_op_wait rc=" << rc << dendl;
+ goto out;
+ }
+ // Call `cb` to process returned data.
+ ldpp_dout(dpp, 20) << "MotrObject::read_mobj(): call cb to process data" << dendl;
+ cb->handle_data(bl, 0, actual);
+ }
+
+out:
+ m0_indexvec_free(&ext);
+ m0_bufvec_free(&attr);
+ m0_bufvec_free2(&buf);
+ this->close_mobj();
+
+ return rc;
+}
+
// Look up this object's bucket index entry into 'ent' and decode the
// trailing attrs + Motr metadata (oid, layout) into this->meta.
//
// Versioned/suspended buckets: scan index entries sharing this name and
// pick the one flagged CURRENT (a cached copy is used if still current).
// Non-versioned buckets: direct GET by oid, backed by the object
// metadata cache.
int MotrObject::get_bucket_dir_ent(const DoutPrefixProvider *dpp, rgw_bucket_dir_entry& ent)
{
  int rc = 0;
  string bucket_index_iname = "motr.rgw.bucket.index." + this->get_bucket()->get_name();
  int max = 1000;
  vector<string> keys(max);
  vector<bufferlist> vals(max);
  bufferlist bl;
  bufferlist::const_iterator iter;

  if (this->get_bucket()->get_info().versioning_status() == BUCKET_VERSIONED ||
      this->get_bucket()->get_info().versioning_status() == BUCKET_SUSPENDED) {

    rgw_bucket_dir_entry ent_to_check;

    // Fast path: cache hit (get() == 0) whose entry is still CURRENT.
    if (this->store->get_obj_meta_cache()->get(dpp, this->get_name(), bl) == 0) {
      iter = bl.cbegin();
      ent_to_check.decode(iter);
      if (ent_to_check.is_current()) {
        ent = ent_to_check;
        rc = 0;
        goto out;
      }
    }

    ldpp_dout(dpp, 20) <<__func__<< ": versioned bucket!" << dendl;
    keys[0] = this->get_name();
    rc = store->next_query_by_name(bucket_index_iname, keys, vals);
    if (rc < 0) {
      ldpp_dout(dpp, 0) << __func__ << "ERROR: NEXT query failed. " << rc << dendl;
      return rc;
    }

    // Scan returned versions for the CURRENT one; stays -ENOENT if none
    // is found. The winner refreshes the cache.
    rc = -ENOENT;
    for (const auto& bl: vals) {
      if (bl.length() == 0)
        break;

      iter = bl.cbegin();
      ent_to_check.decode(iter);
      if (ent_to_check.is_current()) {
        ldpp_dout(dpp, 20) <<__func__<< ": found current version!" << dendl;
        ent = ent_to_check;
        rc = 0;

        this->store->get_obj_meta_cache()->put(dpp, this->get_name(), bl);

        break;
      }
    }
  } else {
    // Non-versioned bucket: a cache miss (non-zero return) falls
    // through to a direct bucket-index GET, then refreshes the cache.
    if (this->store->get_obj_meta_cache()->get(dpp, this->get_key().get_oid(), bl)) {
      ldpp_dout(dpp, 20) <<__func__<< ": non-versioned bucket!" << dendl;
      rc = this->store->do_idx_op_by_name(bucket_index_iname,
                                          M0_IC_GET, this->get_key().get_oid(), bl);
      if (rc < 0) {
        ldpp_dout(dpp, 0) << __func__ << "ERROR: failed to get object's entry from bucket index: rc="
                          << rc << dendl;
        return rc;
      }
      this->store->get_obj_meta_cache()->put(dpp, this->get_key().get_oid(), bl);
    }

    bufferlist& blr = bl;
    iter = blr.cbegin();
    ent.decode(iter);
  }

out:
  // On success 'iter' is positioned just past the dir entry: decode
  // (and discard) the attrs, then the Motr object metadata.
  if (rc == 0) {
    sal::Attrs dummy;
    decode(dummy, iter);
    meta.decode(iter);
    ldpp_dout(dpp, 20) <<__func__<< ": lid=0x" << std::hex << meta.layout_id << dendl;
    char fid_str[M0_FID_STR_LEN];
    snprintf(fid_str, ARRAY_SIZE(fid_str), U128X_F, U128_P(&meta.oid));
    ldpp_dout(dpp, 70) << __func__ << ": oid=" << fid_str << dendl;
  } else
    ldpp_dout(dpp, 0) <<__func__<< ": rc=" << rc << dendl;

  return rc;
}
+
+int MotrObject::update_version_entries(const DoutPrefixProvider *dpp)
+{
+ int rc;
+ int max = 10;
+ vector<string> keys(max);
+ vector<bufferlist> vals(max);
+
+ string bucket_index_iname = "motr.rgw.bucket.index." + this->get_bucket()->get_name();
+ keys[0] = this->get_name();
+ rc = store->next_query_by_name(bucket_index_iname, keys, vals);
+ ldpp_dout(dpp, 20) << "get all versions, name = " << this->get_name() << "rc = " << rc << dendl;
+ if (rc < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: NEXT query failed. " << rc << dendl;
+ return rc;
+ }
+
+ // no entries returned.
+ if (rc == 0)
+ return 0;
+
+ for (const auto& bl: vals) {
+ if (bl.length() == 0)
+ break;
+
+ rgw_bucket_dir_entry ent;
+ auto iter = bl.cbegin();
+ ent.decode(iter);
+
+ if (0 != ent.key.name.compare(0, this->get_name().size(), this->get_name()))
+ continue;
+
+ if (!ent.is_current())
+ continue;
+
+ // Remove from the cache.
+ store->get_obj_meta_cache()->remove(dpp, this->get_name());
+
+ rgw::sal::Attrs attrs;
+ decode(attrs, iter);
+ MotrObject::Meta meta;
+ meta.decode(iter);
+
+ ent.flags = rgw_bucket_dir_entry::FLAG_VER;
+ string key;
+ if (ent.key.instance.empty())
+ key = ent.key.name;
+ else {
+ char buf[ent.key.name.size() + ent.key.instance.size() + 16];
+ snprintf(buf, sizeof(buf), "%s[%s]", ent.key.name.c_str(), ent.key.instance.c_str());
+ key = buf;
+ }
+ ldpp_dout(dpp, 20) << "update one version, key = " << key << dendl;
+ bufferlist ent_bl;
+ ent.encode(ent_bl);
+ encode(attrs, ent_bl);
+ meta.encode(ent_bl);
+
+ rc = store->do_idx_op_by_name(bucket_index_iname,
+ M0_IC_PUT, key, ent_bl);
+ if (rc < 0)
+ break;
+ }
+ return rc;
+}
+
// Scan object_nnn_part_index to get all parts then open their motr objects.
// TODO: all parts are opened in the POC. But for a large object, for example
// a 5GB object will have about 300 parts (for default 15MB part). A better
// way of managing opened object may be needed.
//
// Builds one MotrObject per part (keyed by part number) carrying the
// part's absolute offset/size within the whole object; the Motr handles
// themselves are opened later by open_part_objs().
int MotrObject::get_part_objs(const DoutPrefixProvider* dpp,
                              std::map<int, std::unique_ptr<MotrObject>>& part_objs)
{
  int rc;
  int max_parts = 1000;
  int marker = 0;
  uint64_t off = 0; // running absolute offset of the next part
  bool truncated = false;
  std::unique_ptr<rgw::sal::MultipartUpload> upload;

  upload = this->get_bucket()->get_multipart_upload(this->get_name(), string());

  do {
    rc = upload->list_parts(dpp, store->ctx(), max_parts, marker, &marker, &truncated);
    if (rc == -ENOENT) {
      rc = -ERR_NO_SUCH_UPLOAD;
    }
    if (rc < 0)
      return rc;

    std::map<uint32_t, std::unique_ptr<MultipartPart>>& parts = upload->get_parts();
    for (auto part_iter = parts.begin(); part_iter != parts.end(); ++part_iter) {

      MultipartPart *mpart = part_iter->second.get();
      MotrMultipartPart *mmpart = static_cast<MotrMultipartPart *>(mpart);
      uint32_t part_num = mmpart->get_num();
      uint64_t part_size = mmpart->get_size();

      // Part objects are named "<bucket>.<oid>.part.<n>".
      string part_obj_name = this->get_bucket()->get_name() + "." +
                             this->get_key().get_oid() +
                             ".part." + std::to_string(part_num);
      std::unique_ptr<rgw::sal::Object> obj;
      obj = this->bucket->get_object(rgw_obj_key(part_obj_name));
      std::unique_ptr<rgw::sal::MotrObject> mobj(static_cast<rgw::sal::MotrObject *>(obj.release()));

      ldpp_dout(dpp, 20) << "get_part_objs: off = " << off << ", size = " << part_size << dendl;
      mobj->part_off = off;
      mobj->part_size = part_size;
      mobj->part_num = part_num;
      mobj->meta = mmpart->meta;

      part_objs.emplace(part_num, std::move(mobj));

      off += part_size;
    }
  } while (truncated);

  return 0;
}
+
+int MotrObject::open_part_objs(const DoutPrefixProvider* dpp,
+ std::map<int, std::unique_ptr<MotrObject>>& part_objs)
+{
+ //for (auto& iter: part_objs) {
+ for (auto iter = part_objs.begin(); iter != part_objs.end(); ++iter) {
+ MotrObject* obj = static_cast<MotrObject *>(iter->second.get());
+ ldpp_dout(dpp, 20) << "open_part_objs: name = " << obj->get_name() << dendl;
+ int rc = obj->open_mobj(dpp);
+ if (rc < 0)
+ return rc;
+ }
+
+ return 0;
+}
+
+int MotrObject::delete_part_objs(const DoutPrefixProvider* dpp)
+{
+ std::unique_ptr<rgw::sal::MultipartUpload> upload;
+ upload = this->get_bucket()->get_multipart_upload(this->get_name(), string());
+ std::unique_ptr<rgw::sal::MotrMultipartUpload> mupload(static_cast<rgw::sal::MotrMultipartUpload *>(upload.release()));
+ return mupload->delete_parts(dpp);
+}
+
// Read the byte range [off, end] (inclusive) of a multipart object by
// walking its parts in order and reading the overlapping slice of each.
// part_objs must already be populated (get_part_objs) and the parts
// opened (open_part_objs).
int MotrObject::read_multipart_obj(const DoutPrefixProvider* dpp,
                                   int64_t off, int64_t end, RGWGetDataCB* cb,
                                   std::map<int, std::unique_ptr<MotrObject>>& part_objs)
{
  int64_t cursor = off; // absolute offset of the next byte to deliver

  ldpp_dout(dpp, 20) << "read_multipart_obj: off=" << off << " end=" << end << dendl;

  // Find the parts which are in the (off, end) range and
  // read data from it. Note: `end` argument is inclusive.
  for (auto iter = part_objs.begin(); iter != part_objs.end(); ++iter) {
    MotrObject* obj = static_cast<MotrObject *>(iter->second.get());
    int64_t part_off = obj->part_off;
    int64_t part_size = obj->part_size;
    int64_t part_end = obj->part_off + obj->part_size - 1;
    ldpp_dout(dpp, 20) << "read_multipart_obj: part_off=" << part_off
                       << " part_end=" << part_end << dendl;
    // Part lies entirely before the requested range.
    if (part_end < off)
      continue;

    // Translate the absolute [cursor, min(part_end, end)] range into
    // part-local coordinates for read_mobj().
    int64_t local_off = cursor - obj->part_off;
    int64_t local_end = part_end < end? part_size - 1 : end - part_off;
    ldpp_dout(dpp, 20) << "real_multipart_obj: name=" << obj->get_name()
                       << " local_off=" << local_off
                       << " local_end=" << local_end << dendl;
    int rc = obj->read_mobj(dpp, local_off, local_end, cb);
    if (rc < 0)
      return rc;

    cursor = part_end + 1;
    if (cursor > end)
      break;
  }

  return 0;
}
+
// Round 'x' up to the nearest multiple of 'by' (assumes x > 0: callers
// pass positive transfer sizes).
static unsigned roundup(unsigned x, unsigned by)
{
  unsigned ngroups = (x - 1) / by + 1;
  return ngroups * by;
}
+
// Compute the optimal Motr io block size for a transfer of 'len' bytes:
// always a multiple of the parity-group size, capped by a pool-width-
// dependent maximum (see the depth calculation below).
unsigned MotrObject::get_optimal_bs(unsigned len)
{
  struct m0_pool_version *pver;

  // Resolve the pool version this object was created in to learn the
  // pdclust layout parameters (N data, K parity, S spare, P width).
  pver = m0_pool_version_find(&store->instance->m0c_pools_common,
                              &mobj->ob_attr.oa_pver);
  M0_ASSERT(pver != nullptr);
  struct m0_pdclust_attr *pa = &pver->pv_attr;
  uint64_t lid = M0_OBJ_LAYOUT_ID(meta.layout_id);
  unsigned unit_sz = m0_obj_layout_id_to_unit_size(lid);
  unsigned grp_sz = unit_sz * pa->pa_N;

  // bs should be max 4-times pool-width deep counting by 1MB units, or
  // 8-times deep counting by 512K units, 16-times deep by 256K units,
  // and so on. Several units to one target will be aggregated to make
  // fewer network RPCs, disk i/o operations and BE transactions.
  // For unit sizes of 32K or less, the depth is 128, which
  // makes it 32K * 128 == 4MB - the maximum amount per target when
  // the performance is still good on LNet (which has max 1MB frames).
  // TODO: it may be different on libfabric, should be re-measured.
  unsigned depth = 128 / ((unit_sz + 0x7fff) / 0x8000);
  if (depth == 0)
    depth = 1;
  // P * N / (N + K + S) - number of data units to span the pool-width
  unsigned max_bs = depth * unit_sz * pa->pa_P * pa->pa_N /
                    (pa->pa_N + pa->pa_K + pa->pa_S);
  max_bs = roundup(max_bs, grp_sz); // multiple of group size
  if (len >= max_bs)
    return max_bs;
  else if (len <= grp_sz)
    return grp_sz;
  else
    return roundup(len, grp_sz);
}
+
// Release the io vectors allocated in prepare(), drop any accumulated
// (unwritten) data and close both Motr object handles.
void MotrAtomicWriter::cleanup()
{
  m0_indexvec_free(&ext);
  m0_bufvec_free(&attr);
  m0_bufvec_free2(&buf);
  acc_data.clear();
  obj.close_mobj();
  old_obj.close_mobj();
}
+
// Fill the preallocated io vectors (buf/ext/attr) with up to
// MAX_BUFVEC_NR fragments taken (zero-copy) from the bufferlist
// iterator 'bi', advancing acc_off as extents are assigned.
// Returns the number of bytes actually placed into the vectors, which
// may be less than 'len' if the fragment limit is reached.
unsigned MotrAtomicWriter::populate_bvec(unsigned len, bufferlist::iterator &bi)
{
  unsigned i, l, done = 0;
  const char *data;

  for (i = 0; i < MAX_BUFVEC_NR && len > 0; ++i) {
    // Borrow the next contiguous fragment (up to 'len' bytes).
    l = bi.get_ptr_and_advance(len, &data);
    buf.ov_buf[i] = (char*)data;
    buf.ov_vec.v_count[i] = l;
    ext.iv_index[i] = acc_off;
    ext.iv_vec.v_count[i] = l;
    attr.ov_vec.v_count[i] = 0;
    acc_off += l;
    len -= l;
    done += l;
  }
  // Shrink the vectors to the number of fragments actually used.
  buf.ov_vec.v_nr = i;
  ext.iv_vec.v_nr = i;

  return done;
}
+
// Flush the accumulated data (acc_data) to the Motr object in
// optimal-block-size chunks. Creates (or opens, on -EEXIST) the object
// on first use, sizing its layout from the first flush. The tail chunk
// is zero-padded and compacted into a single buffer before writing.
// On any error the writer is torn down via cleanup().
int MotrAtomicWriter::write()
{
  int rc;
  unsigned bs, left;
  struct m0_op *op;
  bufferlist::iterator bi;

  left = acc_data.length();

  if (!obj.is_opened()) {
    rc = obj.create_mobj(dpp, left);
    if (rc == -EEXIST)
      rc = obj.open_mobj(dpp);
    if (rc != 0) {
      char fid_str[M0_FID_STR_LEN];
      snprintf(fid_str, ARRAY_SIZE(fid_str), U128X_F, U128_P(&obj.meta.oid));
      ldpp_dout(dpp, 0) << "ERROR: failed to create/open motr object "
                        << fid_str << " (" << obj.get_bucket()->get_name()
                        << "/" << obj.get_key().get_oid() << "): rc=" << rc
                        << dendl;
      goto err;
    }
  }

  total_data_size += left;

  bs = obj.get_optimal_bs(left);
  ldpp_dout(dpp, 20) <<__func__<< ": left=" << left << " bs=" << bs << dendl;

  bi = acc_data.begin();
  while (left > 0) {
    if (left < bs)
      bs = obj.get_optimal_bs(left);
    if (left < bs) {
      // Tail chunk: zero-pad to a full block and splice the remainder
      // into one contiguous buffer so it can be written in a single op.
      acc_data.append_zero(bs - left);
      auto off = bi.get_off();
      bufferlist tmp;
      acc_data.splice(off, bs, &tmp);
      acc_data.clear();
      acc_data.append(tmp.c_str(), bs); // make it a single buf
      bi = acc_data.begin();
      left = bs;
    }

    left -= this->populate_bvec(bs, bi);

    // Sync write: launch the op and wait until it fails or stabilizes.
    op = nullptr;
    rc = m0_obj_op(obj.mobj, M0_OC_WRITE, &ext, &buf, &attr, 0, 0, &op);
    if (rc != 0)
      goto err;
    m0_op_launch(&op, 1);
    rc = m0_op_wait(op, M0_BITS(M0_OS_FAILED, M0_OS_STABLE), M0_TIME_NEVER) ?:
         m0_rc(op);
    m0_op_fini(op);
    m0_op_free(op);
    if (rc != 0)
      goto err;
  }
  acc_data.clear();

  return 0;

err:
  this->cleanup();
  return rc;
}
+
+static const unsigned MAX_ACC_SIZE = 32 * 1024 * 1024;
+
+// Accumulate enough data first to make a reasonable decision about the
+// optimal unit size for a new object, or bs for existing object (32M seems
+// enough for 4M units in 8+2 parity groups, a common config on wide pools),
+// and then launch the write operations.
+int MotrAtomicWriter::process(bufferlist&& data, uint64_t offset)
+{
+ if (data.length() == 0) { // last call, flush data
+ int rc = 0;
+ if (acc_data.length() != 0)
+ rc = this->write();
+ this->cleanup();
+ return rc;
+ }
+
+ if (acc_data.length() == 0)
+ acc_off = offset;
+
+ acc_data.append(std::move(data));
+ if (acc_data.length() < MAX_ACC_SIZE)
+ return 0;
+
+ return this->write();
+}
+
// Finish the upload: flush any remaining data, build the bucket index
// entry (size, mtime, etag, owner, optional object-lock retention),
// demote the previous CURRENT version on versioned buckets, insert the
// new entry into the bucket index + metadata cache, and delete the old
// object's data on non-versioned buckets.
int MotrAtomicWriter::complete(size_t accounted_size, const std::string& etag,
                               ceph::real_time *mtime, ceph::real_time set_mtime,
                               std::map<std::string, bufferlist>& attrs,
                               ceph::real_time delete_at,
                               const char *if_match, const char *if_nomatch,
                               const std::string *user_data,
                               rgw_zone_set *zones_trace, bool *canceled,
                               optional_yield y)
{
  int rc = 0;

  if (acc_data.length() != 0) { // check again, just in case
    rc = this->write();
    this->cleanup();
    if (rc != 0)
      return rc;
  }

  bufferlist bl;
  rgw_bucket_dir_entry ent;

  // Set rgw_bucket_dir_entry. Some of the member of this structure may not
  // apply to motr. For example the storage_class.
  //
  // Checkout AtomicObjectProcessor::complete() in rgw_putobj_processor.cc
  // and RGWRados::Object::Write::write_meta() in rgw_rados.cc for what and
  // how to set the dir entry. Only set the basic ones for POC, no ACLs and
  // other attrs.
  obj.get_key().get_index_key(&ent.key);
  ent.meta.size = total_data_size;
  ent.meta.accounted_size = total_data_size;
  ent.meta.mtime = real_clock::is_zero(set_mtime)? ceph::real_clock::now() : set_mtime;
  ent.meta.etag = etag;
  ent.meta.owner = owner.to_str();
  ent.meta.owner_display_name = obj.get_bucket()->get_owner()->get_display_name();
  bool is_versioned = obj.get_key().have_instance();
  if (is_versioned)
    ent.flags = rgw_bucket_dir_entry::FLAG_VER | rgw_bucket_dir_entry::FLAG_CURRENT;
  ldpp_dout(dpp, 20) <<__func__<< ": key=" << obj.get_key().get_oid()
                     << " etag: " << etag << " user_data=" << user_data << dendl;
  if (user_data)
    ent.meta.user_data = *user_data;
  ent.encode(bl);

  // If the bucket has a default object-lock rule and the request did not
  // carry an explicit retention attr, synthesize one from the rule.
  RGWBucketInfo &info = obj.get_bucket()->get_info();
  if (info.obj_lock_enabled() && info.obj_lock.has_rule()) {
    auto iter = attrs.find(RGW_ATTR_OBJECT_RETENTION);
    if (iter == attrs.end()) {
      real_time lock_until_date = info.obj_lock.get_lock_until_date(ent.meta.mtime);
      string mode = info.obj_lock.get_mode();
      RGWObjectRetention obj_retention(mode, lock_until_date);
      bufferlist retention_bl;
      obj_retention.encode(retention_bl);
      attrs[RGW_ATTR_OBJECT_RETENTION] = retention_bl;
    }
  }
  // Index value layout: dir entry, then attrs, then Motr object meta
  // (decoded in the same order by get_bucket_dir_ent()).
  encode(attrs, bl);
  obj.meta.encode(bl);
  ldpp_dout(dpp, 20) <<__func__<< ": lid=0x" << std::hex << obj.meta.layout_id
                     << dendl;
  if (is_versioned) {
    // get the list of all versioned objects with the same key and
    // unset their FLAG_CURRENT later, if do_idx_op_by_name() is successful.
    // Note: without distributed lock on the index - it is possible that 2
    // CURRENT entries would appear in the bucket. For example, consider the
    // following scenario when two clients are trying to add the new object
    // version concurrently:
    //   client 1: reads all the CURRENT entries
    //   client 2: updates the index and sets the new CURRENT
    //   client 1: updates the index and sets the new CURRENT
    // At the step (1) client 1 would not see the new current record from step (2),
    // so it won't update it. As a result, two CURRENT version entries will appear
    // in the bucket.
    // TODO: update the current version (unset the flag) and insert the new current
    // version can be launched in one motr op. This requires change at do_idx_op()
    // and do_idx_op_by_name().
    rc = obj.update_version_entries(dpp);
    if (rc < 0)
      return rc;
  }
  // Insert an entry into bucket index.
  string bucket_index_iname = "motr.rgw.bucket.index." + obj.get_bucket()->get_name();
  rc = store->do_idx_op_by_name(bucket_index_iname,
                                M0_IC_PUT, obj.get_key().get_oid(), bl);
  if (rc == 0)
    store->get_obj_meta_cache()->put(dpp, obj.get_key().get_oid(), bl);

  if (old_obj.get_bucket()->get_info().versioning_status() != BUCKET_VERSIONED) {
    // Delete old object data if exists.
    old_obj.delete_mobj(dpp);
  }

  // TODO: We need to handle the object leak caused by parallel object upload by
  // making use of background gc, which is currently not enabled for motr.
  return rc;
}
+
// Delete every uploaded part's Motr object, then drop the per-upload
// part index. Used by abort() and by whole-object deletion.
int MotrMultipartUpload::delete_parts(const DoutPrefixProvider *dpp)
{
  int rc;
  int max_parts = 1000;
  int marker = 0;
  bool truncated = false;

  // Scan all parts and delete the corresponding motr objects.
  do {
    rc = this->list_parts(dpp, store->ctx(), max_parts, marker, &marker, &truncated);
    if (rc == -ENOENT) {
      // No parts were uploaded yet: nothing to delete.
      truncated = false;
      rc = 0;
    }
    if (rc < 0)
      return rc;

    std::map<uint32_t, std::unique_ptr<MultipartPart>>& parts = this->get_parts();
    for (auto part_iter = parts.begin(); part_iter != parts.end(); ++part_iter) {

      MultipartPart *mpart = part_iter->second.get();
      MotrMultipartPart *mmpart = static_cast<MotrMultipartPart *>(mpart);
      uint32_t part_num = mmpart->get_num();

      // Delete the part object. Note that the part object is not
      // inserted into bucket index, only the corresponding motr object
      // needs to be delete. That is why we don't call
      // MotrObject::delete_object().
      string part_obj_name = bucket->get_name() + "." +
                             mp_obj.get_key() +
                             ".part." + std::to_string(part_num);
      std::unique_ptr<rgw::sal::Object> obj;
      obj = this->bucket->get_object(rgw_obj_key(part_obj_name));
      std::unique_ptr<rgw::sal::MotrObject> mobj(static_cast<rgw::sal::MotrObject *>(obj.release()));
      mobj->meta = mmpart->meta;
      rc = mobj->delete_mobj(dpp);
      if (rc < 0) {
        ldpp_dout(dpp, 0) << __func__ << ": Failed to delete object from Motr. rc=" << rc << dendl;
        return rc;
      }
    }
  } while (truncated);

  // Delete object part index.
  std::string oid = mp_obj.get_key();
  string obj_part_iname = "motr.rgw.object." + bucket->get_name() + "." + oid + ".parts";
  return store->delete_motr_idx_by_name(obj_part_iname);
}
+
+// Abort this multipart upload: verify the upload exists in the bucket's
+// multipart index, delete every uploaded part object, then remove the
+// upload's entry itself.  Returns 0 on success, -ERR_NO_SUCH_UPLOAD if
+// the upload is unknown, or a negative error code.
+int MotrMultipartUpload::abort(const DoutPrefixProvider *dpp, CephContext *cct)
+{
+  int rc;
+  // Check if multipart upload exists
+  bufferlist bl;
+  std::unique_ptr<rgw::sal::Object> meta_obj;
+  meta_obj = get_meta_obj();
+  string bucket_multipart_iname =
+      "motr.rgw.bucket." + meta_obj->get_bucket()->get_name() + ".multiparts";
+  // NOTE(review): this GET keys on get_key().to_str() while the DEL at the
+  // bottom keys on get_key().get_oid() -- presumably both yield the same
+  // string for the meta object; confirm, otherwise the DEL can miss.
+  rc = store->do_idx_op_by_name(bucket_multipart_iname,
+                                M0_IC_GET, meta_obj->get_key().to_str(), bl);
+  if (rc < 0) {
+    ldpp_dout(dpp, 0) << __func__ << ": Failed to get multipart upload. rc=" << rc << dendl;
+    return rc == -ENOENT ? -ERR_NO_SUCH_UPLOAD : rc;
+  }
+
+  // Scan all parts and delete the corresponding motr objects.
+  rc = this->delete_parts(dpp);
+  if (rc < 0)
+    return rc;
+
+  bl.clear();
+  // Remove the upload from bucket multipart index.
+  rc = store->do_idx_op_by_name(bucket_multipart_iname,
+                                M0_IC_DEL, meta_obj->get_key().get_oid(), bl);
+  return rc;
+}
+
+// Build the SAL object representing this upload's meta object (the
+// placeholder keyed in the multipart namespace) and tag it MultiMeta.
+std::unique_ptr<rgw::sal::Object> MotrMultipartUpload::get_meta_obj()
+{
+  auto raw = bucket->get_object(rgw_obj_key(get_meta(), string(), mp_ns));
+  std::unique_ptr<rgw::sal::MotrObject> meta(
+      static_cast<rgw::sal::MotrObject *>(raw.release()));
+  meta->set_category(RGWObjCategory::MultiMeta);
+  return meta;
+}
+
+// Per-upload metadata stored inside the bucket multipart index entry
+// (carried in rgw_bucket_dir_entry::meta.user_data).  Currently only the
+// destination placement rule is recorded; versioned via the standard
+// Ceph encode/decode framework so more fields can be added later.
+struct motr_multipart_upload_info
+{
+  rgw_placement_rule dest_placement;
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(dest_placement, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(dest_placement, bl);
+    DECODE_FINISH(bl);
+  }
+};
+WRITE_CLASS_ENCODER(motr_multipart_upload_info)
+
+// Start a multipart upload: generate a random upload id, record an
+// initial (zero-size) meta-object entry in the bucket's multipart index,
+// and create the per-object ".parts" index the part writers populate.
+// Retries with a fresh id on -EEXIST (id collision).
+int MotrMultipartUpload::init(const DoutPrefixProvider *dpp, optional_yield y,
+                              ACLOwner& _owner,
+                              rgw_placement_rule& dest_placement, rgw::sal::Attrs& attrs)
+{
+  int rc;
+  std::string oid = mp_obj.get_key();
+
+  owner = _owner;
+
+  do {
+    // 32 random alphanumeric characters plus NUL terminator.
+    char buf[33];
+    string tmp_obj_name;
+    gen_rand_alphanumeric(store->ctx(), buf, sizeof(buf) - 1);
+    std::string upload_id = MULTIPART_UPLOAD_ID_PREFIX; /* v2 upload id */
+    upload_id.append(buf);
+
+    mp_obj.init(oid, upload_id);
+    tmp_obj_name = mp_obj.get_meta();
+
+    std::unique_ptr<rgw::sal::Object> obj;
+    obj = bucket->get_object(rgw_obj_key(tmp_obj_name, string(), mp_ns));
+    // the meta object will be indexed with 0 size, we c
+    obj->set_in_extra_data(true);
+    obj->set_hash_source(oid);
+
+    // Stash the destination placement in the entry's user_data so
+    // get_info() can recover it later.
+    motr_multipart_upload_info upload_info;
+    upload_info.dest_placement = dest_placement;
+    bufferlist mpbl;
+    encode(upload_info, mpbl);
+
+    // Create an initial entry in the bucket. The entry will be
+    // updated when multipart upload is completed, for example,
+    // size, etag etc.
+    bufferlist bl;
+    rgw_bucket_dir_entry ent;
+    obj->get_key().get_index_key(&ent.key);
+    ent.meta.owner = owner.get_id().to_str();
+    ent.meta.category = RGWObjCategory::MultiMeta;
+    ent.meta.mtime = ceph::real_clock::now();
+    ent.meta.user_data.assign(mpbl.c_str(), mpbl.c_str() + mpbl.length());
+    ent.encode(bl);
+
+    // Insert an entry into bucket multipart index so it is not shown
+    // when listing a bucket.
+    string bucket_multipart_iname =
+        "motr.rgw.bucket." + obj->get_bucket()->get_name() + ".multiparts";
+    rc = store->do_idx_op_by_name(bucket_multipart_iname,
+                                  M0_IC_PUT, obj->get_key().get_oid(), bl);
+
+    // -EEXIST: the random upload id collided with an in-flight upload.
+  } while (rc == -EEXIST);
+
+  if (rc < 0)
+    return rc;
+
+  // Create object part index.
+  // TODO: add bucket as part of the name.
+  string obj_part_iname = "motr.rgw.object." + bucket->get_name() + "." + oid + ".parts";
+  ldpp_dout(dpp, 20) << "MotrMultipartUpload::init(): object part index=" << obj_part_iname << dendl;
+  rc = store->create_motr_idx_by_name(obj_part_iname);
+  if (rc == -EEXIST)
+    rc = 0;
+  if (rc < 0)
+    // TODO: clean the bucket index entry
+    ldpp_dout(dpp, 0) << "Failed to create object multipart index  " << obj_part_iname << dendl;
+
+  return rc;
+}
+
+// List up to num_parts uploaded parts of this upload starting after
+// `marker` (a part number), reading from the per-object ".parts" Motr
+// index and caching the results in this->parts.  On return,
+// *next_marker (if non-null) holds the last part number seen and
+// *truncated (if non-null) says whether more parts may remain.
+// Returns 0 on success, -EINVAL on non-consecutive part numbers, or a
+// negative error code.
+int MotrMultipartUpload::list_parts(const DoutPrefixProvider *dpp, CephContext *cct,
+                                    int num_parts, int marker,
+                                    int *next_marker, bool *truncated,
+                                    bool assume_unsorted)
+{
+  int rc;
+  vector<string> key_vec(num_parts);
+  vector<bufferlist> val_vec(num_parts);
+
+  std::string oid = mp_obj.get_key();
+  string obj_part_iname = "motr.rgw.object." + bucket->get_name() + "." + oid + ".parts";
+  ldpp_dout(dpp, 20) << __func__ << ": object part index = " << obj_part_iname << dendl;
+  // Seed the NEXT query with the key just past the marker; part keys are
+  // "part.NNNNNNNN" so zero-padding keeps them lexicographically ordered.
+  key_vec[0] = "part.";
+  char buf[32];
+  snprintf(buf, sizeof(buf), "%08d", marker + 1);
+  key_vec[0].append(buf);
+  rc = store->next_query_by_name(obj_part_iname, key_vec, val_vec);
+  if (rc < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: NEXT query failed. " << rc << dendl;
+    return rc;
+  }
+
+  int last_num = 0;
+  int part_cnt = 0;
+  uint32_t expected_next = 0;
+  ldpp_dout(dpp, 20) << __func__ << ": marker = " << marker << dendl;
+  for (const auto& bl: val_vec) {
+    if (bl.length() == 0)
+      break;
+
+    // Each value holds RGWUploadPartInfo + attrs + Motr object metadata,
+    // in the order written by MotrMultipartWriter::complete().
+    RGWUploadPartInfo info;
+    auto iter = bl.cbegin();
+    info.decode(iter);
+    rgw::sal::Attrs attrs_dummy;
+    decode(attrs_dummy, iter);
+    MotrObject::Meta meta;
+    meta.decode(iter);
+
+    ldpp_dout(dpp, 20) << __func__ << ": part_num=" << info.num
+                       << " part_size=" << info.size << dendl;
+    ldpp_dout(dpp, 20) << __func__ << ": meta:oid=[" << meta.oid.u_hi << "," << meta.oid.u_lo
+                       << "], meta:pvid=[" << meta.pver.f_container << "," << meta.pver.f_key
+                       << "], meta:layout id=" << meta.layout_id << dendl;
+
+    // Part numbers must be consecutive.  (Collapses the original's
+    // redundant if/else-if/else chain into one check.)
+    if (expected_next && info.num != expected_next)
+      return -EINVAL;
+    expected_next = info.num + 1;
+
+    if ((int)info.num > marker) {
+      last_num = info.num;
+      parts.emplace(info.num, std::make_unique<MotrMultipartPart>(info, meta));
+    }
+
+    part_cnt++;
+  }
+
+  // Does it have more parts?  Bug fix: only touch (and log) *truncated
+  // when the caller passed a pointer -- the original dereferenced a
+  // possibly-null `truncated` in the log statement unconditionally.
+  if (truncated) {
+    *truncated = part_cnt >= num_parts;
+    ldpp_dout(dpp, 20) << __func__ << ": truncated=" << *truncated << dendl;
+  }
+
+  if (next_marker)
+    *next_marker = last_num;
+
+  return 0;
+}
+
+// Heavily copy from rgw_sal_rados.cc
+// Complete the multipart upload: validate the client-supplied etag list
+// against the uploaded parts (count, numbering, etags, minimum part
+// size, compression consistency), compute the composite multipart etag
+// (md5 of the concatenated part md5s, suffixed "-<nparts>"), publish the
+// assembled object's dir entry into the bucket index, and finally drop
+// the upload from the in-progress multipart index.
+int MotrMultipartUpload::complete(const DoutPrefixProvider *dpp,
+                                  optional_yield y, CephContext* cct,
+                                  map<int, string>& part_etags,
+                                  list<rgw_obj_index_key>& remove_objs,
+                                  uint64_t& accounted_size, bool& compressed,
+                                  RGWCompressionInfo& cs_info, off_t& off,
+                                  std::string& tag, ACLOwner& owner,
+                                  uint64_t olh_epoch,
+                                  rgw::sal::Object* target_obj)
+{
+  char final_etag[CEPH_CRYPTO_MD5_DIGESTSIZE];
+  char final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 16];
+  std::string etag;
+  bufferlist etag_bl;
+  MD5 hash;
+  // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
+  hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+  bool truncated;
+  int rc;
+
+  ldpp_dout(dpp, 20) << "MotrMultipartUpload::complete(): enter" << dendl;
+  int total_parts = 0;
+  int handled_parts = 0;
+  int max_parts = 1000;
+  int marker = 0;
+  uint64_t min_part_size = cct->_conf->rgw_multipart_min_part_size;
+  auto etags_iter = part_etags.begin();
+  rgw::sal::Attrs attrs = target_obj->get_attrs();
+
+  // Walk the uploaded parts in pages of max_parts, validating each
+  // against the client's etag list as we go.
+  do {
+    ldpp_dout(dpp, 20) << "MotrMultipartUpload::complete(): list_parts()" << dendl;
+    rc = list_parts(dpp, cct, max_parts, marker, &marker, &truncated);
+    if (rc == -ENOENT) {
+      rc = -ERR_NO_SUCH_UPLOAD;
+    }
+    if (rc < 0)
+      return rc;
+
+    total_parts += parts.size();
+    // On the last page the total number of uploaded parts must match the
+    // number of etags the client sent.
+    if (!truncated && total_parts != (int)part_etags.size()) {
+      ldpp_dout(dpp, 0) << "NOTICE: total parts mismatch: have: " << total_parts
+                        << " expected: " << part_etags.size() << dendl;
+      rc = -ERR_INVALID_PART;
+      return rc;
+    }
+    ldpp_dout(dpp, 20) << "MotrMultipartUpload::complete(): parts.size()=" << parts.size() << dendl;
+
+    for (auto obj_iter = parts.begin();
+         etags_iter != part_etags.end() && obj_iter != parts.end();
+         ++etags_iter, ++obj_iter, ++handled_parts) {
+      MultipartPart *mpart = obj_iter->second.get();
+      MotrMultipartPart *mmpart = static_cast<MotrMultipartPart *>(mpart);
+      RGWUploadPartInfo *part = &mmpart->info;
+
+      // Every part except the last must meet the configured minimum size.
+      uint64_t part_size = part->accounted_size;
+      ldpp_dout(dpp, 20) << "MotrMultipartUpload::complete(): part_size=" << part_size << dendl;
+      if (handled_parts < (int)part_etags.size() - 1 &&
+          part_size < min_part_size) {
+        rc = -ERR_TOO_SMALL;
+        return rc;
+      }
+
+      char petag[CEPH_CRYPTO_MD5_DIGESTSIZE];
+      if (etags_iter->first != (int)obj_iter->first) {
+        ldpp_dout(dpp, 0) << "NOTICE: parts num mismatch: next requested: "
+                          << etags_iter->first << " next uploaded: "
+                          << obj_iter->first << dendl;
+        rc = -ERR_INVALID_PART;
+        return rc;
+      }
+      string part_etag = rgw_string_unquote(etags_iter->second);
+      if (part_etag.compare(part->etag) != 0) {
+        ldpp_dout(dpp, 0) << "NOTICE: etag mismatch: part: " << etags_iter->first
+                          << " etag: " << etags_iter->second << dendl;
+        rc = -ERR_INVALID_PART;
+        return rc;
+      }
+
+      // Fold this part's binary md5 into the composite etag hash.
+      hex_to_buf(part->etag.c_str(), petag, CEPH_CRYPTO_MD5_DIGESTSIZE);
+      hash.Update((const unsigned char *)petag, sizeof(petag));
+      ldpp_dout(dpp, 20) << "MotrMultipartUpload::complete(): calc etag " << dendl;
+
+      string oid = mp_obj.get_part(part->num);
+      rgw_obj src_obj;
+      src_obj.init_ns(bucket->get_key(), oid, mp_ns);
+
+#if 0 // does Motr backend need it?
+      /* update manifest for part */
+      if (part->manifest.empty()) {
+        ldpp_dout(dpp, 0) << "ERROR: empty manifest for object part: obj="
+                          << src_obj << dendl;
+        rc = -ERR_INVALID_PART;
+        return rc;
+      } else {
+        manifest.append(dpp, part->manifest, store->get_zone());
+      }
+      ldpp_dout(dpp, 0) << "MotrMultipartUpload::complete(): manifest " << dendl;
+#endif
+
+      // All parts must agree on compression type.
+      bool part_compressed = (part->cs_info.compression_type != "none");
+      if ((handled_parts > 0) &&
+          ((part_compressed != compressed) ||
+           (cs_info.compression_type != part->cs_info.compression_type))) {
+        ldpp_dout(dpp, 0) << "ERROR: compression type was changed during multipart upload ("
+                          << cs_info.compression_type << ">>" << part->cs_info.compression_type << ")" << dendl;
+        rc = -ERR_INVALID_PART;
+        return rc;
+      }
+
+      // Stitch this part's compression block map onto the end of the
+      // object-wide map, rebasing old/new offsets.
+      ldpp_dout(dpp, 20) << "MotrMultipartUpload::complete(): part compression" << dendl;
+      if (part_compressed) {
+        int64_t new_ofs; // offset in compression data for new part
+        if (cs_info.blocks.size() > 0)
+          new_ofs = cs_info.blocks.back().new_ofs + cs_info.blocks.back().len;
+        else
+          new_ofs = 0;
+        for (const auto& block : part->cs_info.blocks) {
+          compression_block cb;
+          cb.old_ofs = block.old_ofs + cs_info.orig_size;
+          cb.new_ofs = new_ofs;
+          cb.len = block.len;
+          cs_info.blocks.push_back(cb);
+          new_ofs = cb.new_ofs + cb.len;
+        }
+        if (!compressed)
+          cs_info.compression_type = part->cs_info.compression_type;
+        cs_info.orig_size += part->cs_info.orig_size;
+        compressed = true;
+      }
+
+      // We may not need to do the following as remove_objs are those
+      // don't show when listing a bucket. As we store in-progress uploaded
+      // object's metadata in a separate index, they are not shown when
+      // listing a bucket.
+      rgw_obj_index_key remove_key;
+      src_obj.key.get_index_key(&remove_key);
+      remove_objs.push_back(remove_key);
+
+      off += part_size;
+      accounted_size += part->accounted_size;
+      ldpp_dout(dpp, 20) << "MotrMultipartUpload::complete(): off=" << off << ", accounted_size = " << accounted_size << dendl;
+    }
+  } while (truncated);
+  hash.Final((unsigned char *)final_etag);
+
+  // Composite etag: hex(md5-of-md5s) + "-" + number of parts.
+  buf_to_hex((unsigned char *)final_etag, sizeof(final_etag), final_etag_str);
+  snprintf(&final_etag_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2],
+           sizeof(final_etag_str) - CEPH_CRYPTO_MD5_DIGESTSIZE * 2,
+           "-%lld", (long long)part_etags.size());
+  etag = final_etag_str;
+  ldpp_dout(dpp, 20) << "calculated etag: " << etag << dendl;
+  etag_bl.append(etag);
+  attrs[RGW_ATTR_ETAG] = etag_bl;
+
+  if (compressed) {
+    // write compression attribute to full object
+    bufferlist tmp;
+    encode(cs_info, tmp);
+    attrs[RGW_ATTR_COMPRESSION] = tmp;
+  }
+
+  // Read the object's the multipart_upload_info.
+  // TODO: all those index name and key constructions should be implemented as
+  // member functions.
+  bufferlist bl;
+  std::unique_ptr<rgw::sal::Object> meta_obj;
+  meta_obj = get_meta_obj();
+  string bucket_multipart_iname =
+      "motr.rgw.bucket." + meta_obj->get_bucket()->get_name() + ".multiparts";
+  rc = this->store->do_idx_op_by_name(bucket_multipart_iname,
+                                      M0_IC_GET, meta_obj->get_key().get_oid(), bl);
+  ldpp_dout(dpp, 20) << "MotrMultipartUpload::complete(): read entry from bucket multipart index rc=" << rc << dendl;
+  if (rc < 0)
+    return rc;
+  rgw_bucket_dir_entry ent;
+  bufferlist& blr = bl;
+  auto ent_iter = blr.cbegin();
+  ent.decode(ent_iter);
+
+  // Update the dir entry and insert it to the bucket index so
+  // the object will be seen when listing the bucket.
+  bufferlist update_bl;
+  target_obj->get_key().get_index_key(&ent.key); // Change to offical name :)
+  ent.meta.size = off;
+  ent.meta.accounted_size = accounted_size;
+  ldpp_dout(dpp, 20) << "MotrMultipartUpload::complete(): obj size=" << ent.meta.size
+                     << " obj accounted size=" << ent.meta.accounted_size << dendl;
+  ent.meta.mtime = ceph::real_clock::now();
+  ent.meta.etag = etag;
+  // Encoding order (entry, attrs, meta) matches what readers expect.
+  ent.encode(update_bl);
+  encode(attrs, update_bl);
+  MotrObject::Meta meta_dummy;
+  meta_dummy.encode(update_bl);
+
+  string bucket_index_iname = "motr.rgw.bucket.index." + meta_obj->get_bucket()->get_name();
+  ldpp_dout(dpp, 20) << "MotrMultipartUpload::complete(): target_obj name=" << target_obj->get_name()
+                     << " target_obj oid=" << target_obj->get_oid() << dendl;
+  rc = store->do_idx_op_by_name(bucket_index_iname, M0_IC_PUT,
+                                target_obj->get_name(), update_bl);
+  if (rc < 0)
+    return rc;
+
+  // Put into metadata cache.
+  store->get_obj_meta_cache()->put(dpp, target_obj->get_name(), update_bl);
+
+  // Now we can remove it from bucket multipart index.
+  ldpp_dout(dpp, 20) << "MotrMultipartUpload::complete(): remove from bucket multipartindex " << dendl;
+  return store->do_idx_op_by_name(bucket_multipart_iname,
+                                  M0_IC_DEL, meta_obj->get_key().get_oid(), bl);
+}
+
+// Return the upload's placement rule and/or stored attributes.
+// *rule is served from the cached `placement` when available; otherwise
+// it is decoded from the upload's bucket-multipart-index entry.  When
+// attrs is non-null it receives the stored etag.  Returns 0 on success,
+// -ERR_NO_SUCH_UPLOAD if the upload entry is missing.
+int MotrMultipartUpload::get_info(const DoutPrefixProvider *dpp, optional_yield y, rgw_placement_rule** rule, rgw::sal::Attrs* attrs)
+{
+  if (!rule && !attrs) {
+    return 0;
+  }
+
+  if (rule) {
+    if (!placement.empty()) {
+      *rule = &placement;
+      if (!attrs) {
+        /* Don't need attrs, done */
+        return 0;
+      }
+    } else {
+      *rule = nullptr;
+    }
+  }
+
+  std::unique_ptr<rgw::sal::Object> meta_obj;
+  meta_obj = get_meta_obj();
+  meta_obj->set_in_extra_data(true);
+
+  // Read the upload's entry from the bucket multipart index.
+  bufferlist bl;
+  string bucket_multipart_iname =
+      "motr.rgw.bucket." + meta_obj->get_bucket()->get_name() + ".multiparts";
+  int rc = this->store->do_idx_op_by_name(bucket_multipart_iname,
+                                          M0_IC_GET, meta_obj->get_key().get_oid(), bl);
+  if (rc < 0) {
+    ldpp_dout(dpp, 0) << __func__ << ": Failed to get multipart info. rc=" << rc << dendl;
+    return rc == -ENOENT ? -ERR_NO_SUCH_UPLOAD : rc;
+  }
+
+  rgw_bucket_dir_entry ent;
+  auto ent_iter = bl.cbegin();
+  ent.decode(ent_iter);
+
+  if (attrs) {
+    bufferlist etag_bl;
+    string& etag = ent.meta.etag;
+    ldpp_dout(dpp, 20) << "object's etag: " << ent.meta.etag << dendl;
+    etag_bl.append(etag.c_str(), etag.size());
+    // Fix: RGW_ATTR_ETAG is a string-literal macro, so the original
+    // std::move() on it was a no-op cast; pass it directly.
+    attrs->emplace(RGW_ATTR_ETAG, std::move(etag_bl));
+    if (!rule || *rule != nullptr) {
+      /* placement was cached; don't actually read */
+      return 0;
+    }
+  }
+
+  /* Decode multipart_upload_info from the entry's user_data and cache
+   * the placement rule for subsequent calls. */
+  motr_multipart_upload_info upload_info;
+  bufferlist mpbl;
+  mpbl.append(ent.meta.user_data.c_str(), ent.meta.user_data.size());
+  auto mpbl_iter = mpbl.cbegin();
+  upload_info.decode(mpbl_iter);
+  placement = upload_info.dest_placement;
+  *rule = &placement;
+
+  return 0;
+}
+
+// Create the writer used to stream one part of this upload; each part
+// goes into its own dedicated Motr object.
+std::unique_ptr<Writer> MotrMultipartUpload::get_writer(
+    const DoutPrefixProvider *dpp,
+    optional_yield y,
+    rgw::sal::Object* obj,
+    const rgw_user& owner,
+    const rgw_placement_rule *ptail_placement_rule,
+    uint64_t part_num,
+    const std::string& part_num_str)
+{
+  auto writer = std::make_unique<MotrMultipartWriter>(
+      dpp, y, this, obj, store, owner,
+      ptail_placement_rule, part_num, part_num_str);
+  return writer;
+}
+
+// Prepare for uploading one part: create the Motr object that will hold
+// this part's data, or reopen it when the S3 client is retrying the
+// part upload.
+int MotrMultipartWriter::prepare(optional_yield y)
+{
+  string part_obj_name = head_obj->get_bucket()->get_name() + "." +
+                         head_obj->get_key().get_oid() +
+                         ".part." + std::to_string(part_num);
+  ldpp_dout(dpp, 20) << "bucket=" << head_obj->get_bucket()->get_name() << "part_obj_name=" << part_obj_name << dendl;
+  part_obj = std::make_unique<MotrObject>(this->store, rgw_obj_key(part_obj_name), head_obj->get_bucket());
+  // Fix: dropped the old `part_obj == nullptr` -ENOMEM branch --
+  // std::make_unique throws std::bad_alloc on failure and never returns
+  // nullptr, so that check was dead code.
+
+  // s3 client may retry uploading part, so the part may have already
+  // been created.
+  int rc = part_obj->create_mobj(dpp, store->cctx->_conf->rgw_max_chunk_size);
+  if (rc == -EEXIST) {
+    rc = part_obj->open_mobj(dpp);
+    if (rc < 0)
+      return rc;
+  }
+  return rc;
+}
+
+// Append one chunk of part data at the given offset and grow the
+// running part-size accumulator on success.
+int MotrMultipartWriter::process(bufferlist&& data, uint64_t offset)
+{
+  // Fix (use-after-move): capture the length BEFORE handing the buffer
+  // off -- the original read data.length() after std::move(data), by
+  // which point write_mobj may have drained the bufferlist, corrupting
+  // actual_part_size.
+  const uint64_t len = data.length();
+  int rc = part_obj->write_mobj(dpp, std::move(data), offset);
+  if (rc == 0) {
+    actual_part_size += len;
+    ldpp_dout(dpp, 20) << " write_mobj(): actual_part_size=" << actual_part_size << dendl;
+  }
+  return rc;
+}
+
+// Finish one part upload: persist the part's RGWUploadPartInfo, its
+// attributes and its Motr object metadata under key "part.NNNNNNNN" in
+// the object's ".parts" index.  Returns 0 on success or a negative
+// error code (-ERR_NO_SUCH_UPLOAD if the parts index is gone).
+int MotrMultipartWriter::complete(size_t accounted_size, const std::string& etag,
+                       ceph::real_time *mtime, ceph::real_time set_mtime,
+                       std::map<std::string, bufferlist>& attrs,
+                       ceph::real_time delete_at,
+                       const char *if_match, const char *if_nomatch,
+                       const std::string *user_data,
+                       rgw_zone_set *zones_trace, bool *canceled,
+                       optional_yield y)
+{
+  // Should the dir entry(object metadata) be updated? For example
+  // mtime.
+
+  ldpp_dout(dpp, 20) << "MotrMultipartWriter::complete(): enter" << dendl;
+  // Add an entry into object_nnn_part_index.
+  bufferlist bl;
+  RGWUploadPartInfo info;
+  info.num = part_num;
+  info.etag = etag;
+  // size = bytes actually written by process(); accounted_size is the
+  // client-visible size supplied by the caller.
+  info.size = actual_part_size;
+  info.accounted_size = accounted_size;
+  info.modified = real_clock::now();
+
+  bool compressed;
+  int rc = rgw_compression_info_from_attrset(attrs, compressed, info.cs_info);
+  ldpp_dout(dpp, 20) << "MotrMultipartWriter::complete(): compression rc=" << rc << dendl;
+  if (rc < 0) {
+    ldpp_dout(dpp, 1) << "cannot get compression info" << dendl;
+    return rc;
+  }
+  // Encoding order (info, attrs, meta) must match the decode order in
+  // MotrMultipartUpload::list_parts().
+  encode(info, bl);
+  encode(attrs, bl);
+  part_obj->meta.encode(bl);
+
+  // Zero-padded part number keeps keys lexicographically sorted for
+  // Motr NEXT queries.
+  string p = "part.";
+  char buf[32];
+  snprintf(buf, sizeof(buf), "%08d", (int)part_num);
+  p.append(buf);
+  string obj_part_iname = "motr.rgw.object." + head_obj->get_bucket()->get_name() + "." +
+                          head_obj->get_key().get_oid() + ".parts";
+  ldpp_dout(dpp, 20) << "MotrMultipartWriter::complete(): object part index = " << obj_part_iname << dendl;
+  rc = store->do_idx_op_by_name(obj_part_iname, M0_IC_PUT, p, bl);
+  if (rc < 0) {
+    return rc == -ENOENT ? -ERR_NO_SUCH_UPLOAD : rc;
+  }
+
+  return 0;
+}
+
+// IAM roles are not supported by the Motr backend; hand back an empty
+// handle (identical to the original's null-initialized pointer).
+std::unique_ptr<RGWRole> MotrStore::get_role(std::string name,
+    std::string tenant,
+    std::string path,
+    std::string trust_policy,
+    std::string max_session_duration_str,
+    std::multimap<std::string,std::string> tags)
+{
+  return nullptr;
+}
+
+// IAM roles are not supported by the Motr backend; return an empty handle.
+std::unique_ptr<RGWRole> MotrStore::get_role(const RGWRoleInfo& info)
+{
+  return nullptr;
+}
+
+// IAM roles are not supported by the Motr backend; return an empty handle.
+std::unique_ptr<RGWRole> MotrStore::get_role(std::string id)
+{
+  return nullptr;
+}
+
+// IAM roles are not supported by the Motr backend: report success with
+// no roles appended.
+int MotrStore::get_roles(const DoutPrefixProvider *dpp,
+    optional_yield y,
+    const std::string& path_prefix,
+    const std::string& tenant,
+    vector<std::unique_ptr<RGWRole>>& roles)
+{
+  return 0;
+}
+
+// OIDC providers are not supported by the Motr backend.
+std::unique_ptr<RGWOIDCProvider> MotrStore::get_oidc_provider()
+{
+  return nullptr;
+}
+
+// OIDC providers are not supported by the Motr backend: report success
+// with no providers appended.
+int MotrStore::get_oidc_providers(const DoutPrefixProvider *dpp,
+    const std::string& tenant,
+    vector<std::unique_ptr<RGWOIDCProvider>>& providers)
+{
+  return 0;
+}
+
+// Hand back the Motr multipart-upload state object for `oid` in this
+// bucket (optionally bound to an existing upload id).
+std::unique_ptr<MultipartUpload> MotrBucket::get_multipart_upload(const std::string& oid,
+                                std::optional<std::string> upload_id,
+                                ACLOwner owner, ceph::real_time mtime)
+{
+  auto upload = std::make_unique<MotrMultipartUpload>(store, this, oid,
+                                                      upload_id, owner, mtime);
+  return upload;
+}
+
+// Append writes are not supported by the Motr backend; callers receive
+// a null writer.
+std::unique_ptr<Writer> MotrStore::get_append_writer(const DoutPrefixProvider *dpp,
+                  optional_yield y,
+                  rgw::sal::Object* obj,
+                  const rgw_user& owner,
+                  const rgw_placement_rule *ptail_placement_rule,
+                  const std::string& unique_tag,
+                  uint64_t position,
+                  uint64_t *cur_accounted_size) {
+  return nullptr;
+}
+
+// Atomic (non-multipart) PUTs are serviced by MotrAtomicWriter.
+std::unique_ptr<Writer> MotrStore::get_atomic_writer(const DoutPrefixProvider *dpp,
+                  optional_yield y,
+                  rgw::sal::Object* obj,
+                  const rgw_user& owner,
+                  const rgw_placement_rule *ptail_placement_rule,
+                  uint64_t olh_epoch,
+                  const std::string& unique_tag) {
+  auto writer = std::make_unique<MotrAtomicWriter>(
+      dpp, y, obj, this, owner,
+      ptail_placement_rule, olh_epoch, unique_tag);
+  return writer;
+}
+
+// Compression settings live in the per-zone placement parameters.
+const std::string& MotrStore::get_compression_type(const rgw_placement_rule& rule)
+{
+  return zone.zone_params->get_compression_type(rule);
+}
+
+// Placement validity is delegated to the per-zone placement parameters.
+bool MotrStore::valid_placement(const rgw_placement_rule& rule)
+{
+  return zone.zone_params->valid_placement(rule);
+}
+
+// Wrap the given user id in a Motr-backed SAL user handle (no lookup
+// happens here; the caller loads attributes on demand).
+std::unique_ptr<User> MotrStore::get_user(const rgw_user &u)
+{
+  ldout(cctx, 20) << "bucket's user: " << u.to_str() << dendl;
+  auto user = std::make_unique<MotrUser>(this, u);
+  return user;
+}
+
+// Resolve an S3 access key to a fully loaded user: look the key up in
+// the global IAM access-key index, decode the owning user id, then load
+// that user's record.  Returns 0 on success or a negative error code.
+int MotrStore::get_user_by_access_key(const DoutPrefixProvider *dpp, const std::string &key, optional_yield y, std::unique_ptr<User> *user)
+{
+  int rc;
+  bufferlist bl;
+  RGWUserInfo uinfo;
+  MotrAccessKey access_key;
+
+  rc = do_idx_op_by_name(RGW_IAM_MOTR_ACCESS_KEY,
+                         M0_IC_GET, key, bl);
+  if (rc < 0){
+    ldout(cctx, 0) << "Access key not found: rc = " << rc << dendl;
+    return rc;
+  }
+
+  auto iter = bl.cbegin();
+  access_key.decode(iter);
+
+  uinfo.user_id.from_str(access_key.user_id);
+  ldout(cctx, 0) << "Loading user: " << uinfo.user_id.id << dendl;
+  rc = MotrUser().load_user_from_idx(dpp, this, uinfo, nullptr, nullptr);
+  if (rc < 0){
+    ldout(cctx, 0) << "Failed to load user: rc = " << rc << dendl;
+    return rc;
+  }
+  // Fix: make_unique replaces the raw `new` + null-check -- operator new
+  // throws on failure and never returns nullptr, so the old -ENOMEM
+  // branch was dead code.
+  *user = std::make_unique<MotrUser>(this, uinfo);
+  return 0;
+}
+
+// Resolve an e-mail address to a fully loaded user via the global IAM
+// e-mail index.  Returns 0 on success or a negative error code.
+int MotrStore::get_user_by_email(const DoutPrefixProvider *dpp, const std::string& email, optional_yield y, std::unique_ptr<User>* user)
+{
+  int rc;
+  bufferlist bl;
+  RGWUserInfo uinfo;
+  MotrEmailInfo email_info;
+  rc = do_idx_op_by_name(RGW_IAM_MOTR_EMAIL_KEY,
+                         M0_IC_GET, email, bl);
+  if (rc < 0){
+    ldout(cctx, 0) << "Email Id not found: rc = " << rc << dendl;
+    return rc;
+  }
+  auto iter = bl.cbegin();
+  email_info.decode(iter);
+  ldout(cctx, 0) << "Loading user: " << email_info.user_id << dendl;
+  uinfo.user_id.from_str(email_info.user_id);
+  rc = MotrUser().load_user_from_idx(dpp, this, uinfo, nullptr, nullptr);
+  if (rc < 0){
+    ldout(cctx, 0) << "Failed to load user: rc = " << rc << dendl;
+    return rc;
+  }
+  // Fix: make_unique replaces the raw `new` + null-check -- operator new
+  // throws on failure and never returns nullptr, so the old -ENOMEM
+  // branch was dead code.
+  *user = std::make_unique<MotrUser>(this, uinfo);
+  return 0;
+}
+
+// Swift authentication is not implemented for the Motr backend; *user
+// is left untouched and success is reported.
+int MotrStore::get_user_by_swift(const DoutPrefixProvider *dpp, const std::string& user_str, optional_yield y, std::unique_ptr<User>* user)
+{
+  /* Swift keys and subusers are not supported for now */
+  return 0;
+}
+
+// Persist an access-key record, keyed by its id, into the global IAM
+// access-key index.  Returns the index-operation result.
+int MotrStore::store_access_key(const DoutPrefixProvider *dpp, optional_yield y, MotrAccessKey access_key)
+{
+  bufferlist bl;
+  access_key.encode(bl);
+  int rc = do_idx_op_by_name(RGW_IAM_MOTR_ACCESS_KEY,
+                             M0_IC_PUT, access_key.id, bl);
+  if (rc < 0)
+    ldout(cctx, 0) << "Failed to store key: rc = " << rc << dendl;
+  return rc;
+}
+
+// Remove an access-key record from the global IAM access-key index.
+// Returns the index-operation result.
+int MotrStore::delete_access_key(const DoutPrefixProvider *dpp, optional_yield y, std::string access_key)
+{
+  bufferlist bl;
+  int rc = do_idx_op_by_name(RGW_IAM_MOTR_ACCESS_KEY,
+                             M0_IC_DEL, access_key, bl);
+  if (rc < 0)
+    ldout(cctx, 0) << "Failed to delete key: rc = " << rc << dendl;
+  return rc;
+}
+
+// Persist the e-mail -> user mapping, keyed by e-mail address, into the
+// global IAM e-mail index.  Returns the index-operation result.
+int MotrStore::store_email_info(const DoutPrefixProvider *dpp, optional_yield y, MotrEmailInfo& email_info )
+{
+  bufferlist bl;
+  email_info.encode(bl);
+  int rc = do_idx_op_by_name(RGW_IAM_MOTR_EMAIL_KEY,
+                             M0_IC_PUT, email_info.email_id, bl);
+  if (rc < 0)
+    ldout(cctx, 0) << "Failed to store the user by email as key: rc = " << rc << dendl;
+  return rc;
+}
+
+// Wrap the given key in a Motr-backed SAL object handle.
+std::unique_ptr<Object> MotrStore::get_object(const rgw_obj_key& k)
+{
+  auto obj = std::make_unique<MotrObject>(this, k);
+  return obj;
+}
+
+
+// Construct a bucket handle for `b` owned by user `u` and load its
+// metadata.  On success *bucket owns the handle; on failure it is left
+// untouched and the error from load_bucket is returned.
+int MotrStore::get_bucket(const DoutPrefixProvider *dpp, User* u, const rgw_bucket& b, std::unique_ptr<Bucket>* bucket, optional_yield y)
+{
+  // unique_ptr replaces the raw new/delete pair: the handle is released
+  // on every failure path, including an exception from load_bucket.
+  auto bp = std::make_unique<MotrBucket>(this, b, u);
+  int ret = bp->load_bucket(dpp, y);
+  if (ret < 0)
+    return ret;
+
+  bucket->reset(bp.release());
+  return 0;
+}
+
+// Build a bucket handle directly from caller-supplied bucket info; no
+// metadata lookup is performed.
+int MotrStore::get_bucket(User* u, const RGWBucketInfo& i, std::unique_ptr<Bucket>* bucket)
+{
+  /* Don't need to fetch the bucket info, use the provided one */
+  bucket->reset(new MotrBucket(this, i, u));
+  return 0;
+}
+
+// Convenience overload: compose the bucket key from tenant + name and
+// reuse the main lookup path.
+int MotrStore::get_bucket(const DoutPrefixProvider *dpp, User* u, const std::string& tenant, const std::string& name, std::unique_ptr<Bucket>* bucket, optional_yield y)
+{
+  rgw_bucket key;
+  key.tenant = tenant;
+  key.name = name;
+  return get_bucket(dpp, u, key, bucket, y);
+}
+
+// This store always acts as the metadata master (single-site setup).
+bool MotrStore::is_meta_master()
+{
+  return true;
+}
+
+// No-op: requests are never forwarded (this store is always the meta
+// master -- see is_meta_master()); reports success.
+int MotrStore::forward_request_to_master(const DoutPrefixProvider *dpp, User* user, obj_version *objv,
+    bufferlist& in_data,
+    JSONParser *jp, req_info& info,
+    optional_yield y)
+{
+  return 0;
+}
+
+// No-op: IAM requests are never forwarded (single-site setup); reports
+// success.
+int MotrStore::forward_iam_request_to_master(const DoutPrefixProvider *dpp, const RGWAccessKey& key, obj_version* objv,
+    bufferlist& in_data,
+    RGWXMLDecoder::XMLParser* parser, req_info& info,
+    optional_yield y)
+{
+  return 0;
+}
+
+// Zone-unique ids are not implemented for the Motr backend; returns an
+// empty string.
+std::string MotrStore::zone_unique_id(uint64_t unique_num)
+{
+  return "";
+}
+
+// Zone-unique transaction ids are not implemented for the Motr backend;
+// returns an empty string.
+std::string MotrStore::zone_unique_trans_id(const uint64_t unique_num)
+{
+  return "";
+}
+
+// Return the (single) zonegroup; the id argument is ignored because
+// only one zonegroup is supported for now.
+int MotrStore::get_zonegroup(const std::string& id, std::unique_ptr<ZoneGroup>* group)
+{
+  /* XXX: for now only one zonegroup supported */
+  group->reset(new MotrZoneGroup(this, zone.zonegroup.get_group()));
+  return 0;
+}
+
+// Single-zone deployment: the only zone we can report is our own.
+int MotrStore::list_all_zones(const DoutPrefixProvider* dpp,
+                              std::list<std::string>& zone_ids)
+{
+  zone_ids.emplace_back(zone.get_id());
+  return 0;
+}
+
+// Cluster statistics are not implemented for the Motr backend; `stats`
+// is left untouched and success is reported.
+int MotrStore::cluster_stat(RGWClusterStat& stats)
+{
+  return 0;
+}
+
+// Lifecycle processing is not implemented for the Motr backend.
+// Idiom fix: return nullptr explicitly instead of the literal 0 (which
+// only worked because 0 is a null-pointer constant).
+std::unique_ptr<Lifecycle> MotrStore::get_lifecycle(void)
+{
+  return nullptr;
+}
+
+// Async completions are not implemented for the Motr backend.
+// Idiom fix: return nullptr explicitly instead of the literal 0.
+std::unique_ptr<Completions> MotrStore::get_completions(void)
+{
+  return nullptr;
+}
+
+// Notifications are handled by the generic MotrNotification wrapper;
+// the req_state and object name are not consulted here.
+std::unique_ptr<Notification> MotrStore::get_notification(Object* obj, Object* src_obj, req_state* s,
+    rgw::notify::EventType event_type, optional_yield y, const string* object_name)
+{
+  auto notif = std::make_unique<MotrNotification>(obj, src_obj, event_type);
+  return notif;
+}
+
+// Overload used by the pub/sub path: the bucket/user/request context
+// parameters are accepted for interface compatibility but not used.
+std::unique_ptr<Notification> MotrStore::get_notification(const DoutPrefixProvider* dpp, Object* obj,
+    Object* src_obj, rgw::notify::EventType event_type, rgw::sal::Bucket* _bucket,
+    std::string& _user_id, std::string& _user_tenant, std::string& _req_id, optional_yield y)
+{
+  auto notif = std::make_unique<MotrNotification>(obj, src_obj, event_type);
+  return notif;
+}
+
+// Usage logging is not implemented for the Motr backend; reports success.
+int MotrStore::log_usage(const DoutPrefixProvider *dpp, map<rgw_user_bucket, RGWUsageBatch>& usage_info)
+{
+  return 0;
+}
+
+// Op logging is not implemented for the Motr backend; reports success.
+int MotrStore::log_op(const DoutPrefixProvider *dpp, string& oid, bufferlist& bl)
+{
+  return 0;
+}
+
+// Service-map registration is not implemented for the Motr backend;
+// reports success.
+int MotrStore::register_to_service_map(const DoutPrefixProvider *dpp, const string& daemon_type,
+    const map<string, string>& meta)
+{
+  return 0;
+}
+
+// Rate limiting is not implemented for the Motr backend: the output
+// structs are left untouched (callers keep their defaults).
+void MotrStore::get_ratelimit(RGWRateLimitInfo& bucket_ratelimit,
+                              RGWRateLimitInfo& user_ratelimit,
+                              RGWRateLimitInfo& anon_ratelimit)
+{
+  return;
+}
+
+// Quotas are not implemented for the Motr backend: `quota` is left
+// untouched.
+void MotrStore::get_quota(RGWQuota& quota)
+{
+  // XXX: Not handled for the first pass
+  return;
+}
+
+// Enabling/suspending buckets is not implemented for the Motr backend;
+// reports success.
+int MotrStore::set_buckets_enabled(const DoutPrefixProvider *dpp, vector<rgw_bucket>& buckets, bool enabled)
+{
+  return 0;
+}
+
+// Multisite sync policy is not implemented for the Motr backend;
+// *phandler is left untouched and success is reported.
+int MotrStore::get_sync_policy_handler(const DoutPrefixProvider *dpp,
+    std::optional<rgw_zone_id> zone,
+    std::optional<rgw_bucket> bucket,
+    RGWBucketSyncPolicyHandlerRef *phandler,
+    optional_yield y)
+{
+  return 0;
+}
+
+// Data sync is not implemented for the Motr backend.
+// Idiom fix: return nullptr explicitly instead of the literal 0 for a
+// pointer result.
+RGWDataSyncStatusManager* MotrStore::get_data_sync_manager(const rgw_zone_id& source_zone)
+{
+  return nullptr;
+}
+
+// Usage accounting is not implemented for the Motr backend; outputs are
+// left untouched and success is reported.
+int MotrStore::read_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch,
+    uint32_t max_entries, bool *is_truncated,
+    RGWUsageIter& usage_iter,
+    map<rgw_user_bucket, rgw_usage_log_entry>& usage)
+{
+  return 0;
+}
+
+// Usage trimming is not implemented for the Motr backend; reports success.
+int MotrStore::trim_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch)
+{
+  return 0;
+}
+
+// Config-key retrieval is not implemented for the Motr backend; *bl is
+// left untouched and success is reported.
+int MotrStore::get_config_key_val(string name, bufferlist *bl)
+{
+  return 0;
+}
+
+// Metadata key listing is not implemented for the Motr backend;
+// *phandle is left untouched and success is reported.
+int MotrStore::meta_list_keys_init(const DoutPrefixProvider *dpp, const string& section, const string& marker, void** phandle)
+{
+  return 0;
+}
+
+// Metadata key listing is not implemented for the Motr backend;
+// `keys` and *truncated are left untouched and success is reported.
+int MotrStore::meta_list_keys_next(const DoutPrefixProvider *dpp, void* handle, int max, list<string>& keys, bool* truncated)
+{
+  return 0;
+}
+
+// Nothing to release: meta_list_keys_init never allocates a handle.
+void MotrStore::meta_list_keys_complete(void* handle)
+{
+  return;
+}
+
+// Metadata listing markers are not implemented for the Motr backend;
+// returns an empty string.
+std::string MotrStore::meta_get_marker(void* handle)
+{
+  return "";
+}
+
+// Metadata removal is not implemented for the Motr backend; reports
+// success.
+int MotrStore::meta_remove(const DoutPrefixProvider *dpp, string& metadata_key, optional_yield y)
+{
+  return 0;
+}
+
+// Initialize (and optionally create) the Motr index identified by id.
+// With create=false this only binds idx to the client realm; with
+// create=true an entity-create op is issued and awaited.
+// NOTE(review): -EEXIST is treated as non-fatal in the log check below
+// but is still returned to the caller -- confirm callers accept it.
+int MotrStore::open_idx(struct m0_uint128 *id, bool create, struct m0_idx *idx)
+{
+  m0_idx_init(idx, &container.co_realm, id);
+
+  if (!create)
+    return 0; // nothing to do more
+
+  // create index or make sure it's created
+  struct m0_op *op = nullptr;
+  int rc = m0_entity_create(nullptr, &idx->in_entity, &op);
+  if (rc != 0) {
+    ldout(cctx, 0) << "ERROR: m0_entity_create() failed: " << rc << dendl;
+    goto out;
+  }
+
+  // Launch the op and wait; m0_rc() yields the op's own status when the
+  // wait itself succeeded.
+  m0_op_launch(&op, 1);
+  rc = m0_op_wait(op, M0_BITS(M0_OS_FAILED, M0_OS_STABLE), M0_TIME_NEVER) ?:
+       m0_rc(op);
+  m0_op_fini(op);
+  m0_op_free(op);
+
+  if (rc != 0 && rc != -EEXIST)
+    ldout(cctx, 0) << "ERROR: index create failed: " << rc << dendl;
+out:
+  return rc;
+}
+
+// Point a single-segment Motr bufvec at vec's storage (no copy is made;
+// vec must outlive any operation that uses bv).
+static void set_m0bufvec(struct m0_bufvec *bv, vector<uint8_t>& vec)
+{
+  bv->ov_buf[0] = reinterpret_cast<char*>(vec.data());
+  bv->ov_vec.v_count[0] = vec.size();
+}
+
+// idx must be opened with open_idx() beforehand
+// Execute a single-key Motr index operation (GET/PUT/DEL) on idx.
+// `key` is always the input key; `val` is the input value for PUT and
+// the output buffer for GET (resized to the returned length).  With
+// update=true a PUT overwrites an existing key.  Returns 0 or a
+// negative Motr error code.
+int MotrStore::do_idx_op(struct m0_idx *idx, enum m0_idx_opcode opcode,
+                         vector<uint8_t>& key, vector<uint8_t>& val, bool update)
+{
+  int rc, rc_i;
+  struct m0_bufvec k, v, *vp = &v;
+  uint32_t flags = 0;
+  struct m0_op *op = nullptr;
+
+  if (m0_bufvec_empty_alloc(&k, 1) != 0) {
+    ldout(cctx, 0) << "ERROR: failed to allocate key bufvec" << dendl;
+    return -ENOMEM;
+  }
+
+  // Only GET and PUT carry a value buffer.
+  if (opcode == M0_IC_PUT || opcode == M0_IC_GET) {
+    rc = -ENOMEM;
+    if (m0_bufvec_empty_alloc(&v, 1) != 0) {
+      ldout(cctx, 0) << "ERROR: failed to allocate value bufvec" << dendl;
+      goto out;
+    }
+  }
+
+  // Point the bufvecs at the callers' storage -- no copies here.
+  set_m0bufvec(&k, key);
+  if (opcode == M0_IC_PUT)
+    set_m0bufvec(&v, val);
+
+  // DEL takes no value vector.
+  if (opcode == M0_IC_DEL)
+    vp = nullptr;
+
+  if (opcode == M0_IC_PUT && update)
+    flags |= M0_OIF_OVERWRITE;
+
+  rc = m0_idx_op(idx, opcode, &k, vp, &rc_i, flags, &op);
+  if (rc != 0) {
+    ldout(cctx, 0) << "ERROR: failed to init index op: " << rc << dendl;
+    goto out;
+  }
+
+  // Launch and wait; m0_rc() yields the op's own status when the wait
+  // itself succeeded.
+  m0_op_launch(&op, 1);
+  rc = m0_op_wait(op, M0_BITS(M0_OS_FAILED, M0_OS_STABLE), M0_TIME_NEVER) ?:
+       m0_rc(op);
+  m0_op_fini(op);
+  m0_op_free(op);
+
+  if (rc != 0) {
+    ldout(cctx, 0) << "ERROR: op failed: " << rc << dendl;
+    goto out;
+  }
+
+  // rc_i is the per-key result reported by the index service.
+  if (rc_i != 0) {
+    ldout(cctx, 0) << "ERROR: idx op failed: " << rc_i << dendl;
+    rc = rc_i;
+    goto out;
+  }
+
+  // For GET, copy the Motr-allocated value into the caller's vector.
+  if (opcode == M0_IC_GET) {
+    val.resize(*v.ov_vec.v_count);
+    memcpy(reinterpret_cast<char*>(val.data()), *v.ov_buf, *v.ov_vec.v_count);
+  }
+
+out:
+  // free2 releases only the bufvec envelope (buffers are caller-owned);
+  // plain free additionally releases the Motr-allocated GET buffer.
+  m0_bufvec_free2(&k);
+  if (opcode == M0_IC_GET)
+    m0_bufvec_free(&v); // cleanup buffer after GET
+  else if (opcode == M0_IC_PUT)
+    m0_bufvec_free2(&v);
+
+  return rc;
+}
+
+// Retrieve a range of key/value pairs starting from keys[0].
+// keys[0] seeds the Motr NEXT query; subsequent keys and all values are
+// filled in from the results.  Returns the number of pairs copied out,
+// or a negative Motr error code.
+int MotrStore::do_idx_next_op(struct m0_idx *idx,
+                              vector<vector<uint8_t>>& keys,
+                              vector<vector<uint8_t>>& vals)
+{
+  int rc;
+  uint32_t i = 0;
+  int nr_kvp = vals.size();
+  // Fix (leak): the original `int *rcs = new int[nr_kvp]` leaked on the
+  // early return taken when the bufvec allocation below fails;
+  // std::vector releases it on every path.
+  std::vector<int> rcs(nr_kvp);
+  struct m0_bufvec k, v;
+  struct m0_op *op = nullptr;
+
+  rc = m0_bufvec_empty_alloc(&k, nr_kvp)?:
+       m0_bufvec_empty_alloc(&v, nr_kvp);
+  if (rc != 0) {
+    ldout(cctx, 0) << "ERROR: failed to allocate kv bufvecs" << dendl;
+    return rc;
+  }
+
+  // Only keys[0] (the marker) is provided; NEXT returns the rest.
+  set_m0bufvec(&k, keys[0]);
+
+  rc = m0_idx_op(idx, M0_IC_NEXT, &k, &v, rcs.data(), 0, &op);
+  if (rc != 0) {
+    ldout(cctx, 0) << "ERROR: failed to init index op: " << rc << dendl;
+    goto out;
+  }
+
+  m0_op_launch(&op, 1);
+  rc = m0_op_wait(op, M0_BITS(M0_OS_FAILED, M0_OS_STABLE), M0_TIME_NEVER) ?:
+       m0_rc(op);
+  m0_op_fini(op);
+  m0_op_free(op);
+
+  if (rc != 0) {
+    ldout(cctx, 0) << "ERROR: op failed: " << rc << dendl;
+    goto out;
+  }
+
+  // Copy out each returned pair; a negative per-key rc marks the end of
+  // the available range.
+  for (i = 0; i < v.ov_vec.v_nr; ++i) {
+    if (rcs[i] < 0)
+      break;
+
+    vector<uint8_t>& key = keys[i];
+    vector<uint8_t>& val = vals[i];
+    key.resize(k.ov_vec.v_count[i]);
+    val.resize(v.ov_vec.v_count[i]);
+    memcpy(reinterpret_cast<char*>(key.data()), k.ov_buf[i], k.ov_vec.v_count[i]);
+    memcpy(reinterpret_cast<char*>(val.data()), v.ov_buf[i], v.ov_vec.v_count[i]);
+  }
+
+out:
+  // Trim the vector counts to what we consumed before freeing, matching
+  // the original cleanup behavior.
+  k.ov_vec.v_nr = i;
+  v.ov_vec.v_nr = i;
+  m0_bufvec_free(&k);
+  m0_bufvec_free(&v); // cleanup buffer after GET
+
+  return rc ?: i;
+}
+
+// Retrieve a number of key/value pairs under the prefix starting
+// from the marker at key_out[0].
+int MotrStore::next_query_by_name(string idx_name,
+ vector<string>& key_out,
+ vector<bufferlist>& val_out,
+ string prefix, string delim)
+{
+ unsigned nr_kvp = std::min(val_out.size(), 100UL);
+ struct m0_idx idx = {};
+ vector<vector<uint8_t>> keys(nr_kvp);
+ vector<vector<uint8_t>> vals(nr_kvp);
+ struct m0_uint128 idx_id;
+ int i = 0, j, k = 0;
+
+ index_name_to_motr_fid(idx_name, &idx_id);
+ int rc = open_motr_idx(&idx_id, &idx);
+ if (rc != 0) {
+ ldout(cctx, 0) << "ERROR: next_query_by_name(): failed to open index: rc="
+ << rc << dendl;
+ goto out;
+ }
+
+ // Only the first element for keys needs to be set for NEXT query.
+ // The keys will be set will the returned keys from motr index.
+ ldout(cctx, 20) <<__func__<< ": next_query_by_name(): index=" << idx_name
+ << " prefix=" << prefix << " delim=" << delim << dendl;
+ keys[0].assign(key_out[0].begin(), key_out[0].end());
+ for (i = 0; i < (int)val_out.size(); i += k, k = 0) {
+ rc = do_idx_next_op(&idx, keys, vals);
+ ldout(cctx, 20) << "do_idx_next_op() = " << rc << dendl;
+ if (rc < 0) {
+ ldout(cctx, 0) << "ERROR: NEXT query failed. " << rc << dendl;
+ goto out;
+ }
+
+ string dir;
+ for (j = 0, k = 0; j < rc; ++j) {
+ string key(keys[j].begin(), keys[j].end());
+ size_t pos = std::string::npos;
+ if (!delim.empty())
+ pos = key.find(delim, prefix.length());
+ if (pos != std::string::npos) { // DIR entry
+ dir.assign(key, 0, pos + 1);
+ if (dir.compare(0, prefix.length(), prefix) != 0)
+ goto out;
+ if (i + k == 0 || dir != key_out[i + k - 1]) // a new one
+ key_out[i + k++] = dir;
+ continue;
+ }
+ dir = "";
+ if (key.compare(0, prefix.length(), prefix) != 0)
+ goto out;
+ key_out[i + k] = key;
+ bufferlist& vbl = val_out[i + k];
+ vbl.append(reinterpret_cast<char*>(vals[j].data()), vals[j].size());
+ ++k;
+ }
+
+ if (rc < (int)nr_kvp) // there are no more keys to fetch
+ break;
+
+ string next_key;
+ if (dir != "")
+ next_key = dir + "\xff"; // skip all dir content in 1 step
+ else
+ next_key = key_out[i + k - 1] + " ";
+ ldout(cctx, 0) << "do_idx_next_op(): next_key=" << next_key << dendl;
+ keys[0].assign(next_key.begin(), next_key.end());
+ }
+
+out:
+ m0_idx_fini(&idx);
+ return rc < 0 ? rc : i + k;
+}
+
+int MotrStore::delete_motr_idx_by_name(string iname)
+{
+ struct m0_idx idx;
+ struct m0_uint128 idx_id;
+ struct m0_op *op = nullptr;
+
+ ldout(cctx, 20) << "delete_motr_idx_by_name=" << iname << dendl;
+
+ index_name_to_motr_fid(iname, &idx_id);
+ m0_idx_init(&idx, &container.co_realm, &idx_id);
+ m0_entity_open(&idx.in_entity, &op);
+ int rc = m0_entity_delete(&idx.in_entity, &op);
+ if (rc < 0)
+ goto out;
+
+ m0_op_launch(&op, 1);
+
+ ldout(cctx, 70) << "waiting for op completion" << dendl;
+
+ rc = m0_op_wait(op, M0_BITS(M0_OS_FAILED, M0_OS_STABLE), M0_TIME_NEVER) ?:
+ m0_rc(op);
+ m0_op_fini(op);
+ m0_op_free(op);
+
+ if (rc == -ENOENT) // race deletion??
+ rc = 0;
+ else if (rc < 0)
+ ldout(cctx, 0) << "ERROR: index create failed: " << rc << dendl;
+
+ ldout(cctx, 20) << "delete_motr_idx_by_name rc=" << rc << dendl;
+
+out:
+ m0_idx_fini(&idx);
+ return rc;
+}
+
// Bind 'idx' to the index identified by 'id' in this store's realm.
// Only initializes the in-memory handle -- no Motr op is launched,
// so this always succeeds (returns 0).
int MotrStore::open_motr_idx(struct m0_uint128 *id, struct m0_idx *idx)
{
  m0_idx_init(idx, &container.co_realm, id);
  return 0;
}
+
// The following macros are from dix/fid_convert.h which are not exposed.
enum {
  // Bit offset of the device id within a DIX fid container.
  M0_DIX_FID_DEVICE_ID_OFFSET = 32,
  // Mask selecting the container bits below the device id.
  M0_DIX_FID_DIX_CONTAINER_MASK = (1ULL << M0_DIX_FID_DEVICE_ID_OFFSET)
                                  - 1,
};
+
// md5 is used here, a more robust way to convert index name to fid is
// needed to avoid collision.
void MotrStore::index_name_to_motr_fid(string iname, struct m0_uint128 *id)
{
  unsigned char md5[16]; // 128/8 = 16
  MD5 hash;

  // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
  hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
  hash.Update((const unsigned char *)iname.c_str(), iname.length());
  hash.Final(md5);

  // Split the 128-bit digest across the two 64-bit halves of the id.
  memcpy(&id->u_hi, md5, 8);
  memcpy(&id->u_lo, md5 + 8, 8);
  ldout(cctx, 20) << "id = 0x" << std::hex << id->u_hi << ":0x" << std::hex << id->u_lo << dendl;

  // Stamp the DIX fid type onto the hashed id, keeping only the
  // container bits a DIX fid is allowed to carry.
  struct m0_fid *fid = (struct m0_fid*)id;
  m0_fid_tset(fid, m0_dix_fid_type.ft_id,
              fid->f_container & M0_DIX_FID_DIX_CONTAINER_MASK, fid->f_key);
  ldout(cctx, 20) << "converted id = 0x" << std::hex << id->u_hi << ":0x" << std::hex << id->u_lo << dendl;
}
+
+int MotrStore::do_idx_op_by_name(string idx_name, enum m0_idx_opcode opcode,
+ string key_str, bufferlist &bl, bool update)
+{
+ struct m0_idx idx;
+ vector<uint8_t> key(key_str.begin(), key_str.end());
+ vector<uint8_t> val;
+ struct m0_uint128 idx_id;
+
+ index_name_to_motr_fid(idx_name, &idx_id);
+ int rc = open_motr_idx(&idx_id, &idx);
+ if (rc != 0) {
+ ldout(cctx, 0) << "ERROR: failed to open index: " << rc << dendl;
+ goto out;
+ }
+
+ if (opcode == M0_IC_PUT)
+ val.assign(bl.c_str(), bl.c_str() + bl.length());
+
+ ldout(cctx, 20) <<__func__<< ": do_idx_op_by_name(): op="
+ << (opcode == M0_IC_PUT ? "PUT" : "GET")
+ << " idx=" << idx_name << " key=" << key_str << dendl;
+ rc = do_idx_op(&idx, opcode, key, val, update);
+ if (rc == 0 && opcode == M0_IC_GET)
+ // Append the returned value (blob) to the bufferlist.
+ bl.append(reinterpret_cast<char*>(val.data()), val.size());
+
+out:
+ m0_idx_fini(&idx);
+ return rc;
+}
+
+int MotrStore::create_motr_idx_by_name(string iname)
+{
+ struct m0_idx idx = {};
+ struct m0_uint128 id;
+
+ index_name_to_motr_fid(iname, &id);
+ m0_idx_init(&idx, &container.co_realm, &id);
+
+ // create index or make sure it's created
+ struct m0_op *op = nullptr;
+ int rc = m0_entity_create(nullptr, &idx.in_entity, &op);
+ if (rc != 0) {
+ ldout(cctx, 0) << "ERROR: m0_entity_create() failed: " << rc << dendl;
+ goto out;
+ }
+
+ m0_op_launch(&op, 1);
+ rc = m0_op_wait(op, M0_BITS(M0_OS_FAILED, M0_OS_STABLE), M0_TIME_NEVER) ?:
+ m0_rc(op);
+ m0_op_fini(op);
+ m0_op_free(op);
+
+ if (rc != 0 && rc != -EEXIST)
+ ldout(cctx, 0) << "ERROR: index create failed: " << rc << dendl;
+out:
+ m0_idx_fini(&idx);
+ return rc;
+}
+
+// If a global index is checked (if it has been create) every time
+// before they're queried (put/get), which takes 2 Motr operations to
+// complete the query. As the global indices' name and FID are known
+// already when MotrStore is created, we move the check and creation
+// in newMotrStore().
+// Similar method is used for per bucket/user index. For example,
+// bucket instance index is created when creating the bucket.
+int MotrStore::check_n_create_global_indices()
+{
+ int rc = 0;
+
+ for (const auto& iname : motr_global_indices) {
+ rc = create_motr_idx_by_name(iname);
+ if (rc < 0 && rc != -EEXIST)
+ break;
+ rc = 0;
+ }
+
+ return rc;
+}
+
// Return the cluster id: the string form of the confc root object's
// fid. Note: 'dpp' and 'y' are unused here.
std::string MotrStore::get_cluster_id(const DoutPrefixProvider* dpp, optional_yield y)
{
  char id[M0_FID_STR_LEN];
  struct m0_confc *confc = m0_reqh2confc(&instance->m0c_reqh);

  m0_fid_print(id, ARRAY_SIZE(id), &confc->cc_root->co_id);
  return std::string(id);
}
+
+int MotrStore::init_metadata_cache(const DoutPrefixProvider *dpp,
+ CephContext *cct)
+{
+ this->obj_meta_cache = new MotrMetaCache(dpp, cct);
+ this->get_obj_meta_cache()->set_enabled(true);
+
+ this->user_cache = new MotrMetaCache(dpp, cct);
+ this->get_user_cache()->set_enabled(true);
+
+ this->bucket_inst_cache = new MotrMetaCache(dpp, cct);
+ this->get_bucket_inst_cache()->set_enabled(true);
+
+ return 0;
+}
+
 // Lua scripting is not supported by the Motr backend: every script
 // and package operation below reports -ENOENT.
 int MotrLuaManager::get_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, std::string& script)
 {
   return -ENOENT;
 }

 int MotrLuaManager::put_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, const std::string& script)
 {
   return -ENOENT;
 }

 int MotrLuaManager::del_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key)
 {
   return -ENOENT;
 }

 int MotrLuaManager::add_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name)
 {
   return -ENOENT;
 }

 int MotrLuaManager::remove_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name)
 {
   return -ENOENT;
 }

 int MotrLuaManager::list_packages(const DoutPrefixProvider* dpp, optional_yield y, rgw::lua::packages_t& packages)
 {
   return -ENOENT;
 }
+} // namespace rgw::sal
+
+extern "C" {
+
+void *newMotrStore(CephContext *cct)
+{
+ int rc = -1;
+ rgw::sal::MotrStore *store = new rgw::sal::MotrStore(cct);
+
+ if (store) {
+ store->conf.mc_is_oostore = true;
+ // XXX: these params should be taken from config settings and
+ // cct somehow?
+ store->instance = nullptr;
+ const auto& proc_ep = g_conf().get_val<std::string>("motr_my_endpoint");
+ const auto& ha_ep = g_conf().get_val<std::string>("motr_ha_endpoint");
+ const auto& proc_fid = g_conf().get_val<std::string>("motr_my_fid");
+ const auto& profile = g_conf().get_val<std::string>("motr_profile_fid");
+ const auto& admin_proc_ep = g_conf().get_val<std::string>("motr_admin_endpoint");
+ const auto& admin_proc_fid = g_conf().get_val<std::string>("motr_admin_fid");
+ const int init_flags = cct->get_init_flags();
+ ldout(cct, 0) << "INFO: motr my endpoint: " << proc_ep << dendl;
+ ldout(cct, 0) << "INFO: motr ha endpoint: " << ha_ep << dendl;
+ ldout(cct, 0) << "INFO: motr my fid: " << proc_fid << dendl;
+ ldout(cct, 0) << "INFO: motr profile fid: " << profile << dendl;
+ store->conf.mc_local_addr = proc_ep.c_str();
+ store->conf.mc_process_fid = proc_fid.c_str();
+
+ ldout(cct, 0) << "INFO: init flags: " << init_flags << dendl;
+ ldout(cct, 0) << "INFO: motr admin endpoint: " << admin_proc_ep << dendl;
+ ldout(cct, 0) << "INFO: motr admin fid: " << admin_proc_fid << dendl;
+
+ // HACK this is so that radosge-admin uses a different client
+ if (init_flags == 0) {
+ store->conf.mc_process_fid = admin_proc_fid.c_str();
+ store->conf.mc_local_addr = admin_proc_ep.c_str();
+ } else {
+ store->conf.mc_process_fid = proc_fid.c_str();
+ store->conf.mc_local_addr = proc_ep.c_str();
+ }
+ store->conf.mc_ha_addr = ha_ep.c_str();
+ store->conf.mc_profile = profile.c_str();
+
+ ldout(cct, 50) << "INFO: motr profile fid: " << store->conf.mc_profile << dendl;
+ ldout(cct, 50) << "INFO: ha addr: " << store->conf.mc_ha_addr << dendl;
+ ldout(cct, 50) << "INFO: process fid: " << store->conf.mc_process_fid << dendl;
+ ldout(cct, 50) << "INFO: motr endpoint: " << store->conf.mc_local_addr << dendl;
+
+ store->conf.mc_tm_recv_queue_min_len = 64;
+ store->conf.mc_max_rpc_msg_size = 524288;
+ store->conf.mc_idx_service_id = M0_IDX_DIX;
+ store->dix_conf.kc_create_meta = false;
+ store->conf.mc_idx_service_conf = &store->dix_conf;
+
+ if (!g_conf().get_val<bool>("motr_tracing_enabled")) {
+ m0_trace_level_allow(M0_WARN); // allow errors and warnings in syslog anyway
+ m0_trace_set_mmapped_buffer(false);
+ }
+
+ store->instance = nullptr;
+ rc = m0_client_init(&store->instance, &store->conf, true);
+ if (rc != 0) {
+ ldout(cct, 0) << "ERROR: m0_client_init() failed: " << rc << dendl;
+ goto out;
+ }
+
+ m0_container_init(&store->container, nullptr, &M0_UBER_REALM, store->instance);
+ rc = store->container.co_realm.re_entity.en_sm.sm_rc;
+ if (rc != 0) {
+ ldout(cct, 0) << "ERROR: m0_container_init() failed: " << rc << dendl;
+ goto out;
+ }
+
+ rc = m0_ufid_init(store->instance, &ufid_gr);
+ if (rc != 0) {
+ ldout(cct, 0) << "ERROR: m0_ufid_init() failed: " << rc << dendl;
+ goto out;
+ }
+
+ // Create global indices if not yet.
+ rc = store->check_n_create_global_indices();
+ if (rc != 0) {
+ ldout(cct, 0) << "ERROR: check_n_create_global_indices() failed: " << rc << dendl;
+ goto out;
+ }
+
+ }
+
+out:
+ if (rc != 0) {
+ delete store;
+ return nullptr;
+ }
+ return store;
+}
+
+}
diff --git a/src/rgw/rgw_sal_motr.h b/src/rgw/rgw_sal_motr.h
new file mode 100644
index 000000000..b7230f7e1
--- /dev/null
+++ b/src/rgw/rgw_sal_motr.h
@@ -0,0 +1,1204 @@
+
+// vim: ts=2 sw=2 expandtab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * SAL implementation for the CORTX Motr backend
+ *
+ * Copyright (C) 2021 Seagate Technology LLC and/or its Affiliates
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+extern "C" {
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wextern-c-compat"
+#pragma clang diagnostic ignored "-Wdeprecated-anon-enum-enum-conversion"
+#include "motr/config.h"
+#include "motr/client.h"
+#pragma clang diagnostic pop
+}
+
+#include "rgw_sal_store.h"
+#include "rgw_rados.h"
+#include "rgw_notify.h"
+#include "rgw_oidc_provider.h"
+#include "rgw_role.h"
+#include "rgw_multi.h"
+#include "rgw_putobj_processor.h"
+
+namespace rgw::sal {
+
class MotrStore;

// Global Motr indices (checked/created once at startup in newMotrStore()).
#define RGW_MOTR_USERS_IDX_NAME "motr.rgw.users"
#define RGW_MOTR_BUCKET_INST_IDX_NAME "motr.rgw.bucket.instances"
#define RGW_MOTR_BUCKET_HD_IDX_NAME "motr.rgw.bucket.headers"
#define RGW_IAM_MOTR_ACCESS_KEY "motr.rgw.accesskeys"
#define RGW_IAM_MOTR_EMAIL_KEY "motr.rgw.emails"

//#define RGW_MOTR_BUCKET_ACL_IDX_NAME "motr.rgw.bucket.acls"
+
// A simplified metadata cache implementation.
// Note: MotrMetaCache doesn't handle the IO operations to Motr. A proxy
// class can be added to handle cache and 'real' ops.
class MotrMetaCache
{
protected:
  // MGW re-uses ObjectCache to cache object's metadata as it has already
  // implemented a lru cache: (1) ObjectCache internally uses a map and lru
  // list to manage cache entry. POC uses object name, user name or bucket
  // name as the key to lookup and insert an entry. (2) ObjectCache::data is
  // a bufferlist and can be used to store any metadata structure, such as
  // object's bucket dir entry, user info or bucket instance.
  //
  // Note from RGW:
  // The Rados Gateway stores metadata and objects in an internal cache. This
  // should be kept consistent by the OSD's relaying notify events between
  // multiple watching RGW processes. In the event that this notification
  // protocol fails, bounding the length of time that any data in the cache will
  // be assumed valid will ensure that any RGW instance that falls out of sync
  // will eventually recover. This seems to be an issue mostly for large numbers
  // of RGW instances under heavy use. If you would like to turn off cache expiry,
  // set this value to zero.
  //
  // Currently POC hasn't implemented the watch-notify mechanism yet. So the
  // current implementation is similar to cortx-s3server which is based on expiry
  // time. TODO: see comments on distribute_cache).
  //
  // Beware: unlike RGW, Motr object data is not cached in the current POC.
  // RGW caches the first chunk (4MB by default).
  ObjectCache cache;

public:
  // Lookup a cache entry.
  int get(const DoutPrefixProvider *dpp, const std::string& name, bufferlist& data);

  // Insert a cache entry.
  int put(const DoutPrefixProvider *dpp, const std::string& name, const bufferlist& data);

  // Called when an object is deleted. Notification should be sent to other
  // RGW instances.
  int remove(const DoutPrefixProvider *dpp, const std::string& name);

  // Make the local cache entry invalid.
  void invalid(const DoutPrefixProvider *dpp, const std::string& name);

  // TODO: Distribute_cache() and watch_cb() now are only place holder functions.
  // Checkout services/svc_sys_obj_cache.h/cc for reference.
  // These 2 functions are designed to notify or to act on cache notification.
  // It is feasible to implement the functionality using Motr's FDMI after discussing
  // with Hua.
  int distribute_cache(const DoutPrefixProvider *dpp,
                       const std::string& normal_name,
                       ObjectCacheInfo& obj_info, int op);
  int watch_cb(const DoutPrefixProvider *dpp,
               uint64_t notify_id,
               uint64_t cookie,
               uint64_t notifier_id,
               bufferlist& bl);

  // Enable or disable the underlying ObjectCache.
  void set_enabled(bool status);

  MotrMetaCache(const DoutPrefixProvider *dpp, CephContext *cct) {
    cache.set_ctx(cct);
  }
};
+
// User record stored (encoded) as the value in the users index:
// the user info plus its version and attributes.
struct MotrUserInfo {
  RGWUserInfo info;
  obj_version user_version;
  rgw::sal::Attrs attrs;

  void encode(bufferlist& bl) const
  {
    ENCODE_START(3, 3, bl);
    encode(info, bl);
    encode(user_version, bl);
    encode(attrs, bl);
    ENCODE_FINISH(bl);
  }

  void decode(bufferlist::const_iterator& bl)
  {
    DECODE_START(3, bl);
    decode(info, bl);
    decode(user_version, bl);
    decode(attrs, bl);
    DECODE_FINISH(bl);
  }
};
WRITE_CLASS_ENCODER(MotrUserInfo);
+
// E-mail -> user mapping record (value type of the emails index).
struct MotrEmailInfo {
  std::string user_id;
  std::string email_id;

  MotrEmailInfo() {}
  MotrEmailInfo(std::string _user_id, std::string _email_id )
    : user_id(std::move(_user_id)), email_id(std::move(_email_id)) {}

  void encode(bufferlist& bl) const {
    ENCODE_START(2, 2, bl);
    encode(user_id, bl);
    encode(email_id, bl);
    ENCODE_FINISH(bl);
  }

  void decode(bufferlist::const_iterator& bl) {
    DECODE_START_LEGACY_COMPAT_LEN_32(2, 2, 2, bl);
    decode(user_id, bl);
    decode(email_id, bl);
    DECODE_FINISH(bl);
  }
};
WRITE_CLASS_ENCODER(MotrEmailInfo);
+
// Access-key record (value type of the access-keys index).
struct MotrAccessKey {
  std::string id; // AccessKey
  std::string key; // SecretKey
  std::string user_id; // UserID

  MotrAccessKey() {}
  MotrAccessKey(std::string _id, std::string _key, std::string _user_id)
    : id(std::move(_id)), key(std::move(_key)), user_id(std::move(_user_id)) {}

  void encode(bufferlist& bl) const {
    ENCODE_START(2, 2, bl);
    encode(id, bl);
    encode(key, bl);
    encode(user_id, bl);
    ENCODE_FINISH(bl);
  }

  void decode(bufferlist::const_iterator& bl) {
    DECODE_START_LEGACY_COMPAT_LEN_32(2, 2, 2, bl);
    decode(id, bl);
    decode(key, bl);
    decode(user_id, bl);
    DECODE_FINISH(bl);
  }
};
WRITE_CLASS_ENCODER(MotrAccessKey);
+
// No-op notification implementation: the Motr backend does not publish
// bucket notifications, so reserve/commit both succeed without doing
// anything.
class MotrNotification : public StoreNotification {
  public:
    MotrNotification(Object* _obj, Object* _src_obj, rgw::notify::EventType _type) :
      StoreNotification(_obj, _src_obj, _type) {}
    ~MotrNotification() = default;

    virtual int publish_reserve(const DoutPrefixProvider *dpp, RGWObjTags* obj_tags = nullptr) override { return 0;}
    virtual int publish_commit(const DoutPrefixProvider* dpp, uint64_t size,
        const ceph::real_time& mtime, const std::string& etag, const std::string& version) override { return 0; }
};
+
// User abstraction for the Motr backend. User records are kept in the
// global users index (see load_user_from_idx()/store_user() in the .cc).
class MotrUser : public StoreUser {
  private:
    MotrStore *store;
    struct m0_uint128 idxID = {0xe5ecb53640d4ecce, 0x6a156cd5a74aa3b8}; // MD5 of "motr.rgw.users"
    struct m0_idx idx;

  public:
    // Access key ids associated with this user -- presumably used to
    // reconcile the access-key index on user updates; verify against
    // store_user() in the .cc.
    std::set<std::string> access_key_tracker;
    MotrUser(MotrStore *_st, const rgw_user& _u) : StoreUser(_u), store(_st) { }
    MotrUser(MotrStore *_st, const RGWUserInfo& _i) : StoreUser(_i), store(_st) { }
    MotrUser(MotrStore *_st) : store(_st) { }
    MotrUser(MotrUser& _o) = default;
    MotrUser() {}

    virtual std::unique_ptr<User> clone() override {
      return std::unique_ptr<User>(new MotrUser(*this));
    }
    int list_buckets(const DoutPrefixProvider *dpp, const std::string& marker, const std::string& end_marker,
        uint64_t max, bool need_stats, BucketList& buckets, optional_yield y) override;
    virtual int create_bucket(const DoutPrefixProvider* dpp,
                            const rgw_bucket& b,
                            const std::string& zonegroup_id,
                            rgw_placement_rule& placement_rule,
                            std::string& swift_ver_location,
                            const RGWQuotaInfo* pquota_info,
                            const RGWAccessControlPolicy& policy,
                            Attrs& attrs,
                            RGWBucketInfo& info,
                            obj_version& ep_objv,
                            bool exclusive,
                            bool obj_lock_enabled,
                            bool* existed,
                            req_info& req_info,
                            std::unique_ptr<Bucket>* bucket,
                            optional_yield y) override;
    virtual int read_attrs(const DoutPrefixProvider* dpp, optional_yield y) override;
    virtual int merge_and_store_attrs(const DoutPrefixProvider* dpp, Attrs& new_attrs, optional_yield y) override;
    virtual int read_stats(const DoutPrefixProvider *dpp,
        optional_yield y, RGWStorageStats* stats,
        ceph::real_time *last_stats_sync = nullptr,
        ceph::real_time *last_stats_update = nullptr) override;
    virtual int read_stats_async(const DoutPrefixProvider *dpp, RGWGetUserStats_CB* cb) override;
    virtual int complete_flush_stats(const DoutPrefixProvider *dpp, optional_yield y) override;
    virtual int read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
        bool* is_truncated, RGWUsageIter& usage_iter,
        std::map<rgw_user_bucket, rgw_usage_log_entry>& usage) override;
    virtual int trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) override;

    virtual int load_user(const DoutPrefixProvider* dpp, optional_yield y) override;
    virtual int store_user(const DoutPrefixProvider* dpp, optional_yield y, bool exclusive, RGWUserInfo* old_info = nullptr) override;
    virtual int remove_user(const DoutPrefixProvider* dpp, optional_yield y) override;
    virtual int verify_mfa(const std::string& mfa_str, bool* verified, const DoutPrefixProvider* dpp, optional_yield y) override;

    // Create the per-user index holding this user's bucket list.
    int create_user_info_idx();
    // Fetch user info (and optionally attrs/version) from the users index.
    int load_user_from_idx(const DoutPrefixProvider *dpp, MotrStore *store, RGWUserInfo& info, std::map<std::string,
        bufferlist> *attrs, RGWObjVersionTracker *objv_tr);

    friend class MotrBucket;
};
+
// Bucket abstraction for the Motr backend. Bucket instance records are
// encoded as MotrBucketInfo and stored in the global bucket instance
// index; per-bucket object listings use a dedicated per-bucket index.
class MotrBucket : public StoreBucket {
  private:
    MotrStore *store;
    RGWAccessControlPolicy acls;

    // RGWBucketInfo and other information that are shown when listing a bucket is
    // represented in struct MotrBucketInfo. The structure is encoded and stored
    // as the value of the global bucket instance index.
    // TODO: compare pros and cons of separating the bucket_attrs (ACLs, tag etc.)
    // into a different index.
    struct MotrBucketInfo {
      RGWBucketInfo info;

      obj_version bucket_version;
      ceph::real_time mtime;

      rgw::sal::Attrs bucket_attrs;

      void encode(bufferlist& bl) const
      {
        ENCODE_START(4, 4, bl);
        encode(info, bl);
        encode(bucket_version, bl);
        encode(mtime, bl);
        encode(bucket_attrs, bl); //rgw_cache.h example for a map
        ENCODE_FINISH(bl);
      }

      void decode(bufferlist::const_iterator& bl)
      {
        DECODE_START(4, bl);
        decode(info, bl);
        decode(bucket_version, bl);
        decode(mtime, bl);
        decode(bucket_attrs, bl);
        DECODE_FINISH(bl);
      }
    };
    WRITE_CLASS_ENCODER(MotrBucketInfo);

  public:
    MotrBucket(MotrStore *_st)
      : store(_st),
      acls() {
      }

    MotrBucket(MotrStore *_st, User* _u)
      : StoreBucket(_u),
      store(_st),
      acls() {
      }

    MotrBucket(MotrStore *_st, const rgw_bucket& _b)
      : StoreBucket(_b),
      store(_st),
      acls() {
      }

    MotrBucket(MotrStore *_st, const RGWBucketEnt& _e)
      : StoreBucket(_e),
      store(_st),
      acls() {
      }

    MotrBucket(MotrStore *_st, const RGWBucketInfo& _i)
      : StoreBucket(_i),
      store(_st),
      acls() {
      }

    MotrBucket(MotrStore *_st, const rgw_bucket& _b, User* _u)
      : StoreBucket(_b, _u),
      store(_st),
      acls() {
      }

    MotrBucket(MotrStore *_st, const RGWBucketEnt& _e, User* _u)
      : StoreBucket(_e, _u),
      store(_st),
      acls() {
      }

    ~MotrBucket() { }

    virtual std::unique_ptr<Object> get_object(const rgw_obj_key& k) override;
    virtual int list(const DoutPrefixProvider *dpp, ListParams&, int, ListResults&, optional_yield y) override;
    virtual int remove_bucket(const DoutPrefixProvider *dpp, bool delete_children, bool forward_to_master, req_info* req_info, optional_yield y) override;
    virtual int remove_bucket_bypass_gc(int concurrent_max, bool
        keep_index_consistent,
        optional_yield y, const
        DoutPrefixProvider *dpp) override;
    virtual RGWAccessControlPolicy& get_acl(void) override { return acls; }
    virtual int set_acl(const DoutPrefixProvider *dpp, RGWAccessControlPolicy& acl, optional_yield y) override;
    virtual int load_bucket(const DoutPrefixProvider *dpp, optional_yield y, bool get_stats = false) override;
    // Add/remove this bucket in the owning user's bucket list index.
    int link_user(const DoutPrefixProvider* dpp, User* new_user, optional_yield y);
    int unlink_user(const DoutPrefixProvider* dpp, User* new_user, optional_yield y);
    // Create the per-bucket object-listing index.
    int create_bucket_index();
    // Create the per-bucket multipart-upload indices.
    int create_multipart_indices();
    virtual int read_stats(const DoutPrefixProvider *dpp,
        const bucket_index_layout_generation& idx_layout, int shard_id,
        std::string *bucket_ver, std::string *master_ver,
        std::map<RGWObjCategory, RGWStorageStats>& stats,
        std::string *max_marker = nullptr,
        bool *syncstopped = nullptr) override;
    virtual int read_stats_async(const DoutPrefixProvider *dpp,
        const bucket_index_layout_generation& idx_layout,
        int shard_id, RGWGetBucketStats_CB* ctx) override;
    virtual int sync_user_stats(const DoutPrefixProvider *dpp, optional_yield y) override;
    virtual int update_container_stats(const DoutPrefixProvider *dpp) override;
    virtual int check_bucket_shards(const DoutPrefixProvider *dpp) override;
    virtual int chown(const DoutPrefixProvider *dpp, User& new_user, optional_yield y) override;
    virtual int put_info(const DoutPrefixProvider *dpp, bool exclusive, ceph::real_time mtime) override;
    virtual bool is_owner(User* user) override;
    virtual int check_empty(const DoutPrefixProvider *dpp, optional_yield y) override;
    virtual int check_quota(const DoutPrefixProvider *dpp, RGWQuota& quota, uint64_t obj_size, optional_yield y, bool check_size_only = false) override;
    virtual int merge_and_store_attrs(const DoutPrefixProvider *dpp, Attrs& attrs, optional_yield y) override;
    virtual int try_refresh_info(const DoutPrefixProvider *dpp, ceph::real_time *pmtime) override;
    virtual int read_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
        bool *is_truncated, RGWUsageIter& usage_iter,
        std::map<rgw_user_bucket, rgw_usage_log_entry>& usage) override;
    virtual int trim_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) override;
    virtual int remove_objs_from_index(const DoutPrefixProvider *dpp, std::list<rgw_obj_index_key>& objs_to_unlink) override;
    virtual int check_index(const DoutPrefixProvider *dpp, std::map<RGWObjCategory, RGWStorageStats>& existing_stats, std::map<RGWObjCategory, RGWStorageStats>& calculated_stats) override;
    virtual int rebuild_index(const DoutPrefixProvider *dpp) override;
    virtual int set_tag_timeout(const DoutPrefixProvider *dpp, uint64_t timeout) override;
    virtual int purge_instance(const DoutPrefixProvider *dpp) override;
    virtual std::unique_ptr<Bucket> clone() override {
      return std::make_unique<MotrBucket>(*this);
    }
    virtual std::unique_ptr<MultipartUpload> get_multipart_upload(const std::string& oid,
                                std::optional<std::string> upload_id=std::nullopt,
                                ACLOwner owner={}, ceph::real_time mtime=real_clock::now()) override;
    virtual int list_multiparts(const DoutPrefixProvider *dpp,
      const std::string& prefix,
      std::string& marker,
      const std::string& delim,
      const int& max_uploads,
      std::vector<std::unique_ptr<MultipartUpload>>& uploads,
      std::map<std::string, bool> *common_prefixes,
      bool *is_truncated) override;
    virtual int abort_multiparts(const DoutPrefixProvider *dpp, CephContext *cct) override;

    friend class MotrStore;
};
+
// Thin wrapper exposing a zonegroup placement tier's properties.
class MotrPlacementTier: public StorePlacementTier {
  MotrStore* store;
  RGWZoneGroupPlacementTier tier;
public:
  MotrPlacementTier(MotrStore* _store, const RGWZoneGroupPlacementTier& _tier) : store(_store), tier(_tier) {}
  virtual ~MotrPlacementTier() = default;

  virtual const std::string& get_tier_type() { return tier.tier_type; }
  virtual const std::string& get_storage_class() { return tier.storage_class; }
  virtual bool retain_head_object() { return tier.retain_head_object; }
  // Access the underlying zonegroup placement tier record.
  RGWZoneGroupPlacementTier& get_rt() { return tier; }
};
+
+class MotrZoneGroup : public StoreZoneGroup {
+ MotrStore* store;
+ const RGWZoneGroup group;
+ std::string empty;
+public:
+ MotrZoneGroup(MotrStore* _store) : store(_store), group() {}
+ MotrZoneGroup(MotrStore* _store, const RGWZoneGroup& _group) : store(_store), group(_group) {}
+ virtual ~MotrZoneGroup() = default;
+
+ virtual const std::string& get_id() const override { return group.get_id(); };
+ virtual const std::string& get_name() const override { return group.get_name(); };
+ virtual int equals(const std::string& other_zonegroup) const override {
+ return group.equals(other_zonegroup);
+ };
+ /** Get the endpoint from zonegroup, or from master zone if not set */
+ virtual const std::string& get_endpoint() const override;
+ virtual bool placement_target_exists(std::string& target) const override;
+ virtual bool is_master_zonegroup() const override {
+ return group.is_master_zonegroup();
+ };
+ virtual const std::string& get_api_name() const override { return group.api_name; };
+ virtual void get_placement_target_names(std::set<std::string>& names) const override;
+ virtual const std::string& get_default_placement_name() const override {
+ return group.default_placement.name; };
+ virtual int get_hostnames(std::list<std::string>& names) const override {
+ names = group.hostnames;
+ return 0;
+ };
+ virtual int get_s3website_hostnames(std::list<std::string>& names) const override {
+ names = group.hostnames_s3website;
+ return 0;
+ };
+ virtual int get_zone_count() const override {
+ return group.zones.size();
+ }
+ virtual int get_placement_tier(const rgw_placement_rule& rule, std::unique_ptr<PlacementTier>* tier);
+ virtual int get_zone_by_id(const std::string& id, std::unique_ptr<Zone>* zone) override {
+ return -1;
+ }
+ virtual int get_zone_by_name(const std::string& name, std::unique_ptr<Zone>* zone) override {
+ return -1;
+ }
+ virtual int list_zones(std::list<std::string>& zone_ids) override {
+ zone_ids.clear();
+ return 0;
+ }
+ const RGWZoneGroup& get_group() { return group; }
+ bool supports(std::string_view feature) const override {
+ return group.supports(features);
+ }
+ virtual std::unique_ptr<ZoneGroup> clone() override {
+ return std::make_unique<MotrZoneGroup>(store, group);
+ }
+};
+
// Zone implementation for the Motr backend: a single, mostly
// hard-coded zone with only the "default" placement and the STANDARD
// storage class.
class MotrZone : public StoreZone {
  protected:
    MotrStore* store;
    RGWRealm *realm{nullptr};
    MotrZoneGroup zonegroup;
    RGWZone *zone_public_config{nullptr}; /* external zone params, e.g., entrypoints, log flags, etc. */
    RGWZoneParams *zone_params{nullptr}; /* internal zone params, e.g., rados pools */
    RGWPeriod *current_period{nullptr};

  public:
    // NOTE(review): the objects new'ed in these constructors are never
    // deleted (the destructor is defaulted) -- confirm whether they
    // should be owned via std::unique_ptr.
    MotrZone(MotrStore* _store) : store(_store), zonegroup(_store) {
      realm = new RGWRealm();
      zone_public_config = new RGWZone();
      zone_params = new RGWZoneParams();
      current_period = new RGWPeriod();

      // XXX: only default and STANDARD supported for now
      RGWZonePlacementInfo info;
      RGWZoneStorageClasses sc;
      sc.set_storage_class("STANDARD", nullptr, nullptr);
      info.storage_classes = sc;
      zone_params->placement_pools["default"] = info;
    }
    MotrZone(MotrStore* _store, MotrZoneGroup _zg) : store(_store), zonegroup(_zg) {
      realm = new RGWRealm();
      // TODO: fetch zonegroup params (eg. id) from provisioner config.
      // NOTE(review): set_id()/api_name are not declared by
      // MotrZoneGroup in this header -- presumably inherited from the
      // base class; verify this builds with WITH_RADOSGW_MOTR=ON.
      zonegroup.set_id("0956b174-fe14-4f97-8b50-bb7ec5e1cf62");
      zonegroup.api_name = "default";
      zone_public_config = new RGWZone();
      zone_params = new RGWZoneParams();
      current_period = new RGWPeriod();

      // XXX: only default and STANDARD supported for now
      RGWZonePlacementInfo info;
      RGWZoneStorageClasses sc;
      sc.set_storage_class("STANDARD", nullptr, nullptr);
      info.storage_classes = sc;
      zone_params->placement_pools["default"] = info;
    }
    ~MotrZone() = default;

    virtual std::unique_ptr<Zone> clone() override {
      return std::make_unique<MotrZone>(store);
    }
    virtual ZoneGroup& get_zonegroup() override;
    virtual const std::string& get_id() override;
    virtual const std::string& get_name() const override;
    virtual bool is_writeable() override;
    virtual bool get_redirect_endpoint(std::string* endpoint) override;
    virtual bool has_zonegroup_api(const std::string& api) const override;
    virtual const std::string& get_current_period_id() override;
    virtual const RGWAccessKey& get_system_key() { return zone_params->system_key; }
    virtual const std::string& get_realm_name() { return realm->get_name(); }
    virtual const std::string& get_realm_id() { return realm->get_id(); }
    virtual const std::string_view get_tier_type() { return "rgw"; }
    virtual RGWBucketSyncPolicyHandlerRef get_sync_policy_handler() { return nullptr; }
    friend class MotrStore;
};
+
/* Lua script/package management for the Motr backend.  Methods are
 * declared here and implemented in the driver's .cc file. */
class MotrLuaManager : public StoreLuaManager {
  MotrStore* store;  // non-owning backpointer to the driver

  public:
  MotrLuaManager(MotrStore* _s) : store(_s)
  {
  }
  virtual ~MotrLuaManager() = default;

  /** Get a script named with the given key from the backing store */
  virtual int get_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, std::string& script) override;
  /** Put a script named with the given key to the backing store */
  virtual int put_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key, const std::string& script) override;
  /** Delete a script named with the given key from the backing store */
  virtual int del_script(const DoutPrefixProvider* dpp, optional_yield y, const std::string& key) override;
  /** Add a lua package */
  virtual int add_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name) override;
  /** Remove a lua package */
  virtual int remove_package(const DoutPrefixProvider* dpp, optional_yield y, const std::string& package_name) override;
  /** List lua packages */
  virtual int list_packages(const DoutPrefixProvider* dpp, optional_yield y, rgw::lua::packages_t& packages) override;
};
+
/* OIDC provider record for the Motr backend.  Persistence is not
 * implemented: store/read/delete are stubs.
 * NOTE(review): the stubs return 0 (success) without persisting anything —
 * confirm callers tolerate providers that silently fail to round-trip. */
class MotrOIDCProvider : public RGWOIDCProvider {
  MotrStore* store;  // non-owning backpointer to the driver
  public:
  MotrOIDCProvider(MotrStore* _store) : store(_store) {}
  ~MotrOIDCProvider() = default;

  virtual int store_url(const DoutPrefixProvider *dpp, const std::string& url, bool exclusive, optional_yield y) override { return 0; }
  virtual int read_url(const DoutPrefixProvider *dpp, const std::string& url, const std::string& tenant) override { return 0; }
  virtual int delete_obj(const DoutPrefixProvider *dpp, optional_yield y) override { return 0;}

  // Serialization delegates to the base class; no Motr-specific fields are
  // encoded.
  void encode(bufferlist& bl) const {
    RGWOIDCProvider::encode(bl);
  }
  void decode(bufferlist::const_iterator& bl) {
    RGWOIDCProvider::decode(bl);
  }
};
+
+class MotrObject : public StoreObject {
+ private:
+ MotrStore *store;
+ RGWAccessControlPolicy acls;
+ RGWObjCategory category;
+
+ // If this object is pat of a multipart uploaded one.
+ // TODO: do it in another class? MotrPartObject : public MotrObject
+ uint64_t part_off;
+ uint64_t part_size;
+ uint64_t part_num;
+
+ public:
+
+ // motr object metadata stored in index
+ struct Meta {
+ struct m0_uint128 oid = {};
+ struct m0_fid pver = {};
+ uint64_t layout_id = 0;
+
+ void encode(bufferlist& bl) const
+ {
+ ENCODE_START(5, 5, bl);
+ encode(oid.u_hi, bl);
+ encode(oid.u_lo, bl);
+ encode(pver.f_container, bl);
+ encode(pver.f_key, bl);
+ encode(layout_id, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl)
+ {
+ DECODE_START(5, bl);
+ decode(oid.u_hi, bl);
+ decode(oid.u_lo, bl);
+ decode(pver.f_container, bl);
+ decode(pver.f_key, bl);
+ decode(layout_id, bl);
+ DECODE_FINISH(bl);
+ }
+ };
+
+ struct m0_obj *mobj = NULL;
+ Meta meta;
+
+ struct MotrReadOp : public ReadOp {
+ private:
+ MotrObject* source;
+
+ // The set of part objects if the source is
+ // a multipart uploaded object.
+ std::map<int, std::unique_ptr<MotrObject>> part_objs;
+
+ public:
+ MotrReadOp(MotrObject *_source);
+
+ virtual int prepare(optional_yield y, const DoutPrefixProvider* dpp) override;
+
+ /*
+ * Both `read` and `iterate` read up through index `end`
+ * *inclusive*. The number of bytes that could be returned is
+ * `end - ofs + 1`.
+ */
+ virtual int read(int64_t off, int64_t end, bufferlist& bl,
+ optional_yield y,
+ const DoutPrefixProvider* dpp) override;
+ virtual int iterate(const DoutPrefixProvider* dpp, int64_t off,
+ int64_t end, RGWGetDataCB* cb,
+ optional_yield y) override;
+
+ virtual int get_attr(const DoutPrefixProvider* dpp, const char* name, bufferlist& dest, optional_yield y) override;
+ };
+
+ struct MotrDeleteOp : public DeleteOp {
+ private:
+ MotrObject* source;
+
+ public:
+ MotrDeleteOp(MotrObject* _source);
+
+ virtual int delete_obj(const DoutPrefixProvider* dpp, optional_yield y) override;
+ };
+
+ MotrObject() = default;
+
+ MotrObject(MotrStore *_st, const rgw_obj_key& _k)
+ : StoreObject(_k), store(_st), acls() {}
+ MotrObject(MotrStore *_st, const rgw_obj_key& _k, Bucket* _b)
+ : StoreObject(_k, _b), store(_st), acls() {}
+
+ MotrObject(MotrObject& _o) = default;
+
+ virtual ~MotrObject();
+
+ virtual int delete_object(const DoutPrefixProvider* dpp,
+ optional_yield y,
+ bool prevent_versioning = false) override;
+ virtual int delete_obj_aio(const DoutPrefixProvider* dpp, RGWObjState* astate, Completions* aio,
+ bool keep_index_consistent, optional_yield y) override;
+ virtual int copy_object(User* user,
+ req_info* info, const rgw_zone_id& source_zone,
+ rgw::sal::Object* dest_object, rgw::sal::Bucket* dest_bucket,
+ rgw::sal::Bucket* src_bucket,
+ const rgw_placement_rule& dest_placement,
+ ceph::real_time* src_mtime, ceph::real_time* mtime,
+ const ceph::real_time* mod_ptr, const ceph::real_time* unmod_ptr,
+ bool high_precision_time,
+ const char* if_match, const char* if_nomatch,
+ AttrsMod attrs_mod, bool copy_if_newer, Attrs& attrs,
+ RGWObjCategory category, uint64_t olh_epoch,
+ boost::optional<ceph::real_time> delete_at,
+ std::string* version_id, std::string* tag, std::string* etag,
+ void (*progress_cb)(off_t, void *), void* progress_data,
+ const DoutPrefixProvider* dpp, optional_yield y) override;
+ virtual RGWAccessControlPolicy& get_acl(void) override { return acls; }
+ virtual int set_acl(const RGWAccessControlPolicy& acl) override { acls = acl; return 0; }
+ virtual int get_obj_state(const DoutPrefixProvider* dpp, RGWObjState **state, optional_yield y, bool follow_olh = true) override;
+ virtual int set_obj_attrs(const DoutPrefixProvider* dpp, Attrs* setattrs, Attrs* delattrs, optional_yield y) override;
+ virtual int get_obj_attrs(optional_yield y, const DoutPrefixProvider* dpp, rgw_obj* target_obj = NULL) override;
+ virtual int modify_obj_attrs(const char* attr_name, bufferlist& attr_val, optional_yield y, const DoutPrefixProvider* dpp) override;
+ virtual int delete_obj_attrs(const DoutPrefixProvider* dpp, const char* attr_name, optional_yield y) override;
+ virtual bool is_expired() override;
+ virtual void gen_rand_obj_instance_name() override;
+ virtual std::unique_ptr<Object> clone() override {
+ return std::unique_ptr<Object>(new MotrObject(*this));
+ }
+ virtual std::unique_ptr<MPSerializer> get_serializer(const DoutPrefixProvider *dpp, const std::string& lock_name) override;
+ virtual int transition(Bucket* bucket,
+ const rgw_placement_rule& placement_rule,
+ const real_time& mtime,
+ uint64_t olh_epoch,
+ const DoutPrefixProvider* dpp,
+ optional_yield y) override;
+ virtual bool placement_rules_match(rgw_placement_rule& r1, rgw_placement_rule& r2) override;
+ virtual int dump_obj_layout(const DoutPrefixProvider *dpp, optional_yield y, Formatter* f) override;
+
+ /* Swift versioning */
+ virtual int swift_versioning_restore(bool& restored,
+ const DoutPrefixProvider* dpp) override;
+ virtual int swift_versioning_copy(const DoutPrefixProvider* dpp,
+ optional_yield y) override;
+
+ /* OPs */
+ virtual std::unique_ptr<ReadOp> get_read_op() override;
+ virtual std::unique_ptr<DeleteOp> get_delete_op() override;
+
+ /* OMAP */
+ virtual int omap_get_vals(const DoutPrefixProvider *dpp, const std::string& marker, uint64_t count,
+ std::map<std::string, bufferlist> *m,
+ bool* pmore, optional_yield y) override;
+ virtual int omap_get_all(const DoutPrefixProvider *dpp, std::map<std::string, bufferlist> *m,
+ optional_yield y) override;
+ virtual int omap_get_vals_by_keys(const DoutPrefixProvider *dpp, const std::string& oid,
+ const std::set<std::string>& keys,
+ Attrs* vals) override;
+ virtual int omap_set_val_by_key(const DoutPrefixProvider *dpp, const std::string& key, bufferlist& val,
+ bool must_exist, optional_yield y) override;
+ virtual int chown(User& new_user, const DoutPrefixProvider* dpp, optional_yield y) override;
+ private:
+ //int read_attrs(const DoutPrefixProvider* dpp, Motr::Object::Read &read_op, optional_yield y, rgw_obj* target_obj = nullptr);
+
+ public:
+ bool is_opened() { return mobj != NULL; }
+ int create_mobj(const DoutPrefixProvider *dpp, uint64_t sz);
+ int open_mobj(const DoutPrefixProvider *dpp);
+ int delete_mobj(const DoutPrefixProvider *dpp);
+ void close_mobj();
+ int write_mobj(const DoutPrefixProvider *dpp, bufferlist&& data, uint64_t offset);
+ int read_mobj(const DoutPrefixProvider* dpp, int64_t off, int64_t end, RGWGetDataCB* cb);
+ unsigned get_optimal_bs(unsigned len);
+
+ int get_part_objs(const DoutPrefixProvider *dpp,
+ std::map<int, std::unique_ptr<MotrObject>>& part_objs);
+ int open_part_objs(const DoutPrefixProvider* dpp,
+ std::map<int, std::unique_ptr<MotrObject>>& part_objs);
+ int read_multipart_obj(const DoutPrefixProvider* dpp,
+ int64_t off, int64_t end, RGWGetDataCB* cb,
+ std::map<int, std::unique_ptr<MotrObject>>& part_objs);
+ int delete_part_objs(const DoutPrefixProvider* dpp);
+ void set_category(RGWObjCategory _category) {category = _category;}
+ int get_bucket_dir_ent(const DoutPrefixProvider *dpp, rgw_bucket_dir_entry& ent);
+ int update_version_entries(const DoutPrefixProvider *dpp);
+};
+
// A placeholder locking class for multipart upload.
// TODO: implement it using Motr object locks.
class MPMotrSerializer : public StoreMPSerializer {

  public:
    MPMotrSerializer(const DoutPrefixProvider *dpp, MotrStore* store, MotrObject* obj, const std::string& lock_name) {}

    // No-op lock: always reports success, so concurrent multipart
    // completions are NOT actually serialized until real locks exist.
    virtual int try_lock(const DoutPrefixProvider *dpp, utime_t dur, optional_yield y) override {return 0; }
    virtual int unlock() override { return 0;}
};
+
/* Writer for a whole-object (non-multipart) upload.  Data arriving via
 * process() is accumulated and flushed to a single Motr object; the
 * result becomes visible to clients only when complete() succeeds. */
class MotrAtomicWriter : public StoreWriter {
  protected:
  rgw::sal::MotrStore* store;  // non-owning backpointer to the driver
  const rgw_user& owner;
  const rgw_placement_rule *ptail_placement_rule;
  uint64_t olh_epoch;
  const std::string& unique_tag;
  MotrObject obj;      // destination object being written
  MotrObject old_obj;  // existing object with the same key, if any
  uint64_t total_data_size; // for total data being uploaded
  bufferlist acc_data;  // accumulated data
  uint64_t acc_off; // accumulated data offset

  // Motr I/O vectors for the in-flight write: data buffers, per-unit
  // attributes, and the target extents within the Motr object.
  struct m0_bufvec buf;
  struct m0_bufvec attr;
  struct m0_indexvec ext;

  public:
  MotrAtomicWriter(const DoutPrefixProvider *dpp,
          optional_yield y,
          rgw::sal::Object* obj,
          MotrStore* _store,
          const rgw_user& _owner,
          const rgw_placement_rule *_ptail_placement_rule,
          uint64_t _olh_epoch,
          const std::string& _unique_tag);
  ~MotrAtomicWriter() = default;

  // prepare to start processing object data
  virtual int prepare(optional_yield y) override;

  // Process a bufferlist
  virtual int process(bufferlist&& data, uint64_t offset) override;

  // Flush the currently accumulated data (acc_data) to the Motr object.
  int write();

  // complete the operation and make its result visible to clients
  virtual int complete(size_t accounted_size, const std::string& etag,
                       ceph::real_time *mtime, ceph::real_time set_mtime,
                       std::map<std::string, bufferlist>& attrs,
                       ceph::real_time delete_at,
                       const char *if_match, const char *if_nomatch,
                       const std::string *user_data,
                       rgw_zone_set *zones_trace, bool *canceled,
                       optional_yield y) override;

  // Fill the Motr buffer vectors from the bufferlist iterator; returns the
  // number of bytes staged.
  unsigned populate_bvec(unsigned len, bufferlist::iterator &bi);
  // Release the Motr I/O vectors.
  void cleanup();
};
+
/* Writer for one part of a multipart upload.  Each part is stored as a
 * separate Motr object; complete() records the part's metadata in the
 * object part index. */
class MotrMultipartWriter : public StoreWriter {
protected:
  rgw::sal::MotrStore* store;  // non-owning backpointer to the driver

  // Head object.
  rgw::sal::Object* head_obj;

  // Part parameters.
  const uint64_t part_num;
  const std::string part_num_str;
  std::unique_ptr<MotrObject> part_obj;  // the Motr object holding this part's data
  uint64_t actual_part_size = 0;         // bytes written so far via process()

public:
  // NOTE(review): 'owner' and 'ptail_placement_rule' are accepted but not
  // stored — confirm they are genuinely unneeded for part writes.
  MotrMultipartWriter(const DoutPrefixProvider *dpp,
                      optional_yield y, MultipartUpload* upload,
                      rgw::sal::Object* obj,
                      MotrStore* _store,
                      const rgw_user& owner,
                      const rgw_placement_rule *ptail_placement_rule,
                      uint64_t _part_num, const std::string& part_num_str) :
                      StoreWriter(dpp, y), store(_store), head_obj(obj),
                      part_num(_part_num), part_num_str(part_num_str)
  {
  }
  ~MotrMultipartWriter() = default;

  // prepare to start processing object data
  virtual int prepare(optional_yield y) override;

  // Process a bufferlist
  virtual int process(bufferlist&& data, uint64_t offset) override;

  // complete the operation and make its result visible to clients
  virtual int complete(size_t accounted_size, const std::string& etag,
                       ceph::real_time *mtime, ceph::real_time set_mtime,
                       std::map<std::string, bufferlist>& attrs,
                       ceph::real_time delete_at,
                       const char *if_match, const char *if_nomatch,
                       const std::string *user_data,
                       rgw_zone_set *zones_trace, bool *canceled,
                       optional_yield y) override;
};
+
+// The implementation of multipart upload in POC roughly follows the
+// cortx-s3server's design. Parts are stored in separate Motr objects.
+// s3server uses a few auxiliary Motr indices to manage multipart
+// related metadata: (1) Bucket multipart index (bucket_nnn_multipart_index)
+// which contains metadata that answers questions such as which objects have
+// started multipart upload and its upload id. This index is created during
+// bucket creation. (2) Object part index (object_nnn_part_index) which stores
+// metadata of a part's details (size, pvid, oid...). This index is created in
+// MotrMultipartUpload::init(). (3) Extended metadata index
// (bucket_nnn_extended_metadata): once parts have been uploaded and their
+// metadata saved in the part index, the user may issue multipart completion
+// request. When processing the completion request, the parts are read from
+// object part index and for each part an entry is created in extended index.
+// The entry for the object is created in bucket (object list) index. The part
+// index is deleted and an entry removed from bucket_nnn_multipart_index. Like
// bucket multipart index, bucket part extended metadata index is created during
+// bucket creation.
+//
+// The extended metadata index is used mainly due to fault tolerant
+// considerations (how to handle Motr service crash when uploading an object)
+// and to avoid to create too many Motr indices (I am not sure I understand
// why many Motr indices are bad.). In our POC, to keep it simple, only 2
+// indices are maintained: bucket multipart index and object_nnn_part_index.
+//
+//
+
+class MotrMultipartPart : public StoreMultipartPart {
+protected:
+ RGWUploadPartInfo info;
+
+public:
+ MotrObject::Meta meta;
+
+ MotrMultipartPart(RGWUploadPartInfo _info, MotrObject::Meta _meta) :
+ info(_info), meta(_meta) {}
+ virtual ~MotrMultipartPart() = default;
+
+ virtual uint32_t get_num() { return info.num; }
+ virtual uint64_t get_size() { return info.accounted_size; }
+ virtual const std::string& get_etag() { return info.etag; }
+ virtual ceph::real_time& get_mtime() { return info.modified; }
+
+ RGWObjManifest& get_manifest() { return info.manifest; }
+
+ friend class MotrMultipartUpload;
+};
+
+class MotrMultipartUpload : public StoreMultipartUpload {
+ MotrStore* store;
+ RGWMPObj mp_obj;
+ ACLOwner owner;
+ ceph::real_time mtime;
+ rgw_placement_rule placement;
+ RGWObjManifest manifest;
+
+public:
+ MotrMultipartUpload(MotrStore* _store, Bucket* _bucket, const std::string& oid,
+ std::optional<std::string> upload_id, ACLOwner _owner, ceph::real_time _mtime) :
+ StoreMultipartUpload(_bucket), store(_store), mp_obj(oid, upload_id), owner(_owner), mtime(_mtime) {}
+ virtual ~MotrMultipartUpload() = default;
+
+ virtual const std::string& get_meta() const { return mp_obj.get_meta(); }
+ virtual const std::string& get_key() const { return mp_obj.get_key(); }
+ virtual const std::string& get_upload_id() const { return mp_obj.get_upload_id(); }
+ virtual const ACLOwner& get_owner() const override { return owner; }
+ virtual ceph::real_time& get_mtime() { return mtime; }
+ virtual std::unique_ptr<rgw::sal::Object> get_meta_obj() override;
+ virtual int init(const DoutPrefixProvider* dpp, optional_yield y, ACLOwner& owner, rgw_placement_rule& dest_placement, rgw::sal::Attrs& attrs) override;
+ virtual int list_parts(const DoutPrefixProvider* dpp, CephContext* cct,
+ int num_parts, int marker,
+ int* next_marker, bool* truncated,
+ bool assume_unsorted = false) override;
+ virtual int abort(const DoutPrefixProvider* dpp, CephContext* cct) override;
+ virtual int complete(const DoutPrefixProvider* dpp,
+ optional_yield y, CephContext* cct,
+ std::map<int, std::string>& part_etags,
+ std::list<rgw_obj_index_key>& remove_objs,
+ uint64_t& accounted_size, bool& compressed,
+ RGWCompressionInfo& cs_info, off_t& off,
+ std::string& tag, ACLOwner& owner,
+ uint64_t olh_epoch,
+ rgw::sal::Object* target_obj) override;
+ virtual int get_info(const DoutPrefixProvider *dpp, optional_yield y, rgw_placement_rule** rule, rgw::sal::Attrs* attrs = nullptr) override;
+ virtual std::unique_ptr<Writer> get_writer(const DoutPrefixProvider *dpp,
+ optional_yield y,
+ rgw::sal::Object* obj,
+ const rgw_user& owner,
+ const rgw_placement_rule *ptail_placement_rule,
+ uint64_t part_num,
+ const std::string& part_num_str) override;
+ int delete_parts(const DoutPrefixProvider *dpp);
+};
+
+class MotrStore : public StoreDriver {
+ private:
+ MotrZone zone;
+ RGWSyncModuleInstanceRef sync_module;
+
+ MotrMetaCache* obj_meta_cache;
+ MotrMetaCache* user_cache;
+ MotrMetaCache* bucket_inst_cache;
+
+ public:
+ CephContext *cctx;
+ struct m0_client *instance;
+ struct m0_container container;
+ struct m0_realm uber_realm;
+ struct m0_config conf = {};
+ struct m0_idx_dix_config dix_conf = {};
+
+ MotrStore(CephContext *c): zone(this), cctx(c) {}
+ ~MotrStore() {
+ delete obj_meta_cache;
+ delete user_cache;
+ delete bucket_inst_cache;
+ }
+
+ virtual int initialize(CephContext *cct, const DoutPrefixProvider *dpp) { return 0; }
+ virtual const std::string get_name() const override {
+ return "motr";
+ }
+
+ virtual std::unique_ptr<User> get_user(const rgw_user& u) override;
+ virtual std::string get_cluster_id(const DoutPrefixProvider* dpp, optional_yield y) override;
+ virtual int get_user_by_access_key(const DoutPrefixProvider *dpp, const std::string& key, optional_yield y, std::unique_ptr<User>* user) override;
+ virtual int get_user_by_email(const DoutPrefixProvider *dpp, const std::string& email, optional_yield y, std::unique_ptr<User>* user) override;
+ virtual int get_user_by_swift(const DoutPrefixProvider *dpp, const std::string& user_str, optional_yield y, std::unique_ptr<User>* user) override;
+ virtual std::unique_ptr<Object> get_object(const rgw_obj_key& k) override;
+ virtual int get_bucket(const DoutPrefixProvider *dpp, User* u, const rgw_bucket& b, std::unique_ptr<Bucket>* bucket, optional_yield y) override;
+ virtual int get_bucket(User* u, const RGWBucketInfo& i, std::unique_ptr<Bucket>* bucket) override;
+ virtual int get_bucket(const DoutPrefixProvider *dpp, User* u, const std::string& tenant, const std::string&name, std::unique_ptr<Bucket>* bucket, optional_yield y) override;
+ virtual bool is_meta_master() override;
+ virtual int forward_request_to_master(const DoutPrefixProvider *dpp, User* user, obj_version* objv,
+ bufferlist& in_data, JSONParser *jp, req_info& info,
+ optional_yield y) override;
+ virtual int forward_iam_request_to_master(const DoutPrefixProvider *dpp, const RGWAccessKey& key, obj_version* objv,
+ bufferlist& in_data,
+ RGWXMLDecoder::XMLParser* parser, req_info& info,
+ optional_yield y) override;
+ virtual Zone* get_zone() { return &zone; }
+ virtual std::string zone_unique_id(uint64_t unique_num) override;
+ virtual std::string zone_unique_trans_id(const uint64_t unique_num) override;
+ virtual int get_zonegroup(const std::string& id, std::unique_ptr<ZoneGroup>* zonegroup) override;
+ virtual int list_all_zones(const DoutPrefixProvider* dpp, std::list<std::string>& zone_ids) override;
+ virtual int cluster_stat(RGWClusterStat& stats) override;
+ virtual std::unique_ptr<Lifecycle> get_lifecycle(void) override;
+ virtual std::unique_ptr<Completions> get_completions(void) override;
+ virtual std::unique_ptr<Notification> get_notification(rgw::sal::Object* obj, rgw::sal::Object* src_obj,
+ req_state* s, rgw::notify::EventType event_type, optional_yield y, const std::string* object_name=nullptr) override;
+ virtual std::unique_ptr<Notification> get_notification(const DoutPrefixProvider* dpp, rgw::sal::Object* obj,
+ rgw::sal::Object* src_obj, rgw::notify::EventType event_type, rgw::sal::Bucket* _bucket,
+ std::string& _user_id, std::string& _user_tenant, std::string& _req_id, optional_yield y) override;
+ virtual RGWLC* get_rgwlc(void) override { return NULL; }
+ virtual RGWCoroutinesManagerRegistry* get_cr_registry() override { return NULL; }
+
+ virtual int log_usage(const DoutPrefixProvider *dpp, std::map<rgw_user_bucket, RGWUsageBatch>& usage_info) override;
+ virtual int log_op(const DoutPrefixProvider *dpp, std::string& oid, bufferlist& bl) override;
+ virtual int register_to_service_map(const DoutPrefixProvider *dpp, const std::string& daemon_type,
+ const std::map<std::string, std::string>& meta) override;
+ virtual void get_ratelimit(RGWRateLimitInfo& bucket_ratelimit, RGWRateLimitInfo& user_ratelimit, RGWRateLimitInfo& anon_ratelimit) override;
+ virtual void get_quota(RGWQuota& quota) override;
+ virtual int set_buckets_enabled(const DoutPrefixProvider *dpp, std::vector<rgw_bucket>& buckets, bool enabled) override;
+ virtual int get_sync_policy_handler(const DoutPrefixProvider *dpp,
+ std::optional<rgw_zone_id> zone,
+ std::optional<rgw_bucket> bucket,
+ RGWBucketSyncPolicyHandlerRef *phandler,
+ optional_yield y) override;
+ virtual RGWDataSyncStatusManager* get_data_sync_manager(const rgw_zone_id& source_zone) override;
+ virtual void wakeup_meta_sync_shards(std::set<int>& shard_ids) override { return; }
+ virtual void wakeup_data_sync_shards(const DoutPrefixProvider *dpp, const rgw_zone_id& source_zone, boost::container::flat_map<int, boost::container::flat_set<rgw_data_notify_entry>>& shard_ids) override {}
+ virtual int clear_usage(const DoutPrefixProvider *dpp) override { return 0; }
+ virtual int read_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch,
+ uint32_t max_entries, bool *is_truncated,
+ RGWUsageIter& usage_iter,
+ std::map<rgw_user_bucket, rgw_usage_log_entry>& usage) override;
+ virtual int trim_all_usage(const DoutPrefixProvider *dpp, uint64_t start_epoch, uint64_t end_epoch) override;
+ virtual int get_config_key_val(std::string name, bufferlist* bl) override;
+ virtual int meta_list_keys_init(const DoutPrefixProvider *dpp, const std::string& section, const std::string& marker, void** phandle) override;
+ virtual int meta_list_keys_next(const DoutPrefixProvider *dpp, void* handle, int max, std::list<std::string>& keys, bool* truncated) override;
+ virtual void meta_list_keys_complete(void* handle) override;
+ virtual std::string meta_get_marker(void *handle) override;
+ virtual int meta_remove(const DoutPrefixProvider *dpp, std::string& metadata_key, optional_yield y) override;
+
+ virtual const RGWSyncModuleInstanceRef& get_sync_module() { return sync_module; }
+ virtual std::string get_host_id() { return ""; }
+
+ virtual std::unique_ptr<LuaManager> get_lua_manager() override;
+ virtual std::unique_ptr<RGWRole> get_role(std::string name,
+ std::string tenant,
+ std::string path="",
+ std::string trust_policy="",
+ std::string max_session_duration_str="",
+ std::multimap<std::string, std::string> tags={}) override;
+ virtual std::unique_ptr<RGWRole> get_role(const RGWRoleInfo& info) override;
+ virtual std::unique_ptr<RGWRole> get_role(std::string id) override;
+ virtual int get_roles(const DoutPrefixProvider *dpp,
+ optional_yield y,
+ const std::string& path_prefix,
+ const std::string& tenant,
+ std::vector<std::unique_ptr<RGWRole>>& roles) override;
+ virtual std::unique_ptr<RGWOIDCProvider> get_oidc_provider() override;
+ virtual int get_oidc_providers(const DoutPrefixProvider *dpp,
+ const std::string& tenant,
+ std::vector<std::unique_ptr<RGWOIDCProvider>>& providers) override;
+ virtual std::unique_ptr<Writer> get_append_writer(const DoutPrefixProvider *dpp,
+ optional_yield y,
+ rgw::sal::Object* obj,
+ const rgw_user& owner,
+ const rgw_placement_rule *ptail_placement_rule,
+ const std::string& unique_tag,
+ uint64_t position,
+ uint64_t *cur_accounted_size) override;
+ virtual std::unique_ptr<Writer> get_atomic_writer(const DoutPrefixProvider *dpp,
+ optional_yield y,
+ rgw::sal::Object* obj,
+ const rgw_user& owner,
+ const rgw_placement_rule *ptail_placement_rule,
+ uint64_t olh_epoch,
+ const std::string& unique_tag) override;
+ virtual const std::string& get_compression_type(const rgw_placement_rule& rule) override;
+ virtual bool valid_placement(const rgw_placement_rule& rule) override;
+
+ virtual void finalize(void) override;
+
+ virtual CephContext *ctx(void) override {
+ return cctx;
+ }
+
+ virtual void register_admin_apis(RGWRESTMgr* mgr) override { };
+
+ int open_idx(struct m0_uint128 *id, bool create, struct m0_idx *out);
+ void close_idx(struct m0_idx *idx) { m0_idx_fini(idx); }
+ int do_idx_op(struct m0_idx *, enum m0_idx_opcode opcode,
+ std::vector<uint8_t>& key, std::vector<uint8_t>& val, bool update = false);
+
+ int do_idx_next_op(struct m0_idx *idx,
+ std::vector<std::vector<uint8_t>>& key_vec,
+ std::vector<std::vector<uint8_t>>& val_vec);
+ int next_query_by_name(std::string idx_name, std::vector<std::string>& key_str_vec,
+ std::vector<bufferlist>& val_bl_vec,
+ std::string prefix="", std::string delim="");
+
+ void index_name_to_motr_fid(std::string iname, struct m0_uint128 *fid);
+ int open_motr_idx(struct m0_uint128 *id, struct m0_idx *idx);
+ int create_motr_idx_by_name(std::string iname);
+ int delete_motr_idx_by_name(std::string iname);
+ int do_idx_op_by_name(std::string idx_name, enum m0_idx_opcode opcode,
+ std::string key_str, bufferlist &bl, bool update=true);
+ int check_n_create_global_indices();
+ int store_access_key(const DoutPrefixProvider *dpp, optional_yield y, MotrAccessKey access_key);
+ int delete_access_key(const DoutPrefixProvider *dpp, optional_yield y, std::string access_key);
+ int store_email_info(const DoutPrefixProvider *dpp, optional_yield y, MotrEmailInfo& email_info);
+
+ int init_metadata_cache(const DoutPrefixProvider *dpp, CephContext *cct);
+ MotrMetaCache* get_obj_meta_cache() {return obj_meta_cache;}
+ MotrMetaCache* get_user_cache() {return user_cache;}
+ MotrMetaCache* get_bucket_inst_cache() {return bucket_inst_cache;}
+};
+
+struct obj_time_weight {
+ real_time mtime;
+ uint32_t zone_short_id;
+ uint64_t pg_ver;
+ bool high_precision;
+
+ obj_time_weight() : zone_short_id(0), pg_ver(0), high_precision(false) {}
+
+ bool compare_low_precision(const obj_time_weight& rhs) {
+ struct timespec l = ceph::real_clock::to_timespec(mtime);
+ struct timespec r = ceph::real_clock::to_timespec(rhs.mtime);
+ l.tv_nsec = 0;
+ r.tv_nsec = 0;
+ if (l > r) {
+ return false;
+ }
+ if (l < r) {
+ return true;
+ }
+ if (!zone_short_id || !rhs.zone_short_id) {
+ /* don't compare zone ids, if one wasn't provided */
+ return false;
+ }
+ if (zone_short_id != rhs.zone_short_id) {
+ return (zone_short_id < rhs.zone_short_id);
+ }
+ return (pg_ver < rhs.pg_ver);
+
+ }
+
+ bool operator<(const obj_time_weight& rhs) {
+ if (!high_precision || !rhs.high_precision) {
+ return compare_low_precision(rhs);
+ }
+ if (mtime > rhs.mtime) {
+ return false;
+ }
+ if (mtime < rhs.mtime) {
+ return true;
+ }
+ if (!zone_short_id || !rhs.zone_short_id) {
+ /* don't compare zone ids, if one wasn't provided */
+ return false;
+ }
+ if (zone_short_id != rhs.zone_short_id) {
+ return (zone_short_id < rhs.zone_short_id);
+ }
+ return (pg_ver < rhs.pg_ver);
+ }
+
+ void init(const real_time& _mtime, uint32_t _short_id, uint64_t _pg_ver) {
+ mtime = _mtime;
+ zone_short_id = _short_id;
+ pg_ver = _pg_ver;
+ }
+
+ void init(RGWObjState *state) {
+ mtime = state->mtime;
+ zone_short_id = state->zone_short_id;
+ pg_ver = state->pg_ver;
+ }
+};
+
+inline std::ostream& operator<<(std::ostream& out, const obj_time_weight &o) {
+ out << o.mtime;
+
+ if (o.zone_short_id != 0 || o.pg_ver != 0) {
+ out << "[zid=" << o.zone_short_id << ", pgv=" << o.pg_ver << "]";
+ }
+
+ return out;
+}
+
+} // namespace rgw::sal
diff --git a/src/rgw/rgw_sal_store.h b/src/rgw/rgw_sal_store.h
new file mode 100644
index 000000000..55b43e3d9
--- /dev/null
+++ b/src/rgw/rgw_sal_store.h
@@ -0,0 +1,419 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "rgw_sal.h"
+
+namespace rgw { namespace sal {
+
// Common base for all SAL driver implementations: supplies defaults that
// every backend shares; concrete stores override what they support.
class StoreDriver : public Driver {
  public:
    StoreDriver() {}
    virtual ~StoreDriver() = default;

    // Request ids are random 64-bit values -- no central sequence needed.
    virtual uint64_t get_new_req_id() override {
      return ceph::util::generate_random_number<uint64_t>();
    }

    // Pub/sub topic persistence is optional for a backend; these defaults
    // make an unsupporting store report "no topics".
    // NOTE(review): read returns -EOPNOTSUPP while write/remove return
    // -ENOENT -- looks inconsistent; confirm callers depend on this.
    int read_topics(const std::string& tenant, rgw_pubsub_topics& topics, RGWObjVersionTracker* objv_tracker,
        optional_yield y, const DoutPrefixProvider *dpp) override {return -EOPNOTSUPP;}
    int write_topics(const std::string& tenant, const rgw_pubsub_topics& topics, RGWObjVersionTracker* objv_tracker,
        optional_yield y, const DoutPrefixProvider *dpp) override {return -ENOENT;}
    int remove_topics(const std::string& tenant, RGWObjVersionTracker* objv_tracker,
        optional_yield y, const DoutPrefixProvider *dpp) override {return -ENOENT;}
};
+
// Store-independent User base: caches the RGWUserInfo record, its version
// tracker and attributes; the accessors below delegate to that cache.
class StoreUser : public User {
  protected:
    RGWUserInfo info;                  // cached user record
    RGWObjVersionTracker objv_tracker; // version of the stored record
    Attrs attrs;                       // user xattrs

  public:
    StoreUser() : info() {}
    StoreUser(const rgw_user& _u) : info() { info.user_id = _u; }
    StoreUser(const RGWUserInfo& _i) : info(_i) {}
    StoreUser(StoreUser& _o) = default;
    virtual ~StoreUser() = default;

    // Delegating accessors over the cached RGWUserInfo.
    virtual std::string& get_display_name() override { return info.display_name; }
    virtual const std::string& get_tenant() override { return info.user_id.tenant; }
    virtual void set_tenant(std::string& _t) override { info.user_id.tenant = _t; }
    virtual const std::string& get_ns() override { return info.user_id.ns; }
    virtual void set_ns(std::string& _ns) override { info.user_id.ns = _ns; }
    virtual void clear_ns() override { info.user_id.ns.clear(); }
    virtual const rgw_user& get_id() const override { return info.user_id; }
    virtual uint32_t get_type() const override { return info.type; }
    virtual int32_t get_max_buckets() const override { return info.max_buckets; }
    virtual const RGWUserCaps& get_caps() const override { return info.caps; }
    virtual RGWObjVersionTracker& get_version_tracker() override { return objv_tracker; }
    virtual Attrs& get_attrs() override { return attrs; }
    virtual void set_attrs(Attrs& _attrs) override { attrs = _attrs; }
    // A user is "empty" when no user id has been assigned.
    virtual bool empty() const override { return info.user_id.id.empty(); }
    virtual RGWUserInfo& get_info() override { return info; }
    virtual void print(std::ostream& out) const override { out << info.user_id; }

    friend class StoreBucket;
};
+
+class StoreBucket : public Bucket {
+ protected:
+ RGWBucketEnt ent;
+ RGWBucketInfo info;
+ User* owner = nullptr;
+ Attrs attrs;
+ obj_version bucket_version;
+ ceph::real_time mtime;
+
+ public:
+
+ StoreBucket() = default;
+ StoreBucket(User* _u) :
+ owner(_u) { }
+ StoreBucket(const rgw_bucket& _b) { ent.bucket = _b; info.bucket = _b; }
+ StoreBucket(const RGWBucketEnt& _e) : ent(_e) {
+ info.bucket = ent.bucket;
+ info.placement_rule = ent.placement_rule;
+ info.creation_time = ent.creation_time;
+ }
+ StoreBucket(const RGWBucketInfo& _i) : info(_i) {
+ ent.bucket = info.bucket;
+ ent.placement_rule = info.placement_rule;
+ ent.creation_time = info.creation_time;
+ }
+ StoreBucket(const rgw_bucket& _b, User* _u) :
+ owner(_u) { ent.bucket = _b; info.bucket = _b; }
+ StoreBucket(const RGWBucketEnt& _e, User* _u) : ent(_e), owner(_u) {
+ info.bucket = ent.bucket;
+ info.placement_rule = ent.placement_rule;
+ info.creation_time = ent.creation_time;
+ }
+ StoreBucket(const RGWBucketInfo& _i, User* _u) : info(_i), owner(_u) {
+ ent.bucket = info.bucket;
+ ent.placement_rule = info.placement_rule;
+ ent.creation_time = info.creation_time;
+ }
+ virtual ~StoreBucket() = default;
+
+ virtual Attrs& get_attrs(void) override { return attrs; }
+ virtual int set_attrs(Attrs a) override { attrs = a; return 0; }
+ virtual void set_owner(rgw::sal::User* _owner) override {
+ owner = _owner;
+ }
+ virtual User* get_owner(void) override { return owner; };
+ virtual ACLOwner get_acl_owner(void) override { return ACLOwner(info.owner); };
+ virtual bool empty() const override { return info.bucket.name.empty(); }
+ virtual const std::string& get_name() const override { return info.bucket.name; }
+ virtual const std::string& get_tenant() const override { return info.bucket.tenant; }
+ virtual const std::string& get_marker() const override { return info.bucket.marker; }
+ virtual const std::string& get_bucket_id() const override { return info.bucket.bucket_id; }
+ virtual size_t get_size() const override { return ent.size; }
+ virtual size_t get_size_rounded() const override { return ent.size_rounded; }
+ virtual uint64_t get_count() const override { return ent.count; }
+ virtual rgw_placement_rule& get_placement_rule() override { return info.placement_rule; }
+ virtual ceph::real_time& get_creation_time() override { return info.creation_time; }
+ virtual ceph::real_time& get_modification_time() override { return mtime; }
+ virtual obj_version& get_version() override { return bucket_version; }
+ virtual void set_version(obj_version &ver) override { bucket_version = ver; }
+ virtual bool versioned() override { return info.versioned(); }
+ virtual bool versioning_enabled() override { return info.versioning_enabled(); }
+ virtual rgw_bucket& get_key() override { return info.bucket; }
+ virtual RGWBucketInfo& get_info() override { return info; }
+ virtual void print(std::ostream& out) const override { out << info.bucket; }
+ virtual bool operator==(const Bucket& b) const override {
+ if (typeid(*this) != typeid(b)) {
+ return false;
+ }
+ const StoreBucket& sb = dynamic_cast<const StoreBucket&>(b);
+
+ return (info.bucket.tenant == sb.info.bucket.tenant) &&
+ (info.bucket.name == sb.info.bucket.name) &&
+ (info.bucket.bucket_id == sb.info.bucket.bucket_id);
+ }
+ virtual bool operator!=(const Bucket& b) const override {
+ if (typeid(*this) != typeid(b)) {
+ return false;
+ }
+ const StoreBucket& sb = dynamic_cast<const StoreBucket&>(b);
+
+ return (info.bucket.tenant != sb.info.bucket.tenant) ||
+ (info.bucket.name != sb.info.bucket.name) ||
+ (info.bucket.bucket_id != sb.info.bucket.bucket_id);
+ }
+
+ int read_topics(rgw_pubsub_bucket_topics& notifications, RGWObjVersionTracker* objv_tracker,
+ optional_yield y, const DoutPrefixProvider *dpp) override {return 0;}
+ int write_topics(const rgw_pubsub_bucket_topics& notifications, RGWObjVersionTracker* objv_tracker,
+ optional_yield y, const DoutPrefixProvider *dpp) override {return 0;}
+ int remove_topics(RGWObjVersionTracker* objv_tracker,
+ optional_yield y, const DoutPrefixProvider *dpp) override {return 0;}
+
+ friend class BucketList;
+ protected:
+ virtual void set_ent(RGWBucketEnt& _ent) { ent = _ent; info.bucket = ent.bucket; info.placement_rule = ent.placement_rule; }
+};
+
// Store-independent Object base: caches an RGWObjState (key, attrs, size,
// flags) and a non-owning pointer to the containing bucket; accessors
// delegate to that state.
class StoreObject : public Object {
  protected:
    RGWObjState state;          // cached object state (key, attrs, flags)
    Bucket* bucket = nullptr;   // non-owning
    bool delete_marker{false};  // true if this entry is a delete marker

  public:
    StoreObject() = default;
    StoreObject(const rgw_obj_key& _k)
    { state.obj.key = _k; }
    StoreObject(const rgw_obj_key& _k, Bucket* _b)
      : bucket(_b)
    { state.obj.init(_b->get_key(), _k); }
    StoreObject(const StoreObject& _o) = default;

    virtual ~StoreObject() = default;

    // Flag accessors over the cached state.
    virtual void set_atomic() override { state.is_atomic = true; }
    virtual bool is_atomic() override { return state.is_atomic; }
    virtual void set_prefetch_data() override { state.prefetch_data = true; }
    virtual bool is_prefetch_data() override { return state.prefetch_data; }
    virtual void set_compressed() override { state.compressed = true; }
    virtual bool is_compressed() override { return state.compressed; }
    // Reset the cached state while preserving the object key and the
    // atomic/prefetch/compressed flags (they describe intent, not cache).
    virtual void invalidate() override {
      rgw_obj obj = state.obj;
      bool is_atomic = state.is_atomic;
      bool prefetch_data = state.prefetch_data;
      bool compressed = state.compressed;

      state = RGWObjState();
      state.obj = obj;
      state.is_atomic = is_atomic;
      state.prefetch_data = prefetch_data;
      state.compressed = compressed;
    }

    virtual bool empty() const override { return state.obj.empty(); }
    virtual const std::string &get_name() const override { return state.obj.key.name; }
    virtual Attrs& get_attrs(void) override { return state.attrset; }
    virtual const Attrs& get_attrs(void) const override { return state.attrset; }
    virtual int set_attrs(Attrs a) override { state.attrset = a; state.has_attrs = true; return 0; }
    virtual bool has_attrs(void) override { return state.has_attrs; }
    virtual ceph::real_time get_mtime(void) const override { return state.mtime; }
    virtual uint64_t get_obj_size(void) const override { return state.size; }
    virtual Bucket* get_bucket(void) const override { return bucket; }
    virtual void set_bucket(Bucket* b) override { bucket = b; state.obj.bucket = b->get_key(); }
    virtual std::string get_hash_source(void) override { return state.obj.index_hash_source; }
    virtual void set_hash_source(std::string s) override { state.obj.index_hash_source = s; }
    virtual std::string get_oid(void) const override { return state.obj.key.get_oid(); }
    virtual bool get_delete_marker(void) override { return delete_marker; }
    virtual bool get_in_extra_data(void) override { return state.obj.is_in_extra_data(); }
    virtual void set_in_extra_data(bool i) override { state.obj.set_in_extra_data(i); }
    // Clamp/translate an HTTP range against obj_size (defined out of line).
    int range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end);
    virtual void set_obj_size(uint64_t s) override { state.size = s; }
    virtual void set_name(const std::string& n) override { state.obj.key = n; }
    virtual void set_key(const rgw_obj_key& k) override { state.obj.key = k; }
    virtual rgw_obj get_obj(void) const override { return state.obj; }
    virtual rgw_obj_key& get_key() override { return state.obj.key; }
    virtual void set_instance(const std::string &i) override { state.obj.key.set_instance(i); }
    virtual const std::string &get_instance() const override { return state.obj.key.instance; }
    virtual bool have_instance(void) override { return state.obj.key.have_instance(); }
    virtual void clear_instance() override { state.obj.key.instance.clear(); }
    virtual int transition_to_cloud(Bucket* bucket,
			   rgw::sal::PlacementTier* tier,
			   rgw_bucket_dir_entry& o,
			   std::set<std::string>& cloud_targets,
			   CephContext* cct,
			   bool update_object,
			   const DoutPrefixProvider* dpp,
			   optional_yield y) override {
      /* Return failure here, so stores which don't transition to cloud will
       * work with lifecycle */
      return -1;
    }

    virtual void print(std::ostream& out) const override {
      if (bucket)
        out << bucket << ":";
      out << state.obj.key;
    }
};
+
// Store-independent base for one part of a multipart upload; concrete
// stores populate `oid` with the backing object name.
class StoreMultipartPart : public MultipartPart {
  protected:
    std::string oid;  // backing object id of this part
  public:
    StoreMultipartPart() = default;
    virtual ~StoreMultipartPart() = default;
};
+
// Store-independent base for a multipart upload: owns the part map
// (keyed by part number) and the tracing context for the upload.
class StoreMultipartUpload : public MultipartUpload {
protected:
  Bucket* bucket;  // non-owning: bucket the upload targets
  std::map<uint32_t, std::unique_ptr<MultipartPart>> parts; // part# -> part
  jspan_context trace_ctx{false, false}; // default: tracing disabled
public:
  StoreMultipartUpload(Bucket* _bucket) : bucket(_bucket) {}
  virtual ~StoreMultipartUpload() = default;

  virtual std::map<uint32_t, std::unique_ptr<MultipartPart>>& get_parts() override { return parts; }

  virtual const jspan_context& get_trace() override { return trace_ctx; }

  // Format as "<meta>" or "<meta>:<upload_id>" for logging.
  virtual void print(std::ostream& out) const override {
    out << get_meta();
    if (!get_upload_id().empty())
      out << ":" << get_upload_id();
  }
};
+
+class StoreMPSerializer : public MPSerializer {
+protected:
+ bool locked;
+ std::string oid;
+public:
+ StoreMPSerializer() : locked(false) {}
+ StoreMPSerializer(std::string _oid) : locked(false), oid(_oid) {}
+ virtual ~StoreMPSerializer() = default;
+
+ virtual void clear_locked() override {
+ locked = false;
+ }
+ virtual bool is_locked() override { return locked; }
+
+ virtual void print(std::ostream& out) const override { out << oid; }
+};
+
+class StoreLCSerializer : public LCSerializer {
+protected:
+ std::string oid;
+public:
+ StoreLCSerializer() {}
+ StoreLCSerializer(std::string _oid) : oid(_oid) {}
+ virtual ~StoreLCSerializer() = default;
+
+ virtual void print(std::ostream& out) const override { out << oid; }
+};
+
// Store-independent Lifecycle base: provides concrete in-memory
// implementations of the lifecycle head and entry records.
class StoreLifecycle : public Lifecycle {
public:
  // Head of a lifecycle shard: start/rollover timestamps plus the marker
  // of the last processed bucket.
  struct StoreLCHead : LCHead {
    time_t start_date{0};
    time_t shard_rollover_date{0};
    std::string marker;

    StoreLCHead() = default;
    StoreLCHead(time_t _start_date, time_t _rollover_date, std::string& _marker) : start_date(_start_date), shard_rollover_date(_rollover_date), marker(_marker) {}

    // Copy from any LCHead implementation via its accessors.
    StoreLCHead& operator=(LCHead& _h) {
      start_date = _h.get_start_date();
      shard_rollover_date = _h.get_shard_rollover_date();
      marker = _h.get_marker();

      return *this;
    }

    virtual time_t& get_start_date() override { return start_date; }
    virtual void set_start_date(time_t _date) override { start_date = _date; }
    virtual std::string& get_marker() override { return marker; }
    virtual void set_marker(const std::string& _marker) override { marker = _marker; }
    virtual time_t& get_shard_rollover_date() override { return shard_rollover_date; }
    virtual void set_shard_rollover_date(time_t _date) override { shard_rollover_date = _date; }
  };

  // One lifecycle work entry: the bucket being processed, its backing
  // oid, when processing started and its current status code.
  struct StoreLCEntry : LCEntry {
    std::string bucket;
    std::string oid;
    uint64_t start_time{0};
    uint32_t status{0};

    StoreLCEntry() = default;
    StoreLCEntry(std::string& _bucket, uint64_t _time, uint32_t _status) : bucket(_bucket), start_time(_time), status(_status) {}
    StoreLCEntry(std::string& _bucket, std::string _oid, uint64_t _time, uint32_t _status) : bucket(_bucket), oid(_oid), start_time(_time), status(_status) {}
    StoreLCEntry(const StoreLCEntry& _e) = default;

    // Copy from any LCEntry implementation via its accessors.
    StoreLCEntry& operator=(LCEntry& _e) {
      bucket = _e.get_bucket();
      oid = _e.get_oid();
      start_time = _e.get_start_time();
      status = _e.get_status();

      return *this;
    }

    virtual std::string& get_bucket() override { return bucket; }
    virtual void set_bucket(const std::string& _bucket) override { bucket = _bucket; }
    virtual std::string& get_oid() override { return oid; }
    virtual void set_oid(const std::string& _oid) override { oid = _oid; }
    virtual uint64_t get_start_time() override { return start_time; }
    virtual void set_start_time(uint64_t _time) override { start_time = _time; }
    virtual uint32_t get_status() override { return status; }
    virtual void set_status(uint32_t _status) override { status = _status; }
    virtual void print(std::ostream& out) const override {
      out << bucket << ":" << oid << ":" << start_time << ":" << status;
    }
  };

  StoreLifecycle() = default;
  virtual ~StoreLifecycle() = default;

  // Factory for the in-memory entry type above.
  virtual std::unique_ptr<LCEntry> get_entry() override {
      return std::make_unique<StoreLCEntry>();
  }
  using Lifecycle::get_entry;
};
+
// Store-independent Notification base: records the object the event is
// about, the (optional) source object for copy-type events, and the
// event type. Pointers are non-owning.
class StoreNotification : public Notification {
protected:
  Object* obj;       // object the event refers to (non-owning)
  Object* src_obj;   // source object, e.g. for copies (non-owning)
  rgw::notify::EventType event_type;

  public:
    StoreNotification(Object* _obj, Object* _src_obj, rgw::notify::EventType _type)
      : obj(_obj), src_obj(_src_obj), event_type(_type)
    {}

    virtual ~StoreNotification() = default;
};
+
// Store-independent Writer base: keeps the log prefix provider for
// derived writers. The optional_yield parameter is accepted for
// interface symmetry but not stored here.
class StoreWriter : public Writer {
protected:
  const DoutPrefixProvider* dpp;  // non-owning, used for logging

public:
  StoreWriter(const DoutPrefixProvider *_dpp, optional_yield y) : dpp(_dpp) {}
  virtual ~StoreWriter() = default;

};
+
// Store-independent base for placement tiers; exists so backends share a
// common concrete root under the PlacementTier interface.
class StorePlacementTier : public PlacementTier {
public:
  virtual ~StorePlacementTier() = default;
};
+
// Store-independent base for zonegroup handles.
class StoreZoneGroup : public ZoneGroup {
public:
  virtual ~StoreZoneGroup() = default;
};
+
// Store-independent base for zone handles.
class StoreZone : public Zone {
  public:
    virtual ~StoreZone() = default;
};
+
// Store-independent base for Lua script managers.
class StoreLuaManager : public LuaManager {
public:
  virtual ~StoreLuaManager() = default;
};
+
+} } // namespace rgw::sal
diff --git a/src/rgw/rgw_signal.cc b/src/rgw/rgw_signal.cc
new file mode 100644
index 000000000..4bb29d0df
--- /dev/null
+++ b/src/rgw/rgw_signal.cc
@@ -0,0 +1,91 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "rgw_signal.h"
+#include "global/signal_handler.h"
+#include "common/safe_io.h"
+#include "common/errno.h"
+#include "rgw_main.h"
+#include "rgw_log.h"
+
+#ifdef HAVE_SYS_PRCTL_H
+#include <sys/prctl.h>
+#endif
+
+#define dout_subsys ceph_subsys_rgw
+#define dout_context g_ceph_context
+
+
+static int signal_fd[2] = {0, 0};
+
+namespace rgw {
+namespace signal {
+
// SIGHUP handler: reopen the ops log file (if configured) and the cluster
// log files so external log rotation can take effect.
void sighup_handler(int signum) {
  if (rgw::AppMain::ops_log_file != nullptr) {
    rgw::AppMain::ops_log_file->reopen();
  }
  g_ceph_context->reopen_logs();
} /* sighup_handler */
+
// Request shutdown by writing a token into the socketpair that
// wait_shutdown() is blocked on.
// NOTE(review): invoked from signal context; write(2) is
// async-signal-safe, but the derr logging on failure is not -- confirm
// this is acceptable here.
void signal_shutdown()
{
  int val = 0;
  int ret = write(signal_fd[0], (char *)&val, sizeof(val));
  if (ret < 0) {
    derr << "ERROR: " << __func__ << ": write() returned "
         << cpp_strerror(errno) << dendl;
  }
} /* signal_shutdown */
+
// Block until signal_shutdown() writes its token to the other end of the
// socketpair; returns (with a logged error) if the read fails.
void wait_shutdown()
{
  int val;
  int r = safe_read_exact(signal_fd[1], &val, sizeof(val));
  if (r < 0) {
    derr << "safe_read_exact returned with error" << dendl;
  }
} /* wait_shutdown */
+
// Create the socketpair that carries the shutdown request from the signal
// handler to the waiting thread. Returns 0 on success, -1 on error (per
// socketpair(2)).
int signal_fd_init()
{
  return socketpair(AF_UNIX, SOCK_STREAM, 0, signal_fd);
} /* signal_fd_init */
+
// Close both ends of the shutdown socketpair created by signal_fd_init().
void signal_fd_finalize()
{
  close(signal_fd[0]);
  close(signal_fd[1]);
} /* signal_fd_finalize */
+
// Termination handler: kick off an orderly shutdown (unless invoked for
// SIGUSR1) and arm an alarm as a hard deadline in case the orderly path
// stalls.
void handle_sigterm(int signum)
{
  dout(1) << __func__ << dendl;

  // send a signal to make fcgi's accept(2) wake up. unfortunately the
  // initial signal often isn't sufficient because we race with accept's
  // check of the flag set by ShutdownPending() above.
  if (signum != SIGUSR1) {
    signal_shutdown();

    // safety net in case we get stuck doing an orderly shutdown.
    uint64_t secs = g_ceph_context->_conf->rgw_exit_timeout_secs;
    if (secs)
      alarm(secs);
    // NOTE(review): logged even when secs == 0 and no alarm was armed.
    dout(1) << __func__ << " set alarm for " << secs << dendl;
  }
} /* handle_sigterm */
+
+}} /* namespace rgw::signal */
diff --git a/src/rgw/rgw_signal.h b/src/rgw/rgw_signal.h
new file mode 100644
index 000000000..68fc4f614
--- /dev/null
+++ b/src/rgw/rgw_signal.h
@@ -0,0 +1,31 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2022 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+
namespace rgw {
namespace signal {

// Signal plumbing for radosgw: a socketpair hands the shutdown request
// from the (async-signal-safe) handlers to the waiting main thread.
void signal_shutdown();
void wait_shutdown();
int signal_fd_init();
void signal_fd_finalize();
// (duplicate handle_sigterm declaration removed)
void handle_sigterm(int signum);
void sighup_handler(int signum);

} // namespace signal
} // namespace rgw
diff --git a/src/rgw/rgw_string.cc b/src/rgw/rgw_string.cc
new file mode 100644
index 000000000..7be82f854
--- /dev/null
+++ b/src/rgw/rgw_string.cc
@@ -0,0 +1,45 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_string.h"
+
// Exact (case-sensitive) single-character equality.
static bool char_eq(char lhs, char rhs)
{
  return lhs == rhs;
}
+
// Case-insensitive single-character equality.
// Fixed: chars are widened through unsigned char before tolower();
// passing a negative plain char to tolower() is undefined behavior.
static bool ci_char_eq(char c1, char c2)
{
  return tolower(static_cast<unsigned char>(c1)) ==
      tolower(static_cast<unsigned char>(c2));
}
+
+bool match_wildcards(std::string_view pattern, std::string_view input,
+ uint32_t flags)
+{
+ const auto eq = (flags & MATCH_CASE_INSENSITIVE) ? &ci_char_eq : &char_eq;
+
+ auto it1 = pattern.begin();
+ auto it2 = input.begin();
+ while (true) {
+ if (it1 == pattern.end())
+ return it2 == input.end();
+ if (*it1 == '*') {
+ if (it1 + 1 == pattern.end())
+ return true;
+ if (it2 == input.end() || eq(*(it1 + 1), *it2))
+ ++it1;
+ else
+ ++it2;
+ continue;
+ }
+ if (it2 == input.end())
+ return false;
+ if (*it1 == '?' || eq(*it1, *it2)) {
+ ++it1;
+ ++it2;
+ continue;
+ }
+ return false;
+ }
+ return false;
+}
diff --git a/src/rgw/rgw_string.h b/src/rgw/rgw_string.h
new file mode 100644
index 000000000..e58a356f4
--- /dev/null
+++ b/src/rgw/rgw_string.h
@@ -0,0 +1,235 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
#include <errno.h>
#include <limits.h>
#include <stdint.h>
#include <stdlib.h>

#include <stdexcept>
#include <string>
#include <string_view>

#include <boost/container/small_vector.hpp>
+
// Case-insensitive "less than" comparator for std::string keys (e.g. for
// std::map/std::set ordering).
struct ltstr_nocase
{
  bool operator()(const std::string& s1, const std::string& s2) const
  {
    const int cmp = strcasecmp(s1.c_str(), s2.c_str());
    return cmp < 0;
  }
};
+
// Case-insensitive comparison helpers with strcasecmp() semantics
// (negative / zero / positive result).

static inline int stringcasecmp(const std::string& s1, const std::string& s2)
{
  const int rc = strcasecmp(s1.c_str(), s2.c_str());
  return rc;
}

static inline int stringcasecmp(const std::string& s1, const char *s2)
{
  const int rc = strcasecmp(s1.c_str(), s2);
  return rc;
}

// Compare at most `size` chars of s1 starting at `ofs` against s2.
// NOTE(review): assumes 0 <= ofs <= s1.size(); callers must guarantee it.
static inline int stringcasecmp(const std::string& s1, int ofs, int size, const std::string& s2)
{
  const int rc = strncasecmp(s1.c_str() + ofs, s2.c_str(), size);
  return rc;
}
+
// Parse a base-10 signed 64-bit integer from s into *val.
// Returns 0 on success, -EINVAL on trailing garbage or out-of-range input.
// Fixed: overflow/underflow is now detected via errno == ERANGE and the
// LLONG_MIN sentinel; the old code only rejected LLONG_MAX, silently
// accepting clamped underflows. LLONG_MAX stays rejected for backward
// compatibility.
static inline int stringtoll(const std::string& s, int64_t *val)
{
  char *end;

  errno = 0;
  long long result = strtoll(s.c_str(), &end, 10);
  if (errno == ERANGE || result == LLONG_MAX || result == LLONG_MIN)
    return -EINVAL;

  if (*end)
    return -EINVAL;

  *val = (int64_t)result;

  return 0;
}
+
// Parse a base-10 unsigned 64-bit integer from s into *val.
// Returns 0 on success, -EINVAL on trailing garbage or out-of-range input.
// Fixed: negative input is rejected explicitly (strtoull() silently wraps
// "-5" to a huge value), and overflow is detected via errno == ERANGE.
// ULLONG_MAX stays rejected for backward compatibility.
static inline int stringtoull(const std::string& s, uint64_t *val)
{
  char *end;

  if (s.find('-') != std::string::npos)
    return -EINVAL;

  errno = 0;
  unsigned long long result = strtoull(s.c_str(), &end, 10);
  if (errno == ERANGE || result == ULLONG_MAX)
    return -EINVAL;

  if (*end)
    return -EINVAL;

  *val = (uint64_t)result;

  return 0;
}
+
// Parse a base-10 signed 32-bit integer from s into *val.
// Returns 0 on success, -EINVAL on trailing garbage or out-of-range input.
// Fixed: (a) range errors detected via errno == ERANGE and LONG_MIN, not
// just the LONG_MAX sentinel; (b) on LP64 platforms long is 64-bit, so
// values outside int32_t range are rejected instead of being silently
// truncated by the cast.
static inline int stringtol(const std::string& s, int32_t *val)
{
  char *end;

  errno = 0;
  long result = strtol(s.c_str(), &end, 10);
  if (errno == ERANGE || result == LONG_MAX || result == LONG_MIN)
    return -EINVAL;

  if (*end)
    return -EINVAL;

  if (result > INT32_MAX || result < INT32_MIN)
    return -EINVAL;

  *val = (int32_t)result;

  return 0;
}
+
// Parse a base-10 unsigned 32-bit integer from s into *val.
// Returns 0 on success, -EINVAL on trailing garbage or out-of-range input.
// Fixed: (a) negative input rejected explicitly (strtoul() silently wraps
// it); (b) range errors detected via errno == ERANGE; (c) on LP64
// platforms values above UINT32_MAX are rejected instead of silently
// truncated. ULONG_MAX stays rejected for backward compatibility.
static inline int stringtoul(const std::string& s, uint32_t *val)
{
  char *end;

  if (s.find('-') != std::string::npos)
    return -EINVAL;

  errno = 0;
  unsigned long result = strtoul(s.c_str(), &end, 10);
  if (errno == ERANGE || result == ULONG_MAX)
    return -EINVAL;

  if (*end)
    return -EINVAL;

  if (result > UINT32_MAX)
    return -EINVAL;

  *val = (uint32_t)result;

  return 0;
}
+
/* A converter between std::string_view and null-terminated C-strings.
 * It copies memory while trying to utilize the local memory instead of
 * issuing dynamic allocations: inputs up to N chars (default 128) stay in
 * the small_vector's inline buffer. The returned buffer owns its storage,
 * so it remains valid after the viewed string goes away. */
template<std::size_t N = 128>
static inline boost::container::small_vector<char, N>
sview2cstr(const std::string_view& sv)
{
  boost::container::small_vector<char, N> cstr;
  // +1 for the trailing NUL terminator
  cstr.reserve(sv.size() + sizeof('\0'));

  cstr.assign(std::begin(sv), std::end(sv));
  cstr.push_back('\0');

  return cstr;
}
+
/* std::strlen() isn't guaranteed to be computable at compile-time. Although
 * newer GCCs actually do that, Clang doesn't. Please be aware this function
 * IS NOT A DROP-IN REPLACEMENT FOR STRLEN -- it returns a different result
 * for strings having \0 in the middle (it always yields the array length
 * minus the implicit terminator, i.e. N - 1). */
template<size_t N>
static inline constexpr size_t sarrlen(const char (&arr)[N]) {
  return N - 1;
}
+
// Implementation helpers for string_size()/string_cat_reserve()/
// string_join_reserve() below; not part of the public interface.
namespace detail {

// variadic sum() to add up string lengths for reserve()
static inline constexpr size_t sum() { return 0; }
template <typename... Args>
constexpr size_t sum(size_t v, Args... args) { return v + sum(args...); }

// traits for string_size(): primary template covers any type with a
// .size() member (std::string, std::string_view, ...)
template <typename T>
struct string_traits {
  static constexpr size_t size(const T& s) { return s.size(); }
};
// specializations for char*/const char* use strlen()
template <>
struct string_traits<const char*> {
  static size_t size(const char* s) { return std::strlen(s); }
};
template <>
struct string_traits<char*> : string_traits<const char*> {};
// constexpr specializations for char[]/const char[]: walk the array for
// the first NUL; throwing in a constexpr context forces a compile error
// for unterminated string constants.
template <std::size_t N>
struct string_traits<const char[N]> {
  static constexpr size_t size_(const char* s, size_t i) {
    return i < N ? (*(s + i) == '\0' ? i : size_(s, i + 1))
        : throw std::invalid_argument("Unterminated string constant.");
  }
  static constexpr size_t size(const char(&s)[N]) { return size_(s, 0); }
};
template <std::size_t N>
struct string_traits<char[N]> : string_traits<const char[N]> {};

// helpers for string_cat_reserve(): append each argument in turn
// (base case terminates the recursion)
static inline void append_to(std::string& s) {}
template <typename... Args>
void append_to(std::string& s, const std::string_view& v, const Args&... args)
{
  s.append(v.begin(), v.end());
  append_to(s, args...);
}

// helpers for string_join_reserve(): join() appends the first value
// without a delimiter, join_next() prefixes each subsequent value with it
static inline void join_next(std::string& s, const std::string_view& d) {}
template <typename... Args>
void join_next(std::string& s, const std::string_view& d,
               const std::string_view& v, const Args&... args)
{
  s.append(d.begin(), d.end());
  s.append(v.begin(), v.end());
  join_next(s, d, args...);
}

static inline void join(std::string& s, const std::string_view& d) {}
template <typename... Args>
void join(std::string& s, const std::string_view& d,
          const std::string_view& v, const Args&... args)
{
  s.append(v.begin(), v.end());
  join_next(s, d, args...);
}

} // namespace detail
+
/// return the length of a c string, string literal, or string type;
/// dispatches through detail::string_traits, so it is constexpr for
/// literals and .size()-based types but not for runtime char pointers
template <typename T>
constexpr size_t string_size(const T& s)
{
  return detail::string_traits<T>::size(s);
}
+
+/// concatenates the given string arguments, returning as a std::string that
+/// gets preallocated with reserve()
+template <typename... Args>
+std::string string_cat_reserve(const Args&... args)
+{
+ size_t total_size = detail::sum(string_size(args)...);
+ std::string result;
+ result.reserve(total_size);
+ detail::append_to(result, args...);
+ return result;
+}
+
+/// joins the given string arguments with a delimiter, returning as a
+/// std::string that gets preallocated with reserve()
+template <typename... Args>
+std::string string_join_reserve(const std::string_view& delim,
+ const Args&... args)
+{
+ size_t delim_size = delim.size() * std::max<ssize_t>(0, sizeof...(args) - 1);
+ size_t total_size = detail::sum(string_size(args)...) + delim_size;
+ std::string result;
+ result.reserve(total_size);
+ detail::join(result, delim, args...);
+ return result;
+}
+template <typename... Args>
+std::string string_join_reserve(char delim, const Args&... args)
+{
+ return string_join_reserve(std::string_view{&delim, 1}, args...);
+}
+
+
/// use case-insensitive comparison in match_wildcards()
static constexpr uint32_t MATCH_CASE_INSENSITIVE = 0x01;

/// attempt to match the given input string with the pattern, which may contain
/// the wildcard characters * and ? ('*' matches any run of characters,
/// '?' matches exactly one); defined in rgw_string.cc
extern bool match_wildcards(std::string_view pattern,
                            std::string_view input,
                            uint32_t flags = 0);
diff --git a/src/rgw/rgw_sts.cc b/src/rgw/rgw_sts.cc
new file mode 100644
index 000000000..b55283442
--- /dev/null
+++ b/src/rgw/rgw_sts.cc
@@ -0,0 +1,469 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <errno.h>
+#include <ctime>
+#include <regex>
+#include <boost/format.hpp>
+#include <boost/algorithm/string/replace.hpp>
+
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/ceph_json.h"
+#include "common/ceph_time.h"
+#include "auth/Crypto.h"
+#include "include/ceph_fs.h"
+#include "common/iso_8601.h"
+
+#include "include/types.h"
+#include "rgw_string.h"
+
+#include "rgw_b64.h"
+#include "rgw_common.h"
+#include "rgw_tools.h"
+#include "rgw_role.h"
+#include "rgw_user.h"
+#include "rgw_iam_policy.h"
+#include "rgw_sts.h"
+#include "rgw_sal.h"
+#include "rgw_sal_rados.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+namespace STS {
+
+// serialize the generated credentials as JSON; the emission order here is
+// the order the fields appear in the STS response body
+void Credentials::dump(Formatter *f) const
+{
+  encode_json("AccessKeyId", accessKeyId , f);
+  encode_json("Expiration", expiration , f);
+  encode_json("SecretAccessKey", secretAccessKey , f);
+  encode_json("SessionToken", sessionToken , f);
+}
+
+/*
+ * Generate a fresh set of temporary credentials plus the matching encrypted
+ * session token.
+ *
+ * The token is a SessionToken (authentication keys + authorization context)
+ * that is ceph-encoded, AES-encrypted with the configured rgw_sts_key and
+ * base64-encoded, so later requests can be authorized from the token alone.
+ *
+ * Returns 0 on success or a negative error code (bad/missing crypto handler
+ * or key, or an encryption failure).
+ */
+int Credentials::generateCredentials(const DoutPrefixProvider *dpp,
+                          CephContext* cct,
+                          const uint64_t& duration,
+                          const boost::optional<std::string>& policy,
+                          const boost::optional<std::string>& roleId,
+                          const boost::optional<std::string>& role_session,
+                          const boost::optional<std::vector<std::string>>& token_claims,
+                          const boost::optional<std::vector<std::pair<std::string,std::string>>>& session_princ_tags,
+                          boost::optional<rgw_user> user,
+                          rgw::auth::Identity* identity)
+{
+  uuid_d accessKey, secretKey;
+  char accessKeyId_str[MAX_ACCESS_KEY_LEN], secretAccessKey_str[MAX_SECRET_KEY_LEN];
+
+  //AccessKeyId
+  gen_rand_alphanumeric_plain(cct, accessKeyId_str, sizeof(accessKeyId_str));
+  accessKeyId = accessKeyId_str;
+
+  //SecretAccessKey
+  gen_rand_alphanumeric_upper(cct, secretAccessKey_str, sizeof(secretAccessKey_str));
+  secretAccessKey = secretAccessKey_str;
+
+  //Expiration: now + requested duration, rendered as ISO 8601
+  real_clock::time_point t = real_clock::now();
+  real_clock::time_point exp = t + std::chrono::seconds(duration);
+  expiration = ceph::to_iso_8601(exp);
+
+  //Session Token - Encrypt using AES
+  auto* cryptohandler = cct->get_crypto_handler(CEPH_CRYPTO_AES);
+  if (! cryptohandler) {
+    ldpp_dout(dpp, 0) << "ERROR: No AES crypto handler found !" << dendl;
+    return -EINVAL;
+  }
+  string secret_s = cct->_conf->rgw_sts_key;
+  buffer::ptr secret(secret_s.c_str(), secret_s.length());
+  int ret = 0;
+  if (ret = cryptohandler->validate_secret(secret); ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: Invalid rgw sts key, please ensure its length is 16" << dendl;
+    return ret;
+  }
+  string error;
+  std::unique_ptr<CryptoKeyHandler> keyhandler(cryptohandler->get_key_handler(secret, error));
+  if (! keyhandler) {
+    ldpp_dout(dpp, 0) << "ERROR: No Key handler found !" << dendl;
+    return -EINVAL;
+  }
+  error.clear();
+  //Storing policy and roleId as part of token, so that they can be extracted
+  // from the token itself for policy evaluation.
+  SessionToken token;
+  //authentication info
+  token.access_key_id = accessKeyId;
+  token.secret_access_key = secretAccessKey;
+  token.expiration = expiration;
+  token.issued_at = ceph::to_iso_8601(t);
+
+  //Authorization info
+  if (policy)
+    token.policy = *policy;
+  else
+    token.policy = {};
+
+  if (roleId)
+    token.roleId = *roleId;
+  else
+    token.roleId = {};
+
+  if (user)
+    token.user = *user;
+  else {
+    rgw_user u({}, {}, {});
+    token.user = u;
+  }
+
+  if (token_claims) {
+    // the source optional is const, so this is necessarily a copy
+    token.token_claims = *token_claims;
+  }
+
+  if (identity) {
+    token.acct_name = identity->get_acct_name();
+    token.perm_mask = identity->get_perm_mask();
+    token.is_admin = identity->is_admin_of(token.user);
+    token.acct_type = identity->get_identity_type();
+  } else {
+    //no authenticated identity: this token authorizes via the role only
+    token.acct_name = {};
+    token.perm_mask = 0;
+    token.is_admin = false;
+    token.acct_type = TYPE_ROLE;
+    //guard the optional: dereferencing an empty role_session is undefined
+    if (role_session) {
+      token.role_session = *role_session;
+    }
+  }
+
+  if (session_princ_tags) {
+    // the source optional is const, so this is necessarily a copy
+    token.principal_tags = *session_princ_tags;
+  }
+  buffer::list input, enc_output;
+  encode(token, input);
+
+  if (ret = keyhandler->encrypt(input, enc_output, &error); ret < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: Encrypting session token returned an error !" << dendl;
+    return ret;
+  }
+
+  //base64-encode and NUL-terminate so the buffer can be copied out as a
+  //C string into sessionToken
+  bufferlist encoded_op;
+  enc_output.encode_base64(encoded_op);
+  encoded_op.append('\0');
+  sessionToken = encoded_op.c_str();
+
+  return ret;
+}
+
+// serialize the AssumedRoleUser element of an AssumeRole response
+void AssumedRoleUser::dump(Formatter *f) const
+{
+  encode_json("Arn", arn , f);
+  encode_json("AssumeRoleId", assumeRoleId , f);
+}
+
+/*
+ * Build the AssumedRoleUser section of an AssumeRole response.
+ *
+ * The ARN is derived from the role ARN by substituting "assumed-role" for
+ * "role" in the resource and appending "/<roleSessionName>"; the id is
+ * "<roleId>:<roleSessionName>".  Always returns 0.
+ */
+int AssumedRoleUser::generateAssumedRoleUser(CephContext* cct,
+                                             rgw::sal::Driver* driver,
+                                             const string& roleId,
+                                             const rgw::ARN& roleArn,
+                                             const string& roleSessionName)
+{
+  // roleArn is const, so the previous std::move here degraded to a copy
+  // anyway; make the copy explicit
+  string resource = roleArn.resource;
+  boost::replace_first(resource, "role", "assumed-role");
+  resource.append("/");
+  resource.append(roleSessionName);
+
+  rgw::ARN assumed_role_arn(rgw::Partition::aws,
+                            rgw::Service::sts,
+                            "", roleArn.account, resource);
+  arn = assumed_role_arn.to_string();
+
+  //Assumeroleid = roleid:rolesessionname
+  assumeRoleId = roleId + ":" + roleSessionName;
+
+  return 0;
+}
+
+// Store the common AssumeRole* request fields.  An empty duration string
+// falls back to DEFAULT_DURATION_IN_SECS; otherwise it is parsed with
+// strict_strtoll, and any parse error lands in err_msg for validate_input()
+// to report later.
+AssumeRoleRequestBase::AssumeRoleRequestBase( CephContext* cct,
+                          const string& duration,
+                          const string& iamPolicy,
+                          const string& roleArn,
+                          const string& roleSessionName)
+  : cct(cct), iamPolicy(iamPolicy), roleArn(roleArn), roleSessionName(roleSessionName)
+{
+  // the lower duration bound is configurable; the upper bound is supplied
+  // per-role later via setMaxDuration()
+  MIN_DURATION_IN_SECS = cct->_conf->rgw_sts_min_session_duration;
+  if (duration.empty()) {
+    this->duration = DEFAULT_DURATION_IN_SECS;
+  } else {
+    this->duration = strict_strtoll(duration.c_str(), 10, &this->err_msg);
+  }
+}
+
+/*
+ * Validate the fields shared by all AssumeRole* requests: duration bounds,
+ * inline policy size, role ARN size, and the role session name's size and
+ * character set.  Returns 0 on success or a negative error code.
+ */
+int AssumeRoleRequestBase::validate_input(const DoutPrefixProvider *dpp) const
+{
+  // err_msg is set by the constructor when the duration string failed to
+  // parse; the old log text claimed the opposite ("error message is empty")
+  if (!err_msg.empty()) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to parse duration: " << err_msg << dendl;
+    return -EINVAL;
+  }
+
+  if (duration < MIN_DURATION_IN_SECS ||
+      duration > MAX_DURATION_IN_SECS) {
+    ldpp_dout(dpp, 0) << "ERROR: Incorrect value of duration: " << duration << dendl;
+    return -EINVAL;
+  }
+
+  if (! iamPolicy.empty() &&
+      (iamPolicy.size() < MIN_POLICY_SIZE || iamPolicy.size() > MAX_POLICY_SIZE)) {
+    ldpp_dout(dpp, 0) << "ERROR: Incorrect size of iamPolicy: " << iamPolicy.size() << dendl;
+    return -ERR_PACKED_POLICY_TOO_LARGE;
+  }
+
+  if (! roleArn.empty() &&
+      (roleArn.size() < MIN_ROLE_ARN_SIZE || roleArn.size() > MAX_ROLE_ARN_SIZE)) {
+    ldpp_dout(dpp, 0) << "ERROR: Incorrect size of roleArn: " << roleArn.size() << dendl;
+    return -EINVAL;
+  }
+
+  if (! roleSessionName.empty()) {
+    if (roleSessionName.size() < MIN_ROLE_SESSION_SIZE || roleSessionName.size() > MAX_ROLE_SESSION_SIZE) {
+      ldpp_dout(dpp, 0) << "ERROR: Either role session name is empty or role session size is incorrect: " << roleSessionName.size() << dendl;
+      return -EINVAL;
+    }
+
+    // restrict session names to the allowed character set
+    std::regex regex_roleSession("[A-Za-z0-9_=,.@-]+");
+    if (! std::regex_match(roleSessionName, regex_roleSession)) {
+      ldpp_dout(dpp, 0) << "ERROR: Role session name is incorrect: " << roleSessionName << dendl;
+      return -EINVAL;
+    }
+  }
+
+  return 0;
+}
+
+// Check the optional provider id's length, then run the common base checks.
+int AssumeRoleWithWebIdentityRequest::validate_input(const DoutPrefixProvider *dpp) const
+{
+  const auto id_len = providerId.length();
+  if (id_len > 0 &&
+      (id_len < MIN_PROVIDER_ID_LEN || id_len > MAX_PROVIDER_ID_LEN)) {
+    ldpp_dout(dpp, 0) << "ERROR: Either provider id is empty or provider id length is incorrect: " << id_len << dendl;
+    return -EINVAL;
+  }
+  return AssumeRoleRequestBase::validate_input(dpp);
+}
+
+/*
+ * Validate AssumeRole-specific inputs (externalId, MFA serial number and
+ * token code) before the common base checks.  Returns 0 or a negative
+ * error code.
+ */
+int AssumeRoleRequest::validate_input(const DoutPrefixProvider *dpp) const
+{
+  if (! externalId.empty()) {
+    if (externalId.length() < MIN_EXTERNAL_ID_LEN ||
+        externalId.length() > MAX_EXTERNAL_ID_LEN) {
+      ldpp_dout(dpp, 0) << "ERROR: Either external id is empty or external id length is incorrect: " << externalId.length() << dendl;
+      return -EINVAL;
+    }
+
+    std::regex regex_externalId("[A-Za-z0-9_=,.@:/-]+");
+    if (! std::regex_match(externalId, regex_externalId)) {
+      ldpp_dout(dpp, 0) << "ERROR: Invalid external Id: " << externalId << dendl;
+      return -EINVAL;
+    }
+  }
+  if (! serialNumber.empty()){
+    if (serialNumber.size() < MIN_SERIAL_NUMBER_SIZE || serialNumber.size() > MAX_SERIAL_NUMBER_SIZE) {
+      ldpp_dout(dpp, 0) << "Either serial number is empty or serial number length is incorrect: " << serialNumber.size() << dendl;
+      return -EINVAL;
+    }
+
+    std::regex regex_serialNumber("[A-Za-z0-9_=/:,.@-]+");
+    if (! std::regex_match(serialNumber, regex_serialNumber)) {
+      ldpp_dout(dpp, 0) << "Incorrect serial number: " << serialNumber << dendl;
+      return -EINVAL;
+    }
+  }
+  // reject token codes whose length differs from TOKEN_CODE_SIZE; the
+  // previous condition used ==, which rejected exactly the valid length
+  if (! tokenCode.empty() && tokenCode.size() != TOKEN_CODE_SIZE) {
+    ldpp_dout(dpp, 0) << "Either token code is empty or token code size is invalid: " << tokenCode.size() << dendl;
+    return -EINVAL;
+  }
+
+  return AssumeRoleRequestBase::validate_input(dpp);
+}
+
+// Look up the role named by an AssumeRole ARN and cache it in this->role.
+// Returns {0, role} on success; {-EINVAL, nullptr} for an unparseable ARN,
+// {-ERR_NO_ROLE_FOUND, nullptr} when the role does not exist, {-EACCES,
+// nullptr} when the ARN's path does not match the stored role path, or any
+// other error from loading the role.
+std::tuple<int, rgw::sal::RGWRole*> STSService::getRoleInfo(const DoutPrefixProvider *dpp,
+                                                            const string& arn,
+                                                            optional_yield y)
+{
+  if (auto r_arn = rgw::ARN::parse(arn); r_arn) {
+    // the role name is the last path component of the ARN resource
+    auto pos = r_arn->resource.find_last_of('/');
+    string roleName = r_arn->resource.substr(pos + 1);
+    std::unique_ptr<rgw::sal::RGWRole> role = driver->get_role(roleName, r_arn->account);
+    if (int ret = role->get(dpp, y); ret < 0) {
+      if (ret == -ENOENT) {
+        ldpp_dout(dpp, 0) << "Role doesn't exist: " << roleName << dendl;
+        ret = -ERR_NO_ROLE_FOUND;
+      }
+      return make_tuple(ret, nullptr);
+    } else {
+      // reconstruct the path portion between the first and last '/' of the
+      // resource and require it to match the stored role path exactly
+      auto path_pos = r_arn->resource.find('/');
+      string path;
+      if (path_pos == pos) {
+        // no intermediate components: the role lives at the root path
+        path = "/";
+      } else {
+        path = r_arn->resource.substr(path_pos, ((pos - path_pos) + 1));
+      }
+      string r_path = role->get_path();
+      if (path != r_path) {
+        ldpp_dout(dpp, 0) << "Invalid Role ARN: Path in ARN does not match with the role path: " << path << " " << r_path << dendl;
+        return make_tuple(-EACCES, nullptr);
+      }
+      // keep the role alive for the subsequent assumeRole* call
+      this->role = std::move(role);
+      return make_tuple(0, this->role.get());
+    }
+  } else {
+    ldpp_dout(dpp, 0) << "Invalid role arn: " << arn << dendl;
+    return make_tuple(-EINVAL, nullptr);
+  }
+}
+
+/*
+ * Handle AssumeRoleWithWebIdentity.  Assumes getRoleInfo() has already
+ * populated this->role -- TODO confirm with callers; the ARN is re-parsed
+ * here only to validate the request's RoleArn string.
+ */
+AssumeRoleWithWebIdentityResponse STSService::assumeRoleWithWebIdentity(const DoutPrefixProvider *dpp, AssumeRoleWithWebIdentityRequest& req)
+{
+  AssumeRoleWithWebIdentityResponse response;
+  response.assumeRoleResp.packedPolicySize = 0;
+  std::vector<string> token_claims;
+
+  if (req.getProviderId().empty()) {
+    response.providerId = req.getIss();
+  }
+  response.aud = req.getAud();
+  response.sub = req.getSub();
+
+  // the web-identity claims travel inside the session token so that later
+  // requests can evaluate policies against them
+  token_claims.emplace_back(string("iss") + ":" + req.getIss());
+  token_claims.emplace_back(string("aud") + ":" + req.getAud());
+  token_claims.emplace_back(string("sub") + ":" + req.getSub());
+
+  //Get the role info which is being assumed
+  boost::optional<rgw::ARN> r_arn = rgw::ARN::parse(req.getRoleARN());
+  if (r_arn == boost::none) {
+    ldpp_dout(dpp, 0) << "Error in parsing role arn: " << req.getRoleARN() << dendl;
+    response.assumeRoleResp.retCode = -EINVAL;
+    return response;
+  }
+
+  string roleId = role->get_id();
+  uint64_t roleMaxSessionDuration = role->get_max_session_duration();
+  req.setMaxDuration(roleMaxSessionDuration);
+
+  //Validate input
+  response.assumeRoleResp.retCode = req.validate_input(dpp);
+  if (response.assumeRoleResp.retCode < 0) {
+    return response;
+  }
+
+  // PackedPolicySize is the policy size as a percentage of the allowed
+  // maximum; multiply before dividing so integer division does not
+  // truncate every sub-maximum policy to 0
+  string policy = req.getPolicy();
+  response.assumeRoleResp.packedPolicySize = (policy.size() * 100) / req.getMaxPolicySize();
+
+  //Generate Assumed Role User
+  response.assumeRoleResp.retCode = response.assumeRoleResp.user.generateAssumedRoleUser(cct,
+                                                          driver,
+                                                          roleId,
+                                                          r_arn.get(),
+                                                          req.getRoleSessionName());
+  if (response.assumeRoleResp.retCode < 0) {
+    return response;
+  }
+
+  //Generate Credentials
+  //Role and Policy provide the authorization info, user id and applier info are not needed
+  response.assumeRoleResp.retCode = response.assumeRoleResp.creds.generateCredentials(dpp, cct, req.getDuration(),
+                                                          req.getPolicy(), roleId,
+                                                          req.getRoleSessionName(),
+                                                          token_claims,
+                                                          req.getPrincipalTags(),
+                                                          user_id, nullptr);
+  if (response.assumeRoleResp.retCode < 0) {
+    return response;
+  }
+
+  response.assumeRoleResp.retCode = 0;
+  return response;
+}
+
+/*
+ * Handle AssumeRole.  Assumes getRoleInfo() has already populated
+ * this->role -- TODO confirm with callers; the ARN is re-parsed here only
+ * to validate the request's RoleArn string.
+ */
+AssumeRoleResponse STSService::assumeRole(const DoutPrefixProvider *dpp,
+                                          AssumeRoleRequest& req,
+                                          optional_yield y)
+{
+  AssumeRoleResponse response;
+  response.packedPolicySize = 0;
+
+  //Get the role info which is being assumed
+  boost::optional<rgw::ARN> r_arn = rgw::ARN::parse(req.getRoleARN());
+  if (r_arn == boost::none) {
+    ldpp_dout(dpp, 0) << "Error in parsing role arn: " << req.getRoleARN() << dendl;
+    response.retCode = -EINVAL;
+    return response;
+  }
+
+  string roleId = role->get_id();
+  uint64_t roleMaxSessionDuration = role->get_max_session_duration();
+  req.setMaxDuration(roleMaxSessionDuration);
+
+  //Validate input
+  response.retCode = req.validate_input(dpp);
+  if (response.retCode < 0) {
+    return response;
+  }
+
+  // PackedPolicySize is the policy size as a percentage of the allowed
+  // maximum; multiply before dividing so integer division does not
+  // truncate every sub-maximum policy to 0
+  string policy = req.getPolicy();
+  response.packedPolicySize = (policy.size() * 100) / req.getMaxPolicySize();
+
+  //Generate Assumed Role User
+  response.retCode = response.user.generateAssumedRoleUser(cct, driver, roleId, r_arn.get(), req.getRoleSessionName());
+  if (response.retCode < 0) {
+    return response;
+  }
+
+  //Generate Credentials
+  //Role and Policy provide the authorization info, user id and applier info are not needed
+  response.retCode = response.creds.generateCredentials(dpp, cct, req.getDuration(),
+                                                        req.getPolicy(), roleId,
+                                                        req.getRoleSessionName(),
+                                                        boost::none,
+                                                        boost::none,
+                                                        user_id, nullptr);
+  if (response.retCode < 0) {
+    return response;
+  }
+
+  response.retCode = 0;
+  return response;
+}
+
+// Build a GetSessionToken request.  An empty duration selects the default.
+// NOTE(review): stoull() throws std::invalid_argument/std::out_of_range on
+// malformed input rather than reporting an error the way the AssumeRole
+// path does via strict_strtoll -- confirm that callers handle that.
+GetSessionTokenRequest::GetSessionTokenRequest(const string& duration, const string& serialNumber, const string& tokenCode)
+{
+  if (duration.empty()) {
+    this->duration = DEFAULT_DURATION_IN_SECS;
+  } else {
+    this->duration = stoull(duration);
+  }
+  this->serialNumber = serialNumber;
+  this->tokenCode = tokenCode;
+}
+
+// Issue temporary credentials for the calling identity.  No role, policy
+// or claims apply here; authorization comes from the identity itself.
+GetSessionTokenResponse STSService::getSessionToken(const DoutPrefixProvider *dpp, GetSessionTokenRequest& req)
+{
+  Credentials creds;
+
+  const int ret = creds.generateCredentials(dpp, cct,
+                                            req.getDuration(),
+                                            boost::none,
+                                            boost::none,
+                                            boost::none,
+                                            boost::none,
+                                            boost::none,
+                                            user_id,
+                                            identity);
+  if (ret < 0) {
+    return make_tuple(ret, creds);
+  }
+
+  return make_tuple(0, creds);
+}
+
+}
diff --git a/src/rgw/rgw_sts.h b/src/rgw/rgw_sts.h
new file mode 100644
index 000000000..5ee7ee444
--- /dev/null
+++ b/src/rgw/rgw_sts.h
@@ -0,0 +1,251 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_role.h"
+#include "rgw_auth.h"
+#include "rgw_web_idp.h"
+
+namespace STS {
+
+/*
+ * Common state and validation for the AssumeRole-family requests: the
+ * parsed session duration, the inline IAM policy, the role ARN and the
+ * role session name, together with the configured duration bounds.
+ */
+class AssumeRoleRequestBase {
+protected:
+  static constexpr uint64_t MIN_POLICY_SIZE = 1;
+  static constexpr uint64_t MAX_POLICY_SIZE = 2048;
+  static constexpr uint64_t DEFAULT_DURATION_IN_SECS = 3600;
+  static constexpr uint64_t MIN_ROLE_ARN_SIZE = 2;
+  static constexpr uint64_t MAX_ROLE_ARN_SIZE = 2048;
+  static constexpr uint64_t MIN_ROLE_SESSION_SIZE = 2;
+  static constexpr uint64_t MAX_ROLE_SESSION_SIZE = 64;
+  // lower bound comes from rgw_sts_min_session_duration (set in the ctor);
+  // the upper bound is set per-role through setMaxDuration()
+  uint64_t MIN_DURATION_IN_SECS;
+  uint64_t MAX_DURATION_IN_SECS;
+  CephContext* cct;
+  uint64_t duration;
+  // parse error from the duration string, if any; checked by validate_input()
+  std::string err_msg;
+  std::string iamPolicy;
+  std::string roleArn;
+  std::string roleSessionName;
+public:
+  AssumeRoleRequestBase(CephContext* cct,
+                        const std::string& duration,
+                        const std::string& iamPolicy,
+                        const std::string& roleArn,
+                        const std::string& roleSessionName);
+  const std::string& getRoleARN() const { return roleArn; }
+  const std::string& getRoleSessionName() const { return roleSessionName; }
+  const std::string& getPolicy() const {return iamPolicy; }
+  static const uint64_t& getMaxPolicySize() { return MAX_POLICY_SIZE; }
+  void setMaxDuration(const uint64_t& maxDuration) { MAX_DURATION_IN_SECS = maxDuration; }
+  const uint64_t& getDuration() const { return duration; }
+  int validate_input(const DoutPrefixProvider *dpp) const;
+};
+
+/*
+ * AssumeRoleWithWebIdentity request: adds the OIDC provider id, the
+ * iss/sub/aud claims and the session principal tags on top of the common
+ * AssumeRole fields.
+ */
+class AssumeRoleWithWebIdentityRequest : public AssumeRoleRequestBase {
+  static constexpr uint64_t MIN_PROVIDER_ID_LEN = 4;
+  static constexpr uint64_t MAX_PROVIDER_ID_LEN = 2048;
+  std::string providerId;
+  // note: the iam policy is stored in the base class; the shadowing
+  // (never-initialized) duplicate member was removed
+  std::string iss;
+  std::string sub;
+  std::string aud;
+  std::vector<std::pair<std::string,std::string>> session_princ_tags;
+public:
+  AssumeRoleWithWebIdentityRequest( CephContext* cct,
+                      const std::string& duration,
+                      const std::string& providerId,
+                      const std::string& iamPolicy,
+                      const std::string& roleArn,
+                      const std::string& roleSessionName,
+                      const std::string& iss,
+                      const std::string& sub,
+                      const std::string& aud,
+                      std::vector<std::pair<std::string,std::string>> session_princ_tags)
+    : AssumeRoleRequestBase(cct, duration, iamPolicy, roleArn, roleSessionName),
+      providerId(providerId), iss(iss), sub(sub), aud(aud),
+      // the tags parameter is taken by value, so move it into the member
+      session_princ_tags(std::move(session_princ_tags)) {}
+  const std::string& getProviderId() const { return providerId; }
+  const std::string& getIss() const { return iss; }
+  const std::string& getAud() const { return aud; }
+  const std::string& getSub() const { return sub; }
+  const std::vector<std::pair<std::string,std::string>>& getPrincipalTags() const { return session_princ_tags; }
+  int validate_input(const DoutPrefixProvider *dpp) const;
+};
+
+/*
+ * Plain AssumeRole request: adds the optional external id and the MFA
+ * serial number / token code on top of the common AssumeRole fields.
+ */
+class AssumeRoleRequest : public AssumeRoleRequestBase {
+  static constexpr uint64_t MIN_EXTERNAL_ID_LEN = 2;
+  static constexpr uint64_t MAX_EXTERNAL_ID_LEN = 1224;
+  static constexpr uint64_t MIN_SERIAL_NUMBER_SIZE = 9;
+  static constexpr uint64_t MAX_SERIAL_NUMBER_SIZE = 256;
+  static constexpr uint64_t TOKEN_CODE_SIZE = 6;
+  std::string externalId;
+  std::string serialNumber;
+  std::string tokenCode;
+public:
+  AssumeRoleRequest(CephContext* cct,
+                    const std::string& duration,
+                    const std::string& externalId,
+                    const std::string& iamPolicy,
+                    const std::string& roleArn,
+                    const std::string& roleSessionName,
+                    const std::string& serialNumber,
+                    const std::string& tokenCode)
+    : AssumeRoleRequestBase(cct, duration, iamPolicy, roleArn, roleSessionName),
+      externalId(externalId), serialNumber(serialNumber), tokenCode(tokenCode){}
+  int validate_input(const DoutPrefixProvider *dpp) const;
+};
+
+/*
+ * GetSessionToken request.  Unlike the AssumeRole requests, the duration
+ * bounds here are fixed compile-time constants rather than per-role values.
+ */
+class GetSessionTokenRequest {
+protected:
+  static constexpr uint64_t MIN_DURATION_IN_SECS = 900;
+  static constexpr uint64_t DEFAULT_DURATION_IN_SECS = 3600;
+  uint64_t duration;
+  std::string serialNumber;
+  std::string tokenCode;
+
+public:
+  GetSessionTokenRequest(const std::string& duration, const std::string& serialNumber, const std::string& tokenCode);
+
+  const uint64_t& getDuration() const { return duration; }
+  static const uint64_t& getMinDuration() { return MIN_DURATION_IN_SECS; }
+};
+
+/*
+ * The AssumedRoleUser element of an AssumeRole response: the assumed-role
+ * ARN and the "<roleId>:<sessionName>" identifier.
+ */
+class AssumedRoleUser {
+  std::string arn;
+  std::string assumeRoleId;
+public:
+  int generateAssumedRoleUser( CephContext* cct,
+                               rgw::sal::Driver* driver,
+                               const std::string& roleId,
+                               const rgw::ARN& roleArn,
+                               const std::string& roleSessionName);
+  const std::string& getARN() const { return arn; }
+  const std::string& getAssumeRoleId() const { return assumeRoleId; }
+  void dump(Formatter *f) const;
+};
+
+/*
+ * The payload of the encrypted STS session token: the temporary keys plus
+ * the authorization context (policy, role, user) and identity attributes
+ * captured when the token was issued.  Encoded at struct version 5;
+ * decode stays compatible with tokens from older versions.
+ */
+struct SessionToken {
+  std::string access_key_id;
+  std::string secret_access_key;
+  std::string expiration;
+  std::string policy;
+  std::string roleId;
+  rgw_user user;
+  std::string acct_name;
+  // default-initialize the scalar fields so a partially filled token never
+  // encodes indeterminate values
+  uint32_t perm_mask{0};
+  bool is_admin{false};
+  uint32_t acct_type{0};
+  std::string role_session;
+  std::vector<std::string> token_claims;
+  std::string issued_at;
+  std::vector<std::pair<std::string,std::string>> principal_tags;
+
+  SessionToken() {}
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(5, 1, bl);
+    encode(access_key_id, bl);
+    encode(secret_access_key, bl);
+    encode(expiration, bl);
+    encode(policy, bl);
+    encode(roleId, bl);
+    encode(user, bl);
+    encode(acct_name, bl);
+    encode(perm_mask, bl);
+    encode(is_admin, bl);
+    encode(acct_type, bl);
+    encode(role_session, bl);
+    encode(token_claims, bl);
+    encode(issued_at, bl);
+    encode(principal_tags, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  // fields added in later struct versions are only decoded when present,
+  // so tokens issued by older daemons still decode cleanly
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(5, bl);
+    decode(access_key_id, bl);
+    decode(secret_access_key, bl);
+    decode(expiration, bl);
+    decode(policy, bl);
+    decode(roleId, bl);
+    decode(user, bl);
+    decode(acct_name, bl);
+    decode(perm_mask, bl);
+    decode(is_admin, bl);
+    decode(acct_type, bl);
+    if (struct_v >= 2) {
+      decode(role_session, bl);
+    }
+    if (struct_v >= 3) {
+      decode(token_claims, bl);
+    }
+    if (struct_v >= 4) {
+      decode(issued_at, bl);
+    }
+    if (struct_v >= 5) {
+      decode(principal_tags, bl);
+    }
+    DECODE_FINISH(bl);
+  }
+};
+WRITE_CLASS_ENCODER(SessionToken)
+
+/*
+ * Temporary credentials returned by every STS operation; see
+ * generateCredentials() for how the encrypted session token is built.
+ */
+class Credentials {
+  static constexpr int MAX_ACCESS_KEY_LEN = 20;
+  static constexpr int MAX_SECRET_KEY_LEN = 40;
+  std::string accessKeyId;
+  std::string expiration;
+  std::string secretAccessKey;
+  std::string sessionToken;
+public:
+  int generateCredentials(const DoutPrefixProvider *dpp,
+                          CephContext* cct,
+                          const uint64_t& duration,
+                          const boost::optional<std::string>& policy,
+                          const boost::optional<std::string>& roleId,
+                          const boost::optional<std::string>& role_session,
+                          const boost::optional<std::vector<std::string>>& token_claims,
+                          const boost::optional<std::vector<std::pair<std::string,std::string>>>& session_princ_tags,
+                          boost::optional<rgw_user> user,
+                          rgw::auth::Identity* identity);
+  const std::string& getAccessKeyId() const { return accessKeyId; }
+  const std::string& getExpiration() const { return expiration; }
+  const std::string& getSecretAccessKey() const { return secretAccessKey; }
+  const std::string& getSessionToken() const { return sessionToken; }
+  void dump(Formatter *f) const;
+};
+
+// Result of an AssumeRole call.  retCode is 0 on success or a negative
+// error code; zero-initialize the scalars so a default-constructed
+// response never carries indeterminate values.
+struct AssumeRoleResponse {
+  int retCode{0};
+  AssumedRoleUser user;
+  Credentials creds;
+  uint64_t packedPolicySize{0};
+};
+
+// AssumeRoleWithWebIdentity wraps the base response and adds the
+// web-identity claims echoed back to the client.
+struct AssumeRoleWithWebIdentityResponse {
+  AssumeRoleResponse assumeRoleResp;
+  std::string aud;
+  std::string providerId;
+  std::string sub;
+};
+
+// response type aliases used by the STSService interface below
+using AssumeRoleResponse = struct AssumeRoleResponse ;
+using GetSessionTokenResponse = std::tuple<int, Credentials>;
+using AssumeRoleWithWebIdentityResponse = struct AssumeRoleWithWebIdentityResponse;
+
+/*
+ * Entry point for the STS operations (AssumeRole, AssumeRoleWithWebIdentity,
+ * GetSessionToken).  Holds the driver/identity context for one request.
+ * The raw pointers are null-initialized so a default-constructed instance
+ * never carries dangling values; it must be assigned from a fully
+ * constructed one before use.
+ */
+class STSService {
+  CephContext* cct{nullptr};
+  rgw::sal::Driver* driver{nullptr};
+  rgw_user user_id;
+  std::unique_ptr<rgw::sal::RGWRole> role;   // cached by getRoleInfo()
+  rgw::auth::Identity* identity{nullptr};
+public:
+  STSService() = default;
+  STSService(CephContext* cct, rgw::sal::Driver* driver, rgw_user user_id,
+             rgw::auth::Identity* identity)
+    : cct(cct), driver(driver), user_id(user_id), identity(identity) {}
+  std::tuple<int, rgw::sal::RGWRole*> getRoleInfo(const DoutPrefixProvider *dpp, const std::string& arn, optional_yield y);
+  AssumeRoleResponse assumeRole(const DoutPrefixProvider *dpp, AssumeRoleRequest& req, optional_yield y);
+  GetSessionTokenResponse getSessionToken(const DoutPrefixProvider *dpp, GetSessionTokenRequest& req);
+  AssumeRoleWithWebIdentityResponse assumeRoleWithWebIdentity(const DoutPrefixProvider *dpp, AssumeRoleWithWebIdentityRequest& req);
+};
+}
diff --git a/src/rgw/rgw_swift_auth.cc b/src/rgw/rgw_swift_auth.cc
new file mode 100644
index 000000000..05d4b28c1
--- /dev/null
+++ b/src/rgw/rgw_swift_auth.cc
@@ -0,0 +1,775 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <array>
+#include <algorithm>
+#include <string_view>
+
+#include <boost/container/static_vector.hpp>
+#include <boost/algorithm/string/predicate.hpp>
+#include <boost/algorithm/string.hpp>
+
+#include "rgw_swift_auth.h"
+#include "rgw_rest.h"
+
+#include "common/ceph_crypto.h"
+#include "common/Clock.h"
+
+#include "include/random.h"
+
+#include "rgw_client_io.h"
+#include "rgw_http_client.h"
+#include "rgw_sal_rados.h"
+#include "include/str_list.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
+#define DEFAULT_SWIFT_PREFIX "/swift"
+
+using namespace std;
+using namespace ceph::crypto;
+
+
+namespace rgw {
+namespace auth {
+namespace swift {
+
+/* TempURL: applier */
+// Apply TempURL side effects to the request: pick the Content-Disposition
+// from the "inline"/"filename" query parameters, otherwise fall back to an
+// attachment named after the object.
+void TempURLApplier::modify_request_state(const DoutPrefixProvider* dpp, req_state* s) const /* in/out */
+{
+  bool inline_exists = false;
+  const std::string& filename = s->info.args.get("filename");
+
+  s->info.args.get("inline", &inline_exists);
+  if (inline_exists) {
+    // ?inline wins over any filename: render in the browser
+    s->content_disp.override = "inline";
+  } else if (!filename.empty()) {
+    std::string fenc;
+    url_encode(filename, fenc);
+    s->content_disp.override = "attachment; filename=\"" + fenc + "\"";
+  } else {
+    // no explicit preference: use the weaker "fallback" slot with the
+    // object name (presumably so a value set elsewhere takes precedence --
+    // see req_state::content_disp)
+    std::string fenc;
+    url_encode(s->object->get_name(), fenc);
+    s->content_disp.fallback = "attachment; filename=\"" + fenc + "\"";
+  }
+
+  ldpp_dout(dpp, 20) << "finished applying changes to req_state for TempURL: "
+                     << " content_disp override " << s->content_disp.override
+                     << " content_disp fallback " << s->content_disp.fallback
+                     << dendl;
+
+}
+
+// Mark ops-log entries produced under this applier as TempURL accesses,
+// on top of whatever the LocalApplier records.
+void TempURLApplier::write_ops_log_entry(rgw_log_entry& entry) const
+{
+  LocalApplier::write_ops_log_entry(entry);
+  entry.temp_url = true;
+}
+
+/* TempURL: engine */
+/* TempURL: a request is a candidate if either query parameter is present */
+bool TempURLEngine::is_applicable(const req_state* const s) const noexcept
+{
+  const auto& args = s->info.args;
+  return args.exists("temp_url_sig") || args.exists("temp_url_expires");
+}
+
+// Resolve the RGWUserInfo of the owner of the bucket addressed by a TempURL
+// request.  Throws a negative error code (caught by authenticate()) when
+// the bucket/object are missing from the URL or any lookup fails.
+void TempURLEngine::get_owner_info(const DoutPrefixProvider* dpp, const req_state* const s,
+                                   RGWUserInfo& owner_info, optional_yield y) const
+{
+  /* We cannot use req_state::bucket_name because it isn't available
+   * now. It will be initialized in RGWHandler_REST_SWIFT::postauth_init(). */
+  const string& bucket_name = s->init_state.url_bucket;
+
+  /* TempURL requires that bucket and object names are specified. */
+  if (bucket_name.empty() || s->object->empty()) {
+    throw -EPERM;
+  }
+
+  /* TempURL case is completely different than the Keystone auth - you may
+   * get account name only through extraction from URL. In turn, knowledge
+   * about account is neccessary to obtain its bucket tenant. Without that,
+   * the access would be limited to accounts with empty tenant. */
+  string bucket_tenant;
+  if (!s->account_name.empty()) {
+    bool found = false;
+    std::unique_ptr<rgw::sal::User> user;
+
+    // first try the account name as a tenanted uid ("acct$acct"), then as
+    // a plain uid
+    rgw_user uid(s->account_name);
+    if (uid.tenant.empty()) {
+      rgw_user tenanted_uid(uid.id, uid.id);
+      user = driver->get_user(tenanted_uid);
+      if (user->load_user(dpp, s->yield) >= 0) {
+        /* Succeeded */
+        found = true;
+      }
+    }
+
+    if (!found) {
+      user = driver->get_user(uid);
+      if (user->load_user(dpp, s->yield) < 0) {
+        throw -EPERM;
+      }
+    }
+
+    bucket_tenant = user->get_tenant();
+  }
+
+  rgw_bucket b;
+  b.tenant = std::move(bucket_tenant);
+  // NOTE(review): bucket_name is a const reference, so this std::move
+  // degrades to a copy
+  b.name = std::move(bucket_name);
+  std::unique_ptr<rgw::sal::Bucket> bucket;
+  int ret = driver->get_bucket(dpp, nullptr, b, &bucket, s->yield);
+  if (ret < 0) {
+    throw ret;
+  }
+
+  ldpp_dout(dpp, 20) << "temp url user (bucket owner): " << bucket->get_info().owner
+                 << dendl;
+
+  // the TempURL keys live on the bucket owner's user record
+  std::unique_ptr<rgw::sal::User> user;
+  user = driver->get_user(bucket->get_info().owner);
+  if (user->load_user(dpp, s->yield) < 0) {
+    throw -EPERM;
+  }
+
+  owner_info = user->get_info();
+}
+
+// Swift's TempURL accepts ISO8601-style expirations, but the HMAC is always
+// computed over a plain UNIX timestamp; convert when the value parses as a
+// date, otherwise pass it through untouched.
+std::string TempURLEngine::convert_from_iso8601(std::string expires) const
+{
+  struct tm parsed;
+  if (parse_iso8601(expires.c_str(), &parsed, nullptr, true)) {
+    return std::to_string(internal_timegm(&parsed));
+  }
+  // not ISO8601 -- assume it already is a UNIX timestamp
+  return expires;
+}
+
+// Return true when the temp_url_expires value lies in the past; values
+// that fail to parse are treated as expired.
+bool TempURLEngine::is_expired(const std::string& expires) const
+{
+  string err;
+  const uint64_t expiration = (uint64_t)strict_strtoll(expires.c_str(),
+                                                       10, &err);
+  if (!err.empty()) {
+    dout(5) << "failed to parse temp_url_expires: " << err << dendl;
+    return true;
+  }
+
+  const utime_t now = ceph_clock_now();
+  const bool expired = expiration <= (uint64_t)now.sec();
+  if (expired) {
+    dout(5) << "temp url expired: " << expiration << " <= " << now.sec() << dendl;
+  }
+  return expired;
+}
+
+// Return true when the request carries a header that must not be honored
+// through a TempURL.
+bool TempURLEngine::is_disallowed_header_present(const req_info& info) const
+{
+  static const char* const disallowed[] = {
+    "HTTP_X_OBJECT_MANIFEST",
+  };
+
+  for (const char* const hdr : disallowed) {
+    if (info.env->exists(hdr)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+/* Return the subuser part of an "account:subuser" Swift user name; a name
+ * without a colon is returned unchanged. */
+std::string extract_swift_subuser(const std::string& swift_user_name)
+{
+  const auto sep = swift_user_name.find(':');
+  return (sep == std::string::npos)
+    ? swift_user_name
+    : swift_user_name.substr(sep + 1);
+}
+
+// Computes the TempURL HMAC-SHA1 over "METHOD\nEXPIRES\nPATH" and keeps its
+// hex rendering for later comparison with the client-supplied signature.
+class TempURLEngine::SignatureHelper
+{
+private:
+  // hex digest plus a trailing NUL
+  static constexpr uint32_t output_size =
+    CEPH_CRYPTO_HMACSHA1_DIGESTSIZE * 2 + 1;
+
+  unsigned char dest[CEPH_CRYPTO_HMACSHA1_DIGESTSIZE]; // 20
+  char dest_str[output_size];
+
+public:
+  SignatureHelper() = default;
+
+  // compute the signature; the returned pointer aliases internal storage
+  // and stays valid only as long as this helper does
+  const char* calc(const std::string& key,
+                   const std::string_view& method,
+                   const std::string_view& path,
+                   const std::string& expires) {
+
+    using ceph::crypto::HMACSHA1;
+    using UCHARPTR = const unsigned char*;
+
+    HMACSHA1 hmac((UCHARPTR) key.c_str(), key.size());
+    hmac.Update((UCHARPTR) method.data(), method.size());
+    hmac.Update((UCHARPTR) "\n", 1);
+    hmac.Update((UCHARPTR) expires.c_str(), expires.size());
+    hmac.Update((UCHARPTR) "\n", 1);
+    hmac.Update((UCHARPTR) path.data(), path.size());
+    hmac.Final(dest);
+
+    buf_to_hex((UCHARPTR) dest, sizeof(dest), dest_str);
+
+    return dest_str;
+  }
+
+  // compare the previously calc()'d hex signature against rhs
+  bool is_equal_to(const std::string& rhs) const {
+    /* never allow out-of-range exception */
+    if (rhs.size() < (output_size - 1)) {
+      return false;
+    }
+    return rhs.compare(0 /* pos */, output_size, dest_str) == 0;
+  }
+
+}; /* TempURLEngine::SignatureHelper */
+
+// SignatureHelper variant for prefix-based TempURLs.  When temp_url_prefix
+// is present, the HMAC is computed over "prefix:<container-uri><prefix>"
+// instead of the full object path, and verification additionally requires
+// the request URI to actually start with that prefix.
+class TempURLEngine::PrefixableSignatureHelper
+    : private TempURLEngine::SignatureHelper {
+  using base_t = SignatureHelper;
+
+  const std::string_view decoded_uri;
+  const std::string_view object_name;
+  std::string_view no_obj_uri;   // decoded_uri with the object name stripped
+
+  // empty-but-present prefix means "whole container"; absent means the
+  // classic (non-prefixed) TempURL flavor
+  const boost::optional<const std::string&> prefix;
+
+public:
+  PrefixableSignatureHelper(const std::string& _decoded_uri,
+	                    const std::string& object_name,
+                            const boost::optional<const std::string&> prefix)
+    : decoded_uri(_decoded_uri),
+      object_name(object_name),
+      prefix(prefix) {
+    /* Transform: v1/acct/cont/obj - > v1/acct/cont/
+     *
+     * NOTE(rzarzynski): we really want to substr() on std::string_view,
+     * not std::string. Otherwise we would end with no_obj_uri referencing
+     * a temporary. */
+    no_obj_uri = \
+      decoded_uri.substr(0, decoded_uri.length() - object_name.length());
+  }
+
+  // delegate to the base HMAC for classic TempURLs; for prefixed ones,
+  // sign the "prefix:" form instead of the full path
+  const char* calc(const std::string& key,
+                   const std::string_view& method,
+                   const std::string_view& path,
+                   const std::string& expires) {
+    if (!prefix) {
+      return base_t::calc(key, method, path, expires);
+    } else {
+      const auto prefixed_path = \
+        string_cat_reserve("prefix:", no_obj_uri, *prefix);
+      return base_t::calc(key, method, prefixed_path, expires);
+    }
+  }
+
+  // signature match plus, for prefixed TempURLs, a check that the request
+  // URI really lies under the signed prefix
+  bool is_equal_to(const std::string& rhs) const {
+    bool is_auth_ok = base_t::is_equal_to(rhs);
+
+    if (prefix && is_auth_ok) {
+      const auto prefix_uri = string_cat_reserve(no_obj_uri, *prefix);
+      is_auth_ok = boost::algorithm::starts_with(decoded_uri, prefix_uri);
+    }
+
+    return is_auth_ok;
+  }
+}; /* TempURLEngine::PrefixableSignatureHelper */
+
+/* Authenticate a Swift TempURL request.
+ *
+ * Flow: extract temp_url_sig/temp_url_expires (and the optional
+ * temp_url_prefix) from the query string, load the bucket owner's
+ * temp-url keys, check expiry and disallowed headers, then recompute the
+ * HMAC for every combination of key x allowed path x allowed method and
+ * grant on the first signature match.
+ *
+ * Returns deny() when the request is not a tempurl at all, reject() when
+ * it is one but verification fails, and grant() with a TempURLApplier on
+ * success. */
+TempURLEngine::result_t
+TempURLEngine::authenticate(const DoutPrefixProvider* dpp, const req_state* const s, optional_yield y) const
+{
+ if (! is_applicable(s)) {
+ return result_t::deny();
+ }
+
+ /* NOTE(rzarzynski): RGWHTTPArgs::get(), in contrast to RGWEnv::get(),
+ * never returns nullptr. If the requested parameter is absent, we will
+ * get the empty string. */
+ const std::string& temp_url_sig = s->info.args.get("temp_url_sig");
+ const std::string& temp_url_expires = \
+ convert_from_iso8601(s->info.args.get("temp_url_expires"));
+
+ if (temp_url_sig.empty() || temp_url_expires.empty()) {
+ return result_t::deny();
+ }
+
+ /* Though, for prefixed tempurls we need to differentiate between empty
+ * prefix and lack of prefix. Empty prefix means allowance for whole
+ * container. */
+ const boost::optional<const std::string&> temp_url_prefix = \
+ s->info.args.get_optional("temp_url_prefix");
+
+ RGWUserInfo owner_info;
+ try {
+ get_owner_info(dpp, s, owner_info, y);
+ } catch (...) {
+ ldpp_dout(dpp, 5) << "cannot get user_info of account's owner" << dendl;
+ return result_t::reject();
+ }
+
+ if (owner_info.temp_url_keys.empty()) {
+ ldpp_dout(dpp, 5) << "user does not have temp url key set, aborting" << dendl;
+ return result_t::reject();
+ }
+
+ if (is_expired(temp_url_expires)) {
+ ldpp_dout(dpp, 5) << "temp url link expired" << dendl;
+ return result_t::reject(-EPERM);
+ }
+
+ /* NOTE(review): this is the only ldout() in a function that otherwise
+ * uses ldpp_dout(); consider unifying for consistent log prefixes. */
+ if (is_disallowed_header_present(s->info)) {
+ ldout(cct, 5) << "temp url rejected due to disallowed header" << dendl;
+ return result_t::reject(-EINVAL);
+ }
+
+ /* We need to verify two paths because of compliance with Swift, Tempest
+ * and old versions of RadosGW. The second item will have the prefix
+ * of Swift API entry point removed. */
+
+ /* XXX can we search this ONCE? */
+ const size_t pos = g_conf()->rgw_swift_url_prefix.find_last_not_of('/') + 1;
+ const std::string_view ref_uri = s->decoded_uri;
+ const std::array<std::string_view, 2> allowed_paths = {
+ ref_uri,
+ ref_uri.substr(pos + 1)
+ };
+
+ /* Account owner calculates the signature also against a HTTP method. */
+ boost::container::static_vector<std::string_view, 3> allowed_methods;
+ if (strcmp("HEAD", s->info.method) == 0) {
+ /* HEAD requests are specially handled. */
+ /* TODO: after getting a newer boost (with static_vector supporting
+ * initializers lists), get back to the good notation:
+ * allowed_methods = {"HEAD", "GET", "PUT" };
+ * Just for now let's use emplace_back to construct the vector. */
+ allowed_methods.emplace_back("HEAD");
+ allowed_methods.emplace_back("GET");
+ allowed_methods.emplace_back("PUT");
+ } else if (strlen(s->info.method) > 0) {
+ allowed_methods.emplace_back(s->info.method);
+ }
+
+ /* Need to try each combination of keys, allowed path and methods. */
+ PrefixableSignatureHelper sig_helper {
+ s->decoded_uri,
+ s->object->get_name(),
+ temp_url_prefix
+ };
+
+ for (const auto& kv : owner_info.temp_url_keys) {
+ const int temp_url_key_num = kv.first;
+ const string& temp_url_key = kv.second;
+
+ if (temp_url_key.empty()) {
+ continue;
+ }
+
+ for (const auto& path : allowed_paths) {
+ for (const auto& method : allowed_methods) {
+ /* calc() stores the computed MAC inside sig_helper; is_equal_to()
+ * then compares it (and, for prefixed URLs, the path) against the
+ * client-provided signature. */
+ const char* const local_sig = sig_helper.calc(temp_url_key, method,
+ path, temp_url_expires);
+
+ ldpp_dout(dpp, 20) << "temp url signature [" << temp_url_key_num
+ << "] (calculated): " << local_sig
+ << dendl;
+
+ if (sig_helper.is_equal_to(temp_url_sig)) {
+ auto apl = apl_factory->create_apl_turl(cct, s, owner_info);
+ return result_t::grant(std::move(apl));
+ } else {
+ ldpp_dout(dpp, 5) << "temp url signature mismatch: " << local_sig
+ << " != " << temp_url_sig << dendl;
+ }
+ }
+ }
+ }
+
+ return result_t::reject();
+}
+
+
+/* External token */
+/* External token: engine */
+/* The engine applies only when a token was supplied AND an external
+ * validation endpoint (rgw_swift_auth_url) is configured. */
+bool ExternalTokenEngine::is_applicable(const std::string& token) const noexcept
+{
+ if (token.empty()) {
+ return false;
+ } else if (g_conf()->rgw_swift_auth_url.empty()) {
+ return false;
+ } else {
+ return true;
+ }
+}
+
+/* Validate the token against the external Swift auth service configured
+ * via rgw_swift_auth_url. The service is queried with GET on
+ * "<auth_url>/token/<token>"; the first group returned in the
+ * X-Auth-Groups response header is taken as the Swift user, which is then
+ * mapped to a local RGW user.
+ *
+ * Transport/lookup failures are reported by throwing the negative errno,
+ * which is expected to be handled by the auth strategy layer. */
+ExternalTokenEngine::result_t
+ExternalTokenEngine::authenticate(const DoutPrefixProvider* dpp,
+ const std::string& token,
+ const req_state* const s, optional_yield y) const
+{
+ if (! is_applicable(token)) {
+ return result_t::deny();
+ }
+
+ std::string auth_url = g_conf()->rgw_swift_auth_url;
+ if (auth_url.back() != '/') {
+ auth_url.append("/");
+ }
+
+ /* NOTE(review): url_buf is a VLA sized exactly for
+ * auth_url + '/' + token + NUL, so the sprintf() below fits; still,
+ * snprintf() or std::string concatenation would be more defensive. */
+ auth_url.append("token");
+ char url_buf[auth_url.size() + 1 + token.length() + 1];
+ sprintf(url_buf, "%s/%s", auth_url.c_str(), token.c_str());
+
+ RGWHTTPHeadersCollector validator(cct, "GET", url_buf, { "X-Auth-Groups", "X-Auth-Ttl" });
+
+ ldpp_dout(dpp, 10) << "rgw_swift_validate_token url=" << url_buf << dendl;
+
+ int ret = validator.process(y);
+ if (ret < 0) {
+ throw ret;
+ }
+
+ std::string swift_user;
+ try {
+ std::vector<std::string> swift_groups;
+ get_str_vec(validator.get_header_value("X-Auth-Groups"),
+ ",", swift_groups);
+
+ if (0 == swift_groups.size()) {
+ return result_t::deny(-EPERM);
+ } else {
+ swift_user = std::move(swift_groups[0]);
+ }
+ } catch (const std::out_of_range&) {
+ /* The X-Auth-Groups header isn't present in the response. */
+ return result_t::deny(-EPERM);
+ }
+
+ if (swift_user.empty()) {
+ return result_t::deny(-EPERM);
+ }
+
+ ldpp_dout(dpp, 10) << "swift user=" << swift_user << dendl;
+
+ std::unique_ptr<rgw::sal::User> user;
+ ret = driver->get_user_by_swift(dpp, swift_user, s->yield, &user);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "NOTICE: couldn't map swift user" << dendl;
+ throw ret;
+ }
+
+ auto apl = apl_factory->create_apl_local(cct, s, user->get_info(),
+ extract_swift_subuser(swift_user),
+ std::nullopt, rgw::auth::LocalApplier::NO_ACCESS_KEY);
+ return result_t::grant(std::move(apl));
+}
+
+/* Build a signed token: encode (swift_user, nonce, expiration) into bl,
+ * then append an HMAC-SHA1 over that payload. The HMAC key is derived by
+ * OR-folding the user's secret key into a fixed-size buffer — a legacy
+ * scheme kept for compatibility with previously issued tokens.
+ * Always returns 0. */
+static int build_token(const string& swift_user,
+ const string& key,
+ const uint64_t nonce,
+ const utime_t& expiration,
+ bufferlist& bl)
+{
+ using ceph::encode;
+ encode(swift_user, bl);
+ encode(nonce, bl);
+ encode(expiration, bl);
+
+ bufferptr p(CEPH_CRYPTO_HMACSHA1_DIGESTSIZE);
+
+ char buf[bl.length() * 2 + 1];
+ buf_to_hex((const unsigned char *)bl.c_str(), bl.length(), buf);
+ dout(20) << "build_token token=" << buf << dendl;
+
+ char k[CEPH_CRYPTO_HMACSHA1_DIGESTSIZE];
+ // FIPS zeroization audit 20191116: this memset is not intended to
+ // wipe out a secret after use.
+ memset(k, 0, sizeof(k));
+ const char *s = key.c_str();
+ for (int i = 0; i < (int)key.length(); i++, s++) {
+ k[i % CEPH_CRYPTO_HMACSHA1_DIGESTSIZE] |= *s;
+ }
+ calc_hmac_sha1(k, sizeof(k), bl.c_str(), bl.length(), p.c_str());
+ // k holds key-derived material; wipe it before leaving the function.
+ ::ceph::crypto::zeroize_for_security(k, sizeof(k));
+
+ bl.append(p);
+
+ return 0;
+
+}
+
+/* Create a fresh signed token for swift_user: random nonce plus an
+ * expiration of now + rgw_swift_token_expiration seconds, then delegate
+ * the encoding/signing to build_token(). */
+static int encode_token(CephContext *cct, string& swift_user, string& key,
+ bufferlist& bl)
+{
+ const auto nonce = ceph::util::generate_random_number<uint64_t>();
+
+ utime_t expiration = ceph_clock_now();
+ expiration += cct->_conf->rgw_swift_token_expiration;
+
+ return build_token(swift_user, key, nonce, expiration, bl);
+}
+
+
+/* AUTH_rgwtk (signed token): engine */
+/* AUTH_rgwtk (signed token): engine */
+/* The engine handles only tokens carrying the "AUTH_rgwtk" prefix that
+ * RGW_SWIFT_Auth_Get::execute() issues. */
+bool SignedTokenEngine::is_applicable(const std::string& token) const noexcept
+{
+ if (token.empty()) {
+ return false;
+ } else {
+ return token.compare(0, 10, "AUTH_rgwtk") == 0;
+ }
+}
+
+/* Authenticate an AUTH_rgwtk token: hex-decode the payload, extract
+ * (swift_user, nonce, expiration), check expiry, look up the user's
+ * matching swift key and rebuild the token locally; access is granted
+ * only when the rebuilt token matches the presented one byte-for-byte.
+ *
+ * Malformed tokens and lookup failures are reported by throwing the
+ * negative errno; verification failures return deny(-EPERM). */
+SignedTokenEngine::result_t
+SignedTokenEngine::authenticate(const DoutPrefixProvider* dpp,
+ const std::string& token,
+ const req_state* const s) const
+{
+ if (! is_applicable(token)) {
+ return result_t::deny(-EPERM);
+ }
+
+ /* Effective token string is the part after the prefix. */
+ const std::string etoken = token.substr(strlen("AUTH_rgwtk"));
+ const size_t etoken_len = etoken.length();
+
+ if (etoken_len & 1) {
+ ldpp_dout(dpp, 0) << "NOTICE: failed to verify token: odd token length="
+ << etoken_len << dendl;
+ throw -EINVAL;
+ }
+
+ ceph::bufferptr p(etoken_len/2);
+ int ret = hex_to_buf(etoken.c_str(), p.c_str(), etoken_len);
+ if (ret < 0) {
+ throw ret;
+ }
+
+ ceph::bufferlist tok_bl;
+ tok_bl.append(p);
+
+ uint64_t nonce;
+ utime_t expiration;
+ std::string swift_user;
+
+ try {
+ auto iter = tok_bl.cbegin();
+
+ using ceph::decode;
+ decode(swift_user, iter);
+ decode(nonce, iter);
+ decode(expiration, iter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "NOTICE: failed to decode token" << dendl;
+ throw -EINVAL;
+ }
+
+ const utime_t now = ceph_clock_now();
+ if (expiration < now) {
+ ldpp_dout(dpp, 0) << "NOTICE: old timed out token was used now=" << now
+ << " token.expiration=" << expiration
+ << dendl;
+ return result_t::deny(-EPERM);
+ }
+
+ std::unique_ptr<rgw::sal::User> user;
+ ret = driver->get_user_by_swift(dpp, swift_user, s->yield, &user);
+ if (ret < 0) {
+ throw ret;
+ }
+
+ ldpp_dout(dpp, 10) << "swift_user=" << swift_user << dendl;
+
+ const auto siter = user->get_info().swift_keys.find(swift_user);
+ if (siter == std::end(user->get_info().swift_keys)) {
+ return result_t::deny(-EPERM);
+ }
+
+ const auto swift_key = siter->second;
+
+ /* Recompute the token with the stored secret and the decoded nonce and
+ * expiration; a valid client token must match it exactly. */
+ bufferlist local_tok_bl;
+ ret = build_token(swift_user, swift_key.key, nonce, expiration, local_tok_bl);
+ if (ret < 0) {
+ throw ret;
+ }
+
+ if (local_tok_bl.length() != tok_bl.length()) {
+ ldpp_dout(dpp, 0) << "NOTICE: tokens length mismatch:"
+ << " tok_bl.length()=" << tok_bl.length()
+ << " local_tok_bl.length()=" << local_tok_bl.length()
+ << dendl;
+ return result_t::deny(-EPERM);
+ }
+
+ /* NOTE(review): memcmp() is not constant-time; a timing-safe compare
+ * would be preferable for MAC verification. */
+ if (memcmp(local_tok_bl.c_str(), tok_bl.c_str(),
+ local_tok_bl.length()) != 0) {
+ char buf[local_tok_bl.length() * 2 + 1];
+
+ buf_to_hex(reinterpret_cast<const unsigned char *>(local_tok_bl.c_str()),
+ local_tok_bl.length(), buf);
+
+ ldpp_dout(dpp, 0) << "NOTICE: tokens mismatch tok=" << buf << dendl;
+ return result_t::deny(-EPERM);
+ }
+
+ auto apl = apl_factory->create_apl_local(cct, s, user->get_info(),
+ extract_swift_subuser(swift_user),
+ std::nullopt, rgw::auth::LocalApplier::NO_ACCESS_KEY);
+ return result_t::grant(std::move(apl));
+}
+
+} /* namespace swift */
+} /* namespace auth */
+} /* namespace rgw */
+
+
+/* Handle Swift v1.0 auth (GET /auth): validate the X-Auth-User /
+ * X-Auth-Key pair against the user's stored swift key, then emit
+ * X-Storage-Url plus a freshly minted AUTH_rgwtk token in
+ * X-Storage-Token / X-Auth-Token. On success the response status is
+ * 204 No Content; all failure paths fall through to "done" with the
+ * appropriate errno. */
+void RGW_SWIFT_Auth_Get::execute(optional_yield y)
+{
+ int ret = -EPERM;
+
+ const char *key = s->info.env->get("HTTP_X_AUTH_KEY");
+ const char *user_name = s->info.env->get("HTTP_X_AUTH_USER");
+
+ s->prot_flags |= RGW_REST_SWIFT;
+
+ string user_str;
+ std::unique_ptr<rgw::sal::User> user;
+ bufferlist bl;
+ RGWAccessKey *swift_key;
+ map<string, RGWAccessKey>::iterator siter;
+
+ string swift_url = g_conf()->rgw_swift_url;
+ string swift_prefix = g_conf()->rgw_swift_url_prefix;
+ string tenant_path;
+
+ /*
+ * We did not allow an empty Swift prefix before, but we want it now.
+ * So, we take rgw_swift_url_prefix = "/" to yield the empty prefix.
+ * The rgw_swift_url_prefix = "" is the default and yields "/swift"
+ * in a backwards-compatible way.
+ */
+ if (swift_prefix.size() == 0) {
+ swift_prefix = DEFAULT_SWIFT_PREFIX;
+ } else if (swift_prefix == "/") {
+ swift_prefix.clear();
+ } else {
+ if (swift_prefix[0] != '/') {
+ swift_prefix.insert(0, "/");
+ }
+ }
+
+ /* No rgw_swift_url configured: reconstruct the endpoint URL from the
+ * request environment (scheme, Host header, non-default port). */
+ if (swift_url.size() == 0) {
+ bool add_port = false;
+ auto server_port = s->info.env->get_optional("SERVER_PORT_SECURE");
+ const char *protocol;
+ if (server_port) {
+ add_port = (*server_port != "443");
+ protocol = "https";
+ } else {
+ server_port = s->info.env->get_optional("SERVER_PORT");
+ if (server_port) {
+ add_port = (*server_port != "80");
+ }
+ protocol = "http";
+ }
+ const char *host = s->info.env->get("HTTP_HOST");
+ if (!host) {
+ dout(0) << "NOTICE: server is misconfigured, missing rgw_swift_url_prefix or rgw_swift_url, HTTP_HOST is not set" << dendl;
+ ret = -EINVAL;
+ goto done;
+ }
+ swift_url = protocol;
+ swift_url.append("://");
+ swift_url.append(host);
+ if (add_port && !strchr(host, ':')) {
+ swift_url.append(":");
+ swift_url.append(*server_port);
+ }
+ }
+
+ if (!key || !user_name)
+ goto done;
+
+ user_str = user_name;
+
+ ret = driver->get_user_by_swift(s, user_str, s->yield, &user);
+ if (ret < 0) {
+ ret = -EACCES;
+ goto done;
+ }
+
+ siter = user->get_info().swift_keys.find(user_str);
+ if (siter == user->get_info().swift_keys.end()) {
+ ret = -EPERM;
+ goto done;
+ }
+ swift_key = &siter->second;
+
+ if (swift_key->key.compare(key) != 0) {
+ dout(0) << "NOTICE: RGW_SWIFT_Auth_Get::execute(): bad swift key" << dendl;
+ ret = -EPERM;
+ goto done;
+ }
+
+ /* Optionally expose the account in the storage URL, either as a fixed
+ * tenant name or as the authenticated user's id. */
+ if (!g_conf()->rgw_swift_tenant_name.empty()) {
+ tenant_path = "/AUTH_";
+ tenant_path.append(g_conf()->rgw_swift_tenant_name);
+ } else if (g_conf()->rgw_swift_account_in_url) {
+ tenant_path = "/AUTH_";
+ tenant_path.append(user->get_id().to_str());
+ }
+
+ dump_header(s, "X-Storage-Url", swift_url + swift_prefix + "/v1" +
+ tenant_path);
+
+ using rgw::auth::swift::encode_token;
+ if ((ret = encode_token(s->cct, swift_key->id, swift_key->key, bl)) < 0)
+ goto done;
+
+ {
+ /* Token format: "AUTH_rgwtk" followed by the hex-encoded payload. */
+ static constexpr size_t PREFIX_LEN = sizeof("AUTH_rgwtk") - 1;
+ char token_val[PREFIX_LEN + bl.length() * 2 + 1];
+
+ snprintf(token_val, PREFIX_LEN + 1, "AUTH_rgwtk");
+ buf_to_hex((const unsigned char *)bl.c_str(), bl.length(),
+ token_val + PREFIX_LEN);
+
+ dump_header(s, "X-Storage-Token", token_val);
+ dump_header(s, "X-Auth-Token", token_val);
+ }
+
+ ret = STATUS_NO_CONTENT;
+
+done:
+ set_req_state_err(s, ret);
+ dump_errno(s);
+ end_header(s);
+}
+
+/* Initialize request state for the swift-auth dialect; responses are
+ * JSON-formatted. The formatter is owned/freed by the request teardown. */
+int RGWHandler_SWIFT_Auth::init(rgw::sal::Driver* driver, req_state *state,
+ rgw::io::BasicClient *cio)
+{
+ state->dialect = "swift-auth";
+ state->formatter = new JSONFormatter;
+ state->format = RGWFormat::JSON;
+
+ return RGWHandler::init(driver, state, cio);
+}
+
+/* No authorization step here: the auth endpoint itself performs the
+ * credential check inside RGW_SWIFT_Auth_Get::execute(). */
+int RGWHandler_SWIFT_Auth::authorize(const DoutPrefixProvider *dpp, optional_yield)
+{
+ return 0;
+}
+
+/* GET is the only supported verb; caller takes ownership of the op. */
+RGWOp *RGWHandler_SWIFT_Auth::op_get()
+{
+ return new RGW_SWIFT_Auth_Get;
+}
+
diff --git a/src/rgw/rgw_swift_auth.h b/src/rgw/rgw_swift_auth.h
new file mode 100644
index 000000000..85a103dbf
--- /dev/null
+++ b/src/rgw/rgw_swift_auth.h
@@ -0,0 +1,354 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_common.h"
+#include "rgw_user.h"
+#include "rgw_op.h"
+#include "rgw_rest.h"
+#include "rgw_auth.h"
+#include "rgw_auth_keystone.h"
+#include "rgw_auth_filters.h"
+#include "rgw_sal.h"
+
+#define RGW_SWIFT_TOKEN_EXPIRATION (15 * 60)
+
+namespace rgw {
+namespace auth {
+namespace swift {
+
+/* TempURL: applier. */
+/* Applier for requests authenticated through TempURL. It is a local
+ * applier acting as the bucket owner, with no subuser and no access key.
+ * The Factory interface lets the engine stay decoupled from the concrete
+ * applier construction. */
+class TempURLApplier : public rgw::auth::LocalApplier {
+public:
+ TempURLApplier(CephContext* const cct,
+ const RGWUserInfo& user_info)
+ : LocalApplier(cct, user_info, LocalApplier::NO_SUBUSER, std::nullopt, LocalApplier::NO_ACCESS_KEY) {
+ };
+
+ void modify_request_state(const DoutPrefixProvider* dpp, req_state * s) const override; /* in/out */
+ void write_ops_log_entry(rgw_log_entry& entry) const override;
+
+ struct Factory {
+ virtual ~Factory() {}
+ virtual aplptr_t create_apl_turl(CephContext* cct,
+ const req_state* s,
+ const RGWUserInfo& user_info) const = 0;
+ };
+};
+
+/* TempURL: engine */
+/* Engine verifying Swift TempURL requests (temp_url_sig/temp_url_expires
+ * query parameters). Non-owning: cct, driver and apl_factory must outlive
+ * the engine. */
+class TempURLEngine : public rgw::auth::Engine {
+ using result_t = rgw::auth::Engine::result_t;
+
+ CephContext* const cct;
+ rgw::sal::Driver* driver;
+ const TempURLApplier::Factory* const apl_factory;
+
+ /* Helper methods. */
+ void get_owner_info(const DoutPrefixProvider* dpp,
+ const req_state* s,
+ RGWUserInfo& owner_info,
+ optional_yield y) const;
+ std::string convert_from_iso8601(std::string expires) const;
+ bool is_applicable(const req_state* s) const noexcept;
+ bool is_expired(const std::string& expires) const;
+ bool is_disallowed_header_present(const req_info& info) const;
+
+ class SignatureHelper;
+ class PrefixableSignatureHelper;
+
+public:
+ TempURLEngine(CephContext* const cct,
+ rgw::sal::Driver* _driver ,
+ const TempURLApplier::Factory* const apl_factory)
+ : cct(cct),
+ driver(_driver),
+ apl_factory(apl_factory) {
+ }
+
+ /* Interface implementations. */
+ const char* get_name() const noexcept override {
+ return "rgw::auth::swift::TempURLEngine";
+ }
+
+ result_t authenticate(const DoutPrefixProvider* dpp, const req_state* const s, optional_yield y) const override;
+};
+
+
+/* AUTH_rgwtk */
+/* Engine verifying RGW-issued "AUTH_rgwtk" tokens. The token is pulled
+ * from the request via the injected TokenExtractor and checked against a
+ * locally rebuilt HMAC. All injected pointers are non-owning. */
+class SignedTokenEngine : public rgw::auth::Engine {
+ using result_t = rgw::auth::Engine::result_t;
+
+ CephContext* const cct;
+ rgw::sal::Driver* driver;
+ const rgw::auth::TokenExtractor* const extractor;
+ const rgw::auth::LocalApplier::Factory* const apl_factory;
+
+ bool is_applicable(const std::string& token) const noexcept;
+ using rgw::auth::Engine::authenticate;
+ result_t authenticate(const DoutPrefixProvider* dpp,
+ const std::string& token,
+ const req_state* s) const;
+
+public:
+ SignedTokenEngine(CephContext* const cct,
+ rgw::sal::Driver* _driver,
+ const rgw::auth::TokenExtractor* const extractor,
+ const rgw::auth::LocalApplier::Factory* const apl_factory)
+ : cct(cct),
+ driver(_driver),
+ extractor(extractor),
+ apl_factory(apl_factory) {
+ }
+
+ const char* get_name() const noexcept override {
+ return "rgw::auth::swift::SignedTokenEngine";
+ }
+
+ result_t authenticate(const DoutPrefixProvider* dpp, const req_state* const s,
+ optional_yield y) const override {
+ return authenticate(dpp, extractor->get_token(s), s);
+ }
+};
+
+
+/* External token */
+/* External token */
+/* Engine validating tokens against an external Swift auth service
+ * (rgw_swift_auth_url). The token is obtained through the injected
+ * TokenExtractor; all injected pointers are non-owning. */
+class ExternalTokenEngine : public rgw::auth::Engine {
+ using result_t = rgw::auth::Engine::result_t;
+
+ CephContext* const cct;
+ rgw::sal::Driver* driver;
+ const rgw::auth::TokenExtractor* const extractor;
+ const rgw::auth::LocalApplier::Factory* const apl_factory;
+
+ bool is_applicable(const std::string& token) const noexcept;
+ result_t authenticate(const DoutPrefixProvider* dpp,
+ const std::string& token,
+ const req_state* s, optional_yield y) const;
+
+public:
+ ExternalTokenEngine(CephContext* const cct,
+ rgw::sal::Driver* _driver,
+ const rgw::auth::TokenExtractor* const extractor,
+ const rgw::auth::LocalApplier::Factory* const apl_factory)
+ : cct(cct),
+ driver(_driver),
+ extractor(extractor),
+ apl_factory(apl_factory) {
+ }
+
+ const char* get_name() const noexcept override {
+ return "rgw::auth::swift::ExternalTokenEngine";
+ }
+
+ result_t authenticate(const DoutPrefixProvider* dpp, const req_state* const s,
+ optional_yield y) const override {
+ return authenticate(dpp, extractor->get_token(s), s, y);
+ }
+};
+
+/* SwiftAnonymous: applier. */
+/* SwiftAnonymous: applier. */
+/* Applier for unauthenticated Swift requests: a local applier with no
+ * subuser/access key; ownership is granted only to the anonymous user id.
+ * NOTE(review): is_admin_of()/is_owner_of() are not marked `override` —
+ * verify against the base class that they really override (and do not
+ * merely hide) the intended virtuals. */
+class SwiftAnonymousApplier : public rgw::auth::LocalApplier {
+ public:
+ SwiftAnonymousApplier(CephContext* const cct,
+ const RGWUserInfo& user_info)
+ : LocalApplier(cct, user_info, LocalApplier::NO_SUBUSER, std::nullopt, LocalApplier::NO_ACCESS_KEY) {
+ }
+ bool is_admin_of(const rgw_user& uid) const {return false;}
+ bool is_owner_of(const rgw_user& uid) const {return uid.id.compare(RGW_USER_ANON_ID) == 0;}
+};
+
+/* Anonymous engine: applies only when no X-Auth-Token was supplied,
+ * i.e. as the last-resort fallback in the strategy chain below. */
+class SwiftAnonymousEngine : public rgw::auth::AnonymousEngine {
+ const rgw::auth::TokenExtractor* const extractor;
+
+ bool is_applicable(const req_state* s) const noexcept override {
+ return extractor->get_token(s).empty();
+ }
+
+public:
+ SwiftAnonymousEngine(CephContext* const cct,
+ const SwiftAnonymousApplier::Factory* const apl_factory,
+ const rgw::auth::TokenExtractor* const extractor)
+ : AnonymousEngine(cct, apl_factory),
+ extractor(extractor) {
+ }
+
+ const char* get_name() const noexcept override {
+ return "rgw::auth::swift::SwiftAnonymousEngine";
+ }
+};
+
+
+/* The default Swift auth strategy. It owns all Swift engines and chains
+ * them as SUFFICIENT in this order: TempURL, signed token, Keystone (only
+ * if rgw_keystone_url is set), external token (only if rgw_swift_auth_url
+ * is set), anonymous. It also acts as the applier factory for each of
+ * them, wrapping appliers with sysreq/3rdparty decorators as needed. */
+class DefaultStrategy : public rgw::auth::Strategy,
+ public rgw::auth::RemoteApplier::Factory,
+ public rgw::auth::LocalApplier::Factory,
+ public rgw::auth::swift::TempURLApplier::Factory {
+ rgw::sal::Driver* driver;
+ const ImplicitTenants& implicit_tenant_context;
+
+ /* The engines. */
+ const rgw::auth::swift::TempURLEngine tempurl_engine;
+ const rgw::auth::swift::SignedTokenEngine signed_engine;
+ boost::optional <const rgw::auth::keystone::TokenEngine> keystone_engine;
+ const rgw::auth::swift::ExternalTokenEngine external_engine;
+ const rgw::auth::swift::SwiftAnonymousEngine anon_engine;
+
+ using keystone_config_t = rgw::keystone::CephCtxConfig;
+ using keystone_cache_t = rgw::keystone::TokenCache;
+ using aplptr_t = rgw::auth::IdentityApplier::aplptr_t;
+ using acl_strategy_t = rgw::auth::RemoteApplier::acl_strategy_t;
+
+ /* The method implements TokenExtractor for X-Auth-Token present in req_state. */
+ struct AuthTokenExtractor : rgw::auth::TokenExtractor {
+ std::string get_token(const req_state* const s) const override {
+ /* Returning a reference here would end in GCC complaining about a reference
+ * to temporary. */
+ return s->info.env->get("HTTP_X_AUTH_TOKEN", "");
+ }
+ } auth_token_extractor;
+
+ /* The method implements TokenExtractor for X-Service-Token present in req_state. */
+ struct ServiceTokenExtractor : rgw::auth::TokenExtractor {
+ std::string get_token(const req_state* const s) const override {
+ return s->info.env->get("HTTP_X_SERVICE_TOKEN", "");
+ }
+ } service_token_extractor;
+
+ /* Factory method used by the Keystone engine. */
+ aplptr_t create_apl_remote(CephContext* const cct,
+ const req_state* const s,
+ acl_strategy_t&& extra_acl_strategy,
+ const rgw::auth::RemoteApplier::AuthInfo &info) const override {
+ auto apl = \
+ rgw::auth::add_3rdparty(driver, rgw_user(s->account_name),
+ rgw::auth::add_sysreq(cct, driver, s,
+ rgw::auth::RemoteApplier(cct, driver, std::move(extra_acl_strategy), info,
+ implicit_tenant_context,
+ rgw::auth::ImplicitTenants::IMPLICIT_TENANTS_SWIFT)));
+ /* TODO(rzarzynski): replace with static_ptr. */
+ return aplptr_t(new decltype(apl)(std::move(apl)));
+ }
+
+ /* Factory method used by the signed/external token engines. */
+ aplptr_t create_apl_local(CephContext* const cct,
+ const req_state* const s,
+ const RGWUserInfo& user_info,
+ const std::string& subuser,
+ const std::optional<uint32_t>& perm_mask,
+ const std::string& access_key_id) const override {
+ auto apl = \
+ rgw::auth::add_3rdparty(driver, rgw_user(s->account_name),
+ rgw::auth::add_sysreq(cct, driver, s,
+ rgw::auth::LocalApplier(cct, user_info, subuser, perm_mask, access_key_id)));
+ /* TODO(rzarzynski): replace with static_ptr. */
+ return aplptr_t(new decltype(apl)(std::move(apl)));
+ }
+
+ aplptr_t create_apl_turl(CephContext* const cct,
+ const req_state* const s,
+ const RGWUserInfo& user_info) const override {
+ /* TempURL doesn't need any user account override. It's a Swift-specific
+ * mechanism that requires account name internally, so there is no
+ * business with delegating the responsibility outside. */
+ return aplptr_t(new rgw::auth::swift::TempURLApplier(cct, user_info));
+ }
+
+public:
+ DefaultStrategy(CephContext* const cct,
+ const ImplicitTenants& implicit_tenant_context,
+ rgw::sal::Driver* _driver)
+ : driver(_driver),
+ implicit_tenant_context(implicit_tenant_context),
+ tempurl_engine(cct,
+ driver,
+ static_cast<rgw::auth::swift::TempURLApplier::Factory*>(this)),
+ signed_engine(cct,
+ driver,
+ static_cast<rgw::auth::TokenExtractor*>(&auth_token_extractor),
+ static_cast<rgw::auth::LocalApplier::Factory*>(this)),
+ external_engine(cct,
+ driver,
+ static_cast<rgw::auth::TokenExtractor*>(&auth_token_extractor),
+ static_cast<rgw::auth::LocalApplier::Factory*>(this)),
+ anon_engine(cct,
+ static_cast<SwiftAnonymousApplier::Factory*>(this),
+ static_cast<rgw::auth::TokenExtractor*>(&auth_token_extractor)) {
+ /* When the constructor's body is being executed, all member engines
+ * should be initialized. Thus, we can safely add them. */
+ using Control = rgw::auth::Strategy::Control;
+
+ add_engine(Control::SUFFICIENT, tempurl_engine);
+ add_engine(Control::SUFFICIENT, signed_engine);
+
+ /* The auth strategy is responsible for deciding whether a particular
+ * engine is disabled or not. */
+ if (! cct->_conf->rgw_keystone_url.empty()) {
+ keystone_engine.emplace(cct,
+ static_cast<rgw::auth::TokenExtractor*>(&auth_token_extractor),
+ static_cast<rgw::auth::TokenExtractor*>(&service_token_extractor),
+ static_cast<rgw::auth::RemoteApplier::Factory*>(this),
+ keystone_config_t::get_instance(),
+ keystone_cache_t::get_instance<keystone_config_t>());
+
+ add_engine(Control::SUFFICIENT, *keystone_engine);
+ }
+ if (! cct->_conf->rgw_swift_auth_url.empty()) {
+ add_engine(Control::SUFFICIENT, external_engine);
+ }
+
+ add_engine(Control::SUFFICIENT, anon_engine);
+ }
+
+ const char* get_name() const noexcept override {
+ return "rgw::auth::swift::DefaultStrategy";
+ }
+};
+
+} /* namespace swift */
+} /* namespace auth */
+} /* namespace rgw */
+
+
+/* Op serving the Swift v1.0 auth endpoint (GET /auth): validates
+ * X-Auth-User/X-Auth-Key and hands out an AUTH_rgwtk token.
+ * verify_permission() is a no-op — the op performs its own credential
+ * check in execute(). */
+class RGW_SWIFT_Auth_Get : public RGWOp {
+public:
+ RGW_SWIFT_Auth_Get() {}
+ ~RGW_SWIFT_Auth_Get() override {}
+
+ int verify_permission(optional_yield) override { return 0; }
+ void execute(optional_yield y) override;
+ const char* name() const override { return "swift_auth_get"; }
+ dmc::client_id dmclock_client() override { return dmc::client_id::auth; }
+};
+
+/* REST handler for the swift-auth dialect. Only GET is routed;
+ * authorize/postauth_init/read_permissions are no-ops because the auth
+ * endpoint checks credentials itself inside the op. */
+class RGWHandler_SWIFT_Auth : public RGWHandler_REST {
+public:
+ RGWHandler_SWIFT_Auth() {}
+ ~RGWHandler_SWIFT_Auth() override {}
+ RGWOp *op_get() override;
+
+ int init(rgw::sal::Driver* driver, req_state *state, rgw::io::BasicClient *cio) override;
+ int authorize(const DoutPrefixProvider *dpp, optional_yield y) override;
+ int postauth_init(optional_yield) override { return 0; }
+ int read_permissions(RGWOp *op, optional_yield) override { return 0; }
+
+ virtual RGWAccessControlPolicy *alloc_policy() { return NULL; }
+ virtual void free_policy(RGWAccessControlPolicy *policy) {}
+};
+
+/* REST manager for the auth endpoint: resolves every sub-resource to
+ * itself and always hands out a fresh RGWHandler_SWIFT_Auth (caller owns
+ * the returned handler). */
+class RGWRESTMgr_SWIFT_Auth : public RGWRESTMgr {
+public:
+ RGWRESTMgr_SWIFT_Auth() = default;
+ ~RGWRESTMgr_SWIFT_Auth() override = default;
+
+ RGWRESTMgr *get_resource_mgr(req_state* const s,
+ const std::string& uri,
+ std::string* const out_uri) override {
+ return this;
+ }
+
+ RGWHandler_REST* get_handler(rgw::sal::Driver* driver,
+ req_state*,
+ const rgw::auth::StrategyRegistry&,
+ const std::string&) override {
+ return new RGWHandler_SWIFT_Auth;
+ }
+};
diff --git a/src/rgw/rgw_sync.cc b/src/rgw/rgw_sync.cc
new file mode 100644
index 000000000..b41d9c672
--- /dev/null
+++ b/src/rgw/rgw_sync.cc
@@ -0,0 +1,24 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_sync.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+/* Log prefix used by the dout machinery for all messages emitted through
+ * this manager. */
+std::ostream& RGWMetaSyncStatusManager::gen_prefix(std::ostream& out) const
+{
+ return out << "meta sync: ";
+}
+
+/* Debug subsystem for this manager's log output (ceph_subsys_rgw). */
+unsigned RGWMetaSyncStatusManager::get_subsys() const
+{
+ return dout_subsys;
+}
+
+/* Begin shutdown: raise the going_down flag so in-flight work can bail
+ * out, then stop processing. */
+void RGWRemoteMetaLog::finish()
+{
+ going_down = true;
+ stop();
+}
diff --git a/src/rgw/rgw_sync_checkpoint.cc b/src/rgw/rgw_sync_checkpoint.cc
new file mode 100644
index 000000000..5e05b0e12
--- /dev/null
+++ b/src/rgw/rgw_sync_checkpoint.cc
@@ -0,0 +1,273 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <fmt/format.h>
+#include "common/errno.h"
+#include "rgw_sync_checkpoint.h"
+#include "rgw_sal_rados.h"
+#include "rgw_bucket_sync.h"
+#include "rgw_data_sync.h"
+#include "rgw_http_errors.h"
+#include "cls/rgw/cls_rgw_client.h"
+#include "services/svc_sys_obj.h"
+#include "services/svc_zone.h"
+#include "rgw_zone.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+namespace {
+
+/* Extract the plain incremental-sync position from a shard's sync info. */
+std::string incremental_marker(const rgw_bucket_shard_sync_info& info)
+{
+ return BucketIndexShardsManager::get_shard_marker(info.inc_marker.position);
+}
+
+/* Returns true if ANY local shard marker is behind the corresponding
+ * remote marker, i.e. sync has not yet caught up. NOTE: this is a
+ * domain-specific "is behind" predicate, not a strict weak ordering —
+ * it is only meant for the polling loop below. */
+bool operator<(const std::vector<rgw_bucket_shard_sync_info>& lhs,
+ const BucketIndexShardsManager& rhs)
+{
+ for (size_t i = 0; i < lhs.size(); ++i) {
+ const auto& l = incremental_marker(lhs[i]);
+ const auto& r = rhs.get(i, "");
+ if (l < r) {
+ return true;
+ }
+ }
+ return false;
+}
+
+/* True if none of the first `size` shards has a marker, i.e. the remote
+ * bilog is empty and there is nothing to wait for. */
+bool empty(const BucketIndexShardsManager& markers, int size)
+{
+ for (int i = 0; i < size; ++i) {
+ const auto& m = markers.get(i, "");
+ if (!m.empty()) {
+ return false;
+ }
+ }
+ return true;
+}
+
+/* Print local shard sync markers as "[m0, m1, ...]" for log messages. */
+std::ostream& operator<<(std::ostream& out, const std::vector<rgw_bucket_shard_sync_info>& rhs)
+{
+ const char* separator = ""; // first entry has no comma
+ out << '[';
+ for (auto& i : rhs) {
+ out << std::exchange(separator, ", ") << incremental_marker(i);
+ }
+ return out << ']';
+}
+
+/* Print remote shard markers as "[m0, m1, ...]"; the shard id from the
+ * map is intentionally not printed. */
+std::ostream& operator<<(std::ostream& out, const BucketIndexShardsManager& rhs)
+{
+ out << '[';
+ const char* separator = ""; // first entry has no comma
+ for (auto& [i, marker] : rhs.get()) {
+ out << std::exchange(separator, ", ") << marker;
+ }
+ return out << ']';
+}
+
+/* Poll the sync status of a single source pipe until the local bucket has
+ * caught up with the remote markers, in three stages: (1) wait for the
+ * full-sync phase to finish (state == Incremental), (2) wait until the
+ * local incremental generation reaches latest_gen, (3) wait until every
+ * shard's local marker is no longer behind its remote marker. Each stage
+ * sleeps retry_delay between polls and fails with -ETIMEDOUT once
+ * timeout_at is exceeded. -ENOENT from status reads is tolerated and
+ * retried (status objects may not exist yet). */
+int bucket_source_sync_checkpoint(const DoutPrefixProvider* dpp,
+ rgw::sal::RadosStore* store,
+ const RGWBucketInfo& bucket_info,
+ const RGWBucketInfo& source_bucket_info,
+ const rgw_sync_bucket_pipe& pipe,
+ uint64_t latest_gen,
+ const BucketIndexShardsManager& remote_markers,
+ ceph::timespan retry_delay,
+ ceph::coarse_mono_time timeout_at)
+{
+
+ const int num_shards = remote_markers.get().size();
+ rgw_bucket_sync_status full_status;
+ int r = rgw_read_bucket_full_sync_status(dpp, store, pipe, &full_status, null_yield);
+ if (r < 0 && r != -ENOENT) { // retry on ENOENT
+ return r;
+ }
+
+ // wait for incremental
+ while (full_status.state != BucketSyncState::Incremental) {
+ const auto delay_until = ceph::coarse_mono_clock::now() + retry_delay;
+ if (delay_until > timeout_at) {
+ lderr(store->ctx()) << "bucket checkpoint timed out waiting to reach incremental sync" << dendl;
+ return -ETIMEDOUT;
+ }
+ ldout(store->ctx(), 1) << "waiting to reach incremental sync.." << dendl;
+ std::this_thread::sleep_until(delay_until);
+
+ r = rgw_read_bucket_full_sync_status(dpp, store, pipe, &full_status, null_yield);
+ if (r < 0 && r != -ENOENT) { // retry on ENOENT
+ return r;
+ }
+ }
+
+ // wait for latest_gen
+ while (full_status.incremental_gen < latest_gen) {
+ const auto delay_until = ceph::coarse_mono_clock::now() + retry_delay;
+ if (delay_until > timeout_at) {
+ lderr(store->ctx()) << "bucket checkpoint timed out waiting to reach "
+ "latest generation " << latest_gen << dendl;
+ return -ETIMEDOUT;
+ }
+ ldout(store->ctx(), 1) << "waiting to reach latest gen " << latest_gen
+ << ", on " << full_status.incremental_gen << ".." << dendl;
+ std::this_thread::sleep_until(delay_until);
+
+ r = rgw_read_bucket_full_sync_status(dpp, store, pipe, &full_status, null_yield);
+ if (r < 0 && r != -ENOENT) { // retry on ENOENT
+ return r;
+ }
+ }
+
+ /* The local generation overtook the remote snapshot we compared
+ * against — by definition caught up. */
+ if (full_status.incremental_gen > latest_gen) {
+ ldpp_dout(dpp, 1) << "bucket sync caught up with source:\n"
+ << " local gen: " << full_status.incremental_gen << '\n'
+ << " remote gen: " << latest_gen << dendl;
+ return 0;
+ }
+
+ if (empty(remote_markers, num_shards)) {
+ ldpp_dout(dpp, 1) << "bucket sync caught up with empty source" << dendl;
+ return 0;
+ }
+
+ std::vector<rgw_bucket_shard_sync_info> status;
+ status.resize(std::max<size_t>(1, num_shards));
+ r = rgw_read_bucket_inc_sync_status(dpp, store, pipe,
+ full_status.incremental_gen, &status);
+ if (r < 0) {
+ return r;
+ }
+
+ /* operator< above: true while any shard is still behind its remote
+ * marker. */
+ while (status < remote_markers) {
+ const auto delay_until = ceph::coarse_mono_clock::now() + retry_delay;
+ if (delay_until > timeout_at) {
+ ldpp_dout(dpp, 0) << "bucket checkpoint timed out waiting for incremental sync to catch up" << dendl;
+ return -ETIMEDOUT;
+ }
+ ldpp_dout(dpp, 1) << "waiting for incremental sync to catch up:\n"
+ << " local status: " << status << '\n'
+ << " remote markers: " << remote_markers << dendl;
+ std::this_thread::sleep_until(delay_until);
+ r = rgw_read_bucket_inc_sync_status(dpp, store, pipe,
+ full_status.incremental_gen, &status);
+ if (r < 0) {
+ return r;
+ }
+ }
+ ldpp_dout(dpp, 1) << "bucket sync caught up with source:\n"
+ << " local status: " << status << '\n'
+ << " remote markers: " << remote_markers << dendl;
+ return 0;
+}
+
+/* Fetch the remote bucket-index log info and per-shard markers from the
+ * pipe's source zone over its REST connection. Returns -EINVAL when no
+ * connection to that zone is configured. Requires pipe.source.zone to be
+ * set (asserted). */
+int source_bilog_info(const DoutPrefixProvider *dpp,
+ RGWSI_Zone* zone_svc,
+ const rgw_sync_bucket_pipe& pipe,
+ rgw_bucket_index_marker_info& info,
+ BucketIndexShardsManager& markers,
+ optional_yield y)
+{
+ ceph_assert(pipe.source.zone);
+
+ auto& zone_conn_map = zone_svc->get_zone_conn_map();
+ auto conn = zone_conn_map.find(pipe.source.zone->id);
+ if (conn == zone_conn_map.end()) {
+ return -EINVAL;
+ }
+
+ return rgw_read_remote_bilog_info(dpp, conn->second, *pipe.source.bucket,
+ info, markers, y);
+}
+
+} // anonymous namespace
+
+/* Wait until the bucket has caught up with all of its sync sources
+ * (optionally filtered by source zone/bucket). Remote bilog markers and
+ * source bucket info are fetched concurrently via coroutines on a local
+ * io_context; the actual checkpointing then runs sequentially per source.
+ * Coroutine failures are propagated as std::system_error and converted
+ * back to a negative errno. */
+int rgw_bucket_sync_checkpoint(const DoutPrefixProvider* dpp,
+ rgw::sal::RadosStore* store,
+ const RGWBucketSyncPolicyHandler& policy,
+ const RGWBucketInfo& info,
+ std::optional<rgw_zone_id> opt_source_zone,
+ std::optional<rgw_bucket> opt_source_bucket,
+ ceph::timespan retry_delay,
+ ceph::coarse_mono_time timeout_at)
+{
+ struct sync_source_entry {
+ rgw_sync_bucket_pipe pipe;
+ uint64_t latest_gen = 0;
+ BucketIndexShardsManager remote_markers;
+ RGWBucketInfo source_bucket_info;
+ };
+ // std::list: entries must not relocate — the coroutines below capture
+ // references into them.
+ std::list<sync_source_entry> sources;
+
+ // fetch remote markers and bucket info in parallel
+ boost::asio::io_context ioctx;
+
+ for (const auto& [source_zone_id, pipe] : policy.get_all_sources()) {
+ // filter by source zone/bucket
+ if (opt_source_zone && *opt_source_zone != *pipe.source.zone) {
+ continue;
+ }
+ if (opt_source_bucket && !opt_source_bucket->match(*pipe.source.bucket)) {
+ continue;
+ }
+ auto& entry = sources.emplace_back();
+ entry.pipe = pipe;
+
+ // fetch remote markers
+ spawn::spawn(ioctx, [&] (yield_context yield) {
+ auto y = optional_yield{ioctx, yield};
+ rgw_bucket_index_marker_info info;
+ int r = source_bilog_info(dpp, store->svc()->zone, entry.pipe,
+ info, entry.remote_markers, y);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "failed to fetch remote bilog markers: "
+ << cpp_strerror(r) << dendl;
+ throw std::system_error(-r, std::system_category());
+ }
+ entry.latest_gen = info.latest_gen;
+ });
+ // fetch source bucket info
+ spawn::spawn(ioctx, [&] (yield_context yield) {
+ auto y = optional_yield{ioctx, yield};
+ int r = store->getRados()->get_bucket_instance_info(
+ *entry.pipe.source.bucket, entry.source_bucket_info,
+ nullptr, nullptr, y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "failed to read source bucket info: "
+ << cpp_strerror(r) << dendl;
+ throw std::system_error(-r, std::system_category());
+ }
+ });
+ }
+
+ try {
+ ioctx.run();
+ } catch (const std::system_error& e) {
+ return -e.code().value();
+ }
+
+ // checkpoint each source sequentially
+ for (const auto& e : sources) {
+ int r = bucket_source_sync_checkpoint(dpp, store, info, e.source_bucket_info,
+ e.pipe, e.latest_gen, e.remote_markers,
+ retry_delay, timeout_at);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "bucket sync checkpoint failed: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ }
+ ldpp_dout(dpp, 0) << "bucket checkpoint complete" << dendl;
+ return 0;
+}
+
diff --git a/src/rgw/rgw_sync_checkpoint.h b/src/rgw/rgw_sync_checkpoint.h
new file mode 100644
index 000000000..28df68d88
--- /dev/null
+++ b/src/rgw/rgw_sync_checkpoint.h
@@ -0,0 +1,35 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <optional>
+#include "common/ceph_time.h"
+#include "rgw_basic_types.h"
+
+class DoutPrefixProvider;
+namespace rgw::sal { class RadosStore; }
+class RGWBucketInfo;
+class RGWBucketSyncPolicyHandler;
+
// poll the bucket's sync status until it's caught up against all sync sources
// dpp:            logging prefix provider
// store:          RADOS store handle
// policy:         resolved sync policy handler for the bucket
// info:           the bucket's instance info
// opt_source_zone / opt_source_bucket: optional filters restricting which
//                 sync sources are checked
// retry_delay:    wait between status polls
// timeout_at:     absolute deadline for the checkpoint
// returns 0 once caught up, or a negative error code
int rgw_bucket_sync_checkpoint(const DoutPrefixProvider* dpp,
                               rgw::sal::RadosStore* store,
                               const RGWBucketSyncPolicyHandler& policy,
                               const RGWBucketInfo& info,
                               std::optional<rgw_zone_id> opt_source_zone,
                               std::optional<rgw_bucket> opt_source_bucket,
                               ceph::timespan retry_delay,
                               ceph::coarse_mono_time timeout_at);
diff --git a/src/rgw/rgw_sync_policy.cc b/src/rgw/rgw_sync_policy.cc
new file mode 100644
index 000000000..cf28d5eec
--- /dev/null
+++ b/src/rgw/rgw_sync_policy.cc
@@ -0,0 +1,787 @@
+
+
+#include "rgw_common.h"
+#include "rgw_sync_policy.h"
+#include "rgw_bucket.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
// Render this entity's bucket as a display key; delegates to the shared
// formatter, which substitutes "*" for missing/empty fields.
string rgw_sync_bucket_entity::bucket_key() const
{
  return rgw_sync_bucket_entities::bucket_key(bucket);
}
+
+bool rgw_sync_pipe_filter_tag::from_str(const string& s)
+{
+ if (s.empty()) {
+ return false;
+ }
+
+ auto pos = s.find('=');
+ if (pos == string::npos) {
+ key = s;
+ return true;
+ }
+
+ key = s.substr(0, pos);
+ if (pos < s.size() - 1) {
+ value = s.substr(pos + 1);
+ }
+
+ return true;
+}
+
+bool rgw_sync_pipe_filter_tag::operator==(const string& s) const
+{
+ if (s.empty()) {
+ return false;
+ }
+
+ auto pos = s.find('=');
+ if (pos == string::npos) {
+ return value.empty() && (s == key);
+ }
+
+ return s.compare(0, pos, s) == 0 &&
+ s.compare(pos + 1, s.size() - pos - 1, value) == 0;
+}
+
// Serialize the filter (prefix + tag set) with the standard versioned
// encoding framing.
void rgw_sync_pipe_filter::encode(bufferlist& bl) const
{
  ENCODE_START(1, 1, bl);
  encode(prefix, bl);
  encode(tags, bl);
  ENCODE_FINISH(bl);
}

// Counterpart of encode(); field order must stay in sync.
void rgw_sync_pipe_filter::decode(bufferlist::const_iterator& bl)
{
  DECODE_START(1, bl);
  decode(prefix, bl);
  decode(tags, bl);
  DECODE_FINISH(bl);
}
+
+void rgw_sync_pipe_filter::set_prefix(std::optional<std::string> opt_prefix,
+ bool prefix_rm)
+{
+ if (opt_prefix) {
+ prefix = *opt_prefix;
+ } else if (prefix_rm) {
+ prefix.reset();
+ }
+}
+
+void rgw_sync_pipe_filter::set_tags(std::list<std::string>& tags_add,
+ std::list<std::string>& tags_rm)
+{
+ for (auto& t : tags_rm) {
+ rgw_sync_pipe_filter_tag tag;
+ if (tag.from_str(t)) {
+ tags.erase(tag);
+ }
+ }
+
+ for (auto& t : tags_add) {
+ rgw_sync_pipe_filter_tag tag;
+ if (tag.from_str(t)) {
+ tags.insert(tag);
+ }
+ }
+}
+
+bool rgw_sync_pipe_filter::is_subset_of(const rgw_sync_pipe_filter& f) const
+{
+ if (f.prefix) {
+ if (!prefix) {
+ return false;
+ }
+ /* f.prefix exists, and this->prefix is either equal or bigger,
+ * therefore this->prefix also set */
+
+ if (!boost::starts_with(*prefix, *f.prefix)) {
+ return false;
+ }
+ }
+
+ /* prefix is subset, now check tags. All our tags should exist in f.tags */
+
+ for (auto& t : tags) {
+ if (f.tags.find(t) == f.tags.end()) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+bool rgw_sync_pipe_filter::check_tag(const string& s) const
+{
+ if (tags.empty()) { /* tag filter wasn't defined */
+ return true;
+ }
+
+ auto iter = tags.find(rgw_sync_pipe_filter_tag(s));
+ return (iter != tags.end());
+}
+
+bool rgw_sync_pipe_filter::check_tag(const string& k, const string& v) const
+{
+ if (tags.empty()) { /* tag filter wasn't defined */
+ return true;
+ }
+
+ auto iter = tags.find(rgw_sync_pipe_filter_tag(k, v));
+ return (iter != tags.end());
+}
+
// Whether a tag filter is configured at all.
bool rgw_sync_pipe_filter::has_tags() const
{
  return !tags.empty();
}
+
+bool rgw_sync_pipe_filter::check_tags(const std::vector<string>& _tags) const
+{
+ if (tags.empty()) {
+ return true;
+ }
+
+ for (auto& t : _tags) {
+ if (check_tag(t)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+bool rgw_sync_pipe_filter::check_tags(const RGWObjTags::tag_map_t& _tags) const
+{
+ if (tags.empty()) {
+ return true;
+ }
+
+ for (auto& item : _tags) {
+ if (check_tag(item.first, item.second)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+void rgw_sync_bucket_entity::apply_bucket(std::optional<rgw_bucket> b)
+{
+ if (!b) {
+ return;
+ }
+
+ if (!bucket ||
+ bucket->name.empty()) {
+ bucket = b;
+ }
+}
+
+void rgw_sync_bucket_entities::add_zones(const std::vector<rgw_zone_id>& new_zones) {
+ for (auto& z : new_zones) {
+ if (z == "*") {
+ all_zones = true;
+ zones.reset();
+ return;
+ }
+
+ if (!zones) {
+ zones.emplace();
+ }
+
+ zones->insert(z);
+
+ all_zones = false;
+ }
+}
+
+std::vector<rgw_sync_bucket_entity> rgw_sync_bucket_entities::expand() const
+{
+ std::vector<rgw_sync_bucket_entity> result;
+ rgw_bucket b = get_bucket();
+ if (all_zones) {
+ rgw_sync_bucket_entity e;
+ e.all_zones = true;
+ e.bucket = b;
+ result.push_back(e);
+ return result;
+ }
+
+ if (!zones) {
+ return result;
+ }
+
+ for (auto& z : *zones) {
+ rgw_sync_bucket_entity e;
+ e.all_zones = false;
+ e.bucket = b;
+ e.zone = z;
+ result.push_back(e);
+ }
+
+ return result;
+}
+
+void rgw_sync_bucket_entities::remove_zones(const std::vector<rgw_zone_id>& rm_zones) {
+ all_zones = false;
+
+ if (!zones) {
+ return;
+ }
+
+ for (auto& z : rm_zones) {
+ zones->erase(z);
+ }
+}
+
// Overwrite *field from an optional spec: nullopt keeps the current value,
// "*" clears it (wildcard), anything else replaces it.
static void set_bucket_field(std::optional<std::string> source, std::string *field) {
  if (!source) {
    return;
  }
  if (*source == "*") {
    field->clear();
  } else {
    *field = std::move(*source);
  }
}
+
+void rgw_sync_bucket_entities::set_bucket(std::optional<string> tenant,
+ std::optional<string> bucket_name,
+ std::optional<string> bucket_id)
+{
+ if ((!bucket) && (tenant || bucket_name || bucket_id)) {
+ bucket.emplace();
+ }
+
+ if (!bucket) {
+ return;
+ }
+
+ set_bucket_field(tenant, &bucket->tenant);
+ set_bucket_field(bucket_name, &bucket->name);
+ set_bucket_field(bucket_id, &bucket->bucket_id);
+
+ if (bucket->tenant.empty() &&
+ bucket->name.empty() &&
+ bucket->bucket_id.empty()) {
+ bucket.reset();
+ }
+}
+
+void rgw_sync_bucket_entities::remove_bucket(std::optional<string> tenant,
+ std::optional<string> bucket_name,
+ std::optional<string> bucket_id)
+{
+ if (!bucket) {
+ return;
+ }
+
+ if (tenant) {
+ bucket->tenant.clear();
+ }
+ if (bucket_name) {
+ bucket->name.clear();
+ }
+ if (bucket_id) {
+ bucket->bucket_id.clear();
+ }
+
+ if (bucket->tenant.empty() &&
+ bucket->name.empty() &&
+ bucket->bucket_id.empty()) {
+ bucket.reset();
+ }
+}
+
+
+string rgw_sync_bucket_entities::bucket_key(std::optional<rgw_bucket> b)
+{
+ if (!b) {
+ return string("*");
+ }
+
+ rgw_bucket _b = *b;
+
+ if (_b.name.empty()) {
+ _b.name = "*";
+ }
+
+ return _b.get_key();
+}
+
+std::vector<rgw_sync_bucket_pipe> rgw_sync_bucket_pipes::expand() const
+{
+ std::vector<rgw_sync_bucket_pipe> result;
+
+ auto sources = source.expand();
+ auto dests = dest.expand();
+
+ for (auto& s : sources) {
+ for (auto& d : dests) {
+ rgw_sync_bucket_pipe pipe;
+ pipe.id = id;
+ pipe.source = s;
+ pipe.dest = d;
+ pipe.params = params;
+ result.push_back(pipe);
+ }
+ }
+
+ return result;
+}
+
+
+void rgw_sync_bucket_pipes::get_potential_related_buckets(const rgw_bucket& bucket,
+ std::set<rgw_bucket> *sources,
+ std::set<rgw_bucket> *dests) const
+{
+ if (dest.match_bucket(bucket)) {
+ auto expanded_sources = source.expand();
+
+ for (auto& s : expanded_sources) {
+ if (s.bucket && !s.bucket->name.empty()) {
+ sources->insert(*s.bucket);
+ }
+ }
+ }
+
+ if (source.match_bucket(bucket)) {
+ auto expanded_dests = dest.expand();
+
+ for (auto& d : expanded_dests) {
+ if (d.bucket && !d.bucket->name.empty()) {
+ dests->insert(*d.bucket);
+ }
+ }
+ }
+}
+
+bool rgw_sync_data_flow_group::find_or_create_symmetrical(const string& flow_id, rgw_sync_symmetric_group **flow_group)
+{
+ for (auto& group : symmetrical) {
+ if (flow_id == group.id) {
+ *flow_group = &group;
+ return true;
+ }
+ }
+
+ auto& group = symmetrical.emplace_back();
+ *flow_group = &group;
+ (*flow_group)->id = flow_id;
+ return true;
+}
+
+void rgw_sync_data_flow_group::remove_symmetrical(const string& flow_id, std::optional<std::vector<rgw_zone_id> > zones)
+{
+ if (symmetrical.empty()) {
+ return;
+ }
+
+ auto& groups = symmetrical;
+
+ auto iter = groups.begin();
+
+ for (; iter != groups.end(); ++iter) {
+ if (iter->id == flow_id) {
+ if (!zones) {
+ groups.erase(iter);
+ if (groups.empty()) {
+ symmetrical.clear();
+ }
+ return;
+ }
+ break;
+ }
+ }
+
+ if (iter == groups.end()) {
+ return;
+ }
+
+ auto& flow_group = *iter;
+
+ for (auto& z : *zones) {
+ flow_group.zones.erase(z);
+ }
+
+ if (flow_group.zones.empty()) {
+ groups.erase(iter);
+ }
+ if (groups.empty()) {
+ symmetrical.clear();
+ }
+}
+
+bool rgw_sync_data_flow_group::find_or_create_directional(const rgw_zone_id& source_zone, const rgw_zone_id& dest_zone, rgw_sync_directional_rule **flow_group)
+{
+ for (auto& rule : directional) {
+ if (source_zone == rule.source_zone &&
+ dest_zone == rule.dest_zone) {
+ *flow_group = &rule;
+ return true;
+ }
+ }
+
+ auto& rule = directional.emplace_back();
+ *flow_group = &rule;
+
+ rule.source_zone = source_zone;
+ rule.dest_zone = dest_zone;
+
+ return true;
+}
+
+void rgw_sync_data_flow_group::remove_directional(const rgw_zone_id& source_zone, const rgw_zone_id& dest_zone)
+{
+ if (directional.empty()) {
+ return;
+ }
+
+ for (auto iter = directional.begin(); iter != directional.end(); ++iter) {
+ auto& rule = *iter;
+ if (source_zone == rule.source_zone &&
+ dest_zone == rule.dest_zone) {
+ directional.erase(iter);
+ return;
+ }
+ }
+}
+
+void rgw_sync_data_flow_group::init_default(const std::set<rgw_zone_id>& zones)
+{
+ symmetrical.clear();
+ symmetrical.push_back(rgw_sync_symmetric_group("default", zones));
+}
+
+bool rgw_sync_policy_group::find_pipe(const string& pipe_id, bool create, rgw_sync_bucket_pipes **pipe)
+{
+ for (auto& p : pipes) {
+ if (pipe_id == p.id) {
+ *pipe = &p;
+ return true;
+ }
+ }
+
+ if (!create) {
+ return false;
+ }
+
+ auto& p = pipes.emplace_back();
+ *pipe = &p;
+ p.id = pipe_id;
+
+ return true;
+}
+
+void rgw_sync_policy_group::remove_pipe(const string& pipe_id)
+{
+ for (auto iter = pipes.begin(); iter != pipes.end(); ++iter) {
+ if (pipe_id == iter->id) {
+ pipes.erase(iter);
+ return;
+ }
+ }
+}
+
// Fan out the related-bucket query to every pipe set in this group.
void rgw_sync_policy_group::get_potential_related_buckets(const rgw_bucket& bucket,
                                                          std::set<rgw_bucket> *sources,
                                                          std::set<rgw_bucket> *dests) const
{
  for (auto& pipe : pipes) {
    pipe.get_potential_related_buckets(bucket, sources, dests);
  }
}
+
+void rgw_sync_policy_info::get_potential_related_buckets(const rgw_bucket& bucket,
+ std::set<rgw_bucket> *sources,
+ std::set<rgw_bucket> *dests) const
+{
+ for (auto& entry : groups) {
+ auto& group = entry.second;
+ group.get_potential_related_buckets(bucket, sources, dests);
+ }
+}
+
// ---- JSON formatting/parsing for the basic sync-policy types ----

// Directional rule: {source_zone, dest_zone}.
void rgw_sync_directional_rule::dump(Formatter *f) const
{
  encode_json("source_zone", source_zone, f);
  encode_json("dest_zone", dest_zone, f);
}

void rgw_sync_directional_rule::decode_json(JSONObj *obj)
{
  JSONDecoder::decode_json("source_zone", source_zone, obj);
  JSONDecoder::decode_json("dest_zone", dest_zone, obj);
}

// Symmetrical group: {id, zones}.
void rgw_sync_symmetric_group::dump(Formatter *f) const
{
  encode_json("id", id, f);
  encode_json("zones", zones, f);
}

void rgw_sync_symmetric_group::decode_json(JSONObj *obj)
{
  JSONDecoder::decode_json("id", id, obj);
  JSONDecoder::decode_json("zones", zones, obj);
}

// Bucket entity: the bucket is rendered via bucket_key(), which uses "*"
// wildcards for missing fields.
void rgw_sync_bucket_entity::dump(Formatter *f) const
{
  encode_json("zone", zone, f);
  encode_json("bucket", bucket_key(), f);
}

void rgw_sync_bucket_entity::decode_json(JSONObj *obj)
{
  JSONDecoder::decode_json("zone", zone, obj);
  string s;
  if (JSONDecoder::decode_json("bucket", s, obj)) {
    rgw_bucket b;
    // an unparseable bucket key leaves the entity without a bucket
    int ret = rgw_bucket_parse_bucket_key(nullptr, s, &b, nullptr);
    if (ret >= 0) {
      bucket = b;
    } else {
      bucket.reset();
    }
  }
}

// Filter tag: {key, value}.
void rgw_sync_pipe_filter_tag::dump(Formatter *f) const
{
  encode_json("key", key, f);
  encode_json("value", value, f);
}

void rgw_sync_pipe_filter_tag::decode_json(JSONObj *obj)
{
  JSONDecoder::decode_json("key", key, obj);
  JSONDecoder::decode_json("value", value, obj);
}
+
// Filter: optional prefix plus tag set.
void rgw_sync_pipe_filter::dump(Formatter *f) const
{
  encode_json("prefix", prefix, f);
  encode_json("tags", tags, f);
}

void rgw_sync_pipe_filter::decode_json(JSONObj *obj)
{
  JSONDecoder::decode_json("prefix", prefix, obj);
  JSONDecoder::decode_json("tags", tags, obj);
}

// ACL translation: only the destination object owner.
void rgw_sync_pipe_acl_translation::dump(Formatter *f) const
{
  encode_json("owner", owner, f);
}

void rgw_sync_pipe_acl_translation::decode_json(JSONObj *obj)
{
  JSONDecoder::decode_json("owner", owner, obj);
}

void rgw_sync_pipe_source_params::dump(Formatter *f) const
{
  encode_json("filter", filter, f);
}

void rgw_sync_pipe_source_params::decode_json(JSONObj *obj)
{
  JSONDecoder::decode_json("filter", filter, obj);
}

void rgw_sync_pipe_dest_params::dump(Formatter *f) const
{
  encode_json("acl_translation", acl_translation, f);
  encode_json("storage_class", storage_class, f);
}

void rgw_sync_pipe_dest_params::decode_json(JSONObj *obj)
{
  JSONDecoder::decode_json("acl_translation", acl_translation, obj);
  JSONDecoder::decode_json("storage_class", storage_class, obj);
}

// Pipe params: mode is serialized as a string ("system"/"user").
void rgw_sync_pipe_params::dump(Formatter *f) const
{
  encode_json("source", source, f);
  encode_json("dest", dest, f);
  encode_json("priority", priority, f);
  string s;
  switch (mode) {
    case MODE_SYSTEM:
      s = "system";
      break;
    default:
      s = "user";
  }
  encode_json("mode", s, f);
  encode_json("user", user, f);
}

void rgw_sync_pipe_params::decode_json(JSONObj *obj)
{
  JSONDecoder::decode_json("source", source, obj);
  JSONDecoder::decode_json("dest", dest, obj);
  JSONDecoder::decode_json("priority", priority, obj);
  string s;
  JSONDecoder::decode_json("mode", s, obj);
  // anything other than "system" (including a missing field) means user mode
  if (s == "system") {
    mode = MODE_SYSTEM;
  } else {
    mode = MODE_USER;
  }
  JSONDecoder::decode_json("user", user, obj);
}
+
// Entities: the bucket is rendered with "*" wildcards; zones are either the
// explicit set or a single ["*"] entry when all_zones is set.
void rgw_sync_bucket_entities::dump(Formatter *f) const
{
  encode_json("bucket", rgw_sync_bucket_entities::bucket_key(bucket), f);
  if (zones) {
    encode_json("zones", zones, f);
  } else if (all_zones) {
    set<string> z = { "*" };
    encode_json("zones", z, f);
  }
}

void rgw_sync_bucket_entities::decode_json(JSONObj *obj)
{
  string s;
  JSONDecoder::decode_json("bucket", s, obj);
  if (s == "*") {
    bucket.reset();
  } else {
    rgw_bucket b;
    int ret = rgw_bucket_parse_bucket_key(nullptr, s, &b, nullptr);
    if (ret < 0) {
      bucket.reset();
    } else {
      // a per-field "*" also means wildcard: store it as a cleared field
      if (b.tenant == "*") {
        b.tenant.clear();
      }
      if (b.name == "*") {
        b.name.clear();
      }
      if (b.bucket_id == "*") {
        b.bucket_id.clear();
      }
      bucket = b;
    }
  }
  JSONDecoder::decode_json("zones", zones, obj);
  // a lone "*" zone entry folds back into all-zones mode
  if (zones && zones->size() == 1) {
    auto iter = zones->begin();
    if (*iter == "*") {
      zones.reset();
      all_zones = true;
    }
  }
}
+
// Expanded pipe and pipe-set serialization share the same field layout:
// {id, source, dest, params}.
void rgw_sync_bucket_pipe::dump(Formatter *f) const
{
  encode_json("id", id, f);
  encode_json("source", source, f);
  encode_json("dest", dest, f);
  encode_json("params", params, f);
}

void rgw_sync_bucket_pipe::decode_json(JSONObj *obj)
{
  JSONDecoder::decode_json("id", id, obj);
  JSONDecoder::decode_json("source", source, obj);
  JSONDecoder::decode_json("dest", dest, obj);
  JSONDecoder::decode_json("params", params, obj);
}

void rgw_sync_bucket_pipes::dump(Formatter *f) const
{
  encode_json("id", id, f);
  encode_json("source", source, f);
  encode_json("dest", dest, f);
  encode_json("params", params, f);
}

void rgw_sync_bucket_pipes::decode_json(JSONObj *obj)
{
  JSONDecoder::decode_json("id", id, obj);
  JSONDecoder::decode_json("source", source, obj);
  JSONDecoder::decode_json("dest", dest, obj);
  JSONDecoder::decode_json("params", params, obj);
}
+
// Data flow: empty sections are omitted from the output.
void rgw_sync_data_flow_group::dump(Formatter *f) const
{
  if (!symmetrical.empty()) {
    encode_json("symmetrical", symmetrical, f);
  }

  if (!directional.empty()) {
    encode_json("directional", directional, f);
  }
}

void rgw_sync_data_flow_group::decode_json(JSONObj *obj)
{
  JSONDecoder::decode_json("symmetrical", symmetrical, obj);
  JSONDecoder::decode_json("directional", directional, obj);
}

// Policy group: status is serialized as a lowercase string.
void rgw_sync_policy_group::dump(Formatter *f) const
{
  encode_json("id", id, f);
  encode_json("data_flow", data_flow, f);
  encode_json("pipes", pipes, f);
  string s;
  switch (status) {
    case rgw_sync_policy_group::Status::FORBIDDEN:
      s = "forbidden";
      break;
    case rgw_sync_policy_group::Status::ALLOWED:
      s = "allowed";
      break;
    case rgw_sync_policy_group::Status::ENABLED:
      s = "enabled";
      break;
    default:
      s = "unknown";
  }
  encode_json("status", s, f);
}

void rgw_sync_policy_group::decode_json(JSONObj *obj)
{
  JSONDecoder::decode_json("id", id, obj);
  JSONDecoder::decode_json("data_flow", data_flow, obj);
  JSONDecoder::decode_json("pipes", pipes, obj);
  string s;
  JSONDecoder::decode_json("status", s, obj);
  // set_status() maps any unrecognized string to Status::UNKNOWN
  set_status(s);
}
+
// Policy info: the id-keyed map is flattened to a JSON array of groups.
void rgw_sync_policy_info::dump(Formatter *f) const
{
  Formatter::ArraySection section(*f, "groups");
  for (auto& group : groups ) {
    encode_json("group", group.second, f);
  }
}

void rgw_sync_policy_info::decode_json(JSONObj *obj)
{
  vector<rgw_sync_policy_group> groups_vec;

  JSONDecoder::decode_json("groups", groups_vec, obj);

  for (auto& group : groups_vec) {
    // safe despite the move: the pair's first member (the id copy) is
    // constructed before the group is moved into the second member
    groups.emplace(std::make_pair(group.id, std::move(group)));
  }
}
+
diff --git a/src/rgw/rgw_sync_policy.h b/src/rgw/rgw_sync_policy.h
new file mode 100644
index 000000000..4758c426d
--- /dev/null
+++ b/src/rgw/rgw_sync_policy.h
@@ -0,0 +1,682 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "rgw_basic_types.h"
+#include "rgw_tag.h"
+
+
+struct rgw_sync_symmetric_group {
+ std::string id;
+ std::set<rgw_zone_id> zones;
+
+ rgw_sync_symmetric_group() {}
+ rgw_sync_symmetric_group(const std::string& _id,
+ const std::set<rgw_zone_id> _zones) : id(_id), zones(_zones) {}
+
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(id, bl);
+ encode(zones, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(id, bl);
+ decode(zones, bl);
+ DECODE_FINISH(bl);
+ }
+
+ void dump(ceph::Formatter *f) const;
+ void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(rgw_sync_symmetric_group)
+
+struct rgw_sync_directional_rule {
+ rgw_zone_id source_zone;
+ rgw_zone_id dest_zone;
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(source_zone, bl);
+ encode(dest_zone, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(source_zone, bl);
+ decode(dest_zone, bl);
+ DECODE_FINISH(bl);
+ }
+
+ void dump(ceph::Formatter *f) const;
+ void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(rgw_sync_directional_rule)
+
+struct rgw_sync_bucket_entity {
+ std::optional<rgw_zone_id> zone; /* define specific zones */
+ std::optional<rgw_bucket> bucket; /* define specific bucket */
+
+ static bool match_str(const std::string& s1, const std::string& s2) { /* empty std::string is wildcard */
+ return (s1.empty() ||
+ s2.empty() ||
+ s1 == s2);
+ }
+
+ bool all_zones{false};
+
+ rgw_sync_bucket_entity() {}
+ rgw_sync_bucket_entity(const rgw_zone_id& _zone,
+ std::optional<rgw_bucket> _bucket) : zone(_zone),
+ bucket(_bucket.value_or(rgw_bucket())) {}
+
+ bool specific() const {
+ return zone && bucket;
+ }
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(all_zones, bl);
+ encode(zone, bl);
+ encode(bucket, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(all_zones, bl);
+ decode(zone, bl);
+ decode(bucket, bl);
+ DECODE_FINISH(bl);
+ }
+
+ void dump(ceph::Formatter *f) const;
+ void decode_json(JSONObj *obj);
+
+ rgw_bucket get_bucket() const {
+ return bucket.value_or(rgw_bucket());
+ }
+
+ std::string bucket_key() const;
+
+ bool match_zone(const rgw_zone_id& z) const {
+ if (all_zones) {
+ return true;
+ }
+ if (!zone) {
+ return false;
+ }
+
+ return (*zone == z);
+ }
+
+ void apply_zone(const rgw_zone_id& z) {
+ all_zones = false;
+ zone = z;
+ }
+
+ static bool match_bucket_id(const std::string& bid1, const std::string& bid2) {
+ return (bid1.empty() || bid2.empty() || (bid1 == bid2));
+ }
+
+ bool match_bucket(std::optional<rgw_bucket> b) const {
+ if (!b) {
+ return true;
+ }
+
+ if (!bucket) {
+ return true;
+ }
+
+ return (match_str(bucket->tenant, b->tenant) &&
+ match_str(bucket->name, b->name) &&
+ match_bucket_id(bucket->bucket_id, b->bucket_id));
+ }
+
+ bool match(const rgw_sync_bucket_entity& entity) const {
+ if (!entity.zone) {
+ return match_bucket(entity.bucket);
+ }
+ return (match_zone(*entity.zone) && match_bucket(entity.bucket));
+ }
+
+ const bool operator<(const rgw_sync_bucket_entity& e) const {
+ if (all_zones && !e.all_zones) {
+ return false;
+ }
+ if (!all_zones && e.all_zones) {
+ return true;
+ }
+ if (zone < e.zone) {
+ return true;
+ }
+ if (e.zone < zone) {
+ return false;
+ }
+ return (bucket < e.bucket);
+ }
+
+ void apply_bucket(std::optional<rgw_bucket> _b);
+};
+WRITE_CLASS_ENCODER(rgw_sync_bucket_entity)
+
+struct rgw_sync_pipe_filter_tag {
+ std::string key;
+ std::string value;
+
+ rgw_sync_pipe_filter_tag() {}
+ rgw_sync_pipe_filter_tag(const std::string& s) {
+ from_str(s);
+ }
+ rgw_sync_pipe_filter_tag(const std::string& _key,
+ const std::string& _value) : key(_key),
+ value(_value) {}
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(key, bl);
+ encode(value, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(key, bl);
+ decode(value, bl);
+ DECODE_FINISH(bl);
+ }
+
+ void dump(ceph::Formatter *f) const;
+ void decode_json(JSONObj *obj);
+
+ bool from_str(const std::string& s);
+
+ bool operator<(const rgw_sync_pipe_filter_tag& t) const {
+ if (key < t.key) {
+ return true;
+ }
+ if (t.key < key) {
+ return false;
+ }
+ return (value < t.value);
+ }
+
+ bool operator==(const std::string& s) const;
+};
+WRITE_CLASS_ENCODER(rgw_sync_pipe_filter_tag)
+
+struct rgw_sync_pipe_filter {
+ std::optional<std::string> prefix;
+ std::set<rgw_sync_pipe_filter_tag> tags;
+
+ void set_prefix(std::optional<std::string> opt_prefix,
+ bool prefix_rm);
+ void set_tags(std::list<std::string>& tags_add,
+ std::list<std::string>& tags_rm);
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::const_iterator& bl);
+
+ void dump(ceph::Formatter *f) const;
+ void decode_json(JSONObj *obj);
+
+ bool is_subset_of(const rgw_sync_pipe_filter& f) const;
+
+ bool has_tags() const;
+ bool check_tag(const std::string& s) const;
+ bool check_tag(const std::string& k, const std::string& v) const;
+ bool check_tags(const std::vector<std::string>& tags) const;
+ bool check_tags(const RGWObjTags::tag_map_t& tags) const;
+};
+WRITE_CLASS_ENCODER(rgw_sync_pipe_filter)
+
+struct rgw_sync_pipe_acl_translation {
+ rgw_user owner;
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(owner, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(owner, bl);
+ DECODE_FINISH(bl);
+ }
+
+ void dump(ceph::Formatter *f) const;
+ void decode_json(JSONObj *obj);
+
+ bool operator==(const rgw_sync_pipe_acl_translation& aclt) const {
+ return (owner == aclt.owner);
+ }
+};
+WRITE_CLASS_ENCODER(rgw_sync_pipe_acl_translation)
+
+struct rgw_sync_pipe_source_params {
+ rgw_sync_pipe_filter filter;
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(filter, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(filter, bl);
+ DECODE_FINISH(bl);
+ }
+
+ void dump(ceph::Formatter *f) const;
+ void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(rgw_sync_pipe_source_params)
+
+struct rgw_sync_pipe_dest_params {
+ std::optional<rgw_sync_pipe_acl_translation> acl_translation;
+ std::optional<std::string> storage_class;
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(acl_translation, bl);
+ encode(storage_class, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(acl_translation, bl);
+ decode(storage_class, bl);
+ DECODE_FINISH(bl);
+ }
+
+ void set_storage_class(const std::string& sc) {
+ storage_class = sc;
+ }
+
+ void set_owner(const rgw_user& owner) {
+ if (owner.empty()){
+ acl_translation.reset();
+ } else {
+ acl_translation.emplace();
+ acl_translation->owner = owner;
+ }
+ }
+
+ void dump(ceph::Formatter *f) const;
+ void decode_json(JSONObj *obj);
+
+ bool operator==(const rgw_sync_pipe_dest_params& rhs) const {
+ return (acl_translation == rhs.acl_translation &&
+ storage_class == rhs.storage_class);
+ }
+};
+WRITE_CLASS_ENCODER(rgw_sync_pipe_dest_params)
+
+struct rgw_sync_pipe_params {
+ rgw_sync_pipe_source_params source;
+ rgw_sync_pipe_dest_params dest;
+ enum Mode {
+ MODE_SYSTEM = 0,
+ MODE_USER = 1,
+ } mode{MODE_SYSTEM};
+ int32_t priority{0};
+ rgw_user user;
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(source, bl);
+ encode(dest, bl);
+ encode(priority, bl);
+ encode((uint8_t)mode, bl);
+ encode(user, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(source, bl);
+ decode(dest, bl);
+ decode(priority, bl);
+ uint8_t m;
+ decode(m, bl);
+ mode = (Mode)m;
+ decode(user, bl);
+ DECODE_FINISH(bl);
+ }
+
+ void dump(ceph::Formatter *f) const;
+ void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(rgw_sync_pipe_params)
+
+struct rgw_sync_bucket_pipe {
+ std::string id;
+ rgw_sync_bucket_entity source;
+ rgw_sync_bucket_entity dest;
+
+ rgw_sync_pipe_params params;
+
+ bool specific() const {
+ return source.specific() && dest.specific();
+ }
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(id, bl);
+ encode(source, bl);
+ encode(dest, bl);
+ encode(params, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(id, bl);
+ decode(source, bl);
+ decode(dest, bl);
+ decode(params, bl);
+ DECODE_FINISH(bl);
+ }
+
+ const bool operator<(const rgw_sync_bucket_pipe& p) const {
+ if (id < p.id) {
+ return true;
+ }
+ if (id >p.id) {
+ return false;
+ }
+ if (source < p.source) {
+ return true;
+ }
+ if (p.source < source) {
+ return false;
+ }
+ return (dest < p.dest);
+ }
+
+ void dump(ceph::Formatter *f) const;
+ void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(rgw_sync_bucket_pipe)
+
+struct rgw_sync_bucket_entities {
+ std::optional<rgw_bucket> bucket; /* define specific bucket */
+ std::optional<std::set<rgw_zone_id> > zones; /* define specific zones, if not set then all zones */
+
+ bool all_zones{false};
+
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(bucket, bl);
+ encode(zones, bl);
+ encode(all_zones, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(bucket, bl);
+ decode(zones, bl);
+ decode(all_zones, bl);
+ DECODE_FINISH(bl);
+ }
+
+ void dump(ceph::Formatter *f) const;
+ void decode_json(JSONObj *obj);
+
+ bool match_bucket(std::optional<rgw_bucket> b) const {
+ if (!b) {
+ return true;
+ }
+
+ if (!bucket) {
+ return true;
+ }
+
+ return (rgw_sync_bucket_entity::match_str(bucket->tenant, b->tenant) &&
+ rgw_sync_bucket_entity::match_str(bucket->name, b->name) &&
+ rgw_sync_bucket_entity::match_str(bucket->bucket_id, b->bucket_id));
+ }
+
+ void add_zones(const std::vector<rgw_zone_id>& new_zones);
+ void remove_zones(const std::vector<rgw_zone_id>& rm_zones);
+ void set_bucket(std::optional<std::string> tenant,
+ std::optional<std::string> bucket_name,
+ std::optional<std::string> bucket_id);
+ void remove_bucket(std::optional<std::string> tenant,
+ std::optional<std::string> bucket_name,
+ std::optional<std::string> bucket_id);
+
+ bool match_zone(const rgw_zone_id& zone) const {
+ if (!zones) {
+ if (all_zones) {
+ return true;
+ }
+ return false;
+ }
+
+ return (zones->find(zone) != zones->end());
+ }
+
+ std::vector<rgw_sync_bucket_entity> expand() const;
+
+ rgw_bucket get_bucket() const {
+ return bucket.value_or(rgw_bucket());
+ }
+
+ static std::string bucket_key(std::optional<rgw_bucket> b);
+
+ void set_all_zones(bool state) {
+ all_zones = state;
+ if (all_zones) {
+ zones.reset();
+ }
+ }
+};
+WRITE_CLASS_ENCODER(rgw_sync_bucket_entities)
+
+struct rgw_sync_bucket_pipes {
+ std::string id;
+ rgw_sync_bucket_entities source;
+ rgw_sync_bucket_entities dest;
+
+ rgw_sync_pipe_params params;
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(id, bl);
+ encode(source, bl);
+ encode(dest, bl);
+ encode(params, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(id, bl);
+ decode(source, bl);
+ decode(dest, bl);
+ decode(params, bl);
+ DECODE_FINISH(bl);
+ }
+
+ bool match_source(const rgw_zone_id& zone, std::optional<rgw_bucket> b) const {
+ return (source.match_zone(zone) && source.match_bucket(b));
+ }
+
+ bool match_dest(const rgw_zone_id& zone, std::optional<rgw_bucket> b) const {
+ return (dest.match_zone(zone) && dest.match_bucket(b));
+ }
+
+ bool contains_zone_bucket(const rgw_zone_id& zone, std::optional<rgw_bucket> b) const {
+ return (match_source(zone, b) || match_dest(zone, b));
+ }
+
+ void dump(ceph::Formatter *f) const;
+ void decode_json(JSONObj *obj);
+
+ std::vector<rgw_sync_bucket_pipe> expand() const;
+
+ void get_potential_related_buckets(const rgw_bucket& bucket,
+ std::set<rgw_bucket> *sources,
+ std::set<rgw_bucket> *dests) const;
+};
+WRITE_CLASS_ENCODER(rgw_sync_bucket_pipes)
+
+std::ostream& operator<<(std::ostream& os, const rgw_sync_bucket_entity& e);
+std::ostream& operator<<(std::ostream& os, const rgw_sync_bucket_pipe& pipe);
+std::ostream& operator<<(std::ostream& os, const rgw_sync_bucket_entities& e);
+std::ostream& operator<<(std::ostream& os, const rgw_sync_bucket_pipes& pipe);
+
+/*
+ * define data flow between zones. Symmetrical: zones sync from each other.
+ * Directional: one zone fetches data from another.
+ */
+struct rgw_sync_data_flow_group {
+ std::vector<rgw_sync_symmetric_group> symmetrical;
+ std::vector<rgw_sync_directional_rule> directional;
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(symmetrical, bl);
+ encode(directional, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(symmetrical, bl);
+ decode(directional, bl);
+ DECODE_FINISH(bl);
+ }
+
+ void dump(ceph::Formatter *f) const;
+ void decode_json(JSONObj *obj);
+
+ bool empty() const {
+ return (symmetrical.empty() && directional.empty());
+ }
+
+ bool find_or_create_symmetrical(const std::string& flow_id, rgw_sync_symmetric_group **flow_group);
+ void remove_symmetrical(const std::string& flow_id, std::optional<std::vector<rgw_zone_id> > zones);
+ bool find_or_create_directional(const rgw_zone_id& source_zone, const rgw_zone_id& dest_zone, rgw_sync_directional_rule **flow_group);
+ void remove_directional(const rgw_zone_id& source_zone, const rgw_zone_id& dest_zone);
+
+ void init_default(const std::set<rgw_zone_id>& zones);
+};
+WRITE_CLASS_ENCODER(rgw_sync_data_flow_group)
+
+
+struct rgw_sync_policy_group {
+ std::string id;
+
+ rgw_sync_data_flow_group data_flow; /* override data flow, howver, will not be able to
+ add new flows that don't exist at higher level */
+ std::vector<rgw_sync_bucket_pipes> pipes; /* if not defined then applies to all
+ buckets (DR sync) */
+
+ enum Status {
+ UNKNOWN = 0, /* ? */
+ FORBIDDEN = 1, /* sync not allowed */
+ ALLOWED = 2, /* sync allowed */
+ ENABLED = 3, /* sync should happen */
+ } status;
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(id, bl);
+ encode(data_flow, bl);
+ encode(pipes, bl);
+ encode((uint32_t)status, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(id, bl);
+ decode(data_flow, bl);
+ decode(pipes, bl);
+ uint32_t s;
+ decode(s, bl);
+ status = (Status)s;
+ DECODE_FINISH(bl);
+ }
+
+ void dump(ceph::Formatter *f) const;
+ void decode_json(JSONObj *obj);
+
+ bool set_status(const std::string& s) {
+ if (s == "forbidden") {
+ status = rgw_sync_policy_group::Status::FORBIDDEN;
+ } else if (s == "allowed") {
+ status = rgw_sync_policy_group::Status::ALLOWED;
+ } else if (s == "enabled") {
+ status = rgw_sync_policy_group::Status::ENABLED;
+ } else {
+ status = rgw_sync_policy_group::Status::UNKNOWN;
+ return false;
+ }
+
+ return true;
+ }
+
+ bool find_pipe(const std::string& pipe_id, bool create, rgw_sync_bucket_pipes **pipe);
+ void remove_pipe(const std::string& pipe_id);
+
+ void get_potential_related_buckets(const rgw_bucket& bucket,
+ std::set<rgw_bucket> *sources,
+ std::set<rgw_bucket> *dests) const;
+
+};
+WRITE_CLASS_ENCODER(rgw_sync_policy_group)
+
+struct rgw_sync_policy_info {
+ std::map<std::string, rgw_sync_policy_group> groups;
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(1, 1, bl);
+ encode(groups, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(1, bl);
+ decode(groups, bl);
+ DECODE_FINISH(bl);
+ }
+
+ void dump(ceph::Formatter *f) const;
+ void decode_json(JSONObj *obj);
+
+ bool empty() const {
+ return groups.empty();
+ }
+
+ void get_potential_related_buckets(const rgw_bucket& bucket,
+ std::set<rgw_bucket> *sources,
+ std::set<rgw_bucket> *dests) const;
+};
+WRITE_CLASS_ENCODER(rgw_sync_policy_info)
+
+
diff --git a/src/rgw/rgw_tag.cc b/src/rgw/rgw_tag.cc
new file mode 100644
index 000000000..f7e52592f
--- /dev/null
+++ b/src/rgw/rgw_tag.cc
@@ -0,0 +1,67 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <map>
+#include <string>
+
+#include <common/errno.h>
+#include <boost/algorithm/string.hpp>
+
+#include "rgw_tag.h"
+#include "rgw_common.h"
+
+using namespace std;
+
+void RGWObjTags::add_tag(const string& key, const string& val){
+ tag_map.emplace(std::make_pair(key,val));
+}
+
// Move-insert a key/value tag, avoiding copies of the strings.
void RGWObjTags::emplace_tag(std::string&& key, std::string&& val){
  tag_map.emplace(std::move(key), std::move(val));
}
+
+int RGWObjTags::check_and_add_tag(const string&key, const string& val){
+ if (tag_map.size() == max_obj_tags ||
+ key.size() > max_tag_key_size ||
+ val.size() > max_tag_val_size ||
+ key.size() == 0){
+ return -ERR_INVALID_TAG;
+ }
+
+ add_tag(key,val);
+
+ return 0;
+}
+
+int RGWObjTags::set_from_string(const string& input){
+ if (input.empty()) {
+ return 0;
+ }
+ int ret=0;
+ vector <string> kvs;
+ boost::split(kvs, input, boost::is_any_of("&"));
+ for (const auto& kv: kvs){
+ auto p = kv.find("=");
+ string key,val;
+ if (p != string::npos) {
+ ret = check_and_add_tag(url_decode(kv.substr(0,p)),
+ url_decode(kv.substr(p+1)));
+ } else {
+ ret = check_and_add_tag(url_decode(kv));
+ }
+
+ if (ret < 0)
+ return ret;
+ }
+ return ret;
+}
+
+void RGWObjTags::dump(Formatter *f) const
+{
+ f->open_object_section("tagset");
+ for (auto& tag: tag_map){
+ f->dump_string(tag.first.c_str(), tag.second);
+ }
+ f->close_section();
+}
+
diff --git a/src/rgw/rgw_tag.h b/src/rgw/rgw_tag.h
new file mode 100644
index 000000000..15bb25ee8
--- /dev/null
+++ b/src/rgw/rgw_tag.h
@@ -0,0 +1,49 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <string>
+#include <include/types.h>
+#include <map>
+
// Container for S3-style object tags with validation limits.
// Tags are kept in a multimap so duplicate keys survive decode; the S3
// front-end enforces uniqueness separately.
class RGWObjTags
{
public:
  using tag_map_t = std::multimap <std::string, std::string>;

protected:
  tag_map_t tag_map;

  // maximum number of tags accepted by check_and_add_tag(); default 10
  // here, overridable via the constructor
  uint32_t max_obj_tags{10};
  static constexpr uint32_t max_tag_key_size{128};
  static constexpr uint32_t max_tag_val_size{256};

 public:
  RGWObjTags() = default;
  RGWObjTags(uint32_t max_obj_tags):max_obj_tags(max_obj_tags) {}

  void encode(bufferlist& bl) const {
    ENCODE_START(1,1,bl);
    encode(tag_map, bl);
    ENCODE_FINISH(bl);
  }

  // DECODE_START_LEGACY_COMPAT_LEN keeps compatibility with pre-versioned
  // encodings of this structure
  void decode(bufferlist::const_iterator &bl) {
    DECODE_START_LEGACY_COMPAT_LEN(1, 1, 1, bl);
    decode(tag_map,bl);
    DECODE_FINISH(bl);
  }

  void dump(Formatter *f) const;
  // unchecked insert (no size/limit validation)
  void add_tag(const std::string& key, const std::string& val="");
  // unchecked move-insert
  void emplace_tag(std::string&& key, std::string&& val);
  // validating insert; returns -ERR_INVALID_TAG on violation
  int check_and_add_tag(const std::string& key, const std::string& val="");
  size_t count() const {return tag_map.size();}
  // parse "k=v&k2=v2" (URL-encoded) into the set
  int set_from_string(const std::string& input);
  void clear() { tag_map.clear(); }
  bool empty() const noexcept { return tag_map.empty(); }
  const tag_map_t& get_tags() const {return tag_map;}
  tag_map_t& get_tags() {return tag_map;}
};
WRITE_CLASS_ENCODER(RGWObjTags)
diff --git a/src/rgw/rgw_tag_s3.cc b/src/rgw/rgw_tag_s3.cc
new file mode 100644
index 000000000..89436c326
--- /dev/null
+++ b/src/rgw/rgw_tag_s3.cc
@@ -0,0 +1,66 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <map>
+#include <string>
+#include <iostream>
+
+#include "include/types.h"
+
+#include "rgw_tag_s3.h"
+
+using namespace std;
+
+void RGWObjTagEntry_S3::decode_xml(XMLObj *obj) {
+ RGWXMLDecoder::decode_xml("Key", key, obj, true);
+ RGWXMLDecoder::decode_xml("Value", val, obj, true);
+}
+
+void RGWObjTagEntry_S3::dump_xml(Formatter *f) const {
+ encode_xml("Key", key, f);
+ encode_xml("Value", val, f);
+
+ if (key.empty()) {
+ throw RGWXMLDecoder::err("empty key");
+ }
+
+ if (val.empty()) {
+ throw RGWXMLDecoder::err("empty val");
+ }
+}
+
+void RGWObjTagSet_S3::decode_xml(XMLObj *obj) {
+ vector<RGWObjTagEntry_S3> entries;
+
+ bool mandatory{false};
+ RGWXMLDecoder::decode_xml("Tag", entries, obj, mandatory);
+
+ for (auto& entry : entries) {
+ const std::string& key = entry.get_key();
+ const std::string& val = entry.get_val();
+ add_tag(key,val);
+ }
+}
+
+int RGWObjTagSet_S3::rebuild(RGWObjTags& dest) {
+ int ret;
+ for (const auto &it : tag_map){
+ ret = dest.check_and_add_tag(it.first, it.second);
+ if (ret < 0)
+ return ret;
+ }
+ return 0;
+}
+
// Parse the mandatory <TagSet> child of a <Tagging> document.
void RGWObjTagging_S3::decode_xml(XMLObj *obj) {
  RGWXMLDecoder::decode_xml("TagSet", tagset, obj, true);
}
+
+void RGWObjTagSet_S3::dump_xml(Formatter *f) const {
+ for (const auto& tag : tag_map){
+ Formatter::ObjectSection os(*f, "Tag");
+ encode_xml("Key", tag.first, f);
+ encode_xml("Value", tag.second, f);
+ }
+}
+
diff --git a/src/rgw/rgw_tag_s3.h b/src/rgw/rgw_tag_s3.h
new file mode 100644
index 000000000..7cc892f1f
--- /dev/null
+++ b/src/rgw/rgw_tag_s3.h
@@ -0,0 +1,49 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <iostream>
+#include <include/types.h>
+#include <common/Formatter.h>
+#include <expat.h>
+
+#include "rgw_tag.h"
+#include "rgw_xml.h"
+
// One S3 tag entry (<Tag> element): a key/value pair with XML
// (de)serialization.
class RGWObjTagEntry_S3
{
  std::string key;
  std::string val;
public:
  RGWObjTagEntry_S3() {}
  RGWObjTagEntry_S3(const std::string &k, const std::string &v):key(k),val(v) {};
  ~RGWObjTagEntry_S3() {}

  const std::string& get_key () const { return key; }
  const std::string& get_val () const { return val; }

  void dump_xml(Formatter *f) const;
  void decode_xml(XMLObj *obj);
};
+
// The S3 <TagSet>: inherits the tag storage/limits from RGWObjTags and
// adds XML (de)serialization plus copy-with-validation (rebuild).
class RGWObjTagSet_S3: public RGWObjTags
{
public:
  int rebuild(RGWObjTags& dest);

  void dump_xml(Formatter *f) const;
  void decode_xml(XMLObj *obj);
};
+
// The S3 <Tagging> document root: wraps a single mandatory <TagSet>.
class RGWObjTagging_S3
{
  RGWObjTagSet_S3 tagset;
public:
  void decode_xml(XMLObj *obj);
  // validate-copy the parsed tags into 'dest'
  int rebuild(RGWObjTags& dest) {
    return tagset.rebuild(dest);
  }
};
diff --git a/src/rgw/rgw_tar.h b/src/rgw/rgw_tar.h
new file mode 100644
index 000000000..b06943a3c
--- /dev/null
+++ b/src/rgw/rgw_tar.h
@@ -0,0 +1,153 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <cstring>
+#include <string_view>
+#include <tuple>
+#include <utility>
+
+#include <boost/optional.hpp>
+#include <boost/range/adaptor/reversed.hpp>
+
+namespace rgw {
+namespace tar {
+
+static constexpr size_t BLOCK_SIZE = 512;
+
+
+static inline std::pair<class StatusIndicator,
+ boost::optional<class HeaderView>>
+interpret_block(const StatusIndicator& status, ceph::bufferlist& bl);
+
+
/* Tracks zero-block state across consecutive interpret_block() calls.
 * A tar archive ends with two consecutive zero-filled blocks: is_eof
 * becomes true when the current block is empty AND the previous status
 * was empty too. Construction is restricted to create() and to
 * interpret_block() (a friend) so the state machine can't be bypassed. */
class StatusIndicator {
  friend std::pair<class StatusIndicator,
                   boost::optional<class HeaderView>>
  interpret_block(const StatusIndicator& status, ceph::bufferlist& bl);

  bool is_empty;  /* current block was all zeroes */
  bool is_eof;    /* this and the previous block were both all zeroes */

  /* initial state: nothing seen yet */
  StatusIndicator()
    : is_empty(false),
      is_eof(false) {
  }

  /* advance from prev_status given whether the new block is zero-filled */
  StatusIndicator(const StatusIndicator& prev_status,
                  const bool is_empty)
    : is_empty(is_empty),
      is_eof(is_empty && prev_status.empty()) {
  }

public:
  bool empty() const {
    return is_empty;
  }

  bool eof() const {
    return is_eof;
  }

  static StatusIndicator create() {
    return StatusIndicator();
  }
} /* class StatusIndicator */;
+
+
/* Subset of tar typeflag values this reader distinguishes; every other
 * flag (links, devices, extended headers, ...) maps to UNKNOWN. */
enum class FileType : char {
  UNKNOWN = '\0',

  /* The tar format uses ASCII encoding. */
  NORMAL_FILE = '0',
  DIRECTORY = '5'
}; /* enum class FileType */
+
+class HeaderView {
+protected:
+ /* Everything is char here (ASCII encoding), so we don't need to worry about
+ * the struct padding. */
+ const struct header_t {
+ char filename[100];
+ char __filemode[8];
+ char __owner_id[8];
+ char __group_id[8];
+ char filesize[12];
+ char lastmod[12];
+ char checksum[8];
+ char filetype;
+ char __padding[355];
+ } *header;
+
+ static_assert(sizeof(*header) == BLOCK_SIZE,
+ "The TAR header must be exactly BLOCK_SIZE length");
+
+ /* The label is far more important from what the code really does. */
+ static size_t pos2len(const size_t pos) {
+ return pos + 1;
+ }
+
+public:
+ explicit HeaderView(const char (&header)[BLOCK_SIZE])
+ : header(reinterpret_cast<const header_t*>(header)) {
+ }
+
+ FileType get_filetype() const {
+ switch (header->filetype) {
+ case static_cast<char>(FileType::NORMAL_FILE):
+ return FileType::NORMAL_FILE;
+ case static_cast<char>(FileType::DIRECTORY):
+ return FileType::DIRECTORY;
+ default:
+ return FileType::UNKNOWN;
+ }
+ }
+
+ std::string_view get_filename() const {
+ return std::string_view(header->filename,
+ std::min(sizeof(header->filename),
+ strlen(header->filename)));
+ }
+
+ size_t get_filesize() const {
+ /* The string_ref is pretty suitable here because tar encodes its
+ * metadata in ASCII. */
+ const std::string_view raw(header->filesize, sizeof(header->filesize));
+
+ /* We need to find where the padding ends. */
+ const auto pad_ends_at = std::min(raw.find_last_not_of('\0'),
+ raw.find_last_not_of(' '));
+ const auto trimmed = raw.substr(0,
+ pad_ends_at == std::string_view::npos ? std::string_view::npos
+ : pos2len(pad_ends_at));
+
+ size_t sum = 0, mul = 1;
+ for (const char c : boost::adaptors::reverse(trimmed)) {
+ sum += (c - '0') * mul;
+ mul *= 8;
+ }
+
+ return sum;
+ }
+}; /* class Header */
+
+
/* Classify one 512-byte block: a zero-filled block advances the EOF state
 * machine and yields no header; anything else yields a HeaderView aliasing
 * bl's memory.
 * NOTE(review): bl.c_str() flattens the bufferlist; the caller must supply
 * at least BLOCK_SIZE bytes or the memcmp reads out of bounds -- confirm
 * callers always pass full blocks. */
static inline std::pair<StatusIndicator,
                        boost::optional<HeaderView>>
interpret_block(const StatusIndicator& status, ceph::bufferlist& bl) {
  static constexpr std::array<char, BLOCK_SIZE> zero_block = {0, };
  const char (&block)[BLOCK_SIZE] = \
    reinterpret_cast<const char (&)[BLOCK_SIZE]>(*bl.c_str());

  if (std::memcmp(zero_block.data(), block, BLOCK_SIZE) == 0) {
    return std::make_pair(StatusIndicator(status, true), boost::none);
  } else {
    return std::make_pair(StatusIndicator(status, false), HeaderView(block));
  }
}
+
+} /* namespace tar */
+} /* namespace rgw */
diff --git a/src/rgw/rgw_token.cc b/src/rgw/rgw_token.cc
new file mode 100644
index 000000000..999d46e0e
--- /dev/null
+++ b/src/rgw/rgw_token.cc
@@ -0,0 +1,144 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <errno.h>
+#include <iostream>
+#include <sstream>
+#include <string>
+
+#include "common/config.h"
+#include "common/ceph_argparse.h"
+#include "common/debug.h"
+#include "global/global_init.h"
+#include "include/ceph_assert.h"
+#include "include/str_list.h"
+
+#include "rgw_token.h"
+#include "rgw_b64.h"
+
+#define dout_subsys ceph_subsys_rgw
+
namespace {

  using namespace rgw;
  using std::get;
  using std::string;

  // command-line state for the radosgw-token utility; filled from the
  // environment and argument parsing in main()
  RGWToken::token_type type{RGWToken::TOKEN_NONE};
  string access_key{""};
  string secret_key{""};

  // output formatter, allocated in main() once arguments are validated
  Formatter* token_formatter{nullptr};

  bool verbose {false};
  bool do_encode {false};
  bool do_decode {false};

}
+
+using namespace std;
+
+void usage()
+{
+ cout << "usage: radosgw-token --encode --ttype=<token type> [options...]" << std::endl;
+ cout << "\t(maybe exporting RGW_ACCESS_KEY_ID and RGW_SECRET_ACCESS_KEY)"
+ << std::endl;
+ cout << "\t <token type> := ad | ldap" << std::endl;
+ cout << "\n";
+ generic_client_usage();
+}
+
+int main(int argc, char **argv)
+{
+ auto args = argv_to_vec(argc, argv);
+ std::string val;
+ if (args.empty()) {
+ cerr << argv[0] << ": -h or --help for usage" << std::endl;
+ exit(1);
+ }
+ if (ceph_argparse_need_usage(args)) {
+ usage();
+ exit(0);
+ }
+
+ auto cct = global_init(nullptr, args, CEPH_ENTITY_TYPE_CLIENT,
+ CODE_ENVIRONMENT_UTILITY, 0);
+ common_init_finish(g_ceph_context);
+
+ char *v{nullptr};
+ v = getenv("RGW_ACCESS_KEY_ID");
+ if (v) {
+ access_key = v;
+ }
+
+ v = getenv("RGW_SECRET_ACCESS_KEY");
+ if (v) {
+ secret_key = v;
+ }
+
+ for (auto arg_iter = args.begin(); arg_iter != args.end();) {
+ if (ceph_argparse_witharg(args, arg_iter, &val, "--access",
+ (char*) nullptr)) {
+ access_key = val;
+ } else if (ceph_argparse_witharg(args, arg_iter, &val, "--secret",
+ (char*) nullptr)) {
+ secret_key = val;
+ } else if (ceph_argparse_witharg(args, arg_iter, &val, "--ttype",
+ (char*) nullptr)) {
+ for (const auto& ttype : {"ad", "ldap"}) {
+ if (boost::iequals(val, ttype)) {
+ type = RGWToken::to_type(val);
+ break;
+ }
+ }
+ } else if (ceph_argparse_flag(args, arg_iter, "--encode",
+ (char*) nullptr)) {
+ do_encode = true;
+ } else if (ceph_argparse_flag(args, arg_iter, "--decode",
+ (char*) nullptr)) {
+ do_decode = true;
+ } else if (ceph_argparse_flag(args, arg_iter, "--verbose",
+ (char*) nullptr)) {
+ verbose = true;
+ } else {
+ ++arg_iter;
+ }
+ }
+
+ if ((! do_encode) ||
+ (type == RGWToken::TOKEN_NONE)) {
+ return -EINVAL;
+ }
+
+ token_formatter = new JSONFormatter(true /* pretty */);
+
+ RGWToken token(type, access_key, secret_key);
+ if (do_encode) {
+ token.encode_json(token_formatter);
+ std::ostringstream os;
+ token_formatter->flush(os);
+ string token_str = os.str();
+ if (verbose) {
+ std::cout << "expanded token: " << token_str << std::endl;
+ if (do_decode) {
+ RGWToken token2(token_str);
+ std::cout << "decoded expanded token: " << token2 << std::endl;
+ }
+ }
+ std::cout << to_base64(token_str) << std::endl;
+ }
+
+ return 0;
+}
diff --git a/src/rgw/rgw_token.h b/src/rgw/rgw_token.h
new file mode 100644
index 000000000..b2476596b
--- /dev/null
+++ b/src/rgw/rgw_token.h
@@ -0,0 +1,170 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Red Hat, Inc
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <stdint.h>
+#include <boost/algorithm/string.hpp>
+#include <sstream>
+
+#include "common/ceph_json.h"
+#include "common/Formatter.h"
+#include "rgw/rgw_b64.h"
+
+namespace rgw {
+
+ using std::string;
+
  // Credential token for external auth backends (AD / LDAP / Keystone).
  // Serializable both through ceph's binary encoder and as JSON (optionally
  // base64-wrapped, as consumed by the radosgw-token utility).
  class RGWToken {
  public:
    static constexpr auto type_name = "RGW_TOKEN";

    enum token_type : uint32_t {
      TOKEN_NONE,
      TOKEN_AD,
      TOKEN_KEYSTONE,
      TOKEN_LDAP,
    };

    // case-insensitive name -> enum; anything unrecognized is TOKEN_NONE
    static enum token_type to_type(const string& s) {
      if (boost::iequals(s, "ad"))
	return TOKEN_AD;
      if (boost::iequals(s, "ldap"))
	return TOKEN_LDAP;
      if (boost::iequals(s, "keystone"))
	return TOKEN_KEYSTONE;
      return TOKEN_NONE;
    }

    // enum -> lowercase name ("none" for TOKEN_NONE or unknown values)
    static const char* from_type(enum token_type type) {
      switch (type) {
      case TOKEN_AD:
	return "ad";
      case TOKEN_LDAP:
	return "ldap";
      case TOKEN_KEYSTONE:
	return "keystone";
      default:
	return "none";
      };
    }

    token_type type;
    string id;   // access key / principal id
    string key;  // secret key

    virtual uint32_t version() const { return 1; };

    // a token is usable only when it has a concrete type and both fields
    bool valid() const{
      return ((type != TOKEN_NONE) &&
	      (! id.empty()) &&
	      (! key.empty()));
    }

    RGWToken()
      : type(TOKEN_NONE) {};

    RGWToken(enum token_type _type, const std::string& _id,
	     const std::string& _key)
      : type(_type), id(_id), key(_key) {};

    // construct from the JSON produced by encode_json()
    explicit RGWToken(const string& json) {
      JSONParser p;
      p.parse(json.c_str(), json.length());
      JSONDecoder::decode_json(RGWToken::type_name, *this, &p);
    }

    RGWToken& operator=(const std::string& json) {
      JSONParser p;
      p.parse(json.c_str(), json.length());
      JSONDecoder::decode_json(RGWToken::type_name, *this, &p);
      return *this;
    }

    void encode(bufferlist& bl) const {
      uint32_t ver = version();
      string typestr{from_type(type)};
      ENCODE_START(1, 1, bl);
      // the type travels as its string name, preceded by the class tag
      // and payload version
      encode(type_name, bl);
      encode(ver, bl);
      encode(typestr, bl);
      encode(id, bl);
      encode(key, bl);
      ENCODE_FINISH(bl);
    }

    void decode(bufferlist::const_iterator& bl) {
      string name;
      string typestr;
      // NOTE(review): this local shadows the virtual version() and the
      // decoded value is never checked -- confirm that is intentional
      uint32_t version;
      DECODE_START(1, bl);
      decode(name, bl);
      decode(version, bl);
      decode(typestr, bl);
      type = to_type(typestr);
      decode(id, bl);
      decode(key, bl);
      DECODE_FINISH(bl);
    }

    void dump(Formatter* f) const {
      ::encode_json("version", uint32_t(version()), f);
      ::encode_json("type", from_type(type), f);
      ::encode_json("id", id, f);
      ::encode_json("key", key, f);
    }

    // wrap dump() output in a type_name-keyed object section
    void encode_json(Formatter* f) {
      RGWToken& token = *this;
      f->open_object_section(type_name);
      ::encode_json(type_name, token, f);
      f->close_section();
    }

    void decode_json(JSONObj* obj) {
      // decoded "version" is read but not validated
      uint32_t version;
      string type_name;
      string typestr;
      JSONDecoder::decode_json("version", version, obj);
      JSONDecoder::decode_json("type", typestr, obj);
      type = to_type(typestr);
      JSONDecoder::decode_json("id", id, obj);
      JSONDecoder::decode_json("key", key, obj);
    }

    // encode_json() into f, then base64 the flushed result
    std::string encode_json_base64(Formatter* f) {
      encode_json(f);
      std::ostringstream os;
      f->flush(os);
      // NOTE(review): std::move on os.str() is redundant -- it is already
      // an rvalue
      return to_base64(std::move(os.str()));
    }

    friend inline std::ostream& operator<<(std::ostream& os, const RGWToken& token);

    virtual ~RGWToken() {};
  };
  WRITE_CLASS_ENCODER(RGWToken)
+ WRITE_CLASS_ENCODER(RGWToken)
+
+ inline std::ostream& operator<<(std::ostream& os, const RGWToken& token)
+ {
+ os << "<<RGWToken"
+ << " type=" << RGWToken::from_type(token.type)
+ << " id=" << token.id
+ << " key=" << token.key
+ << ">>";
+ return os;
+ }
+
+} /* namespace rgw */
diff --git a/src/rgw/rgw_tools.cc b/src/rgw/rgw_tools.cc
new file mode 100644
index 000000000..7e6513cde
--- /dev/null
+++ b/src/rgw/rgw_tools.cc
@@ -0,0 +1,124 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <errno.h>
+
+#include "common/errno.h"
+
+#include "rgw_tools.h"
+
+#define dout_subsys ceph_subsys_rgw
+#define dout_context g_ceph_context
+
+#define READ_CHUNK_LEN (512 * 1024)
+
+using namespace std;
+
+static std::map<std::string, std::string>* ext_mime_map;
+
+void parse_mime_map_line(const char *start, const char *end)
+{
+ char line[end - start + 1];
+ strncpy(line, start, end - start);
+ line[end - start] = '\0';
+ char *l = line;
+#define DELIMS " \t\n\r"
+
+ while (isspace(*l))
+ l++;
+
+ char *mime = strsep(&l, DELIMS);
+ if (!mime)
+ return;
+
+ char *ext;
+ do {
+ ext = strsep(&l, DELIMS);
+ if (ext && *ext) {
+ (*ext_mime_map)[ext] = mime;
+ }
+ } while (ext);
+}
+
+
+void parse_mime_map(const char *buf)
+{
+ const char *start = buf, *end = buf;
+ while (*end) {
+ while (*end && *end != '\n') {
+ end++;
+ }
+ parse_mime_map_line(start, end);
+ end++;
+ start = end;
+ }
+}
+
// Load the mime.types file at 'ext_map' into the global ext_mime_map.
// Returns 0 on success or a negative errno on open/stat/alloc failure.
static int ext_mime_map_init(const DoutPrefixProvider *dpp, CephContext *cct, const char *ext_map)
{
  int fd = open(ext_map, O_RDONLY);
  char *buf = NULL;
  int ret;
  if (fd < 0) {
    ret = -errno;
    ldpp_dout(dpp, 0) << __func__ << " failed to open file=" << ext_map
                      << " : " << cpp_strerror(-ret) << dendl;
    return ret;
  }

  struct stat st;
  ret = fstat(fd, &st);
  if (ret < 0) {
    ret = -errno;
    ldpp_dout(dpp, 0) << __func__ << " failed to stat file=" << ext_map
                      << " : " << cpp_strerror(-ret) << dendl;
    goto done;
  }

  // +1 for the NUL terminator parse_mime_map() relies on
  buf = (char *)malloc(st.st_size + 1);
  if (!buf) {
    ret = -ENOMEM;
    ldpp_dout(dpp, 0) << __func__ << " failed to allocate buf" << dendl;
    goto done;
  }

  // reading st.st_size+1 detects a file that grew between fstat and read
  ret = safe_read(fd, buf, st.st_size + 1);
  if (ret != st.st_size) {
    // huh? file size has changed?
    // NOTE(review): retry is by unbounded recursion -- a file that keeps
    // changing would recurse indefinitely; confirm this is acceptable.
    ldpp_dout(dpp, 0) << __func__ << " raced! will retry.." << dendl;
    free(buf);
    close(fd);
    return ext_mime_map_init(dpp, cct, ext_map);
  }
  buf[st.st_size] = '\0';

  parse_mime_map(buf);
  ret = 0;
done:
  // free(NULL) is a no-op, so the early 'goto done' paths are safe
  free(buf);
  close(fd);
  return ret;
}
+
+const char *rgw_find_mime_by_ext(string& ext)
+{
+ map<string, string>::iterator iter = ext_mime_map->find(ext);
+ if (iter == ext_mime_map->end())
+ return NULL;
+
+ return iter->second.c_str();
+}
+
// Allocate the global extension->mime map and populate it from the
// configured mime.types file. Always returns 0.
int rgw_tools_init(const DoutPrefixProvider *dpp, CephContext *cct)
{
  ext_mime_map = new std::map<std::string, std::string>;
  ext_mime_map_init(dpp, cct, cct->_conf->rgw_mime_types_file.c_str());
  // ignore errors; missing mime.types is not fatal
  return 0;
}
+
// Release the global mime map allocated by rgw_tools_init(); safe to call
// even if init never ran (delete on nullptr is a no-op).
void rgw_tools_cleanup()
{
  delete ext_mime_map;
  ext_mime_map = nullptr;
}
diff --git a/src/rgw/rgw_torrent.cc b/src/rgw/rgw_torrent.cc
new file mode 100644
index 000000000..e1a1417a5
--- /dev/null
+++ b/src/rgw/rgw_torrent.cc
@@ -0,0 +1,261 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <errno.h>
+#include <stdlib.h>
+
+#include <sstream>
+
+#include "rgw_torrent.h"
+#include "rgw_sal.h"
+#include "rgw_sal_rados.h"
+#include "include/str_list.h"
+#include "include/rados/librados.hpp"
+
+#include "services/svc_sys_obj.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+using namespace librados;
+using namespace boost;
+using ceph::crypto::SHA1;
+
+seed::seed()
+{
+ seed::info.piece_length = 0;
+ seed::info.len = 0;
+ sha_len = 0;
+ is_torrent = false;
+}
+
seed::~seed()
{
  // NOTE(review): clearing members in the destructor is redundant (they
  // are destroyed anyway); kept for fidelity with the original code.
  seed::info.sha1_bl.clear();
  bl.clear();
  s = NULL;
  driver = NULL;
}
+
// Bind the seeder to the current request and storage driver (non-owning).
void seed::init(req_state *_req, rgw::sal::Driver* _driver)
{
  s = _req;
  driver = _driver;
}
+
// Assemble the complete .torrent payload for 'object': the optional
// config-driven fields are bencoded first, then the pre-computed torrent
// body is fetched from the object's OMAP (key RGW_OBJ_TORRENT) and
// appended. On success bl_data holds the file and total_len its length.
int seed::get_torrent_file(rgw::sal::Object* object,
                           uint64_t &total_len,
                           ceph::bufferlist &bl_data,
                           rgw_obj &obj)
{
  /* add other field if config is set */
  dencode.bencode_dict(bl);
  set_announce();
  if (!comment.empty())
  {
    dencode.bencode(COMMENT, comment, bl);
  }
  if (!create_by.empty())
  {
    dencode.bencode(CREATED_BY, create_by, bl);
  }
  if (!encoding.empty())
  {
    dencode.bencode(ENCODING, encoding, bl);
  }

  string oid, key;
  get_obj_bucket_and_oid_loc(obj, oid, key);
  ldpp_dout(s, 20) << "NOTICE: head obj oid= " << oid << dendl;

  // the torrent body was stored under a single OMAP key at upload time
  const set<string> obj_key{RGW_OBJ_TORRENT};
  map<string, bufferlist> m;
  const int r = object->omap_get_vals_by_keys(s, oid, obj_key, &m);
  if (r < 0) {
    ldpp_dout(s, 0) << "ERROR: omap_get_vals_by_keys failed: " << r << dendl;
    return r;
  }
  if (m.size() != 1) {
    ldpp_dout(s, 0) << "ERROR: omap key " RGW_OBJ_TORRENT " not found" << dendl;
    return -EINVAL;
  }
  bl.append(std::move(m.begin()->second));
  // close the outer bencode dictionary opened above
  dencode.bencode_end(bl);

  bl_data = bl;
  total_len = bl.length();
  return 0;
}
+
// Whether torrent generation was requested (set by get_params()).
bool seed::get_flag()
{
  return is_torrent;
}
+
+void seed::update(bufferlist &bl)
+{
+ if (!is_torrent)
+ {
+ return;
+ }
+ info.len += bl.length();
+ sha1(&h, bl, bl.length());
+}
+
// Finalize the torrent after the upload: compute the total digest length
// (one SHA1 per full piece plus one for any partial tail), bencode the
// torrent body, and persist it to the object's OMAP.
int seed::complete(optional_yield y)
{
  uint64_t remain = info.len%info.piece_length;
  // one extra digest covers the final partial piece, if any
  uint8_t remain_len = ((remain > 0)? 1 : 0);
  sha_len = (info.len/info.piece_length + remain_len)*CEPH_CRYPTO_SHA1_DIGESTSIZE;

  int ret = 0;
  /* produce torrent data */
  do_encode();

  /* save torrent data into OMAP */
  ret = save_torrent_file(y);
  if (0 != ret)
  {
    ldpp_dout(s, 0) << "ERROR: failed to save_torrent_file() ret= "<< ret << dendl;
    return ret;
  }

  return 0;
}
+
// Total number of object bytes hashed so far.
off_t seed::get_data_len()
{
  return info.len;
}
+
// Record the torrent creation time as whole seconds since the epoch.
void seed::set_create_date(ceph::real_time& value)
{
  utime_t date = ceph::real_clock::to_timespec(value);
  create_date = date.sec();
}
+
// Append one 20-byte SHA1 piece digest to the accumulated "pieces" blob.
void seed::set_info_pieces(char *buff)
{
  info.sha1_bl.append(buff, CEPH_CRYPTO_SHA1_DIGESTSIZE);
}
+
// Set the file name recorded in the torrent's "info" dictionary.
void seed::set_info_name(const string& value)
{
  info.name = value;
}
+
// Hash a buffer piece by piece: each full piece (info.piece_length bytes)
// and any trailing remainder produces one SHA1 digest appended via
// set_info_pieces().
// NOTE(review): this relies on h->Final() leaving the hasher ready for
// further Update() calls (restart semantics) -- confirm for ceph's SHA1.
void seed::sha1(SHA1 *h, bufferlist &bl, off_t bl_len)
{
  off_t num = bl_len/info.piece_length;
  off_t remain = 0;
  remain = bl_len%info.piece_length;

  char *pstr = bl.c_str();
  // 25 bytes comfortably holds the 20-byte SHA1 digest
  char sha[25];

  /* get sha1 */
  for (off_t i = 0; i < num; i++)
  {
    // FIPS zeroization audit 20191116: this memset is not intended to
    // wipe out a secret after use.
    memset(sha, 0x00, sizeof(sha));
    h->Update((unsigned char *)pstr, info.piece_length);
    h->Final((unsigned char *)sha);
    set_info_pieces(sha);
    pstr += info.piece_length;
  }

  /* process remain */
  if (0 != remain)
  {
    // FIPS zeroization audit 20191116: this memset is not intended to
    // wipe out a secret after use.
    memset(sha, 0x00, sizeof(sha));
    h->Update((unsigned char *)pstr, remain);
    h->Final((unsigned char *)sha);
    set_info_pieces(sha);
  }
  ::ceph::crypto::zeroize_for_security(sha, sizeof(sha));
}
+
// Enable torrent generation and pull all torrent settings from the ceph
// configuration. Always returns 0.
int seed::get_params()
{
  is_torrent = true;
  info.piece_length = g_conf()->rgw_torrent_sha_unit;
  create_by = g_conf()->rgw_torrent_createby;
  encoding = g_conf()->rgw_torrent_encoding;
  origin = g_conf()->rgw_torrent_origin;
  comment = g_conf()->rgw_torrent_comment;
  announce = g_conf()->rgw_torrent_tracker;

  /* tracker and tracker list is empty, set announce to origin */
  if (announce.empty() && !origin.empty())
  {
    announce = origin;
  }

  return 0;
}
+
// Bencode the tracker fields: "announce" carries the first tracker from
// the comma-separated config value; "announce-list" carries every tracker
// (including the first) as single-element lists, per BEP 12.
void seed::set_announce()
{
  list<string> announce_list; // used to get announce list from conf
  get_str_list(announce, ",", announce_list);

  if (announce_list.empty())
  {
    ldpp_dout(s, 5) << "NOTICE: announce_list is empty " << dendl;
    return;
  }

  list<string>::iterator iter = announce_list.begin();
  // "announce" -> first tracker (bencode_key emits a bencoded string)
  dencode.bencode_key(ANNOUNCE, bl);
  dencode.bencode_key((*iter), bl);

  // "announce-list" -> list of one-element tracker lists
  dencode.bencode_key(ANNOUNCE_LIST, bl);
  dencode.bencode_list(bl);
  for (; iter != announce_list.end(); ++iter)
  {
    dencode.bencode_list(bl);
    dencode.bencode_key((*iter), bl);
    dencode.bencode_end(bl);
  }
  dencode.bencode_end(bl);
}
+
// Bencode the torrent body: creation date plus the "info" dictionary
// (length, name, piece length and the raw concatenated SHA1 digests).
void seed::do_encode()
{
  /*Only encode create_date and sha1 info*/
  /*Other field will be added if confi is set when run get torrent*/
  dencode.bencode(CREATION_DATE, create_date, bl);

  dencode.bencode_key(INFO_PIECES, bl);
  dencode.bencode_dict(bl);
  dencode.bencode(LENGTH, info.len, bl);
  dencode.bencode(NAME, info.name, bl);
  dencode.bencode(PIECE_LENGTH, info.piece_length, bl);

  // "pieces" is a binary string: emit "<byte-count>:" by hand, then the
  // raw digest bytes (bencode_key can't be used -- the value isn't text)
  char info_sha[100] = { 0 };
  sprintf(info_sha, "%" PRIu64, sha_len);
  string sha_len_str = info_sha;
  dencode.bencode_key(PIECES, bl);
  bl.append(sha_len_str.c_str(), sha_len_str.length());
  bl.append(':');
  bl.append(info.sha1_bl.c_str(), sha_len);
  dencode.bencode_end(bl);
}
+
// Persist the bencoded torrent body into the object's OMAP under the
// RGW_OBJ_TORRENT key. Returns the omap_set result (0 on success).
int seed::save_torrent_file(optional_yield y)
{
  int op_ret = 0;
  string key = RGW_OBJ_TORRENT;

  op_ret = s->object->omap_set_val_by_key(s, key, bl, false, y);
  if (op_ret < 0)
  {
    ldpp_dout(s, 0) << "ERROR: failed to omap_set() op_ret = " << op_ret << dendl;
    return op_ret;
  }

  return op_ret;
}
diff --git a/src/rgw/rgw_torrent.h b/src/rgw/rgw_torrent.h
new file mode 100644
index 000000000..bf2e2217c
--- /dev/null
+++ b/src/rgw/rgw_torrent.h
@@ -0,0 +1,139 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <string>
+#include <list>
+#include <map>
+#include <set>
+
+#include "common/ceph_time.h"
+
+#include "rgw_common.h"
+
+using ceph::crypto::SHA1;
+
+struct req_state;
+
+#define RGW_OBJ_TORRENT "rgw.torrent"
+
+#define ANNOUNCE "announce"
+#define ANNOUNCE_LIST "announce-list"
+#define COMMENT "comment"
+#define CREATED_BY "created by"
+#define CREATION_DATE "creation date"
+#define ENCODING "encoding"
+#define LENGTH "length"
+#define NAME "name"
+#define PIECE_LENGTH "piece length"
+#define PIECES "pieces"
+#define INFO_PIECES "info"
+#define GET_TORRENT "torrent"
+
+class TorrentBencode
+{
+public:
+ TorrentBencode() {}
+ ~TorrentBencode() {}
+
+ //control characters
+ void bencode_dict(bufferlist& bl) { bl.append('d'); }
+ void bencode_list(bufferlist& bl) { bl.append('l'); }
+ void bencode_end(bufferlist& bl) { bl.append('e'); }
+
+ //single values
+ void bencode(int value, bufferlist& bl)
+ {
+ bl.append('i');
+ char info[100] = { 0 };
+ sprintf(info, "%d", value);
+ bl.append(info, strlen(info));
+ bencode_end(bl);
+ }
+
+ //single values
+ void bencode(const std::string& str, bufferlist& bl)
+ {
+ bencode_key(str, bl);
+ }
+
+ //dictionary elements
+ void bencode(const std::string& key, int value, bufferlist& bl)
+ {
+ bencode_key(key, bl);
+ bencode(value, bl);
+ }
+
+ //dictionary elements
+ void bencode(const std::string& key, const std::string& value, bufferlist& bl)
+ {
+ bencode_key(key, bl);
+ bencode(value, bl);
+ }
+
+ //key len
+ void bencode_key(const std::string& key, bufferlist& bl)
+ {
+ int len = key.length();
+ char info[100] = { 0 };
+ sprintf(info, "%d:", len);
+ bl.append(info, strlen(info));
+ bl.append(key.c_str(), len);
+ }
+};
+
+/* torrent file struct */
+class seed
+{
+private:
+ struct
+ {
+ int piece_length; // each piece length
+ bufferlist sha1_bl; // save sha1
+ std::string name; // file name
+ off_t len; // file total bytes
+ }info;
+
+ std::string announce; // tracker
+ std::string origin; // origin
+ time_t create_date{0}; // time of the file created
+ std::string comment; // comment
+ std::string create_by; // app name and version
+ std::string encoding; // if encode use gbk rather than gtf-8 use this field
+ uint64_t sha_len; // sha1 length
+ bool is_torrent; // flag
+ bufferlist bl; // bufflist ready to send
+
+ req_state *s{nullptr};
+ rgw::sal::Driver* driver{nullptr};
+ SHA1 h;
+
+ TorrentBencode dencode;
+public:
+ seed();
+ ~seed();
+
+ int get_params();
+ void init(req_state *p_req, rgw::sal::Driver* _driver);
+ int get_torrent_file(rgw::sal::Object* object,
+ uint64_t &total_len,
+ ceph::bufferlist &bl_data,
+ rgw_obj &obj);
+
+ off_t get_data_len();
+ bool get_flag();
+
+ void set_create_date(ceph::real_time& value);
+ void set_info_name(const std::string& value);
+ void update(bufferlist &bl);
+ int complete(optional_yield y);
+
+private:
+ void do_encode ();
+ void set_announce();
+ void set_exist(bool exist);
+ void set_info_pieces(char *buff);
+ void sha1(SHA1 *h, bufferlist &bl, off_t bl_len);
+ int save_torrent_file(optional_yield y);
+};
diff --git a/src/rgw/rgw_tracer.cc b/src/rgw/rgw_tracer.cc
new file mode 100644
index 000000000..7e12bb2e6
--- /dev/null
+++ b/src/rgw/rgw_tracer.cc
@@ -0,0 +1,13 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <string>
+#include "rgw_tracer.h"
+
namespace tracing {
namespace rgw {

// single process-wide tracer instance for RGW (declared extern in
// rgw_tracer.h)
tracing::Tracer tracer;

} // namespace rgw
} // namespace tracing
diff --git a/src/rgw/rgw_tracer.h b/src/rgw/rgw_tracer.h
new file mode 100644
index 000000000..9cbae8b9c
--- /dev/null
+++ b/src/rgw/rgw_tracer.h
@@ -0,0 +1,34 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#pragma once
+#include "common/tracer.h"
+
+#include "rgw_common.h"
+
namespace tracing {
namespace rgw {

// span attribute/name fragments used when annotating RGW trace spans
const auto OP = "op";
const auto BUCKET_NAME = "bucket_name";
const auto USER_ID = "user_id";
const auto OBJECT_NAME = "object_name";
const auto RETURN = "return";
const auto UPLOAD_ID = "upload_id";
const auto TYPE = "type";
const auto REQUEST = "request";
// NOTE(review): trailing space appears intentional (used as a prefix when
// composing span names) -- confirm before changing
const auto MULTIPART = "multipart_upload ";

extern tracing::Tracer tracer;

} // namespace rgw
} // namespace tracing
+
// Restore a propagated trace span context from an object's RGW_ATTR_TRACE
// attribute, if present. Decode failures are deliberately ignored
// (best-effort: a corrupt attribute must not fail the request).
static inline void extract_span_context(const rgw::sal::Attrs& attr, jspan_context& span_ctx) {
  auto trace_iter = attr.find(RGW_ATTR_TRACE);
  if (trace_iter != attr.end()) {
    try {
      auto trace_bl_iter = trace_iter->second.cbegin();
      tracing::decode(span_ctx, trace_bl_iter);
    } catch (buffer::error& err) {}
  }
}
diff --git a/src/rgw/rgw_url.cc b/src/rgw/rgw_url.cc
new file mode 100644
index 000000000..7fd4788d7
--- /dev/null
+++ b/src/rgw/rgw_url.cc
@@ -0,0 +1,49 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <string>
+#include <regex>
+
namespace rgw {

namespace {
  // capture-group indices into the combined URL regex below
  const auto USER_GROUP_IDX = 3;
  const auto PASSWORD_GROUP_IDX = 4;
  const auto HOST_GROUP_IDX = 5;

  // scheme:// — any alphabetic scheme (http, https, amqp, amqps, kafka, ...)
  const std::string schema_re = "([[:alpha:]]+:\\/\\/)";
  // optional "user:password@" userinfo section
  const std::string user_pass_re = "(([^:\\s]+):([^@\\s]+)@)?";
  // host[:port] (colons also admit port/IPv6-style forms)
  const std::string host_port_re = "([[:alnum:].:-]+)";
  // optional path component
  const std::string path_re = "(/[[:print:]]*)?";

  // Build the regex once: std::regex construction is expensive and both
  // parsers below use the identical pattern. The scheme is matched
  // case-insensitively (RFC 3986 §3.1); previously parse_url_userinfo()
  // omitted std::regex::icase, making it inconsistent with
  // parse_url_authority() — fixed by sharing a single regex.
  const std::regex& url_regex() {
    static const std::regex re(
        schema_re + user_pass_re + host_port_re + path_re,
        std::regex::icase);
    return re;
  }
}

// Parse scheme://[user:password@]host[:port][/path].
// On success fills host ("host" or "host:port") and the optional user
// and password (empty when the URL has no userinfo) and returns true;
// on failure the output parameters are left unmodified.
bool parse_url_authority(const std::string& url, std::string& host, std::string& user, std::string& password) {
  std::smatch url_match_result;

  if (std::regex_match(url, url_match_result, url_regex())) {
    host = url_match_result[HOST_GROUP_IDX];
    user = url_match_result[USER_GROUP_IDX];
    password = url_match_result[PASSWORD_GROUP_IDX];
    return true;
  }

  return false;
}

// Like parse_url_authority(), but extracts only the userinfo part.
bool parse_url_userinfo(const std::string& url, std::string& user, std::string& password) {
  std::smatch url_match_result;

  if (std::regex_match(url, url_match_result, url_regex())) {
    user = url_match_result[USER_GROUP_IDX];
    password = url_match_result[PASSWORD_GROUP_IDX];
    return true;
  }

  return false;
}
}
+
diff --git a/src/rgw/rgw_url.h b/src/rgw/rgw_url.h
new file mode 100644
index 000000000..089401a49
--- /dev/null
+++ b/src/rgw/rgw_url.h
@@ -0,0 +1,12 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <string>
namespace rgw {
// Parse URLs of the form: scheme://[user:password@]<host>[:port][/path]
// where the scheme is any alphabetic token (e.g. http, https, amqp,
// amqps, kafka). parse_url_authority() extracts host[:port] plus the
// optional credentials; parse_url_userinfo() extracts only the
// credentials. Both return false if the string does not match.
bool parse_url_authority(const std::string& url, std::string& host, std::string& user, std::string& password);
bool parse_url_userinfo(const std::string& url, std::string& user, std::string& password);
}
+
diff --git a/src/rgw/rgw_usage.cc b/src/rgw/rgw_usage.cc
new file mode 100644
index 000000000..ca7ca20eb
--- /dev/null
+++ b/src/rgw/rgw_usage.cc
@@ -0,0 +1,171 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <string>
+#include <map>
+
+#include "rgw_rados.h"
+#include "rgw_usage.h"
+#include "rgw_formats.h"
+#include "rgw_sal.h"
+
+using namespace std;
+
+static void dump_usage_categories_info(Formatter *formatter, const rgw_usage_log_entry& entry, map<string, bool> *categories)
+{
+ formatter->open_array_section("categories");
+ map<string, rgw_usage_data>::const_iterator uiter;
+ for (uiter = entry.usage_map.begin(); uiter != entry.usage_map.end(); ++uiter) {
+ if (categories && !categories->empty() && !categories->count(uiter->first))
+ continue;
+ const rgw_usage_data& usage = uiter->second;
+ formatter->open_object_section("entry");
+ formatter->dump_string("category", uiter->first);
+ formatter->dump_unsigned("bytes_sent", usage.bytes_sent);
+ formatter->dump_unsigned("bytes_received", usage.bytes_received);
+ formatter->dump_unsigned("ops", usage.ops);
+ formatter->dump_unsigned("successful_ops", usage.successful_ops);
+ formatter->close_section(); // entry
+ }
+ formatter->close_section(); // categories
+}
+
/**
 * Render the usage log into `flusher`'s formatter.
 *
 * Scope selection: a non-null `bucket` wins, then a non-null `user`,
 * otherwise all usage known to `driver` is read.
 *
 * @param start_epoch,end_epoch  time-range filter (epoch seconds)
 * @param show_log_entries  emit the per-bucket "entries" array
 * @param show_log_sum      emit the per-user "summary" array
 * @param categories   optional category-name filter; empty map = all.
 *                     NOTE(review): the summary path dereferences this
 *                     pointer unconditionally (entry.sum() below), so a
 *                     null pointer with show_log_sum==true would crash —
 *                     confirm all callers pass a valid map.
 * @return 0 on success, negative error code otherwise
 */
int RGWUsage::show(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver,
		   rgw::sal::User* user , rgw::sal::Bucket* bucket,
		   uint64_t start_epoch, uint64_t end_epoch, bool show_log_entries,
		   bool show_log_sum,
		   map<string, bool> *categories, RGWFormatterFlusher& flusher)
{
  uint32_t max_entries = 1000; // page size for each read_usage() call

  bool is_truncated = true;

  RGWUsageIter usage_iter;
  Formatter *formatter = flusher.get_formatter();

  map<rgw_user_bucket, rgw_usage_log_entry> usage;

  flusher.start(0);

  formatter->open_object_section("usage");
  if (show_log_entries) {
    formatter->open_array_section("entries");
  }
  // Entries arrive grouped by owner; we open one "user" section per run
  // of consecutive entries with the same owner.
  string last_owner;
  bool user_section_open = false;
  // Per-user aggregation across all pages, keyed by user id.
  map<string, rgw_usage_log_entry> summary_map;
  int ret;

  while (is_truncated) {
    // narrowest scope wins: bucket, then user, then everything
    if (bucket) {
      ret = bucket->read_usage(dpp, start_epoch, end_epoch, max_entries, &is_truncated,
			       usage_iter, usage);
    } else if (user) {
      ret = user->read_usage(dpp, start_epoch, end_epoch, max_entries, &is_truncated,
			     usage_iter, usage);
    } else {
      ret = driver->read_all_usage(dpp, start_epoch, end_epoch, max_entries, &is_truncated,
				   usage_iter, usage);
    }

    // "no usage recorded" is not an error: emit empty sections
    if (ret == -ENOENT) {
      ret = 0;
      is_truncated = false;
    }

    if (ret < 0) {
      return ret;
    }

    map<rgw_user_bucket, rgw_usage_log_entry>::iterator iter;
    for (iter = usage.begin(); iter != usage.end(); ++iter) {
      const rgw_user_bucket& ub = iter->first;
      const rgw_usage_log_entry& entry = iter->second;

      if (show_log_entries) {
	// owner changed: close the previous user's buckets/user sections
	if (ub.user.compare(last_owner) != 0) {
	  if (user_section_open) {
	    formatter->close_section();
	    formatter->close_section();
	  }
	  formatter->open_object_section("user");
	  formatter->dump_string("user", ub.user);
	  formatter->open_array_section("buckets");
	  user_section_open = true;
	  last_owner = ub.user;
	}
	formatter->open_object_section("bucket");
	formatter->dump_string("bucket", ub.bucket);
	utime_t ut(entry.epoch, 0);
	ut.gmtime(formatter->dump_stream("time"));
	formatter->dump_int("epoch", entry.epoch);
	string owner = entry.owner.to_str();
	string payer = entry.payer.to_str();
	formatter->dump_string("owner", owner);
	// only emit "payer" when requester-pays differs from the owner
	if (!payer.empty() && payer != owner) {
	  formatter->dump_string("payer", payer);
	}
	dump_usage_categories_info(formatter, entry, categories);
	formatter->close_section(); // bucket
	flusher.flush();
      }

      summary_map[ub.user].aggregate(entry, categories);
    }
  }
  if (show_log_entries) {
    if (user_section_open) {
      formatter->close_section(); // buckets
      formatter->close_section(); //user
    }
    formatter->close_section(); // entries
  }

  if (show_log_sum) {
    formatter->open_array_section("summary");
    map<string, rgw_usage_log_entry>::iterator siter;
    for (siter = summary_map.begin(); siter != summary_map.end(); ++siter) {
      const rgw_usage_log_entry& entry = siter->second;
      formatter->open_object_section("user");
      formatter->dump_string("user", siter->first);
      dump_usage_categories_info(formatter, entry, categories);
      rgw_usage_data total_usage;
      // NOTE(review): unconditional deref of `categories` — see the
      // function comment above.
      entry.sum(total_usage, *categories);
      formatter->open_object_section("total");
      encode_json("bytes_sent", total_usage.bytes_sent, formatter);
      encode_json("bytes_received", total_usage.bytes_received, formatter);
      encode_json("ops", total_usage.ops, formatter);
      encode_json("successful_ops", total_usage.successful_ops, formatter);
      formatter->close_section(); // total

      formatter->close_section(); // user

      flusher.flush();
    }

    formatter->close_section(); // summary
  }

  formatter->close_section(); // usage
  flusher.flush();

  return 0;
}
+
+int RGWUsage::trim(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver,
+ rgw::sal::User* user , rgw::sal::Bucket* bucket,
+ uint64_t start_epoch, uint64_t end_epoch)
+{
+ if (bucket) {
+ return bucket->trim_usage(dpp, start_epoch, end_epoch);
+ } else if (user) {
+ return user->trim_usage(dpp, start_epoch, end_epoch);
+ } else {
+ return driver->trim_all_usage(dpp, start_epoch, end_epoch);
+ }
+}
+
/**
 * Drop all usage-log data known to the driver (every user and bucket).
 * @return 0 on success, negative error code otherwise
 */
int RGWUsage::clear(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver)
{
  return driver->clear_usage(dpp);
}
diff --git a/src/rgw/rgw_usage.h b/src/rgw/rgw_usage.h
new file mode 100644
index 000000000..b12b57df0
--- /dev/null
+++ b/src/rgw/rgw_usage.h
@@ -0,0 +1,30 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <string>
+#include <map>
+
+#include "common/Formatter.h"
+#include "common/dout.h"
+#include "rgw_formats.h"
+#include "rgw_user.h"
+#include "rgw_sal_fwd.h"
+
+
// Static helpers around the RGW usage log: render (show), prune (trim)
// and wipe (clear) usage records. All methods scope their work to a
// bucket when `bucket` is non-null, else to `user`, else cluster-wide.
class RGWUsage
{
public:
  // Render usage in [start_epoch, end_epoch] into `flusher`'s formatter;
  // `categories` optionally filters category names (empty map = all).
  static int show(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver,
		  rgw::sal::User* user , rgw::sal::Bucket* bucket,
		  uint64_t start_epoch, uint64_t end_epoch, bool show_log_entries,
		  bool show_log_sum,
		  std::map<std::string, bool> *categories, RGWFormatterFlusher& flusher);

  // Delete usage records within [start_epoch, end_epoch].
  static int trim(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver,
		  rgw::sal::User* user , rgw::sal::Bucket* bucket,
		  uint64_t start_epoch, uint64_t end_epoch);

  // Delete all usage records, cluster-wide.
  static int clear(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver);
};
diff --git a/src/rgw/rgw_user.cc b/src/rgw/rgw_user.cc
new file mode 100644
index 000000000..e5e07cbc4
--- /dev/null
+++ b/src/rgw/rgw_user.cc
@@ -0,0 +1,127 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "rgw_sal_rados.h"
+
+#include "include/types.h"
+#include "rgw_user.h"
+
+// until everything is moved from rgw_common
+#include "rgw_common.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+int rgw_user_sync_all_stats(const DoutPrefixProvider *dpp, rgw::sal::Driver* driver,
+ rgw::sal::User* user, optional_yield y)
+{
+ rgw::sal::BucketList user_buckets;
+
+ CephContext *cct = driver->ctx();
+ size_t max_entries = cct->_conf->rgw_list_buckets_max_chunk;
+ string marker;
+ int ret;
+
+ do {
+ ret = user->list_buckets(dpp, marker, string(), max_entries, false, user_buckets, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "failed to read user buckets: ret=" << ret << dendl;
+ return ret;
+ }
+ auto& buckets = user_buckets.get_buckets();
+ for (auto i = buckets.begin(); i != buckets.end(); ++i) {
+ marker = i->first;
+
+ auto& bucket = i->second;
+
+ ret = bucket->load_bucket(dpp, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: could not read bucket info: bucket=" << bucket << " ret=" << ret << dendl;
+ continue;
+ }
+ ret = bucket->sync_user_stats(dpp, y);
+ if (ret < 0) {
+ ldout(cct, 0) << "ERROR: could not sync bucket stats: ret=" << ret << dendl;
+ return ret;
+ }
+ ret = bucket->check_bucket_shards(dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR in check_bucket_shards: " << cpp_strerror(-ret)<< dendl;
+ }
+ }
+ } while (user_buckets.is_truncated());
+
+ ret = user->complete_flush_stats(dpp, y);
+ if (ret < 0) {
+ cerr << "ERROR: failed to complete syncing user stats: ret=" << ret << std::endl;
+ return ret;
+ }
+
+ return 0;
+}
+
+int rgw_user_get_all_buckets_stats(const DoutPrefixProvider *dpp,
+ rgw::sal::Driver* driver,
+ rgw::sal::User* user,
+ map<string, bucket_meta_entry>& buckets_usage_map,
+ optional_yield y)
+{
+ CephContext *cct = driver->ctx();
+ size_t max_entries = cct->_conf->rgw_list_buckets_max_chunk;
+ bool done;
+ string marker;
+ int ret;
+
+ do {
+ rgw::sal::BucketList buckets;
+ ret = user->list_buckets(dpp, marker, string(), max_entries, false, buckets, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "failed to read user buckets: ret=" << ret << dendl;
+ return ret;
+ }
+ auto& m = buckets.get_buckets();
+ for (const auto& i : m) {
+ marker = i.first;
+
+ auto& bucket_ent = i.second;
+ ret = bucket_ent->load_bucket(dpp, y, true /* load user stats */);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: could not get bucket stats: ret=" << ret << dendl;
+ return ret;
+ }
+ bucket_meta_entry entry;
+ entry.size = bucket_ent->get_size();
+ entry.size_rounded = bucket_ent->get_size_rounded();
+ entry.creation_time = bucket_ent->get_creation_time();
+ entry.count = bucket_ent->get_count();
+ buckets_usage_map.emplace(bucket_ent->get_name(), entry);
+ }
+ done = (buckets.count() < max_entries);
+ } while (!done);
+
+ return 0;
+}
+
+int rgw_validate_tenant_name(const string& t)
+{
+ struct tench {
+ static bool is_good(char ch) {
+ return isalnum(ch) || ch == '_';
+ }
+ };
+ std::string::const_iterator it =
+ std::find_if_not(t.begin(), t.end(), tench::is_good);
+ return (it == t.end())? 0: -ERR_INVALID_TENANT_NAME;
+}
+
/**
 * Get the anonymous (ie, unauthenticated) user info.
 *
 * Overwrites `info` with the well-known anonymous identity: user id
 * RGW_USER_ANON_ID, no display name and no access keys. Other fields of
 * `info` are left as the caller provided them.
 */
void rgw_get_anon_user(RGWUserInfo& info)
{
  info.user_id = RGW_USER_ANON_ID;
  info.display_name.clear();
  info.access_keys.clear();
}
+
diff --git a/src/rgw/rgw_user_types.h b/src/rgw/rgw_user_types.h
new file mode 100644
index 000000000..c9a1a46ad
--- /dev/null
+++ b/src/rgw/rgw_user_types.h
@@ -0,0 +1,158 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+/* N.B., this header defines fundamental serialized types. Do not
+ * include files which can only be compiled in radosgw or OSD
+ * contexts (e.g., rgw_sal.h, rgw_common.h) */
+
+#pragma once
+
+#include <string_view>
+#include <fmt/format.h>
+
+#include "common/dout.h"
+#include "common/Formatter.h"
+
/* A radosgw user identity: an id, optionally qualified by a tenant and
 * a namespace. Canonical string forms (see to_str()/from_str()):
 * "id", "tenant$id", "tenant$ns$id", "$ns$id".
 * This is a fundamental serialized type (see file header): any change
 * to encode()/decode() requires a struct version bump. */
struct rgw_user {
  std::string tenant;
  std::string id;
  std::string ns;

  rgw_user() {}
  // Parses the combined "tenant$ns$id" string form.
  explicit rgw_user(const std::string& s) {
    from_str(s);
  }
  rgw_user(const std::string& tenant, const std::string& id, const std::string& ns="")
    : tenant(tenant),
      id(id),
      ns(ns) {
  }
  rgw_user(std::string&& tenant, std::string&& id, std::string&& ns="")
    : tenant(std::move(tenant)),
      id(std::move(id)),
      ns(std::move(ns)) {
  }

  void encode(ceph::buffer::list& bl) const {
    // v2 added the ns field
    ENCODE_START(2, 1, bl);
    encode(tenant, bl);
    encode(id, bl);
    encode(ns, bl);
    ENCODE_FINISH(bl);
  }
  void decode(ceph::buffer::list::const_iterator& bl) {
    DECODE_START(2, bl);
    decode(tenant, bl);
    decode(id, bl);
    if (struct_v >= 2) {
      decode(ns, bl);
    }
    DECODE_FINISH(bl);
  }

  // Render the canonical '$'-separated string form.
  void to_str(std::string& str) const {
    if (!tenant.empty()) {
      if (!ns.empty()) {
        str = tenant + '$' + ns + '$' + id;
      } else {
        str = tenant + '$' + id;
      }
    } else if (!ns.empty()) {
      // empty tenant with a namespace: keep a leading '$' so the form
      // round-trips through from_str()
      str = '$' + ns + '$' + id;
    } else {
      str = id;
    }
  }

  void clear() {
    tenant.clear();
    id.clear();
    ns.clear();
  }

  // NOTE: only id is consulted; tenant/ns alone do not make the user
  // non-empty.
  bool empty() const {
    return id.empty();
  }

  std::string to_str() const {
    std::string s;
    to_str(s);
    return s;
  }

  // Inverse of to_str(): split on the first one or two '$' separators;
  // a string without '$' is a bare id.
  void from_str(const std::string& str) {
    size_t pos = str.find('$');
    if (pos != std::string::npos) {
      tenant = str.substr(0, pos);
      std::string_view sv = str;
      std::string_view ns_id = sv.substr(pos + 1);
      size_t ns_pos = ns_id.find('$');
      if (ns_pos != std::string::npos) {
        ns = std::string(ns_id.substr(0, ns_pos));
        id = std::string(ns_id.substr(ns_pos + 1));
      } else {
        ns.clear();
        id = std::string(ns_id);
      }
    } else {
      tenant.clear();
      ns.clear();
      id = str;
    }
  }

  rgw_user& operator=(const std::string& str) {
    from_str(str);
    return *this;
  }

  // Three-way comparison: tenant first, then ns, then id.
  int compare(const rgw_user& u) const {
    int r = tenant.compare(u.tenant);
    if (r != 0)
      return r;
    r = ns.compare(u.ns);
    if (r != 0) {
      return r;
    }
    return id.compare(u.id);
  }
  int compare(const std::string& str) const {
    rgw_user u(str);
    return compare(u);
  }

  bool operator!=(const rgw_user& rhs) const {
    return (compare(rhs) != 0);
  }
  bool operator==(const rgw_user& rhs) const {
    return (compare(rhs) == 0);
  }
  // Same ordering as compare(): tenant, then ns, then id.
  bool operator<(const rgw_user& rhs) const {
    if (tenant < rhs.tenant) {
      return true;
    } else if (tenant > rhs.tenant) {
      return false;
    }
    if (ns < rhs.ns) {
      return true;
    } else if (ns > rhs.ns) {
      return false;
    }
    return (id < rhs.id);
  }
  void dump(ceph::Formatter *f) const;
  static void generate_test_instances(std::list<rgw_user*>& o);
};
WRITE_CLASS_ENCODER(rgw_user)
diff --git a/src/rgw/rgw_web_idp.h b/src/rgw/rgw_web_idp.h
new file mode 100644
index 000000000..a9aa5b829
--- /dev/null
+++ b/src/rgw/rgw_web_idp.h
@@ -0,0 +1,26 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
#pragma once

// fix: this header uses std::string but relied on its includers having
// already pulled in <string>
#include <string>

namespace rgw {
namespace web_idp {

// WebTokenClaims contains some claims from the decoded token which are
// of interest to us.
struct WebTokenClaims {
  // Subject of the token
  std::string sub;
  // Intended audience for this token
  std::string aud;
  // Issuer of this token
  std::string iss;
  // Human-readable id for the resource owner
  std::string user_name;
  // Client Id
  std::string client_id;
  // Authorized party (azp)
  std::string azp;
};

} // namespace web_idp
} // namespace rgw
diff --git a/src/rgw/rgw_website.cc b/src/rgw/rgw_website.cc
new file mode 100644
index 000000000..0b68fc170
--- /dev/null
+++ b/src/rgw/rgw_website.cc
@@ -0,0 +1,341 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Yehuda Sadeh <yehuda@redhat.com>
+ * Copyright (C) 2015 Robin H. Johnson <robin.johnson@dreamhost.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/debug.h"
+
+#include "common/ceph_json.h"
+#include "common/Formatter.h"
+
+#include "acconfig.h"
+
+#include <errno.h>
+#include <string>
+#include <list>
+#include "include/types.h"
+#include "rgw_website.h"
+#include "rgw_common.h"
+#include "rgw_xml.h"
+
+using namespace std;
+
+bool RGWBWRoutingRuleCondition::check_key_condition(const string& key) {
+ return (key.size() >= key_prefix_equals.size() &&
+ key.compare(0, key_prefix_equals.size(), key_prefix_equals) == 0);
+}
+
+
+void RGWBWRoutingRule::apply_rule(const string& default_protocol, const string& default_hostname,
+ const string& key, string *new_url, int *redirect_code)
+{
+ RGWRedirectInfo& redirect = redirect_info.redirect;
+
+ string protocol = (!redirect.protocol.empty() ? redirect.protocol : default_protocol);
+ string hostname = (!redirect.hostname.empty() ? redirect.hostname : default_hostname);
+
+ *new_url = protocol + "://" + hostname + "/";
+
+ if (!redirect_info.replace_key_prefix_with.empty()) {
+ *new_url += redirect_info.replace_key_prefix_with;
+ if (key.size() > condition.key_prefix_equals.size()) {
+ *new_url += key.substr(condition.key_prefix_equals.size());
+ }
+ } else if (!redirect_info.replace_key_with.empty()) {
+ *new_url += redirect_info.replace_key_with;
+ } else {
+ *new_url += key;
+ }
+
+ if(redirect.http_redirect_code > 0)
+ *redirect_code = redirect.http_redirect_code;
+}
+
+bool RGWBWRoutingRules::check_key_and_error_code_condition(const string &key, int error_code, RGWBWRoutingRule **rule)
+{
+ for (list<RGWBWRoutingRule>::iterator iter = rules.begin(); iter != rules.end(); ++iter) {
+ if (iter->check_key_condition(key) && iter->check_error_code_condition(error_code)) {
+ *rule = &(*iter);
+ return true;
+ }
+ }
+ return false;
+}
+
+bool RGWBWRoutingRules::check_key_condition(const string& key, RGWBWRoutingRule **rule)
+{
+ for (list<RGWBWRoutingRule>::iterator iter = rules.begin(); iter != rules.end(); ++iter) {
+ if (iter->check_key_condition(key)) {
+ *rule = &(*iter);
+ return true;
+ }
+ }
+ return false;
+}
+
+bool RGWBWRoutingRules::check_error_code_condition(const int http_error_code, RGWBWRoutingRule **rule)
+{
+ for (list<RGWBWRoutingRule>::iterator iter = rules.begin(); iter != rules.end(); ++iter) {
+ if (iter->check_error_code_condition(http_error_code)) {
+ *rule = &(*iter);
+ return true;
+ }
+ }
+ return false;
+}
+
+bool RGWBucketWebsiteConf::should_redirect(const string& key, const int http_error_code, RGWBWRoutingRule *redirect)
+{
+ RGWBWRoutingRule *rule;
+ if(!redirect_all.hostname.empty()) {
+ RGWBWRoutingRule redirect_all_rule;
+ redirect_all_rule.redirect_info.redirect = redirect_all;
+ redirect_all.http_redirect_code = 301;
+ *redirect = redirect_all_rule;
+ return true;
+ } else if (!routing_rules.check_key_and_error_code_condition(key, http_error_code, &rule)) {
+ return false;
+ }
+
+ *redirect = *rule;
+
+ return true;
+}
+
+bool RGWBucketWebsiteConf::get_effective_key(const string& key, string *effective_key, bool is_file) const
+{
+ if (index_doc_suffix.empty()) {
+ return false;
+ }
+
+ if (key.empty()) {
+ *effective_key = index_doc_suffix;
+ } else if (key[key.size() - 1] == '/') {
+ *effective_key = key + index_doc_suffix;
+ } else if (! is_file) {
+ *effective_key = key + "/" + index_doc_suffix;
+ } else {
+ *effective_key = key;
+ }
+
+ return true;
+}
+
// ---- JSON (de)serialization, used by the admin/metadata paths ----

void RGWRedirectInfo::dump(Formatter *f) const
{
  encode_json("protocol", protocol, f);
  encode_json("hostname", hostname, f);
  encode_json("http_redirect_code", (int)http_redirect_code, f);
}

void RGWRedirectInfo::decode_json(JSONObj *obj) {
  JSONDecoder::decode_json("protocol", protocol, obj);
  JSONDecoder::decode_json("hostname", hostname, obj);
  // decode through an int, then narrow into the uint16_t field
  int code;
  JSONDecoder::decode_json("http_redirect_code", code, obj);
  http_redirect_code = code;
}

void RGWBWRedirectInfo::dump(Formatter *f) const
{
  encode_json("redirect", redirect, f);
  encode_json("replace_key_prefix_with", replace_key_prefix_with, f);
  encode_json("replace_key_with", replace_key_with, f);
}

void RGWBWRedirectInfo::decode_json(JSONObj *obj) {
  JSONDecoder::decode_json("redirect", redirect, obj);
  JSONDecoder::decode_json("replace_key_prefix_with", replace_key_prefix_with, obj);
  JSONDecoder::decode_json("replace_key_with", replace_key_with, obj);
}

void RGWBWRoutingRuleCondition::dump(Formatter *f) const
{
  encode_json("key_prefix_equals", key_prefix_equals, f);
  encode_json("http_error_code_returned_equals", (int)http_error_code_returned_equals, f);
}

void RGWBWRoutingRuleCondition::decode_json(JSONObj *obj) {
  JSONDecoder::decode_json("key_prefix_equals", key_prefix_equals, obj);
  // decode through an int, then narrow into the uint16_t field
  int code;
  JSONDecoder::decode_json("http_error_code_returned_equals", code, obj);
  http_error_code_returned_equals = code;
}

void RGWBWRoutingRule::dump(Formatter *f) const
{
  encode_json("condition", condition, f);
  encode_json("redirect_info", redirect_info, f);
}

void RGWBWRoutingRule::decode_json(JSONObj *obj) {
  JSONDecoder::decode_json("condition", condition, obj);
  JSONDecoder::decode_json("redirect_info", redirect_info, obj);
}

void RGWBWRoutingRules::dump(Formatter *f) const
{
  encode_json("rules", rules, f);
}

void RGWBWRoutingRules::decode_json(JSONObj *obj) {
  JSONDecoder::decode_json("rules", rules, obj);
}

void RGWBucketWebsiteConf::dump(Formatter *f) const
{
  // redirect_all and the index/error/rules configuration are mutually
  // exclusive: only one of the two shapes is emitted
  if (!redirect_all.hostname.empty()) {
    encode_json("redirect_all", redirect_all, f);
  } else {
    encode_json("index_doc_suffix", index_doc_suffix, f);
    encode_json("error_doc", error_doc, f);
    encode_json("routing_rules", routing_rules, f);
  }
}

void RGWBucketWebsiteConf::decode_json(JSONObj *obj) {
  // unlike dump(), decoding accepts all fields unconditionally
  JSONDecoder::decode_json("redirect_all", redirect_all, obj);
  JSONDecoder::decode_json("index_doc_suffix", index_doc_suffix, obj);
  JSONDecoder::decode_json("error_doc", error_doc, obj);
  JSONDecoder::decode_json("routing_rules", routing_rules, obj);
}
+
// ---- S3 website XML (de)serialization ----

void RGWBWRedirectInfo::dump_xml(Formatter *f) const
{
  // S3 semantics: only emit elements that are actually set
  if (!redirect.protocol.empty()) {
    encode_xml("Protocol", redirect.protocol, f);
  }
  if (!redirect.hostname.empty()) {
    encode_xml("HostName", redirect.hostname, f);
  }
  if (redirect.http_redirect_code > 0) {
    encode_xml("HttpRedirectCode", (int)redirect.http_redirect_code, f);
  }
  if (!replace_key_prefix_with.empty()) {
    encode_xml("ReplaceKeyPrefixWith", replace_key_prefix_with, f);
  }
  if (!replace_key_with.empty()) {
    encode_xml("ReplaceKeyWith", replace_key_with, f);
  }
}

// Valid HttpRedirectCode range: 3XX excluding 300 (bounds exclusive).
#define WEBSITE_HTTP_REDIRECT_CODE_MIN 300
#define WEBSITE_HTTP_REDIRECT_CODE_MAX 400
void RGWBWRedirectInfo::decode_xml(XMLObj *obj) {
  RGWXMLDecoder::decode_xml("Protocol", redirect.protocol, obj);
  RGWXMLDecoder::decode_xml("HostName", redirect.hostname, obj);
  int code = 0;
  bool has_http_redirect_code = RGWXMLDecoder::decode_xml("HttpRedirectCode", code, obj);
  if (has_http_redirect_code &&
      !(code > WEBSITE_HTTP_REDIRECT_CODE_MIN &&
        code < WEBSITE_HTTP_REDIRECT_CODE_MAX)) {
    throw RGWXMLDecoder::err("The provided HTTP redirect code is not valid. Valid codes are 3XX except 300.");
  }
  redirect.http_redirect_code = code;
  // ReplaceKeyPrefixWith and ReplaceKeyWith are mutually exclusive
  bool has_replace_key_prefix_with = RGWXMLDecoder::decode_xml("ReplaceKeyPrefixWith", replace_key_prefix_with, obj);
  bool has_replace_key_with = RGWXMLDecoder::decode_xml("ReplaceKeyWith", replace_key_with, obj);
  if (has_replace_key_prefix_with && has_replace_key_with) {
    throw RGWXMLDecoder::err("You can only define ReplaceKeyPrefix or ReplaceKey but not both.");
  }
}

void RGWBWRoutingRuleCondition::dump_xml(Formatter *f) const
{
  if (!key_prefix_equals.empty()) {
    encode_xml("KeyPrefixEquals", key_prefix_equals, f);
  }
  if (http_error_code_returned_equals > 0) {
    encode_xml("HttpErrorCodeReturnedEquals", (int)http_error_code_returned_equals, f);
  }
}

// Valid HttpErrorCodeReturnedEquals range: 4XX and 5XX
// (lower bound inclusive, upper bound exclusive).
#define WEBSITE_HTTP_ERROR_CODE_RETURNED_EQUALS_MIN 400
#define WEBSITE_HTTP_ERROR_CODE_RETURNED_EQUALS_MAX 600
void RGWBWRoutingRuleCondition::decode_xml(XMLObj *obj) {
  RGWXMLDecoder::decode_xml("KeyPrefixEquals", key_prefix_equals, obj);
  int code = 0;
  bool has_http_error_code_returned_equals = RGWXMLDecoder::decode_xml("HttpErrorCodeReturnedEquals", code, obj);
  if (has_http_error_code_returned_equals &&
      !(code >= WEBSITE_HTTP_ERROR_CODE_RETURNED_EQUALS_MIN &&
        code < WEBSITE_HTTP_ERROR_CODE_RETURNED_EQUALS_MAX)) {
    throw RGWXMLDecoder::err("The provided HTTP redirect code is not valid. Valid codes are 4XX or 5XX.");
  }
  http_error_code_returned_equals = code;
}

void RGWBWRoutingRule::dump_xml(Formatter *f) const
{
  encode_xml("Condition", condition, f);
  encode_xml("Redirect", redirect_info, f);
}

void RGWBWRoutingRule::decode_xml(XMLObj *obj) {
  RGWXMLDecoder::decode_xml("Condition", condition, obj);
  RGWXMLDecoder::decode_xml("Redirect", redirect_info, obj);
}

// Serialize a rule list as <RoutingRules><RoutingRule>...</RoutingRule>.
// NOTE(review): the `name` parameter is ignored — the element name is
// hardcoded as "RoutingRules".
static void encode_xml(const char *name, const std::list<RGWBWRoutingRule>& l, ceph::Formatter *f)
{
  do_encode_xml("RoutingRules", l, "RoutingRule", f);
}
+
// Emit the S3 WebsiteConfiguration document: RedirectAllRequestsTo,
// IndexDocument, ErrorDocument and RoutingRules, each only when set.
void RGWBucketWebsiteConf::dump_xml(Formatter *f) const
{
  if (!redirect_all.hostname.empty()) {
    f->open_object_section("RedirectAllRequestsTo");
    encode_xml("HostName", redirect_all.hostname, f);
    if (!redirect_all.protocol.empty()) {
      encode_xml("Protocol", redirect_all.protocol, f);
    }
    f->close_section();
  }
  if (!index_doc_suffix.empty()) {
    f->open_object_section("IndexDocument");
    encode_xml("Suffix", index_doc_suffix, f);
    f->close_section();
  }
  if (!error_doc.empty()) {
    f->open_object_section("ErrorDocument");
    encode_xml("Key", error_doc, f);
    f->close_section();
  }
  if (!routing_rules.rules.empty()) {
    encode_xml("RoutingRules", routing_rules.rules, f);
  }
}

// Decode a list of <RoutingRule> children.
void decode_xml_obj(list<RGWBWRoutingRule>& l, XMLObj *obj)
{
  do_decode_xml_obj(l, "RoutingRule", obj);
}

// Parse the S3 WebsiteConfiguration document. RedirectAllRequestsTo is
// mutually exclusive with the IndexDocument/ErrorDocument/RoutingRules
// shape; the is_redirect_all/is_set_index_doc flags record which shape
// was present (they are set only here, not by the binary decode()).
void RGWBucketWebsiteConf::decode_xml(XMLObj *obj) {
  XMLObj *o = obj->find_first("RedirectAllRequestsTo");
  if (o) {
    is_redirect_all = true;
    // HostName is mandatory inside RedirectAllRequestsTo
    RGWXMLDecoder::decode_xml("HostName", redirect_all.hostname, o, true);
    RGWXMLDecoder::decode_xml("Protocol", redirect_all.protocol, o);
  } else {
    o = obj->find_first("IndexDocument");
    if (o) {
      is_set_index_doc = true;
      RGWXMLDecoder::decode_xml("Suffix", index_doc_suffix, o);
    }
    o = obj->find_first("ErrorDocument");
    if (o) {
      RGWXMLDecoder::decode_xml("Key", error_doc, o);
    }
    RGWXMLDecoder::decode_xml("RoutingRules", routing_rules.rules, obj);
  }
}
diff --git a/src/rgw/rgw_website.h b/src/rgw/rgw_website.h
new file mode 100644
index 000000000..bf92011ba
--- /dev/null
+++ b/src/rgw/rgw_website.h
@@ -0,0 +1,243 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Yehuda Sadeh <yehuda@redhat.com>
+ * Copyright (C) 2015 Robin H. Johnson <robin.johnson@dreamhost.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include <list>
+#include <string>
+
+#include "common/ceph_json.h"
+
+#include "rgw_xml.h"
+
// Target of a website redirect: protocol, hostname and an optional
// explicit HTTP status code (0 = unset).
struct RGWRedirectInfo
{
  std::string protocol;
  std::string hostname;
  uint16_t http_redirect_code = 0;

  void encode(bufferlist& bl) const {
    ENCODE_START(1, 1, bl);
    encode(protocol, bl);
    encode(hostname, bl);
    encode(http_redirect_code, bl);
    ENCODE_FINISH(bl);
  }
  void decode(bufferlist::const_iterator& bl) {
    DECODE_START(1, bl);
    decode(protocol, bl);
    decode(hostname, bl);
    decode(http_redirect_code, bl);
    DECODE_FINISH(bl);
  }

  void dump(Formatter *f) const;
  void decode_json(JSONObj *obj);
};
WRITE_CLASS_ENCODER(RGWRedirectInfo)
+
+
// Redirect action of a website routing rule: the redirect target plus
// an optional key rewrite (prefix replacement or whole-key replacement;
// the XML decoder rejects configurations that set both).
struct RGWBWRedirectInfo
{
  RGWRedirectInfo redirect;
  std::string replace_key_prefix_with;
  std::string replace_key_with;

  void encode(bufferlist& bl) const {
    ENCODE_START(1, 1, bl);
    encode(redirect, bl);
    encode(replace_key_prefix_with, bl);
    encode(replace_key_with, bl);
    ENCODE_FINISH(bl);
  }
  void decode(bufferlist::const_iterator& bl) {
    DECODE_START(1, bl);
    decode(redirect, bl);
    decode(replace_key_prefix_with, bl);
    decode(replace_key_with, bl);
    DECODE_FINISH(bl);
  }

  void dump(Formatter *f) const;
  void dump_xml(Formatter *f) const;
  void decode_json(JSONObj *obj);
  void decode_xml(XMLObj *obj);
};
WRITE_CLASS_ENCODER(RGWBWRedirectInfo)
+
// Matching condition of a website routing rule: a key prefix and/or an
// HTTP error code (0 = unset).
struct RGWBWRoutingRuleCondition
{
  std::string key_prefix_equals;
  uint16_t http_error_code_returned_equals = 0;

  void encode(bufferlist& bl) const {
    ENCODE_START(1, 1, bl);
    encode(key_prefix_equals, bl);
    encode(http_error_code_returned_equals, bl);
    ENCODE_FINISH(bl);
  }
  void decode(bufferlist::const_iterator& bl) {
    DECODE_START(1, bl);
    decode(key_prefix_equals, bl);
    decode(http_error_code_returned_equals, bl);
    DECODE_FINISH(bl);
  }

  void dump(Formatter *f) const;
  void dump_xml(Formatter *f) const;
  void decode_json(JSONObj *obj);
  void decode_xml(XMLObj *obj);

  // True when `key` starts with key_prefix_equals (defined in
  // rgw_website.cc).
  bool check_key_condition(const std::string& key);
  // Exact match against the configured error code.
  bool check_error_code_condition(const int error_code) {
    return (uint16_t)error_code == http_error_code_returned_equals;
  }
};
WRITE_CLASS_ENCODER(RGWBWRoutingRuleCondition)
+
// One website routing rule: a condition and the redirect to apply when
// the condition matches.
struct RGWBWRoutingRule
{
  RGWBWRoutingRuleCondition condition;
  RGWBWRedirectInfo redirect_info;

  void encode(bufferlist& bl) const {
    ENCODE_START(1, 1, bl);
    encode(condition, bl);
    encode(redirect_info, bl);
    ENCODE_FINISH(bl);
  }
  void decode(bufferlist::const_iterator& bl) {
    DECODE_START(1, bl);
    decode(condition, bl);
    decode(redirect_info, bl);
    DECODE_FINISH(bl);
  }

  void dump(Formatter *f) const;
  void dump_xml(Formatter *f) const;
  void decode_json(JSONObj *obj);
  void decode_xml(XMLObj *obj);

  // Convenience forwards to the condition.
  bool check_key_condition(const std::string& key) {
    return condition.check_key_condition(key);
  }
  bool check_error_code_condition(int error_code) {
    return condition.check_error_code_condition(error_code);
  }

  // Build the redirect URL for `key`; defined in rgw_website.cc.
  void apply_rule(const std::string& default_protocol,
                  const std::string& default_hostname,
                  const std::string& key,
                  std::string *redirect,
                  int *redirect_code);
};
WRITE_CLASS_ENCODER(RGWBWRoutingRule)
+
// Ordered list of website routing rules; the check_* helpers (defined
// in rgw_website.cc) return the first rule whose condition matches.
struct RGWBWRoutingRules
{
  std::list<RGWBWRoutingRule> rules;

  void encode(bufferlist& bl) const {
    ENCODE_START(1, 1, bl);
    encode(rules, bl);
    ENCODE_FINISH(bl);
  }
  void decode(bufferlist::const_iterator& bl) {
    DECODE_START(1, bl);
    decode(rules, bl);
    DECODE_FINISH(bl);
  }

  void dump(Formatter *f) const;
  void dump_xml(Formatter *f) const;
  void decode_json(JSONObj *obj);

  bool check_key_condition(const std::string& key, RGWBWRoutingRule **rule);
  bool check_error_code_condition(int error_code, RGWBWRoutingRule **rule);
  bool check_key_and_error_code_condition(const std::string& key,
                                          const int error_code,
                                          RGWBWRoutingRule **rule);
};
WRITE_CLASS_ENCODER(RGWBWRoutingRules)
+
+struct RGWBucketWebsiteConf
+{
+ RGWRedirectInfo redirect_all;
+ std::string index_doc_suffix;
+ std::string error_doc;
+ std::string subdir_marker;
+ std::string listing_css_doc;
+ bool listing_enabled;
+ bool is_redirect_all;
+ bool is_set_index_doc;
+ RGWBWRoutingRules routing_rules;
+
+ RGWBucketWebsiteConf()
+ : listing_enabled(false) {
+ is_redirect_all = false;
+ is_set_index_doc = false;
+ }
+
+ void encode(bufferlist& bl) const {
+ ENCODE_START(2, 1, bl);
+ encode(index_doc_suffix, bl);
+ encode(error_doc, bl);
+ encode(routing_rules, bl);
+ encode(redirect_all, bl);
+ encode(subdir_marker, bl);
+ encode(listing_css_doc, bl);
+ encode(listing_enabled, bl);
+ ENCODE_FINISH(bl);
+ }
+ void decode(bufferlist::const_iterator& bl) {
+ DECODE_START(2, bl);
+ decode(index_doc_suffix, bl);
+ decode(error_doc, bl);
+ decode(routing_rules, bl);
+ decode(redirect_all, bl);
+ if (struct_v >= 2) {
+ decode(subdir_marker, bl);
+ decode(listing_css_doc, bl);
+ decode(listing_enabled, bl);
+ }
+ DECODE_FINISH(bl);
+ }
+
+ void dump(Formatter *f) const;
+ void decode_json(JSONObj *obj);
+ void decode_xml(XMLObj *obj);
+ void dump_xml(Formatter *f) const;
+
+ bool should_redirect(const std::string& key,
+ const int http_error_code,
+ RGWBWRoutingRule *redirect);
+
+ bool get_effective_key(const std::string& key,
+ std::string *effective_key, bool is_file) const;
+
+ const std::string& get_index_doc() const {
+ return index_doc_suffix;
+ }
+
+ bool is_empty() const {
+ return index_doc_suffix.empty() &&
+ error_doc.empty() &&
+ subdir_marker.empty() &&
+ listing_css_doc.empty() &&
+ ! listing_enabled;
+ }
+};
+WRITE_CLASS_ENCODER(RGWBucketWebsiteConf)
diff --git a/src/rgw/rgw_worker.h b/src/rgw/rgw_worker.h
new file mode 100644
index 000000000..eb2e55243
--- /dev/null
+++ b/src/rgw/rgw_worker.h
@@ -0,0 +1,91 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#pragma once
+
+#include <atomic>
+
+#include "common/Thread.h"
+#include "common/ceph_mutex.h"
+#include "include/common_fwd.h"
+
+class RGWRados;
+
// Base class for RGW background maintenance threads. Subclasses implement
// process() (run periodically by the worker thread) and interval_msec();
// signal() wakes the worker early, stop() (called from the destructor)
// shuts it down via down_flag.
class RGWRadosThread {
  class Worker : public Thread, public DoutPrefixProvider {
    CephContext *cct;
    RGWRadosThread *processor;
    ceph::mutex lock = ceph::make_mutex("RGWRadosThread::Worker");
    ceph::condition_variable cond;

    // block until signal() is called
    void wait() {
      std::unique_lock l{lock};
      cond.wait(l);
    };

    // block for at most wait_time, or until signal() is called
    void wait_interval(const ceph::real_clock::duration& wait_time) {
      std::unique_lock l{lock};
      cond.wait_for(l, wait_time);
    }

  public:
    Worker(CephContext *_cct, RGWRadosThread *_p) : cct(_cct), processor(_p) {}
    void *entry() override;
    // wake the worker out of wait()/wait_interval()
    void signal() {
      std::lock_guard l{lock};
      cond.notify_all();
    }

    // DoutPrefixProvider interface
    CephContext *get_cct() const { return cct; }
    unsigned get_subsys() const { return ceph_subsys_rgw; }
    std::ostream& gen_prefix(std::ostream& out) const { return out << "rgw rados thread: "; }

  };

  Worker *worker;

protected:
  CephContext *cct;
  RGWRados *store;

  // set by stop(), polled by the worker loop through going_down()
  std::atomic<bool> down_flag = { false };

  std::string thread_name;

  // milliseconds between process() invocations -- semantics of 0 depend on
  // entry() (defined elsewhere); confirm there before relying on it
  virtual uint64_t interval_msec() = 0;
  // optional extra teardown hook invoked while stopping
  virtual void stop_process() {}
public:
  RGWRadosThread(RGWRados *_store, const std::string& thread_name = "radosgw")
    : worker(NULL), cct(_store->ctx()), store(_store), thread_name(thread_name) {}
  virtual ~RGWRadosThread() {
    stop();
  }

  virtual int init(const DoutPrefixProvider *dpp) { return 0; }
  virtual int process(const DoutPrefixProvider *dpp) = 0;

  bool going_down() { return down_flag; }

  void start();
  void stop();

  // request an immediate process() run
  void signal() {
    if (worker) {
      worker->signal();
    }
  }
};
+
diff --git a/src/rgw/rgw_xml.cc b/src/rgw/rgw_xml.cc
new file mode 100644
index 000000000..22a62ac48
--- /dev/null
+++ b/src/rgw/rgw_xml.cc
@@ -0,0 +1,502 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <string.h>
+
+#include <iostream>
+#include <map>
+
+#include <expat.h>
+
+#include "include/types.h"
+#include "include/utime.h"
+
+#include "rgw_xml.h"
+
+using namespace std;
+
XMLObjIter::
XMLObjIter()
{
}

XMLObjIter::
~XMLObjIter()
{
}

// bind the iterator to the half-open child range [_cur, _end)
void XMLObjIter::
set(const XMLObjIter::map_iter_t &_cur, const XMLObjIter::map_iter_t &_end)
{
  cur = _cur;
  end = _end;
}

// return the current child and advance; NULL once the range is exhausted
XMLObj *XMLObjIter::
get_next()
{
  XMLObj *obj = NULL;
  if (cur != end) {
    obj = cur->second;
    ++cur;
  }
  return obj;
}

// peek at the tag name of the current child without advancing;
// returns false when the iterator is exhausted
bool XMLObjIter::get_name(std::string& name) const
{
  if (cur == end) {
    return false;
  }

  name = cur->first;
  return true;
}

// debug representation: "<tag>: <text content>"
ostream& operator<<(ostream &out, const XMLObj &obj) {
  out << obj.obj_type << ": " << obj.data;
  return out;
}

XMLObj::
~XMLObj()
{
}
+
+bool XMLObj::
+xml_start(XMLObj *parent, const char *el, const char **attr)
+{
+ this->parent = parent;
+ obj_type = el;
+ for (int i = 0; attr[i]; i += 2) {
+ attr_map[attr[i]] = std::string(attr[i + 1]);
+ }
+ return true;
+}
+
// default end-of-element hook; derived classes override it to build their
// target objects once the element is complete
bool XMLObj::
xml_end(const char *el)
{
  return true;
}

// accumulate element character data; expat may deliver the text of a single
// element in several chunks
void XMLObj::
xml_handle_data(const char *s, int len)
{
  data.append(s, len);
}
+
+const std::string& XMLObj::
+XMLObj::get_data() const
+{
+ return data;
+}
+
+const std::string& XMLObj::
+XMLObj::get_obj_type() const
+{
+ return obj_type;
+}
+
+XMLObj *XMLObj::
+XMLObj::get_parent()
+{
+ return parent;
+}
+
+void XMLObj::
+add_child(const std::string& el, XMLObj *obj)
+{
+ children.insert(std::pair<std::string, XMLObj *>(el, obj));
+}
+
+bool XMLObj::
+get_attr(const std::string& name, std::string& attr) const
+{
+ const std::map<std::string, std::string>::const_iterator iter = attr_map.find(name);
+ if (iter == attr_map.end())
+ return false;
+ attr = iter->second;
+ return true;
+}
+
+XMLObjIter XMLObj::
+find(const std::string& name)
+{
+ XMLObjIter iter;
+ const XMLObjIter::const_map_iter_t first = children.find(name);
+ XMLObjIter::const_map_iter_t last;
+ if (first != children.end()) {
+ last = children.upper_bound(name);
+ }else
+ last = children.end();
+ iter.set(first, last);
+ return iter;
+}
+
+XMLObjIter XMLObj::find_first()
+{
+ XMLObjIter iter;
+ const XMLObjIter::const_map_iter_t first = children.begin();
+ const XMLObjIter::const_map_iter_t last = children.end();
+ iter.set(first, last);
+ return iter;
+}
+
+XMLObj *XMLObj::
+find_first(const std::string& name)
+{
+ const XMLObjIter::const_map_iter_t first = children.find(name);
+ if (first != children.end())
+ return first->second;
+ return nullptr;
+}
+
+RGWXMLParser::
+RGWXMLParser() : buf(nullptr), buf_len(0), cur_obj(nullptr), success(true), init_called(false)
+{
+ p = XML_ParserCreate(nullptr);
+}
+
+RGWXMLParser::
+~RGWXMLParser()
+{
+ XML_ParserFree(p);
+
+ free(buf);
+ std::list<XMLObj *>::const_iterator iter;
+ for (iter = allocated_objs.begin(); iter != allocated_objs.end(); ++iter) {
+ XMLObj *obj = *iter;
+ delete obj;
+ }
+}
+
// expat start-element callback. Obtains an XMLObj for the tag (either from
// the derived class's alloc_obj() factory, or a plain lazily-decoded XMLObj),
// links it into the tree, and makes it the current node.
void RGWXMLParser::call_xml_start(void* user_data, const char *el, const char **attr) {
  RGWXMLParser *handler = static_cast<RGWXMLParser *>(user_data);
  XMLObj * obj = handler->alloc_obj(el);
  if (!obj) {
    // factory declined this tag: store a plain XMLObj by value; the
    // unallocated_objs list owns it (list elements keep stable addresses)
    handler->unallocated_objs.push_back(XMLObj());
    obj = &handler->unallocated_objs.back();
  } else {
    // the parser takes ownership of factory-produced objects and deletes
    // them in its destructor
    handler->allocated_objs.push_back(obj);
  }
  if (!obj->xml_start(handler->cur_obj, el, attr)) {
    handler->success = false;
    return;
  }
  if (handler->cur_obj) {
    handler->cur_obj->add_child(el, obj);
  } else {
    // top-level tag: the parser itself acts as the root of the tree
    handler->children.insert(std::pair<std::string, XMLObj *>(el, obj));
  }
  handler->cur_obj = obj;

  handler->objs.push_back(obj);
}
+
// expat end-element callback: run the finished node's xml_end() hook and
// make its parent the current node again. The parent is fetched before the
// hook runs so a failing hook still leaves cur_obj consistent on return.
void RGWXMLParser::call_xml_end(void* user_data, const char *el) {
  RGWXMLParser *handler = static_cast<RGWXMLParser *>(user_data);
  XMLObj *parent_obj = handler->cur_obj->get_parent();
  if (!handler->cur_obj->xml_end(el)) {
    handler->success = false;
    return;
  }
  handler->cur_obj = parent_obj;
}
+
+void RGWXMLParser::call_xml_handle_data(void* user_data, const char *s, int len)
+{
+ RGWXMLParser *handler = static_cast<RGWXMLParser *>(user_data);
+ handler->cur_obj->xml_handle_data(s, len);
+}
+
// Wire up the expat callbacks; must be called once before parse().
// Returns false when the expat parser could not be created in the ctor.
bool RGWXMLParser::init()
{
  if (!p) {
    return false;
  }
  init_called = true;
  XML_SetElementHandler(p, RGWXMLParser::call_xml_start, RGWXMLParser::call_xml_end);
  XML_SetCharacterDataHandler(p, RGWXMLParser::call_xml_handle_data);
  // 'this' is passed back to the static callbacks as user_data
  XML_SetUserData(p, (void *)this);
  return true;
}
+
+bool RGWXMLParser::parse(const char *_buf, int len, int done)
+{
+ ceph_assert(init_called);
+ int pos = buf_len;
+ char *tmp_buf;
+ tmp_buf = (char *)realloc(buf, buf_len + len);
+ if (tmp_buf == NULL){
+ free(buf);
+ buf = NULL;
+ return false;
+ } else {
+ buf = tmp_buf;
+ }
+
+ memcpy(&buf[buf_len], _buf, len);
+ buf_len += len;
+
+ success = true;
+ if (!XML_Parse(p, &buf[pos], len, done)) {
+ fprintf(stderr, "Parse error at line %d:\n%s\n",
+ (int)XML_GetCurrentLineNumber(p),
+ XML_ErrorString(XML_GetErrorCode(p)));
+ success = false;
+ }
+
+ return success;
+}
+
+void decode_xml_obj(unsigned long& val, XMLObj *obj)
+{
+ auto& s = obj->get_data();
+ const char *start = s.c_str();
+ char *p;
+
+ errno = 0;
+ val = strtoul(start, &p, 10);
+
+ /* Check for various possible errors */
+
+ if ((errno == ERANGE && val == ULONG_MAX) ||
+ (errno != 0 && val == 0)) {
+ throw RGWXMLDecoder::err("failed to number");
+ }
+
+ if (p == start) {
+ throw RGWXMLDecoder::err("failed to parse number");
+ }
+
+ while (*p != '\0') {
+ if (!isspace(*p)) {
+ throw RGWXMLDecoder::err("failed to parse number");
+ }
+ p++;
+ }
+}
+
+
+void decode_xml_obj(long& val, XMLObj *obj)
+{
+ const std::string s = obj->get_data();
+ const char *start = s.c_str();
+ char *p;
+
+ errno = 0;
+ val = strtol(start, &p, 10);
+
+ /* Check for various possible errors */
+
+ if ((errno == ERANGE && (val == LONG_MAX || val == LONG_MIN)) ||
+ (errno != 0 && val == 0)) {
+ throw RGWXMLDecoder::err("failed to parse number");
+ }
+
+ if (p == start) {
+ throw RGWXMLDecoder::err("failed to parse number");
+ }
+
+ while (*p != '\0') {
+ if (!isspace(*p)) {
+ throw RGWXMLDecoder::err("failed to parse number");
+ }
+ p++;
+ }
+}
+
+void decode_xml_obj(long long& val, XMLObj *obj)
+{
+ const std::string s = obj->get_data();
+ const char *start = s.c_str();
+ char *p;
+
+ errno = 0;
+ val = strtoll(start, &p, 10);
+
+ /* Check for various possible errors */
+
+ if ((errno == ERANGE && (val == LLONG_MAX || val == LLONG_MIN)) ||
+ (errno != 0 && val == 0)) {
+ throw RGWXMLDecoder::err("failed to parse number");
+ }
+
+ if (p == start) {
+ throw RGWXMLDecoder::err("failed to parse number");
+ }
+
+ while (*p != '\0') {
+ if (!isspace(*p)) {
+ throw RGWXMLDecoder::err("failed to parse number");
+ }
+ p++;
+ }
+}
+
+void decode_xml_obj(unsigned long long& val, XMLObj *obj)
+{
+ const std::string s = obj->get_data();
+ const char *start = s.c_str();
+ char *p;
+
+ errno = 0;
+ val = strtoull(start, &p, 10);
+
+ /* Check for various possible errors */
+
+ if ((errno == ERANGE && val == ULLONG_MAX) ||
+ (errno != 0 && val == 0)) {
+ throw RGWXMLDecoder::err("failed to parse number");
+ }
+
+ if (p == start) {
+ throw RGWXMLDecoder::err("failed to parse number");
+ }
+
+ while (*p != '\0') {
+ if (!isspace(*p)) {
+ throw RGWXMLDecoder::err("failed to parse number");
+ }
+ p++;
+ }
+}
+
+void decode_xml_obj(int& val, XMLObj *obj)
+{
+ long l;
+ decode_xml_obj(l, obj);
+#if LONG_MAX > INT_MAX
+ if (l > INT_MAX || l < INT_MIN) {
+ throw RGWXMLDecoder::err("integer out of range");
+ }
+#endif
+
+ val = (int)l;
+}
+
+void decode_xml_obj(unsigned& val, XMLObj *obj)
+{
+ unsigned long l;
+ decode_xml_obj(l, obj);
+#if ULONG_MAX > UINT_MAX
+ if (l > UINT_MAX) {
+ throw RGWXMLDecoder::err("unsigned integer out of range");
+ }
+#endif
+
+ val = (unsigned)l;
+}
+
+void decode_xml_obj(bool& val, XMLObj *obj)
+{
+ const std::string s = obj->get_data();
+ if (strncasecmp(s.c_str(), "true", 8) == 0) {
+ val = true;
+ return;
+ }
+ if (strncasecmp(s.c_str(), "false", 8) == 0) {
+ val = false;
+ return;
+ }
+ int i;
+ decode_xml_obj(i, obj);
+ val = (bool)i;
+}
+
// Decode base64-encoded element text into a bufferlist.
// Throws RGWXMLDecoder::err when the text is not valid base64.
void decode_xml_obj(bufferlist& val, XMLObj *obj)
{
  const std::string s = obj->get_data();

  bufferlist bl;
  bl.append(s.c_str(), s.size());
  try {
    val.decode_base64(bl);
  } catch (buffer::error& err) {
    throw RGWXMLDecoder::err("failed to decode base64");
  }
}

// Parse the element's text as a timestamp (formats accepted by
// utime_t::parse_date). Throws RGWXMLDecoder::err on failure.
void decode_xml_obj(utime_t& val, XMLObj *obj)
{
  const std::string s = obj->get_data();
  uint64_t epoch;
  uint64_t nsec;
  int r = utime_t::parse_date(s, &epoch, &nsec);
  if (r == 0) {
    val = utime_t(epoch, nsec);
  } else {
    throw RGWXMLDecoder::err("failed to decode utime_t");
  }
}
+
// encode_xml() overloads for the primitive types: each emits a single
// named value through the Formatter.

void encode_xml(const char *name, const string& val, Formatter *f)
{
  f->dump_string(name, val);
}

void encode_xml(const char *name, const char *val, Formatter *f)
{
  f->dump_string(name, val);
}

// booleans are serialized as "True"/"False" (capitalized)
void encode_xml(const char *name, bool val, Formatter *f)
{
  std::string s;
  if (val)
    s = "True";
  else
    s = "False";

  f->dump_string(name, s);
}

void encode_xml(const char *name, int val, Formatter *f)
{
  f->dump_int(name, val);
}

void encode_xml(const char *name, long val, Formatter *f)
{
  f->dump_int(name, val);
}

void encode_xml(const char *name, unsigned val, Formatter *f)
{
  f->dump_unsigned(name, val);
}

void encode_xml(const char *name, unsigned long val, Formatter *f)
{
  f->dump_unsigned(name, val);
}

void encode_xml(const char *name, unsigned long long val, Formatter *f)
{
  f->dump_unsigned(name, val);
}

void encode_xml(const char *name, long long val, Formatter *f)
{
  f->dump_int(name, val);
}

// timestamps are emitted in GMT (see utime_t::gmtime)
void encode_xml(const char *name, const utime_t& val, Formatter *f)
{
  val.gmtime(f->dump_stream(name));
}

// binary payloads are emitted base64-encoded
void encode_xml(const char *name, const bufferlist& bl, Formatter *f)
{
  /* need to copy data from bl, as it is const bufferlist */
  bufferlist src = bl;

  bufferlist b64;
  src.encode_base64(b64);

  const std::string s(b64.c_str(), b64.length());

  encode_xml(name, s, f);
}
+
diff --git a/src/rgw/rgw_xml.h b/src/rgw/rgw_xml.h
new file mode 100644
index 000000000..74a8c27a0
--- /dev/null
+++ b/src/rgw/rgw_xml.h
@@ -0,0 +1,371 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include <map>
+#include <stdexcept>
+#include <string>
+#include <iosfwd>
+#include <include/types.h>
+#include <common/Formatter.h>
+
+class XMLObj;
+class RGWXMLParser;
+
// Forward-only iterator over the children of an XMLObj that share a tag
// name (produced by XMLObj::find()/find_first()).
class XMLObjIter {
public:
  typedef std::map<std::string, XMLObj *>::iterator map_iter_t;
  // NOTE(review): despite its name this alias is NOT a const_iterator, and
  // XMLObj actually stores its children in a std::multimap -- this only
  // compiles because common stdlibs use the same red-black-tree iterator
  // type for map and multimap; not guaranteed by the standard
  typedef std::map<std::string, XMLObj *>::iterator const_map_iter_t;

  XMLObjIter();
  virtual ~XMLObjIter();
  void set(const XMLObjIter::const_map_iter_t &_cur, const XMLObjIter::const_map_iter_t &_end);
  XMLObj *get_next();
  bool get_name(std::string& name) const;

private:
  map_iter_t cur;
  map_iter_t end;
};
+
+/**
+ * Represents a block of XML.
+ * Give the class an XML blob, and it will parse the blob into
+ * an attr_name->value map.
+ * It shouldn't be the start point for any parsing. Look at RGWXMLParser for that.
+ */
class XMLObj
{
private:
  XMLObj *parent;        // enclosing element; NULL for a root node
  std::string obj_type;  // tag name of this element

protected:
  std::string data;                              // accumulated character data
  std::multimap<std::string, XMLObj *> children; // sub-elements keyed by tag
  std::map<std::string, std::string> attr_map;   // attribute name -> value

  // invoked at the beginning of the XML tag, and populate any attributes
  bool xml_start(XMLObj *parent, const char *el, const char **attr);
  // callback invoked at the end of the XML tag
  // if objects are created while parsing, this should be overwritten in the derived class
  virtual bool xml_end(const char *el);
  // callback invoked for storing the data of the XML tag
  // if data manipulation is needed this could be overwritten in the derived class
  virtual void xml_handle_data(const char *s, int len);
  // get the parent object
  XMLObj *get_parent();
  // add a child XML object
  void add_child(const std::string& el, XMLObj *obj);

public:
  XMLObj() : parent(nullptr) {}
  virtual ~XMLObj();

  // get the data (as string)
  const std::string& get_data() const;
  // get the type of the object (as string)
  const std::string& get_obj_type() const;
  // fetch an attribute value by name; returns false if absent
  bool get_attr(const std::string& name, std::string& attr) const;
  // return a list of sub-tags matching the name
  XMLObjIter find(const std::string& name);
  // return the first sub-tag
  XMLObjIter find_first();
  // return the first sub-tags matching the name
  XMLObj *find_first(const std::string& name);

  friend std::ostream& operator<<(std::ostream &out, const XMLObj &obj);
  friend RGWXMLParser;
};
+
// opaque expat parser handle (XML_Parser without pulling in expat.h here)
struct XML_ParserStruct;

// an XML parser is an XML object without a parent (root of the tree)
// the parser could be used in 2 ways:
//
// (1) lazy object creation/intrusive API: usually used within the RGWXMLDecode namespace (as RGWXMLDecode::XMLParser)
// the parser will parse the input and store info, but will not generate the target object. The object can be allocated outside
// of the parser (stack or heap), and require to implement the decode_xml() API for the values to be populated.
// note that the decode_xml() calls may throw exceptions if parsing fails
//
// (2) object creation while parsing: a new class needs to be derived from RGWXMLParser and implement alloc_obj()
// API that should create a set of classes derived from XMLObj implementing xml_end() to create the actual target objects
//
// There could be a mix-and-match of the 2 types, control over that is in the alloc_obj() call
// deciding for which tags objects are allocate during parsing and for which tags object allocation is external

class RGWXMLParser : public XMLObj
{
private:
  XML_ParserStruct *p;              // underlying expat parser
  char *buf;                        // accumulated raw input (see get_xml())
  int buf_len;
  XMLObj *cur_obj;                  // innermost element currently open
  std::vector<XMLObj *> objs;       // every node, in parse order
  std::list<XMLObj *> allocated_objs;   // nodes from alloc_obj(); owned, deleted in dtor
  std::list<XMLObj> unallocated_objs;   // fallback nodes, owned by value
  bool success;                     // cleared when a callback reports failure
  bool init_called;

  // calls xml_start() on each parsed object
  // passed as static callback to actual parser, passes itself as user_data
  static void call_xml_start(void* user_data, const char *el, const char **attr);
  // calls xml_end() on each parsed object
  // passed as static callback to actual parser, passes itself as user_data
  static void call_xml_end(void* user_data, const char *el);
  // calls xml_handle_data() on each parsed object
  // passed as static callback to actual parser, passes itself as user_data
  static void call_xml_handle_data(void* user_data, const char *s, int len);

protected:
  // if objects are created while parsing, this should be implemented in the derived class
  // and be a factory for creating the classes derived from XMLObj
  // note that not all sub-tags has to be constructed here, any such tag which is not
  // constructed will be lazily created when decode_xml() is invoked on it
  //
  // note that in case of different tags sharing the same name at different levels
  // this method should not be used
  virtual XMLObj *alloc_obj(const char *el) {
    return nullptr;
  }

public:
  RGWXMLParser();
  virtual ~RGWXMLParser() override;

  // initialize the parser, must be called before parsing
  bool init();
  // parse the XML buffer (can be invoked multiple times for incremental parsing)
  // receives the buffer to parse, its length, and boolean indication (0,1)
  // whether this is the final chunk of the buffer
  bool parse(const char *buf, int len, int done);
  // get the XML blob being parsed
  const char *get_xml() const { return buf; }
};
+
namespace RGWXMLDecoder {
  // thrown by decode_xml()/decode_xml_obj() on malformed or missing input
  struct err : std::runtime_error {
    using runtime_error::runtime_error;
  };

  typedef RGWXMLParser XMLParser;

  // decode the first sub-tag 'name' of obj into val; returns false (or
  // throws err, if mandatory) when the tag is absent
  template<class T>
  bool decode_xml(const char *name, T& val, XMLObj* obj, bool mandatory = false);

  // decode every sub-tag 'name' of obj into vector v
  template<class T>
  bool decode_xml(const char *name, std::vector<T>& v, XMLObj* obj, bool mandatory = false);

  // decode a container via a caller-supplied per-element callback
  template<class C>
  bool decode_xml(const char *name, C& container, void (*cb)(C&, XMLObj *obj), XMLObj *obj, bool mandatory = false);

  // decode with a fallback: val gets default_val when the tag is absent
  template<class T>
  void decode_xml(const char *name, T& val, T& default_val, XMLObj* obj);
}
+
+static inline std::ostream& operator<<(std::ostream &out, RGWXMLDecoder::err& err)
+{
+ return out << err.what();
+}
+
// generic fallback: the type decodes itself via a decode_xml() member
template<class T>
void decode_xml_obj(T& val, XMLObj *obj)
{
  val.decode_xml(obj);
}

// strings take the element text verbatim
static inline void decode_xml_obj(std::string& val, XMLObj *obj)
{
  val = obj->get_data();
}

// primitive-type decoders (defined in rgw_xml.cc); all throw
// RGWXMLDecoder::err on malformed input
void decode_xml_obj(unsigned long long& val, XMLObj *obj);
void decode_xml_obj(long long& val, XMLObj *obj);
void decode_xml_obj(unsigned long& val, XMLObj *obj);
void decode_xml_obj(long& val, XMLObj *obj);
void decode_xml_obj(unsigned& val, XMLObj *obj);
void decode_xml_obj(int& val, XMLObj *obj);
void decode_xml_obj(bool& val, XMLObj *obj);
void decode_xml_obj(bufferlist& val, XMLObj *obj);
class utime_t;
void decode_xml_obj(utime_t& val, XMLObj *obj);

// optionals are engaged first, then decoded in place
template<class T>
void decode_xml_obj(std::optional<T>& val, XMLObj *obj)
{
  val.emplace();
  decode_xml_obj(*val, obj);
}

// decode every sub-tag 'name' of obj into list l (l is cleared first)
template<class T>
void do_decode_xml_obj(std::list<T>& l, const std::string& name, XMLObj *obj)
{
  l.clear();

  XMLObjIter iter = obj->find(name);
  XMLObj *o;

  while ((o = iter.get_next())) {
    T val;
    decode_xml_obj(val, o);
    l.push_back(val);
  }
}
+
// Decode the first sub-tag 'name' of obj into val.
// Absent tag: throws err when mandatory, otherwise resets val to T() and
// returns false. Decode failures are rethrown with the tag name prefixed.
template<class T>
bool RGWXMLDecoder::decode_xml(const char *name, T& val, XMLObj *obj, bool mandatory)
{
  XMLObjIter iter = obj->find(name);
  XMLObj *o = iter.get_next();
  if (!o) {
    if (mandatory) {
      std::string s = "missing mandatory field " + std::string(name);
      throw err(s);
    }
    val = T();
    return false;
  }

  try {
    decode_xml_obj(val, o);
  } catch (const err& e) {
    std::string s = std::string(name) + ": ";
    s.append(e.what());
    throw err(s);
  }

  return true;
}
+
+template<class T>
+bool RGWXMLDecoder::decode_xml(const char *name, std::vector<T>& v, XMLObj *obj, bool mandatory)
+{
+ XMLObjIter iter = obj->find(name);
+ XMLObj *o = iter.get_next();
+
+ v.clear();
+
+ if (!o) {
+ if (mandatory) {
+ std::string s = "missing mandatory field " + std::string(name);
+ throw err(s);
+ }
+ return false;
+ }
+
+ do {
+ T val;
+ try {
+ decode_xml_obj(val, o);
+ } catch (const err& e) {
+ std::string s = std::string(name) + ": ";
+ s.append(e.what());
+ throw err(s);
+ }
+ v.push_back(val);
+ } while ((o = iter.get_next()));
+ return true;
+}
+
// Decode the first sub-tag 'name' of obj into 'container' through a
// caller-supplied callback (container is cleared first). Absent tag:
// throws err when mandatory, otherwise returns false.
template<class C>
bool RGWXMLDecoder::decode_xml(const char *name, C& container, void (*cb)(C&, XMLObj *), XMLObj *obj, bool mandatory)
{
  container.clear();

  XMLObjIter iter = obj->find(name);
  XMLObj *o = iter.get_next();
  if (!o) {
    if (mandatory) {
      std::string s = "missing mandatory field " + std::string(name);
      throw err(s);
    }
    return false;
  }

  try {
    // resolves to a decode_xml_obj(C&, cb, XMLObj*) overload supplied by
    // the container's own support code
    decode_xml_obj(container, cb, o);
  } catch (const err& e) {
    std::string s = std::string(name) + ": ";
    s.append(e.what());
    throw err(s);
  }

  return true;
}
+
// Decode the first sub-tag 'name' of obj into val, falling back to
// default_val when the tag is absent. NOTE: on a decode failure val is set
// to default_val but the error is still rethrown (with the tag name
// prefixed) -- callers see both the fallback and the exception.
template<class T>
void RGWXMLDecoder::decode_xml(const char *name, T& val, T& default_val, XMLObj *obj)
{
  XMLObjIter iter = obj->find(name);
  XMLObj *o = iter.get_next();
  if (!o) {
    val = default_val;
    return;
  }

  try {
    decode_xml_obj(val, o);
  } catch (const err& e) {
    val = default_val;
    std::string s = std::string(name) + ": ";
    s.append(e.what());
    throw err(s);
  }
}
+
// generic fallback: the type serializes itself via a dump_xml() member,
// wrapped in an object section named 'name'
template<class T>
static void encode_xml(const char *name, const T& val, ceph::Formatter *f)
{
  f->open_object_section(name);
  val.dump_xml(f);
  f->close_section();
}

// same, but the section is opened inside XML namespace 'ns'
template<class T>
static void encode_xml(const char *name, const char *ns, const T& val, ceph::Formatter *f)
{
  f->open_object_section_in_ns(name, ns);
  val.dump_xml(f);
  f->close_section();
}
+
+void encode_xml(const char *name, const std::string& val, ceph::Formatter *f);
+void encode_xml(const char *name, const char *val, ceph::Formatter *f);
+void encode_xml(const char *name, bool val, ceph::Formatter *f);
+void encode_xml(const char *name, int val, ceph::Formatter *f);
+void encode_xml(const char *name, unsigned val, ceph::Formatter *f);
+void encode_xml(const char *name, long val, ceph::Formatter *f);
+void encode_xml(const char *name, unsigned long val, ceph::Formatter *f);
+void encode_xml(const char *name, long long val, ceph::Formatter *f);
+void encode_xml(const char *name, const utime_t& val, ceph::Formatter *f);
+void encode_xml(const char *name, const bufferlist& bl, ceph::Formatter *f);
+void encode_xml(const char *name, long long unsigned val, ceph::Formatter *f);
+
+template<class T>
+static void do_encode_xml(const char *name, const std::list<T>& l, const char *entry_name, ceph::Formatter *f)
+{
+ f->open_array_section(name);
+ for (typename std::list<T>::const_iterator iter = l.begin(); iter != l.end(); ++iter) {
+ encode_xml(entry_name, *iter, f);
+ }
+ f->close_section();
+}
+
+template<class T>
+static void encode_xml(const char *name, const std::vector<T>& l, ceph::Formatter *f)
+{
+ for (typename std::vector<T>::const_iterator iter = l.begin(); iter != l.end(); ++iter) {
+ encode_xml(name, *iter, f);
+ }
+}
+
+template<class T>
+static void encode_xml(const char *name, const std::optional<T>& o, ceph::Formatter *f)
+{
+ if (!o) {
+ return;
+ }
+
+ encode_xml(name, *o, f);
+}
diff --git a/src/rgw/rgw_xml_enc.cc b/src/rgw/rgw_xml_enc.cc
new file mode 100644
index 000000000..554e953d7
--- /dev/null
+++ b/src/rgw/rgw_xml_enc.cc
@@ -0,0 +1,25 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Yehuda Sadeh <yehuda@redhat.com>
+ * Copyright (C) 2015 Robin H. Johnson <robin.johnson@dreamhost.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "rgw_common.h"
+#include "rgw_xml.h"
+
+#include "common/Formatter.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
diff --git a/src/rgw/rgw_zone.cc b/src/rgw/rgw_zone.cc
new file mode 100644
index 000000000..b743689ed
--- /dev/null
+++ b/src/rgw/rgw_zone.cc
@@ -0,0 +1,1371 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <optional>
+
+#include "common/errno.h"
+
+#include "rgw_zone.h"
+#include "rgw_sal_config.h"
+#include "rgw_sync.h"
+
+#include "services/svc_zone.h"
+
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+
// Default pool-name suffixes and rados object names/prefixes used when the
// corresponding configuration options are left empty.
namespace rgw_zone_defaults {

static std::string default_bucket_index_pool_suffix = "rgw.buckets.index";
static std::string default_storage_extra_pool_suffix = "rgw.buckets.non-ec";
static std::string zone_info_oid_prefix = "zone_info.";

std::string zone_names_oid_prefix = "zone_names.";
// "region" names are the pre-zonegroup (old-format) equivalents
std::string region_info_oid_prefix = "region_info.";
std::string zone_group_info_oid_prefix = "zonegroup_info.";
std::string default_region_info_oid = "default.region";
std::string default_zone_group_info_oid = "default.zonegroup";
std::string region_map_oid = "region_map";
std::string default_zonegroup_name = "default";
std::string default_zone_name = "default";
std::string zonegroup_names_oid_prefix = "zonegroups_names.";
// all metadata roots share the same pool by default
std::string RGW_DEFAULT_ZONE_ROOT_POOL = "rgw.root";
std::string RGW_DEFAULT_ZONEGROUP_ROOT_POOL = "rgw.root";
std::string RGW_DEFAULT_PERIOD_ROOT_POOL = "rgw.root";
std::string avail_pools = ".pools.avail";
std::string default_storage_pool_suffix = "rgw.buckets.data";

}
+
+using namespace std;
+using namespace rgw_zone_defaults;
+
// Dump an access key including its secret (dump_plain), unlike the
// redacting encode_json() path.
void encode_json_plain(const char *name, const RGWAccessKey& val, Formatter *f)
{
  f->open_object_section(name);
  val.dump_plain(f);
  f->close_section();
}

// JSONDecoder callback: decode one zone entry and index it by id
static void decode_zones(map<rgw_zone_id, RGWZone>& zones, JSONObj *o)
{
  RGWZone z;
  z.decode_json(o);
  zones[z.id] = z;
}

// JSONDecoder callback: decode one placement target and index it by name
static void decode_placement_targets(map<string, RGWZoneGroupPlacementTarget>& targets, JSONObj *o)
{
  RGWZoneGroupPlacementTarget t;
  t.decode_json(o);
  targets[t.name] = t;
}
+
+void RGWZone::generate_test_instances(list<RGWZone*> &o)
+{
+ RGWZone *z = new RGWZone;
+ o.push_back(z);
+ o.push_back(new RGWZone);
+}
+
// Serialize the zone to JSON; field names mirror decode_json() below.
void RGWZone::dump(Formatter *f) const
{
  encode_json("id", id, f);
  encode_json("name", name, f);
  encode_json("endpoints", endpoints, f);
  encode_json("log_meta", log_meta, f);
  encode_json("log_data", log_data, f);
  encode_json("bucket_index_max_shards", bucket_index_max_shards, f);
  encode_json("read_only", read_only, f);
  encode_json("tier_type", tier_type, f);
  encode_json("sync_from_all", sync_from_all, f);
  encode_json("sync_from", sync_from, f);
  encode_json("redirect_zone", redirect_zone, f);
  encode_json("supported_features", supported_features, f);
}

// Populate the zone from JSON. Old-format documents have no "id", in which
// case the name doubles as the id; "sync_from_all" defaults to true.
void RGWZone::decode_json(JSONObj *obj)
{
  JSONDecoder::decode_json("id", id, obj);
  JSONDecoder::decode_json("name", name, obj);
  if (id.empty()) {
    id = name;
  }
  JSONDecoder::decode_json("endpoints", endpoints, obj);
  JSONDecoder::decode_json("log_meta", log_meta, obj);
  JSONDecoder::decode_json("log_data", log_data, obj);
  JSONDecoder::decode_json("bucket_index_max_shards", bucket_index_max_shards, obj);
  JSONDecoder::decode_json("read_only", read_only, obj);
  JSONDecoder::decode_json("tier_type", tier_type, obj);
  JSONDecoder::decode_json("sync_from_all", sync_from_all, true, obj);
  JSONDecoder::decode_json("sync_from", sync_from, obj);
  JSONDecoder::decode_json("redirect_zone", redirect_zone, obj);
  JSONDecoder::decode_json("supported_features", supported_features, obj);
}
+
// Initialize the metadata object and (when setup_obj) load it from rados.
// The id is resolved by falling through: explicit id -> name-as-id (old
// format) -> predefined id from config -> lookup by name -> cluster default.
int RGWSystemMetaObj::init(const DoutPrefixProvider *dpp, CephContext *_cct, RGWSI_SysObj *_sysobj_svc,
			   optional_yield y,
			   bool setup_obj, bool old_format)
{
  reinit_instance(_cct, _sysobj_svc);

  if (!setup_obj)
    return 0;

  if (old_format && id.empty()) {
    // old-format objects are keyed by name
    id = name;
  }

  if (id.empty()) {
    id = get_predefined_id(cct);
  }

  if (id.empty()) {
    int r;
    if (name.empty()) {
      name = get_predefined_name(cct);
    }
    if (name.empty()) {
      // nothing to go on: fall back to the cluster's default object
      r = use_default(dpp, y, old_format);
      if (r < 0) {
	return r;
      }
    } else if (!old_format) {
      // resolve the name to an id through the names index object
      r = read_id(dpp, name, id, y);
      if (r < 0) {
	if (r != -ENOENT) {
	  ldpp_dout(dpp, 0) << "error in read_id for object name: " << name << " : " << cpp_strerror(-r) << dendl;
	}
	return r;
      }
    }
  }

  return read_info(dpp, id, y, old_format);
}
+
// out-of-line to anchor the vtable in this translation unit
RGWZoneGroup::~RGWZoneGroup() {}
+
+const string RGWZoneGroup::get_default_oid(bool old_region_format) const
+{
+ if (old_region_format) {
+ if (cct->_conf->rgw_default_region_info_oid.empty()) {
+ return default_region_info_oid;
+ }
+ return cct->_conf->rgw_default_region_info_oid;
+ }
+
+ string default_oid = cct->_conf->rgw_default_zonegroup_info_oid;
+
+ if (cct->_conf->rgw_default_zonegroup_info_oid.empty()) {
+ default_oid = default_zone_group_info_oid;
+ }
+
+ default_oid += "." + realm_id;
+
+ return default_oid;
+}
+
// prefix of per-zonegroup info objects (or per-region in the old format)
const string& RGWZoneGroup::get_info_oid_prefix(bool old_region_format) const
{
  if (old_region_format) {
    return region_info_oid_prefix;
  }
  return zone_group_info_oid_prefix;
}

// prefix of the name -> id index objects
const string& RGWZoneGroup::get_names_oid_prefix() const
{
  return zonegroup_names_oid_prefix;
}

// zonegroup id pinned via configuration (empty when unset)
string RGWZoneGroup::get_predefined_id(CephContext *cct) const {
  return cct->_conf.get_val<string>("rgw_zonegroup_id");
}

// zonegroup name pinned via configuration (empty when unset)
const string& RGWZoneGroup::get_predefined_name(CephContext *cct) const {
  return cct->_conf->rgw_zonegroup;
}
+
+rgw_pool RGWZoneGroup::get_pool(CephContext *cct_) const
+{
+ if (cct_->_conf->rgw_zonegroup_root_pool.empty()) {
+ return rgw_pool(RGW_DEFAULT_ZONEGROUP_ROOT_POOL);
+ }
+
+ return rgw_pool(cct_->_conf->rgw_zonegroup_root_pool);
+}
+
// Resolve the default zonegroup id. The default is scoped to a realm, so
// an unset realm_id is first filled in from the default realm; when no
// realm exists at all, fall back to looking up the well-known name.
int RGWZoneGroup::read_default_id(const DoutPrefixProvider *dpp, string& default_id, optional_yield y,
				  bool old_format)
{
  if (realm_id.empty()) {
    /* try using default realm */
    RGWRealm realm;
    int ret = realm.init(dpp, cct, sysobj_svc, y);
    // no default realm exist
    if (ret < 0) {
      return read_id(dpp, default_zonegroup_name, default_id, y);
    }
    realm_id = realm.get_id();
  }

  return RGWSystemMetaObj::read_default_id(dpp, default_id, y, old_format);
}

// adopt the cluster's default object id as this object's id
int RGWSystemMetaObj::use_default(const DoutPrefixProvider *dpp, optional_yield y, bool old_format)
{
  return read_default_id(dpp, id, y, old_format);
}

// rebind cached context/service pointers (used on init and re-init)
void RGWSystemMetaObj::reinit_instance(CephContext *_cct, RGWSI_SysObj *_sysobj_svc)
{
  cct = _cct;
  sysobj_svc = _sysobj_svc;
  zone_svc = _sysobj_svc->get_zone_svc();
}

// Read and decode this object's info from rados by id. Returns 0 on
// success, a negative errno on read failure, or -EIO when the stored blob
// cannot be decoded.
int RGWSystemMetaObj::read_info(const DoutPrefixProvider *dpp, const string& obj_id, optional_yield y,
				bool old_format)
{
  rgw_pool pool(get_pool(cct));

  bufferlist bl;

  string oid = get_info_oid_prefix(old_format) + obj_id;

  auto sysobj = sysobj_svc->get_obj(rgw_raw_obj{pool, oid});
  int ret = sysobj.rop().read(dpp, &bl, y);
  if (ret < 0) {
    ldpp_dout(dpp, 0) << "failed reading obj info from " << pool << ":" << oid << ": " << cpp_strerror(-ret) << dendl;
    return ret;
  }
  using ceph::decode;

  try {
    auto iter = bl.cbegin();
    decode(*this, iter);
  } catch (buffer::error& err) {
    ldpp_dout(dpp, 0) << "ERROR: failed to decode obj from " << pool << ":" << oid << dendl;
    return -EIO;
  }

  return 0;
}
+
// Populate the zonegroup from JSON. Old-format documents carry no id in
// the base object, so the name doubles as the id.
void RGWZoneGroup::decode_json(JSONObj *obj)
{
  RGWSystemMetaObj::decode_json(obj);
  if (id.empty()) {
    derr << "old format " << dendl;
    JSONDecoder::decode_json("name", name, obj);
    id = name;
  }
  JSONDecoder::decode_json("api_name", api_name, obj);
  JSONDecoder::decode_json("is_master", is_master, obj);
  JSONDecoder::decode_json("endpoints", endpoints, obj);
  JSONDecoder::decode_json("hostnames", hostnames, obj);
  JSONDecoder::decode_json("hostnames_s3website", hostnames_s3website, obj);
  JSONDecoder::decode_json("master_zone", master_zone, obj);
  JSONDecoder::decode_json("zones", zones, decode_zones, obj);
  JSONDecoder::decode_json("placement_targets", placement_targets, decode_placement_targets, obj);
  // default_placement is stored as a string and re-parsed into a rule
  string pr;
  JSONDecoder::decode_json("default_placement", pr, obj);
  default_placement.from_str(pr);
  JSONDecoder::decode_json("realm_id", realm_id, obj);
  JSONDecoder::decode_json("sync_policy", sync_policy, obj);
  JSONDecoder::decode_json("enabled_features", enabled_features, obj);
}

// out-of-line to anchor the vtable in this translation unit
RGWZoneParams::~RGWZoneParams() {}
+
+void RGWZoneParams::decode_json(JSONObj *obj)
+{
+ RGWSystemMetaObj::decode_json(obj);
+ JSONDecoder::decode_json("domain_root", domain_root, obj);
+ JSONDecoder::decode_json("control_pool", control_pool, obj);
+ JSONDecoder::decode_json("gc_pool", gc_pool, obj);
+ JSONDecoder::decode_json("lc_pool", lc_pool, obj);
+ JSONDecoder::decode_json("log_pool", log_pool, obj);
+ JSONDecoder::decode_json("intent_log_pool", intent_log_pool, obj);
+ JSONDecoder::decode_json("roles_pool", roles_pool, obj);
+ JSONDecoder::decode_json("reshard_pool", reshard_pool, obj);
+ JSONDecoder::decode_json("usage_log_pool", usage_log_pool, obj);
+ JSONDecoder::decode_json("user_keys_pool", user_keys_pool, obj);
+ JSONDecoder::decode_json("user_email_pool", user_email_pool, obj);
+ JSONDecoder::decode_json("user_swift_pool", user_swift_pool, obj);
+ JSONDecoder::decode_json("user_uid_pool", user_uid_pool, obj);
+ JSONDecoder::decode_json("otp_pool", otp_pool, obj);
+ JSONDecoder::decode_json("system_key", system_key, obj);
+ JSONDecoder::decode_json("placement_pools", placement_pools, obj);
+ JSONDecoder::decode_json("tier_config", tier_config, obj);
+ JSONDecoder::decode_json("realm_id", realm_id, obj);
+ JSONDecoder::decode_json("notif_pool", notif_pool, obj);
+
+}
+
+void RGWZoneParams::dump(Formatter *f) const
+{
+ RGWSystemMetaObj::dump(f);
+ encode_json("domain_root", domain_root, f);
+ encode_json("control_pool", control_pool, f);
+ encode_json("gc_pool", gc_pool, f);
+ encode_json("lc_pool", lc_pool, f);
+ encode_json("log_pool", log_pool, f);
+ encode_json("intent_log_pool", intent_log_pool, f);
+ encode_json("usage_log_pool", usage_log_pool, f);
+ encode_json("roles_pool", roles_pool, f);
+ encode_json("reshard_pool", reshard_pool, f);
+ encode_json("user_keys_pool", user_keys_pool, f);
+ encode_json("user_email_pool", user_email_pool, f);
+ encode_json("user_swift_pool", user_swift_pool, f);
+ encode_json("user_uid_pool", user_uid_pool, f);
+ encode_json("otp_pool", otp_pool, f);
+ encode_json_plain("system_key", system_key, f);
+ encode_json("placement_pools", placement_pools, f);
+ encode_json("tier_config", tier_config, f);
+ encode_json("realm_id", realm_id, f);
+ encode_json("notif_pool", notif_pool, f);
+}
+
// Initialize zone params, defaulting the zone name from the rgw_zone
// config option when none was set, then delegate to the generic
// system-meta-object initialization.
int RGWZoneParams::init(const DoutPrefixProvider *dpp,
                        CephContext *cct, RGWSI_SysObj *sysobj_svc,
                        optional_yield y, bool setup_obj, bool old_format)
{
  if (name.empty()) {
    name = cct->_conf->rgw_zone;
  }

  return RGWSystemMetaObj::init(dpp, cct, sysobj_svc, y, setup_obj, old_format);
}
+
+rgw_pool RGWZoneParams::get_pool(CephContext *cct) const
+{
+ if (cct->_conf->rgw_zone_root_pool.empty()) {
+ return rgw_pool(RGW_DEFAULT_ZONE_ROOT_POOL);
+ }
+
+ return rgw_pool(cct->_conf->rgw_zone_root_pool);
+}
+
+const string RGWZoneParams::get_default_oid(bool old_format) const
+{
+ if (old_format) {
+ return cct->_conf->rgw_default_zone_info_oid;
+ }
+
+ return cct->_conf->rgw_default_zone_info_oid + "." + realm_id;
+}
+
// Prefix for the name -> id mapping objects of zones.
const string& RGWZoneParams::get_names_oid_prefix() const
{
  return zone_names_oid_prefix;
}
+
// Prefix for zone info objects. Zones use the same prefix for both
// formats, so old_format is deliberately ignored here.
const string& RGWZoneParams::get_info_oid_prefix(bool old_format) const
{
  return zone_info_oid_prefix;
}
+
// Zone id preconfigured via the rgw_zone_id option (may be empty).
string RGWZoneParams::get_predefined_id(CephContext *cct) const {
  return cct->_conf.get_val<string>("rgw_zone_id");
}
+
// Zone name preconfigured via the rgw_zone option (may be empty).
const string& RGWZoneParams::get_predefined_name(CephContext *cct) const {
  return cct->_conf->rgw_zone;
}
+
// Resolve the id of the default zone. Mirrors
// RGWZoneGroup::read_default_id: bind to the default realm when possible,
// otherwise fall back to a by-name lookup of the zone called "default".
int RGWZoneParams::read_default_id(const DoutPrefixProvider *dpp, string& default_id, optional_yield y,
                                   bool old_format)
{
  if (realm_id.empty()) {
    /* try using default realm */
    RGWRealm realm;
    int ret = realm.init(dpp, cct, sysobj_svc, y);
    //no default realm exist
    if (ret < 0) {
      return read_id(dpp, default_zone_name, default_id, y);
    }
    realm_id = realm.get_id();
  }

  return RGWSystemMetaObj::read_default_id(dpp, default_id, y, old_format);
}
+
+
// Mark this zone as the default zone of its realm. The default marker is
// realm-scoped (see get_default_oid), so a realm id is required; bind to
// the default realm when none is set and fail with -EINVAL otherwise.
int RGWZoneParams::set_as_default(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive)
{
  if (realm_id.empty()) {
    /* try using default realm */
    RGWRealm realm;
    int ret = realm.init(dpp, cct, sysobj_svc, y);
    if (ret < 0) {
      ldpp_dout(dpp, 10) << "could not read realm id: " << cpp_strerror(-ret) << dendl;
      return -EINVAL;
    }
    realm_id = realm.get_id();
  }

  return RGWSystemMetaObj::set_as_default(dpp, y, exclusive);
}
+
// Create and persist a new zone. Seeds a "default-placement" target when
// no legacy pool-placement config exists, renames pools that would clash
// with other zones, stores the object, and opportunistically makes it the
// cluster default (exclusive write, so an existing default is kept).
int RGWZoneParams::create(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive)
{
  /* check for old pools config */
  rgw_raw_obj obj(domain_root, avail_pools);
  auto sysobj = sysobj_svc->get_obj(obj);
  int r = sysobj.rop().stat(y, dpp);
  if (r < 0) {
    ldpp_dout(dpp, 10) << "couldn't find old data placement pools config, setting up new ones for the zone" << dendl;
    /* a new system, let's set new placement info */
    RGWZonePlacementInfo default_placement;
    default_placement.index_pool = name + "." + default_bucket_index_pool_suffix;
    rgw_pool pool = name + "." + default_storage_pool_suffix;
    default_placement.storage_classes.set_storage_class(RGW_STORAGE_CLASS_STANDARD, &pool, nullptr);
    default_placement.data_extra_pool = name + "." + default_storage_extra_pool_suffix;
    placement_pools["default-placement"] = default_placement;
  }

  // de-duplicate pool names against every other zone's pools
  r = fix_pool_names(dpp, y);
  if (r < 0) {
    ldpp_dout(dpp, 0) << "ERROR: fix_pool_names returned r=" << r << dendl;
    return r;
  }

  r = RGWSystemMetaObj::create(dpp, y, exclusive);
  if (r < 0) {
    return r;
  }

  // try to set as default. may race with another create, so pass exclusive=true
  // so we don't override an existing default
  r = set_as_default(dpp, y, true);
  if (r < 0 && r != -EEXIST) {
    ldpp_dout(dpp, 10) << "WARNING: failed to set zone as default, r=" << r << dendl;
  }

  return 0;
}
+
+rgw_pool fix_zone_pool_dup(const set<rgw_pool>& pools,
+ const string& default_prefix,
+ const string& default_suffix,
+ const rgw_pool& suggested_pool)
+{
+ string suggested_name = suggested_pool.to_str();
+
+ string prefix = default_prefix;
+ string suffix = default_suffix;
+
+ if (!suggested_pool.empty()) {
+ prefix = suggested_name.substr(0, suggested_name.find("."));
+ suffix = suggested_name.substr(prefix.length());
+ }
+
+ rgw_pool pool(prefix + suffix);
+
+ while (pools.count(pool)) {
+ pool = prefix + "_" + std::to_string(std::rand()) + suffix;
+ }
+ return pool;
+}
+
+void add_zone_pools(const RGWZoneParams& info,
+ std::set<rgw_pool>& pools)
+{
+ pools.insert(info.domain_root);
+ pools.insert(info.control_pool);
+ pools.insert(info.gc_pool);
+ pools.insert(info.log_pool);
+ pools.insert(info.intent_log_pool);
+ pools.insert(info.usage_log_pool);
+ pools.insert(info.user_keys_pool);
+ pools.insert(info.user_email_pool);
+ pools.insert(info.user_swift_pool);
+ pools.insert(info.user_uid_pool);
+ pools.insert(info.otp_pool);
+ pools.insert(info.roles_pool);
+ pools.insert(info.reshard_pool);
+ pools.insert(info.oidc_pool);
+ pools.insert(info.notif_pool);
+
+ for (const auto& [pname, placement] : info.placement_pools) {
+ pools.insert(placement.index_pool);
+ for (const auto& [sname, sc] : placement.storage_classes.get_all()) {
+ if (sc.data_pool) {
+ pools.insert(sc.data_pool.get());
+ }
+ }
+ pools.insert(placement.data_extra_pool);
+ }
+}
+
+namespace rgw {
+
// ConfigStore-based variant: gather the pools of every zone except
// `my_zone_id` into `pools`, paging through zone names 128 at a time.
// Returns 0 on success or the first listing/read error.
int get_zones_pool_set(const DoutPrefixProvider *dpp,
                       optional_yield y,
                       rgw::sal::ConfigStore* cfgstore,
                       std::string_view my_zone_id,
                       std::set<rgw_pool>& pools)
{
  std::array<std::string, 128> zone_names;
  rgw::sal::ListResult<std::string> listing;
  do {
    int r = cfgstore->list_zone_names(dpp, y, listing.next,
                                      zone_names, listing);
    if (r < 0) {
      ldpp_dout(dpp, 0) << "failed to list zones with " << cpp_strerror(r) << dendl;
      return r;
    }

    for (const auto& name : listing.entries) {
      RGWZoneParams info;
      r = cfgstore->read_zone_by_name(dpp, y, name, info, nullptr);
      if (r < 0) {
        ldpp_dout(dpp, 0) << "failed to load zone " << name
            << " with " << cpp_strerror(r) << dendl;
        return r;
      }
      // skip our own zone; only foreign pools must be avoided
      if (info.get_id() != my_zone_id) {
        add_zone_pools(info, pools);
      }
    }
  } while (!listing.next.empty());

  return 0;
}
+
+}
+
// Legacy sysobj-based variant of rgw::get_zones_pool_set(): load each named
// zone and gather its pools, skipping the zone identified by `my_zone_id`.
static int get_zones_pool_set(const DoutPrefixProvider *dpp,
                              CephContext* cct,
                              RGWSI_SysObj* sysobj_svc,
                              const list<string>& zone_names,
                              const string& my_zone_id,
                              set<rgw_pool>& pool_names,
                              optional_yield y)
{
  for (const auto& name : zone_names) {
    RGWZoneParams zone(name);
    int r = zone.init(dpp, cct, sysobj_svc, y);
    if (r < 0) {
      ldpp_dout(dpp, 0) << "Error: failed to load zone " << name
          << " with " << cpp_strerror(-r) << dendl;
      return r;
    }
    if (zone.get_id() != my_zone_id) {
      add_zone_pools(zone, pool_names);
    }
  }
  return 0;
}
+
// Rewrite every pool name of this zone so that it does not collide with a
// pool already used by another zone. Missing names are defaulted to
// "<zone-name>.<suffix>"; colliding names get a random tag (see
// fix_zone_pool_dup). A failure to list zones is logged but tolerated,
// since an empty exclusion set still yields usable defaults.
int RGWZoneParams::fix_pool_names(const DoutPrefixProvider *dpp, optional_yield y)
{

  list<string> zones;
  int r = zone_svc->list_zones(dpp, zones);
  if (r < 0) {
    ldpp_dout(dpp, 10) << "WARNING: driver->list_zones() returned r=" << r << dendl;
  }

  set<rgw_pool> pools;
  r = get_zones_pool_set(dpp, cct, sysobj_svc, zones, id, pools, y);
  if (r < 0) {
    ldpp_dout(dpp, 0) << "Error: get_zones_pool_names" << r << dendl;
    return r;
  }

  domain_root = fix_zone_pool_dup(pools, name, ".rgw.meta:root", domain_root);
  control_pool = fix_zone_pool_dup(pools, name, ".rgw.control", control_pool);
  gc_pool = fix_zone_pool_dup(pools, name ,".rgw.log:gc", gc_pool);
  lc_pool = fix_zone_pool_dup(pools, name ,".rgw.log:lc", lc_pool);
  log_pool = fix_zone_pool_dup(pools, name, ".rgw.log", log_pool);
  intent_log_pool = fix_zone_pool_dup(pools, name, ".rgw.log:intent", intent_log_pool);
  usage_log_pool = fix_zone_pool_dup(pools, name, ".rgw.log:usage", usage_log_pool);
  user_keys_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.keys", user_keys_pool);
  user_email_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.email", user_email_pool);
  user_swift_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.swift", user_swift_pool);
  user_uid_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.uid", user_uid_pool);
  roles_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:roles", roles_pool);
  reshard_pool = fix_zone_pool_dup(pools, name, ".rgw.log:reshard", reshard_pool);
  otp_pool = fix_zone_pool_dup(pools, name, ".rgw.otp", otp_pool);
  oidc_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:oidc", oidc_pool);
  notif_pool = fix_zone_pool_dup(pools, name ,".rgw.log:notif", notif_pool);

  // per-placement-target pools: index, per-storage-class data, and extra
  for(auto& iter : placement_pools) {
    iter.second.index_pool = fix_zone_pool_dup(pools, name, "." + default_bucket_index_pool_suffix,
                                               iter.second.index_pool);
    for (auto& pi : iter.second.storage_classes.get_all()) {
      if (pi.second.data_pool) {
        rgw_pool& pool = pi.second.data_pool.get();
        pool = fix_zone_pool_dup(pools, name, "." + default_storage_pool_suffix,
                                 pool);
      }
    }
    iter.second.data_extra_pool= fix_zone_pool_dup(pools, name, "." + default_storage_extra_pool_suffix,
                                                   iter.second.data_extra_pool);
  }

  return 0;
}
+
// Load the per-realm period configuration object from the period root
// pool. Returns the read error as-is (e.g. -ENOENT when never written),
// or -EIO if the stored blob cannot be decoded.
int RGWPeriodConfig::read(const DoutPrefixProvider *dpp, RGWSI_SysObj *sysobj_svc, const std::string& realm_id,
                          optional_yield y)
{
  const auto& pool = get_pool(sysobj_svc->ctx());
  const auto& oid = get_oid(realm_id);
  bufferlist bl;

  auto sysobj = sysobj_svc->get_obj(rgw_raw_obj{pool, oid});
  int ret = sysobj.rop().read(dpp, &bl, y);
  if (ret < 0) {
    return ret;
  }
  using ceph::decode;
  try {
    auto iter = bl.cbegin();
    decode(*this, iter);
  } catch (buffer::error& err) {
    return -EIO;
  }
  return 0;
}
+
// Persist the per-realm period configuration, unconditionally overwriting
// any existing object (non-exclusive write).
int RGWPeriodConfig::write(const DoutPrefixProvider *dpp,
                           RGWSI_SysObj *sysobj_svc,
                           const std::string& realm_id, optional_yield y)
{
  const auto& pool = get_pool(sysobj_svc->ctx());
  const auto& oid = get_oid(realm_id);
  bufferlist bl;
  using ceph::encode;
  encode(*this, bl);
  auto sysobj = sysobj_svc->get_obj(rgw_raw_obj{pool, oid});
  return sysobj.wop()
               .set_exclusive(false)
               .write(dpp, bl, y);
}
+
// Populate period-wide quota and rate-limit settings from JSON
// (mirrors dump()).
void RGWPeriodConfig::decode_json(JSONObj *obj)
{
  JSONDecoder::decode_json("bucket_quota", quota.bucket_quota, obj);
  JSONDecoder::decode_json("user_quota", quota.user_quota, obj);
  JSONDecoder::decode_json("user_ratelimit", user_ratelimit, obj);
  JSONDecoder::decode_json("bucket_ratelimit", bucket_ratelimit, obj);
  JSONDecoder::decode_json("anonymous_ratelimit", anon_ratelimit, obj);
}
+
// Serialize period-wide quota and rate-limit settings to JSON
// (mirrors decode_json()).
void RGWPeriodConfig::dump(Formatter *f) const
{
  encode_json("bucket_quota", quota.bucket_quota, f);
  encode_json("user_quota", quota.user_quota, f);
  encode_json("user_ratelimit", user_ratelimit, f);
  encode_json("bucket_ratelimit", bucket_ratelimit, f);
  encode_json("anonymous_ratelimit", anon_ratelimit, f);
}
+
+std::string RGWPeriodConfig::get_oid(const std::string& realm_id)
+{
+ if (realm_id.empty()) {
+ return "period_config.default";
+ }
+ return "period_config." + realm_id;
+}
+
// Root pool for period objects: rgw_period_root_pool when configured,
// otherwise the compile-time default.
rgw_pool RGWPeriodConfig::get_pool(CephContext *cct)
{
  const auto& pool_name = cct->_conf->rgw_period_root_pool;
  if (pool_name.empty()) {
    return {RGW_DEFAULT_PERIOD_ROOT_POOL};
  }
  return {pool_name};
}
+
// Delete this object and its bookkeeping entries: the default marker (only
// if it points at us), the name -> id mapping (new format only), and
// finally the info object itself (keyed by name in the old format, by id
// otherwise). Returns the result of the final remove.
int RGWSystemMetaObj::delete_obj(const DoutPrefixProvider *dpp, optional_yield y, bool old_format)
{
  rgw_pool pool(get_pool(cct));

  /* check to see if obj is the default */
  RGWDefaultSystemMetaObjInfo default_info;
  int ret = read_default(dpp, default_info, get_default_oid(old_format), y);
  if (ret < 0 && ret != -ENOENT)
    return ret;
  if (default_info.default_id == id || (old_format && default_info.default_id == name)) {
    string oid = get_default_oid(old_format);
    rgw_raw_obj default_named_obj(pool, oid);
    auto sysobj = sysobj_svc->get_obj(default_named_obj);
    ret = sysobj.wop().remove(dpp, y);
    if (ret < 0) {
      ldpp_dout(dpp, 0) << "Error delete default obj name " << name << ": " << cpp_strerror(-ret) << dendl;
      return ret;
    }
  }
  if (!old_format) {
    // remove the name -> id mapping object
    string oid = get_names_oid_prefix() + name;
    rgw_raw_obj object_name(pool, oid);
    auto sysobj = sysobj_svc->get_obj(object_name);
    ret = sysobj.wop().remove(dpp, y);
    if (ret < 0) {
      ldpp_dout(dpp, 0) << "Error delete obj name " << name << ": " << cpp_strerror(-ret) << dendl;
      return ret;
    }
  }

  // old-format info objects are keyed by name, new-format ones by id
  string oid = get_info_oid_prefix(old_format);
  if (old_format) {
    oid += name;
  } else {
    oid += id;
  }

  rgw_raw_obj object_id(pool, oid);
  auto sysobj = sysobj_svc->get_obj(object_id);
  ret = sysobj.wop().remove(dpp, y);
  if (ret < 0) {
    ldpp_dout(dpp, 0) << "Error delete object id " << id << ": " << cpp_strerror(-ret) << dendl;
  }

  return ret;
}
+
// Serialize this zonegroup to JSON (mirrors decode_json()).
void RGWZoneGroup::dump(Formatter *f) const
{
  RGWSystemMetaObj::dump(f);
  encode_json("api_name", api_name, f);
  encode_json("is_master", is_master, f);
  encode_json("endpoints", endpoints, f);
  encode_json("hostnames", hostnames, f);
  encode_json("hostnames_s3website", hostnames_s3website, f);
  encode_json("master_zone", master_zone, f);
  encode_json_map("zones", zones, f); /* more friendly representation */
  encode_json_map("placement_targets", placement_targets, f); /* more friendly representation */
  encode_json("default_placement", default_placement, f);
  encode_json("realm_id", realm_id, f);
  encode_json("sync_policy", sync_policy, f);
  encode_json("enabled_features", enabled_features, f);
}
+
// Populate a placement target from JSON; guarantees at least the STANDARD
// storage class is present so lookups never hit an empty set.
void RGWZoneGroupPlacementTarget::decode_json(JSONObj *obj)
{
  JSONDecoder::decode_json("name", name, obj);
  JSONDecoder::decode_json("tags", tags, obj);
  JSONDecoder::decode_json("storage_classes", storage_classes, obj);
  if (storage_classes.empty()) {
    storage_classes.insert(RGW_STORAGE_CLASS_STANDARD);
  }
  JSONDecoder::decode_json("tier_targets", tier_targets, obj);
}
+
// Serialize placement info to JSON. index_type is emitted as its numeric
// value; legacy "compression"/"data_pool" fields are intentionally not
// re-emitted (they live inside storage_classes now).
void RGWZonePlacementInfo::dump(Formatter *f) const
{
  encode_json("index_pool", index_pool, f);
  encode_json("storage_classes", storage_classes, f);
  encode_json("data_extra_pool", data_extra_pool, f);
  encode_json("index_type", (uint32_t)index_type, f);
  encode_json("inline_data", inline_data, f);

  /* no real need for backward compatibility of compression_type and data_pool in here,
   * rather not clutter the output */
}
+
+void RGWZonePlacementInfo::decode_json(JSONObj *obj)
+{
+ JSONDecoder::decode_json("index_pool", index_pool, obj);
+ JSONDecoder::decode_json("storage_classes", storage_classes, obj);
+ JSONDecoder::decode_json("data_extra_pool", data_extra_pool, obj);
+ uint32_t it;
+ JSONDecoder::decode_json("index_type", it, obj);
+ JSONDecoder::decode_json("inline_data", inline_data, obj);
+ index_type = (rgw::BucketIndexType)it;
+
+ /* backward compatibility, these are now defined in storage_classes */
+ string standard_compression_type;
+ string *pcompression = nullptr;
+ if (JSONDecoder::decode_json("compression", standard_compression_type, obj)) {
+ pcompression = &standard_compression_type;
+ }
+ rgw_pool standard_data_pool;
+ rgw_pool *ppool = nullptr;
+ if (JSONDecoder::decode_json("data_pool", standard_data_pool, obj)) {
+ ppool = &standard_data_pool;
+ }
+ if (ppool || pcompression) {
+ storage_classes.set_storage_class(RGW_STORAGE_CLASS_STANDARD, ppool, pcompression);
+ }
+}
+
// Serialize the common id/name fields shared by all system meta objects.
void RGWSystemMetaObj::dump(Formatter *f) const
{
  encode_json("id", id , f);
  encode_json("name", name , f);
}
+
// Populate the common id/name fields shared by all system meta objects.
void RGWSystemMetaObj::decode_json(JSONObj *obj)
{
  JSONDecoder::decode_json("id", id, obj);
  JSONDecoder::decode_json("name", name, obj);
}
+
// Read and decode the default-object marker stored under `oid` in this
// object's root pool. Returns the read error as-is (-ENOENT when no
// default was ever set) or -EIO on decode failure.
int RGWSystemMetaObj::read_default(const DoutPrefixProvider *dpp,
                                   RGWDefaultSystemMetaObjInfo& default_info,
                                   const string& oid, optional_yield y)
{
  using ceph::decode;
  auto pool = get_pool(cct);
  bufferlist bl;

  auto sysobj = sysobj_svc->get_obj(rgw_raw_obj(pool, oid));
  int ret = sysobj.rop().read(dpp, &bl, y);
  if (ret < 0)
    return ret;

  try {
    auto iter = bl.cbegin();
    decode(default_info, iter);
  } catch (buffer::error& err) {
    ldpp_dout(dpp, 0) << "error decoding data from " << pool << ":" << oid << dendl;
    return -EIO;
  }

  return 0;
}
+
// Serialize a placement target to JSON; tier_targets is omitted when empty
// to keep the common case uncluttered.
void RGWZoneGroupPlacementTarget::dump(Formatter *f) const
{
  encode_json("name", name, f);
  encode_json("tags", tags, f);
  encode_json("storage_classes", storage_classes, f);
  if (!tier_targets.empty()) {
    encode_json("tier_targets", tier_targets, f);
  }
}
+
// Populate a placement tier from JSON; the tier-specific "s3" section is
// only parsed for the cloud-s3 tier type.
void RGWZoneGroupPlacementTier::decode_json(JSONObj *obj)
{
  JSONDecoder::decode_json("tier_type", tier_type, obj);
  JSONDecoder::decode_json("storage_class", storage_class, obj);
  JSONDecoder::decode_json("retain_head_object", retain_head_object, obj);

  if (tier_type == "cloud-s3") {
    JSONDecoder::decode_json("s3", t.s3, obj);
  }
}
+
// Serialize the storage-class map as one JSON key per class name.
void RGWZoneStorageClasses::dump(Formatter *f) const
{
  for (auto& i : m) {
    encode_json(i.first.c_str(), i.second, f);
  }
}
+
// Populate the storage-class map from JSON, where each object key is a
// class name. Afterwards re-point the cached STANDARD entry, which
// operator[] creates if the input did not contain it.
void RGWZoneStorageClasses::decode_json(JSONObj *obj)
{
  JSONFormattable f;
  decode_json_obj(f, obj);

  for (auto& field : f.object()) {
    JSONObj *field_obj = obj->find_obj(field.first);
    // field names were just enumerated from obj, so lookup must succeed
    assert(field_obj);

    decode_json_obj(m[field.first], field_obj);
  }
  standard_class = &m[RGW_STORAGE_CLASS_STANDARD];
}
+
// Serialize a placement tier to JSON; the "s3" section is only emitted for
// the cloud-s3 tier type (mirrors decode_json()).
void RGWZoneGroupPlacementTier::dump(Formatter *f) const
{
  encode_json("tier_type", tier_type, f);
  encode_json("storage_class", storage_class, f);
  encode_json("retain_head_object", retain_head_object, f);

  if (tier_type == "cloud-s3") {
    encode_json("s3", t.s3, f);
  }
}
+
// Populate cloud-s3 tier settings from JSON. host_style defaults to
// path-style addressing; only the exact token "virtual" selects
// virtual-hosted style (mirrors dump(), which emits "path"/"virtual").
void RGWZoneGroupPlacementTierS3::decode_json(JSONObj *obj)
{
  JSONDecoder::decode_json("endpoint", endpoint, obj);
  JSONDecoder::decode_json("access_key", key.id, obj);
  JSONDecoder::decode_json("secret", key.key, obj);
  JSONDecoder::decode_json("region", region, obj);
  string s;
  JSONDecoder::decode_json("host_style", s, obj);
  if (s != "virtual") {
    host_style = PathStyle;
  } else {
    host_style = VirtualStyle;
  }
  JSONDecoder::decode_json("target_storage_class", target_storage_class, obj);
  JSONDecoder::decode_json("target_path", target_path, obj);
  JSONDecoder::decode_json("acl_mappings", acl_mappings, obj);
  JSONDecoder::decode_json("multipart_sync_threshold", multipart_sync_threshold, obj);
  JSONDecoder::decode_json("multipart_min_part_size", multipart_min_part_size, obj);
}
+
// Serialize a storage class; optional fields are only emitted when set.
void RGWZoneStorageClass::dump(Formatter *f) const
{
  if (data_pool) {
    encode_json("data_pool", data_pool.get(), f);
  }
  if (compression_type) {
    encode_json("compression_type", compression_type.get(), f);
  }
}
+
// Populate a storage class from JSON; absent keys leave the optionals empty.
void RGWZoneStorageClass::decode_json(JSONObj *obj)
{
  JSONDecoder::decode_json("data_pool", data_pool, obj);
  JSONDecoder::decode_json("compression_type", compression_type, obj);
}
+
// Populate an ACL mapping from JSON. The "type" token maps "email" and
// "uri" to their grantee types; anything else (including the "id" token
// that dump() emits) falls back to canonical user.
void RGWTierACLMapping::decode_json(JSONObj *obj)
{
  string s;
  JSONDecoder::decode_json("type", s, obj);
  if (s == "email") {
    type = ACL_TYPE_EMAIL_USER;
  } else if (s == "uri") {
    type = ACL_TYPE_GROUP;
  } else {
    type = ACL_TYPE_CANON_USER;
  }

  JSONDecoder::decode_json("source_id", source_id, obj);
  JSONDecoder::decode_json("dest_id", dest_id, obj);
}
+
// Serialize cloud-s3 tier settings to JSON (mirrors decode_json());
// host_style is rendered as "path" or "virtual".
void RGWZoneGroupPlacementTierS3::dump(Formatter *f) const
{
  encode_json("endpoint", endpoint, f);
  encode_json("access_key", key.id, f);
  encode_json("secret", key.key, f);
  encode_json("region", region, f);
  string s = (host_style == PathStyle ? "path" : "virtual");
  encode_json("host_style", s, f);
  encode_json("target_storage_class", target_storage_class, f);
  encode_json("target_path", target_path, f);
  encode_json("acl_mappings", acl_mappings, f);
  encode_json("multipart_sync_threshold", multipart_sync_threshold, f);
  encode_json("multipart_min_part_size", multipart_min_part_size, f);
}
+
+void RGWTierACLMapping::dump(Formatter *f) const
+{
+ string s;
+ switch (type) {
+ case ACL_TYPE_EMAIL_USER:
+ s = "email";
+ break;
+ case ACL_TYPE_GROUP:
+ s = "uri";
+ break;
+ default:
+ s = "id";
+ break;
+ }
+ encode_json("type", s, f);
+ encode_json("source_id", source_id, f);
+ encode_json("dest_id", dest_id, f);
+}
+
// Serialize the period map: period id, zonegroups (keyed by id), and the
// short-id lookup table.
void RGWPeriodMap::dump(Formatter *f) const
{
  encode_json("id", id, f);
  encode_json_map("zonegroups", zonegroups, f);
  encode_json("short_zone_ids", short_zone_ids, f);
}
+
// JSONDecoder hook: decode one zonegroup entry and insert it into the map
// keyed by its id.
static void decode_zonegroups(map<string, RGWZoneGroup>& zonegroups, JSONObj *o)
{
  RGWZoneGroup zg;
  zg.decode_json(o);
  zonegroups[zg.get_id()] = zg;
}
+
// Populate the period map from JSON (mirrors dump()), accepting the
// pre-zonegroup "regions"/"master_region" keys for old dumps.
void RGWPeriodMap::decode_json(JSONObj *obj)
{
  JSONDecoder::decode_json("id", id, obj);
  JSONDecoder::decode_json("zonegroups", zonegroups, decode_zonegroups, obj);
  /* backward compatibility with region */
  if (zonegroups.empty()) {
    JSONDecoder::decode_json("regions", zonegroups, obj);
  }
  /* backward compatibility with region */
  if (master_zonegroup.empty()) {
    JSONDecoder::decode_json("master_region", master_zonegroup, obj);
  }
  JSONDecoder::decode_json("short_zone_ids", short_zone_ids, obj);
}
+
// Binary-decode the period map and rebuild the derived state that is not
// serialized: the by-API-name index and the master zonegroup id.
void RGWPeriodMap::decode(bufferlist::const_iterator& bl) {
  DECODE_START(2, bl);
  decode(id, bl);
  decode(zonegroups, bl);
  decode(master_zonegroup, bl);
  if (struct_v >= 2) {
    decode(short_zone_ids, bl);
  }
  DECODE_FINISH(bl);

  // rebuild indexes derived from the zonegroup map
  zonegroups_by_api.clear();
  for (map<string, RGWZoneGroup>::iterator iter = zonegroups.begin();
       iter != zonegroups.end(); ++iter) {
    RGWZoneGroup& zonegroup = iter->second;
    zonegroups_by_api[zonegroup.api_name] = zonegroup;
    if (zonegroup.is_master_zonegroup()) {
      master_zonegroup = zonegroup.get_id();
    }
  }
}
+
// Binary-encode the period map (version 2 adds short_zone_ids; the
// derived zonegroups_by_api index is intentionally not serialized).
void RGWPeriodMap::encode(bufferlist& bl) const
{
  ENCODE_START(2, 1, bl);
  encode(id, bl);
  encode(zonegroups, bl);
  encode(master_zonegroup, bl);
  encode(short_zone_ids, bl);
  ENCODE_FINISH(bl);
}
+
// Create this object in RADOS: verify the name is unused (when exclusive),
// generate a random uuid id if none was assigned, then store the info
// object followed by the name -> id mapping.
int RGWSystemMetaObj::create(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive)
{
  int ret;

  /* check to see the name is not used */
  ret = read_id(dpp, name, id, y);
  if (exclusive && ret == 0) {
    ldpp_dout(dpp, 10) << "ERROR: name " << name << " already in use for obj id " << id << dendl;
    return -EEXIST;
  } else if ( ret < 0 && ret != -ENOENT) {
    ldpp_dout(dpp, 0) << "failed reading obj id " << id << ": " << cpp_strerror(-ret) << dendl;
    return ret;
  }

  if (id.empty()) {
    /* create unique id */
    uuid_d new_uuid;
    char uuid_str[37];
    new_uuid.generate_random();
    new_uuid.print(uuid_str);
    id = uuid_str;
  }

  ret = store_info(dpp, exclusive, y);
  if (ret < 0) {
    ldpp_dout(dpp, 0) << "ERROR: storing info for " << id << ": " << cpp_strerror(-ret) << dendl;
    return ret;
  }

  return store_name(dpp, exclusive, y);
}
+
// Read the default-object marker and return the id it records.
// Propagates read errors (-ENOENT when no default exists).
int RGWSystemMetaObj::read_default_id(const DoutPrefixProvider *dpp, string& default_id, optional_yield y,
                                      bool old_format)
{
  RGWDefaultSystemMetaObjInfo default_info;

  int ret = read_default(dpp, default_info, get_default_oid(old_format), y);
  if (ret < 0) {
    return ret;
  }

  default_id = default_info.default_id;

  return 0;
}
+
// Record this object's id in the default-object marker. With exclusive
// set, the write fails with -EEXIST if a default already exists (used to
// avoid clobbering a concurrently-set default).
int RGWSystemMetaObj::set_as_default(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive)
{
  using ceph::encode;
  string oid = get_default_oid();

  rgw_pool pool(get_pool(cct));
  bufferlist bl;

  RGWDefaultSystemMetaObjInfo default_info;
  default_info.default_id = id;

  encode(default_info, bl);

  auto sysobj = sysobj_svc->get_obj(rgw_raw_obj(pool, oid));
  int ret = sysobj.wop()
                  .set_exclusive(exclusive)
                  .write(dpp, bl, y);
  if (ret < 0)
    return ret;

  return 0;
}
+
// Encode and write this object's body to <info_oid_prefix><id> in the
// root pool; exclusive makes the write fail if the object already exists.
int RGWSystemMetaObj::store_info(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y)
{
  rgw_pool pool(get_pool(cct));

  string oid = get_info_oid_prefix() + id;

  bufferlist bl;
  using ceph::encode;
  encode(*this, bl);
  auto sysobj = sysobj_svc->get_obj(rgw_raw_obj{pool, oid});
  return sysobj.wop()
               .set_exclusive(exclusive)
               .write(dpp, bl, y);
}
+
// Look up an object's id by its name via the name -> id mapping object.
// Returns the read error as-is (-ENOENT when no such name) or -EIO on
// decode failure.
int RGWSystemMetaObj::read_id(const DoutPrefixProvider *dpp, const string& obj_name, string& object_id,
                              optional_yield y)
{
  using ceph::decode;
  rgw_pool pool(get_pool(cct));
  bufferlist bl;

  string oid = get_names_oid_prefix() + obj_name;

  auto sysobj = sysobj_svc->get_obj(rgw_raw_obj(pool, oid));
  int ret = sysobj.rop().read(dpp, &bl, y);
  if (ret < 0) {
    return ret;
  }

  RGWNameToId nameToId;
  try {
    auto iter = bl.cbegin();
    decode(nameToId, iter);
  } catch (buffer::error& err) {
    ldpp_dout(dpp, 0) << "ERROR: failed to decode obj from " << pool << ":" << oid << dendl;
    return -EIO;
  }
  object_id = nameToId.obj_id;
  return 0;
}
+
// Write the name -> id mapping object for this instance; exclusive makes
// the write fail if a mapping for this name already exists.
int RGWSystemMetaObj::store_name(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y)
{
  rgw_pool pool(get_pool(cct));
  string oid = get_names_oid_prefix() + name;

  RGWNameToId nameToId;
  nameToId.obj_id = id;

  bufferlist bl;
  using ceph::encode;
  encode(nameToId, bl);
  auto sysobj = sysobj_svc->get_obj(rgw_raw_obj(pool, oid));
  return sysobj.wop()
               .set_exclusive(exclusive)
               .write(dpp, bl, y);
}
+
// Search every zonegroup for a zone with the given id. On a hit, copy the
// containing zonegroup and the zone into the out-parameters and return
// true; return false when no zonegroup contains the zone.
bool RGWPeriodMap::find_zone_by_id(const rgw_zone_id& zone_id,
                                   RGWZoneGroup *zonegroup,
                                   RGWZone *zone) const
{
  for (auto& iter : zonegroups) {
    auto& zg = iter.second;

    auto ziter = zg.zones.find(zone_id);
    if (ziter != zg.zones.end()) {
      *zonegroup = zg;
      *zone = ziter->second;
      return true;
    }
  }

  return false;
}
+
// Mark this zonegroup as the default of its realm. Like the zone variant,
// the default marker is realm-scoped: bind to the default realm when no
// realm id is set, failing with -EINVAL if that is impossible.
int RGWZoneGroup::set_as_default(const DoutPrefixProvider *dpp, optional_yield y, bool exclusive)
{
  if (realm_id.empty()) {
    /* try using default realm */
    RGWRealm realm;
    int ret = realm.init(dpp, cct, sysobj_svc, y);
    if (ret < 0) {
      ldpp_dout(dpp, 10) << "could not read realm id: " << cpp_strerror(-ret) << dendl;
      return -EINVAL;
    }
    realm_id = realm.get_id();
  }

  return RGWSystemMetaObj::set_as_default(dpp, y, exclusive);
}
+
// Persist both the info object and the name -> id mapping; fails fast on
// the first error.
int RGWSystemMetaObj::write(const DoutPrefixProvider *dpp, bool exclusive, optional_yield y)
{
  int ret = store_info(dpp, exclusive, y);
  if (ret < 0) {
    ldpp_dout(dpp, 20) << __func__ << "(): store_info() returned ret=" << ret << dendl;
    return ret;
  }
  ret = store_name(dpp, exclusive, y);
  if (ret < 0) {
    ldpp_dout(dpp, 20) << __func__ << "(): store_name() returned ret=" << ret << dendl;
    return ret;
  }
  return 0;
}
+
+namespace rgw {
+
// Default/de-duplicate every pool name in `info` against the pools already
// used by other zones (ConfigStore path; see RGWZoneParams::fix_pool_names
// for the legacy equivalent). Always returns 0.
int init_zone_pool_names(const DoutPrefixProvider *dpp, optional_yield y,
                         const std::set<rgw_pool>& pools, RGWZoneParams& info)
{
  info.domain_root = fix_zone_pool_dup(pools, info.name, ".rgw.meta:root", info.domain_root);
  info.control_pool = fix_zone_pool_dup(pools, info.name, ".rgw.control", info.control_pool);
  info.gc_pool = fix_zone_pool_dup(pools, info.name, ".rgw.log:gc", info.gc_pool);
  info.lc_pool = fix_zone_pool_dup(pools, info.name, ".rgw.log:lc", info.lc_pool);
  info.log_pool = fix_zone_pool_dup(pools, info.name, ".rgw.log", info.log_pool);
  info.intent_log_pool = fix_zone_pool_dup(pools, info.name, ".rgw.log:intent", info.intent_log_pool);
  info.usage_log_pool = fix_zone_pool_dup(pools, info.name, ".rgw.log:usage", info.usage_log_pool);
  info.user_keys_pool = fix_zone_pool_dup(pools, info.name, ".rgw.meta:users.keys", info.user_keys_pool);
  info.user_email_pool = fix_zone_pool_dup(pools, info.name, ".rgw.meta:users.email", info.user_email_pool);
  info.user_swift_pool = fix_zone_pool_dup(pools, info.name, ".rgw.meta:users.swift", info.user_swift_pool);
  info.user_uid_pool = fix_zone_pool_dup(pools, info.name, ".rgw.meta:users.uid", info.user_uid_pool);
  info.roles_pool = fix_zone_pool_dup(pools, info.name, ".rgw.meta:roles", info.roles_pool);
  info.reshard_pool = fix_zone_pool_dup(pools, info.name, ".rgw.log:reshard", info.reshard_pool);
  info.otp_pool = fix_zone_pool_dup(pools, info.name, ".rgw.otp", info.otp_pool);
  info.oidc_pool = fix_zone_pool_dup(pools, info.name, ".rgw.meta:oidc", info.oidc_pool);
  info.notif_pool = fix_zone_pool_dup(pools, info.name, ".rgw.log:notif", info.notif_pool);

  // per-placement-target pools: index, extra, and per-storage-class data
  for (auto& [pname, placement] : info.placement_pools) {
    placement.index_pool = fix_zone_pool_dup(pools, info.name, "." + default_bucket_index_pool_suffix, placement.index_pool);
    placement.data_extra_pool= fix_zone_pool_dup(pools, info.name, "." + default_storage_extra_pool_suffix, placement.data_extra_pool);
    for (auto& [sname, sc] : placement.storage_classes.get_all()) {
      if (sc.data_pool) {
        sc.data_pool = fix_zone_pool_dup(pools, info.name, "." + default_storage_pool_suffix, *sc.data_pool);
      }
    }
  }

  return 0;
}
+
// Add a zone to a zonegroup, or update its entry if already present.
// Handles: duplicate-name rejection on insert, master-zone promotion/
// demotion (the first zone of an empty group is auto-promoted), mirroring
// the zone's placement targets into the zonegroup, per-zone overrides
// (endpoints, read_only, tier type, sync flags, redirect, shard count),
// sync_from add/remove lists, and supported-feature enable/disable.
// Disabling a feature still enabled at the zonegroup level is rejected.
// Finally, log_data is recomputed for every zone: on only when the group
// has more than one zone.
int add_zone_to_group(const DoutPrefixProvider* dpp, RGWZoneGroup& zonegroup,
                      const RGWZoneParams& zone_params,
                      const bool *pis_master, const bool *pread_only,
                      const std::list<std::string>& endpoints,
                      const std::string *ptier_type,
                      const bool *psync_from_all,
                      const std::list<std::string>& sync_from,
                      const std::list<std::string>& sync_from_rm,
                      const std::string *predirect_zone,
                      std::optional<int> bucket_index_max_shards,
                      const rgw::zone_features::set& enable_features,
                      const rgw::zone_features::set& disable_features)
{
  const std::string& zone_id = zone_params.id;
  const std::string& zone_name = zone_params.name;

  if (zone_id.empty()) {
    ldpp_dout(dpp, -1) << __func__ << " requires a zone id" << dendl;
    return -EINVAL;
  }
  if (zone_name.empty()) {
    ldpp_dout(dpp, -1) << __func__ << " requires a zone name" << dendl;
    return -EINVAL;
  }

  // check for duplicate zone name on insert
  if (!zonegroup.zones.count(zone_id)) {
    for (const auto& [id, zone] : zonegroup.zones) {
      if (zone.name == zone_name) {
        ldpp_dout(dpp, 0) << "ERROR: found existing zone name " << zone_name
            << " (" << id << ") in zonegroup " << zonegroup.name << dendl;
        return -EEXIST;
      }
    }
  }

  rgw_zone_id& master_zone = zonegroup.master_zone;
  if (pis_master) {
    if (*pis_master) {
      if (!master_zone.empty() && master_zone != zone_id) {
        ldpp_dout(dpp, 0) << "NOTICE: overriding master zone: "
            << master_zone << dendl;
      }
      master_zone = zone_id;
    } else if (master_zone == zone_id) {
      // explicit demotion of the current master
      master_zone.clear();
    }
  } else if (master_zone.empty() && zonegroup.zones.empty()) {
    // first zone in an empty group becomes master implicitly
    ldpp_dout(dpp, 0) << "NOTICE: promoted " << zone_name
        << " as new master_zone of zonegroup " << zonegroup.name << dendl;
    master_zone = zone_id;
  }

  // make sure the zone's placement targets are named in the zonegroup
  for (const auto& [name, placement] : zone_params.placement_pools) {
    auto target = RGWZoneGroupPlacementTarget{.name = name};
    zonegroup.placement_targets.emplace(name, std::move(target));
  }

  // insert or update the zone entry; overrides are applied only when given
  RGWZone& zone = zonegroup.zones[zone_params.id];
  zone.id = zone_params.id;
  zone.name = zone_params.name;
  if (!endpoints.empty()) {
    zone.endpoints = endpoints;
  }
  if (pread_only) {
    zone.read_only = *pread_only;
  }
  if (ptier_type) {
    zone.tier_type = *ptier_type;
  }
  if (psync_from_all) {
    zone.sync_from_all = *psync_from_all;
  }
  if (predirect_zone) {
    zone.redirect_zone = *predirect_zone;
  }
  if (bucket_index_max_shards) {
    zone.bucket_index_max_shards = *bucket_index_max_shards;
  }

  // add/remove sync_from
  for (auto add : sync_from) {
    zone.sync_from.insert(add);
  }

  for (const auto& rm : sync_from_rm) {
    auto i = zone.sync_from.find(rm);
    if (i == zone.sync_from.end()) {
      ldpp_dout(dpp, 1) << "WARNING: zone \"" << rm
          << "\" was not in sync_from" << dendl;
      continue;
    }
    zone.sync_from.erase(i);
  }

  // add/remove supported features
  zone.supported_features.insert(enable_features.begin(),
                                 enable_features.end());

  for (const auto& feature : disable_features) {
    if (zonegroup.enabled_features.contains(feature)) {
      ldpp_dout(dpp, -1) << "ERROR: Cannot disable zone feature \"" << feature
          << "\" until it's been disabled in zonegroup " << zonegroup.name << dendl;
      return -EINVAL;
    }
    auto i = zone.supported_features.find(feature);
    if (i == zone.supported_features.end()) {
      ldpp_dout(dpp, 1) << "WARNING: zone feature \"" << feature
          << "\" was not enabled in zone " << zone.name << dendl;
      continue;
    }
    zone.supported_features.erase(i);
  }

  // data-change logging is only needed once the group is multi-zone
  const bool log_data = zonegroup.zones.size() > 1;
  for (auto& [id, zone] : zonegroup.zones) {
    zone.log_data = log_data;
  }

  return 0;
}
+
+} // namespace rgw
+
diff --git a/src/rgw/rgw_zone_features.h b/src/rgw/rgw_zone_features.h
new file mode 100644
index 000000000..5e1a435d4
--- /dev/null
+++ b/src/rgw/rgw_zone_features.h
@@ -0,0 +1,47 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/* N.B., this header defines fundamental serialized types. Do not
+ * include files which can only be compiled in radosgw or OSD
+ * contexts (e.g., rgw_sal.h, rgw_common.h) */
+
+#pragma once
+
+#include <string>
+#include <boost/container/flat_set.hpp>
+
+namespace rgw::zone_features {
+
+// zone feature names
+inline constexpr std::string_view resharding = "resharding";
+inline constexpr std::string_view compress_encrypted = "compress-encrypted";
+
+// static list of features supported by this release
+inline constexpr std::initializer_list<std::string_view> supported = {
+  resharding,
+  compress_encrypted,
+};
+
+// True if 'feature' is one of the features this release supports.
+// A constexpr linear scan: the list is tiny, and this allows the
+// check to be evaluated at compile time where possible.
+inline constexpr bool supports(std::string_view feature) {
+  for (auto i : supported) {
+    if (feature.compare(i) == 0) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// static list of features enabled by default on new zonegroups
+inline constexpr std::initializer_list<std::string_view> enabled = {
+  resharding,
+};
+
+
+// enable string_view overloads for find() contains() etc
+// (is_transparent makes the comparator "transparent", so lookups can
+// take a std::string_view without constructing a temporary std::string)
+struct feature_less : std::less<std::string_view> {
+  using is_transparent = std::true_type;
+};
+
+// sorted contiguous set of feature names, used for a zone's
+// supported/enabled feature collections
+using set = boost::container::flat_set<std::string, feature_less>;
+
+} // namespace rgw::zone_features
diff --git a/src/rgw/rgw_zone_types.h b/src/rgw/rgw_zone_types.h
new file mode 100644
index 000000000..f2881dfef
--- /dev/null
+++ b/src/rgw/rgw_zone_types.h
@@ -0,0 +1,625 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+/* N.B., this header defines fundamental serialized types. Do not
+ * introduce changes or include files which can only be compiled in
+ * radosgw or OSD contexts (e.g., rgw_sal.h, rgw_common.h)
+ */
+
+#pragma once
+
+#include <string>
+#include <set>
+#include <map>
+#include <list>
+#include <boost/optional.hpp>
+
+#include <fmt/format.h>
+
+#include "include/types.h"
+#include "rgw_bucket_layout.h"
+#include "rgw_zone_features.h"
+#include "rgw_pool_types.h"
+#include "rgw_acl_types.h"
+#include "rgw_placement_types.h"
+
+#include "common/Formatter.h"
+
+class JSONObj;
+
+// Well-known object names, oid prefixes, and default pool names used by
+// the zone/zonegroup/realm/period metadata subsystem.  Definitions live
+// in the corresponding .cc file; declared extern here so multiple TUs
+// share one instance.
+namespace rgw_zone_defaults {
+
+extern std::string zone_names_oid_prefix;
+extern std::string region_info_oid_prefix;
+extern std::string realm_names_oid_prefix;
+extern std::string zone_group_info_oid_prefix;
+extern std::string realm_info_oid_prefix;
+extern std::string default_region_info_oid;
+extern std::string default_zone_group_info_oid;
+extern std::string region_map_oid;
+extern std::string default_realm_info_oid;
+extern std::string default_zonegroup_name;
+extern std::string default_zone_name;
+extern std::string zonegroup_names_oid_prefix;
+extern std::string RGW_DEFAULT_ZONE_ROOT_POOL;
+extern std::string RGW_DEFAULT_ZONEGROUP_ROOT_POOL;
+extern std::string RGW_DEFAULT_REALM_ROOT_POOL;
+extern std::string RGW_DEFAULT_PERIOD_ROOT_POOL;
+extern std::string avail_pools;
+extern std::string default_storage_pool_suffix;
+
+} /* namespace rgw_zone_defaults */
+
+// Serialized mapping from a metadata object's name to its object id.
+struct RGWNameToId {
+  std::string obj_id;
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(obj_id, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(obj_id, bl);
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(RGWNameToId)
+
+// Serialized record holding the id of a default system metadata object
+// (e.g. the default realm/zonegroup/zone).
+struct RGWDefaultSystemMetaObjInfo {
+  std::string default_id;
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(default_id, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(default_id, bl);
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(RGWDefaultSystemMetaObjInfo)
+
+// Per-storage-class settings.  Fields are boost::optional so "unset"
+// is distinguishable from an explicit value; lookups elsewhere fall
+// back to a default when a field is unset.
+struct RGWZoneStorageClass {
+  boost::optional<rgw_pool> data_pool;
+  boost::optional<std::string> compression_type;
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(data_pool, bl);
+    encode(compression_type, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(data_pool, bl);
+    decode(compression_type, bl);
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(RGWZoneStorageClass)
+
+// Map of storage-class name -> RGWZoneStorageClass for a placement.
+// The STANDARD class is guaranteed to exist: the default constructor
+// creates it, and a cached pointer into the map is kept for fast
+// access.  That pointer must be re-established whenever the map is
+// replaced (copy, assignment, decode) since it points into 'm'.
+class RGWZoneStorageClasses {
+  std::map<std::string, RGWZoneStorageClass> m;
+
+  /* in memory only */
+  RGWZoneStorageClass *standard_class;
+
+public:
+  RGWZoneStorageClasses() {
+    standard_class = &m[RGW_STORAGE_CLASS_STANDARD];
+  }
+  // Copy operations rebuild standard_class so it points into this
+  // object's own map rather than the source's.  NOTE(review): the
+  // user-declared copy ctor also suppresses implicit moves, so moves
+  // fall back to these copies — pointer stays valid either way.
+  RGWZoneStorageClasses(const RGWZoneStorageClasses& rhs) {
+    m = rhs.m;
+    standard_class = &m[RGW_STORAGE_CLASS_STANDARD];
+  }
+  RGWZoneStorageClasses& operator=(const RGWZoneStorageClasses& rhs) {
+    m = rhs.m;
+    standard_class = &m[RGW_STORAGE_CLASS_STANDARD];
+    return *this;
+  }
+
+  const RGWZoneStorageClass& get_standard() const {
+    return *standard_class;
+  }
+
+  // On success stores a pointer to the entry in *pstorage_class and
+  // returns true; returns false if 'sc' is not present.
+  bool find(const std::string& sc, const RGWZoneStorageClass** pstorage_class) const {
+    auto iter = m.find(sc);
+    if (iter == m.end()) {
+      return false;
+    }
+    *pstorage_class = &iter->second;
+    return true;
+  }
+
+  // An empty name means STANDARD, which always exists.
+  bool exists(const std::string& sc) const {
+    if (sc.empty()) {
+      return true;
+    }
+    auto iter = m.find(sc);
+    return (iter != m.end());
+  }
+
+  const std::map<std::string, RGWZoneStorageClass>& get_all() const {
+    return m;
+  }
+
+  std::map<std::string, RGWZoneStorageClass>& get_all() {
+    return m;
+  }
+
+  // Create or update a storage class; an empty name maps to STANDARD.
+  // Only the fields whose pointers are non-null are updated.
+  void set_storage_class(const std::string& sc, const rgw_pool* data_pool, const std::string* compression_type) {
+    const std::string *psc = &sc;
+    if (sc.empty()) {
+      psc = &RGW_STORAGE_CLASS_STANDARD;
+    }
+    RGWZoneStorageClass& storage_class = m[*psc];
+    if (data_pool) {
+      storage_class.data_pool = *data_pool;
+    }
+    if (compression_type) {
+      storage_class.compression_type = *compression_type;
+    }
+  }
+
+  // Removing with an empty name is a no-op (STANDARD is never erased
+  // through this path).
+  void remove_storage_class(const std::string& sc) {
+    if (!sc.empty()) {
+      m.erase(sc);
+    }
+  }
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(m, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(m, bl);
+    // re-establish the in-memory pointer after the map was replaced
+    standard_class = &m[RGW_STORAGE_CLASS_STANDARD];
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(RGWZoneStorageClasses)
+
+// A zone's pool layout for one placement rule: where bucket indexes,
+// object data (per storage class), and multipart "extra" data live.
+struct RGWZonePlacementInfo {
+  rgw_pool index_pool;
+  rgw_pool data_extra_pool; /* if not set we should use data_pool */
+  RGWZoneStorageClasses storage_classes;
+  rgw::BucketIndexType index_type;
+  bool inline_data;
+
+  RGWZonePlacementInfo() : index_type(rgw::BucketIndexType::Normal), inline_data(true) {}
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(8, 1, bl);
+    encode(index_pool.to_str(), bl);
+    // the STANDARD data pool and compression type are encoded
+    // separately (in addition to storage_classes) so that pre-v7
+    // decoders can still read them
+    rgw_pool standard_data_pool = get_data_pool(RGW_STORAGE_CLASS_STANDARD);
+    encode(standard_data_pool.to_str(), bl);
+    encode(data_extra_pool.to_str(), bl);
+    encode((uint32_t)index_type, bl);
+    std::string standard_compression_type = get_compression_type(RGW_STORAGE_CLASS_STANDARD);
+    encode(standard_compression_type, bl);
+    encode(storage_classes, bl);
+    encode(inline_data, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(8, bl);
+    std::string index_pool_str;
+    std::string data_pool_str;
+    decode(index_pool_str, bl);
+    index_pool = rgw_pool(index_pool_str);
+    decode(data_pool_str, bl);
+    rgw_pool standard_data_pool(data_pool_str);
+    if (struct_v >= 4) {
+      std::string data_extra_pool_str;
+      decode(data_extra_pool_str, bl);
+      data_extra_pool = rgw_pool(data_extra_pool_str);
+    }
+    if (struct_v >= 5) {
+      uint32_t it;
+      decode(it, bl);
+      index_type = (rgw::BucketIndexType)it;
+    }
+    std::string standard_compression_type;
+    if (struct_v >= 6) {
+      decode(standard_compression_type, bl);
+    }
+    if (struct_v >= 7) {
+      decode(storage_classes, bl);
+    } else {
+      // pre-v7 encodings only carried the STANDARD class; synthesize
+      // the storage_classes map from the legacy fields
+      storage_classes.set_storage_class(RGW_STORAGE_CLASS_STANDARD, &standard_data_pool,
+                                        (!standard_compression_type.empty() ? &standard_compression_type : nullptr));
+    }
+    if (struct_v >= 8) {
+      decode(inline_data, bl);
+    }
+    DECODE_FINISH(bl);
+  }
+  // Falls back to the STANDARD data pool when no extra pool is set.
+  const rgw_pool& get_data_extra_pool() const {
+    static rgw_pool no_pool;
+    if (data_extra_pool.empty()) {
+      return storage_classes.get_standard().data_pool.get_value_or(no_pool);
+    }
+    return data_extra_pool;
+  }
+  // Data pool for storage class 'sc', defaulting to STANDARD's pool
+  // when 'sc' is unknown or its pool is unset.
+  const rgw_pool& get_data_pool(const std::string& sc) const {
+    const RGWZoneStorageClass *storage_class;
+    static rgw_pool no_pool;
+
+    if (!storage_classes.find(sc, &storage_class)) {
+      return storage_classes.get_standard().data_pool.get_value_or(no_pool);
+    }
+
+    return storage_class->data_pool.get_value_or(no_pool);
+  }
+  const rgw_pool& get_standard_data_pool() const {
+    return get_data_pool(RGW_STORAGE_CLASS_STANDARD);
+  }
+
+  // Compression type for 'sc'; empty string means no compression.
+  const std::string& get_compression_type(const std::string& sc) const {
+    const RGWZoneStorageClass *storage_class;
+    static std::string no_compression;
+
+    if (!storage_classes.find(sc, &storage_class)) {
+      return no_compression;
+    }
+    return storage_class->compression_type.get_value_or(no_compression);
+  }
+
+  bool storage_class_exists(const std::string& sc) const {
+    return storage_classes.exists(sc);
+  }
+
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+
+};
+WRITE_CLASS_ENCODER(RGWZonePlacementInfo)
+
+// A zonegroup's view of one zone: identity, endpoints, replication
+// logging flags, and sync configuration.
+struct RGWZone {
+  std::string id;
+  std::string name;
+  std::list<std::string> endpoints; // std::vector?
+  bool log_meta;
+  bool log_data;
+  bool read_only;
+  std::string tier_type;
+  std::string redirect_zone;
+
+  /**
+   * Represents the number of shards for the bucket index object, a value of zero
+   * indicates there is no sharding. By default (no sharding), the name of the object
+   * is '.dir.{marker}'; with sharding, the name is '.dir.{marker}.{sharding_id}',
+   * where sharding_id is a zero-based value. It is not recommended to set too large
+   * a value (e.g. thousands) as it increases the cost of bucket listing.
+   */
+  uint32_t bucket_index_max_shards;
+
+  // pre-shard buckets on creation to enable some write-parallelism by default,
+  // delay the need to reshard as the bucket grows, and (in multisite) get some
+  // bucket index sharding where dynamic resharding is not supported
+  static constexpr uint32_t default_bucket_index_max_shards = 11;
+
+  bool sync_from_all;
+  std::set<std::string> sync_from; /* list of zones to sync from */
+
+  rgw::zone_features::set supported_features;
+
+  RGWZone()
+    : log_meta(false), log_data(false), read_only(false),
+      bucket_index_max_shards(default_bucket_index_max_shards),
+      sync_from_all(true) {}
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(8, 1, bl);
+    encode(name, bl);
+    encode(endpoints, bl);
+    encode(log_meta, bl);
+    encode(log_data, bl);
+    encode(bucket_index_max_shards, bl);
+    encode(id, bl);
+    encode(read_only, bl);
+    encode(tier_type, bl);
+    encode(sync_from_all, bl);
+    encode(sync_from, bl);
+    encode(redirect_zone, bl);
+    encode(supported_features, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(8, bl);
+    decode(name, bl);
+    if (struct_v < 4) {
+      // 'id' was introduced in v4; older zones were identified by name
+      id = name;
+    }
+    decode(endpoints, bl);
+    if (struct_v >= 2) {
+      decode(log_meta, bl);
+      decode(log_data, bl);
+    }
+    if (struct_v >= 3) {
+      decode(bucket_index_max_shards, bl);
+    }
+    if (struct_v >= 4) {
+      decode(id, bl);
+      decode(read_only, bl);
+    }
+    if (struct_v >= 5) {
+      decode(tier_type, bl);
+    }
+    if (struct_v >= 6) {
+      decode(sync_from_all, bl);
+      decode(sync_from, bl);
+    }
+    if (struct_v >= 7) {
+      decode(redirect_zone, bl);
+    }
+    if (struct_v >= 8) {
+      decode(supported_features, bl);
+    }
+    DECODE_FINISH(bl);
+  }
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+  static void generate_test_instances(std::list<RGWZone*>& o);
+
+  bool is_read_only() const { return read_only; }
+
+  // true if this zone pulls from 'zone_name': either it syncs from
+  // all zones, or the zone is in the explicit sync_from set
+  bool syncs_from(const std::string& zone_name) const {
+    return (sync_from_all || sync_from.find(zone_name) != sync_from.end());
+  }
+
+  bool supports(std::string_view feature) const {
+    return supported_features.contains(feature);
+  }
+};
+WRITE_CLASS_ENCODER(RGWZone)
+
+// Serialized record naming the default zonegroup.
+struct RGWDefaultZoneGroupInfo {
+  std::string default_zonegroup;
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(default_zonegroup, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(default_zonegroup, bl);
+    DECODE_FINISH(bl);
+  }
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+  //todo: implement ceph-dencoder
+};
+WRITE_CLASS_ENCODER(RGWDefaultZoneGroupInfo)
+
+// Maps an ACL grantee id in the source zone to a grantee id on the
+// cloud tier target, for a given grantee type.
+struct RGWTierACLMapping {
+  ACLGranteeTypeEnum type{ACL_TYPE_CANON_USER};
+  std::string source_id;
+  std::string dest_id;
+
+  RGWTierACLMapping() = default;
+
+  RGWTierACLMapping(ACLGranteeTypeEnum t,
+                    const std::string& s,
+                    const std::string& d) : type(t),
+                                            source_id(s),
+                                            dest_id(d) {}
+
+  // Populate from JSON config: "type" is one of "email"/"uri"/other
+  // (anything else maps to a canonical user grantee).
+  void init(const JSONFormattable& config) {
+    const std::string& t = config["type"];
+
+    if (t == "email") {
+      type = ACL_TYPE_EMAIL_USER;
+    } else if (t == "uri") {
+      type = ACL_TYPE_GROUP;
+    } else {
+      type = ACL_TYPE_CANON_USER;
+    }
+
+    source_id = config["source_id"];
+    dest_id = config["dest_id"];
+  }
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode((uint32_t)type, bl);
+    encode(source_id, bl);
+    encode(dest_id, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    uint32_t it;
+    decode(it, bl);
+    type = (ACLGranteeTypeEnum)it;
+    decode(source_id, bl);
+    decode(dest_id, bl);
+    DECODE_FINISH(bl);
+  }
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(RGWTierACLMapping)
+
+// Addressing style for S3 endpoints: path-style
+// (host/bucket/key) vs virtual-hosted style (bucket.host/key).
+// Values are serialized as uint32_t; do not renumber.
+enum HostStyle {
+  PathStyle = 0,
+  VirtualStyle = 1,
+};
+
+// Configuration for an S3-compatible cloud tier target: endpoint,
+// credentials, ACL mappings, and multipart sync thresholds.
+struct RGWZoneGroupPlacementTierS3 {
+#define DEFAULT_MULTIPART_SYNC_PART_SIZE (32 * 1024 * 1024)
+  std::string endpoint;
+  RGWAccessKey key;
+  std::string region;
+  HostStyle host_style{PathStyle};
+  std::string target_storage_class;
+
+  /* Should below be bucket/zone specific?? */
+  std::string target_path;
+  std::map<std::string, RGWTierACLMapping> acl_mappings;
+
+  uint64_t multipart_sync_threshold{DEFAULT_MULTIPART_SYNC_PART_SIZE};
+  uint64_t multipart_min_part_size{DEFAULT_MULTIPART_SYNC_PART_SIZE};
+
+  int update_params(const JSONFormattable& config);
+  int clear_params(const JSONFormattable& config);
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(endpoint, bl);
+    encode(key, bl);
+    encode(region, bl);
+    encode((uint32_t)host_style, bl); // XXX kill C-style casts
+    encode(target_storage_class, bl);
+    encode(target_path, bl);
+    encode(acl_mappings, bl);
+    encode(multipart_sync_threshold, bl);
+    encode(multipart_min_part_size, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(endpoint, bl);
+    decode(key, bl);
+    decode(region, bl);
+
+    uint32_t it;
+    decode(it, bl);
+    host_style = (HostStyle)it; // XXX can't this be HostStyle(it)?
+
+    decode(target_storage_class, bl);
+    decode(target_path, bl);
+    decode(acl_mappings, bl);
+    decode(multipart_sync_threshold, bl);
+    decode(multipart_min_part_size, bl);
+    DECODE_FINISH(bl);
+  }
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(RGWZoneGroupPlacementTierS3)
+
+// Cloud-tier settings for a placement target.  Tier-type-specific
+// payloads live in 't' and are only (de)serialized for the matching
+// tier_type ("cloud-s3" is the only type handled here).
+struct RGWZoneGroupPlacementTier {
+  std::string tier_type;
+  std::string storage_class;
+  bool retain_head_object = false;
+
+  struct _tier {
+    RGWZoneGroupPlacementTierS3 s3;
+  } t;
+
+  int update_params(const JSONFormattable& config);
+  int clear_params(const JSONFormattable& config);
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(1, 1, bl);
+    encode(tier_type, bl);
+    encode(storage_class, bl);
+    encode(retain_head_object, bl);
+    // payload is conditional on tier_type; decode mirrors this check
+    if (tier_type == "cloud-s3") {
+      encode(t.s3, bl);
+    }
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(1, bl);
+    decode(tier_type, bl);
+    decode(storage_class, bl);
+    decode(retain_head_object, bl);
+    if (tier_type == "cloud-s3") {
+      decode(t.s3, bl);
+    }
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(RGWZoneGroupPlacementTier)
+
+// A zonegroup-level placement target: its name, the user tags allowed
+// to use it, its storage classes, and any cloud-tier targets.
+struct RGWZoneGroupPlacementTarget {
+  std::string name;
+  std::set<std::string> tags;
+  std::set<std::string> storage_classes;
+  std::map<std::string, RGWZoneGroupPlacementTier> tier_targets;
+
+  // No tags means the target is open to everyone; otherwise the user
+  // must carry at least one matching placement tag.
+  bool user_permitted(const std::list<std::string>& user_tags) const {
+    if (tags.empty()) {
+      return true;
+    }
+    for (auto& rule : user_tags) {
+      if (tags.find(rule) != tags.end()) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  void encode(bufferlist& bl) const {
+    ENCODE_START(3, 1, bl);
+    encode(name, bl);
+    encode(tags, bl);
+    encode(storage_classes, bl);
+    encode(tier_targets, bl);
+    ENCODE_FINISH(bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    DECODE_START(3, bl);
+    decode(name, bl);
+    decode(tags, bl);
+    if (struct_v >= 2) {
+      decode(storage_classes, bl);
+    }
+    // pre-v2 encodings had no storage classes; guarantee STANDARD
+    if (storage_classes.empty()) {
+      storage_classes.insert(RGW_STORAGE_CLASS_STANDARD);
+    }
+    if (struct_v >= 3) {
+      decode(tier_targets, bl);
+    }
+    DECODE_FINISH(bl);
+  }
+  void dump(Formatter *f) const;
+  void decode_json(JSONObj *obj);
+};
+WRITE_CLASS_ENCODER(RGWZoneGroupPlacementTarget)
diff --git a/src/rgw/rgwam.py b/src/rgw/rgwam.py
new file mode 100755
index 000000000..f07d2b423
--- /dev/null
+++ b/src/rgw/rgwam.py
@@ -0,0 +1,240 @@
+#!@Python3_EXECUTABLE@
+# -*- mode:python -*-
+# vim: ts=4 sw=4 smarttab expandtab
+#
+# Processed in Makefile to add python #! line and version variable
+#
+#
+
+import subprocess
+import random
+import string
+import json
+import argparse
+import sys
+import socket
+import base64
+import logging
+
+from urllib.parse import urlparse
+
+from ceph.rgw.rgwam_core import RGWAM, EnvArgs
+from ceph.rgw.types import RGWAMEnvMgr, RGWAMException
+
+class RGWAMCLIMgr(RGWAMEnvMgr):
+    """Environment manager for standalone CLI use: runs tools as local
+    subprocesses, forwarding the -c/-n/-k ceph connection arguments."""
+    def __init__(self, common_args):
+        # Build the argument prefix prepended to every tool invocation.
+        args = []
+
+        if common_args.conf_path:
+            args += [ '-c', common_args.conf_path ]
+
+        if common_args.ceph_name:
+            args += [ '-n', common_args.ceph_name ]
+
+        if common_args.ceph_keyring:
+            args += [ '-k', common_args.ceph_keyring ]
+
+        self.args_prefix = args
+
+    def tool_exec(self, prog, args):
+        """Run 'prog' with the common prefix plus 'args'; return
+        (full command, returncode, stdout, stderr)."""
+        run_cmd = [ prog ] + self.args_prefix + args
+
+        result = subprocess.run(run_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+        stdout = result.stdout.decode('utf-8')
+        stderr = result.stderr.decode('utf-8')
+
+        return run_cmd, result.returncode, stdout, stderr
+
+    def apply_rgw(self, svc_id, realm_name, zone_name, port = None):
+        # no orchestrator in CLI mode: deploying rgw daemons is a no-op
+        return None
+
+    def list_daemons(self, service_name, daemon_type = None, daemon_id = None, hostname = None, refresh = True):
+        # no orchestrator in CLI mode: nothing to list
+        return []
+
+class RealmCommand:
+    """Dispatcher for 'rgwam realm <subcommand>' (bootstrap,
+    new-zone-creds)."""
+    def __init__(self, env, args):
+        self.env = env
+        self.args = args
+
+    def parse(self):
+        """Parse only the subcommand word and return the bound method
+        implementing it (dashes map to underscores)."""
+        parser = argparse.ArgumentParser(
+            usage='''rgwam realm <subcommand>
+
+The subcommands are:
+  bootstrap            Bootstrap new realm
+  new-zone-creds       Create credentials for connecting new zone
+''')
+        parser.add_argument('subcommand', help='Subcommand to run')
+        # parse_args defaults to [1:] for args, but you need to
+        # exclude the rest of the args too, or validation will fail
+        args = parser.parse_args(self.args[0:1])
+
+        sub = args.subcommand.replace('-', '_')
+
+        if not hasattr(self, sub):
+            print('Unrecognized subcommand:', args.subcommand)
+            parser.print_help()
+            exit(1)
+        # use dispatch pattern to invoke method with same name
+
+        return getattr(self, sub)
+
+    def bootstrap(self):
+        """Bootstrap a new realm/zonegroup/zone from CLI flags."""
+        parser = argparse.ArgumentParser(
+            description='Bootstrap new realm',
+            usage='rgwam realm bootstrap [<args>]')
+        parser.add_argument('--realm')
+        parser.add_argument('--zonegroup')
+        parser.add_argument('--zone')
+        parser.add_argument('--endpoints')
+        parser.add_argument('--sys-uid')
+        parser.add_argument('--uid')
+        parser.add_argument('--start-radosgw', action='store_true', dest='start_radosgw', default=True)
+        parser.add_argument('--no-start-radosgw', action='store_false', dest='start_radosgw')
+
+        args = parser.parse_args(self.args[1:])
+
+        return RGWAM(self.env).realm_bootstrap(args.realm, args.zonegroup, args.zone, args.endpoints,
+                                               args.sys_uid, args.uid, args.start_radosgw)
+
+    def new_zone_creds(self):
+        """Create credentials for connecting a new zone to the realm."""
+        parser = argparse.ArgumentParser(
+            description='Bootstrap new realm',
+            usage='rgwam realm new-zone-creds [<args>]')
+        parser.add_argument('--endpoints')
+        parser.add_argument('--sys-uid')
+
+        args = parser.parse_args(self.args[1:])
+
+        return RGWAM(self.env).realm_new_zone_creds(args.endpoints, args.sys_uid)
+
+
+class ZoneCommand:
+    """Dispatcher for 'rgwam zone <subcommand>' (run, create)."""
+    def __init__(self, env, args):
+        self.env = env
+        self.args = args
+
+    def parse(self):
+        """Parse only the subcommand word and return the bound method
+        implementing it."""
+        parser = argparse.ArgumentParser(
+            usage='''rgwam zone <subcommand>
+
+The subcommands are:
+   run                run radosgw daemon in current zone
+''')
+        parser.add_argument('subcommand', help='Subcommand to run')
+        # parse_args defaults to [1:] for args, but you need to
+        # exclude the rest of the args too, or validation will fail
+        args = parser.parse_args(self.args[0:1])
+        if not hasattr(self, args.subcommand):
+            print('Unrecognized subcommand:', args.subcommand)
+            parser.print_help()
+            exit(1)
+        # use dispatch pattern to invoke method with same name
+        return getattr(self, args.subcommand)
+
+    def run(self):
+        """Run a radosgw daemon in the current zone."""
+        parser = argparse.ArgumentParser(
+            description='Run radosgw daemon',
+            usage='rgwam zone run [<args>]')
+        parser.add_argument('--port')
+        # NOTE(review): --log-file/--debug-ms/--debug-rgw are accepted
+        # but not forwarded to run_radosgw below
+        parser.add_argument('--log-file')
+        parser.add_argument('--debug-ms')
+        parser.add_argument('--debug-rgw')
+
+        args = parser.parse_args(self.args[1:])
+
+        return RGWAM(self.env).run_radosgw(port = args.port)
+
+    def create(self):
+        """Create a new zone and join it to an existing realm using a
+        realm token."""
+        parser = argparse.ArgumentParser(
+            description='Create new zone to join existing realm',
+            usage='rgwam zone create [<args>]')
+        parser.add_argument('--realm-token')
+        parser.add_argument('--zone')
+        parser.add_argument('--zonegroup')
+        parser.add_argument('--endpoints')
+        parser.add_argument('--start-radosgw', action='store_true', dest='start_radosgw', default=True)
+        parser.add_argument('--no-start-radosgw', action='store_false', dest='start_radosgw')
+
+        args = parser.parse_args(self.args[1:])
+
+        return RGWAM(self.env).zone_create(args.realm_token, args.zonegroup, args.zone, args.endpoints, args.start_radosgw)
+
+class CommonArgs:
+    """Plain holder for the ceph connection options (-c/-n/-k) pulled
+    from the parsed top-level namespace."""
+    def __init__(self, ns):
+        self.conf_path = ns.conf_path
+        self.ceph_name = ns.ceph_name
+        self.ceph_keyring = ns.ceph_keyring
+
+class TopLevelCommand:
+    """Top-level dispatcher: parses sys.argv for the command word
+    ('realm'/'zone') plus common ceph options, then hands the rest of
+    the arguments to the matching sub-dispatcher."""
+
+    def _parse(self):
+        """Return (handler method, CommonArgs, remaining args).
+        Reads sys.argv directly; exits on an unrecognized command."""
+        parser = argparse.ArgumentParser(
+            description='RGW assist for multisite tool',
+            formatter_class=argparse.RawDescriptionHelpFormatter,
+            epilog='''
+The commands are:
+   realm bootstrap      Bootstrap new realm
+   realm new-zone-creds Create credentials to connect new zone to realm
+   zone create          Create new zone and connect it to existing realm
+   zone run             Run radosgw in current zone
+''')
+
+        parser.add_argument('command', help='command to run', default=None)
+        parser.add_argument('-c', help='ceph conf path', dest='conf_path')
+        parser.add_argument('-n', help='ceph user name', dest='ceph_name')
+        parser.add_argument('-k', help='ceph keyring', dest='ceph_keyring')
+
+        removed_args = []
+
+        args = sys.argv[1:]
+        if len(args) > 0:
+            if hasattr(self, args[0]):
+                # remove -h/--help if top command is not empty so that top level help
+                # doesn't override subcommand, we'll add it later
+                help_args = [ '-h', '--help' ]
+                removed_args = [arg for arg in args if arg in help_args]
+                args = [arg for arg in args if arg not in help_args]
+
+        (ns, args) = parser.parse_known_args(args)
+        if not hasattr(self, ns.command) or ns.command[0] == '_':
+            print('Unrecognized command:', ns.command)
+            parser.print_help()
+            exit(1)
+        # use dispatch pattern to invoke method with same name
+        # (re-append any stripped -h/--help so the subcommand sees it)
+        args += removed_args
+        return (getattr(self, ns.command), CommonArgs(ns), args)
+
+    def realm(self, env, args):
+        cmd = RealmCommand(env, args).parse()
+        return cmd()
+
+    def zone(self, env, args):
+        cmd = ZoneCommand(env, args).parse()
+        return cmd()
+
+
+def main():
+ logging.basicConfig(level=logging.INFO)
+
+ log = logging.getLogger(__name__)
+
+ (cmd, common_args, args)= TopLevelCommand()._parse()
+
+ env = EnvArgs(RGWAMCLIMgr(common_args))
+
+ try:
+ retval, out, err = cmd(env, args)
+ if retval != 0:
+ log.error('stdout: '+ out + '\nstderr: ' + err)
+ sys.exit(retval)
+ except RGWAMException as e:
+ print('ERROR: ' + e.message)
+
+ sys.exit(0)
+
+
+if __name__ == '__main__':
+ main()
+
diff --git a/src/rgw/services/svc_bi.h b/src/rgw/services/svc_bi.h
new file mode 100644
index 000000000..bd811e162
--- /dev/null
+++ b/src/rgw/services/svc_bi.h
@@ -0,0 +1,44 @@
+
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#pragma once
+
+#include "rgw_service.h"
+
+class RGWBucketInfo;
+struct RGWBucketEnt;
+
+
+// Abstract service interface for bucket index operations; concrete
+// backends (e.g. the RADOS implementation) provide the storage logic.
+class RGWSI_BucketIndex : public RGWServiceInstance
+{
+public:
+  RGWSI_BucketIndex(CephContext *cct) : RGWServiceInstance(cct) {}
+  virtual ~RGWSI_BucketIndex() {}
+
+  // create / tear down the index objects for a bucket's index layout
+  virtual int init_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout) = 0;
+  virtual int clean_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout) = 0;
+
+  // aggregate usage stats for the bucket into *stats
+  virtual int read_stats(const DoutPrefixProvider *dpp,
+                         const RGWBucketInfo& bucket_info,
+                         RGWBucketEnt *stats,
+                         optional_yield y) = 0;
+
+  // react to a bucket-info overwrite ('info' replacing 'orig_info')
+  virtual int handle_overwrite(const DoutPrefixProvider *dpp,
+                               const RGWBucketInfo& info,
+                               const RGWBucketInfo& orig_info,
+                               optional_yield y) = 0;
+};
diff --git a/src/rgw/services/svc_bi_rados.cc b/src/rgw/services/svc_bi_rados.cc
new file mode 100644
index 000000000..6002b986f
--- /dev/null
+++ b/src/rgw/services/svc_bi_rados.cc
@@ -0,0 +1,509 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "svc_bi_rados.h"
+#include "svc_bilog_rados.h"
+#include "svc_zone.h"
+
+#include "rgw_bucket.h"
+#include "rgw_zone.h"
+#include "rgw_datalog.h"
+
+#include "cls/rgw/cls_rgw_client.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+static string dir_oid_prefix = ".dir.";
+
// Construct the RADOS-backed bucket index service; dependent services are
// wired up later via init().
RGWSI_BucketIndex_RADOS::RGWSI_BucketIndex_RADOS(CephContext *cct) : RGWSI_BucketIndex(cct)
{
}
+
// Wire up the service dependencies (zone, rados, bilog, datalog).
// Must be called before any method that dereferences this->svc.
void RGWSI_BucketIndex_RADOS::init(RGWSI_Zone *zone_svc,
                                   RGWSI_RADOS *rados_svc,
                                   RGWSI_BILog_RADOS *bilog_svc,
                                   RGWDataChangesLog *datalog_rados_svc)
{
  svc.zone = zone_svc;
  svc.rados = rados_svc;
  svc.bilog = bilog_svc;
  svc.datalog_rados = datalog_rados_svc;
}
+
+int RGWSI_BucketIndex_RADOS::open_pool(const DoutPrefixProvider *dpp,
+ const rgw_pool& pool,
+ RGWSI_RADOS::Pool *index_pool,
+ bool mostly_omap)
+{
+ *index_pool = svc.rados->pool(pool);
+ return index_pool->open(dpp, RGWSI_RADOS::OpenParams()
+ .set_mostly_omap(mostly_omap));
+}
+
+int RGWSI_BucketIndex_RADOS::open_bucket_index_pool(const DoutPrefixProvider *dpp,
+ const RGWBucketInfo& bucket_info,
+ RGWSI_RADOS::Pool *index_pool)
+{
+ const rgw_pool& explicit_pool = bucket_info.bucket.explicit_placement.index_pool;
+
+ if (!explicit_pool.empty()) {
+ return open_pool(dpp, explicit_pool, index_pool, false);
+ }
+
+ auto& zonegroup = svc.zone->get_zonegroup();
+ auto& zone_params = svc.zone->get_zone_params();
+
+ const rgw_placement_rule *rule = &bucket_info.placement_rule;
+ if (rule->empty()) {
+ rule = &zonegroup.default_placement;
+ }
+ auto iter = zone_params.placement_pools.find(rule->name);
+ if (iter == zone_params.placement_pools.end()) {
+ ldpp_dout(dpp, 0) << "could not find placement rule " << *rule << " within zonegroup " << dendl;
+ return -EINVAL;
+ }
+
+ int r = open_pool(dpp, iter->second.index_pool, index_pool, true);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+int RGWSI_BucketIndex_RADOS::open_bucket_index_base(const DoutPrefixProvider *dpp,
+ const RGWBucketInfo& bucket_info,
+ RGWSI_RADOS::Pool *index_pool,
+ string *bucket_oid_base)
+{
+ const rgw_bucket& bucket = bucket_info.bucket;
+ int r = open_bucket_index_pool(dpp, bucket_info, index_pool);
+ if (r < 0)
+ return r;
+
+ if (bucket.bucket_id.empty()) {
+ ldpp_dout(dpp, 0) << "ERROR: empty bucket_id for bucket operation" << dendl;
+ return -EIO;
+ }
+
+ *bucket_oid_base = dir_oid_prefix;
+ bucket_oid_base->append(bucket.bucket_id);
+
+ return 0;
+
+}
+
+int RGWSI_BucketIndex_RADOS::open_bucket_index(const DoutPrefixProvider *dpp,
+ const RGWBucketInfo& bucket_info,
+ RGWSI_RADOS::Pool *index_pool,
+ string *bucket_oid)
+{
+ const rgw_bucket& bucket = bucket_info.bucket;
+ int r = open_bucket_index_pool(dpp, bucket_info, index_pool);
+ if (r < 0) {
+ ldpp_dout(dpp, 20) << __func__ << ": open_bucket_index_pool() returned "
+ << r << dendl;
+ return r;
+ }
+
+ if (bucket.bucket_id.empty()) {
+ ldpp_dout(dpp, 0) << "ERROR: empty bucket id for bucket operation" << dendl;
+ return -EIO;
+ }
+
+ *bucket_oid = dir_oid_prefix;
+ bucket_oid->append(bucket.bucket_id);
+
+ return 0;
+}
+
// Format a sharded, generational bucket index object name:
//   "<base>.<gen_id>.<shard_id>"
// Returns snprintf()'s result (chars that would have been written,
// excluding the NUL) so callers can detect truncation.
// Fixes: the return type was `char`, silently truncating snprintf()'s int;
// and shard_id (uint32_t) was printed with "%d" -- use PRIu32.
static int bucket_obj_with_generation(char *buf, size_t len, const std::string& bucket_oid_base, uint64_t gen_id,
                                      uint32_t shard_id)
{
  return snprintf(buf, len, "%s.%" PRIu64 ".%" PRIu32, bucket_oid_base.c_str(), gen_id, shard_id);
}
+
// Format a sharded bucket index object name without a generation
// component (used for gen 0, for backward compatibility): "<base>.<shard>".
// Returns snprintf()'s result so callers can detect truncation.
// Fixes: `char` return type truncated snprintf()'s int; shard_id
// (uint32_t) was printed with "%d" -- use PRIu32.
static int bucket_obj_without_generation(char *buf, size_t len, const std::string& bucket_oid_base, uint32_t shard_id)
{
  return snprintf(buf, len, "%s.%" PRIu32, bucket_oid_base.c_str(), shard_id);
}
+
// Compute the index object name(s) for a bucket.
//
// num_shards == 0 (unsharded): the single index object is the base oid.
// Otherwise names are "<base>.<gen>.<shard>", or "<base>.<shard>" when
// gen_id == 0 (kept for backward compatibility).  shard_id < 0 selects
// all shards; a non-negative shard_id fills only that entry.
static void get_bucket_index_objects(const string& bucket_oid_base,
                                     uint32_t num_shards, uint64_t gen_id,
                                     map<int, string> *_bucket_objects,
                                     int shard_id = -1)
{
  auto& bucket_objects = *_bucket_objects;
  if (!num_shards) {
    bucket_objects[0] = bucket_oid_base;
  } else {
    // base + ".<gen>.<shard>" always fits in base_size + 64 bytes
    char buf[bucket_oid_base.size() + 64];
    if (shard_id < 0) {
      for (uint32_t i = 0; i < num_shards; ++i) {
        if (gen_id) {
          bucket_obj_with_generation(buf, sizeof(buf), bucket_oid_base, gen_id, i);
        } else {
          bucket_obj_without_generation(buf, sizeof(buf), bucket_oid_base, i);
        }
        bucket_objects[i] = buf;
      }
    } else {
      // NOTE(review): valid shards are [0, num_shards); this guard only
      // rejects shard_id > num_shards, so shard_id == num_shards slips
      // through -- confirm whether that is intentional.
      if (std::cmp_greater(shard_id, num_shards)) {
        return;
      } else {
        if (gen_id) {
          bucket_obj_with_generation(buf, sizeof(buf), bucket_oid_base, gen_id, shard_id);
        } else {
          // for backward compatibility, gen_id(0) will not be added in the object name
          bucket_obj_without_generation(buf, sizeof(buf), bucket_oid_base, shard_id);
        }
        bucket_objects[shard_id] = buf;
      }
    }
  }
}
+
// Compute the bucket-instance id(s) "<name>:<bucket_id>[:<shard>]" that
// parallel the index objects from get_bucket_index_objects().
// shard_id < 0 selects all shards; unsharded buckets get the plain id.
static void get_bucket_instance_ids(const RGWBucketInfo& bucket_info,
                                    int num_shards, int shard_id,
                                    map<int, string> *result)
{
  const rgw_bucket& bucket = bucket_info.bucket;
  string plain_id = bucket.name + ":" + bucket.bucket_id;

  if (!num_shards) {
    (*result)[0] = plain_id;
  } else {
    char buf[16];
    if (shard_id < 0) {
      for (int i = 0; i < num_shards; ++i) {
        snprintf(buf, sizeof(buf), ":%d", i);
        (*result)[i] = plain_id + buf;
      }
    } else {
      // NOTE(review): as in get_bucket_index_objects(), shard_id ==
      // num_shards passes this check -- confirm whether intentional.
      if (shard_id > num_shards) {
        return;
      }
      snprintf(buf, sizeof(buf), ":%d", shard_id);
      (*result)[shard_id] = plain_id + buf;
    }
  }
}
+
+int RGWSI_BucketIndex_RADOS::open_bucket_index(const DoutPrefixProvider *dpp,
+ const RGWBucketInfo& bucket_info,
+ std::optional<int> _shard_id,
+ const rgw::bucket_index_layout_generation& idx_layout,
+ RGWSI_RADOS::Pool *index_pool,
+ map<int, string> *bucket_objs,
+ map<int, string> *bucket_instance_ids)
+{
+ int shard_id = _shard_id.value_or(-1);
+ string bucket_oid_base;
+ int ret = open_bucket_index_base(dpp, bucket_info, index_pool, &bucket_oid_base);
+ if (ret < 0) {
+ ldpp_dout(dpp, 20) << __func__ << ": open_bucket_index_pool() returned "
+ << ret << dendl;
+ return ret;
+ }
+
+ get_bucket_index_objects(bucket_oid_base, idx_layout.layout.normal.num_shards,
+ idx_layout.gen, bucket_objs, shard_id);
+ if (bucket_instance_ids) {
+ get_bucket_instance_ids(bucket_info, idx_layout.layout.normal.num_shards,
+ shard_id, bucket_instance_ids);
+ }
+ return 0;
+}
+
+void RGWSI_BucketIndex_RADOS::get_bucket_index_object(
+ const std::string& bucket_oid_base,
+ const rgw::bucket_index_normal_layout& normal,
+ uint64_t gen_id, int shard_id,
+ std::string* bucket_obj)
+{
+ if (!normal.num_shards) {
+ // By default with no sharding, we use the bucket oid as itself
+ (*bucket_obj) = bucket_oid_base;
+ } else {
+ char buf[bucket_oid_base.size() + 64];
+ if (gen_id) {
+ bucket_obj_with_generation(buf, sizeof(buf), bucket_oid_base, gen_id, shard_id);
+ (*bucket_obj) = buf;
+ ldout(cct, 10) << "bucket_obj is " << (*bucket_obj) << dendl;
+ } else {
+ // for backward compatibility, gen_id(0) will not be added in the object name
+ bucket_obj_without_generation(buf, sizeof(buf), bucket_oid_base, shard_id);
+ (*bucket_obj) = buf;
+ }
+ }
+}
+
+int RGWSI_BucketIndex_RADOS::get_bucket_index_object(
+ const std::string& bucket_oid_base,
+ const rgw::bucket_index_normal_layout& normal,
+ uint64_t gen_id, const std::string& obj_key,
+ std::string* bucket_obj, int* shard_id)
+{
+ int r = 0;
+ switch (normal.hash_type) {
+ case rgw::BucketHashType::Mod:
+ if (!normal.num_shards) {
+ // By default with no sharding, we use the bucket oid as itself
+ (*bucket_obj) = bucket_oid_base;
+ if (shard_id) {
+ *shard_id = -1;
+ }
+ } else {
+ uint32_t sid = bucket_shard_index(obj_key, normal.num_shards);
+ char buf[bucket_oid_base.size() + 64];
+ if (gen_id) {
+ bucket_obj_with_generation(buf, sizeof(buf), bucket_oid_base, gen_id, sid);
+ } else {
+ bucket_obj_without_generation(buf, sizeof(buf), bucket_oid_base, sid);
+ }
+ (*bucket_obj) = buf;
+ if (shard_id) {
+ *shard_id = (int)sid;
+ }
+ }
+ break;
+ default:
+ r = -ENOTSUP;
+ }
+ return r;
+}
+
+int RGWSI_BucketIndex_RADOS::open_bucket_index_shard(const DoutPrefixProvider *dpp,
+ const RGWBucketInfo& bucket_info,
+ const string& obj_key,
+ RGWSI_RADOS::Obj *bucket_obj,
+ int *shard_id)
+{
+ string bucket_oid_base;
+
+ RGWSI_RADOS::Pool pool;
+
+ int ret = open_bucket_index_base(dpp, bucket_info, &pool, &bucket_oid_base);
+ if (ret < 0) {
+ ldpp_dout(dpp, 20) << __func__ << ": open_bucket_index_pool() returned "
+ << ret << dendl;
+ return ret;
+ }
+
+ string oid;
+
+ const auto& current_index = bucket_info.layout.current_index;
+ ret = get_bucket_index_object(bucket_oid_base, current_index.layout.normal,
+ current_index.gen, obj_key, &oid, shard_id);
+ if (ret < 0) {
+ ldpp_dout(dpp, 10) << "get_bucket_index_object() returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ *bucket_obj = svc.rados->obj(pool, oid);
+
+ return 0;
+}
+
+int RGWSI_BucketIndex_RADOS::open_bucket_index_shard(const DoutPrefixProvider *dpp,
+ const RGWBucketInfo& bucket_info,
+ const rgw::bucket_index_layout_generation& index,
+ int shard_id,
+ RGWSI_RADOS::Obj *bucket_obj)
+{
+ RGWSI_RADOS::Pool index_pool;
+ string bucket_oid_base;
+ int ret = open_bucket_index_base(dpp, bucket_info, &index_pool, &bucket_oid_base);
+ if (ret < 0) {
+ ldpp_dout(dpp, 20) << __func__ << ": open_bucket_index_pool() returned "
+ << ret << dendl;
+ return ret;
+ }
+
+ string oid;
+
+ get_bucket_index_object(bucket_oid_base, index.layout.normal,
+ index.gen, shard_id, &oid);
+
+ *bucket_obj = svc.rados->obj(index_pool, oid);
+
+ return 0;
+}
+
+int RGWSI_BucketIndex_RADOS::cls_bucket_head(const DoutPrefixProvider *dpp,
+ const RGWBucketInfo& bucket_info,
+ const rgw::bucket_index_layout_generation& idx_layout,
+ int shard_id,
+ vector<rgw_bucket_dir_header> *headers,
+ map<int, string> *bucket_instance_ids,
+ optional_yield y)
+{
+ RGWSI_RADOS::Pool index_pool;
+ map<int, string> oids;
+ int r = open_bucket_index(dpp, bucket_info, shard_id, idx_layout, &index_pool, &oids, bucket_instance_ids);
+ if (r < 0)
+ return r;
+
+ map<int, struct rgw_cls_list_ret> list_results;
+ for (auto& iter : oids) {
+ list_results.emplace(iter.first, rgw_cls_list_ret());
+ }
+
+ r = CLSRGWIssueGetDirHeader(index_pool.ioctx(), oids, list_results, cct->_conf->rgw_bucket_index_max_aio)();
+ if (r < 0)
+ return r;
+
+ map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
+ for(; iter != list_results.end(); ++iter) {
+ headers->push_back(std::move(iter->second.dir.header));
+ }
+ return 0;
+}
+
+int RGWSI_BucketIndex_RADOS::init_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout)
+{
+ RGWSI_RADOS::Pool index_pool;
+
+ string dir_oid = dir_oid_prefix;
+ int r = open_bucket_index_pool(dpp, bucket_info, &index_pool);
+ if (r < 0) {
+ return r;
+ }
+
+ dir_oid.append(bucket_info.bucket.bucket_id);
+
+ map<int, string> bucket_objs;
+ get_bucket_index_objects(dir_oid, idx_layout.layout.normal.num_shards, idx_layout.gen, &bucket_objs);
+
+ return CLSRGWIssueBucketIndexInit(index_pool.ioctx(),
+ bucket_objs,
+ cct->_conf->rgw_bucket_index_max_aio)();
+}
+
+int RGWSI_BucketIndex_RADOS::clean_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout)
+{
+ RGWSI_RADOS::Pool index_pool;
+
+ std::string dir_oid = dir_oid_prefix;
+ int r = open_bucket_index_pool(dpp, bucket_info, &index_pool);
+ if (r < 0) {
+ return r;
+ }
+
+ dir_oid.append(bucket_info.bucket.bucket_id);
+
+ std::map<int, std::string> bucket_objs;
+ get_bucket_index_objects(dir_oid, idx_layout.layout.normal.num_shards,
+ idx_layout.gen, &bucket_objs);
+
+ return CLSRGWIssueBucketIndexClean(index_pool.ioctx(),
+ bucket_objs,
+ cct->_conf->rgw_bucket_index_max_aio)();
+}
+
+int RGWSI_BucketIndex_RADOS::read_stats(const DoutPrefixProvider *dpp,
+ const RGWBucketInfo& bucket_info,
+ RGWBucketEnt *result,
+ optional_yield y)
+{
+ vector<rgw_bucket_dir_header> headers;
+
+ result->bucket = bucket_info.bucket;
+ int r = cls_bucket_head(dpp, bucket_info, bucket_info.layout.current_index, RGW_NO_SHARD, &headers, nullptr, y);
+ if (r < 0) {
+ return r;
+ }
+
+ result->count = 0;
+ result->size = 0;
+ result->size_rounded = 0;
+
+ auto hiter = headers.begin();
+ for (; hiter != headers.end(); ++hiter) {
+ RGWObjCategory category = RGWObjCategory::Main;
+ auto iter = (hiter->stats).find(category);
+ if (iter != hiter->stats.end()) {
+ struct rgw_bucket_category_stats& stats = iter->second;
+ result->count += stats.num_entries;
+ result->size += stats.total_size;
+ result->size_rounded += stats.total_size_rounded;
+ }
+ }
+
+ result->placement_rule = std::move(bucket_info.placement_rule);
+
+ return 0;
+}
+
+int RGWSI_BucketIndex_RADOS::get_reshard_status(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, list<cls_rgw_bucket_instance_entry> *status)
+{
+ map<int, string> bucket_objs;
+
+ RGWSI_RADOS::Pool index_pool;
+
+ int r = open_bucket_index(dpp, bucket_info,
+ std::nullopt,
+ bucket_info.layout.current_index,
+ &index_pool,
+ &bucket_objs,
+ nullptr);
+ if (r < 0) {
+ return r;
+ }
+
+ for (auto i : bucket_objs) {
+ cls_rgw_bucket_instance_entry entry;
+
+ int ret = cls_rgw_get_bucket_resharding(index_pool.ioctx(), i.second, &entry);
+ if (ret < 0 && ret != -ENOENT) {
+ ldpp_dout(dpp, -1) << "ERROR: " << __func__ << ": cls_rgw_get_bucket_resharding() returned ret=" << ret << dendl;
+ return ret;
+ }
+
+ status->push_back(entry);
+ }
+
+ return 0;
+}
+
// React to a bucket-info overwrite.  If the datasync flag flipped, stop or
// (re)start bilog recording on the latest in-index log generation, then
// write one datalog entry per shard so sync peers notice the change.
int RGWSI_BucketIndex_RADOS::handle_overwrite(const DoutPrefixProvider *dpp,
                                              const RGWBucketInfo& info,
                                              const RGWBucketInfo& orig_info,
                                              optional_yield y)
{
  bool new_sync_enabled = info.datasync_flag_enabled();
  bool old_sync_enabled = orig_info.datasync_flag_enabled();

  if (old_sync_enabled == new_sync_enabled) {
    return 0; // datasync flag didn't change
  }
  if (info.layout.logs.empty()) {
    return 0; // no bilog
  }
  // Only the most recent log generation is toggled.
  const auto& bilog = info.layout.logs.back();
  if (bilog.layout.type != rgw::BucketLogType::InIndex) {
    return -ENOTSUP;
  }
  const int shards_num = rgw::num_shards(bilog.layout.in_index);

  // shard id -1 addresses every shard of the log generation.
  int ret;
  if (!new_sync_enabled) {
    ret = svc.bilog->log_stop(dpp, info, bilog, -1);
  } else {
    ret = svc.bilog->log_start(dpp, info, bilog, -1);
  }
  if (ret < 0) {
    ldpp_dout(dpp, -1) << "ERROR: failed writing bilog (bucket=" << info.bucket << "); ret=" << ret << dendl;
    return ret;
  }

  for (int i = 0; i < shards_num; ++i) {
    ret = svc.datalog_rados->add_entry(dpp, info, bilog, i, y);
    if (ret < 0) {
      ldpp_dout(dpp, -1) << "ERROR: failed writing data log (info.bucket=" << info.bucket << ", shard_id=" << i << ")" << dendl;
    } // datalog error is not fatal
  }

  return 0;
}
diff --git a/src/rgw/services/svc_bi_rados.h b/src/rgw/services/svc_bi_rados.h
new file mode 100644
index 000000000..feba0cfcd
--- /dev/null
+++ b/src/rgw/services/svc_bi_rados.h
@@ -0,0 +1,166 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#pragma once
+
+#include "rgw_datalog.h"
+#include "rgw_service.h"
+#include "rgw_tools.h"
+
+#include "svc_bi.h"
+#include "svc_rados.h"
+#include "svc_tier_rados.h"
+
+struct rgw_bucket_dir_header;
+
+class RGWSI_BILog_RADOS;
+
+#define RGW_NO_SHARD -1
+
+#define RGW_SHARDS_PRIME_0 7877
+#define RGW_SHARDS_PRIME_1 65521
+
+/*
+ * Defined Bucket Index Namespaces
+ */
+#define RGW_OBJ_NS_MULTIPART "multipart"
+#define RGW_OBJ_NS_SHADOW "shadow"
+
// RADOS implementation of the bucket-index service: maps buckets to their
// index pool and per-shard ".dir." objects, and issues the CLS
// bucket-index operations against them.
class RGWSI_BucketIndex_RADOS : public RGWSI_BucketIndex
{
  friend class RGWSI_BILog_RADOS;

  // Open an arbitrary pool, optionally hinting that it is omap-heavy.
  int open_pool(const DoutPrefixProvider *dpp,
                const rgw_pool& pool,
                RGWSI_RADOS::Pool *index_pool,
                bool mostly_omap);

  // Resolve and open the bucket's index pool (explicit placement, or via
  // the bucket's placement rule in the zone params).
  int open_bucket_index_pool(const DoutPrefixProvider *dpp,
                             const RGWBucketInfo& bucket_info,
                             RGWSI_RADOS::Pool *index_pool);
  // Open the index pool and compute the ".dir.<bucket_id>" base oid.
  int open_bucket_index_base(const DoutPrefixProvider *dpp,
                             const RGWBucketInfo& bucket_info,
                             RGWSI_RADOS::Pool *index_pool,
                             std::string *bucket_oid_base);

  // return the index oid for the given shard id
  void get_bucket_index_object(const std::string& bucket_oid_base,
                               const rgw::bucket_index_normal_layout& normal,
                               uint64_t gen_id, int shard_id,
                               std::string* bucket_obj);
  // return the index oid and shard id for the given object name
  int get_bucket_index_object(const std::string& bucket_oid_base,
                              const rgw::bucket_index_normal_layout& normal,
                              uint64_t gen_id, const std::string& obj_key,
                              std::string* bucket_obj, int* shard_id);

  // Read the dir header of each selected shard (RGW_NO_SHARD = all).
  int cls_bucket_head(const DoutPrefixProvider *dpp,
                      const RGWBucketInfo& bucket_info,
                      const rgw::bucket_index_layout_generation& idx_layout,
                      int shard_id,
                      std::vector<rgw_bucket_dir_header> *headers,
                      std::map<int, std::string> *bucket_instance_ids,
                      optional_yield y);

public:

  // Dependent services, wired up by init().
  struct Svc {
    RGWSI_Zone *zone{nullptr};
    RGWSI_RADOS *rados{nullptr};
    RGWSI_BILog_RADOS *bilog{nullptr};
    RGWDataChangesLog *datalog_rados{nullptr};
  } svc;

  RGWSI_BucketIndex_RADOS(CephContext *cct);

  void init(RGWSI_Zone *zone_svc,
            RGWSI_RADOS *rados_svc,
            RGWSI_BILog_RADOS *bilog_svc,
            RGWDataChangesLog *datalog_rados_svc);

  static int shards_max() {
    return RGW_SHARDS_PRIME_1;
  }

  static int shard_id(const std::string& key, int max_shards) {
    return rgw_shard_id(key, max_shards);
  }

  // Hash a key to a shard index; the xor-fold mixes the low byte into the
  // high bits before the modulo.
  static uint32_t bucket_shard_index(const std::string& key,
                                     int num_shards) {
    uint32_t sid = ceph_str_hash_linux(key.c_str(), key.size());
    uint32_t sid2 = sid ^ ((sid & 0xFF) << 24);
    return rgw_shards_mod(sid2, num_shards);
  }

  // Multipart-namespace objects shard by the multipart meta key instead of
  // the raw object name -- presumably so an upload's entries hash
  // consistently; confirm against the multipart code paths.
  static uint32_t bucket_shard_index(const rgw_obj_key& obj_key,
                                     int num_shards)
  {
    std::string sharding_key;
    if (obj_key.ns == RGW_OBJ_NS_MULTIPART) {
      RGWMPObj mp;
      mp.from_meta(obj_key.name);
      sharding_key = mp.get_key();
    } else {
      sharding_key = obj_key.name;
    }

    return bucket_shard_index(sharding_key, num_shards);
  }

  int init_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info,const rgw::bucket_index_layout_generation& idx_layout) override;
  int clean_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout) override;

  /* RADOS specific */

  int read_stats(const DoutPrefixProvider *dpp,
                 const RGWBucketInfo& bucket_info,
                 RGWBucketEnt *stats,
                 optional_yield y) override;

  int get_reshard_status(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info,
                         std::list<cls_rgw_bucket_instance_entry> *status);

  int handle_overwrite(const DoutPrefixProvider *dpp, const RGWBucketInfo& info,
                       const RGWBucketInfo& orig_info,
                       optional_yield y) override;

  int open_bucket_index_shard(const DoutPrefixProvider *dpp,
                              const RGWBucketInfo& bucket_info,
                              const std::string& obj_key,
                              RGWSI_RADOS::Obj *bucket_obj,
                              int *shard_id);

  int open_bucket_index_shard(const DoutPrefixProvider *dpp,
                              const RGWBucketInfo& bucket_info,
                              const rgw::bucket_index_layout_generation& index,
                              int shard_id, RGWSI_RADOS::Obj *bucket_obj);

  int open_bucket_index(const DoutPrefixProvider *dpp,
                        const RGWBucketInfo& bucket_info,
                        RGWSI_RADOS::Pool *index_pool,
                        std::string *bucket_oid);

  int open_bucket_index(const DoutPrefixProvider *dpp,
                        const RGWBucketInfo& bucket_info,
                        std::optional<int> shard_id,
                        const rgw::bucket_index_layout_generation& idx_layout,
                        RGWSI_RADOS::Pool *index_pool,
                        std::map<int, std::string> *bucket_objs,
                        std::map<int, std::string> *bucket_instance_ids);
};
+
+
diff --git a/src/rgw/services/svc_bilog_rados.cc b/src/rgw/services/svc_bilog_rados.cc
new file mode 100644
index 000000000..f4bb13ec1
--- /dev/null
+++ b/src/rgw/services/svc_bilog_rados.cc
@@ -0,0 +1,220 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "svc_bilog_rados.h"
+#include "svc_bi_rados.h"
+
+#include "cls/rgw/cls_rgw_client.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
// Construct the RADOS bilog service; the bucket-index service dependency
// is attached later via init().
RGWSI_BILog_RADOS::RGWSI_BILog_RADOS(CephContext *cct) : RGWServiceInstance(cct)
{
}
+
// Attach the bucket-index service used to resolve index pools/objects.
// Must be called before any of the log_* methods.
void RGWSI_BILog_RADOS::init(RGWSI_BucketIndex_RADOS *bi_rados_svc)
{
  svc.bi = bi_rados_svc;
}
+
+int RGWSI_BILog_RADOS::log_trim(const DoutPrefixProvider *dpp,
+ const RGWBucketInfo& bucket_info,
+ const rgw::bucket_log_layout_generation& log_layout,
+ int shard_id,
+ std::string_view start_marker,
+ std::string_view end_marker)
+{
+ RGWSI_RADOS::Pool index_pool;
+ map<int, string> bucket_objs;
+
+ BucketIndexShardsManager start_marker_mgr;
+ BucketIndexShardsManager end_marker_mgr;
+
+ const auto& current_index = rgw::log_to_index_layout(log_layout);
+ int r = svc.bi->open_bucket_index(dpp, bucket_info, shard_id, current_index, &index_pool, &bucket_objs, nullptr);
+ if (r < 0) {
+ return r;
+ }
+
+ r = start_marker_mgr.from_string(start_marker, shard_id);
+ if (r < 0) {
+ return r;
+ }
+
+ r = end_marker_mgr.from_string(end_marker, shard_id);
+ if (r < 0) {
+ return r;
+ }
+
+ return CLSRGWIssueBILogTrim(index_pool.ioctx(), start_marker_mgr, end_marker_mgr, bucket_objs,
+ cct->_conf->rgw_bucket_index_max_aio)();
+}
+
+int RGWSI_BILog_RADOS::log_start(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw::bucket_log_layout_generation& log_layout, int shard_id)
+{
+ RGWSI_RADOS::Pool index_pool;
+ map<int, string> bucket_objs;
+ const auto& current_index = rgw::log_to_index_layout(log_layout);
+ int r = svc.bi->open_bucket_index(dpp, bucket_info, shard_id, current_index, &index_pool, &bucket_objs, nullptr);
+ if (r < 0)
+ return r;
+
+ return CLSRGWIssueResyncBucketBILog(index_pool.ioctx(), bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
+}
+
+int RGWSI_BILog_RADOS::log_stop(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw::bucket_log_layout_generation& log_layout, int shard_id)
+{
+ RGWSI_RADOS::Pool index_pool;
+ map<int, string> bucket_objs;
+ const auto& current_index = rgw::log_to_index_layout(log_layout);
+ int r = svc.bi->open_bucket_index(dpp, bucket_info, shard_id, current_index, &index_pool, &bucket_objs, nullptr);
+ if (r < 0)
+ return r;
+
+ return CLSRGWIssueBucketBILogStop(index_pool.ioctx(), bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
+}
+
+static void build_bucket_index_marker(const string& shard_id_str,
+ const string& shard_marker,
+ string *marker) {
+ if (marker) {
+ *marker = shard_id_str;
+ marker->append(BucketIndexShardsManager::KEY_VALUE_SEPARATOR);
+ marker->append(shard_marker);
+ }
+}
+
// List up to `max` bilog entries across the selected shard(s), starting
// after `marker`, merging the per-shard result lists round-robin.  On
// return `marker` is updated to resume the listing and *truncated reports
// whether more entries remain.
int RGWSI_BILog_RADOS::log_list(const DoutPrefixProvider *dpp,
                                const RGWBucketInfo& bucket_info,
                                const rgw::bucket_log_layout_generation& log_layout,
                                int shard_id, string& marker, uint32_t max,
                                std::list<rgw_bi_log_entry>& result, bool *truncated)
{
  ldpp_dout(dpp, 20) << __func__ << ": " << bucket_info.bucket << " marker " << marker << " shard_id=" << shard_id << " max " << max << dendl;
  result.clear();

  RGWSI_RADOS::Pool index_pool;
  map<int, string> oids;
  map<int, cls_rgw_bi_log_list_ret> bi_log_lists;
  const auto& current_index = rgw::log_to_index_layout(log_layout);
  int r = svc.bi->open_bucket_index(dpp, bucket_info, shard_id, current_index, &index_pool, &oids, nullptr);
  if (r < 0)
    return r;

  BucketIndexShardsManager marker_mgr;
  bool has_shards = (oids.size() > 1 || shard_id >= 0);
  // If there are multiple shards for the bucket index object, the marker
  // should have the pattern '{shard_id_1}#{shard_marker_1},{shard_id_2}#
  // {shard_marker_2}...', if there is no sharding, the bi_log_list should
  // only contain one record, and the key is the bucket instance id.
  r = marker_mgr.from_string(marker, shard_id);
  if (r < 0)
    return r;

  // Fetch each shard's entries (bounded aio) into bi_log_lists.
  r = CLSRGWIssueBILogList(index_pool.ioctx(), marker_mgr, max, oids, bi_log_lists, cct->_conf->rgw_bucket_index_max_aio)();
  if (r < 0)
    return r;

  // Per-shard cursors (vcurrents) and end iterators (vends) for the merge.
  map<int, list<rgw_bi_log_entry>::iterator> vcurrents;
  map<int, list<rgw_bi_log_entry>::iterator> vends;
  if (truncated) {
    *truncated = false;
  }
  map<int, cls_rgw_bi_log_list_ret>::iterator miter = bi_log_lists.begin();
  for (; miter != bi_log_lists.end(); ++miter) {
    int shard_id = miter->first;
    vcurrents[shard_id] = miter->second.entries.begin();
    vends[shard_id] = miter->second.entries.end();
    if (truncated) {
      *truncated = (*truncated || miter->second.truncated);
    }
  }

  // Round-robin over shards: take one entry per shard per pass until
  // `max` entries are collected or every cursor is exhausted.
  size_t total = 0;
  bool has_more = true;
  map<int, list<rgw_bi_log_entry>::iterator>::iterator viter;
  map<int, list<rgw_bi_log_entry>::iterator>::iterator eiter;
  while (total < max && has_more) {
    has_more = false;

    viter = vcurrents.begin();
    eiter = vends.begin();

    for (; total < max && viter != vcurrents.end(); ++viter, ++eiter) {
      assert (eiter != vends.end());

      int shard_id = viter->first;
      list<rgw_bi_log_entry>::iterator& liter = viter->second;

      if (liter == eiter->second){
        continue;
      }
      rgw_bi_log_entry& entry = *(liter);
      if (has_shards) {
        // Rewrite the entry id as "<shard>#<id>" so the caller's marker
        // stays resumable per shard.
        char buf[16];
        snprintf(buf, sizeof(buf), "%d", shard_id);
        string tmp_id;
        build_bucket_index_marker(buf, entry.id, &tmp_id);
        entry.id.swap(tmp_id);
      }
      marker_mgr.add(shard_id, entry.id);
      result.push_back(entry);
      total++;
      has_more = true;
      ++liter;
    }
  }

  // A shard whose entries we did not fully consume also means truncation.
  if (truncated) {
    for (viter = vcurrents.begin(), eiter = vends.begin(); viter != vcurrents.end(); ++viter, ++eiter) {
      assert (eiter != vends.end());
      *truncated = (*truncated || (viter->second != eiter->second));
    }
  }

  // Refresh marker, if there are multiple shards, the output will look like
  // '{shard_oid_1}#{shard_marker_1},{shard_oid_2}#{shard_marker_2}...',
  // if there is no sharding, the simply marker (without oid) is returned
  if (has_shards) {
    marker_mgr.to_string(&marker);
  } else {
    if (!result.empty()) {
      marker = result.rbegin()->id;
    }
  }

  return 0;
}
+
// Report each selected shard's max_marker (from its dir header), keyed by
// shard id in *markers.
int RGWSI_BILog_RADOS::get_log_status(const DoutPrefixProvider *dpp,
                                      const RGWBucketInfo& bucket_info,
                                      const rgw::bucket_log_layout_generation& log_layout,
                                      int shard_id,
                                      map<int, string> *markers,
                                      optional_yield y)
{
  vector<rgw_bucket_dir_header> headers;
  map<int, string> bucket_instance_ids;
  const auto& current_index = rgw::log_to_index_layout(log_layout);
  int r = svc.bi->cls_bucket_head(dpp, bucket_info, current_index, shard_id, &headers, &bucket_instance_ids, y);
  if (r < 0)
    return r;

  // cls_bucket_head() fills both containers one entry per shard, in the
  // same order, so they are iterated in lockstep.
  ceph_assert(headers.size() == bucket_instance_ids.size());

  auto iter = headers.begin();
  map<int, string>::iterator viter = bucket_instance_ids.begin();

  for(; iter != headers.end(); ++iter, ++viter) {
    if (shard_id >= 0) {
      // Single-shard query: key by the requested shard id.
      (*markers)[shard_id] = iter->max_marker;
    } else {
      (*markers)[viter->first] = iter->max_marker;
    }
  }

  return 0;
}
+
diff --git a/src/rgw/services/svc_bilog_rados.h b/src/rgw/services/svc_bilog_rados.h
new file mode 100644
index 000000000..e9d5dbb5c
--- /dev/null
+++ b/src/rgw/services/svc_bilog_rados.h
@@ -0,0 +1,60 @@
+
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#pragma once
+
+#include "rgw_service.h"
+
+#include "svc_rados.h"
+
+
+
+
+class RGWSI_BILog_RADOS : public RGWServiceInstance
+{
+public:
+ struct Svc {
+ RGWSI_BucketIndex_RADOS *bi{nullptr};
+ } svc;
+
+ RGWSI_BILog_RADOS(CephContext *cct);
+
+ void init(RGWSI_BucketIndex_RADOS *bi_rados_svc);
+
+ int log_start(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw::bucket_log_layout_generation& log_layout, int shard_id);
+ int log_stop(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw::bucket_log_layout_generation& log_layout, int shard_id);
+
+ int log_trim(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info,
+ const rgw::bucket_log_layout_generation& log_layout,
+ int shard_id,
+ std::string_view start_marker,
+ std::string_view end_marker);
+ int log_list(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info,
+ const rgw::bucket_log_layout_generation& log_layout,
+ int shard_id,
+ std::string& marker,
+ uint32_t max,
+ std::list<rgw_bi_log_entry>& result,
+ bool *truncated);
+
+ int get_log_status(const DoutPrefixProvider *dpp,
+ const RGWBucketInfo& bucket_info,
+ const rgw::bucket_log_layout_generation& log_layout,
+ int shard_id,
+ std::map<int, std::string> *markers,
+ optional_yield y);
+};
diff --git a/src/rgw/services/svc_bucket.cc b/src/rgw/services/svc_bucket.cc
new file mode 100644
index 000000000..b115990d2
--- /dev/null
+++ b/src/rgw/services/svc_bucket.cc
@@ -0,0 +1,25 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+
+#include "svc_bucket.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+std::string RGWSI_Bucket::get_entrypoint_meta_key(const rgw_bucket& bucket)
+{
+ if (bucket.bucket_id.empty()) {
+ return bucket.get_key();
+ }
+
+ rgw_bucket b(bucket);
+ b.bucket_id.clear();
+
+ return b.get_key();
+}
+
// Metadata key for a bucket instance: the full bucket key (including the
// bucket id, unlike the entrypoint key).
std::string RGWSI_Bucket::get_bi_meta_key(const rgw_bucket& bucket)
{
  return bucket.get_key();
}
+
diff --git a/src/rgw/services/svc_bucket.h b/src/rgw/services/svc_bucket.h
new file mode 100644
index 000000000..4a526e4f2
--- /dev/null
+++ b/src/rgw/services/svc_bucket.h
@@ -0,0 +1,111 @@
+
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#pragma once
+
+#include "rgw_service.h"
+
+#include "svc_bucket_types.h"
+
// Abstract bucket metadata service: read/store/remove bucket entrypoints
// and bucket instances, and read bucket stats, through backend-specific
// handlers.
class RGWSI_Bucket : public RGWServiceInstance
{
public:
  RGWSI_Bucket(CephContext *cct) : RGWServiceInstance(cct) {}
  virtual ~RGWSI_Bucket() {}

  // Key helpers: entrypoints are keyed without the bucket id, instances
  // with it (see svc_bucket.cc).
  static std::string get_entrypoint_meta_key(const rgw_bucket& bucket);
  static std::string get_bi_meta_key(const rgw_bucket& bucket);

  // Backend handlers for entrypoint (EP) and bucket-instance (BI) metadata.
  virtual RGWSI_Bucket_BE_Handler& get_ep_be_handler() = 0;
  virtual RGWSI_BucketInstance_BE_Handler& get_bi_be_handler() = 0;

  // Entrypoint accessors.  refresh_version, when set, forces a re-read if
  // the cached version is older.
  virtual int read_bucket_entrypoint_info(RGWSI_Bucket_EP_Ctx& ctx,
                                          const std::string& key,
                                          RGWBucketEntryPoint *entry_point,
                                          RGWObjVersionTracker *objv_tracker,
                                          real_time *pmtime,
                                          std::map<std::string, bufferlist> *pattrs,
                                          optional_yield y,
                                          const DoutPrefixProvider *dpp,
                                          rgw_cache_entry_info *cache_info = nullptr,
                                          boost::optional<obj_version> refresh_version = boost::none) = 0;

  virtual int store_bucket_entrypoint_info(RGWSI_Bucket_EP_Ctx& ctx,
                                           const std::string& key,
                                           RGWBucketEntryPoint& info,
                                           bool exclusive,
                                           real_time mtime,
                                           std::map<std::string, bufferlist> *pattrs,
                                           RGWObjVersionTracker *objv_tracker,
                                           optional_yield y,
                                           const DoutPrefixProvider *dpp) = 0;

  virtual int remove_bucket_entrypoint_info(RGWSI_Bucket_EP_Ctx& ctx,
                                            const std::string& key,
                                            RGWObjVersionTracker *objv_tracker,
                                            optional_yield y,
                                            const DoutPrefixProvider *dpp) = 0;

  // Bucket-instance accessors.
  virtual int read_bucket_instance_info(RGWSI_Bucket_BI_Ctx& ctx,
                                        const std::string& key,
                                        RGWBucketInfo *info,
                                        real_time *pmtime,
                                        std::map<std::string, bufferlist> *pattrs,
                                        optional_yield y,
                                        const DoutPrefixProvider *dpp,
                                        rgw_cache_entry_info *cache_info = nullptr,
                                        boost::optional<obj_version> refresh_version = boost::none) = 0;

  // Combined lookup starting from the bucket struct rather than a key.
  virtual int read_bucket_info(RGWSI_Bucket_X_Ctx& ep_ctx,
                               const rgw_bucket& bucket,
                               RGWBucketInfo *info,
                               real_time *pmtime,
                               std::map<std::string, bufferlist> *pattrs,
                               boost::optional<obj_version> refresh_version,
                               optional_yield y,
                               const DoutPrefixProvider *dpp) = 0;

  virtual int store_bucket_instance_info(RGWSI_Bucket_BI_Ctx& ctx,
                                         const std::string& key,
                                         RGWBucketInfo& info,
                                         std::optional<RGWBucketInfo *> orig_info, /* nullopt: orig_info was not fetched,
                                                                                      nullptr: orig_info was not found (new bucket instance */
                                         bool exclusive,
                                         real_time mtime,
                                         std::map<std::string, bufferlist> *pattrs,
                                         optional_yield y,
                                         const DoutPrefixProvider *dpp) = 0;

  virtual int remove_bucket_instance_info(RGWSI_Bucket_BI_Ctx& ctx,
                                          const std::string& key,
                                          const RGWBucketInfo& bucket_info,
                                          RGWObjVersionTracker *objv_tracker,
                                          optional_yield y,
                                          const DoutPrefixProvider *dpp) = 0;

  // Stats for a single bucket, or for a batch keyed by bucket key.
  virtual int read_bucket_stats(RGWSI_Bucket_X_Ctx& ctx,
                                const rgw_bucket& bucket,
                                RGWBucketEnt *ent,
                                optional_yield y,
                                const DoutPrefixProvider *dpp) = 0;

  virtual int read_buckets_stats(RGWSI_Bucket_X_Ctx& ctx,
                                 std::map<std::string, RGWBucketEnt>& m,
                                 optional_yield y,
                                 const DoutPrefixProvider *dpp) = 0;
};
+
diff --git a/src/rgw/services/svc_bucket_sobj.cc b/src/rgw/services/svc_bucket_sobj.cc
new file mode 100644
index 000000000..08a528015
--- /dev/null
+++ b/src/rgw/services/svc_bucket_sobj.cc
@@ -0,0 +1,644 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+
+#include "svc_bucket_sobj.h"
+#include "svc_zone.h"
+#include "svc_sys_obj.h"
+#include "svc_sys_obj_cache.h"
+#include "svc_bi.h"
+#include "svc_meta.h"
+#include "svc_meta_be_sobj.h"
+#include "svc_sync_modules.h"
+
+#include "rgw_bucket.h"
+#include "rgw_tools.h"
+#include "rgw_zone.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+#define RGW_BUCKET_INSTANCE_MD_PREFIX ".bucket.meta."
+
+using namespace std;
+
+class RGWSI_Bucket_SObj_Module : public RGWSI_MBSObj_Handler_Module {
+ RGWSI_Bucket_SObj::Svc& svc;
+
+ const string prefix;
+public:
+ RGWSI_Bucket_SObj_Module(RGWSI_Bucket_SObj::Svc& _svc) : RGWSI_MBSObj_Handler_Module("bucket"),
+ svc(_svc) {}
+
+ void get_pool_and_oid(const string& key, rgw_pool *pool, string *oid) override {
+ if (pool) {
+ *pool = svc.zone->get_zone_params().domain_root;
+ }
+ if (oid) {
+ *oid = key;
+ }
+ }
+
+ const string& get_oid_prefix() override {
+ return prefix;
+ }
+
+ bool is_valid_oid(const string& oid) override {
+ return (!oid.empty() && oid[0] != '.');
+ }
+
+ string key_to_oid(const string& key) override {
+ return key;
+ }
+
+ string oid_to_key(const string& oid) override {
+ /* should have been called after is_valid_oid(),
+ * so no need to check for validity */
+ return oid;
+ }
+};
+
+class RGWSI_BucketInstance_SObj_Module : public RGWSI_MBSObj_Handler_Module {
+ RGWSI_Bucket_SObj::Svc& svc;
+
+ const string prefix;
+public:
+ RGWSI_BucketInstance_SObj_Module(RGWSI_Bucket_SObj::Svc& _svc) : RGWSI_MBSObj_Handler_Module("bucket.instance"),
+ svc(_svc), prefix(RGW_BUCKET_INSTANCE_MD_PREFIX) {}
+
+ void get_pool_and_oid(const string& key, rgw_pool *pool, string *oid) override {
+ if (pool) {
+ *pool = svc.zone->get_zone_params().domain_root;
+ }
+ if (oid) {
+ *oid = key_to_oid(key);
+ }
+ }
+
+ const string& get_oid_prefix() override {
+ return prefix;
+ }
+
+ bool is_valid_oid(const string& oid) override {
+ return (oid.compare(0, prefix.size(), RGW_BUCKET_INSTANCE_MD_PREFIX) == 0);
+ }
+
+// 'tenant/' is used in bucket instance keys for sync to avoid parsing ambiguity
+// with the existing instance[:shard] format. once we parse the shard, the / is
+// replaced with a : to match the [tenant:]instance format
+ string key_to_oid(const string& key) override {
+ string oid = prefix + key;
+
+ // replace tenant/ with tenant:
+ auto c = oid.find('/', prefix.size());
+ if (c != string::npos) {
+ oid[c] = ':';
+ }
+
+ return oid;
+ }
+
+ // convert bucket instance oids back to the tenant/ format for metadata keys.
+ // it's safe to parse 'tenant:' only for oids, because they won't contain the
+ // optional :shard at the end
+ string oid_to_key(const string& oid) override {
+ /* this should have been called after oid was checked for validity */
+
+ if (oid.size() < prefix.size()) { /* just sanity check */
+ return string();
+ }
+
+ string key = oid.substr(prefix.size());
+
+ // find first : (could be tenant:bucket or bucket:instance)
+ auto c = key.find(':');
+ if (c != string::npos) {
+ // if we find another :, the first one was for tenant
+ if (key.find(':', c + 1) != string::npos) {
+ key[c] = '/';
+ }
+ }
+
+ return key;
+ }
+
+ /*
+ * hash entry for mdlog placement. Use the same hash key we'd have for the bucket entry
+ * point, so that the log entries end up at the same log shard, so that we process them
+ * in order
+ */
+ string get_hash_key(const string& key) override {
+ string k = "bucket:";
+ int pos = key.find(':');
+ if (pos < 0)
+ k.append(key);
+ else
+ k.append(key.substr(0, pos));
+
+ return k;
+ }
+};
+
+RGWSI_Bucket_SObj::RGWSI_Bucket_SObj(CephContext *cct): RGWSI_Bucket(cct) {
+}
+
+RGWSI_Bucket_SObj::~RGWSI_Bucket_SObj() {
+}
+
+void RGWSI_Bucket_SObj::init(RGWSI_Zone *_zone_svc, RGWSI_SysObj *_sysobj_svc,
+ RGWSI_SysObj_Cache *_cache_svc, RGWSI_BucketIndex *_bi,
+ RGWSI_Meta *_meta_svc, RGWSI_MetaBackend *_meta_be_svc,
+ RGWSI_SyncModules *_sync_modules_svc,
+ RGWSI_Bucket_Sync *_bucket_sync_svc)
+{
+ svc.bucket = this;
+ svc.zone = _zone_svc;
+ svc.sysobj = _sysobj_svc;
+ svc.cache = _cache_svc;
+ svc.bi = _bi;
+ svc.meta = _meta_svc;
+ svc.meta_be = _meta_be_svc;
+ svc.sync_modules = _sync_modules_svc;
+ svc.bucket_sync = _bucket_sync_svc;
+}
+
+int RGWSI_Bucket_SObj::do_start(optional_yield, const DoutPrefixProvider *dpp)
+{
+ binfo_cache.reset(new RGWChainedCacheImpl<bucket_info_cache_entry>);
+ binfo_cache->init(svc.cache);
+
+ /* create first backend handler for bucket entrypoints */
+
+ RGWSI_MetaBackend_Handler *ep_handler;
+
+ int r = svc.meta->create_be_handler(RGWSI_MetaBackend::Type::MDBE_SOBJ, &ep_handler);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to create be handler: r=" << r << dendl;
+ return r;
+ }
+
+ ep_be_handler = ep_handler;
+
+ RGWSI_MetaBackend_Handler_SObj *ep_bh = static_cast<RGWSI_MetaBackend_Handler_SObj *>(ep_handler);
+
+ auto ep_module = new RGWSI_Bucket_SObj_Module(svc);
+ ep_be_module.reset(ep_module);
+ ep_bh->set_module(ep_module);
+
+ /* create a second backend handler for bucket instance */
+
+ RGWSI_MetaBackend_Handler *bi_handler;
+
+ r = svc.meta->create_be_handler(RGWSI_MetaBackend::Type::MDBE_SOBJ, &bi_handler);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to create be handler: r=" << r << dendl;
+ return r;
+ }
+
+ bi_be_handler = bi_handler;
+
+ RGWSI_MetaBackend_Handler_SObj *bi_bh = static_cast<RGWSI_MetaBackend_Handler_SObj *>(bi_handler);
+
+ auto bi_module = new RGWSI_BucketInstance_SObj_Module(svc);
+ bi_be_module.reset(bi_module);
+ bi_bh->set_module(bi_module);
+
+ return 0;
+}
+
+int RGWSI_Bucket_SObj::read_bucket_entrypoint_info(RGWSI_Bucket_EP_Ctx& ctx,
+ const string& key,
+ RGWBucketEntryPoint *entry_point,
+ RGWObjVersionTracker *objv_tracker,
+ real_time *pmtime,
+ map<string, bufferlist> *pattrs,
+ optional_yield y,
+ const DoutPrefixProvider *dpp,
+ rgw_cache_entry_info *cache_info,
+ boost::optional<obj_version> refresh_version)
+{
+ bufferlist bl;
+
+ auto params = RGWSI_MBSObj_GetParams(&bl, pattrs, pmtime).set_cache_info(cache_info)
+ .set_refresh_version(refresh_version);
+
+ int ret = svc.meta_be->get_entry(ctx.get(), key, params, objv_tracker, y, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ auto iter = bl.cbegin();
+ try {
+ decode(*entry_point, iter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl;
+ return -EIO;
+ }
+ return 0;
+}
+
+int RGWSI_Bucket_SObj::store_bucket_entrypoint_info(RGWSI_Bucket_EP_Ctx& ctx,
+ const string& key,
+ RGWBucketEntryPoint& info,
+ bool exclusive,
+ real_time mtime,
+ map<string, bufferlist> *pattrs,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y,
+ const DoutPrefixProvider *dpp)
+{
+ bufferlist bl;
+ encode(info, bl);
+
+ RGWSI_MBSObj_PutParams params(bl, pattrs, mtime, exclusive);
+
+ int ret = svc.meta_be->put(ctx.get(), key, params, objv_tracker, y, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return ret;
+}
+
+int RGWSI_Bucket_SObj::remove_bucket_entrypoint_info(RGWSI_Bucket_EP_Ctx& ctx,
+ const string& key,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y,
+ const DoutPrefixProvider *dpp)
+{
+ RGWSI_MBSObj_RemoveParams params;
+ return svc.meta_be->remove(ctx.get(), key, params, objv_tracker, y, dpp);
+}
+
+int RGWSI_Bucket_SObj::read_bucket_instance_info(RGWSI_Bucket_BI_Ctx& ctx,
+                                                 const string& key,
+                                                 RGWBucketInfo *info,
+                                                 real_time *pmtime, map<string, bufferlist> *pattrs,
+                                                 optional_yield y,
+                                                 const DoutPrefixProvider *dpp,
+                                                 rgw_cache_entry_info *cache_info,
+                                                 boost::optional<obj_version> refresh_version)
+{
+  string cache_key("bi/");
+  cache_key.append(key);
+
+  if (auto e = binfo_cache->find(cache_key)) {
+    if (refresh_version &&
+        e->info.objv_tracker.read_version.compare(&(*refresh_version))) {
+      ldpp_dout(dpp, -1) << "WARNING: The bucket info cache is inconsistent. This is "
+                 << "a failure that should be debugged. I am a nice machine, "
+                 << "so I will try to recover." << dendl;
+      /* entries are keyed by cache_key ("bi/" + key), not by the raw key */
+      binfo_cache->invalidate(cache_key);
+    } else {
+      *info = e->info;
+      if (pattrs)
+	*pattrs = e->attrs;
+      if (pmtime)
+	*pmtime = e->mtime;
+      return 0;
+    }
+  }
+
+  bucket_info_cache_entry e;
+  rgw_cache_entry_info ci;
+
+  int ret = do_read_bucket_instance_info(ctx, key,
+                                  &e.info, &e.mtime, &e.attrs,
+                                  &ci, refresh_version, y, dpp);
+  *info = e.info;
+
+  if (ret < 0) {
+    if (ret != -ENOENT) {
+      ldpp_dout(dpp, -1) << "ERROR: do_read_bucket_instance_info failed: " << ret << dendl;
+    } else {
+      ldpp_dout(dpp, 20) << "do_read_bucket_instance_info, bucket instance not found (key=" << key << ")" << dendl;
+    }
+    return ret;
+  }
+
+  if (pmtime) {
+    *pmtime = e.mtime;
+  }
+  if (pattrs) {
+    *pattrs = e.attrs;
+  }
+  if (cache_info) {
+    *cache_info = ci;
+  }
+
+  /* chain to only bucket instance and *not* bucket entrypoint */
+  if (!binfo_cache->put(dpp, svc.cache, cache_key, &e, {&ci})) {
+    ldpp_dout(dpp, 20) << "couldn't put binfo cache entry, might have raced with data changes" << dendl;
+  }
+
+  if (refresh_version &&
+      refresh_version->compare(&info->objv_tracker.read_version)) {
+    ldpp_dout(dpp, -1) << "WARNING: The OSD has the same version I have. Something may "
+               << "have gone squirrelly. An administrator may have forced a "
+               << "change; otherwise there is a problem somewhere." << dendl;
+  }
+
+  return 0;
+}
+
+int RGWSI_Bucket_SObj::do_read_bucket_instance_info(RGWSI_Bucket_BI_Ctx& ctx,
+ const string& key,
+ RGWBucketInfo *info,
+ real_time *pmtime, map<string, bufferlist> *pattrs,
+ rgw_cache_entry_info *cache_info,
+ boost::optional<obj_version> refresh_version,
+ optional_yield y,
+ const DoutPrefixProvider *dpp)
+{
+ bufferlist bl;
+ RGWObjVersionTracker ot;
+
+ auto params = RGWSI_MBSObj_GetParams(&bl, pattrs, pmtime).set_cache_info(cache_info)
+ .set_refresh_version(refresh_version);
+
+ int ret = svc.meta_be->get_entry(ctx.get(), key, params, &ot, y, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ auto iter = bl.cbegin();
+ try {
+ decode(*info, iter);
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl;
+ return -EIO;
+ }
+ info->objv_tracker = ot;
+ return 0;
+}
+
+int RGWSI_Bucket_SObj::read_bucket_info(RGWSI_Bucket_X_Ctx& ctx,
+ const rgw_bucket& bucket,
+ RGWBucketInfo *info,
+ real_time *pmtime,
+ map<string, bufferlist> *pattrs,
+ boost::optional<obj_version> refresh_version,
+ optional_yield y,
+ const DoutPrefixProvider *dpp)
+{
+ rgw_cache_entry_info cache_info;
+
+ if (!bucket.bucket_id.empty()) {
+ return read_bucket_instance_info(ctx.bi, get_bi_meta_key(bucket),
+ info,
+ pmtime, pattrs,
+ y,
+ dpp,
+ &cache_info, refresh_version);
+ }
+
+ string bucket_entry = get_entrypoint_meta_key(bucket);
+ string cache_key("b/");
+ cache_key.append(bucket_entry);
+
+ if (auto e = binfo_cache->find(cache_key)) {
+ bool found_version = (bucket.bucket_id.empty() ||
+ bucket.bucket_id == e->info.bucket.bucket_id);
+
+ if (!found_version ||
+ (refresh_version &&
+ e->info.objv_tracker.read_version.compare(&(*refresh_version)))) {
+ ldpp_dout(dpp, -1) << "WARNING: The bucket info cache is inconsistent. This is "
+ << "a failure that should be debugged. I am a nice machine, "
+ << "so I will try to recover." << dendl;
+ binfo_cache->invalidate(cache_key);
+ } else {
+ *info = e->info;
+ if (pattrs)
+ *pattrs = e->attrs;
+ if (pmtime)
+ *pmtime = e->mtime;
+ return 0;
+ }
+ }
+
+ RGWBucketEntryPoint entry_point;
+ real_time ep_mtime;
+ RGWObjVersionTracker ot;
+ rgw_cache_entry_info entry_cache_info;
+ int ret = read_bucket_entrypoint_info(ctx.ep, bucket_entry,
+ &entry_point, &ot, &ep_mtime, pattrs,
+ y,
+ dpp,
+ &entry_cache_info, refresh_version);
+ if (ret < 0) {
+ /* only init these fields */
+ info->bucket = bucket;
+ return ret;
+ }
+
+ if (entry_point.has_bucket_info) {
+ *info = entry_point.old_bucket_info;
+ info->bucket.tenant = bucket.tenant;
+ ldpp_dout(dpp, 20) << "rgw_get_bucket_info: old bucket info, bucket=" << info->bucket << " owner " << info->owner << dendl;
+ return 0;
+ }
+
+ /* data is in the bucket instance object, we need to get attributes from there, clear everything
+ * that we got
+ */
+ if (pattrs) {
+ pattrs->clear();
+ }
+
+ ldpp_dout(dpp, 20) << "rgw_get_bucket_info: bucket instance: " << entry_point.bucket << dendl;
+
+
+ /* read bucket instance info */
+
+ bucket_info_cache_entry e;
+
+ ret = read_bucket_instance_info(ctx.bi, get_bi_meta_key(entry_point.bucket),
+ &e.info, &e.mtime, &e.attrs,
+ y,
+ dpp,
+ &cache_info, refresh_version);
+ *info = e.info;
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: read_bucket_instance_from_oid failed: " << ret << dendl;
+ info->bucket = bucket;
+ // XXX and why return anything in case of an error anyway?
+ return ret;
+ }
+
+ if (pmtime)
+ *pmtime = e.mtime;
+ if (pattrs)
+ *pattrs = e.attrs;
+
+ /* chain to both bucket entry point and bucket instance */
+ if (!binfo_cache->put(dpp, svc.cache, cache_key, &e, {&entry_cache_info, &cache_info})) {
+ ldpp_dout(dpp, 20) << "couldn't put binfo cache entry, might have raced with data changes" << dendl;
+ }
+
+ if (refresh_version &&
+ refresh_version->compare(&info->objv_tracker.read_version)) {
+ ldpp_dout(dpp, -1) << "WARNING: The OSD has the same version I have. Something may "
+ << "have gone squirrelly. An administrator may have forced a "
+ << "change; otherwise there is a problem somewhere." << dendl;
+ }
+
+ return 0;
+}
+
+
+int RGWSI_Bucket_SObj::store_bucket_instance_info(RGWSI_Bucket_BI_Ctx& ctx,
+ const string& key,
+ RGWBucketInfo& info,
+ std::optional<RGWBucketInfo *> orig_info,
+ bool exclusive,
+ real_time mtime,
+ map<string, bufferlist> *pattrs,
+ optional_yield y,
+ const DoutPrefixProvider *dpp)
+{
+ bufferlist bl;
+ encode(info, bl);
+
+ /*
+ * we might need some special handling if overwriting
+ */
+ RGWBucketInfo shared_bucket_info;
+ if (!orig_info && !exclusive) { /* if exclusive, we're going to fail when try
+ to overwrite, so the whole check here is moot */
+ /*
+ * we're here because orig_info wasn't passed in
+ * we don't have info about what was there before, so need to fetch first
+ */
+ int r = read_bucket_instance_info(ctx,
+ key,
+ &shared_bucket_info,
+ nullptr, nullptr,
+ y,
+ dpp,
+ nullptr, boost::none);
+ if (r < 0) {
+ if (r != -ENOENT) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): read_bucket_instance_info() of key=" << key << " returned r=" << r << dendl;
+ return r;
+ }
+ } else {
+ orig_info = &shared_bucket_info;
+ }
+ }
+
+ if (orig_info && *orig_info && !exclusive) {
+ int r = svc.bi->handle_overwrite(dpp, info, *(orig_info.value()), y);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): svc.bi->handle_overwrite() of key=" << key << " returned r=" << r << dendl;
+ return r;
+ }
+ }
+
+ RGWSI_MBSObj_PutParams params(bl, pattrs, mtime, exclusive);
+
+ int ret = svc.meta_be->put(ctx.get(), key, params, &info.objv_tracker, y, dpp);
+
+ if (ret >= 0) {
+ int r = svc.bucket_sync->handle_bi_update(dpp, info,
+ orig_info.value_or(nullptr),
+ y);
+ if (r < 0) {
+ return r;
+ }
+ } else if (ret == -EEXIST) {
+ /* well, if it's exclusive we shouldn't overwrite it, because we might race with another
+ * bucket operation on this specific bucket (e.g., being synced from the master), but
+ * since bucket instance meta object is unique for this specific bucket instance, we don't
+ * need to return an error.
+ * A scenario where we'd get -EEXIST here, is in a multi-zone config, we're not on the
+ * master, creating a bucket, sending bucket creation to the master, we create the bucket
+ * locally, while in the sync thread we sync the new bucket.
+ */
+ ret = 0;
+ }
+
+ if (ret < 0) {
+ return ret;
+ }
+
+ return ret;
+}
+
+int RGWSI_Bucket_SObj::remove_bucket_instance_info(RGWSI_Bucket_BI_Ctx& ctx,
+ const string& key,
+ const RGWBucketInfo& info,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y,
+ const DoutPrefixProvider *dpp)
+{
+ RGWSI_MBSObj_RemoveParams params;
+ int ret = svc.meta_be->remove_entry(dpp, ctx.get(), key, params, objv_tracker, y);
+
+ if (ret < 0 &&
+ ret != -ENOENT) {
+ return ret;
+ }
+
+ int r = svc.bucket_sync->handle_bi_removal(dpp, info, y);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to update bucket instance sync index: r=" << r << dendl;
+ /* returning success as index is just keeping hints, so will keep extra hints,
+ * but bucket removal succeeded
+ */
+ }
+
+ return 0;
+}
+
+int RGWSI_Bucket_SObj::read_bucket_stats(const RGWBucketInfo& bucket_info,
+ RGWBucketEnt *ent,
+ optional_yield y,
+ const DoutPrefixProvider *dpp)
+{
+ ent->count = 0;
+ ent->size = 0;
+ ent->size_rounded = 0;
+
+ vector<rgw_bucket_dir_header> headers;
+
+ int r = svc.bi->read_stats(dpp, bucket_info, ent, y);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): read_stats returned r=" << r << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+int RGWSI_Bucket_SObj::read_bucket_stats(RGWSI_Bucket_X_Ctx& ctx,
+ const rgw_bucket& bucket,
+ RGWBucketEnt *ent,
+ optional_yield y,
+ const DoutPrefixProvider *dpp)
+{
+ RGWBucketInfo bucket_info;
+ int ret = read_bucket_info(ctx, bucket, &bucket_info, nullptr, nullptr, boost::none, y, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return read_bucket_stats(bucket_info, ent, y, dpp);
+}
+
+int RGWSI_Bucket_SObj::read_buckets_stats(RGWSI_Bucket_X_Ctx& ctx,
+ map<string, RGWBucketEnt>& m,
+ optional_yield y,
+ const DoutPrefixProvider *dpp)
+{
+ map<string, RGWBucketEnt>::iterator iter;
+ for (iter = m.begin(); iter != m.end(); ++iter) {
+ RGWBucketEnt& ent = iter->second;
+ int r = read_bucket_stats(ctx, ent.bucket, &ent, y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): read_bucket_stats returned r=" << r << dendl;
+ return r;
+ }
+ }
+
+ return m.size();
+}
diff --git a/src/rgw/services/svc_bucket_sobj.h b/src/rgw/services/svc_bucket_sobj.h
new file mode 100644
index 000000000..8e9fe063c
--- /dev/null
+++ b/src/rgw/services/svc_bucket_sobj.h
@@ -0,0 +1,180 @@
+
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#pragma once
+
+#include "rgw_service.h"
+
+#include "svc_meta_be.h"
+#include "svc_bucket_types.h"
+#include "svc_bucket.h"
+#include "svc_bucket_sync.h"
+
+class RGWSI_Zone;
+class RGWSI_SysObj;
+class RGWSI_SysObj_Cache;
+class RGWSI_Meta;
+class RGWSI_SyncModules;
+
+struct rgw_cache_entry_info;
+
+template <class T>
+class RGWChainedCacheImpl;
+
+class RGWSI_Bucket_SObj : public RGWSI_Bucket
+{
+ struct bucket_info_cache_entry {
+ RGWBucketInfo info;
+ real_time mtime;
+ std::map<std::string, bufferlist> attrs;
+ };
+
+ using RGWChainedCacheImpl_bucket_info_cache_entry = RGWChainedCacheImpl<bucket_info_cache_entry>;
+ std::unique_ptr<RGWChainedCacheImpl_bucket_info_cache_entry> binfo_cache;
+
+ RGWSI_Bucket_BE_Handler ep_be_handler;
+ std::unique_ptr<RGWSI_MetaBackend::Module> ep_be_module;
+ RGWSI_BucketInstance_BE_Handler bi_be_handler;
+ std::unique_ptr<RGWSI_MetaBackend::Module> bi_be_module;
+
+ int do_start(optional_yield, const DoutPrefixProvider *dpp) override;
+
+ int do_read_bucket_instance_info(RGWSI_Bucket_BI_Ctx& ctx,
+ const std::string& key,
+ RGWBucketInfo *info,
+ real_time *pmtime,
+ std::map<std::string, bufferlist> *pattrs,
+ rgw_cache_entry_info *cache_info,
+ boost::optional<obj_version> refresh_version,
+ optional_yield y,
+ const DoutPrefixProvider *dpp);
+
+ int read_bucket_stats(const RGWBucketInfo& bucket_info,
+ RGWBucketEnt *ent,
+ optional_yield y,
+ const DoutPrefixProvider *dpp);
+
+public:
+ struct Svc {
+ RGWSI_Bucket_SObj *bucket{nullptr};
+ RGWSI_BucketIndex *bi{nullptr};
+ RGWSI_Zone *zone{nullptr};
+ RGWSI_SysObj *sysobj{nullptr};
+ RGWSI_SysObj_Cache *cache{nullptr};
+ RGWSI_Meta *meta{nullptr};
+ RGWSI_MetaBackend *meta_be{nullptr};
+ RGWSI_SyncModules *sync_modules{nullptr};
+ RGWSI_Bucket_Sync *bucket_sync{nullptr};
+ } svc;
+
+ RGWSI_Bucket_SObj(CephContext *cct);
+ ~RGWSI_Bucket_SObj();
+
+ RGWSI_Bucket_BE_Handler& get_ep_be_handler() override {
+ return ep_be_handler;
+ }
+
+ RGWSI_BucketInstance_BE_Handler& get_bi_be_handler() override {
+ return bi_be_handler;
+ }
+
+ void init(RGWSI_Zone *_zone_svc,
+ RGWSI_SysObj *_sysobj_svc,
+ RGWSI_SysObj_Cache *_cache_svc,
+ RGWSI_BucketIndex *_bi,
+ RGWSI_Meta *_meta_svc,
+ RGWSI_MetaBackend *_meta_be_svc,
+ RGWSI_SyncModules *_sync_modules_svc,
+ RGWSI_Bucket_Sync *_bucket_sync_svc);
+
+
+ int read_bucket_entrypoint_info(RGWSI_Bucket_EP_Ctx& ctx,
+ const std::string& key,
+ RGWBucketEntryPoint *entry_point,
+ RGWObjVersionTracker *objv_tracker,
+ real_time *pmtime,
+ std::map<std::string, bufferlist> *pattrs,
+ optional_yield y,
+ const DoutPrefixProvider *dpp,
+ rgw_cache_entry_info *cache_info = nullptr,
+ boost::optional<obj_version> refresh_version = boost::none) override;
+
+ int store_bucket_entrypoint_info(RGWSI_Bucket_EP_Ctx& ctx,
+ const std::string& key,
+ RGWBucketEntryPoint& info,
+ bool exclusive,
+ real_time mtime,
+ std::map<std::string, bufferlist> *pattrs,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y,
+ const DoutPrefixProvider *dpp) override;
+
+ int remove_bucket_entrypoint_info(RGWSI_Bucket_EP_Ctx& ctx,
+ const std::string& key,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y,
+ const DoutPrefixProvider *dpp) override;
+
+ int read_bucket_instance_info(RGWSI_Bucket_BI_Ctx& ctx,
+ const std::string& key,
+ RGWBucketInfo *info,
+ real_time *pmtime,
+ std::map<std::string, bufferlist> *pattrs,
+ optional_yield y,
+ const DoutPrefixProvider *dpp,
+ rgw_cache_entry_info *cache_info = nullptr,
+ boost::optional<obj_version> refresh_version = boost::none) override;
+
+ int read_bucket_info(RGWSI_Bucket_X_Ctx& ep_ctx,
+ const rgw_bucket& bucket,
+ RGWBucketInfo *info,
+ real_time *pmtime,
+ std::map<std::string, bufferlist> *pattrs,
+ boost::optional<obj_version> refresh_version,
+ optional_yield y,
+ const DoutPrefixProvider *dpp) override;
+
+ int store_bucket_instance_info(RGWSI_Bucket_BI_Ctx& ctx,
+ const std::string& key,
+ RGWBucketInfo& info,
+ std::optional<RGWBucketInfo *> orig_info, /* nullopt: orig_info was not fetched,
+                       nullptr: orig_info was not found (new bucket instance) */
+ bool exclusive,
+ real_time mtime,
+ std::map<std::string, bufferlist> *pattrs,
+ optional_yield y,
+ const DoutPrefixProvider *dpp) override;
+
+ int remove_bucket_instance_info(RGWSI_Bucket_BI_Ctx& ctx,
+ const std::string& key,
+ const RGWBucketInfo& bucket_info,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y,
+ const DoutPrefixProvider *dpp) override;
+
+ int read_bucket_stats(RGWSI_Bucket_X_Ctx& ctx,
+ const rgw_bucket& bucket,
+ RGWBucketEnt *ent,
+ optional_yield y,
+ const DoutPrefixProvider *dpp) override;
+
+ int read_buckets_stats(RGWSI_Bucket_X_Ctx& ctx,
+ std::map<std::string, RGWBucketEnt>& m,
+ optional_yield y,
+ const DoutPrefixProvider *dpp) override;
+};
+
diff --git a/src/rgw/services/svc_bucket_sync.h b/src/rgw/services/svc_bucket_sync.h
new file mode 100644
index 000000000..7975e062b
--- /dev/null
+++ b/src/rgw/services/svc_bucket_sync.h
@@ -0,0 +1,55 @@
+
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#pragma once
+
+#include "rgw_service.h"
+
+#include "svc_bucket_types.h"
+
+class RGWBucketSyncPolicyHandler;
+using RGWBucketSyncPolicyHandlerRef = std::shared_ptr<RGWBucketSyncPolicyHandler>;
+
+
+class RGWSI_Bucket_Sync : public RGWServiceInstance
+{
+public:
+ RGWSI_Bucket_Sync(CephContext *cct) : RGWServiceInstance(cct) {}
+
+ virtual int get_policy_handler(RGWSI_Bucket_X_Ctx& ctx,
+ std::optional<rgw_zone_id> zone,
+ std::optional<rgw_bucket> bucket,
+ RGWBucketSyncPolicyHandlerRef *handler,
+ optional_yield y,
+ const DoutPrefixProvider *dpp) = 0;
+
+ virtual int handle_bi_update(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info,
+ RGWBucketInfo *orig_bucket_info,
+ optional_yield y) = 0;
+ virtual int handle_bi_removal(const DoutPrefixProvider *dpp,
+ const RGWBucketInfo& bucket_info,
+ optional_yield y) = 0;
+
+ virtual int get_bucket_sync_hints(const DoutPrefixProvider *dpp,
+ const rgw_bucket& bucket,
+ std::set<rgw_bucket> *sources,
+ std::set<rgw_bucket> *dests,
+ optional_yield y) = 0;
+};
+
+
diff --git a/src/rgw/services/svc_bucket_sync_sobj.cc b/src/rgw/services/svc_bucket_sync_sobj.cc
new file mode 100644
index 000000000..ea3398a3f
--- /dev/null
+++ b/src/rgw/services/svc_bucket_sync_sobj.cc
@@ -0,0 +1,903 @@
+#include "svc_bucket_sync_sobj.h"
+#include "svc_zone.h"
+#include "svc_sys_obj_cache.h"
+#include "svc_bucket_sobj.h"
+
+#include "rgw_bucket_sync.h"
+#include "rgw_zone.h"
+#include "rgw_sync_policy.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+static string bucket_sync_sources_oid_prefix = "bucket.sync-source-hints";
+static string bucket_sync_targets_oid_prefix = "bucket.sync-target-hints";
+
+class RGWSI_Bucket_Sync_SObj_HintIndexManager {
+ CephContext *cct;
+
+ struct {
+ RGWSI_Zone *zone;
+ RGWSI_SysObj *sysobj;
+ } svc;
+
+public:
+ RGWSI_Bucket_Sync_SObj_HintIndexManager(RGWSI_Zone *_zone_svc,
+ RGWSI_SysObj *_sysobj_svc) {
+ svc.zone = _zone_svc;
+ svc.sysobj = _sysobj_svc;
+
+ cct = svc.zone->ctx();
+ }
+
+ rgw_raw_obj get_sources_obj(const rgw_bucket& bucket) const;
+ rgw_raw_obj get_dests_obj(const rgw_bucket& bucket) const;
+
+ template <typename C1, typename C2>
+ int update_hints(const DoutPrefixProvider *dpp,
+ const RGWBucketInfo& bucket_info,
+ C1& added_dests,
+ C2& removed_dests,
+ C1& added_sources,
+ C2& removed_sources,
+ optional_yield y);
+};
+
+RGWSI_Bucket_Sync_SObj::RGWSI_Bucket_Sync_SObj(CephContext *cct) : RGWSI_Bucket_Sync(cct) {
+}
+RGWSI_Bucket_Sync_SObj::~RGWSI_Bucket_Sync_SObj() {
+}
+
+void RGWSI_Bucket_Sync_SObj::init(RGWSI_Zone *_zone_svc,
+ RGWSI_SysObj *_sysobj_svc,
+ RGWSI_SysObj_Cache *_cache_svc,
+ RGWSI_Bucket_SObj *bucket_sobj_svc)
+{
+ svc.zone = _zone_svc;
+ svc.sysobj = _sysobj_svc;
+ svc.cache = _cache_svc;
+ svc.bucket_sobj = bucket_sobj_svc;
+
+ hint_index_mgr.reset(new RGWSI_Bucket_Sync_SObj_HintIndexManager(svc.zone, svc.sysobj));
+}
+
+int RGWSI_Bucket_Sync_SObj::do_start(optional_yield, const DoutPrefixProvider *dpp)
+{
+ sync_policy_cache.reset(new RGWChainedCacheImpl<bucket_sync_policy_cache_entry>);
+ sync_policy_cache->init(svc.cache);
+
+ return 0;
+}
+
+void RGWSI_Bucket_Sync_SObj::get_hint_entities(RGWSI_Bucket_X_Ctx& ctx,
+ const std::set<rgw_zone_id>& zones,
+ const std::set<rgw_bucket>& buckets,
+ std::set<rgw_sync_bucket_entity> *hint_entities,
+ optional_yield y, const DoutPrefixProvider *dpp)
+{
+ vector<rgw_bucket> hint_buckets;
+
+ hint_buckets.reserve(buckets.size());
+
+ for (auto& b : buckets) {
+ RGWBucketInfo hint_bucket_info;
+ int ret = svc.bucket_sobj->read_bucket_info(ctx, b, &hint_bucket_info,
+ nullptr, nullptr, boost::none,
+ y, dpp);
+ if (ret < 0) {
+ ldpp_dout(dpp, 20) << "could not init bucket info for hint bucket=" << b << " ... skipping" << dendl;
+ continue;
+ }
+
+ hint_buckets.emplace_back(std::move(hint_bucket_info.bucket));
+ }
+
+ for (auto& zone : zones) {
+ for (auto& b : hint_buckets) {
+ hint_entities->insert(rgw_sync_bucket_entity(zone, b));
+ }
+ }
+}
+
+int RGWSI_Bucket_Sync_SObj::resolve_policy_hints(RGWSI_Bucket_X_Ctx& ctx,
+ rgw_sync_bucket_entity& self_entity,
+ RGWBucketSyncPolicyHandlerRef& handler,
+ RGWBucketSyncPolicyHandlerRef& zone_policy_handler,
+ std::map<optional_zone_bucket, RGWBucketSyncPolicyHandlerRef>& temp_map,
+ optional_yield y,
+ const DoutPrefixProvider *dpp)
+{
+ set<rgw_zone_id> source_zones;
+ set<rgw_zone_id> target_zones;
+
+ zone_policy_handler->reflect(dpp, nullptr, nullptr,
+ nullptr, nullptr,
+ &source_zones,
+ &target_zones,
+ false); /* relaxed: also get all zones that we allow to sync to/from */
+
+ std::set<rgw_sync_bucket_entity> hint_entities;
+
+ get_hint_entities(ctx, source_zones, handler->get_source_hints(), &hint_entities, y, dpp);
+ get_hint_entities(ctx, target_zones, handler->get_target_hints(), &hint_entities, y, dpp);
+
+ std::set<rgw_sync_bucket_pipe> resolved_sources;
+ std::set<rgw_sync_bucket_pipe> resolved_dests;
+
+ for (auto& hint_entity : hint_entities) {
+ if (!hint_entity.zone ||
+ !hint_entity.bucket) {
+ continue; /* shouldn't really happen */
+ }
+
+ auto& zid = *hint_entity.zone;
+ auto& hint_bucket = *hint_entity.bucket;
+
+ RGWBucketSyncPolicyHandlerRef hint_bucket_handler;
+
+ auto iter = temp_map.find(optional_zone_bucket(zid, hint_bucket));
+ if (iter != temp_map.end()) {
+ hint_bucket_handler = iter->second;
+ } else {
+ int r = do_get_policy_handler(ctx, zid, hint_bucket, temp_map, &hint_bucket_handler, y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 20) << "could not get bucket sync policy handler for hint bucket=" << hint_bucket << " ... skipping" << dendl;
+ continue;
+ }
+ }
+
+ hint_bucket_handler->get_pipes(&resolved_dests,
+ &resolved_sources,
+ self_entity); /* flipping resolved dests and sources as these are
+ relative to the remote entity */
+ }
+
+ handler->set_resolved_hints(std::move(resolved_sources), std::move(resolved_dests));
+
+ return 0;
+}
+
+int RGWSI_Bucket_Sync_SObj::do_get_policy_handler(RGWSI_Bucket_X_Ctx& ctx,
+ std::optional<rgw_zone_id> zone,
+ std::optional<rgw_bucket> _bucket,
+ std::map<optional_zone_bucket, RGWBucketSyncPolicyHandlerRef>& temp_map,
+ RGWBucketSyncPolicyHandlerRef *handler,
+ optional_yield y,
+ const DoutPrefixProvider *dpp)
+{
+ if (!_bucket) {
+ *handler = svc.zone->get_sync_policy_handler(zone);
+ return 0;
+ }
+
+ auto bucket = *_bucket;
+
+ if (bucket.bucket_id.empty()) {
+ RGWBucketEntryPoint ep_info;
+ int ret = svc.bucket_sobj->read_bucket_entrypoint_info(ctx.ep,
+ RGWSI_Bucket::get_entrypoint_meta_key(bucket),
+ &ep_info,
+ nullptr, /* objv_tracker */
+ nullptr, /* mtime */
+ nullptr, /* attrs */
+ y,
+ dpp,
+ nullptr, /* cache_info */
+ boost::none /* refresh_version */);
+ if (ret < 0) {
+ if (ret != -ENOENT) {
+ ldout(cct, 0) << "ERROR: svc.bucket->read_bucket_info(bucket=" << bucket << ") returned r=" << ret << dendl;
+ }
+ return ret;
+ }
+
+ bucket = ep_info.bucket;
+ }
+
+ string zone_key;
+ string bucket_key;
+
+ if (zone && *zone != svc.zone->zone_id()) {
+ zone_key = zone->id;
+ }
+
+ bucket_key = RGWSI_Bucket::get_bi_meta_key(bucket);
+
+ string cache_key("bi/" + zone_key + "/" + bucket_key);
+
+ if (auto e = sync_policy_cache->find(cache_key)) {
+ *handler = e->handler;
+ return 0;
+ }
+
+ bucket_sync_policy_cache_entry e;
+ rgw_cache_entry_info cache_info;
+
+ RGWBucketInfo bucket_info;
+ map<string, bufferlist> attrs;
+
+ int r = svc.bucket_sobj->read_bucket_instance_info(ctx.bi,
+ bucket_key,
+ &bucket_info,
+ nullptr,
+ &attrs,
+ y,
+ dpp,
+ &cache_info);
+ if (r < 0) {
+ if (r != -ENOENT) {
+ ldpp_dout(dpp, 0) << "ERROR: svc.bucket->read_bucket_instance_info(key=" << bucket_key << ") returned r=" << r << dendl;
+ }
+ return r;
+ }
+
+ auto zone_policy_handler = svc.zone->get_sync_policy_handler(zone);
+ if (!zone_policy_handler) {
+ ldpp_dout(dpp, 20) << "ERROR: could not find policy handler for zone=" << zone << dendl;
+ return -ENOENT;
+ }
+
+ e.handler.reset(zone_policy_handler->alloc_child(bucket_info, std::move(attrs)));
+
+ r = e.handler->init(dpp, y);
+ if (r < 0) {
+ ldpp_dout(dpp, 20) << "ERROR: failed to init bucket sync policy handler: r=" << r << dendl;
+ return r;
+ }
+
+ temp_map.emplace(optional_zone_bucket{zone, bucket}, e.handler);
+
+ rgw_sync_bucket_entity self_entity(zone.value_or(svc.zone->zone_id()), bucket);
+
+ r = resolve_policy_hints(ctx, self_entity,
+ e.handler,
+ zone_policy_handler,
+ temp_map, y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 20) << "ERROR: failed to resolve policy hints: bucket_key=" << bucket_key << ", r=" << r << dendl;
+ return r;
+ }
+
+ if (!sync_policy_cache->put(dpp, svc.cache, cache_key, &e, {&cache_info})) {
+ ldpp_dout(dpp, 20) << "couldn't put bucket_sync_policy cache entry, might have raced with data changes" << dendl;
+ }
+
+ *handler = e.handler;
+
+ return 0;
+}
+
+int RGWSI_Bucket_Sync_SObj::get_policy_handler(RGWSI_Bucket_X_Ctx& ctx,
+ std::optional<rgw_zone_id> zone,
+ std::optional<rgw_bucket> _bucket,
+ RGWBucketSyncPolicyHandlerRef *handler,
+ optional_yield y,
+ const DoutPrefixProvider *dpp)
+{
+ std::map<optional_zone_bucket, RGWBucketSyncPolicyHandlerRef> temp_map;
+ return do_get_policy_handler(ctx, zone, _bucket, temp_map, handler, y, dpp);
+}
+
/**
 * Compute the difference between two ordered sets.
 *
 * Walks @orig_set and @new_set in lockstep (both are sorted), appending
 * elements present only in @new_set to *added and elements present only
 * in @orig_set to *removed.  Neither output vector is cleared first.
 *
 * Generalized over the element type (any type with `==` and `<`) so the
 * helper is reusable and unit-testable; callers in this file instantiate
 * it with rgw_bucket, exactly as before.
 *
 * @return true if any difference was found, false if the sets are equal.
 */
template <typename T>
static bool diff_sets(std::set<T>& orig_set,
                      std::set<T>& new_set,
                      std::vector<T> *added,
                      std::vector<T> *removed)
{
  auto oiter = orig_set.begin();
  auto niter = new_set.begin();

  while (oiter != orig_set.end() &&
         niter != new_set.end()) {
    if (*oiter == *niter) {
      ++oiter;
      ++niter;
    } else if (*oiter < *niter) {
      /* *oiter cannot appear later in new_set: it was removed */
      removed->push_back(*oiter);
      ++oiter;
    } else {
      /* *niter cannot appear later in orig_set: it was added */
      added->push_back(*niter);
      ++niter;
    }
  }
  /* whatever remains in either set was never matched */
  removed->insert(removed->end(), oiter, orig_set.end());
  added->insert(added->end(), niter, new_set.end());

  return !(removed->empty() && added->empty());
}
+
+
/*
 * A hint index persisted in a single system object.
 *
 * For each "entity" bucket it stores the set of related buckets, each
 * tagged with the version of the bucket info ("info source") that
 * installed the hint.  The stored versions let racing writers discard
 * stale updates (see bi_entry::add()/remove()).
 *
 * All mutations go through update(), a read-modify-write guarded by an
 * RGWObjVersionTracker that retries when flush() reports an -ECANCELED
 * version race.
 */
class RGWSI_BS_SObj_HintIndexObj
{
  friend class RGWSI_Bucket_Sync_SObj;

  CephContext *cct;
  struct {
    RGWSI_SysObj *sysobj;
  } svc;

  rgw_raw_obj obj;   // location of the backing system object
  RGWSysObj sysobj;  // handle used for all reads/writes of `obj`

  RGWObjVersionTracker ot;  // version seen by the last read(); guards flush()

  bool has_data{false};  // true once `info` holds a successfully decoded copy

public:
  /* Hints recorded for one bucket: every info-source bucket that
   * referenced it, mapped to the info version that produced the hint. */
  struct bi_entry {
    rgw_bucket bucket;
    map<rgw_bucket /* info_source */, obj_version> sources;

    void encode(bufferlist& bl) const {
      ENCODE_START(1, 1, bl);
      encode(bucket, bl);
      encode(sources, bl);
      ENCODE_FINISH(bl);
    }

    void decode(bufferlist::const_iterator& bl) {
      DECODE_START(1, bl);
      decode(bucket, bl);
      decode(sources, bl);
      DECODE_FINISH(bl);
    }

    /* Record/refresh the hint coming from info_source.  Returns false if
     * the stored version is identical, or is newer within the same
     * version tag (i.e. the incoming update is stale). */
    bool add(const rgw_bucket& info_source,
             const obj_version& info_source_ver) {
      auto& ver = sources[info_source];

      if (ver == info_source_ver) { /* already updated */
        return false;
      }

      if (info_source_ver.tag == ver.tag &&
          info_source_ver.ver < ver.ver) {
        return false;
      }

      ver = info_source_ver;

      return true;
    }

    /* Drop the hint coming from info_source, unless the stored entry is
     * newer within the same version tag (stale removal).  Returns true
     * if an entry was actually erased. */
    bool remove(const rgw_bucket& info_source,
                const obj_version& info_source_ver) {
      auto iter = sources.find(info_source);
      if (iter == sources.end()) {
        return false;
      }

      auto& ver = iter->second;

      if (info_source_ver.tag == ver.tag &&
          info_source_ver.ver < ver.ver) {
        return false;
      }

      sources.erase(info_source);
      return true;
    }

    bool empty() const {
      return sources.empty();
    }
  };

  /* All hint entries recorded against one entity bucket instance. */
  struct single_instance_info {
    map<rgw_bucket, bi_entry> entries;

    void encode(bufferlist& bl) const {
      ENCODE_START(1, 1, bl);
      encode(entries, bl);
      ENCODE_FINISH(bl);
    }

    void decode(bufferlist::const_iterator& bl) {
      DECODE_START(1, bl);
      decode(entries, bl);
      DECODE_FINISH(bl);
    }

    /* Returns true if the entry was added or its version advanced. */
    bool add_entry(const rgw_bucket& info_source,
                   const obj_version& info_source_ver,
                   const rgw_bucket& bucket) {
      auto& entry = entries[bucket];

      if (!entry.add(info_source, info_source_ver)) {
        return false;
      }

      entry.bucket = bucket;

      return true;
    }

    /* Returns true if the hint was removed; erases the whole entry once
     * its last info source goes away. */
    bool remove_entry(const rgw_bucket& info_source,
                      const obj_version& info_source_ver,
                      const rgw_bucket& bucket) {
      auto iter = entries.find(bucket);
      if (iter == entries.end()) {
        return false;
      }

      if (!iter->second.remove(info_source, info_source_ver)) {
        return false;
      }

      if (iter->second.empty()) {
        entries.erase(iter);
      }

      return true;
    }

    void clear() {
      entries.clear();
    }

    bool empty() const {
      return entries.empty();
    }

    /* Collect the set of hinted buckets into *result. */
    void get_entities(std::set<rgw_bucket> *result) const {
      for (auto& iter : entries) {
        result->insert(iter.first);
      }
    }
  };

  /* On-disk payload of the object: entity bucket -> its hint entries. */
  struct info_map {
    map<rgw_bucket, single_instance_info> instances;

    void encode(bufferlist& bl) const {
      ENCODE_START(1, 1, bl);
      encode(instances, bl);
      ENCODE_FINISH(bl);
    }

    void decode(bufferlist::const_iterator& bl) {
      DECODE_START(1, bl);
      decode(instances, bl);
      DECODE_FINISH(bl);
    }

    bool empty() const {
      return instances.empty();
    }

    void clear() {
      instances.clear();
    }

    /* Merge the hinted buckets for `bucket` into *result (no-op when the
     * bucket has no entry). */
    void get_entities(const rgw_bucket& bucket,
                      std::set<rgw_bucket> *result) const {
      auto iter = instances.find(bucket);
      if (iter == instances.end()) {
        return;
      }
      iter->second.get_entities(result);
    }
  } info;

  RGWSI_BS_SObj_HintIndexObj(RGWSI_SysObj *_sysobj_svc,
                             const rgw_raw_obj& _obj) : cct(_sysobj_svc->ctx()),
                                                        obj(_obj),
                                                        sysobj(_sysobj_svc->get_obj(obj))
  {
    svc.sysobj = _sysobj_svc;
  }

  /* Read-modify-write: apply the add/remove hint lists for `entity`,
   * retrying on -ECANCELED version races.  Either list pointer may be
   * null. */
  template <typename C1, typename C2>
  int update(const DoutPrefixProvider *dpp,
             const rgw_bucket& entity,
             const RGWBucketInfo& info_source,
             C1 *add,
             C2 *remove,
             optional_yield y);

private:
  /* Apply removals then additions to a single instance's entries. */
  template <typename C1, typename C2>
  void update_entries(const rgw_bucket& info_source,
                      const obj_version& info_source_ver,
                      C1 *add,
                      C2 *remove,
                      single_instance_info *instance);

  int read(const DoutPrefixProvider *dpp, optional_yield y);
  int flush(const DoutPrefixProvider *dpp, optional_yield y);

  /* Drop cached state so the next update() iteration re-reads from disk. */
  void invalidate() {
    has_data = false;
    info.clear();
  }

  void get_entities(const rgw_bucket& bucket,
                    std::set<rgw_bucket> *result) const {
    info.get_entities(bucket, result);
  }
};
WRITE_CLASS_ENCODER(RGWSI_BS_SObj_HintIndexObj::bi_entry)
WRITE_CLASS_ENCODER(RGWSI_BS_SObj_HintIndexObj::single_instance_info)
WRITE_CLASS_ENCODER(RGWSI_BS_SObj_HintIndexObj::info_map)
+
+template <typename C1, typename C2>
+int RGWSI_BS_SObj_HintIndexObj::update(const DoutPrefixProvider *dpp,
+ const rgw_bucket& entity,
+ const RGWBucketInfo& info_source,
+ C1 *add,
+ C2 *remove,
+ optional_yield y)
+{
+ int r = 0;
+
+ auto& info_source_ver = info_source.objv_tracker.read_version;
+
+#define MAX_RETRIES 25
+
+ for (int i = 0; i < MAX_RETRIES; ++i) {
+ if (!has_data) {
+ r = read(dpp, y);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: cannot update hint index: failed to read: r=" << r << dendl;
+ return r;
+ }
+ }
+
+ auto& instance = info.instances[entity];
+
+ update_entries(info_source.bucket,
+ info_source_ver,
+ add, remove,
+ &instance);
+
+ if (instance.empty()) {
+ info.instances.erase(entity);
+ }
+
+ r = flush(dpp, y);
+ if (r >= 0) {
+ return 0;
+ }
+
+ if (r != -ECANCELED) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to flush hint index: obj=" << obj << " r=" << r << dendl;
+ return r;
+ }
+
+ invalidate();
+ }
+ ldpp_dout(dpp, 0) << "ERROR: failed to flush hint index: too many retries (obj=" << obj << "), likely a bug" << dendl;
+
+ return -EIO;
+}
+
+template <typename C1, typename C2>
+void RGWSI_BS_SObj_HintIndexObj::update_entries(const rgw_bucket& info_source,
+ const obj_version& info_source_ver,
+ C1 *add,
+ C2 *remove,
+ single_instance_info *instance)
+{
+ if (remove) {
+ for (auto& bucket : *remove) {
+ instance->remove_entry(info_source, info_source_ver, bucket);
+ }
+ }
+
+ if (add) {
+ for (auto& bucket : *add) {
+ instance->add_entry(info_source, info_source_ver, bucket);
+ }
+ }
+}
+
+int RGWSI_BS_SObj_HintIndexObj::read(const DoutPrefixProvider *dpp, optional_yield y) {
+ RGWObjVersionTracker _ot;
+ bufferlist bl;
+ int r = sysobj.rop()
+ .set_objv_tracker(&_ot) /* forcing read of current version */
+ .read(dpp, &bl, y);
+ if (r < 0 && r != -ENOENT) {
+ ldpp_dout(dpp, 0) << "ERROR: failed reading data (obj=" << obj << "), r=" << r << dendl;
+ return r;
+ }
+
+ ot = _ot;
+
+ if (r >= 0) {
+ auto iter = bl.cbegin();
+ try {
+ decode(info, iter);
+ has_data = true;
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): failed to decode entries, ignoring" << dendl;
+ info.clear();
+ }
+ } else {
+ info.clear();
+ }
+
+ return 0;
+}
+
+int RGWSI_BS_SObj_HintIndexObj::flush(const DoutPrefixProvider *dpp, optional_yield y) {
+ int r;
+
+ if (!info.empty()) {
+ bufferlist bl;
+ encode(info, bl);
+
+ r = sysobj.wop()
+ .set_objv_tracker(&ot) /* forcing read of current version */
+ .write(dpp, bl, y);
+
+ } else { /* remove */
+ r = sysobj.wop()
+ .set_objv_tracker(&ot)
+ .remove(dpp, y);
+ }
+
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+rgw_raw_obj RGWSI_Bucket_Sync_SObj_HintIndexManager::get_sources_obj(const rgw_bucket& bucket) const
+{
+ rgw_bucket b = bucket;
+ b.bucket_id.clear();
+ return rgw_raw_obj(svc.zone->get_zone_params().log_pool,
+ bucket_sync_sources_oid_prefix + "." + b.get_key());
+}
+
+rgw_raw_obj RGWSI_Bucket_Sync_SObj_HintIndexManager::get_dests_obj(const rgw_bucket& bucket) const
+{
+ rgw_bucket b = bucket;
+ b.bucket_id.clear();
+ return rgw_raw_obj(svc.zone->get_zone_params().log_pool,
+ bucket_sync_targets_oid_prefix + "." + b.get_key());
+}
+
/*
 * Propagate sync-policy hint changes for bucket_info.bucket.
 *
 * Updates the bucket's own dests and sources hint objects, and for every
 * added/removed related bucket also updates that bucket's reciprocal hint
 * object (its sources index for our dests, its dests index for our
 * sources), so the relation is discoverable from either side.
 *
 * C1/C2 are containers of rgw_bucket (e.g. std::vector / std::set).
 * Processing stops at the first error.
 */
template <typename C1, typename C2>
int RGWSI_Bucket_Sync_SObj_HintIndexManager::update_hints(const DoutPrefixProvider *dpp,
                                                          const RGWBucketInfo& bucket_info,
                                                          C1& added_dests,
                                                          C2& removed_dests,
                                                          C1& added_sources,
                                                          C2& removed_sources,
                                                          optional_yield y)
{
  C1 self_entity = { bucket_info.bucket };  /* single-element list: ourselves */

  if (!added_dests.empty() ||
      !removed_dests.empty()) {
    /* update our dests */
    RGWSI_BS_SObj_HintIndexObj index(svc.sysobj,
                                     get_dests_obj(bucket_info.bucket));
    int r = index.update(dpp, bucket_info.bucket,
                         bucket_info,
                         &added_dests,
                         &removed_dests,
                         y);
    if (r < 0) {
      ldpp_dout(dpp, 0) << "ERROR: failed to update targets index for bucket=" << bucket_info.bucket << " r=" << r << dendl;
      return r;
    }

    /* add ourselves as a source hint on each added dest bucket */
    for (auto& dest_bucket : added_dests) {
      RGWSI_BS_SObj_HintIndexObj dep_index(svc.sysobj,
                                           get_sources_obj(dest_bucket));
      int r = dep_index.update(dpp, dest_bucket,
                               bucket_info,
                               &self_entity,
                               static_cast<C2 *>(nullptr),
                               y);
      if (r < 0) {
        ldpp_dout(dpp, 0) << "ERROR: failed to update targets index for bucket=" << dest_bucket << " r=" << r << dendl;
        return r;
      }
    }
    /* drop ourselves from each removed dest bucket's sources index */
    for (auto& dest_bucket : removed_dests) {
      RGWSI_BS_SObj_HintIndexObj dep_index(svc.sysobj,
                                           get_sources_obj(dest_bucket));
      int r = dep_index.update(dpp, dest_bucket,
                               bucket_info,
                               static_cast<C1 *>(nullptr),
                               &self_entity,
                               y);
      if (r < 0) {
        ldpp_dout(dpp, 0) << "ERROR: failed to update targets index for bucket=" << dest_bucket << " r=" << r << dendl;
        return r;
      }
    }
  }

  if (!added_sources.empty() ||
      !removed_sources.empty()) {
    RGWSI_BS_SObj_HintIndexObj index(svc.sysobj,
                                     get_sources_obj(bucket_info.bucket));
    /* update our sources */
    /* NOTE(review): the log messages in this section say "targets index"
     * although this is the sources index */
    int r = index.update(dpp, bucket_info.bucket,
                         bucket_info,
                         &added_sources,
                         &removed_sources,
                         y);
    if (r < 0) {
      ldpp_dout(dpp, 0) << "ERROR: failed to update targets index for bucket=" << bucket_info.bucket << " r=" << r << dendl;
      return r;
    }

    /* add ourselves as a dest hint on each added source bucket */
    for (auto& source_bucket : added_sources) {
      RGWSI_BS_SObj_HintIndexObj dep_index(svc.sysobj,
                                           get_dests_obj(source_bucket));
      int r = dep_index.update(dpp, source_bucket,
                               bucket_info,
                               &self_entity,
                               static_cast<C2 *>(nullptr),
                               y);
      if (r < 0) {
        ldpp_dout(dpp, 0) << "ERROR: failed to update targets index for bucket=" << source_bucket << " r=" << r << dendl;
        return r;
      }
    }
    /* drop ourselves from each removed source bucket's dests index */
    for (auto& source_bucket : removed_sources) {
      RGWSI_BS_SObj_HintIndexObj dep_index(svc.sysobj,
                                           get_dests_obj(source_bucket));
      int r = dep_index.update(dpp, source_bucket,
                               bucket_info,
                               static_cast<C1 *>(nullptr),
                               &self_entity,
                               y);
      if (r < 0) {
        ldpp_dout(dpp, 0) << "ERROR: failed to update targets index for bucket=" << source_bucket << " r=" << r << dendl;
        return r;
      }
    }
  }

  return 0;
}
+
+int RGWSI_Bucket_Sync_SObj::handle_bi_removal(const DoutPrefixProvider *dpp,
+ const RGWBucketInfo& bucket_info,
+ optional_yield y)
+{
+ std::set<rgw_bucket> sources_set;
+ std::set<rgw_bucket> dests_set;
+
+ if (bucket_info.sync_policy) {
+ bucket_info.sync_policy->get_potential_related_buckets(bucket_info.bucket,
+ &sources_set,
+ &dests_set);
+ }
+
+ std::vector<rgw_bucket> removed_sources;
+ removed_sources.reserve(sources_set.size());
+ for (auto& e : sources_set) {
+ removed_sources.push_back(e);
+ }
+
+ std::vector<rgw_bucket> removed_dests;
+ removed_dests.reserve(dests_set.size());
+ for (auto& e : dests_set) {
+ removed_dests.push_back(e);
+ }
+
+ std::vector<rgw_bucket> added_sources;
+ std::vector<rgw_bucket> added_dests;
+
+ return hint_index_mgr->update_hints(dpp, bucket_info,
+ added_dests,
+ removed_dests,
+ added_sources,
+ removed_sources,
+ y);
+}
+
+int RGWSI_Bucket_Sync_SObj::handle_bi_update(const DoutPrefixProvider *dpp,
+ RGWBucketInfo& bucket_info,
+ RGWBucketInfo *orig_bucket_info,
+ optional_yield y)
+{
+ std::set<rgw_bucket> orig_sources;
+ std::set<rgw_bucket> orig_dests;
+
+ if (orig_bucket_info &&
+ orig_bucket_info->sync_policy) {
+ orig_bucket_info->sync_policy->get_potential_related_buckets(bucket_info.bucket,
+ &orig_sources,
+ &orig_dests);
+ }
+
+ std::set<rgw_bucket> sources;
+ std::set<rgw_bucket> dests;
+ if (bucket_info.sync_policy) {
+ bucket_info.sync_policy->get_potential_related_buckets(bucket_info.bucket,
+ &sources,
+ &dests);
+ }
+
+ std::vector<rgw_bucket> removed_sources;
+ std::vector<rgw_bucket> added_sources;
+ bool found = diff_sets(orig_sources, sources, &added_sources, &removed_sources);
+ ldpp_dout(dpp, 20) << __func__ << "(): bucket=" << bucket_info.bucket << ": orig_sources=" << orig_sources << " new_sources=" << sources << dendl;
+ ldpp_dout(dpp, 20) << __func__ << "(): bucket=" << bucket_info.bucket << ": potential sources added=" << added_sources << " removed=" << removed_sources << dendl;
+
+ std::vector<rgw_bucket> removed_dests;
+ std::vector<rgw_bucket> added_dests;
+ found = found || diff_sets(orig_dests, dests, &added_dests, &removed_dests);
+
+ ldpp_dout(dpp, 20) << __func__ << "(): bucket=" << bucket_info.bucket << ": orig_dests=" << orig_dests << " new_dests=" << dests << dendl;
+ ldpp_dout(dpp, 20) << __func__ << "(): bucket=" << bucket_info.bucket << ": potential dests added=" << added_dests << " removed=" << removed_dests << dendl;
+
+ if (!found) {
+ return 0;
+ }
+
+ return hint_index_mgr->update_hints(dpp, bucket_info,
+ dests, /* set all dests, not just the ones that were added */
+ removed_dests,
+ sources, /* set all sources, not just that the ones that were added */
+ removed_sources,
+ y);
+}
+
+int RGWSI_Bucket_Sync_SObj::get_bucket_sync_hints(const DoutPrefixProvider *dpp,
+ const rgw_bucket& bucket,
+ std::set<rgw_bucket> *sources,
+ std::set<rgw_bucket> *dests,
+ optional_yield y)
+{
+ if (!sources && !dests) {
+ return 0;
+ }
+
+ if (sources) {
+ RGWSI_BS_SObj_HintIndexObj index(svc.sysobj,
+ hint_index_mgr->get_sources_obj(bucket));
+ int r = index.read(dpp, y);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to update sources index for bucket=" << bucket << " r=" << r << dendl;
+ return r;
+ }
+
+ index.get_entities(bucket, sources);
+
+ if (!bucket.bucket_id.empty()) {
+ rgw_bucket b = bucket;
+ b.bucket_id.clear();
+ index.get_entities(b, sources);
+ }
+ }
+
+ if (dests) {
+ RGWSI_BS_SObj_HintIndexObj index(svc.sysobj,
+ hint_index_mgr->get_dests_obj(bucket));
+ int r = index.read(dpp, y);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to read targets index for bucket=" << bucket << " r=" << r << dendl;
+ return r;
+ }
+
+ index.get_entities(bucket, dests);
+
+ if (!bucket.bucket_id.empty()) {
+ rgw_bucket b = bucket;
+ b.bucket_id.clear();
+ index.get_entities(b, dests);
+ }
+ }
+
+ return 0;
+}
diff --git a/src/rgw/services/svc_bucket_sync_sobj.h b/src/rgw/services/svc_bucket_sync_sobj.h
new file mode 100644
index 000000000..779df7b99
--- /dev/null
+++ b/src/rgw/services/svc_bucket_sync_sobj.h
@@ -0,0 +1,123 @@
+
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#pragma once
+
+#include "rgw_service.h"
+
+#include "svc_meta_be.h"
+#include "svc_bucket_sync.h"
+
+class RGWSI_Zone;
+class RGWSI_SysObj_Cache;
+class RGWSI_Bucket_SObj;
+
+template <class T>
+class RGWChainedCacheImpl;
+
+class RGWSI_Bucket_Sync_SObj_HintIndexManager;
+
+struct rgw_sync_bucket_entity;
+
/*
 * sysobj-backed implementation of the bucket sync-policy service.
 *
 * Resolves per-bucket sync-policy handlers (with a chained cache in
 * front) and maintains the persistent source/dest hint indexes that
 * record which buckets are potentially related by sync policy.
 */
class RGWSI_Bucket_Sync_SObj : public RGWSI_Bucket_Sync
{
  struct bucket_sync_policy_cache_entry {
    std::shared_ptr<RGWBucketSyncPolicyHandler> handler;
  };

  // cache of fully resolved policy handlers, keyed by zone+bucket
  std::unique_ptr<RGWChainedCacheImpl<bucket_sync_policy_cache_entry> > sync_policy_cache;

  std::unique_ptr<RGWSI_Bucket_Sync_SObj_HintIndexManager> hint_index_mgr;

  int do_start(optional_yield, const DoutPrefixProvider *dpp) override;

  // map key used while resolving handlers; either field may be absent
  struct optional_zone_bucket {
    std::optional<rgw_zone_id> zone;
    std::optional<rgw_bucket> bucket;

    optional_zone_bucket(const std::optional<rgw_zone_id>& _zone,
                         const std::optional<rgw_bucket>& _bucket) : zone(_zone), bucket(_bucket) {}

    // lexicographic order on (zone, bucket); std::optional's relational
    // operators order an empty optional before any engaged one
    bool operator<(const optional_zone_bucket& ozb) const {
      if (zone < ozb.zone) {
        return true;
      }
      if (zone > ozb.zone) {
        return false;
      }
      return bucket < ozb.bucket;
    }
  };

  // collect sync hint entities for the given zones/buckets combinations
  void get_hint_entities(RGWSI_Bucket_X_Ctx& ctx,
                         const std::set<rgw_zone_id>& zone_names,
                         const std::set<rgw_bucket>& buckets,
                         std::set<rgw_sync_bucket_entity> *hint_entities,
                         optional_yield y, const DoutPrefixProvider *);
  // resolve policy hints for self_entity; temp_map parks handlers that
  // are still mid-initialization during this lookup
  int resolve_policy_hints(RGWSI_Bucket_X_Ctx& ctx,
                           rgw_sync_bucket_entity& self_entity,
                           RGWBucketSyncPolicyHandlerRef& handler,
                           RGWBucketSyncPolicyHandlerRef& zone_policy_handler,
                           std::map<optional_zone_bucket, RGWBucketSyncPolicyHandlerRef>& temp_map,
                           optional_yield y,
                           const DoutPrefixProvider *dpp);
  int do_get_policy_handler(RGWSI_Bucket_X_Ctx& ctx,
                            std::optional<rgw_zone_id> zone,
                            std::optional<rgw_bucket> _bucket,
                            std::map<optional_zone_bucket, RGWBucketSyncPolicyHandlerRef>& temp_map,
                            RGWBucketSyncPolicyHandlerRef *handler,
                            optional_yield y,
                            const DoutPrefixProvider *dpp);
public:
  struct Svc {
    RGWSI_Zone *zone{nullptr};
    RGWSI_SysObj *sysobj{nullptr};
    RGWSI_SysObj_Cache *cache{nullptr};
    RGWSI_Bucket_SObj *bucket_sobj{nullptr};
  } svc;

  RGWSI_Bucket_Sync_SObj(CephContext *cct);
  ~RGWSI_Bucket_Sync_SObj();

  void init(RGWSI_Zone *_zone_svc,
            RGWSI_SysObj *_sysobj_svc,
            RGWSI_SysObj_Cache *_cache_svc,
            RGWSI_Bucket_SObj *_bucket_sobj_svc);


  // resolve (possibly from cache) the sync-policy handler for
  // zone/bucket; an absent bucket yields the zone-wide handler
  int get_policy_handler(RGWSI_Bucket_X_Ctx& ctx,
                         std::optional<rgw_zone_id> zone,
                         std::optional<rgw_bucket> bucket,
                         RGWBucketSyncPolicyHandlerRef *handler,
                         optional_yield y,
                         const DoutPrefixProvider *dpp);

  // refresh hint indexes after a bucket-instance info write
  int handle_bi_update(const DoutPrefixProvider *dpp,
                       RGWBucketInfo& bucket_info,
                       RGWBucketInfo *orig_bucket_info,
                       optional_yield y) override;
  // drop all hints installed by a removed bucket instance
  int handle_bi_removal(const DoutPrefixProvider *dpp,
                        const RGWBucketInfo& bucket_info,
                        optional_yield y) override;

  // read back the recorded source/dest hints for a bucket
  int get_bucket_sync_hints(const DoutPrefixProvider *dpp,
                            const rgw_bucket& bucket,
                            std::set<rgw_bucket> *sources,
                            std::set<rgw_bucket> *dests,
                            optional_yield y) override;
};
+
diff --git a/src/rgw/services/svc_bucket_types.h b/src/rgw/services/svc_bucket_types.h
new file mode 100644
index 000000000..30e5309d5
--- /dev/null
+++ b/src/rgw/services/svc_bucket_types.h
@@ -0,0 +1,38 @@
+
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#pragma once
+
+#include "common/ptr_wrapper.h"
+
+#include "svc_meta_be.h"
+#include "svc_meta_be_types.h"
+
+class RGWSI_MetaBackend_Handler;
+
// Metadata-backend handler aliases; the ptr_wrapper tag
// (RGWSI_META_BE_TYPES::BUCKET vs ::BI) presumably keeps the bucket
// entrypoint and bucket-instance handlers as distinct types -- see
// common/ptr_wrapper.h.
using RGWSI_Bucket_BE_Handler = ptr_wrapper<RGWSI_MetaBackend_Handler, RGWSI_META_BE_TYPES::BUCKET>;
using RGWSI_BucketInstance_BE_Handler = ptr_wrapper<RGWSI_MetaBackend_Handler, RGWSI_META_BE_TYPES::BI>;


// Backend contexts, tagged the same way as the handlers above.
using RGWSI_Bucket_EP_Ctx = ptr_wrapper<RGWSI_MetaBackend::Context, RGWSI_META_BE_TYPES::BUCKET>;
using RGWSI_Bucket_BI_Ctx = ptr_wrapper<RGWSI_MetaBackend::Context, RGWSI_META_BE_TYPES::BI>;

// Pair of contexts covering both bucket metadata flavors: entrypoint (ep)
// and bucket instance (bi).
struct RGWSI_Bucket_X_Ctx {
  RGWSI_Bucket_EP_Ctx ep;
  RGWSI_Bucket_BI_Ctx bi;
};
+
diff --git a/src/rgw/services/svc_cls.cc b/src/rgw/services/svc_cls.cc
new file mode 100644
index 000000000..342146bfe
--- /dev/null
+++ b/src/rgw/services/svc_cls.cc
@@ -0,0 +1,478 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+
+#include "svc_cls.h"
+#include "svc_rados.h"
+#include "svc_zone.h"
+
+#include "rgw_zone.h"
+
+#include "cls/otp/cls_otp_client.h"
+#include "cls/log/cls_log_client.h"
+#include "cls/lock/cls_lock_client.h"
+
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
// Default cls_lock name used by Lock::lock_exclusive()/unlock() when the
// caller does not supply one; const so the file-scope global cannot be
// mutated accidentally.
static const std::string log_lock_name = "rgw_log_lock";
+
+int RGWSI_Cls::do_start(optional_yield y, const DoutPrefixProvider *dpp)
+{
+ int r = mfa.do_start(y, dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to start mfa service" << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+int RGWSI_Cls::MFA::get_mfa_obj(const DoutPrefixProvider *dpp, const rgw_user& user, std::optional<RGWSI_RADOS::Obj> *obj)
+{
+ string oid = get_mfa_oid(user);
+ rgw_raw_obj o(zone_svc->get_zone_params().otp_pool, oid);
+
+ obj->emplace(rados_svc->obj(o));
+ int r = (*obj)->open(dpp);
+ if (r < 0) {
+ ldpp_dout(dpp, 4) << "failed to open rados context for " << o << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+int RGWSI_Cls::MFA::get_mfa_ref(const DoutPrefixProvider *dpp, const rgw_user& user, rgw_rados_ref *ref)
+{
+ std::optional<RGWSI_RADOS::Obj> obj;
+ int r = get_mfa_obj(dpp, user, &obj);
+ if (r < 0) {
+ return r;
+ }
+ *ref = obj->get_ref();
+ return 0;
+}
+
+int RGWSI_Cls::MFA::check_mfa(const DoutPrefixProvider *dpp, const rgw_user& user, const string& otp_id, const string& pin, optional_yield y)
+{
+ rgw_rados_ref ref;
+ int r = get_mfa_ref(dpp, user, &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ rados::cls::otp::otp_check_t result;
+
+ r = rados::cls::otp::OTP::check(cct, ref.pool.ioctx(), ref.obj.oid, otp_id, pin, &result);
+ if (r < 0)
+ return r;
+
+ ldpp_dout(dpp, 20) << "OTP check, otp_id=" << otp_id << " result=" << (int)result.result << dendl;
+
+ return (result.result == rados::cls::otp::OTP_CHECK_SUCCESS ? 0 : -EACCES);
+}
+
/* Arm a write op with an object-version guard and mtime.
 *
 * Starts from a copy of the caller's tracker (if any).  When no explicit
 * write version was set: mints a brand-new one if nothing was ever read,
 * otherwise bumps the counter on top of the version that was read.  Note
 * the caller's tracker itself is not modified -- only the op is armed.
 */
void RGWSI_Cls::MFA::prepare_mfa_write(librados::ObjectWriteOperation *op,
                                       RGWObjVersionTracker *objv_tracker,
                                       const ceph::real_time& mtime)
{
  RGWObjVersionTracker ot;

  if (objv_tracker) {
    ot = *objv_tracker;
  }

  if (ot.write_version.tag.empty()) {
    if (ot.read_version.tag.empty()) {
      /* never read: generate a fresh write version */
      ot.generate_new_write_ver(cct);
    } else {
      /* bump on top of the version we previously read */
      ot.write_version = ot.read_version;
      ot.write_version.ver++;
    }
  }

  ot.prepare_op_for_write(op);
  struct timespec mtime_ts = real_clock::to_timespec(mtime);
  op->mtime2(&mtime_ts);
}
+
+int RGWSI_Cls::MFA::create_mfa(const DoutPrefixProvider *dpp, const rgw_user& user, const rados::cls::otp::otp_info_t& config,
+ RGWObjVersionTracker *objv_tracker, const ceph::real_time& mtime, optional_yield y)
+{
+ std::optional<RGWSI_RADOS::Obj> obj;
+ int r = get_mfa_obj(dpp, user, &obj);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::ObjectWriteOperation op;
+ prepare_mfa_write(&op, objv_tracker, mtime);
+ rados::cls::otp::OTP::create(&op, config);
+ r = obj->operate(dpp, &op, y);
+ if (r < 0) {
+ ldpp_dout(dpp, 20) << "OTP create, otp_id=" << config.id << " result=" << (int)r << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+int RGWSI_Cls::MFA::remove_mfa(const DoutPrefixProvider *dpp,
+ const rgw_user& user, const string& id,
+ RGWObjVersionTracker *objv_tracker,
+ const ceph::real_time& mtime,
+ optional_yield y)
+{
+ std::optional<RGWSI_RADOS::Obj> obj;
+ int r = get_mfa_obj(dpp, user, &obj);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::ObjectWriteOperation op;
+ prepare_mfa_write(&op, objv_tracker, mtime);
+ rados::cls::otp::OTP::remove(&op, id);
+ r = obj->operate(dpp, &op, y);
+ if (r < 0) {
+ ldpp_dout(dpp, 20) << "OTP remove, otp_id=" << id << " result=" << (int)r << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+int RGWSI_Cls::MFA::get_mfa(const DoutPrefixProvider *dpp, const rgw_user& user, const string& id, rados::cls::otp::otp_info_t *result,
+ optional_yield y)
+{
+ rgw_rados_ref ref;
+
+ int r = get_mfa_ref(dpp, user, &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ r = rados::cls::otp::OTP::get(nullptr, ref.pool.ioctx(), ref.obj.oid, id, result);
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+int RGWSI_Cls::MFA::list_mfa(const DoutPrefixProvider *dpp, const rgw_user& user, list<rados::cls::otp::otp_info_t> *result,
+ optional_yield y)
+{
+ rgw_rados_ref ref;
+
+ int r = get_mfa_ref(dpp, user, &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ r = rados::cls::otp::OTP::get_all(nullptr, ref.pool.ioctx(), ref.obj.oid, result);
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+int RGWSI_Cls::MFA::otp_get_current_time(const DoutPrefixProvider *dpp, const rgw_user& user, ceph::real_time *result,
+ optional_yield y)
+{
+ rgw_rados_ref ref;
+
+ int r = get_mfa_ref(dpp, user, &ref);
+ if (r < 0) {
+ return r;
+ }
+
+ r = rados::cls::otp::OTP::get_current_time(ref.pool.ioctx(), ref.obj.oid, result);
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
/* Replace (or merge into) the OTP device list stored at @oid.
 *
 * When @reset_obj is set, the op first removes the object -- with FAILOK
 * so a missing object is not an error -- and re-creates it
 * non-exclusively, so the object ends up holding exactly @entries.  All
 * steps ride in one atomic write op, guarded by the version tracker.
 */
int RGWSI_Cls::MFA::set_mfa(const DoutPrefixProvider *dpp, const string& oid, const list<rados::cls::otp::otp_info_t>& entries,
                            bool reset_obj, RGWObjVersionTracker *objv_tracker,
                            const real_time& mtime,
                            optional_yield y)
{
  rgw_raw_obj o(zone_svc->get_zone_params().otp_pool, oid);
  auto obj = rados_svc->obj(o);
  int r = obj.open(dpp);
  if (r < 0) {
    ldpp_dout(dpp, 4) << "failed to open rados context for " << o << dendl;
    return r;
  }
  librados::ObjectWriteOperation op;
  if (reset_obj) {
    /* wipe previous content: remove (ignoring -ENOENT via FAILOK), then
     * re-create non-exclusively */
    op.remove();
    op.set_op_flags2(LIBRADOS_OP_FLAG_FAILOK);
    op.create(false);
  }
  prepare_mfa_write(&op, objv_tracker, mtime);
  rados::cls::otp::OTP::set(&op, entries);
  r = obj.operate(dpp, &op, y);
  if (r < 0) {
    ldpp_dout(dpp, 20) << "OTP set entries.size()=" << entries.size() << " result=" << (int)r << dendl;
    return r;
  }

  return 0;
}
+
/* List all OTP devices stored at @oid, optionally returning the object
 * mtime, and arming the caller's version tracker from the read.
 *
 * NOTE(review): objv_tracker is dereferenced unconditionally -- callers
 * are assumed to always pass a valid tracker; confirm before passing
 * nullptr.  mtime_ts is only stat'ed and read back when pmtime is set.
 */
int RGWSI_Cls::MFA::list_mfa(const DoutPrefixProvider *dpp, const string& oid, list<rados::cls::otp::otp_info_t> *result,
                             RGWObjVersionTracker *objv_tracker, ceph::real_time *pmtime,
                             optional_yield y)
{
  rgw_raw_obj o(zone_svc->get_zone_params().otp_pool, oid);
  auto obj = rados_svc->obj(o);
  int r = obj.open(dpp);
  if (r < 0) {
    ldpp_dout(dpp, 4) << "failed to open rados context for " << o << dendl;
    return r;
  }
  auto& ref = obj.get_ref();
  librados::ObjectReadOperation op;
  struct timespec mtime_ts;
  if (pmtime) {
    op.stat2(nullptr, &mtime_ts, nullptr);
  }
  objv_tracker->prepare_op_for_read(&op);
  r = rados::cls::otp::OTP::get_all(&op, ref.pool.ioctx(), ref.obj.oid, result);
  if (r < 0) {
    return r;
  }
  if (pmtime) {
    *pmtime = ceph::real_clock::from_timespec(mtime_ts);
  }

  return 0;
}
+
/* Fill a cls_log_entry from (time, section, key, payload) without
 * submitting it; thin wrapper over cls_log_add_prepare_entry(). */
void RGWSI_Cls::TimeLog::prepare_entry(cls_log_entry& entry,
                                       const real_time& ut,
                                       const string& section,
                                       const string& key,
                                       bufferlist& bl)
{
  cls_log_add_prepare_entry(entry, utime_t(ut), section, key, bl);
}
+
+int RGWSI_Cls::TimeLog::init_obj(const DoutPrefixProvider *dpp, const string& oid, RGWSI_RADOS::Obj& obj)
+{
+ rgw_raw_obj o(zone_svc->get_zone_params().log_pool, oid);
+ obj = rados_svc->obj(o);
+ return obj.open(dpp);
+
+}
+int RGWSI_Cls::TimeLog::add(const DoutPrefixProvider *dpp,
+ const string& oid,
+ const real_time& ut,
+ const string& section,
+ const string& key,
+ bufferlist& bl,
+ optional_yield y)
+{
+ RGWSI_RADOS::Obj obj;
+
+ int r = init_obj(dpp, oid, obj);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::ObjectWriteOperation op;
+ utime_t t(ut);
+ cls_log_add(op, t, section, key, bl);
+
+ return obj.operate(dpp, &op, y);
+}
+
+int RGWSI_Cls::TimeLog::add(const DoutPrefixProvider *dpp,
+ const string& oid,
+ std::list<cls_log_entry>& entries,
+ librados::AioCompletion *completion,
+ bool monotonic_inc,
+ optional_yield y)
+{
+ RGWSI_RADOS::Obj obj;
+
+ int r = init_obj(dpp, oid, obj);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::ObjectWriteOperation op;
+ cls_log_add(op, entries, monotonic_inc);
+
+ if (!completion) {
+ r = obj.operate(dpp, &op, y);
+ } else {
+ r = obj.aio_operate(completion, &op);
+ }
+ return r;
+}
+
+int RGWSI_Cls::TimeLog::list(const DoutPrefixProvider *dpp,
+ const string& oid,
+ const real_time& start_time,
+ const real_time& end_time,
+ int max_entries, std::list<cls_log_entry>& entries,
+ const string& marker,
+ string *out_marker,
+ bool *truncated,
+ optional_yield y)
+{
+ RGWSI_RADOS::Obj obj;
+
+ int r = init_obj(dpp, oid, obj);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::ObjectReadOperation op;
+
+ utime_t st(start_time);
+ utime_t et(end_time);
+
+ cls_log_list(op, st, et, marker, max_entries, entries,
+ out_marker, truncated);
+
+ bufferlist obl;
+
+ int ret = obj.operate(dpp, &op, &obl, y);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+int RGWSI_Cls::TimeLog::info(const DoutPrefixProvider *dpp,
+ const string& oid,
+ cls_log_header *header,
+ optional_yield y)
+{
+ RGWSI_RADOS::Obj obj;
+
+ int r = init_obj(dpp, oid, obj);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::ObjectReadOperation op;
+
+ cls_log_info(op, header);
+
+ bufferlist obl;
+
+ int ret = obj.operate(dpp, &op, &obl, y);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+int RGWSI_Cls::TimeLog::info_async(const DoutPrefixProvider *dpp,
+ RGWSI_RADOS::Obj& obj,
+ const string& oid,
+ cls_log_header *header,
+ librados::AioCompletion *completion)
+{
+ int r = init_obj(dpp, oid, obj);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::ObjectReadOperation op;
+
+ cls_log_info(op, header);
+
+ int ret = obj.aio_operate(completion, &op, nullptr);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+int RGWSI_Cls::TimeLog::trim(const DoutPrefixProvider *dpp,
+ const string& oid,
+ const real_time& start_time,
+ const real_time& end_time,
+ const string& from_marker,
+ const string& to_marker,
+ librados::AioCompletion *completion,
+ optional_yield y)
+{
+ RGWSI_RADOS::Obj obj;
+
+ int r = init_obj(dpp, oid, obj);
+ if (r < 0) {
+ return r;
+ }
+
+ utime_t st(start_time);
+ utime_t et(end_time);
+
+ librados::ObjectWriteOperation op;
+ cls_log_trim(op, st, et, from_marker, to_marker);
+
+ if (!completion) {
+ r = obj.operate(dpp, &op, y);
+ } else {
+ r = obj.aio_operate(completion, &op);
+ }
+ return r;
+}
+
+int RGWSI_Cls::Lock::lock_exclusive(const DoutPrefixProvider *dpp,
+ const rgw_pool& pool,
+ const string& oid,
+ timespan& duration,
+ string& zone_id,
+ string& owner_id,
+ std::optional<string> lock_name)
+{
+ auto p = rados_svc->pool(pool);
+ int r = p.open(dpp);
+ if (r < 0) {
+ return r;
+ }
+
+ uint64_t msec = std::chrono::duration_cast<std::chrono::milliseconds>(duration).count();
+ utime_t ut(msec / 1000, msec % 1000);
+
+ rados::cls::lock::Lock l(lock_name.value_or(log_lock_name));
+ l.set_duration(ut);
+ l.set_cookie(owner_id);
+ l.set_tag(zone_id);
+ l.set_may_renew(true);
+
+ return l.lock_exclusive(&p.ioctx(), oid);
+}
+
+int RGWSI_Cls::Lock::unlock(const DoutPrefixProvider *dpp,
+ const rgw_pool& pool,
+ const string& oid,
+ string& zone_id,
+ string& owner_id,
+ std::optional<string> lock_name)
+{
+ auto p = rados_svc->pool(pool);
+ int r = p.open(dpp);
+ if (r < 0) {
+ return r;
+ }
+
+ rados::cls::lock::Lock l(lock_name.value_or(log_lock_name));
+ l.set_tag(zone_id);
+ l.set_cookie(owner_id);
+
+ return l.unlock(&p.ioctx(), oid);
+}
+
diff --git a/src/rgw/services/svc_cls.h b/src/rgw/services/svc_cls.h
new file mode 100644
index 000000000..d1d1d659b
--- /dev/null
+++ b/src/rgw/services/svc_cls.h
@@ -0,0 +1,166 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#pragma once
+
+#include "cls/otp/cls_otp_types.h"
+#include "cls/log/cls_log_types.h"
+
+#include "rgw_service.h"
+
+#include "svc_rados.h"
+
+
+// Service wrapper around rados object-class (cls) operations used by
+// RGW: per-user OTP/MFA state, time-indexed logs, and advisory locks.
+class RGWSI_Cls : public RGWServiceInstance
+{
+  RGWSI_Zone *zone_svc{nullptr};
+  RGWSI_RADOS *rados_svc{nullptr};
+
+  // Common base for the sub-services below; caches pointers to the
+  // parent service and its zone/rados services.
+  class ClsSubService : public RGWServiceInstance {
+    friend class RGWSI_Cls;
+
+    RGWSI_Cls *cls_svc{nullptr};
+    RGWSI_Zone *zone_svc{nullptr};
+    RGWSI_RADOS *rados_svc{nullptr};
+
+    void init(RGWSI_Cls *_cls_svc, RGWSI_Zone *_zone_svc, RGWSI_RADOS *_rados_svc) {
+      cls_svc = _cls_svc;
+      // NOTE(review): the _zone_svc/_rados_svc parameters are unused;
+      // the pointers are taken from _cls_svc instead (same values in
+      // practice, since RGWSI_Cls::init passes its own members).
+      zone_svc = _cls_svc->zone_svc;
+      rados_svc = _cls_svc->rados_svc;
+    }
+
+  public:
+    ClsSubService(CephContext *cct) : RGWServiceInstance(cct) {}
+  };
+
+public:
+  // Multi-factor authentication (OTP) state, stored per user via the
+  // rados 'otp' object class.
+  class MFA : public ClsSubService {
+    int get_mfa_obj(const DoutPrefixProvider *dpp, const rgw_user& user, std::optional<RGWSI_RADOS::Obj> *obj);
+    int get_mfa_ref(const DoutPrefixProvider *dpp, const rgw_user& user, rgw_rados_ref *ref);
+
+    void prepare_mfa_write(librados::ObjectWriteOperation *op,
+                           RGWObjVersionTracker *objv_tracker,
+                           const ceph::real_time& mtime);
+
+  public:
+    MFA(CephContext *cct): ClsSubService(cct) {}
+
+    // Name of the per-user rados object holding the OTP entries.
+    std::string get_mfa_oid(const rgw_user& user) {
+      return std::string("user:") + user.to_str();
+    }
+
+    int check_mfa(const DoutPrefixProvider *dpp, const rgw_user& user, const std::string& otp_id, const std::string& pin, optional_yield y);
+    int create_mfa(const DoutPrefixProvider *dpp, const rgw_user& user, const rados::cls::otp::otp_info_t& config,
+                   RGWObjVersionTracker *objv_tracker, const ceph::real_time& mtime, optional_yield y);
+    int remove_mfa(const DoutPrefixProvider *dpp,
+                   const rgw_user& user, const std::string& id,
+                   RGWObjVersionTracker *objv_tracker,
+                   const ceph::real_time& mtime,
+                   optional_yield y);
+    int get_mfa(const DoutPrefixProvider *dpp, const rgw_user& user, const std::string& id, rados::cls::otp::otp_info_t *result, optional_yield y);
+    int list_mfa(const DoutPrefixProvider *dpp, const rgw_user& user, std::list<rados::cls::otp::otp_info_t> *result, optional_yield y);
+    int otp_get_current_time(const DoutPrefixProvider *dpp, const rgw_user& user, ceph::real_time *result, optional_yield y);
+    int set_mfa(const DoutPrefixProvider *dpp, const std::string& oid, const std::list<rados::cls::otp::otp_info_t>& entries,
+                bool reset_obj, RGWObjVersionTracker *objv_tracker,
+                const real_time& mtime, optional_yield y);
+    int list_mfa(const DoutPrefixProvider *dpp, const std::string& oid, std::list<rados::cls::otp::otp_info_t> *result,
+                 RGWObjVersionTracker *objv_tracker, ceph::real_time *pmtime, optional_yield y);
+  } mfa;
+
+  // Time-indexed log helpers (cls_log): append, list, query header and
+  // trim entries in a log object, bounded by time and/or markers.
+  class TimeLog : public ClsSubService {
+    int init_obj(const DoutPrefixProvider *dpp, const std::string& oid, RGWSI_RADOS::Obj& obj);
+  public:
+    TimeLog(CephContext *cct): ClsSubService(cct) {}
+
+    void prepare_entry(cls_log_entry& entry,
+                       const real_time& ut,
+                       const std::string& section,
+                       const std::string& key,
+                       bufferlist& bl);
+    int add(const DoutPrefixProvider *dpp,
+            const std::string& oid,
+            const real_time& ut,
+            const std::string& section,
+            const std::string& key,
+            bufferlist& bl,
+            optional_yield y);
+    int add(const DoutPrefixProvider *dpp,
+            const std::string& oid,
+            std::list<cls_log_entry>& entries,
+            librados::AioCompletion *completion,
+            bool monotonic_inc,
+            optional_yield y);
+    int list(const DoutPrefixProvider *dpp,
+             const std::string& oid,
+             const real_time& start_time,
+             const real_time& end_time,
+             int max_entries, std::list<cls_log_entry>& entries,
+             const std::string& marker,
+             std::string *out_marker,
+             bool *truncated,
+             optional_yield y);
+    int info(const DoutPrefixProvider *dpp,
+             const std::string& oid,
+             cls_log_header *header,
+             optional_yield y);
+    int info_async(const DoutPrefixProvider *dpp,
+                   RGWSI_RADOS::Obj& obj,
+                   const std::string& oid,
+                   cls_log_header *header,
+                   librados::AioCompletion *completion);
+    int trim(const DoutPrefixProvider *dpp,
+             const std::string& oid,
+             const real_time& start_time,
+             const real_time& end_time,
+             const std::string& from_marker,
+             const std::string& to_marker,
+             librados::AioCompletion *completion,
+             optional_yield y);
+  } timelog;
+
+  // Thin wrapper over the rados cls_lock exclusive lock, used to
+  // serialize maintenance (e.g. log trimming) across radosgw instances.
+  class Lock : public ClsSubService {
+    int init_obj(const std::string& oid, RGWSI_RADOS::Obj& obj);
+  public:
+    Lock(CephContext *cct): ClsSubService(cct) {}
+    int lock_exclusive(const DoutPrefixProvider *dpp,
+                       const rgw_pool& pool,
+                       const std::string& oid,
+                       timespan& duration,
+                       std::string& zone_id,
+                       std::string& owner_id,
+                       std::optional<std::string> lock_name = std::nullopt);
+    int unlock(const DoutPrefixProvider *dpp,
+               const rgw_pool& pool,
+               const std::string& oid,
+               std::string& zone_id,
+               std::string& owner_id,
+               std::optional<std::string> lock_name = std::nullopt);
+  } lock;
+
+  RGWSI_Cls(CephContext *cct): RGWServiceInstance(cct), mfa(cct), timelog(cct), lock(cct) {}
+
+  // Wire up dependencies and propagate them to the sub-services.
+  void init(RGWSI_Zone *_zone_svc, RGWSI_RADOS *_rados_svc) {
+    rados_svc = _rados_svc;
+    zone_svc = _zone_svc;
+
+    mfa.init(this, zone_svc, rados_svc);
+    timelog.init(this, zone_svc, rados_svc);
+    lock.init(this, zone_svc, rados_svc);
+  }
+
+  int do_start(optional_yield, const DoutPrefixProvider *dpp) override;
+};
+
diff --git a/src/rgw/services/svc_config_key.h b/src/rgw/services/svc_config_key.h
new file mode 100644
index 000000000..1c068b795
--- /dev/null
+++ b/src/rgw/services/svc_config_key.h
@@ -0,0 +1,31 @@
+
+
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#pragma once
+
+#include "rgw_service.h"
+
+// Abstract interface for reading values from the cluster's config-key
+// store; implemented by RGWSI_ConfigKey_RADOS.
+class RGWSI_ConfigKey : public RGWServiceInstance
+{
+public:
+  RGWSI_ConfigKey(CephContext *cct) : RGWServiceInstance(cct) {}
+  virtual ~RGWSI_ConfigKey() {}
+
+  // secure=true marks the value as sensitive; implementations may warn
+  // when the transport fetching it is not known to be encrypted.
+  virtual int get(const std::string& key, bool secure, bufferlist *result) = 0;
+};
+
diff --git a/src/rgw/services/svc_config_key_rados.cc b/src/rgw/services/svc_config_key_rados.cc
new file mode 100644
index 000000000..5edb02ea7
--- /dev/null
+++ b/src/rgw/services/svc_config_key_rados.cc
@@ -0,0 +1,50 @@
+
+#include "svc_rados.h"
+#include "svc_config_key_rados.h"
+
+using namespace std;
+
+// Defaulted behavior; defined out of line in this TU.
+RGWSI_ConfigKey_RADOS::~RGWSI_ConfigKey_RADOS(){}
+
+int RGWSI_ConfigKey_RADOS::do_start(optional_yield, const DoutPrefixProvider *dpp)
+{
+  // Record whether the monitor connection might be unencrypted; used
+  // later by warn_if_insecure() when sensitive values are fetched.
+  maybe_insecure_mon_conn = !svc.rados->check_secure_mon_conn(dpp);
+
+  return 0;
+}
+
+void RGWSI_ConfigKey_RADOS::warn_if_insecure()
+{
+  // Nothing to do when the mon connection is known to be secure.
+  if (!maybe_insecure_mon_conn) {
+    return;
+  }
+  // test_and_set() returns the previous value, so only the first
+  // caller proceeds past this point; the warning is emitted once.
+  if (warned_insecure.test_and_set()) {
+    return;
+  }
+
+  string s = "rgw is configured to optionally allow insecure connections to the monitors (auth_supported, ms_mon_client_mode), ssl certificates stored at the monitor configuration could leak";
+
+  svc.rados->clog_warn(s);
+
+  lderr(ctx()) << __func__ << "(): WARNING: " << s << dendl;
+}
+
+// Fetch a value from the monitor config-key store via a mon command.
+// NOTE(review): 'key' is spliced into the JSON command unescaped; a key
+// containing '"' or '\' would corrupt the command. Keys appear to be
+// internally generated -- confirm before exposing to user input.
+int RGWSI_ConfigKey_RADOS::get(const string& key, bool secure, bufferlist *result)
+{
+  string cmd =
+    "{"
+      "\"prefix\": \"config-key get\", "
+      "\"key\": \"" + key + "\""
+    "}";
+
+  bufferlist inbl;
+  auto handle = svc.rados->handle();
+  int ret = handle.mon_command(cmd, inbl, result, nullptr);
+  if (ret < 0) {
+    return ret;
+  }
+
+  // Sensitive payloads trigger a one-time warning if the mon
+  // connection may be unencrypted.
+  if (secure) {
+    warn_if_insecure();
+  }
+
+  return 0;
+}
diff --git a/src/rgw/services/svc_config_key_rados.h b/src/rgw/services/svc_config_key_rados.h
new file mode 100644
index 000000000..b3b995ac7
--- /dev/null
+++ b/src/rgw/services/svc_config_key_rados.h
@@ -0,0 +1,54 @@
+
+
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#pragma once
+
+#include <atomic>
+
+#include "rgw_service.h"
+
+#include "svc_config_key.h"
+
+class RGWSI_RADOS;
+
+// config-key access backed by monitor commands over the rados service.
+class RGWSI_ConfigKey_RADOS : public RGWSI_ConfigKey
+{
+  // true if the monitor connection may be unencrypted (set in do_start)
+  bool maybe_insecure_mon_conn{false};
+  // ensures the insecure-connection warning is logged at most once
+  std::atomic_flag warned_insecure = ATOMIC_FLAG_INIT;
+
+  int do_start(optional_yield, const DoutPrefixProvider *dpp) override;
+
+  void warn_if_insecure();
+
+public:
+  struct Svc {
+    RGWSI_RADOS *rados{nullptr};
+  } svc;
+
+  void init(RGWSI_RADOS *rados_svc) {
+    svc.rados = rados_svc;
+  }
+
+  RGWSI_ConfigKey_RADOS(CephContext *cct) : RGWSI_ConfigKey(cct) {}
+
+  virtual ~RGWSI_ConfigKey_RADOS() override;
+
+  int get(const std::string& key, bool secure, bufferlist *result) override;
+};
+
+
diff --git a/src/rgw/services/svc_finisher.cc b/src/rgw/services/svc_finisher.cc
new file mode 100644
index 000000000..4883c7c50
--- /dev/null
+++ b/src/rgw/services/svc_finisher.cc
@@ -0,0 +1,58 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "common/Finisher.h"
+
+#include "svc_finisher.h"
+
+using namespace std;
+
+int RGWSI_Finisher::do_start(optional_yield, const DoutPrefixProvider *dpp)
+{
+  // Create and start the worker thread that drains queued contexts.
+  finisher = new Finisher(cct);
+  finisher->start();
+
+  return 0;
+}
+
+// Stop the finisher thread, notify registered callers, and release the
+// finisher. Idempotent: the destructor calls this even if an explicit
+// shutdown already ran.
+void RGWSI_Finisher::shutdown()
+{
+  if (finalized) {
+    return;
+  }
+
+  if (finisher) {
+    finisher->stop();
+
+    map<int, ShutdownCB *> cbs;
+    cbs.swap(shutdown_cbs); /* move cbs out, in case caller unregisters */
+    for (auto& iter : cbs) {
+      iter.second->call();
+    }
+    delete finisher;
+    finisher = nullptr; // avoid a dangling pointer on any later access
+  }
+
+  finalized = true;
+}
+
+RGWSI_Finisher::~RGWSI_Finisher()
+{
+  shutdown();  // safe to call twice; guarded by 'finalized'
+}
+
+// Register a callback to run at shutdown; the handle returned via
+// *phandle can later be passed to unregister_caller().
+// NOTE(review): handles_counter is atomic but the map insert itself is
+// not synchronized -- confirm all callers run on one thread.
+void RGWSI_Finisher::register_caller(ShutdownCB *cb, int *phandle)
+{
+  *phandle = ++handles_counter;
+  shutdown_cbs[*phandle] = cb;
+}
+
+void RGWSI_Finisher::unregister_caller(int handle)
+{
+  shutdown_cbs.erase(handle);
+}
+
+// Queue a Context for asynchronous completion on the finisher thread.
+// Precondition: do_start() has run, so 'finisher' is non-null.
+void RGWSI_Finisher::schedule_context(Context *c)
+{
+  finisher->queue(c);
+}
+
diff --git a/src/rgw/services/svc_finisher.h b/src/rgw/services/svc_finisher.h
new file mode 100644
index 000000000..911b48f2b
--- /dev/null
+++ b/src/rgw/services/svc_finisher.h
@@ -0,0 +1,44 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_service.h"
+
+class Context;
+class Finisher;
+
+// Wraps a shared Finisher thread and lets other services register
+// callbacks to be invoked when this service shuts down.
+class RGWSI_Finisher : public RGWServiceInstance
+{
+  friend struct RGWServices_Def;
+public:
+  class ShutdownCB;
+
+private:
+  Finisher *finisher{nullptr};  // worker thread; created in do_start()
+  bool finalized{false};        // set once shutdown() has completed
+
+  void shutdown() override;
+
+  // registered shutdown callbacks, keyed by handle
+  std::map<int, ShutdownCB *> shutdown_cbs;
+  std::atomic<int> handles_counter{0};  // source of unique handles
+
+protected:
+  void init() {}
+  int do_start(optional_yield y, const DoutPrefixProvider *dpp) override;
+
+public:
+  RGWSI_Finisher(CephContext *cct): RGWServiceInstance(cct) {}
+  ~RGWSI_Finisher();
+
+  // Interface implemented by services wanting shutdown notification.
+  class ShutdownCB {
+  public:
+    virtual ~ShutdownCB() {}
+    virtual void call() = 0;
+  };
+
+  void register_caller(ShutdownCB *cb, int *phandle);
+  void unregister_caller(int handle);
+
+  void schedule_context(Context *c);
+};
diff --git a/src/rgw/services/svc_mdlog.cc b/src/rgw/services/svc_mdlog.cc
new file mode 100644
index 000000000..09a68d3d7
--- /dev/null
+++ b/src/rgw/services/svc_mdlog.cc
@@ -0,0 +1,549 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "svc_mdlog.h"
+#include "svc_rados.h"
+#include "svc_zone.h"
+#include "svc_sys_obj.h"
+
+#include "rgw_tools.h"
+#include "rgw_mdlog.h"
+#include "rgw_coroutine.h"
+#include "rgw_cr_rados.h"
+#include "rgw_zone.h"
+
+#include "common/errno.h"
+
+#include <boost/asio/yield.hpp>
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+using Svc = RGWSI_MDLog::Svc;
+using Cursor = RGWPeriodHistory::Cursor;
+
+// _run_sync controls whether do_start() initializes the oldest-log-
+// period history (only needed when this zone does metadata sync).
+RGWSI_MDLog::RGWSI_MDLog(CephContext *cct, bool _run_sync) : RGWServiceInstance(cct), run_sync(_run_sync) {
+}
+
+RGWSI_MDLog::~RGWSI_MDLog() {
+}
+
+int RGWSI_MDLog::init(RGWSI_RADOS *_rados_svc, RGWSI_Zone *_zone_svc, RGWSI_SysObj *_sysobj_svc, RGWSI_Cls *_cls_svc)
+{
+  // Wire up the service dependencies; 'mdlog' points back at ourselves
+  // so the whole Svc bundle can be handed to the history coroutines.
+  svc.rados = _rados_svc;
+  svc.zone = _zone_svc;
+  svc.sysobj = _sysobj_svc;
+  svc.cls = _cls_svc;
+  svc.mdlog = this;
+
+  return 0;
+}
+
+int RGWSI_MDLog::do_start(optional_yield y, const DoutPrefixProvider *dpp)
+{
+  auto& current_period = svc.zone->get_current_period();
+
+  // mutating operations always target the current period's log
+  current_log = get_log(current_period.get_id());
+
+  period_puller.reset(new RGWPeriodPuller(svc.zone, svc.sysobj));
+  period_history.reset(new RGWPeriodHistory(cct, period_puller.get(),
+                                            current_period));
+
+  if (run_sync &&
+      svc.zone->need_to_sync()) {
+    // initialize the log period history
+    // NOTE(review): the returned Cursor is discarded, so a failure to
+    // initialize the history is silently ignored here -- confirm that
+    // is intended.
+    svc.mdlog->init_oldest_log_period(y, dpp);
+  }
+  return 0;
+}
+
+// Read the mdlog history object from the log pool into *state.
+// Returns -ENOENT (after deleting the object) when it exists but is
+// empty, -EIO on decode failure.
+int RGWSI_MDLog::read_history(RGWMetadataLogHistory *state,
+                              RGWObjVersionTracker *objv_tracker,
+                              optional_yield y,
+                              const DoutPrefixProvider *dpp) const
+{
+  auto& pool = svc.zone->get_zone_params().log_pool;
+  const auto& oid = RGWMetadataLogHistory::oid;
+  bufferlist bl;
+  int ret = rgw_get_system_obj(svc.sysobj, pool, oid, bl, objv_tracker, nullptr, y, dpp);
+  if (ret < 0) {
+    return ret;
+  }
+  if (bl.length() == 0) {
+    /* bad history object, remove it */
+    rgw_raw_obj obj(pool, oid);
+    auto sysobj = svc.sysobj->get_obj(obj);
+    ret = sysobj.wop().remove(dpp, y);
+    if (ret < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: meta history is empty, but cannot remove it (" << cpp_strerror(-ret) << ")" << dendl;
+      return ret;
+    }
+    return -ENOENT;
+  }
+  try {
+    auto p = bl.cbegin();
+    state->decode(p);
+  } catch (buffer::error& e) {
+    ldpp_dout(dpp, 1) << "failed to decode the mdlog history: "
+        << e.what() << dendl;
+    return -EIO;
+  }
+  return 0;
+}
+
+int RGWSI_MDLog::write_history(const DoutPrefixProvider *dpp,
+                               const RGWMetadataLogHistory& state,
+                               RGWObjVersionTracker *objv_tracker,
+                               optional_yield y, bool exclusive)
+{
+  // Serialize the history record and store it in the well-known
+  // object in the zone's log pool.
+  bufferlist encoded;
+  state.encode(encoded);
+
+  const auto& pool = svc.zone->get_zone_params().log_pool;
+  return rgw_put_system_obj(dpp, svc.sysobj, pool, RGWMetadataLogHistory::oid,
+                            encoded, exclusive, objv_tracker, real_time{}, y);
+}
+
+namespace mdlog {
+
+using Cursor = RGWPeriodHistory::Cursor;
+
+namespace {
+// Coroutine that reads and decodes a system object of type T via the
+// async-rados processor.
+template <class T>
+class SysObjReadCR : public RGWSimpleCoroutine {
+  const DoutPrefixProvider *dpp;
+  RGWAsyncRadosProcessor *async_rados;
+  RGWSI_SysObj *svc;
+
+  rgw_raw_obj obj;
+  T *result;
+  /// on ENOENT, call handle_data() with an empty object instead of failing
+  const bool empty_on_enoent;
+  RGWObjVersionTracker *objv_tracker;
+  RGWAsyncGetSystemObj *req{nullptr};
+
+public:
+  SysObjReadCR(const DoutPrefixProvider *_dpp,
+               RGWAsyncRadosProcessor *_async_rados, RGWSI_SysObj *_svc,
+               const rgw_raw_obj& _obj,
+               T *_result, bool empty_on_enoent = true,
+               RGWObjVersionTracker *objv_tracker = nullptr)
+    : RGWSimpleCoroutine(_svc->ctx()), dpp(_dpp), async_rados(_async_rados), svc(_svc),
+      obj(_obj), result(_result),
+      empty_on_enoent(empty_on_enoent), objv_tracker(objv_tracker) {}
+
+  ~SysObjReadCR() override {
+    try {
+      request_cleanup();
+    } catch (const boost::container::length_error_t& e) {
+      // destructors must not throw; log and swallow
+      ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+          ": reference counted object mismatched, \"" << e.what() <<
+          "\"" << dendl;
+    }
+  }
+
+  void request_cleanup() override {
+    if (req) {
+      req->finish();
+      req = NULL;
+    }
+  }
+
+  int send_request(const DoutPrefixProvider *dpp) {
+    // queue the async read; completion resumes this coroutine
+    req = new RGWAsyncGetSystemObj(dpp, this, stack->create_completion_notifier(), svc,
+                                   objv_tracker, obj, false, false);
+    async_rados->queue(req);
+    return 0;
+  }
+
+  int request_complete() {
+    int ret = req->get_ret_status();
+    retcode = ret;
+    if (ret == -ENOENT && empty_on_enoent) {
+      *result = T();
+    } else {
+      if (ret < 0) {
+        return ret;
+      }
+      if (objv_tracker) { // copy the updated version
+        *objv_tracker = req->objv_tracker;
+      }
+      try {
+        auto iter = req->bl.cbegin();
+        if (iter.end()) {
+          // allow successful reads with empty buffers. ReadSyncStatus
+          // coroutines depend on this to be able to read without
+          // locking, because the cls lock from InitSyncStatus will
+          // create an empty object if it didn't exist
+          *result = T();
+        } else {
+          decode(*result, iter);
+        }
+      } catch (buffer::error& err) {
+        return -EIO;
+      }
+    }
+    return handle_data(*result);
+  }
+
+  // hook for subclasses to post-process the decoded value
+  virtual int handle_data(T& data) {
+    return 0;
+  }
+};
+
+// Coroutine that encodes and writes a system object of type T via the
+// async-rados processor.
+template <class T>
+class SysObjWriteCR : public RGWSimpleCoroutine {
+  const DoutPrefixProvider *dpp;
+  RGWAsyncRadosProcessor *async_rados;
+  RGWSI_SysObj *svc;
+  bufferlist bl;                 // data is encoded once, in the ctor
+  rgw_raw_obj obj;
+  RGWObjVersionTracker *objv_tracker;
+  bool exclusive;                // fail with EEXIST if object exists
+  RGWAsyncPutSystemObj *req{nullptr};
+
+public:
+  SysObjWriteCR(const DoutPrefixProvider *_dpp,
+                RGWAsyncRadosProcessor *_async_rados, RGWSI_SysObj *_svc,
+                const rgw_raw_obj& _obj, const T& _data,
+                RGWObjVersionTracker *objv_tracker = nullptr,
+                bool exclusive = false)
+    : RGWSimpleCoroutine(_svc->ctx()), dpp(_dpp), async_rados(_async_rados),
+      svc(_svc), obj(_obj), objv_tracker(objv_tracker), exclusive(exclusive) {
+    encode(_data, bl);
+  }
+
+  ~SysObjWriteCR() override {
+    try {
+      request_cleanup();
+    } catch (const boost::container::length_error_t& e) {
+      // destructors must not throw; log and swallow
+      ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
+          ": reference counted object mismatched, \"" << e.what() <<
+          "\"" << dendl;
+    }
+  }
+
+  void request_cleanup() override {
+    if (req) {
+      req->finish();
+      req = NULL;
+    }
+  }
+
+  int send_request(const DoutPrefixProvider *dpp) override {
+    req = new RGWAsyncPutSystemObj(dpp, this, stack->create_completion_notifier(),
+                                   svc, objv_tracker, obj, exclusive, std::move(bl));
+    async_rados->queue(req);
+    return 0;
+  }
+
+  int request_complete() override {
+    if (objv_tracker) { // copy the updated version
+      *objv_tracker = req->objv_tracker;
+    }
+    return req->get_ret_status();
+  }
+};
+}
+
+/// read the mdlog history and use it to initialize the given cursor
+class ReadHistoryCR : public RGWCoroutine {
+  const DoutPrefixProvider *dpp;
+  Svc svc;
+  Cursor *cursor;                 // out: position of the oldest period
+  RGWObjVersionTracker *objv_tracker;
+  RGWMetadataLogHistory state;
+  RGWAsyncRadosProcessor *async_processor;
+
+ public:
+  ReadHistoryCR(const DoutPrefixProvider *dpp,
+                const Svc& svc,
+                Cursor *cursor,
+                RGWObjVersionTracker *objv_tracker)
+    : RGWCoroutine(svc.zone->ctx()), dpp(dpp), svc(svc),
+      cursor(cursor),
+      objv_tracker(objv_tracker),
+      async_processor(svc.rados->get_async_processor())
+  {}
+
+  int operate(const DoutPrefixProvider *dpp) {
+    reenter(this) {
+      yield {
+        rgw_raw_obj obj{svc.zone->get_zone_params().log_pool,
+                        RGWMetadataLogHistory::oid};
+        // fail (rather than default-construct) when the object is missing
+        constexpr bool empty_on_enoent = false;
+
+        using ReadCR = SysObjReadCR<RGWMetadataLogHistory>;
+        call(new ReadCR(dpp, async_processor, svc.sysobj, obj,
+                        &state, empty_on_enoent, objv_tracker));
+      }
+      if (retcode < 0) {
+        ldpp_dout(dpp, 1) << "failed to read mdlog history: "
+            << cpp_strerror(retcode) << dendl;
+        return set_cr_error(retcode);
+      }
+      // translate the stored epoch into a cursor in our period history
+      *cursor = svc.mdlog->period_history->lookup(state.oldest_realm_epoch);
+      if (!*cursor) {
+        return set_cr_error(cursor->get_error());
+      }
+
+      ldpp_dout(dpp, 10) << "read mdlog history with oldest period id="
+          << state.oldest_period_id << " realm_epoch="
+          << state.oldest_realm_epoch << dendl;
+      return set_cr_done();
+    }
+    return 0;
+  }
+};
+
+/// write the given cursor to the mdlog history
+class WriteHistoryCR : public RGWCoroutine {
+  const DoutPrefixProvider *dpp;
+  Svc svc;
+  Cursor cursor;                  // period to record as oldest
+  RGWObjVersionTracker *objv;     // guards against racing writers
+  RGWMetadataLogHistory state;
+  RGWAsyncRadosProcessor *async_processor;
+
+ public:
+  WriteHistoryCR(const DoutPrefixProvider *dpp,
+                 Svc& svc,
+                 const Cursor& cursor,
+                 RGWObjVersionTracker *objv)
+    : RGWCoroutine(svc.zone->ctx()), dpp(dpp), svc(svc),
+      cursor(cursor), objv(objv),
+      async_processor(svc.rados->get_async_processor())
+  {}
+
+  int operate(const DoutPrefixProvider *dpp) {
+    reenter(this) {
+      // capture the cursor's period/epoch into the persistent record
+      state.oldest_period_id = cursor.get_period().get_id();
+      state.oldest_realm_epoch = cursor.get_epoch();
+
+      yield {
+        rgw_raw_obj obj{svc.zone->get_zone_params().log_pool,
+                        RGWMetadataLogHistory::oid};
+
+        using WriteCR = SysObjWriteCR<RGWMetadataLogHistory>;
+        call(new WriteCR(dpp, async_processor, svc.sysobj, obj, state, objv));
+      }
+      if (retcode < 0) {
+        ldpp_dout(dpp, 1) << "failed to write mdlog history: "
+            << cpp_strerror(retcode) << dendl;
+        return set_cr_error(retcode);
+      }
+
+      ldpp_dout(dpp, 10) << "wrote mdlog history with oldest period id="
+          << state.oldest_period_id << " realm_epoch="
+          << state.oldest_realm_epoch << dendl;
+      return set_cr_done();
+    }
+    return 0;
+  }
+};
+
+/// update the mdlog history to reflect trimmed logs
+class TrimHistoryCR : public RGWCoroutine {
+  const DoutPrefixProvider *dpp;
+  Svc svc;
+  const Cursor cursor; //< cursor to trimmed period
+  RGWObjVersionTracker *objv; //< to prevent racing updates
+  Cursor next; //< target cursor for oldest log period
+  Cursor existing; //< existing cursor read from disk
+
+ public:
+  TrimHistoryCR(const DoutPrefixProvider *dpp, const Svc& svc, Cursor cursor, RGWObjVersionTracker *objv)
+    : RGWCoroutine(svc.zone->ctx()), dpp(dpp), svc(svc),
+      cursor(cursor), objv(objv), next(cursor) {
+    next.next(); // advance past cursor
+  }
+
+  int operate(const DoutPrefixProvider *dpp) {
+    reenter(this) {
+      // read an existing history, and write the new history if it's newer
+      yield call(new ReadHistoryCR(dpp, svc, &existing, objv));
+      if (retcode < 0) {
+        return set_cr_error(retcode);
+      }
+      // reject older trims with ECANCELED
+      if (cursor.get_epoch() < existing.get_epoch()) {
+        ldpp_dout(dpp, 4) << "found oldest log epoch=" << existing.get_epoch()
+            << ", rejecting trim at epoch=" << cursor.get_epoch() << dendl;
+        return set_cr_error(-ECANCELED);
+      }
+      // overwrite with updated history
+      yield call(new WriteHistoryCR(dpp, svc, next, objv));
+      if (retcode < 0) {
+        return set_cr_error(retcode);
+      }
+      return set_cr_done();
+    }
+    return 0;
+  }
+};
+
+} // mdlog namespace
+
+// traverse all the way back to the beginning of the period history, and
+// return a cursor to the first period in a fully attached history
+Cursor RGWSI_MDLog::find_oldest_period(const DoutPrefixProvider *dpp, optional_yield y)
+{
+  auto cursor = period_history->get_current();
+
+  while (cursor) {
+    // advance to the period's predecessor
+    if (!cursor.has_prev()) {
+      auto& predecessor = cursor.get_period().get_predecessor();
+      if (predecessor.empty()) {
+        // this is the first period, so our logs must start here
+        ldpp_dout(dpp, 10) << "find_oldest_period returning first "
+            "period " << cursor.get_period().get_id() << dendl;
+        return cursor;
+      }
+      // pull the predecessor and add it to our history
+      RGWPeriod period;
+      int r = period_puller->pull(dpp, predecessor, period, y);
+      if (r < 0) {
+        // on pull failure, settle for the oldest period found so far
+        return cursor;
+      }
+      auto prev = period_history->insert(std::move(period));
+      if (!prev) {
+        return prev;
+      }
+      ldpp_dout(dpp, 20) << "find_oldest_period advancing to "
+          "predecessor period " << predecessor << dendl;
+      ceph_assert(cursor.has_prev());
+    }
+    cursor.prev();
+  }
+  ldpp_dout(dpp, 10) << "find_oldest_period returning empty cursor" << dendl;
+  return cursor;
+}
+
+// Initialize (or reconcile) the persistent oldest-log-period history
+// and return a cursor to that period in our attached history.
+Cursor RGWSI_MDLog::init_oldest_log_period(optional_yield y, const DoutPrefixProvider *dpp)
+{
+  // read the mdlog history
+  RGWMetadataLogHistory state;
+  RGWObjVersionTracker objv;
+  int ret = read_history(&state, &objv, y, dpp);
+
+  if (ret == -ENOENT) {
+    // initialize the mdlog history and write it
+    ldpp_dout(dpp, 10) << "initializing mdlog history" << dendl;
+    auto cursor = find_oldest_period(dpp, y);
+    if (!cursor) {
+      return cursor;
+    }
+    // write the initial history
+    state.oldest_realm_epoch = cursor.get_epoch();
+    state.oldest_period_id = cursor.get_period().get_id();
+
+    constexpr bool exclusive = true; // don't overwrite
+    int ret = write_history(dpp, state, &objv, y, exclusive);
+    if (ret < 0 && ret != -EEXIST) {
+      ldpp_dout(dpp, 1) << "failed to write mdlog history: "
+          << cpp_strerror(ret) << dendl;
+      return Cursor{ret};
+    }
+    return cursor;
+  } else if (ret < 0) {
+    ldpp_dout(dpp, 1) << "failed to read mdlog history: "
+        << cpp_strerror(ret) << dendl;
+    return Cursor{ret};
+  }
+
+  // if the stored epoch is already in our attached history, return it
+  auto cursor = period_history->lookup(state.oldest_realm_epoch);
+  if (cursor) {
+    return cursor;
+  }
+
+  // otherwise rebuild the history from the period chain and rewrite it.
+  // NOTE(review): the original also carried a pull-by-period-id
+  // fallback after this point, but both branches of the if/else above
+  // returned, making it unreachable; it was removed without changing
+  // behavior.
+  cursor = find_oldest_period(dpp, y);
+  state.oldest_realm_epoch = cursor.get_epoch();
+  state.oldest_period_id = cursor.get_period().get_id();
+  ldpp_dout(dpp, 10) << "rewriting mdlog history" << dendl;
+  ret = write_history(dpp, state, &objv, y);
+  if (ret < 0 && ret != -ECANCELED) {
+    ldpp_dout(dpp, 1) << "failed to write mdlog history: "
+        << cpp_strerror(ret) << dendl;
+    return Cursor{ret};
+  }
+  return cursor;
+}
+
+// Read the stored oldest-log-period record and return a cursor to it
+// in the existing period history (no pulling/rewriting is attempted).
+Cursor RGWSI_MDLog::read_oldest_log_period(optional_yield y, const DoutPrefixProvider *dpp) const
+{
+  RGWMetadataLogHistory state;
+  int ret = read_history(&state, nullptr, y, dpp);
+  if (ret < 0) {
+    ldpp_dout(dpp, 1) << "failed to read mdlog history: "
+        << cpp_strerror(ret) << dendl;
+    return Cursor{ret};
+  }
+
+  ldpp_dout(dpp, 10) << "read mdlog history with oldest period id="
+      << state.oldest_period_id << " realm_epoch="
+      << state.oldest_realm_epoch << dendl;
+
+  return period_history->lookup(state.oldest_realm_epoch);
+}
+
+// Factory helpers; the caller owns the returned coroutine.
+RGWCoroutine* RGWSI_MDLog::read_oldest_log_period_cr(const DoutPrefixProvider *dpp,
+                                                     Cursor *period, RGWObjVersionTracker *objv) const
+{
+  return new mdlog::ReadHistoryCR(dpp, svc, period, objv);
+}
+
+RGWCoroutine* RGWSI_MDLog::trim_log_period_cr(const DoutPrefixProvider *dpp,
+                                              Cursor period, RGWObjVersionTracker *objv) const
+{
+  return new mdlog::TrimHistoryCR(dpp, svc, period, objv);
+}
+
+RGWMetadataLog* RGWSI_MDLog::get_log(const std::string& period)
+{
+  // construct the period's log in place if it doesn't exist; emplace
+  // returns the existing entry otherwise, so this is idempotent
+  auto insert = md_logs.emplace(std::piecewise_construct,
+                                std::forward_as_tuple(period),
+                                std::forward_as_tuple(cct, svc.zone, svc.cls, period));
+  return &insert.first->second;
+}
+
+// Append an entry to the current period's metadata log.
+int RGWSI_MDLog::add_entry(const DoutPrefixProvider *dpp, const string& hash_key, const string& section, const string& key, bufferlist& bl)
+{
+  ceph_assert(current_log); // must have called init()
+  return current_log->add_entry(dpp, hash_key, section, key, bl);
+}
+
+// Map hash_key to the mdlog shard it would land in.
+int RGWSI_MDLog::get_shard_id(const string& hash_key, int *shard_id)
+{
+  ceph_assert(current_log); // must have called init()
+  return current_log->get_shard_id(hash_key, shard_id);
+}
+
+// Fetch the given period via the period puller.
+int RGWSI_MDLog::pull_period(const DoutPrefixProvider *dpp, const std::string& period_id, RGWPeriod& period,
+                             optional_yield y)
+{
+  return period_puller->pull(dpp, period_id, period, y);
+}
+
diff --git a/src/rgw/services/svc_mdlog.h b/src/rgw/services/svc_mdlog.h
new file mode 100644
index 000000000..703d6f605
--- /dev/null
+++ b/src/rgw/services/svc_mdlog.h
@@ -0,0 +1,118 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#pragma once
+
+#include "rgw_service.h"
+#include "rgw_period_history.h"
+#include "rgw_period_puller.h"
+
+#include "svc_meta_be.h"
+
+
+class RGWMetadataLog;
+class RGWMetadataLogHistory;
+class RGWCoroutine;
+
+class RGWSI_Zone;
+class RGWSI_SysObj;
+class RGWSI_RADOS;
+
+namespace mdlog {
+ class ReadHistoryCR;
+ class WriteHistoryCR;
+}
+
+// Manages per-period metadata logs and the persistent "oldest log
+// period" history object used by metadata sync and trimming.
+class RGWSI_MDLog : public RGWServiceInstance
+{
+  friend class mdlog::ReadHistoryCR;
+  friend class mdlog::WriteHistoryCR;
+
+  // maintain a separate metadata log for each period
+  std::map<std::string, RGWMetadataLog> md_logs;
+
+  // use the current period's log for mutating operations
+  RGWMetadataLog* current_log{nullptr};
+
+  // whether do_start() should initialize the oldest-log-period history
+  bool run_sync;
+
+  // pulls missing periods for period_history
+  std::unique_ptr<RGWPeriodPuller> period_puller;
+  // maintains a connected history of periods
+  std::unique_ptr<RGWPeriodHistory> period_history;
+
+public:
+  RGWSI_MDLog(CephContext *cct, bool run_sync);
+  virtual ~RGWSI_MDLog();
+
+  // service dependencies; 'mdlog' points back to this instance
+  struct Svc {
+    RGWSI_RADOS *rados{nullptr};
+    RGWSI_Zone *zone{nullptr};
+    RGWSI_SysObj *sysobj{nullptr};
+    RGWSI_MDLog *mdlog{nullptr};
+    RGWSI_Cls *cls{nullptr};
+  } svc;
+
+  int init(RGWSI_RADOS *_rados_svc,
+           RGWSI_Zone *_zone_svc,
+           RGWSI_SysObj *_sysobj_svc,
+           RGWSI_Cls *_cls_svc);
+
+  int do_start(optional_yield y, const DoutPrefixProvider *dpp) override;
+
+  // traverse all the way back to the beginning of the period history, and
+  // return a cursor to the first period in a fully attached history
+  RGWPeriodHistory::Cursor find_oldest_period(const DoutPrefixProvider *dpp, optional_yield y);
+
+  /// initialize the oldest log period if it doesn't exist, and attach it to
+  /// our current history
+  RGWPeriodHistory::Cursor init_oldest_log_period(optional_yield y, const DoutPrefixProvider *dpp);
+
+  /// read the oldest log period, and return a cursor to it in our existing
+  /// period history
+  RGWPeriodHistory::Cursor read_oldest_log_period(optional_yield y, const DoutPrefixProvider *dpp) const;
+
+  /// read the oldest log period asynchronously and write its result to the
+  /// given cursor pointer
+  RGWCoroutine* read_oldest_log_period_cr(const DoutPrefixProvider *dpp,
+                                          RGWPeriodHistory::Cursor *period,
+                                          RGWObjVersionTracker *objv) const;
+
+  /// try to advance the oldest log period when the given period is trimmed,
+  /// using a rados lock to provide atomicity
+  RGWCoroutine* trim_log_period_cr(const DoutPrefixProvider *dpp,
+                                   RGWPeriodHistory::Cursor period,
+                                   RGWObjVersionTracker *objv) const;
+  // read/write the persistent RGWMetadataLogHistory record
+  int read_history(RGWMetadataLogHistory *state, RGWObjVersionTracker *objv_tracker,optional_yield y, const DoutPrefixProvider *dpp) const;
+  int write_history(const DoutPrefixProvider *dpp,
+                    const RGWMetadataLogHistory& state,
+                    RGWObjVersionTracker *objv_tracker,
+                    optional_yield y, bool exclusive = false);
+
+  // append to the current period's log
+  int add_entry(const DoutPrefixProvider *dpp, const std::string& hash_key, const std::string& section, const std::string& key, bufferlist& bl);
+
+  int get_shard_id(const std::string& hash_key, int *shard_id);
+
+  RGWPeriodHistory *get_period_history() {
+    return period_history.get();
+  }
+
+  int pull_period(const DoutPrefixProvider *dpp, const std::string& period_id, RGWPeriod& period, optional_yield y);
+
+  /// find or create the metadata log for the given period
+  RGWMetadataLog* get_log(const std::string& period);
+};
+
diff --git a/src/rgw/services/svc_meta.cc b/src/rgw/services/svc_meta.cc
new file mode 100644
index 000000000..735c39f85
--- /dev/null
+++ b/src/rgw/services/svc_meta.cc
@@ -0,0 +1,46 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+
+#include "svc_meta.h"
+
+#include "rgw_metadata.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+RGWSI_Meta::RGWSI_Meta(CephContext *cct) : RGWServiceInstance(cct) {
+}
+
+RGWSI_Meta::~RGWSI_Meta() {}
+
+void RGWSI_Meta::init(RGWSI_SysObj *_sysobj_svc,
+ RGWSI_MDLog *_mdlog_svc,
+ vector<RGWSI_MetaBackend *>& _be_svc)
+{
+ sysobj_svc = _sysobj_svc;
+ mdlog_svc = _mdlog_svc;
+
+ for (auto& be : _be_svc) {
+ be_svc[be->get_type()] = be;
+ }
+}
+
+int RGWSI_Meta::create_be_handler(RGWSI_MetaBackend::Type be_type,
+ RGWSI_MetaBackend_Handler **phandler)
+{
+ auto iter = be_svc.find(be_type);
+ if (iter == be_svc.end()) {
+ ldout(cct, 0) << __func__ << "(): ERROR: backend type not found" << dendl;
+ return -EINVAL;
+ }
+
+ auto handler = iter->second->alloc_be_handler();
+
+ be_handlers.emplace_back(handler);
+ *phandler = handler;
+
+ return 0;
+}
+
diff --git a/src/rgw/services/svc_meta.h b/src/rgw/services/svc_meta.h
new file mode 100644
index 000000000..b398e27fd
--- /dev/null
+++ b/src/rgw/services/svc_meta.h
@@ -0,0 +1,48 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#pragma once
+
+#include "svc_meta_be.h"
+
+#include "rgw_service.h"
+
+
+class RGWMetadataLog;
+class RGWCoroutine;
+
+
+class RGWSI_Meta : public RGWServiceInstance
+{
+ RGWSI_SysObj *sysobj_svc{nullptr};
+ RGWSI_MDLog *mdlog_svc{nullptr};
+
+ std::map<RGWSI_MetaBackend::Type, RGWSI_MetaBackend *> be_svc;
+
+ std::vector<std::unique_ptr<RGWSI_MetaBackend_Handler> > be_handlers;
+
+public:
+ RGWSI_Meta(CephContext *cct);
+ ~RGWSI_Meta();
+
+ void init(RGWSI_SysObj *_sysobj_svc,
+ RGWSI_MDLog *_mdlog_svc,
+ std::vector<RGWSI_MetaBackend *>& _be_svc);
+
+ int create_be_handler(RGWSI_MetaBackend::Type be_type,
+ RGWSI_MetaBackend_Handler **phandler);
+};
+
diff --git a/src/rgw/services/svc_meta_be.cc b/src/rgw/services/svc_meta_be.cc
new file mode 100644
index 000000000..2cb0365c8
--- /dev/null
+++ b/src/rgw/services/svc_meta_be.cc
@@ -0,0 +1,193 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+
+#include "svc_meta_be.h"
+
+#include "rgw_mdlog.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+RGWSI_MetaBackend::Context::~Context() {} // needed, even though destructor is pure virtual
+RGWSI_MetaBackend::Module::~Module() {} // ditto
+RGWSI_MetaBackend::PutParams::~PutParams() {} // ...
+RGWSI_MetaBackend::GetParams::~GetParams() {} // ...
+RGWSI_MetaBackend::RemoveParams::~RemoveParams() {} // ...
+
+int RGWSI_MetaBackend::pre_modify(const DoutPrefixProvider *dpp,
+ RGWSI_MetaBackend::Context *ctx,
+ const string& key,
+ RGWMetadataLogData& log_data,
+ RGWObjVersionTracker *objv_tracker,
+ RGWMDLogStatus op_type,
+ optional_yield y)
+{
+ /* if write version has not been set, and there's a read version, set it so that we can
+ * log it
+ */
+ if (objv_tracker &&
+ objv_tracker->read_version.ver && !objv_tracker->write_version.ver) {
+ objv_tracker->write_version = objv_tracker->read_version;
+ objv_tracker->write_version.ver++;
+ }
+
+ return 0;
+}
+
+int RGWSI_MetaBackend::post_modify(const DoutPrefixProvider *dpp,
+ RGWSI_MetaBackend::Context *ctx,
+ const string& key,
+ RGWMetadataLogData& log_data,
+ RGWObjVersionTracker *objv_tracker, int ret,
+ optional_yield y)
+{
+ return ret;
+}
+
+int RGWSI_MetaBackend::prepare_mutate(RGWSI_MetaBackend::Context *ctx,
+ const string& key,
+ const real_time& mtime,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y,
+ const DoutPrefixProvider *dpp)
+{
+ real_time orig_mtime;
+
+ int ret = call_with_get_params(&orig_mtime, [&](GetParams& params) {
+ return get_entry(ctx, key, params, objv_tracker, y, dpp);
+ });
+ if (ret < 0 && ret != -ENOENT) {
+ return ret;
+ }
+
+ if (objv_tracker->write_version.tag.empty()) {
+ if (objv_tracker->read_version.tag.empty()) {
+ objv_tracker->generate_new_write_ver(cct);
+ } else {
+ objv_tracker->write_version = objv_tracker->read_version;
+ objv_tracker->write_version.ver++;
+ }
+ }
+ return 0;
+}
+
+int RGWSI_MetaBackend::do_mutate(RGWSI_MetaBackend::Context *ctx,
+ const string& key,
+ const ceph::real_time& mtime,
+ RGWObjVersionTracker *objv_tracker,
+ RGWMDLogStatus op_type,
+ optional_yield y,
+ std::function<int()> f,
+ bool generic_prepare,
+ const DoutPrefixProvider *dpp)
+{
+ int ret;
+
+ if (generic_prepare) {
+ ret = prepare_mutate(ctx, key, mtime, objv_tracker, y, dpp);
+ if (ret < 0 ||
+ ret == STATUS_NO_APPLY) {
+ return ret;
+ }
+ }
+
+ RGWMetadataLogData log_data;
+ ret = pre_modify(dpp, ctx, key, log_data, objv_tracker, op_type, y);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = f();
+
+ /* cascading ret into post_modify() */
+
+ ret = post_modify(dpp, ctx, key, log_data, objv_tracker, ret, y);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+int RGWSI_MetaBackend::get(Context *ctx,
+ const string& key,
+ GetParams& params,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y,
+ const DoutPrefixProvider *dpp,
+ bool get_raw_attrs)
+{
+ return get_entry(ctx, key, params, objv_tracker, y, dpp, get_raw_attrs);
+}
+
+int RGWSI_MetaBackend::put(Context *ctx,
+ const string& key,
+ PutParams& params,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y,
+ const DoutPrefixProvider *dpp)
+{
+ std::function<int()> f = [&]() {
+ return put_entry(dpp, ctx, key, params, objv_tracker, y);
+ };
+
+ return do_mutate(ctx, key, params.mtime, objv_tracker,
+ MDLOG_STATUS_WRITE,
+ y,
+ f,
+ false,
+ dpp);
+}
+
+int RGWSI_MetaBackend::remove(Context *ctx,
+ const string& key,
+ RemoveParams& params,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y,
+ const DoutPrefixProvider *dpp)
+{
+ std::function<int()> f = [&]() {
+ return remove_entry(dpp, ctx, key, params, objv_tracker, y);
+ };
+
+ return do_mutate(ctx, key, params.mtime, objv_tracker,
+ MDLOG_STATUS_REMOVE,
+ y,
+ f,
+ false,
+ dpp);
+}
+
+int RGWSI_MetaBackend::mutate(Context *ctx,
+ const std::string& key,
+ MutateParams& params,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y,
+ std::function<int()> f,
+ const DoutPrefixProvider *dpp)
+{
+ return do_mutate(ctx, key, params.mtime, objv_tracker,
+ params.op_type, y,
+ f,
+ false,
+ dpp);
+}
+
+int RGWSI_MetaBackend_Handler::call(std::optional<RGWSI_MetaBackend_CtxParams> bectx_params,
+ std::function<int(Op *)> f)
+{
+ return be->call(bectx_params, [&](RGWSI_MetaBackend::Context *ctx) {
+ ctx->init(this);
+ Op op(be, ctx);
+ return f(&op);
+ });
+}
+
+RGWSI_MetaBackend_Handler::Op_ManagedCtx::Op_ManagedCtx(RGWSI_MetaBackend_Handler *handler) : Op(handler->be, handler->be->alloc_ctx())
+{
+ auto c = ctx();
+ c->init(handler);
+ pctx.reset(c);
+}
+
diff --git a/src/rgw/services/svc_meta_be.h b/src/rgw/services/svc_meta_be.h
new file mode 100644
index 000000000..97267a4e7
--- /dev/null
+++ b/src/rgw/services/svc_meta_be.h
@@ -0,0 +1,294 @@
+
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#pragma once
+
+#include "svc_meta_be_params.h"
+
+#include "rgw_service.h"
+#include "rgw_mdlog_types.h"
+
+class RGWMetadataLogData;
+
+class RGWSI_MDLog;
+class RGWSI_Meta;
+class RGWObjVersionTracker;
+class RGWSI_MetaBackend_Handler;
+
+class RGWSI_MetaBackend : public RGWServiceInstance
+{
+ friend class RGWSI_Meta;
+public:
+ class Module;
+ class Context;
+protected:
+ RGWSI_MDLog *mdlog_svc{nullptr};
+
+ void base_init(RGWSI_MDLog *_mdlog_svc) {
+ mdlog_svc = _mdlog_svc;
+ }
+
+ int prepare_mutate(RGWSI_MetaBackend::Context *ctx,
+ const std::string& key,
+ const ceph::real_time& mtime,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y,
+ const DoutPrefixProvider *dpp);
+
+ virtual int do_mutate(Context *ctx,
+ const std::string& key,
+ const ceph::real_time& mtime, RGWObjVersionTracker *objv_tracker,
+ RGWMDLogStatus op_type,
+ optional_yield y,
+ std::function<int()> f,
+ bool generic_prepare,
+ const DoutPrefixProvider *dpp);
+
+ virtual int pre_modify(const DoutPrefixProvider *dpp,
+ Context *ctx,
+ const std::string& key,
+ RGWMetadataLogData& log_data,
+ RGWObjVersionTracker *objv_tracker,
+ RGWMDLogStatus op_type,
+ optional_yield y);
+ virtual int post_modify(const DoutPrefixProvider *dpp,
+ Context *ctx,
+ const std::string& key,
+ RGWMetadataLogData& log_data,
+ RGWObjVersionTracker *objv_tracker, int ret,
+ optional_yield y);
+public:
+ class Module {
+ /*
+ * Backend specialization module
+ */
+ public:
+ virtual ~Module() = 0;
+ };
+
+ using ModuleRef = std::shared_ptr<Module>;
+
+ struct Context { /*
+ * A single metadata operation context. Will be holding info about
+ * backend and operation itself; operation might span multiple backend
+ * calls.
+ */
+ virtual ~Context() = 0;
+
+ virtual void init(RGWSI_MetaBackend_Handler *h) = 0;
+ };
+
+ virtual Context *alloc_ctx() = 0;
+
+ struct PutParams {
+ ceph::real_time mtime;
+
+ PutParams() {}
+ PutParams(const ceph::real_time& _mtime) : mtime(_mtime) {}
+ virtual ~PutParams() = 0;
+ };
+
+ struct GetParams {
+ GetParams() {}
+ GetParams(ceph::real_time *_pmtime) : pmtime(_pmtime) {}
+ virtual ~GetParams();
+
+ ceph::real_time *pmtime{nullptr};
+ };
+
+ struct RemoveParams {
+ virtual ~RemoveParams() = 0;
+
+ ceph::real_time mtime;
+ };
+
+ struct MutateParams {
+ ceph::real_time mtime;
+ RGWMDLogStatus op_type;
+
+ MutateParams() {}
+ MutateParams(const ceph::real_time& _mtime,
+ RGWMDLogStatus _op_type) : mtime(_mtime), op_type(_op_type) {}
+ virtual ~MutateParams() {}
+ };
+
+ enum Type {
+ MDBE_SOBJ = 0,
+ MDBE_OTP = 1,
+ };
+
+ RGWSI_MetaBackend(CephContext *cct) : RGWServiceInstance(cct) {}
+ virtual ~RGWSI_MetaBackend() {}
+
+ virtual Type get_type() = 0;
+
+ virtual RGWSI_MetaBackend_Handler *alloc_be_handler() = 0;
+ virtual int call_with_get_params(ceph::real_time *pmtime, std::function<int(RGWSI_MetaBackend::GetParams&)>) = 0;
+
+ /* these should be implemented by backends */
+ virtual int get_entry(RGWSI_MetaBackend::Context *ctx,
+ const std::string& key,
+ RGWSI_MetaBackend::GetParams& params,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y,
+ const DoutPrefixProvider *dpp,
+ bool get_raw_attrs=false) = 0;
+ virtual int put_entry(const DoutPrefixProvider *dpp,
+ RGWSI_MetaBackend::Context *ctx,
+ const std::string& key,
+ RGWSI_MetaBackend::PutParams& params,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y) = 0;
+ virtual int remove_entry(const DoutPrefixProvider *dpp,
+ Context *ctx,
+ const std::string& key,
+ RGWSI_MetaBackend::RemoveParams& params,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y) = 0;
+
+ virtual int list_init(const DoutPrefixProvider *dpp, RGWSI_MetaBackend::Context *ctx, const std::string& marker) = 0;
+ virtual int list_next(const DoutPrefixProvider *dpp,
+ RGWSI_MetaBackend::Context *ctx,
+ int max, std::list<std::string> *keys,
+ bool *truncated) = 0;
+ virtual int list_get_marker(RGWSI_MetaBackend::Context *ctx,
+ std::string *marker) = 0;
+
+ int call(std::function<int(RGWSI_MetaBackend::Context *)> f) {
+ return call(std::nullopt, f);
+ }
+
+ virtual int call(std::optional<RGWSI_MetaBackend_CtxParams> opt,
+ std::function<int(RGWSI_MetaBackend::Context *)> f) = 0;
+
+ virtual int get_shard_id(RGWSI_MetaBackend::Context *ctx,
+ const std::string& key,
+ int *shard_id) = 0;
+
+ /* higher level */
+ virtual int get(Context *ctx,
+ const std::string& key,
+ GetParams &params,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y,
+ const DoutPrefixProvider *dpp,
+ bool get_raw_attrs=false);
+
+ virtual int put(Context *ctx,
+ const std::string& key,
+ PutParams& params,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y,
+ const DoutPrefixProvider *dpp);
+
+ virtual int remove(Context *ctx,
+ const std::string& key,
+ RemoveParams& params,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y,
+ const DoutPrefixProvider *dpp);
+
+ virtual int mutate(Context *ctx,
+ const std::string& key,
+ MutateParams& params,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y,
+ std::function<int()> f,
+ const DoutPrefixProvider *dpp);
+};
+
+class RGWSI_MetaBackend_Handler {
+ RGWSI_MetaBackend *be{nullptr};
+
+public:
+ class Op {
+ friend class RGWSI_MetaBackend_Handler;
+
+ RGWSI_MetaBackend *be;
+ RGWSI_MetaBackend::Context *be_ctx;
+
+ Op(RGWSI_MetaBackend *_be,
+ RGWSI_MetaBackend::Context *_ctx) : be(_be), be_ctx(_ctx) {}
+
+ public:
+ RGWSI_MetaBackend::Context *ctx() {
+ return be_ctx;
+ }
+
+ int get(const std::string& key,
+ RGWSI_MetaBackend::GetParams &params,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y, const DoutPrefixProvider *dpp) {
+ return be->get(be_ctx, key, params, objv_tracker, y, dpp);
+ }
+
+ int put(const std::string& key,
+ RGWSI_MetaBackend::PutParams& params,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y, const DoutPrefixProvider *dpp) {
+ return be->put(be_ctx, key, params, objv_tracker, y, dpp);
+ }
+
+ int remove(const std::string& key,
+ RGWSI_MetaBackend::RemoveParams& params,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y, const DoutPrefixProvider *dpp) {
+ return be->remove(be_ctx, key, params, objv_tracker, y, dpp);
+ }
+
+ int mutate(const std::string& key,
+ RGWSI_MetaBackend::MutateParams& params,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y,
+ std::function<int()> f,
+ const DoutPrefixProvider *dpp) {
+ return be->mutate(be_ctx, key, params, objv_tracker, y, f, dpp);
+ }
+
+ int list_init(const DoutPrefixProvider *dpp, const std::string& marker) {
+ return be->list_init(dpp, be_ctx, marker);
+ }
+ int list_next(const DoutPrefixProvider *dpp, int max, std::list<std::string> *keys,
+ bool *truncated) {
+ return be->list_next(dpp, be_ctx, max, keys, truncated);
+ }
+ int list_get_marker(std::string *marker) {
+ return be->list_get_marker(be_ctx, marker);
+ }
+
+ int get_shard_id(const std::string& key, int *shard_id) {
+ return be->get_shard_id(be_ctx, key, shard_id);
+ }
+ };
+
+ class Op_ManagedCtx : public Op {
+ std::unique_ptr<RGWSI_MetaBackend::Context> pctx;
+ public:
+ Op_ManagedCtx(RGWSI_MetaBackend_Handler *handler);
+ };
+
+ RGWSI_MetaBackend_Handler(RGWSI_MetaBackend *_be) : be(_be) {}
+ virtual ~RGWSI_MetaBackend_Handler() {}
+
+ int call(std::function<int(Op *)> f) {
+ return call(std::nullopt, f);
+ }
+
+ virtual int call(std::optional<RGWSI_MetaBackend_CtxParams> bectx_params,
+ std::function<int(Op *)> f);
+};
+
diff --git a/src/rgw/services/svc_meta_be_otp.cc b/src/rgw/services/svc_meta_be_otp.cc
new file mode 100644
index 000000000..3cabeb9d0
--- /dev/null
+++ b/src/rgw/services/svc_meta_be_otp.cc
@@ -0,0 +1,73 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "svc_meta_be_otp.h"
+
+#include "rgw_tools.h"
+#include "rgw_metadata.h"
+#include "rgw_mdlog.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+RGWSI_MetaBackend_OTP::RGWSI_MetaBackend_OTP(CephContext *cct) : RGWSI_MetaBackend_SObj(cct) {
+}
+
+RGWSI_MetaBackend_OTP::~RGWSI_MetaBackend_OTP() {
+}
+
+string RGWSI_MetaBackend_OTP::get_meta_key(const rgw_user& user)
+{
+ return string("otp:user:") + user.to_str();
+}
+
+RGWSI_MetaBackend_Handler *RGWSI_MetaBackend_OTP::alloc_be_handler()
+{
+ return new RGWSI_MetaBackend_Handler_OTP(this);
+}
+
+RGWSI_MetaBackend::Context *RGWSI_MetaBackend_OTP::alloc_ctx()
+{
+ return new Context_OTP;
+}
+
+int RGWSI_MetaBackend_OTP::call_with_get_params(ceph::real_time *pmtime, std::function<int(RGWSI_MetaBackend::GetParams&)> cb)
+{
+ otp_devices_list_t devices;
+ RGWSI_MBOTP_GetParams params;
+ params.pdevices = &devices;
+ params.pmtime = pmtime;
+ return cb(params);
+}
+
+int RGWSI_MetaBackend_OTP::get_entry(RGWSI_MetaBackend::Context *_ctx,
+ const string& key,
+ RGWSI_MetaBackend::GetParams& _params,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y,
+ const DoutPrefixProvider *dpp,
+ bool get_raw_attrs)
+{
+ RGWSI_MBOTP_GetParams& params = static_cast<RGWSI_MBOTP_GetParams&>(_params);
+
+ int r = cls_svc->mfa.list_mfa(dpp, key, params.pdevices, objv_tracker, params.pmtime, y);
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+int RGWSI_MetaBackend_OTP::put_entry(const DoutPrefixProvider *dpp,
+ RGWSI_MetaBackend::Context *_ctx,
+ const string& key,
+ RGWSI_MetaBackend::PutParams& _params,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y)
+{
+ RGWSI_MBOTP_PutParams& params = static_cast<RGWSI_MBOTP_PutParams&>(_params);
+
+ return cls_svc->mfa.set_mfa(dpp, key, params.devices, true, objv_tracker, params.mtime, y);
+}
+
diff --git a/src/rgw/services/svc_meta_be_otp.h b/src/rgw/services/svc_meta_be_otp.h
new file mode 100644
index 000000000..7bd9cf652
--- /dev/null
+++ b/src/rgw/services/svc_meta_be_otp.h
@@ -0,0 +1,89 @@
+
+
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#pragma once
+
+#include "rgw_service.h"
+
+#include "svc_cls.h"
+#include "svc_meta_be.h"
+#include "svc_meta_be_sobj.h"
+#include "svc_sys_obj.h"
+
+
+using RGWSI_MBOTP_Handler_Module = RGWSI_MBSObj_Handler_Module;
+using RGWSI_MetaBackend_Handler_OTP = RGWSI_MetaBackend_Handler_SObj;
+
+using otp_devices_list_t = std::list<rados::cls::otp::otp_info_t>;
+
+struct RGWSI_MBOTP_GetParams : public RGWSI_MetaBackend::GetParams {
+ otp_devices_list_t *pdevices{nullptr};
+};
+
+struct RGWSI_MBOTP_PutParams : public RGWSI_MetaBackend::PutParams {
+ otp_devices_list_t devices;
+};
+
+using RGWSI_MBOTP_RemoveParams = RGWSI_MBSObj_RemoveParams;
+
+class RGWSI_MetaBackend_OTP : public RGWSI_MetaBackend_SObj
+{
+ RGWSI_Cls *cls_svc{nullptr};
+
+public:
+ struct Context_OTP : public RGWSI_MetaBackend_SObj::Context_SObj {
+ otp_devices_list_t devices;
+ };
+
+ RGWSI_MetaBackend_OTP(CephContext *cct);
+ virtual ~RGWSI_MetaBackend_OTP();
+
+ RGWSI_MetaBackend::Type get_type() {
+ return MDBE_OTP;
+ }
+
+ static std::string get_meta_key(const rgw_user& user);
+
+ void init(RGWSI_SysObj *_sysobj_svc,
+ RGWSI_MDLog *_mdlog_svc,
+ RGWSI_Cls *_cls_svc) {
+ RGWSI_MetaBackend_SObj::init(_sysobj_svc, _mdlog_svc);
+ cls_svc = _cls_svc;
+ }
+
+ RGWSI_MetaBackend_Handler *alloc_be_handler() override;
+ RGWSI_MetaBackend::Context *alloc_ctx() override;
+
+ int call_with_get_params(ceph::real_time *pmtime, std::function<int(RGWSI_MetaBackend::GetParams&)> cb) override;
+
+ int get_entry(RGWSI_MetaBackend::Context *ctx,
+ const std::string& key,
+ RGWSI_MetaBackend::GetParams& _params,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y,
+ const DoutPrefixProvider *dpp,
+ bool get_raw_attrs=false);
+ int put_entry(const DoutPrefixProvider *dpp,
+ RGWSI_MetaBackend::Context *ctx,
+ const std::string& key,
+ RGWSI_MetaBackend::PutParams& _params,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y);
+};
+
+
diff --git a/src/rgw/services/svc_meta_be_params.h b/src/rgw/services/svc_meta_be_params.h
new file mode 100644
index 000000000..445f6e188
--- /dev/null
+++ b/src/rgw/services/svc_meta_be_params.h
@@ -0,0 +1,25 @@
+
+
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#pragma once
+
+#include <variant>
+
+struct RGWSI_MetaBackend_CtxParams_SObj {};
+
+using RGWSI_MetaBackend_CtxParams = std::variant<RGWSI_MetaBackend_CtxParams_SObj>;
diff --git a/src/rgw/services/svc_meta_be_sobj.cc b/src/rgw/services/svc_meta_be_sobj.cc
new file mode 100644
index 000000000..c0ff402fc
--- /dev/null
+++ b/src/rgw/services/svc_meta_be_sobj.cc
@@ -0,0 +1,246 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "svc_meta_be_sobj.h"
+#include "svc_meta_be_params.h"
+#include "svc_mdlog.h"
+
+#include "rgw_tools.h"
+#include "rgw_metadata.h"
+#include "rgw_mdlog.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+RGWSI_MetaBackend_SObj::RGWSI_MetaBackend_SObj(CephContext *cct) : RGWSI_MetaBackend(cct) {
+}
+
+RGWSI_MetaBackend_SObj::~RGWSI_MetaBackend_SObj() {
+}
+
+RGWSI_MetaBackend_Handler *RGWSI_MetaBackend_SObj::alloc_be_handler()
+{
+ return new RGWSI_MetaBackend_Handler_SObj(this);
+}
+
+RGWSI_MetaBackend::Context *RGWSI_MetaBackend_SObj::alloc_ctx()
+{
+ return new Context_SObj;
+}
+
+int RGWSI_MetaBackend_SObj::pre_modify(const DoutPrefixProvider *dpp, RGWSI_MetaBackend::Context *_ctx,
+ const string& key,
+ RGWMetadataLogData& log_data,
+ RGWObjVersionTracker *objv_tracker,
+ RGWMDLogStatus op_type,
+ optional_yield y)
+{
+ auto ctx = static_cast<Context_SObj *>(_ctx);
+ int ret = RGWSI_MetaBackend::pre_modify(dpp, ctx, key, log_data,
+ objv_tracker, op_type,
+ y);
+ if (ret < 0) {
+ return ret;
+ }
+
+ /* if write version has not been set, and there's a read version, set it so that we can
+ * log it
+ */
+ if (objv_tracker) {
+ log_data.read_version = objv_tracker->read_version;
+ log_data.write_version = objv_tracker->write_version;
+ }
+
+ log_data.status = op_type;
+
+ bufferlist logbl;
+ encode(log_data, logbl);
+
+ ret = mdlog_svc->add_entry(dpp, ctx->module->get_hash_key(key), ctx->module->get_section(), key, logbl);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+int RGWSI_MetaBackend_SObj::post_modify(const DoutPrefixProvider *dpp,
+ RGWSI_MetaBackend::Context *_ctx,
+ const string& key,
+ RGWMetadataLogData& log_data,
+ RGWObjVersionTracker *objv_tracker, int ret,
+ optional_yield y)
+{
+ auto ctx = static_cast<Context_SObj *>(_ctx);
+ if (ret >= 0)
+ log_data.status = MDLOG_STATUS_COMPLETE;
+ else
+ log_data.status = MDLOG_STATUS_ABORT;
+
+ bufferlist logbl;
+ encode(log_data, logbl);
+
+ int r = mdlog_svc->add_entry(dpp, ctx->module->get_hash_key(key), ctx->module->get_section(), key, logbl);
+ if (ret < 0)
+ return ret;
+
+ if (r < 0)
+ return r;
+
+ return RGWSI_MetaBackend::post_modify(dpp, ctx, key, log_data, objv_tracker, ret, y);
+}
+
+int RGWSI_MetaBackend_SObj::get_shard_id(RGWSI_MetaBackend::Context *_ctx,
+ const std::string& key,
+ int *shard_id)
+{
+ auto ctx = static_cast<Context_SObj *>(_ctx);
+ *shard_id = mdlog_svc->get_shard_id(ctx->module->get_hash_key(key), shard_id);
+ return 0;
+}
+
+int RGWSI_MetaBackend_SObj::call(std::optional<RGWSI_MetaBackend_CtxParams> opt,
+ std::function<int(RGWSI_MetaBackend::Context *)> f)
+{
+ RGWSI_MetaBackend_SObj::Context_SObj ctx;
+ return f(&ctx);
+}
+
+void RGWSI_MetaBackend_SObj::Context_SObj::init(RGWSI_MetaBackend_Handler *h)
+{
+ RGWSI_MetaBackend_Handler_SObj *handler = static_cast<RGWSI_MetaBackend_Handler_SObj *>(h);
+ module = handler->module;
+}
+
+int RGWSI_MetaBackend_SObj::call_with_get_params(ceph::real_time *pmtime, std::function<int(RGWSI_MetaBackend::GetParams&)> cb)
+{
+ bufferlist bl;
+ RGWSI_MBSObj_GetParams params;
+ params.pmtime = pmtime;
+ params.pbl = &bl;
+ return cb(params);
+}
+
+int RGWSI_MetaBackend_SObj::get_entry(RGWSI_MetaBackend::Context *_ctx,
+ const string& key,
+ GetParams& _params,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y,
+ const DoutPrefixProvider *dpp,
+ bool get_raw_attrs)
+{
+ RGWSI_MetaBackend_SObj::Context_SObj *ctx = static_cast<RGWSI_MetaBackend_SObj::Context_SObj *>(_ctx);
+ RGWSI_MBSObj_GetParams& params = static_cast<RGWSI_MBSObj_GetParams&>(_params);
+
+ rgw_pool pool;
+ string oid;
+ ctx->module->get_pool_and_oid(key, &pool, &oid);
+
+ int ret = 0;
+ ret = rgw_get_system_obj(sysobj_svc, pool, oid, *params.pbl,
+ objv_tracker, params.pmtime,
+ y, dpp,
+ params.pattrs, params.cache_info,
+ params.refresh_version, get_raw_attrs);
+
+ return ret;
+}
+
+int RGWSI_MetaBackend_SObj::put_entry(const DoutPrefixProvider *dpp,
+ RGWSI_MetaBackend::Context *_ctx,
+ const string& key,
+ PutParams& _params,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y)
+{
+ RGWSI_MetaBackend_SObj::Context_SObj *ctx = static_cast<RGWSI_MetaBackend_SObj::Context_SObj *>(_ctx);
+ RGWSI_MBSObj_PutParams& params = static_cast<RGWSI_MBSObj_PutParams&>(_params);
+
+ rgw_pool pool;
+ string oid;
+ ctx->module->get_pool_and_oid(key, &pool, &oid);
+
+ return rgw_put_system_obj(dpp, sysobj_svc, pool, oid, params.bl, params.exclusive,
+ objv_tracker, params.mtime, y, params.pattrs);
+}
+
+int RGWSI_MetaBackend_SObj::remove_entry(const DoutPrefixProvider *dpp,
+ RGWSI_MetaBackend::Context *_ctx,
+ const string& key,
+ RemoveParams& params,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y)
+{
+ RGWSI_MetaBackend_SObj::Context_SObj *ctx = static_cast<RGWSI_MetaBackend_SObj::Context_SObj *>(_ctx);
+
+ rgw_pool pool;
+ string oid;
+ ctx->module->get_pool_and_oid(key, &pool, &oid);
+ rgw_raw_obj k(pool, oid);
+
+ auto sysobj = sysobj_svc->get_obj(k);
+ return sysobj.wop()
+ .set_objv_tracker(objv_tracker)
+ .remove(dpp, y);
+}
+
+int RGWSI_MetaBackend_SObj::list_init(const DoutPrefixProvider *dpp,
+ RGWSI_MetaBackend::Context *_ctx,
+ const string& marker)
+{
+ RGWSI_MetaBackend_SObj::Context_SObj *ctx = static_cast<RGWSI_MetaBackend_SObj::Context_SObj *>(_ctx);
+
+ rgw_pool pool;
+
+ string no_key;
+ ctx->module->get_pool_and_oid(no_key, &pool, nullptr);
+
+ ctx->list.pool = sysobj_svc->get_pool(pool);
+ ctx->list.op.emplace(ctx->list.pool->op());
+
+ string prefix = ctx->module->get_oid_prefix();
+ ctx->list.op->init(dpp, marker, prefix);
+
+ return 0;
+}
+
+int RGWSI_MetaBackend_SObj::list_next(const DoutPrefixProvider *dpp,
+ RGWSI_MetaBackend::Context *_ctx,
+ int max, list<string> *keys,
+ bool *truncated)
+{
+ RGWSI_MetaBackend_SObj::Context_SObj *ctx = static_cast<RGWSI_MetaBackend_SObj::Context_SObj *>(_ctx);
+
+ vector<string> oids;
+
+ keys->clear();
+
+ int ret = ctx->list.op->get_next(dpp, max, &oids, truncated);
+ if (ret < 0 && ret != -ENOENT)
+ return ret;
+ if (ret == -ENOENT) {
+ if (truncated)
+ *truncated = false;
+ return 0;
+ }
+
+ auto module = ctx->module;
+
+ for (auto& o : oids) {
+ if (!module->is_valid_oid(o)) {
+ continue;
+ }
+ keys->emplace_back(module->oid_to_key(o));
+ }
+
+ return 0;
+}
+
+int RGWSI_MetaBackend_SObj::list_get_marker(RGWSI_MetaBackend::Context *_ctx,
+ string *marker)
+{
+ RGWSI_MetaBackend_SObj::Context_SObj *ctx = static_cast<RGWSI_MetaBackend_SObj::Context_SObj *>(_ctx);
+
+ return ctx->list.op->get_marker(marker);
+}
+
diff --git a/src/rgw/services/svc_meta_be_sobj.h b/src/rgw/services/svc_meta_be_sobj.h
new file mode 100644
index 000000000..304afc8bf
--- /dev/null
+++ b/src/rgw/services/svc_meta_be_sobj.h
@@ -0,0 +1,194 @@
+
+
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#pragma once
+
+#include "rgw_service.h"
+
+#include "svc_meta_be.h"
+#include "svc_sys_obj.h"
+
+
+class RGWSI_MBSObj_Handler_Module : public RGWSI_MetaBackend::Module {
+protected:
+ std::string section;
+public:
+ RGWSI_MBSObj_Handler_Module(const std::string& _section) : section(_section) {}
+ virtual void get_pool_and_oid(const std::string& key, rgw_pool *pool, std::string *oid) = 0;
+ virtual const std::string& get_oid_prefix() = 0;
+ virtual std::string key_to_oid(const std::string& key) = 0;
+ virtual bool is_valid_oid(const std::string& oid) = 0;
+ virtual std::string oid_to_key(const std::string& oid) = 0;
+
+ const std::string& get_section() {
+ return section;
+ }
+
+ /* key to use for hashing entries for log shard placement */
+ virtual std::string get_hash_key(const std::string& key) {
+ return section + ":" + key;
+ }
+};
+
+struct RGWSI_MBSObj_GetParams : public RGWSI_MetaBackend::GetParams {
+ bufferlist *pbl{nullptr};
+ std::map<std::string, bufferlist> *pattrs{nullptr};
+ rgw_cache_entry_info *cache_info{nullptr};
+ boost::optional<obj_version> refresh_version;
+
+ RGWSI_MBSObj_GetParams() {}
+ RGWSI_MBSObj_GetParams(bufferlist *_pbl,
+ std::map<std::string, bufferlist> *_pattrs,
+ ceph::real_time *_pmtime) : RGWSI_MetaBackend::GetParams(_pmtime),
+ pbl(_pbl),
+ pattrs(_pattrs) {}
+
+ RGWSI_MBSObj_GetParams& set_cache_info(rgw_cache_entry_info *_cache_info) {
+ cache_info = _cache_info;
+ return *this;
+ }
+ RGWSI_MBSObj_GetParams& set_refresh_version(boost::optional<obj_version>& _refresh_version) {
+ refresh_version = _refresh_version;
+ return *this;
+ }
+};
+
+struct RGWSI_MBSObj_PutParams : public RGWSI_MetaBackend::PutParams {
+ bufferlist bl;
+ std::map<std::string, bufferlist> *pattrs{nullptr};
+ bool exclusive{false};
+
+ RGWSI_MBSObj_PutParams() {}
+ RGWSI_MBSObj_PutParams(std::map<std::string, bufferlist> *_pattrs,
+ const ceph::real_time& _mtime) : RGWSI_MetaBackend::PutParams(_mtime),
+ pattrs(_pattrs) {}
+ RGWSI_MBSObj_PutParams(bufferlist& _bl,
+ std::map<std::string, bufferlist> *_pattrs,
+ const ceph::real_time& _mtime,
+ bool _exclusive) : RGWSI_MetaBackend::PutParams(_mtime),
+ bl(_bl),
+ pattrs(_pattrs),
+ exclusive(_exclusive) {}
+};
+
+struct RGWSI_MBSObj_RemoveParams : public RGWSI_MetaBackend::RemoveParams {
+};
+
+class RGWSI_MetaBackend_SObj : public RGWSI_MetaBackend
+{
+protected:
+ RGWSI_SysObj *sysobj_svc{nullptr};
+
+public:
+ struct Context_SObj : public RGWSI_MetaBackend::Context {
+ RGWSI_MBSObj_Handler_Module *module{nullptr};
+ struct _list {
+ std::optional<RGWSI_SysObj::Pool> pool;
+ std::optional<RGWSI_SysObj::Pool::Op> op;
+ } list;
+
+ void init(RGWSI_MetaBackend_Handler *h) override;
+ };
+
+ RGWSI_MetaBackend_SObj(CephContext *cct);
+ virtual ~RGWSI_MetaBackend_SObj();
+
+ RGWSI_MetaBackend::Type get_type() {
+ return MDBE_SOBJ;
+ }
+
+ void init(RGWSI_SysObj *_sysobj_svc,
+ RGWSI_MDLog *_mdlog_svc) {
+ base_init(_mdlog_svc);
+ sysobj_svc = _sysobj_svc;
+ }
+
+ RGWSI_MetaBackend_Handler *alloc_be_handler() override;
+ RGWSI_MetaBackend::Context *alloc_ctx() override;
+
+
+ int call_with_get_params(ceph::real_time *pmtime, std::function<int(RGWSI_MetaBackend::GetParams&)> cb) override;
+
+ int pre_modify(const DoutPrefixProvider *dpp,
+ RGWSI_MetaBackend::Context *ctx,
+ const std::string& key,
+ RGWMetadataLogData& log_data,
+ RGWObjVersionTracker *objv_tracker,
+ RGWMDLogStatus op_type,
+ optional_yield y);
+ int post_modify(const DoutPrefixProvider *dpp,
+ RGWSI_MetaBackend::Context *ctx,
+ const std::string& key,
+ RGWMetadataLogData& log_data,
+ RGWObjVersionTracker *objv_tracker, int ret,
+ optional_yield y);
+
+ int get_entry(RGWSI_MetaBackend::Context *ctx,
+ const std::string& key,
+ RGWSI_MetaBackend::GetParams& params,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y,
+ const DoutPrefixProvider *dpp,
+ bool get_raw_attrs=false) override;
+ int put_entry(const DoutPrefixProvider *dpp,
+ RGWSI_MetaBackend::Context *ctx,
+ const std::string& key,
+ RGWSI_MetaBackend::PutParams& params,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y) override;
+ int remove_entry(const DoutPrefixProvider *dpp,
+ RGWSI_MetaBackend::Context *ctx,
+ const std::string& key,
+ RGWSI_MetaBackend::RemoveParams& params,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y) override;
+
+ int list_init(const DoutPrefixProvider *dpp, RGWSI_MetaBackend::Context *_ctx, const std::string& marker) override;
+ int list_next(const DoutPrefixProvider *dpp,
+ RGWSI_MetaBackend::Context *_ctx,
+ int max, std::list<std::string> *keys,
+ bool *truncated) override;
+ int list_get_marker(RGWSI_MetaBackend::Context *ctx,
+ std::string *marker) override;
+
+ int get_shard_id(RGWSI_MetaBackend::Context *ctx,
+ const std::string& key,
+ int *shard_id) override;
+
+ int call(std::optional<RGWSI_MetaBackend_CtxParams> opt,
+ std::function<int(RGWSI_MetaBackend::Context *)> f) override;
+};
+
+
+class RGWSI_MetaBackend_Handler_SObj : public RGWSI_MetaBackend_Handler {
+ friend class RGWSI_MetaBackend_SObj::Context_SObj;
+
+ RGWSI_MBSObj_Handler_Module *module{nullptr};
+
+public:
+ RGWSI_MetaBackend_Handler_SObj(RGWSI_MetaBackend *be) :
+ RGWSI_MetaBackend_Handler(be) {}
+
+ void set_module(RGWSI_MBSObj_Handler_Module *_module) {
+ module = _module;
+ }
+
+ RGWSI_MBSObj_Handler_Module *get_module() {
+ return module;
+ }
+};
diff --git a/src/rgw/services/svc_meta_be_types.h b/src/rgw/services/svc_meta_be_types.h
new file mode 100644
index 000000000..4a88a8e0b
--- /dev/null
+++ b/src/rgw/services/svc_meta_be_types.h
@@ -0,0 +1,26 @@
+
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
// Discriminator tags consumed by the ptr_wrapper<> aliases (see the
// svc_*_types.h headers) so that each metadata-backend handler/context
// gets a distinct compile-time type and cannot be mixed up accidentally.
enum RGWSI_META_BE_TYPES {
  SOBJ = 1,
  OTP = 2,
  BUCKET = 3,
  BI = 4,
  USER = 5,
};
+
diff --git a/src/rgw/services/svc_notify.cc b/src/rgw/services/svc_notify.cc
new file mode 100644
index 000000000..43f84ed0a
--- /dev/null
+++ b/src/rgw/services/svc_notify.cc
@@ -0,0 +1,515 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "include/random.h"
+#include "include/Context.h"
+#include "common/errno.h"
+
+#include "rgw_cache.h"
+#include "svc_notify.h"
+#include "svc_finisher.h"
+#include "svc_zone.h"
+#include "svc_rados.h"
+
+#include "rgw_zone.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+static string notify_oid_prefix = "notify";
+
// Destructor delegates to shutdown(), which is idempotent (guarded by the
// 'finalized' flag), so destruction after an explicit shutdown is safe.
RGWSI_Notify::~RGWSI_Notify()
{
  shutdown();
}
+
+
+class RGWWatcher : public DoutPrefixProvider , public librados::WatchCtx2 {
+ CephContext *cct;
+ RGWSI_Notify *svc;
+ int index;
+ RGWSI_RADOS::Obj obj;
+ uint64_t watch_handle;
+ int register_ret{0};
+ bool unregister_done{false};
+ librados::AioCompletion *register_completion{nullptr};
+
+ class C_ReinitWatch : public Context {
+ RGWWatcher *watcher;
+ public:
+ explicit C_ReinitWatch(RGWWatcher *_watcher) : watcher(_watcher) {}
+ void finish(int r) override {
+ watcher->reinit();
+ }
+ };
+
+ CephContext *get_cct() const override { return cct; }
+ unsigned get_subsys() const override { return dout_subsys; }
+ std::ostream& gen_prefix(std::ostream& out) const override {
+ return out << "rgw watcher librados: ";
+ }
+
+public:
+ RGWWatcher(CephContext *_cct, RGWSI_Notify *s, int i, RGWSI_RADOS::Obj& o) : cct(_cct), svc(s), index(i), obj(o), watch_handle(0) {}
+ void handle_notify(uint64_t notify_id,
+ uint64_t cookie,
+ uint64_t notifier_id,
+ bufferlist& bl) override {
+ ldpp_dout(this, 10) << "RGWWatcher::handle_notify() "
+ << " notify_id " << notify_id
+ << " cookie " << cookie
+ << " notifier " << notifier_id
+ << " bl.length()=" << bl.length() << dendl;
+
+ if (unlikely(svc->inject_notify_timeout_probability == 1) ||
+ (svc->inject_notify_timeout_probability > 0 &&
+ (svc->inject_notify_timeout_probability >
+ ceph::util::generate_random_number(0.0, 1.0)))) {
+ ldpp_dout(this, 0)
+ << "RGWWatcher::handle_notify() dropping notification! "
+ << "If this isn't what you want, set "
+ << "rgw_inject_notify_timeout_probability to zero!" << dendl;
+ return;
+ }
+
+ svc->watch_cb(this, notify_id, cookie, notifier_id, bl);
+
+ bufferlist reply_bl; // empty reply payload
+ obj.notify_ack(notify_id, cookie, reply_bl);
+ }
+ void handle_error(uint64_t cookie, int err) override {
+ ldpp_dout(this, -1) << "RGWWatcher::handle_error cookie " << cookie
+ << " err " << cpp_strerror(err) << dendl;
+ svc->remove_watcher(index);
+ svc->schedule_context(new C_ReinitWatch(this));
+ }
+
+ void reinit() {
+ if(!unregister_done) {
+ int ret = unregister_watch();
+ if (ret < 0) {
+ ldout(cct, 0) << "ERROR: unregister_watch() returned ret=" << ret << dendl;
+ }
+ }
+ int ret = register_watch();
+ if (ret < 0) {
+ ldout(cct, 0) << "ERROR: register_watch() returned ret=" << ret << dendl;
+ svc->schedule_context(new C_ReinitWatch(this));
+ return;
+ }
+ }
+
+ int unregister_watch() {
+ int r = svc->unwatch(obj, watch_handle);
+ unregister_done = true;
+ if (r < 0) {
+ return r;
+ }
+ svc->remove_watcher(index);
+ return 0;
+ }
+
+ int register_watch_async() {
+ if (register_completion) {
+ register_completion->release();
+ register_completion = nullptr;
+ }
+ register_completion = librados::Rados::aio_create_completion(nullptr, nullptr);
+ register_ret = obj.aio_watch(register_completion, &watch_handle, this);
+ if (register_ret < 0) {
+ register_completion->release();
+ return register_ret;
+ }
+ return 0;
+ }
+
+ int register_watch_finish() {
+ if (register_ret < 0) {
+ return register_ret;
+ }
+ if (!register_completion) {
+ return -EINVAL;
+ }
+ register_completion->wait_for_complete();
+ int r = register_completion->get_return_value();
+ register_completion->release();
+ register_completion = nullptr;
+ if (r < 0) {
+ return r;
+ }
+ svc->add_watcher(index);
+ unregister_done = false;
+ return 0;
+ }
+
+ int register_watch() {
+ int r = obj.watch(&watch_handle, this);
+ if (r < 0) {
+ return r;
+ }
+ svc->add_watcher(index);
+ unregister_done = false;
+ return 0;
+ }
+};
+
+
+class RGWSI_Notify_ShutdownCB : public RGWSI_Finisher::ShutdownCB
+{
+ RGWSI_Notify *svc;
+public:
+ RGWSI_Notify_ShutdownCB(RGWSI_Notify *_svc) : svc(_svc) {}
+ void call() override {
+ svc->shutdown();
+ }
+};
+
+string RGWSI_Notify::get_control_oid(int i)
+{
+ char buf[notify_oid_prefix.size() + 16];
+ snprintf(buf, sizeof(buf), "%s.%d", notify_oid_prefix.c_str(), i);
+
+ return string(buf);
+}
+
+// do not call pick_obj_control before init_watch
+RGWSI_RADOS::Obj RGWSI_Notify::pick_control_obj(const string& key)
+{
+ uint32_t r = ceph_str_hash_linux(key.c_str(), key.size());
+
+ int i = r % num_watchers;
+ return notify_objs[i];
+}
+
// Create/open the control objects and register a watch on each one.
// Registration runs in two phases -- submit all async watch requests,
// then wait for all completions -- so the watches come up in parallel.
int RGWSI_Notify::init_watch(const DoutPrefixProvider *dpp, optional_yield y)
{
  num_watchers = cct->_conf->rgw_num_control_oids;

  // rgw_num_control_oids == 0 selects the legacy single control object
  // named exactly "notify" (no ".<i>" suffix)
  bool compat_oid = (num_watchers == 0);

  if (num_watchers <= 0)
    num_watchers = 1;

  watchers = new RGWWatcher *[num_watchers];

  int error = 0;

  notify_objs.resize(num_watchers);

  for (int i=0; i < num_watchers; i++) {
    string notify_oid;

    if (!compat_oid) {
      notify_oid = get_control_oid(i);
    } else {
      notify_oid = notify_oid_prefix;
    }

    notify_objs[i] = rados_svc->handle().obj({control_pool, notify_oid});
    auto& notify_obj = notify_objs[i];

    int r = notify_obj.open(dpp);
    if (r < 0) {
      // NOTE(review): early returns here leave 'watchers' with
      // uninitialized slots that finalize_watch() would later dereference
      // if shutdown() runs after a failed start -- confirm callers never
      // invoke shutdown() in that situation.
      ldpp_dout(dpp, 0) << "ERROR: notify_obj.open() returned r=" << r << dendl;
      return r;
    }

    // make sure the control object exists; -EEXIST just means another
    // rgw instance created it first
    librados::ObjectWriteOperation op;
    op.create(false);
    r = notify_obj.operate(dpp, &op, y);
    if (r < 0 && r != -EEXIST) {
      ldpp_dout(dpp, 0) << "ERROR: notify_obj.operate() returned r=" << r << dendl;
      return r;
    }

    RGWWatcher *watcher = new RGWWatcher(cct, this, i, notify_obj);
    watchers[i] = watcher;

    r = watcher->register_watch_async();
    if (r < 0) {
      ldpp_dout(dpp, 0) << "WARNING: register_watch_aio() returned " << r << dendl;
      error = r;
      continue;
    }
  }

  // phase two: wait for every async registration to finish; remember the
  // last error but keep going so all watchers get a chance to come up
  for (int i = 0; i < num_watchers; ++i) {
    int r = watchers[i]->register_watch_finish();
    if (r < 0) {
      ldpp_dout(dpp, 0) << "WARNING: async watch returned " << r << dendl;
      error = r;
    }
  }

  if (error < 0) {
    return error;
  }

  return 0;
}
+
// Unregister every still-active watch (membership in watchers_set tells
// us which ones are live) and free all watcher objects and the array.
// NOTE(review): watchers_set is read without watchers_lock here --
// presumably shutdown is single-threaded at this point; confirm.
void RGWSI_Notify::finalize_watch()
{
  for (int i = 0; i < num_watchers; i++) {
    RGWWatcher *watcher = watchers[i];
    if (watchers_set.find(i) != watchers_set.end())
      watcher->unregister_watch();
    delete watcher;
  }

  delete[] watchers;
}
+
// Start the dependent services (zone, rados, finisher), read the fault
// injection / retry tunables, establish the control-object watches, and
// hook shutdown() into the finisher so watches die before the finisher.
int RGWSI_Notify::do_start(optional_yield y, const DoutPrefixProvider *dpp)
{
  int r = zone_svc->start(y, dpp);
  if (r < 0) {
    return r;
  }

  assert(zone_svc->is_started()); /* otherwise there's an ordering problem */

  r = rados_svc->start(y, dpp);
  if (r < 0) {
    return r;
  }
  r = finisher_svc->start(y, dpp);
  if (r < 0) {
    return r;
  }

  // test hook: probability of dropping incoming notifications (see
  // RGWWatcher::handle_notify)
  inject_notify_timeout_probability =
    cct->_conf.get_val<double>("rgw_inject_notify_timeout_probability");
  max_notify_retries = cct->_conf.get_val<uint64_t>("rgw_max_notify_retries");

  control_pool = zone_svc->get_zone_params().control_pool;

  int ret = init_watch(dpp, y);
  if (ret < 0) {
    ldpp_dout(dpp, -1) << "ERROR: failed to initialize watch: " << cpp_strerror(-ret) << dendl;
    return ret;
  }

  shutdown_cb = new RGWSI_Notify_ShutdownCB(this);
  int handle;
  finisher_svc->register_caller(shutdown_cb, &handle);
  finisher_handle = handle;

  return 0;
}
+
// Idempotent teardown: unhook from the finisher, unregister all watches
// and free the watchers. Reached both from the destructor and from the
// finisher's shutdown callback; the 'finalized' flag makes the second
// call a no-op.
void RGWSI_Notify::shutdown()
{
  if (finalized) {
    return;
  }

  if (finisher_handle) {
    finisher_svc->unregister_caller(*finisher_handle);
  }
  finalize_watch();

  delete shutdown_cb;

  finalized = true;
}
+
+int RGWSI_Notify::unwatch(RGWSI_RADOS::Obj& obj, uint64_t watch_handle)
+{
+ int r = obj.unwatch(watch_handle);
+ if (r < 0) {
+ ldout(cct, 0) << "ERROR: rados->unwatch2() returned r=" << r << dendl;
+ return r;
+ }
+ r = rados_svc->handle().watch_flush();
+ if (r < 0) {
+ ldout(cct, 0) << "ERROR: rados->watch_flush() returned r=" << r << dendl;
+ return r;
+ }
+ return 0;
+}
+
+void RGWSI_Notify::add_watcher(int i)
+{
+ ldout(cct, 20) << "add_watcher() i=" << i << dendl;
+ std::unique_lock l{watchers_lock};
+ watchers_set.insert(i);
+ if (watchers_set.size() == (size_t)num_watchers) {
+ ldout(cct, 2) << "all " << num_watchers << " watchers are set, enabling cache" << dendl;
+ _set_enabled(true);
+ }
+}
+
+void RGWSI_Notify::remove_watcher(int i)
+{
+ ldout(cct, 20) << "remove_watcher() i=" << i << dendl;
+ std::unique_lock l{watchers_lock};
+ size_t orig_size = watchers_set.size();
+ watchers_set.erase(i);
+ if (orig_size == (size_t)num_watchers &&
+ watchers_set.size() < orig_size) { /* actually removed */
+ ldout(cct, 2) << "removed watcher, disabling cache" << dendl;
+ _set_enabled(false);
+ }
+}
+
+int RGWSI_Notify::watch_cb(const DoutPrefixProvider *dpp,
+ uint64_t notify_id,
+ uint64_t cookie,
+ uint64_t notifier_id,
+ bufferlist& bl)
+{
+ std::shared_lock l{watchers_lock};
+ if (cb) {
+ return cb->watch_cb(dpp, notify_id, cookie, notifier_id, bl);
+ }
+ return 0;
+}
+
// Externally toggle cache enablement; takes watchers_lock and delegates
// to _set_enabled().
void RGWSI_Notify::set_enabled(bool status)
{
  std::unique_lock l{watchers_lock};
  _set_enabled(status);
}
+
// Record the enabled state and propagate it to the registered callback.
// Caller must hold watchers_lock.
void RGWSI_Notify::_set_enabled(bool status)
{
  enabled = status;
  if (cb) {
    cb->set_enabled(status);
  }
}
+
// Broadcast a cache notification for 'key' to the other rgw instances.
int RGWSI_Notify::distribute(const DoutPrefixProvider *dpp, const string& key,
                             const RGWCacheNotifyInfo& cni,
                             optional_yield y)
{
  /* RGW stores the watch/notify objects in the control pool.
     RGWSI_Notify::do_start() starts zone_svc before calling init_watch(),
     so the very first time RGW starts in a cluster (while it is creating
     the zone and zonegroup system objects) it can reach this point before
     init_watch() has run. num_watchers is then still 0 and
     pick_control_obj() would divide by zero -- hence the guard below. */
  if (num_watchers > 0) {
    RGWSI_RADOS::Obj notify_obj = pick_control_obj(key);

    ldpp_dout(dpp, 10) << "distributing notification oid=" << notify_obj.get_ref().obj
                       << " cni=" << cni << dendl;
    return robust_notify(dpp, notify_obj, cni, y);
  }
  return 0;
}
+
namespace librados {

// Log formatting for a timed-out watcher: "<notifier_id>:<cookie>".
static std::ostream& operator<<(std::ostream& out, const notify_timeout_t& t)
{
  return out << t.notifier_id << ':' << t.cookie;
}

} // namespace librados
+
+using timeout_vector = std::vector<librados::notify_timeout_t>;
+
+static timeout_vector decode_timeouts(const bufferlist& bl)
+{
+ using ceph::decode;
+ auto p = bl.begin();
+
+ // decode and discard the acks
+ uint32_t num_acks;
+ decode(num_acks, p);
+ for (auto i = 0u; i < num_acks; ++i) {
+ std::pair<uint64_t, uint64_t> id;
+ decode(id, p);
+ // discard the payload
+ uint32_t blen;
+ decode(blen, p);
+ p += blen;
+ }
+
+ // decode and return the timeouts
+ uint32_t num_timeouts;
+ decode(num_timeouts, p);
+
+ timeout_vector timeouts;
+ for (auto i = 0u; i < num_timeouts; ++i) {
+ std::pair<uint64_t, uint64_t> id;
+ decode(id, p);
+ timeouts.push_back({id.first, id.second});
+ }
+ return timeouts;
+}
+
// Send a cache notification and, if it times out, fall back to repeatedly
// broadcasting an invalidation for the object so peers drop any stale
// cache entry even though they never saw the original update.
int RGWSI_Notify::robust_notify(const DoutPrefixProvider *dpp,
                                RGWSI_RADOS::Obj& notify_obj,
                                const RGWCacheNotifyInfo& cni,
                                optional_yield y)
{
  bufferlist bl, rbl;
  encode(cni, bl);

  // First, try to send, without being fancy about it.
  auto r = notify_obj.notify(dpp, bl, 0, &rbl, y);

  if (r < 0) {
    // on failure, the reply buffer lists the watchers that never acked
    timeout_vector timeouts;
    try {
      timeouts = decode_timeouts(rbl);
    } catch (const buffer::error& e) {
      ldpp_dout(dpp, 0) << "robust_notify failed to decode notify response: "
                        << e.what() << dendl;
    }

    ldpp_dout(dpp, 1) << __PRETTY_FUNCTION__ << ":" << __LINE__
                      << " Watchers " << timeouts << " did not respond."
                      << " Notify failed on object " << cni.obj << ": "
                      << cpp_strerror(-r) << dendl;
  }

  // If we timed out, get serious.
  if (r == -ETIMEDOUT) {
    // downgrade the payload to a plain invalidation of the object; for
    // correctness it is enough that the peers drop the stale entry
    RGWCacheNotifyInfo info;
    info.op = INVALIDATE_OBJ;
    info.obj = cni.obj;
    bufferlist retrybl;
    encode(info, retrybl);

    for (auto tries = 0u;
         r == -ETIMEDOUT && tries < max_notify_retries;
         ++tries) {
      ldpp_dout(dpp, 1) << __PRETTY_FUNCTION__ << ":" << __LINE__
                        << " Invalidating obj=" << info.obj << " tries="
                        << tries << dendl;
      r = notify_obj.notify(dpp, retrybl, 0, &rbl, y);
      if (r < 0) {
        timeout_vector timeouts;
        try {
          timeouts = decode_timeouts(rbl);
        } catch (const buffer::error& e) {
          ldpp_dout(dpp, 0) << "robust_notify failed to decode notify response: "
                            << e.what() << dendl;
        }

        ldpp_dout(dpp, 1) << __PRETTY_FUNCTION__ << ":" << __LINE__
                          << " Watchers " << timeouts << " did not respond."
                          << " Invalidation attempt " << tries << " failed: "
                          << cpp_strerror(-r) << dendl;
      }
    }
  }
  return r;
}
+
// Install the notification callback and immediately replay the current
// enabled state to it, so a callback registered late doesn't miss the
// initial enable/disable decision.
void RGWSI_Notify::register_watch_cb(CB *_cb)
{
  std::unique_lock l{watchers_lock};
  cb = _cb;
  _set_enabled(enabled);
}
+
// Run c on the finisher thread (used for asynchronous watch re-init).
void RGWSI_Notify::schedule_context(Context *c)
{
  finisher_svc->schedule_context(c);
}
diff --git a/src/rgw/services/svc_notify.h b/src/rgw/services/svc_notify.h
new file mode 100644
index 000000000..f7329136e
--- /dev/null
+++ b/src/rgw/services/svc_notify.h
@@ -0,0 +1,106 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_service.h"
+
+#include "svc_rados.h"
+
+
+class Context;
+
+class RGWSI_Zone;
+class RGWSI_Finisher;
+
+class RGWWatcher;
+class RGWSI_Notify_ShutdownCB;
+struct RGWCacheNotifyInfo;
+
// Cache-coherency notification service. Maintains watches on a set of
// control objects in the zone's control pool and broadcasts cache
// invalidation messages between radosgw instances. The registered cache
// callback is disabled whenever any watch is down (cached data can no
// longer be trusted) and re-enabled once all watches are re-established.
class RGWSI_Notify : public RGWServiceInstance
{
  friend class RGWWatcher;
  friend class RGWSI_Notify_ShutdownCB;
  friend class RGWServices_Def;

public:
  class CB;

private:
  RGWSI_Zone *zone_svc{nullptr};
  RGWSI_RADOS *rados_svc{nullptr};
  RGWSI_Finisher *finisher_svc{nullptr};

  // protects watchers_set, cb and enabled
  ceph::shared_mutex watchers_lock = ceph::make_shared_mutex("watchers_lock");
  rgw_pool control_pool;

  int num_watchers{0};            // rgw_num_control_oids, clamped to >= 1
  RGWWatcher **watchers{nullptr};
  std::set<int> watchers_set;     // indexes of currently-live watches
  std::vector<RGWSI_RADOS::Obj> notify_objs;

  bool enabled{false};            // true iff all watches are up

  // test hook: probability of dropping an incoming notification
  double inject_notify_timeout_probability{0};
  // retry budget for the invalidation fallback in robust_notify()
  uint64_t max_notify_retries = 10;

  std::string get_control_oid(int i);
  // hashes key onto a control object; requires num_watchers > 0
  RGWSI_RADOS::Obj pick_control_obj(const std::string& key);

  CB *cb{nullptr};

  std::optional<int> finisher_handle;
  RGWSI_Notify_ShutdownCB *shutdown_cb{nullptr};

  bool finalized{false};          // makes shutdown() idempotent

  int init_watch(const DoutPrefixProvider *dpp, optional_yield y);
  void finalize_watch();

  // wire up dependent services; called before start, does no I/O
  void init(RGWSI_Zone *_zone_svc,
            RGWSI_RADOS *_rados_svc,
            RGWSI_Finisher *_finisher_svc) {
    zone_svc = _zone_svc;
    rados_svc = _rados_svc;
    finisher_svc = _finisher_svc;
  }
  int do_start(optional_yield, const DoutPrefixProvider *dpp) override;
  void shutdown() override;

  int unwatch(RGWSI_RADOS::Obj& obj, uint64_t watch_handle);
  void add_watcher(int i);
  void remove_watcher(int i);

  int watch_cb(const DoutPrefixProvider *dpp,
               uint64_t notify_id,
               uint64_t cookie,
               uint64_t notifier_id,
               bufferlist& bl);
  void _set_enabled(bool status);  // caller must hold watchers_lock
  void set_enabled(bool status);

  int robust_notify(const DoutPrefixProvider *dpp, RGWSI_RADOS::Obj& notify_obj,
                    const RGWCacheNotifyInfo& bl, optional_yield y);

  void schedule_context(Context *c);
public:
  RGWSI_Notify(CephContext *cct): RGWServiceInstance(cct) {}

  virtual ~RGWSI_Notify() override;

  // Interface implemented by the cache to receive notifications and
  // enable/disable transitions.
  class CB {
  public:
    virtual ~CB() {}
    virtual int watch_cb(const DoutPrefixProvider *dpp,
                         uint64_t notify_id,
                         uint64_t cookie,
                         uint64_t notifier_id,
                         bufferlist& bl) = 0;
    virtual void set_enabled(bool status) = 0;
  };

  // broadcast a cache notification for 'key' to the other rgw instances
  int distribute(const DoutPrefixProvider *dpp, const std::string& key, const RGWCacheNotifyInfo& bl,
                 optional_yield y);

  void register_watch_cb(CB *cb);
};
diff --git a/src/rgw/services/svc_otp.cc b/src/rgw/services/svc_otp.cc
new file mode 100644
index 000000000..81d8d5711
--- /dev/null
+++ b/src/rgw/services/svc_otp.cc
@@ -0,0 +1,186 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "svc_otp.h"
+#include "svc_zone.h"
+#include "svc_meta.h"
+#include "svc_meta_be_sobj.h"
+
+#include "rgw_zone.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+class RGW_MB_Handler_Module_OTP : public RGWSI_MBSObj_Handler_Module {
+ RGWSI_Zone *zone_svc;
+ string prefix;
+public:
+ RGW_MB_Handler_Module_OTP(RGWSI_Zone *_zone_svc) : RGWSI_MBSObj_Handler_Module("otp"),
+ zone_svc(_zone_svc) {}
+
+ void get_pool_and_oid(const string& key, rgw_pool *pool, string *oid) override {
+ if (pool) {
+ *pool = zone_svc->get_zone_params().otp_pool;
+ }
+
+ if (oid) {
+ *oid = key;
+ }
+ }
+
+ const string& get_oid_prefix() override {
+ return prefix;
+ }
+
+ bool is_valid_oid(const string& oid) override {
+ return true;
+ }
+
+ string key_to_oid(const string& key) override {
+ return key;
+ }
+
+ string oid_to_key(const string& oid) override {
+ return oid;
+ }
+};
+
// Trivial ctor/dtor: all real initialization happens in init() and
// do_start(); be_module is released automatically by its unique_ptr.
RGWSI_OTP::RGWSI_OTP(CephContext *cct): RGWServiceInstance(cct) {
}

RGWSI_OTP::~RGWSI_OTP() {
}
+
// Wire up the dependent services. Called before start; performs no I/O.
void RGWSI_OTP::init(RGWSI_Zone *_zone_svc,
                     RGWSI_Meta *_meta_svc,
                     RGWSI_MetaBackend *_meta_be_svc)
{
  svc.otp = this;
  svc.zone = _zone_svc;
  svc.meta = _meta_svc;
  svc.meta_be = _meta_be_svc;
}
+
// Create the OTP metadata backend handler and attach the OTP-specific
// pool/oid mapping module to it.
int RGWSI_OTP::do_start(optional_yield, const DoutPrefixProvider *dpp)
{
  /* create the backend handler for OTP metadata entries */

  RGWSI_MetaBackend_Handler *_otp_be_handler;

  int r = svc.meta->create_be_handler(RGWSI_MetaBackend::Type::MDBE_OTP, &_otp_be_handler);
  if (r < 0) {
    ldout(ctx(), 0) << "ERROR: failed to create be handler: r=" << r << dendl;
    return r;
  }

  be_handler = _otp_be_handler;

  // a handler created with MDBE_OTP is expected to be a Handler_OTP,
  // hence the static downcast
  RGWSI_MetaBackend_Handler_OTP *otp_be_handler = static_cast<RGWSI_MetaBackend_Handler_OTP *>(_otp_be_handler);

  // the module stays owned by be_module; the handler only borrows it
  auto otp_be_module = new RGW_MB_Handler_Module_OTP(svc.zone);
  be_module.reset(otp_be_module);
  otp_be_handler->set_module(otp_be_module);

  return 0;
}
+
+int RGWSI_OTP::read_all(RGWSI_OTP_BE_Ctx& ctx,
+ const string& key,
+ otp_devices_list_t *devices,
+ real_time *pmtime,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y, const DoutPrefixProvider *dpp)
+{
+ RGWSI_MBOTP_GetParams params;
+ params.pdevices = devices;
+ params.pmtime = pmtime;
+
+ int ret = svc.meta_be->get_entry(ctx.get(), key, params, objv_tracker, y, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return 0;
+}
+
+int RGWSI_OTP::read_all(RGWSI_OTP_BE_Ctx& ctx,
+ const rgw_user& uid,
+ otp_devices_list_t *devices,
+ real_time *pmtime,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y,
+ const DoutPrefixProvider *dpp)
+{
+ return read_all(ctx,
+ uid.to_str(),
+ devices,
+ pmtime,
+ objv_tracker,
+ y,
+ dpp);
+}
+
+int RGWSI_OTP::store_all(const DoutPrefixProvider *dpp,
+ RGWSI_OTP_BE_Ctx& ctx,
+ const string& key,
+ const otp_devices_list_t& devices,
+ real_time mtime,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y)
+{
+ RGWSI_MBOTP_PutParams params;
+ params.mtime = mtime;
+ params.devices = devices;
+
+ int ret = svc.meta_be->put_entry(dpp, ctx.get(), key, params, objv_tracker, y);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return 0;
+}
+
+int RGWSI_OTP::store_all(const DoutPrefixProvider *dpp,
+ RGWSI_OTP_BE_Ctx& ctx,
+ const rgw_user& uid,
+ const otp_devices_list_t& devices,
+ real_time mtime,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y)
+{
+ return store_all(dpp, ctx,
+ uid.to_str(),
+ devices,
+ mtime,
+ objv_tracker,
+ y);
+}
+
+int RGWSI_OTP::remove_all(const DoutPrefixProvider *dpp,
+ RGWSI_OTP_BE_Ctx& ctx,
+ const string& key,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y)
+{
+ RGWSI_MBOTP_RemoveParams params;
+
+ int ret = svc.meta_be->remove_entry(dpp, ctx.get(), key, params, objv_tracker, y);
+ if (ret < 0) {
+ return ret;
+ }
+
+ return 0;
+}
+
+int RGWSI_OTP::remove_all(const DoutPrefixProvider *dpp,
+ RGWSI_OTP_BE_Ctx& ctx,
+ const rgw_user& uid,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y)
+{
+ return remove_all(dpp,ctx,
+ uid.to_str(),
+ objv_tracker,
+ y);
+}
diff --git a/src/rgw/services/svc_otp.h b/src/rgw/services/svc_otp.h
new file mode 100644
index 000000000..e639c2c92
--- /dev/null
+++ b/src/rgw/services/svc_otp.h
@@ -0,0 +1,95 @@
+
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#pragma once
+
+#include "cls/otp/cls_otp_types.h"
+
+#include "rgw_service.h"
+
+#include "svc_otp_types.h"
+#include "svc_meta_be_otp.h"
+
+class RGWSI_Zone;
+
// OTP (one-time-password / MFA device) metadata service. Thin layer over
// the metadata backend that stores a user's *entire* device list as a
// single metadata entry, keyed either by raw string or by rgw_user.
class RGWSI_OTP : public RGWServiceInstance
{
  RGWSI_OTP_BE_Handler be_handler;
  // owns the OTP pool/oid mapping module attached to the handler
  std::unique_ptr<RGWSI_MetaBackend::Module> be_module;

  int do_start(optional_yield, const DoutPrefixProvider *dpp) override;

public:
  // dependent services, wired up in init()
  struct Svc {
    RGWSI_OTP *otp{nullptr};
    RGWSI_Zone *zone{nullptr};
    RGWSI_Meta *meta{nullptr};
    RGWSI_MetaBackend *meta_be{nullptr};
  } svc;

  RGWSI_OTP(CephContext *cct);
  ~RGWSI_OTP();

  RGWSI_OTP_BE_Handler& get_be_handler() {
    return be_handler;
  }

  void init(RGWSI_Zone *_zone_svc,
            RGWSI_Meta *_meta_svc,
            RGWSI_MetaBackend *_meta_be_svc);

  // read the complete device list (and optionally mtime) for key/uid
  int read_all(RGWSI_OTP_BE_Ctx& ctx,
               const std::string& key,
               otp_devices_list_t *devices,
               real_time *pmtime,
               RGWObjVersionTracker *objv_tracker,
               optional_yield y,
               const DoutPrefixProvider *dpp);
  int read_all(RGWSI_OTP_BE_Ctx& ctx,
               const rgw_user& uid,
               otp_devices_list_t *devices,
               real_time *pmtime,
               RGWObjVersionTracker *objv_tracker,
               optional_yield y,
               const DoutPrefixProvider *dpp);
  // overwrite the complete device list for key/uid
  int store_all(const DoutPrefixProvider *dpp,
                RGWSI_OTP_BE_Ctx& ctx,
                const std::string& key,
                const otp_devices_list_t& devices,
                real_time mtime,
                RGWObjVersionTracker *objv_tracker,
                optional_yield y);
  int store_all(const DoutPrefixProvider *dpp,
                RGWSI_OTP_BE_Ctx& ctx,
                const rgw_user& uid,
                const otp_devices_list_t& devices,
                real_time mtime,
                RGWObjVersionTracker *objv_tracker,
                optional_yield y);
  // delete the whole OTP entry for key/uid
  int remove_all(const DoutPrefixProvider *dpp,
                 RGWSI_OTP_BE_Ctx& ctx,
                 const std::string& key,
                 RGWObjVersionTracker *objv_tracker,
                 optional_yield y);
  int remove_all(const DoutPrefixProvider *dpp,
                 RGWSI_OTP_BE_Ctx& ctx,
                 const rgw_user& uid,
                 RGWObjVersionTracker *objv_tracker,
                 optional_yield y);
};
+
+
diff --git a/src/rgw/services/svc_otp_types.h b/src/rgw/services/svc_otp_types.h
new file mode 100644
index 000000000..60e2a79d6
--- /dev/null
+++ b/src/rgw/services/svc_otp_types.h
@@ -0,0 +1,29 @@
+
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#pragma once
+
+#include "common/ptr_wrapper.h"
+
+#include "svc_meta_be.h"
+#include "svc_meta_be_types.h"
+
+class RGWSI_MetaBackend_Handler;
+
+using RGWSI_OTP_BE_Handler = ptr_wrapper<RGWSI_MetaBackend_Handler, RGWSI_META_BE_TYPES::OTP>;
+using RGWSI_OTP_BE_Ctx = ptr_wrapper<RGWSI_MetaBackend::Context, RGWSI_META_BE_TYPES::OTP>;
+
diff --git a/src/rgw/services/svc_quota.cc b/src/rgw/services/svc_quota.cc
new file mode 100644
index 000000000..3108a1173
--- /dev/null
+++ b/src/rgw/services/svc_quota.cc
@@ -0,0 +1,18 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "svc_quota.h"
+#include "svc_zone.h"
+
+#include "rgw_zone.h"
+
// Default per-bucket quota, taken from the current period's configuration.
const RGWQuotaInfo& RGWSI_Quota::get_bucket_quota() const
{
  return zone_svc->get_current_period().get_config().quota.bucket_quota;
}
+
// Default per-user quota, taken from the current period's configuration.
const RGWQuotaInfo& RGWSI_Quota::get_user_quota() const
{
  return zone_svc->get_current_period().get_config().quota.user_quota;
}
+
diff --git a/src/rgw/services/svc_quota.h b/src/rgw/services/svc_quota.h
new file mode 100644
index 000000000..81aa0e1bd
--- /dev/null
+++ b/src/rgw/services/svc_quota.h
@@ -0,0 +1,22 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_service.h"
+
+
// Accessor service for the default quota settings stored in the current
// period configuration. Stateless apart from the zone service pointer.
class RGWSI_Quota : public RGWServiceInstance
{
  RGWSI_Zone *zone_svc{nullptr};

public:
  RGWSI_Quota(CephContext *cct): RGWServiceInstance(cct) {}

  // wire up the zone service; called before start, no I/O
  void init(RGWSI_Zone *_zone_svc) {
    zone_svc = _zone_svc;
  }

  // default per-bucket / per-user quotas from the current period config
  const RGWQuotaInfo& get_bucket_quota() const;
  const RGWQuotaInfo& get_user_quota() const;
};
diff --git a/src/rgw/services/svc_rados.cc b/src/rgw/services/svc_rados.cc
new file mode 100644
index 000000000..99f400f42
--- /dev/null
+++ b/src/rgw/services/svc_rados.cc
@@ -0,0 +1,445 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "svc_rados.h"
+
+#include "include/rados/librados.hpp"
+#include "common/errno.h"
+#include "osd/osd_types.h"
+#include "rgw_tools.h"
+#include "rgw_cr_rados.h"
+
+#include "auth/AuthRegistry.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
// Trivial ctor/dtor: connecting to the cluster and starting the async
// processor happen in do_start(); teardown happens in shutdown().
RGWSI_RADOS::RGWSI_RADOS(CephContext *cct) : RGWServiceInstance(cct)
{
}

RGWSI_RADOS::~RGWSI_RADOS()
{
}
+
// Initialize and connect the librados handle, then spin up the async
// rados processor with rgw_num_async_rados_threads worker threads.
int RGWSI_RADOS::do_start(optional_yield, const DoutPrefixProvider *dpp)
{
  int ret = rados.init_with_context(cct);
  if (ret < 0) {
    return ret;
  }
  ret = rados.connect();
  if (ret < 0) {
    return ret;
  }

  async_processor.reset(new RGWAsyncRadosProcessor(cct, cct->_conf->rgw_num_async_rados_threads));
  async_processor->start();

  return 0;
}
+
// Stop the async processor threads (if started) before shutting down the
// cluster connection they depend on.
void RGWSI_RADOS::shutdown()
{
  if (async_processor) {
    async_processor->stop();
  }
  rados.shutdown();
}
+
// Stop only the async processor threads, keeping the rados connection up.
void RGWSI_RADOS::stop_processor()
{
  if (async_processor) {
    async_processor->stop();
  }
}
+
// Expose the service's librados handle (non-owning pointer).
librados::Rados* RGWSI_RADOS::get_rados_handle()
{
  return &rados;
}
+
// Return the cluster fsid; the lookup error is deliberately ignored, so
// on failure an empty string is returned.
std::string RGWSI_RADOS::cluster_fsid()
{
  std::string fsid;
  (void) get_rados_handle()->cluster_fsid(&fsid);
  return fsid;
}
+
// Unique librados client instance id of this connection.
uint64_t RGWSI_RADOS::instance_id()
{
  return get_rados_handle()->get_instance_id();
}
+
// Open an IoCtx for 'pool', optionally creating the pool and flagging it
// as omap-heavy, as requested via 'params'.
int RGWSI_RADOS::open_pool_ctx(const DoutPrefixProvider *dpp, const rgw_pool& pool, librados::IoCtx& io_ctx,
                               const OpenParams& params)
{
  return rgw_init_ioctx(dpp, get_rados_handle(), pool, io_ctx,
                        params.create,
                        params.mostly_omap);
}
+
// Advance 'iter' by up to 'num' objects, appending entries that pass
// 'filter' to 'objs'. Returns the number of entries appended (which may
// be 0 if everything was filtered out), or -ENOENT if the iterator is
// already exhausted on entry. *is_truncated is set when more objects
// remain after this batch.
int RGWSI_RADOS::pool_iterate(const DoutPrefixProvider *dpp,
                              librados::IoCtx& io_ctx,
                              librados::NObjectIterator& iter,
                              uint32_t num, vector<rgw_bucket_dir_entry>& objs,
                              RGWAccessListFilter *filter,
                              bool *is_truncated)
{
  if (iter == io_ctx.nobjects_end())
    return -ENOENT;

  uint32_t i;

  for (i = 0; i < num && iter != io_ctx.nobjects_end(); ++i, ++iter) {
    rgw_bucket_dir_entry e;

    string oid = iter->get_oid();
    ldpp_dout(dpp, 20) << "RGWRados::pool_iterate: got " << oid << dendl;

    // fill it in with initial values; we may correct later
    if (filter && !filter->filter(oid, oid))
      continue;

    e.key = oid;
    objs.push_back(e);
  }

  if (is_truncated)
    *is_truncated = (iter != io_ctx.nobjects_end());

  return objs.size();
}
+
// Bind an object reference out of an already-constructed Pool and an oid.
RGWSI_RADOS::Obj::Obj(Pool& pool, const string& oid) : rados_svc(pool.rados_svc)
{
  ref.pool = pool;
  ref.obj = rgw_raw_obj(pool.get_pool(), oid);
}
+
// Re-point this Obj at a raw object, constructing a fresh Pool wrapper
// for the raw object's pool against the same rados service.
void RGWSI_RADOS::Obj::init(const rgw_raw_obj& obj)
{
  ref.pool = RGWSI_RADOS::Pool(rados_svc, obj.pool);
  ref.obj = obj;
}
+
// Open the pool's ioctx and apply this object's locator key to it.
// Must succeed before any of the operate/watch/notify calls are used.
int RGWSI_RADOS::Obj::open(const DoutPrefixProvider *dpp)
{
  int r = ref.pool.open(dpp);
  if (r < 0) {
    return r;
  }

  ref.pool.ioctx().locator_set_key(ref.obj.loc);

  return 0;
}
+
// Execute a write operation on this object (synchronous w.r.t. the
// optional_yield semantics of rgw_rados_operate).
int RGWSI_RADOS::Obj::operate(const DoutPrefixProvider *dpp, librados::ObjectWriteOperation *op,
                              optional_yield y, int flags)
{
  return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, op, y, flags);
}
+
// Execute a read operation on this object; output (if any) lands in *pbl.
int RGWSI_RADOS::Obj::operate(const DoutPrefixProvider *dpp, librados::ObjectReadOperation *op,
                              bufferlist *pbl, optional_yield y, int flags)
{
  return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, op, pbl, y, flags);
}
+
// Submit an asynchronous write operation; completion signalled through c.
int RGWSI_RADOS::Obj::aio_operate(librados::AioCompletion *c, librados::ObjectWriteOperation *op)
{
  return ref.pool.ioctx().aio_operate(ref.obj.oid, c, op);
}
+
// Submit an asynchronous read operation; output lands in *pbl once c
// completes.
int RGWSI_RADOS::Obj::aio_operate(librados::AioCompletion *c, librados::ObjectReadOperation *op,
                                  bufferlist *pbl)
{
  return ref.pool.ioctx().aio_operate(ref.obj.oid, c, op, pbl);
}
+
// Register a synchronous watch on this object; *handle identifies it for
// a later unwatch().
int RGWSI_RADOS::Obj::watch(uint64_t *handle, librados::WatchCtx2 *ctx)
{
  return ref.pool.ioctx().watch2(ref.obj.oid, handle, ctx);
}
+
// Register a watch asynchronously; completion signalled through c.
int RGWSI_RADOS::Obj::aio_watch(librados::AioCompletion *c, uint64_t *handle, librados::WatchCtx2 *ctx)
{
  return ref.pool.ioctx().aio_watch(ref.obj.oid, c, handle, ctx);
}
+
// Remove a previously registered watch by its handle.
int RGWSI_RADOS::Obj::unwatch(uint64_t handle)
{
  return ref.pool.ioctx().unwatch2(handle);
}
+
// Send a watch/notify broadcast on this object; watcher replies are
// collected into *pbl. timeout_ms == 0 means the librados default.
int RGWSI_RADOS::Obj::notify(const DoutPrefixProvider *dpp, bufferlist& bl, uint64_t timeout_ms,
                             bufferlist *pbl, optional_yield y)
{
  return rgw_rados_notify(dpp, ref.pool.ioctx(), ref.obj.oid, bl, timeout_ms, pbl, y);
}
+
// Acknowledge a received notification, sending bl back to the notifier.
void RGWSI_RADOS::Obj::notify_ack(uint64_t notify_id,
                                  uint64_t cookie,
                                  bufferlist& bl)
{
  ref.pool.ioctx().notify_ack(ref.obj.oid, notify_id, cookie, bl);
}
+
// Version of the object returned by the last operation on this ioctx.
uint64_t RGWSI_RADOS::Obj::get_last_version()
{
  return ref.pool.ioctx().get_last_version();
}
+
// Synchronously create this pool and tag it with the rgw application so
// the cluster accepts I/O on it. Returns a negative librados error code
// if any step fails (warnings are logged, caller decides how to react).
int RGWSI_RADOS::Pool::create(const DoutPrefixProvider *dpp)
{
  librados::Rados *rad = rados_svc->get_rados_handle();
  int r = rad->pool_create(pool.name.c_str());
  if (r < 0) {
    ldpp_dout(dpp, 0) << "WARNING: pool_create returned " << r << dendl;
    return r;
  }
  librados::IoCtx io_ctx;
  r = rad->ioctx_create(pool.name.c_str(), io_ctx);
  if (r < 0) {
    ldpp_dout(dpp, 0) << "WARNING: ioctx_create returned " << r << dendl;
    return r;
  }
  r = io_ctx.application_enable(pg_pool_t::APPLICATION_NAME_RGW, false);
  if (r < 0) {
    ldpp_dout(dpp, 0) << "WARNING: application_enable returned " << r << dendl;
    return r;
  }
  return 0;
}
+
// Create several pools in parallel and enable the rgw application on each.
// Per-pool results are reported through 'retcodes'; the function itself
// returns 0 even when individual pools fail. Processing happens in three
// stages (async create, ioctx open, async application_enable); if any
// pool fails a stage, later stages are skipped and 'retcodes' reflects
// the last stage reached. -EOPNOTSUPP from application_enable is mapped
// to success (older clusters without application support).
int RGWSI_RADOS::Pool::create(const DoutPrefixProvider *dpp, const vector<rgw_pool>& pools, vector<int> *retcodes)
{
  vector<librados::PoolAsyncCompletion *> completions;
  vector<int> rets;

  // stage 1: fire off all pool creations asynchronously
  librados::Rados *rad = rados_svc->get_rados_handle();
  for (auto iter = pools.begin(); iter != pools.end(); ++iter) {
    librados::PoolAsyncCompletion *c = librados::Rados::pool_async_create_completion();
    completions.push_back(c);
    auto& pool = *iter;
    int ret = rad->pool_create_async(pool.name.c_str(), c);
    rets.push_back(ret);
  }

  vector<int>::iterator riter;
  vector<librados::PoolAsyncCompletion *>::iterator citer;

  bool error = false;
  ceph_assert(rets.size() == completions.size());
  for (riter = rets.begin(), citer = completions.begin(); riter != rets.end(); ++riter, ++citer) {
    int r = *riter;
    librados::PoolAsyncCompletion *c = *citer;
    if (r == 0) {
      c->wait();
      r = c->get_return_value();
      if (r < 0) {
        ldpp_dout(dpp, 0) << "WARNING: async pool_create returned " << r << dendl;
        error = true;
      }
    }
    c->release();
    retcodes->push_back(r);
  }
  if (error) {
    return 0;
  }

  // stage 2: open an ioctx for every pool; retcodes is rebuilt per stage
  std::vector<librados::IoCtx> io_ctxs;
  retcodes->clear();
  for (auto pool : pools) {
    io_ctxs.emplace_back();
    int ret = rad->ioctx_create(pool.name.c_str(), io_ctxs.back());
    if (ret < 0) {
      ldpp_dout(dpp, 0) << "WARNING: ioctx_create returned " << ret << dendl;
      error = true;
    }
    retcodes->push_back(ret);
  }
  if (error) {
    return 0;
  }

  // stage 3: enable the rgw application on every pool, again in parallel
  completions.clear();
  for (auto &io_ctx : io_ctxs) {
    librados::PoolAsyncCompletion *c =
      librados::Rados::pool_async_create_completion();
    completions.push_back(c);
    int ret = io_ctx.application_enable_async(pg_pool_t::APPLICATION_NAME_RGW,
                                              false, c);
    ceph_assert(ret == 0);
  }

  retcodes->clear();
  for (auto c : completions) {
    c->wait();
    int ret = c->get_return_value();
    if (ret == -EOPNOTSUPP) {
      ret = 0;
    } else if (ret < 0) {
      ldpp_dout(dpp, 0) << "WARNING: async application_enable returned " << ret
                        << dendl;
      error = true;
    }
    c->release();
    retcodes->push_back(ret);
  }
  return 0;
}
+
+// Probe whether this pool exists in the cluster.
+// Returns 0 when present, a negative error (e.g. -ENOENT) otherwise.
+int RGWSI_RADOS::Pool::lookup()
+{
+ librados::Rados *rad = rados_svc->get_rados_handle();
+ const int ret = rad->pool_lookup(pool.name.c_str());
+ // pool_lookup() returns the pool id on success; callers only care
+ // about existence, so collapse any non-negative id to 0
+ return (ret < 0) ? ret : 0;
+}
+
+// Open (optionally creating, per params) an IoCtx for this pool into
+// the member state; required before ioctx() is usable.
+int RGWSI_RADOS::Pool::open(const DoutPrefixProvider *dpp, const OpenParams& params)
+{
+ return rados_svc->open_pool_ctx(dpp, pool, state.ioctx, params);
+}
+
+// Begin a pool object listing at 'marker' (a serialized ObjectCursor;
+// empty string means the start). May be called once per List instance.
+// Returns -EINVAL on re-init, missing pool, or an unparsable marker.
+int RGWSI_RADOS::Pool::List::init(const DoutPrefixProvider *dpp, const string& marker, RGWAccessListFilter *filter)
+{
+ if (ctx.initialized) {
+ return -EINVAL;
+ }
+
+ if (!pool) {
+ return -EINVAL;
+ }
+
+ int r = pool->rados_svc->open_pool_ctx(dpp, pool->pool, ctx.ioctx);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::ObjectCursor oc;
+ if (!oc.from_str(marker)) {
+ ldpp_dout(dpp, 10) << "failed to parse cursor: " << marker << dendl;
+ return -EINVAL;
+ }
+
+ // nobjects_begin() may throw; translate exceptions into error codes
+ try {
+ ctx.iter = ctx.ioctx.nobjects_begin(oc);
+ ctx.filter = filter;
+ ctx.initialized = true;
+ return 0;
+ } catch (const std::system_error& e) {
+ // preserve the underlying errno from the system_error
+ r = -e.code().value();
+ ldpp_dout(dpp, 10) << "nobjects_begin threw " << e.what()
+ << ", returning " << r << dendl;
+ return r;
+ } catch (const std::exception& e) {
+ // no errno available; -5 here is -EIO
+ ldpp_dout(dpp, 10) << "nobjects_begin threw " << e.what()
+ << ", returning -5" << dendl;
+ return -EIO;
+ }
+}
+
+// Fetch up to 'max' object names from the listing started by init().
+// Appends names to *oids and returns the number now in *oids (>= 0),
+// or a negative error; *is_truncated reports whether entries remain.
+int RGWSI_RADOS::Pool::List::get_next(const DoutPrefixProvider *dpp,
+ int max,
+ std::vector<string> *oids,
+ bool *is_truncated)
+{
+ if (!ctx.initialized) {
+ return -EINVAL;
+ }
+ vector<rgw_bucket_dir_entry> objs;
+ int r = pool->rados_svc->pool_iterate(dpp, ctx.ioctx, ctx.iter, max, objs, ctx.filter, is_truncated);
+ if (r < 0) {
+ if(r != -ENOENT) {
+ ldpp_dout(dpp, 10) << "failed to list objects pool_iterate returned r=" << r << dendl;
+ }
+ return r;
+ }
+
+ // move the names out of the temporary entries instead of copying them
+ oids->reserve(oids->size() + objs.size());
+ for (auto& o : objs) {
+ oids->push_back(std::move(o.key.name));
+ }
+
+ return oids->size();
+}
+
+// Build an Obj wrapper for a raw object, bound to the owning service.
+RGWSI_RADOS::Obj RGWSI_RADOS::Handle::obj(const rgw_raw_obj& o)
+{
+ return RGWSI_RADOS::Obj(rados_svc, o);
+}
+// Block until all in-flight watch/notify callbacks have completed.
+int RGWSI_RADOS::Handle::watch_flush()
+{
+ librados::Rados *rad = rados_svc->get_rados_handle();
+ return rad->watch_flush();
+}
+
+// Send a JSON command to the monitors; outbl/outs receive the reply.
+// 'cmd' is taken by value to match the declaration in svc_rados.h.
+int RGWSI_RADOS::Handle::mon_command(std::string cmd,
+ const bufferlist& inbl,
+ bufferlist *outbl,
+ std::string *outs)
+{
+ librados::Rados *rad = rados_svc->get_rados_handle();
+ return rad->mon_command(cmd, inbl, outbl, outs);
+}
+
+// Serialize the current listing position so a later List::init() can
+// resume from it. Only valid after a successful init().
+int RGWSI_RADOS::Pool::List::get_marker(string *marker)
+{
+ if (!ctx.initialized) {
+ return -EINVAL;
+ }
+
+ *marker = ctx.iter.get_cursor().to_str();
+ return 0;
+}
+
+// Emit 'msg' to the cluster log at warn level via a mon command.
+// NOTE(review): 'msg' is spliced into the JSON unescaped — a message
+// containing '"' or '\' would produce invalid JSON; confirm callers
+// only pass plain text.
+int RGWSI_RADOS::clog_warn(const string& msg)
+{
+ string cmd =
+ "{"
+ "\"prefix\": \"log\", "
+ "\"level\": \"warn\", "
+ "\"logtext\": [\"" + msg + "\"]"
+ "}";
+
+ bufferlist inbl;
+ auto h = handle();
+ return h.mon_command(cmd, inbl, nullptr, nullptr);
+}
+
+// Return true only when every auth method and connection mode the
+// auth registry supports for monitor connections is considered secure.
+// Used to decide whether secrets may travel over this connection.
+bool RGWSI_RADOS::check_secure_mon_conn(const DoutPrefixProvider *dpp) const
+{
+ AuthRegistry reg(cct);
+
+ reg.refresh_config();
+
+ std::vector<uint32_t> methods;
+ std::vector<uint32_t> modes;
+
+ reg.get_supported_methods(CEPH_ENTITY_TYPE_MON, &methods, &modes);
+ // fixed typo: "registy" -> "registry"
+ ldpp_dout(dpp, 20) << __func__ << "(): auth registry supported: methods=" << methods << " modes=" << modes << dendl;
+
+ // a single insecure method or mode makes the whole connection insecure
+ for (auto method : methods) {
+ if (!reg.is_secure_method(method)) {
+ ldpp_dout(dpp, 20) << __func__ << "(): method " << method << " is insecure" << dendl;
+ return false;
+ }
+ }
+
+ for (auto mode : modes) {
+ if (!reg.is_secure_mode(mode)) {
+ ldpp_dout(dpp, 20) << __func__ << "(): mode " << mode << " is insecure" << dendl;
+ return false;
+ }
+ }
+
+ return true;
+}
+
diff --git a/src/rgw/services/svc_rados.h b/src/rgw/services/svc_rados.h
new file mode 100644
index 000000000..ede029aa8
--- /dev/null
+++ b/src/rgw/services/svc_rados.h
@@ -0,0 +1,252 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_service.h"
+
+#include "include/rados/librados.hpp"
+#include "common/async/yield_context.h"
+
+class RGWAsyncRadosProcessor;
+
+// Interface for filtering entries while listing a pool; filter()
+// returns true when the entry identified by (name, key) should be kept.
+class RGWAccessListFilter {
+public:
+ virtual ~RGWAccessListFilter() {}
+ virtual bool filter(const std::string& name, std::string& key) = 0;
+};
+
+// Filter that keeps only entries whose key starts with a fixed prefix.
+struct RGWAccessListFilterPrefix : public RGWAccessListFilter {
+ std::string prefix;
+
+ explicit RGWAccessListFilterPrefix(const std::string& _prefix) : prefix(_prefix) {}
+ // Keys shorter than the prefix compare unequal and are rejected.
+ // (the 'name' argument is unused by this filter)
+ bool filter(const std::string& name, std::string& key) override {
+ // compare in place; the original substr() allocated a temporary copy
+ return key.compare(0, prefix.size(), prefix) == 0;
+ }
+};
+
+// Service wrapper around a librados::Rados handle. All other RGW
+// services reach the cluster through the Pool/Obj/Handle helpers
+// declared here rather than touching librados directly.
+class RGWSI_RADOS : public RGWServiceInstance
+{
+ librados::Rados rados;
+ std::unique_ptr<RGWAsyncRadosProcessor> async_processor;
+
+ int do_start(optional_yield, const DoutPrefixProvider *dpp) override;
+
+public:
+ // Options for opening a pool IoCtx (see open_pool_ctx()).
+ struct OpenParams {
+ bool create{true};
+ bool mostly_omap{false};
+
+ OpenParams() {}
+
+ OpenParams& set_create(bool _create) {
+ create = _create;
+ return *this;
+ }
+ OpenParams& set_mostly_omap(bool _mostly_omap) {
+ mostly_omap = _mostly_omap;
+ return *this;
+ }
+ };
+
+private:
+ int open_pool_ctx(const DoutPrefixProvider *dpp, const rgw_pool& pool, librados::IoCtx& io_ctx,
+ const OpenParams& params = {});
+ int pool_iterate(const DoutPrefixProvider *dpp,
+ librados::IoCtx& ioctx,
+ librados::NObjectIterator& iter,
+ uint32_t num, std::vector<rgw_bucket_dir_entry>& objs,
+ RGWAccessListFilter *filter,
+ bool *is_truncated);
+
+public:
+ RGWSI_RADOS(CephContext *cct);
+ ~RGWSI_RADOS();
+ librados::Rados* get_rados_handle();
+
+ void init() {}
+ void shutdown() override;
+ void stop_processor();
+
+ std::string cluster_fsid();
+ uint64_t instance_id();
+ bool check_secure_mon_conn(const DoutPrefixProvider *dpp) const;
+
+ RGWAsyncRadosProcessor *get_async_processor() {
+ return async_processor.get();
+ }
+
+ int clog_warn(const std::string& msg);
+
+ class Handle;
+
+ // A pool bound to this service: creation, lookup, open and listing.
+ class Pool {
+ friend class RGWSI_RADOS;
+ friend Handle;
+ friend class Obj;
+
+ RGWSI_RADOS *rados_svc{nullptr};
+ rgw_pool pool;
+
+ struct State {
+ librados::IoCtx ioctx;
+ } state;
+
+ Pool(RGWSI_RADOS *_rados_svc,
+ const rgw_pool& _pool) : rados_svc(_rados_svc),
+ pool(_pool) {}
+
+ Pool(RGWSI_RADOS *_rados_svc) : rados_svc(_rados_svc) {}
+ public:
+ Pool() {}
+
+ int create(const DoutPrefixProvider *dpp);
+ int create(const DoutPrefixProvider *dpp, const std::vector<rgw_pool>& pools, std::vector<int> *retcodes);
+ int lookup();
+ int open(const DoutPrefixProvider *dpp, const OpenParams& params = {});
+
+ const rgw_pool& get_pool() {
+ return pool;
+ }
+
+ librados::IoCtx& ioctx() & {
+ return state.ioctx;
+ }
+
+ // rvalue overload lets callers steal the IoCtx from a temporary Pool
+ librados::IoCtx&& ioctx() && {
+ return std::move(state.ioctx);
+ }
+
+ // Stateful object-listing over this pool (init/get_next/get_marker).
+ struct List {
+ Pool *pool{nullptr};
+
+ struct Ctx {
+ bool initialized{false};
+ librados::IoCtx ioctx;
+ librados::NObjectIterator iter;
+ RGWAccessListFilter *filter{nullptr};
+ } ctx;
+
+ List() {}
+ List(Pool *_pool) : pool(_pool) {}
+
+ int init(const DoutPrefixProvider *dpp, const std::string& marker, RGWAccessListFilter *filter = nullptr);
+ int get_next(const DoutPrefixProvider *dpp, int max,
+ std::vector<std::string> *oids,
+ bool *is_truncated);
+
+ int get_marker(std::string *marker);
+ };
+
+ List op() {
+ return List(this);
+ }
+
+ friend List;
+ };
+
+
+ // Resolved location of a raw object: its pool plus the object id.
+ struct rados_ref {
+ RGWSI_RADOS::Pool pool;
+ rgw_raw_obj obj;
+ };
+
+ // A single raw object; wraps the librados operate/watch/notify calls.
+ class Obj {
+ friend class RGWSI_RADOS;
+ friend class Handle;
+
+ RGWSI_RADOS *rados_svc{nullptr};
+ rados_ref ref;
+
+ void init(const rgw_raw_obj& obj);
+
+ Obj(RGWSI_RADOS *_rados_svc, const rgw_raw_obj& _obj)
+ : rados_svc(_rados_svc) {
+ init(_obj);
+ }
+
+ Obj(Pool& pool, const std::string& oid);
+
+ public:
+ Obj() {}
+
+ int open(const DoutPrefixProvider *dpp);
+
+ int operate(const DoutPrefixProvider *dpp, librados::ObjectWriteOperation *op, optional_yield y,
+ int flags = 0);
+ int operate(const DoutPrefixProvider *dpp, librados::ObjectReadOperation *op, bufferlist *pbl,
+ optional_yield y, int flags = 0);
+ int aio_operate(librados::AioCompletion *c, librados::ObjectWriteOperation *op);
+ int aio_operate(librados::AioCompletion *c, librados::ObjectReadOperation *op,
+ bufferlist *pbl);
+
+ int watch(uint64_t *handle, librados::WatchCtx2 *ctx);
+ int aio_watch(librados::AioCompletion *c, uint64_t *handle, librados::WatchCtx2 *ctx);
+ int unwatch(uint64_t handle);
+ int notify(const DoutPrefixProvider *dpp, bufferlist& bl, uint64_t timeout_ms,
+ bufferlist *pbl, optional_yield y);
+ void notify_ack(uint64_t notify_id,
+ uint64_t cookie,
+ bufferlist& bl);
+
+ uint64_t get_last_version();
+
+ rados_ref& get_ref() { return ref; }
+ const rados_ref& get_ref() const { return ref; }
+
+ const rgw_raw_obj& get_raw_obj() const {
+ return ref.obj;
+ }
+ };
+
+ // Lightweight accessor handing out Pool/Obj helpers plus a couple of
+ // cluster-wide operations (watch_flush, mon_command).
+ class Handle {
+ friend class RGWSI_RADOS;
+
+ RGWSI_RADOS *rados_svc{nullptr};
+
+ Handle(RGWSI_RADOS *_rados_svc) : rados_svc(_rados_svc) {}
+ public:
+ Obj obj(const rgw_raw_obj& o);
+
+ Pool pool(const rgw_pool& p) {
+ return Pool(rados_svc, p);
+ }
+
+ int watch_flush();
+
+ int mon_command(std::string cmd,
+ const bufferlist& inbl,
+ bufferlist *outbl,
+ std::string *outs);
+ };
+
+ Handle handle() {
+ return Handle(this);
+ }
+
+ Obj obj(const rgw_raw_obj& o) {
+ return Obj(this, o);
+ }
+
+ Obj obj(Pool& pool, const std::string& oid) {
+ return Obj(pool, oid);
+ }
+
+ Pool pool() {
+ return Pool(this);
+ }
+
+ Pool pool(const rgw_pool& p) {
+ return Pool(this, p);
+ }
+
+ friend Obj;
+ friend Pool;
+ friend Pool::List;
+};
+
+// Historical alias used throughout RGW for a resolved object location.
+using rgw_rados_ref = RGWSI_RADOS::rados_ref;
+
+// Print an Obj as its underlying raw object (pool + oid).
+inline std::ostream& operator<<(std::ostream& out, const RGWSI_RADOS::Obj& obj) {
+ return out << obj.get_raw_obj();
+}
diff --git a/src/rgw/services/svc_role_rados.cc b/src/rgw/services/svc_role_rados.cc
new file mode 100644
index 000000000..a84022497
--- /dev/null
+++ b/src/rgw/services/svc_role_rados.cc
@@ -0,0 +1,82 @@
+#include "svc_role_rados.h"
+#include "svc_meta_be_sobj.h"
+#include "svc_meta.h"
+#include "rgw_role.h"
+#include "rgw_zone.h"
+#include "svc_zone.h"
+#include "rgw_tools.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+// Metadata-backend module mapping role metadata keys to oids in the
+// zone's roles pool, using the "roles." oid prefix.
+class RGWSI_Role_Module : public RGWSI_MBSObj_Handler_Module {
+ RGWSI_Role_RADOS::Svc& svc;
+ const std::string prefix;
+public:
+ RGWSI_Role_Module(RGWSI_Role_RADOS::Svc& _svc): RGWSI_MBSObj_Handler_Module("roles"),
+ svc(_svc),
+ prefix(role_oid_prefix) {}
+
+ // Resolve where a role key is stored: the zone's roles pool, and the
+ // prefixed oid. Either output may be null if the caller doesn't need it.
+ void get_pool_and_oid(const std::string& key,
+ rgw_pool *pool,
+ std::string *oid) override
+ {
+ if (pool) {
+ *pool = svc.zone->get_zone_params().roles_pool;
+ }
+
+ if (oid) {
+ *oid = key_to_oid(key);
+ }
+ }
+
+ bool is_valid_oid(const std::string& oid) override {
+ return boost::algorithm::starts_with(oid, prefix);
+ }
+
+ std::string key_to_oid(const std::string& key) override {
+ return prefix + key;
+ }
+
+ // This is called after `is_valid_oid` and is assumed to be a valid oid
+ std::string oid_to_key(const std::string& oid) override {
+ return oid.substr(prefix.size());
+ }
+
+ // NOTE(review): unlike the methods above this one carries no
+ // 'override' — confirm whether the base class declares it virtual.
+ const std::string& get_oid_prefix() {
+ return prefix;
+ }
+};
+
+// Accessor for the metadata-backend handler created in do_start().
+RGWSI_MetaBackend_Handler* RGWSI_Role_RADOS::get_be_handler()
+{
+ return be_handler;
+}
+
+// Wire up the sibling services this service depends on; called before
+// do_start().
+void RGWSI_Role_RADOS::init(RGWSI_Zone *_zone_svc,
+ RGWSI_Meta *_meta_svc,
+ RGWSI_MetaBackend *_meta_be_svc,
+ RGWSI_SysObj *_sysobj_svc)
+{
+ svc.zone = _zone_svc;
+ svc.meta = _meta_svc;
+ svc.meta_be = _meta_be_svc;
+ svc.sysobj = _sysobj_svc;
+}
+
+// Create the SObj metadata backend handler for roles and attach the
+// role key/oid mapping module to it.
+int RGWSI_Role_RADOS::do_start(optional_yield y, const DoutPrefixProvider *dpp)
+{
+ int r = svc.meta->create_be_handler(RGWSI_MetaBackend::Type::MDBE_SOBJ,
+ &be_handler);
+ if (r < 0) {
+ // use the prefixed logging macro, consistent with the other services
+ ldpp_dout(dpp, 0) << "ERROR: failed to create be_handler for Roles: r="
+ << r << dendl;
+ return r;
+ }
+
+ auto module = new RGWSI_Role_Module(svc);
+ RGWSI_MetaBackend_Handler_SObj* bh = static_cast<RGWSI_MetaBackend_Handler_SObj *>(be_handler);
+ be_module.reset(module); // be_module owns the raw allocation above
+ bh->set_module(module);
+ return 0;
+}
diff --git a/src/rgw/services/svc_role_rados.h b/src/rgw/services/svc_role_rados.h
new file mode 100644
index 000000000..d4d3530c2
--- /dev/null
+++ b/src/rgw/services/svc_role_rados.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 SUSE LLC
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#pragma once
+
+#include "rgw_service.h"
+#include "rgw_role.h"
+#include "svc_meta_be.h"
+
+// RADOS-backed service for IAM role metadata: owns the metadata
+// backend handler and the module that maps role keys to oids.
+class RGWSI_Role_RADOS: public RGWServiceInstance
+{
+ public:
+ struct Svc {
+ RGWSI_Zone *zone{nullptr};
+ RGWSI_Meta *meta{nullptr};
+ RGWSI_MetaBackend *meta_be{nullptr};
+ RGWSI_SysObj *sysobj{nullptr};
+ } svc;
+
+ RGWSI_Role_RADOS(CephContext *cct) : RGWServiceInstance(cct) {}
+ ~RGWSI_Role_RADOS() {}
+
+ void init(RGWSI_Zone *_zone_svc,
+ RGWSI_Meta *_meta_svc,
+ RGWSI_MetaBackend *_meta_be_svc,
+ RGWSI_SysObj *_sysobj_svc);
+
+ RGWSI_MetaBackend_Handler * get_be_handler();
+ int do_start(optional_yield y, const DoutPrefixProvider *dpp) override;
+
+private:
+ // set in do_start(); initialized to nullptr so get_be_handler() never
+ // returns an indeterminate pointer (was left uninitialized)
+ RGWSI_MetaBackend_Handler *be_handler{nullptr};
+ std::unique_ptr<RGWSI_MetaBackend::Module> be_module;
+};
+
+// Oid prefixes for the three per-role index objects (name, id, path).
+// 'static' at namespace scope in a header gives each TU its own copy.
+static const std::string role_name_oid_prefix = "role_names.";
+static const std::string role_oid_prefix = "roles.";
+static const std::string role_path_oid_prefix = "role_paths.";
diff --git a/src/rgw/services/svc_sync_modules.cc b/src/rgw/services/svc_sync_modules.cc
new file mode 100644
index 000000000..ba9e7d172
--- /dev/null
+++ b/src/rgw/services/svc_sync_modules.cc
@@ -0,0 +1,44 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "svc_sync_modules.h"
+#include "svc_zone.h"
+
+#include "rgw_sync_module.h"
+#include "rgw_zone.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+// Create the sync-modules manager and register all built-in modules.
+// The raw allocation is released in ~RGWSI_SyncModules().
+void RGWSI_SyncModules::init(RGWSI_Zone *zone_svc)
+{
+ svc.zone = zone_svc;
+ sync_modules_manager = new RGWSyncModulesManager();
+ rgw_register_sync_modules(sync_modules_manager);
+}
+
+// Instantiate the sync module selected by this zone's tier type.
+// -ENOENT means the configured tier type matches no registered module.
+int RGWSI_SyncModules::do_start(optional_yield, const DoutPrefixProvider *dpp)
+{
+ auto& zone_public_config = svc.zone->get_zone();
+
+ int r = sync_modules_manager->create_instance(dpp, cct, zone_public_config.tier_type, svc.zone->get_zone_params().tier_config, &sync_module);
+ if (r >= 0) {
+ ldpp_dout(dpp, 20) << "started sync module instance, tier type = " << zone_public_config.tier_type << dendl;
+ return 0;
+ }
+
+ ldpp_dout(dpp, -1) << "ERROR: failed to start sync module instance, ret=" << r << dendl;
+ if (r == -ENOENT) {
+ ldpp_dout(dpp, -1) << "ERROR: " << zone_public_config.tier_type
+ << " sync module does not exist. valid sync modules: "
+ << sync_modules_manager->get_registered_module_names()
+ << dendl;
+ }
+ return r;
+}
+
+// Releases the manager allocated in init().
+RGWSI_SyncModules::~RGWSI_SyncModules()
+{
+ delete sync_modules_manager;
+}
+
diff --git a/src/rgw/services/svc_sync_modules.h b/src/rgw/services/svc_sync_modules.h
new file mode 100644
index 000000000..ea78f5817
--- /dev/null
+++ b/src/rgw/services/svc_sync_modules.h
@@ -0,0 +1,34 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_service.h"
+#include "rgw_sync_module.h"
+
+class RGWSI_Zone;
+
+class RGWSyncModulesManager;
+
+// Service owning the sync-modules manager and the per-zone sync module
+// instance selected by the zone's tier type.
+// NOTE(review): sync_modules_manager is an owning raw pointer deleted
+// in the dtor, but copy ctor/assignment are not deleted — a copy would
+// double-delete. Confirm service instances are never copied.
+class RGWSI_SyncModules : public RGWServiceInstance
+{
+ RGWSyncModulesManager *sync_modules_manager{nullptr};
+ RGWSyncModuleInstanceRef sync_module;
+
+ struct Svc {
+ RGWSI_Zone *zone{nullptr};
+ } svc;
+
+public:
+ RGWSI_SyncModules(CephContext *cct): RGWServiceInstance(cct) {}
+ ~RGWSI_SyncModules();
+
+ RGWSyncModulesManager *get_manager() {
+ return sync_modules_manager;
+ }
+
+ void init(RGWSI_Zone *zone_svc);
+ int do_start(optional_yield, const DoutPrefixProvider *dpp) override;
+
+ RGWSyncModuleInstanceRef& get_sync_module() { return sync_module; }
+};
diff --git a/src/rgw/services/svc_sys_obj.cc b/src/rgw/services/svc_sys_obj.cc
new file mode 100644
index 000000000..310e60514
--- /dev/null
+++ b/src/rgw/services/svc_sys_obj.cc
@@ -0,0 +1,183 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "svc_sys_obj.h"
+#include "svc_sys_obj_core.h"
+#include "svc_rados.h"
+#include "svc_zone.h"
+
+#include "rgw_zone.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+// Build an Obj accessor bound to the core sysobj service.
+RGWSI_SysObj::Obj RGWSI_SysObj::get_obj(const rgw_raw_obj& obj)
+{
+ return Obj(core_svc, obj);
+}
+
+// Allocate the core implementation's read state in the static_ptr slot.
+RGWSI_SysObj::Obj::ROp::ROp(Obj& _source) : source(_source) {
+ state.emplace<RGWSI_SysObj_Core::GetObjState>();
+}
+
+// Stat the underlying raw object, filling whichever of attrs / lastmod
+// / obj_size / objv_tracker the caller opted into via the setters.
+int RGWSI_SysObj::Obj::ROp::stat(optional_yield y, const DoutPrefixProvider *dpp)
+{
+ RGWSI_SysObj_Core *svc = source.core_svc;
+ rgw_raw_obj& obj = source.get_obj(); // accessor, consistent with the sibling ops
+
+ return svc->stat(*state, obj, attrs, raw_attrs,
+ lastmod, obj_size, objv_tracker, y, dpp);
+}
+
+// Read bytes [ofs, end] of the object into *bl, honoring any
+// objv/refresh/cache options configured on this ROp.
+int RGWSI_SysObj::Obj::ROp::read(const DoutPrefixProvider *dpp,
+ int64_t ofs, int64_t end, bufferlist *bl,
+ optional_yield y)
+{
+ RGWSI_SysObj_Core *svc = source.core_svc;
+ rgw_raw_obj& obj = source.get_obj();
+
+ return svc->read(dpp, *state,
+ objv_tracker,
+ obj, bl, ofs, end,
+ lastmod, obj_size,
+ attrs,
+ raw_attrs,
+ cache_info,
+ refresh_version, y);
+}
+
+// Fetch a single xattr of the object into *dest.
+int RGWSI_SysObj::Obj::ROp::get_attr(const DoutPrefixProvider *dpp,
+ const char *name, bufferlist *dest,
+ optional_yield y)
+{
+ RGWSI_SysObj_Core *svc = source.core_svc;
+ rgw_raw_obj& obj = source.get_obj();
+
+ return svc->get_attr(dpp, obj, name, dest, y);
+}
+
+// Delete the object, guarded by the optional objv_tracker.
+int RGWSI_SysObj::Obj::WOp::remove(const DoutPrefixProvider *dpp, optional_yield y)
+{
+ RGWSI_SysObj_Core *svc = source.core_svc;
+ rgw_raw_obj& obj = source.get_obj();
+
+ return svc->remove(dpp, objv_tracker, obj, y);
+}
+
+// Full write: data plus the attrs/mtime/exclusive options set on this WOp.
+int RGWSI_SysObj::Obj::WOp::write(const DoutPrefixProvider *dpp, bufferlist& bl, optional_yield y)
+{
+ RGWSI_SysObj_Core *svc = source.core_svc;
+ rgw_raw_obj& obj = source.get_obj();
+
+ return svc->write(dpp, obj, pmtime, attrs, exclusive,
+ bl, objv_tracker, mtime, y);
+}
+
+// Write object data only; attrs configured on this WOp are not touched.
+int RGWSI_SysObj::Obj::WOp::write_data(const DoutPrefixProvider *dpp, bufferlist& bl, optional_yield y)
+{
+ RGWSI_SysObj_Core *svc = source.core_svc;
+ rgw_raw_obj& obj = source.get_obj();
+
+ return svc->write_data(dpp, obj, bl, exclusive, objv_tracker, y);
+}
+
+// Write only the attrs stored on this WOp; data is left unchanged.
+int RGWSI_SysObj::Obj::WOp::write_attrs(const DoutPrefixProvider *dpp, optional_yield y)
+{
+ RGWSI_SysObj_Core *svc = source.core_svc;
+ rgw_raw_obj& obj = source.get_obj();
+
+ return svc->set_attrs(dpp, obj, attrs, nullptr, objv_tracker, exclusive, y);
+}
+
+// Write a single xattr by wrapping it in a one-element map.
+int RGWSI_SysObj::Obj::WOp::write_attr(const DoutPrefixProvider *dpp, const char *name, bufferlist& bl,
+ optional_yield y)
+{
+ RGWSI_SysObj_Core *svc = source.core_svc;
+ rgw_raw_obj& obj = source.get_obj();
+
+ map<string, bufferlist> attr_map = {{name, bl}};
+
+ return svc->set_attrs(dpp, obj, attr_map, nullptr, objv_tracker, exclusive, y);
+}
+
+// Invoke cb for every object in the pool whose name starts with prefix.
+int RGWSI_SysObj::Pool::list_prefixed_objs(const DoutPrefixProvider *dpp, const string& prefix, std::function<void(const string&)> cb)
+{
+ return core_svc->pool_list_prefixed_objs(dpp, pool, prefix, cb);
+}
+
+// Start a prefixed pool listing at 'marker'; state lives in this Op's ctx.
+int RGWSI_SysObj::Pool::Op::init(const DoutPrefixProvider *dpp, const string& marker, const string& prefix)
+{
+ return source.core_svc->pool_list_objects_init(dpp, source.pool, marker, prefix, &ctx);
+}
+
+// Fetch the next batch of up to 'max' oids from the listing.
+int RGWSI_SysObj::Pool::Op::get_next(const DoutPrefixProvider *dpp, int max, vector<string> *oids, bool *is_truncated)
+{
+ return source.core_svc->pool_list_objects_next(dpp, ctx, max, oids, is_truncated);
+}
+
+// Export the current listing position for a later resume.
+int RGWSI_SysObj::Pool::Op::get_marker(string *marker)
+{
+ return source.core_svc->pool_list_objects_get_marker(ctx, marker);
+}
+
+// Read every omap key/value of the object into *m.
+int RGWSI_SysObj::Obj::OmapOp::get_all(const DoutPrefixProvider *dpp, std::map<string, bufferlist> *m,
+ optional_yield y)
+{
+ RGWSI_SysObj_Core *svc = source.core_svc;
+ rgw_raw_obj& obj = source.obj;
+
+ return svc->omap_get_all(dpp, obj, m, y);
+}
+
+// Read up to 'count' omap values after 'marker'; *pmore reports whether
+// further entries remain.
+int RGWSI_SysObj::Obj::OmapOp::get_vals(const DoutPrefixProvider *dpp,
+ const string& marker, uint64_t count,
+ std::map<string, bufferlist> *m,
+ bool *pmore, optional_yield y)
+{
+ RGWSI_SysObj_Core *svc = source.core_svc;
+ rgw_raw_obj& obj = source.obj;
+
+ return svc->omap_get_vals(dpp, obj, marker, count, m, pmore, y);
+}
+
+// Set a single omap key; honors the must_exist flag set on this op.
+int RGWSI_SysObj::Obj::OmapOp::set(const DoutPrefixProvider *dpp, const std::string& key, bufferlist& bl,
+ optional_yield y)
+{
+ RGWSI_SysObj_Core *svc = source.core_svc;
+ rgw_raw_obj& obj = source.obj;
+
+ return svc->omap_set(dpp, obj, key, bl, must_exist, y);
+}
+
+// Set several omap keys at once; honors the must_exist flag.
+int RGWSI_SysObj::Obj::OmapOp::set(const DoutPrefixProvider *dpp, const map<std::string, bufferlist>& m,
+ optional_yield y)
+{
+ RGWSI_SysObj_Core *svc = source.core_svc;
+ rgw_raw_obj& obj = source.obj;
+
+ return svc->omap_set(dpp, obj, m, must_exist, y);
+}
+
+// Remove a single omap key from the object.
+int RGWSI_SysObj::Obj::OmapOp::del(const DoutPrefixProvider *dpp, const std::string& key, optional_yield y)
+{
+ RGWSI_SysObj_Core *svc = source.core_svc;
+ rgw_raw_obj& obj = source.obj;
+
+ return svc->omap_del(dpp, obj, key, y);
+}
+
+// Send a watch/notify payload to this object's watchers; *pbl collects
+// the aggregated replies.
+int RGWSI_SysObj::Obj::WNOp::notify(const DoutPrefixProvider *dpp, bufferlist& bl, uint64_t timeout_ms,
+ bufferlist *pbl, optional_yield y)
+{
+ RGWSI_SysObj_Core *svc = source.core_svc;
+ rgw_raw_obj& obj = source.obj;
+
+ return svc->notify(dpp, obj, bl, timeout_ms, pbl, y);
+}
+
+// Expose the zone service through the core sysobj service.
+RGWSI_Zone *RGWSI_SysObj::get_zone_svc()
+{
+ return core_svc->get_zone_svc();
+}
diff --git a/src/rgw/services/svc_sys_obj.h b/src/rgw/services/svc_sys_obj.h
new file mode 100644
index 000000000..f3e217dbd
--- /dev/null
+++ b/src/rgw/services/svc_sys_obj.h
@@ -0,0 +1,270 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "common/static_ptr.h"
+
+#include "rgw_service.h"
+
+#include "svc_rados.h"
+#include "svc_sys_obj_types.h"
+#include "svc_sys_obj_core_types.h"
+
+
+class RGWSI_Zone;
+class RGWSI_SysObj;
+
+struct rgw_cache_entry_info;
+
+// Facade service for "system objects" (zone/period/meta blobs etc.).
+// All operations are expressed as small builder-style op objects
+// (ROp/WOp/OmapOp/WNOp) that delegate to RGWSI_SysObj_Core.
+class RGWSI_SysObj : public RGWServiceInstance
+{
+ friend struct RGWServices_Def;
+
+public:
+ // Accessor for a single system object.
+ class Obj {
+ friend class ROp;
+
+ RGWSI_SysObj_Core *core_svc;
+ rgw_raw_obj obj;
+
+ public:
+ Obj(RGWSI_SysObj_Core *_core_svc, const rgw_raw_obj& _obj)
+ : core_svc(_core_svc), obj(_obj) {}
+
+ rgw_raw_obj& get_obj() {
+ return obj;
+ }
+
+ // Read operation; setters select which outputs to fill.
+ struct ROp {
+ Obj& source;
+
+ ceph::static_ptr<RGWSI_SysObj_Obj_GetObjState, sizeof(RGWSI_SysObj_Core_GetObjState)> state;
+
+ RGWObjVersionTracker *objv_tracker{nullptr};
+ std::map<std::string, bufferlist> *attrs{nullptr};
+ bool raw_attrs{false};
+ boost::optional<obj_version> refresh_version{boost::none};
+ ceph::real_time *lastmod{nullptr};
+ uint64_t *obj_size{nullptr};
+ rgw_cache_entry_info *cache_info{nullptr};
+
+ ROp& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
+ objv_tracker = _objv_tracker;
+ return *this;
+ }
+
+ ROp& set_last_mod(ceph::real_time *_lastmod) {
+ lastmod = _lastmod;
+ return *this;
+ }
+
+ ROp& set_obj_size(uint64_t *_obj_size) {
+ obj_size = _obj_size;
+ return *this;
+ }
+
+ ROp& set_attrs(std::map<std::string, bufferlist> *_attrs) {
+ attrs = _attrs;
+ return *this;
+ }
+
+ ROp& set_raw_attrs(bool ra) {
+ raw_attrs = ra;
+ return *this;
+ }
+
+ ROp& set_refresh_version(boost::optional<obj_version>& rf) {
+ refresh_version = rf;
+ return *this;
+ }
+
+ ROp& set_cache_info(rgw_cache_entry_info *ci) {
+ cache_info = ci;
+ return *this;
+ }
+
+ ROp(Obj& _source);
+
+ int stat(optional_yield y, const DoutPrefixProvider *dpp);
+ int read(const DoutPrefixProvider *dpp, int64_t ofs, int64_t end, bufferlist *pbl, optional_yield y);
+ // convenience: read the whole object
+ int read(const DoutPrefixProvider *dpp, bufferlist *pbl, optional_yield y) {
+ return read(dpp, 0, -1, pbl, y);
+ }
+ int get_attr(const DoutPrefixProvider *dpp, const char *name, bufferlist *dest, optional_yield y);
+ };
+
+ // Write operation; setters configure attrs/mtime/exclusive behavior.
+ struct WOp {
+ Obj& source;
+
+ RGWObjVersionTracker *objv_tracker{nullptr};
+ std::map<std::string, bufferlist> attrs;
+ ceph::real_time mtime;
+ ceph::real_time *pmtime{nullptr};
+ bool exclusive{false};
+
+ WOp& set_objv_tracker(RGWObjVersionTracker *_objv_tracker) {
+ objv_tracker = _objv_tracker;
+ return *this;
+ }
+
+ WOp& set_attrs(std::map<std::string, bufferlist>& _attrs) {
+ attrs = _attrs;
+ return *this;
+ }
+
+ WOp& set_attrs(std::map<std::string, bufferlist>&& _attrs) {
+ attrs = _attrs;
+ return *this;
+ }
+
+ WOp& set_mtime(const ceph::real_time& _mtime) {
+ mtime = _mtime;
+ return *this;
+ }
+
+ WOp& set_pmtime(ceph::real_time *_pmtime) {
+ pmtime = _pmtime;
+ return *this;
+ }
+
+ WOp& set_exclusive(bool _exclusive = true) {
+ exclusive = _exclusive;
+ return *this;
+ }
+
+ WOp(Obj& _source) : source(_source) {}
+
+ int remove(const DoutPrefixProvider *dpp, optional_yield y);
+ int write(const DoutPrefixProvider *dpp, bufferlist& bl, optional_yield y);
+
+ int write_data(const DoutPrefixProvider *dpp, bufferlist& bl, optional_yield y); /* write data only */
+ int write_attrs(const DoutPrefixProvider *dpp, optional_yield y); /* write attrs only */
+ int write_attr(const DoutPrefixProvider *dpp, const char *name, bufferlist& bl,
+ optional_yield y); /* write attrs only */
+ };
+
+ // Omap (key/value) operations on the object.
+ struct OmapOp {
+ Obj& source;
+
+ bool must_exist{false};
+
+ OmapOp& set_must_exist(bool _must_exist = true) {
+ must_exist = _must_exist;
+ return *this;
+ }
+
+ OmapOp(Obj& _source) : source(_source) {}
+
+ int get_all(const DoutPrefixProvider *dpp, std::map<std::string, bufferlist> *m, optional_yield y);
+ int get_vals(const DoutPrefixProvider *dpp, const std::string& marker, uint64_t count,
+ std::map<std::string, bufferlist> *m,
+ bool *pmore, optional_yield y);
+ int set(const DoutPrefixProvider *dpp, const std::string& key, bufferlist& bl, optional_yield y);
+ int set(const DoutPrefixProvider *dpp, const std::map<std::string, bufferlist>& m, optional_yield y);
+ int del(const DoutPrefixProvider *dpp, const std::string& key, optional_yield y);
+ };
+
+ // Watch/notify operation on the object.
+ struct WNOp {
+ Obj& source;
+
+ WNOp(Obj& _source) : source(_source) {}
+
+ int notify(const DoutPrefixProvider *dpp, bufferlist& bl, uint64_t timeout_ms, bufferlist *pbl,
+ optional_yield y);
+ };
+ ROp rop() {
+ return ROp(*this);
+ }
+
+ WOp wop() {
+ return WOp(*this);
+ }
+
+ OmapOp omap() {
+ return OmapOp(*this);
+ }
+
+ WNOp wn() {
+ return WNOp(*this);
+ }
+ };
+
+ // Accessor for pool-level listings of system objects.
+ class Pool {
+ friend class Op;
+ friend class RGWSI_SysObj_Core;
+
+ RGWSI_SysObj_Core *core_svc;
+ rgw_pool pool;
+
+ protected:
+ using ListImplInfo = RGWSI_SysObj_Pool_ListInfo;
+
+ struct ListCtx {
+ ceph::static_ptr<ListImplInfo, sizeof(RGWSI_SysObj_Core_PoolListImplInfo)> impl; /* update this if creating new backend types */
+ };
+
+ public:
+ Pool(RGWSI_SysObj_Core *_core_svc,
+ const rgw_pool& _pool) : core_svc(_core_svc),
+ pool(_pool) {}
+
+ rgw_pool& get_pool() {
+ return pool;
+ }
+
+ // Resumable prefixed listing (init/get_next/get_marker).
+ struct Op {
+ Pool& source;
+ ListCtx ctx;
+
+ Op(Pool& _source) : source(_source) {}
+
+ int init(const DoutPrefixProvider *dpp, const std::string& marker, const std::string& prefix);
+ int get_next(const DoutPrefixProvider *dpp, int max, std::vector<std::string> *oids, bool *is_truncated);
+ int get_marker(std::string *marker);
+ };
+
+ int list_prefixed_objs(const DoutPrefixProvider *dpp, const std::string& prefix, std::function<void(const std::string&)> cb);
+
+ // Collect all prefixed object names into any push_back-able container.
+ template <typename Container>
+ int list_prefixed_objs(const DoutPrefixProvider *dpp, const std::string& prefix,
+ Container *result) {
+ return list_prefixed_objs(dpp, prefix, [&](const std::string& val) {
+ result->push_back(val);
+ });
+ }
+
+ Op op() {
+ return Op(*this);
+ }
+ };
+
+ friend class Obj;
+ friend class Obj::ROp;
+ friend class Obj::WOp;
+ friend class Pool;
+ friend class Pool::Op;
+
+protected:
+ RGWSI_RADOS *rados_svc{nullptr};
+ RGWSI_SysObj_Core *core_svc{nullptr};
+
+ void init(RGWSI_RADOS *_rados_svc,
+ RGWSI_SysObj_Core *_core_svc) {
+ rados_svc = _rados_svc;
+ core_svc = _core_svc;
+ }
+
+public:
+ RGWSI_SysObj(CephContext *cct): RGWServiceInstance(cct) {}
+
+ Obj get_obj(const rgw_raw_obj& obj);
+
+ Pool get_pool(const rgw_pool& pool) {
+ return Pool(core_svc, pool);
+ }
+
+ RGWSI_Zone *get_zone_svc();
+};
+
+// Shorthand used by callers for a single system-object accessor.
+using RGWSysObj = RGWSI_SysObj::Obj;
diff --git a/src/rgw/services/svc_sys_obj_cache.cc b/src/rgw/services/svc_sys_obj_cache.cc
new file mode 100644
index 000000000..d1b7a3dbb
--- /dev/null
+++ b/src/rgw/services/svc_sys_obj_cache.cc
@@ -0,0 +1,670 @@
+
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "common/admin_socket.h"
+
+#include "svc_sys_obj_cache.h"
+#include "svc_zone.h"
+#include "svc_notify.h"
+
+#include "rgw_zone.h"
+#include "rgw_tools.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+// Thin adapter registered with the notify service; forwards watch/notify
+// callbacks and cache enable/disable toggles to the owning cache service.
+// NOTE(review): watch_cb/set_enabled presumably override virtuals on
+// RGWSI_Notify::CB -- confirm and mark them 'override'.
+class RGWSI_SysObj_Cache_CB : public RGWSI_Notify::CB
+{
+ RGWSI_SysObj_Cache *svc;
+public:
+ RGWSI_SysObj_Cache_CB(RGWSI_SysObj_Cache *_svc) : svc(_svc) {}
+ int watch_cb(const DoutPrefixProvider *dpp,
+ uint64_t notify_id,
+ uint64_t cookie,
+ uint64_t notifier_id,
+ bufferlist& bl) {
+ return svc->watch_cb(dpp, notify_id, cookie, notifier_id, bl);
+ }
+
+ void set_enabled(bool status) {
+ svc->set_enabled(status);
+ }
+};
+
+// Start the cache service: bring up the admin-socket hook, the core
+// sysobj service, and the notify service, then register our watch
+// callback so peer RGW instances can invalidate/update our cache.
+int RGWSI_SysObj_Cache::do_start(optional_yield y, const DoutPrefixProvider *dpp)
+{
+ int r = asocket.start();
+ if (r < 0) {
+ return r;
+ }
+
+ r = RGWSI_SysObj_Core::do_start(y, dpp);
+ if (r < 0) {
+ return r;
+ }
+
+ r = notify_svc->start(y, dpp);
+ if (r < 0) {
+ return r;
+ }
+
+ // register_watch_cb below requires a fully started notify service
+ assert(notify_svc->is_started());
+
+ cb.reset(new RGWSI_SysObj_Cache_CB(this));
+
+ notify_svc->register_watch_cb(cb.get());
+
+ return 0;
+}
+
+// Tear down in reverse of do_start(): admin socket first, then the core
+// service.
+void RGWSI_SysObj_Cache::shutdown()
+{
+ asocket.shutdown();
+ RGWSI_SysObj_Core::shutdown();
+}
+
+// Build the canonical cache key for an object: "<pool>+<ns>+<oid>".
+// Reserves the exact final size up front to avoid reallocation.
+static string normal_name(rgw_pool& pool, const std::string& oid) {
+ std::string key;
+ key.reserve(pool.name.size() + pool.ns.size() + oid.size() + 2);
+ key.append(pool.name);
+ key.push_back('+');
+ key.append(pool.ns);
+ key.push_back('+');
+ key.append(oid);
+ return key;
+}
+
+// Canonicalize a (pool, oid) pair. If the oid is empty the caller really
+// referenced the pool itself, so we remap it to an object named after the
+// pool inside the zone's domain_root pool.
+void RGWSI_SysObj_Cache::normalize_pool_and_obj(const rgw_pool& src_pool, const string& src_obj, rgw_pool& dst_pool, string& dst_obj)
+{
+ if (src_obj.size()) {
+ dst_pool = src_pool;
+ dst_obj = src_obj;
+ } else {
+ dst_pool = zone_svc->get_zone_params().domain_root;
+ dst_obj = src_pool.name;
+ }
+}
+
+
+// Remove a system object: drop it from the local cache, broadcast an
+// invalidation to peer RGW instances, then delete the backing object.
+// A failed broadcast is logged but does not abort the removal.
+int RGWSI_SysObj_Cache::remove(const DoutPrefixProvider *dpp,
+ RGWObjVersionTracker *objv_tracker,
+ const rgw_raw_obj& obj,
+ optional_yield y)
+
+{
+ rgw_pool pool;
+ string oid;
+ normalize_pool_and_obj(obj.pool, obj.oid, pool, oid);
+
+ string name = normal_name(pool, oid);
+ cache.invalidate_remove(dpp, name);
+
+ ObjectCacheInfo info;
+ int r = distribute_cache(dpp, name, obj, info, INVALIDATE_OBJ, y);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): failed to distribute cache: r=" << r << dendl;
+ }
+
+ return RGWSI_SysObj_Core::remove(dpp, objv_tracker, obj, y);
+}
+
+// Cached read of a system object. Serves the request from the local
+// object cache when possible; otherwise falls through to the core
+// (RADOS) read and populates the cache with the result.
+// Returns the number of bytes placed in *obl, or a negative error.
+// Fix: dropped a dead 'bufferptr p(r);' that allocated r bytes on the
+// cache-fill path and was never used.
+int RGWSI_SysObj_Cache::read(const DoutPrefixProvider *dpp,
+ RGWSI_SysObj_Obj_GetObjState& read_state,
+ RGWObjVersionTracker *objv_tracker,
+ const rgw_raw_obj& obj,
+ bufferlist *obl, off_t ofs, off_t end,
+ ceph::real_time* pmtime, uint64_t* psize,
+ map<string, bufferlist> *attrs,
+ bool raw_attrs,
+ rgw_cache_entry_info *cache_info,
+ boost::optional<obj_version> refresh_version,
+ optional_yield y)
+{
+ rgw_pool pool;
+ string oid;
+ // only whole-object reads (ofs == 0) are cacheable
+ if (ofs != 0) {
+ return RGWSI_SysObj_Core::read(dpp, read_state, objv_tracker, obj, obl,
+ ofs, end, pmtime, psize, attrs, raw_attrs,
+ cache_info, refresh_version, y);
+ }
+
+ normalize_pool_and_obj(obj.pool, obj.oid, pool, oid);
+ string name = normal_name(pool, oid);
+
+ ObjectCacheInfo info;
+
+ // request only the cache facets the caller actually needs
+ uint32_t flags = (end != 0 ? CACHE_FLAG_DATA : 0);
+ if (objv_tracker)
+ flags |= CACHE_FLAG_OBJV;
+ if (pmtime || psize)
+ flags |= CACHE_FLAG_META;
+ if (attrs)
+ flags |= CACHE_FLAG_XATTRS;
+
+ int r = cache.get(dpp, name, info, flags, cache_info);
+ if (r == 0 &&
+ (!refresh_version || !info.version.compare(&(*refresh_version)))) {
+ // cache hit; a cached negative result short-circuits with its error
+ if (info.status < 0)
+ return info.status;
+
+ bufferlist& bl = info.data;
+
+ bufferlist::iterator i = bl.begin();
+
+ obl->clear();
+
+ i.copy_all(*obl);
+ if (objv_tracker)
+ objv_tracker->read_version = info.version;
+ if (pmtime) {
+ *pmtime = info.meta.mtime;
+ }
+ if (psize) {
+ *psize = info.meta.size;
+ }
+ if (attrs) {
+ if (raw_attrs) {
+ *attrs = info.xattrs;
+ } else {
+ rgw_filter_attrset(info.xattrs, RGW_ATTR_PREFIX, attrs);
+ }
+ }
+ return obl->length();
+ }
+ if(r == -ENODATA)
+ return -ENOENT;
+
+ // if we only ask for one of mtime or size, ask for the other too so we can
+ // satisfy CACHE_FLAG_META
+ uint64_t size = 0;
+ real_time mtime;
+ if (pmtime) {
+ if (!psize) {
+ psize = &size;
+ }
+ } else if (psize) {
+ if (!pmtime) {
+ pmtime = &mtime;
+ }
+ }
+
+ map<string, bufferlist> unfiltered_attrset;
+ r = RGWSI_SysObj_Core::read(dpp, read_state, objv_tracker,
+ obj, obl, ofs, end, pmtime, psize,
+ (attrs ? &unfiltered_attrset : nullptr),
+ true, /* cache unfiltered attrs */
+ cache_info,
+ refresh_version, y);
+ if (r < 0) {
+ if (r == -ENOENT) { // only update ENOENT, we'd rather retry other errors
+ info.status = r;
+ cache.put(dpp, name, info, cache_info);
+ }
+ return r;
+ }
+
+ if (obl->length() == end + 1) {
+ /* in this case, most likely object contains more data, we can't cache it */
+ flags &= ~CACHE_FLAG_DATA;
+ } else {
+ // full object read: copy the data into the cache entry
+ bufferlist& bl = info.data;
+ bl.clear();
+ bufferlist::iterator o = obl->begin();
+ o.copy_all(bl);
+ }
+
+ info.status = 0;
+ info.flags = flags;
+ if (objv_tracker) {
+ info.version = objv_tracker->read_version;
+ }
+ if (pmtime) {
+ info.meta.mtime = *pmtime;
+ }
+ if (psize) {
+ info.meta.size = *psize;
+ }
+ if (attrs) {
+ info.xattrs = std::move(unfiltered_attrset);
+ if (raw_attrs) {
+ *attrs = info.xattrs;
+ } else {
+ rgw_filter_attrset(info.xattrs, RGW_ATTR_PREFIX, attrs);
+ }
+ }
+ cache.put(dpp, name, info, cache_info);
+ return r;
+}
+
+// Fetch a single xattr, preferring the cached xattr set. On a cache
+// miss the request falls through to the core service without being
+// cached (single-attr results are not cached here).
+int RGWSI_SysObj_Cache::get_attr(const DoutPrefixProvider *dpp,
+ const rgw_raw_obj& obj,
+ const char *attr_name,
+ bufferlist *dest,
+ optional_yield y)
+{
+ rgw_pool pool;
+ string oid;
+
+ normalize_pool_and_obj(obj.pool, obj.oid, pool, oid);
+ string name = normal_name(pool, oid);
+
+ ObjectCacheInfo info;
+
+ uint32_t flags = CACHE_FLAG_XATTRS;
+
+ int r = cache.get(dpp, name, info, flags, nullptr);
+ if (r == 0) {
+ if (info.status < 0)
+ return info.status;
+
+ auto iter = info.xattrs.find(attr_name);
+ if (iter == info.xattrs.end()) {
+ return -ENODATA;
+ }
+
+ *dest = iter->second;
+ return dest->length();
+ } else if (r == -ENODATA) {
+ // cached negative entry: the object is known not to exist
+ return -ENOENT;
+ }
+ /* don't try to cache this one */
+ return RGWSI_SysObj_Core::get_attr(dpp, obj, attr_name, dest, y);
+}
+
+// Write attrs through to RADOS, then merge them into the local cache
+// and broadcast the update to peers; on failure the cache entry is
+// invalidated instead.
+int RGWSI_SysObj_Cache::set_attrs(const DoutPrefixProvider *dpp,
+ const rgw_raw_obj& obj,
+ map<string, bufferlist>& attrs,
+ map<string, bufferlist> *rmattrs,
+ RGWObjVersionTracker *objv_tracker,
+ bool exclusive, optional_yield y)
+{
+ rgw_pool pool;
+ string oid;
+ normalize_pool_and_obj(obj.pool, obj.oid, pool, oid);
+ ObjectCacheInfo info;
+ info.xattrs = attrs;
+ if (rmattrs) {
+ info.rm_xattrs = *rmattrs;
+ }
+ info.status = 0;
+ // MODIFY_XATTRS merges into an existing cached xattr set
+ info.flags = CACHE_FLAG_MODIFY_XATTRS;
+ int ret = RGWSI_SysObj_Core::set_attrs(dpp, obj, attrs, rmattrs, objv_tracker, exclusive, y);
+ string name = normal_name(pool, oid);
+ if (ret >= 0) {
+ if (objv_tracker && objv_tracker->read_version.ver) {
+ info.version = objv_tracker->read_version;
+ info.flags |= CACHE_FLAG_OBJV;
+ }
+ cache.put(dpp, name, info, NULL);
+ int r = distribute_cache(dpp, name, obj, info, UPDATE_OBJ, y);
+ if (r < 0)
+ ldpp_dout(dpp, 0) << "ERROR: failed to distribute cache for " << obj << dendl;
+ } else {
+ cache.invalidate_remove(dpp, name);
+ }
+
+ return ret;
+}
+
+// Full-object write-through: write data+attrs via the core service,
+// then cache the new content (data, xattrs, meta) and broadcast the
+// update; invalidate the local entry if the write failed.
+int RGWSI_SysObj_Cache::write(const DoutPrefixProvider *dpp,
+ const rgw_raw_obj& obj,
+ real_time *pmtime,
+ map<std::string, bufferlist>& attrs,
+ bool exclusive,
+ const bufferlist& data,
+ RGWObjVersionTracker *objv_tracker,
+ real_time set_mtime,
+ optional_yield y)
+{
+ rgw_pool pool;
+ string oid;
+ normalize_pool_and_obj(obj.pool, obj.oid, pool, oid);
+ ObjectCacheInfo info;
+ info.xattrs = attrs;
+ info.status = 0;
+ info.data = data;
+ info.flags = CACHE_FLAG_XATTRS | CACHE_FLAG_DATA | CACHE_FLAG_META;
+ ceph::real_time result_mtime;
+ int ret = RGWSI_SysObj_Core::write(dpp, obj, &result_mtime, attrs,
+ exclusive, data,
+ objv_tracker, set_mtime, y);
+ if (pmtime) {
+ *pmtime = result_mtime;
+ }
+ if (objv_tracker && objv_tracker->read_version.ver) {
+ info.version = objv_tracker->read_version;
+ info.flags |= CACHE_FLAG_OBJV;
+ }
+ // meta reflects the mtime actually applied by the write
+ info.meta.mtime = result_mtime;
+ info.meta.size = data.length();
+ string name = normal_name(pool, oid);
+ if (ret >= 0) {
+ cache.put(dpp, name, info, NULL);
+ int r = distribute_cache(dpp, name, obj, info, UPDATE_OBJ, y);
+ if (r < 0)
+ ldpp_dout(dpp, 0) << "ERROR: failed to distribute cache for " << obj << dendl;
+ } else {
+ cache.invalidate_remove(dpp, name);
+ }
+
+ return ret;
+}
+
+// Data-only write-through (no xattrs/meta): caches just the data facet
+// and broadcasts the update, or invalidates on failure.
+int RGWSI_SysObj_Cache::write_data(const DoutPrefixProvider *dpp,
+ const rgw_raw_obj& obj,
+ const bufferlist& data,
+ bool exclusive,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y)
+{
+ rgw_pool pool;
+ string oid;
+ normalize_pool_and_obj(obj.pool, obj.oid, pool, oid);
+
+ ObjectCacheInfo info;
+ info.data = data;
+ info.meta.size = data.length();
+ info.status = 0;
+ info.flags = CACHE_FLAG_DATA;
+
+ int ret = RGWSI_SysObj_Core::write_data(dpp, obj, data, exclusive, objv_tracker, y);
+ string name = normal_name(pool, oid);
+ if (ret >= 0) {
+ if (objv_tracker && objv_tracker->read_version.ver) {
+ info.version = objv_tracker->read_version;
+ info.flags |= CACHE_FLAG_OBJV;
+ }
+ cache.put(dpp, name, info, NULL);
+ int r = distribute_cache(dpp, name, obj, info, UPDATE_OBJ, y);
+ if (r < 0)
+ ldpp_dout(dpp, 0) << "ERROR: failed to distribute cache for " << obj << dendl;
+ } else {
+ cache.invalidate_remove(dpp, name);
+ }
+
+ return ret;
+}
+
+// Cached stat: serve size/mtime/xattrs (and optionally the object
+// version) from the cache, falling back to a core raw_stat that then
+// populates the cache. ENOENT results are negatively cached.
+int RGWSI_SysObj_Cache::raw_stat(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj,
+ uint64_t *psize, real_time *pmtime,
+ map<string, bufferlist> *attrs,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y)
+{
+ rgw_pool pool;
+ string oid;
+ normalize_pool_and_obj(obj.pool, obj.oid, pool, oid);
+
+ string name = normal_name(pool, oid);
+
+ uint64_t size;
+ real_time mtime;
+
+ ObjectCacheInfo info;
+ uint32_t flags = CACHE_FLAG_META | CACHE_FLAG_XATTRS;
+ if (objv_tracker)
+ flags |= CACHE_FLAG_OBJV;
+ int r = cache.get(dpp, name, info, flags, NULL);
+ if (r == 0) {
+ // cache hit (info.status < 0 is a cached negative result)
+ if (info.status < 0)
+ return info.status;
+
+ size = info.meta.size;
+ mtime = info.meta.mtime;
+ if (objv_tracker)
+ objv_tracker->read_version = info.version;
+ goto done;
+ }
+ if (r == -ENODATA) {
+ return -ENOENT;
+ }
+ // cache miss: stat the backing object, then fill the cache
+ r = RGWSI_SysObj_Core::raw_stat(dpp, obj, &size, &mtime, &info.xattrs,
+ objv_tracker, y);
+ if (r < 0) {
+ if (r == -ENOENT) {
+ info.status = r;
+ cache.put(dpp, name, info, NULL);
+ }
+ return r;
+ }
+ info.status = 0;
+ info.meta.mtime = mtime;
+ info.meta.size = size;
+ info.flags = CACHE_FLAG_META | CACHE_FLAG_XATTRS;
+ if (objv_tracker) {
+ info.flags |= CACHE_FLAG_OBJV;
+ info.version = objv_tracker->read_version;
+ }
+ cache.put(dpp, name, info, NULL);
+done:
+ if (psize)
+ *psize = size;
+ if (pmtime)
+ *pmtime = mtime;
+ if (attrs)
+ *attrs = info.xattrs;
+ return 0;
+}
+
+// Broadcast a cache update (UPDATE_OBJ) or invalidation (INVALIDATE_OBJ)
+// for the given object to peer RGW instances via the notify service.
+int RGWSI_SysObj_Cache::distribute_cache(const DoutPrefixProvider *dpp,
+ const string& normal_name,
+ const rgw_raw_obj& obj,
+ ObjectCacheInfo& obj_info, int op,
+ optional_yield y)
+{
+ RGWCacheNotifyInfo cinfo;
+ cinfo.op = op;
+ cinfo.obj = obj;
+ cinfo.obj_info = obj_info;
+ return notify_svc->distribute(dpp, normal_name, cinfo, y);
+}
+
+// Handle a watch/notify message from a peer: decode the notification
+// payload and apply the requested cache update or invalidation locally.
+int RGWSI_SysObj_Cache::watch_cb(const DoutPrefixProvider *dpp,
+ uint64_t notify_id,
+ uint64_t cookie,
+ uint64_t notifier_id,
+ bufferlist& bl)
+{
+ RGWCacheNotifyInfo info;
+
+ try {
+ auto iter = bl.cbegin();
+ decode(info, iter);
+ } catch (buffer::end_of_buffer& err) {
+ ldpp_dout(dpp, 0) << "ERROR: got bad notification" << dendl;
+ return -EIO;
+ } catch (buffer::error& err) {
+ ldpp_dout(dpp, 0) << "ERROR: buffer::error" << dendl;
+ return -EIO;
+ }
+
+ rgw_pool pool;
+ string oid;
+ normalize_pool_and_obj(info.obj.pool, info.obj.oid, pool, oid);
+ string name = normal_name(pool, oid);
+
+ switch (info.op) {
+ case UPDATE_OBJ:
+ cache.put(dpp, name, info.obj_info, NULL);
+ break;
+ case INVALIDATE_OBJ:
+ cache.invalidate_remove(dpp, name);
+ break;
+ default:
+ ldpp_dout(dpp, 0) << "WARNING: got unknown notification op: " << info.op << dendl;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+// Enable or disable the object cache (driven by the notify service).
+void RGWSI_SysObj_Cache::set_enabled(bool status)
+{
+ cache.set_enabled(status);
+}
+
+// Link a chained-cache entry to the underlying cache entries so that
+// invalidation of any of them also invalidates the chained entry.
+bool RGWSI_SysObj_Cache::chain_cache_entry(const DoutPrefixProvider *dpp,
+ std::initializer_list<rgw_cache_entry_info *> cache_info_entries,
+ RGWChainedCache::Entry *chained_entry)
+{
+ return cache.chain_cache_entry(dpp, cache_info_entries, chained_entry);
+}
+
+// Register/unregister a secondary (chained) cache with the object cache.
+void RGWSI_SysObj_Cache::register_chained_cache(RGWChainedCache *cc)
+{
+ cache.chain_cache(cc);
+}
+
+void RGWSI_SysObj_Cache::unregister_chained_cache(RGWChainedCache *cc)
+{
+ cache.unchain_cache(cc);
+}
+
+// Dump one cache entry (name, mtime, size) into the Formatter for the
+// "cache list" admin-socket command. Field order is part of the output
+// format -- do not reorder.
+static void cache_list_dump_helper(Formatter* f,
+ const std::string& name,
+ const ceph::real_time mtime,
+ const std::uint64_t size)
+{
+ f->dump_string("name", name);
+ f->dump_string("mtime", ceph::to_iso_8601(mtime));
+ f->dump_unsigned("size", size);
+}
+
+// Admin-socket hook exposing cache inspection commands
+// (list / inspect / erase / zap); forwards each to the cache service's
+// ASocketHandler.
+class RGWSI_SysObj_Cache_ASocketHook : public AdminSocketHook {
+ RGWSI_SysObj_Cache *svc;
+
+ // {command descriptor, help text} pairs registered with the admin socket
+ static constexpr std::string_view admin_commands[][2] = {
+ { "cache list name=filter,type=CephString,req=false",
+ "cache list [filter_str]: list object cache, possibly matching substrings" },
+ { "cache inspect name=target,type=CephString,req=true",
+ "cache inspect target: print cache element" },
+ { "cache erase name=target,type=CephString,req=true",
+ "cache erase target: erase element from cache" },
+ { "cache zap",
+ "cache zap: erase all elements from cache" }
+ };
+
+public:
+ RGWSI_SysObj_Cache_ASocketHook(RGWSI_SysObj_Cache *_svc) : svc(_svc) {}
+
+ int start();
+ void shutdown();
+
+ int call(std::string_view command, const cmdmap_t& cmdmap,
+ const bufferlist&,
+ Formatter *f,
+ std::ostream& ss,
+ bufferlist& out) override;
+};
+
+// Register every admin command; returns the first registration error
+// (commands already registered are cleaned up later by shutdown()).
+int RGWSI_SysObj_Cache_ASocketHook::start()
+{
+ auto admin_socket = svc->ctx()->get_admin_socket();
+ for (auto cmd : admin_commands) {
+ int r = admin_socket->register_command(cmd[0], this, cmd[1]);
+ if (r < 0) {
+ ldout(svc->ctx(), 0) << "ERROR: fail to register admin socket command (r=" << r
+ << ")" << dendl;
+ return r;
+ }
+ }
+ return 0;
+}
+
+// Unregister all commands owned by this hook.
+void RGWSI_SysObj_Cache_ASocketHook::shutdown()
+{
+ auto admin_socket = svc->ctx()->get_admin_socket();
+ admin_socket->unregister_commands(this);
+}
+
+// Dispatch an admin-socket command to the matching ASocketHandler
+// method. Returns -ENOENT when a named target is missing and -ENOSYS
+// for unrecognized commands.
+int RGWSI_SysObj_Cache_ASocketHook::call(
+ std::string_view command, const cmdmap_t& cmdmap,
+ const bufferlist&,
+ Formatter *f,
+ std::ostream& ss,
+ bufferlist& out)
+{
+ if (command == "cache list"sv) {
+ // optional substring filter
+ std::optional<std::string> filter;
+ if (auto i = cmdmap.find("filter"); i != cmdmap.cend()) {
+ filter = boost::get<std::string>(i->second);
+ }
+ f->open_array_section("cache_entries");
+ svc->asocket.call_list(filter, f);
+ f->close_section();
+ return 0;
+ } else if (command == "cache inspect"sv) {
+ const auto& target = boost::get<std::string>(cmdmap.at("target"));
+ if (svc->asocket.call_inspect(target, f)) {
+ return 0;
+ } else {
+ ss << "Unable to find entry "s + target + ".\n";
+ return -ENOENT;
+ }
+ } else if (command == "cache erase"sv) {
+ const auto& target = boost::get<std::string>(cmdmap.at("target"));
+ if (svc->asocket.call_erase(target)) {
+ return 0;
+ } else {
+ ss << "Unable to find entry "s + target + ".\n";
+ return -ENOENT;
+ }
+ } else if (command == "cache zap"sv) {
+ svc->asocket.call_zap();
+ return 0;
+ }
+ return -ENOSYS;
+}
+
+// ASocketHandler owns the admin-socket hook for the cache service and
+// forwards lifecycle calls to it.
+RGWSI_SysObj_Cache::ASocketHandler::ASocketHandler(const DoutPrefixProvider *_dpp, RGWSI_SysObj_Cache *_svc) : dpp(_dpp), svc(_svc)
+{
+ hook.reset(new RGWSI_SysObj_Cache_ASocketHook(_svc));
+}
+
+// out-of-line dtor so the unique_ptr's deleter sees the complete hook type
+RGWSI_SysObj_Cache::ASocketHandler::~ASocketHandler()
+{
+}
+
+int RGWSI_SysObj_Cache::ASocketHandler::start()
+{
+ return hook->start();
+}
+
+void RGWSI_SysObj_Cache::ASocketHandler::shutdown()
+{
+ return hook->shutdown();
+}
+
+// Dump every cache entry whose name contains the optional filter
+// substring.
+void RGWSI_SysObj_Cache::ASocketHandler::call_list(const std::optional<std::string>& filter, Formatter* f)
+{
+ svc->cache.for_each(
+ [&filter, f] (const string& name, const ObjectCacheEntry& entry) {
+ if (!filter || name.find(*filter) != name.npos) {
+ cache_list_dump_helper(f, name, entry.info.meta.mtime,
+ entry.info.meta.size);
+ }
+ });
+}
+
+// Dump a single entry by exact name; returns true (1) if found,
+// false (0) otherwise.
+int RGWSI_SysObj_Cache::ASocketHandler::call_inspect(const std::string& target, Formatter* f)
+{
+ if (const auto entry = svc->cache.get(dpp, target)) {
+ f->open_object_section("cache_entry");
+ f->dump_string("name", target.c_str());
+ entry->dump(f);
+ f->close_section();
+ return true;
+ } else {
+ return false;
+ }
+}
+
+// Erase one entry by name; returns the cache's removal result.
+int RGWSI_SysObj_Cache::ASocketHandler::call_erase(const std::string& target)
+{
+ return svc->cache.invalidate_remove(dpp, target);
+}
+
+// Drop every entry from the cache.
+int RGWSI_SysObj_Cache::ASocketHandler::call_zap()
+{
+ svc->cache.invalidate_all();
+ return 0;
+}
diff --git a/src/rgw/services/svc_sys_obj_cache.h b/src/rgw/services/svc_sys_obj_cache.h
new file mode 100644
index 000000000..f7950843f
--- /dev/null
+++ b/src/rgw/services/svc_sys_obj_cache.h
@@ -0,0 +1,222 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "common/RWLock.h"
+#include "rgw_service.h"
+#include "rgw_cache.h"
+
+#include "svc_sys_obj_core.h"
+
+class RGWSI_Notify;
+
+class RGWSI_SysObj_Cache_CB;
+class RGWSI_SysObj_Cache_ASocketHook;
+
+// Caching decorator over RGWSI_SysObj_Core: every read/stat consults an
+// in-memory ObjectCache first, writes are written through, and cache
+// coherence across RGW instances is maintained via RGWSI_Notify
+// watch/notify messages.
+class RGWSI_SysObj_Cache : public RGWSI_SysObj_Core
+{
+ friend class RGWSI_SysObj_Cache_CB;
+ friend class RGWServices_Def;
+ friend class ASocketHandler;
+
+ RGWSI_Notify *notify_svc{nullptr};
+ ObjectCache cache;
+
+ std::shared_ptr<RGWSI_SysObj_Cache_CB> cb;
+
+ // Map an empty oid to an object named after the pool in domain_root.
+ void normalize_pool_and_obj(const rgw_pool& src_pool, const std::string& src_obj, rgw_pool& dst_pool, std::string& dst_obj);
+protected:
+ void init(RGWSI_RADOS *_rados_svc,
+ RGWSI_Zone *_zone_svc,
+ RGWSI_Notify *_notify_svc) {
+ core_init(_rados_svc, _zone_svc);
+ notify_svc = _notify_svc;
+ }
+
+ int do_start(optional_yield, const DoutPrefixProvider *dpp) override;
+ void shutdown() override;
+
+ int raw_stat(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj,
+ uint64_t *psize, real_time *pmtime,
+ std::map<std::string, bufferlist> *attrs,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y) override;
+
+ int read(const DoutPrefixProvider *dpp,
+ RGWSI_SysObj_Obj_GetObjState& read_state,
+ RGWObjVersionTracker *objv_tracker,
+ const rgw_raw_obj& obj,
+ bufferlist *bl, off_t ofs, off_t end,
+ ceph::real_time* pmtime, uint64_t* psize,
+ std::map<std::string, bufferlist> *attrs,
+ bool raw_attrs,
+ rgw_cache_entry_info *cache_info,
+ boost::optional<obj_version>,
+ optional_yield y) override;
+
+ int get_attr(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, const char *name, bufferlist *dest,
+ optional_yield y) override;
+
+ int set_attrs(const DoutPrefixProvider *dpp,
+ const rgw_raw_obj& obj,
+ std::map<std::string, bufferlist>& attrs,
+ std::map<std::string, bufferlist> *rmattrs,
+ RGWObjVersionTracker *objv_tracker,
+ bool exclusive, optional_yield y) override;
+
+ int remove(const DoutPrefixProvider *dpp,
+ RGWObjVersionTracker *objv_tracker,
+ const rgw_raw_obj& obj,
+ optional_yield y) override;
+
+ int write(const DoutPrefixProvider *dpp,
+ const rgw_raw_obj& obj,
+ real_time *pmtime,
+ std::map<std::string, bufferlist>& attrs,
+ bool exclusive,
+ const bufferlist& data,
+ RGWObjVersionTracker *objv_tracker,
+ real_time set_mtime,
+ optional_yield y) override;
+
+ // NOTE(review): unlike the declarations above this one is not marked
+ // 'override' -- confirm the base declares write_data virtual and add it.
+ int write_data(const DoutPrefixProvider *dpp,
+ const rgw_raw_obj& obj,
+ const bufferlist& bl,
+ bool exclusive,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y);
+
+ // Broadcast a cache update/invalidation to peer RGW instances.
+ int distribute_cache(const DoutPrefixProvider *dpp, const std::string& normal_name, const rgw_raw_obj& obj,
+ ObjectCacheInfo& obj_info, int op,
+ optional_yield y);
+
+ // Apply a peer's notification to the local cache.
+ int watch_cb(const DoutPrefixProvider *dpp,
+ uint64_t notify_id,
+ uint64_t cookie,
+ uint64_t notifier_id,
+ bufferlist& bl);
+
+ void set_enabled(bool status);
+
+public:
+ RGWSI_SysObj_Cache(const DoutPrefixProvider *dpp, CephContext *cct) : RGWSI_SysObj_Core(cct), asocket(dpp, this) {
+ cache.set_ctx(cct);
+ }
+
+ bool chain_cache_entry(const DoutPrefixProvider *dpp,
+ std::initializer_list<rgw_cache_entry_info *> cache_info_entries,
+ RGWChainedCache::Entry *chained_entry);
+ void register_chained_cache(RGWChainedCache *cc);
+ void unregister_chained_cache(RGWChainedCache *cc);
+
+ // Admin-socket facade for the cache inspection commands.
+ class ASocketHandler {
+ const DoutPrefixProvider *dpp;
+ RGWSI_SysObj_Cache *svc;
+
+ std::unique_ptr<RGWSI_SysObj_Cache_ASocketHook> hook;
+
+ public:
+ ASocketHandler(const DoutPrefixProvider *dpp, RGWSI_SysObj_Cache *_svc);
+ ~ASocketHandler();
+
+ int start();
+ void shutdown();
+
+ // `call_list` must iterate over all cache entries and call
+ // `cache_list_dump_helper` with the supplied Formatter on any that
+ // include `filter` as a substd::string.
+ //
+ void call_list(const std::optional<std::string>& filter, Formatter* f);
+
+ // `call_inspect` must look up the requested target and, if found,
+ // dump it to the supplied Formatter and return true. If not found,
+ // it must return false.
+ //
+ int call_inspect(const std::string& target, Formatter* f);
+
+ // `call_erase` must erase the requested target and return true. If
+ // the requested target does not exist, it should return false.
+ int call_erase(const std::string& target);
+
+ // `call_zap` must erase the cache.
+ int call_zap();
+ } asocket;
+};
+
+// Typed secondary cache chained to the sysobj cache: entries are
+// invalidated whenever the underlying sysobj cache entries they were
+// chained to are invalidated. Reads take a shared lock, writes an
+// exclusive lock. Entries expire after rgw_cache_expiry_interval.
+// Fix: chain_cb previously looked entries[key] up twice; now a single
+// lookup is reused for both fields.
+template <class T>
+class RGWChainedCacheImpl : public RGWChainedCache {
+ RGWSI_SysObj_Cache *svc{nullptr};
+ ceph::timespan expiry;
+ RWLock lock;
+
+ // key -> (cached value, time it was stored) for expiry checks
+ std::unordered_map<std::string, std::pair<T, ceph::coarse_mono_time>> entries;
+
+public:
+ RGWChainedCacheImpl() : lock("RGWChainedCacheImpl::lock") {}
+ ~RGWChainedCacheImpl() {
+ if (!svc) {
+ return;
+ }
+ svc->unregister_chained_cache(this);
+ }
+
+ // Called by the sysobj cache when it drops us; stop touching it.
+ void unregistered() override {
+ svc = nullptr;
+ }
+
+ void init(RGWSI_SysObj_Cache *_svc) {
+ if (!_svc) {
+ return;
+ }
+ svc = _svc;
+ svc->register_chained_cache(this);
+ expiry = std::chrono::seconds(svc->ctx()->_conf.get_val<uint64_t>(
+ "rgw_cache_expiry_interval"));
+ }
+
+ // Return a copy of the cached value, or none if absent or expired.
+ boost::optional<T> find(const std::string& key) {
+ std::shared_lock rl{lock};
+ auto iter = entries.find(key);
+ if (iter == entries.end()) {
+ return boost::none;
+ }
+ if (expiry.count() &&
+ (ceph::coarse_mono_clock::now() - iter->second.second) > expiry) {
+ return boost::none;
+ }
+
+ return iter->second.first;
+ }
+
+ // Insert via the sysobj cache so the entry gets chained to the given
+ // cache_info_entries. Note: the 'svc' parameter intentionally shadows
+ // the member.
+ bool put(const DoutPrefixProvider *dpp, RGWSI_SysObj_Cache *svc, const std::string& key, T *entry,
+ std::initializer_list<rgw_cache_entry_info *> cache_info_entries) {
+ if (!svc) {
+ return false;
+ }
+
+ Entry chain_entry(this, key, entry);
+
+ /* we need the svc cache to call us under its lock to maintain lock ordering */
+ return svc->chain_cache_entry(dpp, cache_info_entries, &chain_entry);
+ }
+
+ // Invoked (under the sysobj cache's lock) to store the chained value.
+ void chain_cb(const std::string& key, void *data) override {
+ T *entry = static_cast<T *>(data);
+ std::unique_lock wl{lock};
+ auto& slot = entries[key]; // single lookup; default-constructs if new
+ slot.first = *entry;
+ if (expiry.count() > 0) {
+ slot.second = ceph::coarse_mono_clock::now();
+ }
+ }
+
+ void invalidate(const std::string& key) override {
+ std::unique_lock wl{lock};
+ entries.erase(key);
+ }
+
+ void invalidate_all() override {
+ std::unique_lock wl{lock};
+ entries.clear();
+ }
+}; /* RGWChainedCacheImpl */
diff --git a/src/rgw/services/svc_sys_obj_core.cc b/src/rgw/services/svc_sys_obj_core.cc
new file mode 100644
index 000000000..303089691
--- /dev/null
+++ b/src/rgw/services/svc_sys_obj_core.cc
@@ -0,0 +1,666 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "svc_sys_obj_core.h"
+#include "svc_rados.h"
+#include "svc_zone.h"
+
+#include "rgw_tools.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+
+// Lazily open (and memoize) the RADOS object handle for this read
+// state; subsequent calls return the cached handle.
+// NOTE(review): the zone_svc parameter is unused here -- candidate for
+// removal if no callers depend on the signature.
+int RGWSI_SysObj_Core_GetObjState::get_rados_obj(const DoutPrefixProvider *dpp,
+ RGWSI_RADOS *rados_svc,
+ RGWSI_Zone *zone_svc,
+ const rgw_raw_obj& obj,
+ RGWSI_RADOS::Obj **pobj)
+{
+ if (!has_rados_obj) {
+ if (obj.oid.empty()) {
+ ldpp_dout(dpp, 0) << "ERROR: obj.oid is empty" << dendl;
+ return -EINVAL;
+ }
+
+ rados_obj = rados_svc->obj(obj);
+ int r = rados_obj.open(dpp);
+ if (r < 0) {
+ return r;
+ }
+ has_rados_obj = true;
+ }
+ *pobj = &rados_obj;
+ return 0;
+}
+
+// Open a fresh RADOS object handle for obj into *pobj.
+// NOTE(review): the zone_svc parameter is unused (the member rados_svc
+// is used instead) -- kept for signature compatibility.
+int RGWSI_SysObj_Core::get_rados_obj(const DoutPrefixProvider *dpp,
+ RGWSI_Zone *zone_svc,
+ const rgw_raw_obj& obj,
+ RGWSI_RADOS::Obj *pobj)
+{
+ if (obj.oid.empty()) {
+ ldpp_dout(dpp, 0) << "ERROR: obj.oid is empty" << dendl;
+ return -EINVAL;
+ }
+
+ *pobj = rados_svc->obj(obj);
+ int r = pobj->open(dpp);
+ if (r < 0) {
+ return r;
+ }
+
+ return 0;
+}
+
+// Uncached stat: fetch xattrs and (optionally) size/mtime in a single
+// compound read op, honoring the version tracker's read preconditions.
+int RGWSI_SysObj_Core::raw_stat(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj,
+ uint64_t *psize, real_time *pmtime,
+ map<string, bufferlist> *attrs,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y)
+{
+ RGWSI_RADOS::Obj rados_obj;
+ int r = get_rados_obj(dpp, zone_svc, obj, &rados_obj);
+ if (r < 0) {
+ return r;
+ }
+
+ uint64_t size = 0;
+ struct timespec mtime_ts;
+
+ librados::ObjectReadOperation op;
+ if (objv_tracker) {
+ objv_tracker->prepare_op_for_read(&op);
+ }
+ op.getxattrs(attrs, nullptr);
+ if (psize || pmtime) {
+ op.stat2(&size, &mtime_ts, nullptr);
+ }
+ bufferlist outbl;
+ r = rados_obj.operate(dpp, &op, &outbl, y);
+ if (r < 0)
+ return r;
+
+ if (psize)
+ *psize = size;
+ if (pmtime)
+ *pmtime = ceph::real_clock::from_timespec(mtime_ts);
+
+ return 0;
+}
+
+// Stat wrapper over raw_stat that optionally filters xattrs down to the
+// user-visible RGW_ATTR_PREFIX set and copies out size/mtime.
+int RGWSI_SysObj_Core::stat(RGWSI_SysObj_Obj_GetObjState& _state,
+ const rgw_raw_obj& obj,
+ map<string, bufferlist> *attrs,
+ bool raw_attrs,
+ real_time *lastmod,
+ uint64_t *obj_size,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y,
+ const DoutPrefixProvider *dpp)
+{
+ uint64_t size = 0;
+ ceph::real_time mtime;
+ std::map<std::string, bufferlist> attrset;
+
+ int r = raw_stat(dpp, obj, &size, &mtime, &attrset, objv_tracker, y);
+ if (r < 0)
+ return r;
+
+ if (attrs) {
+ if (raw_attrs) {
+ *attrs = std::move(attrset);
+ } else {
+ rgw_filter_attrset(attrset, RGW_ATTR_PREFIX, attrs);
+ }
+ // log each xattr only when debug level 20 is actually enabled
+ if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
+ map<string, bufferlist>::iterator iter;
+ for (iter = attrs->begin(); iter != attrs->end(); ++iter) {
+ ldpp_dout(dpp, 20) << "Read xattr: " << iter->first << dendl;
+ }
+ }
+ }
+
+ if (obj_size)
+ *obj_size = size;
+ if (lastmod)
+ *lastmod = mtime;
+
+ return 0;
+}
+
+// Uncached read: fetch [ofs, end] (inclusive; end < 0 means len 0,
+// which librados treats as read-to-end), optionally with stat and
+// xattrs, in one compound op. Detects a concurrent writer by comparing
+// the op version against the version seen by a previous read on the
+// same state and returns -ECANCELED on mismatch.
+// Returns the number of bytes read on success.
+int RGWSI_SysObj_Core::read(const DoutPrefixProvider *dpp,
+ RGWSI_SysObj_Obj_GetObjState& _read_state,
+ RGWObjVersionTracker *objv_tracker,
+ const rgw_raw_obj& obj,
+ bufferlist *bl, off_t ofs, off_t end,
+ ceph::real_time* pmtime, uint64_t* psize,
+ map<string, bufferlist> *attrs,
+ bool raw_attrs,
+ rgw_cache_entry_info *cache_info,
+ boost::optional<obj_version>,
+ optional_yield y)
+{
+ auto& read_state = static_cast<GetObjState&>(_read_state);
+
+ uint64_t len;
+ struct timespec mtime_ts;
+ librados::ObjectReadOperation op;
+
+ if (end < 0)
+ len = 0;
+ else
+ len = end - ofs + 1;
+
+ if (objv_tracker) {
+ objv_tracker->prepare_op_for_read(&op);
+ }
+ if (psize || pmtime) {
+ op.stat2(psize, &mtime_ts, nullptr);
+ }
+
+ ldpp_dout(dpp, 20) << "rados->read ofs=" << ofs << " len=" << len << dendl;
+ op.read(ofs, len, bl, nullptr);
+
+ map<string, bufferlist> unfiltered_attrset;
+
+ if (attrs) {
+ if (raw_attrs) {
+ op.getxattrs(attrs, nullptr);
+ } else {
+ op.getxattrs(&unfiltered_attrset, nullptr);
+ }
+ }
+
+ RGWSI_RADOS::Obj rados_obj;
+ int r = get_rados_obj(dpp, zone_svc, obj, &rados_obj);
+ if (r < 0) {
+ ldpp_dout(dpp, 20) << "get_rados_obj() on obj=" << obj << " returned " << r << dendl;
+ return r;
+ }
+ r = rados_obj.operate(dpp, &op, nullptr, y);
+ if (r < 0) {
+ ldpp_dout(dpp, 20) << "rados_obj.operate() r=" << r << " bl.length=" << bl->length() << dendl;
+ return r;
+ }
+ ldpp_dout(dpp, 20) << "rados_obj.operate() r=" << r << " bl.length=" << bl->length() << dendl;
+
+ uint64_t op_ver = rados_obj.get_last_version();
+
+ // a version change between reads on the same state means we raced a writer
+ if (read_state.last_ver > 0 &&
+ read_state.last_ver != op_ver) {
+ ldpp_dout(dpp, 5) << "raced with an object write, abort" << dendl;
+ return -ECANCELED;
+ }
+
+ if (pmtime) {
+ *pmtime = ceph::real_clock::from_timespec(mtime_ts);
+ }
+ if (attrs && !raw_attrs) {
+ rgw_filter_attrset(unfiltered_attrset, RGW_ATTR_PREFIX, attrs);
+ }
+
+ read_state.last_ver = op_ver;
+
+ return bl->length();
+}
+
+/**
+ * Get an attribute for a system object.
+ * obj: the object to get attr
+ * name: name of the attr to retrieve
+ * dest: bufferlist to store the result in
+ * Returns: 0 on success, -ERR# otherwise.
+ */
+int RGWSI_SysObj_Core::get_attr(const DoutPrefixProvider *dpp,
+ const rgw_raw_obj& obj,
+ const char *name,
+ bufferlist *dest,
+ optional_yield y)
+{
+ RGWSI_RADOS::Obj rados_obj;
+ int r = get_rados_obj(dpp, zone_svc, obj, &rados_obj);
+ if (r < 0) {
+ ldpp_dout(dpp, 20) << "get_rados_obj() on obj=" << obj << " returned " << r << dendl;
+ return r;
+ }
+
+ librados::ObjectReadOperation op;
+
+ // rval receives the getxattr result; the overall op result is what we
+ // propagate
+ int rval;
+ op.getxattr(name, dest, &rval);
+
+ r = rados_obj.operate(dpp, &op, nullptr, y);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+// Apply xattr changes to a system object in one write op: remove every
+// key in rmattrs, set every non-empty value in attrs. With exclusive
+// set, the op also requires creating the object. Empty-value attrs are
+// skipped; if nothing remains to do the call is a no-op returning 0.
+// On success the version tracker (if any) is advanced.
+// Fix: dropped an unused local 'bufferlist bl;' before the operate call.
+int RGWSI_SysObj_Core::set_attrs(const DoutPrefixProvider *dpp,
+ const rgw_raw_obj& obj,
+ map<string, bufferlist>& attrs,
+ map<string, bufferlist> *rmattrs,
+ RGWObjVersionTracker *objv_tracker,
+ bool exclusive, optional_yield y)
+{
+ RGWSI_RADOS::Obj rados_obj;
+ int r = get_rados_obj(dpp, zone_svc, obj, &rados_obj);
+ if (r < 0) {
+ ldpp_dout(dpp, 20) << "get_rados_obj() on obj=" << obj << " returned " << r << dendl;
+ return r;
+ }
+
+ librados::ObjectWriteOperation op;
+
+ if (exclusive) {
+ op.create(true); // exclusive create
+ }
+ if (objv_tracker) {
+ objv_tracker->prepare_op_for_write(&op);
+ }
+
+ map<string, bufferlist>::iterator iter;
+ if (rmattrs) {
+ for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
+ const string& name = iter->first;
+ op.rmxattr(name.c_str());
+ }
+ }
+
+ for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
+ const string& name = iter->first;
+ bufferlist& bl = iter->second;
+
+ // empty values are not written
+ if (!bl.length())
+ continue;
+
+ op.setxattr(name.c_str(), bl);
+ }
+
+ // nothing queued (all attrs empty, no rmattrs, no create/versioning)
+ if (!op.size())
+ return 0;
+
+ r = rados_obj.operate(dpp, &op, y);
+ if (r < 0)
+ return r;
+
+ if (objv_tracker) {
+ objv_tracker->apply_write();
+ }
+ return 0;
+}
+
+// Page through omap key/value pairs starting after 'marker', fetching
+// up to 'count' entries in total into *m. *pmore reports whether more
+// entries remain after the last page fetched.
+int RGWSI_SysObj_Core::omap_get_vals(const DoutPrefixProvider *dpp,
+ const rgw_raw_obj& obj,
+ const string& marker,
+ uint64_t count,
+ std::map<string, bufferlist> *m,
+ bool *pmore,
+ optional_yield y)
+{
+ RGWSI_RADOS::Obj rados_obj;
+ int r = get_rados_obj(dpp, zone_svc, obj, &rados_obj);
+ if (r < 0) {
+ ldpp_dout(dpp, 20) << "get_rados_obj() on obj=" << obj << " returned " << r << dendl;
+ return r;
+ }
+
+ string start_after = marker;
+ bool more;
+
+ do {
+ librados::ObjectReadOperation op;
+
+ std::map<string, bufferlist> t;
+ int rval;
+ op.omap_get_vals2(start_after, count, &t, &more, &rval);
+
+ r = rados_obj.operate(dpp, &op, nullptr, y);
+ if (r < 0) {
+ return r;
+ }
+ if (t.empty()) {
+ break;
+ }
+ // advance the cursor past the last key returned in this page
+ count -= t.size();
+ start_after = t.rbegin()->first;
+ m->insert(t.begin(), t.end());
+ } while (more && count > 0);
+
+ if (pmore) {
+ *pmore = more;
+ }
+ return 0;
+}
+
+// Fetch ALL omap key/value pairs of the object into *m, paging in
+// chunks of MAX_OMAP_GET_ENTRIES.
+int RGWSI_SysObj_Core::omap_get_all(const DoutPrefixProvider *dpp,
+ const rgw_raw_obj& obj,
+ std::map<string, bufferlist> *m,
+ optional_yield y)
+{
+ RGWSI_RADOS::Obj rados_obj;
+ int r = get_rados_obj(dpp, zone_svc, obj, &rados_obj);
+ if (r < 0) {
+ ldpp_dout(dpp, 20) << "get_rados_obj() on obj=" << obj << " returned " << r << dendl;
+ return r;
+ }
+
+#define MAX_OMAP_GET_ENTRIES 1024
+ const int count = MAX_OMAP_GET_ENTRIES;
+ string start_after;
+ bool more;
+
+ do {
+ librados::ObjectReadOperation op;
+
+ std::map<string, bufferlist> t;
+ int rval;
+ op.omap_get_vals2(start_after, count, &t, &more, &rval);
+
+ r = rados_obj.operate(dpp, &op, nullptr, y);
+ if (r < 0) {
+ return r;
+ }
+ if (t.empty()) {
+ break;
+ }
+ start_after = t.rbegin()->first;
+ m->insert(t.begin(), t.end());
+ } while (more);
+ return 0;
+}
+
+int RGWSI_SysObj_Core::omap_set(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, const std::string& key,
+                                bufferlist& bl, bool must_exist,
+                                optional_yield y)
+{
+  // Store a single omap key/value pair on `obj`; with must_exist, the write
+  // is guarded by an assert_exists so it fails if the object is absent.
+  RGWSI_RADOS::Obj rados_obj;
+  int r = get_rados_obj(dpp, zone_svc, obj, &rados_obj);
+  if (r < 0) {
+    ldpp_dout(dpp, 20) << "get_rados_obj() on obj=" << obj << " returned " << r << dendl;
+    return r;
+  }
+
+  ldpp_dout(dpp, 15) << "omap_set obj=" << obj << " key=" << key << dendl;
+
+  // Single-entry batch: reuse the bulk omap_set primitive.
+  map<string, bufferlist> entries;
+  entries[key] = bl;
+
+  librados::ObjectWriteOperation op;
+  if (must_exist) {
+    op.assert_exists();
+  }
+  op.omap_set(entries);
+  return rados_obj.operate(dpp, &op, y);
+}
+
+int RGWSI_SysObj_Core::omap_set(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj,
+                                const std::map<std::string, bufferlist>& m,
+                                bool must_exist, optional_yield y)
+{
+  // Store a batch of omap key/value pairs on `obj`; with must_exist, the
+  // write fails if the object does not already exist.
+  RGWSI_RADOS::Obj rados_obj;
+  int r = get_rados_obj(dpp, zone_svc, obj, &rados_obj);
+  if (r < 0) {
+    ldpp_dout(dpp, 20) << "get_rados_obj() on obj=" << obj << " returned " << r << dendl;
+    return r;
+  }
+
+  librados::ObjectWriteOperation op;
+  if (must_exist) {
+    op.assert_exists();
+  }
+  op.omap_set(m);
+  return rados_obj.operate(dpp, &op, y);
+}
+
+int RGWSI_SysObj_Core::omap_del(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, const std::string& key,
+                                optional_yield y)
+{
+  // Remove one omap key from `obj`.
+  RGWSI_RADOS::Obj rados_obj;
+  int r = get_rados_obj(dpp, zone_svc, obj, &rados_obj);
+  if (r < 0) {
+    ldpp_dout(dpp, 20) << "get_rados_obj() on obj=" << obj << " returned " << r << dendl;
+    return r;
+  }
+
+  // omap_rm_keys takes a set; build a single-element one.
+  set<string> keys{key};
+
+  librados::ObjectWriteOperation op;
+  op.omap_rm_keys(keys);
+  return rados_obj.operate(dpp, &op, y);
+}
+
+int RGWSI_SysObj_Core::notify(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, bufferlist& bl,
+                              uint64_t timeout_ms, bufferlist *pbl,
+                              optional_yield y)
+{
+  // Send a watch/notify payload to `obj`; replies (if requested) land in *pbl.
+  RGWSI_RADOS::Obj rados_obj;
+  int r = get_rados_obj(dpp, zone_svc, obj, &rados_obj);
+  if (r < 0) {
+    ldpp_dout(dpp, 20) << "get_rados_obj() on obj=" << obj << " returned " << r << dendl;
+    return r;
+  }
+
+  return rados_obj.notify(dpp, bl, timeout_ms, pbl, y);
+}
+
+int RGWSI_SysObj_Core::remove(const DoutPrefixProvider *dpp,
+                              RGWObjVersionTracker *objv_tracker,
+                              const rgw_raw_obj& obj,
+                              optional_yield y)
+{
+  // Delete `obj`, optionally guarded by the caller's version tracker so a
+  // concurrent writer causes the removal to fail rather than race.
+  RGWSI_RADOS::Obj rados_obj;
+  int r = get_rados_obj(dpp, zone_svc, obj, &rados_obj);
+  if (r < 0) {
+    ldpp_dout(dpp, 20) << "get_rados_obj() on obj=" << obj << " returned " << r << dendl;
+    return r;
+  }
+
+  librados::ObjectWriteOperation op;
+  if (objv_tracker) {
+    objv_tracker->prepare_op_for_write(&op); // version guard
+  }
+  op.remove();
+
+  r = rados_obj.operate(dpp, &op, y);
+  return (r < 0) ? r : 0;
+}
+
+int RGWSI_SysObj_Core::write(const DoutPrefixProvider *dpp,
+                             const rgw_raw_obj& obj,
+                             real_time *pmtime,
+                             map<std::string, bufferlist>& attrs,
+                             bool exclusive,
+                             const bufferlist& data,
+                             RGWObjVersionTracker *objv_tracker,
+                             real_time set_mtime,
+                             optional_yield y)
+{
+  // Write a system object in one compound operation: (re)create it, set its
+  // mtime, write the full data, and apply all non-empty xattrs.  On success
+  // the effective mtime is reported through *pmtime (if non-null) and the
+  // version tracker (if any) is advanced.
+  RGWSI_RADOS::Obj rados_obj;
+  int r = get_rados_obj(dpp, zone_svc, obj, &rados_obj);
+  if (r < 0) {
+    ldpp_dout(dpp, 20) << "get_rados_obj() on obj=" << obj << " returned " << r << dendl;
+    return r;
+  }
+
+  librados::ObjectWriteOperation op;
+
+  if (exclusive) {
+    op.create(true); // exclusive create: fail if the object already exists
+  } else {
+    // Drop any existing object first -- FAILOK makes the remove a no-op when
+    // it doesn't exist -- then recreate it, so stale xattrs/omap entries
+    // from a previous incarnation don't survive this write.
+    op.remove();
+    op.set_op_flags2(LIBRADOS_OP_FLAG_FAILOK);
+    op.create(false);
+  }
+
+  if (objv_tracker) {
+    objv_tracker->prepare_op_for_write(&op);
+  }
+
+  if (real_clock::is_zero(set_mtime)) {
+    set_mtime = real_clock::now();
+  }
+
+  struct timespec mtime_ts = real_clock::to_timespec(set_mtime);
+  op.mtime2(&mtime_ts);
+  op.write_full(data);
+
+  // Zero-length attrs are skipped rather than written as empty xattrs.
+  for (auto& [name, bl] : attrs) {
+    if (!bl.length())
+      continue;
+    op.setxattr(name.c_str(), bl);
+  }
+
+  r = rados_obj.operate(dpp, &op, y);
+  if (r < 0) {
+    return r;
+  }
+
+  if (objv_tracker) {
+    objv_tracker->apply_write();
+  }
+
+  if (pmtime) {
+    *pmtime = set_mtime;
+  }
+
+  return 0;
+}
+
+
+int RGWSI_SysObj_Core::write_data(const DoutPrefixProvider *dpp,
+                                  const rgw_raw_obj& obj,
+                                  const bufferlist& bl,
+                                  bool exclusive,
+                                  RGWObjVersionTracker *objv_tracker,
+                                  optional_yield y)
+{
+  // Overwrite the full data of `obj` (without touching attrs), optionally
+  // requiring exclusive creation and/or a version-tracker guard.
+  RGWSI_RADOS::Obj rados_obj;
+  int r = get_rados_obj(dpp, zone_svc, obj, &rados_obj);
+  if (r < 0) {
+    ldpp_dout(dpp, 20) << "get_rados_obj() on obj=" << obj << " returned " << r << dendl;
+    return r;
+  }
+
+  librados::ObjectWriteOperation op;
+  if (exclusive) {
+    op.create(true); // fail if the object already exists
+  }
+  if (objv_tracker) {
+    objv_tracker->prepare_op_for_write(&op);
+  }
+  op.write_full(bl);
+
+  r = rados_obj.operate(dpp, &op, y);
+  if (r < 0) {
+    return r;
+  }
+
+  if (objv_tracker) {
+    objv_tracker->apply_write(); // bump the in-memory version on success
+  }
+  return 0;
+}
+
+int RGWSI_SysObj_Core::pool_list_prefixed_objs(const DoutPrefixProvider *dpp,
+                                               const rgw_pool& pool, const string& prefix,
+                                               std::function<void(const string&)> cb)
+{
+  // Iterate over every object in `pool` whose name starts with `prefix`,
+  // invoking `cb` with the name with the prefix stripped off.
+  bool is_truncated;
+
+  auto rados_pool = rados_svc->pool(pool);
+
+  auto op = rados_pool.op();
+
+  RGWAccessListFilterPrefix filter(prefix);
+
+  int r = op.init(dpp, string(), &filter);
+  if (r < 0) {
+    return r;
+  }
+
+  do {
+    vector<string> oids;
+#define MAX_OBJS_DEFAULT 1000
+    // Reuse the outer `r` (the original declared a shadowing inner `int r`).
+    r = op.get_next(dpp, MAX_OBJS_DEFAULT, &oids, &is_truncated);
+    if (r < 0) {
+      return r;
+    }
+    for (auto& val : oids) {
+      // Only report names strictly longer than the prefix; an object named
+      // exactly `prefix` would strip down to an empty string.
+      if (val.size() > prefix.size()) {
+        cb(val.substr(prefix.size()));
+      }
+    }
+  } while (is_truncated);
+
+  return 0;
+}
+
+int RGWSI_SysObj_Core::pool_list_objects_init(const DoutPrefixProvider *dpp,
+                                              const rgw_pool& pool,
+                                              const string& marker,
+                                              const string& prefix,
+                                              RGWSI_SysObj::Pool::ListCtx *_ctx)
+{
+  // Construct the listing state in-place inside the caller-provided context.
+  _ctx->impl.emplace<PoolListImplInfo>(prefix);
+  auto& state = static_cast<PoolListImplInfo&>(*_ctx->impl);
+
+  state.pool = rados_svc->pool(pool);
+  state.op = state.pool.op();
+
+  int ret = state.op.init(dpp, marker, &state.filter);
+  if (ret < 0) {
+    ldpp_dout(dpp, 10) << "failed to list objects pool_iterate_begin() returned r=" << ret << dendl;
+    return ret;
+  }
+  return 0;
+}
+
+int RGWSI_SysObj_Core::pool_list_objects_next(const DoutPrefixProvider *dpp,
+                                              RGWSI_SysObj::Pool::ListCtx& _ctx,
+                                              int max,
+                                              vector<string> *oids,
+                                              bool *is_truncated)
+{
+  // Fetch the next batch (up to `max`) of object names from an initialized
+  // listing context; returns the number of names produced, or a negative
+  // error code.
+  if (!_ctx.impl) {
+    return -EINVAL; // pool_list_objects_init() was never called
+  }
+  auto& state = static_cast<PoolListImplInfo&>(*_ctx.impl);
+
+  int ret = state.op.get_next(dpp, max, oids, is_truncated);
+  if (ret >= 0) {
+    return oids->size();
+  }
+  if (ret != -ENOENT) {
+    ldpp_dout(dpp, 10) << "failed to list objects pool_iterate returned r=" << ret << dendl;
+  }
+  return ret;
+}
+
+int RGWSI_SysObj_Core::pool_list_objects_get_marker(RGWSI_SysObj::Pool::ListCtx& _ctx,
+                                                    string *marker)
+{
+  // Report the current pagination marker of an initialized listing context.
+  if (!_ctx.impl) {
+    return -EINVAL;
+  }
+  return static_cast<PoolListImplInfo&>(*_ctx.impl).op.get_marker(marker);
+}
diff --git a/src/rgw/services/svc_sys_obj_core.h b/src/rgw/services/svc_sys_obj_core.h
new file mode 100644
index 000000000..d02a37eee
--- /dev/null
+++ b/src/rgw/services/svc_sys_obj_core.h
@@ -0,0 +1,145 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_service.h"
+
+#include "svc_rados.h"
+#include "svc_sys_obj.h"
+#include "svc_sys_obj_core_types.h"
+
+
+class RGWSI_Zone;
+
+struct rgw_cache_entry_info;
+
+// Core (RADOS-backed) implementation of the system-object service: raw
+// read/write/remove, xattr and omap access, notify, and pool listing for
+// RGW's internal metadata objects.  Methods are virtual so that caching
+// layers (e.g. RGWSI_SysObj_Cache) can override them.
+class RGWSI_SysObj_Core : public RGWServiceInstance
+{
+  friend class RGWServices_Def;
+  friend class RGWSI_SysObj;
+
+protected:
+  RGWSI_RADOS *rados_svc{nullptr};
+  RGWSI_Zone *zone_svc{nullptr};
+
+  using GetObjState = RGWSI_SysObj_Core_GetObjState;
+  using PoolListImplInfo = RGWSI_SysObj_Core_PoolListImplInfo;
+
+  // Wire up the service dependencies; called during service assembly.
+  void core_init(RGWSI_RADOS *_rados_svc,
+                 RGWSI_Zone *_zone_svc) {
+    rados_svc = _rados_svc;
+    zone_svc = _zone_svc;
+  }
+  // Resolve a raw obj reference into a librados object handle.
+  int get_rados_obj(const DoutPrefixProvider *dpp, RGWSI_Zone *zone_svc, const rgw_raw_obj& obj, RGWSI_RADOS::Obj *pobj);
+
+  // Stat the object (size/mtime/attrs), honoring the version tracker.
+  virtual int raw_stat(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj,
+                       uint64_t *psize, real_time *pmtime,
+                       std::map<std::string, bufferlist> *attrs,
+                       RGWObjVersionTracker *objv_tracker,
+                       optional_yield y);
+
+  // Read object data in [ofs, end] plus (optionally) attrs and metadata.
+  virtual int read(const DoutPrefixProvider *dpp,
+                   RGWSI_SysObj_Obj_GetObjState& read_state,
+                   RGWObjVersionTracker *objv_tracker,
+                   const rgw_raw_obj& obj,
+                   bufferlist *bl, off_t ofs, off_t end,
+                   ceph::real_time* pmtime, uint64_t* psize,
+                   std::map<std::string, bufferlist> *attrs,
+                   bool raw_attrs,
+                   rgw_cache_entry_info *cache_info,
+                   boost::optional<obj_version>,
+                   optional_yield y);
+
+  virtual int remove(const DoutPrefixProvider *dpp,
+                     RGWObjVersionTracker *objv_tracker,
+                     const rgw_raw_obj& obj,
+                     optional_yield y);
+
+  // Full rewrite of the object: data, mtime and attrs in one operation.
+  virtual int write(const DoutPrefixProvider *dpp,
+                    const rgw_raw_obj& obj,
+                    real_time *pmtime,
+                    std::map<std::string, bufferlist>& attrs,
+                    bool exclusive,
+                    const bufferlist& data,
+                    RGWObjVersionTracker *objv_tracker,
+                    real_time set_mtime,
+                    optional_yield y);
+
+  // Overwrite only the object data (attrs untouched).
+  virtual int write_data(const DoutPrefixProvider *dpp,
+                         const rgw_raw_obj& obj,
+                         const bufferlist& bl,
+                         bool exclusive,
+                         RGWObjVersionTracker *objv_tracker,
+                         optional_yield y);
+
+  virtual int get_attr(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj,
+                       const char *name, bufferlist *dest,
+                       optional_yield y);
+
+  // Set attrs (and remove `rmattrs`, if given) on the object.
+  virtual int set_attrs(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj,
+                        std::map<std::string, bufferlist>& attrs,
+                        std::map<std::string, bufferlist> *rmattrs,
+                        RGWObjVersionTracker *objv_tracker,
+                        bool exclusive, optional_yield y);
+
+  // omap accessors: whole-map read, paged read, single/batch set, delete.
+  virtual int omap_get_all(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, std::map<std::string, bufferlist> *m,
+                           optional_yield y);
+  virtual int omap_get_vals(const DoutPrefixProvider *dpp,
+                            const rgw_raw_obj& obj,
+                            const std::string& marker,
+                            uint64_t count,
+                            std::map<std::string, bufferlist> *m,
+                            bool *pmore,
+                            optional_yield y);
+  virtual int omap_set(const DoutPrefixProvider *dpp,
+                       const rgw_raw_obj& obj, const std::string& key,
+                       bufferlist& bl, bool must_exist,
+                       optional_yield y);
+  virtual int omap_set(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj,
+                       const std::map<std::string, bufferlist>& m, bool must_exist,
+                       optional_yield y);
+  virtual int omap_del(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, const std::string& key,
+                       optional_yield y);
+
+  // watch/notify broadcast on the object.
+  virtual int notify(const DoutPrefixProvider *dpp,
+                     const rgw_raw_obj& obj, bufferlist& bl,
+                     uint64_t timeout_ms, bufferlist *pbl,
+                     optional_yield y);
+
+  // Enumerate objects whose names share `prefix`; `cb` gets the suffix.
+  virtual int pool_list_prefixed_objs(const DoutPrefixProvider *dpp,
+                                      const rgw_pool& pool,
+                                      const std::string& prefix,
+                                      std::function<void(const std::string&)> cb);
+
+  // Cursor-style pool listing: init, then repeated next, marker on demand.
+  virtual int pool_list_objects_init(const DoutPrefixProvider *dpp,
+                                     const rgw_pool& pool,
+                                     const std::string& marker,
+                                     const std::string& prefix,
+                                     RGWSI_SysObj::Pool::ListCtx *ctx);
+  virtual int pool_list_objects_next(const DoutPrefixProvider *dpp,
+                                     RGWSI_SysObj::Pool::ListCtx& ctx,
+                                     int max,
+                                     std::vector<std::string> *oids,
+                                     bool *is_truncated);
+
+  virtual int pool_list_objects_get_marker(RGWSI_SysObj::Pool::ListCtx& _ctx,
+                                           std::string *marker);
+
+  int stat(RGWSI_SysObj_Obj_GetObjState& state,
+           const rgw_raw_obj& obj,
+           std::map<std::string, bufferlist> *attrs,
+           bool raw_attrs,
+           real_time *lastmod,
+           uint64_t *obj_size,
+           RGWObjVersionTracker *objv_tracker,
+           optional_yield y,
+           const DoutPrefixProvider *dpp);
+
+public:
+  RGWSI_SysObj_Core(CephContext *cct): RGWServiceInstance(cct) {}
+
+  RGWSI_Zone *get_zone_svc() {
+    return zone_svc;
+  }
+};
diff --git a/src/rgw/services/svc_sys_obj_core_types.h b/src/rgw/services/svc_sys_obj_core_types.h
new file mode 100644
index 000000000..74f489d91
--- /dev/null
+++ b/src/rgw/services/svc_sys_obj_core_types.h
@@ -0,0 +1,34 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+
+#include "rgw_service.h"
+
+#include "svc_rados.h"
+#include "svc_sys_obj_types.h"
+
+
+
+// Per-read state for the core sysobj backend; caches the resolved rados
+// object handle across successive read calls.
+struct RGWSI_SysObj_Core_GetObjState : public RGWSI_SysObj_Obj_GetObjState {
+  RGWSI_RADOS::Obj rados_obj;   // cached rados handle, valid when has_rados_obj
+  bool has_rados_obj{false};    // whether rados_obj has been resolved yet
+  uint64_t last_ver{0};         // presumably the last object version observed -- confirm at call sites
+
+  RGWSI_SysObj_Core_GetObjState() {}
+
+  // Resolve (and cache) the rados handle for `obj`, returning a pointer to
+  // the cached handle through *pobj.
+  int get_rados_obj(const DoutPrefixProvider *dpp,
+                    RGWSI_RADOS *rados_svc,
+                    RGWSI_Zone *zone_svc,
+                    const rgw_raw_obj& obj,
+                    RGWSI_RADOS::Obj **pobj);
+};
+
+// Listing state held inside RGWSI_SysObj::Pool::ListCtx: the pool handle,
+// the iteration op, and the prefix filter applied to each name.
+struct RGWSI_SysObj_Core_PoolListImplInfo : public RGWSI_SysObj_Pool_ListInfo {
+  RGWSI_RADOS::Pool pool;
+  RGWSI_RADOS::Pool::List op;
+  RGWAccessListFilterPrefix filter;
+
+  // NOTE(review): `op` is initialized from the default-constructed `pool`
+  // here; pool_list_objects_init() re-assigns both before use.
+  RGWSI_SysObj_Core_PoolListImplInfo(const std::string& prefix) : op(pool.op()), filter(prefix) {}
+};
diff --git a/src/rgw/services/svc_sys_obj_types.h b/src/rgw/services/svc_sys_obj_types.h
new file mode 100644
index 000000000..b5bc2d40d
--- /dev/null
+++ b/src/rgw/services/svc_sys_obj_types.h
@@ -0,0 +1,15 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+
+#pragma once
+
+
+#include "rgw_service.h"
+
+
+// Empty base/marker types: concrete sysobj backends derive from these to
+// attach their own per-read and per-listing state.
+struct RGWSI_SysObj_Obj_GetObjState {
+};
+
+struct RGWSI_SysObj_Pool_ListInfo {
+};
diff --git a/src/rgw/services/svc_tier_rados.cc b/src/rgw/services/svc_tier_rados.cc
new file mode 100644
index 000000000..ca87e8ace
--- /dev/null
+++ b/src/rgw/services/svc_tier_rados.cc
@@ -0,0 +1,36 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "svc_tier_rados.h"
+
+using namespace std;
+
+const std::string MP_META_SUFFIX = ".meta";
+
+MultipartMetaFilter::~MultipartMetaFilter() {}
+
+bool MultipartMetaFilter::filter(const string& name, string& key) {
+  // Length of ".meta" computed once.
+  static const size_t suffix_len = MP_META_SUFFIX.length();
+
+  const size_t len = name.size();
+
+  // The name must hold the suffix plus at least one preceding character.
+  if (len <= suffix_len)
+    return false;
+
+  // The suffix can only match at the very end of the name.
+  if (name.find(MP_META_SUFFIX, len - suffix_len) == string::npos)
+    return false;
+
+  // Strip ".<upload_id>.meta": locate the '.' that precedes the upload id.
+  size_t dot = name.rfind('.', len - suffix_len - 1);
+  if (dot == string::npos)
+    return false;
+
+  key = name.substr(0, dot);
+
+  return true;
+}
+
+
+
+
diff --git a/src/rgw/services/svc_tier_rados.h b/src/rgw/services/svc_tier_rados.h
new file mode 100644
index 000000000..a2036b933
--- /dev/null
+++ b/src/rgw/services/svc_tier_rados.h
@@ -0,0 +1,154 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#pragma once
+
+#include <iomanip>
+
+#include "rgw_service.h"
+
+#include "svc_rados.h"
+
+extern const std::string MP_META_SUFFIX;
+
+// Naming helper for multipart uploads: given an object key and upload id it
+// derives the meta object name ("<key>.<upload_id>.meta") and per-part
+// object names ("<key>.<unique>.<part#>").
+class RGWMPObj {
+  std::string oid;        // the target object key
+  std::string prefix;     // "<oid>.<part_unique_str>" -- base for part names
+  std::string meta;       // "<oid>.<upload_id>.meta"
+  std::string upload_id;
+public:
+  RGWMPObj() {}
+  RGWMPObj(const std::string& _oid, const std::string& _upload_id) {
+    init(_oid, _upload_id, _upload_id);
+  }
+  // Without an upload id, parse all fields back out of a meta object name.
+  RGWMPObj(const std::string& _oid, std::optional<std::string> _upload_id) {
+    if (_upload_id) {
+      init(_oid, *_upload_id, *_upload_id);
+    } else {
+      from_meta(_oid);
+    }
+  }
+  void init(const std::string& _oid, const std::string& _upload_id) {
+    init(_oid, _upload_id, _upload_id);
+  }
+  void init(const std::string& _oid, const std::string& _upload_id, const std::string& part_unique_str) {
+    // An empty oid resets the object to its cleared state.
+    if (_oid.empty()) {
+      clear();
+      return;
+    }
+    oid = _oid;
+    upload_id = _upload_id;
+    prefix = oid + ".";
+    meta = prefix + upload_id + MP_META_SUFFIX;
+    prefix.append(part_unique_str);
+  }
+  const std::string& get_meta() const { return meta; }
+  // "<prefix>.<num>" -- name of a numbered part object.
+  std::string get_part(int num) const {
+    char buf[16];
+    snprintf(buf, 16, ".%d", num);
+    std::string s = prefix;
+    s.append(buf);
+    return s;
+  }
+  // "<prefix>.<part>" -- same, with the part label supplied as a string.
+  std::string get_part(const std::string& part) const {
+    std::string s = prefix;
+    s.append(".");
+    s.append(part);
+    return s;
+  }
+  const std::string& get_upload_id() const {
+    return upload_id;
+  }
+  const std::string& get_key() const {
+    return oid;
+  }
+  // Reverse of get_meta(): recover oid and upload_id from a meta name.
+  // NOTE(review): rfind() returns size_t npos; storing it into int relies on
+  // the conversion yielding a negative value -- confirm this is intended.
+  bool from_meta(const std::string& meta) {
+    int end_pos = meta.rfind('.'); // search for ".meta"
+    if (end_pos < 0)
+      return false;
+    int mid_pos = meta.rfind('.', end_pos - 1); // <key>.<upload_id>
+    if (mid_pos < 0)
+      return false;
+    oid = meta.substr(0, mid_pos);
+    upload_id = meta.substr(mid_pos + 1, end_pos - mid_pos - 1);
+    init(oid, upload_id, upload_id);
+    return true;
+  }
+  void clear() {
+    oid = "";
+    prefix = "";
+    meta = "";
+    upload_id = "";
+  }
+  friend std::ostream& operator<<(std::ostream& out, const RGWMPObj& obj) {
+    return out << "RGWMPObj:{ prefix=" << std::quoted(obj.prefix) <<
+      ", meta=" << std::quoted(obj.meta) << " }";
+  }
+}; // class RGWMPObj
+
+/**
+ * A filter to a) test whether an object name is a multipart meta
+ * object, and b) filter out just the key used to determine the bucket
+ * index shard.
+ *
+ * Objects for multipart meta have names adorned with an upload id and
+ * other elements -- specifically a ".", MULTIPART_UPLOAD_ID_PREFIX,
+ * unique id, and MP_META_SUFFIX. This filter will return true when
+ * the name provided is such. It will also extract the key used for
+ * bucket index shard calculation from the adorned name.
+ */
+class MultipartMetaFilter : public RGWAccessListFilter {
+public:
+  MultipartMetaFilter() {}
+
+  virtual ~MultipartMetaFilter() override;
+
+  /**
+   * @param name [in] The object name as it appears in the bucket index.
+   * @param key [out] An output parameter that will contain the bucket
+   * index key if this entry is in the form of a multipart meta object.
+   * @return true if the name provided is in the form of a multipart meta
+   * object (i.e. it ends with MP_META_SUFFIX, ".meta"), false otherwise
+   */
+  bool filter(const std::string& name, std::string& key) override;
+};
+
+// RADOS-backed tiering service; currently only carries the zone service
+// and a helper to map raw objects back to bucket objects.
+class RGWSI_Tier_RADOS : public RGWServiceInstance
+{
+  RGWSI_Zone *zone_svc{nullptr};
+
+public:
+  RGWSI_Tier_RADOS(CephContext *cct): RGWServiceInstance(cct) {}
+
+  void init(RGWSI_Zone *_zone_svc) {
+    zone_svc = _zone_svc;
+  }
+
+  // Split a raw oid of the form "<bucket marker>_<obj key>" back into an
+  // rgw_obj within `bucket`.  Returns false if the separator is missing or
+  // the key does not parse.
+  // NOTE(review): find() returns size_t npos; storing it into ssize_t relies
+  // on the conversion yielding a negative value -- confirm this is intended.
+  static inline bool raw_obj_to_obj(const rgw_bucket& bucket, const rgw_raw_obj& raw_obj, rgw_obj *obj) {
+    ssize_t pos = raw_obj.oid.find('_', bucket.marker.length());
+    if (pos < 0) {
+      return false;
+    }
+
+    if (!rgw_obj_key::parse_raw_oid(raw_obj.oid.substr(pos + 1), &obj->key)) {
+      return false;
+    }
+    obj->bucket = bucket;
+
+    return true;
+  }
+};
+
diff --git a/src/rgw/services/svc_user.cc b/src/rgw/services/svc_user.cc
new file mode 100644
index 000000000..9a07c207b
--- /dev/null
+++ b/src/rgw/services/svc_user.cc
@@ -0,0 +1,11 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+
+#include "svc_user.h"
+
+// Out-of-line ctor/dtor for the abstract user service (keeps the vtable
+// anchored in this translation unit).
+RGWSI_User::RGWSI_User(CephContext *cct): RGWServiceInstance(cct) {
+}
+
+RGWSI_User::~RGWSI_User() {
+}
diff --git a/src/rgw/services/svc_user.h b/src/rgw/services/svc_user.h
new file mode 100644
index 000000000..1cb459d31
--- /dev/null
+++ b/src/rgw/services/svc_user.h
@@ -0,0 +1,127 @@
+
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#pragma once
+
+#include "svc_meta_be.h"
+
+#include "rgw_service.h"
+
+class RGWUserBuckets;
+class RGWGetUserStats_CB;
+
+// Abstract user service interface: CRUD on user metadata, the secondary
+// indexes that map email / swift name / access key back to a user, the
+// per-user bucket list, and user storage stats.
+class RGWSI_User : public RGWServiceInstance
+{
+public:
+  RGWSI_User(CephContext *cct);
+  virtual ~RGWSI_User();
+
+  // Metadata key for a user is simply its string form (tenant-qualified).
+  static std::string get_meta_key(const rgw_user& user) {
+    return user.to_str();
+  }
+
+  static rgw_user user_from_meta_key(const std::string& key) {
+    return rgw_user(key);
+  }
+
+  virtual RGWSI_MetaBackend_Handler *get_be_handler() = 0;
+
+  /* base svc_user interfaces */
+
+  virtual int read_user_info(RGWSI_MetaBackend::Context *ctx,
+                             const rgw_user& user,
+                             RGWUserInfo *info,
+                             RGWObjVersionTracker * const objv_tracker,
+                             real_time * const pmtime,
+                             rgw_cache_entry_info * const cache_info,
+                             std::map<std::string, bufferlist> * const pattrs,
+                             optional_yield y,
+                             const DoutPrefixProvider *dpp) = 0;
+
+  virtual int store_user_info(RGWSI_MetaBackend::Context *ctx,
+                              const RGWUserInfo& info,
+                              RGWUserInfo *old_info,
+                              RGWObjVersionTracker *objv_tracker,
+                              const real_time& mtime,
+                              bool exclusive,
+                              std::map<std::string, bufferlist> *attrs,
+                              optional_yield y,
+                              const DoutPrefixProvider *dpp) = 0;
+
+  virtual int remove_user_info(RGWSI_MetaBackend::Context *ctx,
+                               const RGWUserInfo& info,
+                               RGWObjVersionTracker *objv_tracker,
+                               optional_yield y,
+                               const DoutPrefixProvider *dpp) = 0;
+
+  // Secondary-index lookups (email / swift name / access key -> user).
+  virtual int get_user_info_by_email(RGWSI_MetaBackend::Context *ctx,
+                                     const std::string& email, RGWUserInfo *info,
+                                     RGWObjVersionTracker *objv_tracker,
+                                     real_time *pmtime,
+                                     optional_yield y,
+                                     const DoutPrefixProvider *dpp) = 0;
+  virtual int get_user_info_by_swift(RGWSI_MetaBackend::Context *ctx,
+                                     const std::string& swift_name,
+                                     RGWUserInfo *info, /* out */
+                                     RGWObjVersionTracker * const objv_tracker,
+                                     real_time * const pmtime,
+                                     optional_yield y,
+                                     const DoutPrefixProvider *dpp) = 0;
+  virtual int get_user_info_by_access_key(RGWSI_MetaBackend::Context *ctx,
+                                          const std::string& access_key,
+                                          RGWUserInfo *info,
+                                          RGWObjVersionTracker* objv_tracker,
+                                          real_time *pmtime,
+                                          optional_yield y,
+                                          const DoutPrefixProvider *dpp) = 0;
+
+  // Per-user bucket list maintenance and enumeration.
+  virtual int add_bucket(const DoutPrefixProvider *dpp,
+                         const rgw_user& user,
+                         const rgw_bucket& bucket,
+                         ceph::real_time creation_time,
+                         optional_yield y) = 0;
+  virtual int remove_bucket(const DoutPrefixProvider *dpp,
+                            const rgw_user& user,
+                            const rgw_bucket& _bucket, optional_yield) = 0;
+  virtual int list_buckets(const DoutPrefixProvider *dpp,
+                           const rgw_user& user,
+                           const std::string& marker,
+                           const std::string& end_marker,
+                           uint64_t max,
+                           RGWUserBuckets *buckets,
+                           bool *is_truncated,
+                           optional_yield y) = 0;
+
+  // User storage-stats accounting.
+  virtual int flush_bucket_stats(const DoutPrefixProvider *dpp,
+                                 const rgw_user& user,
+                                 const RGWBucketEnt& ent, optional_yield y) = 0;
+  virtual int complete_flush_stats(const DoutPrefixProvider *dpp,
+				   const rgw_user& user, optional_yield y) = 0;
+  virtual int reset_bucket_stats(const DoutPrefixProvider *dpp,
+				 const rgw_user& user,
+                                 optional_yield y) = 0;
+  virtual int read_stats(const DoutPrefixProvider *dpp,
+                         RGWSI_MetaBackend::Context *ctx,
+			 const rgw_user& user, RGWStorageStats *stats,
+			 ceph::real_time *last_stats_sync,              /* last time a full stats sync completed */
+			 ceph::real_time *last_stats_update,
+                         optional_yield y) = 0;   /* last time a stats update was done */
+
+  virtual int read_stats_async(const DoutPrefixProvider *dpp,
+                               const rgw_user& user, RGWGetUserStats_CB *cb) = 0;
+};
+
diff --git a/src/rgw/services/svc_user_rados.cc b/src/rgw/services/svc_user_rados.cc
new file mode 100644
index 000000000..c99af9354
--- /dev/null
+++ b/src/rgw/services/svc_user_rados.cc
@@ -0,0 +1,968 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include <boost/algorithm/string.hpp>
+
+#include "svc_user.h"
+#include "svc_user_rados.h"
+#include "svc_zone.h"
+#include "svc_sys_obj.h"
+#include "svc_sys_obj_cache.h"
+#include "svc_meta.h"
+#include "svc_meta_be_sobj.h"
+#include "svc_sync_modules.h"
+
+#include "rgw_user.h"
+#include "rgw_bucket.h"
+#include "rgw_tools.h"
+#include "rgw_zone.h"
+#include "rgw_rados.h"
+
+#include "cls/user/cls_user_client.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+#define RGW_BUCKETS_OBJ_SUFFIX ".buckets"
+
+using namespace std;
+
+// Metadata-backend module for user entries: maps metadata keys to oids in
+// the user_uid pool and filters out the auxiliary ".buckets" objects.
+class RGWSI_User_Module : public RGWSI_MBSObj_Handler_Module {
+  RGWSI_User_RADOS::Svc& svc;
+
+  const string prefix;  // user oids carry no prefix (empty)
+public:
+  RGWSI_User_Module(RGWSI_User_RADOS::Svc& _svc) : RGWSI_MBSObj_Handler_Module("user"),
+                                                   svc(_svc) {}
+
+  void get_pool_and_oid(const string& key, rgw_pool *pool, string *oid) override {
+    if (pool) {
+      *pool = svc.zone->get_zone_params().user_uid_pool;
+    }
+    if (oid) {
+      *oid = key;  // metadata key is used verbatim as the oid
+    }
+  }
+
+  const string& get_oid_prefix() override {
+    return prefix;
+  }
+
+  bool is_valid_oid(const string& oid) override {
+    // filter out the user.buckets objects
+    return !boost::algorithm::ends_with(oid, RGW_BUCKETS_OBJ_SUFFIX);
+  }
+
+  // key <-> oid mapping is the identity for users.
+  string key_to_oid(const string& key) override {
+    return key;
+  }
+
+  string oid_to_key(const string& oid) override {
+    return oid;
+  }
+};
+
+// Out-of-line ctor/dtor (anchor the vtable in this translation unit).
+RGWSI_User_RADOS::RGWSI_User_RADOS(CephContext *cct): RGWSI_User(cct) {
+}
+
+RGWSI_User_RADOS::~RGWSI_User_RADOS() {
+}
+
+// Wire up all service dependencies into the shared Svc struct; called once
+// during service assembly, before do_start().
+void RGWSI_User_RADOS::init(RGWSI_RADOS *_rados_svc,
+                            RGWSI_Zone *_zone_svc, RGWSI_SysObj *_sysobj_svc,
+                            RGWSI_SysObj_Cache *_cache_svc, RGWSI_Meta *_meta_svc,
+                            RGWSI_MetaBackend *_meta_be_svc,
+                            RGWSI_SyncModules *_sync_modules_svc)
+{
+  svc.user = this;
+  svc.rados = _rados_svc;
+  svc.zone = _zone_svc;
+  svc.sysobj = _sysobj_svc;
+  svc.cache = _cache_svc;
+  svc.meta = _meta_svc;
+  svc.meta_be = _meta_be_svc;
+  svc.sync_modules = _sync_modules_svc;
+}
+
+// Service startup: create the chained user-info cache and register the
+// SObj metadata-backend handler/module for "user" entries.
+int RGWSI_User_RADOS::do_start(optional_yield, const DoutPrefixProvider *dpp)
+{
+  uinfo_cache.reset(new RGWChainedCacheImpl<user_info_cache_entry>);
+  uinfo_cache->init(svc.cache);
+
+  int r = svc.meta->create_be_handler(RGWSI_MetaBackend::Type::MDBE_SOBJ, &be_handler);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to create be handler: r=" << r << dendl;
+    return r;
+  }
+
+  RGWSI_MetaBackend_Handler_SObj *bh = static_cast<RGWSI_MetaBackend_Handler_SObj *>(be_handler);
+
+  // be_module owns the module; bh keeps a raw (non-owning) pointer to it.
+  auto module = new RGWSI_User_Module(svc);
+  be_module.reset(module);
+  bh->set_module(module);
+  return 0;
+}
+
+rgw_raw_obj RGWSI_User_RADOS::get_buckets_obj(const rgw_user& user) const
+{
+  // The per-user bucket list lives in the uid pool as "<uid>.buckets".
+  return rgw_raw_obj(svc.zone->get_zone_params().user_uid_pool,
+                     user.to_str() + RGW_BUCKETS_OBJ_SUFFIX);
+}
+
+int RGWSI_User_RADOS::read_user_info(RGWSI_MetaBackend::Context *ctx,
+                                     const rgw_user& user,
+                                     RGWUserInfo *info,
+                                     RGWObjVersionTracker * const objv_tracker,
+                                     real_time * const pmtime,
+                                     rgw_cache_entry_info * const cache_info,
+                                     map<string, bufferlist> * const pattrs,
+                                     optional_yield y,
+                                     const DoutPrefixProvider *dpp)
+{
+  // Fetch a user's metadata entry and decode it into *info.  The stored
+  // blob is an RGWUID header optionally followed by the RGWUserInfo body.
+  if(user.id == RGW_USER_ANON_ID) {
+    // The anonymous user has no stored record.
+    ldpp_dout(dpp, 20) << "RGWSI_User_RADOS::read_user_info(): anonymous user" << dendl;
+    return -ENOENT;
+  }
+  bufferlist bl;
+  RGWUID user_id;
+
+  RGWSI_MBSObj_GetParams params(&bl, pattrs, pmtime);
+  params.set_cache_info(cache_info);
+
+  int ret = svc.meta_be->get_entry(ctx, get_meta_key(user), params, objv_tracker, y, dpp);
+  if (ret < 0) {
+    return ret;
+  }
+
+  auto iter = bl.cbegin();
+  try {
+    decode(user_id, iter);
+    // A stored uid that disagrees with the requested one indicates index
+    // corruption: treat as an I/O error rather than returning wrong data.
+    if (user_id.user_id != user) {
+      ldpp_dout(dpp, -1)  << "ERROR: rgw_get_user_info_by_uid(): user id mismatch: " << user_id.user_id << " != " << user << dendl;
+      return -EIO;
+    }
+    if (!iter.end()) {
+      decode(*info, iter);
+    }
+  } catch (buffer::error& err) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to decode user info, caught buffer::error" << dendl;
+    return -EIO;
+  }
+
+  return 0;
+}
+
+class PutOperation
+{
+ RGWSI_User_RADOS::Svc& svc;
+ RGWSI_MetaBackend_SObj::Context_SObj *ctx;
+ RGWUID ui;
+ const RGWUserInfo& info;
+ RGWUserInfo *old_info;
+ RGWObjVersionTracker *objv_tracker;
+ const real_time& mtime;
+ bool exclusive;
+ map<string, bufferlist> *pattrs;
+ RGWObjVersionTracker ot;
+ string err_msg;
+ optional_yield y;
+
+ void set_err_msg(string msg) {
+ if (!err_msg.empty()) {
+ err_msg = std::move(msg);
+ }
+ }
+
+public:
+ PutOperation(RGWSI_User_RADOS::Svc& svc,
+ RGWSI_MetaBackend::Context *_ctx,
+ const RGWUserInfo& info,
+ RGWUserInfo *old_info,
+ RGWObjVersionTracker *objv_tracker,
+ const real_time& mtime,
+ bool exclusive,
+ map<string, bufferlist> *pattrs,
+ optional_yield y) :
+ svc(svc), info(info), old_info(old_info),
+ objv_tracker(objv_tracker), mtime(mtime),
+ exclusive(exclusive), pattrs(pattrs), y(y) {
+ ctx = static_cast<RGWSI_MetaBackend_SObj::Context_SObj *>(_ctx);
+ ui.user_id = info.user_id;
+ }
+
+ int prepare(const DoutPrefixProvider *dpp) {
+ if (objv_tracker) {
+ ot = *objv_tracker;
+ }
+
+ if (ot.write_version.tag.empty()) {
+ if (ot.read_version.tag.empty()) {
+ ot.generate_new_write_ver(svc.meta_be->ctx());
+ } else {
+ ot.write_version = ot.read_version;
+ ot.write_version.ver++;
+ }
+ }
+
+ for (auto iter = info.swift_keys.begin(); iter != info.swift_keys.end(); ++iter) {
+ if (old_info && old_info->swift_keys.count(iter->first) != 0)
+ continue;
+ auto& k = iter->second;
+ /* check if swift mapping exists */
+ RGWUserInfo inf;
+ int r = svc.user->get_user_info_by_swift(ctx, k.id, &inf, nullptr, nullptr, y, dpp);
+ if (r >= 0 && inf.user_id != info.user_id &&
+ (!old_info || inf.user_id != old_info->user_id)) {
+ ldpp_dout(dpp, 0) << "WARNING: can't store user info, swift id (" << k.id
+ << ") already mapped to another user (" << info.user_id << ")" << dendl;
+ return -EEXIST;
+ }
+ }
+
+ /* check if access keys already exist */
+ for (auto iter = info.access_keys.begin(); iter != info.access_keys.end(); ++iter) {
+ if (old_info && old_info->access_keys.count(iter->first) != 0)
+ continue;
+ auto& k = iter->second;
+ RGWUserInfo inf;
+ int r = svc.user->get_user_info_by_access_key(ctx, k.id, &inf, nullptr, nullptr, y, dpp);
+ if (r >= 0 && inf.user_id != info.user_id &&
+ (!old_info || inf.user_id != old_info->user_id)) {
+ ldpp_dout(dpp, 0) << "WARNING: can't store user info, access key already mapped to another user" << dendl;
+ return -EEXIST;
+ }
+ }
+
+ return 0;
+ }
+
+ int put(const DoutPrefixProvider *dpp) {
+ bufferlist data_bl;
+ encode(ui, data_bl);
+ encode(info, data_bl);
+
+ RGWSI_MBSObj_PutParams params(data_bl, pattrs, mtime, exclusive);
+
+ int ret = svc.meta_be->put(ctx, RGWSI_User::get_meta_key(info.user_id), params, &ot, y, dpp);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+ }
+
+ int complete(const DoutPrefixProvider *dpp) {
+ int ret;
+
+ bufferlist link_bl;
+ encode(ui, link_bl);
+
+ if (!info.user_email.empty()) {
+ if (!old_info ||
+ old_info->user_email.compare(info.user_email) != 0) { /* only if new index changed */
+ ret = rgw_put_system_obj(dpp, svc.sysobj, svc.zone->get_zone_params().user_email_pool, info.user_email,
+ link_bl, exclusive, NULL, real_time(), y);
+ if (ret < 0)
+ return ret;
+ }
+ }
+
+ const bool renamed = old_info && old_info->user_id != info.user_id;
+ for (auto iter = info.access_keys.begin(); iter != info.access_keys.end(); ++iter) {
+ auto& k = iter->second;
+ if (old_info && old_info->access_keys.count(iter->first) != 0 && !renamed)
+ continue;
+
+ ret = rgw_put_system_obj(dpp, svc.sysobj, svc.zone->get_zone_params().user_keys_pool, k.id,
+ link_bl, exclusive, NULL, real_time(), y);
+ if (ret < 0)
+ return ret;
+ }
+
+ for (auto siter = info.swift_keys.begin(); siter != info.swift_keys.end(); ++siter) {
+ auto& k = siter->second;
+ if (old_info && old_info->swift_keys.count(siter->first) != 0 && !renamed)
+ continue;
+
+ ret = rgw_put_system_obj(dpp, svc.sysobj, svc.zone->get_zone_params().user_swift_pool, k.id,
+ link_bl, exclusive, NULL, real_time(), y);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (old_info) {
+ ret = remove_old_indexes(*old_info, info, y, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+ return 0;
+ }
+
+ // Remove index objects that referenced old_info but are no longer present
+ // in new_info (uid, email, access keys, swift keys). -ENOENT from any
+ // removal is tolerated; other errors abort and are recorded via set_err_msg.
+ int remove_old_indexes(const RGWUserInfo& old_info, const RGWUserInfo& new_info, optional_yield y, const DoutPrefixProvider *dpp) {
+ int ret;
+
+ if (!old_info.user_id.empty() &&
+ old_info.user_id != new_info.user_id) {
+ // Renames across tenants are not supported.
+ if (old_info.user_id.tenant != new_info.user_id.tenant) {
+ ldpp_dout(dpp, 0) << "ERROR: tenant mismatch: " << old_info.user_id.tenant << " != " << new_info.user_id.tenant << dendl;
+ return -EINVAL;
+ }
+ ret = svc.user->remove_uid_index(ctx, old_info, nullptr, y, dpp);
+ if (ret < 0 && ret != -ENOENT) {
+ set_err_msg("ERROR: could not remove index for uid " + old_info.user_id.to_str());
+ return ret;
+ }
+ }
+
+ if (!old_info.user_email.empty() &&
+ old_info.user_email != new_info.user_email) {
+ ret = svc.user->remove_email_index(dpp, old_info.user_email, y);
+ if (ret < 0 && ret != -ENOENT) {
+ set_err_msg("ERROR: could not remove index for email " + old_info.user_email);
+ return ret;
+ }
+ }
+
+ // NOTE(review): lookups below use access_key.id / swift_key.id, while
+ // complete() skips by map key — presumably map keys equal the key ids;
+ // confirm against RGWUserInfo's conventions.
+ for ([[maybe_unused]] const auto& [name, access_key] : old_info.access_keys) {
+ if (!new_info.access_keys.count(access_key.id)) {
+ ret = svc.user->remove_key_index(dpp, access_key, y);
+ if (ret < 0 && ret != -ENOENT) {
+ set_err_msg("ERROR: could not remove index for key " + access_key.id);
+ return ret;
+ }
+ }
+ }
+
+ for (auto old_iter = old_info.swift_keys.begin(); old_iter != old_info.swift_keys.end(); ++old_iter) {
+ const auto& swift_key = old_iter->second;
+ auto new_iter = new_info.swift_keys.find(swift_key.id);
+ if (new_iter == new_info.swift_keys.end()) {
+ ret = svc.user->remove_swift_name_index(dpp, swift_key.id, y);
+ if (ret < 0 && ret != -ENOENT) {
+ set_err_msg("ERROR: could not remove index for swift_name " + swift_key.id);
+ return ret;
+ }
+ }
+ }
+
+ return 0;
+ }
+
+ // Last human-readable error recorded by set_err_msg() (see
+ // remove_old_indexes above); empty if no error was recorded.
+ const string& get_err_msg() {
+ return err_msg;
+ }
+};
+
+// Store a user's metadata and (re)build its secondary indexes.
+// Drives the three-phase PutOperation: prepare, write, index completion.
+// Returns 0 on success, the first negative errno otherwise.
+int RGWSI_User_RADOS::store_user_info(RGWSI_MetaBackend::Context *ctx,
+                                      const RGWUserInfo& info,
+                                      RGWUserInfo *old_info,
+                                      RGWObjVersionTracker *objv_tracker,
+                                      const real_time& mtime,
+                                      bool exclusive,
+                                      map<string, bufferlist> *attrs,
+                                      optional_yield y,
+                                      const DoutPrefixProvider *dpp)
+{
+  PutOperation op(svc, ctx, info, old_info, objv_tracker,
+                  mtime, exclusive, attrs, y);
+
+  // Run the phases in order, stopping at the first failure.
+  int r = op.prepare(dpp);
+  if (r >= 0) {
+    r = op.put(dpp);
+  }
+  if (r >= 0) {
+    r = op.complete(dpp);
+  }
+  return r < 0 ? r : 0;
+}
+
+// Delete the access-key index object (keyed by the access key id) from the
+// zone's user_keys_pool.
+int RGWSI_User_RADOS::remove_key_index(const DoutPrefixProvider *dpp,
+                                       const RGWAccessKey& access_key,
+                                       optional_yield y)
+{
+  const rgw_raw_obj key_obj(svc.zone->get_zone_params().user_keys_pool,
+                            access_key.id);
+  auto handle = svc.sysobj->get_obj(key_obj);
+  return handle.wop().remove(dpp, y);
+}
+
+// Delete the email index object from the zone's user_email_pool.
+// Users without an email have nothing to unlink, which is a no-op success.
+int RGWSI_User_RADOS::remove_email_index(const DoutPrefixProvider *dpp,
+                                         const string& email,
+                                         optional_yield y)
+{
+  if (email.empty())
+    return 0;
+
+  const rgw_raw_obj email_obj(svc.zone->get_zone_params().user_email_pool,
+                              email);
+  auto handle = svc.sysobj->get_obj(email_obj);
+  return handle.wop().remove(dpp, y);
+}
+
+// Delete the swift-name index object from the zone's user_swift_pool.
+int RGWSI_User_RADOS::remove_swift_name_index(const DoutPrefixProvider *dpp,
+                                              const string& swift_name,
+                                              optional_yield y)
+{
+  const rgw_raw_obj swift_obj(svc.zone->get_zone_params().user_swift_pool,
+                              swift_name);
+  auto handle = svc.sysobj->get_obj(swift_obj);
+  return handle.wop().remove(dpp, y);
+}
+
+/**
+ * delete a user's presence from the RGW system.
+ * First remove their bucket ACLs, then delete them
+ * from the user and user email pools. This leaves the pools
+ * themselves alone, as well as any ACLs embedded in object xattrs.
+ */
+int RGWSI_User_RADOS::remove_user_info(RGWSI_MetaBackend::Context *ctx,
+                                       const RGWUserInfo& info,
+                                       RGWObjVersionTracker *objv_tracker,
+                                       optional_yield y,
+                                       const DoutPrefixProvider *dpp)
+{
+  // Unlink every access-key index; a missing index (-ENOENT) is tolerated.
+  for (const auto& [key_id, access_key] : info.access_keys) {
+    ldpp_dout(dpp, 10) << "removing key index: " << key_id << dendl;
+    int r = remove_key_index(dpp, access_key, y);
+    if (r < 0 && r != -ENOENT) {
+      ldpp_dout(dpp, 0) << "ERROR: could not remove " << key_id << " (access key object), should be fixed (err=" << r << ")" << dendl;
+      return r;
+    }
+  }
+
+  for ([[maybe_unused]] const auto& [name, swift_key] : info.swift_keys) {
+    ldpp_dout(dpp, 10) << "removing swift subuser index: " << swift_key.id << dendl;
+    /* check if swift mapping exists */
+    int r = remove_swift_name_index(dpp, swift_key.id, y);
+    if (r < 0 && r != -ENOENT) {
+      ldpp_dout(dpp, 0) << "ERROR: could not remove " << swift_key.id << " (swift name object), should be fixed (err=" << r << ")" << dendl;
+      return r;
+    }
+  }
+
+  ldpp_dout(dpp, 10) << "removing email index: " << info.user_email << dendl;
+  int r = remove_email_index(dpp, info.user_email, y);
+  if (r < 0 && r != -ENOENT) {
+    ldpp_dout(dpp, 0) << "ERROR: could not remove email index object for "
+        << info.user_email << ", should be fixed (err=" << r << ")" << dendl;
+    return r;
+  }
+
+  // Drop the per-user buckets directory object.
+  rgw_raw_obj uid_bucks = get_buckets_obj(info.user_id);
+  ldpp_dout(dpp, 10) << "removing user buckets index" << dendl;
+  auto sysobj = svc.sysobj->get_obj(uid_bucks);
+  r = sysobj.wop().remove(dpp, y);
+  if (r < 0 && r != -ENOENT) {
+    ldpp_dout(dpp, 0) << "ERROR: could not remove " << info.user_id << ":" << uid_bucks << ", should be fixed (err=" << r << ")" << dendl;
+    return r;
+  }
+
+  // Finally remove the canonical uid index itself.
+  r = remove_uid_index(ctx, info, objv_tracker, y, dpp);
+  if (r < 0 && r != -ENOENT) {
+    return r;
+  }
+
+  return 0;
+}
+
+// Remove the canonical uid -> user metadata entry via the meta backend.
+// A missing entry (-ENOENT) or a racing removal (-ECANCELED) counts as done.
+int RGWSI_User_RADOS::remove_uid_index(RGWSI_MetaBackend::Context *ctx, const RGWUserInfo& user_info, RGWObjVersionTracker *objv_tracker,
+                                       optional_yield y, const DoutPrefixProvider *dpp)
+{
+  ldpp_dout(dpp, 10) << "removing user index: " << user_info.user_id << dendl;
+
+  RGWSI_MBSObj_RemoveParams params;
+  const int r = svc.meta_be->remove(ctx, get_meta_key(user_info.user_id), params, objv_tracker, y, dpp);
+  if (r >= 0 || r == -ENOENT || r == -ECANCELED) {
+    return 0;
+  }
+
+  // Hard failure: log the raw object path for manual repair.
+  string key;
+  user_info.user_id.to_str(key);
+  rgw_raw_obj uid_obj(svc.zone->get_zone_params().user_uid_pool, key);
+  ldpp_dout(dpp, 0) << "ERROR: could not remove " << user_info.user_id << ":" << uid_obj << ", should be fixed (err=" << r << ")" << dendl;
+  return r;
+}
+
+// Resolve a secondary index (email / swift / access-key pool object) to the
+// full user info, going through the chained uinfo cache first.
+// On a miss: read the index object, decode the RGWUID link, read the real
+// user record, then populate the cache. Returns 0 on success, negative errno
+// on failure (-EIO if the index object fails to decode).
+int RGWSI_User_RADOS::get_user_info_from_index(RGWSI_MetaBackend::Context* ctx,
+                                               const string& key,
+                                               const rgw_pool& pool,
+                                               RGWUserInfo *info,
+                                               RGWObjVersionTracker* objv_tracker,
+                                               real_time* pmtime, optional_yield y,
+                                               const DoutPrefixProvider* dpp)
+{
+  string cache_key = pool.to_str() + "/" + key;
+
+  // Fast path: serve from the chained cache.
+  if (auto e = uinfo_cache->find(cache_key)) {
+    *info = e->info;
+    if (objv_tracker)
+      *objv_tracker = e->objv_tracker;
+    if (pmtime)
+      *pmtime = e->mtime;
+    return 0;
+  }
+
+  user_info_cache_entry e;
+  bufferlist bl;
+  RGWUID uid;
+
+  int ret = rgw_get_system_obj(svc.sysobj, pool, key, bl, nullptr, &e.mtime, y, dpp);
+  if (ret < 0)
+    return ret;
+
+  rgw_cache_entry_info cache_info;
+
+  auto iter = bl.cbegin();
+  try {
+    decode(uid, iter);
+
+    // Reuse the outer ret here — the original shadowed it with a second
+    // `int ret`, which was harmless but error-prone.
+    ret = read_user_info(ctx, uid.user_id,
+                         &e.info, &e.objv_tracker, nullptr, &cache_info, nullptr,
+                         y, dpp);
+    if (ret < 0) {
+      return ret;
+    }
+  } catch (buffer::error& err) {
+    ldpp_dout(dpp, 0) << "ERROR: failed to decode user info, caught buffer::error" << dendl;
+    return -EIO;
+  }
+
+  uinfo_cache->put(dpp, svc.cache, cache_key, &e, { &cache_info });
+
+  *info = e.info;
+  if (objv_tracker)
+    *objv_tracker = e.objv_tracker;
+  if (pmtime)
+    *pmtime = e.mtime;
+
+  return 0;
+}
+
+/**
+ * Given an email, finds the user info associated with it.
+ * returns: 0 on success, -ERR# on failure (including nonexistence)
+ */
+int RGWSI_User_RADOS::get_user_info_by_email(RGWSI_MetaBackend::Context *ctx,
+                                             const string& email, RGWUserInfo *info,
+                                             RGWObjVersionTracker *objv_tracker,
+                                             real_time *pmtime, optional_yield y,
+                                             const DoutPrefixProvider *dpp)
+{
+  // Email addresses are indexed in the zone's dedicated email pool.
+  const auto& pool = svc.zone->get_zone_params().user_email_pool;
+  return get_user_info_from_index(ctx, email, pool,
+                                  info, objv_tracker, pmtime, y, dpp);
+}
+
+/**
+ * Given an swift username, finds the user_info associated with it.
+ * returns: 0 on success, -ERR# on failure (including nonexistence)
+ */
+int RGWSI_User_RADOS::get_user_info_by_swift(RGWSI_MetaBackend::Context *ctx,
+                                             const string& swift_name,
+                                             RGWUserInfo *info, /* out */
+                                             RGWObjVersionTracker * const objv_tracker,
+                                             real_time * const pmtime, optional_yield y,
+                                             const DoutPrefixProvider *dpp)
+{
+  // Swift subuser names live in their own index pool.
+  const auto& pool = svc.zone->get_zone_params().user_swift_pool;
+  return get_user_info_from_index(ctx, swift_name, pool,
+                                  info, objv_tracker, pmtime, y, dpp);
+}
+
+/**
+ * Given an access key, finds the user info associated with it.
+ * returns: 0 on success, -ERR# on failure (including nonexistence)
+ */
+int RGWSI_User_RADOS::get_user_info_by_access_key(RGWSI_MetaBackend::Context *ctx,
+                                                  const std::string& access_key,
+                                                  RGWUserInfo *info,
+                                                  RGWObjVersionTracker* objv_tracker,
+                                                  real_time *pmtime, optional_yield y,
+                                                  const DoutPrefixProvider *dpp)
+{
+  // S3 access keys are indexed in the zone's keys pool.
+  const auto& pool = svc.zone->get_zone_params().user_keys_pool;
+  return get_user_info_from_index(ctx, access_key, pool,
+                                  info, objv_tracker, pmtime, y, dpp);
+}
+
+// Apply a batch of bucket entries to the user's buckets-index object via the
+// `user` object class (add == true inserts, add == false updates in place).
+int RGWSI_User_RADOS::cls_user_update_buckets(const DoutPrefixProvider *dpp, rgw_raw_obj& obj, list<cls_user_bucket_entry>& entries, bool add, optional_yield y)
+{
+  auto rados_obj = svc.rados->obj(obj);
+  if (int r = rados_obj.open(dpp); r < 0) {
+    return r;
+  }
+
+  librados::ObjectWriteOperation wop;
+  cls_user_set_buckets(wop, entries, add);
+  const int r = rados_obj.operate(dpp, &wop, y);
+  return r < 0 ? r : 0;
+}
+
+// Add a single bucket entry — expressed as a one-element batch update.
+int RGWSI_User_RADOS::cls_user_add_bucket(const DoutPrefixProvider *dpp, rgw_raw_obj& obj, const cls_user_bucket_entry& entry, optional_yield y)
+{
+  list<cls_user_bucket_entry> batch{entry};
+  return cls_user_update_buckets(dpp, obj, batch, true, y);
+}
+
+// Remove one bucket entry from the user's buckets-index object.
+int RGWSI_User_RADOS::cls_user_remove_bucket(const DoutPrefixProvider *dpp, rgw_raw_obj& obj, const cls_user_bucket& bucket, optional_yield y)
+{
+  auto rados_obj = svc.rados->obj(obj);
+  if (int r = rados_obj.open(dpp); r < 0) {
+    return r;
+  }
+
+  librados::ObjectWriteOperation wop;
+  ::cls_user_remove_bucket(wop, bucket);
+  const int r = rados_obj.operate(dpp, &wop, y);
+  return r < 0 ? r : 0;
+}
+
+// Record a bucket under the user's buckets directory. If no creation time is
+// supplied (zero), the current time is used.
+int RGWSI_User_RADOS::add_bucket(const DoutPrefixProvider *dpp,
+                                 const rgw_user& user,
+                                 const rgw_bucket& bucket,
+                                 ceph::real_time creation_time,
+                                 optional_yield y)
+{
+  cls_user_bucket_entry new_bucket;
+  bucket.convert(&new_bucket.bucket);
+  new_bucket.size = 0;
+  new_bucket.creation_time = real_clock::is_zero(creation_time)
+      ? real_clock::now()
+      : creation_time;
+
+  rgw_raw_obj obj = get_buckets_obj(user);
+  const int r = cls_user_add_bucket(dpp, obj, new_bucket, y);
+  if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: error adding bucket to user: ret=" << r << dendl;
+    return r;
+  }
+
+  return 0;
+}
+
+
+// Unlink a bucket from the user's buckets directory.
+// NOTE(review): failures are logged but deliberately swallowed — this always
+// returns 0 (best-effort); confirm callers rely on that before changing it.
+int RGWSI_User_RADOS::remove_bucket(const DoutPrefixProvider *dpp,
+ const rgw_user& user,
+ const rgw_bucket& _bucket,
+ optional_yield y)
+{
+ cls_user_bucket bucket;
+ bucket.name = _bucket.name;
+ rgw_raw_obj obj = get_buckets_obj(user);
+ int ret = cls_user_remove_bucket(dpp, obj, bucket, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: error removing bucket from user: ret=" << ret << dendl;
+ }
+
+ return 0;
+}
+
+// Push one bucket's stats entry into the user's buckets-index object
+// (batch update with add == false).
+int RGWSI_User_RADOS::cls_user_flush_bucket_stats(const DoutPrefixProvider *dpp,
+                                                  rgw_raw_obj& user_obj,
+                                                  const RGWBucketEnt& ent, optional_yield y)
+{
+  cls_user_bucket_entry entry;
+  ent.convert(&entry);
+
+  list<cls_user_bucket_entry> batch;
+  batch.push_back(entry);
+
+  const int r = cls_user_update_buckets(dpp, user_obj, batch, false, y);
+  if (r < 0) {
+    ldpp_dout(dpp, 20) << "cls_user_update_buckets() returned " << r << dendl;
+    return r;
+  }
+
+  return 0;
+}
+
+// List one page of bucket entries from the user's buckets-index object.
+// Results, continuation marker and truncation flag are filled in through the
+// out-params wired into the cls read op; rc carries the cls-level status.
+int RGWSI_User_RADOS::cls_user_list_buckets(const DoutPrefixProvider *dpp,
+ rgw_raw_obj& obj,
+ const string& in_marker,
+ const string& end_marker,
+ const int max_entries,
+ list<cls_user_bucket_entry>& entries,
+ string * const out_marker,
+ bool * const truncated,
+ optional_yield y)
+{
+ auto rados_obj = svc.rados->obj(obj);
+ int r = rados_obj.open(dpp);
+ if (r < 0) {
+ return r;
+ }
+
+ librados::ObjectReadOperation op;
+ int rc;
+
+ // rc is only valid after operate() succeeds; check it second.
+ cls_user_bucket_list(op, in_marker, end_marker, max_entries, entries, out_marker, truncated, &rc);
+ bufferlist ibl;
+ r = rados_obj.operate(dpp, &op, &ibl, y);
+ if (r < 0)
+ return r;
+ if (rc < 0)
+ return rc;
+
+ return 0;
+}
+
+// Page through the user's buckets directory, accumulating up to `max`
+// entries into *buckets. The anonymous user has no directory and always
+// yields an empty, non-truncated result.
+int RGWSI_User_RADOS::list_buckets(const DoutPrefixProvider *dpp,
+                                   const rgw_user& user,
+                                   const string& marker,
+                                   const string& end_marker,
+                                   uint64_t max,
+                                   RGWUserBuckets *buckets,
+                                   bool *is_truncated, optional_yield y)
+{
+  int ret;
+
+  buckets->clear();
+  if (user.id == RGW_USER_ANON_ID) {
+    ldpp_dout(dpp, 20) << "RGWSI_User_RADOS::list_buckets(): anonymous user" << dendl;
+    *is_truncated = false;
+    return 0;
+  }
+  rgw_raw_obj obj = get_buckets_obj(user);
+
+  bool truncated = false;
+  string m = marker;
+
+  uint64_t total = 0;
+
+  do {
+    std::list<cls_user_bucket_entry> entries;
+    // NOTE(review): max - total (uint64_t) narrows to int in the callee;
+    // presumably callers pass modest values of max — confirm.
+    ret = cls_user_list_buckets(dpp, obj, m, end_marker, max - total, entries, &m, &truncated, y);
+    // A user with no buckets-index object simply has no buckets.
+    if (ret == -ENOENT) {
+      ret = 0;
+    }
+
+    if (ret < 0) {
+      return ret;
+    }
+
+    for (auto& entry : entries) {
+      buckets->add(RGWBucketEnt(user, std::move(entry)));
+      total++;
+    }
+
+  } while (truncated && total < max);
+
+  if (is_truncated) {
+    *is_truncated = truncated;
+  }
+
+  return 0;
+}
+
+// Flush a single bucket's stats into the user's buckets-index object.
+int RGWSI_User_RADOS::flush_bucket_stats(const DoutPrefixProvider *dpp,
+                                         const rgw_user& user,
+                                         const RGWBucketEnt& ent,
+                                         optional_yield y)
+{
+  rgw_raw_obj user_obj = get_buckets_obj(user);
+  return cls_user_flush_bucket_stats(dpp, user_obj, ent, y);
+}
+
+// Thin wrapper: reset the user's accumulated stats via the object class.
+int RGWSI_User_RADOS::reset_bucket_stats(const DoutPrefixProvider *dpp,
+                                         const rgw_user& user,
+                                         optional_yield y)
+{
+  return cls_user_reset_stats(dpp, user, y);
+}
+
+// Reset the user's stats on the buckets-index object, looping while the
+// cls call reports a truncated pass. Uses OPERATION_RETURNVEC so the
+// per-op status (rval) and reply payload (out) are both available.
+int RGWSI_User_RADOS::cls_user_reset_stats(const DoutPrefixProvider *dpp, const rgw_user& user, optional_yield y)
+{
+ rgw_raw_obj obj = get_buckets_obj(user);
+ auto rados_obj = svc.rados->obj(obj);
+ int rval, r = rados_obj.open(dpp);
+ if (r < 0) {
+ return r;
+ }
+
+ cls_user_reset_stats2_op call;
+ cls_user_reset_stats2_ret ret;
+
+ do {
+ buffer::list in, out;
+ librados::ObjectWriteOperation op;
+
+ // Each pass re-stamps the op time and feeds the previous reply back in
+ // via update_call() so the server can continue where it left off.
+ call.time = real_clock::now();
+ ret.update_call(call);
+
+ encode(call, in);
+ op.exec("user", "reset_user_stats2", in, &out, &rval);
+ r = rados_obj.operate(dpp, &op, y, librados::OPERATION_RETURNVEC);
+ if (r < 0) {
+ return r;
+ }
+ try {
+ auto bliter = out.cbegin();
+ decode(ret, bliter);
+ } catch (ceph::buffer::error& err) {
+ return -EINVAL;
+ }
+ } while (ret.truncated);
+
+ // rval holds the cls-level status of the final exec.
+ return rval;
+}
+
+// Mark the user's stats sync as complete on the buckets-index object.
+int RGWSI_User_RADOS::complete_flush_stats(const DoutPrefixProvider *dpp,
+                                           const rgw_user& user, optional_yield y)
+{
+  rgw_raw_obj buckets_obj = get_buckets_obj(user);
+  auto rados_obj = svc.rados->obj(buckets_obj);
+  if (int r = rados_obj.open(dpp); r < 0) {
+    return r;
+  }
+
+  librados::ObjectWriteOperation wop;
+  ::cls_user_complete_stats_sync(wop);
+  return rados_obj.operate(dpp, &wop, y);
+}
+
+// Read the user's stats header from the buckets-index object.
+// NOTE(review): the cls-level status rc is requested but never checked —
+// only the operate() result is returned; confirm whether that is intended.
+int RGWSI_User_RADOS::cls_user_get_header(const DoutPrefixProvider *dpp,
+ const rgw_user& user, cls_user_header *header,
+ optional_yield y)
+{
+ rgw_raw_obj obj = get_buckets_obj(user);
+ auto rados_obj = svc.rados->obj(obj);
+ int r = rados_obj.open(dpp);
+ if (r < 0) {
+ return r;
+ }
+ int rc;
+ bufferlist ibl;
+ librados::ObjectReadOperation op;
+ ::cls_user_get_header(op, header, &rc);
+ return rados_obj.operate(dpp, &op, &ibl, y);
+}
+
+// Kick off an async read of the user's stats header; cb fires on completion.
+int RGWSI_User_RADOS::cls_user_get_header_async(const DoutPrefixProvider *dpp, const string& user_str, RGWGetUserHeader_CB *cb)
+{
+  rgw_raw_obj obj = get_buckets_obj(rgw_user(user_str));
+  auto rados_obj = svc.rados->obj(obj);
+  int r = rados_obj.open(dpp);
+  if (r >= 0) {
+    auto& ref = rados_obj.get_ref();
+    r = ::cls_user_get_header_async(ref.pool.ioctx(), ref.obj.oid, cb);
+  }
+  return r < 0 ? r : 0;
+}
+
+// Read the user's aggregate storage stats (bytes, rounded bytes, object
+// count) plus the optional last-sync/last-update timestamps.
+int RGWSI_User_RADOS::read_stats(const DoutPrefixProvider *dpp,
+ RGWSI_MetaBackend::Context *ctx,
+ const rgw_user& user, RGWStorageStats *stats,
+ ceph::real_time *last_stats_sync,
+ ceph::real_time *last_stats_update,
+ optional_yield y)
+{
+ string user_str = user.to_str();
+
+ // info/mtime are not used afterwards — this read only verifies the user
+ // actually exists before touching the stats header.
+ RGWUserInfo info;
+ real_time mtime;
+ int ret = read_user_info(ctx, user, &info, nullptr, &mtime, nullptr, nullptr, y, dpp);
+ if (ret < 0)
+ {
+ return ret;
+ }
+
+ // A missing header object (-ENOENT) yields zeroed default stats.
+ cls_user_header header;
+ int r = cls_user_get_header(dpp, rgw_user(user_str), &header, y);
+ if (r < 0 && r != -ENOENT)
+ return r;
+
+ const cls_user_stats& hs = header.stats;
+
+ stats->size = hs.total_bytes;
+ stats->size_rounded = hs.total_bytes_rounded;
+ stats->num_objects = hs.total_entries;
+
+ if (last_stats_sync) {
+ *last_stats_sync = header.last_stats_sync;
+ }
+
+ if (last_stats_update) {
+ *last_stats_update = header.last_stats_update;
+ }
+
+ return 0;
+}
+
+// Adapter: receives the raw cls_user_header callback and forwards the
+// extracted stats to the caller-supplied RGWGetUserStats_CB.
+class RGWGetUserStatsContext : public RGWGetUserHeader_CB {
+ RGWGetUserStats_CB *cb;
+
+public:
+ explicit RGWGetUserStatsContext(RGWGetUserStats_CB * const cb)
+ : cb(cb) {}
+
+ void handle_response(int r, cls_user_header& header) override {
+ const cls_user_stats& hs = header.stats;
+ // Only publish stats on success; the status code is forwarded either way.
+ if (r >= 0) {
+ RGWStorageStats stats;
+
+ stats.size = hs.total_bytes;
+ stats.size_rounded = hs.total_bytes_rounded;
+ stats.num_objects = hs.total_entries;
+
+ cb->set_response(stats);
+ }
+
+ cb->handle_response(r);
+
+ // put() presumably drops the callback's reference (get/put refcount
+ // idiom) — confirm against RGWGetUserStats_CB's ownership contract.
+ cb->put();
+ }
+};
+
+// Asynchronously read user stats; _cb is invoked via RGWGetUserStatsContext.
+// On submission failure the adapter is destroyed here and the error returned.
+int RGWSI_User_RADOS::read_stats_async(const DoutPrefixProvider *dpp,
+                                       const rgw_user& user, RGWGetUserStats_CB *_cb)
+{
+  string user_str = user.to_str();
+
+  auto *adapter = new RGWGetUserStatsContext(_cb);
+  const int r = cls_user_get_header_async(dpp, user_str, adapter);
+  if (r < 0) {
+    delete adapter;
+    return r;
+  }
+
+  return 0;
+}
+
diff --git a/src/rgw/services/svc_user_rados.h b/src/rgw/services/svc_user_rados.h
new file mode 100644
index 000000000..177f720d6
--- /dev/null
+++ b/src/rgw/services/svc_user_rados.h
@@ -0,0 +1,211 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#pragma once
+
+#include "rgw_service.h"
+
+#include "svc_meta_be.h"
+#include "svc_user.h"
+#include "rgw_bucket.h"
+
+class RGWSI_RADOS;
+class RGWSI_Zone;
+class RGWSI_SysObj;
+class RGWSI_SysObj_Cache;
+class RGWSI_Meta;
+class RGWSI_SyncModules;
+class RGWSI_MetaBackend_Handler;
+
+struct rgw_cache_entry_info;
+
+class RGWGetUserHeader_CB;
+class RGWGetUserStats_CB;
+
+template <class T>
+class RGWChainedCacheImpl;
+
+// RADOS-backed implementation of the RGWSI_User service: stores user
+// metadata through the meta backend and maintains the secondary index
+// objects (email / swift / access-key pools) plus per-user bucket stats.
+class RGWSI_User_RADOS : public RGWSI_User
+{
+ friend class PutOperation;
+
+ // Meta-backend module/handler wiring for the user metadata section.
+ std::unique_ptr<RGWSI_MetaBackend::Module> be_module;
+ RGWSI_MetaBackend_Handler *be_handler;
+
+ // Entry cached per index key: the user info, its version and mtime.
+ struct user_info_cache_entry {
+ RGWUserInfo info;
+ RGWObjVersionTracker objv_tracker;
+ real_time mtime;
+ };
+
+ using RGWChainedCacheImpl_user_info_cache_entry = RGWChainedCacheImpl<user_info_cache_entry>;
+ std::unique_ptr<RGWChainedCacheImpl_user_info_cache_entry> uinfo_cache;
+
+ // Raw object holding the user's buckets directory / stats header.
+ rgw_raw_obj get_buckets_obj(const rgw_user& user_id) const;
+
+ // Resolve a secondary index object (in `pool`, keyed by `key`) to the
+ // full user info, via the chained cache.
+ int get_user_info_from_index(RGWSI_MetaBackend::Context *ctx,
+ const std::string& key,
+ const rgw_pool& pool,
+ RGWUserInfo *info,
+ RGWObjVersionTracker * const objv_tracker,
+ real_time * const pmtime,
+ optional_yield y,
+ const DoutPrefixProvider *dpp);
+
+ int remove_uid_index(RGWSI_MetaBackend::Context *ctx, const RGWUserInfo& user_info, RGWObjVersionTracker *objv_tracker,
+ optional_yield y, const DoutPrefixProvider *dpp);
+
+ // Secondary-index removal helpers (-ENOENT handling is the caller's job).
+ int remove_key_index(const DoutPrefixProvider *dpp, const RGWAccessKey& access_key, optional_yield y);
+ int remove_email_index(const DoutPrefixProvider *dpp, const std::string& email, optional_yield y);
+ int remove_swift_name_index(const DoutPrefixProvider *dpp, const std::string& swift_name, optional_yield y);
+
+ /* admin management */
+ int cls_user_update_buckets(const DoutPrefixProvider *dpp, rgw_raw_obj& obj, std::list<cls_user_bucket_entry>& entries, bool add, optional_yield y);
+ int cls_user_add_bucket(const DoutPrefixProvider *dpp, rgw_raw_obj& obj, const cls_user_bucket_entry& entry, optional_yield y);
+ int cls_user_remove_bucket(const DoutPrefixProvider *dpp, rgw_raw_obj& obj, const cls_user_bucket& bucket, optional_yield y);
+
+ /* quota stats */
+ int cls_user_flush_bucket_stats(const DoutPrefixProvider *dpp, rgw_raw_obj& user_obj,
+ const RGWBucketEnt& ent, optional_yield y);
+ int cls_user_list_buckets(const DoutPrefixProvider *dpp,
+ rgw_raw_obj& obj,
+ const std::string& in_marker,
+ const std::string& end_marker,
+ const int max_entries,
+ std::list<cls_user_bucket_entry>& entries,
+ std::string * const out_marker,
+ bool * const truncated,
+ optional_yield y);
+
+ int cls_user_reset_stats(const DoutPrefixProvider *dpp, const rgw_user& user, optional_yield y);
+ int cls_user_get_header(const DoutPrefixProvider *dpp, const rgw_user& user, cls_user_header *header, optional_yield y);
+ int cls_user_get_header_async(const DoutPrefixProvider *dpp, const std::string& user, RGWGetUserHeader_CB *cb);
+
+ int do_start(optional_yield, const DoutPrefixProvider *dpp) override;
+public:
+ // Peer services this instance depends on, wired up via init().
+ struct Svc {
+ RGWSI_User_RADOS *user{nullptr};
+ RGWSI_RADOS *rados{nullptr};
+ RGWSI_Zone *zone{nullptr};
+ RGWSI_SysObj *sysobj{nullptr};
+ RGWSI_SysObj_Cache *cache{nullptr};
+ RGWSI_Meta *meta{nullptr};
+ RGWSI_MetaBackend *meta_be{nullptr};
+ RGWSI_SyncModules *sync_modules{nullptr};
+ } svc;
+
+ RGWSI_User_RADOS(CephContext *cct);
+ ~RGWSI_User_RADOS();
+
+ void init(RGWSI_RADOS *_rados_svc,
+ RGWSI_Zone *_zone_svc, RGWSI_SysObj *_sysobj_svc,
+ RGWSI_SysObj_Cache *_cache_svc, RGWSI_Meta *_meta_svc,
+ RGWSI_MetaBackend *_meta_be_svc,
+ RGWSI_SyncModules *_sync_modules);
+
+ RGWSI_MetaBackend_Handler *get_be_handler() override {
+ return be_handler;
+ }
+
+ int read_user_info(RGWSI_MetaBackend::Context *ctx,
+ const rgw_user& user,
+ RGWUserInfo *info,
+ RGWObjVersionTracker * const objv_tracker,
+ real_time * const pmtime,
+ rgw_cache_entry_info * const cache_info,
+ std::map<std::string, bufferlist> * const pattrs,
+ optional_yield y,
+ const DoutPrefixProvider *dpp) override;
+
+ int store_user_info(RGWSI_MetaBackend::Context *ctx,
+ const RGWUserInfo& info,
+ RGWUserInfo *old_info,
+ RGWObjVersionTracker *objv_tracker,
+ const real_time& mtime,
+ bool exclusive,
+ std::map<std::string, bufferlist> *attrs,
+ optional_yield y,
+ const DoutPrefixProvider *dpp) override;
+
+ int remove_user_info(RGWSI_MetaBackend::Context *ctx,
+ const RGWUserInfo& info,
+ RGWObjVersionTracker *objv_tracker,
+ optional_yield y,
+ const DoutPrefixProvider *dpp) override;
+
+ // Lookups through the secondary index pools.
+ int get_user_info_by_email(RGWSI_MetaBackend::Context *ctx,
+ const std::string& email, RGWUserInfo *info,
+ RGWObjVersionTracker *objv_tracker,
+ real_time *pmtime,
+ optional_yield y,
+ const DoutPrefixProvider *dpp) override;
+ int get_user_info_by_swift(RGWSI_MetaBackend::Context *ctx,
+ const std::string& swift_name,
+ RGWUserInfo *info, /* out */
+ RGWObjVersionTracker * const objv_tracker,
+ real_time * const pmtime,
+ optional_yield y,
+ const DoutPrefixProvider *dpp) override;
+ int get_user_info_by_access_key(RGWSI_MetaBackend::Context *ctx,
+ const std::string& access_key,
+ RGWUserInfo *info,
+ RGWObjVersionTracker* objv_tracker,
+ real_time *pmtime,
+ optional_yield y,
+ const DoutPrefixProvider *dpp) override;
+
+ /* user buckets directory */
+
+ int add_bucket(const DoutPrefixProvider *dpp,
+ const rgw_user& user,
+ const rgw_bucket& bucket,
+ ceph::real_time creation_time,
+ optional_yield y) override;
+ int remove_bucket(const DoutPrefixProvider *dpp,
+ const rgw_user& user,
+ const rgw_bucket& _bucket,
+ optional_yield y) override;
+ int list_buckets(const DoutPrefixProvider *dpp,
+ const rgw_user& user,
+ const std::string& marker,
+ const std::string& end_marker,
+ uint64_t max,
+ RGWUserBuckets *buckets,
+ bool *is_truncated,
+ optional_yield y) override;
+
+ /* quota related */
+ int flush_bucket_stats(const DoutPrefixProvider *dpp,
+ const rgw_user& user,
+ const RGWBucketEnt& ent, optional_yield y) override;
+
+ int complete_flush_stats(const DoutPrefixProvider *dpp,
+ const rgw_user& user, optional_yield y) override;
+
+ int reset_bucket_stats(const DoutPrefixProvider *dpp,
+ const rgw_user& user,
+ optional_yield y) override;
+ int read_stats(const DoutPrefixProvider *dpp,
+ RGWSI_MetaBackend::Context *ctx,
+ const rgw_user& user, RGWStorageStats *stats,
+ ceph::real_time *last_stats_sync, /* last time a full stats sync completed */
+ ceph::real_time *last_stats_update,
+ optional_yield y) override; /* last time a stats update was done */
+
+ int read_stats_async(const DoutPrefixProvider *dpp, const rgw_user& user,
+ RGWGetUserStats_CB *cb) override;
+};
+
diff --git a/src/rgw/services/svc_zone.cc b/src/rgw/services/svc_zone.cc
new file mode 100644
index 000000000..180d93712
--- /dev/null
+++ b/src/rgw/services/svc_zone.cc
@@ -0,0 +1,1100 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "svc_zone.h"
+#include "svc_rados.h"
+#include "svc_sys_obj.h"
+#include "svc_sync_modules.h"
+
+#include "rgw_zone.h"
+#include "rgw_rest_conn.h"
+#include "rgw_bucket_sync.h"
+
+#include "common/errno.h"
+#include "include/random.h"
+
+#define dout_subsys ceph_subsys_rgw
+
+using namespace std;
+using namespace rgw_zone_defaults;
+
+// Construction only wires the CephContext; real setup happens in init().
+RGWSI_Zone::RGWSI_Zone(CephContext *cct) : RGWServiceInstance(cct)
+{
+}
+
+// Wire peer services and allocate the owned configuration objects.
+// NOTE(review): raw owning `new`s, released in ~RGWSI_Zone below — the
+// member types are declared in the header, so unique_ptr would need a
+// header change; keep the pairing in mind when editing either side.
+void RGWSI_Zone::init(RGWSI_SysObj *_sysobj_svc,
+ RGWSI_RADOS * _rados_svc,
+ RGWSI_SyncModules * _sync_modules_svc,
+ RGWSI_Bucket_Sync *_bucket_sync_svc)
+{
+ sysobj_svc = _sysobj_svc;
+ rados_svc = _rados_svc;
+ sync_modules_svc = _sync_modules_svc;
+ bucket_sync_svc = _bucket_sync_svc;
+
+ realm = new RGWRealm();
+ zonegroup = new RGWZoneGroup();
+ zone_public_config = new RGWZone();
+ zone_params = new RGWZoneParams();
+ current_period = new RGWPeriod();
+}
+
+// Releases the configuration objects allocated in init().
+RGWSI_Zone::~RGWSI_Zone()
+{
+ delete realm;
+ delete zonegroup;
+ delete zone_public_config;
+ delete zone_params;
+ delete current_period;
+}
+
+// Return the sync-policy handler for `zone`; no zone (or our own zone id)
+// selects the local handler, unknown zones yield an empty shared_ptr.
+std::shared_ptr<RGWBucketSyncPolicyHandler> RGWSI_Zone::get_sync_policy_handler(std::optional<rgw_zone_id> zone) const {
+  if (!zone || *zone == zone_id()) {
+    return sync_policy_handler;
+  }
+  auto found = sync_policy_handlers.find(*zone);
+  return found == sync_policy_handlers.end()
+      ? std::shared_ptr<RGWBucketSyncPolicyHandler>()
+      : found->second;
+}
+
+// True when target_zone is configured to sync from source_zone AND the
+// source's tier type has a sync module capable of exporting data.
+bool RGWSI_Zone::zone_syncs_from(const RGWZone& target_zone, const RGWZone& source_zone) const
+{
+  if (!target_zone.syncs_from(source_zone.name)) {
+    return false;
+  }
+  return sync_modules_svc->get_manager()->supports_data_export(source_zone.tier_type);
+}
+
+// True when the local zone syncs data from source_zone: source must be one
+// of our configured data-sync sources, be listed by syncs_from(), and have
+// an exportable tier type.
+bool RGWSI_Zone::zone_syncs_from(const RGWZone& source_zone) const
+{
+  // Bind by reference — the original copied the whole zone struct even
+  // though it is only read once below.
+  const auto& target_zone = get_zone();
+  bool found = false;
+
+  for (auto s : data_sync_source_zones) {
+    if (s->id == source_zone.id) {
+      found = true;
+      break;
+    }
+  }
+  return found && target_zone.syncs_from(source_zone.name) &&
+         sync_modules_svc->get_manager()->supports_data_export(source_zone.tier_type);
+}
+
+// Scan all realms for one whose period configuration contains zone `zid`.
+// On a hit: *prealm/*pperiod/*pzonegroup are filled and *pfound is true.
+// Realms that fail to open are skipped with a warning; a clean scan with
+// no match returns 0 with *pfound == false.
+int RGWSI_Zone::search_realm_with_zone(const DoutPrefixProvider *dpp,
+ const rgw_zone_id& zid,
+ RGWRealm *prealm,
+ RGWPeriod *pperiod,
+ RGWZoneGroup *pzonegroup,
+ bool *pfound,
+ optional_yield y)
+{
+ auto& found = *pfound;
+
+ found = false;
+
+ list<string> realms;
+ int r = list_realms(dpp, realms);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: failed to list realms: r=" << r << dendl;
+ return r;
+ }
+
+ for (auto& realm_name : realms) {
+ // Empty id: realm.init() resolves the realm by name.
+ string realm_id;
+ RGWRealm realm(realm_id, realm_name);
+ r = realm.init(dpp, cct, sysobj_svc, y);
+ if (r < 0) {
+ ldpp_dout(dpp, 0) << "WARNING: can't open realm " << realm_name << ": " << cpp_strerror(-r) << " ... skipping" << dendl;
+ continue;
+ }
+
+ r = realm.find_zone(dpp, zid, pperiod,
+ pzonegroup, &found, y);
+ if (r < 0) {
+ ldpp_dout(dpp, 20) << __func__ << "(): ERROR: realm.find_zone() returned r=" << r<< dendl;
+ return r;
+ }
+
+ if (found) {
+ *prealm = realm;
+ ldpp_dout(dpp, 20) << __func__ << "(): found realm_id=" << realm_id << " realm_name=" << realm_name << dendl;
+ return 0;
+ }
+ }
+
+ return 0;
+}
+
+int RGWSI_Zone::do_start(optional_yield y, const DoutPrefixProvider *dpp)
+{
+ int ret = sysobj_svc->start(y, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ assert(sysobj_svc->is_started()); /* if not then there's ordering issue */
+
+ ret = rados_svc->start(y, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ ret = realm->init(dpp, cct, sysobj_svc, y);
+ if (ret < 0 && ret != -ENOENT) {
+ ldpp_dout(dpp, 0) << "failed reading realm info: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ ldpp_dout(dpp, 20) << "realm " << realm->get_name() << " " << realm->get_id() << dendl;
+ ret = current_period->init(dpp, cct, sysobj_svc, realm->get_id(), y,
+ realm->get_name());
+ if (ret < 0 && ret != -ENOENT) {
+ ldpp_dout(dpp, 0) << "failed reading current period info: " << " " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ ret = zone_params->init(dpp, cct, sysobj_svc, y);
+ bool found_zone = (ret == 0);
+ if (ret < 0 && ret != -ENOENT) {
+ lderr(cct) << "failed reading zone info: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ cur_zone_id = rgw_zone_id(zone_params->get_id());
+
+ bool found_period_conf = false;
+
+ /* try to find zone in period config (if we have one) */
+ if (found_zone &&
+ !current_period->get_id().empty()) {
+ found_period_conf = current_period->find_zone(dpp,
+ cur_zone_id,
+ zonegroup,
+ y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: current_period->find_zone() returned ret=" << ret << dendl;
+ return ret;
+ }
+ if (!found_period_conf) {
+ ldpp_dout(dpp, 0) << "period (" << current_period->get_id() << " does not have zone " << cur_zone_id << " configured" << dendl;
+ }
+ }
+
+ RGWRealm search_realm;
+
+ if (found_zone &&
+ !found_period_conf) {
+ ldpp_dout(dpp, 20) << "searching for the correct realm" << dendl;
+ ret = search_realm_with_zone(dpp,
+ cur_zone_id,
+ realm,
+ current_period,
+ zonegroup,
+ &found_period_conf,
+ y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "ERROR: search_realm_conf() failed: ret="<< ret << dendl;
+ return ret;
+ }
+ }
+ bool zg_initialized = found_period_conf;
+
+ if (!zg_initialized) {
+ /* couldn't find a proper period config, use local zonegroup */
+ ret = zonegroup->init(dpp, cct, sysobj_svc, y);
+ zg_initialized = (ret == 0);
+ if (ret < 0 && ret != -ENOENT) {
+ ldpp_dout(dpp, 0) << "failed reading zonegroup info: " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+ }
+
+ auto& zonegroup_param = cct->_conf->rgw_zonegroup;
+ bool init_from_period = found_period_conf;
+ bool explicit_zg = !zonegroup_param.empty();
+
+ if (!zg_initialized &&
+ (!explicit_zg || zonegroup_param == default_zonegroup_name)) {
+ /* we couldn't initialize any zonegroup,
+ falling back to a non-multisite config with default zonegroup */
+ ret = create_default_zg(dpp, y);
+ if (ret < 0) {
+ return ret;
+ }
+ zg_initialized = true;
+ }
+
+ if (!zg_initialized) {
+ ldpp_dout(dpp, 0) << "ERROR: could not find zonegroup (" << zonegroup_param << ")" << dendl;
+ return -ENOENT;
+ }
+
+ /* we have zonegroup now */
+
+ if (explicit_zg &&
+ zonegroup->get_name() != zonegroup_param) {
+ ldpp_dout(dpp, 0) << "ERROR: incorrect zonegroup: " << zonegroup_param << " (got: " << zonegroup_param << ", expected: " << zonegroup->get_name() << ")" << dendl;
+ return -EINVAL;
+ }
+
+ auto& zone_param = cct->_conf->rgw_zone;
+ bool explicit_zone = !zone_param.empty();
+
+ if (!found_zone) {
+ if ((!explicit_zone || zone_param == default_zone_name) &&
+ zonegroup->get_name() == default_zonegroup_name) {
+ ret = init_default_zone(dpp, y);
+ if (ret < 0 && ret != -ENOENT) {
+ return ret;
+ }
+ cur_zone_id = zone_params->get_id();
+ } else {
+ ldpp_dout(dpp, 0) << "ERROR: could not find zone (" << zone_param << ")" << dendl;
+ return -ENOENT;
+ }
+ }
+
+ /* we have zone now */
+
+ auto zone_iter = zonegroup->zones.find(zone_params->get_id());
+ if (zone_iter == zonegroup->zones.end()) {
+ /* shouldn't happen if relying on period config */
+ if (!init_from_period) {
+ ldpp_dout(dpp, -1) << "Cannot find zone id=" << zone_params->get_id() << " (name=" << zone_params->get_name() << ")" << dendl;
+ return -EINVAL;
+ }
+ ldpp_dout(dpp, 1) << "Cannot find zone id=" << zone_params->get_id() << " (name=" << zone_params->get_name() << "), switching to local zonegroup configuration" << dendl;
+ init_from_period = false;
+ zone_iter = zonegroup->zones.find(zone_params->get_id());
+ }
+ if (zone_iter == zonegroup->zones.end()) {
+ ldpp_dout(dpp, -1) << "Cannot find zone id=" << zone_params->get_id() << " (name=" << zone_params->get_name() << ")" << dendl;
+ return -EINVAL;
+ }
+ *zone_public_config = zone_iter->second;
+ ldout(cct, 20) << "zone " << zone_params->get_name() << " found" << dendl;
+
+ ldpp_dout(dpp, 4) << "Realm: " << std::left << setw(20) << realm->get_name() << " (" << realm->get_id() << ")" << dendl;
+ ldpp_dout(dpp, 4) << "ZoneGroup: " << std::left << setw(20) << zonegroup->get_name() << " (" << zonegroup->get_id() << ")" << dendl;
+ ldpp_dout(dpp, 4) << "Zone: " << std::left << setw(20) << zone_params->get_name() << " (" << zone_params->get_id() << ")" << dendl;
+
+ if (init_from_period) {
+ ldpp_dout(dpp, 4) << "using period configuration: " << current_period->get_id() << ":" << current_period->get_epoch() << dendl;
+ ret = init_zg_from_period(dpp, y);
+ if (ret < 0) {
+ return ret;
+ }
+ } else {
+ ldout(cct, 10) << "cannot find current period zonegroup using local zonegroup configuration" << dendl;
+ ret = init_zg_from_local(dpp, y);
+ if (ret < 0) {
+ return ret;
+ }
+ // read period_config into current_period
+ auto& period_config = current_period->get_config();
+ ret = period_config.read(dpp, sysobj_svc, zonegroup->realm_id, y);
+ if (ret < 0 && ret != -ENOENT) {
+ ldout(cct, 0) << "ERROR: failed to read period config: "
+ << cpp_strerror(ret) << dendl;
+ return ret;
+ }
+ }
+
+ zone_short_id = current_period->get_map().get_zone_short_id(zone_params->get_id());
+
+ for (auto ziter : zonegroup->zones) {
+ auto zone_handler = std::make_shared<RGWBucketSyncPolicyHandler>(this, sync_modules_svc, bucket_sync_svc, ziter.second.id);
+ ret = zone_handler->init(dpp, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, -1) << "ERROR: could not initialize zone policy handler for zone=" << ziter.second.name << dendl;
+ return ret;
+ }
+ sync_policy_handlers[ziter.second.id] = zone_handler;
+ }
+
+ sync_policy_handler = sync_policy_handlers[zone_id()]; /* we made sure earlier that zonegroup->zones has our zone */
+
+ set<rgw_zone_id> source_zones;
+ set<rgw_zone_id> target_zones;
+
+ sync_policy_handler->reflect(dpp, nullptr, nullptr,
+ nullptr, nullptr,
+ &source_zones,
+ &target_zones,
+ false); /* relaxed: also get all zones that we allow to sync to/from */
+
+ ret = sync_modules_svc->start(y, dpp);
+ if (ret < 0) {
+ return ret;
+ }
+
+ auto sync_modules = sync_modules_svc->get_manager();
+ RGWSyncModuleRef sm;
+ if (!sync_modules->get_module(zone_public_config->tier_type, &sm)) {
+ ldpp_dout(dpp, -1) << "ERROR: tier type not found: " << zone_public_config->tier_type << dendl;
+ return -EINVAL;
+ }
+
+ writeable_zone = sm->supports_writes();
+ exports_data = sm->supports_data_export();
+
+ /* first build all zones index */
+ for (auto ziter : zonegroup->zones) {
+ const rgw_zone_id& id = ziter.first;
+ RGWZone& z = ziter.second;
+ zone_id_by_name[z.name] = id;
+ zone_by_id[id] = z;
+ }
+
+ if (zone_by_id.find(zone_id()) == zone_by_id.end()) {
+ ldpp_dout(dpp, 0) << "WARNING: could not find zone config in zonegroup for local zone (" << zone_id() << "), will use defaults" << dendl;
+ }
+
+ for (const auto& ziter : zonegroup->zones) {
+ const rgw_zone_id& id = ziter.first;
+ const RGWZone& z = ziter.second;
+ if (id == zone_id()) {
+ continue;
+ }
+ if (z.endpoints.empty()) {
+ ldpp_dout(dpp, 0) << "WARNING: can't generate connection for zone " << z.id << " id " << z.name << ": no endpoints defined" << dendl;
+ continue;
+ }
+ ldpp_dout(dpp, 20) << "generating connection object for zone " << z.name << " id " << z.id << dendl;
+ RGWRESTConn *conn = new RGWRESTConn(cct, z.id, z.endpoints, zone_params->system_key, zonegroup->get_id(), zonegroup->api_name);
+ zone_conn_map[id] = conn;
+
+ bool zone_is_source = source_zones.find(z.id) != source_zones.end();
+ bool zone_is_target = target_zones.find(z.id) != target_zones.end();
+
+ if (zone_is_source || zone_is_target) {
+ if (zone_is_source && sync_modules->supports_data_export(z.tier_type)) {
+ data_sync_source_zones.push_back(&z);
+ }
+ if (zone_is_target) {
+ zone_data_notify_to_map[id] = conn;
+ }
+ } else {
+ ldpp_dout(dpp, 20) << "NOTICE: not syncing to/from zone " << z.name << " id " << z.id << dendl;
+ }
+ }
+
+ ldpp_dout(dpp, 20) << "started zone id=" << zone_params->get_id() << " (name=" << zone_params->get_name() <<
+ ") with tier type = " << zone_public_config->tier_type << dendl;
+
+ return 0;
+}
+
+void RGWSI_Zone::shutdown()
+{
+ delete rest_master_conn;
+
+ for (auto& item : zone_conn_map) {
+ auto conn = item.second;
+ delete conn;
+ }
+
+ for (auto& item : zonegroup_conn_map) {
+ auto conn = item.second;
+ delete conn;
+ }
+}
+
+int RGWSI_Zone::list_regions(const DoutPrefixProvider *dpp, list<string>& regions)
+{
+ RGWZoneGroup zonegroup;
+ RGWSI_SysObj::Pool syspool = sysobj_svc->get_pool(zonegroup.get_pool(cct));
+
+ return syspool.list_prefixed_objs(dpp, region_info_oid_prefix, &regions);
+}
+
+int RGWSI_Zone::list_zonegroups(const DoutPrefixProvider *dpp, list<string>& zonegroups)
+{
+ RGWZoneGroup zonegroup;
+ RGWSI_SysObj::Pool syspool = sysobj_svc->get_pool(zonegroup.get_pool(cct));
+
+ return syspool.list_prefixed_objs(dpp, zonegroup_names_oid_prefix, &zonegroups);
+}
+
+int RGWSI_Zone::list_zones(const DoutPrefixProvider *dpp, list<string>& zones)
+{
+ RGWZoneParams zoneparams;
+ RGWSI_SysObj::Pool syspool = sysobj_svc->get_pool(zoneparams.get_pool(cct));
+
+ return syspool.list_prefixed_objs(dpp, zone_names_oid_prefix, &zones);
+}
+
+int RGWSI_Zone::list_realms(const DoutPrefixProvider *dpp, list<string>& realms)
+{
+ RGWRealm realm(cct, sysobj_svc);
+ RGWSI_SysObj::Pool syspool = sysobj_svc->get_pool(realm.get_pool(cct));
+
+ return syspool.list_prefixed_objs(dpp, realm_names_oid_prefix, &realms);
+}
+
+int RGWSI_Zone::list_periods(const DoutPrefixProvider *dpp, list<string>& periods)
+{
+ RGWPeriod period;
+ list<string> raw_periods;
+ RGWSI_SysObj::Pool syspool = sysobj_svc->get_pool(period.get_pool(cct));
+ int ret = syspool.list_prefixed_objs(dpp, period.get_info_oid_prefix(), &raw_periods);
+ if (ret < 0) {
+ return ret;
+ }
+ for (const auto& oid : raw_periods) {
+ size_t pos = oid.find(".");
+ if (pos != std::string::npos) {
+ periods.push_back(oid.substr(0, pos));
+ } else {
+ periods.push_back(oid);
+ }
+ }
+ periods.sort(); // unique() only detects duplicates if they're adjacent
+ periods.unique();
+ return 0;
+}
+
+
+int RGWSI_Zone::list_periods(const DoutPrefixProvider *dpp, const string& current_period, list<string>& periods, optional_yield y)
+{
+ int ret = 0;
+ string period_id = current_period;
+ while(!period_id.empty()) {
+ RGWPeriod period(period_id);
+ ret = period.init(dpp, cct, sysobj_svc, y);
+ if (ret < 0) {
+ return ret;
+ }
+ periods.push_back(period.get_id());
+ period_id = period.get_predecessor();
+ }
+
+ return ret;
+}
+
+/**
+ * Add new connection to connections map
+ * @param zonegroup_conn_map map which new connection will be added to
+ * @param zonegroup zonegroup which new connection will connect to
+ * @param new_connection pointer to new connection instance
+ */
+static void add_new_connection_to_map(map<string, RGWRESTConn *> &zonegroup_conn_map,
+ const RGWZoneGroup &zonegroup, RGWRESTConn *new_connection)
+{
+ // Delete if connection is already exists
+ map<string, RGWRESTConn *>::iterator iterZoneGroup = zonegroup_conn_map.find(zonegroup.get_id());
+ if (iterZoneGroup != zonegroup_conn_map.end()) {
+ delete iterZoneGroup->second;
+ }
+
+ // Add new connection to connections map
+ zonegroup_conn_map[zonegroup.get_id()] = new_connection;
+}
+
/* Rebuild zonegroup state from the current period's zonegroup map: adopt the
 * period's copy of our own zonegroup, repair zonegroups that lack a
 * master_zone designation when they hold exactly one zone, and build a REST
 * connection to every zonegroup's master zone (plus rest_master_conn for the
 * period's master zonegroup). */
int RGWSI_Zone::init_zg_from_period(const DoutPrefixProvider *dpp, optional_yield y)
{
  ldout(cct, 20) << "period zonegroup name " << zonegroup->get_name() << dendl;

  map<string, RGWZoneGroup>::const_iterator iter =
    current_period->get_map().zonegroups.find(zonegroup->get_id());

  if (iter != current_period->get_map().zonegroups.end()) {
    ldpp_dout(dpp, 20) << "using current period zonegroup " << zonegroup->get_name() << dendl;
    // replace the locally-loaded zonegroup with the period's version;
    // init(.., false) presumably refreshes state without re-reading by
    // name — TODO confirm against RGWZoneGroup::init
    *zonegroup = iter->second;
    int ret = zonegroup->init(dpp, cct, sysobj_svc, y, false);
    if (ret < 0) {
      ldpp_dout(dpp, 0) << "failed init zonegroup: " << " " << cpp_strerror(-ret) << dendl;
      return ret;
    }
  }
  for (iter = current_period->get_map().zonegroups.begin();
       iter != current_period->get_map().zonegroups.end(); ++iter){
    const RGWZoneGroup& zg = iter->second;
    // use endpoints from the zonegroup's master zone
    auto master = zg.zones.find(zg.master_zone);
    if (master == zg.zones.end()) {
      // Check for empty zonegroup which can happen if zone was deleted before removal
      if (zg.zones.size() == 0)
        continue;
      // fix missing master zone for a single zone zonegroup
      if (zg.master_zone.empty() && zg.zones.size() == 1) {
        master = zg.zones.begin();
        ldpp_dout(dpp, 0) << "zonegroup " << zg.get_name() << " missing master_zone, setting zone " <<
          master->second.name << " id:" << master->second.id << " as master" << dendl;
        if (zonegroup->get_id() == zg.get_id()) {
          // the repaired zonegroup is our own: persist through our handle
          zonegroup->master_zone = master->second.id;
          int ret = zonegroup->update(dpp, y);
          if (ret < 0) {
            ldpp_dout(dpp, 0) << "error updating zonegroup : " << cpp_strerror(-ret) << dendl;
            return ret;
          }
        } else {
          // a different zonegroup: load a fresh handle to persist the fix
          RGWZoneGroup fixed_zg(zg.get_id(),zg.get_name());
          int ret = fixed_zg.init(dpp, cct, sysobj_svc, y);
          if (ret < 0) {
            ldpp_dout(dpp, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl;
            return ret;
          }
          fixed_zg.master_zone = master->second.id;
          ret = fixed_zg.update(dpp, y);
          if (ret < 0) {
            // NOTE(review): this failure comes from update(), not
            // initialization — the message text looks copy-pasted
            ldpp_dout(dpp, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl;
            return ret;
          }
        }
      } else {
        ldpp_dout(dpp, 0) << "zonegroup " << zg.get_name() << " missing zone for master_zone=" <<
          zg.master_zone << dendl;
        return -EINVAL;
      }
    }
    const auto& endpoints = master->second.endpoints;
    add_new_connection_to_map(zonegroup_conn_map, zg, new RGWRESTConn(cct, zg.get_id(), endpoints, zone_params->system_key, zonegroup->get_id(), zg.api_name));
    if (!current_period->get_master_zonegroup().empty() &&
        zg.get_id() == current_period->get_master_zonegroup()) {
      // remember the dedicated connection to the metadata master zonegroup
      rest_master_conn = new RGWRESTConn(cct, zg.get_id(), endpoints, zone_params->system_key, zonegroup->get_id(), zg.api_name);
    }
  }

  return 0;
}
+
+int RGWSI_Zone::create_default_zg(const DoutPrefixProvider *dpp, optional_yield y)
+{
+ ldout(cct, 10) << "Creating default zonegroup " << dendl;
+ int ret = zonegroup->create_default(dpp, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "failure in zonegroup create_default: ret "<< ret << " " << cpp_strerror(-ret)
+ << dendl;
+ return ret;
+ }
+ ret = zonegroup->init(dpp, cct, sysobj_svc, y);
+ if (ret < 0) {
+ ldout(cct, 0) << "failure in zonegroup create_default: ret "<< ret << " " << cpp_strerror(-ret)
+ << dendl;
+ return ret;
+ }
+
+ return 0;
+}
+
+int RGWSI_Zone::init_default_zone(const DoutPrefixProvider *dpp, optional_yield y)
+{
+ ldpp_dout(dpp, 10) << " Using default name "<< default_zone_name << dendl;
+ zone_params->set_name(default_zone_name);
+ int ret = zone_params->init(dpp, cct, sysobj_svc, y);
+ if (ret < 0 && ret != -ENOENT) {
+ ldpp_dout(dpp, 0) << "failed reading zone params info: " << " " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+
+ return 0;
+}
+
+int RGWSI_Zone::init_zg_from_local(const DoutPrefixProvider *dpp, optional_yield y)
+{
+ ldpp_dout(dpp, 20) << "zonegroup " << zonegroup->get_name() << dendl;
+ if (zonegroup->is_master_zonegroup()) {
+ // use endpoints from the zonegroup's master zone
+ auto master = zonegroup->zones.find(zonegroup->master_zone);
+ if (master == zonegroup->zones.end()) {
+ // fix missing master zone for a single zone zonegroup
+ if (zonegroup->master_zone.empty() && zonegroup->zones.size() == 1) {
+ master = zonegroup->zones.begin();
+ ldpp_dout(dpp, 0) << "zonegroup " << zonegroup->get_name() << " missing master_zone, setting zone " <<
+ master->second.name << " id:" << master->second.id << " as master" << dendl;
+ zonegroup->master_zone = master->second.id;
+ int ret = zonegroup->update(dpp, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl;
+ return ret;
+ }
+ } else {
+ ldpp_dout(dpp, 0) << "zonegroup " << zonegroup->get_name() << " missing zone for "
+ "master_zone=" << zonegroup->master_zone << dendl;
+ return -EINVAL;
+ }
+ }
+ const auto& endpoints = master->second.endpoints;
+ rest_master_conn = new RGWRESTConn(cct, zonegroup->get_id(), endpoints, zone_params->system_key, zonegroup->get_id(), zonegroup->api_name);
+ }
+
+ return 0;
+}
+
// Internal configuration of the local zone (rados pools, system key, ...).
const RGWZoneParams& RGWSI_Zone::get_zone_params() const
{
  return *zone_params;
}
+
// Public-facing configuration of the local zone (endpoints, log flags, ...).
const RGWZone& RGWSI_Zone::get_zone() const
{
  return *zone_public_config;
}
+
// Zonegroup the local zone belongs to.
const RGWZoneGroup& RGWSI_Zone::get_zonegroup() const
{
  return *zonegroup;
}
+
+int RGWSI_Zone::get_zonegroup(const string& id, RGWZoneGroup& zg) const
+{
+ int ret = 0;
+ if (id == zonegroup->get_id()) {
+ zg = *zonegroup;
+ } else if (!current_period->get_id().empty()) {
+ ret = current_period->get_zonegroup(zg, id);
+ }
+ return ret;
+}
+
// Realm the local zone belongs to.
const RGWRealm& RGWSI_Zone::get_realm() const
{
  return *realm;
}
+
// Current period of the realm; its id is empty when not running under a realm.
const RGWPeriod& RGWSI_Zone::get_current_period() const
{
  return *current_period;
}
+
// Id of the current period; empty string when not running under a realm.
const string& RGWSI_Zone::get_current_period_id() const
{
  return current_period->get_id();
}
+
+bool RGWSI_Zone::has_zonegroup_api(const std::string& api) const
+{
+ if (!current_period->get_id().empty()) {
+ const auto& zonegroups_by_api = current_period->get_map().zonegroups_by_api;
+ if (zonegroups_by_api.find(api) != zonegroups_by_api.end())
+ return true;
+ } else if (zonegroup->api_name == api) {
+ return true;
+ }
+ return false;
+}
+
// The zone accepts writes when the active sync module supports them and the
// zone itself is not flagged read-only.
bool RGWSI_Zone::zone_is_writeable()
{
  return writeable_zone && !get_zone().is_read_only();
}
+
// Short (32-bit) id for the local zone, taken from the period map at startup.
uint32_t RGWSI_Zone::get_zone_short_id() const
{
  return zone_short_id;
}
+
// Name of the local zone, as recorded in its zone params.
const string& RGWSI_Zone::zone_name() const
{
  return get_zone_params().get_name();
}
+
+RGWZone* RGWSI_Zone::find_zone(const rgw_zone_id& id)
+{
+ auto iter = zone_by_id.find(id);
+ if (iter == zone_by_id.end()) {
+ return nullptr;
+ }
+ return &(iter->second);
+}
+
+RGWRESTConn *RGWSI_Zone::get_zone_conn(const rgw_zone_id& zone_id) {
+ auto citer = zone_conn_map.find(zone_id.id);
+ if (citer == zone_conn_map.end()) {
+ return NULL;
+ }
+
+ return citer->second;
+}
+
+RGWRESTConn *RGWSI_Zone::get_zone_conn_by_name(const string& name) {
+ auto i = zone_id_by_name.find(name);
+ if (i == zone_id_by_name.end()) {
+ return NULL;
+ }
+
+ return get_zone_conn(i->second);
+}
+
+bool RGWSI_Zone::find_zone_id_by_name(const string& name, rgw_zone_id *id) {
+ auto i = zone_id_by_name.find(name);
+ if (i == zone_id_by_name.end()) {
+ return false;
+ }
+ *id = i->second;
+ return true;
+}
+
+bool RGWSI_Zone::need_to_sync() const
+{
+ return !(zonegroup->master_zone.empty() ||
+ !rest_master_conn ||
+ current_period->get_id().empty());
+}
+
// Write data-change log entries only when the zone's log_data flag is set
// and the active sync module exports data.
bool RGWSI_Zone::need_to_log_data() const
{
  return (zone_public_config->log_data && sync_module_exports_data());
}
+
+bool RGWSI_Zone::is_meta_master() const
+{
+ if (!zonegroup->is_master_zonegroup()) {
+ return false;
+ }
+
+ return (zonegroup->master_zone == zone_public_config->id);
+}
+
// Only the metadata master logs metadata, and only when there are peers
// (multiple zones, or multiple zonegroups that contain zones) to sync to.
bool RGWSI_Zone::need_to_log_metadata() const
{
  return is_meta_master() &&
    (zonegroup->zones.size() > 1 || current_period->is_multi_zonegroups_with_zones());
}
+
+bool RGWSI_Zone::can_reshard() const
+{
+ if (current_period->get_id().empty()) {
+ return true; // no realm
+ }
+ if (zonegroup->zones.size() == 1 && current_period->is_single_zonegroup()) {
+ return true; // single zone/zonegroup
+ }
+ // 'resharding' feature enabled in zonegroup
+ return zonegroup->supports(rgw::zone_features::resharding);
+}
+
+/**
+ * Check to see if the bucket metadata could be synced
+ * bucket: the bucket to check
+ * Returns false is the bucket is not synced
+ */
+bool RGWSI_Zone::is_syncing_bucket_meta(const rgw_bucket& bucket)
+{
+
+ /* no current period */
+ if (current_period->get_id().empty()) {
+ return false;
+ }
+
+ /* zonegroup is not master zonegroup */
+ if (!zonegroup->is_master_zonegroup()) {
+ return false;
+ }
+
+ /* single zonegroup and a single zone */
+ if (current_period->is_single_zonegroup() && zonegroup->zones.size() == 1) {
+ return false;
+ }
+
+ /* zone is not master */
+ if (zonegroup->master_zone != zone_public_config->id) {
+ return false;
+ }
+
+ return true;
+}
+
+
/* Resolve the placement rule for a new bucket and return the matching zone
 * placement info.  Rule precedence: explicit request rule, then the user's
 * default rule, then the zonegroup's default rule.  The chosen rule must
 * exist in the zonegroup and the user must be permitted (by placement tags)
 * to use it.  The storage class comes from the request when given,
 * otherwise from the selected rule. */
int RGWSI_Zone::select_new_bucket_location(const DoutPrefixProvider *dpp, const RGWUserInfo& user_info, const string& zonegroup_id,
                                           const rgw_placement_rule& request_rule,
                                           rgw_placement_rule *pselected_rule_name, RGWZonePlacementInfo *rule_info,
                                           optional_yield y)
{
  /* first check that zonegroup exists within current period. */
  RGWZoneGroup zonegroup;
  int ret = get_zonegroup(zonegroup_id, zonegroup);
  if (ret < 0) {
    ldpp_dout(dpp, 0) << "could not find zonegroup " << zonegroup_id << " in current period" << dendl;
    return ret;
  }

  const rgw_placement_rule *used_rule;

  /* find placement rule. Hierarchy: request rule > user default rule > zonegroup default rule */
  std::map<std::string, RGWZoneGroupPlacementTarget>::const_iterator titer;

  if (!request_rule.name.empty()) {
    used_rule = &request_rule;
    titer = zonegroup.placement_targets.find(request_rule.name);
    if (titer == zonegroup.placement_targets.end()) {
      ldpp_dout(dpp, 0) << "could not find requested placement id " << request_rule 
                    << " within zonegroup " << dendl;
      return -ERR_INVALID_LOCATION_CONSTRAINT;
    }
  } else if (!user_info.default_placement.name.empty()) {
    used_rule = &user_info.default_placement;
    titer = zonegroup.placement_targets.find(user_info.default_placement.name);
    if (titer == zonegroup.placement_targets.end()) {
      ldpp_dout(dpp, 0) << "could not find user default placement id " << user_info.default_placement
                    << " within zonegroup " << dendl;
      return -ERR_INVALID_LOCATION_CONSTRAINT;
    }
  } else {
    if (zonegroup.default_placement.name.empty()) { // zonegroup default rule as fallback, it should not be empty.
      ldpp_dout(dpp, 0) << "misconfiguration, zonegroup default placement id should not be empty." << dendl;
      return -ERR_ZONEGROUP_DEFAULT_PLACEMENT_MISCONFIGURATION;
    } else {
      used_rule = &zonegroup.default_placement;
      titer = zonegroup.placement_targets.find(zonegroup.default_placement.name);
      if (titer == zonegroup.placement_targets.end()) {
        ldpp_dout(dpp, 0) << "could not find zonegroup default placement id " << zonegroup.default_placement
                      << " within zonegroup " << dendl;
        return -ERR_INVALID_LOCATION_CONSTRAINT;
      }
    }
  }

  /* now check tag for the rule, whether user is permitted to use rule */
  const auto& target_rule = titer->second;
  if (!target_rule.user_permitted(user_info.placement_tags)) {
    ldpp_dout(dpp, 0) << "user not permitted to use placement rule " << titer->first  << dendl;
    return -EPERM;
  }

  // storage class: an explicit request value wins, otherwise inherit from
  // whichever rule was selected above
  const string *storage_class = &request_rule.storage_class;

  if (storage_class->empty()) {
    storage_class = &used_rule->storage_class;
  }

  rgw_placement_rule rule(titer->first, *storage_class);

  if (pselected_rule_name) {
    *pselected_rule_name = rule;
  }

  return select_bucket_location_by_rule(dpp, rule, rule_info, y);
}
+
/* Validate that 'location_rule' (and its storage class) is configured on
 * the local zone and return its placement info.  An empty rule name means
 * a legacy bucket, which falls back to legacy pool selection. */
int RGWSI_Zone::select_bucket_location_by_rule(const DoutPrefixProvider *dpp, const rgw_placement_rule& location_rule, RGWZonePlacementInfo *rule_info, optional_yield y)
{
  if (location_rule.name.empty()) {
    /* we can only reach here if we're trying to set a bucket location from a bucket
     * created on a different zone, using a legacy / default pool configuration
     */
    if (rule_info) {
      return select_legacy_bucket_placement(dpp, rule_info, y);
    }

    return 0;
  }

  /*
   * make sure that zone has this rule configured. We're
   * checking it for the local zone, because that's where this bucket object is going to
   * reside.
   */
  auto piter = zone_params->placement_pools.find(location_rule.name);
  if (piter == zone_params->placement_pools.end()) {
    /* couldn't find, means we cannot really place data for this bucket in this zone */
    ldpp_dout(dpp, 0) << "ERROR: This zone does not contain placement rule "
                  << location_rule << " present in the zonegroup!" << dendl;
    return -EINVAL;
  }

  auto storage_class = location_rule.get_storage_class();
  if (!piter->second.storage_class_exists(storage_class)) {
    ldpp_dout(dpp, 5) << "requested storage class does not exist: " << storage_class << dendl;
    return -EINVAL;
  }


  RGWZonePlacementInfo& placement_info = piter->second;

  if (rule_info) {
    *rule_info = placement_info;
  }

  return 0;
}
+
/* Entry point for choosing a bucket's placement.  When placement pools are
 * configured on the local zone, defer to full rule resolution; otherwise
 * clear any selected rule and fall back to legacy pool selection. */
int RGWSI_Zone::select_bucket_placement(const DoutPrefixProvider *dpp, const RGWUserInfo& user_info, const string& zonegroup_id,
                                        const rgw_placement_rule& placement_rule,
                                        rgw_placement_rule *pselected_rule, RGWZonePlacementInfo *rule_info,
                                        optional_yield y)
{
  if (!zone_params->placement_pools.empty()) {
    return select_new_bucket_location(dpp, user_info, zonegroup_id, placement_rule,
                                      pselected_rule, rule_info, y);
  }

  if (pselected_rule) {
    pselected_rule->clear();
  }

  if (rule_info) {
    return select_legacy_bucket_placement(dpp, rule_info, y);
  }

  return 0;
}
+
/* Legacy placement: pick a pool from the 'avail_pools' object.  The pool
 * list is read from the object's data blob when present, otherwise from its
 * omap (and then cached back into the data blob).  If no pools are
 * registered at all, a default pool is created and registered.  When more
 * than one pool is available, one is chosen at random. */
int RGWSI_Zone::select_legacy_bucket_placement(const DoutPrefixProvider *dpp, RGWZonePlacementInfo *rule_info,
                                               optional_yield y)
{
  bufferlist map_bl;
  map<string, bufferlist> m;
  string pool_name;
  bool write_map = false;

  rgw_raw_obj obj(zone_params->domain_root, avail_pools);

  auto sysobj = sysobj_svc->get_obj(obj);
  int ret = sysobj.rop().read(dpp, &map_bl, y);
  if (ret < 0) {
    goto read_omap;
  }

  try {
    auto iter = map_bl.cbegin();
    decode(m, iter);
  } catch (buffer::error& err) {
    // a decode failure leaves 'm' empty, so we fall through to the omap path
    ldpp_dout(dpp, 0) << "ERROR: couldn't decode avail_pools" << dendl;
  }

read_omap:
  if (m.empty()) {
    ret = sysobj.omap().get_all(dpp, &m, y);

    // remember to cache the omap contents back into the data blob below
    write_map = true;
  }

  if (ret < 0 || m.empty()) {
    // no pools registered anywhere: create and register a default pool
    vector<rgw_pool> pools;
    string s = string("default.") + default_storage_pool_suffix;
    pools.push_back(rgw_pool(s));
    vector<int> retcodes;
    bufferlist bl;
    ret = rados_svc->pool().create(dpp, pools, &retcodes);
    if (ret < 0)
      return ret;
    ret = sysobj.omap().set(dpp, s, bl, y);
    if (ret < 0)
      return ret;
    m[s] = bl;
  }

  if (write_map) {
    bufferlist new_bl;
    encode(m, new_bl);
    ret = sysobj.wop().write(dpp, new_bl, y);
    if (ret < 0) {
      ldpp_dout(dpp, 0) << "WARNING: could not save avail pools map info ret=" << ret << dendl;
    }
  }

  auto miter = m.begin();
  if (m.size() > 1) {
    // choose a pool at random
    auto r = ceph::util::generate_random_number<size_t>(0, m.size() - 1);
    std::advance(miter, r);
  }
  pool_name = miter->first;

  rgw_pool pool = pool_name;

  rule_info->storage_classes.set_storage_class(RGW_STORAGE_CLASS_STANDARD, &pool, nullptr);
  rule_info->data_extra_pool = pool_name;
  rule_info->index_pool = pool_name;
  rule_info->index_type = rgw::BucketIndexType::Normal;

  return 0;
}
+
+int RGWSI_Zone::update_placement_map(const DoutPrefixProvider *dpp, optional_yield y)
+{
+ bufferlist header;
+ map<string, bufferlist> m;
+ rgw_raw_obj obj(zone_params->domain_root, avail_pools);
+
+ auto sysobj = sysobj_svc->get_obj(obj);
+ int ret = sysobj.omap().get_all(dpp, &m, y);
+ if (ret < 0)
+ return ret;
+
+ bufferlist new_bl;
+ encode(m, new_bl);
+ ret = sysobj.wop().write(dpp, new_bl, y);
+ if (ret < 0) {
+ ldpp_dout(dpp, 0) << "WARNING: could not save avail pools map info ret=" << ret << dendl;
+ }
+
+ return ret;
+}
+
+int RGWSI_Zone::add_bucket_placement(const DoutPrefixProvider *dpp, const rgw_pool& new_pool, optional_yield y)
+{
+ int ret = rados_svc->pool(new_pool).lookup();
+ if (ret < 0) { // DNE, or something
+ return ret;
+ }
+
+ rgw_raw_obj obj(zone_params->domain_root, avail_pools);
+ auto sysobj = sysobj_svc->get_obj(obj);
+
+ bufferlist empty_bl;
+ ret = sysobj.omap().set(dpp, new_pool.to_str(), empty_bl, y);
+
+ // don't care about return value
+ update_placement_map(dpp, y);
+
+ return ret;
+}
+
/* Drop a pool from the legacy avail_pools omap.  Refreshing the cached map
 * afterwards is best-effort; the omap delete result is what is returned. */
int RGWSI_Zone::remove_bucket_placement(const DoutPrefixProvider *dpp, const rgw_pool& old_pool, optional_yield y)
{
  rgw_raw_obj obj(zone_params->domain_root, avail_pools);
  auto sysobj = sysobj_svc->get_obj(obj);
  int ret = sysobj.omap().del(dpp, old_pool.to_str(), y);

  // don't care about return value
  update_placement_map(dpp, y);

  return ret;
}
+
+int RGWSI_Zone::list_placement_set(const DoutPrefixProvider *dpp, set<rgw_pool>& names, optional_yield y)
+{
+ bufferlist header;
+ map<string, bufferlist> m;
+
+ rgw_raw_obj obj(zone_params->domain_root, avail_pools);
+ auto sysobj = sysobj_svc->get_obj(obj);
+ int ret = sysobj.omap().get_all(dpp, &m, y);
+ if (ret < 0)
+ return ret;
+
+ names.clear();
+ map<string, bufferlist>::iterator miter;
+ for (miter = m.begin(); miter != m.end(); ++miter) {
+ names.insert(rgw_pool(miter->first));
+ }
+
+ return names.size();
+}
+
+bool RGWSI_Zone::get_redirect_zone_endpoint(string *endpoint)
+{
+ if (zone_public_config->redirect_zone.empty()) {
+ return false;
+ }
+
+ auto iter = zone_conn_map.find(zone_public_config->redirect_zone);
+ if (iter == zone_conn_map.end()) {
+ ldout(cct, 0) << "ERROR: cannot find entry for redirect zone: " << zone_public_config->redirect_zone << dendl;
+ return false;
+ }
+
+ RGWRESTConn *conn = iter->second;
+
+ int ret = conn->get_url(*endpoint);
+ if (ret < 0) {
+ ldout(cct, 0) << "ERROR: redirect zone, conn->get_endpoint() returned ret=" << ret << dendl;
+ return false;
+ }
+
+ return true;
+}
+
diff --git a/src/rgw/services/svc_zone.h b/src/rgw/services/svc_zone.h
new file mode 100644
index 000000000..7b0a277c4
--- /dev/null
+++ b/src/rgw/services/svc_zone.h
@@ -0,0 +1,165 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_service.h"
+
+
+class RGWSI_RADOS;
+class RGWSI_SysObj;
+class RGWSI_SyncModules;
+class RGWSI_Bucket_Sync;
+
+class RGWRealm;
+class RGWZoneGroup;
+class RGWZone;
+class RGWZoneParams;
+class RGWPeriod;
+class RGWZonePlacementInfo;
+
+class RGWBucketSyncPolicyHandler;
+
+class RGWRESTConn;
+
+struct rgw_sync_policy_info;
+
// Service that loads and caches the multisite topology for this gateway:
// realm, current period, zonegroup and zone configuration, plus REST
// connections to peer zones/zonegroups and per-zone sync policy handlers.
class RGWSI_Zone : public RGWServiceInstance
{
  friend struct RGWServices_Def;

  // sibling services wired in by init(); not owned by this service
  RGWSI_SysObj *sysobj_svc{nullptr};
  RGWSI_RADOS *rados_svc{nullptr};
  RGWSI_SyncModules *sync_modules_svc{nullptr};
  RGWSI_Bucket_Sync *bucket_sync_svc{nullptr};

  // multisite topology state populated in do_start()
  RGWRealm *realm{nullptr};
  RGWZoneGroup *zonegroup{nullptr};
  RGWZone *zone_public_config{nullptr}; /* external zone params, e.g., entrypoints, log flags, etc. */
  RGWZoneParams *zone_params{nullptr}; /* internal zone params, e.g., rados pools */
  RGWPeriod *current_period{nullptr};
  rgw_zone_id cur_zone_id;
  uint32_t zone_short_id{0};
  // capabilities reported by the active sync module
  bool writeable_zone{false};
  bool exports_data{false};

  // sync policy handler for the local zone, plus one per zone in the zonegroup
  std::shared_ptr<RGWBucketSyncPolicyHandler> sync_policy_handler;
  std::map<rgw_zone_id, std::shared_ptr<RGWBucketSyncPolicyHandler> > sync_policy_handlers;

  // connections to peers; owned here and released in shutdown()
  RGWRESTConn *rest_master_conn{nullptr};
  std::map<rgw_zone_id, RGWRESTConn *> zone_conn_map;
  std::vector<const RGWZone*> data_sync_source_zones;
  std::map<rgw_zone_id, RGWRESTConn *> zone_data_notify_to_map;
  std::map<std::string, RGWRESTConn *> zonegroup_conn_map;

  // indexes over the zones of the local zonegroup
  std::map<std::string, rgw_zone_id> zone_id_by_name;
  std::map<rgw_zone_id, RGWZone> zone_by_id;

  std::unique_ptr<rgw_sync_policy_info> sync_policy;

  void init(RGWSI_SysObj *_sysobj_svc,
	   RGWSI_RADOS *_rados_svc,
	   RGWSI_SyncModules *_sync_modules_svc,
	   RGWSI_Bucket_Sync *_bucket_sync_svc);
  int do_start(optional_yield y, const DoutPrefixProvider *dpp) override;
  void shutdown() override;

  // topology initialization helpers used by do_start()
  int init_zg_from_period(const DoutPrefixProvider *dpp, optional_yield y);
  int init_zg_from_local(const DoutPrefixProvider *dpp, optional_yield y);

  int update_placement_map(const DoutPrefixProvider *dpp, optional_yield y);

  int create_default_zg(const DoutPrefixProvider *dpp, optional_yield y);
  int init_default_zone(const DoutPrefixProvider *dpp, optional_yield y);

  int search_realm_with_zone(const DoutPrefixProvider *dpp,
                             const rgw_zone_id& zid,
                             RGWRealm *prealm,
                             RGWPeriod *pperiod,
                             RGWZoneGroup *pzonegroup,
                             bool *pfound,
                             optional_yield y);
public:
  RGWSI_Zone(CephContext *cct);
  ~RGWSI_Zone();

  // accessors for the cached topology objects
  const RGWZoneParams& get_zone_params() const;
  const RGWPeriod& get_current_period() const;
  const RGWRealm& get_realm() const;
  const RGWZoneGroup& get_zonegroup() const;
  int get_zonegroup(const std::string& id, RGWZoneGroup& zonegroup) const;
  const RGWZone& get_zone() const;

  std::shared_ptr<RGWBucketSyncPolicyHandler> get_sync_policy_handler(std::optional<rgw_zone_id> zone = std::nullopt) const;

  const std::string& zone_name() const;
  const rgw_zone_id& zone_id() const {
    return cur_zone_id;
  }
  uint32_t get_zone_short_id() const;

  const std::string& get_current_period_id() const;
  bool has_zonegroup_api(const std::string& api) const;

  bool zone_is_writeable();
  bool zone_syncs_from(const RGWZone& target_zone, const RGWZone& source_zone) const;
  bool zone_syncs_from(const RGWZone& source_zone) const;
  bool get_redirect_zone_endpoint(std::string *endpoint);
  bool sync_module_supports_writes() const { return writeable_zone; }
  bool sync_module_exports_data() const { return exports_data; }

  RGWRESTConn *get_master_conn() {
    return rest_master_conn;
  }

  std::map<std::string, RGWRESTConn *>& get_zonegroup_conn_map() {
    return zonegroup_conn_map;
  }

  std::map<rgw_zone_id, RGWRESTConn *>& get_zone_conn_map() {
    return zone_conn_map;
  }

  std::vector<const RGWZone*>& get_data_sync_source_zones() {
    return data_sync_source_zones;
  }

  std::map<rgw_zone_id, RGWRESTConn *>& get_zone_data_notify_to_map() {
    return zone_data_notify_to_map;
  }

  RGWZone* find_zone(const rgw_zone_id& id);

  RGWRESTConn *get_zone_conn(const rgw_zone_id& zone_id);
  RGWRESTConn *get_zone_conn_by_name(const std::string& name);
  bool find_zone_id_by_name(const std::string& name, rgw_zone_id *id);

  // bucket placement selection (rule-based with a legacy fallback)
  int select_bucket_placement(const DoutPrefixProvider *dpp, const RGWUserInfo& user_info, const std::string& zonegroup_id,
                              const rgw_placement_rule& rule,
                              rgw_placement_rule *pselected_rule, RGWZonePlacementInfo *rule_info, optional_yield y);
  int select_legacy_bucket_placement(const DoutPrefixProvider *dpp, RGWZonePlacementInfo *rule_info, optional_yield y);
  int select_new_bucket_location(const DoutPrefixProvider *dpp, const RGWUserInfo& user_info, const std::string& zonegroup_id,
                                 const rgw_placement_rule& rule,
                                 rgw_placement_rule *pselected_rule_name, RGWZonePlacementInfo *rule_info,
                                 optional_yield y);
  int select_bucket_location_by_rule(const DoutPrefixProvider *dpp, const rgw_placement_rule& location_rule, RGWZonePlacementInfo *rule_info, optional_yield y);

  // legacy avail_pools registry management
  int add_bucket_placement(const DoutPrefixProvider *dpp, const rgw_pool& new_pool, optional_yield y);
  int remove_bucket_placement(const DoutPrefixProvider *dpp, const rgw_pool& old_pool, optional_yield y);
  int list_placement_set(const DoutPrefixProvider *dpp, std::set<rgw_pool>& names, optional_yield y);

  bool is_meta_master() const;

  // multisite predicates consulted on the request path
  bool need_to_sync() const;
  bool need_to_log_data() const;
  bool need_to_log_metadata() const;
  bool can_reshard() const;
  bool is_syncing_bucket_meta(const rgw_bucket& bucket);

  int list_zonegroups(const DoutPrefixProvider *dpp, std::list<std::string>& zonegroups);
  int list_regions(const DoutPrefixProvider *dpp, std::list<std::string>& regions);
  int list_zones(const DoutPrefixProvider *dpp, std::list<std::string>& zones);
  int list_realms(const DoutPrefixProvider *dpp, std::list<std::string>& realms);
  int list_periods(const DoutPrefixProvider *dpp, std::list<std::string>& periods);
  int list_periods(const DoutPrefixProvider *dpp, const std::string& current_period, std::list<std::string>& periods, optional_yield y);
};
diff --git a/src/rgw/services/svc_zone_utils.cc b/src/rgw/services/svc_zone_utils.cc
new file mode 100644
index 000000000..712bb97c9
--- /dev/null
+++ b/src/rgw/services/svc_zone_utils.cc
@@ -0,0 +1,64 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#include "svc_zone_utils.h"
+#include "svc_rados.h"
+#include "svc_zone.h"
+
+#include "rgw_zone.h"
+
+using namespace std;
+
+int RGWSI_ZoneUtils::do_start(optional_yield, const DoutPrefixProvider *dpp)
+{
+  // Service startup hook: pre-compute the cached transaction-id suffix
+  // (see init_unique_trans_id_deps) now that the zone and rados services
+  // this depends on are available.  Nothing here can fail.
+  init_unique_trans_id_deps();
+
+  return 0;
+}
+
+string RGWSI_ZoneUtils::gen_host_id() {
+  // Returns "<rados-instance-id-hex>-<zone-name>-<zonegroup-name>".
+  //
+  // The original implementation sized a stack buffer with
+  // zone_name.size() at runtime, i.e. a variable-length array — a
+  // compiler extension, not standard C++.  Format only the fixed-size
+  // numeric part on the stack and let std::string assemble the rest.
+  char idbuf[16 + 1]; /* uint64_t needs at most 16 hex digits + null */
+  snprintf(idbuf, sizeof(idbuf), "%llx",
+           (unsigned long long)rados_svc->instance_id());
+  return string(idbuf) + "-" + zone_svc->get_zone().name
+      + "-" + zone_svc->get_zonegroup().get_name();
+}
+
+string RGWSI_ZoneUtils::unique_id(uint64_t unique_num)
+{
+  // Returns "<zone-params-id>.<rados-instance-id>.<unique_num>".
+  //
+  // Worst case for ".%llu.%llu" with two 64-bit values is
+  // 1 + 20 + 1 + 20 + 1 = 43 bytes; the previous 32-byte buffer let
+  // snprintf silently truncate large ids.  Size the buffer to fit.
+  char buf[2 * 20 + 2 + 1];
+  snprintf(buf, sizeof(buf), ".%llu.%llu", (unsigned long long)rados_svc->instance_id(), (unsigned long long)unique_num);
+  string s = zone_svc->get_zone_params().get_id() + buf;
+  return s;
+}
+
+void RGWSI_ZoneUtils::init_unique_trans_id_deps() {
+  /* Build and cache the per-instance suffix appended to every
+   * transaction id: "-<rados-instance-id-hex>-<zone-name>", url-encoded.
+   * Buffer: '-' + up to 16 hex digits + '-' + trailing null. */
+  char instance_part[16 + 2 + 1];
+  snprintf(instance_part, sizeof(instance_part), "-%llx-",
+           (unsigned long long)rados_svc->instance_id());
+  url_encode(string(instance_part) + zone_svc->get_zone().name,
+             trans_id_suffix);
+}
+
+/* Swift-API-compatible transaction ID.  The spec it must satisfy:
+ * at least 32 characters, where
+ * - the first 21 chars are in [0-9a-f] (Swift fills this space with a
+ *   fragment of a uuid4() from Python's uuid module);
+ * - char 22 is a hyphen;
+ * - at least the next 10 chars are a zero-padded hex timestamp, all
+ *   bytes in [0-9a-f];
+ * - an optional trailing part: any url-encoded string, any length. */
+string RGWSI_ZoneUtils::unique_trans_id(const uint64_t unique_num) {
+  /* "tx" + 21 padded hex digits + '-' + up to 16 timestamp hex digits
+   * + trailing null == 41 bytes. */
+  char head[41];
+  const time_t now = time(NULL);
+
+  snprintf(head, sizeof(head), "tx%021llx-%010llx",
+           (unsigned long long)unique_num,
+           (unsigned long long)now);
+
+  /* Append the cached url-encoded "-<instance>-<zone>" suffix. */
+  string result(head);
+  result += trans_id_suffix;
+  return result;
+}
+
+
diff --git a/src/rgw/services/svc_zone_utils.h b/src/rgw/services/svc_zone_utils.h
new file mode 100644
index 000000000..43e3fee8d
--- /dev/null
+++ b/src/rgw/services/svc_zone_utils.h
@@ -0,0 +1,38 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab ft=cpp
+
+#pragma once
+
+#include "rgw_service.h"
+
+
+class RGWSI_RADOS;
+class RGWSI_Zone;
+
+// Service that generates identifiers derived from the local zone and
+// rados instance: host ids, unique object ids, and Swift-compatible
+// transaction ids.
+class RGWSI_ZoneUtils : public RGWServiceInstance
+{
+  friend struct RGWServices_Def;
+
+  RGWSI_RADOS *rados_svc{nullptr}; // non-owning; supplies instance_id()
+  RGWSI_Zone *zone_svc{nullptr};   // non-owning; supplies zone/zonegroup info
+
+  // Cached url-encoded "-<instance-id>-<zone-name>" tail appended to
+  // every transaction id; built once in do_start().
+  std::string trans_id_suffix;
+
+  // Dependency injection entry point, called by RGWServices_Def.
+  void init(RGWSI_RADOS *_rados_svc,
+	    RGWSI_Zone *_zone_svc) {
+    rados_svc = _rados_svc;
+    zone_svc = _zone_svc;
+  }
+
+  // Startup hook: precomputes trans_id_suffix; always returns 0.
+  int do_start(optional_yield, const DoutPrefixProvider *dpp) override;
+
+  // Builds trans_id_suffix from the rados instance id and zone name.
+  void init_unique_trans_id_deps();
+
+public:
+  RGWSI_ZoneUtils(CephContext *cct): RGWServiceInstance(cct) {}
+
+  // "<instance-id-hex>-<zone>-<zonegroup>" identifier for this host.
+  std::string gen_host_id();
+  // "<zone-params-id>.<instance-id>.<unique_num>" object identifier.
+  std::string unique_id(uint64_t unique_num);
+
+  // Swift-compatible transaction id: "tx" + padded hex unique_num +
+  // '-' + hex timestamp + trans_id_suffix.
+  std::string unique_trans_id(const uint64_t unique_num);
+};